I've read up on a number of threads (here and here) and the docs (here and here). However, I can't get this to work. I get an error of
AxisError: axis 0 is out of bounds for array of dimension 0
Thanks.
import pandas as pd
from scipy.stats import levene
data = {'A': [1,2,3,4,5,6,7,8],
'B': [9,10,11,12,13,14,15,16],
'C': [1,2,3,4,5,6,7,8]}
df3 = pd.DataFrame(data, columns=['A', 'B','C'])
print(levene(df3['A'], df3['C'])) # this works as intended
cols_of_interest = ['A','C'] # my requirement could make this any combination
# function to pass through arguments into Levene test
def func(df,cols_of_interest):
cols = [col for col in 'df.'+df[cols_of_interest].columns] # my strategy to mimic the arguments
lev = levene(*cols)
print(lev)
func(df3,cols_of_interest)
Replace your list comprehension inside def with:
cols = [df[x] for x in cols_of_interest]
Related
What would be the best method of turning a code like below to be able to accept as many dataframes as we would like?
def q_grab(df, df2, df3, q): #accepts three dataframes and a column name. Looks up column in all dataframes and combine to one
data = df[q], df2[q], df3[q]
headers = [q+"_1", q+"_2", q+"_3"]
data2 = pd.concat(data, axis = 1, keys=headers)
return data2
q = 'covid_condition'
data2 = q_grab(df, df2, df3, q) #If I run function pid_set first, it will create new df based on pID it looks like
One approach is to use * operator to get a list of arguments
(but name your final argument, so it isn't part of the list):
Something like this:
def q_grab(*dfs, q=None): # q is a named argument to signal end of positional arguments
data = [df[q] for df in dfs]
headers = [q+"_"+str(i) for i in range(len(dfs))]
data2 = pd.concat(data, axis = 1, keys=headers)
return data2
q = 'covid_condition'
data2 = q_grab(df, df2, df3, q=q)
A probably cleaner alternative, is to go ahead and pass a list of dataframes as the first argument:
def q_grab(dfs,q):
called with:
data2 = q.grab([df,df2,df3], q)
using the function code as above
I'm preparing a big multivariate time series data set for a supervised learning task and I would like to create time shifted versions of my input features so my model also infers from past values. In pandas there's the shift(n) command that lets you shift a column by n rows. Is there something similar in vaex?
I could not find anything comparable in the vaex documentation.
No, we do not support that yet (https://github.com/vaexio/vaex/issues/660). Because vaex is extensible (see http://docs.vaex.io/en/latest/tutorial.html#Adding-DataFrame-accessors) I thought I would give you the solution in the form of that:
import vaex
import numpy as np
#vaex.register_dataframe_accessor('mytool', override=True)
class mytool:
def __init__(self, df):
self.df = df
def shift(self, column, n, inplace=False):
# make a copy without column
df = self.df.copy().drop(column)
# make a copy with just the colum
df_column = self.df[[column]]
# slice off the head and tail
df_head = df_column[-n:]
df_tail = df_column[:-n]
# stitch them together
df_shifted = df_head.concat(df_tail)
# and join (based on row number)
return df.join(df_shifted, inplace=inplace)
x = np.arange(10)
y = x**2
df = vaex.from_arrays(x=x, y=y)
df['shifted_y'] = df.y
df2 = df.mytool.shift('shifted_y', 2)
df2
It generates a single column datagram, slices that up, concatenates and joins it back. All without a single memory copy.
I am assuming here a cyclic shift/rotate.
The function needs to be modified slightly in order to work in the latest release (vaex 4.0.0ax), see this thread.
Code by Maarten should be updated as follows:
import vaex
import numpy as np
#vaex.register_dataframe_accessor('mytool', override=True)
class mytool:
def __init__(self, df):
self.df = df
# mytool.shift is the analog of pandas.shift() but add the shifted column with specified name to the end of initial df
def shift(self, column, new_column, n, cyclic=True):
df = self.df.copy().drop(column)
df_column = self.df[[column]]
if cyclic:
df_head = df_column[-n:]
else:
df_head = vaex.from_dict({column: np.ma.filled(np.ma.masked_all(n, dtype=float), 0)})
df_tail = df_column[:-n]
df_shifted = df_head.concat(df_tail)
df_shifted.rename(column, new_column)
return df_shifted
x = np.arange(10)
y = x**2
df = vaex.from_arrays(x=x, y=y)
df2 = df.join(df.mytool.shift('y', 'shifted_y', 2))
df2
I have a dataframe as follows:
from uncertainties import ufloat # pip3 uncertainties.py
import random
random.seed(0)
values = [[round(random.random(),2) for i in range(3)] for j in range(4)]
df = pd.DataFrame(values, index=['name1','sd', 'name2', 'sd'], columns=['A','B','C'])
and I want to rearrange the data, where I combine the mean and sd into one row, as ufloat, of which the desired output looks as follows:
new_values = [[ufloat(0.91,0.90), ufloat(0.98,0.31), ufloat(0.81,0.73)],
[ufloat(0.90,0.10), ufloat(0.68,0.43), ufloat(0.47, 0.61)]]
df = pd.DataFrame(new_values, index=['name1', 'name2'], columns=['A','B','C'])
I think it might be easiest to create two dataframes and combine them somehow
mean = df.iloc[::2].reset_index()
std = df.iloc[1::2].reset_index()
where now I need to merge the two and apply ufloat
This is my current solution:
mean = df.iloc[::2]
std = df.iloc[1::2]
tmp = np.array([ufloat(x[0], x[1]) for x
in zip(mean.values.ravel(), std.values.ravel())])
df = pd.DataFrame(tmp.reshape(mean.shape), columns=mean.columns, index=mean.index)
Or is it possible to capture the function call itself in any way (describe which values are assigned to the different arguments)?
Sorry for the poor phrasing of the question. Let me explain with some reproducible code:
import pandas as pd
import numpy as np
import matplotlib.dates as mdates
import inspect
# 1. Here is Dataframe with some random numbers
np.random.seed(123)
rows = 10
df = pd.DataFrame(np.random.randint(90,110,size=(rows, 2)), columns=list('AB'))
datelist = pd.date_range(pd.datetime(2017, 1, 1).strftime('%Y-%m-%d'), periods=rows).tolist()
df['dates'] = datelist
df = df.set_index(['dates'])
df.index = pd.to_datetime(df.index)
#print(df)
# 2. And here is a very basic function to do something with the dataframe
def manipulate(df, factor):
df = df * factor
return df
# 3. Now I can describe the function using:
print(inspect.getargspec(manipulate))
# And get:
# ArgSpec(args=['df', 'factor'], varargs=None, keywords=None,
# defaults=None)
# __main__:1: DeprecationWarning: inspect.getargspec() is
# deprecated, use inspect.signature() or inspect.getfullargspec()
# 4. But what I'm really looking for is a way to
# extract or store the function AND the variables
# used when the function is called, like this:
df2 = manipulate(df = df, factor = 20)
# So in the example using Inspect, the desired output could be:
# ArgSpec(args=['df = df', 'factor = 10'], varargs=None,
# and so on...
I realize that this may seem a bit peculiar, but it would actually be of great use to me to be able to do something like this. If anyone is interested, I'd be happy to explain everything in more detail, including how this would fit in in mye data science work-flow.
Thank you for any suggestions!
You can bind the parameters to the function and create a new callable
import functools
func = functools.partial(manipulate, df=df, factor=20)
the resulting partial object allows argument inspection and modification using the attributes args and keywords:
func.keywords # {'df': <pandas dataframe>, 'factor': 20}
and and can finally be called using
func()
I'm new to pandas, and, given a data frame, I was trying to drop some columns that don't accomplish an specific requirement. Researching how to do it, I got to this structure:
df = df.loc[df['DS_FAMILIA_PROD'].isin(['CARTOES', 'CARTÕES'])]
However, when processing the frame, I get this error:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self[name] = value
I'm not sure about what to do because I'm already using the .loc function.
What am I missing?
f = ['ID_manifest', 'issue_date', 'channel', 'product', 'ID_client', 'desc_manifest']
df = pd.DataFrame(columns=f)
for chunk in df2017_chunks:
aux = preProcess(chunk, f)
df = pd.concat([df, aux])
def preProcess(df, f):
stops = list(stopwords.words("portuguese"))
stops.extend(['reclama', 'cliente', 'santander', 'cartao', 'cartão'])
df = df.loc[df['DS_FAMILIA_PROD'].isin(['CARTOES', 'CARTÕES'])]
df.columns = f
df.desc_manifest = df.desc_manifest.str.lower() # All lower case
df.desc_manifest = df.desc_manifest.apply(lambda x: re.sub('[^A-zÀ-ÿ]', ' ', str(x))) # Just letters
df.replace(['NaN', 'nan'], np.nan, inplace = True) # Remone nan
df.dropna(subset=['desc_manifest'], inplace=True)
df.desc_manifest = df.desc_manifest.apply(lambda x: [word for word in str(x).split() if word not in stops]) # Remove stop words
return df
You need copy, because if you modify values in df later you will find that the modifications do not propagate back to the original data (df), and that Pandas does warning.
loc can be omit, but warning without copy too.
df = pd.DataFrame({'DS_FAMILIA_PROD':['a','d','b'],
'desc_manifest':['F','rR', 'H'],
'C':[7,8,9]})
def preProcess(df):
df = df[df['DS_FAMILIA_PROD'].isin([u'a', u'b'])].copy()
df.desc_manifest = df.desc_manifest.str.lower() # All
...
...
return df
print (preProcess(df))
C DS_FAMILIA_PROD desc_manifest
0 7 a f
2 9 b h
The purpose of the warning is to show users that they may be operating on a copy and not the original but there can be False positives. As mentioned in the comments, this is not an issue for your use case.
You can simply turn off the check for your dataframe:
df.is_copy = False
or you can explicitly copy:
df = df.loc[df['DS_FAMILIA_PROD'].isin(['CARTOES', 'CARTÕES'])].copy()
If your program intends to take a copy of the df on purpose, you can stop the warning with this:
pd.set_option('mode.chained_assignment', None)
pd.set_option('mode.chained_assignment', 'warn')
# if you set a value on a copy, warning will show
df = DataFrame({'DS_FAMILIA_PROD' : [1, 2, 3], 'COL2' : [5, 6, 7]})
df = df[df.DS_FAMILIA_PROD.isin([1, 2])]
df
Out[29]:
COL2 DS_FAMILIA_PROD
0 5 1
1 6 2