I wrote the following function:
def unique_values(df, column):
    """Return the distinct values of one column as a single-column DataFrame.

    Args:
        df: DataFrame to read from.
        column: Label of the column whose distinct values are wanted.

    Returns:
        A new DataFrame with exactly one column, named ``column``, holding
        the values of ``df[column].unique()`` in the order that call
        returns them.
    """
    # Build the frame directly from a {name: values} mapping rather than
    # passing an array through ``DataFrame.from_dict`` and renaming the
    # column afterwards.
    return pd.DataFrame({column: df[column].unique()})
I would like to apply the following function to various columns in a df. As something like this:
unique1, unique2, unique3 = unique_values(df, "Column1", "Column2", "Column3")
If I add *args in the following way:
# Attempted variadic version from the question. NOTE: ``*column`` collects the
# labels into a tuple, and ``df[column]`` is then looked up with that whole
# tuple as a single key; the question reports this call failing with
# ``KeyError: ('Column1', 'Column2', 'Column3')``.
def unique_values(df, *column):
unique = df[column].unique()
clean = pd.DataFrame.from_dict(unique)
clean.columns = [column]
return clean
and apply the function like this:
unique1, unique2, unique3 = unique_values(df, "Column1", "Column2", "Column3")
I get the following error:
KeyError: ('Column1', 'Column2', 'Column3')
Any help would be appreciated
You can do it this way, by iterating through the columns:
def unique_values(df, *column):
    """Return the distinct values of each requested column.

    Args:
        df: DataFrame to read from.
        *column: Zero or more column labels.

    Returns:
        A list with one single-column DataFrame per requested label, in the
        order the labels were given, so the result can be unpacked directly
        (``u1, u2 = unique_values(df, "A", "B")``).
    """
    # One frame per label; building from a {name: values} mapping names the
    # column up front instead of renaming after ``DataFrame.from_dict``.
    return [pd.DataFrame({col: df[col].unique()}) for col in column]
# this way this works:
unique1, unique2, unique3 = unique_values(df, "Column1", "Column2", "Column3")
You can write a small wrapper function that calls your unique_values() function with the list of columns that you pass as arguments, like so:
def df_unique_values(df, *columns):
    """Call ``unique_values`` once per named column and collect the results.

    Returns a list holding ``unique_values(df, label)`` for every label in
    ``columns``, in the order the labels were passed.
    """
    per_column = [unique_values(df, label) for label in columns]
    return per_column
This function returns a list containing the result of each call to unique_values() for a different column. Use this function like so:
unique1, unique2, unique3 = df_unique_values(df, "Column1", "Column2", "Column3")
Related
I have a codebase where this pattern is very common:
df # Some pandas dataframe with columns userId, sessionId
def add_session_statistics(df):
    """Left-merge session statistics onto ``df`` using the ``sessionId`` key.

    The statistics frame comes from ``get_session_statistics``, called with
    the distinct ``sessionId`` values found in ``df``.
    """
    session_ids = df.sessionId.unique()
    session_stats = get_session_statistics(session_ids)
    return df.merge(session_stats, on='sessionId', how='left')
def add_user_statistics(df):
    """Left-merge user statistics onto ``df`` using the ``userId`` key.

    The statistics are requested for the distinct ``userId`` values found
    in ``df``.
    """
    # The statistics must come from a getter, not from this function itself
    # (calling ``add_user_statistics`` on the id array can never succeed).
    # NOTE(review): assumes a ``get_user_statistics`` helper exists alongside
    # ``get_session_statistics`` and returns a frame keyed by ``userId`` -
    # confirm against the codebase.
    user_stats = get_user_statistics(df.userId.unique())
    return df.merge(user_stats, on='userId', how='left')
# etc..
df_enriched = (df
.pipe(add_session_statistics)
.pipe(add_user_statistics)
)
However, in another part of the codebase I have 'userId', 'sessionId' as the index of the dataframe. Something like:
X = df.set_index(['userId', 'sessionId'])
This means I can't use the add_{something}_statistics() functions on X without resetting the index each time.
Is there a decorator I can add to the add_{something}_statistics() functions to make them reset the index if they get a KeyError when attempting the merge on a column that is not there?
This seems to work:
def index_suspension_on_add(add_function):
    """Decorate an ``add_*`` function so it also accepts indexed frames.

    The wrapped function is first applied to the frame as given. If that
    raises ``KeyError`` (e.g. because a needed key is in the index rather
    than in the columns), the index is reset into ordinary columns, the
    function is applied to that frame, and the original index levels are
    set again on the result.

    Args:
        add_function: Callable taking a DataFrame and returning a DataFrame.

    Returns:
        A wrapper taking a single DataFrame argument.

    Raises:
        Any exception other than ``KeyError`` raised by ``add_function`` is
        propagated unchanged, without a retry.
    """
    import functools  # local import: the snippet has no import block

    @functools.wraps(add_function)
    def _helper(df):
        try:
            return df.pipe(add_function)
        except KeyError:
            # Only a missing key justifies the retry; catching every
            # exception would hide real failures and run the function twice.
            index_names = list(df.index.names)
            return (
                df.reset_index()
                .pipe(add_function)
                .set_index(index_names)
            )

    return _helper
# Apply the decorator (with ``@``) so the function retries with the index
# reset when the frame it receives carries the ids in its index.
@index_suspension_on_add
def add_user_statistics(df):
    ...
I have a function to transform camelCase to snake_case:
def camel_to_snake(name):
    """Convert a camelCase / PascalCase identifier to snake_case.

    Underscores are inserted in two passes and the result is lower-cased:
    first before each capitalised word (an upper-case letter followed by
    lower-case letters) that has a character in front of it, then between
    a lower-case letter or digit and the upper-case letter that follows.
    """
    word_boundary = re.compile('(.)([A-Z][a-z]+)')
    case_boundary = re.compile('([a-z0-9])([A-Z])')
    words_split = word_boundary.sub(r'\1_\2', name)
    fully_split = case_boundary.sub(r'\1_\2', words_split)
    return fully_split.lower()
So, I did a for loop in my dataframe columns to apply the function.
Example of columns name variable:
df_columns = ['colNameOne', 'colNameTwo', 'colNameThree']
for column in df_columns:
camel_to_snake(column)
The output example:
'col_name_one'
'col_name_two'
'col_name_three'
How can I store the result of the for loop in a list to change the name of dataframe columns?
I tried using an empty list variable with append method, but it didn't work.
df = df.toDF(*[camel_to_snake(c) for c in df.columns])
You are only calling the function camel_to_snake, but not saving what it returns anywhere. Make a new list and save the values in it.
df_columns = ['colNameOne', 'colNameTwo', 'colNameThree']
sn_columns = []
for column in df_columns:
sn_columns.append(camel_to_snake(column))
print(sn_columns)
What would be the best method of turning a code like below to be able to accept as many dataframes as we would like?
def q_grab(df, df2, df3, q):
    """Look up column ``q`` in three DataFrames and combine them into one.

    Returns a DataFrame whose columns are ``q`` taken from ``df``, ``df2``
    and ``df3``, labelled ``<q>_1``, ``<q>_2`` and ``<q>_3`` respectively.
    """
    columns = (df[q], df2[q], df3[q])
    labels = [q + suffix for suffix in ("_1", "_2", "_3")]
    return pd.concat(columns, axis=1, keys=labels)
q = 'covid_condition'
data2 = q_grab(df, df2, df3, q) #If I run function pid_set first, it will create new df based on pID it looks like
One approach is to use * operator to get a list of arguments
(but name your final argument, so it isn't part of the list):
Something like this:
def q_grab(*dfs, q=None):
    """Look up column ``q`` in any number of DataFrames and combine them.

    Args:
        *dfs: The DataFrames to read from, in output order.
        q: Label of the column to extract. It is a named argument so that
            it marks the end of the positional DataFrames; pass it by
            keyword.

    Returns:
        A DataFrame with one column per input frame, labelled ``<q>_1``,
        ``<q>_2``, ... in the order the frames were given.
    """
    data = [df[q] for df in dfs]
    # Number from 1 so the labels agree with the fixed three-frame version
    # (``<q>_1``, ``<q>_2``, ``<q>_3``) instead of starting at ``_0``.
    headers = [f"{q}_{i}" for i in range(1, len(dfs) + 1)]
    return pd.concat(data, axis=1, keys=headers)
q = 'covid_condition'
data2 = q_grab(df, df2, df3, q=q)
A probably cleaner alternative, is to go ahead and pass a list of dataframes as the first argument:
def q_grab(dfs,q):
called with:
# q_grab is a plain function, so call it by name (not as an attribute of q).
data2 = q_grab([df, df2, df3], q)
using the function code as above
I've a UDF function with output in tuple format. I want to apply that UDF to my input column and based on what I need out, I want to choose either out1 or out2 value as the value for my column.
Something like this:
def my_f(inp):
return out1,out2
df = df.withColumn('first_val', F.udf(my_f, StringType())(F.col('inp_col'))[0]
df = df.withColumn('second_val', F.udf(my_f, StringType())(F.col('inp_col'))[1]
I want the first_val col to have first element of tuple, and second_val to have second element of the tuple. This code of course doesn't work.
I tried it by passing the required part of tuple as function input and it worked. Like this:
# Variant of my_f whose ``out`` argument selects what is returned:
# 'first' -> out1, 'second' -> out2, anything else (including the default
# 'full') -> the (out1, out2) tuple. ``out1``/``out2`` are placeholders that
# this snippet does not compute.
def my_f(inp, out='full'):
if out=='first':
return out1
elif out=='second':
return out2
else: # case 'full'
return out1,out2
df = df.withColumn('first_val', F.udf(my_f, StringType())(F.col('inp_col'), F.col('inp_col'))
df = df.withColumn('second_val', F.udf(my_f, StringType())(F.col('inp_col'),'F.col('second'))
But is there a simpler way of getting the nth element of tuple within the line without passing this parameter?
If your UDF is returning a tuple you should change your return type to ArrayType(StringType) assuming you are returning a tuple of Strings. Then you will be able to access the first and second element of your tuple by using the [n] notation. Here is an example:
import pyspark.sql.functions as F
import pyspark.sql.types as T
...
# Register my_f as a UDF returning an array of strings, so the elements of
# the returned tuple can be selected with [n] on the resulting column.
@F.udf(T.ArrayType(T.StringType()))
def my_f(inp):
    ...
    return (out1, out2)
df = df.withColumn('first_val', my_f('inp_col')[0])
df = df.withColumn('second_val', my_f('inp_col')[1])
In case you need different types in your tuple you might want to consider returning a StructType instead. Here would be an example where the first element of the tuple is a string and the second is an integer:
import pyspark.sql.functions as F
import pyspark.sql.types as T
...
# Register my_f as a UDF returning a struct with a string field "first" and
# an integer field "second", so each field can be selected by name.
@F.udf(T.StructType([
    T.StructField("first", T.StringType()),
    T.StructField("second", T.IntegerType()),
]))
def my_f(inp):
    ...
    return {"first": out1, "second": out2}
df = df.withColumn('first_val', my_f('inp_col')["first"])
df = df.withColumn('second_val', my_f('inp_col')["second"])
I have a dataframe df1, like this:
date sentence
29/03/2019 i like you
.....
I want to create new dataframe df2 like this:
date verb object
29/03/2019 like you
....
with the function like this:
def getSplit(df1):
    """Build a frame with the date plus the verb and object of each sentence.

    Args:
        df1: DataFrame with a ``date`` column and a ``sentence`` column.

    Returns:
        A new DataFrame with the columns ``date``, ``verb`` and ``object``;
        the latter two hold ``getVerb(sentence)`` and ``getObj(sentence)``
        for each row of ``df1``, in row order.
    """
    sentences = df1['sentence']
    # Select by the string label 'date' and actually call ``copy()`` so that
    # df2 is an independent DataFrame rather than a bound method object.
    df2 = df1[['date']].copy()
    df2['verb'] = [getVerb(sentence) for sentence in sentences]
    df2['object'] = [getObj(sentence) for sentence in sentences]
    return df2
My function runs well, but it's slow. Could someone help me improve the function so that it can run faster?
Thank you
You can use the pandas apply method to process this faster:
def getverb(row):
    """Placeholder: replace the body with your own verb extraction."""
    pass  # Your function


def getobj(row):
    """Placeholder: replace the body with your own object extraction."""
    pass  # Your function
df2 = df1.copy() # Making copy of your dataframe.
df2['verb'] = df2['sentence'].apply(getverb)
df2['obj'] = df2['sentence'].apply(getobj)
df2.drop('sentence', axis=1, inplace=True) # Droping sentence column
df2
I hope it may help you. (accept and upvote answer)