Apply function to manipulate Python Pandas DataFrame group - python

I have data in a pandas DataFrame that requires considerable clean up with functions applied to the DataFrame's 'ID' groups. How does one apply any arbitrary function to manipulate Pandas DataFrame groups? A simplified example of the DataFrame is below:
import pandas as pd
import numpy as np
waypoint_time_string = ['0.5&3.0&6.0' for x in range(10)]
moving_string = ['0 0 0&0 0.1 0&1 1 1.2' for x in range(10)]
df = pd.DataFrame({'ID':[1,1,1,1,1,2,2,2,2,2], 'time':[1,2,3,4,5,1,2,3,4,5],
'X':[0,0,0,0,0,1,1,1,1,1],'Y':[0,0,0,0,0,1,1,1,1,1],'Z':[0,0,0,0,0,1,1,1,1,1],
'waypoint_times':waypoint_time_string,
'moving':moving_string})
I would like to apply the function set_group_positions (defined below) to each 'ID' group of df. I have only been successful looping through the DataFrame. It seems that there must be a more 'Pandas.groupby' way of doing this. Here is an example of my implementation that I'm looking to replace:
sub_frames = []
unique_IDs = df['ID'].unique()
for unique_ID in unique_IDs:
    working_df = df.loc[df['ID'] == unique_ID]
    working_df = set_group_positions(working_df)
    sub_frames.append(working_df)
final_df = pd.concat(sub_frames)
And to complete a working example, here are additional helper functions:
def set_x_vel(row):
    return row['X'] + row['x_movement']
def set_y_vel(row):
    return row['Y'] + row['y_movement']
def set_z_vel(row):
    return row['Z'] + row['z_movement']
output_time_list = df['time'].unique().tolist()
#main function to apply to each ID group in the data frame:
def set_group_positions(df):  # pass the combined df here
    working_df = df
    times_string = working_df['waypoint_times'].iloc[0]
    times_list = times_string.split('&')
    times_list = [float(x) for x in times_list]
    points_string = working_df['moving']
    points_string = points_string.iloc[0]
    points_list = points_string.split('&')
    points_x = []
    points_y = []
    points_z = []
    for point in points_list:
        point_list = point.split(' ')
        points_x.append(point_list[0])
        points_y.append(point_list[1])
        points_z.append(point_list[2])
    # get corresponding positions for HPAC times,
    # since there could be mismatches
    points_x = np.cumsum([float(x) for x in points_x])
    points_y = np.cumsum([float(y) for y in points_y])
    points_z = np.cumsum([float(z) for z in points_z])
    x_interp = np.interp(output_time_list, times_list, points_x).tolist()
    y_interp = np.interp(output_time_list, times_list, points_y).tolist()
    z_interp = np.interp(output_time_list, times_list, points_z).tolist()
    working_df.loc[:, 'x_movement'] = x_interp
    working_df.loc[:, 'y_movement'] = y_interp
    working_df.loc[:, 'z_movement'] = z_interp
    working_df.loc[:, 'x_pos'] = working_df.apply(set_x_vel, axis=1)
    working_df.loc[:, 'y_pos'] = working_df.apply(set_y_vel, axis=1)
    working_df.loc[:, 'z_pos'] = working_df.apply(set_z_vel, axis=1)
    return working_df
While my current implementation works, it takes about 20 minutes to run on my real data set, whereas a simple groupby.apply lambda call on my DataFrame takes only seconds to a minute.

Instead of looping, you can use apply with groupby and a function call:
df = df.groupby('ID').apply(set_group_positions)
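Note that, depending on your pandas version, groupby(...).apply on a function that returns a DataFrame may prepend 'ID' as an extra index level. If you want to keep the original flat index, one option (a sketch, not the only way) is:
df = df.groupby('ID', group_keys=False).apply(set_group_positions)
Alternatively, drop the added level afterwards with reset_index.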

Related

simplify splitting a dataframe to several dataframes

So I have some dataframes (df0, df1, df2) with various numbers of rows. I wanted to split any dataframe that has more than 30 rows into several dataframes consisting of 30 rows only. So for example my dataframe df0 has 156 rows; I would then separate this dataframe into several dataframes like this:
if len(df0) > 30:
    df0_A = df0[0:30]
    df0_B = df0[31:60]
    df0_C = df0[61:90]
    df0_D = df0[91:120]
    df0_E = df0[121:150]
    df0_F = df0[151:180]
else:
    df0 = df0
The problem with this code is that I need to repeat it exhaustively many times for the code that follows, like this:
df0= pd.DataFrame(df0)
df0_A = pd.DataFrame(df0_A)
df0_B = pd.DataFrame(df0_B)
df0_C = pd.DataFrame(df0_C)
df0_D = pd.DataFrame(df0_D)
df0_E = pd.DataFrame(df0_E)
df0_F = pd.DataFrame(df0_F)
df0 = df0.to_string(header=False, index=False, index_names=False).split('\n')
df0_A = df0_A.to_string(header=False, index=False, index_names=False).split('\n')
df0_B = df0_B.to_string(header=False, index=False, index_names=False).split('\n')
df0_C = df0_C.to_string(header=False, index=False, index_names=False).split('\n')
df0_D = df0_D.to_string(header=False, index=False, index_names=False).split('\n')
df0_E = df0_E.to_string(header=False, index=False, index_names=False).split('\n')
df0_F = df0_F.to_string(header=False, index=False, index_names=False).split('\n')
df0= [','.join(ele.split()) for ele in df0]
df0_A = [','.join(ele.split()) for ele in df0_A]
df0_B = [','.join(ele.split()) for ele in df0_B]
df0_C = [','.join(ele.split()) for ele in df0_C]
df0_D = [','.join(ele.split()) for ele in df0_D]
df0_E = [','.join(ele.split()) for ele in df0_E]
df0_F = [','.join(ele.split()) for ele in df0_F]
Now imagine I have ten dataframes and I need to split each into five dataframes. Then I would need to write the same code 50 times!
I'm quite new to Python. So, can anyone help me simplify this code, maybe with a simple for loop? Thanks
You could probably automate it a little bit more, but this should be enough!
import pandas as pd
import numpy as np
df0 = pd.DataFrame({'Test': np.random.randint(100000, 999999, size=180)})
len(df0)
if len(df0) > 30:
    df_dict = {}
    x = 0
    y = 30
    for df_letter in ['A', 'B', 'C', 'D', 'E', 'F']:
        df_name = f'df0_{df_letter}'
        df_dict[df_name] = pd.DataFrame(df0[x:y]).to_string(header=False, index=False, index_names=False).split('\n')
        df_dict[df_name] = [','.join(ele.split()) for ele in df_dict[df_name]]
        x += 30
        y += 30
else:
    df0
for df in df_dict:
    print(df)
    print('--------------------------------------------------------------------')
    print(f'length: {len(df_dict[df])}')
    print('--------------------------------------------------------------------')
    print(df_dict[df])
    print('--------------------------------------------------------------------')
Assuming you have one column for identification,
def split_df(idf, idcol, nsize):
    g = idf.groupby(idcol)
    # Compute the size for each value of the identification column
    size = g.size()
    dflist = []
    for _id, _idcount in size.items():
        if _idcount > nsize:
            # print(_id, ' = ', _idcount)
            idx = idf[idf[idcol].eq(_id)].index
            # print(idx)
            # let's split the array into equal parts of `nsize`
            # e.g. [1,2,3,4,5] with nsize = 2 will split into ([1,2], [3,4], [5])
            ilist = np.array_split(idx, round(idx.shape[0]/nsize + 0.5))
            dflist += ilist
    return [idf.loc[idx].copy(deep=True) for idx in dflist]
df = pd.DataFrame(data=np.hstack((np.random.choice(np.arange(1,3), 10).reshape(10, -1), np.random.rand(10,3))), columns=['id', 'a', 'b', 'c'])
df = df.astype({'id': np.int64})
split_df(df, 'id', 2)
This is a great problem; you can use this (data is the DataFrame here):
# Row offsets every 30 rows for the DataFrame
subsets = list(range(0, len(data), 30))
# Create start cutoffs for subsets of the DataFrame
start_cutoff = subsets
# Create end cutoffs for subsets of the DataFrame
end_cutoff = subsets[1:] + [len(data)]
# Zip the start cutoffs and end cutoffs into a List of Cutoffs
cutoffs = list(zip(start_cutoff, end_cutoff))
# List containing the split DataFrames
list_dfs = [data.iloc[cutoff[0]:cutoff[1]] for cutoff in cutoffs]
# convert list to string DFs
string_dfs = [df.to_string(header=False, index=False, index_names=False).split('\n') for df in list_dfs]
final_df_list = [[','.join(ele.split()) for ele in string_df] for string_df in string_dfs]
Now you can access the (stringified) DataFrames by:
print(final_df_list[0])
print(final_df_list[1])
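As an even shorter variant (a sketch along the same lines, assuming the goal is simply fixed-size 30-row chunks), the same three steps collapse to:
# each element of list_dfs is one chunk of up to 30 rows
list_dfs = [data.iloc[i:i + 30] for i in range(0, len(data), 30)]
string_dfs = [chunk.to_string(header=False, index=False, index_names=False).split('\n') for chunk in list_dfs]
final_df_list = [[','.join(row.split()) for row in chunk] for chunk in string_dfs]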

How to save only the rows with a specific numpy array / matrix shape in Pandas dataframe?

Say I have a dataframe df with a column 'Array' that contains a bunch of numpy arrays. I want to keep only the rows whose array shape is the most common in this column and drop the other rows. I want to use these arrays as features for some machine learning, so I need to ensure they all have the same shape. Below is my script, but it doesn't work. I hope the answer also works when the features are matrices.
import pandas as pd
df = pd.DataFrame(data)
lst = [i.shape for i in df['Array']]
most_common = max(set(lst), key=lst.count)
df = df[df['Array'].shape==most_common]
Basically I want to convert atomic structures into numerical features such as numpy arrays or matrices. I'm using a method called the Coulomb Matrix to achieve this. In case you want to try it yourselves, this is my whole script:
import pandas as pd
from ase.db import connect
from ase import Atoms
from ase.io import read
import numpy as np
import numpy.linalg as linalg
def eig_data(atoms):
    init_R = atoms.positions
    init_Z = atoms.numbers
    x_list = []
    y_list = []
    z_list = []
    M_list = []
    M_tmp = []
    M_matrix = []
    for x, y, z in init_R:
        x_list.append(x)
        y_list.append(y)
        z_list.append(z)
    order = 0
    for xl, yl, zl, Z in zip(x_list, y_list, z_list, init_Z):
        M_list.append((order, xl, yl, zl, Z))
        order += 1
    for order, x, y, z, charge in M_list:
        r = np.array((x, y, z))
        M_tmp = []
        for oorder, ox, oy, oz, ocharge in M_list:
            if oorder == order:
                IJ = 0.5*ocharge**2.4
                M_tmp.append(IJ)
            else:
                otr = np.array((ox, oy, oz))
                dist = np.linalg.norm(r-otr)
                InJ = (charge*ocharge)/dist
                M_tmp.append(InJ)
        M_matrix.append(M_tmp)
    M = np.array(M_matrix)
    w, v = np.linalg.eig(M)
    w_sort = np.sort(w)[::-1]
    # print(np.amax(w_sort))
    return w_sort
res = connect('c2db.db')
df = []
for row in res.select():
    atoms = row.toatoms()
    i = row.id
    c = eig_data(atoms)
    f = row.formula
    E = row.energy
    g = row.gap
    w = row.workfunction
    df.append({'id': i, 'c_matrix': c, 'formula': f, 'energy': E, 'band_gap': g, 'work_function': w})
df = pd.DataFrame(df)
lst = [i.shape for i in df['c_matrix']]
most_common = max(set(lst), key=lst.count)
df = df[df['c_matrix'].shape==most_common]
The database can be downloaded here: c2db_database
My suggestion would be to create a boolean filter for your dataframe in this way:
filter = [i.shape == most_common for i in df['c_matrix']]
df = df[filter]
Or simply
df = df[[i.shape == most_common for i in df['c_matrix']]]
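An equivalent version built on Series.map (a sketch; it assumes every entry of 'c_matrix' is a numpy array):
shapes = df['c_matrix'].map(lambda arr: arr.shape)
most_common = shapes.value_counts().idxmax()
df = df[shapes == most_common]
Here value_counts().idxmax() replaces the max(set(lst), key=lst.count) idiom from the question.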

Dask Concatenate a Series of Dataframes

I have a Dask Series of Pandas DataFrames. I would like to use dask.dataframe.multi.concat to convert this into a Dask DataFrame. However the dask.dataframe.multi.concat always requires a list of DataFrames.
I could perform a compute on the Dask series of Pandas DataFrames to get a Pandas series of DataFrames, at which point I could turn that into a list. But I think it would be better not to call compute and instead directly acquire the Dask DataFrame from the Dask Series of Pandas DataFrames.
What would be the best way to do this? Here's my code that produces the series of dataframes:
import pandas as pd
import dask.dataframe as dd
import operator
import numpy as np
import math
import itertools
def apportion_pcts(pcts, total):
    """Apportion an integer by percentages
    Uses the largest remainder method
    """
    if (sum(pcts) != 100):
        raise ValueError('Percentages must add up to 100')
    proportions = [total * (pct / 100) for pct in pcts]
    apportions = [math.floor(p) for p in proportions]
    remainder = total - sum(apportions)
    remainders = [(i, p - math.floor(p)) for (i, p) in enumerate(proportions)]
    remainders.sort(key=operator.itemgetter(1), reverse=True)
    for (i, _) in itertools.cycle(remainders):
        if remainder == 0:
            break
        else:
            apportions[i] += 1
            remainder -= 1
    return apportions
# images_df = dd.read_csv('./tests/data/classification/images.csv')
images_df = pd.DataFrame({"image_id": [0,1,2,3,4,5], "image_class_id": [0,1,1,3,3,5]})
images_df = dd.from_pandas(images_df, npartitions=1)
output_ratio = [80, 20]
def partition_class(partition):
    size = len(partition)
    proportions = apportion_pcts(output_ratio, size)
    slices = []
    start = 0
    for proportion in proportions:
        s = slice(start, start + proportion)
        slices.append(partition.iloc[s, :])
        start = start + proportion
    slicess = pd.Series(slices)
    return slicess
partitioned_schema = dd.utils.make_meta(
[(0, object), (1, object)], pd.Index([], name='image_class_id'))
partitioned_df = images_df.groupby('image_class_id')
partitioned_df = partitioned_df.apply(partition_class, meta=partitioned_schema)
In the partitioned_df, we can get partitioned_df[0] or partitioned_df[1] to get a series of dataframe objects.
Here is an example of the CSV file:
image_id,image_width,image_height,image_path,image_class_id
0,224,224,tmp/data/image_matrices/0.npy,5
1,224,224,tmp/data/image_matrices/1.npy,0
2,224,224,tmp/data/image_matrices/2.npy,4
3,224,224,tmp/data/image_matrices/3.npy,1
4,224,224,tmp/data/image_matrices/4.npy,9
5,224,224,tmp/data/image_matrices/5.npy,2
6,224,224,tmp/data/image_matrices/6.npy,1
7,224,224,tmp/data/image_matrices/7.npy,3
8,224,224,tmp/data/image_matrices/8.npy,1
9,224,224,tmp/data/image_matrices/9.npy,4
I tried to do a reduction afterwards, but this doesn't quite make sense due to a proxy foo string.
def zip_partitions(s):
    r = []
    for c in s.columns:
        l = s[c].tolist()
        r.append(pd.concat(l))
    return pd.Series(r)
output_df = partitioned_df.reduction(
chunk=zip_partitions
)
The proxy list that I'm attempting to concat is ['foo', 'foo']. What is this phase for? Is it there to discover how to do the task? But then certain operations don't work. I'm wondering whether I'm getting these strings because I'm operating over object columns.
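For what it's worth, the 'foo' values are most likely Dask's metadata placeholders: when no meta= is given, Dask runs the function on a tiny dummy frame whose object columns are filled with 'foo' in order to infer the output schema, which is why operations that depend on the real values can fail at that stage. Supplying meta= explicitly, as the answer below does, skips that inference pass; a minimal sketch:
# a sketch mirroring the self-answer below: declare the output schema up front
zipped_schema = dd.utils.make_meta((None, object))
output_df = partitioned_df.reduction(chunk=zip_partitions, meta=zipped_schema)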
I figured out an answer by applying the reduction at the very end to "zip" up each dataframe into a series of dataframes.
import pandas as pd
import dask.dataframe as dd
import operator
import numpy as np
import math
import itertools
def apportion_pcts(pcts, total):
    """Apportion an integer by percentages
    Uses the largest remainder method
    """
    if (sum(pcts) != 100):
        raise ValueError('Percentages must add up to 100')
    proportions = [total * (pct / 100) for pct in pcts]
    apportions = [math.floor(p) for p in proportions]
    remainder = total - sum(apportions)
    remainders = [(i, p - math.floor(p)) for (i, p) in enumerate(proportions)]
    remainders.sort(key=operator.itemgetter(1), reverse=True)
    for (i, _) in itertools.cycle(remainders):
        if remainder == 0:
            break
        else:
            apportions[i] += 1
            remainder -= 1
    return apportions
images_df = dd.read_csv('./tests/data/classification/images.csv', blocksize=1024)
output_ratio = [80, 20]
def partition_class(group_df, ratio):
    proportions = apportion_pcts(ratio, len(group_df))
    partitions = []
    start = 0
    for proportion in proportions:
        s = slice(start, start + proportion)
        partitions.append(group_df.iloc[s, :])
        start += proportion
    return pd.Series(partitions)
partitioned_schema = dd.utils.make_meta(
[(i, object) for i in range(len(output_ratio))],
pd.Index([], name='image_class_id'))
partitioned_df = images_df.groupby('image_class_id')
partitioned_df = partitioned_df.apply(
partition_class, meta=partitioned_schema, ratio=output_ratio)
def zip_partitions(partitions_df):
    partitions = []
    for i in partitions_df.columns:
        partitions.append(pd.concat(partitions_df[i].tolist()))
    return pd.Series(partitions)
zipped_schema = dd.utils.make_meta((None, object))
partitioned_ds = partitioned_df.reduction(
chunk=zip_partitions, meta=zipped_schema)
I think it should be possible to combine both the reduction and the apply into a single custom aggregation that represents a map-reduce operation.
However, I could not figure out how to do that with a custom aggregation, since it uses a series groupby.

Duplicating rows in dataframe python

Good afternoon everyone,
I am currently writing a thesis on the KMV model in python. I took inspiration from the code here to solve the non-linear equations. Here is the link to the CSV file used to create the dataframe. And this is the code I have so far:
Import the required modules
from datetime import datetime
import pandas as pd
import numpy as np
import scipy.optimize as sco
from scipy.stats import norm
df = pd.DataFrame()
df = pd.read_csv("AREX.csv", sep=';', engine = "python", decimal=',')
Functions to prepare the file for the model to run
def clean():
    # df.rename(columns={"Date": "Date"}, inplace=True)
    # df["Date"] = pd.to_datetime(df['Date'])
    df.set_index("Date", inplace=True)
    df['AREX.O'] = df['AREX.O'].astype(float)
    df.drop(['Total Short Term debt'], axis=1, inplace=True)
    return df
def preparation():
    df['e'] = df['AREX.O']*df['Share Outstanding']
    df['Short Term Debt'] = df['Debt']-df['Total Long term Debt']
    df['f'] = df['Short Term Debt']+df['Total Long term Debt']*0.5
    df['log_ret'] = np.log(df['AREX.O']) - np.log(df['AREX.O'].shift(1))
    # df['stdev'] = df['log_ret'].rolling(252).std()*m.sqrt(252)
    return df
Algorithm used to solve for a and sigma_a.
I only tried to adapt the code to my dataframe here
def algo1():
    # formatting the values as required
    df["f"] = df["f"].astype(float)
    df["e"] = df["e"].astype(float)
    # computation of the key input variable for the model
    df['a'] = df['f'].add(df["e"])
    # defining a function for the Black-Scholes equation
    def bseqn(a, debug=False):
        d1 = (np.log(a/f) + (r + 0.5*sigma_a**2)*T)/(sigma_a*np.sqrt(T))
        d2 = d1 - sigma_a*np.sqrt(T)
        y1 = e - (a*norm.cdf(d1) - np.exp(-r*T)*f*norm.cdf(d2))
        if debug:
            print("d1 = {:.6f}".format(d1))
            print("d2 = {:.6f}".format(d2))
            print("Error = {:.6f}".format(y1))
        return y1
    # Solving the model
    time_horizon = [1]
    timesteps = range(1, len(df))
    results = np.empty((df.shape[0], len(time_horizon)))
    # looping to solve for each row
    for i, years in enumerate(time_horizon):
        T = 1
        results[:, i] = df.loc[:, 'a']
        for i_t, t in enumerate(timesteps):
            a = results[t-10:t, i]
            ra = np.log(a/np.roll(a, 1))
            sigma_a = np.nanstd(ra)  # gives initial value of sigma_a
            if i_t == 0:
                subset_timesteps = range(t-1, t+1)
                print(subset_timesteps)
            else:
                subset_timesteps = [t]
            n_its = 0
            while n_its < 10:
                n_its += 1
                for t_sub in subset_timesteps:
                    r = df.iloc[t_sub]['r']
                    f = df.iloc[t_sub]['f']
                    e = df.iloc[t_sub]['e']
                    sol = sco.fsolve(bseqn, results[t_sub, i])  # if I replace newton with fsolve the code works properly
                    results[t_sub, i] = sol  # stores the new values of a
                # Update sigma_a based on new values of a
                last_sigma_a = sigma_a
                a = results[t-10:t, i]
                ra = np.log(a/np.roll(a, 1))
                sigma_a = np.nanstd(ra)  # new value of sigma
                diff = last_sigma_a - sigma_a
                if abs(diff) < 1e-3:
                    df.loc[t_sub, 'sigma_a'] = sigma_a
                    break
                else:
                    pass
    return df
Run function
def run():
    clean()
    preparation()
    algo1()
    print(df)
    print(list(df))
    # main_df = df.to_csv("AREX_D.csv")
The output should write the results of sigma_a into the created sigma_a column, but instead it adds rows, so instead of 1500 rows I end up with 3000 rows, most of them NaN values. I do not understand where the code asks for that...
I suspect it to come from these lines:
diff = last_sigma_a - sigma_a
if abs(diff) < 1e-3:
    df.loc[t_sub, 'sigma_a'] = sigma_a
    break
Does anyone have any insight into what is happening?
Thank you very much!
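For reference, a minimal sketch of the behaviour that would explain the doubling (an assumption, not a confirmed diagnosis): after clean() the frame is indexed by 'Date', so df.loc[t_sub, 'sigma_a'] with an integer t_sub matches no existing label, and pandas enlarges the frame by appending a new, mostly-NaN row labelled t_sub. Writing through the existing row label avoids this:
# assumes t_sub is a positional row number, as in algo1 above
df.loc[df.index[t_sub], 'sigma_a'] = sigma_a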

appending to a pandas dataframe

I want to make a pandas dataframe with two columns: read_id and score.
I am using the following code:
reads_array = []
for x in Bio.SeqIO.parse("inp.fasta", "fasta"):
    reads_array.append(x)
columns = ["read_id","score"]
df = pd.DataFrame(columns = columns)
df = df.fillna(0)
for x in reads_array:
    alignments = pairwise2.align.globalms("ACTTGAT", str(x.seq), 2, -1, -.5, -.1)
    sorted_alignments = sorted(alignments, key=operator.itemgetter(2), reverse=True)
    read_id = x.name
    score = sorted_alignments[0][2]
    df['read_id'] = read_id
    df['score'] = score
But this does not work. Can you suggest a way of generating the dataframe df?
At the top make sure you have
import numpy as np
Then replace the code you shared with
reads_array = []
for x in Bio.SeqIO.parse("inp.fastq", "fastq"):
    reads_array.append(x)
df = pd.DataFrame(np.zeros((len(reads_array), 2)), columns=["read_id", "score"])
for index, x in enumerate(reads_array):
    alignments = pairwise2.align.globalms("ACTTGAT", str(x.seq), 2, -1, -.5, -.1)
    sorted_alignments = sorted(alignments, key=operator.itemgetter(2), reverse=True)
    read_id = x.name
    score = sorted_alignments[0][2]
    df.loc[index, 'read_id'] = read_id
    df.loc[index, 'score'] = score
There were two main problems with your original code:
1) Your dataframe had 0 rows
2) df['column_name'] refers to the entire column, not a single cell, so when you execute df['column_name'] = value, all cells in that column get set to that value
df['read_id'] and df['score'] are Series. So if you want to iterate over reads_array, calculate some value, and then assign it to df's columns, try the following:
for i, x in enumerate(reads_array):
    ...
    df.loc[i, 'read_id'] = read_id
    df.loc[i, 'score'] = score
