Function to split & expand returning NameError - python

def unique_unit_split(df):
    df_unit_list = df_master.loc[df_master['type'] == 'unit']
    df_unit_list = df_unit_list.key.tolist()
    for i in range(len(df_unit_list)):
        df_unit_list[i] = int(df_unit_list[i])
    split_1 = df_units.units.str.split('[","]',expand=True).stack()
    df_units_update = df_units.join(pd.Series(index=split_1.index.droplevel(1), data=split_1.values, name='unit_split'))
    df_units_final = df_units_update[df_units_update['unit_split'].isin(df_unit_list)]
    return(df)
Updated script (still not working):
df_unit_list = []
split_1 = pd.DataFrame()
df_units_update = pd.DataFrame()
df_units_final = pd.DataFrame()

def unique_unit_split(df):
    df_unit_list = df_master.loc[df_master['type'] == 'unit']
    df_unit_list = df_unit_list.key.tolist()
    for i in range(len(df_unit_list)):
        df_unit_list[i] = int(df_unit_list[i])
    split_1 = df_units.units.str.split('[","]',expand=True).stack()
    df_units_update = df_units.join(pd.Series(index=split_1.index.droplevel(1), data=split_1.values, name='unit_split'))
    df_units_final = df_units_update[df_units_update['unit_split'].isin(df_unit_list)]
    return(df)
The above function originally worked when the two actions were split up (the code up to and including the for loop was in one function, and everything from split_1 down was in another). Now that I have tried to condense them into one function, I am getting a NameError (image attached). Does anyone know how I can resolve this issue and ensure my final df (df_units_final) is defined?
For more insight into this function: I have a df with comma-separated values in one column, and I needed to split that column, drop the [], and keep only the rows with the numbers I need, which are defined in the list df_unit_list.
NameError Details

The issue was as stated above (df_units_final was not being defined or returned), AND my for loop was forcing the list entries to int when the matching values in the other df were actually strings.
Working Code
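A minimal sketch of the corrected function, assuming df_master and df_units are the frames referenced above: the keys are kept as strings (so they match the split values) and df_units_final is actually returned.

import pandas as pd

def unique_unit_split(df_master, df_units):
    # Keep the keys as strings -- no int() cast -- so they match the
    # string values produced by the split below
    df_unit_list = df_master.loc[df_master['type'] == 'unit'].key.tolist()
    # Same split pattern as above; stack() yields one split value per row
    split_1 = df_units.units.str.split('[","]', expand=True).stack()
    df_units_update = df_units.join(
        pd.Series(index=split_1.index.droplevel(1),
                  data=split_1.values, name='unit_split'))
    # Keep only the rows whose split value appears in the key list
    df_units_final = df_units_update[df_units_update['unit_split'].isin(df_unit_list)]
    return df_units_final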

Python Pandas - specific format of text with hyphen

I need to change multiple pandas dataframe columns that do not follow a specific format like Name-ID-Date, so that they all follow the same format. I have attached the input and the corrected output format as images.
I have written some code that looks at all the columns in the dataframe; if a value follows the format, it separates the data into three different columns, but if it does not follow the specific Name-ID-Date format, the code is not able to proceed. Any help will be highly appreciated.
dff = df[['PPS_REQ','Candidate1', 'Candidate2',
'Candidate3', 'Candidate4', 'Candidate5', 'Candidate6', 'Candidate7',
'Candidate8', 'Candidate9','Candidate10', 'Candidate11', 'Candidate12',
'Candidate13', 'Candidate14', 'Candidate15', 'Candidate16',
'Candidate17', 'Candidate18', 'Candidate19', 'Candidate20','Candidate21',
'Candidate22','Candidate23','Candidate24','Candidate25','Candidate26','Candidate27','Candidate28']]
all_candiadates = ['Candidate1', 'Candidate2',
'Candidate3', 'Candidate4', 'Candidate5', 'Candidate6', 'Candidate7',
'Candidate8', 'Candidate9','Candidate10', 'Candidate11', 'Candidate12',
'Candidate13', 'Candidate14', 'Candidate15', 'Candidate16',
'Candidate17', 'Candidate18', 'Candidate19', 'Candidate20','Candidate21',
'Candidate22','Candidate23','Candidate24','Candidate25','Candidate26','Candidate27','Candidate28']#,'Candidate29','Candidate30','Candidate31','Candidate32','Candidate33','Candidate34','Candidate35','Candidate36','Candidate37','Candidate38']
blank = pd.DataFrame()
for index, row in dff.iterrows():
    for c in all_candiadates:
        print('the value of c :',c)
        candidate = dff[['PPS_REQ',c]]
        candidate[['Name','Id','Sdate']] = candidate[c].str.split('-',n=-1,expand=True)
        blank = blank.append(candidate)
Thank you
I have done a workaround in the code, something like below, but the problem I am facing is with this part of the code:
candidate['Sdate'] = candidate[c].str.extract('(../..)', expand=True)
Here, if the date is 11/18 it works fine, but if the date is 11/8 it returns NaN (see the sketch after the code below for why).
for index, row in dff.iterrows():
    for c in all_candiadates:
        print('the value of c :',c)
        candidate = dff[['PPS_REQ',c]]
        candidate['Sdate'] = candidate[c].str.extract(r'(../..)', expand=True)
        candidate['Id'] = candidate[c].str.extract(r'(\d\d\d\d\d\d\d)', expand=True)
        candidate['Name'] = candidate[c].str.extract(r'([a-zA-Z ]*)\d*.*', expand=False)
        # candidate[['Name','Id','Sdate']] = candidate[c].str.split('-',n=-1,expand=True)
        blank = blank.append(candidate)
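The root cause is that (../..) requires exactly two characters on each side of the slash, so 11/8 can never match. A minimal sketch (with made-up sample values) of a pattern that tolerates one- or two-digit parts:

import pandas as pd

s = pd.Series(['John Smith-1234567-11/18', 'Jane Doe-7654321-11/8'])
# \d{1,2} matches one or two digits, so 11/8 is captured as well as 11/18
print(s.str.extract(r'(\d{1,2}/\d{1,2})', expand=True))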
This is finally fixed; I am adding it here in case it is useful for someone else.
blank = pd.DataFrame()
#for index, row in dff.iterrows():
for c in all_candiadates:
    # print('the value of c :',c)
    try:
        candidate = dff[['PPS_REQ',c]]
        candidate = candidate[candidate[c].str.contains('FILL|Reopen|Fill|REOPEN|Duplicate|reopen|FILED|fill') != True]
        candidate = candidate.loc[(candidate[c] != "")]
        candidate['Sdate'] = candidate[c].str.extract(r'(\d+/\d+)', expand=True)
        candidate['Id'] = candidate[c].str.extract(r'(\d\d\d\d\d\d\d)', expand=True)
        candidate['Name'] = candidate[c].str.extract(r'([a-zA-Z ]*)\d*.*', expand=False)
        # candidate[['Name','Id','Sdate']] = candidate[c].str.split('-',n=-1,expand=True)
        blank = blank.append(candidate)
    except:
        pass
blank = blank[['PPS_REQ', 'Name','Id','Sdate']]
bb = blank.drop_duplicates()
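As a side note, DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; a minimal sketch of the same loop (identifiers as above), collecting the pieces in a list and concatenating once:

frames = []
for c in all_candiadates:
    try:
        candidate = dff[['PPS_REQ', c]].copy()  # copy() avoids SettingWithCopyWarning
        candidate = candidate[~candidate[c].str.contains(
            'FILL|Reopen|Fill|REOPEN|Duplicate|reopen|FILED|fill', na=False)]
        candidate = candidate.loc[candidate[c] != ""]
        candidate['Sdate'] = candidate[c].str.extract(r'(\d+/\d+)', expand=True)
        candidate['Id'] = candidate[c].str.extract(r'(\d{7})', expand=True)
        candidate['Name'] = candidate[c].str.extract(r'([a-zA-Z ]*)\d*.*', expand=False)
        frames.append(candidate)
    except KeyError:
        pass

blank = pd.concat(frames)[['PPS_REQ', 'Name', 'Id', 'Sdate']]
bb = blank.drop_duplicates()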

Creating a Python function to change the sequence of columns

I am able to change the sequence of columns using the code below, which I found on Stack Overflow. Now I am trying to convert it into a function for regular use, but it doesn't seem to do anything. PyCharm says the local variable df_name value is not used on the last line of my function.
Working Code
columnsPosition = list(df.columns)
F, H = columnsPosition.index('F'), columnsPosition.index('H')
columnsPosition[F], columnsPosition[H] = columnsPosition[H], columnsPosition[F]
df = df[columnsPosition]
My Function - doesn't work, need to make this work
def change_col_seq(df_name, old_col_position, new_col_position):
    columnsPosition = list(df_name.columns)
    F, H = columnsPosition.index(old_col_position), columnsPosition.index(new_col_position)
    columnsPosition[F], columnsPosition[H] = columnsPosition[H], columnsPosition[F]
    df_name = df_name[columnsPosition] # pycharm has issue on this line
I have tried adding a return to the last statement of the function, but I am unable to make it work.
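For what it's worth, a minimal sketch of the direct fix: df_name = df_name[columnsPosition] only rebinds the local name inside the function, so the reordered frame has to be returned and assigned at the call site.

import pandas as pd

def change_col_seq(df_name, old_col_position, new_col_position):
    columnsPosition = list(df_name.columns)
    i, j = columnsPosition.index(old_col_position), columnsPosition.index(new_col_position)
    columnsPosition[i], columnsPosition[j] = columnsPosition[j], columnsPosition[i]
    return df_name[columnsPosition]  # return the reordered frame

# Hypothetical usage -- the result must be assigned back:
df = pd.DataFrame({'F': [1], 'G': [2], 'H': [3]})
df = change_col_seq(df, 'F', 'H')
print(df.columns.tolist())  # ['H', 'G', 'F']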
To re-order the Columns
To change the position of 2 columns:
def change_col_seq(df_name:pd.DataFrame, old_col_position:str, new_col_position:str):
    df_name[new_col_position], df_name[old_col_position] = df_name[old_col_position].copy(), df_name[new_col_position].copy()
    df = df_name.rename(columns={old_col_position:new_col_position, new_col_position:old_col_position})
    return df
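Hypothetical usage, for illustration (note this variant swaps the two columns' data in place and then swaps the names back, so the frame passed in is modified):

df = pd.DataFrame({'A': [1], 'F': [2], 'H': [3]})
df = change_col_seq(df, 'F', 'H')
print(df.columns.tolist())  # ['A', 'H', 'F'] -- positions swapped, data intact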
To Rename the Columns
You can use the rename method (Documentation)
If you want to change the name of just one column:
def change_col_name(df_name, old_col_name:str, new_col_name:str):
    df = df_name.rename(columns={old_col_name: new_col_name})
    return df
If you want to change the names of multiple columns:
def change_col_name(df_name, old_col_name:list, new_col_name:list):
    df = df_name.rename(columns=dict(zip(old_col_name, new_col_name)))
    return df

Strange difference in behavior of Pandas dataframe on small & large scale

I have a dataframe read from a CSV file. I need to generate new data and append it to the end of the old data.
But strangely, it shows a totally different result when comparing the small scale and the large scale. I guess it may relate to views, copy(), and chained assignment.
I tried two options that use DataFrame.copy() to avoid potential problems.
First option:
d_jlist = pd.read_csv('127case.csv', sep=',') #got the data shape (46355,48) from the CSV file
d_jlist2 = d_jlist.copy() #use a deep copy, in case the raw data gets changed
d_jlist3 = pd.DataFrame()
a = np.random.choice(range(5,46350),size = 1000*365) #select from row 5 to row 46350
for i in a:
    d_jlist3 = d_jlist3.append(d_jlist.iloc[i].copy() +np.random.uniform(-1,1) )
d_jlist3 = d_jlist3.replace(0,0.001,regex=True)
d_jlist3 = d_jlist3.round(3)
d_jlist = d_jlist.append(d_jlist3)
a = consumption.columns.values #Something to do with header
a = a[5:53]
d_jlist.to_csv('1127case_1.csv',header = a,index=False)
Second option:
d_jlist = pd.read_csv('127case.csv', sep=',')
d_jlist2 = d_jlist.copy()
d_jlist3 = pd.DataFrame()
a = np.random.choice(range(5,46350),size = 1000*365)
for i in a:
    d_jlist3 = d_jlist3.append(d_jlist2.iloc[i] +np.random.uniform(-1,1) )
d_jlist3 = d_jlist3.replace(0,0.001,regex=True)
d_jlist3 = d_jlist3.round(3)
d_jlist = d_jlist.append(d_jlist3)
a = consumption.columns.values #Something to do with header
a = a[5:53]
d_jlist.to_csv('1117case_2.csv',header = a,index=False)
The problem is that on a small scale this code works as expected: new rows are added after the old ones, and nothing in the old data changes.
However, at the scale above (1000*365), the old rows get changed.
The strange thing is that only the first two columns of each row stay unchanged; all the remaining columns of each row get changed.
The results: the left one is the old dataframe, with shape (46356,48); below it are the newly generated data. The right one is the result from option 1 (both options give the same result); from the third column on, the old data got changed.
If I try either of the options on a smaller scale (3 rows), it is fine and all the old data are kept.
d_jlist = pd.read_csv('127case.csv', sep=',')
d_jlist = d_jlist.iloc[:10] #Only select 10 rows from old ones
d_jlist2 = d_jlist.copy()
d_jlist3 = pd.DataFrame()
a = np.random.choice(range(5,6),size = 3) #Only select 3 rows randomly from old data
for i in a:
    d_jlist3 = d_jlist3.append(d_jlist2.iloc[i] +np.random.uniform(-1,1) )
d_jlist3 = d_jlist3.replace(0,0.001,regex=True)
d_jlist3 = d_jlist3.round(3)
d_jlist = d_jlist.append(d_jlist3)
a = consumption.columns.values #Something to do with header
a = a[5:53]
d_jlist.to_csv('1117case_2.csv',header = a,index=False)
How can I understand this? I have spent a lot of time trying to find an explanation for this but failed.
Do some rules in Pandas change when the scale gets larger (to the 365K level)?
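Not a definitive diagnosis, but row-by-row append at this scale is slow and easy to get wrong: among other things, iloc[i] keeps the original row label, so the appended frame ends up with duplicate index labels. A minimal sketch, using the same identifiers and assuming the shifted columns are numeric, that draws all the new rows in one vectorized step and concatenates once:

import numpy as np
import pandas as pd

d_jlist = pd.read_csv('127case.csv', sep=',')

a = np.random.choice(range(5, 46350), size=1000*365)
new_rows = d_jlist.iloc[a].reset_index(drop=True)  # fresh 0..n-1 labels
# one uniform offset per new row, added across the whole row (as in the loop)
offsets = pd.Series(np.random.uniform(-1, 1, size=len(new_rows)))
new_rows = new_rows.add(offsets, axis=0)
new_rows = new_rows.replace(0, 0.001).round(3)

d_jlist = pd.concat([d_jlist, new_rows], ignore_index=True)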

Merging two dataframes based on column values available in both dataframes, where the column name is not known in advance

Here is my code. Basically, I am trying to figure out the common rows based on a column called 'PolicyNum' (which has to be a variable, and therefore I cannot use the df.PolicyNum.isin method).
While you are welcome to suggest a better way to do it, I am also wondering why the lengths of the two dataframes (printed in lines 11 and 12) would be different.
import pandas as pd
def getPreparedDataForComparison( baseDataFrame, secondaryDataFrame, sourceName, indexKey):
    baseDataFrameCommon = baseDataFrame[(baseDataFrame[indexKey].isin(secondaryDataFrame[indexKey]) == False)]
    baseDataFrameCommon['COMBO'] = baseDataFrameCommon.apply(lambda x :','.join(x.astype(str)),axis=1)
    baseDataFrameCommon['DataSource'] = sourceName
    return baseDataFrameCommon

def compareDataFrames(dataframe1, dataframe2, indexKey):
    dataframe1Common = getPreparedDataForComparison(dataframe1, dataframe2, 'TXT', indexKey)
    dataframe2Common = getPreparedDataForComparison(dataframe2, dataframe1, 'SQL', indexKey)
    print(len(dataframe1Common))
    print(len(dataframe2Common))

def sampleData1():
    cols = ['PolicyNum','firsttransactiondate','subsequentbonustotalcumulative','subsequentpremiumtotalcumulative','totalautocumulative','totalautoposteffectivedatecumulative','totalpartialcumulative','totalpartialposteffectivedatecumulative']
    sourceData = [ ('E001','#1985-01-01#',100,100,100,100,100,100),
                   ('E002','#1985-01-01#',200,200,200,200,200,200),
                   ('E003','#1985-01-01#',100,100,100,100,100,100),
                   ('E004','#1985-01-01#',100,100,100,100,100,100),
                   ('E005','#1985-01-01#',100,100,100,100,100,100),
                   ('E201','#1985-01-01#',100,100,100,100,100,100),
                   ('E202','#1985-01-01#',100,100,100,100,100,100),
                   ('1006','#1985-01-01#',100,100,100,100,100,100),
                   ('1007','#1985-01-01#',100,100,100,100,100,100),
                   ('1008','#1985-01-01#',100,100,100,100,100,100),]
    x = pd.DataFrame(sourceData,columns=cols)
    return x

def sampleData2():
    cols = ['PolicyNum','firsttransactiondate','subsequentbonustotalcumulative','subsequentpremiumtotalcumulative','totalautocumulative','totalautoposteffectivedatecumulative','totalpartialcumulative','totalpartialposteffectivedatecumulative']
    sourceData = [ ('E001','#1985-01-01#',100,300,100,100,100,400),
                   ('E002','#1985-01-01#',200,200,200,200,200,200),
                   ('E005','#1989-01-01#',100,100,100,100,100,100),
                   ('E105','#1989-01-01#',100,100,100,100,100,100),
                   ('1106','#1985-01-01#',100,100,100,100,900,100),
                   ('1006','#1985-01-01#',100,100,100,100,900,100),
                   ('1007','#1985-01-01#',100,100,100,100,100,100),]
    x = pd.DataFrame(sourceData,columns=cols)
    return x
compareDataFrames(sampleData1(),sampleData2(),'PolicyNum')
Not the best way, I think, but this is what I ended up doing. Pandas junkies can have a dig at it.
def getPreparedDataForComparison( baseDataFrame, secondaryDataFrame, sourceName, indexKey):
    baseDataFrame['DataSource'] = sourceName
    baseDataFrame['Common'] = np.where((baseDataFrame[indexKey].isin( list(secondaryDataFrame[indexKey]))),True,False)
    baseDataFrameCommon = baseDataFrame.loc[baseDataFrame['Common'] == True]
    baseDataFrameCommon.drop(['Common'],axis=1,inplace=True)
    baseDataFrameCommon['COMBO'] = baseDataFrameCommon.apply(lambda x :','.join(x.astype(str)),axis=1)
    baseDataFrame = None
    secondaryDataFrame = None
    return baseDataFrameCommon
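For what it's worth, the two printed lengths in the original code differ because isin(...) == False keeps the rows whose PolicyNum is not in the other frame, and the two frames simply have different numbers of unmatched policies. A minimal sketch of the common-row selection as a plain inner merge on the variable key (sample-data names as above):

import pandas as pd

def common_rows(df1, df2, index_key):
    # Inner merge on just the key column keeps the rows of df1 whose key
    # also appears in df2; drop_duplicates guards against repeated keys in df2
    keys = df2[[index_key]].drop_duplicates()
    return df1.merge(keys, on=index_key, how='inner')

common1 = common_rows(sampleData1(), sampleData2(), 'PolicyNum')
common2 = common_rows(sampleData2(), sampleData1(), 'PolicyNum')
print(len(common1), len(common2))  # 5 5 for the sample data above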

Python looping and Pandas rank/index quirk

This question pertains to the one posted here:
Sort dataframe rows independently by values in another dataframe
In the linked question, I use a Pandas DataFrame to sort each row independently, using values in another Pandas DataFrame. The function presented there works perfectly every time it is called directly. For example:
import pandas as pd
import numpy as np
import os
## Generate example dataset
d1 = {}
d2 = {}
d3 = {}
d4 = {}

## generate data:
np.random.seed(5)
for col in list("ABCDEF"):
    d1[col] = np.random.randn(12)
    # np.random.random_integers was removed from NumPy;
    # randint(0, 101, 12) draws from the same inclusive 0..100 range
    d2[col+'2'] = np.random.randint(0, 101, 12)
    d3[col+'3'] = np.random.randint(0, 101, 12)
    d4[col+'4'] = np.random.randint(0, 101, 12)
t_index = pd.date_range(start = '2015-01-31', periods = 12, freq = "M")

# place data into dataframes
dat1 = pd.DataFrame(d1, index = t_index)
dat2 = pd.DataFrame(d2, index = t_index)
dat3 = pd.DataFrame(d3, index = t_index)
dat4 = pd.DataFrame(d4, index = t_index)
## Functions
def sortByAnthr(X,Y,Xindex, Reverse=False):
    # order the subset of X.index by Y
    ordrX = [x for (x,y) in sorted(zip(Xindex,Y), key=lambda pair: pair[1],reverse=Reverse)]
    return(ordrX)

def OrderRow(row,df):
    # .loc replaces the long-removed .ix; row.name is the date label and
    # row.values holds the already-ordered column names
    ordrd_row = df.loc[row.dropna().name,row.dropna().values].tolist()
    return(ordrd_row)

def r_selectr(dat2,dat1, n, Reverse=False):
    ordr_cols = dat1.apply(lambda x: sortByAnthr(x,dat2.loc[x.name,:],x.index,Reverse),axis=1).iloc[:,-n:]
    ordr_cols.columns = list(range(0,n)) #assign interpretable column names
    ordr_r = ordr_cols.apply(lambda x: OrderRow(x,dat1),axis=1)
    return([ordr_cols, ordr_r])
## Call functions
ordr_cols2,ordr_r = r_selectr(dat2,dat1,5)
##print output:
print("Ordering set:\n",dat2.iloc[-2:,:])
print("Original set:\n", dat1.iloc[-2:,:])
print("Column ordr:\n",ordr_cols2.iloc[-2:,:])
As can be checked, the columns of dat1 are correctly ordered according to the values in dat2.
However, when called from a loop over dataframes, it no longer ranks/indexes correctly and produces completely dubious results. Although I have not been able to recreate the problem with the reduced version presented here, the idea should be the same.
## Loop test:
out_list = []
data_dicts = {'dat2':dat2, 'dat3': dat3, 'dat4':dat4}
for i in range(3):
    # this outer for loop supplies different parameter values to a wrapper
    # function that calls r_selectr.
    for key in data_dicts.keys():
        ordr_cols,_ = r_selectr(data_dicts[key], dat1,5)
        out_list.append(ordr_cols)
        # do stuff here
#print output:
print("Ordering set:\n",dat3.iloc[-2:,:])
print("Column ordr:\n",ordr_cols2.iloc[-2:,:])
In my code (almost completely analogous to the example given here), ordr_cols is no longer ordered correctly for any of the sorting dataframes.
I currently solve the issue by separating the ordering and indexing operations of r_selectr into two separate functions. That, for some reason, resolves the issue, though I have no idea why.
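Not a diagnosis of the loop quirk, but a side-effect-free sketch of the same row-wise ordering built on np.argsort, assuming the ordering frame's columns line up positionally with dat1's (as A/A2 etc. do above):

import numpy as np
import pandas as pd

def r_selectr_argsort(order_df, value_df, n):
    # per-row ascending order of the ordering values; keep the top-n positions,
    # mirroring the original's iloc[:,-n:]
    order = np.argsort(order_df.values, axis=1)[:, -n:]
    cols = np.array(value_df.columns)
    ordr_cols = pd.DataFrame(cols[order], index=value_df.index, columns=range(n))
    ordr_vals = pd.DataFrame(np.take_along_axis(value_df.values, order, axis=1),
                             index=value_df.index, columns=range(n))
    return ordr_cols, ordr_vals

ordr_cols2, ordr_r = r_selectr_argsort(dat2, dat1, 5)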
