Python Pandas - specific format of text with hyphen

I need to change multiple pandas dataframe columns that do not follow a specific format like Name-ID-Date so that they all follow the same format. I have attached the input and the corrected output format as images.
I have written some code that looks at all the columns in the dataframe and, if a value follows the format, separates the data into 3 different columns; but if it does not follow the specific Name-ID-Date format, the code is not able to proceed. Any help will be highly appreciated.
dff = df[['PPS_REQ', 'Candidate1', 'Candidate2',
          'Candidate3', 'Candidate4', 'Candidate5', 'Candidate6', 'Candidate7',
          'Candidate8', 'Candidate9', 'Candidate10', 'Candidate11', 'Candidate12',
          'Candidate13', 'Candidate14', 'Candidate15', 'Candidate16',
          'Candidate17', 'Candidate18', 'Candidate19', 'Candidate20', 'Candidate21',
          'Candidate22', 'Candidate23', 'Candidate24', 'Candidate25', 'Candidate26', 'Candidate27', 'Candidate28']]
all_candiadates = ['Candidate1', 'Candidate2',
                   'Candidate3', 'Candidate4', 'Candidate5', 'Candidate6', 'Candidate7',
                   'Candidate8', 'Candidate9', 'Candidate10', 'Candidate11', 'Candidate12',
                   'Candidate13', 'Candidate14', 'Candidate15', 'Candidate16',
                   'Candidate17', 'Candidate18', 'Candidate19', 'Candidate20', 'Candidate21',
                   'Candidate22', 'Candidate23', 'Candidate24', 'Candidate25', 'Candidate26', 'Candidate27', 'Candidate28']  # ,'Candidate29','Candidate30','Candidate31','Candidate32','Candidate33','Candidate34','Candidate35','Candidate36','Candidate37','Candidate38']
blank = pd.DataFrame()
for index, row in dff.iterrows():
    for c in all_candiadates:
        print('the value of c :', c)
        candidate = dff[['PPS_REQ', c]]
        candidate[['Name', 'Id', 'Sdate']] = candidate[c].str.split('-', n=-1, expand=True)
        blank = blank.append(candidate)
Thank you
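One hedged workaround sketch, not the original poster's code: test the Name-ID-Date pattern first and only split the rows that match, so malformed entries never reach the split. The sample values below are made up and only stand in for one candidate column.
import pandas as pd

s = pd.Series(['John Doe-1234567-11/18', 'FILLED', 'Jane Roe-7654321-11/8'])

# keep only entries that look like Name-ID-Date before splitting
mask = s.str.match(r'^[^-]+-\d+-\d+/\d+$', na=False)
parts = s[mask].str.split('-', expand=True)
parts.columns = ['Name', 'Id', 'Sdate']
print(parts)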

I have done a workaround in the code, something like below, but the problem I am facing is with this part of the code:
candidate['Sdate'] = candidate[c].str.extract('(../..)', expand=True)
Here, if the date is 11/18 it works fine, but if the date is 11/8 it returns NaN.
for index, row in dff.iterrows():
    for c in all_candiadates:
        print('the value of c :', c)
        candidate = dff[['PPS_REQ', c]]
        candidate['Sdate'] = candidate[c].str.extract('(../..)', expand=True)
        candidate['Id'] = candidate[c].str.extract('(\d\d\d\d\d\d\d)', expand=True)
        candidate['Name'] = candidate[c].str.extract('([a-zA-Z ]*)\d*.*', expand=False)
        # candidate[['Name','Id','Sdate']] = candidate[c].str.split('-',n=-1,expand=True)
        blank = blank.append(candidate)

Finally, this is fixed; just adding it here in case it is useful for someone else.
blank = pd.DataFrame()
# for index, row in dff.iterrows():
for c in all_candiadates:
    # print('the value of c :', c)
    try:
        candidate = dff[['PPS_REQ', c]]
        candidate = candidate[candidate[c].str.contains('FILL|Reopen|Fill|REOPEN|Duplicate|reopen|FILED|fill') != True]
        candidate = candidate.loc[(candidate[c] != "")]
        candidate['Sdate'] = candidate[c].str.extract('(\d+/\d+)', expand=True)
        candidate['Id'] = candidate[c].str.extract('(\d\d\d\d\d\d\d)', expand=True)
        candidate['Name'] = candidate[c].str.extract('([a-zA-Z ]*)\d*.*', expand=False)
        # candidate[['Name','Id','Sdate']] = candidate[c].str.split('-',n=-1,expand=True)
        blank = blank.append(candidate)
    except:
        pass
blank = blank[['PPS_REQ', 'Name', 'Id', 'Sdate']]
bb = blank.drop_duplicates()
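A minimal illustration of why the extraction pattern had to change (the sample values are made up; only the str.extract behaviour is being shown):
import pandas as pd

s = pd.Series(['John Doe-1234567-11/18', 'Jane Roe-7654321-11/8'])
# '(../..)' demands exactly two characters on each side of the slash,
# so '11/8' never matches and extract() returns NaN
print(s.str.extract('(../..)', expand=True))
# '(\d+/\d+)' accepts one or more digits on either side and matches both dates
print(s.str.extract(r'(\d+/\d+)', expand=True))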

Related

How to convert a data value to another string value using a pandas dataframe?

I want to ask how to convert a data value into another string value, going from table 1 to table 2. Here's my explanation.
This is my Python function:
def freq_action_requirement_measurement(keyword=stopwords_ucd1_topic.action, requirement=stopwords_freq_topic.requirement):
    disambiguation_df = []
    for angka in range(0, len(requirement)):
        a = [cosine_similarity(requirement[angka], keyword[num]) for num in range(0, len(keyword))]
        disambiguation_df.append(a)
    hasil_disambiguation = pd.DataFrame(disambiguation_df, index=requirement, columns=keyword)
    return hasil_disambiguation
Using that function, I call it like this:
list_action_requirement3 = freq_action_requirement_measurement()
and get data like this:
[image of table 1, the similarity values]
I want to get from that table 1 to table 2, which results in a table of usecase and threshold values. Here's my code:
data_d1 = []
df_stop1 = stopwords_ucd1_topic
for idx, angka in enumerate(list_action_requirement3):
    for jdx, num in enumerate(list_action_requirement3.iloc[:, idx]):
        if num >= 0.1:
            d1 = df_stop1[df_stop1.action == angka].iloc[0].usecase
            # d1 = len(df_stop1[df_stop1.action == angka])
            data_d1.append([d1, num])
df_b = pd.DataFrame(data_d1, columns=['usecase', 'threshold'])
print(tabulate(df_b, headers='keys', tablefmt='psql'))
and the result looks like this:
[image of the resulting usecase/threshold table]
From that threshold value, I want to change the value data into string data, as a usecase data column, like this:
[image of table 2, the desired output]
Please help me, thank you.
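Without the images the exact target layout is a guess, but a hedged sketch of the "similarity value to usecase string" step could look like this. It simply restates the poster's double loop with a lookup table; every name and number below is made up and only stands in for list_action_requirement3 and stopwords_ucd1_topic.
import pandas as pd

# stand-in for list_action_requirement3: rows = requirements, columns = actions
sim = pd.DataFrame([[0.05, 0.30],
                    [0.20, 0.08]],
                   index=['req1', 'req2'], columns=['login', 'register'])

# stand-in for the action -> usecase lookup held in stopwords_ucd1_topic
action_to_usecase = {'login': 'UC01 Login', 'register': 'UC02 Register'}

# keep only cells above the threshold and replace the numeric value
# with the usecase label that belongs to that action column
records = [(action_to_usecase[action], value)
           for action in sim.columns
           for value in sim[action]
           if value >= 0.1]
df_b = pd.DataFrame(records, columns=['usecase', 'threshold'])
print(df_b)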
Here's also another Q&A from another forum.

Even after adding a column to a dataframe, the shape remains the same, which means I am not able to add a column to my dataframe

This is my piece of code:
def segregate_files(self, list_of_csv, each_sub_folder):
    new_list_of_csv = []
    for each_csv in list_of_csv:
        pattern = f"{each_sub_folder}/(.*?)/"
        self.data_centre = re.search(pattern, each_csv).group(1)
        if "org_dashboards/" in each_csv:
            each_csv = each_csv.replace("org_dashboards/", f"{self.file_path}/")
        else:
            each_csv = each_csv.replace("dashboards/", f"{self.file_path}/")
        df = pd.read_csv(each_csv)
        print(df.shape)
        df["Data Centre"] = self.data_centre
        print(df.shape)
        df.to_csv(each_csv)
        new_list_of_csv.append(each_csv)
        # self.list_of_sub_folder.append(f"files/{blob_name}")
    print(new_list_of_csv)
    self.aggregate_csv_path = f"{self.file_path}/{each_sub_folder}"
    return new_list_of_csv, self.aggregate_csv_path
My dataframe is able to read the CSV properly, and there is no error in df["Data Centre"] = self.data_centre; only the shape remains the same. FYI, the value of self.data_centre is also correct.
Sorry, my bad. It was a file-write issue. It has now been resolved. Thank you.
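For reference, a minimal check on hypothetical data confirming that assigning a new column does change df.shape, which is why an unchanged shape points at the file handling rather than at the assignment itself:
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})
print(df.shape)                  # (3, 1)
df['Data Centre'] = 'dc-east'    # hypothetical value standing in for self.data_centre
print(df.shape)                  # (3, 2)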

Strange difference in Pandas dataframe behaviour between small and large scale

I have a dataframe read from a CSV file. I need to generate new data and append it to the end of the old data.
But strangely, it shows a totally different result when comparing a small scale and a large scale. I guess it may relate to views, copy(), and chained assignment.
I tried two options using .copy() to avoid potential problems.
First option:
d_jlist = pd.read_csv('127case.csv', sep=',')   # got the data shape (46355, 48) from the CSV file
d_jlist2 = d_jlist.copy()    # use a deep copy, in case the raw data gets changed
d_jlist3 = pd.DataFrame()
a = np.random.choice(range(5, 46350), size=1000*365)   # select from row 5 to row 46350
for i in a:
    d_jlist3 = d_jlist3.append(d_jlist.iloc[i].copy() + np.random.uniform(-1, 1))
d_jlist3 = d_jlist3.replace(0, 0.001, regex=True)
d_jlist3 = d_jlist3.round(3)
d_jlist = d_jlist.append(d_jlist3)
a = consumption.columns.values   # something to do with the header
a = a[5:53]
d_jlist.to_csv('1127case_1.csv', header=a, index=False)
Second option:
d_jlist = pd.read_csv('127case.csv', sep=',')
d_jlist2 = d_jlist.copy()
d_jlist3 = pd.DataFrame()
a = np.random.choice(range(5, 46350), size=1000*365)
for i in a:
    d_jlist3 = d_jlist3.append(d_jlist2.iloc[i] + np.random.uniform(-1, 1))
d_jlist3 = d_jlist3.replace(0, 0.001, regex=True)
d_jlist3 = d_jlist3.round(3)
d_jlist = d_jlist.append(d_jlist3)
a = consumption.columns.values   # something to do with the header
a = a[5:53]
d_jlist.to_csv('1117case_2.csv', header=a, index=False)
The problem is that if I use this code at a small scale, it works as expected: new rows are added after the old ones, and nothing in the old data changes.
However, at the scale above (1000*365), the old rows get changed.
And the strange thing is that only the first two columns of each row stay unchanged; all the remaining columns of each row get changed.
The results:
The left one is the old dataframe; it has shape (46356, 48). Below it are the newly generated data.
The right one is the result from option 1 (both options give the same result). From the third column onwards, the old data has changed.
If I try either of the options at a smaller scale (3 rows), it is fine: all the old data is kept.
d_jlist = pd.read_csv('127case.csv', sep=',')
d_jlist = d_jlist.iloc[:10]   # only select 10 rows from the old ones
d_jlist2 = d_jlist.copy()
d_jlist3 = pd.DataFrame()
a = np.random.choice(range(5, 6), size=3)   # only select 3 rows randomly from the old data
for i in a:
    d_jlist3 = d_jlist3.append(d_jlist2.iloc[i] + np.random.uniform(-1, 1))
d_jlist3 = d_jlist3.replace(0, 0.001, regex=True)
d_jlist3 = d_jlist3.round(3)
d_jlist = d_jlist.append(d_jlist3)
a = consumption.columns.values   # something to do with the header
a = a[5:53]
d_jlist.to_csv('1117case_2.csv', header=a, index=False)
How can I understand this? I have spent a lot of time trying to find an explanation for it and failed.
Do some rules in Pandas change when the scale gets larger (up to the 365k level)?
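Not an explanation of the corruption, but a hedged sketch of a way to build the perturbed rows in one shot, assuming the selected columns are numeric and the same 127case.csv layout; pd.concat replaces the row-by-row append, never touches the original frame, and is far faster at 365k rows. The output file name is a placeholder.
import numpy as np
import pandas as pd

d_jlist = pd.read_csv('127case.csv', sep=',')

a = np.random.choice(range(5, 46350), size=1000 * 365)       # same row selection as before
noise = pd.Series(np.random.uniform(-1, 1, size=len(a)))     # one offset per generated row
# perturb a copy of the chosen rows; d_jlist itself is never modified
d_jlist3 = d_jlist.iloc[a].reset_index(drop=True).add(noise, axis=0)
d_jlist3 = d_jlist3.replace(0, 0.001).round(3)

result = pd.concat([d_jlist, d_jlist3], ignore_index=True)
result.to_csv('generated.csv', index=False)                   # placeholder output file name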

Randomization of a list with conditions using Pandas

I'm new to any kind of programming, as you can tell by this 'beautiful' piece of hard coding. With sweat and tears (not so bad, just a little), I've created a very sequential piece of code, and that's actually my problem. My goal is to create a somewhat automated script, probably involving a for-loop (which I've tried unsuccessfully).
The main aim is to create a randomization loop that takes the original dataset, which looks like this:
[image of the dataset]
From this dataset it should pick rows randomly, one by one, and save them one by one to another Excel list. The point is that each picked row should never match the previous pick in either of the two columns position01 and position02. That should eventually create an Excel sheet of randomized rows in which each row shares no value, in those two columns, with the row before it. So row 2 should not include any of the position01/position02 values of row 1, row 3 should not contain the values of row 2, and so on. It should also iterate over the range of the list length, which is 0-11. The Excel output is also important, since I need the rest of the columns as well; I just need to shuffle the row order.
I hope my aim and description are clear enough; if not, I'm happy to answer any questions. I would appreciate any hint or help that gets me 'unstuck'. Thank you. Code below. (PS: I'm aware that there is probably a much neater solution than this.)
import pandas as pd
import random
dataset = pd.read_excel("C:\\Users\\ibm\\Documents\\Psychopy\\DataInput_Training01.xlsx")
# original data set used for comparisons
imageDataset = dataset.loc[0:11, :]
# creating empty df for storing rows from imageDataset
emptyExcel = pd.DataFrame()
randomPick = imageDataset.sample() # select randomly one row from imageDataset
emptyExcel = emptyExcel.append(randomPick) # append a row to empty df
randomPickIndex = randomPick.index.tolist() # get index of the row
imageDataset2 = imageDataset.drop(index=randomPickIndex) # delete the row with index selected before
# getting the raw values from the row; 'position01' and 'position02' are the column headers
randomPickTemp1 = randomPick['position01'].values[0]
randomPickTemp2 = randomPick
randomPickTemp2 = randomPickTemp2['position02'].values[0]
# getting a dataset which not including row values from position01 and position02
isit = imageDataset2[(imageDataset2.position01 != randomPickTemp1) & (imageDataset2.position02 != randomPickTemp1) & (imageDataset2.position01 != randomPickTemp2) & (imageDataset2.position02 != randomPickTemp2)]
# pick another row from dataset not including row selected at the beginning - randomPick
randomPick2 = isit.sample()
# save it in empty df
emptyExcel = emptyExcel.append(randomPick2, sort=False)
# get index of this second row to delete it in next step
randomPick2Index = randomPick2.index.tolist()
# delete the another row
imageDataset3 = imageDataset2.drop(index=randomPick2Index)
# AND REPEAT the procedure of comparison of the raw values with dataset already not including the original row:
randomPickTemp1 = randomPick2['position01'].values[0]
randomPickTemp2 = randomPick2
randomPickTemp2 = randomPickTemp2['position02'].values[0]
isit2 = imageDataset3[(imageDataset3.position01 != randomPickTemp1) & (imageDataset3.position02 != randomPickTemp1) & (imageDataset3.position01 != randomPickTemp2) & (imageDataset3.position02 != randomPickTemp2)]
# AND REPEAT with another pick - save - matching - picking again.. until end of the length of the dataset (which is 0-11)
In the end I used a solution provided by David Bridges (post from Sep 19, 2019) on the PsychoPy forum. In case anyone is interested, here is the link: https://discourse.psychopy.org/t/how-do-i-make-selective-no-consecutive-trials/9186
I just adjusted the condition in the for-loop to my case, like this:
remaining = [choices[x] for x in choices if last['position01'] != choices[x]['position01'] and last['position01'] != choices[x]['position02'] and last['position02'] != choices[x]['position01'] and last['position02'] != choices[x]['position02']]
Thank you very much for the helpful answer, and I hope I did not spam this too much.
import itertools as it
import random
import pandas as pd
# list of pairs of numbers
tmp1 = [x for x in it.permutations(list(range(6)),2)]
df = pd.DataFrame(tmp1, columns=["position01","position02"])
df1 = pd.DataFrame()
i = random.choice(df.index)
df1 = df1.append(df.loc[i],ignore_index = True)
df = df.drop(index = i)
while not df.empty:
    val = list(df1.iloc[-1])
    tmp = df[(df["position01"] != val[0]) & (df["position01"] != val[1]) & (df["position02"] != val[0]) & (df["position02"] != val[1])]
    if tmp.empty:   # looped 10000 times, was never empty
        print("here")
        break
    i = random.choice(tmp.index)
    df1 = df1.append(df.loc[i], ignore_index=True)
    df = df.drop(index=i)
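If the shuffled rows then need to go back to Excel (the original goal), a to_excel call like the following should be enough, assuming an Excel writer engine such as openpyxl is installed; the output file name below is just a placeholder.
df1.to_excel('randomized_trials.xlsx', index=False)   # placeholder output file name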

Function to split & expand returning NameError

def unique_unit_split(df):
    df_unit_list = df_master.loc[df_master['type'] == 'unit']
    df_unit_list = df_unit_list.key.tolist()
    for i in range(len(df_unit_list)):
        df_unit_list[i] = int(df_unit_list[i])
    split_1 = df_units.units.str.split('[","]', expand=True).stack()
    df_units_update = df_units.join(pd.Series(index=split_1.index.droplevel(1), data=split_1.values, name='unit_split'))
    df_units_final = df_units_update[df_units_update['unit_split'].isin(df_unit_list)]
    return(df)
Updated script: still not working
df_unit_list = []
split_1 = pd.DataFrame()
df_units_update = pd.DataFrame()
df_units_final = pd.DataFrame()
def unique_unit_split(df):
    df_unit_list = df_master.loc[df_master['type'] == 'unit']
    df_unit_list = df_unit_list.key.tolist()
    for i in range(len(df_unit_list)):
        df_unit_list[i] = int(df_unit_list[i])
    split_1 = df_units.units.str.split('[","]', expand=True).stack()
    df_units_update = df_units.join(pd.Series(index=split_1.index.droplevel(1), data=split_1.values, name='unit_split'))
    df_units_final = df_units_update[df_units_update['unit_split'].isin(df_unit_list)]
    return(df)
The function above originally worked when I split the two actions up (the code up to and including the for-loop was in one function, and everything from split_1 down was in another). Now that I have tried to condense them, I am getting a NameError (image attached). Does anyone know how I can resolve this and make sure my final df (df_units_final) is defined?
For more insight on this function: I have a df with comma-separated values in one column, and I needed to split that column, drop the [], and keep only the rows with the numbers I need, which are defined in the list "df_unit_list".
[screenshot: NameError details]
The issue was as stated above (not defining/returning df_units_final), AND my for-loop was forcing the list to be ints when the values in the other df were actually strings.
[screenshot: working code]
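The working code itself is only in the screenshot, but a hedged sketch of the two fixes described above (keep the keys as strings, and return the final frame) might look like this; the frame and column names are taken from the post, everything else is an assumption.
import pandas as pd

def unique_unit_split(df_master, df_units):
    # keep the keys as strings so isin() can match the string values in unit_split
    df_unit_list = df_master.loc[df_master['type'] == 'unit', 'key'].astype(str).tolist()

    # split the comma-separated 'units' column into one value per row
    # (same '[","]' split pattern as in the post)
    split_1 = df_units.units.str.split('[","]', expand=True).stack()
    df_units_update = df_units.join(
        pd.Series(index=split_1.index.droplevel(1), data=split_1.values, name='unit_split'))

    # return the filtered frame instead of the untouched input
    return df_units_update[df_units_update['unit_split'].isin(df_unit_list)]

df_units_final = unique_unit_split(df_master, df_units)   # assign the result so it is defined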
