I have the following strings in a pandas DataFrame in Python3, column string1 and string2:
import pandas as pd

# Each record pairs two equal-length strings; both may carry leading/trailing
# 'T' padding, and the goal is to strip the same amount from both strings
# so they stay equal in length.
datainput = [
    { 'string1': 'TTTABCDABCDTTTTT', 'string2': 'ABABABABABABABAA' },
    { 'string1': 'AAAAAAAA', 'string2': 'TTAAAATT' },
    { 'string1': 'TTABCDTTTTT', 'string2': 'ABABABABABA' }
]
df = pd.DataFrame(datainput)
df
string1 string2
0 TTTABCDABCDTTTTT ABABABABABABABAA
1 AAAAAAAA TTAAAATT
2 TTABCDTTTTT ABABABABABA
For each row, strings in columns string1 and string2 are defined to be the same length.
For each row of the DataFrame, the strings may need to be "cleaned" of beginning/trailing letters 'T'. However, for each row, the strings need to both be stripped of the same number of characters, so as the strings remain the same length.
The correct output is as follows:
df
string1 string2
0 ABCDABCD BABABABA
1 AAAA AAAA
2 ABCD ABAB
If these were two variables, it would be straightforward to calculate this with strip(), e.g.
string1 = "TTTABCDABCDTTTTT"
string2 = "ABABABABABABABAA"
length_original = len(string1)
num_left_chars = len(string1) - len(string1.lstrip('T'))
num_right_chars = len(string1.rstrip('T'))
edited = string1[num_left_chars:num_right_chars]
## print(edited)
## 'ABCDABCD'
However, in this case, one needs to iterate through all rows and redefine two columns at once. How could one modify each of these strings row by row?
EDIT: My main confusion is: given that both columns could contain leading/trailing 'T' characters, how do I re-define them both at the same time?
A bit lengthy but gets the job done..
import re
def count_head(s):
    """Return the number of leading 'T' characters in s (0 if none)."""
    return len(s) - len(s.lstrip('T'))
def count_tail(s):
    """Return the number of trailing 'T' characters in s (0 if none)."""
    return len(s) - len(s.rstrip('T'))
df1 = df.copy()
# Per-row lengths of the leading and trailing 'T' runs in each string column.
df1['st1_head'] = df1['string1'].apply(count_head)
df1['st2_head'] = df1['string2'].apply(count_head)
df1['st1_tail'] = df1['string1'].apply(count_tail)
df1['st2_tail'] = df1['string2'].apply(count_tail)
# Both strings in a row have the same length by the problem's definition.
df1['length'] = df1['string1'].str.len()
def trim_strings(row):
    """Trim both strings of a row by the shared (maximum) head/tail T-run lengths."""
    start = max(row['st1_head'], row['st2_head'])
    # Cut the same number of characters from the right of both strings.
    stop = row['length'] - max(row['st1_tail'], row['st2_tail'])
    return {
        'string1': row['string1'][start:stop],
        'string2': row['string2'][start:stop],
    }
new_df = pd.DataFrame(list(df1.apply(trim_strings, axis=1)))
print(new_df)
output:
string1 string2
0 ABCDABCD BABABABA
1 AAAA AAAA
2 ABCD ABAB
A more compact version:
def trim(st1, st2):
    """Strip the same number of leading/trailing 'T's from both strings.

    The number stripped on each side is the larger of the two strings'
    T-run lengths, so the results remain equal in length.
    """
    def head_run(s):
        return len(s) - len(s.lstrip('T'))

    def tail_run(s):
        return len(s) - len(s.rstrip('T'))

    start = max(head_run(st1), head_run(st2))
    stop = len(st1) - max(tail_run(st1), tail_run(st2))
    return st1[start:stop], st2[start:stop]
new_df = pd.DataFrame(list(
df.apply(lambda r: trim(r['string1'], r['string2']),
axis=1)), columns=['string1', 'string2'])
print(new_df)
The main thing to notice is the df.apply(<your function>, axis=1), which lets you do any function (in this case acting on both columns at once), on each row.
raw_data = {'name': ['Will Morris', 'Alferd Hitcock', 'Sir William', 'Daniel Thomas'],
            'age': [11, 49, 66, 77],
            'color': ['TblueT', 'redT', 'white', "cyan"],
            'marks': [74, 90, 44, 17]}
# Use the keys that actually exist in raw_data ('marks', not 'grade');
# passing a column name absent from the dict would yield an all-NaN column.
df = pd.DataFrame(raw_data, columns=['name', 'age', 'color', 'marks'])
print(df)
cols = ['name', 'color']
print("new df")
# Strip leading/trailing 'T' from every string in the selected columns.
df[cols] = df[cols].apply(lambda row: row.str.lstrip('T').str.rstrip('T'), axis=1)
print(df)
Will print
             name  age   color  marks
0     Will Morris   11  TblueT     74
1  Alferd Hitcock   49    redT     90
2     Sir William   66   white     44
3   Daniel Thomas   77    cyan     17
new df
             name  age  color  marks
0     Will Morris   11   blue     74
1  Alferd Hitcock   49    red     90
2     Sir William   66  white     44
3   Daniel Thomas   77   cyan     17
Related
I am trying to calculate the difference in the "time" column between each pair of elements having the same value in the "class" column.
This is an example of an input:
class name time
0 A Bob 2022-09-05 07:22:15
1 A Sam 2022-09-04 17:18:29
2 B Bob 2022-09-04 03:29:06
3 B Sue 2022-09-04 01:28:34
4 A Carol 2022-09-04 10:40:23
And this is an output:
class name1 name2 timeDiff
0 A Bob Carol 0 days 20:41:52
1 A Bob Sam 0 days 14:03:46
2 A Carol Sam 0 days 06:38:06
3 B Bob Sue 0 days 02:00:32
I wrote this code to solve this problem:
from itertools import combinations

df2 = pd.DataFrame(columns=['class', 'name1', 'name2', 'timeDiff'])
# For every class, pair up the per-name time groups and record the absolute
# difference of their first timestamps.
for c in df['class'].unique():
    df_class = df[df['class'] == c]
    groups = df_class.groupby(['name'])['time']
    if len(df_class) > 1:
        # NOTE(review): `np` is used below but numpy is never imported in this snippet.
        out = (pd
            .concat({f'{k1} {k2}': pd.Series(data=np.abs(np.diff([g2.values[0],g1.values[0]])).astype('timedelta64[s]'), index=[f'{k1} {k2}'], name='timeDiff')
                for (k1, g1), (k2, g2) in combinations(groups, 2)},
                names=['name']
            )
            .reset_index()
        )
        # The dict keys are 'name1 name2'; split them back into two columns.
        new = out["name"].str.split(" ", n = -1, expand = True)
        out["name1"]= new[0].astype(str)
        out["name2"]= new[1].astype(str)
        out["class"] = c
        del out['level_1'], out['name']
        # NOTE(review): DataFrame.append was removed in pandas 2.0 — use pd.concat instead.
        df2 = df2.append(out, ignore_index=True)
I didn't come up with a solution without going through all the class values in a loop. However, this is very time-consuming if the input table is large. Does anyone have any solutions without using a loop?
The whole thing is a self cross join and a time difference
import pandas as pd
df = pd.DataFrame({
'class': ['A', 'A', 'B', 'B', 'A'],
'name': ['Bob', 'Sam', 'Bob', 'Sue', 'Carol'],
'time': [
pd.Timestamp('2022-09-05 07:22:15'),
pd.Timestamp('2022-09-04 17:18:29'),
pd.Timestamp('2022-09-04 03:29:06'),
pd.Timestamp('2022-09-04 01:28:34'),
pd.Timestamp('2022-09-04 10:40:23'),
]
})
rs = list()
# One class at a time: a self cross join pairs every row in the class
# with every other row in the same class.
for n, df_g in df.groupby('class'):
    t_df = df_g.merge(
        df_g, how='cross',
        suffixes=('_1', '_2')
    )
    # Drop rows paired with themselves (same name on both sides).
    t_df = t_df[t_df['name_1'] != t_df['name_2']]
    t_df = t_df.drop(['class_2'], axis=1)\
        .rename({'class_1': 'class'}, axis=1).reset_index(drop=True)
    # Absolute value keeps the difference positive regardless of pair order.
    t_df['timeDiff'] = abs(t_df['time_1'] - t_df['time_2'])\
        .astype('timedelta64[ns]')
    t_df = t_df.drop(['time_1', 'time_2'], axis=1)
    rs.append(t_df)
rs_df = pd.concat(rs).reset_index(drop=True)
Check the below code, which avoids an outer join by using aggregation & itertools
from itertools import combinations
# Helper: turn a group of names into every unordered pair, each as a two-element list.
def agg_to_list(value):
    return [list(pair) for pair in combinations(value, 2)]
# Helper: for every unordered pair of times, record first-minus-second.
def agg_to_list_time(value):
    return [first - second for first, second in combinations(value, 2)]
# Apply aggregate functions
updated_df = df.groupby(['class']).agg({'name':agg_to_list,'time':agg_to_list_time})
# Explode DataFrame & rename column
updated_df = updated_df.explode(['name','time']).rename(columns={'time':'timediff'})
# Unpack name column in two new columns
updated_df[['name1','name2']] = pd.DataFrame(updated_df.name.tolist(), index=updated_df.index)
# Final DataFrame
print(updated_df.reset_index()[['class','name1','name2','timediff']])
Output:
class name1 name2 timediff
0 A Bob Sam 0 days 14:03:46
1 A Bob Carol 0 days 20:41:52
2 A Sam Carol 0 days 06:38:06
3 B Bob Sue 0 days 02:00:32
I am looking to flag whether a partial string match succeeds. Below is the code for the dataframe.
import pandas as pd
data = [['tom', 10,'aaaaa','aaa'], ['nick', 15,'vvvvv','vv'], ['juli', 14,'sssssss','kk']]
# Create the pandas DataFrame
df = pd.DataFrame(data, columns = ['Name', 'Age','random','partial'])
df
Output:
I am expecting the output shown below
Name Age random partial Matches
0 tom 10 aaaaa aaa True
1 nick 15 vvvvv vv True
2 juli 14 sssssss kk False
You can use df.apply in combination with a lambda function that checks whether the partial string is part of the other string by using in.
Then we can assign this to a new column in the dataframe:
>>>import pandas as pd
>>>data = [['tom', 10,'aaaaa','aaa'], ['nick', 15,'vvvvv','vv'], ['juli',14,'sssssss','kk']]
>>>df = pd.DataFrame(data, columns = ['Name', 'Age','random','partial'])
>>>df['matching'] = df.apply(lambda x : x.partial in x.random, axis=1)
>>>print(df)
Name Age random partial matching
0 tom 10 aaaaa aaa True
1 nick 15 vvvvv vv True
2 juli 14 sssssss kk False
One important thing to be aware of when using df.apply is the axis argument, as it here allows us to access all columns of a given row at once.
df['Matches'] = df.apply(lambda row: row['partial'] in row['random'], axis = 'columns')
df
gives
Name Age random partial Matches
0 tom 10 aaaaa aaa True
1 nick 15 vvvvv vv True
2 juli 14 sssssss kk False
import pandas as pd
import numpy as np
data = {'Name':['Tom', 'Tom', 'Jack', 'Terry'], 'Age':[20, 21, 19, 18]}
df = pd.DataFrame(data)
Lets say I have a dataframe that looks like this. I am trying to figure out how to check the Name column for the value 'Tom' and if I find it the first time I replace it with the value 'FirstTom' and the second time it appears I replace it with the value 'SecondTom'. How do you accomplish this? I've used the replace method before but only for replacing all Toms with a single value. I don't want to add a 1 on the end of the value, but completely change the string to something else.
Edit:
If the df looked more like this below, how would we check for Tom in the first column and the second column and then replace the first instance with FirstTom and the second instance with SecondTom
data = {'Name':['Tom', 'Jerry', 'Jack', 'Terry'], 'OtherName':['Tom', 'John', 'Bob', 'Steve']}
Just adding in to the existing solutions , you can use inflect to create dynamic dictionary
import inflect
p = inflect.engine()
df['Name'] += df.groupby('Name').cumcount().add(1).map(p.ordinal).radd('_')
print(df)
Name Age
0 Tom_1st 20
1 Tom_2nd 21
2 Jack_1st 19
3 Terry_1st 18
We can do cumcount
df.Name=df.Name+df.groupby('Name').cumcount().astype(str)
df
Name Age
0 Tom0 20
1 Tom1 21
2 Jack0 19
3 Terry0 18
Update
suf = lambda n: "%d%s"%(n,{1:"st",2:"nd",3:"rd"}.get(n if n<20 else n%10,"th"))
g=df.groupby('Name')
df.Name=df.Name.radd(g.cumcount().add(1).map(suf).mask(g.Name.transform('count')==1,''))
df
Name Age
0 1stTom 20
1 2ndTom 21
2 Jack 19
3 Terry 18
Update 2 for column
suf = lambda n: "%d%s"%(n,{1:"st",2:"nd",3:"rd"}.get(n if n<20 else n%10,"th"))
g=s.groupby([s.index.get_level_values(0),s])
s=s.radd(g.cumcount().add(1).map(suf).mask(g.transform('count')==1,''))
s=s.unstack()
Name OtherName
0 1stTom 2ndTom
1 Jerry John
2 Jack Bob
3 Terry Steve
EDIT: For count duplicated per rows use:
df = pd.DataFrame(data = {'Name':['Tom', 'Jerry', 'Jack', 'Terry'],
'OtherName':['Tom', 'John', 'Bob','Steve'],
'Age':[20, 21, 19, 18]})
print (df)
Name OtherName Age
0 Tom Tom 20
1 Jerry John 21
2 Jack Bob 19
3 Terry Steve 18
import inflect
p = inflect.engine()
#map by function for dynamic counter
f = lambda i: p.number_to_words(p.ordinal(i))
#columns filled by names
cols = ['Name','OtherName']
#reshaped to MultiIndex Series
s = df[cols].stack()
#counter per groups
count = s.groupby([s.index.get_level_values(0),s]).cumcount().add(1)
#mask for filter duplicates
mask = s.reset_index().duplicated(['level_0',0], keep=False).values
#filter only duplicates and map, reshape back and add to original data
df[cols] = count[mask].map(f).unstack().add(df[cols], fill_value='')
print (df)
Name OtherName Age
0 firstTom secondTom 20
1 Jerry John 21
2 Jack Bob 19
3 Terry Steve 18
Use GroupBy.cumcount with Series.map, but only for duplicated values by Series.duplicated:
data = {'Name':['Tom', 'Tom', 'Jack', 'Terry'], 'Age':[20, 21, 19, 18]}
df = pd.DataFrame(data)
# Ordinal prefix words keyed by the 0-based cumcount within a name group.
nth = {
    0: "First",
    1: "Second",
    2: "Third",
    3: "Fourth"
}
# True for every row whose Name occurs more than once (keep=False marks all copies).
mask = df.Name.duplicated(keep=False)
# Number the duplicates within each name group and prepend the ordinal word;
# unique names are left untouched.
df.loc[mask, 'Name'] = df[mask].groupby('Name').cumcount().map(nth) + df.loc[mask, 'Name']
print (df)
Name Age
0 FirstTom 20
1 SecondTom 21
2 Jack 19
3 Terry 18
Dynamic dictionary should be like:
import inflect
p = inflect.engine()
mask = df.Name.duplicated(keep=False)
f = lambda i: p.number_to_words(p.ordinal(i))
df.loc[mask, 'Name'] = df[mask].groupby('Name').cumcount().add(1).map(f) + df.loc[mask, 'Name']
print (df)
Name Age
0 firstTom 20
1 secondTom 21
2 Jack 19
3 Terry 18
transform
nth = ['First', 'Second', 'Third', 'Fourth']

def prefix(d):
    """Prepend ordinal words to a group of duplicated names; singletons pass through."""
    size = len(d)
    if size <= 1:
        return d
    # Indexing nth deliberately raises IndexError for more than 4 duplicates.
    ordinals = [nth[i] for i in range(size)]
    return d.radd(ordinals)
df.assign(Name=df.groupby('Name').Name.transform(prefix))
Name Age
0 FirstTom 20
1 SecondTom 21
2 Jack 19
3 Terry 18
4 FirstSteve 17
5 SecondSteve 16
6 ThirdSteve 15
I am trying to do conditional merging between pandas df:
My df's look like this:
df1
import numpy as np
import pandas as pd
data = {'Name':['Tom', 'JJ', 'ABC', 'Tom', 'JJ', 'ABC', 'Tom', 'Tom'], 'Age':[10, 20, 25, 15, 25, 30, 30, 50]}
df = pd.DataFrame(data)
df.sort_values(['Name'], ascending = True, inplace = True)
and
data_new = {'Name':['Tom', 'JJ', 'ABC', 'JJ', 'ABC'], 'Start_Age':[24, 18, 24, 25, 29], 'End_Age':[32, 22, 27, 25, 34]}
df_2 = pd.DataFrame(data_new)
df_2["Score"] = np.random.randint(1, 100, df_2.shape[0])
df_2.sort_values(['Name'], ascending = True, inplace = True)
I would like to merge df with df 2 to get score corresponding to the age present in df.
Below is how I am trying to do:
df_new_2 = pd.merge(df, df_2, how='left', left_on = ["Name"], right_on = ["Name"])
df_new_2 = df_new_2[(df_new_2['Age']>=df_new_2['Start_Age'])& (df_new_2['Age']<=df_new_2['End_Age']) ]
df_final = df.merge(df_new_2, how = 'left', on=['Name', 'Age'])
df_final[['Name', 'Score']].ffill(axis = 0)
My expected output is:
Name Age Score
ABC 25 86
ABC 30 87
JJ 20 59
JJ 25 22
Tom 10 Nan
Tom 15 Nan
Tom 30 98
Tom 50 98
But, I am getting something else....where am I wrong??
This would be my solution based on using np.where() to create the filters and then create a new dataframe with the output. Futhermore, I've changed the name of the column Name in df_2 to avoid having columns with equal names. df_2 = pd.DataFrame(data_new).rename(columns={'Name':'Name_new'}). Besides that, here is my code:
Age = df['Age'].values
e_age = df_2['End_Age'].values
s_age = df_2['Start_Age'].values
# Broadcast-compare every Age against every [Start_Age, End_Age] interval;
# i, j index the matching (df row, df_2 row) pairs.
i, j = np.where((Age[:, None] >= s_age) & (Age[:, None] <= e_age))
final_df = pd.DataFrame(
    np.column_stack([df.values[i], df_2.values[j]]),
    columns=df.columns.append(df_2.columns)
)
# Keep only pairs where the names also agree (the interval match ignored names).
final_df = final_df[final_df['Name'] == final_df['Name_new']]
df_max = df.merge(final_df,how='left')
# Forward-fill scores within each name group so later ages inherit earlier scores.
df_max['Score'] = df_max.groupby('Name').ffill()['Score']
df_max = df_max[['Name','Age','Score']]
Output:
Name Age Score
0 ABC 25 41
1 ABC 30 46
2 JJ 20 39
3 JJ 25 96
4 Tom 10 NaN
5 Tom 15 NaN
6 Tom 30 78
7 Tom 50 78
Your ffill is incorrect. You need first to sort by name and age to make sure the order is correct and also group by name so only the score from the same person is considered. Otherwise the forward fill will take the previous score from any person:
df_final = df_final.sort_values(['Name', 'Age'])
df_final['Score'] = df_final.groupby('Name').ffill()['Score']
This is another solution to the problem.
It uses a helper function to lookup the score.
The helper function is then used on each row to get the score for name and age.
def get_score(name, age):
    """Look up the score for (name, age) in df_2; NaN when no age range matches."""
    matches = df_2.loc[
        (df_2.Name == name)
        & (df_2.Start_Age <= age)
        & (df_2.End_Age >= age),
        'Score',
    ].values
    if len(matches) >= 1:
        return matches[0]
    return np.NaN
# user helper function for each row
df['Score'] = df.apply(lambda x: get_score(x.Name, x.Age), axis=1)
You can still do the forward fill afterwards like this:
df = df.sort_values(['Name', 'Age'])
df['Score'] = df.groupby('Name').ffill()['Score']
I have a pandas dataframe df
name e_count e_start e_end
aaaa 3 13,14,15, 18,20,25,
bbbb 2 90,94, 100,102,
The field e_count described the number of elements in e_start and e_end. I want to make a new data frame that adds a column e_end-e_start. For example
name e_count e_start e_end e_end-e_start
aaaa 3 13,14,15, 18,20,25, 5,6,10,
bbbb 2 90,94, 100,102, 10,8,
I tried the following :
df['e_end-e_start'] = ""
new_frame = pd.DataFrame(columns = df.columns)
new_frame['e_end-e_start'] = ""
new_frame_idx = -1
for idx,row in df.iterrows():
new_frame_idx = new_frame_idx + 1
new_row = df.ix[idx]
new_frame = new_frame.append(new_row,ignore_index = True)
df.ix[idx,'e_end-e_start'] =df.ix[idx,'e_end']-df.ix[idx,'target_end']
new_frame.ix[new_frame_idx,'e_end-e_start'] =df.ix[idx,'e_end-e_start'] =df.ix[idx,'e_end']-df.ix[idx,'target_end']
print new_frame
But I get an error. Can you help?
Generally, you'll get much better performance storing your data as ints instead
of numeric strings separated by commas. A flat, long format such as
In [73]: df
Out[73]:
name e_start e_end
0 aaaa 13 18
0 aaaa 14 20
0 aaaa 15 25
1 bbb 90 100
1 bbb 94 102
makes computation much easier. Here is how you can convert your DataFrame to the
flat format:
import pandas as pd

df = pd.DataFrame({'e_count': [3, 2],
                   'e_end': ['18,20,25,', '100,102,'],
                   'e_start': ['13,14,15,', '90,94,'],
                   'name': ['aaaa', 'bbb']})
dfs = []
# Explode each comma-separated string column into one integer per row,
# keeping the original row label so the pieces align on concat.
for col in ['e_start', 'e_end']:
    tmp = df[col].str.strip(',').str.split(',').apply(pd.Series)
    tmp = tmp.stack()
    # Drop the inner level added by stack(); retain the original row index.
    tmp.index = tmp.index.droplevel(-1)
    tmp.name = col
    tmp = tmp.astype(int)
    dfs.append(tmp)
df = pd.concat([df[['name']]]+dfs, axis=1)
Then, to compute the differences, you could use
df['diff'] = df['e_end'] - df['e_start']
To convert back to the comma separated strings,
In [76]: df.groupby('name').agg(lambda x: ','.join(x.astype(str)))
Out[76]:
e_start e_end diff
name
aaaa 13,14,15 18,20,25 5,6,10
bbb 90,94 100,102 10,8