How do you replace duplicate values with multiple unique strings in Pandas? - python

# Build the demo frame: 'Tom' occurs twice in Name, which is exactly the
# duplication the question wants to disambiguate.
import pandas as pd
import numpy as np

data = {'Name': ['Tom', 'Tom', 'Jack', 'Terry'],
        'Age': [20, 21, 19, 18]}
df = pd.DataFrame(data)
Lets say I have a dataframe that looks like this. I am trying to figure out how to check the Name column for the value 'Tom' and if I find it the first time I replace it with the value 'FirstTom' and the second time it appears I replace it with the value 'SecondTom'. How do you accomplish this? I've used the replace method before but only for replacing all Toms with a single value. I don't want to add a 1 on the end of the value, but completely change the string to something else.
Edit:
If the df looked more like this below, how would we check for Tom in the first column and the second column and then replace the first instance with FirstTom and the second instance with SecondTom
data = {'Name':['Tom', 'Jerry', 'Jack', 'Terry'], 'OtherName':['Tom', 'John', 'Bob', 'Steve']}

Just adding on to the existing solutions: you can use inflect to create a dynamic dictionary.
# inflect generates ordinal strings ("1st", "2nd", ...) on the fly, so no
# hand-written mapping dict is needed.
import inflect
p = inflect.engine()
# cumcount() numbers each occurrence within a Name group (0-based); add(1)
# makes it 1-based, map(p.ordinal) renders it as "1st"/"2nd"/..., and
# radd('_') prefixes an underscore before appending to the original name.
df['Name'] += df.groupby('Name').cumcount().add(1).map(p.ordinal).radd('_')
print(df)
Name Age
0 Tom_1st 20
1 Tom_2nd 21
2 Jack_1st 19
3 Terry_1st 18

We can do cumcount
df.Name=df.Name+df.groupby('Name').cumcount().astype(str)
df
Name Age
0 Tom0 20
1 Tom1 21
2 Jack0 19
3 Terry0 18
Update
# Ordinal suffix builder: 1 -> "1st", 2 -> "2nd", 22 -> "22nd", 11 -> "11th".
# For n < 20 the whole number is looked up (so 11/12/13 fall through to
# "th"); otherwise only the last digit decides the suffix.
suf = lambda n: "%d%s"%(n,{1:"st",2:"nd",3:"rd"}.get(n if n<20 else n%10,"th"))
g=df.groupby('Name')
# Prepend "1st"/"2nd"/... per occurrence, but mask the prefix to '' for
# names that occur only once so unique names are left untouched.
df.Name=df.Name.radd(g.cumcount().add(1).map(suf).mask(g.Name.transform('count')==1,''))
df
Name Age
0 1stTom 20
1 2ndTom 21
2 Jack 19
3 Terry 18
Update 2 for column
# Same ordinal-suffix builder as above (1 -> "1st", 2 -> "2nd", ...).
suf = lambda n: "%d%s"%(n,{1:"st",2:"nd",3:"rd"}.get(n if n<20 else n%10,"th"))
# s is assumed to be the Name/OtherName columns stacked into a MultiIndex
# Series, so grouping by (row label, value) counts duplicates per row.
g=s.groupby([s.index.get_level_values(0),s])
# Prefix an ordinal per occurrence; names occurring once get '' (no prefix).
s=s.radd(g.cumcount().add(1).map(suf).mask(g.transform('count')==1,''))
# Pivot the stacked Series back into the original two columns.
s=s.unstack()
Name OtherName
0 1stTom 2ndTom
1 Jerry John
2 Jack Bob
3 Terry Steve

EDIT: For count duplicated per rows use:
# Sample frame where the same value ('Tom') can appear in several columns
# of a single row.
df = pd.DataFrame(
    {'Name': ['Tom', 'Jerry', 'Jack', 'Terry'],
     'OtherName': ['Tom', 'John', 'Bob', 'Steve'],
     'Age': [20, 21, 19, 18]}
)
print(df)
Name OtherName Age
0 Tom Tom 20
1 Jerry John 21
2 Jack Bob 19
3 Terry Steve 18
import inflect
p = inflect.engine()
# Map an occurrence number to an ordinal word: 1 -> "first", 2 -> "second".
f = lambda i: p.number_to_words(p.ordinal(i))
# The columns that hold names and may contain row-wise duplicates.
cols = ['Name','OtherName']
# Stack both columns into one MultiIndex Series (row label, column label).
s = df[cols].stack()
# 1-based occurrence counter per (row, value) pair.
count = s.groupby([s.index.get_level_values(0),s]).cumcount().add(1)
# Boolean mask marking values duplicated within the same row.
mask = s.reset_index().duplicated(['level_0',0], keep=False).values
# Prefix only the duplicated values, pivot back to columns, and merge with
# the original data (fill_value='' keeps non-duplicated cells unchanged).
df[cols] = count[mask].map(f).unstack().add(df[cols], fill_value='')
print (df)
Name OtherName Age
0 firstTom secondTom 20
1 Jerry John 21
2 Jack Bob 19
3 Terry Steve 18
Use GroupBy.cumcount with Series.map, but only for duplicated values by Series.duplicated:
# Rebuild the example data: 'Tom' is duplicated and must become
# 'FirstTom' / 'SecondTom' while unique names stay untouched.
data = {'Name': ['Tom', 'Tom', 'Jack', 'Terry'], 'Age': [20, 21, 19, 18]}
df = pd.DataFrame(data)

# Occurrence number (0-based, as cumcount yields) -> ordinal word prefix.
nth = {0: "First", 1: "Second", 2: "Third", 3: "Fourth"}

# True for every row whose Name appears more than once.
mask = df.Name.duplicated(keep=False)

# Number the duplicated rows within each Name group and prepend the
# matching ordinal; rows outside the mask keep their original value.
counts = df[mask].groupby('Name').cumcount()
df.loc[mask, 'Name'] = counts.map(nth) + df.loc[mask, 'Name']
print(df)
Name Age
0 FirstTom 20
1 SecondTom 21
2 Jack 19
3 Terry 18
Dynamic dictionary should be like:
import inflect
p = inflect.engine()
# Only rows whose Name occurs more than once get a prefix.
mask = df.Name.duplicated(keep=False)
# Map a 1-based occurrence number to an ordinal word ("first", "second").
f = lambda i: p.number_to_words(p.ordinal(i))
# Number duplicates within each Name group and prepend the ordinal word.
df.loc[mask, 'Name'] = df[mask].groupby('Name').cumcount().add(1).map(f) + df.loc[mask, 'Name']
print (df)
Name Age
0 firstTom 20
1 secondTom 21
2 Jack 19
3 Terry 18

transform
# Ordinal words, indexed by a row's position inside its Name group.
nth = ['First', 'Second', 'Third', 'Fourth']

def prefix(d):
    """Prepend 'First', 'Second', ... to each member of group *d*.

    Single-member groups are returned unchanged, so unique names keep
    their original value.
    """
    size = len(d)
    if size <= 1:
        return d
    return d.radd([nth[i] for i in range(size)])
df.assign(Name=df.groupby('Name').Name.transform(prefix))
Name Age
0 FirstTom 20
1 SecondTom 21
2 Jack 19
3 Terry 18
4 FirstSteve 17
5 SecondSteve 16
6 ThirdSteve 15
​

Related

Calculate a difference in times between each pair of values in class using Pandas

I am trying to calculate the difference in the "time" column between each pair of elements having the same value in the "class" column.
This is an example of an input:
class name time
0 A Bob 2022-09-05 07:22:15
1 A Sam 2022-09-04 17:18:29
2 B Bob 2022-09-04 03:29:06
3 B Sue 2022-09-04 01:28:34
4 A Carol 2022-09-04 10:40:23
And this is an output:
class name1 name2 timeDiff
0 A Bob Carol 0 days 20:41:52
1 A Bob Sam 0 days 14:03:46
2 A Carol Sam 0 days 06:38:06
3 B Bob Sue 0 days 02:00:32
I wrote this code to solve this problem:
from itertools import combinations

# Pairwise time differences per class.  Fixes for current pandas:
#  * DataFrame.append was removed in pandas 2.0 (and grew quadratically),
#    so per-class frames are collected in a list and concatenated once;
#  * grouping by the one-element list ['name'] makes the iteration keys
#    1-tuples in pandas >= 2.0, which would corrupt the f'{k1} {k2}'
#    labels below, so the bare column name is used instead.
df2 = pd.DataFrame(columns=['class', 'name1', 'name2', 'timeDiff'])
pieces = []
for c in df['class'].unique():
    df_class = df[df['class'] == c]
    groups = df_class.groupby('name')['time']
    if len(df_class) > 1:
        # One single-row Series per unordered pair of names in the class,
        # holding the absolute difference of their first timestamps.
        out = (pd
            .concat({f'{k1} {k2}': pd.Series(data=np.abs(np.diff([g2.values[0], g1.values[0]])).astype('timedelta64[s]'),
                                             index=[f'{k1} {k2}'], name='timeDiff')
                     for (k1, g1), (k2, g2) in combinations(groups, 2)},
                    names=['name'])
            .reset_index()
            )
        # Split the 'name1 name2' label back into two separate columns.
        new = out["name"].str.split(" ", n=-1, expand=True)
        out["name1"] = new[0].astype(str)
        out["name2"] = new[1].astype(str)
        out["class"] = c
        del out['level_1'], out['name']
        pieces.append(out)
if pieces:
    df2 = pd.concat([df2] + pieces, ignore_index=True)
I didn't come up with a solution without going through all the class values in a loop. However, this is very time-consuming if the input table is large. Does anyone have any solutions without using a loop?
The whole thing is a self cross join and a time difference
import pandas as pd

# Sample input: class membership with one timestamp per (class, name) row.
df = pd.DataFrame({
    'class': ['A', 'A', 'B', 'B', 'A'],
    'name': ['Bob', 'Sam', 'Bob', 'Sue', 'Carol'],
    'time': [
        pd.Timestamp('2022-09-05 07:22:15'),
        pd.Timestamp('2022-09-04 17:18:29'),
        pd.Timestamp('2022-09-04 03:29:06'),
        pd.Timestamp('2022-09-04 01:28:34'),
        pd.Timestamp('2022-09-04 10:40:23'),
    ]
})

# Self cross-join within each class, then the absolute time difference of
# every (name_1, name_2) pair; self-pairs are dropped, both orderings kept.
rs = list()
for _, grp in df.groupby('class'):
    pairs = grp.merge(grp, how='cross', suffixes=('_1', '_2'))
    pairs = pairs.loc[pairs['name_1'] != pairs['name_2']]
    pairs = (pairs.drop(columns=['class_2'])
                  .rename(columns={'class_1': 'class'})
                  .reset_index(drop=True))
    pairs['timeDiff'] = abs(pairs['time_1'] - pairs['time_2'])\
        .astype('timedelta64[ns]')
    rs.append(pairs.drop(columns=['time_1', 'time_2']))
rs_df = pd.concat(rs).reset_index(drop=True)
Check the below code, which avoids an outer join by using aggregation and itertools.
from itertools import combinations

def agg_to_list(value):
    """Return every unordered pair of *value*'s elements as a two-item list."""
    return [list(pair) for pair in combinations(list(value), 2)]

def agg_to_list_time(value):
    """Return first-minus-second differences for every unordered pair of *value*."""
    return [first - second for first, second in combinations(list(value), 2)]
# Apply aggregate functions
updated_df = df.groupby(['class']).agg({'name':agg_to_list,'time':agg_to_list_time})
# Explode DataFrame & rename column
updated_df = updated_df.explode(['name','time']).rename(columns={'time':'timediff'})
# Unpack name column in two new columns
updated_df[['name1','name2']] = pd.DataFrame(updated_df.name.tolist(), index=updated_df.index)
# Final DataFrame
print(updated_df.reset_index()[['class','name1','name2','timediff']])
Output:
class name1 name2 timediff
0 A Bob Sam 0 days 14:03:46
1 A Bob Carol 0 days 20:41:52
2 A Sam Carol 0 days 06:38:06
3 B Bob Sue 0 days 02:00:32

Change panda DataFrame into paragraph with a certain format

Here is the code:
# Import pandas library
import pandas as pd
# initialize list of lists
data = [['tom', 10], ['nick', 15], ['juli', 14]]
# Create the pandas DataFrame
df = pd.DataFrame(data, columns = ['Name', 'Age'])
# print dataframe.
df
here is the output
I wanted to change this dataframe to a text with certain format like:
" Here is the name and age Data:
Tom 10
Nick 15
Juli 14
Thank you for your request"
Use Series.str.capitalize and join values of Age converted to strings:
df['new'] = df['Name'].str.capitalize() + ' ' + df['Age'].astype(str)
print (df)
Name Age new
0 tom 10 Tom 10
1 nick 15 Nick 15
2 juli 14 Juli 14
Or:
df['new'] = df['Name'].str.capitalize().str.cat(df['Age'].astype(str), sep=' ')
If need loop by values:
s = df['Name'].str.capitalize() + ' ' + df['Age'].astype(str)
for v in s:
print (v)
Tom 10
Nick 15
Juli 14

return True if partial match success between two column

I am looking for partial matches success. below is the code for the dataframe.
import pandas as pd
data = [['tom', 10,'aaaaa','aaa'], ['nick', 15,'vvvvv','vv'], ['juli', 14,'sssssss','kk']]
# Create the pandas DataFrame
df = pd.DataFrame(data, columns = ['Name', 'Age','random','partial'])
df
Output:
I am expecting the output shown below
Name Age random partial Matches
0 tom 10 aaaaa aaa True
1 nick 15 vvvvv vv True
2 juli 14 sssssss kk False
You can use df.apply in combination with a lambda function that checks whether the partial string is part of the other string by using in.
Then we can assign this to a new column in the dataframe:
>>>import pandas as pd
>>>data = [['tom', 10,'aaaaa','aaa'], ['nick', 15,'vvvvv','vv'], ['juli',14,'sssssss','kk']]
>>>df = pd.DataFrame(data, columns = ['Name', 'Age','random','partial'])
>>>df['matching'] = df.apply(lambda x : x.partial in x.random, axis=1)
>>>print(df)
Name Age random partial matching
0 tom 10 aaaaa aaa True
1 nick 15 vvvvv vv True
2 juli 14 sssssss kk False
One important thing to be aware of when using df.apply is the axis argument, as it here allows us to access all columns of a given row at once.
df['Matches'] = df.apply(lambda row: row['partial'] in row['random'], axis = 'columns')
df
gives
Name Age random partial Matches
0 tom 10 aaaaa aaa True
1 nick 15 vvvvv vv True
2 juli 14 sssssss kk False

Pandas Merging on condition

I am trying to do conditional merging between pandas df:
My df's look like this:
df1
import numpy as np
import pandas as pd

# Left-hand table: repeated names at various ages, sorted by Name so a
# later group-wise forward fill walks each person's rows together.
data = {'Name': ['Tom', 'JJ', 'ABC', 'Tom', 'JJ', 'ABC', 'Tom', 'Tom'],
        'Age': [10, 20, 25, 15, 25, 30, 30, 50]}
df = pd.DataFrame(data)
df.sort_values(['Name'], ascending=True, inplace=True)
and
# Right-hand table: per-name [Start_Age, End_Age] ranges, each carrying a
# random Score to be matched against ages in df.
data_new = {'Name': ['Tom', 'JJ', 'ABC', 'JJ', 'ABC'],
            'Start_Age': [24, 18, 24, 25, 29],
            'End_Age': [32, 22, 27, 25, 34]}
df_2 = pd.DataFrame(data_new)
# randint's upper bound is exclusive, so scores fall in 1..99.
df_2["Score"] = np.random.randint(1, 100, df_2.shape[0])
df_2.sort_values(['Name'], ascending=True, inplace=True)
I would like to merge df with df 2 to get score corresponding to the age present in df.
Below is how I am trying to do:
# Attach every age range of the same Name, then keep only rows where Age
# falls inside [Start_Age, End_Age].
df_new_2 = pd.merge(df, df_2, how='left', left_on = ["Name"], right_on = ["Name"])
df_new_2 = df_new_2[(df_new_2['Age']>=df_new_2['Start_Age'])& (df_new_2['Age']<=df_new_2['End_Age']) ]
# Re-merge onto the full df so rows with no matching range come back as NaN.
df_final = df.merge(df_new_2, how = 'left', on=['Name', 'Age'])
# NOTE(review): this ffill runs over rows in their current order without
# grouping by Name, so a score can leak from one person into the next —
# sort by ['Name', 'Age'] and use groupby('Name').ffill() instead.
df_final[['Name', 'Score']].ffill(axis = 0)
My expected output is:
Name Age Score
ABC 25 86
ABC 30 87
JJ 20 59
JJ 25 22
Tom 10 Nan
Tom 15 Nan
Tom 30 98
Tom 50 98
But, I am getting something else....where am I wrong??
This would be my solution based on using np.where() to create the filters and then create a new dataframe with the output. Futhermore, I've changed the name of the column Name in df_2 to avoid having columns with equal names. df_2 = pd.DataFrame(data_new).rename(columns={'Name':'Name_new'}). Besides that, here is my code:
# Raw numpy views of the join keys: every age vs every range.
Age = df['Age'].values
e_age = df_2['End_Age'].values
s_age = df_2['Start_Age'].values
# Broadcast Age against all ranges; (i, j) index pairs where Age fits.
i, j = np.where((Age[:, None] >= s_age) & (Age[:, None] <= e_age))
# Stitch the matching rows of both frames side by side.
final_df = pd.DataFrame(
np.column_stack([df.values[i], df_2.values[j]]),
columns=df.columns.append(df_2.columns)
)
# Keep only matches between the same person (df_2's Name column is assumed
# to have been renamed to 'Name_new' beforehand, as described above).
final_df = final_df[final_df['Name'] == final_df['Name_new']]
# Left-merge back so ages with no matching range survive with NaN Score.
df_max = df.merge(final_df,how='left')
# Forward-fill scores within each person only, then trim to the output columns.
df_max['Score'] = df_max.groupby('Name').ffill()['Score']
df_max = df_max[['Name','Age','Score']]
Output:
Name Age Score
0 ABC 25 41
1 ABC 30 46
2 JJ 20 39
3 JJ 25 96
4 Tom 10 NaN
5 Tom 15 NaN
6 Tom 30 78
7 Tom 50 78
Your ffill is incorrect. You first need to sort by name and age to make sure the order is correct, and also group by name so that only the scores from the same person are considered. Otherwise the forward fill will take the previous score from any person:
df_final = df_final.sort_values(['Name', 'Age'])
df_final['Score'] = df_final.groupby('Name').ffill()['Score']
This is another solution to the problem.
It uses a helper function to lookup the score.
The helper function is then used on each row to get the score for name and age.
def get_score(name, age):
    """Return the Score from the module-level df_2 lookup table for *name*
    where *age* falls inside [Start_Age, End_Age]; NaN when nothing matches.

    If several ranges match, the first one wins.
    """
    score = df_2.loc[(df_2.Name == name) &
                     (df_2.Start_Age <= age) &
                     (df_2.End_Age >= age)]['Score'].values
    # BUG FIX: use np.nan — the np.NaN alias was removed in NumPy 2.0.
    return score[0] if len(score) >= 1 else np.nan
# user helper function for each row
df['Score'] = df.apply(lambda x: get_score(x.Name, x.Age), axis=1)
You can still do the forward fill afterwards like this:
df = df.sort_values(['Name', 'Age'])
df['Score'] = df.groupby('Name').ffill()['Score']

Pandas data frame sum of column and collecting the results

Given the following dataframe:
import pandas as pd

# Three sample rows; 'willy' appears twice with different interests.
p1 = dict(name='willy', age=11, interest='Lego')
p2 = dict(name='willy', age=11, interest='games')
p3 = dict(name='zoe', age=9, interest='cars')
df = pd.DataFrame([p1, p2, p3])
df
age interest name
0 11 Lego willy
1 11 games willy
2 9 cars zoe
I want to know the sum of interests of each person and let each person only show once in the list. I do the following:
# Tally rows per (age, name); count() on the remaining 'interest' column
# gives the number of interests for each person.
Interests = df[['age', 'name', 'interest']].groupby(['age', 'name']).count()
Interests.reset_index(inplace=True)
# BUG FIX: DataFrame.sort() was removed from pandas (0.20); sort_values
# is the supported replacement with the same semantics.
Interests.sort_values('interest', ascending=False, inplace=True)
Interests
age name interest
1 11 willy 2
0 9 zoe 1
This works but I have the feeling that I'm doing it wrong. Now I'm using the column 'interest' to display my sum values which is okay but like I said I expect there to be a nicer way to do this.
I saw many questions about counting/sum in Pandas but for me the part where I leave out the 'duplicates' is key.
You can use size (the length of each group), rather than count, the non-NaN enties in each column of the group.
In [11]: df[['age', 'name', 'interest']].groupby(['age' , 'name']).size()
Out[11]:
age name
9 zoe 1
11 willy 2
dtype: int64
In [12]: df[['age', 'name', 'interest']].groupby(['age' , 'name']).size().reset_index(name='count')
Out[12]:
age name count
0 9 zoe 1
1 11 willy 2
In [2]: df
Out[2]:
age interest name
0 11 Lego willy
1 11 games willy
2 9 cars zoe
In [3]: for name,group in df.groupby('name'):
...: print name
...: print group.interest.count()
...:
willy
2
zoe
1

Categories