Conditional grouping in pandas data frame - python

Imagine a pandas data frame given by
df = pd.DataFrame({
    'id': range(5),
    'desc': ('This is text', 'John Doe ABC', 'John Doe', 'Something JKL', 'Something more'),
    'mfr': ('ABC', 'DEF', 'DEF', 'GHI', 'JKL')
})
which yields
id desc mfr
0 0 This is text ABC
1 1 John Doe ABC DEF
2 2 John Doe DEF
3 3 Something JKL GHI
4 4 Something more JKL
I wish to determine which ids belong to each other. They are matched either when their mfr values are equal or when the mfr value of one row is contained in the desc of another. E.g. id = 1 and 2 are in the same group because their mfr values are equal, but id = 0 and 1 are also in the same group since ABC, the mfr of id = 0, is part of the desc of id = 1.
The resulting data frame should be
id desc mfr group
0 0 This is text ABC 0
1 1 John Doe ABC DEF 0
2 2 John Doe DEF 0
3 3 Something JKL GHI 1
4 4 Something more JKL 1
Is there anyone out there with a good solution for this? I imagine there are no really simple ones, so any suggestion is welcome.

I'm assuming 'desc' does not contain multiple 'mfr' values.
Solution 1:
import numpy as np
import pandas as pd
# original dataframe
df = pd.DataFrame({
    'id': range(5),
    'desc': ('This is text', 'John Doe ABC', 'John Doe', 'Something JKL', 'Something more'),
    'mfr': ('ABC', 'DEF', 'DEF', 'GHI', 'JKL')
})
# for final merge
ori = df.copy()
# max words used in 'desc'
max_len = max(df.desc.apply(lambda x: len(x.split(' '))))
# unique 'mfr' values
uniq_mfr = df.mfr.unique().tolist()
# if list is less than max len, then pad with nan
def padding(lst, mx):
    for i in range(mx):
        if len(lst) < mx:
            lst.append(np.nan)
    return lst
df['desc'] = df.desc.apply(lambda x: x.split(' ')).apply(padding, args=(max_len,))
# each word makes 1 column
for i in range(max_len):
    newcol = 'desc{}'.format(i)
    df[newcol] = df.desc.apply(lambda x: x[i])
    df.loc[~df[newcol].isin(uniq_mfr), newcol] = np.nan
# merge created columns into 1 by taking 'mfr' values only
df['desc'] = df[df.columns[3:]].fillna('').sum(axis=1).replace('', np.nan)
# create [ABC, ABC] type of column by merging two columns (desc & mfr)
df = df[df.columns[:3]]
df.desc.fillna(df.mfr, inplace=True)
df.desc = [[x, y] for x, y in zip(df.desc.tolist(), df.mfr.tolist())]
df = df[['id', 'desc']]
df = df.sort_values('desc').reset_index(drop=True)
# BELOW IS COMMON WITH SOLUTION2
# from here I borrowed the solution by @mimomu from the URL below (slightly modified)
# try to get merged tuple based on the common elements
# https://stackoverflow.com/questions/4842613/merge-lists-that-share-common-elements
import itertools
L = df.desc.tolist()
LL = set(itertools.chain.from_iterable(L))
for each in LL:
    components = [x for x in L if each in x]
    for i in components:
        L.remove(i)
    L += [tuple(set(itertools.chain.from_iterable(components)))]
# allocate merged tuple to 'desc'
df['desc'] = sorted(L)
# grouping by 'desc' value (a tuple can be a groupby key, a list cannot, fyi...)
df['group'] = df.groupby('desc').ngroup()  # ngroup() replaces the internal .grouper.group_info[0]
# merge with the original
df = df.drop('desc', axis=1).merge(ori, on='id', how='left')
df = df[['id', 'desc', 'mfr', 'group']]
Solution 2 (the second half is common with Solution 1):
import numpy as np
import pandas as pd
# original dataframe
df = pd.DataFrame({
    'id': range(5),
    'desc': ('This is text', 'John Doe ABC', 'John Doe', 'Something JKL', 'Something more'),
    'mfr': ('ABC', 'DEF', 'DEF', 'GHI', 'JKL')
})
# for final merge
ori = df.copy()
# unique 'mfr' values
uniq_mfr = df.mfr.unique().tolist()
# make desc entries as lists
df['desc'] = df.desc.apply(lambda x: x.split(' '))
# pick up the mfr value in the desc column, otherwise nan
mfr_in_descs = []
for ds, ms in zip(df.desc, df.mfr):
    for i, d in enumerate(ds):
        if d in uniq_mfr:
            mfr_in_descs.append(d)
            break  # was 'continue'; 'break' avoids a second append when the match is not the last word
        if i == (len(ds) - 1):
            mfr_in_descs.append(np.nan)
# create column whose element is like [ABC, ABC]
df['desc'] = mfr_in_descs
df['desc'].fillna(df.mfr, inplace=True)
df['desc'] = [[x, y] for x, y in zip(df.desc.tolist(), df.mfr.tolist())]
df = df[['id', 'desc']]
df = df.sort_values('desc').reset_index(drop=True)
# BELOW IS COMMON WITH SOLUTION1
# from here I borrowed the solution by @mimomu from the URL below (slightly modified)
# try to get merged tuple based on the common elements
# https://stackoverflow.com/questions/4842613/merge-lists-that-share-common-elements
import itertools
L = df.desc.tolist()
LL = set(itertools.chain.from_iterable(L))
for each in LL:
    components = [x for x in L if each in x]
    for i in components:
        L.remove(i)
    L += [tuple(set(itertools.chain.from_iterable(components)))]
# allocate merged tuple to 'desc'
df['desc'] = sorted(L)
# grouping by 'desc' value (a tuple can be a groupby key, a list cannot, fyi...)
df['group'] = df.groupby('desc').ngroup()  # ngroup() replaces the internal .grouper.group_info[0]
# merge with the original
df = df.drop('desc', axis=1).merge(ori, on='id', how='left')
df = df[['id', 'desc', 'mfr', 'group']]
Both solutions above give the same resulting df:
id desc mfr group
0 0 This is text ABC 0
1 1 John Doe ABC DEF 0
2 2 John Doe DEF 0
3 3 Something JKL GHI 1
4 4 Something more JKL 1
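For reference, the same transitive grouping can be read as a connected-components problem: link each id to its own mfr and to any mfr value that appears in its desc, then label the components. A minimal sketch using networkx (an extra dependency the solutions above do not use; the component order, and hence the group numbering, is not guaranteed):
import networkx as nx
import pandas as pd
df = pd.DataFrame({
    'id': range(5),
    'desc': ('This is text', 'John Doe ABC', 'John Doe', 'Something JKL', 'Something more'),
    'mfr': ('ABC', 'DEF', 'DEF', 'GHI', 'JKL')
})
uniq_mfr = set(df.mfr)
G = nx.Graph()
for row in df.itertuples():
    # link the row's id to its own mfr and to any mfr value appearing in desc
    G.add_edge(('id', row.id), ('mfr', row.mfr))
    for token in row.desc.split():
        if token in uniq_mfr:
            G.add_edge(('id', row.id), ('mfr', token))
labels = {}
for group, comp in enumerate(nx.connected_components(G)):
    for kind, value in comp:
        if kind == 'id':
            labels[value] = group
df['group'] = df['id'].map(labels)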

Related

Calculate a difference in times between each pair of values in class using Pandas

I am trying to calculate the difference in the "time" column between each pair of elements having the same value in the "class" column.
This is an example of an input:
class name time
0 A Bob 2022-09-05 07:22:15
1 A Sam 2022-09-04 17:18:29
2 B Bob 2022-09-04 03:29:06
3 B Sue 2022-09-04 01:28:34
4 A Carol 2022-09-04 10:40:23
And this is an output:
class name1 name2 timeDiff
0 A Bob Carol 0 days 20:41:52
1 A Bob Sam 0 days 14:03:46
2 A Carol Sam 0 days 06:38:06
3 B Bob Sue 0 days 02:00:32
I wrote this code to solve this problem:
import numpy as np
import pandas as pd
from itertools import combinations

df2 = pd.DataFrame(columns=['class', 'name1', 'name2', 'timeDiff'])
for c in df['class'].unique():
    df_class = df[df['class'] == c]
    groups = df_class.groupby(['name'])['time']
    if len(df_class) > 1:
        out = (pd
               .concat({f'{k1} {k2}': pd.Series(data=np.abs(np.diff([g2.values[0], g1.values[0]])).astype('timedelta64[s]'),
                                                index=[f'{k1} {k2}'], name='timeDiff')
                        for (k1, g1), (k2, g2) in combinations(groups, 2)},
                       names=['name'])
               .reset_index())
        new = out["name"].str.split(" ", n=-1, expand=True)
        out["name1"] = new[0].astype(str)
        out["name2"] = new[1].astype(str)
        out["class"] = c
        del out['level_1'], out['name']
        df2 = df2.append(out, ignore_index=True)
I didn't come up with a solution without going through all the class values in a loop. However, this is very time-consuming if the input table is large. Does anyone have any solutions without using a loop?
The whole thing is a self cross join and a time difference:
import pandas as pd

df = pd.DataFrame({
    'class': ['A', 'A', 'B', 'B', 'A'],
    'name': ['Bob', 'Sam', 'Bob', 'Sue', 'Carol'],
    'time': [
        pd.Timestamp('2022-09-05 07:22:15'),
        pd.Timestamp('2022-09-04 17:18:29'),
        pd.Timestamp('2022-09-04 03:29:06'),
        pd.Timestamp('2022-09-04 01:28:34'),
        pd.Timestamp('2022-09-04 10:40:23'),
    ]
})

rs = list()
for n, df_g in df.groupby('class'):
    t_df = df_g.merge(
        df_g, how='cross',
        suffixes=('_1', '_2')
    )
    t_df = t_df[t_df['name_1'] != t_df['name_2']]
    t_df = t_df.drop(['class_2'], axis=1)\
        .rename({'class_1': 'class'}, axis=1).reset_index(drop=True)
    t_df['timeDiff'] = abs(t_df['time_1'] - t_df['time_2'])\
        .astype('timedelta64[ns]')
    t_df = t_df.drop(['time_1', 'time_2'], axis=1)
    rs.append(t_df)
rs_df = pd.concat(rs).reset_index(drop=True)
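For reference, the per-class loop can also be removed entirely by merging the frame with itself on class; a minimal sketch under the same assumptions (names unique within a class):
# self-join on 'class', then keep each unordered pair once
pairs = df.merge(df, on='class', suffixes=('_1', '_2'))
pairs = pairs[pairs['name_1'] < pairs['name_2']]  # drops self-pairs and mirrored duplicates
pairs['timeDiff'] = (pairs['time_1'] - pairs['time_2']).abs()
out = (pairs[['class', 'name_1', 'name_2', 'timeDiff']]
       .rename(columns={'name_1': 'name1', 'name_2': 'name2'})
       .reset_index(drop=True))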
Check the code below, without an outer join, using aggregation and itertools:
from itertools import combinations

# Function to create the list of name pairs
def agg_to_list(value):
    return list(list(i) for i in combinations(list(value), 2))

# Function to create the pairs of times & calculate the differences between them
def agg_to_list_time(value):
    return [t[0] - t[1] for t in list(combinations(list(value), 2))]

# Apply aggregate functions
updated_df = df.groupby(['class']).agg({'name': agg_to_list, 'time': agg_to_list_time})

# Explode DataFrame & rename column
updated_df = updated_df.explode(['name', 'time']).rename(columns={'time': 'timediff'})

# Unpack name column into two new columns
updated_df[['name1', 'name2']] = pd.DataFrame(updated_df.name.tolist(), index=updated_df.index)

# Final DataFrame
print(updated_df.reset_index()[['class', 'name1', 'name2', 'timediff']])
Output:
class name1 name2 timediff
0 A Bob Sam 0 days 14:03:46
1 A Bob Carol 0 days 20:41:52
2 A Sam Carol 0 days 06:38:06
3 B Bob Sue 0 days 02:00:32

Performing a pandas grouby on a list containing 2 and 3 character string subset of a column

Say I have a simple dataframe with the names of people. I perform a groupby on name:
import pandas as pd
df = pd.DataFrame({'col1' : [1,2,3,4,5,6,7], 'name': ['George', 'John', 'Tim', 'Joe', 'Issac', 'George', 'Tim'] })
df1 = df.groupby('name')
Question: How can I select out a table of specific names from a list which contains a string subset of the names, either 2 or 3 characters long?
e.g. say I have the following list, where both Tim & Geo are the first 3 characters of some entries in the name column and Jo is the first 2 characters of certain entries in the name column.
list = ['Jo', 'Tim', 'Geo']
Attempted: My initial thought was to create new columns in the original dataframe holding either a 2 or 3 character subset of the name column and then try grouping by those; however, since the 2 and 3 character prefixes differ in length, the grouping wouldn't output the correct result.
Not sure whether it would be better to use some if condition such as: if v in list has len(2), group by the 2-char column, else group by the 3-char column, and output the result as one dataframe.
list = ['Jo', 'Tim', 'Geo']
df1['name_2char_subset'] = df1['name'].str[0:2]
df1['name_3char_subset'] = df1['name'].str[0:3]
if v in list is len(2):
    df2 = df1.groupby('name_2char_subset')
else:
    df2 = df1.groupby('name_3char_subset')
Desired Output: Since there are 2 counts of each of Jo, Geo & Tim, the output should group each case, i.e. for Jo there are both John & Joe, hence a count of 2 in the groupby.
df3 = pd.DataFrame({'name': ['Jo', 'Tim', 'Geo'], 'col1': [2,2,2]})
How could we group by name and output the entries in name which start with the given characters from the list?
Any alternative ways of doing this would be helpful, for example extracting the values in the list after the groupby has been performed.
First, don't use list as a variable name, because it shadows a Python built-in. Then use Series.str.extract to test whether strings start with a pattern (anchored by ^) and count with Series.value_counts:
L = ['Jo', 'Tim', 'Geo']
pat = '|'.join(r"^{}".format(x) for x in L)
df = (df['name'].str.extract('(' + pat + ')', expand=False)
        .dropna()
        .value_counts()
        .reindex(L, fill_value=0)
        .rename_axis('name')
        .reset_index(name='col1'))
print(df)
name col1
0 Jo 2
1 Tim 2
2 Geo 2
Your solution:
L = ['Jo', 'Tim', 'Geo']
s1 = df['name'].str[:2]
s2 = df['name'].str[:3]
df = (s1.where(s1.isin(L)).fillna(s2.where(s2.isin(L)))
        .dropna()
        .value_counts()
        .reindex(L, fill_value=0)
        .rename_axis('name')
        .reset_index(name='col1'))
print(df)
name col1
0 Jo 2
1 Tim 2
2 Geo 2
Solution from a deleted answer, changed to use Series.str.startswith to test whether strings start with the values in the list:
L = ['Jo', 'Tim', 'Geo']
df3 = pd.DataFrame({'name': L})
df3['col1'] = df3['name'].apply(lambda x: sum(df['name'].str.startswith(x)))
print (df3)
name col1
0 Jo 2
1 Tim 2
2 Geo 2
EDIT: If you need to group by more columns, use the first or second solution, assign the column back, and aggregate with named aggregation in GroupBy.agg:
df = pd.DataFrame({'age': [1, 2, 3, 4, 5, 6, 7],
                   'name': ['George', 'John', 'Tim', 'Joe', 'Issac', 'George', 'Tim']})
print (df)
L = ['Jo', 'Tim', 'Geo']
pat = '|'.join(r"^{}".format(x) for x in L)
df['name'] = df['name'].str.extract('('+ pat + ')', expand=False)
df = df.groupby('name').agg(sum_age=('age','sum'), col1=('name', 'count'))
print (df)
sum_age col1
name
Geo 7 2
Jo 6 2
Tim 10 2

Explode rows into columns [duplicate]

I have several columns with the same name in a df. I need to rename them, but the problem is that the df.rename method renames them all the same way. How can I rename the blah(s) below to blah1, blah4, blah5?
df = pd.DataFrame(np.arange(2*5).reshape(2,5))
df.columns = ['blah','blah2','blah3','blah','blah']
df
# blah blah2 blah3 blah blah
# 0 0 1 2 3 4
# 1 5 6 7 8 9
Here is what happens when using the df.rename method:
df.rename(columns={'blah':'blah1'})
# blah1 blah2 blah3 blah1 blah1
# 0 0 1 2 3 4
# 1 5 6 7 8 9
Starting with pandas 0.19.0, pd.read_csv() has improved support for duplicate column names, so we can try to use the internal method:
In [137]: pd.io.parsers.ParserBase({'names':df.columns})._maybe_dedup_names(df.columns)
Out[137]: ['blah', 'blah2', 'blah3', 'blah.1', 'blah.2']
Since Pandas 1.3.0:
pd.io.parsers.base_parser.ParserBase({'names':df.columns, 'usecols':None})._maybe_dedup_names(df.columns)
This is the "magic" function:
def _maybe_dedup_names(self, names):
    # see gh-7160 and gh-9424: this helps to provide
    # immediate alleviation of the duplicate names
    # issue and appears to be satisfactory to users,
    # but ultimately, not needing to butcher the names
    # would be nice!
    if self.mangle_dupe_cols:
        names = list(names)  # so we can index
        counts = {}
        for i, col in enumerate(names):
            cur_count = counts.get(col, 0)
            if cur_count > 0:
                names[i] = '%s.%d' % (col, cur_count)
            counts[col] = cur_count + 1
    return names
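Because this internal API has moved between pandas versions, a version-independent option is to inline the same renaming scheme as a small standalone helper (a sketch of the logic above, not a pandas API):
def maybe_dedup_names(names):
    # same scheme as the parser helper: keep the first occurrence,
    # suffix later ones with .1, .2, ...
    names = list(names)
    counts = {}
    for i, col in enumerate(names):
        cur_count = counts.get(col, 0)
        if cur_count > 0:
            names[i] = '%s.%d' % (col, cur_count)
        counts[col] = cur_count + 1
    return names
df.columns = maybe_dedup_names(df.columns)
# ['blah', 'blah2', 'blah3', 'blah.1', 'blah.2']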
I was looking for a solution within pandas more than a general Python solution.
Column's get_loc() function returns a masked array if it finds duplicates, with True values pointing to the locations where duplicates are found. I then use the mask to assign new values into those locations. In my case, I know ahead of time how many dups I'm going to get and what I'm going to assign to them, but it looks like df.columns.get_duplicates() would return a list of all dups, and you can then use that list in conjunction with get_loc() if you need a more generic dup-weeding action.
'''UPDATED AS-OF SEPT 2020'''
cols = pd.Series(df.columns)
for dup in df.columns[df.columns.duplicated(keep=False)]:
    cols[df.columns.get_loc(dup)] = ([dup + '.' + str(d_idx)
                                      if d_idx != 0
                                      else dup
                                      for d_idx in range(df.columns.get_loc(dup).sum())])
df.columns = cols
blah blah2 blah3 blah.1 blah.2
0 0 1 2 3 4
1 5 6 7 8 9
New Better Method (update 03 Dec 2019)
The code below is better than the code above. Copied from another answer below (@SatishSK):
# sample df with duplicated blah columns
df = pd.DataFrame(np.arange(2*5).reshape(2, 5))
df.columns = ['blah', 'blah2', 'blah3', 'blah', 'blah']
df

# you just need the following 4 lines to rename duplicates
# df is the dataframe whose duplicated columns you want to rename
cols = pd.Series(df.columns)
for dup in cols[cols.duplicated()].unique():
    cols[cols[cols == dup].index.values.tolist()] = [dup + '.' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]

# rename the columns with the cols list
df.columns = cols
df
Output:
blah blah2 blah3 blah.1 blah.2
0 0 1 2 3 4
1 5 6 7 8 9
You could use this:
def df_column_uniquify(df):
    df_columns = df.columns
    new_columns = []
    for item in df_columns:
        counter = 0
        newitem = item
        while newitem in new_columns:
            counter += 1
            newitem = "{}_{}".format(item, counter)
        new_columns.append(newitem)
    df.columns = new_columns
    return df
Then
import numpy as np
import pandas as pd
df=pd.DataFrame(np.arange(2*5).reshape(2,5))
df.columns=['blah','blah2','blah3','blah','blah']
so that df:
blah blah2 blah3 blah blah
0 0 1 2 3 4
1 5 6 7 8 9
then
df = df_column_uniquify(df)
so that df:
blah blah2 blah3 blah_1 blah_2
0 0 1 2 3 4
1 5 6 7 8 9
You could assign directly to the columns:
In [12]:
df.columns = ['blah','blah2','blah3','blah4','blah5']
df
Out[12]:
blah blah2 blah3 blah4 blah5
0 0 1 2 3 4
1 5 6 7 8 9
[2 rows x 5 columns]
If you want to dynamically rename just the duplicate columns, you could do something like the following (code taken from answer 2: Index of duplicates items in a python list):
In [25]:
import collections

dups = collections.defaultdict(list)
dup_indices = []
col_list = list(df.columns)
for i, e in enumerate(list(df.columns)):
    dups[e].append(i)
for k, v in sorted(dups.items()):
    if len(v) >= 2:
        dup_indices = v
        for i in dup_indices:
            col_list[i] = col_list[i] + ' ' + str(i)
col_list
Out[25]:
['blah 0', 'blah2', 'blah3', 'blah 3', 'blah 4']
You could then use this to assign back; you could also have a function to generate a unique name that is not present in the columns prior to renaming.
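Such a helper is not shown in the answer; a minimal sketch of the idea (the name unique_name is hypothetical) could look like this:
def unique_name(base, existing):
    # append _1, _2, ... until the candidate no longer collides
    candidate, n = base, 0
    while candidate in existing:
        n += 1
        candidate = "{}_{}".format(base, n)
    return candidate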
duplicated_idx = dataset.columns.duplicated()
duplicated = dataset.columns[duplicated_idx].unique()
rename_cols = []
counts = {}
for col in dataset.columns:
    if col in duplicated:
        # count occurrences per name so each duplicate gets its own suffix
        counts[col] = counts.get(col, 0) + 1
        rename_cols.append(col + '_' + str(counts[col]))
    else:
        rename_cols.append(col)
dataset.columns = rename_cols
Thank you @Lamakaha for the solution. Your idea gave me a chance to modify it and make it work in all cases.
I am using Python 3.7.3.
I tried your piece of code on my data set, which had only one duplicated column, i.e. two columns with the same name. Unfortunately, the column names remained as-is without being renamed. On top of that, I got a warning that "get_duplicates() is deprecated and will be removed in a future version". I used duplicated() coupled with unique() in place of get_duplicates(), which did not yield the expected result.
I have modified your piece of code a little bit, and it now works for my data set as well as in other general cases.
Here are the code runs, with and without the modification, on the example data set mentioned in the question, along with results:
df = pd.DataFrame(np.arange(2*5).reshape(2, 5))
df.columns = ['blah', 'blah2', 'blah3', 'blah', 'blah']
df

cols = pd.Series(df.columns)
for dup in df.columns.get_duplicates():
    cols[df.columns.get_loc(dup)] = [dup + '.' + str(d_idx) if d_idx != 0 else dup for d_idx in range(df.columns.get_loc(dup).sum())]
df.columns = cols
df
f:\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: FutureWarning:
'get_duplicates' is deprecated and will be removed in a future
release. You can use idx[idx.duplicated()].unique() instead
Output:
blah blah2 blah3 blah blah.1
0 0 1 2 3 4
1 5 6 7 8 9
Two of the three "blah"(s) are not renamed properly.
Modified code
df = pd.DataFrame(np.arange(2*5).reshape(2, 5))
df.columns = ['blah', 'blah2', 'blah3', 'blah', 'blah']
df

cols = pd.Series(df.columns)
for dup in cols[cols.duplicated()].unique():
    cols[cols[cols == dup].index.values.tolist()] = [dup + '.' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
df.columns = cols
df
Output:
blah blah2 blah3 blah.1 blah.2
0 0 1 2 3 4
1 5 6 7 8 9
Here is a run of the modified code on another example:
cols = pd.Series(['X', 'Y', 'Z', 'A', 'B', 'C', 'A', 'A', 'L', 'M', 'A', 'Y', 'M'])
for dup in cols[cols.duplicated()].unique():
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
cols
Output:
0 X
1 Y
2 Z
3 A
4 B
5 C
6 A_1
7 A_2
8 L
9 M
10 A_3
11 Y_1
12 M_1
dtype: object
Hope this helps anybody who is seeking an answer to the aforementioned question.
Since the accepted answer (by Lamakaha) does not work for recent versions of pandas, and because the other suggestions looked a bit clumsy, I worked out my own solution:
def dedupIndex(idx, fmt=None, ignoreFirst=True):
    # fmt: A string format that receives two arguments:
    #      name and a counter. By default: fmt='%s.%03d'
    # ignoreFirst: Disable/enable postfixing of first element.
    idx = pd.Series(idx)
    duplicates = idx[idx.duplicated()].unique()
    fmt = '%s.%03d' if fmt is None else fmt
    for name in duplicates:
        dups = idx == name
        ret = [fmt % (name, i) if (i != 0 or not ignoreFirst) else name
               for i in range(dups.sum())]
        idx.loc[dups] = ret
    return pd.Index(idx)
Use the function as follows:
df.columns = dedupIndex(df.columns)
# Result: ['blah', 'blah2', 'blah3', 'blah.001', 'blah.002']
df.columns = dedupIndex(df.columns, fmt='%s #%d', ignoreFirst=False)
# Result: ['blah #0', 'blah2', 'blah3', 'blah #1', 'blah #2']
Here's a solution that also works for multi-indexes
# Take a df and rename duplicate columns by appending number suffixes
def rename_duplicates(df):
    import copy
    new_columns = df.columns.values
    suffix = {key: 2 for key in set(new_columns)}
    dup = pd.Series(new_columns).duplicated()
    if type(df.columns) == pd.core.indexes.multi.MultiIndex:
        # Need to be mutable, so make the tuples lists
        for i in range(len(new_columns)):
            new_columns[i] = list(new_columns[i])
        for ix, item in enumerate(new_columns):
            item_orig = copy.copy(item)
            if dup[ix]:
                for level in range(len(new_columns[ix])):
                    new_columns[ix][level] = new_columns[ix][level] + f"_{suffix[tuple(item_orig)]}"
                suffix[tuple(item_orig)] += 1
        for i in range(len(new_columns)):
            new_columns[i] = tuple(new_columns[i])
        df.columns = pd.MultiIndex.from_tuples(new_columns)
    # Not a MultiIndex
    else:
        for ix, item in enumerate(new_columns):
            if dup[ix]:
                new_columns[ix] = item + f"_{suffix[item]}"
                suffix[item] += 1
        df.columns = new_columns
I just wrote this code; it uses a list comprehension to update all duplicated names:
df.columns = [x[1] if x[1] not in df.columns[:x[0]] else f"{x[1]}_{list(df.columns[:x[0]]).count(x[1])}" for x in enumerate(df.columns)]
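On the example frame above this yields ['blah', 'blah2', 'blah3', 'blah_1', 'blah_2'], since each later duplicate is suffixed with the count of earlier occurrences of its name (the comprehension reads the original df.columns, which is only replaced after it finishes).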
Created a function with some tests so it should be drop-in ready; this is a little different from Lamakaha's excellent solution since it renames the first appearance of a duplicate column as well:
from collections import defaultdict
from typing import Dict, List, Set

import pandas as pd

def rename_duplicate_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename column headers to ensure no header names are duplicated.

    Args:
        df (pd.DataFrame): A dataframe with a single index of columns

    Returns:
        pd.DataFrame: The dataframe with headers renamed; inplace
    """
    if not df.columns.has_duplicates:
        return df
    duplicates: Set[str] = set(df.columns[df.columns.duplicated()].tolist())
    indexes: Dict[str, int] = defaultdict(lambda: 0)
    new_cols: List[str] = []
    for col in df.columns:
        if col in duplicates:
            indexes[col] += 1
            new_cols.append(f"{col}.{indexes[col]}")
        else:
            new_cols.append(col)
    df.columns = new_cols
    return df

def test_rename_duplicate_columns():
    df = pd.DataFrame(data=[[1, 2]], columns=["a", "b"])
    assert rename_duplicate_columns(df).columns.tolist() == ["a", "b"]
    df = pd.DataFrame(data=[[1, 2]], columns=["a", "a"])
    assert rename_duplicate_columns(df).columns.tolist() == ["a.1", "a.2"]
    df = pd.DataFrame(data=[[1, 2, 3]], columns=["a", "b", "a"])
    assert rename_duplicate_columns(df).columns.tolist() == ["a.1", "b", "a.2"]
We can just assign each column a different name.
Suppose the duplicated column names are like ['a', 'b', 'c', 'd', 'd', 'c'].
Then just create a list of the names you want to assign:
c = ['a', 'b', 'c', 'd', 'd1', 'c1']
df.columns = c
This works for me.
This is my solution:
cols = []  # for tracking if we already saw it before
new_cols = []
for col in df.columns:
    cols.append(col)
    count = cols.count(col)
    if count > 1:
        new_cols.append(f'{col}_{count}')
    else:
        new_cols.append(col)
df.columns = new_cols
Here's an elegant solution:
Isolate a dataframe with only the repeated columns (looks like it will be a series but it will be a dataframe if >1 column with that name):
df1 = df['blah']
For each "blah" column, give it a unique number
df1.columns = ['blah_' + str(int(x)) for x in range(len(df1.columns))]
Isolate a dataframe with all but the repeated columns:
df2 = df[[x for x in df.columns if x != 'blah']]
Merge back together on indices:
df3 = pd.merge(df1, df2, left_index=True, right_index=True)
Et voila:
blah_0 blah_1 blah_2 blah2 blah3
0 0 3 4 1 2
1 5 8 9 6 7

replace alphabets

I would like to replace the letters by their order number in the alphabet:
import string
import pandas as pd
new_vals = {c: ord(c)-96 for c in string.ascii_lowercase}
df = pd.DataFrame({'Values': ['aaa', 'abc', 'def']})
df['Values_new'] = [''.join(str(new_vals[c]) for c in row) for row in df['Values']]
df is now:
>>> df
Values Values_new
0 aaa 111
1 abc 123
2 def 456
Then you can go in and add your what-seems-like-decimal notation, although the logic there seems a little unclear to me (you have a comma listed above):
df['Values_new'] = [v[0] + '.' + v[1:] for v in df['Values_new']]
Result:
>>> df
Values Values_new
0 aaa 1.11
1 abc 1.23
2 def 4.56

python list comprehension return names based on letter

I have a DataFrame:
df = pd.DataFrame(['A','B','C'], columns = ['Letters'])
I have a list with names:
names = ['George All', 'George Ball','George Ago','George Call']
How do I create a new column in my DataFrame that contains the list of names whose last name starts with the letter in the Letters column?
For example:
Letters Names
A ['George All','George Ago']
B George Ball
C George Call
This is what I have right now:
df['Names'] = [name for name in names if (name.split()[1][0] == df['Letters'])]
>>> df['Names'] = [[n for n in names if n.split()[1][0] == x] for x in df['Letters']]
>>> df
Letters Names
0 A [George All, George Ago]
1 B [George Ball]
2 C [George Call]
[3 rows x 2 columns]
You could do this more efficiently (important if names is a large list) by grouping all the names by first letter beforehand.
>>> from collections import defaultdict
>>> d = defaultdict(list)
>>> for item in names:
... d[item.split()[1][0]].append(item)
...
>>> df['Names'] = [d[x] for x in df['Letters']]
>>> df
Letters Names
0 A [George All, George Ago]
1 B [George Ball]
2 C [George Call]
