Transpose rows using gropby with multiple values - python

I have this below DataFrame:
df = pd.DataFrame({'DC':['Alexandre', 'Alexandre', 'Afonso de Sousa', 'Afonso de Sousa','Afonso de Sousa'],
'PN':['Farmacia Ekolelo', 'Farmacia Havemos De Voltar', 'Farmacia Gloria', 'Farmacia Mambofar','Farmacia Metamorfose'],
'PC':['C-HO-002815', 'C-HO-005192', 'C-HO-002719', 'C-HO-003030','C-SCC-012430'],
'KCP':['NA', 'DOMINGAS PAULO', 'HELDER', 'Mambueno','RITA'],
'MBN':['NA', 'NA', 29295486345, 9.40407E+11,2.92955E+11]})
Trying to convert data into below format.
By grouping DC column needs to transpose other columns as per above format.

You can group by DC then aggregate with list. From there you can concat the dataframes created from the aggregated lists:
import pandas as pd
df = pd.DataFrame({'DC':['Alexandre', 'Alexandre', 'Afonso de Sousa', 'Afonso de Sousa','Afonso de Sousa'],
'PN':['Farmacia Ekolelo', 'Farmacia Havemos De Voltar', 'Farmacia Gloria', 'Farmacia Mambofar','Farmacia Metamorfose'],
'PC':['C-HO-002815', 'C-HO-005192', 'C-HO-002719', 'C-HO-003030','C-SCC-012430'],
'KCP':['NA', 'DOMINGAS PAULO', 'HELDER', 'Mambueno','RITA'],
'MBN':['NA', 'NA', 29295486345, 9.40407E+11,2.92955E+11]})
df = df.groupby('DC', as_index=False).agg(list)
#print(df)
df_out = pd.concat([df[['DC']]] +
[
pd.DataFrame(l := df[col].to_list(),
columns=[f'{col}_{i}' for i in range(1, max(len(s) for s in l) + 1)]
) for col in df.columns[1:]
],
axis=1)
Note: the assignment in the comprehension l := df[col].to_list() only works for Python versions >= 3.8.
This will give you:
DC PN_1 PN_2 PN_3 ... KCP_3 MBN_1 MBN_2 MBN_3
0 Afonso de Sousa Farmacia Gloria Farmacia Mambofar Farmacia Metamorfose ... RITA 29295486345 940407000000.0 2.929550e+11
1 Alexandre Farmacia Ekolelo Farmacia Havemos De Voltar None ... None NA NA NaN
You can then sort the columns with your own function:
def sort_columns(col_lbl):
col, ind = col_lbl.split('_')
return (int(ind), df.columns.to_list().index(col))
df_out.columns = ['DC'] + sorted(df_out.columns[1:].to_list(), key=sort_columns)
Output:
DC PN_1 PC_1 KCP_1 ... PN_3 PC_3 KCP_3 MBN_3
0 Afonso de Sousa Farmacia Gloria Farmacia Mambofar Farmacia Metamorfose ... RITA 29295486345 940407000000.0 2.929550e+11
1 Alexandre Farmacia Ekolelo Farmacia Havemos De Voltar None ... None NA NA NaN

Related

Data-frame using values from different rows while iterating

updated info at bottom
I have a group from a df.groupby that looks like this:
stop_id stop_name arrival_time departure_time stop_sequence
0 87413013 Gare de Le Havre 05:20:00 05:20:00 0.0
1 87413344 Gare de Bréauté-Beuzeville 05:35:00 05:36:00 1.0
2 87413385 Gare de Yvetot 05:49:00 05:50:00 2.0
3 87411017 Gare de Rouen-Rive-Droite 06:12:00 06:15:00 3.0
4 87384008 Gare de Paris-St-Lazare 07:38:00 07:38:00 4.0
I want to loop each row and use "stop_name" as the location of departure
and then get the following "stop_name" of the next rows as the location of arrival.
Finally I use the below func in order to parse the times and calc the trip duration in seconds.
def timestrToSeconds(timestr):
ftr = [3600,60,1]
return sum([a*b for a,b in zip(ftr, map(int,timestr.split(':')))])
The output is expected to be an array with all possible combinations like below :
result = [
('Gare de Le Havre', 'Gare de Bréauté-Beuzeville', 900),
('Gare de Le Havre', 'Gare de Yvetot', 1740),
('Gare de Le Havre', 'Gare de Rouen-Rive-Droite', 3120),
('Gare de Le Havre', 'Gare de Paris-St-Lazare', 8280),
('Gare de Bréauté-Beuzeville', 'Gare de Yvetot', 780),
('Gare de Bréauté-Beuzeville', 'Gare de Rouen-Rive-Droite', 2160),
('Gare de Bréauté-Beuzeville', 'Gare de Paris-St-Lazare', 7320),
('Gare de Yvetot', 'Gare de Rouen-Rive-Droite', 3120),
('Gare de Yvetot', 'Gare de Paris-St-Lazare', 6480),
('Gare de Rouen-Rive-Droite', 'Gare de Paris-St-Lazare', 4980),
]
I have tried with nested loops but ended up being too abstract for me.
Any advice is more than welcome
UPDATE
Mazhar's solution seems to work find on a single group, but when i loop through my groupby like this :
timeBetweenStops = []
for group_name, group in xgrouped:
group.arrival_time = pd.to_timedelta(group.arrival_time)
group.departure_time = pd.to_timedelta(group.departure_time)
new_df = group['departure_time'].apply(lambda x: (
group['arrival_time']-x).apply(lambda y: y.total_seconds()))
new_df.index = group.stop_name
new_df.columns = group.stop_name
for i in new_df.index:
for j in new_df.columns:
if new_df.loc[i, j] > 0:
r = (i, j, new_df.loc[i, j])
timeBetweenStops.append(r)
I get the following error:
ValueError Traceback (most recent call last)
<ipython-input-196-ec050382d2b5> in <module>
14 for i in new_df.index:
15 for j in new_df.columns:
---> 16 if new_df.loc[i, j] > 0:
17 r = (i, j, new_df.loc[i, j])
18 timeBetweenStopsA.append(r)
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in __nonzero__(self)
1476
1477 def __nonzero__(self):
-> 1478 raise ValueError(
1479 f"The truth value of a {type(self).__name__} is ambiguous. "
1480 "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
I have tried to use if np.where(new_df.loc[i, j] > 0): , but then i get plenty of incoherences in my result.
Convert your time columns to Timedelta with to_timedelta
df['arrival_time'] = pd.to_timedelta(df['arrival_time'])
df['departure_time'] = pd.to_timedelta(df['departure_time'])
Now use itertools.combinations to generate all combinations:
from itertools import combinations
comb = lambda x: [
(x.loc[i1, 'stop_name'], x.loc[i2, 'stop_name'],
int((x.loc[i2, 'departure_time'] - x.loc[i1, 'arrival_time']).total_seconds()))
for i1, i2 in combinations(x.index, 2)
]
For your current group:
>>> comb(df)
[('Gare de Le Havre', 'Gare de Bréauté-Beuzeville', 960),
('Gare de Le Havre', 'Gare de Yvetot', 1800),
('Gare de Le Havre', 'Gare de Rouen-Rive-Droite', 3300),
('Gare de Le Havre', 'Gare de Paris-St-Lazare', 8280),
('Gare de Bréauté-Beuzeville', 'Gare de Yvetot', 900),
('Gare de Bréauté-Beuzeville', 'Gare de Rouen-Rive-Droite', 2400),
('Gare de Bréauté-Beuzeville', 'Gare de Paris-St-Lazare', 7380),
('Gare de Yvetot', 'Gare de Rouen-Rive-Droite', 1560),
('Gare de Yvetot', 'Gare de Paris-St-Lazare', 6540),
('Gare de Rouen-Rive-Droite', 'Gare de Paris-St-Lazare', 5160)]
On many groups:
>>> df.groupby(...).apply(comb)
1 [(Gare de Le Havre, Gare de Bréauté-Beuzeville...
dtype: object
df.arrival_time = pd.to_timedelta(df.arrival_time)
df.departure_time = pd.to_timedelta(df.departure_time)
new_df = df['departure_time'].apply(lambda x: (
df['arrival_time']-x).apply(lambda y: y.total_seconds()))
new_df.index = df.stop_name
new_df.columns = df.stop_name
for i in new_df.index:
for j in new_df.columns:
if new_df.loc[i, j] > 0:
print(i, j, new_df.loc[i, j])
Until you update your question so this code can be checked with real data, here is one solution:
all_combs=combinations(df['stop_name'].to_list())
results=[]
for c in all_combs:
results.append((*c,abs(df.loc[df['stop_name']==c[0],'arrival_time']-df.loc[df['stop_name']==c[1],'arrival_time'])))
That's assum,ing that arrival_time (or whatever desired column you try to look into) is already in pandas.timedate format. If not, take a look here and convert to timedate:
Pandas convert Column to time
Note: This code works assuming that you have one value for each location in the column.
I don't think you can escape nested loops here. It may be possible to do it using list comprehension but it will be even more abstract...
You can get your result with the following code:
resultat = []
for i, ligne1 in df.iterrows():
depart = ligne1.stop_name
departure_time = ligne1.departure_time
for _, ligne2 in df.iloc[(i + 1):].iterrows():
arrivee = ligne2.stop_name
arrival_time = ligne2.arrival_time
duree = timestrToSeconds(arrival_time) - timestrToSeconds(departure_time)
resultat = resultat + [(depart, arrivee, duree)]
(Edit) This code works assuming that stations are ordered from departure to arrival. If it's not the case, you can order the dataframe with:
df = df.sort_values(by = 'departure_time')
I think you can do this without loops, substituting a heavy-handed cross join instead:
from io import StringIO
import pandas
import numpy
filedata = StringIO("""\
stop_id stop_name arrival_time departure_time stop_sequence
87413013 Gare de Le Havre 05:20:00 05:20:00 0.0
87413344 Gare de Bréauté-Beuzeville 05:35:00 05:36:00 1.0
87413385 Gare de Yvetot 05:49:00 05:50:00 2.0
87411017 Gare de Rouen-Rive-Droite 06:12:00 06:15:00 3.0
87384008 Gare de Paris-St-Lazare 07:38:00 07:38:00 4.0
""")
df = (
pandas.read_csv(filedata, sep="\s\s+", parse_dates=["arrival_time", "departure_time"])
)
results = (
df.merge(df, how="cross")
.loc[lambda df: df["stop_sequence_x"] < df["stop_sequence_y"]]
.assign(travel_time_seconds=lambda df:
df["arrival_time_y"]
.sub(df["departure_time_x"])
.dt.total_seconds()
)
.loc[:, ["stop_name_x", "stop_name_y", "travel_time_seconds"]]
.reset_index(drop=True)
)
and that gives me:
stop_name_x stop_name_y travel_time_seconds
0 Gare de Le Havre Gare de Bréauté-Beuzeville 900.0
1 Gare de Le Havre Gare de Yvetot 1740.0
2 Gare de Le Havre Gare de Rouen-Rive-Droite 3120.0
3 Gare de Le Havre Gare de Paris-St-Lazare 8280.0
4 Gare de Bréauté-Beuzeville Gare de Yvetot 780.0
5 Gare de Bréauté-Beuzeville Gare de Rouen-Rive-Droite 2160.0
6 Gare de Bréauté-Beuzeville Gare de Paris-St-Lazare 7320.0
7 Gare de Yvetot Gare de Rouen-Rive-Droite 1320.0
8 Gare de Yvetot Gare de Paris-St-Lazare 6480.0
9 Gare de Rouen-Rive-Droite Gare de Paris-St-Lazare 4980.0

Splitting columns in a DF based on multiple conditions

I have a df with multiple columns. I need to separate one of those columns into two columns, one based on ID and another one based on the description.
for example on row 34:
data['cpv'][34] = '45232460-4 - Obras de saneamento'
I would need to obtain column cpvid to be 45232460-4 and column cpvdescription to be Obras de saneamento.
This would be fairly easy to do with a string split.
However there are some some cases where
df['cpv'][45] = '45112500-0 - Movimento de terras | 45232411-6 - Construção de condutas para águas residuais | 45232423-3 - Construção de estações de bombagem de águas residuais'
Meaning there are multiple ID's and multiple descriptions on the same row. I was wondering if there is any efficient way to split the columns based on more than one condition. Meaning the first condition would be '-' (under brackets for space) and another condition for '|'.
Could anyone please assist? I'm still a newbie I tried to find some similar posts however none seem to fit my desired output.
Thanks!
If you want the long format you can make use of a string split combined with the explode method (I've create a dummy df based on your data):
df = pd.DataFrame({
'cpv':['45232460-4 - Obras de saneamento', '45112500-0 - Movimento de terras | 45232411-6 - Construção de condutas para águas residuais | 45232423-3 - Construção de estações de bombagem de águas residuais'],
'val':[1,2]
})
df = df.assign(cpv=df.cpv.str.split(r' \| ')).explode('cpv')
df = pd.concat([df, df.cpv.str.split(r' - ', expand=True).rename(columns={0:'cpvid', 1:'cpvdescription'})], axis=1).drop('cpv', axis=1)
print(df)
val cpvid cpvdescription
0 1 45232460-4 Obras de saneamento
1 2 45112500-0 Movimento de terras
1 2 45232411-6 Construção de condutas para águas residuais
1 2 45232423-3 Construção de estações de bombagem de águas re...
If you want the wide format you can try:
df = pd.DataFrame({
'cpv':['45232460-4 - Obras de saneamento', '45112500-0 - Movimento de terras | 45232411-6 - Construção de condutas para águas residuais | 45232423-3 - Construção de estações de bombagem de águas residuais'],
'val':[1,2]
})
cpv_df = pd.DataFrame(df.assign(cpv=df.cpv.str.split(r' \| ')).cpv.to_list())
df = pd.concat([df]+[cpv_df[col].str.split(r' - ', expand=True).rename(columns={0:f'cpvid_{col}', 1:f'cpvdescription_{col}'}) for col in cpv_df], axis=1).drop('cpv', axis=1)
print(df)
val cpvid_0 cpvdescription_0 cpvid_1 \
0 1 45232460-4 Obras de saneamento None
1 2 45112500-0 Movimento de terras 45232411-6
cpvdescription_1 cpvid_2 \
0 None None
1 Construção de condutas para águas residuais 45232423-3
cpvdescription_2
0 None
1 Construção de estações de bombagem de águas re...

Counting the characters of dictionary values inside a dataframe column

After downloading Facebook data, they provide json files with your post information. I read the json and dataframe with pandas. Now I want to count the characters of every post I made. The posts are in: df['data'] like: [{'post': 'Happy bday Raul'}].
I want the output to be the count of characters of: "Happy bday Raul" which will be 15 in this case or 7 in the case of "Morning" from [{'post': 'Morning'}].
df=pd.read_json('posts_1.json')
The columns are Date and Data with this format:
Date Data
01-01-2020 *[{'post': 'Morning'}]*
10-03-2020 *[{'post': 'Happy bday Raul'}]*
17-03-2020 *[{'post': 'This lockdown is sad'}]*
I tried to count the characters of this [{'post': 'Morning'}] by doing this
df['count']=df['data'].str.len()
But it's not working as result in "1".
I need to extract the value of the dictionary and do the len to count the characters. The output will be:
Date Data COUNT
01-01-2020 *[{'post': 'Morning'}]* 5
10-03-2020 *[{'post': 'Happy bday Raul'}]* 15
17-03-2020 *[{'post': 'This lockdown is sad'}]* 20
EDITED:
Used to_dict()
df11=df_post['data'].to_dict()
Output
{0: [{'post': 'Feliz cumpleaños Raul'}],
1: [{'post': 'Muchas felicidades Tere!!! Espero que todo vaya genial y siga aún mejor! Un beso desde la Escandinavia profunda'}],
2: [{'post': 'Hola!\nUna investigadora vendrá a finales de mayo, ¿Alguien tiene una habitación libre en su piso para ella? Many Thanks!'}],
3: [{'post': '¿Cómo va todo? Se que muchos estáis o estábais por Galicia :D\n\nOs recuerdo, el proceso de Matriculación tiene unos plazos concretos: desde el lunes 13 febrero hasta el viernes 24 de febrero.'}]
}
You can access the value of the post key for each row using list comprehension and count the length with str.len():
In one line of code, it would look like this:
df[1] = pd.Series([x['post'] for x in df[0]]).str.len()
This would also work, but I think it would be slower to execute:
df[1] = df[0].apply(lambda x: x['post']).str.len()
Full reproducible code below:
df = pd.DataFrame({0: [{'post': 'Feliz cumpleaños Raul'}],
1: [{'post': 'Muchas felicidades Tere!!! Espero que todo vaya genial y siga aún mejor! Un beso desde la Escandinavia profunda'}],
2: [{'post': 'Hola!\nUna investigadora vendrá a finales de mayo, ¿Alguien tiene una habitación libre en su piso para ella? Many Thanks!'}],
3: [{'post': '¿Cómo va todo? Se que muchos estáis o estábais por Galicia :D\n\nOs recuerdo, el proceso de Matriculación tiene unos plazos concretos: desde el lunes 13 febrero hasta el viernes 24 de febrero.'}]
})
df = df.T
df[1] = [x['post'] for x in df[0]]
df[2] = df[1].str.len()
df
Out[1]:
0 \
0 {'post': 'Feliz cumpleaños Raul'}
1 {'post': 'Muchas felicidades Tere!!! Espero qu...
2 {'post': 'Hola!
Una investigadora vendrá a fi...
3 {'post': '¿Cómo va todo? Se que muchos está...
1 2
0 Feliz cumpleaños Raul 22
1 Muchas felicidades Tere!!! Espero que todo vay... 112
2 Hola!\nUna investigadora vendrá a finales de ... 123
3 ¿Cómo va todo? Se que muchos estáis o está... 195

How to compare two dataframes by a key and create a new one, but just keeping the keys that are not in the first?

In python 3 and pandas I have two dataframes with the same structure:
data_1 = {
'numero_cnj' : ['0700488-61.2018.8.07.0017', '0003557-92.2008.4.01.3801', '1009486-37.2017.8.26.0053', '5005742-49.2017.4.04.9999', '0700488-61.2018.8.07.0017'],
'nome_normalizado' : ['MARIA DOS REIS DE OLIVEIRA SILVA', 'MARIA SELMA OLIVEIRA DE SOUZA E ANDRADE FERREIRA', 'SAO PAULO PREVIDENCIA - SPPREV', 'INSTITUTO NACIONAL DO SEGURO SOCIAL', 'GERALDO CAVALCANTE DA SILVEIRA']
}
df_1 = pd.DataFrame(data_1)
data_2 = {
'numero_cnj' : ['0700488-61.2018.8.07.0017', '5005742-49.2017.4.04.9999', '1009486-37.2017.8.26.0053', '0700488-61.2018.8.07.0017'],
'nome_normalizado' : ['MARIA DOS REIS DE OLIVEIRA SILVA', 'INSTITUTO NACIONAL DO SEGURO SOCIAL', 'SAO PAULO PREVIDENCIA - SPPREV', 'GERALDO CAVALCANTE DA SILVEIRA']
}
df_2 = pd.DataFrame(data_2)
The "numero_cnj" column is an identifying key for the same item, but it can be repeated because more than one person/name can refer to that item.
I want to compare the two dataframes by the key "numero_cnj" and create a new dataframe from df_1, but just keeping the rows or keys that are in df_2 but not in df_1 - keep all keys from df_1 that were not found in df_2
For example
df_1
numero_cnj nome_normalizado
0 0700488-61.2018.8.07.0017 MARIA DOS REIS DE OLIVEIRA SILVA
1 0003557-92.2008.4.01.3801 MARIA SELMA OLIVEIRA DE SOUZA E ANDRADE FERREIRA
2 1009486-37.2017.8.26.0053 SAO PAULO PREVIDENCIA - SPPREV
3 5005742-49.2017.4.04.9999 INSTITUTO NACIONAL DO SEGURO SOCIAL
4 0700488-61.2018.8.07.0017 GERALDO CAVALCANTE DA SILVEIRA
df_2
numero_cnj nome_normalizado
0 0700488-61.2018.8.07.0017 MARIA DOS REIS DE OLIVEIRA SILVA
1 5005742-49.2017.4.04.9999 INSTITUTO NACIONAL DO SEGURO SOCIAL
2 1009486-37.2017.8.26.0053 SAO PAULO PREVIDENCIA - SPPREV
3 0700488-61.2018.8.07.0017 GERALDO CAVALCANTE DA SILVEIRA
In this case, the new dataframe would have only the line:
0003557-92.2008.4.01.3801 MARIA SELMA OLIVEIRA DE SOUZA E ANDRADE FERREIRA
Please, does anyone know the best strategy to do this?
If I'm reading your question correctly, you should use join (merge) with how=outer:
merge = pd.merge(df_1, df_2, on = "numero_cnj", suffixes = ["", "_y"], how = "outer", indicator=True)
merge[merge._merge == "left_only"][["numero_cnj", "nome_normalizado"]]
The output is:
numero_cnj nome_normalizado
4 0003557-92.2008.4.01.3801 MARIA SELMA OLIVEIRA DE SOUZA E ANDRADE FERREIRA

TypeError: cannot concatenate object of type '<class 'str'>'; only Series and DataFrame objs are valid

my data which called car_A :
Source
0 CAULAINCOURT
1 MARCHE DE L'EUROPE
2 AU MAIRE
I would like to find from all path from sources to destination something like:
Source Destination
0 CAULAINCOURT MARCHE DE L'EUROPE
2 CAULAINCOURT AU MAIRE
3 MARCHE DE L'EUROPE AU MAIRE
.
.
.
I already have tried
for i in car_A['Names']:
for j in range(len(car_A)-1):
car_A = car_A.append(car_A.iloc[j+1,0])
But i got
TypeError: cannot concatenate object of type '<class 'str'>'; only Series and DataFrame objs are valid
How can i get mentioned dataset?
A small variation on the fine answer from #James. itertools.permutations removes the duplicates for you.
import pandas as pd
from itertools import permutations
df = pd.DataFrame({'sources': [
"CAULAINCOURT",
"MARCHE DE L'EUROPE",
"AU MAIRE"
]})
df_pairs = pd.DataFrame(
[x for x in permutations(df.sources, 2)],
columns=['source', 'dest'])
df_pairs
# returns
source dest
0 CAULAINCOURT MARCHE DE L'EUROPE
1 CAULAINCOURT AU MAIRE
2 MARCHE DE L'EUROPE CAULAINCOURT
3 MARCHE DE L'EUROPE AU MAIRE
4 AU MAIRE CAULAINCOURT
5 AU MAIRE MARCHE DE L'EUROPE
Another solution, using DataFrame.merge():
import pandas as pd
df = pd.DataFrame({'Source': [
"CAULAINCOURT",
"MARCHE DE L'EUROPE",
"AU MAIRE"
]})
df = df.assign(key=1).merge(df.assign(key=1), on='key').drop('key', 1).rename(columns={'Source_x':'Source', 'Source_y':'Destination'})
df = df[df.Source != df.Destination]
print(df)
Prints:
Source Destination
1 CAULAINCOURT MARCHE DE L'EUROPE
2 CAULAINCOURT AU MAIRE
3 MARCHE DE L'EUROPE CAULAINCOURT
5 MARCHE DE L'EUROPE AU MAIRE
6 AU MAIRE CAULAINCOURT
7 AU MAIRE MARCHE DE L'EUROPE
You can use itertools.product to build a set of all of the pairs, filter to remove when the source and destination are the same location, and then construct a new data frame.
import pandas as pd
from itertools import product
df = pd.DataFrame({'sources': [
"CAULAINCOURT",
"MARCHE DE L'EUROPE",
"AU MAIRE"
]})
df_pairs = pd.DataFrame(
filter(lambda x: x[0]!=x[1], product(df.sources, df.sources)),
columns=['source', 'dest']
)
df_pairs
# returns:
source dest
0 CAULAINCOURT MARCHE DE L'EUROPE
1 CAULAINCOURT AU MAIRE
2 MARCHE DE L'EUROPE CAULAINCOURT
3 MARCHE DE L'EUROPE AU MAIRE
4 AU MAIRE CAULAINCOURT
5 AU MAIRE MARCHE DE L'EUROPE

Categories