How to flatten a dataframe by a column containing ranges - python

Input dataframe:
df = pd.DataFrame(columns=['id', 'antibiotic', 'start_date', 'end_date'],
                  data=[['Sophie', 'amoxicillin', 15, 17],
                        ['Sophie', 'doxycycline', 19, 21],
                        ['Sophie', 'amoxicillin', 20, 22],
                        ['Robert', 'cephalexin', 12, 14],
                        ['Robert', 'ciprofloxacin', 17, 18],
                        ['Robert', 'clindamycin', 18, 18],
                        ['Robert', 'cephalexin', 17, 19]])
I would like to flatten out/expand the dates, and also join ('/') the antibiotic fields when they coincide on the same date, like below:
df_flat = pd.DataFrame(columns=['id', 'date', 'antibiotic'],
                       data=[['Sophie', 15, 'amoxicillin'],
                             ['Sophie', 16, 'amoxicillin'],
                             ['Sophie', 17, 'amoxicillin'],
                             ['Sophie', 18, NaN],
                             ['Sophie', 19, 'doxycycline'],
                             ['Sophie', 20, 'doxycycline/amoxicillin'],
                             ['Sophie', 21, 'doxycycline/amoxicillin'],
                             ['Sophie', 22, 'amoxicillin'],
                             ['Robert', 12, 'cephalexin'],
                             ['Robert', 13, 'cephalexin'],
                             ['Robert', 14, 'cephalexin'],
                             ['Robert', 15, NaN],
                             ['Robert', 16, NaN],
                             ['Robert', 17, 'ciprofloxacin/cephalexin'],
                             ['Robert', 18, 'ciprofloxacin/clindamycin/cephalexin'],
                             ['Robert', 19, 'cephalexin']])
What I'm trying...
# get minmax of dates
minmax = df.groupby('id').agg({'start_date': min, 'end_date': max})
# create multiindex manually with mins and maxes
multi_index = []
for i, row in minmax.iterrows():
    for d in range(row.start_date, row.end_date + 1):
        tup = (i, d)
        multi_index.append(tup)
# create output dataframe with this multiindex
df_flat = pd.DataFrame(index=pd.MultiIndex.from_tuples(multi_index),
                       columns=['date', 'antibiotics'])
# and then fill up df_flat from the original dataframe by matching the index
# of df_flat with the date values of the original df, in a for loop :)
for i, row in df.iterrows():
    for tup in multi_index:
        if tup[1] >= row.start_date and ...
.
.
.
but this seems inefficient and inelegant. I'm sure something smarter can be done.

One option is to generate a range for each row, explode to create one row per date, then aggregate per id/date:
(df.assign(date=lambda d: d.apply(lambda r: range(r['start_date'], r['end_date']+1), axis=1))
   .explode('date')
   .groupby(['id', 'date'], dropna=False)['antibiotic'].agg('/'.join)
   .reset_index()
)
output:
id date antibiotic
0 Robert 12 cephalexin
1 Robert 13 cephalexin
2 Robert 14 cephalexin
3 Robert 17 ciprofloxacin/cephalexin
4 Robert 18 ciprofloxacin/clindamycin/cephalexin
5 Robert 19 cephalexin
6 Sophie 15 amoxicillin
7 Sophie 16 amoxicillin
8 Sophie 17 amoxicillin
9 Sophie 19 doxycycline
10 Sophie 20 doxycycline/amoxicillin
11 Sophie 21 doxycycline/amoxicillin
12 Sophie 22 amoxicillin
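If you want to inspect the intermediate step: right after explode there is one row per prescribed day, and the exploded date column has object dtype (a range explodes into Python ints), so cast it if you need numeric ordering. A minimal sketch:
exploded = (df.assign(date=lambda d: d.apply(lambda r: range(r['start_date'], r['end_date']+1), axis=1))
              .explode('date')
              .astype({'date': int}))  # explode leaves 'date' as object dtype
print(exploded[['id', 'antibiotic', 'date']].head())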
keeping the NaNs:
(df.assign(date=lambda d: d.apply(lambda r: range(r['start_date'], r['end_date']+1), axis=1))
   .explode('date')
   .groupby(['id', 'date'], dropna=False)['antibiotic'].agg('/'.join)
   .reset_index(level=0)
   .groupby('id')['antibiotic']
   .apply(lambda g: g.reindex(range(g.index.min(), g.index.max()+1)))
   .reset_index()
)
output:
id date antibiotic
0 Robert 12 cephalexin
1 Robert 13 cephalexin
2 Robert 14 cephalexin
3 Robert 15 NaN
4 Robert 16 NaN
5 Robert 17 ciprofloxacin/cephalexin
6 Robert 18 ciprofloxacin/clindamycin/cephalexin
7 Robert 19 cephalexin
8 Sophie 15 amoxicillin
9 Sophie 16 amoxicillin
10 Sophie 17 amoxicillin
11 Sophie 18 NaN
12 Sophie 19 doxycycline
13 Sophie 20 doxycycline/amoxicillin
14 Sophie 21 doxycycline/amoxicillin
15 Sophie 22 amoxicillin
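The per-group reindex is what inserts the NaN rows: each patient's date-indexed series is reindexed over its own min-to-max range, so the missing days appear as NaN. A tiny illustration of the idea, with made-up values:
s = pd.Series({15: 'a', 17: 'b'})
print(s.reindex(range(15, 18)))  # 16 shows up as NaN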
alternative to have all days for all patients:
(df.assign(date=lambda d: d.apply(lambda r: range(r['start_date'], r['end_date']+1), axis=1))
   .explode('date')
   .groupby(['id', 'date'], dropna=False)['antibiotic'].agg('/'.join)
   .unstack('date')
   .stack('date', dropna=False).rename('antibiotic')
   .reset_index()
)
output:
id date antibiotic
0 Robert 12 cephalexin
1 Robert 13 cephalexin
2 Robert 14 cephalexin
3 Robert 15 NaN
4 Robert 16 NaN
5 Robert 17 ciprofloxacin/cephalexin
6 Robert 18 ciprofloxacin/clindamycin/cephalexin
7 Robert 19 cephalexin
8 Robert 20 NaN
9 Robert 21 NaN
10 Robert 22 NaN
11 Sophie 12 NaN
12 Sophie 13 NaN
13 Sophie 14 NaN
14 Sophie 15 amoxicillin
15 Sophie 16 amoxicillin
16 Sophie 17 amoxicillin
17 Sophie 18 NaN
18 Sophie 19 doxycycline
19 Sophie 20 doxycycline/amoxicillin
20 Sophie 21 doxycycline/amoxicillin
21 Sophie 22 amoxicillin

First, repeat each row with Index.repeat as many times as the difference between its end and start dates; then build the date by adding a group-wise counter to the start_date column; aggregate with join; and finally add the missing ranges:
df = df.loc[df.index.repeat(df['end_date'].sub(df['start_date']).add(1))].copy()
df['date'] = df['start_date'].add(df.groupby(level=0).cumcount())
df = (df.groupby(['id', 'date'], sort=False)['antibiotic'].agg('/'.join)
        .reset_index(level=0)
        .groupby('id', sort=False)['antibiotic']
        .apply(lambda x: x.reindex(range(x.index.min(), x.index.max()+1)))
        .reset_index()
      )
print(df)
id date antibiotic
0 Sophie 15 amoxicillin
1 Sophie 16 amoxicillin
2 Sophie 17 amoxicillin
3 Sophie 18 NaN
4 Sophie 19 doxycycline
5 Sophie 20 doxycycline/amoxicillin
6 Sophie 21 doxycycline/amoxicillin
7 Sophie 22 amoxicillin
8 Robert 12 cephalexin
9 Robert 13 cephalexin
10 Robert 14 cephalexin
11 Robert 15 NaN
12 Robert 16 NaN
13 Robert 17 ciprofloxacin/cephalexin
14 Robert 18 ciprofloxacin/clindamycin/cephalexin
15 Robert 19 cephalexin
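To see what the first two lines do: Index.repeat duplicates each row once per covered day, and the level-0 cumcount numbers those duplicates 0, 1, 2, ..., so adding it to start_date enumerates the days. A sketch on the original sample df:
rep = df.loc[df.index.repeat(df['end_date'] - df['start_date'] + 1)]
print(rep.assign(offset=rep.groupby(level=0).cumcount()).head())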

You could also cross-merge against your own data, then subset back to the matching dates before grouping and joining.
df = df.merge(
    pd.RangeIndex(df['start_date'].min(), df['end_date'].max() + 1).to_series().rename('date'),
    how='cross')
df = df[df['date'] <= df['end_date']]
df = df[df['date'] >= df['start_date']]
# df.sort_values(by=['id', 'date'], inplace=True)
final = df.groupby(['id', 'date'])['antibiotic'] \
          .agg('/'.join) \
          .reset_index()
Producing:
id date antibiotic
0 Robert 12 cephalexin
1 Robert 13 cephalexin
2 Robert 14 cephalexin
3 Robert 17 ciprofloxacin/cephalexin
4 Robert 18 cephalexin/ciprofloxacin/clindamycin
5 Robert 19 cephalexin
6 Sophie 15 amoxicillin
7 Sophie 16 amoxicillin
8 Sophie 17 amoxicillin
9 Sophie 19 doxycycline
10 Sophie 20 doxycycline/amoxicillin
11 Sophie 21 doxycycline/amoxicillin
12 Sophie 22 amoxicillin
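One caveat: the cross merge materializes len(df) × number-of-dates rows before filtering, so the explode approaches above scale better for long date ranges. A quick size check on the original df (a sketch):
n_dates = df['end_date'].max() - df['start_date'].min() + 1
print(len(df) * n_dates)  # rows created by the cross merge before filtering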

Use:
def comb(row):
    out = ''
    for part in row:
        out += part
        out += '/'
    return out[:-1]

df['temp'] = list(zip(df.start_date, df.end_date))
df.explode('temp').groupby(['id', 'temp'])['antibiotic'].apply(comb)
First you define the joining helper, then combine the two date columns into a tuple, explode on the new column, and group by id and date using the custom function.
Result
id      temp
Robert  12                               cephalexin
        14                               cephalexin
        17                 ciprofloxacin/cephalexin
        18    ciprofloxacin/clindamycin/clindamycin
        19                               cephalexin
Sophie  15                              amoxicillin
        17                              amoxicillin
        19                              doxycycline
        20                              amoxicillin
        21                              doxycycline
        22                              amoxicillin
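As an aside, note that this explodes only the start and end dates (the zipped tuple), not the days in between, which is why intermediate days such as Robert's 13 are absent. Also, the custom comb function is equivalent to joining with '/', so the apply can be replaced by the built-in aggregation used in the other answers (a sketch):
df.explode('temp').groupby(['id', 'temp'])['antibiotic'].agg('/'.join)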

def function1(dd: pd.DataFrame):
    dd1 = dd.assign(date=pd.to_datetime(dd.date, format='%d')).set_index('date').asfreq(freq='d')
    dd1.index = dd1.index.strftime('%d')
    return dd1

df.apply(lambda ss: range(ss.start_date, ss.end_date+1), axis=1).explode()\
  .to_frame('date').join(df).groupby(['id', 'date'])['antibiotic'].agg('/'.join)\
  .reset_index(level=1).groupby(level=0).apply(function1)\
  .reset_index()
id date antibiotic
0 Robert 12 cephalexin
1 Robert 13 cephalexin
2 Robert 14 cephalexin
3 Robert 15 NaN
4 Robert 16 NaN
5 Robert 17 ciprofloxacin/cephalexin
6 Robert 18 ciprofloxacin/clindamycin/cephalexin
7 Robert 19 cephalexin
8 Sophie 15 amoxicillin
9 Sophie 16 amoxicillin
10 Sophie 17 amoxicillin
11 Sophie 18 NaN
12 Sophie 19 doxycycline
13 Sophie 20 doxycycline/amoxicillin
14 Sophie 21 doxycycline/amoxicillin
15 Sophie 22 amoxicillin


combining specific row conditionally and add output to existing row in pandas

Suppose I have the following data frame:
data = {'age' : [10,11,12,11,11,10,11,13,13,13,14,14,15,15,15],
        'num1': [10,11,12,13,14,15,16,17,18,19,20,21,22,23,24],
        'num2': [20,21,22,23,24,25,26,27,28,29,30,31,32,33,34]}
df = pd.DataFrame(data)
I want to sum the rows for ages 14 and 15 and keep those new values as age 14. My expected output would be like this:
age num1 num2
1 10 10 20
2 11 11 21
3 12 12 22
4 11 13 23
5 11 14 24
6 10 15 25
7 11 16 26
8 13 17 27
9 13 18 28
10 13 19 29
11 14 110 160
In the code below, I have tried to groupby age, but it does not work for me:
df1 =df.groupby(age[age >=14])['num1', 'num2'].apply(', '.join).reset_index(drop=True).to_frame()
limit_age = 14
new = df.query("age < @limit_age").copy()
new.loc[len(new)] = [limit_age,
                     *df.query("age >= @limit_age").drop(columns="age").sum()]
First get the "before 14" dataframe, then assign it to a new row where:
- age is 14
- the other values are the column sums over the "age >= 14" rows
to get:
>>> new
age num1 num2
0 10 10 20
1 11 11 21
2 12 12 22
3 11 13 23
4 11 14 24
5 10 15 25
6 11 16 26
7 13 17 27
8 13 18 28
9 13 19 29
10 14 110 160
(new.index += 1 can be used for a 1-based index at the end.)
I would use a mask and concat:
m = df['age'].isin([14, 15])
out = pd.concat([df[~m],
                 df[m].agg({'age': 'min', 'num1': 'sum', 'num2': 'sum'})
                      .to_frame().T
                 ], ignore_index=True)
Output:
age num1 num2
0 10 10 20
1 11 11 21
2 12 12 22
3 11 13 23
4 11 14 24
5 10 15 25
6 11 16 26
7 13 17 27
8 13 18 28
9 13 19 29
10 14 110 160

I am trying to generate Choropleth Maps, nothing shows on screen

I am trying to generate a choropleth map of India with some data. Everything seems to work fine: I cleaned the data and made a proper pandas DataFrame, but somehow the map is still blank. The geojson _id matches the dataframe, yet it does not resolve.
import plotly.express as px
import pandas as pd
import json
import numpy as np
india_map = json.load(open('INDIA_STATES.json','r'))
df = pd.read_excel("data.xlsx")
df_s = pd.DataFrame({'Stats':df['Unnamed: 7'][3:38]},)
#print(df_s)
df = pd.concat([df['Unnamed: 1'][3:38],df_s],axis=1,ignore_index=True)
df.reset_index(drop=True,inplace=True)
df = df.rename(columns={df.columns[0]:"State",df.columns[1]:"Stat"})
#print(df)
state_id_json = []
state_name_json = []
for st in india_map['features']:
    state_name_json.append(st['properties']['STATE'])
    state_id_json.append(st['_id'])
state_name_json = list(map(lambda x:x.upper(),state_name_json))
df['State'] = list(map(lambda x:x.upper(),df['State']))
state_id = pd.DataFrame({"State":state_name_json,"_id":state_id_json}).sort_values(by=['State'])
#print(state_id,df)
df = pd.merge(df,state_id,on='State')
print(df)
fig = px.choropleth(df, geojson=india_map, locations='_id', color='Stat', scope='asia')
fig.show()
I get this in the terminal:
State Stat _id
0 ANDAMAN AND NICOBAR ISLANDS 356.0 8
1 ANDHRA PRADESH 76210.0 9
2 ARUNACHAL PRADESH 1098.0 10
3 ASSAM 26656.0 11
4 BIHAR 82999.0 row_112
5 CHANDIGARH 901.0 35
6 CHANDIGARH 901.0 13
7 CHHATTISGARH 20834.0 row_251
8 DAMAN AND DIU 158.0 34
9 DELHI 13851.0 16
10 GOA 1348.0 17
11 GUJARAT 50671.0 18
12 HARYANA 21145.0 19
13 HIMACHAL PRADESH 6078.0 20
14 JAMMU AND KASHMIR 10144.0 7
15 JHARKHAND 26946.0 row_267
16 KARNATAKA 52851.0 21
17 KERALA 31841.0 22
18 LAKSHADWEEP 61.0 23
19 MADHYA PRADESH 60348.0 row_250
20 MAHARASHTRA 96879.0 25
21 MANIPUR 2294.0 26
22 MEGHALAYA 2319.0 27
23 MIZORAM 889.0 28
24 NAGALAND 1990.0 29
25 ODISHA 36805.0 30
26 PUDUCHERRY 974.0 37
27 PUNJAB 24359.0 1
28 RAJASTHAN 56507.0 2
29 SIKKIM 541.0 3
30 TAMIL NADU 62406.0 4
31 TRIPURA 3199.0 5
32 UTTAR PRADESH 166198.0 31
33 WEST BENGAL 80176.0 32
Plotly Result:
Empty Map with no data
Since no geojson was provided, I retrieved one from the web and built the code around the data you posted, so I can't be certain, but the problem is most likely a missing featureidkey setting. In my example I associate the state name with NAME_1 in the geojson.
Note:
Some states are missing because the state names in the user data do not match NAME_1 in the geojson. Please adapt them to your data.
from urllib.request import urlopen
import json
import pandas as pd
import numpy as np
import io
import plotly.express as px
# https://github.com/Subhash9325/GeoJson-Data-of-Indian-States/blob/master/Indian_States
url = 'https://raw.githubusercontent.com/Subhash9325/GeoJson-Data-of-Indian-States/master/Indian_States'
with urlopen(url) as response:
    india_map = json.load(response)
data = '''
State Stat _id
0 "ANDAMAN AND NICOBAR ISLANDS" 356.0 8
1 "ANDHRA PRADESH" 76210.0 9
2 "ARUNACHAL PRADESH" 1098.0 10
3 ASSAM 26656.0 11
4 BIHAR 82999.0 row_112
5 CHANDIGARH 901.0 35
6 CHANDIGARH 901.0 13
7 CHHATTISGARH 20834.0 row_251
8 "DAMAN AND DIU" 158.0 34
9 DELHI 13851.0 16
10 GOA 1348.0 17
11 GUJARAT 50671.0 18
12 HARYANA 21145.0 19
13 "HIMACHAL PRADESH" 6078.0 20
14 "JAMMU AND KASHMIR" 10144.0 7
15 JHARKHAND 26946.0 row_267
16 KARNATAKA 52851.0 21
17 KERALA 31841.0 22
18 LAKSHADWEEP 61.0 23
19 "MADHYA PRADESH" 60348.0 row_250
20 MAHARASHTRA 96879.0 25
21 MANIPUR 2294.0 26
22 MEGHALAYA 2319.0 27
23 MIZORAM 889.0 28
24 NAGALAND 1990.0 29
25 ODISHA 36805.0 30
26 PUDUCHERRY 974.0 37
27 PUNJAB 24359.0 1
28 RAJASTHAN 56507.0 2
29 SIKKIM 541.0 3
30 "TAMIL NADU" 62406.0 4
31 TRIPURA 3199.0 5
32 "UTTAR PRADESH" 166198.0 31
33 "WEST BENGAL" 80176.0 32
'''
df = pd.read_csv(io.StringIO(data), delim_whitespace=True)
df['State'] = df['State'].str.title()
df['State'] = df['State'].str.replace(' And ', ' and ')
df.loc[0, 'State'] = 'Andaman and Nicobar'
fig = px.choropleth(df,
                    geojson=india_map,
                    locations='State',
                    color='Stat',
                    featureidkey="properties.NAME_1",
                    scope='asia')
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(autosize=False,
                  width=1000,
                  height=600)
fig.show()
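To find out which states will silently fail to render, you can compare the dataframe against the geojson before plotting (a sketch, assuming the india_map and df variables above):
geo_names = {f['properties']['NAME_1'] for f in india_map['features']}
print(sorted(set(df['State']) - geo_names))  # rows that will not match any feature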

Pull specific values from one dataframe based on values in another

I have two dataframes
df1:
Country  value  Average  Week Rank
UK       42     42       1
US       9      9.5      2
DE       10     9.5      3
NL       15     15.5     4
ESP      16     15.5     5
POL      17     18       6
CY       18     18       7
IT       20     18       8
AU       17     18       9
FI       18     18       10
SW       20     18       11
df2:
Country  value  Average  Year Rank
US       42     42       1
UK       9      9.5      2
ESP      10     9.5      3
SW       15     15.5     4
IT       16     15.5     5
POL      17     18       6
NO       18     18       7
SL       20     18       8
PO       17     18       9
FI       18     18       10
NL       20     18       11
DE       17     18       12
AU       18     18       13
CY       20     18       14
I'm looking to create a column in df1 that shows the 'Year Rank' of the countries in df1, so that I have the following:
Country  value  Average  Week Rank  Year Rank
UK       42     42       1          2
US       9      9.5      2          1
DE       10     9.5      3          12
NL       15     15.5     4          11
ESP      16     15.5     5          3
POL      17     18       6          6
CY       18     18       7          14
IT       20     18       8          5
AU       17     18       9          13
FI       18     18       10         10
SW       20     18       11         4
How would I loop through the countries in df1 and find the corresponding rank in df2?
Edit: I am only looking for the yearly rank of the countries in df1
Thanks!
Use:
df1['Year Rank'] = df1.merge(df2, on='Country')['Year Rank']
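Note that this relies on df1 having a default RangeIndex and on every country of df1 being present in df2, since the merged result is assigned back by position. A lookup via map sidesteps the alignment pitfall and leaves NaN for unmatched countries (a sketch):
df1['Year Rank'] = df1['Country'].map(df2.set_index('Country')['Year Rank'])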

Create a new column in pandas dataframe based on multiple conditions

I have a dataframe like the one below, and I have to create a new column year_val that takes its values from col2016 through col2019 based on the Years column: year_val should equal the value of col#### where #### matches Years.
import pandas as pd

sampleDF = pd.DataFrame({'Years': [2016,2016,2017,2017,2018,2018,2019,2019],
                         'col2016': [1,2,3,4,5,6,7,8],
                         'col2017': [9,10,11,12,13,14,15,16],
                         'col2018': [17,18,19,20,21,22,23,24],
                         'col2019': [25,26,27,28,29,30,31,32]})
sampleDF['year_val'] = ?????
Use DataFrame.lookup, building the column labels by casting the Years column to string and prepending 'col':
sampleDF['year_val'] = sampleDF.lookup(sampleDF.index, 'col' + sampleDF['Years'].astype(str))
print(sampleDF)
Years col2016 col2017 col2018 col2019 year_val
0 2016 1 9 17 25 1
1 2016 2 10 18 26 2
2 2017 3 11 19 27 11
3 2017 4 12 20 28 12
4 2018 5 13 21 29 21
5 2018 6 14 22 30 22
6 2019 7 15 23 31 31
7 2019 8 16 24 32 32
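Note: DataFrame.lookup was deprecated in pandas 1.2 and removed in 2.0; on newer versions, a NumPy-based equivalent is (a sketch):
import numpy as np

cols = 'col' + sampleDF['Years'].astype(str)
col_idx = sampleDF.columns.get_indexer(cols)  # positional index of each row's target column
sampleDF['year_val'] = sampleDF.to_numpy()[np.arange(len(sampleDF)), col_idx]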
EDIT: If you check the definition of the lookup function:
result = [df.get_value(row, col) for row, col in zip(row_labels, col_labels)]
you can rewrite it with a try-except statement, using Series.at to avoid:
FutureWarning: get_value is deprecated and will be removed in a future release. Please use .at[] or .iat[] accessors instead
out.append(sampleDF.at[row, col])
sampleDF = pd.DataFrame({'Years': [2015,2016,2017,2017,2018,2018,2019,2019],
                         'col2016': [1,2,3,4,5,6,7,8],
                         'col2017': [9,10,11,12,13,14,15,16],
                         'col2018': [17,18,19,20,21,22,23,24],
                         'col2019': [25,26,27,28,29,30,31,32]})
print(sampleDF)
Years col2016 col2017 col2018 col2019
0 2015 1 9 17 25
1 2016 2 10 18 26
2 2017 3 11 19 27
3 2017 4 12 20 28
4 2018 5 13 21 29
5 2018 6 14 22 30
6 2019 7 15 23 31
7 2019 8 16 24 32
import numpy as np

out = []
for row, col in zip(sampleDF.index, 'col' + sampleDF['Years'].astype(str)):
    try:
        out.append(sampleDF.at[row, col])
    except KeyError:
        out.append(np.nan)
sampleDF['year_val'] = out
print(sampleDF)
Years col2016 col2017 col2018 col2019 year_val
0 2015 1 9 17 25 NaN
1 2016 2 10 18 26 2.0
2 2017 3 11 19 27 11.0
3 2017 4 12 20 28 12.0
4 2018 5 13 21 29 21.0
5 2018 6 14 22 30 22.0
6 2019 7 15 23 31 31.0
7 2019 8 16 24 32 32.0

Multi column filtering in python data frame

I have created a pandas dataframe. I want to filter to the rows that contain all of the values 9, 12, 24, 18.
df:
index no1 no2 no3 no4 no5 no6 no7
1 9 11 12 14 18 24 30
2 9 12 13 18 19 24 31
3 9 12 13 42 20 19 24
4 10 9 13 42 18 24 12
5 13 12 13 44 18 24 30
6 2 9 12 18 24 31 44
7 10 12 14 42 18 24 30
8 10 12 14 42 18 24 31
Code:
a = df['no1'].isin([9, 12, 18, 24])
b = df['no2'].isin([9, 12, 18, 24])
c = df['no3'].isin([9, 12, 18, 24])
d = df['no4'].isin([9, 12, 18, 24])
e = df['no5'].isin([9, 12, 18, 24])
f = df['no6'].isin([9, 12, 18, 24])
g = df['no7'].isin([9, 12, 18, 24])
df[a & b & c & d & e & f & g]
Desired output:
index no1 no2 no3 no4 no5 no6 no7
1 9 11 12 14 18 24 30
2 9 12 13 18 19 24 31
4 10 9 13 42 18 24 12
6 2 9 12 18 24 31 44
Try:
df[df.isin([9,12,18,24])]
This should give you the exact answer
df = pd.DataFrame({'no1': [9,9,9,10,13,2,10,10],
                   'no2': [11,12,12,9,12,9,12,12],
                   'no3': [12,13,13,13,13,12,14,14],
                   'no4': [14,18,42,42,44,18,42,42],
                   'no5': [18,19,20,18,18,24,18,18],
                   'no6': [24,24,19,24,24,31,24,24],
                   'no7': [30,31,24,12,30,44,30,31]})  # creating the data frame
df_new = df[df.isin([9, 12, 18, 24])]
df_new = df_new.dropna(thresh=4)
df_new = df_new.fillna(df)
The result would be:
no1 no2 no3 no4 no5 no6 no7
0 9.0 11.0 12.0 14.0 18.0 24.0 30.0
1 9.0 12.0 13.0 18.0 19.0 24.0 31.0
3 10.0 9.0 13.0 42.0 18.0 24.0 12.0
5 2.0 9.0 12.0 18.0 24.0 31.0 44.0
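For reference, dropna(thresh=4) keeps only rows with at least 4 non-NaN cells, i.e. rows where all four target values matched, and fillna(df) restores the original values in the remaining cells. Since the masking step introduces floats, the integer dtypes can be restored at the end (a sketch):
df_new = df_new.astype(int)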
