New index level name after DataFrame.stack() - python

(Note that this SO question is similar-looking but different.)
I have a MultiIndexed DataFrame with columns representing yearly data:
>>> x = pd.DataFrame({
...     'country': {0: 4.0, 1: 8.0, 2: 12.0},
...     'series': {0: 553.0, 1: 553.0, 2: 553.0},
...     '2000': {0: '1100', 1: '28', 2: '120'},
...     '2005': {0: '730', 1: '24', 2: '100'}
... }).set_index(['country', 'series'])
>>> x
                2000 2005
country series
4       553     1100  730
8       553       28   24
12      553      120  100
When I stack the years, the new index level has no name:
>>> x.stack()
country  series
4        553     2000    1100
                 2005     730
8        553     2000      28
                 2005      24
12       553     2000     120
                 2005     100
dtype: object
Is there a nice way to tell stack that I'd like the new level to be called 'year'? The docs don't mention this.
I can always do:
>>> x.columns.name = 'year'
>>> x.stack()
But, to my mind, this doesn't qualify as very 'nice'. Can anyone do it in one line?

There is a chaining-friendly way to do it in one line (although admittedly not much nicer) using DataFrame.rename_axis:
x.rename_axis('year', axis=1).stack()
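For reference, with the sample frame above this produces the named level (a quick check; exact display may vary by pandas version):
>>> x.rename_axis('year', axis=1).stack()
country  series  year
4        553     2000    1100
                 2005     730
8        553     2000      28
                 2005      24
12       553     2000     120
                 2005     100
dtype: object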

Related

Group by, filter top rows while customising column aggregation in Python

I have a dataset of national sales and I'd like to:
group by State, Store and YearMonth, and
filter the top 10 stores (not shown in the expected output due to the small sample data table) by total Sales in each state (should this be done in a separate step?), while
aggregating the other columns in different ways: 'Sales': 'sum', 'Qty': 'sum', and 'Item': join the unique values.
I'm thinking about using a custom aggregation like df1 = df.astype(str).groupby(['State', 'Store', 'YearMonth']).agg(lambda x: ','.join(x.unique())), but can I do this together with the other aggregations at the same time?
df:
  YearMonth State  Store  Qty  Sales   Item
0   2020-06    AA  JW442  1.0    100  SP006
1   2020-06    AA  JW442  1.0   1200  SP007
2   2019-09    CC  JW600  4.0    700  SP020
3   2019-05    AA  JW100  5.0     30   SP00
4   2019-05    AA  JW100  4.0   8500    SP5
...
Expected output:
  State  Store YearMonth  Qty  Sales         Item
0    AA  JW100   2019-05  9.0   8530    SP00, SP5
         JW442   2020-06  2.0   1300  SP006,SP007
...
2    CC  JW600   2019-09  4.0    700        SP020
...
Question:
What's the best way to do this? For filtering the top 10 stores by Sales, should I do it in a separate step? I learnt about nlargest; is it appropriate to use here?
Reproducible example:
import pandas as pd
from pandas import Period

df = pd.DataFrame({
    'YearMonth': [Period('2020-07', 'M'), Period('2020-06', 'M'),
                  Period('2019-09', 'M'), Period('2019-03', 'M'),
                  Period('2019-05', 'M'), Period('2019-01', 'M'),
                  Period('2019-03', 'M'), Period('2019-05', 'M'),
                  Period('2019-05', 'M'), Period('2019-05', 'M')],
    'State': ['QLD', 'AA', 'AA', 'CC', 'AA', 'SA', 'AA', 'CC', 'AA', 'CC'],
    'Store': ['HJR411-140', 'JW442', 'JW442', 'JW600', 'JW600',
              'JW442', 'JW600', 'JW100', 'JW100', 'JW100'],
    'Qty': [1.0, 1.0, 4.0, 0.0, 4.0, 1.0, 1.0, 22.0, 1.0, 1.0],
    'Sales': [118.17000000000002, 49.1075, 725.4, 0.0, 785.85, 457.145,
              619.814, 1542.97, 266.5, 159.95200000000003],
    'Item': ['SP006', 'SP007', 'SP007', 'SP020', 'SP020',
             'SP5', 'SP5', 'SP007', 'SP00', 'SP00'],
})
Edit 1:
Tried:
# Get the largest stores by total sales
stores = df.groupby('Store')['Sales'].sum().nlargest(10).index
df = (
    df[df['Store'].isin(stores)]               # filter to include only the largest stores
    .groupby(['Store', 'State', 'YearMonth'])  # group by
    .agg({'Qty': 'sum',                        # aggregate columns
          'Sales': 'sum',
          'Item': lambda i: ','.join(i.unique())})
).reset_index()
and got this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-157-6008e53ec50d> in <module>
22
23 df1= (
---> 24 df[df['Store'].isin(stores)] # Filter to include only largest stores
25 .groupby(['State', 'Store', 'YearMonth']) # Groupby
26 .agg({'Qty': 'sum', # Agg Columns
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\groupby\generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
943 )
944
--> 945 result, how = self._aggregate(func, *args, **kwargs)
946 if how is None:
947 return result
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\base.py in _aggregate(self, arg, *args, **kwargs)
414
415 try:
--> 416 result = _agg(arg, _agg_1dim)
417 except SpecificationError:
418
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\base.py in _agg(arg, func)
381 result = {}
382 for fname, agg_how in arg.items():
--> 383 result[fname] = func(fname, agg_how)
384 return result
385
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\base.py in _agg_1dim(name, how, subset)
365 "nested dictionary is ambiguous in aggregation"
366 )
--> 367 return colg.aggregate(how)
368
369 def _agg_2dim(how):
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\groupby\generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
253
254 if self.grouper.nkeys > 1:
--> 255 return self._python_agg_general(
256 func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
257 )
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\groupby\groupby.py in _python_agg_general(self, func, engine, engine_kwargs, *args, **kwargs)
1088
1089 if len(output) == 0:
-> 1090 return self._python_apply_general(f, self._selected_obj)
1091
1092 if self.grouper._filter_empty_groups:
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f, data)
890 data after applying f
891 """
--> 892 keys, values, mutated = self.grouper.apply(f, data, self.axis)
893
894 return self._wrap_applied_output(
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
211 # group might be modified
212 group_axes = group.axes
--> 213 res = f(group)
214 if not _is_indexed_like(res, group_axes):
215 mutated = True
~\AppData\Roaming\Python\Python38\site-packages\pandas\core\groupby\groupby.py in <lambda>(x)
1056 func = self._is_builtin_func(func)
1057 if engine != "numba":
-> 1058 f = lambda x: func(x, *args, **kwargs)
1059
1060 # iterate through "columns" ex exclusions to populate output dict
<ipython-input-157-6008e53ec50d> in <lambda>(i)
26 .agg({'Qty': 'sum', # Agg Columns
27 'Sales': 'sum',
---> 28 'Item': lambda i: ','.join(i.unique())})
29 ).reset_index()
30
TypeError: sequence item 5: expected str instance, NoneType found
Edit 2:
If I add .astype(str) before groupby like:
# Get the largest stores by total sales
stores = df.groupby('Store')['Sales'].sum().nlargest(10).index
df = (
    df[df['Store'].isin(stores)]                           # filter to include only the largest stores
    .astype(str).groupby(['Store', 'State', 'YearMonth'])  # group by
    .agg({'Qty': 'sum',                                    # aggregate columns
          'Sales': 'sum',
          'Item': lambda i: ','.join(i.unique())})
).reset_index()
it runs, but Qty and Sales are concatenated as strings instead of being summed.
I created df from the data you provided.
Input df:
  YearMonth State       Store   Qty      Sales   Item
0   2020-07   QLD  HJR411-140   1.0   118.1700  SP006
1   2020-06    AA       JW442   1.0    49.1075  SP007
2   2019-09    AA       JW442   4.0   725.4000  SP007
3   2019-03    CC       JW600   0.0     0.0000  SP020
4   2019-05    AA       JW600   4.0   785.8500  SP020
5   2019-01    SA       JW442   1.0   457.1450    SP5
6   2019-03    AA       JW600   1.0   619.8140    SP5
7   2019-05    CC       JW100  22.0  1542.9700  SP007
8   2019-05    AA       JW100   1.0   266.5000   SP00
9   2019-05    CC       JW100   1.0   159.9520   SP00
Because of the small sample data I tested with the top 2 stores by sales in each state; replace the variable with top_n = 10 to get the top 10 stores by sales for each state.
top_n = 2
grouped = df.groupby(['State', 'Store', 'YearMonth'])
df = grouped.agg({'Sales': 'sum', 'Qty': 'sum', 'Item': list})
# within each (State, Store), keep the top_n rows by Sales
df = df.groupby(['State', 'Store'], group_keys=False).apply(lambda x: x.nlargest(top_n, 'Sales'))
df
Output:
                               Sales   Qty           Item
State Store      YearMonth
AA    JW100      2019-05    266.5000   1.0         [SP00]
      JW442      2019-09    725.4000   4.0        [SP007]
                 2020-06     49.1075   1.0        [SP007]
      JW600      2019-05    785.8500   4.0        [SP020]
                 2019-03    619.8140   1.0          [SP5]
CC    JW100      2019-05   1702.9220  23.0  [SP007, SP00]
      JW600      2019-03      0.0000   0.0        [SP020]
QLD   HJR411-140 2020-07    118.1700   1.0        [SP006]
SA    JW442      2019-01    457.1450   1.0          [SP5]
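On the TypeError from Edit 1: the message "expected str instance, NoneType found" suggests the Item column contains missing values, which ','.join cannot handle. A sketch (not from the original answers; the names totals, keep, mask and out are illustrative) that keeps Qty and Sales numeric, joins only the non-null unique items, and filters the top stores per state rather than overall:

top_n = 10

# total Sales per (State, Store)
totals = df.groupby(['State', 'Store'])['Sales'].sum()

# keep the top_n stores within each state
keep = (totals.groupby(level='State', group_keys=False)
              .apply(lambda s: s.nlargest(top_n)))

# mask rows whose (State, Store) pair is among the kept stores
mask = df.set_index(['State', 'Store']).index.isin(keep.index)

out = (
    df[mask]
    .groupby(['State', 'Store', 'YearMonth'])
    .agg({'Qty': 'sum',
          'Sales': 'sum',
          # dropna() guards against the None values that broke ','.join
          'Item': lambda i: ','.join(i.dropna().astype(str).unique())})
    .reset_index()
)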

Pandas pivot or reshape dataframe with NaN

I have this dataframe that I need to pivot or reshape based on the frame column:
import pandas as pd
from numpy import nan

df = pd.DataFrame({'frame': {0: 0, 1: 1, 2: 2, 3: 0, 4: 1, 5: 2},
                   'pvol': {0: nan, 1: nan, 2: nan, 3: 23.1, 4: 24.3, 5: 25.6},
                   'vvol': {0: 109.8, 1: 140.5, 2: 160.4, 3: nan, 4: nan, 5: nan},
                   'area': {0: 120, 1: 130, 2: 140, 3: 110, 4: 110, 5: 112},
                   'label': {0: 'v', 1: 'v', 2: 'v', 3: 'p', 4: 'p', 5: 'p'}})
Current dataframe
frame  pvol  vvol   area  label
0      NaN   109.8  120   v
1      NaN   140.5  130   v
2      NaN   160.4  140   v
0      23.1  NaN    110   p
1      24.3  NaN    110   p
2      25.6  NaN    112   p
Expected output
frame  pvol  vvol   v_area  p_area
0      23.1  109.8  110     110
1      24.3  140.5  110     110
2      25.6  160.4  112     112
The prefixes v and p aren't necessary; I just need a way to tell the columns apart.
This is how I got it to work, but it seems long. I'm sure there is a better way:
for name, tdf in df.groupby('label'):
    df.loc[tdf.index, '{}_area'.format(name)] = tdf['area']
pdf = df[df['label'].eq('p')][['frame', 'label', 'pvol', 'p_area']]
vdf = df[df['label'].eq('v')][['frame', 'vvol', 'v_area']]
df = pdf.merge(vdf, on='frame', how='outer')
Let's try pivot and dropna:
out = df.pivot(index='frame', columns='label').dropna(axis=1)
out.columns = [f'{y}_{x}' for x,y in out.columns]
Output:
       p_pvol  v_vvol  p_area  v_area
frame
0        23.1   109.8     110     120
1        24.3   140.5     110     130
2        25.6   160.4     112     140
Let us try
s = df.set_index(['frame','label']).unstack().dropna(axis=1)
s.columns=s.columns.map('_'.join)
s
Out[102]:
       pvol_p  vvol_v  area_p  area_v
frame
0        23.1   109.8     110     120
1        24.3   140.5     110     130
2        25.6   160.4     112     140
I feel stupid for posting this after the gems dropped by @BEN_YO and @Quang_Hoang...
df.set_index('label', inplace=True)
d1 = df.loc['v', ['frame', 'vvol', 'area']].rename(columns={'area':'v_area'})
d2 = df.loc['p', ['frame', 'pvol', 'area']].rename(columns={'area':'p_area'})
pd.merge(d1, d2, on='frame')
   frame   vvol  v_area  pvol  p_area
0      0  109.8     120  23.1     110
1      1  140.5     130  24.3     110
2      2  160.4     140  25.6     112

How to do conditional operations on columns in Python pandas?

I'm trying to write code that calculates the variation of "prod" ("rgdpna"/"emp") relative to one specific year, from an Excel file that contains data for several countries, and I need to do it for all of them.
(country, year, rgdpna and emp are the columns from the Excel data.)
Country  year  rgdpna  emp  "prod" (rgdpna/emp)  "prodvar"
Brazil   1980     100   12  8.3                  (8.3/8.3)   = 1
Brazil   1981     120   12  10                   (10/8.3)    = 1.2
Brazil   1982     140   15  9.3                  (9.3/8.3)   = 1.1
...
Canada   1980     300   11  27.2                 (27.2/27.2) = 1
Canada   1981     327   10  32.7                 (32.7/27.2) = 1.2
Canada   1982     500   12  41.6                 (41.6/27.2) = 1.5
...
Something like this: "prodvar" = ("prod" when "year" >= 1980) divided by ("prod" when "year" == 1980).
I think I need to use a "while" loop, but I don't know how. So far I have:
df["prod"] = df["rgdpna"].div(df["emp"])
With pandas, avoid for and while loops wherever possible.
Try this:
df['prod'] = df.apply(lambda x: x['prod']/df['prod'].loc[(df['year']==1980)&(df['country']==x['country'])].values[0], axis=1)
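A vectorized alternative (my sketch, not part of the original answer; base and prodvar are illustrative names): look up each country's 1980 value once and broadcast it back with map, avoiding the row-wise apply:

# baseline prod per country in 1980
base = df[df['year'] == 1980].set_index('country')['prod']
# divide each row's prod by its country's 1980 baseline
df['prodvar'] = df['prod'] / df['country'].map(base)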
First of all, let's get your data into a complete, minimal example. For that we don't need the intermediate columns, so let's keep only the relevant column, and call it 'value' for clarity's sake:
data_dict = {'country': {0: 'Brazil', 1: 'Brazil', 2: 'Brazil',
                         3: 'Canada', 4: 'Canada', 5: 'Canada'},
             'value': {0: 8.3, 1: 10, 2: 9.3, 3: 27.2, 4: 32.7, 5: 41.6},
             'year': {0: 1980.0, 1: 1981.0, 2: 1982.0,
                      3: 1980.0, 4: 1981.0, 5: 1982.0}}
df = pd.DataFrame(data_dict)
(I'm also using clear column names in the rest of this answer, even if they're long.)
Secondly, we create an intermediate column that just holds the value where year is 1980:
df['value_1980'] = df.apply(lambda row: df.set_index(['year','country']).loc[1980]['value'][row['country']], axis=1)
Finally, we just divide the two, as in your example:
df['value_relative_to_1980'] = df['value'] / df['value_1980']
Check the result:
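With the sample data, the result should look like this (ratios computed from the values above; pandas' default float display assumed):

>>> df[['country', 'year', 'value', 'value_relative_to_1980']]
  country    year  value  value_relative_to_1980
0  Brazil  1980.0    8.3                1.000000
1  Brazil  1981.0   10.0                1.204819
2  Brazil  1982.0    9.3                1.120482
3  Canada  1980.0   27.2                1.000000
4  Canada  1981.0   32.7                1.202206
5  Canada  1982.0   41.6                1.529412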

Pandas slice string with index from another column

I want to slice this string using indexes from another column. I'm getting NaN instead of slices of the string.
import pandas as pd
from pandas import DataFrame, Series

sales = {'name': ['MSFTCA', 'GTX', 'MSFTUSA'],
         'n_chars': [2, 2, 3],
         'Jan': [150, 200, 50],
         'Feb': [200, 210, 90],
         'Mar': [140, 215, 95]}
df = pd.DataFrame.from_dict(sales)
df
def extract_location(name, n_chars):
    return name.str[-n_chars:]

df.assign(location=(lambda x: extract_location(x['name'], x['n_chars']))).to_dict()
Gives:
{'Feb': {0: 200, 1: 210, 2: 90},
'Jan': {0: 150, 1: 200, 2: 50},
'Mar': {0: 140, 1: 215, 2: 95},
'location': {0: nan, 1: nan, 2: nan},
'n_chars': {0: 2, 1: 2, 2: 3},
'name': {0: 'MSFTCA', 1: 'GTX', 2: 'MSFTUSA'}}
You need apply with axis=1 for processing by rows; the .str accessor can't take a different slice bound per row, which is why location came out all NaN:
def extract_location(name, n_chars):
    return name[-n_chars:]

df = df.assign(location=df.apply(lambda x: extract_location(x['name'], x['n_chars']), axis=1))
print(df)
   Feb  Jan  Mar  n_chars     name location
0  200  150  140        2   MSFTCA       CA
1  210  200  215        2      GTX       TX
2   90   50   95        3  MSFTUSA      USA
df = df.assign(location=df.apply(lambda x: x['name'][-x['n_chars']:], axis=1))
print(df)

   Feb  Jan  Mar  n_chars     name location
0  200  150  140        2   MSFTCA       CA
1  210  200  215        2      GTX       TX
2   90   50   95        3  MSFTUSA      USA
Using a comprehension
df.assign(location=[name[-n:] for n, name in zip(df.n_chars, df.name)])
   Feb  Jan  Mar  n_chars     name location
0  200  150  140        2   MSFTCA       CA
1  210  200  215        2      GTX       TX
2   90   50   95        3  MSFTUSA      USA
You can speed it up a bit by iterating over the underlying numpy arrays, which skips pandas' per-element overhead:
df.assign(location=[name[-n:] for n, name in zip(df.n_chars.values, df.name.values)])

How to calculate percentages for particular columns using Python pandas?

student,total,m1,m2,m3
a,500,120,220,160
b,600,180,120,200
This is my dataframe, and I just want to calculate the m1, m2, m3 columns as percentages of the total column. I need output like the following dataframe:
student,total,m1,m2,m3,m1(%),m2(%),m3(%)
a,500,120,220,160,24,44,32
...
For example, the m1(%) column is calculated as (m1/total)*100.
I think you can use div:
df = pd.DataFrame({'total': {0: 500, 1: 600},
                   'm1': {0: 120, 1: 180},
                   'm3': {0: 160, 1: 200},
                   'student': {0: 'a', 1: 'b'},
                   'm2': {0: 220, 1: 120}},
                  columns=['student', 'total', 'm1', 'm2', 'm3'])
print(df)
  student  total   m1   m2   m3
0       a    500  120  220  160
1       b    600  180  120  200
df[['m1(%)','m2(%)','m3(%)']] = df[['m1','m2','m3']].div(df.total, axis=0)*100
print(df)
  student  total   m1   m2   m3  m1(%)  m2(%)      m3(%)
0       a    500  120  220  160   24.0   44.0  32.000000
1       b    600  180  120  200   30.0   20.0  33.333333
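If you want whole-number percentages, as in the expected output, a small sketch on top of the same div approach:

# round to whole percentages and cast to int
df[['m1(%)', 'm2(%)', 'm3(%)']] = (df[['m1', 'm2', 'm3']]
                                   .div(df.total, axis=0)
                                   .mul(100)
                                   .round()
                                   .astype(int))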
