Multiple boxplots based on pandas groups - python

Here is what my dataframe looks like:
year item_id sales_quantity
2014 1 10
2014 1 4
... ... ...
2015 1 7
2015 1 10
... ... ...
2014 2 1
2014 2 8
... ... ...
2015 2 17
2015 2 30
... ... ...
2014 3 9
2014 3 18
... ... ...
For each item_id, I want to plot a boxplot showing the distribution for each year.
Here is what I tried:
# Load the sales data.
# NOTE(review): `DataFrame.from_csv` was removed in pandas 1.0; `pd.read_csv` is the replacement — confirm against the pandas version in use.
data = pd.DataFrame.from_csv('electronics.csv')
# Groups are keyed by year here, so each subplot below corresponds to one year, not one item_id.
grouped = data.groupby(['year'])
ncols=4
# Enough rows to give every group its own axes at `ncols` axes per row.
nrows = int(np.ceil(grouped.ngroups/ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(35,45),
sharey=False)
# Pair each group key with one axes; zip stops at the shorter of the two sequences.
for (key, ax) in zip(grouped.groups.keys(), axes.flatten()):
# This is the call that fails with "boxplot() got multiple values for argument 'x'".
# NOTE(review): `DataFrame.boxplot` selects data through `column=`/`by=`, not `x=`/`y=` — confirm against the pandas docs.
grouped.get_group(key).boxplot(x='year', y='sales_quantity',
ax=ax, label=key)
I get the error boxplot() got multiple values for argument 'x'. Can someone please tell me how to do this right?
If I have only a single item, then the following works
sns.boxplot(data.sales_quantity, groupby = data.year). How could I extend it for multiple items?
Link to csv

Please check the comments in the code.
import pandas as pd
import matplotlib.pyplot as plt
# Draw one subplot per item, each holding one box per year of sales_quantity.
df = pd.read_csv('electronics_157_3cols.csv')
print(df)

item_ids = df['item_id_copy'].unique()
# squeeze=False keeps `axes` two-dimensional even when there is only one item,
# so the row can always be iterated.
fig, axes = plt.subplots(1, len(item_ids), sharey=True, squeeze=False)
for ax, item_id in zip(axes[0], item_ids):
    # Keep only this item's rows, then spread sales_quantity into one column per year.
    idf = df.loc[df['item_id_copy'] == item_id, ['year', 'sales_quantity']].pivot(columns='year')
    print(idf)
    idf.plot.box(ax=ax)
    ax.set_title('ID {}'.format(item_id))
    # After the pivot each column label is a (value name, year) pair; show the year only.
    ax.set_xticklabels([year for _, year in idf.columns], rotation=45)
    ax.set_ylim(0, 1)  # You should disable this line to specify outlier properly. (but I didn't to show you a normal graph)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Draw up to ten items on a fixed 2x5 grid, one subplot per item with one box per year.
df = pd.read_csv('electronics_157_3cols.csv')
print(df)

fig, axes = plt.subplots(2, 5, sharey=True)
# `axes.flat` walks the grid row by row. zip stops when either the axes or the
# item ids run out, so fewer than ten items no longer raises StopIteration.
for ax, item_id in zip(axes.flat, df['item_id_copy'].unique()):
    # Keep only this item's rows, then spread sales_quantity into one column per year.
    idf = df.loc[df['item_id_copy'] == item_id, ['year', 'sales_quantity']].pivot(columns='year')
    print(idf)
    idf.plot.box(ax=ax)
    ax.set_title('ID {}'.format(item_id))
    # After the pivot each column label is a (value name, year) pair; show the year only.
    ax.set_xticklabels([year for _, year in idf.columns], rotation=0)
    ax.set_ylim(0, 1)
plt.show()

I will leave this simple version for others...
import pandas as pd
import matplotlib.pyplot as plt
# Minimal version: read the whitespace-separated sample and draw one subplot per item.
# Raw string so that `\s` reaches the regex engine instead of being an invalid string escape.
df = pd.read_table('sample.txt', delimiter=r'\s+')

item_ids = df['item_id'].unique()
# Size the grid from the data instead of hard-coding 3; squeeze=False keeps
# `axes` two-dimensional even for a single item.
fig, axes = plt.subplots(1, len(item_ids), sharey=True, squeeze=False)
for ax, item_id in zip(axes[0], item_ids):
    # Keep only this item's rows, then spread sales_quantity into one column per year.
    idf = df.loc[df['item_id'] == item_id, ['year', 'sales_quantity']].pivot(columns='year')
    print(idf)
    idf.plot.box(ax=ax)
    ax.set_title('Item ID {}'.format(item_id))
    # After the pivot each column label is a (value name, year) pair; show the year only.
    ax.set_xticklabels([year for _, year in idf.columns])
plt.show()
sample.txt
year item_id sales_quantity
2014 1 10
2014 1 4
2015 1 7
2015 1 10
2014 2 1
2014 2 8
2015 2 17
2015 2 30
2014 3 9
2014 3 18

Related

Plot line from dataframe

I have the following dataframe [1] which contains information relating to music listening. I would like to print a line graph like the following 2 (I got it by putting the data manually) in which the slotID and the average bpm are related, without writing the values by hand . Each segment must be one unit long and must match the average bpm.
[1]
slotID NUn NTot MeanBPM
2 2 3 13 107.987769
9 11 3 30 133.772100
10 12 3 40 122.354025
13 15 4 44 123.221659
14 16 4 30 129.083900
15 17 9 66 123.274409
16 18 4 25 131.323480
18 20 5 40 124.782625
19 21 6 30 127.664467
20 22 6 19 120.483579
The code I used to obtain the plot is the following:
import numpy as np
import pylab as pl
from matplotlib import collections as mc
# Three hand-written horizontal segments, each given as a pair of (x, y) endpoints one x-unit apart.
lines = [ [(2, 107), (3,107)], [(11,133),(12,133)], [(12,122),(13,122)], ]
# One RGBA colour per segment: red, green, blue, all fully opaque.
c = np.array([(1, 0, 0, 1), (0, 1, 0, 1), (0, 0, 1, 1)])
lc = mc.LineCollection(lines, colors=c, linewidths=2)
fig, ax = pl.subplots()
ax.add_collection(lc)
# Fit the axis limits to the added segments, then pad them with a 0.1 margin.
ax.autoscale()
ax.margins(0.1)
To obtain data:
import numpy as np
import pandas as pd
# Read the semicolon-separated file, decoded as ISO-8859-1.
dfLunedi = pd.read_csv( "5.sab.csv", encoding = "ISO-8859-1", sep = ';')
# Per slotID: number of distinct dates, number of non-null dates, and mean tempo.
dfSlotMean = dfLunedi.groupby('slotID', as_index=False).agg( NSabUn=('date', 'nunique'),NSabTot = ('date', 'count'), MeanBPM=('tempo', 'mean') )
# NOTE(review): dfSlotMean is presumably already a DataFrame, which would make this wrap redundant — confirm.
df = pd.DataFrame(dfSlotMean)
# Written before the filter below, so the CSV still contains every slot.
df.to_csv('sil.csv', sep = ';', index=False)
# Remove, in place, the slots that appear on fewer than 3 distinct dates.
df.drop(df[df.NSabUn < 3].index, inplace=True)
You can loop through the rows and plot each segment like this:
# Draw one horizontal segment per row: one unit long, starting at the slot,
# at the height of that slot's mean BPM. Iterating the two columns directly
# avoids building a Series for every row as `iterrows()` does.
for slot, bpm in zip(df['slotID'], df['MeanBPM']):
    plt.plot([slot, slot + 1], [bpm, bpm])
Output:

Different binning for histplot as JoinGrid (x,y) marginal plot

I have a pandas dataframe like this:
Date
Weight
Year
Month
Day
Week
DayOfWeek
0
2017-11-13
76.1
2017
11
13
46
0
1
2017-11-14
76.2
2017
11
14
46
1
2
2017-11-15
76.6
2017
11
15
46
2
3
2017-11-16
77.1
2017
11
16
46
3
4
2017-11-17
76.7
2017
11
17
46
4
...
...
...
...
...
...
...
...
I created a JointGrid with:
# Grid with Date on x and Weight on y; marginal_ticks=True shows ticks on the marginal axes.
g = sns.JointGrid(data=df,
x="Date",
y="Weight",
marginal_ticks=True,
height=6,
ratio=2,
space=.05)
# Scatter plot in the joint axes, coloured by Year.
g.plot_joint(sns.scatterplot,
hue=df["Year"],
alpha=.4,
legend=True)
# A single call covers both marginals, so bins=20 is used for the x and the y histogram alike.
g.plot_marginals(sns.histplot,
multiple="stack",
bins=20,
hue=df["Year"])
Result is this.
Now the question is: "is it possible to specify different binning for the two histplot resulting in the x and y marginal plot?"
I don't think there is a built-in way to do that, but you can plot directly on the marginal axes using the plotting function of your choice, like so:
# Example data: seaborn's 'penguins' dataset.
penguins = sns.load_dataset('penguins')
data = penguins
# Column names are held in variables so the joint plot and both marginals use the same ones.
x_col = "bill_length_mm"
y_col = "bill_depth_mm"
hue_col = "species"
g = sns.JointGrid(data=data, x=x_col, y=y_col, hue=hue_col)
# Joint (central) plot only; the marginals are drawn by hand below instead of via plot_marginals.
g.plot_joint(sns.scatterplot)
# top marginal: drawn directly on the grid's x-marginal axes with its own bin count (5)
sns.histplot(data=data, x=x_col, hue=hue_col, bins=5, ax=g.ax_marg_x, legend=False, multiple='stack')
# right marginal: drawn directly on the grid's y-marginal axes with a different bin count (40)
sns.histplot(data=data, y=y_col, hue=hue_col, bins=40, ax=g.ax_marg_y, legend=False, multiple='stack')

Drawing of Cluster Column Graph in Matplotlib

Can anyone explain how can I draw a cluster column chart exactly like this in Matplotlib? I found some similar graphs but I want exactly the graph as shown. I have fruit names such as apples and pears etc as keys and their sale in years as values of these keys.
The following code first creates some toy data and then uses matplotlib to draw a bar plot.
import matplotlib.pyplot as plt
from matplotlib.transforms import blended_transform_factory
from matplotlib.ticker import MultipleLocator
import numpy as np
import pandas as pd
import seaborn as sns
# Clustered column chart: one cluster per fruit, one bar per year inside each cluster.
fruits = ['apples', 'pears', 'nectarines', 'plums', 'grapes', 'strawberries']
years = [2015, 2016, 2017]
num_fruit = len(fruits)
num_years = len(years)
# Toy data in long format: one row per (fruit, year) with a random integer value from 1 to 7.
df = pd.DataFrame({'fruit': np.tile(fruits, num_years),
                   'year': np.repeat(years, num_fruit),
                   'value': np.random.randint(1, 8, num_fruit * num_years)})

width = 0.8  # total width of one cluster, in x-axis units
bar_width = width / num_years
# Offset of each year's bar from the centre of its cluster (cluster i is centred on x = i).
# Computed once so that the bars and the tick positions cannot drift apart.
offsets = [bar_width * (j - (num_years - 1) / 2) for j in range(num_years)]
# Wide table (fruit rows, year columns) for direct lookup instead of filtering the frame per bar.
values = df.pivot(index='fruit', columns='year', values='value')

for i, fruit in enumerate(fruits):
    for offset, year in zip(offsets, years):
        plt.bar(i + offset, values.at[fruit, year], bar_width, color='skyblue', ec='white')

# One tick per bar, labelled with its year.
plt.xticks([i + offset for i in range(num_fruit) for offset in offsets],
           np.tile(years, num_fruit), rotation=45)
ax = plt.gca()
ax.yaxis.set_major_locator(MultipleLocator(1))
ax.yaxis.set_minor_locator(MultipleLocator(0.2))
ax.grid(True, axis='y')
ax.autoscale(False, axis='y')

# x in data coordinates, y in axes coordinates: the fruit labels and separators
# are placed below the axis (negative y) whatever the y-scale is.
trans = blended_transform_factory(ax.transData, ax.transAxes)
for i, fruit in enumerate(fruits):
    ax.text(i, -0.2, fruit, transform=trans, ha='center')
    if i != 0:
        # Separator halfway between this cluster and the previous one.
        ax.vlines(i - 0.5, 0, -0.3, color='lightgrey', clip_on=False, transform=trans)
plt.tight_layout()
print(df)
plt.show()
For this example the data looked like:
fruit year value
0 apples 2015 1
1 pears 2015 3
2 nectarines 2015 6
3 plums 2015 3
4 grapes 2015 3
5 strawberries 2015 1
6 apples 2016 4
7 pears 2016 6
8 nectarines 2016 1
9 plums 2016 6
10 grapes 2016 4
11 strawberries 2016 5
12 apples 2017 3
13 pears 2017 6
14 nectarines 2017 7
15 plums 2017 3
16 grapes 2017 5
17 strawberries 2017 1

How to show two figures for every x in matplotlib boxplot?

I'm trying to show two figures for every x in my boxplot next to each other. But my code keeps the figures on top of each other. I cannot figure out how to fix this because I'm using three different separated data-frames (org_data, holiday_false and holiday_true). Please help.
data.csv:
weekday | holiday | casual | registered
---------------------------------------
0 1 500 153
2 0 412 654
6 1 846 113
2 0 456 121
3 0 124 654
... ... ... ...
... ... ... ...
code:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
plt.style.use('seaborn-notebook')
%matplotlib inline
fig, axes = plt.subplots(figsize=(16, 7))
org_data = pd.read_csv("data.csv")
# Rows that are not flagged as holiday and whose weekday is neither 0 nor 6.
holiday_false = org_data[(org_data["holiday"] == 0) & (org_data["weekday"] != 0) & (org_data["weekday"] != 6)]
# The complement: flagged as holiday, or weekday 0, or weekday 6.
holiday_true = org_data[(org_data["holiday"] == 1) | (org_data["weekday"] == 0) | (org_data["weekday"] == 6)]
# Left panel: casual users.
ax1 = plt.subplot(121)
# Two independent boxplot calls on the same axes with the same x variable:
# the second set of boxes is drawn over the first rather than beside it.
sns.boxplot(x=org_data["weekday"], y=holiday_false["casual"], color="orange")
sns.boxplot(x=org_data["weekday"], y=holiday_true["casual"], color="skyblue")
ax1.set_title("Nuber of Casual Users on Holidays and Non-Holidays")
ax1.set_xlabel("Days")
ax1.set_ylabel("Number of Casual Users")
# Right panel: registered users, same two-call pattern with narrower boxes.
ax2 = plt.subplot(122)
sns.boxplot(x=org_data["weekday"], y=holiday_false["registered"], width=0.50, color="orange")
sns.boxplot(x=org_data["weekday"], y=holiday_true["registered"], width=0.50, color="skyblue")
ax2.set_title("Number of Registered Users on Holidays and Non-Holidays")
ax2.set_xlabel("Days")
ax2.set_ylabel("Number of Registered Users")
plt.show()
the type of chart I get:
the chart I want:

python pandas bar plot another column text

max min mincount maxcount
0 12 10 1 6
1 21 14 1 6
2 34 19 1 6
3 6 20 1 4
4 8 22 1 4
5 41 23 1 4
this is pandas DataFrame.
so I want like this image.
enter image description here
text label is very important.
here my code
# NOTE(review): max_count, min_count, max and min are not defined in this snippet —
# presumably sequences built earlier; otherwise `max`/`min` are the Python builtins. Confirm.
df = pd.DataFrame({'maxcount': max_count, 'mincount': min_count, 'max': max, 'min': min})
# Grouped bars for the two count columns only; 'max' and 'min' are not plotted here.
ax = df[['maxcount', 'mincount']].plot(kind='bar')
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# create your dataframe
d = {'max': [12, 21, 34, 6, 8, 41], 'min': [10, 14, 19, 20, 22, 23],
     'mincount': [1, 1, 1, 1, 1, 1], 'maxcount': [6, 6, 6, 4, 4, 4]}
df = pd.DataFrame(d)
# create 2 dataframes counts and max_min (1 for plotting and 1 for text)
counts = pd.DataFrame(df, columns=['maxcount', 'mincount'])
max_min = pd.DataFrame(df, columns=['max', 'min'])
# plot the counts
ax = counts.plot(kind='bar', colormap='Paired', figsize=(12, 4))

# Label every bar with "<value> (<count>)" just above its top.
# Column 0 (maxcount/max) is the left bar of each pair, column 1 (mincount/min) the right one.
# The x offsets put the value slightly left of the count so the two texts do not overlap.
label_offsets = [(-0.2, -0.1), (0.1, 0.2)]
for col, (value_dx, count_dx) in enumerate(label_offsets):
    pairs = zip(max_min.iloc[:, col].values, counts.iloc[:, col].values)
    for pos, (value, count) in enumerate(pairs):
        ax.annotate('{}'.format(value), (pos + value_dx, count),
                    va='bottom', ha='center', fontsize=10)
        ax.annotate('({})'.format(count), (pos + count_dx, count),
                    va='bottom', ha='center', fontsize=10)
This is the output:

Categories