Drawing of Cluster Column Graph in Matplotlib - python

Can anyone explain how I can draw a clustered column chart exactly like this in Matplotlib? I found some similar graphs, but I want exactly the graph shown. I have fruit names such as apples and pears as keys, and their sales per year as the values of these keys.

The following code first creates some toy data and then uses matplotlib to draw a bar plot.
import matplotlib.pyplot as plt
from matplotlib.transforms import blended_transform_factory
from matplotlib.ticker import MultipleLocator
import numpy as np
import pandas as pd
import seaborn as sns
# Toy data: one row per (fruit, year) pair with a random integer sales value.
fruits = ['apples', 'pears', 'nectarines', 'plums', 'grapes', 'strawberries']
years = [2015, 2016, 2017]
num_fruit = len(fruits)
num_years = len(years)
df = pd.DataFrame({'fruit': np.tile(fruits, num_years),
                   'year': np.repeat(years, num_fruit),
                   'value': np.random.randint(1, 8, num_fruit * num_years)})

# Fruit number i is centred on x == i; its yearly bars share a cluster of
# total width `width`, so each bar is `bar_width` wide.
width = 0.8
bar_width = width / num_years
# Offsets of the individual bars relative to the centre of their cluster.
# Computed once so the bars and the tick positions cannot drift apart.
offsets = [bar_width * (j - (num_years - 1) / 2) for j in range(num_years)]

for i, fruit in enumerate(fruits):
    for offset, year in zip(offsets, years):
        plt.bar(i + offset,
                df[(df['fruit'] == fruit) & (df['year'] == year)]['value'],
                bar_width, color='skyblue', ec='white')

# One tick per bar, labelled with the year that bar represents.
plt.xticks([i + offset for i in range(num_fruit) for offset in offsets],
           np.tile(years, num_fruit), rotation=45)

ax = plt.gca()
ax.yaxis.set_major_locator(MultipleLocator(1))
ax.yaxis.set_minor_locator(MultipleLocator(0.2))
ax.grid(True, axis='y')
# Freeze the y-limits before adding the artists that sit below the axes.
ax.autoscale(False, axis='y')

# Blended transform: x is given in data coordinates, y in axes coordinates
# (0 = bottom of the axes, negative values lie below it).
trans = blended_transform_factory(ax.transData, ax.transAxes)
for i, fruit in enumerate(fruits):
    # Fruit name centred under its cluster, below the year tick labels.
    ax.text(i, -0.2, fruit, transform=trans, ha='center')
    if i != 0:
        # Separator halfway between this cluster and the previous one.
        ax.vlines(i - 0.5, 0, -0.3, color='lightgrey', clip_on=False, transform=trans)

plt.tight_layout()
print(df)
plt.show()
For this example the data looked like:
fruit year value
0 apples 2015 1
1 pears 2015 3
2 nectarines 2015 6
3 plums 2015 3
4 grapes 2015 3
5 strawberries 2015 1
6 apples 2016 4
7 pears 2016 6
8 nectarines 2016 1
9 plums 2016 6
10 grapes 2016 4
11 strawberries 2016 5
12 apples 2017 3
13 pears 2017 6
14 nectarines 2017 7
15 plums 2017 3
16 grapes 2017 5
17 strawberries 2017 1

Related

How to add a box plot and a vertical line in a histogram diagram in python Plotly Express graph objects subplots

Below is the data that is used to create the histogram subplots with Plotly (Express / graph objects).
The code below is used to create the histogram subplots with Plotly graph objects.
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# One histogram cell per generation, laid out side by side in a single row.
generations = ['Millenials', 'Generation X', 'Boomers']
specs = [[{'type': 'histogram'} for _ in generations]]
fig = make_subplots(
    rows=1,
    cols=len(generations),
    specs=specs,
    subplot_titles=['<b> {} </b>'.format(generation) for generation in generations],
)

# Add the histogram of NumCompaniesWorked for each generation to its column.
for col, generation in enumerate(generations, start=1):
    fig.add_trace(
        go.Histogram(
            x=df[df['Generation'] == generation]['NumCompaniesWorked'],
            opacity=0.5,
            marker_color=['#455f66'] * 15,
        ),
        1,
        col,
    )

title_text = "<b> Histogram - <br> <span style='color: #f55142'> How to add the box plot and mean vertical line on each diagram </span></b> "
title_font = dict(family="Arial", size=20, color='#283747')
fig.update_layout(showlegend=False, title=dict(text=title_text, font=title_font))
fig.show()
And below is the output I get from the above code
How can I add a vertical line marking the mean (average) to each histogram? The mean values are:
Millenials = 2.2
Generation X = 3.4
Boomers = 4.1
and a box plot above each of the 3 histograms.
The result should look like the diagram shown below for all 3 histograms.
import pandas as pd
import numpy as np
# Original wide-format frame: one column of counts per generation.
df = pd.DataFrame({'NumCompaniesWorked': list(range(10)),
                   'Millenials': [139, 407, 54, 57, 55, 32, 35, 28, 17, 24],
                   'Generation X': [53, 108, 83, 90, 70, 27, 32, 40, 26, 24],
                   'Boomers': [5, 6, 9, 12, 14, 4, 3, 6, 6, 4]})

# Reshape to long format: for each generation keep NumCompaniesWorked, rename
# that generation's column to 'count' and tag the rows with a 'Generation'
# column, then stack the three pieces (each piece keeps its 0-9 index).
dfs = [
    df[['NumCompaniesWorked', col]].rename(columns={col: 'count'}).assign(Generation=col)
    for col in ['Millenials', 'Generation X', 'Boomers']
]
df = pd.concat(dfs)
#output
NumCompaniesWorked count Generation
0 0 139 Millenials
1 1 407 Millenials
2 2 54 Millenials
3 3 57 Millenials
4 4 55 Millenials
5 5 32 Millenials
6 6 35 Millenials
7 7 28 Millenials
8 8 17 Millenials
9 9 24 Millenials
0 0 53 Generation X
1 1 108 Generation X
2 2 83 Generation X
3 3 90 Generation X
4 4 70 Generation X
5 5 27 Generation X
6 6 32 Generation X
7 7 40 Generation X
8 8 26 Generation X
9 9 24 Generation X
0 0 5 Boomers
1 1 6 Boomers
2 2 9 Boomers
3 3 12 Boomers
4 4 14 Boomers
5 5 4 Boomers
6 6 3 Boomers
7 7 6 Boomers
8 8 6 Boomers
9 9 4 Boomers
# One facet per generation, with a box plot drawn as the marginal of each
# histogram of NumCompaniesWorked weighted by 'count'.
histogram_options = dict(x='NumCompaniesWorked', y='count', marginal='box', facet_col='Generation')
fig = px.histogram(df, **histogram_options)

# Dashed vertical line at the given mean value in facet columns 1, 2 and 3.
for facet_col, mean_value in enumerate((2.2, 3.4, 4.1), start=1):
    fig.add_vline(x=mean_value, line_width=1, line_dash='dash', line_color='gray', col=facet_col)
fig.show()

Python Matplotlib bars subplots by Category and Aggregation

I have a table like this:
# Sample table: 4 toy products, 8 food products and 3 furniture products,
# each with a quantity.
categories = ["Toys"] * 4 + ["Food"] * 8 + ["Furniture"] * 3
products = ["AA", "BB", "CC", "DD",
            "SSS", "DDD", "FFF", "RRR", "EEE", "WWW", "LLLLL", "PPPPPP",
            "LPO", "NHY", "MKO"]
quantities = [100, 200, 300, 50,
              20, 800, 300, 450, 150, 320, 400, 1000,
              150, 900, 1150]
data = dict(Category=categories, Product=products, QTY=quantities)
df = pd.DataFrame(data)
df
Out:
Category Product QTY
0 Toys AA 100
1 Toys BB 200
2 Toys CC 300
3 Toys DD 50
4 Food SSS 20
5 Food DDD 800
6 Food FFF 300
7 Food RRR 450
8 Food EEE 150
9 Food WWW 320
10 Food LLLLL 400
11 Food PPPPP 1000
12 Furniture LPO 150
13 Furniture NHY 900
14 Furniture MKO 1150
So, I need to make bar subplots like this (sum of products in each category):
My problem is that I can't figure out how to combine categories, series, and aggregation.
I managed to split them into 3 subplots (1 always stays blank), but I cannot unite them...
import matplotlib.pyplot as plt
fig, axarr = plt.subplots(2, 2, figsize=(12, 8))

# (column, subplot) pairs; the top-right subplot axarr[0][1] is never drawn on.
panels = [
    ('Category', axarr[0][0]),
    ('Product', axarr[1][0]),
    ('QTY', axarr[1][1]),
]
for column, panel in panels:
    # Bar chart of how often each distinct value of the column occurs.
    df[column].value_counts().plot.bar(ax=panel, fontsize=12, color='b')
    panel.set_title(column, fontsize=18)

plt.subplots_adjust(hspace=.3)
plt.show()
Out
What do I need to add to combine them?
This would be a lot easier with seaborn and FacetGrid
import pandas as pd
import seaborn as sns
# Sample table: each product belongs to one category and has a quantity.
data = dict(
    Category=["Toys", "Toys", "Toys", "Toys",
              "Food", "Food", "Food", "Food", "Food", "Food", "Food", "Food",
              "Furniture", "Furniture", "Furniture"],
    Product=["AA", "BB", "CC", "DD",
             "SSS", "DDD", "FFF", "RRR", "EEE", "WWW", "LLLLL", "PPPPPP",
             "LPO", "NHY", "MKO"],
    QTY=[100, 200, 300, 50,
         20, 800, 300, 450, 150, 320, 400, 1000,
         150, 900, 1150],
)
df = pd.DataFrame(data)

# One facet per category, wrapped after two columns; axes are not shared, so
# each facet gets its own product labels and its own quantity scale.
grid_options = dict(col='Category', sharex=False, sharey=False, col_wrap=2, height=3, aspect=1.5)
g = sns.FacetGrid(df, **grid_options)
g.map_dataframe(sns.barplot, x='Product', y='QTY')

Pandas Groupby plotting with unstack()

I have the following code
# Sample data: one row per measurement of 'pos' and 'force' for a person
# ('type'), a number ('num') and a year ('date').
df = pd.DataFrame({
    'type': ['john', 'bill', 'john', 'bill', 'bill', 'bill', 'bill', 'john', 'john'],
    'num': [1006, 1004, 1006, 1004, 1006, 1006, 1006, 1004, 1004],
    'date': [2017, 2016, 2015, 2017, 2017, 2013, 2012, 2013, 2012],
    'pos': [0, 0, 1, 4, 0, 3, 3, 8, 9],
    'force': [5, 2, 7, 10, 6, 12, 4, 7, 8]})
fig, ax = plt.subplots()
# Group by the scalar key 'type' rather than the list ['type']: with a
# one-element list, recent pandas versions yield 1-tuples such as ('bill',)
# as `name`, which would not match the printed output shown below.
grp = df.sort_values('date').groupby('type')
for name, group in grp:
    print(name)
    print(group)
    # No `ax=` is passed here, so the `ax` created above is not reused;
    # presumably each call opens its own figure, which matches the two
    # separate images reported in the question.
    group.plot(x='date', y='force', label=name)
plt.show()
The result obtained is as follows:
bill
type num date pos force
6 bill 1006 2012 3 4
5 bill 1006 2013 3 12
1 bill 1004 2016 0 2
3 bill 1004 2017 4 10
4 bill 1006 2017 0 6
john
type num date pos force
8 john 1004 2012 9 8
7 john 1004 2013 8 7
2 john 1006 2015 1 7
0 john 1006 2017 0 5
[img1_force_Bill][1]
[img2_Force_john][2]
How can I get 4 figures, each with 2 lines:
Fig1 for bill: line1(x=date , y= force) for num(1004)/
line2(x=date , y= force) for num(1006)
Fig2 for bill: line1(x=date , y= pos) for num(1004)/
line2(x=date , y= pos) for num(1006)
Fig3 for john: line1(x=date , y= force) for num(1004)/
line2(x=date , y= force) for num(1006)
Fig4 for john: line1(x=date , y= pos) for num(1004)/
line2(x=date , y= pos) for num(1006)
Let's try this:
# Sample data: one row per measurement of 'pos' and 'force' for a person
# ('type'), a number ('num') and a year ('date').
df = pd.DataFrame({
    'type': ['john', 'bill', 'john', 'bill', 'bill', 'bill', 'bill', 'john', 'john'],
    'num': [1006, 1004, 1006, 1004, 1006, 1006, 1006, 1004, 1004],
    'date': [2017, 2016, 2015, 2017, 2017, 2013, 2012, 2013, 2012],
    'pos': [0, 0, 1, 4, 0, 3, 3, 8, 9],
    'force': [5, 2, 7, 10, 6, 12, 4, 7, 8]})

# 2x2 grid: one row of subplots per person, one column per measured quantity.
fig, ax = plt.subplots(2, 2)
axi = iter(ax.flatten())

# Group by the scalar key 'type' (not ['type']) so that `name` is a plain
# string; a one-element list key yields 1-tuples in recent pandas versions,
# and tuple + str in the title would raise TypeError.
grp = df.sort_values('date').groupby('type')
for name, group in grp:
    by_date_num = group.set_index(['date', 'num'])
    for column in ('force', 'pos'):
        # unstack() moves 'num' into the columns, giving one line per num.
        by_date_num[column].unstack().plot(title=name + ' - ' + column, ax=next(axi), legend=False)

plt.tight_layout()
# Per-axes legends are disabled above; this single legend is attached to the
# current (last drawn) axes and positioned below the grid.
plt.legend(loc='upper center', bbox_to_anchor=(0, -.5), ncol=2)
plt.show()
Output:
Update per comment below:
# Plot the 'force' lines for a single person and mark the maximum force.
# The person is named explicitly instead of relying on whatever value a
# previous loop happened to leave in `name`.
person = 'john'
dfj = df[df['type'] == person]
ax = dfj.set_index(['date', 'num'])['force'].unstack().plot(title=person + ' - force', legend=False)
# Horizontal red line at the largest 'force' value of this person.
ax.axhline(y=dfj['force'].max(), color='red', alpha=.8)
Chart:
@Scott Boston
... thank you a lot for your help.
Unfortunately, after using the following code with big data to plot 2 lines
# For every group, draw one 'pos' line per 'num' against 'date'.
# NOTE(review): unstack() leaves NaN for every (date, num) combination that
# has no row, which is presumably why the plotted lines show gaps -- confirm
# against the real data.
for name, group in grp_new:
    axn = group.set_index(['date', 'num'])['pos'].unstack().plot(title=name + ' _pos', legend=False)
the plot looks like plot2Lines. The lines are not continuous. I tried plotting single lines and they were OK.

python pandas bar plot another column text

max min mincount maxcount
0 12 10 1 6
1 21 14 1 6
2 34 19 1 6
3 6 20 1 4
4 8 22 1 4
5 41 23 1 4
This is a pandas DataFrame.
I want a plot like this image.
enter image description here
The text labels are very important.
Here is my code:
# Assemble the four columns into one frame, then draw grouped bars for the
# two count columns only.
df = pd.DataFrame(dict(maxcount=max_count, mincount=min_count, max=max, min=min))
ax = df[['maxcount', 'mincount']].plot.bar()
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Sample frame: 'max'/'min' hold the numbers to print as text,
# 'maxcount'/'mincount' hold the bar heights.
d = {'max': [12, 21, 34, 6, 8, 41], 'min': [10, 14, 19, 20, 22, 23],
     'mincount': [1, 1, 1, 1, 1, 1], 'maxcount': [6, 6, 6, 4, 4, 4]}
df = pd.DataFrame(d)

# Two views of the data: `counts` is plotted, `max_min` supplies the labels.
# Column k of `max_min` labels the bars of column k of `counts`.
counts = df[['maxcount', 'mincount']]
max_min = df[['max', 'min']]

# Grouped bar chart: one group per row, one bar per count column.
ax = counts.plot(kind='bar', colormap='Paired', figsize=(12, 4))

# Horizontal offsets, relative to the group position, of the two labels
# written above each bar: (value label, "(count)" label). The first pair is
# for the left bar (maxcount), the second for the right bar (mincount).
label_offsets = [(-0.2, -0.1), (0.1, 0.2)]
for col, (value_dx, count_dx) in enumerate(label_offsets):
    pairs = zip(max_min.iloc[:, col].values, counts.iloc[:, col].values)
    for row, (value, height) in enumerate(pairs):
        # Both labels sit on top of the bar, i.e. at y == bar height.
        ax.annotate('%d' % value, (row + value_dx, height), va='bottom', ha='center', fontsize=10)
        ax.annotate('(%d)' % height, (row + count_dx, height), va='bottom', ha='center', fontsize=10)
This is the output:

Multiple boxplots based on pandas groups

Here is how my dataframe looks like:
year item_id sales_quantity
2014 1 10
2014 1 4
... ... ...
2015 1 7
2015 1 10
... ... ...
2014 2 1
2014 2 8
... ... ...
2015 2 17
2015 2 30
... ... ...
2014 3 9
2014 3 18
... ... ...
For each item_id, I want to plot a boxplot showing the distribution for each year.
Here is what I tried:
# Question code, kept as posted; the question reports that it fails.
# NOTE(review): the loop body below has lost its indentation in this copy.
data = pd.DataFrame.from_csv('electronics.csv')
# One group per distinct year.
grouped = data.groupby(['year'])
# Grid with 4 columns and as many rows as needed to hold one subplot per group.
ncols=4
nrows = int(np.ceil(grouped.ngroups/ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(35,45),
sharey=False)
# Pair each group key with one subplot.
# NOTE(review): the boxplot call below is the one the question reports as
# failing with "boxplot() got multiple values for argument 'x'".
for (key, ax) in zip(grouped.groups.keys(), axes.flatten()):
grouped.get_group(key).boxplot(x='year', y='sales_quantity',
ax=ax, label=key)
I get the error boxplot() got multiple values for argument 'x'. Can someone please tell me how to do this right?
If I have only a single item, then the following works
sns.boxplot(data.sales_quantity, groupby = data.year). How could I extend it for multiple items?
Link to csv
Please check comment on the code.
import pandas as pd
import matplotlib.pyplot as plt
# One box-plot panel per item, all in a single row; each panel shows the
# distribution of sales_quantity per year for that item.
df = pd.read_csv('electronics_157_3cols.csv')
print(df)

item_ids = df['item_id_copy'].unique()
# squeeze=False always returns a 2-D array of axes, so flattening it gives a
# 1-D sequence even when there is only a single item.
fig, axes = plt.subplots(1, len(item_ids), sharey=True, squeeze=False)
axes = axes.ravel()

for ax, i in zip(axes, item_ids):
    # Rows of this item, spread into one sales_quantity column per year.
    idf = df[df['item_id_copy'] == i][['year', 'sales_quantity']].pivot(columns='year')
    print(idf)
    idf.plot.box(ax=ax)
    ax.set_title('ID {}'.format(i))
    # Label each box with the second element of its column key
    # (presumably the year, given pivot(columns='year') -- confirm).
    ax.set_xticklabels([e[1] for e in idf.columns], rotation=45)
    # Fixed y-range chosen by the original author to get a readable graph;
    # remove this line to let the axis follow the data so outliers show.
    ax.set_ylim(0, 1)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Box plots of sales_quantity per year, one panel per item, on a 2x5 grid.
df = pd.read_csv('electronics_157_3cols.csv')
print(df)
fig, axes = plt.subplots(2, 5, sharey=True)

# Pair panels with item ids in row-major order. zip() stops at the shorter
# sequence, so fewer than 10 items simply leaves the remaining panels empty
# and more than 10 items plots only the first 10.
for ax, i in zip(axes.flatten(), df['item_id_copy'].unique()):
    # Rows of this item, spread into one sales_quantity column per year.
    idf = df[df['item_id_copy'] == i][['year', 'sales_quantity']].pivot(columns='year')
    print(idf)
    idf.plot.box(ax=ax)
    ax.set_title('ID {}'.format(i))
    # Label each box with the second element of its column key
    # (presumably the year, given pivot(columns='year') -- confirm).
    ax.set_xticklabels([e[1] for e in idf.columns], rotation=0)
    # Fixed y-range kept from the original answer.
    ax.set_ylim(0, 1)
plt.show()
I will leave this simple version for others...
import pandas as pd
import matplotlib.pyplot as plt
# Simple version: read the whitespace-separated sample file and draw one
# box-plot panel per item, with one box per year.
# Raw string so that \s reaches the regex engine unchanged.
df = pd.read_table('sample.txt', delimiter=r'\s+')

item_ids = df['item_id'].unique()
# Size the grid from the data instead of hard-coding 3 panels; squeeze=False
# plus ravel() gives a 1-D sequence of axes for any number of items.
fig, axes = plt.subplots(1, len(item_ids), sharey=True, squeeze=False)
axes = axes.ravel()

for ax, i in zip(axes, item_ids):
    # Rows of this item, spread into one sales_quantity column per year.
    idf = df[df['item_id'] == i][['year', 'sales_quantity']].pivot(columns='year')
    print(idf)
    idf.plot.box(ax=ax)
    ax.set_title('Item ID {}'.format(i))
    # Label each box with the second element of its column key
    # (presumably the year, given pivot(columns='year') -- confirm).
    ax.set_xticklabels([e[1] for e in idf.columns])
plt.show()
sample.txt
year item_id sales_quantity
2014 1 10
2014 1 4
2015 1 7
2015 1 10
2014 2 1
2014 2 8
2015 2 17
2015 2 30
2014 3 9
2014 3 18

Categories