max min mincount maxcount
0 12 10 1 6
1 21 14 1 6
2 34 19 1 6
3 6 20 1 4
4 8 22 1 4
5 41 23 1 4
this is pandas DataFrame.
so I want like this image.
enter image description here
text label is very important.
here my code
df = pd.DataFrame({'maxcount': max_count, 'mincount': min_count, 'max': max, 'min': min})
ax = df[['maxcount', 'mincount']].plot(kind='bar')
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#create your dataframe
d= {'max':[12,21,34,6,8,41],'min':[10,14,19,20,22,23],
'mincount':[1,1,1,1,1,1],'maxcount':[6,6,6,4,4,4]}
df=pd.DataFrame(d)
#create 2 dataframes counts and max_min (1 for plotting and 1 for text)
counts=pd.DataFrame(df,columns=['maxcount','mincount'])
max_min=pd.DataFrame(df,columns=['max','min'])
#plot the counts
ax=counts[counts.columns].plot(kind='bar',colormap='Paired',figsize= (12,4))
#using zip() and ax.annotate specify where (location by means of z)
#and what (max_min or counts) you want to plot
for x,y,z in zip(max_min.iloc[:,0].values,counts.iloc[:,0].values, range(len(counts))):
ax.annotate('%.d' % x, (z-0.2, counts.iloc[z,0]), va='bottom', ha='center', fontsize=10)
ax.annotate("("'%.d' % y+")", (z-0.1, counts.iloc[z,0]), va='bottom', ha='center', fontsize=10)
for x,y,z in zip(max_min.iloc[:,1].values,counts.iloc[:,1].values, range(len(counts))):
ax.annotate('%.d' % x, (z+0.1, counts.iloc[z,1]), va='bottom', ha='center', fontsize=10)
ax.annotate("("'%.d' % y+")", (z+0.2, counts.iloc[z,1]), va='bottom', ha='center', fontsize=10)
This is the output:
Related
I am analysing data which is organised as following:
There are 4 different pandas data fram for each groups (A, B and C).
Each dataframe representing a group has 4 subroups (columns) and rows representing thoer corresponding observations.
For example, a single group of data looks like:
subgroup-1
subgroup-2
subgroup-3
subgroup-4
12
4
NaN
9
15
3
4
NaN
16
8
3
11
17
12
8
13
11
17
12
14
I want to visualise the distributions for each subgroup for the different group. Can anyone let me know what are the available options in Python to do this (the chart types I can use). Thanks.
I tried using histogram, density plots but all of them work only for 2 variables.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# pandas Dataframes
group_A = pd.DataFrame(np.random.rand(50, 4) , columns=['subgroup-1' , 'subgroup-2' , 'subgroup-3' , 'subgroup-4'])
group_B = pd.DataFrame(np.random.rand(50, 4) , columns=['subgroup-1' , 'subgroup-2' , 'subgroup-3' , 'subgroup-4'])
group_C = pd.DataFrame(np.random.rand(50, 4) , columns=['subgroup-1' , 'subgroup-2' , 'subgroup-3' , 'subgroup-4'])
def plot_hist(subgroup):
np.random.seed(19680801)
n_bins = 10
x = np.dstack([group_A[subgroup] , group_B[subgroup] , group_C[subgroup]])[0]
fig, axes = plt.subplots(nrows=2, ncols=2)
ax0, ax1, ax2, ax3 = axes.flatten()
ax0.hist(x, n_bins, density=True, histtype='bar', label = ['A', 'B', 'C'])
ax0.legend(prop={'size': 10})
ax0.set_title('bars with legend')
ax1.hist(x, n_bins, density=True, histtype='bar', stacked=True)
ax1.set_title('stacked bar')
ax2.hist(x, n_bins, histtype='step', stacked=True, fill=False)
ax2.set_title('stack step (unfilled)')
# Make a multiple-histogram of data-sets with different length.
x_multi = [np.random.randn(n) for n in [10000, 5000, 2000]]
ax3.hist(x_multi, n_bins, histtype='bar')
ax3.set_title('different sample sizes')
fig.tight_layout()
plt.show()
plot_hist('subgroup-1')
reference
I'm trying to plot the data (see below). With company_name on the x-axis, status_mission_2_y on the y axis and percentage on the other y_axis. I have tried using the twinx() fucntion but I can't get it to work.
Please can you help? Thanks in advance!
def twinplot(data):
x_ = data.columns[0]
y_ = data.columns[1]
y_2 = data.columns[2]
data1 = data[[x_, y_]]
data2 = data[[x_, y_2]]
plt.figure(figsize=(15, 8))
ax = sns.barplot(x=x_, y=y_, data=data1)
ax2 = ax.twinx()
g2 = sns.barplot(x=x_, y=y_2, data=data2, ax=ax2)
plt.show()
data = ten_company_missions_failed
twinplot(data)
company_name
percentage
status_mission_2_y
EER
1
1
Ghot
1
1
Trv
1
1
Sandia
1
1
Test
1
1
US Navy
0.823529412
17
Zed
0.8
5
Gov
0.75
4
Knight
0.666666667
3
Had
0.666666667
3
Seaborn plots the two bar plots with the same color and on the same x-positions.
The following example code resizes the bar widths, with the bars belonging ax moved to the left. And the bars of ax2 moved to the right. To differentiate the right bars, a semi-transparency (alpha=0.7) and hatching is used.
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import pandas as pd
import seaborn as sns
from io import StringIO
data_str = '''company_name percentage status_mission_2_y
EER 1 1
Ghot 1 1
Trv 1 1
Sandia 1 1
Test 1 1
"US Navy" 0.823529412 17
Zed 0.8 5
Gov 0.75 4
Knight 0.666666667 3
Had 0.666666667 3'''
data = pd.read_csv(StringIO(data_str), delim_whitespace=True)
x_ = data.columns[0]
y_ = data.columns[1]
y_2 = data.columns[2]
data1 = data[[x_, y_]]
data2 = data[[x_, y_2]]
plt.figure(figsize=(15, 8))
ax = sns.barplot(x=x_, y=y_, data=data1)
width_scale = 0.45
for bar in ax.containers[0]:
bar.set_width(bar.get_width() * width_scale)
ax.yaxis.set_major_formatter(PercentFormatter(1))
ax2 = ax.twinx()
sns.barplot(x=x_, y=y_2, data=data2, alpha=0.7, hatch='xx', ax=ax2)
for bar in ax2.containers[0]:
x = bar.get_x()
w = bar.get_width()
bar.set_x(x + w * (1- width_scale))
bar.set_width(w * width_scale)
plt.show()
A simpler alternative could be to combine a barplot on ax and a lineplot on ax2.
plt.figure(figsize=(15, 8))
ax = sns.barplot(x=x_, y=y_, data=data1)
ax.yaxis.set_major_formatter(PercentFormatter(1))
ax2 = ax.twinx()
sns.lineplot(x=x_, y=y_2, data=data2, marker='o', color='crimson', lw=3, ax=ax2)
plt.show()
I have a table like this:
data = {'Category':["Toys","Toys","Toys","Toys","Food","Food","Food","Food","Food","Food","Food","Food","Furniture","Furniture","Furniture"],
'Product':["AA","BB","CC","DD","SSS","DDD","FFF","RRR","EEE","WWW","LLLLL","PPPPPP","LPO","NHY","MKO"],
'QTY':[100,200,300,50,20,800,300,450,150,320,400,1000,150,900,1150]}
df = pd.DataFrame(data)
df
Out:
Category Product QTY
0 Toys AA 100
1 Toys BB 200
2 Toys CC 300
3 Toys DD 50
4 Food SSS 20
5 Food DDD 800
6 Food FFF 300
7 Food RRR 450
8 Food EEE 150
9 Food WWW 320
10 Food LLLLL 400
11 Food PPPPP 1000
12 Furniture LPO 150
13 Furniture NHY 900
14 Furniture MKO 1150
So, I need to make bars subplots like this (Sum Products in each Category):
My problem is that I can't figure out how to combine categories, series, and aggregation.
I manage to split them into 3 subplots (1 always stays blank) but I can not unite them ...
import matplotlib.pyplot as plt
fig, axarr = plt.subplots(2, 2, figsize=(12, 8))
df['Category'].value_counts().plot.bar(
ax=axarr[0][0], fontsize=12, color='b'
)
axarr[0][0].set_title("Category", fontsize=18)
df['Product'].value_counts().plot.bar(
ax=axarr[1][0], fontsize=12, color='b'
)
axarr[1][0].set_title("Product", fontsize=18)
df['QTY'].value_counts().plot.bar(
ax=axarr[1][1], fontsize=12, color='b'
)
axarr[1][1].set_title("QTY", fontsize=18)
plt.subplots_adjust(hspace=.3)
plt.show()
Out
What do I need to add to combine them?
This would be a lot easier with seaborn and FacetGrid
import pandas as pd
import seaborn as sns
data = {'Category':["Toys","Toys","Toys","Toys","Food","Food","Food","Food","Food","Food","Food","Food","Furniture","Furniture","Furniture"],
'Product':["AA","BB","CC","DD","SSS","DDD","FFF","RRR","EEE","WWW","LLLLL","PPPPPP","LPO","NHY","MKO"],
'QTY':[100,200,300,50,20,800,300,450,150,320,400,1000,150,900,1150]}
df = pd.DataFrame(data)
g = sns.FacetGrid(df, col='Category', sharex=False, sharey=False, col_wrap=2, height=3, aspect=1.5)
g.map_dataframe(sns.barplot, x='Product', y='QTY')
I need to compare different sets of daily data between 4 shifts(categorical / groupby), using bar graphs and line graphs. I have looked everywhere and have not found a working solution for this that doesn't include generating new pivots and such.
I've used both, matplotlib and seaborn, and while I can do one or the other(different colored bars/lines for each shift), once I incorporate the other, either one disappears, or other anomalies happen like only one plot point shows. I have looked all over and there are solutions for representing a single series of data on both chart types, but none that goes into multi category or grouped for both.
Data Example:
report_date wh_id shift Head_Count UTL_R
3/17/19 55 A 72 25%
3/18/19 55 A 71 10%
3/19/19 55 A 76 20%
3/20/19 55 A 59 33%
3/21/19 55 A 65 10%
3/22/19 55 A 54 20%
3/23/19 55 A 66 14%
3/17/19 55 1 11 10%
3/17/19 55 2 27 13%
3/17/19 55 3 18 25%
3/18/19 55 1 23 100%
3/18/19 55 2 16 25%
3/18/19 55 3 12 50%
3/19/19 55 1 28 10%
3/19/19 55 2 23 50%
3/19/19 55 3 14 33%
3/20/19 55 1 29 25%
3/20/19 55 2 29 25%
3/20/19 55 3 10 50%
3/21/19 55 1 17 20%
3/21/19 55 2 29 14%
3/21/19 55 3 30 17%
3/22/19 55 1 12 14%
3/22/19 55 2 10 100%
3/22/19 55 3 17 14%
3/23/19 55 1 16 10%
3/23/19 55 2 11 100%
3/23/19 55 3 13 10%
tm_daily_df = pd.read_csv('fg_TM_Daily.csv')
tm_daily_df = tm_daily_df.set_index('report_date')
fig2, ax2 = plt.subplots(figsize=(12,8))
ax3 = ax2.twinx()
group_obj = tm_daily_df.groupby('shift')
g = group_obj['Head_Count'].plot(kind='bar', x='report_date', y='Head_Count',ax=ax2,stacked=False,alpha = .2)
g = group_obj['UTL_R'].plot(kind='line',x='report_date', y='UTL_R', ax=ax3,marker='d', markersize=12)
plt.legend(tm_daily_df['shift'].unique())
This code has gotten me the closest I've been able to get. Notice that even with stacked = False, they are still stacked. I changed the setting to True, and nothing changes.
All i need is for the bars to be next to each other with the same color scheme representative of the shift
The graph:
Here are two solutions (stacked and unstacked). Based on your questions we will:
plot Head_Count in the left y axis and UTL_R in the right y axis.
report_date will be our x axis
shift will represent the hue of our graph.
The stacked version uses pandas default plotting feature, and the unstacked version uses seaborn.
EDIT
From your request, I added a 100% stacked graph. While it is not quite exactly what you asked in the comment, the graph type you asked may create some confusion when reading (are the values based on the upper line of the stack or the width of the stack). An alternative solution may be using a 100% stacked graph.
Stacked
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
dfg = df.set_index(['report_date', 'shift']).sort_index(level=[0,1])
fig, ax = plt.subplots(figsize=(12,6))
ax2 = ax.twinx()
dfg['Head_Count'].unstack().plot.bar(stacked=True, ax=ax, alpha=0.6)
dfg['UTL_R'].unstack().plot(kind='line', ax=ax2, marker='o', legend=None)
ax.set_title('My Graph')
plt.show()
Stacked 100%
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
dfg = df.set_index(['report_date', 'shift']).sort_index(level=[0,1])
# Create `Head_Count_Pct` column
for date in dfg.index.get_level_values('report_date').unique():
for shift in dfg.loc[date, :].index.get_level_values('shift').unique():
dfg.loc[(date, shift), 'Head_Count_Pct'] = dfg.loc[(date, shift), 'Head_Count'].sum() / dfg.loc[(date, 'A'), 'Head_Count'].sum()
fig, ax = plt.subplots(figsize=(12,6))
ax2 = ax.twinx()
pal = sns.color_palette("Set1")
dfg[dfg.index.get_level_values('shift').isin(['1','2','3'])]['Head_Count_Pct'].unstack().plot.bar(stacked=True, ax=ax, alpha=0.5, color=pal)
dfg['UTL_R'].unstack().plot(kind='line', ax=ax2, marker='o', legend=None, color=pal)
ax.set_title('My Graph')
plt.show()
Unstacked
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
dfg = df.set_index(['report_date', 'shift']).sort_index(level=[0,1])
fig, ax = plt.subplots(figsize=(15,6))
ax2 = ax.twinx()
sns.barplot(x=dfg.index.get_level_values('report_date'),
y=dfg.Head_Count,
hue=dfg.index.get_level_values('shift'), ax=ax, alpha=0.7)
sns.lineplot(x=dfg.index.get_level_values('report_date'),
y=dfg.UTL_R,
hue=dfg.index.get_level_values('shift'), ax=ax2, marker='o', legend=None)
ax.set_title('My Graph')
plt.show()
EDIT #2
Here is the graph as you requested in a second time (stacked, but stack n+1 does not start where stack n ends).
It is slightly more involving as we have to do multiple things:
- we need to manually assign our color to our shift in our df
- once we have our colors assign, we will iterate through each date range and 1) sort or Head_Count values descending (so that our largest sack is in the back when we plot the graph), and 2) plot the data and assign the color to each stacj
- Then we can create our second y axis and plot our UTL_R values
- Then we need to assign the correct color to our legend labels
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def assignColor(shift):
if shift == 'A':
return 'R'
if shift == '1':
return 'B'
if shift == '2':
return 'G'
if shift == '3':
return 'Y'
# map a color to a shift
df['color'] = df['shift'].apply(assignColor)
fig, ax = plt.subplots(figsize=(15,6))
# plot our Head_Count values
for date in df.report_date.unique():
d = df[df.report_date == date].sort_values(by='Head_Count', ascending=False)
y = d.Head_Count.values
x = date
color = d.color
b = plt.bar(x,y, color=color)
# Plot our UTL_R values
ax2 = ax.twinx()
sns.lineplot(x=df.report_date, y=df.UTL_R, hue=df['shift'], marker='o', legend=None)
# Assign the color label color to our legend
leg = ax.legend(labels=df['shift'].unique(), loc=1)
legend_maping = dict()
for shift in df['shift'].unique():
legend_maping[shift] = df[df['shift'] == shift].color.unique()[0]
i = 0
for leg_lab in leg.texts:
leg.legendHandles[i].set_color(legend_maping[leg_lab.get_text()])
i += 1
How about this?
tm_daily_df['UTL_R'] = tm_daily_df['UTL_R'].str.replace('%', '').astype('float') / 100
pivoted = tm_daily_df.pivot_table(values=['Head_Count', 'UTL_R'],
index='report_date',
columns='shift')
pivoted
# Head_Count UTL_R
# shift 1 2 3 A 1 2 3 A
# report_date
# 3/17/19 11 27 18 72 0.10 0.13 0.25 0.25
# 3/18/19 23 16 12 71 1.00 0.25 0.50 0.10
# 3/19/19 28 23 14 76 0.10 0.50 0.33 0.20
# 3/20/19 29 29 10 59 0.25 0.25 0.50 0.33
# 3/21/19 17 29 30 65 0.20 0.14 0.17 0.10
# 3/22/19 12 10 17 54 0.14 1.00 0.14 0.20
# 3/23/19 16 11 13 66 0.10 1.00 0.10 0.14
fig, ax = plt.subplots()
pivoted['Head_Count'].plot.bar(ax=ax)
pivoted['UTL_R'].plot.line(ax=ax, legend=False, secondary_y=True, marker='D')
ax.legend(loc='upper left', title='shift')
Here is how my dataframe looks like:
year item_id sales_quantity
2014 1 10
2014 1 4
... ... ...
2015 1 7
2015 1 10
... ... ...
2014 2 1
2014 2 8
... ... ...
2015 2 17
2015 2 30
... ... ...
2014 3 9
2014 3 18
... ... ...
For each item_id, I want to plot a boxplot showing the distribution for each year.
Here is what I tried:
data = pd.DataFrame.from_csv('electronics.csv')
grouped = data.groupby(['year'])
ncols=4
nrows = int(np.ceil(grouped.ngroups/ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(35,45),
sharey=False)
for (key, ax) in zip(grouped.groups.keys(), axes.flatten()):
grouped.get_group(key).boxplot(x='year', y='sales_quantity',
ax=ax, label=key)
I get the error boxplot() got multiple values for argument 'x'. Can someone please tell me how to do this right?
If I have only a single item, then the following works
sns.boxplot(data.sales_quantity, groupby = data.year). How could I extend it for multiple items?
Link to csv
Please check comment on the code.
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('electronics_157_3cols.csv')
print(df)
fig, axes = plt.subplots(1, len(df['item_id_copy'].unique()), sharey=True)
for n, i in enumerate(df['item_id_copy'].unique()):
idf = df[df['item_id_copy'] == int('{}'.format(i))][['year', 'sales_quantity']].pivot(columns='year')
print(idf)
idf.plot.box(ax=axes[n])
axes[n].set_title('ID {}'.format(i))
axes[n].set_xticklabels([e[1] for e in idf.columns], rotation=45)
axes[n].set_ylim(0, 1) # You should disable this line to specify outlier properly. (but I didn't to show you a normal graph)
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('electronics_157_3cols.csv')
print(df)
fig, axes = plt.subplots(2, 5, sharey=True)
gen_n = (n for n in range(1, 11))
gen_i = (i for i in df['item_id_copy'].unique())
for r in range(2):
for c in range(5):
n = gen_n.__next__()
i = gen_i.__next__()
idf = df[df['item_id_copy'] == int('{}'.format(i))][['year', 'sales_quantity']].pivot(columns='year')
print(idf)
idf.plot.box(ax=axes[r][c])
axes[r][c].set_title('ID {}'.format(i))
axes[r][c].set_xticklabels([e[1] for e in idf.columns], rotation=0)
axes[r][c].set_ylim(0, 1)
plt.show()
I will leave this simple version for others...
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_table('sample.txt', delimiter='\s+')
fig, axes = plt.subplots(1, 3, sharey=True)
for n, i in enumerate(df['item_id'].unique()):
idf = df[df['item_id'] == int('{}'.format(i))][['year', 'sales_quantity']].pivot(columns='year')
print(idf)
idf.plot.box(ax=axes[n])
axes[n].set_title('Item ID {}'.format(i))
axes[n].set_xticklabels([e[1] for e in idf.columns])
plt.show()
sample.txt
year item_id sales_quantity
2014 1 10
2014 1 4
2015 1 7
2015 1 10
2014 2 1
2014 2 8
2015 2 17
2015 2 30
2014 3 9
2014 3 18