Draw plot for a grouped data frame

Draw plot for a grouped data frame - python

I have an issue: I need to draw a plot for data that contains factory_name and a date in the format YYYY. The plot should show, for each factory_name and each year, the sum of the parts that were sold/bought and the mean value.
I tried to make it like that:
pd.pivot_table(df.reset_index(),
index='Year', columns='factory_name', values='Demand').plot()
This works; however, it does not show the mean Demand for each factory_name. I can compute the mean, but only as a data frame, and I do not know how to add these results to my plot.
df.groupby(['factory_name','Year']).agg(['sum','mean'])
Here is the code to create data frame:
df = pd.DataFrame({'factory_name' : ['A','B','A','B','A','B','B','A','B','A','A','A'],
'Year': [2001,2002,2003,2001,2002,2003,2002,2003,2003,2003,2003,2003],
'Demand': [100,200,-20,40,30,50,100,200,50,-100,40,50]})
Thanks for help!

# One colour per factory; the same colour is reused for that factory's
# mean line and sum line.
colors = ["brown", "darkgreen"]
plt.figure(figsize=(12, 8))

# (aggregation, line width, opacity) for each family of curves. All "mean"
# curves are drawn before any "sum" curve, which fixes the legend order.
curve_styles = (("mean", 2, .5), ("sum", 4, .25))
for stat, width, opacity in curve_styles:
    for factory, color in zip(df.factory_name.unique(), colors):
        # Yearly aggregate of Demand for this single factory.
        yearly = df.loc[df.factory_name == factory].groupby("Year").Demand.agg(stat)
        plt.plot(
            yearly.index,
            yearly.values,
            color=color,
            linewidth=width,
            alpha=opacity,
            label=f"{factory} {stat}",
        )

plt.ylim(0, 500)
# One tick per year that actually occurs in the data.
plt.xticks(df.Year.unique())
plt.xlabel("year")
plt.legend()
plt.show()
EDIT:
I edited the code in order to expand the figure size and add the legend.
OUTPUT:

Related

Create a stacked bar plot of percentages and annotate with count

I have this data (df) and I get their percentages (data=rel) and plotted a stacked bar graph.
Now I want to add values (non percentage values) to the centers of each bar but from my first dataframe.
My code for now:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from csv import reader
import seaborn as sns
df = pd.DataFrame({'IL':['Balıkesir', 'Bursa', 'Çanakkale', 'Edirne', 'İstanbul', 'Kırklareli', 'Kocaeli', 'Sakarya','Tekirdağ','Yalova'],'ENGELLIUYGUN':[7,13,3,1,142,1,14,1,2,2],'ENGELLIUYGUNDEGIL':[1,5,0,0,55,0,3,0,1,0]})
iller=df.iloc[:,[0]]
df_total = df["ENGELLIUYGUN"] + df["ENGELLIUYGUNDEGIL"]
df_rel = df[df.columns[1:]].div(df_total, 0)*100
rel=[]
rel=pd.DataFrame(df_rel)
rel['İller'] = iller
d=df.iloc[:,[1]] #I want to add these values to the center of blue bars.
f=df.iloc[:,[2]] #I want to add these values to the center of green bars.
sns.set_theme (style='whitegrid')
ax=rel.plot(x='İller',kind='bar', stacked=True, color=["#3a88e2","#5c9e1e"], label=("Uygun","Uygun Değil"))
plt.legend(["Evet","Hayır"],fontsize=8, bbox_to_anchor=(1, 0.5))
plt.xlabel('...........',fontsize=12)
plt.ylabel('..........',fontsize=12)
plt.title('.............',loc='center',fontsize=14)
plt.ylim(0,100)
ax.yaxis.grid(color='gray', linestyle='dashed')
plt.show()
I have this for now:
I want the exact same style of this photo:
I am using Anaconda-Jupyter Notebook.

Answering: I want to add values (non percentage values) to the centers of each bar but from my first dataframe.
The correct way to annotate bars is with .bar_label, as explained in this answer.
The values from df can be sent to the labels= parameter instead of the percentages.
This answer shows how to succinctly calculate the percentages, but plots the counts and annotates with percentage and value, whereas this OP wants to plot the percentage on the y-axis and annotate with counts.
This answer shows how to place the legend at the bottom of the plot.
This answer shows how to format the axis tick labels as percent.
See pandas.DataFrame.plot for an explanation of the available parameters.
I am using Anaconda-Jupyter Notebook. Everything from the comment, # plot percent; ..., should be in the same notebook cell.
Tested in python 3.11, pandas 1.5.2, matplotlib 3.6.2
import pandas as pd
import matplotlib.ticker as tkr
# Sample data: one row per province (IL) with the two raw counts.
df = pd.DataFrame({
    'IL': ['Balıkesir', 'Bursa', 'Çanakkale', 'Edirne', 'İstanbul', 'Kırklareli', 'Kocaeli', 'Sakarya', 'Tekirdağ', 'Yalova'],
    'ENGELLIUYGUN': [7, 13, 3, 1, 142, 1, 14, 1, 2, 2],
    'ENGELLIUYGUNDEGIL': [1, 5, 0, 0, 55, 0, 3, 0, 1, 0],
})

# Use IL as the index so only the two count columns remain as data.
df = df.set_index('IL')

# Each count as a percentage of its row total.
row_totals = df.sum(axis=1)
per = df.div(row_totals, axis=0) * 100

# Plot the percentages as stacked bars; rot= controls the xtick label rotation.
ax = per.plot(
    kind='bar',
    stacked=True,
    figsize=(10, 8),
    rot=0,
    color=['#3a88e2', '#5c9e1e'],
    yticks=range(0, 101, 10),
    title='my title',
    ylabel='',
    xlabel='',
)

# Put the legend below the plot, in one row, without a frame.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=2, frameon=False)

# Show the y-axis tick labels as percentages.
ax.yaxis.set_major_formatter(tkr.PercentFormatter())

# Annotate each bar segment with the raw count taken from df, not the
# plotted percentage.
for container in ax.containers:
    # The container label is the column / legend name of this segment.
    column = container.get_label()
    # An empty string is used where the count is not positive, so segments
    # with no bar section get no annotation.
    # df[column].replace(0, '') is an equivalent way to build these labels.
    texts = [count if count > 0 else '' for count in df[column]]
    ax.bar_label(container, labels=texts, label_type='center', fontweight='bold')
Alternate Annotation Implementation
Since the column names in df and per are the same, they can be extracted directly from per.
# iterate through the containers and per column names
for c, col in zip(ax.containers, per):
# add the annotations with custom labels from df
ax.bar_label(c, labels=df[col].replace(0, ''), label_type='center', fontweight='bold')

I don't think any subtle method exists, so you have to print those yourself by explicitly adding text, which is not that hard to do. For example, if you add this just after your plot
# Walk the bars left to right; the bar's position on the x-axis is its row number.
bar_rows = zip(d.iloc[:, 0], f.iloc[:, 0], df_rel.iloc[:, 0])
for x_pos, (lower_count, upper_count, lower_pct) in enumerate(bar_rows):
    # Count for the lower segment, written at half of that segment's height.
    ax.text(x_pos, lower_pct/2, lower_count, ha='center', fontweight='bold', color='#ffff00', fontsize='small')
    # Count for the upper segment; with bars stacked to 100, its middle is at
    # lower_pct + (100 - lower_pct)/2 == 50 + lower_pct/2.
    ax.text(x_pos, 50+lower_pct/2, upper_count, ha='center', fontweight='bold', color='#400040', fontsize='small')
you get this result
You can of course change the color, size, position, etc. (I am well known for my total lack of bon goût in those matters). But you can also decide on some arbitrary rule, such as not printing '0' (that's the advantage of doing things explicitly: your code, your rule; you don't have to fight an existing API to convince it to do it your way).

Matplotlib: How to plot errorbar plots based on a color map of third category column(Not X and Y)

I'm working on an experimentation personal project. I have the following dataframes:
treat_repr = pd.DataFrame({'kpi': ['cpsink', 'hpu', 'mpu', 'revenue', 'wallet']
,'diff_pct': [0.655280, 0.127299, 0.229958, 0.613308, -0.718421]
,'me_pct': [1.206313, 0.182875, 0.170821, 1.336590, 2.229763]
,'p': [0.287025, 0.172464, 0.008328, 0.368466, 0.527718]
,'significance': ['insignificant', 'insignificant', 'significant', 'insignificant', 'insignificant']})
pre_treat_repr = pd.DataFrame({'kpi': ['cpsink', 'hpu', 'mpu', 'revenue', 'wallet']
,'diff_pct': [0.137174, 0.111005, 0.169490, -0.152929, -0.450667]
,'me_pct': [1.419080, 0.207081, 0.202014, 1.494588, 1.901672]
,'p': [0.849734, 0.293427, 0.100091, 0.841053, 0.642303]
,'significance': ['insignificant', 'insignificant', 'insignificant', 'insignificant', 'insignificant']})
I have used the below code to construct errorbar plot, which works fine:
def confint_plot(df):
    """Show ``diff_pct`` with a horizontal ``me_pct`` error bar for each KPI.

    ``df`` must provide the columns ``kpi``, ``diff_pct``, ``me_pct`` and
    ``significance``; the latter is matched against the strings
    ``'significant'`` and ``'insignificant'``.

    Each significance group is drawn with its own ``plt.errorbar`` call, using
    the KPI names directly as y values, and the significant group is drawn
    first. The legend entries are hard-coded in that same order.
    """
    plt.style.use('fivethirtyeight')
    fig, ax = plt.subplots(figsize=(18, 10))

    # Draw order matters: it has to match the hard-coded legend below.
    group_colors = (('significant', '#d62828'), ('insignificant', '#2a9d8f'))
    for group, color in group_colors:
        rows = df[df['significance'] == group]
        plt.errorbar(rows["diff_pct"], rows["kpi"], xerr=rows["me_pct"],
                     color=color, fmt='o', capsize=10)

    plt.legend(['significant', 'insignificant'], loc='best')
    # Dashed reference line at zero difference, spanning the full axes height.
    ax.axvline(0, c='red', alpha=0.5, linewidth=3.0,
               linestyle='--', ymin=0.0, ymax=1)
    plt.title("Confidence Intervals of Continous Metrics", size=14, weight='bold')
    plt.xlabel("% Difference of Control over Treatment", size=12)
    plt.show()
for which the output of confint_plot(treat_repr) looks like this:
Now if I run the same plot function on a pre-treatment dataframe confint_plot(pre_treat_repr), the plot looks like this:
We can observe from both plots that the order of the variables changes from the 1st plot to the 2nd, depending on whether a kpi is significant (that's what I figured out after exhausting many attempts).
Questions:
How do I make a change to the code to dynamically allocate color maps without changing the order of the kpis on y axis?
Currently I have manually typed in the legends. Is there a way to dynamically populate legends?
Appreciate the help!

Because you plot the significant KPIs first, they will always appear on the bottom of the chart. How you solve this and keep the desired colors depends on the kind of charts you are making with matplotlib. With scatter charts, you can specify a color array in c parameter. Error bar charts do not offer that functionality.
One way to work around that is to sort your KPIs, give them numeric position (0, 1, 2, 3 , ...), plot them twice (once for significants, once for insignificants) and re-tick them:
def confint_plot(df):
    """Show ``diff_pct`` with a horizontal ``me_pct`` error bar for each KPI.

    The KPIs are sorted alphabetically and given numeric y positions
    (0, 1, 2, ...). Each significance group is then plotted at those numeric
    positions with its own colour and legend label, and the y ticks are
    finally relabelled with the KPI names. The vertical order therefore does
    not depend on which KPIs are significant.

    ``df`` must provide the columns ``kpi``, ``diff_pct``, ``me_pct`` and
    ``significance``.
    """
    plt.style.use('fivethirtyeight')
    fig, ax = plt.subplots(figsize=(18, 10))

    # Alphabetical KPI order; change the sort to get any other fixed order.
    ordered = df.sort_values('kpi')
    ordered = ordered.assign(y=range(len(df)))

    # Significance level -> marker colour, iterated in plotting/legend order.
    palette = {'significant': '#d62828', 'insignificant': '#2a9d8f'}
    for level, shade in palette.items():
        subset = ordered[ordered['significance'] == level]
        # Plot at the numeric positions; the names are put back afterwards.
        plt.errorbar(
            subset['diff_pct'], subset['y'],
            xerr=subset['me_pct'], label=level,
            fmt='o', capsize=10, c=shade
        )

    plt.legend(loc='best')
    # Dashed reference line at zero difference, spanning the full axes height.
    ax.axvline(0, c='red', alpha=0.5, linewidth=3.0,
               linestyle='--', ymin=0.0, ymax=1)
    # Replace the numeric y ticks with the KPI names.
    plt.yticks(ordered['y'], ordered['kpi'])
    plt.title("Confidence Intervals of Continous Metrics", size=14, weight='bold')
    plt.xlabel("% Difference of Control over Treatment", size=12)
    plt.show()

How to add a mean line to a seaborn stripplot or swarmplot

I have a rather simple strip plot with vertical data.
planets = sns.load_dataset("planets")
sns.stripplot(x="method", y="distance", data=planets, size=4, color=".7")
plt.xticks(rotation=45, ha="right")
plt.show()
I want to plot the mean of each x-element (method) as a small horizontal bar similar to what you get with:
sns.boxplot(
x="method",
y="distance",
data=planets,
whis=[50, 50],
showfliers=False,
showbox=False,
showcaps=False
)
But without the vertical lines (with whis=[50,50] just spots) for the first / third quartile and showing mean instead of median. Maybe there is a more elegant solution not involving a Boxplot.
Thanks in advance.

Boxplot objects are defined in matplotlib.pyplot.boxplot
showmeans=True
meanline=True makes a line instead of a marker
meanprops={'color': 'k', 'ls': '-', 'lw': 2} sets the color, style and width of the line.
See matplotlib.lines.Line2D for other line properties.
medianprops={'visible': False} makes the median line not visible
whiskerprops={'visible': False} makes the whisker line not visible
zorder=10 places the line on the top layer
Tested in matplotlib v3.4.2 and seaborn v0.11.1
import seaborn as sns
import matplotlib.pyplot as plt
# load the dataset
planets = sns.load_dataset("planets")
p = sns.stripplot(x="method", y="distance", data=planets, size=4, color=".7")
plt.xticks(rotation=45, ha="right")
p.set(yscale='log')
# plot the mean line
sns.boxplot(showmeans=True,
meanline=True,
meanprops={'color': 'k', 'ls': '-', 'lw': 2},
medianprops={'visible': False},
whiskerprops={'visible': False},
zorder=10,
x="method",
y="distance",
data=planets,
showfliers=False,
showbox=False,
showcaps=False,
ax=p)
plt.show()
Works similarly with a seaborn.swarmplot

Here's a solution using ax.hlines, finding the means with groupby and drawing one line per group:
import seaborn as sns
import matplotlib.pyplot as plt
# load the dataset
planets = sns.load_dataset("planets")
# Strip plot on the lowest layer so the mean lines are drawn on top of it.
p = sns.stripplot(x="method", y="distance", data=planets, size=4, color=".7", zorder=1)
plt.xticks(rotation=45, ha="right")
# Trailing semicolon only suppresses the echoed return value in a notebook cell.
p.set(yscale='log');
# Mean distance per method. sort=False keeps the groups in order of first
# appearance -- presumably the same order the strip plot uses for its
# categories; verify if the data is reordered beforehand.
df_mean = planets.groupby('method', sort=False)['distance'].mean()
# One short horizontal line per category, centred on the category's integer
# x position. A plain loop is used because the calls are made only for their
# drawing side effect.
for i, y in enumerate(df_mean):
    p.hlines(y, i - .25, i + .25, zorder=2)
Output:

Here's another hack that is similar to the boxplot idea but requires less overriding: draw a pointplot but with a confidence interval of width 0, and activate the errorbar "caps" to get a horizontal line with a parametrizable width:
planets = sns.load_dataset("planets")
spec = dict(x="method", y="distance", data=planets)
sns.stripplot(**spec, size=4, color=".7")
sns.pointplot(**spec, join=False, ci=0, capsize=.7, scale=0)
plt.xticks(rotation=45, ha="right")
One downside that is evident here is that bootstrapping gets skipped for groups with a single observation, so you don't get a mean line there. This may or may not be a problem in an actual application.
Another trick would be to do the groupby yourself and then draw a scatterplot with a very wide vertical line marker:
planets = sns.load_dataset("planets")
variables = dict(x="method", y="distance")
sns.stripplot(data=planets, **variables, size=4, color=".7")
sns.scatterplot(
data=planets.groupby("method")["distance"].mean().reset_index(),
**variables, marker="|", s=2, linewidth=25
)
plt.xticks(rotation=45, ha="right")

How to add legend with labels of IDs to my code

I have the electricity consumption of 25 houses, and I'm doing K-Means clustering on the dataset that holds those houses. After importing the dataset, pre-processing it, and applying K-Means with K=2, I plotted the data, but when I add the legend I get this:
No handles with labels found to put in legend.
There is no error in the code and it runs, but I want my code to automatically generate a legend that holds the ID of each house, from 0 to 24.
Here is the code where I'm plotting the data:
# Plot the rows of data1 in two stacked subplots, one per cluster index,
# together with that cluster's centre from the fitted kmeans object.
plt.figure(figsize=(13,13))
import itertools
# Endless cycle of marker symbols; the next one is taken for every plotted row.
marker = itertools.cycle(('+', 'o', '*' , 'X', 's','8','>','1','<'))
# One subplot per cluster index (0 and 1).
for cluster_index in [0,1]:
plt.subplot(2,1,cluster_index + 1)
for index, row in data1.iterrows():
# The row's last element is compared with the cluster index -- presumably it
# holds the K-Means label assigned to the row; confirm against how data1 is built.
if row.iloc[-1] == cluster_index:
# Plot everything between the first and the last element of the row.
# No label= is passed, so this line contributes no legend entry.
plt.plot(row.iloc[1:-1] ,marker = next(marker) , alpha=1)
# NOTE(review): the lines plotted above carry no labels, so this call has no
# handles to show; this matches the "No handles with labels found" message
# reported with this code.
plt.legend(loc="right")
# Centre of the current cluster, drawn in black.
plt.plot(kmeans.cluster_centers_[cluster_index], color='k' ,marker='o', alpha=1)
ax = plt.gca()
ax.tick_params(axis = 'x', which = 'major', labelsize = 10)
plt.xticks(rotation="vertical")
plt.ylabel('Monthly Mean Consumption 2018-2019', fontsize=10)
plt.title(f'Cluster {cluster_index}', fontsize=15)
plt.tight_layout()
plt.show()
plt.close()
I just want to have a legend in the output figure with the ID of each house. Any help, please?

As I do not have your data, I can not test it in a plot right now, but I assume the problem comes from not passing a label argument to plt.plot i.e.:
for index, row in data1.iterrows():
if row.iloc[-1] == cluster_index:
plt.plot(row.iloc[1:-1] ,marker = next(marker), alpha=1, label=index)
plt.legend(loc="right")

Seaborn (time series) boxplot using hue and different scale axes

I have a dataframe which has a number of values per date (datetime field). These values are classified as U (users) or S (sessions) using a column Group. Seaborn is used to visualize two boxplots per date, where the hue is set to Group.
The problem comes when considering that the values corresponding to U (users) are much bigger than those corresponding to S (session), making the S data illegible. Thus, I need to come up with a solution that allows me to plot both series (U and S) in the same figure in an understandable manner.
I wonder if independent Y axes (with different scales) can be set to each hue, so that both Y axes are shown (as when using twinx but without losing hue visualization capabilities).
Any other alternative would be welcome =)
The S boxplot time series boxplot:
The combined boxplot time series using hue. Obviously it's not possible to see any information about the S group because of the scale of the Y axis:
The columns of the dataframe:
| Day (datetime) | n_data (numeric) | Group (S or U)|
The code line generating the combined boxplot:
seaborn.boxplot(ax=ax,x='Day', y='n_data', hue='Group', data=df,
palette='PRGn', showfliers=False)
Managed to find a solution by using twinx:
# Draw the U and S box plots in the same figure, each group on its own y-axis.
fig, ax = plt.subplots(figsize=(50, 10))

# Copy of the data in which only the U values are kept. The other rows stay
# in place with n_data set to NaN, so both Day and Group keep all their levels.
# The mask is built from the frame being modified.
tmpU = groups.copy()
tmpU.loc[tmpU['Group'] != 'U', 'n_data'] = np.nan
# Same for S, starting from the same source frame.
tmpS = groups.copy()
tmpS.loc[tmpS['Group'] != 'S', 'n_data'] = np.nan

# U values on the left y-axis.
ax = seaborn.boxplot(ax=ax, x='Day', y='n_data', hue='Group', data=tmpU, palette='PRGn', showfliers=False)
# S values on a second y-axis that shares the same x-axis.
ax2 = ax.twinx()
seaborn.boxplot(ax=ax2, x='Day', y='n_data', hue='Group', data=tmpS, palette='PRGn', showfliers=False)

# Build the legend from the first two handles of the first axes.
handles, labels = ax.get_legend_handles_labels()
plt.legend(handles[0:2], labels[0:2], loc=1)

plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
# Hide every second x tick label to reduce clutter.
for label in ax.get_xticklabels()[::2]:
    label.set_visible(False)
plt.show()
plt.close('all')
The code above generates the following figure:
Which in this case turns out to be too dense to be published. Therefore I would adopt a visualization based on subplots, as Parfait suggested in his/her answer.
It wasn't an obvious solution to me so I would like to thank Parfait for his/her answer.

Consider building separate plots on same figure with y-axes ranges tailored to subsetted data. Below demonstrates with random data seeded for reproducibility (for readers of this post).
Data (with U values higher than S values)
import pandas as pd
import numpy as np
import seaborn
import matplotlib.pyplot as plt
np.random.seed(2018)
u_df = pd.DataFrame({'Day': pd.date_range('2016-10-01', periods=10)\
.append(pd.date_range('2016-10-01', periods=10)),
'n_data': np.random.uniform(0,800,20),
'Group': 'U'})
s_df = pd.DataFrame({'Day': pd.date_range('2016-10-01', periods=10)\
.append(pd.date_range('2016-10-01', periods=10)),
'n_data': np.random.uniform(0,200,20),
'Group': 'S'})
df = pd.concat([u_df, s_df], ignore_index=True)
df['Day'] = df['Day'].astype('str')
Plot
# One subplot per Group value, each with its own y-axis range.
fig = plt.figure(figsize=(10, 5))
for i, (group_name, group_df) in enumerate(df.groupby('Group')):
    # Select the subplot BEFORE setting the title: plt.title acts on the
    # current axes, so calling it first would title the previous subplot
    # (or a throw-away axes on the first pass).
    plt.subplot(2, 1, i + 1)
    plt.title('N_data of {}'.format(group_name))
    seaborn.boxplot(x="Day", y="n_data", data=group_df, palette="PRGn", showfliers=False)
plt.tight_layout()
plt.show()
plt.clf()
plt.close('all')
To retain original hue and grouping, render all non-group n_data to np.nan:
# One subplot per Group value, keeping the hue grouping (and so each group's
# colour and position) by blanking out the other group's values.
fig = plt.figure(figsize=(10, 5))
for position, g in enumerate(df.Group.unique(), start=1):
    plt.subplot(2, 1, position)
    # Work on a copy so the original frame keeps every value.
    tmp = df.copy()
    other_groups = tmp['Group'] != g
    tmp.loc[other_groups, 'n_data'] = np.nan
    seaborn.boxplot(x="Day", y="n_data", hue="Group", data=tmp,
                    palette="PRGn", showfliers=False)
plt.tight_layout()
plt.show()
plt.clf()
plt.close('all')

So one option to do a grouped box plot with two separate axes is to use hue_order=['value', np.nan] in your argument for sns.boxplot:
# Grouped box plot with one y-axis per series: every call reserves both hue
# slots through hue_order but fills only one of them with real data.
fig = plt.figure(figsize=(14, 8))

# Styling shared by both box plots.
mean_marker = {"marker": "s", "markerfacecolor": "black", "markeredgecolor": "black"}
box_style = dict(width=0.75, showmeans=True, meanprops=mean_marker,
                 linewidth=0.5, palette=customPalette)

# Left axis: only 'co2' is drawn; the second hue slot is left empty.
ax = sns.boxplot(x="lon_bucketed", y="value", data=m, hue='name',
                 hue_order=['co2', np.nan], **box_style)
# Right axis: only 'g_xco2' is drawn, in the second hue slot.
ax2 = ax.twinx()
ax2 = sns.boxplot(ax=ax2, x="lon_bucketed", y="value", data=m, hue='name',
                  hue_order=[np.nan, 'g_xco2'], **box_style)

# The grid belongs to the first axes, which is named `ax` in this snippet.
ax.grid(alpha=0.5, which='major')
plt.tight_layout()

# Replace the automatic legends with a single hand-built one on the second axes.
ax.legend_.remove()
GW = mpatches.Patch(color='seagreen', label='$CO_2$')
WW = mpatches.Patch(color='mediumaquamarine', label='$XCO_2$')
# prop= already sets the font size; fontsize= is only used when prop is not given.
ax2.legend(handles=[GW, WW], loc='upper right', prop={'size': 14})

ax.set_title("$XCO_2$ vs. $CO_2$", fontsize=18)
ax.set_xlabel('Longitude [\u00b0]', fontsize=14)
ax.set_ylabel('$CO_2$ [ppm]', fontsize=14)
ax2.set_ylabel('$XCO_2$ [ppm]', fontsize=14)
ax.tick_params(labelsize=14)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Draw plot for a grouped data frame - python

Related

Create a stacked bar plot of percentages and annotate with count

Matplotlib: How to plot errorbar plots based on a color map of third category column(Not X and Y)

How to add a mean line to a seaborn stripplot or swarmplot

How to add legend with labels of IDs to my code

Seaborn (time series) boxplot using hue and different scale axes

Categories

Resources