Is there a concise way to plot summary statistics in Python as a boxplot? The code below gives a barchart of each mean, I want to swap each barchart to a boxplot.
I realise that I don't need to summarise, however with the real data, just plotting one of the boxes took a long time (even with showfliers=False); I don't need to see the outliers and I will also want to add a "population-wide" bar (i.e. across all clusters) for each "pc" (any suggestions for that would be greatly appreciated .. I am again attempting to move from R to python and just getting these few lines of code took long enough)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Toy data: two clusters, four "pc" levels, 200 random values.
out = pd.DataFrame({'cluster': ['a'] * 100 + ['b'] * 100,
                    'pc': ['w', 'x', 'y', 'z'] * 50,
                    'value': np.random.normal(size=200)})

# One row of describe() statistics per (cluster, pc) pair.
grouped = out.groupby(['cluster', 'pc'])
out = grouped.describe()
out = out.reset_index()
# describe() produces two-level column labels; keep the statistic name for
# the 'value' columns and the top-level name for everything else.
out.columns = [e[0] if e[0] != 'value' else e[1] for e in out.columns.tolist()]

#sns.catplot(x='cluster', y='mean', col='pc', kind='bar', data=out)
g = sns.FacetGrid(out, col="pc", col_wrap=2)
g = g.map(plt.bar, "cluster", "mean")
You can draw a boxplot from the summary statistics using Axes.bxp(). This needs to be encapsulated in a custom plotting function passed to map():
def my_bxp(**kwargs):
    """Draw one box per row of pre-computed summary statistics.

    Written for ``FacetGrid.map_dataframe``: it expects the keyword
    arguments ``data`` (the rows of the current facet) and ``color``.
    Each row must provide the columns ``'25%'``, ``'50%'``, ``'75%'``,
    ``'min'``, ``'max'`` and ``'cluster'``.
    """
    ax = plt.gca()
    data = kwargs.pop('data')
    color = kwargs.pop('color')
    # Translate each summary row into the stats dict that Axes.bxp expects.
    # The whiskers span min..max rather than the usual 1.5 * IQR rule.
    bxpstats = [
        {'med': row.loc['50%'],
         'q1': row.loc['25%'],
         'q3': row.loc['75%'],
         'whislo': row.loc['min'],
         'whishi': row.loc['max'],
         'label': row.loc['cluster']}
        for _, row in data.iterrows()
    ]
    ax.bxp(bxpstats, showfliers=False, boxprops=dict(color=color),
           whiskerprops=dict(color=color),
           capprops=dict(color=color))


g = sns.FacetGrid(out, col="pc", col_wrap=2)
g = g.map_dataframe(my_bxp)
Note that, for simplicity's sake, I have the whiskers extend from min to max, which is not the usual representation. You may have to calculate proper whiskers extents when you calculate your summary statistics if that's what you want.
It's easier to draw multiple boxplots with the original data intact.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

df = pd.DataFrame({'cluster': ['a'] * 100 + ['b'] * 100,
                   'pc': ['w', 'x', 'y', 'z'] * 50,
                   'value': np.random.normal(size=200)})

# Reshape to one column per pc level: 25 rows of cluster 'a' followed by
# 25 rows of cluster 'b'.
pcs = ['w', 'x', 'y', 'z']
c = ['a'] * 25 + ['b'] * 25
df1 = pd.concat(
    [pd.Series(c)]
    + [df[df['pc'] == pc]['value'].reset_index(drop=True) for pc in pcs],
    axis=1, ignore_index=True)
df1.columns = ['cluster'] + pcs

# One subplot per pc level, each comparing the two clusters.
fig, axes = plt.subplots(1, len(pcs), figsize=(8, 4))
fig.subplots_adjust(wspace=0.3, hspace=0.4)
for pc, target_ax in zip(pcs, axes):
    ax = sns.boxplot(x="cluster", y=pc, data=df1, orient='v', ax=target_ax)
Related
I want to make a plot in seaborn but I am having some difficulties. The data has 2 variable: time (2 levels) and state (2 levels). I want to plot time on the x axis and state as different subplots, showing individual data lines. Finally, to the right of these I want to show a difference plot of the difference between time 2 and time 1, for each of the levels of state. I cannot do it very well, because I cannot get the second plot to show onto the right. Here has been my try:
import numpy as np
import pandas as pd
import seaborn as sns
# Synthetic long-format data: five identities, each observed in states
# 'A' and 'B' at times 1 and 2.
ids = [identity for identity in range(1, 6) for _ in range(4)]
times = [1, 1, 2, 2] * 5
states = ['A', 'B', 'A', 'B'] * 5
np.random.seed(121)
resps = [(i*t) + np.random.normal() for i, t in zip(ids, times)]
DATA = {'identity': ids, 'time': times, 'state': states, 'resps': resps}
df = pd.DataFrame(DATA)

# One facet per state, one thin line per identity.
g = sns.relplot(
    data=df, kind='line',
    x='time', y='resps', col='state', units='identity',
    estimator=None, alpha=.5, height=5, aspect=.7)
# Overlay a thick aggregate line on every facet.
g.map(sns.lineplot, "time", "resps", lw=5, ci=None)

# Wide layout: one row per (identity, state), one 'resps' column per time.
wide = df.set_index(['identity', 'state', 'time']).unstack().reset_index()
A = wide['state'].eq('A')
B = wide['state'].eq('B')
rows_a = wide[A]
rows_b = wide[B]
# Each helper column is only filled on the rows of its own state.
wide['diffA'] = rows_a[('resps', 2)] - rows_a[('resps', 1)]
wide['diffB'] = rows_b[('resps', 2)] - rows_b[('resps', 1)]
# Summing across the two helper columns collapses them into one column.
wide['difference'] = wide[['diffA', 'diffB']].sum(axis=1)
wide = wide.drop(columns=[('diffA', ''), ('diffB', '')])

sns.pointplot(x='state', y='difference', data=wide, join=False)
Output from the first
And output from the second:
Is there no way to put them together? Even though they are different data? I did try to use matplotlib. And then achieved slightly better results but this still had a problem because I wanted the two left plots to have a shared y axis but not the difference. This created lots of work as well, because I want to be flexible for different numbers of the state variable, but only kept to 2 for simplicity. Here is a paint version of what I want to do (sorry for the poor quality), hopefully with some more control over appearance but this is secondary:
Is there a reliable way to do this in a simpler way? Thanks!
The problem is that sns.relplot operates at a figure level. This means it creates its own figure object and we cannot control the axes it uses. If you want to leverage seaborn for the creation of the lines without using "pure" matplotlib, you can copy the lines on matplotlib axes:
import numpy as np
import pandas as pd
import seaborn as sns
# Just making some fake data
ids = [1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5]
times = [1,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2]
states = ['A', 'B', 'A', 'B'] * 5
np.random.seed(121)
resps = [(i*t) + np.random.normal() for i, t in zip(ids, times)]
DATA = {
    'identity': ids,
    'time': times,
    'state': states,
    'resps': resps
}
df = pd.DataFrame(DATA)
# Done with data

# relplot is figure-level: it creates its own figure, which is only used
# here as a source of already-computed lines.
g = sns.relplot(
    data=df, kind='line',
    col='state', x='time', y='resps', units='identity',
    estimator=None, alpha=.5, height=5, aspect=.7)
# Draw a thick aggregate line onto each Axes
g.map(sns.lineplot, "time", "resps", lw=5, ci=None)

# Wide layout: one row per (identity, state), one 'resps' column per time.
wide = df.set_index(['identity', 'state', 'time']).unstack().reset_index()
# Subtract the time columns directly. This works for any number of states
# and leaves NaN (rather than 0.0) where a time point is missing.
wide['difference'] = wide[('resps', 2)] - wide[('resps', 1)]

# New code ----------------------------------------
import matplotlib.pyplot as plt

plt.close(g.figure)

fig = plt.figure(figsize=(12, 4))
ax1 = fig.add_subplot(1, 3, 1)
ax2 = fig.add_subplot(1, 3, 2, sharey=ax1)  # the two state panels share y
ax3 = fig.add_subplot(1, 3, 3)              # the difference panel does not

# Copy every line seaborn drew onto the matching matplotlib Axes.
for ax, g_ax in zip([ax1, ax2], g.axes[0]):
    for line in g_ax.get_lines():
        xs, ys = line.get_data()
        ax.plot(xs, ys, color=line.get_color(), lw=line.get_linewidth())
    ax.set_title(g_ax.get_title())

sns.pointplot(ax=ax3, x='state', y='difference', data=wide, join=False)
# End of new code ----------------------------------
plt.show()
Result:
This is link to the data I'm using:
https://github.com/fivethirtyeight/data/tree/master/drug-use-by-age
I'm using Jupyter Lab, and here's the code:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sb
url = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/drug-use-by-age/drug-use-by-age.csv'
df = pd.read_csv(url, index_col=0)
df.dtypes

# '-' entries become NaN so that every column can be cast to float.
df = df.replace('-', np.nan).astype(float)
df = df.loc[:, df.columns != 'n']
#df.columns = df.columns.str.rstrip('-use')
df

fig, axes = plt.subplots(1, 2, figsize=(20, 8))
fig.subplots_adjust(wspace=0.1)
# No explicit fig.colorbar(...) call here: the original one referenced an
# undefined `ax` before anything was drawn, and each heatmap below draws
# its own colorbar, configured through cbar_kws.
#plt.figure(figsize=(16, 16))

# Even-positioned columns go to the left heatmap, odd-positioned ones to
# the right.
df_percentage = df.iloc[:, 0::2]
plot_precentage = sb.heatmap(df_percentage, cmap='Reds', ax=axes[0],
                             cbar_kws={'format': '%.0f%%', 'label': '% used in past 12 months'})

df_frequency = df.iloc[:, 1::2]
plot_frequency = sb.heatmap(df_frequency, cmap='Blues', ax=axes[1],
                            cbar_kws=dict(label='median frequency a user used'))
I can just show two of them in a subplot in separate diagrams.
I want to make it look like this (this is made in paint):
Also show the data side by side. Is there a simple way to achieve that?
A pretty simple solution with mask option:
# Column-parity mask with the same shape as df: 0 in even-positioned
# columns, 1 in odd-positioned ones. heatmap hides cells whose mask is truthy.
column_parity = np.arange(df.shape[1]) % 2
mask = np.vstack([column_parity for _ in range(df.shape[0])])

fig, axes = plt.subplots()

# First pass draws the even-positioned columns in red.
percentage_cbar = {'format': '%.0f%%', 'label': '% used in past 12 months'}
plot_precentage = sns.heatmap(df, mask=mask, cmap='Reds', ax=axes,
                              cbar_kws=percentage_cbar)

# Second pass draws the odd-positioned columns in blue on the same Axes.
frequency_cbar = {'label': 'median frequency a user used'}
plot_frequency = sns.heatmap(df, mask=1 - mask, cmap='Blues', ax=axes,
                             cbar_kws=frequency_cbar)
Output:
I want seaborn heatmap to display multiple values in each cell of the heatmap. Here is a manual example of what I want to see, just to be clear:
# 3x2 grid of cell values and a matching grid of text labels; a label may
# span several lines via '\n'.
data = np.array([
    [0.000000, 0.000000],
    [-0.231049, 0.000000],
    [-0.231049, 0.000000],
])
labels = np.array([
    ['A\nExtra Stuff', 'B'],
    ['C', 'D'],
    ['E', 'F'],
])
fig, ax = plt.subplots()
# fmt='' makes heatmap print the label strings verbatim.
ax = sns.heatmap(data, annot=labels, fmt='')
Here is an example that gets seaborn.heatmap to display the flightsRoundUp values in the cells.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
def RoundUp(x):
    """Return *x* rounded up to the nearest multiple of 10, as an ``int``."""
    # Negated floor division is ceiling division, so no NumPy is needed
    # (the surrounding snippet never imports it).
    return int(-(-x // 10) * 10)
# Load the example flights dataset (long-form) and pivot it to a
# month x year table of passenger counts.
flights_long = sns.load_dataset("flights")
# pivot() only accepts keyword arguments in pandas >= 2.0.
flights = flights_long.pivot(index="month", columns="year", values="passengers")
# Element-wise rounding, column by column; avoids the deprecated
# DataFrame.applymap.
flightsRoundUp = flights.apply(lambda column: column.map(RoundUp))
# Colour the cells by the raw values, annotate them with the rounded ones.
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(flights, annot=flightsRoundUp, fmt="", linewidths=.5, ax=ax)
What is the best way to display both flightsRoundUp and flights in all cells? Something like the first manual example above, but for all the cells in a vectorized-like way...
Rotail's answer didn't work for me, I got an error when applying that lambda function.
However, I found a solution that exploits the fact that seaborn draws successive plots on top of each other. All you have to do is use one call to heatmap to establish the figure, and then one subsequent call for each set of annotations. Use the annot_kws argument to make sure the texts aren't written over each other.
X = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
Y = pd.DataFrame({'A': list('ABC'), 'B': list('EFG')})
Z = pd.DataFrame({column: ['(Extra Stuff)'] * 3 for column in ('A', 'B')})

# Base layer: coloured cells and the colorbar, no text.
sns.heatmap(X, annot=False)
# Text layers: Y is anchored by its bottom edge and Z by its top edge, so
# the two annotations do not overlap.
for labels, valign in ((Y, 'bottom'), (Z, 'top')):
    sns.heatmap(X, annot=labels, annot_kws={'va': valign}, fmt="", cbar=False)
Following works for me too:
X = pd.DataFrame({'a':[1, 2, np.nan], 'b':[10, 20, 30]})
Y = pd.DataFrame({'A':[11, 222, np.nan], 'B':[110, np.nan, 330]})

# Object arrays keep each column's own scalar type, so integer columns
# print as '10' rather than '10.0' and NaN prints as 'nan'.
x_cells = X.astype(object).to_numpy()
y_cells = Y.astype(object).to_numpy()

# Build "x\n(y)" for every cell with plain string formatting. Unlike a
# fixed-width '|S5' bytes round trip, this never truncates long values.
my_annotation = pd.DataFrame(
    [[f"{x}\n({y})" for x, y in zip(x_row, y_row)]
     for x_row, y_row in zip(x_cells, y_cells)],
    index=X.index,
    columns=X.columns,
)
my_annotation
You should be able to set fmt="" and format your labels with appropriate "\n" characters to get multiple lines of annotation.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
np.random.seed(0)
sns.set_theme()
uniform_data = np.random.rand(4, 4)
fig, ax = plt.subplots(figsize=(50, 20))

# One two-line label per cell, laid out in the same shape as the data.
uniform_data_labels = [['Example\nExample' for _ in row] for row in uniform_data]

# fmt="" makes heatmap print the label strings verbatim.
sns.heatmap(uniform_data, vmin=0, vmax=1, annot=uniform_data_labels,
            ax=ax, fmt="", annot_kws={"fontsize": 30})
plt.show()
1
Updated question and code!
Probably, the tips dataset is not the best example to use, however my issue is reproduced in it, i.e. we see that both point and bar plots share the same Y
I need to combine line and bar plots on one chart. To do this I used seaborn and the following code:
tips = sns.load_dataset('tips')
# `height` is the current name of FacetGrid's former `size` parameter.
g = sns.FacetGrid(tips, hue='sex', col='sex', height=4, aspect=2.1, sharey=False, sharex=False)
# Both layers are drawn on the same Axes of each facet, so they share one
# y-axis.
g = g.map(sns.pointplot, 'day', 'tip', ci=0)
g = g.map(sns.barplot, 'day', 'total_bill', ci=0)
g.set_xticklabels(rotation=45, fontsize=9)
plt.show()
Here is the result:
Everything is okay except the fact that one Y axis is used for both bars and lines on each facetgrid object. I am new to seaborn and currently cannot find a solution. Tried to add "sharey=False" to this line of code
> `g.map(sns.pointplot, 'date', 'worthusdcount')`
however it didn't help.
Any solutions on how to add second Y axis would be appreciated
Here's an example where you apply a custom mapping function to the dataframe of interest. Within the function, you can call plt.gca() to get the current axis at the facet being currently plotted in FacetGrid. Once you have the axis, twinx() can be called just like you would in plain old matplotlib plotting.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
def facetgrid_two_axes(*args, **kwargs):
    """Plot ``y1`` as a line and, optionally, ``y2`` as bars on a twin y-axis.

    Written for ``FacetGrid.map_dataframe``. Keyword arguments consumed here:

    data      -- rows of the current facet; needs columns 'x', 'y1', 'y2'
    dual_axis -- when truthy, draw 'y2' as bars on a second y-axis
    alpha     -- transparency of both layers (default 0.2)
    color     -- discarded; the two layers use fixed colours

    Any remaining keyword arguments are forwarded to both plotting calls.
    """
    data = kwargs.pop('data')
    dual_axis = kwargs.pop('dual_axis')
    alpha = kwargs.pop('alpha', 0.2)
    # Drop the colour passed in so it does not clash with the explicit ones.
    kwargs.pop('color', None)
    ax = plt.gca()
    ax.plot(data['x'], data['y1'], **kwargs, color='red', alpha=alpha)
    if dual_axis:
        ax2 = ax.twinx()
        ax2.set_ylabel('Second Axis!')
        # Use this facet's rows, not a module-level frame, so each panel
        # shows its own bars.
        ax2.bar(data['x'], data['y2'], **kwargs, color='blue', alpha=alpha)
# Small demo frame: y1 and y2 differ by orders of magnitude.
df = pd.DataFrame()
df['x'] = np.arange(1, 5, 1)
df['y1'] = 1 / df['x']
df['y2'] = df['x'] * 100
df['facet'] = 'foo'
# Duplicate the rows under a second facet label to get two panels.
df2 = df.copy()
df2['facet'] = 'bar'
df3 = pd.concat([df, df2])

# `height` is the current name of FacetGrid's former `size` parameter.
win_plot = sns.FacetGrid(df3, col='facet', height=6)
(win_plot.map_dataframe(facetgrid_two_axes, dual_axis=True)
 .set_axis_labels("X", "First Y-axis"))
plt.show()
This isn't the prettiest plot as you might want to adjust the presence of the second y-axis' label, the spacing between plots, etc. but the code suffices to show how to plot two series of differing magnitudes within FacetGrids.
I have a dataframe which has a number of values per date (datetime field). These values are classified as U (users) or S (sessions) by a column named Group. Seaborn is used to draw two boxplots per date, with the hue set to Group.
The problem comes when considering that the values corresponding to U (users) are much bigger than those corresponding to S (session), making the S data illegible. Thus, I need to come up with a solution that allows me to plot both series (U and S) in the same figure in an understandable manner.
I wonder if independent Y axes (with different scales) can be set to each hue, so that both Y axes are shown (as when using twinx but without losing hue visualization capabilities).
Any other alternative would be welcome =)
The S boxplot time series boxplot:
The combined boxplot time series using hue. Obviously it's not possible to see any information about the S group because of the scale of the Y axis:
The columns of the dataframe:
| Day (datetime) | n_data (numeric) | Group (S or U)|
The code line generating the combined boxplot:
# One box per (Day, Group) pair; outliers are not drawn.
seaborn.boxplot(
    data=df,
    x='Day',
    y='n_data',
    hue='Group',
    palette='PRGn',
    showfliers=False,
    ax=ax,
)
Managed to find a solution by using twinx:
fig, ax = plt.subplots(figsize=(50, 10))

# Blank out the other group's values instead of dropping its rows, so both
# hue levels keep their slot and the boxes stay side by side.
tmpU = groups.copy()
tmpU.loc[tmpU['Group'] != 'U', 'n_data'] = np.nan
tmpS = groups.copy()
tmpS.loc[tmpS['Group'] != 'S', 'n_data'] = np.nan

# U on the left y-axis, S on a twin y-axis with its own scale.
ax = seaborn.boxplot(ax=ax, x='Day', y='n_data', hue='Group', data=tmpU, palette='PRGn', showfliers=False)
ax2 = ax.twinx()
seaborn.boxplot(ax=ax2, x='Day', y='n_data', hue='Group', data=tmpS, palette='PRGn', showfliers=False)

# Build the legend from the first Axes' handles only.
handles, labels = ax.get_legend_handles_labels()
plt.legend(handles[0:2], labels[0:2], loc=1)
plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
# Hide every other tick label to reduce clutter.
for label in ax.get_xticklabels()[::2]:
    label.set_visible(False)
plt.show()
plt.close('all')
The code above generates the following figure:
In this case the figure turns out to be too dense to be published. Therefore I would adopt a visualization based on subplots, as Parfait suggested in his/her answer.
It wasn't an obvious solution to me so I would like to thank Parfait for his/her answer.
Consider building separate plots on same figure with y-axes ranges tailored to subsetted data. Below demonstrates with random data seeded for reproducibility (for readers of this post).
Data (with U values higher than S values)
import pandas as pd
import numpy as np
import seaborn
import matplotlib.pyplot as plt
np.random.seed(2018)

# Ten consecutive days, listed twice, give every day two observations.
days = pd.date_range('2016-10-01', periods=10)
doubled_days = days.append(days)

# U values are drawn from a wider range (0-800) than S values (0-200).
u_df = pd.DataFrame({'Day': doubled_days,
                     'n_data': np.random.uniform(0, 800, 20),
                     'Group': 'U'})
s_df = pd.DataFrame({'Day': doubled_days,
                     'n_data': np.random.uniform(0, 200, 20),
                     'Group': 'S'})

df = pd.concat([u_df, s_df], ignore_index=True)
# Convert the dates to strings.
df['Day'] = df['Day'].astype('str')
Plot
fig = plt.figure(figsize=(10, 5))
n_groups = df['Group'].nunique()
for i, (group, group_df) in enumerate(df.groupby('Group'), start=1):
    # Select the subplot first: plt.title applies to the current Axes, so
    # titling before plt.subplot would label the wrong panel.
    plt.subplot(n_groups, 1, i)
    plt.title('N_data of {}'.format(group))
    seaborn.boxplot(x="Day", y="n_data", data=group_df, palette="PRGn", showfliers=False)
plt.tight_layout()
plt.show()
plt.clf()
plt.close('all')
To retain original hue and grouping, render all non-group n_data to np.nan:
fig = plt.figure(figsize=(10,5))
for position, g in enumerate(df.Group.unique(), start=1):
    plt.subplot(2, 1, position)
    # Keep every row but blank n_data outside group g, so hue still sees
    # both levels and each group keeps its colour and slot.
    tmp = df.copy()
    outside_group = tmp['Group'] != g
    tmp.loc[outside_group, 'n_data'] = np.nan
    seaborn.boxplot(data=tmp, x="Day", y="n_data", hue="Group",
                    palette="PRGn", showfliers=False)
plt.tight_layout()
plt.show()
plt.clf()
plt.close('all')
One option for drawing a grouped box plot with two separate y-axes is to pass hue_order=['value', np.nan] (with np.nan standing in for the other hue level) to sns.boxplot:
fig = plt.figure(figsize=(14,8))
# First pass: only 'co2' is drawn; np.nan holds the slot of the other level.
ax = sns.boxplot(x="lon_bucketed", y="value", data=m, hue='name', hue_order=['co2',np.nan],
                 width=0.75,showmeans=True,meanprops={"marker":"s","markerfacecolor":"black", "markeredgecolor":"black"},linewidth=0.5 ,palette = customPalette)
# Second pass on a twin y-axis: only 'g_xco2' is drawn, in the other slot.
ax2 = ax.twinx()
ax2 = sns.boxplot(ax=ax2,x="lon_bucketed", y="value", data=m, hue='name', hue_order=[np.nan,'g_xco2'],
                  width=0.75,showmeans=True,meanprops={"marker":"s","markerfacecolor":"black", "markeredgecolor":"black"},linewidth=0.5, palette = customPalette)
# `ax` is the primary Axes; no `ax1` exists in this snippet.
ax.grid(alpha=0.5, which = 'major')
plt.tight_layout()
# Replace the automatic legend with a single hand-built one on the twin Axes.
ax.legend_.remove()
GW = mpatches.Patch(color='seagreen', label='$CO_2$')
WW = mpatches.Patch(color='mediumaquamarine', label='$XCO_2$')
ax2.legend(handles=[GW,WW], loc='upper right',prop={'size': 14}, fontsize=12)
ax.set_title("$XCO_2$ vs. $CO_2$",fontsize=18)
ax.set_xlabel('Longitude [\u00b0]',fontsize=14)
ax.set_ylabel('$CO_2$ [ppm]',fontsize=14)
ax2.set_ylabel('$XCO_2$ [ppm]',fontsize=14)
ax.tick_params(labelsize=14)