Related
I have a rather simple strip plot with vertical data.
planets = sns.load_dataset("planets")
sns.stripplot(x="method", y="distance", data=planets, size=4, color=".7")
plt.xticks(rotation=45, ha="right")
plt.show()
I want to plot the mean of each x-element (method) as a small horizontal bar similar to what you get with:
sns.boxplot(
x="method",
y="distance",
data=planets,
whis=[50, 50],
showfliers=False,
showbox=False,
showcaps=False
)
But without the vertical lines (with whis=[50,50] just spots) for the first / third quartile and showing mean instead of median. Maybe there is a more elegant solution not involving a Boxplot.
Thanks in advance.
Boxplot objects are defined in matplotlib.pyplot.boxplot
showmeans=True
meanline=True makes a line instead of a marker
meanprops={'color': 'k', 'ls': '-', 'lw': 2} sets the color, style and width of the line.
See matplotlib.lines.Line2D for other line properties.
medianprops={'visible': False} makes the median line not visible
whiskerprops={'visible': False} makes the whisker line not visible
zorder=10 places the line on the top layer
Tested in matplotlib v3.4.2 and seaborn v0.11.1
import seaborn as sns
import matplotlib.pyplot as plt
# load the dataset
planets = sns.load_dataset("planets")
p = sns.stripplot(x="method", y="distance", data=planets, size=4, color=".7")
plt.xticks(rotation=45, ha="right")
p.set(yscale='log')
# plot the mean line
sns.boxplot(showmeans=True,
meanline=True,
meanprops={'color': 'k', 'ls': '-', 'lw': 2},
medianprops={'visible': False},
whiskerprops={'visible': False},
zorder=10,
x="method",
y="distance",
data=planets,
showfliers=False,
showbox=False,
showcaps=False,
ax=p)
plt.show()
Works similarly with a seaborn.swarmplot
Here's a solution using ax.hlines with find the mean using groupby and list comprehension:
import seaborn as sns
import matplotlib.pyplot as plt
# load the dataset
planets = sns.load_dataset("planets")
p = sns.stripplot(x="method", y="distance", data=planets, size=4, color=".7", zorder=1)
plt.xticks(rotation=45, ha="right")
p.set(yscale='log');
df_mean = planets.groupby('method', sort=False)['distance'].mean()
_ = [p.hlines(y, i-.25, i+.25, zorder=2) for i, y in df_mean.reset_index()['distance'].items()]
Output:
Here's another hack that is similar to the boxplot idea but requires less overriding: draw a pointplot but with a confidence interval of width 0, and activate the errorbar "caps" to get a horizontal line with a parametrizable width:
planets = sns.load_dataset("planets")
spec = dict(x="method", y="distance", data=planets)
sns.stripplot(**spec, size=4, color=".7")
sns.pointplot(**spec, join=False, ci=0, capsize=.7, scale=0)
plt.xticks(rotation=45, ha="right")
One downside that is evident here is that bootstrapping gets skipped for groups with a single observation, so you don't get a mean line there. This may or may not be a problem in an actual application.
Another trick would be to do the groupby yourself and then draw a scatterplot with a very wide vertical line marker:
planets = sns.load_dataset("planets")
variables = dict(x="method", y="distance")
sns.stripplot(data=planets, **variables, size=4, color=".7")
sns.scatterplot(
data=planets.groupby("method")["distance"].mean().reset_index(),
**variables, marker="|", s=2, linewidth=25
)
plt.xticks(rotation=45, ha="right")
import matplotlib.pyplot as plt
import numpy as np
# data
x=["IEEE", "Elsevier", "Others"]
y=[7, 6, 2]
import seaborn as sns
plt.legend()
plt.scatter(x, y, s=300, c="blue", alpha=0.4, linewidth=3)
plt.ylabel("No. of Papers")
plt.figure(figsize=(10, 4))
I want to make a graph as shown in the image. I am not sure how to provide data for both journal and conference categories. (Currently, I just include one). Also, I am not sure how to add different colors for each category.
You can try this code snippet for you problem.
- I modified your Data format, I suggest you to use pandas for
data visualization.
- I added one more field to visualize the data more efficiently.
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
# data
x=["IEEE", "Elsevier", "Others", "IEEE", "Elsevier", "Others"]
y=[7, 6, 2, 5, 4, 3]
z=["conference", "journal", "conference", "journal", "conference", "journal"]
# create pandas dataframe
data_list = pd.DataFrame(
{'x_axis': x,
'y_axis': y,
'category': z
})
# change size of data points
minsize = min(data_list['y_axis'])
maxsize = max(data_list['y_axis'])
# scatter plot
sns.catplot(x="x_axis", y="y_axis", kind="swarm", hue="category",sizes=(minsize*100, maxsize*100), data=data_list)
plt.grid()
How to create the graph with correct bubble sizes and with no overlap
Seaborn stripplot and swarmplot (or sns.catplot(kind=strip or kind=swarm)) provide the handy dodge argument which prevents the bubbles from overlapping. The only downside is that the size argument applies a single size to all bubbles and the sizes argument (as used in the other answer) is of no use here. They do not work like the s and size arguments of scatterplot. Therefore, the size of each bubble must be edited after generating the plot:
import numpy as np # v 1.19.2
import pandas as pd # v 1.1.3
import seaborn as sns # v 0.11.0
# Create sample data
x = ['IEEE', 'Elsevier', 'Others', 'IEEE', 'Elsevier', 'Others']
y = np.array([7, 6, 3, 7, 1, 3])
z = ['conference', 'conference', 'conference', 'journal', 'journal', 'journal']
df = pd.DataFrame(dict(organisation=x, count=y, category=z))
# Create seaborn stripplot (swarmplot can be used the same way)
ax = sns.stripplot(data=df, x='organisation', y='count', hue='category', dodge=True)
# Adjust the size of the bubbles
for coll in ax.collections[:-2]:
y = coll.get_offsets()[0][1]
coll.set_sizes([100*y])
# Format figure size, spines and grid
ax.figure.set_size_inches(7, 5)
ax.grid(axis='y', color='black', alpha=0.2)
ax.grid(axis='x', which='minor', color='black', alpha=0.2)
ax.spines['bottom'].set(position='zero', color='black', alpha=0.2)
sns.despine(left=True)
# Format ticks
ax.tick_params(axis='both', length=0, pad=10, labelsize=12)
ax.tick_params(axis='x', which='minor', length=25, width=0.8, color=[0, 0, 0, 0.2])
minor_xticks = [tick+0.5 for tick in ax.get_xticks() if tick != ax.get_xticks()[-1]]
ax.set_xticks(minor_xticks, minor=True)
ax.set_yticks(range(0, df['count'].max()+2))
# Edit labels and legend
ax.set_xlabel('Organisation', labelpad=15, size=12)
ax.set_ylabel('No. of Papers', labelpad=15, size=12)
ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left', frameon=False);
Alternatively, you can use scatterplot with the convenient s argument (or size) and then edit the space between the bubbles to reproduce the effect of the missing dodge argument (note that the x_jitter argument seems to have no effect). Here is an example using the same data as before and without all the extra formatting:
# Create seaborn scatterplot with size argument
ax = sns.scatterplot(data=df, x='organisation', y='count',
hue='category', s=100*df['count'])
ax.figure.set_size_inches(7, 5)
ax.margins(0.2)
# Dodge bubbles
bubbles = ax.collections[0].get_offsets()
signs = np.repeat([-1, 1], df['organisation'].nunique())
for bubble, sign in zip(bubbles, signs):
bubble[0] += sign*0.15
As a side note, I recommend that you consider other types of plots for this data. A grouped bar chart:
df.pivot(index='organisation', columns='category').plot.bar()
Or a balloon plot (aka categorical bubble plot):
sns.scatterplot(data=df, x='organisation', y='category', s=100*count).margins(0.4)
Why? In the bubble graph, the counts are displayed using 2 visual attributes, i) the y-coordinate location and ii) the bubble size. Only one of them is really necessary.
I am trying to display a data table under each of two subplots in figure, which I have plotted using pandas' plot function. I have got the plots to look as I want, and can get one of the two tables to show up under one of the two subplots, but it is unreadable (and only one of the two I want to display).
My goal is to have each plot and table look something like this: https://matplotlib.org/gallery/misc/table_demo.html#sphx-glr-gallery-misc-table-demo-py where the axes act as column headers and the data is directly beneath each plot. Unfortunately, my figure is not coming out like that. Below is my code, which if run should produce exactly what I'm seeing.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Set up df
rowcol = {'ID':[101,101,101,101], 'Assessment': ['Read-Write-1','Math-1', 'Read-Write-Math-3', 'Read-Write-Math-4'],
'Math': [np.nan,4,3,3], 'MScore': [np.nan, 636.5, 577.2, 545.4],
'RW': [3, np.nan, 3, 3], 'RWScore': [559.7, np.nan, 621.6, 563.7]}
df = pd.DataFrame(data = rowcol)
df = df.interpolate()
# Set up subplots
fig, axes = plt.subplots(nrows=1, ncols=2, sharex=True)
df.plot(x='Assessment', y=['MScore', 'RWScore'], use_index = True,
grid = True, style=['+r-.', 'xb--'], legend=['MScore','RWScore'],
ax=axes[0], xticks=[0,1,2,3])
df.plot(x='Assessment', y=['Math', 'RW'], use_index = True,
grid = True, style=['+g-.', 'xc--'], legend=['M','RW'],
xticks=[0,1,2,3], ylim=[0,4], yticks=[1,2,3,4], ax=axes[1])
# Add labels, titles, and legend
axes[0].set_title(df['ID'][0])
axes[1].set_title(df['ID'][0])
plt.xlabel('Assessment')
axes[0].set_ylabel('Score')
axes[1].set_ylabel('Performance Level')
plt.legend(loc='best')
# Add data tables
table1 = plt.table(cellText = [df.MScore, df.RWScore],
rowLabels = ['MScore', 'RWScore'],
rowColours = ['r','b'], loc='bottom',
colLabels = df['Assessment'])
table2 = plt.table(cellText = [df.Math, df.RW],
rowLabels = ['Math', 'RW'],
rowColours = ['g','c'], loc='bottom',
colLabels = df['Assessment'])
# Show plot
plt.show()
As you can see, this does not produce anything particularly pretty, or even readable. What needs to be changed in this code to make it work as in the example in the link?
What needs to be changed in this code to make it work as in the example in the link?
The dataset in the example is quite different from the one in your question. In addition, there are two subplots and tables involved. For readability sake, you can change the code to increase the figure sizes, make space for the tables, show table below each subplot, hide the xticklabels and the x-axis labels.
rowcol = {'ID':[101,101,101,101], 'Assessment': ['Read-Write-1','Math-1', 'Read-Write-Math-3', 'Read-Write-Math-4'],
'Math': [np.nan,4,3,3], 'MScore': [np.nan, 636.5, 577.2, 545.4],
'RW': [3, np.nan, 3, 3], 'RWScore': [559.7, np.nan, 621.6, 563.7]}
df = pd.DataFrame(data = rowcol)
df = df.interpolate()
# print(df)
# Set up subplots
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(30, 10)) #specify size of subplots
df.plot(x='Assessment', y=['MScore', 'RWScore'], use_index = True,
grid = True, style=['+r-.', 'xb--'], legend=['MScore','RWScore'],
ax=axes[0], xticks=[0,1,2,3])
df.plot(x='Assessment', y=['Math', 'RW'], use_index = True,
grid = True, style=['+g-.', 'xc--'], legend=['M','RW'],
xticks=[0,1,2,3], ylim=[0,4], yticks=[1,2,3,4], ax=axes[1])
# Add labels, titles, and legend
plt.subplots_adjust(left=0.3, bottom=0.2, wspace = 0.3)
axes[0].set_title(df['ID'][0])
axes[1].set_title(df['ID'][0])
axes[0].set_ylabel('Score')
axes[1].set_ylabel('Performance Level')
#set visibility of x-axis and y-axis, xticklabels and yticklabels
axes[0].xaxis.set_ticklabels([])
axes[1].xaxis.set_ticklabels([])
axes[0].get_xaxis().set_visible(False)
axes[1].get_xaxis().set_visible(False)
plt.legend(loc='best')
# Add data tables for each subplot
table1 = axes[0].table(cellText = [df.MScore, df.RWScore],
rowLabels = ['MScore', 'RWScore'],
rowColours = ['r','b'], loc='bottom',
colLabels = df['Assessment'])
table2 = axes[1].table(cellText = [df.Math, df.RW],
rowLabels = ['Math', 'RW'],
rowColours = ['g','c'], loc='bottom',
colLabels = df['Assessment'], fontsize=15)
# Show plot
plt.show()
Output
I am using ax.stem to draw lollipop plot in python. However, I found it difficult to assign different colors to each lollipop
as shown here
As you can see I have 2 categories "GWP" & "FDP".
In my project, each category should be divided into 4 subcategories "ingredient", "Waste", "energy" and "infrastructure". Therefore, I want to assign them different colors to indicate the subcategory.
There is a solution proposed here: https://python-graph-gallery.com/181-custom-lollipop-plot/
But this only teaches you how to change color for all lollipops.
And there is another solution: https://python-graph-gallery.com/183-highlight-a-group-in-lollipop/
But this one doesn't really use ax.stem.
Please let me know how to assign different colors to each lollipop.
(Also, I don't know somehow why my plot is displayed upside down. Also, the y axis does not align in order, and there is one dot not connected by a line. It displays correctly in my original plot though.)
Here is my code:
#%%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# my dataset
columns = np.array(['types', 'GWP100 (year)', 'FDP (year)'])
types = np.array(['Total (ingredient) per kg', 'Total (waste) per kg',
'energy (whole process) per kg', 'Infrastructure', 'Total (Total)']).reshape(5,1)
gwp = np.array([ 2.86982617e+02, 2.16824983e+02, 4.38920760e+01,
6.02400000e-02, 5.47759916e+02]).reshape(5,1)
fdp = np.array([ 1.35455867e+02, 7.02868322e+00, 1.26622560e+01,
1.64568000e-02, 1.55163263e+02]).reshape(5,1)
original_data = np.concatenate((types, gwp, fdp), axis = 1)
# produce dataframe
data = pd.DataFrame(original_data, columns = columns)
# types GWP100 (year) FDP (year)
#0 Total (ingredient) per kg 286.982617 135.455867
#1 Total (waste) per kg 216.824983 7.02868322
#2 energy (whole process) per kg 43.892076 12.662256
#3 Infrastructure 0.06024 0.0164568
#4 Total (Total) 547.759916 155.163263
#%% graph
fig = plt.figure(1, figsize =(8,6))
# 1st subplot
ax1 = fig.add_subplot(1,2,1)
gwp = data[data.columns[1]]
ax1.stem(gwp)
ax1.set_ylabel(r'kg CO$_2$-Eq', fontsize=10)
ax1.set_xlabel('GWP', fontsize=10)
# 2nd subplot
ax2 = fig.add_subplot(1,2,2)
fdp = data[data.columns[2]]
ax2.stem(fdp)
ax2.set_ylabel(r'kg oil-Eq', fontsize = 10)
ax2.set_xlabel('FDP', fontsize=10)
The stem currently consists of a couple of lines and a "line" consisting of dots on top. It has no option to colorize the lines separately within its interface.
You may replicate the stem plot to draw the lines manually with the color you like.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
columns = np.array(['types', 'GWP100 (year)', 'FDP (year)'])
types = np.array(['Total (ingredient) per kg', 'Total (waste) per kg',
'energy (whole process) per kg', 'Infrastructure', 'Total (Total)'])
gwp = np.array([ 2.86982617e+02, 2.16824983e+02, 4.38920760e+01,
6.02400000e-02, 5.47759916e+02])
fdp = np.array([ 1.35455867e+02, 7.02868322e+00, 1.26622560e+01,
1.64568000e-02, 1.55163263e+02])
# produce dataframe
data = pd.DataFrame([types,gwp,fdp], index = columns).transpose()
colors = list("bgryk")
fig, (ax, ax2) = plt.subplots(ncols=2)
for t, y, c in zip(data["types"], data["GWP100 (year)"],colors):
ax.plot([t,t], [0,y], color=c, marker="o", markevery=(1,2))
ax.set_ylim(0,None)
plt.setp(ax.get_xticklabels(), rotation=90)
fig.tight_layout()
plt.show()
A more efficient solution is of course to use a LineCollection in combination with a scatter plot for the dots.
fig, (ax, ax2) = plt.subplots(ncols=2)
segs = np.zeros((len(data), 2, 2))
segs[:,:,0] = np.repeat(np.arange(len(data)),2).reshape(len(data),2)
segs[:,1,1] = data["GWP100 (year)"].values
lc = LineCollection(segs, colors=colors)
ax.add_collection(lc)
ax.scatter(np.arange(len(data)), data["GWP100 (year)"].values, c=colors)
ax.set_xticks(np.arange(len(data)))
ax.set_xticklabels(data["types"], rotation=90)
ax.autoscale()
ax.set_ylim(0,None)
fig.tight_layout()
plt.show()
I will answer one of your main questions regarding the same coloring of the lines and markers category wise. There seems to be no direct option while calling ax1.stem() to specify the list of colors as per the official docs. In fact they say that the resulting plot might not be reasonable if one do so. Nevertheless, below is one trick to get things done your way.
The idea is following:
Get the objects (stemline) displayed on the subplot
Get the x-y data of the markers
Loop over the data and change the color of each stemline. Plot the marker individually with the same color as stemline. The colors is an array specifying the colors of your choice.
Following is the relevant part of the code:
# 1st subplot
ax1 = fig.add_subplot(1,2,1)
gwp = data[data.columns[1]]
colors = ['r', 'g', 'b', 'y', 'k']
_, stemlines, _ = ax1.stem(gwp)
line = ax1.get_lines()
xd = line[0].get_xdata()
yd = line[0].get_ydata()
# mec and mfc stands for markeredgecolor and markerfacecolor
for i in range(len(stemlines)):
plt.plot([xd[i]], [yd[i]], 'o', ms=7, mfc=colors[i], mec=colors[i])
plt.setp(stemlines[i], 'color', colors[i])
ax1.set_ylabel(r'kg CO$_2$-Eq', fontsize=10)
ax1.set_xlabel('GWP', fontsize=10)
# 2nd subplot
ax2 = fig.add_subplot(1,2,2)
fdp = data[data.columns[2]]
_, stemlines, _ = ax2.stem(fdp)
line = ax2.get_lines()
xd = line[0].get_xdata()
yd = line[0].get_ydata()
for i in range(len(stemlines)):
plt.plot([xd[i]], [yd[i]], 'o', ms=7, mfc=colors[i], mec=colors[i])
plt.setp(stemlines[i], 'color', colors[i])
I have a dataframe which has a number of values per date (datetime field). This values are classified in U (users) and S (session) by using a column Group. Seaborn is used to visualize two boxplots per date, where the hue is set to Group.
The problem comes when considering that the values corresponding to U (users) are much bigger than those corresponding to S (session), making the S data illegible. Thus, I need to come up with a solution that allows me to plot both series (U and S) in the same figure in an understandable manner.
I wonder if independent Y axes (with different scales) can be set to each hue, so that both Y axes are shown (as when using twinx but without losing hue visualization capabilities).
Any other alternative would be welcome =)
The S boxplot time series boxplot:
The combined boxplot time series using hue. Obviously it's not possible to see any information about the S group because of the scale of the Y axis:
The columns of the dataframe:
| Day (datetime) | n_data (numeric) | Group (S or U)|
The code line generating the combined boxplot:
seaborn.boxplot(ax=ax,x='Day', y='n_data', hue='Group', data=df,
palette='PRGn', showfliers=False)
Managed to find a solution by using twinx:
fig,ax= plt.subplots(figsize=(50,10))
tmpU = groups.copy()
tmpU.loc[tmp['Group']!='U','n_data'] = np.nan
tmpS = grupos.copy()
tmpS.loc[tmp['Group']!='S','n_data'] = np.nan
ax=seaborn.boxplot(ax=ax,x='Day', y = 'n_data', hue='Group', data=tmpU, palette = 'PRGn', showfliers=False)
ax2 = ax.twinx()
seaborn.boxplot(ax=ax2,x='Day', y = 'n_data', hue='Group', data=tmpS, palette = 'PRGn', showfliers=False)
handles,labels = ax.get_legend_handles_labels()
l= plt.legend(handles[0:2],labels[0:2],loc=1)
plt.setp(ax.get_xticklabels(),rotation=30,horizontalalignment='right')
for label in ax.get_xticklabels()[::2]:
label.set_visible(False)
plt.show()
plt.close('all')
The code above generates the following figure:
Which in this case turns out to be too dense to be published. Therefore I would adopt a visualization based in subplots, as Parfait susgested in his/her answer.
It wasn't an obvious solution to me so I would like to thank Parfait for his/her answer.
Consider building separate plots on same figure with y-axes ranges tailored to subsetted data. Below demonstrates with random data seeded for reproducibility (for readers of this post).
Data (with U values higher than S values)
import pandas as pd
import numpy as np
import seaborn
import matplotlib.pyplot as plt
np.random.seed(2018)
u_df = pd.DataFrame({'Day': pd.date_range('2016-10-01', periods=10)\
.append(pd.date_range('2016-10-01', periods=10)),
'n_data': np.random.uniform(0,800,20),
'Group': 'U'})
s_df = pd.DataFrame({'Day': pd.date_range('2016-10-01', periods=10)\
.append(pd.date_range('2016-10-01', periods=10)),
'n_data': np.random.uniform(0,200,20),
'Group': 'S'})
df = pd.concat([u_df, s_df], ignore_index=True)
df['Day'] = df['Day'].astype('str')
Plot
fig = plt.figure(figsize=(10,5))
for i,g in enumerate(df.groupby('Group')):
plt.title('N_data of {}'.format(g[0]))
plt.subplot(2, 1, i+1)
seaborn.boxplot(x="Day", y="n_data", data=g[1], palette="PRGn", showfliers=False)
plt.tight_layout()
plt.show()
plt.clf()
plt.close('all')
To retain original hue and grouping, render all non-group n_data to np.nan:
fig = plt.figure(figsize=(10,5))
for i,g in enumerate(df.Group.unique()):
plt.subplot(2, 1, i+1)
tmp = df.copy()
tmp.loc[tmp['Group']!=g, 'n_data'] = np.nan
seaborn.boxplot(x="Day", y="n_data", hue="Group", data=tmp,
palette="PRGn", showfliers=False)
plt.tight_layout()
plt.show()
plt.clf()
plt.close('all')
So one option to do a grouped box plot with two separate axis is to use hue_order= ['value, np.nan] in your argument for sns.boxplot:
fig = plt.figure(figsize=(14,8))
ax = sns.boxplot(x="lon_bucketed", y="value", data=m, hue='name', hue_order=['co2',np.nan],
width=0.75,showmeans=True,meanprops={"marker":"s","markerfacecolor":"black", "markeredgecolor":"black"},linewidth=0.5 ,palette = customPalette)
ax2 = ax.twinx()
ax2 = sns.boxplot(ax=ax2,x="lon_bucketed", y="value", data=m, hue='name', hue_order=[np.nan,'g_xco2'],
width=0.75,showmeans=True,meanprops={"marker":"s","markerfacecolor":"black", "markeredgecolor":"black"},linewidth=0.5, palette = customPalette)
ax1.grid(alpha=0.5, which = 'major')
plt.tight_layout()
ax.legend_.remove()
GW = mpatches.Patch(color='seagreen', label='$CO_2$')
WW = mpatches.Patch(color='mediumaquamarine', label='$XCO_2$')
ax, ax2.legend(handles=[GW,WW], loc='upper right',prop={'size': 14}, fontsize=12)
ax.set_title("$XCO_2$ vs. $CO_2$",fontsize=18)
ax.set_xlabel('Longitude [\u00b0]',fontsize=14)
ax.set_ylabel('$CO_2$ [ppm]',fontsize=14)
ax2.set_ylabel('$XCO_2$ [ppm]',fontsize=14)
ax.tick_params(labelsize=14)