I am using Seaborn to plot two violin plots, and I would like to obtain the X-coordinates of the edges of the violins, so that I can subtract one from the other to find the difference between the two KDEs/distributions. I suspect it has to do with the properties of the matplotlib.collections.PolyCollection objects, but I have had difficulty navigating the documentation for this, so I apologize that I do not have much code to attach. I'm including my current violin plot in case that helps:
import pandas as pd
import seaborn as sns
observed_results = [11.10283128625, 6.031906445000001, 4.625099850384, 4.371541683749999, 4.188776438315, 4.169933187839999, 4.147271982216, 3.137545605726, 2.727390606468, 2.706991071933, 2.483074510875, 2.470624399684, 2.45608460474, 2.413902898276, 2.390530982763, 2.347653087613, 2.3049660823, 2.173520711313, 2.114398085409, 2.072213409552, 1.96126510972, 1.768724290017, 1.722913211104, 1.715972042575, 1.71293343376, 1.686909025847, 1.546933962564, 1.520621928225, 1.50428319008, 1.4944074417, 1.409957657136, 1.40292975245, 1.3157577856, 1.3078804375, 1.2974806016, 1.288403682732, 1.236493409437, 1.225900768752, 1.222094652926, 1.202655483344, 1.109818003441, 1.108678687017, 1.103788138352, 1.066656041116, 0.9799193812, 0.9729697610879998, 0.9532061536159999, 0.908066599737, 0.8847958337999999, 0.8698585519490001, 0.859714675264, 0.8422146736200001, 0.8229930509580001, 0.79569571686, 0.79170962662, 0.7855221024799999, 0.775488805524, 0.7690100510069999, 0.765153773568, 0.677797640336, 0.62878587992, 0.6278724034559998, 0.619961861949, 0.555740507912, 0.5458990340579999, 0.495263271752, 0.4744517001510001, 0.453783299787, 0.426952628919, 0.4079525625, 0.4030645275, 0.401266962432, 0.3807181439999999, 0.3313936548, 0.2707379496719999, 0.256952683998, 0.248430471184, 0.242090703552, 0.235644786034, 0.214602373804, 0.199521746592, 0.1951300125, 0.173116351962, 0.172562334309, 0.156660445172, 0.145336979139, 0.1190036898, 0.114659525376, 0.094888354288, 0.06696725615, 0.0305541639, 0.023464003706, 0.021922131125]
# One "Observed" tag per observed value, so labels and values stay aligned.
observed_labels = ["Observed"] * len(observed_results)
expected_results = [4.5217885770490405, 3.828641396489095, 3.4231762883809305, 3.1354942159291497, 2.91235066461494, 2.7300291078209855, 2.575878427993727, 2.4423470353692043, 2.324563999712821, 2.2192034840549946, 2.12389330425067, 2.03688192726104, 1.9568392195875037, 1.8827312474337816, 1.8137383759468302, 1.749199854809259, 1.6885752329928243, 1.6314168191528757, 1.5773495978825998, 1.5260563034950494, 1.4772661393256175, 1.4307461236907244, 1.3862943611198906, 1.3437347467010947, 1.3029127521808397, 1.2636920390275583, 1.2259517110447113, 1.1895840668738362, 1.1544927470625663, 1.120591195386885, 1.087801372563894, 1.0560526742493137, 1.02528101558256, 0.995428052432879, 0.9664405155596267, 0.9382696385929304, 0.9108706644048158, 0.8842024173226546, 0.858226930919394, 0.832909122935104, 0.8082165103447325, 0.7841189587656721, 0.760588461355478, 0.7375989431307791, 0.7151260872787205, 0.6931471805599453, 0.6716409753389818, 0.6505875661411494, 0.6299682789384137, 0.6097655716208943, 0.5899629443247145, 0.570544858467613, 0.5514966634969185, 0.5328045304847661, 0.5144553918165694, 0.496436886313891, 0.4787373092144902, 0.46134556650262093, 0.44425113314332093, 0.4274440148269396, 0.4109147128757291, 0.39465419200394874, 0.37865385065750773, 0.3629054936893685, 0.34740130715340317, 0.3321338350226148, 0.31709595765807425, 0.3022808718729337, 0.2876820724517809, 0.2732933349996814, 0.2591087000077249, 0.2451224580329851, 0.2313291359006492, 0.21772348384487053, 0.20430046351272993, 0.19105523676270922, 0.17798315519535654, 0.16507975035944858, 0.1523407245820189, 0.13976194237515874, 0.1273394223766015, 0.11506932978478723, 0.10294796925244237, 0.09097177820572676, 0.07913732055872386, 0.06744128079553265, 0.055880458394456614, 0.04445176257083381, 0.03315220731690051, 0.02197890671877523, 0.010929070532190317, -0.0]
# Tag every expected value, then stack both samples into one long-form table:
# one row per measurement, with the condition it belongs to in a second column.
expected_labels = ["Expected"] * len(expected_results)
all_results = [*observed_results, *expected_results]
all_labels = [*observed_labels, *expected_labels]
df_longform = pd.DataFrame(
    {"Z-score": all_results, "Condition": all_labels}
)
# pyplot is required for plt.show() below; the snippet's import list
# (pandas, seaborn) did not bring it into scope.
import matplotlib.pyplot as plt

# Draw one violin per value of the 'Condition' column from the long-form frame.
ax = sns.violinplot(x='Condition', y=r"Z-score", data=df_longform, inner=None,
                    scale='area', bw=0.3, width=0.8, saturation=1,
                    linewidth=1, cut=0)
# Inspect the artists seaborn attached to the axes; the violin outlines are
# presumably the PolyCollection objects mentioned in the question.
print(ax.collections)
plt.show()
Please let me know if anything about this question is unclear, or if there is anything else I forgot to provide here. I can't figure this out for the life of me, so any advice is really appreciated.
Here is a comparison between:
calculating and visualizing the two distributions via a kde plot and their intersection
simulating a violinplot given the kde curves
a boxenplot, which may be another valuable way to compare the distributions.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import gaussian_kde
observed_results = [11.10283128625, 6.031906445000001, 4.625099850384, 4.371541683749999, 4.188776438315, 4.169933187839999, 4.147271982216, 3.137545605726, 2.727390606468, 2.706991071933, 2.483074510875, 2.470624399684, 2.45608460474, 2.413902898276, 2.390530982763, 2.347653087613, 2.3049660823, 2.173520711313, 2.114398085409, 2.072213409552, 1.96126510972, 1.768724290017, 1.722913211104, 1.715972042575, 1.71293343376, 1.686909025847, 1.546933962564, 1.520621928225, 1.50428319008, 1.4944074417, 1.409957657136, 1.40292975245, 1.3157577856, 1.3078804375, 1.2974806016, 1.288403682732, 1.236493409437, 1.225900768752, 1.222094652926, 1.202655483344, 1.109818003441, 1.108678687017, 1.103788138352, 1.066656041116, 0.9799193812, 0.9729697610879998, 0.9532061536159999, 0.908066599737, 0.8847958337999999, 0.8698585519490001, 0.859714675264, 0.8422146736200001, 0.8229930509580001, 0.79569571686, 0.79170962662, 0.7855221024799999, 0.775488805524, 0.7690100510069999, 0.765153773568, 0.677797640336, 0.62878587992, 0.6278724034559998, 0.619961861949, 0.555740507912, 0.5458990340579999, 0.495263271752, 0.4744517001510001, 0.453783299787, 0.426952628919, 0.4079525625, 0.4030645275, 0.401266962432, 0.3807181439999999, 0.3313936548, 0.2707379496719999, 0.256952683998, 0.248430471184, 0.242090703552, 0.235644786034, 0.214602373804, 0.199521746592, 0.1951300125, 0.173116351962, 0.172562334309, 0.156660445172, 0.145336979139, 0.1190036898, 0.114659525376, 0.094888354288, 0.06696725615, 0.0305541639, 0.023464003706, 0.021922131125]
expected_results = [4.5217885770490405, 3.828641396489095, 3.4231762883809305, 3.1354942159291497, 2.91235066461494, 2.7300291078209855, 2.575878427993727, 2.4423470353692043, 2.324563999712821, 2.2192034840549946, 2.12389330425067, 2.03688192726104, 1.9568392195875037, 1.8827312474337816, 1.8137383759468302, 1.749199854809259, 1.6885752329928243, 1.6314168191528757, 1.5773495978825998, 1.5260563034950494, 1.4772661393256175, 1.4307461236907244, 1.3862943611198906, 1.3437347467010947, 1.3029127521808397, 1.2636920390275583, 1.2259517110447113, 1.1895840668738362, 1.1544927470625663, 1.120591195386885, 1.087801372563894, 1.0560526742493137, 1.02528101558256, 0.995428052432879, 0.9664405155596267, 0.9382696385929304, 0.9108706644048158, 0.8842024173226546, 0.858226930919394, 0.832909122935104, 0.8082165103447325, 0.7841189587656721, 0.760588461355478, 0.7375989431307791, 0.7151260872787205, 0.6931471805599453, 0.6716409753389818, 0.6505875661411494, 0.6299682789384137, 0.6097655716208943, 0.5899629443247145, 0.570544858467613, 0.5514966634969185, 0.5328045304847661, 0.5144553918165694, 0.496436886313891, 0.4787373092144902, 0.46134556650262093, 0.44425113314332093, 0.4274440148269396, 0.4109147128757291, 0.39465419200394874, 0.37865385065750773, 0.3629054936893685, 0.34740130715340317, 0.3321338350226148, 0.31709595765807425, 0.3022808718729337, 0.2876820724517809, 0.2732933349996814, 0.2591087000077249, 0.2451224580329851, 0.2313291359006492, 0.21772348384487053, 0.20430046351272993, 0.19105523676270922, 0.17798315519535654, 0.16507975035944858, 0.1523407245820189, 0.13976194237515874, 0.1273394223766015, 0.11506932978478723, 0.10294796925244237, 0.09097177820572676, 0.07913732055872386, 0.06744128079553265, 0.055880458394456614, 0.04445176257083381, 0.03315220731690051, 0.02197890671877523, 0.010929070532190317, -0.0]
# Fit one Gaussian KDE per sample, both with bw_method=0.3.
x0, x1 = observed_results, expected_results
kde0, kde1 = (gaussian_kde(sample, bw_method=0.3) for sample in (x0, x1))

# Common evaluation grid: the joint data range widened by 20% on each side,
# as the kde is wider than the data.
xmin = min(*x0, *x1)
xmax = max(*x0, *x1)
dx = 0.2 * (xmax - xmin)
xmin, xmax = xmin - dx, xmax + dx
x = np.linspace(xmin, xmax, 500)

# Evaluate both densities on the grid; their pointwise minimum is the overlap.
kde0_x, kde1_x = kde0(x), kde1(x)
inters_x = np.minimum(kde0_x, kde1_x)
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(15, 7))

# Left panel: each density as a line with a translucent fill underneath.
for density, colour, name in ((kde0_x, 'b', 'Observed'), (kde1_x, 'orange', 'Expected')):
    ax1.plot(x, density, color=colour, label=name)
    ax1.fill_between(x, density, 0, color=colour, alpha=0.2)

# Overlap curve: red outline plus a hatched, unfilled area that carries the label.
ax1.plot(x, inters_x, color='r')
ax1.fill_between(x, inters_x, 0, facecolor='none', edgecolor='r', hatch='xx', label='intersection')

# Trapezoidal integral of the overlap curve over the evaluation grid.
area_inters_x = np.trapz(inters_x, x)

# The third labelled artist is the intersection; append its area as a percentage.
handles, labels = ax1.get_legend_handles_labels()
labels[2] = f'{labels[2]}: {area_inters_x * 100:.1f} %'
ax1.legend(handles, labels)
ax1.set_ylim(ymin=0)
ax1.set_title("kde plot with intersection")
# Middle panel: mirror half of each density around zero, with the data axis
# vertical, to imitate a violin.
for density, colour, name in ((kde0_x, 'b', 'Observed'), (kde1_x, 'r', 'Expected')):
    half = density / 2
    ax2.plot(half, x, color=colour, label=name)
    ax2.plot(-half, x, color=colour)
    ax2.fill_betweenx(x, -half, half, color=colour, alpha=0.2)

# The overlap gets a black outline and a hatched, unfilled interior.
half_inters = inters_x / 2
ax2.plot(half_inters, x, color='k')
ax2.plot(-half_inters, x, color='k')
ax2.fill_betweenx(x, -half_inters, half_inters, facecolor='none', edgecolor='k', hatch='oo', label='intersection')

# Reuse the overlap area computed earlier for the legend text.
handles, labels = ax2.get_legend_handles_labels()
labels[2] = f'{labels[2]}: {area_inters_x * 100:.1f} %'
ax2.legend(handles, labels)
ax2.set_title("simulated violinplot with intersection")
# Right panel: the same two samples as a boxenplot, fed from a long-form frame
# with one row per value and its condition label.
condition_column = ["Observed"] * len(observed_results) + ["Expected"] * len(expected_results)
df_longform = pd.DataFrame({
    "Z-score": observed_results + expected_results,
    "Condition": condition_column,
})
sns.boxenplot(x="Condition", y="Z-score", data=df_longform, ax=ax3)
ax3.set_title("sns.boxenplot")

plt.tight_layout()
plt.show()
I have a rather simple strip plot with vertical data.
# Load the "planets" example dataset through seaborn.
planets = sns.load_dataset("planets")
# One vertical strip of small points per value of the "method" column.
sns.stripplot(x="method", y="distance", data=planets, size=4, color=".7")
# Rotate the x tick labels by 45 degrees and right-align them.
plt.xticks(rotation=45, ha="right")
plt.show()
I want to plot the mean of each x-element (method) as a small horizontal bar similar to what you get with:
# Boxplot with the box, caps and fliers switched off; whis=[50, 50] asks for
# both whiskers at the 50th percentile, leaving the remnants the question
# wants to get rid of.
sns.boxplot(
x="method",
y="distance",
data=planets,
whis=[50, 50],
showfliers=False,
showbox=False,
showcaps=False
)
But without the vertical lines (with whis=[50,50] just spots) for the first / third quartile and showing mean instead of median. Maybe there is a more elegant solution not involving a Boxplot.
Thanks in advance.
Boxplot objects are defined in matplotlib.pyplot.boxplot
showmeans=True
meanline=True makes a line instead of a marker
meanprops={'color': 'k', 'ls': '-', 'lw': 2} sets the color, style and width of the line.
See matplotlib.lines.Line2D for other line properties.
medianprops={'visible': False} makes the median line not visible
whiskerprops={'visible': False} makes the whisker line not visible
zorder=10 places the line on the top layer
Tested in matplotlib v3.4.2 and seaborn v0.11.1
import seaborn as sns
import matplotlib.pyplot as plt
# load the dataset
planets = sns.load_dataset("planets")

# strip plot of the raw points, with a logarithmic y axis
p = sns.stripplot(x="method", y="distance", data=planets, size=4, color=".7")
plt.xticks(rotation=45, ha="right")
p.set(yscale='log')

# plot the mean line: a boxplot on the same axes with every element hidden
# except the mean, which is drawn as a solid black line on top (zorder=10)
sns.boxplot(
    x="method", y="distance", data=planets, ax=p,
    showmeans=True, meanline=True,
    meanprops={'color': 'k', 'ls': '-', 'lw': 2},
    medianprops={'visible': False},
    whiskerprops={'visible': False},
    showfliers=False, showbox=False, showcaps=False,
    zorder=10,
)
plt.show()
Works similarly with a seaborn.swarmplot
Here's a solution using ax.hlines that finds the mean of each group with groupby and draws the lines in a list comprehension:
import seaborn as sns
import matplotlib.pyplot as plt
# load the dataset
planets = sns.load_dataset("planets")

# strip plot at zorder=1 so the mean bars (zorder=2) are drawn above the points
p = sns.stripplot(x="method", y="distance", data=planets, size=4, color=".7", zorder=1)
plt.xticks(rotation=45, ha="right")
p.set(yscale='log')

# mean distance per method; sort=False keeps the groups in order of appearance,
# presumably matching the order in which stripplot places the categories
df_mean = planets.groupby('method', sort=False)['distance'].mean()

# one horizontal bar per group, spanning +/-0.25 around its integer position
_ = [p.hlines(mean, pos - .25, pos + .25, zorder=2) for pos, mean in enumerate(df_mean)]
Output:
Here's another hack that is similar to the boxplot idea but requires less overriding: draw a pointplot but with a confidence interval of width 0, and activate the errorbar "caps" to get a horizontal line with a parametrizable width:
planets = sns.load_dataset("planets")
# Shared x/y/data arguments for both plotting calls.
spec = {"x": "method", "y": "distance", "data": planets}
sns.stripplot(size=4, color=".7", **spec)
# Point plot with no connecting line (join=False), a zero-width interval (ci=0)
# and marker scale 0, so only the error-bar caps (capsize=.7) remain visible.
sns.pointplot(join=False, ci=0, capsize=.7, scale=0, **spec)
plt.xticks(rotation=45, ha="right")
One downside that is evident here is that bootstrapping gets skipped for groups with a single observation, so you don't get a mean line there. This may or may not be a problem in an actual application.
Another trick would be to do the groupby yourself and then draw a scatterplot with a very wide vertical line marker:
planets = sns.load_dataset("planets")
variables = {"x": "method", "y": "distance"}
sns.stripplot(data=planets, size=4, color=".7", **variables)

# Aggregate by hand: one row per method holding its mean distance.
method_means = planets.groupby("method")["distance"].mean().reset_index()

# A tiny "|" marker drawn with a very thick line width shows up as a short
# horizontal bar at each mean.
sns.scatterplot(data=method_means, marker="|", s=2, linewidth=25, **variables)
plt.xticks(rotation=45, ha="right")
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# data
x = ["IEEE", "Elsevier", "Others"]
y = [7, 6, 2]

# Create the figure before plotting: calling plt.figure() after plt.scatter()
# opens a second, empty figure and the requested size never reaches the plot.
plt.figure(figsize=(10, 4))
# The label gives the legend an entry to show.
plt.scatter(x, y, s=300, c="blue", alpha=0.4, linewidth=3, label="Papers")
plt.ylabel("No. of Papers")
# Build the legend only after a labelled artist exists on the axes.
plt.legend()
I want to make a graph as shown in the image. I am not sure how to provide data for both journal and conference categories. (Currently, I just include one). Also, I am not sure how to add different colors for each category.
You can try this code snippet for your problem.
- I modified your data format; I suggest you use pandas for
data visualization.
- I added one more field to visualize the data more efficiently.
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
# data: one (publisher, count, category) triple per bubble
x = ["IEEE", "Elsevier", "Others", "IEEE", "Elsevier", "Others"]
y = [7, 6, 2, 5, 4, 3]
z = ["conference", "journal", "conference", "journal", "conference", "journal"]

# create pandas dataframe
data_list = pd.DataFrame(dict(x_axis=x, y_axis=y, category=z))

# change size of data points: smallest and largest count, scaled below
minsize = data_list['y_axis'].min()
maxsize = data_list['y_axis'].max()

# scatter plot
# NOTE(review): a later answer in this thread states that `sizes` has no effect
# for swarm/strip plots; confirm before relying on it.
sns.catplot(
    data=data_list, x="x_axis", y="y_axis", hue="category", kind="swarm",
    sizes=(minsize*100, maxsize*100),
)
plt.grid()
How to create the graph with correct bubble sizes and with no overlap
Seaborn stripplot and swarmplot (or sns.catplot(kind=strip or kind=swarm)) provide the handy dodge argument which prevents the bubbles from overlapping. The only downside is that the size argument applies a single size to all bubbles and the sizes argument (as used in the other answer) is of no use here. They do not work like the s and size arguments of scatterplot. Therefore, the size of each bubble must be edited after generating the plot:
import numpy as np # v 1.19.2
import pandas as pd # v 1.1.3
import seaborn as sns # v 0.11.0
# Create sample data: three organisations, each with a conference and a journal count
x = ['IEEE', 'Elsevier', 'Others', 'IEEE', 'Elsevier', 'Others']
y = np.array([7, 6, 3, 7, 1, 3])
z = ['conference', 'conference', 'conference', 'journal', 'journal', 'journal']
df = pd.DataFrame({'organisation': x, 'count': y, 'category': z})

# Create seaborn stripplot (swarmplot can be used the same way);
# dodge=True separates the hue levels side by side
ax = sns.stripplot(x='organisation', y='count', hue='category', dodge=True, data=df)

# Adjust the size of the bubbles: scale each collection by the y value of its
# first point.
# NOTE(review): [:-2] presumably skips two trailing collections that are not
# data bubbles (e.g. legend entries) - verify for your seaborn version.
for coll in ax.collections[:-2]:
    y = coll.get_offsets()[0, 1]
    coll.set_sizes([100*y])
# Format figure size, spines and grid (all guide lines share one faint black style)
faint_black = dict(color='black', alpha=0.2)
ax.figure.set_size_inches(7, 5)
ax.grid(axis='y', **faint_black)
ax.grid(axis='x', which='minor', **faint_black)
ax.spines['bottom'].set(position='zero', **faint_black)
sns.despine(left=True)

# Format ticks: hide the major tick marks, draw long faint minor ticks on x
ax.tick_params(axis='both', length=0, pad=10, labelsize=12)
ax.tick_params(axis='x', which='minor', length=25, width=0.8, color=[0, 0, 0, 0.2])
# Minor ticks sit halfway after every major tick except the last one
last_xtick = ax.get_xticks()[-1]
minor_xticks = [tick + 0.5 for tick in ax.get_xticks() if tick != last_xtick]
ax.set_xticks(minor_xticks, minor=True)
ax.set_yticks(range(df['count'].max() + 2))

# Edit labels and legend
label_style = dict(labelpad=15, size=12)
ax.set_xlabel('Organisation', **label_style)
ax.set_ylabel('No. of Papers', **label_style)
ax.legend(bbox_to_anchor=(1.0, 0.5), loc='center left', frameon=False)
Alternatively, you can use scatterplot with the convenient s argument (or size) and then edit the space between the bubbles to reproduce the effect of the missing dodge argument (note that the x_jitter argument seems to have no effect). Here is an example using the same data as before and without all the extra formatting:
# Create seaborn scatterplot with size argument: bubble area follows the count
ax = sns.scatterplot(x='organisation', y='count', hue='category',
                     s=100*df['count'], data=df)
ax.figure.set_size_inches(7, 5)
ax.margins(0.2)

# Dodge bubbles: shift the first group of points left and the second right by
# 0.15, editing the offsets array returned by the collection in place.
bubbles = ax.collections[0].get_offsets()
signs = np.repeat([-1, 1], df['organisation'].nunique())
bubbles[:, 0] += signs * 0.15
As a side note, I recommend that you consider other types of plots for this data. A grouped bar chart:
# Reshape to one row per organisation and one column per category, then draw bars.
df.pivot(index='organisation', columns='category').plot.bar()
Or a balloon plot (aka categorical bubble plot):
# Bubble size comes from the 'count' column of df; the bare name `count` is
# not defined anywhere in this snippet.
sns.scatterplot(data=df, x='organisation', y='category', s=100*df['count']).margins(0.4)
Why? In the bubble graph, the counts are displayed using 2 visual attributes, i) the y-coordinate location and ii) the bubble size. Only one of them is really necessary.
I am following the NMT with attention (https://github.com/tensorflow/tensorflow/blob/r1.13/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb) tutorial and I am applying it to my own use case. Unfortunately, when I try to plot the attention weights, I get alignment problems on the x-axis if the input is too long (e.g. 14 tokens instead of 7).
In this code block, the plotting works as expected:
import numpy as np
from matplotlib import pyplot as plt
def plot_attention():
    """Show a 2x7 attention matrix as a heat map with token labels on both axes."""
    attention = np.array([
        [7.78877574e-10, 4.04739769e-10, 6.65854022e-05, 1.63362725e-04,
         2.85054208e-04, 8.50252633e-04, 4.58042100e-02],
        [9.23501700e-02, 5.69618285e-01, 1.80586591e-01, 9.78111699e-02,
         2.71992851e-02, 9.59911197e-03, 2.54837354e-03],
    ])
    sentence = ['<start>', 'hace', 'mucho', 'frio', 'aqui', '.', '<end>']
    predicted_sentence = ['it', 's']

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.matshow(attention, cmap='viridis')

    label_font = {'fontsize': 14}
    # Only the labels are set here; tick positions stay with matplotlib's
    # defaults, and the leading '' presumably fills the first automatic tick.
    ax.set_xticklabels([''] + sentence, fontdict=label_font, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=label_font)

    plt.show()


plot_attention()
but with more elements in the list "sentence", it seems to misalign:
def plot_attention():
    """Show a 2x14 attention matrix as a heat map with token labels on both axes.

    This is the variant from the question in which the x labels end up
    misaligned; the labelling calls are deliberately left as posted.
    """
    attention = np.array([
        [7.78877574e-10, 4.04739769e-10, 6.65854022e-05, 1.63362725e-04,
         2.85054208e-04, 8.50252633e-04, 4.58042100e-02,
         7.78877574e-10, 4.04739769e-10, 6.65854022e-05, 1.63362725e-04,
         2.85054208e-04, 8.50252633e-04, 4.58042100e-02],
        [9.23501700e-02, 5.69618285e-01, 1.80586591e-01, 9.78111699e-02,
         2.71992851e-02, 9.59911197e-03, 2.54837354e-03,
         7.78877574e-10, 4.04739769e-10, 6.65854022e-05, 1.63362725e-04,
         2.85054208e-04, 8.50252633e-04, 4.58042100e-02],
    ])
    # The seven-token sentence repeated twice, giving 14 column labels.
    sentence = ['<start>', 'hace', 'mucho', 'frio', 'aqui', '.', '<end>'] * 2
    predicted_sentence = ['it', 's']

    fig, ax = plt.subplots(figsize=(20, 10))
    ax.matshow(attention, cmap='viridis')

    label_font = {'fontsize': 14}
    # Labels are assigned without fixing the tick positions first, so they are
    # spread over whatever ticks matplotlib chose for the wider matrix.
    ax.set_xticklabels([''] + sentence, fontdict=label_font, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=label_font)

    plt.show()


plot_attention()
I expect the x-axis to be perfectly aligned and that all elements of the x-axis are shown (not every second one as it is right now)
The problem is that you are only setting the tick-labels without specifying the positions of the ticks. Whenever you modify the tick labels, you should always first set the tick positions. So, do the following in your code
# Place one tick at every column / row index first, so that each label set
# below lands on a known position.
ax.set_xticks(range(len(sentence)))
ax.set_yticks(range(len(predicted_sentence)))
# With the positions fixed, the labels map one-to-one onto the ticks and the
# leading '' placeholder used in the question's code is not passed.
ax.set_xticklabels(sentence, fontdict=fontdict, rotation=90)
ax.set_yticklabels(predicted_sentence, fontdict=fontdict)