Plotting results (scatter graph) from DataFrame issues. Python - python

I am plotting a DataFrame as a scatter graph using this code:
My dataframe somewhat looks like this -
Sector AvgDeg
0 1 52
1 2 52
2 3 52
3 4 54
4 5 52
... ... ...
df.plot.scatter(x='Sector', y='AvgDeg', s=df['AvgDeg'], color='LightBlue',grid=True)
plt.show()
and I'm getting this result:
What I need is to draw every dot with a different color and with the corresponding legend. For example: -blue dot- 'Sector 1', -red dot- 'Sector 2', and so on.
Do you have any idea how to do this? Tks!!

What you have to do is pass the c parameter of the scatter plot a list (or array) with one entry per point.
# Minimal example: colour every point through a colormap and label each point.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
txt = ["text1", "text2", "text3", "text4"]
fig, ax = plt.subplots()
x = np.arange(1, 5)
y = np.arange(1, 5)
# c holds one value per point; each value is translated into a colour
# s is the size of each point
# cmap is the colour map used to translate the values in c
ax.scatter(x, y, s=40, cmap=cmap_light, c=np.arange(1, 5))
# display the text for each point next to that point
for label, px, py in zip(txt, x, y):
    ax.annotate(label, (px, py))
plt.show()
What this gives you as a result is -
To assign more different colors for 31 points for example you just gotta change the size...
ax.scatter(x, y,s = 40, cmap = cmap_light, c=np.arange(1, 32))
Similarly you can annotate those points by changing the txt list above.

i would do it this way:
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.style.use('ggplot')
# Every colour name registered with matplotlib; reused cyclically when the
# frame has more rows than there are names.
colorlist = list(mpl.colors.ColorConverter.colors.keys())
point_colors = [colorlist[k % len(colorlist)] for k in range(len(df))]
ax = df.plot.scatter(x='Sector', y='AvgDeg', s=df.AvgDeg*1.2, c=point_colors)
# Label each dot with its sector. Iterating the columns (instead of
# df.apply(..., axis=1)) keeps each column's own dtype in the label text.
for sector, avg_deg in zip(df['Sector'], df['AvgDeg']):
    ax.text(sector, avg_deg, 'Sector {}'.format(sector))
plt.show()
Result

Related

histogram subplots with multiple axes

I have 4 sets (i.e bbs) of 3 .csv files (i.e. replicas) with 2 columns each: time (X-Axis) and interaction Frequency (Y-Axis 1). I also need to plot error bars to a second y axis which i have achieved.
Since they have similar paths, I am reading them through filename = with %s for each set and replica.
Right now I can plot them all one after the other but what I would like to achieve would be:
Create a subplot to plot all the replicas of each set into one fig to save them together in one jpg. and not have 12 jpg. figures ​
To have a cycler of three colours for each replica so that they are distinguishable in the final figure, plus a common legend for each fig
and ultimately have a common y Axis range so that the results are visibly comparable since im comparing frequencies in a histogram.
here is my code so far:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
bbss = [60,80,100,120]
replicas = ["1", "2", "3"]
colors = ['tab:blue', 'tab:orange', 'tab:green']
# One figure per (set, replica) pair: a histogram of molecule numbers on the
# left y axis and mean distance with std error bars on a second y axis.
for bbs in bbss:
    for i, replica in enumerate(replicas):
        filename = "elliottIV_HA_IHSS/box4x4x4/bbs%s/bbpm2/NA/pH7.0/r%s/analysis/mdf/MIN_1:13.dat" % (bbs, replica)
        # raw string: "\s" is a regex class, not a Python escape sequence
        data = pd.read_csv(filename, engine='python', delimiter=r"\s+|:", names=["time", "distance", "mol", "atom"], usecols=[0,1,3,4])
        fig, ax1 = plt.subplots()
        data["mol"] -= 1
        data["mol"].plot.hist(figsize=(15, 5), rwidth=0.8, bins=range(1,int(bbs/2+2)), align="left", ax=ax1)
        plt.suptitle('Replica-{}, {} building blocks:'.format(replica, bbs), fontsize=14)
        plt.xticks(range(1,int(bbs/2+1)))
        plt.xlabel("Molecule Nr.")
        #plt.savefig('{}bbs.jpg'.format(bbs), bbox_inches='tight')
        #plt.clf()
        ax2 = ax1.twinx()
        m = data.groupby("mol").mean()["distance"].values
        s = data.groupby("mol").std()["distance"].values
        # separate name so the enumerate index `i` is not overwritten
        mol_ids = data.groupby("mol").mean().index.values
        ax2.errorbar(mol_ids, m, yerr=s, marker="o", fmt="none", color="tab:red")
        ax2.set_ylabel("Distance [nm]")
        fig.tight_layout()
        plt.show()
The dataframe looks something like this:
time distance mol atom
0 0.0 0.368683 16 3
1 1.0 0.364314 16 1
2 2.0 0.358840 16 3
3 3.0 0.321033 16 3
4 4.0 0.361127 16 3
... ... ... ... ...
249995 249995.0 0.536088 13 3
249996 249996.0 0.508320 13 3
249997 249997.0 0.475273 13 3
249998 249998.0 0.559773 13 3
249999 249999.0 0.515042 7 11
The Frequency on y-axis 1 shows the frequency of each "mol" appearing (there are 30)
X is obviously the time which there are 250k time steps
and the second y-axis shows the average of the distance column for each of the "mol" as well as the error and st.dev
Hope that clarifies it a bit - thanks!
For simplicity, I create the following data set:
data = pd.DataFrame([[0.0, 0.368683 , 16,3], [1.0, 0.364314 , 15, 1], [2.0 , 0.358840 , 16 , 3], [3.0 , 0.321033 , 17 , 3], [4.0 , 0.361127 ,17 , 3]],
columns=["time", "distance", "mol", "atom"])
Afterwards, I created the desired three plots with the following lines of code:
bbss = [60,80,100,120]
replicas = ["1", "2", "3"]
colors = ['blue', 'orange', 'green']
for bbs in bbss:
    # one figure per set, with one row of axes per replica
    fig, axes = plt.subplots(3,1)
    plt.xticks(range(1,int(bbs/2+1)))
    plt.xlabel("Molecule Nr.")
    for i, replica in enumerate(replicas):
        filename = "elliottIV_HA_IHSS/box4x4x4/bbs%s/bbpm2/NA/pH7.0/r%s/analysis/mdf/MIN_1:13.dat" % (bbs, replica)
        # data = pd.read_csv(filename, engine='python', delimiter="\s+|:", names=["time", "distance", "mol", "atom"], usecols=[0,1,3,4])
        # Shift the molecule numbers on a copy: while the read above is
        # commented out, `data` is shared by all iterations and an in-place
        # `-= 1` would decrement it once more on every pass.
        shifted = data.assign(mol=data["mol"] - 1)
        ax = axes[i]
        shifted["mol"].plot.hist(figsize=(15, 5), rwidth=0.8, bins=range(1,int(bbs/2+2)), color=colors[i], align="left", ax=ax)
        ax2 = ax.twinx()
        # mean and standard deviation of the distance per molecule, in one pass
        stats = shifted.groupby("mol")["distance"].agg(["mean", "std"])
        ax2.errorbar(stats.index.values, stats["mean"].values, yerr=stats["std"].values, marker="o", fmt="none", color="tab:red")
        ax2.set_ylabel("Distance [nm]")
        plt.suptitle('Replica-{}, {} building blocks:'.format(replica, bbs), fontsize=14)
    fig.tight_layout()
    plt.show()
Note for your solution you should uncomment the line of reading data. The resulting plot looks like:
I hope this is the solution you are looking for.

Matplotlib customize rank line plot

I have the following dataframe where it contains the best equipment in operation ranked by 1 to 300 (1 is the best, 300 is the worst) over a few days (df columns)
Equipment 21-03-27 21-03-28 21-03-29 21-03-30 21-03-31 21-04-01 21-04-02
P01-INV-1-1 1 1 1 1 1 2 2
P01-INV-1-2 2 2 4 4 5 1 1
P01-INV-1-3 4 4 3 5 6 10 10
I would like to customize a line plot (example found here) but I'm having some troubles trying to modify the example code provided:
import matplotlib.pyplot as plt
import numpy as np
def energy_rank(data, marker_width=0.1, color='blue'):
    """Build the Line2D artists for one series of a rank plot.

    Each value in ``data`` becomes a short solid horizontal segment centred
    on x = 1, 2, ..., len(data); one dashed line runs through all segment
    end points to connect consecutive values.

    Returns a list whose first element is the dashed connector, followed by
    one solid segment per value.
    """
    half = marker_width / 2
    centres = np.arange(1, len(data) + 1)
    # every value appears twice: once for each end of its segment
    y_data = np.repeat(data, 2)
    # NOTE: x_data takes the dtype of y_data, so for integer input the
    # +/- half offsets are cast to integers when they are stored
    x_data = np.empty_like(y_data)
    x_data[0::2] = centres - half
    x_data[1::2] = centres + half
    connector = plt.Line2D(x_data, y_data, lw=1, linestyle='dashed', color=color)
    segments = [
        plt.Line2D(x_data[k:k + 2], y_data[k:k + 2], lw=2, linestyle='solid', color=color)
        for k in range(0, 2 * len(data), 2)
    ]
    return [connector] + segments
data = ranks.head(4).to_numpy() #ranks is the above dataframe
artists = []
for row, color in zip(data, ('red','blue','green','magenta')):
artists.extend(energy_rank(row, color=color))
fig, ax = plt.subplots()
ax.set_xticklabels(ranks.columns) # set X axis to be dataframe columns
ax.set_xticklabels(ax.get_xticklabels(), rotation=35, fontsize = 10)
for artist in artists:
ax.add_artist(artist)
ax.set_ybound([15,0])
ax.set_xbound([.5,8.5])
When using ax.set_xticklabels(ranks.columns), for some reason, it only plots 5 of the 7 days from ranks columns, removing specifically the first and last values. I tried to duplicate those values but this did not work as well. I end up having this below:
In summary, I would like to know if it's possible to do 3 customizations:
input all dates from ranks columns on X axis
revert Y axis. ax.set_ybound([15,0]) is not working. It would make more sense to see the graph starting with 0 on top, since 1 is the most important rank to look at
add labels to the end of each line at the last day (last value on X axis). I could add the little window label, but it often gets really messy when you plot more data, so adding just the text at the end of each line would really make it look cleaner
Please let me know if those customizations are impossible to do and any help is really appreciated! Thank you in advance!
To show all the dates, use plt.xticks() and set_xbound to start at 0. To reverse the y axis, use ax.set_ylim(ax.get_ylim()[::-1]). To set the legends the way you described, you can use annotation and set the coordinates of the annotation at your last datapoint for each series.
fig, ax = plt.subplots()
# one tick per column of ranks, labelled with the column names
tick_positions = np.arange(len(ranks.columns))
plt.xticks(tick_positions, list(ranks.columns), rotation=35, fontsize=10)
plt.xlabel('Date')
plt.ylabel('Rank')
for artist in artists:
    ax.add_artist(artist)
ax.set_ybound([0,15])
# reverse the y axis so that the best rank is drawn at the top
ax.set_ylim(ax.get_ylim()[::-1])
ax.set_xbound([0,8.5])
# write each series name just right of its last data point
end_labels = (('Series 1', 2, 'red'), ('Series 2', 1, 'blue'), ('Series 3', 10, 'green'))
for text, last_rank, color in end_labels:
    ax.annotate(text, xy=(7.1, last_rank), color=color)
plt.show()
Here is the plot for the three rows of data in your sample dataframe:

Matplotlib shifts line plots along the xaxis inadvertently

I am trying to plot three lines from different Pandas Dataframes to the same subplot in matplotlib. But, when I did this I found one of the plots shifted along the xaxis (The xrange is different for each line). However, when I plot each line individually the xlimits are correct and none of them are shifted. I have tried to reproduce my problem here:
def plot_dH1(title1, title_sim, a, b, c):
    """Plot the grouped means of a[i], b[i] and c[i] as three lines per subplot.

    a, b and c are lists of DataFrames; title_sim supplies one title per
    entry. title1 is accepted but not used. The figure is saved as
    'dHdl_all.pdf'.
    """
    fig = plt.figure(figsize=(4,4))
    plt.style.use('ggplot')
    sns.set_style('ticks')
    plt.rcParams['font.size'] = 15
    n = 0
    for i in range(len(a)):
        ax = fig.add_subplot(1,1,n+1)
        # raw strings keep the LaTeX backslashes intact
        ax = a[i].groupby(level=(0, 1)).mean()[0].plot(label=r'$\chi_{180}$')
        ax = b[i].groupby(level=(0, 1)).mean()[0].plot(label='All')
        ax = c[i].groupby(level=(0, 1)).mean()[0].plot(label=r'$\chi_{90}$')
        ax.set_ylabel('$dH$')
        ax.set_xlabel(r'$\lambda_{0}$, $\lambda_{1}$')
        ax.set_title(title_sim[i])
        title = title_sim[i]
        sns.despine(offset=10, ax=ax)
        plt.xticks(rotation=90)
        # plt.yticks(range(0,350,20),range(0,350,20))
        n = n+1
    lgd = ax.legend(loc='upper center', bbox_to_anchor=(0.35, -0.8),fancybox=True, shadow=True, ncol=3)
    # plt.tight_layout()
    # fig.savefig('{}.pdf'.format('dHdl_all'))
    fig.savefig('{}.pdf'.format('dHdl_all'),bbox_extra_artists=(lgd,), bbox_inches='tight')
array = [range(10), range(10,20)]
tuples = list(zip(*array))
index = pd.MultiIndex.from_tuples(tuples)
a = [pd.DataFrame(np.random.randn(10,1), index=index)]
b = [pd.DataFrame(np.random.randn(5,1), index=index[5:])]
c = [pd.DataFrame(np.random.randn(8,1), index=index[2:])]
plot_dH1(title1, title_sim, a, b, c)
a, b, c are lists of Pandas Data Frame. I am not able to upload an image. But if you run it you will see the problem. Does anyone knows why one of the lines is shifted along the xaxis?
You'll get answers quicker and more reliably if you can provide a minimal, working example. Your supplied code was missing several imports that were renamed, the title definitions, and had several commented lines cluttering things. Using the following code, I see that all of the lines start with the same x shift:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
def plot_dH1(title1, title_sim, a, b, c):
    """Plot the grouped means of a[i], b[i] and c[i] as three lines per subplot.

    a, b and c are lists of DataFrames; title_sim supplies one title per
    entry. title1 is accepted but not used. The figure is saved as
    'dHdl_all.pdf'.
    """
    fig = plt.figure(figsize=(4,4))
    plt.style.use('ggplot')
    sns.set_style('ticks')
    plt.rcParams['font.size'] = 15
    for i in range(len(a)):
        ax = fig.add_subplot(1, 1, i + 1)
        # draw all three series on the axes created above; raw strings keep
        # the LaTeX backslashes from being read as escape sequences
        a[i].groupby(level=(0, 1)).mean()[0].plot(ax=ax, label=r'$\chi_{180}$')
        b[i].groupby(level=(0, 1)).mean()[0].plot(ax=ax, label='All')
        c[i].groupby(level=(0, 1)).mean()[0].plot(ax=ax, label=r'$\chi_{90}$')
        ax.set_ylabel('$dH$')
        ax.set_xlabel(r'$\lambda_{0}$, $\lambda_{1}$')
        ax.set_title(title_sim[i])
        # offset moves the spines 10 points away from the plotted data
        sns.despine(offset=10, ax=ax)
        plt.xticks(rotation=90)
    lgd = ax.legend(loc='upper center', bbox_to_anchor=(0.35, -0.8),fancybox=True, shadow=True, ncol=3)
    # the legend sits outside the axes, so it is passed as an extra artist
    fig.savefig('{}.pdf'.format('dHdl_all'),bbox_extra_artists=(lgd,), bbox_inches='tight')
array = [range(10), range(10,20)]
tuples = list(zip(*array))
index = pd.MultiIndex.from_tuples(tuples)
a = [pd.DataFrame(np.random.randn(10,1), index=index)]
b = [pd.DataFrame(np.random.randn(5,1), index=index[5:])]
c = [pd.DataFrame(np.random.randn(8,1), index=index[2:])]
title_sim = np.arange(10)
title1 = ''
plot_dH1(title1, title_sim, a, b, c)
produces the following plot using Python 3.5.2:
I don't see an x shift of one of the three curves plotted relative to the other two. An x offset is defined, and is controlled by the offset parameter of the line sns.despine(offset=10, ax=ax). Setting it to zero makes all of the lines adjacent to the y-axis:
No, DataFrame b is actually shifted. Look at the blue curve. Its index is defined as index[5:], which means that it should have these values:
[ 0
5 15 1.398019
6 16 0.325211
7 17 0.113059
8 18 0.814993
9 19 0.402437]
So it should start from (5 15) along the X axis, but it is actually starting from (2, 12), which means that it is shifted.

Trying to set y axis labels and ticks aligned to 2D faces

This is my plot:
If I were to draw your attention to the axis labelled 'B' you'll see that everything is not as it should be.
The plot was produced using this:
def newPoly3D(self):
    """Draw every column of the loaded data as a filled 2D face in a 3D axes.

    Each column is filled down to the global minimum and placed at its own
    position along the y axis (labelled 'B'), taken from ``vels``.
    """
    from matplotlib.cm import autumn
    # This passes a pandas dataframe of shape (data on rows x 4 columns)
    df = self.loadData()
    fig = plt.figure(figsize=(10,10))
    # fig.gca(projection=...) is no longer accepted by current Matplotlib
    ax = fig.add_subplot(111, projection='3d')
    vels = [1.42,1.11,0.81,0.50]
    which_joints = df.columns
    L = len(which_joints)
    dmin,dmax = df.min().min(),df.max().max()
    dix = df.index.values
    for i,j in enumerate(which_joints):
        # later faces are drawn more transparent and in a different colour
        ax.add_collection3d(plt.fill_between(dix,df[j],
                                             dmin,
                                             lw=1.5,
                                             alpha=0.3/float(i+1.),
                                             facecolor=autumn(i/float(L))),
                            zs=vels[i],
                            zdir='y')
    ax.grid(False)
    ax.set_xlabel('A')
    ax.set_xlim([0,df.index[-1]])
    ax.set_xticks([])
    ax.xaxis.set_ticklabels([])
    ax.set_ylabel('B')
    ax.set_ylim([0.4, max(vels)+0.075])
    # one tick per face position
    ax.set_yticks(vels)
    ax.tick_params(direction='out', pad=10)
    ax.set_zlabel('C')
    ax.set_zlim([dmin,dmax])
    ax.xaxis.labelpad = -10
    ax.yaxis.labelpad = 15
    ax.zaxis.labelpad = 15
    # Note the inversion of the axis
    plt.gca().invert_yaxis()
First I want to align the ticks on the yaxis (labelled B) with each coloured face. As you can see they are now offset slightly down.
Second, I want to align the y-axis tick labels with the above; as you can see, they are currently very much offset downwards. I do not know why.
EDIT:
Here is some example data; each column represents one coloured face on the above plot.
-13.216256 -7.851065 -9.965357 -25.502654
-13.216253 -7.851063 -9.965355 -25.502653
-13.216247 -7.851060 -9.965350 -25.502651
-13.216236 -7.851052 -9.965342 -25.502647
-13.216214 -7.851038 -9.965324 -25.502639
-13.216169 -7.851008 -9.965289 -25.502623
-13.216079 -7.850949 -9.965219 -25.502592
-13.215900 -7.850830 -9.965078 -25.502529
Here we are again, with a simpler plot, reproduced with this data:
k = 10
df = pd.DataFrame(np.array([range(k),
[x + 1 for x in range(k)],
[x + 4 for x in range(k)],
[x + 9 for x in range(k)]]).T,columns=list('abcd'))
If you want to try this with the above function, comment out the df line in the function and change its argument like so: def newPoly3D(df): so that you can pass the test df above.

How to label the bars of a stacked bar plot from a pandas DataFrame? [duplicate]

I am trying to replicate the following image in matplotlib and it seems barh is my only option. Though it appears that you can't stack barh graphs so I don't know what to do
If you know of a better python library to draw this kind of thing, please let me know.
This is all I could come up with as a start:
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
people = ('A','B','C','D','E','F','G','H')
y_pos = np.arange(len(people))
bottomdata = 3 + 10 * np.random.rand(len(people))
topdata = 3 + 10 * np.random.rand(len(people))
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)
ax.barh(y_pos, bottomdata,color='r',align='center')
ax.barh(y_pos, topdata,color='g',align='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(people)
ax.set_xlabel('Distance')
plt.show()
I would then have to add labels individually using ax.text which would be tedious. Ideally I would like to just specify the width of the part to be inserted then it updates the center of that section with a string of my choosing. The labels on the outside (e.g. 3800) I can add myself later, it is mainly the labeling over the bar section itself and creating this stacked method in a nice way I'm having problems with. Can you even specify a 'distance' i.e. span of color in any way?
Edit 2: for more heterogeneous data. (I've left the above method since I find it more usual to work with the same number of records per series)
Answering the two parts of the question:
a) barh returns a container of handles to all the patches that it drew. You can use the coordinates of the patches to aid the text positions.
b) Following these two answers to the question that I noted before (see Horizontal stacked bar chart in Matplotlib), you can stack bar graphs horizontally by setting the 'left' input.
and additionally c) handling data that is less uniform in shape.
One way to handle data that is less uniform in shape is simply to process each segment independently, as shown below.
import numpy as np
import matplotlib.pyplot as plt
# some labels for each row
people = ('A','B','C','D','E','F','G','H')
r = len(people)
# how many data points overall (average of 3 per person)
n = r * 3
# which person does each segment belong to?
rows = np.random.randint(0, r, (n,))
# how wide is the segment?
widths = np.random.randint(3,12, n,)
# what label to put on the segment (xrange in py2.7, range for py3)
labels = range(n)
colors ='rgbwmc'
patch_handles = []
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)
# running right-hand edge of each person's stack
left = np.zeros(r,)
# number of segments drawn so far per person; selects the next colour
row_counts = np.zeros(r,)
# `row` (not `r`) is the loop variable so that r keeps holding len(people)
for (row, w, l) in zip(rows, widths, labels):
    print(row, w, l)
    patch_handles.append(ax.barh(row, w, align='center', left=left[row],
                                 color=colors[int(row_counts[row]) % len(colors)]))
    left[row] += w
    row_counts[row] += 1
    # we know there is only one patch but could enumerate if expanded
    patch = patch_handles[-1][0]
    bl = patch.get_xy()
    # centre of the segment that was just drawn
    x = 0.5*patch.get_width() + bl[0]
    y = 0.5*patch.get_height() + bl[1]
    ax.text(x, y, "%d%%" % (l), ha='center',va='center')
y_pos = np.arange(r)
ax.set_yticks(y_pos)
ax.set_yticklabels(people)
ax.set_xlabel('Distance')
plt.show()
Which produces a graph like this , with a different number of segments present in each series.
Note that this is not particularly efficient since each segment uses an individual call to ax.barh. There may be more efficient methods (e.g. by padding a matrix with zero-width segments or nan values) but this is likely to be problem-specific and is a distinct question.
Edit: updated to answer both parts of the question.
import numpy as np
import matplotlib.pyplot as plt
people = ('A','B','C','D','E','F','G','H')
segments = 4
# random bar widths (one row per segment) and unrelated, arbitrary labels
data = 3 + 10* np.random.rand(segments, len(people))
percentages = (np.random.randint(5,20, (len(people), segments)))
y_pos = np.arange(len(people))
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)
colors ='rgbwmc'
patch_handles = []
# every stack starts at zero; grows by the widths already drawn
left = np.zeros(len(people))
for i, d in enumerate(data):
    segment_color = colors[i%len(colors)]
    container = ax.barh(y_pos, d, color=segment_color, align='center', left=left)
    patch_handles.append(container)
    # the next segment begins where this one ends
    left += d
# write a label at the centre of every bar segment
for j, container in enumerate(patch_handles):
    for i, patch in enumerate(container.get_children()):
        bl = patch.get_xy()
        x = 0.5*patch.get_width() + bl[0]
        y = 0.5*patch.get_height() + bl[1]
        ax.text(x,y, "%d%%" % (percentages[i,j]), ha='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(people)
ax.set_xlabel('Distance')
plt.show()
You can achieve a result along these lines (note: the percentages I used have nothing to do with the bar widths, as the relationship in the example seems unclear):
See Horizontal stacked bar chart in Matplotlib for some ideas on stacking horizontal bar plots.
Imports and Test DataFrame
Tested in python 3.10, pandas 1.4.2, matplotlib 3.5.1, seaborn 0.11.2
For vertical stacked bars see Stacked Bar Chart with Centered Labels
import pandas as pd
import numpy as np
# create sample data as shown in the OP
np.random.seed(365)
people = ('A','B','C','D','E','F','G','H')
bottomdata = 3 + 10 * np.random.rand(len(people))
topdata = 3 + 10 * np.random.rand(len(people))
# create the dataframe
df = pd.DataFrame({'Female': bottomdata, 'Male': topdata}, index=people)
# display(df)
Female Male
A 12.41 7.42
B 9.42 4.10
C 9.85 7.38
D 8.89 10.53
E 8.44 5.92
F 6.68 11.86
G 10.67 12.97
H 6.05 7.87
Updated with matplotlib v3.4.2
Use matplotlib.pyplot.bar_label
See How to add value labels on a bar chart for additional details and examples with .bar_label.
labels = [f'{v.get_width():.2f}%' if v.get_width() > 0 else '' for v in c ] for python < 3.8, without the assignment expression (:=).
Plotted using pandas.DataFrame.plot with kind='barh'
ax = df.plot(kind='barh', stacked=True, figsize=(8, 6))
for c in ax.containers:
    # build the labels by hand so that a missing section gets an empty label
    labels = []
    for v in c:
        w = v.get_width()
        labels.append(f'{w:.2f}%' if w > 0 else '')
    # place each label in the middle of its section
    ax.bar_label(c, labels=labels, label_type='center')
    # if there are no nan or 0 length sections, the label list above is not needed; uncomment and use the next line instead, which adds the % through fmt
    # ax.bar_label(c, fmt='%.2f%%', label_type='center')
# put the legend outside the axes, to the right
ax.legend(bbox_to_anchor=(1.025, 1), loc='upper left', borderaxespad=0.)
# axis titles
ax.set_ylabel("People", fontsize=18)
ax.set_xlabel("Percent", fontsize=18)
plt.show()
Using seaborn
sns.barplot does not have an option for stacked bar plots, however, sns.histplot and sns.displot can be used to create horizontal stacked bars.
seaborn typically requires the dataframe to be in a long, instead of wide, format, so use pandas.DataFrame.melt to reshape the dataframe.
Reshape dataframe
# convert the dataframe to a long form
df = df.reset_index()
df = df.rename(columns={'index': 'People'})
dfm = df.melt(id_vars='People', var_name='Gender', value_name='Percent')
# display(dfm)
People Gender Percent
0 A Female 12.414557
1 B Female 9.416027
2 C Female 9.846105
3 D Female 8.885621
4 E Female 8.438872
5 F Female 6.680709
6 G Female 10.666258
7 H Female 6.050124
8 A Male 7.420860
9 B Male 4.104433
10 C Male 7.383738
11 D Male 10.526158
12 E Male 5.916262
13 F Male 11.857227
14 G Male 12.966913
15 H Male 7.865684
sns.histplot: axes-level plot
fig, axe = plt.subplots(figsize=(8, 6))
sns.histplot(data=dfm, y='People', hue='Gender', discrete=True, weights='Percent', multiple='stack', ax=axe)
# label every bar section of every container at its centre
for container in axe.containers:
    axe.bar_label(container, fmt='%.2f%%', label_type='center')
axe.set_xlabel('Percent')
plt.show()
sns.displot: figure-level plot
g = sns.displot(data=dfm, y='People', hue='Gender', discrete=True, weights='Percent', multiple='stack', height=6)
# iterate through each facet / subplot
for axe in g.axes.flat:
    # iterate through each set of containers and label its bar sections
    for container in axe.containers:
        axe.bar_label(container, fmt='%.2f%%', label_type='center')
    axe.set_xlabel('Percent')
plt.show()
Original Answer - before matplotlib v3.4.2
The easiest way to plot a horizontal or vertical stacked bar, is to load the data into a pandas.DataFrame
This will plot, and annotate correctly, even when all categories ('People'), don't have all segments (e.g. some value is 0 or NaN)
Once the data is in the dataframe:
It's easier to manipulate and analyze
It can be plotted with the matplotlib engine, using:
pandas.DataFrame.plot.barh
label_text = f'{width}' for annotations
pandas.DataFrame.plot.bar
label_text = f'{height}' for annotations
SO: Vertical Stacked Bar Chart with Centered Labels
These methods return a matplotlib.axes.Axes or a numpy.ndarray of them.
Using the .patches method unpacks a list of matplotlib.patches.Rectangle objects, one for each of the sections of the stacked bar.
Each .Rectangle has methods for extracting the various values that define the rectangle.
Each .Rectangle is in order from left the right, and bottom to top, so all the .Rectangle objects, for each level, appear in order, when iterating through .patches.
The labels are made using an f-string, label_text = f'{width:.2f}%', so any additional text can be added as needed.
Plot and Annotate
Plotting the bar, is 1 line, the remainder is annotating the rectangles
# plot the dataframe with 1 line
ax = df.plot.barh(stacked=True, figsize=(8, 6))
# ax.patches holds the rectangles drawn for the bar sections
for rect in ax.patches:
    width = rect.get_width()
    # only sections with a positive width get a label
    if not (width > 0):
        continue
    height = rect.get_height()
    # for a horizontal bar the width of the section is used as the label
    label_text = f'{width:.2f}%'  # f'{width:.2f}' to format decimal values
    # centre of the rectangle
    label_x = rect.get_x() + width / 2
    label_y = rect.get_y() + height / 2
    ax.text(label_x, label_y, label_text, ha='center', va='center', fontsize=8)
# put the legend outside the axes, to the right
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# axis titles
ax.set_ylabel("People", fontsize=18)
ax.set_xlabel("Percent", fontsize=18)
plt.show()
Example with Missing Segment
# set one of the dataframe values to 0
df.iloc[4, 1] = 0
Note the annotations are all in the correct location from df.
For this case, the above answers work perfectly. The issue I had, and didn't find a plug-and-play solution online, was that I often have to plot stacked bars in multi-subplot figures, with many values, which tend to have very non-homogenous amplitudes.
(Note: I work usually with pandas dataframes, and matplotlib. I couldn't make the bar_label() method of matplotlib to work all the times.)
So, I just give a kind of ad-hoc, but easily generalizable solution. In this example, I was working with single-row dataframes (for power-exchange monitoring purposes per hour), so, my dataframe (df) had just one row.
(I provide an example figure to show how this can be useful in very densely-packed plots)
[enter image description here][1]
[1]: https://i.stack.imgur.com/9akd8.png
def magic_stacked_bar(df, waterfall=False, cyclic_offset_x=None, cyclic_offset_y=None, ax=None):
    '''
    Produce a stacked, horizontal bar plot with one annotated section per column.

    df --> pandas dataframe. Columns are used as the iterator, and only the first value of each column is used.
    waterfall --> bool: if True, apart from the stack-direction, also a perpendicular offset is added.
    cyclic_offset_x --> list (of any length) or None: loop through these values to use as x-offset pixels.
    cyclic_offset_y --> list (of any length) or None: loop through these values to use as y-offset pixels.
    ax --> matplotlib Axes, or None: if None, creates a new axis and figure.

    Returns ax when an Axes was passed in. When the Axes is created here,
    the figure is titled, shown, and None is returned.
    '''
    # `is None` also works when an offset is passed as a numpy array
    if cyclic_offset_x is None:
        cyclic_offset_x = [0, 0]
    if cyclic_offset_y is None:
        cyclic_offset_y = [0, 0]
    created_axes = ax is None
    if created_axes:
        fig, ax = plt.subplots()
        fig.set_size_inches(19, 10)
    prev = 0  # summation variable to make it stacked
    for position, c in enumerate(df.columns):
        value = df[c].values[0]
        if waterfall:
            y, label = c, ""  # bidirectional stack
        else:
            y, label = 0, c  # unidirectional stack
        ax.barh(y=y, width=value, height=1, left=prev, label=label)
        prev += value  # add to sum-stack
        # cycle through the offsets so neighbouring labels do not overlap
        offset_x = cyclic_offset_x[position % len(cyclic_offset_x)]
        offset_y = cyclic_offset_y[position % len(cyclic_offset_y)]
        # anchor the arrow at the horizontal centre of the section just drawn
        ax.annotate(text="{}".format(int(value)), xy=(prev - value / 2, y),
                    xytext=(offset_x, offset_y), textcoords='offset pixels',
                    ha='center', va='top', fontsize=8,
                    arrowprops=dict(facecolor='black', shrink=0.01, width=0.3, headwidth=0.3),
                    bbox=dict(boxstyle='round', facecolor='grey', alpha=0.5))
    if not waterfall:
        ax.legend()  # if waterfall, the index annotates the columns. If
        # waterfall ==False, the legend annotates the columns
    if created_axes:
        ax.set_title("Voi la")
        ax.set_xlabel("UltraWatts")
        plt.show()
    else:
        return ax
''' (Sometimes, it is more tedious and requires some custom functions to make the labels look alright.)
'''
A, B = 80,80
n_units = df.shape[1]
cyclic_offset_x = -A*np.cos(2*np.pi / (2*n_units) *np.arange(n_units))
cyclic_offset_y = B*np.sin(2*np.pi / (2*n_units) * np.arange(n_units)) + B/2

Categories