How to annotate a seaborn barplot with the aggregated value - python

How can the following code be modified to show the mean as well as the different error bars on each bar of the bar plot?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")
a,b,c,d = [],[],[],[]
for i in range(1,5):
np.random.seed(i)
a.append(np.random.uniform(35,55))
b.append(np.random.uniform(40,70))
c.append(np.random.uniform(63,85))
d.append(np.random.uniform(59,80))
data_df =pd.DataFrame({'stages':[1,2,3,4],'S1':a,'S2':b,'S3':c,'S4':d})
print("Delay:")
display(data_df)
S1 S2 S3 S4
0 43.340440 61.609735 63.002516 65.348984
1 43.719898 40.777787 75.092575 68.141770
2 46.015958 61.244435 69.399904 69.727380
3 54.340597 56.416967 84.399056 74.011136
meansd_df=data_df.describe().loc[['mean', 'std'],:].drop('stages', axis = 1)
display(meansd_df)
sns.set()
sns.set_style('darkgrid',{"axes.facecolor": ".92"}) # (1)
sns.set_context('notebook')
fig, ax = plt.subplots(figsize = (8,6))
x = meansd_df.columns
y = meansd_df.loc['mean',:]
yerr = meansd_df.loc['std',:]
plt.xlabel("Time", size=14)
plt.ylim(-0.3, 100)
width = 0.45
for i, j,k in zip(x,y,yerr): # (2)
ax.bar(i,j, width, yerr = k, edgecolor = "black",
error_kw=dict(lw=1, capsize=8, capthick=1)) # (3)
ax.set(ylabel = 'Delay')
from matplotlib import ticker
ax.yaxis.set_major_locator(ticker.MultipleLocator(10))
plt.savefig("Over.png", dpi=300, bbox_inches='tight')

Given the example data, for a seaborn.barplot with capped error bars, data_df must be converted from a wide format, to a tidy (long) format, which can be accomplished with pandas.DataFrame.stack or pandas.DataFrame.melt
It is also important to keep in mind that a bar plot shows only the mean (or other estimator) value
Sample Data and DataFrame
.iloc[:, 1:] is used to skip the 'stages' column at column index 0.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# given data_df from the OP, select the columns except stage and reshape to long format
df = data_df.iloc[:, 1:].melt(var_name='set', value_name='val')
# display(df.head())
set val
0 S1 43.340440
1 S1 43.719898
2 S1 46.015958
3 S1 54.340597
4 S2 61.609735
Updated as of matplotlib v3.4.2
Use matplotlib.pyplot.bar_label
See How to add value labels on a bar chart for additional details and examples with .bar_label.
Some formatting can be done with the fmt parameter, but more sophisticated formatting should be done with the labels parameter, as show in How to add multiple annotations to a barplot.
Tested with seaborn v0.11.1, which is using matplotlib as the plot engine.
fig, ax = plt.subplots(figsize=(8, 6))
# add the plot
sns.barplot(x='set', y='val', data=df, capsize=0.2, ax=ax)
# add the annotation
ax.bar_label(ax.containers[-1], fmt='Mean:\n%.2f', label_type='center')
ax.set(ylabel='Mean Time')
plt.show()
plot with seaborn.barplot
Using matplotlib before version 3.4.2
The default for the estimator parameter is mean, so the height of the bar is the mean of the group.
The bar height is extracted from p with .get_height, which can be used to annotate the bar.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x='set', y='val', data=df, capsize=0.2, ax=ax)
# show the mean
for p in ax.patches:
h, w, x = p.get_height(), p.get_width(), p.get_x()
xy = (x + w / 2., h / 2)
text = f'Mean:\n{h:0.2f}'
ax.annotate(text=text, xy=xy, ha='center', va='center')
ax.set(xlabel='Delay', ylabel='Time')
plt.show()

Seaborn is most powerfull with long form data. So you might want to transform your data, something like this:
sns.barplot(data=data_df.melt('stages', value_name='Delay', var_name='Time'),
x='Time', y='Delay',
capsize=0.1, edgecolor='k')
Output:

Related

adjusting horizontal bar chart matplotlib to accommodate the bars

I am doing a horizontal bar chart but struggling with adjusting ylim, or maybe another parameter to make my labels clearer and make all the labels fit the y axis . I played around with ylim and the text size can be bigger or smaller but the bars do not fit the y axis. Any idea about the right approach?
My code:
import matplotlib.pyplot as plt #we load the library that contains the plotting capabilities
from operator import itemgetter
D=[]
for att, befor, after in zip(df_portion['attributes'], df_portion['2005_2011 (%)'], df_portion['2012_2015 (%)']):
i=(att, befor, after)
D.append(i)
Dsort = sorted(D, key=itemgetter(1), reverse=False) #sort the list in order of usage
attri = [x[0] for x in Dsort]
aft = [x[1] for x in Dsort]
bef = [x[2] for x in Dsort]
ind = np.arange(len(attri))
width=3
ax = plt.subplot(111)
ax.barh(ind, aft, width,align='center',alpha=1, color='r', label='from 2012 to 2015') #a horizontal bar chart (use .bar instead of .barh for vertical)
ax.barh(ind - width, bef, width, align='center', alpha=1, color='b', label='from 2005 to 2008') #a horizontal bar chart (use .bar instead of .barh for vertical)
ax.set(yticks=ind, yticklabels=attri,ylim=[1, len(attri)/2])
plt.xlabel('Frequency distribution (%)')
plt.title('Frequency distribution (%) of common attributes between 2005_2008 and between 2012_2015')
plt.legend()
plt.show()
This is the plot for above code
To make the labels fit, you need to set a smaller fontsize, or use a larger figsize. Changing the ylim will either just show a subset of the bars (in case ylim is set too narrow), or will show more whitespace (when ylim is larger).
The biggest problem in the code is width being too large. Twice the width needs to fit over a distance of 1.0 (the ticks are placed via ind, which is an array 0,1,2,...). As matplotlib calls the thickness of a horizontal bar plot "height", this name is used in the example code below. Using align='edge' lets you position the bars directly (align='center' will move them half their "height").
Pandas has simple functions to sort dataframes according to one or more rows.
Code to illustrate the ideas:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# first create some test data
df = pd.DataFrame({'attributes': ["alpha", "beta", "gamma", "delta", "epsilon", "zata", "eta", "theta", "iota",
"kappa", "lambda", "mu", "nu", "xi", "omikron", "pi", "rho", "sigma", "tau",
"upsilon", "phi", "chi", "psi", "omega"]})
totals_2005_2011 = np.random.uniform(100, 10000, len(df))
totals_2012_2015 = totals_2005_2011 * np.random.uniform(0.70, 2, len(df))
df['2005_2011 (%)'] = totals_2005_2011 / totals_2005_2011.sum() * 100
df['2012_2015 (%)'] = totals_2012_2015 / totals_2012_2015.sum() * 100
# sort all rows via the '2005_2011 (%)' column, sort from large to small
df = df.sort_values('2005_2011 (%)', ascending=False)
ind = np.arange(len(df))
height = 0.3 # two times height needs to be at most 1
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(ind, df['2012_2015 (%)'], height, align='edge', alpha=1, color='crimson', label='from 2012 to 2015')
ax.barh(ind - height, df['2005_2011 (%)'], height, align='edge', alpha=1, color='dodgerblue', label='from 2005 to 2011')
ax.set_yticks(ind)
ax.set_yticklabels(df['attributes'], fontsize=10)
ax.grid(axis='x')
ax.set_xlabel('Frequency distribution (%)')
ax.set_title('Frequency distribution (%) of common attributes between 2005_2011 and between 2012_2015')
ax.legend()
ax.margins(y=0.01) # use smaller margins in the y-direction
plt.tight_layout()
plt.show()
The seaborn library has some functions to create barplots with multiple bars per attribute, without the need to manually fiddle with bar positions. Seaborn prefers its data in "long form", which can be created via pandas' melt().
Example code:
import seaborn as sns
df = df.sort_values('2005_2011 (%)', ascending=True)
df_long = df.melt(id_vars='attributes', value_vars=['2005_2011 (%)', '2012_2015 (%)'],
var_name='period', value_name='distribution')
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df_long, y='attributes', x='distribution', hue='period', palette='turbo', ax=ax)
ax.set_xlabel('Frequency distribution (%)')
ax.set_title('Frequency distribution (%) of common attributes between 2005_2011 and between 2012_2015')
ax.grid(axis='x')
ax.tick_params(axis='y', labelsize=12)
sns.despine()
plt.tight_layout()
plt.show()

How do I flip the color scaling of seaborn's sns.heatmap?

I have this piece of code that compares chess openings to their outcomes:
z = df2.groupby(["winner", "opening_name"]).size().unstack().fillna(0).astype(int)
fig, ax = plt.subplots(figsize=(32, 16))
sns.heatmap(z.apply(lambda x: x/x.sum()), xticklabels=True, yticklabels=True, cmap='YlOrBr',
annot=True, linewidths=0.005, linecolor='black', annot_kws={"fontsize":16}, fmt='.2f', cbar=False)
plt.xticks(fontsize = 16)
plt.yticks(fontsize=16)
plt.show()
del z
This is the result:
Is there a way to change seaborn's sns.heatmap's configurations so that it applies the color scaling horizontally instead of vertically? Without changing the given values?
If I change the method's axis(z.apply(lambda x: x/x.sum(), axis = 1), it also changes the actual outcomes:
I want to apply the horizontal color scaling of the 2nd picture to the data of the first picture.
Instead of just setting annot=True, it can also be an array of values (or o labels). You can use a different data= parameter to define the coloring.
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
np.random.seed(31416)
weights = np.random.rand(20) ** 1.5
weights /= weights.sum() # random weights summing to 1
N = 2000
df2 = pd.DataFrame({'winner': np.random.choice(['white', 'draw', 'black'], N, p=[0.47, 0.07, 0.46]),
'opening_name': np.random.choice([*'ABCDEFGHIJKLMNOPQRST'], N, p=weights)})
z = df2.groupby(["winner", "opening_name"]).size().unstack().fillna(0).astype(int)
fig, ax = plt.subplots(figsize=(32, 16))
sns.heatmap(data=z.apply(lambda x: x / x.sum(), axis=1),
annot=z.apply(lambda x: x / x.sum()),
xticklabels=True, yticklabels=True, cmap='YlOrBr',
linewidths=0.005, linecolor='black', annot_kws={"fontsize": 16}, fmt='.2f', cbar=False, ax=ax)
ax.tick_params(labelsize=16)
plt.tight_layout()
plt.show()

Replicate distplot with rug without histogram

As I go through online tutorials and\or articles in general, when I encounter a plot that uses
the Seaborn distplot plot I re-create it using either histplot or displot.
I do this because distplot is deprecated and I want to re-write the code using newer standards.
I am going through this article: https://www.kite.com/blog/python/data-analysis-visualization-python/
and there is a section using distplot whose output I cannot replicate.
This is the section of code that I am trying to replicate:
col_names = ['StrengthFactor', 'PriceReg', 'ReleaseYear', 'ItemCount', 'LowUserPrice', 'LowNetPrice']
fig, ax = plt.subplots(len(col_names), figsize=(8, 40))
for i, col_val in enumerate(col_names):
x = sales_data_hist[col_val][:1000]
sns.distplot(x, ax=ax[i], rug=True, hist=False)
outliers = x[percentile_based_outlier(x)]
ax[i].plot(outliers, np.zeros_like(outliers), 'ro', clip_on=False)
ax[i].set_title('Outlier detection - {}'.format(col_val), fontsize=10)
ax[i].set_xlabel(col_val, fontsize=8)
plt.show()
Both the distplot itself and the axis variable are no longer used. The code, for now, runs.
In a nutshell, all I am trying to do is replicate the exact output of the code above (rug plot, the red dots representing the removed values, etc.) without using deprecated code.
I have tried various combinations of displot and histplot but I have been unable to get the exact same output any other way.
The sns.kdeplot() function shows the kde curve available in distplot. (In fact, distplot just calls kdeplot internally). Similarly, there is sns.rugplot() to show the rug.
Here is an example with the easier to replicate iris dataset:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
def percentile_based_outlier(data, threshold=95):
diff = (100 - threshold) / 2
minval, maxval = np.percentile(data, [diff, 100 - diff])
return (data < minval) | (data > maxval)
iris = sns.load_dataset('iris')
col_names = [col for col in iris.columns if iris[col].dtype == 'float64'] # the numerical columns
fig, axs = plt.subplots(len(col_names), figsize=(5, 12))
for ax, col_val in zip(axs, col_names):
x = iris[col_val]
sns.kdeplot(x, ax=ax)
sns.rugplot(x, ax=ax, color='C0')
outliers = x[percentile_based_outlier(x)]
ax.plot(outliers, np.zeros_like(outliers), 'ro', clip_on=False)
ax.set_title(f'Outlier detection - {col_val}', fontsize=10)
ax.set_xlabel('') # ax[i].set_xlabel(col_val, fontsize=8)
plt.tight_layout()
plt.show()
To use displot, the dataframe can be converted to "long form" via pd.melt(). The outliers can be added via a custom function called by g.map_dataframe(...):
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
def percentile_based_outlier(data, threshold=95):
diff = (100 - threshold) / 2
minval, maxval = np.percentile(data, [diff, 100 - diff])
return (data < minval) | (data > maxval)
def show_outliers(data, color):
col_name = data['variable'].values[0]
x = data['value'].to_numpy()
outliers = x[percentile_based_outlier(x)]
plt.plot(outliers, np.zeros_like(outliers), 'ro', clip_on=False)
plt.xlabel('')
iris = sns.load_dataset('iris')
col_names = [col for col in iris.columns if iris[col].dtype == 'float64'] # the numerical columns
iris_long = iris.melt(value_vars=col_names)
g = sns.displot(data=iris_long, x='value', kind='kde', rug=True, row='variable',
height=2.2, aspect=3,
facet_kws={'sharey': False, 'sharex': False})
g.map_dataframe(show_outliers)

Multiple histograms from multiple dataframes into one in pandas

I am beginner in python and pandas
I have three CSV data. I want to make one histogram from these three dataframe.
I used this code
import pandas as pd
from matplotlib import pyplot as plt
X = pd.read_csv("data1.csv")
Y = pd.read_csv("data2.csv")
Z = pd.read_csv("data3.csv")
X.hist(column='speed', weights=X.ID,figsize=(20,10), stacked=True, bins=50, color = 'Blue', )
Y.hist(column='speed', weights=Y.ID,figsize=(20,10), stacked=True, bins=50, color = 'Red')
Z.hist(column='speed', weights=Z.ID,figsize=(20,10), stacked=True, bins=50, color = 'Grey')
plt.rc('xtick',labelsize=25)
plt.rc('ytick',labelsize=25)
but I got three different histograms.
How to make these three into one histogram with three colour of each histogram included?
I would go for this:
import pandas as pd
import matplotlib.pyplot as plt
X = pd.read_csv("data1.csv")
Y = pd.read_csv("data2.csv")
Z = pd.read_csv("data3.csv")
plt.hist([X.speed.values.flatten(), Y.speed.values.flatten(), Z.speed.values.flatten()], weights=[X.ID.values.flatten(), Y.ID.values.flatten(), Z.values.flatten()], label=['X', 'Y', 'Z'])
plt.legend()
plt.rc('xtick', labelsize=25)
plt.rc('ytick', labelsize=25)
Merge your dataframes:
X = pd.read_csv("data1.csv")
Y = pd.read_csv("data2.csv")
Z = pd.read_csv("data3.csv")
df = X.merge(Y).merge(Z)
df.hist(...)

Python Create Bar Chart Comparing 2 sets of data

I have a notebook with 2* bar charts, one is winter data & one is summer data. I have counted the total of all the crimes and plotted them in a bar chart, using code:
ax = summer["crime_type"].value_counts().plot(kind='bar')
plt.show()
Which shows a graph like:
I have another chart nearly identical, but for winter:
ax = winter["crime_type"].value_counts().plot(kind='bar')
plt.show()
And I would like to have these 2 charts compared against one another in the same bar chart (Where every crime on the x axis has 2 bars coming from it, one winter & one summer).
I have tried, which is just me experimenting:
bx = (summer["crime_type"],winter["crime_type"]).value_counts().plot(kind='bar')
plt.show()
Any advice would be appreciated!
The following generates dummies of your data and does the grouped bar chart you wanted:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
s = "Crime Type Summer|Crime Type Winter".split("|")
# Generate dummy data into a dataframe
j = {x: [random.choice(["ASB", "Violence", "Theft", "Public Order", "Drugs"]
) for j in range(300)] for x in s}
df = pd.DataFrame(j)
index = np.arange(5)
bar_width = 0.35
fig, ax = plt.subplots()
summer = ax.bar(index, df["Crime Type Summer"].value_counts(), bar_width,
label="Summer")
winter = ax.bar(index+bar_width, df["Crime Type Winter"].value_counts(),
bar_width, label="Winter")
ax.set_xlabel('Category')
ax.set_ylabel('Incidence')
ax.set_title('Crime incidence by season, type')
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(["ASB", "Violence", "Theft", "Public Order", "Drugs"])
ax.legend()
plt.show()
With this script I got:
You can check out the demo in the matplotlib docs here: https://matplotlib.org/gallery/statistics/barchart_demo.html
The important thing to note is the index!
index = np.arange(5) # Set an index of n crime types
...
summer = ax.bar(index, ...)
winter = ax.bar(index+bar_width, ...)
...
ax.set_xticks(index + bar_width / 2)
These are the lines that arrange the bars on the horizontal axis so that they are grouped together.
Create a pandas dataframe with 3 columns crimetype, count, Season and try this function.
#Importing required packages
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import MaxNLocator
#Function Creation
def plt_grouped_bar(Plot_Nm,group_bar,x, y,plt_data,**bar_kwargs):
plt_fig=plt.figure(figsize=(18,9))
ax=plt_fig.add_subplot()
g = sns.catplot(x=x, y=y, hue=group_bar,data=plt_data,ax=ax,kind="bar",**bar_kwargs)
for p in ax.patches:
height = p.get_height()
ax.text(x = p.get_x()+(p.get_width()/2),
y = height+0.05,
s = '{:.0f}'.format(height),
ha = 'center',va = 'bottom',zorder=20, rotation=90)
ax.set_title(Plot_Nm,fontweight="bold",fontsize=18,alpha=0.7,y=1.03)
g.set_xticklabels(x,fontsize=10,alpha=0.8,fontweight="bold")
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_yticklabels("")
ax.set_xlabel("")
ax.set_ylabel("")
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
ax.tick_params(axis=u'both',length=0)
ax.legend(loc='upper right')
for spine in ax.spines:
ax.spines[spine].set_visible(False)
plt.close()
#Calling the function
plt_grouped_bar('Title of bar','weather','crimetype','count',pandasdataframename)

Categories