How to show the value of quantiles in a boxplot - python

I managed to get a boxplot with 2 categories on the x-axis and a continuous variable on the y-axis. I just want to add the values of the quartiles to the plot, next to the boxes.
Like this:

Here is an example:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Reproducible demo data: a two-level category and a uniform random value.
np.random.seed(0)
df = pd.DataFrame({
    'churn': np.random.choice(['No', 'Si'], size=1000),
    'value': np.random.random(size=1000),
})
box_width = 0.5

# Fix the category order explicitly. Left to itself, seaborn places string
# categories in order of first appearance, which is not guaranteed to be the
# order in which the quartiles are computed below; sharing one `order` list
# guarantees that position i on the x-axis belongs to order[i].
order = sorted(df['churn'].unique())
ax = sns.boxplot(data=df, x='churn', y='value', width=box_width, order=order)

values_by_churn = df.groupby('churn')['value']
for i, name in enumerate(order):
    q1, q3 = values_by_churn.get_group(name).quantile([0.25, 0.75])
    # Box i is centred on x == i, so its left edge is half a width to the left.
    x = i - box_width / 2
    for q in (q1, q3):
        # The arrow tip touches the box edge at the quartile; the label sits
        # slightly further left, right-aligned so it never overlaps the box.
        ax.annotate(f'{q:.2f}', (x, q),
                    xytext=(x - 0.1, q), textcoords='data',
                    arrowprops=dict(facecolor='black', shrink=0.05),
                    va='center', ha='right')

Related

How can I create a series of density contours in a row ( i.e. subplots)

I can create a density contour plot with
from astropy.table import Table, join
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy import stats
# CLEAN Data
RErange = Table.read('../../GAMA_Data/REMassEClassEmeasure.fits')

# Keep only the rows where each of these flag columns equals 0
# (applied one after another, in this order).
for flag_column in ('SurfaceDensityFlag', 'AGEDenParFlag', 'CountInCylFlag'):
    RErange = RErange[RErange[flag_column] == 0]
# Optional extra cut, currently disabled: keep SurfaceDensity below 50.
RErange = RErange[RErange['uminusr'] > 0.001]
print(RErange.colnames)

# Columns plotted on the y and x axes.
yfield = 'uminusr'
xfield = 'logmstar'

# Optional seaborn style, currently disabled: sns.set_style("white")
df = RErange.to_pandas()

# Bivariate KDE of the two selected columns.
sns.displot(df, x=xfield, y=yfield, kind="kde")
plt.show()
But how can I create several of them (3 in a row) as subplots?
Seaborn's displot does not seem to accept an axes argument.
Solution does not have to use seaborn.
Okay, resolved by converting the table to a DataFrame, which resolved the endianness problem, and then using kdeplot, which accepts an axes:
# Convert the table to a pandas DataFrame before handing it to seaborn.
df = RErange.to_pandas()

fig = plt.figure(figsize=(12, 16), dpi=200)
ax1 = fig.add_subplot(1, 1, 1)

# kdeplot is an axes-level function, so it can draw into a chosen subplot.
# The columns are passed by keyword: recent seaborn releases no longer accept
# the x and y vectors as positional arguments.
# For several panels in a row, create the axes with plt.subplots(1, 3) and
# call kdeplot once per axes.
sns.kdeplot(data=df, x='logmstar', y='uminusr', ax=ax1, cmap='Blues')
plt.show()

Formatting a plot in Seaborn

I made a PMF plot using seaborn:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
n = 1000  # number of trials
p = 0.5   # probability

# Draw 1000 binomial samples and plot their empirical distribution,
# with bar heights normalised to probabilities.
trial_2 = np.random.binomial(n, p, 1000)
sns.displot(trial_2, stat='probability')

# Mark the sample mean with a red vertical line.
trial_2_mean = trial_2.mean()
plt.axvline(trial_2_mean, color='red')
plt.xlabel("Number of Successes")

# The line itself carries no label, so build the legend entry by hand.
mean_handle = mpatches.Patch(color='red', label='Mean')
plt.legend(handles=[mean_handle])
I want to add text to the plot like in the picture below (the n=60 and p=0.1).
Also, how do I plot in a format similar to the one in the picture (straight vertical lines)?
You can do following:
from scipy.stats import binom
n = 50
p = 0.1

# Evaluate the exact binomial PMF at k = 0..14.
x = list(range(15))
trial_2 = binom.pmf(x, n, p)

# Markers at the PMF values; the label (rendered as math text) becomes the
# legend entry. x and y are passed by keyword because recent seaborn releases
# no longer accept them positionally.
sns.scatterplot(x=x, y=trial_2, label=f'$n={n}, p={p}$')
# Vertical stems from the axis up to each PMF value.
plt.vlines(x, 0, trial_2, colors='red', lw=3, alpha=0.4)
plt.xticks(x)
plt.ylabel('Probability')
plt.xlabel('Number of Successes')
plt.show()
Produces:

How to annotate a seaborn barplot with the aggregated value

How can the following code be modified to show the mean as well as the different error bars on each bar of the bar plot?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")

# (low, high) bounds of the uniform draw for each series.
bounds = {'S1': (35, 55), 'S2': (40, 70), 'S3': (63, 85), 'S4': (59, 80)}
samples = {series: [] for series in bounds}

# One row per seed 1..4; within a row the series are drawn in the order
# S1, S2, S3, S4 straight after re-seeding, so the values are reproducible.
for seed in range(1, 5):
    np.random.seed(seed)
    for series, (low, high) in bounds.items():
        samples[series].append(np.random.uniform(low, high))

# Keep the individual lists available under their original names.
a, b, c, d = (samples[series] for series in ('S1', 'S2', 'S3', 'S4'))

data_df = pd.DataFrame({'stages': [1, 2, 3, 4], 'S1': a, 'S2': b, 'S3': c, 'S4': d})
print("Delay:")
display(data_df)
S1 S2 S3 S4
0 43.340440 61.609735 63.002516 65.348984
1 43.719898 40.777787 75.092575 68.141770
2 46.015958 61.244435 69.399904 69.727380
3 54.340597 56.416967 84.399056 74.011136
from matplotlib import ticker

# Reduce the data to two rows (mean and std) for the S1..S4 columns.
summary = data_df.describe()
meansd_df = summary.loc[['mean', 'std'], :].drop('stages', axis=1)
display(meansd_df)

# Plot styling.
sns.set()
sns.set_style('darkgrid', {"axes.facecolor": ".92"})
sns.set_context('notebook')

fig, ax = plt.subplots(figsize=(8, 6))
x = meansd_df.columns
y = meansd_df.loc['mean', :]
yerr = meansd_df.loc['std', :]
width = 0.45

ax.set_xlabel("Time", size=14)
ax.set_ylim(-0.3, 100)

# One bar per column: height is the mean, the capped error bar is the std.
for label, mean, std in zip(x, y, yerr):
    ax.bar(label, mean, width, yerr=std, edgecolor="black",
           error_kw=dict(lw=1, capsize=8, capthick=1))

ax.set(ylabel='Delay')
# Major y ticks every 10 units.
ax.yaxis.set_major_locator(ticker.MultipleLocator(10))
plt.savefig("Over.png", dpi=300, bbox_inches='tight')
Given the example data, for a seaborn.barplot with capped error bars, data_df must be converted from a wide format, to a tidy (long) format, which can be accomplished with pandas.DataFrame.stack or pandas.DataFrame.melt
It is also important to keep in mind that a bar plot shows only the mean (or other estimator) value
Sample Data and DataFrame
.iloc[:, 1:] is used to skip the 'stages' column at column index 0.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Starting from the OP's data_df, skip the first column ('stages') and reshape
# the remaining columns to long format: one row per (set, val) pair.
df = pd.melt(data_df.iloc[:, 1:], var_name='set', value_name='val')
# display(df.head())
set val
0 S1 43.340440
1 S1 43.719898
2 S1 46.015958
3 S1 54.340597
4 S2 61.609735
Updated as of matplotlib v3.4.2
Use matplotlib.pyplot.bar_label
See How to add value labels on a bar chart for additional details and examples with .bar_label.
Some formatting can be done with the fmt parameter, but more sophisticated formatting should be done with the labels parameter, as shown in How to add multiple annotations to a barplot.
Tested with seaborn v0.11.1, which is using matplotlib as the plot engine.
fig, ax = plt.subplots(figsize=(8, 6))

# Draw the bars; each bar's height is the aggregated value of its group.
sns.barplot(data=df, x='set', y='val', capsize=0.2, ax=ax)

# Label the bars of the most recently added container, centred inside each bar.
bars = ax.containers[-1]
ax.bar_label(bars, fmt='Mean:\n%.2f', label_type='center')

ax.set_ylabel('Mean Time')
plt.show()
plot with seaborn.barplot
Using matplotlib before version 3.4.2
The default for the estimator parameter is mean, so the height of the bar is the mean of the group.
The bar height is extracted from p with .get_height, which can be used to annotate the bar.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x='set', y='val', capsize=0.2, ax=ax)

# Write each bar's height at the centre of the bar.
for bar in ax.patches:
    height = bar.get_height()
    centre_x = bar.get_x() + bar.get_width() / 2.
    ax.annotate(text=f'Mean:\n{height:0.2f}',
                xy=(centre_x, height / 2),
                ha='center', va='center')

ax.set(xlabel='Delay', ylabel='Time')
plt.show()
Seaborn is most powerful with long-form data, so you might want to transform your data, something like this:
# Reshape to long form: 'stages' stays as the identifier, the remaining column
# names go into 'Time' and their values into 'Delay'.
long_df = data_df.melt('stages', value_name='Delay', var_name='Time')

sns.barplot(data=long_df, x='Time', y='Delay', capsize=0.1, edgecolor='k')
Output:

Multiple histograms from multiple dataframes into one in pandas

I am a beginner in Python and pandas.
I have three CSV files. I want to make one histogram from these three dataframes.
I used this code
import pandas as pd
from matplotlib import pyplot as plt
X = pd.read_csv("data1.csv")
Y = pd.read_csv("data2.csv")
Z = pd.read_csv("data3.csv")

# One DataFrame.hist call per file: 'speed' is binned and each row is
# weighted by its ID value. Every call produces its own histogram.
for frame, colour in ((X, 'Blue'), (Y, 'Red'), (Z, 'Grey')):
    frame.hist(column='speed', weights=frame.ID, figsize=(20, 10),
               stacked=True, bins=50, color=colour)

plt.rc('xtick', labelsize=25)
plt.rc('ytick', labelsize=25)
But I got three different histograms.
How can I combine these three into one histogram, with each dataset shown in its own colour?
I would go for this:
import pandas as pd
import matplotlib.pyplot as plt
X = pd.read_csv("data1.csv")
Y = pd.read_csv("data2.csv")
Z = pd.read_csv("data3.csv")
frames = [X, Y, Z]

# rc settings apply to artists created afterwards, so set the tick label
# sizes before drawing.
plt.rc('xtick', labelsize=25)
plt.rc('ytick', labelsize=25)

# Passing a list of arrays makes plt.hist draw all datasets on one axes, each
# in its own colour. Every dataset is weighted by its own ID column; the
# weights must have the same length as the corresponding speed values, so the
# third entry has to be Z.ID and not the flattened whole frame.
plt.hist(
    [frame.speed.to_numpy() for frame in frames],
    weights=[frame.ID.to_numpy() for frame in frames],
    label=['X', 'Y', 'Z'],
)
plt.legend()
Merge your dataframes:
# Read the three input files into separate DataFrames.
X = pd.read_csv("data1.csv")
Y = pd.read_csv("data2.csv")
Z = pd.read_csv("data3.csv")
# NOTE(review): merge() with no arguments presumably inner-joins on the column
# names the frames share, keeping only rows that match across all three files.
# If the goal is to pool every row, pd.concat([X, Y, Z]) may be what is
# wanted — confirm against the data.
df = X.merge(Y).merge(Z)
# Placeholder call: fill in the hist() arguments as needed.
df.hist(...)

Python Create Bar Chart Comparing 2 sets of data

I have a notebook with 2 bar charts: one for winter data and one for summer data. I have counted the total of each type of crime and plotted the counts in a bar chart, using this code:
# Count how often each crime type occurs in the summer data and plot the
# counts as a bar chart.
summer_counts = summer["crime_type"].value_counts()
ax = summer_counts.plot.bar()
plt.show()
Which shows a graph like:
I have another chart nearly identical, but for winter:
# Count how often each crime type occurs in the winter data and plot the
# counts as a bar chart.
winter_counts = winter["crime_type"].value_counts()
ax = winter_counts.plot.bar()
plt.show()
And I would like to have these 2 charts compared against one another in the same bar chart (Where every crime on the x axis has 2 bars coming from it, one winter & one summer).
I have tried the following, which is just me experimenting:
# NOTE: this attempt cannot work as written. The parentheses build a plain
# tuple of the two columns, and a tuple has no value_counts() method, so the
# line raises AttributeError before anything is plotted.
bx = (summer["crime_type"],winter["crime_type"]).value_counts().plot(kind='bar')
plt.show()
Any advice would be appreciated!
The following generates dummies of your data and does the grouped bar chart you wanted:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Crime types in the order they should appear on the x-axis.
categories = ["ASB", "Violence", "Theft", "Public Order", "Drugs"]
s = "Crime Type Summer|Crime Type Winter".split("|")

# Generate dummy data into a dataframe: 300 random crime types per season.
j = {season: [random.choice(categories) for _ in range(300)] for season in s}
df = pd.DataFrame(j)

index = np.arange(len(categories))  # one x position per crime type
bar_width = 0.35

# value_counts() returns the counts sorted by frequency, which differs between
# the two seasons and from the tick labels. Reindexing by `categories` puts
# both series in the same, label-matching order (0 for a type never drawn).
summer_counts = df["Crime Type Summer"].value_counts().reindex(categories, fill_value=0)
winter_counts = df["Crime Type Winter"].value_counts().reindex(categories, fill_value=0)

fig, ax = plt.subplots()

# Summer bars sit at the base positions; winter bars are shifted right by one
# bar width so each pair is drawn side by side.
summer = ax.bar(index, summer_counts, bar_width, label="Summer")
winter = ax.bar(index + bar_width, winter_counts, bar_width, label="Winter")

ax.set_xlabel('Category')
ax.set_ylabel('Incidence')
ax.set_title('Crime incidence by season, type')
# Centre each tick between the two bars of its group.
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(categories)
ax.legend()

plt.show()
With this script I got:
You can check out the demo in the matplotlib docs here: https://matplotlib.org/gallery/statistics/barchart_demo.html
The important thing to note is the index!
index = np.arange(5) # Set an index of n crime types (here n = 5)
...
# Summer bars are drawn at the base positions ...
summer = ax.bar(index, ...)
# ... and winter bars are shifted right by one bar width, next to them.
winter = ax.bar(index+bar_width, ...)
...
# Place each tick midway between the two bars of its group.
ax.set_xticks(index + bar_width / 2)
These are the lines that arrange the bars on the horizontal axis so that they are grouped together.
Create a pandas dataframe with 3 columns crimetype, count, Season and try this function.
#Importing required packages
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import MaxNLocator
#Function Creation
def plt_grouped_bar(Plot_Nm, group_bar, x, y, plt_data, **bar_kwargs):
    """Draw a grouped bar chart with a value label above every bar.

    Parameters
    ----------
    Plot_Nm : str
        Title of the plot.
    group_bar : str
        Column of ``plt_data`` used as ``hue`` (one bar per value in each group).
    x : str
        Column of ``plt_data`` shown on the x-axis.
    y : str
        Column of ``plt_data`` that gives the bar heights.
    plt_data : pandas.DataFrame
        Long-form data containing the three columns above.
    **bar_kwargs
        Extra keyword arguments forwarded to ``seaborn.barplot``.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was drawn. It is closed in pyplot before returning,
        so use the returned object (e.g. ``fig.savefig(...)``) to keep it.
    """
    plt_fig = plt.figure(figsize=(18, 9))
    ax = plt_fig.add_subplot()

    # Use the axes-level barplot: the figure-level catplot creates its own
    # figure and does not draw into a supplied `ax`, which would leave
    # `ax.patches` empty and nothing to annotate.
    sns.barplot(x=x, y=y, hue=group_bar, data=plt_data, ax=ax, **bar_kwargs)

    # Write each bar's height just above it, rotated to avoid overlap.
    for p in ax.patches:
        height = p.get_height()
        if np.isnan(height):
            # A bar without data has no meaningful height to print.
            continue
        ax.text(x=p.get_x() + (p.get_width() / 2),
                y=height + 0.05,
                s='{:.0f}'.format(height),
                ha='center', va='bottom', zorder=20, rotation=90)

    ax.set_title(Plot_Nm, fontweight="bold", fontsize=18, alpha=0.7, y=1.03)

    # Style the category labels that barplot produced instead of replacing
    # them (passing the column name as labels would overwrite the categories).
    plt.setp(ax.get_xticklabels(), rotation=90, fontsize=10, alpha=0.8,
             fontweight="bold")

    # The values are printed on the bars, so hide the y tick labels and the
    # axis labels, and remove the tick marks.
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    ax.tick_params(axis='y', labelleft=False)
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.tick_params(axis='both', length=0)

    ax.legend(loc='upper right')
    for spine in ax.spines.values():
        spine.set_visible(False)

    plt.close(plt_fig)
    return plt_fig
# Call the function. The positional arguments are, in order: the plot title,
# the hue (grouping) column, the x column, the y column and the DataFrame.
# `pandasdataframename` is presumably a placeholder for your own DataFrame,
# which must contain the three named columns.
plt_grouped_bar('Title of bar','weather','crimetype','count',pandasdataframename)

Categories