bar graph with wrong width

bar graph with wrong width - python

I want to create a bar graph for a dataframe that contains multiple categories, with a different color for each category. Below is my simplified code and the resulting graph. The top subplot is a regular bar graph in one color; the bottom subplot is color coded, but the bar width is messed up. Any suggestions? Thanks!
import random
import pandas as pd
import matplotlib.pyplot as plt
# Sample data: 'Cat' is the category label, 'A' the x position of each bar
# and 'B' its height (12 distinct random integers from 1..19).
df = pd.DataFrame({'Cat': [1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4],
                   'A': [2, 3, 6, 7, 9, 10, 15, 18, 22, 23, 24, 25],
                   'B': random.sample(range(1, 20), 12)})
fig = plt.figure(figsize=(15, 15/2.3))

# Top subplot: all bars drawn by a single call, so they share one colour and
# use matplotlib's default bar width.
ax = plt.subplot(2, 1, 1)
plt.bar(df.A, df.B)
plt.xlim(0, 30)

# Bottom subplot: one plt.bar call per category, so each category is drawn as
# its own bar container. Note the explicit width=0.5 here, which differs from
# the default width used in the top subplot.
ax = plt.subplot(2, 1, 2)
for cat in df.Cat.unique():
    df_ = df.loc[(df.Cat == cat), :]
    plt.bar(df_.A, df_.B, width=0.5)
plt.xlim(0, 30)
plt.show()

Related

Python - Bland-Altman Plot with Text Customization

I am trying to create a Bland-Altman plot with the text labels on the left side of the plot instead of on the right-hand side, which is the default configuration.
This is my code
import pandas as pd
# Two equally long series of values to compare against each other.
values_a = [5, 5, 5, 6, 6, 7, 7, 7, 8, 8, 9,
            10, 11, 13, 14, 14, 15, 18, 22, 25]
values_b = [4, 4, 5, 5, 5, 7, 8, 6, 9, 7, 7, 11,
            13, 13, 12, 13, 14, 19, 19, 24]
df = pd.DataFrame({'A': values_a, 'B': values_b})
import statsmodels.api as sm
import matplotlib.pyplot as plt
# Build the Bland-Altman (mean-difference) plot on an 8x5 inch figure.
fig, ax = plt.subplots(1, figsize=(8, 5))
sm.graphics.mean_diff_plot(df.A, df.B, ax=ax)
# Show the finished plot.
plt.show()
So I want to have the "mean", the "SD+" and the "SD-" on the left side of the X-axis, not on the right.
thanks for your help or any suggestions!

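I don't know how to move the labels with statsmodels, but you can draw the same plot directly with pyplot and place the text wherever you like: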
# Hand-drawn Bland-Altman plot with the labels on the left-hand side.
# Expects `df` (columns 'A' and 'B') and `plt` from the snippet in the question.

# Per-pair difference and per-pair mean; the mean is what belongs on the
# x-axis of a Bland-Altman plot (the axis is labelled "Means" below).
diff = df.A - df.B
means = (df.A + df.B) / 2

mean_diff = diff.mean()
# Half-width of the limits of agreement: 1.96 standard deviations.
diff_range = diff.std()*1.96
upper = mean_diff + diff_range
lower = mean_diff - diff_range

# Horizontal extent of the lines / axis, and where the labels start.
x_lo, x_hi = means.min()-2, means.max()+2
label_x = means.min()-1
# Small vertical offset so each label sits just above its line.
label_dy = .05*diff_range

plt.figure(figsize=(9, 6))
plt.scatter(means, diff, alpha=.5)
plt.hlines(mean_diff, x_lo, x_hi, color="k", linewidth=1)
plt.hlines([upper, lower], x_lo, x_hi, color="k", linewidth=1,
           linestyle="--")
for level, template in (
    (mean_diff, "mean diff: %.2f"),
    (upper, "+SD1.96: %.2f"),
    (lower, "-SD1.96: %.2f"),
):
    plt.text(label_x, level + label_dy, template % level, fontsize=13)
plt.xlim(x_lo, x_hi)
plt.ylim(mean_diff-diff_range*1.5, mean_diff+diff_range*1.5)
plt.xlabel("Means", fontsize=15)
plt.ylabel("Difference", fontsize=15)
plt.show()
result:

A way to update figure layout in a for loop for each subplot (Plotly)

Is there a way I can update each figure's layout in a loop like this? I added each layout to a list and am looping through each but can't seem to update the figures in the subplot:
# Data Visualization
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Dummy training history: every series is the same 1..10 ramp.
epoch_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
loss_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
val_loss_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
error_rate = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
val_error_rate = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
layout_list = []

# --- Loss: traces, layout and a stand-alone figure -------------------------
loss_plots = [go.Scatter(x=epoch_list,
                         y=loss_list,
                         mode='lines',
                         name='Loss',
                         line=dict(width=4)),
              go.Scatter(x=epoch_list,
                         y=val_loss_list,
                         mode='lines',
                         name='Validation Loss',
                         line=dict(width=4))]
loss_layout = dict(font_color='black',
                   title_font_color='black',
                   title=dict(text='Loss Graph',
                              font_size=30),
                   xaxis_title=dict(text='Epochs',
                                    font_size=25),
                   yaxis_title=dict(text='Loss',
                                    font_size=25),
                   legend=dict(font_size=15))
loss_figure = go.Figure(data=loss_plots)
layout_list.append(loss_layout)

# --- Error rate: traces, layout and a stand-alone figure -------------------
# The traces plot the error-rate series (not the loss series again).
error_plots = [go.Scatter(x=epoch_list,
                          y=error_rate,
                          mode='lines',
                          name='Error Rate',
                          line=dict(width=4)),
               go.Scatter(x=epoch_list,
                          y=val_error_rate,
                          mode='lines',
                          name='Validation Error Rate',
                          line=dict(width=4))]
error_rate_layout = dict(font_color='black',
                         title_font_color='black',
                         title=dict(text='Error Rate Graph',
                                    font_size=30),
                         xaxis_title=dict(text='Epochs',
                                          font_size=25),
                         yaxis_title=dict(text='Error Rate',
                                          font_size=25),
                         legend=dict(font_size=15))
error_figure = go.Figure(data=error_plots)
layout_list.append(error_rate_layout)

# --- Combined 3x2 subplot grid ---------------------------------------------
metric_figure = make_subplots(
    rows=3, cols=2,
    specs=[[{}, {}],
           [{}, {}],
           [{}, {}]])
# Copy the traces of each stand-alone figure into its cell of the grid.
for t in loss_figure.data:
    metric_figure.append_trace(t, row=1, col=1)
for t in error_figure.data:
    metric_figure.append_trace(t, row=1, col=2)
# NOTE: this is the step the question is about and it does not work as
# intended: metric_figure is one single Figure, so zipping over it does not
# pair each subplot with a layout from layout_list.
for (figure, layout) in zip(metric_figure, layout_list):
    figure.update_layout(layout)
metric_figure.show()
It seems that doing this doesn't work either as the layout does not transfer over because I am looping through the traces only:
loss_figure = go.Figure(data=loss_plots, layout=loss_layout)

you can use python dict merging techniques
# Merge both layout dicts into one; where the same key appears in both, the
# value from the right-hand dict (error_rate_layout) is the one that is kept.
metric_figure.update_layout({**loss_layout, **error_rate_layout})
alternatively, if layouts are in figures
# Merge the layouts stored on the two stand-alone figures; on duplicate keys
# the error figure's value is the one that is kept.
metric_figure.update_layout({**loss_figure.to_dict()["layout"], **error_figure.to_dict()["layout"]})
both of these are of limited use as sub-plot layouts are significantly different from individual figures. There will be different x-axis and y-axis definitions than individual figures / layouts and where dictionary keys overlap only one can be used - for example title

Issue with plotting normal distribution curve with available set of values

I'm trying to plot a normal distribution curve for a set of values. Unfortunately, the below code (taken from this post) doesn't seem to be plotting the curve correctly over the histograms (please refer attached image). I'm sure I'm missing something or have done something silly but can't seem to figure out. Can someone please help? I've included my code below - I'm getting the values from a dataframe but have included these as a list s for convenience:
import numpy as np
import scipy
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
from matplotlib.mlab import normpdf
# Parameters of the normal curve drawn over the histogram. They are fixed to
# the standard normal here and are not derived from the data in `s`.
mu = 0
sigma = 1
n_bins = 50
s = [8, 8, 4, 4, 1, 14, 0, 10, 1, 4, 21, 9, 5, 2, 7, 6, 7, 9, 7, 3, 3, 4, 7, 9, 9, 4, 10, 8, 10, 10, 7, 10, 1, 8, 7, 8, 1, 7, 4, 15, 8, 1, 1, 6, 7, 3, 8, 8, 8, 4]
# Two stacked axes sharing the x-axis: box plot on top, histogram below.
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
#histogram
# density=True normalises the histogram so it is comparable with a pdf
# (it replaces the `normed` keyword, which newer matplotlib no longer accepts).
n, bins, patches = axes[1].hist(s, n_bins, density=True, alpha=.1, edgecolor='black' )
# Normal pdf evaluated at the histogram bin edges.
pdf = 1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2))
print(pdf)
median, q1, q3 = np.percentile(s, 50), np.percentile(s, 25), np.percentile(s, 75)
#probability density function
axes[1].plot(bins, pdf, color='orange', alpha=.6)
#to ensure pdf and bins line up to use fill_between.
bins_1 = bins[(bins >= q1-1.5*(q3-q1)) & (bins <= q1)] # to ensure fill starts from Q1-1.5*IQR
bins_2 = bins[(bins <= q3+1.5*(q3-q1)) & (bins >= q3)]
pdf_1 = pdf[:int(len(pdf)/2)]
pdf_2 = pdf[int(len(pdf)/2):]
pdf_1 = pdf_1[(pdf_1 >= norm(mu,sigma).pdf(q1-1.5*(q3-q1))) & (pdf_1 <= norm(mu,sigma).pdf(q1))]
pdf_2 = pdf_2[(pdf_2 >= norm(mu,sigma).pdf(q3+1.5*(q3-q1))) & (pdf_2 <= norm(mu,sigma).pdf(q3))]
#fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR
#axes[1].fill_between(bins_1, pdf_1, 0, alpha=.6, color='orange')
#axes[1].fill_between(bins_2, pdf_2, 0, alpha=.6, color='orange')
#add text to bottom graph.
axes[1].annotate("{:.1f}%".format(100*norm(mu, sigma).cdf(q1)), xy=((q1-1.5*(q3-q1)+q1)/2, 0), ha='center')
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3)-norm(mu, sigma).cdf(q1))), xy=(median, 0), ha='center')
axes[1].annotate("{:.1f}%".format(100*(norm(mu, sigma).cdf(q3+1.5*(q3-q1)-q3)-norm(mu, sigma).cdf(q3))), xy=((q3+1.5*(q3-q1)+q3)/2, 0), ha='center')
axes[1].annotate('q1', xy=(q1, norm(mu, sigma).pdf(q1)), ha='center')
axes[1].annotate('q3', xy=(q3, norm(mu, sigma).pdf(q3)), ha='center')
axes[1].set_ylabel('Probability Density')
#top boxplot
axes[0].boxplot(s, 0, 'gD', vert=False)
axes[0].axvline(median, color='orange', alpha=.6, linewidth=.5)
axes[0].axis('off')
plt.rcParams["figure.figsize"] = (10,10)
plt.subplots_adjust(hspace=0)
plt.show()

You have set mu and sigma arbitrarily to 0 and 1 respectively but you should calculate it for your actual data:
# Estimate the parameters of the normal curve from the sample itself.
data = pd.Series(s)
mu = data.mean()
# pandas' Series.std() uses ddof=1 by default (sample standard deviation).
sigma = data.std()
Update with full working example:
import numpy as np
import scipy
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
def normal_pdf(x, mean, std):
    """Return the density of a normal(mean, std) distribution at x."""
    return 1/(std*np.sqrt(2*np.pi))*np.exp(-(x-mean)**2/(2*std**2))


n_bins = 50
s = [8, 8, 4, 4, 1, 14, 0, 10, 1, 4, 21, 9, 5, 2, 7, 6, 7, 9, 7, 3, 3, 4, 7, 9, 9, 4, 10, 8, 10, 10, 7, 10, 1, 8, 7, 8, 1, 7, 4, 15, 8, 1, 1, 6, 7, 3, 8, 8, 8, 4]

# Box plot goes on the upper axes, histogram and curve on the lower one.
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
box_ax, hist_ax = axes

# Density-normalised histogram of the sample.
n, bins, patches = hist_ax.hist(s, n_bins, density=True, alpha=.1, edgecolor='black')

# Fit the normal curve to the sample and evaluate it at the bin edges.
data = pd.Series(s)
mu = data.mean()
sigma = data.std()
fitted = norm(mu, sigma)
pdf = normal_pdf(bins, mu, sigma)
median, q1, q3 = np.percentile(s, [50, 25, 75])
hist_ax.plot(bins, pdf, color='orange', alpha=.6)

# Shade the curve from Q1-1.5*IQR to Q1 and from Q3 to Q3+1.5*IQR.
# (`iqr` already holds 1.5 times the inter-quartile range.)
iqr = 1.5 * (q3-q1)
x1 = np.linspace(q1 - iqr, q1)
x2 = np.linspace(q3, q3 + iqr)
pdf1 = normal_pdf(x1, mu, sigma)
pdf2 = normal_pdf(x2, mu, sigma)
for xs, ys in ((x1, pdf1), (x2, pdf2)):
    hist_ax.fill_between(xs, ys, 0, alpha=.6, color='orange')

# Label each region with the probability mass the fitted curve puts on it.
regions = (
    (q1 - iqr, q1, q1 - iqr/2),
    (q1, q3, median),
    (q3, q3 + iqr, q3 + iqr/2),
)
for lower, upper, text_x in regions:
    share = 100*(fitted.cdf(upper) - fitted.cdf(lower))
    hist_ax.annotate("{:.1f}%".format(share), xy=(text_x, 0), ha='center')
for label, quartile in (('q1', q1), ('q3', q3)):
    hist_ax.annotate(label, xy=(quartile, fitted.pdf(quartile)), ha='center')
hist_ax.set_ylabel('Probability Density')

# Horizontal box plot on top, with the median marked and the frame hidden.
box_ax.boxplot(s, 0, 'gD', vert=False)
box_ax.axvline(median, color='orange', alpha=.6, linewidth=.5)
box_ax.axis('off')

Putting it all in a function:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
def CTD(df):
    """Plot a box plot over a density histogram for every column of *df*.

    For each column a new two-row figure is created: the lower axes show a
    density-normalised histogram, a normal curve fitted to the column (mean
    and standard deviation of that column), shaded tails between
    Q1-1.5*IQR..Q1 and Q3..Q3+1.5*IQR, probability annotations and dashed
    lines at the 0 / 0.25 / 0.5 / 0.75 / 1 quantiles; the upper axes show a
    horizontal box plot with the median marked.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; every column is read from this argument.

    Returns
    -------
    None
        The figures are created through pyplot as a side effect.
    """
    # Set the size before any figure is created so that it applies to every
    # figure produced by this call, including the first one.
    plt.rcParams["figure.figsize"] = (18, 10)
    n_bins = 50
    for col in df.columns:
        values = df[col]
        fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
        box_ax, hist_ax = axes

        # histogram
        n, bins, patches = hist_ax.hist(values, n_bins, density=True, alpha=.1, edgecolor='black')

        # Normal distribution fitted to this column.
        mu = values.mean()
        sigma = values.std()
        dist = norm(mu, sigma)
        # All three statistics come from the column being plotted.
        median, q1, q3 = np.percentile(values, [50, 25, 75])

        # probability density function
        hist_ax.plot(bins, dist.pdf(bins), color='orange', alpha=.6)

        # fill from Q1-1.5*IQR to Q1 and Q3 to Q3+1.5*IQR
        # (`iqr` already holds 1.5 times the inter-quartile range)
        iqr = 1.5 * (q3 - q1)
        x1 = np.linspace(q1 - iqr, q1)
        x2 = np.linspace(q3, q3 + iqr)
        hist_ax.fill_between(x1, dist.pdf(x1), 0, alpha=.6, color='orange')
        hist_ax.fill_between(x2, dist.pdf(x2), 0, alpha=.6, color='orange')

        # add text to bottom graph: probability mass of each region
        regions = (
            (q1 - iqr, q1, q1 - iqr/2),
            (q1, q3, median),
            (q3, q3 + iqr, q3 + iqr/2),
        )
        for lower, upper, text_x in regions:
            share = 100*(dist.cdf(upper) - dist.cdf(lower))
            hist_ax.annotate("{:.1f}%".format(share), xy=(text_x, 0), ha='center')
        hist_ax.annotate('q1', xy=(q1, dist.pdf(q1)), ha='center')
        hist_ax.annotate('q3', xy=(q3, dist.pdf(q3)), ha='center')

        # dashed lines at min, quartiles and max, drawn on the histogram axes
        hist_ax.axvline(values.quantile(0), color='b', linestyle='-.')
        hist_ax.axvline(values.quantile(0.25), color='g', linestyle='--')
        hist_ax.axvline(values.quantile(0.50), color='g', linestyle='--')
        hist_ax.axvline(values.quantile(0.75), color='b', linestyle='--')
        hist_ax.axvline(values.quantile(1), color='r', linestyle='-.')
        hist_ax.set_ylabel('Probability Density')

        # top boxplot
        box_ax.boxplot(values, 0, 'gD', vert=False)
        box_ax.axvline(median, color='orange', alpha=.6, linewidth=.5)
        box_ax.axis('off')
calling function:
CTD(boston)
If this doesn't work for you:
Try this:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
def CTD(df):
    """Plot two side-by-side distribution plots for every column of *df*.

    For each column a new figure is created. The left subplot shows the
    distribution with the mean (blue dashed) and median (red dashed); the
    right subplot shows the same distribution with lines at the
    0 / 0.25 / 0.5 / 0.75 / 1 quantiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot, one figure per column.

    Returns
    -------
    None
        The figures are created through pyplot as a side effect.
    """
    # The figure size does not depend on the column, so set it once.
    sns.set(rc={'figure.figsize': (24, 6)})
    for col in df.columns:
        values = df[col]
        plt.figure()

        # Left: distribution with mean and median.
        plt.subplot(121)
        # NOTE: sns.distplot is deprecated since seaborn 0.11; histplot /
        # displot are the suggested replacements.
        sns.distplot(values)
        plt.axvline(np.mean(values), color='b', linestyle='--')  # Blue line for mean
        plt.axvline(np.median(values), color='r', linestyle='--')  # Red line for Median

        # Right: distribution with min, quartiles and max.
        plt.subplot(122)
        sns.distplot(values)
        plt.axvline(values.quantile(0), color='b', linestyle='-.')
        plt.axvline(values.quantile(0.25), color='g', linestyle='--')
        plt.axvline(values.quantile(0.50), color='g', linestyle='--')
        plt.axvline(values.quantile(0.75), color='b', linestyle='--')
        plt.axvline(values.quantile(1), color='r', linestyle='-.')
This creates dashed lines at the quantiles on the KDE plot.

Textposition not displaying on plotly

I am trying to plot the accuracy of the training and test set of my neural network using plotly.
I want also to add a marker with a text that says when was the maximum value of each but also displays a text that says what that value was. I tried doing something like in this example.
Here my mcve:
import plotly.graph_objects as go
data = {
    'test acc': [1, 2, 3, 4, 5, 6, 7, 9, 10],
    'train acc': [3, 5, 5, 6, 7, 8, 9, 10, 8]
}
fig = go.Figure()
color_train = 'rgb(255, 0, 0)'
color_test = 'rgb(0, 255, 0)'
assert len(data["train acc"]) == len(data["test acc"])
x = list(range(len(data["train acc"])))

# (series key, name of its max-marker trace, colour), train first.
series = (
    ("train acc", "max value train", color_train),
    ("test acc", "max value test", color_test),
)

# One line trace per series.
for key, _, colour in series:
    fig.add_trace(go.Scatter(x=x,
                             y=data[key],
                             mode='lines',
                             name=key,
                             line_color=colour))

# One marker trace per series at its maximum.
for key, marker_name, colour in series:
    values = data[key]
    peak = max(values)
    # ATTENTION! index() only gives the first occurrence of the maximum.
    peak_index = values.index(peak)
    fig.add_trace(go.Scatter(x=[peak_index],
                             y=[peak],
                             mode='markers',
                             name=marker_name,
                             text=['{}%'.format(int(peak * 100))],
                             textposition="top center",
                             marker_color=colour))

fig.update_layout(title='Train vs Test accuracy',
                  xaxis_title='epochs',
                  yaxis_title='accuracy (%)'
                  )
fig.show()
However, my output figure is the following:
As you can see, the value is not being displayed as in the example I found.
How can I make it appear?

If you'd only like to highlight a few certain values, use add_annotation(). In your case just find the max and min Y for the X that you'd like to put into focus. Lacking a data sample from your side, here's how I'd do it with a generic data sample:
Plot:
Code:
import plotly.graph_objects as go
import plotly.io as pio
# Open figures in the system web browser.
pio.renderers.default = 'browser'
fig = go.Figure()

xVars1 = [0, 1, 2, 3, 4, 5, 6, 7, 8]
yVars1 = [0, 1, 3, 2, 4, 3, 4, 6, 5]
xVars2 = [0, 1, 2, 3, 4, 5, 6, 7, 8]
yVars2 = [0, 4, 5, 1, 2, 2, 3, 4, 2]

# (x values, y values, annotation text) for each series.
series = (
    (xVars1, yVars1, "yVars1 max"),
    (xVars2, yVars2, "yVars2 max"),
)

# One trace per series.
for xs, ys, _ in series:
    fig.add_trace(go.Scatter(x=xs, y=ys))

# One annotation per series at the first occurrence of its maximum; the list
# index of the maximum is used as the x coordinate.
for _, ys, label in series:
    peak = max(ys)
    fig.add_annotation(x=ys.index(peak), y=peak, text=label)

# Shared styling for all annotations added so far.
annotation_style = dict(
    xref="x",
    yref="y",
    showarrow=True,
    arrowhead=7,
    ax=0,
    ay=-40
)
fig.update_annotations(annotation_style)
fig.update_layout(showlegend=False)
fig.show()

Matplotlib twinx for different scales

I am using matplotlib twinx for graphing multiple variables on the same axes. But I have a problem, for which I can't find a solution. For simplicity, I have attached little code and graph plotted by that code below.
In this picture, I need those bars to be displayed at the bottom of axes as shown in picture 2. But in picture 2, yticks of ax1t remained as the same. I also need them to be displayed at the bottom. How can I do that?
Code:
import matplotlib.pyplot as plt
import numpy as np
line_values = [4, 2, 8, 6, 4, 7, 3, 5]
bar_heights = [45, 42, 55, 36, 58, 45, 48, 62]

fig, ax1 = plt.subplots()
ax1.plot(line_values)
# Second y-axis sharing the same x-axis, used for the bars.
ax1t = ax1.twinx()
ax1t.bar(np.arange(len(bar_heights)), bar_heights, alpha=0.4)
plt.show()
Picture 2

I guess this is what you want -
import matplotlib.pyplot as plt
import numpy as np
line_values = [4, 2, 8, 6, 4, 7, 3, 5]
bar_heights = [45, 42, 55, 36, 58, 45, 48, 62]

fig, ax1 = plt.subplots()
ax1.plot(line_values)
ax1t = ax1.twinx()
ax1t.bar(np.arange(len(bar_heights)), bar_heights, alpha=0.4)
# Stretch the bar axis far beyond the data so the bars stay near the bottom,
# and only put ticks in the range the bars actually occupy.
ax1t.set_ylim(bottom=10, top=500)
ax1t.set_yticks([10, 50, 90])
plt.show()
Change the y-axis scale using set_ylim and then explicitly pass the y ticks using set_yticks. You can play around with the parameters to adjust the plot to your needs.

from matplotlib examples
import matplotlib.pyplot as plt
import numpy as np
line_values = [4, 2, 8, 6, 4, 7, 3, 5]
bar_heights = [45, 42, 55, 36, 58, 45, 48, 62]

# Two stacked axes that share both the x- and the y-axis.
f, (ax1, ax2) = plt.subplots(2, sharex=True, sharey=True)
ax1.plot(line_values)
ax2.bar(np.arange(len(bar_heights)), bar_heights, alpha=0.4)
# Fine-tune figure: remove the gap between the subplots and hide the x tick
# labels on every axes except the bottom one.
f.subplots_adjust(hspace=0)
for upper_axes in f.axes[:-1]:
    plt.setp(upper_axes.get_xticklabels(), visible=False)
plt.show()

You could also use Plotly library, which could easily do that with great visualisation.
import plotly.plotly as py
import plotly.graph_objs as go
# Both traces are drawn over the same x positions.
x_values = [0, 1, 2, 3, 4, 5]
trace1 = go.Scatter(x=list(x_values), y=[1.5, 1, 1.3, 0.7, 0.8, 0.9])
trace2 = go.Bar(x=list(x_values), y=[1, 0.5, 0.7, -1.2, 0.3, 0.4])
# Line trace first, bar trace second.
data = [trace1, trace2]
py.iplot(data, filename='bar-line')
Result (it is .png format, therefore not interactive)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

bar graph with wrong width - python

Related

Python - Bland-Altman Plot with Text Customization

A way to update figure layout in a for loop for each subplot (Plotly)

Issue with plotting normal distribution curve with available set of values

Textposition not displaying on plotly

Matplotlib twinx for different scales

Categories

Resources