How to normalize probability distribution values in the matplotlib histogram plot?

How to normalize probability distribution values in the matplotlib histogram plot? - python

I am trying to show both cumulative and non-cumulative distributions on the same plot.
fig, ax = plt.subplots(figsize=(10, 5))
n, bins, patches = ax.hist(x, n_bins, density=True, stacked=True, histtype='step',
cumulative=True, label='Empirical cumulative')
# Overlay a non-cumulative histogram.
ax.hist(x, bins=bins, density=True, stacked=True, histtype='step', cumulative=False, label='Empirical non-cumulative')
plt.show()
The Empirical cumulative curve looks well and the values do not exceed 1. However, the Empirical non-cumulative curve has Y values higher than 1. How can I normalize them?
Update:
Sample data:
n_bins = 20
x = [
0.0051055006412772065,
0.09770815865459548,
0.20666651037049322,
0.5433266733820051,
0.5717169069724539,
0.5421114013759187,
0.4994941193115986,
0.4391978276380223,
0.3673067648294034,
0.3150259778098451,
0.4072059689437963,
0.5781929593356039,
0.6494934859266276,
0.620882081680377,
0.5845829440637116,
0.515705471234385]
Please see the orange curve.

The easiest way to create a histogram with probability instead of probability density is to use seaborn's sns.histplot(.... stat='probability').
To mimic this with standard matplotlib, you could calculate all values manually. For example:
import matplotlib.pyplot as plt
import numpy as np
n_bins = 20
x = np.random.normal(0, 1, (1000, 3))
bin_edges = np.linspace(x.min(), x.max(), n_bins + 1)
bin_values = np.array([np.histogram(x[:, i], bins=bin_edges)[0] for i in range(x.shape[1])])
cum_values = bin_values.cumsum(axis=1).cumsum(axis=0)
cum_values = cum_values / cum_values.max()
fig, ax = plt.subplots(figsize=(10, 5))
prev = 0
for c in cum_values:
plt.step(np.append(bin_edges, bin_edges[-1]), np.concatenate([[0], c, [prev]]))
prev = c[-1]
ax.set_prop_cycle(None)
prev = 0
for c in cum_values:
c = np.diff(c)
plt.step(np.append(bin_edges, bin_edges[-1]), np.concatenate([[0], c, [c[-1], prev]]), ls='--')
prev = c[-1]
plt.show()
If you have just one distribution, stacked=True doesn't make a difference. The code would be simpler:
import matplotlib.pyplot as plt
import numpy as np
n_bins = 20
x = np.random.normal(0, 1, 1000)
bin_edges = np.linspace(x.min(), x.max(), n_bins + 1)
bin_values = np.histogram(x, bins=bin_edges)[0]
cum_values = bin_values.cumsum()
cum_values = cum_values / cum_values.max()
fig, ax = plt.subplots(figsize=(10, 5))
plt.step(np.append(bin_edges, bin_edges[-1]), np.concatenate([[0], cum_values, [0]]))
ax.set_prop_cycle(None)
c = np.diff(cum_values)
plt.step(np.append(bin_edges, bin_edges[-1]), np.concatenate([[0], c, [c[-1], 0]]), ls='--')
plt.show()

Related

How to plot a vertical thermal plot?

How to plot this kind of thermal plot in Python? I tried to search for any sample plot like this but didn't find one.
This image I got from the internet. I want to plot something same like this:

FROM
TO
To represent this type of data the canonical solution is, of course, a heat map. Here it is the code to produce both the figures at the top of this post.
import numpy as np
import matplotlib.pyplot as plt
t = np.linspace(0, 5, 501)
x = np.linspace(0, 1, 201)[:, None]
T = 50 + (30-6*t)*(4*x*(1-x)) + 4*t
fig, ax = plt.subplots(layout='constrained')
hm = ax.imshow(T, cmap='plasma',
aspect='auto', origin='lower', extent=(0, 5, 0, 1))
fig.colorbar(hm)
def heat_lines(x, t, T, n):
from matplotlib.cm import ScalarMappable
from matplotlib.collections import LineCollection
lx, lt = T.shape
ones = np.ones(lx)
norm = plt.Normalize(np.min(T), np.max(T))
plasma = plt.cm.plasma
fig, ax = plt.subplots(figsize=(1+1.2*n, 9), layout='constrained')
ax.set_xlim((-0.6, n-0.4))
ax.set_ylim((x[0], x[-1]))
ax.set_xticks(range(n))
ax.tick_params(right=False,top=False, bottom=False)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.grid(axis='y')
fig.colorbar(ScalarMappable(cmap=plasma, norm=norm))
dt = round(lt/(n-1))
for pos, ix in enumerate(range(0, len(t)+dt//2, dt)):
points = np.array([ones*pos, x[:,0]]).T.reshape(-1,1,2)
segments = np.concatenate([points[:-1], points[1:]], axis=1)
lc = LineCollection(segments, linewidth=72, ec=None,
color=plasma(norm(T[:,ix])))
lc.set_array(T[:,ix])
ax.add_collection(lc)
heat_lines(x, t, T, 6)

How to combine two matplotlib (python) colormaps from scatter plot

I am trying to combine two colourmap legends in one. Colour values are defined from third (z) data.
I am trying plot one legend colormap with two color scheme.
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
df = pd.read_excel('C:\\Users\user1\\PycharmProjects\\untitled\\Python_test.xlsx')
x = df['Vp_dry']
y = df['Vs_dry']
q = df['Vp_wet']
w = df['Vs_wet']
fig, ax = plt.subplots()
popt, pcov = curve_fit(lambda fx, a, b: a * fx ** -b, x, y)
x_linspace = np.linspace(min(x - 100), max(x + 100), 100)
power_y = popt[0]*x_linspace ** -popt[1]
ax1 = plt.scatter(x, y, c=df['Porosity'], cmap=plt.cm.Greys, vmin=2, vmax=df['Porosity'].max(), edgecolors="#B6BBBD")
plt.plot(x_linspace, power_y, color='grey', label='Dry')
popt, pcov = curve_fit(lambda fx, a, b: a * fx ** -b, q, w)
q_linspace = np.linspace(min(q - 100), max(q + 100), 100)
power_w = popt[0]*q_linspace ** -popt[1]
ax2 = plt.scatter(q, w, c=df['Porosity'], cmap=plt.cm.Blues, vmin=2, vmax=df['Porosity'].max(), edgecolors="#3D83C1")
plt.plot(q_linspace, power_w, label='Wet')
cbar = fig.colorbar(ax2)
cbar = fig.colorbar(ax1)
cbar.set_label("Porosity (%)")
plt.xlabel('Vp (m/s)')
plt.ylabel('Vs (m/s)')
plt.grid()
plt.legend()
plt.show()
Desired result:

You seem to need a colorbar with two color maps combined, one of them reversed, and have the ticks changed to percentage values.
An approach is to manually create a second subplot, use two images and make it look like a colorbar:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
# first create some dummy data to plot
N = 100
x = np.random.uniform(0, 10, N)
y = np.random.normal(15, 2, N)
q = np.random.uniform(0, 10, N)
w = np.random.normal(10, 2, N)
df_porosity = np.random.uniform(0, 5, N)
fig, (ax, ax2) = plt.subplots(ncols=2, figsize=(6, 4), gridspec_kw={"width_ratios": [1, 0.08]})
plot1 = ax.scatter(x, y, c=df_porosity, cmap=plt.cm.Greys, vmin=2, vmax=df_porosity.max(), edgecolors="#B6BBBD")
plot2 = ax.scatter(q, w, c=df_porosity, cmap=plt.cm.Blues, vmin=2, vmax=df_porosity.max(), edgecolors="#3D83C1")
img_cbar = np.linspace(0, 1, 256).reshape(256, 1)
ax2.imshow(img_cbar, cmap=plt.cm.Blues, extent=[0, 1, 1, 0]) # aspect='auto')
ax2.imshow(img_cbar, cmap=plt.cm.Greys, extent=[0, 1, -1, 0])
ax2.set_ylim(-1, 1)
ax2.set_aspect(10)
ax2.set_ylabel("Porosity (%)")
ax2.yaxis.set_label_position("right")
ax2.set_xticks([])
ax2.yaxis.tick_right()
# optionally show the ticks as percentage, where 1.0 corresponds to 100 %
ax2.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
plt.tight_layout()
plt.show()

Increase space between y tick and custom x axis

I'm trying to a build a plot that has an exponential function on the top and the utility function on the bottom. With the Y-Axis in the top plot showing the latency and X-Axis as the congestion; similarly, in the second plot, Y-Axis is the throughput and the X-Axis is the congestion.
Where I fail to get is, how do I set the X-Axis as a percentage, and is there a way to superimpose these two graphs.
#!/usr/bin/env python3
import numpy as np
import math
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import matplotlib
fig = plt.figure()
x = np.arange(1,9,1)
y = [math.exp(_) for _ in x]
ax = fig.add_subplot(211)
ax.plot(x, y)
ax.set_ylabel('Y_plot1')
ax.set_xlabel('X_plot1')
ax.set_yticks([],[])
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.yaxis.set_tick_params(which='major', direction='out')
ax.set_ymargin(1)
ax1 = fig.add_subplot(212)
mu = 5
variance = 1
sigma = math.sqrt(variance)
x_normal = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
y_normal = mlab.normpdf(x_normal, mu, sigma)
#y_normal += 1000
x_normal = [0, 0] + list(x_normal)
y_normal = [0, 0] + list(y_normal)
ax1.plot(x_normal, y_normal)
ax1.set_ylabel('Y_plot2')
ax1.set_xlabel('X_plot2')
ax1.set_yticks([],[])
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.xaxis.set_ticks_position('bottom')
ax1.yaxis.set_ticks_position('left')
ax1.set_ymargin(1)
fig.tight_layout()
fig.savefig('bw-latency' +'.pdf',format='pdf',bbox_inches='tight', pad_inches=0.1, dpi=1000)
plt.clf()
plt.close()

To convert your x-axis to percent, you could normalize x_normaland adjust the xticks:
x_normal = x_normal/(max(x_normal)-min(x_normal)) + min(x_normal)
ax1.plot(x_normal, y_normal)
ax1.set_xticks(np.linspace(0,1,5))
ax1.set_xticklabels([str(int(i*100)) for i in np.linspace(0,1,5)])
To superimpose two graphs, have a look at: https://matplotlib.org/gallery/api/two_scales.html
I your case:
ax3 = ax1.twinx()
y = [math.exp(_) for _ in x_normal]
ax3.plot(x_normal, y,color="r")
EDIT: Is this the kind of output you are seeking?:
Here is the code that worked for me:
def plot_percentage(x, y, ax):
x = x/max(x)
ax.plot(x, y)
ax.set_xticks(np.linspace(0, 1, 10))
ax.set_xticklabels([str(int(i*100)) for i in np.linspace(0,1, 10)])
fig = plt.figure()
x = np.arange(1,9,1)
y = [math.exp(_) for _ in x]
ax = fig.add_subplot(211)
plot_percentage(x, y, ax)
ax.set_ylabel('Y_plot1')
ax.set_xlabel('X_plot1')
ax.set_yticks([],[])
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.yaxis.set_tick_params(which='major', direction='out')
ax.set_ymargin(1)
ax1 = fig.add_subplot(212)
mu = 5
variance = 1
sigma = math.sqrt(variance)
x_normal = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
y_normal = mlab.normpdf(x_normal, mu, sigma)
#y_normal += 1000
x_normal = [0, 0] + list(x_normal)
y_normal = [0, 0] + list(y_normal)
plot_percentage(x_normal, y_normal, ax1)
ax3 = ax1.twinx()
y = [math.exp(_) for _ in x_normal]
plot_percentage(x_normal, y, ax3)
plt.show()

python: sns distplot area overlap

How can I get the overlapping area of 2 sns.distplots?
Apart from the difference in mean (as below) I would like to add a number that descripes how different the (normalised) distributions are (for example 2 distributions could have the same mean but still look very different if they are not normal).
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
x1 = np.random.normal(size=2000)
x2 = np.random.normal(size=1000)+1
sns.distplot(x1, hist=False, kde=True, color="r", norm_hist=True)
sns.distplot(x2, hist=False, kde=True, color="b", norm_hist=True)
m1 = x1.mean()
m2 = x2.mean()
plt.title("m1={:2.2f}, m2={:2.2f} (diffInMean={:2.2f})".format(m1, m2, m1-m2))
plt.show(block=True)

If somebody is interested: I have approximated it now with an integral of the distributions (unfortunately not quite the 1-liner I was searching for):
data1 = np.random.normal(size=9000)
data2 = np.random.normal(size=5000, loc=0.5, scale=1.5)
num_bins = 100
xmin = min(data1.min(), data2.min())
xmax = max(data1.max(), data2.max())
bins = np.linspace(xmin, xmax, num_bins)
weights1 = np.ones_like(data1) / float(len(data1))
weights2 = np.ones_like(data2) / float(len(data2))
hist_1 = np.histogram(data1, bins, weights=weights1)[0]
hist_2 = np.histogram(data2, bins, weights=weights2)[0]
tvd = 0.5*sum(abs(hist_1 - hist_2))
print("overlap: {:2.2f} percent".format((1-tvd)*100))
plt.figure()
ax = plt.gca()
ax.hist(data1, bins, weights=weights1, color='red', edgecolor='white', alpha=0.5)[0]
ax.hist(data2, bins, weights=weights2, color='blue', edgecolor='white', alpha=0.5)[0]
plt.show()

How can I create custom break points in a matplotlib colorbar?

I'm borrowing an example from the matplotlib custom cmap examples page:
https://matplotlib.org/examples/pylab_examples/custom_cmap.html
This produces the same image with different numbers of shading contours, as specified in the number of bins: n_bins:
https://matplotlib.org/_images/custom_cmap_00.png
However, I'm interested not only in the number of bins, but the specific break points between the color values. For example, when nbins=6 in the top right subplot, how can I specify the ranges of the bins to such that the shading is filled in these custom areas:
n_bins_ranges = ([-10,-5],[-5,-2],[-2,-0.5],[-0.5,2.5],[2.5,7.5],[7.5,10])
Is it also possible to specify the inclusivity of the break points? For example, I'd like to specify in the range between -2 and 0.5 whether it's -2 < x <= -0.5 or -2 <= x < -0.5.
EDIT WITH ANSWER BELOW:
Using the accepted answer below, here is code that plots each step including finally adding custom colorbar ticks at the midpoint. Note I can't post an image since I'm a new user.
Set up data and 6 color bins:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
# Make some illustrative fake data:
x = np.arange(0, np.pi, 0.1)
y = np.arange(0, 2*np.pi, 0.1)
X, Y = np.meshgrid(x, y)
Z = np.cos(X) * np.sin(Y) * 10
# Create colormap with 6 discrete bins
colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)] # R -> G -> B
n_bin = 6
cmap_name = 'my_list'
cm = matplotlib.colors.LinearSegmentedColormap.from_list(
cmap_name, colors, N=n_bin)
Plot different options:
# Set up 4 subplots
fig, axs = plt.subplots(2, 2, figsize=(6, 9))
fig.subplots_adjust(left=0.02, bottom=0.06, right=0.95, top=0.94, wspace=0.05)
# Plot 6 bin figure
im = axs[0,0].imshow(Z, interpolation='nearest', origin='lower', cmap=cm)
axs[0,0].set_title("Original 6 Bin")
fig.colorbar(im, ax=axs[0,0])
# Change the break points
n_bins_ranges = [-10,-5,-2,-0.5,2.5,7.5,10]
norm = matplotlib.colors.BoundaryNorm(n_bins_ranges, len(n_bins_ranges))
im = axs[0,1].imshow(Z, interpolation='nearest', origin='lower', cmap=cm, norm=norm)
axs[0,1].set_title("Custom Break Points")
fig.colorbar(im, ax=axs[0,1])
# Arrange color labels by data interval (not colors)
im = axs[1,0].imshow(Z, interpolation='nearest', origin='lower', cmap=cm, norm=norm)
axs[1,0].set_title("Linear Color Distribution")
fig.colorbar(im, ax=axs[1,0], spacing="proportional")
# Provide custom labels at color midpoints
# And change inclusive equality by adding arbitrary small value
n_bins_ranges_arr = np.asarray(n_bins_ranges)+1e-9
norm = matplotlib.colors.BoundaryNorm(n_bins_ranges, len(n_bins_ranges))
n_bins_ranges_midpoints = (n_bins_ranges_arr[1:] + n_bins_ranges_arr[:-1])/2.0
im = axs[1,1].imshow(Z, interpolation='nearest', origin='lower', cmap=cm ,norm=norm)
axs[1,1].set_title("Midpoint Labels\n Switched Equal Sign")
cbar=fig.colorbar(im, ax=axs[1,1], spacing="proportional",
ticks=n_bins_ranges_midpoints.tolist())
cbar.ax.set_yticklabels(['Red', 'Brown', 'Green 1','Green 2','Gray Blue','Blue'])
plt.show()

You can use a BoundaryNorm as follows:
import matplotlib.pyplot as plt
import matplotlib.colors
import numpy as np
x = np.arange(0, np.pi, 0.1)
y = np.arange(0, 2*np.pi, 0.1)
X, Y = np.meshgrid(x, y)
Z = np.cos(X) * np.sin(Y) * 10
colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)] # R -> G -> B
n_bin = 6 # Discretizes the interpolation into bins
n_bins_ranges = [-10,-5,-2,-0.5,2.5,7.5,10]
cmap_name = 'my_list'
fig, ax = plt.subplots()
# Create the colormap
cm = matplotlib.colors.LinearSegmentedColormap.from_list(
cmap_name, colors, N=n_bin)
norm = matplotlib.colors.BoundaryNorm(n_bins_ranges, len(n_bins_ranges))
# Fewer bins will result in "coarser" colomap interpolation
im = ax.imshow(Z, interpolation='nearest', origin='lower', cmap=cm, norm=norm)
ax.set_title("N bins: %s" % n_bin)
fig.colorbar(im, ax=ax)
plt.show()
Or, if you want proportional spacing, i.e. the distance between colors according to their values,
fig.colorbar(im, ax=ax, spacing="proportional")
As the boundary norm documentation states
If b[i] <= v < b[i+1]
then v is mapped to color j; as i varies from 0 to len(boundaries)-2, j goes from 0 to ncolors-1.
So the colors are always chosen as -2 <= x < -0.5, in order to obtain the equal sign on the other side you would need to supply
something like n_bins_ranges = np.array([-10,-5,-2,-0.5,2.5,7.5,10])-1e-9

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to normalize probability distribution values in the matplotlib histogram plot? - python

Related

How to plot a vertical thermal plot?

How to combine two matplotlib (python) colormaps from scatter plot

Increase space between y tick and custom x axis

python: sns distplot area overlap

How can I create custom break points in a matplotlib colorbar?

Categories

Resources