python: sns distplot area overlap

How can I get the overlapping area of two sns.distplots?
Apart from the difference in means (as below), I would like to add a number that describes how different the (normalised) distributions are (for example, two distributions could have the same mean but still look very different if they are not normal).
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
x1 = np.random.normal(size=2000)
x2 = np.random.normal(size=1000)+1
sns.distplot(x1, hist=False, kde=True, color="r", norm_hist=True)
sns.distplot(x2, hist=False, kde=True, color="b", norm_hist=True)
m1 = x1.mean()
m2 = x2.mean()
plt.title("m1={:2.2f}, m2={:2.2f} (diffInMean={:2.2f})".format(m1, m2, m1-m2))
plt.show(block=True)

If somebody is interested: I have now approximated it with an integral over the (binned) distributions (unfortunately not quite the one-liner I was searching for):
data1 = np.random.normal(size=9000)
data2 = np.random.normal(size=5000, loc=0.5, scale=1.5)
num_bins = 100
xmin = min(data1.min(), data2.min())
xmax = max(data1.max(), data2.max())
bins = np.linspace(xmin, xmax, num_bins)
weights1 = np.ones_like(data1) / float(len(data1))
weights2 = np.ones_like(data2) / float(len(data2))
hist_1 = np.histogram(data1, bins, weights=weights1)[0]
hist_2 = np.histogram(data2, bins, weights=weights2)[0]
tvd = 0.5*sum(abs(hist_1 - hist_2))
print("overlap: {:2.2f} percent".format((1-tvd)*100))
plt.figure()
ax = plt.gca()
ax.hist(data1, bins, weights=weights1, color='red', edgecolor='white', alpha=0.5)
ax.hist(data2, bins, weights=weights2, color='blue', edgecolor='white', alpha=0.5)
plt.show()
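For a smoother estimate that works directly on the KDE curves rather than on histogram bins, the overlap can also be approximated by integrating the pointwise minimum of the two density estimates. A minimal sketch, assuming scipy is available (the grid of 1000 points is an arbitrary choice, and gaussian_kde is only similar to, not identical with, the estimate sns.distplot draws):
import numpy as np
from scipy.stats import gaussian_kde

data1 = np.random.normal(size=9000)
data2 = np.random.normal(size=5000, loc=0.5, scale=1.5)

# Fit a Gaussian KDE to each sample.
kde1 = gaussian_kde(data1)
kde2 = gaussian_kde(data2)

# Evaluate both densities on a common grid spanning both samples.
grid = np.linspace(min(data1.min(), data2.min()),
                   max(data1.max(), data2.max()), 1000)
d1 = kde1(grid)
d2 = kde2(grid)

# Overlap coefficient: area under the pointwise minimum of the two densities,
# approximated with a simple rectangle rule on the uniform grid.
overlap = np.minimum(d1, d2).sum() * (grid[1] - grid[0])
print("overlap: {:2.2f} percent".format(overlap * 100))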

Related

How to normalize probability distribution values in the matplotlib histogram plot?

I am trying to show both cumulative and non-cumulative distributions on the same plot.
fig, ax = plt.subplots(figsize=(10, 5))
n, bins, patches = ax.hist(x, n_bins, density=True, stacked=True, histtype='step',
                           cumulative=True, label='Empirical cumulative')
# Overlay a non-cumulative histogram.
ax.hist(x, bins=bins, density=True, stacked=True, histtype='step', cumulative=False, label='Empirical non-cumulative')
plt.show()
The empirical cumulative curve looks fine and its values do not exceed 1. However, the empirical non-cumulative curve has y values higher than 1. How can I normalize them?
Update:
Sample data:
n_bins = 20
x = [
0.0051055006412772065,
0.09770815865459548,
0.20666651037049322,
0.5433266733820051,
0.5717169069724539,
0.5421114013759187,
0.4994941193115986,
0.4391978276380223,
0.3673067648294034,
0.3150259778098451,
0.4072059689437963,
0.5781929593356039,
0.6494934859266276,
0.620882081680377,
0.5845829440637116,
0.515705471234385]
Please see the orange curve.
The easiest way to create a histogram with probability instead of probability density is to use seaborn's sns.histplot(..., stat='probability').
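A minimal sketch of that seaborn route (with made-up 1-D data, ignoring stacked=True):
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

n_bins = 20
x = np.random.normal(0, 1, 1000)

fig, ax = plt.subplots(figsize=(10, 5))
# stat='probability' makes the bar heights sum to 1 instead of integrating to 1.
sns.histplot(x, bins=n_bins, stat='probability', cumulative=True,
             element='step', fill=False, label='Empirical cumulative', ax=ax)
sns.histplot(x, bins=n_bins, stat='probability', cumulative=False,
             element='step', fill=False, label='Empirical non-cumulative', ax=ax)
ax.legend()
plt.show()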
To mimic this with standard matplotlib, you could calculate all values manually. For example:
import matplotlib.pyplot as plt
import numpy as np
n_bins = 20
x = np.random.normal(0, 1, (1000, 3))
bin_edges = np.linspace(x.min(), x.max(), n_bins + 1)
bin_values = np.array([np.histogram(x[:, i], bins=bin_edges)[0] for i in range(x.shape[1])])
cum_values = bin_values.cumsum(axis=1).cumsum(axis=0)
cum_values = cum_values / cum_values.max()
fig, ax = plt.subplots(figsize=(10, 5))
prev = 0
for c in cum_values:
    plt.step(np.append(bin_edges, bin_edges[-1]), np.concatenate([[0], c, [prev]]))
    prev = c[-1]
ax.set_prop_cycle(None)
prev = 0
for c in cum_values:
    c = np.diff(c)
    plt.step(np.append(bin_edges, bin_edges[-1]), np.concatenate([[0], c, [c[-1], prev]]), ls='--')
    prev = c[-1]
plt.show()
If you have just one distribution, stacked=True doesn't make a difference. The code would be simpler:
import matplotlib.pyplot as plt
import numpy as np
n_bins = 20
x = np.random.normal(0, 1, 1000)
bin_edges = np.linspace(x.min(), x.max(), n_bins + 1)
bin_values = np.histogram(x, bins=bin_edges)[0]
cum_values = bin_values.cumsum()
cum_values = cum_values / cum_values.max()
fig, ax = plt.subplots(figsize=(10, 5))
plt.step(np.append(bin_edges, bin_edges[-1]), np.concatenate([[0], cum_values, [0]]))
ax.set_prop_cycle(None)
c = np.diff(cum_values)
plt.step(np.append(bin_edges, bin_edges[-1]), np.concatenate([[0], c, [c[-1], 0]]), ls='--')
plt.show()

Plot CDF with confidence interval using Seaborn

I'm trying to plot a CDF from multiple simulation runs using Seaborn. I created a very simple code to emulate my results:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df1 = pd.DataFrame({'A':np.random.randint(0, 100, 1000)})
df2 = pd.DataFrame({'A':np.random.randint(0, 100, 1000)})
df3 = pd.DataFrame({'A':np.random.randint(0, 100, 1000)})
f, ax = plt.subplots(figsize=(8, 8))
ax = sns.kdeplot(df1['A'], cumulative=True)
ax = sns.kdeplot(df2['A'], cumulative=True)
ax = sns.kdeplot(df3['A'], cumulative=True)
plt.show()
The code above creates the following plot:
But, since the three lines are results from the same simulation with different seeds, I'd like to "merge" the three lines into one and add a shaded area around the line, representing min and max or the std of the three different runs.
How can this be accomplished in Seaborn?
You may use fill_between to fill between two curves. The problem here is that the kde support would be different for the three curves. Obtaining a common kde support requires calculating the cdf manually. This could be done as follows.
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
def cdf(data, limits="auto", npoints=600):
    kde = stats.gaussian_kde(data)
    bw = kde.factor
    if limits == "auto":
        limits = (data.min(), data.max())
    limits = (limits[0] - bw * np.diff(limits)[0],
              limits[1] + bw * np.diff(limits)[0])
    x = np.linspace(limits[0], limits[1], npoints)
    y = [kde.integrate_box(x[0], x[i]) for i in range(len(x))]
    return x, np.array(y)
d1 = np.random.randint(14, 86, 1000)
d2 = np.random.randint(10, 100, 1000)
d3 = np.random.randint(0, 90, 1000)
mini = np.min((d1.min(), d2.min(), d3.min()))
maxi = np.max((d1.max(), d2.max(), d3.max()))
x1,y1 = cdf(d1, limits=(mini, maxi))
x2,y2 = cdf(d2, limits=(mini, maxi))
x3,y3 = cdf(d3, limits=(mini, maxi))
y = np.column_stack((y1, y2, y3))
ymin = np.min(y, axis=1)
ymax = np.max(y, axis=1)
f, ax = plt.subplots()
ax.plot(x1,y1)
ax.plot(x2,y2)
ax.plot(x3,y3)
ax.fill_between(x1, ymin, ymax, color="turquoise", alpha=0.4, zorder=0)
plt.show()

Oceanographic plotting in Python

I am plotting several graphs, and I would like the colorbar range of the "U_velocity" and "U_shear_velocity" plots to run from -0.3 to 0.3. I am also trying to limit the x axis to 0 to 12.5 hours for the U and V shear velocity plots, but nothing works; instead of hours the axis shows the velocity values. How can I do that?
from netCDF4 import *
import matplotlib as mp
import numpy as np
#import matplotlib.pyplot as plt
import pylab as plt
#%%
file = "/home/vlad/Desktop/task/Untitled Folder/result.nc"
ncdata = Dataset(file, 'r')
u = np.squeeze(ncdata.variables['u'][:])
v = np.squeeze(ncdata.variables['v'][:])
z = np.squeeze(ncdata.variables['z'][:])
time = ncdata.variables['time'][:]/3600
ncdata.close()
u_mean = np.mean(u[0:100,:],0)
z_mean = np.mean(z[0:100,:],0)
v_mean = np.mean(v[0:100,:],0)
u_mean_10 = u[900:1000,:]
v_mean_10 = v[900:1000,:]
z_10 = np.mean(z[900:1000,:],0)
time_10 = time[900:1000] - time[900]
T = len(time_10)
L = len(z_10)
fig = plt.figure(6)
plt.pcolormesh(time_10,z_10,u_mean_10.T)
plt.xlim([0, time_10[-1]])
fig.suptitle('U_velocity', fontsize=25)
plt.xlabel('time', fontsize=20)
plt.ylabel('depth(m)', fontsize=20)
plt.colorbar()
plt.show()
shear_u_mean_10 = np.zeros([T,L])
for t in range(T):
    for i in range(L-1):
        tmp = (u_mean_10[t, i+1] - u_mean_10[t, i]) / (z_10[i+1] - z_10[i])
        tmp_depth = 0.5 * (z_10[i+1] + z_10[i])
        shear_u_mean_10[t, i] = tmp
fig = plt.figure(10)
plt.pcolormesh(time_10/3600,z_10, shear_u_mean_10.T)
plt.xlim([0, time_10[-1]/3600])
plt.colorbar()
#plt.ylim([-30, -25])
fig.suptitle('U_shear velocity', fontsize=25)
plt.xlabel('time', fontsize=20)
plt.ylabel('depth(m)', fontsize=20)
plt.show()
shear_v_mean_10 = np.zeros([T,L])
for t in range(T):
    for i in range(L-1):
        tmp = (v_mean_10[t, i+1] - v_mean_10[t, i]) / (z_10[i+1] - z_10[i])
        tmp_depth = 0.5 * (z_10[i+1] + z_10[i])
        shear_v_mean_10[t, i] = tmp
fig = plt.figure(11)
plt.pcolormesh(time_10/3600,z_10, shear_v_mean_10.T)
plt.xlim([0, time_10[-1]/3600])
plt.colorbar()
#plt.ylim([-30, -25])
fig.suptitle('V_shear velocity', fontsize=25)
plt.xlabel('time', fontsize=20)
plt.ylabel('depth(m)', fontsize=20)
plt.show()
fig = plt.figure(7)
plt.pcolormesh(time_10,z_10,v_mean_10.T)
plt.xlim([0, time_10[-1]])
fig.suptitle('V_velocity', fontsize=25)
plt.xlabel('time', fontsize=20)
plt.ylabel('depth(m)', fontsize=20)
plt.colorbar()
plt.show()
This is not an easy question to answer: it is a wall of code, it references an unknown file result.nc, and it bundles several unrelated, fairly specific problems. The following may help:
The colorbar range can be set by passing vmin=-0.3 and vmax=0.3 to pcolormesh.
To limit the time range, you can use array slicing (e.g. time[time < 12.5], u[time < 12.5]).
For your data, the speed is surely just speed = np.sqrt(np.power(u, 2) + np.power(v, 2)).
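A rough sketch of the first two points with synthetic data (result.nc is not available here, so the arrays below are made up):
import numpy as np
import matplotlib.pyplot as plt

# Made-up stand-ins for the arrays read from result.nc.
time = np.linspace(0, 25, 200)       # hours
depth = np.linspace(-50, 0, 60)      # metres
u = 0.3 * np.sin(time[:, None] / 2) * np.cos(depth[None, :] / 10)

mask = time <= 12.5                  # keep only the first 12.5 hours
plt.pcolormesh(time[mask], depth, u[mask].T,
               vmin=-0.3, vmax=0.3, shading='auto')
plt.colorbar(label='U velocity')
plt.xlabel('time (h)')
plt.ylabel('depth (m)')
plt.show()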
Please provide a Minimal, Complete, and Verifiable example if you want further help.

Plot two histograms on the same graph and have their columns sum to 100

I have two sets of different sizes that I'd like to plot on the same histogram. However, since one set has ~330,000 values and the other has about ~16,000 values, their frequency histograms are hard to compare. I'd like to plot a histogram comparing the two sets such that the y-axis is the % of occurrences in that bin. My code below gets close to this, except that rather than having the individual bin values sum to 1.0, the integral of the histogram sums to 1.0 (this is because of the normed=True parameter).
How can I achieve my goal? I've already tried manually calculating the % frequency and using plt.bar(), but rather than overlaying the plots, the bars end up side by side. I want to keep the overlapping effect of alpha=0.5.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
if plt.get_fignums():
    plt.close('all')
electric = pd.read_csv('electric.tsv', sep='\t')
gas = pd.read_csv('gas.tsv', sep='\t')
electric_df = pd.DataFrame(electric)
gas_df = pd.DataFrame(gas)
electric = electric_df['avg_daily']*30
gas = gas_df['avg_daily']*30
## Create a plot for NGMA gas usage
plt.figure("Usage Comparison")
weights_electric = np.ones_like(electric)/float(len(electric))
weights_gas = np.ones_like(gas)/float(len(gas))
bins=np.linspace(0, 200, num=50)
n, bins, rectangles = plt.hist(electric, bins, alpha=0.5, label='electric usage', normed=True, weights=weights_electric)
plt.hist(gas, bins, alpha=0.5, label='gas usage', normed=True, weights=weights_gas)
plt.legend(loc='upper right')
plt.xlabel('Average 30 day use in therms')
plt.ylabel('% of customers')
plt.title('NGMA Customer Usage Comparison')
plt.show()
It sounds like you don't want the normed/density kwarg in this case. You're already using weights. If you multiply your weights by 100 and leave out the normed=True option, you should get exactly what you had in mind.
For example:
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1)
x = np.random.normal(5, 2, 10000)
y = np.random.normal(2, 1, 3000000)
xweights = 100 * np.ones_like(x) / x.size
yweights = 100 * np.ones_like(y) / y.size
fig, ax = plt.subplots()
ax.hist(x, weights=xweights, color='lightblue', alpha=0.5)
ax.hist(y, weights=yweights, color='salmon', alpha=0.5)
ax.set(title='Histogram Comparison', ylabel='% of Dataset in Bin')
ax.margins(0.05)
ax.set_ylim(bottom=0)
plt.show()
On the other hand, what you're currently doing (weights and normed) would result in (note the units on the y-axis):
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1)
x = np.random.normal(5, 2, 10000)
y = np.random.normal(2, 1, 3000000)
xweights = 100 * np.ones_like(x) / x.size
yweights = 100 * np.ones_like(y) / y.size
fig, ax = plt.subplots()
ax.hist(x, weights=xweights, color='lightblue', alpha=0.5, normed=True)
ax.hist(y, weights=yweights, color='salmon', alpha=0.5, normed=True)
ax.set(title='Histogram Comparison', ylabel='% of Dataset in Bin')
ax.margins(0.05)
ax.set_ylim(bottom=0)
plt.show()

How to create equal number of primary and secondary y-axes ticks with matplotlib?

I have been working for a while to create a plot with a secondary axis so that the primary and secondary axes have an equal number of major ticks and the grid lines coincide. In the figure below I have shown grid lines on the secondary axis to illustrate the problem.
By manually setting the secondary axis limits I got this plot, which is my desired output:
I have included the reproducible code:
import numpy as np
import matplotlib.pyplot as plt
data = np.loadtxt('data.dat', skiprows=2, delimiter=',', unpack=True).transpose()
time = data[:,0]
pressure = data[:,1]
lift = data[:,2]
figure_pressure_trace = plt.figure(figsize=(5.15, 5.15))
figure_pressure_trace.clf()
P_vs_t = plt.subplot(111)
P_vs_t.plot(time, pressure, linewidth=1.0)
P_vs_t.set_ylabel(r'\textit{Pressure (bar)}', labelpad=6)
P_vs_t.set_xlabel(r'\textit{Time (ms)}', labelpad=6)
lift_vs_t = P_vs_t.twinx()
lift_vs_t.plot(time, lift, color='#4DAF4A')
lift_vs_t.set_ylabel(r'\textit{Lift(mm)}', labelpad=6)
plt.show()
plt.close()
The data is available here.
UPDATE:
I created a function to produce an equal number of ticks; the entire code is:
import numpy as np
import matplotlib.pyplot as plt
def equal_y_ticks(primary, secondary):
    y_min_primary, y_max_primary = primary.get_ybound()
    y_min_secondary, y_max_secondary = secondary.get_ybound()
    primary_ticks = len(primary.yaxis.get_major_ticks())
    secondary_ticks = len(secondary.yaxis.get_major_ticks())
    primary_spacing = (y_max_primary - y_min_primary) / (primary_ticks - 1)
    secondary_spacing = (y_max_secondary - y_min_secondary) / (secondary_ticks - 1)
    ticks = max(primary_ticks, secondary_ticks)
    if secondary_ticks < primary_ticks:
        y_max_secondary = y_min_secondary + (primary_ticks * secondary_spacing)
        secondary.yaxis.set_ticks(np.arange(y_min_secondary, y_max_secondary, secondary_spacing))
    else:
        y_max_primary = y_min_primary + (secondary_ticks * primary_spacing)
        primary.yaxis.set_ticks(np.arange(y_min_primary, y_max_primary, primary_spacing))
data = np.loadtxt('data.dat', skiprows=2, delimiter=',', unpack=True).transpose()
time = data[:,0]
pressure = data[:,1]
lift = data[:,2]
figure_pressure_trace = plt.figure(figsize=(5.15, 5.15))
figure_pressure_trace.clf()
P_vs_t = plt.subplot(111)
P_vs_t.plot(time, pressure, linewidth=1.0)
P_vs_t.set_ylabel(r'\textit{Pressure (bar)}', labelpad=6)
P_vs_t.set_xlabel(r'\textit{Time (ms)}', labelpad=6)
lift_vs_t = P_vs_t.twinx()
lift_vs_t.plot(time, lift, color='#4DAF4A')
equal_y_ticks(P_vs_t, lift_vs_t)
lift_vs_t.set_ylabel(r'\textit{Lift(mm)}', labelpad=6)
plt.show()
plt.close()
But this function gives me plots like these (for some data):
I think you are looking for LinearLocator (docs)
import matplotlib.pyplot as plt
from matplotlib import ticker as mtick
fig, ax = plt.subplots()
ax2 = ax.twinx()
ax.yaxis.set_major_locator(mtick.LinearLocator(5))
ax2.yaxis.set_major_locator(mtick.LinearLocator(5))
ax.set_ylim(0, 15)
ax2.set_ylim(0, 1500)
ax.yaxis.grid(True, lw=7, color='g', ls='--')
ax2.yaxis.grid(True, color='k', ls='-', lw=3)
This will put N evenly spaced ticks between the axis minimum and maximum.
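Applied to the axes from the question, that would look roughly like this (a sketch; P_vs_t and lift_vs_t are the axes created in the question's code):
from matplotlib import ticker as mtick

# Force the same number of evenly spaced major ticks on both y-axes
# so the grid lines of the twin axes coincide.
P_vs_t.yaxis.set_major_locator(mtick.LinearLocator(6))
lift_vs_t.yaxis.set_major_locator(mtick.LinearLocator(6))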
