Plot CDF with confidence interval using Seaborn - python

I'm trying to plot a CDF from multiple simulation runs using Seaborn. I created a very simple code to emulate my results:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df1 = pd.DataFrame({'A':np.random.randint(0, 100, 1000)})
df2 = pd.DataFrame({'A':np.random.randint(0, 100, 1000)})
df3 = pd.DataFrame({'A':np.random.randint(0, 100, 1000)})
f, ax = plt.subplots(figsize=(8, 8))
ax = sns.kdeplot(df1['A'], cumulative=True)
ax = sns.kdeplot(df2['A'], cumulative=True)
ax = sns.kdeplot(df3['A'], cumulative=True)
plt.show()
The code above creates the following plot:
But, since the three lines are results from the same simulation with different seeds, I'd like to "merge" the three lines into one and add a shaded area around the line, representing min and max or the std of the three different runs.
How can this be accomplished in Seaborn?

You may use fill_between to fill between two curves. Now here the problem is that the kde support would be different for the three curves. Obtaining a common kde support will require to calculate the cdf manually. This could be done as follows.
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
def cdf(data, limits="auto", npoints=600):
kde = stats.gaussian_kde(data)
bw = kde.factor
if limits == "auto":
limits = (data.min(), data.max())
limits = (limits[0]-bw*np.diff(limits)[0],
limits[1]+bw*np.diff(limits)[0])
x = np.linspace(limits[0], limits[1], npoints)
y = [kde.integrate_box(x[0],x[i]) for i in range(len(x))]
return x, np.array(y)
d1 = np.random.randint(14, 86, 1000)
d2 = np.random.randint(10, 100, 1000)
d3 = np.random.randint(0, 90, 1000)
mini = np.min((d1.min(), d2.min(), d3.min()))
maxi = np.max((d1.max(), d2.max(), d3.max()))
x1,y1 = cdf(d1, limits=(mini, maxi))
x2,y2 = cdf(d2, limits=(mini, maxi))
x3,y3 = cdf(d3, limits=(mini, maxi))
y = np.column_stack((y1, y2, y3))
ymin = np.min(y, axis=1)
ymax = np.max(y, axis=1)
f, ax = plt.subplots()
ax.plot(x1,y1)
ax.plot(x2,y2)
ax.plot(x3,y3)
ax.fill_between(x1, ymin, ymax, color="turquoise", alpha=0.4, zorder=0)
plt.show()

Related

Color overlapped lines with plt.plot (Matplotlib)

How can I configure plt.plot such that overlapped lines will have darker colors?
For example, I would like to use plt.plot to display the samples in such a way that the density that can be seen in the upper plot will be clear in the lower plot.
From the lower plot it's hard to understand where most of the samples are located
Here is the code I used in order to generate the example:
import numpy as np
import matplotlib.pyplot as plt
time = 100
n_samples = 7000
x = np.linspace(0, time, n_samples)
r1 = np.random.normal(0, 1, x.size)
r2 = np.random.uniform(-6, 6, x.size)
data = np.dstack((r1, r2)).flatten()
fig, axs = plt.subplots(2, 1, figsize=(9, 6))
axs[0].scatter(np.arange(len(data)), data, alpha=0.1)
axs[1].plot(np.arange(len(data)), data, alpha=0.2)
plt.show()
Update: segmentation and plotting into separated function
Instead of drawing one large curve, you could create each line segment separately and then draw these. That way, the overlapping segments will be blended via the transparency.
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import numpy as np
def plot_line_as_segments(xs, ys=None, ax=None, **kwargs):
ax = ax or plt.gca()
if ys is None:
ys = xs
xs = np.arange(len(ys))
segments = np.c_[xs[:-1], ys[:-1], xs[1:], ys[1:]].reshape(-1, 2, 2)
added_collection = ax.add_collection(LineCollection(segments, **kwargs))
ax.autoscale()
return added_collection
time = 100
n_samples = 7000
x = np.linspace(0, time, n_samples)
r1 = np.random.normal(0, 1, x.size)
r2 = np.random.uniform(-6, 6, x.size)
data = np.dstack((r1, r2)).flatten()
fig, axs = plt.subplots(2, 1, figsize=(9, 6))
axs[0].scatter(np.arange(len(data)), data, alpha=0.1)
axs[0].margins(x=0)
plot_line_as_segments(data, ax=axs[1], alpha=0.05)
axs[1].margins(x=0)
plt.show()

Fill between subplots with matplotlib cmap

I have 2 line plots on the same figure, plotted from pandas dataframes.
I want to fill between them with a gradient/colour map of sorts.
I understand I can do this with a cmap, only it will not work for me (see code below).
General example I found are filling between x axis and line, i do not want that and also i am interested in simplest solution possible for this as i am a begginer at this and complicated, though maybe better code will just make it more confusing honestly.
Code for which fill is plain blue:
import matplotlib.pyplot as plt
import pandas as pd
ax = plt.gca()
df0.plot(kind='line', x='something', y='other', color='orange', ax=ax, legend=False, figsize=(20,10))
df1.plot(kind='line', x='something', y='other2', color='c', ax=ax, legend=False, figsize=(20,10))
ax.fill_between(x=df0['daysInAYear'], y1=df0['other'], y2 = df1['other2'], alpha=0.2, cmap=plt.cm.get_cmap("winter"))
plt.show()
EDIT/UPDATE: DATA EXAMPLE
other is ALWAYS >= other2
other other2 something (same for both)
15.6 -16.0 1
13.9 -26.7 2
13.3 -26.7 3
10.6 -26.1 4
12.8 -15.0 5
Final graph example:
I would like the fill to go from orange on top to blue at the bottom
Edit
In response to the edited question, here is an alternative approach which does the gradient vertically but doesn't use imshow.
import matplotlib.pyplot as plt
from matplotlib import colors, patches
import numpy as np
import pandas as pd
n = 100
nc = 100
x = np.linspace(0, np.pi*5, n)
y1 = [-50.0]
y2 = [50.0]
for ii in range(1, n):
y1.append(y1[ii-1] + (np.random.random()-0.3)*3)
y2.append(y2[ii-1] + (np.random.random()-0.5)*3)
y1 = np.array(y1)
y2 = np.array(y2)
z = np.linspace(0, 10, nc)
normalize = colors.Normalize(vmin=z.min(), vmax=z.max())
cmap = plt.cm.get_cmap('winter')
fig, ax = plt.subplots(1)
for ii in range(len(df['x'].values)-1):
y = np.linspace(y1[ii], y2[ii], nc)
yn = np.linspace(y1[ii+1], y2[ii+1], nc)
for kk in range(nc - 1):
p = patches.Polygon([[x[ii], y[kk]],
[x[ii+1], yn[kk]],
[x[ii+1], yn[kk+1]],
[x[ii], y[kk+1]]], color=cmap(normalize(z[kk])))
ax.add_patch(p)
plt.plot(x, y1, 'k-', lw=1)
plt.plot(x, y2, 'k-', lw=1)
plt.show()
The idea here being similar to that in my original answer, except the trapezoids are divided into nc pieces and each piece is colored separately. This has the advantage of scaling correctly for varying y1[ii], y2[ii] distances, as shown in this comparison,
It does, however, have the disadvantages of being much, much slower than imshow or the horizontal gradient method and of being unable to handle 'crossing' correctly.
The code to generate the second image in the above comparison:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import patches
from matplotlib.path import Path
x = np.linspace(0, 10, n)
y1 = [-50.0]
y2 = [50.0]
for ii in range(1, n):
y1.append(y1[ii-1] + (np.random.random()-0.2)*3)
y2.append(y2[ii-1] + (np.random.random()-0.5)*3)
y1 = np.array(y1)
y2 = np.array(y2)
verts = np.vstack([np.stack([x, y1], 1), np.stack([np.flip(x), np.flip(y2)], 1)])
path = Path(verts)
patch = patches.PathPatch(path, facecolor='k', lw=2, alpha=0.0)
plt.gca().add_patch(patch)
plt.imshow(np.arange(10).reshape(10,-1), cmap=plt.cm.winter, interpolation="bicubic",
origin='upper', extent=[0,10,-60,60], aspect='auto', clip_path=patch,
clip_on=True)
plt.show()
Original
This is a bit of a hack, partly based on the answers in this question. It does seem to work fairly well but works best with higher density along the x axis. The idea is to call fill_between separately for each trapezoid corresponding to x pairs, [x[ii], x[ii+1]]. Here is a complete example using some generated data
import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np
import pandas as pd
n = 1000
X = np.linspace(0, np.pi*5, n)
Y1 = np.sin(X)
Y2 = np.cos(X)
Z = np.linspace(0, 10, n)
normalize = colors.Normalize(vmin=Z.min(), vmax=Z.max())
cmap = plt.cm.get_cmap('winter')
df = pd.DataFrame({'x': X, 'y1': Y1, 'y2': Y2, 'z': Z})
x = df['x'].values
y1 = df['y1'].values
y2 = df['y2'].values
z = df['z'].values
for ii in range(len(df['x'].values)-1):
plt.fill_between([x[ii], x[ii+1]], [y1[ii], y1[ii+1]],
[y2[ii], y2[ii+1]], color=cmap(normalize(z[ii])))
plt.plot(x, y1, 'k-', x, y2, 'k-')
plt.show()
This can be generalized to a 2 dimensional color grid but would require non-trivial modification

How to avoid overlapping error bars in matplotlib?

I want to create a plot for two different datasets similar to the one presented in this answer:
In the above image, the author managed to fix the overlapping problem of the error bars by adding some small random scatter in x to the new dataset.
In my problem, I must plot a similar graphic, but having some categorical data in the x axis:
Any ideas on how to slightly move one the error bars of the second dataset using categorical variables at the x axis? I want to avoid the overlapping between the bars for making the visualization easier.
You can translate each errorbar by adding the default data transform to a prior translation in data space. This is possible when knowing that categories are in general one data unit away from each other.
import numpy as np; np.random.seed(42)
import matplotlib.pyplot as plt
from matplotlib.transforms import Affine2D
x = list("ABCDEF")
y1, y2 = np.random.randn(2, len(x))
yerr1, yerr2 = np.random.rand(2, len(x))*4+0.3
fig, ax = plt.subplots()
trans1 = Affine2D().translate(-0.1, 0.0) + ax.transData
trans2 = Affine2D().translate(+0.1, 0.0) + ax.transData
er1 = ax.errorbar(x, y1, yerr=yerr1, marker="o", linestyle="none", transform=trans1)
er2 = ax.errorbar(x, y2, yerr=yerr2, marker="o", linestyle="none", transform=trans2)
plt.show()
Alternatively, you could translate the errorbars after applying the data transform and hence move them in units of points.
import numpy as np; np.random.seed(42)
import matplotlib.pyplot as plt
from matplotlib.transforms import ScaledTranslation
x = list("ABCDEF")
y1, y2 = np.random.randn(2, len(x))
yerr1, yerr2 = np.random.rand(2, len(x))*4+0.3
fig, ax = plt.subplots()
trans1 = ax.transData + ScaledTranslation(-5/72, 0, fig.dpi_scale_trans)
trans2 = ax.transData + ScaledTranslation(+5/72, 0, fig.dpi_scale_trans)
er1 = ax.errorbar(x, y1, yerr=yerr1, marker="o", linestyle="none", transform=trans1)
er2 = ax.errorbar(x, y2, yerr=yerr2, marker="o", linestyle="none", transform=trans2)
plt.show()
While results look similar in both cases, they are fundamentally different. You will observe this difference when interactively zooming the axes or changing the figure size.
Consider the following approach to highlight plots - combination of errorbar and fill_between with non-zero transparency:
import random
import matplotlib.pyplot as plt
# create sample data
N = 8
data_1 = {
'x': list(range(N)),
'y': [10. + random.random() for dummy in range(N)],
'yerr': [.25 + random.random() for dummy in range(N)]}
data_2 = {
'x': list(range(N)),
'y': [10.25 + .5 * random.random() for dummy in range(N)],
'yerr': [.5 * random.random() for dummy in range(N)]}
# plot
plt.figure()
# only errorbar
plt.subplot(211)
for data in [data_1, data_2]:
plt.errorbar(**data, fmt='o')
# errorbar + fill_between
plt.subplot(212)
for data in [data_1, data_2]:
plt.errorbar(**data, alpha=.75, fmt=':', capsize=3, capthick=1)
data = {
'x': data['x'],
'y1': [y - e for y, e in zip(data['y'], data['yerr'])],
'y2': [y + e for y, e in zip(data['y'], data['yerr'])]}
plt.fill_between(**data, alpha=.25)
Result:
Threre is example on lib site: https://matplotlib.org/stable/gallery/lines_bars_and_markers/errorbar_subsample.html
enter image description here
You need parameter errorevery=(m, n),
n - how often plot error lines, m - shift with range from 0 to n

python: sns distplot area overlap

How can I get the overlapping area of 2 sns.distplots?
Apart from the difference in mean (as below) I would like to add a number that descripes how different the (normalised) distributions are (for example 2 distributions could have the same mean but still look very different if they are not normal).
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
x1 = np.random.normal(size=2000)
x2 = np.random.normal(size=1000)+1
sns.distplot(x1, hist=False, kde=True, color="r", norm_hist=True)
sns.distplot(x2, hist=False, kde=True, color="b", norm_hist=True)
m1 = x1.mean()
m2 = x2.mean()
plt.title("m1={:2.2f}, m2={:2.2f} (diffInMean={:2.2f})".format(m1, m2, m1-m2))
plt.show(block=True)
If somebody is interested: I have approximated it now with an integral of the distributions (unfortunately not quite the 1-liner I was searching for):
data1 = np.random.normal(size=9000)
data2 = np.random.normal(size=5000, loc=0.5, scale=1.5)
num_bins = 100
xmin = min(data1.min(), data2.min())
xmax = max(data1.max(), data2.max())
bins = np.linspace(xmin, xmax, num_bins)
weights1 = np.ones_like(data1) / float(len(data1))
weights2 = np.ones_like(data2) / float(len(data2))
hist_1 = np.histogram(data1, bins, weights=weights1)[0]
hist_2 = np.histogram(data2, bins, weights=weights2)[0]
tvd = 0.5*sum(abs(hist_1 - hist_2))
print("overlap: {:2.2f} percent".format((1-tvd)*100))
plt.figure()
ax = plt.gca()
ax.hist(data1, bins, weights=weights1, color='red', edgecolor='white', alpha=0.5)[0]
ax.hist(data2, bins, weights=weights2, color='blue', edgecolor='white', alpha=0.5)[0]
plt.show()

Plot two histograms on the same graph and have their columns sum to 100

I have two sets of different sizes that I'd like to plot on the same histogram. However, since one set has ~330,000 values and the other has about ~16,000 values, their frequency histograms are hard to compare. I'd like to plot a histogram comparing the two sets such that the y-axis is the % of occurrences in that bin. My code below gets close to this, except that rather than having the individual bin values sum to 1.0, the integral of the histogram sums to 1.0 (this is because of the normed=True parameter).
How can I achieve my goal? I've already tried manually calculating the % frequency and using plt.bar() but rather than overlaying the plots, the plots are compared side by side. I want to keep the effect of having the alpha=0.5
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
if plt.get_fignums():
plt.close('all')
electric = pd.read_csv('electric.tsv', sep='\t')
gas = pd.read_csv('gas.tsv', sep='\t')
electric_df = pd.DataFrame(electric)
gas_df = pd.DataFrame(ngma_nonheat)
electric = electric_df['avg_daily']*30
gas = gas_df['avg_daily']*30
## Create a plot for NGMA gas usage
plt.figure("Usage Comparison")
weights_electric = np.ones_like(electric)/float(len(electric))
weights_gas = np.ones_like(gas)/float(len(gas))
bins=np.linspace(0, 200, num=50)
n, bins, rectangles = plt.hist(electric, bins, alpha=0.5, label='electric usage', normed=True, weights=weights_electric)
plt.hist(gas, bins, alpha=0.5, label='gas usage', normed=True, weights=weights_gas)
plt.legend(loc='upper right')
plt.xlabel('Average 30 day use in therms')
plt.ylabel('% of customers')
plt.title('NGMA Customer Usage Comparison')
plt.show()
It sounds like you don't want the normed/density kwarg in this case. You're already using weights. If you multiply your weights by 100 and leave out the normed=True option, you should get exactly what you had in mind.
For example:
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1)
x = np.random.normal(5, 2, 10000)
y = np.random.normal(2, 1, 3000000)
xweights = 100 * np.ones_like(x) / x.size
yweights = 100 * np.ones_like(y) / y.size
fig, ax = plt.subplots()
ax.hist(x, weights=xweights, color='lightblue', alpha=0.5)
ax.hist(y, weights=yweights, color='salmon', alpha=0.5)
ax.set(title='Histogram Comparison', ylabel='% of Dataset in Bin')
ax.margins(0.05)
ax.set_ylim(bottom=0)
plt.show()
On the other hand, what you're currently doing (weights and normed) would result in (note the units on the y-axis):
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1)
x = np.random.normal(5, 2, 10000)
y = np.random.normal(2, 1, 3000000)
xweights = 100 * np.ones_like(x) / x.size
yweights = 100 * np.ones_like(y) / y.size
fig, ax = plt.subplots()
ax.hist(x, weights=xweights, color='lightblue', alpha=0.5, normed=True)
ax.hist(y, weights=yweights, color='salmon', alpha=0.5, normed=True)
ax.set(title='Histogram Comparison', ylabel='% of Dataset in Bin')
ax.margins(0.05)
ax.set_ylim(bottom=0)
plt.show()

Categories