How to plot Comparative Boxplot with a PDF like KDnuggets Style - python

While going through the "Understanding Boxplots" article on KDnuggets, I found a detailed plot of a boxplot combined with a probability density function (pdf).
I'm trying to plot a comparative boxplot and probability density function (pdf) like the one shown in the article, as in the figure below.
I know how to plot a basic box plot and a pdf individually, but my knowledge of visualization is minimal. I'm not asking for an exact replica of the above plot; a similar plot with the same level of detail would be highly appreciated.
I'm open to new ideas and approaches and wanted to put some feelers out before diving into getting started
Is it possible to create the above plot with Python? If yes, which package should be used? Can anybody shed some light on how to plot it with Python? I would be happy to receive any leads.

Here is an attempt to recreate the graphical elements of the plot. Instead of a perfect normal distribution, some random data is used, so you can plug in your own data. (For a more perfect curve, generate a higher number of samples.)
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
# Sample data; replace `x` with your own 1-D array of values.
x = np.random.normal(0, 1, 1000)
mean = x.mean()
std = x.std()
q1, median, q3 = np.percentile(x, [25, 50, 75])
iqr = q3 - q1

# Top axes: annotated box plot. Bottom axes: density curve on the same x scale.
fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)

medianprops = dict(linestyle='-', linewidth=2, color='yellow')
sns.boxplot(x=x, color='lightcoral', saturation=1, medianprops=medianprops,
            flierprops={'markerfacecolor': 'mediumseagreen'}, whis=1.5, ax=ax1)

# Label the x axis in multiples of the standard deviation around the mean.
ticks = [mean + std * i for i in range(-4, 5)]
ticklabels = [f'${i}\\sigma$' for i in range(-4, 5)]
ax1.set_xticks(ticks)
ax1.set_xticklabels(ticklabels)
ax1.set_yticks([])
# sharex=True hides the tick labels of the upper axes; switch them back on.
ax1.tick_params(labelbottom=True)
ax1.set_ylim(-1, 1.5)

# Bracket from Q1 to Q3 at y=1 marking the IQR.
# The error size must be non-negative: current Matplotlib raises ValueError
# for a negative yerr, and symmetric bars of 0.2 draw the same end caps.
ax1.errorbar([q1, q3], [1, 1], yerr=[0.2, 0.2], color='black', lw=1)
ax1.text(q1, 0.6, 'Q1', ha='center', va='center', color='black')
ax1.text(q3, 0.6, 'Q3', ha='center', va='center', color='black')
ax1.text(median, -0.6, 'median', ha='center', va='center', color='black')
ax1.text(median, 1.2, 'IQR', ha='center', va='center', color='black')
ax1.text(q1 - 1.5*iqr, 0.4, 'Q1 - 1.5*IQR', ha='center', va='center', color='black')
ax1.text(q3 + 1.5*iqr, 0.4, 'Q3 + 1.5*IQR', ha='center', va='center', color='black')
# Optional dotted guide lines running down from the box plot:
# ax1.vlines([q1 - 1.5*iqr, q1, q3, q3 + 1.5*iqr], 0, -2, color='darkgrey', ls=':', clip_on=False, zorder=0)

# Draw the KDE, then reuse the curve's own points to colour the area under it.
sns.kdeplot(x, ax=ax2)
kdeline = ax2.lines[0]
xs = kdeline.get_xdata()
ys = kdeline.get_ydata()
# Remember the automatic y limits before the fills are added.
ylims = ax2.get_ylim()
# Paint from the widest region to the narrowest so each later fill overlays
# the previous one: whole curve, then the whisker range, then the IQR.
ax2.fill_between(xs, 0, ys, color='mediumseagreen')
ax2.fill_between(xs, 0, ys, where=(xs >= q1 - 1.5*iqr) & (xs <= q3 + 1.5*iqr), color='skyblue')
ax2.fill_between(xs, 0, ys, where=(xs >= q1) & (xs <= q3), color='lightcoral')
# Optional dotted guide lines matching the ones above:
# ax2.vlines([q1 - 1.5*iqr, q1, q3, q3 + 1.5*iqr], 0, 100, color='darkgrey', ls=':', zorder=0)
# Start the density axis at zero so the filled area sits on the x axis.
ax2.set_ylim(0, ylims[1])
plt.show()
Some remarks:
Often the median and the mean don't coincide, so the 0 sigma might be a bit off from the median line.
Matplotlib draws the whiskers at the data point that is closest to the calculated Q1 - 1.5 IQR and Q3 + 1.5 IQR, so when there aren't a huge number of points, the position of the whisker might be off a bit.
For real data, the distribution seldom looks like a perfect bell curve.
Here is an example for 1 million samples:

Related

Calculate distance between the center of a point on a scatter plot to the edge of its marker, for dynamically changing marker sizes in Matplotlib

I have a scatter plot which I'd like to place by another scatter plot, however they have a dynamic marker size.
The green triangles (x, y) are calculated from the original scatter and they're close but not perfect (just from trial and error).
import pandas as pd
from mplsoccer import Pitch, VerticalPitch
# One row per player: label, pitch position (x, y) and scatter marker size.
data = [
    ['JA', 35, 60, 2000],
    ['RN', 20, 47, 1500],
    ['GG', 10, 32, 1000],
]
df = pd.DataFrame(data, columns=['Name', 'x', 'y', 'marker_size'])

# Offset per unit of marker size: the x or y extent of the biggest marker
# divided by its marker size, halved to get a radius. Found by trial and
# error -- the marker sizes seem to be non-linear, so this is only approximate.
x_radius_per_size = (7.3/2000) / 2
y_radius_per_size = (11.3/2000) / 2
df['xDiff'] = df['marker_size'] * x_radius_per_size
df['yDiff'] = df['marker_size'] * y_radius_per_size

# Arrow anchor points: shift the player position by the estimated radius
# along one axis and keep the other coordinate unchanged.
df['leftArrowX'], df['leftArrowY'] = df['x'] - df['xDiff'], df['y']
df['rightArrowX'], df['rightArrowY'] = df['x'] + df['xDiff'], df['y']
df['downArrowY'], df['downArrowX'] = df['y'] - df['yDiff'], df['x']
df['upArrowY'], df['upArrowX'] = df['y'] + df['yDiff'], df['x']

pitch = Pitch(pitch_type='opta', pitch_color='#202428', line_color='#F2F2F2', linewidth=2)
fig, ax = pitch.draw(figsize=(16, 10))

# Players are drawn with their individual marker sizes.
players = pitch.scatter(
    df['x'], df['y'],
    s=df['marker_size'], marker='8', color='orange',
    edgecolors='black', linewidth=1, alpha=1, ax=ax,
)

# All four arrow layers share the same look; only position and marker differ.
arrow_style = dict(s=100, color='lightgreen', alpha=1, ax=ax)
leftArrows = pitch.scatter(df['leftArrowX'], df['leftArrowY'], marker='<', **arrow_style)
rightArrows = pitch.scatter(df['rightArrowX'], df['rightArrowY'], marker='>', **arrow_style)
downArrows = pitch.scatter(df['downArrowX'], df['downArrowY'], marker='v', **arrow_style)
upArrows = pitch.scatter(df['upArrowX'], df['upArrowY'], marker='^', **arrow_style)
Result
How can I calculate the co-ordinates for the triangles more accurately given the original marker co-ordinates & marker size, so that they are placed evenly away at each point.
Or possibly any other solution to my problem.
Note: The pitch has co-ordinates 100x100, done in Jupyter Notebook. Thanks.

Simulate CDF curve for penetration/adoption extrapolation

I'd like to be able to plot a line like the cumulative distribution function for the normal distribution, because it's useful for simulating the adoption curve:
Specifically, I'd like to be able to use initial data (percentage adoption of a product) to extrapolate what the rest of that curve would look like, to give a rough estimate of the timeline to each of the phases. So, for example, if we got to 10% penetration by 30 days and 20% penetration by 40 days, and we try to fit this curve, I'd like to know when we're going to get to 80% penetration (vs another population that may have taken 50 days to get to 10% penetration).
So, my question is, how could I go about doing this? I would ideally be able to provide initial data (time and penetration), and use python (e.g. matplotlib) to plot out the rest of the chart for me. But I don't know where to start! Can anyone point me in the right direction?
(Incidentally, I also posted this question on CrossValidated, but I wasn't sure whether it belonged there, as it's a stats question, or here, as it's a python question. Apologies for duplication!)
The cdf can be calculated via scipy.stats.norm.cdf(). Its inverse, the ppf, can be used to map the desired correspondences. scipy.interpolate.pchip can then create a function so that the transformation interpolates smoothly.
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import numpy as np
from scipy.interpolate import pchip # monotonic cubic interpolation
from scipy.stats import norm
# Known observations the curve must pass through.
desired_xy = np.array([(30, 10), (40, 20)])  # (number of days, percentage adoption)
# desired_xy = np.array([(0, 1), (30, 10), (40, 20), (90, 99)])
labels = ['Innovators', 'Early\nAdopters', 'Early\nMajority', 'Late\nMajority', 'Laggards']
xmin, xmax = 0, 90  # minimum and maximum day on the x-axis

px = desired_xy[:, 0]
py = desired_xy[:, 1] / 100  # percentages -> fractions in (0, 1)

# smooth function that transforms the x-values to the corresponding spots to get the desired y-values:
# norm.ppf maps each target fraction to the standard-normal position whose cdf
# equals it, and pchip interpolates monotonically between those positions.
interpfunc = pchip(px, norm.ppf(py))

fig, ax = plt.subplots(figsize=(12, 4))
# ax.scatter(px, py, color='crimson', s=50, zorder=3)  # show desired correspondences
x = np.linspace(xmin, xmax, 1000)
ax.plot(x, norm.cdf(interpfunc(x)), lw=4, color='navy', clip_on=False)

# Split the x range into one equal-width band per label and centre each label
# in its band.
label_divs = np.linspace(xmin, xmax, len(labels) + 1)
label_pos = (label_divs[:-1] + label_divs[1:]) / 2
ax.set_xticks(label_pos)
ax.set_xticklabels(labels, size=18, color='navy')

# Shade the bands with increasing opacity and separate them with white lines.
min_alpha, max_alpha = 0.1, 0.4
for p0, p1, alpha in zip(label_divs[:-1], label_divs[1:], np.linspace(min_alpha, max_alpha, len(labels))):
    ax.axvspan(p0, p1, color='navy', alpha=alpha, zorder=-1)
    ax.axvline(p0, color='white', lw=1, zorder=0)

# Draw the two axes by hand; the default spines are hidden further down.
ax.axhline(0, color='navy', lw=2, clip_on=False)
ax.axvline(0, color='navy', lw=2, clip_on=False)
ax.yaxis.set_major_formatter(PercentFormatter(1))  # a value of 1.0 is shown as 100%
ax.set_xlim(xmin, xmax)
ax.set_ylim(0, 1)
ax.set_ylabel('Total Adoption', size=18, color='navy')
ax.set_title('Adoption Curve', size=24, color='navy')
for s in ax.spines:
    ax.spines[s].set_visible(False)
ax.tick_params(axis='x', length=0)
ax.tick_params(axis='y', labelcolor='navy')
plt.tight_layout()
plt.show()
Using just two points for desired_xy, the curve will be linearly stretched. If more points are given, a smooth transformation will be applied. Here is how it looks with [(0, 1), (30, 10), (40, 20), (90, 99)]. Note that 0 % and 100 % will cause problems, as they lie at minus and plus infinity.

scatter plot does not plot all the points in the array + ternary library

I am using the ternary library in python and trying to plot a scatter of points in the ternary plot.
However it appears the below code only plots one single point for each of the 3 scatter calls made.
Can you please advise
# Question snippet, kept as posted: each scatter call below ends up showing
# only a single point. `ternary`, `np` and `df` are assumed to be defined
# earlier in the asker's session.
scale = 1
figure, tax = ternary.figure(scale=scale)
tax.set_title("Scatter Plot", fontsize=20)
tax.boundary(linewidth=2.0)
tax.gridlines(multiple=5, color="blue")
# Plot a few different styles with a legend
# NOTE(review): np.array([df[col]]) wraps one column in an outer list, so the
# result is a single row holding all the values (see the printed array below)
# rather than one coordinate entry per point.
points = np.array([df['73_prop']])
tax.scatter(points, marker='s', color='red', label="Outflows")
points = np.array([df['72_prop']])
tax.scatter(points, marker='D', color='green', label="HQLA")
points = np.array([df['74_prop']])
tax.scatter(points, marker='o', color='blue', label="Inflows")
tax.legend()
tax.ticks(axis='lbr', linewidth=1, multiple=5)
tax.show()
Here is the plot i get right now
In [213]:points
Out[213]:
array([[ 0.47426346, 0.50559869, 0.50368877, ..., 0.65636812,
0.56024801, 0.49020411]])
P.S. Am trying to mimic what's there on:
https://github.com/marcharper/python-ternary#scatter-plots
I have also tried using certain for loop but hasn't helped.
Your input data has the wrong format. You supply a list of floats, but you need to supply a list of lists of floats: [ [x1,y1] , [x2,y2], ...]
This works:
import ternary
import numpy as np
scale = 1
figure, tax = ternary.figure(scale=scale)
tax.set_title("Scatter Plot", fontsize=20)
tax.boundary(linewidth=2.0)
tax.gridlines(multiple=5, color="blue")

# One scatter series per (marker, colour, legend label).
series_styles = [
    ('s', 'red', "Outflows"),
    ('D', 'green', "HQLA"),
    ('o', 'blue', "Inflows"),
]
for marker, color, label in series_styles:
    rnd = np.random.random(120)
    # The points are passed as a list of coordinate pairs:
    # [[x1,y1], [x2,y2], ..., [xn,yn]]
    points = [[x*x, np.random.random(1) * np.random.random(1) * (1-x)] for x in rnd]
    tax.scatter(points, marker=marker, color=color, label=label)

tax.legend()
tax.ticks(axis='lbr', linewidth=1, multiple=5)
tax.show()

3D vector diagram in python - arrow heads not showing properly

This is the plot I get from the code below:
# Fragment from inside a method (it relies on `self`): draws one horizontal
# arrow per column of self.u / self.v, stacked along the z axis.
aa = np.zeros(len(self.depthrange))
bb = np.zeros(len(self.depthrange))
for i in range(0, self.n):
    # Average each column over the rows where u is not NaN.
    goodin = ~np.isnan(self.u[:, i])
    bb[i] = self.u[goodin, i].mean()
    aa[i] = self.v[goodin, i].mean()

speed = np.sqrt(bb**2 + aa**2)
# Direction in degrees, wrapped from arctan2's [-180, 180] into (0, 360].
dirt = np.arctan2(bb, aa) * 180 / np.pi
dirt[dirt < 360] += 360
dirt[dirt > 360] -= 360
binrange = -np.mean(self.variables.depth) + self.binrange[1, :]
# Rebuild the two horizontal components from speed and direction.
aa = speed * np.cos(dirt * np.pi / 180)
bb = speed * np.sin(dirt * np.pi / 180)

fig = plt.figure()
# Figure.gca() no longer accepts keyword arguments in current Matplotlib;
# add_subplot(projection='3d') creates the 3D axes directly.
ax = fig.add_subplot(projection='3d')
# All arrows start on the z axis at their bin height and have no vertical component.
ax.quiver(0, 0, binrange, aa, bb, np.zeros(len(bb)), pivot='tail', arrow_length_ratio=0.3, length=0.15, color='tomato', lw=2)
ax.text(0, -0.025, -15, 'W', color='k', fontsize=15)
ax.text(-0.025, 0, -15, 'S', color='k', fontsize=15)
ax.text(0.025, 0, -15, 'N', color='k', fontsize=15)
ax.text(0, 0.025, -15, 'E', color='k', fontsize=15)
ax.set_zlim(-15, 0)
ax.set_xlim(-0.25, 0.25)
ax.set_ylim(-0.25, 0.25)
plt.gca().invert_xaxis()
#plt.gca().invert_yaxis()
ax.view_init(elev=18, azim=30)
# NOTE(review): `dist` may be deprecated in recent Matplotlib releases -- confirm.
ax.dist = 8
ax.set_xlabel('m/s')
ax.set_ylabel('m/s')
ax.set_zlabel('Depth (m)')
ax.set_title('Mean Current Vector')
# Grey vertical axis plus a small cross at the height of every arrow.
ax.plot([0, 0], [0, 0], zs=[-15, 0], lw=2, color='grey')
for i in range(0, self.n):
    ax.plot([-0.1*0.25, 0.1*0.25], [0, 0], zs=[binrange[i], binrange[i]], lw=2, color='grey')
    ax.plot([0, 0], [-0.1*0.25, 0.1*0.25], zs=[binrange[i], binrange[i]], lw=2, color='grey')
plt.show()
There are arrow heads present as you can see a difference in the colour of the line however they do not look like arrows, does anyone know how I get the arrows to show properly?
Many thanks,
The arrow heads (in red) are there. I believe this to be a bug in the matplotlib library, but in any case the problem is with the components (U,V,W). Try to rotate them slightly in the axis of the shaft and you should see the heads appear. Here is a minimal example:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
# Figure.gca() no longer accepts keyword arguments in current Matplotlib;
# add_subplot(projection='3d') creates the 3D axes directly.
ax = fig.add_subplot(projection='3d')
# Three arrows starting at different y positions along the same line.
X = (0, 0, 0)
Y = (0, 1, 2)
Z = (0, 0, 0)
# The direction components (U, V, W) differ per arrow.
ax.quiver(X, Y, Z, (0, 2, 1), (1, 1, 1), (1, 3, 1), length=0.05, arrow_length_ratio=0.3)
plt.show()
, the result is this:
Notice how one of the arrows suffers from the same problem as yours. The others are fine. The only difference between them is the components.
I think you should try providing the data along with the code (you're using a class that we can't see, so your plot remains difficult to reproduce), in case you have problems setting up the correct components.

Label python data points on plot

I searched for ages (hours, which feels like ages) to find the answer to a really annoying (seemingly basic) problem, and because I can't find a question that quite fits the answer, I am posting a question and answering it in the hope that it will save someone else the huge amount of time I just spent on my newbie plotting skills.
If you want to label your plot points using python matplotlib
from matplotlib import pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
# Placeholders: substitute your own x and y sequences.
A = anyarray
B = anyotherarray
plt.plot(A,B)
for i,j in zip(A,B):
    # The label is drawn in two parts: the y half is shifted 30 points to the
    # right of the data point, the x half is placed at the data point itself.
    ax.annotate('%s)' %j, xy=(i,j), xytext=(30,0), textcoords='offset points')
    ax.annotate('(%s,' %i, xy=(i,j))
plt.grid()
plt.show()
I know that xytext=(30,0) goes along with textcoords: the values 30,0 position the data label relative to its point, so it sits 0 points up the y axis and 30 points along the x axis, in its own little area.
You need both the lines plotting i and j otherwise you only plot x or y data label.
You get something like this out (note the labels only):
It's not ideal, there is still some overlap - but it's better than nothing, which is what I had.
How about print (x, y) at once.
from matplotlib import pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
A = -0.75, -0.25, 0, 0.25, 0.5, 0.75, 1.0
B = 0.73, 0.97, 1.0, 0.97, 0.88, 0.73, 0.54
ax.plot(A,B)
# Iterate over (x, y) tuples so one annotate call can print both coordinates.
for xy in zip(A, B):                                       # <--
    ax.annotate('(%s, %s)' % xy, xy=xy, textcoords='data') # <--
ax.grid()
plt.show()
I had a similar issue and ended up with this:
For me this has the advantage that data and annotation are not overlapping.
from matplotlib import pyplot as plt
import numpy as np
fig = plt.figure()
ax = fig.add_subplot(111)
A = -0.75, -0.25, 0, 0.25, 0.5, 0.75, 1.0
B = 0.73, 0.97, 1.0, 0.97, 0.88, 0.73, 0.54
plt.plot(A,B)

# annotations at the side (ordered by B values)
x0,x1=ax.get_xlim()
y0,y1=ax.get_ylim()
for ii, ind in enumerate(np.argsort(B)):
    x = A[ind]
    y = B[ind]
    # Labels sit in a column just right of the axes, spread evenly from the
    # bottom to the top of the y range in order of increasing y value.
    xPos = x1 + .02 * (x1 - x0)
    yPos = y0 + ii * (y1 - y0)/(len(B) - 1)
    # Empty-text annotation: only the arrow from the label slot to the data
    # point is drawn here.
    ax.annotate('',#label,
                xy=(x, y), xycoords='data',
                xytext=(xPos, yPos), textcoords='data',
                arrowprops=dict(
                    connectionstyle="arc3,rad=0.",
                    shrinkA=0, shrinkB=10,
                    arrowstyle= '-|>', ls= '-', linewidth=2
                ),
                va='bottom', ha='left', zorder=19
                )
    # The label text is drawn separately, slightly right of the arrow's tail.
    ax.text(xPos + .01 * (x1 - x0), yPos,
            '({:.2f}, {:.2f})'.format(x,y),
            transform=ax.transData, va='center')
plt.grid()
plt.show()
Using the text argument in .annotate ended up with unfavorable text positions.
Drawing lines between a legend and the data points is a mess, as the location of the legend is hard to address.

Categories