Render y-axis properly when overlaying pandas KDE and histogram - python

Similar questions to this have been asked before but not using these exact two plotting functions together so here we are:
I have a column from a pandas DataFrame that I am plotting both a histogram and the KDE. However, when I plot them, the y-axis is using the raw data value range instead of discrete number of samples/bin (what I want). How can I fix this? The actual plot is perfect, but the y-axis is wrong.
Data:
t2 = [140547476703.0, 113395471484.0, 158360225172.0, 105497674121.0, 186457736557.0, 153705359063.0, 36826568371.0, 200653068740.0, 190761317478.0, 126529980843.0, 98776029557.0, 132773701862.0, 14780432449.0, 167507656251.0, 121353262386.0, 136377019007.0, 134190768743.0, 218619462126.0, 07912778721.0, 215628911255.0, 147024833865.0, 94136343562.0, 135685803096.0, 165901502129.0, 45476074790.0, 125195690010.0, 113910844263.0, 123134290987.0, 112028565305.0, 93448218430.0, 07341012378.0, 93146854494.0, 132958913610.0, 102326700019.0, 196826471714.0, 122045354980.0, 76591131961.0, 134694468251.0, 120212625727.0, 108456858852.0, 106363042112.0, 193367024628.0, 39578667378.0, 178075400604.0, 155513974664.0, 132834624567.0, 137336282646.0, 125379267464.0]
Code:
fig = plt.figure()
# plot hist + kde
t2[t2.columns[0]].plot.kde(color = "maroon", label = "_nolegend_")
t2[t2.columns[0]].plot.hist(density = True, edgecolor = "grey", color = "tomato", title = t2.columns[0])
# plot mean/stdev
m = t2[t2.columns[0]].mean()
stdev = t2[t2.columns[0]].std()
plt.axvline(m, color = "black", ymax = 0.05, label = "mean")
plt.axvline(m-2*stdev, color = "black", ymax = 0.05, linestyle = ":", label = "+/- 2*Stdev")
plt.axvline(m+2*stdev, color = "black", ymax = 0.05, linestyle = ":")
plt.legend()
What it looks like now:

If you want the real counts, the you'll need to scale the KDE up by the width of the bins multiplied by the number of observations. The trickiest part is accessing the data pandas uses to plot the KDE. (I've removed parts related to the legend to simplify the problem at hand).
import matplotlib.pyplot as plt
import numpy as np
# Calculate KDE, get data
axis = t2[t2.columns[0]].plot.kde(color = "maroon", label = "_nolegend_")
xdata = axis.get_children()[0]._x
ydata = axis.get_children()[0]._y
plt.clf()
# Real figure
fig, ax = plt.subplots(figsize=(7,5))
# Plot Histogram, no density.
x = ax.hist(t2[t2.columns[0]], edgecolor = "grey", color = "tomato")
# size of the bins * N obs
scale = np.diff(x[1])[0]*len(t2)
# Plot scaled KDE
ax.plot(xdata, ydata*scale, color='blue')
ax.set_ylabel('N observations')
plt.show()

Related

How nonlinearly re-normalize Matplotlib colorbar?

I am trying to obtain a 2D pseudocolor plot with a nonlinear color map (cmap). Independently I want to have a colobar that uses a similar cmap but differently scaled/stretched to avoid overlapping of the colorbar yticks.
The first one I can obtain using some nonlinear norm as an argument of pcolormesh.
But how to get the second part in an efficient way?
Finally, I was able to obtain the desired effect (see the bottom right corner in the below figure) but I am pretty sure that this is not the best/easiest/desired/Pythonic way of doing it.
Is there an easier way of obtaining such an effect?
Figure:
Here is the code that reproduces the above figure:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cbook as cbook
from matplotlib import cm
import copy
def transform_cmap(cmap_basic,
new_cmap_name = None,
cmap_trans_fun = None,
how_many_levels = 256,
ticks = None,
ticks_trans_fun = None):
'''returns new cmap and new ticks locators for transformed ticks.
If ticks_trans_fun is None then ticks locators are linearly transformed
to the [0,1] interval, as a result, cmap is stretched, but when used with
colorbar than linearly spaced ticks are linearly spaced along the colorbar'''
# be sure that cmap is really a cmap
if not isinstance(cmap_basic,colors.Colormap):
try:
cmap_basic = cm.get_cmap(name=cmap_basic)
except:
print('basic_cmap is not a valid cmap or cmap name!')
if cmap_trans_fun is None:
cmap_trans_fun = colors.Normalize()
if new_cmap_name is None:
new_cmap_name = cmap_basic.name+'_new'
c_coords_linear = np.linspace(0,1, how_many_levels)
cmap_trans_fun_copy = copy.deepcopy(cmap_trans_fun)
# deppcopy to avoid overwritting the vmin, vmax values
cmap_trans_fun_copy.autoscale([0,1])
c_coords_after = cmap_trans_fun_copy(c_coords_linear)
c_list_after = cmap_basic(c_coords_after)
new_cmap = colors.LinearSegmentedColormap.from_list(new_cmap_name,
c_list_after,
N=how_many_levels)
if ticks_trans_fun is None:
ticks_trans_fun = colors.Normalize()
ticks_trans_fun_copy = copy.deepcopy(ticks_trans_fun)
ticks_trans_fun_copy.vmin = cmap_trans_fun.vmin
ticks_trans_fun_copy.vmax = cmap_trans_fun.vmax
new_ticks_locators = ticks_trans_fun_copy(ticks)
return new_cmap, new_ticks_locators
###########################################
# Prepare some data
# based on https://matplotlib.org/stable/gallery/userdemo/colormap_normalizations.html#sphx-glr-gallery-userdemo-colormap-normalizations-py
N = 100
X, Y = np.mgrid[-3:3:complex(0, N), -2:2:complex(0, N)]
# A low hump with a spike coming out of the top right. Needs to have
# z/colour axis on a log scale so we see both hump and spike. linear
# scale only shows the spike.
Z1 = np.exp(-X**2 - Y**2)
Z1b = 2*np.exp(-10*(X-0.5)**2 - 20*(Y-0.5)**2)
Z2 = np.exp(-(X * 10)**2 - (Y * 10)**2)
Z = Z1 + 50 * Z2+ Z1b
Z = Z*10
# data prepared!
###########################################
cbar_ticks = [0,1,10,50,100,200,300,400]
# prepare basic 'linear' cmap from matplotlib avaliable cmap
cmap = 'inferno'
lin_cmap = cm.get_cmap(cmap)
############################################
# prepare nonlinear norm (good for data viz)
gamma = 0.1
nonlin_norm = colors.PowerNorm(gamma=gamma)
nonlin_norm.autoscale(Z)
# prepare nonlinear norm b (better for colorbar)
gamma_b = 0.40
nonlin_norm_b = colors.PowerNorm(gamma=gamma_b)
nonlin_norm.autoscale(Z)
################# PLOT ####################
# create 4 plots in 2x2 grid
fig, axs = plt.subplots(nrows = 2,
ncols = 2,
figsize=(6,5.5),
dpi=108,
squeeze = False)
fs = 8 # the same plot title fontsize
LLL_ax = axs[0,0]
LNL_ax = axs[0,1]
LNN_ax = axs[1,0]
LNNb_ax = axs[1,1]
#------------- Top left plot --------------
LLL_ax.set_title('linear cmap\nlinear norm\nlinear cbar',
fontsize = fs)
LLL_pcm = LLL_ax.pcolormesh(X, Y, Z,
cmap = lin_cmap,
shading='auto')
# colorbar takes LLL_pcm object to figure out colormap and scale
fig.colorbar(LLL_pcm,
ax=LLL_ax,
extend='both',
ticks=cbar_ticks)
#------------- Top right plot -------------
# an easy way of obtaining good color-scaling
# the colorbar shows cmap in a linear way
# the cbar yticks are nonlinearly scaled but
# they are overlapping
LNL_ax.set_title('linear cmap\nnonlinear norm\nlinear cbar cmap (nonlinear ticks)',
fontsize = fs)
nonlin_norm.autoscale(Z)
LNL_pcm = LNL_ax.pcolormesh(X, Y, Z,
cmap = lin_cmap,
norm = nonlin_norm,
shading='auto')
fig.colorbar(LNL_pcm,
ax=LNL_ax,
extend='both',
ticks=cbar_ticks)
#------------- Bottom left plot -----------
# the colorbar cmap is nonlinear
# the cbar yticks are linearly scaled but
# the overall effect is not good
# the cbar yticks are overlapping again
LNN_ax.set_title('linear cmap\nnonlinear norm\nnonlinear cbar cmap (linear ticks)',
fontsize = fs)
LNN_pcm = LNN_ax.pcolormesh(X, Y, Z,
cmap = lin_cmap,
norm = nonlin_norm,
shading='auto')
# create new, nonlinear cmap
nonlin_cmap, new_ticks_coords = transform_cmap(cmap_basic = lin_cmap ,
cmap_trans_fun = nonlin_norm,
how_many_levels = 256,
ticks = cbar_ticks,
ticks_trans_fun = None,
new_cmap_name = 'nonlinear_cmap')
# create object based on new cmap for colorbar
scalar_mappable = cm.ScalarMappable(cmap=nonlin_cmap)
LNN_cbar = fig.colorbar(scalar_mappable,
ax=LNN_ax,
extend='both',
ticks=new_ticks_coords)
# ticks are in correct places but they are normalized to [0,1] interval
# we need to overwrite them with desired labels
LNN_cbar.ax.set_yticklabels(cbar_ticks)
#------------- Bottom right plot ----------
# the colorbar shows cmap in a nonlinear way
# this is different nonlinear scaling than before (nonlin_norm_b)
# the cbar yticks are also nonlinearly scaled
# this is A GOOD LOOKING PLOT
LNNb_ax.set_title('linear cmap\nnonlinear norm\n2nd nonlinear cbar cmap (nonlinear ticks)',
fontsize = fs)
LNNb_pcm = LNNb_ax.pcolormesh(X, Y, Z,
cmap = lin_cmap,
norm = nonlin_norm,
shading='auto')
# this time as the cbar cmap is with different norm than data cmap
# we also need to recalculate positions of the cbar ticks using second norm
nonlin_cmap_b, new_ticks_coords_b = transform_cmap(cmap_basic = lin_cmap ,
cmap_trans_fun = nonlin_norm_b,
how_many_levels = 256,
ticks = cbar_ticks,
ticks_trans_fun = nonlin_norm_b,
new_cmap_name = 'nonlinear_cmap_v2')
scalar_mappable_b = cm.ScalarMappable(cmap=nonlin_cmap_b)
LNNb_cbar = fig.colorbar(scalar_mappable_b,
ax=LNNb_ax,
extend='both',
ticks=new_ticks_coords_b)
LNNb_cbar.ax.set_yticklabels(cbar_ticks)
#------------------------------
fig.tight_layout()
plt.show()
I was using this answer as a base:
Uniform tick labels for non-linear colorbar in Matplotlib
These answers may be useful but were looking too complicated:
Arbirtrary non-linear colorbar using Matplotlib
nonlinear colormap, matplotlib
I have a feeling that wiser usage of norm parameter in pcolor and perhaps in cbar should give me the desired result. Unfortunately, I was not able to obtain it in this way.

How to I fill the central 95% confidence interval of a matplotlib histogram?

I am able to make a matplotlib histogram no problem. However, I'm wondering if it's possible to use something like fillbetween to change the fill color of the central 95% CI of my data.
I can only get fillbetween to work when if I use a trick with a numpy histogram and bincenters. i.e.:
bins = np.linspace(-a.max(),a.max(),400)
hist = np.histogram(a,bins = bins)[0]
bincenters = 0.5*(bins[1:] + bins[:-1])
b = plt.plot(bincenters,hist, linestyle = 'None')
plt.fill_between(bincenters,hist, color = '#7f7f7f')
plt.fill_between(bincenters, hist, interpolate=False,
where=((bincenters>=lower_p) & (bincenters<=upper_p)), hatch = '...', facecolor = '#7f7f7f')```
Here's my existing code that I'd rather use to create the matplotlib histogram (which I think looks better) with some extras plotting on top:
#Create Histogram
axes[1] = boota.plot.hist(ax = axes[1],bins = 50, legend = None, histtype = 'bar', color = '#7f7f7f')
axes[1].set_xlabel('Spatial Decay Rate (α)', size = 16, fontweight = 'bold')
axes[1].set_ylabel('Frequency', labelpad = 11, size = 16, fontweight = 'bold')
#Ticklabels
axes[0].tick_params(labelsize = 14)
axes[1].tick_params(labelsize = 14)
#draw vertical line at remote powerlaw (rem_a)
rem_a = 0.649
axes[1].axvline(x=rem_a, color='k', linestyle='dashed', linewidth=1.5, label='remote decay \nrate $α_r$ = 0.649')
legend = axes[1].legend(ncol = 1, loc = 'upper left', fontsize='large')
legend.draw_frame(False)
at2 = AnchoredText("B",prop=dict(size=20), loc='upper right',frameon=False)
axes[1].add_artist(at2)
Check out fill_betweenx which I think is better fit here
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
arr = np.random.normal(size=500)
ci = norm(*norm.fit(arr)).interval(0.95) # fit a normal distribution and get 95% c.i.
height, bins, patches = plt.hist(arr, alpha=0.3)
plt.fill_betweenx([0, height.max()], ci[0], ci[1], color='g', alpha=0.1) # Mark between 0 and the highest bar in the histogram

Polar plot - Put one grid line in bold

I am trying to make use the polar plot projection to make a radar chart. I would like to know how to put only one grid line in bold (while the others should remain standard).
For my specific case, I would like to highlight the gridline associated to the ytick "0".
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
#Variables
sespi = pd.read_csv("country_progress.csv")
labels = sespi.country
progress = sespi.progress
angles=np.linspace(0, 2*np.pi, len(labels), endpoint=False)
#Concatenation to close the plots
progress=np.concatenate((progress,[progress[0]]))
angles=np.concatenate((angles,[angles[0]]))
#Polar plot
fig=plt.figure()
ax = fig.add_subplot(111, polar=True)
ax.plot(angles, progress, '.--', linewidth=1, c="g")
#ax.fill(angles, progress, alpha=0.25)
ax.set_thetagrids(angles * 180/np.pi, labels)
ax.set_yticklabels([-200,-150,-100,-50,0,50,100,150,200])
#ax.set_title()
ax.grid(True)
plt.show()
The gridlines of a plot are Line2D objects. Therefore you can't make it bold. What you can do (as shown, in part, in the other answer) is to increase the linewidth and change the colour but rather than plot a new line you can do this to the specified gridline.
You first need to find the index of the y tick labels which you want to change:
y_tick_labels = [-100,-10,0,10]
ind = y_tick_labels.index(0) # find index of value 0
You can then get a list of the gridlines using gridlines = ax.yaxis.get_gridlines(). Then use the index you found previously on this list to change the properties of the correct gridline.
Using the example from the gallery as a basis, a full example is shown below:
r = np.arange(0, 2, 0.01)
theta = 2 * np.pi * r
ax = plt.subplot(111, projection='polar')
ax.set_rmax(2)
ax.set_rticks([0.5, 1, 1.5, 2]) # less radial ticks
ax.set_rlabel_position(-22.5) # get radial labels away from plotted line
ax.grid(True)
y_tick_labels = [-100, -10, 0, 10]
ax.set_yticklabels(y_tick_labels)
ind = y_tick_labels.index(0) # find index of value 0
gridlines = ax.yaxis.get_gridlines()
gridlines[ind].set_color("k")
gridlines[ind].set_linewidth(2.5)
plt.show()
Which gives:
It is just a trick, but I guess you could just plot a circle and change its linewidth and color to whatever could be bold for you.
For example:
import matplotlib.pyplot as plt
import numpy as np
Yline = 0
Npoints = 300
angles = np.linspace(0,360,Npoints)*np.pi/180
line = 0*angles + Yline
ax = plt.subplot(111, projection='polar')
plt.plot(angles, line, color = 'k', linewidth = 3)
plt.ylim([-1,1])
plt.grid(True)
plt.show()
In this piece of code, I plot a line using plt.plot between any point of the two vectors angles and line. The former is actually all the angles between 0 and 2*np.pi. The latter is constant, and equal to the 'height' you want to plot that line Yline.
I suggest you try to decrease and increase Npoints while having a look to the documentaion of np.linspace() in order to understand your problem with the roundness of the circle.

Loop to create subplot /Python

i have a little problem to create a subplot loop.
The following code show my result for one plot.... So it starts with a dayloop than with a hour loop (8 timesteps).
If i run the code i get a nice QUiver plot with the colorbar.
for dd in range(1,15):
day=str(dd)
readfile=fns[files_indizes[dd]]
if dd < 10:
nc_u_comp = NetCDFFile(ROOT+u_comp1+'0'+day+comp)
nc_v_comp = NetCDFFile(ROOT+v_comp1+'0'+day+comp)
else:
nc_u_comp = NetCDFFile(ROOT+u_comp1+day+comp)
nc_v_comp = NetCDFFile(ROOT+v_comp1+day+comp)
time = nc_u_comp.variables['time'][:]
index=readfile.find(comp)
index=index+len(comp)
date=readfile[index-14:index-6]
plt.clf()
for tt in range(0,len(time)):
if tt < 10:
h =str(0)+str(tt)
else:
h=str(tt)
varU=nc_u_comp.variables['u10'][tt,:,:]
varV=nc_v_comp.variables['v10'][tt,:,:]
lat = nc_u_comp.variables['latitude'][:]
lon = nc_u_comp.variables['longitude'][:]
plt.rcParams["figure.figsize"] = [10,10]
#plane projection of the world
#map with box size (defintion on the top)
box = sgeom.box(minx=llcrnrlon, maxx=urcrnrlon, miny=llcrnrlat, maxy=urcrnrlat)
x0, y0, x1, y1 = box.bounds
#Map plot. The middel of the map is central_longitude
#proj = ccrs.PlateCarree(central_longitude=0)
proj=ccrs.PlateCarree()
#Change middelpoint of the map
box_proj = ccrs.PlateCarree(central_longitude=0)
ax2 = plt.axes(projection=proj)
ax2.set_extent([x0, x1, y0, y1], box_proj)
ax2.add_feature(cartopy.feature.BORDERS, linestyle='-', alpha=.5)
ax2.coastlines(resolution='50m')
#Definition of the scale_bar
gl = ax2.gridlines(ccrs.PlateCarree(), \
linestyle='--', alpha=1, linewidth=0.5, draw_labels=True)
gl.xlabels_top = False
gl.ylabels_right = False
gl.xformatter = LONGITUDE_FORMATTER
gl.yformatter = LATITUDE_FORMATTER
magnitude = (varU ** 2 + varV ** 2) ** 0.5
strm =plt.streamplot(lon , lat , varU, varV, linewidth=2, density=2, color=magnitude)
cbar= plt.colorbar()
cbar.set_label('$m/s$')
name='Wind in 10 m '+ date + h+' UTC'
ax2.set_aspect('auto')
plt.title(name, y=1)
Now i want to create an 2x4 Subplot array with a colorbar allocate to the complete Subplot array.
I find some infromation in the internet, but it doesn't run with my code. Maybe someone can help me?
This shows how to plot an array of simple Cartopy maps in 4 rows 2 columns. Also shows how to plot a colorbar to accompany the maps array. Hope it helps.
import numpy as np
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import matplotlib as mpl
# create figure with figsize big enough to accomodate all maps, labels, etc.
fig = plt.figure(figsize=(8, 10), tight_layout=False)
# define plot array's arrangement
columns = 2
rows = 4
# set projection to use
projex = ccrs.PlateCarree()
# set the colormap and norm for
# the colorbar to use
cmap1 = mpl.cm.magma
norm1 = mpl.colors.Normalize(vmin=0, vmax=100)
def plotmymap(axs):
# your plot specs of each map should replace this
img = np.random.randint(100, size=(15, 30)) # 2d array of random values (1-100)
# render image on current axis
plims = plt.imshow(img, extent=[-180,180,-90,90], alpha=0.5, cmap=cmap1, norm=norm1)
axs.set_global()
axs.coastlines()
# add title to the map
axs.set_title("Map_"+str(i))
return plims # for use by colorbar
for i in range(1, columns*rows +1):
# add a subplot into the array of plots
ax = fig.add_subplot(rows, columns, i, projection=projex)
plims = plotmymap(ax) # a simple maps is created on subplot
# add a subplot for vertical colorbar
bottom, top = 0.1, 0.9
left, right = 0.1, 0.8
fig.subplots_adjust(top=top, bottom=bottom, left=left, right=right, hspace=0.15, wspace=0.25)
cbar_ax = fig.add_axes([0.85, bottom, 0.05, top-bottom])
fig.colorbar(plims, cax=cbar_ax) # plot colorbar
plt.show() # this plot all the maps
The resulting plots:

Seaborn distplot y-axis normalisation wrong ticklabels

Just to note, I have already checked this question and this question.
So, I'm using distplot to draw some histograms on separate subplots:
import numpy as np
#import netCDF4 as nc # used to get p0_dict
import matplotlib.pyplot as plt
from collections import OrderedDict
import seaborn.apionly as sns
import cPickle as pickle
'''
LINK TO PICKLE
https://drive.google.com/file/d/0B8Xks3meeDq0aTFYcTZEZGFFVk0/view?usp=sharing
'''
p0_dict = pickle.load(open('/path/to/pickle/test.dat', 'r'))
fig = plt.figure(figsize = (15,10))
ax = plt.gca()
j=1
for region, val in p0_dict.iteritems():
val = np.asarray(val)
subax = plt.subplot(5,5,j)
print region
try:
sns.distplot(val, bins=11, hist=True, kde=True, rug=True,
ax = subax, color = 'k', norm_hist=True)
except Exception as Ex:
print Ex
subax.set_title(region)
subax.set_xlim(0, 1) # the data varies from 0 to 1
j+=1
plt.subplots_adjust(left = 0.06, right = 0.99, bottom = 0.07,
top = 0.92, wspace = 0.14, hspace = 0.6)
fig.text(0.5, 0.02, r'$ P(W) = 0,1 $', ha ='center', fontsize = 15)
fig.text(0.02, 0.5, '% occurrence', ha ='center',
rotation='vertical', fontsize = 15)
# obviously I'd multiply the fractional ticklabels by 100 to get
# the percentage...
plt.show()
What I expect is for the area under the KDE curve to sum to 1, and for the y axis ticklabels to reflect this. However, I get the following:
As you can see, the y axis ticklabels are not in the range [0,1], as would be expected. Turning on/off norm_hist or kde does not change this. For reference, the output with both turned off:
Just to verify:
aus = np.asarray(p0_dict['AUS'])
aus_bins = np.histogram(aus, bins=11)[0]
plt.subplot(121)
plt.hist(aus,11)
plt.subplot(122)
plt.bar(range(0,11),aus_bins.astype(np.float)/np.sum(aus_bins))
plt.show()
The y ticklabels in this case properly reflect those of a normalised histogram.
What am I doing wrong?
Thank you for your help.
The y axis is a density, not a probability. I think you are expecting the normalized histogram to show a probability mass function, where the sum the bar heights equals 1. But that's wrong; the normalization ensures that the sum of the bar heights times the bar widths equals 1. This is what ensures that the normalized histogram is comparable to the kernel density estimate, which is normalized so that the area under the curve is equal to 1.

Categories