How to plot grid of histograms by categorical variable in python? - python

I have a dataset containing 50 numeric variables and 1 categorical variable (segment_hc_print, which has 6 categories). I want to see the spread of each variable in each category by plotting a grid of histograms, where each row represents a category, each column represents a variable, and each cell of the grid is a histogram. I am trying the code below to generate the grid for a single variable:
def grid_histogram(variable, bins):
    """Plot one histogram of ``variable`` per cluster, stacked vertically.

    Builds a 6-row, 1-column grid in which row ``i`` holds the histogram of
    ``sensor_df_print_sample_v2[variable]`` restricted to the rows where
    ``segment_hc_print == i``. The figure is saved as
    ``<variable>_histogram.pdf`` and then displayed.

    Args:
        variable: Column label in ``sensor_df_print_sample_v2`` to plot.
        bins: Forwarded unchanged to ``Axes.hist`` as its bins argument.
    """
    fig = plt.figure(figsize=(20, 10))
    # Resizes the figure to 10 x 10 inches, replacing the figsize given above.
    fig.set_size_inches(10, 10, forward=True)
    fig.suptitle(variable, fontsize=8)
    plt.locator_params(numticks=4)
    for i in np.arange(0, 6, 1):
        ax = plt.subplot(6, 1, i + 1)
        ax.hist(
            sensor_df_print_sample_v2[
                sensor_df_print_sample_v2.segment_hc_print == i
            ][variable],
            bins,
        )
        ax.set_title("cluster = " + str(i), fontsize=5)
        # Keep only three evenly spaced ticks per axis so the small panels
        # stay readable.
        ymin, ymax = ax.get_ylim()
        ax.set_yticks(np.round(np.linspace(ymin, ymax, 3), 2))
        xmin, xmax = ax.get_xlim()
        ax.set_xticks(np.round(np.linspace(xmin, xmax, 3), 2))
        plt.setp(ax.get_xticklabels(), rotation='vertical', fontsize=4)
    fig.tight_layout()
    fig.savefig(str(variable) + '_histogram.pdf')
    plt.show()
And this is what I am getting :
sample histogram
How do I generate a grid of such histograms, each variable stacked to the right of another ?
This code below generates the ideal size of histogram I need.
sample histogram

If I understand correctly, you could just create a grid with plt.subplots(). In the example below, I am plotting the first 5 variables as columns:
nr_of_categories = 6
nr_of_variables = 5
# Labels of the first `nr_of_variables` columns, leaving out the category
# column itself so it is never plotted against itself.
variables = sensor_df_print_sample_v2.columns.drop("segment_hc_print")[:nr_of_variables]
# One row per category, one column per variable. The keyword is `ncols`;
# plt.subplots() has no `cols` argument.
fig, ax = plt.subplots(nrows=nr_of_categories, ncols=nr_of_variables, figsize=(20, 20))
for category in np.arange(0, nr_of_categories):
    # Rows of the data frame that belong to this category.
    subset = sensor_df_print_sample_v2[sensor_df_print_sample_v2.segment_hc_print == category]
    for column, variable in enumerate(variables):
        # `bins` is the same value you pass to grid_histogram() in the question.
        ax[category, column].hist(subset[variable], bins)
        # and then the rest of your code where you replace ax with ax[category, column]

Related

Seaborn Align twinx and x Axis

I am trying to align X axis with its twin but I'm not finding a way to do it.
Here is my code
# Initialize the figure (16 x 10 inches)
plt.figure(figsize=(16, 10))
# Adding a title
plt.title(f'Client Retention Quarters: Monthly Cohorts', fontsize = 14)
# Creating the heatmap: cell values annotated, colour scale pinned to 0..30
sns.heatmap(retention, annot = True,vmin = 0, vmax =30,cmap="flare", fmt='g')
plt.ylabel('Cohort Quarter')
plt.xlabel('')
plt.yticks( rotation='360')
# Second x-axis created with twiny() on the current axes (twiny, not twinx)
ax2 = plt.twiny()
# One tick per entry of x2 at integer positions 0..len(x2)-1, labelled with x2
ax2.set_xticks(range(0,len(x2)))
ax2.set_xticklabels(labels=x2)
# Move the twin's top spine to 10% of the axes height below the plot area
ax2.spines['top'].set_position(('axes', -0.10))
plt.show()
And here is the output:
I want to align the percentages with the x ticks.
Is it possible?
You can use the updated code below; see if this works. Note that I have used random data for retention and x2. Basically, the main change is to get the xlim()s of both axes and then rescale the tick positions (see the function f) so that the ticks align. Finally, use set_major_locator() to fix the tick positions. Hope this is what you are looking for...
import matplotlib.ticker

retention = np.random.rand(10, 12)  # random stand-in data
# Initialize the figure
plt.figure(figsize=(16, 10))
# Adding a title
plt.title('Client Retention Quarters: Monthly Cohorts', fontsize=14)
# Creating the heatmap; keep the returned Axes so the twin can be built from it
ax = sns.heatmap(retention, annot=True, vmin=0, vmax=30, cmap="flare", fmt='g')
plt.ylabel('Cohort Quarter')
plt.xlabel('')
# Numeric rotation: 360 degrees is the same as horizontal text.
plt.yticks(rotation=360)
# NOTE(review): x2 is only referenced by the two commented-out lines below, so
# the twin axis shows its own tick values rather than x2 - confirm whether the
# labels should be set from x2.
x2 = np.around(np.linspace(1, 25, 12), 2)
# Twin axis with its own, independent x-range
ax2 = ax.twiny()
#ax2.set_xticks(range(0,len(x2))) ## Commented as not required
#ax2.set_xticklabels(labels=x2) ## Commented as not required
lim = ax.get_xlim()
lim2 = ax2.get_xlim()


def f(x):
    """Map an x position on ax to the same relative position on ax2."""
    return lim2[0] + (x - lim[0]) / (lim[1] - lim[0]) * (lim2[1] - lim2[0])


# Put the twin's ticks exactly where the heatmap's ticks are.
ticks = f(ax.get_xticks())
ax2.xaxis.set_major_locator(matplotlib.ticker.FixedLocator(ticks))
# Move the twin's top spine to 10% of the axes height below the plot area
ax2.spines['top'].set_position(('axes', -0.10))
plt.show()

plt.subplots does not correctly draw sns.lineplot [duplicate]

This question already has answers here:
What is the difference between drawing plots using plot, axes or figure in matplotlib?
(2 answers)
How to add a title to each subplot
(10 answers)
Closed 11 months ago.
I have the following code:
df = sns.load_dataset('titanic')
# Data: the age column with missing values dropped
data = df[df.age.notna()].age
# Fit a normal distribution to the data:
mu, std = scipy.stats.norm.fit(data)
# bin formulas
# NOTE(review): the count uses len(df), which still includes the rows with a
# missing age that were dropped from `data` - confirm len(data) was not meant.
bin_f = {'sturges' : 1 + math.log(len(df), 2)}
# Plot the histogram.
sns.histplot( data = data, stat='density', bins=int(bin_f['sturges']), alpha=0.6, color='g', kde = True, legend = True)
# Plot the PDF over the x-range of the current axes.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 1000)
p = scipy.stats.norm.pdf(x, mu, std)
sns.lineplot(x = x, y = p, color = 'black', linewidth=2)
# Title reports the fitted parameters rounded to two decimals.
title = f"Fit results: mu = {round(mu, 2)}, std ={round(std, 2)} "
plt.title(title)
Which produces this plot:
When I try to produce it in a subplot, it won't work as expected:
# Two side-by-side axes; everything below is meant for the left one, ax[0].
f, ax = plt.subplots(nrows = 1, ncols = 2, figsize=(15, 8))
# Data: the age column with missing values dropped
data = df[df.age.notna()].age
# Fit a normal distribution to the data:
mu, std = scipy.stats.norm.fit(data)
# bin formulas
bin_f = {'sturges' : 1 + math.log(len(df), 2)}
# Plot the histogram.
sns.histplot(ax = ax[0], data = data, stat='density', bins=int(bin_f['sturges']), alpha=0.4, color='g', kde = True, legend = True)
# Plot the PDF.
# NOTE(review): plt.xlim() reads the limits of pyplot's *current* axes, which
# need not be ax[0]; ax[0].get_xlim() would read the histogram's range - confirm.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 1000)
p = scipy.stats.norm.pdf(x, mu, std)
sns.lineplot(x = x, y = p, color = 'black', linewidth=2, ax=ax[0])
title = f"Fit results: mu = {round(mu, 2)}, std ={round(std, 2)} "
# NOTE(review): plt.title() also targets the current axes; ax[0].set_title(title)
# would title the left subplot explicitly - confirm.
plt.title(title)
For some reason the title appears only on the second plot, and the previously plotted lineplot (the black one) is only a small tick in the second plot rather than a normal curve as in the first image. I am not sure why this is happening, as the only difference is using plt.subplots and referencing ax. Where is my mistake?
My goal is to have the first graph, as seen in the first picture, as the first subplot in the second plot.

Colorbar line plot of multiple lines according to a third variable value

Dataset: I have a series (n = 30) of X (wavelength) and Y (reflectance) data, each associated with a unique value Z (age). Z values are stored as a separate ordered list.
Goal: I am trying to create a series of line plots which display each of the 30 datasets together, where each line is appropriately colored according to its Z value (age). I am hoping for weighted colorization depending on the Z value, and an associated colorbar() or similar.
Attempts: I tried manipulating rcParams to do this by iterating through a color-scheme per plot [i], but the colors are not weighted properly to the Z value. See example figure. I think my issue is similar to this question here.
I feel like this shouldn't be so hard and that I am missing something obvious!
# plot
target_x = nm_names
target_y = data_plot
target_names = ages
N = len(target_y)  # number of objects to plot, i.e. color cycle count
plt.rcParams["figure.figsize"] = [16, 7]  # fig size
# Colors to cycle through: N evenly spaced samples of the colormap, one per
# line in plotting order (choose a colormap such as 'viridis' or 'PiYG').
plt.rcParams["axes.prop_cycle"] = plt.cycler("color", plt.cm.PiYG(np.linspace(0, 1, N)))
fig, ax = plt.subplots()
for i in range(N):
    # One line per object, labelled with its age for the legend.
    ax.plot(target_x, target_y.iloc[i], label=target_names[i])
# axes
# `fontsize` and `size` are aliases of one another; the original passed both
# (10 and 8), so a single value is kept here.
plt.xticks(rotation=70, fontsize=8)
ax.xaxis.set_major_locator(ticker.MultipleLocator(50))
plt.xlabel('Wavelength (nm)', fontsize=14)
plt.yticks(fontsize=12)
plt.ylabel('Normalized Relative Reflectance', fontsize=13)
plt.title("Spectral Profile", size=14)
plt.xlim(375, 2500)
# legend location: shrink the axes to 90% of its height, raised by 10%,
# then anchor the legend to the right of the axes.
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.1,
                 box.width, box.height * .9])
ax.legend(loc='lower left', bbox_to_anchor=(1, 0),
          fancybox=True, shadow=True, ncol=1, title='Age (ky)')
plt.rcdefaults()  # reset global plt parameters, IMPORTANT!
plt.show()
My plot, where 'age' is the 'Z' value

Visualizing the difference between two numeric arrays

I have two numeric arrays of equal length, with one array always having the element value >= to the corresponding (same index) element in the second array.
I am trying to visualize in a single graph:
i) difference between the corresponding elements,
ii) values of the corresponding elements in the two arrays.
I have tried plotting the CDF as below:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Two random columns of 25 values; every arr2 entry exceeds its arr1 partner
# by a random amount between 1 and 10.
arr1 = np.random.uniform(1, 20, [25, 1])
arr2 = arr1 + np.random.uniform(1, 10, [25, 1])
df1 = pd.DataFrame(arr1)
df2 = pd.DataFrame(arr2)
fix, ax = plt.subplots()
# Cumulative KDE curves for both arrays and for their element-wise difference.
curves = (
    (df1[0], 'orange', 'arr1'),
    (df2[0], 'b', 'arr2'),
    (df2[0] - df1[0], 'r', 'difference'),
)
for values, colour, name in curves:
    sns.kdeplot(values, cumulative=True, color=colour, label=name)
plt.show()
which gives the following output:
However, it does not capture the difference, and values of the corresponding elements together. For example, suppose the difference between two elements is 3. The two numbers can be 2 and 5, but they can also be 15 and 18, and this can not be determined from the CDF.
Which kind of plotting can visualize both the difference between the elements and the values of the elements?
I do not wish to line plot as below because not much statistical insights can be derived from the visualization.
# Plain line plots of both arrays and of their element-wise difference.
for series in (df1[0], df2[0], df2[0] - df1[0]):
    ax.plot(series)
There are lots of ways to show the difference between two values. It really depends on your goal for the chart, how quantitative or qualitative you want to be, and whether you want to show the raw data somehow. Here are a few ideas that come to mind that do not involve simple line plots or density functions. I strongly recommend the book Better Data Visualizations by Jonathan Schwabish. He discusses interesting considerations regarding data presentation.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import ticker
def style_func(ax, title):
    """Apply the shared look used by the paired-value charts below.

    Hides the left, top and right spines, sets a bold title, fixes the
    x-range to [-25, 25] with a major tick every 5 units, places y ticks at
    every other row position (1, 3, ..., 25), and thickens the bottom spine
    and the x tick marks.

    Args:
        ax: Matplotlib Axes to style.
        title: Title text for the Axes.
    """
    for side in ('left', 'top', 'right'):
        ax.spines[side].set_visible(False)
    ax.set_title(title, fontweight='bold')
    ax.set_xlim(-25, 25)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(5))
    ax.yaxis.set_major_locator(ticker.FixedLocator(np.arange(1, 26, 2)))
    ax.spines['bottom'].set_linewidth(1.5)
    ax.tick_params(axis='x', length=8, width=1.5)
    ax.tick_params(axis='y', left=False)


# Sample data: 25 pairs in which col2 is always larger than col1.
arr1 = np.random.uniform(1, 20, size=25)
arr2 = arr1 + np.random.uniform(1, 10, size=25)
df = pd.DataFrame({
    'col1': arr1,
    'col2': arr2,
})
df['diff'] = df.col2 - df.col1
df['sum'] = df.col1 + df.col2
# One row position (1..n) per pair of values.
rows = np.arange(1, len(df) + 1)

fig, axes = plt.subplots(ncols=2, nrows=3, figsize=(15, 15))
axes = axes.flatten()

# Pyramid chart: col1 drawn to the left of zero, col2 to the right.
df_sorted = df.sort_values(by='sum', ascending=True)
axes[0].barh(y=rows, width=-df_sorted.col1)
axes[0].barh(y=rows, width=df_sorted.col2)
style_func(axes[0], 'Pyramid Chart')

# Dot Plot: pairs in their original order, joined by a horizontal line.
axes[1].scatter(df.col1, rows, label='col1')
axes[1].scatter(df.col2, rows, label='col2')
axes[1].hlines(
    y=rows,
    xmin=df.col1, xmax=df.col2,
    zorder=0, linewidth=1.5, color='k',
)
axes[1].legend(ncol=2, loc='center', bbox_to_anchor=(0.14, 1.025), edgecolor='w')
style_func(axes[1], 'Dot Plot')
# All values are positive here, so narrow the x-range set by style_func.
axes[1].set_xlim(0, 25)

# Dot Plot 2: same chart, pairs sorted by col1 (then diff), descending.
df_sorted = df.sort_values(by=['col1', 'diff'], ascending=False)
axes[2].scatter(df_sorted.col1, rows, label='col1')
axes[2].scatter(df_sorted.col2, rows, label='col2')
axes[2].hlines(
    y=rows,
    xmin=df_sorted.col1, xmax=df_sorted.col2,
    zorder=0, linewidth=1.5, color='k',
)
axes[2].legend(ncol=2, loc='center', bbox_to_anchor=(0.14, 1.025), edgecolor='w')
style_func(axes[2], 'Dot Plot')
axes[2].set_xlim(0, 25)

# Dot Plot 3: col1 mirrored to the negative side around a vertical zero line.
df_sorted = df.sort_values(by='sum', ascending=True)
axes[3].scatter(-df_sorted.col1, rows, label='col1')
axes[3].scatter(df_sorted.col2, rows, label='col2')
axes[3].vlines(x=0, ymin=-1, ymax=27, linewidth=2.5, color='k')
axes[3].hlines(
    y=rows,
    xmin=-df_sorted.col1, xmax=df_sorted.col2,
    zorder=0, linewidth=2,
)
axes[3].legend(ncol=2, loc='center', bbox_to_anchor=(0.14, 1.025), edgecolor='w')
style_func(axes[3], 'Dot Plot')

# Strip plot: each column on its own horizontal band, mean marked by a bar.
axes[4].scatter(df.col1, [4] * len(df))
axes[4].scatter(df.col2, [6] * len(df))
axes[4].set_ylim(0, 10)
axes[4].vlines(
    x=[df.col1.mean(), df.col2.mean()],
    ymin=[3.5, 5.5], ymax=[4.5, 6.5],
    color='black', linewidth=2,
)
axes[4].yaxis.set_major_locator(ticker.FixedLocator([4, 6]))
axes[4].yaxis.set_major_formatter(ticker.FixedFormatter(['col1', 'col2']))
for side in ('left', 'top', 'right'):
    axes[4].spines[side].set_visible(False)
axes[4].set_title('Strip Plot', fontweight='bold')
axes[4].tick_params(axis='y', left=False)
axes[4].grid(axis='y', dashes=(8, 3), alpha=0.3, color='gray')

# Slope chart: one line per pair, from its col1 value (x=0) to its col2 value (x=1).
for start, end in zip(df.col1, df.col2):
    axes[5].plot([0, 1], [start, end], color='k')
for x_pos, (label, h_align) in enumerate(zip(('col1', 'col2'), ('left', 'right'))):
    axes[5].text(x=x_pos, y=0, s=label,
                 fontsize=14, fontweight='bold', ha=h_align)
axes[5].set_title('Slope chart', fontweight='bold')
axes[5].axis('off')
What about a parallel coordinates plot with plotly? This will allow you to see the distinct values of each original array, and also whether they converge on the same difference.
https://plot.ly/python/parallel-coordinates-plot/

I have a figure with 2 axes, how do I make them have the same scale in y axis in matplotlib?

This is the code. The two axes have two different scales (ax1 has both negative and positive values, while ax2 has only negative values). The graph should be drawn in such a way that ax1 has one line (the positive points) above the x-axis and the other line (with negative x-ticks) below the x-axis. The same goes for ax2.
you can see the resultant graph here:
# Sample data: two series for the left axes (ax1) and two for the right (ax2).
x_ax = list(np.arange(10))
y_ax = list(np.arange(-10, 0))
x_ax1 = list(np.arange(10))
y_ax1 = list(np.arange(0, 10))
x_ax_p = list(np.arange(10))
y_ax_p = list(np.arange(-30, -20))
x_ax1_p = list(np.arange(10))
y_ax1_p = list(np.arange(-40, -30))
fig = plt.figure(figsize=(10, 10))
# Two manually placed axes: ax2 starts where ax1 ends (x = 0.9 in figure
# coordinates).
ax1 = fig.add_axes([0.1, 0.1, 0.8, 0.8])
ax2 = fig.add_axes([0.9, 0.1, 0.8, 0.8])
ax1.spines['left'].set_color('none')
# ax1.spines['right'].set_color('none')
ax1.spines['top'].set_color('none')
# ax1.spines['bottom'].set_position('zero')
# ax2.spines['left'].set_position(('axes', 0))
ax2.spines['right'].set_color('none')
ax2.spines['top'].set_color('none')
# ax2.spines['bottom'].set_position('zero')
#ax1.set_xticks([])
ax1.set_yticks([])
#ax2.set_xticks([])
ax2.set_yticks([])
x_offset = 0.2  # for displaying the values
y_offset = 2
ax1.plot(range(len(x_ax)), y_ax, marker='o', label='Test_store')
ax1.plot(range(len(x_ax1)), y_ax1, marker='o', label='Control_store')
ax2.plot(range(len(x_ax_p)), y_ax_p, marker='o', label='Test_store')
ax2.plot(range(len(x_ax1_p)), y_ax1_p, marker='o', label='Control_store')
# Write each point's value next to its marker, on the axes it was plotted on.
for axis, series in ((ax1, y_ax), (ax1, y_ax1), (ax2, y_ax_p), (ax2, y_ax1_p)):
    for i, j in enumerate(series):
        axis.annotate(str(round(j, 2)), xy=(i, j), xytext=(i, j))
ax1.text(8.5, -15, 'Week', ha='right')
ax1.set_ylabel("Sales")
# `per` (the period name) is defined outside this snippet.
ax1.set_title("Period : " + per + " " + "\n" + "Week vs Sales")
ax2.set_title("Period : " + "Pilot Phase" + "\n" + "Week vs Sales")
ax1.set_xticks(np.arange(len(x_ax)))
ax2.set_xticks(np.arange(len(x_ax)))
ax2.legend()
plt.show()
fig.clear()
To specify the same y-axis limits for both figures, you can use set_ylim:
# Shared y-range for both axes; change these to whatever values you require.
ymin, ymax = 0, 100
for axis in (ax1, ax2):
    axis.set_ylim(ymin, ymax)
You could also use the minimum and maximum of the default y-limits generated by Matplotlib to set the limits dynamically, i.e. without having to specify them manually:
# Take the widest range covered by the two automatically chosen y-limits
# and apply it to both axes.
low1, high1 = ax1.get_ylim()
low2, high2 = ax2.get_ylim()
ymin = min(low1, low2)
ymax = max(high1, high2)
for axis in (ax1, ax2):
    axis.set_ylim(ymin, ymax)
To set the vertical location of the x-axis ticks at zero, you can use the following:
# Draw each x-axis line (bottom spine) at y = 0.
for axis in (ax1, ax2):
    axis.spines['bottom'].set_position('zero')
If you want to use a value other than zero, the above is shorthand for:
# Explicit form: place the bottom spine at the given y value in data coordinates.
for axis in (ax1, ax2):
    axis.spines['bottom'].set_position(("data", 0))

Categories