Seaborn grouped boxplot symbol on each boxplot

Seaborn grouped boxplot symbol on each boxplot - python

I made a grouped boxplot with seaborn. I have two subplots that describe different types of data and in order to also compare the types (I want to keep the groups as they are), I'd like to plot the median of the data frame for type 2 on the boxplot for type 1 and vice versa. This is my script
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import netCDF4 as nc
sns.set_theme(style='ticks', palette='pastel')
fig = plt.figure(figsize=(15,5))
fig.subplots_adjust(hspace=0.12)
fig.subplots_adjust(wspace=0.15)
fig.subplots_adjust(right=0.98)
fig.subplots_adjust(left=0.12)
fig.subplots_adjust(bottom=0.1)
fig.subplots_adjust(top=0.98)
plt.rcParams['text.usetex'] = False
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['font.size'] = 11
plt.rcParams['legend.fontsize'] = 12
plt.rcParams['xtick.labelsize'] = 11
plt.rcParams['ytick.labelsize'] = 11
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)
def grouped_boxplot(axis_type1, axis_type2):
methods = ['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7']
df_model1_type1 = pd.DataFrame()
df_model1_type2 = pd.DataFrame()
df_model2_type1 = pd.DataFrame()
df_model2_type2 = pd.DataFrame()
df_model3_type1 = pd.DataFrame()
df_model3_type2 = pd.DataFrame()
df_model4_type1 = pd.DataFrame()
df_model4_type2 = pd.DataFrame()
for m in methods:
df_model1_type1[m] = np.random.randint(1,101,10)
df_model1_type2[m] = np.random.randint(1,101,10)
for m in methods:
df_model2_type1[m] = np.random.randint(1,101,10)
df_model2_type2[m] = np.random.randint(1,101,10)
for m in methods:
df_model3_type1[m] = np.random.randint(1,101,10)
df_model3_type2[m] = np.random.randint(1,101,10)
for m in methods:
df_model4_type1[m] = np.random.randint(1,101,10)
df_model4_type2[m] = np.random.randint(1,101,10)
df_model1_type1 = df_model1_type1.assign(Model='model1')
df_model1_type2 = df_model1_type2.assign(Model='model1')
df_model2_type1 = df_model2_type1.assign(Model='model2')
df_model2_type2 = df_model2_type2.assign(Model='model2')
df_model3_type1 = df_model3_type1.assign(Model='model3')
df_model3_type2 = df_model3_type2.assign(Model='model3')
df_model4_type1 = df_model4_type1.assign(Model='model4')
df_model4_type2 = df_model4_type2.assign(Model='model4')
df_type1 = pd.concat([df_model1_type1,df_model2_type1,df_model3_type1,
df_model4_type1])
df_type2 = pd.concat([df_model1_type2,df_model2_type2,df_model3_type2,
df_model4_type2])
df_type1_long = pd.melt(df_type1, 'Model', var_name='Method',
value_name='var')
df_type2_long = pd.melt(df_type2, 'Model', var_name='Method',
value_name='var')
axis_type1 = sns.boxplot(x='Model', hue='Method', y='var',
data=df_type1_long, showfliers=False, whis=0,
ax=axis_type1)
axis_type2 = sns.boxplot(x='Model', hue='Method', y='var', data=df_type2_long,
showfliers=False, whis=0, ax=axis_type2)
type1_median = df_type1.median().to_numpy()
type2_median = df_type2.median().to_numpy()
for xtick, ytick in zip(axis_type1.get_xticks(), type2_median):
axis_type1.scatter(xtick, ytick, s=20, marker='*', color='red')
for xtick, ytick in zip(axis_type2.get_xticks(), type1_median):
axis_type2.scatter(xtick, ytick, s=20, marker='*', color='red')
axis_type1.legend([],[], frameon=False)
axis_type2.legend(loc='lower center', bbox_to_anchor=(-0.2,-0.25), ncol=7)
grouped_boxplot(ax1, ax2)
plt.show()
# plt.savefig('the_ultimate_boxplot.pdf')
I managed to plot the median on to the boxplot that is right on the xtick.
Is there a way so I can have a symbol for the median of m1 (blue boxplot) for model 1 for type 2 on m1 (blue boxplot) for model 1 for type 1, the median for m2 (orange boxplot) for model 1 for type 2 on m2 (orange boxplot) for model 1 for type 1 [...]?
Thanks for your help!

sns.pointplot can be used to calculate and position the medians.
The example code uses following parameters for pointplot:
dodge=.8 - .8 / len(methods): dodge separates out the points per hue. The default dodge width is different for point plots as for box plots. See this github issue.
linestyles='': don't draw lines between the points
markers='D': use a diamond marker
color='black': the color for the marker (default the color would come from the hue
estimator=np.median: calculate the median of the y-values; note that these are on the same spot as the central line of the box plots
ci=None: don't show a confidence interval
The legend has been changed to remove the entries from the pointplot. The x-position of bbox_to_anchor is set to half of wspace in an attempt to center the legend between the two subplots.
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
sns.set_theme(style='ticks', palette='pastel')
fig = plt.figure(figsize=(15, 5))
fig.subplots_adjust(wspace=0.15, right=0.98, left=0.04, bottom=0.14, top=0.98)
axis_type1 = fig.add_subplot(1, 2, 1)
axis_type2 = fig.add_subplot(1, 2, 2)
methods = ['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7']
models = ['model1', 'model2', 'model3', 'model4']
df_type1_long = pd.DataFrame({'Model': np.random.choice(models, 500),
'Method': np.random.choice(methods, 500),
'var': np.random.randint(1, 101, 500)})
df_type2_long = pd.DataFrame({'Model': np.random.choice(models, 800),
'Method': np.random.choice(methods, 800),
'var': np.random.randint(1, 101, 800)})
for df_long, ax in zip([df_type1_long, df_type2_long], [axis_type1, axis_type2]):
sns.boxplot(x='Model', hue='Method', y='var', data=df_long,
showfliers=False, whis=0, ax=ax)
sns.pointplot(x='Model', hue='Method', y='var', dodge=.8 - .8 / len(methods),
linestyles='', markers='D', color='black', estimator=np.median, ci=None,
data=df_long, ax=ax)
# sns.pointplot(x='Model', hue='Method', y='var', dodge=.8 - .8 / len(methods),
# linestyles='', markers='v', color='black', estimator=np.min, ci=None,
# data=df_long, ax=ax)
axis_type1.set_xlabel('')
axis_type2.set_xlabel('')
axis_type1.legend_.remove()
axis_type2.legend(handles=axis_type2.legend_.legendHandles[:len(methods)],
loc='upper center', bbox_to_anchor=(-0.075, -0.06), ncol=len(methods))
plt.show()

Related

How box plot 'all' axvspan from time series with right code?

I would also like to indicate the "box plot all" for the monthly box plots too (for years box plot works) , but I can't: function axvspan() for axes[1].
Also, I can't edit (for example) xtick and ytick correctly and the image is blurry. What do you think of this code of mine? How to improve it?
The code:
import os
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import seaborn as sns
import calendar
df_air = pd.read_csv('https://raw.githubusercontent.com/AileenNielsen/TimeSeriesAnalysisWithPython/master/data/AirPassengers.csv',
parse_dates=['Month'], date_parser=lambda x: pd.to_datetime(x, format='%Y-%m',
errors = 'coerce'))
# data preparation
df_air['year'] = [d.year for d in df_air.Month]
df_air['month'] = [d.strftime('%b') for d in df_air.Month]
years = df_air['year'].unique()
# plot drawing
fig, axes = plt.subplots(2, 1, figsize=(20,22), dpi= 120)
plt.subplots_adjust(hspace=0.25) # problem with 'hspace' because doesn't work
axes[0] = sns.boxplot(x='year', y='#Passengers', data=pd.concat([df_air_all, df_air]),
ax=axes[0], showmeans=True, meanprops={"marker":"d"})
axes[1] = sns.boxplot(x='month', y='#Passengers', data=df_air.loc[~df_air.year.isin([1949, 1961]), :], ax=axes[1], showmeans=True,
meanprops={"marker":"d"})
# Set Title
axes[0].set_title('Yearly Boxplot', fontsize=20, fontweight="bold", pad=20)
axes[1].set_title('Monthly Boxplot', fontsize=20, fontweight="bold", pad=20)
axes[0].set_ylabel("Passengers")
axes[1].set_ylabel("Passengers")
axes[0].axvspan(-0.5, 0.5, color='0.85', zorder=-1)
sns.set_style('darkgrid')
palette = ['#9400D3'] + sns.color_palette('viridis', len(df_air['year'].unique()))
axes[0].get_xticklabels()[0].set_weight('bold')
axes[0].set_xlabel('')
axes[1].set_xlabel('')
axes[0].grid(color = 'green', linestyle = '--', linewidth = 0.5)
axes[1].grid(color = 'green', linestyle = '--', linewidth = 0.5)
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.titlesize'] = 18
plt.rc('xtick',labelsize=16)
plt.rc('ytick',labelsize=16)
plt.rcParams["figure.autolayout"] = True
plt.rcParams["axes.edgecolor"] = "black"
plt.rcParams["axes.linewidth"] = 1
plt.show()

Line chart to surface chart

[UPDATE: Sorry for not providing the piece where the author of the codes create example data. I have updated the codes]
I found an example of a 3D mesh line chart that satisfied what I need (colouring change with level on z dimension). However, instead of line, I want surface plot. How can I change the codes to have the 3d surface plot?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import animation, rc
from matplotlib.cm import get_cmap
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.font_manager import FontProperties
from matplotlib.collections import LineCollection
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d.art3d import Line3DCollection
index_returns = np.random.normal(loc=1e-4, scale=5e-3, size=(783, 9))
index_returns = np.vstack((np.zeros(shape=(1, 9)) + 100, index_returns))
index_prices = np.cumprod(1 + index_returns, axis=0)
window = 261
df = np.zeros(shape=(index_prices.shape[0]-window, 9))
for i in range(window, index_prices.shape[0], 1):
df[i-window] = (index_prices[i]/index_prices[i-window]) - 1
index = pd.date_range('2019-01-01', periods=index_prices.shape[0]-window, freq='B')
columns = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
df = pd.DataFrame(df, index=index, columns=columns)
# create the figure
fig = plt.figure(figsize=(14.4, 9))
ax = fig.add_subplot(111, projection='3d')
fig.patch.set_alpha(1)
# get the cmap to use
cmap = get_cmap('RdYlGn')
# get the slice based on data frame
current_slice = df.values[:261, :]
index_names = df.columns
index_dates = df.index
# list holding the lines
lines = []
# for each index...
for i in range(current_slice.shape[1]):
# get the coordinates
x = np.array(np.arange(current_slice.shape[0]))
y = np.tile(i, current_slice.shape[0])
z = np.array(current_slice[:, i])
# crete points and segments to color
points = np.array([x, y, z]).T.reshape(-1, 1, 3)
segments = np.concatenate([points[:-1], points[1:]], axis=1)
# Create a continuous norm to map from data points to colors
norm = plt.Normalize(-0.19, 0.19)
lc = Line3DCollection(segments, cmap=cmap, norm=norm, zorder=current_slice.shape[1]-i)
# Set the values used for colormapping
lc.set_array(z)
lc.set_linewidth(2)
lc.set_color(cmap(z[-1] * 2.5 + 0.5))
lc.set_label(index_names[i])
lines.append(ax.add_collection(lc))
# add the grids
ax.legend(loc='center right', bbox_to_anchor=(1.1, 0.46), fancybox=True, facecolor=(.95,.95,.95,1), framealpha=1, shadow=False, frameon=True, ncol=1, columnspacing=0, prop={'family': 'DejaVu Sans Mono'})
ax.set_zlabel('Rolling Equity 1Y', labelpad=10)
ax.set_zlim(-0.39, 0.39)
ax.set_zticklabels([' '* 3 + '{:.0%}'.format(val) for val in ax.get_zticks()], fontdict={'verticalalignment': 'center', 'horizontalalignment': 'center'})
ax.set_xlabel('Date', labelpad=30)
ax.set_xlim(0, current_slice.shape[0]-1)
ax.set_xticklabels([index_dates[int(val)].strftime('%m/%y') for val in ax.get_xticks()[:-1]] + [''], rotation=0, fontdict={'verticalalignment': 'top', 'horizontalalignment': 'center'})
ax.set_yticks(np.arange(current_slice.shape[1]))
ax.set_yticklabels([index_names[i] for i in range(current_slice.shape[1])], rotation=-15, fontdict={'verticalalignment': 'center', 'horizontalalignment': 'left'})
# show the plot
plt.show()

Automatic add text to matplotlib plot in Python

I try to produce a plot and want to automatically add text (in this case is percentage) to each circle in correspond to each y axis types. Any help would be very helpful.
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Make some data
index=['Stream flow',
'Soil moisture',
'Water indices',
'Others',
'Temperature',
'Precipitation',
'Vegetative indices']
value=[2.13, 6.38, 10.64, 12.77, 17.73, 21.99, 28.37]
# create dataframe
percentages = pd.Series(value,index=index)
df = pd.DataFrame({'percentage' : percentages})
df = df.sort_values(by='percentage')
# we first need a numeric placeholder for the y axis
my_range=list(range(1,len(df.index)+1))
fig, ax = plt.subplots(figsize=(15,8))
# create for each expense type an horizontal line that starts at x = 0 with the length
plt.hlines(y=my_range, xmin=0, xmax=df['percentage']-0.5, color='black', alpha=0.8, linewidth=1)
# create for each expense type a dot at the level of the expense percentage value
line=plt.plot(df['percentage'], my_range, "o", markersize=30, color='#fd8c00', alpha=0.6, linewidth=0.3)
# set labels
ax.set_xlabel('Percentage', fontsize=15)
ax.set_ylabel('')
# set axis
ax.tick_params(axis='both', which='major', labelsize=14)
plt.yticks(my_range, df.index)
ax.set_xlim(0,30)

You can use matplotlib.axes.Axes.text:
x_space = 0.4
y_space = 0.05
fontsize = 7
for y_i, val in enumerate(value, 1):
ax.text(x = val - x_space, y = y_i - y_space, s = f'{val}%', fontsize = fontsize)
You have to adjust x_space, y_space and fontsize in order to fit properly the text within the circles.
Complete code
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Make some data
index=['Stream flow',
'Soil moisture',
'Water indices',
'Others',
'Temperature',
'Precipitation',
'Vegetative indices']
value=[2.13, 6.38, 10.64, 12.77, 17.73, 21.99, 28.37]
# create dataframe
percentages = pd.Series(value,index=index)
df = pd.DataFrame({'percentage' : percentages})
df = df.sort_values(by='percentage')
# we first need a numeric placeholder for the y axis
my_range=list(range(1,len(df.index)+1))
fig, ax = plt.subplots(figsize=(15,8))
# create for each expense type an horizontal line that starts at x = 0 with the length
plt.hlines(y=my_range, xmin=0, xmax=df['percentage']-0.5, color='black', alpha=0.8, linewidth=1)
# create for each expense type a dot at the level of the expense percentage value
line=plt.plot(df['percentage'], my_range, "o", markersize=30, color='#fd8c00', alpha=0.6, linewidth=0.3)
# set labels
ax.set_xlabel('Percentage', fontsize=15)
ax.set_ylabel('')
# set axis
ax.tick_params(axis='both', which='major', labelsize=14)
plt.yticks(my_range, df.index)
ax.set_xlim(0,30)
x_space = 0.4
y_space = 0.05
for y_i, val in enumerate(value, 1):
ax.text(x = val - x_space, y = y_i - y_space, s = f'{val:>5.2f}%', fontsize = 7)
plt.show()
Same code as above, but with increased circle radius and font, in order to improve readability.
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Make some data
index=['Stream flow',
'Soil moisture',
'Water indices',
'Others',
'Temperature',
'Precipitation',
'Vegetative indices']
value=[2.13, 6.38, 10.64, 12.77, 17.73, 21.99, 28.37]
# create dataframe
percentages = pd.Series(value,index=index)
df = pd.DataFrame({'percentage' : percentages})
df = df.sort_values(by='percentage')
# we first need a numeric placeholder for the y axis
my_range=list(range(1,len(df.index)+1))
fig, ax = plt.subplots(figsize=(15,8))
# create for each expense type an horizontal line that starts at x = 0 with the length
plt.hlines(y=my_range, xmin=0, xmax=df['percentage']-0.85, color='black', alpha=0.8, linewidth=1)
# create for each expense type a dot at the level of the expense percentage value
line=plt.plot(df['percentage'], my_range, "o", markersize=50, color='#fd8c00', alpha=0.6, linewidth=0.3)
# set labels
ax.set_xlabel('Percentage', fontsize=15)
ax.set_ylabel('')
# set axis
ax.tick_params(axis='both', which='major', labelsize=14)
plt.yticks(my_range, df.index)
ax.set_xlim(0,30)
ax.set_ylim(0, len(value) + 1)
x_space = 0.75
y_space = 0.06
fontsize = 12
for y_i, val in enumerate(value, 1):
ax.text(x = val - x_space, y = y_i - y_space, s = f'{val:>5.2f}%', fontsize = fontsize)
plt.show()
Even better, you can use matplotlib.axes.Axes.annotate to get rid of x_space and y_space:
fontsize = 12
for y_i, x_i in enumerate(value, 1):
ax.annotate(f'{x_i:>5.2f}%', xy = (x_i, y_i), xytext = (0, 0), textcoords = 'offset points', ha = 'center', va = 'center', fontsize = fontsize)
You still have to adjust the fontsize to properly fit the radius of the circles.

Set custom and changing baseline to stem plot in Matplotlib

I'm triying to make a figure where the stem plot has the baseline on the data of dataframe_3_merged['TOTAL'].
import numpy as np
from eurostatapiclient import EurostatAPIClient
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import pandas as pd
#Set versions and formats, so far only the ones used here are availeable and call client
VERSION = 'v2.1'
FORMAT = 'json'
LANGUAGE = 'en'
client = EurostatAPIClient(VERSION, FORMAT, LANGUAGE)
dataframe_3_query_total = 'ilc_peps01?precision=1&sex=T&geo=AT&geo=BE&geo=BG&geo=CH&geo=CY&geo=CZ&geo=DK&geo=EA19&geo=EE&geo=EL&geo=ES&geo=EU28&geo=FI&geo=FR&geo=HR&geo=HU&geo=IE&geo=IS&geo=IT&geo=LT&geo=LU&geo=LV&geo=ME&geo=MK&geo=MT&geo=NL&geo=NO&geo=PL&geo=PT&geo=RO&geo=RS&geo=SE&geo=SI&geo=SK&geo=TR&geo=UK&unit=PC&unitLabel=label&time=2018&age=TOTAL'
dataframe_3_query_urb = 'ilc_peps13?precision=1&deg_urb=DEG1&deg_urb=DEG2&deg_urb=DEG3&geo=AT&geo=BE&geo=BG&geo=CH&geo=CY&geo=CZ&geo=DE&geo=DK&geo=EA19&geo=EE&geo=EL&geo=ES&geo=EU28&geo=FI&geo=FR&geo=HR&geo=HU&geo=IE&geo=IS&geo=IT&geo=LT&geo=LU&geo=LV&geo=MK&geo=MT&geo=NL&geo=NO&geo=PL&geo=PT&geo=RO&geo=RS&geo=SE&geo=SI&geo=SK&geo=UK&unit=PC&unitLabel=label&time=2018'
dataframe_3_total = client.get_dataset(dataframe_3_query_total).to_dataframe().pivot(index = 'geo',columns = 'age',values = 'values')
dataframe_3_urb =client.get_dataset(dataframe_3_query_urb).to_dataframe().pivot(index = 'geo',columns = 'deg_urb',values = 'values')
dataframe_3_merged = dataframe_3_total.join(dataframe_3_urb).dropna()
fig, ax = plt.subplots(figsize=(15, 4))
plt.ylim(0,51)
x = range(0,32,1)
stem_1 =plt.stem(x,dataframe_3_merged['DEG1'])
stem_2=plt.stem(x, dataframe_3_merged['DEG2'])
stem_3=plt.stem(x, dataframe_3_merged['DEG3'])
plt.setp(stem_2, color = 'r')
plt.setp(stem_3, color = 'g')
scatterplot= sns.scatterplot(x=dataframe_3_merged.index, #We draw the scatterplot and specify the arguments
y = dataframe_3_merged['TOTAL'],
ax=ax ,
s = 100 ,
legend = False,
marker="_",
color = 'b')
The goal is to have a plot similar to this image:
I tried to use the list dataframe_3_merged['TOTAL'] as the parameter in the bottom argument of plt.stem but I have this traceback: ValueError: setting an array element with a sequence.
Thank you for your help!

You could replace each stem plot by a scatter plot and a plot of vertical lines (plt.vlines). Setting the zorder=0 ensures the lines are drawn behind the dots.
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
names = ['hydrogen', 'helium', 'lithium', 'beryllium', 'boron', 'carbon', 'nitrogen', 'oxygen', 'fluorine', 'neon', 'sodium', 'magnesium', 'aluminium', 'silicon', 'phosphorus', 'sulphur', 'chlorine', 'argon', 'potassium', 'calcium', 'scandium', 'titanium', 'vanadium', 'chromium', 'manganese', 'iron', 'cobalt', 'nickel', 'copper', 'zinc', 'gallium', 'germanium', 'arsenic', 'selenium', 'bromine', 'krypton']
N = len(names)
df = pd.DataFrame({'Deg1': 35 + np.random.normal(size=N).cumsum(),
'Deg2': 25 + np.random.normal(size=N).cumsum(),
'Deg3': 15 + np.random.normal(size=N).cumsum()},
index=names)
df['Total'] = df.mean(axis=1)
for deg, color, label in zip(['Deg1', 'Deg2', 'Deg3'], ['tomato', 'orange', 'palegreen'],
['label1', 'label2', 'label3']):
plt.vlines(df.index, df[deg], df['Total'], lw=0.2, color='k', zorder=0)
plt.scatter(df.index, df[deg], marker='o', color=color, label=label)
plt.scatter(df.index, df['Total'], marker='_', color='deepskyblue', s=100)
plt.xticks(rotation='vertical')
plt.ylim(0, 51)
plt.margins(x=0.02)
plt.legend(ncol=3, bbox_to_anchor=(0.5, -0.4), loc='upper center')
plt.grid(True, axis='y')
plt.tick_params(length=0)
for where in ['top', 'left', 'right']:
plt.gca().spines[where].set_visible(False)
plt.tight_layout()
plt.show()

Visualizing the difference between two numeric arrays

I have two numeric arrays of equal length, with one array always having the element value >= to the corresponding (same index) element in the second array.
I am trying to visualize in a single graph:
i) difference between the corresponding elements,
ii) values of the corresponding elements in the two arrays.
I have tried plotting the CDF as below:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
arr1 = np.random.uniform(1,20,[25,1])
arr2 = arr1 + np.random.uniform(1,10,[25,1])
df1 = pd.DataFrame(arr1)
df2 = pd.DataFrame(arr2)
fix, ax = plt.subplots()
sns.kdeplot(df1[0], cumulative=True, color='orange', label='arr1')
sns.kdeplot(df2[0], cumulative=True, color='b', label='arr2')
sns.kdeplot(df2[0]-df1[0], cumulative=True, color='r', label='difference')
plt.show()
which gives the following output:
However, it does not capture the difference, and values of the corresponding elements together. For example, suppose the difference between two elements is 3. The two numbers can be 2 and 5, but they can also be 15 and 18, and this can not be determined from the CDF.
Which kind of plotting can visualize both the difference between the elements and the values of the elements?
I do not wish to line plot as below because not much statistical insights can be derived from the visualization.
ax.plot(df1[0])
ax.plot(df2[0])
ax.plot(df2[0]-df1[0])

There are lots of ways to show difference between two values. It really depends on your goal for the chart, how quantitative or qualitative you want to be, or if you want to show the raw data somehow. Here are a few ideas that come to mind that do not involve simple line plots or density functions. I strongly recommend the book Better Data Visualization by Johnathan Schwabish. He discusses interesting considerations regarding data presentation.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import ticker
arr1 = np.random.uniform(1,20, size=25)
arr2 = arr1 + np.random.uniform(1,10, size=25)
df = pd.DataFrame({
'col1' : arr1,
'col2' : arr2
})
df['diff'] = df.col2 - df.col1
df['sum'] = df.col1 + df.col2
fig, axes = plt.subplots(ncols=2, nrows=3, figsize=(15,15))
axes = axes.flatten()
# Pyramid chart
df_sorted = df.sort_values(by='sum', ascending=True)
axes[0].barh(
y = np.arange(1,26),
width = -df_sorted.col1
)
axes[0].barh(
y = np.arange(1,26),
width = df_sorted.col2
)
# Style axes[0]
style_func(axes[0], 'Pyramid Chart')
# Dot Plot
axes[1].scatter(df.col1, np.arange(1, 26), label='col1')
axes[1].scatter(df.col2, np.arange(1, 26), label='col2')
axes[1].hlines(
y = np.arange(1, 26),
xmin = df.col1, xmax = df.col2,
zorder=0, linewidth=1.5, color='k'
)
# Style axes[1]
legend = axes[1].legend(ncol=2, loc='center', bbox_to_anchor=(0.14,1.025), edgecolor='w')
style_func(axes[1], 'Dot Plot')
set_xlim = axes[1].set_xlim(0,25)
# Dot Plot 2
df_sorted = df.sort_values(by=['col1', 'diff'], ascending=False)
axes[2].scatter(df_sorted.col1, np.arange(1, 26), label='col1')
axes[2].scatter(df_sorted.col2, np.arange(1, 26), label='col2')
axes[2].hlines(
y = np.arange(1, 26),
xmin = df_sorted.col1, xmax = df_sorted.col2,
zorder=0, linewidth=1.5, color='k'
)
# Style axes[2]
legend = axes[2].legend(ncol=2, loc='center', bbox_to_anchor=(0.14,1.025), edgecolor='w')
style_func(axes[2], 'Dot Plot')
set_xlim = axes[2].set_xlim(0,25)
# Dot Plot 3
df_sorted = df.sort_values(by='sum', ascending=True)
axes[3].scatter(-df_sorted.col1, np.arange(1, 26), label='col1')
axes[3].scatter(df_sorted.col2, np.arange(1, 26), label='col2')
axes[3].vlines(x=0, ymin=-1, ymax=27, linewidth=2.5, color='k')
axes[3].hlines(
y = np.arange(1, 26),
xmin = -df_sorted.col1, xmax = df_sorted.col2,
zorder=0, linewidth=2
)
# Style axes[3]
legend = axes[3].legend(ncol=2, loc='center', bbox_to_anchor=(0.14,1.025), edgecolor='w')
style_func(axes[3], 'Dot Plot')
# Strip plot
axes[4].scatter(df.col1, [4] * 25)
axes[4].scatter(df.col2, [6] * 25)
axes[4].set_ylim(0, 10)
axes[4].vlines(
x = [df.col1.mean(), df.col2.mean()],
ymin = [3.5, 5.5], ymax=[4.5,6.5],
color='black', linewidth =2
)
# Style axes[4]
axes[4].yaxis.set_major_locator(ticker.FixedLocator([4,6]))
axes[4].yaxis.set_major_formatter(ticker.FixedFormatter(['col1','col2']))
hide_spines = [axes[4].spines[x].set_visible(False) for x in ['left','top','right']]
set_title = axes[4].set_title('Strip Plot', fontweight='bold')
tick_params = axes[4].tick_params(axis='y', left=False)
grid = axes[4].grid(axis='y', dashes=(8,3), alpha=0.3, color='gray')
# Slope chart
for i in range(25):
axes[5].plot([0,1], [df.col1[i], df.col2[i]], color='k')
align = ['left', 'right']
for i in range(1,3):
axes[5].text(x = i - 1, y = 0, s = 'col' + str(i),
fontsize=14, fontweight='bold', ha=align[i-1])
set_title = axes[5].set_title('Slope chart', fontweight='bold')
axes[5].axis('off')
def style_func(ax, title):
hide_spines = [ax.spines[x].set_visible(False) for x in ['left','top','right']]
set_title = ax.set_title(title, fontweight='bold')
set_xlim = ax.set_xlim(-25,25)
x_locator = ax.xaxis.set_major_locator(ticker.MultipleLocator(5))
y_locator = ax.yaxis.set_major_locator(ticker.FixedLocator(np.arange(1,26, 2)))
spine_width = ax.spines['bottom'].set_linewidth(1.5)
x_tick_params = ax.tick_params(axis='x', length=8, width=1.5)
x_tick_params = ax.tick_params(axis='y', left=False)

What about a parallel coordinates plot with plotly? This will allow to see the distinct values of each original array but then also if they converge on the same diffrence?
https://plot.ly/python/parallel-coordinates-plot/

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Seaborn grouped boxplot symbol on each boxplot - python

Related

How box plot 'all' axvspan from time series with right code?

Line chart to surface chart

Automatic add text to matplotlib plot in Python

Set custom and changing baseline to stem plot in Matplotlib

Visualizing the difference between two numeric arrays

Categories

Resources