Clustered stacked bar plot with error bars

Clustered stacked bar plot with error bars - python

I am using the code (posted here: https://stackoverflow.com/a/22845857/6649485) below to generate a clustered stacked bar plot. Unfortunately the error bars are not shifted in the same way as the data bars. I am not sure how to adress them and set their x-value accordingly.
def plot_clustered_stacked(dfall, labels=None, title="SEC stress study", H="/", **kwargs):
"""Given a list of dataframes, with identical columns and index, create a clustered stacked bar plot. labels is a list of the names of the dataframe, used for the legend title is a string for the title of the plot H is the hatch used for identification of the different dataframe"""
n_df = len(dfall)
n_col = len(dfall[0].columns)
n_ind = len(dfall[0].index)
axe = plt.subplot(111)
for df in dfall : # for each data frame
axe = df.plot(kind="bar",
linewidth=0,
stacked=True,
ax=axe,
legend=False,
grid=False,
yerr=0.1,
**kwargs) # make bar plots
h,l = axe.get_legend_handles_labels() # get the handles we want to modify
for i in range(0, n_df * n_col, n_col): # len(h) = n_col * n_df
for j, pa in enumerate(h[i:i+n_col]):
for rect in pa.patches: # for each index
rect.set_x(rect.get_x() + 1 / float(n_df + 1) * i / float(n_col))
rect.set_hatch(H * int(i / n_col)) #edited part
rect.set_width(1 / float(n_df + 1))
axe.set_xticks((np.arange(0, 2 * n_ind, 2) + 1 / float(n_df + 1)) / 2.)
axe.set_xticklabels(df.index, rotation = 0)
axe.set_title(title)
# Add invisible data to add another legend
n=[]
for i in range(n_df):
n.append(axe.bar(0, 0, color="gray", hatch=H * i))
l1 = axe.legend(h[:n_col], l[:n_col], loc=[1.01, 0.5])
if labels is not None:
l2 = plt.legend(n, labels, loc=[1.01, 0.1])
axe.add_artist(l1)
return axe

This is what I came up with by now.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
def plot_clustered_stacked(dfall):
for j, df in enumerate(dfall):
set_count=1
width = 0.2
N=len(df.index)
b_width = 0.2
index=np.arange(N/set_count)
labels=df.index
p1 = plt.bar(index +j*b_width, df['Average Monomer Area'], width=width, yerr='0.5 Stdev Monomer Area', data=df)
p2 = plt.bar(index +j*b_width, df['Average HMW Area'], bottom=df['Average Monomer Area'], width=width, yerr='0.5 Stdev HMW Area', data=df)
plt.xticks(index+b_width, labels)
plt.legend((p1[0], p2[0]), ('Average Monomer Area', 'Average
HMW Area'))
data=pd.read_csv("SEC.csv")
df1 = data[data['Time'].str.contains("0 weeks")].drop(['High/low', 'Time'], axis=1)
df1.set_index('Excipient Name', inplace=True)
df2 = data[data['Time'].str.contains("1 week")].drop(['High/low', 'Time'], axis=1)
df2.set_index('Excipient Name', inplace=True)
df3 = data[data['Time'].str.contains("3 weeks")].drop(['High/low', 'Time'], axis=1)
df3.set_index('Excipient Name', inplace=True)
plot_clustered_stacked([df1, df2, df3])
plt.show()

Related

Readable values in on axis with Matplotlib

I am working with Matplotlib and trying to plot a combo box with bars and lines. Below you can see my data:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from matplotlib.ticker import FormatStrFormatter
# Data
data = {
'Year': ['2010','2011','2012','2013','2014','2015','2016','2017','2018','2019'],
'Rate':[10,10,9,7,5,5,5,5,5,5],
'ChangeRate_1':[7,-50,24,150,8,10,60,5,180,5],
'ChangeRate_2':[7,6,-3,1,8,5,8,5,15,5],
}
df = pd.DataFrame(data, columns = ['Year',
'Rate',
'ChangeRate_1',
'ChangeRate_2'
])
df
Below you can see code :
# Ploting combo plot
fig, ax_1 = plt.subplots(figsize = (8, 5))
ax_2 = ax_1.twinx()
ax_3 = ax_2.twinx() ### <---- Problem is probably here
cmap = get_cmap('tab10')
ax_1.bar(df['Year'], df['Rate'], label = 'Rate', color = cmap(0))
ax_2.plot(df['Year'], df['ChangeRate_1'], label = 'ChangeRate_2', color = cmap(0.1),linewidth = '3.5')
ax_3.plot(df['Year'], df['ChangeRate_2'], label = 'ChangeRate_2', color = cmap(0.2),linewidth = '3.5')
handles_1, labels_1 = ax_1.get_legend_handles_labels()
handles_2, labels_2 = ax_2.get_legend_handles_labels()
handles_3, labels_3 = ax_3.get_legend_handles_labels()
ax_1.set_ylim(0, 16)
ax_2.set_ylim(-50,180)
ax_1.legend(handles = handles_1 + handles_2 + labels_3,
labels = labels_1 + labels_2 + labels_3,
loc = 'upper right',
shadow = True)
ax_1.grid(axis = 'y')
ax_1.set_title('Comparison of revenues',fontsize=11)
ax_1.set_ylabel('Rate')
ax_2.set_ylabel('ChangeRate_1')
ax_3.set_ylabel('ChangeRate_2')
ax_1.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))
plt.savefig('ComparisonOfRevenues.pdf')
plt.show()
The above code produces a plot that is shown below.
As shown in the above plot, values for the y-axis for the left and for the right side overlap with values and are not readable.
For the left side, the scale for the 'Rate' should be in the range of 0 to 16, while for the right side, for ChangeRate_1 and ChangeRate_2, from -50 to 180.
So can anybody help me how to solve this problem ?

The instantiation of the third Axes object with ax_3 = ax_2.twinx() can be circumvented by using just one extra y-axis on the right and plotting ChangeRate_1 and ChangeRate_2 on that axis keeping the (right) y-axis label as ChangeRate and then assigning correct labels to the lines.
Code:
fig, ax_1 = plt.subplots(figsize=(8, 5))
ax_2 = ax_1.twinx()
cmap = get_cmap('tab10')
ax_1.bar(df['Year'], df['Rate'], label='Rate', color=cmap(0))
ax_2.plot(df['Year'], df['ChangeRate_1'], label='ChangeRate_1', color=cmap(0.1), linewidth='3.5')
ax_2.plot(df['Year'], df['ChangeRate_2'], label='ChangeRate_2', color=cmap(0.2), linewidth='3.5')
handles_1, labels_1 = ax_1.get_legend_handles_labels()
handles_2, labels_2 = ax_2.get_legend_handles_labels()
ax_1.set_ylim(0, 16)
ax_2.set_ylim(-50,180)
ax_1.legend(handles=handles_1 + handles_2, labels=labels_1 + labels_2,
loc='upper right', shadow=True)
ax_1.grid(axis='y')
ax_1.set_title('Comparison of revenues',fontsize=11)
ax_1.set_ylabel('Rate')
ax_2.set_ylabel('ChangeRate')
ax_1.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))
plt.show()

What is wrong with my multiple line graph plotting?

I am attempting to plot multiple line graphs in a graph table itself. However, I run into an error that mentioned:
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
Not only this happened but my legend tables of the 3 lines don't merge together and my X-axis does not show the months but random numbers from my dataframe. Here is my code and graph result to look through.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_excel (r'C:\Users\admin\Desktop\Question Folder\Sales of top 30 customers.xlsx')
#Refine and adjust the dataframe for suitable manipulation
df = df.drop('Unnamed: 0', axis = 1)
df = df.iloc[2: , :]
row_detail = df.head(1).values.tolist()
row_detail = row_detail[0]
a = df.iloc[-3:, :].values.tolist()
a = a[0]
df.columns = row_detail
df = df.iloc[1:, :]
print(df) # This is for checking purpose
# This creates a dataframe needed for the practice
df1 = df.iloc[:3]
# This is to plot a line graph from df1
df_chosen = df1
a = 0
# Turning data row of a customer into a list
data_row_1 = df_chosen.iloc[a].values.tolist()
data_row_2 = df_chosen.iloc[a + 1].values.tolist()
data_row_3 = df_chosen.iloc[a + 2].values.tolist()
date = data_row_1[1:]
cus_1 = data_row_1[0]
cus_2 = data_row_2[0]
cus_3 = data_row_3[0]
y1 = data_row_1[1:]
y2 = data_row_2[1:]
y3 = data_row_3[1:]
x = np.arange(len(date)) # the label locations
width = 0.60 # the width of the bars
fig, ax = plt.subplots()
# Increase size of plot in jupyter
plt.rcParams["figure.figsize"] = (20,15)
plt.rcParams.update({'font.size':25})
# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_xlabel('Months', fontsize=30)
ax.set_ylabel('Sales', fontsize=30)
ax.set_title('Monthly Sales from ' + cus_1 +", " + cus_2+ " and " + cus_3, fontsize=30)
ax.set_xticks(x, date)
ax.set_ylim(bottom = 0, top = 1000)
legend1 = plt.legend(())
ax.legend(loc='best', fontsize=30)
plt.grid(True)
# set up the 1st line graph
ax.plot(x, y1, "r", label = cus_1, marker='x')
#ax.set_yticks(
ax.grid(True) # turn on grid #1
ax.set_ylim(bottom = 0, top = 1000)
ax.legend(loc='upper left', fontsize=25)
ax2 = ax.twinx()
ax2.plot(x, y2, "b", label= cus_2, marker='x')
ax2.set_yticks([])
ax2.grid(False) # turn off grid #2
ax2.set_ylim(bottom = 0, top = 10000)
ax2.legend(loc='upper left', fontsize=25)
ax3 = ax2.twinx()
ax3.plot(x, y3, "g", label= cus_3, marker='x')
ax3.set_yticks([])
ax3.grid(False) # turn off grid #2
ax3.set_ylim(bottom = 0, top = 10000)
ax3.legend(loc='upper left', fontsize=25)
I just need to understand and know the solutions for the following:
Why is the X-axis not showing the months' names?
Why is the 3 separate legend tables not connected together?
How do I avoid the 'No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.' error warning?
Hope to receive a favorable reply soon. :)
Edit notice: Here is the dataframe used for this problem:

Seaborn swarmplot break into lines

I'm trying to make this swarmplot with seaborn
My problem is that the swarms are too wide. I want to be able to break them up into rows of maximum 3 dots per row
This is my code:
# Import modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
###
# Import and clead dataset
url = "https://raw.githubusercontent.com/amirnakar/scratchboard/master/Goodreads/goodreads_library_export.csv"
Books = pd.read_csv(url)
Books = Books[Books['Date Read'].notna()] # Remove NA
Books['Year'] = pd.to_datetime( # Convert to dates
Books['Date Read'],
format='%YYYY%mm%dd',
errors='coerce')
Books['Year'] = pd.DatetimeIndex(Books['Date Read']).year # Take only years
Books[['Year', 'Date Read']] # merge the years in
###
# Calculate mean rate by year
RateMeans = (Books["My Rating"].groupby(Books["Year"]).mean())
Years = list(RateMeans.index.values)
Rates = list(RateMeans)
RateMeans = pd.DataFrame(
{'Years': Years,
'Rates': Rates
})
###
# Plot
fig,ax = plt.subplots(figsize=(20,10))
## Violin Plot:
plot = sns.violinplot(
data=Books,
x = "Year",
y = 'My Rating',
ax=ax,
color = "white",
inner=None,
#palette=colors_from_values(ArrayRates[:,1], "Blues")
)
## Swarm Plot
plot = sns.swarmplot(
data=Books,
x = "Year",
y = 'My Rating',
ax=ax,
hue = "My Rating",
size = 10
)
## Style
### Title
ax.text(x=0.5, y=1.1, s='Book Ratings: Distribution per Year', fontsize=32, weight='bold', ha='center', va='bottom', transform=ax.transAxes)
ax.text(x=0.5, y=1.05, s='Source: Goodreads.com (amirnakar)', fontsize=24, alpha=0.75, ha='center', va='bottom', transform=ax.transAxes)
### Axis
ax.set(xlim=(4.5, None), ylim=(0,6))
#ax.set_title('Book Ratings: Distribution per Year \n', fontsize = 32)
ax.set_ylabel('Rating (out of 5 stars)', fontsize = 24)
ax.set_xlabel('Year', fontsize = 24)
ax.set_yticklabels(ax.get_yticks().astype(int), size=20)
ax.set_xticklabels(ax.get_xticks(), size=20)
### Legend
plot.legend(loc="lower center", ncol = 5 )
### Colour pallete
colorset = ["#FAFF04", "#FFD500", "#9BFF00", "#0099FF", "#000BFF"]
colorset.reverse()
sns.set_palette(sns.color_palette(colorset))
# Save the plot
#plt.show(plot)
plt.savefig("Rate-Python.svg", format="svg")
This is the output:
What I want to have happen:
I want to be able to define that each row of dots should have a maximum of 3, if it's more, than break it into a new row. I demonstrate it here (done manually in PowerPoint) on two groups, but I want it for the entire plot
BEFORE:
AFTER:

Here is an attempt to relocate the dots a bit upward/downward. The value for delta comes from experimenting.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Import and clea dataset
url = "https://raw.githubusercontent.com/amirnakar/scratchboard/master/Goodreads/goodreads_library_export.csv"
Books = pd.read_csv(url)
Books = Books[Books['Date Read'].notna()] # Remove NA
Books['Year'] = pd.DatetimeIndex(Books['Date Read']).year # Take only years
# Calculate mean rate by year
RatePerYear = Books[["My Rating", "Year"]].groupby("Year")["My Rating"].value_counts()
modified_ratings = []
delta = 0.2 # distance to move overlapping ratings
for (year, rating), count in RatePerYear.iteritems():
higher = max(0, ((count - 3) + 1) // 2)
lower = max(0, (count - 3) // 2)
modified_ratings.append([year, rating, count - higher - lower])
for k in range((higher + 2) // 3):
modified_ratings.append([year, rating + (k + 1) * delta, 3 if (k + 1) * 3 <= higher else higher % 3])
for k in range((lower + 2) // 3):
modified_ratings.append([year, rating - (k + 1) * delta, 3 if (k + 1) * 3 <= lower else lower % 3])
modified_ratings = np.array(modified_ratings)
modified_ratings_df = pd.DataFrame(
{'Year': np.repeat(modified_ratings[:, 0].astype(int), modified_ratings[:, 2].astype(int)),
'My Rating': np.repeat(modified_ratings[:, 1], modified_ratings[:, 2].astype(int))})
modified_ratings_df['Rating'] = modified_ratings_df['My Rating'].round().astype(int)
fig, ax = plt.subplots(figsize=(20, 10))
sns.violinplot(data=Books, x="Year", y='My Rating', ax=ax, color="white", inner=None)
palette = ["#FAFF04", "#FFD500", "#9BFF00", "#0099FF", "#000BFF"].reverse()
sns.swarmplot(data=modified_ratings_df, x="Year", y='My Rating', ax=ax, hue="Rating", size=10, palette=palette)
ax.set(xlim=(4.5, None), ylim=(0, 6))
ax.legend(loc="lower center", ncol=5)
plt.tight_layout()
plt.show()

How to plot a bar chart with multiple x-axis data?

I'd like to plot a bar chart in Python, similar to Excel. However, I am struggling to have two different x-axes. For example, for each size (like 8M), I want to plot the results of all 5 strategies. For each strategy, there are 3 metrics (Fit, boot, and exp).
You can download the original excel file here here.
This is my code so far:
df = pd.read_excel("data.xlsx",sheet_name="Sheet1")
r1= df['Fit']
r2= df['Boot']
r3= df['Exp']
x= df['strategy']
n_groups = 5
# create plot
fig, ax = plt.subplots()
index = np.arange(n_groups)
names = ["8M","16M","32M","64M","128M"]
bar_width = 0.1
opacity = 0.8
Fit8= [r1[0],r1[1],r1[2],r1[3],r1[4]]
Boot8= [r2[0],r2[1],r2[2],r2[3],r2[4]]
Exp8= [r3[0],r3[1],r3[2],r3[3],r3[4]]
Fit16= [r1[5],r1[6],r1[7],r1[8],r1[9]]
Boot16= [r2[5],r2[6],r2[7],r2[8],r2[9]]
Exp16= [r3[5],r3[6],r3[7],r3[8],r3[9]]
rects1 = plt.bar(
index, Fit8, bar_width,
alpha=opacity,
color='g',
label='Fit'
)
rects2 = plt.bar(
index + 0.1, Boot8, bar_width,
alpha=opacity,
color='b',
label='Boot'
)
rects3 = plt.bar(
index + 0.2, Exp8, bar_width,
alpha=opacity,
color='y',
label='EXP'
)
rects4 = plt.bar(
index + 0.5, Fit16, bar_width,
alpha=opacity,
color='g'
)
rects5 = plt.bar(
index + 0.6, Boot16, bar_width,
alpha=opacity,
color='b'
)
rects6 = plt.bar(
index + 0.7, Exp16, bar_width,
alpha=opacity,
color='y'
)
plt.xticks(index + 0.2, (names))
plt.legend()
plt.tight_layout()
plt.show()

Something like this?
Here the code:
import pandas as pd
import pylab as plt
# read dataframe, take advantage of Multiindex
df = pd.read_excel(
"data.xlsx",
sheet_name="Sheet1", engine='openpyxl',
index_col=[0, 1],
)
# plot the content of the dataframe
ax = df.plot.bar()
# Show minor ticks
ax.minorticks_on()
# Get location of the center of each bar
bar_locations = list(map(lambda x: x.get_x() + x.get_width() / 2., ax.patches))
# Set minor and major tick positions
# Minor are used for S1, ..., S5
# Major for sizes 8M, ..., 128M
# tick locations are sorted according to the 3 metrics, so first all the 25 bars for the fit, then the 25
# for the boot and at the end the 25 for the exp. We set the major tick at the position of the bar at the center
# of the size group, that is the third boot bar of each size.
ax.set_xticks(bar_locations[27:50:5], minor=False) # use the 7th bar of each size group
ax.set_xticks(bar_locations[len(df):2 * len(df)], minor=True) # use the bar in the middle of each group of 3 bars
# Labels for groups of 3 bars and for each group of size
ax.set_xticklabels(df.index.get_level_values(0)[::5], minor=False, rotation=0)
ax.set_xticklabels(df.index.get_level_values(1), minor=True, rotation=0)
# Set tick parameters
ax.tick_params(axis='x', which='major', pad=15, bottom='off')
ax.tick_params(axis='x', which='both', top='off')
# You can use a different color for each group
# You can comment out these lines if you don't like it
size_colors = 'rgbym'
# major ticks
for l, c in zip(ax.get_xticklabels(minor=False), size_colors):
l.set_color(c)
l.set_fontweight('bold')
# minor ticks
for i, l in enumerate(ax.get_xticklabels(minor=True)):
l.set_color(size_colors[i // len(size_colors)])
# remove x axis label
ax.set_xlabel('')
plt.tight_layout()
plt.show()
The main idea here is to use the Multiindex of Pandas, with some minor tweaks.
EDIT
If you want spaces between groups, you can add a dummy category (a.k.a strategy) in the dataframe to create an artificial space, obtaining:
Here the code:
import numpy as np
import pandas as pd
import pylab as plt
# read dataframe, take advantage of Multiindex
df = pd.read_excel(
"data.xlsx",
sheet_name="Sheet1", engine='openpyxl',
index_col=[0, 1],
)
# plot the content of the dataframe
sizes = list(df.index.get_level_values(0).drop_duplicates())
strategies = list(df.index.get_level_values(1).drop_duplicates())
n_sizes = len(sizes)
n_strategies = len(strategies)
n_metrics = len(df.columns)
empty_rows = pd.DataFrame(
data=[[np.nan] * n_metrics] * n_sizes, index=pd.MultiIndex.from_tuples([(s, 'SN') for s in sizes], names=df.index.names),
columns=df.columns,
)
old_columns = list(df.columns)
df = df.merge(empty_rows, how='outer', left_index=True, right_index=True, sort=False).drop(
columns=[f'{c}_y' for c in df.columns]
).sort_index(
ascending=True, level=0, key=lambda x: sorted(x, key=lambda y: int(y[:-1]))
)
df.columns = old_columns
# Update number of strategies
n_strategies += 1
# Plot with Pandas
ax = df.plot.bar()
# Show minor ticks
ax.minorticks_on()
# Get location of the center of each bar
bar_locations = list(map(lambda x: x.get_x() + x.get_width() / 2., ax.patches))
# Set minor and major tick positions
# Major for sizes 8M, ..., 128M
# Minor are used for S1, ..., S5, SN
# Tick locations are sorted according to the 3 metrics, so first 30 (5 sizes * 6 strategies) bars for the fit,
# then 30 (5 sizes * 6 strategies) for the boot and at the end 30 (5 sizes * 6 strategies) for the exp.
# We set the major tick at the position of the bar at the center of the size group (+7),
# that is the third boot bar of each size.
n_bars_per_metric = n_sizes * n_strategies
strategy_ticks = bar_locations[len(df):2 * len(df)]
strategy_ticks = np.concatenate([strategy_ticks[b * n_strategies:b * n_strategies + n_strategies - 1] for b in range(n_sizes)]) # get only positions of the first 5 bars
size_ticks = strategy_ticks[2::n_sizes] + 0.01
ax.set_xticks(size_ticks, minor=False) # use the 7th bar of each size group
ax.set_xticks(strategy_ticks, minor=True) # use the bar in the middle of each group of 3 bars
# Labels for groups of 3 bars and for each group of size
ax.set_xticklabels(sizes, minor=False, rotation=0)
ax.set_xticklabels(strategies * n_sizes, minor=True, rotation=0)
# Set tick parameters
ax.tick_params(axis='x', which='major', pad=15, bottom=False)
ax.tick_params(axis='x', which='both', top=False)
# You can use a different color for each group
# You can comment out these lines if you don't like it
size_colors = 'rgbym'
# major ticks
for l, c in zip(ax.get_xticklabels(minor=False), size_colors):
l.set_color(c)
l.set_fontweight('bold')
# minor ticks
for i, l in enumerate(ax.get_xticklabels(minor=True)):
l.set_color(size_colors[i // len(size_colors)])
# remove x axis label
ax.set_xlabel('')
plt.tight_layout()
plt.show()
As you can see, you have to play with the DataFrame, adding some extra code. Maybe there is a simpler solution, but it was the first that I can think of.

Visualizing the difference between two numeric arrays

I have two numeric arrays of equal length, with one array always having the element value >= to the corresponding (same index) element in the second array.
I am trying to visualize in a single graph:
i) difference between the corresponding elements,
ii) values of the corresponding elements in the two arrays.
I have tried plotting the CDF as below:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
arr1 = np.random.uniform(1,20,[25,1])
arr2 = arr1 + np.random.uniform(1,10,[25,1])
df1 = pd.DataFrame(arr1)
df2 = pd.DataFrame(arr2)
fix, ax = plt.subplots()
sns.kdeplot(df1[0], cumulative=True, color='orange', label='arr1')
sns.kdeplot(df2[0], cumulative=True, color='b', label='arr2')
sns.kdeplot(df2[0]-df1[0], cumulative=True, color='r', label='difference')
plt.show()
which gives the following output:
However, it does not capture the difference, and values of the corresponding elements together. For example, suppose the difference between two elements is 3. The two numbers can be 2 and 5, but they can also be 15 and 18, and this can not be determined from the CDF.
Which kind of plotting can visualize both the difference between the elements and the values of the elements?
I do not wish to line plot as below because not much statistical insights can be derived from the visualization.
ax.plot(df1[0])
ax.plot(df2[0])
ax.plot(df2[0]-df1[0])

There are lots of ways to show difference between two values. It really depends on your goal for the chart, how quantitative or qualitative you want to be, or if you want to show the raw data somehow. Here are a few ideas that come to mind that do not involve simple line plots or density functions. I strongly recommend the book Better Data Visualization by Johnathan Schwabish. He discusses interesting considerations regarding data presentation.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import ticker
arr1 = np.random.uniform(1,20, size=25)
arr2 = arr1 + np.random.uniform(1,10, size=25)
df = pd.DataFrame({
'col1' : arr1,
'col2' : arr2
})
df['diff'] = df.col2 - df.col1
df['sum'] = df.col1 + df.col2
fig, axes = plt.subplots(ncols=2, nrows=3, figsize=(15,15))
axes = axes.flatten()
# Pyramid chart
df_sorted = df.sort_values(by='sum', ascending=True)
axes[0].barh(
y = np.arange(1,26),
width = -df_sorted.col1
)
axes[0].barh(
y = np.arange(1,26),
width = df_sorted.col2
)
# Style axes[0]
style_func(axes[0], 'Pyramid Chart')
# Dot Plot
axes[1].scatter(df.col1, np.arange(1, 26), label='col1')
axes[1].scatter(df.col2, np.arange(1, 26), label='col2')
axes[1].hlines(
y = np.arange(1, 26),
xmin = df.col1, xmax = df.col2,
zorder=0, linewidth=1.5, color='k'
)
# Style axes[1]
legend = axes[1].legend(ncol=2, loc='center', bbox_to_anchor=(0.14,1.025), edgecolor='w')
style_func(axes[1], 'Dot Plot')
set_xlim = axes[1].set_xlim(0,25)
# Dot Plot 2
df_sorted = df.sort_values(by=['col1', 'diff'], ascending=False)
axes[2].scatter(df_sorted.col1, np.arange(1, 26), label='col1')
axes[2].scatter(df_sorted.col2, np.arange(1, 26), label='col2')
axes[2].hlines(
y = np.arange(1, 26),
xmin = df_sorted.col1, xmax = df_sorted.col2,
zorder=0, linewidth=1.5, color='k'
)
# Style axes[2]
legend = axes[2].legend(ncol=2, loc='center', bbox_to_anchor=(0.14,1.025), edgecolor='w')
style_func(axes[2], 'Dot Plot')
set_xlim = axes[2].set_xlim(0,25)
# Dot Plot 3
df_sorted = df.sort_values(by='sum', ascending=True)
axes[3].scatter(-df_sorted.col1, np.arange(1, 26), label='col1')
axes[3].scatter(df_sorted.col2, np.arange(1, 26), label='col2')
axes[3].vlines(x=0, ymin=-1, ymax=27, linewidth=2.5, color='k')
axes[3].hlines(
y = np.arange(1, 26),
xmin = -df_sorted.col1, xmax = df_sorted.col2,
zorder=0, linewidth=2
)
# Style axes[3]
legend = axes[3].legend(ncol=2, loc='center', bbox_to_anchor=(0.14,1.025), edgecolor='w')
style_func(axes[3], 'Dot Plot')
# Strip plot
axes[4].scatter(df.col1, [4] * 25)
axes[4].scatter(df.col2, [6] * 25)
axes[4].set_ylim(0, 10)
axes[4].vlines(
x = [df.col1.mean(), df.col2.mean()],
ymin = [3.5, 5.5], ymax=[4.5,6.5],
color='black', linewidth =2
)
# Style axes[4]
axes[4].yaxis.set_major_locator(ticker.FixedLocator([4,6]))
axes[4].yaxis.set_major_formatter(ticker.FixedFormatter(['col1','col2']))
hide_spines = [axes[4].spines[x].set_visible(False) for x in ['left','top','right']]
set_title = axes[4].set_title('Strip Plot', fontweight='bold')
tick_params = axes[4].tick_params(axis='y', left=False)
grid = axes[4].grid(axis='y', dashes=(8,3), alpha=0.3, color='gray')
# Slope chart
for i in range(25):
axes[5].plot([0,1], [df.col1[i], df.col2[i]], color='k')
align = ['left', 'right']
for i in range(1,3):
axes[5].text(x = i - 1, y = 0, s = 'col' + str(i),
fontsize=14, fontweight='bold', ha=align[i-1])
set_title = axes[5].set_title('Slope chart', fontweight='bold')
axes[5].axis('off')
def style_func(ax, title):
hide_spines = [ax.spines[x].set_visible(False) for x in ['left','top','right']]
set_title = ax.set_title(title, fontweight='bold')
set_xlim = ax.set_xlim(-25,25)
x_locator = ax.xaxis.set_major_locator(ticker.MultipleLocator(5))
y_locator = ax.yaxis.set_major_locator(ticker.FixedLocator(np.arange(1,26, 2)))
spine_width = ax.spines['bottom'].set_linewidth(1.5)
x_tick_params = ax.tick_params(axis='x', length=8, width=1.5)
x_tick_params = ax.tick_params(axis='y', left=False)

What about a parallel coordinates plot with plotly? This will allow to see the distinct values of each original array but then also if they converge on the same diffrence?
https://plot.ly/python/parallel-coordinates-plot/

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Clustered stacked bar plot with error bars - python

Related

Readable values in on axis with Matplotlib

What is wrong with my multiple line graph plotting?

Seaborn swarmplot break into lines

How to plot a bar chart with multiple x-axis data?

Visualizing the difference between two numeric arrays

Categories

Resources