how to plot pairs in different subplots with difference on the side - python

I want to make a plot in seaborn but I am having some difficulties. The data has two variables: time (2 levels) and state (2 levels). I want to plot time on the x axis and state as different subplots, showing individual data lines. Finally, to the right of these I want to show a plot of the difference between time 2 and time 1 for each level of state. I have not managed to do this, because I cannot get the second plot to appear on the right. Here is my attempt:
import numpy as np
import pandas as pd
import seaborn as sns
# Build a small long-format toy dataset: 5 subjects, each measured at
# 2 time points in 2 states (20 rows in total).
ids = [1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5]
times = [1,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2]
states = ['A', 'B', 'A', 'B'] * 5
# Fixed seed so the random noise is reproducible.
np.random.seed(121)
# Response = identity * time plus standard-normal noise.
resps = [(i*t) + np.random.normal() for i, t in zip(ids, times)]
DATA = {
'identity': ids,
'time': times,
'state': states,
'resps': resps
}
df = pd.DataFrame(DATA)
# Done with data
# Figure-level plot: one facet (column) per state. With units='identity'
# and estimator=None every subject is drawn as its own semi-transparent line.
g = sns.relplot(
data=df, kind='line',
col='state', x='time', y='resps', units='identity',
estimator=None, alpha=.5, height=5, aspect=.7)
# Overlay a thick aggregate line (lineplot's default estimator) on each facet.
g.map(sns.lineplot,"time", "resps", lw=5, ci=None)
# Reshape to wide form: 'time' moves into the columns, giving
# ('resps', 1) and ('resps', 2) for every (identity, state) row.
wide = df.set_index(['identity', 'state', 'time']).unstack().reset_index()
# Boolean masks selecting the rows that belong to each state.
A = wide['state']=='A'
B = wide['state']=='B'
# Time 2 minus time 1, computed per state; rows of the other state are
# left as NaN when the shorter result is assigned back.
wide['diffA'] = wide[A][('resps', 2)] - wide[A][('resps', 1)]
wide['diffB'] = wide[B][('resps', 2)] - wide[B][('resps', 1)]
# The row-wise sum skips NaN, merging the two per-state columns into one.
wide['difference'] = wide[['diffA', 'diffB']].sum(axis=1)
wide = wide.drop(columns=[('diffA', ''), ('diffB', '')])
# Axes-level plot of the differences by state, without a connecting line.
sns.pointplot(x='state', y='difference', data=wide, join=False)
Output from the first
And output from the second:
Is there no way to put them together? Even though they are different data? I did try to use matplotlib. And then achieved slightly better results but this still had a problem because I wanted the two left plots to have a shared y axis but not the difference. This created lots of work as well, because I want to be flexible for different numbers of the state variable, but only kept to 2 for simplicity. Here is a paint version of what I want to do (sorry for the poor quality), hopefully with some more control over appearance but this is secondary:
Is there a reliable way to do this in a simpler way? Thanks!

The problem is that sns.relplot operates at a figure level. This means it creates its own figure object and we cannot control the axes it uses. If you want to leverage seaborn for the creation of the lines without using "pure" matplotlib, you can copy the lines on matplotlib axes:
import numpy as np
import pandas as pd
import seaborn as sns
# Build a small long-format toy dataset: 5 subjects, each measured at
# 2 time points in 2 states (20 rows in total).
ids = [1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5]
times = [1,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2]
states = ['A', 'B', 'A', 'B'] * 5
# Fixed seed so the random noise is reproducible.
np.random.seed(121)
# Response = identity * time plus standard-normal noise.
resps = [(i*t) + np.random.normal() for i, t in zip(ids, times)]
DATA = {
'identity': ids,
'time': times,
'state': states,
'resps': resps
}
df = pd.DataFrame(DATA)
# Done with data
# Figure-level plot: one facet (column) per state. With units='identity'
# and estimator=None every subject is drawn as its own semi-transparent line.
g = sns.relplot(
data=df, kind='line',
col='state', x='time', y='resps', units='identity',
estimator=None, alpha=.5, height=5, aspect=.7)
# Overlay a thick aggregate line (lineplot's default estimator) on each facet.
g.map(sns.lineplot,"time", "resps", lw=5, ci=None)
# Reshape to wide form: 'time' moves into the columns, giving
# ('resps', 1) and ('resps', 2) for every (identity, state) row.
wide = df.set_index(['identity', 'state', 'time']).unstack().reset_index()
# Boolean masks selecting the rows that belong to each state.
A = wide['state']=='A'
B = wide['state']=='B'
# Time 2 minus time 1, computed per state; rows of the other state are
# left as NaN when the shorter result is assigned back.
wide['diffA'] = wide[A][('resps', 2)] - wide[A][('resps', 1)]
wide['diffB'] = wide[B][('resps', 2)] - wide[B][('resps', 1)]
# The row-wise sum skips NaN, merging the two per-state columns into one.
wide['difference'] = wide[['diffA', 'diffB']].sum(axis=1)
wide = wide.drop(columns=[('diffA', ''), ('diffB', '')])
# New code ----------------------------------------
import matplotlib.pyplot as plt

# relplot is figure-level and owns its figure, so close that figure and
# rebuild the layout by hand. The Line2D objects stay readable afterwards.
plt.close(g.figure)

# One panel per seaborn facet plus one extra panel for the differences.
facet_axes = g.axes[0]
n_panels = len(facet_axes) + 1
fig = plt.figure(figsize=(4 * n_panels, 4))

# The state panels share a y axis with the first one; the difference panel
# is created without sharey so it keeps an independent scale.
state_axes = []
for position in range(1, n_panels):
    share_with = state_axes[0] if state_axes else None
    state_axes.append(fig.add_subplot(1, n_panels, position, sharey=share_with))
diff_ax = fig.add_subplot(1, n_panels, n_panels)

# Copy every line seaborn drew in a facet onto the matching new Axes,
# keeping colour, width and transparency.
for ax, g_ax in zip(state_axes, facet_axes):
    for line in g_ax.get_lines():
        x_data, y_data = line.get_data()
        ax.plot(x_data, y_data, color=line.get_color(),
                lw=line.get_linewidth(), alpha=line.get_alpha())
    ax.set_title(g_ax.get_title())

sns.pointplot(ax=diff_ax, x='state', y='difference', data=wide, join=False)
# End of new code ----------------------------------
plt.show()
Result:

Related

Plot facetted barchart using summary statistics within Python

Is there a concise way to plot summary statistics in Python as a boxplot? The code below gives a barchart of each mean, I want to swap each barchart to a boxplot.
I realise that I don't need to summarise, however with the real data, just plotting one of the boxes took a long time (even with showfliers=False); I don't need to see the outliers and I will also want to add a "population-wide" bar (i.e. across all clusters) for each "pc" (any suggestions for that would be greatly appreciated .. I am again attempting to move from R to python and just getting these few lines of code took long enough)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Toy data: two clusters, four 'pc' levels, 200 normally distributed values.
out = pd.DataFrame({'cluster': ['a']*100 + ['b']*100,
                    'pc': ['w', 'x', 'y', 'z']*50,
                    'value': np.random.normal(size=200)})
# Summary statistics of 'value' for every (cluster, pc) combination.
grouped = out.groupby(['cluster', 'pc'])
out = grouped.describe()
out = out.reset_index()
# Flatten the two-level column index: keep the statistic name for the
# ('value', <stat>) columns and the plain name for the group keys.
out.columns = [e[0] if e[0] != 'value' else e[1] for e in out.columns.tolist()]
#sns.catplot(x='cluster', y='mean', col='pc', kind='bar', data=out)
# One facet per 'pc', each showing a bar of the mean per cluster.
g = sns.FacetGrid(out, col="pc", col_wrap = 2)
g = g.map(plt.bar, "cluster", "mean")
You can draw a boxplot from the summary statistics using Axes.bxp(). This needs to be encapsulated in a custom plotting function passed to map():
def my_bxp(**kwargs):
    """Draw box plots on the current Axes from precomputed summary statistics.

    Written for ``FacetGrid.map_dataframe``, which is expected to supply the
    facet's rows as ``data`` and the facet colour as ``color``. Every row of
    ``data`` must provide 'min', '25%', '50%', '75%', 'max' and 'cluster'.
    Whiskers span min..max, which is not the usual box-plot convention.
    """
    ax = plt.gca()
    data = kwargs.pop('data')
    color = kwargs.pop('color')
    # One stats dict per row, in the format Axes.bxp() expects.
    bxpstats = [
        {'med': row.loc['50%'],
         'q1': row.loc['25%'],
         'q3': row.loc['75%'],
         'whislo': row.loc['min'],
         'whishi': row.loc['max'],
         'label': row.loc['cluster']}
        for _, row in data.iterrows()
    ]
    ax.bxp(bxpstats, showfliers=False, boxprops=dict(color=color),
           whiskerprops=dict(color=color),
           capprops=dict(color=color))


# One facet per 'pc'; each facet receives only its own rows of `out`.
g = sns.FacetGrid(out, col="pc", col_wrap = 2)
g = g.map_dataframe(my_bxp)
Note that, for simplicity's sake, I have the whiskers extend from min to max, which is not the usual representation. You may have to calculate proper whiskers extents when you calculate your summary statistics if that's what you want.
It's easier to draw multiple boxplots with the original data intact.
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

df = pd.DataFrame({'cluster': ['a']*100 + ['b']*100,
                   'pc': ['w', 'x', 'y', 'z']*50,
                   'value': np.random.normal(size=200)})

pcs = ['w', 'x', 'y', 'z']
# Each pc has 50 values: the first 25 come from cluster 'a', the rest from 'b'.
c = ['a']*25 + ['b']*25
# Wide table: one 'cluster' column followed by one value column per pc.
df1 = pd.concat(
    [pd.Series(c)]
    + [df[df['pc'] == pc]['value'].reset_index(drop=True) for pc in pcs],
    axis=1, ignore_index=True)
df1.columns = ['cluster'] + pcs

# One subplot per pc, each with one box per cluster.
fig, axes = plt.subplots(1, len(pcs), figsize=(8, 4))
fig.subplots_adjust(wspace=0.3, hspace=0.4)
for pc, ax in zip(pcs, axes):
    sns.boxplot(x="cluster", y=pc, data=df1, orient='v', ax=ax)

Merging legends of different lineplots using seaborn

I would like to plot two dataframes in 'long' representation, with different y axes, in one plot using sns.lineplot(). However, I am failing to plot them with a single legend containing the elements of both lineplots.
The issue is similar to this: Secondary axis with twinx(): how to add to legend?, though I'd like to use seaborn.
A minimal working example up to the point I got stuck is given below.
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# mock dataset
# df1: steps 1..10, four rows per step, labels alternating A/B, values in [0, 1)
lst = range(1, 11)
steps1 = list(itertools.chain.from_iterable(itertools.repeat(x, 4) for x in lst))
labels1 = ['A', 'B']*20
values1 = list(np.random.uniform(0, 1, 40))
df1 = pd.DataFrame({'steps': steps1, 'lab': labels1, 'vals': values1})
# df2: steps 6..10 only, labels alternating C/D, values in [10, 20)
lst = range(6, 11)
steps2 = list(itertools.chain.from_iterable(itertools.repeat(x, 4) for x in lst))
labels2 = ['C', 'D']*10
values2 = list(np.random.uniform(10, 20, 20))
df2 = pd.DataFrame({'steps': steps2, 'lab2': labels2, 'others': values2})

# plotting
fig, ax = plt.subplots()
# Pass the target Axes explicitly instead of relying on the "current" Axes.
sns.lineplot(x='steps', y='vals', data=df1, hue='lab', palette='bright',
             legend='brief', ax=ax)
ax2 = ax.twinx()
sns.lineplot(x='steps', y='others', hue='lab2', data=df2, palette='dark',
             legend='brief', ax=ax2)

# How do I merge the legends into one?
# the solution below gives me one merged and one separate legend
h1, l1 = ax.get_legend_handles_labels()
h2, l2 = ax2.get_legend_handles_labels()
ax.legend(loc=3, handles=h1+h2, labels=l1+l2)
I just resolved it by removing the obsolete legend by ax2.get_legend().remove().

How to display multiple annotations in Seaborn Heatmap cells?

I want seaborn heatmap to display multiple values in each cell of the heatmap. Here is a manual example of what I want to see, just to be clear:
# 3x2 array of values that drives the cell colours.
data = np.array([[0.000000,0.000000],[-0.231049,0.000000],[-0.231049,0.000000]])
# Per-cell annotation strings; '\n' puts the extra text on a second line.
labels = np.array([['A\nExtra Stuff','B'],['C','D'],['E','F']])
fig, ax = plt.subplots()
# fmt = '' is needed because the annotations are strings, not numbers.
ax = sns.heatmap(data, annot = labels, fmt = '')
Here is an example that gets seaborn.heatmap to display the flightsRoundUp values in the cells.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
def RoundUp(x):
    """Round ``x`` up to the next multiple of 10 and return it as an int."""
    tens = np.ceil(x / 10)
    return int(tens * 10)
# Load the long-form flights dataset and pivot it into a month x year table.
flights_long = sns.load_dataset("flights")
# Keyword arguments: DataFrame.pivot no longer accepts them positionally.
flights = flights_long.pivot(index="month", columns="year", values="passengers")
# Same table with every passenger count rounded up to a multiple of 10.
flightsRoundUp = flights.applymap(RoundUp)
# Colour the cells by the raw values but annotate them with the rounded ones.
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(flights, annot=flightsRoundUp, fmt="", linewidths=.5, ax=ax)
What is the best way to display both flightsRoundUp and flights in all cells? Something like the first manual example above, but for all the cells in a vectorized-like way...
Rotail's answer didn't work for me, I got an error when applying that lambda function.
However, I found a solution that exploits the fact that seaborn draws successive plots on top of each other. All you have to do is use one call to heatmap to establish the figure, and then a subsequent call for each of the annotations. Use the annot_kws argument to make sure the texts aren't written over each other.
# Values that colour the cells.
X = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
# Two sets of per-cell labels, drawn as separate annotation layers.
Y = pd.DataFrame({'A': ['A', 'B', 'C'], 'B': ['E', 'F', 'G']})
Z = pd.DataFrame({'A': ['(Extra Stuff)', '(Extra Stuff)', '(Extra Stuff)'],
                  'B': ['(Extra Stuff)', '(Extra Stuff)', '(Extra Stuff)']})

# Base layer: colours and colour bar only.
sns.heatmap(X, annot=False)
# Annotation layers: different vertical alignment keeps the texts apart.
for layer_labels, valign in ((Y, 'bottom'), (Z, 'top')):
    sns.heatmap(X, annot=layer_labels, annot_kws={'va': valign}, fmt="",
                cbar=False)
Following works for me too:
# Two numeric frames of equal shape (3 rows x 2 columns), both containing NaN.
X = pd.DataFrame({'a':[1, 2, np.nan], 'b':[10, 20, 30]})
Y = pd.DataFrame({'A':[11, 222, np.nan], 'B':[110, np.nan, 330]})
# convert to string
# ('|S5' is NumPy's fixed-width 5-byte string dtype; reset_index() adds the
# old index as an extra leading column)
X_value_ann = (X).astype('|S5').reset_index()
Y_value_ann = (Y).astype('|S5').reset_index()
# define () and new line to glue on later
# (frames of the same shape as the annotations, filled with a constant string)
br = np.char.array(pd.DataFrame('\n(', index=X_value_ann.index, columns=X_value_ann.columns))
cl = np.char.array(pd.DataFrame(')', index=X_value_ann.index, columns=X_value_ann.columns))
# convert to chararray
X_value_ann = np.char.array(X_value_ann)
Y_value_ann = np.char.array(Y_value_ann)
# glue and reshape
# (element-wise concatenation: "<X>\n(<Y>)" for every cell)
my_annotation = pd.DataFrame(X_value_ann+br+Y_value_ann+cl)
# Decode the byte strings back to text.
my_annotation = my_annotation.applymap(lambda x: x.decode('utf-8'))
# Drop column 0 — presumably the index column added by reset_index(); confirm.
my_annotation = my_annotation.drop(columns=[0])
my_annotation
You should be able to set fmt="" and format your labels with appropriate "\n" to have multiple lines of annotations.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

np.random.seed(0)
sns.set_theme()
uniform_data = np.random.rand(4, 4)
fig, ax = plt.subplots(figsize=(50, 20))
# One two-line label per cell; the "\n" starts the second annotation line.
uniform_data_labels = [['Example\nExample' for _ in row] for row in uniform_data]
# fmt="" makes heatmap use the label strings as they are.
sns.heatmap(uniform_data, vmin=0, vmax=1, annot=uniform_data_labels, ax=ax,
            fmt="", annot_kws={"fontsize": 30})
plt.show()
1

Matplotlib: custom ticker for pandas MultiIndex DataFrame

I have a large pandas MultiIndex DataFrame that I would like to plot. A minimal example would look like:
import pandas as pd

# Axis levels: rows are (year, field) pairs, columns are (day, band) pairs.
years = range(2015, 2018)
fields = range(4)
days = range(4)
bands = ['R', 'G', 'B']
index = pd.MultiIndex.from_product([years, fields], names=['year', 'field'])
columns = pd.MultiIndex.from_product([days, bands], names=['day', 'band'])

# Start from all zeros, then mark one (year, day) block per year: 2015 with
# day 0, 2016 with day 1, 2017 with day 2.
df = pd.DataFrame(0, index=index, columns=columns)
for day, year in enumerate(years):
    df.loc[(year,), (day,)] = 1
If I plot this using plt.spy, I get:
However, the tick locations and labels are less than desirable. I would like the ticks to completely ignore the second level of the MultiIndex. Using IndexLocator and IndexFormatter, I'm able to do the following:
# NOTE(review): IndexFormatter appears to have been deprecated in
# Matplotlib 3.3 and removed in 3.5 — confirm against your installed version.
from matplotlib.ticker import IndexFormatter, IndexLocator
import matplotlib.pyplot as plt
ax = plt.gca()
plt.spy(df)
# x axis: one major tick per day. Each day spans len(bands) columns and the
# offset of half a group places the tick in the middle of it.
xbase = len(bands)
xoffset = xbase / 2
# Labels come from the 'day' level only; the 'band' level is ignored.
xlabels = df.columns.get_level_values('day')
ax.xaxis.set_major_locator(IndexLocator(base=xbase, offset=xoffset))
ax.xaxis.set_major_formatter(IndexFormatter(xlabels))
plt.xlabel('Day')
ax.xaxis.tick_bottom()
# y axis: same scheme, one major tick per year (each spans len(fields) rows).
ybase = len(fields)
yoffset = ybase / 2
ylabels = df.index.get_level_values('year')
ax.yaxis.set_major_locator(IndexLocator(base=ybase, offset=yoffset))
ax.yaxis.set_major_formatter(IndexFormatter(ylabels))
plt.ylabel('Year')
plt.show()
This gives me exactly what I want:
But here's the problem. My actual DataFrame has 15 years, 4,000 fields, 365 days, and 7 bands. If I actually label every single day, the labels would be illegible. I could place a tick every 50 days, but I would like the ticks to be dynamic so that when I zoom in, the ticks become more fine-grained. Basically what I'm looking for is a custom MultiIndexLocator that combines the placement of IndexLocator with the dynamism of MaxNLocator.
Bonus: My data is really nice in the sense that there are always the same number of fields for every year and the same number of bands for every day. But what if this was not the case? I would love to contribute a generic MultiIndexLocator and MultiIndexFormatter to matplotlib that works for any MultiIndex DataFrame.
Matplotlib does not know about dataframes or MultiIndex. It simply plots the data you supply. I.e. you get the same as if you were plotting the numpy array of data, spy(df.values).
So I would suggest first setting the extent of the image correctly, so that you can use numeric tickers. Then a MaxNLocator should work fine, as long as you do not zoom in too much.
import numpy as np
import pandas as pd
from matplotlib.ticker import MaxNLocator
import matplotlib.pyplot as plt

# Show tick labels as plain numbers (e.g. 2005) rather than with an offset.
plt.rcParams['axes.formatter.useoffset'] = False

years = range(2000, 2018)
fields = range(9) #17
days = range(120) #365
bands = ['R', 'G', 'B', 'A']
index = pd.MultiIndex.from_product(
    [years, fields], names=['year', 'field'])
columns = pd.MultiIndex.from_product(
    [days, bands], names=['day', 'band'])

# Random data plus a checkerboard offset per (year, day) block, so that the
# block boundaries are visible in the image.
data = np.random.rand(len(years)*len(fields), len(days)*len(bands))
x, y = np.meshgrid(np.arange(data.shape[1]), np.arange(data.shape[0]))
data += 2*((y//len(fields) + x//len(bands)) % 2)
df = pd.DataFrame(data, index=index, columns=columns)

############
# Plotting
############
xlabels = df.columns.get_level_values('day')
ylabels = df.index.get_level_values('year')

# Half the spacing between consecutive labels, so that the first and last
# day/year get a full-width cell around their tick position.
x_half_step = np.diff(np.unique(xlabels))[0] / 2.
y_half_step = np.diff(np.unique(ylabels))[0] / 2.

# extent is (left, right, bottom, top). imshow draws row 0 at the top by
# default (origin='upper'), so the first year has to be the TOP edge;
# otherwise the year labels run opposite to the data rows.
extent = [xlabels.min() - x_half_step,
          xlabels.max() + x_half_step,
          ylabels.max() + y_half_step,
          ylabels.min() - y_half_step]

fig, ax = plt.subplots()
ax.imshow(df.values, extent=extent, aspect="auto")
ax.set_ylabel('Year')
ax.set_xlabel('Day')
# Numeric, zoom-aware ticks restricted to integer positions.
ax.xaxis.set_major_locator(MaxNLocator(integer=True, min_n_ticks=1))
ax.yaxis.set_major_locator(MaxNLocator(integer=True, min_n_ticks=1))
plt.show()

Sharing two y axes on multiple matplotlib subplots [duplicate]

This question already has an answer here:
How to share secondary y-axis between subplots in matplotlib
(1 answer)
Closed 5 years ago.
My goal is to have two rows and three columns of plots using matplotlib. Each graph in the top row will contain two data series, and two y-axes. I want to make the scales on each axis line up so that the corresponding data series are directly comparable. Right now I have it so that the primary y-axis on each graph is aligned, but I can't get the secondary y-axes to align. Here is my current code:
import matplotlib.pyplot as plt
import pandas as pd

excel_file = 'test_data.xlsx'
sims = ['Sim 02', 'Sim 01', 'Sim 03']

if __name__ == '__main__':
    # sheetname=None presumably loads every sheet into a dict keyed by sheet
    # name (it is indexed by sim name below).
    # NOTE(review): newer pandas spells these arguments sheet_name and
    # skipfooter — confirm against the installed version.
    data = pd.read_excel(excel_file, skiprows=[0, 1, 2, 3], sheetname=None, header=1, index_col=[0, 1], skip_footer=10)
    plot_cols = len(sims)
    plot_rows = 2
    # Primary axes: x shared per column, y shared per row.
    f, axes = plt.subplots(plot_rows, plot_cols, sharex='col', sharey='row')
    secondary_ax = []
    for i, sim in enumerate(sims):
        df = data[sim]
        modern = df.loc['Modern']
        traditional = df.loc['Traditional']
        # Top row: 'Idle' on the primary y axis, 'Work' on a twin y axis.
        axes[0][i].plot(modern.index, modern['Idle'])
        secondary_ax.append(axes[0][i].twinx())
        secondary_ax[i].plot(modern.index, modern['Work'])
        # Bottom row: bar chart of 'Result'.
        axes[1][i].bar(modern.index, modern['Result'])
        axes[0][i].set_xlim(12, 6)
        # Attempt to align the secondary y axes; here it runs only after the
        # data has already been plotted on them.
        if i > 0:
            secondary_ax[0].get_shared_y_axes().join(secondary_ax[0], secondary_ax[i])
    # secondary_ax[0].get_shared_y_axes().join(x for x in secondary_ax)
    plt.show()
The solution I tried (Both the line in the if statement, and the last line before plt.show()) were solutions to similar questions, however it didn't resolve my issue. Nothing breaks, the secondary axes just aren't aligned.
I also tried adding an extra row in the plt.subplots method and using twinx() to combine the first two rows, but it created an empty second row of plots nonetheless.
As a fall back I think I could manually check each axes for the maxes and mins, and loop through each to update manually, but I'd love to find a cleaner solution if one is out there, and anyone has any insight. Thanks.
You just need to share the y axes before plotting your data:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# excel_file = 'test_data.xlsx'
sims = ['Sim 02', 'Sim 01', 'Sim 03']

if __name__ == '__main__':
    # data = pd.read_excel(excel_file, skiprows=[0, 1, 2, 3], sheetname=None, header=1, index_col=[0, 1], skip_footer=10)
    # Random stand-ins for the spreadsheet, one column per simulation.
    modern = pd.DataFrame(np.random.randint(0, 100, (100, 3)), columns=sims)
    traditional = pd.DataFrame(np.random.randint(10, 30, (100, 3)), columns=sims)
    # Shift two simulations so their ranges differ and the effect of sharing
    # the secondary axis is visible.
    traditional[sims[1]] = traditional[sims[1]] + 40
    traditional[sims[2]] = traditional[sims[2]] - 40
    data3 = pd.DataFrame(np.random.randint(0, 100, (100, 3)), columns=sims)

    plot_cols = len(sims)
    plot_rows = 2
    f, axes = plt.subplots(plot_rows, plot_cols, sharex='col', sharey='row', figsize=(30, 10))
    secondary_ax = []
    for i, sim in enumerate(sims):
        modern_series = modern[sim]
        traditional_series = traditional[sim]
        axes[0][i].plot(modern_series.index, modern_series)
        secondary_ax.append(axes[0][i].twinx())
        # Share the secondary y axis with the first one BEFORE plotting on
        # it, so autoscaling takes all shared axes into account.
        # (Older Matplotlib spelled this
        # secondary_ax[0].get_shared_y_axes().join(secondary_ax[0], secondary_ax[i]).)
        if i > 0:
            secondary_ax[i].sharey(secondary_ax[0])
        secondary_ax[i].plot(traditional_series.index, traditional_series)
        # axes[1][i].bar(data3.index, data3)
        axes[0][i].set_xlim(12, 6)
    plt.show()

Categories