I know that you can use the mosaic plot from statsmodels, but it is a bit frustrating when your categories have some empty values (like here). I was wondering whether there is a solution using a graphics library like matplotlib or seaborn, which would be handier.
I think it would be a nice feature for seaborn, as contingency tables are frequently built with pandas. However it seems that it won't be implemented anytime soon.
Finally, how to have a mosaic plot with 3 dimensions, and possible empty categories ?
Here is a generic mosaic plot (from wikipedia)
As nothing existed in python, here is the code I made. The last dimension should be of size 1 (i.e. a regular table) or 2 for now. Feel free to update the code to fix that, it might be unreadable with more than 3, though.
It's a bit long but it does the job. Example below.
There are few options, most are self explanatory, otherwise:
dic_color_row: a dictionary where keys are the outer-most index (Index_1 in example below) and the values are colors, avoid black/gray colors
pad: the space between each bar of the plot
alpha_label: the 3rd dimension uses transparency (alpha) to differentiate its two categories; they are rendered as dark grey / light grey in the legend, and you can change the name of each label (similar to col_labels or row_labels)
color_ylabel: whether to add a background color to the y-tick labels. [True/False]
def mosaic_plot(df, dic_color_row, row_labels=None, col_labels=None, alpha_label=None, top_label="Size",
                x_label=None, y_label=None, pad=0.01, color_ylabel=False, ax=None, order="Size"):
    """
    Draw a mosaic plot from an NxM contingency table and write the column sizes on top.

    The rows may carry a two-level index; the second level must then have
    exactly two categories, e.g.

                         3   4   1   0   2   5
        Index_1 Index_2
        AA      C        0   0   0   2   3   0
                P        6   0   0  13   0   0
        BB      C        0   2   0   0   0   0
                P       45   1  10  10   1   0
        CC      C        0   6  35  15  29   0
                P        1   1   0   2   0   0
        DD      C        0  56   0   3   0   0
                P       30   4   2   0   1   9

    Parameters
    ----------
    df : contingency table (counts), indexed by one or two levels.
    dic_color_row : dict mapping each outer-level row label to a colour.
    row_labels, col_labels : optional replacement tick labels for rows / columns.
    alpha_label : optional legend labels for the two inner-level categories.
    top_label : label of the secondary (top) axis showing the size of each
        column. If falsy, nothing is displayed and the legend is placed on
        top instead of on the right.
    x_label, y_label : axis labels; default to the column / first index name.
    pad : space left between neighbouring bars.
    color_ylabel : if true, y-tick labels get a background in the row colour.
    ax : axes to draw on; a new figure is created when omitted.
    order : how columns are ordered. Possible values are
        - "Size" [default]: from the biggest to the smallest column
        - "Normal": as the columns are ordered in the input df
        - a list of column names giving the order explicitly

    Returns
    -------
    The matplotlib axes the mosaic was drawn on.
    """
    is_multi = len(df.index.names) == 2
    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(len(df.columns), len(df.index.get_level_values(0).unique())))
    # Work on the figure that owns `ax` so a caller-supplied axes is respected.
    fig = ax.figure

    # Column totals and their share of the grand total (bar widths).
    size_col = df.sum().sort_values(ascending=False)
    prop_com = size_col.div(size_col.sum())
    # The isinstance guard keeps an array-like `order` from being compared
    # element-wise against a string.
    if isinstance(order, str) and order == "Size":
        df = df[size_col.index.values]
    elif isinstance(order, str) and order == "Normal":
        prop_com = prop_com[df.columns]
        size_col = size_col[df.columns]
    else:
        df = df[order]
        prop_com = prop_com[order]
        size_col = size_col[order]

    if is_multi:
        inner_index = df.index.get_level_values(1).unique()
        swapped = df.swaplevel()
        first = swapped.loc[inner_index[0]]
        second = swapped.loc[inner_index[1]]
        # Share of the first inner category inside each (row, column) cell.
        # fill_value=0 keeps the share at 1 when an outer row only has the
        # first inner category; 0/0 cells become 0.
        prop_ii0 = (first / first.add(second, fill_value=0)).fillna(0)
        alpha_ii = 0.5
        true_y_labels = df.index.levels[0]
    else:
        alpha_ii = 1
        true_y_labels = df.index

    # Counts per outer row; computed once and reused below.
    grouped = df.groupby(level=0).sum()
    # Row shares in the first displayed column drive the y-tick positions.
    Ytt = grouped.iloc[:, 0].div(grouped.iloc[:, 0].sum())
    Yt = (Ytt + pad).cumsum() - pad

    x = 0
    for col_name, col_counts in grouped.items():
        bot = 0
        total = float(col_counts.sum())
        col_width = prop_com.loc[col_name]
        for lab, k in col_counts.items():
            # An all-zero column has no height to distribute.
            height = k / total if total else 0.0
            ax.bar(x, height, width=col_width, bottom=bot, color=dic_color_row[lab],
                   alpha=alpha_ii, lw=0, align="edge")
            if is_multi:
                # Opaque overlay marks the share of the first inner category.
                ax.bar(x, height, width=col_width * prop_ii0.loc[lab, col_name], bottom=bot,
                       color=dic_color_row[lab], lw=0, alpha=1, align="edge")
            bot += height + pad
        x += col_width + pad

    ## Aesthetic of the plot and ticks
    # Y-axis
    if row_labels is None:
        row_labels = Yt.index
    ax.set_yticks(Yt - Ytt / 2)
    ax.set_yticklabels(row_labels)
    ax.set_ylim(0, 1 + (len(grouped) - 1) * pad)
    if y_label is None:
        y_label = df.index.names[0]
    ax.set_ylabel(y_label)

    # X-axis
    if col_labels is None:
        col_labels = prop_com.index
    xticks = (prop_com + pad).cumsum() - pad - prop_com / 2.
    ax.set_xticks(xticks)
    ax.set_xticklabels(col_labels)
    ax.set_xlim(0, prop_com.sum() + pad * (len(prop_com) - 1))
    if x_label is None:
        x_label = df.columns.name
    ax.set_xlabel(x_label)

    # Top label
    if top_label:
        ax2 = ax.twiny()
        ax2.set_xlim(*ax.get_xlim())
        ax2.set_xticks(xticks)
        ax2.set_xticklabels(size_col.values.astype(int))
        ax2.set_xlabel(top_label)
        ax2.tick_params(top=False, right=False, pad=0, length=0)

    # Ticks and axis settings
    ax.tick_params(top=False, right=False, pad=5)
    sns.despine(fig=fig, left=False, bottom=False, right=False, top=False, offset=3)

    # Legend
    if is_multi:
        if alpha_label is None:
            alpha_label = inner_index
        # Invisible (NaN-sized) bars only serve as legend handles.
        bars = [ax.bar(np.nan, np.nan, color="0.2", alpha=a) for a in (1, 0.5)]
        if top_label:
            ax.legend(bars, alpha_label, loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)
        else:
            ax.legend(bars, alpha_label, loc="lower center", bbox_to_anchor=(0.5, 1), ncol=2)
    fig.tight_layout(rect=[0, 0, .9, 0.95])

    if color_ylabel:
        for tick, label in zip(ax.get_yticklabels(), true_y_labels):
            tick.set_bbox(dict(pad=5, facecolor=dic_color_row[label]))
            tick.set_color("w")
            tick.set_fontweight("bold")
    return ax
With a dataframe you get after a crosstabulation:
df
Index_1 Index_2 v w x y z
AA Q 0 0 0 2 3
AA P 6 0 0 13 0
BB Q 0 2 0 0 0
BB P 45 1 10 10 1
CC Q 0 6 0 15 9
CC P 0 1 0 2 0
DD Q 0 56 0 3 0
DD P 30 4 2 0 1
make sure that you have the 2 columns as index:
df.set_index(["Index_1", "Index_2"], inplace=True)
and then just call:
# Draw the mosaic on a new figure (no `ax` is passed); every other option keeps its default.
mosaic_plot(df,
{"AA":"r", "BB":"b", "CC":"y", "DD":"g"}, # dict of color, mandatory
x_label='My Category',
)
It's not perfect, but I hope it will help others.
Related
I am trying to change the color of each individual bar in my figure here. The code that I used is down below. Instead of each bar changing to the color that I have set in c, there are several colors within each bar. I have included a screenshot of this. How can I fix this? Thank you all in advance!
Clusters is just a categorical variable of 5 groups, ranging from 0 to 4. I have included a second screenshot of the dataframe.
So essentially, what I am trying to do is to plot each cluster for economic ideology and social ideology so I can have a visual comparison of the 5 different clusters over these two dimensions (economic and social ideology). Each cluster should be represented by one color. For example, cluster 0 should be red in color.
# Question code: one hex colour per cluster (the question states clusters run from 0 to 4).
c = ['#bf1111', '#1c4975', '#278f36', '#47167a', '#de8314']
# Left panel: economic ideology by cluster.
plt.subplot(1, 2, 1)
# NOTE(review): presumably one bar is drawn per dataframe row and the colour list is
# applied by bar position rather than by cluster value, which would explain the
# mixed colours reported in the question — confirm against the data.
plt.bar(data = ANESdf_LatNEW, height = "EconIdeo",
x = "clusters", color = c)
plt.title('Economic Ideology')
plt.xticks([0, 1, 2, 3, 4])
plt.xlabel('Clusters')
plt.ylabel('')
# Right panel: social ideology by cluster, same colour list.
plt.subplot(1, 2, 2)
plt.bar(data = ANESdf_LatNEW, height = "SocialIdeo",
x = "clusters", color = c)
plt.title('Social Ideology')
plt.xticks([0, 1, 2, 3, 4])
plt.xlabel('Clusters')
plt.ylabel('')
plt.show()
Bar graph here
Top 5 rows of dataframe
I have tried multiple ways of changing colors. For example, instead of having c, I had put in the colors directly at color = ... This did not work either.
Here is a script that does what you seem to be looking for based on your edits and comment.
Note that I do not assume that all clusters have the same size in this context; if that is the case, this approach can be simplified.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# sample dataframe
df = pd.DataFrame(
    {
        'EconIdeo': [1, 2, 3, 4, 3, 5, 7],
        'Clusters': [2, 3, 0, 1, 3, 0, 3],
    })
print(df)

# parameters: width for each cluster, colors for each cluster
# (if clusters are not sequential from zero, replace c with dictionary)
width = .75
c = ['#bf1111', '#1c4975', '#278f36', '#47167a', '#de8314']

# xpos starts at the cluster number and is shifted by fractional offsets below,
# so it must be a float column from the start: adding floats into an integer
# column through .loc is a deprecated upcast in recent pandas.
df['xpos'] = df['Clusters'].astype(float)
df['width'] = width
df['color'] = ''

clusters = df['Clusters'].unique()
for k in clusters:
    where = (df['Clusters'] == k)
    n = where.sum()
    # Centres of n equal slices of the interval [-width/2, width/2].
    df.loc[where, 'xpos'] += np.linspace(-width / 2, width / 2, 2 * n + 1)[1:-1:2]
    # Each bar of the cluster gets an equal share of the cluster width.
    df.loc[where, 'width'] /= n
    df.loc[where, 'color'] = c[k]

plt.bar(data=df, height="EconIdeo", x='xpos',
        width='width', color='color')
plt.xticks(clusters, clusters)
plt.show()
Resulting plot:
Input dataframe:
EconIdeo Clusters
0 1 2
1 2 3
2 3 0
3 4 1
4 3 3
5 5 0
6 7 3
Dataframe after script applies changes (to include plotting specifications)
EconIdeo Clusters xpos width color
0 1 2 2.0000 0.750 #278f36
1 2 3 2.7500 0.250 #47167a
2 3 0 -0.1875 0.375 #bf1111
3 4 1 1.0000 0.750 #1c4975
4 3 3 3.0000 0.250 #47167a
5 5 0 0.1875 0.375 #bf1111
6 7 3 3.2500 0.250 #47167a
I have three-column data in a file named "sample1.dat" and a code that reads the columns and tries to plot the 3rd column against the 2nd column. I pick up parameter values from the 1st column elements as long as their values remain the same.
"sample1.dat" reads
0 1 1
0 2 4
0 3 9
0 4 16
0 5 25
0 6 36
1 1 1
1 2 8
1 3 27
1 4 64
1 5 125
1 6 216
2 1 1
2 2 16
2 3 81
2 4 256
2 5 625
2 6 1296
And my code:
import matplotlib.pyplot as plt
import numpy as np
# Question code: read the three columns and plot column 3 against column 2,
# one colour per distinct value of column 1.
data = np.loadtxt('sample1.dat')
x = data[:,0]
y = data[:,1]
z = data[:,2]
L = len(data)
col = ['r','g','b']
# x0 is the parameter value currently being plotted; j indexes its colour.
x0 = x[0]; j=0; jold=-1
for i in range(L):
print('j, col[j]=',j, col[j])
if x[i] == x0:
print('y[i], z[i]=',y[i],z[i])
if i==0 or j != jold: # j-index decides new or the same parameter
label = 'parameter = {}'.format(x0)
else:
label = ''
print('label =',label)
# Each call receives a single (y, z) pair, i.e. one point per plot call.
plt.plot(y[i], z[i], color=col[j], marker='o', label=label)
else:
x0 = x[i] # Update when x-value changes,
# i.e. pick up the next parameter value
# NOTE(review): rebinding the loop variable does not change what range() yields
# next, so this row is not revisited.
i -= 1 # Shift back else we miss the 1st point for new x-value
# NOTE(review): jold is set equal to j right after the increment, so the
# `j != jold` test above is false for every later parameter value.
j += 1; jold = j
plt.legend()
plt.xlabel('2nd column')
plt.ylabel('3rd column')
plt.savefig('sample1.png')
plt.show()
The plot outcome:
One can clearly see that two issues persist:
The legend entries appear only for the first parameter, though I tried to avoid the repetition in my code.
The default linestyle is not appearing though the legends show line plus marker plots.
How could I resolve these or is there a smarter way of coding to fulfill the same purpose.
The first issue is due to some strange logic involving j, jold and x0. The code can be simplified by drawing all y,z for each x-value at once. Numpy allows selecting the y's corresponding to a given x0 as y[x==x0].
The second issue can be solved by explicitly setting the desired linestyle, i.e. ls=''.
import matplotlib.pyplot as plt
import numpy as np
# Load the table and split it into its three columns:
# x is the parameter, y and z are the plotted coordinates.
data = np.loadtxt('sample1.dat')
x, y, z = data[:, 0], data[:, 1], data[:, 2]
colors = ['r', 'g', 'b']

# One plot call per distinct parameter value, so each gets exactly one legend entry.
for level, shade in zip(np.unique(x), colors):
    rows = x == level
    plt.plot(
        y[rows],
        z[rows],
        color=shade,
        marker='o',
        ls='',  # markers only, no connecting line
        label=f'parameter = {level:.0f}',
    )

plt.legend()
plt.xlabel('2nd column')
plt.ylabel('3rd column')
plt.show()
An alternative approach would use the seaborn library, which does the selecting and coloring without a lot of intervention, for example:
import seaborn as sns
sns.scatterplot(x=y, y=z, hue=x, palette=['r', 'g', 'b'])
Seaborn can automatically add labels if the data is organized as a dictionary or a pandas dataframe:
# Name the three columns so seaborn can use those names as axis and legend labels.
# The parameter column is cast to int before being used as the hue.
data = {'first column': x.astype(int),
'second column': y,
'third column': z}
sns.scatterplot(data=data, x='second column', y='third column', hue='first column', palette=['r', 'g', 'b'])
You can get the result you want in a few lines by using pandas and seaborn.
If you add column names (for instance A, B, and C) to the data in the sample1.dat file as follows:
A B C
0 1 1
0 2 4
0 3 9
0 4 16
0 5 25
0 6 36
1 1 1
1 2 8
1 3 27
1 4 64
1 5 125
1 6 216
2 1 1
2 2 16
2 3 81
2 4 256
2 5 625
2 6 1296
You can then load your data in a pandas dataframe and plot it with seaborn:
import pandas as pd
import seaborn as sns
# Read the fixed-width file; the scatterplot below refers to its columns as A, B and C.
df=pd.read_fwf('sample1.dat')
# One colour per distinct value of column A.
col = ['r','g','b']
sns.scatterplot(data=df,x='B',y='C',hue='A',palette=col)
And the output gives:
I have a dataframe where I have a variable 'Gender' (0 or 1) indicating if one is Male or Female, and another variable 'Dis' which says the state of the Disease (0,1,2 or 3).
> df.head()
Gender Dis
0 1 2
1 0 0
2 0 1
3 1 3
4 0 0
5 0 1
I want to make a barplot with the count values for each one of the'Dis' values but I want it to be separated by Gender, i.e, I want two bars for each one of the states of the disease. I want this:
However, I can't do this barplot automatically without manually writing the count values of each one. I had to check the count values for each one of the combinations aside. I produced this plot manually with the following:
# Question code: grouped bar chart built from hand-typed counts.
X = ['0','1','2','3']
# Counts per disease state, typed in by hand (one list per gender).
M = [43,9,20,11]
F = [118,21,168,20]
X_axis = np.arange(len(X))
# Two bars of width 0.4 per state, shifted left and right of the tick position.
plt.bar(X_axis - 0.2, M, 0.4, label = 'Male')
plt.bar(X_axis + 0.2, F, 0.4, label = 'Female')
plt.xticks(X_axis, X)
plt.xlabel("")
plt.ylabel("")
plt.legend()
plt.title("title")
# Writes each bar's integer height just above it.
# NOTE(review): this helper reads a name `ax` that is not defined in this snippet,
# and the helper is never called here.
def autolabel(rects):
for rect in rects:
h = rect.get_height()
ax.text(rect.get_x()+rect.get_width()/2., 1.05*h, '%d'%int(h),
ha='center', va='bottom')
plt.show()
Can I do something more "automatic" directly from the dataframe? Also, can I also display the count values on top of each bar?
Let's try with crosstab + DataFrame.plot:
# Count rows per (Dis, Gender) pair, then give the 0/1 gender columns readable names.
gender_counts = pd.crosstab(df['Dis'], df['Gender'])
plot_df = gender_counts.rename(columns={0: 'Male', 1: 'Female'})

# One group of bars per disease state, one bar per gender.
ax = plot_df.plot(
    kind='bar',
    rot=0,
    xlabel='',
    ylabel='',
    title='title',
)
plt.show()
crosstab will produce the counts for Male/Female per Dis.
rename is used to turn the column names 0/1 to Male/Female:
plot_df:
Gender Male Female
Dis
0 119 128
1 140 121
2 124 120
3 112 136
Moving legend, and values on top of bars:
# Same bar chart as before, kept in `ax` so the bars can be annotated.
ax = plot_df.plot(kind='bar', rot=0, xlabel='', ylabel='', title='title')
# Label every bar of every container on the axes.
for container in ax.containers:
ax.bar_label(container)
# Anchor the legend just outside the right edge of the axes.
plt.legend(title='Gender', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
To add percentages to the top of the columns:
divide plot_df by the column totals
format as desired
zip with containers to add bar labels
plot_df = (
    pd.crosstab(df['Dis'], df['Gender'])
    .rename(columns={0: 'Male', 1: 'Female'})
)

# Calculate Percentages and format
# Each count is divided by its column (gender) total. The formatting goes through
# Series.map column by column, because DataFrame.applymap is deprecated in
# recent pandas while Series.map works on old and new versions alike.
labels_df = (
    plot_df.div(plot_df.sum(axis=0)).mul(100)
    .apply(lambda column: column.map('{:.2f}%'.format))
)

ax = plot_df.plot(kind='bar', rot=0, figsize=(9, 6), width=0.8,
                  xlabel='', ylabel='', title='title')

# Pair each bar container with the label column of the same position.
for container, col in zip(ax.containers, labels_df):
    ax.bar_label(container, labels=labels_df[col])

plt.legend(title='Gender', bbox_to_anchor=(1.01, 1), loc='upper left')
plt.tight_layout()
plt.show()
labels_df:
Gender Male Female
Dis
0 24.04% 25.35%
1 28.28% 23.96%
2 25.05% 23.76%
3 22.63% 26.93%
Sample Data and imports used:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# Fixed seed so the sample data is reproducible.
np.random.seed(5)
# 1000 rows with a random 0/1 gender and a random 0-3 disease state.
df = pd.DataFrame({'Gender': np.random.choice([0, 1], 1000),
'Dis': np.random.choice([0, 1, 2, 3], 1000)})
If you want to do this with a for loop:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Sample data as a dict of two equal-length lists (gender 0/1, disease state 0-3).
data = {'Gender': [1,0,0,1,0,0,1,1], 'Dis': [2,0,1,3,0,1,0,1]}
# Create DataFrame
df = pd.DataFrame(data)
# Print the output.
print(df)
Then you create empty variables:
# Tally every (gender, disease state) pair in one pass.
# Gender 0 is counted as male and 1 as female; any other pair is ignored.
tally = {(gender, dis): 0 for gender in (0, 1) for dis in (0, 1, 2, 3)}
for dis, gender in zip(data['Dis'], data['Gender']):
    if (gender, dis) in tally:
        tally[(gender, dis)] += 1

# Expose the counts under the names used by the plotting code.
number_males_dis_0 = tally[(0, 0)]
number_females_dis_0 = tally[(1, 0)]
number_males_dis_1 = tally[(0, 1)]
number_females_dis_1 = tally[(1, 1)]
number_males_dis_2 = tally[(0, 2)]
number_females_dis_2 = tally[(1, 2)]
number_males_dis_3 = tally[(0, 3)]
number_females_dis_3 = tally[(1, 3)]
Then the plot:
# Disease states on the x axis, with the male / female counts for each state.
X = ['0','1','2','3']
M = [number_males_dis_0, number_males_dis_1, number_males_dis_2, number_males_dis_3]
F = [number_females_dis_0, number_females_dis_1, number_females_dis_2, number_females_dis_3]

X_axis = np.arange(len(X))

# Two bars per state, shifted left and right of the tick position.
plt.bar(X_axis - 0.2, M, 0.4, label='Male')
plt.bar(X_axis + 0.2, F, 0.4, label='Female')

plt.xticks(X_axis, X)
plt.xlabel("")
plt.ylabel("")
# Leave a little headroom above the tallest bar for its label.
tallest = max(max(F), max(M))
plt.ylim(0, tallest + 0.5)
plt.legend()
plt.title("title")

# Text on the top of each bar
for position, (n_male, n_female) in enumerate(zip(M, F)):
    plt.text(x=position - 0.25, y=n_male + 0.05, s=n_male, size=10)
    plt.text(x=position + 0.15, y=n_female + 0.05, s=n_female, size=10)

plt.show()
Result:
Result
I have panel data with 50 different individuals over 60 months. I've created facet grids using Seaborn to plot the tone for each individual over time and now want to add vertical lines for when an event happens. Ideally, I want to add say a blue line for when event 1 occurs and a red line when event 2 occurs.
Sample of the data:
f_a tone t event1 event2
01_01 -1.9 0 0 0
01_01 -1.1 1 1 0
01_01 -2.5 2 0 0
01_01 -3.0 3 0 1
...
01_01 1.3 40 1 0
01_01 0.7 41 0 0
01_01 -0.6 42 0 0
01_01 -2.3 43 0 1
'f_a' is the ID (grouping variable)
'tone' is the y-axis
't' is the time variable
'event1' and 'event2' equal 1 if the event occurs during period t and zero otherwise.
Here's the code I have to create the plots:
# Initialize a grid of plots with an Axes for each pair
# (one facet per f_a value, wrapped after 5 columns; hue is also f_a)
grid1 = sns.FacetGrid(df,col='f_a',hue='f_a',col_wrap=5,height=1.5)
#Draw a horizontal line to show the starting point
grid1.map(plt.axhline,y=0,ls=":",c=".5")
#Draw a line plot to show the trajectory of tone for each f-a pair over time
grid1.map(plt.plot,"t","tone",marker='o')
Here's a sample of the plot output:
Plots
Here's the code I used to generate the data:
# Create list of observation pairs
fs = np.arange(1, 11, 1)
ans = np.arange(1, 6, 1)
f_a = [str(f).zfill(2) + '_' + str(a).zfill(2)
       for f in fs for a in ans]

# Create dataframe with ARMA process by f_a pair with 60 months of observations of tone
# per pair
n_months = 60
# The ARMA coefficients are the same for every pair, so build them once.
arparams = np.array([.5, .25])
maparams = np.array([.5, .3])
ar = np.r_[1, -arparams]
ma = np.r_[1, maparams]
d = {f: sm.tsa.arma_generate_sample(ar, ma, n_months) for f in f_a}
df = pd.melt(pd.DataFrame(d)).rename(columns={'variable': 'f_a', 'value': 'tone'})
# t runs from 0 to n_months - 1 within each pair.
df['t'] = df.groupby('f_a').cumcount()

# One occurrence of event 1 and 2 per f_a pair
# The upper bound is n_months (exclusive) so the drawn period always exists in t;
# drawing the value 60 would leave a pair without that event.
up = [np.random.choice(np.arange(2, n_months)) for f in f_a]
down = [np.random.choice(np.arange(2, n_months)) for f in f_a]

# Dataframe with event 1 and 2
events = pd.DataFrame({'f_a': f_a, 'event1_t': up, 'event2_t': down})

# Merge Datasets
df = df.merge(right=events, how='left', on='f_a')

# Create dummies for event1/event2
df['event1'] = (df.t == df.event1_t) * 1
df['event2'] = (df.t == df.event2_t) * 1

# Clean up dataset
df = df.drop(columns=['event1_t', 'event2_t'])
You can iterate over your groups and plot lines using axvline. To iterate over subplots, use grid1.axes:
# Colours follow the question: blue for event 1, red for event 2.
event_colors = {'event1': 'b', 'event2': 'r'}

# Pair each axes with the facet value it actually displays (grid1.col_names),
# rather than relying on groupby's sorted order matching the facet order.
for ax, facet_name in zip(grid1.axes.flat, grid1.col_names):
    subdata = df[df['f_a'] == facet_name]
    for event, color in event_colors.items():
        # One dashed vertical line at every period where the event occurs.
        for t in subdata.loc[subdata[event] == 1, 't']:
            ax.axvline(t, color=color, ls='--')
I have 69 machines and each machine has 12-month production data.
I plot them all with groupby.plot() and got a long list of views. Wondering how to make a tight layout so I can view them at once? Result wanted is each row has 7 columns and 69/7 rows. Please help!
c1.groupby('System ID').plot(x='Month', y='Monthly Production',kind='bar',legend=True)
I thought I'd add an example using seaborn as it might be useful in this context as it's quite easy to wrap things by columns with it. I expect that there's someone who could provide a nicer answer, perhaps using pandas, and I hope they do.
import seaborn as sns
import pandas as pd
import numpy as np
# Fixed seed so the example data is reproducible.
np.random.seed(1)
N = 2000
# N rows of 7 integer columns with values 0-3.
df = pd.DataFrame(np.random.randint(0,4, (N,7)))
# Assign each row to one of 69 systems (ids 0-68).
df['system'] = np.random.randint(0, 69, N )
Which gives df as;
0 1 2 3 4 5 6 system
674 1 2 3 1 0 0 0 15
1699 0 0 1 3 0 0 1 9
1282 0 0 0 0 1 0 2 47
1315 0 3 1 3 1 1 1 37
1210 1 1 0 3 1 3 1 11
Melting the data before plotting:
df_plot = df.melt(id_vars='system')
Which looks as
system variable value
8756 23 4 2
5474 24 2 2
11242 12 5 2
7820 56 3 3
Then
# One bar-chart facet per system, wrapped after 6 columns.
# NOTE(review): newer seaborn releases appear to deprecate `ci` in favour of
# `errorbar` — confirm against the installed version.
sns.catplot(x = 'variable', y = 'value', col = 'system',
hue = 'variable', dodge = False,
col_wrap = 6, data = df_plot, kind = 'bar', ci = False)
Here's my final answer.
# Create the groupby first: the ordering below needs it.
grouped = c1.groupby("System ID")

# Systems ordered by their last 'Monthly Production' value, largest first.
ordered_systems = grouped['Monthly Production'].last().sort_values(ascending=False).index

# We can ask for ALL THE AXES and put them into axes.
# The number of rows is derived from the number of systems (7 plots per row);
# squeeze=False keeps `axes` two-dimensional even for a single row.
ncols = 7
nrows = max(1, -(-len(ordered_systems) // ncols))  # ceiling division
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex=True, sharey=False,
                         figsize=(20, 15), squeeze=False)
axes_iter = iter(axes.flat)

first_month = c1['Month'].min()
last_month = c1['Month'].max()

# Now instead of looping through the groupby
# you LOOP through the ordered names
# and you use .get_group to get the right group
for system in ordered_systems:
    selection = grouped.get_group(system)

    ax = next(axes_iter)
    selection.plot(x='Month', y='Monthly Production', label=system, ax=ax, legend=False)
    selection.plot(x='Month', y='Monthly Usage', secondary_y=True, ax=ax, legend=False)
    ax.set_title(system)
    # tick_params expects booleans; the string 'off' is not treated as False.
    ax.tick_params(
        which='both',
        bottom=False,
        left=False,
        right=False,
        top=False
    )
    ax.grid(linewidth=0.25)
    ax.set_xlim((first_month, last_month))
    ax.set_xlabel("")
    ax.set_xticks((first_month, last_month))
    ax.spines['left'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Now use the matplotlib .remove() method to
# delete anything we didn't use
for ax in axes_iter:
    ax.remove()

plt.subplots_adjust(hspace=1)
plt.tight_layout()