Plotly - "grouped" scatter plot - python

Suppose I have the following pandas data frame:
import pandas as pd
d = {'Person': ['Bob']*9 + ['Alice']*9,
'Time': ['Morining']*3 + ['Noon']*3 + ['Evening']*3 + ['Morining']*3 + ['Noon']*3 + ['Evening']*3,
'Color': ['Red','Blue','Green']*6,
'Energy': [1,5,4,7,3,6,8,4,2,9,8,5,2,6,7,3,8,1]}
df = pd.DataFrame(d)
How can I create a plot like this?
(Excuse the crude plotting)
I've tried tricking scatter, strip and box plots into this, but with no success.
Thank you!

generate a scatter trace per Person
a bit of logic on x so that each person is offset. Hence hovertext and xaxis ticks
import plotly.graph_objects as go
xbase = pd.Series(df["Time"].unique()).reset_index().rename(columns={"index":"x",0:"Time"})
dfp = df.merge(xbase, on="Time").set_index("Person")
go.Figure(
[
go.Scatter(
name=p,
x=dfp.loc[p, "x"] + i/10,
y=dfp.loc[p, "Energy"],
text=dfp.loc[p, "Time"],
mode="markers",
marker={"color": dfp.loc[p, "Color"], "symbol":i, "size":10},
hovertemplate="(%{text},%{y})"
)
for i, p in enumerate(dfp.index.get_level_values("Person").unique())
]
).update_layout(xaxis={"tickmode":"array", "tickvals":xbase["x"], "ticktext":xbase["Time"]})

You've already received some great suggestions, but since you're still wondering about:
What if I also want the colors to show in the legend?
I'd just like to chip in that px.scatter comes really close to being an optimal approach right out of the box. The only thing that's missing is jitter. Still, the plot below can be produced by these few lines of code:
fig = px.scatter(df, x = 'Time', y = 'Energy', color = 'Color', symbol = 'Person')
fig.for_each_trace(lambda t: t.update(marker_color = t.name.split(',')[0],
name = t.name.split(',')[1], x = [1,2,3]))
fig.for_each_trace(lambda t: t.update(x=tuple([x + 0.2 for x in list(t.x)])) if t.name == ' Alice' else ())
Complete code:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
# data
d = {'Person': ['Bob']*9 + ['Alice']*9,
'Time': ['Morining']*3 + ['Noon']*3 + ['Evening']*3 + ['Morning']*3 + ['Noon']*3 + ['Evening']*3,
'Color': ['Red','Blue','Green']*6,
'Energy': [1,5,4,7,3,6,8,4,2,9,8,5,2,6,7,3,8,1]}
df = pd.DataFrame(d)
# figure setup
fig = px.scatter(df, x = 'Time', y = 'Energy', color = 'Color', symbol = 'Person')
# some customizations in order to get to the desired result:
fig.for_each_trace(lambda t: t.update(marker_color = t.name.split(',')[0],
name = t.name.split(',')[1],
x = [1,2,3]))
# jitter
fig.for_each_trace(lambda t: t.update(x=tuple([x + 0.2 for x in list(t.x)])) if t.name == ' Alice' else ())
# layout
fig.update_layout(xaxis={"tickmode":"array","tickvals":[1,2,3],"ticktext":df.Time.unique()})
fig.show()
Room for improvement:
Some elements of the snippet above could undoubtedly be made more dynamic, like x = [1,2,3] which should take into account a varying number of elements on the x-axis. The same goes for the number of people and the arguments used for jitter. But I can look into that too if this is something you can use.

You can go through each row the DataFrame using itertuples (better performance than iterrows), and map 'Morning', 'Noon', and 'Evening' values to 1,2,3, respectively, and then jitter the x-values by mapping 'Bob' to '-0.05' and 'Alice' to 0.05 and adding these values to each of the x-values. You can also pass the 'Color' information to the marker_color argument.
Then map the tickvalues of 1,2,3 back to 'Morning','Noon' and 'Evening' and also use a legendgroup to get only one Bob and one Alice legend marker to display (to stop the marker for each trace from displaying in the legend)
import pandas as pd
import plotly.graph_objects as go
d = {'Person': ['Bob']*9 + ['Alice']*9,
'Time': ['Morning']*3 + ['Noon']*3 + ['Evening']*3 + ['Morning']*3 + ['Noon']*3 + ['Evening']*3,
'Color': ['Red','Blue','Green']*6,
'Energy': [1,5,4,7,3,6,8,4,2,9,8,5,2,6,7,3,8,1]}
df = pd.DataFrame(d)
shapes = {'Bob': 'circle', 'Alice': 'diamond'}
time = {'Morning':1, 'Noon':2, 'Evening':3}
jitter = {'Bob': -0.05, 'Alice': 0.05}
fig = go.Figure()
## position 1 of each row is Person... position 4 is the Energy value
s = df.Person.shift() != df.Person
name_changes = s[s].index.values
for row in df.itertuples():
if row[0] in name_changes:
fig.add_trace(go.Scatter(
x=[time[row[2]] + jitter[row[1]]],
y=[row[4]],
legendgroup=row[1],
name=row[1],
mode='markers',
marker_symbol=shapes[row[1]],
marker_color=row[3],
showlegend=True
))
else:
fig.add_trace(go.Scatter(
x=[time[row[2]] + jitter[row[1]]],
y=[row[4]],
legendgroup=row[1],
name=row[1],
mode='markers',
marker_symbol=shapes[row[1]],
marker_color=row[3],
showlegend=False
))
fig.update_traces(marker=dict(size=12,line=dict(width=2,color='DarkSlateGrey')))
fig.update_layout(
xaxis=dict(
tickmode='array',
tickvals=list(time.values()),
ticktext=list(time.keys())
)
)
fig.show()

In case you only want to go with matplotlib and don't want any extra dependencies, here is a sample code. (Pandas operations groupbys etc are left for you to optimize)
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
from matplotlib.lines import Line2D
df = pd.DataFrame(
{
'Person': ['Bob'] * 9 + ['Alice'] * 9,
'Time': ['Morning'] * 3
+ ['Noon'] * 3
+ ['Evening'] * 3
+ ['Morning'] * 3
+ ['Noon'] * 3
+ ['Evening'] * 3,
'Color': ['Red', 'Blue', 'Green'] * 6,
'Energy': [1, 5, 4, 7, 3, 6, 8, 4, 2, 9, 8, 5, 2, 6, 7, 3, 8, 1],
}
)
plt.figure()
x = ['Morning', 'Noon', 'Evening']
# Transform function
offset = lambda p: transforms.ScaledTranslation(
p / 72.0, 0, plt.gcf().dpi_scale_trans
)
trans = plt.gca().transData
# Use this to center transformation
start_offset = -len(df['Person'].unique()) // 2
# Define as many markers as people you have
markers = ['o', '^']
# Use this for custom legend
custom_legend = []
# Do this if you need to aggregate
df = df.groupby(['Person', 'Time', 'Color'])['Energy'].sum().reset_index()
df = df.set_index('Time')
for i, [person, pgroup] in enumerate(df.groupby('Person')):
pts = (i + start_offset) * 10
marker = markers[i]
transform = trans + offset(pts)
# This is for legend, not plotted
custom_legend.append(
Line2D(
[0],
[0],
color='w',
markerfacecolor='black',
marker=marker,
markersize=10,
label=person,
)
)
for color, cgroup in pgroup.groupby('Color'):
mornings = cgroup.loc[cgroup.index == 'Morning', 'Energy'].values[0]
noons = cgroup.loc[cgroup.index == 'Noon', 'Energy'].values[0]
evenings = cgroup.loc[cgroup.index == 'Evening', 'Energy'].values[0]
# This stupid if is because you need to define at least one non
# transformation scatter be it first or whatever.
if pts == 0:
plt.scatter(
x,
[mornings, noons, evenings],
c=color.lower(),
s=25,
marker=marker,
)
else:
plt.scatter(
x,
[mornings, noons, evenings],
c=color.lower(),
s=25,
marker=marker,
transform=transform,
)
plt.ylabel('Energy')
plt.xlabel('Time')
plt.legend(handles=custom_legend)
plt.margins(x=0.5)
plt.show()

Related

Plotly: bar plot with color red<0, green>0, divided by groups

Given a dataframe with 2 groups: (group1, group2), that have values > and < than 0:
plot:
Bar plot
x = x
y = values, divided by group1, group2
color = red if value<0, green if value>0
legend shows group1, grou2 with different colors.
My current code however is not coloring as i would expect, and the legend is shown with the same color:
import pandas as pd
import numpy as np
import plotly.express as px
df = pd.DataFrame( {
"x" : [1,2,3],
"group1" : [np.nan, 1, -0.5],
"group2" : [np.nan, -0.2, 1],
}).set_index("x")
df_ = df.reset_index().melt(id_vars = 'x')
fig = px.bar(df_, x='x', y='value', color='variable', barmode='group')
fig.update_traces(marker_color=['red' if val < 0 else 'green' for val in df_['value']], marker_line_color='black', marker_line_width=1.5)
fig.show()
OUT with indications of what i want to achieve:
Since this cannot be achieved with express, we use a graph object to draw a bar chart for each group. The logic for color determination by numerical values is changed for Group 1 and Group 2, changing the conditions for Group 1 and Group 2. The reason is that the legend will not be color-coded unless this is done.
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
df = pd.DataFrame( {
"x" : [1,2,3],
"group1" : [np.nan, 1, -0.5],
"group2" : [np.nan, -0.2, 1],
}).set_index("x")
df_ = df.reset_index().melt(id_vars = 'x')
fig = go.Figure()
fig.add_trace(go.Bar(x=df_.query('variable =="group1"')['x'],
y=df_.query('variable =="group1"')['value'],
marker_color=['red' if val < 0 else 'green' for val in df_.query('variable =="group1"')['value']],
marker_line_color='black',
marker_line_width=1.5,
name='group1',
#legendgroup='group1'
)
)
fig.add_trace(go.Bar(x=df_.query('variable =="group2"')['x'],
y=df_.query('variable =="group2"')['value'],
marker_color=['green' if val > 0 else 'red' for val in df_.query('variable =="group2"')['value']],
marker_line_color='black',
marker_line_width=1.5,
name='group2',
#legendgroup='group2'
)
)
fig.update_layout(barmode='group', xaxis=dict(title_text='x'), yaxis=dict(title_text='value'))
fig.show()
If you want to color bar according their x value (and not variable group), you have to update traces individually (or you have to draw graph objects manually):
fig = px.bar(df_, x='x', y='value', color='variable', barmode='group')
fig.for_each_trace(
lambda trace: trace.update(marker_color=np.where(df_.loc[df_['variable'].eq(trace.name), 'value'] < 0, 'red', 'green'))
)
fig.update_layout(showlegend=False) # Hide legend because there is no distinct group
fig.show()
Output:
To stick with plotly.express, I would add a column to your dataframe, e.g. df_['positive'] with a boolean, and then color your plot by this variable.
It would look like this:
import pandas as pd
import numpy as np
import plotly.express as px
df = pd.DataFrame( {
"x" : [1,2,3],
"group1" : [np.nan, 1, -0.5],
"group2" : [np.nan, -0.2, 1],
}).set_index("x")
df_ = df.reset_index().melt(id_vars = 'x')
df_['positive'] = (df_['value']>=0)
fig = px.bar(df_, x='x', y='value',barmode = 'group',
color='positive',
color_discrete_map={
True: 'green',
False: 'red'
}
)
fig.update_traces(marker_line_color='black', marker_line_width=1.5)
fig.show('browser')
which yields the following :
EDIT following comments
If you want to keep the colors AND the group distinction within plotly.express, one way could be to add patterns...
Solution 1 : Every combination has its legend entry
df = pd.DataFrame( {
"x" : [1,2,3],
"group1" : [np.nan, 1, -0.5],
"group2" : [np.nan, -0.2, 1],
}).set_index("x")
df_ = df.reset_index().melt(id_vars = 'x')
positive = (df_['value']>=0)
df_['positive'] = positive
df_['sign'] = ['positive' if x else 'negative' for x in df_['positive']]
# Each compbination of color and patterns
fig = px.bar(df_, x='x', y='value',barmode = 'group',
color='sign',
color_discrete_map={
'positive': 'green',
'negative': 'red'
},
pattern_shape="variable")
fig.update_layout(legend_title="Groups & Signs", bargap=0.5,bargroupgap=0.1)
fig.show('browser')
which outputs the following
Solution 2 : Legend only reflects patterns
# Only patterns in legend
fig = px.bar(df_, x='x', y='value', color='variable',
barmode='group',pattern_shape="variable")
fig.update_layout(legend_title="Groups", bargap=0.5,bargroupgap=0.1)
fig.for_each_trace(
lambda trace: trace.update(marker_color=np.where(df_.loc[df_['variable'].eq(trace.name), 'value'] < 0, 'red', 'green'))
)
fig.show('browser')
which outputs :
However I was not able to 'remove' the green color from the legend...

plotly.py: Carpet plot 4 variable axis problem

I am trying to use plotly in python to duplicate the 4 variable carpet plot in this link titled "A four-variable carpet plot showing interpolation"
I have noticed two things. First, the length of arrays a and b must be the same. Ok, I can live with that. More importantly, I seem to need to switch the arguments when I call the carpet plot function. I don't understand what I am doing wrong, but if I call
fig = go.Figure(go.Carpet(a=b, b=a, ...
then it work as desired. But why are my a and b vectors reversed...
Here is the entire working example:
import plotly.graph_objects as go
import numpy as np
na = 7
nb = 7
x = np.ndarray(shape=(na, nb))
y = np.ndarray(shape=(na, nb))
a = np.array(range(4,4+na))
b = np.array(range(10,10+nb))
#print(x)
for i in range(0,na):
for j in range(0,nb):
x[i,j] = np.sqrt(a[i] * b[j])
y[i,j] = np.sqrt(b[j])**3 / a[i]
print(a[i], b[j], x[i,j], y[i,j])
print('a=',a)
print('b=',b)
print(x)
fig = go.Figure(go.Carpet(a=b, b=a, x=x, y=y,
aaxis = dict(
tickprefix = 'b=',
ticksuffix = 'm',
smoothing = 1,
minorgridcount = 9,
minorgridwidth = 0.6,
minorgridcolor = 'gray',
gridcolor = 'black',
color = 'blue'
),
baxis = dict(
tickprefix = 'a=',
ticksuffix = 'Pa',
smoothing = 1,
minorgridcount = 9,
minorgridwidth = 0.6,
minorgridcolor = 'gray',
gridcolor = 'black',
color = 'blue'
)
))
fig.show()
Why must I send a=b and b=a?
Why does the code fail if na != nb?
Thanks!

Toggle points on and off in altair

I'd like to be able to toggle the display of the points on and off in the below chart. The 2 lines are the means of the points in groups 1 and 2. I thought there would be a way to do this using interactive but cannot find any examples. Any help is much appreciated.
import math
import numpy as np
import pandas as pd
import altair as alt
x = np.arange(0,math.pi,0.1);
y = np.sin(x);
a, b = -0.2, 0.2
summary_df=[]
for i in range(0,2):
for j in range(0,5):
rand_y = (b - a)*np.random.rand(len(y)) + a
df = pd.DataFrame({
'group': i,
'batch': j,
'x': x,
'y': y+rand_y
})
summary_df.append(df)
summary_df = pd.concat(summary_df)
base = alt.Chart(
summary_df
).properties(
width=200,
height=400
)
mean_selection = alt.selection_multi(fields=['group'], bind='legend')
mean_line = base.mark_line(size=2).encode(
x=alt.X('x:Q'),
y=alt.Y('y:Q', aggregate='mean', axis=alt.Axis(title='y')),
color='group:N',
opacity=alt.condition(mean_selection, alt.value(1), alt.value(0.2))
).add_selection(
mean_selection
).interactive()
all_selection = alt.selection_multi(fields=['group'], bind='legend')
all_points = base.mark_square(size=10).encode(
y=alt.Y('y:Q', axis=alt.Axis(title='y')),
x=alt.X('x:Q',),
color='group:N',
tooltip='batch:N',
opacity=alt.condition(all_selection, alt.value(1), alt.value(0.2))
).add_selection(
all_selection
).interactive()
(mean_line+all_points)

how to plot a range with a line in the center with Plotly, in Python [duplicate]

How can I use Plotly to produce a line plot with a shaded standard deviation? I am trying to achieve something similar to seaborn.tsplot. Any help is appreciated.
The following approach is fully flexible with regards to the number of columns in a pandas dataframe and uses the default color cycle of plotly. If the number of lines exceed the number of colors, the colors will be re-used from the start. As of now px.colors.qualitative.Plotly can be replaced with any hex color sequence that you can find using px.colors.qualitative:
Alphabet = ['#AA0DFE', '#3283FE', '#85660D', '#782AB6', '#565656', '#1...
Alphabet_r = ['#FA0087', '#FBE426', '#B00068', '#FC1CBF', '#C075A6', '...
[...]
Complete code:
# imports
import plotly.graph_objs as go
import plotly.express as px
import pandas as pd
import numpy as np
# sample data in a pandas dataframe
np.random.seed(1)
df=pd.DataFrame(dict(A=np.random.uniform(low=-1, high=2, size=25).tolist(),
B=np.random.uniform(low=-4, high=3, size=25).tolist(),
C=np.random.uniform(low=-1, high=3, size=25).tolist(),
))
df = df.cumsum()
# define colors as a list
colors = px.colors.qualitative.Plotly
# convert plotly hex colors to rgba to enable transparency adjustments
def hex_rgba(hex, transparency):
col_hex = hex.lstrip('#')
col_rgb = list(int(col_hex[i:i+2], 16) for i in (0, 2, 4))
col_rgb.extend([transparency])
areacol = tuple(col_rgb)
return areacol
rgba = [hex_rgba(c, transparency=0.2) for c in colors]
colCycle = ['rgba'+str(elem) for elem in rgba]
# Make sure the colors run in cycles if there are more lines than colors
def next_col(cols):
while True:
for col in cols:
yield col
line_color=next_col(cols=colCycle)
# plotly figure
fig = go.Figure()
# add line and shaded area for each series and standards deviation
for i, col in enumerate(df):
new_col = next(line_color)
x = list(df.index.values+1)
y1 = df[col]
y1_upper = [(y + np.std(df[col])) for y in df[col]]
y1_lower = [(y - np.std(df[col])) for y in df[col]]
y1_lower = y1_lower[::-1]
# standard deviation area
fig.add_traces(go.Scatter(x=x+x[::-1],
y=y1_upper+y1_lower,
fill='tozerox',
fillcolor=new_col,
line=dict(color='rgba(255,255,255,0)'),
showlegend=False,
name=col))
# line trace
fig.add_traces(go.Scatter(x=x,
y=y1,
line=dict(color=new_col, width=2.5),
mode='lines',
name=col)
)
# set x-axis
fig.update_layout(xaxis=dict(range=[1,len(df)]))
fig.show()
I was able to come up with something similar. I post the code here to be used by someone else or for any suggestions for improvements.
import matplotlib
import random
import plotly.graph_objects as go
import numpy as np
#random color generation in plotly
hex_colors_dic = {}
rgb_colors_dic = {}
hex_colors_only = []
for name, hex in matplotlib.colors.cnames.items():
hex_colors_only.append(hex)
hex_colors_dic[name] = hex
rgb_colors_dic[name] = matplotlib.colors.to_rgb(hex)
data = [[1, 3, 5, 4],
[2, 3, 5, 4],
[1, 1, 4, 5],
[2, 3, 5, 4]]
#calculating mean and standard deviation
mean=np.mean(data,axis=0)
std=np.std(data,axis=0)
#draw figure
fig = go.Figure()
c = random.choice(hex_colors_only)
fig.add_trace(go.Scatter(x=np.arange(4), y=mean+std,
mode='lines',
line=dict(color=c,width =0.1),
name='upper bound'))
fig.add_trace(go.Scatter(x=np.arange(4), y=mean,
mode='lines',
line=dict(color=c),
fill='tonexty',
name='mean'))
fig.add_trace(go.Scatter(x=np.arange(4), y=mean-std,
mode='lines',
line=dict(color=c, width =0.1),
fill='tonexty',
name='lower bound'))
fig.show()
Great custom responses posted by others. In case someone is interested in code from the official plotly website, see here: https://plotly.com/python/continuous-error-bars/
I wrote a function to extend plotly.express.line with the same high level interface of Plotly Express. The line function (source code below) is used in the same exact way as plotly.express.line but allows for continuous error bands with the flag argument error_y_mode which can be either 'band' or 'bar'. In the second case it produces the same result as the original plotly.express.line. Here is an usage example:
import plotly.express as px
df = px.data.gapminder().query('continent=="Americas"')
df = df[df['country'].isin({'Argentina','Brazil','Colombia'})]
df['lifeExp std'] = df['lifeExp']*.1 # Invent some error data...
for error_y_mode in {'band', 'bar'}:
fig = line(
data_frame = df,
x = 'year',
y = 'lifeExp',
error_y = 'lifeExp std',
error_y_mode = error_y_mode, # Here you say `band` or `bar`.
color = 'country',
title = f'Using error {error_y_mode}',
markers = '.',
)
fig.show()
which produces the following two plots:
The source code of the line function that extends plotly.express.line is this:
import plotly.express as px
import plotly.graph_objs as go
def line(error_y_mode=None, **kwargs):
"""Extension of `plotly.express.line` to use error bands."""
ERROR_MODES = {'bar','band','bars','bands',None}
if error_y_mode not in ERROR_MODES:
raise ValueError(f"'error_y_mode' must be one of {ERROR_MODES}, received {repr(error_y_mode)}.")
if error_y_mode in {'bar','bars',None}:
fig = px.line(**kwargs)
elif error_y_mode in {'band','bands'}:
if 'error_y' not in kwargs:
raise ValueError(f"If you provide argument 'error_y_mode' you must also provide 'error_y'.")
figure_with_error_bars = px.line(**kwargs)
fig = px.line(**{arg: val for arg,val in kwargs.items() if arg != 'error_y'})
for data in figure_with_error_bars.data:
x = list(data['x'])
y_upper = list(data['y'] + data['error_y']['array'])
y_lower = list(data['y'] - data['error_y']['array'] if data['error_y']['arrayminus'] is None else data['y'] - data['error_y']['arrayminus'])
color = f"rgba({tuple(int(data['line']['color'].lstrip('#')[i:i+2], 16) for i in (0, 2, 4))},.3)".replace('((','(').replace('),',',').replace(' ','')
fig.add_trace(
go.Scatter(
x = x+x[::-1],
y = y_upper+y_lower[::-1],
fill = 'toself',
fillcolor = color,
line = dict(
color = 'rgba(255,255,255,0)'
),
hoverinfo = "skip",
showlegend = False,
legendgroup = data['legendgroup'],
xaxis = data['xaxis'],
yaxis = data['yaxis'],
)
)
# Reorder data as said here: https://stackoverflow.com/a/66854398/8849755
reordered_data = []
for i in range(int(len(fig.data)/2)):
reordered_data.append(fig.data[i+int(len(fig.data)/2)])
reordered_data.append(fig.data[i])
fig.data = tuple(reordered_data)
return fig

How to do waffle charts in python? (square piechart)

Something like this:
There is a very good package to do it in R. In python, the best that I could figure out is this, using the squarify package (inspired by a post on how to do treemaps):
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns # just to have better line color and width
import squarify
# for those using jupyter notebooks
%matplotlib inline
df = pd.DataFrame({
'v1': np.ones(100),
'v2': np.random.randint(1, 4, 100)})
df.sort_values(by='v2', inplace=True)
# color scale
cmap = mpl.cm.Accent
mini, maxi = df['v2'].min(), df['v2'].max()
norm = mpl.colors.Normalize(vmin=mini, vmax=maxi)
colors = [cmap(norm(value)) for value in df['v2']]
# figure
fig = plt.figure()
ax = fig.add_subplot(111, aspect="equal")
ax = squarify.plot(df['v1'], color=colors, ax=ax)
ax.set_xticks([])
ax.set_yticks([]);
But when I create not 100 but 200 elements (or other non-square numbers), the squares become misaligned.
Another problem is that if I change v2 to some categorical variable (e.g., a hundred As, Bs, Cs and Ds), I get this error:
could not convert string to float: 'a'
So, could anyone help me with these two questions:
how can I solve the alignment problem with non-square numbers of observations?
how can use categorical variables in v2?
Beyond this, I am really open if there are any other python packages that can create waffle plots more efficiently.
I spent a few days to build a more general solution, PyWaffle.
You can install it through
pip install pywaffle
The source code: https://github.com/gyli/PyWaffle
PyWaffle does not use matshow() method, but builds those squares one by one. That makes it easier for customization. Besides, what it provides is a custom Figure class, which returns a figure object. By updating attributes of the figure, you can basically control everything in the chart.
Some examples:
Colored or transparent background:
import matplotlib.pyplot as plt
from pywaffle import Waffle
data = {'Democratic': 48, 'Republican': 46, 'Libertarian': 3}
fig = plt.figure(
FigureClass=Waffle,
rows=5,
values=data,
colors=("#983D3D", "#232066", "#DCB732"),
title={'label': 'Vote Percentage in 2016 US Presidential Election', 'loc': 'left'},
labels=["{0} ({1}%)".format(k, v) for k, v in data.items()],
legend={'loc': 'lower left', 'bbox_to_anchor': (0, -0.4), 'ncol': len(data), 'framealpha': 0}
)
fig.gca().set_facecolor('#EEEEEE')
fig.set_facecolor('#EEEEEE')
plt.show()
Use icons replacing squares:
data = {'Democratic': 48, 'Republican': 46, 'Libertarian': 3}
fig = plt.figure(
FigureClass=Waffle,
rows=5,
values=data,
colors=("#232066", "#983D3D", "#DCB732"),
legend={'loc': 'upper left', 'bbox_to_anchor': (1, 1)},
icons='child', icon_size=18,
icon_legend=True
)
Multiple subplots in one chart:
import pandas as pd
data = pd.DataFrame(
{
'labels': ['Hillary Clinton', 'Donald Trump', 'Others'],
'Virginia': [1981473, 1769443, 233715],
'Maryland': [1677928, 943169, 160349],
'West Virginia': [188794, 489371, 36258],
},
).set_index('labels')
fig = plt.figure(
FigureClass=Waffle,
plots={
'311': {
'values': data['Virginia'] / 30000,
'labels': ["{0} ({1})".format(n, v) for n, v in data['Virginia'].items()],
'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.05, 1), 'fontsize': 8},
'title': {'label': '2016 Virginia Presidential Election Results', 'loc': 'left'}
},
'312': {
'values': data['Maryland'] / 30000,
'labels': ["{0} ({1})".format(n, v) for n, v in data['Maryland'].items()],
'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.2, 1), 'fontsize': 8},
'title': {'label': '2016 Maryland Presidential Election Results', 'loc': 'left'}
},
'313': {
'values': data['West Virginia'] / 30000,
'labels': ["{0} ({1})".format(n, v) for n, v in data['West Virginia'].items()],
'legend': {'loc': 'upper left', 'bbox_to_anchor': (1.3, 1), 'fontsize': 8},
'title': {'label': '2016 West Virginia Presidential Election Results', 'loc': 'left'}
},
},
rows=5,
colors=("#2196f3", "#ff5252", "#999999"), # Default argument values for subplots
figsize=(9, 5) # figsize is a parameter of plt.figure
)
I've put together a working example, below, which I think meets your needs. Some work is needed to fully generalize the approach, but I think you'll find that it's a good start. The trick was to use matshow() to solve your non-square problem, and to build a custom legend to easily account for categorical values.
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
# Let's make a default data frame with catagories and values.
df = pd.DataFrame({ 'catagories': ['cat1', 'cat2', 'cat3', 'cat4'],
'values': [84911, 14414, 10062, 8565] })
# Now, we define a desired height and width.
waffle_plot_width = 20
waffle_plot_height = 7
classes = df['catagories']
values = df['values']
def waffle_plot(classes, values, height, width, colormap):
# Compute the portion of the total assigned to each class.
class_portion = [float(v)/sum(values) for v in values]
# Compute the number of tiles for each catagories.
total_tiles = width * height
tiles_per_class = [round(p*total_tiles) for p in class_portion]
# Make a dummy matrix for use in plotting.
plot_matrix = np.zeros((height, width))
# Popoulate the dummy matrix with integer values.
class_index = 0
tile_index = 0
# Iterate over each tile.
for col in range(waffle_plot_width):
for row in range(height):
tile_index += 1
# If the number of tiles populated is sufficient for this class...
if tile_index > sum(tiles_per_class[0:class_index]):
# ...increment to the next class.
class_index += 1
# Set the class value to an integer, which increases with class.
plot_matrix[row, col] = class_index
# Create a new figure.
fig = plt.figure()
# Using matshow solves your "non-square" problem.
plt.matshow(plot_matrix, cmap=colormap)
plt.colorbar()
# Get the axis.
ax = plt.gca()
# Minor ticks
ax.set_xticks(np.arange(-.5, (width), 1), minor=True);
ax.set_yticks(np.arange(-.5, (height), 1), minor=True);
# Gridlines based on minor ticks
ax.grid(which='minor', color='w', linestyle='-', linewidth=2)
# Manually constructing a legend solves your "catagorical" problem.
legend_handles = []
for i, c in enumerate(classes):
lable_str = c + " (" + str(values[i]) + ")"
color_val = colormap(float(i+1)/len(classes))
legend_handles.append(mpatches.Patch(color=color_val, label=lable_str))
# Add the legend. Still a bit of work to do here, to perfect centering.
plt.legend(handles=legend_handles, loc=1, ncol=len(classes),
bbox_to_anchor=(0., -0.1, 0.95, .10))
plt.xticks([])
plt.yticks([])
# Call the plotting function.
waffle_plot(classes, values, waffle_plot_height, waffle_plot_width,
plt.cm.coolwarm)
Below is an example of the output this script produced. As you can see, it works fairly well for me, and meets all of your stated needs. Just let me know if it gives you any trouble. Enjoy!
You can use this function for automatic creation of a waffle with simple parameters:
def create_waffle_chart(categories, values, height, width, colormap, value_sign=''):
# compute the proportion of each category with respect to the total
total_values = sum(values)
category_proportions = [(float(value) / total_values) for value in values]
# compute the total number of tiles
total_num_tiles = width * height # total number of tiles
print ('Total number of tiles is', total_num_tiles)
# compute the number of tiles for each catagory
tiles_per_category = [round(proportion * total_num_tiles) for proportion in category_proportions]
# print out number of tiles per category
for i, tiles in enumerate(tiles_per_category):
print (df_dsn.index.values[i] + ': ' + str(tiles))
# initialize the waffle chart as an empty matrix
waffle_chart = np.zeros((height, width))
# define indices to loop through waffle chart
category_index = 0
tile_index = 0
# populate the waffle chart
for col in range(width):
for row in range(height):
tile_index += 1
# if the number of tiles populated for the current category
# is equal to its corresponding allocated tiles...
if tile_index > sum(tiles_per_category[0:category_index]):
# ...proceed to the next category
category_index += 1
# set the class value to an integer, which increases with class
waffle_chart[row, col] = category_index
# instantiate a new figure object
fig = plt.figure()
# use matshow to display the waffle chart
colormap = plt.cm.coolwarm
plt.matshow(waffle_chart, cmap=colormap)
plt.colorbar()
# get the axis
ax = plt.gca()
# set minor ticks
ax.set_xticks(np.arange(-.5, (width), 1), minor=True)
ax.set_yticks(np.arange(-.5, (height), 1), minor=True)
# add dridlines based on minor ticks
ax.grid(which='minor', color='w', linestyle='-', linewidth=2)
plt.xticks([])
plt.yticks([])
# compute cumulative sum of individual categories to match color schemes between chart and legend
values_cumsum = np.cumsum(values)
total_values = values_cumsum[len(values_cumsum) - 1]
# create legend
legend_handles = []
for i, category in enumerate(categories):
if value_sign == '%':
label_str = category + ' (' + str(values[i]) + value_sign + ')'
else:
label_str = category + ' (' + value_sign + str(values[i]) + ')'
color_val = colormap(float(values_cumsum[i])/total_values)
legend_handles.append(mpatches.Patch(color=color_val, label=label_str))
# add legend to chart
plt.legend(
handles=legend_handles,
loc='lower center',
ncol=len(categories),
bbox_to_anchor=(0., -0.2, 0.95, .1)
)

Categories