I have the following code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Seed the global NumPy RNG so the random example data is reproducible.
np.random.seed(123456)

# 4x4 frame of random values in [0, 3): rows a-d become wedges, columns x-w become pies.
df = pd.DataFrame(3 * np.random.rand(4, 4), index=['a', 'b', 'c', 'd'],
                  columns=['x', 'y', 'z', 'w'])

plt.style.use('ggplot')
# The 'axes.color_cycle' rcParam is no longer available in current Matplotlib
# releases; the style's colours are read from 'axes.prop_cycle' instead.
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

fig, axes = plt.subplots(nrows=2, ncols=3)

# Hide every axes frame first; only four of the six cells receive a pie.
for ax in axes.flat:
    ax.axis('off')

# zip() stops at the shorter input, so one pie is drawn per DataFrame column.
for ax, col in zip(axes.flat, df.columns):
    ax.pie(df[col], labels=df.index, autopct='%.2f', colors=colors)
    ax.set(ylabel='', title=col, aspect='equal')

axes[0, 0].legend(bbox_to_anchor=(0, 0.5))
fig.savefig('your_file.png')  # Or whichever format you'd like
plt.show()
Which produces the following:
My question is: how can I remove a label based on a condition? For example, I'd only want to display labels with percent > 20%, so that the labels and values of a, c and d won't be displayed in X, etc.
The autopct argument from pie can be a callable, which will receive the current percentage. So you only would need to provide a function that returns an empty string for the values you want to omit the percentage.
Function
def my_autopct(pct):
    """Format a wedge percentage for ``autopct``, hiding small wedges.

    Returns ``pct`` with two decimals when it is strictly greater than 20,
    otherwise an empty string so that no text is drawn for that wedge.
    """
    if pct > 20:
        return '%.2f' % pct
    return ''
Plot with matplotlib.axes.Axes.pie
# One pie per DataFrame column on a 2x2 grid; my_autopct decides which
# wedges get their percentage printed.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 6))
for ax, col in zip(axes.flat, df.columns):
    ax.pie(df[col], labels=df.index, autopct=my_autopct)
    ax.set_ylabel('')
    ax.set_title(col)
    ax.set_aspect('equal')
fig.tight_layout()
Plot directly with the dataframe
# Let pandas create the 2x2 grid of pies, one subplot per column.
axes = df.plot.pie(autopct=my_autopct, figsize=(8, 6), subplots=True,
                   layout=(2, 2), legend=False)
# Move each subplot's y-label text into its title and blank the y-label.
for ax in axes.flat:
    ax.set_title(ax.get_ylabel())
    ax.set_ylabel('')
fig = axes[0, 0].get_figure()
fig.tight_layout()
If you need to parametrize the value on the autopct argument, you'll need a function that returns a function, like:
def autopct_generator(limit):
    """Build an ``autopct`` callable that hides percentages up to ``limit``.

    The returned function formats its argument with two decimals when it is
    strictly greater than ``limit`` and returns an empty string otherwise.
    """
    def inner_autopct(pct):
        if pct > limit:
            return '%.2f' % pct
        return ''

    return inner_autopct
ax.pie(df[col], labels=df.index, autopct=autopct_generator(20), colors=colors)
For the labels, the best thing I can come up with is using list comprehension:
# Blank out the label of every wedge that is not above 20% of its column total.
for ax, col in zip(axes.flat, df.columns):
    data = df[col]
    cutoff = data.sum() * 0.2
    labels = [name if size > cutoff else '' for name, size in zip(df.index, data)]
    ax.pie(data, autopct=my_autopct, colors=colors, labels=labels)
Note, however, that the legend by default is being generated from the first passed labels, so you'll need to pass all values explicitly to keep it intact.
axes[0, 0].legend(df.index, bbox_to_anchor=(0, 0.5))
For labels I have used:
def my_level_list(data):
    """Return one pie label per value, blank for slices of 2% or less.

    Parameters
    ----------
    data : iterable of numbers
        Slice sizes. Values are read by iteration, so label-indexed
        containers such as a pandas Series work as well as lists and arrays.

    Returns
    -------
    list of str
        ``'Label <k>'`` (``k`` is the 1-based position) where the value is
        strictly more than 2% of the total, otherwise ``''``.
    """
    # The total does not change during the loop, so compute it once.
    total = np.sum(data)
    return [
        'Label ' + str(position) if value * 100 / total > 2 else ''  # 2%
        for position, value in enumerate(data, start=1)
    ]
patches, texts, autotexts = plt.pie(data, radius = 1, labels=my_level_list(data), autopct=my_autopct, shadow=True)
You can make the labels function a little shorter using list comprehension:
def my_autopct(pct):
    """Format a wedge percentage with one decimal, hiding wedges of 1% or less."""
    if pct > 1:
        return '%1.1f' % pct
    return ''
def get_new_labels(sizes, labels):
    """Pair each label with its size and blank the label when the size is not above 1.

    The result has one entry per (size, label) pair; pairing stops at the
    shorter of the two inputs.
    """
    return [text if amount > 1 else '' for amount, text in zip(sizes, labels)]
fig, ax = plt.subplots()
# Labels and percentages of small slices are suppressed by the two helpers.
visible_labels = get_new_labels(sizes, labels)
_, _, _ = ax.pie(
    sizes,
    labels=visible_labels,
    colors=colors,
    autopct=my_autopct,
    startangle=90,
    rotatelabels=False,
)
Related
I am trying to produce a plot and want to automatically add text (in this case, the percentage) to each circle, corresponding to each category on the y axis. Any help would be much appreciated.
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Make some data
index=['Stream flow',
'Soil moisture',
'Water indices',
'Others',
'Temperature',
'Precipitation',
'Vegetative indices']
value=[2.13, 6.38, 10.64, 12.77, 17.73, 21.99, 28.37]
# create dataframe
percentages = pd.Series(value,index=index)
df = pd.DataFrame({'percentage' : percentages})
df = df.sort_values(by='percentage')
# we first need a numeric placeholder for the y axis
my_range=list(range(1,len(df.index)+1))
fig, ax = plt.subplots(figsize=(15,8))
# create for each expense type an horizontal line that starts at x = 0 with the length
plt.hlines(y=my_range, xmin=0, xmax=df['percentage']-0.5, color='black', alpha=0.8, linewidth=1)
# create for each expense type a dot at the level of the expense percentage value
line=plt.plot(df['percentage'], my_range, "o", markersize=30, color='#fd8c00', alpha=0.6, linewidth=0.3)
# set labels
ax.set_xlabel('Percentage', fontsize=15)
ax.set_ylabel('')
# set axis
ax.tick_params(axis='both', which='major', labelsize=14)
plt.yticks(my_range, df.index)
ax.set_xlim(0,30)
You can use matplotlib.axes.Axes.text:
# Offsets (in data units) that shift each text so it sits inside its circle.
x_space = 0.4
y_space = 0.05
fontsize = 7
# Read the values from the sorted DataFrame that was plotted, not from the raw
# input list: row k of df sits at y = k, so the text always lands on its own
# dot even when the input data was not already in ascending order.
for y_i, val in enumerate(df['percentage'], 1):
    ax.text(x=val - x_space, y=y_i - y_space, s=f'{val}%', fontsize=fontsize)
You have to adjust x_space, y_space and fontsize in order to fit properly the text within the circles.
Complete code
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Make some data
index=['Stream flow',
'Soil moisture',
'Water indices',
'Others',
'Temperature',
'Precipitation',
'Vegetative indices']
value=[2.13, 6.38, 10.64, 12.77, 17.73, 21.99, 28.37]
# create dataframe
percentages = pd.Series(value,index=index)
df = pd.DataFrame({'percentage' : percentages})
df = df.sort_values(by='percentage')
# we first need a numeric placeholder for the y axis
my_range=list(range(1,len(df.index)+1))
fig, ax = plt.subplots(figsize=(15,8))
# create for each expense type an horizontal line that starts at x = 0 with the length
plt.hlines(y=my_range, xmin=0, xmax=df['percentage']-0.5, color='black', alpha=0.8, linewidth=1)
# create for each expense type a dot at the level of the expense percentage value
line=plt.plot(df['percentage'], my_range, "o", markersize=30, color='#fd8c00', alpha=0.6, linewidth=0.3)
# set labels
ax.set_xlabel('Percentage', fontsize=15)
ax.set_ylabel('')
# set axis
ax.tick_params(axis='both', which='major', labelsize=14)
plt.yticks(my_range, df.index)
ax.set_xlim(0,30)
x_space = 0.4
y_space = 0.05
for y_i, val in enumerate(value, 1):
ax.text(x = val - x_space, y = y_i - y_space, s = f'{val:>5.2f}%', fontsize = 7)
plt.show()
Same code as above, but with increased circle radius and font, in order to improve readability.
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# Make some data
index=['Stream flow',
'Soil moisture',
'Water indices',
'Others',
'Temperature',
'Precipitation',
'Vegetative indices']
value=[2.13, 6.38, 10.64, 12.77, 17.73, 21.99, 28.37]
# create dataframe
percentages = pd.Series(value,index=index)
df = pd.DataFrame({'percentage' : percentages})
df = df.sort_values(by='percentage')
# we first need a numeric placeholder for the y axis
my_range=list(range(1,len(df.index)+1))
fig, ax = plt.subplots(figsize=(15,8))
# create for each expense type an horizontal line that starts at x = 0 with the length
plt.hlines(y=my_range, xmin=0, xmax=df['percentage']-0.85, color='black', alpha=0.8, linewidth=1)
# create for each expense type a dot at the level of the expense percentage value
line=plt.plot(df['percentage'], my_range, "o", markersize=50, color='#fd8c00', alpha=0.6, linewidth=0.3)
# set labels
ax.set_xlabel('Percentage', fontsize=15)
ax.set_ylabel('')
# set axis
ax.tick_params(axis='both', which='major', labelsize=14)
plt.yticks(my_range, df.index)
ax.set_xlim(0,30)
ax.set_ylim(0, len(value) + 1)
x_space = 0.75
y_space = 0.06
fontsize = 12
for y_i, val in enumerate(value, 1):
ax.text(x = val - x_space, y = y_i - y_space, s = f'{val:>5.2f}%', fontsize = fontsize)
plt.show()
Even better, you can use matplotlib.axes.Axes.annotate to get rid of x_space and y_space:
fontsize = 12
# Iterate over the sorted frame that was plotted (row k sits at y = k) so each
# annotation is centred on its own dot regardless of the input order.
for y_i, x_i in enumerate(df['percentage'], 1):
    ax.annotate(
        f'{x_i:>5.2f}%',
        xy=(x_i, y_i),
        xytext=(0, 0),  # zero offset: the text is centred on the data point
        textcoords='offset points',
        ha='center',
        va='center',
        fontsize=fontsize,
    )
You still have to adjust the fontsize to properly fit the radius of the circles.
I'm working on using a for loop to produce graphs for each set of data I have. Each plot prints correctly however the savefig() portion of my code only saves the last plot under each file name.
Here is a section of my code
# NOTE(review): the indentation of this listing was lost; the nesting described
# in the comments below is inferred from the corrected version in the answer.
total = 3
idx_list = []
dct = {}
# Outer loop: one entry of graph_list per graph.
for i, df in enumerate(graph_list):
data = pd.DataFrame(df)
# Inner loop: one figure per distinct value of the 'content' column.
for idx, v in enumerate(data['content'].unique()):
dct[f'x{idx}'] = data.loc[data['content'] == v]
idx_list.append(idx)
xs = dct[f'x{idx}'].Time
yB = dct[f'x{idx}'].Weight
yA = dct[f'x{idx}'].Height
# `fig` and `ax` are rebound here each time, so no earlier figure stays referenced.
fig, ax = plt.subplots(figsize =(10,8))
legends = ['Weight', 'Height']
ax.plot(xs, yB, linestyle = ':', color ='#4c4c4c', linewidth = 4.0)
ax.plot(xs, yA, color = '#fac346', linewidth = 3.0)
ax.legend(legends, loc = 'lower center', ncol = len(legends), bbox_to_anchor = (0.5, -0.15), frameon = False)
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals = None, symbol ='%', is_latex = False))
ax.set_xticks(xs[::4])
ax.tick_params(axis = 'x', labelrotation = 45, labelsize = 10)
ax.yaxis.grid()
# Convert the collected 0-based ids to 1-based ids for the file names.
new_idx = [x+1 for x in idx_list]
# This is the reported problem: every savefig call below uses the single name
# `fig`, which by now refers only to the last figure created above.
for graph in range(total+1):
if graph != 0:
for ids in set(new_idx):
print('Graph {0} ID {1}'.format(graph, ids))
fig.savefig('Graph {0} ID {1}.jpg'.format(graph, ids))
I want each graph to save under the file names:
Graph 1 ID 1
Graph 1 ID 2
Graph 2 ID 1
Graph 2 ID 2
Graph 3 ID 1
Graph 3 ID 2
Thanks for any help you can provide!
You do not keep a reference to each figure, so when you call fig.savefig in the final loop you are actually saving the figure referenced by fig (which is the last figure) each time. There are many ways to manage this: you can save the figure in the same loop that created it, you can assign a unique name to each figure, or you can keep a reference to each figure in a list. The first option is simpler:
dct = {}  # I assume this dict is used for something after saving the figures. Otherwise it is not necessary
for i, df in enumerate(graph_list):
    data = pd.DataFrame(df)
    for idx, v in enumerate(data['content'].unique()):
        # Rows of this graph that belong to the current 'content' value.
        subset = data.loc[data['content'] == v]
        dct[f'x{idx}'] = subset
        xs = subset.Time
        yB = subset.Weight
        yA = subset.Height

        fig, ax = plt.subplots(figsize=(10, 8))
        legends = ['Weight', 'Height']
        ax.plot(xs, yB, linestyle=':', color='#4c4c4c', linewidth=4.0)
        ax.plot(xs, yA, color='#fac346', linewidth=3.0)
        ax.legend(
            legends,
            loc='lower center',
            ncol=len(legends),
            bbox_to_anchor=(0.5, -0.15),
            frameon=False,
        )
        percent_format = mtick.PercentFormatter(xmax=1, decimals=None, symbol='%', is_latex=False)
        ax.yaxis.set_major_formatter(percent_format)
        ax.set_xticks(xs[::4])
        ax.tick_params(axis='x', labelrotation=45, labelsize=10)
        ax.yaxis.grid()

        # Save while `fig` still refers to the figure that was just drawn.
        print('Graph {0} ID {1}'.format(i+1, idx+1))
        fig.savefig('Graph {0} ID {1}.jpg'.format(i+1, idx+1))
        plt.close(fig)  # if you do not need to leave the figures open
I want to make a clustermap/heatmap of gene presence-absence data from patients where the genes will be grouped into categories (e.g chemotaxis, endotoxin etc) and labelled appropriately. I haven't found any such option in seaborn documentation. I know how to generate the heatmap, I just don't know how to label yticks as categories. Here is a sample (unrelated to my work) of what I want to achieve:
Here, the yticklabels January, February and March are given the group label Winter, and the other yticklabels are labelled similarly.
I've reproduced the example you gave in seaborn, adapting @Stein's answer from here.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from itertools import groupby
import datetime
import seaborn as sns
def test_table():
    """Build a 12x10 demo frame of random integers indexed by (month, season).

    Rows are the twelve month names paired with a season label; columns are
    the years 1950-1959. Values come from ``np.random.randint(0, 50)``.
    """
    months = [datetime.date(2008, month, 1).strftime('%B') for month in range(1, 13)]
    seasons = ['Winter'] * 3 + ['Spring'] * 2 + ['Summer'] * 3 + ['Pre-Winter'] * 4
    index = pd.MultiIndex.from_arrays([months, seasons], names=['first', 'second'])

    d = {}
    for year in range(1950, 1960):
        d[year] = [np.random.randint(0, 50) for _ in range(12)]
    return pd.DataFrame(d, index=index)
def add_line(ax, xpos, ypos):
    """Draw a short, unclipped black horizontal line in axes coordinates.

    Despite the parameter names, the line runs from x = ``ypos`` to
    x = ``ypos + .2`` at height y = ``xpos``.
    """
    segment = plt.Line2D(
        [ypos, ypos + .2],
        [xpos, xpos],
        color='black',
        transform=ax.transAxes,
        clip_on=False,  # let the line be drawn outside the axes rectangle
    )
    ax.add_line(segment)
def label_len(my_index, level):
    """Return ``(label, run_length)`` for each run of equal consecutive labels.

    Labels are taken from ``my_index.get_level_values(level)``; equal labels
    that are not adjacent form separate runs.
    """
    level_labels = my_index.get_level_values(level)
    return [(key, len(list(run))) for key, run in groupby(level_labels)]
def label_group_bar_table(ax, df):
    """Write the index labels of ``df`` as grouped labels left of the axes.

    Each index level gets its own column, 0.2 axes units further left than
    the previous one. Within a column, every run of equal labels is written
    once at the vertical centre of its rows, with separator lines between
    runs. Positions are in axes coordinates and run from the top downwards.
    """
    n_rows = df.index.size
    row_height = 1. / n_rows
    column_x = -.2
    for level in range(df.index.nlevels):
        remaining = n_rows
        for label, run_length in label_len(df.index, level):
            # Separator above the run, then step down past its rows.
            add_line(ax, remaining * row_height, column_x)
            remaining -= run_length
            label_y = (remaining + .5 * run_length) * row_height
            ax.text(column_x + .1, label_y, label, ha='center', transform=ax.transAxes)
        # Closing separator below the last run of this level.
        add_line(ax, remaining * row_height, column_x)
        column_x -= .2
df = test_table()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df, ax=ax)

# Remove the default tick labels and axis label; grouped labels replace them
labels = [''] * len(ax.get_yticklabels())
ax.set_yticklabels(labels)
ax.set_ylabel('')

label_group_bar_table(ax, df)
fig.subplots_adjust(bottom=.1 * df.index.nlevels)
plt.show()
Gives:
Hope that helps.
I haven't tested this with seaborn yet, but the following works with vanilla matplotlib.
#!/usr/bin/env python
"""
Annotate a group of y-tick labels as such.
"""
import matplotlib.pyplot as plt
from matplotlib.transforms import TransformedBbox
def annotate_yranges(groups, ax=None):
    """
    Annotate runs of consecutive y-tick labels with a group name.

    Arguments:
    ----------
    groups : dict
        Mapping from group label to an ordered list of group members;
        each member must be the text of an existing y-tick label.
    ax : matplotlib.axes object (default None)
        The axis instance to annotate; the current axes are used if None.
    """
    if ax is None:
        ax = plt.gca()

    # Look up tick-label text objects by their displayed text.
    text_by_label = {}
    for tick_text in ax.get_yticklabels():
        text_by_label[tick_text.get_text()] = tick_text

    for group, members in groups.items():
        first_box = _get_text_object_bbox(text_by_label[members[0]], ax)
        last_box = _get_text_object_bbox(text_by_label[members[-1]], ax)
        # The bracket spans the vertical centres of the first and last member
        # and starts at the left edge of whichever label reaches further left.
        y_first = first_box.y0 + first_box.height/2
        y_last = last_box.y0 + last_box.height/2
        x_left = min(first_box.x0, last_box.x0)
        set_yrange_label(group, y_first, y_last, x_left, -2, ax=ax)
def set_yrange_label(label, ymin, ymax, x, dx=-0.5, ax=None, *args, **kwargs):
    """
    Annotate a y-range with a label connected to both ends of the range.

    Arguments:
    ----------
    label : string
        The label.
    ymin, ymax : float, float
        The y-range in data coordinates.
    x : float
        The x position of the annotation arrow endpoints in data coordinates.
    dx : float (default -0.5)
        The offset from x at which the label is placed.
    ax : matplotlib.axes object (default None)
        The axis instance to annotate; the current axes are used if not given.

    Extra positional and keyword arguments are forwarded to ax.annotate.
    """
    if not ax:
        ax = plt.gca()

    # The label sits halfway up the range, shifted horizontally by dx.
    label_position = (x + dx, ymin + (ymax - ymin)/2)

    props = dict(connectionstyle='angle, angleA=90, angleB=180, rad=0',
                 arrowstyle='-',
                 shrinkA=10,
                 shrinkB=10,
                 lw=1)

    # One annotation per end of the range; both place the label at the same spot.
    for y_end in (ymin, ymax):
        ax.annotate(label,
                    *args,
                    xy=(x, y_end),
                    xytext=label_position,
                    annotation_clip=False,
                    arrowprops=props,
                    **kwargs)
def _get_text_object_bbox(text_obj, ax):
    """Return the bounding box of ``text_obj`` in the data coordinates of ``ax``.

    Arguments:
    ----------
    text_obj : matplotlib text object
        The text whose extent is measured (e.g. a tick label).
    ax : matplotlib.axes object
        The axes whose data transform is used for the conversion.
    """
    # https://stackoverflow.com/a/35419796/2912349
    # Text extents are only known once the figure has been drawn. Render the
    # canvas directly instead of calling plt.ion()/plt.show()/plt.pause():
    # that left interactive mode switched on globally, so the caller's final
    # plt.show() no longer blocked, and it relied on `canvas.renderer`, which
    # only some backends provide.
    ax.get_figure().canvas.draw()
    bb = text_obj.get_window_extent()
    # Wrap the display-space box with the inverse data transform (the original
    # author's note: "handle canvas resizing").
    return TransformedBbox(bb, ax.transData.inverted())
if __name__ == '__main__':
    import numpy as np

    fig, ax = plt.subplots()
    # so we have some extra space for the annotations
    fig.subplots_adjust(left=0.3)

    # Random 10x10 image with one single-letter tick label per row.
    ax.imshow(np.random.rand(10, 10))
    ticklabels = 'abcdefghij'
    ax.set_yticks(np.arange(len(ticklabels)))
    ax.set_yticklabels(ticklabels)

    # Three groups of consecutive rows, each annotated with its group name.
    groups = {
        'abc': ('a', 'b', 'c'),
        'def': ('d', 'e', 'f'),
        'ghij': ('g', 'h', 'i', 'j'),
    }
    annotate_yranges(groups)
    plt.show()
I'm trying to make my subplots share the same axis, as they're currently different (the circles in the plots are not perfectly aligned). When I try to pass sharex=True, as in ax = plt.subplot(1, 5, row+1, polar=True, sharex=True), I get the error TypeError: cannot create weak reference to 'bool' object.
Here is what my plot currently looks like, as you should be able to see, the axis (circles) inside the plot are not perfectly aligned, and I cannot work out how to align them using plt.subplot.
Does anybody have any recommendations?
Code to reproduce example:
import matplotlib.pyplot as plt
import pandas as pd
def make_spider(row, title, color):
    """Draw row ``row`` of the global ``df`` as a radar chart in slot ``row+1`` of a 1x5 grid.

    ``title`` is accepted but not used by this function.
    """
    import math

    categories = list(df)
    N = len(categories)

    # One angle per category, plus the first angle again to close the outline.
    angles = [n / float(N) * 2 * math.pi for n in range(N)]
    angles = angles + angles[:1]

    ax = plt.subplot(1, 5, row+1, polar=True)
    plt.xticks(angles[:-1], categories, color='grey', size=8)

    # Repeat the first value so the polygon closes.
    row_values = df.iloc[row].values.flatten().tolist()
    values = row_values + row_values[:1]

    ax.plot(angles, values, color=color, linewidth=2, linestyle='solid')
    ax.fill(angles, values, color=color, alpha=.4)
    plt.gca().set_rmax(.2)
my_dpi = 40
plt.figure(figsize=(1000/my_dpi, 1000/my_dpi), dpi=96)

# plt.cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap takes the same
# (name, lut) arguments and returns the resampled colormap.
my_palette = plt.get_cmap('Set2', len(df.index)+1)

# One radar chart per DataFrame row, each with its own palette colour.
for row in range(0, len(df.index)):
    make_spider(row=row, title='Cluster: ' + str(row), color=my_palette(row))
Dataframe:
df = pd.DataFrame.from_dict({"no_rooms":{"0":-0.3470532925,"1":-0.082144001,"2":-0.082144001,"3":-0.3470532925,"4":-0.3470532925},"total_area":{"0":-0.1858487321,"1":-0.1685491141,"2":-0.1632483955,"3":-0.1769700284,"4":-0.0389887094},"car_park_spaces":{"0":-0.073703681,"1":-0.073703681,"2":-0.073703681,"3":-0.073703681,"4":-0.073703681},"house_price":{"0":-0.2416123064,"1":-0.2841806825,"2":-0.259622004,"3":-0.3529449824,"4":-0.3414842657},"pop_density":{"0":-0.1271390651,"1":-0.3105853643,"2":-0.2316607937,"3":-0.3297832328,"4":-0.4599021194},"business_rate":{"0":-0.1662745006,"1":-0.1426329043,"2":-0.1577528867,"3":-0.163560133,"4":-0.1099718326},"noqual_pc":{"0":-0.0251535462,"1":-0.1540641646,"2":-0.0204666924,"3":-0.0515740013,"4":-0.0445135996},"level4qual_pc":{"0":-0.0826103951,"1":-0.1777759951,"2":-0.114263357,"3":-0.1787044751,"4":-0.2709496389},"badhealth_pc":{"0":-0.105481688,"1":-0.1760349683,"2":-0.128215043,"3":-0.1560577648,"4":-0.1760349683}})
It is best to set up the axis sharing before plotting, and then plot to the already-shared axes.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dic = {"no_rooms":{"0":-0.347,"1":-0.082,"2":-0.082, "3":-0.347,"4":-0.347},
"total_area":{"0":-0.185,"1":-0.168,"2":-0.163, "3":-0.176,"4":-0.038},
"car_park_spaces":{"0":-0.073,"1":-0.073,"2":-0.073, "3":-0.073,"4":-0.073},
"house_price":{"0":-0.241,"1":-0.284,"2":-0.259,"3":-0.352,"4":-0.341},
"pop_density":{"0":-0.127,"1":-0.310,"2":-0.231,"3":-0.329,"4":-0.459},
"business_rate":{"0":-0.166,"1":-0.142,"2":-0.157,"3":-0.163,"4":-0.109},
"noqual_pc":{"0":-0.025,"1":-0.15,"2":-0.020,"3":-0.051,"4":-0.044},
"level4qual_pc":{"0":-0.082,"1":-0.17,"2":-0.114,"3":-0.178,"4":-0.270},
"badhealth_pc":{"0":-0.105,"1":-0.176,"2":-0.128,"3":-0.156,"4":-0.176}}
df = pd.DataFrame.from_dict(dic)
def make_spider(row, title, color, ax=None):
    """Draw row ``row`` of the global ``df`` as a radar chart on ``ax``.

    ``title`` is accepted but not used by this function. ``ax`` must be a
    polar axes; its methods are called directly.
    """
    categories = list(df)
    N = len(categories)

    # N+1 evenly spaced angles; the last one equals a full turn and closes the outline.
    angles = np.arange(N+1)/N*2*np.pi

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories, color='grey', size=8)
    ax.tick_params(labelleft=True)

    # Repeat the first value so the polygon closes.
    row_values = df.iloc[row].values.flatten().tolist()
    values = row_values + row_values[:1]

    ax.plot(angles, values, color=color, linewidth=2, linestyle='solid')
    ax.fill(angles, values, color=color, alpha=.4)
# Create all polar axes up front with a shared radial (y) axis, then plot into them.
fig, axes = plt.subplots(ncols=len(df.index), subplot_kw=dict(polar=True), sharey=True,
                         figsize=(15, 8))

# plt.cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap takes the same
# (name, lut) arguments and returns the resampled colormap.
my_palette = plt.get_cmap('Set2', len(df.index)+1)

for row, ax in enumerate(axes):
    make_spider(row=row, title='Cluster: ' + str(row), color=my_palette(row), ax=ax)

plt.show()
You have to set the same y_lim/r_lim and y_ticks/r_ticks to all axes. This can for example be done by passing the last ax reference to plt.subplot to set sharey for all axes:
def make_spider(row, title, color, last_ax=None):
    """Draw row ``row`` of the global ``df`` as a radar chart and return its axes.

    The chart goes into slot ``row+1`` of a 1x5 grid and shares its radial
    (y) axis with ``last_ax`` when one is given. ``title`` is accepted but
    not used by this function.
    """
    import math

    categories = list(df)
    N = len(categories)

    # One angle per category, plus the first angle again to close the outline.
    angles = [n / float(N) * 2 * math.pi for n in range(N)]
    angles = angles + angles[:1]

    # add last ax as sharey here:
    ax = plt.subplot(1, 5, row+1, polar=True, sharey=last_ax)
    plt.xticks(angles[:-1], categories, color='grey', size=8)

    # Repeat the first value so the polygon closes.
    row_values = df.iloc[row].values.flatten().tolist()
    values = row_values + row_values[:1]

    ax.plot(angles, values, color=color, linewidth=2, linestyle='solid')
    ax.fill(angles, values, color=color, alpha=.4)
    plt.gca().set_rmax(.2)

    # return the current axes so the caller can store them
    return plt.gca()
my_dpi = 40
plt.figure(figsize=(1000/my_dpi, 1000/my_dpi), dpi=96)

# plt.cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap takes the same
# (name, lut) arguments and returns the resampled colormap.
my_palette = plt.get_cmap('Set2', len(df.index)+1)

axs = []  # store axes
for row in range(0, len(df.index)):
    # The first subplot has nothing to share with; every later one shares
    # its y axis with the subplot created just before it.
    previous_ax = axs[-1] if axs else None
    axs.append(
        make_spider(row=row, title='Cluster: ' + str(row), color=my_palette(row),
                    last_ax=previous_ax))
OR by passing the limits/ticks to the plots directly:
def make_spider(row, title, color, rlim, rticks):
    """Draw row ``row`` of the global ``df`` as a radar chart with fixed radial limits and ticks.

    The chart goes into slot ``row+1`` of a 1x5 grid; ``rlim`` and ``rticks``
    are passed to ``set_rlim`` and ``set_rticks``. ``title`` is accepted but
    not used by this function. Returns the axes that were drawn on.
    """
    import math

    categories = list(df)
    N = len(categories)

    # One angle per category, plus the first angle again to close the outline.
    angles = [n / float(N) * 2 * math.pi for n in range(N)]
    angles = angles + angles[:1]

    ax = plt.subplot(1, 5, row+1, polar=True)
    plt.xticks(angles[:-1], categories, color='grey', size=8)

    # Repeat the first value so the polygon closes.
    row_values = df.iloc[row].values.flatten().tolist()
    values = row_values + row_values[:1]

    ax.plot(angles, values, color=color, linewidth=2, linestyle='solid')
    ax.fill(angles, values, color=color, alpha=.4)

    ax.set_rlim(rlim)
    ax.set_rticks(rticks)

    # return axes to store them (not needed but may help later)
    return ax
my_dpi = 40
plt.figure(figsize=(1000/my_dpi, 1000/my_dpi), dpi=96)

# plt.cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap takes the same
# (name, lut) arguments and returns the resampled colormap.
my_palette = plt.get_cmap('Set2', len(df.index)+1)

axs = []
# Every chart receives the same radial limits and ticks, which keeps the grids aligned.
for row in range(0, len(df.index)):
    axs.append(
        make_spider(
            row=row, title='Cluster: ' + str(row), color=my_palette(row),
            rlim=(-.5, 0), rticks=[-.3, -.2, -.1, 0.]))
I have a binary classification problem, which I want to solve with a RandomForestClassifier. My target column is 'successful' which is either 0 or 1. I want to investigate the data, and see how it looks like. For that I tried to do count plots by category. But it's not saying how much in percentage from total are 'successful' (i.e. successful == 1)
How can I change the following plot so that these subplots display the percentage of successful posts (successful == 1) out of all posts? For example, if in category weekday the day 'Saturday' has 10 datapoints and 7 of them are successful ('successful' == 1), I want the bar for that day to be at 0.7.
Here is the actual plot (counts :-/):
And here is a part of my dataframe:
And here is the actual code used to generate the actual plot:
# Plot
sns.set(style="darkgrid")
x_vals = [['page_name', 'weekday'], ['type', 'industry']]
subtitles = [['by Page', 'by Weekday'], ['by Content Type', 'by Industry']]
fig, ax = plt.subplots(2, 2, figsize=(15, 10))
#jitter = [[False, 1], [0.5, 0.2]]

# First pass: style each subplot and draw its count plot.
for j, axes_row in enumerate(ax):
    for i, subplot in enumerate(axes_row):
        subplot.tick_params(labelsize=15)
        subplot.set_xlabel('label', fontsize=17, position=(.5, 20))
        if j == 0:
            # only the top row gets rotated x tick labels
            subplot.tick_params(axis="x", rotation=50)
        subplot.set_ylabel('label', fontsize=17)
        ax[j][i] = sns.countplot(x=x_vals[j][i], hue="successful", data=mainDf, ax=subplot)

# Second pass: set the final axis labels and titles.
for j, axes_row in enumerate(ax):
    for i, subplot in enumerate(axes_row):
        subplot.set_xlabel('', fontsize=17)
        subplot.set_ylabel('count', fontsize=17)
        subplot.set_title(subtitles[j][i], fontsize=18)

fig.suptitle('Success Count by Category', position=(.5, 1.05), fontsize=20)
fig.tight_layout()
fig.show()
PS: Please note, I am using Seaborn. The solution should also use Seaborn, if possible. Thanks!
You can use barplot here. I wasn't 100% sure of what you actually want to achieve so I developed several solutions.
Frequency of successful (unsuccessful) per total successful (unsuccessful)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
mainDf['frequency'] = 0  # a dummy column to refer to
for col, ax in zip(['page_name', 'weekday', 'type', 'industry'], axes.flatten()):
    # Row counts for every (category value, successful) combination.
    counts = mainDf.groupby([col, 'successful']).count()
    # Divide each count by the total of its 'successful' class.
    class_totals = counts.groupby('successful').transform('sum')
    freq_per_group = (counts / class_totals).reset_index()
    sns.barplot(x=col, y='frequency', hue='successful', data=freq_per_group, ax=ax)
Frequency of successful (unsuccessful) per group
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
mainDf['frequency'] = 0  # a dummy column to refer to
for col, ax in zip(['page_name', 'weekday', 'type', 'industry'], axes.flatten()):
    # Row counts for every (category value, successful) combination.
    counts = mainDf.groupby([col, 'successful']).count()
    # Divide each count by the total of its category value.
    category_totals = counts.groupby(col).transform('sum')
    freq_per_group = (counts / category_totals).reset_index()
    sns.barplot(x=col, y='frequency', hue='successful', data=freq_per_group, ax=ax)
which, based on the data you provided, gives
Frequency of successful (unsuccessful) per total
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
mainDf['frequency'] = 0  # a dummy column to refer to
total = len(mainDf)
for col, ax in zip(['page_name', 'weekday', 'type', 'industry'], axes.flatten()):
    # Row counts for every (category value, successful) combination,
    # expressed as a fraction of all rows in mainDf.
    counts = mainDf.groupby([col, 'successful']).count()
    freq_per_total = (counts / total).reset_index()
    sns.barplot(x=col, y='frequency', hue='successful', data=freq_per_total, ax=ax)
Change the line ax[j][i] = sns.countplot(x=x_vals[j][i], hue="successful", data=mainDf, ax=ax[j][i]) to ax[j][i] = sns.barplot(x=x_vals[j][i], y='successful', data=mainDf, ax=ax[j][i], ci=None, estimator=lambda x: sum(x) / len(x) * 100)
Your code would be
sns.set(style="darkgrid")
x_vals = [['page_name', 'weekday'], ['type', 'industry']]
subtitles = [['by Page', 'by Weekday'], ['by Content Type', 'by Industry']]
fig, ax = plt.subplots(2, 2, figsize=(15, 10))
#jitter = [[False, 1], [0.5, 0.2]]

# First pass: style each subplot and draw the percentage of successful rows
# per category value (the estimator turns the mean of 0/1 values into a percentage).
for j, axes_row in enumerate(ax):
    for i, subplot in enumerate(axes_row):
        subplot.tick_params(labelsize=15)
        subplot.set_xlabel('label', fontsize=17, position=(.5, 20))
        if j == 0:
            # only the top row gets rotated x tick labels
            subplot.tick_params(axis="x", rotation=50)
        subplot.set_ylabel('label', fontsize=17)
        ax[j][i] = sns.barplot(x=x_vals[j][i], y='successful', data=mainDf, ax=subplot,
                               ci=None, estimator=lambda x: sum(x) / len(x) * 100)

# Second pass: set the final axis labels and titles.
for j, axes_row in enumerate(ax):
    for i, subplot in enumerate(axes_row):
        subplot.set_xlabel('', fontsize=17)
        subplot.set_ylabel('percent', fontsize=17)
        subplot.set_title(subtitles[j][i], fontsize=18)

fig.suptitle('Success Percentage by Category', position=(.5, 1.05), fontsize=20)
fig.tight_layout()
fig.show()