grouping, percentage, and barchart in Python - python

I am very new to Python, and I am trying to plot a bar chart that shows the winner_rank_status percentage, and within each bar, I want to show the percentage of the winner (colour).
My dataset is like:
The code that I wrote:
Q3_df=games_df[['winner','winner_rank_status']]
Q3_df=Q3_df.groupby(['winner_rank_status','winner']).size().groupby(level=0).apply(lambda x: round(100*x/x.sum(),2))
Q3_df=Q3_df.unstack()
ax= Q3_df.plot(
kind='bar',
stacked=True,
figsize=(14,7),
rot=0,
title='Effect of piece colour and winner rating status on the result',
color=['black','grey','white'],
edgecolor='black',
)
for c in ax.containers:
ax.bar_label(c, label_type='center',color='b')
And it's the result that I get:
This result is wrong as it shows 100% for all categories!!! I need to show each category (Equal, Higher, Lower) their true percentage and then within each category the proportion of each colour...
Would you please guide me on how I can achieve it?
I appreciate your help.

You can give a different color to the labels for each set of bars. To get percentages where all 9 values sum to 100, you can divide by the total number of games:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

# Dummy stand-in for the games table: 1000 rows with two categorical columns.
winner_options = ['black', 'draw', 'white']
rank_options = ['lower', 'equal', 'higher']
Q3_df = pd.DataFrame({
    'winner_rank_status': pd.Categorical(
        np.random.choice(rank_options, 1000, p=[.46, .07, .47]), rank_options),
    'winner': pd.Categorical(
        np.random.choice(winner_options, 1000, p=[.51, .03, .46]), winner_options),
})

# Percentage of ALL games for every (rank status, winner) pair, so the nine
# cells together sum to 100. Every cell is divided by the same total, so no
# per-group apply is needed; plain vectorised arithmetic also keeps the
# two-level index intact for unstack().
pair_counts = Q3_df.groupby(['winner_rank_status', 'winner'], observed=False).size()
Q3_rank_winner_df = (100 * pair_counts / len(Q3_df)).round(2).unstack()

ax = Q3_rank_winner_df.plot(
    kind='bar',
    stacked=True,
    figsize=(14, 7),
    rot=0,
    title='Effect of piece colour and winner rating status on the result',
    color=['black', 'grey', 'white'],
    edgecolor='black')

# One label colour per bar container, chosen to stay readable on the
# black / grey / white bar fills.
for bars, color in zip(ax.containers, ['skyblue', 'navy', 'darkblue']):
    ax.bar_label(bars, label_type='center', color=color)

# Place the legend just outside the plotting area.
ax.legend(bbox_to_anchor=[1.01, 1.02], loc='upper left')
plt.tight_layout()
plt.show()
The new requirements are a bit confusing. One might add the percentages of each rank at the top of the bars:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

# Dummy stand-in for the games table: 1000 rows with two categorical columns.
winner_options = ['black', 'draw', 'white']
rank_options = ['lower', 'equal', 'higher']
Q3_df = pd.DataFrame({
    'winner_rank_status': pd.Categorical(
        np.random.choice(rank_options, 1000, p=[.65, .05, .30]), rank_options),
    'winner': pd.Categorical(
        np.random.choice(winner_options, 1000, p=[.46, .07, .47]), winner_options),
})

# Count table: one row per rank status, one column per winner.
pair_counts = (Q3_df.groupby(['winner_rank_status', 'winner'], observed=False)
               .size()
               .unstack())
rank_totals = pair_counts.sum(axis=1)

# Percentage of each winner WITHIN its rank status, so every stacked bar adds
# up to 100. Dividing the table by its row totals avoids groupby().apply(),
# which would prepend the group key to the index a second time on recent
# pandas versions.
Q3_rank_winner_df = pair_counts.mul(100).div(rank_totals, axis=0).round(2)

ax = Q3_rank_winner_df.plot(
    kind='bar',
    stacked=True,
    figsize=(14, 7),
    rot=0,
    title='Effect of piece colour and winner rating status on the result',
    color=['black', 'grey', 'white'],
    edgecolor='black')

# One label colour per bar container, chosen to stay readable on the
# black / grey / white bar fills.
for bars, color in zip(ax.containers, ['skyblue', 'navy', 'darkblue']):
    ax.bar_label(bars, label_type='center', color=color)

# Share of all games that falls into each rank status, written just above
# the corresponding 100 %-high bar (bars sit at x = 0, 1, 2).
Q3_rank_df = rank_totals * 100 / len(Q3_df)
for row, percent in enumerate(Q3_rank_df):
    ax.text(row, 103, f'{percent:.02f} %', color='navy', ha='center', va='center')

ax.margins(y=0.08)  # more space on top
ax.legend(bbox_to_anchor=[1.01, 1.02], loc='upper left')
plt.tight_layout()
plt.show()

Related

How to customize seaborn boxplot with specific color sequence when boxplots have hue

I want to make boxplots with hues but I want to color code it so that each specific X string is a certain color with the hue just being a lighter color. I am able to do a boxplot without a hue. When I incorporate the hue, I get the second boxplot which loses the colors. Can someone help me customize the colors for the figure that contains the hue?
Essentially, it's what the answer to this question does, but with boxplots.
This is my code:
first boxplot
order=['Ash1','E1A','FUS','p53']
colors=['gold','teal','darkorange','royalblue']
color_dict=dict(zip(order,colors))
fig,ax=plt.subplots(figsize=(25,15))
bp=sns.boxplot(data=df_idrs, x=df_idrs["construct"], y=df_idrs['Norm_Ef_IDR/Ef_GS'],ax=ax,palette=color_dict)
sns.stripplot(ax=ax,y='Norm_Ef_IDR/Ef_GS', x='construct', data=df_idrs,palette=color_dict,
jitter=1, marker='o', alpha=0.4,edgecolor='black',linewidth=1, dodge=True)
ax.axhline(y=1,linestyle="--",color='black',linewidth=2)
plt.legend(loc='upper left', bbox_to_anchor=(1.03, 1))
second boxplot
order=['Ash1','E1A','FUS','p53']
colors=['gold','teal','darkorange','royalblue']
color_dict=dict(zip(order,colors))
fig,ax=plt.subplots(figsize=(25,15))
bp=sns.boxplot(data=df_idrs, x=df_idrs["construct"], y=df_idrs['Norm_Ef_IDR/Ef_GS'],ax=ax, hue=df_idrs["location"])
sns.stripplot(y='Norm_Ef_IDR/Ef_GS', x='construct', data=df_idrs, hue=df_idrs["location"],
jitter=1, marker='o', alpha=0.4,edgecolor='black',linewidth=1, dodge=True)
ax.axhline(y=1,linestyle="--",color='black',linewidth=2)
plt.legend(loc='upper left', bbox_to_anchor=(1.03, 1))
The only thing that changed was the palette to hue. I have seen many examples on here but I am unable to get them to work. Using the second code, I have tried the following:
Nothing happens for this one.
for ind, bp in enumerate(ax.findobj(PolyCollection)):
rgb = to_rgb(colors[ind // 2])
if ind % 2 != 0:
rgb = 0.5 + 0.5 * np.array(rgb) # make whiter
bp.set_facecolor(rgb)
I get index out of range for the following one.
for i in range(0,4):
mybox = bp.artists[i]
mybox.set_facecolor(color_dict[order[i]])
Matplotlib stores the boxes in ax.patches, but there are also 2 dummy patches (used to construct the legend) that need to be filtered out. The dots of the stripplot are stored in ax.collections. There are also 2 dummy collections for the legend, but as those come at the end, they don't cause a problem.
Some remarks:
sns.boxplot returns the subplot on which it was drawn; as it is called with ax=ax it will return that same ax
Setting jitter=1 in the stripplot will smear the dots over a width of 1. 1 is the distance between the x positions, and the boxes are only 0.4 wide. To avoid clutter, the code below uses jitter=0.4.
Here is some example code starting from dummy test data:
from matplotlib import pyplot as plt
from matplotlib.legend_handler import HandlerTuple
from matplotlib.patches import PathPatch
from matplotlib.colors import to_rgb
import seaborn as sns
import pandas as pd
import numpy as np

# Reproducible dummy data: 4 constructs x 2 locations x 100 rows each.
np.random.seed(20230215)
order = ['Ash1', 'E1A', 'FUS', 'p53']
colors = ['gold', 'teal', 'darkorange', 'royalblue']
hue_order = ['A', 'B']
df_idrs = pd.DataFrame({'construct': np.repeat(order, 200),
                        'Norm_Ef_IDR/Ef_GS': (np.random.normal(0.03, 1, 800).cumsum() + 10) / 15,
                        'location': np.tile(np.repeat(hue_order, 100), 4)})

fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df_idrs, x='construct', y='Norm_Ef_IDR/Ef_GS', hue='location',
            order=order, hue_order=hue_order, ax=ax)

# One colour per (construct, hue level): the base colour for the first hue
# level, blended progressively towards white (up to 50 %) for later levels.
box_colors = [f + (1 - f) * np.array(to_rgb(c))  # whiten colors depending on hue
              for c in colors for f in np.linspace(0, 0.5, len(hue_order))]

# Keep only the PathPatch artists; this drops the dummy legend patches.
box_patches = [p for p in ax.patches if isinstance(p, PathPatch)]
for patch, color in zip(box_patches, box_colors):
    patch.set_facecolor(color)

# Pass the same order / hue_order as the boxplot so the dot collections are
# matched to box_colors by position regardless of the row order in the data.
sns.stripplot(data=df_idrs, x='construct', y='Norm_Ef_IDR/Ef_GS', hue='location',
              order=order, hue_order=hue_order,
              jitter=0.4, marker='o', alpha=0.4, edgecolor='black', linewidth=1,
              dodge=True, ax=ax)
# zip() stops after len(box_colors) items, so trailing collections (such as
# the legend dummies) are left untouched.
for collection, color in zip(ax.collections, box_colors):
    collection.set_facecolor(color)

ax.axhline(y=1, linestyle='--', color='black', linewidth=2)

# Legend: one entry per hue level, each showing that level's box of every
# construct side by side (HandlerTuple draws all members of the tuple).
handles = [tuple(box_patches[i::len(hue_order)]) for i in range(len(hue_order))]
ax.legend(handles=handles, labels=hue_order, title='hue category',
          handlelength=4, handler_map={tuple: HandlerTuple(ndivide=None, pad=0)},
          loc='upper left', bbox_to_anchor=(1.01, 1))
plt.tight_layout()
plt.show()

Two different graph ticks parameters on Y axes from one table

Considering below table:
country
points
price
England
91.550725
51.681159
India
90.222222
13.333333
Austria
90.190782
30.762772
Germany
89.836321
42.257547
Canada
89.377953
35.712598
d = {'points': [91.5, 90.2, 90.1, 89.8, 89.3],
'price': [51.6, 13.3,30.7, 42.2, 35.7]}
index=['England', 'India','Austria', 'Germany','Canada']
df = pd.DataFrame(index=index,data=d)
fig, ax1 = plt.subplots(figsize = (10,5))
color = 'tab:purple'
ax1.set_xlabel('Country', fontsize=12)
ax1.set_ylabel('Average Ratings', color=color, fontsize=12)
sns.barplot(x=df['points'],y=df.index, color=color)
ax1.tick_params(axis='y', labelcolor=color, labelsize = 12)
ax2 = ax1.twinx()
plt.xlim(12, 92)
color = 'tab:red'
ax2.set_ylabel('Price', color=color, fontsize=12)
sns.barplot(x=df['price'],y=df.index,color=color)
ax2.tick_params(axis='y', labelcolor=color, labelsize = 12)
My question: How can I modify the right side Y axis ticks parameters to price (red), so that it represents the numbers of price column as well as the title.
Pandas: 1.2.4
Seaborn: 0.11.1
Matplotlib: 3.3.4
I assume this comes close to what you want:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

d = {'points': [91.5, 90.2, 90.1, 89.8, 89.3],
     'price': [51.6, 13.3, 30.7, 42.2, 35.7]}
index = ['England', 'India', 'Austria', 'Germany', 'Canada']
df = pd.DataFrame(index=index, data=d)

fig, ax1 = plt.subplots(figsize=(10, 5))

# Left axis: ratings. The target Axes is passed explicitly instead of relying
# on pyplot's "current axes", and labels are set AFTER drawing so they are
# applied last.
color = 'tab:purple'
sns.barplot(x=df['points'], y=df.index, color=color, ax=ax1)
ax1.set_ylabel('Average country rating (in points)', color=color, fontsize=12)  # mention unit for rating
ax1.tick_params(axis='y', labelcolor=color, labelsize=12)

# Right axis: prices, drawn on a twin Axes that shares the x-axis with ax1.
ax2 = ax1.twinx()
ax2.set_xlim(12, 92)
color = 'tab:red'
sns.barplot(x=df['price'], y=df.index, color=color, ax=ax2)
ax2.set_ylabel('Price (in $)', color=color, fontsize=12)  # mention unit for price
ax2.tick_params(axis='y', labelcolor=color, labelsize=12)
ax2.set_yticklabels(df['price'])  # relabel right axis with price values

# No x-label: a 'Country' label is not needed for this output, and the shared
# x-axis carries both points and prices, so a single-column name would mislead.
ax1.set_xlabel("")
plt.show()
Sample output:
However, I hope you take the point into account that Trenton mentioned in a comment (now deleted). This graph is indeed rather difficult to read. The values on the left have their labels on the right, and vice versa.

Hatch color shown in legend different from what I have in the plot

Here I have used scatter to plot the days when volume was unusually high. The problem is that the legend doesn't show the hatch effect that I have added to the scatter: it shows a black hatch, whereas in the plot it is red. I don't want to change this with rcParams['hatch.color'], since I am planning to add another scatter with a different hatch color.
Here is the code:
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
df = yf.download('aapl', '2015-01-01', '2016-01-01')
high_vol = df[df.Volume > df.Volume.std() * 5]
fig, ax1 = plt.subplots(figsize= (15, 6))
ax1.grid(True)
ax1.set_xlim(df.index[0], df.index[-1])
ax1.plot(df.index, df.Adj_close, label= 'Price')
ax1.scatter(high_vol.index, high_vol.Adj_close, color= 'none',
edgecolor= 'r', hatch= '///',
s= 300, label= 'High vol')
ax1.legend()
fig.autofmt_xdate()
plt.show()
Here is the output:
the output file for the plot
Any way to fix this?

How to plot and annotate grouped bars in seaborn / matplotlib

I have a dataframe that looks like this:
I have used a barplot to represent the subscribers for each row. This is what I did:
# Top five channels by subscriber count.
data = channels.sort_values('subscribers', ascending=False).head(5)
chart = sns.barplot(x='name', y='subscribers', data=data)
chart.set_xticklabels(chart.get_xticklabels(), rotation=90)
# Write each bar's height, with thousands separators, 10 points above the bar top.
for p in chart.patches:
    chart.annotate(
        "{:,.2f}".format(p.get_height()),
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha='center', va='center',
        xytext=(0, 10), textcoords='offset points')
Now I want to show the 'video_count' for each user on this same plot. The goal is to compare how the number of subscribers relate to the number of videos. How can I depict this on the chart?
Data
The data needs to be converted to a long format using .melt
Because of the scale of values, 'log' is used for the yscale
All of the categories in 'cats' are included for the example.
Select only the desired columns before melting, or use dfl = dfl[dfl.cats.isin(['sub', 'vc'])] to filter for the desired 'cats'.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# setup dataframe
data = {'vc': [76, 47, 140, 106, 246],
'tv': [29645400, 28770702, 50234486, 30704017, 272551386],
'sub': [66100, 15900, 44500, 37000, 76700],
'name': ['a', 'b', 'c', 'd', 'e']}
df = pd.DataFrame(data)
vc tv sub name
0 76 29645400 66100 a
1 47 28770702 15900 b
2 140 50234486 44500 c
# convert to long form
dfl = (df.melt(id_vars='name', var_name='cats', value_name='values')
.sort_values('values', ascending=False).reset_index(drop=True))
name cats values
0 e tv 272551386
1 c tv 50234486
2 d tv 30704017
Updated as of matplotlib v3.4.2
Use matplotlib.pyplot.bar_label
.bar_label works for matplotlib, seaborn, and pandas plots.
See How to add value labels on a bar chart for additional details and examples with .bar_label.
Tested with seaborn v0.11.1, which is using matplotlib as the plot engine.
# plot
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='name', y='values', data=dfl, hue='cats', ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
ax.set_yscale('log')
for c in ax.containers:
# set the bar label
ax.bar_label(c, fmt='%.0f', label_type='edge', padding=1)
# pad the spacing between the number and the edge of the figure
ax.margins(y=0.1)
Plot with seaborn v0.11.1
Using matplotlib before version 3.4.2
Note that using .annotate and .patches is much more verbose than with .bar_label.
# plot
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='name', y='values', data=dfl, hue='cats', ax=ax)
# Use the tick labels of this Axes (ax); no object named `chart` exists here.
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
ax.set_yscale('log')
# Label every bar with its height, 7 points above the top edge.
for p in ax.patches:
    ax.annotate(f"{p.get_height():.0f}", (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 7), textcoords='offset points')

Create separate distplot from countplot

How can I create distplot from countplot
plt.rcdefaults()
%config InlineBackend.figure_format='retina'
sns.set_style('darkgrid')
ax = sns.countplot(x='Age',hue='Gender',data=df,edgecolor="None")
ax.tick_params(bottom=False, left=False)
ax.set_axisbelow(True)
for rect in ax.patches:
x = rect.get_x() + rect.get_width()/2.
y = rect.get_height()
try:
ax.annotate("{}".format(int(y)), (x,y), ha='center', va='bottom', clip_on=True)
except:
pass
ax.set_xlabel('Age', color='green')
ax.set_ylabel('Count', color='green')
ax.set_title('Countplot for Age(Gender)', color='tomato',weight='bold')
plt.legend(title='Gender', fontsize='large', loc='upper right').get_frame().set_facecolor('white')
plt.tight_layout()
plt.savefig('files\\Countplot_for_Age(Gender).jpg')
I want distplot for 2 Genders either in same plot or separately
Any suggestions or help would be highly appreciated.
The x-axis of a countplot is categorical: it puts one bar for each encountered age, skipping bars when there are no rows for a certain age (21 and 23 in the example). Internally the bars are numbered as 0, 1, 2, ...
The y-axis is the count, which is proportional to the number of rows.
For a distplot, the x-axis shows the ages themselves, and the y-axis is a probability density, whose values are usually quite small (the area under the curve is normalized to be 1).
So, as both the x-axis and the y-axis are different, it is better to use separate subplots.
A distplot can be generated directly from the given data. Passing the same ax results in two distplots in the same subplot. A distplot is a combination of a histogram and a kdeplot. If the histogram isn't needed, hist=False leaves it out, or the kdeplot can be called directly. The shade=True option adds shading to the plot.
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
NF = 50
NM = 10
df = pd.DataFrame({'Age': np.concatenate([np.random.randint(13, 20, NF) + np.random.randint(2, 7, NF),
np.random.randint(15, 23, NM)]),
'Gender': np.repeat(['female', 'male'], (NF, NM))})
df['Age'] = df['Age'].where((df['Age'] != 21) & (df['Age'] != 23), 20)
sns.set_style('darkgrid')
fig, axs = plt.subplots(ncols=2, figsize=(12, 4))
ax = sns.countplot(x='Age', hue='Gender', data=df, edgecolor="None", ax=axs[0])
ax.tick_params(bottom=False, left=False)
ax.set_axisbelow(True)
for rect in ax.patches:
x = rect.get_x() + rect.get_width() / 2.
y = rect.get_height()
ax.annotate(f"{y:.0f}", (x, y), ha='center', va='bottom', clip_on=True)
ax.set_xlabel('Age', color='green')
ax.set_ylabel('Count', color='green')
ax.set_title('Countplot for Age(Gender)', color='tomato', weight='bold')
ax.legend(title='Gender', fontsize='large', loc='upper right').get_frame().set_facecolor('white')
for gender in ('female', 'male'):
# ax2 = sns.kdeplot(df[df['Gender'] == gender]['Age'], shade=True, ax=axs[1], label=gender)
ax2 = sns.distplot(df[df['Gender'] == gender]['Age'], hist=False, kde_kws={'shade': True}, ax=axs[1], label=gender)
ax2.set_axisbelow(True)
ax2.set_xlabel('Age', color='green')
ax2.set_ylabel('probability distribution', color='green')
ax2.set_title('Distplot for Age(Gender)', color='tomato', weight='bold')
ax2.legend(title='Gender', fontsize='large', loc='upper right').get_frame().set_facecolor('white')
plt.tight_layout()
plt.show()

Categories