Plot individual data points in each group after pandas groupby - python

I'd like to group a dataframe using several criteria and then visualize the individual data points in each group using a scatter plot.
import pandas as pd
import seaborn as sns
# Fetch the "tips" example dataset through seaborn.
df_tips = sns.load_dataset('tips')
# Group the rows by three criteria and select the 'tip' column; this only
# builds the groupby object, nothing is plotted yet.
df_tips.groupby(['sex', 'day', 'smoker'])['tip'] # How could I scatter plot individual tip in each group?
Ideally, I'd like to have something that looks like this:

I would do:
import matplotlib.pyplot as plt
import seaborn as sns

# Load the example data and group the tips by the three criteria.
df_tips = sns.load_dataset('tips')
groups = df_tips.groupby(['sex', 'day', 'smoker'])['tip']

fig, ax = plt.subplots()

# Draw one vertical strip of points per group: every tip belonging to the
# i-th group is plotted at x == i.  The group keys are collected in the same
# pass so the tick labels always line up with the strips that were drawn.
labels = []
for i, (key, tips_in_group) in enumerate(groups):
    ax.scatter([i] * len(tips_in_group), tips_in_group)
    labels.append(key)

# One tick per plotted group, labelled with its (sex, day, smoker) key.
ax.set_xticks(list(range(len(labels))))
ax.set_xticklabels(labels, rotation=90)
plt.show()
Output:

I found a simpler way to do this and the plot is more beautiful (I think).
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

df_tips = sns.load_dataset('tips')

# Build a single label per row by joining the three grouping columns with
# '.', so seaborn can treat the combination as one categorical x variable.
df_tips['Groups'] = df_tips[['sex', 'day', 'smoker']].astype(str).agg('.'.join, axis=1)

# One swarm of individual tips per combined group label.
sns.swarmplot(x='Groups', y='tip', data=df_tips)
plt.xticks(
    rotation=90,
    fontweight='light',
    fontsize='x-large',
)
plt.show()
Here is the output:

Related

How to create a FacetGrid stacked barplot using Seaborn?

I am trying to plot a facet_grid with stacked bar charts inside.
I would like to use Seaborn. Its barplot function does not include a stacked argument.
I tried to use FacetGrid.map with a custom callable function.
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
def custom_stacked_barplot(col_day, col_time, col_total_bill, **kwargs):
    """Draw a stacked bar chart of summed total_bill per time, stacked by day.

    Intended as the callable handed to ``FacetGrid.map``.

    Args:
        col_day: Values of the 'day' column for the rows to plot.
        col_time: Values of the 'time' column for the same rows.
        col_total_bill: Values of the 'total_bill' column for the same rows.
        **kwargs: Accepted so extra keyword arguments from the caller do not
            raise; they are not used.
    """
    df_data_graph = pd.DataFrame({
        'day': col_day,
        'time': col_time,
        'total_bill': col_total_bill,
    })
    # Cross-tabulate only the rows that were passed in, instead of reaching
    # for a module-level dataframe that may not exist when this is called.
    df = pd.crosstab(
        index=df_data_graph['time'],
        columns=df_data_graph['day'],
        values=df_data_graph['total_bill'],
        aggfunc='sum',
    )
    # NOTE(review): no ``ax`` is passed here, so pandas presumably opens its
    # own figure instead of drawing on the facet's axes -- this would match
    # the "empty canvas plus separate charts" symptom; confirm.
    df.plot.bar(stacked=True)
# Load the example data the facets are built from.
tips=sns.load_dataset("tips")
# Lay out the facets: one column per 'size' value, one row per 'smoker' value.
g = sns.FacetGrid(tips, col='size', row='smoker')
# Hand the custom plotter the 'day', 'time' and 'total_bill' columns.
g = g.map(custom_stacked_barplot, "day", 'time', 'total_bill')
However I get an empty canvas and stacked bar charts separately.
Empty canvas:
Graph1 apart:
Graph2:.
How can I fix this issue? Thanks for the help!
The simplest code to achieve that result is this:
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's default theme.
sns.set()
tips=sns.load_dataset("tips")
# Facet by 'size' (columns) and 'smoker' (rows); colour the bars by 'day'.
g = sns.FacetGrid(tips, col = 'size', row = 'smoker', hue = 'day')
# Draw a barplot of total_bill against time in every facet, without
# confidence intervals (ci = None), then attach the hue legend.
# NOTE(review): bars for the different 'day' levels appear to be drawn at the
# same x positions, i.e. overlaid rather than summed -- confirm this is the
# "stacked" look that is wanted.
g = (g.map(sns.barplot, 'time', 'total_bill', ci = None).add_legend())
plt.show()
which gives this result:
Your mix of APIs does not appear to integrate: pandas.DataFrame.plot does not seem to work together with seaborn.FacetGrid. Since stacked bar plots are not supported by seaborn's plotting functions, consider developing your own version with matplotlib subplots by iterating across the groupby levels:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def custom_stacked_barplot(t, sub_df, ax):
    """Draw one stacked bar chart for a single group onto ``ax``.

    Args:
        t: Iterable of group-key levels; joined with " | " to form the title.
        sub_df: Rows of the group; must provide the 'time', 'day' and
            'total_bill' columns.
        ax: Matplotlib axes to draw on.

    Returns:
        Whatever ``DataFrame.plot`` returns for the drawn chart.
    """
    # Rows = time, columns = day, cells = summed total_bill for this group.
    plot_df = pd.crosstab(
        index=sub_df["time"],
        columns=sub_df["day"],
        values=sub_df["total_bill"],
        aggfunc="sum",
    )
    return plot_df.plot(
        kind="bar",
        stacked=True,
        ax=ax,
        title=" | ".join(str(level) for level in t),
    )
tips = sns.load_dataset("tips")
g_dfs = tips.groupby(["smoker", "size"])

# Materialise the groups once, ordered by key, skipping groups with no rows
# so the grid below is sized from what will actually be drawn.
groups = sorted(
    ((key, sub_df) for key, sub_df in g_dfs if not sub_df.empty),
    key=lambda item: item[0],
)

# INITIALIZE PLOT
# sns.set()
n_rows = 2
n_cols = max(1, (len(groups) + n_rows - 1) // n_rows)  # ceil(len / n_rows)
# squeeze=False keeps ``axes`` two-dimensional even for a single column.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 6),
                         squeeze=False)
flat_axes = axes.ravel()

# BUILD PLOTS ACROSS LEVELS
for ax, (i, g) in zip(flat_axes, groups):
    custom_stacked_barplot(i, g, ax)

# Hide grid cells that did not receive a group (odd number of groups).
for ax in flat_axes[len(groups):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()
plt.clf()
plt.close()
And use seaborn.set to adjust the theme and palette:

How to add a marker from a different column to a seaborn pandas barplot

I have the following dataset, code and plot:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Each row holds the values for the 'Name', 'Attempts' and
# 'L4AverageAttempts' columns, in that order.
data = [['tom', 10,15], ['matt', 13,10]]
df3 = pd.DataFrame(data, columns = ['Name', 'Attempts','L4AverageAttempts'])
f,ax = plt.subplots(nrows=1,figsize=(16,9))
# Bar chart with 'Attempts' on the x axis and 'Name' on the y axis.
sns.barplot(x='Attempts',y='Name',data=df3)
plt.show()
How can I get a marker of some description (dot, *, shape, etc.) to show that tom has averaged 15 (so is below his average) and matt has averaged 10 (so is above his average)? In other words, a marker based on the L4AverageAttempts value for each person.
I have looked into axvline but that seems to be only a set number rather than a specific value for each y axis category. Any help would be much appreciated! thanks!
You can simply plot a scatter plot on top of your bar plot using L4AverageAttempts as the x value:
You can use seaborn.scatterplot for this. Make sure to set the zorder parameter so that the markers appear on top of the bars.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

data = [['tom', 10, 15], ['matt', 13, 10]]
df3 = pd.DataFrame(data, columns=['Name', 'Attempts', 'L4AverageAttempts'])

f, ax = plt.subplots(nrows=1, figsize=(16, 9))
# Both layers are drawn on the same axes explicitly.
sns.barplot(x='Attempts', y='Name', data=df3, ax=ax)
# zorder=10 is set so the markers are drawn on top of the bars.
sns.scatterplot(x='L4AverageAttempts', y='Name', data=df3, zorder=10,
                color='k', edgecolor='k', ax=ax)
plt.show()

Modifying Seaborn Violinplot to show means not median

Currently displaying some data with Seaborn / Pandas. I'm looking to overlay the mean of each category (x=ks2) - but can't figure out how to do this with Seaborn.
I can remove the inner="box" - but want to replace that with a marker for the mean of each category.
Ideally, then link each mean calculated...
Any pointers greatly received.
Cheers
Science.csv has 9k+ entries
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
# Global seaborn look: white grid background, pastel palette, colour codes on.
sns.set(style="whitegrid", palette="pastel", color_codes=True)
# Load the dataset
# df = pd.read_csv("science.csv") << loaded from csv
# Small stand-in frame with the same two columns used below.
df = pd.DataFrame({'ks2': [1, 1, 2,3,3,4],
'science': [40, 50, 34,20,0,44]})
# Draw 'science' as violins over the 'ks2' categories; inner="box" is the
# option the question wants to replace with a mean marker.
sns.violinplot(x="ks2", y="science", data=df, split=True,
inner="box",linewidth=2)
# Remove the left spine (in addition to despine's defaults).
sns.despine(left=True)
# Write the current figure to plot.png.
plt.savefig('plot.png')
Try this:
from numpy import mean
Then overlay sns.pointplot with estimator=mean:
# Point plot of 'science' per 'ks2' category, aggregated with numpy's mean.
sns.pointplot(x = 'ks2', y='science', data=df, estimator=mean)
Then play with the linestyles argument to adjust the line that links the means.

How to use time as x axis for a scatterplot with seaborn?

I have a simple dataframe with the time as index and dummy values as an example.
I did a simple scatter plot as you see here:
Simple question: How to adjust the xaxis, so that all time values from 00:00 to 23:00 are visible in the xaxis? The rest of the plot is fine, it shows all the datapoints, it is just the labeling. Tried different things but didn't work out.
All my code so far is:
import pandas as pd
import seaborn as sns
import matplotlib.dates as mdates
from datetime import time
# One row per hour of the day: [time(hour), hour].
data = [[time(hour), hour] for hour in range(0, 24)]
my_df = pd.DataFrame(data, columns=["time", "values"])
my_df.set_index(['time'], inplace=True)
my_df  # notebook-style display of the frame
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
fig = sns.scatterplot(x=my_df.index, y=my_df['values'])
fig.set(xlabel='time', ylabel='values')
I think you're gonna have to go down to the matplotlib level for this:
import pandas as pd
import seaborn as sns
import matplotlib.dates as mdates
from datetime import time
import matplotlib.pyplot as plt
# One row per hour of the day: [time(hour), hour].
data = [[time(hour), hour] for hour in range(0, 24)]
df = pd.DataFrame(data, columns=["time", "values"])

# Turn the datetime.time values into full timestamps so matplotlib's date
# locators/formatters can be applied.  The values are stringified first
# ("HH:MM:SS") so the parse does not depend on pandas accepting
# datetime.time objects directly.
df["time"] = pd.to_datetime(df["time"].astype(str), format="%H:%M:%S")
df.set_index(["time"], inplace=True)

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
ax = sns.scatterplot(x=df.index, y=df["values"])
ax.set(xlabel="time", ylabel="measured values")
# Span the axis exactly from the first to the last timestamp.
ax.set_xlim(df.index[0], df.index[-1])
# One major tick per hour, rendered as HH:MM:SS and rotated for legibility.
ax.xaxis.set_major_locator(mdates.HourLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M:%S"))
ax.tick_params(axis="x", rotation=45)
plt.show()
This produces
I think you have two options:
Convert the time to the hour only; for that, just extract the hour into a new column in your df:
# Extract the hour from every time value (works for datetime.time, datetime
# and pandas Timestamp objects alike).
# NOTE(review): assumes the time values live in a 'time' column -- adjust if
# they are stored in the index instead.
df['hour_'] = [t.hour for t in df['time']]
Then use it as your x-axis.
If you need the time in the format you described, it may cause a visibility problem in which the timestamps overlap each other. I'm using the following:
# Rotate the x tick labels by 45 degrees and right-align them.
plt.xticks(rotation=45, horizontalalignment='right')
# Let MaxNLocator(12) pick the x tick positions, which limits how many
# ticks are drawn.
ax.xaxis.set_major_locator(plt.MaxNLocator(12))
So first I rotate the text, then I limit the number of ticks.
Here is a full script where I used it:
sns.set()
sns.set_style("whitegrid")
# (A bare sns.axes_style("whitegrid") call only returns the style parameters
# without applying them, so it is not repeated here.)

# One figure per value of 'your_column'.
for k, g in df_forPlots.groupby('your_column'):
    fig = plt.figure(figsize=(10, 5))
    # set_index returns a new frame, which avoids modifying a slice of the
    # group in place.
    wide_df = g[['x', 'y', 'z']].set_index('x')
    ax = sns.lineplot(data=wide_df)
    plt.xticks(rotation=45, horizontalalignment='right')
    # Limit how many ticks are drawn on each axis.
    ax.yaxis.set_major_locator(plt.MaxNLocator(14))
    ax.xaxis.set_major_locator(plt.MaxNLocator(35))
    plt.title(f"your {k} in somthing{g.z.unique()}")
    plt.tight_layout()
Hope I helped.

Plot aggregate groupby Count data in SeaBorn Python?

How do I use a groupby result on the y-axis? The code below doesn't display what I expect, due to y = df.groupby('column1')['column2'].count()
import seaborn as sns
import pandas as pd
sns.set(style="whitegrid", color_codes=True)
# NOTE(review): y is given the result of a groupby count (one value per
# 'column1' group) rather than a column of df, while x refers to a column of
# df -- this mismatch is what the question is about.
sns.stripplot(x="column1", y = df.groupby('column1')['column2'].count(), data=df)
Seaborn just doesn't work that way. In seaborn, you specify the x and y columns as well as the data frame. Seaborn will do the aggregation itself.
import seaborn as sns
# Name the x and y columns and pass the frame via data.
sns.stripplot(x='column1', y='column2', data=df)
For the count, maybe what you need is countplot
# The column is passed by keyword, as required by recent seaborn releases.
sns.countplot(x='column1', data=df)
The equivalent pandas code is:
# Count the rows in each 'column1' group and draw the counts as a bar chart.
df.groupby('column1').size().plot(kind='bar')
This code will create the horizontal-bar equivalent of a count plot, with the values sorted in descending order:
fig, ax = plt.subplots(figsize=(10, 16))
# Rows per 'Age' value, sorted in descending order, then drawn as horizontal
# bars on the axes created above.
age_counts = df.groupby('Age').size().sort_values(ascending=False)
grouped = age_counts.plot(kind='barh', ax=ax)

Categories