I am drawing boxplots with Python Seaborn package. I have facet grid with both rows and columns. That much I've been able to do with the Seaborn function catplot.
I also want to annotate the outliers. I have found some nice examples at SO for annotating the outliers but without facet structure. That's where I'm struggling.
Here is what I've got (borrows heavily from this post: Boxplot : Outliers Labels Python):
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.cbook import boxplot_stats
sns.set_style('darkgrid')

# Synthetic data: 10 months with 10 observations each, alternating between
# the two facet rows "up" and "down".
Month = np.repeat(np.arange(1, 11), 10)
Id = np.arange(1, 101)
Value = np.random.randn(100)
Row = ["up", "down"]*50
df = pd.DataFrame({'Value': Value, 'Month': Month, 'Id': Id, 'Row': Row})

# One box per month, one facet (grid row) per value of "Row".
g = sns.catplot(data=df, x="Month", y="Value", row="Row", kind="box", height=3, aspect=3)

for name, group in df.groupby(["Month", "Row"]):
    # boxplot_stats returns one stats dict per dataset; collect its outliers.
    fliers = [y for stat in boxplot_stats(group["Value"]) for y in stat["fliers"]]
    d = group[group["Value"].isin(fliers)]
    # This is the line that fails: g.axes.flatten() is a NumPy array of Axes,
    # not a single Axes, so it has no annotate() method.
    g.axes.flatten().annotate(d["Id"], xy=(d["Month"] - 1, d["Value"]))
The dataframe d collects all the outliers by patch. The last line aims to match d with the graph g patches. However, that doesn't work, but I haven't found a way to flatten axes to a list where each element would correspond to a grouped dataframe element.
I'd be glad to hear alternative versions for achieving this too.
One way to do it:
# Annotate every outlier with its Id on the facet that shows its "Row" value.
for (month, row), group in df.groupby(["Month", "Row"]):
    fliers = boxplot_stats(group["Value"])[0]["fliers"]
    outliers = group[group["Value"].isin(fliers)]
    # Look the facet up by its row name instead of a hard-coded position.
    ax = g.axes_dict[row]
    # Columns are addressed by name, so the column order of df does not matter.
    for value, point_id in zip(outliers["Value"], outliers["Id"]):
        # Boxes are drawn at x = 0, 1, 2, ... while months start at 1.
        ax.annotate(str(point_id), xy=(month - 1, value))
You can loop through g.axes_dict to visit each of the axes.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.cbook import boxplot_stats
sns.set_style('darkgrid')

# Synthetic data: 10 months with 10 observations each, alternating between
# the two facet rows "up" and "down".
Month = np.repeat(np.arange(1, 11), 10)
Id = np.arange(1, 101)
Value = np.random.randn(100)
Row = ["up", "down"] * 50
df = pd.DataFrame({'Value': Value, 'Month': Month, 'Id': Id, 'Row': Row})

g = sns.catplot(data=df, x="Month", y="Value", row="Row", kind="box", height=3, aspect=3)

# Numeric categories are drawn in sorted order at x = 0, 1, 2, ..., so the
# position of a month is its index among the sorted unique months. This keeps
# working when months do not start at 1 or are not contiguous.
months = np.unique(df["Month"])

for row, ax in g.axes_dict.items():
    in_row = df["Row"] == row
    for pos, month in enumerate(months):
        group = df.loc[in_row & (df["Month"] == month), :]
        fliers = boxplot_stats(group["Value"])[0]["fliers"]
        # An empty flier array simply selects no rows, so no length check is needed.
        outliers = group[group["Value"].isin(fliers)]
        for val, point_id in zip(outliers["Value"], outliers["Id"]):
            ax.annotate(f' {point_id}', xy=(pos, val))

plt.tight_layout()
plt.show()
Related
I have data with values of A, B, C and D recorded on different dates. I want to make a stripplot of these points such that data points from more recent dates are shaded darker (or have a higher alpha value) than data points from earlier dates.
This is what I have right now. All I need is to shade the points by date within each bucket, but I have not been able to figure out how.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mlp

plt.style.use("ggplot")

# One observation per Monday; build the date index once and reuse its length
# for every random column.
dates = pd.date_range(start="2020-01-06", end="2020-08-10", freq="W-MON")
n_weeks = len(dates)
data = pd.DataFrame({"Date": dates,
                     "A": np.random.randint(-5, 50, size=n_weeks),
                     "B": np.random.randint(-5, 50, size=n_weeks),
                     "C": np.random.randint(-10, 50, size=n_weeks),
                     "D": np.random.randint(9, 50, size=n_weeks)})
data.set_index("Date", inplace=True)
data.head()

# With a wide-form frame, catplot draws one strip per column.
sns.catplot(data=data, aspect=15/6, height=6)
This is the result of the above code
A scatter plot with randomized x-displacements can be used to apply one colormap per column.
To illustrate the effect, the example below uses random data with the most recent values being the largest.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
plt.style.use("ggplot")

# Weekly (Monday) timestamps; N is the number of observations per column.
dates = pd.date_range(start="2020-01-06", end="2020-08-10", freq="W-MON")
N = len(dates)

# column name -> (starting level, lowest step, highest step) of a random walk
walks = {"A": (30, -5, 8), "B": (20, -4, 9), "C": (25, -4, 7), "D": (40, -2, 8)}
data = pd.DataFrame({"Date": dates})
for label, (start, low, high) in walks.items():
    data[label] = start + np.random.uniform(low, high, N).cumsum()
data.set_index("Date", inplace=True)

columns = data.columns
palettes = ['Reds', 'Blues', 'Greens', 'Purples']
for col_id, column in enumerate(columns):
    # Random horizontal jitter spreads the points of one column apart.
    jitter = np.random.uniform(-0.2, 0.2, N)
    # c=range(N) colours the points by row order, i.e. by date.
    plt.scatter(col_id + jitter, data[column], c=range(N), cmap=palettes[col_id])
plt.xticks(range(len(columns)), columns)
plt.show()
I was trying to plot multiple lmplots in the same figure. But I am getting too many unwanted subplots.
I found another SO link How to plot 2 seaborn lmplots side-by-side? but that also did not help me.
In this example I want 1 row 2 columns.
MWE
# imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# data
df = sns.load_dataset('titanic')

# plot
m,n = 1,2
figsize=(12,8)
cols1 = ['age','fare']
cols2 = ['fare','age']
target = 'survived'
fontsize = 12

fig, ax = plt.subplots(m,n,figsize=figsize)
for i, (col1,col2) in enumerate(zip(cols1,cols2)):
    plt.subplot(m,n,i+1)
    # NOTE: sns.lmplot is a figure-level function. Each call builds its own
    # FacetGrid figure and does not draw into the axes selected by
    # plt.subplot above, which is where the unwanted extra figures come from.
    sns.lmplot(x=col1,y=col2,data=df,
               hue=target, palette='Set1',
               scatter_kws={'alpha':0.3})
    plt.xlabel(col1,fontsize=fontsize)
    plt.ylabel(col2,fontsize=fontsize)
    plt.tick_params(axis='both', which='major', labelsize=fontsize)

plt.tight_layout()

# Hide any grid cells that have no (col1, col2) pair to show.
for i in range(m*n-len(cols1)):
    ax.flat[-(i+1)].set_visible(False)
My attempt so far:
df = pd.DataFrame({'x0': [10, 20, 30, 40],
                   'y0': [100, 200, 300, 400],
                   'x1': [0.1, 0.2, 0.3, 0.1],
                   'y1': [0.01, 0.02, 0.03, 0.01],
                   'target': [0, 1, 1, 1]
                   })

# Stack the (x0, y0) and (x1, y1) pairs into one long frame with columns
# x, y, target, col, where col ('c0' / 'c1') records which pair a row came from.
# pd.concat replaces DataFrame.append (removed in pandas 2.0) and avoids the
# chained df1['x0'].iloc[...] = ... assignments.
panels = []
for label, (x_col, y_col) in {'c0': ('x0', 'y0'), 'c1': ('x1', 'y1')}.items():
    panel = df[[x_col, y_col, 'target']].rename(columns={x_col: 'x', y_col: 'y'})
    panels.append(panel.assign(col=label))
df1 = pd.concat(panels, ignore_index=True)
# One regression panel per value of 'col', coloured by 'target'.
sns.lmplot(
    data=df1,
    x='x',
    y='y',
    hue='target',
    col='col',
)
Output:
I'm using Pandas and matplotlib to try to replicate this graph from tableau:
So far, I have this code:
# Total every column per (Region, Rep) pair and keep the "Total Price" totals.
rep_totals = df.groupby(["Region", "Rep"]).sum()["Total Price"]
# Regroup by the outer index level (Region); group_keys=False keeps the
# group key from being added to the result index a second time.
by_region = rep_totals.groupby(level=0, group_keys=False)
top_five = by_region.nlargest(5)
top_five.plot(kind="bar")
Which produces this graph:
It correctly groups the data, but is it possible to get it grouped similar to how Tableau shows it?
You can create some lines and labels using the respective matplotlib methods (ax.text and ax.axhline).
import pandas as pd
import numpy as np; np.random.seed(5)
import matplotlib.pyplot as plt
a = ["West"]*25+ ["Central"]*10+ ["East"]*10
b = ["Mattz","McDon","Jeffs","Warf","Utter"]*5 + ["Susanne","Lokomop"]*5 + ["Richie","Florence"]*5
c = np.random.randint(5,55, size=len(a))
df=pd.DataFrame({"Region":a, "Rep":b, "Total Price":c})

group = df.groupby(["Region","Rep"]).sum()
total_price = group["Total Price"].groupby(level=0, group_keys=False)
gtp = total_price.nlargest(5)
ax = gtp.plot(kind="bar")

# draw lines and titles
# sort=False keeps the regions in the order in which their bars are drawn.
count = gtp.groupby("Region", sort=False).count()
cum = count.cumsum()
# Iterate over the values directly; integer keys such as cum[i] on a
# string-labelled Series rely on a deprecated positional fallback.
for title, n_bars, end in zip(count.index, count, cum):
    # Separator just right of the last bar of this region.
    ax.axvline(end - .5, lw=0.8, color="k")
    # Title centred over the region's bars, slightly above the axes.
    ax.text(end - (n_bars + 1) / 2., 1.02, title, ha="center",
            transform=ax.get_xaxis_transform())

# shorten xticklabels: take the Rep names from the index instead of parsing
# the "(Region, Rep)" label text
ax.set_xticklabels(gtp.index.get_level_values("Rep"))
plt.show()
I have a Dataframe and I slice the Dataframe into three subsets. Each subset has 3 to 4 rows of data. After I slice the data frame into three subsets, I plot them using Matplotlib.
The problem is that I cannot get each subplot to be drawn from its own slice of the DataFrame. For example, for a group of three rows, only the last subplot is drawn; the other two subplots in that group stay empty. It looks like the 'r' value is not passed to 'r.plot' for all three subplots.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
# Rows 0-2 get key 1, rows 3-6 key 2, rows 7-9 key 3. Assigning the whole
# column at once avoids chained assignment (df.key1.iloc[...] = ...).
df['key1'] = np.repeat([1, 2, 3], [3, 4, 3])

df_grouped = df.groupby('key1')
for group_name, group_value in df_grouped:
    rows, columns = group_value.shape
    # One figure per group, with one subplot per row of the group.
    fig, axes = plt.subplots(rows, 1, sharex=True, sharey=True, figsize=(15,20))
    for i,r in group_value.iterrows():
        r = r[0:columns-1]  # drop the trailing key1 entry
        # NOTE: no ax= is passed, so every call draws on the current axes
        # (the last subplot created) and the other subplots stay empty.
        r.plot(kind='bar', fill=False, log=False)
I think you might want what I call df_subset to be summarized in some way, but here's a way to plot each group in its own panel.
# Your Code Setting Up the Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
# Rows 0-2 get key 1, rows 3-6 key 2, rows 7-9 key 3. Assigning the whole
# column at once avoids chained assignment (df.key1.iloc[...] = ...).
df['key1'] = np.repeat([1, 2, 3], [3, 4, 3])

# My Code to Plot in Three Panels
distinct_keys = df['key1'].unique()
fig, axes = plt.subplots(len(distinct_keys), 1, sharex=True, figsize=(3,5))
# plt.subplots returns a bare Axes rather than an array when there is one key.
axes = np.atleast_1d(axes)
for ax, key in zip(axes, distinct_keys):
    df_subset = df[df.key1==key]
    # {maybe insert a line here to summarize df_subset somehow interesting?}
    # plot: pass ax= so the bars land in this group's panel; assigning the
    # return value to axes[i] would open a separate figure per group instead.
    df_subset.plot(kind='bar', fill=False, log=False, ax=ax)
Plotting histogram on a seaborn PairGrid with hue leads to stacking by default. Is there a way to avoid this ? (stacked=False is inefficient.)
I tried with seaborn.distplot, kde=False but the bars are too wide in my case and decreasing rwidth kind of shifts the bars away from the corresponding variable values (which does not happen with plt.hist).
EDIT code to illustrate so-called 'shifting away from the corresponding variable values' (actually plt.hist does it too but it is less obvious).
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Two groups ('a' and 'b') with identical values: 1 and 10 alternating.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index=True gives the combined frame a unique index.
frames = [
    pd.DataFrame({'name': [n] * 100,
                  'prior': [1, 10] * 50,
                  'post': [1, 10] * 50})
    for n in ['a', 'b']
]
df = pd.concat(frames, ignore_index=True)

# Variant 1: matplotlib histogram on the diagonal.
g = sns.PairGrid(df, hue='name', diag_sharey=False)
g.map_offdiag(sns.regplot, fit_reg=False, x_jitter=.1)
g.map_diag(plt.hist, rwidth=0.2, stacked=False)

# Variant 2: seaborn distplot on the diagonal.
# NOTE: sns.distplot is deprecated since seaborn 0.11; it is kept here
# because it is the function the question is about.
g = sns.PairGrid(df, hue='name', diag_sharey=False)
g.map_offdiag(sns.regplot, fit_reg=False, x_jitter=.1)
g.map_diag(sns.distplot, kde=False, hist_kws={'rwidth':0.2})