I'm using Pandas and matplotlib to try to replicate this graph from tableau:
So far, I have this code:
group = df.groupby(["Region","Rep"]).sum()
total_price = group["Total Price"].groupby(level=0, group_keys=False)
total_price.nlargest(5).plot(kind="bar")
Which produces this graph:
It correctly groups the data, but is it possible to get it grouped similar to how Tableau shows it?
You can create some lines and labels using the respective matplotlib methods (ax.text and ax.axhline).
import pandas as pd
import numpy as np; np.random.seed(5)
import matplotlib.pyplot as plt
a = ["West"]*25+ ["Central"]*10+ ["East"]*10
b = ["Mattz","McDon","Jeffs","Warf","Utter"]*5 + ["Susanne","Lokomop"]*5 + ["Richie","Florence"]*5
c = np.random.randint(5,55, size=len(a))
df=pd.DataFrame({"Region":a, "Rep":b, "Total Price":c})
group = df.groupby(["Region","Rep"]).sum()
total_price = group["Total Price"].groupby(level=0, group_keys=False)
gtp = total_price.nlargest(5)
ax = gtp.plot(kind="bar")
#draw lines and titles
count = gtp.groupby("Region").count()
cum = np.cumsum(count)
for i in range(len(count)):
title = count.index.values[i]
ax.axvline(cum[i]-.5, lw=0.8, color="k")
ax.text(cum[i]-(count[i]+1)/2., 1.02, title, ha="center",
transform=ax.get_xaxis_transform())
# shorten xticklabels
ax.set_xticklabels([l.get_text().split(", ")[1][:-1] for l in ax.get_xticklabels()])
plt.show()
Related
The dataframe I created is as follows:
import pandas as pd
import numpy as np
import seaborn as sns
date = pd.date_range('2003-01-01', '2022-11-01', freq='MS').strftime('%Y-%m-%d').tolist()
mom = [np.nan] + list(np.repeat([0.01], 238))
cpi = [100] + list(np.repeat([np.nan], 238))
df = pd.DataFrame(list(zip(date, mom, cpi)), columns=['date','mom','cpi'])
df['date'] = pd.to_datetime(df['date'])
for i in range(1,len(df),1):
df['cpi'][i] = df['cpi'][(i-1)] * (1 + df['mom'][i])
df['yoy'] = df['cpi'].pct_change(periods=12)
Y-axis values not displaying correctly as can be seen below.
sns.lineplot(
x = 'date',
y = 'yoy',
data = df
)
I think the percentage changes I calculated for the yoy column are the cause of the issue. Because there are no issues if I manually fill in the yoy column.
Thanks in advance.
You can use matplotlib to set the axis scaling, as the difference is really subtle in your data:
import matplotlib.pyplot as plt
ax = plt.gca()
ax.set_ylim([df.yoy.min(numeric_only=True), df.yoy.max(numeric_only=True)])
sns.lineplot(
x = 'date',
y = 'yoy',
data = df,
ax = ax
)
With this the result should be more of a stepping function.
You can use something like the max difference to the mean times 1.01 to set the limits a little better, but this is the idea. You can set the axis ticks using ax.set_yticks(ticks=<list of ticks>) (documentation).
I'm trying to plot two histogram using the result of a group by. But the labels just appear in one of the labels.
How can I put the label in both charts?
And how can I put different title for the charts (e.g. first as Men's grade and Second as Woman's grade)
import pandas as pd
import matplotlib.pyplot as plt
microdataEnem = pd.read_csv('C:\\Users\\Lucas\\AppData\\Local\\Programs\\Python\\Python39\\Scripts\\Data Science\\Data Analysis\\Projects\\ENEM\\DADOS\\MICRODADOS_ENEM_2019.csv', sep = ';', encoding = 'ISO-8859-1', nrows=10000)
sex_essaygrade = ['TP_SEXO', 'NU_NOTA_REDACAO']
filter_sex_essaygrade = microdataEnem.filter(items = sex_essaygrade)
filter_sex_essaygrade.dropna(subset = ['NU_NOTA_REDACAO'], inplace = True)
filter_sex_essaygrade.groupby('TP_SEXO').hist()
plt.xlabel('Grade')
plt.ylabel('Number of students')
plt.show()
Instead of using filter_sex_essaygrade.groupby('TP_SEXO').hist() you can try the following format: axs = filter_sex_essaygrade['NU_NOTA_REDACAO'].hist(by=filter_sex_essaygrade['TP_SEXO']). This will automatically title each histogram with the group name.
You'll want to set an the variable axs equal to this histogram object so that you can modify the x and y labels for both plots.
I created some data similar to yours, and I get the following result:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.random.seed(42)
sex_essaygrade = ['TP_SEXO', 'NU_NOTA_REDACAO']
## create two distinct sets of grades
sample_grades = np.concatenate((np.random.randint(low=70,high=100,size=100), np.random.randint(low=80,high=100,size=100)))
filter_sex_essaygrade = pd.DataFrame({
'NU_NOTA_REDACAO': sample_grades,
'TP_SEXO': ['Men']*100 + ['Women']*100
})
axs = filter_sex_essaygrade['NU_NOTA_REDACAO'].hist(by=filter_sex_essaygrade['TP_SEXO'])
for ax in axs.flatten():
ax.set_xlabel("Grade")
ax.set_ylabel("Number of students")
plt.show()
I was trying to plot multiple lmplots in the same figure. But I am getting too many unwanted subplots.
I found another SO link How to plot 2 seaborn lmplots side-by-side? but that also did not help me.
In this example I want 1 row 2 columns.
MWE
# imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# data
df = sns.load_dataset('titanic')
# plot
m,n = 1,2
figsize=(12,8)
cols1 = ['age','fare']
cols2 = ['fare','age']
target = 'survived'
fontsize = 12
fig, ax = plt.subplots(m,n,figsize=figsize)
for i, (col1,col2) in enumerate(zip(cols1,cols2)):
plt.subplot(m,n,i+1)
sns.lmplot(x=col1,y=col2,data=df,
hue=target, palette='Set1',
scatter_kws={'alpha':0.3})
plt.xlabel(col1,fontsize=fontsize)
plt.ylabel(col2,fontsize=fontsize)
plt.tick_params(axis='both', which='major', labelsize=fontsize)
plt.tight_layout()
for i in range(m*n-len(cols1)):
ax.flat[-(i+1)].set_visible(False)
My attempt so far:
df = pd.DataFrame({'x0':[10,20,30,40],
'y0': [100,200,300,400],
'x1':[0.1,0.2,0.3,0.1],
'y1':[0.01,0.02,0.03,0.01],
'target': [0,1,1,1]
})
df1 = df.append(df)
df1 = df1.reset_index(drop=True)
df1['x0'].iloc[len(df):] = df['x1'].to_numpy()
df1['y0'].iloc[len(df):] = df['y1'].to_numpy()
df1['col'] = ['c0']* len(df) + ['c1'] * len(df)
df1 = df1.drop(['x1','y1'],axis=1)
df1 = df1.rename(columns={'x0':'x','y0':'y'})
sns.lmplot(x='x',y='y',hue='target',data=df1,col='col')
Output:
I have three different data sets where I produce a facetplot, each
a = sns.FacetGrid(data1, col="overlap", hue="comp")
a = (g.map(sns.kdeplot, "val",bw=0.8))
b = sns.FacetGrid(data2, col="overlap", hue="comp")
b = (g.map(sns.kdeplot, "val",bw=0.8))
c = sns.FacetGrid(data3, col="overlap", hue="comp")
c = (g.map(sns.kdeplot, "val",bw=0.8))
Each of those plots has three subplots in one row, so in total I have nine plots.
I would like to combine these plots, in a subplots setting like this
f, (ax1, ax2, ax3) = plt.subplots(3,1)
ax1.a
ax2.b
ax3.c
How can I do that?
A FacetGrid creates its own figure. Combining several figures into one is not an easy task. Additionally, there is no such thing as subplot rows which can be added to a figure. So one would need to manipulate the axes individually.
That said, it might be easier to find workarounds. E.g. if the dataframes to show have the same structure as it seems to be from the question code, one can combine the dataframes into a single frame with a new column and use this as the row attribute of the facet grid.
import numpy as np; np.random.seed(3)
import pandas as pd
import seaborn.apionly as sns
import matplotlib.pyplot as plt
def get_data(n=266, s=[5,13]):
val = np.c_[np.random.poisson(lam=s[0], size=n),
np.random.poisson(lam=s[1], size=n)].T.flatten()
comp = [s[0]]*n + [s[1]]*n
ov = np.random.choice(list("ABC"), size=2*n)
return pd.DataFrame({"val":val, "overlap":ov, "comp":comp})
data1 = get_data(s=[9,11])
data2 = get_data(s=[7,19])
data3 = get_data(s=[1,27])
#option1 combine
for i, df in enumerate([data1,data2,data3]):
df["data"] = ["data{}".format(i+1)] * len(df)
data = data1.append(data2)
data = data.append(data3)
bw = 2
a = sns.FacetGrid(data, col="overlap", hue="comp", row="data")
a = (a.map(sns.kdeplot, "val",bw=bw ))
plt.show()
You can also loop over the dataframes and axes to obtain the desired result.
import numpy as np; np.random.seed(3)
import pandas as pd
import seaborn.apionly as sns
import matplotlib.pyplot as plt
def get_data(n=266, s=[5,13]):
val = np.c_[np.random.poisson(lam=s[0], size=n),
np.random.poisson(lam=s[1], size=n)].T.flatten()
comp = [s[0]]*n + [s[1]]*n
ov = np.random.choice(list("ABC"), size=2*n)
return pd.DataFrame({"val":val, "overlap":ov, "comp":comp})
data1 = get_data(s=[9,11])
data2 = get_data(s=[7,19])
data3 = get_data(s=[1,27])
#option2 plot each subplot individually
data = [data1,data2,data3]
bw = 2
fig, axes = plt.subplots(3,3, sharex=True, sharey=True)
for i in range(3):
for j in range(3):
x = data[i]
x = x[x["overlap"] == x["overlap"].unique()[j]]
for hue in x["comp"].unique():
d = x[x["comp"] == hue]
sns.kdeplot(d["val"], ax=axes[i,j], bw=bw, label=hue )
plt.show()
I am trying to show time series lines representing an effort amount using matplotlib and pandas.
I've got my DF's to all to overlay in one plot, however when I do python seems to strip the x axis of the date and input some numbers. (I'm not sure where these come from but at a guess, not all days contain the same data so python has reverted to using an index id number). If I plot any one of these they come up with date on the x-axis.
Any hints or solutions to make the x axis show date for the multiple plot would be much appreciated.
This is the single figure plot with time axis:
Code I'm using to plot is
fig = pl.figure()
ax = fig.add_subplot(111)
ax.plot(b342,color='black')
ax.plot(b343,color='blue')
ax.plot(b344,color='red')
ax.plot(b345,color='green')
ax.plot(b346,color='pink')
ax.plot(fi,color='yellow')
plt.show()
This is the multiple plot fig with weird x axis:
One option would be to manually specify the x-axis based on the DataFrame index, and then plot directly using matplotlib.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# make up some data
n = 100
dates = pd.date_range(start = "2015-01-01", periods = n, name = "yearDate")
dfs = []
for i in range(3):
df = pd.DataFrame(data = np.random.random(n)*(i + 1), index = dates,
columns = ["FishEffort"] )
df.df_name = str(i)
dfs.append(df)
# plot it directly using matplotlib instead of through the DataFrame
fig = plt.figure()
ax = fig.add_subplot()
for df in dfs:
plt.plot(df.index,df["FishEffort"], label = df.df_name)
plt.legend()
plt.show()
Another option would be to concatenate your DataFrames and plot using Pandas. If you give your "FishEffort" field the correct label name when loading the data or via DataFrame.rename then the labels will be specified automatically.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
n = 100
dates = pd.date_range(start = "2015-01-01", periods = n, name = "yearDate")
dfs = []
for i in range(3):
df = pd.DataFrame(data = np.random.random(n)*(i + 1), index = dates,
columns = ["DataFrame #" + str(i) ] )
df.df_name = str(i)
dfs.append(df)
df = pd.concat(dfs, axis = 1)
df.plot()
I've found an answer that does what I want, it seems that calling plt.plot wasn't using the date as the x axis, however calling it using the pandas documentation did the trick.
ax = b342.plot(label='342')
b343.plot(ax=ax, label='test')
b344.plot(ax=ax)
b345.plot(ax=ax)
b346.plot(ax=ax)
fi.plot(ax=ax)
plt.show()
I was wondering if anyone knew hwo to change the labels here?