Merge Count Plot and Mean in same plot SNS - python

I am trying to create a count plot and also add another plot on it which would actually be the mean of the other columns.
The sample data is in the below link:
Sample Data
I have used the below code to create the sns count plot:
# Load the data and sort the rows by the Business column (in place).
df = pd.read_csv("latestfile.csv")
df.sort_values(by=["Business"],inplace=True)
# Count plot: one bar per Business value, split into hue groups by location.
sns.countplot(data=df,x=df["Business"],hue="location")
and I generate the below:
Now I use the groupby and use the below code to get the desired data:
# Mean of Ageing for every (Business, location) pair, with the group keys
# turned back into ordinary columns by reset_index().
dfg = df.groupby(["Business","location"])['Ageing'].mean().reset_index()
# Use Business as the index of the aggregated frame.
dfg.set_index("Business",inplace=True)
But how do I plot this on the same count plot, using a different y-axis?
I am unable to think of a way to do it.
Below is what I am finally looking for:

Of course, you can squeeze another bar plot into the countplot graph:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

df = pd.read_csv("test.csv")
df.sort_values(by=["Business"], inplace=True)

# Primary y-axis: number of rows per Business, split by location.
ax1 = sns.countplot(data=df, x="Business", hue="location", palette="muted", edgecolor="black")
# Shift every count bar to the right by 30% of its width so it is not
# completely hidden behind the bars drawn on the twin axis below.
for patch in ax1.patches:
    patch.set_x(patch.get_x() + 0.3 * patch.get_width())
ax1.legend(title="Count")

# Secondary y-axis sharing the same x-axis: Ageing aggregated per bar
# (seaborn's barplot uses the mean as its default estimator).
ax2 = ax1.twinx()
# NOTE(review): ci=None disables the error bars; newer seaborn releases
# (>= 0.12) spell this errorbar=None -- confirm against the installed version.
sns.barplot(data=df, x="Business", y="Ageing", hue="location", palette="bright", ci=None, ax=ax2, edgecolor="white")
ax2.legend(title="Ageing")

# Recompute the view limits of the first axis after its bars were moved.
ax1.autoscale_view()
plt.show()
However, I would definitely prefer two subplots:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

df = pd.read_csv("test.csv")
df.sort_values(by=["Business"], inplace=True)

# Two side-by-side panels: counts on the left, aggregated Ageing on the right.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
ax1, ax2 = axes

# Arguments shared by both plots.
common = {"data": df, "x": "Business", "hue": "location"}

sns.countplot(ax=ax1, **common)
ax1.legend(title="Count")

sns.barplot(y="Ageing", ci=None, ax=ax2, **common)
ax2.legend(title="Ageing")

plt.show()
Since you now prefer to show the distribution, you can combine the countplot with a stripplot:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

df = pd.read_csv("test.csv")
df.sort_values(by=["Business"], inplace=True)

# Counts on the primary y-axis.
ax1 = sns.countplot(data=df, x="Business", hue="location")

# Individual Ageing values on a twin y-axis that shares the x-axis.
ax2 = ax1.twinx()
sns.stripplot(
    data=df,
    x="Business",
    y="Ageing",
    hue="location",
    jitter=True,
    dodge=True,
    linewidth=1,
    ax=ax2,
)

# Keep only the legend of the count plot; drop the one stripplot created.
ax1.legend(title="", loc="upper center")
ax2.get_legend().remove()

plt.show()

Related

Python Seaborn heatmap with custom order on both axes and values from a frequency table (data included)

I have this data in a frequency table. I just want to be able to create a heatmap with Fac1 on Y axis, Fac2 on X axis and the frequency values as heatmap. The order of the Factors in Fac1 and Fac2 must be maintained in the same sequence (after removing duplicates from both Fac1 and Fac2 columns). I haven't been able to get this working after so many tries but I've managed to get the data in order and the simplest representation. I'd greatly appreciate any help in this.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
# Frequency table downloaded as CSV.
url = "https://raw.githubusercontent.com/rroyss/stack/main/dfso.csv"
df = pd.read_csv(url)
plt.subplots(figsize=(15,30))
# Move the x tick marks and labels from the bottom to the top of the plot.
plt.tick_params(axis='both', which='major', labelsize=10, labelbottom = False, bottom=False, top = True, labeltop=True)
# NOTE(review): df is passed to heatmap exactly as read from the CSV, i.e.
# without being reshaped into a Fac1 x Fac2 table first.
sns.heatmap(df, cmap="Blues", linewidth=1, xticklabels=True, yticklabels=True)
You have to reshape your dataframe into a pivot table if you want to use heatmap:
# Remove repeated rows, then reshape to a Fac1 (rows) x Fac2 (columns) table;
# sort=False asks pivot_table not to sort the resulting labels.
deduplicated = df.drop_duplicates()
df2 = deduplicated.pivot_table(
    index='Fac1',
    columns='Fac2',
    values='Frequency Fac1-Fac2 pair',
    sort=False,
)

plt.subplots(figsize=(15, 30))
# Show the x tick marks and labels at the top instead of the bottom.
tick_options = dict(
    axis='both',
    which='major',
    labelsize=10,
    labelbottom=False,
    bottom=False,
    top=True,
    labeltop=True,
)
plt.tick_params(**tick_options)
sns.heatmap(df2, cmap="Blues", linewidth=1, xticklabels=True, yticklabels=True)
This is the result (zoomed on the first rows and columns):
You first need to reorganize the dataframe such that Fac1 becomes the index, Fac2 the columns, and the values are aggregated from the third column. E.g. df_pivoted = df.pivot_table(index='Fac1', columns='Fac2', values='Frequency Fac1-Fac2 pair').
The heatmap will use the order provided by the columns and index as created by pivot_table. Keeping the original order is a bit tricky, but can be achieved by pd.Categorical (which forces an order) combined by pd.unique() (which keeps the original order, unlike np.unique).
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

url = "https://raw.githubusercontent.com/rroyss/stack/main/dfso.csv"
df = pd.read_csv(url)

# Turn both factor columns into categoricals whose category order is the
# order of first appearance (pd.unique keeps that order).
for factor in ('Fac1', 'Fac2'):
    first_seen = pd.unique(df[factor])
    df[factor] = pd.Categorical(df[factor], categories=first_seen)

df_pivoted = df.pivot_table(
    index='Fac1',
    columns='Fac2',
    values='Frequency Fac1-Fac2 pair',
)

fig, ax = plt.subplots(figsize=(20, 30))
sns.heatmap(data=df_pivoted, cmap='Blues', xticklabels=True, yticklabels=True, ax=ax)

# Tick marks and labels go on top; x labels are rotated to stay readable.
ax.tick_params(
    axis='both',
    which='major',
    labelsize=10,
    labeltop=True,
    top=True,
    labelbottom=False,
    bottom=False,
)
ax.tick_params(axis='x', labelrotation=90)

plt.tight_layout()
plt.show()
If you are aiming for a 2d histogram or kde plot where the last column is intended as weights:
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

url = "https://raw.githubusercontent.com/rroyss/stack/main/dfso.csv"
df = pd.read_csv(url)

# Convert the factor labels to integers by cutting off a fixed-length prefix.
# NOTE(review): assumes every Fac1 label has a 5-character prefix and every
# Fac2 label a 6-character prefix before the number -- confirm against the CSV.
df['Fac1'] = [int(f[5:]) for f in df['Fac1']]
df['Fac2'] = [int(f[6:]) for f in df['Fac2']]

# Left: weighted 2D histogram; right: weighted 2D KDE. Both use the
# frequency column as weights.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 10))
sns.histplot(data=df, x='Fac1', y='Fac2', weights='Frequency Fac1-Fac2 pair', bins=20, color='blue', ax=ax1)
sns.kdeplot(data=df, x='Fac1', y='Fac2', weights='Frequency Fac1-Fac2 pair', color='blue', ax=ax2)

# Same tick label size on both panels.
for ax in (ax1, ax2):
    ax.tick_params(axis='both', which='major', labelsize=10)

plt.tight_layout()
plt.show()

pandas barplot choose color for each variable

I usually use matplotlib, but was playing with pandas plotting and experienced unexpected behaviour. I was assuming the following would return red and green edges rather than alternating. What am I missing here?
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({"col1":[1,2,4,5,6], "col2":[4,5,1,2,3]})


def amounts(df):
    """Draw col1 and col2 as stacked, unfilled bars with coloured edges.

    NOTE: the plotting call is kept exactly as posted in the question. The
    two-colour ``edgecolor`` list is reported to alternate between bars
    instead of giving one colour per column.
    """
    fig, ax = plt.subplots(1,1, figsize=(3,4))
    (df.filter(['col1','col2'])
       .plot.bar(ax=ax,stacked=True, edgecolor=["red","green"],
                 fill=False,linewidth=2,rot=0))
    ax.set_xlabel("")
    plt.tight_layout()
    plt.show()


amounts(df)
I think plotting each column separately and setting the bottom argument to stack the bars provides the output you desire.
import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({"col1":[1,2,4,5,6], "col2":[4,5,1,2,3]})


def amounts(df):
    """Draw col1 and col2 as stacked, unfilled bars, one edge colour per column.

    Each column is plotted with its own ``plot.bar`` call on the same axes;
    the second call passes ``bottom=df['col1']`` so its bars start where the
    first column's bars end.
    """
    fig, ax = plt.subplots(1,1, figsize=(3,4))
    df['col1'].plot.bar(ax=ax, linewidth=2, edgecolor='green', rot=0, fill=False)
    df['col2'].plot.bar(ax=ax, bottom=df['col1'], linewidth=2, edgecolor='red', rot=0, fill=False)
    plt.legend()
    plt.tight_layout()
    plt.show()


amounts(df)

seaborn plot with two y axis

I have some data and need to generate a plot like this image, I just wonder how to do this using python seaborn scatter plot?
Thanks, heaps!
example:
Here is a minimal example using seaborn.scatterplot:
import matplotlib.pyplot as plt  # needed for plt.subplot() / plt.show() below
import pandas as pd
import seaborn as sns
import numpy as np

# Fixed seed so the random sample data is reproducible.
np.random.seed(0)
df1 = pd.DataFrame({'x': np.random.random(size=10),
                    'y1': np.random.random(size=10),
                    })
# y2 is scaled by 100 so the two series need different y-axes.
df2 = pd.DataFrame({'x': np.random.random(size=10),
                    'y2': np.random.random(size=10)*100,
                    })

ax1 = plt.subplot()
# Second y-axis sharing the x-axis of ax1.
ax2 = ax1.twinx()
sns.scatterplot(data=df1, x='x', y='y1', ax=ax1)
sns.scatterplot(data=df2, x='x', y='y2', color='r', ax=ax2)
# Colour the right-hand ticks like the points they belong to.
ax2.tick_params(axis='y', colors='red')
plt.show()
output:

Second y-axis time series seaborn

Using the data frame
# Sample data: a date column (as strings) and two numeric columns.
df = pd.DataFrame({
"date" : ["2018-01-01", "2018-01-02", "2018-01-03", "2018-01-04"],
"column1" : [555,525,532,585],
"column2" : [50,48,49,51]
})
one can plot with seaborn say column1 with sns.tsplot(data=df.column1, color="g").
How can we plot both time series with two y-axis in seaborn ?
As seaborn is built on the top of matplotlib, you can use its power:
import matplotlib.pyplot as plt
import seaborn as sns  # the snippet calls sns.lineplot, so seaborn must be imported

# `df` is the data frame defined in the question.
# First series on the primary y-axis.
sns.lineplot(data=df.column1, color="g")
# plt.twinx() creates a second y-axis on the current axes, sharing its x-axis.
ax2 = plt.twinx()
sns.lineplot(data=df.column2, color="b", ax=ax2)
I would recommend using a normal line plot. You can get a twin axes via ax.twinx().
import pandas as pd
import matplotlib.pyplot as plt

records = {
    "date": ["2018-01-01", "2018-01-02", "2018-01-03", "2018-01-04"],
    "column1": [555,525,532,585],
    "column2": [50,48,49,51],
}
df = pd.DataFrame(records)

# column1 on the left y-axis; per-axes legends are switched off.
ax = df.plot(x="date", y="column1", legend=False)

# column2 on a twin y-axis that shares the x-axis.
ax2 = ax.twinx()
df.plot(x="date", y="column2", ax=ax2, legend=False, color="r")

# One figure-level legend instead of one legend per axes.
ax.figure.legend()
plt.show()
You could try the following code, based on @Andrey Sobolev's solution, which also generates a complete legend.
import seaborn as sns  # the original snippet used the alias `sb` without importing it
from matplotlib.lines import Line2D

# `df` is the data frame defined in the question.
g = sns.lineplot(data=df.column1, color="g")
# Draw the second series on a twin y-axis of the first plot's axes.
sns.lineplot(data=df.column2, color="b", ax=g.axes.twinx())
# Build the legend from proxy lines so that both series are listed.
g.legend(handles=[Line2D([], [], marker='_', color="g", label='column1'), Line2D([], [], marker='_', color="b", label='column2')])

Creating legend in matplotlib after plotting two Pandas Series

I plotted two Pandas Series from the same DataFrame with the same x axis and everything worked out fine. However, when I tried to manually create a legend, it appears, but only with the title and not with the actual content. I've tried other solutions without any luck. Here's my code:
fig = plt.figure()
ax1 = fig.add_subplot(111)
# Second y-axis sharing the x-axis of ax1.
ax2 = ax1.twinx()
width = .3
# One bar series per axes, using different position values (1 and 0).
df.tally.plot(kind='bar', color='red', ax=ax1, width=width, position=1, grid=False)
df.costs.plot(kind='bar', color='blue', ax=ax2, width=width, position=0, grid=True)
ax1.set_ylabel('Tally')
ax2.set_ylabel('Total Cost')
# Collect the legend entries of each axes separately.
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
# NOTE(review): handles and labels are passed as a list of two lists rather
# than one flat list (handles1 + handles2, labels1 + labels2) -- presumably
# why only the legend title shows up; confirm against the matplotlib legend docs.
plt.legend([handles1, handles2], [labels1, labels2], loc='upper left', title='Legend')
plt.show()
plt.clf()
Maybe you have a good reason to do it your way, but if not, this is much easier:
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Optional, just better looking
import seaborn as sns

# Generate random data
values = np.random.randn(10,3)
df = pd.DataFrame(values, columns=['tally', 'costs', 'other'])

# Plot only the two columns of interest as grouped bars on one axes.
selected = df[['tally', 'costs']]
selected.plot(kind='bar', width=.3)
plt.show()
Out[1]:
Edit
After learning that this is because you have a much different scale for the other one, here's the pandas approach:
# Generate same data as Jianxun Li
# Generate same data as Jianxun Li
np.random.seed(0)
df = pd.DataFrame(np.random.randint(50,100,(20,3)), columns=['tally', 'costs', 'other'])
# Put costs on a different scale than tally.
df["costs"] = df["costs"] * 5

width = .3
# tally on the primary y-axis, costs on a secondary y-axis; the two series
# use different position values (1 and 0).
df["tally"].plot(kind='bar', color='#55A868', position=1, width=width, legend=True, figsize=(12,6))
df["costs"].plot(kind='bar', color='#4C72B0', position=0, width=width, legend=True, secondary_y=True)
plt.show()
Something like this?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# your data
# ===============================
np.random.seed(0)
df = pd.DataFrame(np.random.randint(50,100,(20,3)), columns=['col1', 'col2', 'col3'])
# Put col2 on a different scale than col1.
df.col2 = df.col2 * 5

# bar plot with twinx
# ===============================
fig, ax = plt.subplots()
width=0.3
ax.bar(df.index, df.col1, width=width, color='red', label='col1_data')
ax.legend(loc='best')
# Second y-axis sharing the x-axis; its bars are offset by one bar width so
# they sit next to the col1 bars instead of on top of them.
ax2 = ax.twinx()
ax2.bar(df.index+width, df.col2, width=width, color='blue', label='col2_data')
ax2.legend(loc='best')
# Display the figure when run as a script, like the other examples do.
plt.show()

Categories