Python Create Bar Chart Comparing 2 sets of data - python

I have a notebook with 2* bar charts, one is winter data & one is summer data. I have counted the total of all the crimes and plotted them in a bar chart, using code:
ax = summer["crime_type"].value_counts().plot(kind='bar')
plt.show()
Which shows a graph like:
I have another chart nearly identical, but for winter:
ax = winter["crime_type"].value_counts().plot(kind='bar')
plt.show()
And I would like to have these 2 charts compared against one another in the same bar chart (Where every crime on the x axis has 2 bars coming from it, one winter & one summer).
I have tried, which is just me experimenting:
bx = (summer["crime_type"],winter["crime_type"]).value_counts().plot(kind='bar')
plt.show()
Any advice would be appreciated!

The following generates dummies of your data and does the grouped bar chart you wanted:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
s = "Crime Type Summer|Crime Type Winter".split("|")
# Generate dummy data into a dataframe
j = {x: [random.choice(["ASB", "Violence", "Theft", "Public Order", "Drugs"]
) for j in range(300)] for x in s}
df = pd.DataFrame(j)
index = np.arange(5)
bar_width = 0.35
fig, ax = plt.subplots()
summer = ax.bar(index, df["Crime Type Summer"].value_counts(), bar_width,
label="Summer")
winter = ax.bar(index+bar_width, df["Crime Type Winter"].value_counts(),
bar_width, label="Winter")
ax.set_xlabel('Category')
ax.set_ylabel('Incidence')
ax.set_title('Crime incidence by season, type')
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(["ASB", "Violence", "Theft", "Public Order", "Drugs"])
ax.legend()
plt.show()
With this script I got:
You can check out the demo in the matplotlib docs here: https://matplotlib.org/gallery/statistics/barchart_demo.html
The important thing to note is the index!
index = np.arange(5) # Set an index of n crime types
...
summer = ax.bar(index, ...)
winter = ax.bar(index+bar_width, ...)
...
ax.set_xticks(index + bar_width / 2)
These are the lines that arrange the bars on the horizontal axis so that they are grouped together.

Create a pandas dataframe with 3 columns crimetype, count, Season and try this function.
#Importing required packages
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import MaxNLocator
#Function Creation
def plt_grouped_bar(Plot_Nm,group_bar,x, y,plt_data,**bar_kwargs):
plt_fig=plt.figure(figsize=(18,9))
ax=plt_fig.add_subplot()
g = sns.catplot(x=x, y=y, hue=group_bar,data=plt_data,ax=ax,kind="bar",**bar_kwargs)
for p in ax.patches:
height = p.get_height()
ax.text(x = p.get_x()+(p.get_width()/2),
y = height+0.05,
s = '{:.0f}'.format(height),
ha = 'center',va = 'bottom',zorder=20, rotation=90)
ax.set_title(Plot_Nm,fontweight="bold",fontsize=18,alpha=0.7,y=1.03)
g.set_xticklabels(x,fontsize=10,alpha=0.8,fontweight="bold")
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_yticklabels("")
ax.set_xlabel("")
ax.set_ylabel("")
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
ax.tick_params(axis=u'both',length=0)
ax.legend(loc='upper right')
for spine in ax.spines:
ax.spines[spine].set_visible(False)
plt.close()
#Calling the function
plt_grouped_bar('Title of bar','weather','crimetype','count',pandasdataframename)

Related

Plotting a scatterplot gif from a dataframe

I have a Dataframe with 6 rows of data and 4 columns. Is there any way to generate a gif scatterplot (y which are the 4 columns in different color versus x which are the index rows) plot in which in every frame of the gif, first data point of the Column 1 and its first respective row data is plotted in different color versus the shared x axis which are the indexes, at the same time, column 2, 3 and 4 first data points are plotted, and this goes progressively until the last 6th point is plotted for all of the columns? If a gif is not possible at all, is there any other way to generate at least movie so that I can include in my ppt slide? I appreciate any feedback you might have! The error I am getting is generating an empty plot and saying: TypeError: cannot unpack non-iterable AxesSubplot object. But I am not sure if this is preventing the result from the plotting.
This is a sample of my data and code effort:
import pandas as pd
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import random
from itertools import count
from IPython import display
row_data = np.arange(0, 6)
column_X = np.random.rand(6,)
column_Y = np.random.rand(6,)
column_Z = np.random.rand(6,)
column_K = np.random.rand(6,)
my_df = pd.DataFrame()
my_df['column_X'] = column_X
my_df['column_Y'] = column_Y
my_df['column_Z'] = column_Z
my_df['column_K'] = column_K
my_df.index = row_data
my_df['index'] = row_data
def animate(j):
fig, ax = plt.subplot(sharex= True)
ax[1]=my_df['column_X', color = 'blue']
ax[2]=my_df['column_Y', color = 'red']
ax[3]=my_df['column_Z', color = 'brown']
ax[4]=my_df['column_K', color = 'green']
y=my_df['index']
x.append()
y.append()
plt.xlabel(color = 'blue')
plt.ylabel(color = 'red')
ax.set_ylabel("progressive sales through time")
ax.set_xlabel("progressive time")
plt.plot(x,y)
animation_1 = animation.FuncAnimation(plt.gcf(),animate,interval=1000)
plt.show()
# Inside Jupyter:
video_1 = animation_1.to_html5_video()
html_code_1 = display.HTML(video_1)
display.display(html_code_1)
plt.tight_layout()
plt.show()
Good question! matplotlib animations can be tricky. I struggled a bit with this one, mainly because you want different colors for the different columns. You need 4 different Line2D objects to do this.
# VSCode notebook magic
%matplotlib widget
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
my_df = pd.DataFrame()
my_df["column_X"] = np.random.rand(6)
my_df["column_Y"] = np.random.rand(6)
my_df["column_Z"] = np.random.rand(6)
my_df["column_K"] = np.random.rand(6)
fig, ax = plt.subplots()
# four y-data lists, x-data is shared
xdata, y1, y2, y3, y4 = [], [], [], [], []
# four Line3D objects with different colors
graph1, = ax.plot([], [], 'ro-')
graph2, = ax.plot([], [], 'go-')
graph3, = ax.plot([], [], 'bo-')
graph4, = ax.plot([], [], 'ko-')
# set up the plot
plt.xlim(-1, 6)
plt.xlabel('Time')
plt.ylim(0, 1)
plt.ylabel('Price')
# animation function
def animate(i):
xdata.append(i)
y1.append(my_df.iloc[i,0])
y2.append(my_df.iloc[i,1])
y3.append(my_df.iloc[i,2])
y4.append(my_df.iloc[i,3])
graph1.set_data(xdata, y1)
graph2.set_data(xdata, y2)
graph3.set_data(xdata, y3)
graph4.set_data(xdata, y4)
return (graph1,graph2,graph3,graph4,)
anim = animation.FuncAnimation(fig, animate, frames=6, interval=500, blit=True)
anim.save('test.mp4')
#plt.show()
Here's the resulting .gif (converted from .mp4 using Adobe Express):

Matplotlib bar graph organization

I have a CSV file on employee salaries in 2020 and I cant figure out how to organize my bar graph. here is the CSV for reference: https://catalog.data.gov/dataset/employee-salaries-2020
I would like to present the average salary of each department in a bar graph.
I've started by organizing the bar graph by Department and its value_count() but I would like the x axis to represent the average salary in that department. Any tips on how I can achieve this?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
file_path = 'Employee_Salaries_-_2020.csv'
salaries = pd.read_csv(file_path)
a = salaries.Department.value_counts()
x = list(a.index)
y = list(a)
f, ax = plt.subplots(figsize=(20,10))
width = 0.75 # the width of the bars
ind = np.arange(len(y)) # the x locations for the groups
ax.barh(ind, y, width, color="blue")
ax.set_yticks(ind+width/2)
ax.set_yticklabels(x, minor=False)
for i, v in enumerate(y):
ax.text(v + .25, i + .25, str(v), color='blue', fontweight='bold') #add value labels into bar
plt.title('Average Base Pay by Department')
plt.xlabel('Average Base Pay')
plt.ylabel('Department')
plt.show()
Instead of value counts you can get the average salary by doing salaries.groupby('Department')['Base Salary'].mean(). This should be the value you are looking for.

adjusting horizontal bar chart matplotlib to accommodate the bars

I am doing a horizontal bar chart but struggling with adjusting ylim, or maybe another parameter to make my labels clearer and make all the labels fit the y axis . I played around with ylim and the text size can be bigger or smaller but the bars do not fit the y axis. Any idea about the right approach?
My code:
import matplotlib.pyplot as plt #we load the library that contains the plotting capabilities
from operator import itemgetter
D=[]
for att, befor, after in zip(df_portion['attributes'], df_portion['2005_2011 (%)'], df_portion['2012_2015 (%)']):
i=(att, befor, after)
D.append(i)
Dsort = sorted(D, key=itemgetter(1), reverse=False) #sort the list in order of usage
attri = [x[0] for x in Dsort]
aft = [x[1] for x in Dsort]
bef = [x[2] for x in Dsort]
ind = np.arange(len(attri))
width=3
ax = plt.subplot(111)
ax.barh(ind, aft, width,align='center',alpha=1, color='r', label='from 2012 to 2015') #a horizontal bar chart (use .bar instead of .barh for vertical)
ax.barh(ind - width, bef, width, align='center', alpha=1, color='b', label='from 2005 to 2008') #a horizontal bar chart (use .bar instead of .barh for vertical)
ax.set(yticks=ind, yticklabels=attri,ylim=[1, len(attri)/2])
plt.xlabel('Frequency distribution (%)')
plt.title('Frequency distribution (%) of common attributes between 2005_2008 and between 2012_2015')
plt.legend()
plt.show()
This is the plot for above code
To make the labels fit, you need to set a smaller fontsize, or use a larger figsize. Changing the ylim will either just show a subset of the bars (in case ylim is set too narrow), or will show more whitespace (when ylim is larger).
The biggest problem in the code is width being too large. Twice the width needs to fit over a distance of 1.0 (the ticks are placed via ind, which is an array 0,1,2,...). As matplotlib calls the thickness of a horizontal bar plot "height", this name is used in the example code below. Using align='edge' lets you position the bars directly (align='center' will move them half their "height").
Pandas has simple functions to sort dataframes according to one or more rows.
Code to illustrate the ideas:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# first create some test data
df = pd.DataFrame({'attributes': ["alpha", "beta", "gamma", "delta", "epsilon", "zata", "eta", "theta", "iota",
"kappa", "lambda", "mu", "nu", "xi", "omikron", "pi", "rho", "sigma", "tau",
"upsilon", "phi", "chi", "psi", "omega"]})
totals_2005_2011 = np.random.uniform(100, 10000, len(df))
totals_2012_2015 = totals_2005_2011 * np.random.uniform(0.70, 2, len(df))
df['2005_2011 (%)'] = totals_2005_2011 / totals_2005_2011.sum() * 100
df['2012_2015 (%)'] = totals_2012_2015 / totals_2012_2015.sum() * 100
# sort all rows via the '2005_2011 (%)' column, sort from large to small
df = df.sort_values('2005_2011 (%)', ascending=False)
ind = np.arange(len(df))
height = 0.3 # two times height needs to be at most 1
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(ind, df['2012_2015 (%)'], height, align='edge', alpha=1, color='crimson', label='from 2012 to 2015')
ax.barh(ind - height, df['2005_2011 (%)'], height, align='edge', alpha=1, color='dodgerblue', label='from 2005 to 2011')
ax.set_yticks(ind)
ax.set_yticklabels(df['attributes'], fontsize=10)
ax.grid(axis='x')
ax.set_xlabel('Frequency distribution (%)')
ax.set_title('Frequency distribution (%) of common attributes between 2005_2011 and between 2012_2015')
ax.legend()
ax.margins(y=0.01) # use smaller margins in the y-direction
plt.tight_layout()
plt.show()
The seaborn library has some functions to create barplots with multiple bars per attribute, without the need to manually fiddle with bar positions. Seaborn prefers its data in "long form", which can be created via pandas' melt().
Example code:
import seaborn as sns
df = df.sort_values('2005_2011 (%)', ascending=True)
df_long = df.melt(id_vars='attributes', value_vars=['2005_2011 (%)', '2012_2015 (%)'],
var_name='period', value_name='distribution')
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df_long, y='attributes', x='distribution', hue='period', palette='turbo', ax=ax)
ax.set_xlabel('Frequency distribution (%)')
ax.set_title('Frequency distribution (%) of common attributes between 2005_2011 and between 2012_2015')
ax.grid(axis='x')
ax.tick_params(axis='y', labelsize=12)
sns.despine()
plt.tight_layout()
plt.show()

How to annotate a seaborn barplot with the aggregated value

How can the following code be modified to show the mean as well as the different error bars on each bar of the bar plot?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")
a,b,c,d = [],[],[],[]
for i in range(1,5):
np.random.seed(i)
a.append(np.random.uniform(35,55))
b.append(np.random.uniform(40,70))
c.append(np.random.uniform(63,85))
d.append(np.random.uniform(59,80))
data_df =pd.DataFrame({'stages':[1,2,3,4],'S1':a,'S2':b,'S3':c,'S4':d})
print("Delay:")
display(data_df)
S1 S2 S3 S4
0 43.340440 61.609735 63.002516 65.348984
1 43.719898 40.777787 75.092575 68.141770
2 46.015958 61.244435 69.399904 69.727380
3 54.340597 56.416967 84.399056 74.011136
meansd_df=data_df.describe().loc[['mean', 'std'],:].drop('stages', axis = 1)
display(meansd_df)
sns.set()
sns.set_style('darkgrid',{"axes.facecolor": ".92"}) # (1)
sns.set_context('notebook')
fig, ax = plt.subplots(figsize = (8,6))
x = meansd_df.columns
y = meansd_df.loc['mean',:]
yerr = meansd_df.loc['std',:]
plt.xlabel("Time", size=14)
plt.ylim(-0.3, 100)
width = 0.45
for i, j,k in zip(x,y,yerr): # (2)
ax.bar(i,j, width, yerr = k, edgecolor = "black",
error_kw=dict(lw=1, capsize=8, capthick=1)) # (3)
ax.set(ylabel = 'Delay')
from matplotlib import ticker
ax.yaxis.set_major_locator(ticker.MultipleLocator(10))
plt.savefig("Over.png", dpi=300, bbox_inches='tight')
Given the example data, for a seaborn.barplot with capped error bars, data_df must be converted from a wide format, to a tidy (long) format, which can be accomplished with pandas.DataFrame.stack or pandas.DataFrame.melt
It is also important to keep in mind that a bar plot shows only the mean (or other estimator) value
Sample Data and DataFrame
.iloc[:, 1:] is used to skip the 'stages' column at column index 0.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# given data_df from the OP, select the columns except stage and reshape to long format
df = data_df.iloc[:, 1:].melt(var_name='set', value_name='val')
# display(df.head())
set val
0 S1 43.340440
1 S1 43.719898
2 S1 46.015958
3 S1 54.340597
4 S2 61.609735
Updated as of matplotlib v3.4.2
Use matplotlib.pyplot.bar_label
See How to add value labels on a bar chart for additional details and examples with .bar_label.
Some formatting can be done with the fmt parameter, but more sophisticated formatting should be done with the labels parameter, as show in How to add multiple annotations to a barplot.
Tested with seaborn v0.11.1, which is using matplotlib as the plot engine.
fig, ax = plt.subplots(figsize=(8, 6))
# add the plot
sns.barplot(x='set', y='val', data=df, capsize=0.2, ax=ax)
# add the annotation
ax.bar_label(ax.containers[-1], fmt='Mean:\n%.2f', label_type='center')
ax.set(ylabel='Mean Time')
plt.show()
plot with seaborn.barplot
Using matplotlib before version 3.4.2
The default for the estimator parameter is mean, so the height of the bar is the mean of the group.
The bar height is extracted from p with .get_height, which can be used to annotate the bar.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x='set', y='val', data=df, capsize=0.2, ax=ax)
# show the mean
for p in ax.patches:
h, w, x = p.get_height(), p.get_width(), p.get_x()
xy = (x + w / 2., h / 2)
text = f'Mean:\n{h:0.2f}'
ax.annotate(text=text, xy=xy, ha='center', va='center')
ax.set(xlabel='Delay', ylabel='Time')
plt.show()
Seaborn is most powerfull with long form data. So you might want to transform your data, something like this:
sns.barplot(data=data_df.melt('stages', value_name='Delay', var_name='Time'),
x='Time', y='Delay',
capsize=0.1, edgecolor='k')
Output:

Is there any way we label the plots in boxplot using matplotlib?

Is there any way to label the outliers in a box plot.
like i am plotting the prices for each drug and trying to find places with overpriced drug.
so i want to label the outliers with the name of the place from where it belong.
How to achieve it using matplotlib ?
Boxplot lets you pass an object for flierprops.
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Mock data from the boxplot demo
spread = np.random.rand(50) * 100
center = np.ones(25) * 50
flier_high = np.random.rand(10) * 100 + 100
flier_low = np.random.rand(10) * -100
data = np.concatenate((spread, center, flier_high, flier_low))
# Set up
fig, ax = plt.subplots()
# flierprops example
red_square = dict(markerfacecolor='r', marker='s')
box = ax.boxplot(data, flierprops=red_square)
This simple sample produces:
If you want to label something, you can use plt.annotate like so:
box = ax.boxplot(data,)
top_points = box["fliers"][0].get_data()
ax.scatter(top_points[0], top_points[1], marker="o")
# Roughly based on https://stackoverflow.com/a/5147430/10553976
ax.annotate("I labeled this", xy=(top_points[0][1], top_points[1][1]),
xytext=(-20, 20),
textcoords='offset points', ha='right', va='bottom',
bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
arrowprops=dict(arrowstyle = '->', connectionstyle='arc3,rad=0'))
And this sample produces:

Categories