Is there any way we label the plots in boxplot using matplotlib? - python

Is there any way to label the outliers in a box plot.
like i am plotting the prices for each drug and trying to find places with overpriced drug.
so i want to label the outliers with the name of the place from where it belong.
How to achieve it using matplotlib ?

Boxplot lets you pass an object for flierprops.
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Mock data from the boxplot demo
spread = np.random.rand(50) * 100
center = np.ones(25) * 50
flier_high = np.random.rand(10) * 100 + 100
flier_low = np.random.rand(10) * -100
data = np.concatenate((spread, center, flier_high, flier_low))
# Set up
fig, ax = plt.subplots()
# flierprops example
red_square = dict(markerfacecolor='r', marker='s')
box = ax.boxplot(data, flierprops=red_square)
This simple sample produces:
If you want to label something, you can use plt.annotate like so:
box = ax.boxplot(data,)
top_points = box["fliers"][0].get_data()
ax.scatter(top_points[0], top_points[1], marker="o")
# Roughly based on https://stackoverflow.com/a/5147430/10553976
ax.annotate("I labeled this", xy=(top_points[0][1], top_points[1][1]),
xytext=(-20, 20),
textcoords='offset points', ha='right', va='bottom',
bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
arrowprops=dict(arrowstyle = '->', connectionstyle='arc3,rad=0'))
And this sample produces:

Related

Using matlotlib: why do imshow and contourf not plot together? (contourf "overrides" imshow)

I am trying to plot some meteorological data onto a map and I would like to add an image of a plane using imshow. Plotting i) the trajectory, ii) some contour-data and iii) the image, works fine. But as soon as I add a contourf-plot (see below) the image dissapears!
Any ideas how to fix this?
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import cartopy.crs as crs
import cartopy.feature as cfeature
def plot_test():
#DEFINE DATA
x,y = np.meshgrid(np.linspace(0,90,100),np.linspace(0,90,100))
z = x**3 + y**3
#BEGIN FIGURE (IN THIS CASE A MAP, IM PLOTTING METEOROLOGICAL DATA)
fig = plt.figure(figsize = (6,6))
ax1 = plt.axes(projection=crs.PlateCarree(central_longitude=0))
ax1.set_extent([0,90,0,90], crs=crs.PlateCarree())
ax1.coastlines(resolution='auto', color='k')
#EXAMPLE DATA PLOTTED AS CONTOURF
v_max = int(z.max())
v_min = int(z.min())
qcs = ax1.contourf(x, y, z, cmap = "Blues", vmin = v_min, vmax = v_max)
sm = plt.cm.ScalarMappable(cmap="Blues",norm=qcs.norm)
sm._A = []
cbar = plt.colorbar(sm, ax=ax1,orientation="vertical")
cbar.ax.set_ylabel("some contourf data", rotation=90, fontsize = 15)
#PLOT IMAGE OF A PLANE (THIS IS NOT SHOWING UP ON THE PLOT!)
x0 = 50
y0 = 40
img=plt.imread("plane2.png")
ax1.imshow(img,extent=[x0,x0 - 10, y0, y0-10], label = "plane")
plt.show()
without contourf (code from above with lines 14-20 commented out):
with contourf:
Thank you 1000 times #JohanC (see comments). I simply had to place the z-order:
ax1.imshow(img, ...., zorder=3)
which made the plane show up!

adjusting horizontal bar chart matplotlib to accommodate the bars

I am doing a horizontal bar chart but struggling with adjusting ylim, or maybe another parameter to make my labels clearer and make all the labels fit the y axis . I played around with ylim and the text size can be bigger or smaller but the bars do not fit the y axis. Any idea about the right approach?
My code:
import matplotlib.pyplot as plt #we load the library that contains the plotting capabilities
from operator import itemgetter
D=[]
for att, befor, after in zip(df_portion['attributes'], df_portion['2005_2011 (%)'], df_portion['2012_2015 (%)']):
i=(att, befor, after)
D.append(i)
Dsort = sorted(D, key=itemgetter(1), reverse=False) #sort the list in order of usage
attri = [x[0] for x in Dsort]
aft = [x[1] for x in Dsort]
bef = [x[2] for x in Dsort]
ind = np.arange(len(attri))
width=3
ax = plt.subplot(111)
ax.barh(ind, aft, width,align='center',alpha=1, color='r', label='from 2012 to 2015') #a horizontal bar chart (use .bar instead of .barh for vertical)
ax.barh(ind - width, bef, width, align='center', alpha=1, color='b', label='from 2005 to 2008') #a horizontal bar chart (use .bar instead of .barh for vertical)
ax.set(yticks=ind, yticklabels=attri,ylim=[1, len(attri)/2])
plt.xlabel('Frequency distribution (%)')
plt.title('Frequency distribution (%) of common attributes between 2005_2008 and between 2012_2015')
plt.legend()
plt.show()
This is the plot for above code
To make the labels fit, you need to set a smaller fontsize, or use a larger figsize. Changing the ylim will either just show a subset of the bars (in case ylim is set too narrow), or will show more whitespace (when ylim is larger).
The biggest problem in the code is width being too large. Twice the width needs to fit over a distance of 1.0 (the ticks are placed via ind, which is an array 0,1,2,...). As matplotlib calls the thickness of a horizontal bar plot "height", this name is used in the example code below. Using align='edge' lets you position the bars directly (align='center' will move them half their "height").
Pandas has simple functions to sort dataframes according to one or more rows.
Code to illustrate the ideas:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# first create some test data
df = pd.DataFrame({'attributes': ["alpha", "beta", "gamma", "delta", "epsilon", "zata", "eta", "theta", "iota",
"kappa", "lambda", "mu", "nu", "xi", "omikron", "pi", "rho", "sigma", "tau",
"upsilon", "phi", "chi", "psi", "omega"]})
totals_2005_2011 = np.random.uniform(100, 10000, len(df))
totals_2012_2015 = totals_2005_2011 * np.random.uniform(0.70, 2, len(df))
df['2005_2011 (%)'] = totals_2005_2011 / totals_2005_2011.sum() * 100
df['2012_2015 (%)'] = totals_2012_2015 / totals_2012_2015.sum() * 100
# sort all rows via the '2005_2011 (%)' column, sort from large to small
df = df.sort_values('2005_2011 (%)', ascending=False)
ind = np.arange(len(df))
height = 0.3 # two times height needs to be at most 1
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(ind, df['2012_2015 (%)'], height, align='edge', alpha=1, color='crimson', label='from 2012 to 2015')
ax.barh(ind - height, df['2005_2011 (%)'], height, align='edge', alpha=1, color='dodgerblue', label='from 2005 to 2011')
ax.set_yticks(ind)
ax.set_yticklabels(df['attributes'], fontsize=10)
ax.grid(axis='x')
ax.set_xlabel('Frequency distribution (%)')
ax.set_title('Frequency distribution (%) of common attributes between 2005_2011 and between 2012_2015')
ax.legend()
ax.margins(y=0.01) # use smaller margins in the y-direction
plt.tight_layout()
plt.show()
The seaborn library has some functions to create barplots with multiple bars per attribute, without the need to manually fiddle with bar positions. Seaborn prefers its data in "long form", which can be created via pandas' melt().
Example code:
import seaborn as sns
df = df.sort_values('2005_2011 (%)', ascending=True)
df_long = df.melt(id_vars='attributes', value_vars=['2005_2011 (%)', '2012_2015 (%)'],
var_name='period', value_name='distribution')
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df_long, y='attributes', x='distribution', hue='period', palette='turbo', ax=ax)
ax.set_xlabel('Frequency distribution (%)')
ax.set_title('Frequency distribution (%) of common attributes between 2005_2011 and between 2012_2015')
ax.grid(axis='x')
ax.tick_params(axis='y', labelsize=12)
sns.despine()
plt.tight_layout()
plt.show()

ax.annotate text partially appearing outside the figure box

Apologies, rather unskilled with programming and stackoverflow too. I am drawing bar plots on some data and have managed to add percentages beside the bars, using ax.annotate. However for the bar with highest responses I always get part of the percentage number outside the figure box, as per image below. Have tried different ideas but none worked to fix this. Looking for some suggestions on how to fix this.
Here is my code
from matplotlib import pyplot as plt
import seaborn as sns
def plot_barplot(df):
plt.rcParams.update({'font.size': 18})
sns.set(font_scale=2)
if (len(df) > 1):
fig = plt.figure(figsize=(12,10))
ax = sns.barplot(x='count', y=df.columns[0], data=df, color='blue')
else:
fig = plt.figure(figsize=(5,7))
ax = sns.barplot(x=df.columns[0], y='count', data=df, color='blue')
fig.set_tight_layout(True)
plt.rcParams.update({'font.size': 14})
total = df['count'].sum()
for p in ax.patches:
percentage ='{:.2f}%'.format(100 * p.get_width()/total)
print(percentage)
x = p.get_x() + p.get_width() + 0.02
y = p.get_y() + p.get_height()/2
ax.annotate(percentage, (x, y))
Dataframe looks like this
I would suggest you increase the axes' margins (in the x direction in that case). That is the space there is between the maximum of your data and the maximum scale on the axis. You will have to play around with the value depending on your needs, but it looks like a value of 0.1 or 0.2 should be enough.
add:
plt.rcParams.update({'axes.xmargin': 0.2})
to the top of your function
full code:
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
def plot_barplot(df):
plt.rcParams.update({'font.size': 18})
plt.rcParams.update({'axes.xmargin': 0.1})
sns.set(font_scale=2)
if (len(df) > 1):
fig = plt.figure(figsize=(12, 10))
ax = sns.barplot(x='count', y=df.columns[0], data=df, color='blue')
else:
fig = plt.figure(figsize=(5, 7))
ax = sns.barplot(x=df.columns[0], y='count', data=df, color='blue')
fig.set_tight_layout(True)
plt.rcParams.update({'font.size': 14})
total = df['count'].sum()
for p in ax.patches:
percentage = '{:.2f}%'.format(100 * p.get_width() / total)
print(percentage)
x = p.get_x() + p.get_width() + 0.02
y = p.get_y() + p.get_height() / 2
ax.annotate(percentage, (x, y))
df = pd.DataFrame({'question': ['Agree', 'Strongly agree'], 'count': [200, 400]})
plot_barplot(df)
plt.show()

Matplotlib, vertical space between legend symbols

I have an issue with customizing the legend of my plot. I did lot's of customizing but couldnt get my head around this one. I want the symbols (not the labels) to be equally spaced in the legend. As you can see in the example, the space between the circles in the legend, gets smaller as the circles get bigger.
any ideas?
Also, how can I also add a color bar (in addition to the size), with smaller circles being light red (for example) and bigger circle being blue (for example)
here is my code so far:
import pandas as pd
import matplotlib.pyplot as plt
from vega_datasets import data as vega_data
gap = pd.read_json(vega_data.gapminder.url)
df = gap.loc[gap['year'] == 2000]
fig, ax = plt.subplots(1, 1,figsize=[14,12])
ax=ax.scatter(df['life_expect'], df['fertility'],
s = df['pop']/100000,alpha=0.7, edgecolor="black",cmap="viridis")
plt.xlabel("X")
plt.ylabel("Y");
kw = dict(prop="sizes", num=6, color="lightgrey", markeredgecolor='black',markeredgewidth=2)
plt.legend(*ax.legend_elements(**kw),bbox_to_anchor=(1, 0),frameon=False,
loc="lower left",markerscale=1,ncol=1,borderpad=2,labelspacing=4,handletextpad=2)
plt.grid()
plt.show()
It's a bit tricky, but you could measure the legend elements and reposition them to have a constant inbetween distance. Due to the pixel positioning, the plot can't be resized afterwards.
I tested the code inside PyCharm with the 'Qt5Agg' backend. And in a Jupyter notebook, both with %matplotlib inline and with %matplotlib notebook. I'm not sure whether it would work well in all environments.
Note that ax.scatter doesn't return an ax (countrary to e.g. sns.scatterplot) but a list of the created scatter dots.
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.transforms import IdentityTransform
from vega_datasets import data as vega_data
gap = pd.read_json(vega_data.gapminder.url)
df = gap.loc[gap['year'] == 2000]
fig, ax = plt.subplots(1, 1, figsize=[14, 12])
fig.subplots_adjust(right=0.8)
scat = ax.scatter(df['life_expect'], df['fertility'],
s=df['pop'] / 100000, alpha=0.7, edgecolor="black", cmap="viridis")
plt.xlabel("X")
plt.ylabel("Y")
x = 1.1
y = 0.1
is_first = True
kw = dict(prop="sizes", num=6, color="lightgrey", markeredgecolor='black', markeredgewidth=2)
handles, labels = scat.legend_elements(**kw)
inverted_transData = ax.transData.inverted()
for handle, label in zip(handles[::-1], labels[::-1]):
plt.setp(handle, clip_on=False)
for _ in range(1 if is_first else 2):
plt.setp(handle, transform=ax.transAxes)
if is_first:
xd, yd = x, y
else:
xd, yd = inverted_transData.transform((x, y))
handle.set_xdata([xd])
handle.set_ydata([yd])
ax.add_artist(handle)
bbox = handle.get_window_extent(fig.canvas.get_renderer())
y += y - bbox.y0 + 15 # 15 pixels inbetween
x = (bbox.x0 + bbox.x1) / 2
if is_first:
xd_text, _ = inverted_transData.transform((bbox.x1+10, y))
ax.text(xd_text, yd, label, transform=ax.transAxes, ha='left', va='center')
y = bbox.y1
is_first = False
plt.show()

Python Create Bar Chart Comparing 2 sets of data

I have a notebook with 2* bar charts, one is winter data & one is summer data. I have counted the total of all the crimes and plotted them in a bar chart, using code:
ax = summer["crime_type"].value_counts().plot(kind='bar')
plt.show()
Which shows a graph like:
I have another chart nearly identical, but for winter:
ax = winter["crime_type"].value_counts().plot(kind='bar')
plt.show()
And I would like to have these 2 charts compared against one another in the same bar chart (Where every crime on the x axis has 2 bars coming from it, one winter & one summer).
I have tried, which is just me experimenting:
bx = (summer["crime_type"],winter["crime_type"]).value_counts().plot(kind='bar')
plt.show()
Any advice would be appreciated!
The following generates dummies of your data and does the grouped bar chart you wanted:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
s = "Crime Type Summer|Crime Type Winter".split("|")
# Generate dummy data into a dataframe
j = {x: [random.choice(["ASB", "Violence", "Theft", "Public Order", "Drugs"]
) for j in range(300)] for x in s}
df = pd.DataFrame(j)
index = np.arange(5)
bar_width = 0.35
fig, ax = plt.subplots()
summer = ax.bar(index, df["Crime Type Summer"].value_counts(), bar_width,
label="Summer")
winter = ax.bar(index+bar_width, df["Crime Type Winter"].value_counts(),
bar_width, label="Winter")
ax.set_xlabel('Category')
ax.set_ylabel('Incidence')
ax.set_title('Crime incidence by season, type')
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(["ASB", "Violence", "Theft", "Public Order", "Drugs"])
ax.legend()
plt.show()
With this script I got:
You can check out the demo in the matplotlib docs here: https://matplotlib.org/gallery/statistics/barchart_demo.html
The important thing to note is the index!
index = np.arange(5) # Set an index of n crime types
...
summer = ax.bar(index, ...)
winter = ax.bar(index+bar_width, ...)
...
ax.set_xticks(index + bar_width / 2)
These are the lines that arrange the bars on the horizontal axis so that they are grouped together.
Create a pandas dataframe with 3 columns crimetype, count, Season and try this function.
#Importing required packages
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import MaxNLocator
#Function Creation
def plt_grouped_bar(Plot_Nm,group_bar,x, y,plt_data,**bar_kwargs):
plt_fig=plt.figure(figsize=(18,9))
ax=plt_fig.add_subplot()
g = sns.catplot(x=x, y=y, hue=group_bar,data=plt_data,ax=ax,kind="bar",**bar_kwargs)
for p in ax.patches:
height = p.get_height()
ax.text(x = p.get_x()+(p.get_width()/2),
y = height+0.05,
s = '{:.0f}'.format(height),
ha = 'center',va = 'bottom',zorder=20, rotation=90)
ax.set_title(Plot_Nm,fontweight="bold",fontsize=18,alpha=0.7,y=1.03)
g.set_xticklabels(x,fontsize=10,alpha=0.8,fontweight="bold")
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_yticklabels("")
ax.set_xlabel("")
ax.set_ylabel("")
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
ax.tick_params(axis=u'both',length=0)
ax.legend(loc='upper right')
for spine in ax.spines:
ax.spines[spine].set_visible(False)
plt.close()
#Calling the function
plt_grouped_bar('Title of bar','weather','crimetype','count',pandasdataframename)

Categories