adjusting horizontal bar chart matplotlib to accommodate the bars - python

I am doing a horizontal bar chart but struggling with adjusting ylim, or maybe another parameter to make my labels clearer and make all the labels fit the y axis . I played around with ylim and the text size can be bigger or smaller but the bars do not fit the y axis. Any idea about the right approach?
My code:
import matplotlib.pyplot as plt #we load the library that contains the plotting capabilities
from operator import itemgetter
D=[]
for att, befor, after in zip(df_portion['attributes'], df_portion['2005_2011 (%)'], df_portion['2012_2015 (%)']):
i=(att, befor, after)
D.append(i)
Dsort = sorted(D, key=itemgetter(1), reverse=False) #sort the list in order of usage
attri = [x[0] for x in Dsort]
aft = [x[1] for x in Dsort]
bef = [x[2] for x in Dsort]
ind = np.arange(len(attri))
width=3
ax = plt.subplot(111)
ax.barh(ind, aft, width,align='center',alpha=1, color='r', label='from 2012 to 2015') #a horizontal bar chart (use .bar instead of .barh for vertical)
ax.barh(ind - width, bef, width, align='center', alpha=1, color='b', label='from 2005 to 2008') #a horizontal bar chart (use .bar instead of .barh for vertical)
ax.set(yticks=ind, yticklabels=attri,ylim=[1, len(attri)/2])
plt.xlabel('Frequency distribution (%)')
plt.title('Frequency distribution (%) of common attributes between 2005_2008 and between 2012_2015')
plt.legend()
plt.show()
This is the plot for above code

To make the labels fit, you need to set a smaller fontsize, or use a larger figsize. Changing the ylim will either just show a subset of the bars (in case ylim is set too narrow), or will show more whitespace (when ylim is larger).
The biggest problem in the code is width being too large. Twice the width needs to fit over a distance of 1.0 (the ticks are placed via ind, which is an array 0,1,2,...). As matplotlib calls the thickness of a horizontal bar plot "height", this name is used in the example code below. Using align='edge' lets you position the bars directly (align='center' will move them half their "height").
Pandas has simple functions to sort dataframes according to one or more rows.
Code to illustrate the ideas:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# first create some test data
df = pd.DataFrame({'attributes': ["alpha", "beta", "gamma", "delta", "epsilon", "zata", "eta", "theta", "iota",
"kappa", "lambda", "mu", "nu", "xi", "omikron", "pi", "rho", "sigma", "tau",
"upsilon", "phi", "chi", "psi", "omega"]})
totals_2005_2011 = np.random.uniform(100, 10000, len(df))
totals_2012_2015 = totals_2005_2011 * np.random.uniform(0.70, 2, len(df))
df['2005_2011 (%)'] = totals_2005_2011 / totals_2005_2011.sum() * 100
df['2012_2015 (%)'] = totals_2012_2015 / totals_2012_2015.sum() * 100
# sort all rows via the '2005_2011 (%)' column, sort from large to small
df = df.sort_values('2005_2011 (%)', ascending=False)
ind = np.arange(len(df))
height = 0.3 # two times height needs to be at most 1
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(ind, df['2012_2015 (%)'], height, align='edge', alpha=1, color='crimson', label='from 2012 to 2015')
ax.barh(ind - height, df['2005_2011 (%)'], height, align='edge', alpha=1, color='dodgerblue', label='from 2005 to 2011')
ax.set_yticks(ind)
ax.set_yticklabels(df['attributes'], fontsize=10)
ax.grid(axis='x')
ax.set_xlabel('Frequency distribution (%)')
ax.set_title('Frequency distribution (%) of common attributes between 2005_2011 and between 2012_2015')
ax.legend()
ax.margins(y=0.01) # use smaller margins in the y-direction
plt.tight_layout()
plt.show()
The seaborn library has some functions to create barplots with multiple bars per attribute, without the need to manually fiddle with bar positions. Seaborn prefers its data in "long form", which can be created via pandas' melt().
Example code:
import seaborn as sns
df = df.sort_values('2005_2011 (%)', ascending=True)
df_long = df.melt(id_vars='attributes', value_vars=['2005_2011 (%)', '2012_2015 (%)'],
var_name='period', value_name='distribution')
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df_long, y='attributes', x='distribution', hue='period', palette='turbo', ax=ax)
ax.set_xlabel('Frequency distribution (%)')
ax.set_title('Frequency distribution (%) of common attributes between 2005_2011 and between 2012_2015')
ax.grid(axis='x')
ax.tick_params(axis='y', labelsize=12)
sns.despine()
plt.tight_layout()
plt.show()

Related

The axes labels do not align with the matrix when using matshow

I used the following code to create the attached plot:
fig, ax = plt.subplots()
fig.set_figheight(50)
fig.set_figwidth(50)
ax.matshow(power_final_for_plotting, cmap='GnBu', origin='upper')
ax.set_xticks(time_periods)
ax.set_xticklabels(time_periods)
ax.set_yticks(sig_wave_height)
ax.set_yticklabels(sig_wave_height)
for i in range(len(time_periods)):
for j in range(len(amplitude)):
c = round(power_final_for_plotting[j, i],3)
ax.text(i, j, str(c), va='center', ha='center', size=27)
plt.tight_layout()
Here time_period and sig_wave_height are lists of integers. The axis labels do not align properly in this case (Check the top right of the image to see the labels as they are). How can I fix this? The labels are really small in this case:
The current scale definition, for example, gives the necessary amount of x-axis as a numerical value, but the scale name is displayed up to 15 because it is a numerical value. We need 30 tick points and tick names for each column. Since no data is provided, I have created sample data as appropriate. Also, the small font of the scale is due to the size of 50 inches.
import numpy as np
import matplotlib.pyplot as plt
power_final_for_plotting = np.random.rand(450).reshape(15,30)
time_periods = np.arange(0,15,0.5)
sig_wave_height = np.arange(0.5,8.0,0.5)
fig, ax = plt.subplots()
fig.set_figheight(6)
fig.set_figwidth(12)
ax.matshow(power_final_for_plotting, cmap='GnBu', origin='upper')
ax.set_xticks(range(len(time_periods)))
ax.set_xticklabels([str(x) for x in time_periods])
ax.set_yticks(range(len(sig_wave_height)))
ax.set_yticklabels([str(x) for x in sig_wave_height])
for i in range(len(time_periods)):
for j in range(len(sig_wave_height)):
c = round(power_final_for_plotting[j, i],3)
ax.text(i, j, str(c), va='center', ha='center', size=9)
plt.tight_layout()
#print(ax.get_xticklabels())
plt.show()

How to subplot multiple KDE distributions, multiple categories in one column

I am trying to plot either 4 graphs (subplots) of KDE or 1 with 4 lines.
I have two columns:
Region: Charges:
southeast 6000
southeast 5422
southwest 3222
northwest 4222
northwest 5555
northeast 6729
etc 1000s of rows..4 regions
I'd like to visualize the distribution of these 4 areas.
Playing around with this and error messages (and I know it's not correct) 'Data must be 1-dimensional'.
fig, axes = plt.subplots(2, 2, sharex=True, figsize=(10,5))
fig.suptitle('Bigger 1 row x 2 columns axes with no data')
#axes[0].set_title('Title of the first chart')
reg_name = df2[['region','charges']].set_index('region')
southeast = reg_name.loc['southeast']
southwest = reg_name.loc['southwest']
northwest = reg_name.loc['northwest']
#c = df2.charges.values
#d = df2.region
# Set the dimensions of the plot
#widthInInches = 10
#heightInInches = 4
#plt.figure( figsize=(widthInInches, heightInInches) )
# Draw histograms and KDEs on the diagonal usin
#if( int(versionStrParts[1]) < 11 ):
# Use the older, now-deprectaed form
# ax = sns.distplot(c,
# kde_kws={"label": "Kernel Density", "color" : "black"},
# hist_kws={"label": "Histogram", "color" : 'lightsteelblue'})
#else:
# Use the more recent for
sns.kdeplot(ax=axes[0], x=southeast.index, y=southeast.values, color="black", label="Kernel Density")
axes[0].set_title(southeast.name)
sns.kdeplot(ax=axes[1], x=southwest.index, y=southwest.values, color="black", label="Kernel Density")
axes[1].set_title(southwest.name)
sns.kdeplot(ax=axes[0,0], data=df2[df2['region']=='southeast'], x='charges', color='k') should work for your data. Note that axes is a 2D array when both the number of rows and columns are larger than 1.
See How to plot a mean line on a distplot between 0 and the y value of the mean? for adding lines for mean, sdev etc..
Instead of doing the kdeplots one by one, sns.displot can draw them in one go (note that displot is different from distplot):
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
np.random.seed(12358)
regions = ['southeast', 'southwest', 'northeast', 'northwest']
df2 = pd.DataFrame({'region': np.repeat(regions, 100),
'charge': np.round(np.random.randn(400).cumsum() * 100 + 2000)})
g = sns.displot( kind='kde', data=df2, x='charge',
col='region', col_order=regions, col_wrap=2,
height=4, aspect=3, color='black')
for region,ax in g.axes_dict.items():
data = df2[df2['region'] == region]['charge'].values
xs, ys = ax.get_lines()[0].get_data()
median = np.median(data)
mean = data.mean()
sdev = data.std()
ax.vlines([mean-sdev, mean, mean+sdev], 0, np.interp([mean-sdev, mean, mean+sdev], xs, ys), color='b', ls=':')
ax.vlines(median, 0, np.interp(median, xs, ys), color='r', ls='--')
plt.tight_layout()
plt.show()
To draw all the regions into one plot, you can use:
fig, ax = plt.subplots(figsize=(12, 4))
sns.kdeplot(data=df2, x='charge', hue='region', ax=ax)

How to draw the normal distribution of a barplot with log x axis?

I'd like to draw a lognormal distribution of a given bar plot.
Here's the code
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np; np.random.seed(1)
import scipy.stats as stats
import math
inter = 33
x = np.logspace(-2, 1, num=3*inter+1)
yaxis = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.01,0.03,0.3,0.75,1.24,1.72,2.2,3.1,3.9,
4.3,4.9,5.3,5.6,5.87,5.96,6.01,5.83,5.42,4.97,4.60,4.15,3.66,3.07,2.58,2.19,1.90,1.54,1.24,1.08,0.85,0.73,
0.84,0.59,0.55,0.53,0.48,0.35,0.29,0.15,0.15,0.14,0.12,0.14,0.15,0.05,0.05,0.05,0.04,0.03,0.03,0.03, 0.02,
0.02,0.03,0.01,0.01,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0,0]
fig, ax = plt.subplots()
ax.bar(x[:-1], yaxis, width=np.diff(x), align="center", ec='k', color='w')
ax.set_xscale('log')
plt.xlabel('Diameter (mm)', fontsize='12')
plt.ylabel('Percentage of Total Particles (%)', fontsize='12')
plt.ylim(0,8)
plt.xlim(0.01, 10)
fig.set_size_inches(12, 12)
plt.savefig("Test.png", dpi=300, bbox_inches='tight')
Resulting plot:
What I'm trying to do is to draw the Probability Density Function exactly like the one shown in red in the graph below:
An idea is to convert everything to logspace, with u = log10(x). Then draw the density histogram in there. And also calculate a kde in the same space. Everything gets drawn as y versus u. When we have u at a top twin axes, x can stay at the bottom. Both axes get aligned by setting the same xlims, but converted to logspace on the top axis. The top axis can be hidden to get the desired result.
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
inter = 33
u = np.linspace(-2, 1, num=3*inter+1)
x = 10**u
us = np.linspace(u[0], u[-1], 500)
yaxis = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.01,0.03,0.3,0.75,1.24,1.72,2.2,3.1,3.9,
4.3,4.9,5.3,5.6,5.87,5.96,6.01,5.83,5.42,4.97,4.60,4.15,3.66,3.07,2.58,2.19,1.90,1.54,1.24,1.08,0.85,0.73,
0.84,0.59,0.55,0.53,0.48,0.35,0.29,0.15,0.15,0.14,0.12,0.14,0.15,0.05,0.05,0.05,0.04,0.03,0.03,0.03, 0.02,
0.02,0.03,0.01,0.01,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0,0]
yaxis = np.array(yaxis)
# reconstruct data from the given frequencies
u_data = np.repeat((u[:-1] + u[1:]) / 2, (yaxis * 100).astype(np.int))
kde = stats.gaussian_kde((u[:-1]+u[1:])/2, weights=yaxis, bw_method=0.2)
total_area = (np.diff(u)*yaxis).sum() # total area of all bars; divide by this area to normalize
fig, ax = plt.subplots()
ax2 = ax.twiny()
ax2.bar(u[:-1], yaxis, width=np.diff(u), align="edge", ec='k', color='w', label='frequencies')
ax2.plot(us, total_area*kde(us), color='crimson', label='kde')
ax2.plot(us, total_area * stats.norm.pdf(us, u_data.mean(), u_data.std()), color='dodgerblue', label='lognormal')
ax2.legend()
ax.set_xscale('log')
ax.set_xlabel('Diameter (mm)', fontsize='12')
ax.set_ylabel('Percentage of Total Particles (%)', fontsize='12')
ax.set_ylim(0,8)
xlim = np.array([0.01,10])
ax.set_xlim(xlim)
ax2.set_xlim(np.log10(xlim))
ax2.set_xticks([]) # hide the ticks at the top
plt.tight_layout()
plt.show()
PS: Apparently this also can be achieved directly without explicitly using u (at the cost of being slightly more cryptic):
x = np.logspace(-2, 1, num=3*inter+1)
xs = np.logspace(-2, 1, 500)
total_area = (np.diff(np.log10(x))*yaxis).sum() # total area of all bars; divide by this area to normalize
kde = gaussian_kde((np.log10(x[:-1])+np.log10(x[1:]))/2, weights=yaxis, bw_method=0.2)
ax.bar(x[:-1], yaxis, width=np.diff(x), align="edge", ec='k', color='w')
ax.plot(xs, total_area*kde(np.log10(xs)), color='crimson')
ax.set_xscale('log')
Note that the bandwidth set for gaussian_kde is a somewhat arbitrarily value. Larger values give a more equalized curve, smaller values keep closer to the data. Some experimentation can help.

Polar plot - Put one grid line in bold

I am trying to make use the polar plot projection to make a radar chart. I would like to know how to put only one grid line in bold (while the others should remain standard).
For my specific case, I would like to highlight the gridline associated to the ytick "0".
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
#Variables
sespi = pd.read_csv("country_progress.csv")
labels = sespi.country
progress = sespi.progress
angles=np.linspace(0, 2*np.pi, len(labels), endpoint=False)
#Concatenation to close the plots
progress=np.concatenate((progress,[progress[0]]))
angles=np.concatenate((angles,[angles[0]]))
#Polar plot
fig=plt.figure()
ax = fig.add_subplot(111, polar=True)
ax.plot(angles, progress, '.--', linewidth=1, c="g")
#ax.fill(angles, progress, alpha=0.25)
ax.set_thetagrids(angles * 180/np.pi, labels)
ax.set_yticklabels([-200,-150,-100,-50,0,50,100,150,200])
#ax.set_title()
ax.grid(True)
plt.show()
The gridlines of a plot are Line2D objects. Therefore you can't make it bold. What you can do (as shown, in part, in the other answer) is to increase the linewidth and change the colour but rather than plot a new line you can do this to the specified gridline.
You first need to find the index of the y tick labels which you want to change:
y_tick_labels = [-100,-10,0,10]
ind = y_tick_labels.index(0) # find index of value 0
You can then get a list of the gridlines using gridlines = ax.yaxis.get_gridlines(). Then use the index you found previously on this list to change the properties of the correct gridline.
Using the example from the gallery as a basis, a full example is shown below:
r = np.arange(0, 2, 0.01)
theta = 2 * np.pi * r
ax = plt.subplot(111, projection='polar')
ax.set_rmax(2)
ax.set_rticks([0.5, 1, 1.5, 2]) # less radial ticks
ax.set_rlabel_position(-22.5) # get radial labels away from plotted line
ax.grid(True)
y_tick_labels = [-100, -10, 0, 10]
ax.set_yticklabels(y_tick_labels)
ind = y_tick_labels.index(0) # find index of value 0
gridlines = ax.yaxis.get_gridlines()
gridlines[ind].set_color("k")
gridlines[ind].set_linewidth(2.5)
plt.show()
Which gives:
It is just a trick, but I guess you could just plot a circle and change its linewidth and color to whatever could be bold for you.
For example:
import matplotlib.pyplot as plt
import numpy as np
Yline = 0
Npoints = 300
angles = np.linspace(0,360,Npoints)*np.pi/180
line = 0*angles + Yline
ax = plt.subplot(111, projection='polar')
plt.plot(angles, line, color = 'k', linewidth = 3)
plt.ylim([-1,1])
plt.grid(True)
plt.show()
In this piece of code, I plot a line using plt.plot between any point of the two vectors angles and line. The former is actually all the angles between 0 and 2*np.pi. The latter is constant, and equal to the 'height' you want to plot that line Yline.
I suggest you try to decrease and increase Npoints while having a look to the documentaion of np.linspace() in order to understand your problem with the roundness of the circle.

Plot two histograms on the same graph and have their columns sum to 100

I have two sets of different sizes that I'd like to plot on the same histogram. However, since one set has ~330,000 values and the other has about ~16,000 values, their frequency histograms are hard to compare. I'd like to plot a histogram comparing the two sets such that the y-axis is the % of occurrences in that bin. My code below gets close to this, except that rather than having the individual bin values sum to 1.0, the integral of the histogram sums to 1.0 (this is because of the normed=True parameter).
How can I achieve my goal? I've already tried manually calculating the % frequency and using plt.bar() but rather than overlaying the plots, the plots are compared side by side. I want to keep the effect of having the alpha=0.5
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
if plt.get_fignums():
plt.close('all')
electric = pd.read_csv('electric.tsv', sep='\t')
gas = pd.read_csv('gas.tsv', sep='\t')
electric_df = pd.DataFrame(electric)
gas_df = pd.DataFrame(ngma_nonheat)
electric = electric_df['avg_daily']*30
gas = gas_df['avg_daily']*30
## Create a plot for NGMA gas usage
plt.figure("Usage Comparison")
weights_electric = np.ones_like(electric)/float(len(electric))
weights_gas = np.ones_like(gas)/float(len(gas))
bins=np.linspace(0, 200, num=50)
n, bins, rectangles = plt.hist(electric, bins, alpha=0.5, label='electric usage', normed=True, weights=weights_electric)
plt.hist(gas, bins, alpha=0.5, label='gas usage', normed=True, weights=weights_gas)
plt.legend(loc='upper right')
plt.xlabel('Average 30 day use in therms')
plt.ylabel('% of customers')
plt.title('NGMA Customer Usage Comparison')
plt.show()
It sounds like you don't want the normed/density kwarg in this case. You're already using weights. If you multiply your weights by 100 and leave out the normed=True option, you should get exactly what you had in mind.
For example:
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1)
x = np.random.normal(5, 2, 10000)
y = np.random.normal(2, 1, 3000000)
xweights = 100 * np.ones_like(x) / x.size
yweights = 100 * np.ones_like(y) / y.size
fig, ax = plt.subplots()
ax.hist(x, weights=xweights, color='lightblue', alpha=0.5)
ax.hist(y, weights=yweights, color='salmon', alpha=0.5)
ax.set(title='Histogram Comparison', ylabel='% of Dataset in Bin')
ax.margins(0.05)
ax.set_ylim(bottom=0)
plt.show()
On the other hand, what you're currently doing (weights and normed) would result in (note the units on the y-axis):
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1)
x = np.random.normal(5, 2, 10000)
y = np.random.normal(2, 1, 3000000)
xweights = 100 * np.ones_like(x) / x.size
yweights = 100 * np.ones_like(y) / y.size
fig, ax = plt.subplots()
ax.hist(x, weights=xweights, color='lightblue', alpha=0.5, normed=True)
ax.hist(y, weights=yweights, color='salmon', alpha=0.5, normed=True)
ax.set(title='Histogram Comparison', ylabel='% of Dataset in Bin')
ax.margins(0.05)
ax.set_ylim(bottom=0)
plt.show()

Categories