Custom Histogram Normalization in matplotlib - python

I am trying to make a normalized histogram in matplotlib, however I want it normalized such that the total area will be 1000. Is there a way to do this?
I know to get it normalized to 1, you just have to include density=True,stacked=True in the argument of plt.hist(). An equivalent solution would be to do this and multiply the height of each column by 1000, if that would be more doable than changing what the histogram is normalized to.
Thank you very much in advance!

The following approach uses np.histogram to calculate the counts for each histogram bin. Using 1000 / total_count / bin_width as the normalization factor, the total area will be 1000. Alternatively, to make the bar heights sum to 1000, a factor of 1000 / total_count would be needed.
plt.bar is used to display the end result.
The example code also draws the same combined histogram with density=True, to compare it with the new histogram whose total area is 1000.
import matplotlib.pyplot as plt
import numpy as np
# Three samples with different sizes, spreads and locations; both panels stack them.
data = [np.random.randn(100) * 5 + 10, np.random.randn(300) * 4 + 14, np.random.randn(100) * 3 + 17]

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 4))

# Left panel: matplotlib's own density normalisation, for comparison.
ax1.hist(data, stacked=True, density=True)
ax1.set_title('Histogram with density=True')

# Right panel: ten equal-width bins spanning the combined range of all samples.
xmin = min(d.min() for d in data)
xmax = max(d.max() for d in data)
bins = np.linspace(xmin, xmax, 11)
bin_width = bins[1] - bins[0]

# Per-sample bin counts and the overall number of observations.
counts = [np.histogram(d, bins=bins)[0] for d in data]
total_count = sum(c.sum() for c in counts)

# factor = 1000 / total_count  # to sum to 1000
factor = 1000 / total_count / bin_width  # for an area of 1000
scaled_counts = [c * factor for c in counts]

# Stack the bars by hand: each sample starts where the previous ones ended.
bottom = 0
for heights in scaled_counts:
    ax2.bar(bins[:-1], heights, bottom=bottom, width=bin_width, align='edge')
    bottom = bottom + heights
ax2.set_title('Histogram with total area of 1000')
plt.show()

An easy way to do this is to set up a second y-axis whose tick labels are the original multiplied by 1000, then hide the original axis' ticks:
import matplotlib.pyplot as plt
import numpy as np
# A single sample, passed to hist() as a one-element list of datasets.
data = [np.random.randn(5000)]
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
# Draw the density histogram on the primary axis.
ax1.hist(data, density=True, bins=10, edgecolor='black')
# Give the secondary axis exactly the primary axis' limits multiplied by 1000,
# so every tick on it lines up with the bar heights. (Deriving the limit from a
# rounded y.max() leaves the two axes on different scales, and rounds to zero
# for small densities.)
y_low, y_high = ax1.get_ylim()
ax2.set_ylim(y_low * 1000, y_high * 1000)
# hide original y-axis ticks:
ax1.yaxis.set_ticks([])
plt.show()

Related

Matplotlib stacked histogram label

Here is my picture. I need to make labels for those bars; however, every upper layer contains the lower layers - so each label should contain grouped colors, i.e. blue - dataset 1, blue/orange - dataset 2, blue/orange/green - dataset 3 and finally blue/orange/green/purple - dataset 4. Is it possible to do this? Thank you.
enter image description here
# Fragment: ax1, C, barvy_histogram and mpl are defined outside this snippet.
binwidth = 1
# Stacked histogram with one-unit bins from 81 up to (not including) 105.
n, bins, patches = ax1.hist(
    C,
    bins=range(81, 105, binwidth),
    density=False,
    histtype='barstacked',
    edgecolor='gray',
    color=barvy_histogram,
    linewidth=0.3,
)
# One hatch pattern per stacked dataset.
hatches = ['//', 'x', '..', 'oo']
for patch_set, hatch in zip(patches, hatches):
    for patch in patch_set.patches:
        patch.set_hatch(hatch)
        # NOTE: the earlier `patch.set_linewidth=0.1` / `patch.set_color='gray'`
        # lines assigned to the method names instead of calling them, so they
        # only shadowed the methods and changed nothing on screen. They are
        # dropped; edge colour and line width come from the hist() call above.
mpl.rcParams['hatch.linewidth'] = 0.5
The following approach uses the tuple legend handler (HandlerTuple) to combine the legend handles. It produces a horizontal layout, while maybe a vertical stacking would be more interesting.
The code starts with creating some test data, supposing C is an Nx4 array of integers. The bin edges are set at halves to make sure that floating point accuracy wouldn't place values in the wrong bin.
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerTuple
import numpy as np
# first, create some test data: integer-valued random walks reshaped to four columns
C = (np.random.normal(0.001, 1, (100, 20)).cumsum(axis=0) * 1.2 + 90).astype(int).reshape(-1, 4)
c_min = C.min()
c_max = C.max()

mpl.rcParams['hatch.linewidth'] = 0.5
fig, ax1 = plt.subplots(figsize=(12, 5))
binwidth = 1
# One colour per column of C.
colors = plt.cm.Set2.colors[:C.shape[1]]
# Bin edges sit on the half-integers so every integer value lies well inside its bin.
_, _, patches = ax1.hist(
    C,
    bins=np.arange(c_min - 0.5, c_max + binwidth, binwidth),
    density=False,
    histtype='barstacked',
    edgecolor='gray',
    color=colors,
    linewidth=0.3,
    label=[f'N={p}' for p in range(25, 101, 25)],
)
# One hatch pattern per stacked dataset.
hatches = ['//', 'x', '..', 'oo']
for patch_set, hatch in zip(patches, hatches):
    for patch in patch_set.patches:
        patch.set_hatch(hatch)
        # NOTE: a former `patch.set_linewidth = 0.1` line was removed here. It
        # assigned to the method name instead of calling it, so it had no
        # visual effect; the line width is the 0.3 passed to hist().
handles, labels = ax1.get_legend_handles_labels()
# Legend entry i shows the handles of datasets 0..i side by side in one tuple.
ax1.legend(
    handles=[tuple(handles[:i + 1]) for i in range(C.shape[1])],
    labels=labels,
    handlelength=6,
    handler_map={tuple: HandlerTuple(ndivide=None, pad=0)},
)
plt.show()

Matplotlib polar histogram has shifted bins

I am using Matplotlib to create a polar histogram. The correct data for the histogram, in radians is below:
The alignment is [0,0.78) radians (aka 0 to 45 degrees) [0.78,...) (45 to 90 degrees) etc.
However, when plotting it with the polar plot, the bin is now centred on 0 rather than starting at 0. Yet the histogram count is the same.
If it was actually (-22.5 degrees, 22.5 degrees), then the histogram distribution would be different. Therefore it seems like the polar plot axis label is incorrect - that is, the 0 degrees label should actually be 22.5 degrees (or alternatively the 0 degrees label should be shifted 22.5 degrees clockwise).
Is there any way anyone knows how to achieve this?
Relevant Code:
Histogram
# bins_number + 1 equally spaced bin edges covering [-pi, pi]
# (bins_number and angles are defined outside this fragment).
bins = np.linspace(-np.pi, np.pi, bins_number + 1)
# Keep only the first of the three values hist() returns; it is reused below as the bar heights.
n, _, _ = plt.hist(angles, bins) # Create histogram
plt.show()
Note, angles is a list of angles in radians
Polar
# Fragment: bins, n and bins_number come from the histogram step before it.
plt.clf()  # discard the rectangular histogram; only its counts are reused

# Each bar is as wide as one bin.
width = 2 * np.pi / bins_number
ax = plt.subplot(1, 1, 1, projection='polar')

# One bar per bin, positioned at the bin's left edge value (default alignment).
bars = ax.bar(bins[:bins_number], n, width=width, bottom=0.0)
for bar in bars:
    bar.set_alpha(0.5)
plt.show()
Complete Code
import numpy as np
import matplotlib.pyplot as plt
import csv
# Read the angles from the second column of the CSV, skipping the header line.
with open('circ2.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # Skip header line
    angles = [float(row[1]) for row in reader]

bins_number = 8  # the [-180, 180) interval will be subdivided into this
bins = np.linspace(-np.pi, np.pi, bins_number + 1)

# Only the per-bin counts are needed; the rectangular plot is cleared right away.
n, _, _ = plt.hist(angles, bins)  # Create histogram
plt.clf()

# Redraw the counts as bars on a polar axis, one bin-wide bar per bin.
width = 2 * np.pi / bins_number
ax = plt.subplot(1, 1, 1, projection='polar')
# NOTE: with the default alignment each bar is centred on its left bin edge.
bars = ax.bar(bins[:-1], n, width=width, bottom=0.0)
for bar in bars:
    bar.set_alpha(0.5)
plt.show()
Thanks
Solved by adding align='edge' in the bar plot. That is:
# align='edge' makes each bar start at its left bin edge instead of being centred on it.
bars = ax.bar(bins[:bins_number], n, width=width, bottom=0.0, align='edge')
Thanks to ImportanceOfBeingErnest

Plot two histograms on the same graph and have their columns sum to 100

I have two sets of different sizes that I'd like to plot on the same histogram. However, since one set has ~330,000 values and the other has about ~16,000 values, their frequency histograms are hard to compare. I'd like to plot a histogram comparing the two sets such that the y-axis is the % of occurrences in that bin. My code below gets close to this, except that rather than having the individual bin values sum to 1.0, the integral of the histogram sums to 1.0 (this is because of the normed=True parameter).
How can I achieve my goal? I've already tried manually calculating the % frequency and using plt.bar() but rather than overlaying the plots, the plots are compared side by side. I want to keep the effect of having the alpha=0.5
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Close figures left over from an earlier run, if any.
if plt.get_fignums():
    plt.close('all')

# Load both tab-separated data sets.
electric = pd.read_csv('electric.tsv', sep='\t')
gas = pd.read_csv('gas.tsv', sep='\t')
electric_df = pd.DataFrame(electric)
# Fixed: this used the undefined name `ngma_nonheat`; the gas table loaded above is meant.
gas_df = pd.DataFrame(gas)

# Scale the average daily use to a 30-day figure.
electric = electric_df['avg_daily'] * 30
gas = gas_df['avg_daily'] * 30

## Create a plot for NGMA gas usage
plt.figure("Usage Comparison")

# Per-observation weights of 1/len, so each data set's weights add up to 1.
weights_electric = np.ones_like(electric) / float(len(electric))
weights_gas = np.ones_like(gas) / float(len(gas))

bins = np.linspace(0, 200, num=50)
# NOTE: normed=True is kept on purpose: it is the setting the question asks about.
# Newer Matplotlib releases spell this keyword `density`.
n, bins, rectangles = plt.hist(electric, bins, alpha=0.5, label='electric usage', normed=True, weights=weights_electric)
plt.hist(gas, bins, alpha=0.5, label='gas usage', normed=True, weights=weights_gas)

plt.legend(loc='upper right')
plt.xlabel('Average 30 day use in therms')
plt.ylabel('% of customers')
plt.title('NGMA Customer Usage Comparison')
plt.show()
It sounds like you don't want the normed/density kwarg in this case. You're already using weights. If you multiply your weights by 100 and leave out the normed=True option, you should get exactly what you had in mind.
For example:
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1)  # reproducible samples

# Two samples of very different size.
x = np.random.normal(5, 2, 10000)
y = np.random.normal(2, 1, 3000000)

# Every observation carries 100 / size, so each histogram's bar heights add up to 100 (%).
xweights = np.full_like(x, 100 / x.size)
yweights = np.full_like(y, 100 / y.size)

fig, ax = plt.subplots()
for sample, weights, colour in ((x, xweights, 'lightblue'), (y, yweights, 'salmon')):
    ax.hist(sample, weights=weights, color=colour, alpha=0.5)
ax.set_title('Histogram Comparison')
ax.set_ylabel('% of Dataset in Bin')
ax.margins(0.05)
ax.set_ylim(bottom=0)
plt.show()
On the other hand, what you're currently doing (weights and normed) would result in (note the units on the y-axis):
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1)  # reproducible samples

# Two samples of very different size.
x = np.random.normal(5, 2, 10000)
y = np.random.normal(2, 1, 3000000)

# Same percentage weights as before: every observation carries 100 / size.
xweights = np.full_like(x, 100 / x.size)
yweights = np.full_like(y, 100 / y.size)

fig, ax = plt.subplots()
# normed=True is combined with the weights here to reproduce the asker's situation.
# NOTE: newer Matplotlib releases spell this keyword `density`.
for sample, weights, colour in ((x, xweights, 'lightblue'), (y, yweights, 'salmon')):
    ax.hist(sample, weights=weights, color=colour, alpha=0.5, normed=True)
ax.set_title('Histogram Comparison')
ax.set_ylabel('% of Dataset in Bin')
ax.margins(0.05)
ax.set_ylim(bottom=0)
plt.show()

Turn Weighted Numbers into Multiple Histograms

I am using the below code to create a weighted list of random numbers within a range.
import csv
import random
import numpy as np
import matplotlib.pyplot as plt
# Build a list in which each item appears in proportion to its probability and
# write it to rnd_numbs.csv, one value per row under a 'number' header.
items = [1, 2, 3, 4, 5]
probabilities = [0.1, 0.1, 0.2, 0.2, 0.4]

# Rescale the probabilities so that they add up to 1.
prob = sum(probabilities)
print(prob)
c = 1.0 / prob
probabilities = [c * p for p in probabilities]
print(probabilities)

# ml: the largest number of characters after the decimal point among the
# printed probabilities; multiplying by 10**ml turns each into a whole count.
ml = max(probabilities, key=lambda x: len(str(x)) - str(x).find('.'))
ml = len(str(ml)) - str(ml).find('.') - 1
amounts = [int(x * (10 ** ml)) for x in probabilities]

# Repeat every item `amount` times, in the order of `items`.
itemsList = []
for item, amount in zip(items, amounts):
    itemsList += [item] * amount

# Text mode with newline='' is what the csv module expects on Python 3; the
# with-block makes sure the file is flushed and closed.
with open("rnd_numbs.csv", "w", newline="") as csv_file:
    rnd_numbs = csv.writer(csv_file)
    rnd_numbs.writerow(['number'])
    for item in itemsList:
        rnd_numbs.writerow([item])
What I would like to do is (a) list these numbers in random order down the csv column (I am not sure why they come out pre-sorted), (b) list the numbers down the column instead of as one entry, and (c) create and save multiple histograms at defined intervals, such as the first 100 numbers, then the first 250 numbers, then the first 500 numbers, ... to the end.
For (c) I would like to create multiple pictures such as this for various cutoffs of the data list.
Attempt at histogram
# Histogram of the first 20 entries of itemsList (built by the script above).
x = itemsList[0:20]
fig = plt.figure()
ax = fig.add_subplot(111)
# 10 is the number of bins
ax.hist(x, 10, normed=1, facecolor='green', alpha=0.75)
# Fixed axis ranges: x from 0 to 5, y from 0 to 500.
ax.set_xlim(0, 5)
ax.set_ylim(0, 500)
ax.grid(True)
plt.show()
As for the third part of your question, take a look at matplotlib (and numpy.loadtxt() for reading your data). There are many examples to help you learn the basics, as well as advanced features. Here's a quick example of plotting a histogram of a random normal distribution:
import numpy as np
import matplotlib.pyplot as plt
# 10000 draws from the standard normal distribution.
x = np.random.randn(10000)

fig, ax = plt.subplots()

# 100 is the number of bins; hist() hands back the bin heights and the bin edges.
heights, edges, _ = ax.hist(x, 100, facecolor='green', alpha=0.75)

# Stretch the outermost bin edges and the tallest bar by 10 % to leave a margin.
xmin = edges[0] * 1.1
xmax = edges[-1] * 1.1
ymax = heights.max() * 1.1

ax.set_xlim(xmin, xmax)
ax.set_ylim(0, ymax)
ax.grid(True)
plt.show()
which gives you a nice image:
You can make loops to generate multiple images using different ranges of your data, and save the generated figures in a number of formats, with or without previewing them first.

fourfold display in matplotlib using polar axis

I am trying to create a fourfold display in matplotlib:
but can't get the logic of the polar axis. This is what I have tried so far:
import numpy as np
import matplotlib.pyplot as plt
# radius of each bar
radii = [10, 15, 20, 25]
# angular width of each bar: a quarter of a full turn, in radians
width = np.pi/ 2
# angle of each bar -- NOTE: these are degree values, but they are passed
# unchanged to ax.bar on a polar axis, which reads its angles in radians
theta = [0,90,180,270]
ax = plt.subplot(111, polar=True)
bars = ax.bar(theta, radii, width=width)
plt.show()
not sure what I am missing but I just want four "equal" areas which touch each others. What I can't get to work is
How to "control" the angles ? I mean to have all four "slides" being in [0,90], [90,180], [180, 270], [270, 360].
I do not understand what "width" corresponds to.
theta is expected to be in radians, not degrees.
If you just slightly tweak your code:
import numpy as np
import matplotlib.pyplot as plt
# Bar lengths, one per quadrant.
radii = [10, 15, 20, 25]

# Each bar covers a quarter turn (radians).
width = np.pi / 2

# Bar positions converted from degrees to the radians a polar axis expects.
theta = np.deg2rad([0, 90, 180, 270])

ax = plt.subplot(111, projection='polar')
# NOTE(review): with the default alignment each bar is centred on its angle;
# pass align='edge' if every bar should instead start at 0/90/180/270 degrees.
bars = ax.bar(theta, radii, width=width, alpha=0.5)
plt.show()
You'll get what you'd expect:
On a side note, for the exact plot you're making it might make more sense to use 4 Wedges on a rectangular plot with centered spines.
In case somebody else is interested, here is what I came up with.
To use the Berkeley admissions example from the paper, one first needs to standardize the values (to equate the margins) using iterative proportional fitting:
def ContTableIPFP(x1ContTable):
    """Fit a uniform seed table to the margins of a contingency table.

    One row-scaling step followed by one column-scaling step (a single pass
    of iterative proportional fitting) is applied to a seed of all ones.
    The result has the same row and column totals as the input; entry
    ``(i, j)`` equals ``row_total[i] * col_total[j] / grand_total``.

    Parameters
    ----------
    x1ContTable : array-like, two-dimensional
        Contingency table of counts. Written for 2 x 2 tables; any
        rows x columns shape is accepted.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape as the input.

    Notes
    -----
    NOTE(review): because the seed is uniform, the output depends only on
    the margins of the input, not on the association between its rows and
    columns. Confirm this is the standardisation the fourfold display needs.
    A table whose grand total is zero yields NaN (0 / 0).
    """
    import numpy as np

    observed = np.asarray(x1ContTable, dtype=float)

    # Margins of the observed table.
    row_totals = observed.sum(axis=1)
    col_totals = observed.sum(axis=0)

    # Seed: all ones. Built explicitly rather than as `table / table`, which
    # turns every zero cell into NaN and poisons the whole result.
    seed = np.ones_like(observed)

    # Step 1: scale each row of the seed so it adds up to the observed row total.
    fitted = seed * row_totals[:, np.newaxis] / seed.sum(axis=1, keepdims=True)

    # Step 2: scale each column so it adds up to the observed column total.
    fitted = fitted * col_totals / fitted.sum(axis=0)

    return fitted
and then plot
def FourfoldDisplay(radii):
    """Draw four quarter-turn-wide bars on a polar axis and show the figure.

    radii : sequence of four bar lengths, e.g. ``[10, 15, 20, 25]``; the bars
    are placed at 0, 90, 180 and 270 degrees in that order.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    # Every bar spans a quarter of the circle (radians).
    quarter_turn = np.pi / 2
    # Bar positions, converted from degrees to radians.
    angles = np.radians([0, 90, 180, 270])

    ax = plt.subplot(111, polar=True)
    # NOTE(review): with the default alignment each bar is centred on its
    # angle; pass align='edge' if the bars should start at the angles instead.
    ax.bar(angles, radii, width=quarter_turn, alpha=0.5)

    # Hide the angular tick labels and the radial ticks.
    ax.set_xticklabels([])
    ax.set_yticks([])
    plt.show()
to use
import numpy as np
# 2 x 2 table of counts used as the example input.
table = np.array([[1198, 1493],
                  [557, 1278]])
# Fit the table, flatten it to four values and draw them as the four bars.
radii = ContTableIPFP(table).flatten()
FourfoldDisplay(radii)

Categories