Turn Weighted Numbers into Multiple Histograms - python

I am using the below code to create a weighted list of random numbers within a range.
import csv
import random
import numpy as np
import matplotlib.pyplot as plt
# Build a list in which every item appears in proportion to its weight and
# write it to rnd_numbs.csv, one number per row under a 'number' header.
items = [1, 2, 3, 4, 5]
probabilities = [0.1, 0.1, 0.2, 0.2, 0.4]

# Rescale the weights so that they sum to 1.
prob = sum(probabilities)
print(prob)
c = 1.0 / prob
# A real list (not a lazy map object) because it is traversed more than once.
probabilities = [c * x for x in probabilities]
print(probabilities)

# Pick the weight whose decimal representation has the most digits after the
# point; that digit count decides the power of ten used for scaling below.
ml = max(probabilities, key=lambda x: len(str(x)) - str(x).find('.'))
ml = len(str(ml)) - str(ml).find('.') - 1
# round() before int() so that a product such as 28.999999999999996 becomes
# 29 instead of being truncated to 28.
amounts = [int(round(x * (10 ** ml))) for x in probabilities]

# Repeat each item amounts[i] times.  The list is assembled item by item, so
# it comes out grouped in the order of `items`; apply random.shuffle(itemsList)
# at this point if a random order is wanted.
itemsList = []
for item, amount in zip(items, amounts):
    itemsList += [item] * amount

# Text mode with newline="" is what the csv module expects; the context
# manager guarantees the file is flushed and closed.
with open("rnd_numbs.csv", "w", newline="") as csv_file:
    rnd_numbs = csv.writer(csv_file)
    rnd_numbs.writerow(['number'])
    for item in itemsList:
        rnd_numbs.writerow([item])
What I would like to do is (a) list these numbers in random order down the csv column (I am not sure why they are coming out pre-sorted), (b) list the numbers down the column instead of as one entry, and (c) create and save multiple histograms at defined intervals, such as the first 100 numbers, then the first 250 numbers, then the first 500 numbers, ... up to the end.
For (c) I would like to create multiple pictures such as this for various cutoffs of the data list.
Attempt at histogram
# Histogram of the first 20 entries of itemsList.
x = itemsList[0:20]
fig = plt.figure()
ax = fig.add_subplot(111)
# 10 is the number of bins; density=True replaces the removed `normed`
# keyword and scales the bars so that their total area is 1
ax.hist(x, 10, density=True, facecolor='green', alpha=0.75)
ax.set_xlim(0, 5)
ax.set_ylim(0, 500)
ax.grid(True)
plt.show()

As for the third part of your question, take a look at matplotlib (and numpy.loadtxt() for reading your data). There are many examples to help you learn the basics, as well as advanced features. Here's a quick example of plotting a histogram of a random normal distribution:
import numpy as np
import matplotlib.pyplot as plt
# Draw 10000 standard-normal samples and show them as a 100-bin histogram.
x = np.random.randn(10000)
fig = plt.figure()
ax = fig.add_subplot(111)
# hist() returns (bin heights, bin edges, patches); keep the whole tuple in
# `n` and give the two arrays that are needed readable names.
n = ax.hist(x, 100, facecolor='green', alpha=0.75)
bin_heights, bin_edges = n[0], n[1]
# Stretch the axis limits 10 % beyond the data on every side that is drawn.
xmin = 1.1 * min(bin_edges)
xmax = 1.1 * max(bin_edges)
ymax = 1.1 * max(bin_heights)
ax.set_xlim(xmin, xmax)
ax.set_ylim(0, ymax)
ax.grid(True)
plt.show()
which gives you a nice image:
You can make loops to generate multiple images using different ranges of your data, and save the generated figures in a number of formats, with or without previewing them first.

Related

matplotlib.pyplot: How to plot single graph with different Colormaps and a Legend?

I am plotting separate figures for each attribute and label for each data sample. Here is the illustration:
As illustrated in the last subplot (Label), my data contains seven numeric classes (0 to 6). I'd like to visualize these classes using different colors and a legend. Please note that I only want colors for the last subplot. How should I do that?
Here is the code of above plot:
# Stack one line plot per attribute, followed by a final plot of the labels,
# and save the resulting figure to save_file_name.
x = test_data["x"]
y = test_data["y"]
# one row per attribute column plus one extra row for the labels
n = x.shape[1] + 1
off = 0
plt.rcParams["figure.figsize"] = (40, 15)
# attribute rows
for i in range(6):
    row = off + 1  # subplot positions are 1-based
    plt.subplot(n, 1, row)
    plt.plot(x[:, off])
    plt.title('Attribute:' + str(i), y=0, loc='left')
    off = row
# label row goes into the last slot of the grid
plt.subplot(n, 1, n)
plt.plot(y)
plt.title('Label', y=0, loc='left')
plt.savefig(save_file_name, bbox_inches="tight")
plt.close()
First, just to set up a similar dataset:
import matplotlib.pyplot as plt
import numpy as np
# Synthetic stand-in for the question's data: 100 samples with 6 attributes
# each and one integer class label per sample.
x = np.random.random((100, 6))
# randint's upper bound is exclusive, so 7 yields the seven classes 0..6
y = np.random.randint(0, 7, 100)
# one subplot row per attribute
fig, axs = plt.subplots(6, figsize=(40, 15))
We could use plt.scatter() to color the individual points according to their label:
# One scatter plot per attribute column: sample index on the horizontal axis,
# attribute value on the vertical axis, point color taken from the label y.
for i, column in enumerate(x.T):
    sample_index = range(x.shape[0])
    axs[i].scatter(sample_index, column, c=y)
Or we could mask the arrays we're plotting:
# For every attribute column, draw one marker series per distinct label by
# masking out all samples that carry a different label.
for i, column in enumerate(x.T):
    for label in np.unique(y):
        other_labels = y != label
        axs[i].plot(np.ma.masked_where(other_labels, column), 'o')
Either way we get the same results:
Edit: Ah you've edited your question! You can do exactly the same thing for your last plot only, just modify my code above to take it out of the loop of subplots :)
As suggested, we imitate the matplotlib step function by creating a LineCollection to color the different line segments:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.collections import LineCollection
from matplotlib.patches import Patch
# Demo: plot three noisy signals plus a category track whose plateaus are
# drawn as thick, per-category colored segments with a matching legend.
#random data generation
np.random.seed(12345)
number_of_categories=4
# 20 runs, each repeating one random category between 1 and 29 times
y = np.concatenate([np.repeat(np.random.randint(0, number_of_categories), np.random.randint(1, 30)) for _ in range(20)])
#check the results with less points
#y = y[:10]
# three signals: y scaled by 1, 3 and 5, with uniform noise in [-1, 1) added
x = y[None] * np.linspace(1, 5, 3)[:, None]
x += 2 * np.random.random(x.shape) - 1
#your initial plot
# one row per signal plus a final row for the category track
num_plots = x.shape[0] + 1
fig, axes = plt.subplots(num_plots, 1, sharex=True, figsize=(10, 8))
for i, ax in enumerate(axes.flat[:-1]):
    ax.plot(x[i,:])
#first we create the matplotlib step function with x-values as their midpoint
axes.flat[-1].step(np.arange(y.size), y, where="mid", color="lightgrey", zorder=-1)
#then we plot colored segments with shifted index simulating the step function
# y.size + 1 boundaries, each half a sample to the left of its index
shifted_x = np.arange(y.size+1)-0.5
#and identify the step indexes
# padding with inf on both ends makes index 0 and index y.size count as steps
idx_steps, = np.nonzero(np.diff(y, prepend=np.inf, append=np.inf))
#create collection of plateau segments
# shape (number of plateaus, 2 end points, 2 coordinates)
colored_segments = np.zeros((idx_steps.size-1, 2, 2))
# x coordinates: left and right boundary of every plateau
colored_segments[:, :, 0] = np.vstack((shifted_x[idx_steps[:-1]], shifted_x[idx_steps[1:]])).T
# y coordinates: the plateau's category value at both end points
colored_segments[:, :, 1] = np.repeat(y[idx_steps[:-1]], 2).reshape(-1, 2)
#generate discrete color list
# n_levels holds the distinct category values (not their count);
# idx_levels maps every plateau to its position in n_levels
n_levels, idx_levels = np.unique(y[idx_steps[:-1]], return_inverse=True)
colorarr = np.asarray(plt.cm.tab10.colors[:n_levels.size])
#and plot the colored segments
lc_cs = LineCollection(colored_segments, colors=colorarr[idx_levels, :], lw=10)
lines_cs = axes.flat[-1].add_collection(lc_cs)
#scaling and legend generation
axes.flat[-1].set_ylim(n_levels.min()-0.5, n_levels.max()+0.5)
# one colored patch per category, laid out in a single row below the axes
axes.flat[-1].legend([Patch(color=colorarr[i, :]) for i, _ in enumerate(n_levels)],
                     [f"cat {i}" for i in n_levels],
                     loc="upper center", bbox_to_anchor=(0.5, -0.15),
                     ncol=n_levels.size)
plt.show()
Sample output:
Alternatively, you can use broken barh plots or color this axis or even all axes using axvspan.

Custom Histogram Normalization in matplotlib

I am trying to make a normalized histogram in matplotlib, however I want it normalized such that the total area will be 1000. Is there a way to do this?
I know to get it normalized to 1, you just have to include density=True,stacked=True in the argument of plt.hist(). An equivalent solution would be to do this and multiply the height of each column by 1000, if that would be more doable than changing what the histogram is normalized to.
Thank you very much in advance!
The following approach uses np.histogram to calculate the counts for each histogram bin. Using 1000 / total_count / bin_width as normalization factor, the total area will be 1000. On the contrary, to get the sum of all bar heights to be 1000, a factor of 1000 / total_count would be needed.
plt.bar is used to display the end result.
The example code calculates the same combined histogram with density=True, to compare it with the new histogram summing to 1000.
import matplotlib.pyplot as plt
import numpy as np
# Left panel: stacked density histogram as drawn by matplotlib.
# Right panel: the same stacked histogram rescaled by hand so that the total
# bar area equals 1000.
data = [np.random.randn(100) * 5 + 10, np.random.randn(300) * 4 + 14, np.random.randn(100) * 3 + 17]
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 4))
ax1.hist(data, stacked=True, density=True)
ax1.set_title('Histogram with density=True')

# Ten equal-width bins spanning the combined range of all datasets.
xmin = min(min(d) for d in data)
xmax = max(max(d) for d in data)
bins = np.linspace(xmin, xmax, 11)
bin_width = bins[1] - bins[0]

# Per-dataset bin counts and the overall number of samples.
counts = []
for d in data:
    hist_counts, _ = np.histogram(d, bins=bins)
    counts.append(hist_counts)
total_count = sum(sum(c) for c in counts)

# Use 1000 / total_count instead if the bar heights should sum to 1000.
factor = 1000 / total_count / bin_width  # for an area of 1000
thousands = [factor * c for c in counts]

# Stack the rescaled datasets on top of each other.
bottom = 0
for t in thousands:
    ax2.bar(bins[:-1], t, bottom=bottom, width=bin_width, align='edge')
    bottom = bottom + t
ax2.set_title('Histogram with total area of 1000')
plt.show()
An easy way to do this is to set up a second y-axis whose tick labels are the original multiplied by 1000, then hide the original axis' ticks:
import matplotlib.pyplot as plt
import numpy as np
# Show a density histogram whose y-axis is labelled in units multiplied by
# 1000, by putting the labels on a twin axis and hiding the original ticks.
data = [np.random.randn(5000)]
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
#hist returns a tuple that contains the bar heights at its 0 index:
y, _, _ = ax1.hist(data, density=True, bins=10, edgecolor='black')
#copy the limits the histogram axis actually uses, scaled by 1000, so that
#every tick on the second axis lines up with the true bar height
#(a rounded bar maximum would not match the autoscaled primary axis):
y_low, y_high = ax1.get_ylim()
max_y = y_high * 1000
ax2.set_ylim(y_low * 1000, max_y)
#hide original y-axis ticks:
ax1.yaxis.set_ticks([])
plt.show()

How to create grid plot with inner subplots?

I have configured subplots of (5 x 1) format shown in Fig. 1 as given by Figure block A in the MWE. I am trying to repeat them n times such that they appear in a grid format with number of rows and columns given by the function fitPlots as mentioned here; to give output as shown in Fig. 2.
Fig. 1 Initial plot
Fig. 2 Repeated plot (desired output)
What would be the best way to repeat the code block to create a grid plot with inner subplots? The MWE creates multiple pages, I want all of them on a single page.
MWE
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
import numpy as np
import math
# Sample curves shared by all figure blocks.
x = np.arange(1, 100, 0.2)
y_a = np.sqrt(x)
y_b = np.sin(x)
y_c = np.sin(x)
y_d = np.cos(x) * np.cos(x)
y_e = 1/x
########## Figure block A #####################
# One 5 x 1 figure (tall first row, four short rows, shared x-axis) saved as
# a single page of plot_grid.pdf.
with PdfPages('./plot_grid.pdf') as plot_grid_loop:
    fig, (a, b, c, d, e) = plt.subplots(5, 1, sharex=True, gridspec_kw={'height_ratios': [5, 1, 1, 1, 1]})
    a.plot(x, y_a)
    b.plot(x, y_b)
    c.plot(x, y_c)
    d.plot(x, y_d)
    e.plot(x, y_e)
    plot_grid_loop.savefig()
    # close() has to be called; a bare `plt.close` only names the function
    # and leaves the figure open.
    plt.close(fig)
########## Figure block A #####################
# from https://stackoverflow.com/a/43366784/4576447
def fitPlots(N, aspect=(16,9)):
    """Return a ``(rows, cols)`` grid with at least ``N`` cells.

    The grid starts from the largest whole-number grid that follows the
    ``aspect`` (width, height) ratio without exceeding ``N`` cells, then
    grows by alternately adding a column and a row until ``N`` cells fit.

    Parameters
    ----------
    N : number of plots to place; must not be negative.
    aspect : indexable whose first two entries are the positive width and
        height of the target aspect ratio.

    Raises
    ------
    ValueError
        If ``N`` is negative or a side of ``aspect`` is not positive.
    """
    width = aspect[0]
    height = aspect[1]
    if N < 0:
        raise ValueError("N must not be negative")
    if width <= 0 or height <= 0:
        raise ValueError("aspect width and height must be positive")
    area = width*height*1.0
    # scale factor that shrinks the aspect rectangle to an area of N cells
    factor = (N/area)**(1/2.0)
    cols = math.floor(width*factor)
    rows = math.floor(height*factor)
    # grow the shorter side of the aspect ratio first when it is the width
    rowFirst = width < height
    while rows*cols < N:
        if rowFirst:
            rows += 1
        else:
            cols += 1
        # alternate between adding a row and adding a column
        rowFirst = not rowFirst
    return rows, cols
n_plots = 15
n_rows, n_cols = fitPlots(n_plots)
# Repeats the 5 x 1 figure block n_plots times; every iteration creates its
# own figure and therefore its own page in plot_grid.pdf.
with PdfPages('./plot_grid.pdf') as plot_grid_loop:
    for m in range(1, n_plots+1):
        fig, (a, b, c, d, e) = plt.subplots(5, 1, sharex=True, gridspec_kw={'height_ratios': [5, 1, 1, 1, 1]})
        a.plot(x, y_a)
        b.plot(x, y_b)
        c.plot(x, y_c)
        d.plot(x, y_d)
        e.plot(x, y_e)
        plot_grid_loop.savefig()
        # call close() so that the figures do not accumulate in memory
        plt.close(fig)
This can be done by generating a GridSpec object with gs_fig = fig.add_gridspec() that contains enough rows and columns to fit the five figure blocks (note that when you use plt.subplots a GridSpec is also generated and can be accessed with ax.get_gridspec()). Each empty slot in the GridSpec can then be filled with a sub-GridSpec with gs_sub = gs_fig[i].subgridspec() to hold the five subplots. The trickier part is sharing the x-axis. This can be done by generating an empty first Axes with which the x-axis of all the subplots can be shared.
The following example illustrates this with only three figure blocks, based on the code sample you have shared but with some differences regarding the figure design: the number of rows is computed based on the chosen number of columns, and the figure dimensions are set based on a chosen figure width and aspect ratio. The code for saving the figure to a pdf file is not included.
import numpy as np # v 1.19.2
import matplotlib.pyplot as plt # v 3.3.4
# Create variables to plot
x = np.arange(1, 100, 0.2)
y_a = np.sqrt(x)
y_b = np.sin(x)
y_c = np.sin(x)
y_d = np.cos(x)*np.cos(x)
y_e = 1/x
# Set parameters for figure dimensions
nplots = 3 # random number of plots for this example
ncols = 2
nrows = int(np.ceil(nplots/ncols))
subp_w = 10/ncols # 10 is the total figure width in inches
subp_h = 1*subp_w # set subplot aspect ratio
# Create figure containing GridSpec object with appropriate dimensions
fig = plt.figure(figsize=(ncols*subp_w, nrows*subp_h))
gs_fig = fig.add_gridspec(nrows, ncols)
# Loop through GridSpec to add sub-GridSpec for each figure block
heights = [5, 1, 1, 1, 1]
for i in range(nplots):
    gs_sub = gs_fig[i].subgridspec(len(heights), 1, height_ratios=heights, hspace=0.2)
    ax = fig.add_subplot(gs_sub[0, 0]) # generate first empty Axes to enable sharex
    ax.axis('off') # remove x and y axes because it is overwritten in the loop below
    # Loop through y variables to plot all the subplots with shared x-axis
    for j, y in enumerate([y_a, y_b, y_c, y_d, y_e]):
        ax = fig.add_subplot(gs_sub[j, 0], sharex=ax)
        ax.plot(x, y)
        # Only the bottom subplot of a block keeps its x tick labels.  The row
        # index is tested directly because Axes.is_last_row() is no longer
        # available in current matplotlib releases.
        if j < len(heights) - 1:
            ax.tick_params(labelbottom=False)
Reference: matplotlib tutorial GridSpec using SubplotSpec

In Python, how do I plot a color-coded 2-D graph (X, Y, color) for inequally-spaced / random y values?

I'm plotting something similar to a spectrogram. I have some 1D arrays of length N, each of which corresponds to one 'horizontal line' in a 2D graph. For a spectrogram [M,N], I would need M such N-length arrays to fill all M horizontal lines.
However, I only have data for a smaller number of lines. Let's say I have m < M arrays. These don't correspond to equally spaced values on the y axis; they are arbitrary values. For example, I might only have arrays corresponding to lines 6, 44, 44.5 and 92 (where m=4).
I want to have the y axis from 0 to 100, and plot these lines of values for only the y-values I have, and 0 otherwise. How do I do this?
If I had arrays for y values that are equally spaced on the Y axis, I can do this:
# Sketch of the equally spaced case: each row of `data` is one horizontal
# line of the image and the y tick labels are taken from y_values.
y_values = np.array(M) # ticks for y axis - equal-spaced. eg: [2, 2.5, 3, 3.5]. These are the only values for which data exists.
get_y_values(y_values)
data = np.array([M,N])
get_data(data)
fig = pyplot.figure()
vmax = np.amax(data)
ax = fig.add_subplot(1, 1, 1)
ax.imshow(data, origin='lower', aspect='auto', vmin=0, vmax=vmax)
ax.set_xlabel('Time')
ax.set_ylabel('Frequency')
# one tick per image row, labelled with the corresponding y value
ax.set_yticks(np.arange(0, y_values.size))
ax.set_yticklabels(y_values)
pyplot.show()
But it won't work for random y-values, as they'll appear equally spaced when they aren't.
There are a number of ways to do this, but a quick and easy way would be to loop over each time series and plot them individually.
import numpy as np
import matplotlib.pyplot as plt
# Draw M rows of N random values as colored bands of height dy, each band
# starting at its own y value, on a y-axis running from 0 to 100.
M = 4
N = 100
dy = 2.0 # The desired vertical thickness of each line
y_values = [6, 44, 47, 92]
x = np.arange(0, N)
data = np.random.rand(M,N)
fig = plt.figure()
vmax = np.amax(data)
ax = fig.add_subplot(1, 1, 1)
for i in range(M):
    # the row is passed twice so that it spans from y_values[i] to
    # y_values[i] + dy; vmin/vmax keep the color scale common to all bands
    ax.pcolor(x, [y_values[i], y_values[i] + dy], [data[i],data[i]], vmin=0, vmax=vmax)
ax.set_xlabel('Time')
ax.set_ylabel('Frequency')
ax.set_yticks(y_values)
ax.set_ylim(0,100)
plt.show()
Here's a link to the output figure.
I set out to solve the problem of filling in the missing data points. This code assumes you want a list of xs from 0 to 100 and that you already have a few xs and a few corresponding ys. It then fills in the rest of the xs and sets their corresponding y values to 0. After that, it orders them by x values, zips, and prints. I figured you can adapt this from here. Hopefully, I haven't misunderstood.
current_xs = [6, 44, 44.5, 92] #the x values
current_ys = [7, 2, 45, 5] #corresponding y values I made up


def fill(current_xs, current_ys):
    """Pad the data with zeros on a 0.5-spaced grid from 0 to 99.5.

    Every grid value 0, 0.5, ..., 99.5 that is missing from ``current_xs``
    is appended to it, with a matching 0 appended to ``current_ys`` (both
    lists are modified in place).  The (x, y) pairs are then sorted by x,
    printed, and returned.
    """
    # Set lookup instead of scanning the growing list for every grid value.
    # The appended values are all distinct, so only the initial x values
    # need to be remembered.
    known_xs = set(current_xs)
    for i in range(0, 200):
        grid_x = i/2
        if grid_x not in known_xs:
            current_xs.append(grid_x)
            current_ys.append(0)
    total = sorted(zip(current_xs, current_ys), key=lambda pair: pair[0])
    print(total)
    return total


fill(current_xs, current_ys)

Creating matplotlib legend with dynamic number of columns

I would like to create a legend in matplotlib with at max 5 entries per column. Right now, I can manually set the number of columns like so:
# Current approach: the number of legend columns is fixed at 2 by hand.
leg = plt.legend(loc='best', fancybox=None, ncol=2)
How do I modify this so that at most 5 entries are allowed per column?
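There's no built-in way to specify a number of rows instead of a number of columns. However, you can count the items that would be added to the legend and derive the number of columns from that. The public way to get them is ax.get_legend_handles_labels(); older matplotlib versions also offered the private ax._get_legend_handles() method, which should be avoided.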
For example:
import numpy as np
import matplotlib.pyplot as plt
# Plot a random number of lines and size the legend so that no column holds
# more than `nrows` entries.
numlines = np.random.randint(1, 15)
x = np.linspace(0, 1, 10)
fig, ax = plt.subplots()
for i in range(1, numlines + 1):
    ax.plot(x, i * x, label='$y={}x$'.format(i))
# Public API for the artists that will appear in the legend; the private
# ax._get_legend_handles() is not available in current matplotlib.
handles, labels = ax.get_legend_handles_labels()
numitems = len(handles)
nrows = 5
# smallest column count that keeps every column at nrows entries or fewer
ncols = int(np.ceil(numitems / float(nrows)))
ax.legend(ncol=ncols, loc='best')
plt.show()

Categories