Calculating the R-squared of a slope over a specific part of a graph - Python

As is tradition, I'll say that I am pretty new to Python.
I have a set of x and y values in a CSV file, and my y values are pretty noisy. So far, I have managed to use a filter (scipy.signal.savgol_filter) to smooth the noise, plot my graph, and get a linear regression of my data where it shows a linear trend. This part is important, because my question is about the linear fitting of some part of the data. Here is the code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.signal
import os
from scipy.signal import savgol_coeffs
from sklearn.metrics import r2_score
from scipy.linalg import lstsq
plt.rc('lines',linewidth=1)
plt.rc('axes', labelsize=16)
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)
plt.rc('legend', fontsize=10)
# define material parameters
deprate = 5.90E-07
#deposition rate unit: cm/s
Ms = 180000 # Si substrate modulus, unit: MPa
hs = 0.03 # Si substrate thickness, unit: cm
stressfac = Ms*hs**2/6 # stress prefactor, unit: MPa
def fit_slope(hfilm, curvature, h0, h1):
    # Use least-squares fitting to find the slope of the film-thickness vs curvature
    # curve between thickness = h0 and h1.
    # Returns the fitting parameters p. The least-squares line is y = p[1]*x + p[0].
    xdata = hfilm[(hfilm > h0) & (hfilm < h1)]
    ydata = curvature[(hfilm > h0) & (hfilm < h1)]
    A = xdata[:, np.newaxis] ** [0, 1]
    p, *_ = lstsq(A, ydata)
    return p
def read_MOSSdata(filename, deprate):
    data = pd.read_csv(filename, sep=r'\s*,\s*', engine='python')
    time = data['time [s]'][~data['time [s]'].isna()].to_numpy()
    curvature = data['Curvature'][~data['time [s]'].isna()]  # curvature unit: 1/cm
    hfilm = time * deprate  # film thickness unit: cm
    return time, hfilm, curvature
filename = (r'C:\Users\yavuz\01-0722-2.csv')
time, hfilm, curvature = read_MOSSdata(filename, deprate)
h0 = 0.00005
h1 = 0.00008
xdata = np.linspace(h0, h1, 500)
yhat = scipy.signal.savgol_filter(curvature, 21,1)
p = fit_slope(hfilm, yhat, h0, h1)
plt.plot(hfilm, curvature)
plt.plot(hfilm, yhat, color='red', label = 'filtered data')
plt.plot(xdata, p[1]*xdata + p[0], color='green', linewidth=4, label = 'linear fitting')
plt.xlabel("Film thickness (cm)")
plt.ylabel("Curvature(1/cm)")
print(f'fitted stress = {-p[1]*stressfac} MPa')
plt.legend(loc=0)
My question is: how do I calculate the R-squared value of this slope on my graph? I tried R-squared calculators like sklearn.metrics, but the problem is that I am restricting my x values to get the slope over a window, and all of the code I tried raises the error "expected x and y to have same length". I would attach the CSV file, but it seems there is no such option here. Thanks a lot for the help!
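One minimal sketch of how this could be done (not from the original post; it assumes the variables hfilm, yhat, p, h0, and h1 from the code above): evaluate the fitted line only on the windowed x values and score it against the filtered y values from the same window, so both arrays have the same length.
from sklearn.metrics import r2_score
mask = (hfilm > h0) & (hfilm < h1)   # same window as used in fit_slope
x_win = hfilm[mask]
y_win = yhat[mask]                   # filtered curvature inside the window
y_fit = p[0] + p[1] * x_win          # fitted line evaluated inside the window
r2 = r2_score(y_win, y_fit)
# equivalently, by hand:
ss_res = np.sum((y_win - y_fit) ** 2)
ss_tot = np.sum((y_win - np.mean(y_win)) ** 2)
print(f'R^2 = {r2:.4f} (sklearn), {1 - ss_res / ss_tot:.4f} (manual)')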

Related

How to segment a gaussian function to equal-volume parts

I'm trying to split a Gaussian-shaped curve into K equal-volume segments with Python, for signal-filtering purposes.
I'm seeking pseudo-code, a general idea, or a library that does this.
Any help will be much appreciated.
Thanks!
For example, in the image below, for K=6 the volumes s1 = s2 = ... = s6:
You need to determine the percentiles of the distribution. You can use the scipy.stats.norm class and its .ppf() method for this.
import numpy as np
import scipy.stats as sps
import matplotlib.pyplot as plt
mu = 25
sigma = 4
splits = 8
# define the normal distribution and PDF
dist = sps.norm(loc=mu, scale=sigma)
x = np.linspace(dist.ppf(.001), dist.ppf(.999))
y = dist.pdf(x)
# calculate PPFs
step = 1 / splits
quantiles = np.arange(step, 1.0 - step / 2, step)
ppfs = dist.ppf(quantiles) # boundaries
# plot results
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(x, y, color='k')
for i, ppf in enumerate(ppfs):
    ax.axvline(ppf, color=f'C{i}', label=f'{quantiles[i]:.3f}: {ppf:.1f}')
ax.legend()
plt.show()
This is based on this answer

Theil-Sen regression with sklearn on a log-log scale

I'm trying to plot my data on a log-log scale, using Theil-Sen regression for the best-fit line. However, when I work out my regression line on a log-log scale, it's parallel to my x=y line, which I don't think is correct.
normal scale for X and y :
log-log scale for X and y :
I found a related solution by chaooder for linear regression on a semi-log scale to be somewhat helpful. So currently, my regression line would go from y = ax + c on a linear scale to y = 10^(a*log(x) + c) on my log-log scale. But in my head, I can't understand how that has a solution, as I cannot calculate a.
Here's the data:
index,x,y
0,0.22,0.26
1,0.39,0.1
2,0.4,0.17
3,0.56,0.41
4,0.57,0.12
5,0.62,0.54
6,0.78,0.99
7,0.79,0.35
8,0.8,0.33
9,0.83,0.91
10,0.95,0.81
11,1.08,0.23
12,1.34,0.11
13,1.34,0.44
14,1.35,0.11
15,1.58,0.24
16,1.66,0.71
17,2.11,0.54
18,2.13,0.42
19,2.19,1.72
20,2.25,2.16
21,2.39,0.95
22,2.4,0.16
23,2.73,0.92
24,2.87,1.1
25,2.96,0.27
26,3.12,1.66
27,3.26,0.06
28,3.28,0.68
29,3.34,0.7
30,3.38,1.14
31,3.39,1.81
32,3.41,0.19
33,3.49,1.4
34,3.52,1.57
35,3.6,0.99
36,3.64,1.28
37,3.65,1.68
38,3.89,1.66
39,3.93,1.64
40,4.01,1.04
41,4.07,0.32
42,4.22,0.68
43,4.52,0.57
44,4.53,0.59
45,4.56,0.7
46,4.6,1.15
47,4.62,1.31
48,4.68,1.09
49,5.03,0.48
50,5.06,0.7
51,5.31,0.62
52,5.41,0.21
53,5.45,2.06
54,6.0,0.72
55,6.06,0.36
56,6.64,1.41
57,6.74,0.59
58,6.96,0.95
59,7.01,1.13
60,7.14,1.56
61,7.14,2.82
62,7.19,1.49
63,7.21,0.88
64,7.23,1.31
65,7.55,0.76
66,7.72,0.5
67,7.75,1.65
68,7.77,1.48
69,7.9,1.8
70,7.95,0.68
71,8.03,1.12
72,8.09,2.61
73,8.86,1.71
74,9.31,0.23
75,9.5,2.35
76,9.62,1.84
77,9.91,0.56
78,9.95,1.67
79,10.4,1.15
80,10.8,0.88
81,11.28,1.8
82,11.31,1.58
83,11.43,1.0
84,12.38,2.83
85,13.38,1.45
86,13.9,1.99
87,30.3,1.99
And my current code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
from sklearn.linear_model import TheilSenRegressor
for log in [True, False]:
    fig, ax = plt.subplots()
    data.plot.scatter(ax=ax, x='x', y='y', loglog=log)
    vmin = np.amin(data[['x', 'y']].min().values) * 0.8
    vmax = np.amax(data[['x', 'y']].max().values) * 1.25
    ax.set_xlim(vmin, vmax)
    ax.set_ylim(vmin, vmax)
    ax.yaxis.set_minor_locator(AutoMinorLocator())
    ax.xaxis.set_minor_locator(AutoMinorLocator())
    # best fit (Theil-Sen) line
    X = data.x.values[:, np.newaxis]
    y = data.y.values
    if log:
        X = np.log10(X)
        y = np.log10(y)
    if len(y) > 0:
        estimator = TheilSenRegressor(fit_intercept=False)  # intentionally set intercept to 0
        estimator.fit(X=X, y=y)
        y0 = y[0]
        x0 = X[0]
        y_pred = estimator.predict(np.array([vmin, vmax]).reshape(2, 1))
        # y_pred = np.power(10, (estimator.predict(X)))
        gradient = (y_pred[1] - y_pred[0]) / (vmax - vmin)
        intercept = y_pred[1] - gradient * vmax
        print(f'gradient: {gradient} \n intercept: {intercept}')
        # Theil-Sen regression line
        ax.plot([vmin, vmax], y_pred, color='red', lw=1, zorder=1, label='Best fit')
    # 1:1 ratio line (black, dashed)
    ax.plot([vmin, vmax], [vmin, vmax], lw=1, color='black', ls='--', alpha=0.6, zorder=1,
            label='1:1 correlation')
    if log:
        ax.set_xscale('log')
        ax.set_yscale('log')
        ax.set_title('log-log scale')
        fig.savefig('TS_regression_loglog.png')
    else:
        ax.set_title('normal scale')
        fig.savefig('TS_regression_normalscale.png')
If you fitted on a log-log scale, the input for prediction needs to be on the log scale, and you need to transform the predictions back before plotting them. These are the lines in question where the scales are not consistent:
y_pred = estimator.predict(np.array([vmin,vmax]).reshape(2,1))
[..]
ax.plot([vmin,vmax],y_pred,color='red',lw=1,zorder=1,label='Best fit')
Defining some of the variables from your code (note that you should get the intercept and gradient from the fit):
vmin = np.amin(data[['x','y']].min().values)*0.8
vmax = np.amax(data[['x','y']].max().values)*1.25
X = data.x.values[:,np.newaxis]
y = data.y.values
With a slight modification to your code:
vmin = np.amin(data[['x','y']].min().values)*0.8
vmax = np.amax(data[['x','y']].max().values)*1.25
X = data.x.values[:,np.newaxis]
y = data.y.values
fig,ax = plt.subplots()
data.plot.scatter(ax=ax,x='x',y='y')
estimator = TheilSenRegressor(fit_intercept=False) # intentionally set intercept to 0
estimator.fit(X=np.log10(X),y=np.log10(y))
gradient = estimator.coef_[0]
intercept = estimator.intercept_
print([gradient,intercept])
y_pred = estimator.predict(np.log10([vmin,vmax]).reshape(2,1))
ax.plot([vmin,vmax],10**(y_pred),color='red',lw=1,zorder=1,label='Best fit')
ax.plot([vmin,vmax],[vmin,vmax],lw=1,color='black',ls='--',alpha=0.6,zorder=1,
label='1:1 correlation')
ax.set_xscale('log')
ax.set_yscale('log')

Asymmetric Gaussian Fit in Python

I'm trying to fit an asymmetric Gaussian to this data: http://ge.tt/99iNaL53 (csv file).
I have tried to use a skewed Gaussian model from lmfit, and also a spline, but I'm not able to get the Gaussian model to fit well and the splines are not what I'm looking for (I don't want the spline to fit the data exactly as shown below, and altering the level of smoothing isn't helping).
Here is code using the above data that produces the plot below. The second figure is an example of what I'm trying to achieve with the goal of reading the rise and decay time from the fit.
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import CubicSpline
from scipy.interpolate import UnivariateSpline
from lmfit.models import SkewedGaussianModel
data = np.loadtxt('data.csv', delimiter=',')
x = data[:,0]
y = data[:,1]
# Skewed Gaussian fit
model = SkewedGaussianModel()
params = model.make_params(amplitude=400, center=3, sigma=7, gamma=1)
result = model.fit(y, params, x=x)
# Cubic Spline
cs = CubicSpline(x, y)
x_range = np.arange(x[0], x[-1], 0.1)
# Univariate Spline
us = UnivariateSpline(x, y, k = 1)
# Univariate Spline (smoothed)
us2 = UnivariateSpline(x, y, k = 5)
plt.scatter(x, y, marker = '^', color = 'k', linewidth = 0.5, s = 10, label = 'data')
plt.plot(x_range, cs(x_range), label = 'Cubic Spline')
plt.plot(x_range, us(x_range), label = 'Univariate Spline, k = 1')
plt.plot(x_range, us2(x_range), label = 'Univariate Spline, k = 5')
plt.plot(x, result.best_fit, color = 'red', label = 'Skewed Gaussian Attempt')
plt.xlabel('x')
plt.ylabel('y')
plt.yscale('log')
plt.ylim(1,500)
plt.legend()
plt.show()
Is there a question here? I don't see one, actually.
That result from lmfit is the best fit to a skewed Gaussian model.
You've chosen to plot the result on a log-scale. That completely changes the view of the quality of the fit or what is not fit well.
It seems like you're expecting a better fit, but not a *too* good one. Well, it looks like your data is not perfectly represented by a single skewed Gaussian, and it seems like you were not expecting it to be. You could try different forms for the model function, say a skewed Lorentzian or something, but your data has that low-x shoulder that definitely does not look like your uncited figure.
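For illustration, a minimal sketch (not from the original answer) of swapping in a different lmfit line shape, here SkewedVoigtModel, while reusing x, y, and plt from the question's code; the model choice is only an example and the starting values come from lmfit's guess():
from lmfit.models import SkewedVoigtModel
model = SkewedVoigtModel()
params = model.guess(y, x=x)        # automatic starting values from the data
result = model.fit(y, params, x=x)
print(result.fit_report())
plt.plot(x, y, 'k^', ms=3, label='data')
plt.plot(x, result.best_fit, 'r-', label='skewed Voigt fit')
plt.legend()
plt.show()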
I wrote something for J. Chem. Ed. [1] that involved fitting asymmetric Gaussian functions to data; you can find the core repo here [2], but below is a snippet showing how I went about fitting a data set where x = data[:,0] and y = data[:,1] to the type of function you're working with:
import numpy as np
from scipy.optimize import leastsq
from scipy.special import erf
initials = [6.5, 13, 1, 0]  # initial guess
def asymGaussian(x, p):
    amp = (p[0] / (p[2] * np.sqrt(2 * np.pi)))
    spread = np.exp((-(x - p[1]) ** 2.0) / (2 * p[2] ** 2.0))
    skew = (1 + erf((p[3] * (x - p[1])) / (p[2] * np.sqrt(2))))
    return amp * spread * skew
def residuals(p, y, x):
    return y - asymGaussian(x, p)
# executes least-squares regression analysis to optimize initial parameters
cnsts = leastsq(
    residuals,
    initials,
    args=(
        data[:, 1],  # y values
        data[:, 0]   # x values
    ))[0]
y = asymGaussian(data[:, 0], cnsts)
Finally, just plot y against data[:,0]. Hope this helps!
[1] https://pubs.acs.org/doi/10.1021/acs.jchemed.9b00818
[2] https://github.com/1mikegrn/pyGC

plot individual peaks after gaussian curve fitting with python-lmfit

From this piece of code I can plot the final fit with out.best_fit. What I would like to do now is plot each of the peaks as an individual Gaussian curve, instead of all of them merged into a single curve.
from pylab import *
from lmfit import minimize, Parameters, report_errors
from lmfit.models import GaussianModel, LinearModel, SkewedGaussianModel
from scipy.interpolate import interp1d
from numpy import *
fit_data = interp1d(x_data, y_data)
mod = LinearModel()
pars = mod.make_params(slope=0.0, intercept=0.0)
pars['slope'].set(vary=False)
pars['intercept'].set(vary=False)
x_peak = [278.35, 334.6, 375]
y_peak = [fit_data(x) for x in x_peak]
i = 0
for x, y in zip(x_peak, y_peak):
    sigma = 1.0
    A = y * sqrt(2.0 * pi) * sigma
    prefix = 'g' + str(i) + '_'
    peak = GaussianModel(prefix=prefix)
    pars.update(peak.make_params(center=x, sigma=1.0, amplitude=A))
    pars[prefix + 'center'].set(min=x - 20.0, max=x + 20.0)
    pars[prefix + 'amplitude'].set(min=0.0)
    mod = mod + peak
    i += 1
out = mod.fit(y_data, pars, x=x_data)
plt.figure(1)
plt.plot(x_data, y_data)
plt.figure(1)
plt.plot(x_data, out.best_fit, '--')
Plot of the global fit:
I think you want to do this after your fit:
components = out.eval_components(x=x_data)
for model_name, model_value in components.items():
    plt.plot(x_data, model_value)
# or more simply, if you prefer:
plt.plot(x_data, components['g0_'])
plt.plot(x_data, components['g1_'])
...
That is, ModelResult.eval_components() for a composite model will return a dictionary with keys that are the prefixes of the component models, and values that are the calculated model for that component.

Random Number from Histogram

Suppose I create a histogram using scipy/numpy, so I have two arrays: one for the bin counts, and one for the bin edges. If I use the histogram to represent a probability distribution function, how can I efficiently generate random numbers from that distribution?
It's probably what np.random.choice does in @Ophion's answer, but you can construct a normalized cumulative distribution function, then choose based on a uniform random number:
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
data = np.random.normal(size=1000)
hist, bins = np.histogram(data, bins=50)
bin_midpoints = bins[:-1] + np.diff(bins)/2
cdf = np.cumsum(hist)
cdf = cdf / cdf[-1]
values = np.random.rand(10000)
value_bins = np.searchsorted(cdf, values)
random_from_cdf = bin_midpoints[value_bins]
plt.subplot(121)
plt.hist(data, 50)
plt.subplot(122)
plt.hist(random_from_cdf, 50)
plt.show()
A 2D case can be done as follows:
data = np.column_stack((np.random.normal(scale=10, size=1000),
np.random.normal(scale=20, size=1000)))
x, y = data.T
hist, x_bins, y_bins = np.histogram2d(x, y, bins=(50, 50))
x_bin_midpoints = x_bins[:-1] + np.diff(x_bins)/2
y_bin_midpoints = y_bins[:-1] + np.diff(y_bins)/2
cdf = np.cumsum(hist.ravel())
cdf = cdf / cdf[-1]
values = np.random.rand(10000)
value_bins = np.searchsorted(cdf, values)
x_idx, y_idx = np.unravel_index(value_bins,
(len(x_bin_midpoints),
len(y_bin_midpoints)))
random_from_cdf = np.column_stack((x_bin_midpoints[x_idx],
y_bin_midpoints[y_idx]))
new_x, new_y = random_from_cdf.T
plt.subplot(121, aspect='equal')
plt.hist2d(x, y, bins=(50, 50))
plt.subplot(122, aspect='equal')
plt.hist2d(new_x, new_y, bins=(50, 50))
plt.show()
@Jaime's solution is great, but you should consider using a KDE (kernel density estimate) of the data instead of the histogram. A great explanation of why it's problematic to do statistics over a histogram, and why you should use a KDE instead, can be found here.
I edited @Jaime's code to show how to use a KDE from scipy. It looks almost the same, but captures the underlying distribution better than the histogram-based generator.
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
def run():
    data = np.random.normal(size=1000)
    hist, bins = np.histogram(data, bins=50)
    x_grid = np.linspace(min(data), max(data), 1000)
    kdepdf = kde(data, x_grid, bandwidth=0.1)
    random_from_kde = generate_rand_from_pdf(kdepdf, x_grid)
    bin_midpoints = bins[:-1] + np.diff(bins) / 2
    random_from_cdf = generate_rand_from_pdf(hist, bin_midpoints)
    plt.subplot(121)
    plt.hist(data, 50, density=True, alpha=0.5, label='hist')
    plt.plot(x_grid, kdepdf, color='r', alpha=0.5, lw=3, label='kde')
    plt.legend()
    plt.subplot(122)
    plt.hist(random_from_cdf, 50, alpha=0.5, label='from hist')
    plt.hist(random_from_kde, 50, alpha=0.5, label='from kde')
    plt.legend()
    plt.show()
def kde(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel Density Estimation with Scipy"""
    kde = gaussian_kde(x, bw_method=bandwidth / x.std(ddof=1), **kwargs)
    return kde.evaluate(x_grid)
def generate_rand_from_pdf(pdf, x_grid):
    cdf = np.cumsum(pdf)
    cdf = cdf / cdf[-1]
    values = np.random.rand(1000)
    value_bins = np.searchsorted(cdf, values)
    random_from_cdf = x_grid[value_bins]
    return random_from_cdf
run()
Perhaps something like this. It uses the histogram counts as weights and chooses bin-edge values based on those weights.
import numpy as np
initial = np.random.rand(1000)
values, indices = np.histogram(initial, bins=20)
values = values.astype(np.float32)
weights = values / np.sum(values)
# Below, 5 is the size of the returned array.
new_random = np.random.choice(indices[1:], 5, p=weights)
print(new_random)
# [ 0.55141614  0.30226256  0.25243184  0.90023117  0.55141614]
I had the same problem as the OP and I would like to share my approach to it.
Following Jaime's answer and Noam Peled's answer, I've built a solution for a 2D problem using a Kernel Density Estimate (KDE).
First, let's generate some random data and then calculate its Probability Density Function (PDF) from the KDE. I will use the example available in SciPy for that.
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
def measure(n):
    "Measurement model, return two coupled measurements."
    m1 = np.random.normal(size=n)
    m2 = np.random.normal(scale=0.5, size=n)
    return m1 + m2, m1 - m2
m1, m2 = measure(2000)
xmin = m1.min()
xmax = m1.max()
ymin = m2.min()
ymax = m2.max()
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([m1, m2])
kernel = stats.gaussian_kde(values)
Z = np.reshape(kernel(positions).T, X.shape)
fig, ax = plt.subplots()
ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
extent=[xmin, xmax, ymin, ymax])
ax.plot(m1, m2, 'k.', markersize=2)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
And the plot is:
Now, we obtain random data from the PDF obtained from the KDE, which is the variable Z.
# Generate the bins for each axis
x_bins = np.linspace(xmin, xmax, Z.shape[0]+1)
y_bins = np.linspace(ymin, ymax, Z.shape[1]+1)
# Find the middle point for each bin
x_bin_midpoints = x_bins[:-1] + np.diff(x_bins)/2
y_bin_midpoints = y_bins[:-1] + np.diff(y_bins)/2
# Calculate the Cumulative Distribution Function(CDF)from the PDF
cdf = np.cumsum(Z.ravel())
cdf = cdf / cdf[-1]  # Normalization
# Create random data
values = np.random.rand(10000)
# Find the data position
value_bins = np.searchsorted(cdf, values)
x_idx, y_idx = np.unravel_index(value_bins,
(len(x_bin_midpoints),
len(y_bin_midpoints)))
# Create the new data
new_data = np.column_stack((x_bin_midpoints[x_idx],
y_bin_midpoints[y_idx]))
new_x, new_y = new_data.T
And we can calculate the KDE from this new data and then plot it.
kernel = stats.gaussian_kde(new_data.T)
new_Z = np.reshape(kernel(positions).T, X.shape)
fig, ax = plt.subplots()
ax.imshow(np.rot90(new_Z), cmap=plt.cm.gist_earth_r,
extent=[xmin, xmax, ymin, ymax])
ax.plot(new_x, new_y, 'k.', markersize=2)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
Here is a solution that returns data points that are uniformly distributed within each bin instead of at the bin center:
import numpy as np
def draw_from_hist(hist, bins, nsamples=100000):
    cumsum = [0] + list(np.cumsum(hist))
    rand = np.random.rand(nsamples) * max(cumsum)
    return [np.interp(x, cumsum, bins) for x in rand]
A few things do not work well in the solutions suggested by @daniel, @arco-bast, et al.
Taking the last example:
def draw_from_hist(hist, bins, nsamples=100000):
    cumsum = [0] + list(np.cumsum(hist))
    rand = np.random.rand(nsamples) * max(cumsum)
    return [np.interp(x, cumsum, bins) for x in rand]
This assumes that at least the first bin has zero content, which may or may not be true. Secondly, it assumes that the value of the PDF sits at the upper bound of each bin, which it doesn't; it's mostly at the centre of the bin.
Here's another solution, done in two parts:
def init_cdf(hist, bins):
    """Initialize CDF from histogram

    Parameters
    ----------
    hist : array-like, float of size N
        Histogram height
    bins : array-like, float of size N+1
        Histogram bin boundaries

    Returns
    -------
    cdf : array-like, float of size N+1
    """
    from numpy import concatenate, diff, cumsum
    # Calculate half bin sizes
    steps = diff(bins) / 2  # Half bin size
    # Calculate slope between bin centres
    slopes = diff(hist) / (steps[:-1] + steps[1:])
    # Find height of end points by linear interpolation
    # - First part is linear interpolation from second over first
    #   point to lowest bin edge
    # - Second part is linear interpolation left neighbor to
    #   right neighbor up to but not including last point
    # - Third part is linear interpolation from second to last point
    #   over last point to highest bin edge
    # Can probably be done more elegantly
    ends = concatenate(([hist[0] - steps[0] * slopes[0]],
                        hist[:-1] + steps[:-1] * slopes,
                        [hist[-1] + steps[-1] * slopes[-1]]))
    # Calculate cumulative sum
    sum = cumsum(ends)
    # Subtract off lower bound and scale by upper bound
    sum -= sum[0]
    sum /= sum[-1]
    # Return the CDF
    return sum
def sample_cdf(cdf, bins, size):
    """Sample a CDF defined at specific points.

    Linear interpolation between defined points.

    Parameters
    ----------
    cdf : array-like, float, size N
        CDF evaluated at all points of bins. First and
        last point of bins are assumed to define the domain
        over which the CDF is normalized.
    bins : array-like, float, size N
        Points where the CDF is evaluated. First and last points
        are assumed to define the end-points of the CDF's domain.
    size : integer, non-zero
        Number of samples to draw

    Returns
    -------
    sample : array-like, float, of size ``size``
        Random sample
    """
    from numpy import interp
    from numpy.random import random
    return interp(random(size), cdf, bins)
# Begin example code
import numpy as np
import matplotlib.pyplot as plt
# initial histogram, coarse binning
hist,bins = np.histogram(np.random.normal(size=1000),np.linspace(-2,2,21))
# Calculate CDF, make sample, and new histogram w/finer binning
cdf = init_cdf(hist,bins)
sample = sample_cdf(cdf,bins,1000)
hist2,bins2 = np.histogram(sample,np.linspace(-3,3,61))
# Calculate bin centres and widths
mx = (bins[1:]+bins[:-1])/2
dx = np.diff(bins)
mx2 = (bins2[1:]+bins2[:-1])/2
dx2 = np.diff(bins2)
# Plot, taking care to show uncertainties and so on
plt.errorbar(mx,hist/dx,np.sqrt(hist)/dx,dx/2,'.',label='original')
plt.errorbar(mx2,hist2/dx2,np.sqrt(hist2)/dx2,dx2/2,'.',label='new')
plt.legend()
Sorry, I don't know how to get this to show up in StackOverflow, so copy'n'paste and run to see the point.
I stumbled upon this question when I was looking for a way to generate a random array based on the distribution of another array. If this were in numpy, I would call it a random_like() function.
Then I realized I had already written a package, Redistributor, which might do this for me, even though it was created with a slightly different motivation (an sklearn transformer capable of transforming data from an arbitrary distribution to an arbitrary known distribution, for machine-learning purposes). Of course I understand that unnecessary dependencies are not desired, but at least knowing about this package might be useful to you someday. The thing the OP asked about is basically done under the hood here.
WARNING: under the hood, everything is done in 1D. The package also implements a multidimensional wrapper, but I have not written this example using it, as I find it to be too niche.
Installation:
pip install git+https://gitlab.com/paloha/redistributor
Implementation:
import numpy as np
import matplotlib.pyplot as plt
def random_like(source, bins=0, seed=None):
    from redistributor import Redistributor
    np.random.seed(seed)
    noise = np.random.uniform(source.min(), source.max(), size=source.shape)
    s = Redistributor(bins=bins, bbox=[source.min(), source.max()]).fit(source.ravel())
    s.cdf, s.ppf = s.source_cdf, s.source_ppf
    r = Redistributor(target=s, bbox=[noise.min(), noise.max()]).fit(noise.ravel())
    return r.transform(noise.ravel()).reshape(noise.shape)
source = np.random.normal(loc=0, scale=1, size=(100,100))
t = random_like(source, bins=80) # More bins more precision (0 = automatic)
# Plotting
plt.figure(figsize=(12,4))
plt.subplot(121); plt.title(f'Distribution of source data, shape: {source.shape}')
plt.hist(source.ravel(), bins=100)
plt.subplot(122); plt.title(f'Distribution of generated data, shape: {t.shape}')
plt.hist(t.ravel(), bins=100); plt.show()
Explanation:
import numpy as np
import matplotlib.pyplot as plt
from redistributor import Redistributor
from sklearn.metrics import mean_squared_error
# We have some source array with "some unknown" distribution (e.g. an image)
# For the sake of example we just generate a random gaussian matrix
source = np.random.normal(loc=0, scale=1, size=(100,100))
plt.figure(figsize=(12,4))
plt.subplot(121); plt.title('Source data'); plt.imshow(source, origin='lower')
plt.subplot(122); plt.title('Source data hist'); plt.hist(source.ravel(), bins=100); plt.show()
# We want to generate a random matrix from the distribution of the source
# So we create a random uniformly distributed array called noise
noise = np.random.uniform(source.min(), source.max(), size=(100,100))
plt.figure(figsize=(12,4))
plt.subplot(121); plt.title('Uniform noise'); plt.imshow(noise, origin='lower')
plt.subplot(122); plt.title('Uniform noise hist'); plt.hist(noise.ravel(), bins=100); plt.show()
# Then we fit (approximate) the source distribution using Redistributor
# This step internally approximates the cdf and ppf functions.
s = Redistributor(bins=200, bbox=[source.min(), source.max()]).fit(source.ravel())
# A little naming workaround to make obj s work as a target distribution
s.cdf = s.source_cdf
s.ppf = s.source_ppf
# Here we create another Redistributor but now we use the fitted Redistributor s as a target
r = Redistributor(target=s, bbox=[noise.min(), noise.max()])
# Here we fit the Redistributor r to the noise array's distribution
r.fit(noise.ravel())
# And finally, we transform the noise into the source's distribution
t = r.transform(noise.ravel()).reshape(noise.shape)
plt.figure(figsize=(12,4))
plt.subplot(121); plt.title('Transformed noise'); plt.imshow(t, origin='lower')
plt.subplot(122); plt.title('Transformed noise hist'); plt.hist(t.ravel(), bins=100); plt.show()
# Computing the difference between the two arrays
print('Mean Squared Error between source and transformed: ', mean_squared_error(source, t))
Mean Squared Error between source and transformed: 2.0574123162302143
