Catastrophic fit with Python curve_fit

I need to fit data (x axis: sigma, y axis: Mbh) with an exponential model. This is my code:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
#define my data
Mbh = np.array([1.8e6,2.5e6,4.5e7,3.7e7,4.4e7,1.5e7,1.4e7,4.1e7, 1.0e9,2.1e8,1.0e8,1.0e8,1.6e7,1.9e8,3.9e7,5.2e8,3.1e8,3.0e8,7.0e7,1.1e8,3.0e9,5.6e7,7.8e7,2.0e9,1.7e8,1.4e7,2.4e8,5.3e8,3.3e8,3.5e6,2.5e9])
sigma = np.array([103,75,160,209,205,151,175,140,230,205,145,206,143,182,130,315,242,225,186,190,375,162,152,385,177,90,234,290,266,67,340])
#define my model to fit
def Mbh02(alpha, sigma, beta):
    return alpha * np.exp(beta * sigma)
#calculate the fit parameter:
#for second model
popt02, pcov02 = curve_fit(Mbh02, sigma, Mbh, p0=[1, 0.058])
print(f'Parameter of the second function : {popt02}')
sigma_plot = [103,75,160,209,205,151,175,140,230,205,145,206,143,182,130,315,242,225,186,190,375,162,152,385,177,90,234,290,266,67,340]
sigma_plot.sort()
sigma_plot = np.array(sigma_plot)
#plot model with data with
plt.figure(figsize=(6,6))
plt.scatter(sigma, Mbh * 1e-9, marker = '+', color ='black', label = 'Data')
plt.plot(sigma_plot , Mbh02(alpha = popt02[0], sigma = sigma_plot, beta = popt02[1]) * 1e-9, color='orange', ls ='-', label ='2. fit')
plt.ylabel(r'$M_{BH}$ in $M_\odot *10^9$ unit', fontsize=16)
plt.xlabel(r'$\sigma$', fontsize=16)
# plt.ylim(-1,10)
plt.title('Plot of the black hole mass $M_{BH}$ \nagainst the velocity dispersion $\sigma$ \nfor different elliptical galaxies', fontsize=18)
plt.grid(True)
plt.legend()
plt.show()
and I get the following parameters:
popt02 = [16.13278858 0.91788691]
which produce a clearly wrong curve when plotted (plot omitted).
If I instead pick the parameters manually and plot them with:
plt.plot(sigma_plot, (1 * np.exp(0.058 * sigma_plot)) * 1e-9, ls='--', label='2. fit manual')
I get a curve that follows the data much better (plot omitted).
What is the problem? Why is curve_fit not working and returning such parameters?

In the curve_fit documentation, it says
Assumes ydata = f(xdata, *params) + eps
So if you change your function definition so that the x data comes first, it will work. (With your original signature, curve_fit was passing the sigma array in as alpha and the two fit parameters as sigma and beta, so it was fitting a different function entirely.)
def Mbh02(sigma, alpha, beta):
    return alpha * np.exp(beta * sigma)
# Rest of code
plt.plot(sigma_plot, Mbh02(sigma_plot, *popt02) * 1e-9, color='orange', ls='-')
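As a quick sanity check, here is a minimal end-to-end sketch of the corrected fit (reusing the Mbh and sigma arrays from the question):
import numpy as np
from scipy.optimize import curve_fit
def Mbh02(sigma, alpha, beta):
    # curve_fit calls f(xdata, *params), so the x data must be the first argument
    return alpha * np.exp(beta * sigma)
# Mbh and sigma are the arrays defined in the question
popt02, pcov02 = curve_fit(Mbh02, sigma, Mbh, p0=[1, 0.058])
perr = np.sqrt(np.diag(pcov02))  # 1-sigma uncertainties on alpha and beta
print(popt02, perr)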

Have you tried fitting log(Mbh) with a linear fit instead of fitting the exponential model directly? This usually gives much more stability.
import numpy as np
import matplotlib.pyplot as plt
Mbh = np.array([1.8e6,2.5e6,4.5e7,3.7e7,4.4e7,1.5e7,1.4e7,4.1e7, 1.0e9,2.1e8,1.0e8,1.0e8,1.6e7,1.9e8,3.9e7,5.2e8,3.1e8,3.0e8,7.0e7,1.1e8,3.0e9,5.6e7,7.8e7,2.0e9,1.7e8,1.4e7,2.4e8,5.3e8,3.3e8,3.5e6,2.5e9])
sigma = np.array([103,75,160,209,205,151,175,140,230,205,145,206,143,182,130,315,242,225,186,190,375,162,152,385,177,90,234,290,266,67,340])
plt.figure(2)
plt.plot(sigma,Mbh,'.')
lnMbh= np.log(Mbh)
p = np.polyfit(sigma,lnMbh,1)
plt.plot(sigma, np.exp(np.polyval(p,sigma)),'*')
beta = p[0]           # slope in log space
alpha = np.exp(p[1])  # the intercept is ln(alpha), since ln(Mbh) = ln(alpha) + beta*sigma
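The linearized estimates also make a good starting point if you still want the nonlinear fit; a minimal sketch, assuming the corrected Mbh02 signature from the answer above:
from scipy.optimize import curve_fit
def Mbh02(sigma, alpha, beta):
    return alpha * np.exp(beta * sigma)
# seed the nonlinear fit with the log-space estimates
popt, pcov = curve_fit(Mbh02, sigma, Mbh, p0=[alpha, beta])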

Related

How to calculate FWHM for a Gaussian curve fitted over a histogram?

I have a histogram with a fitted Gaussian curve, and I'd like to calculate the full width at half maximum (FWHM) for this curve. The data used in this code is a single column from a dataframe. I've included a link to an image of my plot. I'm new to Python and have no idea how to do this.
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
def gaussian(n, mean, amplitude, standard_deviation):
    return amplitude * np.exp(-(n - mean)**2 / (2 * standard_deviation**2))
n = df_OI_CMC['Area_1_Micrometers']
#Plot Histogram 1
bin_heights, bin_borders, _ = plt.hist(
    n,
    bins=(0, 1, 5, 10, 25, 50, 75, 100, 125, 150, 200, 250, 500, 750, 1000, 2500, 5000, 7500, 10000),
    label='histogram', edgecolor='white')
bin_widths = np.diff(bin_borders)
bin_centers = bin_borders[:-1] + np.diff(bin_borders) / 2
#Generate enough x values to make the curves look smooth
n_interval_for_fit = np.linspace(bin_borders[0], bin_borders[-1], 10000)
#CurveFit to Histogram
popt, _ = curve_fit(gaussian, bin_centers, bin_heights,
                    p0=[-44.0543433, 1480.64682738, 68.86641026])
plt.rcParams["figure.figsize"] = [12,12]
plt.plot(n_interval_for_fit, gaussian(n_interval_for_fit, *popt), label='fit')
plt.ylim([0, 1500])
plt.xlim([-10,1000])
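For a Gaussian, the FWHM follows directly from the fitted standard deviation: FWHM = 2*sqrt(2*ln(2))*sigma ≈ 2.355*sigma. So once curve_fit has returned popt, a minimal sketch (using the parameter order of the gaussian defined above):
mean, amplitude, standard_deviation = popt
fwhm = 2 * np.sqrt(2 * np.log(2)) * abs(standard_deviation)
print(f'FWHM = {fwhm:.2f} (in the units of the x axis)')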

Python multivariable nonlinear regression calculation

I was trying to curve-fit my data to find a nonlinear regression equation.
That's what my plot looks like (image omitted). I have x, y data, which will be my reference data; then I have x0 and y0, which will be my second point, and dx and dy will be the differences between them. When I show this as vectors it has the form shown (image omitted), and when I convert dx, dy to R and theta it shows an x^2 + y^2 form. Is it possible to find those equations from this?
Here's my current code:
import matplotlib.pyplot as plt
import numpy as np
import math
import seaborn as sns
from statsmodels.formula.api import ols
from mpl_toolkits.mplot3d import Axes3D
from scipy.interpolate import griddata
"""setting dpi for graph shown in editor"""
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300
import pandas as pd
"""reading data from excel sheet 1"""
df = pd.read_excel(r'C:\Users\JRKIM\Desktop\data\2513data.xlsx')
"""variable selection"""
tx_0 = df.loc[:,'TRUE_x_0']
ty_0 = df.loc[:,'TRUE_y_0']
v_x_0 = df.loc[:,'vx']
v_y_0 = df.loc[:,'vy']
dx0_0 = tx_0-v_x_0
dy0_0 = ty_0-v_y_0
dr0_0 = df.loc[:,'dr']
fig1, ax0 = plt.subplots()
ax0.set_title("delta0 in vector")
qk = ax0.quiver(tx_0,ty_0,dx0_0,dy0_0)
ax0.scatter(tx_0, ty_0, color='r', s=1)
"""3d graph with vector and position """
fig4 = plt.figure()
ax4 = fig4.add_subplot(111, projection='3d')
ax4.scatter(tx_0, ty_0, dr0_0, marker='*',linewidth = 0.01, cmap="jet")
ax4.set_xlabel('X Label')
ax4.set_ylabel('Y Label')
ax4.set_zlabel('dr Label')
You can use scipy.optimize.curve_fit as follows.
from scipy.optimize import curve_fit
def quadratic_function(x, a, b):
    return a * x[0]**2 + b * x[1]**2  # + c * x[0]*x[1] if you want the full quadratic form
xdata = np.vstack([np.array(dx0_0).flatten(),np.array(dy0_0).flatten()])
ydata = np.array(dr0_0).flatten()
popt, pcov = curve_fit(quadratic_function,xdata,ydata)
print(popt) # values for a and b
This should get you the coefficients a and b for the least-squares fit of a*x**2 + b*y**2 = r. You still have to compute the fit error and check whether it is low enough for your purposes; see the sketch after the edit note below.
Edit: corrected the dimensions of inputs
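To quantify that error, one option is the root-mean-square of the residuals; a minimal sketch, reusing xdata, ydata, and popt from above:
residuals = ydata - quadratic_function(xdata, *popt)
rmse = np.sqrt(np.mean(residuals**2))
print(f'RMSE: {rmse:.4g}')  # compare against the typical scale of ydata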

Rayleigh distribution Curve_fit on python

I'm currently working on a lab report for Brownian Motion using this PDF equation with the intent of evaluating D:
[image: Brownian PDF equation]
And I am trying to curve_fit it to a histogram. However, whenever I plot my curve_fits, it's a line and does not appear correctly on the histogram.
[image: example histogram with a bad curve_fit]
And here is my code:
import numpy as np
import matplotlib.pyplot as plt
from scipy import optimize
# Variables
eta = 1e-3
ra = 0.95e-6
T = 296.5
t = 0.5
# Random data
r = np.array(np.random.rayleigh(0.5e-6, 500))
# Histogram
plt.hist(r, bins=10, density=True, label='Counts')
# Curve fit
x,y = np.histogram(r, bins=10, density=True)
x = x[2:]
y = y[2:]
bin_width = y[1] - y[2]
print(bin_width)
bin_centers = (y[1:] + y[:-1])/2
err = x*0 + 0.03
def f(r, a):
    return (((1e-6) * 3 * np.pi * r * eta * ra) / (a * T * t)) * np.exp(((-3 * (1e-6 * r)**2) * eta * ra * np.pi) / (a * T * t))
print(x) # these are flipped for some reason
print(y)
plt.plot(bin_centers, x, label='Fitting this', color='red')
popt, pcov = optimize.curve_fit(f, bin_centers, x, p0 = (1.38e-23), sigma=err, maxfev=1000)
plt.plot(y, f(y, popt), label='PDF', color='orange')
print(popt)
plt.title('Distance vs Counts')
plt.ylabel('Counts')
plt.xlabel('Distance in micrometers')
plt.legend()
Is the issue with my curve_fit? Or is there an underlying issue I'm missing?
EDIT: I broke down D to get the Boltzmann constant as a in the function, which is why there are more numbers in f than in the equation above (the expressions for D and Gamma are in the linked images, omitted here).
I've tried messing with the initial conditions, and plotting the function with 1.38e-23 instead of popt produces the purple line shown (plot omitted). This tells me something is wrong with the equation for f, but no issues jump out at me when I look at it. Am I missing something?
EDIT 2: I changed the function to this to simplify it and match the numpy.random.rayleigh() distribution:
def f(r, a):
    return (r / a) * np.exp((-1 * r**2) / (2 * a))
But this doesn't resolve the issue that the curve_fit is a line with a positive slope instead of anything remotely what I'm interested in. Now I am more confused as to what the issue is.
There are a few things here. I don't think x and y were ever flipped; at least when I assumed they weren't, everything seemed to work fine. I also cleaned up a few parts of the code: for example, I'm not sure why you build two different histograms, and I think there may have been problems handling the single-element tuple of parameters. Also, for curve fitting, the initial parameter guess often needs to be in the ballpark, so I changed that too.
Here's a version that works for me:
import numpy as np
import matplotlib.pyplot as plt
from scipy import optimize
# Random data
r = np.array(np.random.rayleigh(0.5e-6, 500))
# Histogram
hist_values, bin_edges, patches = plt.hist(r, bins=10, density=True, label='Counts')
bin_centers = (bin_edges[1:] + bin_edges[:-1])/2
x = bin_centers[2:] # not necessary, and I'm not sure why the OP did this, but I'm doing this here because OP does
y = hist_values[2:]
def f(r, a):
    return (r / (a * a)) * np.exp((-1 * r**2) / (2 * a * a))
plt.plot(x, y, label='Fitting this', color='red')
err = x*0 + 0.03
popt, pcov = optimize.curve_fit(f, x, y, p0 = (1.38e-6,), sigma=err, maxfev=1000)
plt.plot(x, f(x, *popt), label='PDF', color='orange')
plt.title('Distance vs Counts')
plt.ylabel('Counts')
plt.xlabel('Distance in Meters') # Motion seems to be in micron range, but calculation and plot has been done in meters
plt.legend()
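Since r was drawn with np.random.rayleigh(0.5e-6, 500) and f is exactly the Rayleigh PDF with scale a, a quick way to validate the fit is to check that the recovered scale lands near 5e-7:
print(popt)                    # expect roughly [5e-07]
print(np.sqrt(np.diag(pcov)))  # 1-sigma uncertainty on the scale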

How to specify the prior for scikit-learn's Gaussian process regression?

As mentioned here, scikit-learn's Gaussian process regression (GPR) permits "prediction without prior fitting (based on the GP prior)". But I have an idea for what my prior should be (i.e. it should not simply have a mean of zero but perhaps my output, y, scales linearly with my input, X, i.e. y = X). How could I encode this information into GPR?
Below is a working example, but it assumes a zero mean for my prior. I read that "The GaussianProcessRegressor does not allow for the specification of the mean function, always assuming it to be the zero function, highlighting the diminished role of the mean function in calculating the posterior." I believe this is the motivation behind custom kernels (e.g. heteroscedastic) with variable scales at different X, although I'm still trying to better understand what capability they provide. Are there ways to get around the zero-mean prior so that an arbitrary prior can be specified in scikit-learn?
import numpy as np
from matplotlib import pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
def f(x):
    """The function to predict."""
    return 1.5*(1. - np.tanh(100.*(x - 0.96))) + 1.5*x*(x - 0.95) + 0.4 + 1.5*(1. - x)*np.random.random(x.shape)
# Instantiate a Gaussian Process model
kernel = C(10.0, (1e-5, 1e5)) * RBF(10.0, (1e-5, 1e5))
X = np.array([0.803,0.827,0.861,0.875,0.892,0.905,
0.91,0.92,0.925,0.935,0.941,0.947,0.96,
0.974,0.985,0.995,1.0])
X = np.atleast_2d(X).T
# Observations and noise
y = f(X).ravel()
noise = np.linspace(0.4,0.3,len(X))
y += noise
# Instantiate a Gaussian Process model
gp = GaussianProcessRegressor(kernel=kernel, alpha=noise ** 2,
n_restarts_optimizer=10)
# Fit to data using Maximum Likelihood Estimation of the parameters
gp.fit(X, y)
# Make the prediction on the meshed x-axis (ask for MSE as well)
x = np.atleast_2d(np.linspace(0.8, 1.02, 1000)).T
y_pred, sigma = gp.predict(x, return_std=True)
plt.figure()
plt.errorbar(X.ravel(), y, noise, fmt='k.', markersize=10, label=u'Observations')
plt.plot(x, y_pred, 'k-', label=u'Prediction')
plt.fill(np.concatenate([x, x[::-1]]),
np.concatenate([y_pred - 1.9600 * sigma,
(y_pred + 1.9600 * sigma)[::-1]]),
alpha=.1, fc='k', ec='None', label='95% confidence interval')
plt.xlabel('x')
plt.ylabel('y')
plt.xlim(0.8, 1.02)
plt.ylim(0, 5)
plt.legend(loc='lower left')
plt.show()
Here is an example of how to use a prior mean function with the sklearn GPR model: subtract the prior mean from the training targets, fit the GP on the residuals, and add the prior mean back to the predictions (see the generalized sketch after the code).
import numpy as np
from matplotlib import pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel
A=np.linspace(5,25,num=100)
# prior mean function
prior_beta=12-0.3*A
# true function
true_beta=20-0.7*A
np.random.seed(44)  # np.random.seed returns None, so there is nothing to assign
# Training data
size=15
ind=np.random.randint(0,100,size=size)
# generate the posterior variance (noisy samples)
var_=np.random.uniform(0.1,10.0,size=size)
A_=A[ind][:, np.newaxis]
beta_=true_beta[ind]-prior_beta[ind]
beta_1=true_beta[ind]
plt.figure()
kernel = ConstantKernel(4) * RBF(length_scale=2, length_scale_bounds=(1e-3, 1e2))
gp = GaussianProcessRegressor(kernel=kernel,
alpha=var_,optimizer=None).fit(A_, beta_)
X_ = np.linspace(5, 25, 100)
y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
# Now you add the prior mean function back
y_mean=y_mean+12-0.3*X_
plt.plot(X_, y_mean, 'k', lw=3, zorder=9, label='predicted')
plt.fill_between(X_, y_mean - 3*np.sqrt(np.diag(y_cov)),
y_mean + 3*np.sqrt(np.diag(y_cov)),
alpha=0.5, color='k', label='+-3sigma')
plt.plot(A,true_beta, 'r', lw=3, zorder=9,label='truth')
plt.plot(A,prior_beta, 'blue', lw=3, zorder=9,label='prior')
plt.errorbar(A_[:,0], beta_1, yerr=3*np.sqrt(var_), fmt='x',ecolor='g',marker='s',
mfc='g', ms=10,capsize=6,label='training set')
plt.title("Initial: %s\n"% (kernel))
plt.legend()
plt.show()
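The same subtract-and-add-back pattern works for any parametric prior mean m(X): train the GP on y - m(X), predict, then add m(X_test) to the result. A minimal sketch with hypothetical helper names, assuming the same sklearn objects as above:
def fit_gp_with_mean(gp, X_train, y_train, mean_fn):
    # fit the GP to the residuals about the prior mean
    return gp.fit(X_train, y_train - mean_fn(X_train))
def predict_gp_with_mean(gp, X_test, mean_fn):
    y_res, y_std = gp.predict(X_test, return_std=True)
    # add the prior mean back; a fixed mean does not change the predictive std
    return y_res + mean_fn(X_test), y_std
# usage with the example above:
mean_fn = lambda X: 12 - 0.3 * X.ravel()
gp = fit_gp_with_mean(gp, A_, beta_1, mean_fn)
y_mean, y_std = predict_gp_with_mean(gp, X_[:, np.newaxis], mean_fn)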

How can I find the right gaussian curve given some data?

I have code that draws from a gaussian in 1D:
import numpy as np
from scipy.stats import norm
from scipy.optimize import curve_fit
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import gauss
# Beginning in one dimension:
mean = 0; Var = 1; N = 1000
scatter = np.random.normal(mean,np.sqrt(Var),N)
scatter = np.sort(scatter)
mu,sigma = norm.fit(scatter)
I obtain mu and sigma using norm.fit()
Now I'd like to obtain my parameters using
xdata = np.linspace(-5,5,N)
pop, pcov = curve_fit(gauss.gauss_1d,xdata,scatter)
The problem is I don't know how to map my scattered points (drawn from a 1D gaussian) to the x-line in order to use curve_fit.
Also, suppose I simply use mu and sigma from earlier.
I plot using:
n, bins, patches = plt.hist(scatter,50,facecolor='green')
y = 2*max(n)*mlab.normpdf(bins,mu,sigma)
l = plt.plot(bins,y,'r--')
plt.xlabel('x-coord')
plt.ylabel('Occurrences')
plt.grid(True)
plt.show()
But I have to guess the amplitude as 2*max(n). It works but it's not robust. How can I find the amplitude without guessing?
To avoid guessing the amplitude, call hist() with normed=True; then the amplitude corresponds to normpdf(). (In current Matplotlib the keyword is density=True; see the sketch below.)
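With current Matplotlib and SciPy (normed and mlab.normpdf have been removed), a minimal sketch of the same idea, reusing scatter, mu, and sigma from the question:
n, bins, patches = plt.hist(scatter, 50, density=True, facecolor='green')
# both the histogram and norm.pdf are normalized densities, so no amplitude guess is needed
plt.plot(bins, norm.pdf(bins, mu, sigma), 'r--')
plt.show()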
For doing a curve fit, I suggest to use not the density but the cumulative distribution: Each sample has a height of 1/N, which successively sum up to 1. This has the advantage that you don't need to group samples in bins.
import numpy as np
from scipy.stats import norm
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
# Beginning in one dimension:
mean = 0; Var = 1; N = 100
scatter = np.random.normal(mean,np.sqrt(Var),N)
scatter = np.sort(scatter)
mu1,sigma1 = norm.fit(scatter) # classical fit
scat_sum = np.cumsum(np.ones(scatter.shape))/N # cumulative samples
[mu2,sigma2],Cx = curve_fit(norm.cdf, scatter, scat_sum, p0=[0,1]) # curve fit
print(u"norm.fit(): µ1= {:+.4f}, σ1={:.4f}".format(mu1, sigma1))
print(u"curve_fit(): µ2= {:+.4f}, σ2={:.4f}".format(mu2, sigma2))
fg = plt.figure(1); fg.clf()
ax = fg.add_subplot(1, 1, 1)
t = np.linspace(-4,4, 1000)
ax.plot(t, norm.cdf(t, mu1, sigma1), alpha=.5, label="norm.fit()")
ax.plot(t, norm.cdf(t, mu2, sigma2), alpha=.5, label="curve_fit()")
ax.step(scatter, scat_sum, 'x-', where='post', alpha=.5, label="Samples")
ax.legend(loc="best")
ax.grid(True)
ax.set_xlabel("$x$")
ax.set_ylabel("Cumulative Probability Density")
ax.set_title("Fit to Normal Distribution")
fg.canvas.draw()
plt.show()
prints
norm.fit(): µ1= +0.1534, σ1=1.0203
curve_fit(): µ2= +0.1135, σ2=1.0444
and plots the two fitted CDFs together with the cumulative sample steps (plot omitted).
