I recently started with Python because I have an enormous amount of data where I want to automatically fit a Gaussian to the peaks in spectra. Below is an example of three peaks that I want to fit with three individual peaks.
I have found a question where someone is looking for something very similar, How can I fit multiple Gaussian curved to mass spectrometry data in Python?, and adopted it to my script.
I have added my code at the bottom and when I run the last section I get the error "RuntimeError: Optimal parameters not found: Number of calls to function has reached maxfev = 800." What am I missing?
The data can be downloaded at https://www.dropbox.com/s/zowawljcjco70yh/data_so.h5?dl=0
#%%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
from scipy.sparse.linalg import spsolve
from scipy.optimize import curve_fit
#%% Read data
path = 'D:/Python/data_so.h5'
f = pd.read_hdf(path, mode = 'r')
t = f.loc[:, 'Time stamp']
d = f.drop(['Time stamp', 'Name spectrum'], axis = 1)
#%% Extract desired wavenumber range
wn_st=2000
wn_ed=2500
ix_st=np.argmin(abs(d.columns.values-wn_st))
ix_ed=np.argmin(abs(d.columns.values-wn_ed))
d = d.iloc[:, ix_st:ix_ed+1]
#%% AsLS baseline correction
spectrum = 230
y = d.iloc[spectrum]
niter = 10
lam = 200000
p = 0.005
L = len(y)
D = sparse.diags([1,-2,1],[0,-1,-2], shape=(L,L-2))
w = np.ones(L)
for i in range(niter):
W = sparse.spdiags(w, 0, L, L)
Z = W + lam * D.dot(D.transpose())
z = spsolve(Z, w*y)
w = p * (y > z) + (1-p) * (y < z)
corr = d.iloc[spectrum,:] - z
#%% Plot spectrum, baseline and corrected spectrum
plt.clf()
plt.plot(d.columns, d.iloc[spectrum,:])
plt.plot(d.columns, z)
plt.plot(d.columns, corr)
plt.gca().invert_xaxis()
plt.show()
#%%
x = d.columns.values
def gauss(x, a, mu, sig):
return a*np.exp(-(x.astype(float)-mu)**2/(2*sig**2))
fitx = x[(x>2232)*(x<2252)]
fity = y[(x>2232)*(x<2252)]
mu=np.sum(fitx*fity)/np.sum(fity)
sig=np.sqrt(np.sum(fity*(fitx-mu)**2)/np.sum(fity))
popt, pcov = curve_fit(gauss, fitx, fity, p0=[max(fity),mu, sig])
plt.plot(x, gauss(x, popt[0],popt[1],popt[2]), 'r-', label='fit')
Related
I have an ODE which I need to solve and plot. This is my code so far:
import numpy as np
import math
from scipy.integrate import odeint
import matplotlib.pyplot as plt
import math
from scipy import integrate
##############################
# Constants
#α = 1,β = 12,μ = 0.338,Vb = 0.01,ω = 0.5,
alpha = 1#p1
beta = 12#p2
mu = 0.338#p3
omega = 0.5#p5
gamma = 0.25#p6
Acoef=0.5
tmax = 1000
t = np.arange(0.0, tmax, 0.001)
################################
# Initial conditions vector
X0=0
I0=0
Z0=0.06
H0=0.04
E0=0
O0=0.02
# The model differential equations.
def deriv(y,t):
X, I, Z, H, E, O = y
dXdt = I
dIdt = -(alpha)*(X)-(beta)*(X**3)-(mu)*I+gamma*(1/(1-X)**2)-gamma*(1/(1+X)**2)+Acoef*(1/(1-X)**2)*(np.sin(omega*t))
dZdt = H
dHdt =-(alpha)*(Z)-(beta)*(Z**3)-(mu)*I+gamma*(1/(1-Z)**2)-gamma*(1/(1+Z)**2)+Acoef*(1/(1-Z)**2)*(np.sin(omega*t))
dEdt=O
dOdt=-(alpha)*(X)-(beta)*(X**3)-(mu)*I+gamma*(1/(1-X)**2)-gamma*(1/(1+X)**2)+Acoef*(1/(1-X)**2)*(np.sin(omega*t))-(-(alpha)*(Z)-(beta)*(Z**3)-(mu)*I+gamma*(1/(1-Z)**2)-gamma*(1/(1+Z)**2)+Acoef*(1/(1-Z)**2)*(np.sin(omega*t)))
return dXdt, dIdt, dZdt, dHdt, dEdt,dOdt
# Initial conditions vector
y0 = X0,I0,Z0,H0,E0,O0
# Integrate the SIR equations over the time grid, t.
ret = odeint(deriv, y0, t)
X, I, Z, H, E, O = ret.T
# Plot the data on three separate curves for S(t), I(t) and R(t)
plt.xlim([-10, 10])
plt.plot(ret[:,4], ret[:,5])
plt.show()
e1=ret[:,4]
e2=ret[:,5]
Unfortunately, it produces the following warning:
File "C:\Users\user\AppData\Local\Programs\Python\Python37\lib\site-packages\scipy\integrate\odepack.py", line 247
warnings.warn(warning_msg, ODEintWarning)
ODEintWarning: Excess work done on this call (perhaps wrong Dfun type). Run with full_output = 1 to get quantitative information.
And the result is not as expected. I cannot here post graphical result probably.
Does anyone know how I can fix it, please?
The problem is to fitter on all my wavelength peaks a Gaussian in order to make a medium adjustment as accurate as possible
My question is how to make the Gaussian adjustment on all my peaks automatically without having to manually specify the coordinates of the peaks
For that, I realized the Gaussian adjustment of the brightest peaks, but I would like to generalize it to the following peaks. Subsequently, the Gaussian adjustment will allow me to obtain a polynomial adjustment fine enough to stagger pixels in wavelength
import numpy as np
from astropy.io import fits
import matplotlib.pyplot as plt
from scipy import interpolate
from tqdm import tqdm
from scipy import ndimage
import peakutils
from scipy.optimize import curve_fit
def gauss(x, x0, amp, wid):
return amp * np.exp( -((x - x0)/wid)**2)
def multi_gauss(x, *params):
y = np.zeros_like(x)
for i in range(0, len(params), 3):
x0, amp, wid = params[i:i+3]
y = y + gauss(x, x0, amp, wid)
return y
neon = fits.getdata(data_directory + wave_filename + '.fits')
neon_sp = np.mean(neon, axis= 0)
n_pix = len(neon_sp)
peaks_index = peakutils.peak.indexes(neon_sp, thres=0.05, min_dist=2)
### peals around the brightest peak
bright_index = peaks_index[np.argmax(neon_sp[peaks_index])]
delta_pix = 20
ind_min = bright_index - delta_pix
ind_max = bright_index + delta_pix
peak_select = peaks_index[np.where((peaks_index > ind_min) & (peaks_index < ind_max))]
peak_select_sort = peak_select[np.argsort(-neon_sp[peak_select])]
if peak_select_sort[1] > peak_select_sort[0] :
ind_max = bright_index + 40
else :
ind_min = bright_index - 40
peak_select = peaks_index[np.where((peaks_index > ind_min) & (peaks_index < ind_max))]
peak_select_sort = peak_select[np.argsort(-neon_sp[peak_select])]
plt.figure(num=0)
plt.clf()
plt.plot(neon_sp)
plt.plot(peaks_index,neon_sp[peaks_index], 'r+')
plt.plot(peak_select,neon_sp[peak_select], 'ro')
### Gaussian fit
x = np.arange(n_pix)
xx = np.arange(0, n_pix, .1)
n_peak = 4
bright_index_fit = np.zeros(n_peak)
for i in range(n_peak):
p = peak_select_sort[i]
guess = [p, neon_sp[p], .5]
popt, pcov = curve_fit(gauss, x, neon_sp, p0=guess)
fit = gauss(xx, *popt)
bright_index_fit[i] = popt[0]
plt.plot(xx,fit, '--')
bright_wave = [703.2, 724.5, 693.0, 743.9]
Following the recommendations in this answer I have used several combination of values for beta0, and as shown here, the values from polyfit.
This example is UPDATED in order to show the effect of relative scales of values of X versus Y (X range is 0.1 to 100 times Y):
from random import random, seed
from scipy import polyfit
from scipy import odr
import numpy as np
from matplotlib import pyplot as plt
seed(1)
X = np.array([random() for i in range(1000)])
Y = np.array([i + random()**2 for i in range(1000)])
for num in range(1, 5):
plt.subplot(2, 2, num)
plt.title('X range is %.1f times Y' % (float(100 / max(X))))
X *= 10
z = np.polyfit(X, Y, 1)
plt.plot(X, Y, 'k.', alpha=0.1)
# Fit using odr
def f(B, X):
return B[0]*X + B[1]
linear = odr.Model(f)
mydata = odr.RealData(X, Y)
myodr = odr.ODR(mydata, linear, beta0=z)
myodr.set_job(fit_type=0)
myoutput = myodr.run()
a, b = myoutput.beta
sa, sb = myoutput.sd_beta
xp = np.linspace(plt.xlim()[0], plt.xlim()[1], 1000)
yp = a*xp+b
plt.plot(xp, yp, label='ODR')
yp2 = z[0]*xp+z[1]
plt.plot(xp, yp2, label='polyfit')
plt.legend()
plt.ylim(-1000, 2000)
plt.show()
It seems that no combination of beta0 helps... The only way to get polyfit and ODR fit similar is to swap X and Y, OR as shown here to increase the range of values of X with regard to Y, still not really a solution :)
=== EDIT ===
I do not want ODR to be the same as polyfit. I am showing polyfit just to emphasize that the ODR fit is wrong and it is not a problem of the data.
=== SOLUTION ===
thanks to #norok2 answer when Y range is 0.001 to 100000 times X:
from random import random, seed
from scipy import polyfit
from scipy import odr
import numpy as np
from matplotlib import pyplot as plt
seed(1)
X = np.array([random() / 1000 for i in range(1000)])
Y = np.array([i + random()**2 for i in range(1000)])
plt.figure(figsize=(12, 12))
for num in range(1, 10):
plt.subplot(3, 3, num)
plt.title('Y range is %.1f times X' % (float(100 / max(X))))
X *= 10
z = np.polyfit(X, Y, 1)
plt.plot(X, Y, 'k.', alpha=0.1)
# Fit using odr
def f(B, X):
return B[0]*X + B[1]
linear = odr.Model(f)
mydata = odr.RealData(X, Y,
sy=min(1/np.var(Y), 1/np.var(X))) # here the trick!! :)
myodr = odr.ODR(mydata, linear, beta0=z)
myodr.set_job(fit_type=0)
myoutput = myodr.run()
a, b = myoutput.beta
sa, sb = myoutput.sd_beta
xp = np.linspace(plt.xlim()[0], plt.xlim()[1], 1000)
yp = a*xp+b
plt.plot(xp, yp, label='ODR')
yp2 = z[0]*xp+z[1]
plt.plot(xp, yp2, label='polyfit')
plt.legend()
plt.ylim(-1000, 2000)
plt.show()
The key difference between polyfit() and the Orthogonal Distance Regression (ODR) fit is that polyfit works under the assumption that the error on x is negligible. If this assumption is violated, like it is in your data, you cannot expect the two methods to produce similar results.
In particular, ODR() is very sensitive to the errors you specify.
If you do not specify any error/weighting, it will assign a value of 1 for both x and y, meaning that any scale difference between x and y will affect the results (the so-called numerical conditioning).
On the contrary, polyfit(), before computing the fit, applies some sort of pre-whitening to the data (see around line 577 of its source code) for better numerical conditioning.
Therefore, if you want ODR() to match polyfit(), you could simply fine-tune the error on Y to change your numerical conditioning.
I tested that this works for any numerical conditioning between 1e-10 and 1e10 of your Y (it is / 10. or 1e-1 in your example).
mydata = odr.RealData(X, Y)
# equivalent to: odr.RealData(X, Y, sx=1, sy=1)
to:
mydata = odr.RealData(X, Y, sx=1, sy=1/np.var(Y))
(EDIT: note there was a typo on the line above)
I tested that this works for any numerical conditioning between 1e-10 and 1e10 of your Y (it is / 10. or 1e-1 in your example).
Note that this would only make sense for well-conditioned fits.
I cannot format source code in a comment, and so place it here. This code uses ODR to calculate fit statistics, note the line that has "parameter order for odr" such that I use a wrapper function for the ODR call to my "actual" function.
from scipy.optimize import curve_fit
import numpy as np
import scipy.odr
import scipy.stats
x = np.array([5.357, 5.797, 5.936, 6.161, 6.697, 6.731, 6.775, 8.442, 9.861])
y = np.array([0.376, 0.874, 1.049, 1.327, 2.054, 2.077, 2.138, 4.744, 7.104])
def f(x,b0,b1):
return b0 + (b1 * x)
def f_wrapper_for_odr(beta, x): # parameter order for odr
return f(x, *beta)
parameters, cov= curve_fit(f, x, y)
model = scipy.odr.odrpack.Model(f_wrapper_for_odr)
data = scipy.odr.odrpack.Data(x,y)
myodr = scipy.odr.odrpack.ODR(data, model, beta0=parameters, maxit=0)
myodr.set_job(fit_type=2)
parameterStatistics = myodr.run()
df_e = len(x) - len(parameters) # degrees of freedom, error
cov_beta = parameterStatistics.cov_beta # parameter covariance matrix from ODR
sd_beta = parameterStatistics.sd_beta * parameterStatistics.sd_beta
ci = []
t_df = scipy.stats.t.ppf(0.975, df_e)
ci = []
for i in range(len(parameters)):
ci.append([parameters[i] - t_df * parameterStatistics.sd_beta[i], parameters[i] + t_df * parameterStatistics.sd_beta[i]])
tstat_beta = parameters / parameterStatistics.sd_beta # coeff t-statistics
pstat_beta = (1.0 - scipy.stats.t.cdf(np.abs(tstat_beta), df_e)) * 2.0 # coef. p-values
for i in range(len(parameters)):
print('parameter:', parameters[i])
print(' conf interval:', ci[i][0], ci[i][1])
print(' tstat:', tstat_beta[i])
print(' pstat:', pstat_beta[i])
print()
I need to count the number of particle under the fitted Gaussian curve. The area of the fitted curve can be found by integrating the function within the limit (mean-3*sigma) to (mean+3*sigma). Would you please help me to solve this. Thanks for your kind consideration.
import pylab as py
import numpy as np
from scipy import optimize
from scipy.stats import stats
import matplotlib.pyplot as plt
import pandas as pd
BackPFT='T067.csv'
df_180 = pd.read_csv(BackPFT, error_bad_lines=False, header=1)
x_180=df_180.iloc[:,3]
y_180=df_180.iloc[:,4]
#want to plot the distribution of s calculated by the following equation
s=np.sqrt((((16*x_180**2*38.22**2)/((4*38.22**2-y_180**2)**2))+1))-1
#Shape of this distribution is Gaussian
#I need to fit this distribution by following parameter
mean=0.433
sigma=0.014
draw=s
#Definition of bin number
bi=np.linspace(0.01,8, 1000)
data = py.hist(draw.dropna(), bins = bi)
#Definition of Gaussian function
def f(x, a, b, c):
return (a * py.exp(-(x - mean)**2.0 / (2 *sigma**2)))
x = [0.5 * (data[1][i] + data[1][i+1]) for i in xrange(len(data[1])-1)]
y = data[0]
#Fitting the peak of the distribution
popt, pcov = optimize.curve_fit(f, x, y)
chi2, p = stats.chisquare(popt)
x_fit = py.linspace(x[0], x[-1], 80000)
y_fit = f(x_fit, *popt)
plot(x_fit, y_fit, lw=3, color="r",ls="--")
plt.xlim(0,2)
plt.tick_params(axis='both', which='major', labelsize=20)
plt.show()
The problem is how to integrate the defined function (f) and count the number under the area. Here I attach the file T067.csv. Thanks in advance for your kind consideration.
BackPFT='T061.csv'
df_180 = pd.read_csv(BackPFT, skip_blank_lines=True ,skiprows=1,header=None,skipfooter=None,engine='python')
x_180=df_180.iloc[:,3]
y_180=df_180.iloc[:,4]
b=42.4
E=109.8
LET=24.19
REL=127.32
mean=0.339; m1=0.259
sigma=0.012; s1=0.015
s=np.sqrt((((16*x_180**2*b**2)/((4*b**2-y_180**2)**2))+1))-1
draw=s
bi=np.linspace(0,8, 2000)
binwidth=0.004
#I want to plot the dsitribution of s. This distribution has three gaussian peaks
data = py.hist(draw.dropna(), bins = bi,color='gray',)
#first Gaussian function for the first peak (peaks counted from the right)
def f(x, a, b, c):
return (a * py.exp(-(x - mean)**2.0 / (2 *sigma**2)))
# fitting the function (Gaussian)
x = [0.5 * (data[1][i] + data[1][i+1]) for i in xrange(len(data[1])-1)]
y = data[0]
popt, pcov = optimize.curve_fit(f, x, y)
chi, p = stats.chisquare(popt)
x_fit = py.linspace(x[0], x[-1], 80000)
y_fit = f(x_fit, *popt)
plot(x_fit, y_fit, lw=5, color="r",ls="--")
#integration of first function f
gaussF = lambda x, a: f(x, a, sigma, mean)
bins=((6*sigma)/(binwidth))
delta = ((mean+3*sigma) - (mean-3*sigma))/bins
f1 = lambda x : f(x, popt[0], sigma, mean)
result = quad(f1,mean-3*sigma,mean+3*sigma)
area = result[0] # this give the area after integration of the gaussian
numPar = area / delta # this gives the number of particle under the integrated area
print"\n\tArea under curve = ", area, "\n\tNumber of particel= ", numPar
The file T061.csv here. Thanks Dr. I Putu Susila for his kind co-operation and interest.
I am trying to fit some data that are distributed in the time following a rising gaussian curve, and then decaying exponentially.
I have found this example on the web, that is very similar to my case, but I just started to fit with python, and the example seems quite confusing to me.
Nonetheless, I have tryied to adapt the example to my script and data, and in the following is my progress:
#!/usr/bin/env python
import pyfits, os, re, glob, sys
from scipy.optimize import leastsq
from numpy import *
from pylab import *
from scipy import *
from scipy import optimize
import numpy as N
import pylab as P
data=pyfits.open('http://heasarc.gsfc.nasa.gov/docs/swift/results/transients/weak/GX304-1.orbit.lc.fits')
time = data[1].data.field(0)/86400. + data[1].header['MJDREFF'] + data[1].header['MJDREFI']
rate = data[1].data.field(1)
error = data[1].data.field(2)
data.close()
cond = ((time > 56200) & (time < 56220))
time=time[cond]
rate=rate[cond]
error=error[cond]
def expGauss(x, pos, wid, tConst, expMod = 0.5, amp = 1):
expMod *= 1.0
gNorm = amp * N.exp(-0.5*((x-pos)/(wid))**2)
g = expBroaden(gNorm, tConst, expMod)
return g, gNorm
def expBroaden(y, t, expMod):
fy = F.fft(y)
a = N.exp(-1*expMod*time/t)
fa = F.fft(a)
fy1 = fy*fa
yb = (F.ifft(fy1).real)/N.sum(a)
return yb
if __name__ == '__main__':
# Fit the first set
#p[0] -- amplitude, p[1] -- position, p[2] -- width
fitfuncG = lambda p, x: p[0]*N.exp(-0.5*(x-p[1])**2/p[2]**2) # Target function
errfuncG = lambda p, x, y: fitfuncG(p, x) - y # Distance to the target function
p0 = [0.20, 56210, 2.0] # Initial guess for the parameters
p1, success = optimize.leastsq(errfuncG, p0[:], args=(time, rate))
p1G = fitfuncG(p1, time)
# P.plot(rate, 'ro', alpha = 0.4, label = "Gaussian")
# P.plot(p1G, label = 'G-Fit')
def expGauss(x, pos, wid, tConst, expMod = 0.5, amp = 1):
#p[0] -- amplitude, p[1] -- position, p[2] -- width, p[3]--tConst, p[4] -- expMod
fitfuncExpG = lambda p, x: expGauss(x, p[1], p[2], p[3], p[4], p[0])[0]
errfuncExpG = lambda p, x, y: fitfuncExpG(p, x) - y # Distance to the target function
p0a = [0.20, 56210, 2.0] # Initial guess for the parameters
p1a, success = optimize.leastsq(errfuncExpG, p0a[:], args=(time, rate))
p1aG = fitfuncExpG(p1a, time)
print type(rate), type(time), len(rate), len(time)
P.plot(rate, 'go', alpha = 0.4, label = "ExpGaussian")
P.plot(p1aG, label = 'ExpG-Fit')
P.legend()
P.show()
I am sure to have confused the whole thing, so sorry in advance for that, but at this point I don't know how to go further...
The code take the data from the web, so it is directly executable.
At the moment the code runs without any error, but it doesn't produce any plot.
Again, my goal is to fit the data with those two functions, how can I improve my code to do that?
Any suggestion is really appreciated.
Similarly to your other question, here also I would use a trigonometric function to fit this peaK:
The following code works if pasted after your code:
import numpy as np
from scipy.optimize import curve_fit
x = time
den = x.max() - x.min()
x -= x.min()
y_points = rate
def func(x, a1, a2, a3):
return a1*sin(1*pi*x/den)+\
a2*sin(2*pi*x/den)+\
a3*sin(3*pi*x/den)
popt, pcov = curve_fit(func, x, y_points)
y = func(x, *popt)
plot(time,rate)
plot(x,y, color='r', linewidth=2.)
show()