I am trying to fit different data sets with different non-linear functions that share some parameters. It looks something like this:
import matplotlib
from matplotlib import pyplot as plt
from scipy import optimize
import numpy as np
# some non-linear functions
def Sigma1x(x,C11,C111,C1111,C11111):
    return C11*x+1/2*C111*pow(x,2)+1/6*C1111*pow(x,3)+1/24*C11111*pow(x,4)
def Sigma2x(x,C12,C112,C1112,C11112):
    return C12*x+1/2*C112*pow(x,2)+1/6*C1112*pow(x,3)+1/24*C11112*pow(x,4)
def Sigma1y(y,C12,C111,C222,C112,C1111,C1112,C2222,C12222):
    return C12*y+1/2*(C111-C222+C112)*pow(y,2)+1/12*(C111+2*C1112-C2222)*pow(y,3)+1/24*C12222*pow(y,4)
def Sigma2y(y,C11,C222,C2222,C22222):
    return C11*y+1/2*C222*pow(y,2)+1/6*C2222*pow(y,3)+1/24*C22222*pow(y,4)
def Sigmaz(z,C11,C12,C111,C222,C112,C1111,C1112,C2222,C1122,C11111,C11112,C12222,C11122,C22222):
    return (C11+C12)*z+1/2*(2*C111-C222+3*C112)*pow(z,2)+1/6*(3/2*C1111+4*C1112-1/2*C222+3*C1122)*pow(z,3)+\
        1/24*(3*C11111+10*C11112-5*C12222+10*C11122-2*C22222)*pow(z,4)
# Experimental datasets
Xdata=np.loadtxt('x-direction.txt') # contains the x axis and two other datasets; should be fitted with Sigma1x and Sigma2x
Ydata=np.loadtxt('y-direction.txt') # contains the y axis and two other datasets; should be fitted with Sigma1y and Sigma2y
Zdata=np.loadtxt('z-direction.txt') # contains the z axis and one dataset; fitted with Sigmaz
The question is how to use optimize.leastsq (or another package) to fit the data with the appropriate functions, knowing that they share multiple parameters.
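One way to do this with plain scipy (a sketch only, not the poster's code: the file names and column layout are taken from the comments above, and the initial guess is arbitrary) is to stack the residuals of all curves into one vector and hand that to scipy.optimize.least_squares; any constant that appears in several expressions is then automatically fitted to one common value. Shown here for Sigma1x and Sigma2y, which share C11:

import numpy as np
from scipy import optimize

def Sigma1x(x, C11, C111, C1111, C11111):
    return C11*x + 1/2*C111*x**2 + 1/6*C1111*x**3 + 1/24*C11111*x**4

def Sigma2y(y, C11, C222, C2222, C22222):
    return C11*y + 1/2*C222*y**2 + 1/6*C2222*y**3 + 1/24*C22222*y**4

def residuals(p, x, y, s1x_data, s2y_data):
    C11, C111, C1111, C11111, C222, C2222, C22222 = p
    r1 = s1x_data - Sigma1x(x, C11, C111, C1111, C11111)
    r2 = s2y_data - Sigma2y(y, C11, C222, C2222, C22222)   # C11 is shared with Sigma1x
    return np.concatenate([r1, r2])                        # one residual vector, one parameter set

# Hypothetical file layout, as described in the comments above
Xdata = np.loadtxt('x-direction.txt')
Ydata = np.loadtxt('y-direction.txt')
x, s1x = Xdata[:, 0], Xdata[:, 1]
y, s2y = Ydata[:, 0], Ydata[:, 2]

p0 = np.ones(7)                                            # arbitrary starting guess
result = optimize.least_squares(residuals, p0, args=(x, y, s1x, s2y))
print(result.x)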
I was able to (partially) solve the initial question. I found symfit very comprehensive and easy to use, so I wrote the following code:
import matplotlib.pyplot as plt
from symfit import *
import numpy as np
from symfit.core.minimizers import DifferentialEvolution, BFGS
Y_strain = np.genfromtxt('Y_strain.csv', delimiter=',')
X_strain=np.genfromtxt('X_strain.csv', delimiter=',')
xmax=max(X_strain[:,0])
xmin=min(X_strain[:,0])
xdata = np.linspace(xmin, xmax, 50)
ymax=max(Y_strain[:,0])
ymin=min(Y_strain[:,0])  # was max(...), which would make ymin equal to ymax
ydata=np.linspace(ymin, ymax, 50)
x,y,Sigma1x,Sigma2x,Sigma1y,Sigma2y= variables('x,y,Sigma1x,Sigma2x,Sigma1y,Sigma2y')
C11,C111,C1111,C11111,C12,C112,C1112,C11112,C222,C2222,C12222,C22222 = parameters('C11,C111,C1111,C11111,C12,C112,C1112,C11112,C222,C2222,C12222,C22222')
model = Model({
    Sigma1x: C11*x+1/2*C111*pow(x,2)+1/6*C1111*pow(x,3)+1/24*C11111*pow(x,4),
    Sigma2x: C12*x+1/2*C112*pow(x,2)+1/6*C1112*pow(x,3)+1/24*C11112*pow(x,4),
    #Sigma1y: C12*y+1/2*(C111-C222+C112)*pow(y,2)+1/12*(C111+2*C1112-C2222)*pow(y,3)+1/24*C12222*pow(y,4),
    #Sigma2y: C11*y+1/2*C222*pow(y,2)+1/6*C2222*pow(y,3)+1/24*C22222*pow(y,4),
})
fit = Fit(model, x=X_strain[:,0], Sigma1x=X_strain[:,1],Sigma2x=X_strain[:,2])
fit_result = fit.execute()
print(fit_result)
plt.scatter(Y_strain[:,0],Y_strain[:,2])
plt.scatter(Y_strain[:,0],Y_strain[:,1])
plt.plot(xdata, model(x=xdata, **fit_result.params).Sigma1x)
plt.plot(xdata, model(x=xdata, **fit_result.params).Sigma2x)
However, the resulting fit is very bad:
Parameter Value Standard Deviation
C11 1.203919e+02 3.988977e+00
C111 -6.541505e+02 5.643111e+01
C1111 1.520749e+03 3.713742e+02
C11111 -7.824107e+02 1.015887e+03
C11112 4.451211e+03 1.015887e+03
C1112 -1.435071e+03 3.713742e+02
C112 9.207923e+01 5.643111e+01
C12 3.272248e+01 3.988977e+00
Status message Desired error not necessarily achieved due to precision loss.
Number of iterations 59
Objective <symfit.core.objectives.LeastSquares object at 0x000001CC00C0A508>
Minimizer <symfit.core.minimizers.BFGS object at 0x000001CC7F84A548>
Goodness of fit qualifiers:
chi_squared 6.230510793023184
objective_value 3.115255396511592
r_squared 0.991979767376565
Any ideas on how to improve the fit?
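Two things that might help (a sketch only, not verified against the actual data): give the parameters finite bounds or sensible starting values instead of the defaults, and fit the x- and y-direction data simultaneously by restoring the commented-out Sigma1y/Sigma2y lines in the Model, since those are what actually tie the shared constants together. The already-imported DifferentialEvolution can then be run as a global search before BFGS, if your symfit version supports chained minimizers. The bounds below are placeholders, and which Y_strain column corresponds to which Sigma is assumed here:

# Sketch: assumes the Sigma1y/Sigma2y lines in the Model above have been uncommented.
for p in (C11, C111, C1111, C11111, C12, C112, C1112, C11112, C222, C2222, C12222, C22222):
    p.min, p.max = -1e4, 1e4          # DifferentialEvolution needs finite bounds (placeholder values)

fit = Fit(model,
          x=X_strain[:, 0], Sigma1x=X_strain[:, 1], Sigma2x=X_strain[:, 2],
          y=Y_strain[:, 0], Sigma1y=Y_strain[:, 1], Sigma2y=Y_strain[:, 2],
          minimizer=[DifferentialEvolution, BFGS])  # global search, then local refinement
fit_result = fit.execute()
print(fit_result)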
I am trying to implement a Butterworth filter with Python in a Jupyter notebook. I wrote this code following a tutorial.
The data come from a CSV file called Samples.csv.
The data in Samples.csv look like this:
998,4778415
1009,209592
1006,619094
1001,785406
993,9426543
990,1408991
992,736118
995,8127334
1002,381664
1006,094429
1000,634799
999,3287747
1002,318812
999,3287747
1004,427698
1008,516733
1007,964781
1002,680906
1000,14449
994,257009
The column is called Euclidian Norm. The data range from 0 to 1679.286158 and there are 1838 rows.
This is the code in Jupyter:
from scipy.signal import filtfilt
from scipy import stats
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy

def plot():
    data=pd.read_csv('Samples.csv',sep=";", decimal=",")
    sensor_data=data[['Euclidian Norm']]
    sensor_data=np.array(sensor_data)
    time=np.linspace(0,1679.286158,1838)
    plt.plot(time,sensor_data)
    plt.show()
    filtered_signal=bandPassFilter(sensor_data)
    plt.plot(time,sensor_data)
    plt.show()

def bandPassFilter(signal):
    fs = 4000.0
    lowcut=20.0
    highcut=50.0
    nyq=0.5*fs
    low=lowcut/nyq
    high=highcut/nyq
    order =2
    b,a=scipy.signal.butter(order,[low,high],'bandpass',analog=False)
    y=scipy.signal.filtfilt(b,a,signal,axis=0)
    return(y)

plot()
My problem is that nothing changes in my data: the filter doesn't seem to do anything, and the graph of the filtered data is the same as the source data. Does anyone know what could be wrong?
The first graph is the source data and the second graph is the filtered data; they look practically identical.
I can't comment yet.
You never actually use filtered_signal: you call plt.plot with the same arguments (time, sensor_data) twice, which is why both graphs are identical.
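In other words, a minimal fix (sketch) is to change the last three lines of plot() so the second figure actually shows the filtered signal:

filtered_signal = bandPassFilter(sensor_data)
plt.plot(time, filtered_signal)   # plot the filtered data, not sensor_data again
plt.show()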
Here's one of my implementations with added interpolation, very similar to yours:
import numpy as np
import scipy.signal
import scipy.interpolate
import matplotlib.pyplot as plt

def butterFit(data, freq, order=2):
    b, a = scipy.signal.butter(order, freq)  # get the coefficients for filtfilt
    return scipy.signal.filtfilt(b, a, data)

def plotFilteredSplines(timeframe, data, amount_points):
    # Generate evenly spread indices for the data points.
    indices = np.arange(0, len(data), amount_points)
    cutoff_freq = 2 / (2/10 * len(timeframe))
    # Smooth the data with the Butterworth filter :)
    data = butterFit(data, cutoff_freq)
    # Plot the filtered data.
    plt.plot(timeframe, data, '-.')
    interpol_x = np.linspace(timeframe[0], timeframe[-1], 100)
    # Get the cubic spline approximation function.
    interpolation = scipy.interpolate.interp1d(timeframe, data, kind='cubic')
    # Plot the interpolation over the extended time frame.
    plt.plot(interpol_x, interpolation(interpol_x), '-r')
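For reference, a hypothetical call would look something like this (the time base and data are made up):

t = np.linspace(0, 10, 500)
noisy = np.sin(t) + 0.2*np.random.randn(t.size)
plotFilteredSplines(t, noisy, amount_points=50)
plt.show()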
I'm trying to do a Gaussian fit for some experimental data but I keep running into error after error. I've followed a few different threads online, but either the fit isn't good (it's just a horizontal line) or the code won't run at all. I'm following code from another thread; below is my code.
I apologize if my code seems a bit messy. There are some bits left over from other attempts at making it work, hence the "astropy" import.
import math as m
import matplotlib.pyplot as plt
import numpy as np
from scipy import optimize as opt
import pandas as pd
import statistics as stats
from astropy import modeling
def gaus(x,a,x0,sigma, offset):
    return a*m.exp(-(x-x0)**2/(2*sigma**2)) + offset

# Python program to get average of a list
def Average(lst):
    return sum(lst) / len(lst)
wavelengths = [391.719, 391.984, 392.248, 392.512, 392.777, 393.041, 393.306, 393.57, 393.835, 394.099, 391.719, 391.455, 391.19, 390.926, 390.661, 390.396]
intensities = [511.85, 1105.85, 1631.85, 1119.85, 213.85, 36.85, 10.85, 6.85, 13.85, 7.85, 511.85, 200.85, 80.85, 53.85, 14.85, 24.85]
n=sum(intensities)
mean = sum(wavelengths*intensities)/n
sigma = m.sqrt(sum(intensities*(wavelengths-mean)**2)/n)
def gaus(x,a,x0,sigma):
    return a*m.exp(-(x-x0)**2/(2*sigma**2))
popt,pcov = opt.curve_fit(gaus,wavelengths,intensities,p0=[1,mean,sigma])
print(popt)
plt.scatter(wavelengths, intensities)
plt.title("Helium Spectral Line Peak 1")
plt.xlabel("Wavelength (nm)")
plt.ylabel("Intensity (a.u.)")
plt.show()
Edit: thanks to the kind user below, my curve now seems to fit reasonably well. However, one of the points seems to connect back to an earlier point (screenshot omitted)?
There are two problems with your code. The first is that you are performing vector operations on Python lists, which gives you the first error at the line mean = sum(wavelengths*intensities)/n; you should use np.array instead. The second is that you call math.exp on a Python list, which again throws an error because it only accepts a real number, so you should use np.exp there instead.
The following code solves your problem:
import matplotlib.pyplot as plt
import numpy as np
from scipy import optimize as opt
wavelengths = [391.719, 391.984, 392.248, 392.512, 392.777, 393.041,
393.306, 393.57, 393.835, 394.099, 391.719, 391.455,
391.19, 390.926, 390.661, 390.396]
intensities = [511.85, 1105.85, 1631.85, 1119.85, 213.85, 36.85, 10.85, 6.85,
13.85, 7.85, 511.85, 200.85, 80.85, 53.85, 14.85, 24.85]
wavelengths_new = np.array(wavelengths)
intensities_new = np.array(intensities)
n=sum(intensities)
mean = sum(wavelengths_new*intensities_new)/n
sigma = np.sqrt(sum(intensities_new*(wavelengths_new-mean)**2)/n)
def gaus(x,a,x0,sigma):
return a*np.exp(-(x-x0)**2/(2*sigma**2))
popt,pcov = opt.curve_fit(gaus,wavelengths_new,intensities_new,p0=[1,mean,sigma])
print(popt)
plt.scatter(wavelengths_new, intensities_new, label="data")
plt.plot(wavelengths_new, gaus(wavelengths_new, *popt), label="fit")
plt.title("Helium Spectral Line Peak 1")
plt.xlabel("Wavelength (nm)")
plt.ylabel("Intensity (a.u.)")
plt.show()
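Regarding the edit about a point "connecting back": the wavelength list is not monotonically increasing (it rises to 394.099 and then drops back to 390.396), so plotting the fitted curve against it makes the line double back on itself. One way around it (a small sketch, reusing the names from the code above) is to evaluate the fit on a sorted grid instead:

x_plot = np.linspace(wavelengths_new.min(), wavelengths_new.max(), 200)
plt.plot(x_plot, gaus(x_plot, *popt), label="fit")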
I created some data from two superposed normal distributions and then applied sklearn.neighbors.KernelDensity and scipy.stats.gaussian_kde to estimate the density function. However, using the same bandwidth (1.0) and the same kernel, the two methods produce different outcomes. Can someone explain the reason for this to me? Thanks for the help.
Below you can find the code to reproduce the issue:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
import seaborn as sns
from sklearn.neighbors import KernelDensity
n = 10000
dist_frac = 0.1
x1 = np.random.normal(-5,2,int(n*dist_frac))
x2 = np.random.normal(5,3,int(n*(1-dist_frac)))
x = np.concatenate((x1,x2))
np.random.shuffle(x)
eval_points = np.linspace(np.min(x), np.max(x))
kde_sk = KernelDensity(bandwidth=1.0, kernel='gaussian')
kde_sk.fit(x.reshape([-1,1]))
y_sk = np.exp(kde_sk.score_samples(eval_points.reshape(-1,1)))
kde_sp = gaussian_kde(x, bw_method=1.0)
y_sp = kde_sp.pdf(eval_points)
sns.kdeplot(x)
plt.plot(eval_points, y_sk)
plt.plot(eval_points, y_sp)
plt.legend(['seaborn','scikit','scipy'])
If I change the scipy bandwidth to 0.25, the results of the two methods look approximately the same.
Bandwidth does not mean the same thing in scipy.stats.gaussian_kde and sklearn.neighbors.KernelDensity. scipy.stats.gaussian_kde uses a bandwidth factor (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html). For a 1-D kernel density estimate the following relation applies:
bandwidth of sklearn.neighbors.KernelDensity = bandwidth factor of scipy.stats.gaussian_kde * standard deviation of the sample
For your estimate this probably means that your sample standard deviation is about 4.
I would like to refer to Getting bandwidth used by SciPy's gaussian_kde function for more information.
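If that relation holds, the two estimators can be made to agree (a sketch, untested) by dividing the intended bandwidth by the sample standard deviation when calling gaussian_kde:

bw = 1.0                                        # the sklearn-style bandwidth you want
kde_sp = gaussian_kde(x, bw_method=bw / x.std(ddof=1))
y_sp = kde_sp.pdf(eval_points)                  # should now be close to y_sk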
To be honest, I don't know why, but using the scipy hyperparameter bw_method='scott' makes it work exactly the same as seaborn.
So, it seems to be all about the hyperparameters. We could find out why by understanding them in depth, but in the meantime just use 'scott' or 'silverman' instead of a raw scalar.
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
import seaborn as sns
from sklearn.neighbors import KernelDensity
n = 10000
dist_frac = 0.1
x1 = np.random.normal(-5,2,int(n*dist_frac))
x2 = np.random.normal(5,3,int(n*(1-dist_frac)))
x = np.concatenate((x1,x2))
np.random.shuffle(x)
eval_points = np.linspace(np.min(x), np.max(x))
kde_sk = KernelDensity(bandwidth=1, kernel='gaussian')
kde_sk.fit(x.reshape([-1,1]))
y_sk = np.exp(kde_sk.score_samples(eval_points.reshape(-1,1)))
kde_sp = gaussian_kde(x, bw_method='scott') ### I MEAN HERE! ###
y_sp = kde_sp.pdf(eval_points)
sns.kdeplot(x)
plt.plot(eval_points, y_sk)
plt.plot(eval_points, y_sp)
plt.legend(['seaborn','scikit','scipy'])
Increase the size of the 'random normal' samples; your data points are too few.
Try with n=500000 and check the results.
I have a 1-D data set with only one independent column. I would like to fit some model to it in order to sample from that model. The raw data:
[Data set plot]
I tried various theoretical distributions from the Fitter package (https://pypi.org/project/fitter/), but none of them works well. Then I tried kernel density estimation using sklearn. It is good, but I could not prevent negative values because of the way it works. Finally, I tried a log-normal fit, but it is not really perfect.
Here is the code for the log-normal fit:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import math
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
NN = 3915 # sample same number as original data set
df = pd.read_excel (r'Data_sets2.xlsx',sheet_name="Set1")
eps = 0.1 # Additional term for c
"""
Estimate parameters of log(c) as normal distribution
"""
df["c"] = df["c"] + eps
mu = np.mean(np.log(df["c"]))
s = np.std(np.log(df["c"]))
print("Mean:",mu,"std:",s)
def simulate(N):
    c = []
    for i in range(N):
        c_s = np.exp(np.random.normal(loc = mu, scale = s, size=1)[0])
        c.append(round(c_s))
    return (c)

predicted_c = simulate(NN)
XX = np.arange(NN)  # scipy.arange has been removed from newer SciPy; use np.arange
### plot C relation ###
plt.scatter(XX,df["c"],color='g',label="Original data")
plt.scatter(XX,predicted_c,color='r',label="Sample data")
plt.xlabel('Index')
plt.ylabel('c')
plt.legend()
plt.show()
[Plot: original data vs. samples]
What I am looking for is how to improve the fit; any suggestions or pointers to models that may fit my data with better accuracy are appreciated. Thanks.
Here is a graphical Python fitter for the scipy statistical distribution Double Gamma using your spreadsheet data; I hope this might be of some use, as a normal distribution seems to be a poor fit to this data set. The scipy documentation for dgamma is at https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.dgamma.html - incidentally, the double Weibull distribution fitted almost as well.
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_excel (r'Data_sets2.xlsx',sheet_name="Set1")
eps = 0.1 # Additional term for c
data = df["c"] + eps
P = ss.dgamma.fit(data)
rX = np.linspace(min(data), max(data), 50)
rP = ss.dgamma.pdf(rX, *P)
plt.hist(data, bins=25, density=True, color='slategrey')  # 'normed' has been removed from matplotlib; use density
plt.plot(rX, rP, color='darkturquoise')
plt.show()
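Since the original goal was to sample from the fitted model, a short follow-up sketch: every scipy distribution exposes rvs(), so new points can be drawn directly from the fitted double gamma and compared with the data:

# Draw new samples from the fitted double-gamma model
samples = ss.dgamma.rvs(*P, size=len(data))

plt.hist(data, bins=25, density=True, alpha=0.5, label='original data')
plt.hist(samples, bins=25, density=True, alpha=0.5, label='sampled from dgamma fit')
plt.legend()
plt.show()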
I have an energy spectrum from a cosmic ray detector. The spectrum follows an exponential curve but it will have broad (and maybe very slight) lumps in it. The data, obviously, contains an element of noise.
I'm trying to smooth out the data and then plot its gradient.
So far I've been using the scipy spline function to smooth it and then np.gradient().
As you can see from the picture, the gradient function's method is to find the differences between each point, and it doesn't show the lumps very clearly.
I basically need a smooth gradient graph. Any help would be amazing!
I've tried 2 spline methods:
# (Assumes old-style imports such as `from scipy.interpolate import spline`
#  and `from scipy import interpolate`; scipy.interpolate.spline has since been removed.)
def smooth_data(y,x,factor):
    print("smoothing data by interpolation...")
    xnew=np.linspace(min(x),max(x),factor*len(x))
    smoothy=spline(x,y,xnew)
    return smoothy,xnew

def smooth2_data(y,x,factor):
    xnew=np.linspace(min(x),max(x),factor*len(x))
    f=interpolate.UnivariateSpline(x,y)
    g=interpolate.interp1d(x,y)
    return g(xnew),xnew
Edit: I tried numerical differentiation:
# (Assumes `from scipy.integrate import simps` and `from scipy.optimize import fmin`.)
def smooth_data(y,x,factor):
    print("smoothing data by interpolation...")
    xnew=np.linspace(min(x),max(x),factor*len(x))
    smoothy=spline(x,y,xnew)
    return smoothy,xnew

def minim(u,f,k):
    """Functional to be minimised to find the optimum u. f is the original, u is the approximation."""
    integral1=abs(np.gradient(u))
    part1=simps(integral1)
    part2=simps(u)
    integral2=abs(part2-f)**2.
    part3=simps(integral2)
    F=k*part1+part3
    return F

def fit(data_x,data_y,denoising,smooth_fac):
    smy,xnew=smooth_data(data_y,data_x,smooth_fac)
    y0,xnnew=smooth_data(smy,xnew,1./smooth_fac)
    y0=list(y0)
    data_y=list(data_y)
    data_fit=fmin(minim, y0, args=(data_y,denoising), maxiter=1000, maxfun=1000)
    return data_fit
However, it just returns the same graph again!
There is an interesting method published on this: Numerical Differentiation of Noisy Data. It should give you a nice solution to your problem. More details are given in another, accompanying paper. The author also gives Matlab code that implements it; an alternative implementation in Python is also available.
If you want to pursue the interpolation-with-splines method, I would suggest adjusting the smoothing factor s of scipy.interpolate.UnivariateSpline().
Another solution would be to smooth your function through convolution (say with a Gaussian).
The paper I linked to claims to prevent some of the artifacts that come up with the convolution approach (the spline approach might suffer from similar difficulties).
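As a rough sketch of the convolution idea (the kernel width here is arbitrary, and the toy signal just stands in for the spectrum), scipy.ndimage.gaussian_filter1d can smooth the data before taking the gradient:

import numpy as np
from scipy.ndimage import gaussian_filter1d

x = np.linspace(0, 10, 500)
y = np.exp(-x) + 0.01*np.random.randn(x.size)   # toy noisy "spectrum"

y_smooth = gaussian_filter1d(y, sigma=5)        # Gaussian convolution smoothing
dy_dx = np.gradient(y_smooth, x)                # gradient of the smoothed signal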
I won't vouch for the mathematical validity of this; it looks like the paper from LANL that EOL cited would be worth looking into. Anyway, I’ve gotten decent results using SciPy’s splines’ built-in differentiation when using splev.
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
from scipy.interpolate import splrep, splev
x = np.arange(0,2,0.008)
data = np.polynomial.polynomial.polyval(x,[0,2,1,-2,-3,2.6,-0.4])
noise = np.random.normal(0,0.1,250)
noisy_data = data + noise
f = splrep(x,noisy_data,k=5,s=3)
#plt.plot(x, data, label="raw data")
#plt.plot(x, noise, label="noise")
plt.plot(x, noisy_data, label="noisy data")
plt.plot(x, splev(x,f), label="fitted")
plt.plot(x, splev(x,f,der=1)/10, label="1st derivative")
#plt.plot(x, splev(x,f,der=2)/100, label="2nd derivative")
plt.hlines(0,0,2)
plt.legend(loc=0)
plt.show()
You can also use scipy.signal.savgol_filter.
[Result plot]
Example:
import matplotlib.pyplot as plt
import numpy as np
import scipy.signal
from random import random
# generate data
x = np.array(range(100))/10
y = np.sin(x) + np.array([random()*0.25 for _ in x])
dydx = scipy.signal.savgol_filter(y, window_length=11, polyorder=2, deriv=1)
# Plot result
plt.plot(x, y, label='Original signal')
plt.plot(x, dydx*10, label='1st Derivative')
plt.plot(x, np.cos(x), label='Expected 1st Derivative')
plt.legend()
plt.show()