As is tradition, I should start by saying that I am pretty new to Python.
I have a set of x and y values in a CSV file, and my y values are quite noisy. So far I have managed to use a filter (scipy.signal.savgol_filter) to smooth out the noise, plot my graph, and fit a linear regression over the part of the data that shows a linear trend. This part is important, because my question is about the linear fit to that part of the data. Here is the code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.signal
import os
from scipy.signal import savgol_coeffs
from sklearn.metrics import r2_score
from scipy.linalg import lstsq
plt.rc('lines',linewidth=1)
plt.rc('axes', labelsize=16)
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)
plt.rc('legend', fontsize=10)
# define material parameters
deprate = 5.90E-07
#deposition rate unit: cm/s
Ms = 180000 # Si substrate modulus, unit: MPa
hs = 0.03 # Si substrate thickness, unit: cm
stressfac = Ms*hs**2/6 # stress prefactor, unit: MPa
def fit_slope(hfilm, curvature, h0, h1):
    # Least-squares fit of the slope of the curvature vs. film-thickness curve
    # between thickness = h0 and thickness = h1.
    # Returns the fit parameters p; the fitted line is y = p[1]*x + p[0].
    mask = (hfilm > h0) & (hfilm < h1)
    xdata = hfilm[mask]
    ydata = curvature[mask]
    A = xdata[:, np.newaxis] ** [0, 1]  # design matrix with columns [1, x]
    p, *_ = lstsq(A, ydata)
    return p
def read_MOSSdata(filename, deprate):
    data = pd.read_csv(filename, sep=r'\s*,\s*', engine='python')
    valid = ~data['time [s]'].isna()
    time = data['time [s]'][valid].to_numpy()
    curvature = data['Curvature'][valid]  # curvature unit: 1/cm
    hfilm = time * deprate  # film thickness unit: cm
    return time, hfilm, curvature
filename = (r'C:\Users\yavuz\01-0722-2.csv')
time, hfilm, curvature = read_MOSSdata(filename, deprate)
h0 = 0.00005
h1 = 0.00008
xdata = np.linspace(h0, h1, 500)
yhat = scipy.signal.savgol_filter(curvature, 21, 1)
p = fit_slope(hfilm, yhat, h0, h1)
plt.plot(hfilm, curvature)
plt.plot(hfilm, yhat, color='red', label='filtered data')
plt.plot(xdata, p[1]*xdata + p[0], color='green', linewidth=4, label='linear fitting')
plt.xlabel("Film thickness (cm)")
plt.ylabel("Curvature (1/cm)")
print(f'fitted stress = {-p[1]*stressfac} MPa')
plt.legend(loc=0)
My question is: how do I calculate the R-squared value of this fitted slope on my graph? I tried R-squared calculators such as sklearn.metrics, but the problem is that I limit my x values to a window when fitting the slope, so every version I tried fails with "expected x and y to have same length". I would attach the CSV file, but there doesn't seem to be an option for that. Thanks a lot for the help!
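For what it's worth, the usual cause of that length error is comparing a full-length array against a windowed one. Below is a minimal sketch, assuming the goal is the R-squared of the fit against the filtered data inside the fitting window (variable names as in the code above):

mask = (hfilm > h0) & (hfilm < h1)
y_obs = yhat[mask]                 # filtered curvature inside the window
y_fit = p[1]*hfilm[mask] + p[0]    # fitted line evaluated at the same thicknesses
print(f'R-squared = {r2_score(y_obs, y_fit):.4f}')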
I'm trying to plot my data on a log-log scale, using Theil-Sen regression for the best-fit line. However, when I work out my regression line on the log-log scale, it comes out parallel to my x = y line, which I don't think is correct.
[figure: data and regression line on a normal scale for x and y]
[figure: data and regression line on a log-log scale for x and y]
I found a related solution by chaooder for linear regression on a semi-log scale to be somewhat helpful. So currently my regression line would go from y = ax + c on a linear scale to y = 10^(a*log10(x) + c) on my log-log scale. But in my head I can't see how that has a solution, as I cannot calculate a.
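(For reference, taking log10 of both sides of y = 10^(a*log10(x) + c) gives log10(y) = a*log10(x) + c, which is an ordinary straight line in the variables log10(x) and log10(y). So a is simply the slope the regressor returns when it is fitted on the log-transformed data; there is nothing extra to solve for.)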
Here's the data:
index,x,y
0,0.22,0.26
1,0.39,0.1
2,0.4,0.17
3,0.56,0.41
4,0.57,0.12
5,0.62,0.54
6,0.78,0.99
7,0.79,0.35
8,0.8,0.33
9,0.83,0.91
10,0.95,0.81
11,1.08,0.23
12,1.34,0.11
13,1.34,0.44
14,1.35,0.11
15,1.58,0.24
16,1.66,0.71
17,2.11,0.54
18,2.13,0.42
19,2.19,1.72
20,2.25,2.16
21,2.39,0.95
22,2.4,0.16
23,2.73,0.92
24,2.87,1.1
25,2.96,0.27
26,3.12,1.66
27,3.26,0.06
28,3.28,0.68
29,3.34,0.7
30,3.38,1.14
31,3.39,1.81
32,3.41,0.19
33,3.49,1.4
34,3.52,1.57
35,3.6,0.99
36,3.64,1.28
37,3.65,1.68
38,3.89,1.66
39,3.93,1.64
40,4.01,1.04
41,4.07,0.32
42,4.22,0.68
43,4.52,0.57
44,4.53,0.59
45,4.56,0.7
46,4.6,1.15
47,4.62,1.31
48,4.68,1.09
49,5.03,0.48
50,5.06,0.7
51,5.31,0.62
52,5.41,0.21
53,5.45,2.06
54,6.0,0.72
55,6.06,0.36
56,6.64,1.41
57,6.74,0.59
58,6.96,0.95
59,7.01,1.13
60,7.14,1.56
61,7.14,2.82
62,7.19,1.49
63,7.21,0.88
64,7.23,1.31
65,7.55,0.76
66,7.72,0.5
67,7.75,1.65
68,7.77,1.48
69,7.9,1.8
70,7.95,0.68
71,8.03,1.12
72,8.09,2.61
73,8.86,1.71
74,9.31,0.23
75,9.5,2.35
76,9.62,1.84
77,9.91,0.56
78,9.95,1.67
79,10.4,1.15
80,10.8,0.88
81,11.28,1.8
82,11.31,1.58
83,11.43,1.0
84,12.38,2.83
85,13.38,1.45
86,13.9,1.99
87,30.3,1.99
And my current code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
from sklearn.linear_model import TheilSenRegressor

data = pd.read_csv('data.csv', index_col=0)  # the x,y data listed above

for log in [True, False]:
    fig, ax = plt.subplots()
    data.plot.scatter(ax=ax, x='x', y='y', loglog=log)
    vmin = np.amin(data[['x', 'y']].min().values)*0.8
    vmax = np.amax(data[['x', 'y']].max().values)*1.25
    ax.set_xlim(vmin, vmax)
    ax.set_ylim(vmin, vmax)
    ax.yaxis.set_minor_locator(AutoMinorLocator())
    ax.xaxis.set_minor_locator(AutoMinorLocator())
    # best fit (Theil-Sen) line
    X = data.x.values[:, np.newaxis]
    y = data.y.values
    if log:
        X = np.log10(X)
        y = np.log10(y)
    if len(y) > 0:
        estimator = TheilSenRegressor(fit_intercept=False)  # intentionally set intercept to 0
        estimator.fit(X=X, y=y)
        y0 = y[0]
        x0 = X[0]
        y_pred = estimator.predict(np.array([vmin, vmax]).reshape(2, 1))
        # y_pred = np.power(10, (estimator.predict(X)))
        gradient = (y_pred[1] - y_pred[0]) / (vmax - vmin)
        intercept = y_pred[1] - gradient*vmax
        print(f'gradient: {gradient} \n intercept: {intercept}')
        # Theil-Sen regression line
        ax.plot([vmin, vmax], y_pred, color='red', lw=1, zorder=1, label='Best fit')
    # 1:1 ratio line (black, dashed)
    ax.plot([vmin, vmax], [vmin, vmax], lw=1, color='black', ls='--', alpha=0.6, zorder=1,
            label='1:1 correlation')
    if log:
        ax.set_xscale('log')
        ax.set_yscale('log')
        ax.set_title('log-log scale')
        fig.savefig('TS_regression_loglog.png')
    else:
        ax.set_title('normal scale')
        fig.savefig('TS_regression_normalscale.png')
If you fitted on the log-log scale, the input for prediction needs to be on the log scale as well, and you need to transform the predictions back before plotting them. These are the lines in question, where the scales are not consistent:
y_pred = estimator.predict(np.array([vmin,vmax]).reshape(2,1))
[..]
ax.plot([vmin,vmax],y_pred,color='red',lw=1,zorder=1,label='Best fit')
For reference, these are the relevant variable definitions in your code (note that you should get the intercept and gradient from the fit itself):
vmin = np.amin(data[['x','y']].min().values)*0.8
vmax = np.amax(data[['x','y']].max().values)*1.25
X = data.x.values[:,np.newaxis]
y = data.y.values
With a slight modification to your code:
vmin = np.amin(data[['x','y']].min().values)*0.8
vmax = np.amax(data[['x','y']].max().values)*1.25
X = data.x.values[:,np.newaxis]
y = data.y.values
fig,ax = plt.subplots()
data.plot.scatter(ax=ax,x='x',y='y')
estimator = TheilSenRegressor(fit_intercept=False) # intentionally set intercept to 0
estimator.fit(X=np.log10(X),y=np.log10(y))
gradient = estimator.coef_[0]
intercept = estimator.intercept_
print([gradient,intercept])
y_pred = estimator.predict(np.log10([vmin,vmax]).reshape(2,1))
ax.plot([vmin,vmax],10**(y_pred),color='red',lw=1,zorder=1,label='Best fit')
ax.plot([vmin,vmax],[vmin,vmax],lw=1,color='black',ls='--',alpha=0.6,zorder=1,
label='1:1 correlation')
ax.set_xscale('log')
ax.set_yscale('log')
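Since the fit was done in log space, the red line is a power law on the original axes: 10^(a*log10(x) + c) = 10^c * x^a. As a small sketch, you could equivalently draw the same line directly from the fitted coefficients (gradient and intercept as defined above):

xs = np.logspace(np.log10(vmin), np.log10(vmax), 50)
ax.plot(xs, 10**intercept * xs**gradient, color='red', ls=':', lw=1)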
I'm trying to fit an asymmetric Gaussian to this data: http://ge.tt/99iNaL53 (csv file).
I have tried to use a skewed Gaussian model from lmfit, and also a spline, but I'm not able to get the Gaussian model to fit well, and the splines are not what I'm looking for (I don't want the spline to fit the data exactly, as shown below, and altering the level of smoothing isn't helping).
Here is code using the above data that produces the plot below. The second figure is an example of what I'm trying to achieve, with the goal of reading the rise and decay times from the fit.
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import CubicSpline
from scipy.interpolate import UnivariateSpline
from lmfit.models import SkewedGaussianModel
data = np.loadtxt('data.csv', delimiter=',')
x = data[:,0]
y = data[:,1]
# Skewed Gaussian fit
model = SkewedGaussianModel()
params = model.make_params(amplitude=400, center=3, sigma=7, gamma=1)
result = model.fit(y, params, x=x)
# Cubic Spline
cs = CubicSpline(x, y)
x_range = np.arange(x[0], x[-1], 0.1)
# Univariate Spline (degree 1)
us = UnivariateSpline(x, y, k=1)
# Univariate Spline (degree 5)
us2 = UnivariateSpline(x, y, k=5)
plt.scatter(x, y, marker='^', color='k', linewidth=0.5, s=10, label='data')
plt.plot(x_range, cs(x_range), label='Cubic Spline')
plt.plot(x_range, us(x_range), label='Univariate Spline, k = 1')
plt.plot(x_range, us2(x_range), label='Univariate Spline, k = 5')
plt.plot(x, result.best_fit, color='red', label='Skewed Gaussian Attempt')
plt.xlabel('x')
plt.ylabel('y')
plt.yscale('log')
plt.ylim(1,500)
plt.legend()
plt.show()
Is there a question here? I don't see one, actually.
That result from lmfit is the best fit to a skewed Gaussian model.
You've chosen to plot the result on a log scale, which completely changes the view of the quality of the fit and of what is not fit well.
It seems like you're expecting a better fit, but not a *too* good fit. Well, it looks like your data are not perfectly represented by a single skewed Gaussian, and it seems you were not expecting them to be. You could try different forms for the model function, say a skewed Lorentzian or something similar. But your data have that low-x shoulder, which definitely does not look like your uncited figure.
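For instance, here is a minimal sketch of swapping in a different lineshape: lmfit also provides a SkewedVoigtModel (a skewed Voigt profile, which includes a Lorentzian component). The initial guesses below are just carried over from the question's SkewedGaussianModel attempt, not tuned values:

from lmfit.models import SkewedVoigtModel

model = SkewedVoigtModel()
# initial guesses borrowed from the question; adjust for your data
params = model.make_params(amplitude=400, center=3, sigma=7, skew=1)
result = model.fit(y, params, x=x)
print(result.fit_report())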
I wrote something for J. Chem. Ed. [1] that involved fitting asymmetric Gaussian functions to data; you can find the core repo here [2]. Below is a snippet showing how I went about fitting a data set, where x = data[:,0] and y = data[:,1], to the type of function you're working with:
import numpy as np
from scipy.optimize import leastsq
from scipy.special import erf

initials = [6.5, 13, 1, 0]  # initial guess

def asymGaussian(x, p):
    amp = (p[0] / (p[2] * np.sqrt(2 * np.pi)))
    spread = np.exp((-(x - p[1]) ** 2.0) / (2 * p[2] ** 2.0))
    skew = (1 + erf((p[3] * (x - p[1])) / (p[2] * np.sqrt(2))))
    return amp * spread * skew

def residuals(p, y, x):
    return y - asymGaussian(x, p)

# run least-squares regression to optimize the initial parameters
cnsts = leastsq(
    residuals,
    initials,
    args=(
        data[:, 1],  # y values
        data[:, 0]   # x values
    ))[0]

y = asymGaussian(data[:, 0], cnsts)

Finally, just plot(data[:,0], y). Hope this helps!
[1] https://pubs.acs.org/doi/10.1021/acs.jchemed.9b00818
[2] https://github.com/1mikegrn/pyGC
I want to draw the normal of a curve at the specific point t_0 = 2*sp.pi/5.
The curve is given by the parametric equations x(t) = sin(3t) and y(t) = sin(4t), where t ∈ [0, 2π]. For this type of parametric curve, the parametric equations of the normal line are given by the following equations:
[figure: parametric equations of the normal line]
Attempt:
import numpy as np
import sympy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib notebook
t,t_0 = sp.symbols('t t_0',real=True)
r_x = sp.sin(3*t)
diff_r_x = sp.diff(r_x, t)
r_y = sp.sin(4*t)
diff_r_y = sp.diff(r_y, t)
para_eqx = r_x.subs(t, t_0) + diff_r_x.subs(t, t_0)*(t - t_0)  # parametric eq. of the normal (x component)
para_eqy = r_y.subs(t, t_0) - diff_r_x.subs(t, t_0)*(t - t_0)  # parametric eq. of the normal (y component)
r_x_normal = para_eqx.subs(t_0, 2*sp.pi/5)#plugging in t_0 = 2*sp.pi/5
r_y_normal = para_eqy.subs(t_0, 2*sp.pi/5)#plugging in t_0 = 2*sp.pi/5
t_range_normal = np.linspace(0, 250, 100) #from here on I have no clear idea on what is wrong.
xmarks = sp.lambdify(t, r_x_normal, "numpy")(t_range_normal)
ymarks = sp.lambdify(t, r_y_normal, "numpy")(t_range_normal)
fig, ax = plt.subplots(1)
complete_curve = ax.plot(xmarks, ymarks, ":", color="grey", alpha=0.5)
piece_of_curve = ax.plot(xmarks[:51], ymarks[:51], color="blue")
ax.plot(xmarks[50], ymarks[50], "o", color="blue")
plt.show()
I am struggling to evaluate these equations for the values of t given by t_range_normal. I used lambdify and then plotted the normal on the figure as a blue line.
However, I get the plot below, which is incorrect:
[figure: incorrect plot of the curve and its normal]
I must be missing something from t_range_normal = np.linspace(0, 250, 100) onwards...
Thank you.
Below is your code; let's go through it step by step:
import numpy as np
import sympy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
t,t_0 = sp.symbols('t t_0',real=True)
r_x = sp.sin(3*t)
diff_r_x = sp.diff(r_x, t)
r_y = sp.sin(4*t)
diff_r_y = sp.diff(r_y, t)
r_x_eq= r_x.subs(t, t_0)
r_y_eq = r_y.subs(t, t_0)
r_x_eq
Out: sin(3*t_0)
r_y_eq
Out: sin(4*t_0)
r_x_eq.subs(t_0, 2*sp.pi/5)
Out: -sqrt(-sqrt(5)/8 + 5/8)
r_y_eq.subs(t_0, 2*sp.pi/5)
Out: -sqrt(sqrt(5)/8 + 5/8)
Which is correct, as you are going a full round around the unit circle and sin(0) = sin(360) = sin(720), etc.
The second term of your parametric function is the same (but with opposed sign) for x and y (according to the figure you posted in your question):
para_eqx = r_x.subs(t, t_0) + diff_r_x.subs(t, t_0)*(t - t_0)  # parametric eq. of the normal (x component)
para_eqy = r_y.subs(t, t_0) - diff_r_x.subs(t, t_0)*(t - t_0)  # parametric eq. of the normal (y component)
Hence your two functions are:
r_x_normal = para_eqx.subs(t_0, 2*sp.pi/5)#plugging in t_0 = 2*sp.pi/5
r_x_normal
Out[:]: 3*(-sqrt(5)/4 - 1/4)*(t - 2*pi/5) - sqrt(-sqrt(5)/8 + 5/8)
r_y_normal = para_eqy.subs(t_0, 2*sp.pi/5)#plugging in t_0 = 2*sp.pi/5
r_y_normal
Out[:]: -3*(-sqrt(5)/4 - 1/4)*(t - 2*pi/5) - sqrt(sqrt(5)/8 + 5/8)
Hence, for each given t the two expressions differ only by a sign on the linear term and by a constant, so plotting xmarks against ymarks can only ever trace out a straight line.
t_range_normal = np.linspace(0, 250, 100) #from here on I have no clear idea on what is wrong.
xmarks = sp.lambdify(t, r_x_normal, "numpy")(t_range_normal)
ymarks = sp.lambdify(t, r_y_normal, "numpy")(t_range_normal)
fig, ax = plt.subplots(1)
complete_curve = ax.plot(xmarks, ymarks, ":", color="grey", alpha=0.5)
piece_of_curve = ax.plot(xmarks[:51], ymarks[:51], color="blue")
ax.plot(xmarks[50], ymarks[50], "o", color="blue")
plt.show()
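For comparison, here is a minimal numeric sketch of plotting the curve together with its normal at t_0, using the standard normal-line parametrization with direction (y'(t_0), -x'(t_0)), i.e. perpendicular to the tangent (x'(t_0), y'(t_0)), rather than the equations from the posted figure:

import numpy as np
import matplotlib.pyplot as plt

t_vals = np.linspace(0, 2*np.pi, 400)
x_curve = np.sin(3*t_vals)
y_curve = np.sin(4*t_vals)

t0 = 2*np.pi/5
x0, y0 = np.sin(3*t0), np.sin(4*t0)       # point on the curve
dx, dy = 3*np.cos(3*t0), 4*np.cos(4*t0)   # tangent components x'(t0), y'(t0)

# normal line through (x0, y0) with direction (dy, -dx)
s = np.linspace(-0.5, 0.5, 50)
fig, ax = plt.subplots()
ax.plot(x_curve, y_curve, ':', color='grey', alpha=0.5)
ax.plot(x0 + dy*s, y0 - dx*s, color='red', label='normal at t_0')
ax.plot(x0, y0, 'o', color='blue')
ax.set_aspect('equal')
ax.legend()
plt.show()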
I have this code:
import math
import numpy as np
from matplotlib import pyplot as plt

uH2 = 1.90866638
uHe = 3.60187307
eH2 = 213.38
eHe = 31.96
R = float(uH2*eH2)/(uHe*eHe)

C_Values = []
Delta = []
kHeST = []
J_f21 = []

data = np.genfromtxt("Lamda_HeHCL.txt", unpack=True)
J_i1 = data[1]
J_f1 = data[2]
kHe = data[7]

data = np.genfromtxt("Basecol_Basic_New_1.txt", unpack=True)
J_i2 = data[0]
J_f2 = data[1]
kH2 = data[5]

print(kHe)
print(kH2)

kHe = np.asarray(kHe, dtype=float)
kH2 = np.asarray(kH2, dtype=float)

g = len(kH2)
for n in range(0, g):
    if J_f2[n] == 1:
        Jf21 = J_f2[n]
        J_f21.append(Jf21)
        ratio = kHe[n]/kH2[n]
        C = (math.log(float(kH2[n]), 10) - math.log(float(kHe[n]), 10))/math.log(R, 10)
        C_Values.append(C)
        St = abs(J_f1[n] - J_i1[n])
        Delta.append(St)

print(C_Values)
print(Delta)
print(J_f21)

fig, ax = plt.subplots()
ax.scatter(Delta, C_Values)
for i, txt in enumerate(J_f21):
    ax.annotate(txt, (Delta[i], C_Values[i]))

plt.plot(np.unique(Delta), np.poly1d(np.polyfit(Delta, C_Values, 1))(np.unique(Delta)))
plt.plot(Delta, C_Values)

fit = np.polyfit(Delta, C_Values, 1)
fit_fn = np.poly1d(fit)
# fit_fn is now a function which takes in x and returns an estimate for y
plt.scatter(Delta, C_Values, Delta, fit_fn(Delta))
plt.xlim(0, 12)
plt.ylim(-3, 3)
In this code, I am trying to plot a linear regression that extends past the data and touches the x-axis. I am also trying to add a legend that shows the slope of the line. Using the code, I was able to plot this graph:
[figure: C_Values vs. Delta with annotated points and fitted line]
Here is some dummy data I have been using to try to extend the line and add a legend:
x = [5, 7, 9, 15, 20]
y = [10, 9, 8, 7, 6]
I would also like everything to be drawn as a scatter plot, except for the linear regression line.
Given that you don't provide the data you're loading from the files, I was unable to test this, but off the top of my head:
To extend the line past the plot, you could turn this line
plt.plot(np.unique(Delta), np.poly1d(np.polyfit(Delta, C_Values, 1))(np.unique(Delta)))
Into something like
x = np.linspace(0, 12, 50) # both 0 and 12 are from visually inspecting the plot
plt.plot(x, np.poly1d(np.polyfit(Delta, C_Values, 1))(x))
But if you want the line extended exactly to the x-axis, you can compute the root of the fitted polynomial:
polynomial = np.polyfit(Delta, C_Values, 1)
x = np.linspace(0, *np.roots(polynomial))
plt.plot(x, np.poly1d(polynomial)(x))
As for the scatter plot thing, it seems to me you could just remove this line:
plt.plot(Delta, C_Values)
Oh right, as for the legend, add a label to the plots you make, like this:
plt.plot(x, np.poly1d(polynomial)(x), label='Linear regression')
and add a call to plt.legend() just before plt.show().
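Putting those pieces together, here is a minimal sketch using the dummy x/y data from the question (since the data files aren't available); the fitted line runs from 0 out to its x-intercept, and the slope appears in the legend:

import numpy as np
import matplotlib.pyplot as plt

x = [5, 7, 9, 15, 20]
y = [10, 9, 8, 7, 6]

coeffs = np.polyfit(x, y, 1)                  # [slope, intercept]
xs = np.linspace(0, np.roots(coeffs)[0], 50)  # from 0 out to the x-intercept

plt.scatter(x, y, label='data')
plt.plot(xs, np.poly1d(coeffs)(xs),
         label=f'Linear regression (slope = {coeffs[0]:.2f})')
plt.legend()
plt.show()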