Calculating errors in polynomial coefficients - python

I have written some code that generates polynomials to fit several data sets that I have. I now want to calculate the error in the polynomial coefficients but am unsure how to go about this.
My current code is below:
import numpy.polynomial.polynomial as poly
import matplotlib.pyplot as plt
import numpy as np
f = [0,5,16,18.5,30,50]
a = [1.41,1.43,0.72,0.78,0.8,0.86]
b = [3.80e-5,5.40e-5,5.14e-5,5.16e-5,8.5e-5,1.58e-4]
c = [1.6e-2,1.54e-2,10.523e-2,14.589e-2,11.1e-2,5.66e-2]
f_new = np.linspace(f[0], f[5], num=len(f)*10)
coefs_a = poly.polyfit(f, a, 3)
coefs_b = poly.polyfit(f, b, 2)
coefs_c = poly.polyfit(f, c, 2)
ffit_a = poly.polyval(f_new, coefs_a)
ffit_b = poly.polyval(f_new, coefs_b)
ffit_c = poly.polyval(f_new, coefs_c)
print(coefs_a)
print(coefs_b)
print(coefs_c)
plt.plot(f,a)
plt.show()
plt.plot(f_new, ffit_a)
plt.title('a')
plt.xlabel('Frequency (Hz)')
plt.ylabel('Coefficient value')
plt.show()
plt.plot(f,b)
plt.show()
plt.plot(f_new, ffit_b)
plt.title('b')
plt.xlabel('Frequency (Hz)')
plt.ylabel('Coefficient value')
plt.show()
plt.plot(f,c)
plt.show()
plt.plot(f_new, ffit_c)
plt.title('c')
plt.xlabel('Frequency (Hz)')
plt.ylabel('Coefficient value')
plt.show()
So currently I generate coefficient values, and therefore polynomials, for the quantities named "a", "b" and "c". I now want to get the errors in these coefficients so I can calculate an overall error for each of the "a", "b" and "c" quantities.

You can get the sum of squared residuals (a measure of error) for each fit if you set full=True.
E.g. in your code, to get the residuals for coefs_a:
coefs_a, res_a = poly.polyfit(f, a, 3, full=True)
print(res_a[0]) # res_a is a list whose first entry is the sum of squared residuals
Docs for numpy.polynomial.polynomial.polyfit
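If you are after an uncertainty on each individual coefficient rather than a single residual figure, one common approach (my suggestion, not part of the answer above) is to fit with np.polyfit passing cov=True, which also returns the covariance matrix of the coefficients; the square roots of its diagonal give one-sigma errors. Note that np.polyfit orders coefficients from the highest degree down, the reverse of numpy.polynomial.polynomial.polyfit. A minimal sketch using the "a" data from the question:
import numpy as np
f = [0, 5, 16, 18.5, 30, 50]
a = [1.41, 1.43, 0.72, 0.78, 0.8, 0.86]
# cov=True also returns the covariance matrix of the fitted coefficients
coefs_a_rev, cov_a = np.polyfit(f, a, 3, cov=True)  # highest degree first
errs_a = np.sqrt(np.diag(cov_a))                    # one-sigma error per coefficient
for c, e in zip(coefs_a_rev, errs_a):
    print(f"{c:+.4e} +/- {e:.1e}")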

Related

Are optimisation builtin functions of Matlab better than Python?

Hi all. I encountered a case where Matlab's minimisation results are very close to the mathematical solution (i.e., what we get when solving the equations by hand), whereas the results obtained from Python's scipy.optimize.minimize built-in function are not. I'm not sure where I'm going wrong or how to improve the results in Python. Any suggestion would be of great help.
The aim of this problem is to find the time period of a set of nonlinear differential equations without time-evolving them. As a test case I took the problem from "This place".
The nonlinear differential equations look like this:
Here I'm implementing a pseudospectral method for periodic systems. The implementation is similar to what is described here; the only change is that I'm taking uniform points and the "D" matrix is formed pseudospectrally.
import numpy as np
from numpy import linalg as LA
import ast
from ast import literal_eval as make_tuple
import scipy
from scipy.optimize import minimize
from scipy.linalg import toeplitz
import matplotlib.pyplot as plt
%matplotlib tk
# This "Dmatrix" is used to get derivative.
def Dmatrix(N):
    h = 2.0*np.pi/N;
    col = np.zeros(N);
    col[1:] = 0.5*(-1.0)**np.arange(1,N)/np.sin(np.arange(1,N)*h/2.0);
    row = np.zeros(N); row[0] = col[0]; row[1:] = col[N-1:0:-1]
    D = toeplitz(col,row);
    return D
# Actual differential equations.
def dxD(x,y,t):
    u=(1-(x**2)/4 - (y**2));
    dx=-4*y+x*u;
    dy=x+y*u;
    return np.array([dx,dy])
# Implementing the pseudospectral method
def dxFdxD(initial_guess,final_time):
    N=len(initial_guess)//2;
    x_guess=initial_guess[:N];
    xl=np.array(x_guess[:]);
    y_guess=initial_guess[N:2*N];
    yl=np.array(y_guess[:]);
    tf=final_time;
    tl=np.arange(1,N+1)*tf/N;
    D=Dmatrix(N);
    XYTzipped=zip(xl,yl,tl);
    dX_D=np.array([dxD(xs,ys,ts) for xs,ys,ts in XYTzipped]);  # dynamics evaluated at the collocation points
    xlyl=np.array([xl,yl]).transpose();
    dX_F=(np.array(D@xlyl))*(2*np.pi/tf);  # spectral derivative via the differentiation matrix
    err=np.array(dX_D - dX_F).flatten();
    normError= LA.norm(err, 2);
    return normError
# Initial guess points
N=201;
final_time=1.052*np.pi;
tf=final_time;
tgrid=np.arange(1,N+1)*tf/N;
xguess=np.cos(tgrid)*2.0;
yguess=-np.cos(tgrid)*0.5;
tfl=np.pi*0.85;
tfu=1.5*np.pi;
tfbounds=(tfl,tfu);
xstates= np.array([xguess,yguess]).flatten();
xstatesParameter=np.array([xstates,final_time], dtype=object);
xins=np.hstack(xstatesParameter).tolist();
# Objective function for optimising
def obj(x):
    N=(len(x)-1)//2;
    tf=x[-1];
    xylist=x[:2*N];
    return dxFdxD(xylist,tf)
# Optimization using method='trust-constr'
l1=[tfbounds];
str1=str([bounds123 for bounds123 in l1]);
str2=str1.replace("[", "").replace("]", "")
bounds1=make_tuple("("+ "(-5,5),(-5,5),"*N + str2+ ")")
bnds=bounds1;
# constraint
def xyradius(x):
    nps=(len(x)-1)//2;
    xs=x[:nps];
    ys=x[nps:2*nps];
    xsysZip=zip(xs,ys)
    truelist=[bool((xi**2)+(yi**2)>0.25) for xi,yi in xsysZip]
    result=int(all(truelist))
    return result
xyradiusConstraintType={'type':'ineq','fun':xyradius};
cons=[xyradiusConstraintType]
# Minimising "obj"
sol=minimize(obj,
xins,
method='trust-constr',
bounds=bnds,
tol=1e-10)
# Results
x_y_tf=sol.x;
x_F=x_y_tf[:N];
y_F=x_y_tf[N:2*N];
tf_system=x_y_tf[-1];
print("time period tf=",tf_system,end="\n \n")
tgrid=np.arange(1,N+1)*tf/N;
# Plots
fig = plt.figure(1)
ax = fig.add_subplot(111)
#specify label for the corresponding curve
# ax.set_xticks(tgrid, minor=False)
ax.set_xticks(tgrid, minor=True)
ax.xaxis.grid(True, which='major')
ax.xaxis.grid(True, which='minor')
ax.set_title('Collocation points')
plt.plot(tgrid,x_F,label='x result')
plt.plot(tgrid,y_F,label='y result')
ax.set_title('Optimized result x,y')
plt.legend()
plt.show()
# Parametric plot
ax = plt.figure(4).add_subplot()
ax.plot(x_F,y_F,label='State Space')
ax.legend()
plt.show()
Optimizing (minimizing) using method='SLSQP'
# Scipy for minimization using method='SLSQP'
l1=[tfbounds];
str1=str([bounds123 for bounds123 in l1]);
str2=str1.replace("[", "").replace("]", "")
bounds1=make_tuple("("+ "(-5,5),(-5,5),"*N + str2+ ")")
bnds=bounds1;
def xyradius(x):
    nps=(len(x)-1)//2;
    xs=x[:nps];
    ys=x[nps:2*nps];
    xsysZip=zip(xs,ys)
    truelist=[bool((xi**2)+(yi**2)>0.25) for xi,yi in xsysZip]
    result=int(all(truelist))
    return result
xyradiusConstraintType={'type':'ineq','fun':xyradius};
cons=[xyradiusConstraintType]
sol=minimize(obj,
xins,
method='SLSQP',
bounds=bnds,
constraints=cons,
tol=1e-10)
When I implemented the same approach in Matlab, I got "pi = 3.14" as the solution (the time period of the system), whereas in Python I'm getting "4.70" as the time period. Any suggestions are greatly appreciated. Thank you.

How to find the optimized or correct peaks

I have the following graph.
I am using Python's scipy.signal.find_peaks to find the peaks, but I am not sure how to do that. I did the following:
per = np.percentile(x,[70])
peaks_control = findPeaks(x, per[0])
where x is the signal
array([1.07541259e+09, 1.13851049e+09, 1.19241492e+09, 1.23706527e+09,
1.27240093e+09, 1.29836131e+09, 1.31217483e+09, 1.32037296e+09,
1.31908858e+09, 1.30896503e+09, 1.29216550e+09, 1.26958042e+09,
1.24561632e+09, 1.21202121e+09, 1.16869371e+09, 1.11054499e+09,
1.04006154e+09, 9.65663403e+08, 8.87706760e+08, 8.09340093e+08,
7.37568765e+08, 6.79736364e+08, 6.38576457e+08, 6.06062937e+08,
5.80650350e+08, 5.55089744e+08, 5.36334499e+08, 5.20236597e+08,
5.06529837e+08, 4.91825175e+08, 4.77937063e+08, 4.65475058e+08,
4.56520513e+08, 4.48393240e+08, 4.41944988e+08, 4.34822844e+08,
4.33688578e+08, 4.33451049e+08, 4.36256177e+08, 4.33553613e+08,
4.29191142e+08, 4.28492541e+08, 4.24465967e+08, 4.20074825e+08,
4.19935897e+08, 4.16652681e+08, 4.12419580e+08, 4.11747552e+08,
4.08801166e+08, 4.02351981e+08, 3.99620513e+08, 3.98716550e+08,
3.46023077e+08, 3.53969464e+08, 4.17131235e+08, 5.19363869e+08,
6.50956410e+08, 8.01530303e+08, 9.50162937e+08, 1.08249790e+09,
1.18242378e+09, 1.22732168e+09, 1.20123077e+09, 1.21067599e+09,
1.21556410e+09, 1.21272261e+09, 1.20310023e+09, 1.18692774e+09,
1.16694033e+09, 1.14330117e+09, 1.11635338e+09, 1.07947529e+09,
1.03222145e+09, 9.73427972e+08, 9.08558974e+08, 8.39966200e+08,
7.70457343e+08, 7.04976224e+08, 6.49436131e+08, 6.02085548e+08,
5.68915385e+08, 5.41638928e+08, 5.18758741e+08, 5.01973660e+08,
4.88766667e+08, 4.77643823e+08, 4.65681818e+08, 4.56193240e+08,
4.46851515e+08, 4.36135198e+08, 4.32282984e+08, 4.27913520e+08,
4.23408625e+08, 4.24119580e+08, 4.22399068e+08, 4.22415385e+08,
4.20193939e+08, 4.17638462e+08, 4.14822378e+08, 4.10636364e+08,
4.08388345e+08, 4.04844522e+08, 4.00571562e+08, 4.00841026e+08,
4.00764802e+08, 4.00432867e+08, 4.00336364e+08, 4.00724709e+08,
4.03048019e+08, 3.57437995e+08, 3.62371096e+08, 4.16658741e+08,
5.10148019e+08, 6.31750117e+08, 7.65175991e+08, 8.96832168e+08,
1.01666597e+09, 1.10373263e+09, 1.14380816e+09, 1.11629790e+09,
1.12228904e+09, 1.12378788e+09, 1.11974825e+09, 1.10812774e+09,
1.09125035e+09, 1.07033566e+09, 1.04667389e+09, 1.02016830e+09,
9.86036830e+08, 9.42176457e+08, 8.88900233e+08, 8.27962005e+08,
7.64362238e+08, 7.00755245e+08, 6.42390909e+08, 5.92395338e+08,
5.52426107e+08, 5.26319114e+08, 5.03317249e+08, 4.85524942e+08,
4.70421911e+08, 4.59389510e+08, 4.51644988e+08, 4.46288578e+08,
4.41076923e+08, 4.37533566e+08, 4.31993007e+08, 4.28625641e+08,
4.25406294e+08, 4.21161538e+08, 4.19049650e+08, 4.16719347e+08,
4.13124242e+08, 4.08404429e+08, 4.06154545e+08, 4.03386014e+08,
4.00980420e+08, 3.99442657e+08, 3.97792075e+08, 3.95606527e+08,
3.97922378e+08, 3.98345221e+08, 3.96253613e+08, 3.95703030e+08,
3.96108392e+08, 3.67136830e+08, 3.58382051e+08, 3.95844289e+08,
4.70853846e+08, 5.76629837e+08, 6.97682284e+08, 8.21169930e+08,
9.32588112e+08, 1.01885804e+09, 1.06315152e+09, 1.05128159e+09,
1.03944545e+09, 1.03769580e+09, 1.03132145e+09, 1.02008601e+09,
1.00327389e+09, 9.85387646e+08, 9.66403030e+08, 9.44620746e+08,
9.18596737e+08, 8.82269697e+08, 8.37750816e+08, 7.84877156e+08,
7.27590443e+08, 6.70183217e+08, 6.14567832e+08, 5.67404895e+08,
5.30862471e+08, 5.03108625e+08, 4.84348718e+08, 4.68116550e+08,
4.55809907e+08, 4.46616783e+08, 4.39725175e+08, 4.34323077e+08])
The peaks I get are adjacent to each other, as there is a little bump at the second, third and fourth peak sites.
How should I calculate the peaks and ignore such adjacent ones? To calculate the width, prominence, etc., I first need to find the peaks. If I knew them already I might be able to set some threshold.
As you asked in the comments, I'll provide an example. Please note this is only an example; exploratory data analysis is always needed to choose the best way to reach your goal.
So, let's create some noisy data
import numpy as np
from scipy.signal import find_peaks, periodogram
import matplotlib.pyplot as plt
size = 100
a = np.linspace(1, .5, size)
x = np.linspace(0, 50, size)
np.random.seed(0)
y = a * np.sin(x) + np.random.normal(0, .1, size) + 5
now, let's try to find the peaks with find_peaks from scipy.signal
peaks = find_peaks(y)[0]
plt.plot(x, y)
plt.plot(x[peaks], y[peaks], marker='o', ls='none')
plt.show()
As you can see, there are some "wrong" peaks. We need to set the distance argument in find_peaks (see the documentation).
Let us suppose that we don't know the distance between the peaks. In this case, we can see that the data are periodic, so we can find the period with a periodogram and use the period as the distance in find_peaks:
_f, _p = periodogram(y, nfft=2**6)
# calculate the sample rate of x
sample_rate = 1 / np.median(np.diff(x))
periods = 1 / _f[1:] / sample_rate
density = _p[1:] / _p[1:].max()
max_density_idx = density.argmax()
period = periods[max_density_idx]
plt.semilogx(periods, density)
plt.scatter(period, density[max_density_idx], color='r')
plt.title(f"period {period:.2f}")
plt.show()
Now we can use the period as distance argument in find_peaks
peaks = find_peaks(y, distance=period)[0]
plt.plot(x, y)
plt.plot(x[peaks], y[peaks], marker='o', ls='none')
plt.show()
update
In your specific case, it's a little bit different.
Define the signal (I'll call variables X and Y)
Y = np.array([1.07541259e+09, 1.13851049e+09, 1.19241492e+09, 1.23706527e+09, 1.27240093e+09, 1.29836131e+09, 1.31217483e+09, 1.32037296e+09, 1.31908858e+09, 1.30896503e+09, 1.29216550e+09, 1.26958042e+09, 1.24561632e+09, 1.21202121e+09, 1.16869371e+09, 1.11054499e+09, 1.04006154e+09, 9.65663403e+08, 8.87706760e+08, 8.09340093e+08, 7.37568765e+08, 6.79736364e+08, 6.38576457e+08, 6.06062937e+08, 5.80650350e+08, 5.55089744e+08, 5.36334499e+08, 5.20236597e+08, 5.06529837e+08, 4.91825175e+08, 4.77937063e+08, 4.65475058e+08, 4.56520513e+08, 4.48393240e+08, 4.41944988e+08, 4.34822844e+08, 4.33688578e+08, 4.33451049e+08, 4.36256177e+08, 4.33553613e+08, 4.29191142e+08, 4.28492541e+08, 4.24465967e+08, 4.20074825e+08, 4.19935897e+08, 4.16652681e+08, 4.12419580e+08, 4.11747552e+08, 4.08801166e+08, 4.02351981e+08, 3.99620513e+08, 3.98716550e+08, 3.46023077e+08, 3.53969464e+08, 4.17131235e+08, 5.19363869e+08, 6.50956410e+08, 8.01530303e+08, 9.50162937e+08, 1.08249790e+09, 1.18242378e+09, 1.22732168e+09, 1.20123077e+09, 1.21067599e+09, 1.21556410e+09, 1.21272261e+09, 1.20310023e+09, 1.18692774e+09, 1.16694033e+09, 1.14330117e+09, 1.11635338e+09, 1.07947529e+09, 1.03222145e+09, 9.73427972e+08, 9.08558974e+08, 8.39966200e+08, 7.70457343e+08, 7.04976224e+08, 6.49436131e+08, 6.02085548e+08, 5.68915385e+08, 5.41638928e+08, 5.18758741e+08, 5.01973660e+08, 4.88766667e+08, 4.77643823e+08, 4.65681818e+08, 4.56193240e+08, 4.46851515e+08, 4.36135198e+08, 4.32282984e+08, 4.27913520e+08, 4.23408625e+08, 4.24119580e+08, 4.22399068e+08, 4.22415385e+08, 4.20193939e+08, 4.17638462e+08, 4.14822378e+08, 4.10636364e+08, 4.08388345e+08, 4.04844522e+08, 4.00571562e+08, 4.00841026e+08, 4.00764802e+08, 4.00432867e+08, 4.00336364e+08, 4.00724709e+08, 4.03048019e+08, 3.57437995e+08, 3.62371096e+08, 4.16658741e+08, 5.10148019e+08, 6.31750117e+08, 7.65175991e+08, 8.96832168e+08, 1.01666597e+09, 1.10373263e+09, 1.14380816e+09, 1.11629790e+09, 1.12228904e+09, 1.12378788e+09, 1.11974825e+09, 1.10812774e+09, 1.09125035e+09, 1.07033566e+09, 1.04667389e+09, 1.02016830e+09, 9.86036830e+08, 9.42176457e+08, 8.88900233e+08, 8.27962005e+08, 7.64362238e+08, 7.00755245e+08, 6.42390909e+08, 5.92395338e+08, 5.52426107e+08, 5.26319114e+08, 5.03317249e+08, 4.85524942e+08, 4.70421911e+08, 4.59389510e+08, 4.51644988e+08, 4.46288578e+08, 4.41076923e+08, 4.37533566e+08, 4.31993007e+08, 4.28625641e+08, 4.25406294e+08, 4.21161538e+08, 4.19049650e+08, 4.16719347e+08, 4.13124242e+08, 4.08404429e+08, 4.06154545e+08, 4.03386014e+08, 4.00980420e+08, 3.99442657e+08, 3.97792075e+08, 3.95606527e+08, 3.97922378e+08, 3.98345221e+08, 3.96253613e+08, 3.95703030e+08, 3.96108392e+08, 3.67136830e+08, 3.58382051e+08, 3.95844289e+08, 4.70853846e+08, 5.76629837e+08, 6.97682284e+08, 8.21169930e+08, 9.32588112e+08, 1.01885804e+09, 1.06315152e+09, 1.05128159e+09, 1.03944545e+09, 1.03769580e+09, 1.03132145e+09, 1.02008601e+09, 1.00327389e+09, 9.85387646e+08, 9.66403030e+08, 9.44620746e+08, 9.18596737e+08, 8.82269697e+08, 8.37750816e+08, 7.84877156e+08, 7.27590443e+08, 6.70183217e+08, 6.14567832e+08, 5.67404895e+08, 5.30862471e+08, 5.03108625e+08, 4.84348718e+08, 4.68116550e+08, 4.55809907e+08, 4.46616783e+08, 4.39725175e+08, 4.34323077e+08])
X = np.arange(Y.size)
Since Y.size is 200 and in your plot there are 200 secs, I assume that the sample rate is 1 sec.
If we search for the peaks with default distance we find a lot of unwanted peaks
peaks = find_peaks(Y)[0]
plt.plot(X, Y)
plt.plot(X[peaks], Y[peaks], marker='o', ls='none')
plt.show()
Let's do a periodogram
_f, _p = periodogram(Y, nfft=2**12)
# the sample rate of your signal
sample_rate = 1
periods = 1 / _f[1:] / sample_rate
density = _p[1:] / _p[1:].max()
max_density_idx = density.argmax()
period = periods[max_density_idx]
p_peaks_idx = find_peaks(density)[0]
plt.semilogx(periods, density)
plt.scatter(period, density[max_density_idx], color='r')
period_peaks = []
for p_peak in p_peaks_idx:
    if density[p_peak] < .1:
        continue
    period_peaks.append(periods[p_peak])
    plt.scatter(periods[p_peak], density[p_peak])
    plt.text(periods[p_peak], density[p_peak], f"{periods[p_peak]:.1f} ", ha='right', va='center')
plt.title('periodogram')
plt.show()
We found two main periods
period_peaks
[56.888888888888886, 28.444444444444443]
If we use the higher density period (56.9, the fundamental, or 1st harmonic) we miss a peak
peaks = find_peaks(Y, distance=period_peaks[0])[0]
plt.plot(X, Y)
plt.plot(X[peaks], Y[peaks], marker='o', ls='none')
plt.show()
This could be because
you have too few observations
the periodicity is not constant
If we empirically subtract a quantity (say 10) from the period, we find all the peaks
peaks = find_peaks(Y, distance=period_peaks[0] - 10)[0]
plt.plot(X, Y)
plt.plot(X[peaks], Y[peaks], marker='o', ls='none')
plt.show()
So we got peaks at
X[peaks]
array([ 7, 61, 118, 174])
taking the difference, we see they're not regular (at this sample rate and with these few observations)
np.diff(X[peaks])
array([54, 57, 56])
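Since the question also mentions width and prominence: once the peak indices are available, scipy.signal.peak_prominences and scipy.signal.peak_widths can be applied to them. A minimal sketch (my addition, not part of the answer above), reusing Y and peaks from the code above:
from scipy.signal import peak_prominences, peak_widths
prominences = peak_prominences(Y, peaks)[0]          # prominence of each detected peak
widths = peak_widths(Y, peaks, rel_height=0.5)[0]    # width at half prominence, in samples
print(prominences)
print(widths)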

Using SciPy curve_fit to predict post final score

I have a post, and I need to predict the final score as closely as I can.
Apparently using curve_fit should do the trick, although I don't really understand how I should use it.
I have two known values, that I collect 2 minutes after the post is posted.
Those are the comment count, referred as n_comments, and the vote count, referred as n_votes.
After an hour, I check the post again, and get the final_score (sum of all votes) value, which is what I want to predict.
I've looked at different examples online, but they all use multiple data points (I have just 2). Also, my initial data point contains more information (n_votes and n_comments), as I've found that without one of them you cannot accurately predict the score.
To use curve_fit you need a function. Mine looks like this:
def func(datapoint,k,t,s):
    return ((datapoint[0]*k+datapoint[1]*t)*60*datapoint[2])*s
And a sample datapoint looks like this:
[n_votes, n_comments, hour]
This is the broken mess of my attempt, and the result doesn't look right at all.
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
initial_votes_list = [3, 1, 2, 1, 0]
initial_comment_list = [0, 3, 0, 1, 64]
final_score_list = [26,12,13,14,229]
# Those lists contain data about multiple posts; I want to predict one at a time, passing the parameters to the next.
def func(x,k,t,s):
    return ((x[0]*k+x[1]*t)*60*x[2])*s
x = np.array([3, 0, 1])
y = np.array([26 ,0 ,2])
#X = [[a,b,c] for a,b,c in zip(i_votes_list,i_comment_list,[i for i in range(len(i_votes_list))])]
popt, pcov = curve_fit(func, x, y)
plt.plot(x, [ 1 , func(x, *popt), 2], 'g--',
label='fit: a=%5.3f, b=%5.3f, c=%5.3f' % tuple(popt))
plt.xlabel('Time')
plt.ylabel('Score')
plt.legend()
plt.show()
The plot should display the initial/final score and the current prediction.
I have some doubts regarding the function too. Initially this is what it looked like:
(votes_per_minute + n_comments) * 60 * hour
But I replaced votes_per_minute with just votes. Considering that I collect this data after 2 minutes, and that I have a parameter there, I'd say it's not too bad, but I don't really know.
Again, who guarantees that this is the best function possible? It would be nice to have the function discovered automatically, but I think this is ML territory...
EDIT:
Regarding the measurements: I can get as many as I want (every 15-30-60 s), although they have to be collected while the post is <= 3 minutes old.
Disclaimer: This is just a suggestion on how you may approach this problem. There might be better alternatives.
I think it might be helpful to take into account the relationship between the elapsed time since posting and the final score. The following curve from [OC] Upvotes over time for a reddit post models the behavior of the final score, or total upvote count, over time:
The curve obviously relies on the fact that once a post is online, you expect a somewhat linearly ascending upvote count that slowly converges/stabilizes around a maximum (and from there you have a gentle/flat slope).
Moreover, we know that the number of votes/comments usually ascends as a function of time. The relationship between these elements can be considered a series; I chose to model it as a geometric progression (you can consider an arithmetic one if you find it fits better). Also, keep in mind that you are counting some elements twice: some users commented and upvoted, so you counted them twice, and some can comment multiple times but upvote only once. I chose to assume that only 70% (in code p = 0.7) of the commenters are unique users and that users who both commented and upvoted represent 60% (in code e = 1 - 0.6 = 0.4) of the total number of users (commenters and upvoters). The result of these assumptions:
So we have two equations to model the score; you can combine them and take their average. In code this would look like this:
import warnings
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from mpl_toolkits.mplot3d import axes3d
# filter warnings
warnings.filterwarnings("ignore")
class Cfit:
    def __init__(self, votes, comments, scores, fit_size):
        self.votes = votes
        self.comments = comments
        self.scores = scores
        self.time = 60 # prediction time
        self.fit_size = fit_size
        self.popt = []
    def func(self, x, a, d, q):
        e = 0.4
        b = 1
        p = 0.7
        return (a * np.exp( 1-(b / self.time**d )) + q**self.time * e * (x + p*self.comments[:len(x)]) ) /2
    def fit_then_predict(self):
        popt, pcov = curve_fit(self.func, self.votes[:self.fit_size], self.scores[:self.fit_size])
        return popt, pcov
# init
init_votes = np.array([3, 1, 2, 1, 0])
init_comments = np.array([0, 3, 0, 1, 64])
final_scores = np.array([26, 12, 13, 14, 229])
# fit and predict
cfit = Cfit(init_votes, init_comments, final_scores, 15)
popt, pcov = cfit.fit_then_predict()
# plot expectations
fig = plt.figure(figsize = (15,15))
ax1 = fig.add_subplot(2,3,(1,3), projection='3d')
ax1.scatter(init_votes, init_comments, final_scores, 'go', label='expected')
ax1.scatter(init_votes, init_comments, cfit.func(init_votes, *popt), 'ro', label = 'predicted')
# axis
ax1.set_xlabel('init votes count')
ax1.set_ylabel('init comments count')
ax1.set_zlabel('final score')
ax1.set_title('final score = f(init votes count, init comments count)')
plt.legend()
# evaluation: diff = expected - prediction
diff = abs(final_scores - cfit.func(init_votes, *popt))
ax2 = fig.add_subplot(2,3,4)
ax2.plot(init_votes, diff, 'ro', label='fit: a=%5.3f, d=%5.3f, q=%5.3f' % tuple(popt))
ax2.grid('on')
ax2.set_xlabel('init votes count')
ax2.set_ylabel('|expected-predicted|')
ax2.set_title('|expected-predicted| = f(init votes count)')
# plot expected and predictions as f(init-votes)
ax3 = fig.add_subplot(2,3,5)
ax3.plot(init_votes, final_scores, 'gx', label='fit: a=%5.3f, d=%5.3f, q=%5.3f' % tuple(popt))
ax3.plot(init_votes, cfit.func(init_votes, *popt), 'rx', label='fit: a=%5.3f, d=%5.3f, q=%5.3f' % tuple(popt))
ax3.set_xlabel('init votes count')
ax3.set_ylabel('final score')
ax3.set_title('final score = f(init votes count)')
ax3.grid('on')
# plot expected and predictions as f(init-comments)
ax4 = fig.add_subplot(2,3,6)
ax4.plot(init_votes, final_scores, 'gx', label='fit: a=%5.3f, d=%5.3f, q=%5.3f' % tuple(popt))
ax4.plot(init_votes, cfit.func(init_votes, *popt), 'rx', label='fit: a=%5.3f, d=%5.3f, q=%5.3f' % tuple(popt))
ax4.set_xlabel('init comments count')
ax4.set_ylabel('final score')
ax4.set_title('final score = f(init comments count)')
ax4.grid('on')
plt.show()
The output of the previous code is the following:
Well, obviously the provided data set is too small to evaluate any approach, so it is up to you to test this further.
The main idea here is that you assume your data follow a certain function/behavior (described in func), but you give it certain degrees of freedom (your parameters a, d, q); using curve_fit you try to approximate the best combination of these variables that fits your input data to your output data. Once you have the parameters returned by curve_fit (popt in the code), you just run your function with those parameters, for example like this (add this section at the end of the previous code):
# a function similar to func to predict scores for given values
def score(votes_count, comments_count, popt):
    e, b, p = 0.4, 1, 0.7
    a, d, q = popt[0], popt[1], popt[2]
    t = 60
    return (a * np.exp( 1-(b / t**d )) + q**t * e * (votes_count + p*comments_count )) /2
print("score for init-votes = 2 & init-comments = 0 is ", score(2, 0, popt))
Output:
score for init-votes = 2 & init-comments = 0 is 14.000150386210994
You can see that this output is close to the correct value (13), and hopefully with more data you can get better/more accurate approximations of your parameters and consequently better "predictions".
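As a side note (my addition, not part of the answer above): curve_fit also returns pcov, the covariance matrix of the fitted parameters, which fit_then_predict already passes back but the code never uses. The square roots of its diagonal give a rough one-sigma uncertainty per parameter, for example:
# rough one-sigma uncertainty for each fitted parameter (a, d, q)
perr = np.sqrt(np.diag(pcov))
print("parameter uncertainties (a, d, q):", perr)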

Why doesn't "beta.fit" come out right?

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
observed = [0.294, 0.2955, 0.235, 0.2536, 0.2423, 0.2844, 0.2099, 0.2355, 0.2946, 0.3388, 0.2202, 0.2523, 0.2209, 0.2707, 0.1885, 0.2414, 0.2846, 0.328, 0.2265, 0.2563, 0.2345, 0.2845, 0.1787, 0.2392, 0.2777, 0.3076, 0.2108, 0.2477, 0.234, 0.2696, 0.1839, 0.2344, 0.2872, 0.3224, 0.2152, 0.2593, 0.2295, 0.2702, 0.1876, 0.2331, 0.2809, 0.3316, 0.2099, 0.2814, 0.2174, 0.2516, 0.2029, 0.2282, 0.2697, 0.3424, 0.2259, 0.2626, 0.2187, 0.2502, 0.2161, 0.2194, 0.2628, 0.3296, 0.2323, 0.2557, 0.2215, 0.2383, 0.2166, 0.2315, 0.2757, 0.3163, 0.2311, 0.2479, 0.2199, 0.2418, 0.1938, 0.2394, 0.2718, 0.3297, 0.2346, 0.2523, 0.2262, 0.2481, 0.2118, 0.241, 0.271, 0.3525, 0.2323, 0.2513, 0.2313, 0.2476, 0.232, 0.2295, 0.2645, 0.3386, 0.2334, 0.2631, 0.226, 0.2603, 0.2334, 0.2375, 0.2744, 0.3491, 0.2052, 0.2473, 0.228, 0.2448, 0.2189, 0.2149]
a, b, loc, scale = stats.beta.fit(observed,floc=0,fscale=1)
ax = plt.subplot(111)
ax.hist(observed, alpha=0.75, color='green', bins=104, density=True)
ax.plot(np.linspace(0, 1, 100), stats.beta.pdf(np.linspace(0, 1, 100), a, b))
plt.show()
The α and β are out of whack (α = 6.056697373013153, β = 409078.57804704335).
The fitted plot is also unreasonable: the histogram and the beta distribution differ in height on the y-axis.
The average of the data is about 0.25, but the expected value calculated from the beta distribution is 6.05/(6.05+409078.57) = 1.47891162469e-05. This seems counterintuitive.
I think you are mixing up the code a bit with whatever your observation is.
The main point to consider is that your beta fit will have both a and b, as well as loc and scale.
If you perform your fit using fixed loc/scale, i.e. scipy.stats.beta.fit(observed, floc=0, fscale=1), then your fitted a and b are: a = 33.26401059422594 and b = 99.0180817184922.
On the other hand, if you perform your fit with variable loc and scale, i.e. scipy.stats.beta.fit(observed), then you must call scipy.stats.beta.pdf() including those as parameters as well; with your data they are a = 6.056697380819225, b = 409078.5780469263, loc = 0.15710752697400227, scale = 6373.831662619217.
According to its documentation, the probability density above is defined in the “standardized” form. To shift and/or scale the distribution use the loc and scale parameters. Specifically, beta.pdf(x, a, b, loc, scale) is identically equivalent to beta.pdf(y, a, b) / scale with y = (x - loc) / scale.
Hence, the theoretical mean/average should be modified accordingly to include the scaling and location transformations.
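For example (a minimal sketch of that correction using the numbers quoted above; my addition, not part of the original answer), the mean of the shifted/scaled distribution is loc + scale * a / (a + b), which scipy can also evaluate directly:
from scipy import stats
a, b = 6.056697380819225, 409078.5780469263
loc, scale = 0.15710752697400227, 6373.831662619217
mean_manual = loc + scale * a / (a + b)                   # shift/scale applied to the standardized mean a/(a+b)
mean_scipy = stats.beta.mean(a, b, loc=loc, scale=scale)
print(mean_manual, mean_scipy)                            # both come out around 0.25, matching the data average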

power-law curve fitting scipy, numpy not working

I ran into a problem fitting a power-law curve to my data. I have two data sets: bins1 and bins2.
bins1 behaves fine in curve fitting using numpy.linalg.lstsq (I then use np.exp(coefs[0])*x**coefs[1] to get the power-law equation).
On the other hand, bins2 is acting weird and shows a bad R-squared.
Both data sets give different equations than what Excel shows me (and worse R-squared).
Here is the code (and data):
import numpy as np
import matplotlib.pyplot as plt
bins1 = np.array([[6.769318871738219667e-03,
1.306418618130891773e-02,
1.912138120913448383e-02,
2.545189874466026111e-02,
3.214689891729670401e-02,
4.101898933375244805e-02,
5.129862592803200588e-02,
6.636505322669797313e-02,
8.409809827572585494e-02,
1.058164348650862258e-01,
1.375849753230810046e-01,
1.830664031837437311e-01,
2.682454535427478137e-01,
3.912508246490400410e-01,
5.893271848997768680e-01,
8.480213305038615257e-01,
2.408136266017391058e+00,
3.629192766488219313e+00,
4.639246557509275171e+00,
9.901792214343277720e+00],
[8.501658465758301112e-04,
1.562697718429977012e-03,
1.902062808421856087e-04,
4.411817741488644959e-03,
3.409236963162485048e-03,
1.686099657013027898e-03,
3.643231240239608402e-03,
2.544120616413291154e-04,
2.549036204611017029e-02,
3.527340723977697573e-02,
5.038482027310990652e-02,
5.617932487522721979e-02,
1.620407270423956103e-01,
1.906538999080910068e-01,
3.180688368126549093e-01,
2.364903188268162038e-01,
3.267322385964683273e-01,
9.384571074801122403e-01,
4.419747716107813029e-01,
9.254710022316929852e+00]]).T
bins2 = np.array([[6.522512685133712192e-03,
1.300415548684437199e-02,
1.888928895701269539e-02,
2.509905819337970856e-02,
3.239654633369139919e-02,
4.130706234846069635e-02,
5.123820846515786398e-02,
6.444380072984744190e-02,
8.235238352205621892e-02,
1.070907072127811749e-01,
1.403438221033725120e-01,
1.863115065963684147e-01,
2.670209758710758163e-01,
4.003337413814173074e-01,
6.549054078382223754e-01,
1.116611087124244062e+00,
2.438604844718367914e+00,
3.480674117919704269e+00,
4.410201659398489404e+00,
6.401903059926267403e+00],
[1.793454543936148608e-03,
2.441092334386309615e-03,
2.754373929745804715e-03,
1.182752729942167062e-03,
1.357797177773524414e-03,
6.711673916715021199e-03,
1.392761674092503343e-02,
1.127957613093066511e-02,
7.928803089359596004e-03,
2.524609593305639915e-02,
5.698702885370290905e-02,
8.607729156137132465e-02,
2.453761830112021203e-01,
9.734443815196883176e-02,
1.487480479168299119e-01,
9.918002699934079791e-01,
1.121298151253063535e+00,
1.389239135742518227e+00,
4.254082922056571237e-01,
2.643453492951096440e+00]]).T
bins = bins1 #change to bins2 to see results for bins2
def fit(x,a,m): # power-law fit (based on previous studies)
    return a*(x**m)
coefs= np.linalg.lstsq(np.vstack([np.ones(len(bins[:,0])), np.log(bins[:,0]), bins[:,0]]).T, np.log(bins[:,1]))[0] # calculating fitting coefficients (a,m)
y_predict = fit(bins[:,0],np.exp(coefs[0]),coefs[1]) # prediction based of fitted model
model_plot = plt.loglog(bins[:,0],bins[:,1],'o',label="error")
fit_line = plt.plot(bins[:,0],y_predict,'r', label="fit")
plt.ylabel('Y (bins[:,1])')
plt.xlabel('X (bins[:,0])')
plt.title('model')
plt.legend(loc='best')
plt.show()
def R_sqr(y,y_predict): # calculating R squared value to measure fitting accuracy
    rsdl = y - y_predict
    ss_res = np.sum(rsdl**2)
    ss_tot = np.sum((y-np.mean(y))**2)
    R2 = 1-(ss_res/ss_tot)
    R2 = np.around(R2,decimals=4)
    return R2
R2= R_sqr(bins[:,1],y_predict)
print ('(R^2 = %s)' % (R2))
The fit formula for bins1[[x],[y]]: Python: y = 0.337*(x)^1.223 (R^2 = 0.7773), Excel: y = 0.289*(x)^1.174 (R^2 = 0.8548)
The fit formula for bins2[[x],[y]]: Python: y = 0.509*(x)^1.332 (R^2 = -1.753), Excel: y = 0.311*(x)^1.174 (R^2 = 0.9116)
These are two sample data sets out of 30; I see this fitting problem at random in my data, and some sets have an R-squared around "-150"!
I tried scipy's "curve_fit" but I didn't get better results, in fact worse!
Does anyone know how to get an Excel-like fit in Python?
You are trying to calculate an R-squared using Y's that have not been converted to log-space. The following change gives reasonable R-squared values:
R2 = R_sqr(np.log(bins[:,1]), np.log(y_predict))
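Since the regression itself is done on log-transformed values, the R-squared is most meaningful on the same scale; Excel's power trendline is, to my understanding, also a straight-line fit to log(y) versus log(x). A quick cross-check (a sketch, my addition, not part of the original answer) using scipy.stats.linregress on the logged data:
from scipy.stats import linregress
# straight-line fit in log-log space: log(y) = log(a) + m*log(x)
res = linregress(np.log(bins[:,0]), np.log(bins[:,1]))
a_fit, m_fit = np.exp(res.intercept), res.slope
print("y = %.3f * x^%.3f, R^2 = %.4f" % (a_fit, m_fit, res.rvalue**2))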
