I am trying to find a Python package that can fit natural smoothing splines with a user-selectable smoothing factor. Is there an implementation for that? If not, how would you use what is available to implement it yourself?
By natural spline I mean that the second derivative of the fitted function should be zero at the endpoints (i.e. the spline is linear there).
By smoothing spline I mean that the spline should not be 'interpolating' (passing through all the data points). I would like to choose the smoothing factor lambda (see the Wikipedia page on smoothing splines) myself.
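For reference, the criterion I have in mind (following the Wikipedia definition) is to find the function $\hat f$ minimizing

$$\sum_{i=1}^{n}\bigl(y_i - \hat f(x_i)\bigr)^2 \;+\; \lambda \int \hat f''(x)^2\,dx,$$

where a larger $\lambda$ gives a smoother (in the limit, linear) fit.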
What I have found
scipy.interpolate.CubicSpline [link]: Does natural (cubic) spline fitting. Does interpolation, and there is no way to smooth the data (a quick check of the natural boundary condition is sketched right after this list).
scipy.interpolate.UnivariateSpline [link]: Does spline fitting with a user-selectable smoothing factor. However, there is no option to make the splines natural.
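As a quick aside, a minimal sketch of what the 'natural' boundary condition means, using the interpolating CubicSpline mentioned above (bc_type='natural' is part of its documented API):

import numpy as np
from scipy.interpolate import CubicSpline

x = np.linspace(0, 10, 20)
y = np.sin(x)
cs = CubicSpline(x, y, bc_type='natural')
# a natural spline has vanishing second derivative at both endpoints
print(cs(x[0], 2), cs(x[-1], 2))  # both should be ~ 0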
After hours of investigation, I did not find any pip-installable package that could fit a natural cubic spline with user-controllable smoothness. However, after deciding to write one myself, while reading about the topic I stumbled upon a blog post by GitHub user madrury. He has written Python code capable of producing natural cubic spline models.
The model code is available here (NaturalCubicSpline) with a BSD licence. He has also written some examples in an IPython notebook.
But since this is the Internet and links tend to die, I will copy the relevant parts of the source code here, plus a helper function (get_natural_cubic_spline_model) written by me, and show an example of how to use it. The smoothness of the fit can be controlled by using a different number of knots. The positions of the knots can also be specified by the user.
Example
from matplotlib import pyplot as plt
import numpy as np
def func(x):
    return 1/(1+25*x**2)
# make example data
x = np.linspace(-1,1,300)
y = func(x) + np.random.normal(0, 0.2, len(x))
# The number of knots can be used to control the amount of smoothness
model_6 = get_natural_cubic_spline_model(x, y, minval=min(x), maxval=max(x), n_knots=6)
model_15 = get_natural_cubic_spline_model(x, y, minval=min(x), maxval=max(x), n_knots=15)
y_est_6 = model_6.predict(x)
y_est_15 = model_15.predict(x)
plt.plot(x, y, ls='', marker='.', label='originals')
plt.plot(x, y_est_6, marker='.', label='n_knots = 6')
plt.plot(x, y_est_15, marker='.', label='n_knots = 15')
plt.legend(); plt.show()
The source code for get_natural_cubic_spline_model
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
def get_natural_cubic_spline_model(x, y, minval=None, maxval=None, n_knots=None, knots=None):
    """
    Get a natural cubic spline model for the data.

    For the knots, give (a) `knots` (as an array) or (b) minval, maxval and n_knots.

    If the knots are not directly specified, the resulting knots are equally
    spaced within the *interior* of (min, max). That is, the endpoints are
    *not* included as knots.

    Parameters
    ----------
    x: np.array of float
        The input data
    y: np.array of float
        The output data
    minval: float
        Minimum of the interval containing the knots.
    maxval: float
        Maximum of the interval containing the knots.
    n_knots: positive integer
        The number of knots to create.
    knots: array or list of floats
        The knots.

    Returns
    -------
    model: a model object
        The returned model will have the following method:
        - predict(x):
            x is a numpy array. This will return the predicted y-values.
    """
    if knots:
        spline = NaturalCubicSpline(knots=knots)
    else:
        spline = NaturalCubicSpline(max=maxval, min=minval, n_knots=n_knots)

    p = Pipeline([
        ('nat_cubic', spline),
        ('regression', LinearRegression(fit_intercept=True))
    ])

    p.fit(x, y)
    return p
class AbstractSpline(BaseEstimator, TransformerMixin):
    """Base class for all spline basis expansions."""

    def __init__(self, max=None, min=None, n_knots=None, n_params=None, knots=None):
        if knots is None:
            if not n_knots:
                n_knots = self._compute_n_knots(n_params)
            knots = np.linspace(min, max, num=(n_knots + 2))[1:-1]
            max, min = np.max(knots), np.min(knots)
        self.knots = np.asarray(knots)

    @property
    def n_knots(self):
        return len(self.knots)

    def fit(self, *args, **kwargs):
        return self


class NaturalCubicSpline(AbstractSpline):
    """Apply a natural cubic basis expansion to an array.

    The features created with this basis expansion can be used to fit a
    piecewise cubic function under the constraint that the fitted curve is
    linear *outside* the range of the knots. The fitted curve is continuously
    differentiable to the second order at all of the knots.

    This transformer can be created in two ways:
      - By specifying the maximum, minimum, and number of knots.
      - By specifying the cutpoints directly.

    If the knots are not directly specified, the resulting knots are equally
    spaced within the *interior* of (min, max). That is, the endpoints are
    *not* included as knots.

    Parameters
    ----------
    min: float
        Minimum of the interval containing the knots.
    max: float
        Maximum of the interval containing the knots.
    n_knots: positive integer
        The number of knots to create.
    knots: array or list of floats
        The knots.
    """

    def _compute_n_knots(self, n_params):
        return n_params

    @property
    def n_params(self):
        return self.n_knots - 1

    def transform(self, X, **transform_params):
        X_spl = self._transform_array(X)
        if isinstance(X, pd.Series):
            col_names = self._make_names(X)
            X_spl = pd.DataFrame(X_spl, columns=col_names, index=X.index)
        return X_spl

    def _make_names(self, X):
        first_name = "{}_spline_linear".format(X.name)
        rest_names = ["{}_spline_{}".format(X.name, idx)
                      for idx in range(self.n_knots - 2)]
        return [first_name] + rest_names

    def _transform_array(self, X, **transform_params):
        X = X.squeeze()
        try:
            X_spl = np.zeros((X.shape[0], self.n_knots - 1))
        except IndexError:  # For arrays with only one element
            X_spl = np.zeros((1, self.n_knots - 1))
        X_spl[:, 0] = X.squeeze()

        def d(knot_idx, x):
            def ppart(t): return np.maximum(0, t)

            def cube(t): return t*t*t
            numerator = (cube(ppart(x - self.knots[knot_idx]))
                         - cube(ppart(x - self.knots[self.n_knots - 1])))
            denominator = self.knots[self.n_knots - 1] - self.knots[knot_idx]
            return numerator / denominator

        for i in range(0, self.n_knots - 2):
            X_spl[:, i+1] = (d(i, X) - d(self.n_knots - 2, X)).squeeze()
        return X_spl
You could use this numpy/scipy implementation of a natural cubic smoothing spline for univariate/multivariate data smoothing. The smoothing parameter should be in the range [0.0, 1.0]. If we use a smoothing parameter equal to 1.0, we get the natural cubic spline interpolant without data smoothing. The implementation also supports vectorization for univariate data.
Univariate example:
import numpy as np
import matplotlib.pyplot as plt
import csaps
np.random.seed(1234)
x = np.linspace(-5., 5., 25)
y = np.exp(-(x/2.5)**2) + (np.random.rand(25) - 0.2) * 0.3
sp = csaps.UnivariateCubicSmoothingSpline(x, y, smooth=0.85)
xs = np.linspace(x[0], x[-1], 150)
ys = sp(xs)
plt.plot(x, y, 'o', xs, ys, '-')
plt.show()
Bivariate example:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import csaps
xdata = [np.linspace(-3, 3, 61), np.linspace(-3.5, 3.5, 51)]
i, j = np.meshgrid(*xdata, indexing='ij')
ydata = (3 * (1 - j)**2. * np.exp(-(j**2) - (i + 1)**2)
- 10 * (j / 5 - j**3 - i**5) * np.exp(-j**2 - i**2)
- 1 / 3 * np.exp(-(j + 1)**2 - i**2))
np.random.seed(12345)
noisy = ydata + (np.random.randn(*ydata.shape) * 0.75)
sp = csaps.MultivariateCubicSmoothingSpline(xdata, noisy, smooth=0.988)
ysmth = sp(xdata)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_wireframe(j, i, noisy, linewidths=0.5, color='r')
ax.scatter(j, i, noisy, s=5, c='r')
ax.plot_surface(j, i, ysmth, linewidth=0, alpha=1.0)
plt.show()
The Python package patsy has functions for generating spline bases, including a natural cubic spline basis, as described in the documentation.
Any library can then be used to fit a model, e.g. scikit-learn or statsmodels.
The df parameter of cr() can be used to control the "smoothness". Note that too low a df can result in underfitting (see below).
A simple example using scikit-learn.
import numpy as np
from sklearn.linear_model import LinearRegression
from patsy import cr
import matplotlib.pyplot as plt
n_obs = 600
np.random.seed(0)
x = np.linspace(-3, 3, n_obs)
y = 1 / (x ** 2 + 1) * np.cos(np.pi * x) + np.random.normal(0, 0.2, size=n_obs)
def plot_smoothed(df=5):
    # Generate spline basis with different degrees of freedom
    x_basis = cr(x, df=df, constraints="center")
    # Fit model to the data
    model = LinearRegression().fit(x_basis, y)
    # Get estimates
    y_hat = model.predict(x_basis)
    plt.plot(x, y_hat, label=f"df={df}")

plt.scatter(x, y, s=4, color="tab:blue")
for df in (5, 7, 10, 25):
    plot_smoothed(df)
plt.legend()
plt.title("Natural cubic spline with varying degrees of freedom")
plt.show()
For a project of mine, I needed to create intervals for time-series modeling, and to make the procedure more efficient I created tsmoothie: a Python library for time-series smoothing and outlier detection in a vectorized way.
It provides different smoothing algorithms together with the possibility to compute intervals.
In the case of SplineSmoother of natural cubic type:
import numpy as np
import matplotlib.pyplot as plt
from tsmoothie.smoother import *
def func(x):
    return 1/(1+25*x**2)
# make example data
x = np.linspace(-1,1,300)
y = func(x) + np.random.normal(0, 0.2, len(x))
# operate smoothing
smoother = SplineSmoother(n_knots=10, spline_type='natural_cubic_spline')
smoother.smooth(y)
# generate intervals
low, up = smoother.get_intervals('prediction_interval', confidence=0.05)
# plot the first smoothed timeseries with intervals
plt.figure(figsize=(11,6))
plt.plot(smoother.smooth_data[0], linewidth=3, color='blue')
plt.plot(smoother.data[0], '.k')
plt.fill_between(range(len(smoother.data[0])), low[0], up[0], alpha=0.3)
I also point out that tsmoothie can carry out the smoothing of multiple time series in a vectorized way.
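A minimal sketch of that vectorized use (based on my reading of the tsmoothie docs, which state that data is passed as a 2D array of shape (n_series, n_timesteps); the toy data here is mine):

import numpy as np
from tsmoothie.smoother import SplineSmoother

# three noisy copies of the same signal, one per row
data = np.sin(np.linspace(0, 6, 300)) + np.random.normal(0, 0.2, size=(3, 300))
smoother = SplineSmoother(n_knots=10, spline_type='natural_cubic_spline')
smoother.smooth(data)               # smooths all three series in one call
print(smoother.smooth_data.shape)   # expected (3, 300)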
The programming language R offers a very good implementation of natural cubic smoothing splines. You can use R functions in Python with rpy2:
import rpy2.robjects as robjects
r_y = robjects.FloatVector(y_train)
r_x = robjects.FloatVector(x_train)
r_smooth_spline = robjects.r['smooth.spline']  # extract R function
# run smoothing function
spline1 = r_smooth_spline(x=r_x, y=r_y, spar=0.7)
ySpline=np.array(robjects.r['predict'](spline1,robjects.FloatVector(x_smooth)).rx2('y'))
plt.plot(x_smooth,ySpline)
If you want to directly set lambda: spline1 = r_smooth_spline(x=r_x, y=r_y, lambda=42) doesn't work, because lambda already has another meaning in Python, but there is a solution: How to use the lambda argument of smooth.spline in RPy WITHOUT Python interpreting it as lambda.
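A minimal sketch of that workaround (assuming r_x and r_y as defined above; the trick is dictionary unpacking, so that the Python parser never sees lambda as a keyword):

# 'lambda' is reserved in Python, but **-unpacking sidesteps the parser
spline1 = r_smooth_spline(x=r_x, y=r_y, **{'lambda': robjects.FloatVector([42])})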
To get the code running you first need to define the data x_train and y_train, and you can define x_smooth=np.array(np.linspace(-3,5,1920)) if you want to plot it between -3 and 5 in Full-HD resolution.
Note that this code is not fully compatible with Jupyter notebooks for the latest versions of rpy2. You can fix this by using !pip install -Iv rpy2==3.4.2, as described in "NotImplementedError: Conversion 'rpy2py' not defined for objects of type '<class 'rpy2.rinterface.SexpClosure'>' only after I run the code twice".
Related
Goal
I would like to compute the 3D volume integral of a numeric scalar field.
Code
For this post, I will use an example whose integral can be computed exactly. I have therefore chosen the following function:
In Python, I define the function, and a set of points in 3D, and then generate the discrete values at these points:
import numpy as np
# Make data.
def function(x, y, z):
    return x**y**z
N = 5
grid = np.meshgrid(
np.linspace(0, 1, N),
np.linspace(0, 1, N),
np.linspace(0, 1, N)
)
points = np.vstack(list(map(np.ravel, grid))).T
x = points[:, 0]
y = points[:, 1]
z = points[:, 2]
values = [function(points[i, 0], points[i, 1], points[i, 2])
for i in range(len(points))]
Question
How can I find the integral, if I don't know the underlying function, i.e. if I only have the coordinates (x, y, z) and the values?
A nice way to go about this would be using scipy's tplquad integration. However, to use that, we need a function and not a point cloud.
An easy way around that is to use an interpolator to get a function approximating our point cloud - we can for example use scipy's RegularGridInterpolator if the data is on a regular grid:
import numpy as np
from scipy import integrate
from scipy.interpolate import RegularGridInterpolator
# Make data.
def function(x, y, z):
    return x*y*z
N = 5
xmin, xmax = 0, 1
ymin, ymax = 0, 1
zmin, zmax = 0, 1
x = np.linspace(xmin, xmax, N)
y = np.linspace(ymin, ymax, N)
z = np.linspace(zmin, zmax, N)
values = function(*np.meshgrid(x,y,z, indexing='ij'))
# Interpolate:
function_interpolated = RegularGridInterpolator((x, y, z), values)
# tplquad integrates func(z,y,x)
f = lambda z, y, x: function_interpolated([x, y, z])  # evaluate at the point in (x, y, z) axis order
result, error = integrate.tplquad(f, xmin, xmax, lambda _: ymin, lambda _: ymax, lambda *_: zmin, lambda *_: zmax)
In the example above, we get result = 0.12499999999999999 - close enough!
The easiest way to achieve what you are looking for is probably scipy's integration function. Here is your example:
from scipy import integrate
# Make data.
def func(x, y, z):
    return x**y**z
ranges = [[0,1], [0,1], [0,1]]
result, error = integrate.nquad(func, ranges)
Are you aware that the function you created is different from the one you show in the image? The one you created is a repeated exponentiation (x**y**z), while the one you are showing is just a product. If you want to represent the function in the image, use
def func(x, y, z):
    return x*y*z
Hope this answers your question, otherwise just write a comment!
Edit:
Misread your post. If you only have the results and they are not regularly spaced, you would have to figure out some form of interpolation (e.g. linear) and a lookup table. If you do not know how to create that, let me know. The rest of the stated answer could still be used if you define func to return interpolated values from your original data; a sketch of that idea follows.
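A minimal sketch of that idea, using scipy's LinearNDInterpolator for scattered points (the data here is generated just for illustration; with real data you would plug in your own points and values):

import numpy as np
from scipy import integrate
from scipy.interpolate import LinearNDInterpolator

# scattered, irregularly spaced samples of an unknown field (here x*y*z)
rng = np.random.default_rng(0)
points = rng.uniform(0, 1, size=(500, 3))
values = points[:, 0] * points[:, 1] * points[:, 2]

interp = LinearNDInterpolator(points, values)

def func(x, y, z):
    v = interp(np.array([[x, y, z]]))[0]
    return 0.0 if np.isnan(v) else float(v)   # 0 outside the convex hull of the data

ranges = [[0, 1], [0, 1], [0, 1]]
# loose tolerances keep this sketch reasonably fast
result, error = integrate.nquad(func, ranges, opts={'epsabs': 1e-3, 'epsrel': 1e-3})
print(result)   # roughly 1/8 = 0.125, up to interpolation error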
The first answer nicely explains the principal approach to handling this. I just want to illustrate an alternative way by showing the power of the sklearn package and machine-learning regression.
Doing the meshgrid in 3D gives a very large numpy array,
import numpy as np
N = 5
xmin, xmax = 0, 1
ymin, ymax = 0, 1
zmin, zmax = 0, 1
x = np.linspace(xmin, xmax, N)
y = np.linspace(ymin, ymax, N)
z = np.linspace(zmin, zmax, N)
grid = np.array(np.meshgrid(x,y,z, indexing='ij'))
grid.shape = (3, 5, 5, 5)  # 3*5*5*5 = 375 numbers
Which is visually not very intuitive with 375 numbers, and with different possible indexings ('ij' or 'xy'). Using regression we can get the same result with just a few input points (15-20).
# building random combinations from (x,y,z)
X = np.random.choice(x, 20)[:,None]
Y = np.random.choice(y, 20)[:,None]
Z = np.random.choice(z, 20)[:,None]
xyz = np.concatenate((X,Y,Z), axis = 1)
data = np.multiply.reduce(xyz, axis = 1)
So the input (grid) is just a 2D numpy array,
xyz.shape
(20, 3)
With the corresponding data,
data.shape = (20,)
Now the regression function and integration,
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from scipy import integrate
pipe = Pipeline([('polynomial', PolynomialFeatures(degree=3)), ('model', LinearRegression())])
pipe.fit(xyz, data)
def func(x, y, z):
    return pipe.predict([[x, y, z]])
ranges = [[0,1], [0,1], [0,1]]
result, error = integrate.nquad(func, ranges)
print(result)
0.1257
This approach is useful when only a limited number of points is available.
Based on your requirements, it sounds like the most appropriate technique would be Monte Carlo integration:
# Step 0 - start with some empirical data
import numpy as np
import scipy.spatial  # needed for ConvexHull in Step 3

observed_points = np.random.uniform(0, 1, size=(10000, 3))
unknown_fn = lambda x: np.prod(x) # just used to generate fake values
observed_values = np.apply_along_axis(unknown_fn, 1, observed_points)
K = 1000000
# Step 1 - assume that f(x,y,z) can be approximated by an interpolation
# of the data we have (you could get really fancy with the
# selection of interpolation method - we'll stick with straight lines here)
from scipy.interpolate import LinearNDInterpolator
f_interpolate = LinearNDInterpolator(observed_points, observed_values)
# Step 2 randomly sample from within convex hull of observed data
# Step 2a - Uniformly sample from bounding 3D-box of data
lower_bounds = observed_points.min(axis=0)
upper_bounds = observed_points.max(axis=0)
sampled_points = np.random.uniform(lower_bounds, upper_bounds,size=(K, 3))
# Step 2b - Reject points outside of convex hull...
# Luckily, we get a np.nan from LinearNDInterpolator in this case
sampled_values = f_interpolate(sampled_points)
rejected_idxs = np.argwhere(np.isnan(sampled_values))
# Step 2c - Remember accepted values of estimated f(x_i, y_i, z_i)
final_sampled_values = np.delete(sampled_values, rejected_idxs, axis=0)
# Step 3 - Calculate estimate of volume of observed data domain
# Since we sampled uniformly from the convex hull of data domain,
# each point was selected with P(x,y,z)= 1 / Volume of convex hull
volume = scipy.spatial.ConvexHull(observed_points).volume
# Step 4 - Multiply estimated volume of domain by average sampled value
I_hat = volume * final_sampled_values.mean()
print(I_hat)
For a derivation of why this works see this: https://cs.dartmouth.edu/wjarosz/publications/dissertation/appendixA.pdf
I want to iteratively fit a curve to data in Python with the following approach:
Fit a polynomial curve (or any non-linear approach)
Discard values more than 2 standard deviations from the curve
Repeat steps 1 and 2 until all values are within the confidence interval of the curve
I can fit a polynomial curve as follows:
import numpy as np

vals = np.array([0.00441025, 0.0049001 , 0.01041189, 0.47368389, 0.34841961,
0.3487533 , 0.35067096, 0.31142986, 0.3268407 , 0.38099566,
0.3933048 , 0.3479948 , 0.02359819, 0.36329588, 0.42535543,
0.01308297, 0.53873956, 0.6511364 , 0.61865282, 0.64750302,
0.6630047 , 0.66744816, 0.71759617, 0.05965622, 0.71335208,
0.71992683, 0.61635697, 0.12985441, 0.73410642, 0.77318621,
0.75675988, 0.03003641, 0.77527201, 0.78673995, 0.05049178,
0.55139476, 0.02665514, 0.61664748, 0.81121749, 0.05521697,
0.63404375, 0.32649395, 0.36828268, 0.68981099, 0.02874863,
0.61574739])
x_values = np.linspace(0, 1, len(vals))
poly_degree = 3
coeffs = np.polyfit(x_values, vals, poly_degree)
poly_eqn = np.poly1d(coeffs)
y_hat = poly_eqn(x_values)
How do I do steps 2 and 3?
For eliminating points too far from an expected solution, you are probably looking for RANSAC (RANdom SAmple Consensus), which fits a curve (or any other function) to data within certain bounds, like your case with 2*STD.
You can use the scikit-learn RANSAC estimator, which is well aligned with the included regressors such as LinearRegression. For your polynomial case you need to define your own regression class:
from sklearn.metrics import mean_squared_error

class PolynomialRegression(object):
    def __init__(self, degree=3, coeffs=None):
        self.degree = degree
        self.coeffs = coeffs

    def fit(self, X, y):
        self.coeffs = np.polyfit(X.ravel(), y, self.degree)

    def get_params(self, deep=False):
        return {'coeffs': self.coeffs}

    def set_params(self, coeffs=None, random_state=None):
        self.coeffs = coeffs

    def predict(self, X):
        poly_eqn = np.poly1d(self.coeffs)
        y_hat = poly_eqn(X.ravel())
        return y_hat

    def score(self, X, y):
        return mean_squared_error(y, self.predict(X))
and then you can use RANSAC
from sklearn.linear_model import RANSACRegressor
ransac = RANSACRegressor(PolynomialRegression(degree=poly_degree),
residual_threshold=2 * np.std(y_vals),
random_state=0)
ransac.fit(np.expand_dims(x_vals, axis=1), y_vals)
inlier_mask = ransac.inlier_mask_
Note that the X variable is transformed to a 2D array, as required by the sklearn RANSAC implementation, and flattened back inside our custom class because numpy's polyfit works with 1D arrays. (Here x_vals and y_vals correspond to the question's x_values and vals.)
y_hat = ransac.predict(np.expand_dims(x_vals, axis=1))
plt.plot(x_vals, y_vals, 'bx', label='input samples')
plt.plot(x_vals[inlier_mask], y_vals[inlier_mask], 'go', label='inliers (2*STD)')
plt.plot(x_vals, y_hat, 'r-', label='estimated curve')
Moreover, playing with the polynomial order and the residual threshold, I got the following results with degree=4 and a 1*STD range.
Another option is to use a higher-order regressor like a Gaussian process:
from sklearn.gaussian_process import GaussianProcessRegressor
ransac = RANSACRegressor(GaussianProcessRegressor(),
residual_threshold=np.std(y_vals))
As for generalizing to a DataFrame, you just need to set all columns except one as features and the remaining one as the output, like here:
import pandas as pd
df = pd.DataFrame(np.array([x_vals, y_vals]).T)
ransac.fit(df[df.columns[:-1]], df[df.columns[-1]])
y_hat = ransac.predict(df[df.columns[:-1]])
It doesn't look like you'll get anything worthwhile following that procedure; there are much better techniques for handling unexpected data. Googling for "outlier detection" would be a good start.
With that said, here's how to answer your question:
Start by pulling in libraries and getting some data:
import matplotlib.pyplot as plt
import numpy as np
Y = np.array([
0.00441025, 0.0049001 , 0.01041189, 0.47368389, 0.34841961,
0.3487533 , 0.35067096, 0.31142986, 0.3268407 , 0.38099566,
0.3933048 , 0.3479948 , 0.02359819, 0.36329588, 0.42535543,
0.01308297, 0.53873956, 0.6511364 , 0.61865282, 0.64750302,
0.6630047 , 0.66744816, 0.71759617, 0.05965622, 0.71335208,
0.71992683, 0.61635697, 0.12985441, 0.73410642, 0.77318621,
0.75675988, 0.03003641, 0.77527201, 0.78673995, 0.05049178,
0.55139476, 0.02665514, 0.61664748, 0.81121749, 0.05521697,
0.63404375, 0.32649395, 0.36828268, 0.68981099, 0.02874863,
0.61574739])
X = np.linspace(0, 1, len(Y))
Next, do an initial plot of the data:
plt.plot(X, Y, '.')
This lets you see what we're dealing with and whether a polynomial would ever be a good fit; the short answer is that this method isn't going to get very far with this sort of data.
At this point we should stop, but to answer the question I'll go on, mostly following your polynomial-fitting code:
poly_degree = 5
sd_cutoff = 1 # 2 keeps everything
coeffs = np.polyfit(X, Y, poly_degree)
poly_eqn = np.poly1d(coeffs)
Y_hat = poly_eqn(X)
delta = Y - Y_hat
sd_p = np.std(delta)
ok = abs(delta) < sd_p * sd_cutoff
Hopefully this makes sense; I use a higher-degree polynomial and only cut off at 1 SD because otherwise nothing will be thrown away. The ok array contains True values for those points that are within sd_cutoff standard deviations.
To check this, I'd then do another plot, something like:
plt.scatter(X, Y, color=np.where(ok, 'k', 'r'))
plt.fill_between(
X,
Y_hat - sd_p * sd_cutoff,
Y_hat + sd_p * sd_cutoff,
color='#00000020')
plt.plot(X, Y_hat)
which gives me:
So the black dots are the points to keep (i.e. X[ok] gives me these back, and np.where(ok) gives you the indices).
You can play around with the parameters, but you probably want a distribution with fatter tails (e.g. a Student's t-distribution); as I said above, though, using Google for "outlier detection" would be my suggestion.
Three functions are needed to solve this. First, a line-fitting function is necessary to fit a line to a set of points:
def fit_line(x_values, vals, poly_degree):
    coeffs = np.polyfit(x_values, vals, poly_degree)
    poly_eqn = np.poly1d(coeffs)
    y_hat = poly_eqn(x_values)
    return poly_eqn, y_hat
We need to know the standard deviation from the points to the line. This function computes that standard deviation:
def compute_sd(x_values, vals, y_hat):
    distances = []
    for x, y, y1 in zip(x_values, vals, y_hat):
        distances.append(abs(y - y1))
    return np.std(distances)
Finally, we need to compare the distance from a point to the line. The point needs to be thrown out if the distance from the point to the line is greater than two times the standard deviation.
def compare_distances(x_values, vals):
    new_vals, new_x_vals = [], []
    for x, y in zip(x_values, vals):
        y1 = np.polyval(poly_eqn, x)
        distance = abs(y - y1)
        if distance < 2*sd:
            plt.plot((x, x), (y, y1), c='g')
            new_vals.append(y)
            new_x_vals.append(x)
        else:
            plt.plot((x, x), (y, y1), c='r')
            plt.scatter(x, y, c='r')
    return new_vals, new_x_vals
As you can see in the following graphs, this method does not work well for fitting a line to data that has a lot of outliers. All the points end up getting eliminated for being too far from the fitted line.
while len(vals) > 0:
    poly_eqn, y_hat = fit_line(x_values, vals, poly_degree)
    plt.scatter(x_values, vals)
    plt.plot(x_values, y_hat)
    sd = compute_sd(x_values, vals, y_hat)
    new_vals, new_x_vals = compare_distances(x_values, vals)
    plt.show()
    vals, x_values = np.array(new_vals), np.array(new_x_vals)
Let's say I have randomly distributed data which looks like:
I want to replace each data point y[x_i] with a fixed-width Gaussian
and add them together. It should give me:
My code is very primitive and slow:
def gaussian(x, mu, sig):
    return 1/(sig*np.sqrt(2*np.pi))*np.exp(-np.power(x - mu, 2.) / (
        2 * np.power(sig, 2.)))

def gaussian_smoothing(x, y, sig=0.5, n=1000):
    x_new = np.linspace(x.min()-10*sig, x.max()+10*sig, n)
    y_new = np.zeros(x_new.shape)
    for _x, _y in zip(x, y):
        y_new += _y*gaussian(x_new, _x, sig)
    return x_new, y_new
For large data sets it takes a long time to perform such smoothing.
I was looking at np.convolve. However, it seems that it is only applicable to evenly spaced data, where the x step of the data and of the Gaussians would have to be the same. What would be the fastest way to perform such an operation?
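(For reference, a minimal sketch of how the loop above can be vectorized with NumPy broadcasting, trading the Python loop for an (n_points, n_grid) temporary array; this is just the same computation rewritten:)

import numpy as np

def gaussian_smoothing_vec(x, y, sig=0.5, n=1000):
    x_new = np.linspace(x.min() - 10*sig, x.max() + 10*sig, n)
    # one Gaussian per data point, evaluated on the whole grid at once
    kernels = np.exp(-(x_new[None, :] - x[:, None])**2 / (2 * sig**2)) / (sig * np.sqrt(2*np.pi))
    y_new = y @ kernels   # weighted sum of the Gaussians
    return x_new, y_new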
You can try to estimate it as a Gaussian mixture with a smaller number of components (fitted with the EM algorithm) using sklearn:
import matplotlib.pyplot as plt
from numpy.random import choice
from sklearn import mixture
import scipy.stats
import numpy
# generate some data
x = numpy.array([1.,1.1,1.6,2.,2.1,2.2,2.9,3.,8.,62.,62.2,63.,63.4,64.5,65.,67.,69.])
# generate weights for it
y = numpy.random.rand(x.shape[0])
# normalize weights to 1
y /= y.sum()
# resample to 5000 samples with equal weights according to the original weights
x_rsmp = numpy.array([choice(x, p=y) for _ in range(5000)])
x_rsmp.sort()
x_rsmp = x_rsmp.reshape(-1,1)
# define number of components - this must be user-selected or estimated
n_comp = 2
# fit the mixture
gmm = mixture.GaussianMixture(n_components=n_comp, covariance_type='full')
gmm.fit(x_rsmp)
# plot it
fig = plt.figure()
ax = fig.add_subplot(111)
x_gauss = numpy.linspace(-10,100,1000)
for n_c in range(n_comp):
    norm_pdf = scipy.stats.norm.pdf(x_gauss, gmm.means_[n_c,0], gmm.covariances_[n_c,0])
    ax.plot(x_gauss, norm_pdf, label='gauss %d' % (n_c+1))
ax.stem(x,y,'gray')
plt.legend()
It yields n_comp Gaussian components with means gmm.means_ and covariances gmm.covariances_.
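As a hedged follow-up to the snippet above (reusing its numpy, gmm, x_gauss and ax), the combined mixture density, with the component weights included, can be evaluated via score_samples, which returns log-densities:

mixture_pdf = numpy.exp(gmm.score_samples(x_gauss.reshape(-1, 1)))
ax.plot(x_gauss, mixture_pdf, 'k--', label='full mixture')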
I'm using UnivariateSpline to construct piecewise polynomials for some data that I have. I would then like to use these splines in other programs (either in C or FORTRAN) and so I would like to understand the equation behind the generated spline.
Here is my code:
import numpy as np
import scipy as sp
from scipy.interpolate import UnivariateSpline
import matplotlib.pyplot as plt
import bisect
data = np.loadtxt('test_C12H26.dat')
Tmid = 800.0
print "Tmid", Tmid
nmid = bisect.bisect(data[:,0],Tmid)
fig = plt.figure()
plt.plot(data[:,0], data[:,7],ls='',marker='o',markevery=20)
npts = len(data[:,0])
#print "npts", npts
w = np.ones(npts)
w[0] = 100
w[nmid] = 100
w[npts-1] = 100
spline1 = UnivariateSpline(data[:nmid,0],data[:nmid,7],s=1,w=w[:nmid])
coeffs = spline1.get_coeffs()
print coeffs
print spline1.get_knots()
print spline1.get_residual()
print coeffs[0] + coeffs[1] * (data[0,0] - data[0,0]) \
+ coeffs[2] * (data[0,0] - data[0,0])**2 \
+ coeffs[3] * (data[0,0] - data[0,0])**3, \
data[0,7]
print coeffs[0] + coeffs[1] * (data[nmid,0] - data[0,0]) \
+ coeffs[2] * (data[nmid,0] - data[0,0])**2 \
+ coeffs[3] * (data[nmid,0] - data[0,0])**3, \
data[nmid,7]
print Tmid,data[-1,0]
spline2 = UnivariateSpline(data[nmid-1:,0],data[nmid-1:,7],s=1,w=w[nmid-1:])
print spline2.get_coeffs()
print spline2.get_knots()
print spline2.get_residual()
plt.plot(data[:,0],spline1(data[:,0]))
plt.plot(data[:,0],spline2(data[:,0]))
plt.savefig('test.png')
And here is the resulting plot. I believe I have valid splines for each interval, but it looks like my spline equation is not correct... I can't find any reference to what it is supposed to be in the scipy documentation. Does anybody know? Thanks!
The scipy documentation does not have anything to say about how one can take the coefficients and manually generate the spline curve. However, it is possible to figure out how to do this from the existing literature on B-splines. The following function bspleval shows how to construct the B-spline basis functions (the matrix B in the code), from which one can easily generate the spline curve by multiplying the coefficients with the highest-order basis functions and summing:
import numpy as np
import matplotlib.pyplot as plt

def bspleval(x, knots, coeffs, order, debug=False):
    '''
    Evaluate a B-spline at a set of points.

    Parameters
    ----------
    x : list or ndarray
        The set of points at which to evaluate the spline.
    knots : list or ndarray
        The set of knots used to define the spline.
    coeffs : list or ndarray
        The set of spline coefficients.
    order : int
        The order of the spline.

    Returns
    -------
    y : ndarray
        The value of the spline at each point in x.
    '''
    k = order
    t = knots
    m = len(t)
    npts = len(x)
    B = np.zeros((m-1, k+1, npts))

    if debug:
        print('k=%i, m=%i, npts=%i' % (k, m, npts))
        print('t=', t)
        print('coeffs=', coeffs)

    ## Create the zero-order B-spline basis functions.
    for i in range(m-1):
        B[i, 0, :] = np.logical_and(x >= t[i], x < t[i+1]).astype(np.float64)

    if (k == 0):
        B[m-2, 0, -1] = 1.0

    ## Next iteratively define the higher-order basis functions, working from lower order to higher.
    for j in range(1, k+1):
        for i in range(m-j-1):
            if (t[i+j] - t[i] == 0.0):
                first_term = 0.0
            else:
                first_term = ((x - t[i]) / (t[i+j] - t[i])) * B[i, j-1, :]

            if (t[i+j+1] - t[i+1] == 0.0):
                second_term = 0.0
            else:
                second_term = ((t[i+j+1] - x) / (t[i+j+1] - t[i+1])) * B[i+1, j-1, :]

            B[i, j, :] = first_term + second_term
        B[m-j-2, j, -1] = 1.0

    if debug:
        plt.figure()
        for i in range(m-1):
            plt.plot(x, B[i, k, :])
        plt.title('B-spline basis functions')

    ## Evaluate the spline by multiplying the coefficients with the highest-order basis functions.
    y = np.zeros(npts)
    for i in range(m-k-1):
        y += coeffs[i] * B[i, k, :]

    if debug:
        plt.figure()
        plt.plot(x, y)
        plt.title('spline curve')
        plt.show()

    return y
To give an example of how this can be used with Scipy's existing univariate spline functions, the following is an example script. This takes the input data and uses Scipy's functional and also its object-oriented approach to spline fitting. Taking the coefficients and knot points from either of the two and using these as inputs to our manually-calculated routine bspleval, we reproduce the same curve that they do. Note that the difference between the manually evaluated curve and Scipy's evaluation method is so small that it is almost certainly floating-point noise.
from numpy import array
from scipy.interpolate import splrep, splev, UnivariateSpline
import matplotlib.pyplot as plt

x = array([-273.0, -176.4, -79.8, 16.9, 113.5, 210.1, 306.8, 403.4, 500.0])
y = array([2.25927498e-53, 2.56028619e-03, 8.64512988e-01, 6.27456769e+00, 1.73894734e+01,
3.29052124e+01, 5.14612316e+01, 7.20531200e+01, 9.40718450e+01])
x_nodes = array([-273.0, -263.5, -234.8, -187.1, -120.3, -34.4, 70.6, 194.6, 337.8, 500.0])
y_nodes = array([2.25927498e-53, 3.83520726e-46, 8.46685318e-11, 6.10568083e-04, 1.82380809e-01,
2.66344008e+00, 1.18164677e+01, 3.01811501e+01, 5.78812583e+01, 9.40718450e+01])
## Now get scipy's spline fit.
k = 3
tck = splrep(x_nodes, y_nodes, k=k, s=0)
knots = tck[0]
coeffs = tck[1]
print('knot points=', knots)
print('coefficients=', coeffs)
## Now try scipy's object-oriented version. The result is exactly the same as "tck": the knots are the
## same and the coeffs are the same, they are just queried in a different way.
uspline = UnivariateSpline(x_nodes, y_nodes, s=0)
uspline_knots = uspline.get_knots()
uspline_coeffs = uspline.get_coeffs()
## Here are scipy's native spline evaluation methods. Again, "ytck" and "y_uspline" are exactly equal.
ytck = splev(x, tck)
y_uspline = uspline(x)
y_knots = uspline(knots)
## Now let's try our manually-calculated evaluation function.
y_eval = bspleval(x, knots, coeffs, k, debug=False)
plt.plot(x, ytck, label='tck')
plt.plot(x, y_uspline, label='uspline')
plt.plot(x, y_eval, label='manual')
## Next plot the knots and nodes.
plt.plot(x_nodes, y_nodes, 'ko', markersize=7, label='input nodes') ## nodes
plt.plot(knots, y_knots, 'mo', markersize=5, label='tck knots') ## knots
plt.xlim((-300.0,530.0))
plt.legend(loc='best', prop={'size':14})
plt.figure()
plt.title('difference')
plt.plot(x, ytck-y_uspline, label='tck-uspl')
plt.plot(x, ytck-y_eval, label='tck-manual')
plt.legend(loc='best', prop={'size':14})
plt.show()
The coefficients given by get_coeffs are B-spline (Basis spline) coefficients, described here: B-spline (Wikipedia)
Probably whatever other program/language you will be using has an implementation. Supply the knot locations and coefficients, and you should be all set.
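If what you really need is plain piecewise-polynomial coefficients (easy to port to C or Fortran), here is a minimal sketch using scipy's PPoly.from_spline on the tck returned by splrep (the toy data is mine):

import numpy as np
from scipy.interpolate import splrep, PPoly

x = np.linspace(0, 10, 50)
y = np.sin(x)

tck = splrep(x, y, s=0.5)        # full (t, c, k) B-spline representation
pp = PPoly.from_spline(tck)      # convert to piecewise polynomials
# pp.x are the breakpoints; pp.c[:, i] are the coefficients on
# [pp.x[i], pp.x[i+1]], highest power first, in powers of (x - pp.x[i])
print(pp.x)
print(pp.c.shape)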
I am trying to recreate maximum likelihood distribution fitting, I can already do this in Matlab and R, but now I want to use scipy. In particular, I would like to estimate the Weibull distribution parameters for my data set.
I have tried this:
import scipy.stats as s
import numpy as np
import matplotlib.pyplot as plt
def weib(x, n, a):
    return (a / n) * (x / n)**(a - 1) * np.exp(-(x / n)**a)
data = np.loadtxt("stack_data.csv")
(loc, scale) = s.exponweib.fit_loc_scale(data, 1, 1)
print loc, scale
x = np.linspace(data.min(), data.max(), 1000)
plt.plot(x, weib(x, loc, scale))
plt.hist(data, data.max(), density=True)
plt.show()
And get this:
(2.5827280639441961, 3.4955032285727947)
And a distribution that looks like this:
I have been using the exponweib after reading this http://www.johndcook.com/distributions_scipy.html. I have also tried the other Weibull functions in scipy (just in case!).
In Matlab (using the Distribution Fitting Tool - see screenshot) and in R (using both the MASS library function fitdistr and the GAMLSS package) I get the a (loc) and b (scale) parameters as roughly 1.58463497 and 5.93030013. I believe all three methods use the maximum likelihood method for distribution fitting.
I have posted my data here if you would like to have a go! And for completeness I am using Python 2.7.5, Scipy 0.12.0, R 2.15.2 and Matlab 2012b.
Why am I getting a different result!?
My guess is that you want to estimate the shape parameter and the scale of the Weibull distribution while keeping the location fixed. Fixing loc assumes that the values of your data and of the distribution are positive with lower bound at zero.
floc=0 keeps the location fixed at zero, f0=1 keeps the first shape parameter of the exponential weibull fixed at one.
>>> stats.exponweib.fit(data, floc=0, f0=1)
[1, 1.8553346917584836, 0, 6.8820748596850905]
>>> stats.weibull_min.fit(data, floc=0)
[1.8553346917584836, 0, 6.8820748596850549]
The fit compared to the histogram looks ok, but not very good. The parameter estimates are a bit higher than the ones you mention from R and Matlab.
Update
The closest I can get to the plot that is now available is with an unrestricted fit, but using starting values. The plot is still less peaked. Note that values passed to fit without an f in front are used as starting values.
>>> from scipy import stats
>>> import matplotlib.pyplot as plt
>>> plt.plot(data, stats.exponweib.pdf(data, *stats.exponweib.fit(data, 1, 1, scale=2, loc=0)))
>>> _ = plt.hist(data, bins=np.linspace(0, 16, 33), normed=True, alpha=0.5);
>>> plt.show()
It is easy to verify which result is the true MLE; we just need a simple function to calculate the log-likelihood:
>>> def wb2LL(p, x):  # log-likelihood
...     return sum(log(stats.weibull_min.pdf(x, p[1], 0., p[0])))
>>> adata=loadtxt('/home/user/stack_data.csv')
>>> wb2LL(array([6.8820748596850905, 1.8553346917584836]), adata)
-8290.1227946678173
>>> wb2LL(array([5.93030013, 1.57463497]), adata)
-8410.3327470347667
The result from the fit method of exponweib and R's fitdistr (@Warren) is better and has higher log-likelihood. It is more likely to be the true MLE. It is not surprising that the result from GAMLSS is different; it is a completely different statistical model: a Generalized Additive Model.
Still not convinced? We can draw a 2D confidence-limit plot around the MLE (see Meeker and Escobar's book for details).
Again this verifies that array([6.8820748596850905, 1.8553346917584836]) is the right answer, as the log-likelihood there is higher than at any other point in the parameter space. Note:
>>> log(array([6.8820748596850905, 1.8553346917584836]))
array([ 1.92892018, 0.61806511])
BTW1, the MLE fit may not appear to fit the distribution histogram tightly. An easy way to think about MLE is that it is the parameter estimate that is most probable given the observed data. It doesn't need to visually fit the histogram well; that would be something like minimizing the mean squared error.
BTW2, your data appear to be leptokurtic and left-skewed, which means the Weibull distribution may not fit your data well. Try, e.g., Gompertz-Logistic, which improves the log-likelihood by about another 100.
Cheers!
I know it's an old post, but I just faced a similar problem and this thread helped me solve it. Thought my solution might be helpful for others like me:
# Fit Weibull function, some explanation below
params = stats.exponweib.fit(data, floc=0, f0=1)
shape = params[1]
scale = params[3]
print 'shape:',shape
print 'scale:',scale
#### Plotting
# Histogram first
values,bins,hist = plt.hist(data,bins=51,range=(0,25),normed=True)
center = (bins[:-1] + bins[1:]) / 2.
# Using all params and the stats function
plt.plot(center,stats.exponweib.pdf(center,*params),lw=4,label='scipy')
# Using my own Weibull function as a check
def weibull(u, shape, scale):
    '''Weibull distribution for wind speed u with shape parameter k and scale parameter A'''
    return (shape / scale) * (u / scale)**(shape-1) * np.exp(-(u/scale)**shape)
plt.plot(center,weibull(center,shape,scale),label='Wind analysis',lw=2)
plt.legend()
Some extra info that helped me understand:
The Scipy Weibull function (exponweib) can take four input parameters: (a, c), loc and scale.
You want to fix loc and the first shape parameter (a); this is done with floc=0, f0=1. Fitting will then give you the parameters c and scale, where c corresponds to the shape parameter of the two-parameter Weibull distribution (often used in wind data analysis) and scale corresponds to its scale factor.
From docs:
exponweib.pdf(x, a, c) =
a * c * (1-exp(-x**c))**(a-1) * exp(-x**c)*x**(c-1)
If a is 1, then
exponweib.pdf(x, a, c) =
c * (1-exp(-x**c))**(0) * exp(-x**c)*x**(c-1)
= c * (1) * exp(-x**c)*x**(c-1)
= c * x**(c-1) * exp(-x**c)
From this, the relation to the 'wind analysis' Weibull function should be clearer.
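A quick hedged check of that reduction (the three lines below should print essentially the same values):

import numpy as np
from scipy import stats

u = np.linspace(0.5, 20, 5)
shape, scale = 2.0, 6.0
print(stats.exponweib.pdf(u, 1, shape, loc=0, scale=scale))
print(stats.weibull_min.pdf(u, shape, loc=0, scale=scale))
print((shape/scale) * (u/scale)**(shape-1) * np.exp(-(u/scale)**shape))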
I was curious about your question and, although this is not an answer, it compares the Matlab result with your result and with the result using leastsq, which showed the best correlation with the given data:
The code is as follows:
import scipy.stats as s
import numpy as np
import matplotlib.pyplot as plt
import numpy.random as mtrand
from scipy.integrate import quad
from scipy.optimize import leastsq
## Weibull distribution
def weib(x, n, a):
    return (a / n) * (x / n)**(a-1) * np.exp(-(x/n)**a)

def residuals(p, x, y):
    integral = quad(weib, 0, 16, args=(p[0], p[1]))[0]
    penalization = abs(1. - integral) * 100000
    return y - weib(x, p[0], p[1]) + penalization
#
data = np.loadtxt("stack_data.csv")
x = np.linspace(data.min(), data.max(), 100)
n, bins, patches = plt.hist(data,bins=x, normed=True)
binsm = (bins[1:]+bins[:-1])/2
popt, pcov = leastsq(func=residuals, x0=(1.,1.), args=(binsm,n))
loc, scale = 1.58463497, 5.93030013
plt.plot(binsm,n)
plt.plot(x, weib(x, loc, scale),
label='weib matlab, loc=%1.3f, scale=%1.3f' % (loc, scale), lw=4.)
loc, scale = s.exponweib.fit_loc_scale(data, 1, 1)
plt.plot(x, weib(x, loc, scale),
label='weib stack, loc=%1.3f, scale=%1.3f' % (loc, scale), lw=4.)
plt.plot(x, weib(x,*popt),
label='weib leastsq, loc=%1.3f, scale=%1.3f' % tuple(popt), lw=4.)
plt.legend(loc='upper right')
plt.show()
I had the same problem, but found that setting loc=0 in exponweib.fit primed the pump for the optimization. That was all that was needed from @user333700's answer. I couldn't load your data -- your data link points to an image, not data. So I ran a test on my data instead:
import scipy.stats as ss
import matplotlib.pyplot as plt
import numpy as np
N=30
counts, bins = np.histogram(x, bins=N)
bin_width = bins[1]-bins[0]
total_count = float(sum(counts))
f, ax = plt.subplots(1, 1)
f.suptitle(query_uri)
ax.bar(bins[:-1]+bin_width/2., counts, align='center', width=.85*bin_width)
ax.grid('on')
def fit_pdf(x, name='lognorm', color='r'):
    dist = getattr(ss, name)  # params = shape, loc, scale
    # dist = ss.gamma  # 3 params
    params = dist.fit(x, loc=0)  # 1-day lag minimum for shipping
    y = dist.pdf(bins, *params)*total_count*bin_width
    sqerror_sum = np.log(sum(ci*(yi - ci)**2. for (ci, yi) in zip(counts, y)))
    ax.plot(bins, y, color, lw=3, alpha=0.6, label='%s err=%3.2f' % (name, sqerror_sum))
    return y
colors = ['r-', 'g-', 'r:', 'g:']
for name, color in zip(['exponweib', 't', 'gamma'], colors):  # 'lognorm', 'erlang', 'chi2', 'weibull_min',
    y = fit_pdf(x, name=name, color=color)
ax.legend(loc='best', frameon=False)
plt.show()
There have been a few answers to this already, here and in other places, like in "Weibull distribution and the data in the same figure (with numpy and scipy)".
It still took me a while to come up with a clean toy example, so I thought it would be useful to post it.
from scipy import stats
import matplotlib.pyplot as plt
#input for pseudo data
N = 10000
Kappa_in = 1.8
Lambda_in = 10
a_in = 1
loc_in = 0
#Generate data from given input
data = stats.exponweib.rvs(a=a_in,c=Kappa_in, loc=loc_in, scale=Lambda_in, size = N)
#The a and loc are fixed in the fit since it is standard to assume they are known
a_out, Kappa_out, loc_out, Lambda_out = stats.exponweib.fit(data, f0=a_in,floc=loc_in)
#Plot
bins = range(51)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(bins, stats.exponweib.pdf(bins, a=a_out,c=Kappa_out,loc=loc_out,scale = Lambda_out))
ax.hist(data, bins = bins , density=True, alpha=0.5)
ax.annotate("Shape: $k = %.2f$ \n Scale: $\lambda = %.2f$"%(Kappa_out,Lambda_out), xy=(0.7, 0.85), xycoords=ax.transAxes)
plt.show()
In the meantime, there is a really good package out there: reliability. Here is the documentation: reliability # readthedocs.
Your code simply becomes:
from reliability.Fitters import Fit_Weibull_2P
...
wb = Fit_Weibull_2P(failures=data)
plt.show()
Saves a lot of headaches and makes beautiful plots, too.
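A hedged note on reading off the fitted parameters (attribute names per my reading of the reliability docs, worth double-checking):

# alpha is the scale and beta the shape of the fitted 2-parameter Weibull
print(wb.alpha, wb.beta)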
The order of loc and scale is mixed up in the code; it should be
plt.plot(x, weib(x, scale, loc))
since the scale parameter should come first.