I have a set of integer values, and I want to fit them to a Weibull distribution and obtain the best-fit parameters. I then draw a histogram of the data together with the PDF of the Weibull distribution computed from those best-fit parameters. This is the code I used.
from jtlHandler import *
import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
def get_pdf(latencies):
    """Estimate, plot and return a Gaussian-KDE density of ``latencies``.

    The density is evaluated on ``len(latencies)`` evenly spaced points
    between the smallest and largest sample.

    Returns:
        A ``(grid, density)`` tuple of NumPy arrays.
    """
    samples = np.array(latencies)
    kde = st.gaussian_kde(samples)
    # One grid point per sample, spanning the observed range.
    grid = np.linspace(np.min(samples), np.max(samples), len(samples))
    density = kde(grid)
    plt.plot(grid, density)
    plt.show()
    return (grid, density)
def fit_to_distribution(distribution, data):
    """Fit ``distribution`` to ``data`` and return the fitted parameters.

    ``distribution.fit`` returns maximum-likelihood estimates: any shape
    parameters first, followed by location and scale.
    """
    return distribution.fit(data)
def make_distribution_pdf(dist, params, end):
    """Evaluate the fitted PDF of ``dist`` on ``[0, end]`` as a pandas Series.

    Args:
        dist: A ``scipy.stats`` distribution object exposing ``pdf``.
        params: Fitted parameters as returned by ``dist.fit``: shape
            parameters (if any) followed by location and scale.
        end: Upper end of the evaluation grid; also determines the number
            of grid points (``int(end)``).

    Returns:
        A ``pandas.Series`` whose index is the grid and whose values are
        the density at each grid point.
    """
    # The last two entries are always loc and scale; anything before is shape.
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]
    # Build PDF and turn into pandas Series.
    # np.linspace needs an integer sample count; `end` may be a float
    # (e.g. max() of float latencies), so coerce it explicitly.
    x = np.linspace(0, end, int(end))
    y = dist.pdf(x, *arg, loc=loc, scale=scale)
    pdf = pd.Series(y, x)
    return pdf
latencies = getLatencyList("filename")
# Keep only the last tenth of the recorded samples.
latencies = latencies[int(9*(len(latencies)/10)):len(latencies)]
data = pd.Series(latencies)
# NOTE(review): scipy's weibull_max is supported on x <= loc (a left-skewed
# shape). For non-negative, right-skewed data such as latencies,
# st.weibull_min is usually the intended family -- confirm before relying
# on this fit.
params = fit_to_distribution(st.weibull_max, data)
print("Parameters for the fit: "+str(params))
# Make PDF
pdf = make_distribution_pdf(st.weibull_max, params, max(latencies))
# Display
plt.figure()
ax = pdf.plot(lw=2, label='PDF', legend=True)
# `density=True` replaces the `normed` keyword, which Matplotlib removed.
data.plot(kind='hist', bins=200, density=True, alpha=0.5, label='Data',
          legend=True, ax=ax)
ax.set_title('Weibull distribution')
ax.set_xlabel('Latnecy')
ax.set_ylabel('Frequency')
plt.savefig("image.png")
This is the resulting figure.
As can be seen, the Weibull approximation is not similar to the original distribution of the data.
How can I get the best Weibull approximation to my data?
You can fit a data set (set of numbers) to any distribution using the following two methods.
import os
import matplotlib.pyplot as plt
import sys
import math
import numpy as np
import scipy.stats as st
from scipy.stats._continuous_distns import _distn_names
from scipy.optimize import curve_fit
def fit_to_distribution(distribution, latency_values):
    """Fit the named ``scipy.stats`` distribution to ``latency_values``.

    Args:
        distribution: Attribute name of a distribution in ``scipy.stats``
            (for example ``'norm'``).
        latency_values: Data to fit.

    Returns:
        The parameter tuple returned by the distribution's ``fit`` method.
    """
    return getattr(st, distribution).fit(latency_values)
def make_distribution_pdf(distribution, latency_list):
    """Fit the named distribution and evaluate its PDF over the data range.

    Args:
        distribution: Attribute name of a distribution in ``scipy.stats``.
        latency_list: Data to fit; its min and max bound the grid.

    Returns:
        ``(x, y)``: a 10000-point grid from ``min(latency_list)`` to
        ``max(latency_list)`` and the fitted density at each grid point.
    """
    dist = getattr(st, distribution)
    fitted = dist.fit(latency_list)
    # Fitted parameters are laid out as (*shape, loc, scale).
    *shape_args, loc, scale = fitted
    grid = np.linspace(min(latency_list), max(latency_list), 10000)
    density = dist.pdf(grid, *shape_args, loc=loc, scale=scale)
    return grid, density
Related
I want to know how scipy.stats implements its fit and pdf methods. According to the documentation, fit(data, a, loc=0, scale=1) estimates the distribution parameters from data, while pdf(x, a, loc=0, scale=1) computes the probability density function. But I couldn't find how fit and pdf are actually carried out, statistically and mathematically.
I am using the sm.datasets.elnino data, using the code from tmthydvnprt
import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels as sm
import matplotlib
import matplotlib.pyplot as plt
# NOTE(review): `sm.datasets` is reached through a plain `import statsmodels`;
# this may need `import statsmodels.api as sm` -- confirm.
# Flatten the El Nino table (indexed by YEAR) into a single series.
data = pd.Series(
    sm.datasets.elnino.load_pandas().data.set_index('YEAR').values.ravel()
)
# Normalised histogram: y holds bin densities, x holds the 51 bin edges.
y, x = np.histogram(data, bins=50, density=True)
# Replace the edges by the 50 bin centres so x lines up with y.
x = (x[:-1] + x[1:]) / 2.0
distribution = st.gennorm
params = distribution.fit(data)
# Fitted parameters are laid out as (*shape, loc, scale).
arg, loc, scale = params[:-2], params[-2], params[-1]
pdf = distribution.pdf(x, *arg, loc=loc, scale=scale)
# Sum of squared errors between the histogram and the fitted density.
sse = np.sum((y - pdf) ** 2.0)
Using data, arg = 4.3836, loc = 23.2991, scale = 3.8499.
I want to know what arg, loc, and scale represent and how they are calculated.
Thank you.
I want to create a customized distribution based on a Levy truncated law, which reads
p(r) = (r + r0)**(-beta)*exp(-r/k).
So I defined it in the following way:
import numpy as np
import scipy.stats as st
class LevyPDF(st.rv_continuous):
    """Truncated Levy law ``(r + r0)**(-beta) * exp(-r/k)``.

    The law parameters are hard-coded in ``_pdf``.
    """

    def _pdf(self, r):
        # NOTE(review): this expression is not divided by its integral over
        # the support, so it is an unnormalised density -- confirm whether
        # that is intended.
        r0, k, beta = 100, 1500, 1.6
        power_law = (r + r0) ** (-beta)
        cutoff = np.exp(-r / k)
        return power_law * cutoff
Suppose that I want to find the distribution of distances between r = 0 and r = 50km. Then:
# Lower and upper limits used both for the distribution and for the grid.
nmin = 0
nmax = 50
# a and b are handed to the rv_continuous constructor together with a name.
my_cv = LevyPDF(a=nmin, b=nmax, name='LevyPDF')
# Evaluation grid of (nmax-nmin)*2 evenly spaced points on [nmin, nmax].
x = np.linspace(nmin, nmax, (nmax-nmin)*2)
I do not understand why:
sum(my_cv.cdf(x)) = 2.22
instead of 1.
Then how can I define an histogram of N = 2000000 random distances based on the distribution that I defined?
Using your minimal example (slightly adapted):
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
class LevyPDF(st.rv_continuous):
    """Truncated Levy law, ``p(r)`` proportional to
    ``(r + r0)**(-beta) * exp(-r/k)``.

    ``rv_continuous`` derives ``cdf``, ``ppf`` and ``rvs`` from ``_pdf`` and
    expects ``_pdf`` to integrate to 1 over the support ``[a, b]``. The raw
    expression does not, so it is divided by its integral over the support.
    The support must lie where the kernel is real-valued (``a > -r0``).
    """

    def _kernel(self, r):
        """Return the unnormalised density at ``r``."""
        r0 = 100
        k = 1500
        beta = 1.6
        return (r + r0)**(-beta)*np.exp(-r/k)

    def _pdf(self, r):
        """Return the density at ``r``, normalised over ``[self.a, self.b]``."""
        from scipy import integrate

        support = (self.a, self.b)
        # Integrate once per support and cache the result: _pdf is called
        # many times by the generic cdf/ppf machinery.
        if getattr(self, '_norm_support', None) != support:
            self._norm_const = integrate.quad(self._kernel, *support)[0]
            self._norm_support = support
        return self._kernel(r) / self._norm_const
# Lower and upper limits handed to the distribution.
nmin = 0
nmax = 50
# a and b are handed to the rv_continuous constructor together with a name.
my_cv = LevyPDF(a=nmin, b=nmax, name='LevyPDF')
To sample from your random variable, use rvs() method from rv_continuous class:
# Number of random variates to draw.
N = 50000
# random_state=1 fixes the seed so repeated runs give the same sample.
X = my_cv.rvs(size=N, random_state=1)
Will return an array of size (N,) with random variates sampled from your distribution. Use random_state option to freeze your example and make your script repeatable (it defines random seed for your sampling).
Note that computation time increases drastically even for modest increases in N.
To plot histogram, use matplotlib library, see hist:
fig, axe = plt.subplots()
# `density=True` replaces the `normed` keyword, which Matplotlib removed.
n, bins, patches = axe.hist(X, 50, density=True, facecolor='green', alpha=0.75)
# plt.show() displays all open figures; it does not take an Axes argument.
plt.show()
Below is an example of sampling from a chi-square distribution with 40 degrees of freedom:
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
# Frozen chi-square distribution with 40 degrees of freedom.
rv = stats.chi2(40)
N = 200000
# random_state=1 fixes the seed so repeated runs give the same sample.
X = rv.rvs(size=N, random_state=1)
fig, axe = plt.subplots()
# `density=True` replaces the `normed` keyword, which Matplotlib removed.
n, bins, patches = axe.hist(X, 50, density=True, facecolor='green', alpha=0.75)
# plt.show() displays all open figures; it does not take an Axes argument.
plt.show()
It leads to:
I'm trying to plot values from a recorded data set from an experiment.
When fitting the data with an exponential decay, it's very successful in the form of a normal plot.
But having the plot in a semi-log form gives me this.
import matplotlib.pyplot as plt
%matplotlib inline
import pylab as plb
import numpy as np
import scipy as sp
import csv
from scipy.optimize import curve_fit
# Run-4 Data
# Read the first two CSV columns; csv.reader yields every field as a string,
# so both arrays hold strings at this point.
with open('Half_Life_Run4_Decay_AD.csv', 'r') as h:
    rows = [(row[0], row[1]) for row in csv.reader(h, delimiter=',')]
DecayTime4 = np.array([time for time, _ in rows])
DecayCount4 = np.array([count for _, count in rows])
def model_func(x, a, k, b):
    """Exponential decay with offset: ``a * exp(-k * x) + b``."""
    decay = np.exp(-k * x)
    return a * decay + b
# Run 4 Data Fitting plot
# Numeric copies of the (string) CSV columns.
x4 = np.float32(DecayTime4)
y4 = np.float32(DecayCount4)
# Initial guess for (a, k, b).
p0_R4 = (1.,1.e-5,1.)
optR4, pcovR4 = curve_fit(model_func, x4, y4, p0_R4)
aR4, kR4, bR4 = optR4
# pcovR4 is the 3x3 covariance matrix of the estimates; the one-sigma
# parameter uncertainties are the square roots of its diagonal.
aR4p, kR4p, bR4p = np.sqrt(np.diag(pcovR4))
y4M = model_func(x4, aR4, kR4, bR4)
fig4 = plt.figure(figsize=(15,6))
ax4 = fig4.add_subplot(111)
# Plot of data
# Plot the numeric arrays: the raw CSV arrays hold strings, which Matplotlib
# treats as categorical labels, so a log axis over them is meaningless.
ax4.plot(x4, y4, ".", color='lightcoral')
# Plot of best fit
# The model is a*exp(-k*t) + b, so the exponent shown in the label is -k.
ax4.plot(x4, y4M, color='k', label=r'Fitting Function: $f(t) = %0.2f e^{%0.3f\ t} %+0.2f$' % (aR4, -kR4, bR4))
ax4.set_xlabel('Time (sec)')
ax4.set_ylabel('Count')
ax4.set_title('Run 4 of Cesium-137 Decay')
# NOTE: with a non-zero offset b the fitted curve is not a straight line on a
# semi-log axis; only a*exp(-k*t) alone would be.
ax4.set_yscale('log')
ax4.legend(bbox_to_anchor=(1.0, 1.0), prop={'size':15}, fancybox=True, shadow=True)
The purpose of the semi-log is to show the accuracy of the exponential fit with the data.
It should really be a straight line, like in this image.
The data set is large with a shape of (1401,).
Could it be that the curve_fit must not work well with large data sets?
Can this be correct?
I'm using seaborn distplot (data, fit=stats.gamma)
How do I get the fit parameters returned?
Here is an example:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
df = pd.read_csv('RequestSize.csv')
import matplotlib.pyplot as plt
# Take the column of interest and drop missing values.
reqs = df['12 web pages'].dropna()
# Keep values strictly between the 0th and 95th percentiles.
reqs = reqs[(reqs > np.percentile(reqs, 0)) & (reqs < np.percentile(reqs, 95))]
# NOTE(review): distplot is deprecated in recent seaborn releases -- confirm
# the installed version still provides it.
dist = sns.distplot(reqs, fit=stats.gamma)
Use the object you passed to distplot:
# Fit the same distribution object that was passed as `fit=` to distplot.
stats.gamma.fit(reqs)
I confirm the above is true - the sns.distplot fit method is equivalent to the fit method in scipy.stats so you can get the parameters from there, e.g.:
from scipy import stats
ax = sns.distplot(e_t_hat, bins=20, kde=False, fit=stats.norm);
plt.title('Distribution of Cointegrating Spread for Brent and Gasoil')
# Get the fitted parameters used by sns
(mu, sigma) = stats.norm.fit(e_t_hat)
print("mu={0}, sigma={1}".format(mu, sigma))
# Legend and labels
# Raw string: "\m" and "\s" are invalid escape sequences in a normal string.
fit_label = r"normal dist. fit ($\mu=${0:.2g}, $\sigma=${1:.2f})".format(mu, sigma)
plt.legend([fit_label])
plt.ylabel('Frequency')
# Cross-check this is indeed the case - should be overlaid over black curve
# Span the 1%..99% quantiles of the *fitted* normal so the curve covers the
# data range rather than that of a standard normal.
x_dummy = np.linspace(stats.norm.ppf(0.01, mu, sigma),
                      stats.norm.ppf(0.99, mu, sigma), 100)
ax.plot(x_dummy, stats.norm.pdf(x_dummy, mu, sigma))
plt.legend([fit_label, "cross-check"])
I am running some goodness of fit tests using scipy.stats in Python 2.7.10.
for distrName in distrNameList:
distr = getattr(distributions, distrName)
param = distr.fit(sample)
pdf = distr.pdf(???)
What do I pass into distr.pdf() to get the values of the best-fit pdf on the list of sample points of interest, called abscissas?
From the documentation, the .fit() method returns:
shape, loc, scale : tuple of floats
MLEs for any shape statistics, followed by those for location and scale.
and the .pdf() method accepts:
x : array_like
quantiles
arg1, arg2, arg3,... : array_like
The shape parameter(s) for the distribution (see docstring of the instance object for more information)
loc : array_like, optional
location parameter (default=0)
scale : array_like, optional
scale parameter (default=1)
So essentially you would do something like this:
import numpy as np
from scipy import stats
from matplotlib import pyplot as plt
# some random variates drawn from a beta distribution
rvs = stats.beta.rvs(2, 5, loc=0, scale=1, size=1000)
# estimate distribution parameters, in this case (a, b, loc, scale)
params = stats.beta.fit(rvs)
# evaluate PDF
x = np.linspace(0, 1, 1000)
pdf = stats.beta.pdf(x, *params)
# plot
fig, ax = plt.subplots(1, 1)
# No ax.hold(True): Matplotlib removed it, and successive plotting calls on
# the same Axes already add to it.
# `density=True` replaces the `normed` keyword, which Matplotlib removed.
ax.hist(rvs, density=True)
ax.plot(x, pdf, '--r')
To evaluate the pdf at abscissas, you would pass abscissas as the first argument to pdf. To specify the parameters, use the * operator to unpack the param tuple and pass those values to distr.pdf:
# abscissas are the evaluation points; *param unpacks the tuple from fit().
pdf = distr.pdf(abscissas, *param)
For example,
import numpy as np
import scipy.stats as stats
# Names of the scipy.stats distributions to try.
distrNameList = ['beta', 'expon', 'gamma']
# 1000 draws from a standard normal serve as the sample to fit.
sample = stats.norm(0, 1).rvs(1000)
# Points at which each fitted PDF is evaluated.
abscissas = np.linspace(0, 1, 10)
for distrName in distrNameList:
    # Look the distribution object up by name.
    distr = getattr(stats.distributions, distrName)
    # fit() returns the parameters in the order pdf() expects after x.
    param = distr.fit(sample)
    pdf = distr.pdf(abscissas, *param)
    print(pdf)