I am trying to do some parameter inference on an ODE compared to some experimental data observed at 3 time points (2h, 4h and 6h). I set everything up according to the first example:
https://pyabc.readthedocs.io/en/latest/examples/adaptive_distances.html?highlight=PCMAD
But I get an error about parsing from a list to numeric:
TypeError: Cannot parse variable Contamination=[array([253.36919232]), array([482.10280333]), array([700.764029])] of type <class 'list'> to numeric.
I think this refers to the output from the deterministic_run() function. How can I convert it to numeric?
Below is tested code which reproduces the error.
Preamble
import pyabc
from pyabc import (ABCSMC,
                   RV, Distribution,
                   MedianEpsilon,
                   LocalTransition)
from pyabc.visualization import plot_kde_2d, plot_data_callback
import matplotlib.pyplot as plt
import os
import tempfile
import numpy as np
from scipy.integrate import odeint
import math
import seaborn as sns
# pyabc.settings.set_figure_params('pyabc')  # for beautified plots

db_path = ("sqlite:///" +
           os.path.join(tempfile.gettempdir(), "test5.db"))
Here we define the ODE model
def ode_model(contamination, t, r, C, d, g):
    Contamination = contamination
    return r * (1 - Contamination / C) - d * math.exp(-g * t) * Contamination
Here we create the input parameters and extract only specific time-points
def deterministic_run(parameters):
    # initial_contamination is taken from the enclosing scope
    precision = 5000
    tmax = 6
    time_space = np.linspace(0, tmax, precision + 1)
    sim = odeint(ode_model, initial_contamination, time_space,
                 args=(parameters["r"], parameters["C"], parameters["d"], parameters["g"]))
    # odeint returns an array of shape (precision + 1, 1), so each row is a 1-element array
    num_at_2 = sim[int(precision * 2 / tmax)]
    num_at_4 = sim[int(precision * 4 / tmax)]
    num_at_6 = sim[int(precision * 6 / tmax)]
    return {"Contamination": [num_at_2, num_at_4, num_at_6]}
Parameter priors
parameter_prior = Distribution(r=RV("uniform", 0.0, 200.0),
                               C=RV("uniform", 1000.0, 6000.0),
                               d=RV("uniform", 10.0, 1000.0),
                               g=RV("uniform", 2.0, 200.0))
parameter_prior.get_parameter_names()
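One thing worth double-checking here (my note, not part of the original question): pyabc's RV wraps the scipy.stats distributions, and scipy's "uniform" takes loc and scale, so RV("uniform", 1000.0, 6000.0) is uniform on [1000, 7000], not on [1000, 6000]. If the intent was an upper bound of 6000, the scale argument would be 5000:

from pyabc import RV

prior_C = RV("uniform", 1000.0, 5000.0)  # loc=1000, scale=5000 -> support [1000, 6000]
print(prior_C.rvs())                     # a draw from the prior, always in [1000, 6000]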
Distance function and set-up
distance = pyabc.PNormDistance(p=2)
abc = pyabc.ABCSMC(models=deterministic_run, parameter_priors=parameter_prior,
                   distance_function=distance)
Observed data for comparison and initial conditions for ODE
initial_contamination = 1200.0
measurement_data = np.array([134.0, 202.0, 294.0])  # mean observed data at 2h, 4h and 6h
s = np.array([93.70165, 86.13942, 162.11107])       # std of the observations
precision = 5000
measurement_times = np.array([2, 4, 6])
And we define where to store the results
history = abc.new(db_path, {"Contamination": measurement_data, "sd": s})
We run the ABC until the stopping criterion is met
history = abc.run(max_nr_populations=7)
This gives the error:
TypeError: Cannot parse variable Contamination=[array([253.36919232]), array([482.10280333]), array([700.764029])] of type <class 'list'> to numeric.
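A minimal sketch of a possible fix (assuming pyabc can store plain floats or flat numpy arrays as summary statistics, but not a Python list of 1-element arrays): collapse each odeint row to a scalar before returning. Alternatively, returning three separate scalar keys would sidestep array handling entirely, as long as the observed data passed to abc.new uses the same keys.

def deterministic_run(parameters):
    precision = 5000
    tmax = 6
    time_space = np.linspace(0, tmax, precision + 1)
    sim = odeint(ode_model, initial_contamination, time_space,
                 args=(parameters["r"], parameters["C"], parameters["d"], parameters["g"]))
    # each sim[i] is a 1-element array; float() collapses it to a plain scalar,
    # and the three scalars go into one flat numeric array
    num_at_2 = float(sim[int(precision * 2 / tmax)])
    num_at_4 = float(sim[int(precision * 4 / tmax)])
    num_at_6 = float(sim[int(precision * 6 / tmax)])
    return {"Contamination": np.array([num_at_2, num_at_4, num_at_6])}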
Related
I'm trying to obtain only positive autocorrelation values from a time-series waveform using scipy.signal.correlate(), which should look like the following:
But I end up with the following instead, which has both positive and negative values, as well as a trend:
Can anyone please tell me how to get only positive, de-trended autocorrelation values?
The dataset for which I'm computing the autocorrelation is generated with the following code (which you can use as-is for reference):
import json
import sys, os
import numpy as np
import pandas as pd
import glob
import pickle
from statsmodels.tsa.stattools import adfuller, acf, pacf
from scipy.signal import find_peaks, square
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
# GENERATION OF A FUNCTION WITH DUAL SEASONALITY & NOISE

def white_noise(mu, sigma, num_pts):
    """Generate Gaussian noise.

    Args:
        mu: mean value
        sigma: std value
        num_pts: number of points

    Returns:
        generated Gaussian noise
    """
    noise = np.random.normal(mu, sigma, num_pts)
    return noise

def signal_line_plot(input_signal: pd.Series, title: str = "", y_label: str = "Signal"):
    """Plot a time series signal.

    Args:
        input_signal: time series signal to plot
        title: title on the plot
        y_label: label of the signal being plotted
    """
    plt.plot(input_signal)
    plt.title(title)
    plt.ylabel(y_label)
    plt.show()
t_week = np.linspace(1, 480, 480)
t_weekend = np.linspace(1, 192, 192)
T = 96  # time period

x_weekday = 10 * square(2 * np.pi * t_week / T, duty=0.7) + 10 + white_noise(0, 1, 480)
x_weekend = 2 * square(2 * np.pi * t_weekend / T, duty=0.7) + 2 + white_noise(0, 1, 192)
x_daily_weekly = np.concatenate((x_weekday, x_weekend))
x_daily_weekly_long = np.tile(x_daily_weekly, 10)  # ten repeated copies of the weekly pattern
signal_line_plot(x_daily_weekly_long)
signal_line_plot(x_daily_weekly_long[0:1000])
# x_daily_weekly_long is the final waveform on which I'm carrying out the autocorrelation
I'm performing the autocorrelation as follows (the resulting output is the one shown above, which I'm not satisfied with):
# DETERMINING AUTOCORRELATION AND LAG VALUES
import scipy.signal as signal

autocorr = signal.correlate(x_daily_weekly_long, x_daily_weekly_long, mode="same")
lags = signal.correlation_lags(len(x_daily_weekly_long), len(x_daily_weekly_long), mode="same")

# VISUALIZATION
f = plt.figure()
f.set_figwidth(40)
f.set_figheight(10)
plt.plot(lags, autocorr)
Could anyone please help?
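A minimal sketch of one possible approach (my addition, not from the post, reusing the imports above): the raw signal.correlate() output trends because the waveform has a large non-zero mean, so de-meaning before correlating removes that trend; normalizing by the zero-lag value and keeping only non-negative lags then gives a conventional autocorrelation curve. Genuine anti-correlation can still dip below zero, which is expected for a periodic waveform.

x = x_daily_weekly_long - np.mean(x_daily_weekly_long)  # de-mean: removes the offset-induced trend
autocorr = signal.correlate(x, x, mode="full")
lags = signal.correlation_lags(len(x), len(x), mode="full")

autocorr = autocorr / autocorr[lags == 0]  # normalize so the zero-lag value is 1
keep = lags >= 0                           # keep only non-negative lags
plt.plot(lags[keep], autocorr[keep])
plt.xlabel("lag")
plt.ylabel("autocorrelation")
plt.show()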
Could someone suggest how I can use the left-out entry for model prediction? The model is first fitted on all entries except the left-out one, and I then want to compute the prediction error for that left-out entry using the fitted model.
Sample code is as below:
import pandas as pd              # reading tables
import numpy as np               # processing arrays
import scipy.stats               # computing statistics
import matplotlib.pyplot as plt  # drawing graphs
import statsmodels.api as sm     # statistical models

n = len(data)  # data is my DataFrame with columns "aa" and "bb"
a = data["aa"]
b = data["bb"]
MSE_predict = np.zeros(n)
for i in np.arange(n):
    # drop the i-th entry and fit on the remaining ones
    a_leaveOne = np.delete(a.values, i)
    b_leaveOne = np.delete(b.values, i)
    b_leaveOne = sm.add_constant(b_leaveOne)
    model = sm.OLS(a_leaveOne, b_leaveOne).fit()
    # this is the call I cannot get right (pres is not defined)
    a_pre = model.predict([1], np.array(pres)[i])
    MSE = np.square(np.subtract(a[i], a_pre)).mean()
    print(MSE)
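A minimal sketch of how the left-out prediction could be made, assuming data is a DataFrame with response column "aa" and predictor column "bb" (the synthetic frame below is a hypothetical stand-in). model.predict expects a row of exogenous values, i.e. the constant term together with the left-out predictor value:

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
# hypothetical stand-in for the poster's DataFrame
data = pd.DataFrame({"bb": rng.normal(size=30)})
data["aa"] = 2.0 + 3.0 * data["bb"] + rng.normal(scale=0.5, size=30)

a = data["aa"].values
b = data["bb"].values
n = len(data)
sq_err = np.zeros(n)
for i in range(n):
    # fit OLS on everything except entry i
    X = sm.add_constant(np.delete(b, i))
    model = sm.OLS(np.delete(a, i), X).fit()
    # predict entry i: one exog row of [constant, predictor value]
    a_pre = model.predict([[1.0, b[i]]])[0]
    sq_err[i] = (a[i] - a_pre) ** 2
print("leave-one-out MSE:", sq_err.mean())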
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
This is my model:
def concentration(T, a, b, c, d):
    (Tag, Tau) = T
    return np.exp(a - d / Tag) / (np.exp(a - d / Tag) + np.exp(b - c / Tau))
Here I use a variable T which is a combination of two independent variables Tag and Tau. The output that I wish to fit is concentration.
T = [(7.9481e-04, 5.9061e-04), (7.64438e-04, 6.1231e-04),
     (7.64438e-04, 6.04905e-04), (7.85453e-04, 6.01268e-04)]
experimental_concentration = [0.5428, 0.798, 0.75, 0.70]
My code raises:
ValueError: too many values to unpack (expected 2)
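A minimal sketch of a likely fix: curve_fit turns the list of four (Tag, Tau) pairs into a (4, 2) array, so unpacking it into two variables fails with exactly this error; transposing it into a (2, N) array lets Tag, Tau = T unpack into the two rows. The p0 below is an illustrative assumption so that d/Tag and c/Tau start small and the exponentials stay finite:

import numpy as np
from scipy.optimize import curve_fit

def concentration(T, a, b, c, d):
    Tag, Tau = T  # T is now (2, N): row 0 is Tag, row 1 is Tau
    return np.exp(a - d / Tag) / (np.exp(a - d / Tag) + np.exp(b - c / Tau))

# transpose the list of (Tag, Tau) pairs into two rows of length N
T = np.array([(7.9481e-04, 5.9061e-04), (7.64438e-04, 6.1231e-04),
              (7.64438e-04, 6.04905e-04), (7.85453e-04, 6.01268e-04)]).T
experimental_concentration = np.array([0.5428, 0.798, 0.75, 0.70])

# start c and d at 0 so the exponentials do not overflow at the initial guess
popt, pcov = curve_fit(concentration, T, experimental_concentration,
                       p0=[1.0, 1.0, 0.0, 0.0])
print(popt)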
I have a 1-D data set with only one independent column, and I would like to fit a model to it in order to sample from that model. The raw data:
Data set
I tried various theoretical distributions from the Fitter package (https://pypi.org/project/fitter/), but none of them fit well. Then I tried kernel density estimation using sklearn; it is good, but I could not prevent negative values, due to the way it works. Finally, I tried a log-normal, but it is not really a good fit.
Code for the log-normal fit:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

NN = 3915  # sample the same number of points as the original data set
df = pd.read_excel(r'Data_sets2.xlsx', sheet_name="Set1")
eps = 0.1  # additional term for c

"""
Estimate parameters of log(c) as a normal distribution
"""
df["c"] = df["c"] + eps
mu = np.mean(np.log(df["c"]))
s = np.std(np.log(df["c"]))
print("Mean:", mu, "std:", s)

def simulate(N):
    c = []
    for i in range(N):
        c_s = np.exp(np.random.normal(loc=mu, scale=s, size=1)[0])
        c.append(round(c_s))
    return c

predicted_c = simulate(NN)
XX = np.arange(3915)  # scipy.arange has been removed; use numpy

### plot c relation ###
plt.scatter(XX, df["c"], color='g', label="Original data")
plt.scatter(XX, predicted_c, color='r', label="Sample data")
plt.xlabel('Index')
plt.ylabel('c')
plt.legend()
plt.show()
(Plot: original vs. sampled data)
What I am looking for is how to improve the fit; any suggestions or pointers to models that may fit my data more accurately are appreciated. Thanks.
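One sketch of how the negative-value problem with the sklearn KDE might be avoided (my suggestion, with an arbitrary illustrative bandwidth): fit the KDE on the log of the data, so that exponentiating the samples guarantees they are positive:

from sklearn.neighbors import KernelDensity

# df["c"] already includes the +eps offset from the code above, so the log is defined
log_c = np.log(df["c"].values).reshape(-1, 1)
kde = KernelDensity(kernel="gaussian", bandwidth=0.2).fit(log_c)
# exponentiating maps the samples back to the original scale, so they are always positive
samples = np.exp(kde.sample(NN)).ravel()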
Here is a graphical Python fitter for the scipy statistical distribution Double Gamma, using your spreadsheet data; I hope this might be of some use, as a normal distribution seems to be a poor fit to this data set. The scipy documentation for dgamma is at https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.dgamma.html. Incidentally, the double Weibull distribution fit almost as well.
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_excel(r'Data_sets2.xlsx', sheet_name="Set1")
eps = 0.1  # additional term for c
data = df["c"] + eps

# fit the double-gamma distribution and overlay its PDF on a density histogram
P = ss.dgamma.fit(data)
rX = np.linspace(min(data), max(data), 50)
rP = ss.dgamma.pdf(rX, *P)
plt.hist(data, bins=25, density=True, color='slategrey')  # 'normed' was removed in Matplotlib 3.1
plt.plot(rX, rP, color='darkturquoise')
plt.show()
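Since the original goal was to sample from the fitted model, the fitted parameter tuple can be passed straight back to scipy's rvs; this is a standard scipy.stats call, shown here as a usage sketch:

samples = ss.dgamma.rvs(*P, size=3915)  # draw as many synthetic points as the original data set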
I'm calculating the Autocorrelation Function for a stock's returns. To do so I tested two functions, the autocorr function built into Pandas, and the acf function supplied by statsmodels.tsa. This is done in the following MWE:
import pandas as pd
from pandas_datareader import data
import matplotlib.pyplot as plt
import datetime
from dateutil.relativedelta import relativedelta
from statsmodels.tsa.stattools import acf, pacf

ticker = 'AAPL'
time_ago = datetime.datetime.today().date() - relativedelta(months=6)
ticker_data = data.get_data_yahoo(ticker, time_ago)['Adj Close'].pct_change().dropna()
ticker_data_len = len(ticker_data)

ticker_data_acf_1 = acf(ticker_data)[1:32]                           # statsmodels
ticker_data_acf_2 = [ticker_data.autocorr(i) for i in range(1, 32)]  # pandas

test_df = pd.DataFrame([ticker_data_acf_1, ticker_data_acf_2]).T
test_df.columns = ['Statsmodels Autocorr', 'Pandas Autocorr']  # labels match the order above
test_df.index += 1
test_df.plot(kind='bar')
What I noticed is that the values they produce aren't identical:
What accounts for this difference, and which values should be used?
The difference between the Pandas and statsmodels versions lies in the mean subtraction and the normalization / variance division:
autocorr does nothing more than pass subseries of the original series to np.corrcoef. Inside this method, the sample mean and sample variance of these subseries are used to determine the correlation coefficient.
acf, by contrast, uses the overall series' sample mean and sample variance to determine the correlation coefficient.
The differences may get smaller for longer time series but are quite big for short ones.
Compared to Matlab, the Pandas autocorr function probably corresponds to doing Matlab's xcorr (cross-correlation) with the (lagged) series itself, instead of Matlab's autocorr, which calculates the sample autocorrelation (guessing from the docs; I cannot validate this because I have no access to Matlab).
See this MWE for clarification:
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import acf
import matplotlib.pyplot as plt

plt.style.use("seaborn-colorblind")

def autocorr_by_hand(x, lag):
    # Slice the relevant subseries based on the lag
    y1 = x[:(len(x) - lag)]
    y2 = x[lag:]
    # Subtract the subseries means
    sum_product = np.sum((y1 - np.mean(y1)) * (y2 - np.mean(y2)))
    # Normalize with the subseries stds
    return sum_product / ((len(x) - lag) * np.std(y1) * np.std(y2))

def acf_by_hand(x, lag):
    # Slice the relevant subseries based on the lag
    y1 = x[:(len(x) - lag)]
    y2 = x[lag:]
    # Subtract the mean of the whole series x to calculate Cov
    sum_product = np.sum((y1 - np.mean(x)) * (y2 - np.mean(x)))
    # Normalize with the variance of the whole series
    return sum_product / ((len(x) - lag) * np.var(x))

x = np.linspace(0, 100, 101)
results = {}
nlags = 10
results["acf_by_hand"] = [acf_by_hand(x, lag) for lag in range(nlags)]
results["autocorr_by_hand"] = [autocorr_by_hand(x, lag) for lag in range(nlags)]
results["autocorr"] = [pd.Series(x).autocorr(lag) for lag in range(nlags)]
results["acf"] = acf(x, unbiased=True, nlags=nlags - 1)

pd.DataFrame(results).plot(kind="bar", figsize=(10, 5), grid=True)
plt.xlabel("lag")
plt.ylim([-1.2, 1.2])
plt.ylabel("value")
plt.show()
Statsmodels uses np.correlate to optimize this, but this is basically how it works.
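As a small illustrative sketch (my addition, not from the answer), the default non-adjusted acf can indeed be reproduced with a single np.correlate call, normalizing by the zero-lag value; this is the vectorized form of the overall-mean approach above:

def acf_np(x, nlags):
    # de-mean once with the overall mean, then correlate the series with itself;
    # slicing from index len(x) - 1 keeps lags 0, 1, 2, ...
    xm = x - x.mean()
    corr = np.correlate(xm, xm, mode="full")[len(x) - 1:]
    return corr[:nlags + 1] / corr[0]  # normalize by the zero-lag value

print(acf_np(x, 5))  # matches acf(x) at default (non-adjusted) settings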
As suggested in the comments, the discrepancy can be reduced, but not completely resolved, by supplying unbiased=True to the statsmodels function. Using random input:
import statistics

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import acf

DATA_LEN = 100
N_TESTS = 100
N_LAGS = 32

def test(unbiased):
    data = pd.Series(np.random.random(DATA_LEN))
    data_acf_1 = acf(data, unbiased=unbiased, nlags=N_LAGS)
    data_acf_2 = [data.autocorr(i) for i in range(N_LAGS + 1)]
    # return the total absolute difference between the two results
    return sum(abs(data_acf_1 - data_acf_2))

for value in (False, True):
    diffs = [test(value) for _ in range(N_TESTS)]
    print(value, statistics.mean(diffs))
Output:
False 0.464562410987
True 0.0820847168593
In the following example, the Pandas autocorr() function gives the expected results, but the statsmodels acf() function does not.
Consider the following series:
import pandas as pd
s = pd.Series(range(10))
We expect perfect correlation between this series and any of its lagged versions, and this is indeed what we get with the autocorr() function:
[ s.autocorr(lag=i) for i in range(10) ]
# [0.9999999999999999, 1.0, 1.0, 1.0, 1.0, 0.9999999999999999, 1.0, 1.0, 0.9999999999999999, nan]
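This matches np.corrcoef on the lagged subseries, which is what autocorr computes under the hood; a quick check:

import numpy as np
lag = 3
print(np.corrcoef(s[:-lag], s[lag:])[0, 1])  # 1.0: any two slices of a line are perfectly correlated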
But using acf() we get a different result:
from statsmodels.tsa.stattools import acf
acf(s)
# [ 1. 0.7 0.41212121 0.14848485 -0.07878788
# -0.25757576 -0.37575758 -0.42121212 -0.38181818 -0.24545455]
If we try acf with adjusted=True, the result is even more unexpected, because for some lags the result is less than -1 (note that a correlation has to be in [-1, 1]):
acf(s, adjusted=True)  # 'unbiased' is deprecated and 'adjusted' should be used instead
# [ 1. 0.77777778 0.51515152 0.21212121 -0.13131313
# -0.51515152 -0.93939394 -1.4040404 -1.90909091 -2.45454545]
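A quick cross-check of why these values escape [-1, 1]: the adjusted estimator divides each lag-k sum by (n - k) while still normalizing by the overall variance, so for a strongly trending series the small (n - k) denominators inflate the late lags. This snippet just re-derives the printed numbers:

import numpy as np

s = np.arange(10, dtype=float)
n = len(s)
xm = s - s.mean()
# adjusted autocovariance at each lag, normalized by the overall (population) variance
adjusted_acf = [np.sum(xm[:n - k] * xm[k:]) / ((n - k) * s.var()) for k in range(n)]
print(np.round(adjusted_acf, 8))  # last value: -2.45454545, matching acf(s, adjusted=True)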