Wrong fit when using k nearest neighbors regression - python

I use the nearest neighbors method to predict the price of a stock. I have raw data in example.txt file. I use the close column (price at the end of the period = 1 minute). Linear regression predicts well (shown in green). But the method of nearest neighbors works only at the beginning and then turns into a straight line, please tell me how to fix this? Here is my code I wrote:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
class Reader:
def __init__(self, filename='example.txt'):
self.filename = filename
def read(self):
try:
file = open(self.filename)
return file.read()
except IOError:
return "File not found"
def main():
x = Reader('example.txt')
print(x.read())
class Regression:
def __init__(self, window, P0, Ptest, i):
self.window = window
self.P0 = P0
self.Ptest = Ptest
self.i = i
self.data_train = self.get_data_train()
self.x_train = self.get_x_train()
self.y_train = self.get_y_train()
self.data_test = self.get_data_test()
self.x_test = self.get_x_test()
self.y_test = self.get_y_test()
def get_data_train(self):
""" Method of obtaining data train on prices for the entire period."""
x = Reader('example.txt')
data = x.read().splitlines()
close_column = [x.split(',')[7] for x in data][1:]
result = [float(item) for item in close_column]
relative_price = result[:int(len(result)*P0)]
return relative_price
def get_data_test(self):
""" Method of obtaining data test on prices for the entire period."""
x = Reader('example.txt')
data = x.read().splitlines()
close_column = [x.split(',')[7] for x in data][1:]
result = [float(item) for item in close_column]
len_x_test = int(len(result) * Ptest)
len_x_train = int(len(result) * P0)
relative_price = result[(len_x_train + (len_x_test * self.i)): len_x_train + len_x_test
* (self.i + 1)]
return relative_price
def get_x_train(self):
x = []
for i in range(len(self.data_train)):
if i + self.window < len(self.data_train):
x.append(self.data_train[i: i + self.window])
return x
def get_y_train(self):
y = []
for i in self.data_train[self.window:]:
y += [i]
return y
def get_x_test(self):
x = []
for i in range(len(self.data_test)):
if i + self.window < len(self.data_test):
x.append(self.data_test[i: i + self.window])
return x
def get_y_test(self):
y = []
for i in self.data_test[self.window:]:
y += [i]
return y
class Linear_regression(Regression):
def callculate(self):
reg_linear = LinearRegression().fit(self.x_train, self.y_train)
y_pred = reg_linear.predict(self.x_test)
return y_pred
class Nearest_neighbor(Regression):
def callculate(self):
reg_neighbor = KNeighborsRegressor(n_neighbors=window, weights='distance')
reg_neighbor.fit(self.x_train, self.y_train)
y_pred = reg_neighbor.predict(self.x_test)
return y_pred
window = 10
Pk = 1
P0 = 0.1
Ptest = 0.01
k = (Pk - P0)/Ptest
i = 0
y_real = []
y_neigh = []
y_lin = []
while i < k:
lin_price = list(Linear_regression(window, P0, Ptest, i).callculate())
neighbor = list(Nearest_neighbor(window, P0, Ptest, i).callculate())
y_neigh.extend(neighbor)
y_lin.extend(lin_price)
y_real.extend(list(Linear_regression(window, P0, Ptest, i).y_test))
i += 1
""" Output to graphs of the received data """
fig, ax = plt.subplots()
ax.plot(y_real, label='Initial data')
ax.plot(y_neigh, label='Nearest Neighbor Data')
ax.plot(y_lin, label='Linear Regression Data')
ax.set_xlabel('Time (min)')
ax.set_ylabel('Price, ($)')
ax.legend()
plt.show()

"Linear regression predicts well"
No, it never predicted well. You just looked at the graph and thought it looked kind of similar. But if you look more closely, your 'model' simply takes the price of a bit ago as the prediction of the price now. That means, it's not predicting anything! It's a history device, not a prediction device.
That's why if you feed back this sort of 'model' into itself you get a straight line: it always predicts the next price is going to be equal to the last one.

Related

Fitting model to data using scipy differential evolution: "RuntimeError: The map-like callable must be of the form f(func, iterable)..."

I am trying to fit a model to data (extracted from an Excel file and imported using pandas), using a likelihood method. However, when running the code I get a "RuntimeError: The map-like callable must be of the form f(func, iterable), returning a sequence of numbers the same length as 'iterable'" error, which occurred at the "result_simul_G = minimize(negLogLike, params, method = 'differential_evolution', args=(x, y),)" line. Below I have my code; it's very integrated so I couldn't find a way to illustrate what's happening without showing most of it.
#================================================================================
import numpy as np
import pandas as pd
import os
from lmfit import minimize, Parameters, Parameter, report_fit
params = Parameters()
params.add('gamma', value=.45, min=0, max=1, vary = True)
params.add('n', value = 1, min=0, max=3, vary = True)
filename = 'data.xlsx'
#================================================================================
def negLogLike(params, xData, yData):
new_xData = []
new_yData = []
for i in range(len(yData)):
if ((yData[i] != 0) and (xData[i] != 0)):
new_xData.append(xData[i])
new_yData.append(yData[i])
model_result = model(new_xData, params)
nll = 0
epsilon = 10**-10
for i in range(len(new_yData)):
if (model_result[i] < epsilon):
model_result[i] = epsilon
if (model_result[i] > 1 - epsilon):
model_result[i] = 1 - epsilon
nll += new_yData[i] * np.log(model_result[i]) + (1 - new_yData[i]) * np.log(1 - model_result[i])
return -nll
#================================================================================
def model(x, params):
try: # Get parameters
g = params['gamma'].value
n = params['n'].value
except KeyError:
g, n = params
y = 1 - np.exp(-g * x**n)
return y
#================================================================================
def GetFits(DataFrame):
cell_count = 2300000
GFP_GC_SIMUL = np.ones(DataFrame.shape[0], float)
GFP_IC_SIMUL = np.ones(DataFrame.shape[0], float)
# Data
for i in range(DataFrame.shape[0]):
GFP_GC_SIMUL[i] = DataFrame.loc[i, 'GFP genomes'] / cell_count
GFP_IC_SIMUL[i] = DataFrame.loc[i, 'GFP IU'] / cell_count
x = np.array(GFP_GC_SIMUL[10:-10])
y = np.array(GFP_IC_SIMUL[10:-10])
print('len=', len(x), x.dtype, ', x=', x)
print('------------------------')
print('len=', len(y), y.dtype, ', y=', y)
result_simul_G = minimize(negLogLike, params, method = 'differential_evolution', args=(x, y),)
#================================================================================
DataFrame = pd.read_excel('data.xlsx', engine='openpyxl')
GetFits(DataFrame)
When debugging on my own I used print statements to see what x and y data was being supplied to the minimizer and this is what it showed:
len= 34 float64 , x= [0.14478261 0.28695652 0.28695652 0.28695652 0.57391304 0.57391304
0.57391304 0.8738913 0.8738913 0.8738913 1.16086957 1.16086957
1.16086957 1.44780435 1.44780435 1.44780435 1.73478261 1.73478261
1.73478261 2.03476087 2.03476087 2.03476087 2.32173913 2.32173913
2.32173913 2.60869565 2.60869565 2.60869565 2.86956522 2.86956522
2.86956522 7.17391304 7.17391304 7.17391304]
------------------------
len= 34 float64 , y= [0.005 0.01180435 0.01226087 0.01158696 0.036 0.03704348
0.03467391 0.07030435 0.06556522 0.07567391 0.1001087 0.09852174
0.0986087 0.13626087 0.13978261 0.13956522 0.16847826 0.16408696
0.19391304 0.1945 0.21319565 0.19052174 0.32204348 0.23330435
0.25028261 0.28136957 0.26293478 0.25893478 0.28273913 0.29717391
0.273 0.60826087 0.60834783 0.59482609]
I know this is quite a lot but I would appreciate any and all help.

How to find feature Interactions between all columns in a dataframe, Python?

Friedman’s H-statistic The interpretable ML book by Christoph Molnar actually gives us a workable approach, by using Friedman’s H-statistic based on the decomposition of the partial dependence values to calculate the feature interactions.
In Python, sklearn_gbmi will accept feature sets of length two and higher but does not provide support for the first-order measure, very similar to interact.gbm in R. It only works on gradient boosting based models
I found a manual Python implementation from here, posted below for reference, where the feature interactions were calculated.
import itertools
import math
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from pdpbox.pdp_calc_utils import _calc_ice_lines_inter
from pdpbox.pdp import pdp_isolate, PDPInteract
from pdpbox.utils import (_check_model, _check_dataset, _check_percentile_range, _check_feature,
_check_grid_type, _check_memory_limit, _make_list,
_calc_memory_usage, _get_grids, _get_grid_combos, _check_classes)
from joblib import Parallel, delayed
def pdp_multi_interact(model, dataset, model_features, features,
num_grid_points=None, grid_types=None, percentile_ranges=None, grid_ranges=None,
cust_grid_points=None,
cust_grid_combos=None, use_custom_grid_combos=False,
memory_limit=0.5, n_jobs=1, predict_kwds=None, data_transformer=None):
def _expand_default(x, default, length):
if x is None:
return [default] * length
return x
def _get_grid_combos(feature_grids, feature_types):
grids = [list(feature_grid) for feature_grid in feature_grids]
for i in range(len(feature_types)):
if feature_types[i] == 'onehot':
grids[i] = np.eye(len(grids[i])).astype(int).tolist()
return np.stack(np.meshgrid(*grids), -1).reshape(-1, len(grids))
if predict_kwds is None:
predict_kwds = dict()
nr_feats = len(features)
# check function inputs
n_classes, predict = _check_model(model=model)
_check_dataset(df=dataset)
_dataset = dataset.copy()
# prepare the grid
pdp_isolate_outs = []
if use_custom_grid_combos:
grid_combos = cust_grid_combos
feature_grids = []
feature_types = []
else:
num_grid_points = _expand_default(x=num_grid_points, default=10, length=nr_feats)
grid_types = _expand_default(x=grid_types, default='percentile', length=nr_feats)
for i in range(nr_feats):
_check_grid_type(grid_type=grid_types[i])
percentile_ranges = _expand_default(x=percentile_ranges, default=None, length=nr_feats)
for i in range(nr_feats):
_check_percentile_range(percentile_range=percentile_ranges[i])
grid_ranges = _expand_default(x=grid_ranges, default=None, length=nr_feats)
cust_grid_points = _expand_default(x=cust_grid_points, default=None, length=nr_feats)
_check_memory_limit(memory_limit=memory_limit)
pdp_isolate_outs = []
for idx in range(nr_feats):
pdp_isolate_out = pdp_isolate(
model=model, dataset=_dataset, model_features=model_features, feature=features[idx],
num_grid_points=num_grid_points[idx], grid_type=grid_types[idx], percentile_range=percentile_ranges[idx],
grid_range=grid_ranges[idx], cust_grid_points=cust_grid_points[idx], memory_limit=memory_limit,
n_jobs=n_jobs, predict_kwds=predict_kwds, data_transformer=data_transformer)
pdp_isolate_outs.append(pdp_isolate_out)
if n_classes > 2:
feature_grids = [pdp_isolate_outs[i][0].feature_grids for i in range(nr_feats)]
feature_types = [pdp_isolate_outs[i][0].feature_type for i in range(nr_feats)]
else:
feature_grids = [pdp_isolate_outs[i].feature_grids for i in range(nr_feats)]
feature_types = [pdp_isolate_outs[i].feature_type for i in range(nr_feats)]
grid_combos = _get_grid_combos(feature_grids, feature_types)
feature_list = []
for i in range(nr_feats):
feature_list.extend(_make_list(features[i]))
# Parallel calculate ICE lines
true_n_jobs = _calc_memory_usage(
df=_dataset, total_units=len(grid_combos), n_jobs=n_jobs, memory_limit=memory_limit)
grid_results = Parallel(n_jobs=true_n_jobs)(delayed(_calc_ice_lines_inter)(
grid_combo, data=_dataset, model=model, model_features=model_features, n_classes=n_classes,
feature_list=feature_list, predict_kwds=predict_kwds, data_transformer=data_transformer)
for grid_combo in grid_combos)
ice_lines = pd.concat(grid_results, axis=0).reset_index(drop=True)
pdp = ice_lines.groupby(feature_list, as_index=False).mean()
# combine the final results
pdp_interact_params = {'n_classes': n_classes,
'features': features,
'feature_types': feature_types,
'feature_grids': feature_grids}
if n_classes > 2:
pdp_interact_out = []
for n_class in range(n_classes):
_pdp = pdp[feature_list + ['class_%d_preds' % n_class]].rename(
columns={'class_%d_preds' % n_class: 'preds'})
pdp_interact_out.append(
PDPInteract(which_class=n_class,
pdp_isolate_outs=[pdp_isolate_outs[i][n_class] for i in range(nr_feats)],
pdp=_pdp, **pdp_interact_params))
else:
pdp_interact_out = PDPInteract(
which_class=None, pdp_isolate_outs=pdp_isolate_outs, pdp=pdp, **pdp_interact_params)
return pdp_interact_out
def center(arr): return arr - np.mean(arr)
def compute_f_vals(mdl, X, features, selectedfeatures, num_grid_points=10, use_data_grid=False):
f_vals = {}
data_grid = None
if use_data_grid:
data_grid = X[selectedfeatures].values
# Calculate partial dependencies for full feature set
p_full = pdp_multi_interact(mdl, X, features, selectedfeatures,
num_grid_points=[num_grid_points] * len(selectedfeatures),
cust_grid_combos=data_grid,
use_custom_grid_combos=use_data_grid)
f_vals[tuple(selectedfeatures)] = center(p_full.pdp.preds.values)
grid = p_full.pdp.drop('preds', axis=1)
# Calculate partial dependencies for [1..SFL-1]
for n in range(1, len(selectedfeatures)):
for subsetfeatures in itertools.combinations(selectedfeatures, n):
if use_data_grid:
data_grid = X[list(subsetfeatures)].values
p_partial = pdp_multi_interact(mdl, X, features, subsetfeatures,
num_grid_points=[num_grid_points] * len(selectedfeatures),
cust_grid_combos=data_grid,
use_custom_grid_combos=use_data_grid)
p_joined = pd.merge(grid, p_partial.pdp, how='left')
f_vals[tuple(subsetfeatures)] = center(p_joined.preds.values)
return f_vals
# the second-order H-measure:
def compute_h_val(f_vals, selectedfeatures):
denom_els = f_vals[tuple(selectedfeatures)].copy()
numer_els = f_vals[tuple(selectedfeatures)].copy()
sign = -1.0
for n in range(len(selectedfeatures)-1, 0, -1):
for subfeatures in itertools.combinations(selectedfeatures, n):
numer_els += sign * f_vals[tuple(subfeatures)]
sign *= -1.0
numer = np.sum(numer_els**2)
denom = np.sum(denom_els**2)
return math.sqrt(numer/denom) if numer < denom else np.nan
# first-order H-measure as well:
def compute_h_val_any(f_vals, allfeatures, selectedfeature):
otherfeatures = list(allfeatures)
otherfeatures.remove(selectedfeature)
denom_els = f_vals[tuple(allfeatures)].copy()
numer_els = denom_els.copy()
numer_els -= f_vals[(selectedfeature,)]
numer_els -= f_vals[tuple(otherfeatures)]
numer = np.sum(numer_els**2)
denom = np.sum(denom_els**2)
return math.sqrt(numer/denom) if numer < denom else np.nan
df = sns.load_dataset("diamonds")
data = pd.get_dummies(df, ["cut", "color", "clarity"])
X = data.drop("cut_Ideal", axis=1)
y = data["cut_Ideal"]
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size = 0.33,
random_state = 42)
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
max_depth=1, random_state=0).fit(X_train, y_train)
f_val = compute_f_vals(gbc, X, X.columns, ['carat', 'depth'], num_grid_points=10, use_data_grid=False)
# second-order H-measure:
compute_h_val(f_val, ['carat', 'depth'])
I want to calculate feature interactions for all the columns in a dataframe. How could I do that?
I am not here to avail free code writing service I just want to capture a little bit of knowledge from the experienced programmers with discussing things. I was just expecting a suggestion/reference for the appropriate library of methods on finding the feature interactions?

Adaptive DBSCAN achievement

I am doing the DBSCAN clustering in python. I want to achieve an adaptive way to return the number of clusters by self calculating its eps and Minpts parameters. Below is my code.
import math
import copy
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
def loadDataSet(fileName, splitChar='\t'):
dataSet = []
with open(fileName) as fr:
for line in fr.readlines():
curline = line.strip().split(splitChar)
fltline = list(map(float, curline))
dataSet.append(fltline)
return dataSet
def dist(a,b):
return math.sqrt(math.pow(a[0]-b[0],2) + math.pow(a[1]-b[1],2))
def returnDk(matrix,k):
Dk = []
for i in range(len(matrix)):
Dk.append(matrix[i][k])
return Dk
def returnDkAverage(Dk):
sum = 0
for i in range(len(Dk)):
sum = sum + Dk[i]
return sum/len(Dk)
def CalculateDistMatrix(dataset):
DistMatrix = [[0 for j in range(len(dataset))] for i in range(len(dataset))]
for i in range(len(dataset)):
for j in range(len(dataset)):
DistMatrix[i][j] = dist(dataset[i], dataset[j])
return DistMatrix
def returnEpsCandidate(dataSet):
DistMatrix = CalculateDistMatrix(dataSet)
tmp_matrix = copy.deepcopy(DistMatrix)
for i in range(len(tmp_matrix)):
tmp_matrix[i].sort()
EpsCandidate = []
for k in range(1,len(dataSet)):
Dk = returnDk(tmp_matrix,k)
DkAverage = returnDkAverage(Dk)
EpsCandidate.append(DkAverage)
return EpsCandidate
def returnMinptsCandidate(DistMatrix,EpsCandidate):
MinptsCandidate = []
for k in range(len(EpsCandidate)):
tmp_eps = EpsCandidate[k]
tmp_count = 0
for i in range(len(DistMatrix)):
for j in range(len(DistMatrix[i])):
if DistMatrix[i][j] <= tmp_eps:
tmp_count = tmp_count + 1
MinptsCandidate.append(tmp_count/len(dataSet))
return MinptsCandidate
def returnClusterNumberList(dataset,EpsCandidate,MinptsCandidate):
np_dataset = np.array(dataset)
ClusterNumberList = []
for i in range(len(EpsCandidate)):
clustering = DBSCAN(eps= EpsCandidate[i],min_samples= MinptsCandidate[i]).fit(np_dataset)
num_clustering = max(clustering.labels_)
ClusterNumberList.append(num_clustering)
return ClusterNumberList
if __name__ == '__main__':
data = pd.read_csv('/Users/Desktop/Mic/recorder_test1/New folder/MFCCresultsforclustering/MFCCresultsforclustering.csv')
dataSet = data.iloc[:,0:13].values
EpsCandidate = returnEpsCandidate(dataSet)
DistMatrix = CalculateDistMatrix(dataSet)
MinptsCandidate = returnMinptsCandidate(DistMatrix,EpsCandidate)
ClusterNumberList = returnClusterNumberList(dataSet,EpsCandidate,MinptsCandidate)
print(EpsCandidate)
print(MinptsCandidate)
print('cluster number list is')
print(ClusterNumberList)
However, the output with the loading data set is all [-1]s. I am wondering where is the mistake. Am I right for this general direction? If not, how can I achieve the adaptive DBSCAN clustering?

Naive Bayes from scratch in python with result 'Process finished with exit code 0'

I am new to PyCharm and I have found two codes online on Classification Techniques, using Naive Bayes classification. this code doesn't have an error. but I can see the result, even though I use print(). I'm using library iris dataset. and this is my code
import csv
import math
import random
import pandas as pd
from sklearn import datasets
def loadCsv(filename):
#lines = csv.reader(open(r'E:\KULIAH\TUGAS AKHIR\MachineLearning\kananniih.csv'))
lines = datasets.load_iris()
print(lines)
dataset = list(lines)
for i in range(len(dataset)):
dataset[i] = [float(x) for x in dataset[i]]
return dataset;
#spliit dataa
def splitDataset(dataset, splitRatio):
trainSize = int(len(dataset) * splitRatio)
trainSet = []
copy = list(dataset)
while len(trainSet) < trainSize:
index = random.randrange(len(copy))
trainSet.append(copy.pop(index))
return [trainSet, copy]
#dikumpulkan berdasar kelas
def separateByClass(dataset):
separated = {}
for i in range(len(dataset)):
vector = dataset[i]
if (vector[-1] not in separated):
separated[vector[-1]] = []
separated[vector[-1]].append(vector)
return separated
#hitung mean
def mean(numbers):
return sum(numbers)/float(len(numbers))
#hitung standard deviasi
def stdev(numbers):
avg = mean(numbers)
variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
return math.sqrt(variance)
#hitung jumlah dataset
def summarize(dataset):
summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
del summaries[-1]
return summaries
#hitung atribut tiap kelas
def summarizeByClass(dataset):
separated = separateByClass(dataset)
summaries = {}
for classValue, instances in separated.items():
summaries[classValue] = summarize(instances)
return summaries
#hitung Gaussian PDF
def calculateProbability(x, mean, stdev):
exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
return (1/(math.sqrt(2*math.pi)*stdev))*exponent
#hitung probabilitas kelas
def calculateClassProbabilities(summaries, inputVector):
probabilities = {}
for classValue, classSummaries in summaries.items():
probabilities[classValue] = 1
for i in range(len(classSummaries)):
mean, stdev = classSummaries[i]
x = inputVector[i]
probabilities[classValue] *= calculateProbability(x, mean, stdev)
return probabilities
#make prediction
def predict(summaries, inputVector):
probabilities = calculateClassProbabilities(summaries, inputVector)
bestLabel, bestProb = None, -1
for classValue, probability in probabilities.items():
if bestLabel is None or probability > bestProb:
bestProb = probability
bestLabel = classValue
return bestLabel
#make prediction
def getPredictions(summaries, testSet):
predictions = []
for i in range(len(testSet)):
result = predict(summaries, testSet[i])
predictions.append(result)
return predictions
#get accurancy
def getAccuracy(testSet, predictions):
correct = 0
for i in range(len(testSet)):
if testSet[i][-1] == predictions[i]:
correct += 1
return (correct / float(len(testSet))) * 100.0
def main():
filename = datasets.load_iris()
splitRatio = 0.67
dataset = loadCsv(filename)
print(dataset)
trainingSet, testSet = splitDataset(dataset, splitRatio)
print(('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(trainingSet),len(testSet)))
# prepare model
summaries = summarizeByClass(trainingSet)
# test model
predictions = getPredictions(summaries, testSet)
accuracy = getAccuracy(testSet, predictions)
print(('Accuracy: {0}%').format(accuracy))
main()
could you guys help me out, please? Much appreciate it!
Best regards, Eliya

Optimizing performance of SciPy Minimize while using Concurrent.Futures?

I'm trying to figure out how to speed up a scipy.minimize function.
minimize() is called thousands of times. I run it in parallel using a ProcessPoolExecutor. bootstrap() is the parent function.
def optimize_weights(forecasts, prices, instrument):
guess = [1/forecasts.shape[1]] * forecasts.shape[1]
bounds = [(0.0,1.0)] * forecasts.shape[1]
cons = {'type': 'eq', 'fun': lambda x: 1 - sum(x)}
def function(w, forecasts, prices, instrument):
wf = (w*forecasts).mean(axis=1)
wf = wf*10/wf.std()
wf = wf.clip(-20,20)
l = accountCurve(wf, prices, slippage=instrument.slippage, per_contract_cost=instrument.per_contract_cost)
return -l.sharpe()
result = minimize(function, guess, (forecasts, prices, instrument), bounds=bounds, method='SLSQP', tol=0.0001, constraints=cons, options={'disp': False ,'eps' : 1e0})
return result.x
def mp_optimize_weights(samples, prices, instrument):
with ProcessPoolExecutor() as executor:
return executor.map(partial(optimize_weights, prices=prices, instrument=instrument), samples)
def bootstrap(instrument, parallel_process=True):
print("Parallel Process: ", parallel_process)
forecasts = instrument.forecasts().dropna()
prices = instrument.prices().reset_index('Contract', drop=True)
prices = prices[forecasts.index]
years = list(set(prices.index.year))
years.sort()
result={}
for year in years:
sample_length = np.int(prices[:str(year)].size/10)
end_of_sample_selection_space = prices[:str(year)].tail(1).index[0] - pd.Timedelta(days=sample_length)
sample_dates = pd.to_datetime(np.random.choice(prices[:end_of_sample_selection_space].index,100))
if(sample_length > 50):
samples = [forecasts.loc[date:date+pd.Timedelta(days=sample_length)] for date in sample_dates]
if parallel_process is True:
weights = pd.DataFrame(list(mp_optimize_weights(samples, prices[:str(year)], instrument=instrument)))
else:
weights = pd.DataFrame(list(map(partial(optimize_weights, prices=prices[:str(year)], instrument=instrument), samples)))
if len(weights)<2:
print('Weights error')
break
result[year]=weights.mean()
print(year, sample_length)
output = pd.DataFrame.from_dict(result).transpose()
output.columns = forecasts.columns
pl.plot(output)
display.clear_output(wait=True)
display.display(pl.gcf())
return output
import numpy as np
import pandas as pd
class accountCurve():
def __init__(self, forecasts, prices, annual_volatility_target=0.25, multiplier = 1, per_contract_cost = 0, slippage = 0, capital=100000, costs=True):
if prices.index.names[0] == 'Contract':
prices = prices.reset_index('Contract', drop=True)
#Adjust for contract multiplier/pricing in pennies
prices = prices*multiplier/100
self.prices = prices
daily_volatility_target = annual_volatility_target/np.sqrt(252)
instrument_value_volatility = prices.diff().ewm(span=36, min_periods=36).std()
self.notional_position = (forecasts * daily_volatility_target * capital).divide(10.0 * instrument_value_volatility[forecasts.index], axis=0)
self.notional_position.dropna(inplace=True)
#Chunk trades to be at least 10% move in position (to reduce trading costs)
self.position = chunk_trades(self.notional_position)
#Round positions to integers
self.position = np.around(self.notional_position)
#self.position.mark_to_market = self.position.multiply(prices, axis=0)
self.trades = self.position.diff()
#Calculate returns
self.gross_returns = (prices.diff().shift(-1)*self.position).dropna()
if costs:
self.costs = self.trades.abs() * per_contract_cost
self.slippage = self.trades.abs() * slippage * multiplier/100
self.returns = (self.gross_returns - self.slippage - self.costs).dropna()
else:
self.returns = self.gross_returns
def sharpe(self):
return self.returns.mean()/self.returns.std()*np.sqrt(252)
def losses(self):
return [z for z in self.returns if z<0]
def sortino(self):
returns = self.returns.pct_change()
return returns.mean()/np.std(losses(returns))*np.sqrt(252)
def plot(self):
self.returns.cumsum().plot()
def chunk_trades(A):
#Take a list of notional positions and filter so that trades are only greater than 10% of notional position
last = A[0]
new = []
for x in A.iteritems():
if np.abs((x[1]-last)/last) > 0.1:
new.append(x[1])
last = x[1]
else:
new.append(last)
s = pd.Series(new, index=A.index)
return s
On my data, this takes around 45 minutes to run.
I'd like to know:
Is my approach to parallel processing correct? Should I be using threads instead of processes?
Can I reconfigure minimize to finish faster? This is bootstrapping, which is a monte-carlo based sampling method, may not require such an accurate result.
Anything else I can do to speed it up?
In an ideal world, I'd like to speed it up an order of magnitude.

Categories