How to continue after gradient descent? - python

I am very new to Data Science and Python. After a few hours of experimentation, I finally obtained values from my gradient descent (code below). I am having trouble with plotting, though: how can I plot the regression line automatically after the algorithm has finished?
import numpy as np;
import matplotlib.pyplot as plt;
import csv
import pandas as pd
def gradient_descent(x, y, iterations=5000, learning_rate=0.01, verbose=True):
    """Fit ``y = m*x + b`` by batch gradient descent on the mean squared error.

    Args:
        x: Feature values (any array-like); flattened to 1-D so that column
            vectors such as ``(n, 1)`` arrays do not broadcast against ``y``.
        y: Target values with the same number of elements as ``x``.
        iterations: Number of gradient steps to take.
        learning_rate: Step size applied to both gradients.
        verbose: When true, print the parameters and cost after every step.

    Returns:
        Tuple ``(m, b)`` with the fitted slope and intercept as floats, so the
        caller can draw the regression line.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    if x.size == 0 or x.size != y.size:
        raise ValueError("x and y must be non-empty and of equal length")

    n = x.size
    m_curr = b_curr = 0.0
    for i in range(iterations):
        residual = y - (m_curr * x + b_curr)
        cost = np.mean(residual ** 2)
        # Partial derivatives of the MSE with respect to m and b.
        md = -(2 / n) * np.sum(x * residual)
        bd = -(2 / n) * np.sum(residual)
        m_curr -= learning_rate * md
        b_curr -= learning_rate * bd
        if verbose:
            print("m{}, b{}, cost {}, iteration {}".format(m_curr, b_curr, cost, i))
    return float(m_curr), float(b_curr)


if __name__ == '__main__':
    # Reading data -> DataFrame with two float64 columns.
    data = pd.read_csv('ex1data1.txt', sep=',', header=None, names=['Feature', 'Label'])

    # Plain 1-D arrays are all the optimiser needs.
    x = data['Feature'].to_numpy()
    y = data['Label'].to_numpy()
    m, b = gradient_descent(x, y)

    # Scatter of the raw data with the fitted regression line on top.
    ax = data.plot(x='Feature', y='Label', kind='scatter')
    line_x = np.array([x.min(), x.max()])
    ax.plot(line_x, m * line_x + b, color='red',
            label='y = {:.3f}x + {:.3f}'.format(m, b))
    ax.legend()
    plt.show()

Related

Run Different Scikit-learn Clustering Algorithms on Dataset

I have a dataframe like below. The shape is (24,7)
Name x1 x2 x3 x4 x5 x6
Harry 102 204 0.43 0.21 1.02 0.39
James 242 500 0.31 0.11 0.03 0.73
.
.
.
Mike 3555 4002 0.12 0.03 0.52. 0.11
Henry 532 643 0.01 0.02 0.33 0.10
I want to run Scikit-learn's Different Clustering Algorithms Script on the above dataframe. However, the input data looks quite confusing, not too sure how to input my dataframe
https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html#sphx-glr-auto-examples-cluster-plot-cluster-comparison-py
There are two main differences between your scenario and the scikit-learn example you link to:
You only have one dataset, not several different ones to compare.
You have six features, not just two.
Point one allows you to simplify the example code by deleting the loops over the different datasets and related calculations. Point two implies that you cannot easily plot your results. Instead, you could just add the predicted class labels found by each algorithm to your dataset.
So you could modify the example code like this:
import time
import warnings
import numpy as np
import pandas as pd
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
np.random.seed(0)
# ============
# Introduce your dataset
# ============
my_df = # Insert your data here, as a pandas dataframe.
features = [f'x{i}' for i in range(1, 7)]
X = my_df[features].values
# ============
# Set up cluster parameters
# ============
params = {
"quantile": 0.3,
"eps": 0.3,
"damping": 0.9,
"preference": -200,
"n_neighbors": 3,
"n_clusters": 3,
"min_samples": 7,
"xi": 0.05,
"min_cluster_size": 0.1,
}
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)
# estimate bandwidth for mean shift
bandwidth = max(cluster.estimate_bandwidth(X, quantile=params["quantile"]),
0.001) # arbitrary correction to avoid 0
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(
X, n_neighbors=params["n_neighbors"], include_self=False
)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
# ============
# Create cluster objects
# ============
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params["n_clusters"])
ward = cluster.AgglomerativeClustering(
n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity
)
spectral = cluster.SpectralClustering(
n_clusters=params["n_clusters"],
eigen_solver="arpack",
affinity="nearest_neighbors",
)
dbscan = cluster.DBSCAN(eps=params["eps"])
optics = cluster.OPTICS(
min_samples=params["min_samples"],
xi=params["xi"],
min_cluster_size=params["min_cluster_size"],
)
affinity_propagation = cluster.AffinityPropagation(
damping=params["damping"], preference=params["preference"], random_state=0
)
average_linkage = cluster.AgglomerativeClustering(
linkage="average",
affinity="cityblock",
n_clusters=params["n_clusters"],
connectivity=connectivity,
)
birch = cluster.Birch(n_clusters=params["n_clusters"])
gmm = mixture.GaussianMixture(
n_components=params["n_clusters"], covariance_type="full"
)
clustering_algorithms = (
("MiniBatch\nKMeans", two_means),
("Affinity\nPropagation", affinity_propagation),
("MeanShift", ms),
("Spectral\nClustering", spectral),
("Ward", ward),
("Agglomerative\nClustering", average_linkage),
("DBSCAN", dbscan),
("OPTICS", optics),
("BIRCH", birch),
("Gaussian\nMixture", gmm),
)
# Fit every algorithm on the scaled feature matrix and store its labels as a
# new column of ``my_df`` (one column per algorithm, named after it).
for name, algorithm in clustering_algorithms:
    # catch warnings related to kneighbors_graph
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="the number of connected components of the "
            + "connectivity matrix is [0-9]{1,2}"
            + " > 1. Completing it to avoid stopping the tree early.",
            category=UserWarning,
        )
        warnings.filterwarnings(
            "ignore",
            message="Graph is not fully connected, spectral embedding"
            + " may not work as expected.",
            category=UserWarning,
        )
        algorithm.fit(X)

    # Estimators without a ``labels_`` attribute need an explicit predict call.
    if hasattr(algorithm, "labels_"):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)

    # Add cluster labels to the dataset
    my_df[name] = y_pred
PS: in the functions below, replace the `X_data.iloc[:20000]` slice with your own feature data X.
import numpy as np
import matplotlib as plt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import cluster, metrics
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn import preprocessing
from collections import Counter
from sklearn.cluster import DBSCAN
from sklearn import mixture
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
comp_model = pd.DataFrame(columns=['Model', 'Score_Silhouette',
'num_clusters', 'size_clusters',
'parameters'])
K-Means :
def k_means(X_data, nb_clusters, model_comp):
    """Run KMeans for several cluster counts and log the results.

    Args:
        X_data: DataFrame of numeric features; only the first 20000 rows are
            used.
        nb_clusters: Iterable of cluster counts to try.
        model_comp: DataFrame that collects one result row per fitted model.

    Returns:
        A new DataFrame: ``model_comp`` followed by one row per cluster count
        (silhouette score, number of clusters, cluster sizes, parameters).

    Side effects:
        Shows an elbow plot of inertia against the number of clusters.
    """
    ks = list(nb_clusters)
    X_scaled = preprocessing.StandardScaler().fit_transform(
        X_data.iloc[:20000].values)

    inertias = []
    rows = []
    for num_clusters in ks:
        # Create a KMeans instance with k clusters and fit it to the samples
        model = KMeans(n_clusters=num_clusters, n_init=1)
        model.fit(X_scaled)
        inertias.append(model.inertia_)
        rows.append({
            'Model': 'kMeans',
            'Score_Silhouette': metrics.silhouette_score(X_scaled, model.labels_),
            'num_clusters': num_clusters,
            # Counting the amount of data in each cluster
            'size_clusters': Counter(model.labels_),
            'parameters': 'nb_clusters :' + str(num_clusters),
        })

    # DataFrame.append was removed in pandas 2.0; concat works on old and new.
    if rows:
        model_comp = pd.concat([model_comp, pd.DataFrame(rows)],
                               ignore_index=True, sort=False)

    # Plot ks vs inertias
    plt.plot(ks, inertias, '-o')
    plt.xlabel('number of clusters, k')
    plt.ylabel('inertia')
    plt.xticks(ks)
    plt.show()
    return model_comp


# ``pd.np`` no longer exists in current pandas; use numpy directly.
comp_model = k_means(X_data=df,
                     nb_clusters=np.arange(2, 11, 1),
                     model_comp=comp_model)
DBscan :
def dbscan_grid_search(X_data, model_comp, eps_space=0.5,
                       min_samples_space=5, min_clust=0, max_clust=10):
    """Grid-search DBSCAN over ``eps`` and ``min_samples`` and log the results.

    Args:
        X_data: DataFrame of numeric features; only the first 20000 rows are
            used.
        model_comp: DataFrame that collects one result row per accepted run.
        eps_space: A single ``eps`` value or an iterable of values.
        min_samples_space: A single ``min_samples`` value or an iterable.
        min_clust: Smallest accepted number of clusters (noise excluded).
        max_clust: Largest accepted number of clusters (noise excluded).

    Returns:
        A new DataFrame: ``model_comp`` followed by one row per accepted
        hyper-parameter combination.
    """
    # The defaults are scalars, so normalise both spaces to 1-D arrays.
    eps_values = np.atleast_1d(eps_space)
    min_samples_values = np.atleast_1d(min_samples_space)

    X_scaled = preprocessing.StandardScaler().fit_transform(
        X_data.iloc[:20000].values)
    n_samples = len(X_scaled)

    rows = []
    # Looping over each combination of hyperparameters
    for eps_val in eps_values:
        for samples_val in min_samples_values:
            clusters = DBSCAN(eps=eps_val,
                              min_samples=samples_val).fit_predict(X=X_scaled)
            # Counting the amount of data in each cluster (-1 is noise)
            cluster_count = Counter(clusters)
            n_labels = len(cluster_count)
            n_clusters = n_labels - (1 if -1 in cluster_count else 0)

            if not (min_clust <= n_clusters <= max_clust):
                continue
            # silhouette_score is only defined for 2 .. n_samples-1 labels;
            # skip degenerate partitions instead of crashing the whole search.
            if n_labels < 2 or n_labels >= n_samples:
                continue

            rows.append({
                'Model': 'Dbscan',
                'Score_Silhouette': metrics.silhouette_score(X_scaled, clusters),
                'num_clusters': n_clusters,
                'size_clusters': cluster_count,
                'parameters': 'eps :' + str(eps_val)
                              + '+ samples_val :' + str(samples_val),
            })

    # DataFrame.append was removed in pandas 2.0; concat works on old and new.
    if rows:
        model_comp = pd.concat([model_comp, pd.DataFrame(rows)],
                               ignore_index=True, sort=False)
    return model_comp


# ``pd.np`` no longer exists in current pandas; use numpy directly.
comp_model = dbscan_grid_search(X_data=df,
                                model_comp=comp_model,
                                eps_space=np.arange(0.1, 5, 0.6),
                                min_samples_space=np.arange(1, 30, 3),
                                min_clust=2,
                                max_clust=10)
GMM :
def gmm(X_data, nb_clusters, model_comp):
    """Fit Gaussian mixtures for several component counts and log the results.

    Args:
        X_data: DataFrame of numeric features; only the first 20000 rows are
            used.
        nb_clusters: Iterable of component counts to try.
        model_comp: DataFrame that collects one result row per fitted model.

    Returns:
        A new DataFrame: ``model_comp`` followed by one row per component
        count.
    """
    X_scaled = preprocessing.StandardScaler().fit_transform(
        X_data.iloc[:20000].values)

    rows = []
    for num_clusters in nb_clusters:
        # Fit once (the model used to be fitted twice) and keep the estimator
        # in a local that does not shadow this function's own name.
        model = mixture.GaussianMixture(n_components=num_clusters)
        model.fit(X_scaled)
        pred = model.predict(X_scaled)
        rows.append({
            'Model': 'GMM',
            'Score_Silhouette': metrics.silhouette_score(X_scaled, pred),
            'num_clusters': num_clusters,
            'size_clusters': Counter(pred),
            'parameters': 'nb_clusters :' + str(num_clusters),
        })

    # DataFrame.append was removed in pandas 2.0; concat works on old and new.
    if rows:
        model_comp = pd.concat([model_comp, pd.DataFrame(rows)],
                               ignore_index=True, sort=False)
    return model_comp


# ``pd.np`` no longer exists in current pandas; use numpy directly.
comp_model = gmm(X_data=df,
                 nb_clusters=np.arange(2, 11, 1),
                 model_comp=comp_model)
At the end you will have comp_model, which contains all the results of your algorithms. Here I am using three algorithms; afterwards you can select the one that fits you best (based on the silhouette score and the number of clusters).
You should check the repartitions of each cluster :
https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py

Adding rolling optimization into Vectorbt with N months lookback windows

I am working on a backtesting code for a Maximize Sharpe portfolio.
I am using PyPortfolioOpt instead of cvxpy to compute the weights. However, I am confused about where to configure the number of months for the lookback period of this optimization, e.g. a rolling 36-month optimization.
I believe lookback window should be set here, but I am not quite sure how.
def pre_segment_func_nb(c, find_weights_nb, history_len, ann_factor, num_tests, srb_sharpe):
if history_len == -1:
# Look back at the entire time period
close = c.close[:c.i, c.from_col:c.to_col]
else:
# Look back at a fixed time period
if c.i - history_len <= 0:
return (np.full(c.group_len, np.nan),) # insufficient data
close = c.close[c.i - history_len:c.i, c.from_col:c.to_col]
Here is the full code FYR.
import os
import numpy as np
import pandas as pd
import yfinance as yf
from datetime import datetime
import pytz
from numba import njit
import matplotlib.pyplot as plt
import seaborn as sns
from pypfopt.efficient_frontier import EfficientFrontier
from pypfopt import risk_models
from pypfopt import expected_returns
from pypfopt import base_optimizer
from pypfopt import objective_functions
from pypfopt.discrete_allocation import DiscreteAllocation, get_latest_prices
from pypfopt import EfficientSemivariance
from pypfopt.expected_returns import mean_historical_return
from pypfopt.expected_returns import returns_from_prices
import vectorbt as vbt
from vectorbt.generic.nb import nanmean_nb
from vectorbt.portfolio.nb import order_nb, sort_call_seq_nb
from vectorbt.portfolio.enums import SizeType, Direction
from IPython.display import set_matplotlib_formats
%matplotlib inline
%config InlineBackend.figure_format='retina'
symbols = ['NVDA','AMD','TSLA','NET','JPM','AAPL']
start_date = datetime(2012, 1, 1, tzinfo=pytz.utc)
end_date = datetime.today().strftime('%Y-%m-%d')
num_tests = 1000000
vbt.settings.array_wrapper['freq'] = 'days'
vbt.settings.returns['year_freq'] = '252 days'
vbt.settings.portfolio['seed'] = 42
vbt.settings.portfolio.stats['incl_unrealized'] = True
yfdata = vbt.YFData.download(symbols, start=start_date, end=end_date)
ohlcv = yfdata.concat()
price = ohlcv['Close'].fillna(method='ffill')
srb_sharpe = np.full(price.shape[0], np.nan)
@njit
def pre_sim_func_nb(c, every_nth):
    # Define rebalancing days: clear the whole segment mask, then switch on
    # every `every_nth`-th row (starting at row `every_nth`) for all groups.
    c.segment_mask[:, :] = False
    c.segment_mask[every_nth::every_nth, :] = True
    # No extra arguments are passed on to the later callbacks.
    return ()
@njit
def find_weights_nb(c, price, num_tests):
    # Random search for the weights with the best Sharpe ratio.
    #   c         -- simulation context; only `c.group_len` is read here
    #   price     -- 2-D array of prices (rows = time, columns = assets)
    #   num_tests -- number of random weight vectors to try
    # Returns (best_sharpe_ratio, weights). `ann_factor` is read from module
    # scope and must be defined before the first call.
    returns = (price[1:] - price[:-1]) / price[:-1]
    returns = returns[1:, :]  # cannot compute np.cov with NaN
    mean = nanmean_nb(returns)
    cov = np.cov(returns, rowvar=False)  # masked arrays not supported by Numba (yet)
    best_sharpe_ratio = -np.inf
    # np.float_ was removed in NumPy 2.0; np.float64 is the same dtype.
    weights = np.full(c.group_len, np.nan, dtype=np.float64)

    for i in range(num_tests):
        # Generate weights that sum to one
        w = np.random.random_sample(c.group_len)
        w = w / np.sum(w)

        # Compute annualized mean, covariance, and Sharpe ratio
        p_return = np.sum(mean * w) * ann_factor
        p_std = np.sqrt(np.dot(w.T, np.dot(cov, w))) * np.sqrt(ann_factor)
        sharpe_ratio = p_return / p_std
        if sharpe_ratio > best_sharpe_ratio:
            best_sharpe_ratio = sharpe_ratio
            weights = w

    return best_sharpe_ratio, weights
@njit
def pre_segment_func_nb(c, find_weights_nb, history_len, ann_factor, num_tests, srb_sharpe):
    # Runs before each active segment: picks the lookback window, asks
    # `find_weights_nb` for new weights and prepares the order sequence.
    #   history_len -- lookback length in rows of `c.close`; -1 uses every row
    #                  before the current one. This is the value to change for
    #                  a rolling window.
    #   srb_sharpe  -- output array; the best Sharpe ratio is stored at row c.i
    # Returns a one-element tuple holding the weights (all-NaN when there is
    # not enough history yet).
    if history_len == -1:
        # Look back at the entire time period
        close = c.close[:c.i, c.from_col:c.to_col]
    else:
        # Look back at a fixed time period
        if c.i - history_len <= 0:
            return (np.full(c.group_len, np.nan),)  # insufficient data
        close = c.close[c.i - history_len:c.i, c.from_col:c.to_col]

    # Find optimal weights
    best_sharpe_ratio, weights = find_weights_nb(c, close, num_tests)
    srb_sharpe[c.i] = best_sharpe_ratio

    # Update valuation price and reorder orders
    size_type = SizeType.TargetPercent
    direction = Direction.LongOnly
    # np.float_ was removed in NumPy 2.0; np.float64 is the same dtype.
    order_value_out = np.empty(c.group_len, dtype=np.float64)
    for k in range(c.group_len):
        col = c.from_col + k
        c.last_val_price[col] = c.close[c.i, col]
    sort_call_seq_nb(c, weights, size_type, direction, order_value_out)

    return (weights,)
@njit
def order_func_nb(c, weights):
    # Build the order for the column currently being processed: its target
    # percentage comes from `weights`, indexed through the call sequence that
    # the pre-segment callback sorted.
    col_i = c.call_seq_now[c.call_idx]
    return order_nb(
        weights[col_i],
        c.close[c.i, c.col],
        size_type=SizeType.TargetPercent
    )
# `returns` was never defined at module level; derive it from the price frame
# so the annualisation factor can be read from the returns accessor.
returns = price.pct_change()
ann_factor = returns.vbt.returns.ann_factor
def pyopt_find_weights(sc, price, num_tests):  # no @njit decorator = it's a pure Python function
    """Find max-Sharpe weights for one lookback window with PyPortfolioOpt.

    Args:
        sc: Simulation context handed over by the pre-segment callback; not
            used here.
        price: 2-D array of prices for the window, one column per entry of
            the module-level ``symbols`` list.
        num_tests: Unused; kept so the signature matches ``find_weights_nb``.

    Returns:
        Tuple ``(best_sharpe_ratio, weights)`` where ``weights`` is a numpy
        array ordered like ``symbols``.

    Side effects:
        Solves a discrete allocation whose result is discarded, and
        overwrites ``w.xlsx`` with the cleaned weights on every call.
    """
    prices = pd.DataFrame(price, columns=symbols)
    mu = expected_returns.mean_historical_return(prices)
    sigma = risk_models.CovarianceShrinkage(prices).ledoit_wolf()
    frontier = EfficientFrontier(mu, sigma, weight_bounds=(0, 1))

    # Per-asset bounds expressed as inequality constraints (each must be >= 0).
    lower, upper = 0.05, 0.35
    bound_constraints = [
        {"type": "ineq", "fun": lambda w: w - lower},   # greater than the lower bound
        {"type": "ineq", "fun": lambda w: upper - w},   # less than the upper bound
    ]
    frontier.nonconvex_objective(
        objective_functions.sharpe_ratio,
        objective_args=(mu, sigma),
        weights_sum_to_one=True,
        constraints=bound_constraints,
    )

    cleaned = frontier.clean_weights()
    weight_vector = np.array([cleaned[ticker] for ticker in symbols])
    sharpe = base_optimizer.portfolio_performance(weight_vector, mu, sigma)[2]

    # Discrete allocation for a 25000 portfolio; the outcome is not used.
    allocator = DiscreteAllocation(
        cleaned, get_latest_prices(prices), total_portfolio_value=25000)
    allocator.lp_portfolio(reinvest=True)

    # Persist the latest weights as a single-row sheet.
    pd.DataFrame(cleaned, columns=cleaned.keys(), index=[0]).to_excel('w.xlsx')
    return sharpe, weight_vector
# One slot per price row; pre_segment_func_nb writes the best Sharpe ratio
# into it at each rebalancing row.
pyopt_srb_sharpe = np.full(price.shape[0], np.nan)
# Backtest with PyPortfolioOpt as the weight finder.
# pre_sim_args feeds pre_sim_func_nb(c, every_nth): rebalance every 63 rows.
# pre_segment_args feeds pre_segment_func_nb after `c`, in this order:
#   (find_weights_nb, history_len, ann_factor, num_tests, srb_sharpe)
# so the second entry (-1 below) is the lookback: -1 means "all rows so far".
# For a rolling window pass a row count instead, e.g. 756 for roughly 36
# months -- assumes about 21 price rows per month, TODO confirm for this data.
pyopt_srb_pf = vbt.Portfolio.from_order_func(
price,
order_func_nb,
pre_sim_func_nb=pre_sim_func_nb,
pre_sim_args=(63,), #63 #84
pre_segment_func_nb=pre_segment_func_nb.py_func, # run pre_segment_func_nb as pure Python function
pre_segment_args=(pyopt_find_weights, -1, ann_factor, num_tests, pyopt_srb_sharpe),
cash_sharing=True,
group_by=True,
use_numba=False # run simulate_nb as pure Python function
)
greatly appreciated

How to speed up a high dimensional loop in python with numpy instead of pandas?

This loop takes 5 hours to do its work. How can I speed it up? I read something about using numpy functions instead of pandas. I tried, as you can see, but I am too new to Python to do it right. The main difficulty is the high-dimensional data with 6000 columns. All of the data is static, except for the random weights. How do I write better code?
import numpy as np
import os
# Covariance matrix as a pandas DataFrame (6000 columns x 6000 rows)
cov = input_table_1.copy()
# Mean returns, squeezed from the second input table
mean_returns = input_table_2.copy().squeeze()
# Number of random portfolios. This must be an int: ``100.000`` is the float
# 100.0 in Python and cannot be used as an array size or in range().
num_portfolios = 100000
rf = 0

n_assets = len(cov.columns)
# Convert once to plain float arrays; multiplying a DataFrame inside the loop
# was the dominant cost.
cov_values = cov.to_numpy(dtype=float)
mean_values = np.asarray(mean_returns, dtype=float).ravel()
# assumes mean_returns holds exactly one mean per asset -- TODO confirm
if mean_values.size != n_assets:
    raise ValueError("mean_returns must contain one value per covariance column")

# Result matrix: row 0 = Sharpe ratio, rows 1.. = weights, one column per portfolio
results_matrix = np.zeros((n_assets + 1, num_portfolios))

# Process the portfolios in blocks so each step is one matrix product instead
# of thousands of matrix-vector products, while keeping temporaries small.
chunk_size = 1000
for start in range(0, num_portfolios, chunk_size):
    stop = min(start + chunk_size, num_portfolios)
    # Random numbers between 0 and 1, one row per portfolio, normalised so
    # that every row sums to 1
    weights = np.random.uniform(0, 1, (stop - start, n_assets))
    weights /= weights.sum(axis=1, keepdims=True)

    portfolio_return = (weights @ mean_values) * 252
    # Row-wise quadratic form w' * cov * w
    portfolio_var = np.einsum('ij,ij->i', weights @ cov_values, weights)
    portfolio_std = np.sqrt(portfolio_var) * np.sqrt(252)

    results_matrix[0, start:stop] = (portfolio_return - rf) / portfolio_std
    results_matrix[1:, start:stop] = weights.T

# Output table as pandas DataFrame
output_table = pd.DataFrame(results_matrix.T, columns=['sharpe'] + list(cov.columns))
there is not a generic way to do that, first of all you must identify where your code is slow, and after that you can apply optimization.
First of all, you have a nested loop, so the complexity is O(n^2). That is not a big deal here, because a lot of the work can be done using a vectorized approach.
In python creation of new object is slow, so for example, if it can be stored in ram, the first np.random.uniform can be done one time and consumed during the cycle.
nested iterator, can be done in vectorial mode, this seem the best candidates for performance.
Anyway i suggest to use a tool like perf_tool that will guide you exactly on the slow piece of code [*]
[*] i'm the main developer of this tool.
@AmilaMGunawardana Here is my first try with TensorFlow, but it is not fast enough. In the end I waited 5 hours for 100,000 rounds. Maybe I have to do something better?
Perftool showed me that evrything in the code is fast, except the Part:
vol_arr[x] = tnp.sqrt(tnp.dot(multi_randoms[x].T, np.dot(covData*252, multi_randoms[x]))) --> This part takes 90% of the execution Time.
covData = input_table_1.copy()
# Mean returns, squeezed from the second input table
returns = input_table_2.copy().squeeze()
# Number of random portfolios
num_portfolios = 100000
# The loop below uses the shorter name; keep both so they can never diverge.
num_ports = num_portfolios
rf = 0

n_assets = len(covData.columns)
# assumes `returns` holds one mean per asset -- TODO confirm
mean_returns = np.asarray(returns, dtype=float).ravel()
# Annualise the covariance once; doing `covData*252` inside the loop rebuilt
# the full matrix on every iteration.
cov_annual = covData.to_numpy(dtype=float) * 252

all_weights = np.zeros((num_ports, n_assets))
ret_arr = np.zeros(num_ports)
vol_arr = np.zeros(num_ports)
sharpe_arr = np.zeros(num_ports)
# NOTE(review): normal draws are neither non-negative nor normalised to sum
# to 1, unlike the uniform weights used elsewhere -- confirm this is intended.
multi_randoms = np.random.normal(0, 1., size=(num_portfolios, n_assets))


#perf_tool('main')
def main():
    """Fill the return, volatility and Sharpe arrays one portfolio at a time.

    Each step is wrapped in its own ``PerfTool`` section so the profiler can
    report where the time goes.
    """
    for x in range(num_ports):
        with PerfTool('preparation1'):
            # Save weights
            all_weights[x, :] = multi_randoms[x]
        with PerfTool('preparation2'):
            # Expected return
            ret_arr[x] = np.sum(mean_returns * multi_randoms[x] * 252)
        with PerfTool('preparation3'):
            # Expected volatility
            vol_arr[x] = np.sqrt(np.dot(multi_randoms[x],
                                        np.dot(cov_annual, multi_randoms[x])))
        with PerfTool('preparation4'):
            # Sharpe ratio; the parentheses matter, `a - rf / b` divided only rf
            sharpe_arr[x] = (ret_arr[x] - rf) / vol_arr[x]


PerfTool.set_enabled()
main()
PerfTool.show_stats_if_enabled()
This shows one way of getting faster with parallel execution. How could I get rid of the loop? Is there a way to do these calculations in a single step, using the all_weights array once instead of looping over it?
import pandas as pd
import numpy as np
from perf_tool import PerfTool, perf_tool
from joblib import Parallel, delayed, parallel_backend
# Covariance matrix as a pandas DataFrame (6000 columns x 6000 rows)
covData = input_table_1.copy()
# Mean returns, squeezed from the second input table
mean_returns = input_table_2.copy().squeeze()
# Number of random portfolios
num_ports = 100000
rf = 0

# Take the asset count from the covariance matrix: a squeezed Series has no
# `.columns` attribute.
n_assets = len(covData.columns)

# All random weight vectors at once, each row normalised to sum to 1.
all_weights = np.random.random((num_ports, n_assets))
all_weights /= all_weights.sum(axis=1, keepdims=True)

# Plain arrays, prepared once, so the workers do no pandas arithmetic.
# assumes mean_returns holds one mean per asset -- TODO confirm
mean_values = np.asarray(mean_returns, dtype=float).ravel()
cov_annual = covData.to_numpy(dtype=float) * 252

ret_arr = np.zeros(num_ports)
vol_arr = np.zeros(num_ports)
sharpe_arr = np.zeros(num_ports)


def test(x):
    """Return ``(x, sharpe_ratio)`` for the portfolio stored in row ``x``.

    The return and volatility are also written to ``ret_arr`` / ``vol_arr``;
    when this runs in a separate worker process those writes stay in the
    worker, so rely on the returned value.
    """
    weights = all_weights[x]
    # Expected return
    ret_arr[x] = np.sum(mean_values * weights * 252)
    # Expected volatility
    vol_arr[x] = np.sqrt(np.dot(weights, np.dot(cov_annual, weights)))
    # Sharpe ratio; the parentheses matter, `a - rf / b` divided only rf
    return x, (ret_arr[x] - rf) / vol_arr[x]


# `weighttable` receives the portfolio indices, `sharpe` the Sharpe ratios.
weighttable, sharpe = zip(*Parallel(n_jobs=-1)(
    delayed(test)(i) for i in range(num_ports)))

How to find the best line Fit Python(banister-impulse model)

I have this formula that is used to predict athletic performance base on daily stress.
It is based on 5 constant unique to each person. I'm trying to find these based on daily stress and performance testing that has been done. I'm new to programming and I don't know where to start.
see the formula
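Performance = Fitness − Fatigue + P0, where each day's fitness and fatigue equal that day's stress plus yesterday's value after exponential decay: $p(t) = p_0 + k_1\,g(t) - k_2\,h(t)$, with $g(t) = g(t-1)\,e^{-1/\tau_1} + w(t)$ and $h(t) = h(t-1)\,e^{-1/\tau_2} + w(t)$. Here $w(t)$ is the daily stress, and the five constants are $p_0, k_1, k_2, \tau_1, \tau_2$.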
This is a sample of the data: data
thank you
import pandas as pd
import numpy as np
import math
from scipy import optimize
data = pd.read_csv('data_mod1.csv')
TSS = data['stress'].fillna(0)
arr = np.array(TSS)
#data = data.dropna()
a = [arr[0]]
b = [arr[0]]
x = arr[1:]
def Banister(x, t1, t2, k1, k2, c):
    """Predict performance from daily stress with the Banister impulse model.

    Fitness and fatigue both start at the first stress value and follow
    ``value[i] = value[i-1] * exp(-1/t) + x[i]`` with their own time constant.

    The function keeps no state between calls, so ``curve_fit`` can evaluate
    it repeatedly; it previously appended to module-level lists and wrote
    columns into the global DataFrame on every call.

    Args:
        x: Daily stress values in chronological order (array-like).
        t1: Decay time constant of the fitness term.
        t2: Decay time constant of the fatigue term.
        k1: Gain applied to fitness.
        k2: Gain applied to fatigue.
        c: Baseline performance offset.

    Returns:
        numpy array with one predicted performance value per element of ``x``.
    """
    stress = np.asarray(x, dtype=float).ravel()
    fitness = np.empty_like(stress)
    fatigue = np.empty_like(stress)
    if stress.size == 0:
        return stress

    decay_fit = np.exp(-1 / t1)
    decay_fat = np.exp(-1 / t2)
    fitness[0] = fatigue[0] = stress[0]
    for i in range(1, stress.size):
        fitness[i] = fitness[i - 1] * decay_fit + stress[i]
        fatigue[i] = fatigue[i - 1] * decay_fat + stress[i]
    return k1 * fitness - k2 * fatigue + c
# In[ ]:
from scipy.optimize import curve_fit
# Fit the five model constants; p0 lists the start values in the order of
# Banister's parameters after x: (t1, t2, k1, k2, c).
# NOTE(review): the third argument is the ydata to fit against. data[data.index]
# looks like it selects columns named after the row labels rather than the
# measured performance values -- confirm which column holds the test results.
fit = curve_fit(Banister, arr,data[data.index], p0=[20, 10,1 ,2, 50])

ValueError in Random forest (Python)

I am trying to perform a Random Forest analysis in Python. Everything seems OK but, when I try to run the code, I get the following error message:
Did any of you get this ValueError?
Cheers
Dataset: https://www.dropbox.com/s/ehyccl8kubazs8x/test.csv?dl=0&preview=test.csv
Code:
from sklearn.ensemble import RandomForestRegressor as RF
import numpy as np
import pylab as pl
# Column names from the header row, without the first column.
# `file()` only exists in Python 2; `open` with a context manager works
# everywhere and closes the handle.
with open("test.csv") as fh:
    headers = fh.readline().strip().split('\r')[0].split(',')[1:]
data = np.loadtxt("test.csv", delimiter=',', skiprows=1, usecols=range(1, 14))

# yellow==PAR, green==VPD, blue==Tsoil and orange==Tair
PAR = data[:, headers.index("PAR")]
VPD = data[:, headers.index("VPD")]
Tsoil = data[:, headers.index("Tsoil")]
Tair = data[:, headers.index("Tair")]
drivers = np.column_stack([PAR, VPD, Tsoil, Tair])
# NOTE(review): assumes the last loaded column holds the hour of day (0-23);
# if it holds something else (e.g. the year) every hourly mask is empty.
hour = data[:, -1].astype("int")

# Performs a random forest hour-wise to explain each of the NEE, GPP and Reco
# fluxes. Importances dims: hour; average/std; observed var; explanatory var.
importances = np.zeros([24, 2, 3, 4])
for ff, flux in enumerate(["NEE_f", "GPP_f", "Reco"]):
    obs = data[:, headers.index(flux)]
    for hh in range(24):
        mask = hour == hh
        if not mask.any():
            # Fail with a clear message instead of fitting on zero samples.
            raise ValueError(
                "no rows with hour == {}; check that the last column of "
                "test.csv really contains hours".format(hh))
        forest = RF(n_estimators=1000)
        forest.fit(drivers[mask], obs[mask])
        importances[hh, 0, ff] = forest.feature_importances_
        importances[hh, 1, ff] = np.std(
            [tree.feature_importances_ for tree in forest.estimators_], axis=0)

fig = pl.figure('importances', figsize=(15, 5))
fig.clf()
xx = np.arange(24)
colors = ["#F0E442", "#009E73", "#56B4E9", "#E69F00"]
labels = ['PAR', 'VPD', 'Tsoil', 'Tair']
for ff, flux in enumerate(["NEE_f", "GPP_f", "Reco"]):
    ax = fig.add_subplot(1, 3, ff + 1)
    for vv in range(drivers.shape[1]):
        mean_imp = importances[:, 0, ff, vv]
        std_imp = importances[:, 1, ff, vv]
        # Shaded band of +/- one standard deviation around the mean importance
        ax.fill_between(xx, mean_imp + std_imp, mean_imp - std_imp,
                        color=colors[vv], alpha=.35, edgecolor="none")
        ax.plot(xx, mean_imp, color=colors[vv], ls='-', lw=2, label=labels[vv])
    ax.set_title(flux)
    ax.set_xlim(0, 23)
    if ff == 0:
        ax.legend(ncol=2, fontsize='medium', loc='upper center')
fig.show()
fig.savefig('importance-hourly.png')
The problem was that I selected the column where years are stored, not where hours are. Therefore the RF was trained on empty arrays.

Categories