ValueError in Random forest (Python) - python

I am trying to perform a Random Forest analysis in Python. Everything seems OK but, when I try to run the code, I get the following error message:
Did any of you get this ValueError?
Cheers
Dataset: https://www.dropbox.com/s/ehyccl8kubazs8x/test.csv?dl=0&preview=test.csv
Code:
from sklearn.ensemble import RandomForestRegressor as RF
import numpy as np
import pylab as pl
headers = file("test.csv").readline().strip().split('\r')[0].split(',')[1:]
data = np.loadtxt("test.csv", delimiter=',', skiprows=1, usecols = range(1,14))
#yellow==PAR, green==VPD, blue== Tsoil and orange==Tair
PAR = data[:,headers.index("PAR")]
VPD = data[:,headers.index("VPD")]
Tsoil= data[:,headers.index("Tsoil")]
Tair = data[:,headers.index("Tair")]
drivers = np.column_stack([PAR,VPD,Tsoil,Tair])
hour = data[:,-1].astype("int")
#performs a random forest hour-wise to explain each NEE, GPP and Reco fluxes
importances = np.zeros([24,2,3,4])
for ff,flux in enumerate(["NEE_f","GPP_f","Reco"]):
fid = headers.index(flux)
obs = data[:,fid]
#store importances: dim are average/std; obs var; expl var
for hh in range(24):
mask = hour == hh
forest = RF(n_estimators=1000)
forest.fit(drivers[mask],obs[mask])
importances[hh,0,ff] = forest.feature_importances_
importances[hh,1,ff] = np.std([tree.feature_importances_ for tree in forest.estimators_],axis=0)
fig = pl.figure('importances',figsize=(15,5));fig.clf()
xx=range(24)
colors = ["#F0E442","#009E73","#56B4E9","#E69F00"];labels= ['PAR','VPD','Tsoil','Tair']
for ff,flux in enumerate(["NEE_f","GPP_f","Reco"]):
ax = fig.add_subplot(1,3,ff+1)
for vv in range(drivers.shape[1]):
ax.fill_between(xx,importances[:,0,ff,vv]+importances[:,1,ff,vv],importances[:,0,ff,vv]-importances[:,1,ff,vv],color=colors[vv],alpha=.35,edgecolor="none")
ax.plot(xx,importances[:,0,ff,vv],color=colors[vv],ls='-',lw=2,label = labels[vv])
ax.set_title(flux);ax.set_xlim(0,23)
if ff == 0:
ax.legend(ncol=2,fontsize='medium',loc='upper center')
fig.show()
fig.savefig('importance-hourly.png')

The problem was that I selected the column where years are stored, not where hours are. Therefore the RF was trained on empty arrays.

Related

Run Different Scikit-learn Clustering Algorithms on Dataset

I have a dataframe like below. The shape is (24,7)
Name x1 x2 x3 x4 x5 x6
Harry 102 204 0.43 0.21 1.02 0.39
James 242 500 0.31 0.11 0.03 0.73
.
.
.
Mike 3555 4002 0.12 0.03 0.52. 0.11
Henry 532 643 0.01 0.02 0.33 0.10
I want to run Scikit-learn's Different Clustering Algorithms Script on the above dataframe. However, the input data looks quite confusing, not too sure how to input my dataframe
https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html#sphx-glr-auto-examples-cluster-plot-cluster-comparison-py
There are two main differences between your scenario and the scikit-learn example you link to:
You only have one dataset, not several different ones to compare.
You have six features, not just two.
Point one allows you to simplify the example code by deleting the loops over the different datasets and related calculations. Point two implies that you cannot easily plot your results. Instead, you could just add the predicted class labels found by each algorithm to your dataset.
So you could modify the example code like this:
import time
import warnings
import numpy as np
import pandas as pd
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
np.random.seed(0)
# ============
# Introduce your dataset
# ============
my_df = # Insert your data here, as a pandas dataframe.
features = [f'x{i}' for i in range(1, 7)]
X = my_df[features].values
# ============
# Set up cluster parameters
# ============
params = {
"quantile": 0.3,
"eps": 0.3,
"damping": 0.9,
"preference": -200,
"n_neighbors": 3,
"n_clusters": 3,
"min_samples": 7,
"xi": 0.05,
"min_cluster_size": 0.1,
}
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)
# estimate bandwidth for mean shift
bandwidth = max(cluster.estimate_bandwidth(X, quantile=params["quantile"]),
0.001) # arbitrary correction to avoid 0
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(
X, n_neighbors=params["n_neighbors"], include_self=False
)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
# ============
# Create cluster objects
# ============
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params["n_clusters"])
ward = cluster.AgglomerativeClustering(
n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity
)
spectral = cluster.SpectralClustering(
n_clusters=params["n_clusters"],
eigen_solver="arpack",
affinity="nearest_neighbors",
)
dbscan = cluster.DBSCAN(eps=params["eps"])
optics = cluster.OPTICS(
min_samples=params["min_samples"],
xi=params["xi"],
min_cluster_size=params["min_cluster_size"],
)
affinity_propagation = cluster.AffinityPropagation(
damping=params["damping"], preference=params["preference"], random_state=0
)
average_linkage = cluster.AgglomerativeClustering(
linkage="average",
affinity="cityblock",
n_clusters=params["n_clusters"],
connectivity=connectivity,
)
birch = cluster.Birch(n_clusters=params["n_clusters"])
gmm = mixture.GaussianMixture(
n_components=params["n_clusters"], covariance_type="full"
)
clustering_algorithms = (
("MiniBatch\nKMeans", two_means),
("Affinity\nPropagation", affinity_propagation),
("MeanShift", ms),
("Spectral\nClustering", spectral),
("Ward", ward),
("Agglomerative\nClustering", average_linkage),
("DBSCAN", dbscan),
("OPTICS", optics),
("BIRCH", birch),
("Gaussian\nMixture", gmm),
)
for name, algorithm in clustering_algorithms:
t0 = time.time()
# catch warnings related to kneighbors_graph
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="the number of connected components of the "
+ "connectivity matrix is [0-9]{1,2}"
+ " > 1. Completing it to avoid stopping the tree early.",
category=UserWarning,
)
warnings.filterwarnings(
"ignore",
message="Graph is not fully connected, spectral embedding"
+ " may not work as expected.",
category=UserWarning,
)
algorithm.fit(X)
t1 = time.time()
if hasattr(algorithm, "labels_"):
y_pred = algorithm.labels_.astype(int)
else:
y_pred = algorithm.predict(X)
# Add cluster labels to the dataset
my_df[name] = y_pred
PS : please replace : data = X_data.iloc[:20000] by your X
import numpy as np
import matplotlib as plt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import cluster, metrics
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn import preprocessing
from collections import Counter
from sklearn.cluster import DBSCAN
from sklearn import mixture
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
comp_model = pd.DataFrame(columns=['Model', 'Score_Silhouette',
'num_clusters', 'size_clusters',
'parameters'])
K-Means :
def k_means(X_data, nb_clusters, model_comp):
ks = nb_clusters
inertias = []
data = X_data.iloc[:20000]
X = data.values
X_scaled = preprocessing.StandardScaler().fit_transform(X)
for num_clusters in ks:
# Create a KMeans instance with k clusters: model
model = KMeans(n_clusters=num_clusters, n_init=1)
# Fit model to samples
model.fit(X_scaled)
# Append the inertia to the list of inertias
inertias.append(model.inertia_)
silh = metrics.silhouette_score(X_scaled, model.labels_)
# Counting the amount of data in each cluster
taille_clusters = Counter(model.labels_)
data = [{'Model': 'kMeans',
'Score_Silhouette': silh,
'num_clusters': num_clusters,
'size_clusters': taille_clusters,
'parameters': 'nb_clusters :'+str(num_clusters)}]
model_comp = model_comp.append(data, ignore_index=True, sort=False)
# Plot ks vs inertias
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()
return model_comp
comp_model = k_means(X_data=df,
nb_clusters=pd.np.arange(2, 11, 1),
model_comp=comp_model)
DBscan :
def dbscan_grid_search(X_data, model_comp, eps_space=0.5,
min_samples_space=5, min_clust=0, max_clust=10):
data = X_data.iloc[:20000]
X = data.values
X_scaled = preprocessing.StandardScaler().fit_transform(X)
# Starting a tally of total iterations
n_iterations = 0
# Looping over each combination of hyperparameters
for eps_val in eps_space:
for samples_val in min_samples_space:
dbscan_grid = DBSCAN(eps=eps_val,
min_samples=samples_val)
# fit_transform
clusters = dbscan_grid.fit_predict(X=X_scaled)
# Counting the amount of data in each cluster
cluster_count = Counter(clusters)
#n_clusters = sum(abs(pd.np.unique(clusters))) - 1
n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
# Increasing the iteration tally with each run of the loop
n_iterations += 1
# Appending the lst each time n_clusters criteria is reached
if n_clusters >= min_clust and n_clusters <= max_clust:
silh = metrics.silhouette_score(X_scaled, clusters)
data = [{'Model': 'Dbscan',
'Score_Silhouette': silh,
'num_clusters': n_clusters,
'size_clusters': cluster_count,
'parameters': 'eps :'+str(eps_val)+'+ samples_val :'+str(samples_val)}]
model_comp = model_comp.append(
data, ignore_index=True, sort=False)
return model_comp
comp_model = dbscan_grid_search(X_data=df,
model_comp=comp_model,
eps_space=pd.np.arange(0.1, 5, 0.6),
min_samples_space=pd.np.arange(1, 30, 3),
min_clust=2,
max_clust=10)
GMM :
def gmm(X_data, nb_clusters, model_comp):
ks = nb_clusters
data = X_data.iloc[:20000]
X = data.values
X_scaled = preprocessing.StandardScaler().fit_transform(X)
for num_clusters in ks:
# Create a KMeans instance with k clusters: model
gmm = mixture.GaussianMixture(n_components=num_clusters).fit(X_scaled)
# Fit model to samples
gmm.fit(X_scaled)
pred = gmm.predict(X_scaled)
cluster_count = Counter(pred)
silh = metrics.silhouette_score(X_scaled, pred)
data = [{'Model': 'GMM',
'Score_Silhouette': silh,
'num_clusters': num_clusters,
'size_clusters': cluster_count,
'parameters': 'nb_clusters :'+str(num_clusters)}]
model_comp = model_comp.append(data, ignore_index=True, sort=False)
return model_comp
comp_model = gmm(X_data=df,
nb_clusters=pd.np.arange(2, 11, 1),
model_comp=comp_model
)
At the end you will have comp_model which will contain all the results of your algo. Here I am using three algorithms, after you selected the best fit for you (with score silhouette and number of cluster).
You should check the repartitions of each cluster :
https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py

How to predict price for a future day given historical data

so currently this is the code I have. Not attached are various graphs that I have made that show the actual stock price from the CSV and then my projections. I'm wanting to make it where I simply predict tomorrow's stock price given all of this historical data but I'm having a difficult time. The "df.loc[len(df.index)] = ['2022-04-05',0,0,0,0,0,0]" was where I was trying to put the predictions for future days although I am open to other ways.
# Machine learning
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# For data manipulation
import pandas as pd
import numpy as np
# To plot
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
# method of pandas
df = pd.read_csv('data_files/MSFT.csv')
#add extra row of blank data for future prediction
df.loc[len(df.index)] = ['2022-04-05',0,0,0,0,0,0]
df.loc[len(df.index)] = ['2022-04-06',0,0,0,0,0,0]
df.loc[len(df.index)] = ['2022-04-07',0,0,0,0,0,0]
df.loc[len(df.index)] = ['2022-04-08',0,0,0,0,0,0]
# Changes The Date column as index columns
df.index = pd.to_datetime(df['Date'])
# drop The original date column
df = df.drop(['Date'], axis='columns')
print(df)
# Create predictor variables
df['Open-Close'] = df.Open - df.Close
df['High-Low'] = df.High - df.Low
# Store all predictor variables in a variable X
X = df[['Open-Close', 'High-Low']]
X.head()
# Target variables
y = np.where(df['Close'].shift(-1) > df['Close'], 1, 0)
print(y)
split_percentage = 0.8
split = int(split_percentage*len(df))
# Train data set
X_train = X[:split]
y_train = y[:split]
# Test data set
X_test = X[split:]
y_test = y[split:]
# Support vector classifier
cls = SVC().fit(X_train, y_train)
df['Predicted_Signal'] = cls.predict(X)
# Calculate daily returns
df['Return'] = df.Close.pct_change()
# Calculate strategy returns
df['Strategy_Return'] = df.Return * df.Predicted_Signal.shift(1)
# Calculate Cumulutive returns
df['Cum_Ret'] = df['Return'].cumsum()
# Plot Strategy Cumulative returns
df['Cum_Strategy'] = df['Strategy_Return'].cumsum()

How to continue after gradient descent?

I am very new to Data Science and Python. After a few hours of Experimentation, I finally received values for my gradient descent (code below). I am having trouble to plotting bzw. How can I plot the regression line automatically after the algorithm?
import numpy as np;
import matplotlib.pyplot as plt;
import csv
import pandas as pd
def gradient_descent(x,y):
m_curr=b_curr=0
iterations = 5000
n=len(x)
learning_rate = 0.01
for i in range(iterations):
y_predicted = m_curr*x + b_curr
cost = (1/n)*sum([val**2 for val in (y-y_predicted)])
md = -(2/n)*sum(x*(y-y_predicted))
bd = -(2/n)*sum(y-y_predicted)
m_curr = m_curr - learning_rate*md
b_curr = b_curr - learning_rate*bd
print("m{}, b{}, cost {}, iteration {}".format(m_curr,b_curr,cost,i))
if __name__ == '__main__':
#Reading data -> Output: DataFrame in float64
data = pd.read_csv('ex1data1.txt', sep=',', header=None, names=['Feature', 'Label'])
data.plot(x='Feature', y='Label', kind = 'scatter')
#separating data frame to
feat_vec = pd.DataFrame(data['Feature'])
label_vec = pd.DataFrame(data['Label'])
#Finding the Best Fit Line for our given Dataset and convert the df to np.array
#because it's more convenient for matrix multiplication
x = np.array(feat_vec)
y = np.array(label_vec)
gradient_descent(x,y)

Python Vector Error Correction Model

Anyone has an idea on how to model a VECM in python? I can't find it in the statsmodels package.
https://gist.github.com/yogabonito/5461b26bed335cad6907aa4e613acb99
In this git link they implement a model using VECM in python
Some important parts of code are here
%matplotlib inline
import pandas
from statsmodels.tsa.vecm.vecm import VECM, select_order
import data as dta
iidata = dta.load_pandas();
mdata = iidata.data
dates = mdata[['year', 'quarter']].astype(int).astype(str)
quarterly = dates["year"] + "Q" + dates["quarter"]
from statsmodels.tsa.base.datetools import dates_from_str
quarterly = dates_from_str(quarterly)
mdata = mdata[dta.variable_names]
mdata.index = pandas.DatetimeIndex(quarterly)
data = mdata
model = VECM(data, diff_lags=3, coint_rank=1)
vecm_res = model.fit()
vecm_res.gamma.round(4)
vecm_res.summary()
[![vecm_res.predict(steps=5)
forecast, lower, upper = vecm_res.predict(5, 0.05)
print("lower bounds of confidence intervals:")
print(lower.round(3))
print("\npoint forecasts:")
print(forecast.round(3))
print("\nupper bounds of confidence intervals:")
print(upper.round(3))
vecm_res.plot_forecast(steps=10)][1]][1]
Output forecast shown here:
om/1BVSA.png

Simple Regression Example pyBrain

I am trying to make the simpliest regression on pyBrain but somehow I'm failing.
The Neural Network should learn the function Y=3*X
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.datasets import SupervisedDataSet
from pybrain.structure import FullConnection, FeedForwardNetwork, TanhLayer, LinearLayer, BiasUnit
import matplotlib.pyplot as plt
from numpy import *
n = FeedForwardNetwork()
n.addInputModule(LinearLayer(1, name = 'in'))
n.addInputModule(BiasUnit(name = 'bias'))
n.addModule(TanhLayer(1,name = 'tan'))
n.addOutputModule(LinearLayer(1, name = 'out'))
n.addConnection(FullConnection(n['bias'], n['tan']))
n.addConnection(FullConnection(n['in'], n['tan']))
n.addConnection(FullConnection(n['tan'], n['out']))
n.sortModules()
# initialize the backprop trainer and train
t = BackpropTrainer(n, learningrate = 0.1, momentum = 0.0, verbose = True)
#DATASET
DS = SupervisedDataSet( 1, 1 )
X = random.rand(100,1)*100
Y = X*3+random.rand(100,1)*5
for r in xrange(X.shape[0]):
DS.appendLinked((X[r]),(Y[r]))
t.trainOnDataset(DS, 200)
plt.plot(X,Y,'.b')
X=[[i] for i in arange(0,100,0.1)]
Y=map(n.activate,X)
plt.plot(X,Y,'-g')
It doesn't learn anything. I have tried to remove the hidden layer (because in this example we don't even need that) and the network started to predict NaNs.
What's going on?
EDIT: This is the code that solved my problem:
#DATASET
DS = SupervisedDataSet( 1, 1 )
X = random.rand(100,1)*100
Y = X*3+random.rand(100,1)*5
maxy = float(max(Y))
maxx = 100.0
for r in xrange(X.shape[0]):
DS.appendLinked((X[r]/maxx),(Y[r]/maxy))
t.trainOnDataset(DS, 200)
plt.plot(X,Y,'.b')
X=[[i] for i in arange(0,100,0.1)]
Y=map(lambda x: n.activate(array(x)/maxx)*maxy,X)
plt.plot(X,Y,'-g')
The basic pybrain neurons are going to output something between 0 and 1. Divide your Y by 300 (the maximum possible value), and you'll get better results.
More generally, find the maximum Y for your dataset, and scale everything by that.

Categories