I want to implement the cross validation in Random Forest Regressor in my data set. I want to know if my code is correct or not? Is this the way to cross validate?
Here is my sample data:
Wavelength Phase_velocity Shear_wave_velocity
1.50 202.69 240.73
1.68 192.72 240.73
1.79 205.54 240.73
........
Here is my code:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import KFold,train_test_split,cross_val_score
df = pd.read_csv("5.5-6.csv")
df.head()
X = df[['wavelength', 'phase velocity']]
y = df['shear wave velocity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print (len(X_train),len(X_test),len(y_train),len(y_test))
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True)
rf = RandomForestRegressor(n_estimators=30000)
rf.fit(X_train, y_train)
results = cross_val_score(rf, X_train, y_train, cv=kfold) #Cross validation on training set
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
print (rf.predict(X_test)) #array_output
print (y_test)
print (rf.score(X_test, y_test))
y_pred = rf.predict(X_test)
from sklearn.metrics import mean_absolute_error
print (mean_absolute_error(y_test,y_pred))
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse = sqrt(mean_squared_error(y_test,y_pred))
print(rmse)
Related
I try to do a gridsearch on my dataset to know how many features i want to select in my RFE, but it is taking super long. Does anyone know if this is normal, or do i have a foult in my script?
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
#%% train-test split
data = pd.read_csv('preprocesseddata.csv')
data.drop(['Date', 'About'], axis=1, inplace=True)
y = data['Class']
X = data[['Duration_Ball Training','Duration_Match','Duration_Other','Duration_Strenght Training','Positie','Gender','Voorkeursbeen','Instroomjaar','Age','Hours Sleep','Stress','Muscle Soreness','T-test','20m Sprint','CMJ 2b','Yo Yo Result','Heart Rate (Max)','Latest Height', 'Body Fat %','Repetitive Injury','Prefered Leg','AcuteLegs_1day','AcuteCardio_1day','AcuteLegs_3days','AcuteCardio_3days','AcuteLegs_7days','AcuteCardio_7days','ChronicLegs_14days','ChronicCardio_14days','ChronicLegs_21days','ChronicCardio_21days','ChronicLegs_28days','ChronicCardio_28days','TrainingmonotonyLegs','TrainingmonotonyCardio']]
y = y.astype('category')
y = y.cat.codes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
#%% RFE as part of pipeline
lr = LogisticRegression(solver='liblinear', random_state=123)
pipe = make_pipeline(RFE(estimator=lr, step=1), KNeighborsClassifier())
parameters = {'rfe__n_features_to_select': range(1,35), 'kneighborsclassifier__n_neighbors': range(1,30)}
grid = GridSearchCV(pipe, param_grid=parameters, cv=10, n_jobs=1)
grid.fit(X_train_std, y_train)
print('Best params:', grid.best_params_)
print('Best accuracy:', grid.best_score_)
#%% RFE
lr = LogisticRegression(solver='liblinear', random_state=123)
rfe= RFE(estimator=lr, n_features_to_select=5, step=-1)
rfe.fit(X_train_std, y_train)
X_train_sub = rfe.transform(X_train_std)
rfe.support_
It seems to get stuck at the print best parameters line
How can I use this dataset "MC1" to plot a KNN decision boundary figure?
Here is my code, I have tried to use iloc and loc but did not work
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from yellowbrick.contrib.classifier import DecisionViz
from yellowbrick.features import RadViz
from yellowbrick.style import set_palette
set_palette('flatui')
data_set = pd.read_csv('MC1.csv')
X, y = data_set
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = tts(X, y, test_size=.4, random_state=42)
visualizer = RadViz(size=(500, 400))
viz = DecisionViz(
KNeighborsClassifier(5), title="Nearest Neighbors",classes=['Y', 'N']
)
viz.fit(X_train, y_train)
viz.draw(X_test, y_test)
viz.show()
I have a problem with this code. The error is on the line: ppn.fit(X_train, y_train)
I just use Python 3.7
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
df = pd.read_csv("file.csv", sep=',', error_bad_lines=False, low_memory=False)
X = df.iloc[:, 1:44].values
y = df.iloc[:, 48].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_train = np.isnan(X_train)
y_train = np.isnan(y_train)
X_test = np.isnan(X_test)
ppn = Perceptron(max_iter=40, tol=0.001, eta0=0.1, random_state=0)
ppn.fit(X_train, y_train)
y_pred = ppn.predict(X_test)
y_pred = np.isnan(y_pred)
print(accuracy_score(y_test, y_pred))
How can I fix it? Thanks.
Code #1 Pass pipeline with PCA, XGBClassifier steps to scikit-learn cross_validate function
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import random
random.seed(42)
import numpy as np
np.random.seed(42)
kwargs = {
'n_jobs': -1,
'cv': LeaveOneOut(),
'X': X,
'y': y
}
pipe = Pipeline([
('pca', PCA(1, random_state=42)),
('xgbc', XGBClassifier(random_state=42))
])
results = cross_validate(pipe, **kwargs)
print(results['test_score'].mean())
Code #2 Write cross-validation loop hardcode and calculate mean accuracy for exactly same input X as Code #1
from xgboost import XGBClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.decomposition import PCA
import random
random.seed(42)
import numpy as np
np.random.seed(42)
acc = []
for train_idx, test_idx in LeaveOneOut().split(X, y):
x_train, x_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
pca = PCA(1, random_state=42)
pca.fit(x_train)
x_train = pca.transform(x_train)
x_test = pca.transform(x_test)
model = XGBClassifier(random_state=42, n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
acc.append(score)
print(np.mean(acc))
The problem is I am getting two totally different results when I run the DTC algorithm, I just want to make sure that I am writing the cross validation - K Fold in a correct way or to understand why the result of the K fold is too much less than the normal one.
I've tried to run the codes for getting result from both normal accuracy and K fold accuracy the code is below:
from scipy.signal import butter, lfilter
import numpy as np
import pandas as pd
import pandas
from sklearn import preprocessing
from scipy.fftpack import fft
import pickle
import numpy
from pandas import Series
from numpy.random import randn
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
xx = pandas.read_csv("data1.dat", delimiter=",")
y = pandas.read_csv("label.dat", delim_whitespace=True)
x = xx.as_matrix()
y = numpy.array(y).astype(numpy.int)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
clf2 = DecisionTreeClassifier(random_state=42)
clf2.fit(X_train, y_train)
y_predict_2 = clf2.predict(X_test)
print("DTC Accuracy : ")
print(accuracy_score(y_test, y_predict_2)*100)
DTC Accuracy :
97.6302083333333
from sklearn.model_selection import cross_val_score
DTC = DecisionTreeClassifier(random_state=42)
scores =cross_val_score(DTC, x, y, cv=10, scoring='accuracy')
print(scores.mean()*100)
35.331452470904985
from sklearn.model_selection import cross_val_score
DTC = DecisionTreeClassifier(random_state=42)
scores =cross_val_score(DTC, X_train, y_train, cv=10, scoring='accuracy')
print(scores.mean()*100)
97.34356
However, in the cross validation part, when I put X_train instead of x and y_train instead of y, the accuracy again rises to 97.
I am wondering which one I need to use (x and y) or (X_train adn y_train) will be the correct and common sense cross validation.
Try to shuffle your data and reduce the cross validation folds.
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
xx = pandas.read_csv("data1.dat", delimiter=",")
y = pandas.read_csv("label.dat", delim_whitespace=True)
x = xx.as_matrix()
y = y.values.astype(np.int32).reshape(-1, 1)
x, y = shuffle(x, y, random_state=42)
DTC = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(DTC, x, y, cv=3, scoring='accuracy')
print(scores.mean()*100)