gridsearch before RFE is taking super long - python

I am trying to run a grid search on my dataset to determine how many features to select in my RFE, but it is taking extremely long. Does anyone know if this is normal, or is there a fault in my script?
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
#%% train-test split
data = pd.read_csv('preprocesseddata.csv')
data.drop(['Date', 'About'], axis=1, inplace=True)
y = data['Class']
X = data[['Duration_Ball Training','Duration_Match','Duration_Other','Duration_Strenght Training','Positie','Gender','Voorkeursbeen','Instroomjaar','Age','Hours Sleep','Stress','Muscle Soreness','T-test','20m Sprint','CMJ 2b','Yo Yo Result','Heart Rate (Max)','Latest Height', 'Body Fat %','Repetitive Injury','Prefered Leg','AcuteLegs_1day','AcuteCardio_1day','AcuteLegs_3days','AcuteCardio_3days','AcuteLegs_7days','AcuteCardio_7days','ChronicLegs_14days','ChronicCardio_14days','ChronicLegs_21days','ChronicCardio_21days','ChronicLegs_28days','ChronicCardio_28days','TrainingmonotonyLegs','TrainingmonotonyCardio']]
y = y.astype('category')
y = y.cat.codes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
#%% RFE as part of pipeline
lr = LogisticRegression(solver='liblinear', random_state=123)
pipe = make_pipeline(RFE(estimator=lr, step=1), KNeighborsClassifier())
parameters = {'rfe__n_features_to_select': range(1,35), 'kneighborsclassifier__n_neighbors': range(1,30)}
grid = GridSearchCV(pipe, param_grid=parameters, cv=10, n_jobs=1)
grid.fit(X_train_std, y_train)
print('Best params:', grid.best_params_)
print('Best accuracy:', grid.best_score_)
#%% RFE
lr = LogisticRegression(solver='liblinear', random_state=123)
rfe = RFE(estimator=lr, n_features_to_select=5, step=1)  # step must be a positive number of features to remove per iteration
rfe.fit(X_train_std, y_train)
X_train_sub = rfe.transform(X_train_std)
rfe.support_
It seems to get stuck at the line that prints the best parameters.
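For context: this grid evaluates 34 × 29 = 986 parameter combinations, each with 10-fold cross-validation, so roughly 9,860 pipeline fits, and every fit runs a full RFE that refits the logistic regression once per eliminated feature. Very long runtimes are therefore expected rather than a sign of a bug. Below is a rough sketch of two common ways to cut the cost (a coarser, parallelised grid, or letting RFECV pick the feature count once, outside the grid); the coarser ranges are arbitrary choices, not values tuned for this dataset:
# Sketch: shrink the search and run it on all cores
# (assumes X_train_std and y_train from the script above).
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

lr = LogisticRegression(solver='liblinear', random_state=123)

# Option 1: coarser grid, fewer folds, all cores.
pipe = make_pipeline(RFE(estimator=lr, step=1), KNeighborsClassifier())
parameters = {'rfe__n_features_to_select': range(1, 35, 5),
              'kneighborsclassifier__n_neighbors': range(1, 30, 2)}
grid = GridSearchCV(pipe, param_grid=parameters, cv=StratifiedKFold(5), n_jobs=-1, verbose=1)
grid.fit(X_train_std, y_train)

# Option 2: let RFECV choose the number of features once, then tune only k for KNN.
rfecv = RFECV(estimator=lr, step=1, cv=StratifiedKFold(5), n_jobs=-1)
X_train_sel = rfecv.fit_transform(X_train_std, y_train)
knn_grid = GridSearchCV(KNeighborsClassifier(), {'n_neighbors': range(1, 30)}, cv=5, n_jobs=-1)
knn_grid.fit(X_train_sel, y_train)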

Related

ValueError: Found input variables with inconsistent numbers of samples: [658448, 5879]

I have tried to evaluate different machine learning models and am facing this error. The traceback points to cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy'), where kfold = StratifiedShuffleSplit(n_splits=2, random_state=2, test_size=.25).
If anyone knows where the problem is, please let me know.
# comparing algorithms and training models
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# Spot Check Algorithms
models = []
models.append(('KNN', KNeighborsClassifier(n_neighbors=19)))
models.append(('Decision Tree', DecisionTreeClassifier(min_samples_leaf=60)))
models.append(('Naive Bayes', GaussianNB()))
models.append(('Random Forest', RandomForestClassifier(n_estimators=80, max_depth=3,random_state=0,min_samples_leaf=9)))
X_train = X_train.reshape(658448,-1)
Y_train = Y_train.reshape(5879,-1)
Y_test = Y_test.reshape(1960,1)
X_test = X_test.reshape(219520,-1)
Y_test = Y_test.astype('int')
X_train = X_train.astype('int')
Y_train = Y_train.astype('int')
type(Y_train)
type(X_train)
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=3, random_state=1, shuffle=True)
    #kfold = StratifiedShuffleSplit(n_splits=2, random_state=2, test_size=.25)
    #kf = KFold(n_splits=5, random_state=3, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
# Compare Algorithms
pyplot.boxplot(results, labels=names)
pyplot.title('Algorithm Comparison')
pyplot.show()
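The two numbers in the error message are consistent with a reshaping problem: cross_val_score requires X and y to have the same number of rows, and 658448 = 5879 × 112 (likewise 219520 = 1960 × 112), so the reshape calls above appear to spread each sample's 112 feature values over separate rows. A minimal sketch of reshaping so that X keeps one row per label; the factor 112 is inferred from the error message and should be checked against the real data:
# Sketch: keep one row per sample so X and Y have matching sample counts
# (112 features per sample is an assumption inferred from the shapes).
X_train = X_train.reshape(5879, -1)   # -> (5879, 112)
X_test = X_test.reshape(1960, -1)     # -> (1960, 112)
Y_train = Y_train.reshape(-1)         # 1-D label vector of length 5879
Y_test = Y_test.reshape(-1)           # 1-D label vector of length 1960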

How to feed data into random forest classifier and see prediction

I have built a random forest classifier using scikit-learn and Python, and I am having trouble actually feeding data in to see the prediction. I want to see the format of the output and convert it to a JSON file. I have attached the code for the random forest and what the data looks like. I believe I need to use 'y_pred', but I am not sure what format the input data needs to be.
X = dataset.iloc[:, 2:4].values
y = dataset["pages"]
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=20, random_state = 0)
classifier = classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
You can simply concatenate the predicted values with the matrix of features.
Also note that a pipeline exists exactly for this purpose: first transforming the data and then applying a classifier.
This should work for you:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
X = dataset.iloc[:, 2:4].values
y = dataset["pages"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
classifier = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=20, random_state=0))
classifier = classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
pred = pd.concat([pd.DataFrame(X_test), pd.Series(y_pred, name="pages")], axis=1)
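Since the question also asks for a JSON file, the combined frame can be written out directly with pandas; the file name and orientation below are just example choices:
# Sketch: export the predictions to JSON (file name and orient are arbitrary).
pred.to_json("predictions.json", orient="records")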

ValueError: The number of classes has to be greater than one; got 1 class ScikitLearn Python

I have a problem with this code. The error is on the line: ppn.fit(X_train, y_train)
I am using Python 3.7.
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
df = pd.read_csv("file.csv", sep=',', error_bad_lines=False, low_memory=False)
X = df.iloc[:, 1:44].values
y = df.iloc[:, 48].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_train = np.isnan(X_train)
y_train = np.isnan(y_train)
X_test = np.isnan(X_test)
ppn = Perceptron(max_iter=40, tol=0.001, eta0=0.1, random_state=0)
ppn.fit(X_train, y_train)
y_pred = ppn.predict(X_test)
y_pred = np.isnan(y_pred)
print(accuracy_score(y_test, y_pred))
How can I fix it? Thanks.
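The np.isnan(...) lines are the most likely cause: they do not remove NaNs, they replace the arrays with True/False masks, so y_train collapses to a single class (all False) whenever the labels contain no NaNs. A minimal sketch of filtering out incomplete rows instead, building on the imports and split above and assuming the selected columns are numeric and the isnan calls were meant to deal with missing values:
import numpy as np

# Sketch: drop rows with missing values instead of replacing the data with booleans.
mask = ~np.isnan(X_train).any(axis=1) & ~np.isnan(y_train)
X_train, y_train = X_train[mask], y_train[mask]

mask_test = ~np.isnan(X_test).any(axis=1)
X_test, y_test = X_test[mask_test], y_test[mask_test]

ppn = Perceptron(max_iter=40, tol=0.001, eta0=0.1, random_state=0)
ppn.fit(X_train, y_train)
print(accuracy_score(y_test, ppn.predict(X_test)))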

Why does the output of cross_validate differ from a hand-written loop when using XGBClassifier?

Code #1: Pass a pipeline with PCA and XGBClassifier steps to the scikit-learn cross_validate function
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import random
random.seed(42)
import numpy as np
np.random.seed(42)
kwargs = {
    'n_jobs': -1,
    'cv': LeaveOneOut(),
    'X': X,
    'y': y
}
pipe = Pipeline([
    ('pca', PCA(1, random_state=42)),
    ('xgbc', XGBClassifier(random_state=42))
])
results = cross_validate(pipe, **kwargs)
print(results['test_score'].mean())
Code #2: Write the cross-validation loop by hand and calculate the mean accuracy for exactly the same input X as Code #1
from xgboost import XGBClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.decomposition import PCA
import random
random.seed(42)
import numpy as np
np.random.seed(42)
acc = []
for train_idx, test_idx in LeaveOneOut().split(X, y):
    x_train, x_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    pca = PCA(1, random_state=42)
    pca.fit(x_train)
    x_train = pca.transform(x_train)
    x_test = pca.transform(x_test)
    model = XGBClassifier(random_state=42, n_jobs=-1)
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    acc.append(score)
print(np.mean(acc))
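If the seeds and preprocessing steps really are identical, the two versions should agree, so a useful first step is to compare per-sample predictions rather than only the mean accuracy and see which folds diverge. A diagnostic sketch, assuming X and y are NumPy arrays and pipe is the pipeline from Code #1:
# Sketch: locate the leave-one-out folds where the two approaches disagree.
from sklearn.model_selection import cross_val_predict, LeaveOneOut

pipe_preds = cross_val_predict(pipe, X, y, cv=LeaveOneOut(), n_jobs=-1)

loop_preds = []
for train_idx, test_idx in LeaveOneOut().split(X, y):
    pca = PCA(1, random_state=42).fit(X[train_idx])
    model = XGBClassifier(random_state=42, n_jobs=-1)
    model.fit(pca.transform(X[train_idx]), y[train_idx])
    loop_preds.append(model.predict(pca.transform(X[test_idx]))[0])

print("samples with differing predictions:", np.flatnonzero(pipe_preds != np.array(loop_preds)))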

K Fold Cross validation

I want to apply cross-validation to a Random Forest Regressor on my dataset. I would like to know whether my code is correct. Is this the way to cross-validate?
Here is my sample data:
Wavelength Phase_velocity Shear_wave_velocity
1.50 202.69 240.73
1.68 192.72 240.73
1.79 205.54 240.73
........
Here is my code:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import KFold,train_test_split,cross_val_score
df = pd.read_csv("5.5-6.csv")
df.head()
X = df[['wavelength', 'phase velocity']]
y = df['shear wave velocity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print (len(X_train),len(X_test),len(y_train),len(y_test))
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True)
rf = RandomForestRegressor(n_estimators=30000)
rf.fit(X_train, y_train)
results = cross_val_score(rf, X_train, y_train, cv=kfold) #Cross validation on training set
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
print (rf.predict(X_test)) #array_output
print (y_test)
print (rf.score(X_test, y_test))
y_pred = rf.predict(X_test)
from sklearn.metrics import mean_absolute_error
print (mean_absolute_error(y_test,y_pred))
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse = sqrt(mean_squared_error(y_test,y_pred))
print(rmse)
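One thing to note: results from cross_val_score is computed but never reported, and cross_val_score clones and refits the regressor for each fold, so the earlier rf.fit(X_train, y_train) plays no part in it. A small addition to actually inspect the cross-validation scores (R² by default for a regressor):
print(results)                        # R^2 score of each of the 10 folds
print(results.mean(), results.std())  # summary of the cross-validation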
