Getting the same result without GridSearchCV - python

I'm currently using GridSearchCV for a neural-network project and am wondering how I can get the same result (using the hyperparameters that GridSearchCV selected) without using GridSearchCV. What's the proper code to replicate this without GridSearchCV?
Here's my code using GridSearchCV:
# Seed NumPy's and TensorFlow's global random generators before anything else runs.
from numpy.random import seed
seed(1)
import tensorflow as tf
tf.random.set_seed(2)

# Features are every column except A-E; the target is column A.
# `df` is not defined in this snippet and must already hold the loaded DataFrame.
X = df.drop(['A','B','C','D','E'],axis=1).values
y = df['A'].values

# KFold is used further down, so it has to be imported here as well;
# without it the KFold(...) call raises NameError.
from sklearn.model_selection import KFold, train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
# X_test, X_val, y_test, y_val = train_test_split(X_test,y_test,test_size=0.5,random_state=101)

# Fit the scaler on the training split only, then apply it to both splits.
from sklearn.preprocessing import MinMaxScaler,StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
print(X.shape)

# 5-fold splitter with a fixed shuffle seed.
kfold = KFold(5, shuffle=True,random_state=42)
#############
def buildModel():
    """Build and compile the small regression network.

    The network is a Sequential model with one 9-unit tanh Dense layer
    followed by a single-unit Dense output layer. It is compiled with SGD
    (learning rate 0.01, momentum 0.6), mean-squared-error loss, and mean
    squared error as an additional reported metric.

    NOTE(review): Sequential, Dense and SGD are not imported in this
    snippet; they are presumably the tensorflow.keras classes -- confirm.

    Returns:
        The compiled model.
    """
    model = Sequential()
    model.add(Dense(9, activation='tanh'))
    model.add(Dense(1))
    model.compile(
        optimizer=SGD(learning_rate=0.01, momentum=0.6),
        loss='mse',
        metrics=['mean_squared_error'],
    )
    return model
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
# Wrap the Keras builder function so it can be passed to GridSearchCV as an estimator.
model = KerasRegressor(build_fn=buildModel)
# Search grid: 2 batch sizes x 3 epoch counts = 6 candidate settings.
parameters = {'batch_size': [10,15],
'epochs': [100,200,250]}
# Evaluate every candidate with the `kfold` splitter, scored by negative mean squared error.
grid_search = GridSearchCV(estimator=model,param_grid=parameters,cv=kfold,scoring='neg_mean_squared_error',verbose=10)
# Run the search on the scaled training split only.
grid_search = grid_search.fit(X_train,y_train,verbose=10)
# Winning parameter combination and its cross-validated score.
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_
>> Best Parameters: {'batch_size': 10, 'epochs': 100}

Related

gridsearch before RFE is taking super long

I am trying to run a grid search on my dataset to find out how many features I should select in my RFE, but it is taking a very long time. Does anyone know whether this is normal, or do I have a fault in my script?
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
#%% train-test split
# Load the preprocessed data and drop the two columns that are not used below.
data = pd.read_csv('preprocesseddata.csv')
data.drop(['Date', 'About'], axis=1, inplace=True)
# Target column.
y = data['Class']
# Explicit list of the 35 feature columns.
X = data[['Duration_Ball Training','Duration_Match','Duration_Other','Duration_Strenght Training','Positie','Gender','Voorkeursbeen','Instroomjaar','Age','Hours Sleep','Stress','Muscle Soreness','T-test','20m Sprint','CMJ 2b','Yo Yo Result','Heart Rate (Max)','Latest Height', 'Body Fat %','Repetitive Injury','Prefered Leg','AcuteLegs_1day','AcuteCardio_1day','AcuteLegs_3days','AcuteCardio_3days','AcuteLegs_7days','AcuteCardio_7days','ChronicLegs_14days','ChronicCardio_14days','ChronicLegs_21days','ChronicCardio_21days','ChronicLegs_28days','ChronicCardio_28days','TrainingmonotonyLegs','TrainingmonotonyCardio']]
# Convert the class labels to integer category codes.
y = y.astype('category')
y = y.cat.codes
# 70/30 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# Fit the scaler on the training split only and reuse it for the test split.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
#%% RFE as part of pipeline
lr = LogisticRegression(solver='liblinear', random_state=123)
pipe = make_pipeline(RFE(estimator=lr, step=1), KNeighborsClassifier())
# 34 feature counts x 29 neighbour counts = 986 candidates. With cv=10 that is
# 9,860 pipeline fits (plus the final refit), and every fit runs its own
# recursive feature elimination one feature at a time.
parameters = {'rfe__n_features_to_select': range(1,35), 'kneighborsclassifier__n_neighbors': range(1,30)}
# n_jobs=-1 spreads the fits over all CPU cores. Running them serially
# (n_jobs=1) is the likely reason the search looked stuck.
grid = GridSearchCV(pipe, param_grid=parameters, cv=10, n_jobs=-1)
grid.fit(X_train_std, y_train)
print('Best params:', grid.best_params_)
print('Best accuracy:', grid.best_score_)
#%% RFE
lr = LogisticRegression(solver='liblinear', random_state=123)
# `step` is the number of features removed per iteration and must be positive;
# a negative value such as -1 is rejected by RFE.
rfe = RFE(estimator=lr, n_features_to_select=5, step=1)
rfe.fit(X_train_std, y_train)
# Keep only the 5 selected columns of the scaled training data.
X_train_sub = rfe.transform(X_train_std)
# Boolean mask of the selected features (displayed when run as an interactive cell).
rfe.support_
It seems to get stuck at the print best parameters line

Getting 100% Accuracy on my DecisionTree Model

Here is my code, and it always returns 100% accuracy, regardless of how big the test size is. I used the train_test_split method, so I do not believe there should be any duplicates of data. Could someone inspect my code?
# Question code, kept exactly as posted; review notes mark the problems in it.
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# NOTE(review): `pd` is used here but pandas is not imported in this snippet.
data = pd.read_csv('housing.csv')
# Target column; the features are everything except the target and 'ocean_proximity'.
prices = data['median_house_value']
features = data.drop(['median_house_value', 'ocean_proximity'], axis = 1)
prices.shape
# (interactive output of the line above)
(20640,)
features.shape
# (interactive output of the line above)
(20640, 8)
X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size=0.2, random_state=42)
# NOTE(review): dropna() is applied to each piece separately, so the feature
# and target pieces are not guaranteed to keep the same rows.
X_train = X_train.dropna()
y_train = y_train.dropna()
X_test = X_test.dropna()
# NOTE(review): y_test is rebuilt from X_test here, not from the y_test that
# train_test_split returned.
y_test = X_test.dropna()
# NOTE(review): a classifier is fitted on the 'median_house_value' column.
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_train.shape
# (interactive output of the line above)
(16512,)
X_train.shape
# (interactive output of the line above)
(16512, 8)
predictions = model.predict(X_test)
# NOTE(review): score() is called with (y_test, predictions); scikit-learn's
# signature is score(X, y).
score = model.score(y_test, predictions)
score
EDIT: I have reworked my answer since I found multiple issues. Please copy-paste the below code to ensure no bugs are left.
Issues -
You are using DecisionTreeClassifier instead of DecisionTreeRegressor for a regression problem.
You are removing nans after doing the test train split which will mess up the count of samples. Do the data.dropna() before the split.
You are calling model.score incorrectly: its signature is model.score(X_test, y_test), but you are passing it (y_test, predictions). Please use accuracy_score(y_test, predictions) with those parameters instead, or fix the call.
from sklearn.tree import DecisionTreeRegressor #<---- FIRST ISSUE
import numpy as np
import pandas as pd  # required for pd.read_csv below
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
data = pd.read_csv('housing.csv')
# Drop incomplete rows before splitting so features and target stay aligned.
data = data.dropna() #<--- SECOND ISSUE
prices = data['median_house_value']
features = data.drop(['median_house_value', 'ocean_proximity'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size=0.2, random_state=42)
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
# accuracy_score is a classification metric (exact label matches) and is not
# meaningful for continuous prices. A regressor's score(X, y) returns R^2.
score = model.score(X_test, y_test) #<----- THIRD ISSUE
score

What does the error mean and how to fix it - "ValueError: query data dimension must match training data dimension"

I am trying to write the code for K-NN
Below is my code. I know that the issue is in `predict()`, but I am not able to figure out how to fix it.
# Train a 5-nearest-neighbour classifier on UniversalBank.csv and predict the held-out test rows.
# Importing the libraries
import numpy as np
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('UniversalBank.csv')
# 11 feature columns selected by position; column 9 is the target.
X = dataset.iloc[:,[ 1,2,3,5,6,7,8,10,11,12,13]].values #,
y = dataset.iloc[:,9].values
#Splitting the dataset to training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state= 0)
#Feature scaling
# The scaler is fitted on the training split only and reused for the test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
#Fitting the classifier to training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train,y_train)
#Predicting the test results
# NOTE(review): X_test has the same 11 scaled columns the classifier was fitted on.
# Any other data passed to predict() needs the same column selection and
# sc.transform -- presumably a mismatch there causes the reported error; confirm
# against the failing call, which is not shown here.
y_pred = classifier.predict(X_test)

Why the output of cross_validate differ from the hardcode loop when using XGBClassifier?

Code #1 Pass pipeline with PCA, XGBClassifier steps to scikit-learn cross_validate function
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
# Seed Python's and NumPy's global random generators.
import random
random.seed(42)
import numpy as np
np.random.seed(42)
# Keyword arguments forwarded to cross_validate.
# X and y are not defined in this snippet; they must already exist.
kwargs = {
'n_jobs': -1,
'cv': LeaveOneOut(),
'X': X,
'y': y
}
# One-component PCA followed by an XGBoost classifier, both with fixed seeds.
pipe = Pipeline([
('pca', PCA(1, random_state=42)),
('xgbc', XGBClassifier(random_state=42))
])
results = cross_validate(pipe, **kwargs)
# Mean of the per-split test scores.
print(results['test_score'].mean())
Code #2: Hand-code the cross-validation loop and calculate the mean accuracy for exactly the same input X as Code #1
from xgboost import XGBClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.decomposition import PCA
# Seed Python's and NumPy's global random generators.
import random
random.seed(42)
import numpy as np
np.random.seed(42)

# Per-split scores; X and y are not defined in this snippet and must already exist.
acc = []
for train_idx, test_idx in LeaveOneOut().split(X, y):
    x_train, x_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    # Fit PCA on the training part only, then project both parts.
    pca = PCA(1, random_state=42)
    pca.fit(x_train)
    x_train = pca.transform(x_train)
    x_test = pca.transform(x_test)
    # A fresh classifier for every split.
    model = XGBClassifier(random_state=42, n_jobs=-1)
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    acc.append(score)
# Mean score over all splits.
print(np.mean(acc))

operand could not be broadcast together with shapes <56962,2> .. error

I am trying logistic regression classification using "k-fold cross validation" in Python.
my code:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix,roc_auc_score

data = pd.read_csv('xxx.csv')
# The "..." stands in for the remaining feature column names.
X = data[["a","b","c",...]]
y = data["Class"]


def get_predictions(clf, X_train, y_train, X_test):
    """Fit ``clf`` on the training data and predict the test data.

    Also prints the confusion matrix of the fitted classifier on its own
    training data.

    Args:
        clf: Classifier providing ``fit``, ``predict`` and ``predict_proba``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict.

    Returns:
        Tuple ``(y_pred, y_pred_prob)``: the result of ``clf.predict`` and of
        ``clf.predict_proba`` on ``X_test``.
    """
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred_prob = clf.predict_proba(X_test)
    train_pred = clf.predict(X_train)
    print('train-set confusion matrix:\n', confusion_matrix(y_train,train_pred))
    return y_pred, y_pred_prob


skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)
pred_test_full=0
cv_score=[]
i=1
for train_index, test_index in skf.split(X, y):
    X_train, y_train = X.loc[train_index], y.loc[train_index]
    X_test, y_test = X.loc[test_index], y.loc[test_index]
    # One model is fitted here for the ROC AUC score; get_predictions() below
    # fits a second, separate LogisticRegression(C=2) on the same split.
    log_cfl = LogisticRegression(C=2)
    log_cfl.fit(X_train, y_train)
    y_pred, y_pred_prob = get_predictions(LogisticRegression(C=2), X_train, y_train, X_test)
    score=roc_auc_score(y_test,log_cfl.predict(X_test))
    print('ROC AUC score: ',score)
    cv_score.append(score)
    # NOTE(review): y_pred_prob comes from predict_proba on this split's X_test,
    # so it has one row per test sample of the split. The addition below only
    # works while every split has the same number of test rows -- presumably
    # the splits differ in size by one row, which triggers the broadcast error.
    pred_test_full = pred_test_full + y_pred_prob
    i+=1
I get error at this line of code:
`pred_test_full = pred_test_full + y_pred_prob`
The for loop runs two times; then, in the third iteration, I get the error.
'operands could not be broadcast together with shapes <56962,2> <5696..' error.
I couldn't understand what is wrong, could you help to figure out?

Categories