import os
from pylab import rcParams
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()
from numpy import *
from scipy import stats
from pandas.plotting import scatter_matrix
import sklearn
import warnings
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
data = pd.read_excel(r'Attrition Data Exercise.xlsx')
X = data.iloc[:, 3:-1].values
y = data.iloc[:, -1].values
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
ct = ColumnTransformer(
    transformers=[('one_encoder', OneHotEncoder(), [2, 5, 11, 13, 28]),
                  ('ord_encoder', OrdinalEncoder(), [0])],
    remainder='passthrough')
X = np.array(ct.fit_transform(X))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
ann = tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dropout(rate=0.3))
ann.add(tf.keras.layers.Dense(units=6, activation='relu', kernel_regularizer='l1', bias_regularizer='l2'))
ann.add(tf.keras.layers.Dropout(rate=0.3))
ann.add(tf.keras.layers.Dense(units=3, activation='relu', kernel_regularizer='l1', bias_regularizer='l2'))
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
opt = tf.keras.optimizers.Adam(learning_rate=0.001,
                               beta_1=0.9,
                               beta_2=0.999,
                               epsilon=1e-08)
ann.compile(optimizer = opt, loss = 'binary_crossentropy', metrics = ['accuracy', tf.keras.metrics.Recall()])
The above code runs successfully. The error occurs when I run the code below in a cell.
pipe = Pipeline([('smt', SMOTE()), ('model', KerasClassifier(build_fn = ann, verbose = 0, epochs=170))])
weights = np.linspace(0.5, 0.5, 1)
gsc = GridSearchCV(estimator=pipe,
                   param_grid={'smt__sampling_strategy': weights},
                   scoring='f1',
                   cv=4)
grid_result = gsc.fit(X_train, y_train)
The code above results in the following error:
ValueError: The first argument to `Layer.call` must always be passed
Any idea what I might be doing wrong or what can be improved?
I tried replacing KerasClassifier with KerasRegressor too, just to see if anything changed, but nothing did. What essentially is going wrong?
I'm trying to use imblearn's Pipeline class together with GridSearchCV to find the best parameters for classifying an imbalanced dataset. I want to leave the validation set unresampled and resample only the training set, which imblearn's Pipeline seems to do. However, I'm getting an error while implementing the accepted solution.
A link to a screenshot of the full error trace is also attached: Error Trace Complete
#danr got it correct. Many thanks to him. I was getting the same error when using KerasClassifier with sklearn's cross_val_score. Adding a lambda after build_fn solved the problem. I had a function create_model that created a Keras Sequential model. Corrected code that runs smoothly (tensorflow 2.4.1):
from sklearn.model_selection import cross_val_score
# Create a KerasClassifier using best params determined using RandomizedSearchCV above
model = KerasClassifier(build_fn = lambda: create_model(learning_rate = 0.01, activation = 'tanh'), epochs = 50, batch_size = 32, verbose = 0)
# Calculate the accuracy score for each fold
kfolds = cross_val_score(model, X, y, cv = 3)
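Applied to the original question, the same fix means handing KerasClassifier a callable instead of the already-built ann instance. A minimal sketch, assuming the model-building code from the question is moved into a hypothetical build_ann function (not in the original post):

def build_ann():
    # Hypothetical builder: recreates and compiles the same network as in the
    # question, so KerasClassifier can build a fresh model on every fit.
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dropout(rate=0.3))
    model.add(tf.keras.layers.Dense(units=6, activation='relu',
                                    kernel_regularizer='l1', bias_regularizer='l2'))
    model.add(tf.keras.layers.Dropout(rate=0.3))
    model.add(tf.keras.layers.Dense(units=3, activation='relu',
                                    kernel_regularizer='l1', bias_regularizer='l2'))
    model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.Recall()])
    return model

# The wrapper now receives a callable, not a Model instance.
pipe = Pipeline([('smt', SMOTE()),
                 ('model', KerasClassifier(build_fn=build_ann, verbose=0, epochs=170))])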
I am trying to create a stacking ensemble using scikit-learn that contains a Keras model wrapped using KerasClassifier.
Here's an example of how my code looks using the iris dataset:
# import libraries
import pandas
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Dropout, Flatten, Dense
from keras.utils import np_utils
from keras import optimizers
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from numpy import mean
from numpy import std
# import data
dataframe = pandas.read_csv("iris.csv", header=None)
dataset = dataframe.values
X = dataset[:,0:4].astype(float)
Y = dataset[:,4]
# create and wrap neural network
def create_model():
    model = Sequential()
    model.add(Flatten(input_shape=X.shape[1:]))
    model.add(Dense(150, activation=tf.keras.layers.LeakyReLU(alpha=0.3)))
    model.add(Dropout(0.9))
    model.add(Dense(50, activation=tf.keras.layers.LeakyReLU(alpha=0.3)))
    model.add(Dropout(0.9))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adam(lr=2e-3),
                  metrics=['acc'])
    return model
model_nn = KerasClassifier(build_fn=create_model, epochs=50, batch_size=5, verbose=0)
model_nn._estimator_type = "classifier"
# create stack
def stacking():
    level0 = list()
    level0.append(('lr', LogisticRegression(max_iter = 500000, C = .00041, solver = 'newton-cg', multi_class = 'ovr')))
    level0.append(('nn', model_nn))
    level0.append(('svm', SVC(C=1.0, gamma='scale', tol=.001, probability = True)))
    level1 = LogisticRegression()
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model
# evaluate model score
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores
scores = evaluate_model(stacking(), X, Y)
print('%.3f (%.3f)' % (mean(scores), std(scores)))
and I get this error:
ValueError: The estimator KerasClassifier should be a classifier.
I found some posts where other users had this issue, but they were able to fix it using the model_nn._estimator_type = "classifier" line. Unfortunately, that isn't solving the issue for me. I'm really new to all of this, so any advice is appreciated. :)
KerasClassifier has been migrated from keras.wrappers.scikit_learn to scikeras.wrappers.
You need to use the code below to access KerasClassifier:
!pip install scikeras
from scikeras.wrappers import KerasClassifier
Please check this link for more details.
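For example, a minimal sketch of wrapping the create_model function from the question with SciKeras (note that SciKeras takes the builder through the model argument rather than the old build_fn):

from scikeras.wrappers import KerasClassifier

# SciKeras passes the builder via `model=` instead of the legacy `build_fn=`.
model_nn = KerasClassifier(model=create_model, epochs=50, batch_size=5, verbose=0)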
I have tried to look for the problem, but I'm not seeing anything wrong here. What could it be? This is for binary classification with an SVM on the Fashion-MNIST dataset, classifying only the 5 and 7 classes.
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
trainset = 'mnist_train.xlsx'
trs = pd.read_excel(trainset)
testset = 'mnist_test.xlsx'
tes = pd.read_excel(testset)
xtrain = trs.iloc[:, [1, 783]]
ytrain = trs.iloc[:, 0]
xtest = tes.iloc[:, [1, 783]]
ytest = tes.iloc[:, 0]
##Linear SVC
svclassifier = SVC(kernel='linear', C=1)
svclassifier.fit(xtest, ytest)
ypred = svclassifier.predict(xtest)
print(ypred.score(xtrain, ytrain))
print(ypred.score(xtest, ytest))
##Gaussian SVC
svclassifier = SVC(kernel='rbf', C=1)
svclassifier.fit(xtrain, ytrain)
ypred = svclassifier.predict(xtest)
print(ypred.score(xtrain, ytrain))
print(ypred.score(xtest, ytest))
ypred is an array of predicted class labels, so the exception makes sense.
What you should do is use the classifier’s score method:
svclassifier = SVC(kernel='rbf', C=1)
svclassifier.fit(xtrain, ytrain)
# ypred = svclassifier.predict(xtest) # We don’t actually use this.
print(svclassifier.score(xtrain, ytrain))
print(svclassifier.score(xtest, ytest))
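As an aside, the code never restricts the data to the 5 and 7 classes the question describes. A sketch of that filtering, assuming the label sits in column 0 as the iloc indexing above suggests:

# Keep only the rows labelled 5 or 7 before separating features and labels
# (assumes the label is in column 0, matching the iloc usage above).
trs = trs[trs.iloc[:, 0].isin([5, 7])]
tes = tes[tes.iloc[:, 0].isin([5, 7])]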
I developed 3 ML models in Spyder: Linear Regression, Polynomial Regression, and Random Forest Regression. In Spyder all of them worked well. However, when I deployed them in a Django web app, Random Forest raised "ValueError: Buffer type mismatch, expected 'SIZE_t' but got 'long long'". (When I removed the random forest, the other two models worked well.)
Check this out first:
Model developed in Spyder
"""****************** Import Lib ******************"""
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
"""****************** Loading dataset ******************"""
boston = load_boston()
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
dataset['target'] = boston.target
"""****************** Data Preprocessing ******************"""
""" Data Analysis """
# Check Null
dataset.isnull().sum()
# Calculate X and y
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values.reshape(-1,1)
# train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25)
""" Visualizing Data """
corr = dataset.corr()
sns.heatmap(corr, annot=True, cmap='Blues')
sns.pairplot(dataset)
"""****************** Regression Models ******************"""
""" Linear Regression """
from sklearn.linear_model import LinearRegression
regressor_linear = LinearRegression()
regressor_linear.fit(X_train, y_train)
cv_linear = cross_val_score(estimator = regressor_linear, X=X_train, y=y_train, cv=10)
""" Polynomial Regression """
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X_train)
regressor_poly2 = LinearRegression()
regressor_poly2.fit(X_poly, y_train)
cv_poly2 = cross_val_score(estimator=regressor_poly2, X=X_poly, y=y_train, cv=10)
""" Random Forest Regression """
from sklearn.ensemble import RandomForestRegressor
regressor_rf = RandomForestRegressor(n_estimators=500, random_state=0, n_jobs=-1)
regressor_rf.fit(X_train, y_train.ravel())
cv_rf = cross_val_score(estimator=regressor_rf, X=X_train, y=y_train.ravel(), cv=10)
"""****************** Measuring the Error ******************"""
models = [
    ('Linear Regression', cv_linear.mean()),
    ('Polynomial Regression (2)', cv_poly2.mean()),
    ('Random Forest Regression', cv_rf.mean())
]
cv_scores = pd.DataFrame(data=models, columns=['Model','CV Score'])
"""****************** Dump ******************"""
from sklearn.externals import joblib
joblib.dump(regressor_linear,'regressor_linear_jb')
joblib.dump(regressor_poly2,'regressor_poly2_jb')
joblib.dump(regressor_rf,'regressor_rf_jb')
Django Implementation Code
from django.shortcuts import render
from django.http import HttpResponse
import json
from django.http import JsonResponse
import pandas as pd
import numpy as np
from sklearn.externals import joblib
from sklearn.preprocessing import PolynomialFeatures
# Create your views here.
# ML Code
regressor_linear = joblib.load('./models/regressor_linear_jb')
regressor_poly2 = joblib.load('./models/regressor_poly2_jb')
regressor_rf = joblib.load('./models/regressor_rf_jb')
# ML Code End
def predict(request):
    temp_data = [
        0.16902,
        0,
        25.65,
        0,
        0.581,
        5.986,
        88.4,
        1.9929,
        2,
        188,
        19.1,
        385.02,
        14.81,
    ]
    temp_df = pd.DataFrame(temp_data).transpose()
    predict = {}
    # Linear Regression
    predict['Linear Regressor'] = round(regressor_linear.predict(temp_df)[0, 0], 2)
    # Polynomial Regression
    regressor_poly = PolynomialFeatures(degree=2)
    temp_df_poly = regressor_poly.fit_transform(temp_df)
    predict['Polynomial Regressor'] = round(regressor_poly2.predict(temp_df_poly)[0, 0], 2)
    # Random Forest Regression
    predict['Random Forest Regressor'] = round(regressor_rf.predict(temp_df)[0], 2)
    return JsonResponse(predict)
Switch Django's environment to the Anaconda one and this will be resolved.
The Jupyter notebook was using the Anaconda environment, while Django was using a different environment installed on the system.
(The major problem: one was 32-bit while the other was 64-bit.)
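A quick way to confirm which interpreter and bitness each side is using is to run this short sketch in both the notebook and the Django process:

import platform
import struct
import sys

print(sys.executable)                # path of the interpreter actually running
print(platform.architecture()[0])    # '32bit' or '64bit'
print(struct.calcsize('P') * 8)      # pointer size in bits: 32 or 64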
I am trying to optimize hyperparameters for ridge regression, and also add polynomial features. The pipeline looks okay, but I get an error when I run GridSearchCV. Here:
# Importing the Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from collections import Counter
from IPython.core.display import display, HTML
sns.set_style('darkgrid')
# Data Preprocessing
from sklearn.datasets import load_boston
boston_dataset = load_boston()
dataset = pd.DataFrame(boston_dataset.data, columns = boston_dataset.feature_names)
dataset['MEDV'] = boston_dataset.target
# X and y Variables
X = dataset.iloc[:, 0:13].values
y = dataset.iloc[:, 13].values.reshape(-1,1)
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 25)
# Building the Model ------------------------------------------------------------------------
# Fitting regressior to the Training set
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Ridge())
]
ridge_pipe = Pipeline(steps)
ridge_pipe.fit(X_train, y_train)
# Predicting the Test set results
y_pred = ridge_pipe.predict(X_test)
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = ridge_pipe, X = X_train, y = y_train, cv = 10)
accuracies.mean()
#accuracies.std()
# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
parameters = [ {'alpha': np.arange(0, 0.2, 0.01) } ]
grid_search = GridSearchCV(estimator = ridge_pipe,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train) # <-- GETTING ERROR IN HERE
Error:
ValueError: Invalid parameter ridge for estimator
What should I do, or is there a better way to use ridge regression with a pipeline? I would appreciate some pointers on grid search, because I am a newbie at this.
There are two problems in your code. First, since you are using a pipeline, you need to specify in the params list which part of the pipeline each parameter belongs to. See the official documentation for more information:
The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. For this, it enables setting parameters of the various steps using their names and the parameter name separated by a '__', as in the example below.
In this case, since alpha belongs to the ridge regression step and you used the string model for that step in the Pipeline definition, you need to rename the key alpha to model__alpha:
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Ridge())  # <------ Whatever string you assign here will be used later
]
# Since you have named the step 'model', the key needs to be 'model__alpha'
parameters = [ {'model__alpha': np.arange(0, 0.2, 0.01) } ]
Next, you need to understand that this dataset is for regression. You should not use accuracy here; instead, use a regression-based scoring function such as mean_squared_error (there are other regression metrics you can use as well). Something like this:
from sklearn.metrics import mean_squared_error, make_scorer

# MSE should be minimized, so tell make_scorer that lower is better;
# otherwise GridSearchCV would pick the parameters with the *highest* error.
scoring_func = make_scorer(mean_squared_error, greater_is_better=False)

grid_search = GridSearchCV(estimator = ridge_pipe,
                           param_grid = parameters,
                           scoring = scoring_func,  # <--- Use the scoring func defined above
                           cv = 10,
                           n_jobs = -1)
Here is a link to a Google colab notebook with working code.
For the GridSearchCV parameters, the parameter name must be prefixed with the pipeline step name followed by two underscores; since the Ridge step above is named 'model', use 'model__alpha' instead of just 'alpha'.
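A quick way to list the exact parameter names a pipeline accepts is to inspect it directly, as in this sketch using the ridge_pipe defined above:

# Prints every tunable name, including the 'step__param' forms GridSearchCV expects.
print(sorted(ridge_pipe.get_params().keys()))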
I have created an artificial neural network. I am trying to estimate the model's accuracy using k-fold cross-validation, but after running the last line it does not progress any further; it has been stuck for more than 20 minutes. I am not able to figure out where I am going wrong. Can anyone please help me with this? Below is the code I have used.
Thanks in advance.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv('Churn_Modelling.csv')
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
X=X[:,1:]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.models import Sequential #required to initialize ann
from keras.layers import Dense #required to build the layers of ann
def build_classifier():
    classifier = Sequential()
    classifier.add(Dense(kernel_initializer="uniform", activation="relu", input_dim=11, units=6))
    classifier.add(Dense(kernel_initializer="uniform", activation="relu", units=6))
    classifier.add(Dense(kernel_initializer="uniform", activation="sigmoid", units=1))
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return classifier
classifier=KerasClassifier(build_fn=build_classifier, batch_size=10, nb_epoch=100)
accuracies=cross_val_score(estimator=classifier,X=X_train,y=y_train,cv=10,n_jobs=-1)
I had the same issue with the exact same code. It seems Windows has a problem with n_jobs; if you remove it from the accuracies = ... line, it will start working. It may take a long time, but it will work and show each epoch being updated.
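For reference, a sketch of that last line with n_jobs dropped, so the folds run serially:

# Without n_jobs, no worker processes are spawned, sidestepping the Windows hang.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)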