Error pickling scikit-learn model - python

I am not able to pickle my model below.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
script_dir = os.path.dirname(__file__)
abs_file_path = os.path.join(script_dir, 'Churn_Modelling.csv')
# Importing the dataset
dataset = pd.read_csv(abs_file_path)
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features=[1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Part 2 - Now let's make the ANN!
# Importing the Keras libraries and packages
from tensorflow.contrib.keras.api.keras.models import Sequential
from tensorflow.contrib.keras.api.keras.layers import Dense
from tensorflow.contrib.keras import backend
# Initialising the ANN
classifier = Sequential()
# Adding the input layer and the first hidden layer
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=11))
# Adding the second hidden layer
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
# Adding the output layer
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
# Compiling the ANN
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size=10, epochs=100, validation_split=0.1)
# Part 3 - Making predictions and evaluating the model
# Predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)
# Predicting a single new observation
new_prediction = classifier.predict(sc.transform(np.array([[0.0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]])))
new_prediction = (new_prediction > 0.5)
I have tried using
from sklearn.externals import joblib
joblib.dump(classifier, 'model.pkl')
and
import pickle
with open('classifier.pkl', 'wb') as fid:
pickle.dump(classifier, fid, 2)
For both, I am getting PicklingError: Can't pickle : attribute lookup module on builtins failed.
What am I doing wrong? Your insights are much appreciated.
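For what it's worth, Keras models generally cannot be pickled with pickle or joblib, which is exactly the error above; the usual route is Keras's own save/load API. A minimal sketch, assuming the contrib Keras import path used above and that h5py is installed:
# Save the whole model (architecture + weights + optimizer state) to HDF5
classifier.save('classifier.h5')
# Later, restore it without rebuilding the network
from tensorflow.contrib.keras.api.keras.models import load_model
classifier = load_model('classifier.h5')
The StandardScaler sc, being a plain scikit-learn object, can still be dumped with joblib as in the question.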

Related

Using the configuration from the best epoch in keras without saving a file

I have a Keras model that I train, and I save the configuration from the best epoch in a separate file and load it afterwards. Here is my whole code:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import BatchNormalization, Dense, Flatten
from matplotlib import pyplot as plt
#Read data from csv files
ANN_input_data_features = pd.read_csv("C:/Users/User1/Desktop/TestDataANN_InputFeatures.csv", sep=';')
ANN_input_data_labels = pd.read_csv("C:/Users/User1/Desktop/TestDataANN_OutputLabels.csv", sep=';')
ANN_input_data_features = ANN_input_data_features.values
ANN_input_data_labels = ANN_input_data_labels.values
# standardize input features X and output labels Y
scaler_standardized_X = StandardScaler()
ANN_input_data_features = scaler_standardized_X.fit_transform(ANN_input_data_features)
scaler_standardized_Y = StandardScaler()
ANN_input_data_labels = scaler_standardized_Y.fit_transform(ANN_input_data_labels)
#Split dataset into train, validation, and test
index_X_Train_End = int(0.7 * len(ANN_input_data_features))
index_X_Validation_End = int(0.9 * len(ANN_input_data_features))
X_train = ANN_input_data_features[0:index_X_Train_End]
X_valid = ANN_input_data_features[index_X_Train_End:index_X_Validation_End]
X_test = ANN_input_data_features[index_X_Validation_End:]
Y_train = ANN_input_data_labels[0:index_X_Train_End]
Y_valid = ANN_input_data_labels[index_X_Train_End:index_X_Validation_End]
Y_test = ANN_input_data_labels[index_X_Validation_End:]
#Train the model
optimizer_adam = tf.keras.optimizers.Adam(learning_rate=0.001)
numberOfInputFeatures = len(ANN_input_data_features[0])
numberOfOutputNeurons = len(ANN_input_data_labels[0])
model = keras.Sequential([
    Flatten(input_shape=(numberOfInputFeatures,)),
    Dense(30, activation='relu'),
    Dense(50, activation='relu'),
    Dense(50, activation='relu'),
    Dense(30, activation='relu'),
    Dense(numberOfOutputNeurons)])
#Important part 1: The file with the best configuration is saved
entireFolderNameForTheResultsOfTheRun = "C:/Users/User1/Desktop/Training/"
pathOfTheFileForBestModel = entireFolderNameForTheResultsOfTheRun + "bestModelSingleTimeSlotTest.keras"
callbacks = [ keras.callbacks.ModelCheckpoint(pathOfTheFileForBestModel, save_best_only=True) ]
model.compile(loss="mean_squared_error", optimizer=optimizer_adam, metrics=['mean_absolute_percentage_error'])
history = model.fit(X_train, Y_train, epochs=2000, batch_size=10, validation_data=(X_valid, Y_valid), callbacks=callbacks)
# Predict the values from the test dataset
#Important part 2: The file with the best configuration is loaded
model = keras.models.load_model(pathOfTheFileForBestModel)
Y_pred = model.predict(X_test)
I indicated the important parts with comments (Important part 1 and Important part 2). Now my question is whether I can also use the configuration of the best epoch for predictions without saving a file, just within the script? The motivation behind this is that the script will run on a platform where I can't save any files.
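One way to do this without writing any file (a sketch, not from the original post: InMemoryCheckpoint is an illustrative helper) is to keep the best epoch's weights in memory with a custom callback and restore them after training; keras.callbacks.EarlyStopping(restore_best_weights=True) is a built-in alternative if early stopping is acceptable:
# Keep the weights of the epoch with the lowest validation loss in memory
class InMemoryCheckpoint(keras.callbacks.Callback):
    def __init__(self):
        super().__init__()
        self.best_loss = float('inf')
        self.best_weights = None
    def on_epoch_end(self, epoch, logs=None):
        val_loss = logs.get('val_loss')
        if val_loss is not None and val_loss < self.best_loss:
            self.best_loss = val_loss
            self.best_weights = self.model.get_weights()

checkpoint = InMemoryCheckpoint()
history = model.fit(X_train, Y_train, epochs=2000, batch_size=10,
                    validation_data=(X_valid, Y_valid), callbacks=[checkpoint])
model.set_weights(checkpoint.best_weights)  # restore the best epoch, no file involved
Y_pred = model.predict(X_test)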

ValueError in DL

I get this error after choosing the epochs: ValueError: Input arrays should have the same number of samples as target arrays. Found 5516 input samples and 12870 target samples. Any suggestions are welcome. Thanks in advance.
I'm using a dataset with a lot of categorical variables; they add up to 95 after creating the dummy variables. The code runs flawlessly until I choose the number of epochs, and then I get this error. What is the reason for it? It's important because, one, I might run into it again in the future, and two, I'm unable to proceed. :)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv('zrpl_data.csv')
X = dataset.iloc[:, 0:6].values
y = dataset.iloc[:, 6].values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 0] = labelencoder_X_1.fit_transform(X[:, 0])
labelencoder_X_2 = LabelEncoder()
X[:, 1] = labelencoder_X_2.fit_transform(X[:, 1])
labelencoder_X_3 = LabelEncoder()
X[:, 2] = labelencoder_X_3.fit_transform(X[:, 2])
labelencoder_X_4 = LabelEncoder()
X[:, 3] = labelencoder_X_4.fit_transform(X[:, 3])
labelencoder_X_5 = LabelEncoder()
X[:, 4] = labelencoder_X_5.fit_transform(X[:, 4])
onehotencoder = OneHotEncoder(categorical_features=[0, 1, 2, 3, 4])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
import keras
from keras.models import Sequential
from keras.layers import Dense
classifier = Sequential()
classifier.add(Dense(output_dim=47, init='uniform', activation='relu', input_dim=95))
classifier.add(Dense(output_dim=47, init='uniform', activation='relu'))
classifier.add(Dense(output_dim=1, init='uniform', activation='sigmoid'))
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
classifier.fit(X_train, y_train, batch_size=10, nb_epoch=100)
You have 5516 feature samples but 12870 target samples (they should be equal); double-check their dimensions before training the model.
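A quick way to verify (an illustrative check, not from the original answer):
# The first dimension of features and targets must agree before fitting
print(X_train.shape, y_train.shape)
assert X_train.shape[0] == y_train.shape[0], 'feature/target row counts differ'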

Gender Voice with Python coding

I am working on a sample data set from a link below.
https://www.kaggle.com/enirtium/gender-voice/data
I am trying to open the .csv file (maybe I am opening it wrongly) and trying to create fully connected neural layers. Then I try to train them, but unfortunately I am getting an input shape mismatch:
"ValueError: Error when checking input: expected dense_1_input to have shape (None, 2800) but got array with shape (3168, 1)"
My codes like these:
import csv
import numpy
import string
from keras.models import Sequential
from sklearn.model_selection import train_test_split
import numpy as np
from keras import models
from keras import layers
path = r'/Users/username/Desktop/voice.csv'
meanfreq = []
sd = []
median = []
label = []
with open(path, 'r') as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader)
    for line in csv_reader:
        #print(line['meanfreq'])
        meanfreq.append(line[0])
        sd.append(line[1])
        median.append(line[2])
        if line[20] == "female":
            label.append(1)
        else:
            label.append(0)
network = models.Sequential()
network.add(layers.Dense(512, activation='relu', input_shape=(2800,)))
network.add(layers.Dense(1, activation='sigmoid'))
network.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
network.fit(meanfreq, label, epochs=5, batch_size=128)
scores = network.evaluate(meanfreq, label)
print("\n%s: %.2f%%" % (network.metrics_names[1], scores[1]*100))
I suppose that maybe I can't open the .csv file properly (it is being read as a plain list), or there is some other problem. I am unfortunately new to neural networks and Python. I want to open this csv file and use 70% of its data for training and 30% for testing.
Yes, it works like this:
import pandas as pd
import numpy as np
from keras import models
from keras import layers
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
# get data ready
data = pd.read_csv('voice.csv')
data.shape
# split out features and label
X = data.iloc[:, :-1].values
y = data.iloc[:, -1]
# map category to binary
y = np.where(y == 'male', 1, 0)
enc = OneHotEncoder()
# reshape y to be column vector
y_ = enc.fit_transform(y.reshape(-1, 1)).toarray()
X_train, X_test, y_train, y_test = train_test_split(
    X, y_, train_size=0.80, random_state=42)
network = models.Sequential()
network.add(layers.Dense(512, activation='relu', input_shape=(20,)))
network.add(layers.Dense(2, activation='sigmoid'))
network.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
network.fit(X_train, y_train, epochs=100, batch_size=128)
network.evaluate(X_test, y_test)
Reading in the data seems to be fine.
I imagine you have a data set that looks like:
mean_freq  label
.12        0
.45        1
And you want to train a classifier. Currently the model expects each training example to have 2800 features (input_shape=(2800,)), but you only want one feature: the mean_freq.
The mistake here is that you are trying to tell Keras how many training examples to use while declaring the model. You don't do that there; you do it later, when fitting the model.
So the input_shape of Keras's Dense layer should be (1,) for the single feature. If you're going to use the mean and median freq, then you would want two features, (2,), and so on.
# note change from 2800 to 1
network.add(layers.Dense(512, activation='relu', input_shape=(1,)))
And you can split your training and test sets in multiple ways. My suggestion is to do something like this:
train_size = 2800
X_train = mean_freq[:train_size]
y_train = label[:train_size]
X_test = mean_freq[train_size:]
y_test = label[train_size:]
Then fit the model with the training set and score with the test set.
network.fit(X_train, y_train, epochs=5, batch_size=128)
scores = network.evaluate(X_test, y_test)
Edit to reflect comments:
Well, if your training data has 20 features, then you tell Keras that with:
# note change from 2800 to 20
network.add(layers.Dense(512, activation='relu', input_shape=(20,)))
You have to do the work necessary to get the data into the shape you need for training and testing, but the template above is how you would fit and evaluate the model.
I would also note that there are better ways to read in csv data if you're going to do modeling (as you are): look at using a pandas DataFrame.
There are also better (more standard) ways of creating the train and test split: look into sklearn's train_test_split.
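For example, a sketch of that more standard route (the label mapping follows the voice.csv layout described above):
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('voice.csv')                         # pandas parses types for you
X = data.iloc[:, :-1].values                            # all 20 feature columns
y = (data.iloc[:, -1] == 'female').astype(int).values   # female -> 1, male -> 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)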
Edit 2: A quick model of the voice data
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Dense, Input
# get data ready
data = pd.read_csv('voice.csv')
data.shape
# split out features and label
X = data.iloc[:, :-1].values
y = data.iloc[:, -1]
# map category to binary
y = np.where(y == 'male', 1, 0)
enc = OneHotEncoder()
# reshape y to be column vector
y_ = enc.fit_transform(y.reshape(-1, 1)).toarray()
X_train, X_test, y_train, y_test = train_test_split(
    X, y_, train_size=0.80, random_state=42)
# model using keras functional style
inp = Input(shape=(20,))
dense = Dense(128)(inp)
out = Dense(2, activation='sigmoid')(dense)
model = Model(inputs=[inp], outputs=[out])
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100, batch_size=128)
model.evaluate(X_test, y_test)

k-fold cross validation using tensorflow

I have created an artificial neural network. I am trying to calculate the accuracy of the model using k-fold cross validation, but after running the last line it does not progress any further; it has been stuck there for more than 20 minutes. I am not able to figure out where I am going wrong. Can anyone please help me with this? Below is the code I have used.
Thanks in advance.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv('Churn_Modelling.csv')
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features=[1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.models import Sequential #required to initialize ann
from keras.layers import Dense #required to build the layers of ann
def build_classifier():
    classifier = Sequential()
    classifier.add(Dense(kernel_initializer="uniform", activation="relu", input_dim=11, units=6))
    classifier.add(Dense(kernel_initializer="uniform", activation="relu", units=6))
    classifier.add(Dense(kernel_initializer="uniform", activation="sigmoid", units=1))
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return classifier
classifier = KerasClassifier(build_fn=build_classifier, batch_size=10, nb_epoch=100)
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10, n_jobs=-1)
I had the same issue with the exact same code. It seems Windows has a problem with n_jobs; if you remove it from the accuracies = ... line, it will start working. It may take a while, but it will run and show each epoch updating.
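In other words (a sketch; the __main__ guard is a general Windows multiprocessing precaution, not something stated in the original answer):
# Without n_jobs the folds run serially in the main process, avoiding the hang
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
print(accuracies.mean(), accuracies.std())

# To keep n_jobs=-1 on Windows, the entry point usually needs a guard so
# worker processes can re-import the script safely:
# if __name__ == '__main__':
#     accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10, n_jobs=-1)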

Create MLP architecture for general learning

I created an MLP that reads a very specific dataset (in this case, iris) and then performs training and testing on that data. Then I run the evaluation and finally save the model as a pkl. Everything works OK; I would just like to make the model dynamic, possibly using existing and external functions, so that it works with a generic dataset given as input, chosen at random, making it as generic as possible.
Thank you.
import pandas
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib
# load dataset
url = "iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)
# split dataset
array = dataset.values
X = array[:,0:4]
Y = array[:,4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
# Test options
seed = 7
scoring = 'accuracy'
# Algo
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# Value
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
#
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Preview
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
#
print(accuracy_score(Y_validation, predictions))
#Report
print(classification_report(Y_validation, predictions))
# Save
joblib.dump(knn, 'HomeWork2.pkl')
Your best bet would be to go with Keras (with a GPU it will be faster):
Source: https://keras.io/getting-started/sequential-model-guide/#training
import keras
from keras.models import Sequential
from keras.layers import Dense
# For a single-input model with 2 classes (binary classification):
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=100))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Generate dummy data
import numpy as np
data = np.random.random((1000, 100))
labels = np.random.randint(2, size=(1000, 1))
# Train the model, iterating on the data in batches of 32 samples
model.fit(data, labels, epochs=10, batch_size=32)
# For a single-input model with 10 classes (categorical classification):
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=100))
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# Generate dummy data
import numpy as np
data = np.random.random((1000, 100))
labels = np.random.randint(10, size=(1000, 1))
# Convert labels to categorical one-hot encoding
one_hot_labels = keras.utils.to_categorical(labels, num_classes=10)
# Train the model, iterating on the data in batches of 32 samples
model.fit(data, one_hot_labels, epochs=10, batch_size=32)
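To make this generic over an arbitrary dataset, as the question asks (a sketch: build_model and its parameters are illustrative, not part of the Keras guide quoted above), the layer dimensions can be derived from the data itself:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

def build_model(n_features, n_classes):
    # Infer input and output sizes from the dataset instead of hard-coding them
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=n_features))
    if n_classes == 2:
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    else:
        model.add(Dense(n_classes, activation='softmax'))
        model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Usage: any (X, y) pair works without touching the model definition
X = np.random.random((1000, 100))
y = np.random.randint(2, size=(1000,))
model = build_model(X.shape[1], len(np.unique(y)))
model.fit(X, y, epochs=10, batch_size=32)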
