I created an MLP that reads a very specific dataset (the iris data, in this case), trains and tests on that same data, then runs the evaluation and finally saves the model as a .pkl file. Everything works fine; I would just like to make the pipeline dynamic, possibly reusing existing external functions, so that the same model works with any generic dataset given as input, making it as generic as possible.
Thank you.
import pandas
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib
# load dataset
url = "iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)
# split dataset
array = dataset.values
X = array[:,0:4]
Y = array[:,4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
# Spot-check algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# Evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
#
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Preview
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
#
print(accuracy_score(Y_validation, predictions))
#Report
print(classification_report(Y_validation, predictions))
# Save
joblib.dump(knn, 'HomeWork2.pkl')
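For reference, a minimal sketch of how the loading and splitting step could be made dataset-agnostic (the helper name, the header assumption and the target-in-the-last-column convention are assumptions on my part, not something the code above guarantees):
def load_and_split(csv_path, test_size=0.20, seed=7):
    # Assumes the CSV has a header row and that the last column is the target
    dataset = pandas.read_csv(csv_path)
    array = dataset.values
    X = array[:, :-1]   # every column except the last is a feature
    Y = array[:, -1]    # the last column is the class label
    return model_selection.train_test_split(X, Y, test_size=test_size, random_state=seed)

# Usage: X_train, X_validation, Y_train, Y_validation = load_and_split("any_dataset.csv")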
Your best bet would be to go with Keras (with a GPU it will be faster):
Source: https://keras.io/getting-started/sequential-model-guide/#training
# For a single-input model with 2 classes (binary classification):
import keras
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(32, activation='relu', input_dim=100))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
loss='binary_crossentropy',
metrics=['accuracy'])
# Generate dummy data
import numpy as np
data = np.random.random((1000, 100))
labels = np.random.randint(2, size=(1000, 1))
# Train the model, iterating on the data in batches of 32 samples
model.fit(data, labels, epochs=10, batch_size=32)
# For a single-input model with 10 classes (categorical classification):
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=100))
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='rmsprop',
loss='categorical_crossentropy',
metrics=['accuracy'])
# Generate dummy data
import numpy as np
data = np.random.random((1000, 100))
labels = np.random.randint(10, size=(1000, 1))
# Convert labels to categorical one-hot encoding
one_hot_labels = keras.utils.to_categorical(labels, num_classes=10)
# Train the model, iterating on the data in batches of 32 samples
model.fit(data, one_hot_labels, epochs=10, batch_size=32)
Related
I'm currently using GridSearchCV for a neural network project and am wondering how I can get the same result (using the hyperparameters found by GridSearchCV) but without using GridSearchCV. What is the proper code to replicate this without GridSearchCV?
Here's my code using GridSearchCV:
from numpy.random import seed
seed(1)
import tensorflow as tf
tf.random.set_seed(2)
X = df.drop(['A','B','C','D','E'],axis=1).values
y = df['A'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
# X_test, X_val, y_test, y_val = train_test_split(X_test,y_test,test_size=0.5,random_state=101)
from sklearn.preprocessing import MinMaxScaler,StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
print(X.shape)
from sklearn.model_selection import KFold
kfold = KFold(5, shuffle=True, random_state=42)
#############
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD

def buildModel():
    model = Sequential()
    model.add(Dense(9, activation='tanh'))
    model.add(Dense(1))
    model.compile(optimizer=SGD(learning_rate=0.01, momentum=0.6), loss='mse', metrics=['mean_squared_error'])
    return model
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
model = KerasRegressor(build_fn=buildModel)
parameters = {'batch_size': [10,15],
'epochs': [100,200,250]}
grid_search = GridSearchCV(estimator=model,param_grid=parameters,cv=kfold,scoring='neg_mean_squared_error',verbose=10)
grid_search = grid_search.fit(X_train,y_train,verbose=10)
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_
>> Best Parameters: {'batch_size': 10, 'epochs': 100}
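A minimal sketch of one way to reproduce this without GridSearchCV: pass the winning hyperparameters straight to the wrapper and reuse the same folds. Only batch_size=10 and epochs=100 come from the best_params_ output above; the rest is an assumption about what "replicate" should mean here.
from sklearn.model_selection import cross_val_score

# Same network, but with the best hyperparameters fixed instead of searched
model = KerasRegressor(build_fn=buildModel, batch_size=10, epochs=100, verbose=10)

# Mirrors GridSearchCV's internal evaluation for that single parameter combination
scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error')
print(scores.mean())

# Or simply fit once on the full training set
model.fit(X_train, y_train)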
This is the first time I'm asking a question on this platform. I'm using sklearn's MLPRegressor model for bike-rental prediction. I need to compute and verify the RMSE on the test set, but when I run the prediction I get this error: X has 19 features, but MLPRegressor is expecting 100 features as input.
The two dataframes (training and testing) were treated in the same way, and I've already verified that their features have the same names. I also tried converting the variables from pd.DataFrame to numpy.ndarray and normalizing the data with StandardScaler, but every attempt leads to the same error. Can anybody help me?
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
#------------------------------------------------
# Import of training and test data, already treated. The data were treated in the same way
caminho = '/content/treino_tratado.csv'
train = pd.read_csv(caminho)
train
caminho2 = '/content/teste_tratado.csv'
test = pd.read_csv(caminho2)
test.head(3)
#-----------------------------------------------------
# definition of my predictor variables and target variable
X = train.drop('aluguéis', axis=1)
y = train['aluguéis']
# I chose to convert X and y to NumPy arrays; the sklearn documentation shows the inputs in this format
X = X.to_numpy()
y = y.to_numpy()
#test data
test = test.to_numpy()
#----------------------------------------------------------
# model
X, y = make_regression(n_samples=200, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3 , random_state=1)
regr = MLPRegressor(hidden_layer_sizes=(1000,), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000).fit(X_train, y_train)
predicoes = regr.predict(X_test)
from sklearn.metrics import mean_squared_error
import math
mse = mean_squared_error(y_test, predicoes)
rmse = math.sqrt(mse)
print(rmse)
# output = 109.25977971042573
#----------------------------------------------------------------
# prediction in test database
predicoes2 = regr.predict(test)
# error -> X has 19 features, but MLPRegressor is expecting 100 features as input.
Do you know that this line is creating a new random dataset with the X, y variables?
X, y = make_regression(n_samples=200, random_state=1)
Why are you using it?
It creates a new dataset with 100 features, and that is probably the reason for your error.
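A minimal sketch of the fix, assuming the goal is simply to train on your own treated data: drop the make_regression call and keep the X and y built from the training frame (variable names reused from the code above).
# Split the real data instead of a synthetic 100-feature dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
regr = MLPRegressor(hidden_layer_sizes=(1000,), max_iter=200, random_state=1).fit(X_train, y_train)

# The model now expects 19 features, so predicting on the treated test frame works
predicoes2 = regr.predict(test)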
This is the code I have used. I don't really know where I went wrong, as I am new to such things. As an overview, I am trying to classify whether there is an attack or not. I used 'Y' as my label, which contains 11 classes.
import pandas as pd
data=pd.read_csv('/content/gdrive/MyDrive/data /Ecobee_Thermostat.csv')
#imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
#label encoder
data['Y'].unique()
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
data['Y']=label_encoder.fit_transform(data['Y'])
data['Y'].unique()
#ex
data.head(10)
data.isnull().sum()
data.shape
for col in data.columns:
    print(col)
x = data.iloc[:, 0:115]
x.shape
Y=data['Y']
Y
#scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x)
X= scaler.transform(x)
#split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=44, shuffle =True)
#reshape
x_train = np.reshape(x_train, (x_train.shape[0],1,X.shape[1]))
x_test = np.reshape(x_test, (x_test.shape[0],1,X.shape[1]))
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
tf.keras.backend.clear_session()
from tensorflow import keras
from tensorflow.keras import layers
from keras import metrics
from sklearn.metrics import classification_report
opt = keras.optimizers.Adamax(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07,)
model = Sequential()
model.add(LSTM(200, input_shape=(1,115),activation="relu",return_sequences=True))
model.add(Dropout(0.2))
model.add(Dense(11, activation='softmax'))
from keras.optimizers import SGD
model.compile(loss = 'sparse_categorical_crossentropy', optimizer = opt, metrics = ['accuracy'])
model.summary()
history = model.fit(x_train, y_train, epochs=5, batch_size=2000,
                    validation_data=(x_test, y_test))
score, acc = model.evaluate(x_test, y_test)
print(acc)
print(score)
y_pred = model.predict(x_test)
y_predtrain = model.predict(x_train)  # <-- this is where I get the error
from sklearn.metrics import confusion_matrix
confusiontr = confusion_matrix(y_train, y_pred)
confusionts = confusion_matrix(y_test, y_pred.round())
print('Confusion Matrix\n')
print(confusionts)
sn.heatmap(confusiontr)
When I run it, it goes well until the evaluation; the output is as follows:
> 0.8152588605880737
0.36209338903427124
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-14-61331f357bd6> in <module>()
5 y_predtrain = model.predict(x_train)
6 from sklearn.metrics import confusion_matrix
----> 7 confusiontr = confusion_matrix(y_train, y_pred)
8 confusionts = confusion_matrix(y_test, y_pred.round())
9 print('Confusion Matrix\n')
2 frames
/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
210 if len(uniques) > 1:
211 raise ValueError("Found input variables with inconsistent numbers of"
--> 212 " samples: %r" % [int(l) for l in lengths])
213
214
ValueError: Found input variables with inconsistent numbers of samples: [560036, 275840]
I think the problem comes from the following lines:
y_pred = model.predict(x_test)
from sklearn.metrics import confusion_matrix
confusiontr = confusion_matrix(y_train, y_pred)
Your y_pred comes from x_test, while y_train comes from x_train. The train and test sets have different numbers of samples, so y_train and y_pred on the last line above don't match in length.
Thus, simply use y_predtrain instead for confusiontr:
confusiontr = confusion_matrix(y_train, y_predtrain)
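One additional point: with this architecture model.predict returns softmax probabilities of shape (n, 1, 11), while confusion_matrix expects integer class labels, so the predictions still need to be collapsed with argmax. A rough sketch, reusing the variables above:
import numpy as np

y_pred_labels = np.argmax(y_pred.reshape(y_pred.shape[0], -1), axis=1)
y_predtrain_labels = np.argmax(y_predtrain.reshape(y_predtrain.shape[0], -1), axis=1)
confusiontr = confusion_matrix(y_train, y_predtrain_labels)
confusionts = confusion_matrix(y_test, y_pred_labels)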
I am not able to pickle my model below.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
script_dir = os.path.dirname(__file__)
abs_file_path = os.path.join(script_dir, 'Churn_Modelling.csv')
# Importing the dataset
dataset = pd.read_csv(abs_file_path)
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features=[1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Part 2 - Now let's make the ANN!
# Importing the Keras libraries and packages
from tensorflow.contrib.keras.api.keras.models import Sequential
from tensorflow.contrib.keras.api.keras.layers import Dense
from tensorflow.contrib.keras import backend
# Initialising the ANN
classifier = Sequential()
# Adding the input layer and the first hidden layer
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=11))
# Adding the second hidden layer
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
# Adding the output layer
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
# Compiling the ANN
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size=10, epochs=100, validation_split=0.1)
# Part 3 - Making predictions and evaluating the model
# Predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)
# Predicting a single new observation
new_prediction = classifier.predict(sc.transform(np.array([[0.0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]])))
new_prediction = (new_prediction > 0.5)
I have tried using
from sklearn.externals import joblib
joblib.dump(classifier, 'model.pkl')
and
import pickle
with open('classifier.pkl', 'wb') as fid:
pickle.dump(classifier, fid,2)
For both, I am getting PicklingError: Can't pickle : attribute lookup module on builtins failed
What am I doing wrong? Your insights are much appreciated.
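The error indicates the Keras model itself cannot be serialised with pickle/joblib. A minimal sketch of the usual alternative, assuming h5py is installed and the same tensorflow.contrib Keras import path used above: save and reload the model with Keras' own API instead.
# Save the trained network with Keras' built-in serialisation (architecture + weights + optimizer state)
classifier.save('classifier.h5')

# ...and reload it later (load_model lives in the same models module as Sequential)
from tensorflow.contrib.keras.api.keras.models import load_model
classifier = load_model('classifier.h5')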
I'm trying to modify the example from this tutorial to use my own data.
In the tutorial the Y data can only take 3 different values, but in my case it can be anywhere between 0 and 200. I consider a prediction successful if it lands within ±3 of the true value.
I suspect I have to make some modification to the scoring variable, but I'm not sure how to proceed.
import pandas
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
url = "testdata2.csv"
dataset = pandas.read_csv(url)
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
# Split-out validation dataset
array = dataset.values
X = array[:,0:6]
Y = array[:,6]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(cv_results)
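A minimal sketch of one way to adapt the scoring variable to that ±3 tolerance, using sklearn's make_scorer with a custom metric (the function name and the exact tolerance handling are my assumptions):
import numpy as np
from sklearn.metrics import make_scorer

def within_3(y_true, y_pred):
    # Fraction of predictions that land within +/-3 of the true value
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred) <= 3)

scoring = make_scorer(within_3)  # pass this to cross_val_score instead of 'accuracy'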