Converting numeric labels back to original strings - python

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# Load the labelled messages and encode every category string as an integer code.
data = pd.read_csv('data/TrainingData_unsubscribe.csv')
data['labels'] = data['Category'].factorize()[0]

# Vectorize the features.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', stop_words='english')
x_vectors = tfidf.fit_transform(data.msgContent)

# Split the data, then densify the matrices before they are handed to Keras.
x = x_vectors
y = data.labels
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train = x_train.toarray()
x_test = x_test.toarray()
preds = x_vectors.toarray()

# Early-stopping callback that watches the training accuracy.
stop = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=10)

# Create the model: the softmax layer has one unit per distinct label.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(len(pd.unique(y)), activation='softmax'),
])

# Compile the model; the targets are integer codes, hence the sparse loss.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)

# Fit the model.
model.fit(x_train, y_train, epochs=500, verbose=0, callbacks=[stop])

# Evaluation on the held-out split.
print('\nEvaluation: ')
model.evaluate(x_test, y_test)

# Predict a class index for every row of the full dataset.
predictions = model.predict(preds)
data["Prediction"] = predictions.argmax(axis=1)
output = data.drop(["labels"], axis=1)

# category_ids holds one row per (Category, labels) pair. Indexing it by the
# integer code turns it into a look-up Series, so Series.map can translate the
# predicted codes back into the original category strings.
category_ids = data[["Category", "labels"]].drop_duplicates()
output['Prediction'] = output['Prediction'].map(category_ids.set_index('labels')['Category'])
Originally converted string labels to numeric ones using factorize():
data['labels'] = data['Category'].factorize()[0]
Now I'm trying to convert the labels back to their initial string variables. I've created a DF with the mapped values
Category
labels
HardBounce
0
SoftBounce
1
etc...
is it possible to map a df column using another df as a reference for the map?
I've been unable to find any docs that show how to do this.

Related

how to predict encoded sentences with tensorflow model

Using spaCy, I convert my training sets into matrices containing the individual word vectors. This is done in the encode_sentences() function. Furthermore, I encode the corresponding labels using the label_encoding() function. These data are then used as training data for my model. When I try to predict a single sentence to get the right label, I get a NumPy array as output instead of a label. How can I make a correct prediction?
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import sqlite3
from sklearn.preprocessing import LabelEncoder
import spacy
from sklearn.model_selection import train_test_split
# Load the spaCy pipeline once at module level; encode_sentences() reuses it.
nlp = spacy.load('en_core_web_lg')
# Used as the number of columns of the feature matrix built by encode_sentences().
# NOTE(review): presumably the width of the pipeline's word vectors — confirm against the spaCy docs.
embedding_dim = nlp.vocab.vectors_length
def read_database(path):
    """Load the training sentences and their intent labels from a SQLite file.

    Args:
        path: Location of the SQLite database. It must contain an ``intents``
            table (``id``, ``intent``) and a ``patterns`` table
            (``intentid``, ``pattern``).

    Returns:
        A tuple ``(sentences, labels, intents)``: ``sentences`` and ``labels``
        are parallel lists with one entry per pattern row, and ``intents``
        lists each distinct label once, in order of first appearance.
    """
    query = '''select intents.intent, patterns.pattern
    from intents, patterns where intents.id = patterns.intentid'''
    connection = sqlite3.connect(path)
    try:
        db_rows = pd.read_sql(query, connection)
    finally:
        # Release the database handle even if the query fails.
        connection.close()

    labels = db_rows["intent"].tolist()
    sentences = db_rows["pattern"].tolist()
    # dict.fromkeys de-duplicates while keeping first-seen order.
    intents = list(dict.fromkeys(labels))
    return sentences, labels, intents
def label_encoding(labels):
    """Turn the string labels into integer class ids.

    A new LabelEncoder is fitted on ``labels`` and the encoded array is
    returned. The number of labels and the shape of the result are printed.
    """
    print('Number of labels :-', len(labels))
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(labels)
    print('Length of y :- ', encoded.shape)
    return encoded
def encode_sentences(sentences):
    """Encode sentences as rows of spaCy document vectors.

    Args:
        sentences: A sequence of sentence strings. A single ``str`` is also
            accepted and is treated as one sentence rather than being
            iterated character by character.

    Returns:
        A NumPy array of shape ``(n_sentences, embedding_dim)`` whose row
        ``i`` is the ``.vector`` of the parsed ``sentences[i]``.
    """
    if isinstance(sentences, str):
        sentences = [sentences]

    X = np.zeros((len(sentences), embedding_dim))
    for idx, sentence in enumerate(sentences):
        # Save the document's .vector attribute to the corresponding row in X.
        X[idx, :] = nlp(sentence).vector
    return X
# Load the data, then encode the sentences as vectors and the labels as integers.
sentences_train, labels_train, all_intents = read_database('./database_x.db')
sentences_train = encode_sentences(sentences_train)
labels_train = label_encoding(labels_train)
x_train, x_test, y_train, y_test = train_test_split(sentences_train, labels_train, test_size=0.2)

# Small dense classifier with one softmax unit per distinct intent.
model = keras.Sequential([
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(len(all_intents), activation='softmax'),
])
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.01),
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)
model.fit(x_train, y_train, batch_size=16, epochs=100)

# Pass a one-element list so that exactly one sentence (one row) is encoded.
# The output holds one probability per intent for that sentence.
prediction = model.predict(encode_sentences(["how can i test rf heating"]))
print("\n\n\n")
print(prediction)
The argmax of the predictions gives you the index of the best label candidate for each input row.
# `prediction` is the array returned by model.predict() in the question;
# the argmax over the last axis is the most probable class index per row.
pred_labels_idx = np.argmax(prediction, axis=-1)
A good practice is to make the LabelEncoder le accessible from outside the function scope, so that you can inverse_transform the predicted indices back to the actual labels:
# Translate the integer class indices back into the original string labels.
# `le` must be the same fitted LabelEncoder that encoded the training labels.
pred_labels = le.inverse_transform(pred_labels_idx)
Perhaps you can modify your label_encoding function so that it also returns the fitted encoder (to be used as a label decoder) when it encodes the labels of the training data.

Using the configuration from the best epoch in keras without saving a file

I have a keras model that I train and I save the configuration from the best epoch in a separate file and load it afterwards. Here is my whole code
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import BatchNormalization, Dense, Flatten
from matplotlib import pyplot as plt
#Read data from csv files
ANN_input_data_features = pd.read_csv("C:/Users/User1/Desktop/TestDataANN_InputFeatures.csv", sep=';')
ANN_input_data_labels = pd.read_csv("C:/Users/User1/Desktop/TestDataANN_OutputLabels.csv", sep=';')
ANN_input_data_features = ANN_input_data_features.values
ANN_input_data_labels = ANN_input_data_labels.values

#Split dataset into train, validation, and test (first 70 % / next 20 % / last 10 % of the rows)
index_X_Train_End = int(0.7 * len(ANN_input_data_features))
index_X_Validation_End = int(0.9 * len(ANN_input_data_features))
X_train = ANN_input_data_features[0:index_X_Train_End]
X_valid = ANN_input_data_features[index_X_Train_End:index_X_Validation_End]
X_test = ANN_input_data_features[index_X_Validation_End:]
Y_train = ANN_input_data_labels[0:index_X_Train_End]
Y_valid = ANN_input_data_labels[index_X_Train_End:index_X_Validation_End]
Y_test = ANN_input_data_labels[index_X_Validation_End:]

# Standardize input features X and output labels Y.
# The scalers are fitted on the training split only, so no statistics of the
# validation or test rows leak into training; the held-out splits are merely
# transformed with the training mean and standard deviation.
scaler_standardized_X = StandardScaler()
X_train = scaler_standardized_X.fit_transform(X_train)
X_valid = scaler_standardized_X.transform(X_valid)
X_test = scaler_standardized_X.transform(X_test)
scaler_standardized_Y = StandardScaler()
Y_train = scaler_standardized_Y.fit_transform(Y_train)
Y_valid = scaler_standardized_Y.transform(Y_valid)
Y_test = scaler_standardized_Y.transform(Y_test)

#Train the model
optimizer_adam = tf.keras.optimizers.Adam(learning_rate=0.001)
numberOfInputFeatures = len(ANN_input_data_features[0])
numberOfOutputNeurons = len(ANN_input_data_labels[0])
model = keras.Sequential([
    Flatten(input_shape=(numberOfInputFeatures,)),
    Dense(30, activation='relu'),
    Dense(50, activation='relu'),
    Dense(50, activation='relu'),
    Dense(30, activation='relu'),
    keras.layers.Dense(numberOfOutputNeurons),
])

#Important part 1: The file with the best configuration is saved
entireFolderNameForTheResultsOfTheRun = "C:/Users/User1/Desktop/Training/"
pathOfTheFileForBestModel = entireFolderNameForTheResultsOfTheRun + "bestModelSingleTimeSlotTest.keras"
callbacks = [keras.callbacks.ModelCheckpoint(pathOfTheFileForBestModel, save_best_only=True)]
model.compile(loss="mean_squared_error", optimizer=optimizer_adam, metrics=['mean_absolute_percentage_error'])
history = model.fit(X_train, Y_train, epochs=2000, batch_size=10, validation_data=(X_valid, Y_valid), callbacks=callbacks)

# Predict the values from the test dataset
#Important part 2: The file with the best configuration is loaded
model = keras.models.load_model(pathOfTheFileForBestModel)
# Predictions are in the standardized label space;
# scaler_standardized_Y.inverse_transform maps them back to original units.
Y_pred = model.predict(X_test)
I indicated the important parts with comments (Important part 1 and Important part 2). My question is whether I can also use the configuration of the best epoch for predictions without saving a file, just by using the script. The motivation is that the script will run on a platform where I can't save any files.

Keras hyperparameter tuning

using this dataset https://www.kaggle.com/datasets/harlfoxem/housesalesprediction?datasetId=128&searchQuery=hyperparameter+tuning&language=Python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
#Random seed for reproducibility
np.random.seed(5)

####DATA PRE-PROCESSING ####
#read the data
data = pd.read_csv('kc_house_data.csv')
#check for missing/null values
print(data.isnull().any())

#create a 'house age' column that encompasses both date (which doesn't help in predicting price) and the year built
data['date'] = pd.to_datetime(data['date'])  #first convert date format
data["house_age"] = data["date"].dt.year - data['yr_built']
#a 'yr_renovated' of 0 means not renovated, so collapse it into a 0/1 'renovated' flag
data['renovated'] = data['yr_renovated'].apply(lambda yr: 0 if yr == 0 else 1)
#Now drop the columns we do not need
data = data.drop(['date', 'yr_renovated', 'yr_built'], axis=1)
print(data)

#Split into test and train
X = data
Y = X['price'].values
X = X.drop('price', axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)
#split the held-out 20 % in half to obtain a validation set and a test set
X_val, X_test, Y_val, Y_test = train_test_split(X_test, Y_test, test_size=0.5)

#Scale: fit on the training split only, then apply the same transform to the other splits
standard_scaler = StandardScaler()  #Unsure if min max scaler would be more appropriate here?
X_train = pd.DataFrame(standard_scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(standard_scaler.transform(X_test), columns=X_test.columns)
#the validation set must be built from X_val (not from the already scaled X_test)
X_val = pd.DataFrame(standard_scaler.transform(X_val), columns=X_val.columns)
#print(X_train.shape, X_val.shape, X_test.shape, Y_train.shape, Y_val.shape, Y_test.shape)
# Use scikit-learn to grid search the batch size and epochs
def create_model():
    """Build and compile the regression network used by the grid search.

    Returns:
        A compiled Sequential model with one hidden layer of 32 ReLU units
        over 19 input features and a single linear output unit, trained with
        mean squared error.
    """
    model = Sequential()
    model.add(Dense(32, input_dim=19, activation='relu'))
    # Linear output: a sigmoid would confine every prediction to (0, 1),
    # which cannot represent the unscaled regression target.
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])
    return model
# Wrap the Keras builder so scikit-learn can treat it as an estimator.
model = KerasRegressor(build_fn=create_model, verbose=0)

# Candidate values for the two hyper-parameters.
batch_size = [5, 10, 20, 40, 60, 80]
epochs = [10, 50, 100]

# Run one independent grid search per hyper-parameter (batch size first,
# then epochs) and summarize the best result of each.
for param_grid in ({'batch_size': batch_size}, {'epochs': epochs}):
    grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring='neg_mean_squared_error')
    grid_result = grid.fit(X_train, Y_train)
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
It's just not working for me. What I want to do is create a simple neural network and then fine-tune the epochs and batch size, but this is just giving me 'accuracy' as NaN. I'm so confused. Any help is appreciated.
Current output:
Best: -419841571707.859070 using {'batch_size': 5}
Best: -419841571132.739441 using {'epochs': 100}
However, if I add batch sizes of 1, 2, 3, etc., it picks those instead: it always says the best result is the smallest batch size. Is this right?

Gender Voice with Python coding

I am working on a sample data set from a link below.
https://www.kaggle.com/enirtium/gender-voice/data
I am trying to open .csv file(maybe I am opening it wrongly) and trying to create fully connected neural layers. Then, I am trying to train them but unfortunately, I am getting input shape not fitting problem.
"ValueError: Error when checking input: expected dense_1_input to have shape (None, 2800) but got array with shape (3168, 1)"
My code looks like this:
import csv
import numpy
import string
from keras.models import Sequential
from sklearn.model_selection import train_test_split
import numpy as np
from keras import models
from keras import layers
path = r'/Users/username/Desktop/voice.csv'
meanfreq = []
sd = []
median = []
label = []

# Read the CSV row by row. csv.reader yields strings, so the numeric columns
# are converted to float before they are stored.
with open(path, 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader)  # skip the header row
    for line in csv_reader:
        meanfreq.append(float(line[0]))
        sd.append(float(line[1]))
        median.append(float(line[2]))
        label.append(1 if line[20] == "female" else 0)

# Keras expects arrays shaped (n_samples, n_features); only meanfreq is used
# as a feature here, so every sample has exactly one feature.
features = np.array(meanfreq).reshape(-1, 1)
targets = np.array(label)

network = models.Sequential()
# input_shape is the number of features per sample, not the number of samples.
network.add(layers.Dense(512, activation='relu', input_shape=(1,)))
network.add(layers.Dense(1, activation='sigmoid'))
# A single sigmoid unit with 0/1 targets pairs with binary cross-entropy.
network.compile(optimizer='rmsprop',
                loss='binary_crossentropy',
                metrics=['accuracy'])
network.fit(features, targets, epochs=5, batch_size=128)
scores = network.evaluate(features, targets)
print("\n%s: %.2f%%" % (network.metrics_names[1], scores[1]*100))
I suppose that maybe I am not opening the .csv file correctly (it is read into plain Python lists), or there is some other problem. Unfortunately, I am new to neural networks and Python. I want to open this csv file and use 70% of its data for training and 30% for testing.
Yes,
It works like this:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
# get data ready
data = pd.read_csv('voice.csv')

# split out features and label: every column but the last is a feature
X = data.iloc[:, :-1].values
y = data.iloc[:, -1]

# map category to binary
y = np.where(y == 'male', 1, 0)
enc = OneHotEncoder()
# reshape y to be a column vector, then one-hot encode it into two columns
y_ = enc.fit_transform(y.reshape(-1, 1)).toarray()

X_train, X_test, y_train, y_test = train_test_split(
    X, y_, train_size=0.80, random_state=42)

network = models.Sequential()
network.add(layers.Dense(512, activation='relu', input_shape=(20,)))
# softmax makes the two outputs a probability distribution, which is what
# categorical_crossentropy with one-hot targets expects
network.add(layers.Dense(2, activation='softmax'))
network.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
network.fit(X_train, y_train, epochs=100, batch_size=128)
network.evaluate(X_test, y_test)
Reading in the data seems to be fine.
I imagine you have a data set that looks like:
mean_freq, label
.12 0
.45 1
And you want to train a classifier. Currently the model is expecting
a training example to have 2800 features. input shape=(2800,) but you only want 1 feature: the mean_freq
The mistake here is that you are trying to tell Keras how many training examples to use while declaring the model. You don't do that here; you do that later, when you fit the model.
So the input_shape to keras's Dense Layer should be (1, ) for the single feature. If you're going to use mean and median freq then you would want two features (2, ) and so on.
# Note the change from 2800 to 1: input_shape is the number of features per
# training example, not the number of training examples.
network.add(layers.Dense(512, activation='relu', input_shape=(1,)))
And you can split your training and test sets in multiple ways. My suggestion is to do something like this:
# The first train_size rows are used for training, the remainder for testing.
train_size = 2800
X_train = mean_freq[:train_size]
y_train = label[:train_size]
X_test = mean_freq[train_size:]
# The test labels must cover the same rows as X_test (everything after
# train_size), not the training rows.
y_test = label[train_size:]
Then fit the model with the training set and score with the test set.
# Train on the training split only ...
network.fit(X_train, y_train, epochs=5, batch_size=128)
# ... and score on the held-out test split.
scores = network.evaluate(X_test, y_test)
Edit to reflect comments:
Well, if your training data has 20 features, then
you tell keras that with:
# Note the change from 2800 to 20: one input per feature of a training example.
network.add(layers.Dense(512, activation='relu', input_shape=(20,)))
You have to do the work necessary to get the data into the shape you need for training and testing, but the template above is how you would fit and evaluate the model.
I would also note that there are better ways to read in csv data if you're going to do modeling (as you are). Look at using a pandas DataFrame.
Also better (more standard ways) of creating train and test split: look into sklearn's train_test_split
Edit 2: A quick model of the voice data
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
# Model lives in keras.models (plural); there is no keras.model module.
from keras.models import Model
from keras.layers import Dense, Input

# get data ready
data = pd.read_csv('voice.csv')

# split out features and label: every column but the last is a feature
X = data.iloc[:, :-1].values
y = data.iloc[:, -1]

# map category to binary
y = np.where(y == 'male', 1, 0)
enc = OneHotEncoder()
# reshape y to be a column vector, then one-hot encode it into two columns
y_ = enc.fit_transform(y.reshape(-1, 1)).toarray()

X_train, X_test, y_train, y_test = train_test_split(
    X, y_, train_size=0.80, random_state=42)

# model using keras functional style: 20 input features -> 128 units -> 2 outputs
inp = Input(shape=(20,))
dense = Dense(128)(inp)
out = Dense(2, activation='sigmoid')(dense)
model = Model(inputs=[inp], outputs=[out])
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100, batch_size=128)
model.evaluate(X_test, y_test)

Error pickling scikit-learn model

I am not able to pickle my model below.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
# Resolve the CSV relative to this script's own directory rather than the
# current working directory.
script_dir = os.path.dirname(__file__)
abs_file_path = os.path.join(script_dir, 'Churn_Modelling.csv')
# Importing the dataset
dataset = pd.read_csv(abs_file_path)
# Columns 3..12 are used as features, column 13 as the target.
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Feature columns 1 and 2 are label-encoded in place, each with its own encoder.
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
# One-hot encode feature column 1.
# NOTE(review): the categorical_features argument is not accepted by newer
# scikit-learn releases — confirm the installed version.
onehotencoder = OneHotEncoder(categorical_features=[1])
X = onehotencoder.fit_transform(X).toarray()
# Drop the first column of the encoded matrix.
# NOTE(review): presumably removes one dummy column to avoid redundancy — confirm.
X = X[:, 1:]
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Feature Scaling: the scaler is fitted on the training set and only applied
# to the test set.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Part 2 - Now let's make the ANN!
# Importing the Keras libraries and packages
# NOTE(review): tensorflow.contrib does not exist in TensorFlow 2.x — confirm
# the TensorFlow version this script targets.
from tensorflow.contrib.keras.api.keras.models import Sequential
from tensorflow.contrib.keras.api.keras.layers import Dense
from tensorflow.contrib.keras import backend
# Initialising the ANN
classifier = Sequential()
# Adding the input layer and the first hidden layer (11 input features, 6 units)
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=11))
# Adding the second hidden layer
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
# Adding the output layer: a single sigmoid unit
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
# Compiling the ANN
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Fitting the ANN to the Training set; validation_split=0.1 is passed so part
# of the training data is used for validation.
classifier.fit(X_train, y_train, batch_size=10, epochs=100, validation_split=0.1)
# Part 3 - Making predictions and evaluating the model
# Predicting the Test set results; the sigmoid outputs are thresholded at 0.5
# into booleans.
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)
# Predicting a single new observation: the 11 raw feature values are scaled
# with the already fitted scaler before prediction.
new_prediction = classifier.predict(sc.transform(np.array([[0.0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]])))
new_prediction = (new_prediction > 0.5)
I have tried using
# First attempt: serialize the fitted Keras classifier with joblib.
# NOTE(review): sklearn.externals.joblib is not available in newer scikit-learn
# releases (joblib is a standalone package) — confirm the installed version.
# NOTE(review): `classifier` is a Keras Sequential model, not a scikit-learn
# estimator; Keras' own model.save() is presumably the intended way to persist it.
from sklearn.externals import joblib
joblib.dump(classifier, 'model.pkl')
and
import pickle

# Second attempt: write the classifier to disk with pickle protocol 2.
with open('classifier.pkl', 'wb') as pickle_file:
    pickle.dump(classifier, pickle_file, 2)
For both, I am getting: PicklingError: Can't pickle <class 'module'>: attribute lookup module on builtins failed
What am I doing wrong? Your insights are much appreciated.

Categories