Good accuracy on validation and test but bad predictions keras lstm - python

I'm having trouble with LSTM and Keras.
I try to predict normal/fake domain names.
My dataset is like this:
domain,fake
google, 0
bezqcuoqzcjloc,1
...
with 50% normal and 50% fake domains
Here's my LSTM model:
def build_model(max_features, maxlen):
"""Build LSTM model"""
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['acc'])
return model
then I preprocess my text data to transform it into numbers:
"""Run train/test on logistic regression model"""
indata = data.get_data()
# Extract data and labels
X = [x[1] for x in indata]
labels = [x[0] for x in indata]
# Generate a dictionary of valid characters
valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(X)))}
max_features = len(valid_chars) + 1
maxlen = 100
# Convert characters to int and pad
X = [[valid_chars[y] for y in x] for x in X]
X = sequence.pad_sequences(X, maxlen=maxlen)
# Convert labels to 0-1
y = [0 if x == 'benign' else 1 for x in labels]
Then I split my data into training, testing and validation sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print("Build model...")
model = build_model(max_features, maxlen)
print("Train...")
X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.2)
And then I train my model on training data and validation data, and evaluate on test data:
history = model.fit(X_train, y_train, epochs=max_epoch, validation_data=(X_holdout, y_holdout), shuffle=False)
scores = model.evaluate(X_test, y_test, batch_size=batch_size)
At the end of my training/testing I have these results:
And these scores when evaluating on test dataset:
loss = 0.060554939906234596
accuracy = 0.978109902033532
However when I predict on a sample of the dataset like this:
LSTM_model = load_model('LSTMmodel_64_sgd.h5')
data = pickle.load(open('traindata.pkl', 'rb'))
#### LSTM ####
"""Run train/test on logistic regression model"""
# Extract data and labels
X = [x[1] for x in data]
labels = [x[0] for x in data]
X1, _, labels1, _ = train_test_split(X, labels, test_size=0.9)
# Generate a dictionary of valid characters
valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(X1)))}
max_features = len(valid_chars) + 1
maxlen = 100
# Convert characters to int and pad
X1 = [[valid_chars[y] for y in x] for x in X1]
X1 = sequence.pad_sequences(X1, maxlen=maxlen)
# Convert labels to 0-1
y = [0 if x == 'benign' else 1 for x in labels1]
y_pred = LSTM_model.predict(X1)
I have very poor performance:
accuracy = 0.5934741842730341
confusion matrix = [[25201 14929]
[17589 22271]]
F1-score = 0.5780171295094731
Can someone explain to me why?
I have tried 64 instead of 128 for the LSTM node, adam and rmsprop for optimizers, increasing batch_size however performance remains very low.

Ok so I have found the answer.
This is this line
valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(X1)))}
In Python 3 set seems to produce different results everytime a new python3 console is open.
So running the code in Python 2 has resolved my issues !

Related

train test split is not splitting correctly

I am still a beginner in AI and deep learning but I wanted to test whether a neural network will be able to calculate the sum of two numbers so I generated a dataset of 5000 numbers and made test size = 0.3 so the training dataset will be equal to 3500 but what was weird that I found the model is training only on 110 input instead of 3500.
The code used:
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np
from random import random
def generate_dataset(num_samples, test_size=0.33):
"""Generates train/test data for sum operation
:param num_samples (int): Num of total samples in dataset
:param test_size (int): Ratio of num_samples used as test set
:return x_train (ndarray): 2d array with input data for training
:return x_test (ndarray): 2d array with input data for testing
:return y_train (ndarray): 2d array with target data for training
:return y_test (ndarray): 2d array with target data for testing
"""
# build inputs/targets for sum operation: y[0][0] = x[0][0] + x[0][1]
x = np.array([[random()/2 for _ in range(2)] for _ in range(num_samples)])
y = np.array([[i[0] + i[1]] for i in x])
# split dataset into test and training sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size)
return x_train, x_test, y_train, y_test
if __name__ == "__main__":
# create a dataset with 2000 samples
x_train, x_test, y_train, y_test = generate_dataset(5000, 0.3)
# build model with 3 layers: 2 -> 5 -> 1
model = tf.keras.models.Sequential([
tf.keras.layers.Dense(5, input_dim=2, activation="sigmoid"),
tf.keras.layers.Dense(1, activation="sigmoid")
])
# choose optimiser
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
# compile model
model.compile(optimizer=optimizer, loss='mse')
# train model
model.fit(x_train, y_train, epochs=100)
# evaluate model on test set
print("\nEvaluation on the test set:")
model.evaluate(x_test, y_test, verbose=2)
# get predictions
data = np.array([[0.1, 0.2], [0.2, 0.2]])
predictions = model.predict(data)
# print predictions
print("\nPredictions:")
for d, p in zip(data, predictions):
print("{} + {} = {}".format(d[0], d[1], p[0]))
The 110/110 you are seeing in your image is actually the batch count, not the sample count. So 110 batches * the default batch size of 32 gives you ~3500 training samples, which matches what you'd expect as 70% of 5000.
You can see by backing into it the other way that the last batch would be a partial batch, since it's not evenly divisible by 32:
>>> (.7 * 5000) / 110
31.818181818181817
In neural networks, an epoch is a full pass over the data. It trains in small batches (also called steps), and this is the way Keras logs them.

LSTM/GRU TImeSeries multioutput strategy forecasts give dropped values

Currently, I'm playing with Stocks Predictions task which I try to solve using LSTM/GRU.
Problem: After training LSTM/GRU I get huge drop predicted values
Model training process
Train, test data is simply generated using pd.shift in series_to_supervised function.
df['Mid'] = df['Low'] + df['High'] / 2
n_lag = 1 # Lag columns back
n_seq = 1*50 # TimeSteps to predict
seq_col = 'Mid'
seq_col_t = f'{seq_col}(t)'
split_date = '2018-01-01'
def series_to_supervised(data: pd.DataFrame,
seq_col: str,
n_in: int = 1,
n_out: int = 1,
drop_seq_col: bool = True,
dropna: bool = True):
"""Convert time series into supervised learning problem
{input sequence, forecast sequence}
"""
# input sequence (t-n, ... t-1) -> pisitive shift
for i in range(n_in, 0, -1):
data[f'{seq_col}(t-{i})'] = data[seq_col].shift(i)
# no sequence -> no shift
data[f'{seq_col}(t)'] = data[seq_col]
for i in range(1, n_out+1):
# forecast sequence (t, t+1, ... t+n) -> negative shift
data[f'{seq_col}(t+{i})'] = data[seq_col].shift(-i)
if drop_seq_col:
data = data.drop(seq_col, axis=1)
if dropna:
data.dropna(inplace=True)
return data
df = series_to_supervised(df, seq_col=seq_col, n_in=n_lag, n_out=n_seq)
mask = df.index < split_date
train, test = df[mask], df[~mask]
X_cols = ['Mid(t-1)']
y_cols = train.filter(like='Mid(t+').columns
X_train, y_train, X_test, y_test = train[X_cols], train[y_cols], test[X_cols], test[y_cols]
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1, 1))
# also returns np.ndarray
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
y_train = y_train.values
y_test = y_test.values
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, GRU
from keras.optimizers import Adam, RMSprop, Adamax
from keras.callbacks import ModelCheckpoint
def get_model(X, y, n_batch):
num_classes=y.shape[1]
# design network
model = Sequential()
# For Stock Predictions has to be used LSTM stateful=True
model.add(GRU(10, batch_input_shape=(n_batch, X.shape[1], X.shape[2]), stateful=True))
model.add(Dropout(0.3))
model.add(Dense(num_classes))
opt = Adam(learning_rate=0.01)
# opt = RMSprop(learning_rate=0.001)
model.compile(loss='mean_squared_error', optimizer=opt)
return model
def reshape_batch(X_train, y_train, X_test, y_test, n_batch):
# reshape training into [samples, timesteps, features]
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
# cut to equally divided n_batches (without reminder).
# needed for LSTM stateful=True
train_cut = X_train.shape[0] % n_batch
test_cut = X_test.shape[0] % n_batch
if train_cut > 0:
X_train = X_train[:-train_cut]
y_train = y_train[:-train_cut]
if test_cut > 0:
X_test = X_test[:-test_cut]
y_test = y_test[:-test_cut]
return X_train, y_train, X_test, y_test
# fit an LSTM network to training data
def fit_lstm(X_train: np.ndarray,
y_train: np.ndarray,
n_lag: int,
n_seq: int,
n_batch: int,
nb_epoch: int,
X_test: np.ndarray=None,
y_test: np.ndarray=None):
model = get_model(X_train, y_train, n_batch)
# fit network
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), callbacks=None,
epochs=nb_epoch, batch_size=n_batch, verbose=1, shuffle=False)
print('Predict:', model.predict(X_test, batch_size=n_batch))
model.reset_states()
return model, history
n_batch = 32
nb_epoch = 40
X_train, y_train, X_test, y_test = reshape_batch(X_train, y_train, X_test, y_test, n_batch)
model, history = fit_lstm(X_train, y_train, n_lag, n_seq, n_batch, nb_epoch, X_test=X_test, y_test=y_test)
What I Have tried
Different optimizers (kinda all available in keras)
DIfferent recurrent network structures (GRU/LSTM)
Different learning rates
Different epochs from 1 to 1500
Adding/Removing Drop layers with different params (0.1-0.7)
Different LSTM/GRU amount of neurons (1-100)
Number of LSTM/GRU layers, via return_sequences params with more Drop layers.
Different number of forecasts(t+1,t+2 ... t+n) features from 1-365
Different number of lag (t-1, t-2, t-n ...) features from 1-5
Different scale normalization borders (0,1) and (-1,1)
Different n_batch values: 1,8,16,32
What can affect LSTM/GRU give so strange behaviour? And What else should I try to make it work the normal way?

Tensorflow Neural Network with more than 2 categories

So i watched a tensorflow tutorial on udemy and decided to try for myself, he said if you want more than 2 categories change the activation to "softmax" and units to 4 since i have 4 different categories it could be in (changed from 0:1, to 1:4), everything works if there is only 2 different values in "y" but as soon as i change it to 4 units and 4 categories i get error:
ValueError: Error when checking target: expected dense_3 to have shape (4,) but got array with shape (1,)
even changing it back to shape "1" just results in true or false category
my dataset in y:
import numpy as np
dataset = np.load('/Users/alex/desktop/ANN_dataset_for_brightness.npy')
X = dataset[:, 0:17]
y = dataset[:, 17:19]
for i in range (27):
if y[i] == 400:
y[i] = 4
elif y[i] == 300:
y[i] = 3
elif y[i] == 200:
y[i] = 2
elif y[i] == 100:
y[i] = 1
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Importing the Keras libraries and packages
from keras.models import Sequential
from keras.layers import Dense
# Initialising the ANN
classifier = Sequential()
# Adding the input layer and the first hidden layer // input dim for input layer
classifier.add(Dense(activation="relu", input_dim=17, units=6, kernel_initializer="uniform"))
# Adding the second hidden layer
classifier.add(Dense(activation="relu", units=6, kernel_initializer="uniform"))
Problem here
# units = categories, softmax = more than 2
classifier.add(Dense(activation="softmax", units=4, kernel_initializer="uniform"))
# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size = 27, nb_epoch = 100)
# Part 3 - Making the predictions and evaluating the model
# Predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
With multiclass task, your y_train and y_test have to be one-hot-encoded. That means they must have dimension (number_of_samples, 4), where 4 denotes the number of classes.
You need to apply tensorflow.keras.utils.to_categorical to them before train-test splitting.
y = to_categorical(y, 4)
ref: https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_categorical

Denormalization of output from neural network

I have used the MinMax normalization in order to normalize my dataset, both features and label. My question is, it's correct to normalize also the label? If yes, how can I denormalize the output of the neural network (the one that I predict with the test set that is normalized)?
I can't upload the dataset, but it is composed by 18 features and 1 label. It is a regression task, the features and the label are physical quantities.
So the problem is that the y_train_pred and y_test_pred are between 0 and 1. How can I predict the "real value"?
The code:
dataset = pd.read_csv('DataSet.csv', decimal=',', delimiter = ";")
label = dataset.iloc[:,-1]
features = dataset.drop(columns = ['Label'])
features = features[best_features]
X_train1, X_test1, y_train1, y_test1 = train_test_split(features, label, test_size = 0.25, random_state = 1, shuffle = True)
y_test2 = y_test1.to_frame()
y_train2 = y_train1.to_frame()
scaler1 = preprocessing.MinMaxScaler()
scaler2 = preprocessing.MinMaxScaler()
X_train = scaler1.fit_transform(X_train1)
X_test = scaler2.fit_transform(X_test1)
scaler3 = preprocessing.MinMaxScaler()
scaler4 = preprocessing.MinMaxScaler()
y_train = scaler3.fit_transform(y_train2)
y_test = scaler4.fit_transform(y_test2)
optimizer = tf.keras.optimizers.Adamax(lr=0.001)
model = Sequential()
model.add(Dense(80, input_shape = (X_train.shape[1],), activation = 'relu',kernel_initializer='random_normal'))
model.add(Dropout(0.15))
model.add(Dense(120, activation = 'relu',kernel_initializer='random_normal'))
model.add(Dropout(0.15))
model.add(Dense(80, activation = 'relu',kernel_initializer='random_normal'))
model.add(Dense(1,activation = 'linear'))
model.compile(loss = 'mse', optimizer = optimizer, metrics = ['mse'])
history = model.fit(X_train, y_train, epochs = 300,
validation_split = 0.1, shuffle=False, batch_size=120
)
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
You should denormalize so you can get real world predictions to your neural network, rather than a number between 0-1
The min - max normalization is defined by:
z = (x - min)/(max - min)
With z being the normalized value, x being the label value, max being the max x value, and min being the min x value. So if we have z, min, and max we can resolve for x as follows:
x = z(max - min) + min
Thus before you normalize your data, define variables for the max and min value for the label if it is continuous. Then after you get your pred values, you can use the following function:
y_max_pre_normalize = max(label)
y_min_pre_normalize = min(label)
def denormalize(y):
final_value = y(y_max_pre_normalize - y_min_pre_normalize) + y_min_pre_normalize
return final_value
And apply this function to your y_test/y_pred to get the corresponding value.
You can use this link here to better visualize this.

Neural network does not predict properly with Count Vectorizer

I'm trying to do a Sentiment Analysis prediction using the text and the scores of random IMDB reviews. I turned all the words into a Bag Of Words and put it all in a neural network. The prediction however does not seem to be correct and it always shows a 50% positive and a 50% negative prediction for anything that I type as a review.
reviews = pd.read_csv('reviews.txt', header=None)
labels = pd.read_csv('labels.txt', header=None)
Y = (labels=='positive').astype(np.int_)
print(type(reviews))
print(reviews.head())
print(labels.head())
#Split into train/test
x_train, x_test, y_train, y_test = train_test_split(reviews,Y)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train)
#min_df = 19 seems to be the first number that fills all 10 000 entries - thus the 10 most commonly used words
vect = CountVectorizer(min_df=19, max_features=10000)
fitter = vect.fit(x_train[0])
X_train = fitter.transform(x_train[0])
X_test = fitter.transform(x_test[0])
X_val = fitter.transform(x_val[0])
print("Vocabulary size: {}".format(len(vect.vocabulary_)))
feature_names = vect.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
print("Vocabulary content:\n {}".format(fitter.vocabulary_))
X_train = pad_sequences(X_train.toarray(), maxlen=100, value=0.)
X_test = pad_sequences(X_test.toarray(), maxlen=100, value=0.)
X_val = pad_sequences(X_val.toarray(), maxlen=100, value=0.)
Y_train = to_categorical(y_train, 2)
Y_test = to_categorical(y_test, 2)
Y_val = to_categorical(y_val, 2)
tensorflow.reset_default_graph()
input_layer = tflearn.input_data(shape=[None, 100])
net = tflearn.embedding(input_layer, input_dim=10000, output_dim=128)
hid = tflearn.fully_connected(input_layer, 10, activation='tanh') # a hidden layer with 10 neurons
output_layer = tflearn.fully_connected(hid, 2, activation='softmax')
sgd = tflearn.SGD(learning_rate=0.04, lr_decay=0.96, decay_step=1000)
net = tflearn.regression(output_layer, optimizer=sgd, loss='categorical_crossentropy')
model = tflearn.DNN(net, tensorboard_verbose=3, tensorboard_dir='tfdir')
try:
model.fit(X_train, Y_train, n_epoch=5, validation_set=(X_val, Y_val), batch_size=100, show_metric=True, run_id="Imdb")
except KeyboardInterrupt as e:
print("Stopped by user")
The training, validation and test accuracy is always ~0.65 at maximum no matter how much I tune the hyperparameters.
my_review = "This movie sucks"
my_review_enc = fitter.transform([my_review])
my_review_enc_pad = pad_sequences(my_review_enc.toarray(), maxlen=100, value=0.)
prediction = model.predict(my_review_enc_pad)
prediction
As you can see, the positive and negative prediction is always at 50%
What am I doing wrong?

Categories