Why is my neural net only predicting one class (binary classification)? - python

I am having some trouble with my ANN. It is only predicting '0.' The dataset is imbalanced (10:1), ALTHOUGH, I undersampled the training dataset, so I am unsure of what is going on. I am getting 92-93% accuracy on the balanced training set, although on testing (on an unbalanced test set) it just predicts zeroes. Unsure of where to go from here. Anything helps. The data has been one hot encoded and scaled.
#create 80/20 train-test split
train, test = train_test_split(selection, test_size=0.2)
# Class count
count_class_0, count_class_1 = train.AUDITED_FLAG.value_counts()
# Divide by class
df_class_0 = train[train['AUDITED_FLAG'] == 0]
df_class_1 = train[train['AUDITED_FLAG'] == 1]
df_class_0_under = df_class_0.sample(count_class_1)
train_under = pd.concat([df_class_0_under, df_class_1], axis=0)
print('Random under-sampling:')
print(train_under.AUDITED_FLAG.value_counts())
train_under.AUDITED_FLAG.value_counts().plot(kind='bar', title='Count (target)');
Random under-sampling:
1.0 112384
0.0 112384
#split features and labels
y_train = np.array(train_under['AUDITED_FLAG'])
X_train = train_under.drop('AUDITED_FLAG', axis=1)
y_test = np.array(test['AUDITED_FLAG'])
X_test = test.drop('AUDITED_FLAG', axis=1)
y_train = y_train.astype(int)
y_test = y_test.astype(int)
# define model
model = Sequential()
model.add(Dense(6, input_dim=179, activation='relu'))
model.add(Dense(30, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
history = model.fit(X_train, y_train, epochs=5, batch_size=16, verbose=1)
#validate
test_loss, test_acc = model.evaluate(X_test, y_test)
# evaluate the model
_, train_acc = model.evaluate(X_train, y_train, verbose=0)
_, test_acc = model.evaluate(X_test, y_test, verbose=0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))
print('test_acc:', test_acc)
# plot history
pyplot.plot(history.history['acc'], label='train')
#pyplot.plot(history.history['val_acc'], label='test')
Train: 0.931, Test: 0.921
#preds
y_pred = model.predict(X_test)
y_pred_bool = np.argmax(y_pred, axis=1)
# #plot confusion matrix
y_actu = pd.Series(y_test, name='Actual')
y_pred_bool = pd.Series(y_pred_bool, name='Predicted')
print(pd.crosstab(y_actu, y_pred_bool))
'''
Predicted 0
Actual
0 300011
1 28030

This is not right:
y_pred_bool = np.argmax(y_pred, axis=1)
Argmax is only used with categorical cross-entropy loss and softmax outputs. For binary cross-entropy and sigmoid outputs, you should round the outputs, which is equivalent to thresholding predictions > 0.5:
y_pred_bool = np.round(y_pred)
This is what Keras does to compute binary accuracy.

Related

How can I improve a tensorflow model with CuDDLSTM

I am not a data scientist and not very professional in machine learning. I am trying to improve the results of this model for predicting the trend for a stock movement (-1:down, 0:no change, +1:up). Here is the code in python and plots for the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= False) #Shuffle set to False
#Normalizing data
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
#setting up the model of tensorflow
input_layer = Input(shape=(X_train.shape[1],1))
x=input_layer
for _ in range(2): # five layers
x = Dropout(0.5)(x) # Dropout to avoid overfitting
x = CuDNNLSTM(X_train.shape[1], return_sequences = True)(x) # using LSTM with return sequences to adopt to time sequences
x = GlobalAveragePooling1D()(x) #Global averaging to one layer shape to feed to a dense categorigal classification
output = Dense(y.shape[1], activation='softmax')(x)
model = Model(inputs=input_layer, outputs=output)
opt = Adam(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics = ['acc'])
#creating an early stop based on minmizing val_loss
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=200,restore_best_weights=True)
#fit the model
r = model.fit(X_train, y_train, epochs = 200000, batch_size=16400,
validation_data = (X_test, y_test), callbacks=[early_stop], shuffle=False)
#plot the results.
pd.DataFrame(r.history).plot()
model learning plot

Transfer learning with Inception Resnet v2 (breast cancer) low accuracy

I want to binary classify breast cancer histopathological images from the BreakHis dataset (https://www.kaggle.com/ambarish/breakhis) using transfer learning and the Inception Resnet v2. The goal is to freeze all layers and train the fully connected layer by adding two neurons to the model. In particular, initially I want to consider the images related to the magnificant factor 40X (Benign: 625, Malignant: 1370). Here is a summary of what I do:
I read the images and resize them to 150x150
I partition the dataset into training, validation and test set
I load the pre-trained network Inception Resnet v2
I freeze all the layers I add the two neurons for binary
classification (1 = "benign", 0 = "malignant")
I compile the model using as activation function the Adam method
I carry out the training
I make the prediction
I calculate the accuracy
This is the code:
data = dataset[dataset["Magnificant"]=="40X"]
def preprocessing(dataset, img_size):
# images
X = []
# labels
y = []
i = 0
for image in list(dataset["Path"]):
# Ridimensiono e leggo le immagini
X.append(cv2.resize(cv2.imread(image, cv2.IMREAD_COLOR),
(img_size, img_size), interpolation=cv2.INTER_CUBIC))
basename = os.path.basename(image)
# Get labels
if dataset.loc[i][2] == "benign":
y.append(1)
else:
y.append(0)
i = i+1
return X, y
X, y = preprocessing(data, 150)
X = np.array(X)
y = np.array(y)
# Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify = y_40, shuffle=True, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, stratify = y_train, shuffle=True, random_state=1)
conv_base = InceptionResNetV2(weights='imagenet', include_top=False, input_shape=[150, 150, 3])
# Freezing
for layer in conv_base.layers:
layer.trainable = False
model = models.Sequential()
model.add(conv_base)
model.add(layers.Flatten())
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))
opt = tf.keras.optimizers.Adam(learning_rate=0.0002)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
model.compile(loss=loss, optimizer=opt, metrics = ["accuracy", tf.metrics.AUC()])
batch_size = 32
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow(X_train, y_train, batch_size=batch_size)
val_generator = val_datagen.flow(X_val, y_val, batch_size=batch_size)
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
ntrain =len(X_train)
nval = len(X_val)
len(y_train)
epochs = 70
history = model.fit_generator(train_generator,
steps_per_epoch=ntrain // batch_size,
epochs=epochs,
validation_data=val_generator,
validation_steps=nval // batch_size, callbacks=[callback])
This is the output of the training at the last epoch:
Epoch 70/70
32/32 [==============================] - 3s 84ms/step - loss: 0.0499 - accuracy: 0.9903 - auc_5: 0.9996 - val_loss: 0.5661 - val_accuracy: 0.8250 - val_auc_5: 0.8521
I make the prediction:
test_datagen = ImageDataGenerator(rescale=1./255)
x = X_test
y_pred = model.predict(test_datagen.flow(x))
y_p = []
for i in range(len(y_pred)):
if y_pred[i] > 0.5:
y_p.append(1)
else:
y_p.append(0)
I calculate the accuracy:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_p)
print(accuracy)
This is the accuracy value I get: 0.5459098497495827
Why do I get such low accuracy, I have done several tests but I always get similar results? (HELP ME)
When doing transfer learning, especially with frozen weights, it is extremely important to do the same pre-processing as was used when the network was originally trained.
For the InceptionResNetV2 network the pre-processing type is "tf" in the tensorflow / keras libraries, which corresponds to dividing by 127 then subtracting 1 for the imagenet weights. You are instead dividing by 255.
Fortunately you do not have to wade through the code to find out what function was used, as they are exposed in the API. Simply do
train_datagen = ImageDataGenerator(preprocessing_function=tf.keras.applications.inception_resnet_v2.preprocess_input)
and so on for validation and test

Train and Test accuracy separately in Keras

This is my model:
def evaluate_model(X_train, y_train,X_test,y_test):
verbose=1
epochs=50
batch_size = 32
n_outputs = 1
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=6, activation='relu', input_shape=(25,1)))
model.add(Conv1D(filters=32, kernel_size=6, activation='relu'))
model.add(Dropout(0.3))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(n_outputs, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(X_train, y_train,epochs=50, batch_size=batch_size, verbose=1)
# evaluate model
_, accuracy = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
return accuracy
# summarize scores
def summarize_results(scores):
print(scores)
m, s = mean(scores), std(scores)
print('Accuracy: %.3f%% (+/-%.3f)' % (m, s))
# run an experiment
def run_experiment(repeats=5):
# repeat experiment
scores = list()
for r in range(repeats):
score = evaluate_model(X_train, y_train,X_test,y_test)
score = score * 100.0
print('>#%d: %.3f' % (r+1, score))
scores.append(score)
# summarize results
summarize_results(scores)
# run the experiment
run_experiment()
How do I get the train and test accuracy separately? Right now I am only getting test accuracy with model.evaluate.
What accuracy you get depends on the arguments you use with model.evaluate(); if you replace X_test and y_test with X_train and y_train, respectively, you will get the training accuracy. So, to get both, you should modify the last lines of your evaluate_model function as:
_, train_accuracy = model.evaluate(X_train, y_train, batch_size=batch_size, verbose=1)
_, test_accuracy = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
return train_accuracy, test_accuracy

Tenserflow model for text classification doesn't predict as expected?

I am trying to train a model for sentiment analysis and it shows an accuracy of 90% when splitting the data into training and testing! But whenever I am testing it on a new phrase is has pretty much the same result(usually it's in the range 0.86 - 0.95)!
Here is the code:
sentences = data['text'].values.astype('U')
y = data['label'].values
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.2, random_state=1000)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)
vocab_size = len(tokenizer.word_index) + 1
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
embedding_dim = 50
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
model.summary()
history = model.fit(X_train, y_train,
epochs=5,
verbose=True,
validation_data=(X_test, y_test),
batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
The training data is a CSV file with 3 coumns: (id, text, labels(0,1)), where 0 is positive and 1 is negative.
Training Accuracy: 0.9855
Testing Accuracy: 0.9013
Testing it on new sentences like 'This is just a text!' and 'hate preachers!' would predict the same result [0.85],[0.83].
It seems that you're victim of overfitting. In other words, our model would overfit to the training data. Although it’s often possible to achieve high accuracy on the training set, as in your case, what we really want is to develop models that generalize well to testing data (or data they haven’t seen before).
You can follow these steps in order to prevent overfitting.
Also, in order to increase the algorithm performance I suggest you to increase the number of neurons for Dense layer and set more epochsin order to increase the performance of the algorithm when comes to testing it to new data.

Testing accuracy higher than training accuracy

Why is the testing accuracy higher than my training accuracy? This is not the case for the validation accuracy. Is it because of the way I am splitting my dataset?
Modifying the network did not work so I am guessing I am doing something wrong in the dataset preparation part.
The dataset is composed of packet captures of malware and normal activities.. dataset.txt file contains total of 777 rows and 28 columns.
#converting dataset and labels to numpy arrays
x = np.genfromtxt("dataset.txt", delimiter=",")
y = np.genfromtxt("label.txt", delimiter=",")
#handling missing values
x[np.isnan(x)] = 0
#shuffling the data
indices = np.arange(x.shape[0])
np.random.shuffle(indices)
x = x[indices]
y = y[indices]
#dividing the dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
#building the model
def build_model():
model = models.Sequential()
model.add(layers.Dense(32, activation='relu', input_shape=(28,)))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy',
metrics=['accuracy'])
return model
'''cross validation
k = 5
num_val_samples = len(x_train) // k
all_scores = []
for i in range(k):
print('fold #', i)
x_val = x_train[i * num_val_samples: (i + 1) * num_val_samples]
y_val = y_train[i * num_val_samples: (i + 1) * num_val_samples]
partial_x_train = np.concatenate([x_train[:i * num_val_samples],
x_train[(i + 1) * num_val_samples:]], axis=0)
partial_y_train = np.concatenate([y_train[:i * num_val_samples],
y_train[(i + 1) * num_val_samples:]], axis=0)
model = build_model()
model.fit(partial_x_train, partial_y_train,epochs=20, batch_size=16,
verbose=0)
val_loss, val_acc = model.evaluate(x_val, y_val, verbose=0)
all_scores.append(val_acc)
print(all_scores)
val_acc = np.mean(all_scores)
print(val_loss , val_acc)
'''
#training the model with the entire training dataset
model = build_model()
model.fit(x_train, y_train, epochs=20, batch_size=16)
#confusion matrix
y_pred = model.predict(x_test)
y_pred = (y_pred > 0.5)
result = confusion_matrix(y_test, y_pred)
print ('Confusion Matrix:')
print(result)
#calculating the test accuracy
model_acc = accuracy_score(y_test, y_pred)
print('Test Accuracy:')
print(model_acc)
This is because keras reports running average accuracy for each epoch. For small number of epochs this means that by the end of an epoch your model is better than it was on average during this epoch.
This could also be due to randomly having 'easier' samples in the test set, but this would not happen each run if you split it randomly in the same portion of the code.

Categories