NLP Sentiment Analysis net is not learning - python

I want to train a neural net for sentiment analysis. I have followed the tutorials on the keras webpage but I had to adapt the code to my usecase in order to be able to use the net afterwards.
For this purpose I decode the texts of the Keras IMDB dataset back from numbers to text, and then I stem the text because I need to work with stemmed text. After that, since I want to control how the embeddings are built rather than using texts_to_sequences and pad_sequences, I train a Doc2Vec model and apply it to the training set, so that I can later obtain embeddings for the text I want to classify.
The problem is that the net does not learn anything: the accuracy does not improve and the loss does not decrease. I have tried many things, such as changing the architecture of the net, all the hyperparameters, and changing the last layer from 2 units to 1 and the loss from sparse_categorical_crossentropy to binary_crossentropy. Let's see if anybody can help and shed some light on my problem. I include the code here; thanks in advance.
# Load the IMDB reviews as integer sequences, map them back to words, stem
# every word, and split the result into train/test parts.
# (Indentation of this listing was lost; the line after `for ...:` is the loop body.)
from keras.datasets import imdb
max_features = 40000
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=max_features)
import numpy as np
# Merge the predefined train/test halves so the split can be redone below.
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)
index = imdb.get_word_index()
# Invert word -> id into id -> word.
reverse_index = dict([(value, key) for (key, value) in index.items()])
# Decodes only the first review; `decoded` is not used again in this listing.
# NOTE(review): the `- 3` presumably undoes the index offset load_data applies for
# its reserved ids - confirm against the Keras imdb documentation.
decoded = " ".join([reverse_index.get(i - 3, "") for i in data[0]])
import nltk
from nltk .stem import LancasterStemmer
# `toke_corpus` is created but never filled or read in this listing.
toke_corpus = list()
lan = LancasterStemmer()
from tqdm import tqdm
lista_reviews = list()
# For every review: decode each id to a word (unknown ids become ''), stem it,
# drop the first token with [1:], and store the result as a numpy array of strings.
for review in tqdm(data):
lista_reviews.append(np.array([lan.stem(reverse_index.get(i - 3, '')) for i in review][1:]))
# The first 10000 reviews become the test set, the rest the training set.
train_x, test_x = lista_reviews[10000:], lista_reviews[:10000]
train_y, test_y = targets[10000:], targets[:10000]
from gensim.models.callbacks import CallbackAny2Vec
class EpochLogger(CallbackAny2Vec):
    """Training callback that prints a line when each epoch starts and ends."""

    def __init__(self):
        # Zero-based number of the epoch currently running (or about to run).
        self.epoch = 0

    def _announce(self, phase):
        # Single place that formats the progress line for both hooks.
        print("Epoch #{} {}".format(self.epoch, phase))

    def on_epoch_begin(self, model):
        self._announce("start")

    def on_epoch_end(self, model):
        self._announce("end")
        # Advance only after the end message so both lines carry the same number.
        self.epoch += 1
# Train a Doc2Vec model on the stemmed reviews and turn every review into a
# fixed-size vector.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Tag each review with its position in lista_reviews.
# Note: lista_reviews holds ALL reviews, so the test reviews are part of the
# Doc2Vec training corpus as well.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(lista_reviews)]
print("DOcuments already built")
epoch_logger = EpochLogger()
# vector_size=512 fixes the length of every inferred document vector.
model = Doc2Vec(documents, vector_size=512, window=5, min_count=3, workers=8, epochs = 7, callbacks=[epoch_logger])
encoded_x_train, encoded_x_test = list(), list()
from tqdm import tqdm
# One inferred vector per review (loop bodies follow their `for` line;
# indentation was lost in this listing).
for i in tqdm(train_x):
encoded_x_train.append(model.infer_vector(i))
for k in tqdm(test_x):
encoded_x_test.append(model.infer_vector(k))
# Build, compile and fit the classifier on the Doc2Vec vectors.
import keras
# Halve the learning rate after 2 epochs without val_loss improvement; stop after 4.
reduce_lr = keras.callbacks.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.50, patience=2, verbose=1, mode='auto', cooldown=0, min_lr=0.00001)
early = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=4, verbose=1, mode='auto')
from keras import models
from keras.models import Sequential
from keras import layers
from keras.layers import Embedding, Bidirectional, Dense, LSTM, Conv1D, MaxPooling1D, Flatten
model1 = Sequential()
# NOTE(review): an Embedding layer looks up rows by integer index, but fit() below
# is given the Doc2Vec output (np.array(encoded_x_train)), i.e. vectors from
# infer_vector rather than token ids. The layer is also frozen (trainable=False)
# and no pretrained weights are passed, so its table is never learned.
# This is the most likely reason the net does not learn - confirm.
model1.add(Embedding(input_dim = max_features, input_length=512, output_dim=128, trainable=False))
model1.add(Conv1D(filters=64,
kernel_size=5,
padding='valid',
activation='linear',
strides=1))
model1.add(MaxPooling1D(pool_size=4))
model1.add(Dense(64, activation='linear'))
model1.add(LSTM(32, activation='tanh'))
# model1.add(Dense(32, activation='relu'))
# model1.add(Flatten())
# model1.add(Dense(1, activation='sigmoid'))
# Two-way softmax output, paired with sparse_categorical_crossentropy on the labels in train_y.
model1.add(Dense(2, activation='softmax'))
model1.summary()
from keras import optimizers
# sgd = optimizers.SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
adam = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, amsgrad=False)
model1.compile(loss='sparse_categorical_crossentropy',
optimizer=adam,
metrics=['accuracy'])
# The held-out first 10000 reviews (encoded_x_test / test_y) serve as validation data.
history = model1.fit( np.array(encoded_x_train), np.array(train_y),
epochs= 20,
batch_size = 500,
validation_data = (np.array(encoded_x_test), np.array(test_y)), callbacks = [reduce_lr, early]
)

You use Doc2Vec to create one embedding per sample. For this reason, I don't think the Embedding, Conv1D and MaxPooling1D layers are useful in your network (an Embedding layer expects integer token indices, not an already-computed float vector). They are useful with word2vec, where you can extract the embedding of each token and use those inside a network.
Try feeding your embeddings directly to the network, like this:
# Suggested replacement: a plain feed-forward classifier that takes the
# 512-dimensional Doc2Vec vector of each review directly as input.
model1 = Sequential()
# input_shape=(512,) matches vector_size=512 used when training Doc2Vec.
model1.add(Dense(128, activation='relu', input_shape=(512,)))
# ....
model1.add(Dense(2, activation='softmax'))
adam = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, amsgrad=False)
model1.compile(loss='sparse_categorical_crossentropy',
optimizer=adam,
metrics=['accuracy'])
# Same data, callbacks and training settings as in the question.
history = model1.fit( np.array(encoded_x_train), np.array(train_y),
epochs= 20,
batch_size = 500,
validation_data = (np.array(encoded_x_test), np.array(test_y)), callbacks = [reduce_lr, early]
)

Related

Why I can't reproduce Keras results?

I know that there is a problem with reproducibility in Keras. Based on my research, I created this function:
def set_seed():
    """Seed Python, NumPy and TensorFlow and install a single-threaded TF session.

    Uses the fixed seed 42. Takes no arguments and returns nothing; its effect
    is entirely on global state (environment variable, RNG seeds, and the
    session registered with the Keras backend).
    """
    seed_value = 42
    # NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
    # it here presumably only affects child processes - confirm if hash order matters.
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    # One thread per op pool, so parallel scheduling cannot reorder float operations.
    session_conf = tf.compat.v1.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    # Fixed: the original referenced an undefined name `tf1`; the default graph is
    # reached through the `tf` module that the file actually imports.
    sess = tf.compat.v1.Session(
        graph=tf.compat.v1.get_default_graph(),
        config=session_conf,
    )
    np.random.seed(seed_value)
    random.seed(seed_value)
    tf.random.set_seed(seed_value)
    # Make Keras run on the session configured above.
    K.set_session(sess)
which should ensure reproducible results in Keras.
My problem
I'm trying to run self-created grid search on neural network using keras:
# Grid search over learning rate and hidden-layer width for a one-hidden-layer
# regression net; collects the final validation loss of every combination.
# (Indentation was lost in this listing; see the notes on where the loops end.)
# Early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience = 50)
# The `callbacks` name is not used below; each fit() call passes [es] directly.
callbacks=[es]
# Possible learning rates
# Ten evenly spaced values running from 0.1 down to 1e-5.
learning_rates = np.linspace(0.1, 10**(-5), 10)
# Run grid search on one layer
# NOTE(review): the seed is set once, before the loops. Every model after the
# first is therefore built and trained from an RNG state already advanced by the
# previous models, which differs from a run that calls set_seed() immediately
# before building a single model - likely why results cannot be reproduced; confirm.
set_seed()
# Parallel result arrays: one entry per (rate, neuron) combination.
mse1 = np.array([])
rate1 = np.array([])
neuron_number1 = np.array([])
# Only the first two learning rates and hidden sizes 1 and 2 are tried.
for rate in learning_rates[0:2]:
for neuron in range(1, 3):
model = Sequential()
model.add(Dense(neuron, input_dim=2, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, activation = 'relu'))
model.summary()
model.compile(loss='mse', optimizer=SGD(lr=rate), metrics=['mse'])
history = model.fit(X_train, y_train, epochs=1000, batch_size=50, validation_split=0.5, callbacks=[es])
# Record the validation loss of the LAST epoch run (not the best epoch).
mse1 = np.append(mse1, history.history['val_loss'][-1])
rate1 = np.append(rate1, rate)
neuron_number1= np.append(neuron_number1, neuron)
# (End of both loops.) Assemble one row per combination and print the row with the lowest mse.
neural_summary1 = pandas.DataFrame(data = [neuron_number1, rate1, mse1])
neural_summary1 = neural_summary1.transpose()
neural_summary1.columns = ["number_of_neurons", "learning_rate", "mse"]
print(neural_summary1.iloc[neural_summary1['mse'].idxmin()])
number_of_neurons 2.000000
learning_rate 0.088890
mse 0.159922
But when I run it apart from grid search:
# Standalone re-run of the configuration the grid search reported as best
# (2 hidden neurons, learning rate 0.088890).
# Here the seed is set immediately before the model is built.
set_seed()
model = Sequential()
model.add(Dense(2, input_dim=2, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, activation = 'relu'))
model.summary()
# 0.088890 is the rounded value printed by the grid search, typed in by hand.
model.compile(loss='mse', optimizer=SGD(lr=0.088890), metrics=['mse'])
# Reuses the same EarlyStopping instance `es` created for the grid search.
history = model.fit(X_train, y_train, epochs=1000, batch_size=50, validation_split=0.5, callbacks=[es])
print(history.history['val_loss'][-1])
8.767917346954345
which is different from the result obtained previously from the grid search.
Do you know where the mistake in my code is, or why it is not working?
Packages that I use are the following:
import pandas
import random
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD
import os
from keras import backend as K

Error with TfidfVectorizer but ok with CountVectorizer

I have been working on this the whole day but with no luck.
I managed to narrow the problem down to one line, the one that creates the TfidfVectorizer.
Here is my working code:
# Working variant: bag-of-words counts fed to a small binary classifier.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
# Learn the vocabulary from the training texts only, then encode both splits with it.
vectorizer.fit(xtrain)
X_train_count = vectorizer.transform(xtrain)
X_test_count = vectorizer.transform(xval)
# Bare expression: only displays the matrix in a notebook; no effect in a script.
X_train_count
from keras.models import Sequential
from keras import layers
input_dim = X_train_count.shape[1] # Number of features
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
# Single sigmoid unit paired with binary_crossentropy.
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
model.summary()
# The vectorizer output is passed to fit() as returned by transform().
history = model.fit(X_train_count, ytrain,
epochs=10,
verbose=False,
validation_data=(X_test_count, yval),
batch_size=10)
But when I change to
# Failing variant: identical to the CountVectorizer script except for the vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
#TF-IDF initializer
# max_df=0.8 and max_features=1000 are the only settings that differ from the working script.
vectorizer = TfidfVectorizer(max_df=0.8, max_features=1000)
vectorizer.fit(xtrain)
X_train_count = vectorizer.transform(xtrain)
X_test_count = vectorizer.transform(xval)
# Bare expression: only displays the matrix in a notebook; no effect in a script.
X_train_count
from keras.models import Sequential
from keras import layers
input_dim = X_train_count.shape[1] # Number of features
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
model.summary()
# NOTE(review): transform() output is handed to fit() unchanged; the quoted
# "out of order ... sparse" error is reported for this call. The answer's fix is to
# pass vectorizer.transform(...).toarray() for both the train and validation inputs.
history = model.fit(X_train_count, ytrain,
epochs=10,
verbose=False,
validation_data=(X_test_count, yval),
batch_size=10)
The only thing that changed is these 2 lines:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df=0.8, max_features=1000)
and then I get this error
InvalidArgumentError: indices[1] = [0,997] is out of order. Many sparse ops require sorted indices.
Use tf.sparse.reorder to create a correctly ordered copy.
[Op:SerializeManySparse]
How to fix that and why it is happening?
vectorizer.transform(...) produces a sparse matrix, which Keras does not handle well here. You simply have to convert it to a regular dense array, which can be done with:
vectorizer.transform(...).toarray()

How to encode text for an nlp in tensorflow

# Minimal binary text classifier: token-id embedding -> LSTM -> sigmoid.
# VOCAB_SIZE, BATCH_SIZE and getdata() are defined outside this listing.
model = tf.keras.Sequential([
tf.keras.layers.Embedding(VOCAB_SIZE, 32),
tf.keras.layers.LSTM(32),
tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=["acc"])
# NOTE(review): the Embedding layer needs integer token ids, so train_data/test_data
# must already be encoded (and padded) sequences rather than raw strings - this
# encoding step is what the question asks about.
train_data, train_labels, test_data, test_labels = getdata()
model.fit(train_data, train_labels, epochs=10, batch_size=BATCH_SIZE, shuffle=True)
results = model.evaluate(test_data, test_labels)
print(results)
This is my basic TensorFlow model and I have my datasets, but I'm completely unsure how to convert the strings in the data into something the network can take as input. I'm very new to TF, by the way.
You can use the gensim library, which is effective for embedding text without losing its meaning and word order. Example code is below. Feel free to change it according to your needs, and choose algorithms, optimizers, etc. as your use case requires.
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, GlobalMaxPool1D, Dropout
# first, we initialize a tokenizer which will convert text corpus to numbers (vectorization)
reviews_text = ["list of strings you want to analyze"]
sentiment = <array of labels/target>
top_words = 1500
tokenizer = Tokenizer(num_words=top_words)
# second, we update the vocabulary of the tokenizer by providing text
tokenizer.fit_on_texts(reviews_text) # fit on reviews before converting to matrix
# third, we transform each text into a sequence of integers
reviews_seq = tokenizer.texts_to_sequences(reviews_text)
len(tokenizer.word_index) # length of tokernizer, means unique words in the vocab
# in order for the math to work, we convert each review into same length. Padding...
max_len = 300
reviews_seq = pad_sequences(reviews_seq, maxlen=max_len)
# word2vec requires list of lists as input
documents = []
i = 0
for doc in <list of strings>:
i += 1
documents.append(doc.split(' '))
# gensim model training, it will translate a word into 128 numbers
embedding_size = 128
window_size = 128
w2v = Word2Vec(documents # input list of lists
, min_count=3 # any word must appear 3 times or more for training
# , workers=3 # CPU cores to be used for training the model
, size=embedding_size # no. of numbers required to represent a word
, window=window_size # how many neighbors to look at either side of the word for learning
, iter=20 # no. of iterations over the documents for training
)
w2v.wv.vectors.shape # shape of the model
# create embeddings to be added into keras model
embedding_matrix = zeros((top_words + 1, embedding_size))
# build matrix from pre-trained word to vec model
for word, i in sorted(tokenizer.word_index.items(), key=lambda x: x[1]):
if i > top_words:
break
if word in w2v.wv.vocab:
embedding_vector = w2v.wv[word]
embedding_matrix[i] = embedding_vector
model = Sequential()
model.add(Embedding(top_words + 1, embedding_size, input_length=max_len, weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
optimizer = Adadelta(learning_rate=0.001, rho=0.92)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.fit(reviews_seq, asarray(sentiment), validation_split=0.2, epochs=1000, batch_size=32)

How to make prediction on Keras Text classification?

I've trained a model with this reference: https://www.tensorflow.org/tutorials/keras/text_classification_with_hub
Here is my code:
# Train a binary sentiment classifier on imdb_reviews using a TF-Hub text
# embedding as the first layer, then save it to imdb_model.h5.
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
# 60% of the train split for training, the remaining 40% for validation,
# and the dataset's own test split for the final evaluation.
train_data, validation_data, test_data = tfds.load(
name="imdb_reviews",
split=('train[:60%]', 'train[60%:]', 'test'),
as_supervised=True)
# Pull one batch of 10 examples, used below to try the embedding layer.
train_examples_batch, train_labels_batch = next(iter(train_data.batch(10)))
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
# input_shape=[] with dtype=tf.string: the layer takes one raw string per example,
# so the model consumes text directly.
hub_layer = hub.KerasLayer(embedding, input_shape=[],
dtype=tf.string, trainable=True)
# Bare call on three examples; the result is discarded outside a notebook.
hub_layer(train_examples_batch[:3])
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
# No activation on the output: the model emits a logit, matching from_logits=True below.
model.add(tf.keras.layers.Dense(1))
model.summary()
model.compile(optimizer='adam',
loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
metrics=['accuracy'])
history = model.fit(train_data.shuffle(10000).batch(512),
epochs=20,
validation_data=validation_data.batch(512),
verbose=1)
results = model.evaluate(test_data.batch(512), verbose=2)
model.save("imdb_model.h5")
I've saved the model as imdb_model.h5. I want to make a prediction on a custom text. For example "The best movie, I have ever seen". How can I do it?
You can use
model.predict(["This is the best movie I have ever seen"])

Keras CNN-LSTM RuntimeError

I'm trying to use the following model, an InceptionV3 base CNN followed by an LSTM layer, for a regression problem. My input data is pictures with continuous target values. I'd like to feed the sequence of images to the CNN and then to an LSTM layer. However, I get a
RuntimeError: You must compile your model before using it
message. Any idea what the cause could be? I tried to find an answer on GitHub and on several other pages, but I didn't succeed.
from keras.applications.inception_v3 import InceptionV3
from keras.models import Sequential, Model
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization, Conv2D, MaxPooling2D, GlobalAveragePooling2D, LSTM, TimeDistributed, Input
from keras.optimizers import SGD, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger, ReduceLROnPlateau
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# custom R2-score metrics for keras backend
#https://www.kaggle.com/c/mercedes-benz-greener-manufacturing/discussion/34019
from keras import backend as K
def r2_keras(y_true, y_pred):
    """Return the R^2 score of `y_pred` against `y_true` using Keras backend ops.

    Computed as 1 - SS_res / (SS_tot + epsilon), where SS_res is the sum of
    squared residuals and SS_tot the sum of squared deviations of `y_true`
    from its mean over all elements passed in.
    """
    residuals = y_true - y_pred
    deviations = y_true - K.mean(y_true)
    ss_res = K.sum(K.square(residuals))
    ss_tot = K.sum(K.square(deviations))
    # K.epsilon() keeps the division finite when y_true has no variance.
    ratio = ss_res / (ss_tot + K.epsilon())
    return 1 - ratio
# Build image generators from two dataframes, stack InceptionV3 -> LSTM -> Dense
# for regression, and train with fit_generator.
# Paths are elided ('...') in this listing.
train_data_dir = '...'
test_data_dir = '...'
train_df = pd.read_csv('...')
valid_df = pd.read_csv('...')
filepath_loss = '...'
filepath_csv = '...'
# Rescale pixel values by 1/255.
datagen=ImageDataGenerator(rescale=1./255.,)
img_width, img_height = 380, 380
frames = 5
channels = 3
# NOTE(review): `pictures` declares a (frames, width, height, channels) input but is
# never referenced again in this listing, so it is not connected to the model.
pictures = Input(shape=(frames, img_width, img_height, channels))
# Training generator: image names from "block_heights", targets from "weighted_prices".
train_generator=datagen.flow_from_dataframe(
dataframe=train_df,
directory=train_data_dir,
x_col="block_heights",
y_col="weighted_prices",
has_ext=False, #x_col column doesnt has the file extensions
#subset="training", if validation split is set in ImageDataGenerator
batch_size=16,
seed=42,
shuffle=False,
class_mode="other", #for regression other should be used
target_size=(img_width, img_height))
# Validation generator: same settings, rows from valid_df, images read from the
# same directory (train_data_dir); test_data_dir is not used in this listing.
valid_generator=datagen.flow_from_dataframe(
dataframe=valid_df,
directory=train_data_dir,
x_col="block_heights",
y_col="weighted_prices",
has_ext=False, #x_col column doesnt has the file extensions
#subset="validation", if validation split is set in ImageDataGenerator
batch_size=16,
seed=42,
shuffle=False,
class_mode="other",
target_size=(img_width, img_height))
# InceptionV3 without its classification head and without pretrained weights.
conv_base = InceptionV3(weights=None, include_top=False, input_shape=(img_width,img_height,3))
conv_base.trainable = True
model = Sequential()
# NOTE(review): no layer added to this Sequential model is given an input shape
# (the TimeDistributed wrapper has none), so the model is presumably not built
# when fit_generator runs - a likely source of the quoted RuntimeError; confirm.
# NOTE(review): the generators are created with a 2-D target_size and no frame
# dimension, while TimeDistributed expects a sequence axis - verify the batch
# shape the generators actually yield.
model.add(TimeDistributed(conv_base))
model.add(TimeDistributed(Flatten()))
model.add(LSTM(10, return_sequences=True))
model.add(Dense(512, activation='relu'))
model.add(Dense(1, activation='linear'))
#error at callbacks if the learning rate is explicitly set somewhere
rms = RMSprop(lr=0.1, rho=0.9, epsilon=None, decay=0.0)
model.compile(loss='mse', optimizer=rms, metrics=['mae', r2_keras])
# Whole batches per epoch (integer division drops any final partial batch).
STEP_SIZE_TRAIN=train_generator.n//train_generator.batch_size
STEP_SIZE_VALID=valid_generator.n//valid_generator.batch_size
# Stop early, reduce the LR on plateau, checkpoint the best val_loss, and log to CSV.
callbacks = [EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='auto'),
ReduceLROnPlateau(monitor='val_loss', factor=0.02, patience=3, min_lr=0.001),
ModelCheckpoint(filepath_loss, monitor='val_loss', verbose=1, save_best_only=True, mode='min'),
CSVLogger(filepath_csv, separator = ",", append = False)]
history = model.fit_generator(generator=train_generator,steps_per_epoch=STEP_SIZE_TRAIN, validation_data=valid_generator, validation_steps=STEP_SIZE_VALID, epochs=50, callbacks=callbacks)

Categories