I have a CSV file with two columns:
category, description
There are 1,030 categories in the file but only about 12,600 rows.
I need to train a text classification model on this data. I am using Keras with an LSTM model.
I found an article describing how to do binary classification, and slightly modified it to handle several categories.
My code:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from numpy import array
from keras.preprocessing.text import one_hot
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing import sequence
import keras
df = pd.read_csv('/tmp/input_data.csv')
#one hot encode your documents
# integer encode the documents
vocab_size = 2000
encoded_docs = [one_hot(d, vocab_size) for d in df['description']]
def load_data_from_arrays(strings, labels, train_test_split=0.9):
data_size = len(strings)
test_size = int(data_size - round(data_size * train_test_split))
print("Test size: {}".format(test_size))
print("\nTraining set:")
x_train = strings[test_size:]
print("\t - x_train: {}".format(len(x_train)))
y_train = labels[test_size:]
print("\t - y_train: {}".format(len(y_train)))
print("\nTesting set:")
x_test = strings[:test_size]
print("\t - x_test: {}".format(len(x_test)))
y_test = labels[:test_size]
print("\t - y_test: {}".format(len(y_test)))
return x_train, y_train, x_test, y_test
encoder = LabelEncoder()
categories = encoder.fit_transform(df['category'])
num_classes = np.max(categories) + 1
print('Categories count: {}'.format(num_classes))
#Categories count: 1030
X_train, y_train, x_test, y_test = load_data_from_arrays(encoded_docs, categories, train_test_split=0.8)
# Truncate and pad the review sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_review_length)
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
# Build the model
embedding_vector_length = 32
top_words = 10000
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model.summary())
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_8 (Embedding) (None, 500, 32) 320000
_________________________________________________________________
lstm_8 (LSTM) (None, 100) 53200
_________________________________________________________________
dense_8 (Dense) (None, 1030) 104030
=================================================================
Total params: 477,230
Trainable params: 477,230
Non-trainable params: 0
_________________________________________________________________
None
#Train the model
model.fit(X_train, y_train, validation_data=(x_test, y_test), epochs=5, batch_size=64)
Train on 10118 samples, validate on 2530 samples
Epoch 1/5
10118/10118 [==============================] - 60s 6ms/step - loss: 6.5086 - acc: 0.0019 - val_loss: 10.0911 - val_acc: 0.0000e+00
Epoch 2/5
10118/10118 [==============================] - 63s 6ms/step - loss: 6.3281 - acc: 0.0028 - val_loss: 10.8270 - val_acc: 0.0000e+00
Epoch 3/5
10118/10118 [==============================] - 63s 6ms/step - loss: 6.3120 - acc: 0.0024 - val_loss: 11.0078 - val_acc: 0.0000e+00
Epoch 4/5
10118/10118 [==============================] - 64s 6ms/step - loss: 6.2891 - acc: 0.0030 - val_loss: 11.8264 - val_acc: 0.0000e+00
Epoch 5/5
10118/10118 [==============================] - 69s 7ms/step - loss: 6.2559 - acc: 0.0032 - val_loss: 12.1625 - val_acc: 0.0000e+00
#Evaluate the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))
Accuracy: 0.00%
What mistake did I make when preparing the data?
Why is the accuracy always 0?
I have curated end-to-end code with some inputs from my end, and I have tested it working on this data. You can use it with your data with no or minimal changes, as I have removed the specifics and made it generic. At the end, I have also highlighted the points I worked on beyond the code you provided above.
Code
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from nltk.tokenize import word_tokenize
def load_data_from_arrays(strings, labels, train_test_split=0.9):
data_size = len(strings)
test_size = int(data_size - round(data_size * train_test_split))
print("Test size: {}".format(test_size))
print("\nTraining set:")
x_train = strings[test_size:]
print("\t - x_train: {}".format(len(x_train)))
y_train = labels[test_size:]
print("\t - y_train: {}".format(len(y_train)))
print("\nTesting set:")
x_test = strings[:test_size]
print("\t - x_test: {}".format(len(x_test)))
y_test = labels[:test_size]
print("\t - y_test: {}".format(len(y_test)))
return x_train, y_train, x_test, y_test
# estimating the vocab length with the help of nltk
def get_vocab_length(strings):
vocab = []
for sent in strings:
words = word_tokenize(sent)
vocab.extend(words)
vocab = list(set(vocab))
vocab_length = len(vocab)
return vocab_length
def clean_text(sent):
# <your cleaning code here>
# clean func 1
# clean func 2
# ...
# clean func n
return sent
# load input data
df = pd.read_csv('/tmp/input_data.csv')
strings = df['description'].values
labels = df['category'].values
clean_strings = [clean_text(sent) for sent in strings]
vocab_length = get_vocab_length(clean_strings)
# integer-encode the cleaned strings (one_hot hashes each word to an index)
encoded_docs = [one_hot(sent, vocab_length) for sent in clean_strings]
# create onehot encodings of labels
ohe = OneHotEncoder()
categories = ohe.fit_transform(labels.reshape(-1,1)).toarray()
# split data
X_train, y_train, X_test, y_test = load_data_from_arrays(encoded_docs, categories, train_test_split=0.8)
# assuming max input to be not more than 512 words
max_input_len = 512
# padding data
X_train = pad_sequences(X_train, maxlen=max_input_len, padding= 'post')
X_test = pad_sequences(X_test, maxlen=max_input_len, padding= 'post')
# setting embedding vector length
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(vocab_length, embedding_vector_length, input_length=max_input_len, name= 'embedding') )
model.add(Flatten())
model.add(Dense(categories.shape[1], activation= 'softmax'))  # one output unit per category
model.compile('adam', loss= 'categorical_crossentropy', metrics= ['accuracy'])
model.summary()
# training the model
model.fit(X_train, y_train, epochs= 10, batch_size= 128, validation_split= 0.2, verbose= 1)
# evaluating the model
score = model.evaluate(X_test, y_test, verbose=0)
print("Test Loss:", score[0])
print("Test Acc:", score[1])
Additional areas I have worked on
1. Text Cleaning
Created a function to clean the text. This is extremely important, as it removes unnecessary noise from the data; note that this step depends entirely on the type of data you have. To simplify things, I have created a clean_text function in the code above where you can place your own cleaning code. It should take in raw text and return clean text. Some libraries you may want to look into are re, string, and emoji.
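For illustration, here is a minimal sketch of such a function using only re and string; every step is a placeholder you should adapt to your own data:
import re
import string
def clean_text(sent):
    # illustrative cleaning steps -- adapt to your data
    sent = sent.lower()                                                # normalize case
    sent = re.sub(r'http\S+', ' ', sent)                               # drop URLs
    sent = sent.translate(str.maketrans('', '', string.punctuation))   # strip punctuation
    sent = re.sub(r'\s+', ' ', sent).strip()                           # collapse whitespace
    return sent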
2. Estimating Vocab Size
If you have enough data, it is good to estimate the vocab size rather than passing an arbitrary number to the Keras one_hot function. I have created a basic get_vocab_length function using nltk's word_tokenize. You can use it as-is or enhance it further to suit your data.
What Else?
You can work further on hyperparameter tuning and a few different neural network designs.
Final Words
It still may not work, as everything depends on the quality and amount of data you have. There is a good chance you may not get results even after trying everything, if you have poor-quality data or very little data.
I would then suggest you try transfer learning with pre-trained models like BERT, RoBERTa, etc. HuggingFace provides good support for state-of-the-art pre-trained models; you can get started at the following links (a minimal usage sketch follows them):
https://huggingface.co/docs/transformers/index#supported-models
https://towardsdatascience.com/text-classification-with-hugging-face-transformers-in-tensorflow-2-without-tears-ee50e4f3e7ed
https://towardsdatascience.com/an-introduction-to-transformers-and-hugging-face-13052ec9d72d
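For a quick taste of what the transformers pipeline API looks like, here is a minimal sketch. The checkpoint name is just an illustrative public sentiment model; for your 1,000+ custom categories you would fine-tune a pretrained model on your own (description, category) pairs instead:
from transformers import pipeline
# illustrative checkpoint; swap in a model fine-tuned on your categories
classifier = pipeline("text-classification",
                      model="distilbert-base-uncased-finetuned-sst-2-english")
print(classifier("This product arrived quickly and works great."))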
I suspect that your vocab_size is way too low. If you are dealing with ordinary text, try 10,000 - 100,000 as a starting point.
What one_hot does is use the hashing trick: all of your words are hashed and projected into a 2,000-slot index space. It does not merely mean that your dictionary is 2,000 words long; it means every word is projected into this space, which effectively causes a lot of collisions, where different words get the same index and are treated as identical by the LSTM.
Furthermore, you should take a look at the transformed text, just to get an understanding of what happens here. To do so, build a reverse lookup and transform all the indices back.
As a further improvement, it is worthwhile to preprocess the text with common techniques like stemming and normalizing, to use a proper vocabulary, or to discard bag-of-words and use word embeddings.
from keras.preprocessing.text import one_hot, Tokenizer, hashing_trick
text1 = 'I love you'
text2 = 'you love I'
print('one_hot: ')
print(one_hot(text1, n=20))
print(one_hot(text2, n=20))
print('--------------------------------------')
print('Tokenizer: ')
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text1, text2])
print(tokenizer.word_index)
print(tokenizer.index_word)
print('--------------------------------------')
print('hashing_trick: ')
print(hashing_trick(text1, n=20))
print(hashing_trick(text2, n=20))
print('--------------------------------------')
out:
one_hot:
[14, 7, 14]
[14, 7, 14]
--------------------------------------
Tokenizer:
{'i': 1, 'love': 2, 'you': 3}
{1: 'i', 2: 'love', 3: 'you'}
--------------------------------------
hashing_trick:
[14, 7, 14]
[14, 7, 14]
--------------------------------------
Run this several times and you will find that the results of one_hot and hashing_trick are not unique.
You should use Tokenizer to convert text.
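A minimal sketch of a Tokenizer-based pipeline that could replace the one_hot calls above; df['description'] is taken from the question, and the size/length values are only starting points:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
# fit the tokenizer once on the training texts, then reuse it everywhere
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(df['description'])
sequences = tokenizer.texts_to_sequences(df['description'])
padded = sequence.pad_sequences(sequences, maxlen=500)
# index 0 is reserved for padding, so size the Embedding input_dim accordingly
vocab_size = min(20000, len(tokenizer.word_index)) + 1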
Related
As far as I know and have researched, the sequences in a dataset can be of different lengths; we do not need to pad or truncate them, provided that each batch in the training process contains sequences of the same length.
To realize and apply this, I decided to set the batch size to 1 and trained my RNN model on the IMDB movie classification dataset. I have added the code I wrote below.
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Embedding
max_features = 10000
batch_size = 1
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=32))
model.add(SimpleRNN(units=32, input_shape=(None, 32)))
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer="rmsprop",
loss="binary_crossentropy", metrics=["acc"])
history = model.fit(x_train, y_train,
batch_size=batch_size, epochs=10,
validation_split=0.2)
acc = history.history["acc"]
loss = history.history["loss"]
val_acc = history.history["val_acc"]
val_loss = history.history["val_loss"]
epochs = range(len(acc) + 1)
plt.plot(epochs, acc, "bo", label="Training Acc")
plt.plot(epochs, val_acc, "b", label="Validation Acc")
plt.title("Training and Validation Accuracy")
plt.legend()
plt.figure()
plt.plot(epochs, loss, "bo", label="Training Loss")
plt.plot(epochs, val_loss, "b", label="Validation Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.show()
The error I encountered is a failure to convert the input to tensor format, because of the list components in the input numpy array. However, when I change them, I continue to get similar kinds of errors.
The error message:
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
I could not handle the problem. Could anyone help me on this point?
With Sequence Padding
There are two issues: you need to apply pad_sequences to the text sequences first, and there is no such parameter as input_shape in SimpleRNN. Try the following code:
max_features = 20000 # Only consider the top 20k words
maxlen = 200 # Only consider the first 200 words of each movie review
batch_size = 1
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), "Training sequences")
print(len(x_test), "Validation sequences")
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
model = Sequential()
model.add(Embedding(input_dim=max_features, output_dim=32))
model.add(SimpleRNN(units=32))
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["acc"])
history = model.fit(x_train, y_train, batch_size=batch_size,
epochs=10, validation_split=0.2)
Here is the official code example, it might help you.
With Sequence Padding with Mask in Embedding Layer
Based on your comments and information, it seems that it is possible to use a variable-length input sequence; check this and this too. But still, in most cases practitioners prefer to pad the sequences to a uniform length, as it is convenient. Choosing a non-uniform or variable input sequence length is something of a special case, similar to wanting variable input image sizes for vision models.
However, here we will add some info on padding and on how we can mask out the padded values at training time, which technically amounts to variable-length input training. Hopefully that convinces you. Let's first understand what pad_sequences does. In sequence data, it is very common for training samples to have different lengths. Consider the following inputs:
raw_inputs = [
[711, 632, 71],
[73, 8, 3215, 55, 927],
[83, 91, 1, 645, 1253, 927],
]
These 3 training samples have different lengths: 3, 5, and 6 respectively. What we do next is make them all equal length by adding some value (typically 0 or -1), either at the beginning or at the end of each sequence.
tf.keras.preprocessing.sequence.pad_sequences(
raw_inputs, maxlen=6, dtype="int32", padding="pre", value=0.0
)
array([[ 0, 0, 0, 711, 632, 71],
[ 0, 73, 8, 3215, 55, 927],
[ 83, 91, 1, 645, 1253, 927]], dtype=int32)
We can set padding = "post" to place the pad value at the end of the sequence; in fact, "post" padding is recommended when working with RNN layers, in order to be able to use the CuDNN implementation of those layers. Also, FYI, you may notice we set maxlen = 6, which is the longest input sequence length. It does not have to be the longest, as that may get computationally expensive if the dataset gets bigger. We can set it to 5, assuming that our model can learn a feature representation within this length; it's a kind of hyperparameter. And that brings in another parameter, truncating.
tf.keras.preprocessing.sequence.pad_sequences(
raw_inputs, maxlen=5, dtype="int32", padding="pre", truncating="pre", value=0.0
)
array([[ 0, 0, 711, 632, 71],
[ 73, 8, 3215, 55, 927],
[ 91, 1, 645, 1253, 927]], dtype=int32)
Okay, now we have a padded input sequence; all inputs have a uniform length. Now we can mask out those additional padded values at training time: we tell the model that some part of the data is padding and should be ignored. That mechanism is masking. It is a way to tell sequence-processing layers that certain timesteps in the input are missing and should therefore be skipped when processing the data. There are three ways to introduce input masks in Keras models:
Add a keras.layers.Masking layer.
Configure a keras.layers.Embedding layer with mask_zero=True.
Pass a mask argument manually when calling layers that support this argument (e.g. RNN layers).
Here we will demonstrate only the second option, configuring the Embedding layer. It has a parameter called mask_zero, which is set to False by default. If we set it to True, then indices containing 0 in the sequences will be skipped: a False entry in the resulting mask indicates that the corresponding timestep should be ignored during processing.
padd_input = tf.keras.preprocessing.sequence.pad_sequences(
raw_inputs, maxlen=6, dtype="int32", padding="pre", value=0.0
)
print(padd_input)
embedding = tf.keras.layers.Embedding(input_dim=5000, output_dim=16, mask_zero=True)
masked_output = embedding(padd_input)
print(masked_output._keras_mask)
[[ 0 0 0 711 632 71]
[ 0 73 8 3215 55 927]
[ 83 91 1 645 1253 927]]
tf.Tensor(
[[False False False True True True]
[False True True True True True]
[ True True True True True True]], shape=(3, 6), dtype=bool)
And here is how it's computed in the class Embedding(Layer).
def compute_mask(self, inputs, mask=None):
if not self.mask_zero:
return None
return tf.not_equal(inputs, 0)
And here is one catch: if we set mask_zero to True, then as a consequence index 0 cannot be used in the vocabulary. According to the doc:
mask_zero: Boolean, whether or not the input value 0 is a special "padding" value that should be masked out. This is useful when using recurrent layers which may take variable length input. If this is True, then all subsequent layers in the model need to support masking or an exception will be raised. If mask_zero is set to True, as a consequence, index 0 cannot be used in the vocabulary (input_dim should equal size of vocabulary + 1).
So, we have to use at least max_features + 1. Here is a nice explanation of this.
Here is the complete example using these parts with your code.
max_features = 20000 # Only consider the top 20k words
maxlen = 350         # Only consider the first 350 words out of `max_list_length(x_train)`
batch_size = 512
# get the data
(x_train, y_train), (_, _) = imdb.load_data(num_words=max_features)
print(x_train.shape)
# check the highest sequence length
max_list_length = lambda seqs: max([len(i) for i in seqs])
print(max_list_length(x_train))
print('Length ', len(x_train[0]), x_train[0])
print('Length ', len(x_train[1]), x_train[1])
print('Length ', len(x_train[2]), x_train[2])
# (1). padding with value 0 at the end of the sequence - padding="post", value=0.
# (2). truncate to 'maxlen' words
# out of `max_list_length(x_train)` at the end - maxlen=maxlen, truncating="post"
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train,
maxlen=maxlen, dtype="int32",
padding="post", truncating="post",
value=0.)
print('Length ', len(x_train[0]), x_train[0])
print('Length ', len(x_train[1]), x_train[1])
print('Length ', len(x_train[2]), x_train[2])
Your model definition should now be:
model = Sequential()
model.add(Embedding(
input_dim=max_features + 1,
output_dim=32,
mask_zero=True))
model.add(SimpleRNN(units=32))
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["acc"])
history = model.fit(x_train, y_train,
batch_size=256,
epochs=1, validation_split=0.2)
639ms/step - loss: 0.6774 - acc: 0.5640 - val_loss: 0.5034 - val_acc: 0.8036
References
Masking and padding with Keras
Embedding layer - pad_sequences
Recurrent Neural Networks (RNN) with Keras
Without Sequence Padding
Padding is not a MUST for variable-length input sequences in sequence modeling. In TensorFlow, a tensor with a variable number of elements along some axis is called ragged, and we use tf.ragged.RaggedTensor for ragged data. For example:
# variable length input sequences
ragged_list = [
[0, 1, 2, 3],
[4, 5],
[6, 7, 8],
[9]]
# convert to ragged tensor that handle such variable length inputs
tf.ragged.constant(ragged_list).shape
shape: [4, None]
So, we can use ragged input data in sequence modeling and we no longer need to pad the sequence for uniform input length.
DataSet
import tensorflow as tf
import warnings, numpy as np
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
# maxlen = 200 # No maximum length but whatever
batch_size = 256
max_features = 20000 # Only consider the top 20k words
(x_train, y_train), (x_test, y_test) = \
tf.keras.datasets.imdb.load_data(num_words=max_features)
print(len(x_train), "Training sequences")
print(len(x_test), "Validation sequences")
25000 Training sequences
25000 Validation sequences
# quick check
x_train[:3]
array([list([1, 14, 22, 16, 43, 53, ....]),
list([....]),
list([...]),
Convert to Ragged
Now, we convert it to a ragged tensor, which handles variable-size sequences.
x_train = tf.ragged.constant(x_train)
x_test = tf.ragged.constant(x_test)
# quick check
x_train[:3]
<tf.RaggedTensor [[1, 14, 22, 16, 43, 53, ...] [...] [...]]>
x_train.shape, x_test.shape
(TensorShape([25000, None]), TensorShape([25000, None]))
Model
# Input for variable-length sequences of integers
inputs = tf.keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = tf.keras.layers.Embedding(max_features, 128)(inputs)
x = tf.keras.layers.SimpleRNN(units=32)(x)
# Add a classifier
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(inputs, outputs)
model.summary()
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_2 (InputLayer) [(None, None)] 0
_________________________________________________________________
embedding_1 (Embedding) (None, None, 128) 2560000
_________________________________________________________________
simple_rnn (SimpleRNN) (None, 32) 5152
_________________________________________________________________
dense (Dense) (None, 1) 33
=================================================================
Total params: 2,565,185
Trainable params: 2,565,185
Non-trainable params: 0
_________________________________________________________________
Compile and Train
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])
model.fit(x_train, y_train, batch_size=batch_size, verbose=2,
epochs=10, validation_data=(x_test, y_test))
Epoch 1/10
113s 1s/step - loss: 0.6273 - acc: 0.6295 - val_loss: 0.4188 - val_acc: 0.8206
Epoch 2/10
109s 1s/step - loss: 0.4895 - acc: 0.8041 - val_loss: 0.4703 - val_acc: 0.8040
Epoch 3/10
109s 1s/step - loss: 0.3513 - acc: 0.8661 - val_loss: 0.3996 - val_acc: 0.8337
Epoch 4/10
110s 1s/step - loss: 0.2450 - acc: 0.9105 - val_loss: 0.3945 - val_acc: 0.8420
Epoch 5/10
109s 1s/step - loss: 0.1437 - acc: 0.9559 - val_loss: 0.4085 - val_acc: 0.8422
Epoch 6/10
109s 1s/step - loss: 0.0767 - acc: 0.9807 - val_loss: 0.4310 - val_acc: 0.8429
Epoch 7/10
109s 1s/step - loss: 0.0380 - acc: 0.9932 - val_loss: 0.4784 - val_acc: 0.8437
Epoch 8/10
110s 1s/step - loss: 0.0288 - acc: 0.9946 - val_loss: 0.5039 - val_acc: 0.8564
Epoch 9/10
110s 1s/step - loss: 0.0957 - acc: 0.9615 - val_loss: 0.5687 - val_acc: 0.8575
Epoch 10/10
109s 1s/step - loss: 0.1008 - acc: 0.9637 - val_loss: 0.5166 - val_acc: 0.8550
I'm very new to machine learning, so this question might sound stupid.
I'm following a tutorial on text classification, but I'm facing an error that I have no idea how to solve.
This is the code I have (it is basically what is found in the tutorial):
import pandas as pd
filepath_dict = {'yelp': 'data/yelp_labelled.txt',
'amazon': 'data/amazon_cells_labelled.txt',
'imdb': 'data/imdb_labelled.txt'}
df_list = []
for source, filepath in filepath_dict.items():
df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
df['source'] = source
df_list.append(df)
df = pd.concat(df_list)
print(df.iloc[0:4])
from sklearn.feature_extraction.text import CountVectorizer
df_yelp = df[df['source'] == 'yelp']
sentences = df_yelp['sentence'].values
y = df_yelp['label'].values
from sklearn.model_selection import train_test_split
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)
X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)
from keras.models import Sequential
from keras import layers
input_dim = X_train.shape[1]
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
model.summary()
history = model.fit(X_train, y_train,
nb_epoch=100,
verbose=False,
validation_data=(X_test, y_test),
batch_size=10)
When I reach the last line, I get the error:
"TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]"
I guess I'll have to perform some kind of transformation on the data I'm using, or try to load the data in a different way. I already searched on Stack Overflow but, being new to all this, I couldn't find anything helpful.
How do I make this work? Ideally I'd like to get not only the solution but also a brief explanation of why the error happened and what the solution does in order to solve it.
Thanks!
The reason you're facing this difficulty is that your X_train and X_test are of type <class scipy.sparse.csr.csr_matrix>, whereas your model expects numpy arrays.
Try casting them to dense and you're good to go:
X_train = X_train.todense()
X_test = X_test.todense()
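Note that .todense() returns a numpy matrix; the equivalent .toarray() returns a plain ndarray, which is usually the safer choice with Keras:
X_train = X_train.toarray()
X_test = X_test.toarray()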
I'm not sure why you are getting an error with this script.
The following script works fine, even with a sparse matrix; maybe give it a try on your machine.
sentences = ['i want to test this','let us try this',
'would this work','how about this',
'even this','this should not work']
y= [0,0,0,0,0,1]
from sklearn.model_selection import train_test_split
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)
X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)
from keras.models import Sequential
from keras import layers
input_dim = X_train.shape[1]
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
model.summary()
model.fit(X_train, y_train,
epochs=2,
verbose=True,
validation_data=(X_test, y_test),
batch_size=2)
#
Layer (type) Output Shape Param #
=================================================================
dense_5 (Dense) (None, 10) 110
_________________________________________________________________
dense_6 (Dense) (None, 1) 11
=================================================================
Total params: 121
Trainable params: 121
Non-trainable params: 0
_________________________________________________________________
Train on 4 samples, validate on 2 samples
Epoch 1/2
4/4 [==============================] - 1s 169ms/step - loss: 0.7570 - acc: 0.2500 - val_loss: 0.6358 - val_acc: 1.0000
Epoch 2/2
4/4 [==============================] - 0s 3ms/step - loss: 0.7509 - acc: 0.2500 - val_loss: 0.6328 - val_acc: 1.0000
I tried to implement an LSTM model for time-series prediction. Below is my trial code. This code runs without error; you can also try it standalone, without other dependencies.
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed, Bidirectional
from sklearn.metrics import mean_squared_error, accuracy_score
from scipy.stats import linregress
from sklearn.utils import shuffle
fi = 'pollution.csv'
raw = pd.read_csv(fi, delimiter=',')
raw = raw.drop('Dates', axis=1)
print (raw.shape)
scaler = MinMaxScaler(feature_range=(-1, 1))
raw = scaler.fit_transform(raw)
time_steps = 7
def create_ds(data, t_steps):
data = pd.DataFrame(data)
data_s = data.copy()
for i in range(time_steps):
data = pd.concat([data, data_s.shift(-(i+1))], axis = 1)
data.dropna(axis=0, inplace=True)
return data.values
ds = create_ds(raw, time_steps)
print (ds.shape)
n_feats = raw.shape[1]
n_obs = time_steps * n_feats
n_rows = ds.shape[0]
train_size = int(n_rows * 0.8)
train_data = ds[:train_size, :]
train_data = shuffle(train_data)
test_data = ds[train_size:, :]
x_train = train_data[:, :n_obs]
y_train = train_data[:, n_obs:]
x_test = test_data[:, :n_obs]
y_test = test_data[:, n_obs:]
x_train = x_train.reshape(1, x_train.shape[0], x_train.shape[1])
y_train = y_train.reshape(1, y_train.shape[0], y_train.shape[1])
x_test = x_test.reshape(1, x_test.shape[0], x_test.shape[1])
print (x_train.shape)
print (y_train.shape)
print (x_test.shape)
print (y_test.shape)
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(None, x_train.shape[2]), stateful=True, batch_size=1))
model.add(LSTM(32, return_sequences=True, stateful=True))
model.add(LSTM(n_feats, return_sequences=True, stateful=True))
model.compile(loss='mse', optimizer='rmsprop')
model.fit(x_train, y_train, epochs=10, batch_size=1, verbose=2)
y_predict = model.predict(x_test)
y_predict = y_predict.reshape(y_predict.shape[1], y_predict.shape[2])
y_predict = scaler.inverse_transform(y_predict)
y_test = scaler.inverse_transform(y_test)
y_test = y_test[:,0]
y_predict = y_predict[:,0]
print (y_test.shape)
print (y_predict.shape)
plt.plot(y_test, label='True')
plt.plot(y_predict, label='Predict')
plt.legend()
plt.show()
However, the prediction is extremely poor. How can I improve the prediction? Do you have any ideas for improving it by re-designing the architecture and/or the layers?
If you want to use the model in my code (the link you passed), you need to have the data correctly shaped: (1 sequence, total_time_steps, 5 features)
Important: I don't know if this is the best way or the best model to do this, but this model is predicting 7 time steps ahead of the input (time_shift=7)
Data and initial vars
fi = 'pollution.csv'
raw = pd.read_csv(fi, delimiter=',')
raw = raw.drop('Dates', axis=1)
print("raw shape:")
print (raw.shape)
#(1789,5) - 1789 time steps / 5 features
scaler = MinMaxScaler(feature_range=(-1, 1))
raw = scaler.fit_transform(raw)
time_shift = 7 #shift is the number of steps we are predicting ahead
n_rows = raw.shape[0] #n_rows is the number of time steps of our sequence
n_feats = raw.shape[1]
train_size = int(n_rows * 0.8)
#I couldn't understand how "ds" worked, so I simply removed it because in the code below it's not necessary
#getting the train part of the sequence
train_data = raw[:train_size, :] #first train_size steps, all 5 features
test_data = raw[train_size:, :] #I'll use the beginning of the data as state adjuster
#train_data = shuffle(train_data) !!!!!! we cannot shuffle time steps!!! we lose the sequence doing this
x_train = train_data[:-time_shift, :] #the entire train data, except the last shift steps
x_test = test_data[:-time_shift,:] #the entire test data, except the last shift steps
x_predict = raw[:-time_shift,:] #the entire raw data, except the last shift steps
y_train = train_data[time_shift:, :]
y_test = test_data[time_shift:,:]
y_predict_true = raw[time_shift:,:]
x_train = x_train.reshape(1, x_train.shape[0], x_train.shape[1]) #ok shape (1,steps,5) - 1 sequence, many steps, 5 features
y_train = y_train.reshape(1, y_train.shape[0], y_train.shape[1])
x_test = x_test.reshape(1, x_test.shape[0], x_test.shape[1])
y_test = y_test.reshape(1, y_test.shape[0], y_test.shape[1])
x_predict = x_predict.reshape(1, x_predict.shape[0], x_predict.shape[1])
y_predict_true = y_predict_true.reshape(1, y_predict_true.shape[0], y_predict_true.shape[1])
print("\nx_train:")
print (x_train.shape)
print("y_train")
print (y_train.shape)
print("x_test")
print (x_test.shape)
print("y_test")
print (y_test.shape)
Model
Your model wasn't very powerful for this task, so I tried a bigger one (this one, on the other hand, is too powerful):
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(None, x_train.shape[2])))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(n_feats, return_sequences=True))
model.compile(loss='mse', optimizer='adam')
Fitting
Notice that I had to train for 2000+ epochs for the model to achieve good results.
I added the validation data so we can compare the loss for train and test.
#notice that I'm predicting from the ENTIRE sequence, including x_train
#it is important for the model to adjust its states before predicting the end
model.fit(x_train, y_train, epochs=1000, batch_size=1, verbose=2, validation_data=(x_test,y_test))
Predicting
Important: as for predicting the end of a sequence based on the beginning, it's important that the model sees the beginning to adjust the internal states, so I'm predicting the entire data (x_predict), not only the test data.
y_predict_model = model.predict(x_predict)
print("\ny_predict_true:")
print (y_predict_true.shape)
print("y_predict_model: ")
print (y_predict_model.shape)
def plot(true, predicted, divider):
predict_plot = scaler.inverse_transform(predicted[0])
true_plot = scaler.inverse_transform(true[0])
predict_plot = predict_plot[:,0]
true_plot = true_plot[:,0]
plt.figure(figsize=(16,6))
plt.plot(true_plot, label='True',linewidth=5)
plt.plot(predict_plot, label='Predict',color='y')
if divider > 0:
maxVal = max(true_plot.max(),predict_plot.max())
minVal = min(true_plot.min(),predict_plot.min())
plt.plot([divider,divider],[minVal,maxVal],label='train/test limit',color='k')
plt.legend()
plt.show()
test_size = n_rows - train_size
print("test length: " + str(test_size))
plot(y_predict_true,y_predict_model,train_size)
plot(y_predict_true[:,-2*test_size:],y_predict_model[:,-2*test_size:],test_size)
Showing entire data
Showing the end portion of it for more detail
Please notice that this model is overfitting: it can learn the training data yet get bad results on the test data.
To solve this you must experiment with smaller models, use dropout layers, and apply other techniques to prevent overfitting; a minimal dropout sketch follows.
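For example, here is a minimal sketch of the same stack with dropout added; x_train and n_feats are the names from the code above, and the 0.2 rates are starting points to tune, not recommended values:
from keras.models import Sequential
from keras.layers import LSTM
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(None, x_train.shape[2]),
               dropout=0.2, recurrent_dropout=0.2))   # dropout on inputs and on the recurrent state
model.add(LSTM(32, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model.add(LSTM(n_feats, return_sequences=True))
model.compile(loss='mse', optimizer='adam')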
Notice also that this data very probably contains A LOT of random factors, meaning the models will not be able to learn anything useful from it. As you make smaller models to avoid overfitting, you may also find that the model will present worse predictions for training data.
Finding the perfect model is not an easy task, it's an open question and you must experiment. Maybe LSTM models simply aren't the solution. Maybe your data is simply not predictable, etc. There isn't a definitive answer for this.
How to know the model is good
With the validation data in training, you can compare loss for train and test data.
Train on 1 samples, validate on 1 samples
Epoch 1/1000
9s - loss: 0.4040 - val_loss: 0.3348
Epoch 2/1000
4s - loss: 0.3332 - val_loss: 0.2651
Epoch 3/1000
4s - loss: 0.2656 - val_loss: 0.2035
Epoch 4/1000
4s - loss: 0.2061 - val_loss: 0.1696
Epoch 5/1000
4s - loss: 0.1761 - val_loss: 0.1601
Epoch 6/1000
4s - loss: 0.1697 - val_loss: 0.1476
Epoch 7/1000
4s - loss: 0.1536 - val_loss: 0.1287
Epoch 8/1000
.....
Both should go down together. When the test loss stops going down while the train loss continues to improve, your model is starting to overfit.
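Keras can stop training at that point automatically; here is a minimal sketch with the EarlyStopping callback, using the variables from the code above (the patience value is an assumption, and restore_best_weights needs a reasonably recent Keras):
from keras.callbacks import EarlyStopping
# stop once val_loss has not improved for 20 epochs, keeping the best weights seen
early_stop = EarlyStopping(monitor='val_loss', patience=20,
                           restore_best_weights=True)
model.fit(x_train, y_train, epochs=1000, batch_size=1, verbose=2,
          validation_data=(x_test, y_test), callbacks=[early_stop])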
Trying another model
The best I could do (but I didn't really try much) was using this model:
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(None, x_train.shape[2])))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(n_feats, return_sequences=True))
model.compile(loss='mse', optimizer='adam')
When the losses were about:
loss: 0.0389 - val_loss: 0.0437
After this point, the validation loss started going up (so training beyond this point is totally useless)
Result:
This shows that all this model could learn was very overall behaviour, such as zones with higher values.
But the high frequency was either too random or the model wasn't good enough for this...
You may consider changing your model:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed, Bidirectional
from sklearn.metrics import mean_squared_error, accuracy_score
from scipy.stats import linregress
from sklearn.utils import shuffle
fi = 'pollution.csv'
raw = pd.read_csv(fi, delimiter=',')
raw = raw.drop('Dates', axis=1)
print (raw.shape)
scaler = MinMaxScaler(feature_range=(-1, 1))
raw = scaler.fit_transform(raw)
time_steps = 7
def create_ds(data, t_steps):
data = pd.DataFrame(data)
data_s = data.copy()
for i in range(time_steps):
data = pd.concat([data, data_s.shift(-(i+1))], axis = 1)
data.dropna(axis=0, inplace=True)
return data.values
ds = create_ds(raw, time_steps)
print (ds.shape)
n_feats = raw.shape[1]
n_obs = time_steps * n_feats
n_rows = ds.shape[0]
train_size = int(n_rows * 0.8)
train_data = ds[:train_size, :]
train_data = shuffle(train_data)
test_data = ds[train_size:, :]
x_train = train_data[:, :n_obs]
y_train = train_data[:, n_obs:]
x_test = test_data[:, :n_obs]
y_test = test_data[:, n_obs:]
print (x_train.shape)
print (x_test.shape)
print (y_train.shape)
print (y_test.shape)
x_train = x_train.reshape(x_train.shape[0], time_steps, n_feats)
x_test = x_test.reshape(x_test.shape[0], time_steps, n_feats)
print (x_train.shape)
print (x_test.shape)
print (y_train.shape)
print (y_test.shape)
model = Sequential()
model.add(LSTM(64, input_shape=(time_steps, n_feats), return_sequences=True))
model.add(LSTM(32, return_sequences=False))
model.add(Dense(n_feats))
model.compile(loss='mse', optimizer='rmsprop')
model.fit(x_train, y_train, epochs=10, batch_size=1, verbose=1, shuffle=False)
y_predict = model.predict(x_test)
print (y_predict.shape)
y_predict = scaler.inverse_transform(y_predict)
y_test = scaler.inverse_transform(y_test)
y_test = y_test[:,0]
y_predict = y_predict[:,0]
print (y_test.shape)
print (y_predict.shape)
plt.plot(y_test, label='True')
plt.plot(y_predict, label='Predict')
plt.legend()
plt.show()
But I really do not know the merits of your implementation:
* both x and y are 3d (1,steps,features) rather than x in 3d (samples, time-steps, features) and y in 2d (samples, features)
* input_shape=(None, x_train.shape[2])
* last layer - model.add(LSTM(n_feats, return_sequences=True, stateful=True))
Someone may provide a better answer.
Reading the original code, it seems the author first scales the dataset and then splits it up into training and testing subsets. This means that information about the testing subset (e.g., volatility etc.) has "leaked" into the training subset.
The recommended approach is to first perform the training/testing split, calculate the scaling parameters using only the training subset, and then use these parameters to scale the training and the testing subsets separately.
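A minimal sketch of that order, reusing the variable names from the code above:
from sklearn.preprocessing import MinMaxScaler
train_size = int(raw.shape[0] * 0.8)
train_raw, test_raw = raw[:train_size], raw[train_size:]
# fit the scaler on the training split only, then apply it to both splits
scaler = MinMaxScaler(feature_range=(-1, 1))
train_scaled = scaler.fit_transform(train_raw)
test_scaled = scaler.transform(test_raw)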
I'm not exactly sure what you could do; that data looks as if it has no discernible pattern, and if I can't see one, I doubt an LSTM could. Your prediction does look like a good regression line, though.
I am at a similar point myself, creating a model that predicts data like this. I created a SMOTErnn solution to add as past data, and I have found that using TimeseriesGenerator with a higher batch_size and higher stride performs much better.
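For reference, here is a minimal sketch of TimeseriesGenerator with a larger batch size and stride; the data is a random stand-in and all the values are illustrative:
import numpy as np
from keras.preprocessing.sequence import TimeseriesGenerator
data = np.random.rand(1000, 5)   # stand-in for the scaled feature matrix
targets = data                   # predict the next step of all features
gen = TimeseriesGenerator(data, targets, length=7, stride=3, batch_size=64)
# model.fit_generator(gen, epochs=10)  # or model.fit(gen) in tf.keras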
Here is my code to train an RNN with LSTM nodes:
# LSTM RNN with dropout for sequence classification
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pickle, numpy, pandas as pd
###################################### CONSTANTS #############################################
SEED = 7 # Fixes random seed for reproducibility.
URL = 'ibcData.tsv' # Specified dataset to gather data from.
SEPERATOR = '\t' # Seperator the dataset uses to divide data.
RANDOM_STATE = 1 # Pseudo-random number generator state used for random sampling.
TOP_WORDS = 5000 # Most used words in the dataset.
MAX_REVIEW_LENGTH = 500 # Length of each sentence being sent in (necessary).
EMBEDDING_VECTOR_LENGTH = 32 # The specific Embedded later will have 32-length vectors to
# represent each word.
BATCH_SIZE = 64 # Takes 64 sentences at a time and continually retrains RNN.
NUMBER_OF_EPOCHS = 3 # Fits RNN to more accurately guess the data's political bias.
DROPOUT = 0.2 # Helps slow down overfitting of data (slower convergence rate)
RECURRENT_DROPOUT = 0.2 # Helps slow down overfitting of data when recurrently training
##############################################################################################
# fix random seed for reproducibility
numpy.random.seed(SEED)
readData = pd.read_csv(URL, header=None, names=['label', 'message'], sep=SEPERATOR)
# convert label to a numerical variable
readData['label_num'] = readData.label.map({'Liberal' : 0, 'Neutral': 0.5, 'Conservative' : 1})
X = readData.message # Contains the dataset's actual sentences that were labeled
Y = readData.label_num # Either 0.0, 0.5, or 1.0 depending on label mapped to
# load the dataset into training and testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=RANDOM_STATE)
# truncate and pad input sequences
for sentence in X_train:
sentence.zfill(MAX_REVIEW_LENGTH)
for sentence in X_test:
sentence.zfill(MAX_REVIEW_LENGTH)
# create the model
model = Sequential()
model.add(Embedding(TOP_WORDS, EMBEDDING_VECTOR_LENGTH, input_length=MAX_REVIEW_LENGTH))
model.add(LSTM(100, recurrent_dropout=RECURRENT_DROPOUT, dropout=DROPOUT)) # Dropouts help prevent overfitting
model.add(Dense(2, activation='sigmoid')) # Layers deal with a 2D tensor, and output a 2D tensor
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=NUMBER_OF_EPOCHS, batch_size=BATCH_SIZE)
# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))
It is training a .tsv file that has data like this:
"Liberal","Forcing middle-class workers to bear a greater share of the cost of government weakens their support for needed investments and stirs resentment toward those who depend on public services the most ."
"Liberal", "Because it would not be worthwhile to bring a case for $ 30.22 , the arbitration clause would , as a practical matter , deny the Concepcions any relief and , more important , eliminate a class action that might punish AT&T for its pattern of fraudulent behavior ."
I try to run it and I get the following from the console. I have no idea how to fix it, nor do my professors who are trying to help me with this research:
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, 500, 32) 160000
_________________________________________________________________
lstm_1 (LSTM) (None, 100) 53200
_________________________________________________________________
dense_1 (Dense) (None, 2) 202
=================================================================
Total params: 213,402
Trainable params: 213,402
Non-trainable params: 0
_________________________________________________________________
None
Traceback (most recent call last):
  File "LSTM-RNN.py", line 55, in <module>
    model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=NUMBER_OF_EPOCHS, batch_size=BATCH_SIZE)
  File "C:\Users\Hydur\Anaconda3\lib\site-packages\keras\keras\models.py", line 871, in fit
    initial_epoch=initial_epoch)
  File "C:\Users\Hydur\Anaconda3\lib\site-packages\keras\keras\engine\training.py", line 1525, in fit
    batch_size=batch_size)
  File "C:\Users\Hydur\Anaconda3\lib\site-packages\keras\keras\engine\training.py", line 1379, in _standardize_user_data
    exception_prefix='input')
  File "C:\Users\Hydur\Anaconda3\lib\site-packages\keras\keras\engine\training.py", line 144, in _standardize_input_data
    str(array.shape))
ValueError: Error when checking input: expected embedding_1_input to have shape (None, 500) but got array with shape (3244, 1)
The main problem seems to be that X contained raw strings, while the Embedding layer expected data already encoded numerically. The Keras text preprocessing utilities will take care of that:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=MAX_REVIEW_LENGTH)
tokenizer.fit_on_texts(readData.message)
X = numpy.array(tokenizer.texts_to_matrix(readData.message)) # shape (None, 500)
This will encode each message as a fixed-length vector of 500 values (texts_to_matrix produces, by default, binary indicators of which of the top words appear in each message).
Once that was fixed, I also got an error on the "dense_1" layer. The last layer in your network was specified to have two output nodes, but the loss function you used (binary_crossentropy) expects a single column coded as 0/1. I edited it so that the layer has only one output node, so the process would complete, but I doubt that using 0, 0.5, 1 with binary cross-entropy will do what you want. I think you'd probably be better off with a 3-level one-hot encoding and categorical_crossentropy, but that's out of scope for this question; a minimal sketch of that change follows.
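For reference, a minimal sketch of that out-of-scope change, with the label mapping taken from the script below:
from keras.utils import to_categorical
# map the three labels to integer classes, then one-hot encode them
readData['label_num'] = readData.label.map(
    {'Liberal': 0, 'Neutral': 1, 'Conservative': 2})
Y = to_categorical(readData['label_num'], num_classes=3)
# the last layer and loss would then become:
# model.add(Dense(3, activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])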
Here is the full edited script that ran for me. I was only able to run it on the two observations you provided but it did complete.
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import os, pickle, numpy, pandas as pd
from keras.preprocessing.text import Tokenizer
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
################################### CONSTANTS ################################################
SEED = 7 # Fixes random seed for reproducibility.
URL = 'ibcData.tsv' # Specified dataset to gather data from.
SEPERATOR = '\t' # Seperator the dataset uses to divide data.
RANDOM_STATE = 1 # Pseudo-random number generator state used for random sampling.
TOP_WORDS = 5000 # Most used words in the dataset.
MAX_REVIEW_LENGTH = 500 # Length of each sentence being sent in (necessary).
EMBEDDING_VECTOR_LENGTH = 32 # The specific Embedded later will have 32-length vectors to
# represent each word.
BATCH_SIZE = 64 # Takes 64 sentences at a time and continually retrains RNN.
NUMBER_OF_EPOCHS = 3 # Fits RNN to more accurately guess the data's political bias.
# fix random seed for reproducibility
numpy.random.seed(SEED)
readData = pd.read_csv(URL, header=None, names=['label', 'message'], sep=SEPERATOR)
# convert label to a numerical variable
tokenizer = Tokenizer(num_words=MAX_REVIEW_LENGTH)
tokenizer.fit_on_texts(readData.message)
X = numpy.array(tokenizer.texts_to_matrix(readData.message)) # shape (None, 500)
readData['label_num'] = readData.label.map({'Liberal' : 0, 'Neutral': 0.5, 'Conservative' : 1})
Y = numpy.array(readData.label_num) # Either 0.0, 0.5, or 1.0 depending on label mapped to
# load the dataset into training and testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=RANDOM_STATE)
# create the model
model = Sequential()
model.add(Embedding(TOP_WORDS, EMBEDDING_VECTOR_LENGTH, input_length=MAX_REVIEW_LENGTH))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid')) # Layers deal with a 2D tensor, and output a 2D tensor
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=NUMBER_OF_EPOCHS, batch_size=BATCH_SIZE)
# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))
I then received the following output:
Using TensorFlow backend.
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, 500, 32) 160000
_________________________________________________________________
lstm_1 (LSTM) (None, 100) 53200
_________________________________________________________________
dense_1 (Dense) (None, 1) 101
=================================================================
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Train on 1 samples, validate on 1 samples
Epoch 1/3
1/1 [==============================] - 0s - loss: 0.6953 - acc: 0.0000e+00 - val_loss: 0.6814 - val_acc: 1.0000
Epoch 2/3
1/1 [==============================] - 0s - loss: 0.6814 - acc: 1.0000 - val_loss: 0.6670 - val_acc: 1.0000
Epoch 3/3
1/1 [==============================] - 0s - loss: 0.6670 - acc: 1.0000 - val_loss: 0.6516 - val_acc: 1.0000
Hope that helps.
I'm relatively new to neural nets, so please excuse my ignorance. I'm trying to adapt the Keras BLSTM example here. The example reads in texts and classifies them as 0 or 1. I want a BLSTM that does something very much like POS tagging; extras like lemmatizing or other advanced features are not necessary, I just want a basic model. My data is a list of sentences, and each word is given a category 1-8. I want to train a BLSTM that can use this data to predict the category for each word in an unseen sentence.
e.g. input = ['The', 'dog', 'is', 'red'] gives output = [2, 4, 3, 7]
If the keras example is not the best route, I'm open to other suggestions.
I currently have this:
'''Train a Bidirectional LSTM.'''
from __future__ import print_function
import numpy as np
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge
from prep_nn import prep_scan
np.random.seed(1337) # for reproducibility
max_features = 20000
batch_size = 16
maxlen = 18
print('Loading data...')
(X_train, y_train), (X_test, y_test) = prep_scan(nb_words=max_features,
test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')
print("Pad sequences (samples x time)")
# type issues here? float/int?
X_train = sequence.pad_sequences(X_train, value=0.)
X_test = sequence.pad_sequences(X_test, value=0.) # pad with zeros
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
# need to pad y too, because more than 1 output value, not classification?
y_train = sequence.pad_sequences(np.array(y_train), value=0.)
y_test = sequence.pad_sequences(np.array(y_test), value=0.)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
# this is the placeholder tensor for the input sequences
sequence = Input(shape=(maxlen,), dtype='int32')
# this embedding layer will transform the sequences of integers
# into vectors of size 128
embedded = Embedding(max_features, 128, input_length=maxlen)(sequence)
# apply forwards LSTM
forwards = LSTM(64)(embedded)
# apply backwards LSTM
backwards = LSTM(64, go_backwards=True)(embedded)
# concatenate the outputs of the 2 LSTMs
merged = merge([forwards, backwards], mode='concat', concat_axis=-1)
after_dp = Dropout(0.5)(merged)
# number after dense has to correspond to the output matrix?
output = Dense(17, activation='sigmoid')(after_dp)
model = Model(input=sequence, output=output)
# try using different optimizers and different optimizer configs
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
batch_size=batch_size,
nb_epoch=4,
validation_data=[X_test, y_test])
X_test_new = np.array([[0,0,0,0,0,0,0,0,0,12,3,55,4,34,5,45,3,9],[0,0,0,0,0,0,0,1,7,65,34,67,34,23,24,67,54,43,]])
classes = model.predict(X_test_new, batch_size=16)
print(classes)
My output has the right dimensions, but it is giving me floats between 0 and 1. I think this is because it's still doing binary classification. Does anyone know how to fix this?
SOLVED
Just make sure the labels are each binary arrays:
(X_train, y_train), (X_test, y_test), maxlen, word_ids, tags_ids = prep_model(
nb_words=nb_words, test_len=75)
W = (y_train > 0).astype('float')
print(len(X_train), 'train sequences')
print(int(len(X_train)*val_split), 'validation sequences')
print(len(X_test), 'heldout sequences')
# this is the placeholder tensor for the input sequences
sequence = Input(shape=(maxlen,), dtype='int32')
# this embedding layer will transform the sequences of integers
# into vectors of size 256
embedded = Embedding(nb_words, output_dim=hidden,
input_length=maxlen, mask_zero=True)(sequence)
# apply forwards LSTM
forwards = LSTM(output_dim=hidden, return_sequences=True)(embedded)
# apply backwards LSTM
backwards = LSTM(output_dim=hidden, return_sequences=True,
go_backwards=True)(embedded)
# concatenate the outputs of the 2 LSTMs
merged = merge([forwards, backwards], mode='concat', concat_axis=-1)
after_dp = Dropout(0.15)(merged)
# TimeDistributed for sequence
# change activation to sigmoid?
output = TimeDistributed(
Dense(output_dim=nb_classes,
activation='softmax'))(after_dp)
model = Model(input=sequence, output=output)
# try using different optimizers and different optimizer configs
# loss=binary_crossentropy, optimizer=rmsprop
model.compile(loss='categorical_crossentropy',
metrics=['accuracy'], optimizer='adam',
sample_weight_mode='temporal')
print('Train...')
model.fit(X_train, y_train,
batch_size=batch_size,
nb_epoch=epochs,
shuffle=True,
validation_split=val_split,
sample_weight=W)
Solved. The main issue was reshaping the data for the classification categories as binary arrays. I also used TimeDistributed and set return_sequences to True.
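Concretely, the label reshaping amounts to padding each sentence's tag sequence and one-hot encoding every tag, so y gets shape (samples, maxlen, nb_classes). A minimal sketch, reusing maxlen and nb_classes from the code above:
import numpy as np
from keras.utils import to_categorical
from keras.preprocessing import sequence
# y_train: one list of integer tags per sentence
y_train = sequence.pad_sequences(y_train, maxlen=maxlen, value=0)
# one-hot encode each tag -> shape (samples, maxlen, nb_classes)
y_train = np.array([to_categorical(seq, num_classes=nb_classes)
                    for seq in y_train])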
I know that this thread is very old, but I hope I can help.
I modified the model into a binary model:
sequence = Input(shape=(X_train.shape[1],), dtype='int32')
embedded = Embedding(max_features, embed_dim, input_length=X_train.shape[1], mask_zero=True)(sequence)
# apply forwards LSTM
forwards = LSTM(output_dim=hidden, return_sequences=True)(embedded)
# apply backwards LSTM
backwards = LSTM(output_dim=hidden, return_sequences=True,go_backwards=True)(embedded)
# concatenate the outputs of the 2 LSTMs
merged = concatenate([forwards, backwards])
after_dp = Dropout(0.15)(merged)
# add now layer LSTM without return_sequence
lstm_normal = LSTM(hidden)(merged)
# TimeDistributed for sequence
# change activation to sigmoid?
#output = TimeDistributed(Dense(output_dim=2,activation='sigmoid'))(after_dp)
#I changed the output layer from TimeDistributed to a Dense, because of the dimensionality problem, with output_dim = 1 (binary output)
output = Dense(output_dim=1,activation='sigmoid')(lstm_normal)
model = Model(input=sequence, output=output)
# try using different optimizers and different optimizer configs
# loss=binary_crossentropy, optimizer=rmsprop
# I changed model.compile to binary cross-entropy and removed the sample_weight_mode parameter
model.compile(loss='binary_crossentropy',
metrics=['accuracy'], optimizer='adam',
)
print(model.summary())
###################################
#this is the line of training
model.fit(X_train, Y_train,
batch_size=128,
epochs=10,
shuffle=True,
validation_split=0.2,
#sample_weight=W
)
#At this point it works fine.....
Train on 536000 samples, validate on 134000 samples
Epoch 1/10
536000/536000 [==============================] - 1814s 3ms/step - loss: 0.4794 - acc: 0.7679 - val_loss: 0.4624 - val_acc: 0.7784
Epoch 2/10
536000/536000 [==============================] - 1829s 3ms/step - loss: 0.4502 - acc: 0.7857 - val_loss: 0.4551 - val_acc: 0.7837
Epoch 3/10
99584/536000 [====>.........................] - ETA: 23:10 - loss: 0.4291 - acc: 0.7980