Error with TfidfVectorizer but ok with CountVectorizer - python

I have been working on this the whole day with no luck. I managed to isolate the problem to a single line involving TfidfVectorizer.
Here is my working code:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(xtrain)
X_train_count = vectorizer.transform(xtrain)
X_test_count = vectorizer.transform(xval)
X_train_count
from keras.models import Sequential
from keras import layers
input_dim = X_train_count.shape[1] # Number of features
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()
history = model.fit(X_train_count, ytrain,
                    epochs=10,
                    verbose=False,
                    validation_data=(X_test_count, yval),
                    batch_size=10)
But when I change to
from sklearn.feature_extraction.text import TfidfVectorizer
#TF-IDF initializer
vectorizer = TfidfVectorizer(max_df=0.8, max_features=1000)
vectorizer.fit(xtrain)
X_train_count = vectorizer.transform(xtrain)
X_test_count = vectorizer.transform(xval)
X_train_count
from keras.models import Sequential
from keras import layers
input_dim = X_train_count.shape[1] # Number of features
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()
history = model.fit(X_train_count, ytrain,
                    epochs=10,
                    verbose=False,
                    validation_data=(X_test_count, yval),
                    batch_size=10)
The only things that changed are these two lines:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df=0.8, max_features=1000)
and then I get this error
InvalidArgumentError: indices[1] = [0,997] is out of order. Many sparse ops require sorted indices.
Use tf.sparse.reorder to create a correctly ordered copy.
[Op:SerializeManySparse]
How do I fix that, and why is it happening?

vectorizer.transform(...) produces a sparse matrix, and Keras does not handle that well here. You simply have to convert it to a dense array, which you can do with:
vectorizer.transform(...).toarray()
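For completeness, here is a minimal sketch of that fix applied to the TF-IDF version of the code above (same xtrain/xval/ytrain/yval variables and the same model assumed). Keeping the matrices sparse and calling scipy's sort_indices() on them may also satisfy the "sorted indices" requirement in the error message, but densifying is the simplest route:

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.8, max_features=1000)
vectorizer.fit(xtrain)

# .toarray() turns the scipy sparse matrices into dense NumPy arrays,
# which Keras accepts without any sparse-tensor handling
X_train_count = vectorizer.transform(xtrain).toarray()
X_test_count = vectorizer.transform(xval).toarray()

history = model.fit(X_train_count, ytrain,
                    epochs=10,
                    verbose=False,
                    validation_data=(X_test_count, yval),
                    batch_size=10)

With max_features=1000 the densified matrices stay small, so memory should not be an issue.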

Related

I need consistent results for MSE between runs when training a Neural Network

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimization
# load data from UCI Machine Learning Repository
df = pd.read_csv(r'C:\Test_set_Yacht.csv')
df1 = pd.read_csv(r'C:\Train_set_Yacht.csv')
df2 = pd.read_csv(r'C:\Yacht_hydro.csv')
X = df2.drop("residuary_resistance", axis=1)
Y = df2["residuary_resistance"]
# split data into features and target
X_train = df1.drop("residuary_resistance", axis=1)
y_train = df1["residuary_resistance"]
# split data into train and test sets
X_test = df.drop("residuary_resistance", axis=1)
y_test = df["residuary_resistance"]
# scale data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
def objective_model_1(hidden_units, learning_rate):
    model = Sequential()
    model.add(Dense(hidden_units, input_dim=X.shape[1], activation="relu"))
    model.add(Dense(hidden_units, activation="relu"))
    model.add(Dense(1, activation="linear"))
    model.compile(loss="mse", optimizer=Adam(learning_rate=learning_rate))
    model.fit(X_train_scaled, y_train, epochs=100, verbose=0)
    y_pred = model.predict(X_test_scaled)
    return -mean_squared_error(y_test, y_pred)

pbounds_model_1 = {
    "hidden_units": (32, 128),
    "learning_rate": (1e-5, 1e-1),
}

bo_model_1 = BayesianOptimization(
    f=objective_model_1,
    pbounds=pbounds_model_1,
    random_state=42,
)
bo_model_1.maximize(init_points=10, n_iter=90)
def objective_model_2(hidden_units, learning_rate):
    model = Sequential()
    model.add(Dense(hidden_units, input_shape=X_train_scaled.shape[1:], activation="relu"))
    model.add(Dense(hidden_units, activation="relu"))
    model.add(Dense(hidden_units, activation="relu"))
    model.add(Dense(hidden_units, activation="relu"))
    model.add(Dense(1, activation="linear"))
    model.compile(loss="mse", optimizer=Adam(learning_rate=learning_rate))
    model.fit(X_train_scaled, y_train, epochs=100, verbose=0)
    y_pred = model.predict(X_test_scaled)
    return -mean_squared_error(y_test, y_pred)

pbounds_model_2 = {
    "hidden_units": (32, 128),
    "learning_rate": (1e-5, 1e-1),
}

bo_model_2 = BayesianOptimization(
    f=objective_model_2,
    pbounds=pbounds_model_2,
    random_state=42,
)
bo_model_2.maximize(init_points=10, n_iter=90)
# get the best hyperparameters for each model
best_params_model_1 = bo_model_1.max["params"]
best_params_model_2 = bo_model_2.max["params"]
# train and evaluate model 1 with best hyperparameters
model_1 = Sequential()
model_1.add(Dense(32, input_dim=X.shape[1], activation="relu"))
model_1.add(Dense(32, activation="relu"))
model_1.add(Dense(1, activation="linear"))
model_1.compile(loss="mse", optimizer=Adam(learning_rate=best_params_model_1["learning_rate"]))
model_1.fit(X_train_scaled, y_train, epochs=100, verbose=0)
y_pred_1 = model_1.predict(X_test_scaled)
mse_1 = mean_squared_error(y_test, y_pred_1)
print("Model 1 MSE on test set:", mse_1)
# train and evaluate model 2 with best hyperparameters
model_2 = Sequential()
model_2.add(Dense(64, input_dim=X.shape[1], activation="relu"))
model_2.add(Dense(64, activation="relu"))
model_2.add(Dense(64, activation="relu"))
model_2.add(Dense(64, activation="relu"))
model_2.add(Dense(1, activation="linear"))
model_2.compile(loss="mse", optimizer=Adam(learning_rate=best_params_model_2["learning_rate"]))
model_2.fit(X_train_scaled, y_train, epochs=100, verbose=0)
y_pred_2 = model_2.predict(X_test_scaled)
mse_2 = mean_squared_error(y_test, y_pred_2)
print("Model 2 MSE on test set:", mse_2)
In the code above, I implement Bayesian optimization for hyperparameter tuning of two different NNs using the data set from https://archive.ics.uci.edu/ml/datasets/yacht+hydrodynamics. After running this, I create those two NNs again in a Jupyter Notebook code block and train them with the best hyperparameters already determined by the Bayesian optimizer. I need to get the same MSE each time I run the code; this is the reason I am already splitting the data myself, to ensure the same results.
The inner workings of the TensorFlow library are non-deterministic, so you must set a random seed in order to get reproducible results. In practice you just need to add this line at the start of your code:
tf.random.set_seed(0)
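If setting the TensorFlow seed alone is not enough, note that NumPy, Python's built-in random module, and hash randomization each have their own sources of randomness. A fuller seeding sketch follows; the two commented-out calls are assumptions about newer TensorFlow versions (roughly 2.7+ and 2.9+), so treat them as optional:

import os
import random
import numpy as np
import tensorflow as tf

SEED = 0
os.environ["PYTHONHASHSEED"] = str(SEED)   # fix Python hash randomization
random.seed(SEED)                          # Python's built-in RNG
np.random.seed(SEED)                       # NumPy RNG (scikit-learn splits, etc.)
tf.random.set_seed(SEED)                   # TensorFlow RNG (weight init, shuffling, dropout)

# Optional, newer TensorFlow only:
# tf.keras.utils.set_random_seed(SEED)             # sets the Python, NumPy and TF seeds at once
# tf.config.experimental.enable_op_determinism()   # forces deterministic (GPU) ops

Note that random_state=42 in BayesianOptimization only fixes the hyperparameter sampling; the network training itself still needs the seeds above.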

Why can't I reproduce Keras results?

I know that there is a problem with reproducibility in Keras. However, based on my research I created a function:
def set_seed():
    seed_value = 42
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
    np.random.seed(seed_value)
    random.seed(seed_value)
    tf.random.set_seed(seed_value)
    K.set_session(sess)
which should ensure reproducible results in Keras.
My problem
I'm trying to run a self-created grid search on a neural network using Keras:
# Early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
callbacks = [es]

# Possible learning rates
learning_rates = np.linspace(0.1, 10**(-5), 10)

# Run grid search on one layer
set_seed()
mse1 = np.array([])
rate1 = np.array([])
neuron_number1 = np.array([])
for rate in learning_rates[0:2]:
    for neuron in range(1, 3):
        model = Sequential()
        model.add(Dense(neuron, input_dim=2, kernel_initializer='normal', activation='relu'))
        model.add(Dense(1, activation='relu'))
        model.summary()
        model.compile(loss='mse', optimizer=SGD(lr=rate), metrics=['mse'])
        history = model.fit(X_train, y_train, epochs=1000, batch_size=50, validation_split=0.5, callbacks=[es])
        mse1 = np.append(mse1, history.history['val_loss'][-1])
        rate1 = np.append(rate1, rate)
        neuron_number1 = np.append(neuron_number1, neuron)

neural_summary1 = pandas.DataFrame(data=[neuron_number1, rate1, mse1])
neural_summary1 = neural_summary1.transpose()
neural_summary1.columns = ["number_of_neurons", "learning_rate", "mse"]
print(neural_summary1.iloc[neural_summary1['mse'].idxmin()])
number_of_neurons 2.000000
learning_rate 0.088890
mse 0.159922
But when I run the same configuration on its own, outside the grid search:
set_seed()
model = Sequential()
model.add(Dense(2, input_dim=2, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, activation = 'relu'))
model.summary()
model.compile(loss='mse', optimizer=SGD(lr=0.088890), metrics=['mse'])
history = model.fit(X_train, y_train, epochs=1000, batch_size=50, validation_split=0.5, callbacks=[es])
print(history.history['val_loss'][-1])
8.767917346954345
which is different from the result obtained previously from the grid search.
Do you know where the mistake in my code is, or why it is not working?
The packages that I use are the following:
import pandas
import random
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD
import os
from keras import backend as K
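One thing worth checking (an observation, not a confirmed diagnosis): in the grid search, set_seed() is called once before the two loops, so by the time the configuration with 2 neurons and learning rate 0.088890 is built, the earlier iterations have already consumed random numbers for their own weight initialization and data shuffling. The standalone run builds its model immediately after set_seed(), so it starts from a different RNG state. A sketch of a more comparable setup resets the seed right before each model is constructed:

for rate in learning_rates[0:2]:
    for neuron in range(1, 3):
        set_seed()  # reset the RNG state for every configuration so each
                    # model starts from identical initial weights and shuffling
        model = Sequential()
        model.add(Dense(neuron, input_dim=2, kernel_initializer='normal', activation='relu'))
        model.add(Dense(1, activation='relu'))
        model.compile(loss='mse', optimizer=SGD(lr=rate), metrics=['mse'])
        history = model.fit(X_train, y_train, epochs=1000, batch_size=50,
                            validation_split=0.5, callbacks=[es])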

NLP Sentiment Analysis net is not learning

I want to train a neural net for sentiment analysis. I have followed the tutorials on the Keras web page, but I had to adapt the code to my use case in order to be able to use the net afterwards.
For this purpose I decode the texts from the Keras imdb dataset back from numbers to words, and then I stem the text, because I need to work with the stemmed text. After that, since I want to control the way I do the word embeddings rather than using texts_to_sequences and pad_sequences, I train a doc2vec model and apply it to the training set, so that I can obtain the embeddings of the texts I want to classify.
The problem is that the net does not learn anything: the accuracy does not improve and I cannot reduce the loss function. I have tried many, many things, like the architecture of the net, all the hyperparameters, and changing the last layer from 2 neurons to 1 and from sparse_categorical_crossentropy to binary_crossentropy. Let's see if anybody can help and shed some light on my problem. I paste the code here, and thanks in advance.
from keras.datasets import imdb
max_features = 40000
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=max_features)
import numpy as np
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)
index = imdb.get_word_index()
reverse_index = dict([(value, key) for (key, value) in index.items()])
decoded = " ".join([reverse_index.get(i - 3, "") for i in data[0]])
import nltk
from nltk.stem import LancasterStemmer
toke_corpus = list()
lan = LancasterStemmer()
from tqdm import tqdm
lista_reviews = list()
for review in tqdm(data):
    lista_reviews.append(np.array([lan.stem(reverse_index.get(i - 3, '')) for i in review][1:]))
train_x, test_x = lista_reviews[10000:], lista_reviews[:10000]
train_y, test_y = targets[10000:], targets[:10000]
from gensim.models.callbacks import CallbackAny2Vec
class EpochLogger(CallbackAny2Vec):
    '''Callback to log information about training'''
    def __init__(self):
        self.epoch = 0
    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))
    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(lista_reviews)]
print("DOcuments already built")
epoch_logger = EpochLogger()
model = Doc2Vec(documents, vector_size=512, window=5, min_count=3, workers=8, epochs = 7, callbacks=[epoch_logger])
encoded_x_train, encoded_x_test = list(), list()
from tqdm import tqdm
for i in tqdm(train_x):
    encoded_x_train.append(model.infer_vector(i))
for k in tqdm(test_x):
    encoded_x_test.append(model.infer_vector(k))
import keras
reduce_lr = keras.callbacks.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.50, patience=2, verbose=1, mode='auto', cooldown=0, min_lr=0.00001)
early = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=4, verbose=1, mode='auto')
from keras import models
from keras.models import Sequential
from keras import layers
from keras.layers import Embedding, Bidirectional, Dense, LSTM, Conv1D, MaxPooling1D, Flatten
model1 = Sequential()
model1.add(Embedding(input_dim = max_features, input_length=512, output_dim=128, trainable=False))
model1.add(Conv1D(filters=64,
                  kernel_size=5,
                  padding='valid',
                  activation='linear',
                  strides=1))
model1.add(MaxPooling1D(pool_size=4))
model1.add(Dense(64, activation='linear'))
model1.add(LSTM(32, activation='tanh'))
# model1.add(Dense(32, activation='relu'))
# model1.add(Flatten())
# model1.add(Dense(1, activation='sigmoid'))
model1.add(Dense(2, activation='softmax'))
model1.summary()
from keras import optimizers
# sgd = optimizers.SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
adam = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, amsgrad=False)
model1.compile(loss='sparse_categorical_crossentropy',
               optimizer=adam,
               metrics=['accuracy'])
history = model1.fit(np.array(encoded_x_train), np.array(train_y),
                     epochs=20,
                     batch_size=500,
                     validation_data=(np.array(encoded_x_test), np.array(test_y)),
                     callbacks=[reduce_lr, early])
You use Doc2Vec to create document-level embeddings. For this reason, I don't think the Embedding, Conv1D and MaxPooling1D layers are useful in your network; they are useful with word2vec, where you extract an embedding for each token and feed the resulting sequences into the network.
Try to feed your network directly with your document embeddings, in this way:
model1 = Sequential()
model1.add(Dense(128, activation='relu', input_shape=(512,)))
# ....
model1.add(Dense(2, activation='softmax'))
adam = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, amsgrad=False)
model1.compile(loss='sparse_categorical_crossentropy',
               optimizer=adam,
               metrics=['accuracy'])
history = model1.fit(np.array(encoded_x_train), np.array(train_y),
                     epochs=20,
                     batch_size=500,
                     validation_data=(np.array(encoded_x_test), np.array(test_y)),
                     callbacks=[reduce_lr, early])
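As a quick sanity check on shapes (an illustrative snippet, not part of the original answer): the Doc2Vec vectors are 512-dimensional (the vector_size chosen above), so the arrays fed to the first Dense layer should be 2-D with 512 columns:

import numpy as np

X_tr = np.array(encoded_x_train)   # expected shape: (num_train_docs, 512)
X_te = np.array(encoded_x_test)    # expected shape: (num_test_docs, 512)
print(X_tr.shape, X_te.shape)      # the second dimension should equal the Doc2Vec vector_size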

How can I evaluate a StratifiedKFold model?

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
x_train = dataset[0:700,:-1]
y_train = dataset[0:700,-1]
x_test = dataset[700:,:-1]
y_test = dataset[700:,-1]
def create_model():
    model = Sequential()
    model.add(Dense(12, input_dim=8, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=64)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
scores = cross_val_score(model, x_train, y_train, cv=skf)
predictions = cross_val_predict(model, x_test, y_test, cv=skf)
I want to train on x_train, y_train with StratifiedKFold and evaluate on x_test, y_test. How can I do that?
I tried cross_val_predict, but I think it is not appropriate.
To split between train and test in a stratified way you can use:
from sklearn.model_selection import train_test_split
dataset_train, dataset_test = train_test_split(dataset,
                                               stratify=dataset[:, -1],
                                               test_size=0.2)
# split both datasets into X, y
Check:
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
Stratified Train/Test-split in scikit-learn
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
accuracy = []
for train in skf.split(x_train, y_train):
    model = Sequential()
    model.add(Dense(12, input_dim=8, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
How about this one? It works, but I don't know whether it is correct.
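For illustration, a minimal sketch of one common pattern (reusing create_model and the data splits from the question, and an arbitrary random_state of 42 in place of the undefined seed): cross-validate on the training split to estimate performance, then refit on all of x_train and evaluate once on the held-out x_test, y_test:

# Estimate generalization with stratified cross-validation on the training split
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
fold_scores = []
for train_idx, val_idx in skf.split(x_train, y_train):
    model = create_model()
    model.fit(x_train[train_idx], y_train[train_idx], epochs=100, batch_size=64, verbose=0)
    _, acc = model.evaluate(x_train[val_idx], y_train[val_idx], verbose=0)
    fold_scores.append(acc)
print("CV accuracy: %.3f (+/- %.3f)" % (np.mean(fold_scores), np.std(fold_scores)))

# Refit on the full training split, then evaluate once on the untouched test split
final_model = create_model()
final_model.fit(x_train, y_train, epochs=100, batch_size=64, verbose=0)
_, test_acc = final_model.evaluate(x_test, y_test, verbose=0)
print("Test accuracy: %.3f" % test_acc)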

Really bad accuracy on the training and test sets with a neural network in Keras

I'm building a neural network for the "Default of credit card clients" data set from http://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients.
But the accuracy of my models is pretty bad, worse than if I predicted all zeros. I already did some research on it, oversampled to balance the classes, and changed the optimizer, because Adam was not increasing the accuracy.
What else could I do?
import pandas
import numpy
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
import keras
from imblearn.over_sampling import SMOTE
seed = 8
numpy.random.seed(seed)
base = pandas.read_csv('base_nao_trabalhada.csv')
train, test = train_test_split(base, test_size = 0.2)
train=train.values
test=test.values
X_train = train[:,1:23]
Y_train = train[:,24]
X_test = test[:,1:23]
Y_test = test[:,24]
sm = SMOTE(kind='regular')
X_resampled, Y_resampled = sm.fit_sample(X_train, Y_train)
# Model Creation
model = Sequential()
model.add(Dense(40, input_dim=22, init='uniform', activation='relu'))
model.add(Dense(4, init='uniform', activation='relu'))
model.add(Dense(1, init='uniform', activation='sigmoid'))
#activation='relu'
opt = keras.optimizers.SGD(lr=0.000001)
# Compile model
model.compile(loss='binary_crossentropy', optimizer=opt , metrics=['accuracy'])
#loss=binary_crossentropy
#optimizer='adam'
# creating .fit
model.fit(X_resampled, Y_resampled, nb_epoch=10000, batch_size=30)
# evaluate the model
scores = model.evaluate(X_test, Y_test)
print ()
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
