I'm trying to make a predictor model for identifying therapeutic peptides based on their word2vec vectors. The dataset has 100 positive and 100 negative examples. I've already embedded the peptide sequences with Word2Vec and am trying to train my neural network. However, the accuracy remains constant at 51.88%.
What I have tried: changing the loss function (to binary cross-entropy) and changing the number of nodes in each layer.
Here is my code:
import sklearn
# Shuffle features and labels separately but with the same seed.
# NOTE(review): this presumably keeps rows aligned only if both inputs have
# the same number of samples - confirm.
a = sklearn.utils.shuffle(arrayvectors, random_state=1)
b = sklearn.utils.shuffle(labels, random_state=1)

# Wrap both in DataFrames and take every row of each.
dfa = pd.DataFrame(a)
dfb = pd.DataFrame(b)
X = dfa.iloc[:]
y = dfb.iloc[:]

# 80/20 train/test split, then convert each part to a NumPy array;
# the label arrays are additionally cast to float32.
split = train_test_split(X, y, test_size=0.2, random_state=300)
X_train, X_test = (np.asarray(part) for part in split[:2])
y_train, y_test = (np.asarray(part).astype(np.float32) for part in split[2:])
## train data
class trainData(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Args:
        X_data: Indexable, sized collection of feature rows.
        y_data: Indexable, sized collection of labels, one per feature row.

    Raises:
        ValueError: If ``X_data`` and ``y_data`` differ in length.
    """

    def __init__(self, X_data, y_data):
        # Fail early: __len__ only looks at X_data, so a mismatch would
        # otherwise surface later as an IndexError (or silently ignore
        # surplus labels) while iterating.
        if len(X_data) != len(y_data):
            raise ValueError(
                "X_data and y_data must have the same length, "
                f"got {len(X_data)} and {len(y_data)}"
            )
        self.X_data = X_data
        self.y_data = y_data

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.X_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self.X_data)
# Build the training dataset from the feature and label arrays, each
# converted with torch.FloatTensor first.
train_data = trainData(torch.FloatTensor(X_train),
                       torch.FloatTensor(y_train))
## test data
class testData(Dataset):
    """Map-style dataset over unlabeled feature rows.

    Args:
        X_data: Indexable, sized collection of feature rows.
    """

    def __init__(self, X_data):
        self.X_data = X_data

    def __len__(self):
        """Return the number of stored rows."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the feature row stored at ``index``."""
        return self.X_data[index]
# Test dataset: features only, no labels.
test_data = testData(torch.FloatTensor(X_test))
# Training hyper-parameters.
# NOTE(review): EPOCHS and LEARNING_RATE are not referenced anywhere in the
# visible code, and the Keras model.fit call further down passes its own
# epochs/batch_size instead of these constants - confirm which are intended.
EPOCHS = 100
BATCH_SIZE = 2
LEARNING_RATE = 0.0001
# Training batches are reshuffled on each pass; test samples are served one at a time.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1)
# Build the model: three hidden Dense+Dropout stages and one sigmoid output unit.
model = Sequential()
# The input width is read from the training matrix instead of being hard-coded,
# so the network always matches the embedding size actually in use.
# NOTE(review): assumes X_train is a 2-D (samples, features) array - confirm.
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))
# Only the first layer needs an input size; later layers infer theirs from the
# previous layer, so the stray input_dim=1 argument has been dropped.
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(12, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=1000, batch_size=64)
Let me know if you have any thoughts!
Try increasing the batch size from 2 to 16, and also reduce the dropout rate to 0.2 or less. A rate of 0.5 is too much dropout.
Related
I am new to ML and I am creating a CNN model for sentiment analysis using word2vec. My word2vec vectors also contain negative values. While fitting the model I got this error:
InvalidArgumentError in model.fit(X_train, Y_train, epochs=3, batch_size=64)
InvalidArgumentError: Graph execution error: Detected at node 'sequential_30/embedding_29/embedding_lookup'
This is the code to create the model
def get_vec(x):
    """Run ``x`` through ``nlp`` and return the ``vector`` of the resulting document."""
    return nlp(x).vector
# One fixed-length document vector per row of text.
df['vec'] = df['text'].apply(get_vec)

# Stack the per-row vectors into one (n_samples, vector_length) float matrix.
# This yields the same matrix as the earlier reshape/concatenate chain without
# hard-coding the vector length.
XTrain = np.stack(df['vec'].to_numpy()).astype('float32')
YTrain = df['target']
X_train, X_test, Y_train, Y_test = train_test_split(
    XTrain, YTrain, test_size=.3, random_state=45, stratify=YTrain)

# The inputs are already dense real-valued vectors, not sequences of integer
# token ids. They therefore must NOT go through pad_sequences (which casts to
# int32 by default and so destroys the values) or an Embedding layer (which
# looks up rows by non-negative integer index - negative components are what
# triggered the embedding_lookup InvalidArgumentError). The unused
# max_review_length / top_words settings were removed together with them.
embedding_vecor_length = X_train.shape[1]

# Conv1D expects (steps, channels) per sample: treat each vector as a
# single-channel sequence. The same transform is applied to the test split.
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]

model = Sequential()
# Convolutional model (3x conv, flatten, 2x dense)
model.add(Convolution1D(64, 3, padding='same',
                        input_shape=(embedding_vecor_length, 1)))
model.add(Convolution1D(32, 3, padding='same'))
model.add(Convolution1D(16, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(180, activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=3, batch_size=64)
When I replace all negative values in df['vec'], the code runs without error but with 0 accuracy. What is wrong here? Please help. Thanks in advance.
I am not a data scientist and not very experienced in machine learning. I am trying to improve the results of this model for predicting the trend of a stock's movement (-1: down, 0: no change, +1: up). Here is the Python code and the plots for the model:
# Split without shuffling, so the original row order is preserved.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False)

# Normalize: fit the scaler on the training rows only, then apply it to both splits.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model: two Dropout -> CuDNNLSTM stages (sequences kept), averaged over the
# step axis, followed by a softmax classifier with one unit per column of y.
input_layer = Input(shape=(X_train.shape[1], 1))
x = input_layer
for _ in range(2):  # two stacked stages
    x = Dropout(0.5)(x)
    x = CuDNNLSTM(X_train.shape[1], return_sequences=True)(x)
x = GlobalAveragePooling1D()(x)
output = Dense(y.shape[1], activation='softmax')(x)
model = Model(inputs=input_layer, outputs=output)

opt = Adam(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])

# Stop once val_loss has not improved for 200 epochs and restore the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=200,
    restore_best_weights=True,
)

# Fit; the test split doubles as the validation data monitored by early stopping.
r = model.fit(
    X_train,
    y_train,
    epochs=200000,
    batch_size=16400,
    validation_data=(X_test, y_test),
    callbacks=[early_stop],
    shuffle=False,
)

# Plot the recorded training history.
pd.DataFrame(r.history).plot()
model learning plot
I am trying to modify the following example to simulate CNN for my set of data and running into some errors
https://machinelearningmastery.com/cnn-models-for-human-activity-recognition-time-series-classification/
# Names of the six measurement columns and of the four activity classes.
X_names = ['X_static', 'Y_static', 'Z_static', 'X_dynamic', 'Y_dynamic', 'Z_dynamic']
X_Label_Names = np.array(['Resting', 'Swimming', 'Feeding', 'Non directed motion'])

# Replace each activity name in D with its integer code (0-3).
X = D.replace(X_Label_Names.tolist(), [0, 1, 2, 3])

# Pull the label column and the measurement columns out as NumPy arrays.
X_Label = X['Label'].to_numpy()
X_Data = X[X_names].to_numpy()
X_Data is a 5600-by-6 NumPy matrix. Each column represents one type of measurement recorded over time.
X_Label is a 5600-by-1 column of values 0 through 3 that encode the class label of each row: 0 represents resting, 1 represents swimming, and so on.
# Rebind X to the measurement array (it previously held the label-replaced
# DataFrame) and use the integer label array as y.
X = X_Data
y = X_Label
def load_dataset_f(X, y):
    """Split ``X``/``y`` into stratified halves and print training diagnostics.

    Prints the training features, then the shapes of the training features
    and the training labels.

    NOTE(review): ``random_state`` is read from an enclosing scope that is not
    visible here - confirm it is defined before this function is called.

    Returns:
        Tuple ``(trainX, trainy, testX, testy)``.
    """
    trainX, testX, trainy, testy = train_test_split(
        X, y, test_size=0.5, stratify=y, random_state=random_state
    )
    for item in (trainX, trainX.shape, trainy.shape):
        print(item)
    return trainX, trainy, testX, testy
# fit and evaluate a model
def evaluate_model_f(trainX, trainy, testX, testy):
    """Train a small 1-D CNN on the training split and return its test accuracy.

    Args:
        trainX: 2-D array of training samples, one row per sample.
        trainy: Integer class labels for ``trainX``.
        testX: 2-D array of test samples with the same column count as ``trainX``.
        testy: Integer class labels for ``testX``.

    Returns:
        The accuracy value reported by ``model.evaluate`` on the test split.
    """
    verbose, epochs, batch_size = 2, 10, 20

    # One-hot encode the labels. The softmax layer must have one unit per
    # class, so its width is taken from the encoded training labels rather
    # than hard-coded (a single softmax unit cannot match one-hot targets).
    trainy_cat = to_categorical(trainy)
    n_outputs = trainy_cat.shape[1]
    testy_cat = to_categorical(testy, num_classes=n_outputs)

    # Conv1D needs (samples, timesteps, features): every column becomes one
    # timestep carrying a single feature. Train and test get the same reshape.
    n_timesteps, n_features = trainX.shape[1], 1
    trainX = trainX.reshape(len(trainX), n_timesteps, n_features)
    testX = testX.reshape(len(testX), n_timesteps, n_features)

    print('n timesteps --------------------------------------------------------------------')
    print(n_timesteps)
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu',
                     input_shape=(n_timesteps, n_features)))
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
    model.add(Dropout(0.5))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(n_outputs, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # fit network, using the training settings declared above (previously unused)
    print(trainy_cat)
    model.fit(trainX, trainy_cat, epochs=epochs, batch_size=batch_size, verbose=verbose)
    # evaluate model on test data prepared exactly like the training data
    _, accuracy = model.evaluate(testX, testy_cat, batch_size=batch_size, verbose=0)
    return accuracy
def run_experiment_f(repeats=1):
    """Load the data once, evaluate the model ``repeats`` times, and summarize.

    Each run's accuracy is converted to a percentage, printed with its
    1-based run number, and collected; the collected scores are then passed
    to ``summarize_results``.
    """
    trainX, trainy, testX, testy = load_dataset_f(X, y)
    print(trainX)

    scores = []
    for run_no in range(1, repeats + 1):
        score = evaluate_model_f(trainX, trainy, testX, testy) * 100.0
        print('>#%d: %.3f' % (run_no, score))
        scores.append(score)

    summarize_results(scores)
# NOTE(review): the return value of this first call is discarded, and
# run_experiment_f() calls load_dataset_f(X, y) again itself, so this call
# only produces the diagnostic prints.
load_dataset_f(X,y)
run_experiment_f()
I am unfamiliar with the TensorFlow library and am getting errors at model.fit(); I am not sure how to approach this. The matrix presented in the example was 3D, whereas my data is 2D, and I am not sure whether that matters. How do I get this code to work?
You need to make sure that your input to your Conv1D layer has the shape (timesteps, features) and that your last output layer's units equals the number of unique labels in your dataset. Here is a working example:
import tensorflow as tf
# Synthetic stand-in data: 32 samples with 6 values each, labels drawn from [0, 4).
trainX = tf.random.normal((32, 6))
trainy = tf.random.uniform((32, 1), maxval=4)

verbose, epochs, batch_size = 2, 10, 20
n_timesteps, n_features, n_outputs = 6, 1, 4
print('n timesteps --------------------------------------------------------------------')
print(n_timesteps)

# Same layer stack as before, passed to Sequential as a list.
layers = tf.keras.layers
model = tf.keras.Sequential([
    layers.Conv1D(filters=64, kernel_size=3, activation='relu',
                  input_shape=(n_timesteps, n_features)),
    layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    layers.Dropout(0.5),
    layers.MaxPooling1D(pool_size=2),
    layers.Flatten(),
    layers.Dense(100, activation='relu'),
    layers.Dense(n_outputs, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit network: one-hot labels, and a trailing feature axis added to the inputs
onehot_y = tf.keras.utils.to_categorical(trainy)
print(onehot_y)
trainX = tf.expand_dims(trainX, axis=2)
model.fit(trainX, onehot_y)
I'm training a model in Python with TensorFlow, but I want to print a confusion matrix of the model predicted data (predicted vs actual). How can I do this? My code so far (excluding data load and preprocessing) is below:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the labels and shape them as a single column.
le = LabelEncoder()
Y = le.fit_transform(data.label).reshape(-1, 1)
len(X)

# Integer-encode every line character by character via char_index.
sequences = [[char_index[char] for char in line] for line in X]

# Pre-pad all sequences to a common length.
padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    sequences, padding="pre")
print(padded_inputs)
ex = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding="pre")
print(ex)

# And here is the train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    padded_inputs, Y, test_size=0.1, random_state=0)
# The bare expressions below are kept as-is; presumably notebook cell echoes.
X_train
y_train = y_train.astype("int32")
y_train
X_train.shape
X_test
print(X_train.shape)

max_len = 19
max_words = 57
from tensorflow.keras.optimizers import RMSprop
def RNN():
    """Build (without compiling) the Embedding -> LSTM -> Dense classifier.

    The input has ``max_len`` positions; the output is a single sigmoid unit.

    Returns:
        The assembled ``tf.keras.Model``.
    """
    inputs = tf.keras.Input(name='inputs', shape=[max_len])
    stack = [
        Embedding(max_words, 19, input_length=max_len),
        LSTM(64),
        Dense(256, name='FC1'),
        Activation('relu'),
        Dropout(0.5),
        Dense(1, name='out_layer'),
        Activation('sigmoid'),
    ]
    # Thread the tensor through the layers in order.
    tensor = inputs
    for stage in stack:
        tensor = stage(tensor)
    return tf.keras.Model(inputs=inputs, outputs=tensor)
# Build the network, print its layer summary, and compile it for binary classification.
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])
num_epochs = 40
# TensorBoard logs go to a directory named after the current timestamp.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
# Train with 20% of the training data held out for validation.
history=model.fit(X_train, y_train,batch_size=128, epochs= num_epochs, validation_split=0.2,
callbacks=[tensorboard_callback], verbose=2)
# Evaluate on the test split; accr[0] is printed as the loss, accr[1] as the accuracy.
accr = model.evaluate(X_test,y_test)
print('Test set\nLoss: {:0.3f}\nAccuracy: {:0.3f}'.format(accr[0],accr[1]))
# Raw model outputs for the test inputs (sigmoid activations, not yet thresholded to classes).
yhat = model.predict(X_test)
I've tried to use tf.math.confusion_matrix(labels, predictions, num_classes=None, weights=None, dtype=tf.dtypes.int32, name=None), but I don't know how to use it properly in this code.
Why is the testing accuracy higher than my training accuracy? This is not the case for the validation accuracy. Is it because of the way I am splitting my dataset?
Modifying the network did not work so I am guessing I am doing something wrong in the dataset preparation part.
The dataset is composed of packet captures of malware and normal activity. The dataset.txt file contains a total of 777 rows and 28 columns.
# Load the comma-separated feature and label files as NumPy arrays.
x = np.genfromtxt("dataset.txt", delimiter=",")
y = np.genfromtxt("label.txt", delimiter=",")
# Handle missing values: every NaN in the feature matrix is replaced by 0.
x[np.isnan(x)] = 0
# Shuffle features and labels with one shared random permutation of row indices.
# NOTE(review): no seed is set in the visible code, so this permutation
# presumably differs between runs even though the split below uses
# random_state=0 - confirm whether a seed is set elsewhere.
indices = np.arange(x.shape[0])
np.random.shuffle(indices)
x = x[indices]
y = y[indices]
# Divide the dataset into train (80%) and test (20%) parts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
#building the model
def build_model():
    """Create and compile the binary classifier.

    Three 32-unit ReLU layers over a 28-feature input, a 0.2 dropout layer,
    and a single sigmoid output unit; compiled with RMSprop, binary
    cross-entropy loss and an accuracy metric.

    Returns:
        The compiled model.
    """
    model = models.Sequential()
    model.add(layers.Dense(32, activation='relu', input_shape=(28,)))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(32, activation='relu'))
    # Use the same `layers` namespace as every other layer here, so the
    # function does not depend on a separately imported bare `Dropout` name.
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model
'''cross validation
k = 5
num_val_samples = len(x_train) // k
all_scores = []
for i in range(k):
print('fold #', i)
x_val = x_train[i * num_val_samples: (i + 1) * num_val_samples]
y_val = y_train[i * num_val_samples: (i + 1) * num_val_samples]
partial_x_train = np.concatenate([x_train[:i * num_val_samples],
x_train[(i + 1) * num_val_samples:]], axis=0)
partial_y_train = np.concatenate([y_train[:i * num_val_samples],
y_train[(i + 1) * num_val_samples:]], axis=0)
model = build_model()
model.fit(partial_x_train, partial_y_train,epochs=20, batch_size=16,
verbose=0)
val_loss, val_acc = model.evaluate(x_val, y_val, verbose=0)
all_scores.append(val_acc)
print(all_scores)
val_acc = np.mean(all_scores)
print(val_loss , val_acc)
'''
# Train a freshly built model on the entire training split.
model = build_model()
model.fit(x_train, y_train, epochs=20, batch_size=16)
# Confusion matrix: threshold the model outputs at 0.5 to get boolean
# class predictions, then compare them with the true test labels.
y_pred = model.predict(x_test)
y_pred = (y_pred > 0.5)
result = confusion_matrix(y_test, y_pred)
print ('Confusion Matrix:')
print(result)
# Test accuracy, computed from the same thresholded predictions.
model_acc = accuracy_score(y_test, y_pred)
print('Test Accuracy:')
print(model_acc)
This is because Keras reports a running average of the accuracy over each epoch. With a small number of epochs, this means that by the end of an epoch your model is better than it was on average during that epoch.
This could also be due to randomly having 'easier' samples in the test set, but this would not happen each run if you split it randomly in the same portion of the code.