Siamese LSTM for document similarity using keras giving input error - python

I have written this code to use the Siamese method to calculate the similarity of two documents. I want to embed my vectorize layer (embedding is performed using Google News Dataset) of two separate documents using vectorization approach and then feed it to LSTM and output of LSTM goes into Cosine function to measure the similarity of two documents.
#importing libraries
from __future__ import print_function
import gensim
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import csv
import re
import pandas as pd
from pandas import DataFrame
import pandas as pd
nltk.download('punkt')
from tensorflow import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, TimeDistributed
from tensorflow.keras import layers
#Loading pre=trained word2vec model
from gensim.models.keyedvectors import KeyedVectors
# You need to dowload google pre-trained model using below link
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
#Change the path according to your directory
model_path = 'D:\GoogleNews_vectors_negative300\GoogleNews_vectors_negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(model_path, binary=True)
#Setting Parameters for model
class DocSim(object):
def __init__(self, w2v_model , stopwords=[]):
self.w2v_model = w2v_model
self.stopwords = stopwords
def vectorize(self, doc):
"""Identify the vector values for each word in the given document"""
doc = doc.lower()
words = [w for w in doc.split(" ") if w not in self.stopwords]
word_vecs = []
for word in words:
try:
vec = self.w2v_model[word]
word_vecs.append(vec)
except KeyError:
# Ignore, if the word doesn't exist in the vocabulary
pass
# Assuming that document vector is the mean of all the word vectors
vector = np.mean(word_vecs, axis=0)
return vector
def Siamese_cosine_sim(self, vectorA, vectorB):
model = Sequential()
model.add(LSTM(20, return_sequences=True),input_shape=[vectorA,vectorB])
model.compile(loss='binary_crossentropy', optimizer='adam')
outputs = layers.Dense(1, activation="sigmoid")(left_doc,right_doc)
"""Find the cosine similarity distance between two vectors."""
csim = np.dot(left_doc, right_doc) / (np.linalg.norm(left_doc) * np.linalg.norm(right_doc))
if np.isnan(np.sum(csim)):
return 0
return csim
def calculate_similarity(self, withdigits_source_rules, withdigits_target_rules=[], threshold=0.8):
"""Calculates & returns similarity scores between given source rules & all
the target rules"""
if isinstance(withdigits_target_rules, str):
withdigits_target_rules = [withdigits_target_rules]
source_vec = self.vectorize(withdigits_source_rules)
results = []
for rule in withdigits_target_rules:
target_vec = self.vectorize(rule)
sim_score = self.Siamese_cosine_sim (source_vec, target_vec)
if sim_score > threshold:
results.append({
'Siamese Sim Score':sim_score,
'Target Rule':rule
})
# Sort results by score in desc order
results.sort(key=lambda k : k['Siamese Sim Score'] , reverse=True)
return results
ds = DocSim(w2v_model)
#Two documents data
withdigits_source_rules =set(["2.1 Separation of trains","2.3.1.2 Level crossing obstruction","2.2.1.1 Safety is compromised if a train proceeds without a movement autority","Principle: The method of signalling must maintain a space interval between trains that is safe.","2.1.1 Context"])
#Calculate the similarity score between a source rule & a target rule.
if isinstance(withdigits_source_rules, str):
withdigits_source_rules = [withdigits_source_rules]
# This will return one target rules text with a similarity score
for rule in withdigits_source_rules:
sim_scores= ds.calculate_similarity(rule, withdigits_target_rules)
# Printing the output in text file
print("Source rule: {} \n\nSimilarity with Target Rule is \n\n {}\n".format(rule, sim_scores) , file=open("output.txt", "a"))
print("\n")
# Printing output in Jupyter
print("Source rule: {} \n\nSimilarity with Target Rule is \n\n {}\n".format(rule, sim_scores) )
print("\n")
I am getting following error if someone can help me to solve this issue along with LSTM input function?
TypeError: add() got an unexpected keyword argument 'input_shape'

Refer to the documentation here for adding layers to a Sequential model. The add method only accepts one parameter - layer. If the passed argument is not a layer instance, it raises TypeError, which is precisely the error it threw. I guess, you wanted to pass the input_shape parameter to the LSTM layer (line after creating Sequential model). Just move it inside the LSTM layer and it should work fine.

Related

huggingface classification struggling with prediction

I am fine tuning longformer and then making prediction using TextClassificationPipeline and model(**inputs) methods. I am not sure why I get different results
import pandas as pd
import datasets
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig
import torch.nn as nn
import torch
from torch.utils.data import DataLoader#Dataset,
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
#import wandb
import os
from datasets import Dataset
from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification
tokenizer = LongformerTokenizerFast.from_pretrained('folder_path/', max_length = maximum_len)
Loading the fine tuned model from a saved location. Using original tokenizer
saved_location='c:/xyz'
model_saved=AutoModelForSequenceClassification.from_pretrained(saved_location)
pipe = TextClassificationPipeline(model=model_saved, tokenizer=tokenizer, device=0)#tokenizer_saved, padding=True, truncation=True)
prediction = pipe(["The text to predict"], return_all_scores=True)
prediction
[[{'label': 'LABEL_0', 'score': 0.7107483148574829},
{'label': 'LABEL_1', 'score': 0.2892516553401947}]]
2nd method
inputs = tokenizer("The text to predict", return_tensors="pt").to(device)
outputs = model_saved(**inputs)#, labels=labels)
print (outputs['logits'])
#tensor([[ 0.4552, -0.4438]], device='cuda:0', grad_fn=<AddmmBackward0>)
torch.sigmoid(outputs['logits'])
#tensor([[0.6119, 0.3908]], device='cuda:0', grad_fn=<SigmoidBackward0>)
AutoModelForSequenceClassification returns probabilities 0.71 and 0.29. When I look at the 2nd method. It returns logits 0.4552, -0.4438 which convert to probabilities 0.6119, 0.3908
#update 1
The first link TextClassificationPipeline from cronoik's answer says below
function_to_apply (str, optional, defaults to "default") — The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:
"default": if the model has a single label, will apply the sigmoid function on the output. If the model has several labels, will apply the softmax function on the output.
"sigmoid": Applies the sigmoid function on the output.
"softmax": Applies the softmax function on the output.
"none": Does not apply any function on the output.
as this is a binary classification problem (single label) shouldn't it apply sigmoid?
I assume that model.config.num_labels==2, if that is the case, the TextClassificationPipeline applies softmax and not sigmoid to calculate the probabilities (code).
import torch
logits = torch.tensor([ 0.4552, -0.4438])
print(torch.softmax(logits,0))
Output:
tensor([0.7107, 0.2893])

a problem and how to deal with batch while creating a Model

from keras_multi_head import MultiHeadAttention
import keras
from keras.layers import Dense,Input,Multiply
from keras import backend as K
from keras.layers.core import Dropout, Layer
from keras.models import Sequential,Model
import numpy as np
import tensorflow as tf
from self_attention_layer import Encoder
## multi source attention
class Multi_source_attention(keras.Model):
def __init__(self,read_n,embed_dim,num_heads,ff_dim,num_layers):
super().__init__()
self.read_n = read_n
self.embed_dim = embed_dim
self.num_heads = num_heads
self.ff_dim = ff_dim
self.num_layers = num_layers
self.get_weights = Dense(49, activation = 'relu',name = "get_weights")
def compute_output_shape(self,input_shape):
#([batch,7,7,256],[1,256])
return input_shape
def call(self,inputs):
## weights matrix
#(1,49)
weights_res = self.get_weights(inputs[1])
#(1,7,7)
weights = tf.reshape(weights_res,(1,7,7))
#(256,7,7)
weights = tf.tile(weights,[256,1,1])
## img from mobilenet
img=tf.reshape(inputs[0],[-1,7,7])
inter_res = tf.multiply(img,weights)
inter_res = tf.reshape(inter_res, (-1,256,49))
print(inter_res.shape)
att = Encoder(self.embed_dim,self.num_heads,self.ff_dim,self.num_layers)(inter_res)
return att
I try to construct a network to implement the part circled in the image. The output from LSTM **(1,256) and from the previous Mobilenet (batch,7,7,256). Then the output of LSTM is transformed to a weights matrix in form of (7,7).
But the problem is that the input shape of the output from mobilenet has a attribute batch. I have no idea how to deal with "batch" or how to set up a parameter to constraint the batch?
Could someone give me a tip?
And if I remove the function compute_output_shape(), one error unimplementerror occurs. the keras official doc tells me that I don't need to overwrite the function.
Could someone explain me about that?
Compute_output_shape is crucial to custom the layer. if the function summary() is called, the corresponding Graph is generated where the input and output shapes are showed in every layer. The compute_output_shape is responsible for the output shape.

How to get Keras model predicted text back into list of words?

I'm trying to built an Autoencoder neural network for finding outliers in a single column list of text. The text input is like the following:
about_header.png
amaze_header_2.png
amaze_header.png
circle_shape.xml
disableable_ic_edit_24dp.xml
fab_label_background.xml
fab_shadow_black.9.png
fab_shadow_dark.9.png
fab_shadow_light.9.png
fastscroller_handle_normal.xml
fastscroller_handle_pressed.xml
folder_fab.png
The problem is that I don't really know what I'm doing, I'm using Keras, and I've converted these lines of text into a matrix using the Keras Tokenizer, so they can be fed into Keras Model so I can fit and predict them.
The problem is that the predict function returns what I believe is a matrix, and I can't really know for sure what happened because I can't convert the matrix back into the list of text like I originally had.
My entire code is as follows:
import sys
from keras import Input, Model
import matplotlib.pyplot as plt
from keras.layers import Dense
from keras.preprocessing.text import Tokenizer
with open('drawables.txt', 'r') as arquivo:
dados = arquivo.read().splitlines()
tokenizer = Tokenizer(filters='', nb_words=None)
tokenizer.fit_on_texts(dados)
x_dados = tokenizer.texts_to_matrix(dados, mode="count")
tamanho = len(tokenizer.word_index) + 1
tamanho_comprimido = int(tamanho/1.25)
x = Input(shape=(tamanho,))
# Encoder
hidden_1 = Dense(tamanho_comprimido, activation='relu')(x)
h = Dense(tamanho_comprimido, activation='relu')(hidden_1)
# Decoder
hidden_2 = Dense(tamanho, activation='relu')(h)
r = Dense(tamanho, activation='sigmoid')(hidden_2)
autoencoder = Model(input=x, output=r)
autoencoder.compile(optimizer='adam', loss='mse')
history = autoencoder.fit(x_dados, x_dados, epochs=25, shuffle=False)
plt.plot(history.history["loss"])
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.show()
encoded = autoencoder.predict(x_dados)
result = ???????
You can decode the text using original encoding tokenizer.sequences_to_texts. This accepts a list of integer sequences. To get the sequences you can use np.argmax.
encoded_argmax = np.argmax(encoded, axis=1)
text = tokenizer.sequences_to_texts([encoded_argmax]) # since your output is just a number needs to convert into list

LSTM network on pre trained word embedding gensim

I am new to deep learning. I am trying to make very basic LSTM network on word embedding feature. I have written the following code for the model but I am unable to run it.
from keras.layers import Dense, LSTM, merge, Input,Concatenate
from keras.layers.recurrent import LSTM
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten
max_sequence_size = 14
classes_num = 2
LSTM_word_1 = LSTM(100, activation='relu',recurrent_dropout = 0.25, dropout = 0.25)
lstm_word_input_1 = Input(shape=(max_sequence_size, 300))
lstm_word_out_1 = LSTM_word_1(lstm_word_input_1)
merged_feature_vectors = Dense(50, activation='sigmoid')(Dropout(0.2)(lstm_word_out_1))
predictions = Dense(classes_num, activation='softmax')(merged_feature_vectors)
my_model = Model(input=[lstm_word_input_1], output=predictions)
print my_model.summary()
The error I am getting is ValueError: Error when checking input: expected input_1 to have 3 dimensions, but got array with shape (3019, 300). On searching, I found that people have used Flatten() which will compress all the 2-D features (3019,300) for the dense layer. But I am unable to fix the issue.
While explaining, kindly let me know how do the dimension work out.
Upon request:
My X_training had dimension issues, so I am providing the code below to clear out the confusion,
def makeFeatureVec(words, model, num_features):
# Function to average all of the word vectors in a given
# paragraph
#
# Pre-initialize an empty numpy array (for speed)
featureVec = np.zeros((num_features,),dtype="float32")
#
nwords = 0.
#
# Index2word is a list that contains the names of the words in
# the model's vocabulary. Convert it to a set, for speed
index2word_set = set(model.wv.index2word)
#
# Loop over each word in the review and, if it is in the model's
# vocaublary, add its feature vector to the total
for word in words:
if word in index2word_set:
nwords = nwords + 1.
featureVec = np.add(featureVec,model[word])
#
# Divide the result by the number of words to get the average
featureVec = np.divide(featureVec,nwords)
return featureVec
I think the following code is giving 2-D numpy array as I am initializing it that way
def getAvgFeatureVecs(reviews, model, num_features):
# Given a set of reviews (each one a list of words), calculate
# the average feature vector for each one and return a 2D numpy array
#
# Initialize a counter
counter = 0.
#
# Preallocate a 2D numpy array, for speed
reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
for review in reviews:
if counter%1000. == 0.:
print "Question %d of %d" % (counter, len(reviews))
reviewFeatureVecs[int(counter)] = makeFeatureVec(review, model, \
num_features)
counter = counter + 1.
return reviewFeatureVecs
def getCleanReviews(reviews):
clean_reviews = []
for review in reviews["question"]:
clean_reviews.append( KaggleWord2VecUtility.review_to_wordlist( review, remove_stopwords=True ))
return clean_reviews
My objective is just to use gensim pretrained model for LSTM on some comments that I have.
trainDataVecs = getAvgFeatureVecs( getCleanReviews(train), model, num_features )
You should try using Embedding layer before LSTM layer. Also, since you have pre-trained vectors of 300-dimensions for 3019 comments, you can initialize the weights for embedding layer with this matrix.
inp_layer = Input((maxlen,))
x = Embedding(max_features, embed_size, weights=[trainDataVecs])(x)
x = LSTM(50, dropout=0.1)(x)
Here, maxlen is the maximum length of your comments, max_features is the maximum number of unique words or vocabulary size of your dataset, and embed_size is dimensions of your vectors, which is 300 in your case.
Note that shape of trainDataVecs should be (max_features, embed_size), so if you have pre-trained word vectors loaded into trainDataVecs, this should work.

Using pretrained gensim Word2vec embedding in keras

I have trained word2vec in gensim. In Keras, I want to use it to make matrix of sentence using that word embedding. As storing the matrix of all the sentences is very space and memory inefficient. So, I want to make embedding layer in Keras to achieve this so that It can be used in further layers(LSTM). Can you tell me in detail how to do this?
PS: It is different from other questions because I am using gensim for word2vec training instead of keras.
Let's say you have following data that you need to encode
docs = ['Well done!',
'Good work',
'Great effort',
'nice work',
'Excellent!',
'Weak',
'Poor effort!',
'not good',
'poor work',
'Could have done better.']
You must then tokenize it using the Tokenizer from Keras like this and find the vocab_size
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
You can then enocde it to sequences like this
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)
You can then pad the sequences so that all the sequences are of a fixed length
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
Then use the word2vec model to make embedding matrix
# load embedding as a dict
def load_embedding(filename):
# load embedding into memory, skip first line
file = open(filename,'r')
lines = file.readlines()[1:]
file.close()
# create a map of words to vectors
embedding = dict()
for line in lines:
parts = line.split()
# key is string word, value is numpy array for vector
embedding[parts[0]] = asarray(parts[1:], dtype='float32')
return embedding
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
# total vocabulary size plus 0 for unknown words
vocab_size = len(vocab) + 1
# define weight matrix dimensions with all 0
weight_matrix = zeros((vocab_size, 100))
# step vocab, store vectors using the Tokenizer's integer mapping
for word, i in vocab.items():
weight_matrix[i] = embedding.get(word)
return weight_matrix
# load embedding from file
raw_embedding = load_embedding('embedding_word2vec.txt')
# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, t.word_index)
Once you have the embedding matrix you can use it in Embedding layer like this
e = Embedding(vocab_size, 100, weights=[embedding_vectors], input_length=4, trainable=False)
This layer can be used in making a model like this
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=4, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
All the codes are adapted from this awesome blog post. follow it to know more about Embeddings using Glove
For using word2vec see this post
With the new Gensim version this is pretty easy:
w2v_model.wv.get_keras_embedding(train_embeddings=False)
there you have your Keras embedding layer
My code for gensim-trained w2v model. Assume all words trained in the w2v model is now a list variable called all_words.
from keras.preprocessing.text import Tokenizer
import gensim
import pandas as pd
import numpy as np
from itertools import chain
w2v = gensim.models.Word2Vec.load("models/w2v.model")
vocab = w2v.wv.vocab
t = Tokenizer()
vocab_size = len(all_words) + 1
t.fit_on_texts(all_words)
def get_weight_matrix():
# define weight matrix dimensions with all 0
weight_matrix = np.zeros((vocab_size, w2v.vector_size))
# step vocab, store vectors using the Tokenizer's integer mapping
for i in range(len(all_words)):
weight_matrix[i + 1] = w2v[all_words[i]]
return weight_matrix
embedding_vectors = get_weight_matrix()
emb_layer = Embedding(vocab_size, output_dim=w2v.vector_size, weights=[embedding_vectors], input_length=FIXED_LENGTH, trainable=False)

Categories