I am fine-tuning Longformer and then making predictions in two ways: with TextClassificationPipeline and with a direct model(**inputs) call. I am not sure why I get different results.
import pandas as pd
import datasets
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig
import torch.nn as nn
import torch
from torch.utils.data import DataLoader#Dataset,
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
#import wandb
import os
from datasets import Dataset
from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification
# Load the fast Longformer tokenizer from a local folder.
# NOTE(review): `maximum_len` is not defined anywhere in this snippet; it must be set before this line runs.
tokenizer = LongformerTokenizerFast.from_pretrained('folder_path/', max_length = maximum_len)
Loading the fine-tuned model from a saved location, using the original tokenizer:
# First method: wrap the fine-tuned checkpoint in a text-classification pipeline.
saved_location = 'c:/xyz'
model_saved = AutoModelForSequenceClassification.from_pretrained(saved_location)

# The original (not the saved) tokenizer is passed in; device=0 is the GPU index.
pipe = TextClassificationPipeline(
    model=model_saved,
    tokenizer=tokenizer,
    device=0,
)

# Ask for the score of every label, not only the best one.
texts = ["The text to predict"]
prediction = pipe(texts, return_all_scores=True)
prediction
[[{'label': 'LABEL_0', 'score': 0.7107483148574829},
{'label': 'LABEL_1', 'score': 0.2892516553401947}]]
2nd method
# Second method: tokenize by hand and call the model directly.
# NOTE(review): `device` is not defined in this snippet; it has to match the device the model lives on.
inputs = tokenizer("The text to predict", return_tensors="pt").to(device)
outputs = model_saved(**inputs)

# Raw, un-normalized scores, one per label.
logits = outputs['logits']
print (logits)
#tensor([[ 0.4552, -0.4438]], device='cuda:0', grad_fn=<AddmmBackward0>)

# Element-wise sigmoid: each logit is squashed independently, so the values need not sum to 1.
torch.sigmoid(logits)
#tensor([[0.6119, 0.3908]], device='cuda:0', grad_fn=<SigmoidBackward0>)
The first method (TextClassificationPipeline wrapping the AutoModelForSequenceClassification model) returns probabilities 0.71 and 0.29. The second method returns logits 0.4552 and -0.4438, which convert to probabilities 0.6119 and 0.3908 when I apply sigmoid.
#update 1
The first link TextClassificationPipeline from cronoik's answer says below
function_to_apply (str, optional, defaults to "default") — The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:
"default": if the model has a single label, will apply the sigmoid function on the output. If the model has several labels, will apply the softmax function on the output.
"sigmoid": Applies the sigmoid function on the output.
"softmax": Applies the softmax function on the output.
"none": Does not apply any function on the output.
As this is a binary classification problem (single label), shouldn't it apply sigmoid?
I assume that model.config.num_labels == 2. If that is the case, the TextClassificationPipeline applies softmax, not sigmoid, to calculate the probabilities (code):
import torch
# Logits reported by the direct model call for the two labels.
logits = torch.tensor([ 0.4552, -0.4438])

# Softmax over the label axis normalizes the two scores so they sum to 1.
probs = torch.softmax(logits, dim=0)
print(probs)
Output:
tensor([0.7107, 0.2893])
Related
I have written this code to use the Siamese method to calculate the similarity of two documents. I want to embed my vectorize layer (embedding is performed using Google News Dataset) of two separate documents using vectorization approach and then feed it to LSTM and output of LSTM goes into Cosine function to measure the similarity of two documents.
#importing libraries
from __future__ import print_function
import gensim
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import csv
import re
import pandas as pd
from pandas import DataFrame
import pandas as pd
nltk.download('punkt')
from tensorflow import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, TimeDistributed
from tensorflow.keras import layers
#Loading pre=trained word2vec model
from gensim.models.keyedvectors import KeyedVectors
# You need to dowload google pre-trained model using below link
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
#Change the path according to your directory
# Raw string: the backslashes of the Windows path must not be parsed as escape
# sequences ("\G" is an invalid escape and triggers a warning on recent Pythons).
model_path = r'D:\GoogleNews_vectors_negative300\GoogleNews_vectors_negative300.bin'
# Load the pre-trained word2vec vectors stored in the binary word2vec format.
w2v_model = KeyedVectors.load_word2vec_format(model_path, binary=True)
#Setting Parameters for model
class DocSim(object):
    """Score the similarity of documents from averaged word embeddings.

    A document is represented by the mean of the embedding vectors of its
    words, and two documents are compared with the cosine similarity of
    those mean vectors.
    """

    def __init__(self, w2v_model, stopwords=None):
        """Store the embedding lookup and the optional stop-word collection.

        Args:
            w2v_model: Mapping-like object; ``w2v_model[word]`` returns the
                word vector and raises ``KeyError`` for unknown words.
            stopwords: Words to ignore when vectorizing. Defaults to none.
        """
        self.w2v_model = w2v_model
        # A fresh list per instance: a mutable default would be shared by all instances.
        self.stopwords = [] if stopwords is None else stopwords

    def vectorize(self, doc):
        """Return the mean word vector of ``doc``.

        The document is lower-cased and split on single spaces. Stop words
        and words missing from the vocabulary are skipped. When no word is
        left, NaN is returned (the value ``np.mean`` yields for an empty
        list, but without the RuntimeWarning).
        """
        doc = doc.lower()
        words = [w for w in doc.split(" ") if w not in self.stopwords]
        word_vecs = []
        for word in words:
            try:
                word_vecs.append(self.w2v_model[word])
            except KeyError:
                # Ignore, if the word doesn't exist in the vocabulary
                continue
        if not word_vecs:
            return np.nan
        # Assuming that document vector is the mean of all the word vectors
        return np.mean(word_vecs, axis=0)

    def Siamese_cosine_sim(self, vectorA, vectorB):
        """Find the cosine similarity between two document vectors.

        The earlier draft built a Keras ``Sequential``/``LSTM`` model here. It
        passed ``input_shape`` to ``Sequential.add()`` (a ``TypeError``) and
        used the undefined names ``left_doc``/``right_doc``. That model was
        never trained and never contributed to the score, so the similarity
        is computed directly from the two vectors.
        NOTE(review): a real Siamese LSTM needs per-word sequences as input
        and a training step; it cannot be fed the averaged vectors built here.

        Returns:
            The cosine similarity, or 0 when either vector contains NaN or
            has zero length.
        """
        left_doc = np.asarray(vectorA, dtype=float)
        right_doc = np.asarray(vectorB, dtype=float)
        # A NaN input means the document had no known words: no similarity.
        if np.isnan(left_doc).any() or np.isnan(right_doc).any():
            return 0
        denominator = np.linalg.norm(left_doc) * np.linalg.norm(right_doc)
        # Guard the division: a zero vector has no direction to compare.
        if denominator == 0:
            return 0
        csim = np.dot(left_doc, right_doc) / denominator
        if np.isnan(np.sum(csim)):
            return 0
        return csim

    def calculate_similarity(self, withdigits_source_rules, withdigits_target_rules=None, threshold=0.8):
        """Calculates & returns similarity scores between given source rules & all
        the target rules.

        Args:
            withdigits_source_rules: Source document text.
            withdigits_target_rules: One target text or an iterable of them.
                Defaults to no targets.
            threshold: Only scores strictly above this value are reported.

        Returns:
            A list of ``{'Siamese Sim Score', 'Target Rule'}`` dicts sorted by
            score in descending order.
        """
        if withdigits_target_rules is None:
            withdigits_target_rules = []
        elif isinstance(withdigits_target_rules, str):
            withdigits_target_rules = [withdigits_target_rules]
        source_vec = self.vectorize(withdigits_source_rules)
        results = []
        for rule in withdigits_target_rules:
            target_vec = self.vectorize(rule)
            sim_score = self.Siamese_cosine_sim(source_vec, target_vec)
            if sim_score > threshold:
                results.append({
                    'Siamese Sim Score': sim_score,
                    'Target Rule': rule,
                })
        # Sort results by score in desc order
        results.sort(key=lambda k: k['Siamese Sim Score'], reverse=True)
        return results
ds = DocSim(w2v_model)
#Two documents data
withdigits_source_rules = {"2.1 Separation of trains","2.3.1.2 Level crossing obstruction","2.2.1.1 Safety is compromised if a train proceeds without a movement autority","Principle: The method of signalling must maintain a space interval between trains that is safe.","2.1.1 Context"}
# NOTE(review): withdigits_target_rules is not defined anywhere in this snippet;
# it must be assigned (a string or an iterable of strings) before the loop below runs.
#Calculate the similarity score between a source rule & a target rule.
if isinstance(withdigits_source_rules, str):
    withdigits_source_rules = [withdigits_source_rules]
# Open the report file once and let the context manager close it, instead of
# opening a new, never-closed handle on every iteration.
with open("output.txt", "a") as output_file:
    # This will return one target rules text with a similarity score
    for rule in withdigits_source_rules:
        sim_scores = ds.calculate_similarity(rule, withdigits_target_rules)
        report = "Source rule: {} \n\nSimilarity with Target Rule is \n\n {}\n".format(rule, sim_scores)
        # Printing the output in text file
        print(report, file=output_file)
        print("\n")
        # Printing output in Jupyter
        print(report)
        print("\n")
I am getting the following error. Can someone help me solve this issue, along with the LSTM input function?
TypeError: add() got an unexpected keyword argument 'input_shape'
Refer to the documentation here for adding layers to a Sequential model. The add method accepts only one parameter, layer, so passing input_shape to add() as a keyword argument raises the TypeError you are seeing. I guess you wanted to pass the input_shape parameter to the LSTM layer (on the line after creating the Sequential model). Just move it inside the LSTM(...) call and this error will go away.
I have trained a neural net on the MNIST dataset from Kaggle. I am having trouble getting the neural net to predict the digit it is given.
I don't know what to try to fix this issue.
'''python
import pandas as pd
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
# Kaggle digit-recognizer CSVs: train.csv has the label in column 0 followed by
# the pixel columns; test.csv has pixel columns only.
mnist = pd.read_csv(r"C:\Users\Chandrasang\python projects\digit-recognizer\train.csv").values
xtest = pd.read_csv(r"C:\Users\Chandrasang\python projects\digit-recognizer\test.csv").values
ytrain = mnist[:, 0]
xtrain = mnist[:, 1:]

# Scale every sample (row) to unit norm.
x_train = keras.utils.normalize(xtrain, axis=1)
x_test = keras.utils.normalize(xtest, axis=1)

# Reshape each flat pixel row into a 28x28 image in one vectorized step; this
# replaces the manual while-loops and their hard-coded row counts (42000 / 28000).
train = x_train.reshape(-1, 28, 28).astype(np.float32)
test = x_test.reshape(-1, 28, 28).astype(np.float32)

# Flatten -> two hidden ReLU layers -> 10-way softmax (one output per digit).
model = keras.models.Sequential()
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(256, activation=keras.activations.relu))
model.add(keras.layers.Dense(256, activation=keras.activations.relu))
model.add(keras.layers.Dense(10, activation=keras.activations.softmax))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(train, ytrain, epochs=10)

# Predict on inputs shaped exactly like the training data.
ans = model.predict(test)
# Each row of `ans` holds the 10 class probabilities produced by the softmax layer.
print(ans[3])
# The predicted digit is the index of the largest probability.
print(np.argmax(ans[3]))
'''
I expect the output to be a whole number; instead, it gives me the following array:
[2.7538205e-02 1.0337318e-11 2.9973364e-03 5.7095995e-06 1.6916725e-07
6.9060135e-08 1.3406207e-09 1.1861910e-06 1.4758119e-06 9.6945578e-01]
Your output is normal: it is a vector of probabilities. You have 10 classes (digits from 0 to 9) and your network computes the probability of your image belonging to each class. Looking at your results, your network classified your input as a 9, with a probability of roughly 0.97.
If you want to see just the predicted class, as Chris A. said, use predict_classes, or equivalently take the index of the largest probability with np.argmax(ans, axis=1).
I am building a TensorFlow implementation of an autoencoder for time series. I have 2000 time series, each of which consists of 501 time components. These time series are stored in a '.mat' file, which I read as input using scipy.
I then build the autoencoder and train it using batches of the 2000 time series. Finally, I would like to visualize the prediction of the trained autoencoder on the 2000 time series given as input, and compare with the original series, so that I can see if the autoencoder is doing a good job in compressing the data.
I use a double-layer autoencoder, with 250 and 100 nodes in the first and second hidden layer, respectively.
My problem is that when I compare the predicted time series with the original ones, the predicted ones have only positive values, while the original time series have both negative and positive values.
Here is the code I have been using:
import scipy.io
mat = scipy.io.loadmat('input_time_series.mat')
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
# NOTE(review): scipy.io.loadmat returns a dict keyed by MATLAB variable name, so
# `mat` itself is not an array. The (2000, 501) array presumably has to be selected
# with mat['<variable name>'] - TODO confirm the key stored in the .mat file.
input = mat
output = input
tf.reset_default_graph()

num_inputs = 501  # number of components in the original time series
num_hid1 = 250
num_hid2 = 100
num_hid3 = num_hid1
num_output = num_inputs
lr = 0.01
actf = tf.nn.relu

X = tf.placeholder(tf.float32, shape=[None, num_inputs])
initializer = tf.variance_scaling_initializer()

# Weights of the encoder (w1, w2) and decoder (w3, w4).
w1 = tf.Variable(initializer([num_inputs, num_hid1]), dtype=tf.float32)
w2 = tf.Variable(initializer([num_hid1, num_hid2]), dtype=tf.float32)
w3 = tf.Variable(initializer([num_hid2, num_hid3]), dtype=tf.float32)
w4 = tf.Variable(initializer([num_hid3, num_output]), dtype=tf.float32)
b1 = tf.Variable(tf.zeros(num_hid1))
b2 = tf.Variable(tf.zeros(num_hid2))
b3 = tf.Variable(tf.zeros(num_hid3))
b4 = tf.Variable(tf.zeros(num_output))

hid_layer1 = actf(tf.matmul(X, w1) + b1)
hid_layer2 = actf(tf.matmul(hid_layer1, w2) + b2)
hid_layer3 = actf(tf.matmul(hid_layer2, w3) + b3)
# Linear output layer: ReLU clamps every value to >= 0, so a ReLU output could
# never reproduce the negative values present in the original series.
output_layer = tf.matmul(hid_layer3, w4) + b4

# Mean squared reconstruction error.
loss = tf.reduce_mean(tf.square(output_layer - X))
optimizer = tf.train.AdamOptimizer(lr)
train = optimizer.minimize(loss)
init = tf.global_variables_initializer()

num_epoch = 5000
batch_size = 150
# Ceiling division so the final, shorter batch is not dropped.
num_batches = -(-2000 // batch_size)

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(num_epoch):
        for iteration in range(num_batches):
            # Feed one mini-batch per step; the earlier version fed the whole
            # data set on every iteration, so batch_size had no effect.
            start = iteration * batch_size
            X_batch = input[start:start + batch_size]
            sess.run(train, feed_dict={X: X_batch})
        # Report the loss on the full data set, as before.
        train_loss = loss.eval(feed_dict={X: input})
        print("epoch {} loss {}".format(epoch, train_loss))
    # Reconstruct all series while the session (and its trained variables) is still open.
    results = output_layer.eval(feed_dict={X: input})
I also include an example of comparison between one input time series (in blue) and the relevant one predicted by the autoencoder (in orange)
I'm trying to train a neural network and I would like to know how I can retrieve the label values calculated by the neural network when I call the evaluate function.
I have searched the Keras documentation for a parameter that does this, but I found nothing.
import tensorflow as tf
from tensorflow import keras
import numpy as np
# Create the array of data
train_data = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
train_data_np = np.asarray(train_data)
train_label = [[1, 2, 3], [4, 5, 6]]
# Build the label array from the labels; the earlier version converted train_data again.
train_label_np = np.asarray(train_label)

### Build the model
model = keras.Sequential([
    # Each sample is a vector of 3 features, so the per-sample input shape is (3,).
    keras.layers.Dense(3, input_shape=(3,)),
    keras.layers.Dense(3, activation=tf.nn.sigmoid)
])
# NOTE(review): 'sparse_categorical_crossentropy' expects one integer class index
# per sample, while train_label holds 3 values per sample - confirm the intended
# loss / label format before training.
model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

#Train the model
model.fit(train_data_np, train_label_np, epochs=10)

#test the model
# NOTE(review): test_data_np and test_label_np are not defined in this snippet.
restest = model.evaluate(test_data_np, test_label_np)
Before evaluating your model, you should predict the labels of your test set:
#predicting: the model's outputs for every test sample (this is how the computed labels are retrieved)
predict_labels = model.predict(test_data_np)
#evaluate: Model.evaluate takes the inputs and the true labels (x, y), not the
#labels and the predictions, and returns the loss and metric values
restest = model.evaluate(test_data_np, test_label_np)
I am trying to solve a classification problem using a sequential keras model.
In Keras, model.fit requires two numpy arrays to train on - data, labels.
This works correctly if each row of the data has one corresponding label.
However, for my use, I have more than one classification possible for a given data point.
Can this be handled in keras? If so, what should be the format of my data and labels numpy array?
Sample inputs could look like this:
data[0] = ['What is the colour of the shirt?']
#This text is converted to a vector using a 300 dimension GloVe embedding layer and then processed.
label[0] = ['Red','Orange','Brown']
I require my model to train such that any of the 3 classes can be correct for the given question asked.
Any help would be great.
You can do this with MultiLabelBinarizer:
from sklearn.preprocessing import MultiLabelBinarizer
# Turn the per-sample label lists into a binary indicator matrix with one
# column per distinct class; `label` is the list of label lists from the question.
lb = MultiLabelBinarizer()
label = lb.fit_transform(label)
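You can then pass the labels to the fit function with the 'categorical_crossentropy' loss (for multi-label targets with a sigmoid output layer, 'binary_crossentropy' is the usual choice).
If you want to do it with Keras: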
from keras.utils import to_categorical
import numpy as np
# Map every label string to an integer index: unique_labels holds the sorted
# distinct labels and new_labels the index of each original entry.
unique_labels, new_labels = np.unique(label, return_inverse=True)
# One-hot encode the integer indices (the result is not assigned here).
to_categorical(new_labels, num_classes=None)