I have written code on python for sentiment analysis of movie reviews
import re
import nltk
from multiprocessing import Pool
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
def lemmatize(l):
# proc = CoreNLP("pos", corenlp_jars=["stanford-corenlp-full-2015-04-20/*"], UnicodeDecodeError='skip')
lmtzr = WordNetLemmatizer()
''' for i in xrange(0, len(l)):
for j in xrange(0, len(l[i])):
l[i][j] = lmtzr.lemmatize(l[i][j])
l[i][j] = lmtzr.lemmatize(l[i][j],'v')
'''
for i in xrange(0, len(l)):
words = l[i].split()
word = [lmtzr.lemmatize(lmtzr.lemmatize(w,'v')) for w in words]
return(" ".join(word))
return l
# input: a list l of string
# output: a list containing the stemmed string in l
def stem(l):
result = []
stmr = PorterStemmer()
for i in xrange(0, len(l)):
words = l[i].split()
meaningful = [stmr.stem(w) for w in words]
l[i] = " ".join( meaningful )
return l
return result
# input: a list l of string
# output: a list of string where the stopwords are removed
def removeStopwords(l):
stops = set(stopwords.words("english"))
for i in xrange(0, len(l)):
words = l[i].lower().split()
meaningful = [w for w in words if not w in stops]
l[i] = " ".join( meaningful )
return l
# input: a list l of string
# output: a matrix where the (i,j) component is how many times
# the j-th word appear in the i-th document
def tf(l):
result = [[]]
vectorizer = CountVectorizer(analyzer = "word",
tokenizer = None,
preprocessor = None,
stop_words = None)
result = vectorizer.fit_transform(l).toarray()
'''
your code goes here...
'''
return result
# input: a list l of string
# output: a matrix where the (i,j) component is the tf-idf value of the j-th word in the i-th document
def tfidf(l):
result = [[]]
tf_ = tf(l)
#print(tf_[2])
vectorizer = TfidfVectorizer(smooth_idf = False)
vectorizer.fit_transform(l)
idf = vectorizer.idf_
idf = idf -1
# scikit learn idf implementation see line 993 below
# https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py
#print(idf[2])
result = tf_*idf
return result
# add any additional preprocessing you find helpful
def additional(l):
result = []
'''
your code goes here...
'''
return result
# input: a list l of string
# output: a feature matrix like object ready for training (2-D list, numpy array, sparse matrix)
# you may choose to use a subset of the previous functions that work best for you
def preprocess(l):
print('preprocess done')
removeStopwords(l)
# print(l[1])
lemmatize(l)
#stem(l)
return l
# train_X: feature matrix for training
# train_t: list of labels for training
# val_X: feature matrix for validation
# val_t: list of labels for validation
# just print out your results, no need to return any value
def sec2c(train_X, train_t, val_X, val_t):
cvalue = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
print('LOGREG result:')
for c in cvalue:
logreg = LogisticRegression(C=c)
a = logreg.fit(train_X, train_t).score(val_X, val_t)
print(a,c)
cvalue = [0.001, 0.01, 0.1, 1, 10, 100]
print('SVM result')
for c in cvalue:
svm = LinearSVC(C= c)
a = svm.fit(train_X, train_t).score(val_X, val_t)
print(a,c)
print('NB result')
array = np.asarray(train_X)
array[array==0]=1e9
train_X = array.tolist()
array = np.asarray(val_X)
array[array==0]=1e9
val_X = array.tolist()
n = int(len(train_X)/8)
nb = GaussianNB()
a = nb.fit(train_X,train_t).score(val_X,val_t)
print(a)
return
# input train_text, vali_text, test_text: each being a list of strings
# train_labels, vali_labels: each being a list of labels
#
def sec2di(train_X, train_t, val_X, val_t, tf= False):
if tf:
print('Using TF')
else:
print('Using TF-IDF')
cvalue = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
print('LOGREG result:')
for c in cvalue:
logreg = LogisticRegression(C=c)
a = logreg.fit(train_X, train_t).score(val_X, val_t)
print(a,c)
def useWord2vec(train_text, train_labels, vali_text, vali_labels, test_text):
# from gensim.models import Word2Vec
# merge your texts here
# train your word2vec here
# train your classifiers here
return 0
def parse(doc,text, label, test= False):
if test:
for sentence in doc:
review = BeautifulSoup(sentence).get_text()
if len(review)>0:
letters_only = re.sub("[^a-zA-Z]"," ",review)
text.append(letters_only)
else:
for sentence in doc:
review = BeautifulSoup(sentence).get_text()
if len(review)>0:
if review[0:1] == '+':
label.append(1)
else:
label.append(-1)
review = review[3:]
letters_only = re.sub("[^a-zA-Z]"," ",review)
text.append(letters_only)
def main():
# read data and extract texts and labels
pool = Pool(processes=3)
train = open('small_train.txt', 'r')
# do preprocessing
trainSentences = re.split(r'\n', train.read())
trainLabel = []
trainText = []
valid = open('small_valid.txt', 'r')
validSentences = re.split(r'\n', valid.read())
validLabel = []
validText = []
test = open('small_test.txt', 'r')
testSentences = re.split(r'\n', test.read())
testLabel = []
testText = []
parse(trainSentences, trainText, trainLabel)
print'parsed train'
parse(validSentences,validText,validLabel)
print'parsed valid'
parse(testSentences,testText,testLabel, test= True)
print'parsed test'
pool.map(preprocess, [trainText, validText, testText])
'''
preprocess(trainText)
print('preprocesed train')
preprocess(validText)
print('preprocesed valid')
preprocess(testText)
print('preprocesed test')
'''
#ts = tfidf(trainText)
# print(ts[2])
# print(trainText[1])
# train the model
# make predictions and save them
return 0
if __name__ == '__main__':
main()
But I received following errors:
Traceback (most recent call last): File
"C:\Users\jre\Desktop\SentimentAnalysis-master\SentimentAnalysis-master\main.py",
line 261, in <module
main() File "C:\Users\jre\Desktop\SentimentAnalysis-master\SentimentAnalysis-master\main.py",
line 222, in main
valid = open('small_valid.txt', 'r') IOError: [Errno 2] No such file or directory: 'small_valid.txt'
Can you help me to solve this issue?
The error is clear: No such file or directory: 'small_valid.txt'. Move your file into this path:
C:\Users\jre\Desktop\SentimentAnalysis-master\SentimentAnalysis-master
or update the next code lines to use an absolute path:
train = open('C:\..path_to_file..\small_train.txt', 'r')
valid = open('C:\..path_to_file..\small_valid.txt', 'r')
Related
I'm a new guy to tensorflow. I tried but couldn't solve this.
TypeError: <tf.Tensor 'module_apply_default/aggregation/mul_3:0' shape=(2, 9, 1024) dtype=float32> is out of scope and cannot be used here. Use return values, explicit Python locals or TensorFlow collections to access it.
Please see https://www.tensorflow.org/guide/function#all_outputs_of_a_tffunction_must_be_return_values for more information.
<tf.Tensor 'module_apply_default/aggregation/mul_3:0' shape=(2, 9, 1024) dtype=float32> was defined here:
The tensor <tf.Tensor 'module_apply_default/aggregation/mul_3:0' shape=(2, 9, 1024) dtype=float32> cannot be accessed from here, because it was defined in <tensorflow.python.framework.ops.Graph object at 0x0000011DC7B75690>, which is out of scope.
This is my Code -
import io
import tensorflow as tf
import tensorflow_hub as hub
import glob
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import genesis
from nltk.corpus import wordnet_ic
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet as wn
import gensim
from gensim.models import doc2vec
from collections import namedtuple
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import math
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from scipy import spatial
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# nltk.download('wordnet')
# nltk.download('genesis')
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
tokenizer = RegexpTokenizer(r'\w+') #will remove punctuations
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
genesis_ic=wn.ic(genesis, False, 0.0)
# module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" ##param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]
# embed = hub.Module(module_url)
# elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
def get_tdata():
direc=glob.glob('Semeval Data XMLtocsv/*.csv')
direc.sort()
df=[]
for s in direc:
df.append(pd.read_csv(s))
ref_ans=[]
f = open(r"Semeval Data XMLtocsv/ref_ans.txt", "r")
for x in f:
if x=='\n':
continue
ref_ans.append(x.strip())
question=[]
f = open(r"Semeval Data XMLtocsv/question.txt", "r")
for x in f:
if x=='\n':
continue
question.append(x.strip())
stud_ans=[]
accuracy=[]
for i in range(len(df)):
stud_ans.append([j for j in df[i]['__text']])
accuracy.append([j for j in df[i]['_accuracy']])
corr=0
incorr=0
Y=[]
for i in range(0,len(stud_ans)):
for j in range(len(accuracy[i])):
if(accuracy[i][j]=="correct"):
Y.append(1)
corr=corr+1
else:
Y.append(0)
incorr=incorr+1
print(len(stud_ans),len(ref_ans),len(question),len(Y))
return stud_ans,ref_ans,question,Y
stud_ans,ref_ans,question,Y=get_tdata()
print(ref_ans[0])
def get_func_words():
file=open(r"function_words")
func_words=[]
pattern=r"\w+'?\w*"
for i in file.readlines():
result=re.findall(pattern,i)
for j in result:
func_words.append(j)
return func_words
func_words=get_func_words()
def uni_sent_encoder(stud_ans):
# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)
module_url = "./uni_encoder" ##param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]
embed = hub.Module(module_url)
with tf.Session() as session:
session.run([tf.global_variables_initializer(), tf.tables_initializer()])
embeddings = session.run(embed(stud_ans))
return embeddings
def cs_univ_encoder(stud_ans,ref_ans):
t=[]
t=stud_ans[:]
t.append(ref_ans)
matrix = uni_sent_encoder(t)
cossim_use=[0] * (len(matrix)-1)
for i in range(len(matrix)-1):
cossim_use[i] = 1 - spatial.distance.cosine(matrix[i], matrix[len(matrix)-1])
return cossim_use
def word_sent_length(stud_ans):
sent_length=[0] * len(stud_ans)
av_word_length=[0] * len(stud_ans)
j=0
for i in stud_ans:
sent_length[j]=len(tokenizer.tokenize(i))
for w in i:
if(w!=' '):
av_word_length[j]+=1
av_word_length[j]/=sent_length[j]
j+=1
ws = [av_word_length, sent_length]
return ws
def avsenlen(stud_ans):
temp = word_sent_length(stud_ans)
avg_sent_length_in_doc = 0
for i in range(len(temp[1])):
avg_sent_length_in_doc += temp[1][i]
avg_sent_length_in_doc /= len(temp[1])
return avg_sent_length_in_doc
def prompt_overlap(s_ans,question):
# print(len(tokenizer.tokenize(s_ans)),"\n",s_ans,'\n',question)
# overlap_metric = [0] * len(tokenizer.tokenize(s_ans))
qs_words = tokenizer.tokenize(question)
q_words = [w for w in qs_words if w not in stop_words]
l=0
overlap_metric = 0
w = tokenizer.tokenize(s_ans)
for j in w:
for k in q_words:
if(j==k):
overlap_metric+=1
break
l+=1
myInt = len(q_words)
overlap_metric /= myInt
return overlap_metric
def pre_word2vec():
model = r"cc.en.300.vec"
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(model, binary=False)
return word_vectors
# fin = io.open(r'cc.en.300.vec', encoding='utf-8', newline='\n', errors='ignore')
# n, d = map(int, fin.readline().split())
# data = {}
# for line in fin:
# tokens = line.rstrip().split(' ')
# data[tokens[0]] = map(float, tokens[1:])
# return data
w2vmodel = pre_word2vec()
def cosine_sim_word2vec(stud_ans,ref_ans):
nums=[]
for i in range(len(stud_ans)):
ss1 = stud_ans[i]
ss2 = ref_ans
data = []
data2= []
stop_words = set(stopwords.words('english'))
s1 = [w.lower() for w in tokenizer.tokenize(ss1) if w.lower() not in stop_words]
s2 = [w.lower() for w in tokenizer.tokenize(ss2) if w.lower() not in stop_words]
dd=[]
dd.append(s1)
dd.append(s2)
sim=0
for i in s1:
maxi=0
for j in s2:
# print("I'm in")
maxi = max(maxi,w2vmodel.similarity(i,j))
sim+=maxi
length = max(len(word_tokenize(ss1)), len(word_tokenize(ss2)))
sim/=length
nums.append(sim)
return nums
def d2v(sa):
doc1=sa
docs = []
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
for i, text in enumerate(doc1):
words = text.lower().split()
tags = [i]
docs.append(analyzedDocument(words, tags))
# Train model (set min_count = 1, if you want the model to work with the provided example data set)
model = doc2vec.Doc2Vec(docs, vector_size = 12, window = 300, min_count = 1, workers = 4)
return model.docvecs
def cosine_sim_d2v(stud_ans,ref_ans):
t=[]
t=stud_ans[:]
t.append(ref_ans)
matrix = d2v(t)
cossimw2v=[0] * (len(matrix)-1)
for i in range(len(matrix)-1):
cossimw2v[i] = 1 - spatial.distance.cosine(matrix[i], matrix[len(matrix)-1])
return cossimw2v
def IDFpp(stud_ans):
doc_info = []
j=0
for i in stud_ans:
j+=1
sa = tokenizer.tokenize(i)
count = len(sa)
temp = {"doc_id":j, "doc_length(count)":count}
doc_info.append(temp)
k=0
freq = []
for i in stud_ans:
k+=1
sa = tokenizer.tokenize(i)
fd={}
for w in sa:
w=w.lower()
if w in fd:
fd[w]+=1
else:
fd[w]=1
temp = {'doc_id':i, "freq":fd}
freq.append(temp)
#print(freq,"\n\n\n\n\n")
return doc_info, freq
#Tfidf vectorizer function
def IDF(stud_ans):
doc_info, freq = IDFpp(stud_ans)
IDFscore=[]
counter = 0
for d in freq:
counter+=1
for k in d['freq'].keys():
#print(k)
count = sum([k in tempDict['freq'] for tempDict in freq])
#if()
#print(count)
temp = {'doc_id':counter, 'IDFscore':math.log(len(doc_info)/count),'TF score':(count),'key':k}
IDFscore.append(temp)
return IDFscore
def fsts(stud_ans,ref_ans):
k1=1.2
b=0.75
fstsvalue=[]
avsenlength = avsenlen(stud_ans)
#compare stud_ans and ref_ans
idfscore = IDF(stud_ans)
for i in range(len(stud_ans)):
if(len(word_tokenize(stud_ans[i])) > len(word_tokenize(ref_ans))):
lsen = [w.lower() for w in tokenizer.tokenize(stud_ans[i])]
ssen = [w.lower() for w in tokenizer.tokenize(ref_ans)]
sl=len((stud_ans[i]))
ss=len((ref_ans))
else:
ssen = [w.lower() for w in tokenizer.tokenize(stud_ans[i])]
lsen = [w.lower() for w in tokenizer.tokenize(ref_ans)]
ss=len((stud_ans[i]))
sl=len((ref_ans))
temp=0
for i in (lsen):
maxi=0
idf=0
for w in (ssen):
maxi = max(maxi,w2vmodel.similarity(i,w))
for j in range(len(idfscore)):
if(idfscore[j]['key'] == i):
idf = idfscore[j]['IDFscore']
temp += idf * (maxi * (k1+1))
temp /= (maxi + k1* (1- b + b*(ss/avsenlength)))
fstsvalue.append(temp)
return fstsvalue
def noun_overlap(stud_ans,ref_ans):
word_tokens = tokenizer.tokenize(stud_ans)
ref_tokens = tokenizer.tokenize(ref_ans)
stud_ans_tag=nltk.pos_tag(word_tokens)
ref_ans_tag=nltk.pos_tag(ref_tokens)
ref_nouns=[]
stud_ans_nouns=[]
#use regex here
for i,j in ref_ans_tag:
if(j in ["NN","NNS","NNP","NNPS"]):
ref_nouns.append(i)
for i,j in stud_ans_tag:
if(j in ["NN","NNS","NNP","NNPS"]):
stud_ans_nouns.append(i)
score=0
for i in stud_ans_nouns:
if i in ref_nouns:
score=score+1;
return score/len(ref_nouns)
def content_overlap(s,r):#both are lists
s_ans=[]
ref_ans=r
t=[]
for i in range(len(s)):
t=(tokenizer.tokenize(s[i]))
s_ans.append(t)
for i in range(len(s_ans)):
s_ans[i] = [lemmatizer.lemmatize(j) for j in s_ans[i] if j not in func_words]
ref_ans = [lemmatizer.lemmatize(i) for i in tokenizer.tokenize(ref_ans) if i not in func_words]
length=len(ref_ans)
for i in range(len(ref_ans)):
for j in wn.synsets(ref_ans[i]):
for k in j.lemmas():
ref_ans.append(k.name())
temp=[]
for i in s_ans:
val=0
for j in i:
if j in ref_ans:
val+=1
temp.append(val/(length))
return temp
def lsa_score(stud_ans):
corpus = [ans for ques in stud_ans for ans in ques]
vectorizer = TfidfVectorizer(min_df = 1, stop_words = 'english')
dtm = vectorizer.fit_transform(corpus)
lsa = TruncatedSVD(3, algorithm = 'arpack')
dtm=dtm.astype(float)
dtm_lsa = lsa.fit_transform(dtm)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
lsa_stud=pd.DataFrame(dtm_lsa, index = corpus, columns = ["component_1","component_2",'component_3'])
X=[]
for i in range(len(corpus)):
t=[]
t.append(lsa_stud.iloc[i,:1].values[0])
t.append(lsa_stud.iloc[i,1:2].values[0])
t.append(lsa_stud.iloc[i,2:3].values[0])
X.append(t)
return X
def elmo_vectors(x):
# Create graph and finalize (optional but recommended).
g = tf.Graph()
with g.as_default():
text_input = tf.compat.v1.placeholder(dtype=tf.string, shape=[None])
elmo = hub.Module("./elmo_module", trainable=True)
embeddings = elmo(x, signature="default", as_dict=True)["elmo"]
init_op = tf.group([tf.compat.v1.global_variables_initializer(), tf.compat.v1.tables_initializer()])
# g.finalize()
# Create session and initialize.
session = tf.compat.v1.Session(graph=g)
session.run(init_op)
return session.run(tf.compat.v1.reduce_mean(embeddings,1))
# embeddings = elmo(x, signature="default", as_dict=True)["elmo"]
#
# with tf.Session() as sess:
# sess.run(tf.global_variables_initializer())
# sess.run(tf.tables_initializer())
#
# # return average of ELMo features
# return sess.run(tf.reduce_mean(embeddings,1))
def cos_sim_elmo(stud_ans,ref_ans):
t=[]
t=stud_ans[:]
t.append(ref_ans)
matrix = elmo_vectors(t)
cossimelmo = [0] * (len(matrix))
for i in range(len(matrix)):
cossimelmo[i] = 1 - spatial.distance.cosine(matrix[i], matrix[len(matrix)-1])
return cossimelmo
def jc_sim(stud_ans,ref_ans):
X=[]
c=0
ref_words=tokenizer.tokenize(ref_ans)
ref_words=[lemmatizer.lemmatize(j) for j in ref_words if j.lower() not in stop_words]
for s in stud_ans:
num=0
words=tokenizer.tokenize(s)
words=[lemmatizer.lemmatize(j) for j in words if j.lower() not in stop_words]
l=max(len(ref_words),len(words))
for w in words:
maxi=0
for w1 in wn.synsets(w):
for t in ref_words:
for w2 in wn.synsets(t):
if w1._pos in ('n','v','a','r') and w2._pos in ('n','v','a','r') and w1._pos==w2._pos:
n=w1.jcn_similarity(w2,genesis_ic)
if w1==w2 or n>1:
maxi=1
else:
maxi=max(maxi,w1.jcn_similarity(w2,genesis_ic))
num=num+maxi
num=num/l
X.append(num)
return X
def sp_sim(stud_ans,ref_ans):
X=[]
c=0
ref_words=tokenizer.tokenize(ref_ans)
ref_words=[lemmatizer.lemmatize(j) for j in ref_words if j.lower() not in stop_words]
for s in stud_ans:
num=0
words=tokenizer.tokenize(s)
words=[lemmatizer.lemmatize(j) for j in words if j.lower() not in stop_words]
l=max(len(ref_words),len(words))
for w in words:
maxi=0
for w1 in wn.synsets(w):
for t in ref_words:
for w2 in wn.synsets(t):
if w1._pos in ('n','v','a','r') and w2._pos in ('n','v','a','r') and w1._pos==w2._pos:
n=w1.lch_similarity(w2,genesis_ic)
#print(w1, w2, type(n), n)
#return None when there is no similarity hence needed to add another if clause
if n == None:
maxi=0
elif w1==w2 or n>1:
maxi=1
else:
maxi=max(maxi,w1.lch_similarity(w2,genesis_ic))
num=num+maxi
#print(num)
num=num/l
X.append(num)
return X
def ttr(sent):
words=tokenizer.tokenize(sent)
return len(set(words))/len(words)
def calc_train_features():
X=[]
lss=lsa_score(stud_ans)
c=0
for i in range(len(stud_ans)):
sta=stud_ans[i]
ref=ref_ans[i]
que=question[i]
lt=len(sta)
wsn=word_sent_length(sta)
csw2v=cosine_sim_word2vec(sta,ref)
csd2v=cosine_sim_d2v(sta,ref)
fs=fsts(sta,ref)
co = content_overlap(sta,ref)
cselmo = cos_sim_elmo(sta, ref)
jcs = jc_sim(sta,ref)
sps = sp_sim(sta,ref)
csuse = cs_univ_encoder(sta,ref)
for j in range(lt):
temp = []
temp.append(prompt_overlap(sta[j],que))
for k in range(1):
temp.append(wsn[k][j])
temp.append(csw2v[j])
temp.append(csd2v[j])
temp.append(fs[j])
# temp.append(noun_overlap(sta[j],ref))
# temp.append(co[j])
for k in range(3):
temp.append(lss[c][k])
temp.append(ttr(sta[j]))
temp.append(jcs[j])
temp.append(sps[j])
# temp.append(glv[j])
temp.append(csuse[j])
X.append(temp)
print(i," - ",j, " -- ",c)
c=c+1
def get_features(sta,ref,q):
temps=stud_ans[:]
temps.append(sta)
lss=lsa_score(temps)[-len(sta):]
wsn=word_sent_length(sta)
csw2v=cosine_sim_word2vec(sta,ref)
csd2v=cosine_sim_d2v(sta,ref)
fs=fsts(sta,ref)
co = content_overlap(sta,ref)
cselmo = cos_sim_elmo(sta, ref)
jcs = jc_sim(sta,ref)
sps = sp_sim(sta,ref)
csuse = cs_univ_encoder(sta,ref)
X=[]
for j in range(len(sta)):
temp=[]
temp.append(prompt_overlap(sta[j],q))
for k in range(1):
temp.append(wsn[k][j])
temp.append(csw2v[j])
temp.append(csd2v[j])
temp.append(fs[j])
# temp.append(noun_overlap(sta[j],ref))
# temp.append(co[j])
temp.append(cselmo[j])
for k in range(3):
temp.append(lss[j][k])
temp.append(ttr(sta[j]))
temp.append(jcs[j])
temp.append(sps[j])
temp.append(csuse[j])
X.append(temp)
return X
def train_data():
xdf=pd.read_csv("./final_features.csv")[['prompt_overlap','avg_word_length','cosineword2vec','cosinedoc2vec','fsts','cosine_elmo','lsa1','lsa2','lsa3','ttr','jc_sim','sps','cs_use','score']]
Y=xdf['score'].values
xdf=xdf.drop(['score'],axis=1)
X=xdf.values
return X,Y
def train_model():
X,Y=train_data()
train_x, test_x, train_y, test_y = train_test_split(X,Y,test_size=0.2)
clf = RandomForestClassifier(n_estimators=120,max_depth=15)
clf.fit(train_x,train_y)
print( "Test Accuracy :: ", accuracy_score(test_y, clf.predict(test_x)))
return clf
clf=train_model()
def predict_ans(X):
return clf.predict(X)
def get_prob(X):
return clf.predict_proba(X)
I have been trying to write a quick little chatbot based on some home searching and learning.
I keep getting these errors when the chatbot is going to print its response to me.
2021-05-14 13:34:40.197411: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
Traceback (most recent call last):
File "C:/Users/TomDootson/PycharmProjects/Chatbot/Chatbot.py", line 57, in <module>
ints = predict_class(message)
File "C:/Users/TomDootson/PycharmProjects/Chatbot/Chatbot.py", line 41, in predict_class
return_list.append({'intent': classes[r[0]], 'probability': str(r[1])})
IndexError: list index out of range
This is the python script I have written. Any pointing in the right direction would be great.
import random
import json
import pickle
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import load_model
lemmatizer = WordNetLemmatizer()
intents = json.loads(open('intents.json').read())
words = pickle.load(open('words.pkl', 'rb'))
classes = pickle.load(open('classes.pkl', 'rb'))
model = load_model('chatbot_model.h5')
def clean_up_sentence(sentence):
sentence_words = nltk.word_tokenize(sentence)
sentence_words = [lemmatizer.lemmatize(word) for word in sentence_words]
return sentence_words
def bag_of_words(sentence):
sentence_words = clean_up_sentence(sentence)
bag = [0] * len(words)
for w in sentence_words:
for i, word in enumerate(words):
if word == w:
bag[i] = 1
return np.array(bag)
def predict_class(sentence):
bow = bag_of_words(sentence)
res = model.predict(np.array([bow]))[0]
ERROR_THRESHOLD = 0.25
results = [[i, r] for i, r in enumerate(res) if r > ERROR_THRESHOLD]
results.sort(key=lambda x: x[1], reverse=True)
return_list = []
for r in results:
return_list.append({'intent': classes[r[0]], 'probability': str(r[1])})
return return_list
def get_response(intents_list, intents_json):
tag = intents_list[0]['intent']
list_of_intents = intents_json['intents']
for i in list_of_intents:
if i['tag'] == tag:
result = random.choice(i['responses'])
break
return result
print('Hello, I am online. Go ahead')
while True:
message = input('')
ints = predict_class(message)
res = get_response(ints, intents)
print(res)
I believe the error probably comes from the classes[r[0]] part, in particular r[0] will be the index of the prediction (from 0 to the length of the input array) an you are using it to index the classes list (that is probably way shorter than the list of prediction).
I am working with Gensim FASTText modeling and have the following questions.
The output of "ft_model.save(BASE_PATH + MODEL_PATH + fname)" saves the following 3 files. Is this correct? is there a way to combine all three files?
ft_gensim-v3
ft_gensim-v3.trainables.vectors_ngrams_lockf.npy
ft_gensim-v3.wv.vectors_ngrams.npy
When I attempt to load the training file and then use it, I get the following error from if model.wv.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:
'function' object has no attribute 'wv'
Finally, both models, is there a way not to have to store the output of def read_train(path,label_path) and def lemmetize(df_col)so I do not have to run this part of the code every time I want to train the model or compare?
Thanks for the assistance.
Here is my FastText Train Model
import os
import logging
from config import BASE_PATH, DATA_PATH, MODEL_PATH
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from pprint import pprint as print
from gensim.models.fasttext import FastText as FT_gensim
from gensim.test.utils import datapath
#Read Training data
import pandas as pd
def read_train(path,label_path):
d = []
#e = []
df = pd.read_excel(path)
labelled = pd.read_csv(label_path)
updated_col1 = lemmetize(df['query_text'])
updated_col2 = lemmetize(labelled['QueryText'])
for i in range(len(updated_col1)):
d.append(updated_col1[i])
#print(d)
for i in range(len(updated_col2)):
d.append(updated_col2[i])
return d
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string
from nltk.stem import PorterStemmer
def lemmetize(df_col):
df_updated_col = pd.Series(0, index = df_col.index)
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
ps = PorterStemmer()
for i, j in zip(df_col, range(len(df_col))):
lem = []
t = str(i).lower()
t = t.replace("'s","")
t = t.replace("'","")
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
t = t.translate(translator)
word_tokens = word_tokenize(t)
for i in range(len(word_tokens)):
l1 = lemmatizer.lemmatize(word_tokens[i])
s1 = ps.stem(word_tokens[i])
if list(l1) != [''] and list(l1) != [' '] and l1 != '' and l1 != ' ':
lem.append(l1)
filtered_sentence = [w for w in lem if not w in stop_words]
df_updated_col[j] = filtered_sentence
return df_updated_col
#read test data
def read_test(path):
return pd.read_excel(path)
#Read labelled data
def read_labelled(path):
return pd.read_csv(path)
word_tokenized_corpus = read_train('Train Data.xlsx','SMEQueryText.csv')
#Train fasttext model
import tempfile
import os
from gensim.models import FastText
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("ft_gensime-v3")
def train_fastText(data, embedding_size = 60, window_size = 40, min_word = 5, down_sampling = 1e-2, iter=100):
ft_model = FastText(word_tokenized_corpus,
size=embedding_size,
window=window_size,
min_count=min_word,
sample=down_sampling,
sg=1,
iter=100)
#with tempfile.NamedTemporaryFile(prefix=BASE_PATH + MODEL_PATH + 'ft_gensim_v2-', delete=False) as tmp:
# ft_model.save(tmp.name, separately=[])
ft_model.save(BASE_PATH + MODEL_PATH + fname)
return ft_model
# main function to output
def main(test_path, train_path, labelled):
test_data = read_test(test_path)
train_data = read_train(train_path,labelled)
labelled = read_labelled(labelled)
output_df = pd.DataFrame(index = range(len(test_data)))
output_df['test_query'] = str()
output_df['Similar word'] = str()
output_df['category'] = str()
output_df['similarity'] = float()
model = train_fastText(train_data)
# run main
if __name__ == "__main__":
output = main('Test Data.xlsx','Train Data.xlsx','QueryText.csv')
Here is my Usage Model
import pandas as pd
from gensim.models import FastText
import gensim
from config import BASE_PATH, DATA_PATH, MODEL_PATH
#Read Training data
def read_train(path,label_path):
d = []
#e = []
df = pd.read_excel(path)
labelled = pd.read_csv(label_path)
updated_col1 = lemmetize(df['query_text'])
updated_col2 = lemmetize(labelled['QueryText'])
for i in range(len(updated_col1)):
d.append(updated_col1[i])
for i in range(len(updated_col2)):
d.append(updated_col2[i])
return d
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string
from nltk.stem import PorterStemmer
def lemmetize(df_col):
df_updated_col = pd.Series(0, index = df_col.index)
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
ps = PorterStemmer()
for i, j in zip(df_col, range(len(df_col))):
lem = []
t = str(i).lower()
t = t.replace("'s","")
t = t.replace("'","")
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
t = t.translate(translator)
word_tokens = word_tokenize(t)
for i in range(len(word_tokens)):
l1 = lemmatizer.lemmatize(word_tokens[i])
s1 = ps.stem(word_tokens[i])
if list(l1) != [''] and list(l1) != [' '] and l1 != '' and l1 != ' ':
lem.append(l1)
filtered_sentence = [w for w in lem if not w in stop_words]
df_updated_col[j] = filtered_sentence
return df_updated_col
#read test data
def read_test(path):
return pd.read_excel(path)
#Read labelled data
def read_labelled(path):
return pd.read_csv(path)
def load_training():
return FT_gensim.load(BASE_PATH + MODEL_PATH +'ft_gensim-v3')
#compare similarity
def compare_similarity(model, real_data, labelled):
maxWord = ''
category = ''
maxSimilaity = 0
#print("train data",labelled[1])
for i in range(len(labelled)):
if model.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:
#print('labelled',labelled['QueryText'][i], 'i', i)
maxWord = labelled['QueryText'][i]
category = labelled['Subjectmatter'][i]
maxSimilaity = model.similarity(real_data, labelled['QueryText'][i])
return maxWord, category, maxSimilaity
# Output from Main to excel
from pandas import ExcelWriter
def export_Excel(data, aFile = 'FASTTEXTOutput.xlsx'):
df = pd.DataFrame(data)
writer = ExcelWriter(aFile)
df.to_excel(writer,'Sheet1')
writer.save()
# main function to output
def main(test_path, train_path, labelled):
test_data = read_test(test_path)
train_data = read_train(train_path,labelled)
labelled = read_labelled(labelled)
output_df = pd.DataFrame(index = range(len(test_data)))
output_df['test_query'] = str()
output_df['Similar word'] = str()
output_df['category'] = str()
output_df['similarity'] = float()
model = load_training
for i in range(len(test_data)):
output_df['test_query'][i] = test_data['query_text'][i]
#<first change>
maxWord, category, maxSimilaity = compare_similarity(model, str(test_data['query_text'][i]), labelled)
output_df['Similar word'][i] = maxWord
output_df['category'][i] = category
output_df['similarity'][i] = maxSimilaity
#<second change>
return output_df
# run main
if __name__ == "__main__":
output = main('Test Data.xlsx','Train Data.xlsx','SMEQueryText.csv')
export_Excel(output)
Here is the full tracible error message
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-22-57803b59c0b9> in <module>
1 # run main
2 if __name__ == "__main__":
----> 3 output = main('Test Data.xlsx','Train Data.xlsx','SMEQueryText.csv')
4 export_Excel(output)
<ipython-input-21-17cb88ee0f79> in main(test_path, train_path, labelled)
13 output_df['test_query'][i] = test_data['query_text'][i]
14 #<first change>
---> 15 maxWord, category, maxSimilaity = compare_similarity(model, str(test_data['query_text'][i]), labelled)
16 output_df['Similar word'][i] = maxWord
17 output_df['category'][i] = category
<ipython-input-19-84d7f268d669> in compare_similarity(model, real_data, labelled)
6 #print("train data",labelled[1])
7 for i in range(len(labelled)):
----> 8 if model.wv.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:
9 #print('labelled',labelled['QueryText'][i], 'i', i)
10 maxWord = labelled['QueryText'][i]
AttributeError: 'function' object has no attribute 'wv'
You've got three separate, only-vaguely-related questions here. Taking each in order:
Why are there 3 files, and can they be combined?
It's more efficient to store the big raw arrays separately from the main 'pickled' model – and for models above a few gigabytes in size, necessary to work-around 'pickle' implementation limits. So I'd recommend just keeping the default behavior, and keeping the habit of managing/moving/copying the sets of files together.
If your model is small enough, there is something you can try, though. The .save() method has an optional parameter sep_limit which controls the threshold array size, over which arrays are stored as separate files. By setting that much larger, say sep_limit=2*1024*1024*1024 (2GiB), smaller models should save a single file. (But, loading will be slower, you won't have the sometimes-useful option of memory-map loading, and saving may break on oversized models.)
Why is there a AttributeError: 'function' object has no attribute 'wv' error?
Your line of code model = load_training assigns an actual function to the model variable, rather than what you probably intended, the return-value of calling that function with some arguments. That function has no .wv attribute, hence the error. If model were an actual instance of FastText, you'd not get that error.
Can the corpus text be stored to avoid repeat preprocessing and conversion from pandas formats?
Sure, you can just write the text to a file. Roughly:
with open('mycorpus.txt', mode='w') as corpusfile:
for text in word_tokenized_corpus:
corpusfile.write(' '.join(text))
corpusfile.write('\n')
Though in fact, gensim offers a utility function, utils.save_as_line_sentence(), that can do this (& explicitly handles some extra encoding concerns). See:
https://radimrehurek.com/gensim/utils.html#gensim.utils.save_as_line_sentence
The LineSentence utility class in gensim.models.word2vec can stream texts from such a file back for future re-use:
https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.LineSentence
I am pretty new to python and this is the first code I have written. Trying to use the NLTK package. The problem comes at the end when trying to execute the label_probdist.prob('positive') line.
This is the error I get;
name 'label_probdist' is not defined
NameError Traceback (most recent call last)
<ipython-input-57-006d791d4445> in <module>()
----> 1 print label_probdist.prob('positive')
NameError: name 'label_probdist' is not defined
import nltk, re, pprint
import csv
from nltk import word_tokenize, wordpunct_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI
# not in use nltk.download() #Download the bookpackage
#open the file that containts wallposts and classifier
with open('Classified.csv' ,'rb') as f:
reader = csv.reader(f)
FBsocial = map(tuple, reader)
import random
random.shuffle(FBsocial)
FBsocial = FBsocial[:500]
len(FBsocial)
FBSocialData = [] #sorting data
for row in FBsocial:
statement = row[0]
sentiment = row[1]
words_filtered = [e.lower() for e in statement.split() if len(e) >= 3]
FBSocialData.append((words_filtered, sentiment))
len(FBSocialData)
#Extracting features of word(list of words ordered by frequency)
def get_words_in_FBdata(FBSocialData):
all_words = []
for (statement, sentiment) in FBSocialData:
all_words.extend(statement)
return all_words
def get_word_features(wordlist):
wordlist = nltk.FreqDist(wordlist)
word_features = wordlist.keys()
return word_features
word_features = get_word_features(get_words_in_FBdata(FBSocialData))
len(word_features)
#just a test;
document = ("hei","grin","andre","jævlig","gøy",)
#Classifier to decide which feature are relevant
def extract_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features
extract_features(document)
#testing extract_features
extract_features("udviser blomsterbutik")
training_set = nltk.classify.util.apply_features(extract_features, FBSocialData)
len(training_set)
classifier = nltk.NaiveBayesClassifier.train(training_set)
def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
# Create the P(label) distribution
label_probdist = estimator(label_freqdist)
# Create the P(fval|label, fname) distribution
feature_probdist = {}
return NaiveBayesClassifier(label_probdist, feature_probdist)
#pvalue
print label_probdist.prob('positive')
print label_probdist.prob('negative')
You are defining variable label_probdist inside function train. Then you are trying to access it outside it's scope. It is not possible. It's a local variable, not a global one.
I am trying to use the maxent classifier using the NLTK library. I have a list of positive and negative words and I have trained the classifier on the same. The problem is when I test the classifier against a sentence I always get the same probability of classification for the two classes. Here is the code -
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import MaxentClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from sklearn import cross_validation
nltk.data.path.append("/home/daksh/Documents/Softwares/nltk_data")
import csv
import operator
from nltk.classify import MaxentClassifier
from nltk.corpus import movie_reviews
def getBestWords(posWords,negWords):
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for word in posWords:
word_fd[word.lower()] += 1
label_word_fd['pos'][word.lower()] += 1
for word in negWords:
word_fd[word.lower()] += 1
label_word_fd['neg'][word.lower()] += 1
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count
word_scores = {}
for word, freq in word_fd.items():
pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
(freq, pos_word_count), total_word_count)
neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
(freq, neg_word_count), total_word_count)
word_scores[word] = pos_score + neg_score
sorted_x = sorted(word_scores.items(), key=operator.itemgetter(1),reverse=True)[:2500]
bestwords = set([w for w,s in sorted_x])
return bestwords
def best_word_feats(words,bestwords):
return dict([(word, True) for word in words if word in bestwords])
def word_feats(words):
return dict([(word, True) for word in words])
def best_bigram_word_feats(words,posWords,negWords, score_fn=BigramAssocMeasures.chi_sq, n=200):
bigram_finder = BigramCollocationFinder.from_words(words)
bigrams = bigram_finder.nbest(score_fn, n)
d = dict([(bigram, True) for bigram in bigrams])
bestwords = getBestWords(posWords,negWords)
d.update(best_word_feats(words,bestwords))
return d
posWords = list()
negWords = list()
with open('../data/finalSentiPosWords.csv','r') as csvfile:
spamreader = csv.reader(csvfile)
posWords = list(spamreader)
with open('../data/finalSentiNegWords.csv','r') as csvfile:
spamreader = csv.reader(csvfile)
negWords = list(spamreader)
posWords = [word[0] for word in posWords]
negWords = [word[0] for word in negWords]
bestwords = getBestWords(posWords,negWords)
posfeats = [(best_bigram_word_feats(posWords,posWords,negWords),'pos')]
negfeats = [(best_bigram_word_feats(negWords,posWords,negWords),'neg')]
trainfeats = negfeats + posfeats
algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
classifier = nltk.MaxentClassifier.train(trainfeats, algorithm,max_iter=5)
# classifier = nltk.NaiveBayesClassifier.train(trainfeats)
classifier.show_most_informative_features(10)
sentence = "Dosa had a tangy taste but it was fun eating it. On the other hand the other dosa was soggy"
l = sentence.split(' ')
print(l)
print(word_feats(l))
print(classifier.prob_classify(word_feats(l)).prob('pos'))
print(classifier.prob_classify(word_feats(l)).prob('neg'))
The output of this is this -
0.500074231063
0.499925768937
The overall classification seems to be working fine but I can't figure out how the probabilities are calculated and why are they always same even if I change the test sentence.
Any quick help appreciated.
Thanks.
That's a lot of code! I'm not going to debug it for you, but I notice that bestwords is the set of all words in your training corpus. If that's not outright wrong, it's certainly misleadingly named.