i have some problems on my code that show some error when i run it. i'm using python
so, here's my code
import collections
import nltk.classify.util, nltk.metrics
from nltk.classify import SklearnClassifier
import csv
from sklearn import cross_validation
from sklearn.svm import LinearSVC, SVC
import random
from nltk.corpus import stopwords
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
posdata = []
with open('positive-data.csv', 'rb') as myfile:
reader = csv.reader(myfile, delimiter=',')
for val in reader:
posdata.append(val[0])
negdata = []
with open('negative-data.csv', 'rb') as myfile:
reader = csv.reader(myfile, delimiter=',')
for val in reader:
negdata.append(val[0])
def word_split(data):
data_new = []
for word in data:
word_filter = [i.lower() for i in word.split()]
data_new.append(word_filter)
return data_new
def word_split_sentiment(data):
data_new = []
for (word, sentiment) in data:
word_filter = [i.lower() for i in word.split()]
data_new.append((word_filter, sentiment))
return data_new
def word_feats(words):
return dict([(word, True) for word in words])
stopset = set(stopwords.words('english')) - set(('over', 'under', 'below', 'more', 'most', 'no', 'not', 'only', 'such', 'few', 'so', 'too', 'very', 'just', 'any', 'once'))
def stopword_filtered_word_feats(words):
return dict([(word, True) for word in words if word not in stopset])
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
bigram_finder = BigramCollocationFinder.from_words(words)
bigrams = bigram_finder.nbest(score_fn, n)
"""
print words
for ngram in itertools.chain(words, bigrams):
if ngram not in stopset:
print ngram
exit()
"""
return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
def bigram_word_feats_stopwords(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
bigram_finder = BigramCollocationFinder.from_words(words)
bigrams = bigram_finder.nbest(score_fn, n)
"""
print words
for ngram in itertools.chain(words, bigrams):
if ngram not in stopset:
print ngram
exit()
"""
return dict([(ngram, True) for ngram in itertools.chain(words, bigrams) if ngram not in stopset])
# Calculating Precision, Recall & F-measure
def evaluate_classifier(featx):
negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
negcutoff = len(negfeats)*3/4
poscutoff = len(posfeats)*3/4
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
classifier = 'svm'
for cl in classifier:
if cl == 'svm':
classifierName = 'SVM'
classifier = SklearnClassifier(LinearSVC(), sparse=False)
classifier.train(trainfeats)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(testfeats):
refsets[label].add(i)
observed = classifier.classify(feats)
testsets[observed].add(i)
accuracy = nltk.classify.util.accuracy(classifier, testfeats)
pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
neg_fmeasure = nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
print ''
print '---------------------------------------'
print 'SINGLE FOLD RESULT ' + '(' + classifierName + ')'
print '---------------------------------------'
print 'accuracy:', accuracy
print 'precision', (pos_precision + neg_precision) / 2
print 'recall', (pos_recall + neg_recall) / 2
print 'f-measure', (pos_fmeasure + neg_fmeasure) / 2
print ''
## CROSS VALIDATION
trainfeats = negfeats + posfeats
# SHUFFLE TRAIN SET
random.shuffle(trainfeats)
n = 5
for cl in classifier_list:
subset_size = len(trainfeats) / n
accuracy = []
pos_precision = []
pos_recall = []
neg_precision = []
neg_recall = []
pos_fmeasure = []
neg_fmeasure = []
cv_count = 1
for i in range(n):
testing_this_round = trainfeats[i*subset_size:][:subset_size]
training_this_round = trainfeats[:i*subset_size] + trainfeats[(i+1)*subset_size:]
if cl == 'svm':
classifierName = 'SVM'
classifier = SklearnClassifier(LinearSVC(), sparse=False)
classifier.train(training_this_round)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(testing_this_round):
refsets[label].add(i)
observed = classifier.classify(feats)
testsets[observed].add(i)
cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
cv_pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
cv_pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
cv_pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
cv_neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
cv_neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
cv_neg_fmeasure = nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
accuracy.append(cv_accuracy)
pos_precision.append(cv_pos_precision)
pos_recall.append(cv_pos_recall)
neg_precision.append(cv_neg_precision)
neg_recall.append(cv_neg_recall)
pos_fmeasure.append(cv_pos_fmeasure)
neg_fmeasure.append(cv_neg_fmeasure)
cv_count += 1
print '---------------------------------------'
print 'N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')'
print '---------------------------------------'
print 'accuracy:', sum(accuracy) / n
print 'precision', (sum(pos_precision)/n + sum(neg_precision)/n) / 2
print 'recall', (sum(pos_recall)/n + sum(neg_recall)/n) / 2
print 'f-measure', (sum(pos_fmeasure)/n + sum(neg_fmeasure)/n) / 2
print ''
evaluate_classifier(word_feats)`
its suppose to analysis sentiment from csv document using SVM, but when i run the code i got this error anyone have an idea to fix it??
really need your help guys
Related
I'm a new guy to tensorflow. I tried but couldn't solve this.
TypeError: <tf.Tensor 'module_apply_default/aggregation/mul_3:0' shape=(2, 9, 1024) dtype=float32> is out of scope and cannot be used here. Use return values, explicit Python locals or TensorFlow collections to access it.
Please see https://www.tensorflow.org/guide/function#all_outputs_of_a_tffunction_must_be_return_values for more information.
<tf.Tensor 'module_apply_default/aggregation/mul_3:0' shape=(2, 9, 1024) dtype=float32> was defined here:
The tensor <tf.Tensor 'module_apply_default/aggregation/mul_3:0' shape=(2, 9, 1024) dtype=float32> cannot be accessed from here, because it was defined in <tensorflow.python.framework.ops.Graph object at 0x0000011DC7B75690>, which is out of scope.
This is my Code -
import io
import tensorflow as tf
import tensorflow_hub as hub
import glob
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import genesis
from nltk.corpus import wordnet_ic
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet as wn
import gensim
from gensim.models import doc2vec
from collections import namedtuple
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import math
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from scipy import spatial
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# nltk.download('wordnet')
# nltk.download('genesis')
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
tokenizer = RegexpTokenizer(r'\w+') #will remove punctuations
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
genesis_ic=wn.ic(genesis, False, 0.0)
# module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" ##param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]
# embed = hub.Module(module_url)
# elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
def get_tdata():
direc=glob.glob('Semeval Data XMLtocsv/*.csv')
direc.sort()
df=[]
for s in direc:
df.append(pd.read_csv(s))
ref_ans=[]
f = open(r"Semeval Data XMLtocsv/ref_ans.txt", "r")
for x in f:
if x=='\n':
continue
ref_ans.append(x.strip())
question=[]
f = open(r"Semeval Data XMLtocsv/question.txt", "r")
for x in f:
if x=='\n':
continue
question.append(x.strip())
stud_ans=[]
accuracy=[]
for i in range(len(df)):
stud_ans.append([j for j in df[i]['__text']])
accuracy.append([j for j in df[i]['_accuracy']])
corr=0
incorr=0
Y=[]
for i in range(0,len(stud_ans)):
for j in range(len(accuracy[i])):
if(accuracy[i][j]=="correct"):
Y.append(1)
corr=corr+1
else:
Y.append(0)
incorr=incorr+1
print(len(stud_ans),len(ref_ans),len(question),len(Y))
return stud_ans,ref_ans,question,Y
stud_ans,ref_ans,question,Y=get_tdata()
print(ref_ans[0])
def get_func_words():
file=open(r"function_words")
func_words=[]
pattern=r"\w+'?\w*"
for i in file.readlines():
result=re.findall(pattern,i)
for j in result:
func_words.append(j)
return func_words
func_words=get_func_words()
def uni_sent_encoder(stud_ans):
# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)
module_url = "./uni_encoder" ##param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]
embed = hub.Module(module_url)
with tf.Session() as session:
session.run([tf.global_variables_initializer(), tf.tables_initializer()])
embeddings = session.run(embed(stud_ans))
return embeddings
def cs_univ_encoder(stud_ans,ref_ans):
t=[]
t=stud_ans[:]
t.append(ref_ans)
matrix = uni_sent_encoder(t)
cossim_use=[0] * (len(matrix)-1)
for i in range(len(matrix)-1):
cossim_use[i] = 1 - spatial.distance.cosine(matrix[i], matrix[len(matrix)-1])
return cossim_use
def word_sent_length(stud_ans):
sent_length=[0] * len(stud_ans)
av_word_length=[0] * len(stud_ans)
j=0
for i in stud_ans:
sent_length[j]=len(tokenizer.tokenize(i))
for w in i:
if(w!=' '):
av_word_length[j]+=1
av_word_length[j]/=sent_length[j]
j+=1
ws = [av_word_length, sent_length]
return ws
def avsenlen(stud_ans):
temp = word_sent_length(stud_ans)
avg_sent_length_in_doc = 0
for i in range(len(temp[1])):
avg_sent_length_in_doc += temp[1][i]
avg_sent_length_in_doc /= len(temp[1])
return avg_sent_length_in_doc
def prompt_overlap(s_ans,question):
# print(len(tokenizer.tokenize(s_ans)),"\n",s_ans,'\n',question)
# overlap_metric = [0] * len(tokenizer.tokenize(s_ans))
qs_words = tokenizer.tokenize(question)
q_words = [w for w in qs_words if w not in stop_words]
l=0
overlap_metric = 0
w = tokenizer.tokenize(s_ans)
for j in w:
for k in q_words:
if(j==k):
overlap_metric+=1
break
l+=1
myInt = len(q_words)
overlap_metric /= myInt
return overlap_metric
def pre_word2vec():
model = r"cc.en.300.vec"
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(model, binary=False)
return word_vectors
# fin = io.open(r'cc.en.300.vec', encoding='utf-8', newline='\n', errors='ignore')
# n, d = map(int, fin.readline().split())
# data = {}
# for line in fin:
# tokens = line.rstrip().split(' ')
# data[tokens[0]] = map(float, tokens[1:])
# return data
w2vmodel = pre_word2vec()
def cosine_sim_word2vec(stud_ans,ref_ans):
nums=[]
for i in range(len(stud_ans)):
ss1 = stud_ans[i]
ss2 = ref_ans
data = []
data2= []
stop_words = set(stopwords.words('english'))
s1 = [w.lower() for w in tokenizer.tokenize(ss1) if w.lower() not in stop_words]
s2 = [w.lower() for w in tokenizer.tokenize(ss2) if w.lower() not in stop_words]
dd=[]
dd.append(s1)
dd.append(s2)
sim=0
for i in s1:
maxi=0
for j in s2:
# print("I'm in")
maxi = max(maxi,w2vmodel.similarity(i,j))
sim+=maxi
length = max(len(word_tokenize(ss1)), len(word_tokenize(ss2)))
sim/=length
nums.append(sim)
return nums
def d2v(sa):
doc1=sa
docs = []
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
for i, text in enumerate(doc1):
words = text.lower().split()
tags = [i]
docs.append(analyzedDocument(words, tags))
# Train model (set min_count = 1, if you want the model to work with the provided example data set)
model = doc2vec.Doc2Vec(docs, vector_size = 12, window = 300, min_count = 1, workers = 4)
return model.docvecs
def cosine_sim_d2v(stud_ans,ref_ans):
t=[]
t=stud_ans[:]
t.append(ref_ans)
matrix = d2v(t)
cossimw2v=[0] * (len(matrix)-1)
for i in range(len(matrix)-1):
cossimw2v[i] = 1 - spatial.distance.cosine(matrix[i], matrix[len(matrix)-1])
return cossimw2v
def IDFpp(stud_ans):
doc_info = []
j=0
for i in stud_ans:
j+=1
sa = tokenizer.tokenize(i)
count = len(sa)
temp = {"doc_id":j, "doc_length(count)":count}
doc_info.append(temp)
k=0
freq = []
for i in stud_ans:
k+=1
sa = tokenizer.tokenize(i)
fd={}
for w in sa:
w=w.lower()
if w in fd:
fd[w]+=1
else:
fd[w]=1
temp = {'doc_id':i, "freq":fd}
freq.append(temp)
#print(freq,"\n\n\n\n\n")
return doc_info, freq
#Tfidf vectorizer function
def IDF(stud_ans):
doc_info, freq = IDFpp(stud_ans)
IDFscore=[]
counter = 0
for d in freq:
counter+=1
for k in d['freq'].keys():
#print(k)
count = sum([k in tempDict['freq'] for tempDict in freq])
#if()
#print(count)
temp = {'doc_id':counter, 'IDFscore':math.log(len(doc_info)/count),'TF score':(count),'key':k}
IDFscore.append(temp)
return IDFscore
def fsts(stud_ans,ref_ans):
k1=1.2
b=0.75
fstsvalue=[]
avsenlength = avsenlen(stud_ans)
#compare stud_ans and ref_ans
idfscore = IDF(stud_ans)
for i in range(len(stud_ans)):
if(len(word_tokenize(stud_ans[i])) > len(word_tokenize(ref_ans))):
lsen = [w.lower() for w in tokenizer.tokenize(stud_ans[i])]
ssen = [w.lower() for w in tokenizer.tokenize(ref_ans)]
sl=len((stud_ans[i]))
ss=len((ref_ans))
else:
ssen = [w.lower() for w in tokenizer.tokenize(stud_ans[i])]
lsen = [w.lower() for w in tokenizer.tokenize(ref_ans)]
ss=len((stud_ans[i]))
sl=len((ref_ans))
temp=0
for i in (lsen):
maxi=0
idf=0
for w in (ssen):
maxi = max(maxi,w2vmodel.similarity(i,w))
for j in range(len(idfscore)):
if(idfscore[j]['key'] == i):
idf = idfscore[j]['IDFscore']
temp += idf * (maxi * (k1+1))
temp /= (maxi + k1* (1- b + b*(ss/avsenlength)))
fstsvalue.append(temp)
return fstsvalue
def noun_overlap(stud_ans,ref_ans):
word_tokens = tokenizer.tokenize(stud_ans)
ref_tokens = tokenizer.tokenize(ref_ans)
stud_ans_tag=nltk.pos_tag(word_tokens)
ref_ans_tag=nltk.pos_tag(ref_tokens)
ref_nouns=[]
stud_ans_nouns=[]
#use regex here
for i,j in ref_ans_tag:
if(j in ["NN","NNS","NNP","NNPS"]):
ref_nouns.append(i)
for i,j in stud_ans_tag:
if(j in ["NN","NNS","NNP","NNPS"]):
stud_ans_nouns.append(i)
score=0
for i in stud_ans_nouns:
if i in ref_nouns:
score=score+1;
return score/len(ref_nouns)
def content_overlap(s,r):#both are lists
s_ans=[]
ref_ans=r
t=[]
for i in range(len(s)):
t=(tokenizer.tokenize(s[i]))
s_ans.append(t)
for i in range(len(s_ans)):
s_ans[i] = [lemmatizer.lemmatize(j) for j in s_ans[i] if j not in func_words]
ref_ans = [lemmatizer.lemmatize(i) for i in tokenizer.tokenize(ref_ans) if i not in func_words]
length=len(ref_ans)
for i in range(len(ref_ans)):
for j in wn.synsets(ref_ans[i]):
for k in j.lemmas():
ref_ans.append(k.name())
temp=[]
for i in s_ans:
val=0
for j in i:
if j in ref_ans:
val+=1
temp.append(val/(length))
return temp
def lsa_score(stud_ans):
corpus = [ans for ques in stud_ans for ans in ques]
vectorizer = TfidfVectorizer(min_df = 1, stop_words = 'english')
dtm = vectorizer.fit_transform(corpus)
lsa = TruncatedSVD(3, algorithm = 'arpack')
dtm=dtm.astype(float)
dtm_lsa = lsa.fit_transform(dtm)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
lsa_stud=pd.DataFrame(dtm_lsa, index = corpus, columns = ["component_1","component_2",'component_3'])
X=[]
for i in range(len(corpus)):
t=[]
t.append(lsa_stud.iloc[i,:1].values[0])
t.append(lsa_stud.iloc[i,1:2].values[0])
t.append(lsa_stud.iloc[i,2:3].values[0])
X.append(t)
return X
def elmo_vectors(x):
# Create graph and finalize (optional but recommended).
g = tf.Graph()
with g.as_default():
text_input = tf.compat.v1.placeholder(dtype=tf.string, shape=[None])
elmo = hub.Module("./elmo_module", trainable=True)
embeddings = elmo(x, signature="default", as_dict=True)["elmo"]
init_op = tf.group([tf.compat.v1.global_variables_initializer(), tf.compat.v1.tables_initializer()])
# g.finalize()
# Create session and initialize.
session = tf.compat.v1.Session(graph=g)
session.run(init_op)
return session.run(tf.compat.v1.reduce_mean(embeddings,1))
# embeddings = elmo(x, signature="default", as_dict=True)["elmo"]
#
# with tf.Session() as sess:
# sess.run(tf.global_variables_initializer())
# sess.run(tf.tables_initializer())
#
# # return average of ELMo features
# return sess.run(tf.reduce_mean(embeddings,1))
def cos_sim_elmo(stud_ans,ref_ans):
t=[]
t=stud_ans[:]
t.append(ref_ans)
matrix = elmo_vectors(t)
cossimelmo = [0] * (len(matrix))
for i in range(len(matrix)):
cossimelmo[i] = 1 - spatial.distance.cosine(matrix[i], matrix[len(matrix)-1])
return cossimelmo
def jc_sim(stud_ans,ref_ans):
X=[]
c=0
ref_words=tokenizer.tokenize(ref_ans)
ref_words=[lemmatizer.lemmatize(j) for j in ref_words if j.lower() not in stop_words]
for s in stud_ans:
num=0
words=tokenizer.tokenize(s)
words=[lemmatizer.lemmatize(j) for j in words if j.lower() not in stop_words]
l=max(len(ref_words),len(words))
for w in words:
maxi=0
for w1 in wn.synsets(w):
for t in ref_words:
for w2 in wn.synsets(t):
if w1._pos in ('n','v','a','r') and w2._pos in ('n','v','a','r') and w1._pos==w2._pos:
n=w1.jcn_similarity(w2,genesis_ic)
if w1==w2 or n>1:
maxi=1
else:
maxi=max(maxi,w1.jcn_similarity(w2,genesis_ic))
num=num+maxi
num=num/l
X.append(num)
return X
def sp_sim(stud_ans,ref_ans):
X=[]
c=0
ref_words=tokenizer.tokenize(ref_ans)
ref_words=[lemmatizer.lemmatize(j) for j in ref_words if j.lower() not in stop_words]
for s in stud_ans:
num=0
words=tokenizer.tokenize(s)
words=[lemmatizer.lemmatize(j) for j in words if j.lower() not in stop_words]
l=max(len(ref_words),len(words))
for w in words:
maxi=0
for w1 in wn.synsets(w):
for t in ref_words:
for w2 in wn.synsets(t):
if w1._pos in ('n','v','a','r') and w2._pos in ('n','v','a','r') and w1._pos==w2._pos:
n=w1.lch_similarity(w2,genesis_ic)
#print(w1, w2, type(n), n)
#return None when there is no similarity hence needed to add another if clause
if n == None:
maxi=0
elif w1==w2 or n>1:
maxi=1
else:
maxi=max(maxi,w1.lch_similarity(w2,genesis_ic))
num=num+maxi
#print(num)
num=num/l
X.append(num)
return X
def ttr(sent):
words=tokenizer.tokenize(sent)
return len(set(words))/len(words)
def calc_train_features():
X=[]
lss=lsa_score(stud_ans)
c=0
for i in range(len(stud_ans)):
sta=stud_ans[i]
ref=ref_ans[i]
que=question[i]
lt=len(sta)
wsn=word_sent_length(sta)
csw2v=cosine_sim_word2vec(sta,ref)
csd2v=cosine_sim_d2v(sta,ref)
fs=fsts(sta,ref)
co = content_overlap(sta,ref)
cselmo = cos_sim_elmo(sta, ref)
jcs = jc_sim(sta,ref)
sps = sp_sim(sta,ref)
csuse = cs_univ_encoder(sta,ref)
for j in range(lt):
temp = []
temp.append(prompt_overlap(sta[j],que))
for k in range(1):
temp.append(wsn[k][j])
temp.append(csw2v[j])
temp.append(csd2v[j])
temp.append(fs[j])
# temp.append(noun_overlap(sta[j],ref))
# temp.append(co[j])
for k in range(3):
temp.append(lss[c][k])
temp.append(ttr(sta[j]))
temp.append(jcs[j])
temp.append(sps[j])
# temp.append(glv[j])
temp.append(csuse[j])
X.append(temp)
print(i," - ",j, " -- ",c)
c=c+1
def get_features(sta,ref,q):
temps=stud_ans[:]
temps.append(sta)
lss=lsa_score(temps)[-len(sta):]
wsn=word_sent_length(sta)
csw2v=cosine_sim_word2vec(sta,ref)
csd2v=cosine_sim_d2v(sta,ref)
fs=fsts(sta,ref)
co = content_overlap(sta,ref)
cselmo = cos_sim_elmo(sta, ref)
jcs = jc_sim(sta,ref)
sps = sp_sim(sta,ref)
csuse = cs_univ_encoder(sta,ref)
X=[]
for j in range(len(sta)):
temp=[]
temp.append(prompt_overlap(sta[j],q))
for k in range(1):
temp.append(wsn[k][j])
temp.append(csw2v[j])
temp.append(csd2v[j])
temp.append(fs[j])
# temp.append(noun_overlap(sta[j],ref))
# temp.append(co[j])
temp.append(cselmo[j])
for k in range(3):
temp.append(lss[j][k])
temp.append(ttr(sta[j]))
temp.append(jcs[j])
temp.append(sps[j])
temp.append(csuse[j])
X.append(temp)
return X
def train_data():
xdf=pd.read_csv("./final_features.csv")[['prompt_overlap','avg_word_length','cosineword2vec','cosinedoc2vec','fsts','cosine_elmo','lsa1','lsa2','lsa3','ttr','jc_sim','sps','cs_use','score']]
Y=xdf['score'].values
xdf=xdf.drop(['score'],axis=1)
X=xdf.values
return X,Y
def train_model():
X,Y=train_data()
train_x, test_x, train_y, test_y = train_test_split(X,Y,test_size=0.2)
clf = RandomForestClassifier(n_estimators=120,max_depth=15)
clf.fit(train_x,train_y)
print( "Test Accuracy :: ", accuracy_score(test_y, clf.predict(test_x)))
return clf
clf=train_model()
def predict_ans(X):
return clf.predict(X)
def get_prob(X):
return clf.predict_proba(X)
In my problem, I'm trying to compare the perplexity values of different N-gram models, say till N=4.
However, I'm confused with the other results obtained using other methods.
Here is my first implementation: -
import nltk
nltk.download('punkt')
nltk.download('webtext')
from nltk.corpus import webtext
nltk.download('stopwords')
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import Laplace
from nltk.lm import MLE
from sklearn.model_selection import train_test_split
from decimal import Decimal
import numpy as np
from nltk.util import ngrams
corpus = []
for fileid in webtext.fileids():
corpus += [list(i) for i in webtext.sents(fileid)]
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
def preprocess_sentence(text):
text = text.lower()
text = re.sub(r'[^\w\s]', '', text)
text = re.sub(r'[0-9]', '', text)
#remove unwanted spaces
text = re.sub(' +', ' ', text)
#remove trailing spaces
text = text.strip()
#remove stop words
f = []
for w in text.split(" "):
if w not in stopwords:
f.append(w)
text = " ".join(f)
# text = text.split(" ")
# text = [stemmer.stem(word) for word in text]
# text = [lemmatizer.lemmatize(word) for word in text]
return text
data = []
for sent in corpus:
newstr = ' '.join(sent)
newstr = preprocess_sentence(newstr)
newlist = newstr.split(' ')
data.append(newlist)
**Bigram Model**
n = 2
train_data, padded_vocab = padded_everygram_pipeline(n, train)
model_bi = Laplace(n)
model_bi.fit(train_data, padded_vocab)
generated_2grams = []
for sent in test:
generated_2grams.append(list(ngrams(sent, 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')))
PP_bi = []
sum_sent_pp = 0
for sent in generated_2grams:
totalWords = len(sent)
sent_pp = 1
for bigram in sent:
s1, s2 = bigram[0], bigram[1].split()
score = model_bi.score(s1, s2) # score for each bigram
if score != 0:
sent_pp *= (1 / Decimal(score))
sent_pp = pow(sent_pp, Decimal(1 / totalWords))
PP_bi.append(sent_pp)
print(f"Perplexity of a Bigram Model is {sum(PP_bi) / len(PP_bi)}")
**Trigram Model**
n = 3
train_data, padded_vocab = padded_everygram_pipeline(n, train)
model_tri = Laplace(n)
model_tri.fit(train_data, padded_vocab)
generated_3grams = []
for sent in test:
generated_3grams.append(list(ngrams(sent, 3, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')))
PP_tri = []
for sent in generated_3grams:
totalWords = len(sent)
sent_pp = 1
for trigram in sent:
s1, s2, s3 = trigram[0], trigram[1], trigram[2]
jointStr = s1 + " " + s2
l = jointStr.split()
score = model_tri.score(s3, l)
if score != 0:
sent_pp *= Decimal(1 / score)
sent_pp = pow(sent_pp, Decimal(1 / totalWords))
PP_tri.append(sent_pp)
print(f"Perplexity of a Trigram Model is {sum(PP_tri) / len(PP_tri)}")
**Quadgram Model**
n = 4
train_data, padded_vocab = padded_everygram_pipeline(n, train)
model_quad = Laplace(n)
generated_4grams = []
for sent in test:
generated_4grams.append(list(ngrams(sent, 4, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')))
PP_quad = []
for sent in generated_4grams:
totalWords = len(sent)
sent_pp = 1
for quadgram in sent:
s1, s2, s3, s4 = quadgram[0], quadgram[1], quadgram[2], quadgram[3]
jointStr = s1 + " " + s2 + " " + s3
l = jointStr.split()
score = model_quad.score(s4, l)
if score != 0:
sent_pp *= Decimal(1 / score)
sent_pp = pow(sent_pp, Decimal(1 / totalWords))
PP_quad.append(sent_pp)
print(f"Perplexity of a Quadgram Model is {sum(PP_quad) / len(PP_quad)}")
Results:
1) Perplexity of Bigram : 12900.02
2) Perplexity of Trigram: 6241.26
3) Perplexity of Quadgram: 6804.64
The Perplexity should decrease in Quadgram model, however, it's more than the trigram one.
Moreover, I'm not so sure that the perplexity values I'm getting, and the method I have implemented is correct.
I also tried finding perplexites usin nltk.perplexity(), however, I'm getting different results. The input to nltk.perplexity is bigrams of a sentence in a corpus.
With both the approaches giving different results, I'm quite confused.
I read an article, named Unsupervised Personality Recognition for Social Network Sites, about the personality extraction from text. There are 22 features and 4 classes represented 4 personalities. Through counting the features in a text, we can know which class this sentence belong to, which means we can know the personality of the sentence.
The article provides the correlations for every feature and class. So the score of a class is the feature's value minus the mean of all feature's value then divided by the standard deviation of all feature's value then multiply the correlation coefficiency provided by the article. Then we can judge if the sentence belongs to the class through its score. I set the threshold to improve its accuracy but it is still not good enough. My result is around 50-60% accuracy and don't know how to improve it. Anyone can help me ?
import csv
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
import pickle
from statistics import mean, stdev
with open('mypersonality_final.csv', newline = '') as csvfile:
reader = csv.reader(csvfile)
test = []
for w in reader:
test.append(w)
def all_punctuation(text):
punctuations = ['.', ',', ';', ':']
count = 0
for w in text:
if w in punctuations:
count += 1
return count
def count_commas(text):
count = 0
for w in text:
if w == ',':
count += 1
return count
def count_pattern(text):
grammar = RegexpTokenizer(r'\#')
pattern = grammar.tokenize(text)
return len(pattern)
def count_exclamation(text):
grammar = RegexpTokenizer(r'\!')
pattern = grammar.tokenize(text)
return len(pattern)
def ex_links(text):
grammar = RegexpTokenizer(r'http?\S+\w(?:(?:\/[^\s/]*))*|www\.\S+\w(?:(?:\/[^\s/]*))*|ftp\S+\w(?:(?:\/[^\s/]*))*')
pattern = grammar.tokenize(text)
return len(pattern)
def firs_sinpronouns(text):
sigpronouns = ['i', 'me', 'my', 'mine', 'we']
count = 0
for w in text:
if w.lower() in sigpronouns:
count += 1
return count
def negative_particle(text):
with open('negative-words.txt') as neg:
neg = neg.read()
words = nltk.word_tokenize(neg)
grammar = RegexpTokenizer(r'\w+')
nopunctuation = grammar.tokenize(text)
count = 0
for w in nopunctuation:
if w.lower() in words:
count += 1
return count
def negative_emoticon(text):
grammar = RegexpTokenizer(r"(?::|;|=)(?:-)?(?:\()")
emoticons = grammar.tokenize(text)
return len(emoticons)
def numbers(text):
grammar = RegexpTokenizer(r'\d+')
pattern = grammar.tokenize(text)
return len(pattern)
def parenthesis(text):
pat = '\([^)]*\)'
parent = re.findall(pat, text)
return len(parent)
def positive_emoticon(text):
grammar = RegexpTokenizer(r'(?::|;|=|<|>)(?:-|\.)?(?:\)|D|P|3|<)')
emoticons = grammar.tokenize(text)
return len(emoticons)
def prepositions(text):
tagged = nltk.pos_tag(text)
count = 0
for w in tagged:
if w[1] == 'IN':
count += 1
return count
def pronouns(text):
tagged = nltk.pos_tag(text)
count = 0
for w in tagged:
if (w[1] == 'PRP' or w[1] == 'PRP$' or w[1] == 'WP' or w[1] == 'WPR$'):
count += 1
return count
def count_question(text):
grammar = RegexpTokenizer(r'\?')
pattern = grammar.tokenize(text)
return len(pattern)
def long_words(text):
grammar = RegexpTokenizer(r'\w{7,}')
pattern = grammar.tokenize(text)
return len(pattern)
def firs_pronouns(text):
firstpronouns = ['i', 'me', 'my', 'mine', 'we', 'our', 'ours', 'us']
count = 0
for w in text:
if w.lower() in firstpronouns:
count += 1
return count
def swears_count(text):
with open('swears.txt') as test:
words = test.read()
swears = re.sub(r'[^\w+\s]+', '', words)
swears = swears.split('\n')
count = 0
for w in text:
if w.lower() in swears:
count += 1
return count
def typetoken_ratio(text):
typed = set(text)
token = text
ratio = len(typed)/len(token)
return ratio
def count_words(text):
grammar = RegexpTokenizer(r'\w+')
pattern = grammar.tokenize(text)
return len(pattern)
def firs_pluralpronouns(text):
pluralpronouns = ['we', 'our', 'ours', 'us']
count = 0
for w in text:
if w.lower() in pluralpronouns:
count += 1
return count
def sec_pronouns(text):
secpronouns = ['you', 'your', 'yours']
count = 0
for w in text:
if w.lower() in secpronouns:
count += 1
return count
def mean_freq(text):
## grammar = RegexpTokenizer(r'\w+')
words = word_tokenize(text)
wordsl = []
for w in words:
wordsl.append(w.lower())
unique = set(wordsl)
return (len(wordsl)/len(unique))
def mean_std(test):
f1 = []
f2 = []
f3 = []
f4 = []
f5 = []
f6 = []
f7 = []
f8 = []
f9 = []
f10 = []
f11 = []
f12 = []
f13 = []
f14 = []
f15 = []
f16 = []
f17 = []
f18 = []
f19 = []
f20 = []
f21 = []
f22 = []
for w in test[1:]:
f1.append(all_punctuation(word_tokenize(w[1])))
f2.append(count_commas(word_tokenize(w[1])))
f3.append(count_pattern(w[1]))
f4.append(count_exclamation(w[1]))
f5.append(ex_links(w[1]))
f6.append(firs_sinpronouns(word_tokenize(w[1])))
f7.append(negative_particle(w[1]))
f8.append(negative_emoticon(w[1]))
f9.append(numbers(w[1]))
f10.append(parenthesis(w[1]))
f11.append(positive_emoticon(w[1]))
f12.append(prepositions(word_tokenize(w[1])))
f13.append(pronouns(word_tokenize(w[1])))
f14.append(count_question(w[1]))
f15.append(long_words(w[1]))
f16.append(firs_pronouns(word_tokenize(w[1])))
f17.append(swears_count(word_tokenize(w[1])))
f18.append(typetoken_ratio(word_tokenize(w[1])))
f19.append(count_words(w[1]))
f20.append(firs_pluralpronouns(word_tokenize(w[1])))
f21.append(sec_pronouns(word_tokenize(w[1])))
f22.append(mean_freq(w[1]))
value = [f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15, f16, f17, f18, f19, f20, f21, f22]
mean1 = []
stdev1 = []
for a in value:
mean1.append(round(mean(a),2))
stdev1.append(round(stdev(a),2))
return (mean1, stdev1)
##save_file = open('sample_value.pickle', 'wb')
##pickle.dump(mean_std(test), save_file)
##save_file.close()
savedfile = open('sample_value.pickle', 'rb')
trained = pickle.load(savedfile)
savedfile.close()
def evaluation(test):
ne = 0
ns = 0
na = 0
nc = 0
no = 0
for w in test[1:]:
z1 = (all_punctuation(word_tokenize(w[1])) - trained[0][0])/(trained[1][0])
z2 = (count_commas(word_tokenize(w[1]))- trained[0][1])/(trained[1][1])
z3 = (count_pattern(w[1]) - trained[0][2])/(trained[1][2])
z4 = (count_exclamation(w[1]) - trained[0][3])/(trained[1][3])
z5 = (ex_links(w[1]) - trained[0][4])/(trained[1][4])
z6 = (firs_sinpronouns(word_tokenize(w[1]))- trained[0][5])/(trained[1][5])
z7 = (negative_particle(w[1])-trained[0][6])/(trained[1][6])
z8 = (negative_emoticon(w[1]) - trained[0][7])/(trained[1][7])
z9 = (numbers(w[1])-trained[0][8])/(trained[1][8])
z10 = (parenthesis(w[1])-trained[0][9])/(trained[1][9])
z11 = (positive_emoticon(w[1])-trained[0][10])/(trained[1][10])
z12 = (prepositions(word_tokenize(w[1]))-trained[0][11])/(trained[1][11])
z13 = (pronouns(word_tokenize(w[1]))-trained[0][12])/(trained[1][12])
z14 = (count_question(w[1])-trained[0][13])/(trained[1][13])
z15 = (long_words(w[1])-trained[0][14])/(trained[1][14])
z16 = (firs_pronouns(word_tokenize(w[1]))-trained[0][15])/(trained[1][15])
z17 = (swears_count(word_tokenize(w[1]))-trained[0][16])/(trained[1][16])
z18 = (typetoken_ratio(word_tokenize(w[1]))-trained[0][17])/(trained[1][17])
z19 = (count_words(w[1])-trained[0][18])/(trained[1][18])
z20 = (firs_pluralpronouns(word_tokenize(w[1]))-trained[0][19])/(trained[1][19])
z21 = (sec_pronouns(word_tokenize(w[1]))-trained[0][20])/(trained[1][20])
z22 = (mean_freq(w[1])-trained[0][21])/(trained[1][21])
E = -0.08*z1-0.02*z2-0.07*z3-0.05*z5+0.05*z6-0.08*z7-0.03*z8-0.03*z9-0.06*z10+0.07*z11+0.07*z13-0.06*z14-0.06*z15+0.07*z16-0.01*z17-0.05*z18-0.01*z19+0.06*z20-0.01*z21+0.05*z22
S = -0.04*z1+0.01*z2+0.02*z3-0.05*z4-0.02*z5-0.15*z6+0.12*z7-0.18*z8+0.05*z9+0.03*z10+0.07*z11+0.06*z12+0.12*z13-0.05*z14+0.06*z15-0.14*z16+0.1*z18+0.02*z19+0.07*z20+0.03*z21-0.06*z22
A = -0.01*z1-0.02*z2+0.01*z3+0.06*z4-0.01*z5+0.05*z6+0.11*z7-0.11*z8-0.03*z9-0.04*z10+0.05*z11+0.04*z12+0.04*z13-0.04*z14-0.05*z15-0.06*z16-0.14*z17-0.04*z18+0.02*z19+0.04*z20-0.06*z21+0.03*z22
C = -0.04*z1-0.01*z2+0.01*z3-0.03*z5+0.04*z6-0.07*z7-0.11*z8-0.02*z9-0.01*z10+0.02*z11+0.08*z12+0.02*z13-0.06*z14+0.02*z15-0.04*z16-0.11*z17-0.05*z18-0.02*z19+0.01*z20-0.04*z21+0.06*z22
O = -10*z1+0.1*z2+0.06*z3-0.03*z4+0.09*z5-0.14*z6+0.01*z7+0.04*z8-0.06*z9+0.1*z10+0.02*z11-0.04*z12-0.06*z13+0.08*z14+0.1*z15-0.14*z16+0.08*z17+0.09*z18+0.06*z19+0.04*z20+0.11*z21-0.07*z22
if E>0.65:
if w[7] =='y':
ne+=1
if E<0.65:
if w[7]=='n':
ne+=1
if S>0.75:
if w[8] == 'y':
ns +=1
if S<0.75:
if w[8] == 'n':
ns+=1
if A>0.005:
if w[9]=='y':
na+=1
if A<0.005:
if w[9]=='n':
na+=1
if C>0.58:
if w[10]=='y':
nc+=1
if C<0.58:
if w[10]=='n':
nc+=1
if O>(-0.05):
if w[11]=='y':
no+=1
if O<(-0.05):
if w[11]=='n':
no+=1
print (round((ne/9917)*100,2), round((ns/9917)*100,2),round((na/9917)*100,2),round((nc/9917)*100,2),round((no/9917)*100,2))
evaluation(test)
The sample data is:
enter image description here
I am trying to use the maxent classifier using the NLTK library. I have a list of positive and negative words and I have trained the classifier on the same. The problem is when I test the classifier against a sentence I always get the same probability of classification for the two classes. Here is the code -
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import MaxentClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from sklearn import cross_validation
nltk.data.path.append("/home/daksh/Documents/Softwares/nltk_data")
import csv
import operator
from nltk.classify import MaxentClassifier
from nltk.corpus import movie_reviews
def getBestWords(posWords,negWords):
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for word in posWords:
word_fd[word.lower()] += 1
label_word_fd['pos'][word.lower()] += 1
for word in negWords:
word_fd[word.lower()] += 1
label_word_fd['neg'][word.lower()] += 1
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count
word_scores = {}
for word, freq in word_fd.items():
pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
(freq, pos_word_count), total_word_count)
neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
(freq, neg_word_count), total_word_count)
word_scores[word] = pos_score + neg_score
sorted_x = sorted(word_scores.items(), key=operator.itemgetter(1),reverse=True)[:2500]
bestwords = set([w for w,s in sorted_x])
return bestwords
def best_word_feats(words,bestwords):
return dict([(word, True) for word in words if word in bestwords])
def word_feats(words):
return dict([(word, True) for word in words])
def best_bigram_word_feats(words,posWords,negWords, score_fn=BigramAssocMeasures.chi_sq, n=200):
bigram_finder = BigramCollocationFinder.from_words(words)
bigrams = bigram_finder.nbest(score_fn, n)
d = dict([(bigram, True) for bigram in bigrams])
bestwords = getBestWords(posWords,negWords)
d.update(best_word_feats(words,bestwords))
return d
posWords = list()
negWords = list()
with open('../data/finalSentiPosWords.csv','r') as csvfile:
spamreader = csv.reader(csvfile)
posWords = list(spamreader)
with open('../data/finalSentiNegWords.csv','r') as csvfile:
spamreader = csv.reader(csvfile)
negWords = list(spamreader)
posWords = [word[0] for word in posWords]
negWords = [word[0] for word in negWords]
bestwords = getBestWords(posWords,negWords)
posfeats = [(best_bigram_word_feats(posWords,posWords,negWords),'pos')]
negfeats = [(best_bigram_word_feats(negWords,posWords,negWords),'neg')]
trainfeats = negfeats + posfeats
algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
classifier = nltk.MaxentClassifier.train(trainfeats, algorithm,max_iter=5)
# classifier = nltk.NaiveBayesClassifier.train(trainfeats)
classifier.show_most_informative_features(10)
sentence = "Dosa had a tangy taste but it was fun eating it. On the other hand the other dosa was soggy"
l = sentence.split(' ')
print(l)
print(word_feats(l))
print(classifier.prob_classify(word_feats(l)).prob('pos'))
print(classifier.prob_classify(word_feats(l)).prob('neg'))
The output of this is this -
0.500074231063
0.499925768937
The overall classification seems to be working fine but I can't figure out how the probabilities are calculated and why are they always same even if I change the test sentence.
Any quick help appreciated.
Thanks.
That's a lot of code! I'm not going to debug it for you, but I notice that bestwords is the set of all words in your training corpus. If that's not outright wrong, it's certainly misleadingly named.
I have a huge block of code, I didn't want to bother you with in the first place. I tried figuring out what's going wrong for over a week now and I contacted several external sources (without any response), and at the moment I'm just wondering: maybe the problem is my training set?
For my thesis I need to classify a whole bunch of tweets as pos/neg/neutral. The code I wrote works OK on test datasets I make up myself (e.g. consisting out of 15 training sentences: 5 pos, 5 neg and 5 neutral; 6 test sentences: 2 pos, 2 neg, 2 neutral - only 1 test sentence gets misclassified).
Once I start running the code on the manually classified training set (1629 pos, 1411 neutral tweets and only 690 neg) and 900 test tweets, things start going wrong. Of the 900 test tweets, the HUGE majority gets classified as pos (between 700 and 800), while there's only a minority of neg and neutral tweets.
Would somebody please be so kind as to check my code and help me figure out what I'm doing wrong? I'd be really grateful. If you need any more information, I'd be happy to provide it.
import re, math, collections, itertools
import nltk
import nltk.classify.util, nltk.metrics
import csv
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords = True)
pos = []
neg = []
neutral = []
with open('C:\\...pos.csv', 'r', encoding = "utf8") as f: #open positive training set
reader = csv.reader(f)
for row in reader:
pos.extend(row)
with open('C:\\ ...neg.csv', 'r', encoding = "utf8") as f: #open negative training set
reader = csv.reader(f)
for row in reader:
neg.extend(row)
with open('C:\\...neutral.csv', 'r', encoding = "utf8") as f: #open neutral training set
reader = csv.reader(f)
for row in reader:
neutral.extend(row)
def uni(doc):
x = []
y = []
for tweet in doc:
x.append(word_tokenize(tweet))
for element in x:
for word in element:
if len(word)>2:
word = word.lower()
word = stemmer.stem(word)
y.append(word)
return y
def word_feats_uni(doc):
return dict([(word, True) for word in uni(doc)])
def tokenizer_ngrams(document):
all_tokens = []
filtered_tokens = []
for (sentence) in document:
all_tokens.append(word_tokenize(sentence))
return all_tokens
def get_bi (document):
x = tokenizer_ngrams(document)
c = []
for sentence in x:
c.extend([bigram for bigram in nltk.bigrams(sentence)])
return c
def get_tri(document):
x = tokenizer_ngrams(document)
c = []
for sentence in x:
c.extend([bigram for bigram in nltk.bigrams(sentence)])
return c
def word_feats_bi(doc):
return dict([(word, True) for word in get_bi(doc)])
def word_feats_tri(doc):
return dict([(word, True) for word in get_tri(doc)])
def word_feats_test(doc):
feats_test = {}
feats_test.update(word_feats_uni(doc))
feats_test.update(word_feats_bi(doc))
feats_test.update(word_feats_tri(doc))
return feats_test
pos_feats = [(word_feats_uni(pos),'1')] + [(word_feats_bi(pos),'1')] + [(word_feats_tri(pos),'1')]
neg_feats = [(word_feats_uni(neg),'-1')] + [(word_feats_bi(neg),'-1')] + [(word_feats_tri(neg),'-1')]
neutral_feats = [(word_feats_uni(neutral),'0')] + [(word_feats_bi(neutral),'0')] + [(word_feats_tri(neutral),'0')]
trainfeats = pos_feats + neg_feats + neutral_feats
random.shuffle(trainfeats)
classifier = NaiveBayesClassifier.train(trainfeats)
testtweets = []
with open('C:\\ ... testtweets.csv', 'r', encoding = "utf8") as f: #open testset
reader = csv.reader(f, delimiter = ';')
for row in reader:
testtweets.extend([row])
date = []
word = []
y = []
def classification(date,sentence): #doc = sentencelist
i = 0
for tweet in sentence:
sent = classifier.classify(word_feats_test([tweet]))
y.extend([(date[i],tweet,sent)])
i = i + 1
def result(doc):
i = 0
while i in range(0,len(doc) -1):
date.append(doc[i][0])
word.append(doc[i][1])
i = i + 1
classification(date,word)
result(testtweets)
with open('C:\\...write.csv', 'w') as fp: #write classified test set to file
a = csv.writer(fp, delimiter=',')
a.writerows(y)