TF-IDF for my documents yield 0

TF-IDF for my documents yield 0 - python

I got this tfidf from yebrahim and somehow my output document yield all 0 for the result . Any problem with this ?
example of the output is
hippo 0.0
hipper 0.0
hip 0.0
hint 0.0
hindsight 0.0
hill 0.0
hilarious 0.0
thanks for the help
# a list of (words-freq) pairs for each document
global_terms_in_doc = {}
# list to hold occurrences of terms across documents
global_term_freq = {}
num_docs = 0
lang = 'english'
lang_dictionary = {}
top_k = -1
supported_langs = ('english', 'french')
from django.utils.encoding import smart_str, smart_unicode
# support for custom language if needed
def loadLanguageLemmas(filePath):
print('loading language from file: ' + filePath)
f = open(filePath)
for line in f:
words = line.split()
if words[1] == '=' or words[0] == words[1]:
continue
lang_dictionary[words[0]] = words[1]
def remove_diacritic(words):
for i in range(len(words)):
w = unicode(words[i], 'ISO-8859-1')
w = unicodedata.normalize('NFKD', w).encode('ASCII', 'ignore')
words[i] = w.lower()
return words
# function to tokenize text, and put words back to their roots
def tokenize(text):
text = ' '.join(text)
tokens = PunktWordTokenizer().tokenize(text)
# lemmatize words. try both noun and verb lemmatizations
lmtzr = WordNetLemmatizer()
for i in range(0,len(tokens)):
#tokens[i] = tokens[i].strip("'")
if lang != 'english':
if tokens[i] in lang_dictionary:
tokens[i] = lang_dictionary[tokens[i]]
else:
res = lmtzr.lemmatize(tokens[i])
if res == tokens[i]:
tokens[i] = lmtzr.lemmatize(tokens[i], 'v')
else:
tokens[i] = res
# don't return any single letters
tokens = [t for t in tokens if len(t) > 1 and not t.isdigit()]
return tokens
def remove_stopwords(text):
# remove punctuation
chars = ['.', '/', "'", '"', '?', '!', '#', '$', '%', '^', '&',
'*', '(', ')', ' - ', '_', '+' ,'=', '#', ':', '\\', ',',
';', '~', '`', '<', '>', '|', '[', ']', '{', '}']
for c in chars:
text = smart_str(text.replace(c, ' '))
text = text.split()
import nltk
if lang == 'english':
stopwords = nltk.corpus.stopwords.words('english')
else:
stopwords = open(lang + '_stopwords.txt', 'r').read().split()
content = [w for w in text if w.lower().strip() not in stopwords]
return content
# __main__ execution
import sys, re, math, unicodedata
from optparse import OptionParser
parser = OptionParser(usage='usage: %prog [options] input_file')
parser.add_option('-l', '--language', dest='language',
help='language to use in tokenizing and lemmatizing. supported\
languages: {english, french}', metavar='LANGUAGE')
parser.add_option('-k', '--top-k', dest='top_k',
help='output only terms with score no less k')
parser.add_option('-m', '--mode', dest='mode',
help='display mode. can be either "both" or "term"')
(options, args) = parser.parse_args()
if options.language:
if options.language not in supported_langs:
print 'only ', supported_langs, ' are supported in this version.'
quit()
if options.language != 'english':
lang = options.language
loadLanguageLemmas(options.language + '_lemmas.txt')
if options.top_k:
top_k = int(options.top_k)
display_mode = 'both'
if options.mode:
if options.mode == 'both' or options.mode == 'term':
display_mode = options.mode
else:
parser.print_help()
if not args:
parser.print_help()
quit()
reader = open(args[0])
all_files = reader.read().splitlines()
num_docs = len(all_files)
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize.punkt import PunktWordTokenizer
print('initializing..')
for f in all_files:
# local term frequency map
terms_in_doc = {}
doc_words = open(f).read().lower()
#print 'words:\n', doc_words
doc_words = remove_stopwords(doc_words)
#print 'after stopwords:\n', doc_words
doc_words = tokenize(doc_words)
#print 'after tokenize:\n', doc_words
#quit()
# increment local count
for word in doc_words:
if word in terms_in_doc:
terms_in_doc[word] += 1
else:
terms_in_doc[word] = 1
# increment global frequency
for (word,freq) in terms_in_doc.items():
if word in global_term_freq:
global_term_freq[word] += 1
else:
global_term_freq[word] = 1
global_terms_in_doc[f] = terms_in_doc
print('working through documents.. ')
for f in all_files:
writer = open(f + '_final', 'w')
result = []
# iterate over terms in f, calculate their tf-idf, put in new list
max_freq = 0;
for (term,freq) in global_terms_in_doc[f].items():
if freq > max_freq:
max_freq = freq
for (term,freq) in global_terms_in_doc[f].items():
idf = math.log(float(1 + num_docs) / float(1 + global_term_freq[term]))
tfidf = float(freq) / float(max_freq) * float(idf)
result.append([tfidf, term])
# sort result on tfidf and write them in descending order
result = sorted(result, reverse=True)
for (tfidf, term) in result[:top_k]:
if display_mode == 'both':
writer.write(term + '\t' + str(tfidf) + '\n')
else:
writer.write(term + '\n')
print('success, with ' + str(num_docs) + ' documents.')

Related

Line split is not functioning as intended

I am trying to get this code to split one at a time, but it is not functioning as expected:
for line in text_line:
one_line = line.split(' ',1)
if len(one_line) > 1:
acro = one_line[0].strip()
meaning = one_line[1].strip()
if acro in acronyms_dict:
acronyms_dict[acro] = acronyms_dict[acro] + ', ' + meaning
else:
acronyms_dict[acro] = meaning

Remove the ' ' from the str.split. The file is using tabs to delimit the acronyms:
import requests
data_site = requests.get(
"https://raw.githubusercontent.com/priscian/nlp/master/OpenNLP/models/coref/acronyms.txt"
)
text_line = data_site.text.split("\n")
acronyms_dict = {}
for line in text_line:
one_line = line.split(maxsplit=1) # <-- remove the ' '
if len(one_line) > 1:
acro = one_line[0].strip()
meaning = one_line[1].strip()
if acro in acronyms_dict:
acronyms_dict[acro] = acronyms_dict[acro] + ", " + meaning
else:
acronyms_dict[acro] = meaning
print(acronyms_dict)
Prints:
{
'24KHGE': '24 Karat Heavy Gold Electroplate',
'2B1Q': '2 Binary 1 Quaternary',
'2D': '2-Dimensional',
...

How to improve the personality extraction accuracy?

I read an article, named Unsupervised Personality Recognition for Social Network Sites, about the personality extraction from text. There are 22 features and 4 classes represented 4 personalities. Through counting the features in a text, we can know which class this sentence belong to, which means we can know the personality of the sentence.
The article provides the correlations for every feature and class. So the score of a class is the feature's value minus the mean of all feature's value then divided by the standard deviation of all feature's value then multiply the correlation coefficiency provided by the article. Then we can judge if the sentence belongs to the class through its score. I set the threshold to improve its accuracy but it is still not good enough. My result is around 50-60% accuracy and don't know how to improve it. Anyone can help me ?
import csv
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
import pickle
from statistics import mean, stdev
with open('mypersonality_final.csv', newline = '') as csvfile:
reader = csv.reader(csvfile)
test = []
for w in reader:
test.append(w)
def all_punctuation(text):
punctuations = ['.', ',', ';', ':']
count = 0
for w in text:
if w in punctuations:
count += 1
return count
def count_commas(text):
count = 0
for w in text:
if w == ',':
count += 1
return count
def count_pattern(text):
grammar = RegexpTokenizer(r'\#')
pattern = grammar.tokenize(text)
return len(pattern)
def count_exclamation(text):
grammar = RegexpTokenizer(r'\!')
pattern = grammar.tokenize(text)
return len(pattern)
def ex_links(text):
grammar = RegexpTokenizer(r'http?\S+\w(?:(?:\/[^\s/]*))*|www\.\S+\w(?:(?:\/[^\s/]*))*|ftp\S+\w(?:(?:\/[^\s/]*))*')
pattern = grammar.tokenize(text)
return len(pattern)
def firs_sinpronouns(text):
sigpronouns = ['i', 'me', 'my', 'mine', 'we']
count = 0
for w in text:
if w.lower() in sigpronouns:
count += 1
return count
def negative_particle(text):
with open('negative-words.txt') as neg:
neg = neg.read()
words = nltk.word_tokenize(neg)
grammar = RegexpTokenizer(r'\w+')
nopunctuation = grammar.tokenize(text)
count = 0
for w in nopunctuation:
if w.lower() in words:
count += 1
return count
def negative_emoticon(text):
grammar = RegexpTokenizer(r"(?::|;|=)(?:-)?(?:\()")
emoticons = grammar.tokenize(text)
return len(emoticons)
def numbers(text):
grammar = RegexpTokenizer(r'\d+')
pattern = grammar.tokenize(text)
return len(pattern)
def parenthesis(text):
pat = '\([^)]*\)'
parent = re.findall(pat, text)
return len(parent)
def positive_emoticon(text):
grammar = RegexpTokenizer(r'(?::|;|=|<|>)(?:-|\.)?(?:\)|D|P|3|<)')
emoticons = grammar.tokenize(text)
return len(emoticons)
def prepositions(text):
tagged = nltk.pos_tag(text)
count = 0
for w in tagged:
if w[1] == 'IN':
count += 1
return count
def pronouns(text):
tagged = nltk.pos_tag(text)
count = 0
for w in tagged:
if (w[1] == 'PRP' or w[1] == 'PRP$' or w[1] == 'WP' or w[1] == 'WPR$'):
count += 1
return count
def count_question(text):
grammar = RegexpTokenizer(r'\?')
pattern = grammar.tokenize(text)
return len(pattern)
def long_words(text):
grammar = RegexpTokenizer(r'\w{7,}')
pattern = grammar.tokenize(text)
return len(pattern)
def firs_pronouns(text):
firstpronouns = ['i', 'me', 'my', 'mine', 'we', 'our', 'ours', 'us']
count = 0
for w in text:
if w.lower() in firstpronouns:
count += 1
return count
def swears_count(text):
with open('swears.txt') as test:
words = test.read()
swears = re.sub(r'[^\w+\s]+', '', words)
swears = swears.split('\n')
count = 0
for w in text:
if w.lower() in swears:
count += 1
return count
def typetoken_ratio(text):
typed = set(text)
token = text
ratio = len(typed)/len(token)
return ratio
def count_words(text):
grammar = RegexpTokenizer(r'\w+')
pattern = grammar.tokenize(text)
return len(pattern)
def firs_pluralpronouns(text):
pluralpronouns = ['we', 'our', 'ours', 'us']
count = 0
for w in text:
if w.lower() in pluralpronouns:
count += 1
return count
def sec_pronouns(text):
secpronouns = ['you', 'your', 'yours']
count = 0
for w in text:
if w.lower() in secpronouns:
count += 1
return count
def mean_freq(text):
## grammar = RegexpTokenizer(r'\w+')
words = word_tokenize(text)
wordsl = []
for w in words:
wordsl.append(w.lower())
unique = set(wordsl)
return (len(wordsl)/len(unique))
def mean_std(test):
f1 = []
f2 = []
f3 = []
f4 = []
f5 = []
f6 = []
f7 = []
f8 = []
f9 = []
f10 = []
f11 = []
f12 = []
f13 = []
f14 = []
f15 = []
f16 = []
f17 = []
f18 = []
f19 = []
f20 = []
f21 = []
f22 = []
for w in test[1:]:
f1.append(all_punctuation(word_tokenize(w[1])))
f2.append(count_commas(word_tokenize(w[1])))
f3.append(count_pattern(w[1]))
f4.append(count_exclamation(w[1]))
f5.append(ex_links(w[1]))
f6.append(firs_sinpronouns(word_tokenize(w[1])))
f7.append(negative_particle(w[1]))
f8.append(negative_emoticon(w[1]))
f9.append(numbers(w[1]))
f10.append(parenthesis(w[1]))
f11.append(positive_emoticon(w[1]))
f12.append(prepositions(word_tokenize(w[1])))
f13.append(pronouns(word_tokenize(w[1])))
f14.append(count_question(w[1]))
f15.append(long_words(w[1]))
f16.append(firs_pronouns(word_tokenize(w[1])))
f17.append(swears_count(word_tokenize(w[1])))
f18.append(typetoken_ratio(word_tokenize(w[1])))
f19.append(count_words(w[1]))
f20.append(firs_pluralpronouns(word_tokenize(w[1])))
f21.append(sec_pronouns(word_tokenize(w[1])))
f22.append(mean_freq(w[1]))
value = [f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15, f16, f17, f18, f19, f20, f21, f22]
mean1 = []
stdev1 = []
for a in value:
mean1.append(round(mean(a),2))
stdev1.append(round(stdev(a),2))
return (mean1, stdev1)
##save_file = open('sample_value.pickle', 'wb')
##pickle.dump(mean_std(test), save_file)
##save_file.close()
savedfile = open('sample_value.pickle', 'rb')
trained = pickle.load(savedfile)
savedfile.close()
def evaluation(test):
ne = 0
ns = 0
na = 0
nc = 0
no = 0
for w in test[1:]:
z1 = (all_punctuation(word_tokenize(w[1])) - trained[0][0])/(trained[1][0])
z2 = (count_commas(word_tokenize(w[1]))- trained[0][1])/(trained[1][1])
z3 = (count_pattern(w[1]) - trained[0][2])/(trained[1][2])
z4 = (count_exclamation(w[1]) - trained[0][3])/(trained[1][3])
z5 = (ex_links(w[1]) - trained[0][4])/(trained[1][4])
z6 = (firs_sinpronouns(word_tokenize(w[1]))- trained[0][5])/(trained[1][5])
z7 = (negative_particle(w[1])-trained[0][6])/(trained[1][6])
z8 = (negative_emoticon(w[1]) - trained[0][7])/(trained[1][7])
z9 = (numbers(w[1])-trained[0][8])/(trained[1][8])
z10 = (parenthesis(w[1])-trained[0][9])/(trained[1][9])
z11 = (positive_emoticon(w[1])-trained[0][10])/(trained[1][10])
z12 = (prepositions(word_tokenize(w[1]))-trained[0][11])/(trained[1][11])
z13 = (pronouns(word_tokenize(w[1]))-trained[0][12])/(trained[1][12])
z14 = (count_question(w[1])-trained[0][13])/(trained[1][13])
z15 = (long_words(w[1])-trained[0][14])/(trained[1][14])
z16 = (firs_pronouns(word_tokenize(w[1]))-trained[0][15])/(trained[1][15])
z17 = (swears_count(word_tokenize(w[1]))-trained[0][16])/(trained[1][16])
z18 = (typetoken_ratio(word_tokenize(w[1]))-trained[0][17])/(trained[1][17])
z19 = (count_words(w[1])-trained[0][18])/(trained[1][18])
z20 = (firs_pluralpronouns(word_tokenize(w[1]))-trained[0][19])/(trained[1][19])
z21 = (sec_pronouns(word_tokenize(w[1]))-trained[0][20])/(trained[1][20])
z22 = (mean_freq(w[1])-trained[0][21])/(trained[1][21])
E = -0.08*z1-0.02*z2-0.07*z3-0.05*z5+0.05*z6-0.08*z7-0.03*z8-0.03*z9-0.06*z10+0.07*z11+0.07*z13-0.06*z14-0.06*z15+0.07*z16-0.01*z17-0.05*z18-0.01*z19+0.06*z20-0.01*z21+0.05*z22
S = -0.04*z1+0.01*z2+0.02*z3-0.05*z4-0.02*z5-0.15*z6+0.12*z7-0.18*z8+0.05*z9+0.03*z10+0.07*z11+0.06*z12+0.12*z13-0.05*z14+0.06*z15-0.14*z16+0.1*z18+0.02*z19+0.07*z20+0.03*z21-0.06*z22
A = -0.01*z1-0.02*z2+0.01*z3+0.06*z4-0.01*z5+0.05*z6+0.11*z7-0.11*z8-0.03*z9-0.04*z10+0.05*z11+0.04*z12+0.04*z13-0.04*z14-0.05*z15-0.06*z16-0.14*z17-0.04*z18+0.02*z19+0.04*z20-0.06*z21+0.03*z22
C = -0.04*z1-0.01*z2+0.01*z3-0.03*z5+0.04*z6-0.07*z7-0.11*z8-0.02*z9-0.01*z10+0.02*z11+0.08*z12+0.02*z13-0.06*z14+0.02*z15-0.04*z16-0.11*z17-0.05*z18-0.02*z19+0.01*z20-0.04*z21+0.06*z22
O = -10*z1+0.1*z2+0.06*z3-0.03*z4+0.09*z5-0.14*z6+0.01*z7+0.04*z8-0.06*z9+0.1*z10+0.02*z11-0.04*z12-0.06*z13+0.08*z14+0.1*z15-0.14*z16+0.08*z17+0.09*z18+0.06*z19+0.04*z20+0.11*z21-0.07*z22
if E>0.65:
if w[7] =='y':
ne+=1
if E<0.65:
if w[7]=='n':
ne+=1
if S>0.75:
if w[8] == 'y':
ns +=1
if S<0.75:
if w[8] == 'n':
ns+=1
if A>0.005:
if w[9]=='y':
na+=1
if A<0.005:
if w[9]=='n':
na+=1
if C>0.58:
if w[10]=='y':
nc+=1
if C<0.58:
if w[10]=='n':
nc+=1
if O>(-0.05):
if w[11]=='y':
no+=1
if O<(-0.05):
if w[11]=='n':
no+=1
print (round((ne/9917)*100,2), round((ns/9917)*100,2),round((na/9917)*100,2),round((nc/9917)*100,2),round((no/9917)*100,2))
evaluation(test)
The sample data is:
enter image description here

Chatbot using Markov Chains

Hello fellow developers,
I am trying to build a chatbot using markov chains and I am stuck at a problem. I the code below, I have made a random sentence generator that learns from movie scripts. The problem is, how do I get this sentence generator to not be random and to respond to the user's input? How should I go about doing this? Is it something to do with input/output training like this:
In: how are you today
Out: I'm good thanks how are you
Here is my code. Most of the functions are used to put the data in a csv file so don't mind those.
from collections import defaultdict
import random, itertools, nltk, pandas, csv, string, re, os, time
class Chatbot:
def __init__(self, name, txt_transcript_filedir, character=None):
self.name = name
self.txt_transcript_filedir = txt_transcript_filedir
self.character = character
print("Hello my name is " + name + ".")
def parse_transcript(self):
parsed_lines = []
self.csv_transcript_filedir = self.txt_transcript_filedir.replace('.txt', '.csv')
with open(self.txt_transcript_filedir, encoding='utf-8') as txt_file:
lines = txt_file.readlines()
for line in lines:
line = line.replace(', ', ' ')
line = re.sub(r'\[.*?\]', '', line)
if ': ' in line:
line = line.replace(': ', ',')
parsed_lines.append(line)
with open(self.csv_transcript_filedir, 'w', encoding='utf-8') as csv_file:
writer = csv.writer(csv_file)
writer.writerow(['person', 'text'])
for line in parsed_lines:
csv_file.write(line)
def tokenize_transcript(self):
csv_file = pandas.read_csv(self.csv_transcript_filedir)
textss = []
final_sents = []
if self.character == None:
texts = csv_file['text']
for text in texts:
sent = nltk.sent_tokenize(text)
textss.append(sent)
else:
char_sets = csv_file[csv_file['person'] == self.character]
texts = char_sets['text']
for text in texts:
sent = nltk.sent_tokenize(text)
textss.append(sent)
for text in textss:
for sent in text:
if sent[0] == ' ':
sent = sent[1:]
final_sent = [w for w in sent if w not in string.punctuation]
final_sent = ''.join(final_sent)
final_sents.append(final_sent)
self.training_data = [sent for sent in final_sents]
def learn(self):
self.parse_transcript()
self.tokenize_transcript()
self.make_word_dict(self.training_data)
def make_word_dict(self, text):
word_dict = defaultdict(list)
for sent in text:
words = nltk.word_tokenize(sent)
for i in range(len(words) - 1):
if i+2 >= (len(words)):
word_dict[(words[i], words[i+1])].append('<end>')
else:
word_dict[(words[i], words[i+1])].append(words[i+2])
self.vocabulary = word_dict
def generate_text(self, num):
for i in range(0, num):
start_key = random.choice(list(self.vocabulary.keys()))
text = []
text.append(start_key[0])
text.append(start_key[1])
for i in itertools.count():
key = (text[i], text[i+1])
if key[1] == '<end>':
break
else:
text.append(random.choice(self.vocabulary[text[i], text[i+1]]))
text = ' '.join(text)
if text.endswith('<end>'):
text = text[:-6]
text = text + '.'
return text
def say(self, text):
os.system('say -v Oliver ' + text)
def main():
num = 100
bot = Chatbot("J.A.R.V.I.S", "avengers_age_of_ultron.txt", "JARVIS")
bot.learn()
for i in range(num):
text = bot.generate_text(1)
print(text)
if __name__ == '__main__':
main()

nltk library import error python

i have some problems on my code that show some error when i run it. i'm using python
so, here's my code
import collections
import nltk.classify.util, nltk.metrics
from nltk.classify import SklearnClassifier
import csv
from sklearn import cross_validation
from sklearn.svm import LinearSVC, SVC
import random
from nltk.corpus import stopwords
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
posdata = []
with open('positive-data.csv', 'rb') as myfile:
reader = csv.reader(myfile, delimiter=',')
for val in reader:
posdata.append(val[0])
negdata = []
with open('negative-data.csv', 'rb') as myfile:
reader = csv.reader(myfile, delimiter=',')
for val in reader:
negdata.append(val[0])
def word_split(data):
data_new = []
for word in data:
word_filter = [i.lower() for i in word.split()]
data_new.append(word_filter)
return data_new
def word_split_sentiment(data):
data_new = []
for (word, sentiment) in data:
word_filter = [i.lower() for i in word.split()]
data_new.append((word_filter, sentiment))
return data_new
def word_feats(words):
return dict([(word, True) for word in words])
stopset = set(stopwords.words('english')) - set(('over', 'under', 'below', 'more', 'most', 'no', 'not', 'only', 'such', 'few', 'so', 'too', 'very', 'just', 'any', 'once'))
def stopword_filtered_word_feats(words):
return dict([(word, True) for word in words if word not in stopset])
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
bigram_finder = BigramCollocationFinder.from_words(words)
bigrams = bigram_finder.nbest(score_fn, n)
"""
print words
for ngram in itertools.chain(words, bigrams):
if ngram not in stopset:
print ngram
exit()
"""
return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
def bigram_word_feats_stopwords(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
bigram_finder = BigramCollocationFinder.from_words(words)
bigrams = bigram_finder.nbest(score_fn, n)
"""
print words
for ngram in itertools.chain(words, bigrams):
if ngram not in stopset:
print ngram
exit()
"""
return dict([(ngram, True) for ngram in itertools.chain(words, bigrams) if ngram not in stopset])
# Calculating Precision, Recall & F-measure
def evaluate_classifier(featx):
negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
negcutoff = len(negfeats)*3/4
poscutoff = len(posfeats)*3/4
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
classifier = 'svm'
for cl in classifier:
if cl == 'svm':
classifierName = 'SVM'
classifier = SklearnClassifier(LinearSVC(), sparse=False)
classifier.train(trainfeats)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(testfeats):
refsets[label].add(i)
observed = classifier.classify(feats)
testsets[observed].add(i)
accuracy = nltk.classify.util.accuracy(classifier, testfeats)
pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
neg_fmeasure = nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
print ''
print '---------------------------------------'
print 'SINGLE FOLD RESULT ' + '(' + classifierName + ')'
print '---------------------------------------'
print 'accuracy:', accuracy
print 'precision', (pos_precision + neg_precision) / 2
print 'recall', (pos_recall + neg_recall) / 2
print 'f-measure', (pos_fmeasure + neg_fmeasure) / 2
print ''
## CROSS VALIDATION
trainfeats = negfeats + posfeats
# SHUFFLE TRAIN SET
random.shuffle(trainfeats)
n = 5
for cl in classifier_list:
subset_size = len(trainfeats) / n
accuracy = []
pos_precision = []
pos_recall = []
neg_precision = []
neg_recall = []
pos_fmeasure = []
neg_fmeasure = []
cv_count = 1
for i in range(n):
testing_this_round = trainfeats[i*subset_size:][:subset_size]
training_this_round = trainfeats[:i*subset_size] + trainfeats[(i+1)*subset_size:]
if cl == 'svm':
classifierName = 'SVM'
classifier = SklearnClassifier(LinearSVC(), sparse=False)
classifier.train(training_this_round)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(testing_this_round):
refsets[label].add(i)
observed = classifier.classify(feats)
testsets[observed].add(i)
cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
cv_pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
cv_pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
cv_pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
cv_neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
cv_neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
cv_neg_fmeasure = nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
accuracy.append(cv_accuracy)
pos_precision.append(cv_pos_precision)
pos_recall.append(cv_pos_recall)
neg_precision.append(cv_neg_precision)
neg_recall.append(cv_neg_recall)
pos_fmeasure.append(cv_pos_fmeasure)
neg_fmeasure.append(cv_neg_fmeasure)
cv_count += 1
print '---------------------------------------'
print 'N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')'
print '---------------------------------------'
print 'accuracy:', sum(accuracy) / n
print 'precision', (sum(pos_precision)/n + sum(neg_precision)/n) / 2
print 'recall', (sum(pos_recall)/n + sum(neg_recall)/n) / 2
print 'f-measure', (sum(pos_fmeasure)/n + sum(neg_fmeasure)/n) / 2
print ''
evaluate_classifier(word_feats)`
its suppose to analysis sentiment from csv document using SVM, but when i run the code i got this error anyone have an idea to fix it??
really need your help guys

Python - Extract Sentences - Off by 1

I want to extract the sentences of a paragraph and print them out line by line. It does a perfect job except when a period is following by a newline. For example: the user finishes a sentence and then hits enter. So there is no space after the period
The code considers this sentence to be part of the previous sentence because there is no space. When it is printed out, they are stuck together. So in other words, how can I alter the code in order to extract a sentence when there is no space after the period. For example: This.should.be.considered.five.sentences because there are five periods but the code considers it as only one sentence.
This is the code:
import re
abbreviations = {'dr.': 'doctor', 'mr.': 'mister', 'bro': 'brother', 'mrs.': 'mistress', 'ms.': 'miss', 'jr.': 'junior', 'sr.': 'senior',
'i.e.': 'for example', 'e.g.': 'for example', 'vs.': 'versus'}
terminators = ['.', '!', '?']
wrappers = ['"', "'", ')', ']', '}']
def find_sentences(paragraph):
end = True
sentences = []
while end > -1:
end = find_sentence_end(paragraph)
if end > -1:
sentences.append(paragraph[end:].strip())
paragraph = paragraph[:end]
sentences.append(paragraph)
sentences.reverse()
return sentences
def find_sentence_end(paragraph):
[possible_endings, contraction_locations] = [[], []]
contractions = abbreviations.keys()
sentence_terminators = terminators + [terminator + wrapper for wrapper in wrappers for terminator in terminators]
for sentence_terminator in sentence_terminators:
t_indices = list(find_all(paragraph, sentence_terminator))
possible_endings.extend(([] if not len(t_indices) else [[i, len(sentence_terminator)] for i in t_indices]))
for contraction in contractions:
c_indices = list(find_all(paragraph, contraction))
contraction_locations.extend(([] if not len(c_indices) else [i + len(contraction) for i in c_indices]))
possible_endings = [pe for pe in possible_endings if pe[0] + pe[1] not in contraction_locations]
if len(paragraph) in [pe[0] + pe[1] for pe in possible_endings]:
max_end_start = max([pe[0] for pe in possible_endings])
possible_endings = [pe for pe in possible_endings if pe[0] != max_end_start]
possible_endings = [pe[0] + pe[1] for pe in possible_endings if sum(pe) > len(paragraph) or (sum(pe) < len(paragraph) and paragraph[sum(pe)] == ' ')]
end = (-1 if not len(possible_endings) else max(possible_endings))
return end
def find_all(a_str, sub):
start = 0
while True:
start = a_str.find(sub, start)
if start == -1:
return
yield start
start += len(sub)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

TF-IDF for my documents yield 0 - python

Related

Line split is not functioning as intended

How to improve the personality extraction accuracy?

Chatbot using Markov Chains

nltk library import error python

Python - Extract Sentences - Off by 1

Categories

Resources