I have a huge block of code, I didn't want to bother you with in the first place. I tried figuring out what's going wrong for over a week now and I contacted several external sources (without any response), and at the moment I'm just wondering: maybe the problem is my training set?
For my thesis I need to classify a whole bunch of tweets as pos/neg/neutral. The code I wrote works OK on test datasets I make up myself (e.g. consisting out of 15 training sentences: 5 pos, 5 neg and 5 neutral; 6 test sentences: 2 pos, 2 neg, 2 neutral - only 1 test sentence gets misclassified).
Once I start running the code on the manually classified training set (1629 pos, 1411 neutral tweets and only 690 neg) and 900 test tweets, things start going wrong. Of the 900 test tweets, the HUGE majority gets classified as pos (between 700 and 800), while there's only a minority of neg and neutral tweets.
Would somebody please be so kind as to check my code and help me figure out what I'm doing wrong? I'd be really grateful. If you need any more information, I'd be happy to provide it.
import re, math, collections, itertools
import nltk
import nltk.classify.util, nltk.metrics
import csv
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords = True)
pos = []
neg = []
neutral = []
with open('C:\\...pos.csv', 'r', encoding = "utf8") as f: #open positive training set
reader = csv.reader(f)
for row in reader:
pos.extend(row)
with open('C:\\ ...neg.csv', 'r', encoding = "utf8") as f: #open negative training set
reader = csv.reader(f)
for row in reader:
neg.extend(row)
with open('C:\\...neutral.csv', 'r', encoding = "utf8") as f: #open neutral training set
reader = csv.reader(f)
for row in reader:
neutral.extend(row)
def uni(doc):
x = []
y = []
for tweet in doc:
x.append(word_tokenize(tweet))
for element in x:
for word in element:
if len(word)>2:
word = word.lower()
word = stemmer.stem(word)
y.append(word)
return y
def word_feats_uni(doc):
return dict([(word, True) for word in uni(doc)])
def tokenizer_ngrams(document):
all_tokens = []
filtered_tokens = []
for (sentence) in document:
all_tokens.append(word_tokenize(sentence))
return all_tokens
def get_bi (document):
x = tokenizer_ngrams(document)
c = []
for sentence in x:
c.extend([bigram for bigram in nltk.bigrams(sentence)])
return c
def get_tri(document):
x = tokenizer_ngrams(document)
c = []
for sentence in x:
c.extend([bigram for bigram in nltk.bigrams(sentence)])
return c
def word_feats_bi(doc):
return dict([(word, True) for word in get_bi(doc)])
def word_feats_tri(doc):
return dict([(word, True) for word in get_tri(doc)])
def word_feats_test(doc):
feats_test = {}
feats_test.update(word_feats_uni(doc))
feats_test.update(word_feats_bi(doc))
feats_test.update(word_feats_tri(doc))
return feats_test
pos_feats = [(word_feats_uni(pos),'1')] + [(word_feats_bi(pos),'1')] + [(word_feats_tri(pos),'1')]
neg_feats = [(word_feats_uni(neg),'-1')] + [(word_feats_bi(neg),'-1')] + [(word_feats_tri(neg),'-1')]
neutral_feats = [(word_feats_uni(neutral),'0')] + [(word_feats_bi(neutral),'0')] + [(word_feats_tri(neutral),'0')]
trainfeats = pos_feats + neg_feats + neutral_feats
random.shuffle(trainfeats)
classifier = NaiveBayesClassifier.train(trainfeats)
testtweets = []
with open('C:\\ ... testtweets.csv', 'r', encoding = "utf8") as f: #open testset
reader = csv.reader(f, delimiter = ';')
for row in reader:
testtweets.extend([row])
date = []
word = []
y = []
def classification(date,sentence): #doc = sentencelist
i = 0
for tweet in sentence:
sent = classifier.classify(word_feats_test([tweet]))
y.extend([(date[i],tweet,sent)])
i = i + 1
def result(doc):
i = 0
while i in range(0,len(doc) -1):
date.append(doc[i][0])
word.append(doc[i][1])
i = i + 1
classification(date,word)
result(testtweets)
with open('C:\\...write.csv', 'w') as fp: #write classified test set to file
a = csv.writer(fp, delimiter=',')
a.writerows(y)
Related
I have a pandas dataframe consisting of headlines. I am doing a simple calculation of the sentiment, by tokenizing and comparing the headlines with a list of positive and negative words. I am appending the over all sentiment for the headline into a column and then appending this to the original dataframe and saving as an Excel file.
The resulting and original files are about 12 mb. While the code below works, it is slow; and is taking me a couple of hours to fully read the file and assign the score. Is this normal? Is there anything I can do to speed up the process? I understand that loops within a pandas dataframe column may be slow - what are the alternatives?
# -*- coding: utf-8 -*-
from nltk.tokenize import word_tokenize
import pandas as pd
from violencevocabulary import new_words as extended_neg_list
import unicodedata
#function to calculate sentiment
def sentimentanalyzer (country_name,text_type):
data = []
xls_file = pd.ExcelFile('/UsersDesktop/MasterData.xlsx')
df = xls_file.parse(country_name)
text_body = df[text_type]
text_body = pd.Series(text_body)
headlines = text_body.tolist()
for i in headlines:
if type(i) == unicode:
i = unicodedata.normalize('NFKD', i).encode('ascii','ignore')
data.append(i)
# processing the sentiment comparispon files
pos_words = []
neg_words = []
f = open('/Users/positive-words.txt','r')
plines = f.readlines()
for line in plines:
line = line.rstrip('\n')
line = line.lower()
pos_words.append(line)
positive_words = pos_words[35:]
f.close()
g = open('/Users/Desktop/negative-words.txt','r')
nlines = g.readlines()
neg_words = []
for nline in nlines:
nline = nline.strip('\n')
nline = nline.lower()
neg_words.append(nline)
negative_words = neg_words[35:]
g.close()
negative_words = negative_words + extended_neg_list
senti_list = []
for j in data:
tokens = word_tokenize(j)
for k in tokens:
negs = [k for k in tokens if k in negative_words]
negs = len(negs)
pos = [k for k in tokens if k in positive_words]
pos = len(pos)
calc = pos - negs
print calc
senti_list.append(calc)
df2 = pd.Series(senti_list,name="Sentiment")
new_data = pd.concat([df,df2,],axis=1)
new_data_name = '/Users/Desktop/Results/' + country_name + " " + text_type + ".xls"
writer_new_data_name = pd.ExcelWriter(new_data_name, engine='xlsxwriter')
new_data.to_excel(writer_new_data_name,sheet_name='Sheet1')
return
I am pretty new to python and this is the first code I have written. Trying to use the NLTK package. The problem comes at the end when trying to execute the label_probdist.prob('positive') line.
This is the error I get;
name 'label_probdist' is not defined
NameError Traceback (most recent call last)
<ipython-input-57-006d791d4445> in <module>()
----> 1 print label_probdist.prob('positive')
NameError: name 'label_probdist' is not defined
import nltk, re, pprint
import csv
from nltk import word_tokenize, wordpunct_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI
# not in use nltk.download() #Download the bookpackage
#open the file that containts wallposts and classifier
with open('Classified.csv' ,'rb') as f:
reader = csv.reader(f)
FBsocial = map(tuple, reader)
import random
random.shuffle(FBsocial)
FBsocial = FBsocial[:500]
len(FBsocial)
FBSocialData = [] #sorting data
for row in FBsocial:
statement = row[0]
sentiment = row[1]
words_filtered = [e.lower() for e in statement.split() if len(e) >= 3]
FBSocialData.append((words_filtered, sentiment))
len(FBSocialData)
#Extracting features of word(list of words ordered by frequency)
def get_words_in_FBdata(FBSocialData):
all_words = []
for (statement, sentiment) in FBSocialData:
all_words.extend(statement)
return all_words
def get_word_features(wordlist):
wordlist = nltk.FreqDist(wordlist)
word_features = wordlist.keys()
return word_features
word_features = get_word_features(get_words_in_FBdata(FBSocialData))
len(word_features)
#just a test;
document = ("hei","grin","andre","jævlig","gøy",)
#Classifier to decide which feature are relevant
def extract_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features
extract_features(document)
#testing extract_features
extract_features("udviser blomsterbutik")
training_set = nltk.classify.util.apply_features(extract_features, FBSocialData)
len(training_set)
classifier = nltk.NaiveBayesClassifier.train(training_set)
def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
# Create the P(label) distribution
label_probdist = estimator(label_freqdist)
# Create the P(fval|label, fname) distribution
feature_probdist = {}
return NaiveBayesClassifier(label_probdist, feature_probdist)
#pvalue
print label_probdist.prob('positive')
print label_probdist.prob('negative')
You are defining variable label_probdist inside function train. Then you are trying to access it outside it's scope. It is not possible. It's a local variable, not a global one.
I am trying to use the maxent classifier using the NLTK library. I have a list of positive and negative words and I have trained the classifier on the same. The problem is when I test the classifier against a sentence I always get the same probability of classification for the two classes. Here is the code -
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import MaxentClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from sklearn import cross_validation
nltk.data.path.append("/home/daksh/Documents/Softwares/nltk_data")
import csv
import operator
from nltk.classify import MaxentClassifier
from nltk.corpus import movie_reviews
def getBestWords(posWords,negWords):
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for word in posWords:
word_fd[word.lower()] += 1
label_word_fd['pos'][word.lower()] += 1
for word in negWords:
word_fd[word.lower()] += 1
label_word_fd['neg'][word.lower()] += 1
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count
word_scores = {}
for word, freq in word_fd.items():
pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
(freq, pos_word_count), total_word_count)
neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
(freq, neg_word_count), total_word_count)
word_scores[word] = pos_score + neg_score
sorted_x = sorted(word_scores.items(), key=operator.itemgetter(1),reverse=True)[:2500]
bestwords = set([w for w,s in sorted_x])
return bestwords
def best_word_feats(words,bestwords):
return dict([(word, True) for word in words if word in bestwords])
def word_feats(words):
return dict([(word, True) for word in words])
def best_bigram_word_feats(words,posWords,negWords, score_fn=BigramAssocMeasures.chi_sq, n=200):
bigram_finder = BigramCollocationFinder.from_words(words)
bigrams = bigram_finder.nbest(score_fn, n)
d = dict([(bigram, True) for bigram in bigrams])
bestwords = getBestWords(posWords,negWords)
d.update(best_word_feats(words,bestwords))
return d
posWords = list()
negWords = list()
with open('../data/finalSentiPosWords.csv','r') as csvfile:
spamreader = csv.reader(csvfile)
posWords = list(spamreader)
with open('../data/finalSentiNegWords.csv','r') as csvfile:
spamreader = csv.reader(csvfile)
negWords = list(spamreader)
posWords = [word[0] for word in posWords]
negWords = [word[0] for word in negWords]
bestwords = getBestWords(posWords,negWords)
posfeats = [(best_bigram_word_feats(posWords,posWords,negWords),'pos')]
negfeats = [(best_bigram_word_feats(negWords,posWords,negWords),'neg')]
trainfeats = negfeats + posfeats
algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
classifier = nltk.MaxentClassifier.train(trainfeats, algorithm,max_iter=5)
# classifier = nltk.NaiveBayesClassifier.train(trainfeats)
classifier.show_most_informative_features(10)
sentence = "Dosa had a tangy taste but it was fun eating it. On the other hand the other dosa was soggy"
l = sentence.split(' ')
print(l)
print(word_feats(l))
print(classifier.prob_classify(word_feats(l)).prob('pos'))
print(classifier.prob_classify(word_feats(l)).prob('neg'))
The output of this is this -
0.500074231063
0.499925768937
The overall classification seems to be working fine but I can't figure out how the probabilities are calculated and why are they always same even if I change the test sentence.
Any quick help appreciated.
Thanks.
That's a lot of code! I'm not going to debug it for you, but I notice that bestwords is the set of all words in your training corpus. If that's not outright wrong, it's certainly misleadingly named.
Sorry to dump a whole block of code (below) here. I've been trying to figure out what I'm doing wrong, but unfortunately I have no idea.
For my thesis I have to classify tweets as neutral (0), negative (-1) or positive (1). I'm trying this by using NLTK. Goal is that the code returns a dictionary in the form 'tweetA,0','tweetB,-1'... At the moment, if I enter more than one tweet as an input, I only get the result (i.e. -1/0/1) for the first tweet back.
For example, if I put 'I love oranges','I hate tomatoes' as in input, I only get '1' as a return and not '1','-1'.
If anyone would be able to help me out, I'd be really grateful!
The code I have up until now:
import re, math, collections, itertools
import nltk
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords = True)
pos_tweets = ['I love bananas','I like pears','I eat oranges']
neg_tweets = ['I hate lettuce','I do not like tomatoes','I hate apples']
neutral_tweets = ['I buy chicken','I am boiling eggs','I am chopping vegetables']
def uni(doc):
x = []
y = []
for tweet in doc:
x.append(word_tokenize(tweet))
for element in x:
for word in element:
if len(word)>2:
word = word.lower()
word = stemmer.stem(word)
y.append(word)
return y
def word_feats_uni(doc):
return dict([(word, True) for word in uni(doc)])
def tokenizer_ngrams(document):
all_tokens = []
filtered_tokens = []
for (sentence) in document:
all_tokens.append(word_tokenize(sentence))
return all_tokens
def get_bi (document):
x = tokenizer_ngrams(document)
c = []
for sentence in x:
c.extend([bigram for bigram in nltk.bigrams(sentence)])
return c
def get_tri(document):
x = tokenizer_ngrams(document)
c = []
for sentence in x:
c.extend([bigram for bigram in nltk.bigrams(sentence)])
return c
def word_feats_bi(doc):
return dict([(word, True) for word in get_bi(doc)])
def word_feats_tri(doc):
return dict([(word, True) for word in get_tri(doc)])
def word_feats_test(doc):
feats_test = {}
i = 0
for tweet in doc:
feats_test.update(word_feats_uni(tweet))
feats_test.update(word_feats_bi(tweet))
feats_test.update(word_feats_tri(tweet))
return feats_test
pos_feats = [(word_feats_uni(pos_tweets),'1')] + [(word_feats_bi(pos_tweets),'1')] + [(word_feats_tri(pos_tweets),'1')]
neg_feats = [(word_feats_uni(neg_tweets),'-1')] + [(word_feats_bi(neg_tweets),'-1')] + [(word_feats_tri(neg_tweets),'-1')]
neutral_feats = [(word_feats_uni(neutral_tweets),'0')] + [(word_feats_bi(neutral_tweets),'0')] + [(word_feats_tri(neutral_tweets),'0')]
trainfeats = pos_feats + neg_feats + neutral_feats
classifier = NaiveBayesClassifier.train(trainfeats)
print (classifier.classify(word_feats_test(['I love oranges'])))
I am new to Python and to Stackoverflow(please be gentle) and am trying to learn how to do a sentiment analysis. I am using a combination of code I found in a tutorial and here: Python - AttributeError: 'list' object has no attribute However, I keep getting
Traceback (most recent call last):
File "C:/Python27/training", line 111, in <module>
processedTestTweet = processTweet(row)
File "C:/Python27/training", line 19, in processTweet
tweet = tweet.lower()
AttributeError: 'list' object has no attribute 'lower'`
This is my code:
import csv
#import regex
import re
import pprint
import nltk.classify
#start replaceTwoOrMore
def replaceTwoOrMore(s):
#look for 2 or more repetitions of character
pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
return pattern.sub(r"\1\1", s)
# process the tweets
def processTweet(tweet):
#Convert to lower case
tweet = tweet.lower()
#Convert www.* or https?://* to URL
tweet = re.sub('((www\.[\s]+)|(https?://[^\s]+))','URL',tweet)
#Convert #username to AT_USER
tweet = re.sub('#[^\s]+','AT_USER',tweet)
#Remove additional white spaces
tweet = re.sub('[\s]+', ' ', tweet)
#Replace #word with word
tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
#trim
tweet = tweet.strip('\'"')
return tweet
#start getStopWordList
def getStopWordList(stopWordListFileName):
#read the stopwords file and build a list
stopWords = []
stopWords.append('AT_USER')
stopWords.append('URL')
fp = open(stopWordListFileName, 'r')
line = fp.readline()
while line:
word = line.strip()
stopWords.append(word)
line = fp.readline()
fp.close()
return stopWords
def getFeatureVector(tweet, stopWords):
featureVector = []
words = tweet.split()
for w in words:
#replace two or more with two occurrences
w = replaceTwoOrMore(w)
#strip punctuation
w = w.strip('\'"?,.')
#check if it consists of only words
val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$", w)
#ignore if it is a stopWord
if(w in stopWords or val is None):
continue
else:
featureVector.append(w.lower())
return featureVector
def extract_features(tweet):
tweet_words = set(tweet)
features = {}
for word in featureList:
features['contains(%s)' % word] = (word in tweet_words)
return features
#Read the tweets one by one and process it
inpTweets = csv.reader(open('C:/GsTraining.csv', 'rb'),
delimiter=',',
quotechar='|')
stopWords = getStopWordList('C:/stop.txt')
count = 0;
featureList = []
tweets = []
for row in inpTweets:
sentiment = row[0]
tweet = row[1]
processedTweet = processTweet(tweet)
featureVector = getFeatureVector(processedTweet, stopWords)
featureList.extend(featureVector)
tweets.append((featureVector, sentiment))
# Remove featureList duplicates
featureList = list(set(featureList))
# Generate the training set
training_set = nltk.classify.util.apply_features(extract_features, tweets)
# Train the Naive Bayes classifier
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
# Test the classifier
with open('C:/CleanedNewGSMain.txt', 'r') as csvinput:
with open('GSnewmain.csv', 'w') as csvoutput:
writer = csv.writer(csvoutput, lineterminator='\n')
reader = csv.reader(csvinput)
all=[]
row = next(reader)
for row in reader:
processedTestTweet = processTweet(row)
sentiment = NBClassifier.classify(
extract_features(getFeatureVector(processedTestTweet, stopWords)))
row.append(sentiment)
processTweet(row[1])
writer.writerows(all)
Any help would be massively appreciated.
The result from the csv reader is a list, lower only works on strings. Presumably it is a list of string, so there are two options. Either you can call lower on each element, or turn the list into a string and then call lower on it.
# the first approach
[item.lower() for item in tweet]
# the second approach
' '.join(tweet).lower()
But more reasonably (hard to tell without more information) you only actually want one item out of your list. Something along the lines of:
for row in reader:
processedTestTweet = processTweet(row[0]) # Again, can't know if this is actually correct without seeing the file
Also, guessing that you aren't using the csv reader quite like you think you are, because right now you are training a naive bayes classifier on a single example every time and then having it predict the one example it was trained on. Maybe explain what you're trying to do?