I am pretty new to Python and this is the first code I have written, trying to use the NLTK package. The problem comes at the end, when trying to execute the label_probdist.prob('positive') line.
This is the error I get:
NameError Traceback (most recent call last)
<ipython-input-57-006d791d4445> in <module>()
----> 1 print label_probdist.prob('positive')
NameError: name 'label_probdist' is not defined
import nltk, re, pprint
import csv
from nltk import word_tokenize, wordpunct_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI
# not in use: nltk.download()  # download the book package
# open the file that contains wall posts and classifier
with open('Classified.csv' ,'rb') as f:
reader = csv.reader(f)
FBsocial = map(tuple, reader)
import random
random.shuffle(FBsocial)
FBsocial = FBsocial[:500]
len(FBsocial)
FBSocialData = [] #sorting data
for row in FBsocial:
statement = row[0]
sentiment = row[1]
words_filtered = [e.lower() for e in statement.split() if len(e) >= 3]
FBSocialData.append((words_filtered, sentiment))
len(FBSocialData)
#Extracting word features (list of words ordered by frequency)
def get_words_in_FBdata(FBSocialData):
all_words = []
for (statement, sentiment) in FBSocialData:
all_words.extend(statement)
return all_words
def get_word_features(wordlist):
wordlist = nltk.FreqDist(wordlist)
word_features = wordlist.keys()
return word_features
word_features = get_word_features(get_words_in_FBdata(FBSocialData))
len(word_features)
#just a test;
document = ("hei","grin","andre","jævlig","gøy",)
#Feature extractor to decide which features are relevant
def extract_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features
extract_features(document)
#testing extract_features
extract_features("udviser blomsterbutik")
training_set = nltk.classify.util.apply_features(extract_features, FBSocialData)
len(training_set)
classifier = nltk.NaiveBayesClassifier.train(training_set)
def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
# Create the P(label) distribution
label_probdist = estimator(label_freqdist)
# Create the P(fval|label, fname) distribution
feature_probdist = {}
return NaiveBayesClassifier(label_probdist, feature_probdist)
#pvalue
print label_probdist.prob('positive')
print label_probdist.prob('negative')
You are defining the variable label_probdist inside the function train, and then you are trying to access it outside its scope. That is not possible: it is a local variable of that function, not a global one.
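One way around this is to have the function return the value and bind it at module level. A minimal sketch, assuming the rest of your script above has already run and that your labels really are 'positive' and 'negative':
def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
    # count how often each label occurs in the training data
    label_freqdist = nltk.FreqDist(label for (featureset, label) in labeled_featuresets)
    # create the P(label) distribution
    label_probdist = estimator(label_freqdist)
    # the P(fval|label, fname) distributions are omitted here, just like in your stub
    feature_probdist = {}
    return label_probdist, feature_probdist
label_probdist, feature_probdist = train(training_set)
print(label_probdist.prob('positive'))
print(label_probdist.prob('negative'))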
I am working with Gensim FastText modeling and have the following questions.
The output of ft_model.save(BASE_PATH + MODEL_PATH + fname) saves the following three files. Is this correct? Is there a way to combine all three files?
ft_gensim-v3
ft_gensim-v3.trainables.vectors_ngrams_lockf.npy
ft_gensim-v3.wv.vectors_ngrams.npy
When I attempt to load the trained model and then use it, I get the following error from if model.wv.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:
'function' object has no attribute 'wv'
Finally, for both scripts, is there a way to store the output of def read_train(path,label_path) and def lemmetize(df_col) so I do not have to run this part of the code every time I want to train the model or compare?
Thanks for the assistance.
Here is my FastText training script:
import os
import logging
from config import BASE_PATH, DATA_PATH, MODEL_PATH
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from pprint import pprint as print
from gensim.models.fasttext import FastText as FT_gensim
from gensim.test.utils import datapath
#Read Training data
import pandas as pd
def read_train(path,label_path):
d = []
#e = []
df = pd.read_excel(path)
labelled = pd.read_csv(label_path)
updated_col1 = lemmetize(df['query_text'])
updated_col2 = lemmetize(labelled['QueryText'])
for i in range(len(updated_col1)):
d.append(updated_col1[i])
#print(d)
for i in range(len(updated_col2)):
d.append(updated_col2[i])
return d
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string
from nltk.stem import PorterStemmer
def lemmetize(df_col):
df_updated_col = pd.Series(0, index = df_col.index)
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
ps = PorterStemmer()
for i, j in zip(df_col, range(len(df_col))):
lem = []
t = str(i).lower()
t = t.replace("'s","")
t = t.replace("'","")
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
t = t.translate(translator)
word_tokens = word_tokenize(t)
for i in range(len(word_tokens)):
l1 = lemmatizer.lemmatize(word_tokens[i])
s1 = ps.stem(word_tokens[i])
if list(l1) != [''] and list(l1) != [' '] and l1 != '' and l1 != ' ':
lem.append(l1)
filtered_sentence = [w for w in lem if not w in stop_words]
df_updated_col[j] = filtered_sentence
return df_updated_col
#read test data
def read_test(path):
return pd.read_excel(path)
#Read labelled data
def read_labelled(path):
return pd.read_csv(path)
word_tokenized_corpus = read_train('Train Data.xlsx','SMEQueryText.csv')
#Train fasttext model
import tempfile
import os
from gensim.models import FastText
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("ft_gensime-v3")
def train_fastText(data, embedding_size = 60, window_size = 40, min_word = 5, down_sampling = 1e-2, iter=100):
ft_model = FastText(word_tokenized_corpus,
size=embedding_size,
window=window_size,
min_count=min_word,
sample=down_sampling,
sg=1,
iter=100)
#with tempfile.NamedTemporaryFile(prefix=BASE_PATH + MODEL_PATH + 'ft_gensim_v2-', delete=False) as tmp:
# ft_model.save(tmp.name, separately=[])
ft_model.save(BASE_PATH + MODEL_PATH + fname)
return ft_model
# main function to output
def main(test_path, train_path, labelled):
test_data = read_test(test_path)
train_data = read_train(train_path,labelled)
labelled = read_labelled(labelled)
output_df = pd.DataFrame(index = range(len(test_data)))
output_df['test_query'] = str()
output_df['Similar word'] = str()
output_df['category'] = str()
output_df['similarity'] = float()
model = train_fastText(train_data)
# run main
if __name__ == "__main__":
output = main('Test Data.xlsx','Train Data.xlsx','QueryText.csv')
Here is my usage script:
import pandas as pd
from gensim.models import FastText
import gensim
from config import BASE_PATH, DATA_PATH, MODEL_PATH
#Read Training data
def read_train(path,label_path):
d = []
#e = []
df = pd.read_excel(path)
labelled = pd.read_csv(label_path)
updated_col1 = lemmetize(df['query_text'])
updated_col2 = lemmetize(labelled['QueryText'])
for i in range(len(updated_col1)):
d.append(updated_col1[i])
for i in range(len(updated_col2)):
d.append(updated_col2[i])
return d
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string
from nltk.stem import PorterStemmer
def lemmetize(df_col):
df_updated_col = pd.Series(0, index = df_col.index)
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
ps = PorterStemmer()
for i, j in zip(df_col, range(len(df_col))):
lem = []
t = str(i).lower()
t = t.replace("'s","")
t = t.replace("'","")
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
t = t.translate(translator)
word_tokens = word_tokenize(t)
for i in range(len(word_tokens)):
l1 = lemmatizer.lemmatize(word_tokens[i])
s1 = ps.stem(word_tokens[i])
if list(l1) != [''] and list(l1) != [' '] and l1 != '' and l1 != ' ':
lem.append(l1)
filtered_sentence = [w for w in lem if not w in stop_words]
df_updated_col[j] = filtered_sentence
return df_updated_col
#read test data
def read_test(path):
return pd.read_excel(path)
#Read labelled data
def read_labelled(path):
return pd.read_csv(path)
def load_training():
return FT_gensim.load(BASE_PATH + MODEL_PATH +'ft_gensim-v3')
#compare similarity
def compare_similarity(model, real_data, labelled):
maxWord = ''
category = ''
maxSimilaity = 0
#print("train data",labelled[1])
for i in range(len(labelled)):
if model.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:
#print('labelled',labelled['QueryText'][i], 'i', i)
maxWord = labelled['QueryText'][i]
category = labelled['Subjectmatter'][i]
maxSimilaity = model.similarity(real_data, labelled['QueryText'][i])
return maxWord, category, maxSimilaity
# Output from Main to excel
from pandas import ExcelWriter
def export_Excel(data, aFile = 'FASTTEXTOutput.xlsx'):
df = pd.DataFrame(data)
writer = ExcelWriter(aFile)
df.to_excel(writer,'Sheet1')
writer.save()
# main function to output
def main(test_path, train_path, labelled):
test_data = read_test(test_path)
train_data = read_train(train_path,labelled)
labelled = read_labelled(labelled)
output_df = pd.DataFrame(index = range(len(test_data)))
output_df['test_query'] = str()
output_df['Similar word'] = str()
output_df['category'] = str()
output_df['similarity'] = float()
model = load_training
for i in range(len(test_data)):
output_df['test_query'][i] = test_data['query_text'][i]
#<first change>
maxWord, category, maxSimilaity = compare_similarity(model, str(test_data['query_text'][i]), labelled)
output_df['Similar word'][i] = maxWord
output_df['category'][i] = category
output_df['similarity'][i] = maxSimilaity
#<second change>
return output_df
# run main
if __name__ == "__main__":
output = main('Test Data.xlsx','Train Data.xlsx','SMEQueryText.csv')
export_Excel(output)
Here is the full traceback of the error message:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-22-57803b59c0b9> in <module>
1 # run main
2 if __name__ == "__main__":
----> 3 output = main('Test Data.xlsx','Train Data.xlsx','SMEQueryText.csv')
4 export_Excel(output)
<ipython-input-21-17cb88ee0f79> in main(test_path, train_path, labelled)
13 output_df['test_query'][i] = test_data['query_text'][i]
14 #<first change>
---> 15 maxWord, category, maxSimilaity = compare_similarity(model, str(test_data['query_text'][i]), labelled)
16 output_df['Similar word'][i] = maxWord
17 output_df['category'][i] = category
<ipython-input-19-84d7f268d669> in compare_similarity(model, real_data, labelled)
6 #print("train data",labelled[1])
7 for i in range(len(labelled)):
----> 8 if model.wv.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:
9 #print('labelled',labelled['QueryText'][i], 'i', i)
10 maxWord = labelled['QueryText'][i]
AttributeError: 'function' object has no attribute 'wv'
You've got three separate, only-vaguely-related questions here. Taking each in order:
Why are there 3 files, and can they be combined?
It's more efficient to store the big raw arrays separately from the main 'pickled' model, and for models above a few gigabytes in size it is necessary to work around 'pickle' implementation limits. So I'd recommend just keeping the default behavior, and keeping the habit of managing/moving/copying the sets of files together.
If your model is small enough, there is something you can try, though. The .save() method has an optional parameter sep_limit which controls the threshold array size, over which arrays are stored as separate files. By setting that much larger, say sep_limit=2*1024*1024*1024 (2GiB), smaller models should save a single file. (But, loading will be slower, you won't have the sometimes-useful option of memory-map loading, and saving may break on oversized models.)
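For example, a sketch that reuses the path your loader expects (the 2 GiB threshold is just an illustration):
# keep every array below 2 GiB inside the single main file
ft_model.save(BASE_PATH + MODEL_PATH + 'ft_gensim-v3', sep_limit=2 * 1024 * 1024 * 1024)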
Why is there an AttributeError: 'function' object has no attribute 'wv' error?
Your line of code model = load_training assigns the function itself to the model variable, rather than what you probably intended, the return value of calling that function. A function has no .wv attribute, hence the error. If model were an actual instance of FastText, you would not get that error.
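A minimal sketch of the fix inside your main():
# call the function so `model` is the loaded FastText instance, not the function object itself
model = load_training()
With that one change, the model.wv.similarity(...) call inside compare_similarity can resolve as expected.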
Can the corpus text be stored to avoid repeat preprocessing and conversion from pandas formats?
Sure, you can just write the text to a file. Roughly:
with open('mycorpus.txt', mode='w') as corpusfile:
for text in word_tokenized_corpus:
corpusfile.write(' '.join(text))
corpusfile.write('\n')
Though in fact, gensim offers a utility function, utils.save_as_line_sentence(), that can do this (and explicitly handles some extra encoding concerns). See:
https://radimrehurek.com/gensim/utils.html#gensim.utils.save_as_line_sentence
The LineSentence utility class in gensim.models.word2vec can stream texts from such a file back for future re-use:
https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.LineSentence
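A rough sketch of both directions, assuming a gensim version that ships these helpers (the file name is just an example):
from gensim.utils import save_as_line_sentence
from gensim.models.word2vec import LineSentence
# write the already-tokenized corpus out once...
save_as_line_sentence(word_tokenized_corpus, 'mycorpus.txt')
# ...and stream it back later without re-running read_train()/lemmetize()
word_tokenized_corpus = list(LineSentence('mycorpus.txt'))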
I have been trying to use a Python NLP script with my Qt GUI based C++ application.
Basically, in the application I am trying to call the NLP script through the command line:
QString path = "D:/DS Project/Treegramming";
QString command("py");
QStringList params = QStringList() << "nlp.py";
params << text;
QProcess *process = new QProcess();
process->setWorkingDirectory(path);
process->start(command, params);
process->waitForFinished();
QString result = process->readAll();
The above is working perfectly, but the problem is that it takes about 40-50 seconds to execute, as it first trains the model and then tests it.
But I want to train the model first and test it multiple times as we do in Jupyter Notebook.
For that I made a separate function for testing and tried to call it from the command line:
PS D:\DS Project\Treegramming> py nlp.py "test('it was amazing')"
But again this executes the whole script first and then runs the function. Is there anything I can do to solve this?
python script:
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 6 16:18:01 2019
#author: Muhammad Ahmed
"""
import nltk
import sys
import random
import re,string
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import twitter_samples
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk import FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
def lemmatize_sentence(tokens):
sentence = []
lematizer = WordNetLemmatizer()
for word, tag in pos_tag(tokens):
if tag.startswith('NN'):
pos = 'n'
elif tag.startswith('VB'):
pos = 'v'
else:
pos = 'a'
sentence.append( lematizer.lemmatize( word , pos ) )
return sentence
def remove_noise(tokens , stop_words = ()):
sentence = []
for token, tag in pos_tag( tokens ):
token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' , '',token)
token = re.sub("(#[A-Za-z0-9_]+)","",token)
if tag.startswith("NN"):
pos = 'n'
elif tag.startswith('VB'):
pos = 'v'
else:
pos = 'a'
lemmatizer = WordNetLemmatizer()
token = lemmatizer.lemmatize(token, pos)
if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
sentence.append( token.lower() )
return sentence
def get_all_words(tokens_list):
for tokens in tokens_list:
for token in tokens:
yield token
def get_tweets_for_model(tokens_list):
for tweets in tokens_list:
yield dict([token,True] for token in tweets)
stop_words = stopwords.words('english')
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
for tokens in positive_tweet_tokens:
positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
all_pos_words = get_all_words( positive_cleaned_tokens_list )
all_neg_words = get_all_words( negative_cleaned_tokens_list )
freq_dis_pos = FreqDist( all_pos_words )
freq_dis_neg = FreqDist( all_neg_words )
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
pos_dataset = [(tweets,"Positive") for tweets in positive_tokens_for_model]
neg_dataset = [(tweets,"Negative") for tweets in negative_tokens_for_model]
dataset = pos_dataset + neg_dataset
random.shuffle(dataset)
train_data = dataset[:7000]
test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)
def test( custom_tweet ):
custom_tokens = remove_noise(word_tokenize(custom_tweet))
res = classifier.classify(dict([token, True] for token in custom_tokens))
print(res)
f = open( "result.txt" , "w" )
f.write(res)
f.close()
eval( sys.argv[1] );
You need to create two Python scripts:
First to train and save the NaiveBayesClassifier
Second to load and test the model.
To avoid repeating code, I will create a script for helper functions and call it utils.py; it should look like this:
import re
import string
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
def lemmatize_sentence(tokens):
sentence = []
lematizer = WordNetLemmatizer()
for word, tag in pos_tag(tokens):
if tag.startswith('NN'):
pos = 'n'
elif tag.startswith('VB'):
pos = 'v'
else:
pos = 'a'
sentence.append( lematizer.lemmatize( word , pos ) )
return sentence
def remove_noise(tokens , stop_words = ()):
sentence = []
for token, tag in pos_tag( tokens ):
token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' , '',token)
token = re.sub("(#[A-Za-z0-9_]+)","",token)
if tag.startswith("NN"):
pos = 'n'
elif tag.startswith('VB'):
pos = 'v'
else:
pos = 'a'
lemmatizer = WordNetLemmatizer()
token = lemmatizer.lemmatize(token, pos)
if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
sentence.append( token.lower() )
return sentence
def get_all_words(tokens_list):
for tokens in tokens_list:
for token in tokens:
yield token
def get_tweets_for_model(tokens_list):
for tweets in tokens_list:
yield dict([token,True] for token in tweets)
Then let's create the training script; I will call it train.py and it should look like this:
import random
import pickle
from utils import *
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier
from nltk.corpus import twitter_samples
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
stop_words = stopwords.words('english')
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
for tokens in positive_tweet_tokens:
positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
all_pos_words = get_all_words( positive_cleaned_tokens_list )
all_neg_words = get_all_words( negative_cleaned_tokens_list )
freq_dis_pos = FreqDist( all_pos_words )
freq_dis_neg = FreqDist( all_neg_words )
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
pos_dataset = [(tweets,"Positive") for tweets in positive_tokens_for_model]
neg_dataset = [(tweets,"Negative") for tweets in negative_tokens_for_model]
dataset = pos_dataset + neg_dataset
random.shuffle(dataset)
train_data = dataset[:7000]
test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)
#### ADD THESE TO SAVE THE CLASSIFIER ####
with open("model.pickle", "wb") as fout:
pickle.dump(classifier, fout)
Finally, the test script test.py, which should look like this:
import sys
import pickle
from nltk import classify
from nltk.tokenize import word_tokenize
from utils import remove_noise
#### ADD THESE TO LOAD THE CLASSIFIER ####
with open('model.pickle', 'rb') as fin:
classifier = pickle.load(fin)
def test( custom_tweet ):
custom_tokens = remove_noise(word_tokenize(custom_tweet))
res = classifier.classify(dict([token, True] for token in custom_tokens))
print(res)
f = open( "result.txt" , "w" )
f.write(res)
f.close()
eval( sys.argv[1] );
Now, run train.py once to train the Naive Bayes classifier; it will create a new file called model.pickle that holds the trained classifier. Then run test.py from your C++ application on your custom tweet. test.py loads the trained model from model.pickle and uses it on the given custom tweet.
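If you would rather not go through eval, a hypothetical variant of the entry point in test.py could take the tweet text directly (this changes the command your C++ side builds, so treat it as a suggestion only):
# hypothetical alternative entry point: pass the raw tweet as the argument,
# invoked as:  py test.py "it was amazing"
if __name__ == "__main__":
    test(sys.argv[1])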
I am trying to use the maxent classifier from the NLTK library. I have a list of positive and negative words and have trained the classifier on them. The problem is that when I test the classifier against a sentence, I always get the same probability of classification for the two classes. Here is the code:
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import MaxentClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from sklearn import cross_validation
nltk.data.path.append("/home/daksh/Documents/Softwares/nltk_data")
import csv
import operator
from nltk.classify import MaxentClassifier
from nltk.corpus import movie_reviews
def getBestWords(posWords,negWords):
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for word in posWords:
word_fd[word.lower()] += 1
label_word_fd['pos'][word.lower()] += 1
for word in negWords:
word_fd[word.lower()] += 1
label_word_fd['neg'][word.lower()] += 1
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count
word_scores = {}
for word, freq in word_fd.items():
pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
(freq, pos_word_count), total_word_count)
neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
(freq, neg_word_count), total_word_count)
word_scores[word] = pos_score + neg_score
sorted_x = sorted(word_scores.items(), key=operator.itemgetter(1),reverse=True)[:2500]
bestwords = set([w for w,s in sorted_x])
return bestwords
def best_word_feats(words,bestwords):
return dict([(word, True) for word in words if word in bestwords])
def word_feats(words):
return dict([(word, True) for word in words])
def best_bigram_word_feats(words,posWords,negWords, score_fn=BigramAssocMeasures.chi_sq, n=200):
bigram_finder = BigramCollocationFinder.from_words(words)
bigrams = bigram_finder.nbest(score_fn, n)
d = dict([(bigram, True) for bigram in bigrams])
bestwords = getBestWords(posWords,negWords)
d.update(best_word_feats(words,bestwords))
return d
posWords = list()
negWords = list()
with open('../data/finalSentiPosWords.csv','r') as csvfile:
spamreader = csv.reader(csvfile)
posWords = list(spamreader)
with open('../data/finalSentiNegWords.csv','r') as csvfile:
spamreader = csv.reader(csvfile)
negWords = list(spamreader)
posWords = [word[0] for word in posWords]
negWords = [word[0] for word in negWords]
bestwords = getBestWords(posWords,negWords)
posfeats = [(best_bigram_word_feats(posWords,posWords,negWords),'pos')]
negfeats = [(best_bigram_word_feats(negWords,posWords,negWords),'neg')]
trainfeats = negfeats + posfeats
algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
classifier = nltk.MaxentClassifier.train(trainfeats, algorithm,max_iter=5)
# classifier = nltk.NaiveBayesClassifier.train(trainfeats)
classifier.show_most_informative_features(10)
sentence = "Dosa had a tangy taste but it was fun eating it. On the other hand the other dosa was soggy"
l = sentence.split(' ')
print(l)
print(word_feats(l))
print(classifier.prob_classify(word_feats(l)).prob('pos'))
print(classifier.prob_classify(word_feats(l)).prob('neg'))
The output of this is:
0.500074231063
0.499925768937
The overall classification seems to be working fine, but I can't figure out how the probabilities are calculated and why they are always the same even if I change the test sentence.
Any quick help appreciated.
Thanks.
That's a lot of code! I'm not going to debug it for you, but I notice that bestwords is the set of all words in your training corpus. If that's not outright wrong, it's certainly misleadingly named.
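A quick sanity check you could run, sketched with the variables already defined in your script:
# if these two numbers match, the chi-square 'selection' is not filtering anything out
all_words = set(w.lower() for w in posWords + negWords)
print(len(bestwords), len(all_words))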
I have a huge block of code that I didn't want to bother you with in the first place. I have tried to figure out what's going wrong for over a week now, and I contacted several external sources (without any response); at the moment I'm just wondering: maybe the problem is my training set?
For my thesis I need to classify a whole bunch of tweets as pos/neg/neutral. The code I wrote works OK on test datasets I make up myself (e.g. consisting of 15 training sentences: 5 pos, 5 neg and 5 neutral; 6 test sentences: 2 pos, 2 neg, 2 neutral - only 1 test sentence gets misclassified).
Once I start running the code on the manually classified training set (1629 pos, 1411 neutral tweets and only 690 neg) and 900 test tweets, things start going wrong. Of the 900 test tweets, the HUGE majority gets classified as pos (between 700 and 800), while there's only a minority of neg and neutral tweets.
Would somebody please be so kind as to check my code and help me figure out what I'm doing wrong? I'd be really grateful. If you need any more information, I'd be happy to provide it.
import re, math, collections, itertools, random
import nltk
import nltk.classify.util, nltk.metrics
import csv
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords = True)
pos = []
neg = []
neutral = []
with open('C:\\...pos.csv', 'r', encoding = "utf8") as f: #open positive training set
reader = csv.reader(f)
for row in reader:
pos.extend(row)
with open('C:\\ ...neg.csv', 'r', encoding = "utf8") as f: #open negative training set
reader = csv.reader(f)
for row in reader:
neg.extend(row)
with open('C:\\...neutral.csv', 'r', encoding = "utf8") as f: #open neutral training set
reader = csv.reader(f)
for row in reader:
neutral.extend(row)
def uni(doc):
x = []
y = []
for tweet in doc:
x.append(word_tokenize(tweet))
for element in x:
for word in element:
if len(word)>2:
word = word.lower()
word = stemmer.stem(word)
y.append(word)
return y
def word_feats_uni(doc):
return dict([(word, True) for word in uni(doc)])
def tokenizer_ngrams(document):
all_tokens = []
filtered_tokens = []
for (sentence) in document:
all_tokens.append(word_tokenize(sentence))
return all_tokens
def get_bi (document):
x = tokenizer_ngrams(document)
c = []
for sentence in x:
c.extend([bigram for bigram in nltk.bigrams(sentence)])
return c
def get_tri(document):
x = tokenizer_ngrams(document)
c = []
for sentence in x:
c.extend([bigram for bigram in nltk.bigrams(sentence)])
return c
def word_feats_bi(doc):
return dict([(word, True) for word in get_bi(doc)])
def word_feats_tri(doc):
return dict([(word, True) for word in get_tri(doc)])
def word_feats_test(doc):
feats_test = {}
feats_test.update(word_feats_uni(doc))
feats_test.update(word_feats_bi(doc))
feats_test.update(word_feats_tri(doc))
return feats_test
pos_feats = [(word_feats_uni(pos),'1')] + [(word_feats_bi(pos),'1')] + [(word_feats_tri(pos),'1')]
neg_feats = [(word_feats_uni(neg),'-1')] + [(word_feats_bi(neg),'-1')] + [(word_feats_tri(neg),'-1')]
neutral_feats = [(word_feats_uni(neutral),'0')] + [(word_feats_bi(neutral),'0')] + [(word_feats_tri(neutral),'0')]
trainfeats = pos_feats + neg_feats + neutral_feats
random.shuffle(trainfeats)
classifier = NaiveBayesClassifier.train(trainfeats)
testtweets = []
with open('C:\\ ... testtweets.csv', 'r', encoding = "utf8") as f: #open testset
reader = csv.reader(f, delimiter = ';')
for row in reader:
testtweets.extend([row])
date = []
word = []
y = []
def classification(date,sentence): #doc = sentencelist
i = 0
for tweet in sentence:
sent = classifier.classify(word_feats_test([tweet]))
y.extend([(date[i],tweet,sent)])
i = i + 1
def result(doc):
i = 0
while i in range(0,len(doc) -1):
date.append(doc[i][0])
word.append(doc[i][1])
i = i + 1
classification(date,word)
result(testtweets)
with open('C:\\...write.csv', 'w') as fp: #write classified test set to file
a = csv.writer(fp, delimiter=',')
a.writerows(y)
I was experimenting with Python NLTK text classification. Here is the code example I am practicing with: http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/
Here is the code:
from nltk import bigrams
from nltk.probability import ELEProbDist, FreqDist
from nltk import NaiveBayesClassifier
from collections import defaultdict
train_samples = {}
with file ('data/positive.txt', 'rt') as f:
for line in f.readlines():
train_samples[line] = 'pos'
with file ('data/negative.txt', 'rt') as d:
for line in d.readlines():
train_samples[line] = 'neg'
f = open("data/test.txt", "r")
test_samples = f.readlines()
# Error in this code
# def bigramReturner(text):
# tweetString = text.lower()
# bigramFeatureVector = {}
# for item in bigrams(tweetString.split()):
# bigramFeatureVector.append(' '.join(item))
# return bigramFeatureVector
# Updated the code from the stack overflow comment
def bigramReturner (tweetString):
tweetString = tweetString.lower()
#comment the line since the function is not defined
#tweetString = removePunctuation (tweetString)
bigramFeatureVector = []
for item in nltk.unigrams(tweetString.split()):
bigramFeatureVector.append(' '.join(item))
return bigramFeatureVector
def get_labeled_features(samples):
word_freqs = {}
for text, label in train_samples.items():
tokens = text.split()
for token in tokens:
if token not in word_freqs:
word_freqs[token] = {'pos': 0, 'neg': 0}
word_freqs[token][label] += 1
return word_freqs
def get_label_probdist(labeled_features):
label_fd = FreqDist()
for item, counts in labeled_features.items():
for label in ['neg', 'pos']:
if counts[label] > 0:
label_fd.inc(label)
label_probdist = ELEProbDist(label_fd)
return label_probdist
def get_feature_probdist(labeled_features):
feature_freqdist = defaultdict(FreqDist)
feature_values = defaultdict(set)
num_samples = len(train_samples) / 2
for token, counts in labeled_features.items():
for label in ['neg', 'pos']:
feature_freqdist[label, token].inc(True, count=counts[label])
feature_freqdist[label, token].inc(None, num_samples - counts[label])
feature_values[token].add(None)
feature_values[token].add(True)
for item in feature_freqdist.items():
print item[0], item[1]
feature_probdist = {}
for ((label, fname), freqdist) in feature_freqdist.items():
probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
feature_probdist[label, fname] = probdist
return feature_probdist
labeled_features = get_labeled_features(train_samples)
label_probdist = get_label_probdist(labeled_features)
feature_probdist = get_feature_probdist(labeled_features)
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)
for sample in test_samples:
print "%s | %s" % (sample, classifier.classify(bigramReturner(sample)))
But when I run the code I get the following error:
Traceback (most recent call last):
File "naive_bigram_1.py", line 87, in <module>
print "%s | %s" % (sample, classifier.classify(bigramReturner(sample)))
File "naive_bigram_1.py", line 30, in bigramReturner
tweetString = removePunctuation (tweetString)
NameError: global name 'removePunctuation' is not defined
I saw a similar question with a different error and updated my code based on it: n-grams with Naive Bayes classifier
You're calling a function removePunctuation that hasn't been defined previously:
def bigramReturner (tweetString):
tweetString = tweetString.lower()
tweetString = removePunctuation (tweetString)
....
I also noticed that you put spaces between your function names and their parameter lists. Avoid that; it is not idiomatic Python (PEP 8 recommends no whitespace before the opening parenthesis of a call), even though it does not change how the call behaves.
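If you do want the punctuation-stripping step, one possible definition for the missing helper (an assumption based on its name, not the original tutorial's function) is:
import re
def removePunctuation(text):
    # drop every character that is not a word character or whitespace
    return re.sub(r'[^\w\s]', '', text)
With a definition like this in scope, the NameError goes away.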