I am trying to build a Twitter sentiment analysis model using Naive Bayes, and I am running into a problem while preprocessing my data.
Here is my code.
This is my data loading code:
df = pd.read_csv('full-corpus.csv',
encoding='latin1',
names=['topic', 'sentiment', 'TweetId', 'TweetDate','TweetText'])
trainingData = df.to_dict(orient='records')
This is my code for data preprocessing:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
class PreProcessTweets:
    def __init__(self):
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER','URL'])
    def processTweets(self, list_of_tweets):
        processedTweets = []
        for tweet in list_of_tweets:
            processedTweets.append((self._processTweet(tweet['TweetText']),tweet['sentiment']))
        return processedTweets
    def _processTweet(self, tweet):
        tweet = tweet.lower()  # convert text to lower-case
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # replace URLs with the URL token
        tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # replace usernames with the AT_USER token
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
        tweet = word_tokenize(tweet)  # split the tweet into word tokens
        return [word for word in tweet if word not in self._stopwords]
tweetProcessor = PreProcessTweets()
preprocessedTrainingSet = tweetProcessor.processTweets(trainingData)
preprocessedTestSet = tweetProcessor.processTweets(testDataSet)
I'm getting this KeyError:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-96-acc96635cf33> in <module>()
1 tweetProcessor = PreProcessTweets()
2 preprocessedTrainingSet = tweetProcessor.processTweets(trainingData)
----> 3 preprocessedTestSet = tweetProcessor.processTweets(testDataSet)
<ipython-input-95-05e6d1942355> in processTweets(self, list_of_tweets)
15 for tweet in list_of_tweets:
16
---> 17 processedTweets.append((self._processTweet(tweet['TweetText']),tweet['sentiment']))
18 return processedTweets
19
KeyError: 'TweetText'
Please help me solve this. Thanks.
It looks like you are iterating over a dict as if it were a list of dicts. When you loop over a dict directly you only get its keys, so tweet ends up being a string and tweet['TweetText'] raises a KeyError. If you do want both keys and values from a dict, use:
for key, value in dict_foo.items()
Otherwise, make sure testDataSet has the same structure as trainingData: a list of dicts, each with 'TweetText' and 'sentiment' keys.
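For illustration, a minimal sketch of building testDataSet with the same structure as trainingData (the test file name and column layout here are assumptions):
# Hypothetical: load the test tweets the same way as the training data
test_df = pd.read_csv('test-corpus.csv',  # assumed file name
                      encoding='latin1',
                      names=['topic', 'sentiment', 'TweetId', 'TweetDate', 'TweetText'])
testDataSet = test_df.to_dict(orient='records')  # list of dicts, each with a 'TweetText' key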
I am preprocessing the tweet data stored in the vaksinsampel2.csv file. I have done several steps such as text cleaning, case folding, tokenizing, stopword removal, and normalization, but I can't get stemming to work. Please help me solve it.
Here is the code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
import string
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
df = pd.read_csv('vaksinsampel2.csv', encoding = 'unicode_escape')
def remove_punct(tweet):
    tweet = re.sub('[^a-zA-Z0-9 ]', ' ', str(tweet))
    tweet = re.sub('[0-9]+', ' ', tweet)
    tweet = re.sub(r'#', '', str(tweet))
    tweet = re.sub(r'http\S+', ' ', tweet)
    return tweet
df['TEXT'] = df['full_text'].apply(lambda x: remove_punct(x))
df['case_folding'] = df['TEXT'].str.lower()
def tokenization(tweet):
    tweet = re.split('\W+', tweet)
    return tweet
df['Tokenization'] = df['TEXT'].apply(lambda x: tokenization(x.lower()))
df.head(10)
stopword = nltk.corpus.stopwords.words('indonesian')
def remove_stopwords(tweet):
    tweet = [word for word in tweet if word not in stopword]
    return tweet
df['Stopword_Removal'] = df['Tokenization'].apply(lambda x: remove_stopwords(x))
df.head(10)
def normalisasi(tweet):
    kamus_slangword = eval(open("slang_indonesia.txt").read())  # open the slangword dictionary
    pattern = re.compile(r'\b( ' + '|'.join(kamus_slangword.keys()) + r')\b')  # search for slang word patterns (e.g. kpn -> kapan)
    content = []
    for kata in tweet:
        filteredSlang = pattern.sub(lambda x: kamus_slangword[x.group()], kata)  # replace slang words based on the predefined pattern
        content.append(filteredSlang.lower())
    tweet = content
    return tweet
df['Normalization'] = df['Stopword_Removal'].apply(lambda x: normalisasi(x))
df.head(10)
factory = StemmerFactory()
stemming = factory.create_stemmer()
def stem_list(tweet):
    return stemming.stem(df['Normalization'])
df['Stemming'] = df.apply(stem_list, axis=1)
df.head(50)
df.head(50)
stemmer.stem() and not stemming.stem()?
I am referring to this: https://pypi.org/project/Sastrawi/
(...just happened to start exploring Sastrawi today)
Try this:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def stemming(tweet):
    text = [stemmer.stem(word) for word in tweet]
    return text
df['Stemming'] = df['Normalization'].apply(lambda x: stemming(x))
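Since Sastrawi's stem() works on plain strings, an alternative sketch (assuming the Normalization column holds lists of tokens) is to join the tokens back into one string and stem it in a single call:
# Minimal sketch: stem each tweet as one string instead of token by token
def stem_text(tokens):
    return stemmer.stem(' '.join(tokens))  # returns the stemmed sentence as a string
df['Stemming'] = df['Normalization'].apply(stem_text)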
I have been trying to use a Python NLP script with my Qt GUI based C++ application.
Basically in the application I am trying to access the NLP script through command line:
QString path = "D:/DS Project/Treegramming";
QString command("py");
QStringList params = QStringList() << "nlp.py";
params << text;
QProcess *process = new QProcess();
process->setWorkingDirectory(path);
process->start(command, params);
process->waitForFinished();
QString result = process->readAll();
The above is working perfectly, but the problem is that it takes about 40-50 seconds to execute, because it first trains the model and then tests it.
But I want to train the model once and then test it multiple times, as we do in a Jupyter Notebook.
For that I made a separate function for testing and tried to call it from the command line:
PS D:\DS Project\Treegramming> py nlp.py "test('it was amazing')"
But again this executes the whole script first and then runs the function. Is there anything I can do to solve this?
Python script:
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 6 16:18:01 2019
#author: Muhammad Ahmed
"""
import nltk
import sys
import random
import re,string
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import twitter_samples
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk import FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
def lemmatize_sentence(tokens):
    sentence = []
    lematizer = WordNetLemmatizer()
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        sentence.append(lematizer.lemmatize(word, pos))
    return sentence
def remove_noise(tokens, stop_words=()):
    sentence = []
    for token, tag in pos_tag(tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            sentence.append(token.lower())
    return sentence
def get_all_words(tokens_list):
    for tokens in tokens_list:
        for token in tokens:
            yield token
def get_tweets_for_model(tokens_list):
    for tweets in tokens_list:
        yield dict([token, True] for token in tweets)
stop_words = stopwords.words('english')
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
all_pos_words = get_all_words( positive_cleaned_tokens_list )
all_neg_words = get_all_words( negative_cleaned_tokens_list )
freq_dis_pos = FreqDist( all_pos_words )
freq_dis_neg = FreqDist( all_neg_words )
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
pos_dataset = [(tweets,"Positive") for tweets in positive_tokens_for_model]
neg_dataset = [(tweets,"Negative") for tweets in negative_tokens_for_model]
dataset = pos_dataset + neg_dataset
random.shuffle(dataset)
train_data = dataset[:7000]
test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)
def test(custom_tweet):
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    res = classifier.classify(dict([token, True] for token in custom_tokens))
    print(res)
    f = open("result.txt", "w")
    f.write(res)
    f.close()
eval(sys.argv[1])
You need to create two Python scripts:
First, to train and save the NaiveBayesClassifier.
Second, to load and test the model.
To avoid repeating code, I will create a script of helper functions and call it utils.py; it should look like this:
import re
import string
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
def lemmatize_sentence(tokens):
    sentence = []
    lematizer = WordNetLemmatizer()
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        sentence.append(lematizer.lemmatize(word, pos))
    return sentence
def remove_noise(tokens, stop_words=()):
    sentence = []
    for token, tag in pos_tag(tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            sentence.append(token.lower())
    return sentence
def get_all_words(tokens_list):
    for tokens in tokens_list:
        for token in tokens:
            yield token
def get_tweets_for_model(tokens_list):
    for tweets in tokens_list:
        yield dict([token, True] for token in tweets)
Then let's create the training script; I will call it train.py and it should look like this:
import random
import pickle
from utils import *
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier
from nltk.corpus import twitter_samples
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
stop_words = stopwords.words('english')
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
all_pos_words = get_all_words( positive_cleaned_tokens_list )
all_neg_words = get_all_words( negative_cleaned_tokens_list )
freq_dis_pos = FreqDist( all_pos_words )
freq_dis_neg = FreqDist( all_neg_words )
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
pos_dataset = [(tweets,"Positive") for tweets in positive_tokens_for_model]
neg_dataset = [(tweets,"Negative") for tweets in negative_tokens_for_model]
dataset = pos_dataset + neg_dataset
random.shuffle(dataset)
train_data = dataset[:7000]
test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)
#### ADD THESE TO SAVE THE CLASSIFIER ####
with open("model.pickle", "wb") as fout:
pickle.dump(classifier, fout)
Finally, the test script test.py should look like this:
import sys
import pickle
from nltk import classify
from nltk.tokenize import word_tokenize
from utils import remove_noise
#### ADD THESE TO LOAD THE CLASSIFIER ####
with open('model.pickle', 'rb') as fin:
    classifier = pickle.load(fin)
def test(custom_tweet):
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    res = classifier.classify(dict([token, True] for token in custom_tokens))
    print(res)
    f = open("result.txt", "w")
    f.write(res)
    f.close()
eval(sys.argv[1])
Now, run train.py once to train the Naive Bayes classifier; this will create a new file called model.pickle that holds the trained classifier. Then run test.py from your C++ application on your custom tweet. test.py loads the trained model from model.pickle and applies it to the given custom tweet.
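For example, the command-line call from the question would then target test.py instead of nlp.py (assuming both scripts sit in the same working directory and train.py has already been run once):
PS D:\DS Project\Treegramming> py train.py
PS D:\DS Project\Treegramming> py test.py "test('it was amazing')"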
I've got code that worked up until I added the entropy portion to it. Now it's giving me an invalid syntax error on the print line. How come?
import nltk, math, re, numpy
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in freqdist]
    return -sum(p * math.log(p, 2) for p in probs)
def sents():
    fileObj = open('1865-Lincoln.txt', 'r')
    text = fileObj.read()
    tokens = nltk.sent_tokenize(text)
    for name in tokens:
        words = ' '.join(name.split()[:4])
        count = len(name.split())
        entro = entropy(len(name.split())
        print('{:<35} {:^15} {:>15}'.format(words, count, entro))
There is a closing bracket missing in the line above:
entro = entropy(len(name.split()))
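In context, the corrected pair of lines inside the loop would read:
        entro = entropy(len(name.split()))
        print('{:<35} {:^15} {:>15}'.format(words, count, entro))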
I am pretty new to Python and this is the first code I have written. I am trying to use the NLTK package. The problem comes at the end, when trying to execute the label_probdist.prob('positive') line.
This is the error I get:
name 'label_probdist' is not defined
NameError Traceback (most recent call last)
<ipython-input-57-006d791d4445> in <module>()
----> 1 print label_probdist.prob('positive')
NameError: name 'label_probdist' is not defined
import nltk, re, pprint
import csv
from nltk import word_tokenize, wordpunct_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI
# not in use nltk.download() #Download the bookpackage
#open the file that containts wallposts and classifier
with open('Classified.csv', 'rb') as f:
    reader = csv.reader(f)
    FBsocial = map(tuple, reader)
import random
random.shuffle(FBsocial)
FBsocial = FBsocial[:500]
len(FBsocial)
FBSocialData = [] #sorting data
for row in FBsocial:
    statement = row[0]
    sentiment = row[1]
    words_filtered = [e.lower() for e in statement.split() if len(e) >= 3]
    FBSocialData.append((words_filtered, sentiment))
len(FBSocialData)
#Extracting features of word(list of words ordered by frequency)
def get_words_in_FBdata(FBSocialData):
    all_words = []
    for (statement, sentiment) in FBSocialData:
        all_words.extend(statement)
    return all_words
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
word_features = get_word_features(get_words_in_FBdata(FBSocialData))
len(word_features)
#just a test;
document = ("hei","grin","andre","jævlig","gøy",)
#Classifier to decide which feature are relevant
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
extract_features(document)
#testing extract_features
extract_features("udviser blomsterbutik")
training_set = nltk.classify.util.apply_features(extract_features, FBSocialData)
len(training_set)
classifier = nltk.NaiveBayesClassifier.train(training_set)
def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
    # Create the P(label) distribution
    label_probdist = estimator(label_freqdist)
    # Create the P(fval|label, fname) distribution
    feature_probdist = {}
    return NaiveBayesClassifier(label_probdist, feature_probdist)
#pvalue
print label_probdist.prob('positive')
print label_probdist.prob('negative')
You are defining the variable label_probdist inside the function train, and then trying to access it outside its scope. That is not possible: it is a local variable, not a global one.
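A minimal sketch of one way to fix it, keeping the structure of the train function from the question: build label_freqdist inside the function (it is currently undefined there) and return label_probdist alongside the classifier, then unpack both at the call site. The label_freqdist construction shown here is an assumption about what was intended:
def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
    # Count how often each label occurs (assumed intent; label_freqdist was undefined)
    label_freqdist = nltk.FreqDist(label for (features, label) in labeled_featuresets)
    # Create the P(label) distribution
    label_probdist = estimator(label_freqdist)
    # Create the P(fval|label, fname) distribution (left empty, as in the question)
    feature_probdist = {}
    # Return the distribution as well, so it is usable outside the function
    return nltk.NaiveBayesClassifier(label_probdist, feature_probdist), label_probdist
classifier, label_probdist = train(training_set)
print(label_probdist.prob('positive'))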
I am new to Python and to Stack Overflow (please be gentle) and am trying to learn how to do sentiment analysis. I am using a combination of code I found in a tutorial and here: Python - AttributeError: 'list' object has no attribute. However, I keep getting:
Traceback (most recent call last):
File "C:/Python27/training", line 111, in <module>
processedTestTweet = processTweet(row)
File "C:/Python27/training", line 19, in processTweet
tweet = tweet.lower()
AttributeError: 'list' object has no attribute 'lower'
This is my code:
import csv
#import regex
import re
import pprint
import nltk.classify
#start replaceTwoOrMore
def replaceTwoOrMore(s):
    # look for 2 or more repetitions of a character
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)
# process the tweets
def processTweet(tweet):
    # Convert to lower case
    tweet = tweet.lower()
    # Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # Convert @username to AT_USER
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet)
    # Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # trim
    tweet = tweet.strip('\'"')
    return tweet
#start getStopWordList
def getStopWordList(stopWordListFileName):
    # read the stopwords file and build a list
    stopWords = []
    stopWords.append('AT_USER')
    stopWords.append('URL')
    fp = open(stopWordListFileName, 'r')
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords
def getFeatureVector(tweet, stopWords):
    featureVector = []
    words = tweet.split()
    for w in words:
        # replace two or more repetitions with two occurrences
        w = replaceTwoOrMore(w)
        # strip punctuation
        w = w.strip('\'"?,.')
        # check if it consists of only words
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$", w)
        # ignore if it is a stopWord
        if (w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features
#Read the tweets one by one and process it
inpTweets = csv.reader(open('C:/GsTraining.csv', 'rb'),
delimiter=',',
quotechar='|')
stopWords = getStopWordList('C:/stop.txt')
count = 0;
featureList = []
tweets = []
for row in inpTweets:
    sentiment = row[0]
    tweet = row[1]
    processedTweet = processTweet(tweet)
    featureVector = getFeatureVector(processedTweet, stopWords)
    featureList.extend(featureVector)
    tweets.append((featureVector, sentiment))
# Remove featureList duplicates
featureList = list(set(featureList))
# Generate the training set
training_set = nltk.classify.util.apply_features(extract_features, tweets)
# Train the Naive Bayes classifier
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)
# Test the classifier
with open('C:/CleanedNewGSMain.txt', 'r') as csvinput:
    with open('GSnewmain.csv', 'w') as csvoutput:
        writer = csv.writer(csvoutput, lineterminator='\n')
        reader = csv.reader(csvinput)
        all = []
        row = next(reader)
        for row in reader:
            processedTestTweet = processTweet(row)
            sentiment = NBClassifier.classify(
                extract_features(getFeatureVector(processedTestTweet, stopWords)))
            row.append(sentiment)
            processTweet(row[1])
            writer.writerows(all)
Any help would be massively appreciated.
The result from the csv reader is a list; lower only works on strings. Presumably it is a list of strings, so there are two options: either call lower on each element, or turn the list into a string and then call lower on it.
# the first approach
[item.lower() for item in tweet]
# the second approach
' '.join(tweet).lower()
More likely, though (hard to tell without more information), you only actually want one item out of your list. Something along the lines of:
for row in reader:
processedTestTweet = processTweet(row[0]) # Again, can't know if this is actually correct without seeing the file
Also, I'm guessing you aren't using the csv reader quite the way you think you are, because right now you are training a Naive Bayes classifier on a single example every time and then having it predict the one example it was trained on. Maybe explain what you're trying to do?
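Under those assumptions (the test file has the same column layout as the training file, i.e. sentiment in column 0 and the tweet text in column 1), a minimal sketch of the test loop would be:
# Hypothetical rewrite of the test loop; column indices are assumptions
for row in reader:
    processedTestTweet = processTweet(row[1])  # column 1 assumed to hold the tweet text
    sentiment = NBClassifier.classify(
        extract_features(getFeatureVector(processedTestTweet, stopWords)))
    row.append(sentiment)
    all.append(row)          # collect each classified row
writer.writerows(all)        # write everything once, after the loop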