I preprocessed the tweet data stored in the vaksinsampel2.csv file. I have already done several steps such as text cleaning, case folding, tokenizing, stopword removal, and normalization, but I can't get stemming to work. Please help me solve it.
Here is the code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
import string
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
df = pd.read_csv('vaksinsampel2.csv', encoding = 'unicode_escape')
def remove_punct(tweet):
    tweet = re.sub('[^a-zA-Z0-9 ]', ' ', str(tweet))
    tweet = re.sub('[0-9]+', ' ', tweet)
    tweet = re.sub(r'#', '', str(tweet))
    tweet = re.sub(r'http\S+', ' ', tweet)
    return tweet
df['TEXT'] = df['full_text'].apply(lambda x:remove_punct(x))
df['case_folding'] = df['TEXT'].str.lower()
def tokenization(tweet):
    tweet = re.split(r'\W+', tweet)
    return tweet
df['Tokenization'] = df['TEXT'].apply(lambda x: tokenization(x.lower()))
df.head(10)
stopword = nltk.corpus.stopwords.words('indonesian')
def remove_stopwords(tweet):
    tweet = [word for word in tweet if word not in stopword]
    return tweet
df['Stopword_Removal'] = df['Tokenization'].apply(lambda x: remove_stopwords(x))
df.head(10)
def normalisasi(tweet):
    kamus_slangword = eval(open("slang_indonesia.txt").read())  # load the slangword dictionary
    pattern = re.compile(r'\b( ' + '|'.join(kamus_slangword.keys()) + r')\b')  # pattern to match slang words (e.g. kpn -> kapan)
    content = []
    for kata in tweet:
        filteredSlang = pattern.sub(lambda x: kamus_slangword[x.group()], kata)  # replace slang words matched by the pattern
        content.append(filteredSlang.lower())
    tweet = content
    return tweet
df['Normalization'] = df['Stopword_Removal'].apply(lambda x: normalisasi(x))
df.head(10)
factory = StemmerFactory()
stemming = factory.create_stemmer()
def stem_list(tweet):
    return stemming.stem(df['Normalization'])
df['Stemming'] = df.apply(stem_list, axis=1)
df.head(50)
Do you mean stemmer.stem() and not stemming.stem()? I am referring to this: https://pypi.org/project/Sastrawi/
(I just happened to start exploring Sastrawi today.)
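(For reference, the basic usage shown on that page looks roughly like this; the example sentence and the expected output are the ones from the Sastrawi README:)
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stem() accepts a whole sentence and returns the stemmed, lower-cased text
print(stemmer.stem('Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan'))
# expected output: ekonomi indonesia sedang dalam tumbuh yang bangga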
Try this:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def stemming(tweet):
    text = [stemmer.stem(word) for word in tweet]
    return text
df['Stemming'] = df['Normalization'].apply(lambda x: stemming(x))
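If you would rather keep each row as a single string instead of a list of tokens, stem() also works on a whole sentence, so you can join the tokens first. A minimal sketch, assuming df['Normalization'] holds token lists (the Stemming_text column name is just for illustration):
def stemming_joined(tokens):
    # Join the token list back into one string and stem the whole sentence at once.
    return stemmer.stem(' '.join(tokens))

df['Stemming_text'] = df['Normalization'].apply(stemming_joined)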
I am trying to build a Twitter sentiment analysis model using Naive Bayes, and I'm running into a problem while preprocessing my data.
Here is my code.
This is my data loading code:
df = pd.read_csv('full-corpus.csv',
                 encoding='latin1',
                 names=['topic', 'sentiment', 'TweetId', 'TweetDate', 'TweetText'])
trainingData = df.to_dict(orient='records')
This is my code for data preprocessing:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
class PreProcessTweets:
    def __init__(self):
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL'])

    def processTweets(self, list_of_tweets):
        processedTweets = []
        for tweet in list_of_tweets:
            processedTweets.append((self._processTweet(tweet['TweetText']), tweet['sentiment']))
        return processedTweets

    def _processTweet(self, tweet):
        tweet = tweet.lower()  # convert text to lower-case
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # replace URLs with the URL token
        tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # replace usernames with the AT_USER token
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # remove the # in #hashtag
        tweet = word_tokenize(tweet)  # tokenize the tweet into words
        return [word for word in tweet if word not in self._stopwords]
tweetProcessor = PreProcessTweets()
preprocessedTrainingSet = tweetProcessor.processTweets(trainingData)
preprocessedTestSet = tweetProcessor.processTweets(testDataSet)
I'm getting this KeyError:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-96-acc96635cf33> in <module>()
1 tweetProcessor = PreProcessTweets()
2 preprocessedTrainingSet = tweetProcessor.processTweets(trainingData)
----> 3 preprocessedTestSet = tweetProcessor.processTweets(testDataSet)
<ipython-input-95-05e6d1942355> in processTweets(self, list_of_tweets)
15 for tweet in list_of_tweets:
16
---> 17 processedTweets.append((self._processTweet(tweet['TweetText']),tweet['sentiment']))
18 return processedTweets
19
KeyError: 'TweetText'
Please help me solve this. Thanks.
It seems like you are passing in a list when what you want to use is a dict. For it to work you have to use:
for key, value in dict_foo.items()
If you iterate over the dict directly (rather than over .items()), you only get access to the keys.
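To illustrate with a small, hypothetical dict_foo:
dict_foo = {'TweetText': 'hello world', 'sentiment': 'positive'}

for key in dict_foo:  # iterating a dict directly yields only the keys
    print(key)

for key, value in dict_foo.items():  # .items() yields (key, value) pairs
    print(key, value)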
I have the code below to process text data using TF-IDF in Python.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import glob
files = glob.glob("Text/*.txt")
with open("all_data.txt","wb") as outfile:
for f in files:
with open(f,"rb") as infile:
outfile.write(infile.read())
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk  # import the NLTK library
from nltk.tokenize import word_tokenize #import word_tokenize for tokenizing text into words
from nltk.tokenize import sent_tokenize #import sent_tokenize for tokenizing paragraph into sentences
from nltk.stem.porter import PorterStemmer #import Porter Stemmer Algorithm
from nltk.stem import WordNetLemmatizer #import WordNet lemmatizer
from nltk.corpus import stopwords #import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory #import Indonesian Stemmer
import re #import regular expression
from nltk.tokenize import RegexpTokenizer
file = open('all_data.txt', 'r')
t = file.read()
text_data = t
#casefolding
def casefolding(s):
    new_str = s.lower()
    return new_str
cf = casefolding(text_data)
#remove punctuation from string
def removepunct(str):
    new_string = re.sub(r"[\W]", " ", str)
    return new_string
rp = removepunct(cf)
#remove digit from string
def removeDigit(str):
    new_string = re.sub(r"[0-9]", " ", str)
    return new_string
rd = removeDigit(rp)
#remove words in length 1-3
def removelg(str):
    new_string = re.sub(r' \w{1,3} ', ' ', str)
    return new_string
rl = removelg(rd)
#remove multiple space
def removespace(str):
    new_string = re.sub(' +', ' ', str)
    return new_string
rms = removespace(rl)
#Stemming Indonesian
def stemmingIndo(str):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(str)
stindo = stemmingIndo(rms)
#remove stopwords
def stpwrds(str):
    stop_words = set(stopwords.words('indonesian'))
    word_tokens = word_tokenize(str)
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    return filtered_sentence
filt = stpwrds(stindo)
par = ' '.join(filt)
def word_tokenization(s):
    tokens = word_tokenize(s)
    return tokens
wordtoken = word_tokenization(par)
bowD = wordtoken
wordSet = set(bowD)
wordDict = dict.fromkeys(wordSet,0)
for word in bowD:
    wordDict[word] += 1
def computeTF(wordDict, bow):
    tfDict = {}
    bowCount = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bowCount)
    return tfDict
tf = computeTF(wordDict, bowD)
def computeIDF(docList):
    import math
    idfDict = {}
    N = len(docList)
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1
    for word, val in idfDict.items():
        idfDict[word] = math.log10((1 + N) / float(val))
    return idfDict
idf = computeIDF([wordDict])
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val * idfs[word]
    return tfidf
tfidf = computeTFIDF(tf, idf)
df = pd.DataFrame({'weight': tfidf})
#test = df.sort_values('tfidf', ascending=False)
test = df.sort_values(by = 'weight', ascending=False)
print(test)
I have managed to run it and got the output below. I don't think there is an error here, but I don't know how to get the full output.
weight
butuh 0.026342
orang 0.019802
milik 0.009629
saudara 0.007267
hidup 0.006359
atur 0.006359
periksa 0.005450
hasil 0.005450
suka 0.004360
barang 0.003997
epps 0.003633
pengaruh 0.003270
perhati 0.003270
agresif 0.003088
salah 0.003088
laku 0.002907
prestasi 0.002907
gantung 0.002907
seksual 0.002907
muhammad 0.002725
rawat 0.002725
benda 0.002725
tolong 0.002725
manja 0.002543
percaya 0.002543
hadap 0.002543
harmonis 0.002543
gaul 0.002543
tekun 0.002362
ubah 0.002362
... ...
widad 0.000908
hubung 0.000727
manusia 0.000727
ekspresi 0.000727
aktivitas 0.000727
taruh 0.000727
pilih 0.000545
masuk 0.000545
putus 0.000545
peka 0.000545
kait 0.000545
ambil 0.000545
sulit 0.000545
paham 0.000545
raih 0.000545
rutin 0.000545
didik 0.000545
laksana 0.000363
kuat 0.000363
mudah 0.000363
jaga 0.000363
patuh 0.000363
gigih 0.000363
tonjol 0.000182
konvensi 0.000182
lingkung 0.000182
sosial 0.000182
interaksi 0.000182
urus 0.000182
tarik 0.000182
[150 rows x 1 columns]
I get a truncated representation, but I want the full output; I want to see all 150 rows. Is there any way to do this? Should I split it into two columns, and how would that work?
A for loop over each row of 'test' will work, printing one row at a time. However, it will be slow to print that many times.
Let us know if that is sufficient.
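For example, a minimal sketch of that loop using DataFrame.iterrows():
for label, row in test.iterrows():  # yields (index, row) pairs, one per word
    print(label, row['weight'])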
The pandas .head() method returns the DataFrame limited to the number of rows you specify. You can try using this method and passing the number of rows you would like to see. For example, to see 150 rows, you can try
print(test.head(150))
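Note that pandas may still truncate the printed output if it exceeds the display.max_rows limit; in that case you can raise the limit or render the whole frame as one string, for example:
import pandas as pd

pd.set_option('display.max_rows', None)  # remove the row limit for printed DataFrames
print(test)

# Or, without touching global options, render the entire DataFrame as one string:
print(test.to_string())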
I have been trying to use a Python NLP script with my Qt GUI based C++ application.
Basically, in the application I am trying to run the NLP script through the command line:
QString path = "D:/DS Project/Treegramming";
QString command("py");
QStringList params = QStringList() << "nlp.py";
params << text;
QProcess *process = new QProcess();
process->setWorkingDirectory(path);
process->start(command, params);
process->waitForFinished();
QString result = process->readAll();
The above works perfectly, but the problem is that it takes about 40-50 seconds to execute, because it first trains the model and then tests it.
I want to train the model once and then test it multiple times, as we do in a Jupyter Notebook.
For that I made a separate function for testing and tried to call it from the command line:
PS D:\DS Project\Treegramming> py nlp.py "test('it was amazing')"
But again this executes the whole script first and then runs the function. Is there anything I can do to solve this?
Python script:
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 6 16:18:01 2019
@author: Muhammad Ahmed
"""
import nltk
import sys
import random
import re,string
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import twitter_samples
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk import FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
def lemmatize_sentence(tokens):
    sentence = []
    lematizer = WordNetLemmatizer()
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        sentence.append(lematizer.lemmatize(word, pos))
    return sentence
def remove_noise(tokens, stop_words=()):
    sentence = []
    for token, tag in pos_tag(tokens):
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            sentence.append(token.lower())
    return sentence
def get_all_words(tokens_list):
    for tokens in tokens_list:
        for token in tokens:
            yield token

def get_tweets_for_model(tokens_list):
    for tweets in tokens_list:
        yield dict([token, True] for token in tweets)
stop_words = stopwords.words('english')
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
all_pos_words = get_all_words( positive_cleaned_tokens_list )
all_neg_words = get_all_words( negative_cleaned_tokens_list )
freq_dis_pos = FreqDist( all_pos_words )
freq_dis_neg = FreqDist( all_neg_words )
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
pos_dataset = [(tweets,"Positive") for tweets in positive_tokens_for_model]
neg_dataset = [(tweets,"Negative") for tweets in negative_tokens_for_model]
dataset = pos_dataset + neg_dataset
random.shuffle(dataset)
train_data = dataset[:7000]
test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)
def test(custom_tweet):
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    res = classifier.classify(dict([token, True] for token in custom_tokens))
    print(res)
    f = open("result.txt", "w")
    f.write(res)
    f.close()

eval(sys.argv[1])
You need to create two Python scripts:
First, to train and save the NaiveBayesClassifier.
Second, to load and test the model.
To avoid repeating code, I will create a script of helper functions and call it utils.py, which should look like this:
import re
import string
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
def lemmatize_sentence(tokens):
    sentence = []
    lematizer = WordNetLemmatizer()
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        sentence.append(lematizer.lemmatize(word, pos))
    return sentence
def remove_noise(tokens, stop_words=()):
    sentence = []
    for token, tag in pos_tag(tokens):
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            sentence.append(token.lower())
    return sentence
def get_all_words(tokens_list):
    for tokens in tokens_list:
        for token in tokens:
            yield token

def get_tweets_for_model(tokens_list):
    for tweets in tokens_list:
        yield dict([token, True] for token in tweets)
Then let's create the training script; I will call it train.py, and it should look like this:
import random
import pickle
from utils import *
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier
from nltk.corpus import twitter_samples
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
stop_words = stopwords.words('english')
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
all_pos_words = get_all_words( positive_cleaned_tokens_list )
all_neg_words = get_all_words( negative_cleaned_tokens_list )
freq_dis_pos = FreqDist( all_pos_words )
freq_dis_neg = FreqDist( all_neg_words )
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)
pos_dataset = [(tweets,"Positive") for tweets in positive_tokens_for_model]
neg_dataset = [(tweets,"Negative") for tweets in negative_tokens_for_model]
dataset = pos_dataset + neg_dataset
random.shuffle(dataset)
train_data = dataset[:7000]
test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)
#### ADD THESE TO SAVE THE CLASSIFIER ####
with open("model.pickle", "wb") as fout:
pickle.dump(classifier, fout)
Finally, the test script test.py, which should look like this:
import sys
import pickle
from nltk import classify
from nltk.tokenize import word_tokenize
from utils import remove_noise
#### ADD THESE TO LOAD THE CLASSIFIER ####
with open('model.pickle', 'rb') as fin:
    classifier = pickle.load(fin)

def test(custom_tweet):
    custom_tokens = remove_noise(word_tokenize(custom_tweet))
    res = classifier.classify(dict([token, True] for token in custom_tokens))
    print(res)
    f = open("result.txt", "w")
    f.write(res)
    f.close()

eval(sys.argv[1])
Now, run train.py once to train the Naive Bayes classifier; it will create a new file called model.pickle that holds the trained classifier. Then run test.py from your C++ application on your custom tweet. test.py loads the trained model from model.pickle and uses it on the given custom tweet.
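For example, mirroring the command from the question, the two calls would look roughly like this (and in the C++ code you would pass "test.py" instead of "nlp.py" in params):
PS D:\DS Project\Treegramming> py train.py
PS D:\DS Project\Treegramming> py test.py "test('it was amazing')"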
I've got code that worked, up until I added the entropy portion to it. Now it's giving me an invalid syntax error on the print line. How come?
import nltk, math, re, numpy
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in freqdist]
    return -sum(p * math.log(p, 2) for p in probs)

def sents():
    fileObj = open('1865-Lincoln.txt', 'r')
    text = fileObj.read()
    tokens = nltk.sent_tokenize(text)
    for name in tokens:
        words = ' '.join(name.split()[:4])
        count = len(name.split())
        entro = entropy(len(name.split())
        print('{:<35} {:^15} {:>15}'.format(words, count, entro))
There is a closing parenthesis missing in the line above the print:
entro = entropy(len(name.split()))
Sorry to dump a whole block of code (below) here. I've been trying to figure out what I'm doing wrong, but unfortunately I have no idea.
For my thesis I have to classify tweets as neutral (0), negative (-1) or positive (1). I'm trying to do this using NLTK. The goal is for the code to return a dictionary of the form 'tweetA,0', 'tweetB,-1', ... At the moment, if I enter more than one tweet as input, I only get the result (i.e. -1/0/1) for the first tweet back.
For example, if I put 'I love oranges', 'I hate tomatoes' as input, I only get '1' back and not '1','-1'.
If anyone is able to help me out, I'd be really grateful!
The code I have so far:
import re, math, collections, itertools
import nltk
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords = True)
pos_tweets = ['I love bananas','I like pears','I eat oranges']
neg_tweets = ['I hate lettuce','I do not like tomatoes','I hate apples']
neutral_tweets = ['I buy chicken','I am boiling eggs','I am chopping vegetables']
def uni(doc):
    x = []
    y = []
    for tweet in doc:
        x.append(word_tokenize(tweet))
    for element in x:
        for word in element:
            if len(word) > 2:
                word = word.lower()
                word = stemmer.stem(word)
                y.append(word)
    return y
def word_feats_uni(doc):
    return dict([(word, True) for word in uni(doc)])

def tokenizer_ngrams(document):
    all_tokens = []
    filtered_tokens = []
    for sentence in document:
        all_tokens.append(word_tokenize(sentence))
    return all_tokens
def get_bi(document):
    x = tokenizer_ngrams(document)
    c = []
    for sentence in x:
        c.extend([bigram for bigram in nltk.bigrams(sentence)])
    return c

def get_tri(document):
    x = tokenizer_ngrams(document)
    c = []
    for sentence in x:
        c.extend([bigram for bigram in nltk.bigrams(sentence)])
    return c
def word_feats_bi(doc):
    return dict([(word, True) for word in get_bi(doc)])

def word_feats_tri(doc):
    return dict([(word, True) for word in get_tri(doc)])
def word_feats_test(doc):
    feats_test = {}
    i = 0
    for tweet in doc:
        feats_test.update(word_feats_uni(tweet))
        feats_test.update(word_feats_bi(tweet))
        feats_test.update(word_feats_tri(tweet))
    return feats_test
pos_feats = [(word_feats_uni(pos_tweets),'1')] + [(word_feats_bi(pos_tweets),'1')] + [(word_feats_tri(pos_tweets),'1')]
neg_feats = [(word_feats_uni(neg_tweets),'-1')] + [(word_feats_bi(neg_tweets),'-1')] + [(word_feats_tri(neg_tweets),'-1')]
neutral_feats = [(word_feats_uni(neutral_tweets),'0')] + [(word_feats_bi(neutral_tweets),'0')] + [(word_feats_tri(neutral_tweets),'0')]
trainfeats = pos_feats + neg_feats + neutral_feats
classifier = NaiveBayesClassifier.train(trainfeats)
print (classifier.classify(word_feats_test(['I love oranges'])))