I have this code below to process text data using tf idf in python.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import glob
files = glob.glob("Text/*.txt")
with open("all_data.txt","wb") as outfile:
for f in files:
with open(f,"rb") as infile:
outfile.write(infile.read())
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk #import library nltk
from nltk.tokenize import word_tokenize #import word_tokenize for tokenizing text into words
from nltk.tokenize import sent_tokenize #import sent_tokenize for tokenizing paragraph into sentences
from nltk.stem.porter import PorterStemmer #import Porter Stemmer Algorithm
from nltk.stem import WordNetLemmatizer #import WordNet lemmatizer
from nltk.corpus import stopwords #import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory #import Indonesian Stemmer
import re #import regular expression
from nltk.tokenize import RegexpTokenizer
file = open('all_data.txt', 'r')
t = file.read()
text_data = t
#casefolding
def casefolding(s):
new_str = s.lower()
return new_str
cf = casefolding(text_data)
#remove punctuation from string
def removepunct(str):
new_string = re.sub(r"[\W]", " ", str)
return new_string
rp = removepunct(cf)
#remove digit from string
def removeDigit(str):
new_string = re.sub(r"[0-9]", " ", str)
return new_string
rd = removeDigit(rp)
#remove words in length 1-3
def removelg(str):
new_string = re.sub(r' \w{1,3} ', ' ', str)
return new_string
rl = removelg(rd)
#remove multiple space
def removespace(str):
new_string = re.sub(' +', ' ',str)
return new_string
rms = removespace(rl)
#Stemming Indonesian
def stemmingIndo(str):
factory = StemmerFactory()
stemmer = factory.create_stemmer()
return stemmer.stem(str)
stindo = stemmingIndo(rms)
#remove stopwords
def stpwrds(str):
stop_words = set(stopwords.words('indonesian'))
word_tokens = word_tokenize(stindo)
filtered_sentence = [w for w in word_tokens if not w in stop_words]
filtered_sentence = []
for w in word_tokens:
if w not in stop_words:
filtered_sentence.append(w)
return filtered_sentence
filt = stpwrds(stindo)
par = ' '.join(filt)
def word_tokenization(s):
tokens = word_tokenize(s)
return tokens
wordtoken = word_tokenization(par)
bowD = wordtoken
wordSet = set(bowD)
wordDict = dict.fromkeys(wordSet,0)
for word in bowD:
wordDict[word]+=1
def computeTF(wordDict, bow):
tfDict = {}
bowCount = len(bow)
for word, count in wordDict.items():
tfDict[word] = count/float(bowCount)
return tfDict
tf = computeTF(wordDict, bowD)
def computeIDF(docList):
import math
idfDict = {}
N = len(docList)
idfDict = dict.fromkeys(docList[0].keys(), 0)
for doc in docList:
for word, val in doc.items():
if val > 0:
idfDict[word] += 1
for word, val in idfDict.items():
idfDict[word] = math.log10((1+N) / float(val))
return idfDict
idf = computeIDF([wordDict])
def computeTFIDF(tfBow, idfs):
tfidf = {}
for word, val in tfBow.items():
tfidf[word] = val*idfs[word]
return tfidf
tfidf = computeTFIDF(tf, idf)
df = pd.DataFrame({'weight': tfidf})
#test = df.sort_values('tfidf', ascending=False)
test = df.sort_values(by = 'weight', ascending=False)
print(test)
I have managed to run it and got the output below. I don't think there is error from this but I don't how to get the full output
weight
butuh 0.026342
orang 0.019802
milik 0.009629
saudara 0.007267
hidup 0.006359
atur 0.006359
periksa 0.005450
hasil 0.005450
suka 0.004360
barang 0.003997
epps 0.003633
pengaruh 0.003270
perhati 0.003270
agresif 0.003088
salah 0.003088
laku 0.002907
prestasi 0.002907
gantung 0.002907
seksual 0.002907
muhammad 0.002725
rawat 0.002725
benda 0.002725
tolong 0.002725
manja 0.002543
percaya 0.002543
hadap 0.002543
harmonis 0.002543
gaul 0.002543
tekun 0.002362
ubah 0.002362
... ...
widad 0.000908
hubung 0.000727
manusia 0.000727
ekspresi 0.000727
aktivitas 0.000727
taruh 0.000727
pilih 0.000545
masuk 0.000545
putus 0.000545
peka 0.000545
kait 0.000545
ambil 0.000545
sulit 0.000545
paham 0.000545
raih 0.000545
rutin 0.000545
didik 0.000545
laksana 0.000363
kuat 0.000363
mudah 0.000363
jaga 0.000363
patuh 0.000363
gigih 0.000363
tonjol 0.000182
konvensi 0.000182
lingkung 0.000182
sosial 0.000182
interaksi 0.000182
urus 0.000182
tarik 0.000182
[150 rows x 1 columns]
I get a truncated representation, but I want the full array. I want to see the 150 data in row.
Is there any way to do this? should i split into 2 column and how's it working?
A for loop over each row of 'test' will work, print one row at a time. However it'll be slow to print so many times.
Let us know if that is sufficient.
The Pandas .head() method will print the dataframe with the number of rows you specify. You can try using this method and inserting the number of rows you would like to see. For example to see 150 rows, you can try
print(test.head(150))
Related
I did preprocessing the tweet data stored in the vaksinsampel2.csv file. And I have done several steps such as text cleaning, case folding, tokenizing, stopword removal, normalization. but I can't do stemming. please help me solve it.
here the code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
import string
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
df = pd.read_csv('vaksinsampel2.csv', encoding = 'unicode_escape')
def remove_punct(tweet):
tweet = re.sub('[^a-zA-Z0-9 ]', ' ', str(tweet))
tweet = re.sub('[0-9]+', ' ', tweet)
tweet = re.sub(r'#', '', str(tweet))
tweet = re.sub(r'http\S+', ' ', tweet)
return tweet
df['TEXT'] = df['full_text'].apply(lambda x:remove_punct(x))
df['case_folding'] = df['TEXT'].str.lower()
def tokenization(tweet):
tweet = re.split('\W+', tweet)
return tweet
df['Tokenization'] = df['TEXT'].apply(lambda x: tokenization(x.lower()))
df.head(10)
stopword = nltk.corpus.stopwords.words('indonesian')
def remove_stopwords(tweet):
tweet = [word for word in tweet if word not in stopword]
return tweet
df['Stopword_Removal'] = df['Tokenization'].apply(lambda x: remove_stopwords(x))
df.head(10)
def normalisasi(tweet):
kamus_slangword = eval(open("slang_indonesia.txt").read()) # Membuka dictionary slangword
pattern = re.compile(r'\b( ' + '|'.join (kamus_slangword.keys())+r')\b') # Search pola kata (contoh kpn -> kapan)
content = []
for kata in tweet:
filteredSlang = pattern.sub(lambda x: kamus_slangword[x.group()],kata) # Replace slangword berdasarkan pola review yg telah ditentukan
content.append(filteredSlang.lower())
tweet = content
return tweet
df['Normalization'] = df['Stopword_Removal'].apply(lambda x: normalisasi(x))
df.head(10)
factory = StemmerFactory()
stemming = factory.create_stemmer()
def stem_list(tweet):
return stemming.stem(df['Normalization'])
df['Stemming'] = df.apply(stem_list, axis=1)
df.head(50)
stemmer.stem() and not stemming.stem()?
I am referring to this: https://pypi.org/project/Sastrawi/
(...just happened to start exploring Sastrawi today)
Try This
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def stemming(tweet):
text = [stemmer.stem(word) for word in tweet]
return tweet
df['Stemming'] = df['Normalization'].apply(lambda x: stemming(x))
I've got a code that worked, up until I added the entropy portion to it. Now it's giving me an invalid syntax error on the print line. How come?
import nltk, math, re, numpy
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
def entropy(labels):
freqdist = nltk.FreqDist(labels)
probs = [freqdist.freq(1) for l in freqdist]
return -sum(p * math.log(p,2) for p in probs)
def sents():
fileObj = open('1865-Lincoln.txt', 'r')
text = fileObj.read()
tokens = nltk.sent_tokenize(text)
for name in tokens:
words = ' '.join(name.split()[:4])
count = len(name.split())
entro = entropy(len(name.split())
print('{:<35} {:^15} {:>15}'.format(words, count, entro))
There is a closing bracket missing in the line above:
entro = entropy(len(name.split()))
I am trying to use the maxent classifier using the NLTK library. I have a list of positive and negative words and I have trained the classifier on the same. The problem is when I test the classifier against a sentence I always get the same probability of classification for the two classes. Here is the code -
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import MaxentClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from sklearn import cross_validation
nltk.data.path.append("/home/daksh/Documents/Softwares/nltk_data")
import csv
import operator
from nltk.classify import MaxentClassifier
from nltk.corpus import movie_reviews
def getBestWords(posWords,negWords):
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for word in posWords:
word_fd[word.lower()] += 1
label_word_fd['pos'][word.lower()] += 1
for word in negWords:
word_fd[word.lower()] += 1
label_word_fd['neg'][word.lower()] += 1
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count
word_scores = {}
for word, freq in word_fd.items():
pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
(freq, pos_word_count), total_word_count)
neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
(freq, neg_word_count), total_word_count)
word_scores[word] = pos_score + neg_score
sorted_x = sorted(word_scores.items(), key=operator.itemgetter(1),reverse=True)[:2500]
bestwords = set([w for w,s in sorted_x])
return bestwords
def best_word_feats(words,bestwords):
return dict([(word, True) for word in words if word in bestwords])
def word_feats(words):
return dict([(word, True) for word in words])
def best_bigram_word_feats(words,posWords,negWords, score_fn=BigramAssocMeasures.chi_sq, n=200):
bigram_finder = BigramCollocationFinder.from_words(words)
bigrams = bigram_finder.nbest(score_fn, n)
d = dict([(bigram, True) for bigram in bigrams])
bestwords = getBestWords(posWords,negWords)
d.update(best_word_feats(words,bestwords))
return d
posWords = list()
negWords = list()
with open('../data/finalSentiPosWords.csv','r') as csvfile:
spamreader = csv.reader(csvfile)
posWords = list(spamreader)
with open('../data/finalSentiNegWords.csv','r') as csvfile:
spamreader = csv.reader(csvfile)
negWords = list(spamreader)
posWords = [word[0] for word in posWords]
negWords = [word[0] for word in negWords]
bestwords = getBestWords(posWords,negWords)
posfeats = [(best_bigram_word_feats(posWords,posWords,negWords),'pos')]
negfeats = [(best_bigram_word_feats(negWords,posWords,negWords),'neg')]
trainfeats = negfeats + posfeats
algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
classifier = nltk.MaxentClassifier.train(trainfeats, algorithm,max_iter=5)
# classifier = nltk.NaiveBayesClassifier.train(trainfeats)
classifier.show_most_informative_features(10)
sentence = "Dosa had a tangy taste but it was fun eating it. On the other hand the other dosa was soggy"
l = sentence.split(' ')
print(l)
print(word_feats(l))
print(classifier.prob_classify(word_feats(l)).prob('pos'))
print(classifier.prob_classify(word_feats(l)).prob('neg'))
The output of this is this -
0.500074231063
0.499925768937
The overall classification seems to be working fine but I can't figure out how the probabilities are calculated and why are they always same even if I change the test sentence.
Any quick help appreciated.
Thanks.
That's a lot of code! I'm not going to debug it for you, but I notice that bestwords is the set of all words in your training corpus. If that's not outright wrong, it's certainly misleadingly named.
I have a huge block of code, I didn't want to bother you with in the first place. I tried figuring out what's going wrong for over a week now and I contacted several external sources (without any response), and at the moment I'm just wondering: maybe the problem is my training set?
For my thesis I need to classify a whole bunch of tweets as pos/neg/neutral. The code I wrote works OK on test datasets I make up myself (e.g. consisting out of 15 training sentences: 5 pos, 5 neg and 5 neutral; 6 test sentences: 2 pos, 2 neg, 2 neutral - only 1 test sentence gets misclassified).
Once I start running the code on the manually classified training set (1629 pos, 1411 neutral tweets and only 690 neg) and 900 test tweets, things start going wrong. Of the 900 test tweets, the HUGE majority gets classified as pos (between 700 and 800), while there's only a minority of neg and neutral tweets.
Would somebody please be so kind as to check my code and help me figure out what I'm doing wrong? I'd be really grateful. If you need any more information, I'd be happy to provide it.
import re, math, collections, itertools
import nltk
import nltk.classify.util, nltk.metrics
import csv
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords = True)
pos = []
neg = []
neutral = []
with open('C:\\...pos.csv', 'r', encoding = "utf8") as f: #open positive training set
reader = csv.reader(f)
for row in reader:
pos.extend(row)
with open('C:\\ ...neg.csv', 'r', encoding = "utf8") as f: #open negative training set
reader = csv.reader(f)
for row in reader:
neg.extend(row)
with open('C:\\...neutral.csv', 'r', encoding = "utf8") as f: #open neutral training set
reader = csv.reader(f)
for row in reader:
neutral.extend(row)
def uni(doc):
x = []
y = []
for tweet in doc:
x.append(word_tokenize(tweet))
for element in x:
for word in element:
if len(word)>2:
word = word.lower()
word = stemmer.stem(word)
y.append(word)
return y
def word_feats_uni(doc):
return dict([(word, True) for word in uni(doc)])
def tokenizer_ngrams(document):
all_tokens = []
filtered_tokens = []
for (sentence) in document:
all_tokens.append(word_tokenize(sentence))
return all_tokens
def get_bi (document):
x = tokenizer_ngrams(document)
c = []
for sentence in x:
c.extend([bigram for bigram in nltk.bigrams(sentence)])
return c
def get_tri(document):
x = tokenizer_ngrams(document)
c = []
for sentence in x:
c.extend([bigram for bigram in nltk.bigrams(sentence)])
return c
def word_feats_bi(doc):
return dict([(word, True) for word in get_bi(doc)])
def word_feats_tri(doc):
return dict([(word, True) for word in get_tri(doc)])
def word_feats_test(doc):
feats_test = {}
feats_test.update(word_feats_uni(doc))
feats_test.update(word_feats_bi(doc))
feats_test.update(word_feats_tri(doc))
return feats_test
pos_feats = [(word_feats_uni(pos),'1')] + [(word_feats_bi(pos),'1')] + [(word_feats_tri(pos),'1')]
neg_feats = [(word_feats_uni(neg),'-1')] + [(word_feats_bi(neg),'-1')] + [(word_feats_tri(neg),'-1')]
neutral_feats = [(word_feats_uni(neutral),'0')] + [(word_feats_bi(neutral),'0')] + [(word_feats_tri(neutral),'0')]
trainfeats = pos_feats + neg_feats + neutral_feats
random.shuffle(trainfeats)
classifier = NaiveBayesClassifier.train(trainfeats)
testtweets = []
with open('C:\\ ... testtweets.csv', 'r', encoding = "utf8") as f: #open testset
reader = csv.reader(f, delimiter = ';')
for row in reader:
testtweets.extend([row])
date = []
word = []
y = []
def classification(date,sentence): #doc = sentencelist
i = 0
for tweet in sentence:
sent = classifier.classify(word_feats_test([tweet]))
y.extend([(date[i],tweet,sent)])
i = i + 1
def result(doc):
i = 0
while i in range(0,len(doc) -1):
date.append(doc[i][0])
word.append(doc[i][1])
i = i + 1
classification(date,word)
result(testtweets)
with open('C:\\...write.csv', 'w') as fp: #write classified test set to file
a = csv.writer(fp, delimiter=',')
a.writerows(y)
Sorry to dump a whole block of code (below) here. I've been trying to figure out what I'm doing wrong, but unfortunately I have no idea.
For my thesis I have to classify tweets as neutral (0), negative (-1) or positive (1). I'm trying this by using NLTK. Goal is that the code returns a dictionary in the form 'tweetA,0','tweetB,-1'... At the moment, if I enter more than one tweet as an input, I only get the result (i.e. -1/0/1) for the first tweet back.
For example, if I put 'I love oranges','I hate tomatoes' as in input, I only get '1' as a return and not '1','-1'.
If anyone would be able to help me out, I'd be really grateful!
The code I have up until now:
import re, math, collections, itertools
import nltk
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords = True)
pos_tweets = ['I love bananas','I like pears','I eat oranges']
neg_tweets = ['I hate lettuce','I do not like tomatoes','I hate apples']
neutral_tweets = ['I buy chicken','I am boiling eggs','I am chopping vegetables']
def uni(doc):
x = []
y = []
for tweet in doc:
x.append(word_tokenize(tweet))
for element in x:
for word in element:
if len(word)>2:
word = word.lower()
word = stemmer.stem(word)
y.append(word)
return y
def word_feats_uni(doc):
return dict([(word, True) for word in uni(doc)])
def tokenizer_ngrams(document):
all_tokens = []
filtered_tokens = []
for (sentence) in document:
all_tokens.append(word_tokenize(sentence))
return all_tokens
def get_bi (document):
x = tokenizer_ngrams(document)
c = []
for sentence in x:
c.extend([bigram for bigram in nltk.bigrams(sentence)])
return c
def get_tri(document):
x = tokenizer_ngrams(document)
c = []
for sentence in x:
c.extend([bigram for bigram in nltk.bigrams(sentence)])
return c
def word_feats_bi(doc):
return dict([(word, True) for word in get_bi(doc)])
def word_feats_tri(doc):
return dict([(word, True) for word in get_tri(doc)])
def word_feats_test(doc):
feats_test = {}
i = 0
for tweet in doc:
feats_test.update(word_feats_uni(tweet))
feats_test.update(word_feats_bi(tweet))
feats_test.update(word_feats_tri(tweet))
return feats_test
pos_feats = [(word_feats_uni(pos_tweets),'1')] + [(word_feats_bi(pos_tweets),'1')] + [(word_feats_tri(pos_tweets),'1')]
neg_feats = [(word_feats_uni(neg_tweets),'-1')] + [(word_feats_bi(neg_tweets),'-1')] + [(word_feats_tri(neg_tweets),'-1')]
neutral_feats = [(word_feats_uni(neutral_tweets),'0')] + [(word_feats_bi(neutral_tweets),'0')] + [(word_feats_tri(neutral_tweets),'0')]
trainfeats = pos_feats + neg_feats + neutral_feats
classifier = NaiveBayesClassifier.train(trainfeats)
print (classifier.classify(word_feats_test(['I love oranges'])))