Python 2: AttributeError: 'list' object has no attribute 'split'

This is my LSA program. In this function I want to tokenize all my text and then stem it. I'm trying to integrate the stemming code shown further below, and I get this:
for word in titles.split(" "):
AttributeError: 'list' object has no attribute 'split'
This is the LSA code:
# -*- coding: utf-8 -*-
from numpy import zeros
from scipy.linalg import svd
from math import log
from numpy import asarray, sum
#from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
#from nltk.stem import PorterStemmer
#from nltk.stem.isri import ISRIStemmer
import nltk
#from matplotlib import pyplot as plt
from snowballstemmer import stemmer

titles = [" ذهبت الاخت الى المدرسة", "تقع المدرسة في الجبال",
          "ذهب الام لزيارة ابنتها في المدرسة ", "تحضر الام الكعكة"]
ar_stemmer = stemmer("arabic")
stopwords = ['ثم', 'و', 'حتى', 'الى', 'على', 'في']
ignorechars = ''',:'!'''

class LSA(object):
    def __init__(self, stopwords, ignorechars):
        self.stopwords = stopwords
        self.ignorechars = ignorechars
        self.wdict = {}
        self.dcount = 0

    def parse(self, doc):
        for word in titles.split(" "):
            stem = ar_stemmer.stemWord(word)
            if stem in self.stopwords:
                pass
            elif stem in self.wdict:
                self.wdict[stem].append(self.dcount)
            else:
                self.wdict[stem] = [self.dcount]
        self.dcount += 1
And this is what I want to integrate:
from snowballstemmer import stemmer

ar_stemmer = stemmer("arabic")
sentence = u" ذهبت الاخت الى المدرسة, تقع المدرسة في الجبال"
for word in sentence.split(" "):
    stem = ar_stemmer.stemWord(word)
    print stem

titles is already a list; do this instead:
for sentence in titles:
    for word in sentence.split(" "):
        ...

List objects don't have a split method the way strings do. If you want to split every string in the titles list, you can nest a loop and do something like this:
def parse(self, doc):
    for title in titles:
        for word in title.split():
            stem = ar_stemmer.stemWord(word)
            if stem in self.stopwords:
                pass
            ...
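As a quick check, here is a minimal, self-contained sketch of that nested loop applied to the first two sample titles (variable names taken from the question; the dictionary building is simplified with setdefault, which is an assumption on my part, not the original class):

from snowballstemmer import stemmer

ar_stemmer = stemmer("arabic")
titles = [" ذهبت الاخت الى المدرسة", "تقع المدرسة في الجبال"]
stopwords = ['ثم', 'و', 'حتى', 'الى', 'على', 'في']
wdict = {}

# Loop over the list of documents first, then over the words of each document
for dcount, title in enumerate(titles):
    for word in title.split():
        stem = ar_stemmer.stemWord(word)
        if stem in stopwords:
            continue
        wdict.setdefault(stem, []).append(dcount)

print(wdict)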

Related

How to modify a word in a for loop in Python

I'm trying to stem some text in Python with SnowballStemmer, but it won't work. Here is the code:
import nltk
from nltk import SnowballStemmer

stem = SnowballStemmer("spanish")

def limpiar(texto):
    texto = texto.split()
    stemm = SnowballStemmer('spanish')
    for palabra in texto:
        palabra = stem.stem(palabra.lower())
    return texto
It returns the text in lowercase, but without stemming.
This will work:
import nltk
from nltk.stem.snowball import SnowballStemmer

def limpiar(texto):
    words = texto.split()
    stem_words = []
    for w in words:
        x = snow_stemmer.stem(w)
        stem_words.append(x)
    # print stemming results
    for e1, e2 in zip(words, stem_words):
        print(e1 + ' ----> ' + e2.lower())

snow_stemmer = SnowballStemmer(language='spanish')
texto = "..."
limpiar(texto)
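If you also want limpiar to return the stemmed words, which is what the original function tried to do, a minimal variant (my own sketch, assuming the same Spanish stemmer and a hypothetical example sentence) is:

from nltk.stem.snowball import SnowballStemmer

snow_stemmer = SnowballStemmer(language='spanish')

def limpiar(texto):
    # Rebinding the loop variable does not modify the list, so build a new one
    return [snow_stemmer.stem(palabra.lower()) for palabra in texto.split()]

print(limpiar("Los gatos corren por el jardin"))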

How to format a list of text into 2 columns

I have the code below to process text data using TF-IDF in Python.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import glob

files = glob.glob("Text/*.txt")
with open("all_data.txt", "wb") as outfile:
    for f in files:
        with open(f, "rb") as infile:
            outfile.write(infile.read())

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk                                          # import library nltk
from nltk.tokenize import word_tokenize              # word_tokenize for tokenizing text into words
from nltk.tokenize import sent_tokenize              # sent_tokenize for tokenizing paragraphs into sentences
from nltk.stem.porter import PorterStemmer           # Porter stemmer algorithm
from nltk.stem import WordNetLemmatizer              # WordNet lemmatizer
from nltk.corpus import stopwords                    # stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory  # Indonesian stemmer
import re                                            # regular expressions
from nltk.tokenize import RegexpTokenizer

file = open('all_data.txt', 'r')
t = file.read()
text_data = t

# case folding
def casefolding(s):
    new_str = s.lower()
    return new_str
cf = casefolding(text_data)

# remove punctuation from string
def removepunct(str):
    new_string = re.sub(r"[\W]", " ", str)
    return new_string
rp = removepunct(cf)

# remove digits from string
def removeDigit(str):
    new_string = re.sub(r"[0-9]", " ", str)
    return new_string
rd = removeDigit(rp)

# remove words of length 1-3
def removelg(str):
    new_string = re.sub(r' \w{1,3} ', ' ', str)
    return new_string
rl = removelg(rd)

# remove multiple spaces
def removespace(str):
    new_string = re.sub(' +', ' ', str)
    return new_string
rms = removespace(rl)

# Indonesian stemming
def stemmingIndo(str):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(str)
stindo = stemmingIndo(rms)

# remove stopwords
def stpwrds(str):
    stop_words = set(stopwords.words('indonesian'))
    word_tokens = word_tokenize(stindo)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    return filtered_sentence
filt = stpwrds(stindo)
par = ' '.join(filt)

def word_tokenization(s):
    tokens = word_tokenize(s)
    return tokens
wordtoken = word_tokenization(par)

bowD = wordtoken
wordSet = set(bowD)
wordDict = dict.fromkeys(wordSet, 0)
for word in bowD:
    wordDict[word] += 1

def computeTF(wordDict, bow):
    tfDict = {}
    bowCount = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bowCount)
    return tfDict
tf = computeTF(wordDict, bowD)

def computeIDF(docList):
    import math
    idfDict = {}
    N = len(docList)
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1
    for word, val in idfDict.items():
        idfDict[word] = math.log10((1 + N) / float(val))
    return idfDict
idf = computeIDF([wordDict])

def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val * idfs[word]
    return tfidf
tfidf = computeTFIDF(tf, idf)

df = pd.DataFrame({'weight': tfidf})
#test = df.sort_values('tfidf', ascending=False)
test = df.sort_values(by='weight', ascending=False)
print(test)
I have managed to run it and got the output below. I don't think there is an error here, but I don't know how to get the full output:
weight
butuh 0.026342
orang 0.019802
milik 0.009629
saudara 0.007267
hidup 0.006359
atur 0.006359
periksa 0.005450
hasil 0.005450
suka 0.004360
barang 0.003997
epps 0.003633
pengaruh 0.003270
perhati 0.003270
agresif 0.003088
salah 0.003088
laku 0.002907
prestasi 0.002907
gantung 0.002907
seksual 0.002907
muhammad 0.002725
rawat 0.002725
benda 0.002725
tolong 0.002725
manja 0.002543
percaya 0.002543
hadap 0.002543
harmonis 0.002543
gaul 0.002543
tekun 0.002362
ubah 0.002362
... ...
widad 0.000908
hubung 0.000727
manusia 0.000727
ekspresi 0.000727
aktivitas 0.000727
taruh 0.000727
pilih 0.000545
masuk 0.000545
putus 0.000545
peka 0.000545
kait 0.000545
ambil 0.000545
sulit 0.000545
paham 0.000545
raih 0.000545
rutin 0.000545
didik 0.000545
laksana 0.000363
kuat 0.000363
mudah 0.000363
jaga 0.000363
patuh 0.000363
gigih 0.000363
tonjol 0.000182
konvensi 0.000182
lingkung 0.000182
sosial 0.000182
interaksi 0.000182
urus 0.000182
tarik 0.000182
[150 rows x 1 columns]
I get a truncated representation, but I want the full output: all 150 rows.
Is there any way to do this? Should I split it into 2 columns, and how would that work?
A for loop over each row of test will work, printing one row at a time. However, it will be slow to print that many times.
Let us know if that is sufficient.
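A minimal sketch of that row-by-row loop, assuming test is the sorted DataFrame from the question:

# Print every row of the sorted DataFrame on its own line
for term, row in test.iterrows():
    print('{:<15} {:.6f}'.format(term, row['weight']))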
The pandas .head(n) method returns the first n rows of the DataFrame, so you can print exactly as many rows as you would like to see. For example, to see 150 rows, you can try
print(test.head(150))

Trying to print 3 results into a table using re

I've got code that worked until I added the entropy portion to it. Now it's giving me an invalid syntax error on the print line. How come?
import nltk, math, re, numpy
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer

def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(1) for l in freqdist]
    return -sum(p * math.log(p, 2) for p in probs)

def sents():
    fileObj = open('1865-Lincoln.txt', 'r')
    text = fileObj.read()
    tokens = nltk.sent_tokenize(text)
    for name in tokens:
        words = ' '.join(name.split()[:4])
        count = len(name.split())
        entro = entropy(len(name.split())
        print('{:<35} {:^15} {:>15}'.format(words, count, entro))
There is a closing bracket missing on the line above the print:
entro = entropy(len(name.split()))
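With that bracket added, sents() compiles; for completeness, the corrected loop (otherwise unchanged from the question) looks like this:

def sents():
    fileObj = open('1865-Lincoln.txt', 'r')
    text = fileObj.read()
    tokens = nltk.sent_tokenize(text)
    for name in tokens:
        words = ' '.join(name.split()[:4])
        count = len(name.split())
        entro = entropy(len(name.split()))  # closing bracket added here
        print('{:<35} {:^15} {:>15}'.format(words, count, entro))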

Python: Unable to import module app in AWS Lambda

I have the file app.py in the root of my app.zip file, and the function handler is defined properly (lambda_handler), matching the handler config: app.lambda_handler.
Yet I am getting the error: Unable to import module 'app': No module named app
Where did I go wrong?
My script:
from __future__ import print_function
import json
import urllib
import boto3
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()

import sys
reload(sys)
sys.setdefaultencoding('utf8')

print('Loading function')
s3 = boto3.client('s3')

number_of_sentences = 0
number_of_words = 0
word_list = []
stop_words = set(stopwords.words('english'))
stop_word_list = [v for v in stop_words]
modal_verbs = ['can', 'could', 'may', 'might', 'must', 'shall', 'should', 'will', 'would', 'ought']
auxilary_verbs = ['be', 'do', 'have']
stop_word_list = stop_word_list + modal_verbs + auxilary_verbs

print("Starting Trigram generation")
# Empty trigram list
tri_gram_list = []

def lambda_handler(event, context):
    #print("Received event: " + json.dumps(event, indent=2))
    # Get the object from the event and show its content type
    '''
    '''
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.unquote_plus(event['Records'][0]['s3']['object']['key'].encode('utf8'))
    try:
        response = s3.get_object(Bucket=bucket, Key=key)
        print("CONTENT TYPE: " + response['ContentType'])
        text = response['Body'].read()
        print(type(text))
        for line in text.readlines():
            for line in open("input.txt", "r").readlines():
                line = unicode(line, errors='ignore')
                if len(line) > 1:
                    sentences = sent_tokenize(line)
                    number_of_sentences += len(sentences)
                    for sentence in sentences:
                        sentence = sentence.strip().lower()
                        #sentence = sentence.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ').replace('(', ' ').replace(')', ' ').replace(''`'', ' ').strip().lower()
                        words_from_sentence = tokenizer.tokenize(line)
                        words = [word for word in words_from_sentence if word not in stop_word_list]
                        number_of_words += len(words)
                        stemmed_words = [stemmer.stem(word) for word in words]
                        word_list.extend(stemmed_words)
                        # generate trigrams
                        tri_gram_list_t = [" ".join([words[index], words[index+1], words[index+2]]) for index, value in enumerate(words) if index < len(words) - 2]
                        #print tri_gram_list
                        tri_gram_list.extend(tri_gram_list_t)
        print number_of_words
        print number_of_sentences
        print("Conting frequency now...")
        count = Counter()
        for element in tri_gram_list:
            #print element, type(tri_gram_list)
            count[element] = count[element] + 1
        print count.most_common(25)
        print "most common 25 words ARE:"
        for element in word_list:
            #print element, type(tri_gram_list)
            count[element] = count[element] + 1
        print count.most_common(25)
        # body = obj.get()['Body'].read()
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        raise e
Where did I go wrong?
Try checking the log output. It will give you more information than the error you saw above.
Finally, remember that your script relies on Python 2 print syntax; if the Lambda runtime is Python 3, a statement like print number_of_words is a SyntaxError, which surfaces as "Unable to import module". Replace such calls with print(number_of_words).
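For example, the affected statements near the end of lambda_handler become ordinary function calls (a sketch of just those lines, assuming a Python 3 runtime):

# Python 3-compatible versions of the print statements in lambda_handler
print(number_of_words)
print(number_of_sentences)
print(count.most_common(25))
print("most common 25 words ARE:")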

Python: NLTK: return dictionary - only returns 1 value

Sorry to dump a whole block of code (below) here. I've been trying to figure out what I'm doing wrong, but unfortunately I have no idea.
For my thesis I have to classify tweets as neutral (0), negative (-1) or positive (1). I'm trying to do this with NLTK. The goal is for the code to return a dictionary of the form 'tweetA,0', 'tweetB,-1', ... At the moment, if I enter more than one tweet as input, I only get the result (i.e. -1/0/1) for the first tweet back.
For example, if I put 'I love oranges', 'I hate tomatoes' in as input, I only get '1' back and not '1', '-1'.
If anyone would be able to help me out, I'd be really grateful!
The code I have up until now:
import re, math, collections, itertools
import nltk
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import *
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english", ignore_stopwords=True)

pos_tweets = ['I love bananas', 'I like pears', 'I eat oranges']
neg_tweets = ['I hate lettuce', 'I do not like tomatoes', 'I hate apples']
neutral_tweets = ['I buy chicken', 'I am boiling eggs', 'I am chopping vegetables']

def uni(doc):
    x = []
    y = []
    for tweet in doc:
        x.append(word_tokenize(tweet))
    for element in x:
        for word in element:
            if len(word) > 2:
                word = word.lower()
                word = stemmer.stem(word)
                y.append(word)
    return y

def word_feats_uni(doc):
    return dict([(word, True) for word in uni(doc)])

def tokenizer_ngrams(document):
    all_tokens = []
    filtered_tokens = []
    for (sentence) in document:
        all_tokens.append(word_tokenize(sentence))
    return all_tokens

def get_bi(document):
    x = tokenizer_ngrams(document)
    c = []
    for sentence in x:
        c.extend([bigram for bigram in nltk.bigrams(sentence)])
    return c

def get_tri(document):
    x = tokenizer_ngrams(document)
    c = []
    for sentence in x:
        c.extend([bigram for bigram in nltk.bigrams(sentence)])
    return c

def word_feats_bi(doc):
    return dict([(word, True) for word in get_bi(doc)])

def word_feats_tri(doc):
    return dict([(word, True) for word in get_tri(doc)])

def word_feats_test(doc):
    feats_test = {}
    i = 0
    for tweet in doc:
        feats_test.update(word_feats_uni(tweet))
        feats_test.update(word_feats_bi(tweet))
        feats_test.update(word_feats_tri(tweet))
    return feats_test

pos_feats = [(word_feats_uni(pos_tweets), '1')] + [(word_feats_bi(pos_tweets), '1')] + [(word_feats_tri(pos_tweets), '1')]
neg_feats = [(word_feats_uni(neg_tweets), '-1')] + [(word_feats_bi(neg_tweets), '-1')] + [(word_feats_tri(neg_tweets), '-1')]
neutral_feats = [(word_feats_uni(neutral_tweets), '0')] + [(word_feats_bi(neutral_tweets), '0')] + [(word_feats_tri(neutral_tweets), '0')]

trainfeats = pos_feats + neg_feats + neutral_feats
classifier = NaiveBayesClassifier.train(trainfeats)

print(classifier.classify(word_feats_test(['I love oranges'])))
