I'm doing automatic language detection in Python using stopwords, but I'm getting a KeyError when I try to test the code.
This is the code:
import nltk
from nltk.corpus import stopwords

def scoreFunction(wholetext):
    dictiolist = {}
    scorelist = {}
    NLTKlanguages = ["dutch","finnish","german","italian","portuguese","spanish","turkish","danish","english","french","hungarian","norwegian","russian","swedish"]
    FREElanguages = [""]
    languages = NLTKlanguages + FREElanguages
    for lang in NLTKlanguages:
        dictiolist[lang] = stopwords.words(lang)
    tokens = nltk.tokenize.word_tokenize(wholetext)
    tokens = [t.lower() for t in tokens]
    freq_dist = nltk.FreqDist(tokens)
    for lang in languages:
        scorelist[lang] = 0
    for word in freq_dist.keys()[0:20]:
        if word in dictiolist[lang]:
            scorelist[lang] += 1
    return scorelist

def whichLanguage(scorelist):
    maximum = 0
    for item in scorelist:
        value = scorelist[item]
        if maximum < value:
            maximum = value
            lang = item
    return lang
When I run scoreFunction("hello my name is osfar and i'm very genius"), I get this error:
Traceback (most recent call last):
  File "", line 1, in
    scoreFunction("hello my name is osfar and i'm very genius")
  File "C:/Users/osama1/Desktop/fun-test", line 17, in scoreFunction
    if word in dictiolist[lang]:
KeyError: ''
Your problem is in the following block of code:
for word in freq_dist.keys()[0:20]:
    if word in dictiolist[lang]:
        scorelist[lang] += 1
You're using the variable lang in this for loop, but you aren't setting it anywhere inside it, so it keeps whatever value it last had. As it happens, that value is "" (the empty string), because that was the last value it took in your previous for loop, and "" is not a key in dictiolist.
What you apparently meant to do is:
for word in freq_dist.keys()[0:20]:
    for lang in languages:
        if word in dictiolist[lang]:
            scorelist[lang] += 1
By the way, there's an easier way to do what you're trying to do: use a Counter. See http://docs.python.org/2.7/library/collections.html#counter-objects for more information.
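For instance, a rough sketch of the scoring part using a Counter might look like this (assuming dictiolist and freq_dist have been built as in your function; the names here are just illustrative):

from collections import Counter

# Score each language by how many of the first 20 keys of freq_dist
# (the most frequent tokens, in older NLTK versions) appear in its stopword list.
scorelist = Counter()
for word in list(freq_dist.keys())[:20]:
    for lang, stops in dictiolist.items():
        if word in stops:
            scorelist[lang] += 1

# most_common(1) returns the best-scoring (language, score) pair,
# which replaces the manual maximum search in whichLanguage().
best_lang, best_score = scorelist.most_common(1)[0]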
I'm working on a classification task, using a movie reviews dataset from Kaggle. The part with which I'm struggling is a series of functions, in which the output of one becomes the input of the next.
Specifically, in the code provided, the function "word_token" takes the input "phraselist", tokenizes it, and returns a tokenized document titled "phrasedocs". The only problem is that it doesn't seem to be working, because when I take that theoretical document "phrasedocs" and enter it into the next function, "process_token", I get:
NameError: name 'phrasedocs' is not defined
I am completely willing to accept that there is something simple I have overlooked, but I've been on this for hours and I can't figure it out. I would appreciate any help.
I have tried proofreading and debugging the code, but my Python expertise is not great.
# This function obtains data from train.tsv
def processkaggle(dirPath, limitStr):
    # Convert the limit argument from a string to an int
    limit = int(limitStr)
    os.chdir(dirPath)
    f = open('./train.tsv', 'r')
    # Loop over lines in the file and use their first limit
    phrasedata = []
    for line in f:
        # Ignore the first line starting with Phrase, then read all lines
        if (not line.startswith('Phrase')):
            # Remove final end of line character
            line = line.strip()
            # Each line has four items, separated by tabs
            # Ignore the phrase and sentence IDs, keep the phrase and sentiment
            phrasedata.append(line.split('\t')[2:4])
    return phrasedata

# Randomize and subset data
def random_phrase(phrasedata):
    random.shuffle(phrasedata)  # phrasedata initiated in function processkaggle
    phraselist = phrasedata[:limit]
    for phrase in phraselist[:10]:
        print(phrase)
    return phraselist

# Tokenization
def word_token(phraselist):
    phrasedocs = []
    for phrase in phraselist:
        tokens = nltk.word_tokenize(phrase[0])
        phrasedocs.append((tokens, int(phrase[1])))
    return phrasedocs

# Pre-processing
# Convert all tokens to lower case
def lower_case(doc):
    return [w.lower() for w in doc]

# Clean text, fixing confusion over apostrophes
def clean_text(doc):
    cleantext = []
    for review_text in doc:
        review_text = re.sub(r"it 's", "it is", review_text)
        review_text = re.sub(r"that 's", "that is", review_text)
        review_text = re.sub(r"\'s", "\'s", review_text)
        review_text = re.sub(r"\'ve", "have", review_text)
        review_text = re.sub(r"wo n't", "will not", review_text)
        review_text = re.sub(r"do n't", "do not", review_text)
        review_text = re.sub(r"ca n't", "can not", review_text)
        review_text = re.sub(r"sha n't", "shall not", review_text)
        review_text = re.sub(r"n\'t", "not", review_text)
        review_text = re.sub(r"\'re", "are", review_text)
        review_text = re.sub(r"\'d", "would", review_text)
        review_text = re.sub(r"\'ll", "will", review_text)
        cleantext.append(review_text)
    return cleantext

# Remove punctuation and numbers
def rem_no_punct(doc):
    remtext = []
    for text in doc:
        punctuation = re.compile(r'[-_.?!/\%#,":;\'{}<>~`()|0-9]')
        word = punctuation.sub("", text)
        remtext.append(word)
    return remtext

# Remove stopwords
def rem_stopword(doc):
    stopwords = nltk.corpus.stopwords.words('english')
    updatestopwords = [word for word in stopwords if word not in ['not','no','can','has','have','had','must','shan','do','should','was','were','won','are','cannot','does','ain','could','did','is','might','need','would']]
    return [w for w in doc if not w in updatestopwords]

# Lemmatization
def lemmatizer(doc):
    wnl = nltk.WordNetLemmatizer()
    lemma = [wnl.lemmatize(t) for t in doc]
    return lemma

# Stemming
def stemmer(doc):
    porter = nltk.PorterStemmer()
    stem = [porter.stem(t) for t in doc]
    return stem

# This function combines all the previous pre-processing functions into one, which is helpful
# if I want to alter these settings for experimentation later
def process_token(phrasedocs):
    phrasedocs2 = []
    for phrase in phrasedocs:
        tokens = nltk.word_tokenize(phrase[0])
        tokens = lower_case(tokens)
        tokens = clean_text(tokens)
        tokens = rem_no_punct(tokens)
        tokens = rem_stopword(tokens)
        tokens = lemmatizer(tokens)
        tokens = stemmer(tokens)
        phrasedocs2.append((tokens, int(phrase[1])))  # Any words that pass through the processing
                                                      # steps above are added to phrasedocs2
    return phrasedocs2

dirPath = 'C:/Users/J/kagglemoviereviews/corpus'
processkaggle(dirPath, 5000)  # returns 'phrasedata'
random_phrase(phrasedata)     # returns 'phraselist'
word_token(phraselist)        # returns 'phrasedocs'
process_token(phrasedocs)     # returns phrasedocs2
NameError Traceback (most recent call last)
<ipython-input-120-595bc4dcf121> in <module>()
5 random_phrase(phrasedata) # returns 'phraselist'
6 word_token(phraselist) # returns 'phrasedocs'
----> 7 process_token(phrasedocs) # returns phrasedocs2
8
9
NameError: name 'phrasedocs' is not defined
You defined "phrasedocs" inside a function, so it isn't visible outside that function, and each function's return value needs to be captured in a variable. Edit your code:
dirPath = 'C:/Users/J/kagglemoviereviews/corpus'
phrasedata = processkaggle(dirPath, 5000) # returns 'phrasedata'
phraselist = random_phrase(phrasedata) # returns 'phraselist'
phrasedocs = word_token(phraselist) # returns 'phrasedocs'
phrasedocs2 = process_token(phrasedocs) # returns phrasedocs2
You have only created the variable phrasedocs inside a function, so the variable is not defined for any of your code outside that function. When you pass it as an input to the next function, Python can't find any variable with that name. You must create a variable called phrasedocs in your main code.
Below is a method that I have tried coding out. However, on line 3 of the code I get an AttributeError saying that a 'WordListCorpusReader' object has no attribute 'word'. Please help me take a look at the code below:
'''step 3. conduct preprocessing steps'''
# setting up the resources for the preprocessing steps
stop = set(stopwords.word('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = ''.join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join([ch for ch in stop_free if ch not in exclude])
    normalized = ''.join(wn.lemma.lemmatize(word) for word in punc_free.split())
    return normalized
doc_clean = [clean(doc).split() for doc in corpus]
'''step 4. prepare word representation'''
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
'''step 5. create lda model'''
topic_num = 5
word_num = 5
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(doc_term_matrix, num_topics=topic_num, id2word=dictionary, passes=20)
pprint(ldamodel.print_topics(num_topics=topic_num, num_words=word_num))
This is the traceback after running the code:
Traceback (most recent call last):
File "C:/Users/user/PycharmProjects/topicmodel/topicmodel.py", line 41, in <module>
stop = set(stopwords.word('english'))
File "C:\Users\user\AppData\Roaming\Python\Python37\site-packages\nltk\corpus\util.py", line 119, in __getattr__
return getattr(self, attr)
AttributeError: 'WordListCorpusReader' object has no attribute 'word'
It's a typo. The method you should be calling is stopwords.words(). Change that:
stop = set(stopwords.word('english'))
into
stop = set(stopwords.words('english'))
and that should fix this issue.
More information on the NLTK documentation page:
https://www.nltk.org/api/nltk.corpus.html?highlight=corpus#module-nltk.corpus
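As a quick sanity check after the change, something like the following (assuming the stopwords corpus has already been downloaded with nltk.download('stopwords')) should print common English words rather than raise an AttributeError:

from nltk.corpus import stopwords

english_stops = set(stopwords.words('english'))
print(len(english_stops))      # a couple of hundred entries
print('the' in english_stops)  # True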
I am using Python's regex with an if-statement: if the match is None, then it should go to the else clause. But it shows this error:
AttributeError: 'NoneType' object has no attribute 'group'
The script is:
import string
chars = re.escape(string.punctuation)
sub = 'FW: Re: 29699'
if re.search("^FW: (\w{10})", sub).group(1) is not None:
    d = re.search("^FW: (\w{10})", sub).group(1)
else:
    a = re.sub(r'['+chars+']', ' ', sub)
    d = '_'.join(a.split())
Any help is greatly appreciated!
Your problem is this: if your search doesn't find anything, it returns None, and you can't call None.group(1), which is what your code amounts to. Instead, check whether the search result itself is None, not whether its first group is None. Storing the result of re.search in a variable also avoids running the same regex twice.
import re
import string
chars = re.escape(string.punctuation)
sub='FW: Re: 29699'
search_result = re.search(r"^FW: (\w{10})", sub)
if search_result is not None:
    d = search_result.group(1)
else:
    a = re.sub(r'['+chars+']', ' ', sub)
    d = '_'.join(a.split())
print(d)
# FW_Re_29699
I am building an algorithm for sentiment analysis that should segment a .txt corpus, but there is a problem in the code that I don't know how to resolve:
class Splitter(object):
    def _init_(self):
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english/pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """input format: a .txt file
        output format: a list of lists of words,
        e.g. [['this', 'is'], ['life', 'worth', 'living']]"""
        sentences = self.nltk_splitter.tokenize(text)
        tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
        return tokenized_sentences
and then I did the following things
>>> f = open('amazonshoes.txt')
>>> raw = f.read()
>>> text = nltk.Text(raw)
>>> splitter = Splitter()
>>> splitted_sentences = splitter.split(text)
and the error is
Traceback (most recent call last):
File "<pyshell#21>", line 1, in <module>
splitted_sentences = splitter.split(text)
File "<pyshell#14>", line 9, in split
sentences = self.nltk_splitter.tokenize(text)
AttributeError: 'Splitter' object has no attribute 'nltk_splitter'
The constructor of the class Splitter should be called __init__, with two leading and two trailing underscores.
Currently the _init_ method (single underscores) is never executed, so the Splitter object you create (by calling Splitter()) never acquires the attribute/field nltk_splitter.
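For reference, a minimal corrected sketch might look like this (assuming nltk is imported, and that the punkt resource you meant is 'tokenizers/punkt/english.pickle', the usual NLTK name, rather than 'english/pickle'):

import nltk

class Splitter(object):
    def __init__(self):  # double underscores, so Python runs this as the constructor
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """Split raw text into a list of lists of word tokens."""
        sentences = self.nltk_splitter.tokenize(text)
        return [self.nltk_tokenizer.tokenize(sent) for sent in sentences]

splitter = Splitter()
splitted_sentences = splitter.split(raw)

Passing the raw string from f.read() to split(), rather than wrapping it in nltk.Text first, is probably what you want here, since the punkt sentence tokenizer expects plain text.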
I have read some tutorials about highlighting search terms in Lucene, and came up with a piece of code like this:
(...)
query = parser.parse(query_string)
for scoreDoc in searcher.search(query, 50).scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    filename = doc.get("filename")
    print filename
    found_paraghaph = fetch_from_my_text_library(filename)
    stream = lucene.TokenSources.getTokenStream("contents", found_paraghaph, analyzer)
    scorer = lucene.Scorer(query, "contents", lucene.CachingTokenFilter(stream))
    highligter = lucene.Highligter(scorer)
    fragment = highligter.getBestFragment(analyzer, "contents", found_paraghaph)
    print '>>>' + fragment
But it all ends with an error:
Traceback (most recent call last):
File "./search.py", line 76, in <module>
scorer = lucene.Scorer(query, "contents", lucene.CachingTokenFilter(stream))
NotImplementedError: ('instantiating java class', <type 'Scorer'>)
So I guess this part of Lucene isn't implemented yet in PyLucene. Is there any other way to do it?
I got a similar error too. I think this class's wrapper is not yet implemented in PyLucene v3.6.
You might want to try the following:
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
# Construct a query parser
queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)
# Create a query
query = queryParser.parse(QUERY_STRING)
topDocs = searcher.search(query, 50)
# Get top hits
scoreDocs = topDocs.scoreDocs
print "%s total matching documents." % len(scoreDocs)

HighlightFormatter = SimpleHTMLFormatter()
highlighter = Highlighter(HighlightFormatter, QueryScorer(query))

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    text = doc.get(FIELD_CONTENTS)
    ts = analyzer.tokenStream(FIELD_CONTENTS, StringReader(text))
    print doc.get(FIELD_PATH)
    print highlighter.getBestFragments(ts, text, 3, "...")
    print ""
Note that we create a token stream for each item in the search result.