Object has no attribute 'update' - Python

I am trying to use the code that is on this link... see example 6.
So this is the code:
import json
import nltk
import numpy

BLOG_DATA = "resources/ch05-webpages/feed.json"

N = 100  # Number of words to consider
CLUSTER_THRESHOLD = 5  # Distance between words to consider
TOP_SENTENCES = 5  # Number of sentences to return for a "top n" summary

# Approach taken from "The Automatic Creation of Literature Abstracts" by H.P. Luhn
def _score_sentences(sentences, important_words):
    scores = []
    sentence_idx = -1

    for s in [nltk.tokenize.word_tokenize(s) for s in sentences]:
        sentence_idx += 1
        word_idx = []

        # For each word in the word list...
        for w in important_words:
            try:
                # Compute an index for where any important words occur in the sentence.
                word_idx.append(s.index(w))
            except ValueError, e:  # w not in this particular sentence
                pass

        word_idx.sort()

        # It is possible that some sentences may not contain any important words at all.
        if len(word_idx) == 0:
            continue

        # Using the word index, compute clusters by using a max distance threshold
        # for any two consecutive words.
        clusters = []
        cluster = [word_idx[0]]
        i = 1
        while i < len(word_idx):
            if word_idx[i] - word_idx[i - 1] < CLUSTER_THRESHOLD:
                cluster.append(word_idx[i])
            else:
                clusters.append(cluster[:])
                cluster = [word_idx[i]]
            i += 1
        clusters.append(cluster)

        # Score each cluster. The max score for any given cluster is the score
        # for the sentence.
        max_cluster_score = 0
        for c in clusters:
            significant_words_in_cluster = len(c)
            total_words_in_cluster = c[-1] - c[0] + 1
            score = 1.0 * significant_words_in_cluster \
                * significant_words_in_cluster / total_words_in_cluster

            if score > max_cluster_score:
                max_cluster_score = score

        scores.append((sentence_idx, score))

    return scores

def summarize(txt):
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower() for sentence in normalized_sentences for w in
             nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    top_n_words = [w[0] for w in fdist.items()
                   if w[0] not in nltk.corpus.stopwords.words('english')][:N]

    scored_sentences = _score_sentences(normalized_sentences, top_n_words)

    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Decorate the post object with summaries
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])

blog_data = json.loads(open(BLOG_DATA).read())

for post in blog_data:
    post.update(summarize(post['content']))

    print post['title']
    print '=' * len(post['title'])
    print
    print 'Top N Summary'
    print '-------------'
    print ' '.join(post['top_n_summary'])
    print
    print 'Mean Scored Summary'
    print '-------------------'
    print ' '.join(post['mean_scored_summary'])
    print
But when I run it, it says:
Traceback (most recent call last):
File "/home/jetonp/PycharmProjects/Summeriza/blogs_and_nlp__summarize.py", line 117, in <module>
post.update(summarize(post['content']))
AttributeError: 'unicode' object has no attribute 'update'
Process finished with exit code 1
What is causing this error and how do I fix it?

I figured it out. In the example that you are working from, the summarize method returns a dictionary. Your summarize method does not return anything, because of improper indentation: for part of it there are just three spaces, and for part of it there are no spaces. The standard indentation in Python is four spaces. summarize should look like this:
def summarize(txt):
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower() for sentence in normalized_sentences for w in
             nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    top_n_words = [w[0] for w in fdist.items()
                   if w[0] not in nltk.corpus.stopwords.words('english')][:N]

    scored_sentences = _score_sentences(normalized_sentences, top_n_words)

    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Decorate the post object with summaries
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])
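With summarize returning the dictionary, the decoration step from the original loop works again. A minimal sketch (the post fields here are illustrative, not taken from feed.json):
post = {'title': 'Example post',
        'content': 'First sentence of the post. Second sentence of the post.'}
post.update(summarize(post['content']))  # dict.update needs a dict, which summarize now returns
print post.keys()  # now also contains 'top_n_summary' and 'mean_scored_summary'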

Related

How can I iterate over a file looking for keywords defined within a list

I have a defined list of keywords and a text file. I would like to search the text file and count how many times each of the keywords in my list appears. Example:
kw = ['max speed', 'time', 'distance', 'travel', 'down', 'up']

with open("file.txt", "r") as f:
    data_file = f.read()

d = dict()
for line in data_file:
    line = line.strip()
    line = line.lower()
    words = line.split(" ")
    for word in words:
        if word in d:
            d[word] = d[word] + 1
        else:
            d[word] = 1

for key in list(d.keys()):
    print(key, ":", d[key])
Now let's say we run the code: it should search file.txt and loop through the list. If it finds a keyword from the list, it should print that word and how many times it was found. If a word is not found, it is not reported.
Example output:
Keywords Found:
max speed: 3
travel: 7
distance: 3
I can't quite get this to work like I want. Any feedback would be great! Thank you in advance!
There are several algorithms you can use; there are special algorithms for finding specific words in texts. The easiest one is the naive algorithm. Here is code that I wrote:
def naive_string_matching(text, pattern):
    txt_len, pat_len = len(text), len(pattern)
    result = []
    for s in range(txt_len - pat_len + 1):
        if pattern == text[s:s+pat_len]:
            result.append(s)
    return result
This naive algorithm takes as input a text and a single word (the pattern) to search for. The complexity of this algorithm is O((n − m + 1)m), where m is the length of the pattern and n is the length of the text.
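For example, called on a small illustrative text, it returns the start index of every occurrence, so the keyword count is just the length of the returned list:
matches = naive_string_matching("time after time", "time")
print(matches)       # [0, 11]
print(len(matches))  # 2 occurrences of "time"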
The next algorithm you can use, which has better complexity than the naive one, is the finite automaton algorithm. Here you can read more about it if you are interested. This is my implementation of it:
def transition_table(pattern):
    alphabet = set(pattern)
    ptt_len = len(pattern)
    result = []
    for q in range(ptt_len + 1):
        result.append({})
        for l in alphabet:
            k = min(len(pattern), q + 1)
            while True:
                if k == 0 or pattern[:k] == (pattern[:q] + l)[-k:]:
                    break
                k -= 1
            result[q][l] = k
    return result

def fa_string_matching(text, pattern):
    q = 0
    delta = transition_table(pattern)
    txt_len = len(text)
    result = []
    for s in range(txt_len):
        if text[s] in delta[q]:
            q = delta[q][text[s]]
            if q == len(delta) - 1:
                result.append(s + 1 - q)
        else:
            q = 0
    return result
The matching phase of this algorithm is O(n), where n is the length of the text, but the pre-processing (the transition_table function) is more expensive, since it builds a transition for each of the m + 1 states and each alphabet symbol, m being the length of the pattern.
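A quick illustrative check of the automaton (the text and pattern here are made up for the example):
delta = transition_table("abc")             # one transition dict per state 0..3
print(fa_string_matching("abcabc", "abc"))  # [0, 3]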
The last algorithm I can propose is the KMP (Knuth–Morris–Pratt) algorithm, which is the fastest of the three. Again, my implementation of it:
def prefix_function(pattern):
    pat_len = len(pattern)
    pi = [0]
    k = 0
    for q in range(1, pat_len):
        while k > 0 and pattern[k] != pattern[q]:
            k = pi[k-1]
        if pattern[k] == pattern[q]:
            k += 1
        pi.append(k)
    return pi

def kmp_string_matching(text, pattern):
    txt_len, pat_len = len(text), len(pattern)
    pi = prefix_function(pattern)
    q = 0
    result = []
    for i in range(txt_len):
        while q > 0 and pattern[q] != text[i]:
            q = pi[q-1]
        if pattern[q] == text[i]:
            q += 1
        if q == pat_len:
            result.append(i - q + 1)
            q = pi[q-1]
    return result
As input it takes the full text and the pattern you are looking for. The complexity of the KMP algorithm is similar to the finite automaton algorithm, O(n) for matching, but its pre-processing (prefix_function) is faster, taking O(m).
If you are interested in topics like pattern matching or finding the occurrences of a pattern in a text, I highly recommend becoming acquainted with all of them.
To open a file you can simply run:
with open(file_name) as file:
    text = file.read()
    result = naive_string_matching(text, pattern)
where file_name is the name of your file and pattern is the phrase that you want to search for in the text. To search for every pattern in a list you can try:
example_patterns = ['max speed', 'time', 'distance', 'travel', 'down', 'up']

with open(file_name) as file:
    text = file.read()
    for pattern in example_patterns:
        result = kmp_string_matching(text, pattern)
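Each matcher returns the list of match positions, so the per-keyword counts the question asks for are just the lengths of those lists, for example:
counts = {pattern: len(kmp_string_matching(text, pattern))
          for pattern in example_patterns}
print(counts)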
import re

keywords = ['max speed', 'time', 'distance', 'travel', 'down', 'up']
keywords = [x.replace(' ', r'\s') for x in keywords]  # replaces spaces with whitespace indicator

with open('file.txt', 'r') as file:
    data = file.read()

keywords_found = {}
for key in keywords:
    found = re.findall(key, data, re.I)  # re.I means it'll ignore case.
    if found:
        keywords_found[key] = len(found)

print(keywords_found)
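If you also want the output in the exact format shown in the question, a small follow-up sketch (undoing the \s substitution for display):
print('Keywords Found:')
for key, count in keywords_found.items():
    print('{}: {}'.format(key.replace(r'\s', ' '), count))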

Python most frequent sentences in large text

My goal is to extract the x most frequent sentences that include the word y. My solution right now works this way:
Extract all sentences that contains the word y
Count the most frequent words in these sentences and store them in a list
Extract sentences that includes z amount of words from the list
In order to get x sentences then I simply increase or decrease z
So I get the most frequent sentences by looking for the most frequent words. This method works on a small amount of data, but it takes forever on a larger amount.
EDIT - code
from collections import Counter

# Extract all sentences from data containing the word
def getSentences(word):
    sentences = []
    for x in data_lemmatized:
        if word in x:
            sentences.append(x)
    return sentences

# Get the most frequent words from all the sentences
def getSentenceWords(sentences):
    cnt = Counter()
    for x in sentences:
        for y in x:
            cnt[y] += 1
    words = []
    for x, y in cnt.most_common(30):
        if x not in exclude and x != ' ':
            words.append(x)
    return words

# Get sentences that contain as many words as possible
def countWordshelp(allSentences, words, amountWords):
    tempList = []
    for sentence in allSentences:
        temp = len(words[:amountWords])
        count = 0
        for word in words[:amountWords]:
            if word in sentence:
                count += 1
        if count == temp:
            tempList.append(sentence)
    return tempList

def countWords(allSentences, words, nrSentences):
    tempList = []
    prevList = []
    amountWords = 1
    tempList = countWordshelp(allSentences, words, amountWords)
    while len(tempList) > nrSentences:
        amountWords += 1
        newAllSentences = tempList
        prevList = tempList
        tempList = countWordshelp(newAllSentences, words, amountWords)
        if len(tempList) < nrSentences:
            return prevList[:nrSentences]
    return tempList

if __name__ == '__main__':
    for x in terms:
        for y in x:
            allSentences = getSentences(y)
            words = getSentenceWords(allSentences)
            test = countWords(allSentences, words, nrSentences)
            allTest.append(test)
terms is a list of lists, each containing 10 words; data_lemmatized is the large data in lemmatized form.

Parallelize nested for loops over pandas rows

I have Python code similar to this:
import numpy as np
import pandas as pd

df = pd.read_csv('genelist.csv', names=['ID', 'Tag'])
matrix = np.zeros((df.shape[0], df.shape[0]))
for element in range(df.shape[0]):
    for Element in range(element, df.shape[0]):
        matrix[element, Element] = distance(df['Tag'].iloc[element], df['Tag'].iloc[Element])
where distance() is a complex function that calculates the distance between two tags.
My dataframe is 8000 rows long and I need to speed up the process:
I started my script yesterday evening and after 12 hours it still wasn't finished.
I was looking at the multiprocessing library, but I do not know if it can be used here since I have never used it.
The distance function looks like this:
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn

def penn_to_wn(tag):
    """
    Convert a Penn Treebank tag to a simplified WordNet tag
    """
    if tag[0] == 'N':
        return 'n'
    if tag.startswith('V'):
        return 'v'
    if tag.startswith('J'):
        return 'a'
    if tag.startswith('R'):
        return 'r'
    return None

def tagged_to_synset(word, tag):
    """
    Returns the synset of a (word, WordNet tag) pair
    """
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None
    try:
        return wn.synsets(word, wn_tag)[0]
    except:
        return None

def NS_sentence_similarity(sentence1, sentence2):
    """
    Compute the sentence similarity using WordNet
    Distance(A, A) == 1,
    Distance(A, B) != Distance(B, A)
    """
    # Tokenize and tag
    sentence1 = pos_tag(word_tokenize(sentence1))
    sentence2 = pos_tag(sentence2.split())

    # Get the synsets for the tagged words
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2]

    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]

    score, count = 0.0, 0

    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        try:
            best_score = max([synset.path_similarity(ss) for ss in synsets2])
        except:
            best_score = 0
        #print(best_score)
        # Check that the similarity could have been computed
        if best_score is not None:
            score += best_score
            count += 1

    # Average the values
    if count != 0:
        score /= count
    return score

def distance(sentence1, sentence2):
    """
    Compute the sentence similarity using WordNet
    Distance(A, A) == 1,
    Distance(A, B) == Distance(B, A)
    """
    return (NS_sentence_similarity(sentence1, sentence2) +
            NS_sentence_similarity(sentence2, sentence1)) / 2
Do you know any approach that could work?
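Since the question already mentions the multiprocessing library, here is a minimal sketch of how the outer loop could be handed to a Pool of worker processes. It assumes the distance function and the tag list are defined at module level (so worker processes can see them when the module is imported) and keeps the same upper-triangular pairing as the loop above; it is a sketch, not a drop-in replacement:
import numpy as np
import pandas as pd
from multiprocessing import Pool

df = pd.read_csv('genelist.csv', names=['ID', 'Tag'])
tags = df['Tag'].tolist()  # a plain list is cheaper to index than .iloc

def row_distances(i):
    # One row of the upper triangle: distances from tag i to tags i..N-1.
    return i, [distance(tags[i], tags[j]) for j in range(i, len(tags))]

if __name__ == '__main__':
    matrix = np.zeros((len(tags), len(tags)))
    with Pool() as pool:  # one worker per CPU core by default
        for i, row in pool.imap_unordered(row_distances, range(len(tags))):
            matrix[i, i:] = row
Each worker computes a whole row of the matrix, which keeps the per-task overhead small relative to the work done; the rows are written back into matrix in the parent process.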

natural language corpus string to int

Take a sample of sentences from each of the corpus1, corpus2 and corpus3 corpora and display the average length (measured as the number of characters in the sentence).
So I have 3 corpora, and sample_raw_sents is a defined function that returns random sentences:
tcr = corpus1()
rcr = corpus2()
mcr = corpus3()

sample_size = 50

for sentence in tcr.sample_raw_sents(sample_size):
    print(len(sentence))
for sentence in rcr.sample_raw_sents(sample_size):
    print(len(sentence))
for sentence in mcr.sample_raw_sents(sample_size):
    print(len(sentence))
Using this code all the lengths are printed, but how do I sum() these lengths?
Use zip; it allows you to draw a sentence from each corpus all at once.
tcr = corpus1()
rcr = corpus2()
mcr = corpus3()

sample_size = 50

zipped = zip(tcr.sample_raw_sents(sample_size),
             rcr.sample_raw_sents(sample_size),
             mcr.sample_raw_sents(sample_size))

for s1, s2, s3 in zipped:
    summed = len(s1) + len(s2) + len(s3)
    average = summed / 3
    print(summed, average)
You could store all the sentence lengths in a list and then sum them up.
tcr = corpus1()
rcr = corpus2()
mcr = corpus3()

sample_size = 50

lengths = []
for sentence in tcr.sample_raw_sents(sample_size):
    lengths.append(len(sentence))
for sentence in rcr.sample_raw_sents(sample_size):
    lengths.append(len(sentence))
for sentence in mcr.sample_raw_sents(sample_size):
    lengths.append(len(sentence))

print(sum(lengths) / len(lengths))
tcr = corpus1()
rcr = corpus2()
mcr = corpus3()

sample_size = 50

s = 0
for sentence in tcr.sample_raw_sents(sample_size):
    s = s + len(sentence)
for sentence in rcr.sample_raw_sents(sample_size):
    s = s + len(sentence)
for sentence in mcr.sample_raw_sents(sample_size):
    s = s + len(sentence)

average = s / 150  # 3 corpora * 50 sentences each
print('average: {}'.format(average))
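For reference, the same total and average can be collected in a single comprehension over all three samples (a sketch assuming the same corpus objects and sample_raw_sents method as above):
lengths = [len(sentence)
           for corpus in (tcr, rcr, mcr)
           for sentence in corpus.sample_raw_sents(sample_size)]
print(sum(lengths), sum(lengths) / len(lengths))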

Write a function that returns the average of a list of numbers

I am trying to write a function that reads a txt document containing names with corresponding scores (floats), e.g. Li Ning, 9.8 9.7 9.6 9.3 9.4 9.8. Each competitor's name and corresponding numbers are on their own line in the document.
The function should return the highest average score and the corresponding name. The average score of a contestant is computed after discarding their best and worst scores.
def getWinner(filename):
    results = open(filename)
    results.readlines()
    winner = ''
    max_score = 0
    for line in results:
        tokens = line.split(',')  # split line using ',' separator
        name = tokens[0]          # get the name
        scores = sorted(map(float, tokens[1].split()))
        ave = [sum(scores[i]) for i in range(1, len(scores)-1)]/(len(scores)-2)
        if ave > max_score:
            winner = name
            max_score = ave
    return "%s [%d]" % (winner, max_score)
The error I am receiving is 'Cannot iterate float'. I am assuming this comes from my calculation of the average. Why is this?
Thanks for all the useful input! My updated code:
def getWinner(filename):
    results = open(filename).readlines()
    winner = ''
    max_score = 0
    for line in results:
        tokens = line.split(',')
        name = tokens[0]
        scores = map(float, tokens[1].split())
        ave = sum(scores[1:-1])/(len(scores) - 2)
        if ave > max_score:
            winner = name
            max_score = ave
    return "%s [%.1f]" % (winner, max_score)
The site I am learning Python through (Pyschools) returns the correct result, but states 'Private test Failed'. Can anyone see any outstanding errors?
Thanks again.
I think your problem is in the average computation. You're doing:
ave = [sum(scores[i])for i in range(1, len(scores)-1)]/(len(scores)-2)
The bracket notation is used for list comprehensions. result = [ f(x) for x in iterable ] is equivalent to:
result = []
for x in iterable:
    result.append(f(x))
So in your case, your code translates to:
ave = []
for x in range(1, len(scores) - 1):
    ave.append(sum(scores[x]))
So that is not quite what you're looking for. Also, calling the sum function on a single number will fail, since sum expects an iterable. And then, afterwards, you're dividing the list by an integer, and Python should (also) fail at this point.
So I suggest the following average computation method:
ave = sum(scores[1:-1])/(len(scores) - 2)
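Note that, as in the original code, the scores have to be sorted before slicing off the best and worst. With the example scores from the question this works out as follows:
scores = sorted([9.8, 9.7, 9.6, 9.3, 9.4, 9.8])  # [9.3, 9.4, 9.6, 9.7, 9.8, 9.8]
ave = sum(scores[1:-1]) / (len(scores) - 2)      # (9.4 + 9.6 + 9.7 + 9.8) / 4
print(ave)                                       # 9.625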
