Parallelize nested for loops over pandas rows - python

I have a python code similar to this:
import numpy as np
import pandas as pd

df = pd.read_csv('genelist.csv', names=['ID', 'Tag'])
matrix = np.zeros((df.shape[0], df.shape[0]))
for element in range(df.shape[0]):
    for Element in range(element, df.shape[0]):
        matrix[element, Element] = distance(df['Tag'].iloc[element], df['Tag'].iloc[Element])
Where distance() is a complex function that calculates the distance between two tags.
My dataframe is 8000 rows long and I need to speed up the process.
I started my script yesterday evening and after 12 hours it wasn't finished.
I have been looking at the multiprocessing library, but I do not know whether it can help here since I have never used it.
The distance function looks like this:
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn

def penn_to_wn(tag):
    """
    Convert a Penn Treebank tag to a simplified WordNet tag
    """
    if tag[0] == 'N':
        return 'n'
    if tag.startswith('V'):
        return 'v'
    if tag.startswith('J'):
        return 'a'
    if tag.startswith('R'):
        return 'r'
    return None

def tagged_to_synset(word, tag):
    """
    Return the first synset for a (word, WordNet tag) pair
    """
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None
    try:
        return wn.synsets(word, wn_tag)[0]
    except:
        return None
def NS_sentence_similarity(sentence1, sentence2):
    """
    Compute the sentence similarity using WordNet
    Distance(A, A) == 1,
    Distance(A, B) != Distance(B, A)
    """
    # Tokenize and tag
    sentence1 = pos_tag(word_tokenize(sentence1))
    sentence2 = pos_tag(sentence2.split())
    # Get the synsets for the tagged words
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2]
    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]
    score, count = 0.0, 0
    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        try:
            best_score = max([synset.path_similarity(ss) for ss in synsets2])
        except:
            best_score = 0
        # print(best_score)
        # Check that the similarity could have been computed
        if best_score is not None:
            score += best_score
            count += 1
    # Average the values
    if count != 0:
        score /= count
    return score
def distance(sentence1, sentence2):
    """
    Compute the sentence similarity using WordNet
    Distance(A, A) == 1,
    Distance(A, B) == Distance(B, A)
    """
    return (NS_sentence_similarity(sentence1, sentence2) +
            NS_sentence_similarity(sentence2, sentence1)) / 2
Do you know any approach that could work?
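Since the question already mentions multiprocessing, here is a minimal sketch of how the row loop could be handed to a multiprocessing.Pool. It assumes distance() and its NLTK imports are defined at module level in the same script (so worker processes can see them) and that the CSV read and the loop are moved under the __main__ guard, as below; it is a starting point, not a tested drop-in solution:

from multiprocessing import Pool

import numpy as np
import pandas as pd

def init_worker(tag_list):
    # Make the tag list available as a global inside each worker process.
    global tags
    tags = tag_list

def row_distances(element):
    # Compute one row of the upper triangle: distances from tag `element`
    # to every tag with an index >= element.
    return element, [distance(tags[element], tags[other])
                     for other in range(element, len(tags))]

if __name__ == '__main__':
    df = pd.read_csv('genelist.csv', names=['ID', 'Tag'])
    tags = df['Tag'].tolist()
    matrix = np.zeros((len(tags), len(tags)))
    with Pool(initializer=init_worker, initargs=(tags,)) as pool:  # one worker per core by default
        for element, row in pool.imap_unordered(row_distances, range(len(tags)), chunksize=16):
            matrix[element, element:] = row

Each task computes one row of the upper triangle, so only an index goes in and a list of floats comes back. With N cores this should cut the wall-clock time roughly by a factor of N, although each individual distance() call stays as slow as before.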

Related

Why does my while loop calculate incorrect value of the string?

I am trying to find the highest-scoring word in a string and return its score, where each letter is worth its rank in the alphabet. For example, for the string s = 'abcd a' I intend to return 10 (a=1 + b=2 + c=3 + d=4), but I am getting 7 as output. When I debugged the code, I noticed that in the while loop my code skips i=2 and jumps directly to i=3. Where am I going wrong? Below is my code.
class Solution(object):
    def highest_scoring_word(self, s):
        # Dictionary of English letters
        dt = {'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,
              'g':7,'h':8,'i':9,'j':10,'k':11,'l':12,
              'm':13,'n':14,'o':15,'p':16,'q':17,
              'r':18,'s':19,'t':20,'u':21,'v':22,
              'w':23,'x':24,'y':25,'z':26}
        value_sum = 0
        max_value = value_sum
        for i in range(0, len(s)):
            if s.upper():
                s = s.lower()
            words = s.split()
            # convert the string in char array
            to_char_array = list(words[i])
            j = 0
            while j < len(to_char_array):
                if to_char_array[j] in dt.keys():
                    value_sum = max(dt.get(to_char_array[j]), value_sum + dt.get(to_char_array[j]))
                    max_value = max(value_sum, max_value)
                else:
                    pass
                j += j + 1
            return max_value

if __name__ == '__main__':
    p = 'abcd a'
    print(Solution().highest_scoring_word(p))
I have created a dictionary that maps every letter of the English alphabet to its value. I split the string into words using split(), convert each word into a character array, and traverse it, looking each character up in the dictionary and adding its value to the running total. I expect to get the correct score for each word and, finally, the greatest score.
As you are using a class and methods, make use of them:
from string import ascii_lowercase as dt

class Solution(object):
    def __init__(self, data):
        self.scores = {}
        self.words = data.lower().strip().split()

    def get_scoring(self):
        # for each word calculate the score
        for word in self.words:
            score = 0
            # for each character in the word, find its index in 'a..z' and add it to score
            # same as in your dt implementation (just using the index instead of absolute values)
            for c in word:
                score += dt.find(c) + 1
            self.scores[word] = score
        print(self.scores)
        # filter the dictionary by its greatest value in order to get the word with the max score:
        return max(self.scores.keys(), key=lambda k: self.scores[k])

if __name__ == '__main__':
    p = 'abcd fg11'
    maxWord = Solution(p).get_scoring()
    print(maxWord)
Out:
{'abcd': 10, 'fg11': 13}
fg11
Try using this:
class Solution(object):
    def highest_scoring_word(self, s):
        # Dictionary of English letters
        dt = {'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,
              'g':7,'h':8,'i':9,'j':10,'k':11,'l':12,
              'm':13,'n':14,'o':15,'p':16,'q':17,
              'r':18,'s':19,'t':20,'u':21,'v':22,
              'w':23,'x':24,'y':25,'z':26}
        value_sum1 = 0
        max_value1 = value_sum1
        value_sum2 = 0
        max_value2 = value_sum2
        for i in range(0, len(s)):
            if s.upper():
                s = s.lower()
            words = s.split()
            if len(words) > 1:
                # convert the string in char array
                to_char_array = list(words[0])
                j = 0
                while j < len(to_char_array):
                    if to_char_array[j] in dt.keys():
                        value_sum1 = max(dt.get(to_char_array[j]), value_sum1 + dt.get(to_char_array[j]))
                        max_value1 = max(value_sum1, max_value1)
                    else:
                        pass
                    j = j + 1
                to_char_array = list(words[1])
                j = 0
                while j < len(to_char_array):
                    if to_char_array[j] in dt.keys():
                        value_sum2 = max(dt.get(to_char_array[j]), value_sum2 + dt.get(to_char_array[j]))
                        max_value2 = max(value_sum2, max_value2)
                    else:
                        pass
                    j = j + 1
                if max_value2 > max_value1:
                    return max_value2
                elif max_value1 > max_value2:
                    return max_value1
                else:
                    return 'Both words have equal score'
            else:
                # convert the string in char array
                to_char_array = list(words[i])
                j = 0
                while j < len(to_char_array):
                    if to_char_array[j] in dt.keys():
                        value_sum1 = max(dt.get(to_char_array[j]), value_sum1 + dt.get(to_char_array[j]))
                        max_value1 = max(value_sum1, max_value1)
                    else:
                        pass
                    j = j + 1
                return max_value1

if __name__ == '__main__':
    p = 'abcd fg'
    print(Solution().highest_scoring_word(p))
It may be of interest that the code can be greatly simplified by using features built into Python:
the_sum = sum(ord(c)-96 for c in s.lower() if c.isalpha())
To break this down: for c in s.lower() yields the lower-case characters one by one; the function ord() gives the character's numeric code, with 'a' being 97, so we subtract 96 to get 1. The isalpha() check keeps only letters, and sum() adds up all the values. You could break up this one-liner and check how the separate parts work.
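Wrapping that one-liner in a small helper (the function name here is just for illustration) gives the behaviour the question is after, namely the greatest word score in the string:

def highest_score(s):
    # Score each word by summing letter values (a=1 ... z=26), ignoring non-letters,
    # and return the greatest score found across all words.
    def score(word):
        return sum(ord(c) - 96 for c in word.lower() if c.isalpha())
    return max(score(word) for word in s.split())

print(highest_score('abcd a'))   # 10  (a+b+c+d = 1+2+3+4)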

Python most frequent sentences in large text

My goal is to extract the x most frequent sentences that include the word y. My solution right now works this way:
Extract all sentences that contains the word y
Count the most frequent words in these sentences and store them in a list
Extract sentences that includes z amount of words from the list
In order to get x sentences then I simply increase or decrease z
So I get the most frequent sentences by looking for the most frequent words. This method works on small amounts of data but takes forever on larger amounts.
EDIT - code
from collections import Counter

# terms, data_lemmatized, exclude, nrSentences and allTest come from the surrounding script
# (see the note below the code).

# Extract all sentences from data containing the word
def getSentences(word):
    sentences = []
    for x in data_lemmatized:
        if word in x:
            sentences.append(x)
    return sentences

# Get the most frequent words from all the sentences
def getSentenceWords(sentences):
    cnt = Counter()
    for x in sentences:
        for y in x:
            cnt[y] += 1
    words = []
    for x, y in cnt.most_common(30):
        if x not in exclude and x != ' ':
            words.append(x)
    return words

# Get sentences that contain as many of the words as possible
def countWordshelp(allSentences, words, amountWords):
    tempList = []
    for sentence in allSentences:
        temp = len(words[:amountWords])
        count = 0
        for word in words[:amountWords]:
            if word in sentence:
                count += 1
        if count == temp:
            tempList.append(sentence)
    return tempList

def countWords(allSentences, words, nrSentences):
    tempList = []
    prevList = []
    amountWords = 1
    tempList = countWordshelp(allSentences, words, amountWords)
    while len(tempList) > nrSentences:
        amountWords += 1
        newAllSentences = tempList
        prevList = tempList
        tempList = countWordshelp(newAllSentences, words, amountWords)
        if len(tempList) < nrSentences:
            return prevList[:nrSentences]
    return tempList

if __name__ == '__main__':
    for x in terms:
        for y in x:
            allSentences = getSentences(y)
            words = getSentenceWords(allSentences)
            test = countWords(allSentences, words, nrSentences)
            allTest.append(test)
terms is a list of lists, each containing 10 words; data_lemmatized is the large dataset in lemmatized form.

Cosine similarity of a new text document with existing list of documents

I have a dataframe of 1000 text documents with corresponding keywords. I want to extract keywords for a new document by finding the document in the list that is most similar and taking its keywords.
First save your csv to a dataframe df and use the below functions for cosine similarity calculation.
import math
import re
from collections import Counter

def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    word = re.compile(r'\w+')
    words = word.findall(text)
    return Counter(words)

def get_result(content_a, content_b):
    text1 = content_a
    text2 = content_b
    vector1 = text_to_vector(text1)
    vector2 = text_to_vector(text2)
    cosine_result = get_cosine(vector1, vector2)
    return cosine_result
Then iterate over the df and invoke the functions as below:
similarity = []
for ind in df.index:
    # my_doc = "new document should go in here"
    # prev_doc = "previous document for each index should go in here"
    cos = get_result(my_doc, prev_doc)
    similarity.append(cos)
max_ind = similarity.index(max(similarity))
You will get the index position of the most similar document.
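To close the loop on the original question, here is a hedged sketch of how that loop might be wired up end to end. The column names 'text' and 'keywords' are assumptions made for illustration, not from the original post; substitute whatever columns your dataframe actually has:

my_doc = "text of the new document goes here"

# NOTE: 'text' and 'keywords' are assumed column names.
similarity = []
for ind in df.index:
    prev_doc = df.loc[ind, 'text']           # existing document at this index
    similarity.append(get_result(my_doc, prev_doc))

max_ind = similarity.index(max(similarity))  # position of the most similar document
suggested_keywords = df['keywords'].iloc[max_ind]
print(suggested_keywords)

The keywords of the most similar existing document then become the suggestion for the new one.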

ordered word permutations in python

So my question is simple, and half of it is already working.
I need help with generating ordered word-permutations.
My code:
from os.path import isfile
from string import printable

def loadRuleSet(fileLocation):
    rules = {}
    assert isfile(fileLocation)
    for x in open(fileLocation).read().split('\n'):
        if not len(x) == 0:
            data = x.split(':')
            if not len(data[0]) == 0 or not len(data[1]) == 0:
                rules[data[0]] = data[1]
    return rules

class deform:
    def __init__(self, ruleSet):
        assert type(ruleSet) == dict
        self.ruleSet = ruleSet

    def walker(self, string):
        spot = []
        cnt = 0
        for x in string:
            spot.append((x, cnt))
            cnt += 1
        return spot

    def replace_exact(self, word, position, new):
        cnt = 0
        newword = ''
        for x in word:
            if cnt == position:
                newword += new
            else:
                newword += x
            cnt += 1
        return newword

    def first_iter(self, word):
        data = []
        pos = self.walker(word)
        for x in pos:
            if x[0] in self.ruleSet:
                for y in self.ruleSet[x[0]]:
                    data.append(self.replace_exact(word, x[1], y))
        return data

print deform({'a': '#A'}).first_iter('abac')
My current code does half of the job, but I've reached a "writer's block"
>>>deform({'a':'#'}).first_iter('aaa')
['#aa', 'a#a', 'aa#']
Here are the results from my current script.
What the code is supposed to do is take the word and rewrite it with the replacement characters in every combination of positions. I have successfully made it work for a single replacement, but I need help generating all the results. For example:
['#aa', 'a#a', 'aa#', '##a', 'a##', '#a#']
In your case you can use the permutations function, which returns all possible orderings with no repeated elements.
from itertools import permutations
from operator import itemgetter
perm_one = sorted(set([''.join(x) for x in permutations('#aa')]))
perm_two = sorted(set([''.join(x) for x in permutations('##a')]), key=itemgetter(1))
print perm_one + perm_two
I divided it into two collections because they differ in the number of # and a characters.
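To generalize beyond the hard-coded '#aa' / '##a' case, here is a standalone sketch (not part of the original answer) that works for any word and rule set: itertools.combinations picks which replaceable positions to change, and itertools.product tries every replacement character each position's rule offers. Note that it also emits the fully-replaced form ('###' here), which the example output in the question omits; shrink the range of n if that is not wanted.

from itertools import combinations, product

def all_iters(word, rule_set):
    # Positions whose character has at least one replacement in the rule set.
    positions = [i for i, ch in enumerate(word) if ch in rule_set]
    results = []
    # Replace 1, 2, ..., len(positions) characters at a time.
    for n in range(1, len(positions) + 1):
        for chosen in combinations(positions, n):
            # For each chosen position, try every replacement character its rule offers.
            for repl in product(*(rule_set[word[i]] for i in chosen)):
                chars = list(word)
                for i, r in zip(chosen, repl):
                    chars[i] = r
                results.append(''.join(chars))
    return results

print(all_iters('aaa', {'a': '#'}))
# ['#aa', 'a#a', 'aa#', '##a', '#a#', 'a##', '###']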

Object has no attribute 'update'

I am trying to use the code that is on this link... see example 6.
So this is the code:
import json
import nltk
import numpy

BLOG_DATA = "resources/ch05-webpages/feed.json"

N = 100  # Number of words to consider
CLUSTER_THRESHOLD = 5  # Distance between words to consider
TOP_SENTENCES = 5  # Number of sentences to return for a "top n" summary

# Approach taken from "The Automatic Creation of Literature Abstracts" by H.P. Luhn
def _score_sentences(sentences, important_words):
    scores = []
    sentence_idx = -1
    for s in [nltk.tokenize.word_tokenize(s) for s in sentences]:
        sentence_idx += 1
        word_idx = []
        # For each word in the word list...
        for w in important_words:
            try:
                # Compute an index for where any important words occur in the sentence.
                word_idx.append(s.index(w))
            except ValueError, e:  # w not in this particular sentence
                pass
        word_idx.sort()
        # It is possible that some sentences may not contain any important words at all.
        if len(word_idx) == 0:
            continue
        # Using the word index, compute clusters by using a max distance threshold
        # for any two consecutive words.
        clusters = []
        cluster = [word_idx[0]]
        i = 1
        while i < len(word_idx):
            if word_idx[i] - word_idx[i - 1] < CLUSTER_THRESHOLD:
                cluster.append(word_idx[i])
            else:
                clusters.append(cluster[:])
                cluster = [word_idx[i]]
            i += 1
        clusters.append(cluster)
        # Score each cluster. The max score for any given cluster is the score
        # for the sentence.
        max_cluster_score = 0
        for c in clusters:
            significant_words_in_cluster = len(c)
            total_words_in_cluster = c[-1] - c[0] + 1
            score = 1.0 * significant_words_in_cluster \
                * significant_words_in_cluster / total_words_in_cluster
            if score > max_cluster_score:
                max_cluster_score = score
        scores.append((sentence_idx, score))
    return scores

def summarize(txt):
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]
    words = [w.lower() for sentence in normalized_sentences for w in
             nltk.tokenize.word_tokenize(sentence)]
    fdist = nltk.FreqDist(words)
    top_n_words = [w[0] for w in fdist.items()
                   if w[0] not in nltk.corpus.stopwords.words('english')][:N]
    scored_sentences = _score_sentences(normalized_sentences, top_n_words)
    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]
    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])
    # Decorate the post object with summaries
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])

blog_data = json.loads(open(BLOG_DATA).read())

for post in blog_data:
    post.update(summarize(post['content']))
    print post['title']
    print '=' * len(post['title'])
    print
    print 'Top N Summary'
    print '-------------'
    print ' '.join(post['top_n_summary'])
    print
    print 'Mean Scored Summary'
    print '-------------------'
    print ' '.join(post['mean_scored_summary'])
    print
But when I run it, I get:
Traceback (most recent call last):
File "/home/jetonp/PycharmProjects/Summeriza/blogs_and_nlp__summarize.py", line 117, in <module>
post.update(summarize(post['content']))
AttributeError: 'unicode' object has no attribute 'update'
Process finished with exit code 1
What is causing this error and how do I fix it?
I figured it out. In the example that you are working off of, the summarize method returns a dictionary. Your summarize method does not return anything, due to improper indentation: part of it uses just three spaces, and part of it has no indentation at all. The standard indentation in Python is four spaces. summarize should look like this:
def summarize(txt):
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]
    words = [w.lower() for sentence in normalized_sentences for w in
             nltk.tokenize.word_tokenize(sentence)]
    fdist = nltk.FreqDist(words)
    top_n_words = [w[0] for w in fdist.items()
                   if w[0] not in nltk.corpus.stopwords.words('english')][:N]
    scored_sentences = _score_sentences(normalized_sentences, top_n_words)
    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]
    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])
    # Decorate the post object with summaries
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])
