How to write this remove_stopwords function faster in Python?

I have a function remove_stopwords like this. How do I make it run faster?
temp.reverse()
def drop_stopwords(text):
    for x in temp:
        if len(x.split()) > 1:
            text_list = text.split()
            for y in range(len(text_list) - len(x.split())):
                if " ".join(text_list[y:y+len(x.split())]) == x:
                    del text_list[y:y+len(x.split())]
                    text = " ".join(text_list)
        else:
            text = " ".join(text for text in text.split() if text not in vietnamese)
    return text
Processing one text from my data takes about 14 s, and with a trick like the one below the time drops to about 3 s:
temp.reverse()
def drop_stopwords(text):
    for x in temp:
        if len(x.split()) > 2:
            if x in text:
                text = text.replace(x, '')
        elif len(x.split()) > 1:
            text_list = text.split()
            for y in range(len(text_list) - len(x.split())):
                if " ".join(text_list[y:y+len(x.split())]) == x:
                    del text_list[y:y+len(x.split())]
                    text = " ".join(text_list)
        else:
            text = " ".join(text for text in text.split() if text not in vietnamese)
    return text
but I think it may give wrong results somewhere for my language. How can I rewrite this function in Python to make it faster? (In C or C++ I could solve it easily with the approach above :(( )

Your function does a lot of the same thing over and over, particularly repeated split and join of the same text. Doing a single split, operating on the list, and then doing a single join at the end might be faster, and would definitely lead to simpler code. Unfortunately I don't have any of your sample data to test the performance with, but hopefully this gives you something to experiment with:
temp = ["foo", "baz ola"]
def drop_stopwords(text):
text_list = text.split()
text_len = len(text_list)
for word in temp:
word_list = word.split()
word_len = len(word_list)
for i in range(text_len + 1 - word_len):
if text_list[i:i+word_len] == word_list:
text_list[i:i+word_len] = [None] * word_len
return ' '.join(t for t in text_list if t)
print(drop_stopwords("the quick brown foo jumped over the baz ola dog"))
# the quick brown jumped over the dog
You could also just try iteratively doing text.replace in all cases and seeing how that performs compared to your more complex split-based solution:
temp = ["foo", "baz ola"]
def drop_stopwords(text):
for word in temp:
text = text.replace(word, '')
return ' '.join(text.split())
print(drop_stopwords("the quick brown foo jumped over the baz ola dog"))
# the quick brown jumped over the dog
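One thing to watch with the replace-based version (and it may be the "gets it wrong somewhere" you are worried about): str.replace matches anywhere in the string, so it will also strip stop phrases that occur inside longer words. A hedged sketch of one way around that, assuming your stop phrases only need to match on word boundaries, is to build a single regex from the list with re.escape and \b:
import re

temp = ["foo", "baz ola"]

# One alternation of all stop phrases, longest first so multi-word phrases
# win over any single-word prefix they contain.
pattern = re.compile(
    r"\b(?:" + "|".join(re.escape(p) for p in sorted(temp, key=len, reverse=True)) + r")\b"
)

def drop_stopwords(text):
    # Delete every stop phrase in one pass, then normalize the whitespace.
    return " ".join(pattern.sub(" ", text).split())

print(drop_stopwords("the quick brown foofoo foo jumped over the baz ola dog"))
# the quick brown foofoo jumped over the dog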

Related

how to write strings in reverse with one character spared in python

I know how to write strings in reverse
txt = "Hello World"[::-1]
print(txt)
but I don't know how to do it with one character kept in the same place.
For example, when I type "world" it should become "wdlro".
Thanks
Just prepend the first character to the remainder of the string (reversed using slice notation, but stopping just before we reach index 0, which is the first character):
>>> s = "world"
>>> s[0] + s[:0:-1]
'wdlro'
Another attempt along these lines does not work as a one-liner, because list.remove() mutates the list in place and returns None, so it cannot be chained; split it into steps instead:
chars = list('world')
chars.remove('w')  # remove() returns None, so it can't be passed to join directly
word = 'w' + "".join(chars)[::-1]  # 'wdlro'
If you want to reverse all the words in the text based on the criteria (skipping the first character of each word):
txt = "Hello World"
result = []
for word in txt.split():
    result.append(word[0] + word[1:][::-1])
print(result)
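If you want the result back as a single string rather than a list of words, join the transformed words (same data as above):
print(" ".join(result))
# Holle Wdlro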
This is a more generic answer that lets you pick an arbitrary position within the string to hold in place:
txt = "Hello World"
position = 3
lock_char = txt[position]
new_string = list((txt[:position] + txt[position+1:])[::-1])
new_string.insert(position, lock_char)
listToStr = ''.join([str(elem) for elem in new_string])
print(listToStr)
Result: dlrloW oleH
A simple way using only slicing:
txt = "Hello World"
position = 10
[first, lock, last] = txt[:position], txt[position], txt[position+1:]
new_string = (first + last)[::-1]
[first, last] = new_string[:position], new_string[position:]
new_txt = first + lock + last
print(new_txt)

How to split a string based on word match from different lists?

I have a string. Now I want to split the string into parts if anything matches from two different lists. How can I do that? Here is what I have.
dummy_word = "I have a HTML file"
dummy_type = ["HTML","JSON","XML"]
dummy_file_type = ["file","document","paper"]
for e in dummy_type:
if e in dummy_word:
type_found = e
print("type ->" , e)
dum = dummy_word.split(e)
complete_dum = "".join(dum)
for c in dummy_file_type:
if c in complete_dum:
then = complete_dum.split("c")
print("file type ->",then)
In the given scenario my expected output is ["I have a", "HTML","file"]
These sorts of tasks are handled pretty well by itertools.groupby(). Here the key maps each word either to itself, if it is in the set of special words, or to False if it is not. This lets all the non-special words group together while each special word becomes its own element:
from itertools import groupby
dummy_word = "I have a HTML file"
dummy_type = ["HTML","JSON","XML"]
dummy_file_type = ["file","document","paper"]
words = set(dummy_type).union(dummy_file_type)
[" ".join(g) for k, g in
groupby(dummy_word.split(), key=lambda word: (word in words) and word)]
# ['I have a', 'HTML', 'file']
This worked for me:
dummy_word = "I have a HTML file"
dummy_type = ["HTML","JSON","XML"]
dummy_file_type = ["file","document","paper"]
temp = ""
dummy_list = []
for word in dummy_word.split():
    if word in dummy_type or word in dummy_file_type:
        if temp:
            dummy_list.append(temp)
            print(temp, "delete")
        print(temp)
        new_word = word + " "
        dummy_list.append(new_word)
        temp = ""
    else:
        temp += word + " "
        print(temp)
print(dummy_list)
One more way using re:
>>> import re
>>> list(map(str.strip, re.sub("|".join(dummy_type + dummy_file_type), lambda x: "," + x.group(), dummy_word).split(',')))
['I have a', 'HTML', 'file']
First, form a regex pattern by concatenating all the types with join. re.sub then prepends a comma to every matched token, the string is split on commas, and map is used to strip the surrounding whitespace.
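One small caveat, not part of the answer above: if any of the type strings ever contained regex metacharacters (say "C++"), joining them raw would break the pattern. Escaping each alternative with re.escape keeps the same idea safe:
import re

dummy_word = "I have a HTML file"
tokens = ["HTML", "JSON", "XML", "file", "document", "paper"]

# Escape every alternative so characters like '+' or '.' are taken literally.
pattern = "|".join(map(re.escape, tokens))
parts = [p.strip() for p in re.sub(pattern, lambda m: "," + m.group(), dummy_word).split(",")]
print(parts)
# ['I have a', 'HTML', 'file']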

Pattern search by NOT using Regex algorithm and code in python

Today I had an interview at AMD and was asked a question which I didn't know how to solve without regex. Here is the question:
Find all occurrences of the word "hello" in a text, where at most ONE extra character may appear between the letters of "hello", e.g. search for all instances of "h.ello", "hell o", "he,llo", or "hel!lo".
Since you also tagged this question algorithm, I'm just going to show the general approach that I would take when looking at this question, without including any language tricks from python.
1) I would want to split the string into a list of words
2) Loop through each string in the resulting list, checking if the string matches 'hello' without the character at the current index (or if it simply matches 'hello')
3) If a match is found, return it.
Here is a simple approach in python:
s = "h.ello hello h!ello hell.o none of these"
all = s.split()
def drop_one(s, match):
if s == match:
return True # WARNING: Early Return
for i in range(len(s) - 1):
if s[:i] + s[i+1:] == match:
return True
matches = [x for x in all if drop_one(x, "hello")]
print(matches)
The output of this snippet:
['h.ello', 'hello', 'h!ello', 'hell.o']
This should work. I've tried to make it generic. You might have to make some minor adjustments. Let me know if you don't understand any part.
def checkValidity(tlist):
    tmpVar = ''
    for i in range(len(tlist)):
        if tlist[i] in set("hello"):
            tmpVar += tlist[i]
    return tmpVar == 'hello'

mStr = "he.llo hehellbo hellox hell.o hello helloxy abhell.oyz"
mWord = "hello"
mlen = len(mStr)
wordLen = len(mWord) + 1
i = 0
print("given str = ", mStr)
while i < mlen:
    tmpList = []
    if mStr[i] == 'h':
        for j in range(wordLen):
            tmpList.append(mStr[i+j])
        validFlag = checkValidity(tmpList)
        if validFlag:
            print("Match starting at index: ", i, ':', mStr[i:i+wordLen])
            i += wordLen
        else:
            i += 1
    else:
        i += 1
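One caveat, as an observation rather than part of the answer above: mStr[i+j] can raise an IndexError for inputs where an 'h' falls within the last few characters, because the inner loop reads wordLen characters past i without checking the bound. A hedged tweak, reusing the names defined above, is to take a window only when it fits:
i = 0
while i < mlen:
    # Only take a full window if it fits inside the string; slicing also
    # avoids indexing one character at a time.
    if mStr[i] == 'h' and i + wordLen <= mlen:
        chunk = mStr[i:i+wordLen]
        if checkValidity(list(chunk)):
            print("Match starting at index: ", i, ':', chunk)
            i += wordLen
            continue
    i += 1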

Apply collocations from a list of bigrams with NLTK in Python

I have to find and "apply" collocations in several sentences. The sentences are stored in a list of strings. Let's focus on only one sentence for now.
Here's an example:
sentence = 'I like to eat the ice cream in new york'
Here's what I want in the end:
sentence_final = 'I like to eat the ice_cream in new_york'
I'm using Python NLTK to find the collocations and I'm able to create a set containing all the possible collocations over all the sentences I have.
Here's an example of the set:
set_collocations = set([('ice', 'cream'), ('new', 'york'), ('go', 'out')])
It's obviously bigger in reality.
I created the following function, which should return the new sentence, modified as described above:
def apply_collocations(sentence, set_colloc):
    window_size = 2
    words = sentence.lower().split()
    list_bigrams = list(nltk.bigrams(words))
    set_bigrams = set(list_bigrams)
    intersect = set_bigrams.intersection(set_colloc)
    print(set_colloc)
    print(set_bigrams)
    # No collocation in this sentence
    if not intersect:
        return sentence
    # At least one collocation in this sentence
    else:
        set_words_iters = set()
        # Create set of words of the collocations
        for bigram in intersect:
            set_words_iters.add(bigram[0])
            set_words_iters.add(bigram[1])
        # Sentence beginning
        if list_bigrams[0][0] not in set_words_iters:
            new_sentence = list_bigrams[0][0]
            begin = 1
        else:
            new_sentence = list_bigrams[0][0] + '_' + list_bigrams[0][1]
            begin = 2
        for i in range(begin, len(list_bigrams)):
            print(new_sentence)
            if list_bigrams[i][1] in set_words_iters and list_bigrams[i] in intersect:
                new_sentence += ' ' + list_bigrams[i][0] + '_' + list_bigrams[i][1]
            elif list_bigrams[i][1] not in set_words_iters:
                new_sentence += ' ' + list_bigrams[i][1]
    return new_sentence
Two questions:
Is there a more optimized way to do this?
Since I'm a bit inexperienced with NLTK, can someone tell me if there's a "direct way" to apply collocations to a certain text? I mean, once I have identified the bigrams which I consider collocations, is there some function (or fast method) to modify my sentences?
You can simply replace the string "x y" by "x_y" for each element in your collocations set:
def apply_collocations(sentence, set_colloc):
    res = sentence.lower()
    for b1, b2 in set_colloc:
        res = res.replace("%s %s" % (b1, b2), "%s_%s" % (b1, b2))
    return res
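On the second question about a "direct way" in NLTK: as far as I know, nltk.tokenize.MWETokenizer is built for exactly this; it retokenizes an already-split sentence and joins known multi-word expressions with a separator. A minimal sketch, assuming your collocations are already tuples of lowercase words:
from nltk.tokenize import MWETokenizer

set_collocations = {('ice', 'cream'), ('new', 'york'), ('go', 'out')}
sentence = 'I like to eat the ice cream in new york'

# MWETokenizer merges the listed multi-word expressions in a token list.
tokenizer = MWETokenizer(set_collocations, separator='_')
print(' '.join(tokenizer.tokenize(sentence.lower().split())))
# i like to eat the ice_cream in new_york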

Searching from a list of words to words in a text file

I am trying to write a program which reads a text file and then sorts the comments in it into positive, negative or neutral. I have tried all sorts of ways to do this, but each time to no avail. I can search for one word with no problems, but any more than that and it doesn't work. Also, I have an if statement, but I've had to use else twice underneath it as it wouldn't allow me to use elif. Any help with where I'm going wrong would be really appreciated. Thanks in advance.
middle = open("middle_test.txt", "r")
positive = []
negative = [] #the empty lists
neutral = []
pos_words = ["GOOD", "GREAT", "LOVE", "AWESOME"] #the lists I'd like to search
neg_words = ["BAD", "HATE", "SUCKS", "CRAP"]
for tweet in middle:
    words = tweet.split()
    if pos_words in words:  # doesn't work
        positive.append(words)
    else:  # can't use elif for some reason
        if 'BAD' in words:  # works but is only 1 word not list
            negative.append(words)
        else:
            neutral.append(words)
Use a Counter, see http://docs.python.org/2/library/collections.html#collections.Counter:
import urllib2
from collections import Counter
from string import punctuation
# data from http://inclass.kaggle.com/c/si650winter11/data
target_url = "http://goo.gl/oMufKm"
data = urllib2.urlopen(target_url).read()
word_freq = Counter([i.lower().strip(punctuation) for i in data.split()])
pos_words = ["good", "great", "love", "awesome"]
neg_words = ["bad", "hate", "sucks", "crap"]
for i in pos_words:
    try:
        print i, word_freq[i]
    except:  # if word not in data
        pass
[out]:
good 638
great 1082
love 7716
awesome 2032
You could use the code below to count the number of positive and negative words in a paragraph:
from collections import Counter
def readwords(filename):
    f = open(filename)
    words = [line.rstrip() for line in f.readlines()]
    return words
# >cat positive.txt
# good
# awesome
# >cat negative.txt
# bad
# ugly
positive = readwords('positive.txt')
negative = readwords('negative.txt')
print positive
print negative
paragraph = 'this is really bad and in fact awesome. really awesome.'
count = Counter(paragraph.split())
pos = 0
neg = 0
for key, val in count.iteritems():
    key = key.rstrip('.,?!\n')  # removing possible punctuation signs
    if key in positive:
        pos += val
    if key in negative:
        neg += val
print pos, neg
You are not reading the lines from the file. And this line:
if pos_words in words:
is checking for the whole list ["GOOD", "GREAT", "LOVE", "AWESOME"] as a single element of words. That is, you are looking in the list of words for the entire list pos_words.
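A minimal sketch of the membership test that was probably intended, checking each positive word individually instead of the whole list (the tweet text here is made up for illustration):
pos_words = ["GOOD", "GREAT", "LOVE", "AWESOME"]
neg_words = ["BAD", "HATE", "SUCKS", "CRAP"]

tweet = "I LOVE this, it is GREAT"
words = tweet.split()

# True if any positive/negative word appears among the tweet's words.
is_positive = any(word in words for word in pos_words)
is_negative = any(word in words for word in neg_words)
print(is_positive, is_negative)  # True False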
You have a few problems. First, you can create functions that read comments from the file and divide a comment into words. Write them and check that they work as you want. Then the main procedure can look like this (a sketch of the helper functions follows the code):
for comment in get_comments(file_name):
    words = get_words(comment)
    classified = False
    # at first look for negative comment
    for neg_word in NEGATIVE_WORDS:
        if neg_word in words:
            classified = True
            negatives.append(comment)
            break
    # now look for positive
    if not classified:
        for pos_word in POSITIVE_WORDS:
            if pos_word in words:
                classified = True
                positives.append(comment)
                break
    if not classified:
        neutral.append(comment)
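A minimal sketch of what get_comments and get_words could look like (hypothetical helpers, assuming one comment per line; words are uppercased so they match the OP's uppercase word lists):
NEGATIVE_WORDS = ["BAD", "HATE", "SUCKS", "CRAP"]
POSITIVE_WORDS = ["GOOD", "GREAT", "LOVE", "AWESOME"]
positives, negatives, neutral = [], [], []

def get_comments(file_name):
    # One comment per line, skipping blank lines.
    with open(file_name, "r") as f:
        return [line.strip() for line in f if line.strip()]

def get_words(comment):
    # Uppercase so membership tests match the uppercase word lists.
    return comment.upper().split()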
be careful, open() returns a file object.
>>> f = open('workfile', 'w')
>>> print f
<open file 'workfile', mode 'w' at 80a0960>
Use this:
>>> f.readline()
'This is the first line of the file.\n'
Then use set intersection:
positive += list(set(pos_words) & set(tweet.split()))
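A hedged sketch of how that intersection could slot into the original loop, using the OP's variable names (a tweet counts as positive if it shares at least one word with pos_words):
pos_words = ["GOOD", "GREAT", "LOVE", "AWESOME"]
neg_words = ["BAD", "HATE", "SUCKS", "CRAP"]
positive, negative, neutral = [], [], []

with open("middle_test.txt", "r") as middle:
    for tweet in middle:
        words = set(tweet.split())
        if words & set(pos_words):      # at least one positive word
            positive.append(tweet)
        elif words & set(neg_words):    # at least one negative word
            negative.append(tweet)
        else:
            neutral.append(tweet)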
