I am trying to attempt the mapreduce pairs pattern in python. Need to check if a word is in a text file and then find the word next to it and yield a pair of both words. keep running into either:
neighbors = words[words.index(w) + 1]
ValueError: substring not found
or
ValueError: ("the") is not in list
file cwork_trials.py
from mrjob.job import MRJob
class MRCountest(MRJob):
# Word count
def mapper(self, _, document):
# Assume document is a list of words.
#words = []
words = document.strip()
w = "the"
neighbors = words.index(w)
for word in words:
#searchword = "the"
#wor.append(str(word))
#neighbors = words[words.index(w) + 1]
yield(w,1)
def reducer(self, w, values):
yield(w,sum(values))
if __name__ == '__main__':
MRCountest.run()
Edit:
Trying to use the pairs pattern to search a document for every instance of a specific word and then find the word next to it each time. Then yielding a pair result for each instance i.e. find instances of "the" and the word next to it i.e. [the], [book], [the], [cat] etc.
from mrjob.job import MRJob
class MRCountest(MRJob):
# Word count
def mapper(self, _, document):
# Assume document is a list of words.
#words = []
words = document.split(" ")
want = "the"
for w, want in enumerate(words, 1):
if (w+1) < len(words):
neighbors = words[w + 1]
pair = (want, neighbors)
for u in neighbors:
if want is "the":
#pair = (want, neighbors)
yield(pair),1
#neighbors = words.index(w)
#for word in words:
#searchword = "the"
#wor.append(str(word))
#neighbors = words[words.index(w) + 1]
#yield(w,1)
#def reducer(self, w, values):
#yield(w,sum(values))
if __name__ == '__main__':
MRCountest.run()
As it stands I get yields of every word pair with multiples of the same pairing.
When you use words.index("the") then you will only get the first instance of "the" in your list or string, and as you have found, you will get an error if "the" isn't present.
Also you mention that you are trying to produce pairs, but only yield a single word.
I think what you are trying to do is something more like this:
def get_word_pairs(words):
for i, word in enumerate(words):
if (i+1) < len(words):
yield (word, words[i + 1]), 1
if (i-1) > 0:
yield (word, words[i - 1]), 1
assuming you are interested in neighbours in both directions. (If not, you only need the first yield.)
Lastly, since you use document.strip(), I suspect that document is in fact a string and not a list. If that's the case, you can use words = document.split(" ") to get the word list, assuming you don't have any punctuation.
Related
I am trying to make a job that takes in a text file, then counts the number of syllables in each word, then ultimately returns the top 10 words with the most syllables. I'm able to get all of the word/syllable pairs sorted in descending order, however, I am struggling to figure out how to return only the top 10 words. Here's my code so far:
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
WORD_RE = re.compile(r"[\w']+")
class MRMostUsedWordSyllables(MRJob):
def steps(self):
return [
MRStep(mapper=self.word_splitter_mapper,
reducer=self.sorting_word_syllables),
MRStep(reducer=self.reducer_word_sorted),
MRStep(reducer=self.get_top_10_reducer)
]
def word_splitter_mapper(self, _, line):
#for word in line.split():
for word in WORD_RE.findall(line):
yield(word.lower(), None)
def sorting_word_syllables(self, word, count):
count = 0
vowels = 'aeiouy'
word = word.lower().strip()
if word in vowels:
count +=1
for index in range(1,len(word)):
if word[index] in vowels and word[index-1] not in vowels:
count +=1
if word.endswith('e'):
count -= 1
if word.endswith('le'):
count+=1
if count == 0:
count +=1
yield None, (int(count), word)
def reducer_word_sorted(self, _, syllables_counts):
for count, word in sorted(syllables_counts, reverse=True):
yield (int(count), word)
def get_top_10_reducer(self, count, word):
self.aList = []
for value in list(range(count)):
self.aList.append(value)
self.bList = []
for i in range(10):
self.bList.append(max(self.aList))
self.aList.remove(max(self.aList))
for i in range(10):
yield self.bList[i]
if __name__ == '__main__':
import time
start = time.time()
MRMostUsedWordSyllables.run()
end = time.time()
print(end - start)
I know my issue is with the "get_top_10_reducer" function. I keep getting ValueError: max() arg is an empty sequence.
According to the error, one of your reducers has returned 0 for the count. Do you have an empty line in your input, for example? You should filter this data out as early as possible.
Overall, I think you need to remove reducer_word_sorted. There is no guarantee this returns sorted data. Instead, I think it regroups all data based on the numeric count key, then emits in a non-deterministic order to the next step.
That being said, your top 10 reducer is never using the value of word parameter , which should be a list itself, actually, grouped by each count key emitted by the previous reducer.
With the reducer_word_sorted removed, the sorting_word_syllables returns None for its key... This is fine because you then have all split words in a giant list, so define a regular function
def get_syllable_count_pair(word):
return (syllables(word), word, )
Use that within the reducer
def get_top_10_reducer(self, count, word):
assert count == None # added for a guard
with_counts = [get_syllable_count_pair(w) for w in word]
# Sort the words by the syllable count
sorted_counts = sorted(syllables_counts, reverse=True, key=lambda x: x[0])
# Slice off the first ten
for t in sorted_counts[:10]:
yield t
I want to get the indexes for all the ocurrencies of substring inside a string
s = "the bewildered tourist was in the mosque"
w = "the"
find(s,t)
[1, 31]
I also want the function to start counting on 1, as in the previous example, instead of 0.
This is what I managed:
import re
def find(sentence, word):
if word in sentence:
matches = re.finditer(word, sentence)
matches_positions = [match.start() for match in matches]
print(matches_positions)
else:
return [-1]
find(s, w)
[0, 30]
This is working, but I would like a way of doing this without using any import, as well as starting at 1 instead of 0.
There is no need to use the regular expression functionality to search for substrings if you don't need any regular expression matching.
To solve with just the standard methods available:
def find_indexes(string_to_search, word_to_find):
start_index = 0
positions = []
while True:
try:
start_index = string_to_search.index(word_to_find, start_index)
positions.append(start_index)
start_index += len(word_to_find)
except ValueError:
return positions
return positions
s = "the bewildered tourist was in the mosque"
w = "the"
print(find_indexes(s, w))
index raises an ValueError exception if the word can't be found after the given index, so we catch that and return the current indexes when it gets generated. index takes the start index as its second argument.
To make it start at 1 instead, add 1 to the index when you store it:
positions.append(start_index + 1)
You can convert it the string into a list and find the index of each item where the value of w is found. This will give you the position of the word (not the position of the string).
s = "the bewildered tourist was in the mosque"
w = "the"
ss = s.split(' ')
print ([i for i,si in enumerate(ss) if si == w])
The output of this will be:
[0, 5]
Similarly, if you want to find the position of the string, can search for the word the (or var w), then print the position. To use the similar approach as above, you can do:
print ([i+1 for i in range(len(s)-len(w)-1) if s[i:i+len(2)] == w])
This will return:
[1, 31]
The i+1 is to give you the starting position of 1 instead of 0.
The first result will return the position of the words and the second will return the position within the string.
I have a Python list of string names where I would like to remove a common substring from all of the names.
And after reading this similar answer I could almost achieve the desired result using SequenceMatcher.
But only when all items have a common substring:
From List:
string 1 = myKey_apples
string 2 = myKey_appleses
string 3 = myKey_oranges
common substring = "myKey_"
To List:
string 1 = apples
string 2 = appleses
string 3 = oranges
However I have a slightly noisy list that contains a few scattered items that don't fit the same naming convention.
I would like to remove the "most common" substring from the majority:
From List:
string 1 = myKey_apples
string 2 = myKey_appleses
string 3 = myKey_oranges
string 4 = foo
string 5 = myKey_Banannas
common substring = ""
To List:
string 1 = apples
string 2 = appleses
string 3 = oranges
string 4 = foo
string 5 = Banannas
I need a way to match the "myKey_" substring so I can remove it from all names.
But when I use the SequenceMatcher the item "foo" causes the "longest match" to be equal to blank "".
I think the only way to solve this is to find the "most common substring". But how could that be accomplished?
Basic example code:
from difflib import SequenceMatcher
names = ["myKey_apples",
"myKey_appleses",
"myKey_oranges",
#"foo",
"myKey_Banannas"]
string2 = names[0]
for i in range(1, len(names)):
string1 = string2
string2 = names[i]
match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
print(string1[match.a: match.a + match.size]) # -> myKey_
Given names = ["myKey_apples", "myKey_appleses", "myKey_oranges", "foo", "myKey_Banannas"]
An O(n^2) solution I can think of is to find all possible substrings and storing them in a dictionary with the number of times they occur :
substring_counts={}
for i in range(0, len(names)):
for j in range(i+1,len(names)):
string1 = names[i]
string2 = names[j]
match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
matching_substring=string1[match.a:match.a+match.size]
if(matching_substring not in substring_counts):
substring_counts[matching_substring]=1
else:
substring_counts[matching_substring]+=1
print(substring_counts) #{'myKey_': 5, 'myKey_apples': 1, 'o': 1, '': 3}
And then picking the maximum occurring substring
import operator
max_occurring_substring=max(substring_counts.iteritems(), key=operator.itemgetter(1))[0]
print(max_occurring_substring) #myKey_
Here's a overly verbose solution to your problem:
def find_matching_key(list_in, max_key_only = True):
"""
returns the longest matching key in the list * with the highest frequency
"""
keys = {}
curr_key = ''
# If n does not exceed max_n, don't bother adding
max_n = 0
for word in list(set(list_in)): #get unique values to speed up
for i in range(len(word)):
# Look up the whole word, then one less letter, sequentially
curr_key = word[0:len(word)-i]
# if not in, count occurance
if curr_key not in keys.keys() and curr_key!='':
n = 0
for word2 in list_in:
if curr_key in word2:
n+=1
# if large n, Add to dictionary
if n > max_n:
max_n = n
keys[curr_key] = n
# Finish the word
# Finish for loop
if max_key_only:
return max(keys, key=keys.get)
else:
return keys
# Create your "from list"
From_List = [
"myKey_apples",
"myKey_appleses",
"myKey_oranges",
"foo",
"myKey_Banannas"
]
# Use the function
key = find_matching_key(From_List, True)
# Iterate over your list, replacing values
new_From_List = [x.replace(key,'') for x in From_List]
print(new_From_List)
['apples', 'appleses', 'oranges', 'foo', 'Banannas']
Needless to say, this solution would look a lot neater with recursion. Thought I'd sketch out a rough dynamic programming solution for you though.
I would first find the starting letter with the most occurrences. Then I would take each word having that starting letter, and take while all these words have matching letters. Then in the end I would remove the prefix that was found from each starting word:
from collections import Counter
from itertools import takewhile
strings = ["myKey_apples", "myKey_appleses", "myKey_oranges", "berries"]
def remove_mc_prefix(words):
cnt = Counter()
for word in words:
cnt[word[0]] += 1
first_letter = list(cnt)[0]
filter_list = [word for word in words if word[0] == first_letter]
filter_list.sort(key = lambda s: len(s)) # To avoid iob
prefix = ""
length = len(filter_list[0])
for i in range(length):
test = filter_list[0][i]
if all([word[i] == test for word in filter_list]):
prefix += test
else: break
return [word[len(prefix):] if word.startswith(prefix) else word for word in words]
print(remove_mc_prefix(strings))
Out: ['apples', 'appleses', 'oranges', 'berries']
To find the most-common-substring from list of python-string
I already tested on python-3.10.5 I hope it will work for you.
I have the same use case but a different kind of task, I just need to find one common-pattern-string from a list of more than 100s files. To use as a regular-expression.
Your Basic example code is not working in my case. because 1st checking with 2nd, 2nd with 3rd, 3rd with 4th and so on. So, I change it to the most common substring and will check with each one.
The downside of this code is that if something is not common with the most common substring, the final most common substring will be an empty one.
But in my case, it is working.
from difflib import SequenceMatcher
for i in range(1, len(names)):
if i==1:
string1, string2 = names[0], names[i]
else:
string1, string2 = most_common_substring, names[i]
match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
most_common_substring = string1[match.a: match.a + match.size]
print(f"most_common_substring : {most_common_substring}")
python python-3python-difflib
I'm having trouble with a script to replace the normal letters to especial characters to test a translation system, here's an example (cha-mate is chá-mate but would be tested with chã-mate/chã-máte and other variations), but instead of creating this variations, it's switching all of the same characters to only one espcial letter, here's what it's printing:
chá-máte
chã-mãte
Here's what should print in theory:
cha-máte
cha-mãte
chá-mate
chã-mate
etc.
Here's the code and the json utilized:
def translation_tester(word):
esp_chars = {
'a': 'áã',
}
#words = [word]
for esp_char in esp_chars:
if esp_char in word:
replacement_chars = esp_chars[esp_char]
for i in range(len(replacement_chars)):
print(word.replace(esp_char, replacement_chars[i]))
def main():
words = ['cha-mate']
for word in words:
translation_tester(word)
main()
Anyway, any help is appreciated, thanks in advance!
To handle arbitrary number of replacements, you need to use recursion. This is how I did it.
intword = 'cha-mate'
esp_chars = {'a': 'áã'}
def wpermute(word, i=0):
for idx, c in enumerate(word[i:], i):
if c in esp_chars:
for s in esp_chars[c]:
newword = word[0:idx] + s + word[idx + 1:]
wpermute(newword, idx + 1)
if idx == len(word) -1:
print(word)
wpermute(intword)
which gives the output of 9 different ways the word can be written.
chá-máte
chá-mãte
chá-mate
chã-máte
chã-mãte
chã-mate
cha-máte
cha-mãte
cha-mate
There might be a nicer way to do this, but you can do the following (making sure to include the plain 'a' in the list of replacement chars):
import itertools
import re
def replace_at_indices(word, new_chars, indices):
new_word = word
for i, index in enumerate(indices):
new_word = new_word[:index] + new_chars[i] + new_word[index+1:]
return new_word
def translation_tester(word):
esp_chars = {
'a': 'aáã',
}
for esp_char in esp_chars:
replacement_chars = list(esp_chars[esp_char])
indices = [m.start() for m in re.finditer(esp_char, word)]
product = list(itertools.product(replacement_chars, repeat=len(indices)))
for p in product:
new_word = replace_at_indices(word, p, indices)
print(new_word)
def main():
words = ['cha-mate']
for word in words:
translation_tester(word)
main()
For your example, this should give you:
cha-mate
cha-máte
cha-mãte
chá-mate
chá-máte
chá-mãte
chã-mate
chã-máte
chã-mãte
See also:
Find all occurrences of a substring in Python
generating permutations with repetitions in python
Replacing a character from a certain index
import re
string = "is2 Thi1s T4est 3a"
def order(sentence):
res = ''
count = 1
list = sentence.split()
for i in list:
for i in list:
a = re.findall('\d+', i)
if a == [str(count)]:
res += " ".join(i)
count += 1
print(res)
order(string)
Above there is a code which I have problem with. Output which I should get is:
"Thi1s is2 3a T4est"
Instead I'm getting the correct order but with spaces in the wrong places:
"T h i 1 si s 23 aT 4 e s t"
Any idea how to make it work with this code concept?
You are joining the characters of each word:
>>> " ".join('Thi1s')
'T h i 1 s'
You want to collect your words into a list and join that instead:
def order(sentence):
number_words = []
count = 1
words = sentence.split()
for word in words:
for word in words:
matches = re.findall('\d+', word)
if matches == [str(count)]:
number_words.append(word)
count += 1
result = ' '.join(number_words)
print(result)
I used more verbose and clear variable names. I also removed the list variable; don't use list as a variable name if you can avoid it, as that masks the built-in list name.
What you implemented comes down to a O(N^2) (quadratic time) sort. You could instead use the built-in sort() function to bring this to O(NlogN); you'd extract the digit and sort on its integer value:
def order(sentence):
digit = re.compile(r'\d+')
return ' '.join(
sorted(sentence.split(),
key=lambda w: int(digit.search(w).group())))
This differs a little from your version in that it'll only look at the first (consecutive) digits, it doesn't care about the numbers being sequential, and will break for words without digits. It also uses a return to give the result to the caller rather than print. Just use print(order(string)) to print the return value.
If you assume the words are numbered consecutively starting at 1, then you can sort them in O(N) time even:
def order(sentence):
digit = re.compile(r'\d+')
words = sentence.split()
result = [None] * len(words)
for word in words:
index = int(digit.search(word).group())
result[index - 1] = word
return ' '.join(result)
This works by creating a list of the same length, then using the digits from each word to put the word into the correct index (minus 1, as Python lists start at 0, not 1).
I think the bug is simply in the misuse of join(). You want to concatenate the current sorted string. i is simply a token, hence simply add it to the end of the string. Code untested.
import re
string = "is2 Thi1s T4est 3a"
def order(sentence):
res = ''
count = 1
list = sentence.split()
for i in list:
for i in list:
a = re.findall('\d+', i)
if a == [str(count)]:
res = res + " " + i # your bug here
count += 1
print(res)
order(string)