Removing duplicates from n-grams in rows (Python)

This code produces n-grams along with the number of times each n-gram appears.
I have a CSV file in which each row has a column containing a string of words.
When this code finds a 4-gram such as 'this is my puppy', it also counts every repeated occurrence within the same row.
My intention is that an n-gram should be counted at most once per row: once for the row where it occurs, a second time when it occurs in another row, and so on. For example:

row  Word
1    this is my puppy what this is my puppy
2    this is my puppy

This code counts 'this is my puppy' 3 times, yet I want it to be counted 2 times.
This is the Python code:
import collections
import re
import sys
import time


def tokenize(string):
    """Convert string to lowercase and split into words (ignoring
    punctuation), returning list of words.
    """
    return re.findall(r'\w+', string.lower())


def count_ngrams(lines, min_length=4, max_length=5):
    """Iterate through given lines iterator (file object or list of
    lines) and return n-gram frequencies. The return value is a dict
    mapping the length of the n-gram to a collections.Counter
    object of n-gram tuple and number of times that n-gram occurred.
    Returned dict includes n-grams of length min_length to max_length.
    """
    lengths = range(min_length, max_length + 1)
    ngrams = {length: collections.Counter() for length in lengths}
    queue = collections.deque(maxlen=max_length)

    # Helper function to add n-grams at start of current queue to dict
    def add_queue():
        current = tuple(queue)
        for length in lengths:
            if len(current) >= length:
                ngrams[length][current[:length]] += 1

    # Loop through all lines and words and add n-grams to dict
    for line in lines:
        for word in tokenize(line):
            queue.append(word)
            if len(queue) >= max_length:
                add_queue()

    # Make sure we get the n-grams at the tail end of the queue
    while len(queue) > min_length:
        queue.popleft()
        add_queue()

    return ngrams


def print_most_frequent(ngrams, num=10):
    """Print num most common n-grams of each length in n-grams dict."""
    for n in sorted(ngrams):
        print('----- {} most common {}-grams -----'.format(num, n))
        for gram, count in ngrams[n].most_common(num):
            print('{0}: {1}'.format(' '.join(gram), count))
        print('')


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage: python ngrams.py filename')
        sys.exit(1)

    start_time = time.time()
    with open("PWorm.csv") as f:
        ngrams = count_ngrams(f)
    print_most_frequent(ngrams)
    elapsed_time = time.time() - start_time
    print('Took {:.03f} seconds'.format(elapsed_time))
Your help will be highly appreciated. Thank you.

Instead of populating your ngrams dict semi-manually, you can use a defaultdict.
To prevent the same n-gram in a line from being counted twice, you'll have to build an n-gram dict per line, and then merge it into the overall n-gram dict.
def count_ngrams(lines, min_length=4, max_length=5):
    """Iterate through given lines iterator (file object or list of
    lines) and return n-gram frequencies. The return value is a dict
    mapping the length of the n-gram to a collections.Counter
    object of n-gram tuple and number of times that n-gram occurred.
    Returned dict includes n-grams of length min_length to max_length.
    """
    lengths = range(min_length, max_length + 1)
    ngrams = collections.defaultdict(collections.Counter)
    queue = collections.deque(maxlen=max_length)

    # Helper function to add n-grams at start of current queue to dict
    def add_queue(ngrams_line):
        current = tuple(queue)
        for length in lengths:
            if len(current) >= length:
                ngrams_line[length][current[:length]] = 1  # instead of += 1

    # to combine the 2 defaultdict(Counter)
    def combine_ngrams(ngrams, ngrams_line):
        for k, v in ngrams_line.items():
            ngrams[k] += v
        return ngrams

    # Loop through all lines and words and add n-grams to dict
    for line in lines:
        ngrams_line = collections.defaultdict(collections.Counter)
        for word in tokenize(line):
            queue.append(word)
            if len(queue) >= max_length:
                add_queue(ngrams_line)
        ngrams = combine_ngrams(ngrams, ngrams_line)

    # Make sure we get the n-grams at the tail end of the queue
    ngrams_line = collections.defaultdict(collections.Counter)
    while len(queue) > min_length:
        queue.popleft()
        add_queue(ngrams_line)
    ngrams = combine_ngrams(ngrams, ngrams_line)

    return ngrams
I don't 100% understand the part after while len(queue) > min_length:, or why the queue doesn't get reset for every line, so you might have to adjust my answer a bit.
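As an alternative, here is a minimal sketch (not the code above) of the same per-row deduplication idea: collect each row's n-grams into a set, so a repeat inside a row is counted only once, and let a single Counter accumulate across rows:

from collections import Counter

def count_ngrams_once_per_row(lines, n=4):
    counts = Counter()
    for line in lines:
        words = line.lower().split()  # or the question's tokenize()
        # a set keeps each n-gram at most once for this row
        row_grams = {tuple(words[i:i + n]) for i in range(len(words) - n + 1)}
        counts.update(row_grams)
    return counts

rows = ["this is my puppy what this is my puppy", "this is my puppy"]
print(count_ngrams_once_per_row(rows)[("this", "is", "my", "puppy")])  # 2, not 3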

Related

Finding Top Ten Word Syllable Count

I am trying to make a job that takes in a text file, then counts the number of syllables in each word, then ultimately returns the top 10 words with the most syllables. I'm able to get all of the word/syllable pairs sorted in descending order, however, I am struggling to figure out how to return only the top 10 words. Here's my code so far:
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

WORD_RE = re.compile(r"[\w']+")


class MRMostUsedWordSyllables(MRJob):

    def steps(self):
        return [
            MRStep(mapper=self.word_splitter_mapper,
                   reducer=self.sorting_word_syllables),
            MRStep(reducer=self.reducer_word_sorted),
            MRStep(reducer=self.get_top_10_reducer)
        ]

    def word_splitter_mapper(self, _, line):
        # for word in line.split():
        for word in WORD_RE.findall(line):
            yield (word.lower(), None)

    def sorting_word_syllables(self, word, count):
        count = 0
        vowels = 'aeiouy'
        word = word.lower().strip()
        if word in vowels:
            count += 1
        for index in range(1, len(word)):
            if word[index] in vowels and word[index - 1] not in vowels:
                count += 1
        if word.endswith('e'):
            count -= 1
        if word.endswith('le'):
            count += 1
        if count == 0:
            count += 1
        yield None, (int(count), word)

    def reducer_word_sorted(self, _, syllables_counts):
        for count, word in sorted(syllables_counts, reverse=True):
            yield (int(count), word)

    def get_top_10_reducer(self, count, word):
        self.aList = []
        for value in list(range(count)):
            self.aList.append(value)
        self.bList = []
        for i in range(10):
            self.bList.append(max(self.aList))
            self.aList.remove(max(self.aList))
        for i in range(10):
            yield self.bList[i]


if __name__ == '__main__':
    import time
    start = time.time()
    MRMostUsedWordSyllables.run()
    end = time.time()
    print(end - start)
I know my issue is with the "get_top_10_reducer" function. I keep getting ValueError: max() arg is an empty sequence.
According to the error, one of your reducers has returned 0 for the count. Do you have an empty line in your input, for example? You should filter this data out as early as possible.
Overall, I think you need to remove reducer_word_sorted. There is no guarantee this returns sorted data. Instead, I think it regroups all data based on the numeric count key, then emits in a non-deterministic order to the next step.
That being said, your top-10 reducer never uses the value of the word parameter, which is actually an iterable itself, grouped by each count key emitted by the previous reducer.
With reducer_word_sorted removed, sorting_word_syllables returns None for its key. This is fine, because you then have all split words in one giant list, so define a regular function:
def get_syllable_count_pair(word):
    return (syllables(word), word)
Use that within the reducer:
def get_top_10_reducer(self, count, word):
    assert count == None  # added for a guard
    with_counts = [get_syllable_count_pair(w) for w in word]
    # Sort the words by the syllable count
    sorted_counts = sorted(with_counts, reverse=True, key=lambda x: x[0])
    # Slice off the first ten
    for t in sorted_counts[:10]:
        yield t
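Note that syllables isn't defined above; a minimal sketch that simply reuses the vowel-counting heuristic from the question's sorting_word_syllables could be:

def syllables(word):
    # same heuristic as sorting_word_syllables in the question
    count = 0
    vowels = 'aeiouy'
    word = word.lower().strip()
    if word in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    return max(count, 1)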

How to efficiently count word occurrences in Python without additional modules

Background
I'm working on the HackerRank problem Word Order. The task is to read the following input from stdin:
4
bcdef
abcdefg
bcde
bcdef
Produce the output that reflects:
The number of unique words (printed on the first line)
The count of occurrences for each unique word (printed on the second line)
Example:
3 # Number of unique words
2 1 1 # count of occurring words, 'bcdef' appears twice = 2
Problem
I've coded two solutions; the second one passes the initial tests but fails by exceeding the time limit. The first one would also work, but it unnecessarily sorts the outputs (and has the same time-limit issue).
Notes
In the first solution I was unnecessarily sorting values; this is fixed in the second solution.
I'm keen to make better (proper) use of standard Python data structures and list/dictionary comprehensions. I would be particularly keen to receive a solution that doesn't import any additional modules, with the exception of import os if needed.
Code
import os


def word_order(words):
    # Output no of distinct words
    distinct_words = set(words)
    n_distinct_words = len(distinct_words)
    print(str(n_distinct_words))

    # Count occurrences of each word
    occurrences = []
    for distinct_word in distinct_words:
        n_word_appearances = 0
        for word in words:
            if word == distinct_word:
                n_word_appearances += 1
        occurrences.append(n_word_appearances)
    occurrences.sort(reverse=True)
    print(*occurrences, sep=' ')
    # for o in occurrences:
    #     print(o, end=' ')


def word_order_two(words):
    '''
    Run through all words and only count multiple occurrences, do the maths
    to calculate unique words, etc. Attempt to construct a dictionary to make
    the operation more memory efficient.
    '''
    # Construct a count of word occurrences
    dictionary_words = {word: words.count(word) for word in words}
    # Unique words are equivalent to dictionary keys
    unique_words = len(dictionary_words)
    # Obtain sorted dictionary values
    # sorted_values = sorted(dictionary_words.values(), reverse=True)
    result_values = " ".join(str(value) for value in dictionary_words.values())
    # Output results
    print(str(unique_words))
    print(result_values)
    return 0


if __name__ == '__main__':
    q = int(input().strip())
    inputs = []
    for q_itr in range(q):
        s = input()
        inputs.append(s)
    # word_order(words=inputs)
    word_order_two(words=inputs)
Those nested loops are very bad performance-wise (they make your algorithm quadratic) and quite unnecessary. You can get all counts in a single iteration. You could use a plain dict or the dedicated collections.Counter:
from collections import Counter

def word_order(words):
    c = Counter(words)
    print(len(c))
    print(" ".join(str(v) for _, v in c.most_common()))
The "manual" implementation that shows the workings of the Counter and its methods:
def word_order(words):
    c = {}
    for word in words:
        c[word] = c.get(word, 0) + 1
    print(len(c))
    print(" ".join(str(v) for v in sorted(c.values(), reverse=True)))
    # print(" ".join(map(str, sorted(c.values(), reverse=True))))
Without any imports, you could count unique elements by
len(set(words))
and count their occurrences by
def counter(words):
    count = dict()
    for word in words:
        if word in count:
            count[word] += 1
        else:
            count[word] = 1
    return count.values()
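Putting those two pieces together on the sample input from the question:

words = ['bcdef', 'abcdefg', 'bcde', 'bcdef']
print(len(set(words)))                        # 3
print(*sorted(counter(words), reverse=True))  # 2 1 1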
You can use Counter and then print the output like below:
>>> from collections import Counter
>>> def counter_words(words):
...     cnt = Counter(words)
...     print(len(cnt))
...     print(*[str(v) for k, v in cnt.items()], sep=' ')
...
>>> inputs = ['bcdef', 'abcdefg', 'bcde', 'bcdef']
>>> counter_words(inputs)
3
2 1 1

How to find max. relative frequency in a string in Python?

How do I find the maximum relative frequency in a string? I need a function that takes a string argument and returns the highest relative frequency of any letter in the string. For example, the string 'sufficit' has 8 letters, and the most frequent letters are f and i, which both occur twice. Thus, the highest relative frequency is 2/8. The string 'Non-even' has 7 letters (- is not a letter), and the most frequent letter is n, which occurs three times (one N and two n). Thus, the highest relative frequency is 3/7.
def max_rel_freq(string):
    dic = {}
    total = float(len(string))
    for ch in string:
        if ch in dic:
            dic[ch] = dic[ch] + 1
        else:
            dic[ch] = 1
    frequencies = []
    for s in string:
        frequencies.append(float(dic[s] / total))
    return max(frequencies)
This works for 'sufficit' but does not give the required output for 'Non-even'.
This should work:
def max_rel_freq(string):
    max_freq = 1
    string = string.lower()
    # keep only letters, so characters like '-' are ignored
    string = [ch for ch in string if ch.isalpha()]
    for i in string:
        if string.count(i) > max_freq:
            max_freq = string.count(i)
    # divide by the number of letters to get the relative frequency
    return max_freq / len(string)
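For comparison, here is a minimal Counter-based sketch that returns the relative frequency directly, assuming str.isalpha is the right test for what counts as a letter:

from collections import Counter

def max_rel_freq(string):
    # count case-insensitively and ignore non-letters such as '-'
    letters = [ch for ch in string.lower() if ch.isalpha()]
    return max(Counter(letters).values()) / len(letters)

print(max_rel_freq('sufficit'))  # 0.25 (2/8)
print(max_rel_freq('Non-even'))  # 0.4285... (3/7)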

Finding repeated character combinations in string

I have a string that holds a very long sentence without whitespaces/spaces.
mystring = "abcdthisisatextwithsampletextforasampleabcd"
I would like to find all of the repeated substrings that contain a minimum of 4 chars.
So I would like to achieve something like this:
'text' 2 times
'sample' 2 times
'abcd' 2 times
As abcd, text, and sample can each be found two times in mystring, they are recognized as properly matched substrings with length of 4 or more. It's important that I am seeking repeated substrings; finding only existing English words is not a requirement.
The answers I found are helpful for finding duplicates in texts with whitespace, but I couldn't find a proper resource that covers the situation when there are no spaces in the string. How can this be done in the most efficient way?
Let's go through this step by step. There are several sub-tasks you should take care of:
Identify all substrings of length 4 or more.
Count the occurrence of these substrings.
Filter all substrings with 2 occurrences or more.
You can actually put all of them into a few statements. For understanding, it is easier to go through them one at a time.
The following examples all use
mystring = "abcdthisisatextwithsampletextforasampleabcd"
min_length = 4
1. Substrings of a given length
You can easily get substrings by slicing - for example, mystring[4:4+6] gives you the substring from position 4 of length 6: 'thisis'. More generically, you want substrings of the form mystring[start:start+length].
So what values do you need for start and length?
start must...
cover all substrings, so it must include the first character: start in range(0, ...).
not map to short substrings, so it can stop min_length characters before the end: start in range(..., len(mystring) - min_length + 1).
length must...
cover the shortest substring of length 4: length in range(min_length, ...).
not exceed the remaining string after start: length in range(..., len(mystring) - start + 1) (the code below writes these as i and j).
The +1 terms come from converting lengths (>=1) to indices (>=0).
You can put this all together into a single comprehension:
substrings = [
    mystring[i:i+j]
    for i in range(0, len(mystring) - min_length + 1)
    for j in range(min_length, len(mystring) - i + 1)
]
2. Count substrings
Trivially, you want to keep a count for each substring. Keeping anything for each specific object is what dicts are made for. So you should use substrings as keys and counts as values in a dict. In essence, this corresponds to this:
counts = {}
for substring in substrings:
    try:  # increase count for existing keys, set for new keys
        counts[substring] += 1
    except KeyError:
        counts[substring] = 1
You can simply feed your substrings to collections.Counter, and it produces something like the above.
>>> counts = collections.Counter(substrings)
>>> print(counts)
Counter({'abcd': 2, 'abcdt': 1, 'abcdth': 1, 'abcdthi': 1, 'abcdthis': 1, ...})
Notice how the duplicate 'abcd' maps to the count of 2.
3. Filtering duplicate substrings
So now you have your substrings and the count for each. You need to remove the non-duplicate substrings - those with a count of 1.
Python offers several constructs for filtering, depending on the output you want. These work also if counts is a regular dict:
>>> list(filter(lambda key: counts[key] > 1, counts))
['abcd', 'text', 'samp', 'sampl', 'sample', 'ampl', 'ample', 'mple']
>>> {key: value for key, value in counts.items() if value > 1}
{'abcd': 2, 'ampl': 2, 'ample': 2, 'mple': 2, 'samp': 2, 'sampl': 2, 'sample': 2, 'text': 2}
Using Python primitives
Python ships with primitives that allow you to do this more efficiently.
Use a generator to build substrings. A generator builds its members on the fly, so you never actually have them all in memory. For your use case, you can use a generator expression:
substrings = (
    mystring[i:i+j]
    for i in range(0, len(mystring) - min_length + 1)
    for j in range(min_length, len(mystring) - i + 1)
)
Use a pre-existing Counter implementation. Python comes with a dict-like container that counts its members: collections.Counter can directly digest your substring generator. Especially in newer versions, this is much more efficient.
counts = collections.Counter(substrings)
You can exploit Python's lazy filters to only ever inspect one substring. The filter builtin or another generator expression can produce one result at a time without storing them all in memory.
for substring in filter(lambda key: counts[key] > 1, counts):
    print(substring, 'occurs', counts[substring], 'times')
Nobody is using re! Time for an answer [ab]using the regular expression built-in module ;)
import re
Finding all the maximal substrings that are repeated
repeated_ones = set(re.findall(r"(.{4,})(?=.*\1)", mystring))
This matches the longest substrings which have at least a single repetition after them (without consuming it). So it finds all disjoint substrings that are repeated, while only yielding the longest ones.
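For example, on the question's string this yields exactly the three maximal repeats:

>>> import re
>>> mystring = "abcdthisisatextwithsampletextforasampleabcd"
>>> set(re.findall(r"(.{4,})(?=.*\1)", mystring))
{'abcd', 'text', 'sample'}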
Finding all substrings that are repeated, including overlaps
mystring_overlap = "abcdeabcdzzzzbcde"
# In case we want to match both abcd and bcde
repeated_ones = set()
pos = 0
while True:
    match = re.search(r"(.{4,}).*(\1)+", mystring_overlap[pos:])
    if match:
        repeated_ones.add(match.group(1))
        pos += match.start() + 1  # advance past the start of this match
    else:
        break
This ensures that all substrings with a repetition are returned, not only disjoint ones. It is much slower, but it gets the job done.
If, in addition to the longest repeated strings, you also want their shorter leading substrings, then:
base_repetitions = list(repeated_ones)
for s in base_repetitions:
    for i in range(4, len(s)):
        repeated_ones.add(s[:i])
That ensures that for long repeated substrings you also get the smaller ones, e.g. "sample" and "ample" found by the re.search code, plus "samp", "sampl", and "ampl" added by the above snippet.
Counting matches
Because (by design) the substrings that we count are non-overlapping, the count method is the way to go:
from __future__ import print_function
for substr in repeated_ones:
    print("'%s': %d times" % (substr, mystring.count(substr)))
Results
Finding maximal substrings:
With the question's original mystring:
{'abcd', 'text', 'sample'}
with the mystring_overlap sample:
{'abcd'}
Finding all substrings:
With the question's original mystring:
{'abcd', 'ample', 'mple', 'sample', 'text'}
... and if we add the code to get all substrings then, of course, we get absolutely all the substrings:
{'abcd', 'ampl', 'ample', 'mple', 'samp', 'sampl', 'sample', 'text'}
with the mystring_overlap sample:
{'abcd', 'bcde'}
Future work
It's possible to filter the results of the finding all substrings with the following steps:
take a match "A"
check if this match is a substring of another match, call it "B"
if there is a "B" match, check the counter on that match "B_n"
if "A_n = B_n", then remove A
go to first step
It cannot happen that "A_n < B_n" because A is smaller than B (is a substring) so there must be at least the same number of repetitions.
If "A_n > B_n" it means that there is some extra match of the smaller substring, so it is a distinct substring because it is repeated in a place where B is not repeated.
Script (explanation where needed, in comments):
from collections import Counter

mystring = "abcdthisisatextwithsampletextforasampleabcd"
mystring_len = len(mystring)

possible_matches = []
matches = []

# Range `start_index` from 0 to 3 from the left, due to minimum char count of 4
for start_index in range(0, mystring_len - 3):
    # Start `end_index` at `start_index+1` and range it throughout the rest of
    # the string
    for end_index in range(start_index + 1, mystring_len + 1):
        current_string = mystring[start_index:end_index]
        if len(current_string) < 4:
            continue  # Skip this iteration, if len < 4
        possible_matches.append(mystring[start_index:end_index])

for possible_match, count in Counter(possible_matches).most_common():
    # Iterate until count is less than or equal to 1 because `Counter`'s
    # `most_common` method lists them in order. Once 1 (or less) is hit, all
    # others are the same or lower.
    if count <= 1:
        break
    matches.append((possible_match, count))

for match, count in matches:
    print(f'\'{match}\' {count} times')
Output:
'abcd' 2 times
'text' 2 times
'samp' 2 times
'sampl' 2 times
'sample' 2 times
'ampl' 2 times
'ample' 2 times
'mple' 2 times
Here's a Python3-friendly solution:
from collections import Counter

min_str_length = 4
mystring = "abcdthisisatextwithsampletextforasampleabcd"

all_substrings = [mystring[start_index:][:end_index + 1]
                  for start_index in range(len(mystring))
                  for end_index in range(len(mystring[start_index:]))]
counted_substrings = Counter(all_substrings)
not_counted_final_candidates = [item[0]
                                for item in counted_substrings.most_common()
                                if item[1] > 1 and len(item[0]) >= min_str_length]
counted_final_candidates = {item: counted_substrings[item]
                            for item in not_counted_final_candidates}
print(counted_final_candidates)
Bonus: largest string
sub_sub_strings = [substring1 for substring1 in not_counted_final_candidates
                   for substring2 in not_counted_final_candidates
                   if substring1 != substring2 and substring1 in substring2]
largest_common_string = list(set(not_counted_final_candidates) - set(sub_sub_strings))
Everything as a function:
from collections import Counter

def get_repeated_strings(input_string, min_str_length=2, calculate_largest_repeated_string=True):
    all_substrings = [input_string[start_index:][:end_index + 1]
                      for start_index in range(len(input_string))
                      for end_index in range(len(input_string[start_index:]))]
    counted_substrings = Counter(all_substrings)
    not_counted_final_candidates = [item[0]
                                    for item in counted_substrings.most_common()
                                    if item[1] > 1 and len(item[0]) >= min_str_length]
    counted_final_candidates = {item: counted_substrings[item]
                                for item in not_counted_final_candidates}

    ### This is just a bit of bonus code for calculating the largest repeating string
    if calculate_largest_repeated_string == True:
        sub_sub_strings = [substring1 for substring1 in not_counted_final_candidates
                           for substring2 in not_counted_final_candidates
                           if substring1 != substring2 and substring1 in substring2]
        largest_common_strings = list(set(not_counted_final_candidates) - set(sub_sub_strings))
        return counted_final_candidates, largest_common_strings
    else:
        return counted_final_candidates
Example:
mystring = "abcdthisisatextwithsampletextforasampleabcd"
print(get_repeated_strings(mystring, min_str_length= 4))
Output:
({'abcd': 2, 'text': 2, 'samp': 2, 'sampl': 2, 'sample': 2, 'ampl': 2, 'ample': 2, 'mple': 2}, ['abcd', 'text', 'sample'])
CODE:
pattern = "abcdthisisatextwithsampletextforasampleabcd"
string_more_4 = []
k = 4
while k <= len(pattern):
    for i in range(len(pattern)):
        if pattern[i:k + i] not in string_more_4 and len(pattern[i:k + i]) >= 4:
            string_more_4.append(pattern[i:k + i])
    k += 1
for i in string_more_4:
    if pattern.count(i) >= 2:
        print(i + " -> " + str(pattern.count(i)) + " times")
OUTPUT:
abcd -> 2 times
text -> 2 times
samp -> 2 times
ampl -> 2 times
mple -> 2 times
sampl -> 2 times
ample -> 2 times
sample -> 2 times
Hope this helps, as my code is short and easy to understand. Cheers!
This is in Python 2 because I'm not doing Python 3 at this time. So you'll have to adapt it to Python 3 yourself.
#!python2

# import module
from collections import Counter

# get the indices
def getIndices(length):
    # holds the indices
    specific_range = []; all_sets = []
    # start building the indices
    for i in range(0, length - 2):
        # build a set of indices of a specific range
        for j in range(1, length + 2):
            specific_range.append([j - 1, j + i + 3])
            # append 'specific_range' to 'all_sets', reset 'specific_range'
            if specific_range[j - 1][1] == length:
                all_sets.append(specific_range)
                specific_range = []
                break
    # return all of the calculated indices ranges
    return all_sets

# store search strings
tmplst = []; combos = []; found = []

# string to be searched
mystring = "abcdthisisatextwithsampletextforasampleabcd"
# mystring = "abcdthisisatextwithtextsampletextforasampleabcdtext"

# get length of string
length = len(mystring)

# get all of the indices ranges, 4 and greater
all_sets = getIndices(length)

# get the search string combinations
for sublst in all_sets:
    for subsublst in sublst:
        tmplst.append(mystring[subsublst[0]: subsublst[1]])
    combos.append(tmplst)
    tmplst = []

# search for matching string patterns
for sublst in all_sets:
    for subsublst in sublst:
        for sublstitems in combos:
            if mystring[subsublst[0]: subsublst[1]] in sublstitems:
                found.append(mystring[subsublst[0]: subsublst[1]])

# make a dictionary containing the strings and their counts
d1 = Counter(found)

# filter out counts of 2 or more and print them
for k, v in d1.items():
    if v > 1:
        print k, v
$ cat test.py
import collections
import sys

S = "abcdthisisatextwithsampletextforasampleabcd"


def find(s, min_length=4):
    """
    Find repeated character sequences in a provided string.

    Arguments:
    s -- the string to be searched
    min_length -- the minimum length of the sequences to be found
    """
    counter = collections.defaultdict(int)
    # A repeated sequence can't be longer than half the length of s
    sequence_length = len(s) // 2
    # populate counter with all possible sequences
    while sequence_length >= min_length:
        # Iterate over the string until the number of remaining characters is
        # fewer than the length of the current sequence.
        for i, x in enumerate(s[:-(sequence_length - 1)]):
            # Window across the string, getting slices
            # of length == sequence_length.
            candidate = s[i:i + sequence_length]
            counter[candidate] += 1
        sequence_length -= 1
    # Report.
    for k, v in counter.items():
        if v > 1:
            print('{} {} times'.format(k, v))
    return


if __name__ == '__main__':
    try:
        s = sys.argv[1]
    except IndexError:
        s = S
    find(s)
$ python test.py
sample 2 times
sampl 2 times
ample 2 times
abcd 2 times
text 2 times
samp 2 times
ampl 2 times
mple 2 times
This is my approach to this problem:
def get_repeated_words(string, minimum_len):
    # Storing count of repeated words in this dictionary
    repeated_words = {}
    # Traversing till last but 4th element
    # Actually leaving `minimum_len` elements at end (in this case it's 4)
    for i in range(len(string) - minimum_len):
        # Starting with a length of 4 (`minimum_len`) and going till end of string
        for j in range(i + minimum_len, len(string)):
            # getting the current word
            word = string[i:j]
            # counting the occurrences of the word
            word_count = string.count(word)
            if word_count > 1:
                # storing in dictionary along with its count if found more than once
                repeated_words[word] = word_count
    return repeated_words


if __name__ == '__main__':
    mystring = "abcdthisisatextwithsampletextforasampleabcd"
    result = get_repeated_words(mystring, 4)
This is how I would do it, but I don't know any other way:
string = "abcdthisisatextwithsampletextforasampleabcd"
l = len(string)
occurences = {}
for i in range(4, l):
    for start in range(l - i):
        substring = string[start:start + i]
        occurences[substring] = occurences.get(substring, 0) + 1
for key in occurences.keys():
    if occurences[key] > 1:
        print("'" + key + "'", str(occurences[key]), "times")
Output:
'sample' 2 times
'ampl' 2 times
'sampl' 2 times
'ample' 2 times
'samp' 2 times
'mple' 2 times
'text' 2 times
Efficient, no, but easy to understand, yes.
Here is a simple solution using the more_itertools library.
Given
import collections as ct
import more_itertools as mit
s = "abcdthisisatextwithsampletextforasampleabcd"
lbound, ubound = len("abcd"), len(s)
Code
windows = mit.flatten(mit.windowed(s, n=i) for i in range(lbound, ubound))
filtered = {"".join(k): v for k, v in ct.Counter(windows).items() if v > 1}
filtered
Output
{'abcd': 2,
'text': 2,
'samp': 2,
'ampl': 2,
'mple': 2,
'sampl': 2,
'ample': 2,
'sample': 2}
Details
The procedures are:
build sliding windows of varying sizes lbound <= n < ubound
count all occurrences and filter replicates
more_itertools is a third-party package, installed via pip install more_itertools.
s = 'abcabcabcdabcd'
d = {}

def get_repeats(s, l):
    # note: + 1 so the final window of length l is included
    for i in range(len(s) - l + 1):
        ss = s[i: i + l]
        if ss not in d:
            d[ss] = 1
        else:
            d[ss] = d[ss] + 1
    return d

get_repeats(s, 3)

How to implement mapreduce pairs pattern in python

I am trying to implement the MapReduce pairs pattern in Python. I need to check whether a word is in a text file, then find the word next to it and yield a pair of both words. I keep running into either:
neighbors = words[words.index(w) + 1]
ValueError: substring not found
or
ValueError: ("the") is not in list
file cwork_trials.py
from mrjob.job import MRJob


class MRCountest(MRJob):
    # Word count
    def mapper(self, _, document):
        # Assume document is a list of words.
        #words = []
        words = document.strip()
        w = "the"
        neighbors = words.index(w)
        for word in words:
            #searchword = "the"
            #wor.append(str(word))
            #neighbors = words[words.index(w) + 1]
            yield (w, 1)

    def reducer(self, w, values):
        yield (w, sum(values))


if __name__ == '__main__':
    MRCountest.run()
Edit:
I'm trying to use the pairs pattern to search a document for every instance of a specific word and then find the word next to it each time, yielding a pair for each instance, i.e. finding instances of "the" and the word next to it: [the, book], [the, cat], etc.
from mrjob.job import MRJob


class MRCountest(MRJob):
    # Word count
    def mapper(self, _, document):
        # Assume document is a list of words.
        #words = []
        words = document.split(" ")
        want = "the"
        for w, want in enumerate(words, 1):
            if (w + 1) < len(words):
                neighbors = words[w + 1]
                pair = (want, neighbors)
                for u in neighbors:
                    if want is "the":
                        #pair = (want, neighbors)
                        yield (pair), 1
        #neighbors = words.index(w)
        #for word in words:
        #    searchword = "the"
        #    wor.append(str(word))
        #    neighbors = words[words.index(w) + 1]
        #yield (w, 1)

    #def reducer(self, w, values):
    #    yield (w, sum(values))


if __name__ == '__main__':
    MRCountest.run()
As it stands, I get yields of every word pair, with multiples of the same pairing.
When you use words.index("the"), you will only get the first instance of "the" in your list or string, and, as you have found, you will get an error if "the" isn't present.
Also, you mention that you are trying to produce pairs, but you only yield a single word.
I think what you are trying to do is something more like this:
def get_word_pairs(words):
    for i, word in enumerate(words):
        if (i + 1) < len(words):
            yield (word, words[i + 1]), 1
        if i > 0:  # look backwards only when a previous word exists
            yield (word, words[i - 1]), 1
assuming you are interested in neighbours in both directions. (If not, you only need the first yield.)
Lastly, since you use document.strip(), I suspect that document is in fact a string and not a list. If that's the case, you can use words = document.split(" ") to get the word list, assuming you don't have any punctuation.
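A quick usage sketch (the sample sentence is made up), aggregating the yielded pairs with a Counter and keeping only those that start with the search word:

from collections import Counter

words = "the cat sat on the mat".split(" ")
counts = Counter(pair for pair, _ in get_word_pairs(words))
print({p: c for p, c in counts.items() if p[0] == "the"})
# {('the', 'cat'): 1, ('the', 'mat'): 1, ('the', 'on'): 1}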
