This question refers to this problem on lintcode. I have a working solution, but it takes too long on the huge test case. I am wondering how it can be improved. Maybe I can decrease the number of comparisons I make in the outer loop.
class Solution:
    # @param strs: A list of strings
    # @return: A list of strings
    def anagrams(self, strs):
        # write your code here
        ret = set()
        for i in range(0, len(strs)):
            for j in range(i + 1, len(strs)):
                if i in ret and j in ret:
                    continue
                if Solution.isanagram(strs[i], strs[j]):
                    ret.add(i)
                    ret.add(j)
        return [strs[i] for i in list(ret)]

    @staticmethod
    def isanagram(s, t):
        if len(s) != len(t):
            return False
        chars = {}
        for i in s:
            if i in chars:
                chars[i] += 1
            else:
                chars[i] = 1
        for i in t:
            if i not in chars:
                return False
            else:
                chars[i] -= 1
                if chars[i] < 0:
                    return False
        for i in chars:
            if chars[i] != 0:
                return False
        return True
Update: Just to add, I'm not looking for built-in Pythonic solutions such as Counter, which are already optimized. I have added Mike's suggestions, but the solution still exceeds the time limit.
Skip strings you already placed in the set. Don't test them again.
# @param strs: A list of strings
# @return: A list of strings
def anagrams(self, strs):
    # write your code here
    ret = set()
    for i in range(0, len(strs)):
        for j in range(i + 1, len(strs)):
            # If both anagrams exist in the set, there is no need to compare them.
            if i in ret and j in ret:
                continue
            if Solution.isanagram(strs[i], strs[j]):
                ret.add(i)
                ret.add(j)
    return [strs[i] for i in list(ret)]
You can also do a length comparison in your anagram test before iterating through the letters. Whenever the strings aren't the same length, they can't be anagrams anyway. Also, when a counter in chars drops below zero while comparing the values in t, just return False; don't iterate through chars again.
@staticmethod
def isanagram(s, t):
    # Test strings are the same length
    if len(s) != len(t):
        return False
    chars = {}
    for i in s:
        if i in chars:
            chars[i] += 1
        else:
            chars[i] = 1
    for i in t:
        if i not in chars:
            return False
        else:
            chars[i] -= 1
            # If this is below 0, return False
            if chars[i] < 0:
                return False
    for i in chars:
        if chars[i] != 0:
            return False
    return True
Instead of comparing all pairs of strings, you can just create a dictionary (or collections.defaultdict) mapping each of the letter-counts to the words having those counts. For getting the letter-counts, you can use collections.Counter. Afterwards, you just have to get the values from that dict. If you want all words that are anagrams of any other words, just merge the lists that have more than one entry.
from collections import Counter, defaultdict

strings = ["cat", "act", "rat", "hut", "tar", "tact"]
anagrams = defaultdict(list)
for s in strings:
    anagrams[frozenset(Counter(s).items())].append(s)
print([v for v in anagrams.values()])
# [['hut'], ['rat', 'tar'], ['cat', 'act'], ['tact']]
print([x for v in anagrams.values() if len(v) > 1 for x in v])
# ['cat', 'act', 'rat', 'tar']
Of course, if you prefer not to use built-in functionality, with just a few more lines you can use a regular dict instead of defaultdict and write your own Counter, similar to what you already have in your isanagram method, just without the comparison part.
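For illustration, here is a minimal sketch of that plain-dict variant (the helper name count_letters is mine, not something from the answer):

def count_letters(word):
    # Hand-rolled character count, like the first loop of isanagram.
    counts = {}
    for ch in word:
        counts[ch] = counts.get(ch, 0) + 1
    return counts

strings = ["cat", "act", "rat", "hut", "tar", "tact"]
anagrams = {}
for s in strings:
    # The sorted items of the count dict form a hashable grouping key.
    key = tuple(sorted(count_letters(s).items()))
    anagrams.setdefault(key, []).append(s)

print([w for ws in anagrams.values() if len(ws) > 1 for w in ws])
# ['cat', 'act', 'rat', 'tar']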
Your solution is slow because you're not taking advantage of python's data structures.
Here's a solution that collects results in a dict:
class Solution:
    def anagrams(self, strs):
        d = {}
        for word in strs:
            key = tuple(sorted(word))
            try:
                d[key].append(word)
            except KeyError:
                d[key] = [word]
        return [w for ws in d.values() for w in ws if len(ws) > 1]
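A quick usage check (the example input is mine); note that the output order relies on dicts preserving insertion order (Python 3.7+):

print(Solution().anagrams(["eat", "tea", "tan", "ate", "nat", "bat"]))
# ['eat', 'tea', 'ate', 'tan', 'nat']  -- 'bat' is dropped because its group has only one member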
As an addition to @Mike's great answer, here is a nice Pythonic way to do it:
import collections

class Solution:
    # @param strs: A list of strings
    # @return: A list of strings
    def anagrams(self, strs):
        patterns = Solution.find_anagram_words(strs)
        return [word for word in strs if ''.join(sorted(word)) in patterns]

    @staticmethod
    def find_anagram_words(strs):
        anagrams = collections.Counter(''.join(sorted(word)) for word in strs)
        return {word for word, times in anagrams.items() if times > 1}
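As a quick check (again, the example input is mine), this keeps only the words whose sorted-letter pattern occurs more than once, in their original order:

print(Solution().anagrams(["eat", "tea", "tan", "ate", "nat", "bat"]))
# ['eat', 'tea', 'tan', 'ate', 'nat']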
Why not this?
str1 = "cafe"
str2 = "face"
def isanagram(s1,s2):
return all(sorted(list(str1)) == sorted(list(str2)))
if isanagram(str1, str2):
print "Woo"
The same can be done with a single line of code if you are using LINQ in C#:
string[] strs; // input string array
var result = strs.GroupBy(x => new string(x.ToCharArray().OrderBy(z => z).ToArray())).Select(g => g.ToList()).ToList();
Now, to group anagrams in Python, we have to: sort each word in the list, then create a dictionary. The dictionary tells us where those anagrams are: its values are the indices of the anagrams in the original list.
def groupAnagrams(words):
    # sort each word in the list
    A = [''.join(sorted(word)) for word in words]
    dict = {}
    for indexofsamewords, names in enumerate(A):
        dict.setdefault(names, []).append(indexofsamewords)
    print(dict)
    # {'AOOPR': [0, 2, 5, 11, 13], 'ABTU': [1, 3, 4], 'Sorry': [6], 'adnopr': [7], 'Sadioptu': [8, 16], ' KPaaehiklry': [9], 'Taeggllnouy': [10], 'Leov': [12], 'Paiijorty': [14, 18], 'Paaaikpr': [15], 'Saaaabhmryz': [17], ' CNaachlortttu': [19], 'Saaaaborvz': [20]}
    for index in dict.values():
        print([words[i] for i in index])

if __name__ == '__main__':
    # list of words
    words = ["ROOPA", "TABU", "OOPAR", "BUTA", "BUAT", "PAROO", "Soudipta",
             "Kheyali Park", "Tollygaunge", "AROOP", "Love", "AOORP", "Protijayi", "Paikpara", "dipSouta", "Shyambazaar",
             "jayiProti", "North Calcutta", "Sovabazaar"]
    groupAnagrams(words)
The Output :
['ROOPA', 'OOPAR', 'PAROO', 'AROOP', 'AOORP']
['TABU', 'BUTA', 'BUAT']
['Soudipta', 'dipSouta']
['Kheyali Park']
['Tollygaunge']
['Love']
['Protijayi', 'jayiProti']
['Paikpara']
['Shyambazaar']
['North Calcutta']
['Sovabazaar']
Imagine we have the following list of strings:
Input: ["eat", "tea", "tan", "ate", "nat", "bat"]
The output of our program should group each set of anagrams and return them all together as a list, as follows:
Output:
[
  ["ate", "eat", "tea"],
  ["nat", "tan"],
  ["bat"]
]
My current solution finds the first set of anagrams, but fails to detect the other two and instead duplicates the first group in the list:
class Solution(object):
    def groupAnagrams(self, strs):
        allResults = []
        results = []
        temp = ''
        for s in strs:
            temp = s[1:] + s[:1]
            for i in range(0, len(strs)):
                if temp == strs[i]:
                    results.append(strs[i])
            allResults.append(results)
        return allResults
and the output is:
[["ate","eat","tea"],["ate","eat","tea"],["ate","eat","tea"],["ate","eat","tea"],["ate","eat","tea"],["ate","eat","tea"]]
How to fix this issue?
EDIT:
I have fixed the duplication by appending results to allResults outside of the second loop:
class Solution(object):
    def groupAnagrams(self, strs):
        allResults = []
        results = []
        temp = ''
        for s in strs:
            temp = s[1:] + s[:1]
            for i in range(0, len(strs)):
                if temp == strs[i]:
                    results.append(strs[i])
        allResults.append(results)
        print(results)
        return allResults
Yet, it does not detect the other two sets of anagrams.
You can do it using defaultdict from Python's built-in collections library and sorted:
In [1]: l = ["eat", "tea", "tan", "ate", "nat", "bat"]
In [2]: from collections import defaultdict
In [3]: d = defaultdict(list)
In [4]: for x in l:
   ...:     d[str(sorted(x))].append(x)
   ...:
In [5]: d.values()
Out[5]: dict_values([['eat', 'tea', 'ate'], ['tan', 'nat'], ['bat']])
To fix your solution, you need to add a variable that tracks whether a word has already been added, for example (and while walking through strs I use enumerate for a small performance gain when searching for the anagrams):
class Solution(object):
    def groupAnagrams(self, strs):
        allResults = []
        added = set([])
        for i, s in enumerate(strs):
            results = []
            unique_s = "".join(sorted(s))
            if unique_s in added:
                continue
            else:
                added.add(unique_s)
            for x in strs[i:]:
                if unique_s == "".join(sorted(x)):
                    results.append(x)
            allResults.append(results)
        print(added)
        return allResults
Use itertools.groupby
>>> lst = ["eat", "tea", "tan", "ate", "nat", "bat"]
>>>
>>> from itertools import groupby
>>> f = lambda w: sorted(w)
>>> [list(v) for k,v in groupby(sorted(lst, key=f), f)]
[['bat'], ['eat', 'tea', 'ate'], ['tan', 'nat']]
Using only lists, as requested in the title of the question:
The second line, s_words, takes all the letters of each word in words, sorts them, and recreates a string composed of the sorted letters; it builds a list of all these sorted-letter strings, in the same order as the original sequence of words. This will be used to compare the possible anagrams (the letters of anagrams produce the same string when sorted).
The third line, indices, holds True or False values, to indicate whether the corresponding word has already been extracted, and so avoid duplicates.
The following code is a double loop that, for each s_word, determines which other s_words are identical and uses their indices to retrieve the corresponding words in the original list; it also updates the truth values in indices.
words = ["eat", "tea", "tan", "ate", "nat", "bat"]
s_words = [''.join(sorted(list(word))) for word in words]
indices = [False for _ in range(len(words))]
anagrams = []
for idx, s_word in enumerate(s_words):
if indices[idx]:
continue
ana = [words[idx]]
for jdx, word in enumerate(words):
if idx != jdx and not indices[jdx] and s_word == s_words[jdx]:
ana.append(words[jdx])
indices[jdx] = True
anagrams.append(ana)
print(anagrams)
output:
[['eat', 'tea', 'ate'], ['tan', 'nat'], ['bat']]
The way you implemented your function, you are only looking at rotations of the strings (that is, you shift a letter from the beginning to the end, e.g. a-t-e -> t-e-a -> e-a-t). What your algorithm cannot detect are permutations where you only switch two letters (n-a-t -> t-a-n). In mathematical language, you only consider the even permutations of the three-letter strings and not the odd permutations.
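As a quick illustration of that limitation (this sketch is mine, not part of the original answer): generating only the rotations of "nat" never produces "tan", while the full permutation set does.

from itertools import permutations

word = "nat"
rotations = {word[i:] + word[:i] for i in range(len(word))}
print(rotations)                                            # e.g. {'nat', 'atn', 'tna'} (order may vary)
print("tan" in rotations)                                   # False: rotations alone miss it
print("tan" in {''.join(p) for p in permutations(word)})    # True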
A modification of your code could for example be:
def get_list_of_permutations(input_string):
    list_out = []
    if len(input_string) > 1:
        first_char = input_string[0]
        remaining_string = input_string[1:]
        remaining_string_permutations = get_list_of_permutations(remaining_string)
        for i in range(len(remaining_string) + 1):
            for permutation in remaining_string_permutations:
                list_out.append(permutation[0:i] + first_char + permutation[i:])
    else:
        return [input_string]
    return list_out

def groupAnagrams(strs):
    allResults = []
    for s in strs:
        results = []
        list_of_permutations = get_list_of_permutations(s)
        for i in range(0, len(strs)):
            if strs[i] in list_of_permutations:
                results.append(strs[i])
        if results not in allResults:
            allResults.append(results)
    return allResults
The output is
Out[218]: [['eat', 'tea', 'ate'], ['tan', 'nat'], ['bat']]
Edit: modified the code to work with all lengths of strings.
https://docs.python.org/3/library/itertools.html#itertools.permutations
from itertools import permutations

word_list = ["eat", "tea", "tan", "ate", "nat", "bat"]
anagram_group_list = []

for word in word_list:
    if word == None:
        pass
    else:
        anagram_group_list.append([])
        for anagram in permutations(word):
            anagram = ''.join(anagram)
            try:
                idx = word_list.index(anagram)
                word_list[idx] = None
                anagram_group_list[-1].append(anagram)
            except ValueError:
                pass  # this anagram is not present in word_list

print(anagram_group_list)
# [['eat', 'ate', 'tea'], ['tan', 'nat'], ['bat']]
After refactoring the code and stopping it from producing redundant results, your code still doesn't give the expected output, as the logic for producing anagrams is not completely correct:
def groupAnagrams(word_list):
    allResults = []
    results = []
    for idx, s in enumerate(word_list):
        if s == None:
            pass
        else:
            results = [s]  # word s is added to the anagram list
            # you were generating only one kind of anagram: for tan --> ant, but in word_list only nat was present
            for i in range(1, len(s), 1):
                temp = s[i:] + s[:i]  # anagram
                # for s = 'tan' it generates only 'ant' and 'nta'
                # when it should generate all six: tna ant nta _nat_ atn tan
                if temp in word_list:
                    results.append(temp)
                    word_list[word_list.index(temp)] = None
            allResults.append(results)
    return allResults

print(groupAnagrams(["eat", "tea", "tan", "ate", "nat", "bat"]))
# [['eat', 'ate', 'tea'], ['tan'], ['nat'], ['bat']]
The detection of anagrams of words consisting of unique characters can be done by comparison between sets. [See the edit below for a general solution.]
words = ["eat", "tea", "tan", "ate", "nat", "bat"]
anagrams = []
for w in words:
m = [w2 for w2 in words if set(w2) == set(w)]
if m not in anagrams:
anagrams += [m]
print(anagrams)
Output
[['eat', 'tea', 'ate'], ['tan', 'nat'], ['bat']]
EDIT
For words with duplicate characters a multi-set approach can be used. A multi-set can be modeled with collections.Counter.
from collections import Counter

words = ["eat", "tea", "tan", "ate", "nat", "bat", "cia", "aci"]

# group per index
d = {}
multi_sets = list(map(Counter, words))
for i, w in enumerate(words):
    i_reference = multi_sets.index(Counter(w))  # always 1st match
    d.setdefault(i_reference, []).append(words[i])
anagrams = list(d.values())

# inplace sort: group per size of family of anagrams
anagrams.sort(key=len, reverse=True)
print(anagrams)
Remark: ordering a multi-set is highly non-trivial and the usual methods __lt__, __gt__, ... are not implemented. As a consequence sorted cannot be used. Comparison is still possible with __eq__ or __ne__ which are both naturally supported by Counter.
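To make the remark concrete, a small sketch (my own addition): two Counters can be tested for equality even though they offer no total ordering for sorting.

from collections import Counter

# Equality compares the underlying multi-sets, so anagrams match:
print(Counter("tan") == Counter("nat"))    # True
print(Counter("tan") == Counter("tann"))   # False: different multiplicities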
So I am trying to implement code that will count the next letter in a sentence, using python.
so for instance,
"""So I am trying to implement code that will count the next letter in a sentence, using
python"""
most common letters one after the other
for 's'
    'o': 1
    'e': 1
for 'o'
    ' ': 1
    'd': 1
    'u': 1
    'n': 1
I think you get the idea
I have already written code for counting letters:
def count_letters(word, char):
    count = 0
    for c in word:
        if char == c:
            count += 1
    return count
As you can see, this just counts letters, not the next letter. Can someone give me a hand with this one?
from collections import Counter, defaultdict

counts = defaultdict(Counter)

s = """So I am trying to implement code that will count the next letter in a sentence, using
python""".lower()

for c1, c2 in zip(s, s[1:]):
    counts[c1][c2] += 1
(apart from being simpler, this should be significantly faster than pault's answer by not iterating over the string for every letter)
Concepts to google that aren't named in the code:
for c1, c2 in ... (namely the fact that there are two variables): tuple unpacking
s[1:]: slicing. Basically this is a copy of the string after the first character.
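Continuing from the snippet above, querying the nested mapping looks like this (the expected counts are my own tally of the example sentence):

print(counts['s'])
# Counter({'o': 1, 'e': 1, 'i': 1})   ('s' is followed by 'o' in "so", 'e' in "sentence", 'i' in "using")
print(counts['o'])
# Counter({' ': 2, 'd': 1, 'u': 1, 'n': 1})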
Here is a relatively terse way to do it:
from itertools import groupby
from collections import Counter

def countTransitionFrequencies(text):
    prevNext = list(zip(text[:-1], text[1:]))
    prevNext.sort(key=lambda pn: pn[0])
    transitions = groupby(prevNext, lambda pn: pn[0])
    freqs = map(
        lambda kts: (kts[0], Counter(map(lambda kv: kv[1], kts[1]))),
        transitions
    )
    return freqs
Explanation:
zip creates list of pairs with (previous, next) characters
The pairs are sorted and grouped by the previous character
The frequencies of the next characters (extracted from pairs by kv[1]) are then counted using Counter.
Sorting is not really necessary, but unfortunately, this is how the provided groupby works.
An example:
for k, v in countTransitionFrequencies("hello world"):
    print("%r -> %r" % (k, v))
This prints:
' ' -> Counter({'w': 1})
'e' -> Counter({'l': 1})
'h' -> Counter({'e': 1})
'l' -> Counter({'l': 1, 'o': 1, 'd': 1})
'o' -> Counter({' ': 1, 'r': 1})
'r' -> Counter({'l': 1})
'w' -> Counter({'o': 1})
Here's a way using collections.Counter:
Suppose the string you provided was stored in a variable s.
First we iterate over the set of unique characters in s. We do this by making another string, s_lower, which converts s to lowercase. We then wrap this with the set constructor to get unique values.
For each char, we iterate through the string and check to see if the previous letter is equal to char. If so, we store this in a list. Finally, we pass this list into the collections.Counter constructor which will count the occurrences.
Each counter is stored in a dictionary, counts, where the keys are the unique characters in the string.
from collections import Counter

counts = {}
s_lower = s.lower()
for char in set(s_lower):
    counts[char] = Counter(
        [c for i, c in enumerate(s_lower) if i > 0 and s_lower[i-1] == char]
    )
For your string, this has the following outputs:
>>> print(counts['s'])
#Counter({'i': 1, 'e': 1, 'o': 1})
>>> print(counts['o'])
#Counter({' ': 2, 'd': 1, 'n': 1, 'u': 1})
One caveat is that this method will iterate through the whole string for each unique character, which could potentially make it slow for large lists.
Here is an alternative approach using collections.Counter and collections.defaultdict that only loops through the string once:
from collections import defaultdict, Counter

def count_letters(s):
    s_lower = s.lower()
    counts = defaultdict(Counter)
    for i in range(len(s_lower) - 1):
        curr_char = s_lower[i]
        next_char = s_lower[i+1]
        counts[curr_char].update(next_char)
    return counts

counts = count_letters(s)
We loop over each character in the string (except the last) and on each iteration we update a counter using the next character.
This should work; the only thing is that it doesn't sort the values, but that can be solved by turning each inner dictionary into a list of (char, occurrences) tuples and sorting it on the count (a sketch follows after the function below).
def countNext(word):
    d = {}
    word = word.lower()
    for i in range(len(word) - 1):
        c = word[i]
        cc = word[i+1]
        if not c.isalpha() or not cc.isalpha():
            continue
        if c in d:
            if cc in d[c]:
                d[c][cc] += 1
            else:
                d[c][cc] = 1
        else:
            d[c] = {}
            d[c][cc] = 1
    return d
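A minimal sketch of the sorting step mentioned above (my own addition, not part of the answer): turn each inner dict into a list of (char, occurrences) tuples and sort by the count.

d = countNext("so i am trying to implement code that will count the next letter")
sorted_d = {c: sorted(nxt.items(), key=lambda pair: pair[1], reverse=True)
            for c, nxt in d.items()}
print(sorted_d['t'])  # pairs for 't', most frequent successor first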
I have an algorithm problem with Python and strings.
My issue:
My function should split a string into substrings and maximize the sum of their values.
For example:
ae-afi-re-fi -> 2+6+3+5=16
but
ae-a-fi-re-fi -> 2-10+5+3+5=5
I tried using the string.count function and counting substrings, but this method is not good.
What would be the best way to do this in Python? Thanks in advance.
string = "aeafirefi"
Sum the value of substrings.
In my solution I'll use permutations from the itertools module to list all the possible orderings of the substrings that you gave in your question, stored in a dict called vals. Then I iterate through the input string and split it by each of those permutations, sum the values of each split, and finally take the max.
PS: The key of this solution is the get_sublists() method.
This is an example with some tests:
from itertools import permutations

def get_sublists(a, perm_vals):
    # Find the sublists in the input string
    # based on the permutations of the dict vals.keys()
    for k in perm_vals:
        if k in a:
            a = ''.join(a.split(k))
            # Yield the sublist if we found any
            yield k

def sum_sublists(a, sub, vals):
    # Join the sublist and compare it to the input string
    # Get the difference by length
    diff = len(a) - len(''.join(sub))
    # Sum the value of each sublist (on every permutation)
    return sub, sum(vals[k] for k in sub) - diff * 10

def get_max_sum_sublists(a, vals):
    # Get all the possible permutations
    perm_vals = permutations(vals.keys())
    # Remove duplicates if there are any
    sub = set(tuple(get_sublists(a, k)) for k in perm_vals)
    # Get the sum of each possible permutation
    aa = (sum_sublists(a, k, vals) for k in sub)
    # Return the max of the above operation
    return max(aa, key=lambda x: x[1])

vals = {'ae': 2, 'qd': 3, 'qdd': 5, 'fir': 4, 'afi': 6, 're': 3, 'fi': 5}

# Test
a = "aeafirefi"
final, s = get_max_sum_sublists(a, vals)
print("Sublists: {}\nSum: {}".format(final, s))
print('----')
a = "aeafirefiqdd"
final, s = get_max_sum_sublists(a, vals)
print("Sublists: {}\nSum: {}".format(final, s))
print('----')
a = "aeafirefiqddks"
final, s = get_max_sum_sublists(a, vals)
print("Sublists: {}\nSum: {}".format(final, s))
Output:
Sublists: ('ae', 'afi', 're', 'fi')
Sum: 16
----
Sublists: ('afi', 'ae', 'qdd', 're', 'fi')
Sum: 21
----
Sublists: ('afi', 'ae', 'qdd', 're', 'fi')
Sum: 1
Please try this solution with as many input strings as you can, and don't hesitate to comment if you find any wrong results.
Probably having a dictionary with:
key = substring: value = value
So if you have:
string = "aeafirefi"
first you look for the whole string in the dictionary; if you don't find it, you cut off the last letter, so you have "aeafiref", and so on until you find a substring or are left with a single letter.
Then you skip the letters used: for example, if you found "aeaf", you start all over again using string = "iref". A sketch of this greedy idea follows below.
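A minimal sketch of that greedy idea, assuming unknown single letters are worth -10 as in the question (this is my own illustration; greedy longest-prefix matching does not always find the true maximum):

values = {'ae': 2, 'afi': 6, 're': 3, 'fi': 5}

def greedy_sum(s, values, penalty=-10):
    total = 0
    while s:
        # Try the longest prefix first, shrinking one letter at a time.
        for end in range(len(s), 0, -1):
            prefix = s[:end]
            if prefix in values:
                total += values[prefix]
                s = s[end:]
                break
        else:
            # No known prefix: spend one letter at the penalty cost.
            total += penalty
            s = s[1:]
    return total

print(greedy_sum("aeafirefi", values))  # 16  (ae + afi + re + fi)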
Here's a brute force solution:
values_dict = {
    'ae': 2,
    'qd': 3,
    'qdd': 5,
    'fir': 4,
    'afi': 6,
    're': 3,
    'fi': 5
}

def get_value(x):
    return values_dict[x] if x in values_dict else -10

def next_tokens(s):
    """Returns possible tokens"""
    # Return any tokens in values_dict
    for x in values_dict.keys():
        if s.startswith(x):
            yield x
    # Return single character.
    yield s[0]

def permute(s, stack=[]):
    """Returns all possible variations"""
    if len(s) == 0:
        yield stack
        return
    for token in next_tokens(s):
        perms = permute(s[len(token):], stack + [token])
        for perm in perms:
            yield perm

def process_string(s):
    def process_tokens(tokens):
        return sum(map(get_value, tokens))
    return max(map(process_tokens, permute(s)))
print('Max: {}'.format(process_string('aeafirefi')))
Can anyone here please explain this code with an example, if possible? What is this code doing?
def sort_by_length(words):
    t = []
    for word in words:
        t.append((len(word), word))
    t.sort(reverse=True)

    res = []
    for length, word in t:
        res.append(word)
    return res
And what is the meaning of reverse=True? What does reverse do? I understand what len, the append method, and return mean, but what did he mean by reverse?
It is returning a list of words, sorted longest-to-shortest then z-to-a.
You could do the same thing with just
def sort_by_length(words):
    return sorted(words, key=lambda w: (len(w), w), reverse=True)
It might make more sense to sort longest-to-shortest, a-to-z, which would be
def sort_by_length(words):
    return sorted(words, key=lambda w: (-len(w), w))
def sort_by_length(words):
    t = []  # empty list
    for word in words:  # iterating over the given words
        t.append((len(word), word))  # appending each word into the list "t" as a tuple, e.g. the word "hello" as (5, "hello")
    t.sort(reverse=True)  # sorts all tuples in reverse order

    res = []
    for length, word in t:
        res.append(word)  # extracts just the words out of the tuples, e.g. (5, "hello") => "hello"
    return res  # return the words, ordered
Sort words by length
w = ["abcd", "za", "wyya", "dssffgdg"]
print(sort_by_length(w))
http://ideone.com/jRzatV
My problem is the following. I have a long list of URLs such as:
www.foo.com/davidbobmike1joe
www.foo.com/mikejoe2bobkarl
www.foo.com/joemikebob
www.foo.com/bobjoe
I need to compare all the entries (URLs) in that list with each other, extract the keywords in the subdomains of those URLs (in this case: david, joe, bob, mike, karl) and order them by frequency. I've been reading about several libraries such as nltk. However the problem here is that there are no spaces to tokenise each word independently. Any recommendations on how to get the job done?
Limitations
If you refuse to use a dictionary, your algorithm will require a lot of computation. On top of that, it is impossible to distinguish a keyword that occurs only once (e.g. "karl") from a junk sequence (e.g. "e2bo"). My solution is a best effort and will only work if your list of URLs contains keywords multiple times.
The basic idea
I assume a word is a frequently occurring sequence of at least 3 characters. This prevents the letter "o" from being the most popular word.
The basic idea is the following.
Count all n-letter sequences and select the ones that occur multiple times.
Cut all sequences that are a part of a larger sequence.
Order them by popularity and you have a solution that comes close to solving your problem. (Left as an exercise to the reader)
In code
import operator

sentences = ["davidbobmike1joe", "mikejoe2bobkarl", "joemikebob", "bobjoe", "bobbyisawesome", "david", "bobbyjoe"]
dict = {}

def countWords(n):
    """Count all possible character sequences/words of length n occurring in all given sentences"""
    for sentence in sentences:
        countWordsSentence(sentence, n)

def countWordsSentence(sentence, n):
    """Count all possible character sequences/words of length n occurring in a sentence"""
    for i in range(0, len(sentence) - n + 1):
        word = sentence[i:i+n]
        if word not in dict:
            dict[word] = 1
        else:
            dict[word] = dict[word] + 1

def cropDictionary():
    """Removes all words that occur only once."""
    for key in dict.keys():
        if dict[key] == 1:
            dict.pop(key)

def removePartials(word):
    """Removes all the partial occurrences of a given word from the dictionary."""
    for i in range(3, len(word)):
        for j in range(0, len(word) - i + 1):
            for key in dict.keys():
                if key == word[j:j+i] and dict[key] == dict[word]:
                    dict.pop(key)

def removeAllPartials():
    """Removes all partial words in the dictionary"""
    for word in dict.keys():
        removePartials(word)

for i in range(3, max(map(lambda x: len(x), sentences))):
    countWords(i)
    cropDictionary()
    removeAllPartials()

print dict
Output
>>> print dict;
{'mike': 3, 'bobby': 2, 'david': 2, 'joe': 5, 'bob': 6}
Some challenges to the reader
Sort the dictionary by value before printing it (Sort a Python dictionary by value); a sketch follows after this list.
In this example "bob" occurs six times, 2 times it is a partial word of "bobby". Determine if this is problematic and fix it if necessary.
Take capitalization into account.
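A minimal sketch for the first challenge, sorting the resulting dict by value (my own addition, reusing the answer's printed counts and the operator module imported above):

import operator

counts = {'mike': 3, 'bobby': 2, 'david': 2, 'joe': 5, 'bob': 6}
ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
print(ranked)
# [('bob', 6), ('joe', 5), ('mike', 3), ('bobby', 2), ('david', 2)]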
Overview
You could use this code to extract the names, passing in a list of [david, bob, etc.]:
Is there an easy way generate a probable list of words from an unspaced sentence in python?
And then use collections.Counter to get frequencies.
The code
from Bio import trie
import string
from collections import Counter

def get_trie(words):
    tr = trie.trie()
    for word in words:
        tr[word] = len(word)
    return tr

def get_trie_word(tr, s):
    for end in reversed(range(len(s))):
        word = s[:end + 1]
        if tr.has_key(word):
            return word, s[end + 1:]
    return None, s

def get_trie_words(s):
    names = ['david', 'bob', 'karl', 'joe', 'mike']
    tr = get_trie(names)
    while s:
        word, s = get_trie_word(tr, s)
        yield word

def main(urls):
    d = Counter()
    for url in urls:
        url = "".join(a for a in url if a in string.lowercase)
        for word in get_trie_words(url):
            d[word] += 1
    return d
if __name__ == '__main__':
    urls = [
        "davidbobmike1joe",
        "mikejoe2bobkarl",
        "joemikebob",
        "bobjoe",
    ]
    print main(urls)
Results
Counter({'bob': 4, 'joe': 4, 'mike': 3, 'karl': 1, 'david': 1})