I'm trying to create a function in Python that will generate anagrams of a given word. I'm not just looking for code that rearranges the letters aimlessly; every option returned must be a real word. I currently have a solution (to be honest, I mostly took the code from a YouTube video), but it is very slow for my purposes and can only produce single-word responses to a single input word. It checks the words it goes through against a 400,000-word dictionary file called "dict.txt".
My goal is to get this code to mimic how well this website's code works:
https://wordsmith.org/anagram/
I could not find the JavaScript code when reviewing the network activity in Google Chrome's developer tools, so I believe the work is done server-side, possibly in Node.js. That alone might make it faster than Python, but given how much faster it is, I believe there is more to it than just the programming language. I assume they are using some kind of search algorithm rather than scanning each line one by one like I am. I also like that their response is not limited to a single word: it can break up the given word to offer more options to the user. For example, an anagram of "anagram" is "nag a ram".
Any suggestions or ideas would be appreciated.
Thank you.
import random

def init_words(filename):
    """Load the dictionary file into a dict of words."""
    words = {}
    with open(filename) as f:
        for line in f:
            word = line.strip()
            words[word] = 1
    return words

def init_anagram_dict(words):
    """Group words by their sorted letters; words sharing a key are anagrams."""
    anagram_dict = {}
    for word in words:
        sorted_word = ''.join(sorted(word))
        if sorted_word not in anagram_dict:
            anagram_dict[sorted_word] = []
        anagram_dict[sorted_word].append(word)
    return anagram_dict

def find_anagrams(word, anagram_dict):
    """Return all anagrams of word, excluding the word itself."""
    key = ''.join(sorted(word))
    if key in anagram_dict:
        return set(anagram_dict[key]).difference({word})
    return set()

# This is the first function called.
def make_anagram(user_word):
    lower_user_word = str(user_word).lower()
    word_dict = init_words('dict.txt')
    result = find_anagrams(lower_user_word, init_anagram_dict(word_dict.keys()))
    list_result = list(result)
    count = len(list_result)
    if count > 0:
        anagram_value = list_result[random.randint(0, count - 1)]
        return ('An anagram of %s is %s. Would you like me to search for another word?'
                % (lower_user_word, anagram_value))
    else:
        return "Sorry, I could not find an anagram for %s." % lower_user_word
You can build a dictionary of anagrams by grouping words by their sorted text. All words that have the same sorted text are anagrams of each other:
from collections import defaultdict

with open("/usr/share/dict/words", "r") as wordFile:
    words = wordFile.read().split("\n")

anagrams = defaultdict(list)
for word in words:
    anagrams["".join(sorted(word))].append(word)

aWord = "spear"
result = anagrams["".join(sorted(aWord))]
print(aWord, result)
# ['asper', 'parse', 'prase', 'spaer', 'spare', 'spear']
Using 235,000 words, the response time is effectively instantaneous: the lookup is a single dictionary access.
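If you also want to exclude the input word itself, as the original find_anagrams does, filter it out of its own bucket (a one-line tweak to the lookup above):

aWord = "spear"
# exclude the looked-up word itself from its own anagram bucket
result = [w for w in anagrams["".join(sorted(aWord))] if w != aWord]
print(aWord, result)
# spear ['asper', 'parse', 'prase', 'spaer', 'spare']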
In order to obtain multiple words forming an anagram of the specified word, you will need to get into combinatorics. A recursive function is probably the easiest way to go about it:
from itertools import combinations, product
from collections import Counter, defaultdict

with open("/usr/share/dict/words", "r") as wordFile:
    words = wordFile.read().split("\n")

anagrams = defaultdict(set)
for word in words:
    anagrams["".join(sorted(word))].add(word)
counters = {w: Counter(w) for w in anagrams}

minLen = 2  # minimum word length

def multigram(word, memo=dict()):
    sWord = "".join(sorted(word))
    if sWord in memo: return memo[sWord]
    result = anagrams[sWord]
    wordCounts = counters.get(sWord, Counter(word))  # fall back to the word's own counts
    for size in range(minLen, len(word) - minLen + 1):
        seen = set()
        for combo in combinations(word, size):
            left = "".join(sorted(combo))
            if left in seen or seen.add(left): continue
            left = multigram(left, memo)
            if not left: continue
            right = multigram("".join((wordCounts - Counter(combo)).elements()), memo)
            if not right: continue
            result.update(a + " " + b for a, b in product(left, right))
    memo[sWord] = list(result)
    return memo[sWord]
Performance is good up to 12-character words. Beyond that, the exponential number of combinations starts to take a heavy toll:
result = multigram("spear")
print(result)
# ['parse', 'asper', 'spear', 'er spa', 're spa', 'se rap', 'er sap', 'sa per', 're asp', 'ar pes', 'se par', 'pa ers', 're sap', 'er asp', 'as per', 'spare', 'spaer', 'as rep', 'sa rep', 'ra pes', 'pa ser', 'es rap', 'es par', 'prase']
len(multigram("mulberries")) # 15986 0.1 second 10 letters
len(multigram("raspberries")) # 60613 0.2 second 11 letters
len(multigram("strawberries")) # 374717 1.3 seconds 12 letters
len(multigram("tranquillizer")) # 711491 7.6 seconds 13 letters
len(multigram("communications")) # 10907666 52.2 seconds 14 letters
In order to avoid any delay, you can convert the function to a generator. This allows you to get the first few anagrams without having to generate them all:
def iMultigram(word, prefix=""):
    sWord = "".join(sorted(word))
    seen = set()
    for anagram in anagrams.get(sWord, []):
        full = prefix + anagram
        if full in seen or seen.add(full): continue
        yield full
    wordCounts = counters.get(sWord, Counter(word))
    for size in reversed(range(minLen, len(word) - minLen + 1)):  # longest first
        for combo in combinations(sWord, size):
            left = "".join(sorted(combo))
            if left in seen or seen.add(left): continue
            for left in iMultigram(left, prefix):
                right = "".join((wordCounts - Counter(combo)).elements())
                for full in iMultigram(right, left + " "):
                    if full in seen or seen.add(full): continue
                    yield full
from itertools import islice
list(islice(iMultigram("communications"),5)) # 0.0 second
# ['communications', 'cinnamomic so ut', 'cinnamomic so tu', 'cinnamomic os ut', 'cinnamomic os tu']
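To plug this back into the original make_anagram, you could return the first few suggestions instead of one random pick. A minimal sketch, assuming the anagrams/counters tables and iMultigram defined above:

from itertools import islice

def make_anagram(user_word, limit=5):
    word = str(user_word).lower()
    # skip the input itself and take the first few distinct anagrams
    suggestions = list(islice((a for a in iMultigram(word) if a != word), limit))
    if suggestions:
        return 'Anagrams of %s: %s.' % (word, ', '.join(suggestions))
    return 'Sorry, I could not find an anagram for %s.' % word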
I have been working on writing a Wordle bot, and wanted to see how it performs with all 13,000 words. The problem is that I am running this through a for loop and it is very inefficient. After running it for 30 minutes, it only gets to around 5%; waiting it out would take 10+ hours. There has got to be a more efficient way. I am new to Python, so any suggestions would be greatly appreciated.
The code here is what I use to narrow down the guesses each time. Is there a way to search for a word that contains "a", "b", and "c" in one pass, instead of running it 3 separate times? Right now contains, nocontains, and isletter each run every time I need to check a new letter. Searching for them all together would greatly reduce the time.
# Find the words that only match the criteria
def contains(letter, place):
    list.clear()
    for x in words:
        if x not in removed:
            if letter in x:
                if letter == x[place]:
                    removed.append(x)
                else:
                    list.append(x)
            else:
                removed.append(x)

def nocontains(letter):
    list.clear()
    for x in words:
        if x not in removed:
            if letter not in x:
                list.append(x)
            else:
                removed.append(x)

def isletter(letter, place):
    list.clear()
    for x in words:
        if x not in removed:
            if letter == x[place]:
                list.append(x)
            else:
                removed.append(x)
The performance problems can be massively reduced by using sets. Any time you want to repeatedly test for membership (even only a few times), e.g. if x not in removed, you want to use a set. Lists require checking every element to find x, which is slow when the list has thousands of elements. With a Python set, if x not in removed takes roughly the same small, constant amount of time whether removed has 100 elements or 100,000.
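A quick way to see the difference for yourself (a minimal sketch using timeit; the sizes are arbitrary):

import timeit

words_list = [str(i) for i in range(100000)]
words_set = set(words_list)

# membership test near the end of the list vs. the same test on a set
print(timeit.timeit(lambda: "99999" in words_list, number=1000))  # list: seconds
print(timeit.timeit(lambda: "99999" in words_set, number=1000))   # set: microseconds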
Besides this, you're running into problems by using mutable global variables everywhere, like list (which needs to be renamed) and removed. There's no benefit to doing that and several downsides, such as making it harder to reason about your code or to optimize it. One benefit of Python is that you can pass large containers to functions without any extra time or space cost: calling f(huge_list) is just as fast and uses no more memory than f(tiny_list), as if you were passing by reference in other languages, so don't hesitate to use containers as function parameters or return types.
In summary, here's how your code could be refactored if you take away 'list' and 'removed' and instead store this as a set of possible words:
all_words = []  # huge word list, read in from a text file
current_possible_words = set(all_words)

def contains_only_elsewhere(possible_words, letter, place):
    """Keep only words containing letter, but not at place (a 'yellow' match)."""
    to_remove = {word for word in possible_words
                 if letter not in word or word[place] == letter}
    return possible_words - to_remove

def must_not_contain(possible_words, letter):
    """Remove from possible_words all words containing letter."""
    to_remove = {word for word in possible_words
                 if letter in word}
    return possible_words - to_remove

def exact_letter_match(possible_words, letter, place):
    """Keep only words with letter at place (a 'green' match)."""
    to_remove = {word for word in possible_words
                 if word[place] != letter}
    return possible_words - to_remove
The outside code will be different: for example,
current_possible_words = exact_letter_match(current_possible_words, 'a', 2)
Further optimizations are possible (and much easier now): storing only indices to words rather than the strings; precomputing, for each letter, the set of all words containing that letter, etc.
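For example, the per-letter precomputation might look like this (a sketch; letter_index is an illustrative name, not part of the code above):

from collections import defaultdict

# build once: for each letter, the set of all words containing it
letter_index = defaultdict(set)
for word in all_words:
    for letter in set(word):
        letter_index[letter].add(word)

# then "must not contain 'e'" becomes a single set difference
current_possible_words = current_possible_words - letter_index['e']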
I just wrote a Wordle bot that runs in about a second, including the web scraping to fetch a list of 5-letter words.
import urllib.request
from bs4 import BeautifulSoup

def getwords():
    source = "https://www.thefreedictionary.com/5-letter-words.htm"
    filehandle = urllib.request.urlopen(source)
    soup = BeautifulSoup(filehandle.read(), "html.parser")
    wordslis = soup.findAll("li", {"data-f": "15"})
    words = []
    for k in wordslis:
        words.append(k.getText())
    return words

words = getwords()
def hasLetterAtPosition(letter, position, word):
    return letter == word[position]

def hasLetterNotAtPosition(letter, position, word):
    return letter in word[:position] + word[position+1:]

def doesNotHaveLetter(letter, word):
    return letter not in word

lettersPositioned = [(0, "y")]
lettersMispositioned = [(0, "h")]
lettersNotHad = ["p"]

idx = 0
while idx < len(words):
    eliminated = False
    for criteria in lettersPositioned:
        if not hasLetterAtPosition(criteria[1], criteria[0], words[idx]):
            del words[idx]
            eliminated = True
            break
    if eliminated:
        continue
    for criteria in lettersMispositioned:
        if not hasLetterNotAtPosition(criteria[1], criteria[0], words[idx]):
            del words[idx]
            eliminated = True
            break
    if eliminated:
        continue
    for letter in lettersNotHad:
        if not doesNotHaveLetter(letter, words[idx]):
            del words[idx]
            eliminated = True
            break
    if eliminated:
        continue
    idx += 1

print(words)  # ["youth"]
The reason yours is slow is that you check if word in removed for every word, on top of a number of superfluous logical conditions, and you go through all the words again for each of your checks.
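Since the question asks about applying several criteria together, here is a minimal sketch of a single-pass filter that combines all three checks (the function name and parameter layout are illustrative, not from either answer):

def filter_once(words, greens, yellows, grays):
    # greens:  [(position, letter)] letters known to be at position
    # yellows: [(position, letter)] letters in the word but not at position
    # grays:   [letter] letters not in the word at all
    result = []
    for w in words:
        if (all(w[p] == c for p, c in greens)
                and all(c in w and w[p] != c for p, c in yellows)
                and not any(c in w for c in grays)):
            result.append(w)
    return result

words = filter_once(words, [(0, "y")], [(0, "h")], ["p"])  # -> ["youth"]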
Edit: here's a getwords function that fetches more words.

def getwords():
    source = "https://wordfind-com.translate.goog/length/5-letter-words/?_x_tr_sl=es&_x_tr_tl=en&_x_tr_hl=en&_x_tr_pto=wapp"
    filehandle = urllib.request.urlopen(source)
    soup = BeautifulSoup(filehandle.read(), "html.parser")
    wordslis = soup.findAll("a", {"rel": "nofollow"})
    words = []
    for k in wordslis:
        words.append(k.getText())
    return words
For my project I want to write a program that searches for a word in a string/long document in Python. If the word is not in the string/document, I have to search for approximate matches.
For example, for the word “brain”:
Deletions: rain bain brin bran brai ...
Substitutions: train grain blain bryin ...
I already have deletion and substitution functions, but I am not sure how to search for the word in brute-force runtime / benchmark runtime.
string = "hereharewereworeherareteartoredeardareearrearehrerheasereseersearrah"
# the string can be much longer
Pattern = "ware"
# the output should have 4 deletions and 6 substitutions
# string0 is Pattern, string1 is the word we compare; if it matches the type, append to the list
Deletions = []
def deletions(string0, string1):
    deletionlist = []
    # build the list of one-character deletions of the pattern
    for i in range(len(string0)):
        deletionlist.append(string0[:i] + string0[i+1:])  # delete only the i-th character
    # delete first character and last
    if string1[1:] in deletionlist:
        Deletions.append(string1[1:])
        return 1
    elif string1[:-1] in deletionlist:
        if len(string1[:-1]) == 1:
            Deletions.append(string1[:-1])
            return 1

Substitutions = []
def substitutions(string0, string1):
    if len(string0) == len(string1):
        sublist = []
        # build the list of one-character deletions of the pattern
        for i in range(len(string0)):
            sublist.append(string0[:i] + string0[i+1:])  # delete only the i-th character
        for j in range(len(string1)):
            if string1[:j] + string1[j+1:] in sublist:
                Substitutions.append(string1)
                break
The best fit here is the Levenshtein algorithm: it calculates the distance between two words or sentences (how many single-character edits it takes to convert one into the other), or a similarity ratio if you prefer:
>>> import Levenshtein
>>> Levenshtein.distance( 'hello, guys', 'hello, girls' )
3
>>> Levenshtein.ratio( 'hello, guys', 'hello, girls' )
0.782608695652174
You may check the details of the implementation and other info here: https://en.wikipedia.org/wiki/Levenshtein_distance
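If you would rather not install anything, the distance itself is a short dynamic-programming routine (a minimal sketch of the standard algorithm):

def levenshtein(a, b):
    # prev[j] holds the edit distance between the processed prefix of a and b[:j]
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]

print(levenshtein('hello, guys', 'hello, girls'))  # 3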
I'm trying to write an algorithm that, given a bunch of letters, gives you all the words that can be constructed from those letters. For instance, given 'car' it should return a list containing [arc, car, a, etc...], and out of that it returns the best Scrabble word. The problem is in finding the list that contains all the words.
I've got a giant txt file dictionary, line delimited, and I've tried this so far:
from collections import Counter

def find_optimal(bunch_of_letters: str):
    words_to_check = []
    c1 = Counter(bunch_of_letters.lower())
    for word in load_words():
        c2 = Counter(word.lower())
        if c2 & c1 == c2:
            words_to_check.append(word)
    max_word = max_word_value(words_to_check)
    return max_word, calc_word_value(max_word)
max_word_value - returns the word with the maximum value from the list given
calc_word_value - returns the word's score in Scrabble
load_words - returns the dictionary as a list of words
I'm currently using Counters to do the trick, but the problem is that I'm at about 2.5 seconds per search and I don't know how to optimize this. Any thoughts?
Try this:
from collections import Counter

# precompute once, so each search doesn't reload the dictionary and rebuild the Counters
WORD_COUNTS = {word: Counter(word.lower()) for word in load_words()}

def find_optimal(bunch_of_letters):
    available = Counter(bunch_of_letters.lower())
    words_to_check = [word for word, counts in WORD_COUNTS.items()
                      if counts & available == counts]
    max_word = max_word_value(words_to_check)
    return max_word, calc_word_value(max_word)
I've just used a list comprehension for the per-search filter and hoisted the expensive work (loading the dictionary and building the Counters) out of the function so it runs only once. words_to_check will be a list of all the words in your text file that can be built from your letters.
On a side note, if you don't want to use a gigantic text file for the words, check out enchant!
from itertools import permutations

theword = 'car'  # or we can use input('Type in a word: ')

mylist = [permutations(theword, i) for i in range(1, len(theword)+1)]
for generator in mylist:
    for word in generator:
        print(''.join(word))
        # instead of ''.join(word), just print(word) for a tuple
Output:
c
a
r
ca
cr
...
ar
rc
ra
car
cra
acr
arc
rca
rac
This will give us all the possible arrangements (i.e. permutations) of the word.
If you're looking to check whether a generated word is an actual word in the English dictionary, we can use the enchant approach from this answer:
import enchant

d = enchant.Dict("en_US")
# the generators in mylist are exhausted once iterated, so rebuild them first
mylist = [permutations(theword, i) for i in range(1, len(theword)+1)]
for generator in mylist:
    for word in generator:
        word = ''.join(word)  # permutations yields tuples of characters
        print(d.check(word), word)
Conclusion:
If we want to generate all the permutations of the word, we use this code:
from itertools import permutations

word = 'word'  # or we can use input('Type in a word: ')
solution = permutations(word, 4)
for i in solution:
    print(''.join(i))  # just print(i) if you want a tuple
For each occurrence of a certain word, I need to display the context by showing about 5 words preceding and following the occurrence of the word.
Example output for the word 'stranger' in a text file, when you enter occurs('stranger', 'movie.txt'):
My code so far:
def occurs(word, filename):
    infile = open(filename, 'r')
    lines = infile.read().splitlines()
    infile.close()
    wordsString = ' '.join(lines)  # join with spaces so line breaks don't glue words together
    words = wordsString.split()
    print(words)
    for i in range(len(words)):
        if words[i].find(word):
            pass  # stuck here
I'd suggest slicing words depending on i:
print(words[i-5:i+6])
(This would go where your comment is)
Alternatively, to print as shown in your example:
print("...", " ".join(words[i-5:i+6]), "...")
To account for the word being in the first 5:
if i > 5:
    print("...", " ".join(words[i-5:i+6]), "...")
else:
    print("...", " ".join(words[0:i+6]), "...")
Additionally, find is not doing what you think it is. If find() doesn't find the string, it returns -1, which evaluates to True in an if statement (and when it matches at index 0, it returns 0, which is falsy and gets skipped). Try:
if word in words[i].lower():
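A quick illustration of the pitfall:
>>> "hello".find("x")
-1
>>> bool(-1)   # a miss is truthy, so the if branch runs
True
>>> "hello".find("h")
0
>>> bool(0)    # a match at index 0 is falsy and gets skipped
False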
This retrieves the index of every occurrence of the word in words, which is a list of all words in the file. Then slicing is used to get a list of the matched word and the 5 words before and after.
def occurs(word, filename):
    infile = open(filename, 'r')
    lines = infile.read().splitlines()
    infile.close()
    wordsString = ' '.join(lines)  # join with spaces so line breaks don't glue words together
    words = wordsString.split()
    matches = [i for i, w in enumerate(words) if w.lower().find(word) != -1]
    for m in matches:
        l = " ".join(words[m-5:m+6])
        print(f"... {l} ...")
Consider the more_itertools.adjacent tool.
Given
import more_itertools as mit
s = """\
But we did not answer him, for he was a stranger and we were not used to, strangers and were shy of them.
We were simple folk, in our village, and when a stranger was a pleasant person we were soon friends.
"""
word, distance = "stranger", 5
words = s.splitlines()[0].split()
Demo
neighbors = list(mit.adjacent(lambda x: x == word, words, distance))
" ".join(word for bool_, word in neighbors if bool_)
# 'him, for he was a stranger and we were not used'
Details
more_itertools.adjacent returns an iterable of tuples, e.g. (bool, item) pairs. A True boolean is returned for words in the string that satisfy the predicate. Example:
>>> neighbors
[(False, 'But'),
...
(True, 'a'),
(True, 'stranger'),
(True, 'and'),
...
(False, 'to,')]
Neighboring words within the given distance of the target word are then selected from the results.
Note: more_itertools is a third-party library. Install by pip install more_itertools.
Whenever I see rolling views of files, I think collections.deque
import collections

def occurs(needle, fname):
    with open(fname) as f:
        lines = f.readlines()
    # readlines() keeps the trailing newlines, so split() still separates words across lines
    words = iter(''.join(lines).split())
    view = collections.deque(maxlen=11)
    # prime the deque
    for _ in range(10):  # leaves an 11-length deque with 10 elements
        view.append(next(words, ""))
    for w in words:
        view.append(w)
        if view[5] == needle:
            yield list(view)
Note that this approach intentionally does not handle any edge cases for the needle appearing in the first 5 or last 5 words of the file. The question is ambiguous as to whether matching the third word should give the first through ninth words, or something different.
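If you do want matches near the start or end of the file, one option is to pad the word stream with empty strings before filling the deque (a sketch building on the generator above; occurs_padded is an illustrative name):

import collections
from itertools import chain

def occurs_padded(needle, fname, radius=5):
    with open(fname) as f:
        words = f.read().split()
    # pad both ends so the needle can sit at the centre even at the edges
    stream = chain([""] * radius, words, [""] * radius)
    view = collections.deque([], maxlen=2 * radius + 1)
    for w in stream:
        view.append(w)
        if len(view) == view.maxlen and view[radius] == needle:
            yield [x for x in view if x]  # drop the padding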
Today is one of those days where all of my knowledge in programming seems to be failing horribly, and no amount of coffee administered via IV is helping the situation.
I am presented with a list of phrases; here are some as an example:
"tax policies when emigrating from uk"
"shipping to scotland from california"
"immigrating to sweden"
"shipping good to australia"
"shipping good to new zealand"
"how to emigrate to california from the uk"
"shipping services from london to usa"
"cost of shipping from usa to uk"
Now I need to start doing word frequency analysis on this. Thankfully, in Python this is pretty simple, and I constructed the following function to take this list and give back a Counter of the most common words.
from collections import Counter

def count(phrases):
    counter = Counter()
    for phrase in phrases:
        for word in phrase.split(" "):
            counter[word] += 1
    return counter
This rocks, because now I can easily acquire the most common words from the phrase list, like so: count(phrases).most_common(5).
Now it becomes harder. Say I set an arbitrary depth, let's say 5. Given that the most popular word in that list (that isn't a glue word, e.g. 'from', 'to', or 'and') is 'shipping', I now need to take the word 'shipping' and count the terms again for all the phrases that contain it. Again, mostly simple:
def filter_for_word(word, phrases):
    return filter(lambda x: word in x, phrases)

count(filter_for_word("shipping", phrases))
This is where it starts to get hairy: I need to keep going down and down the results until I hit my depth, and then I need to be able to display this information along with the most common phrases, together.
I started trying to do this with the following function, but I simply cannot get my head around the next few steps needed to bind the content together and display it in a good structure and format.
def dive(depth, num, phrases):
    phrase_tree = {}
    for word, value in count(phrases).most_common(num):
        phrase_tree[word] = [value, {}]
    current = phrase_tree
    while True:
        if depth == 0:
            return phrase_tree
        for word in current:
            current[word][1] = {key: [v, {}] for (key, v) in count(filter_for_word(word, phrases)).most_common(num)}
        # debug!!
        return current
If anyone could help me bring this all together I would greatly appreciate it
def filter_for_words(words, phrases):
    for word in words:
        # use a list comprehension rather than a lazy filter: a chain of
        # filter(lambda ...) objects would all see the final value of word
        phrases = [phrase for phrase in phrases if word in phrase]
    return phrases

def dive(depth, num, phrases, phrase_tree=None, f_words=None):
    if not phrase_tree:
        phrase_tree = {}
        for word, value in count(phrases).most_common(num):
            phrase_tree[word] = [value, {}]
    if not f_words:
        f_words = []
    if depth == 0:
        return phrase_tree
    for word in phrase_tree:
        words = f_words[:]
        words.append(word)
        child_tree = {key: [v, {}] for (key, v) in count(filter_for_words(words, phrases)).most_common(num)}
        phrase_tree[word][1] = child_tree
        dive(depth - 1, num, phrases, child_tree, words)
    return phrase_tree
Not efficient, but it should work.
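A usage sketch (assuming the count function and a phrases list from the question; print_tree is an illustrative helper, not part of the answer):

phrases = [
    "tax policies when emigrating from uk",
    "shipping to scotland from california",
    "shipping good to australia",
]

def print_tree(tree, indent=0):
    # each node maps word -> [count, subtree]
    for word, (value, subtree) in tree.items():
        print(" " * indent + "%s (%d)" % (word, value))
        print_tree(subtree, indent + 2)

print_tree(dive(2, 3, phrases))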