Storing word count in the python trie - python

I took a list of words and put it into a trie. I would also like to store word count inside for further analysis. What would be the best way to do it? This is the class where I think the frequency would be collected and stored, but I am not sure how to go about it. You can see my attempt, last line in insert is where I try to store the count.
class TrieNode:
    """One node of a dictionary-backed trie that also stores a word count."""

    def __init__(self, k=None):
        # Number of inserted words that end exactly at this node.
        self.v = 0
        # Character on the edge leading to this node (None for the root).
        self.k = k
        # True once at least one inserted word ends at this node.
        self.end = False
        # Maps the next character to the child TrieNode.
        self.children = {}

    def all_words(self, prefix):
        """Yield every stored word at or below this node.

        ``prefix`` is the string spelled by the path from the root to here.
        """
        if self.end:
            yield prefix
        for letter, child in self.children.items():
            yield from child.all_words(prefix + letter)


class Trie:
    """Trie of words; each word-ending node records how often it was inserted."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Insert ``word`` and bump the count on the node where it ends."""
        curr = self.root
        for letter in word:
            node = curr.children.get(letter)
            if node is None:
                node = TrieNode(letter)
                curr.children[letter] = node
            # Descend; without this every letter would be attached to the root.
            curr = node
        curr.end = True
        curr.v += 1

    def insert_many(self, words):
        """Insert every word of the iterable ``words``."""
        for word in words:
            self.insert(word)

    def all_words_beginning_with_prefix(self, prefix):
        """Yield every stored word that starts with ``prefix``."""
        cur = self.root
        for c in prefix:
            cur = cur.children.get(c)
            if cur is None:
                return  # No words with given prefix
        yield from cur.all_words(prefix)
I want to store the count so that when I use
print(list(trie.all_words_beginning_with_prefix('prefix')))
I would get a result like so:
[(word, count), (word, count)]

While inserting a word, every node you pass through lies on the path of the word being added. Therefore, increment the word_count of each node you visit.
class TrieNode:
    """One node of a dictionary-backed trie with prefix word counts."""

    def __init__(self, char):
        # Character on the edge leading to this node ('' for the root).
        self.char = char
        # Number of inserted words whose path passes through or ends here.
        self.word_count = 0
        # True once at least one inserted word ends exactly at this node.
        self.is_word = False
        # Maps the next character to the child TrieNode.
        self.children = {}

    def all_words(self, prefix, path):
        """Yield ``prefix + path`` for every word at or below this node.

        ``path`` is the string spelled from the prefix node down to here.
        Testing ``is_word`` (rather than "has no children") also reports
        words that are themselves a prefix of a longer stored word.
        """
        if self.is_word:
            yield prefix + path
        for letter, child in self.children.items():
            yield from child.all_words(prefix, path + letter)


class Trie:
    """Trie that can list and count the words sharing a prefix."""

    def __init__(self):
        self.root = TrieNode('')

    def insert(self, word):
        """Insert ``word``, counting it on every node of its path."""
        curr = self.root
        curr.word_count += 1  # the root counts every inserted word
        for letter in word:
            node = curr.children.get(letter)
            if node is None:
                node = TrieNode(letter)
                curr.children[letter] = node
            # Count on the node being entered so the last letter is included.
            node.word_count += 1
            curr = node
        curr.is_word = True

    def insert_many(self, words):
        """Insert every word of the iterable ``words``."""
        for word in words:
            self.insert(word)

    def all_words_beginning_with_prefix(self, prefix):
        """Yield every stored word that starts with ``prefix``."""
        cur = self.root
        for c in prefix:
            cur = cur.children.get(c)
            if cur is None:
                return  # No words with given prefix
        yield from cur.all_words(prefix, path="")

    def word_count(self, prefix):
        """Return how many inserted words start with ``prefix`` (0 if none)."""
        cur = self.root
        for c in prefix:
            cur = cur.children.get(c)
            if cur is None:
                return 0
        return cur.word_count
# Build a small trie and compare the two ways of counting words for a prefix.
trie = Trie()
trie.insert_many(["hello", "hi", "random", "heap"])
prefix = "he"

# Lazy: enumerate the matching words and count them afterwards.
words = list(trie.all_words_beginning_with_prefix(prefix))
lazy_report = (prefix, words, len(words))
print("Lazy method:\n Prefix: %s, Words: %s, Count: %d" % lazy_report)

# Proactive: read the counter maintained during insertion.
proactive_report = (prefix, trie.word_count(prefix))
print("Proactive method:\n Word count for '%s': %d" % proactive_report)
Output:
Lazy method:
Prefix: he, Words: ['hello', 'heap'], Count: 2
Proactive method:
Word count for 'he': 2

I would add a field called is_word to the trie node, where is_word would be true only for the last letter in the word. Like you have word AND, is_word would be true for the trie node holding the letter D. And I would update frequency for only nodes that have is_word to be true, not for every letter in the word.
So when you iterate from a letter, check if it is a word, if it is, stop the iteration, return the count and the word. I’m assuming in your iteration you keep track of the letters, and keep adding them to the prefix.
Your trie is a multi-way trie.

Related

Longest Common Substring from Trie

I'm trying to obtain the longest common substring from two strings and have coded a trie, but I'm not sure how to obtain the longest common substring based on this trie. Could someone please help? Thanks!
class TrieNode:
    """Node of an array-backed trie."""

    def __init__(self):
        # Child slots indexed by ``ord(letter) - ord('a')``.
        # NOTE(review): 27 slots gives one more than 'a'..'z' — presumably
        # reserved for a terminator/separator character; confirm.
        self.children = [None] * 27
        # True when an inserted word ends at this node.
        self.isLeaf = False


class Trie:
    def __init__(self):
        """
        Initialize your data structure here.
        """
        self.root = TrieNode()

    def insert(self, word: str) -> None:
        """
        Inserts a word into the trie.
        """
        current = self.root
        for letter in word:
            index = ord(letter) - ord('a')
            if not current.children[index]:
                current.children[index] = TrieNode()
            current = current.children[index]
        current.isLeaf = True

    def _find(self, text: str):
        """Return the node reached by following ``text``, or None if absent."""
        current = self.root
        for letter in text:
            current = current.children[ord(letter) - ord('a')]
            if not current:
                return None
        return current

    def search(self, word: str) -> bool:
        """
        Returns if the word is in the trie.
        """
        node = self._find(word)
        # Return a real bool; the node itself must not leak to the caller.
        return node is not None and node.isLeaf

    def startsWith(self, prefix: str) -> bool:
        """
        Returns if there is any word in the trie that starts with the given prefix.
        """
        return self._find(prefix) is not None

Count the number of words that appear before

I would like to ask how can we count the number of words that occur alphabetically before the given string in the trie?
Here is my implementation now.
class TrieNode:
    """Node of a trie over the lowercase letters 'a' through 'z'."""

    def __init__(self):
        # One slot per letter; None means that child does not exist.
        self.children = [None] * 26
        # True if a word ends at this node.
        self.isEndOfWord = False
        # Number of insertions of the word that ends at this node.
        self.word_count = 0


class Trie:
    """Trie data structure built from TrieNode objects."""

    def __init__(self):
        self.root = self.getNode()

    def getNode(self):
        """Return a fresh, empty trie node."""
        return TrieNode()

    def _charToIndex(self, ch):
        """Map a lowercase letter to its child slot (0 for 'a', 25 for 'z')."""
        return ord(ch) - ord('a')

    def insert(self, key):
        """Insert ``key``, creating missing nodes and marking its last node."""
        node = self.root
        for ch in key:
            slot = self._charToIndex(ch)
            if not node.children[slot]:
                node.children[slot] = self.getNode()
            node = node.children[slot]
        # The node reached after the last letter represents the whole word.
        node.isEndOfWord = True
        node.word_count += 1

    def search(self, key):
        """Return True if ``key`` was inserted as a complete word."""
        node = self.root
        for ch in key:
            node = node.children[self._charToIndex(ch)]
            if not node:
                return False
        return node.isEndOfWord

    def count_before(self, string):
        """Return the word_count of the node spelled by ``string`` (0 if absent).

        Prints each letter's slot index while walking the path.
        """
        node = self.root
        for ch in string:
            slot = self._charToIndex(ch)
            print(slot)
            node = node.children[slot]
            if node is None:
                return 0
        return node.word_count
def total_before(text):
    """Insert every word of ``text`` into a Trie, then query each in turn.

    Returns a list with one ``count_before`` result per word, in input order.
    """
    t = Trie()
    for word in text:
        t.insert(word)
    # All insertions are finished before the first query is made.
    a_list = [t.count_before(word) for word in text]
    return a_list


total_before(["bac", "aaa", "baa", "aac"])  # Desired output: [3, 0, 2, 1]
I would like to know how can I count the number of words that occur before the given string in the trie that I had created. Can someone give me an idea about it?
As word_count is currently initialised, it does not serve much purpose. It only is non-zero at nodes with isEndOfWord set to True. It would be more useful if it counted the number of words that depend on the current node, i.e. words that either end in that node (which your code counts now), or continue further down the trie (which are currently not counted).
To make that happen, also increment word_count while you descend the trie:
def insert(self, key):
    """Insert ``key`` and count it once on every node along its path.

    After insertion each node's ``word_count`` covers the words that end at
    that node plus the words that continue below it.
    """
    node = self.root
    for ch in key:
        node.word_count += 1  # counted before stepping down to the child
        slot = self._charToIndex(ch)
        if not node.children[slot]:
            node.children[slot] = self.getNode()
        node = node.children[slot]
    # The final node is not touched inside the loop, so count it here.
    node.isEndOfWord = True
    node.word_count += 1
In count_before you would need to sum up all the word_count values of the child nodes that precede the child you are about to select, as those represent words that come before the current word:
def count_before(self, string):
    """Return how many stored words sort alphabetically before ``string``.

    Relies on ``word_count`` being maintained on every node of an inserted
    word's path (words ending at the node plus words continuing below it).
    """
    count = 0  # used to accumulate the word_counts
    cur = self.root
    for b in string:
        index = self._charToIndex(b)
        if cur.isEndOfWord:
            # A word ending here is a proper prefix of ``string`` and sorts
            # before it. Words ending here = total minus those continuing.
            count += cur.word_count - sum(
                node.word_count for node in cur.children if node
            )
        # add the word counts of the children that are to the left of this index:
        count += sum(node.word_count for node in cur.children[:index] if node)
        cur = cur.children[index]
        if cur is None:
            break
    return count
This line:
count += sum(node.word_count for node in cur.children[:index] if node)
Is a compact way of doing this:
# Long-hand form of the generator expression: total the word counts of the
# existing children to the left of ``index``.
mysum = 0
for node in cur.children[:index]:
    if node:
        mysum += node.word_count
# Accumulate into ``count``; ``sum`` is the builtin function, not the total.
count += mysum
I think you overcomplicated the problem.
def total_before(lst):
    """Return, for each element, how many elements of ``lst`` sort before it.

    Equal elements share the rank of their first position in sorted order.
    """
    from bisect import bisect_left

    # Sort once rather than once per element.
    ordered = sorted(lst)
    # bisect_left gives the leftmost position of ``el`` in the sorted list,
    # which is what ``ordered.index(el)`` finds by linear scan.
    return [bisect_left(ordered, el) for el in lst]


print(total_before(["bac", "aaa", "baa", "aac"]))
Output:
[3, 0, 2, 1]

Passing a list of strings to be put into trie

I have the code that can build a trie data structure when it is given one string. When I am trying to pass a list of strings, it combines the words into one
class TrieNode:
    """Node of a dictionary-backed trie."""

    def __init__(self):
        # True when an inserted word ends at this node.
        self.end = False
        # Maps the next character to the child TrieNode.
        self.children = {}

    def all_words(self, prefix):
        """Yield each word at or below this node; ``prefix`` spells the path here."""
        if self.end:
            yield prefix
        for letter in self.children:
            yield from self.children[letter].all_words(prefix + letter)


class Trie:
    """Trie whose ``insert`` takes a whole list of words."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, words):
        """Walk the letters of every word in ``words``, creating missing nodes.

        The cursor is set to the root once, before the first word; each
        following word continues from where the previous word ended.
        """
        cursor = self.root
        # the line I added to read the words from a list is below
        for word in words:
            for letter in word:
                if letter not in cursor.children:
                    cursor.children[letter] = TrieNode()
                cursor = cursor.children[letter]
            cursor.end = True

    def all_words_beginning_with_prefix(self, prefix):
        """Yield every stored word that starts with ``prefix``."""
        node = self.root
        for c in prefix:
            if c not in node.children:
                return  # No words with given prefix
            node = node.children[c]
        yield from node.all_words(prefix)
This is the code I use to insert everything into the tree:
# Demo input: four words that all start with "foo".
lst = ['foo', 'foob', 'foobar', 'foof']
trie = Trie()
# The whole list is handed to insert() in a single call.
trie.insert(lst)
The output I get is
['foo', 'foofoob', 'foofoobfoobar', 'foofoobfoobarfoof']
The output I would like to get is
['foo', 'foob', 'foobar', 'foof']
This is the line I used to get the output (for reproducibility, in case you will need to run the code) - it returns all the words that start with a particular prefix:
print(list(trie.all_words_beginning_with_prefix('foo')))
How do I fix it?
You aren't resetting curr back to the root after each insert, so you're inserting the next word where the last one left off. You'd want something like:
def insert(self, words):
    """Insert every word of ``words``, each one starting from the root."""
    for word in words:
        cursor = self.root  # Reset back to the root for every word
        for letter in word:
            child = cursor.children.get(letter)
            if not child:
                child = TrieNode()
                cursor.children[letter] = child
            cursor = child
        cursor.end = True
I'd break this up though. I think your insert function is doing too much, and shouldn't be dealing with multiple strings. I'd change it to something like:
def insert(self, word):
    """Insert a single ``word``, creating the nodes missing along its path."""
    cursor = self.root
    for letter in word:
        if letter not in cursor.children:
            cursor.children[letter] = TrieNode()
        cursor = cursor.children[letter]
    # The node reached after the last letter marks the end of the word.
    cursor.end = True
def insert_many(self, words):
    """Insert each word of ``words`` through a separate ``insert`` call."""
    add = self.insert
    for entry in words:
        add(entry)  # Just loop over self.insert
Now that's a non-problem since each insert is an independent call, and you can't forget to reset curr.

How to make depth search in Trie?

I wrote my Trie solution, where I used defaultdict. The task is to find all words with prefix.
The format must be like {of: [of, often, offensive]}
Here my Trie class:
from collections import defaultdict
def _trie():
    """Return a defaultdict whose missing keys become nested tries."""
    return defaultdict(_trie)


# Key stored in a node's dict to mark that a word ends there.
TERMINAL = None


class Trie(object):
    """Trie stored as nested defaultdicts keyed by letter."""

    def __init__(self):
        self.trie = _trie()

    def addWord(self, word):
        """Add ``word``, creating nested dicts as its letters are followed."""
        node = self.trie
        for letter in word:
            node = node[letter]
        # Merely reading the key makes the defaultdict create the marker.
        node[TERMINAL]

    def search(self, word, trie=None):
        """Return the sub-trie reached by ``word``, or False if it is absent.

        ``trie`` optionally supplies the starting node instead of the root.
        """
        node = self.trie if trie is None else trie
        for letter in word:
            # Membership test first: indexing would create the missing key.
            if letter not in node:
                return False
            node = node[letter]
        return node
Here The example:
# NOTE: binding the instance to the name ``Trie`` shadows the class, so no
# further Trie() objects can be created after this line.
Trie = Trie()
Trie.addWord('of')
Trie.addWord('often')
Trie.addWord('offensive')
string = 'of'
# search() returns the nested dict reached by following ``string`` (or False
# when the path is missing); dict() makes a shallow copy of that node.
s = dict(Trie.search(string))
They give the result:
Here is how I do a depth-first search:
from collections import defaultdict
class TrieNode:
    """Trie node that remembers the prefix spelled by its path."""

    def __init__(self):
        # Children are created on first access through the defaultdict.
        self.child = defaultdict(TrieNode)
        # True when an inserted word ends at this node.
        self.is_word = False
        # The prefix (or complete word) spelled from the root to this node.
        self.words = ""


class Trie:
    """Prefix tree supporting a depth-first listing of completions."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Insert ``word``, recording the prefix on every node of its path."""
        node = self.root
        for end, char in enumerate(word, start=1):
            node = node.child[char]
            node.words = word[:end]
        node.is_word = True

    def search(self, word):
        """Return the sorted list of stored words that start with ``word``."""
        node = self.root
        for char in word:
            # .get() avoids creating a node for a missing letter.
            node = node.child.get(char)
            if not node:
                return []
        # Iterative depth-first walk below the prefix node.
        pending = [node]
        found = []
        while pending:
            current = pending.pop()
            if current.is_word:
                found.append(current.words)
            pending.extend(current.child.values())
        return sorted(found)
# NOTE: binding the instance to the name ``Trie`` shadows the class, so no
# further Trie() objects can be created after this line.
Trie = Trie()
Trie.insert('of')
Trie.insert('often')
Trie.insert('offensive')
Trie.insert('offensive2')
# search() returns the matches as a sorted list; the result is not stored or
# printed here.
Trie.search('o')
# ['of', 'offensive', 'offensive2', 'often']

How to change this prefix trie to return all words that contain similar prefixes

Hi, this is a portion of my code for a prefix trie. I am trying to get it to return more than just the prefix; more explanation is at the bottom:
class TrieNode:
    """Node of a dictionary-backed prefix trie."""

    def __init__(self):
        # True when an inserted string ends at this node.
        self.isString = False
        # Maps the next character to the child TrieNode.
        self.children = {}


def insertString(word, root):
    """Insert ``word`` below ``root``, creating the nodes that are missing."""
    node = root
    for char in word:
        child = node.children.get(char)
        if child is None:
            child = node.children[char] = TrieNode()
        node = child
    node.isString = True


def findStrings(prefix, node, results):
    """Append to ``results`` every string stored at or below ``node``.

    ``prefix`` is the text spelled by the path from the root to ``node``.
    """
    if node.isString:
        results.append(prefix)
    for char, child in node.children.items():
        findStrings(prefix + char, child, results)


def longestPrefix(word, root):
    """Return all stored strings under the longest prefix of ``word`` in the trie."""
    node = root
    matched = []
    for char in word:
        child = node.children.get(char)
        if child is None:
            break
        node = child
        matched.append(char)
    strings = []
    findStrings(''.join(matched), node, strings)
    return strings


# Discussion: Is it dangerous to assume that findStrings actually found a string?
# Hint: There is an edge case that will break this
# Sample vocabulary; note the nested entries 'pedest' ... 'pedestals'.
wordList = [
    'aydt', 'coombs', 'schuhmacher', 'claypoole', 'exhume', 'forehands', 'carin', 'plaits', 'alfonsin',
    'hometowns', 'pedestals', 'emad', 'hourly', 'purchaser', 'spogli', 'combativeness', 'henningsen', 'luedke',
    'duchin', 'koglin', 'teason', 'bumpings', 'substantially', 'lamendola', 'cecola', 'henze', 'tutti', 'dills',
    'satirical', 'jetted', 'intertwine', 'predict', 'breezes', 'cyclist', 'ancillary', 'schaumburg', 'viewer',
    "bay's", 'emissions', 'kincheloe', 'trees', 'vipperman', 'exhale', 'ornamental', 'repeated', 'pedestal',
    'pedesta', 'pedest',
]

# Build the trie from the vocabulary.
root = TrieNode()
for word in wordList:
    insertString(word, root)

# Dump everything stored in the trie.
allWords = []
findStrings('', root, allWords)
print(allWords)

# Query the two sample inputs in turn.
for inputWord in ('co', 'pedestals'):
    print(longestPrefix(inputWord, root))
I am trying to understand how to get print(longestPrefix('pedestals', root)) to return 'pedestals', 'pedestal', 'pedesta', 'pedest' and not just pedestals. What am I missing in my code?
I trying to understand how do i get print(longestPrefix('pedestals',
root)) to return 'pedestals','pedestal','pedesta', 'pedest' and not
just pedestals.
Since pedestals isn't a prefix, this doesn't make sense given the logic of the code -- I would have expected you to wonder why print(longestPrefix('pedest', root)) didn't return those four results. I've reworked your code below, turning all your functions into methods since each was taking the object you defined as an argument:
class TrieNode:
    """Prefix-trie node whose operations are methods of the node itself."""

    def __init__(self):
        # True when an inserted string ends at this node.
        self.isString = False
        # Maps the next character to the child TrieNode.
        self.children = {}

    def insertString(self, word):
        """Insert ``word`` below this node, creating the nodes that are missing."""
        node = self
        for char in word:
            child = node.children.get(char)
            if child is None:
                child = node.children[char] = TrieNode()
            node = child
        node.isString = True

    def findStrings(self, prefix):
        """Return every string stored at or below this node.

        ``prefix`` is the text spelled by the path leading to this node.
        """
        results = [prefix] if self.isString else []
        for char, child in self.children.items():
            results.extend(child.findStrings(prefix + char))
        return results

    def longestPrefix(self, word):
        """Return all strings under the longest prefix of ``word`` found here."""
        node = self
        matched = []
        for char in word:
            child = node.children.get(char)
            if child is None:
                break
            node = child
            matched.append(char)
        return node.findStrings(''.join(matched))
# Sample vocabulary; note the nested entries 'pedest' ... 'pedestals'.
wordList = [
    'aydt', 'coombs', 'schuhmacher', 'claypoole', 'exhume', 'forehands', 'carin', 'plaits', 'alfonsin',
    'hometowns', 'pedestals', 'emad', 'hourly', 'purchaser', 'spogli', 'combativeness', 'henningsen', 'luedke',
    'duchin', 'koglin', 'teason', 'bumpings', 'substantially', 'lamendola', 'cecola', 'henze', 'tutti', 'dills',
    'satirical', 'jetted', 'intertwine', 'predict', 'breezes', 'cyclist', 'ancillary', 'schaumburg', 'viewer',
    "bay's", 'emissions', 'kincheloe', 'trees', 'vipperman', 'exhale', 'ornamental', 'repeated', 'pedestal',
    'pedesta', 'pedest',
]

# Build the trie from the vocabulary.
root = TrieNode()
for word in wordList:
    root.insertString(word)

# Dump everything stored in the trie.
allWords = root.findStrings('')
print(allWords)

# Query the two sample prefixes in turn.
for inputWord in ('co', 'pedest'):
    print(root.longestPrefix(inputWord))
The last two print statements output:
['coombs', 'combativeness']
['pedest', 'pedesta', 'pedestal', 'pedestals']
# Vocabulary scanned by findsubstring().
wordList = [
    'aydt', 'coombs', 'schuhmacher', 'claypoole', 'exhume', 'forehands', 'carin', 'plaits', 'alfonsin',
    'hometowns', 'pedestals', 'emad', 'hourly', 'purchaser', 'spogli', 'combativeness', 'henningsen', 'luedke',
    'duchin', 'koglin', 'teason', 'bumpings', 'substantially', 'lamendola', 'cecola', 'henze', 'tutti', 'dills',
    'satirical', 'jetted', 'intertwine', 'predict', 'breezes', 'cyclist', 'ancillary', 'schaumburg', 'viewer',
    "bay's", 'emissions', 'kincheloe', 'trees', 'vipperman', 'exhale', 'ornamental', 'repeated', 'pedestal',
    'pedesta', 'pedest',
]


def findsubstring(fullstring):
    """Print, in list order, each wordList entry that occurs inside ``fullstring``."""
    contained = (candidate for candidate in wordList if candidate in fullstring)
    for hit in contained:
        print(hit)


findsubstring("pedestals")
output:
pedestals
pedestal
pedesta
pedest

Categories