Passing a list of strings to be put into trie - python

I have code that builds a trie data structure when it is given one string. When I try to pass a list of strings, it combines the words into one:
class TrieNode:
    """A single trie node: an end-of-word flag plus its child nodes."""

    def __init__(self):
        self.end = False    # True when a word ends exactly at this node
        self.children = {}  # maps a letter to the child TrieNode

    def all_words(self, prefix):
        """Yield every word stored at or below this node.

        ``prefix`` is the string spelled by the path that leads to this
        node; each level of recursion extends it by one letter.
        """
        if self.end:
            yield prefix
        for letter, child in self.children.items():
            yield from child.all_words(prefix + letter)


class Trie:
    """Prefix tree built from TrieNode objects."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, words):
        """Insert the strings of the iterable ``words``.

        NOTE: this is the listing the question is about. ``curr`` is set to
        the root only once, before the loop over the words, so every word
        after the first is attached below the end of the previous one.
        """
        curr = self.root
        # the line I added to read the words from a list is below
        for word in words:
            for letter in word:
                node = curr.children.get(letter)
                if not node:
                    node = TrieNode()
                    curr.children[letter] = node
                curr = node
            curr.end = True

    def all_words_beginning_with_prefix(self, prefix):
        """Yield every stored word that starts with ``prefix``."""
        cur = self.root
        for c in prefix:
            cur = cur.children.get(c)
            if cur is None:
                return  # No words with given prefix
        yield from cur.all_words(prefix)
This is the code I use to insert everything into the tree:
# Build an empty trie and pass the whole list to insert() in one call.
lst = ['foo', 'foob', 'foobar', 'foof']
trie = Trie()
trie.insert(lst)
The output I get is
['foo', 'foofoob', 'foofoobfoobar', 'foofoobfoobarfoof']
The output I would like to get is
['foo', 'foob', 'foobar', 'foof']
This is the line I used to get the output (for reproducibility, in case you will need to run the code) - it returns all the words that start with a particular prefix:
# Materialise the generator so that the matching words are printed as a list.
print(list(trie.all_words_beginning_with_prefix('foo')))
How do I fix it?

You aren't resetting curr back to the root after each insert, so you're inserting the next word where the last one left off. You'd want something like:
def insert(self, words):
    """Insert every string of the iterable ``words``, each one from the root."""
    curr = self.root
    for word in words:
        for letter in word:
            node = curr.children.get(letter)
            if not node:
                node = TrieNode()
                curr.children[letter] = node
            curr = node
        curr.end = True
        curr = self.root  # Reset back to the root
I'd break this up though. I think your insert function is doing too much, and shouldn't be dealing with multiple strings. I'd change it to something like:
def insert(self, word):
    """Insert the single string ``word`` and mark the node where it ends."""
    curr = self.root
    for letter in word:
        node = curr.children.get(letter)
        if not node:
            node = TrieNode()
            curr.children[letter] = node
        curr = node
    curr.end = True
def insert_many(self, words):
    """Insert every string of the iterable ``words`` by calling ``self.insert``."""
    for word in words:
        self.insert(word)  # Just loop over self.insert
Now that's a non-problem since each insert is an independent call, and you can't forget to reset curr.

Related

Cannot write a function to retrieve all words in a trie

I have a following Trie implementation:
from collections import defaultdict


class TrieNode:
    """Trie node whose children are created on first access."""

    def __init__(self):
        # defaultdict(TrieNode) builds a missing child the moment it is indexed
        self.nodes = defaultdict(TrieNode)
        self.is_fullpath = False  # True when a word ends at this node


class Trie:
    """Prefix tree built from TrieNode objects."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Insert ``word`` and flag the node reached by its last character."""
        curr = self.root
        for char in word:
            curr = curr.nodes[char]  # creates the child if it does not exist
        curr.is_fullpath = True
I'm trying to write a method to retrieve a list of all words in my trie.
# Desired usage: insert two words, then list every word stored in the trie.
t = Trie()
t.insert('a')
t.insert('ab')
print(t.paths()) # ---> ['a', 'ab']
My current implementation looks like this:
def paths(self, node=None):
    """Collect the words below ``node`` (the root when ``node`` is None).

    NOTE: this is the question's attempt and it is known to be incomplete:
    it tests ``node.is_fullpath`` (the parent) instead of the child's flag,
    does not descend any further once that flag is set, and appends an
    empty string in that case.
    """
    if node is None:
        node = self.root
    result = []
    for k, v in node.nodes.items():
        if not node.is_fullpath:
            for el in self.paths(v):
                result.append(str(k) + el)
        else:
            result.append('')
    return result
But it does not seem to return full list of words.
Here are the issues in your code:
It doesn't look further when is_fullpath is True. But you should also look deeper (for longer words) in that case.
It should not check node.is_fullpath but v.is_fullpath.
result.append('') is not correct. It should be result.append(str(k))
So your for loop body could look like this:
# the child itself ends a word: record the single character that leads to it
if v.is_fullpath:
    result.append(str(k))
# in every case keep descending, so that longer words are found as well
for el in self.paths(v):
    result.append(str(k) + el)
I would however do it like this:
Define this recursive generator method on your TrieNode class:
def paths(self, prefix=""):
    """Yield every word stored at or below this node.

    ``prefix`` holds the characters collected on the way down to this node.
    """
    if self.is_fullpath:
        yield prefix
    # the loop variable is called ``char`` so the builtin ``chr`` stays usable
    for char, node in self.nodes.items():
        yield from node.paths(prefix + char)
Note how this passes the characters collected along the path to the recursive call. Whenever the is_fullpath boolean is True, we yield that path. In every case we continue the search recursively via the child nodes.
The method on the Trie class is then quite simple:
def paths(self):
    """Return a list of every word produced by the root node's ``paths()``."""
    return list(self.root.paths())

Count the number of words that appear before

I would like to ask how can we count the number of words that occur alphabetically before the given string in the trie?
Here is my implementation now.
class TrieNode:
    # Trie node class
    def __init__(self):
        self.children = [None] * 26
        # isEndOfWord is True if node represent the end of the word
        self.isEndOfWord = False
        # incremented by Trie.insert on the node where a key ends
        self.word_count = 0


class Trie:
    # Trie data structure class
    def __init__(self):
        self.root = self.getNode()

    def getNode(self):
        # Returns new trie node (initialized to NULLs)
        return TrieNode()

    def _charToIndex(self, ch):
        # private helper function
        # Converts key current character into index
        # use only 'a' through 'z' and lower case
        return ord(ch) - ord('a')

    def insert(self, key):
        # If not present, inserts key into trie
        # If the key is prefix of trie node,
        # just marks leaf node
        pCrawl = self.root
        for ch in key:
            index = self._charToIndex(ch)
            # if current character is not present
            if pCrawl.children[index] is None:
                pCrawl.children[index] = self.getNode()
            pCrawl = pCrawl.children[index]
        # mark last node as leaf
        pCrawl.isEndOfWord = True
        pCrawl.word_count += 1

    def search(self, key):
        # Search key in the trie
        # Returns true if key presents
        # in trie, else false
        pCrawl = self.root
        for ch in key:
            index = self._charToIndex(ch)
            if pCrawl.children[index] is None:
                return False
            pCrawl = pCrawl.children[index]
        # pCrawl is always a node here, so only the end-of-word flag matters
        return pCrawl.isEndOfWord

    def count_before(self, string):
        # NOTE: the question's attempt. It follows the path of `string` and
        # returns the word_count stored on the last node (0 when the path
        # does not exist); it does not yet count the words that sort before
        # `string`.
        cur = self.root
        for b in string:
            index = self._charToIndex(b)
            print(index)  # debug output from the original attempt
            cur = cur.children[index]
            if cur is None:
                return 0
        return cur.word_count
def total_before(text):
    """Insert every string of ``text`` into a new Trie, then return the list
    of ``count_before`` results, one per string and in the same order."""
    t = Trie()
    for word in text:
        t.insert(word)
    # all words are inserted before the first count_before call is made
    return [t.count_before(word) for word in text]
# The trailing comment states the result the asker wants to obtain.
total_before(["bac", "aaa", "baa", "aac"]) # Output will be [3, 0, 2, 1]
I would like to know how can I count the number of words that occur before the given string in the trie that I had created. Can someone give me an idea about it?
As word_count is currently initialised, it does not serve much purpose. It only is non-zero at nodes with isEndOfWord set to True. It would be more useful if it counted the number of words that depend on the current node, i.e. words that either end in that node (which your code counts now), or continue further down the trie (which are currently not counted).
To make that happen, also increment word_count while you descend the trie:
def insert(self, key):
    """Insert ``key`` and add one to ``word_count`` on every node of its path.

    After the call, the ``word_count`` of the root, of each intermediate
    node and of the final node has grown by one, so a node's ``word_count``
    is the number of inserted keys that pass through or end at that node.
    """
    pCrawl = self.root
    for ch in key:
        pCrawl.word_count += 1  # <-------------- added
        index = self._charToIndex(ch)
        if not pCrawl.children[index]:
            pCrawl.children[index] = self.getNode()
        pCrawl = pCrawl.children[index]
    pCrawl.isEndOfWord = True
    pCrawl.word_count += 1
In count_before you would need to sum up all the word_count values of the child nodes that precede the child that you will select, as those represent words that come before the current word:
def count_before(self, string):
    """Return how many stored words sort strictly before ``string``.

    Relies on ``word_count`` being the number of words that pass through or
    end at a node (an insert that increments it on every node of the path).
    """
    count = 0  # used to accumulate the word_counts
    cur = self.root
    for b in string:
        index = self._charToIndex(b)
        # add the word counts of the children that are to the left of this index:
        count += sum(node.word_count for node in cur.children[:index] if node)
        # words that end exactly at cur are proper prefixes of string and so
        # sort before it; they are cur's count minus what continues below cur
        count += cur.word_count - sum(node.word_count for node in cur.children if node)
        cur = cur.children[index]
        if cur is None:
            break
    return count
This line:
# add the word_count of every existing child left of index (None slots are skipped)
count += sum(node.word_count for node in cur.children[:index] if node)
Is a compact way of doing this:
mysum = 0
for node in cur.children[:index]:
    if node:
        mysum += node.word_count
# add to the running total `count` (not to the builtin `sum`)
count += mysum
I think you overcomplicated the problem.
def total_before(lst):
    """Return, for each element of ``lst``, how many elements sort strictly before it."""
    from bisect import bisect_left

    ordered = sorted(lst)  # sort once instead of once per element
    # the leftmost insertion point equals the index of the first occurrence
    return [bisect_left(ordered, el) for el in lst]


print(total_before(["bac", "aaa", "baa", "aac"]))
Output:
[3, 0, 2, 1]

Storing word count in the python trie

I took a list of words and put it into a trie. I would also like to store word count inside for further analysis. What would be the best way to do it? This is the class where I think the frequency would be collected and stored, but I am not sure how to go about it. You can see my attempt, last line in insert is where I try to store the count.
class TrieNode:
    """Trie node holding its letter ``k`` and a counter ``v``."""

    def __init__(self, k=''):
        # k defaults to '' so that the root can be built without a letter
        self.v = 0          # incremented by Trie.insert on the node where a word ends
        self.k = k
        self.end = False    # read by all_words(); True when a word ends here
        self.children = {}

    def all_words(self, prefix):
        """Yield every word stored at or below this node."""
        if self.end:
            yield prefix
        for letter, child in self.children.items():
            yield from child.all_words(prefix + letter)


class Trie:
    """Prefix tree built from TrieNode objects."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Insert ``word`` and count the insertion on its final node."""
        curr = self.root
        for letter in word:
            node = curr.children.get(letter)
            if not node:
                node = TrieNode(letter)
                curr.children[letter] = node
            curr = node  # descend, otherwise every letter lands under the root
        curr.end = True
        curr.v += 1

    def insert_many(self, words):
        """Insert every string of the iterable ``words``."""
        for word in words:
            self.insert(word)

    def all_words_beginning_with_prefix(self, prefix):
        """Yield every stored word that starts with ``prefix``."""
        cur = self.root
        for c in prefix:
            cur = cur.children.get(c)
            if cur is None:
                return  # No words with given prefix
        yield from cur.all_words(prefix)
I want to store the count so that when I use
# Materialise the generator so that the matches are printed as a list.
print(list(trie.all_words_beginning_with_prefix('prefix')))
I would get a result like so:
[(word, count), (word, count)]
While inserting, on seeing any node, it means there's a new word going to be added in that path. Therefore increment your word_count of that node.
class TrieNode:
    """Trie node that records how many inserted words share its prefix."""

    def __init__(self, char):
        self.char = char
        self.word_count = 0   # number of inserted words that pass through or end here
        self.is_word = False  # True when an inserted word ends exactly here
        self.children = {}

    def all_words(self, prefix, path):
        """Yield ``prefix + path`` for every word stored at or below this node.

        ``path`` holds the letters walked since the node that matched
        ``prefix``. The end-of-word flag is used instead of "has no
        children", so a word that is a prefix of another word is reported
        too, and a node that ends no word yields nothing.
        """
        if self.is_word:
            yield prefix + path
        for letter, child in self.children.items():
            yield from child.all_words(prefix, path + letter)


class Trie:
    """Prefix tree with per-prefix word counts."""

    def __init__(self):
        self.root = TrieNode('')

    def insert(self, word):
        """Insert ``word`` and count it on every node of its path, root included."""
        curr = self.root
        curr.word_count += 1
        for letter in word:
            node = curr.children.get(letter)
            if node is None:
                node = TrieNode(letter)
                curr.children[letter] = node
            curr = node
            curr.word_count += 1  # increment it everytime the node is seen at particular level.
        curr.is_word = True

    def insert_many(self, words):
        """Insert every string of the iterable ``words``."""
        for word in words:
            self.insert(word)

    def all_words_beginning_with_prefix(self, prefix):
        """Yield every stored word that starts with ``prefix``."""
        cur = self.root
        for c in prefix:
            cur = cur.children.get(c)
            if cur is None:
                return  # No words with given prefix
        yield from cur.all_words(prefix, path="")

    def word_count(self, prefix):
        """Return the number of inserted words that start with ``prefix``."""
        cur = self.root
        for c in prefix:
            cur = cur.children.get(c)
            if cur is None:
                return 0
        return cur.word_count
# Demo: list the words under a prefix (lazy) and read the stored counter (proactive).
trie = Trie()
trie.insert_many(["hello", "hi", "random", "heap"])
prefix = "he"
words = list(trie.all_words_beginning_with_prefix(prefix))
lazy_report = "Lazy method:\n Prefix: %s, Words: %s, Count: %d" % (prefix, words, len(words))
print(lazy_report)
proactive_report = "Proactive method:\n Word count for '%s': %d" % (prefix, trie.word_count(prefix))
print(proactive_report)
Output:
Lazy method:
Prefix: he, Words: ['hello', 'heap'], Count: 2
Proactive method:
Word count for 'he': 2
I would add a field called is_word to the trie node, where is_word would be true only for the last letter in the word. Like you have word AND, is_word would be true for the trie node holding the letter D. And I would update frequency for only nodes that have is_word to be true, not for every letter in the word.
So when you iterate from a letter, check if it is a word, if it is, stop the iteration, return the count and the word. I’m assuming in your iteration you keep track of the letters, and keep adding them to the prefix.
Your trie is a multi-way trie.

How to make depth search in Trie?

I wrote my Trie solution, where I used defaultdict. The task is to find all words with prefix.
The format must be like {of:[of, often, offensive]}
Here my Trie class:
from collections import defaultdict
def _trie():
    """Return a defaultdict whose missing keys are filled with nested tries."""
    return defaultdict(_trie)


# key stored in a node to mark that a word ends there
TERMINAL = None


class Trie(object):
    """Trie stored as nested defaultdicts keyed by letter."""

    def __init__(self):
        self.trie = _trie()

    def addWord(self, word):
        """Insert ``word`` and mark its last node with the TERMINAL key."""
        trie = self.trie
        for letter in word:
            trie = trie[letter]
        # indexing alone is enough: the defaultdict creates the TERMINAL entry
        trie[TERMINAL]

    def search(self, word, trie=None):
        """Return the sub-trie reached by ``word``, or False if the path is missing.

        The walk starts at ``trie`` when it is given, otherwise at the root.
        """
        if trie is None:
            trie = self.trie
        for letter in word:
            if letter in trie:
                trie = trie[letter]
            else:
                return False
        return trie
Here The example:
# use a lowercase name for the instance so the Trie class is not overwritten
trie = Trie()
trie.addWord('of')
trie.addWord('often')
trie.addWord('offensive')
string = 'of'
s = dict(trie.search(string))
They give the result:
Here is a version that does a depth-first search:
from collections import defaultdict
class TrieNode:
    """Trie node that also remembers the prefix spelled by its path."""

    def __init__(self):
        self.child = defaultdict(TrieNode)  # children are created on first access
        self.is_word = False                # True when a word ends at this node
        self.words = ""                     # prefix leading to this node, set by Trie.insert


class Trie:
    """Prefix tree that is searched with an explicit stack."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Insert ``word``, storing on each visited node the prefix that reaches it."""
        cur = self.root
        for i, char in enumerate(word):
            cur = cur.child[char]
            cur.words = word[:i + 1]
        cur.is_word = True

    def search(self, word):
        """Return the sorted list of stored words that start with ``word``."""
        cur = self.root
        for char in word:
            # .get() does not create the missing child, unlike indexing
            cur = cur.child.get(char)
            if cur is None:
                return []
        # depth-first walk of the subtree below the prefix node
        stack = [cur]
        res = []
        while stack:
            node = stack.pop()
            if node.is_word:
                res.append(node.words)
            stack.extend(node.child.values())
        return sorted(res)
# use a lowercase name for the instance so the Trie class is not overwritten
trie = Trie()
trie.insert('of')
trie.insert('often')
trie.insert('offensive')
trie.insert('offensive2')
trie.search('o')
# ['of', 'offensive', 'offensive2', 'often']

Trie Implementation in Python -- Print Keys

I implemented a Trie data structure in Python; the problem is that it doesn't display the keys that are stored in the Trie.
class Node:
    """Trie node with one child slot per lowercase letter."""

    def __init__(self):
        self.children = [None] * 26
        self.endOfTheWord = False  # True when a key ends at this node


class Trie:
    """Array-based trie for keys made of the letters 'a' to 'z'."""

    def __init__(self):
        self.root = self.getNode()

    def getNode(self):
        """Return a new empty Node."""
        return Node()

    def charToIndex(self, ch):
        """Return the offset of ``ch`` from 'a'."""
        return ord(ch) - ord('a')

    def insert(self, word):
        """Insert ``word`` and flag the node where it ends."""
        current = self.root
        for ch in word:
            index = self.charToIndex(ch)
            if current.children[index] is None:
                current.children[index] = self.getNode()
            current = current.children[index]
        current.endOfTheWord = True

    def printKeys(self):
        """Start the recursive printing at the root with an empty letter list."""
        letters = []
        self.printKeysUtil(self.root, letters)

    def printKeysUtil(self, root, str):
        """Recursive helper of printKeys; ``str`` is the list of letters so far."""
        if root.endOfTheWord == True:
            print(''.join(str))
            return
        for i in range(26):
            if root.children[i] is not None:
                # NOTE: this is the line the question is about; it joins the
                # two characters chr(97) and chr(i) rather than building the
                # letter at offset i
                ch = chr(97) + chr(i)
                str.append(ch)
                self.printKeysUtil(root.children[i], str)
                str.pop()
You could perform a pre-order traversal of the nodes, and wherever you find an end-of-word marker, you zoom up to the root, capturing the letters as you go, in order to get the full word... except that to accomplish this, you would need to store the parent node in each node.
def printKeysUtil(self, root, str):
    """Print every key stored at or below ``root``, one per line.

    ``str`` is the list of letters on the path from the trie root down to
    ``root``; it is extended before each recursive call and restored after.
    """
    if root.endOfTheWord:
        print(''.join(str))
        # no return here: a longer key may continue below a node that
        # already ends a key (e.g. "of" and "often")
    for i in range(26):
        if root.children[i] is not None:
            ch = chr(97 + i)  # letter at offset i from 'a'
            str.append(ch)
            self.printKeysUtil(root.children[i], str)
            str.pop()

Categories