Binary Search Tree Frequency Counter - python

I need to read a text file, strip the unnecessary punctuation, lowercase the words and use binary search tree function to make a word binary search tree that consists of the words in the file.
We are asked to count the frequency of recurring words and asked for a total word count and total unique word count.
So far I've got the punctuation resolved, file read done, lowercase done, binary search tree basically done and I just need to figure out how to implement the "frequency" counter in the code.
My code is as follows:
class BSearchTree :
class _Node :
def __init__(self, word, left = None, right = None) :
self._word = word
self._count = 0
self._left = left
self._right = right
def __init__(self) :
self._root = None
self._wordc = 0
self._each = 0
def isEmpty(self) :
return self._root == None
def search(self, word) :
probe = self._root
while (probe != None) :
if word == probe._word :
return probe
if word < probe._value :
probe = probe._left
else :
probe = probe._right
return None
def insert(self, word) :
if self.isEmpty() :
self._root = self._Node(word)
self._root._freq += 1 <- is this correct?
return
parent = None #to keep track of parent
#we need above information to adjust
#link of parent of new node later
probe = self._root
while (probe != None) :
if word < probe._word : # go to left tree
parent = probe # before we go to child, save parent
probe = probe._left
elif word > probe._word : # go to right tree
parent = probe # before we go to child, save parent
probe = probe._right
if (word < parent._word) : #new value will be new left child
parent._left = self._Node(word)
else : #new value will be new right child
parent._right = self._Node(word)
cause formatting is killing me, this is the latter part of it.
class NotPresent(Exception) :
pass
def main():
t=BST()
file = open("sample.txt")
line = file.readline()
file.close()
#for word in line:
# t.insert(word)
# Line above crashes program because there are too many
# words to add. Lines on bottom tests BST class
t.insert('all')
t.insert('high')
t.insert('fly')
t.insert('can')
t.insert('boars')
#t.insert('all') <- how do i handle duplicates by making
t.inOrder() #extras add to the nodes frequency?
Thank you for helping/trying to help!

Firstly, it's better to initialize a Node's _freq by 1 than doing that in in BST's insert()
(1 more: In python coding convention, white spaces in writing default argument values are not recommended.)
def __init__(self, word, left=None, right=None) :
self._word = word
self._freq = 1
self._left = left
self._right = right
and just add the last 3 lines:
probe = self._root
while (probe != None) :
if word < probe._word : # go to left tree
parent = probe # before we go to child, save parent
probe = probe._left
elif word > probe._word : # go to right tree
parent = probe # before we go to child, save parent
probe = probe._right
else:
probe._freq += 1
return

Related

Implement a ternary search tree such that it returns the 3 words with highest frequencies from a given input. How?

I have already implemented some code:
from dictionary.base_dictionary import BaseDictionary
from dictionary.word_frequency import WordFrequency
from dictionary.node import Node
# ------------------------------------------------------------------------
# This class is required to be implemented. Ternary Search Tree implementation.
# ------------------------------------------------------------------------
class TernarySearchTreeDictionary(BaseDictionary):
def build_dictionary(self, words_frequencies: list[WordFrequency]):
"""
construct the data structure to store nodes
#param words_frequencies: list of (word, frequency) to be stored
"""
self.n = Node()
for i in words_frequencies:
self.add(self.n, i.word, i.frequency)
# TO BE IMPLEMENTED
def search(self, word: str) -> int:
"""
search for a word
#param word: the word to be searched
#return: frequency > 0 if found and 0 if NOT found
"""
# TO BE IMPLEMENTED
# place holder for return
return 0
def add_word_frequency(self,node: Node, word, frequency) -> bool:
print(word)
def add(self,node: Node, word, frequency) -> bool:
"""
add a word and its frequency to the dictionary
#param word_frequency: (word, frequency) to be added
:return: True whether succeeded, False when word is already in the dictionary
"""
if (len(word) == 0):
return False;
char = word[0]
freq = frequency
if node.letter is None: node.letter = char
if node.frequency is None: node.frequency = freq
if char < node.letter:
if not node.left:
node.left = Node();
self.add(node.left,word, frequency)
return True;
elif char > node.letter:
if not node.right:
node.right = Node()
self.add(node.right, word, frequency)
return True;
else:
if len(word) == 1:
node.end_word = True;
return False;
if not node.middle:
node.middle = Node()
self.add(node.middle, word[1:], frequency)
return True;
def delete_word(self, word: str) -> bool:
"""
delete a word from the dictionary
#param word: word to be deleted
#return: whether succeeded, e.g. return False when point not found
"""
# TO BE IMPLEMENTED
# place holder for return
return False
def all_suffixes(self, word, node):
if node.end_word:
yield "{0}{1}".format(word, node.letter)
if node.left:
for word in self.all_suffixes(word, node.left):
yield word
if node.right:
for word in self.all_suffixes(word, node.right):
yield word
if node.middle:
for word in self.all_suffixes(word + node.letter, node.middle):
yield word
def autocomplete(self, node, word: str) -> list[WordFrequency]:
"""
return a list of 3 most-frequent words in the dictionary that have 'word' as a prefix
#param word: word to be autocompleted
#return: a list (could be empty) of (at most) 3 most-frequent words with prefix 'word'
"""
from collections import Counter
final_word = ""
final_word = {pattern: set([]) for pattern in word }
for pattern in final_word.keys():
print(f"patterns {pattern}")
word = self.autocomplete_(pattern)
if word == None:
return None
else:
completions = [x for x in word]
counts = Counter(list(completions))
return counts
# TO BE IMPLEMENTED
# place holder for return
return []
def autocomplete_(self, word: str) -> list[WordFrequency]:
node = self.n;
for char in word:
while True:
if char > node.letter:
node = node.right
if char < node.letter:
node = node.left
else:
node = node.middle
break
if not node:
return None
return self.all_suffixes(word, node)
# TO BE IMPLEMENTED
# place holder for return
The node class:
class Node:
def __init__(self, letter=None, frequency=None, end_word=False):
self.letter = letter # letter stored at this node
self.frequency = frequency # frequency of the word if this letter is the end of a word
self.end_word = end_word # True if this letter is the end of a word
self.left = None # pointing to the left child Node, which holds a letter < self.letter
self.middle = None # pointing to the middle child Node
self.right = None # pointing to the right child Node, which holds a letter > self.letter
The WordFrequency class:
class WordFrequency:
def __init__(self, word: str, frequency: int):
self.word = word
self.frequency = frequency
Now, this code should've worked but instead gives me this output when I give it the starting prefix as "c":
Counter({'calm': 1, 'calmut': 1, 'calmub': 1, 'calmubte': 1, 'calmubts': 1})Counter({'calm': 1, 'calmut': 1, 'calmub': 1, 'calmubte': 1, 'calmubts': 1})
whereas my output (with the heighest frequency) should've been cuts, cute, and cut.
where my input is (with the words as WORD and the numbers as FREQUENCY):
cute 10
ant 20
cut 30
cuts 50
apple 300
cub 15
calm 1000
annotation 5
further 40
furniture 500
find 400
farm 5000
farming 1000
farmer 300
appendix 10
apology 600
apologetic 1000
fur 10
fathom 40
apps 60
It seems that it merged some words? Is there a problem with my insert function?
This code basically implements a ternary tree where these functions are followed:
• Build a dictionary from a list of words and frequencies: create a dictionary that stores words
and frequencies taken from a given list (using Add below).
• (A)dd a word and its frequency to the dictionary.
• (S)earch for a word in a dictionary and return its frequency (return 0 if not found).
• (D)elete a word from the dictionary.
• (AC)Auto-complete a given string and return a list of three most frequent words (if any) in the
dictionary that have the string as a prefix.
I am currently at the adding and auto-completing step, it seems I have done something wrong.

Count the number of words that appear before

I would like to ask how can we count the number of words that occur alphabetically before the given string in the trie?
Here is my implementation now.
class TrieNode:
# Trie node class
def __init__(self):
self.children = [None] * 26
# isEndOfWord is True if node represent the end of the word
self.isEndOfWord = False
self.word_count = 0
class Trie:
# Trie data structure class
def __init__(self):
self.root = self.getNode()
def getNode(self):
# Returns new trie node (initialized to NULLs)
return TrieNode()
def _charToIndex(self, ch):
# private helper function
# Converts key current character into index
# use only 'a' through 'z' and lower case
return ord(ch) - ord('a')
def insert(self, key):
# If not present, inserts key into trie
# If the key is prefix of trie node,
# just marks leaf node
pCrawl = self.root
length = len(key)
for level in range(length):
index = self._charToIndex(key[level])
# if current character is not present
if not pCrawl.children[index]:
pCrawl.children[index] = self.getNode()
pCrawl = pCrawl.children[index]
# mark last node as leaf
pCrawl.isEndOfWord = True
pCrawl.word_count += 1
def search(self, key):
# Search key in the trie
# Returns true if key presents
# in trie, else false
pCrawl = self.root
length = len(key)
for level in range(length):
index = self._charToIndex(key[level])
if not pCrawl.children[index]:
return False
pCrawl = pCrawl.children[index]
return pCrawl is not None and pCrawl.isEndOfWord
def count_before(self, string):
cur = self.root
for b in string:
index = self._charToIndex(b)
print(index)
cur = cur.children[index]
if cur is None:
return 0
return cur.word_count
def total_before(text):
t = Trie()
for i in range(len(text)):
t.insert(text[i])
a_list = [] # A list to store the result that occur before the text[i]
for i in range(len(text)):
result = t.count_before(text[i])
a_list.append(result)
return a_list
total_before(["bac", "aaa", "baa", "aac"]) # Output will be [3, 0, 2, 1]
I would like to know how can I count the number of words that occur before the given string in the trie that I had created. Can someone give me an idea about it?
As word_count is currently initialised, it does not serve much purpose. It only is non-zero at nodes with isEndOfWord set to True. It would be more useful if it counted the number of words that depend on the current node, i.e. words that either end in that node (which your code counts now), or continue further down the trie (which are currently not counted).
To make that happen, also increment word_count while you descend the trie:
def insert(self, key):
pCrawl = self.root
length = len(key)
for level in range(length):
pCrawl.word_count += 1 # <-------------- added
index = self._charToIndex(key[level])
if not pCrawl.children[index]:
pCrawl.children[index] = self.getNode()
pCrawl = pCrawl.children[index]
pCrawl.isEndOfWord = True
pCrawl.word_count += 1
In count_before you would need to sum up all the word_count values of the child nodes the precede the child that you will select, as those represent words that come before the current word:
def count_before(self, string):
count = 0 # used to accumulate the word_counts
cur = self.root
for b in string:
index = self._charToIndex(b)
# add the word counts of the children that are to the left of this index:
count += sum(node.word_count for node in cur.children[:index] if node)
cur = cur.children[index]
if cur is None:
break
return count
This line:
count += sum(node.word_count for node in cur.children[:index] if node)
Is a compact way of doing this:
mysum = 0
for node in cur.children[:index]:
if node:
mysum += node.word_count
sum += mysum
I think you overcomplicated the problem.
def total_before(lst):
return [sorted(lst).index(el) for el in lst]
print(total_before(["bac", "aaa", "baa", "aac"]))
Output:
[3, 0, 2, 1]

Building a Binary Search Tree from a file

I have a text file of lines in the format
2 0 0
7 0 0
4 1 1
10 0 0
9 0 1
8 1 1
These lines represent the data in a binary search tree where the first element is the node data, the second is whether or not a left child exists ( 0 if no, 1 if yes) and the third is whether or not a right child exists (0 if no, 1 if yes)
I have a class called "BinarySearchTree" which has the following initialization function
def __init__(self, value=None):
# Initializes the tree with a value node, a left child and a right child
self.leftChild = None
self.rightChild = None
self.height = 1
self.value = value
I also have a stack class with the following "push" and "pop" functions:
def push(self, item):
# Adds an item to the beginning of the stack
ending = self.stack
self.stack = [item] + [ending]
def pop(self):
# Removes and returns the first element from the stack
if self.isEmpty():
return None
top_element = self.stack[0]
self.stack = self.stack[1:]
return top_element
I am trying to create a binary search tree instance from the lines in the text file and using the stack class. So far I have:
def loadTreeFromFile(filename):
binarySearchTree = stack.Stack()
with open(filename) as file:
# gets a list containing only the elements in the txt file
for level in file.readlines():
nodeInfo = level.rstrip().split()
data, lc, rc = int(nodeInfo[0]), int(nodeInfo[1]), int(nodeInfo[2])
print(data, lc, rc)
if rc == 1:
right_tree = binarySearchTree.pop()
if lc == 1:
left_tree = binarySearchTree.pop()
newTree = BinarySearchTree(data)
if rc == 1:
newTree.rightChild = right_tree
if lc == 1:
newTree.leftChild = left_tree
binarySearchTree.push(newTree)
return newTree
I am running into the problem when I try to display the BST, I get 8: [[[<__main__.BinarySearchTree object at 0x1033e4390>, []]], 9: [None, 10: [None, None]]] (I have a display function written for the BST class so this is not the problem) AND when I try to do anything with this newly created BST (such as get the depth, search it, etc), I get errors. Any help is much appreciated, thanks .

Add list as child of tree with python 3

I have looked at many very similar questions and cannot figure it out so:
I have a string like this:
{121{12}12{211}2}
I want to read the string into a tree like this:
I am confused as how to tell python to add a whole list as a child node?
I would also like to know how to change the current node to the parent of the old current node?
Here is my code so far:
class Node:
def __init__(self,val):
self.value = val
self.children = []
#init Node class so we can pass in values as nodes and set children to empty list
def add_child(self, obj):
self.children.append(obj)
s=[]
for i in filedata:
if i == leftbrace:
n = Node(i)
#create new child of current node
s = []
#reset list s to blank
if i == rightbrace:
n.add_child(s)
#add list s to current node
#make parent of current node the new current node
else:
s.append(i)
#add i to list s
for c in n.children:
print (c.data)
To make something like this work, it is easiest if you use recursion. Here is one way that this can be done.
Code:
class Node:
def __init__(self, stream):
val = []
children = []
while True:
try:
# get the next character from the stream
ch = next(stream)
# if this is an open brace, then recurse to a child
if ch == '{':
children.append(Node(stream))
# if this is a close brace, we are done on this level
elif ch == '}':
break
# otherwise add this character to our value
else:
val.append(ch)
# stream is empty, we are done
except StopIteration:
break
self.value = ''.join(val)
self.children = children
#classmethod
def from_string(cls, string):
stream = iter(string)
tree_top = Node(stream)
# assert that the string started with '{' and was one top node
assert len(tree_top.children) == 1 and tree_top.value == ''
return tree_top.children[0]
def __str__(self):
return self.value
def __repr__(self):
return "Node('%s', <%d children>)" % (
self.value, len(self.children))
def tree_string(self, level=0):
yield '-' + " " * level + str(self)
for child in self.children:
for child_string in child.tree_string(level+1):
yield child_string
tree = '{121{12}12{211}2}'
for line in Node.from_string(tree).tree_string():
print(line)
Results:
-121122
- 12
- 211

Where does the difference in size come from?

I created a trie of sorts to store all the words (not definitions) in the English dictionary. The point of it was so that I can get all the words that only contain letters within a given range.
The text file containing all the words is about 2.7 mb, but after creating the tree and writing it to a file using pickle, the file is >33 mb.
Where does this difference in size come from? I thought I would be saving space by not needing to store multiple copies of the same letter for different word, e.g for the words app and apple I would only need 5 nodes, for a -> p -> p -> l -> e.
My code is as follows:
import pickle
class WordTrieNode:
def __init__(self, nodeLetter='', parentNode=None, isWordEnding=False):
self.nodeLetter = nodeLetter
self.parentNode = parentNode
self.isWordEnding = isWordEnding
self.children = [None]*26 # One entry for each lowercase letter of the alphabet
def getWord(self):
if(self.parentNode is None):
return ''
return self.parentNode.getWord() + self.nodeLetter
def isEndOfWord(self):
return self.isWordEnding
def markEndOfWord():
self.isWordEnding = True
def insertWord(self, word):
if(len(word) == 0):
return
char = word[0]
idx = ord(char) - ord('a')
if(len(word) == 1):
if(self.children[idx] is None):
node = WordTrieNode(char, self, True)
self.children[idx] = node
else:
self.children[idx].markEndOfWord()
else:
if(self.children[idx] is None):
node = WordTrieNode(char, self, False)
self.children[idx] = node
self.children[idx].insertWord(word[1:])
else:
self.children[idx].insertWord(word[1:])
def getAllWords(self):
for node in self.children:
if node is not None:
if node.isEndOfWord():
print(node.getWord())
node.getAllWords()
def getAllWordsInRange(self, low='a', high='z'):
i = ord(low) - ord('a')
j = ord(high) - ord('a')
for node in self.children[i:j+1]:
if node is not None:
if node.isEndOfWord():
print(node.getWord())
node.getAllWordsInRange(low, high)
def main():
tree = WordTrieNode("", None, False)
with open('en.txt') as file:
for line in file:
tree.insertWord(line.strip('\n'))
with open("treeout", 'wb') as output:
pickle.dump(tree, output, pickle.HIGHEST_PROTOCOL)
#tree.getAllWordsInRange('a', 'l')
#tree.getAllWords()
if __name__ == "__main__":
main()
Nodes of a trie are huge as they store a link for all possible next letters. As you can see in the code, every node holds a list of 26 links (children).
More compact schemes are possible (https://en.wikipedia.org/wiki/Trie#Compressing_tries), at the expense of more complexity and slower speed.

Categories