Where does the difference in size come from? - python

I created a trie of sorts to store all the words (not definitions) in the English dictionary. The point of it was so that I can get all the words that only contain letters within a given range.
The text file containing all the words is about 2.7 MB, but after creating the tree and writing it to a file using pickle, the file is more than 33 MB.
Where does this difference in size come from? I thought I would be saving space by not needing to store multiple copies of the same letter for different words, e.g. for the words "app" and "apple" I would only need 5 nodes, for a -> p -> p -> l -> e.
My code is as follows:
import pickle
class WordTrieNode:
    """Node of a 26-way trie over the lowercase letters 'a'..'z'.

    Every node keeps a fixed list of 26 child slots (one per letter), a link
    to its parent, the letter it represents and a flag telling whether the
    path from the root to this node spells a complete word.
    """

    def __init__(self, nodeLetter='', parentNode=None, isWordEnding=False):
        self.nodeLetter = nodeLetter
        self.parentNode = parentNode
        self.isWordEnding = isWordEnding
        self.children = [None] * 26  # One entry for each lowercase letter of the alphabet

    def getWord(self):
        """Return the word spelled by the path from the root to this node."""
        if self.parentNode is None:
            return ''
        return self.parentNode.getWord() + self.nodeLetter

    def isEndOfWord(self):
        """Return True if a stored word ends at this node."""
        return self.isWordEnding

    def markEndOfWord(self):
        """Flag this node as the last letter of a stored word."""
        self.isWordEnding = True

    def insertWord(self, word):
        """Insert ``word`` below this node; an empty word is ignored.

        Raises:
            IndexError: if ``word`` contains a character outside 'a'..'z'.
        """
        if not word:
            return
        char = word[0]
        idx = ord(char) - ord('a')
        # Reject anything outside a-z explicitly: a negative index would
        # otherwise wrap around and silently land in another letter's slot.
        if not 0 <= idx < 26:
            raise IndexError(
                "character %r is outside the supported range 'a'..'z'" % char)
        child = self.children[idx]
        if child is None:
            child = WordTrieNode(char, self, False)
            self.children[idx] = child
        if len(word) == 1:
            child.markEndOfWord()
        else:
            child.insertWord(word[1:])

    def getAllWords(self):
        """Print every word stored below this node."""
        for node in self.children:
            if node is not None:
                if node.isEndOfWord():
                    print(node.getWord())
                node.getAllWords()

    def getAllWordsInRange(self, low='a', high='z'):
        """Print the stored words reachable through letters ``low``..``high`` only."""
        i = ord(low) - ord('a')
        j = ord(high) - ord('a')
        # Only the child slots inside [low, high] are followed at every level.
        for node in self.children[i:j + 1]:
            if node is not None:
                if node.isEndOfWord():
                    print(node.getWord())
                node.getAllWordsInRange(low, high)
def main():
    """Load every line of ``en.txt`` into a trie and pickle the trie to ``treeout``."""
    root = WordTrieNode("", None, False)
    with open('en.txt') as word_file:
        for entry in word_file:
            root.insertWord(entry.strip('\n'))
    with open("treeout", 'wb') as sink:
        pickle.dump(root, sink, pickle.HIGHEST_PROTOCOL)
    #root.getAllWordsInRange('a', 'l')
    #root.getAllWords()


if __name__ == "__main__":
    main()

Nodes of a trie are huge as they store a link for all possible next letters. As you can see in the code, every node holds a list of 26 links (children).
More compact schemes are possible (https://en.wikipedia.org/wiki/Trie#Compressing_tries), at the expense of more complexity and slower speed.

Related

Implement a ternary search tree such that it returns the 3 words with highest frequencies from a given input. How?

I have already implemented some code:
from dictionary.base_dictionary import BaseDictionary
from dictionary.word_frequency import WordFrequency
from dictionary.node import Node
# ------------------------------------------------------------------------
# This class is required to be implemented. Ternary Search Tree implementation.
# ------------------------------------------------------------------------
# Ternary-search-tree dictionary: each Node holds one letter plus left/middle/right links.
class TernarySearchTreeDictionary(BaseDictionary):
# Creates a fresh root in self.n and feeds every (word, frequency) pair to add().
def build_dictionary(self, words_frequencies: list[WordFrequency]):
"""
construct the data structure to store nodes
#param words_frequencies: list of (word, frequency) to be stored
"""
self.n = Node()
for i in words_frequencies:
self.add(self.n, i.word, i.frequency)
# TO BE IMPLEMENTED
# Placeholder: always returns 0.
def search(self, word: str) -> int:
"""
search for a word
#param word: the word to be searched
#return: frequency > 0 if found and 0 if NOT found
"""
# TO BE IMPLEMENTED
# place holder for return
return 0
# Only prints the word; nothing is stored.
def add_word_frequency(self,node: Node, word, frequency) -> bool:
print(word)
# Recursive insert: go left/right on a smaller/greater letter, middle on a match.
def add(self,node: Node, word, frequency) -> bool:
"""
add a word and its frequency to the dictionary
#param word_frequency: (word, frequency) to be added
:return: True whether succeeded, False when word is already in the dictionary
"""
if (len(word) == 0):
return False;
char = word[0]
freq = frequency
# An unset node adopts the current letter.
if node.letter is None: node.letter = char
# NOTE(review): the frequency is written on the first visit to ANY node, not only
# on the node of the word's last letter, and is never overwritten afterwards. A node
# created for one word and later marked end_word by another keeps the first value.
if node.frequency is None: node.frequency = freq
if char < node.letter:
if not node.left:
node.left = Node();
# The recursive call's return value is discarded; True is returned regardless.
self.add(node.left,word, frequency)
return True;
elif char > node.letter:
if not node.right:
node.right = Node()
self.add(node.right, word, frequency)
return True;
else:
if len(word) == 1:
node.end_word = True;
# NOTE(review): returns False here even when the word was just added.
return False;
if not node.middle:
node.middle = Node()
self.add(node.middle, word[1:], frequency)
return True;
# Placeholder: always returns False.
def delete_word(self, word: str) -> bool:
"""
delete a word from the dictionary
#param word: word to be deleted
#return: whether succeeded, e.g. return False when point not found
"""
# TO BE IMPLEMENTED
# place holder for return
return False
# Generator yielding prefix + letters for every end_word node reachable from node.
def all_suffixes(self, word, node):
if node.end_word:
yield "{0}{1}".format(word, node.letter)
if node.left:
# NOTE(review): the loop variable rebinds the parameter `word`. Once this loop has
# yielded anything, the prefix used for the right and middle subtrees below is the
# last completion produced here, not the original prefix.
for word in self.all_suffixes(word, node.left):
yield word
if node.right:
# NOTE(review): same rebinding of `word` as in the loop above.
for word in self.all_suffixes(word, node.right):
yield word
if node.middle:
for word in self.all_suffixes(word + node.letter, node.middle):
yield word
# NOTE(review): the `node` parameter is not used in this method.
def autocomplete(self, node, word: str) -> list[WordFrequency]:
"""
return a list of 3 most-frequent words in the dictionary that have 'word' as a prefix
#param word: word to be autocompleted
#return: a list (could be empty) of (at most) 3 most-frequent words with prefix 'word'
"""
from collections import Counter
final_word = ""
# NOTE(review): iterating `word` yields its characters, so the keys are single
# characters and each one is looked up separately, not the prefix as a whole.
final_word = {pattern: set([]) for pattern in word }
for pattern in final_word.keys():
print(f"patterns {pattern}")
word = self.autocomplete_(pattern)
if word == None:
return None
else:
completions = [x for x in word]
# NOTE(review): Counter counts how often each completion string occurs;
# the frequencies stored in the tree are not consulted.
counts = Counter(list(completions))
# Both branches return, so the loop body runs at most once.
return counts
# TO BE IMPLEMENTED
# place holder for return
return []
# Walks the tree along `word`, then delegates to all_suffixes from the node reached.
def autocomplete_(self, word: str) -> list[WordFrequency]:
node = self.n;
for char in word:
while True:
if char > node.letter:
node = node.right
# NOTE(review): this is a separate `if`, not `elif`, so it is evaluated against the
# node assigned just above. After a move to the right the `else` below can fire
# immediately, stepping to the middle child and breaking. `node` is also not
# checked for None before `.letter` is read here or at the top of the loop.
if char < node.letter:
node = node.left
else:
node = node.middle
break
if not node:
return None
return self.all_suffixes(word, node)
# TO BE IMPLEMENTED
# place holder for return
The node class:
class Node:
    """One node of the ternary search tree.

    Attributes:
        letter: letter stored at this node.
        frequency: frequency of the word if this letter is the end of a word.
        end_word: True if this letter is the end of a word.
        left: child holding a letter < ``letter`` (initially None).
        middle: middle child (initially None).
        right: child holding a letter > ``letter`` (initially None).
    """

    def __init__(self, letter=None, frequency=None, end_word=False):
        self.letter, self.frequency, self.end_word = letter, frequency, end_word
        # All three links start out empty.
        self.left = self.middle = self.right = None
The WordFrequency class:
class WordFrequency:
    """Plain record pairing a word with its frequency."""

    def __init__(self, word: str, frequency: int):
        self.word, self.frequency = word, frequency
Now, this code should've worked but instead gives me this output when I give it the starting prefix as "c":
Counter({'calm': 1, 'calmut': 1, 'calmub': 1, 'calmubte': 1, 'calmubts': 1})Counter({'calm': 1, 'calmut': 1, 'calmub': 1, 'calmubte': 1, 'calmubts': 1})
whereas my output (the words with the highest frequencies) should have been cuts, cut, and cub.
where my input is (with the words as WORD and the numbers as FREQUENCY):
cute 10
ant 20
cut 30
cuts 50
apple 300
cub 15
calm 1000
annotation 5
further 40
furniture 500
find 400
farm 5000
farming 1000
farmer 300
appendix 10
apology 600
apologetic 1000
fur 10
fathom 40
apps 60
It seems that it merged some words? Is there a problem with my insert function?
This code basically implements a ternary tree where these functions are followed:
• Build a dictionary from a list of words and frequencies: create a dictionary that stores words
and frequencies taken from a given list (using Add below).
• (A)dd a word and its frequency to the dictionary.
• (S)earch for a word in a dictionary and return its frequency (return 0 if not found).
• (D)elete a word from the dictionary.
• (AC)Auto-complete a given string and return a list of three most frequent words (if any) in the
dictionary that have the string as a prefix.
I am currently at the adding and auto-completing step, it seems I have done something wrong.

Python recursive function to search a binary tree

Very new at Python and I'm trying to understand recursion over a binary tree. I've implemented a very simple tree, which funnily enough maps English characters to binary (1's and 0's). I've only used a very simple structure because I am struggling to get my head round a more complex question that I've been set. I figure if I can get my head round my example then I should be able to go away and look at the question I've been set myself.
The following creates the class BinaryTree and an instance of this
class BinaryTree:
    """A rooted binary tree"""

    def __init__(self):
        # A fresh tree has no item and no subtrees.
        self.root = self.left = self.right = None
def is_empty(testtree: BinaryTree) -> bool:
    """Return True if tree is empty.

    A tree counts as empty when its item and both subtrees are all ``None``.
    Identity checks are used instead of a chained ``== None`` comparison so
    the result does not depend on how the stored item implements ``__eq__``.
    """
    return (
        testtree.root is None
        and testtree.left is None
        and testtree.right is None
    )
def join(item: object, left: BinaryTree, right: BinaryTree) -> BinaryTree:
    """Return a tree with the given root and subtrees."""
    joined = BinaryTree()
    joined.root, joined.left, joined.right = item, left, right
    return joined
# One shared, all-None BinaryTree instance stands in for every empty subtree.
EMPTY = BinaryTree()
# Leaves: both subtrees are EMPTY.
C = join('C',EMPTY,EMPTY)
D = join('D',EMPTY,EMPTY)
E = join('E',EMPTY,EMPTY)
F = join('F',EMPTY,EMPTY)
# join(item, left, right): A gets C on the left and D on the right.
A = join('A',C,D)
B = join('B',E,F)
# NOTE(review): B is passed as the left subtree and A as the right subtree here.
BINARY = join('START',B,A)
I visualise it as follows
Visualisation of the Binary tree
Now I'm trying to create a function that will take two inputs, a BinaryTree and a single character and the output will be the binary code for the corresponding letter (as an example, D = " 10 "). I'm outputting as a string rather than an integer. My function and test case as follows
# global variable
result = ''
#Convert binary to letter
# Searches testtree for letter, accumulating "1"/"0" digits in the module-level `result`.
def convert_letter(testtree: BinaryTree, letter: str) -> str:
global result
if testtree == None:
# Ran off the tree: "not found" is signalled with a bool.
return False
elif testtree.root == letter:
# The letter is at this node: success is signalled with a bool.
return True
else:
# NOTE(review): a successful call further down returns the string `result`, not True,
# so this `== True` test only succeeds directly above the matching node.
if convert_letter(testtree.left, letter) == True:
# The digit is appended at the right-hand end of the digits collected so far.
result += "1"
return result
elif convert_letter(testtree.right, letter) == True:
result += "0"
return result
# When neither comparison is True the function ends here and returns None implicitly.
#Test
test = 'D' #Return '10'
convert_letter(BINARY, test)
And unfortunately that's where I'm hitting a brick wall. I had tried initialising an empty string within the function, but every time the function recurses, it overwrites the string. Any help greatly appreciated.
The problem is that your function will sometimes return a boolean, sometimes a string, and sometimes None. So with this code:
if convert_letter(testtree.left, letter) == True:
result += "1"
return result
elif convert_letter(testtree.right, letter) == True:
result += "0"
return result
... you are not capturing all successful searches, as a successful search would return the actual string of "0" and "1" which obviously is not True. In that case the execution has no else to go to and returns None -- even when the letter was found in a deeper node.
Your function should not return a boolean -- that doesn't match the type hint either. It should be a string (the result). You could reserve None to indicate the letter was not found.
Some other problems:
result += "0" will append the digit, but since you already made the recursive call, you need to prepend the digit -- as you are higher up in the tree now.
The initialisation of your tree makes a different tree than you put in the image: A should be the left child, not the right child. So it should be join('START', A, B)
With those fixes, you'd have this code:
def convert_letter(testtree: BinaryTree, letter: str) -> str:
    """Return the "1"/"0" path from ``testtree`` to ``letter``, or None if absent.

    The path is also left in the module-level ``result``: "1" stands for a
    step into the left subtree, "0" for a step into the right subtree.
    """
    global result
    if testtree is None:
        result = None  # Not found here
        return result
    if testtree.root == letter:
        result = ''  # Found! Start a path
        return result
    # Try the left subtree first, then the right one; a successful recursive
    # call leaves the deeper part of the path in ``result``.
    for subtree, digit in ((testtree.left, "1"), (testtree.right, "0")):
        if convert_letter(subtree, letter) is not None:
            result = digit + result  # Prepend
            return result
    result = None  # Not found here
    return result
If you also correct to use join('START', A, B), then the output will be 10.
Better Practice
There are some things you can do better:
Don't use a global variable for storing the function result. As you return it, you can capture the result you get from a recursive call as a local variable, prepend to it, and return it again.
The definition of EMPTY makes your tree unnecessarily big. Just use None to denote an empty tree.
Don't call a node's value root. A rooted tree has only one root, and it is a node, not a value of a node. So call that attribute value or data, but not root.
The join function is nice, but why not use the constructor for that feature? The constructor can take those arguments as optional and immediately initialise the left and right attributes with those arguments.
The code-comment above the convert_letter function describes the opposite from what the function does.
Taking all that into account, your code could look like this:
class BinaryTree:
    """Binary tree node holding a value and optional left/right subtrees."""

    def __init__(self, value, left: 'BinaryTree'=None, right: 'BinaryTree'=None):
        self.value, self.left, self.right = value, left, right
def convert_letter(tree: BinaryTree, letter: str) -> str:
    """Return the "1"/"0" path from ``tree`` to ``letter``, or None if absent.

    "1" stands for a step into the left subtree, "0" for a step into the
    right subtree; the empty string means the letter is at ``tree`` itself.
    """
    if not tree:
        return None  # Not found here
    if tree.value == letter:
        return ""  # Bingo: return an empty path
    # No global state: each recursive call hands back its own partial path,
    # and the digit for the current step is put in front of it.
    for child, digit in ((tree.left, "1"), (tree.right, "0")):
        path = convert_letter(child, letter)
        if path is not None:
            return digit + path
    return None
# Look how nice it is to create a tree using the constructor arguments
binary = BinaryTree(
    "Start",
    left=BinaryTree("A", left=BinaryTree("C"), right=BinaryTree("D")),
    right=BinaryTree("B", left=BinaryTree("E"), right=BinaryTree("F")),
)

# Test
test = 'D'
print(convert_letter(binary, test))  # 10
I took the liberty of simplifying your code a bit. Let me know if you have any questions about how this works.
class node:
    """A rooted binary tree"""

    def __init__(self, value = None, left = None, right = None):
        self.value, self.left, self.right = value, left, right
# Leaves first, then the inner nodes that reference them.
C = node('C')
D = node('D')
E = node('E')
F = node('F')
# node(value, left, right): A gets C on the left and D on the right.
A = node('A',C,D)
B = node('B',E,F)
# B is the left child of the top node and A the right child.
BINARY = node('START',B,A)
def convert_letter(n,letter):
    """Return one "1"/"0" marker per node of ``n``, node first, then left, then right.

    A node contributes "1" when its value equals ``letter`` and "0" otherwise.
    """
    marker = "1" if n.value == letter else "0"
    left_part = convert_letter(n.left, letter) if n.left is not None else ""
    right_part = convert_letter(n.right, letter) if n.right is not None else ""
    return marker + left_part + right_part
def walk(n):
    """Concatenate the node values of ``n``: the node itself, then left, then right."""
    pieces = [n.value]
    if n.left is not None:
        pieces.append(walk(n.left))
    if n.right is not None:
        pieces.append(walk(n.right))
    return "".join(pieces)
# Print the per-node marker string for 'D', then the node values in the same visiting order.
test = 'D'
print(convert_letter(BINARY, test))
print(walk(BINARY))
This is not how I would personally structure an answer, but I think it most closely follows what you are attempting. The shortcoming of your answer only being that you are only returning one value, but kind of tracking two values. Note, I have taken the liberty of correcting:
BINARY = join('START',A,B)
Let's modify your method to return both a Boolean indicating if the letter was found as well as the indicator of the path.
def convert_letter2(testtree: BinaryTree, letter: str):
    """Return ``(found, path)`` for ``letter`` in ``testtree``.

    ``path`` is the "1"/"0" string leading to the letter ("1" = left,
    "0" = right) and is the empty string whenever ``found`` is False.
    """
    if not testtree:
        return (False, "")
    if testtree.root == letter:
        return (True, "")
    # Left subtree is searched first; the digit for this step goes in front
    # of whatever path the successful recursive call produced.
    for subtree, digit in ((testtree.left, "1"), (testtree.right, "0")):
        found, suffix = convert_letter2(subtree, letter)
        if found:
            return (True, digit + suffix)
    return (False, "")
Then if we:
print(convert_letter2(BINARY, "D")[1])
We should get back "10"

Binary Search Tree Frequency Counter

I need to read a text file, strip the unnecessary punctuation, lowercase the words and use binary search tree function to make a word binary search tree that consists of the words in the file.
We are asked to count the frequency of recurring words and asked for a total word count and total unique word count.
So far I've got the punctuation resolved, file read done, lowercase done, binary search tree basically done and I just need to figure out how to implement the "frequency" counter in the code.
My code is as follows:
# Binary search tree keyed by word; each node is meant to carry an occurrence counter.
class BSearchTree :
class _Node :
def __init__(self, word, left = None, right = None) :
self._word = word
# NOTE(review): the counter is created as `_count`, but insert() below updates `_freq`.
self._count = 0
self._left = left
self._right = right
def __init__(self) :
self._root = None
# `_wordc` and `_each` are initialised here and not touched by the methods shown below.
self._wordc = 0
self._each = 0
def isEmpty(self) :
return self._root == None
# Iterative lookup: returns the node holding `word`, or None.
def search(self, word) :
probe = self._root
while (probe != None) :
if word == probe._word :
return probe
# NOTE(review): reads `probe._value`, while _Node stores the key as `_word`.
if word < probe._value :
probe = probe._left
else :
probe = probe._right
return None
# Iterative insert: walks down to an empty slot and hangs a new node on `parent`.
def insert(self, word) :
if self.isEmpty() :
self._root = self._Node(word)
# NOTE(review): `_freq` is never created by _Node.__init__ (it sets `_count`), and the
# trailing "<- is this correct?" text on the next line is not Python.
self._root._freq += 1 <- is this correct?
return
parent = None #to keep track of parent
#we need above information to adjust
#link of parent of new node later
probe = self._root
while (probe != None) :
if word < probe._word : # go to left tree
parent = probe # before we go to child, save parent
probe = probe._left
elif word > probe._word : # go to right tree
parent = probe # before we go to child, save parent
probe = probe._right
# NOTE(review): when word == probe._word neither branch runs, so `probe` never changes
# and the loop does not terminate; nothing counts a duplicate.
if (word < parent._word) : #new value will be new left child
parent._left = self._Node(word)
else : #new value will be new right child
parent._right = self._Node(word)
Because the formatting is giving me trouble, here is the latter part of the code separately:
class NotPresent(Exception):
    """Exception type declared alongside the search tree; it adds nothing to ``Exception``."""
# Driver: reads from sample.txt, then exercises the tree with a handful of fixed words.
def main():
# NOTE(review): the class shown above is named BSearchTree; `BST` is not defined in the visible code.
t=BST()
file = open("sample.txt")
# Only the first line of the file is read before the file is closed.
line = file.readline()
file.close()
# NOTE(review): `line` is a str, so the loop below would iterate over its characters, not its words.
#for word in line:
# t.insert(word)
# Line above crashes program because there are too many
# words to add. Lines on bottom tests BST class
t.insert('all')
t.insert('high')
t.insert('fly')
t.insert('can')
t.insert('boars')
#t.insert('all') <- how do i handle duplicates by making
# NOTE(review): no inOrder method appears in the class as shown.
t.inOrder() #extras add to the nodes frequency?
Thank you for helping/trying to help!
Firstly, it's better to initialize a Node's _freq to 1 in its constructor than to do that in the BST's insert().
(One more thing: by Python coding convention (PEP 8), spaces around the `=` of a default argument value are not recommended.)
def __init__(self, word, left=None, right=None):
    """Create a node for ``word`` whose occurrence count starts at 1."""
    self._word, self._freq = word, 1
    self._left, self._right = left, right
and just add the last 3 lines:
probe = self._root
while (probe != None) :
if word < probe._word : # go to left tree
parent = probe # before we go to child, save parent
probe = probe._left
elif word > probe._word : # go to right tree
parent = probe # before we go to child, save parent
probe = probe._right
else:
probe._freq += 1
return

Add list as child of tree with python 3

I have looked at many very similar questions and cannot figure it out so:
I have a string like this:
{121{12}12{211}2}
I want to read the string into a tree like this:
I am confused as how to tell python to add a whole list as a child node?
I would also like to know how to change the current node to the parent of the old current node?
Here is my code so far:
class Node:
    """Tree node carrying a value and an ordered list of children."""

    def __init__(self,val):
        # Every node starts with its own empty list of children.
        self.value, self.children = val, []

    def add_child(self, obj):
        """Append ``obj`` to this node's children."""
        self.children += [obj]
# Attempt to parse the brace string character by character.
# NOTE(review): `filedata`, `leftbrace` and `rightbrace` are not defined anywhere in this snippet.
s=[]
for i in filedata:
if i == leftbrace:
# A new Node replaces `n`; no link to the previous `n` is kept.
n = Node(i)
#create new child of current node
s = []
#reset list s to blank
# NOTE(review): this is a separate `if`, so the `else` below pairs with it and the
# left-brace character itself is also appended to `s`.
if i == rightbrace:
n.add_child(s)
#add list s to current node
#make parent of current node the new current node
else:
s.append(i)
#add i to list s
# NOTE(review): the children added above are plain lists, and Node defines `value`, not `data`.
for c in n.children:
print (c.data)
To make something like this work, it is easiest if you use recursion. Here is one way that this can be done.
Code:
class Node:
    """Tree node parsed from a brace-delimited string such as ``{121{12}12{211}2}``.

    ``value`` is the concatenation of the plain characters found directly
    inside this node's braces; ``children`` holds one ``Node`` per nested
    ``{...}`` group, in order of appearance.
    """

    def __init__(self, stream):
        """Consume characters from ``stream`` until this level's closing brace.

        ``stream`` is a character iterator (or any iterable of characters).
        All nested nodes read from the same iterator, so each one picks up
        exactly where its parent stopped.
        """
        stream = iter(stream)  # no-op for an iterator, makes plain strings work
        val = []
        children = []
        for ch in stream:
            if ch == '{':
                # open brace: recurse to a child that reads the same stream
                children.append(type(self)(stream))
            elif ch == '}':
                # close brace: this level is done
                break
            else:
                # any other character belongs to our value
                val.append(ch)
        # falling out of the loop without a break means the stream is exhausted
        self.value = ''.join(val)
        self.children = children

    @classmethod
    def from_string(cls, string):
        """Parse ``string`` and return the node for its single top-level group."""
        tree_top = cls(iter(string))
        # the string must start with '{' and describe exactly one top node
        assert len(tree_top.children) == 1 and tree_top.value == '', \
            "string must consist of exactly one top-level '{...}' group"
        return tree_top.children[0]

    def __str__(self):
        return self.value

    def __repr__(self):
        return "Node('%s', <%d children>)" % (
            self.value, len(self.children))

    def tree_string(self, level=0):
        """Yield one line per node, indented by one space per nesting level."""
        yield '-' + " " * level + str(self)
        for child in self.children:
            yield from child.tree_string(level + 1)
tree = '{121{12}12{211}2}'
# Print the parsed tree, one node per line.
print(*Node.from_string(tree).tree_string(), sep="\n")
Results:
-121122
- 12
- 211

Why does this character ▯ appear?

This character ▯ appears when I run my code. I think it means there is a character that cannot be displayed (I am not sure, so correct me if I am wrong). Basically, I want to get rid of that character. Here is what it looks like when I run my code:
However in the back-end in the idle when I click on one of the boxes for it to be displayed up top it doesn't register and looks like this in idle:
Why does it appear on screen if it isn't going to appear in idle?
Also how can I get rid of the ▯ character from the main screen?
Here is my full code.
Here are segments in which I think the problem lies. (However I have not been able to solve the problem)
My classes for Tree comparison to find the sentences and their frequent use:
class Branch():
    """Tree node holding a value and the number of times it has been seen."""

    def __init__(self, value):
        self.left = self.right = None
        self.value = value
        self.frequency = 1  # a new branch counts as one occurrence

    def incFreq(self):
        """Count one more occurrence of this value."""
        self.frequency += 1

    def freq(self):
        """Return the number of occurrences counted so far."""
        return self.frequency
class Tree():
    """Binary tree of values that counts how often each value is inserted.

    ``highest`` holds the ``(value, frequency)`` pairs collected by the
    traversals; ``found`` is scratch state used by ``exists``.
    """

    # Class-level default kept so ``Tree.highest`` still resolves. Every
    # instance gets its own list in ``__init__`` so a traversal can never
    # append to this shared object.
    highest = []

    def __init__(self):
        self.root = None
        self.found = False
        self.highest = []

    def findHighest(self):
        """Return all ``(value, frequency)`` pairs, most frequent first."""
        from operator import itemgetter
        self.highest = []
        self.inorder(self.root)
        self.highest = sorted(self.highest, key=itemgetter(1), reverse=True)
        return self.highest

    # lessThan function needed to compare strings
    def lessThan(self, a, b):
        """Return True when ``a`` sorts at or before ``b``.

        The first position where the two strings differ decides the result.
        When one string is a prefix of the other (or both are equal) the
        result is True.
        """
        for ch_a, ch_b in zip(a, b):
            if ch_a != ch_b:
                return ch_a < ch_b
        return True

    def outputTree(self):
        """Traverse the whole tree, appending its pairs to ``self.highest``."""
        self.inorder(self.root)

    def insert(self, value):
        """Insert ``value``, or bump its frequency if it is already stored."""
        # exists() increments the frequency as a side effect when it finds the value
        if not self.exists(value):
            self.root = self.insertAtBranch(self.root, value)

    def exists(self, value):
        """Return True if ``value`` is stored; a hit also increments its frequency."""
        # assume the value is absent until the search proves otherwise
        self.found = False
        self.findAtBranch(self.root, value)
        return self.found

    # Used to find a value in a tree
    def findAtBranch(self, branch, value):
        """Search ``branch`` and both of its subtrees for ``value``.

        On a match ``self.found`` is set and the branch's frequency is
        incremented; the subtrees of the matching branch are not searched.
        """
        if branch is None:
            return
        if branch.value == value:
            self.found = True
            branch.incFreq()
        else:
            self.findAtBranch(branch.left, value)
            self.findAtBranch(branch.right, value)

    def insertAtBranch(self, branch, value):
        """Insert ``value`` below ``branch`` and return the subtree's root."""
        if branch is None:
            return Branch(value)
        if self.lessThan(branch.value, value):
            branch.right = self.insertAtBranch(branch.right, value)
        else:
            branch.left = self.insertAtBranch(branch.left, value)
        return branch

    def inorder(self, branch):
        """Append ``(value, frequency)`` for ``branch`` and then for its subtrees.

        The branch itself is recorded before its left and right subtrees.
        """
        if branch is None:
            return
        self.highest.append((branch.value, branch.freq()))
        self.inorder(branch.left)
        self.inorder(branch.right)
This is where I use the tree and pass sentences to be used on a different function:
# Loads every line of setPhrases.txt into a Tree and returns up to numToReturn
# (phrase, frequency) pairs in the order produced by findHighest().
def getPhrases(self, numToReturn):
topPhrases = []
phrasesTree = Tree()
#load tree with phrases from phrase text file
# NOTE(review): the file object is never closed in this method.
file = open('setPhrases.txt', 'r')
for line in file:
# Each line is inserted exactly as read, including any trailing newline character.
phrasesTree.insert(line)
#create a list of the top n of phrases to return
val = 0
# The loop visits every phrase; only the first numToReturn are kept.
for phrase in phrasesTree.findHighest():
if val < numToReturn:
topPhrases.append(phrase)
val = val + 1
return topPhrases
This is where I use the sentences to be able to display them on the screen:
def createPhrases(self):
    """Rebuild the panes: one per top phrase, followed by the three fixed panes."""
    print("createPhrases")
    self.deletePanes()
    # Only the phrase list and the terminal stay visible.
    self.show_keyboard = self.show_words = False
    self.show_phrases = self.show_terminal = True
    for word, count in self.getPhrases(10):
        # The format string has a single placeholder, so only the phrase is shown.
        self.addPane("{}".format(word, count), WORDS)
    for label in ("Boxes", "Keyboard", "OK"):
        self.addPane(label, PHRASE)
    self.drawPanes()
When you read lines from file, newline characters are at the end. pygame's documentation states that:
The text can only be a single line: newline characters are not rendered.
So, you should change this fragment:
file = open('setPhrases.txt', 'r')
for line in file:
phrasesTree.insert(line)
to this:
file = open('setPhrases.txt', 'r')
for line in file:
phrasesTree.insert(line.strip())

Categories