I'm given a .txt file with a total of 1 million songs and their respective authors. My given task is to write a HashMap in python which can store this information using either the Author or the Song as a key. I've written a HashMap that works but is running incredibly slow, taking up to 2 minutes to finish. (Expected time is apparently a few seconds at most, according to my tutor)
For collision handling I decided to use linked lists as from what I've gathered it's an effective way to handle collisions without drastically reducing performance.
from HashNode import HashNode
class HashTabell:
def __init__(self, size):
self.size = size
self.dict = {}
self.krock = 0
def store(self, nyckel, data):
hashval = self.__hash(nyckel)
## Shit is empty
if self.dict.get(hashval) != None:
get_val = self.dict[hashval]
## Is list, append normally
if isinstance(get_val, list):
list2 = self.dict[hashval]
found = False
for (k, val) in enumerate(list2):
if val.get_nyckel == nyckel:
list[k] = HashNode(nyckel, data) ## Update old value
found = True
break
if found:
self.dict[hashval] = list2
else:
self.dict[hashval] = get_val + [HashNode(nyckel, data)]
self.krock += 1
else:
## Create list
if get_val.get_nyckel() == nyckel:
self.dict[hashval] = HashNode(nyckel, data) ## Update old value
else:
self.dict[hashval] = [get_val, HashNode(nyckel, data)] ## Append to existing node
self.krock += 1
else:
self.dict[hashval] = HashNode(nyckel, data)
def __getitem__(self, nyckel):
return search(nyckel)
def __contains__(self, nyckel):
return (search(nyckel) != None)
def search(self, nyckel):
hashval = self.__hash(nyckel)
## Get val
get_val = self.dict.get(hashval)
if get_val == None:
raise KeyError("Key not found")
## Check if has multiple entries or not
if isinstance(get_val, list):
## Multiple
for value in get_val:
if(get_val.get_nyckel() == nyckel):
return get_val
raise KeyError("Key not found")
else:
## Single
if get_val.get_nyckel() == nyckel:
return get_val
else:
raise KeyError("Key not found")
## Hash function
def __hash(self, input):
inp = str(input) ## Get chars
value = 0
for k in input:
value += ord(k)
return (value % self.size)
def get_dict(self):
return self.dict
def get_krock(self):
return self.krock
Where the HashNode class is simply:
class HashNode:
def __init__(self, nyckel, data):
self.nyckel = nyckel
self.data = data
def get_nyckel(self):
return self.nyckel
def get_data(self):
return self.data
I've been staring myself blind with this issue for the past 2 weeks and I'm not getting any help from my lecturer/assistants, would greatly appreciate any advice on how to improve the speed.
Related
I have a Binary Search Tree and I am trying to trace recursively in order through the tree and append each key,value to a list. It is only appending the first key,value to the list and not going through the list in order. I pasted my code below, along with the test code I used at the bottom. Any help on how to get past this issue is super appreciated!
class TreeMap:
class Node:
def __init__(self, key, value):
self.key = key
self.value = value
self.left = None
self.right = None
def __init__(self):
self.root = None
self.numsearches = 0
self.numcomparisons = 0
def add(self, newkey, newvalue):
newkey = newkey.lower()
if self.root == None:
self.root = TreeMap.Node(newkey, newvalue)
else:
TreeMap.add_helper(self.root, newkey, newvalue)
def add_helper(thisnode, newkey, newvalue):
if newkey <= thisnode.key:
if thisnode.left == None:
thisnode.left = TreeMap.Node(newkey, newvalue)
else:
TreeMap.add_helper(thisnode.left, newkey, newvalue)
else:
if thisnode.right == None:
thisnode.right = TreeMap.Node(newkey, newvalue)
else:
TreeMap.add_helper(thisnode.right, newkey, newvalue)
def print(self):
TreeMap.print_helper(self.root, 0)
def print_helper(somenode, indentlevel):
if somenode == None:
print(" "*(indentlevel),"---")
return
if not TreeMap.isleaf(somenode):
TreeMap.print_helper(somenode.right, indentlevel + 5)
print(" "*indentlevel + str(somenode.key) + ": " +str(somenode.value))
if not TreeMap.isleaf(somenode):
TreeMap.print_helper(somenode.left, indentlevel + 5)
def isleaf(anode):
return anode.left == None and anode.right == None
def listify(self, whichorder="in"):
'''
Returns a list consisting of all the payloads of the tree. (This returns a plain old Python List.)
The order of the payloads is determined by whichorder, which defaults to inorder.
The other possibilities are "pre" and "post".
If the tree is empty, return the empty list.
'''
assert type(whichorder) is str,"Whichorder is a string, and can only be pre, in or post"
assert whichorder in ["pre","in","post"],"Whichorder is a string, and can only be pre, in or post"
return TreeMap.listify_helper(self.root, whichorder)
def listify_helper(somenode, whichorder):
order_list = []
if somenode == None:
return order_list
elif somenode != None and whichorder == 'in':
TreeMap.listify_helper(somenode.left, 'in')
order_list.append(somenode.key+ '='+somenode.value)
TreeMap.listify_helper(somenode.right, 'in')
return order_list
TEST CODE:
import treemap
translator = treemap.TreeMap()
translator.add("cat", "Katze")
translator.add("bird", "Vogel")
translator.add("dog", "Hund")
translator.add("snake", "IDK")
translator.add("bear", "IDK")
translator.add("octopus", "Tintenfisch")
translator.add("horse", "Pferd")
translator.add("zebra", "IDK")
translator.print()
print("---------------------------------------------------")
print (translator.listify())
The problem is here:
def listify_helper(somenode, whichorder):
order_list = []
This function initialises its own local order_list every time it is invoked. Pass order_list as a parameter instead so that the same list is appended to by each recursive invocation.
Alternatively, append each element of the result of the recursive calls of listify_helper to order_list, although this approach could result in unneeded copying.
class _ListNode:
def __init__(self, value, next_):
self._data = value
self._next = next_
return
class List:
def __init__(self):
self._front = None
self._count = 0
return
def _linear_search(self,key):
previous = None
current = self._front
index = 0
while current is not None and key > current._data:
previous = current
current = current._next
index += 1
if current._data != key:
previous = None
current = None
index = -1
return previous, current, index
def __contains__(self, key):
_, _, i = self._linear_search(key)
return i != -1
def append(self, value):
if self._front is None:
self._front = _ListNode(value,None)
else:
self._front._next = _ListNode(value,None)
self._count += 1
l = List()
lst = [1,2,3,4]
i = 0
n = len(lst)
while i < n:
l.append(lst[i])
i += 1
print("{}".format(l.__contains(3))
To explain more, I implement the linear search method and the contains method. The contains method check if the number is in the list or not (returns true or false). Now when I need to check that #3 in the list using contains method, the answer is false!! i cant know what's the problem
Your append method does not walk down the list. It simply always appends to self._front._next if self.front is already present. Meaning the contents at the end of the append loop are the first thing you appended, the last thing you appended and nothing in between.
To correct it walk the list looking for a _next equal to None and append there.
def append(self, value):
if self._front is None:
self._front = _ListNode(value, None)
else:
n = self._front
while n._next is not None:
n = n._next
n._next = _ListNode(value, None)
self._count += 1
You could also define a _str__ method to print the content of the List
e.g.
def __str__(self):
res = []
n = self._front
while n is not None:
res.append(str(n._data))
n = n._next
return ', '.join(res)
This is not a particularly efficient implementation as it builds an intermediate builtin list object.
You also don't need those bare return statements in your methods. You can remove those.
I need help implementing a method for my "MyHashTable" class:
def search(self, search_key):
The method is supposed to use linear probing to handle collision resolution. If the search_key is in the hash table then the method returns the slot number of the slot containing that search_key. If the search_key is not in the hash table, the method returns -1
My class looks like this:
class MyHashTable:
def __init__(self, capacity):
self.capacity = capacity
self.slots = [None] * self.capacity
def __str__(self):
return str(self.slots )
def __len__(self):
count = 0
for i in self.slots:
if i != None:
count += 1
return count
def hash_function(self, key):
i = key % self.capacity
return i
def insert(self, key):
slot = self.hash_function(key)
orig = slot
while True:
if self.slots[slot] is None:
self.slots[slot] = key
return slot
if self.slots[slot] == key:
return -2
slot = (slot + 1) % self.capacity
if slot == orig:
return -1
def search(self, search_key):
Any help or tutorial links would be awesome.
Thanks
You are only using a single list to store all the values, if you wanted a hash table you might use a list of lists where each list was a bucket but if you just want to check if the element is in your hash table with your own code:
def search(self, search_key):
hsh = self.hash_function(search_key)
if self.slots[hsh] is None:
return -1
while hsh < self.capacity:
if self.slots[hsh] == search_key:
return hsh
hsh += 1
return -1
You also have to handle the case where you have multiple collisions so we need at worst to check every element in the hash table to find the correct value:
def search(self, search_key):
hsh = self.hash_function(search_key)
if self.slots[hsh] is None:
return -1
for i in range(self.capacity):
mod = (hsh + i) % self.capacity
if self.slots[mod] == search_key:
return mod
return -1
The first while loop will probe one value over at a time but if we have wrapped around the list from multiple collisions it would miss elements at the start so using range and mod = (hsh + i) % self.capacity makes sure we check all entries like the example below.
m = MyHashTable(5)
m.insert(13) # 13 % 5 = 3
m.insert(73) # 83 % 5 = 3
m.insert(93) # 93 & 5 = 3
print(m.search(13)) # 3
print(m.search(73)) # 4
print(m.search(93)) # 0
print(m.search(2)) # -1
You can make your len method O(1) by keeping track of when you add a unique value to your hash table, there is also a nice wiki page on Open_addressing parts of which you can adopt into your code and it will help you create a proper mapping of keys to values and resized your hash table when needed. If you want to store more than just numbers you need to use a different hash function, I just use hash but you can use whatever you like. Also using in when your hash table is full and the key does not exist will cause an infinite loop so you will need to handle that case:
class MyHashTable:
def __init__(self, capacity):
self.capacity = capacity
self.slots = [None] * self.capacity
self.count = 0
def __str__(self):
return str(self.slots)
def __contains__(self, item):
return self.search(item) != -1
def __len__(self):
return self.count
def hash_function(self, key):
return hash(key) % self.capacity
def find_slot(self, key):
slot = self.hash_function(key)
while self.slots[slot] is not None and self.slots[slot] != key:
slot = (slot + 1) % self.capacity
return slot
def insert(self, key):
slot = self.find_slot(key)
if self.slots[slot] != key:
self.slots[slot] = key
self.count += 1
def search(self, key):
i = self.find_slot(key)
if self.slots[i] is not None:
return i
return -1
Add a __contains__ will also allow you to use in to test for membership:
m = MyHashTable(5)
m.insert("foo")
m.insert(73)
m.insert(93)
m.insert(1)
print(m.search(73))
print(m.search(93))
print(m.search(1))
print(m.search("foo"))
m.insert(73)
print(m.slots)
print(len(m))
print("foo" in m)
print(5 in m)
Output:
3
4
1
0
['foo', 1, None, 73, 93]
4
True
False
My assignment is to implement a map with chaining by creating a hash table with two lists, one called "slots" and one called "data. My code seems to work until the 'G' character. I can't quite pinpoint what is going on here and I have tried debugging.
class HashTable:
def __init__(self):
self.size = 11
self.slots = [None] * self.size
self.data = [None] * self.size
def put(self,key,data):
hashvalue = self.hashfunction(key,len(self.slots))
if self.slots[hashvalue] == None:
self.slots[hashvalue] = list()
self.slots[hashvalue].append(key)
self.data[hashvalue] = list()
self.data[hashvalue].append(data)
else:
if self.slots[hashvalue] != None:
self.data[hashvalue].append(data) #replace
def hashfunction(self,key,size):
return key%size
def get(self,key):
startslot = self.hashfunction(key,len(self.slots))
data = None
stop = False
found = False
position = startslot
while self.slots[position] != None and not found and not stop:
for index in range (len(self.slots[position])):
if self.slots[position][index]== key:
found = True
data = self.data[position][index]
break
position+1
if position == startslot:
stop = True
return data
def __getitem__(self,key):
return self.get(key)
def __setitem__(self,key,data):
self.put(key,data)
## TEST FOR HashTable
h = HashTable() # create new hash table
nums = [1, 3, 5, 50, 1000] # some keys
nums = nums + [ len(h.slots)*i for i in range(20)] # some keys that have same hash
vals = [ chr(x) for x in range(ord('A'),ord('Z')) ] # list of single letters from A to Z
# add key/values
for i in range(len(nums)):
# print("adding (%d, %s)"%(nums[i],vals[i]),end=" ")
h[nums[i]] = vals[i]
for i in range(len(nums)):
key = nums[i]
value = vals[i]
gotValue = h[key]
assert gotValue == value,"expected key: %d to lookup value: %s but got value %s instead " % (key, value, gotValue)
print("\nAll TESTS PASSED")
I found my issue:
I forgot to add a line under:
if self.slots[hashvalue] != None:
that adds the key to the [hashvalue] of slots.
So now I have:
if self.slots[hashvalue] != None:
self.slots[hashvalue].append(key)
self.data[hashvalue].append(data)
It was adding the data value to the corresponding list "data", but not the matching key value to the list "slots"
So this character ▯ appears when I run my code which I think means there is a missing character therefor it can't be displayed. (Not sure correct me if I am wrong) And well basically I want to be able to get rid of that character. Here is what it looks like when I run my code:
However in the back-end in the idle when I click on one of the boxes for it to be displayed up top it doesn't register and looks like this in idle:
Why does it appear on screen if it isn't going to appear in idle?
Also how can I get rid of the ▯ character from the main screen?
Here is my full code.
Here are segments in which I think the problem lies. (However I have not been able to solve the problem)
My classes for Tree comparison to find the sentences and their frequent use:
class Branch():
def __init__(self, value):
self.left = None
self.right = None
self.value = value
self.frequency = 1
def incFreq(self):
self.frequency = self.frequency + 1
def freq(self):
return self.frequency
class Tree():
highest = []
def __init__(self):
self.root = None
self.found = False
def findHighest(self):
from operator import itemgetter, attrgetter
self.highest = []
self.inorder(self.root)
self.highest = sorted(self.highest, key=itemgetter(1), reverse=True)
return self.highest
#lessThan function needed to compare strings
def lessThan(self, a, b):
if len(a) < len(b):
loopCount = len(a)
else:
loopCount = len(b)
for pos in range(0, loopCount):
if a[pos] > b[pos]:
return False
return True
def outputTree(self):
self.inorder(self.root)
def insert(self, value):
#increment freq if already exists, else insert
if not self.exists(value):
self.root = self.insertAtBranch(self.root, value)
def exists(self, value):
#set the class variable found to False to assume it is not there
self.found = False
self.findAtBranch(self.root, value)
return self.found
#Used to fine a value in a tree
def findAtBranch(self, branch, value):
if branch == None:
pass
else:
#print ("[" + branch.value + "][" + value + "]") # Error checking
if branch.value == value:
self.found = True
#print("found " + value)
branch.incFreq()
#print(branch.freq())
else:
self.findAtBranch(branch.left, value)
self.findAtBranch(branch.right, value)
def insertAtBranch(self, branch, value):
if branch == None:
return Branch(value)
else:
if self.lessThan(branch.value, value):
branch.right = self.insertAtBranch(branch.right, value)
else:
branch.left = self.insertAtBranch(branch.left, value)
return branch
def inorder(self, branch):
if branch == None: return
self.highest.append((branch.value, branch.freq()))
#print (branch.value)
#print (branch.freq())
#print(self.highest[0])
self.inorder(branch.left)
self.inorder(branch.right)
This is where I use the tree and pass sentences to be used on a different function:
def getPhrases(self, numToReturn):
topPhrases = []
phrasesTree = Tree()
#load tree with phrases from phrase text file
file = open('setPhrases.txt', 'r')
for line in file:
phrasesTree.insert(line)
#create a list of the top n of phrases to return
val = 0
for phrase in phrasesTree.findHighest():
if val < numToReturn:
topPhrases.append(phrase)
val = val + 1
return topPhrases
This is where I use the sentences to be able to display them on the screen:
def createPhrases(self):
print("createPhrases")
self.deletePanes()
self.show_keyboard = False
self.show_words = False
self.show_phrases = True
self.show_terminal = True
words = self.getPhrases(10)
for word, count in words:
self.addPane("{}".format(word, count), WORDS)
self.addPane("Boxes", PHRASE)
self.addPane("Keyboard", PHRASE)
self.addPane("OK", PHRASE)
self.drawPanes()
When you read lines from file, newline characters are at the end. pygame's documentation states that:
The text can only be a single line: newline characters are not rendered.
So, you should change this fragment:
file = open('setPhrases.txt', 'r')
for line in file:
phrasesTree.insert(line)
to this:
file = open('setPhrases.txt', 'r')
for line in file:
phrasesTree.insert(line.strip())