How can I initialize dictionary in Python where key is an object? - python

I am trying to rephrase the implementation found here. This is what I have so far:
import csv
import math
import random
training_set_ratio = 0.67
training_set = []
test_set = []
class IrisFlower:
def __init__(self, petal_length, petal_width, sepal_length, sepal_width, flower_type):
self.petal_length = petal_length
self.petal_width = petal_width
self.sepal_length = sepal_length
self.sepal_width = sepal_width
self.flower_type = flower_type
def __hash__(self) -> int:
return hash((self.petal_length, self.petal_width, self.sepal_length, self.sepal_width))
def __eq__(self, other):
return (self.petal_length, self.petal_width, self.sepal_length, self.sepal_width) \
== (other.petal_length, other.petal_width, other.sepal_length, other.sepal_width)
def load_data():
with open('dataset.csv') as csvfile:
rows = csv.reader(csvfile, delimiter=',')
for row in rows:
iris_flower = IrisFlower(float(row[0]), float(row[1]), float(row[2]), float(row[3]), row[4])
if random.random() < training_set_ratio:
training_set.append(iris_flower)
else:
test_set.append(iris_flower)
def euclidean_distance(flower_one: IrisFlower, flower_two: IrisFlower):
distance = 0.0
distance = distance + math.pow(flower_one.petal_length - flower_two.petal_length, 2)
distance = distance + math.pow(flower_one.petal_width - flower_two.petal_width, 2)
distance = distance + math.pow(flower_one.sepal_length - flower_two.sepal_length, 2)
distance = distance + math.pow(flower_one.sepal_width - flower_two.sepal_width, 2)
return distance
def get_neighbors(test_flower: IrisFlower):
distances = []
for training_flower in training_set:
dist = euclidean_distance(test_flower, training_flower)
d = dict()
d[training_flower] = dist
print(d)
return
load_data()
get_neighbors(test_set[0])
Currently, print statements in the following code block:
def get_neighbors(test_flower: IrisFlower):
distances = []
for training_flower in training_set:
dist = euclidean_distance(test_flower, training_flower)
d = dict()
d[training_flower] = dist
print(d)
return
will have outputs similar to
{<__main__.IrisFlower object at 0x107774fd0>: 0.25999999999999945}
which is ok. But I do not want to create the dictionary first, and then append the key value, as in:
d = dict()
d[training_flower] = dist
So this is what I am trying:
d = dict(training_flower = dist)
However, it does not seem like the dist method is using the instance, but rather a String, because what I see printed is as follows:
{'training_flower': 23.409999999999997}
{'training_flower': 16.689999999999998}
How do I create the dictionary by using the object as key in one statement?

In your snippet, where you write d = dict(training_flower=dist), "training_flower" is a keyword argument for dict function and not an object. It is equivalent to writing d = {'training_flower': dist}. The only way to create a dictionary with an object as a key is to use the latter syntax:
d = {training_flower: dist}

To directly create a dict with a key which is not a valid keyword, use the {} syntax like:
Code:
d = {training_flower: 'a_value'}
Test Code:
training_flower = 'a key'
d = {training_flower: 'a_value'}
print(d)
Results:
{'a key': 'a_value'}

to initialize a dictionary with an object as a key, (edit: and the string in Stephen's example is an object anyway)
class Flower:
def __repr__(self):
return 'i am flower'
flower1 = Flower()
d = {flower1: 4}
print(d)
outputs
{i am flower: 4}
this is my first post here, and I know I'm late, sorry if it's a duplicate solution. just to show it works with an object.
would upvote Stephen's answer but I can't yet.

Related

Replace dot product for loop Numpy

I am trying to replace the dot product for loop using something faster like NumPy
I did research on dot product and kind of understand and can get it working with toy data in a few ways in but not 100% when it comes to implementing it for actual use with a data frame.
I looked at these and other SO threads to no luck avoide loop dot product, matlab and dot product subarrays without for loop and multiple numpy dot products without a loop
looking to do something like this which works with toy numbers in np array
u1 =np.array([1,2,3])
u2 =np.array([2,3,4])
v1.dot(v2)
20
u1 =np.array([1,2,3])
u2 =np.array([2,3,4])
(u1 * u2).sum()
20
u1 =np.array([1,2,3])
u2 =np.array([2,3,4])
sum([x1*x2 for x1, x2 in zip (u1, u2)])
20
this is the current working get dot product
I would like to do this with out the for loop
def get_dot_product(self, courseid1, courseid2, unit_vectors):
u1 = unit_vectors[courseid1]
u2 = unit_vectors[courseid2]
dot_product = 0.0
for dimension in u1:
if dimension in u2:
dot_product += u1[dimension] * u2[dimension]
return dot_product
** code**
#!/usr/bin/env python
# coding: utf-8
class SearchRecommendationSystem:
def __init__(self):
pass
def get_bag_of_words(self, titles_lines):
bag_of_words = {}
for index, row in titles_lines.iterrows():
courseid, course_bag_of_words = self.get_course_bag_of_words(row)
for word in course_bag_of_words:
word = str(word).strip() # added
if word not in bag_of_words:
bag_of_words[word] = course_bag_of_words[word]
else:
bag_of_words[word] += course_bag_of_words[word]
return bag_of_words
def get_course_bag_of_words(self, line):
course_bag_of_words = {}
courseid = line['courseid']
title = line['title'].lower()
description = line['description'].lower()
wordlist = title.split() + description.split()
if len(wordlist) >= 10:
for word in wordlist:
word = str(word).strip() # added
if word not in course_bag_of_words:
course_bag_of_words[word] = 1
else:
course_bag_of_words[word] += 1
return courseid, course_bag_of_words
def get_sorted_results(self, d):
kv_list = d.items()
vk_list = []
for kv in kv_list:
k, v = kv
vk = v, k
vk_list.append(vk)
vk_list.sort()
vk_list.reverse()
k_list = []
for vk in vk_list[:10]:
v, k = vk
k_list.append(k)
return k_list
def get_keywords(self, titles_lines, bag_of_words):
n = sum(bag_of_words.values())
keywords = {}
for index, row in titles_lines.iterrows():
courseid, course_bag_of_words = self.get_course_bag_of_words(row)
term_importance = {}
for word in course_bag_of_words:
word = str(word).strip() # extra
tf_course = (float(course_bag_of_words[word]) / sum(course_bag_of_words.values()))
tf_overall = float(bag_of_words[word]) / n
term_importance[word] = tf_course / tf_overall
keywords[str(courseid)] = self.get_sorted_results(term_importance)
return keywords
def get_inverted_index(self, keywords):
inverted_index = {}
for courseid in keywords:
for keyword in keywords[courseid]:
if keyword not in inverted_index:
keyword = str(keyword).strip() # added
inverted_index[keyword] = []
inverted_index[keyword].append(courseid)
return inverted_index
def get_search_results(self, query_terms, keywords, inverted_index):
search_results = {}
for term in query_terms:
term = str(term).strip()
if term in inverted_index:
for courseid in inverted_index[term]:
if courseid not in search_results:
search_results[courseid] = 0.0
search_results[courseid] += (
1 / float(keywords[courseid].index(term) + 1) *
1 / float(query_terms.index(term) + 1)
)
sorted_results = self.get_sorted_results(search_results)
return sorted_results
def get_titles(self, titles_lines):
titles = {}
for index, row in titles_lines.iterrows():
titles[row['courseid']] = row['title'][:60]
return titles
def get_unit_vectors(self, keywords, categories_lines):
norm = 1.884
cat = {}
subcat = {}
for line in categories_lines[1:]:
courseid_, category, subcategory = line.split('\t')
cat[courseid_] = category.strip()
subcat[courseid_] = subcategory.strip()
unit_vectors = {}
for courseid in keywords:
u = {}
if courseid in cat:
u[cat[courseid]] = 1 / norm
u[subcat[courseid]] = 1 / norm
for keyword in keywords[courseid]:
u[keyword] = (1 / float(keywords[courseid].index(keyword) + 1) / norm)
unit_vectors[courseid] = u
return unit_vectors
def get_dot_product(self, courseid1, courseid2, unit_vectors):
u1 = unit_vectors[courseid1]
u2 = unit_vectors[courseid2]
dot_product = 0.0
for dimension in u1:
if dimension in u2:
dot_product += u1[dimension] * u2[dimension]
return dot_product
def get_recommendation_results(self, seed_courseid, keywords, inverted_index, unit_vectors):
courseids = []
seed_courseid = str(seed_courseid).strip()
for keyword in keywords[seed_courseid]:
for courseid in inverted_index[keyword]:
if courseid not in courseids and courseid != seed_courseid:
courseids.append(courseid)
dot_products = {}
for courseid in courseids:
dot_products[courseid] = self.get_dot_product(seed_courseid, courseid, unit_vectors)
sorted_results = self.get_sorted_results(dot_products)
return sorted_results
def Final(self):
print("Reading Title file.......")
titles_lines = open('s2-titles.txt', encoding="utf8").readlines()
print("Reading Category file.......")
categories_lines = open('s2-categories.tsv', encoding = "utf8").readlines()
print("Getting Supported Functions Data")
bag_of_words = self.get_bag_of_words(titles_lines)
keywords = self.get_keywords(titles_lines, bag_of_words)
inverted_index = self.get_inverted_index(keywords)
titles = self.get_titles(titles_lines)
print("Getting Unit Vectors")
unit_vectors = self.get_unit_vectors(keywords=keywords, categories_lines=categories_lines)
#Search Part
print("\n ############# Started Search Query System ############# \n")
query = input('Input your search query: ')
while query != '':
query_terms = query.split()
search_sorted_results = self.get_search_results(query_terms, keywords, inverted_index)
print(f"==> search results for query: {query.split()}")
for search_result in search_sorted_results:
print(f"{search_result.strip()} - {str(titles[search_result]).strip()}")
#ask again for query or quit the while loop if no query is given
query = input('Input your search query [hit return to finish]: ')
print("\n ############# Started Recommendation Algorithm System ############# \n")
# Recommendation ALgorithm Part
seed_courseid = (input('Input your seed courseid: '))
while seed_courseid != '':
seed_courseid = str(seed_courseid).strip()
recom_sorted_results = self.get_recommendation_results(seed_courseid, keywords, inverted_index, unit_vectors)
print('==> recommendation results:')
for rec_result in recom_sorted_results:
print(f"{rec_result.strip()} - {str(titles[rec_result]).strip()}")
get_dot_product_ = self.get_dot_product(seed_courseid, str(rec_result).strip(), unit_vectors)
print(f"Dot Product Value: {get_dot_product_}")
seed_courseid = (input('Input seed courseid [hit return to finish]:'))
if __name__ == '__main__':
obj = SearchRecommendationSystem()
obj.Final()
s2-categories.tsv
courseid category subcategory
21526 Design 3D & Animation
153082 Marketing Advertising
225436 Marketing Affiliate Marketing
19482 Office Productivity Apple
33883 Office Productivity Apple
59526 IT & Software Operating Systems
29219 Personal Development Career Development
35057 Personal Development Career Development
40751 Personal Development Career Development
65210 Personal Development Career Development
234414 Personal Development Career Development
Example of how s2-titles.txt looks
courseidXXXYYYZZZtitleXXXYYYZZZdescription
3586XXXYYYZZZLearning Tools for Mrs B's Science Classes This is a series of lessons that will introduce students to the learning tools that will be utilized throughout the schoXXXYYYZZZThis is a series of lessons that will introduce students to the learning tools that will be utilized throughout the school year The use of these tools serves multiple purposes 1 Allow the teacher to give immediate and meaningful feedback on work that is in progress 2 Allow students to have access to content and materials when outside the classroom 3 Provide a variety of methods for students to experience learning materials 4 Provide a variety of methods for students to demonstrate learning 5 Allow for more time sensitive correction grading and reflections on concepts that are assessed
Evidently unit_vectors is a dictionary, from which you extract to 2 values, u1 and u2.
But what are those? Evidently dicts as well (this iteration would not make sense with a list):
for dimension in u1:
if dimension in u2:
dot_product += u1[dimension] * u2[dimension]
But what is u1[dimension]? A list? An array.
Normally dict are access by key as you do here. There isn't a numpy style "vectorization". vals = list(u1.values()) gets a lists of all values, and conceivably that could be made into an array (if the elements are right)
arr1 = np.array(list(u1.values()))
and a np.dot(arr1, arr2) might work
You'll get the best answers if you give small concrete examples - with real working data (and skip the complex generating code). Focus on the core of the problem, so we can grasp the issue with a 30 second read!
===
Looking more in depth at your dot function; this replicates the core (I think). Initially I missed the fact that you aren't iterating on u2 keys, but rather seeking matching ones.
def foo(dd):
x = 0
u1 = dd['u1']
u2 = dd['u2']
for k in u1:
if k in u2:
x += u1[k]*u2[k]
return x
Then making a dictionary of dictionaries:
In [30]: keys=list('abcde'); values=[1,2,3,4,5]
In [31]: adict = {k:v for k,v in zip(keys,values)}
In [32]: dd = {'u1':adict, 'u2':adict}
In [41]: dd
Out[41]:
{'u1': {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5},
'u2': {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}}
In [42]: foo(dd)
Out[42]: 55
In this case the subdictionaries match, so we get the same value with a simple array dot:
In [43]: np.dot(values,values)
Out[43]: 55
But if u2 was different, with different key/value pairs, and possibly different keys the result will be different. I don't see a way around the iterative access by keys. The sum-of-products part of the job is minor compared to the dictionary access.
In [44]: dd['u2'] = {'e':3, 'f':4, 'a':3}
In [45]: foo(dd)
Out[45]: 18
We could construct other data structures that are more suitable to a fast dot like calculation. But that's another topic.
Modified method
def get_dot_product(self, courseid1, courseid2, unit_vectors):
# u1 = unit_vectors[courseid1]
# u2 = unit_vectors[courseid2]
# dimensions = set(u1).intersection(set(u2))
# dot_product = sum(u1[dimension] * u2.get(dimension, 0) for dimension in dimensions)
u1 = unit_vectors[courseid1]
u2 = unit_vectors[courseid2]
dot_product = sum(u1[dimension] * u2.get(dimension, 0) for dimension in u2)
return dot_product

How do I assign dynamic key values in a dictionary in Python?

Environment:
Zipline 1.3.0
miniconda3
windows OS
I am trying to iterate S in data. S.symbol has like 15 values.
When iterating in data for 1 symbol, say ‘spy’ as in below code; i want to create 2keys
(S.symbol + “c”) —-> spyc to hold current value
(S.symbol + “s”) —→ spys to hold a float value.
def before_trading_start(context,data):
print("*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#")
print("Get DateTime: ",get_datetime())
#context is a dictionary
for S in data:
if(S.symbol == 'SPY'):
arr = list(range(1,91))
hist = data.history(S,"price",90,"1d")
price_list = np.log(hist.tolist())
context.spyc = data.current(S,"price")
context.spys = Slope(arr, price_list)
print (context.spyc)
print (S.symbol, context.spys)
else:
continue
############### My failed Version of dynamic naming
def before_trading_start(context,data):
print("*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*")
print("Get DateTime: ",get_datetime())
for S in data:
arr = list(range(1,91))
hist = data.history(S,"price",90,"1d")
price_list = np.log(hist.tolist())
context[S.symbol + "c"] = data.current(S,"price")
context[S.symbol+"s"] = Slope(arr, price_list)
print (context[S.symbol + "c"])
print (S.symbol, context[S.symbol + "s"])
S.symbols = ["Spy","AAPl",'xom','L','T','CSCO','MSFT'..]

ID3 Algorithm in Python

I am trying to plot a decision tree using ID3 in Python. I am really new to Python and couldn't understand the implementation of the following code. I need to know how I can apply this code to my data.
from math import log
import operator
def entropy(data):
entries = len(data)
labels = {}
for feat in data:
label = feat[-1]
if label not in labels.keys():
labels[label] = 0
labels[label] += 1
entropy = 0.0
for key in labels:
probability = float(labels[key])/entries
entropy -= probability * log(probability,2)
return entropy
def split(data, axis, val):
newData = []
for feat in data:
if feat[axis] == val:
reducedFeat = feat[:axis]
reducedFeat.extend(feat[axis+1:])
newData.append(reducedFeat)
return newData
def choose(data):
features = len(data[0]) - 1
baseEntropy = entropy(data)
bestInfoGain = 0.0;
bestFeat = -1
for i in range(features):
featList = [ex[i] for ex in data]
uniqueVals = set(featList)
newEntropy = 0.0
for value in uniqueVals:
newData = split(data, i, value)
probability = len(newData)/float(len(data))
newEntropy += probability * entropy(newData)
infoGain = baseEntropy - newEntropy
if (infoGain > bestInfoGain):
bestInfoGain = infoGain
bestFeat = i
return bestFeat
def majority(classList):
classCount={}
for vote in classList:
if vote not in classCount.keys(): classCount[vote] = 0
classCount[vote] += 1
sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
return sortedClassCount[0][0]
def tree(data,labels):
classList = [ex[-1] for ex in data]
if classList.count(classList[0]) == len(classList):
return classList[0]
if len(data[0]) == 1:
return majority(classList)
bestFeat = choose(data)
bestFeatLabel = labels[bestFeat]
theTree = {bestFeatLabel:{}}
del(labels[bestFeat])
featValues = [ex[bestFeat] for ex in data]
uniqueVals = set(featValues)
for value in uniqueVals:
subLabels = labels[:]
theTree[bestFeatLabel][value] = tree(split/(data, bestFeat, value),subLabels)
return theTree
So what I did after this is the following:
infile=open("SData.csv","r")
data=infile.read()
tree(data)
The error which I got is "1 argument is missing" which is the label which I have to define and this is where I don't know what I have to put. I tried the variable for which I have to make the decision tree but it doesn't work:
tree(data,MinTemp)
Here I get an error "MinTemp is not defined".
Please help me out and let me know what I should do to have a look at the tree.
Following is the part of data and I want to generate a tree for MinTemp
MinTemp,Rainfall,Tempat9,RHat9,CAat9,WSat9
high,no,mild,normal,overcast,weak
high,no,mild,normal,cloudy,weak
high,no,mild,normal,cloudy,mild
high,yes,mild,high,cloudy,weak
high,yes,mild,high,cloudy,mild
medium,yes,mild,high,cloudy,mild
high,no,mild,high,overcast,weak
high,no,mild,normal,sunny,weak
high,no,hot,normal,sunny,weak
high,no,hot,normal,overcast,weak

NLTK output changes when using the same input on the same machine

The next piece of code returns different output with the same input(self.SynSets)
Why can it be happening? Am I doing something wrong? or is it caused by python?
def FilterSynSets(self):
self.filteredSysNets = {}
for synset in self.SysNets:
for subsynset in self.SysNets:
length = wn.path_similarity(synset,subsynset)
if not length is None\
and length !=1\
and length >0:
target = synset.__str__().replace("'","")
source =subsynset.__str__().replace("'","")
connection="\"{}\"->\"{}\" [label={}]".format(
target,
source,
str(round(length,3)))
self.filteredSysNets[connection] = length
oldLength = len(self.filteredSysNets)
avarageVal = sum(self.filteredSysNets.values())/len(self.filteredSysNets)
self.filteredSysNets = {k: v for k,v in self.filteredSysNets.items() if v>=avarageVal}
newLength = len(self.filteredSysNets)
prt = newLength/oldLength*100
print ("avr -{}\n old -{}\n new - {}\n prec={}".format(avarageVal,oldLength,newLength,prt))
return self
Outputs:
http://screencast.com/t/eFMOROfkPXR
http://screencast.com/t/Fdd6ufhA

python generating nested dictionary key error

I am trying to create a nested dictionary from a mysql query but I am getting a key error
result = {}
for i, q in enumerate(query):
result['data'][i]['firstName'] = q.first_name
result['data'][i]['lastName'] = q.last_name
result['data'][i]['email'] = q.email
error
KeyError: 'data'
desired result
result = {
'data': {
0: {'firstName': ''...}
1: {'firstName': ''...}
2: {'firstName': ''...}
}
}
You wanted to create a nested dictionary
result = {} will create an assignment for a flat dictionary, whose items can have any values like "string", "int", "list" or "dict"
For this flat assignment
python knows what to do for result["first"]
If you want "first" also to be another dictionary you need to tell Python by an assingment
result['first'] = {}.
otherwise, Python raises "KeyError"
I think you are looking for this :)
>>> from collections import defaultdict
>>> mydict = lambda: defaultdict(mydict)
>>> result = mydict()
>>> result['Python']['rules']['the world'] = "Yes I Agree"
>>> result['Python']['rules']['the world']
'Yes I Agree'
result = {}
result['data'] = {}
for i, q in enumerate(query):
result['data']['i'] = {}
result['data'][i]['firstName'] = q.first_name
result['data'][i]['lastName'] = q.last_name
result['data'][i]['email'] = q.email
Alternatively, you can use you own class which adds the extra dicts automatically
class AutoDict(dict):
def __missing__(self, k):
self[k] = AutoDict()
return self[k]
result = AutoDict()
for i, q in enumerate(query):
result['data'][i]['firstName'] = q.first_name
result['data'][i]['lastName'] = q.last_name
result['data'][i]['email'] = q.email
result['data'] does exist. So you cannot add data to it.
Try this out at the start:
result = {'data': []};
You have to create the key data first:
result = {}
result['data'] = {}
for i, q in enumerate(query):
result['data'][i] = {}
result['data'][i]['firstName'] = q.first_name
result['data'][i]['lastName'] = q.last_name
result['data'][i]['email'] = q.email

Categories