I write this code but I find it very slow and I don't know how to really improve it in term of time. data is a json object with approximately 70 000 key in it. I think the slowest part is the actors part because i'm iterating on a list (which contain at most 3 elements).
genres_number = {}
actors_number = {}
for movie in data:
for genre in data[movie]["genres"]:
if data[movie]["actors"] != None:
for actor in data[movie]["actors"]:
if actor not in actors_number.keys():
actors_number[actor] = 1
else:
actors_number[actor] = actors_number[actor] + 1
if genre not in genres_number.keys():
genres_number[genre] = 1
else:
genres_number[genre] = genres_number[genre] + 1
res = []
res.append(genres_number)
res.append(actors_number)
return res
How does this work for you
from collections import defaultdict
def get_stats(data):
genres_number = defaultdict(int)
actors_number = defaultdict(int)
for movie in data:
actors = movie.get('actors')
if actors:
for actor in actors:
actors_number[actor] += 1
genres = movie.get('genres')
for genre in genres:
genres_number[actor] += 1
res = []
res.append(dict(genres_number))
res.append(dict(actors_number))
return res
Related
I am trying to replace the dot product for loop using something faster like NumPy
I did research on dot product and kind of understand and can get it working with toy data in a few ways in but not 100% when it comes to implementing it for actual use with a data frame.
I looked at these and other SO threads to no luck avoide loop dot product, matlab and dot product subarrays without for loop and multiple numpy dot products without a loop
looking to do something like this which works with toy numbers in np array
u1 =np.array([1,2,3])
u2 =np.array([2,3,4])
v1.dot(v2)
20
u1 =np.array([1,2,3])
u2 =np.array([2,3,4])
(u1 * u2).sum()
20
u1 =np.array([1,2,3])
u2 =np.array([2,3,4])
sum([x1*x2 for x1, x2 in zip (u1, u2)])
20
this is the current working get dot product
I would like to do this with out the for loop
def get_dot_product(self, courseid1, courseid2, unit_vectors):
u1 = unit_vectors[courseid1]
u2 = unit_vectors[courseid2]
dot_product = 0.0
for dimension in u1:
if dimension in u2:
dot_product += u1[dimension] * u2[dimension]
return dot_product
** code**
#!/usr/bin/env python
# coding: utf-8
class SearchRecommendationSystem:
def __init__(self):
pass
def get_bag_of_words(self, titles_lines):
bag_of_words = {}
for index, row in titles_lines.iterrows():
courseid, course_bag_of_words = self.get_course_bag_of_words(row)
for word in course_bag_of_words:
word = str(word).strip() # added
if word not in bag_of_words:
bag_of_words[word] = course_bag_of_words[word]
else:
bag_of_words[word] += course_bag_of_words[word]
return bag_of_words
def get_course_bag_of_words(self, line):
course_bag_of_words = {}
courseid = line['courseid']
title = line['title'].lower()
description = line['description'].lower()
wordlist = title.split() + description.split()
if len(wordlist) >= 10:
for word in wordlist:
word = str(word).strip() # added
if word not in course_bag_of_words:
course_bag_of_words[word] = 1
else:
course_bag_of_words[word] += 1
return courseid, course_bag_of_words
def get_sorted_results(self, d):
kv_list = d.items()
vk_list = []
for kv in kv_list:
k, v = kv
vk = v, k
vk_list.append(vk)
vk_list.sort()
vk_list.reverse()
k_list = []
for vk in vk_list[:10]:
v, k = vk
k_list.append(k)
return k_list
def get_keywords(self, titles_lines, bag_of_words):
n = sum(bag_of_words.values())
keywords = {}
for index, row in titles_lines.iterrows():
courseid, course_bag_of_words = self.get_course_bag_of_words(row)
term_importance = {}
for word in course_bag_of_words:
word = str(word).strip() # extra
tf_course = (float(course_bag_of_words[word]) / sum(course_bag_of_words.values()))
tf_overall = float(bag_of_words[word]) / n
term_importance[word] = tf_course / tf_overall
keywords[str(courseid)] = self.get_sorted_results(term_importance)
return keywords
def get_inverted_index(self, keywords):
inverted_index = {}
for courseid in keywords:
for keyword in keywords[courseid]:
if keyword not in inverted_index:
keyword = str(keyword).strip() # added
inverted_index[keyword] = []
inverted_index[keyword].append(courseid)
return inverted_index
def get_search_results(self, query_terms, keywords, inverted_index):
search_results = {}
for term in query_terms:
term = str(term).strip()
if term in inverted_index:
for courseid in inverted_index[term]:
if courseid not in search_results:
search_results[courseid] = 0.0
search_results[courseid] += (
1 / float(keywords[courseid].index(term) + 1) *
1 / float(query_terms.index(term) + 1)
)
sorted_results = self.get_sorted_results(search_results)
return sorted_results
def get_titles(self, titles_lines):
titles = {}
for index, row in titles_lines.iterrows():
titles[row['courseid']] = row['title'][:60]
return titles
def get_unit_vectors(self, keywords, categories_lines):
norm = 1.884
cat = {}
subcat = {}
for line in categories_lines[1:]:
courseid_, category, subcategory = line.split('\t')
cat[courseid_] = category.strip()
subcat[courseid_] = subcategory.strip()
unit_vectors = {}
for courseid in keywords:
u = {}
if courseid in cat:
u[cat[courseid]] = 1 / norm
u[subcat[courseid]] = 1 / norm
for keyword in keywords[courseid]:
u[keyword] = (1 / float(keywords[courseid].index(keyword) + 1) / norm)
unit_vectors[courseid] = u
return unit_vectors
def get_dot_product(self, courseid1, courseid2, unit_vectors):
u1 = unit_vectors[courseid1]
u2 = unit_vectors[courseid2]
dot_product = 0.0
for dimension in u1:
if dimension in u2:
dot_product += u1[dimension] * u2[dimension]
return dot_product
def get_recommendation_results(self, seed_courseid, keywords, inverted_index, unit_vectors):
courseids = []
seed_courseid = str(seed_courseid).strip()
for keyword in keywords[seed_courseid]:
for courseid in inverted_index[keyword]:
if courseid not in courseids and courseid != seed_courseid:
courseids.append(courseid)
dot_products = {}
for courseid in courseids:
dot_products[courseid] = self.get_dot_product(seed_courseid, courseid, unit_vectors)
sorted_results = self.get_sorted_results(dot_products)
return sorted_results
def Final(self):
print("Reading Title file.......")
titles_lines = open('s2-titles.txt', encoding="utf8").readlines()
print("Reading Category file.......")
categories_lines = open('s2-categories.tsv', encoding = "utf8").readlines()
print("Getting Supported Functions Data")
bag_of_words = self.get_bag_of_words(titles_lines)
keywords = self.get_keywords(titles_lines, bag_of_words)
inverted_index = self.get_inverted_index(keywords)
titles = self.get_titles(titles_lines)
print("Getting Unit Vectors")
unit_vectors = self.get_unit_vectors(keywords=keywords, categories_lines=categories_lines)
#Search Part
print("\n ############# Started Search Query System ############# \n")
query = input('Input your search query: ')
while query != '':
query_terms = query.split()
search_sorted_results = self.get_search_results(query_terms, keywords, inverted_index)
print(f"==> search results for query: {query.split()}")
for search_result in search_sorted_results:
print(f"{search_result.strip()} - {str(titles[search_result]).strip()}")
#ask again for query or quit the while loop if no query is given
query = input('Input your search query [hit return to finish]: ')
print("\n ############# Started Recommendation Algorithm System ############# \n")
# Recommendation ALgorithm Part
seed_courseid = (input('Input your seed courseid: '))
while seed_courseid != '':
seed_courseid = str(seed_courseid).strip()
recom_sorted_results = self.get_recommendation_results(seed_courseid, keywords, inverted_index, unit_vectors)
print('==> recommendation results:')
for rec_result in recom_sorted_results:
print(f"{rec_result.strip()} - {str(titles[rec_result]).strip()}")
get_dot_product_ = self.get_dot_product(seed_courseid, str(rec_result).strip(), unit_vectors)
print(f"Dot Product Value: {get_dot_product_}")
seed_courseid = (input('Input seed courseid [hit return to finish]:'))
if __name__ == '__main__':
obj = SearchRecommendationSystem()
obj.Final()
s2-categories.tsv
courseid category subcategory
21526 Design 3D & Animation
153082 Marketing Advertising
225436 Marketing Affiliate Marketing
19482 Office Productivity Apple
33883 Office Productivity Apple
59526 IT & Software Operating Systems
29219 Personal Development Career Development
35057 Personal Development Career Development
40751 Personal Development Career Development
65210 Personal Development Career Development
234414 Personal Development Career Development
Example of how s2-titles.txt looks
courseidXXXYYYZZZtitleXXXYYYZZZdescription
3586XXXYYYZZZLearning Tools for Mrs B's Science Classes This is a series of lessons that will introduce students to the learning tools that will be utilized throughout the schoXXXYYYZZZThis is a series of lessons that will introduce students to the learning tools that will be utilized throughout the school year The use of these tools serves multiple purposes 1 Allow the teacher to give immediate and meaningful feedback on work that is in progress 2 Allow students to have access to content and materials when outside the classroom 3 Provide a variety of methods for students to experience learning materials 4 Provide a variety of methods for students to demonstrate learning 5 Allow for more time sensitive correction grading and reflections on concepts that are assessed
Evidently unit_vectors is a dictionary, from which you extract to 2 values, u1 and u2.
But what are those? Evidently dicts as well (this iteration would not make sense with a list):
for dimension in u1:
if dimension in u2:
dot_product += u1[dimension] * u2[dimension]
But what is u1[dimension]? A list? An array.
Normally dict are access by key as you do here. There isn't a numpy style "vectorization". vals = list(u1.values()) gets a lists of all values, and conceivably that could be made into an array (if the elements are right)
arr1 = np.array(list(u1.values()))
and a np.dot(arr1, arr2) might work
You'll get the best answers if you give small concrete examples - with real working data (and skip the complex generating code). Focus on the core of the problem, so we can grasp the issue with a 30 second read!
===
Looking more in depth at your dot function; this replicates the core (I think). Initially I missed the fact that you aren't iterating on u2 keys, but rather seeking matching ones.
def foo(dd):
x = 0
u1 = dd['u1']
u2 = dd['u2']
for k in u1:
if k in u2:
x += u1[k]*u2[k]
return x
Then making a dictionary of dictionaries:
In [30]: keys=list('abcde'); values=[1,2,3,4,5]
In [31]: adict = {k:v for k,v in zip(keys,values)}
In [32]: dd = {'u1':adict, 'u2':adict}
In [41]: dd
Out[41]:
{'u1': {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5},
'u2': {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}}
In [42]: foo(dd)
Out[42]: 55
In this case the subdictionaries match, so we get the same value with a simple array dot:
In [43]: np.dot(values,values)
Out[43]: 55
But if u2 was different, with different key/value pairs, and possibly different keys the result will be different. I don't see a way around the iterative access by keys. The sum-of-products part of the job is minor compared to the dictionary access.
In [44]: dd['u2'] = {'e':3, 'f':4, 'a':3}
In [45]: foo(dd)
Out[45]: 18
We could construct other data structures that are more suitable to a fast dot like calculation. But that's another topic.
Modified method
def get_dot_product(self, courseid1, courseid2, unit_vectors):
# u1 = unit_vectors[courseid1]
# u2 = unit_vectors[courseid2]
# dimensions = set(u1).intersection(set(u2))
# dot_product = sum(u1[dimension] * u2.get(dimension, 0) for dimension in dimensions)
u1 = unit_vectors[courseid1]
u2 = unit_vectors[courseid2]
dot_product = sum(u1[dimension] * u2.get(dimension, 0) for dimension in u2)
return dot_product
I am trying to return an array of constructed objects that are build on top of objects that I retrieve from some url plus another fields that I get from another url.
I have an array that consists of two arrays that each has about 8000 objects...
I have tried to make each object construction as a thread however it still takes a lot of time...
Any solution? Here is my code:
def get_all_players_full_data(ea_players_json):
all = []
ea_players_json = list(ea_players_json.values())
for i in range(len(ea_players_json)):
for player_obj in ea_players_json[i]:
all.append(player_obj)
for player_obj in range(len(all)):
all_data = []
with concurrent.futures.ThreadPoolExecutor(len(all)) as executor:
for player_data in all:
future = executor.submit(build_full_player_data_obj, player_data)
print(future.result())
all_data.append(future.result())
def build_full_player_data_obj(ea_player_data):
if ea_player_data.get("c") is not None:
player_full_name = ea_player_data.get("c")
else:
player_full_name = ea_player_data.get("f") + " " + ea_player_data.get("l")
player_id = ea_player_data.get("id")
# go to futhead to find all cards of that player
futhead_url_player_data = f'{FUTHEAD_PLAYER}{player_full_name}'
details_of_specific_player = json.loads(requests.get(futhead_url_player_data).content)
cards_from_the_same_id = []
for player_in_json_futhead in details_of_specific_player:
if player_in_json_futhead["player_id"] == player_id:
rating = player_in_json_futhead["rating"]
specific_card_id = player_in_json_futhead["def_id"]
revision = player_in_json_futhead["revision_type"]
name = player_in_json_futhead["full_name"]
nation = player_in_json_futhead["nation_name"]
position = player_in_json_futhead["position"]
club = player_in_json_futhead["club_name"]
cards_from_the_same_id.append(Player(specific_card_id, name, rating, revision, nation,
position, club))
return cards_from_the_same_id
1.6. Recommend a Movie
Create a function that counts how many keywords are similar in a set of movie reviews
and recommend the movie with the most similar number of keywords.
The solution to this task will require the use of dictionaries.
The film reviews & keywords are in a file called film_reviews.txt, separated by commas.
The first term is the movie name, the remaining terms are the film’s keyword tags (i.e.,
“amazing", “poetic", “scary", etc.).
Function name: similar_movie()
Parameters/arguments: name of a movie
Returns: a list of movies similar to the movie passed as an argument
film_reviews.txt -
7 Days in Entebbe,fun,foreign,sad,boring,slow,romance
12 Strong,war,violence,foreign,sad,action,romance,bloody
A Fantastic Woman,fun,foreign,sad,romance
A Wrinkle in Time,book,witty,historical,boring,slow,romance
Acts of Violence,war,violence,historical,action
Annihilation,fun,war,violence,gore,action
Armed,foreign,sad,war,violence,cgi,fancy,action,bloody
Black '47,fun,clever,witty,boring,slow,action,bloody
Black Panther,war,violence,comicbook,expensive,action,bloody
I think this could work for you
film_data = {'films': {}}
with open('film_reviews.txt', 'r') as f:
for line in f.readlines():
data = line.split(',')
data[-1] = data[-1].strip() # removing new line character
film_data['films'][data[0].lower()] = data[1:]
def get_smilar_movie(name):
if name.lower() in film_data['films'].keys():
original_review = film_data['films'][name.lower()]
similarities = dict()
for key in film_data['films']:
if key == name.lower():
continue
else:
similar_movie_review = set(film_data['films'][key])
overlap = set(original_review) & similar_movie_review
universe = set(original_review) | similar_movie_review
# % of overlap compared to the first movie = output1
output1 = float(len(overlap)) / len(set(original_review)) * 100
# % of overlap compared to the second movie = output2
output2 = float(len(overlap)) / len(similar_movie_review) * 100
# % of overlap compared to universe
output3 = float(len(overlap)) / len(universe) * 100
similarities[output1 + output2 + output3] = dict()
similarities[output1 + output2 + output3]['reviews'] = film_data['films'][key]
similarities[output1 + output2 + output3]['movie'] = key
max_similarity = max(similarities.keys())
movie2 = similarities[max_similarity]
print(name,' reviews ',film_data['films'][name.lower()])
print('similar movie ',movie2)
print('Similarity = {0:.2f}/100'.format(max_similarity/3))
return movie2['movie']
return None
The get_similar_movie function will return the most similar movie from the film_data dict. The function will take a movie name as argument.
I created a dictionary
TTR_of_speakers = defaultdict(int)
TTR_of_speakers = {}
and I wrote a program that does some computations that results in a value called TTR.
for record in cwrecords:
if record["speaker_name"] == "Joe Biden":
text = record["text"]
processed = nlp(text)
textw = [t.lemma_ for t in processed]
N += len(textw)
total_types |= set(textw)
V = len(total_types)
TTR = float(V)/float(N)
How do I put the TTR value into the dictionary with the key as Joe Biden?
I am trying to parse data from a website by inserting the data into a list, but the list comes back empty.
url =("http://www.releasechimps.org/resources/publication/whos-there-md- anderson")
http = urllib3.PoolManager()
r = http.request('Get',url)
soup = BeautifulSoup(r.data,"html.parser")
#print(r.data)
loop = re.findall(r'<td>(.*?)</td>',str(r.data))
#print(str(loop))
newLoop = str(loop)
#print(newLoop)
for x in range(1229):
if "\\n\\t\\t\\t\\t" in loop[x]:
loop[x] = loop[x].replace("\\n\\t\\t\\t\\t","")
list0_v2.append(str(loop[x]))
print(loop[x])
print(str(list0_v2))
Edit: Didn't really have anything else going on, so I made your data format into a nice list of dictionaries. There's a weird <td height="26"> on monkey 111, so I had to change the regex slightly.
Hope this helps you, I did it cause I care about the monkeys man.
import html
import re
import urllib.request
list0_v2 = []
final_list = []
url = "http://www.releasechimps.org/resources/publication/whos-there-md-anderson"
data = urllib.request.urlopen(url).read()
loop = re.findall(r'<td.*?>(.*?)</td>', str(data))
for item in loop:
if "\\n\\t\\t\\t\\t" or "em>" in item:
item = item.replace("\\n\\t\\t\\t\\t", "").replace("<em>", "")\
.replace("</em>", "")
if " " == item:
continue
list0_v2.append(item)
n = 1
while len(list0_v2) != 0:
form = {"n":0, "name":"", "id":"", "gender":"", "birthdate":"", "notes":""}
try:
if list0_v2[5][-1] == '.':
numb, name, ids, gender, birthdate, notes = list0_v2[0:6]
form["notes"] = notes
del(list0_v2[0:6])
else:
raise Exception('foo')
except:
numb, name, ids, gender, birthdate = list0_v2[0:5]
del(list0_v2[0:5])
form["n"] = int(numb)
form["name"] = html.unescape(name)
form["id"] = ids
form["gender"] = gender
form["birthdate"] = birthdate
final_list.append(form)
n += 1
for li in final_list:
print("{:3} {:10} {:10} {:3} {:10} {}".format(li["n"], li["name"], li["id"],\
li["gender"], li["birthdate"], li["notes"]))