Implementation of Okapi BM25 in python - python

I am trying to implement Okapi BM25 in python. While I have seen some tutorials how to do it, it seems I am stuck in the process.
So I have collection of documents (and has as columns 'id' and 'text') and queries (and has as columns 'id' and 'text'). I have done the pre-processing steps and I have my documents and queries as a list:
documents = list(train_docs['text']) #put the documents text to list
queries = list(train_queries_all['text']) #put the queries text to list
Then for BM25 I do this:
pip install rank_bm25
#calculate BM25
from rank_bm25 import BM25Okapi
bm25 = BM25Okapi(documents)
#compute the score
bm_score = BM25Okapi.get_scores(documents, query=queries)
But it wouldn't work.
Then I tried to do this:
import math
import numpy as np
from multiprocessing import Pool, cpu_count
nd = len(documents) # corpus_size = 3612 (I am not sure if this is necessary)
class BM25:
def __init__(self, documents, tokenizer=None):
self.corpus_size = len(documents)
self.avgdl = 0
self.doc_freqs = []
self.idf = {}
self.doc_len = []
self.tokenizer = tokenizer
if tokenizer:
documents = self._tokenize_corpus(documents)
nd = self._initialize(documents)
self._calc_idf(nd)
def _initialize(self, documents):
nd = {} # word -> number of documents with word
num_doc = 0
for document in documents:
self.doc_len.append(len(document))
num_doc += len(document)
frequencies = {}
for word in document:
if word not in frequencies:
frequencies[word] = 0
frequencies[word] += 1
self.doc_freqs.append(frequencies)
for word, freq in frequencies.items():
if word not in nd:
nd[word] = 0
nd[word] += 1
self.avgdl = num_doc / self.corpus_size
return nd
def _tokenize_corpus(self, documents):
pool = Pool(cpu_count())
tokenized_corpus = pool.map(self.tokenizer, documents)
return tokenized_corpus
def _calc_idf(self, nd):
raise NotImplementedError()
def get_scores(self, queries):
raise NotImplementedError()
def get_top_n(self, queries, documents, n=5):
assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"
scores = self.get_scores(queries)
top_n = np.argsort(scores)[::-1][:n]
return [documents[i] for i in top_n]
class BM25T(BM25):
def __init__(self, documents, k1=1.5, b=0.75, delta=1):
# Algorithm specific parameters
self.k1 = k1
self.b = b
self.delta = delta
super().__init__(documents)
def _calc_idf(self, nd):
for word, freq in nd.items():
idf = math.log((self.corpus_size + 1) / freq)
self.idf[word] = idf
def get_scores(self, queries):
score = np.zeros(self.corpus_size)
doc_len = np.array(self.doc_len)
for q in queries:
q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) /
(self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq))
return score
and then I try to get the scores:
score = BM25.get_scores(self=documents, queries)
But I get as a meesage:
score = BM25.get_scores(self=documents, queries)
SyntaxError: positional argument follows keyword argument
Does anyone has an idea why there is this error? Thank you in advance.

1 ) tokenize corpus or send tokinizing function to class
2 ) send only queries to "get_scores" function
read official example
from rank_bm25 import BM25Okapi
corpus = [
"Hello there good man!",
"It is quite windy in London",
"How is the weather today?"
]
tokenized_corpus = [doc.split(" ") for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)
query = "windy London"
tokenized_query = query.split(" ")
doc_scores = bm25.get_scores(tokenized_query)

I suggest you to use fastbm25, which is more fast than other bm25 version.
`pip install fastbm25
usage
from fastbm25 import fastbm25
corpus = [
"How are you !",
"Hello Jack! Nice to meet you!",
"I am from China, I like math."
]
tokenized_corpus = [doc.lower().split(" ") for doc in corpus]
model = fastbm25(tokenized_corpus)
query = "where are you from".lower().split()
result = model.top_k_sentence(query,k=1)
print(result)
you can learn mroe from https://github.com/zhusleep/fastbm25

Related

what is the diffrence beetween the bleu score and the vaerage sentence of bleu score

i'm having a hard time finding the bleus core for my seq to seq model for the task of question generation , my questions are the following :
if i use the sentence bleu to find the score beetween each refrence and the output and then devide the total of these sentence-bleu scores by the len of the test data , will it be the same as the corpus bleu ?
and for the corpus bleu implemented in the code as the nltk corpus bleu ?
import ntpath
import sys
import codecs
import os
import math
import operator
import functools
def fetch_data(cand, ref):
references = []
if '.eng' in ref:
reference_file = codecs.open(ref, 'r', 'utf-8')
references.append(reference_file.readlines())
else:
for root, dirs, files in os.walk(ref):
for f in files:
reference_file = codecs.open(os.path.join(root, f), 'r', 'utf-8')
references.append(reference_file.readlines())
candidate_file = codecs.open(cand, 'r', 'utf-8')
candidate = candidate_file.readlines()
return candidate, references
def count_ngram(candidate, references, n):
clipped_count = 0
count = 0
r = 0
c = 0
for si in range(len(candidate)):
# Calculate precision for each sentence
ref_counts = []
ref_lengths = []
# Build dictionary of ngram counts
for reference in references:
ref_sentence = reference[si]
ngram_d = {}
words = ref_sentence.strip().split()
ref_lengths.append(len(words))
limits = len(words) - n + 1
# loop through the sentance consider the ngram length
for i in range(limits):
ngram = ' '.join(words[i:i+n]).lower()
if ngram in ngram_d.keys():
ngram_d[ngram] += 1
else:
ngram_d[ngram] = 1
ref_counts.append(ngram_d)
# candidate
cand_sentence = candidate[si]
cand_dict = {}
words = cand_sentence.strip().split()
limits = len(words) - n + 1
for i in range(0, limits):
ngram = ' '.join(words[i:i + n]).lower()
if ngram in cand_dict:
cand_dict[ngram] += 1
else:
cand_dict[ngram] = 1
clipped_count += clip_count(cand_dict, ref_counts)
count += limits
r += best_length_match(ref_lengths, len(words))
c += len(words)
if clipped_count == 0:
pr = 0
else:
pr = float(clipped_count) / count
bp = brevity_penalty(c, r)
return pr, bp
def clip_count(cand_d, ref_ds):
"""Count the clip count for each ngram considering all references"""
count = 0
for m in cand_d.keys():
m_w = cand_d[m]
m_max = 0
for ref in ref_ds:
if m in ref:
m_max = max(m_max, ref[m])
m_w = min(m_w, m_max)
count += m_w
return count
def best_length_match(ref_l, cand_l):
"""Find the closest length of reference to that of candidate"""
least_diff = abs(cand_l-ref_l[0])
best = ref_l[0]
for ref in ref_l:
if abs(cand_l-ref) < least_diff:
least_diff = abs(cand_l-ref)
best = ref
return best
def brevity_penalty(c, r):
if c > r:
bp = 1
else:
bp = math.exp(1-(float(r)/c))
return bp
def geometric_mean(precisions):
return (functools.reduce(operator.mul, precisions)) ** (1.0 / len(precisions))
def BLEU(candidate, references):
precisions = []
for i in range(4):
pr, bp = count_ngram(candidate, references, i+1)
precisions.append(pr)
bleu = geometric_mean(precisions) * bp
return bleu
if __name__ == "__main__":
candidate, references = fetch_data(sys.argv[1], sys.argv[2])
bleu = BLEU(candidate, references)
print (bleu)
I'm not sure about the implementation you show but for implementations strictly following the original paper such as NLTKs it would not be the same: https://github.com/nltk/nltk/blob/develop/nltk/translate/bleu_score.py#L123.
Using sentence-BLEU means basically calling corpus-BLEU with just a one-sentence-corpus, but the other way around doesn't work. The scores should not be drastically different but they do differ because of macro-average vs micro-average.
I used BLEU for Seq2Seq evaluation before and just used sentence-BLEU and it worked just fine.

Unable to produce visualisations to calculate topic frequency for LSI model

I am trying to create a graph which shows the frequency of the topics for LSI. I was able to do this for my LDA model using the same code.
When I try to visualise my LSI topics I get error messages as shown below.
The code to create the models is below:
# Import CSV
df_train = pd.read_csv("Fold_2.csv", engine='python',encoding='latin-1')
# Convert to list
data = df_train['Post'].values.tolist()
# Change sentences to words
def sent_to_words(sentences):
for sentence in sentences:
yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=True removes punctuations
data_words = list(sent_to_words(data))
# Create bigram and trigam
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
def make_bigrams(texts):
return [bigram_mod[doc] for doc in texts]
def make_trigrams(texts):
return [trigram_mod[bigram_mod[doc]] for doc in texts]
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
"""https://spacy.io/api/annotation"""
texts_out = []
for sent in texts:
doc = nlp(" ".join(sent))
texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
return texts_out
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
#Create LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=id2word,
num_topics=20,
random_state=10,
update_every=1,
chunksize=100,
passes=5,
alpha='auto',
per_word_topics=True)
# Print the Keyword in the topics
doc_lda = lda_model[corpus]
x=lda_model.show_topics(num_topics=20, num_words=5,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
print("LDA Model")
# Print The words
name = 0
words_together_list = []
for topic,words in topics_words:
words_together= " ".join(words)
words_together_list.append(words_together)
name = name + 1
print("The key word of Topic ", topic, " was: ", words_together)
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print("Coherence: ", coherence_lda)
lsi_model = gensim.models.lsimodel.LsiModel(
corpus=corpus, id2word=id2word, num_topics=20,chunksize=100
)
print("")
print("LSI Model")
# Print the Keywords in the topics
doc_lsi = lsi_model[corpus]
x=lsi_model.show_topics(num_topics=20, num_words=5,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
# Print The words
name = 0
words_together_list = []
for topic,words in topics_words:
words_together= " ".join(words)
words_together_list.append(words_together)
name = name + 1
print("The key word of Topic ", topic, " was: ", words_together)
coherence_model_lsi = CoherenceModel(model=lsi_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lsi = coherence_model_lsi.get_coherence()
print("Coherence: ", coherence_lsi)
The code for the visualisation of LSI topics is below. The same Python code worked for LSA when LSI was changed to LSI to reference the correct model.
#create a function to calculate topics per post
def topics_per_post(model, corpus, start=0, end=1):
corpus_selected = corpus[start:end]
dominant_topics = []
topic_percentages = []
for i, corp in enumerate(corpus_selected):
topic_percs, wordid_topics, wordid_phivalues = model[corp]
dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
dominant_topics.append((i, dominant_topic))
topic_percentages.append(topic_percs)
return(dominant_topics, topic_percentages)
dominant_topics, topic_percentages = topics_per_post(model=lsi_model, corpus=corpus, end=-1)
# create bar graph of topic frequency
df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()
df_dominant_topic_in_each_doc['count'].plot(kind='bar', title='Topic Frequency', ylabel='Frequency',
xlabel='Topic', figsize=(6, 5))
This is the error message produced:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-42-96b45968c3a6> in <module>()
----> 1 dominant_topics, topic_percentages = topics_per_post(model=lsi_model, corpus=corpus, end=-1)
2
3 # create bar graph of topic frequency
4 df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
5 dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
<ipython-input-26-541251ac2e71> in topics_per_post(model, corpus, start, end)
5 topic_percentages = []
6 for i, corp in enumerate(corpus_selected):
----> 7 topic_percs, wordid_topics, wordid_phivalues = model[corp]
8 dominant_topic = sorted(topic_percs, key = lambda x: x[1], reverse=True)[0][0]
9 dominant_topics.append((i, dominant_topic))
ValueError: too many values to unpack (expected 3)
I also tried with pyLDAvis, however, this also produced an error.
#Import plLDavis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
# Visualise the topics for LSI
lsi_viz = gensimvis.prepare(lsi_model, corpus, id2word)
lsi_viz
This produced the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-44-d9de743e0c86> in <module>()
5
6 # Visualise the topics for LSI
----> 7 lsi_viz = gensimvis.prepare(lsi_model, corpus, id2word)
8 lsi_viz
1 frames
/usr/local/lib/python3.7/dist-packages/pyLDAvis/gensim_models.py in _extract_data(topic_model, corpus, dictionary, doc_topic_dists)
47 gamma = topic_model.inference(corpus)
48 else:
---> 49 gamma, _ = topic_model.inference(corpus)
50 doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
51 else:
AttributeError: 'LsiModel' object has no attribute 'inference'
I have tried some research and I cannot seem to find examples of calculating the frequency of topics across all documents for LSI using Gensim. I have also searched for these errors on stack overflow and cannot find a solution.
Found the answer :)
sent_topics_df = pd.DataFrame()
# Get main topic in each document
for i, row in enumerate(lsi_model[corpus]):
row = sorted(row, key=lambda x: (x[1]), reverse=True)
# Get the Dominant topic, Perc Contribution and Keywords for each document
for j, (topic_num, prop_topic) in enumerate(row):
if j == 0: # => dominant topic
wp = lsi_model.show_topic(topic_num)
topic_keywords = ", ".join([word for word, prop in wp])
sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
else:
break
sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
contents = pd.Series(texts)
sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
# Format
df_dominant_topic =sent_topics_df.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
dominant_topic_in_each_doc = df_dominant_topic.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()
df_dominant_topic_in_each_doc['count'].plot(kind='bar', title='Topic Frequency', ylabel='Frequency',
xlabel='Topic', figsize=(6, 5))

MapReduce on several columns

I am new to MapReduce and have troubles with writing a MapReduce job which would work with several columns at once for a CSV file.
I would like to find for each garment group the most frequent product, the second most frequent section and the most frequent department so that the output would look according to the following schema: garment_group, product, section, department. This is based on the articles.csv dataset from kaggle.
So far I could only find for each garment group the most frequent product and do not understand how to incorporate the other columns. This is my code:
from mrjob.step import MRStep
from mrjob.util import log_to_stream, log_to_null
from mr3px.csvprotocol import CsvProtocol
import csv
import logging
log = logging.getLogger(__name__)
class MyMRJob1(MRJob):
OUTPUT_PROTOCOL = CsvProtocol # write output as CSV
def set_up_logging(cls, quiet=False, verbose=False, stream=None):
log_to_stream(name='mrjob', debug=verbose, stream=stream)
log_to_stream(name='__main__', debug=verbose, stream=stream)
def mapper(self, _, line):
result = next(csv.reader([line],quotechar=None)) # extract columns from line
garment_group_name = result[23]
prod_name = result[2]
#section_name = result[21]
#department_name = result[15]
# name = result[2]
#skip sparse entries and header
if prod_name == "prod_name" or prod_name == "" or garment_group_name == "": #skip sparse entries and header
return
yield (garment_group_name,prod_name), 1
def reducer(self,garmetProd,valuelist):
garmet, prod = garmetProd
output = sum(valuelist)
yield None,(garmet,prod,output)
def mapper_top(self, _, line):
result = line # input from last round already a list of strings
garmet = result[0]
prod = result[1]
nProd = result[2]
yield garmet, (prod,nProd)
def reducer_top(self,garmet,values):
mostProduct = "" # most favourite product per garmet group
maxBought = 0 # max amount of times bought
for (prod,nProds) in values:
if int(nProds) > maxBought:
maxBought = int(nProds)
mostProduct = prod
if maxBought > 0:
#CsvProtocol needs None key for output
yield None, (garmet,mostProduct)
def steps(self):
return [
MRStep(mapper = self.mapper,
reducer = self.reducer),
MRStep(mapper = self.mapper_top,
reducer = self.reducer_top)
]
if __name__ == '__main__':
MyMRJob1.run()

Replace dot product for loop Numpy

I am trying to replace the dot product for loop using something faster like NumPy
I did research on dot product and kind of understand and can get it working with toy data in a few ways in but not 100% when it comes to implementing it for actual use with a data frame.
I looked at these and other SO threads to no luck avoide loop dot product, matlab and dot product subarrays without for loop and multiple numpy dot products without a loop
looking to do something like this which works with toy numbers in np array
u1 =np.array([1,2,3])
u2 =np.array([2,3,4])
v1.dot(v2)
20
u1 =np.array([1,2,3])
u2 =np.array([2,3,4])
(u1 * u2).sum()
20
u1 =np.array([1,2,3])
u2 =np.array([2,3,4])
sum([x1*x2 for x1, x2 in zip (u1, u2)])
20
this is the current working get dot product
I would like to do this with out the for loop
def get_dot_product(self, courseid1, courseid2, unit_vectors):
u1 = unit_vectors[courseid1]
u2 = unit_vectors[courseid2]
dot_product = 0.0
for dimension in u1:
if dimension in u2:
dot_product += u1[dimension] * u2[dimension]
return dot_product
** code**
#!/usr/bin/env python
# coding: utf-8
class SearchRecommendationSystem:
def __init__(self):
pass
def get_bag_of_words(self, titles_lines):
bag_of_words = {}
for index, row in titles_lines.iterrows():
courseid, course_bag_of_words = self.get_course_bag_of_words(row)
for word in course_bag_of_words:
word = str(word).strip() # added
if word not in bag_of_words:
bag_of_words[word] = course_bag_of_words[word]
else:
bag_of_words[word] += course_bag_of_words[word]
return bag_of_words
def get_course_bag_of_words(self, line):
course_bag_of_words = {}
courseid = line['courseid']
title = line['title'].lower()
description = line['description'].lower()
wordlist = title.split() + description.split()
if len(wordlist) >= 10:
for word in wordlist:
word = str(word).strip() # added
if word not in course_bag_of_words:
course_bag_of_words[word] = 1
else:
course_bag_of_words[word] += 1
return courseid, course_bag_of_words
def get_sorted_results(self, d):
kv_list = d.items()
vk_list = []
for kv in kv_list:
k, v = kv
vk = v, k
vk_list.append(vk)
vk_list.sort()
vk_list.reverse()
k_list = []
for vk in vk_list[:10]:
v, k = vk
k_list.append(k)
return k_list
def get_keywords(self, titles_lines, bag_of_words):
n = sum(bag_of_words.values())
keywords = {}
for index, row in titles_lines.iterrows():
courseid, course_bag_of_words = self.get_course_bag_of_words(row)
term_importance = {}
for word in course_bag_of_words:
word = str(word).strip() # extra
tf_course = (float(course_bag_of_words[word]) / sum(course_bag_of_words.values()))
tf_overall = float(bag_of_words[word]) / n
term_importance[word] = tf_course / tf_overall
keywords[str(courseid)] = self.get_sorted_results(term_importance)
return keywords
def get_inverted_index(self, keywords):
inverted_index = {}
for courseid in keywords:
for keyword in keywords[courseid]:
if keyword not in inverted_index:
keyword = str(keyword).strip() # added
inverted_index[keyword] = []
inverted_index[keyword].append(courseid)
return inverted_index
def get_search_results(self, query_terms, keywords, inverted_index):
search_results = {}
for term in query_terms:
term = str(term).strip()
if term in inverted_index:
for courseid in inverted_index[term]:
if courseid not in search_results:
search_results[courseid] = 0.0
search_results[courseid] += (
1 / float(keywords[courseid].index(term) + 1) *
1 / float(query_terms.index(term) + 1)
)
sorted_results = self.get_sorted_results(search_results)
return sorted_results
def get_titles(self, titles_lines):
titles = {}
for index, row in titles_lines.iterrows():
titles[row['courseid']] = row['title'][:60]
return titles
def get_unit_vectors(self, keywords, categories_lines):
norm = 1.884
cat = {}
subcat = {}
for line in categories_lines[1:]:
courseid_, category, subcategory = line.split('\t')
cat[courseid_] = category.strip()
subcat[courseid_] = subcategory.strip()
unit_vectors = {}
for courseid in keywords:
u = {}
if courseid in cat:
u[cat[courseid]] = 1 / norm
u[subcat[courseid]] = 1 / norm
for keyword in keywords[courseid]:
u[keyword] = (1 / float(keywords[courseid].index(keyword) + 1) / norm)
unit_vectors[courseid] = u
return unit_vectors
def get_dot_product(self, courseid1, courseid2, unit_vectors):
u1 = unit_vectors[courseid1]
u2 = unit_vectors[courseid2]
dot_product = 0.0
for dimension in u1:
if dimension in u2:
dot_product += u1[dimension] * u2[dimension]
return dot_product
def get_recommendation_results(self, seed_courseid, keywords, inverted_index, unit_vectors):
courseids = []
seed_courseid = str(seed_courseid).strip()
for keyword in keywords[seed_courseid]:
for courseid in inverted_index[keyword]:
if courseid not in courseids and courseid != seed_courseid:
courseids.append(courseid)
dot_products = {}
for courseid in courseids:
dot_products[courseid] = self.get_dot_product(seed_courseid, courseid, unit_vectors)
sorted_results = self.get_sorted_results(dot_products)
return sorted_results
def Final(self):
print("Reading Title file.......")
titles_lines = open('s2-titles.txt', encoding="utf8").readlines()
print("Reading Category file.......")
categories_lines = open('s2-categories.tsv', encoding = "utf8").readlines()
print("Getting Supported Functions Data")
bag_of_words = self.get_bag_of_words(titles_lines)
keywords = self.get_keywords(titles_lines, bag_of_words)
inverted_index = self.get_inverted_index(keywords)
titles = self.get_titles(titles_lines)
print("Getting Unit Vectors")
unit_vectors = self.get_unit_vectors(keywords=keywords, categories_lines=categories_lines)
#Search Part
print("\n ############# Started Search Query System ############# \n")
query = input('Input your search query: ')
while query != '':
query_terms = query.split()
search_sorted_results = self.get_search_results(query_terms, keywords, inverted_index)
print(f"==> search results for query: {query.split()}")
for search_result in search_sorted_results:
print(f"{search_result.strip()} - {str(titles[search_result]).strip()}")
#ask again for query or quit the while loop if no query is given
query = input('Input your search query [hit return to finish]: ')
print("\n ############# Started Recommendation Algorithm System ############# \n")
# Recommendation ALgorithm Part
seed_courseid = (input('Input your seed courseid: '))
while seed_courseid != '':
seed_courseid = str(seed_courseid).strip()
recom_sorted_results = self.get_recommendation_results(seed_courseid, keywords, inverted_index, unit_vectors)
print('==> recommendation results:')
for rec_result in recom_sorted_results:
print(f"{rec_result.strip()} - {str(titles[rec_result]).strip()}")
get_dot_product_ = self.get_dot_product(seed_courseid, str(rec_result).strip(), unit_vectors)
print(f"Dot Product Value: {get_dot_product_}")
seed_courseid = (input('Input seed courseid [hit return to finish]:'))
if __name__ == '__main__':
obj = SearchRecommendationSystem()
obj.Final()
s2-categories.tsv
courseid category subcategory
21526 Design 3D & Animation
153082 Marketing Advertising
225436 Marketing Affiliate Marketing
19482 Office Productivity Apple
33883 Office Productivity Apple
59526 IT & Software Operating Systems
29219 Personal Development Career Development
35057 Personal Development Career Development
40751 Personal Development Career Development
65210 Personal Development Career Development
234414 Personal Development Career Development
Example of how s2-titles.txt looks
courseidXXXYYYZZZtitleXXXYYYZZZdescription
3586XXXYYYZZZLearning Tools for Mrs B's Science Classes This is a series of lessons that will introduce students to the learning tools that will be utilized throughout the schoXXXYYYZZZThis is a series of lessons that will introduce students to the learning tools that will be utilized throughout the school year The use of these tools serves multiple purposes 1 Allow the teacher to give immediate and meaningful feedback on work that is in progress 2 Allow students to have access to content and materials when outside the classroom 3 Provide a variety of methods for students to experience learning materials 4 Provide a variety of methods for students to demonstrate learning 5 Allow for more time sensitive correction grading and reflections on concepts that are assessed
Evidently unit_vectors is a dictionary, from which you extract to 2 values, u1 and u2.
But what are those? Evidently dicts as well (this iteration would not make sense with a list):
for dimension in u1:
if dimension in u2:
dot_product += u1[dimension] * u2[dimension]
But what is u1[dimension]? A list? An array.
Normally dict are access by key as you do here. There isn't a numpy style "vectorization". vals = list(u1.values()) gets a lists of all values, and conceivably that could be made into an array (if the elements are right)
arr1 = np.array(list(u1.values()))
and a np.dot(arr1, arr2) might work
You'll get the best answers if you give small concrete examples - with real working data (and skip the complex generating code). Focus on the core of the problem, so we can grasp the issue with a 30 second read!
===
Looking more in depth at your dot function; this replicates the core (I think). Initially I missed the fact that you aren't iterating on u2 keys, but rather seeking matching ones.
def foo(dd):
x = 0
u1 = dd['u1']
u2 = dd['u2']
for k in u1:
if k in u2:
x += u1[k]*u2[k]
return x
Then making a dictionary of dictionaries:
In [30]: keys=list('abcde'); values=[1,2,3,4,5]
In [31]: adict = {k:v for k,v in zip(keys,values)}
In [32]: dd = {'u1':adict, 'u2':adict}
In [41]: dd
Out[41]:
{'u1': {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5},
'u2': {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}}
In [42]: foo(dd)
Out[42]: 55
In this case the subdictionaries match, so we get the same value with a simple array dot:
In [43]: np.dot(values,values)
Out[43]: 55
But if u2 was different, with different key/value pairs, and possibly different keys the result will be different. I don't see a way around the iterative access by keys. The sum-of-products part of the job is minor compared to the dictionary access.
In [44]: dd['u2'] = {'e':3, 'f':4, 'a':3}
In [45]: foo(dd)
Out[45]: 18
We could construct other data structures that are more suitable to a fast dot like calculation. But that's another topic.
Modified method
def get_dot_product(self, courseid1, courseid2, unit_vectors):
# u1 = unit_vectors[courseid1]
# u2 = unit_vectors[courseid2]
# dimensions = set(u1).intersection(set(u2))
# dot_product = sum(u1[dimension] * u2.get(dimension, 0) for dimension in dimensions)
u1 = unit_vectors[courseid1]
u2 = unit_vectors[courseid2]
dot_product = sum(u1[dimension] * u2.get(dimension, 0) for dimension in u2)
return dot_product

ID3 Algorithm in Python

I am trying to plot a decision tree using ID3 in Python. I am really new to Python and couldn't understand the implementation of the following code. I need to know how I can apply this code to my data.
from math import log
import operator
def entropy(data):
entries = len(data)
labels = {}
for feat in data:
label = feat[-1]
if label not in labels.keys():
labels[label] = 0
labels[label] += 1
entropy = 0.0
for key in labels:
probability = float(labels[key])/entries
entropy -= probability * log(probability,2)
return entropy
def split(data, axis, val):
newData = []
for feat in data:
if feat[axis] == val:
reducedFeat = feat[:axis]
reducedFeat.extend(feat[axis+1:])
newData.append(reducedFeat)
return newData
def choose(data):
features = len(data[0]) - 1
baseEntropy = entropy(data)
bestInfoGain = 0.0;
bestFeat = -1
for i in range(features):
featList = [ex[i] for ex in data]
uniqueVals = set(featList)
newEntropy = 0.0
for value in uniqueVals:
newData = split(data, i, value)
probability = len(newData)/float(len(data))
newEntropy += probability * entropy(newData)
infoGain = baseEntropy - newEntropy
if (infoGain > bestInfoGain):
bestInfoGain = infoGain
bestFeat = i
return bestFeat
def majority(classList):
classCount={}
for vote in classList:
if vote not in classCount.keys(): classCount[vote] = 0
classCount[vote] += 1
sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
return sortedClassCount[0][0]
def tree(data,labels):
classList = [ex[-1] for ex in data]
if classList.count(classList[0]) == len(classList):
return classList[0]
if len(data[0]) == 1:
return majority(classList)
bestFeat = choose(data)
bestFeatLabel = labels[bestFeat]
theTree = {bestFeatLabel:{}}
del(labels[bestFeat])
featValues = [ex[bestFeat] for ex in data]
uniqueVals = set(featValues)
for value in uniqueVals:
subLabels = labels[:]
theTree[bestFeatLabel][value] = tree(split/(data, bestFeat, value),subLabels)
return theTree
So what I did after this is the following:
infile=open("SData.csv","r")
data=infile.read()
tree(data)
The error which I got is "1 argument is missing" which is the label which I have to define and this is where I don't know what I have to put. I tried the variable for which I have to make the decision tree but it doesn't work:
tree(data,MinTemp)
Here I get an error "MinTemp is not defined".
Please help me out and let me know what I should do to have a look at the tree.
Following is the part of data and I want to generate a tree for MinTemp
MinTemp,Rainfall,Tempat9,RHat9,CAat9,WSat9
high,no,mild,normal,overcast,weak
high,no,mild,normal,cloudy,weak
high,no,mild,normal,cloudy,mild
high,yes,mild,high,cloudy,weak
high,yes,mild,high,cloudy,mild
medium,yes,mild,high,cloudy,mild
high,no,mild,high,overcast,weak
high,no,mild,normal,sunny,weak
high,no,hot,normal,sunny,weak
high,no,hot,normal,overcast,weak

Categories