Difficulty parsing a section of XML file with ElementTree - python

I have written the code below to parse this XML file. You can see it's still a bit messy, but I think I'm on the right track for most of it.
The one part I'm stuck on is the 'targets' section (I've left the code I tried for this section below, wrapped in triple quotes, because it doesn't work).
Could someone show me where I'm going wrong and how to parse the targets section? If you look at the XML file here, I basically just want to extract the information in the targets section for each gene/entry (and, if possible, there seems to be even more info in the targets section of the XML file, so I'd like to take that too).
Thanks
import requests
import xml.etree.ElementTree as ET
import urllib2

# get the XML file
#response = requests.get('https://www.drugbank.ca/drugs/DB01048.xml')
#with open('output.txt', 'w') as input:
#    input.write(response.content)

tree = ET.parse('output.txt')
root = tree.getroot()

val = lambda x: "{http://www.drugbank.ca}" + str(x)

key_list = ['drugbank-id', 'name', 'description', 'cas-number', 'unii', 'average-mass',
            'monoisotopic-mass', 'state', 'indication', 'pharmacodynamics',
            'mechanism-of-action', 'toxicity', 'metabolism', 'absorption', 'half-life',
            'protein-binding', 'route-of-elimination', 'volume-of-distribution',
            'fda-label', 'msds']
key_dict = {}
for i in key_list:
    for child in root.getchildren():
        key_dict[i] = child.find(val(i)).text.encode('utf-8')
#print key_dict

def method1(str_name, list_name):
    if subnode.tag == str_name:
        list_name = []
        for i in subnode:
            list_name.append(i.text)
        return list_name

def method2(list1_name, list2_name, list3_name, list4_name):
    if subnode.tag == list1_name:
        for i in subnode:
            if i.tag == list2_name:
                for a in i:
                    if a.tag == list3_name:
                        for u in a:
                            if u.tag == list4_name:
                                yield u.text

def method3(list1_name, list2_name):
    list_of_tuples = []
    if subnode.tag == list1_name:
        for i in subnode:
            if i.tag == list2_name:
                temp_list = []
                for a in i:
                    temp_list.append(a.text)
                list_of_tuples.append(temp_list)
        return list_of_tuples

alternative_parents = []
substituents = []
list_to_run_thru = ['description', 'direct-parent', 'kingdom', 'superclass', 'class', 'subclass']
ap_sub = lambda x: '{http://www.drugbank.ca}' + x

for node in root:
    for subnode in node:
        print method1('{http://www.drugbank.ca}groups', 'group_list')
        print method1('{http://www.drugbank.ca}synonyms', 'synonym_list')
        print method1('{http://www.drugbank.ca}patent', 'patent_list')
        print method2('{http://www.drugbank.ca}general-references', '{http://www.drugbank.ca}articles',
                      '{http://www.drugbank.ca}article', '{http://www.drugbank.ca}pubmed-id')
        if subnode.tag == '{http://www.drugbank.ca}classification':
            for each_item in list_to_run_thru:
                for i in subnode:
                    if i.tag == ap_sub(each_item):
                        print i.text
                    if i.tag == '{http://www.drugbank.ca}alternative-parent':
                        alternative_parents.append(i.text)
                    if i.tag == '{http://www.drugbank.ca}substituent':
                        substituents.append(i.text)
        print method3('{http://www.drugbank.ca}salts', '{http://www.drugbank.ca}salt')
        print method3('{http://www.drugbank.ca}products', '{http://www.drugbank.ca}product')
        print method3('{http://www.drugbank.ca}mixtures', '{http://www.drugbank.ca}mixture')
        print method3('{http://www.drugbank.ca}packagers', '{http://www.drugbank.ca}packager')
        print method3('{http://www.drugbank.ca}categories', '{http://www.drugbank.ca}category')
        print method3('{http://www.drugbank.ca}dosages', '{http://www.drugbank.ca}dosage')
        print method3('{http://www.drugbank.ca}atc-codes', '{http://www.drugbank.ca}atc-code')
        print method3('{http://www.drugbank.ca}ahfs-codes', '{http://www.drugbank.ca}ahfs-code')
        print method3('{http://www.drugbank.ca}pdb-entries', '{http://www.drugbank.ca}pdb-entry')
        print method3('{http://www.drugbank.ca}food-interactions', '{http://www.drugbank.ca}food-interaction')
        print method3('{http://www.drugbank.ca}drug-interactions', '{http://www.drugbank.ca}drug-interaction')
        print method3('{http://www.drugbank.ca}calculated-properties', '{http://www.drugbank.ca}property')
        print method3('{http://www.drugbank.ca}external-identifiers', '{http://www.drugbank.ca}external-identifier')
        print method3('{http://www.drugbank.ca}external-links', '{http://www.drugbank.ca}external-link')
        print method3('{http://www.drugbank.ca}snp-adverse-drug-reactions', '{http://www.drugbank.ca}reaction')
        print substituents
        print alternative_parents
        '''
        if subnode.tag == '{http://www.drugbank.ca}pathways':
            for i in subnode:
                if i.tag == '{http://www.drugbank.ca}pathway':
                    for a in i:
                        print a.text
                        for u in a:
                            if u.tag == '{http://www.drugbank.ca}drug':
                                for x in u:
                                    print x.text
        # missing a bit of data here
        if subnode.tag == '{http://www.drugbank.ca}targets':
            for i in subnode:
                if i.tag == '{http://www.drugbank.ca}target':
                    print i.text
                    for a in i:
                        print a.text
                        if a.tag == '{http://www.drugbank.ca}actions':
                            for u in a:
                                print u.text
                        if a.tag == '{http://www.drugbank.ca}references':
                            for u in a:
                                if u.tag == '{http://www.drugbank.ca}articles':
                                    for x in u:
                                        if x.tag == '{http://www.drugbank.ca}article':
                                            for z in x:
                                                print z.text
        '''

I used BeautifulSoup for parsing because it is a simple library.
Code:
import pprint
import requests
from bs4 import BeautifulSoup

html = requests.get('https://www.drugbank.ca/drugs/DB01048#BE0004136').text
soup = BeautifulSoup(html, 'html.parser')
div_targets = soup.find('div', class_='bond-list-container targets')
targets = div_targets.find_all('div', class_='bond card')
t = {}
for target in targets:
    k = []
    v = []
    for property in target.find_all('dt'):
        k.append(property.get_text())
    for property in target.find_all('dd'):
        v.append(property.get_text())
    t[target.find('strong').get_text()] = dict(zip(k, v))
pprint.pprint(t)
Output:
{'1. Reverse transcriptase/RNaseH': {
    'Actions': 'Inhibitor',
    'Gene Name': 'pol',
    'General Function': 'Rna-dna hybrid ribonuclease activity',
    'Kind': 'Protein',
    'Molecular Weight': '65223.615 Da',
    'Organism': 'Human immunodeficiency virus 1',
    'Pharmacological action': 'Yes',
    'Specific Function': 'Not Available',
    'Uniprot ID': 'Q72547',
    'Uniprot Name': 'Reverse transcriptase/RNaseH'},
 '2. HLA class I histocompatibility antigen, B-57 alpha chain': {
    'Gene Name': 'HLA-B',
    'General Function': 'Involved in the presentation of foreign antigens to the immune system.',
    'Kind': 'Protein',
    'Molecular Weight': '40223.825 Da',
    'Organism': 'Human',
    'Pharmacological action': 'Unknown',
    'Specific Function': 'Peptide antigen binding',
    'Uniprot ID': 'P18465',
    'Uniprot Name': 'HLA class I histocompatibility antigen, B-57 alpha chain'}}
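For reference, the targets section can also be read straight from the downloaded XML with ElementTree by passing a namespace mapping to findall. This is only a sketch: it reuses the 'output.txt' file saved in the question; the targets/target, actions/action and references/articles/article/pubmed-id paths come from the question's commented-out attempt, while the name and organism child tags are guesses that may need adjusting (Python 3 syntax):

import xml.etree.ElementTree as ET

# map a prefix to the DrugBank namespace so the path strings stay readable
ns = {'db': 'http://www.drugbank.ca'}

tree = ET.parse('output.txt')   # the file saved at the top of the question
root = tree.getroot()

for target in root.findall('.//db:targets/db:target', ns):
    name = target.find('db:name', ns)           # assumed child tag
    organism = target.find('db:organism', ns)   # assumed child tag
    actions = [a.text for a in target.findall('db:actions/db:action', ns)]
    pubmed_ids = [p.text for p in target.findall(
        'db:references/db:articles/db:article/db:pubmed-id', ns)]
    print(name.text if name is not None else None,
          organism.text if organism is not None else None,
          actions, pubmed_ids)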

Related

Line split is not functioning as intended

I am trying to get this code to split each line once, but it is not working as expected:
for line in text_line:
    one_line = line.split(' ', 1)
    if len(one_line) > 1:
        acro = one_line[0].strip()
        meaning = one_line[1].strip()
        if acro in acronyms_dict:
            acronyms_dict[acro] = acronyms_dict[acro] + ', ' + meaning
        else:
            acronyms_dict[acro] = meaning
Remove the ' ' from the str.split. The file is using tabs to delimit the acronyms:
import requests

data_site = requests.get(
    "https://raw.githubusercontent.com/priscian/nlp/master/OpenNLP/models/coref/acronyms.txt"
)
text_line = data_site.text.split("\n")

acronyms_dict = {}
for line in text_line:
    one_line = line.split(maxsplit=1)  # <-- remove the ' '
    if len(one_line) > 1:
        acro = one_line[0].strip()
        meaning = one_line[1].strip()
        if acro in acronyms_dict:
            acronyms_dict[acro] = acronyms_dict[acro] + ", " + meaning
        else:
            acronyms_dict[acro] = meaning
print(acronyms_dict)
Prints:
{
'24KHGE': '24 Karat Heavy Gold Electroplate',
'2B1Q': '2 Binary 1 Quaternary',
'2D': '2-Dimensional',
...
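To see why that matters, compare the two calls on a tab-delimited record (a made-up line, not taken from the real file):

line = "2D\t2-Dimensional"      # hypothetical tab-separated acronym line
print(line.split(' ', 1))       # ['2D\t2-Dimensional'] -- no space, so nothing is split
print(line.split(maxsplit=1))   # ['2D', '2-Dimensional'] -- default split covers any whitespace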

Function that transforms a list in a list of dictionary

I have to write a function that takes a list as a parameter, such as this one:
['music', ' extension=mp3', 'reports/INFOB131', ' extension=doc,docx,pdf', ' name_contains=INFOB131', ' max_size=100000', 'reports/INFOB132', ' extension=doc,docx,pdf', ' name_contains=INFOB132', ' max_size=100000', 'games', ' name_contains=SC2,Wesnoth', 'pictures/Namur', ' extension=jpeg', ' min_size=5000000', ' name_contains=cercle', 'pictures/autres', ' extension=jpeg', ' min_size=5000000']
and returns a list similar to this:
data_config = [{'music' : {'extension':'mp3'}}, {'reports/INFOB131': {'extension': ['doc', 'docx','pdf'], 'name_contains':'INFOB131', 'max_size':100000}}, {'reports/INFOB132': {'extension': ['doc', 'docx','pdf'], 'name_contains':'INFOB132', 'max_size':100000}}]
So I wrote this function:
def my_function(list_in_question, my_config_list=[], prev_list=[]):
    """ """
    enumerated_list = list(enumerate(list_in_question))
    if not '=' in enumerated_list[0][1]:
        main_key = enumerated_list[0][1]  # referenced before assignment
        pre_dict = {main_key: {}}
        for i in enumerated_list[1:]:
            if '=' in i[1]:
                splitted = i[1].split('=')
                prev_list.append({splitted[0]: splitted[1]})
            elif not '=' in i[1] and i[1] != main_key:
                for j in prev_list:
                    pre_dict[main_key].update(j)
                my_config_list.append(pre_dict)
                return my_function(list_in_question[i[0]:])
            elif not '=' in i[1] and i[1] == main_key and main_key != enumerated_list[0][1]:
                return my_config_list
    else:
        print("The format of the file containing the data is not adequate!")
But I don't understand why, when I execute it this way:
new_lines = ['music', ' extension=mp3', '', 'reports/INFOB131', ' extension=doc,docx,pdf', ' name_contains=INFOB131', ' max_size=100000', '', 'reports/INFOB132', ' extension=doc,docx,pdf', ' name_contains=INFOB132', ' max_size=100000', '', 'games', ' name_contains=SC2,Wesnoth', '', 'pictures/Namur', ' extension=jpeg', ' min_size=5000000', ' name_contains=cercle', '', 'pictures/autres', ' extension=jpeg', ' min_size=5000000']
my_function(new_lines)
I end up with this output...
None
I would be very grateful if someone could help me.
Thank you!
PS: if anyone has an idea of how I could do this without a loop, in a recursive way, that would be awesome!
Everyone... thank you!!! You really helped me, all your answers are awesome. I still have trouble understanding some parts, so I'll be annoying just a little longer with some questions about your code. Anyway, thank you for the time you took to help me, you were all more than great help!!!
The None comes from the chained return my_function(...) calls: with your new_lines input, the innermost call finishes its for loop without ever reaching a return statement, so it returns None, and that None propagates back up through every caller. Try the following code instead:
def foo(my_list):
    # Create an iterator
    list_iter = iter(my_list)
    # zip the iterator with itself; this groups two items of the list at a time
    key_val_tuple = zip(list_iter, list_iter)
    output_list = []
    for i in key_val_tuple:
        value_dict = {}
        value = i[1].split('=')
        value_dict[value[0]] = value[1].split(",") if len(value[1].split(",")) > 1 else value[1]
        element_dict = {}
        element_dict[i[0]] = value_dict
        output_list.append(element_dict)
    return output_list
input_list = ['music', ' extension=mp3', 'reports/INFOB131', ' extension=doc,docx,pdf', ' name_contains=INFOB131', ' max_size=100000', 'reports/INFOB132', ' extension=doc,docx,pdf', ' name_contains=INFOB132', ' max_size=100000', 'games', ' name_contains=SC2,Wesnoth', 'pictures/Namur', ' extension=jpeg', ' min_size=5000000', ' name_contains=cercle', 'pictures/autres', ' extension=jpeg', ' min_size=5000000']
# Call the function foo
output = foo(input_list)
print(output) # python3
I got the following output:
[{'music': {' extension': 'mp3'}}, {'reports/INFOB131': {' extension': ['doc', 'docx', 'pdf']}}, {' name_contains=INFOB131': {' max_size': '100000'}}, {'reports/INFOB132': {' extension': ['doc', 'docx', 'pdf']}}, {' name_contains=INFOB132': {' max_size': '100000'}}, {'games': {' name_contains': ['SC2', 'Wesnoth']}}, {'pictures/Namur': {' extension': 'jpeg'}}, {' min_size=5000000': {' name_contains': 'cercle'}}, {'pictures/autres': {' extension': 'jpeg'}}]
zip(list_iter, list_iter) : This will group two items in the list at a time.
output : [('music', ' extension=mp3'), ('reports/INFOB131', ' extension=doc,docx,pdf'), ...]
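A tiny standalone illustration of that pairing trick:

list_iter = iter(['music', ' extension=mp3', 'games', ' name_contains=SC2,Wesnoth'])
print(list(zip(list_iter, list_iter)))
# [('music', ' extension=mp3'), ('games', ' name_contains=SC2,Wesnoth')]
# both arguments are the *same* iterator, so each output tuple consumes two items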
Reference:
python zip()
What exactly are Python's iterator, iterable, and iteration protocols?
Convert List to a list of tuples python
You need to traverse the list one time. The pattern is this:
Start an empty list (let's call it new_list)
You find an element in the original list (original_list).
If it does not contain '=', you create a new dictionary in the new_list
If it contains the '=' sign, split the element into k and v (before and after the '='), and add that k, v pair to the inner dictionary stored under the single key of the last entry in new_list
def parse_list(original_list):
    new_list = []
    for element in original_list:
        if not '=' in element:
            new_list.append({element: {}})
        else:
            k, w = element.split('=')
            new_list[-1][new_list[-1].keys()[0]][k] = w
    return new_list
new_lines = ['music', ' extension=mp3', '', 'reports/INFOB131', ' extension=doc,docx,pdf', ' name_contains=INFOB131', ' max_size=100000', '', 'reports/INFOB132', ' extension=doc,docx,pdf', ' name_contains=INFOB132', ' max_size=100000', '', 'games', ' name_contains=SC2,Wesnoth', '', 'pictures/Namur', ' extension=jpeg', ' min_size=5000000', ' name_contains=cercle', '', 'pictures/autres', ' extension=jpeg', ' min_size=5000000']
parse_list(new_lines)
Now I should explain the line before the return statement:
new_list[-1] is the dictionary corresponding to the last entry without an equal sign that was found in the original_list. After the first pass through the loop,
new_list=[{'music': {}}]
during the second pass
new_list[-1]={'music': {}}
new_list[-1].keys()=['music']
new_list[-1].keys()[0]='music'
new_list[-1][new_list[-1].keys()[0]]={}
now you just update this dictionary with the parsed k,w pair
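One caveat with that update line: new_list[-1].keys()[0] only works on Python 2, because dict.keys() returns a non-indexable view on Python 3. A small sketch of the same function adapted to run on both (my adaptation, not part of the original answer):

def parse_list_py3(original_list):
    # same approach as parse_list above; dict.keys() cannot be indexed on
    # Python 3, so the single key is fetched with next(iter(...)) instead
    new_list = []
    for element in original_list:
        if '=' not in element:
            new_list.append({element: {}})
        else:
            k, w = element.split('=')
            new_list[-1][next(iter(new_list[-1]))][k] = w
    return new_list

print(parse_list_py3(['music', ' extension=mp3', 'games', ' name_contains=SC2,Wesnoth']))
# [{'music': {' extension': 'mp3'}}, {'games': {' name_contains': 'SC2,Wesnoth'}}]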
One more way of doing it:
import re

def my_function(list_in_question, my_config_list=[], prev_list=[]):
    """ """
    result = {}
    main_key = ''
    for element in list_in_question:
        if element == '':
            main_key = ''
        if re.search('=', element):
            key, value = element.split('=')
            print "key, value = ", key, value
            if re.search(',', value):
                value_list = value.split(',')
                print "value list =", value_list
                result[main_key][key] = value_list
            else:
                result[main_key][key] = value
        else:
            main_key = element
            result[main_key] = {}
    return (result)

new_lines = ['music', ' extension=mp3', '', 'reports/INFOB131', ' extension=doc,docx,pdf', ' name_contains=INFOB131',
             ' max_size=100000', '', 'reports/INFOB132', ' extension=doc,docx,pdf', ' name_contains=INFOB132',
             ' max_size=100000', '', 'games', ' name_contains=SC2,Wesnoth', '', 'pictures/Namur', ' extension=jpeg',
             ' min_size=5000000', ' name_contains=cercle', '', 'pictures/autres', ' extension=jpeg',
             ' min_size=5000000']

print (my_function(new_lines))
Yet another try with only lists and dicts:
def make(lst):
    data_config = []
    for st in lst:
        if '=' not in st:  # new entry
            dd = dict()
            dds = dd[st] = dict()
            data_config.append(dd)
        else:              # fill entry
            k, v = st.split('=')
            if ',' in v:
                v = v.split(',')
            dds[k] = v
    return data_config
For the input list above (bound to l here):
In [564]: make(l)
Out[564]:
[{'music': {' extension': 'mp3'}},
 {'reports/INFOB131': {' extension': ['doc', 'docx', 'pdf'],
                       ' max_size': '100000',
                       ' name_contains': 'INFOB131'}},
 {'reports/INFOB132': {' extension': ['doc', 'docx', 'pdf'],
                       ' max_size': '100000',
                       ' name_contains': 'INFOB132'}},
 {'games': {' name_contains': ['SC2', 'Wesnoth']}},
 {'pictures/Namur': {' extension': 'jpeg',
                     ' min_size': '5000000',
                     ' name_contains': 'cercle'}},
 {'pictures/autres': {' extension': 'jpeg', ' min_size': '5000000'}}]

Python: Search particular string in file

I have a text file that stores order info in the following format. I am trying to search for an order by the first line of its block, which represents the ID, and then print the next 7 lines. But my code either checks only the first line or prints every line that contains the input number. Could somebody help me?
4735
['Total price: ', 1425.0]
['Type of menu: ', 'BBQ']
['Type of service: ', ' ']
['Amount of customers: ', 25.0]
['Discount: ', '5%', '= RM', 75.0]
['Time: ', '2017-01-08 21:39:19']
3647
['Total price: ', 2000.0]
['Type of menu: ', ' ']
['Type of service: ', 'Tent ']
['Amount of customers: ', 0]
.......
I use the following code to search the text file.
try:
    f = open('Bills.txt', 'r')
    f.close()
except IOError:
    absent_input = (raw_input("|----File was not founded----|\n|----Press 'Enter' to continue...----|\n"))
    report_module = ReportModule()
    report_module.show_report()

Id_input = (raw_input("Enter ID of order\n"))
with open("Bills.txt", "r") as f:
    searchlines = f.readlines()
    j = len(searchlines) - 1
    for i, line in enumerate(searchlines):
        if Id_input in str(line):  # I also tried (Id_input == str(line)), but it didn't work
            k = min(i + 7, j)
            for l in searchlines[i:k]:
                print l,
            print
        else:
            absent_input = (raw_input("|----Order was not founded----|\n|----Press 'Enter' to continue...----|\n"))
            report_module = ReportModule()
            report_module.show_report()
Check the following code:
Id_input = (raw_input("Enter ID of order\n")).strip()
try:
    f = open("Bills.txt", "r")
    print_rows = False
    for idline in f:
        if idline.strip() == Id_input:
            print_rows = True
            continue
        if print_rows:
            if idline.startswith("["):
                print idline
            else:
                break
    if not print_rows:
        absent_input = (raw_input("|----Order was not founded----|\n|---- Press 'Enter' to continue...----|\n"))
        report_module = ReportModule()
        report_module.show_report()
except IOError:
    absent_input = (raw_input("|----File was not founded----|\n|---- Press 'Enter' to continue...----|\n"))
    report_module = ReportModule()
    report_module.show_report()
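If the goal is literally to print the block of lines that follows the matching ID (as in the original attempt), here is a small standalone sketch of that idea, using an exact comparison on the stripped line and a slice; it leaves out the ReportModule parts and keeps the question's Python 2 print statements:

Id_input = raw_input("Enter ID of order\n").strip()

with open("Bills.txt", "r") as f:
    searchlines = f.readlines()

found = False
for i, line in enumerate(searchlines):
    if line.strip() == Id_input:                  # exact match, not substring membership
        found = True
        for block_line in searchlines[i:i + 7]:   # the ID line plus the next 6 lines
            print block_line,
        break

if not found:
    print "|----Order was not found----|"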

How to create unique list from list with duplicates

I know how to remove duplicates from a list using set() or two lists, but how do I keep the same list and add a number at the end for duplicates? I could do it with a chain of ifs, but it's not pythonic. Thanks guys!!
nome_a = ['Anthony', 'Rudolph', 'Chuck', 'Chuck', 'Chuck', 'Rudolph', 'Bob']
nomes = []
for item in nome_a:
    if item in nomes:
        if (str(item) + ' 5') in nomes:
            novoitem = str(item) + ' 6'
            nomes.append(novoitem)
        if (str(item) + ' 4') in nomes:
            novoitem = str(item) + ' 5'
            nomes.append(novoitem)
        if (str(item) + ' 3') in nomes:
            novoitem = str(item) + ' 4'
            nomes.append(novoitem)
        if (str(item) + ' 2') in nomes:
            novoitem = str(item) + ' 3'
            nomes.append(novoitem)
        else:
            novoitem = str(item) + ' 2'
            nomes.append(novoitem)
    if item not in nomes:
        nomes.append(item)
print(nomes)
Edit(1): Sorry. I edited for clarification.
You could use the following:
names = ['Anthony', 'Rudolph', 'Chuck', 'Chuck', 'Chuck', 'Rudolph', 'Bob']
answer = []
name_dict = {}
for name in names:
    if name_dict.get(name):
        name_dict[name] += 1
        answer.append('{}_{}'.format(name, name_dict[name]))
    else:
        name_dict[name] = 1
        answer.append(name)
print(answer)
Output
['Anthony', 'Rudolph', 'Chuck', 'Chuck_2', 'Chuck_3', 'Rudolph_2', 'Bob']
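The same counting can also be written with collections.defaultdict, which removes the explicit get/else branch (a variant sketch, not part of the answer above):

from collections import defaultdict

names = ['Anthony', 'Rudolph', 'Chuck', 'Chuck', 'Chuck', 'Rudolph', 'Bob']
seen = defaultdict(int)   # how many times each name has appeared so far
answer = []
for name in names:
    seen[name] += 1
    # first occurrence keeps the bare name, later ones get a numeric suffix
    answer.append(name if seen[name] == 1 else '{}_{}'.format(name, seen[name]))
print(answer)
# ['Anthony', 'Rudolph', 'Chuck', 'Chuck_2', 'Chuck_3', 'Rudolph_2', 'Bob']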

TF-IDF for my documents yield 0

I got this tf-idf script from yebrahim, and somehow my output documents yield all 0 for the result. Any problem with this?
An example of the output is:
hippo 0.0
hipper 0.0
hip 0.0
hint 0.0
hindsight 0.0
hill 0.0
hilarious 0.0
thanks for the help
# a list of (words-freq) pairs for each document
global_terms_in_doc = {}
# list to hold occurrences of terms across documents
global_term_freq = {}
num_docs = 0
lang = 'english'
lang_dictionary = {}
top_k = -1
supported_langs = ('english', 'french')

from django.utils.encoding import smart_str, smart_unicode

# support for custom language if needed
def loadLanguageLemmas(filePath):
    print('loading language from file: ' + filePath)
    f = open(filePath)
    for line in f:
        words = line.split()
        if words[1] == '=' or words[0] == words[1]:
            continue
        lang_dictionary[words[0]] = words[1]

def remove_diacritic(words):
    for i in range(len(words)):
        w = unicode(words[i], 'ISO-8859-1')
        w = unicodedata.normalize('NFKD', w).encode('ASCII', 'ignore')
        words[i] = w.lower()
    return words

# function to tokenize text, and put words back to their roots
def tokenize(text):
    text = ' '.join(text)
    tokens = PunktWordTokenizer().tokenize(text)
    # lemmatize words. try both noun and verb lemmatizations
    lmtzr = WordNetLemmatizer()
    for i in range(0, len(tokens)):
        #tokens[i] = tokens[i].strip("'")
        if lang != 'english':
            if tokens[i] in lang_dictionary:
                tokens[i] = lang_dictionary[tokens[i]]
        else:
            res = lmtzr.lemmatize(tokens[i])
            if res == tokens[i]:
                tokens[i] = lmtzr.lemmatize(tokens[i], 'v')
            else:
                tokens[i] = res
    # don't return any single letters
    tokens = [t for t in tokens if len(t) > 1 and not t.isdigit()]
    return tokens

def remove_stopwords(text):
    # remove punctuation
    chars = ['.', '/', "'", '"', '?', '!', '#', '$', '%', '^', '&',
             '*', '(', ')', ' - ', '_', '+', '=', '#', ':', '\\', ',',
             ';', '~', '`', '<', '>', '|', '[', ']', '{', '}']
    for c in chars:
        text = smart_str(text.replace(c, ' '))
    text = text.split()
    import nltk
    if lang == 'english':
        stopwords = nltk.corpus.stopwords.words('english')
    else:
        stopwords = open(lang + '_stopwords.txt', 'r').read().split()
    content = [w for w in text if w.lower().strip() not in stopwords]
    return content

# __main__ execution

import sys, re, math, unicodedata
from optparse import OptionParser

parser = OptionParser(usage='usage: %prog [options] input_file')
parser.add_option('-l', '--language', dest='language',
                  help='language to use in tokenizing and lemmatizing. supported\
                  languages: {english, french}', metavar='LANGUAGE')
parser.add_option('-k', '--top-k', dest='top_k',
                  help='output only terms with score no less k')
parser.add_option('-m', '--mode', dest='mode',
                  help='display mode. can be either "both" or "term"')
(options, args) = parser.parse_args()

if options.language:
    if options.language not in supported_langs:
        print 'only ', supported_langs, ' are supported in this version.'
        quit()
    if options.language != 'english':
        lang = options.language
        loadLanguageLemmas(options.language + '_lemmas.txt')
if options.top_k:
    top_k = int(options.top_k)
display_mode = 'both'
if options.mode:
    if options.mode == 'both' or options.mode == 'term':
        display_mode = options.mode
    else:
        parser.print_help()
if not args:
    parser.print_help()
    quit()

reader = open(args[0])
all_files = reader.read().splitlines()
num_docs = len(all_files)

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize.punkt import PunktWordTokenizer

print('initializing..')
for f in all_files:
    # local term frequency map
    terms_in_doc = {}
    doc_words = open(f).read().lower()
    #print 'words:\n', doc_words
    doc_words = remove_stopwords(doc_words)
    #print 'after stopwords:\n', doc_words
    doc_words = tokenize(doc_words)
    #print 'after tokenize:\n', doc_words
    #quit()
    # increment local count
    for word in doc_words:
        if word in terms_in_doc:
            terms_in_doc[word] += 1
        else:
            terms_in_doc[word] = 1
    # increment global frequency
    for (word, freq) in terms_in_doc.items():
        if word in global_term_freq:
            global_term_freq[word] += 1
        else:
            global_term_freq[word] = 1
    global_terms_in_doc[f] = terms_in_doc

print('working through documents.. ')
for f in all_files:
    writer = open(f + '_final', 'w')
    result = []
    # iterate over terms in f, calculate their tf-idf, put in new list
    max_freq = 0
    for (term, freq) in global_terms_in_doc[f].items():
        if freq > max_freq:
            max_freq = freq
    for (term, freq) in global_terms_in_doc[f].items():
        idf = math.log(float(1 + num_docs) / float(1 + global_term_freq[term]))
        tfidf = float(freq) / float(max_freq) * float(idf)
        result.append([tfidf, term])
    # sort result on tfidf and write them in descending order
    result = sorted(result, reverse=True)
    for (tfidf, term) in result[:top_k]:
        if display_mode == 'both':
            writer.write(term + '\t' + str(tfidf) + '\n')
        else:
            writer.write(term + '\n')

print('success, with ' + str(num_docs) + ' documents.')
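One thing worth checking with this scoring is the idf term itself: the script computes idf = log((1 + num_docs) / (1 + global_term_freq[term])), so any term that occurs in every document gets idf = log(1) = 0, and with it a tf-idf of 0. In particular, if the input file list names a single document, every term occurs in every document and the whole output is 0.0, as shown above. A quick standalone check of that arithmetic:

import math

num_docs = 1                      # e.g. the input list names a single document
global_term_freq = {'hippo': 1}   # the term then occurs in every (i.e. that one) document

idf = math.log(float(1 + num_docs) / float(1 + global_term_freq['hippo']))
print(idf)   # 0.0 -> log(2/2); multiplied by any tf this gives a tf-idf of 0.0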
