Passing strings over URLs in Django - Python

I'm having trouble understanding this error in my code. First, let me try to explain what is happening and what I am trying to do.
My code is designed to load 45 separate text files into an array, including the weight of each word/phrase and the word/phrase itself. This has to occur at the beginning, before any description is received.
Second, once the description is received, it is parsed by my software into words/phrases, which are compared to the words/phrases in the array.
Third, my software then provides the top three classes, in rank order (first/second/third) by number, along with the score for each class.
I've made a Django application that will serve this code, so I have a form which provides two parameters, classes and description, like this:
class TrademarkClassifierForm(forms.Form):
    """
    TODO: This forms will cover the questions the
    initial classifier program does
    :returns: TODO
    """
    classes = forms.CharField(max_length=10,
                              label="Test all trademark classes? Type 'yes' to do so or else enter the class to be tested ")
    description = forms.CharField(widget=forms.Textarea)

    def __init__(self, *args, **kwargs):
        super(TrademarkClassifierForm, self).__init__(*args, **kwargs)
        self.helper = FormHelper()
        self.helper.add_input(Submit('submit', 'Submit'))
Then I want to pass these two parameters to the view over the URL, like this:
class TrademarkClassifierResultView(FormView):
    """
    TODO: Post should redirect to it's on page with GET,
    specify set values in some query parameters,
    something like ?classes=yes&name=NameOfTrademarkClass
    This should be visible on results page.
    :param: classes
    :param: description
    :returns: TODO - params
    """
    template_name = 'trademark.html'
    form_class = TrademarkClassifierForm

    def get(self, request, *args, **kwargs):
        classes = str(self.request.GET.get('classes'))
        description = str(self.request.GET.get('description'))
        form = TrademarkClassifierForm(initial={'classes': classes, 'description': description})
        context_data = self.get_context_data(classes, description, form=form)
        return self.render_to_response(context_data)

    def form_valid(self, form):
        classes = form.cleaned_data['classes']
        description = form.cleaned_data['description']
        return redirect(self.get_success_url(classes, description))

    def form_invalid(self, form):
        messages.add_message(self.request, messages.ERROR,
                             "Invalid data. Please check fields.")
        return self.render_to_response(
            self.get_context_data(form=form)
        )

    def get_success_url(self, classes=None, description=None):
        return reverse("classifier:trademark") + "?classes=" + str(classes) + "&description" + str(description)

    def get_context_data(self, classes, description, **kwargs):
        context = super(TrademarkClassifierResultView, self).get_context_data(**kwargs)
        context['classes'] = classes
        context['description'] = description
        context['trademark'] = ClassifyMarkBased.control_program(classes, description)
        return context
Now my problem is this error:
Environment:
Request Method: GET
Request URL: http://127.0.0.1:8000/trademark/
Django Version: 1.11.2
Python Version: 2.7.12
Installed Applications:
['django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'django.contrib.sites',
'classifier',
'crispy_forms',
'allauth',
'allauth.account',
'allauth.socialaccount',
'widget_tweaks',
'debug_toolbar']
Installed Middleware:
['django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
'debug_toolbar.middleware.DebugToolbarMiddleware']
Traceback:
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/core/handlers/exception.py" in inner
41. response = get_response(request)
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/core/handlers/base.py" in _get_response
187. response = self.process_exception_by_middleware(e, request)
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/core/handlers/base.py" in _get_response
185. response = wrapped_callback(request, *callback_args, **callback_kwargs)
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/views/generic/base.py" in view
68. return self.dispatch(request, *args, **kwargs)
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/views/generic/base.py" in dispatch
88. return handler(request, *args, **kwargs)
File "/home/petar/Documents/Synergy/Trademark/TM_base/classifier/views.py" in get
60. context_data = self.get_context_data(classes, description, form=form)
File "/home/petar/Documents/Synergy/Trademark/TM_base/classifier/views.py" in get_context_data
82. context['trademark'] = ClassifyMarkBased.control_program(classes, description)
File "/home/petar/Documents/Synergy/Trademark/TM_base/classifier/services/classify_mark_based.py" in control_program
89. N = len(word_count_array_for_all_classes[i])
Exception Type: IndexError at /trademark/
Exception Value: list index out of range
This is my URL pattern:
url(r'^trademark/', TrademarkClassifierResultView.as_view(), name="trademark"),
and this is the part of the code that should calculate the trademark classification from these two parameters:
import os
import numpy as np
import re
import requests


class TrademarkService(object):
    # coding: utf-8

    # In[5]:
    # compare input string to a class
    # for words not found,look in a dictionary - add to text files for trademark words

    # In[6]:
    # open each trademark class file and read the words/frequency back into an array
    #staticmethod
    def open_file_read_words(file_name):
        unique_words_and_count_not_format = []
        tm_word_count_array = []
        my_list = []
        all_possible_entries = 1
        with open(file_name) as f:
            lines = [line.strip() for line in open(file_name)]
            all_possible_entries = len(lines)
        tm_word_count_array = [[0 for x in range(2)] for y in range(all_possible_entries)]
        i = 0
        while i < all_possible_entries:
            tm_word_count_array[i] = lines[i].split(',', 1)
            i += 1
        i = 0
        while i < all_possible_entries:
            tm_word_count_array[i][0] = int(tm_word_count_array[i][0])
            i += 1
        return tm_word_count_array

    # In[7]:
    # this section normalizes word frequency by the number of words x 1000
    #staticmethod
    def normalize_array(tm_word_count_array):
        list_of_freqs = []
        max_entries = len(tm_word_count_array)
        list_of_freqs = [0 for y in range(max_entries)]
        i = 0
        while i < max_entries:
            list_of_freqs[i] = tm_word_count_array[i][0]
            i += 1
        max_value = max(list_of_freqs)
        i = 0
        while i < max_entries:
            tm_word_count_array[i][0] = ((float(tm_word_count_array[i][0])) / max_entries) * 1000
            i += 1
        return tm_word_count_array

    # In[8]:
    # include the list of not useful words here
    #staticmethod
    def find_not_useful_words(word):
        not_useful_words = (
            "about", "are", "upon", "-", " ", "up", "other", "or", "not", "namely", "more", "made", "in", "for", "except",
            "but", "being", "all", "against", "was", "were", "will", "that", "its", "on", "it", "at", "was", "our", "your",
            "ours", "yours", "their", "them", "other", "out", "having", "have", "has", "in", "be", "than", "use", "uses",
            "using", "", "by", "and", "an", "a", "use", "used", "using", "for", "to", "of", "-)", "-]", "with", "as", "in",
            "the", "from")
        for test_word in not_useful_words:
            if word == test_word:
                return False
        return True

    # In[9]:
    # clean up the phrases by removing problematic characters
    #staticmethod
    def clean_up_phrases(data):
        important_words = ''
        word = data
        for c in word:
            if 0 <= ord(c) <= 127:
                # this is an ascii character.
                not_a_variable = 0
            else:
                if ord(c) == 201:
                    word = word.replace(c, "e")
                elif ord(c) == 241:
                    word = word.replace(c, "n")
                elif ord(c) == 225:
                    word = word.replace(c, "a")
                elif ord(c) == 251:
                    word = word.replace(c, "u")
                elif ord(c) == 8206:
                    word = word.replace(c, "")
                else:
                    word = word.replace(c, "")
        # continue_yes=raw_input("do you want to continue?")
        word = word.lower()
        word = str(filter(lambda ch: ch not in "?.!/;:,'()[]", word))
        # calls the function above to remove words that were found to interfere with classification
        if data.find_not_useful_words(word):
            if len(word) > 1:
                important_words += word
        return important_words

    # In[10]:
    # find the important words in the string
    #staticmethod
    def find_important_words(data):
        all_entries = len(data)
        important_words = []
        for word in data.split():
            for c in word:
                if 0 <= ord(c) <= 127:
                    # this is an ascii character.
                    not_a_variable = 0
                else:
                    if ord(c) == 201:
                        word = word.replace(c, "e")
                    elif ord(c) == 241:
                        word = word.replace(c, "n")
                    elif ord(c) == 225:
                        word = word.replace(c, "a")
                    elif ord(c) == 251:
                        word = word.replace(c, "u")
                    elif ord(c) == 8206:
                        word = word.replace(c, "")
                    else:
                        word = word.replace(c, "")
            word = word.lower()
            word = str(filter(lambda ch: ch not in " ?.!/;:,'()[]", word))
            if word.endswith("-"):
                word = word[:-1]
            if word.startswith("-"):
                word = word[:1]
            if data.find_not_useful_words(word):
                if len(word) > 1:
                    important_words.append(word)
        return important_words

    # In[11]:
    #staticmethod
    def analyze_each_line_test_data(test_sentence, N, normalized_tm_word_count_array):
        # remove problematic characters and words, plus find important words/phrases
        test_important_phrases = test_sentence.clean_up_phrases(test_sentence)
        i = 0
        total_found = 0
        total_TM_class_count = 0
        total_TM_words_matched = []
        # score the trademark phrases in the string
        while i < N:
            count_phrases = 0
            if len(normalized_tm_word_count_array[i][1].split()) > 1:
                if test_important_phrases.find(normalized_tm_word_count_array[i][1]) > -1:
                    total_TM_words_matched.append(normalized_tm_word_count_array[i][1])
                    total_TM_class_count += (normalized_tm_word_count_array[i][0])
                    total_found += 1
            i += 1
        # decompose the string and remove extraneous words, then score the words in the string
        test_important_words = test_sentence.find_important_words(test_sentence)
        i = 0
        while i < N:
            count_words = 0
            if test_important_words.count(normalized_tm_word_count_array[i][1]) > 0:
                total_TM_words_matched.append(normalized_tm_word_count_array[i][1])
                count_words = test_important_words.count(normalized_tm_word_count_array[i][1])
                total_TM_class_count += (normalized_tm_word_count_array[i][0] * count_words)
                total_found += 1
            i += 1
        i = 0
        normalized_tm_word_count_values = [0 for y in range(N)]
        normalized_tm_word_count_words = ['a' for y in range(N)]
        while i < N:
            normalized_tm_word_count_values[i] = normalized_tm_word_count_array[i][0]
            normalized_tm_word_count_words[i] = normalized_tm_word_count_array[i][1]
            i += 1
        total_words_to_match = len(test_important_words) + len(test_important_phrases)
        not_found_words = list(set(test_important_words) - set(normalized_tm_word_count_words))
        return total_found, total_TM_words_matched, not_found_words, total_TM_class_count

    # In[12]:
    #staticmethod
    def open_class_file_read_words_to_array(file_name, file_name_class=None):
        tm_word_count_array = []
        tm_word_count_array = file_name.open_file_read_words(file_name_class)
        return tm_word_count_array

    # In[13]:
    # create a file for the trademark results
    #staticmethod
    def create_results_file(file_name, results_array, description):
        unique_words_and_count_not_format = []
        unique_words_and_count_to_write = []
        open_file_name = open(file_name, 'a')
        open_file_name.write("New trademark comparison")
        open_file_name.write("\n")
        open_file_name.write(description)
        open_file_name.write("\n")
        unique_words_and_count_to_write = np.array(results_array, dtype=object)
        np.savetxt(open_file_name, unique_words_and_count_to_write, fmt='%s', delimiter=',')
        open_file_name.write("\n")
        open_file_name.write("\n")
        open_file_name.write("\n")
        open_file_name.close()

    # In[14]:
    # this section controls the program
    #staticmethod
    def control_the_program(classes, description):
        description = []
        word_count_array_for_all_classes = []
        correct_class_set = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17',
                             '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33',
                             '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45']
        # depending on the answer, only one class worth of trademark words will be loaded up or else all will be loaded up
        # test_all_classes = raw_input(
        #     "Test all trademark classes? Type 'yes' to do so or else enter the class to be tested ")
        test_all_classes = classes
        # test description of goods/services
        # test_data_array = raw_input("Provide the description of goods or services ")
        test_data_array = description
        # file_name_data = raw_input("Provide the identifier for the results file ")
        # this file has the output of the classification engine, including the top 3 results
        # file_name_results = 'user_test_comparison_results_' + file_name_data + '.txt'
        # call to a program to open each file of trademark words in turn and read the words back into an array
        if test_all_classes == 'yes':
            i = 1
            number_classes_to_check = 45
            word_count_array_for_all_classes = [[] for z in range(46)]
            temp_array = []
            while i <= 45:
                # opens each file with the trademark words
                file_name_class = 'counted_phrases_class' + str(i) + '.txt'
                temp_array = classes.open_class_file_read_words_to_array(file_name_class)
                # normalization is used because some classes have many words and some have few words
                # the words/phrases are weighted according to frequency
                word_count_array_for_all_classes[i] = classes.normalize_array(temp_array)
                i += 1
        else:
            # print "you didn't enter yes"
            pass
        # length_test_data_array = len(test_data_array)
        # open(file_name_results, 'a').close()
        # start_writing_results = open(file_name_results, 'a')
        # start_writing_results.write("The start of the test")
        # start_writing_results.write("\n")
        # start_writing_results.write("Total number of potential items to match ")
        # start_writing_results.write(str(length_test_data_array))
        # start_writing_results.write("\n")
        # start_writing_results.close()
        top_result = [0 for y in range(2)]
        second_result = [0 for y in range(2)]
        third_result = [0 for y in range(2)]
        top_array_words_not_found = []
        second_array_words_not_found = []
        third_array_words_not_found = []
        counter_for_9vs42 = 0
        counter_for_data_errors = 0
        top_result = [0 for y in range(2)]
        second_result = [0 for y in range(2)]
        third_result = [0 for y in range(2)]
        top_array_words_not_found = []
        second_array_words_not_found = []
        third_array_words_not_found = []
        actual_class_results = [0 for y in range(2)]
        overall_array_results = [[0 for x in range(3)] for y in range(4)]
        actual_class_words_not_found = []
        i = 1
        while i <= 45:
            total_found = 0
            total_TM_words_matched = 0
            not_found_words = ['']
            score = 0
            N = len(word_count_array_for_all_classes[i])
            total_found, total_TM_words_matched, not_found_words, score = classes.analyze_each_line_test_data(test_data_array, N,
                                                                                                              word_count_array_for_all_classes[i])
            if int(score) > 0:
                if int(score) > top_result[0]:
                    third_result[0] = second_result[0]
                    third_result[1] = second_result[1]
                    third_array_words_not_found = second_array_words_not_found
                    second_result[0] = top_result[0]
                    second_result[1] = top_result[1]
                    second_array_words_not_found = top_array_words_not_found
                    top_result[0] = int(score)
                    top_result[1] = i
                    top_array_words_not_found = ['']
                    top_array_words_not_found = not_found_words
                elif int(score) > second_result[0]:
                    third_result[0] = second_result[0]
                    third_result[1] = second_result[1]
                    third_array_words_not_found = second_array_words_not_found
                    second_result[0] = int(score)
                    second_result[1] = i
                    second_array_words_not_found = ['']
                    second_array_words_not_found = not_found_words
                elif int(score) > third_result[0]:
                    third_result[0] = int(score)
                    third_result[1] = i
                    third_array_words_not_found = ['']
                    third_array_words_not_found = not_found_words
            i += 1
        overall_array_results[0][0] = top_result[0]
        overall_array_results[0][1] = top_result[1]
        overall_array_results[0][2] = top_array_words_not_found
        overall_array_results[1][0] = second_result[0]
        overall_array_results[1][1] = second_result[1]
        overall_array_results[1][2] = second_array_words_not_found
        overall_array_results[2][0] = third_result[0]
        overall_array_results[2][1] = third_result[1]
        overall_array_results[2][2] = third_array_words_not_found
        # all results - including the first, second, third choices of the engine and the original description - are written to the file
        # create_results_file(file_name_results, overall_array_results, test_data_array)
        # start_writing_results = open(file_name_results, 'a')
        # start_writing_results.write("The end of the test")
        # start_writing_results.write("\n")
        #
        # start_writing_results.write("\n")
        # start_writing_results.write("\n")
        # start_writing_results.close()
        # print "finished the process"
From the code I've provided you can see that these parameters were originally supplied through Python's raw_input, and after the calculation the code wrote a results file that you could read.
I've rewritten this so I can serve it through the Django application: the parameters classes and description should replace the raw_input calls, and the result will be displayed in the template, like this:
{{ trademark.overall_array_results.top_result }}<br>
{{ trademark.overall_array_results.second_result }}<br>
{{ trademark.overall_array_results.third_result }}
I'm not sure if I'm doing the right thing here, so I need help understanding this better. Can someone help me overcome this error?

If classes is not "yes", then word_count_array_for_all_classes remains an empty list, so word_count_array_for_all_classes[i] in N = len(word_count_array_for_all_classes[i]) raises the IndexError. That is exactly what happens on the plain GET to /trademark/ shown in the traceback: there is no query string, so self.request.GET.get('classes') returns None and str(None) is 'None', which is not 'yes'. (Separately, get_success_url builds "&description" without an "=", so the description parameter will not round-trip after the redirect either.)
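A minimal sketch of one way to guard against this (not the original code; it assumes the helpers are turned into real @staticmethods on TrademarkService and that a bare class number is an allowed input): load either all 45 classes or just the requested one, and iterate only over what was actually loaded.
# Hypothetical sketch: make sure the scoring loop never indexes an empty list.
if test_all_classes == 'yes':
    class_numbers = list(range(1, 46))        # all 45 trademark classes
elif test_all_classes in correct_class_set:
    class_numbers = [int(test_all_classes)]   # only the class the user asked for
else:
    raise ValueError("classes must be 'yes' or a class number from 1 to 45")

word_count_array_for_all_classes = {}
for i in class_numbers:
    file_name_class = 'counted_phrases_class' + str(i) + '.txt'
    temp_array = TrademarkService.open_file_read_words(file_name_class)
    word_count_array_for_all_classes[i] = TrademarkService.normalize_array(temp_array)

for i in class_numbers:                       # iterate only over the classes that were loaded
    N = len(word_count_array_for_all_classes[i])
    # ... score word_count_array_for_all_classes[i] exactly as before ...
In the view itself you may also want to skip calling the classifier entirely when request.GET contains no classes at all (the first, query-string-less GET) and just render the empty form.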

Related

How to retrieve section IDs using Google docs API Python

For instance, we have a document such as this -
Table Of Content
Introduction
<text: A>
1.1 Background
<text: B>
1.2 Problem statement
<text: C>
Approach
<text: D>
2.1.1 Outline of the algorithm
<text: E>
I need to pattern-match a "string" in all of the texts in the document. For example, my search string could be "REQ-", which could match "REQ-1", "REQ-2" up to "REQ-10".
Suppose "REQ-1" is located in text C and "REQ-2" in text E; then the output I am looking for is
("REQ-1", "1.2"), ("REQ-2", "2.1.1") etc
Essentially, it should match the search string, identify all matches, and for each match return a 2-tuple of the matched string and the "section id" of the section containing it.
def get_creds():
    credentials = service_account.Credentials.from_service_account_file(
        "cred_new.json", scopes=SCOPES
    )
    return credentials


def search_paragraph_element(element, search_str):
    text_run = element.get('textRun')
    if not text_run:
        return False
    res = text_run.get('content').find(search_str)
    if res != -1:
        return True
    return False


def search_structural_elements(elements, search_str):
    text = ''
    hd_1 = 0
    hd_2 = 0
    hd_3 = 0
    for value in elements:
        if 'paragraph' in value:
            if value['paragraph']['paragraphStyle']['namedStyleType'] == 'HEADING_1':
                hd_1 = hd_1 + 1
                hd_2 = 0
                hd_3 = 0
            elif value['paragraph']['paragraphStyle']['namedStyleType'] == 'HEADING_2':
                hd_2 = hd_2 + 1
                hd_3 = 0
            elif value['paragraph']['paragraphStyle']['namedStyleType'] == 'HEADING_3':
                hd_3 = hd_3 + 1
            elements = value.get('paragraph').get('elements')
            for elem in elements:
                res = search_paragraph_element(elem, search_str)
                if res is True:
                    return str(hd_1) + '.' + str(hd_2) + '.' + str(hd_3)
    return text


def main():
    """Uses the Docs API to print out the text of a document."""
    credentials = get_creds()
    service = build("docs", "v1", credentials=credentials).documents()
    properties = service.get(documentId=REQ_DOCUMENT_ID).execute()
    doc_content = properties.get('body').get('content')
    print(search_structural_elements(doc_content, "MySearchString"))


if __name__ == '__main__':
    main()
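As a rough sketch (reusing the same structural-element shapes the code above already relies on), you could keep a running heading counter and collect every match instead of returning on the first one; the pattern r'REQ-\d+' here is only an illustration:
import re

def find_all_matches(elements, pattern):
    """Sketch: walk the document body once, track the current section number
    from the heading levels, and collect every regex match with its section."""
    hd = [0, 0, 0]
    results = []
    for value in elements:
        if 'paragraph' not in value:
            continue
        style = value['paragraph']['paragraphStyle'].get('namedStyleType', '')
        if style == 'HEADING_1':
            hd = [hd[0] + 1, 0, 0]
        elif style == 'HEADING_2':
            hd = [hd[0], hd[1] + 1, 0]
        elif style == 'HEADING_3':
            hd = [hd[0], hd[1], hd[2] + 1]
        section = '.'.join(str(n) for n in hd if n)      # e.g. "1.2" or "2.1.1"
        for elem in value['paragraph'].get('elements', []):
            text_run = elem.get('textRun')
            if not text_run:
                continue
            for match in re.findall(pattern, text_run.get('content', '')):
                results.append((match, section))
    return results

# e.g. find_all_matches(doc_content, r'REQ-\d+') -> [('REQ-1', '1.2'), ('REQ-2', '2.1.1')]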

Appending string from list to arrays after using NLTK postag

I have these arrays
ADJ = [] #adjective
ADP = [] #adposition
ADV = [] #adverb
CONJ = [] #conjunction
DET = [] #determiner
NOUN = [] #noun
NUM = [] #number
PRT = [] #participle
PRON = [] #pronoun
VERB = [] #verb
PUNCT = [] #punctuation
X = [] #other
Now I want to insert the words into the corresponding arrays above. Meaning, if "Hi" is tagged NOUN, then NOUN.append('Hi').
[["Hi", "NOUN"], ["my", "PRON"], ["name", "NOUN"], ["is", "VERB"], ["Bob", "NOUN"],
["and", "CONJ"], ["I", "PRON"], ["live", "VERB"], ["in", "ADP"], ["Germany", "NOUN"]]
The code below is what I tried but it did not work for me
if(detect(paraInput) == 'en'):
    tokenizedPI = nltk.word_tokenize(text)
    try:
        for(i in tokenizedPI):
            word = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(word, tagset="universal")
            if(tagged[0][1] == "ADJ"):
                ADJ.append(tagged[0][0])
            elif(tagged[0][1] == "ADP"):
                ADP.append(tagged[0][0])
            elif(tagged[0][1] == "ADV"):
                ADV.append(tagged[0][0])
            elif(tagged[0][1] == "CONJ"):
                CONJ.append(tagged[0][0])
            elif(tagged[0][1] == "DET"):
                DET.append(tagged[0][0])
            elif(tagged[0][1] == "NOUN"):
                NOUN.append(tagged[0][0])
            elif(tagged[0][1] == "NUM"):
                NUM.append(tagged[0][0])
            elif(tagged[0][1] == "PRT"):
                PRT.append(tagged[0][0])
            elif(tagged[0][1] == "PRON"):
                PRON.append(tagged[0][0])
            elif(tagged[0][1] == "VERB"):
                VERB.append(tagged[0][0])
            elif(tagged[0][1] == "PUNCT"):
                PUNT.append(tagged[0][0])
            else:
                X.append(tagged[0][0])
            tagged = nltk.pos_tag(word)
            entity = nltk.ne_chunk(tagged)
    except Exception as e:
        print(str(s))
else:
    print("The String inputted is not in English, please enter a string in English")
What should be corrected in order to fix it? The error I am getting is:
for(i in tokenWord):
SyntaxError: invalid syntax
This code should help.
import nltk
text = "A Sample Text for token test"
tokenizedPI = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokenizedPI, tagset="universal")
print(tagged)
This will return a list as expected.
[('A', 'DET'), ('Sample', 'NOUN'), ('Text', 'NOUN'), ('for', 'ADP'), ('token', 'ADJ'), ('test', 'NOUN')]
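If you still want the words routed into the per-tag lists from the question, a dict keyed by tag avoids the long if/elif chain. A minimal sketch (note that NLTK's universal tagset labels punctuation '.', not 'PUNCT'):
import nltk

buckets = {tag: [] for tag in
           ("ADJ", "ADP", "ADV", "CONJ", "DET", "NOUN",
            "NUM", "PRT", "PRON", "VERB", ".", "X")}

text = "Hi my name is Bob and I live in Germany"
for word, tag in nltk.pos_tag(nltk.word_tokenize(text), tagset="universal"):
    buckets.get(tag, buckets["X"]).append(word)   # unknown tags fall back to X

print(buckets["NOUN"])   # tagger output may vary, e.g. ['Hi', 'name', 'Bob', 'Germany']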

How to transpose a single item in a nested list

I am trying to input values into my table, but the table is not coming out the way I would like it to. The headers ("OrderDate", "Rep", etc.) of my given csv file should be under the "Columns:" cell, as shown in the linked image (Table of statistical values).
I have tried to create multiple functions that could transpose the headers but when trying to print the table, it would give the error:
TypeError: unsupported format string passed to list.__format__.
One piece of code I tried to insert just before the "labels" line was:
headers2 = [x.split() for x in headers]
P.S. I have removed the csv file code and manually put in a list assigned to "A".
My Code:
A = [['OrderDate', 'Region', 'Rep', 'Item', 'Units', 'Unit Price'],
['4-Jul-2014', 'East', 'Richard', 'Pen Set', '62', '4.99'],
['12-Jul-2014', 'East', 'Nick', 'Binder', '29', '1.99'],
['21-Jul-2014', 'Central', 'Morgan', 'Pen Set', '55', '12.49'],
['29-Jul-2014', 'East', 'Susan', 'Binder', '81', '19.99'],
['7-Aug-2014', 'Central', 'Matthew', 'Pen Set', '42', '23.95'],
['15-Aug-2014', 'East', 'Richard', 'Pencil', '35', '4.99'],
['24-Aug-2014', 'West', 'James', 'Desk', '3', '275'],
['1-Sep-2014', 'Central', 'Smith', 'Desk', '2', '125']]
minVal = []
maxVal = []
hist = []
average = []
stanDev = []
mode = []
headers = A[0] #this sets the variable "headers" as the first row
rows = A[1:] #sets the variable 'rows' to be a nested list without headers
def rows2cols(A):
    if len(A) == 0:  # this covers the base case of having an empty csv file
        return []
    res = [[] for x in headers]  # creates a list of empty lists
    for line in A:
        for col in range(len(line)):
            res[col].append(line[col])
    return res

def convertstringtofloats(A):
    res = []
    for x in A:
        res.append(float(x))
    return res

def isnumericlist(A):
    for x in A:
        try:
            numeric = float(x)
        except:
            return False
    return True

def getMin(A):
    B = convertstringtofloats(cols[col])  # Let's Python know what B is for the next line. If this isn't here, there is an error.
    res = B[0]
    for x in A:
        if x < res:
            res = x
    return res

def getMax(A):
    B = convertstringtofloats(cols[col])  # Let's Python know what B is for the next line. If this isn't here, there is an error.
    res = B[0]
    for x in A:
        if x > res:
            res = x
    return res

def getAvg(A):
    return sum(A)/len(A)

def most_common(A):
    counts = {}
    for x in A:
        counts[(x)] = counts.get((x), 0) + 1
    max = -1
    maxKey = ""
    for key, value in counts.items():
        if max < value:
            max = value
            maxKey = key
    return maxKey

def getSD(A):
    sumsq = 0
    for n in A:
        sumsq += (getAvg(A))**2
    return sumsq

cols = rows2cols(rows)  # transposes 'rows' and assigns to variable 'cols'

def stats(A):
    B = convertstringtofloats(A)
    minVal.append(getMin(B))
    maxVal.append(getMax(B))
    average.append(getAvg(B))
    stanDev.append(getSD(B))

for col in range(len(headers)):
    if isnumericlist(cols[col]):
        stats(cols[col])  # calls the function to calculate stats of the transposed matrix
    else:
        minVal.append("n/a")
        maxVal.append("n/a")
        average.append("n/a")
        stanDev.append("n/a")
    mode.append(most_common(cols[col]))

#headers2 = [x.split() for x in headers]
labels = ["Columns:", "Min", "Max", "Avg", "Std. Dev.", "Most Common Word"]  # labels for the table
table_values = [labels, headers, minVal, maxVal, average, stanDev, mode]  # combines all the calculated stats into a single list
print(table_values)

def print_table(table):
    longest_cols = [
        (max([len(str(row[i])) for row in table]) + 0) for i in range(len(table[0]))
    ]
    row_format = "|".join([" {:>" + str(longest_col) + "} " for longest_col in longest_cols])
    first = True
    for row in table:
        print(row_format.format(*row))
        if first:
            print((sum(longest_cols) + (len(table[0]) - 0) * 3) * "-")
            first = False

print_table(table_values)  # this prints the 'labels' at the top, but the statistical values are not in the right place
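To get the headers to sit under the "Columns:" cell, one option is to transpose table_values so each CSV column becomes one row of scalars; that layout also avoids handing a nested list to str.format, which is what raised the TypeError when headers2 was added. A small sketch reusing the lists built above:
# Sketch: one row per CSV column - header, min, max, avg, std. dev., most common word.
stats_by_column = zip(headers, minVal, maxVal, average, stanDev, mode)
table = [labels] + [list(row) for row in stats_by_column]
print_table(table)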

How to add a member function to an existing Python object?

Previously I created a lot of Python objects of class A, and I would like to add a new function plotting_in_PC_space_with_coloring_option() to class A (the purpose of this function is to plot some data held in the object) and have those old objects call plotting_in_PC_space_with_coloring_option().
An example is:
import copy
import numpy as np
from math import *
from pybrain.structure import *
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.datasets.supervised import SupervisedDataSet
import pickle
import neural_network_related
class A(object):
"""the neural network for simulation"""
'''
todo:
- find boundary
- get_angles_from_coefficients
'''
def __init__(self,
index, # the index of the current network
list_of_coor_data_files, # accept multiple files of training data
energy_expression_file, # input, output files
preprocessing_settings = None,
connection_between_layers = None, connection_with_bias_layers = None,
PCs = None, # principal components
):
self._index = index
self._list_of_coor_data_files = list_of_coor_data_files
self._energy_expression_file = energy_expression_file
self._data_set = []
for item in list_of_coor_data_files:
self._data_set += self.get_many_cossin_from_coordiantes_in_file(item)
self._preprocessing_settings = preprocessing_settings
self._connection_between_layers = connection_between_layers
self._connection_with_bias_layers = connection_with_bias_layers
self._node_num = [8, 15, 2, 15, 8]
self._PCs = PCs
def save_into_file(self, filename = None):
if filename is None:
filename = "network_%s.pkl" % str(self._index) # by default naming with its index
with open(filename, 'wb') as my_file:
pickle.dump(self, my_file, pickle.HIGHEST_PROTOCOL)
return
def get_cossin_from_a_coordinate(self, a_coordinate):
num_of_coordinates = len(a_coordinate) / 3
a_coordinate = np.array(a_coordinate).reshape(num_of_coordinates, 3)
diff_coordinates = a_coordinate[1:num_of_coordinates, :] - a_coordinate[0:num_of_coordinates - 1,:] # bond vectors
diff_coordinates_1=diff_coordinates[0:num_of_coordinates-2,:];diff_coordinates_2=diff_coordinates[1:num_of_coordinates-1,:]
normal_vectors = np.cross(diff_coordinates_1, diff_coordinates_2);
normal_vectors_normalized = np.array(map(lambda x: x / sqrt(np.dot(x,x)), normal_vectors))
normal_vectors_normalized_1 = normal_vectors_normalized[0:num_of_coordinates-3, :];normal_vectors_normalized_2 = normal_vectors_normalized[1:num_of_coordinates-2,:];
diff_coordinates_mid = diff_coordinates[1:num_of_coordinates-2]; # these are bond vectors in the middle (remove the first and last one), they should be perpendicular to adjacent normal vectors
cos_of_angles = range(len(normal_vectors_normalized_1))
sin_of_angles_vec = range(len(normal_vectors_normalized_1))
sin_of_angles = range(len(normal_vectors_normalized_1)) # initialization
for index in range(len(normal_vectors_normalized_1)):
cos_of_angles[index] = np.dot(normal_vectors_normalized_1[index], normal_vectors_normalized_2[index])
sin_of_angles_vec[index] = np.cross(normal_vectors_normalized_1[index], normal_vectors_normalized_2[index])
sin_of_angles[index] = sqrt(np.dot(sin_of_angles_vec[index], sin_of_angles_vec[index])) * np.sign(sum(sin_of_angles_vec[index]) * sum(diff_coordinates_mid[index]));
return cos_of_angles + sin_of_angles
def get_many_cossin_from_coordinates(self, coordinates):
return map(self.get_cossin_from_a_coordinate, coordinates)
def get_many_cossin_from_coordiantes_in_file (self, filename):
coordinates = np.loadtxt(filename)
return self.get_many_cossin_from_coordinates(coordinates)
def mapminmax(self, my_list): # for preprocessing in network
my_min = min(my_list)
my_max = max(my_list)
mul_factor = 2.0 / (my_max - my_min)
offset = (my_min + my_max) / 2.0
result_list = np.array(map(lambda x : (x - offset) * mul_factor, my_list))
return (result_list, (mul_factor, offset)) # also return the parameters for processing
def get_mapminmax_preprocess_result_and_coeff(self,data=None):
if data is None:
data = self._data_set
data = np.array(data)
data = np.transpose(data)
result = []; params = []
for item in data:
temp_result, preprocess_params = self.mapminmax(item)
result.append(temp_result)
params.append(preprocess_params)
return (np.transpose(np.array(result)), params)
def mapminmax_preprocess_using_coeff(self, input_data=None, preprocessing_settings=None):
# try begin
if preprocessing_settings is None:
preprocessing_settings = self._preprocessing_settings
temp_setttings = np.transpose(np.array(preprocessing_settings))
result = []
for item in input_data:
item = np.multiply(item - temp_setttings[1], temp_setttings[0])
result.append(item)
return result
# try end
def get_expression_of_network(self, connection_between_layers=None, connection_with_bias_layers=None):
if connection_between_layers is None:
connection_between_layers = self._connection_between_layers
if connection_with_bias_layers is None:
connection_with_bias_layers = self._connection_with_bias_layers
node_num = self._node_num
expression = ""
# first part: network
for i in range(2):
expression = '\n' + expression
mul_coef = connection_between_layers[i].params.reshape(node_num[i + 1], node_num[i])
bias_coef = connection_with_bias_layers[i].params
for j in range(np.size(mul_coef, 0)):
temp_expression = 'layer_%d_unit_%d = tanh( ' % (i + 1, j)
for k in range(np.size(mul_coef, 1)):
temp_expression += ' %f * layer_%d_unit_%d +' % (mul_coef[j, k], i, k)
temp_expression += ' %f);\n' % (bias_coef[j])
expression = temp_expression + expression # order of expressions matter in OpenMM
# second part: definition of inputs
index_of_backbone_atoms = [2, 5, 7, 9, 15, 17, 19];
for i in range(len(index_of_backbone_atoms) - 3):
index_of_coss = i
index_of_sins = i + 4
expression += 'layer_0_unit_%d = (raw_layer_0_unit_%d - %f) * %f;\n' % \
(index_of_coss, index_of_coss, self._preprocessing_settings[index_of_coss][1], self._preprocessing_settings[index_of_coss][0])
expression += 'layer_0_unit_%d = (raw_layer_0_unit_%d - %f) * %f;\n' % \
(index_of_sins, index_of_sins, self._preprocessing_settings[index_of_sins][1], self._preprocessing_settings[index_of_sins][0])
expression += 'raw_layer_0_unit_%d = cos(dihedral_angle_%d);\n' % (index_of_coss, i)
expression += 'raw_layer_0_unit_%d = sin(dihedral_angle_%d);\n' % (index_of_sins, i)
expression += 'dihedral_angle_%d = dihedral(p%d, p%d, p%d, p%d);\n' % \
(i, index_of_backbone_atoms[i], index_of_backbone_atoms[i+1],index_of_backbone_atoms[i+2],index_of_backbone_atoms[i+3])
return expression
def write_expression_into_file(self, out_file = None):
if out_file is None: out_file = self._energy_expression_file
expression = self.get_expression_of_network()
with open(out_file, 'w') as f_out:
f_out.write(expression)
return
def get_mid_result(self, input_data=None, connection_between_layers=None, connection_with_bias_layers=None):
if input_data is None: input_data = self._data_set
if connection_between_layers is None: connection_between_layers = self._connection_between_layers
if connection_with_bias_layers is None: connection_with_bias_layers = self._connection_with_bias_layers
node_num = self._node_num
temp_mid_result = range(4)
mid_result = []
# first need to do preprocessing
for item in self.mapminmax_preprocess_using_coeff(input_data, self._preprocessing_settings):
for i in range(4):
mul_coef = connection_between_layers[i].params.reshape(node_num[i + 1], node_num[i]) # fix node_num
bias_coef = connection_with_bias_layers[i].params
previous_result = item if i == 0 else temp_mid_result[i - 1]
temp_mid_result[i] = np.dot(mul_coef, previous_result) + bias_coef
if i != 3: # the last output layer is a linear layer, while others are tanh layers
temp_mid_result[i] = map(tanh, temp_mid_result[i])
mid_result.append(copy.deepcopy(temp_mid_result)) # note that should use deepcopy
return mid_result
def get_PC_and_save_it_to_network(self):
'''get PCs and save the result into _PCs
'''
mid_result = self.get_mid_result()
self._PCs = [item[1] for item in mid_result]
return
def train(self):
####################### set up autoencoder begin #######################
node_num = self._node_num
in_layer = LinearLayer(node_num[0], "IL")
hidden_layers = [TanhLayer(node_num[1], "HL1"), TanhLayer(node_num[2], "HL2"), TanhLayer(node_num[3], "HL3")]
bias_layers = [BiasUnit("B1"),BiasUnit("B2"),BiasUnit("B3"),BiasUnit("B4")]
out_layer = LinearLayer(node_num[4], "OL")
layer_list = [in_layer] + hidden_layers + [out_layer]
molecule_net = FeedForwardNetwork()
molecule_net.addInputModule(in_layer)
for item in (hidden_layers + bias_layers):
molecule_net.addModule(item)
molecule_net.addOutputModule(out_layer)
connection_between_layers = range(4); connection_with_bias_layers = range(4)
for i in range(4):
connection_between_layers[i] = FullConnection(layer_list[i], layer_list[i+1])
connection_with_bias_layers[i] = FullConnection(bias_layers[i], layer_list[i+1])
molecule_net.addConnection(connection_between_layers[i]) # connect two neighbor layers
molecule_net.addConnection(connection_with_bias_layers[i])
molecule_net.sortModules() # this is some internal initialization process to make this module usable
####################### set up autoencoder end #######################
trainer = BackpropTrainer(molecule_net, learningrate=0.002,momentum=0.4,verbose=False, weightdecay=0.1, lrdecay=1)
data_set = SupervisedDataSet(node_num[0], node_num[4])
sincos = self._data_set
(sincos_after_process, self._preprocessing_settings) = self.get_mapminmax_preprocess_result_and_coeff(data = sincos)
for item in sincos_after_process: # is it needed?
data_set.addSample(item, item)
trainer.trainUntilConvergence(data_set, maxEpochs=50)
self._connection_between_layers = connection_between_layers
self._connection_with_bias_layers = connection_with_bias_layers
print("Done!\n")
return
def create_sge_files_for_simulation(self,potential_centers = None):
if potential_centers is None:
potential_centers = self.get_boundary_points()
neural_network_related.create_sge_files(potential_centers)
return
def get_boundary_points(self, list_of_points = None, num_of_bins = 5):
if list_of_points is None: list_of_points = self._PCs
x = [item[0] for item in list_of_points]
y = [item[1] for item in list_of_points]
temp = np.histogram2d(x,y, bins=[num_of_bins, num_of_bins])
hist_matrix = temp[0]
# add a set of zeros around this region
hist_matrix = np.insert(hist_matrix, num_of_bins, np.zeros(num_of_bins), 0)
hist_matrix = np.insert(hist_matrix, 0, np.zeros(num_of_bins), 0)
hist_matrix = np.insert(hist_matrix, num_of_bins, np.zeros(num_of_bins + 2), 1)
hist_matrix = np.insert(hist_matrix, 0, np.zeros(num_of_bins +2), 1)
hist_matrix = (hist_matrix != 0).astype(int)
sum_of_neighbors = np.zeros(np.shape(hist_matrix)) # number of neighbors occupied with some points
for i in range(np.shape(hist_matrix)[0]):
for j in range(np.shape(hist_matrix)[1]):
if i != 0: sum_of_neighbors[i,j] += hist_matrix[i - 1][j]
if j != 0: sum_of_neighbors[i,j] += hist_matrix[i][j - 1]
if i != np.shape(hist_matrix)[0] - 1: sum_of_neighbors[i,j] += hist_matrix[i + 1][j]
if j != np.shape(hist_matrix)[1] - 1: sum_of_neighbors[i,j] += hist_matrix[i][j + 1]
bin_width_0 = temp[1][1]-temp[1][0]
bin_width_1 = temp[2][1]-temp[2][0]
min_coor_in_PC_space_0 = temp[1][0] - 0.5 * bin_width_0 # multiply by 0.5 since we want the center of the grid
min_coor_in_PC_space_1 = temp[2][0] - 0.5 * bin_width_1
potential_centers = []
for i in range(np.shape(hist_matrix)[0]):
for j in range(np.shape(hist_matrix)[1]):
if hist_matrix[i,j] == 0 and sum_of_neighbors[i,j] != 0: # no points in this block but there are points in neighboring blocks
temp_potential_center = [round(min_coor_in_PC_space_0 + i * bin_width_0, 2), round(min_coor_in_PC_space_1 + j * bin_width_1, 2)]
potential_centers.append(temp_potential_center)
return potential_centers
# this function is added after those old objects of A were created
def plotting_in_PC_space_with_coloring_option(self,
list_of_coordinate_files_for_plotting=None, # accept multiple files
color_option='pure'):
'''
by default, we are using training data, and we also allow external data input
'''
if list_of_coordinate_files_for_plotting is None:
PCs_to_plot = self._PCs
else:
temp_sincos = []
for item in list_of_coordinate_files_for_plotting:
temp_sincos += self.get_many_cossin_from_coordiantes_in_file(item)
temp_mid_result = self.get_mid_result(input_data = temp_sincos)
PCs_to_plot = [item[1] for item in temp_mid_result]
(x, y) = ([item[0] for item in PCs_to_plot], [item[1] for item in PCs_to_plot])
# coloring
if color_option == 'pure':
coloring = 'red'
elif color_option == 'step':
coloring = range(len(x))
fig, ax = plt.subplots()
ax.scatter(x,y, c=coloring)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
plt.show()
return
But it seems that plotting_in_PC_space_with_coloring_option() was not bound to those old objects. Is there any way to fix it? (I do not want to recreate these objects, since creation involves CPU-intensive calculation and would take a very long time.)
Thanks!
Something like this:
class A:
    def q(self): print 1

a = A()

def f(self): print 2

setattr(A, 'f', f)
a.f()
This is called a monkey patch.
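For the unpickled objects in the question this is normally all you need: instances look methods up on their class, so patching A makes plotting_in_PC_space_with_coloring_option() available on every existing (and later-unpickled) instance. If you ever need to attach a method to just one instance, types.MethodType binds it explicitly; a small sketch extending the toy example above:
import types

class A:
    def q(self): print 1

a = A()                         # an "old" instance, created before f existed

def f(self): print 2

A.f = f                         # same effect as setattr(A, 'f', f)
a.f()                           # prints 2 - the existing instance sees the new method

def g(self): print 3

a.g = types.MethodType(g, a)    # bound to this single instance only
a.g()                           # prints 3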

Convert LexToken to list Python

I have a lexer for HTML tokens which returns and prints LexToken objects for a given HTML string.
I have a parser which takes a list of tokens and a grammar as input and returns True if the sequence of tokens forms a valid string in the grammar.
I want to combine these programs into a complete lexer-parser program.
The problem is that the second program expects the tokens as a list, while the output of the first program is LexToken objects.
Lexer
import ply.lex as lex

tokens = (
    'LANGLE',        # <
    'LANGLESLASH',   # </
    'RANGLE',        # >
    'SLASHRANGLE',   # />
    'EQUAL',         # =
    'STRING',        # "144"
    'WORD',          # 'Welcome' in "Welcome to my webpage."
    'NUMBER'         # 12, 5.6, -1., 3.14159, -8.1, 867.5309
)

t_ignore = ' \t\v\r'  # shortcut for whitespace

states = (
    ('htmlcomment', 'exclusive'),  # <!--
)

def t_htmlcomment(t):
    r'<!--'
    t.lexer.begin('htmlcomment')

def t_htmlcomment_end(t):
    r'-->'
    t.lexer.lineno += t.value.count('\n')
    t.lexer.begin('INITIAL')
    pass

def t_htmlcomment_error(t):
    t.lexer.skip(1)

def t_LANGLESLASH(t):
    r'</'
    return t

def t_LANGLE(t):
    r'<'
    return t

def t_SLASHRANGLE(t):
    r'/>'
    return t

def t_RANGLE(t):
    r'>'
    return t

def t_EQUAL(t):
    r'='
    return t

def t_STRING(t):
    r'"[^"]*"'
    t.value = t.value[1:-1]  # drop "surrounding quotes"
    return t

def t_WORD(t):
    r'[^ <>]+'
    return t

webpage = "hello <!-- comment --> 123456 <b> Bushra </b> all"
htmllexer = lex.lex()
htmllexer.input(webpage)
while True:
    tok = htmllexer.token()
    if not tok: break
    print tok
This is my parser
work_count = 0  # track one notion of "time taken"

def addtoset(theset, index, elt):
    if not (elt in theset[index]):
        theset[index] = [elt] + theset[index]
        return True
    return False

def parse(tokens, grammar):
    global work_count
    work_count = 0
    tokens = tokens + [ "end_of_input_marker" ]
    chart = {}
    start_rule = grammar[0]
    for i in range(len(tokens)+1):
        chart[i] = [ ]
    start_state = (start_rule[0], [], start_rule[1], 0)
    chart[0] = [ start_state ]
    for i in range(len(tokens)):
        while True:
            changes = False
            for state in chart[i]:
                # State === x -> a b . c d , j
                x = state[0]
                ab = state[1]
                cd = state[2]
                j = state[3]
                next_states = [ (rule[0], [], rule[1], i)
                                for rule in grammar if cd <> [] and cd[0] == rule[0] ]
                work_count = work_count + len(grammar)
                for next_state in next_states:
                    changes = addtoset(chart, i, next_state) or changes
                if cd <> [] and tokens[i] == cd[0]:
                    next_state = (x, ab + [cd[0]], cd[1:], j)
                    changes = addtoset(chart, i+1, next_state) or changes
                next_states = [ (jstate[0], jstate[1] + [x], (jstate[2])[1:],
                                 jstate[3])
                                for jstate in chart[j]
                                if cd == [] and jstate[2] <> [] and (jstate[2])[0] == x ]
                work_count = work_count + len(chart[j])
                for next_state in next_states:
                    changes = addtoset(chart, i, next_state) or changes
            # We're done if nothing changed!
            if not changes:
                break
    accepting_state = (start_rule[0], start_rule[1], [], 0)
    return accepting_state in chart[len(tokens)-1]

grammar = [
    ("html", ["element", "html"]),
    ("html", [ ]),
    ("element", ["word"]),
    ("element", ["tag-open", "word", "tag-close"]),
    ("tag-open", ["<", "word", ">"]),
    ("tag-close", ["<", "/", "word", ">"])
]
tokens = [ "<", "b", ">", "Hello", "<", "/", "b", ">" ]
result = parse(tokens, grammar)
print result
You can do this by using the value attribute of the LexToken:
webpage = "hello <!-- comment --> 123456 <b> Bushra </b> all"
htmllexer = lex.lex()
htmllexer.input(webpage)
tokens = []
while True:
    tok = htmllexer.token()
    if not tok: break
    tokens.append(tok.value)
print tokens  # ['hello', '123456', '<', 'b', '>', 'Bushra', '</', 'b', '>', 'all']
All available attributes may be obtained by using the dir() function:
print dir(tok)
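The other standard attributes on a LexToken are type, lineno and lexpos; if your grammar is written over token types instead of raw values, you can collect (type, value) pairs the same way:
tokens = []
while True:
    tok = htmllexer.token()
    if not tok: break
    tokens.append((tok.type, tok.value))   # e.g. ('WORD', 'hello'), ('LANGLE', '<'), ...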
