Convert LexToken to list Python - python

I have a lexer for html tokens which returns and prints lextoken objects in a given html string
I have a parser which takes tokens as a list and grammar as input and returns true if the set of tokens form a valid string in grammar
I want to combine these programs to form a complete lexer - parser program
But the problem is in the second program the tokens are in form of list and output of first program is lextoken
Lexer
import ply.lex as lex
tokens = (
'LANGLE', # <
'LANGLESLASH', # </
'RANGLE', # >
'SLASHRANGLE', # />
'EQUAL', # =
'STRING', # "144"
'WORD', # 'Welcome' in "Welcome to my webpage."
'NUMBER' # 12, 5.6, -1., 3.14159, -8.1, 867.5309
)
t_ignore = ' \t\v\r' # shortcut for whitespace
states = (
('htmlcomment', 'exclusive'), # <!--
)
def t_htmlcomment(t):
r'<!--'
t.lexer.begin('htmlcomment')
def t_htmlcomment_end(t):
r'-->'
t.lexer.lineno += t.value.count('\n')
t.lexer.begin('INITIAL')
pass
def t_htmlcomment_error(t):
t.lexer.skip(1)
def t_LANGLESLASH(t):
r'</'
return t
def t_LANGLE(t):
r'<'
return t
def t_SLASHRANGLE(t):
r'/>'
return t
def t_RANGLE(t):
r'>'
return t
def t_EQUAL(t):
r'='
return t
def t_STRING(t):
r'"[^"]*"'
t.value = t.value[1:-1] # drop "surrounding quotes"
return t
def t_WORD(t):
r'[^ <>]+'
return t
webpage = "hello <!-- comment --> 123456 <b> Bushra </b> all"
htmllexer = lex.lex()
htmllexer.input(webpage)
while True:
tok = htmllexer.token()
if not tok: break
print tok
This is my parser
work_count = 0 # track one notion of "time taken"
def addtoset(theset,index,elt):
if not (elt in theset[index]):
theset[index] = [elt] + theset[index]
return True
return False
def parse(tokens,grammar):
global work_count
work_count = 0
tokens = tokens + [ "end_of_input_marker" ]
chart = {}
start_rule = grammar[0]
for i in range(len(tokens)+1):
chart[i] = [ ]
start_state = (start_rule[0], [], start_rule[1], 0)
chart[0] = [ start_state ]
for i in range(len(tokens)):
while True:
changes = False
for state in chart[i]:
# State === x -> a b . c d , j
x = state[0]
ab = state[1]
cd = state[2]
j = state[3]
next_states = [ (rule[0],[],rule[1],i)
for rule in grammar if cd <> [] and cd[0] == rule[0] ]
work_count = work_count + len(grammar)
for next_state in next_states:
changes = addtoset(chart,i,next_state) or changes
if cd <> [] and tokens[i] == cd[0]:
next_state = (x, ab + [cd[0]], cd[1:], j)
changes = addtoset(chart,i+1,next_state) or changes
next_states = [ (jstate[0], jstate[1] + [x], (jstate[2])[1:],
jstate[3] )
for jstate in chart[j]
if cd == [] and jstate[2] <> [] and (jstate[2])[0] == x ]
work_count = work_count + len(chart[j])
for next_state in next_states:
changes = addtoset(chart,i,next_state) or changes
# We're done if nothing changed!
if not changes:
break
accepting_state = (start_rule[0], start_rule[1], [], 0)
return accepting_state in chart[len(tokens)-1]
grammar = [
("html", ["element", "html"]),
("html", [ ]),
("element", ["word"]),
("element", ["tag-open","word","tag-close"]),
("tag-open",["<","word",">"]),
("tag-close",["<","/","word",">"])
]
tokens = [ "<", "b", ">" , "Hello", "<", "/" , "b" , ">"]
result=parse(tokens, grammar)
print result

You can do this by using the attribute value of LexToken:
webpage = "hello <!-- comment --> 123456 <b> Bushra </b> all"
htmllexer = lex.lex()
htmllexer.input(webpage)
tokens = []
while True:
tok = htmllexer.token()
if not tok: break
tokens.append(tok.value)
print tokens #['hello', '123456', '<', 'b', '>', 'Bushra', '</', 'b', '>', 'all']
All available attributes may be obtained by using the dir() function:
print dir(tok)

Related

How to retrieve section IDs using Google docs API Python

For instance, we have a document such as this -
Table Of Content
Introduction
<text: A>
1.1 Background
<text: B>
1.2 Problem statement
<text: C>
Approach
<text: D>
2.1.1 Outline of the algorithm
<text: E>
I need to pattern match a "string" in all of the texts in the document. For example my search string could be "REQ-". Which could match "REQ-1", "REQ-2" to "REQ-10".
Suppose if "REQ-1" was located in text:C, and "REQ-2" in text:E, then the output I am looking for is
("REQ-1", "1.2"), ("REQ-2", "2.1.1") etc
Essentially, it matches the search string, identify all matches, and for each match, returns a 2-tuple of the matched string and the "section id" in the document containing the matched string.
def get_creds():
credentials = service_account.Credentials.from_service_account_file(
"cred_new.json", scopes=SCOPES
)
return credentials
def search_paragraph_element(element, search_str):
text_run = element.get('textRun')
if not text_run:
return False
res = text_run.get('content').find(search_str)
if res != -1:
return True
return False
def search_structural_elements(elements, search_str):
text = ''
hd_1 = 0
hd_2 = 0
hd_3 = 0
for value in elements:
if 'paragraph' in value:
if value['paragraph']['paragraphStyle']['namedStyleType'] == 'HEADING_1':
hd_1 = hd_1 + 1
hd_2 = 0
hd_3 = 0
elif value['paragraph']['paragraphStyle']['namedStyleType'] == 'HEADING_2':
hd_2 = hd_2 + 1
hd_3 = 0
elif value['paragraph']['paragraphStyle']['namedStyleType'] == 'HEADING_3':
hd_3 = hd_3 + 1
elements = value.get('paragraph').get('elements')
for elem in elements:
res = search_paragraph_element(elem, search_str)
if res is True:
return str(hd_1) + '.' + str(hd_2) + '.' + str(hd_3)
return text
def main():
"""Uses the Docs API to print out the text of a document."""
credentials = get_creds()
service = build("docs", "v1", credentials=credentials).documents()
properties = service.get(documentId=REQ_DOCUMENT_ID).execute()
doc_content = properties.get('body').get('content')
print(search_structural_elements(doc_content, "MySearchString"))
if __name__ == '__main__':
main()
``

Passing string over urls django

I'm having trouble in understanding this error in my code, first let me try and explain what is happening and what I'm I trying to do.
My code is designed to load up 45 separate text files into an array, including the weight of each word/phrase and the word phrase itself. This has to occur at the beginning, before any description is received.
Second, once the description is received, it is parsed by my software into words/phrases, which are compared to the words/phrases in the array.
Third, my software then provides the top three classes, in rank order (first/second/third) by number, along with the score for each class.
I've made a django application that will serve this code, so I have a form which will provide two parameters classes and description, like this:
class TrademarkClassifierForm(forms.Form):
"""
TODO: This forms will cover the questions the
initial classifier program does
:returns: TODO
"""
classes = forms.CharField(max_length=10,
label="Test all trademark classes? Type 'yes' to do so or else enter the class to be tested ")
description = forms.CharField(widget=forms.Textarea)
def __init__(self, *args, **kwargs):
super(TrademarkClassifierForm, self).__init__(*args, **kwargs)
self.helper = FormHelper()
self.helper.add_input(Submit('submit', 'Submit'))
Then I want to pass this two parameters in the view over the url like this:
class TrademarkClassifierResultView(FormView):
"""
TODO: Post should redirect to it's on page with GET,
specify set values in some query parameters,
something like ?classes=yes&name=NameOfTrademarkClass
This should be visible on results page.
:param: classes
:param: description
:returns: TODO - params
"""
template_name = 'trademark.html'
form_class = TrademarkClassifierForm
def get(self, request, *args, **kwargs):
classes = str(self.request.GET.get('classes'))
description = str(self.request.GET.get('description'))
form = TrademarkClassifierForm(initial={'classes': classes, 'description': description})
context_data = self.get_context_data(classes, description, form=form)
return self.render_to_response(context_data)
def form_valid(self, form):
classes = form.cleaned_data['classes']
description = form.cleaned_data['description']
return redirect(self.get_success_url(classes, description))
def form_invalid(self, form):
messages.add_message(self.request, messages.ERROR,
"Invalid data. Please check fields.")
return self.render_to_response(
self.get_context_data(form=form)
)
def get_success_url(self, classes=None, description=None):
return reverse("classifier:trademark") + "?classes=" + str(classes) + "&description" + str(description)
def get_context_data(self, classes, description, **kwargs):
context = super(TrademarkClassifierResultView, self).get_context_data(**kwargs)
context['classes'] = classes
context['description'] = description
context['trademark'] = ClassifyMarkBased.control_program(classes, description)
return context
Now my problem is this error:
Environment:
Request Method: GET
Request URL: http://127.0.0.1:8000/trademark/
Django Version: 1.11.2
Python Version: 2.7.12
Installed Applications:
['django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'django.contrib.sites',
'classifier',
'crispy_forms',
'allauth',
'allauth.account',
'allauth.socialaccount',
'widget_tweaks',
'debug_toolbar']
Installed Middleware:
['django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
'debug_toolbar.middleware.DebugToolbarMiddleware']
Traceback:
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/core/handlers/exception.py" in inner
41. response = get_response(request)
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/core/handlers/base.py" in _get_response
187. response = self.process_exception_by_middleware(e, request)
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/core/handlers/base.py" in _get_response
185. response = wrapped_callback(request, *callback_args, **callback_kwargs)
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/views/generic/base.py" in view
68. return self.dispatch(request, *args, **kwargs)
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/views/generic/base.py" in dispatch
88. return handler(request, *args, **kwargs)
File "/home/petar/Documents/Synergy/Trademark/TM_base/classifier/views.py" in get
60. context_data = self.get_context_data(classes, description, form=form)
File "/home/petar/Documents/Synergy/Trademark/TM_base/classifier/views.py" in get_context_data
82. context['trademark'] = ClassifyMarkBased.control_program(classes, description)
File "/home/petar/Documents/Synergy/Trademark/TM_base/classifier/services/classify_mark_based.py" in control_program
89. N = len(word_count_array_for_all_classes[i])
Exception Type: IndexError at /trademark/
Exception Value: list index out of range
This is my url:
url(r'^trademark/', TrademarkClassifierResultView.as_view(), name="trademark"),
and this is the part of the code that should calculate the trademark over this two parameters:
import os
import numpy as np
import re
import requests
class TrademarkService(object):
# coding: utf-8
# In[5]:
# compare input string to a class
# for words not found,look in a dictionary - add to text files for trademark words
# In[6]:
# open each trademark class file and read the words/frequency back into an array
#staticmethod
def open_file_read_words(file_name):
unique_words_and_count_not_format = []
tm_word_count_array = []
my_list = []
all_possible_entries = 1
with open(file_name) as f:
lines = [line.strip() for line in open(file_name)]
all_possible_entries = len(lines)
tm_word_count_array = [[0 for x in range(2)] for y in range(all_possible_entries)]
i = 0
while i < all_possible_entries:
tm_word_count_array[i] = lines[i].split(',', 1)
i += 1
i = 0
while i < all_possible_entries:
tm_word_count_array[i][0] = int(tm_word_count_array[i][0])
i += 1
return tm_word_count_array
# In[7]:
# this section normalizes word frequency by the number of words x 1000
#staticmethod
def normalize_array(tm_word_count_array):
list_of_freqs = []
max_entries = len(tm_word_count_array)
list_of_freqs = [0 for y in range(max_entries)]
i = 0
while i < max_entries:
list_of_freqs[i] = tm_word_count_array[i][0]
i += 1
max_value = max(list_of_freqs)
i = 0
while i < max_entries:
tm_word_count_array[i][0] = ((float(tm_word_count_array[i][0])) / max_entries) * 1000
i += 1
return tm_word_count_array
# In[8]:
# include the list of not useful words here
#staticmethod
def find_not_useful_words(word):
not_useful_words = (
"about", "are", "upon", "-", " ", "up", "other", "or", "not", "namely", "more", "made", "in", "for", "except",
"but", "being", "all", "against", "was", "were", "will", "that", "its", "on", "it", "at", "was", "our", "your",
"ours", "yours", "their", "them", "other", "out", "having", "have", "has", "in", "be", "than", "use", "uses",
"using", "", "by", "and", "an", "a", "use", "used", "using", "for", "to", "of", "-)", "-]", "with", "as", "in",
"the", "from")
for test_word in not_useful_words:
if word == test_word:
return False
return True
# In[9]:
# clean up the phrases by removing problematic characters
#staticmethod
def clean_up_phrases(data):
important_words = ''
word = data
for c in word:
if 0 <= ord(c) <= 127:
# this is an ascii character.
not_a_variable = 0
else:
if ord(c) == 201:
word = word.replace(c, "e")
elif ord(c) == 241:
word = word.replace(c, "n")
elif ord(c) == 225:
word = word.replace(c, "a")
elif ord(c) == 251:
word = word.replace(c, "u")
elif ord(c) == 8206:
word = word.replace(c, "")
else:
word = word.replace(c, "")
# continue_yes=raw_input("do you want to continue?")
word = word.lower()
word = str(filter(lambda ch: ch not in "?.!/;:,'()[]", word))
# calls the function above to remove words that were found to interfere with classification
if data.find_not_useful_words(word):
if len(word) > 1:
important_words += word
return important_words
# In[10]:
# find the important words in the string
#staticmethod
def find_important_words(data):
all_entries = len(data)
important_words = []
for word in data.split():
for c in word:
if 0 <= ord(c) <= 127:
# this is an ascii character.
not_a_variable = 0
else:
if ord(c) == 201:
word = word.replace(c, "e")
elif ord(c) == 241:
word = word.replace(c, "n")
elif ord(c) == 225:
word = word.replace(c, "a")
elif ord(c) == 251:
word = word.replace(c, "u")
elif ord(c) == 8206:
word = word.replace(c, "")
else:
word = word.replace(c, "")
word = word.lower()
word = str(filter(lambda ch: ch not in " ?.!/;:,'()[]", word))
if word.endswith("-"):
word = word[:-1]
if word.startswith("-"):
word = word[:1]
if data.find_not_useful_words(word):
if len(word) > 1:
important_words.append(word)
return important_words
# In[11]:
#staticmethod
def analyze_each_line_test_data(test_sentence, N, normalized_tm_word_count_array):
# remove problematic characters and words, plus find important words/phrases
test_important_phrases = test_sentence.clean_up_phrases(test_sentence)
i = 0
total_found = 0
total_TM_class_count = 0
total_TM_words_matched = []
# score the trademark phrases in the string
while i < N:
count_phrases = 0
if len(normalized_tm_word_count_array[i][1].split()) > 1:
if test_important_phrases.find(normalized_tm_word_count_array[i][1]) > -1:
total_TM_words_matched.append(normalized_tm_word_count_array[i][1])
total_TM_class_count += (normalized_tm_word_count_array[i][0])
total_found += 1
i += 1
# decompose the string and remove extraneous words, then score the words in the string
test_important_words = test_sentence.find_important_words(test_sentence)
i = 0
while i < N:
count_words = 0
if test_important_words.count(normalized_tm_word_count_array[i][1]) > 0:
total_TM_words_matched.append(normalized_tm_word_count_array[i][1])
count_words = test_important_words.count(normalized_tm_word_count_array[i][1])
total_TM_class_count += (normalized_tm_word_count_array[i][0] * count_words)
total_found += 1
i += 1
i = 0
normalized_tm_word_count_values = [0 for y in range(N)]
normalized_tm_word_count_words = ['a' for y in range(N)]
while i < N:
normalized_tm_word_count_values[i] = normalized_tm_word_count_array[i][0]
normalized_tm_word_count_words[i] = normalized_tm_word_count_array[i][1]
i += 1
total_words_to_match = len(test_important_words) + len(test_important_phrases)
not_found_words = list(set(test_important_words) - set(normalized_tm_word_count_words))
return total_found, total_TM_words_matched, not_found_words, total_TM_class_count
# In[12]:
#staticmethod
def open_class_file_read_words_to_array(file_name, file_name_class=None):
tm_word_count_array = []
tm_word_count_array = file_name.open_file_read_words(file_name_class)
return tm_word_count_array
# In[13]:
# create a file for the trademark results
#staticmethod
def create_results_file(file_name, results_array, description):
unique_words_and_count_not_format = []
unique_words_and_count_to_write = []
open_file_name = open(file_name, 'a')
open_file_name.write("New trademark comparison")
open_file_name.write("\n")
open_file_name.write(description)
open_file_name.write("\n")
unique_words_and_count_to_write = np.array(results_array, dtype=object)
np.savetxt(open_file_name, unique_words_and_count_to_write, fmt='%s', delimiter=',')
open_file_name.write("\n")
open_file_name.write("\n")
open_file_name.write("\n")
open_file_name.close()
# In[14]:
# this section controls the program
#staticmethod
def control_the_program(classes, description):
description = []
word_count_array_for_all_classes = []
correct_class_set = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17',
'18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33',
'34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45']
# depending on the answer, only one class worth of trademark words will be loaded up or else all will be loaded up
# test_all_classes = raw_input(
# "Test all trademark classes? Type 'yes' to do so or else enter the class to be tested ")
test_all_classes = classes
# test description of goods/services
# test_data_array = raw_input("Provide the description of goods or services ")
test_data_array = description
# file_name_data = raw_input("Provide the identifier for the results file ")
# this file has the output of the classification engine, including the top 3 results
# file_name_results = 'user_test_comparison_results_' + file_name_data + '.txt'
# call to a program to open each file of trademark words in turn and read the words back into an array
if test_all_classes == 'yes':
i = 1
number_classes_to_check = 45
word_count_array_for_all_classes = [[] for z in range(46)]
temp_array = []
while i <= 45:
# opens each file with the trademark words
file_name_class = 'counted_phrases_class' + str(i) + '.txt'
temp_array = classes.open_class_file_read_words_to_array(file_name_class)
# normalization is used because some classes have many words and some have few words
# the words/phrases are weighted according to frequency
word_count_array_for_all_classes[i] = classes.normalize_array(temp_array)
i += 1
else:
# print "you didn't enter yes"
pass
# length_test_data_array = len(test_data_array)
# open(file_name_results, 'a').close()
# start_writing_results = open(file_name_results, 'a')
# start_writing_results.write("The start of the test")
# start_writing_results.write("\n")
# start_writing_results.write("Total number of potential items to match ")
# start_writing_results.write(str(length_test_data_array))
# start_writing_results.write("\n")
# start_writing_results.close()
top_result = [0 for y in range(2)]
second_result = [0 for y in range(2)]
third_result = [0 for y in range(2)]
top_array_words_not_found = []
second_array_words_not_found = []
third_array_words_not_found = []
counter_for_9vs42 = 0
counter_for_data_errors = 0
top_result = [0 for y in range(2)]
second_result = [0 for y in range(2)]
third_result = [0 for y in range(2)]
top_array_words_not_found = []
second_array_words_not_found = []
third_array_words_not_found = []
actual_class_results = [0 for y in range(2)]
overall_array_results = [[0 for x in range(3)] for y in range(4)]
actual_class_words_not_found = []
i = 1
while i <= 45:
total_found = 0
total_TM_words_matched = 0
not_found_words = ['']
score = 0
N = len(word_count_array_for_all_classes[i])
total_found, total_TM_words_matched, not_found_words, score = classes.analyze_each_line_test_data(test_data_array, N,
word_count_array_for_all_classes[i])
if int(score) > 0:
if int(score) > top_result[0]:
third_result[0] = second_result[0]
third_result[1] = second_result[1]
third_array_words_not_found = second_array_words_not_found
second_result[0] = top_result[0]
second_result[1] = top_result[1]
second_array_words_not_found = top_array_words_not_found
top_result[0] = int(score)
top_result[1] = i
top_array_words_not_found = ['']
top_array_words_not_found = not_found_words
elif int(score) > second_result[0]:
third_result[0] = second_result[0]
third_result[1] = second_result[1]
third_array_words_not_found = second_array_words_not_found
second_result[0] = int(score)
second_result[1] = i
second_array_words_not_found = ['']
second_array_words_not_found = not_found_words
elif int(score) > third_result[0]:
third_result[0] = int(score)
third_result[1] = i
third_array_words_not_found = ['']
third_array_words_not_found = not_found_words
i += 1
overall_array_results[0][0] = top_result[0]
overall_array_results[0][1] = top_result[1]
overall_array_results[0][2] = top_array_words_not_found
overall_array_results[1][0] = second_result[0]
overall_array_results[1][1] = second_result[1]
overall_array_results[1][2] = second_array_words_not_found
overall_array_results[2][0] = third_result[0]
overall_array_results[2][1] = third_result[1]
overall_array_results[2][2] = third_array_words_not_found
# all results - including the first, second, third choices of the engine and the original description - are written to the file
# create_results_file(file_name_results, overall_array_results, test_data_array)
# start_writing_results = open(file_name_results, 'a')
# start_writing_results.write("The end of the test")
# start_writing_results.write("\n")
#
# start_writing_results.write("\n")
# start_writing_results.write("\n")
# start_writing_results.close()
# print "finished the process"
From the code that I've provided you can see that this parameters where provided over python raw_input and after calculation code was creating a file in which you can read about the result.
I've rewritten this so I can serve it over the django application, so parameters classes and description should overwrite the raw_input and the result will be displayed in the template, like this:
{{ trademark.overall_array_results.top_result }}<br>
{{ trademark.overall_array_results.second_result }}<br>
{{ trademark.overall_array_results.third_result }}
I'm not sure if I'm doing the write thing here, so I need help to understand this better, can someone help me to over come error.
If classes is not "yes", then word_count_array_for_all_classes remains an empty list.

Comparing two lists and popping the larger value to a new list (Python)

I have to compare two lists and add the top value to a new list. I want to use the .pop function with my code I have now. This is homework but all my resources have been shut down so any help would be greatly appreciated. Thanks.
Here is my code if it helps.
class topList():
__slots__ = ( "name", "gender", "occurences" )
def mkList( name, gender, occurences ):
find = topList()
find.name = name
find.gender = gender
find.occurences = occurences
return find
def main():
year = input( 'Enter year: ' )
file = open( 'yob' + year + '.txt' )
lst = []
femaleLst = []
maleLst = []
for line in file:
line1 = line.split( "," )
names = mkList( line1[0], line1[1], line1[2] )
lst.append( names )
if names.gender == 'F':
femaleLst += [ line1 ]
else:
maleLst += [ line1 ]
while len( lst ) < 20:
if name.occurences( maleLst ) > name.occurences( femaleLst ):
maleLst.pop(0) += [ lst ]
else:
femaleLst.pop(0) += [ lst ]
print( femaleLst )
main()

Reference to value of the function

At beginning i wanna say i'm newbie in use Python and everything I learned it came from tutorials.
My problem concerning reference to the value. I'm writing some script which is scrapping some information from web sites. I defined some function:
def MatchPattern(count):
sock = urllib.urlopen(Link+str(count))
htmlSource = sock.read()
sock.close()
root = etree.HTML(htmlSource)
root = etree.HTML(htmlSource)
result = etree.tostring(root, pretty_print=True, method="html")
expr1 = check_reg(root)
expr2 = check_practice(root)
D_expr1 = no_ks(root)
D_expr2 = Registred_by(root)
D_expr3 = Name_doctor(root)
D_expr4 = Registration_no(root)
D_expr5 = PWZL(root)
D_expr6 = NIP(root)
D_expr7 = Spec(root)
D_expr8 = Start_date(root)
#-----Reg_practice-----
R_expr1 = Name_of_practise(root)
R_expr2 = TERYT(root)
R_expr3 = Street(root)
R_expr4 = House_no(root)
R_expr5 = Flat_no(root)
R_expr6 = Post_code(root)
R_expr7 = City(root)
R_expr8 = Practice_no(root)
R_expr9 = Kind_of_practice(root)
#------Serv_practice -----
S_expr1 = TERYT2(root)
S_expr2 = Street2(root)
S_expr3 = House_no2(root)
S_expr4 = Flat_no2(root)
S_expr5 = Post_code2(root)
S_expr6 = City2(root)
S_expr7 = Phone_no(root)
return expr1
return expr2
return D_expr1
return D_expr2
return D_expr3
return D_expr4
return D_expr5
return D_expr6
return D_expr7
return D_expr8
#-----Reg_practice-----
return R_expr1
return R_expr2
return R_expr3
return R_expr4
return R_expr5
return R_expr6
return R_expr7
return R_expr8
return R_expr9
#------Serv_practice -----
return S_expr1
return S_expr2
return S_expr3
return S_expr4
return S_expr5
return S_expr6
return S_expr7
So now inside the script I wanna check value of the expr1 returned by my fynction. I don't know how to do that. Can u guys help me ? Is my function written correct ?
EDIT:
I can't add answer so I edit my current post
This is my all script. Some comments are in my native language but i add some in english
#! /usr/bin/env python
#encoding:UTF-8-
# ----------------------------- importujemy potrzebne biblioteki i skrypty -----------------------
# ------------------------------------------------------------------------------------------------
import urllib
from lxml import etree, html
import sys
import re
import MySQLdb as mdb
from TOR_connections import *
from XPathSelection import *
import os
# ------------------------------ Definiuje xPathSelectors ------------------------------------------
# --------------------------------------------------------------------------------------------------
# -------Doctors -----
check_reg = etree.XPath("string(//html/body/div/table[1]/tr[3]/td[2]/text())") #warunek Lekarz
check_practice = etree.XPath("string(//html/body/div/table[3]/tr[4]/td[2]/text())") #warunek praktyka
no_ks = etree.XPath("string(//html/body/div/table[1]/tr[1]/td[2]/text())")
Registred_by = etree.XPath("string(//html/body/div/table[1]/tr[4]/td[2]/text())")
Name_doctor = etree.XPath("string(//html/body/div/table[2]/tr[2]/td[2]/text())")
Registration_no = etree.XPath("string(//html/body/div/table[2]/tr[3]/td[2]/text())")
PWZL = etree.XPath("string(//html/body/div/table[2]/tr[4]/td[2]/text())")
NIP = etree.XPath("string(//html/body/div/table[2]/tr[5]/td[2]/text())")
Spec = etree.XPath("string(//html/body/div/table[2]/tr[18]/td[2]/text())")
Start_date = etree.XPath("string(//html/body/div/table[2]/tr[20]/td[2]/text())")
#-----Reg_practice-----
Name_of_practise = etree.XPath("string(//html/body/div/table[2]/tr[1]/td[2]/text())")
TERYT = etree.XPath("string(//html/body/div/table[2]/tr[7]/td[2]/*/text())")
Street = etree.XPath("string(//html/body/div/table[2]/tr[8]/td[2]/text())")
House_no = etree.XPath("string(//html/body/div/table[2]/tr[9]/td[2]/*/text())")
Flat_no = etree.XPath("string(//html/body/div/table[2]/tr[10]/td[2]/*/text())")
Post_code = etree.XPath("string(//html/body/div/table[2]/tr[11]/td[2]/*/text())")
City = etree.XPath("string(//html/body/div/table[2]/tr[12]/td[2]/*/text())")
Practice_no = etree.XPath("string(//html/body/div/table[3]/tr[4]/td[2]/text())")
Kind_of_practice = etree.XPath("string(//html/body/div/table[3]/tr[5]/td[2]/text())")
#------Serv_practice -----
TERYT2 = etree.XPath("string(//html/body/div/table[3]/tr[14]/td/table/tr[2]/td[2]/*/text())")
Street2 = etree.XPath("string(//html/body/div/table[3]/tr[14]/td/table/tr[3]/td[2]/text())")
House_no2 = etree.XPath("string(//html/body/div/table[3]/tr[14]/td/table/tr[4]/td[2]/*/text())")
Flat_no2 = etree.XPath("string(//html/body/div/table[3]/tr[14]/td/table/tr[5]/td[2]/i/text())")
Post_code2 = etree.XPath("string(//html/body/div/table[3]/tr[14]/td/table/tr[6]/td[2]/*/text())")
City2 = etree.XPath("string(//html/body/div/table[3]/tr[14]/td/table/tr[7]/td[2]/*/text())")
Phone_no = etree.XPath("string(//html/body/div/table[3]/tr[14]/td/table/tr[8]/td[2]/text())")
# --------------------------- deklaracje zmiennych globalnych ----------------------------------
# ----------------------------------------------------------------------------------------------
decrease = 9
No = 1
Link = "http://rpwdl.csioz.gov.pl/rpz/druk/wyswietlKsiegaServletPub?idKsiega="
# --------------------------- funkcje zdefiniowane ----------------------------------
# ----------------------------------------------------------------------------------------------
def MatchPattern(count):
sock = urllib.urlopen(Link+str(count))
htmlSource = sock.read()
sock.close()
root = etree.HTML(htmlSource)
root = etree.HTML(htmlSource)
result = etree.tostring(root, pretty_print=True, method="html")
expr1 = check_reg(root)
expr2 = check_practice(root)
D_expr1 = no_ks(root)
D_expr2 = Registred_by(root)
D_expr3 = Name_doctor(root)
D_expr4 = Registration_no(root)
D_expr5 = PWZL(root)
D_expr6 = NIP(root)
D_expr7 = Spec(root)
D_expr8 = Start_date(root)
#-----Reg_practice-----
R_expr1 = Name_of_practise(root)
R_expr2 = TERYT(root)
R_expr3 = Street(root)
R_expr4 = House_no(root)
R_expr5 = Flat_no(root)
R_expr6 = Post_code(root)
R_expr7 = City(root)
R_expr8 = Practice_no(root)
R_expr9 = Kind_of_practice(root)
#------Serv_practice -----
S_expr1 = TERYT2(root)
S_expr2 = Street2(root)
S_expr3 = House_no2(root)
S_expr4 = Flat_no2(root)
S_expr5 = Post_code2(root)
S_expr6 = City2(root)
S_expr7 = Phone_no(root)
return expr1
return expr2
return D_expr1
return D_expr2
return D_expr3
return D_expr4
return D_expr5
return D_expr6
return D_expr7
return D_expr8
#-----Reg_practice-----
return R_expr1
return R_expr2
return R_expr3
return R_expr4
return R_expr5
return R_expr6
return R_expr7
return R_expr8
return R_expr9
#------Serv_practice -----
return S_expr1
return S_expr2
return S_expr3
return S_expr4
return S_expr5
return S_expr6
return S_expr7
# --------------------------- ustanawiamy polaczenie z baza danych -----------------------------
# ----------------------------------------------------------------------------------------------
con = mdb.connect('localhost', 'root', '******', 'SANBROKER', charset='utf8');
# ---------------------------- początek programu -----------------------------------------------
# ----------------------------------------------------------------------------------------------
with con:
cur = con.cursor()
cur.execute("SELECT Old_num FROM SANBROKER.Number_of_records;")
Old_num = cur.fetchone()
count = Old_num[0]
counter = input("Input number of rows: ")
# ----------------------- pierwsze połączenie z TORem ------------------------------------
# ----------------------------------------------------------------------------------------
#connectTor()
#conn = httplib.HTTPConnection("my-ip.heroku.com")
#conn.request("GET", "/")
#response = conn.getresponse()
#print(response.read())
while count <= counter: # co dziesiata liczba
# --------------- pierwsze wpisanie do bazy danych do Archive --------------------
with con:
cur = con.cursor()
cur.execute("UPDATE SANBROKER.Number_of_records SET Archive_num=%s",(count))
# ---------------------------------------------------------------------------------
if decrease == 0:
MatchPattern(count)
# Now I wanna check some expresions (2 or 3)
# After that i wanna write all the values into my database
#------- ostatnie czynności:
percentage = count / 100
print "rekordów: " + str(count) + " z: " + str(counter) + " procent dodanych: " + str(percentage) + "%"
with con:
cur = con.cursor()
cur.execute("UPDATE SANBROKER.Number_of_records SET Old_num=%s",(count))
decrease = 10-1
count +=1
else:
MatchPattern(count)
# Now I wanna check some expresions (2 or 3)
# After that i wanna write all the values into my database
# ------ ostatnie czynności:
percentage = count / 100
print "rekordów: " + str(count) + " z: " + str(counter) + " procent dodanych: " + str(percentage) + "%"
with con:
cur = con.cursor()
cur.execute("UPDATE SANBROKER.Number_of_records SET Old_num=%s",(count))
decrease -=1
count +=1
Well, I'm assuming check_reg is a function that returns a boolean (either True or False).
If that's the case, to check the return:
if expr1:
print "True."
else:
print "False"
There's more than one way to do it, but basically, if expr1: is all you need to do the checking.
To capture the return value of a function, assign the function to a name with an equal sign, like this:
return_value = somefunction(some_value)
print('The return value is ',return_value)
Keep in mind that when the first return statement is encountered, the function will exit. So if you have more than one return statement after each other, only the first will execute.
If you want to return multiple things, add them to a list and then return the list.
Here is an improved version of your function:
def match_pattern(count):
sock = urllib.urlopen(Link+str(count))
htmlsource = sock.read()
sock.close()
root = etree.HTML(htmlSource)
# root = etree.HTML(htmlSource) - duplicate line
# result = etree.tostring(root, pretty_print=True, method="html")
function_names = [check_reg, check_practice, no_ks, Registered_by, \
Name_doctor, Registration_no, PWZL, NIP, Spec, Start_date, \
Name_of_practise, TERYT, Street, House_no2, Flat_no, \
Post_code2, City2, Phone_no]
results = []
for function in function_names:
results.append(function(root))
return results
r = match_pattern(1)
print r[0] # this will be the result of check_reg(root)
The code you have posted is quite ambigous. Can you please fix the ident to let us know what belongs to the function and which part is the script.
A function can returns only one value. You cannot do :
return something
return something_else
return ...
The function will ends when first value will be returned.
What you can do is returning a list, tuple or dict containing all your values.
For instance :
return (something,something_else,...)
or
return [something,something_else,...]
In your case, it seems better to create a class that would have all values you want as attributes, and turn this function into a method that would set the attributes values.
class Example(object):
def __init__ ( self , link , count ):
sock = urllib.urlopen(link+str(count))
htmlSource = sock.read()
sock.close()
root = etree.HTML(htmlSource)
root = etree.HTML(htmlSource)
result = etree.tostring(root, pretty_print=True, method="html")
self.expr1 = check_reg(root)
self.expr2 = check_practice(root)
self.D_expr1 = no_ks(root)
...
self.D_expr8 = Start_date(root)
#-----Reg_practice-----
self.R_expr1 = Name_of_practise(root)
...
self.R_expr9 = Kind_of_practice(root)
#------Serv_practice -----
self.S_expr1 = TERYT2(root)
...
self.S_expr7 = Phone_no(root)
Then you will be able to use this class like :
exampleInstance = Example ( "link you want to use" , 4 ) # the second argument is your 'count' value
# Now you can use attributes of your class to get the values you want
print exampleInstance . expr1
print exampleInstance . S_expr7

Generating Python soaplib stubs from WSDL

I'd like to generate a stub SOAP web service class using the Python soaplib module, based on an existing WSDL. The idea is to generate a mock for a third party web service.
Does any such code generator exist, or must we write our own?
Martin
Okay, I had a go at hacking my wsdl2interface (http://pypi.python.org/pypi/wsdl2interface) script to output soaplib code. I think I have something that works, though it's not pretty or especially well tested.
I'll paste it here for the record. I could be persuaded to release it if someone needs it, though it's not exactly my best code. Note that it uses Suds' WSDL parser to generate soaplib code, which is a bit strange in itself.
Run like this:
$ wsdl2soaplib <url or filename of WSDL> > wsdl.py
The code (you'll need suds in your path, ideally in a virtualenv):
from StringIO import StringIO
import os.path
import sys
import textwrap
import keyword
import re
import suds.client
VALID_IDENTIFIER_RE = re.compile(r"[_A-Za-z][_A-Za-z1-9]*")
VALID_IDENTIFIER_FIRST_LETTER_RE = re.compile(r"[_A-Za-z]")
VALID_IDENTIFIER_SUBSEQUENT_LETTER_RE = re.compile(r"[_A-Za-z1-9]")
HEADER = '''\
"""SOAP web services generated from:
%(wsdl)s.
"""
from soaplib.serializers.primitive import (
String, Integer, Float, Double, DateTime, Bolean, Null, Array, Map, Any
)
from soaplib.serializers.clazz import ClassSerializer
from soaplib.service import SoapServiceBase
from soaplib.service import soapmethod
'''
INTERFACE = '''\
class %(name)s(%(bases)s):
"""%(docstring)s"""
'''
SERVICE_INTERFACE_DOCSTRING = '''\
SOAP service ``%(serviceName)s`` with target namespace %(tns)s.
'''
TYPE_INTERFACE_DOCSTRING = '''\
SOAP %(type)s ``{%(namespace)s}%(name)s``
'''
TYPE_MAP = '''\
WSDL_TYPES = {
%(items)s
}
'''
SOAPMETHOD = ''' #soapmethod(%(args)s, _returns=%(response)s)'''
METHOD = ''' def %(name)s(self, %(args)s):'''
METHOD_DOCSTRING = '''\
"""Parameters:
%(args)s
Returns: %(response)s
"""
'''
STANDARD_TYPE_NAMESPACES = [
'http://schemas.xmlsoap.org/soap/encoding/',
'http://schemas.xmlsoap.org/wsdl/',
'http://www.w3.org/2001/XMLSchema'
]
SCHEMA_TYPE_MAPPING = {
None: '%(typeName)s',
'None': 'None',
'boolean': 'Boolean',
'string': 'String',
'long': 'Integer',
'int': 'Integer',
'short': 'Integer',
'byte': 'Integer',
'unsignedLong': 'Integer',
'unsignedInt': 'Integer',
'unsignedShort': 'Integer',
'unsignedByte': 'Integer',
'positiveInteger': 'Integer',
'nonPositiveInteger': 'Integer',
'negativeInteger': 'Integer',
'nonNegativeInteger': 'Integer',
'float': 'Float',
'double': 'Float',
'decimal': 'Decimal',
'dateTime': 'DateTime',
'date': 'DateTime',
'anyURI': 'String',
'token': 'String',
'normalizedString': 'String',
'base64Binary': 'String',
'hexBinary': 'String',
}
def formatDocstring(text, indent=4, colwidth=78):
width = colwidth - indent
joiner = '\n' + ' ' * indent
return joiner.join(textwrap.wrap(text, width) + [''])
def typeName(type, sd):
resolved = type.resolve()
return resolved.name or ''
def schemaTypeName(type, sd, deps=None):
resolved = type.resolve()
name = resolved.name or ''
schemaType = SCHEMA_TYPE_MAPPING.get(name)
if schemaType is None: # not a standard type
# user default
schemaType = SCHEMA_TYPE_MAPPING[None]
# possibly save dependency link
if deps is not None:
deps.append(unicode(name))
required = type.required()
schemaType = schemaType % dict(typeName=name, required=required)
if type.unbounded():
schemaType = "Array(%s)" % schemaType
return schemaType
def normalizeIdentifier(identifier):
if not VALID_IDENTIFIER_RE.match(identifier):
newIdentifierLetters = []
firstLetter = True
for letter in identifier:
if firstLetter:
if VALID_IDENTIFIER_FIRST_LETTER_RE.match(letter):
newIdentifierLetters.append(letter)
else:
newIdentifierLetters.append('_')
firstLetter = False
else:
if VALID_IDENTIFIER_SUBSEQUENT_LETTER_RE.match(letter):
newIdentifierLetters.append(letter)
else:
newIdentifierLetters.append('_')
identifier = ''.join(newIdentifierLetters)
if keyword.iskeyword(identifier):
identifier = identifier + '_'
return identifier
def generate(client, url=None, standardTypeNamespaces=STANDARD_TYPE_NAMESPACES, removeInputOutputMesssages=True):
"""Given a WSDL URL, return a file that could become your interfaces.py
"""
printed = [] # sequence of type name -> string
for sd in client.sd:
serviceOut = StringIO()
print >>serviceOut, HEADER % dict(
wsdl=url,
)
printed.append(('', serviceOut.getvalue(),))
# Types
typeMap = {}
typeSeq = []
typeDeps = {}
typeAttributes = {}
typesPrinted = []
for type_ in sd.types:
typeOut = StringIO()
resolved = type_[0].resolve()
namespaceURL = resolved.namespace()[1]
if namespaceURL not in standardTypeNamespaces:
if resolved.enum():
typeDescription = "enumeration"
else:
typeDescription = "complex type"
# Look for basess
interfaceBases = []
if resolved.extension():
def find(t):
for c in t.rawchildren:
if c.extension():
find(c)
if c.ref is not None:
interfaceBases.append(c.ref[0])
find(resolved)
if not interfaceBases:
interfaceBases = ['ClassSerializer']
rawTypeName = typeName(type_[0], sd)
typeInterfaceName = normalizeIdentifier(rawTypeName)
typeMap[rawTypeName] = typeInterfaceName
typeSeq.append((rawTypeName, typeInterfaceName,))
typeAttributes[rawTypeName] = {}
print >>typeOut, INTERFACE % dict(
name=normalizeIdentifier(typeInterfaceName),
bases=', '.join(interfaceBases),
docstring=formatDocstring(TYPE_INTERFACE_DOCSTRING % dict(
type=typeDescription,
name=rawTypeName,
namespace=namespaceURL,
)
)
)
print >>typeOut, " class types:"
if resolved.enum():
for attr in type_[0].children():
name = attr[0].name.replace(' ', '_')
print >>typeOut, " %s = String # XXX: Enumeration value" % name
else:
for attr in type_[0].children():
name = attr[0].name.replace(' ', '_')
attrTypeName = typeName(attr[0], sd)
typeAttributes[rawTypeName][name] = attrTypeName
schemaType = schemaTypeName(attr[0], sd, deps=typeDeps.setdefault(unicode(rawTypeName), []))
print >>typeOut, " %s = %s" % (normalizeIdentifier(name), schemaType,)
print >>typeOut
typesPrinted.append((rawTypeName, typeOut.getvalue(),))
serviceInterfaceOut = StringIO()
# Main service interface
print >>serviceInterfaceOut, INTERFACE % dict(
name=normalizeIdentifier(sd.service.name),
bases=u"SoapServiceBase",
docstring=formatDocstring(SERVICE_INTERFACE_DOCSTRING % dict(
serviceName=sd.service.name,
tns=sd.wsdl.tns[1],
)
)
)
methods = {} # name -> (response type, list of parameters,)
for p in sd.ports:
for m in p[1]:
methodName = m[0]
methodArgs = m[1]
if methodName not in methods:
methodDef = p[0].method(methodName)
# XXX: This is discards the namespace part
if methodDef.soap.output.body.wrapped:
inputMessage = methodDef.soap.input.body.parts[0].element[0]
outputMessage = methodDef.soap.output.body.parts[0].element[0]
if outputMessage in typeAttributes:
if len(typeAttributes[outputMessage]) > 0:
response = typeAttributes[outputMessage].values()[0]
else:
response = "None"
else:
response = outputMessage
# Remove types used as input/output messages
if removeInputOutputMesssages:
remove = False
for idx, (t, x) in enumerate(typesPrinted):
if t == inputMessage:
remove = True
break
if remove:
del typesPrinted[idx]
if inputMessage in typeMap:
del typeMap[inputMessage]
remove = False
for idx, (t, x) in enumerate(typesPrinted):
if t == outputMessage:
remove = True
break
if remove:
del typesPrinted[idx]
if outputMessage in typeMap:
del typeMap[outputMessage]
else:
response = methodDef.soap.output.body.parts[0].element[0]
methods[methodName] = (response, methodArgs,)
for methodName in sorted(methods):
methodArgNames = [m[0] for m in methods[methodName][1]]
methodReturnType = methods[methodName][0]
methodArgDetails = []
methodArgSpecs = []
for m in methods[methodName][1]:
argDetail = m[1]
# for docstring
methodModifierParts = []
if not argDetail.required():
methodModifierParts.append('optional')
if argDetail.nillable:
methodModifierParts.append('may be None')
methodModifiers = ""
if methodModifierParts:
methodModifiers = ' (%s)' % ', '.join(methodModifierParts)
argTypeName = typeName(argDetail, sd)
methodSpec = "``%s`` -- %s%s" % (
argDetail.name,
argTypeName,
methodModifiers
)
methodArgDetails.append(methodSpec)
# for #soapmethod decorator
schemaType = schemaTypeName(argDetail, sd)
methodArgSpecs.append(schemaType)
# TODO: Probably not aware of array return types
if methodReturnType not in typeMap and methodReturnType in SCHEMA_TYPE_MAPPING:
methodReturnType = SCHEMA_TYPE_MAPPING[methodReturnType]
print >>serviceInterfaceOut, SOAPMETHOD % dict(
args=', '.join(methodArgSpecs),
response=methodReturnType,
)
print >>serviceInterfaceOut, METHOD % dict(
name=normalizeIdentifier(methodName),
args=', '.join(methodArgNames),
)
print >>serviceInterfaceOut, METHOD_DOCSTRING % dict(
args='\n '.join(methodArgDetails),
response=methodReturnType,
)
print >>serviceInterfaceOut
# Sort list of complex types based on internal dependencies
def sortDeps(printed):
printed = list(reversed(printed))
queue = [item for item in printed if len(typeDeps.get(unicode(item[0]), [])) == 0]
satisfied = set(queue)
remaining = [item for item in printed if item not in queue]
sortedPrinted = []
while queue:
item = queue.pop()
itemTypeName = unicode(item[0])
sortedPrinted.append(item)
satisfied.add(itemTypeName)
for item in remaining:
remainingItemTypeName = unicode(item[0])
depsList = typeDeps.get(remainingItemTypeName, [])
remainingDeps = []
for dep in depsList:
if dep not in satisfied:
remainingDeps.append(dep)
typeDeps[remainingItemTypeName] = remainingDeps
if len(remainingDeps) == 0:
queue.append(item)
remaining.remove(item)
return sortedPrinted
typesPrinted = sortDeps(typesPrinted)
# Print everything
printed.extend(typesPrinted)
printed.append((sd.service.name, serviceInterfaceOut.getvalue(),))
typeMapOut = StringIO()
print >>typeMapOut, TYPE_MAP % dict(
items=',\n'.join([" '%s': %s" % k for k in typeSeq if k[0] in typeMap])
)
print >>typeMapOut
printed.append(('', typeMapOut.getvalue(),))
return '\n'.join([v[1] for v in printed])
def main():
if len(sys.argv) < 2:
print "Usage: %s <url>" % sys.argv[0]
print "The output will be printed to the console"
return
if not '://' in sys.argv[1]:
sys.argv[1] = 'file://' + os.path.abspath(sys.argv[1])
client = suds.client.Client(sys.argv[1])
print generate(client, sys.argv[1])
if __name__ == '__main__':
main()
I have just created a github repository where I'm improving on optilude's script to make it work with soaplib2.0 and more. The link is https://github.com/fvieira/wsdl2soaplib.

Categories