How to retrieve section IDs using the Google Docs API in Python

For instance, we have a document such as this -
Table Of Content
Introduction
<text: A>
1.1 Background
<text: B>
1.2 Problem statement
<text: C>
Approach
<text: D>
2.1.1 Outline of the algorithm
<text: E>
I need to pattern-match a string in all of the text in the document. For example, my search string could be "REQ-", which could match "REQ-1", "REQ-2", up to "REQ-10".
Suppose if "REQ-1" was located in text:C, and "REQ-2" in text:E, then the output I am looking for is
("REQ-1", "1.2"), ("REQ-2", "2.1.1") etc
Essentially, it should match the search string, identify all matches, and, for each match, return a 2-tuple of the matched string and the "section id" of the section containing it. Here is my current code:
from google.oauth2 import service_account
from googleapiclient.discovery import build

SCOPES = ["https://www.googleapis.com/auth/documents.readonly"]
REQ_DOCUMENT_ID = "YOUR_DOCUMENT_ID"  # placeholder

def get_creds():
    credentials = service_account.Credentials.from_service_account_file(
        "cred_new.json", scopes=SCOPES
    )
    return credentials

def search_paragraph_element(element, search_str):
    text_run = element.get('textRun')
    if not text_run:
        return False
    res = text_run.get('content').find(search_str)
    if res != -1:
        return True
    return False

def search_structural_elements(elements, search_str):
    text = ''
    hd_1 = 0
    hd_2 = 0
    hd_3 = 0
    for value in elements:
        if 'paragraph' in value:
            # bump the heading counters to track the current section id
            if value['paragraph']['paragraphStyle']['namedStyleType'] == 'HEADING_1':
                hd_1 = hd_1 + 1
                hd_2 = 0
                hd_3 = 0
            elif value['paragraph']['paragraphStyle']['namedStyleType'] == 'HEADING_2':
                hd_2 = hd_2 + 1
                hd_3 = 0
            elif value['paragraph']['paragraphStyle']['namedStyleType'] == 'HEADING_3':
                hd_3 = hd_3 + 1
            elements = value.get('paragraph').get('elements')
            for elem in elements:
                res = search_paragraph_element(elem, search_str)
                if res is True:
                    # returns on the first hit only
                    return str(hd_1) + '.' + str(hd_2) + '.' + str(hd_3)
    return text

def main():
    """Uses the Docs API to print out the text of a document."""
    credentials = get_creds()
    service = build("docs", "v1", credentials=credentials).documents()
    properties = service.get(documentId=REQ_DOCUMENT_ID).execute()
    doc_content = properties.get('body').get('content')
    print(search_structural_elements(doc_content, "MySearchString"))

if __name__ == '__main__':
    main()
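As written, search_structural_elements returns the section id of the first hit only. Below is a minimal sketch of a variant that collects every (match, section-id) pair instead, assuming the same heading-counter scheme and a regex such as REQ-\d+; the helper name find_all_matches is my own, not part of the Docs API.
import re

def find_all_matches(elements, pattern):
    """Walk the body content, tracking heading counters, and collect
    (matched_string, section_id) tuples for every regex match."""
    matches = []
    hd_1 = hd_2 = hd_3 = 0
    for value in elements:
        if 'paragraph' not in value:
            continue
        style = value['paragraph']['paragraphStyle'].get('namedStyleType', '')
        if style == 'HEADING_1':
            hd_1, hd_2, hd_3 = hd_1 + 1, 0, 0
        elif style == 'HEADING_2':
            hd_2, hd_3 = hd_2 + 1, 0
        elif style == 'HEADING_3':
            hd_3 = hd_3 + 1
        # drop trailing zero counters, so (1, 2, 0) becomes "1.2"
        parts = [hd_1, hd_2, hd_3]
        while parts and parts[-1] == 0:
            parts.pop()
        section_id = '.'.join(str(n) for n in parts)
        for elem in value['paragraph'].get('elements', []):
            text_run = elem.get('textRun')
            if not text_run:
                continue
            for m in re.findall(pattern, text_run.get('content', '')):
                matches.append((m, section_id))
    return matches

# e.g. find_all_matches(doc_content, r'REQ-\d+') might yield
# [('REQ-1', '1.2'), ('REQ-2', '2.1.1')]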

Related

Need to optimize scraping code - select URL with parameters

This is simple code for getting a URL with search parameters. It actually works, but I think it needs to be optimized.
def target_url(search_term, include_term, intext_term, target_site_in, page):
    base_template_0 = f'https://www.google.com/search?q={search_term}+"{include_term}"+intext:{intext_term}+site:{target_site_in}&hl=en&rlz='
    base_template_1 = f'https://www.google.com/search?q={search_term}+"{include_term}"+intext:{intext_term}&hl=en&rlz='
    base_template_2 = f'https://www.google.com/search?q={search_term}+"{include_term}"&hl=en&rlz='
    base_template_3 = f'https://www.google.com/search?q={search_term}&hl=en&rlz='
    search_term = search_term.replace(' ', '+')
    base_url_0 = base_template_0.format(search_term)
    base_url_1 = base_template_1.format(search_term)
    base_url_2 = base_template_2.format(search_term)
    base_url_3 = base_template_3.format(search_term)
    url_template_0 = base_url_0 + '&start={}'
    url_template_1 = base_url_1 + '&start={}'
    url_template_2 = base_url_2 + '&start={}'
    url_template_3 = base_url_3 + '&start={}'
    if page == 0 and search_term and include_term and intext_term and target_site_in:
        return base_url_0
    if page == 0 and search_term and include_term and intext_term:
        return base_url_1
    if page == 0 and search_term and include_term:
        return base_url_2
    if page == 0 and search_term:
        return base_url_3
    else:
        if search_term and include_term and intext_term and target_site_in:
            return url_template_0.format(page)
        if search_term and include_term and intext_term:
            return url_template_1.format(page)
        if search_term and include_term:
            return url_template_2.format(page)
        if search_term:
            return url_template_3.format(page)
Four parameters control the query: search_term, include_term, intext_term, and target_site_in; depending on which are set, a different URL template is chosen.
Give me a better idea for optimization.
Instead of having multiple template strings and selecting among them, you can write a function that builds the final search query:
def get_search_query(search_term, include_term, intext_term, target_site_in):
    response = search_term.replace(' ', '+')
    if include_term:
        response = f'{response}+"{include_term}"'
    if intext_term:
        response = f"{response}+intext:{intext_term}"
    if target_site_in:
        response = f"{response}+site:{target_site_in}"
    return response
Now, in your function, you can call it:
def target_url(search_term, include_term, intext_term, target_site_in, page):
    query = get_search_query(search_term, include_term, intext_term, target_site_in)
    url = f'https://www.google.com/search?q={query}&hl=en&rlz='
    if page != 0:
        url = f"{url}&start={page}"
    return url
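For instance, with hypothetical sample values, the refactored builder produces the same shape of URL as the original templates:
# all argument values below are made up for illustration
print(target_url("python tutorial", "pandas", "dataframe", "stackoverflow.com", 10))
# https://www.google.com/search?q=python+tutorial+"pandas"+intext:dataframe+site:stackoverflow.com&hl=en&rlz=&start=10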

Spacy Dependency Parsing with Pandas dataframe

I would like to extract noun-adjective pairs for Aspect Based Sentiment Analysis using spaCy's dependency parser on my pandas dataframe. I was trying this code on the Amazon Fine Food Reviews dataset from Kaggle: Named Entity Recognition in aspect-opinion extraction using dependency rule matching
However, something seems to be wrong with the way I feed my pandas dataframe to spaCy. My results are not what I would expect them to be. Could someone help me debug this, please? Thanks a lot.
!python -m spacy download en_core_web_lg
import nltk
nltk.download('vader_lexicon')
import spacy
nlp = spacy.load("en_core_web_lg")
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

def find_sentiment(doc):
    # find roots of all entities in the text
    for i in df['Text'].tolist():
        doc = nlp(i)
        ner_heads = {ent.root.idx: ent for ent in doc.ents}
        rule3_pairs = []
        for token in doc:
            children = token.children
            A = "999999"
            M = "999999"
            add_neg_pfx = False
            for child in children:
                if(child.dep_ == "nsubj" and not child.is_stop): # nsubj is nominal subject
                    if child.idx in ner_heads:
                        A = ner_heads[child.idx].text
                    else:
                        A = child.text
                if(child.dep_ == "acomp" and not child.is_stop): # acomp is adjectival complement
                    M = child.text
                # example - 'this could have been better' -> (this, not better)
                if(child.dep_ == "aux" and child.tag_ == "MD"): # MD is modal auxiliary
                    neg_prefix = "not"
                    add_neg_pfx = True
                if(child.dep_ == "neg"): # neg is negation
                    neg_prefix = child.text
                    add_neg_pfx = True
            if (add_neg_pfx and M != "999999"):
                M = neg_prefix + " " + M
            if(A != "999999" and M != "999999"):
                rule3_pairs.append((A, M, sid.polarity_scores(M)['compound']))
    return rule3_pairs

df['three_tuples'] = df['Text'].apply(find_sentiment)
df.head()
My result is not what I expect, which clearly means something is wrong with my loop.
If you call apply on df['Text'], then you are essentially looping over every value in that column and passing that value to a function.
Here, however, your function itself iterates over the same dataframe column that you are applying the function to while also overwriting the value that is passed to it early in the function.
So I would start by rewriting the function as follows and see if it produces the intended results. I can't say for sure, as you didn't post any sample data, but this should at least move the ball forward:
def find_sentiment(text):
    doc = nlp(text)
    ner_heads = {ent.root.idx: ent for ent in doc.ents}
    rule3_pairs = []
    for token in doc:
        children = token.children
        A = "999999"
        M = "999999"
        add_neg_pfx = False
        for child in children:
            if(child.dep_ == "nsubj" and not child.is_stop): # nsubj is nominal subject
                if child.idx in ner_heads:
                    A = ner_heads[child.idx].text
                else:
                    A = child.text
            if(child.dep_ == "acomp" and not child.is_stop): # acomp is adjectival complement
                M = child.text
            # example - 'this could have been better' -> (this, not better)
            if(child.dep_ == "aux" and child.tag_ == "MD"): # MD is modal auxiliary
                neg_prefix = "not"
                add_neg_pfx = True
            if(child.dep_ == "neg"): # neg is negation
                neg_prefix = child.text
                add_neg_pfx = True
        if (add_neg_pfx and M != "999999"):
            M = neg_prefix + " " + M
        if(A != "999999" and M != "999999"):
            rule3_pairs.append((A, M, sid.polarity_scores(M)['compound']))
    return rule3_pairs
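Now that the function takes a single text, the apply call from the question should produce a per-row result; a minimal usage sketch, assuming the same df and 'Text' column as in the question:
# each review is now parsed independently instead of re-looping over the column
df['three_tuples'] = df['Text'].apply(find_sentiment)
df[['Text', 'three_tuples']].head()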

Google Sheets API - Formatting inserted values

With this code I've updated a bunch of rows in a Google Spreadsheet.
The request goes well and returns the updatedRange below.
result = service.spreadsheets().values().append(
    spreadsheetId=spreadsheetId,
    range=rangeName,
    valueInputOption="RAW",
    insertDataOption="INSERT_ROWS",
    body=body
).execute()
print(result)
print("Range updated")
updateRange = result['updates']['updatedRange']
Now I would like to make a batchUpdate request to set the formatting or set a protected range, but those APIs require a range specified as startRowIndex, endRowIndex, and so on.
How can I retrieve the row indexes from the updatedRange?
While waiting for a native or better answer, I'll post a function I've created to translate a namedRange into a gridRange.
The function is far from perfect and does not translate the sheet name to a sheet ID (I left that task to another specific function), but it accepts named ranges in the forms:
sheet!A:B
sheet!A1:B
sheet!A:B5
sheet!A1:B5
Here is the code:
import re

def namedRange2Grid(self, rangeName):
    ascii_uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    match = re.match(r".*?!([A-Z0-9]+):([A-Z0-9]+)", rangeName)
    if match:
        start = match.group(1)
        end = match.group(2)
        # the row part is optional and may contain zeros (e.g. A10)
        matchStart = re.match(r"([A-Z]+)([0-9]+)?", start)
        matchEnd = re.match(r"([A-Z]+)([0-9]+)?", end)
        if matchStart and matchEnd:
            GridRange = {}
            letterStart = matchStart.group(1)
            letterEnd = matchEnd.group(1)
            if matchStart.group(2):
                numberStart = int(matchStart.group(2))
                GridRange['startRowIndex'] = numberStart - 1
            if matchEnd.group(2):
                numberEnd = int(matchEnd.group(2))
                GridRange['endRowIndex'] = numberEnd
            # base-26 column conversion: 'A' -> 0, 'Z' -> 25, 'AA' -> 26, ...
            i = 0
            for l in letterStart:
                i = i * 26 + ascii_uppercase.index(l) + 1
            GridRange['startColumnIndex'] = i - 1
            i = 0
            for l in letterEnd:
                i = i * 26 + ascii_uppercase.index(l) + 1
            GridRange['endColumnIndex'] = i  # end index is exclusive
            return GridRange
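A quick check of the conversion, assuming the function is used standalone (the self parameter suggests it was lifted from a class, so None is passed for it here):
grid = namedRange2Grid(None, 'sheet!A1:B5')
print(grid)
# {'startRowIndex': 0, 'endRowIndex': 5, 'startColumnIndex': 0, 'endColumnIndex': 2}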

Convert LexToken to list Python

I have a lexer for HTML tokens which returns and prints LexToken objects for a given HTML string.
I have a parser which takes a list of tokens and a grammar as input and returns True if the tokens form a valid string in the grammar.
I want to combine these programs into a complete lexer-parser program.
The problem is that the second program expects the tokens as a list, while the output of the first program is LexToken objects.
Lexer
import ply.lex as lex

tokens = (
    'LANGLE',       # <
    'LANGLESLASH',  # </
    'RANGLE',       # >
    'SLASHRANGLE',  # />
    'EQUAL',        # =
    'STRING',       # "144"
    'WORD',         # 'Welcome' in "Welcome to my webpage."
    'NUMBER',       # 12, 5.6, -1., 3.14159, -8.1, 867.5309
)

t_ignore = ' \t\v\r'  # shortcut for whitespace

states = (
    ('htmlcomment', 'exclusive'),  # <!--
)

def t_htmlcomment(t):
    r'<!--'
    t.lexer.begin('htmlcomment')

def t_htmlcomment_end(t):
    r'-->'
    t.lexer.lineno += t.value.count('\n')
    t.lexer.begin('INITIAL')

def t_htmlcomment_error(t):
    t.lexer.skip(1)

def t_LANGLESLASH(t):
    r'</'
    return t

def t_LANGLE(t):
    r'<'
    return t

def t_SLASHRANGLE(t):
    r'/>'
    return t

def t_RANGLE(t):
    r'>'
    return t

def t_EQUAL(t):
    r'='
    return t

def t_STRING(t):
    r'"[^"]*"'
    t.value = t.value[1:-1]  # drop "surrounding quotes"
    return t

def t_WORD(t):
    r'[^ <>]+'
    return t

webpage = "hello <!-- comment --> 123456 <b> Bushra </b> all"
htmllexer = lex.lex()
htmllexer.input(webpage)
while True:
    tok = htmllexer.token()
    if not tok: break
    print tok
This is my parser
work_count = 0  # track one notion of "time taken"

def addtoset(theset, index, elt):
    if not (elt in theset[index]):
        theset[index] = [elt] + theset[index]
        return True
    return False

def parse(tokens, grammar):
    global work_count
    work_count = 0
    tokens = tokens + ["end_of_input_marker"]
    chart = {}
    start_rule = grammar[0]
    for i in range(len(tokens) + 1):
        chart[i] = []
    start_state = (start_rule[0], [], start_rule[1], 0)
    chart[0] = [start_state]
    for i in range(len(tokens)):
        while True:
            changes = False
            for state in chart[i]:
                # State === x -> a b . c d , j
                x = state[0]
                ab = state[1]
                cd = state[2]
                j = state[3]
                next_states = [(rule[0], [], rule[1], i)
                               for rule in grammar if cd != [] and cd[0] == rule[0]]
                work_count = work_count + len(grammar)
                for next_state in next_states:
                    changes = addtoset(chart, i, next_state) or changes
                if cd != [] and tokens[i] == cd[0]:
                    next_state = (x, ab + [cd[0]], cd[1:], j)
                    changes = addtoset(chart, i + 1, next_state) or changes
                next_states = [(jstate[0], jstate[1] + [x], (jstate[2])[1:], jstate[3])
                               for jstate in chart[j]
                               if cd == [] and jstate[2] != [] and (jstate[2])[0] == x]
                work_count = work_count + len(chart[j])
                for next_state in next_states:
                    changes = addtoset(chart, i, next_state) or changes
            # We're done if nothing changed!
            if not changes:
                break
    accepting_state = (start_rule[0], start_rule[1], [], 0)
    return accepting_state in chart[len(tokens) - 1]

grammar = [
    ("html", ["element", "html"]),
    ("html", []),
    ("element", ["word"]),
    ("element", ["tag-open", "word", "tag-close"]),
    ("tag-open", ["<", "word", ">"]),
    ("tag-close", ["<", "/", "word", ">"])
]
tokens = ["<", "b", ">", "Hello", "<", "/", "b", ">"]
result = parse(tokens, grammar)
print result
You can do this by using the value attribute of LexToken:
webpage = "hello <!-- comment --> 123456 <b> Bushra </b> all"
htmllexer = lex.lex()
htmllexer.input(webpage)
tokens = []
while True:
    tok = htmllexer.token()
    if not tok: break
    tokens.append(tok.value)
print tokens  # ['hello', '123456', '<', 'b', '>', 'Bushra', '</', 'b', '>', 'all']
All available attributes may be obtained by using the dir() function:
print dir(tok)
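From there, combining the two programs is just a matter of handing the collected values to the parser; a minimal sketch, reusing the tokens list built above and the parse and grammar definitions from the question:
# the values gathered from the lexer become the parser's input list
print parse(tokens, grammar)
Note that for this to accept anything, the grammar's terminals must agree with the lexer's output: the grammar above expects the literal terminal "word" and separate "<" and "/" tokens, while the lexer emits the actual words and a combined "</", so one side or the other still needs adjusting.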

Reference to value of the function

To begin, I want to say I'm a newbie at Python; everything I've learned came from tutorials.
My problem concerns referencing a function's return value. I'm writing a script which scrapes some information from web sites. I defined a function:
def MatchPattern(count):
    sock = urllib.urlopen(Link+str(count))
    htmlSource = sock.read()
    sock.close()
    root = etree.HTML(htmlSource)
    root = etree.HTML(htmlSource)
    result = etree.tostring(root, pretty_print=True, method="html")
    expr1 = check_reg(root)
    expr2 = check_practice(root)
    D_expr1 = no_ks(root)
    D_expr2 = Registred_by(root)
    D_expr3 = Name_doctor(root)
    D_expr4 = Registration_no(root)
    D_expr5 = PWZL(root)
    D_expr6 = NIP(root)
    D_expr7 = Spec(root)
    D_expr8 = Start_date(root)
    #-----Reg_practice-----
    R_expr1 = Name_of_practise(root)
    R_expr2 = TERYT(root)
    R_expr3 = Street(root)
    R_expr4 = House_no(root)
    R_expr5 = Flat_no(root)
    R_expr6 = Post_code(root)
    R_expr7 = City(root)
    R_expr8 = Practice_no(root)
    R_expr9 = Kind_of_practice(root)
    #------Serv_practice -----
    S_expr1 = TERYT2(root)
    S_expr2 = Street2(root)
    S_expr3 = House_no2(root)
    S_expr4 = Flat_no2(root)
    S_expr5 = Post_code2(root)
    S_expr6 = City2(root)
    S_expr7 = Phone_no(root)
    return expr1
    return expr2
    return D_expr1
    return D_expr2
    return D_expr3
    return D_expr4
    return D_expr5
    return D_expr6
    return D_expr7
    return D_expr8
    #-----Reg_practice-----
    return R_expr1
    return R_expr2
    return R_expr3
    return R_expr4
    return R_expr5
    return R_expr6
    return R_expr7
    return R_expr8
    return R_expr9
    #------Serv_practice -----
    return S_expr1
    return S_expr2
    return S_expr3
    return S_expr4
    return S_expr5
    return S_expr6
    return S_expr7
Now, inside the script, I want to check the value of expr1 returned by my function. I don't know how to do that. Can you help me? Is my function written correctly?
EDIT:
I can't add an answer, so I'm editing my current post.
This is my whole script, with the comments in English:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# ----------------------------- import the required libraries and scripts -----------------------
# ------------------------------------------------------------------------------------------------
import urllib
from lxml import etree, html
import sys
import re
import MySQLdb as mdb
from TOR_connections import *
from XPathSelection import *
import os
# ------------------------------ define the XPath selectors ---------------------------------------
# --------------------------------------------------------------------------------------------------
# ------- Doctors -----
check_reg = etree.XPath("string(//html/body/div/table[1]/tr[3]/td[2]/text())")  # doctor condition
check_practice = etree.XPath("string(//html/body/div/table[3]/tr[4]/td[2]/text())")  # practice condition
no_ks = etree.XPath("string(//html/body/div/table[1]/tr[1]/td[2]/text())")
Registred_by = etree.XPath("string(//html/body/div/table[1]/tr[4]/td[2]/text())")
Name_doctor = etree.XPath("string(//html/body/div/table[2]/tr[2]/td[2]/text())")
Registration_no = etree.XPath("string(//html/body/div/table[2]/tr[3]/td[2]/text())")
PWZL = etree.XPath("string(//html/body/div/table[2]/tr[4]/td[2]/text())")
NIP = etree.XPath("string(//html/body/div/table[2]/tr[5]/td[2]/text())")
Spec = etree.XPath("string(//html/body/div/table[2]/tr[18]/td[2]/text())")
Start_date = etree.XPath("string(//html/body/div/table[2]/tr[20]/td[2]/text())")
#-----Reg_practice-----
Name_of_practise = etree.XPath("string(//html/body/div/table[2]/tr[1]/td[2]/text())")
TERYT = etree.XPath("string(//html/body/div/table[2]/tr[7]/td[2]/*/text())")
Street = etree.XPath("string(//html/body/div/table[2]/tr[8]/td[2]/text())")
House_no = etree.XPath("string(//html/body/div/table[2]/tr[9]/td[2]/*/text())")
Flat_no = etree.XPath("string(//html/body/div/table[2]/tr[10]/td[2]/*/text())")
Post_code = etree.XPath("string(//html/body/div/table[2]/tr[11]/td[2]/*/text())")
City = etree.XPath("string(//html/body/div/table[2]/tr[12]/td[2]/*/text())")
Practice_no = etree.XPath("string(//html/body/div/table[3]/tr[4]/td[2]/text())")
Kind_of_practice = etree.XPath("string(//html/body/div/table[3]/tr[5]/td[2]/text())")
#------Serv_practice -----
TERYT2 = etree.XPath("string(//html/body/div/table[3]/tr[14]/td/table/tr[2]/td[2]/*/text())")
Street2 = etree.XPath("string(//html/body/div/table[3]/tr[14]/td/table/tr[3]/td[2]/text())")
House_no2 = etree.XPath("string(//html/body/div/table[3]/tr[14]/td/table/tr[4]/td[2]/*/text())")
Flat_no2 = etree.XPath("string(//html/body/div/table[3]/tr[14]/td/table/tr[5]/td[2]/i/text())")
Post_code2 = etree.XPath("string(//html/body/div/table[3]/tr[14]/td/table/tr[6]/td[2]/*/text())")
City2 = etree.XPath("string(//html/body/div/table[3]/tr[14]/td/table/tr[7]/td[2]/*/text())")
Phone_no = etree.XPath("string(//html/body/div/table[3]/tr[14]/td/table/tr[8]/td[2]/text())")
# --------------------------- global variable declarations ------------------------------------
# ----------------------------------------------------------------------------------------------
decrease = 9
No = 1
Link = "http://rpwdl.csioz.gov.pl/rpz/druk/wyswietlKsiegaServletPub?idKsiega="
# --------------------------- function definitions ----------------------------------------------
# ----------------------------------------------------------------------------------------------
def MatchPattern(count):
    sock = urllib.urlopen(Link+str(count))
    htmlSource = sock.read()
    sock.close()
    root = etree.HTML(htmlSource)
    root = etree.HTML(htmlSource)
    result = etree.tostring(root, pretty_print=True, method="html")
    expr1 = check_reg(root)
    expr2 = check_practice(root)
    D_expr1 = no_ks(root)
    D_expr2 = Registred_by(root)
    D_expr3 = Name_doctor(root)
    D_expr4 = Registration_no(root)
    D_expr5 = PWZL(root)
    D_expr6 = NIP(root)
    D_expr7 = Spec(root)
    D_expr8 = Start_date(root)
    #-----Reg_practice-----
    R_expr1 = Name_of_practise(root)
    R_expr2 = TERYT(root)
    R_expr3 = Street(root)
    R_expr4 = House_no(root)
    R_expr5 = Flat_no(root)
    R_expr6 = Post_code(root)
    R_expr7 = City(root)
    R_expr8 = Practice_no(root)
    R_expr9 = Kind_of_practice(root)
    #------Serv_practice -----
    S_expr1 = TERYT2(root)
    S_expr2 = Street2(root)
    S_expr3 = House_no2(root)
    S_expr4 = Flat_no2(root)
    S_expr5 = Post_code2(root)
    S_expr6 = City2(root)
    S_expr7 = Phone_no(root)
    return expr1
    return expr2
    return D_expr1
    return D_expr2
    return D_expr3
    return D_expr4
    return D_expr5
    return D_expr6
    return D_expr7
    return D_expr8
    #-----Reg_practice-----
    return R_expr1
    return R_expr2
    return R_expr3
    return R_expr4
    return R_expr5
    return R_expr6
    return R_expr7
    return R_expr8
    return R_expr9
    #------Serv_practice -----
    return S_expr1
    return S_expr2
    return S_expr3
    return S_expr4
    return S_expr5
    return S_expr6
    return S_expr7
# --------------------------- establish the database connection ---------------------------------
# ----------------------------------------------------------------------------------------------
con = mdb.connect('localhost', 'root', '******', 'SANBROKER', charset='utf8');
# ---------------------------- start of the program ---------------------------------------------
# ----------------------------------------------------------------------------------------------
with con:
    cur = con.cursor()
    cur.execute("SELECT Old_num FROM SANBROKER.Number_of_records;")
    Old_num = cur.fetchone()
    count = Old_num[0]
counter = input("Input number of rows: ")
# ----------------------- first connection to TOR -----------------------------------------
# ----------------------------------------------------------------------------------------
#connectTor()
#conn = httplib.HTTPConnection("my-ip.heroku.com")
#conn.request("GET", "/")
#response = conn.getresponse()
#print(response.read())
while count <= counter:  # every tenth number
    # --------------- first write to the Archive table in the database ---------------
    with con:
        cur = con.cursor()
        cur.execute("UPDATE SANBROKER.Number_of_records SET Archive_num=%s", (count,))
    # ---------------------------------------------------------------------------------
    if decrease == 0:
        MatchPattern(count)
        # Now I want to check some expressions (2 or 3)
        # After that I want to write all the values into my database
        #------- final steps:
        percentage = count / 100
        print "records: " + str(count) + " of: " + str(counter) + " percent added: " + str(percentage) + "%"
        with con:
            cur = con.cursor()
            cur.execute("UPDATE SANBROKER.Number_of_records SET Old_num=%s", (count,))
        decrease = 10-1
        count += 1
    else:
        MatchPattern(count)
        # Now I want to check some expressions (2 or 3)
        # After that I want to write all the values into my database
        # ------ final steps:
        percentage = count / 100
        print "records: " + str(count) + " of: " + str(counter) + " percent added: " + str(percentage) + "%"
        with con:
            cur = con.cursor()
            cur.execute("UPDATE SANBROKER.Number_of_records SET Old_num=%s", (count,))
        decrease -= 1
        count += 1
Well, I'm assuming check_reg is a function that returns a boolean (either True or False).
If that's the case, to check the return:
if expr1:
    print "True."
else:
    print "False"
There's more than one way to do it, but basically, if expr1: is all you need to do the checking.
To capture the return value of a function, assign the result of the call to a name with an equals sign, like this:
return_value = somefunction(some_value)
print('The return value is ', return_value)
Keep in mind that when the first return statement is encountered, the function exits. So if you have several return statements one after another, only the first will execute.
If you want to return multiple things, add them to a list and then return the list.
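A minimal sketch of that early-exit behaviour (the function name demo is made up):
def demo():
    return 1
    return 2  # never reached: the function already returned

print demo()  # prints 1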
Here is an improved version of your function:
def match_pattern(count):
    sock = urllib.urlopen(Link + str(count))
    htmlSource = sock.read()
    sock.close()
    root = etree.HTML(htmlSource)
    # root = etree.HTML(htmlSource) - duplicate line
    # result = etree.tostring(root, pretty_print=True, method="html")
    function_names = [check_reg, check_practice, no_ks, Registred_by,
                      Name_doctor, Registration_no, PWZL, NIP, Spec, Start_date,
                      Name_of_practise, TERYT, Street, House_no2, Flat_no,
                      Post_code2, City2, Phone_no]
    results = []
    for function in function_names:
        results.append(function(root))
    return results

r = match_pattern(1)
print r[0]  # this will be the result of check_reg(root)
The code you posted is quite ambiguous. Could you please fix the indentation so we can tell what belongs to the function and which part is the script?
A function can only return once. You cannot do:
return something
return something_else
return ...
The function ends when the first value is returned.
What you can do is return a list, tuple, or dict containing all your values.
For instance:
return (something, something_else, ...)
or
return [something, something_else, ...]
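A minimal sketch of consuming such a tuple via unpacking (the names here are made up):
def pair():
    return ("first value", "second value")

a, b = pair()  # tuple unpacking binds both returned values at once
print a, b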
In your case, it seems better to create a class that has all the values you want as attributes, and turn this function into a method that sets those attribute values.
class Example(object):
    def __init__(self, link, count):
        sock = urllib.urlopen(link + str(count))
        htmlSource = sock.read()
        sock.close()
        root = etree.HTML(htmlSource)
        root = etree.HTML(htmlSource)
        result = etree.tostring(root, pretty_print=True, method="html")
        self.expr1 = check_reg(root)
        self.expr2 = check_practice(root)
        self.D_expr1 = no_ks(root)
        ...
        self.D_expr8 = Start_date(root)
        #-----Reg_practice-----
        self.R_expr1 = Name_of_practise(root)
        ...
        self.R_expr9 = Kind_of_practice(root)
        #------Serv_practice -----
        self.S_expr1 = TERYT2(root)
        ...
        self.S_expr7 = Phone_no(root)
Then you will be able to use this class like:
exampleInstance = Example("link you want to use", 4)  # the second argument is your 'count' value
# Now you can use attributes of your class to get the values you want
print exampleInstance.expr1
print exampleInstance.S_expr7
