I've written a quick little program to scrape book data off of a UNESCO website which contains information about book translations. The code is doing what I want it to, but by the time it's processed about 20 countries, it's using ~6GB of RAM. Since there are around 200 I need to process, this isn't going to work for me.
I'm not sure where all the RAM usage is coming from, so I'm not sure how to reduce it. I assume it's the dictionary holding all the book information, but I'm not positive. Should I simply make the program run once per country, rather than processing the lot of them, or is there a better way to do it?
This is the first time I've written anything like this, and I'm a pretty novice, self-taught programmer, so please point out any significant flaws in the code, or improvement tips you have that may not directly relate to the question at hand.
This is my code, thanks in advance for any assistance.
from __future__ import print_function
import urllib2, os
from bs4 import BeautifulSoup, SoupStrainer
''' Set list of countries and their code for niceness in explaining what
is actually going on as the program runs. '''
countries = {"AFG":"Afghanistan","ALA":"Aland Islands","DZA":"Algeria"}
'''List of country codes since dictionaries aren't sorted in any
way, this makes processing easier to deal with if it fails at
some point, mid run.'''
country_code_list = ["AFG","ALA","DZA"]
base_url = "http://www.unesco.org/xtrans/bsresult.aspx?lg=0&c="
destination_directory = "/Users/robbie/Test/"
only_restable = SoupStrainer(class_="restable")
class Book(object):

    def set_author(self, book):
        '''Parse the webpage to find author names. Finds last name, then
        first name of original author(s) and sets the Book object's
        author attribute to the resulting string.'''
        authors = ""
        author_last_names = book.find_all('span', class_="sn_auth_name")
        author_first_names = book.find_all('span', class_="sn_auth_first_name")
        if not author_last_names:
            self.author = " "
            return
        for author in author_last_names:
            try:
                first_name = author_first_names.pop()
                authors = authors + author.getText() + ', ' + first_name.getText()
            except IndexError:
                authors = authors + author.getText()
        self.author = authors

    def set_quality(self, book):
        '''Check to see if the book page is using Quality, then set it if so.'''
        quality = book.find_all('span', class_="sn_auth_quality")
        if not quality:
            self.quality = " "
        else:
            self.quality = quality[0].contents[0]

    def set_target_title(self, book):
        target_title = book.find_all('span', class_="sn_target_title")
        if not target_title:
            self.target_title = " "
        else:
            self.target_title = target_title[0].contents[0]

    def set_target_language(self, book):
        target_language = book.find_all('span', class_="sn_target_lang")
        if not target_language:
            self.target_language = " "
        else:
            self.target_language = target_language[0].contents[0]

    def set_translator_name(self, book):
        translators = ""
        translator_last_names = book.find_all('span', class_="sn_transl_name")
        translator_first_names = book.find_all('span', class_="sn_transl_first_name")
        if not translator_first_names and not translator_last_names:
            self.translators = " "
            return
        for translator in translator_last_names:
            try:
                first_name = translator_first_names.pop()
                translators = translators + translator.getText() + ',' + first_name.getText()
            except IndexError:
                translators = translators + translator.getText()
        self.translators = translators

    def set_published_city(self, book):
        published_city = book.find_all('span', class_="place")
        if not published_city:
            self.published_city = " "
        else:
            self.published_city = published_city[0].contents[0]

    def set_publisher(self, book):
        # NOTE: this searches the same "place" class as set_published_city,
        # so publisher currently duplicates the city; the publisher span
        # presumably has its own class on the site.
        publisher = book.find_all('span', class_="place")
        if not publisher:
            self.publisher = " "
        else:
            self.publisher = publisher[0].contents[0]

    def set_published_country(self, book):
        published_country = book.find_all('span', class_="sn_country")
        if not published_country:
            self.published_country = " "
        else:
            self.published_country = published_country[0].contents[0]

    def set_year(self, book):
        year = book.find_all('span', class_="sn_year")
        if not year:
            self.year = " "
        else:
            self.year = year[0].contents[0]

    def set_pages(self, book):
        pages = book.find_all('span', class_="sn_pagination")
        if not pages:
            self.pages = " "
        else:
            self.pages = pages[0].contents[0]

    def set_edition(self, book):
        edition = book.find_all('span', class_="sn_editionstat")
        if not edition:
            self.edition = " "
        else:
            self.edition = edition[0].contents[0]

    def set_original_title(self, book):
        original_title = book.find_all('span', class_="sn_orig_title")
        if not original_title:
            self.original_title = " "
        else:
            self.original_title = original_title[0].contents[0]

    def set_original_language(self, book):
        languages = ''
        original_languages = book.find_all('span', class_="sn_orig_lang")
        for language in original_languages:
            languages = languages + language.getText() + ', '
        self.original_languages = languages

    def export(self, country):
        '''Pull the text from the Book object's attributes and append it to
        the CSV file for the country in which the book was published.'''
        file_name = os.path.join(destination_directory, country + ".csv")
        with open(file_name, "a") as by_country_csv:
            print(self.author.encode('UTF-8') + " & " +
                  self.quality.encode('UTF-8') + " & " +
                  self.target_title.encode('UTF-8') + " & " +
                  self.target_language.encode('UTF-8') + " & " +
                  self.translators.encode('UTF-8') + " & " +
                  self.published_city.encode('UTF-8') + " & " +
                  self.publisher.encode('UTF-8') + " & " +
                  self.published_country.encode('UTF-8') + " & " +
                  self.year.encode('UTF-8') + " & " +
                  self.pages.encode('UTF-8') + " & " +
                  self.edition.encode('UTF-8') + " & " +
                  self.original_title.encode('UTF-8') + " & " +
                  self.original_languages.encode('UTF-8'), file=by_country_csv)
        # no explicit close() needed: the with block closes the file

    def __init__(self, book, country):
        '''Initialize the Book object by feeding it the HTML for its row.'''
        self.set_author(book)
        self.set_quality(book)
        self.set_target_title(book)
        self.set_target_language(book)
        self.set_translator_name(book)
        self.set_published_city(book)
        self.set_publisher(book)
        self.set_published_country(book)
        self.set_year(book)
        self.set_pages(book)
        self.set_edition(book)
        self.set_original_title(book)
        self.set_original_language(book)
def get_all_pages(country, base_url):
    '''Find how many results there are for a country by adding its
    ISO 3166-1 alpha-3 code to the URL and reading the result count;
    the site pages through results ten at a time. Returns an int.'''
    base_page = urllib2.urlopen(base_url + country)
    page = BeautifulSoup(base_page, parse_only=only_restable)
    result_number = page.find_all('td', class_="res1", limit=1)
    if not result_number:
        return 0
    str_result_number = str(result_number[0].getText())
    results_total = int(str_result_number.split('/')[1])
    page.decompose()
    return results_total
def build_list(country_code_list, countries):
    '''Build the list of all the books, and return a list of Book objects
    in case you want to do something with them in something else, ever.'''
    for country in country_code_list:
        print("Processing %s now..." % countries[country])
        results_total = get_all_pages(country, base_url)
        for url in range(results_total):
            if url % 10 == 0:
                all_books = []
                target_page = urllib2.urlopen(base_url + country + "&fr=" + str(url))
                page = BeautifulSoup(target_page, parse_only=only_restable)
                books = page.find_all('td', class_="res2")
                for book in books:
                    all_books.append(Book(book, country))
                page.decompose()
                for title in all_books:
                    title.export(country)
    return

if __name__ == "__main__":
    build_list(country_code_list, countries)
    print("Completed.")
I guess I'll just list off some of the problems or possible improvements in no particular order:
Follow PEP 8.
Right now, you've got lots of variables and functions named using camel-case like setAuthor. That's not the conventional style for Python; Python would typically name that set_author (and published_country rather than PublishedCountry, etc.). You can even change the names of some of the things you're calling: for one, BeautifulSoup supports findAll for compatibility, but find_all is recommended.
Besides naming, PEP 8 also specifies a few other things; for example, you'd want to rewrite this:
if len(resultNumber) == 0 : return 0
as this:
if len(result_number) == 0:
    return 0
or even taking into account the fact that empty lists are falsy:
if not result_number:
    return 0
Pass a SoupStrainer to BeautifulSoup.
The information you're looking for is probably in only part of the document; you don't need to parse the whole thing into a tree. Pass a SoupStrainer as the parse_only argument to BeautifulSoup. This should reduce memory usage by discarding unnecessary parts early.
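For example (the revised code above already does exactly this):

only_restable = SoupStrainer(class_="restable")
page = BeautifulSoup(target_page, parse_only=only_restable)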
decompose the soup when you're done with it.
Python primarily uses reference counting, so removing all circular references (as decompose does) lets that primary garbage-collection mechanism free up a lot of memory immediately. Python also has a semi-traditional garbage collector to deal with circular references, but reference counting is much faster.
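For example:

page = BeautifulSoup(target_page, parse_only=only_restable)
# ... pull out what you need ...
page.decompose()  # break the parse tree's internal references so it can be freed at once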
Don't make Book.__init__ write things to disk.
In most cases, I wouldn't expect just creating an instance of a class to write something to disk. Remove the call to export; let the user call export if they want it to be put on the disk.
Stop holding on to so much data in memory.
You're accumulating all this data into a dictionary just to export it afterwards. The obvious thing to do to reduce memory is to dump it to disk as soon as possible. Your comment indicates that you're putting it in a dictionary to be flexible; but that doesn't mean you have to collect it all in a list: use a generator, yielding items as you scrape them. Then the user can iterate over it just like a list:
for book in scrape_books():
    book.export()
…but with the advantage that at most one book will be kept in memory at a time.
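For instance, a sketch of what such a scrape_books generator might look like, reusing the names from the question's code (get_all_pages, base_url, only_restable):

def scrape_books(country):
    '''Yield Book objects one results page at a time, so nothing accumulates.'''
    results_total = get_all_pages(country, base_url)
    for offset in range(0, results_total, 10):
        target_page = urllib2.urlopen(base_url + country + "&fr=" + str(offset))
        page = BeautifulSoup(target_page, parse_only=only_restable)
        for row in page.find_all('td', class_="res2"):
            yield Book(row, country)
        page.decompose()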
Use the functions in os.path rather than munging paths yourself.
Your code right now is rather fragile when it comes to path names. If the trailing slash were accidentally removed from destinationDirectory, the country code would just be glued onto "Test" and every file would land in /Users/robbie under the wrong name. Using os.path.join prevents that from happening and deals with cross-platform differences:
>>> os.path.join("/Users/robbie/Test/", "USA")
'/Users/robbie/Test/USA'
>>> os.path.join("/Users/robbie/Test", "USA") # still works!
'/Users/robbie/Test/USA'
>>> # or say we were on Windows:
>>> os.path.join(r"C:\Documents and Settings\robbie\Test", "USA")
'C:\\Documents and Settings\\robbie\\Test\\USA'
Abbreviate attrs={"class":...} to class_=....
BeautifulSoup 4.1.2 introduces searching with class_, which removes the need for the verbose attrs={"class":...}.
I imagine there are even more things you can change, but that's quite a few to start with.
What do you want the book list for, in the end? You should export each book at the end of the "for url in range" block (inside it), and do without the all_books list entirely. If you really need a list, decide exactly which pieces of information you will need and keep only those, rather than full Book objects.
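For example, inside the paging loop of build_list, each Book could be created and exported immediately, keeping nothing around:

for book in page.find_all('td', class_="res2"):
    Book(book, country).export(country)
page.decompose()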
Related
I'm trying to figure out how to make the objects I create inside a def appear in the same namespace as my if __name__ == '__main__' area, so I can use them in other definitions. I don't think I can use a simple return statement in my def, because my function is designed to create n-number of objects of a particular class.
def book_to_object(subfolders):
    # converts my dictionary of book titles and volumes into individual book objects
    for key, val in subfolders.items():
        title = replace_spaces_with_underscores(key)
        # check to see if book already exists in the registry
        if title in [book.name for book in Book._registry]:
            print("Book already exists")
        else:
            exec(title + " = Book(subfolders, key, Fpath)")
I'm calling this def from my if __name__ == '__main__': block:
print("Done")
#converts my dictionary of book titles and volumes into individual book objects
book_to_object(subfolders)
Originally I had the code from my def in the if __name__ == '__main__' area of my script, but I wanted to reduce the clutter and make it into a definition to give me more flexibility later.
The whole program below.
import os

# get date modified time of a file to determine which file is the newest
def get_date_modified(file):
    return os.path.getmtime(file)

# replace spaces with underscores in a string
def replace_spaces_with_underscores(string):
    aa = string.replace(" ", "_")
    bb = aa.replace('-', '_')
    cc = bb.replace("__", "_")
    title = cc.replace("__", "_")
    return title

# create a list of volumes that have not been exported
def get_unexported_volumes(self):
    unexported_volumes = []
    for volume in self.volumes:
        if volume not in self.last_exported:
            unexported_volumes.append(volume)
    return unexported_volumes

def book_to_object(subfolders):
    # converts my dictionary of book titles and volumes into individual book objects
    for key, val in subfolders.items():
        title = replace_spaces_with_underscores(key)
        # check to see if book already exists in the registry
        if title in [book.name for book in Book._registry]:
            print("Book already exists")
        else:
            exec(title + " = Book(subfolders, key, Fpath)")

class Book(object):
    #__metaclass__ = IterBook
    _registry = []

    # constructor
    def __init__(self, my_dict, key, Fpath):
        self._registry.append(self)
        self.name = key
        self.volumes = my_dict[key]
        self.Filepath = Fpath + "/" + self.name
        #self.last_exported = self.volumes[0]
        self.last_exported = ""
        self.newest = self.volumes[-1]
        self.last_downloaded = self.volumes[-1]
        self.last_converted = self.volumes[-1]
        self.last_exported_date = get_date_modified(self.Filepath + "/" + self.last_exported)
        self.newest_date = get_date_modified(self.Filepath + "/" + self.newest)
        self.last_downloaded_date = get_date_modified(self.Filepath + "/" + self.last_downloaded)
        self.last_converted_date = get_date_modified(self.Filepath + "/" + self.last_converted)
        self.last_exported_volume = self.last_exported
        self.newest_volume = self.newest
        self.last_downloaded_volume = self.last_downloaded
        self.last_converted_volume = self.last_converted
        self.last_exported_volume_date = self.last_exported_date
        self.newest_volume_date = self.newest_date
        self.last_downloaded_volume_date = self.last_downloaded_date
        self.last_converted_volume_date = self.last_converted_date
        self.last_exported_volume_name = self.last_exported
        self.newest_volume_name = self.newest
        self.unexported_volumes = get_unexported_volumes(self)

if __name__ == '__main__':
    print("Starting")
    # File Paths for Debugging purposes
    Fpath = '~/'
    F_Temp_path = '~/temp'
    # parses directory for book based on folder names
    subfolders = {'Encyclopedia': ['Vol.001', 'Vol.002', 'Vol.003', 'Vol.004', 'Vol.005'],
                  'Enginnering_Encyclopedia': ['Avionics', 'Civil', 'Electrical', 'Materials', 'Mechanical']}
    print("Done")
    # converts my dictionary of book titles and volumes into individual book objects
    book_to_object(subfolders)
Notes:
In case anyone is wondering why I'm using objects instead of keeping everything in dictionaries, I need the flexibility that objects give.
Variables such as file_paths and subfolders are dynamic and are based on what directory the user gives and what files are in that directory; for the purpose of asking the question they have been hard-coded so someone can copy and paste my code and reproduce the problem.
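One way to avoid exec entirely, sketched against the names in the program above (Book, replace_spaces_with_underscores, subfolders, Fpath): keep the created objects in a dictionary keyed by title and return it, which also gives the if __name__ == '__main__' block a handle it can pass to other definitions.

def book_to_object(subfolders):
    # converts the dictionary of book titles and volumes into Book objects,
    # stored under their cleaned-up titles instead of exec-created names
    books = {}
    for key in subfolders:
        title = replace_spaces_with_underscores(key)
        if title in books:
            print("Book already exists")
        else:
            books[title] = Book(subfolders, key, Fpath)
    return books

Calling books = book_to_object(subfolders) then makes books['Encyclopedia'].volumes (and so on) usable wherever books is passed.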
I want to preface this by saying I replicated the spirit of the code in the python terminal by writing:
A = {}
A["Q"] = {}
A["Q"]["creation"] = {}
A["Q"]["creation"]["reactants_1"] = ["R1","R2"]
printing A gives what one would expect:
{"Q" : {"creation" : {"reactants_1" : ["R1","R2"]}}}
then adding:
A["Q"]["creation"]["reactants_2"] = ["R3","R4"]
yields:
{"Q" : {"creation" : {"reactants_1" : ["R1","R2"], "reactants_2" : ["R3","R4"]}}}
However, I have a function (displayed below) which, when run, correctly assigns the first few key:value pairs, but when it loops over to the second value of x it replaces the first key:value pair it wrote earlier with a new one, despite the key being different.
using the above example we would only get:
{"Q" : {"creation" : {"reactants_2" : ["R3","R4"]}}}
reactions is an array containing things like "e+e+p=e+H_1", in the format:
["e+e+p=e+H_1", "e+e+p=e+H_2", ..., "e+H_10=e+e+p"]
species is an array like:
[["e", 100000], ["p", 100000], ...]
where, for now, we only care about the string part.
diff_input is an empty dictionary to begin with.
reactions_const contains, for each reaction, the left and right sides separately (seen as [x][0][0] and [x][0][1] early in the function), as well as some other information that is not important for now.
rates_names is an array of unique identifiers for each reaction that I can use later, hence the dictionary approach.
The print statements are all me trying to figure out why it isn't working.
def rate_eqns(diff_input, reactions, species, reactions_const, rates_names):
    for x in range(len(reactions)):
        # for each reaction, split the left and right hand sides up into
        # lists of their respective species involved, for counting later
        print("reaction: " + str(x) + " " + str(reactions[x]))
        species_lhs = reactions_const[x][0][0].split('+')
        print("LHS = " + str(species_lhs))
        species_rhs = reactions_const[x][0][1].split('+')
        for y in range(len(species)):
            # For each species, create a sub-dictionary
            diff_input[species[y][0]] = {}
            # create sub-dictionaries in each species for creation,
            # destruction and preservation/neutral paths
            diff_input[species[y][0]]["destruction"] = {}
            diff_input[species[y][0]]["creation"] = {}
            diff_input[species[y][0]]["balanced"] = {}
            # check if species occurs in each reaction
            if species[y][0] in reactions[x][0]:
                # if you start with more of it than you finish, it's destruction
                if species_lhs.count(species[y][0]) > species_rhs.count(species[y][0]):
                    # if true: add an entry to the dictionary which holds the
                    # reaction identifier, bound to the destruction/creation/balanced
                    # identifier, bound to the species identifier.
                    print("species:" + str(species[y][0]) + " found net loss from reactants")
                    print("LHS = " + str(species_lhs))
                    print("RHS = " + str(species_rhs))
                    print("reaction designation = " + str(rates_names[x]) + " Destruction of species")
                    print("-------------")
                    diff_input[species[y][0]]["destruction"][rates_names[x]] = species_lhs
                elif species_lhs.count(species[y][0]) == species_rhs.count(species[y][0]):
                    print("species:" + str(species[y][0]) + " found no change in number")
                    print("LHS = " + str(species_lhs))
                    print("RHS = " + str(species_rhs))
                    print("reaction designation = " + str(rates_names[x]) + " preservation of species")
                    diff_input[species[y][0]]["balanced"][rates_names[x]] = species_lhs
                elif species_lhs.count(species[y][0]) < species_rhs.count(species[y][0]):
                    print("species:" + str(species[y][0]) + " found net gain from reactants")
                    print("LHS = " + str(species_lhs))
                    print("RHS = " + str(species_rhs))
                    print("reaction designation = " + str(rates_names[x]) + " creation of species")
                    diff_input[species[y][0]]["creation"][rates_names[x]] = species_lhs
                # else:
                #     print(str(species[y][0]) + " not found in reaction")
    print(diff_input)
    a = input("press return to continue")
    with open('diff_input.txt', 'w') as file:
        file.write(str(diff_input))
    return diff_input
The file-saving part is optional. Has anyone else ever encountered a dictionary overwriting existing keys with new keys?
Thanks for your patience and I appreciate any advice on my formatting (I tried to make it as good as possible without including the rest of the script)
Nothing about your keys has to collide for this to happen: species does not change between iterations of x, so when the outer loop moves on to the second value of x, the assignment diff_input[species[y][0]] = {} runs again for every species and, since each species[y][0] key stays the same, it overwrites the sub-dicts filled in during the previous iteration.
To put it more simply using your example code, in your outer loop you are making the following initialization:
A["Q"] = {}
A["Q"]["creation"] = {}
so even if your inner loop assigns some values to the sub-dict:
A["Q"]["creation"]["reactants_1"] = ["R1","R2"]
A["Q"] = {} would overwrite it in the next iteration as long as "Q" continues to be the main key for the assignment.
I am using Zapier to catch a webhook and use that info for an API post. The action code runs perfectly fine with "4111111111111111" in place of Ccnum in doSale, but when I use the input_data variable and place it in doSale, it errors.
(Screenshots of the Zapier input variable and the Zapier error omitted.)
Python code:
import pycurl
import urllib
import urlparse
import StringIO

class gwapi():
    def __init__(self):
        self.login = dict()
        self.order = dict()
        self.billing = dict()
        self.shipping = dict()
        self.responses = dict()

    def setLogin(self, username, password):
        self.login['password'] = password
        self.login['username'] = username

    def setOrder(self, orderid, orderdescription, tax, shipping, ponumber, ipadress):
        self.order['orderid'] = orderid
        self.order['orderdescription'] = orderdescription
        self.order['shipping'] = '{0:.2f}'.format(float(shipping))
        self.order['ipaddress'] = ipadress
        self.order['tax'] = '{0:.2f}'.format(float(tax))
        self.order['ponumber'] = ponumber

    def setBilling(self, firstname, lastname, company, address1, address2,
                   city, state, zip, country, phone, fax, email, website):
        self.billing['firstname'] = firstname
        self.billing['lastname'] = lastname
        self.billing['company'] = company
        self.billing['address1'] = address1
        self.billing['address2'] = address2
        self.billing['city'] = city
        self.billing['state'] = state
        self.billing['zip'] = zip
        self.billing['country'] = country
        self.billing['phone'] = phone
        self.billing['fax'] = fax
        self.billing['email'] = email
        self.billing['website'] = website

    def setShipping(self, firstname, lastname, company, address1, address2,
                    city, state, zipcode, country, email):
        self.shipping['firstname'] = firstname
        self.shipping['lastname'] = lastname
        self.shipping['company'] = company
        self.shipping['address1'] = address1
        self.shipping['address2'] = address2
        self.shipping['city'] = city
        self.shipping['state'] = state
        self.shipping['zip'] = zipcode
        self.shipping['country'] = country
        self.shipping['email'] = email

    def doSale(self, amount, ccnumber, ccexp, cvv=''):
        query = ""
        # Login Information
        query += "username=" + urllib.quote(self.login['username']) + "&"
        query += "password=" + urllib.quote(self.login['password']) + "&"
        # Sales Information
        query += "ccnumber=" + urllib.quote(ccnumber) + "&"
        query += "ccexp=" + urllib.quote(ccexp) + "&"
        query += "amount=" + urllib.quote('{0:.2f}'.format(float(amount))) + "&"
        if cvv != '':
            query += "cvv=" + urllib.quote(cvv) + "&"
        # Order Information
        for key, value in self.order.iteritems():
            query += key + "=" + urllib.quote(str(value)) + "&"
        # Billing Information
        for key, value in self.billing.iteritems():
            query += key + "=" + urllib.quote(str(value)) + "&"
        # Shipping Information
        for key, value in self.shipping.iteritems():
            query += key + "=" + urllib.quote(str(value)) + "&"
        query += "type=sale"
        return self.doPost(query)

    def doPost(self, query):
        responseIO = StringIO.StringIO()
        curlObj = pycurl.Curl()
        curlObj.setopt(pycurl.POST, 1)
        curlObj.setopt(pycurl.CONNECTTIMEOUT, 30)
        curlObj.setopt(pycurl.TIMEOUT, 30)
        curlObj.setopt(pycurl.HEADER, 0)
        curlObj.setopt(pycurl.SSL_VERIFYPEER, 0)
        curlObj.setopt(pycurl.WRITEFUNCTION, responseIO.write)
        curlObj.setopt(pycurl.URL, "https://secure.merchantonegateway.com/api/transact.php")
        curlObj.setopt(pycurl.POSTFIELDS, query)
        curlObj.perform()
        data = responseIO.getvalue()
        temp = urlparse.parse_qs(data)
        for key, value in temp.iteritems():
            self.responses[key] = value[0]
        return self.responses['response']

# NOTE: your username and password should replace the ones below
Ccnum = input_data['Ccnum']  # this variable I would like to use in the gw.doSale below

gw = gwapi()
gw.setLogin("demo", "password")
gw.setBilling("John", "Smith", "Acme, Inc.", "123 Main St", "Suite 200",
              "Beverly Hills", "CA", "90210", "US", "555-555-5555",
              "555-555-5556", "support#example.com", "www.example.com")
r = gw.doSale("5.00", Ccnum, "1212", '999')
print gw.responses['response']
if int(gw.responses['response']) == 1:
    print "Approved"
elif int(gw.responses['response']) == 2:
    print "Declined"
elif int(gw.responses['response']) == 3:
    print "Error"
Towards the end is where the problems are. How can I pass the variables from Zapier into the python code?
David here, from the Zapier Platform team. A few things.
First, I think your issue is the one described here. Namely, I believe input_data's values are unicode. So you'll want to call str(input_data['Ccnum']) instead.
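That is:

Ccnum = str(input_data['Ccnum'])  # input_data values arrive as unicode; convert before urllib.quote/pycurl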
Alternatively, if you want to use Requests, it's also supported and is a lot less finicky.
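To sketch what that could look like here (same gateway fields that doSale builds by hand above; requests does its own URL-encoding):

import requests

fields = {'username': 'demo', 'password': 'password', 'type': 'sale',
          'ccnumber': str(input_data['Ccnum']), 'ccexp': '1212',
          'cvv': '999', 'amount': '5.00'}
resp = requests.post('https://secure.merchantonegateway.com/api/transact.php',
                     data=fields, timeout=30)
print resp.text  # the gateway answers with a query string, e.g. response=1&...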
All that said, I would be remiss if I didn't mention that everything in Zapier code steps gets logged in plain text internally. For that reason, I'd strongly recommend against putting credit card numbers, your password for this service, and any other sensitive data through a Code step. A private server that you control is a much safer option.
Let me know if you've got any other questions!
I wrote some code that grabs the numbers I need from this website, but I don't know what to do next.
It grabs the numbers from the table at the bottom: the ones under calving ease, birth weight, weaning weight, yearling weight, milk and total maternal.
#!/usr/bin/python
import urllib2
from bs4 import BeautifulSoup
import pyperclip

def getPageData(url):
    if not ('abri.une.edu.au' in url):
        return -1
    webpage = urllib2.urlopen(url).read()
    soup = BeautifulSoup(webpage, "html.parser")
    # This finds the epd tree and saves it as a searchable list
    pedTreeTable = soup.find('table', {'class': 'TablesEBVBox'})
    # This puts all of the epds into a list.
    # it looks for anything in pedTreeTable with a td tag.
    pageData = pedTreeTable.findAll('td')
    pageData.pop(7)
    return pageData

def createPedigree(animalPageData):
    '''make animalPageData much more useful. Strip the text out and put it in a dict.'''
    animals = []
    for animal in animalPageData:
        animals.append(animal.text)
    prettyPedigree = {
        'calving_ease': animals[18],
        'birth_weight': animals[19],
        'wean_weight': animals[20],
        'year_weight': animals[21],
        'milk': animals[22],
        'total_mat': animals[23]
    }
    for animalKey in prettyPedigree:
        if animalKey != 'year_weight' and animalKey != 'dam':
            prettyPedigree[animalKey] = stripRegNumber(prettyPedigree[animalKey])
    return prettyPedigree

def stripRegNumber(animal):
    '''returns the animal with its registration number stripped'''
    lAnimal = animal.split()
    strippedAnimal = ""
    for word in lAnimal:
        if not word.isdigit():
            strippedAnimal += word + " "
    return strippedAnimal

def prettify(pedigree):
    '''Takes the pedigree and prints it out in a usable format'''
    s = ''
    # this is ugly, but it was the only way I found to format with a variable width
    cFormat = '{{:^{}}}'  # unused centre-align variant
    rFormat = '{{:>{}}}'
    # one row per trait
    s += rFormat.format(len(pedigree['calving_ease'])).format(pedigree['calving_ease']) + '\n'
    s += rFormat.format(len(pedigree['birth_weight'])).format(pedigree['birth_weight']) + '\n'
    s += rFormat.format(len(pedigree['wean_weight'])).format(pedigree['wean_weight']) + '\n'
    s += rFormat.format(len(pedigree['year_weight'])).format(pedigree['year_weight']) + '\n'
    s += rFormat.format(len(pedigree['milk'])).format(pedigree['milk']) + '\n'
    s += rFormat.format(len(pedigree['total_mat'])).format(pedigree['total_mat']) + '\n'
    return s

if __name__ == '__main__':
    while True:
        url = raw_input('Input a url you want to use to make life easier: \n')
        pageData = getPageData(url)
        s = prettify(createPedigree(pageData))
        pyperclip.copy(s)
        if len(s) > 0:
            print 'the easy string has been copied to your clipboard'
I've just been using this code for easy copying and pasting. All I have to do is insert the URL, and it saves the numbers to my clipboard.
Now I want to use this code on my website: I want to be able to insert a URL in my HTML code and have it display these numbers on my page in a table.
My questions are as follows:
How do I use the python code on the website?
How do I insert collected data into a table with HTML?
It sounds like you would want to use something like Django. Although the learning curve is a bit steep, it is worth it and it (of course) supports python.
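To give a flavour: a minimal hypothetical Django view wired to the scraper above (the module name pedigree and template name pedigree.html are assumptions):

# views.py
from django.shortcuts import render

from pedigree import getPageData, createPedigree  # the question's functions

def pedigree_table(request):
    url = request.GET.get('url', '')
    data = createPedigree(getPageData(url))
    # pedigree.html can then loop over pedigree.items() to emit <tr><td> rows
    return render(request, 'pedigree.html', {'pedigree': data})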
I am trying to develop an Infobox parser in Python which supports all the languages of Wikipedia. The parser will get the infobox data and will return the data in a Dictionary.
The keys of the dictionary will be the properties being described (e.g. population, city name, etc.).
The problem is that Wikipedia has slightly different page contents for each language. But the most important thing is that the API response structure for each language can also be different.
For example, the API response for 'Paris' in English contains this Infobox:
{{Infobox French commune |name = Paris |commune status = [[Communes of France|Commune]] and [[Departments of France|department]] |image = <imagemap> File:Paris montage.jpg|275px|alt=Paris montage
and in Greek, the corresponding part for 'Παρίσι' is:
[...] {{Πόλη (Γαλλία) | Πόλη = Παρίσι | Έμβλημα =Blason paris 75.svg | Σημαία =Mairie De Paris (SVG).svg | Πλάτος Σημαίας =120px | Εικόνα =Paris - Eiffelturm und Marsfeld2.jpg [...]
In the second example, there isn't any 'Infobox' occurrence after the {{. Also, in the API response the name = Paris is not the exact translation for Πόλη = Παρίσι. (Πόλη means city, not name)
Because of such differences between the responses, my code fails.
Here is the code:
import urllib

import guess_language
from lxml import etree

class WikipediaInfobox():
    # Class to get and parse the Wikipedia Infobox Data
    infoboxArrayUnprocessed = []  # Maintains the order in which the data is displayed.
    infoboxDictUnprocessed = {}   # Still contains brackets and wikitext coding. Will be processed more later...
    language = "en"

    def getInfoboxDict(self, infoboxRaw):  # Get the Infobox in Dict and Array form (Unprocessed)
        if infoboxRaw.strip() == "":
            return {}
        boxLines = [line.strip().replace("&nbsp;", " ") for line in infoboxRaw.splitlines()]
        wikiObjectType = boxLines[0]
        infoboxData = [line[1:] for line in boxLines[1:]]
        toReturn = {"wiki_type": wikiObjectType}
        for i in infoboxData:
            key = i.split("=")[0].strip()
            value = ""
            if i.strip() != key + "=":
                value = i.split("=")[1].strip()
            self.infoboxArrayUnprocessed.append({key: value})
            toReturn[key] = value
        self.infoboxDictUnprocessed = toReturn
        return toReturn

    def getInfoboxRaw(self, pageTitle, followRedirect=False, resetOld=True):  # Get Infobox in Raw Text
        if resetOld:
            # NOTE: these bind local names, not attributes, so nothing on the
            # instance is actually reset here
            infoboxDict = {}
            infoboxDictUnprocessed = {}
            infoboxArray = []
            infoboxArrayUnprocessed = []
        params = {"format": "xml", "action": "query", "prop": "revisions",
                  "rvprop": "timestamp|user|comment|content"}
        params["titles"] = "%s" % urllib.quote(pageTitle.encode("utf8"))
        qs = "&".join("%s=%s" % (k, v) for k, v in params.items())
        url = "http://" + self.language + ".wikipedia.org/w/api.php?%s" % qs
        tree = etree.parse(urllib.urlopen(url))
        revs = tree.xpath('//rev')
        if len(revs) == 0:
            return ""
        if "#REDIRECT" in revs[-1].text and followRedirect == True:
            redirectPage = revs[-1].text[revs[-1].text.find("[[") + 2:revs[-1].text.find("]]")]
            return self.getInfoboxRaw(redirectPage, followRedirect, resetOld)
        elif "#REDIRECT" in revs[-1].text and followRedirect == False:
            return ""
        infoboxRaw = ""
        if "{{Infobox" in revs[-1].text:  # -> No Multi-language support:
            infoboxRaw = revs[-1].text.split("{{Infobox")[1].split("}}")[0]
        return infoboxRaw

    def __init__(self, pageTitle="", followRedirect=False):  # Constructor
        if pageTitle != "":
            self.language = guess_language.guessLanguage(pageTitle)
            if self.language == "UNKNOWN":
                self.language = "en"
            infoboxRaw = self.getInfoboxRaw(pageTitle, followRedirect)
            self.getInfoboxDict(infoboxRaw)  # Now the parsed data is in self.infoboxDictUnprocessed
Some parts of this code were found on this blog...
I don't want to reinvent the wheel, so maybe someone has a nice solution for multi-language support and neat parsing of the Infobox section of Wikipedia.
I have seen many alternatives, like DBpedia or some of the other parsers that MediaWiki recommends, but I haven't found anything that suits my needs yet. I also want to avoid scraping the page with BeautifulSoup, because it can fail in some cases, but if it is necessary it will do.
If something isn't clear enough, please ask. I want to help as much as I can.
Wikidata is definitely the first choice these days if you want structured data. If you still need to parse data from Wikipedia articles in the future, especially since you are using Python, I can recommend mwparserfromhell, a Python library aimed at parsing wikitext that has an option to extract templates and their attributes. That won't directly fix your issue, since the templates differ between languages, but it may be useful if you continue trying to parse wikitext.
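For instance, a minimal sketch with mwparserfromhell, assuming you already have the page's raw wikitext; the template-name test remains language-specific ('Πόλη' is just the Greek example from the question):

import mwparserfromhell

def infobox_params(wikitext):
    '''Return the first matching template's parameters as a dict.'''
    code = mwparserfromhell.parse(wikitext)
    for template in code.filter_templates():
        name = unicode(template.name).strip()
        # language-specific: match whichever infobox template names you support
        if name.lower().startswith("infobox") or name.startswith(u"Πόλη"):
            return {unicode(p.name).strip(): unicode(p.value).strip()
                    for p in template.params}
    return {}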