How to scrape an entire integer in Python with Beautiful Soup?

I'm working on getting some wave heights from websites, and my code fails when the wave heights get into the double-digit range.
For example, the code currently scrapes a 12 from the site as '1' and '2' separately, not '12'.
#Author: David Owens
#File name: soupScraper.py
#Description: html scraper that takes surf reports from various websites
import csv
import requests
from bs4 import BeautifulSoup
NUM_SITES = 2
reportsFinal = []
###################### SURFLINE URL STRINGS AND TAG ###########################
slRootUrl = 'http://www.surfline.com/surf-report/'
slSunsetCliffs = 'sunset-cliffs-southern-california_4254/'
slScrippsUrl = 'scripps-southern-california_4246/'
slBlacksUrl = 'blacks-southern-california_4245/'
slCardiffUrl = 'cardiff-southern-california_4786/'
slTagText = 'observed-wave-range'
slTag = 'id'
#list of surfline URL endings
slUrls = [slSunsetCliffs, slScrippsUrl, slBlacksUrl]
###############################################################################
#################### MAGICSEAWEED URL STRINGS AND TAG #########################
msRootUrl = 'http://magicseaweed.com/'
msSunsetCliffs = 'Sunset-Cliffs-Surf-Report/4211/'
msScrippsUrl = 'Scripps-Pier-La-Jolla-Surf-Report/296/'
msBlacksUrl = 'Torrey-Pines-Blacks-Beach-Surf-Report/295/'
msTagText = 'rating-text'
msTag = 'li'
#list of magicseaweed URL endings
msUrls = [msSunsetCliffs, msScrippsUrl, msBlacksUrl]
###############################################################################
'''
This class represents a surf break. It contains all wave, wind, & tide data
associated with that break relevant to the website
'''
class surfBreak:
    def __init__(self, name, low, high, wind, tide):
        self.name = name
        self.low = low
        self.high = high
        self.wind = wind
        self.tide = tide

    #toString method
    def __str__(self):
        return '{0}: Wave height: {1}-{2} Wind: {3} Tide: {4}'.format(self.name,
            self.low, self.high, self.wind, self.tide)
#END CLASS
'''
This returns the proper attribute from the surf report sites
'''
def reportTagFilter(tag):
    return (tag.has_attr('class') and 'rating-text' in tag['class']) \
        or (tag.has_attr('id') and tag['id'] == 'observed-wave-range')
#END METHOD
'''
This method checks if the parameter is of type int
'''
def representsInt(s):
    try:
        int(s)
        return True
    except ValueError:
        return False
#END METHOD
'''
This method extracts all ints from a list of reports
reports: The list of surf reports from a single website
returns: reportNums - A list of ints of the wave heights
'''
def extractInts(reports):
    print reports
    reportNums = []
    afterDash = False
    num = 0
    tens = 0
    ones = 0

    #extract all ints from the reports and ditch the rest
    for report in reports:
        for char in report:
            if representsInt(char) == True:
                num = int(char)
                reportNums.append(num)
            else:
                afterDash = True
    return reportNums
#END METHOD
'''
This method iterates through a list of urls and extracts the surf report from
the webpage dependent upon its tag location
rootUrl: The root url of each surf website
urlList: A list of specific urls to be appended to the root url for each
break
tag: the html tag where the actual report lives on the page
returns: a list of strings of each breaks surf report
'''
def extractReports(rootUrl, urlList, tag, tagText):
    #empty list to hold reports
    reports = []
    reportNums = []
    index = 0

    #loop thru URLs
    for url in urlList:
        try:
            index += 1
            #request page
            request = requests.get(rootUrl + url)
            #turn into soup
            soup = BeautifulSoup(request.content, 'lxml')
            #get the tag where surflines report lives
            reportTag = soup.findAll(reportTagFilter)[0]
            reports.append(reportTag.text.strip())
        #notify if fail
        except:
            print 'scrape failure at URL ', index
            pass

    reportNums = extractInts(reports)
    return reportNums
#END METHOD
'''
This method calculates the average of the wave heights
'''
def calcAverages(reportList):
    #empty list to hold averages
    finalAverages = []
    listIndex = 0
    waveIndex = 0

    #loop thru list of reports to calc each breaks ave low and high
    for x in range(0, 6):
        #get low ave
        average = (reportList[listIndex][waveIndex]
            + reportList[listIndex+1][waveIndex]) / NUM_SITES
        finalAverages.append(average)
        waveIndex += 1

    return finalAverages
#END METHOD
slReports = extractReports(slRootUrl, slUrls, slTag, slTagText)
msReports = extractReports(msRootUrl, msUrls, msTag, msTagText)
reportsFinal.append(slReports)
reportsFinal.append(msReports)
print 'Surfline: ', slReports
print 'Magicseaweed: ', msReports

You are not actually extracting integers but floats, it seems, since the values in reports are something like ['0.3-0.6 m']. Right now you are going through every single character and converting each one to an int on its own or discarding it, so it is no wonder that you only get single-digit numbers.
One (arguably) simple way to extract those numbers from such a string is with a regexp:
import re

FLOATEXPR = re.compile(r"(\d+\.\d)-(\d+\.\d) {0,1}m")

def extractFloats(reports):
    reportNums = []
    for report in reports:
        groups = re.match(FLOATEXPR, report).groups()
        for group in groups:
            reportNums.append(float(group))
    return reportNums
This expression would match your floats and return them as a list.
In detail, the expression matches anything that has at least one digit before a '.' and one digit after it, then a '-', then another such float, ending with 'm' or ' m'. It groups the parts representing the floats into a tuple. For example, '0.3-0.6 m' would return [0.3, 0.6]. If you expect more digits after the decimal point, add an extra '+' after the second '\d' in each group.
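For example, run on report strings in that format (these particular values are made up for illustration), it would behave like this:

# hypothetical report strings in the '<low>-<high> m' format described above
reports = ['0.3-0.6 m', '1.2-1.8m', '10.5-12.0 m']
print(extractFloats(reports))
# expected output: [0.3, 0.6, 1.2, 1.8, 10.5, 12.0]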

Related

Extract Text from a word document

I am trying to scrape data from a word document available at:-
https://dl.dropbox.com/s/pj82qrctzkw9137/HE%20Distributors.docx
I need to scrape the Name, Address, City, State, and Email ID. I am able to scrape the E-mail using the below code.
import docx

content = docx.Document('HE Distributors.docx')
location = []
for i in range(len(content.paragraphs)):
    stat = content.paragraphs[i].text
    if 'Email' in stat:
        location.append(i)
for i in location:
    print(content.paragraphs[i].text)
I tried to follow the steps mentioned in:
How to read data from .docx file in python pandas?
I need to convert this into a data frame with all the columns mentioned above, but I'm still running into issues.
There are some inconsistencies in the document: phone numbers start with Tel: sometimes, Tel.: other times, and even Te: once; one of the emails is just on the last line for that distributor without the Email: prefix; and the State isn't always on the last line. Still, for the most part, the data can be extracted with regex and/or splits.
The distributors are separated by empty lines, and the names are in a different color - so I defined this function to get the font color of any paragraph from its xml:
# from bs4 import BeautifulSoup
def getParaColor(para):
    try:
        return BeautifulSoup(
            para.paragraph_format.element.xml, 'xml'
        ).find('color').get('w:val')
    except:
        return ''
The try...except hasn't been necessary yet, but just in case...
(The xml is actually also helpful for double-checking that .text hasn't missed anything - in my case, I noticed that the email for Shri Adhya Educational Books wasn't getting extracted.)
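If you want to see which font colors actually appear in your copy of the document (the hex values used further down were read off this particular file, so verify them), a quick sketch using the getParaColor helper above might look like:

# rough sketch: count the font colors used across all paragraphs,
# so you can map colors to roles (section headings, distributor names, ...)
from collections import Counter
import docx

content = docx.Document('HE Distributors.docx')
print(Counter(getParaColor(p) for p in content.paragraphs))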
Then, you can process the paragraphs from docx.Document with a function like:
# import re
def splitParas(paras):
    ptc = [(
        p.text, getParaColor(p), p.paragraph_format.element.xml
    ) for p in paras]

    curSectn = 'UNKNOWN'
    splitBlox = [{}]
    for pt, pc, px in ptc:
        # double-check for missing text
        xmlText = BeautifulSoup(px, 'xml').text
        xmlText = ' '.join([s for s in xmlText.split() if s != ''])
        if len(xmlText) > len(pt): pt = xmlText

        # initiate
        if not pt:
            if splitBlox[-1] != {}:
                splitBlox.append({})
            continue
        if pc == '20752E':
            curSectn = pt.strip()
            continue
        if splitBlox[-1] == {}:
            splitBlox[-1]['section'] = curSectn
            splitBlox[-1]['raw'] = []
            splitBlox[-1]['Name'] = []
            splitBlox[-1]['address_raw'] = []

        # collect
        splitBlox[-1]['raw'].append(pt)
        if pc == 'D12229':
            splitBlox[-1]['Name'].append(pt)
        elif re.search("^Te.*:.*", pt):
            splitBlox[-1]['tel_raw'] = re.sub("^Te.*:", '', pt).strip()
        elif re.search("^Mob.*:.*", pt):
            splitBlox[-1]['mobile_raw'] = re.sub("^Mob.*:", '', pt).strip()
        elif pt.startswith('Email:') or re.search(".*[@].*[.].*", pt):
            splitBlox[-1]['Email'] = pt.replace('Email:', '').strip()
        else:
            splitBlox[-1]['address_raw'].append(pt)

    # some cleanup
    if splitBlox[-1] == {}: splitBlox = splitBlox[:-1]
    for i in range(len(splitBlox)):
        addrsParas = splitBlox[i]['address_raw']  # for later

        # join lists into strings
        splitBlox[i]['Name'] = ' '.join(splitBlox[i]['Name'])
        for k in ['raw', 'address_raw']:
            splitBlox[i][k] = '\n'.join(splitBlox[i][k])

        # search address for City, State and PostCode
        apLast = addrsParas[-1].split(',')[-1]
        maybeCity = [ap for ap in addrsParas if '–' in ap]
        if '–' not in apLast:
            splitBlox[i]['State'] = apLast.strip()
        if maybeCity:
            maybePIN = maybeCity[-1].split('–')[-1].split(',')[0]
            maybeCity = maybeCity[-1].split('–')[0].split(',')[-1]
            splitBlox[i]['City'] = maybeCity.strip()
            splitBlox[i]['PostCode'] = maybePIN.strip()

        # add mobile to tel
        if 'mobile_raw' in splitBlox[i]:
            if 'tel_raw' not in splitBlox[i]:
                splitBlox[i]['tel_raw'] = splitBlox[i]['mobile_raw']
            else:
                splitBlox[i]['tel_raw'] += (', ' + splitBlox[i]['mobile_raw'])
            del splitBlox[i]['mobile_raw']

        # split tel [as needed]
        if 'tel_raw' in splitBlox[i]:
            tel_i = [t.strip() for t in splitBlox[i]['tel_raw'].split(',')]
            telNum = []
            for t in range(len(tel_i)):
                if '/' in tel_i[t]:
                    tns = [t.strip() for t in tel_i[t].split('/')]
                    tel1 = tns[0]
                    telNum.append(tel1)
                    for tn in tns[1:]:
                        telNum.append(tel1[:-1*len(tn)]+tn)
                else:
                    telNum.append(tel_i[t])
            splitBlox[i]['Tel_1'] = telNum[0]
            splitBlox[i]['Tel'] = telNum[0] if len(telNum) == 1 else telNum
    return splitBlox
(Since I was getting font color anyway, I decided to add another column called "section" to put East/West/etc in. And I added "PostCode" too, since it seems to be on the other side of "City"...)
Since "raw" is saved, any other value can be double checked manually at least.
The function combines "Mobile" into "Tel" even though they're extracted with separate regex.
I'd say "Tel_1" is fairly reliable, but some of the inconsistent patterns mean that other numbers in "Tel" might come out incorrect if they were separated with '/'.
Also, "Tel" is either a string or a list of strings depending on how many numbers there were in "tel_raw".
After this, you can just view it as a DataFrame with:
#import docx
#import pandas
content = docx.Document('HE Distributors.docx')
# pandas.DataFrame(splitParas(content.paragraphs)) # <--all Columns
pandas.DataFrame(splitParas(content.paragraphs))[[
    'section', 'Name', 'address_raw', 'City',
    'PostCode', 'State', 'Email', 'Tel_1', 'tel_raw'
]]
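If you'd rather save the table than just view it, the same DataFrame can be written out with standard pandas (the output filename here is arbitrary):

# write the selected columns to a CSV file
df = pandas.DataFrame(splitParas(content.paragraphs))
df[['section', 'Name', 'address_raw', 'City',
    'PostCode', 'State', 'Email', 'Tel_1', 'tel_raw']].to_csv('HE_Distributors.csv', index=False)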

Data output is not the same as inside function

I am currently having an issue while trying to store data in a list (using dataclasses). When I print the data inside the list from within the function (PullIncursionData()), it shows a set of numbers (never the same ones, which is expected by its nature). But when I print it after calling the function and storing its return value in a variable, it somehow prints only the same number over and over.
I cannot share the numbers, as they update with EVE Online's API, so the only way is to run it locally and read the first list yourself.
The repository is here: https://github.com/AtherActive/EVEAPI-Demo
Heads up! main.py (the file with the issue; a snippet of its code is down below) contains more functions. All functions from line 90 onward are important for this question; the rest can be ignored, as they do not interact with the other functions.
def PullIncursionData():
    #Pulls data from URL and converts it into JSON
    url = 'https://esi.evetech.net/latest/incursions/?datasource=tranquility'
    data = rq.get(url)
    jsData = data.json()

    #Init var to store incursions
    incursions = []

    #Set length for loop. yay
    length = len(jsData)

    # Every loop incursion data will be read by __parseIncursionData(). It then gets added to var incursions.
    for i in range(length):
        # Add data to var incursions.
        incursions.append(__parseIncursionData(jsData, i))

        # If Dev mode, print some debug. Can be toggled in settings.py
        if settings.developerMode == 1:
            print(incursions[i].constellation_id)
    return incursions

# Basically parses the input data in a decent manner. No comments needed really.
def __parseIncursionData(jsData, i):
    icstruct = stru.Incursion
    icstruct.constellation_id = jsData[i]['constellation_id']
    icstruct.constellation_name = 'none'
    icstruct.staging = jsData[i]['staging_solar_system_id']
    icstruct.region_name = ResolveSystemNames(icstruct.constellation_id, 'con-reg')
    icstruct.status = jsData[i]['state']
    icstruct.systems_id = jsData[i]['infested_solar_systems']
    icstruct.systems_names = ResolveSystemNames(jsData[i]['infested_solar_systems'], 'system')
    return icstruct

# Resolves names for systems, regions and constellations. Still WIP.
def ResolveSystemNames(id, mode='constellation'):
    #init value
    output_name = 'none'

    # If constellation, pull data and find region name.
    if mode == 'con-reg':
        url = 'https://www.fuzzwork.co.uk/api/mapdata.php?constellationid={}&format=json'.format(id)
        data = rq.get(url)
        jsData = data.json()
        output_name = jsData[0]['regionname']

    # Pulls system name from Fuzzwork.co.uk.
    elif mode == 'system':
        #Convert output to a list.
        output_name = []
        length = len(id)

        # Pulls system name from Fuzzwork. Not that hard.
        for i in range(length):
            url = 'https://www.fuzzwork.co.uk/api/mapdata.php?solarsystemid={}&format=json'.format(id[i])
            data = rq.get(url)
            jsData = data.json()
            output_name.append(jsData[i]['solarsystemname'])
    return output_name

icdata = PullIncursionData()
print('external data check:')
length = len(icdata)
for i in range(length):
    print(icdata[i].constellation_id)
structures.py (custom file)
#dataclass
class Incursion:
    constellation_id = int
    constellation_name = str
    staging = int
    staging_name = str
    systems_id = list
    systems_names = list
    region_name = str
    status = str

    def ___init___(self):
        self.constellation_id = -1
        self.constellation_name = 'undefined'
        self.staging = -1
        self.staging_name = 'undefined'
        self.systems_id = []
        self.systems_names = []
        self.region_name = 'undefined'
        self.status = 'unknown'
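One thing worth checking, since it produces exactly this symptom: icstruct = stru.Incursion assigns the class itself rather than a new instance (note the missing parentheses), and ___init___ with three underscores is never called, so every element of the list ends up pointing at the same class object and prints whatever was written to it last. A minimal sketch of an instance-based version (assumptions: a real @dataclass and per-call instantiation, not something confirmed in the thread):

# sketch only: create a fresh dataclass instance per incursion,
# so each list element keeps its own values
from dataclasses import dataclass, field

@dataclass
class Incursion:
    constellation_id: int = -1
    constellation_name: str = 'undefined'
    staging: int = -1
    staging_name: str = 'undefined'
    systems_id: list = field(default_factory=list)
    systems_names: list = field(default_factory=list)
    region_name: str = 'undefined'
    status: str = 'unknown'

def __parseIncursionData(jsData, i):
    icstruct = Incursion()  # note the parentheses: a new instance each call
    icstruct.constellation_id = jsData[i]['constellation_id']
    # ... fill in the remaining fields as before ...
    return icstruct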

Is there a way to make a dict callable in your code?

So for one of my projects I have to create an AI similar to the PageRank algorithm to rank the importance of HTML files. When I run the code I get the error below, and I am out of ideas for this one. I have been looking at this code for the past 3 hours and have googled for a way to fix it. I know that when you call a dict, to make sure it works, you have to pass it as an array, and in the code I am passing the page as [page]. I need another pair of eyes to look at it.
Traceback (most recent call last):
File "pagerank.py", line 208, in <module>
main()
File "pagerank.py", line 14, in main
ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
File "pagerank.py", line 119, in sample_pagerank
new_samp_choice = transition_model(corpus, sample, damping_factor)
File "pagerank.py", line 64, in transition_model
num_links = len(corpus([page]))
TypeError: 'dict' object is not callable.
So I looked over Stack Overflow, and it said that the way to call a dict is to pass the key in as an array. But I have looked at the code a hundred times, and I don't know what's wrong with it.
import os
import random
import re
import sys
from collections import Counter
DAMPING = 0.85
SAMPLES = 10000
def main():
    if len(sys.argv) != 2:
        sys.exit("Usage: python pagerank.py corpus")
    corpus = crawl(sys.argv[1])
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
    print(f"PageRank Results from Sampling (n = {SAMPLES})")
    for page in sorted(ranks):
        print(f" {page}: {ranks[page]:.4f}")
    ranks = iterate_pagerank(corpus, DAMPING)
    print(f"PageRank Results from Iteration")
    for page in sorted(ranks):
        print(f" {page}: {ranks[page]:.4f}")
def crawl(directory):
    """
    Parse a directory of HTML pages and check for links to other pages.
    Return a dictionary where each key is a page, and values are
    a list of all other pages in the corpus that are linked to by the page.
    """
    pages = dict()

    # Extract all links from HTML files
    for filename in os.listdir(directory):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename)) as f:
            contents = f.read()
            links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
            pages[filename] = set(links) - {filename}

    # Only include links to other pages in the corpus
    for filename in pages:
        pages[filename] = set(
            link for link in pages[filename]
            if link in pages
        )
    return pages
def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.
    With probability `damping_factor`, choose a link at random
    linked to by `page`. With probability `1 - damping_factor`, choose
    a link at random chosen from all pages in the corpus.
    """
    page_mod = {}
    #run the number of files on the corpus
    num_files = len(corpus)
    #receive number of links from the page that was picked at random
    num_links = len(corpus([page]))
    if num_links != 0:
        #Calculate the probability
        randonm_set = (1 - damping_factor)/num_files
        #calculating the specific page related probability
        specific_set = (1 - damping_factor)/ num_links
    else:
        #Calculate the probability from all pages
        randonm_set = (1 - damping_factor) / num_links
        specific_set = 0
    #iterate over the files
    for file in corpus:
        #Checking the page to see if there are any other links
        if len(corpus[page]) == 0:
            page_mod[file] = 1 / num_files
        else:
            if file not in corpus[page]:
                page_mod[file] = randonm_set
            else:
                page_mod[file] = specific_set + randonm_set
    if round(sum(page_mod.values()), 5) != 1:
        print(f'ERROR! The probabilities add up to {sum(page_mod.values())}')
    return page_mod
def sample_pagerank(corpus, damping_factor, n):
    """
    Return PageRank values for each page by sampling `n` pages
    according to transition model, starting with a page at random.
    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    sample_PR = {}
    # Map each page to its sample count and make it equal to 0
    for page in corpus:
        sample_PR[page] = 0
    sample = None
    for iteration in range(n):
        if sample == None:
            # list of all the choices
            choices = list(corpus.keys())
            # choose a choice at random
            sample = random.choice(choices)
            sample_PR[sample] += 1
        else:
            #Get the probability based on the current sample choice
            new_samp_choice = transition_model(corpus, sample, damping_factor)
            #List of all choices
            choices = list(new_samp_choice.keys())
            # Weights for the distribution for each page, ranking up their importance
            weights = [new_samp_choice[key] for key in choices]
            # random.choices returns a list of values, so pop the single sample
            sample = random.choices(choices, weights).pop()
            sample_PR[sample] += 1
    #Divide by the number of iterations to get a percentage
    sample_PR = {key: value/n for key, value in sample_PR.items()}
    #Check if the values add up to 1
    if round(sum(sample_PR.values()), 5) != 1:
        print(f'ERROR! The probabilities add up to {sum(sample_PR.values())}')
    else:
        print(
            f'sum of the page Rank files: {round(sum(sample_PR.values()),10)}')
    return sample_PR
def iterate_pagerank(corpus, damping_factor):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.
    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    #Create a dictionary for the iterations
    iterate_PR = {}
    #The number of pages in the corpus
    num_pages = len(corpus)
    #Iterate over the corpus and assign a starting value to each page
    for page in corpus:
        iterate_PR[page] = 1/num_pages
    changes = 1
    iterations = 1
    while changes >= 0.001:
        changes = 0
        #Copy the current state of the values to make sure it doesn't override another value
        prev_state = iterate_PR.copy()
        #Iterate over the pages
        for page in iterate_PR:
            #Get parent pages that link to it
            parents = [link for link in corpus if page in corpus[link]]
            #Add the damping factor / number of pages and build the sum over the parents as a list
            first_eq = (1 - damping_factor)/ num_pages
            second_eq = []
            if len(parents) != 0:
                for parent in parents:
                    #Start with the number of links from the parent page
                    num_links = len(corpus[parent])
                    value = prev_state[parent]/ num_links
                    second_eq.append(value)
            #Sum up the values
            second = sum(second_eq)
            iterate_PR[page] = first_eq + (damping_factor * second)
            #Calculating the change of the iteration
            new_change = abs(iterate_PR[page] - prev_state[page])
            if changes < new_change:
                changes = new_change
        iterations += 1
    dictsum = sum(iterate_PR.values())
    iterate_PR = {key: value/dictsum for key, value in iterate_PR.items()}
    print(f'\nPageRank value stable after {iterations} iterations.')
    print(f' Sum of iterate_pagerank values: {round(sum(iterate_PR.values()),10)}')
    return iterate_PR
if __name__ == "__main__":
    main()
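For what it's worth, the traceback points at num_links = len(corpus([page])) in transition_model: the parentheses make Python try to call the dict as a function, which is exactly what 'dict' object is not callable means. Indexing a dict uses square brackets with the key, so the fix would be along these lines:

# corpus maps each page name to the set of pages it links to,
# so look the page up with square brackets instead of calling the dict
num_links = len(corpus[page])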

Insert table data from website into table on my own website using Python and Beautiful Soup

I wrote some code that grabs the numbers I need from this website, but I don't know what to do next.
It grabs the numbers from the table at the bottom. The ones under calving ease, birth weight, weaning weight, yearling weight, milk and total maternal.
#!/usr/bin/python
import urllib2
from bs4 import BeautifulSoup
import pyperclip

def getPageData(url):
    if not ('abri.une.edu.au' in url):
        return -1
    webpage = urllib2.urlopen(url).read()
    soup = BeautifulSoup(webpage, "html.parser")
    # This finds the epd tree and saves it as a searchable list
    pedTreeTable = soup.find('table', {'class':'TablesEBVBox'})
    # This puts all of the epds into a list.
    # it looks for anything in pedTreeTable with a td tag.
    pageData = pedTreeTable.findAll('td')
    pageData.pop(7)
    return pageData

def createPedigree(animalPageData):
    '''make animalPageData much more useful. Strip the text out and put it in a dict.'''
    animals = []
    for animal in animalPageData:
        animals.append(animal.text)
    prettyPedigree = {
        'calving_ease' : animals[18],
        'birth_weight' : animals[19],
        'wean_weight'  : animals[20],
        'year_weight'  : animals[21],
        'milk'         : animals[22],
        'total_mat'    : animals[23]
    }
    for animalKey in prettyPedigree:
        if animalKey != 'year_weight' and animalKey != 'dam':
            prettyPedigree[animalKey] = stripRegNumber(prettyPedigree[animalKey])
    return prettyPedigree

def stripRegNumber(animal):
    '''returns the animal with its registration number stripped'''
    lAnimal = animal.split()
    strippedAnimal = ""
    for word in lAnimal:
        if not word.isdigit():
            strippedAnimal += word + " "
    return strippedAnimal

def prettify(pedigree):
    '''Takes the pedigree and prints it out in a usable format'''
    s = ''
    pedString = ""
    # this is also ugly, but it was the only way I found to format with a variable
    cFormat = '{{:^{}}}'
    rFormat = '{{:>{}}}'

    #row 1 of string
    s += rFormat.format(len(pedigree['calving_ease'])).format(
        pedigree['calving_ease']) + '\n'
    #row 2 of string
    s += rFormat.format(len(pedigree['birth_weight'])).format(
        pedigree['birth_weight']) + '\n'
    #row 3 of string
    s += rFormat.format(len(pedigree['wean_weight'])).format(
        pedigree['wean_weight']) + '\n'
    #row 4 of string
    s += rFormat.format(len(pedigree['year_weight'])).format(
        pedigree['year_weight']) + '\n'
    #row 5 of string
    s += rFormat.format(len(pedigree['milk'])).format(
        pedigree['milk']) + '\n'
    #row 6 of string
    s += rFormat.format(len(pedigree['total_mat'])).format(
        pedigree['total_mat']) + '\n'
    return s

if __name__ == '__main__':
    while True:
        url = raw_input('Input a url you want to use to make life easier: \n')
        pageData = getPageData(url)
        s = prettify(createPedigree(pageData))
        pyperclip.copy(s)
        if len(s) > 0:
            print 'the easy string has been copied to your clipboard'
I've just been using this code for easy copying and pasting. All I have to do is insert the URL, and it saves the numbers to my clipboard.
Now I want to use this code on my website; I want to be able to insert a URL in my HTML code, and it displays these numbers on my page in a table.
My questions are as follows:
How do I use the Python code on the website?
How do I insert collected data into a table with HTML?
It sounds like you would want to use something like Django. Although the learning curve is a bit steep, it is worth it, and it (of course) supports Python.
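If a full Django project feels like too much for a single page, the same idea works with a lighter framework such as Flask (not mentioned above, so treat this as one possible approach). The sketch below assumes getPageData and createPedigree from the question have been ported to Python 3 and are importable from a module named scraper (a hypothetical name):

# minimal Flask sketch: fetch a URL, scrape it with the existing functions,
# and render the pedigree numbers as an HTML table
from flask import Flask, request, render_template_string
from scraper import getPageData, createPedigree  # hypothetical module name

app = Flask(__name__)

TABLE_TEMPLATE = """
<table border="1">
  {% for key, value in pedigree.items() %}
  <tr><th>{{ key }}</th><td>{{ value }}</td></tr>
  {% endfor %}
</table>
"""

@app.route('/epd')
def show_epd():
    # e.g. /epd?url=http://abri.une.edu.au/...
    url = request.args.get('url', '')
    pedigree = createPedigree(getPageData(url))
    return render_template_string(TABLE_TEMPLATE, pedigree=pedigree)

if __name__ == '__main__':
    app.run(debug=True)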

Creating a Blog Summary in Python?

Is there any good library (or regex magic) which can convert a blog entry into a blog summary? I'd like the summary to display the first four sentences, first paragraph, or first X number of characters... not really sure what would be the best. Ideally, I would like it to keep html formatting tags such as <a>, <b>, <u> and <i>, but it could remove all other html tags, javascript and css.
More specifically, as input I'd give an html string representing an entire blog post. As output, I'd like an html string which contains the first few sentences, paragraph, or X number of characters. With all potentially unsafe html tags removed. In Python please.
If you're looking at the HTML you'll need to parse it. In addition to the aforementioned BeautifulSoup, lxml.html has some nice HTML handling tools.
However, if it's a blog you may find it even easier to work with RSS/Atom feeds. Feedparser is fantastic and would make it easy. You'd gain compatibility and durability (because RSS is more strictly defined, things will change less), but if the feed doesn't include what you need, it won't help you.
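A minimal sketch of the feed-based approach (assuming the blog exposes an RSS/Atom feed; the URL here is a placeholder):

# pull the first few entries' summaries from a feed;
# many feeds already provide a 'summary' field, which saves the HTML surgery
import feedparser

feed = feedparser.parse('http://example.com/blog/feed')  # placeholder URL
for entry in feed.entries[:4]:
    print(entry.get('title', ''))
    print(entry.get('summary', '')[:300])  # crude character cutoff as a fallback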
I ended up rolling my own blog summarizer, which uses the gdata library to fetch a Blogspot blog on Google App Engine (it wouldn't be hard to port it to other platforms). The code is below. To use it, first set the constant blog_id_constant and then call get_blog_info to return a dictionary with the blog summaries.
I would not trust the code to create summaries of any random blog out there on the internet because it may not remove all unsafe html from the blog feed. However, for a simple blog that you write yourself, the code below should work.
Please feel free to copy but if you see any bugs or would like to make improvements, add them in the comments. (Sorry for the semicolons).
import sys
import os
import logging
import time
import urllib
from HTMLParser import HTMLParser
from django.core.cache import cache
# Import the Blogger API
sys.path.insert(0, 'gdata.zip')
from gdata import service
Months = ["Jan.", "Feb.", "Mar.", "Apr.", "May", "June", "July", "Aug.", "Sept.", "Oct.", "Nov.", "Dec."];
blog_id_constant = -1 # YOUR BLOG ID HERE
blog_pages_at_once = 5
# -----------------------------------------------------------------------------
# Blogger
class BlogHTMLSummarizer(HTMLParser):
    '''
    An HTML parser which only grabs X number of words and removes
    all tags except for certain safe ones.
    '''

    def __init__(self, max_words = 80):
        self.max_words = max_words
        self.allowed_tags = ["a", "b", "u", "i", "br", "div", "p", "img", "li", "ul", "ol"]
        if self.max_words < 80:
            # If it's really short, don't include layout tags
            self.allowed_tags = ["a", "b", "u", "i"]
        self.reset()
        self.out_html = ""
        self.num_words = 0
        self.no_more_data = False
        self.no_more_tags = False
        self.tag_stack = []

    def handle_starttag(self, tag, attrs):
        if not self.no_more_data and tag in self.allowed_tags:
            val = "<%s %s>"%(tag,
                " ".join("%s='%s'"%(a,b) for (a,b) in attrs))
            self.tag_stack.append(tag)
            self.out_html += val

    def handle_data(self, data):
        if self.no_more_data:
            return
        data = data.split(" ")
        if self.num_words + len(data) >= self.max_words:
            data = data[:self.max_words-self.num_words]
            data.append("...")
            self.no_more_data = True
        self.out_html += " ".join(data)
        self.num_words += len(data)

    def handle_endtag(self, tag):
        if self.no_more_data and not self.tag_stack:
            self.no_more_tags = True
        if not self.no_more_tags and self.tag_stack and tag == self.tag_stack[-1]:
            if not self.tag_stack:
                logging.warning("mixed up blogger tags")
            else:
                self.out_html += "</%s>"%tag
                self.tag_stack.pop()
def get_blog_info(short_summary = False, page = 1, year = "", month = "", day = "", post = None):
    '''
    Returns summaries of several recent blog posts to be displayed on the front page
    page: which page of blog posts to get. Starts at 1.
    '''
    blogger_service = service.GDataService()
    blogger_service.source = 'exampleCo-exampleApp-1.0'
    blogger_service.service = 'blogger'
    blogger_service.account_type = 'GOOGLE'
    blogger_service.server = 'www.blogger.com'
    blog_dict = {}

    # Do the common stuff first
    query = service.Query()
    query.feed = '/feeds/' + blog_id_constant + '/posts/default'
    query.order_by = "published"
    blog_dict['entries'] = []

    def get_common_entry_data(entry, summarize_len = None):
        '''
        Convert an entry to a dictionary object.
        '''
        content = entry.content.text
        if summarize_len != None:
            parser = BlogHTMLSummarizer(summarize_len)
            parser.feed(entry.content.text)
            content = parser.out_html
        pubstr = time.strptime(entry.published.text[:-10], '%Y-%m-%dT%H:%M:%S')
        safe_title = entry.title.text.replace(" ","_")
        for c in ":,.<>!@#$%^&*()+-=?/'[]{}\\\"":
            # remove nasty characters
            safe_title = safe_title.replace(c, "")
        link = "%d/%d/%d/%s/"%(pubstr.tm_year, pubstr.tm_mon, pubstr.tm_mday,
            urllib.quote_plus(safe_title))
        return {
            'title':entry.title.text,
            'alllinks':[x.href for x in entry.link] + [link], #including blogger links
            'link':link,
            'content':content,
            'day':pubstr.tm_mday,
            'month':Months[pubstr.tm_mon-1],
            'summary': True if summarize_len != None else False,
        }

    def get_blogger_feed(query):
        feed = cache.get(query.ToUri())
        if not feed:
            logging.info("GET Blogger Page: " + query.ToUri())
            try:
                feed = blogger_service.Get(query.ToUri())
            except DownloadError:
                logging.error("Cant download blog, rate limited? %s"%str(query.ToUri()))
                return None
            except Exception, e:
                web_exception('get_blogger_feed', e)
                return None
            cache.set(query.ToUri(), feed, 3600)
        return feed

    def _in_one(a, allBs):
        # Return true if a is in one of allBs
        for b in allBs:
            if a in b:
                return True
        return False

    def _get_int(i):
        try:
            return int(i)
        except ValueError:
            return None

    (year, month, day) = (_get_int(year), _get_int(month), _get_int(day))

    if not short_summary and year and month and day:
        # Get one more than we need so we can see if we have more
        query.published_min = "%d-%02d-%02dT00:00:00-08:00"%(year, month, day)
        query.published_max = "%d-%02d-%02dT23:59:59-08:00"%(year, month, day)
        feed = get_blogger_feed(query)
        if not feed:
            return {}
        blog_dict['detail_view'] = True
        blog_dict['entries'] = map(lambda e: get_common_entry_data(e, None), feed.entry)
    elif not short_summary and year and month and not day:
        # Get one more than we need so we can see if we have more
        query.published_min = "%d-%02d-%02dT00:00:00-08:00"%(year, month, 1)
        query.published_max = "%d-%02d-%02dT23:59:59-08:00"%(year, month, 31)
        feed = get_blogger_feed(query)
        if not feed:
            return {}
        blog_dict['detail_view'] = True
        blog_dict['entries'] = map(lambda e: get_common_entry_data(e, None), feed.entry)
        if post:
            blog_dict['entries'] = filter(lambda f: _in_one(post, f['alllinks']), blog_dict['entries'])
    elif short_summary:
        # Get a summary of all posts
        query.max_results = str(3)
        query.start_index = str(1)
        feed = get_blogger_feed(query)
        if not feed:
            return {}
        feed.entry = feed.entry[:3]
        blog_dict['entries'] = map(lambda e: get_common_entry_data(e, 18), feed.entry)
    else:
        # Get a summary of all posts
        try:
            page = int(page)
        except ValueError:
            page = 1

        # Get one more than we need so we can see if we have more
        query.max_results = str(blog_pages_at_once + 1)
        query.start_index = str((page - 1)* blog_pages_at_once + 1)
        logging.info("GET Blogger Page: " + query.ToUri())
        feed = blogger_service.Get(query.ToUri())

        has_older = len(feed.entry) > blog_pages_at_once
        feed.entry = feed.entry[:blog_pages_at_once]
        if page > 1:
            blog_dict['newer_page'] = str(page-1)
        if has_older:
            blog_dict['older_page'] = str(page+1)
        blog_dict['entries'] = map(lambda e: get_common_entry_data(e, 80), feed.entry)

    return blog_dict
You will have to parse the HTML. A nice lib for doing that is BeautifulSoup. It will let you remove specific tags and extract values (the text between tags). The text can then be cut down to four sentences relatively easily, though I'd go for a fixed number of characters, as sentence length can vary a lot.
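A rough sketch of that BeautifulSoup approach (the function name, the tag whitelist, and the 500-character cutoff are arbitrary choices for illustration, not something prescribed above):

# drop script/style, unwrap every tag outside a small whitelist, then truncate
from bs4 import BeautifulSoup

ALLOWED_TAGS = {'a', 'b', 'u', 'i', 'p', 'br'}

def summarize_html(html, max_chars=500):
    soup = BeautifulSoup(html, 'html.parser')
    # remove script/style elements along with their contents
    for tag in soup(['script', 'style']):
        tag.decompose()
    # unwrap every other tag that is not in the whitelist, keeping its text
    for tag in soup.find_all(True):
        if tag.name not in ALLOWED_TAGS:
            tag.unwrap()
    text = str(soup)
    # note: a plain character cutoff can split an open tag; fine for a sketch
    return text[:max_chars] + ('...' if len(text) > max_chars else '')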
