I am trying to build a spider that can efficiently scrape text information from many websites. Since I am a Python user, I was referred to Scrapy. However, to avoid scraping huge websites, I want to limit the spider to scrape no more than 20 pages of a certain "depth" per website. Here is my spider:
class DownloadSpider(CrawlSpider):
    name = 'downloader'
    download_path = '/home/MyProjects/crawler'
    rules = (Rule(SgmlLinkExtractor(), callback='parse_item', follow=True),)

    def __init__(self, *args, **kwargs):
        super(DownloadSpider, self).__init__(*args, **kwargs)
        self.urls_file_path = [kwargs.get('urls_file')]
        data = open(self.urls_file_path[0], 'r').readlines()
        self.allowed_domains = [urlparse(i).hostname.strip() for i in data]
        self.start_urls = ['http://' + domain for domain in self.allowed_domains]

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        self.fname = self.download_path + urlparse(response.url).hostname.strip()
        open(str(self.fname) + '.txt', 'a').write(response.url)
        open(str(self.fname) + '.txt', 'a').write('\n')
urls_file is a path to a text file with URLs. I have also set the max depth in the settings file. Here is my problem: if I use the CLOSESPIDER_PAGECOUNT setting, it closes the spider when the total number of scraped pages (regardless of which site) reaches that value. However, I need to stop scraping when I have scraped, say, 20 pages from each start URL.
I also tried keeping count with a variable like self.parsed_number += 1, but this didn't work either -- it seems that Scrapy doesn't go URL by URL but mixes them up.
Any advice is much appreciated!
To do this you can create your own link extractor class based on SgmlLinkExtractor. It should look something like this:
from scrapy.selector import Selector
from scrapy.utils.response import get_base_url
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class LimitedLinkExtractor(SgmlLinkExtractor):
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
                 deny_extensions=None, max_pages=20):
        self.max_pages = max_pages
        SgmlLinkExtractor.__init__(self, allow=allow, deny=deny, allow_domains=allow_domains, deny_domains=deny_domains, restrict_xpaths=restrict_xpaths,
                                   tags=tags, attrs=attrs, canonicalize=canonicalize, unique=unique, process_value=process_value,
                                   deny_extensions=deny_extensions)

    def extract_links(self, response):
        base_url = None
        if self.restrict_xpaths:
            sel = Selector(response)
            base_url = get_base_url(response)
            body = u''.join(f
                            for x in self.restrict_xpaths
                            for f in sel.xpath(x).extract()
                            ).encode(response.encoding, errors='xmlcharrefreplace')
        else:
            body = response.body

        links = self._extract_links(body, response.url, response.encoding, base_url)
        links = self._process_links(links)
        links = links[0:self.max_pages]
        return links
The code of this subclass is based entirely on the code of SgmlLinkExtractor. I've just added the self.max_pages variable to the constructor and a line that cuts the list of links at the end of the extract_links method. But you could trim this list in a more intelligent way.
I'd make a per-class variable, initialize it with stats = defaultdict(int) and increment self.stats[response.url] (or maybe the key could be a tuple like (website, depth) in your case) in parse_item.
This is how I imagine it - it should work in theory. Let me know if you need an example.
FYI, you can extract the base URL and calculate the depth with the help of urlparse.urlparse (see docs).
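For instance, here is a minimal sketch of that counting idea (my assumptions: the hostname is a good enough key, and pages beyond the limit are simply skipped rather than closing the spider):

from collections import defaultdict
from urlparse import urlparse  # urllib.parse on Python 3

class DownloadSpider(CrawlSpider):
    # ... name, download_path and rules as in the question ...
    max_pages_per_site = 20

    def __init__(self, *args, **kwargs):
        super(DownloadSpider, self).__init__(*args, **kwargs)
        self.stats = defaultdict(int)
        # ... rest of the original __init__ ...

    def parse_item(self, response):
        site = urlparse(response.url).hostname.strip()
        self.stats[site] += 1
        if self.stats[site] > self.max_pages_per_site:
            return  # already saw enough pages from this site, skip it
        # ... original parse_item body (writing the url to a file) ...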
I am working on a class project and trying to get all IMDB movie data (titles, budgets, etc.) up until 2016. I adapted the code from https://github.com/alexwhb/IMDB-spider/blob/master/tutorial/spiders/spider.py.
My thought is: for i in range(1874, 2016) (since 1874 is the earliest year shown on http://www.imdb.com/year/), direct the program to the corresponding year's website, and grab the data from that URL.
But the problem is, each page for each year only shows 50 movies, so after crawling those 50 movies, how can I move on to the next page? And after crawling each year, how can I move on to the next year? This is my code for the URL-parsing part so far, but it is only able to crawl 50 movies for a particular year.
class tutorialSpider(scrapy.Spider):
    name = "tutorial"
    allowed_domains = ["imdb.com"]
    start_urls = ["http://www.imdb.com/search/title?year=2014,2014&title_type=feature&sort=moviemeter,asc"]

    def parse(self, response):
        for sel in response.xpath("//*[@class='results']/tr/td[3]"):
            item = MovieItem()
            item['Title'] = sel.xpath('a/text()').extract()[0]
            item['MianPageUrl'] = "http://imdb.com" + sel.xpath('a/@href').extract()[0]
            request = scrapy.Request(item['MianPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request
You can use CrawlSpiders to simplify your task. As you'll see below, start_requests dynamically generates the list of URLs while parse_page only extracts the movies to crawl. Finding and following the 'Next' link is done by the rules attribute.
I agree with @Padraic Cunningham that hard-coding values is not a great idea. I've added spider arguments so that you can call:
scrapy crawl imdb -a start=1950 -a end=1980 (the scraper will default to 1874-2016 if it doesn't get any arguments).
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from imdbyear.items import MovieItem

class IMDBSpider(CrawlSpider):
    name = 'imdb'
    rules = (
        # extract links at the bottom of the page. note that there are 'Prev' and 'Next'
        # links, so a bit of additional filtering is needed
        Rule(LinkExtractor(restrict_xpaths=('//*[@id="right"]/span/a')),
             process_links=lambda links: filter(lambda l: 'Next' in l.text, links),
             callback='parse_page',
             follow=True),
    )

    def __init__(self, start=None, end=None, *args, **kwargs):
        super(IMDBSpider, self).__init__(*args, **kwargs)
        self.start_year = int(start) if start else 1874
        self.end_year = int(end) if end else 2016

    # generate start_urls dynamically
    def start_requests(self):
        for year in range(self.start_year, self.end_year + 1):
            yield scrapy.Request('http://www.imdb.com/search/title?year=%d,%d&title_type=feature&sort=moviemeter,asc' % (year, year))

    def parse_page(self, response):
        for sel in response.xpath("//*[@class='results']/tr/td[3]"):
            item = MovieItem()
            item['Title'] = sel.xpath('a/text()').extract()[0]
            # note -- you had 'MianPageUrl' as your scrapy field name. I would recommend fixing this typo
            # (you will need to change it in items.py as well)
            item['MainPageUrl'] = "http://imdb.com" + sel.xpath('a/@href').extract()[0]
            request = scrapy.Request(item['MainPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request

    # make sure that the dynamically generated start_urls are parsed as well
    parse_start_url = parse_page

    # do your magic
    def parseMovieDetails(self, response):
        pass
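In case it helps, a minimal sketch of parseMovieDetails is simply to complete and emit the item passed along in request.meta (the Budget field below is hypothetical, just to show where your own extraction goes):

    def parseMovieDetails(self, response):
        item = response.meta['item']
        # item['Budget'] = ...  # hypothetical field -- extract whatever you need from the response here
        yield item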
You can use the piece of code below to follow the next page:
# 'a.lister-page-next.next-page::attr(href)' is the selector to get the next page link
next_page = response.css('a.lister-page-next.next-page::attr(href)').extract_first()
if next_page is not None:
    next_page = response.urljoin(next_page)  # joins the current and next page url
    yield scrapy.Request(next_page, callback=self.parse)  # calls the parse function again for the next page
I figured out a very dumb way to solve this: I put all the links in start_urls. A better solution would be very much appreciated!
class tutorialSpider(scrapy.Spider):
    name = "tutorial"
    allowed_domains = ["imdb.com"]
    start_urls = []
    for i in xrange(1874, 2017):
        for j in xrange(1, 11501, 50):
            # since the largest number of movies for a year to have is 11,400 (2016)
            start_url = "http://www.imdb.com/search/title?sort=moviemeter,asc&start=" + str(j) + "&title_type=feature&year=" + str(i) + "," + str(i)
            start_urls.append(start_url)

    def parse(self, response):
        for sel in response.xpath("//*[@class='results']/tr/td[3]"):
            item = MovieItem()
            item['Title'] = sel.xpath('a/text()').extract()[0]
            item['MianPageUrl'] = "http://imdb.com" + sel.xpath('a/@href').extract()[0]
            request = scrapy.Request(item['MianPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request
The code that @Greg Sadetsky provided needs some minor changes. Well, only one change, in the first line of the parse_page method.
Just change the xpath in the for loop from:
response.xpath("//*[@class='results']/tr/td[3]"):
to
response.xpath("//*[contains(@class,'lister-item-content')]/h3"):
This worked like a charm for me!
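For reference, the adjusted loop would look roughly like this (the selectors inside the loop are kept from @Greg Sadetsky's answer and may also need tweaking for the newer IMDB markup):

    def parse_page(self, response):
        for sel in response.xpath("//*[contains(@class,'lister-item-content')]/h3"):
            item = MovieItem()
            item['Title'] = sel.xpath('a/text()').extract()[0]
            item['MainPageUrl'] = "http://imdb.com" + sel.xpath('a/@href').extract()[0]
            request = scrapy.Request(item['MainPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request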
I'm writing a spider (CrawlSpider) for an online store. According to the client's requirements, I need to write two rules: one for determining which pages have items and another for extracting the items.
I have both rules already working independently:
if my start_urls = ["www.example.com/books.php",
"www.example.com/movies.php"] and I comment the Rule and the code
of parse_category, my parse_item will extract every item.
On the other hand, if start_urls = "http://www.example.com" and I
comment the Rule and the code of parse_item, parse_category will
return every link in which there are items for extracting, i.e.
parse_category will return www.example.com/books.php and
www.example.com/movies.php.
My problem is that I don't know how to merge both modules, so that start_urls = "http://www.example.com" and then parse_category extracts www.example.com/books.php and www.example.com/movies.php and feed those links to parse_item, where I actually extract the info of each item.
I need to find a way to do it this way instead of just using start_urls = ["www.example.com/books.php", "www.example.com/movies.php"], because if a new category were added in the future (e.g. www.example.com/music.php), the spider wouldn't automatically detect that new category and would have to be manually edited. Not a big deal, but the client doesn't want this.
class StoreSpider(CrawlSpider):
    name = "storyder"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/"]
    #start_urls = ["http://www.example.com/books.php", "http://www.example.com/movies.php"]

    rules = (
        Rule(LinkExtractor(), follow=True, callback='parse_category'),
        Rule(LinkExtractor(), follow=False, callback="parse_item"),
    )

    def parse_category(self, response):
        category = StoreCategory()
        # some code for determining whether the current page is a category, or just another stuff
        if is a category:
            category['name'] = name
            category['url'] = response.url
        return category

    def parse_item(self, response):
        item = StoreItem()
        # some code for extracting the item's data
        return item
The CrawlSpider rules don't work the way you want; you'll need to implement the logic yourself. When you specify follow=True you can't use a callback, because the idea is to keep extracting links (not items) while following the rules; check the documentation.
You could try something like:
class StoreSpider(CrawlSpider):
    name = "storyder"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/"]
    # no rules

    def parse(self, response):  # this is parse_category
        category_le = LinkExtractor("something for categories")
        for a in category_le.extract_links(response):
            yield Request(a.url, callback=self.parse_category)
        item_le = LinkExtractor("something for items")
        for a in item_le.extract_links(response):
            yield Request(a.url, callback=self.parse_item)

    def parse_category(self, response):
        category = StoreCategory()
        # some code for determining whether the current page is a category, or just another stuff
        if is a category:
            category['name'] = name
            category['url'] = response.url
        yield category
        for req in self.parse(response):
            yield req

    def parse_item(self, response):
        item = StoreItem()
        # some code for extracting the item's data
        return item
Instead of using a parse_category, I used restrict_css in LinkExtractor to get the links I want, and it seems to be feeding the second Rule with the extracted links, so my question is answered. It ended up this way:
class StoreSpider(CrawlSpider):
    name = "storyder"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/"]

    rules = (
        Rule(LinkExtractor(restrict_css=("#movies", "#books"))),
        Rule(LinkExtractor(), callback="parse_item"),
    )

    def parse_item(self, response):
        item = StoreItem()
        # some code for extracting the item's data
        return item
Still, it can't detect newly added categories (and there is no clear pattern to use in restrict_css without fetching other garbage), but at least it complies with the client's requirements: two rules, one for extracting the category links and another for extracting the item data.
As some of you may have gathered, I'm learning scrapy to scrape some data off of Google Scholar for a research project that I am running. I have a file that contains many article titles for which I am scraping citations. I read in the file using pandas, generate the URLs that need scraping, and start scraping.
One problem that I face is 503 errors. Google shuts me off fairly quickly, and many entries remain unscraped. This is a problem that I am working on using some middleware provided by Crawlera.
Another problem I face is that when I export my scraped data, I have a hard time matching the scraped data to what I was trying to look for. My input data is a CSV file with three fields -- 'Authors','Title','pid' where 'pid' is a unique identifier.
I use pandas to read in the file and generate URLs for scholar based off the title. Each time a given URL is scraped, my spider goes through the scholar webpage, and picks up the title, publication information and cites for each article listed on that page.
Here is how I generate the links for scraping:
class ScholarSpider(Spider):
    name = "scholarscrape"
    allowed_domains = ["scholar.google.com"]

    # get the data
    data = read_csv("../../data/master_jeea.csv")
    # get the titles
    queries = data.Title.apply(urllib.quote)
    # generate a var to store links
    links = []
    # create the URLs to crawl
    for entry in queries:
        links.append("http://scholar.google.com/scholar?q=allintitle%3A" + entry)
    # give the URLs to scrapy
    start_urls = links
For example, one title from my data file could be the paper 'Elephants Don't Play Chess' by Rodney Brooks with 'pid' 5067. The spider goes to
http://scholar.google.com/scholar?q=allintitle%3Aelephants+don%27t+play+chess
Now on this page, there are six hits. The spider gets all six hits, but they need to be assigned the same 'pid'. I know I need to insert a line somewhere that reads something like item['pid'] = data.pid.apply("something") but I can't figure out exactly how I would do that.
Below is the rest of the code for my spider. I am sure the way to do this is pretty straightforward, but I can't think of how to get the spider to know which entry of data.pid it should look for if that makes sense.
def parse(self, response):
    # initialize something to hold the data
    items = []
    sel = Selector(response)
    # get each 'entry' on the page
    # an entry is a self contained div
    # that has the title, publication info
    # and cites
    entries = sel.xpath('//div[@class="gs_ri"]')
    # a counter for the entry that is being scraped
    count = 1
    for entry in entries:
        item = ScholarscrapeItem()
        # get the title
        title = entry.xpath('.//h3[@class="gs_rt"]/a//text()').extract()
        # the title is messy
        # clean up
        item['title'] = "".join(title)
        # get publication info
        # clean up
        author = entry.xpath('.//div[@class="gs_a"]//text()').extract()
        item['authors'] = "".join(author)
        # get the portion that contains citations
        cite_string = entry.xpath('.//div[@class="gs_fl"]//text()').extract()
        # find the part that says "Cited by"
        match = re.search("Cited by \d+", str(cite_string))
        # if it exists, note the number
        if match:
            cites = re.search("\d+", match.group()).group()
        # if not, there is no citation info
        else:
            cites = None
        item['cites'] = cites
        item['entry'] = count
        # iterate the counter
        count += 1
        # append this item to the list
        items.append(item)
    return items
I hope this question is well-defined, but please let me know if I can be more clear. There is really not much else in my scraper except some lines at the top importing things.
Edit 1: Based on suggestions below, I have modified my code as follows:
# test-case: http://scholar.google.com/scholar?q=intitle%3Amigratory+birds

import re
from pandas import *
import urllib
from scrapy.spider import Spider
from scrapy.selector import Selector
from scholarscrape.items import ScholarscrapeItem

class ScholarSpider(Spider):
    name = "scholarscrape"
    allowed_domains = ["scholar.google.com"]

    # get the data
    data = read_csv("../../data/master_jeea.csv")
    # get the titles
    queries = data.Title.apply(urllib.quote)
    pid = data.pid
    # generate a var to store links
    urls = []
    # create the URLs to crawl
    for entry in queries:
        urls.append("http://scholar.google.com/scholar?q=allintitle%3A" + entry)
    # give the URLs to scrapy
    start_urls = (
        (urls, pid),
    )

    def make_requests_from_url(self, (url, pid)):
        return Request(url, meta={'pid': pid}, callback=self.parse, dont_filter=True)

    def parse(self, response):
        # initialize something to hold the data
        items = []
        sel = Selector(response)
        # get each 'entry' on the page
        # an entry is a self contained div
        # that has the title, publication info
        # and cites
        entries = sel.xpath('//div[@class="gs_ri"]')
        # a counter for the entry that is being scraped
        count = 1
        for entry in entries:
            item = ScholarscrapeItem()
            # get the title
            title = entry.xpath('.//h3[@class="gs_rt"]/a//text()').extract()
            # the title is messy
            # clean up
            item['title'] = "".join(title)
            # get publication info
            # clean up
            author = entry.xpath('.//div[@class="gs_a"]//text()').extract()
            item['authors'] = "".join(author)
            # get the portion that contains citations
            cite_string = entry.xpath('.//div[@class="gs_fl"]//text()').extract()
            # find the part that says "Cited by"
            match = re.search("Cited by \d+", str(cite_string))
            # if it exists, note the number
            if match:
                cites = re.search("\d+", match.group()).group()
            # if not, there is no citation info
            else:
                cites = None
            item['cites'] = cites
            item['entry'] = count
            item['pid'] = response.meta['pid']
            # iterate the counter
            count += 1
            # append this item to the list
            items.append(item)
        return items
You need to populate your list start_urls with tuples (url, pid).
Now redefine the method make_requests_from_url(url):
class ScholarSpider(Spider):
    name = "ScholarSpider"
    allowed_domains = ["scholar.google.com"]
    start_urls = (
        ('http://www.scholar.google.com/', 100),
    )

    def make_requests_from_url(self, (url, pid)):
        return Request(url, meta={'pid': pid}, callback=self.parse, dont_filter=True)

    def parse(self, response):
        pid = response.meta['pid']
        print '!!!!!!!!!!!', pid, '!!!!!!!!!!!!'
        pass
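Note that tuple parameter unpacking in a def signature (make_requests_from_url(self, (url, pid))) only works on Python 2. On Python 3, the same idea can be expressed with start_requests, roughly like this:

class ScholarSpider(Spider):
    name = "ScholarSpider"
    allowed_domains = ["scholar.google.com"]
    start_urls = (
        ('http://www.scholar.google.com/', 100),
    )

    def start_requests(self):
        # unpack the (url, pid) pairs here instead of in the method signature
        for url, pid in self.start_urls:
            yield Request(url, meta={'pid': pid}, callback=self.parse, dont_filter=True)

    def parse(self, response):
        pid = response.meta['pid']
        print('!!!!!!!!!!!', pid, '!!!!!!!!!!!!')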
I have written a spider that can take the information from this page and follow the "Next page" links. Right now, the spider only takes the information that I'm showing in the following structure.
The structure of the page is something like this
Title 1
URL 1 ---------> If you click you go to one page with more information
Location 1
Title 2
URL 2 ---------> If you click you go to one page with more information
Location 2
Next page
What I want is for the spider to go to each URL link and get the full information. I suppose I must add another rule specifying that I want to do something like this.
The behaviour of the spider should be:
Go to URL1 (get info)
Go to URL2 (get info)
...
Next page
But I don't know how I can implement it. Can someone guide me?
Code of my Spider:
class BcnSpider(CrawlSpider):
    name = 'bcn'
    allowed_domains = ['guia.bcn.cat']
    start_urls = ['http://guia.bcn.cat/index.php?pg=search&q=*:*']

    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(re.escape("index.php")),
                restrict_xpaths=("//div[@class='paginador']")),
            callback="parse_item",
            follow=True),
    )

    def parse_item(self, response):
        self.log("parse_item")
        sel = Selector(response)
        sites = sel.xpath("//div[@id='llista-resultats']/div")
        items = []
        cont = 0
        for site in sites:
            item = BcnItem()
            item['id'] = cont
            item['title'] = u''.join(site.xpath('h3/a/text()').extract())
            item['url'] = u''.join(site.xpath('h3/a/@href').extract())
            item['when'] = u''.join(site.xpath('div[@class="dades"]/dl/dd[1]/text()').extract())
            item['where'] = u''.join(site.xpath('div[@class="dades"]/dl/dd[2]/span/a/text()').extract())
            item['street'] = u''.join(site.xpath('div[@class="dades"]/dl/dd[3]/span/text()').extract())
            item['phone'] = u''.join(site.xpath('div[@class="dades"]/dl/dd[4]/text()').extract())
            items.append(item)
            cont = cont + 1
        return items
EDIT: After searching on the internet I found some code with which I can do that.
First of all, I have to get all the links, and then I have to call another parse method.
def parse(self, response):
    # Get all URLs
    yield Request(url=_url, callback=self.parse_details)

def parse_details(self, response):
    # Detailed information of each page
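For instance, reusing the selectors already present in parse_item above, the skeleton could be filled in roughly like this (a sketch, not tested against the live site):

from urlparse import urljoin  # urllib.parse on Python 3

def parse(self, response):
    # Get all URLs of the detail pages (selectors taken from parse_item above)
    for _url in response.xpath("//div[@id='llista-resultats']/div/h3/a/@href").extract():
        yield Request(url=urljoin(response.url, _url), callback=self.parse_details)

def parse_details(self, response):
    # Detailed information of each page goes here
    pass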
If you want to use Rules because the page has a paginator, you should rename def parse to def parse_start_url and then call this method through the Rule. With this change you make sure that parsing begins at parse_start_url, and the code would be something like this:
rules = (
    Rule(
        SgmlLinkExtractor(
            allow=(re.escape("index.php")),
            restrict_xpaths=("//div[@class='paginador']")),
        callback="parse_start_url",
        follow=True),
)

def parse_start_url(self, response):
    # Get all URLs
    yield Request(url=_url, callback=self.parse_details)

def parse_details(self, response):
    # Detailed information of each page
That's all, folks!
There is an easier way of achieving this. Click next on your link, and read the new url carefully:
http://guia.bcn.cat/index.php?pg=search&from=10&q=*:*&nr=10
By looking at the GET data in the URL (everything after the question mark), and with a bit of testing, we find that these parameters mean:
from=10 - Starting index
q=*:* - Search query
nr=10 - Number of items to display
This is how I would've done it:
Set nr=100 or higher. (1000 may do as well, just be sure that there is no timeout)
Loop from from=0 to 34300. This is above the current number of entries. You may want to extract this value first.
Example code:
entries = 34246
step = 100
stop = entries - entries % step + step

for x in xrange(0, stop, step):
    url = 'http://guia.bcn.cat/index.php?pg=search&from={}&q=*:*&nr={}'.format(x, step)
    # Loop over all entries, and open links if needed
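One way to plug those URLs into a Scrapy spider (an assumption on my part, not something spelled out above) is to generate them in start_requests:

import scrapy

class BcnAllSpider(scrapy.Spider):  # hypothetical spider name
    name = 'bcn_all'
    allowed_domains = ['guia.bcn.cat']

    def start_requests(self):
        entries = 34246
        step = 100
        stop = entries - entries % step + step
        for x in range(0, stop, step):
            url = 'http://guia.bcn.cat/index.php?pg=search&from={}&q=*:*&nr={}'.format(x, step)
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # loop over all entries here, and open the detail links if needed
        pass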
I am attempting to scrape the Library of Congress/Thomas website. This Python script is intended to access a sample of 40 bills from their site (# 1-40 identifiers in the URLs). I want to parse the body of each piece of legislation, search in the body/content, extract links to potential multiple versions & follow.
Once on the version page(s) I want to parse the body of each piece of legislation, search the body/content & extract links to potential sections & follow.
Once on the section page(s) I want to parse the body of each section of a bill.
I believe there is some issue with the Rules/LinkExtractor segment of my code. The Python code executes and crawls the start URLs, but does no parsing or any of the subsequent tasks.
Three issues:
Some bills do not have multiple versions (and ergo no links in the body portion of the page).
Some bills do not have linked sections because they are so short, while some are nothing but links to sections.
Some section links do not contain just section-specific content, and most of the content is just redundant inclusion of prior or subsequent section content.
My question is again, why is Scrapy not crawling or parsing?
from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

class BillItem(Item):
    title = Field()
    body = Field()

class VersionItem(Item):
    title = Field()
    body = Field()

class SectionItem(Item):
    body = Field()

class Lrn2CrawlSpider(CrawlSpider):
    name = "lrn2crawl"
    allowed_domains = ["thomas.loc.gov"]
    start_urls = ["http://thomas.loc.gov/cgi-bin/query/z?c107:H.R.%s:" % bill for bill in xrange(000001, 00040, 00001)  ### Sample of 40 bills; Total range of bills is 1-5767
    ]

rules = (
    # Extract links matching /query/ fragment (restricting to those inside the content body of the url)
    # and follow links from them (since no callback means follow=True by default).
    # Desired result: scrape all bill text & in the event that there are multiple versions, follow them & parse.
    Rule(SgmlLinkExtractor(allow=(r'/query/'), restrict_xpaths=('//div[@id="content"]')), callback='parse_bills', follow=True),

    # Extract links in the body of a bill-version & follow them.
    # Desired result: scrape all version text & in the event that there are multiple sections, follow them & parse.
    Rule(SgmlLinkExtractor(restrict_xpaths=('//div/a[2]')), callback='parse_versions', follow=True)
)

def parse_bills(self, response):
    hxs = HtmlXPathSelector(response)
    bills = hxs.select('//div[@id="content"]')
    scraped_bills = []
    for bill in bills:
        scraped_bill = BillItem()  ### Bill object defined previously
        scraped_bill['title'] = bill.select('p/text()').extract()
        scraped_bill['body'] = response.body
        scraped_bills.append(scraped_bill)
    return scraped_bills

def parse_versions(self, response):
    hxs = HtmlXPathSelector(response)
    versions = hxs.select('//div[@id="content"]')
    scraped_versions = []
    for version in versions:
        scraped_version = VersionItem()  ### Version object defined previously
        scraped_version['title'] = version.select('center/b/text()').extract()
        scraped_version['body'] = response.body
        scraped_versions.append(scraped_version)
    return scraped_versions

def parse_sections(self, response):
    hxs = HtmlXPathSelector(response)
    sections = hxs.select('//div[@id="content"]')
    scraped_sections = []
    for section in sections:
        scraped_section = SectionItem()  ## Segment object defined previously
        scraped_section['body'] = response.body
        scraped_sections.append(scraped_section)
    return scraped_sections

spider = Lrn2CrawlSpider()
Just for the record, the problem with your script is that the variable rules is not inside the scope of Lrn2CrawlSpider because it doesn't share the same indentation, so when alecxe fixed the indentation, rules became an attribute of the class. Later, the inherited __init__() method reads the attribute, compiles the rules and enforces them.
def __init__(self, *a, **kw):
super(CrawlSpider, self).__init__(*a, **kw)
self._compile_rules()
Erasing the last line had nothing to do with that.
I've just fixed the indentation, removed the spider = Lrn2CrawlSpider() line at the end of the script, ran the spider via scrapy runspider lrn2crawl.py, and it scrapes, follows links and returns items - your rules work.
Here's what I'm running:
from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

class BillItem(Item):
    title = Field()
    body = Field()

class VersionItem(Item):
    title = Field()
    body = Field()

class SectionItem(Item):
    body = Field()

class Lrn2CrawlSpider(CrawlSpider):
    name = "lrn2crawl"
    allowed_domains = ["thomas.loc.gov"]
    start_urls = ["http://thomas.loc.gov/cgi-bin/query/z?c107:H.R.%s:" % bill for bill in xrange(000001, 00040, 00001)  ### Sample of 40 bills; Total range of bills is 1-5767
    ]

    rules = (
        # Extract links matching /query/ fragment (restricting to those inside the content body of the url)
        # and follow links from them (since no callback means follow=True by default).
        # Desired result: scrape all bill text & in the event that there are multiple versions, follow them & parse.
        Rule(SgmlLinkExtractor(allow=(r'/query/'), restrict_xpaths=('//div[@id="content"]')), callback='parse_bills', follow=True),

        # Extract links in the body of a bill-version & follow them.
        # Desired result: scrape all version text & in the event that there are multiple sections, follow them & parse.
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div/a[2]')), callback='parse_versions', follow=True)
    )

    def parse_bills(self, response):
        hxs = HtmlXPathSelector(response)
        bills = hxs.select('//div[@id="content"]')
        scraped_bills = []
        for bill in bills:
            scraped_bill = BillItem()  ### Bill object defined previously
            scraped_bill['title'] = bill.select('p/text()').extract()
            scraped_bill['body'] = response.body
            scraped_bills.append(scraped_bill)
        return scraped_bills

    def parse_versions(self, response):
        hxs = HtmlXPathSelector(response)
        versions = hxs.select('//div[@id="content"]')
        scraped_versions = []
        for version in versions:
            scraped_version = VersionItem()  ### Version object defined previously
            scraped_version['title'] = version.select('center/b/text()').extract()
            scraped_version['body'] = response.body
            scraped_versions.append(scraped_version)
        return scraped_versions

    def parse_sections(self, response):
        hxs = HtmlXPathSelector(response)
        sections = hxs.select('//div[@id="content"]')
        scraped_sections = []
        for section in sections:
            scraped_section = SectionItem()  ## Segment object defined previously
            scraped_section['body'] = response.body
            scraped_sections.append(scraped_section)
        return scraped_sections
Hope that helps.