Load Item fields with ItemLoader across multiple responses - python

This is a follow-up question to the accepted answer to the question Scrapy: populate items with item loaders over multiple pages. I want to use ItemLoader to collect values from multiple requests into a single Item. The accepted answer suggests that the result of loader.load_item() should be passed to the next request via the request's meta field.
However, I would like to apply output_processors to all collected values of a single field when returning the loaded object at the end of the crawl.
Questions
What would be the best way to achieve it?
Can I pass the ItemLoader instance itself to the next request over meta, without calling load_item(), and then just replace the selector or response elements of the ItemLoader when adding values or XPaths from the next response?
Example:
def parse(self, response):
    loader = TheLoader(item=TestItems(), response=response)
    loader.add_xpath('title1', '//*[@id="firstHeading"]/text()')
    request = Request(
        "https://en.wikipedia.org/wiki/2016_Rugby_Championship",
        callback=self.parsePage1,
        meta={'loader': loader},
        dont_filter=True
    )
    yield request

def parsePage1(self, response):
    loader = response.meta['loader']
    loader.response = response
    loader.add_xpath('title1', '//*[@id="firstHeading"]/text()')
    return loader.load_item()
Ignore the context of the actual websites.

Yes, you can just pass the ItemLoader instance.
If I recall correctly from an IRC or GitHub chat long ago, there might be some potential issues with doing this, like increased memory usage or leaks from reference handling, because you carry object references to ItemLoader instances (and processors?) around, potentially for a long time depending on the order of your download queues, by binding those ItemLoader instances to the requests.
So keep that in mind: perhaps beware of using this style on large crawls, or do some memory debugging to be certain.
However, I used this method extensively in the past (and would still do so when using ItemLoaders) and haven't seen any problems with the approach myself.
Here is how I do that:
import scrapy
from myproject.loader import ItemLoader


class TheLoader(ItemLoader):
    pass


class SomeSpider(scrapy.Spider):
    [...]

    def parse(self, response):
        loader = TheLoader(item=TestItems(), response=response)
        loader.add_xpath('title1', '//*[@id="firstHeading"]/text()')
        request = Request("https://en.wikipedia.org/wiki/2016_Rugby_Championship",
                          callback=self.parsePage1,
                          dont_filter=True
                          )
        request.meta['loader'] = loader
        yield request

    def parsePage1(self, response):
        loader = response.meta['loader']
        # rebind ItemLoader to a new Selector instance
        #loader.reset(selector=response.selector, response=response)
        # skipping the selector will default to response.selector, like ItemLoader
        loader.reset(response=response)
        loader.add_xpath('title1', '//*[@id="firstHeading"]/text()')
        return loader.load_item()
This requires using a customized ItemLoader class, which can be found in my scrapy scrapyard,
but the relevant part of the class is here:
from scrapy.loader import ItemLoader as ScrapyItemLoader


class ItemLoader(ScrapyItemLoader):
    """ Extended Loader
    for Selector resetting.
    """

    def reset(self, selector=None, response=None):
        if response is not None:
            if selector is None:
                selector = self.default_selector_class(response)
            self.selector = selector
            self.context.update(selector=selector, response=response)
        elif selector is not None:
            self.selector = selector
            self.context.update(selector=selector)
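To connect this back to the original question about output processors: the loader keeps accumulating values for each field across both callbacks, and a declared output processor runs once over the whole collected list when load_item() is finally called. A minimal sketch of what TheLoader could look like for that (the processor choice is just an illustration, not part of the original code; in newer Scrapy versions these processors live in itemloaders.processors):

from scrapy.loader.processors import MapCompose, Join  # itemloaders.processors in newer Scrapy

from myproject.loader import ItemLoader  # the reset-capable loader shown above


class TheLoader(ItemLoader):
    # input processor: applied to each value as it is added, in whichever callback
    title1_in = MapCompose(str.strip)
    # output processor: applied once to the full list of collected values
    # when load_item() is called at the end of the request chain
    title1_out = Join(' | ')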

Related

Why is Scrapy not following all rules / running all callbacks?

I have two spiders inheriting from a parent spider class as follows:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess


class SpiderOpTest(CrawlSpider):
    custom_settings = {
        "USER_AGENT": "*",
        "LOG_LEVEL": "WARNING",
        "DOWNLOADER_MIDDLEWARES": {'scraper_scrapy.odds.middlewares.SeleniumMiddleware': 543},
    }
    httperror_allowed_codes = [301]

    def parse_tournament(self, response):
        print(f"Parsing tournament - {response.url}")

    def parse_tournament_page(self, response):
        print(f"Parsing tournament page - {response.url}")


class SpiderOpTest1(SpiderOpTest):
    name = "test_1"
    start_urls = ["https://www.oddsportal.com/tennis/argentina/atp-buenos-aires/results/"]
    rules = (Rule(LinkExtractor(allow="/page/"), callback="parse_tournament_page"),)


class SpiderOpTest2(SpiderOpTest):
    name = "test_2"
    start_urls = ["https://www.oddsportal.com/tennis/results/"]
    rules = (
        Rule(LinkExtractor(allow="/atp-buenos-aires/results/"), callback="parse_tournament", follow=True),
        Rule(LinkExtractor(allow="/page/"), callback="parse_tournament_page"),
    )


process = CrawlerProcess()
process.crawl(<spider_class>)
process.start()
The parse_tournament_page callback for the Rule in the first spider works fine.
However, the second spider only runs the parse_tournament callback from the first Rule, despite the fact that the second Rule is the same as in the first spider and operates on the same page.
I'm clearly missing something really simple but for the life of me I can't figure out what it is...
Since key parts of the pages load via JavaScript, it might be useful for me to include the Selenium middleware I'm using:
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        self.driver.get(request.url)
        return HtmlResponse(
            self.driver.current_url,
            body=self.driver.page_source,
            encoding='utf-8',
            request=request,
        )

    def spider_opened(self, spider):
        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")
        self.driver = webdriver.Firefox(options=options)

    def spider_closed(self, spider):
        self.driver.close()
Edit:
So I've managed to create a third spider which is able to execute the parse_tournament_page callback from inside parse_tournament:
class SpiderOpTest3(SpiderOpTest):
    name = "test_3"
    start_urls = ["https://www.oddsportal.com/tennis/results/"]
    httperror_allowed_codes = [301]

    rules = (
        Rule(
            LinkExtractor(allow="/atp-buenos-aires/results/"),
            callback="parse_tournament",
            follow=True,
        ),
    )

    def parse_tournament(self, response):
        print(f"Parsing tournament - {response.url}")
        xtr = LinkExtractor(allow="/page/")
        links = xtr.extract_links(response)
        for p in links:
            yield response.follow(p.url, dont_filter=True, callback=self.parse_tournament_page)

    def parse_tournament_page(self, response):
        print(f"Parsing tournament PAGE - {response.url}")
The key here seems to be dont_filter=True - if this is left as the default False then the parse_tournament_page callback isn't executed. This suggests Scrapy is somehow interpreting the second page as a duplicate, which, as far as I can tell, it isn't. That aside, from what I've read, if I want to get around this then I need to add unique=False to the LinkExtractor. However, doing this doesn't result in the parse_tournament_page callback executing :(
Update:
So I think I've found the source of the issue. From what I can tell the request_fingerprint method of RFPDupeFilter creates the same hash for https://www.oddsportal.com/tennis/argentina/atp-buenos-aires/results/ as https://www.oddsportal.com/tennis/argentina/atp-buenos-aires/results/#/page/2/.
From reading around, I need to subclass RFPDupeFilter to reconfigure the way request_fingerprint works. Any advice on why the same hashes are being generated and/or tips on how to do the subclassing correctly would be greatly appreciated!
The difference between the two URLs mentioned in the update is the fragment #/page/2/. Scrapy ignores fragments by default: "Also, servers usually ignore fragments in urls when handling requests, so they are also ignored by default when calculating the fingerprint. If you want to include them, set the keep_fragments argument to True (for instance when handling requests with a headless browser)." (from scrapy/utils/request.py)
Check DUPEFILTER_CLASS settings for more information.
The request_fingerprint function from scrapy.utils.request can already handle fragments: when subclassing, pass keep_fragments=True. Then add your class to the custom_settings of SpiderOpTest.
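A minimal sketch of that subclass, assuming a Scrapy version whose request_fingerprint accepts keep_fragments (the module path myproject.dupefilters is made up; adjust it to your project):

from scrapy.dupefilters import RFPDupeFilter
from scrapy.utils.request import request_fingerprint


class FragmentAwareDupeFilter(RFPDupeFilter):
    """Keep URL fragments (e.g. #/page/2/) when computing request fingerprints."""

    def request_fingerprint(self, request):
        return request_fingerprint(request, keep_fragments=True)

and in the spider:

class SpiderOpTest(CrawlSpider):
    custom_settings = {
        # hypothetical path to the subclass above
        "DUPEFILTER_CLASS": "myproject.dupefilters.FragmentAwareDupeFilter",
    }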

How can I limit the number of items scraped per domain in scrapy?

I am working on scraping items from across a number of websites (using Scrapy).
The items I am trying to scrape are not always well defined and might be within texts. So I am using string matches to recognize items. However, this also yields some unwanted information along with my required data and my scraper takes a long time scraping unwanted information.
To avoid this, I have put an upper limit on the number of items scraped. By using an "if" condition, I am raising a CloseSpider() exception on reaching the upper limit.
This approach worked fine as long as I had only one domain to scrape. How do I extend it to multiple domains?
class CustomSpider(CrawlSpider):
    name = "myspider"
    start_urls = ['https://www.example1.com/']
    allowed_domains = ['www.example1.com']
    rules = [Rule(LinkExtractor(allow=()), callback='parse_info', follow=True)]

    def parse_info(self, response):
        scrape_count = self.crawler.stats.get_value('item_scraped_count')
        if scrape_count == 20:
            raise CloseSpider("Limit Reached")
My question is, how to extend this code for the following scenario:
class CustomSpider(CrawlSpider):
    name = "myspider"
    start_urls = ['https://www.example1.com/', 'https://www.example2.com/']
    allowed_domains = ['www.example1.com', 'www.example2.com/']
    rules = [Rule(LinkExtractor(allow=()), callback='parse_info', follow=True)]

    def parse_info(self, response):
        # suggest change in logic here
        scrape_count = self.crawler.stats.get_value('item_scraped_count')
        if scrape_count == 20:
            raise CloseSpider("Limit Reached")
See this toy example:
from __future__ import print_function

import collections
try:
    from urllib.parse import urlsplit  # Python 3
except ImportError:
    from urlparse import urlsplit  # Python 2

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class MySpider(CrawlSpider):
    name = 'myspider'
    start_urls = ['http://quotes.toscrape.com/',
                  'http://webscraper.io/test-sites']
    allowed_domains = ['quotes.toscrape.com', 'webscraper.io']

    scraped_count = collections.defaultdict(int)
    limit = 10

    rules = [Rule(LinkExtractor(allow=()), callback='parse_page',
                  follow=True, process_request='process_request')]

    def parse_page(self, response):
        yield {
            'url': response.url
        }

    def process_request(self, request):
        url = urlsplit(request.url)[1]
        if self.scraped_count[url] < self.limit:
            self.scraped_count[url] += 1
            return request
        else:
            print('Limit reached for {}'.format(url))
It keeps track of the number of items scraped per domain in the scraped_count attribute, while the limit attribute holds the per-domain limit. The logic is placed inside the process_request method, which is passed as an argument to Rule and gets called for every request extracted by that rule (see the documentation). When you are over the limit, the request gets filtered out; otherwise it is returned unchanged and processed as usual.
If you need something more sophisticated or applicable to multiple spiders, I'd suggest you extend CloseSpider extension class, implement the logic there and replace the default class in the settings.py.
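A rough sketch of that idea, loudly hypothetical (the class name and the PERDOMAIN_ITEMCOUNT setting are made up, not part of Scrapy): it counts scraped items per domain via the item_scraped signal and closes the spider once every allowed domain has reached the limit.

from collections import defaultdict
from urllib.parse import urlsplit

from scrapy import signals
from scrapy.exceptions import NotConfigured


class PerDomainItemLimit:
    """Close the spider once every allowed domain has produced `limit` items.

    Enable it via the EXTENSIONS setting and a (made-up) PERDOMAIN_ITEMCOUNT value.
    """

    def __init__(self, crawler, limit):
        self.crawler = crawler
        self.limit = limit
        self.counts = defaultdict(int)
        crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)

    @classmethod
    def from_crawler(cls, crawler):
        limit = crawler.settings.getint('PERDOMAIN_ITEMCOUNT', 0)
        if not limit:
            raise NotConfigured
        return cls(crawler, limit)

    def item_scraped(self, item, response, spider):
        # count the item against the domain that produced it
        self.counts[urlsplit(response.url).netloc] += 1
        domains = getattr(spider, 'allowed_domains', None) or list(self.counts)
        if all(self.counts[d] >= self.limit for d in domains):
            self.crawler.engine.close_spider(spider, 'perdomain_itemcount')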
You can use CLOSESPIDER_ITEMCOUNT. From the docs:
An integer which specifies a number of items. If the spider scrapes more than that amount and those items are passed by the item pipeline, the spider will be closed with the reason closespider_itemcount. Requests which are currently in the downloader queue (up to CONCURRENT_REQUESTS requests) are still processed. If zero (or not set), spiders won't be closed by number of passed items.
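Note that this is a single global limit for the whole crawl rather than a per-domain one. A minimal sketch of setting it on the spider from the question (the value 40 is arbitrary):

class CustomSpider(CrawlSpider):
    name = "myspider"
    custom_settings = {
        # close the spider after 40 items in total, across all domains
        "CLOSESPIDER_ITEMCOUNT": 40,
    }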

Item Loader not working with response.meta

I want to load two items into an item loader that is passed along via response.meta. Somehow, the standard:
loader.add_xpath('item', 'xpath')
is not working (i.e. no value is saved or written; it is as if the 'item' was never created), but with the exact same expression the following:
response.xpath('xpath')
loader.add_value('item', value)
works. Does anyone know why? Complete code below:
Spider.py
def parse(self, response):
    for record in response.xpath('//div[@class="box list"]/div[starts-with(@class,"record")]'):
        loader = BaseItemLoader(item=BezrealitkyItems(), selector=record)
        loader.add_xpath('title', './/div[@class="details"]/h2/a[@href]/text()')
        listing_url = record.xpath('.//div[@class="details"]/p[@class="short-url"]/text()').extract_first()
        yield scrapy.Request(listing_url, meta={'loader': loader}, callback=self.parse_listing)

def parse_listing(self, response):
    loader = response.meta['loader']
    loader.add_value('url', response.url)
    loader.add_xpath('lat', '//script[contains(.,"recordGps")]', re=r'(?:"lat":)[0-9]+\.[0-9]+')
    return loader.load_item()
The above does not work; when I try this, though, it works:
lat_coords = response.xpath('//script[contains(.,"recordGps")]/text()').re(r'(?:"lat":)([0-9]+\.[0-9]+)')
loader.add_value('lat', lat_coords)
My item.py has nothing special:
class BezrealitkyItems(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    lat = scrapy.Field()

class BaseItemLoader(ItemLoader):
    title_in = MapCompose(lambda v: v.strip(), Join(''), unidecode)
    title_out = TakeFirst()
Just to clarify, I get no error message. It is just that the 'lat' field is never created and nothing is scraped into it. The other items are scraped fine, including the url that is also added through the parse_listing function.
It happens because you are carrying over a loader reference which has its own selector object.
Here you create and assign a selector parameter with your reference:
loader = BaseItemLoader(item=BezrealitkyItems(), selector=record)
Now later you put this loader into your Request.meta attribute and carry it over to the next parse method. What you aren't doing though is updating the selector context once you retrieve the loader from the meta:
loader = response.meta['loader']
# if you check loader.selector you'll see that it still has html body
# set in previous method, i.e. selector of record in your case
loader.selector = Selector(response) # <--- this is missing
This would work; however, it should be avoided, because keeping complex objects with a lot of references in meta is a bad idea and can cause all kinds of errors, mostly related to the Twisted framework (which Scrapy uses for its concurrency).
What you should do instead is load the item and recreate the loader at every step:
def parse(self, response):
    # `record` is the selector from the loop over the listing page, as in the original parse()
    loader = BaseItemLoader(item=BezrealitkyItems(), selector=record)
    yield scrapy.Request('some_url', meta={'item': loader.load_item()}, callback=self.parse2)

def parse2(self, response):
    # recreate the loader from the partially loaded item, bound to the new response
    loader = BaseItemLoader(item=response.meta['item'], response=response)

How to crawl all webpages on website up to certain depth?

I have a website and I would like to find a webpage with information about job vacancies. There is usually only one such page. So I start crawling the website and I manage to get all webpages up to a certain depth. It works, but the pages are duplicated many times: instead of, let's say, 45 pages I get 1000 pages. I know the reason why: every time I call my "parse" function, it parses all the links on the current webpage. So when I come to a new webpage, it crawls all links on it, some of which have been crawled before.
1) I tried to move the "items=[]" list out of the parse function, but I get a global-variable error. I don't know how to get a list of unique webpages; once I have one, I will be able to choose the right page with simple URL parsing.
2) I also tried to have "Request" and "return items" in the "parse" function, but I get a syntax error: return inside generator.
I am using DEPTH_LIMIT. Do I really need to use Rules?
code:
import scrapy, urlparse, os
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from tutorial.items import JobItem
from scrapy.utils.response import get_base_url
from scrapy.http import Request
from urlparse import urljoin
from datetime import datetime


class JobSpider(scrapy.Spider):
    name = "jobs"
    allowed_domains = ["www.gen-i.si"]
    start_urls = ["http://www.gen-i.si"]

    def parse(self, response):
        response.selector.remove_namespaces()
        # choose all "href" values, pointing either to other websites or to webpages on our website
        urls = response.xpath('//@href').extract()
        items = []
        base_url = get_base_url(response)  # base url
        for url in urls:
            # we need only webpages, so we remove external websites and urls with strange characters
            if (url[0:4] != "http") and not any(x in url for x in ['%', ':', '?', '&']):
                item = JobItem()
                absolute_url = urlparse.urljoin(base_url, url)
                item["link"] = absolute_url
                if item not in items:
                    items.append(item)
                    yield item
                yield Request(absolute_url, callback=self.parse)
        #return items
#return items
You're appending item (a newly instantiated object) to your list items. Since item is always a new JobItem() object, it will never exist in your list items.
To illustrate:
>>> class MyItem(object):
... pass
...
>>> a = MyItem()
>>> b = MyItem()
>>> a.url = "abc"
>>> b.url = "abc"
>>> a == b
False
Just because they have one attribute that is the same, doesn't mean they are the same object.
Even if this worked, though, you're resetting the list items every time you call parse (i.e. for each request), so you'll never really remove duplicates.
Instead, you would be better off checking against the absolute_url itself, and keeping the list at the spider level:
class JobSpider(scrapy.Spider):
    name = "jobs"
    allowed_domains = ["www.gen-i.si"]
    start_urls = ["http://www.gen-i.si"]

    all_urls = []

    def parse(self, response):
        # remove "items = []"
        ...
        for url in urls:
            if (url[0:4] != "http") and not any(x in url for x in ['%', ':', '?', '&']):
                absolute_url = urlparse.urljoin(base_url, url)
                if absolute_url not in self.all_urls:
                    self.all_urls.append(absolute_url)

                    item = JobItem()
                    item['link'] = absolute_url

                    yield item
                    yield Request(absolute_url, callback=self.parse)
This functionality, however, would be better served by creating a Dupefilter instead (see here for more information). Additionally, I agree with @RodrigoNey: a CrawlSpider would likely better serve your purpose and be more maintainable in the long run.
I'm working on a web crawler and ended up making a list of links that needed to be crawled; once we visited a link, it was deleted from that list and added to the crawled list. Then you can use a not in check to decide whether to add, delete, etc.

Scrapy - Get index of item being parsed?

I'm trying to load some XPATH rules from a database using Scrapy.
The code I've written so far works fine; however, after some debugging I've realised that Scrapy parses each item asynchronously, meaning I have no control over the order in which the items are parsed.
What I want to do is figure out which item from the list is currently being parsed when it hits the parse() function so I can reference that index to the rows in my database and acquire the correct XPATH query. The way I'm currently doing this is by using a variable called item_index and incrementing it after each item iteration. Now I realise this is not enough and I'm hoping there's some internal functionality that could help me achieve this.
Does anyone know the proper way of keeping track of this? I've looked through the documentation but couldn't find any info about it. I've also looked at the Scrapy source code, but I can't seem to figure out how the list of URLs actually gets stored.
Here's my code to explain my problem further:
# -*- coding: utf-8 -*-
from scrapy.spider import Spider
from scrapy.selector import Selector

from dirbot.items import Product
from dirbot.database import DatabaseConnection

# Create a database connection object so we can execute queries
connection = DatabaseConnection()


class DmozSpider(Spider):
    name = "dmoz"
    start_urls = []
    item_index = 0

    # Query for all products sold by a merchant
    rows = connection.query("SELECT * FROM products_merchant WHERE 1=1")

    def start_requests(self):
        for row in self.rows:
            yield self.make_requests_from_url(row["product_url"])

    def parse(self, response):
        sel = Selector(response)
        item = Product()
        item['product_id'] = self.rows[self.item_index]['product_id']
        item['merchant_id'] = self.rows[self.item_index]['merchant_id']
        item['price'] = sel.xpath(self.rows[self.item_index]['xpath_rule']).extract()
        self.item_index += 1
        return item
Any guidance would be greatly appreciated!
Thanks
Here's the solution I came up with just in case anyone needs it.
As @toothrot suggested, you need to pass the index through Request.meta; I overrode make_requests_from_url so the meta information can be attached when the requests are built.
Hope this helps someone.
# -*- coding: utf-8 -*-
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request

from dirbot.items import Product
from dirbot.database import DatabaseConnection

# Create a database connection object so we can execute queries
connection = DatabaseConnection()


class DmozSpider(Spider):
    name = "dmoz"
    start_urls = []

    # Query for all products sold by a merchant
    rows = connection.query("SELECT * FROM products_merchant WHERE 1=1")

    def start_requests(self):
        for indx, row in enumerate(self.rows):
            self.start_urls.append(row["product_url"])
            yield self.make_requests_from_url(row["product_url"], {'index': indx})

    def make_requests_from_url(self, url, meta):
        return Request(url, callback=self.parse, dont_filter=True, meta=meta)

    def parse(self, response):
        item_index = response.meta['index']
        sel = Selector(response)
        item = Product()
        item['product_id'] = self.rows[item_index]['product_id']
        item['merchant_id'] = self.rows[item_index]['merchant_id']
        item['price'] = sel.xpath(self.rows[item_index]['xpath_rule']).extract()
        return item
You can pass the index (or the row id from the database) along with the request using Request.meta. It's a dictionary you can access from Response.meta in your handler.
For example, when you're building your request:
Request(url, callback=self.some_handler, meta={'row_id': row['id']})
Using a counter like you've attempted won't work because you can't guarantee the order in which the responses are handled.
