In Scrapy, How to set time limit for each url? - python

I am trying to crawl multiple websites using Scrapy link extractor and follow as TRUE (recursive) .. Looking for a solution to set the time limit to crawl for each url in start_urls list.
Thanks
import scrapy
class DmozItem(scrapy.Item):
title = scrapy.Field()
link = scrapy.Field()
desc = scrapy.Field()
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domains = ["dmoz.org"]
start_urls = [
"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
]
def parse(self, response):
for sel in response.xpath('//ul/li'):
item = DmozItem()
item['title'] = sel.xpath('a/text()').extract()
item['link'] = sel.xpath('a/#href').extract()
item['desc'] = sel.xpath('text()').extract()
yield item

You need to use download_timeout meta parameter for scrapy.Request.
To use it in starting urls, you need to override self.start_requests(self) function, something like:
def start_requests(self):
# 10 seconds for first url
yield Request(self.start_urls[0], meta={'download_timeout': 10})
# 60 seconds for first url
yield Request(self.start_urls[1], meta={'download_timeout': 60})
You can read more about Request special meta keys here: http://doc.scrapy.org/en/latest/topics/request-response.html#request-meta-special-keys

You can use the CLOSESPIDER_TIMEOUT setting
For example, call your spider like this:
scrapy crawl DmozSpider -s CLOSESPIDER_TIMEOUT=10

Use a Timeout object!
import signal
class Timeout(object):
"""Timeout class using ALARM signal."""
class TimeoutError(Exception):
pass
def __init__(self, sec):
self.sec = sec
def __enter__(self):
signal.signal(signal.SIGALRM, self.raise_timeout)
signal.alarm(self.sec)
def __exit__(self, *args):
signal.alarm(0)# disable alarm
def raise_timeout(self, *args):
raise Timeout.TimeoutError('TimeoutError')
Then you can call your extractor inside a with statement like this:
with Timeout(10): #10 seconds
try:
do_what_you_need_to_do
except Timeout.TimeoutError:
#break, continue or whatever else you may need

Related

How to run both items in scrapy function?

Whenever I use the link of captions and transcription in start_urls variable, it gives me the price of caption in both captions and transcription variable and again give me the price of transcription in both variables. Why and how to solve this issue?
import scrapy
from .. items import FetchingItem
class SiteFetching(scrapy.Spider):
name = 'Site'
start_urls = ['https://www.rev.com/freelancers/captions',
'https://www.rev.com/freelancers/transcription']
def parse(self, response):
items = FetchingItem()
Transcription_price = response.css('#middle-benefit .mt1::text').extract()
Caption_price = response.css('#middle-benefit .mt1::text').extract()
items['Transcription_price'] = Transcription_price
items['Caption_price'] = Caption_price
yield items
I suspect that you need another structure of class, sequential:
import scrapy
from .. items import FetchingItem
class SiteFetching(scrapy.Spider):
name = 'Site'
start_urls = ['https://www.rev.com/freelancers/captions']
def parse(self, response):
items = FetchingItem()
items['Caption_price'] = response.css('#middle-benefit .mt1::text').extract()
yield Request('https://www.rev.com/freelancers/transcription', self.parse_transcription, meta={'items': items})
def parse_transcription(self, response):
items = response.meta['items']
items['Transcription_price'] = response.css('#middle-benefit .mt1::text').extract()
yield items

Is it possible to run pipelines and crawl multiple URL at the same time in scrapy?

My spider looks like this
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
from ProjectName.items import ProjectName
class SpidernameSpider(CrawlSpider):
name = 'spidername'
allowed_domains = ['webaddress']
start_urls = ['webaddress/query1']
rules = (
Rule(LinkExtractor(restrict_css='horizontal css')),
Rule(LinkExtractor(restrict_css='vertical css'),
callback='parse_item')
)
def parse_item(self, response):
item = ProjectName()
1_css = 'css1::text'
item['1'] = response.css(1_css).extract()
item = ProjectName()
2_css = 'css2::text'
item['2'] = response.css(2_css).extract()
return item
and my pipeline like this:
from scrapy.exceptions import DropItem
class RemoveIncompletePipeline(object):
def reminc_item(self, item, spider):
if item['1']:
return item
else:
raise DropItem("Missing content in %s" % item)
Everything works fine, when the value for field 1 is missing then, the coresponding item is taken out from the output.
But, when I change start_urls, in order to do the job for multiple queries, like this:
f = open("queries.txt")
start_urls = [url.strip() for url in f.readlines()]
f.close()
or like this:
start_urls = [i.strip() for i in open('queries.txt').readlines()]
Then the output contains the items with missing value for field 1.
What's going on? And how I can avoid that?
For the record queries.txt looks like that:
webaddress/query1
webaddress/query2
According to the docs you should override start_requests method.
This method must return an iterable with the first Requests to crawl
for this spider.
This is the method called by Scrapy when the spider is opened for
scraping when no particular URLs are specified. If particular URLs are
specified, the make_requests_from_url() is used instead to create the
Requests. This method is also called only once from Scrapy, so it’s
safe to implement it as a generator.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
from ProjectName.items import ProjectName
class SpidernameSpider(CrawlSpider):
name = 'spidername'
allowed_domains = ['webaddress']
start_urls = ['webaddress/query1']
rules = (
Rule(LinkExtractor(restrict_css='horizontal css')),
Rule(LinkExtractor(restrict_css='vertical css'),
callback='parse_item')
)
def start_requests(self):
return [scrapy.Request(i.strip(), callback=self.parse_item) for i in open('queries.txt').readlines()]
def parse_item(self, response):
item = ProjectName()
1_css = 'css1::text'
item['1'] = response.css(1_css).extract()
item = ProjectName()
2_css = 'css2::text'
item['2'] = response.css(2_css).extract()
return item
UPD:
Just put this code into your spider class
def start_requests(self):
return [scrapy.Request(i.strip(), callback=self.parse_item) for i in open('queries.txt').readlines()]
UPD:
Your have a wrong logic in your parse_item method. You need to fix it.
def parse_item(self, response):
for job in response.css('div.card-top')
item = ProjectName()
# just quick example.
item['city'] = job.xpath('string(//span[#class="serp-location"])').extract()[0].replace(' ', '').replace('\n', '')
# TODO: you should fill other item fields
# ...
yeild item

Limit how much elements scrapy can collect

I am using scrapy to collect some data. My scrapy program collects 100 elements at one session. I need to limit it to 50 or any random number. How can i do that? Any solution is welcomed. Thanks in advance
# -*- coding: utf-8 -*-
import re
import scrapy
class DmozItem(scrapy.Item):
# define the fields for your item here like:
link = scrapy.Field()
attr = scrapy.Field()
title = scrapy.Field()
tag = scrapy.Field()
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domains = ["raleigh.craigslist.org"]
start_urls = [
"http://raleigh.craigslist.org/search/bab"
]
BASE_URL = 'http://raleigh.craigslist.org/'
def parse(self, response):
links = response.xpath('//a[#class="hdrlnk"]/#href').extract()
for link in links:
absolute_url = self.BASE_URL + link
yield scrapy.Request(absolute_url, callback=self.parse_attr)
def parse_attr(self, response):
match = re.search(r"(\w+)\.html", response.url)
if match:
item_id = match.group(1)
url = self.BASE_URL + "reply/ral/bab/" + item_id
item = DmozItem()
item["link"] = response.url
item["title"] = "".join(response.xpath("//span[#class='postingtitletext']//text()").extract())
item["tag"] = "".join(response.xpath("//p[#class='attrgroup']/span/b/text()").extract()[0])
return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)
def parse_contact(self, response):
item = response.meta['item']
item["attr"] = "".join(response.xpath("//div[#class='anonemail']//text()").extract())
return item
This is what CloseSpider extension and CLOSESPIDER_ITEMCOUNT setting were made for:
An integer which specifies a number of items. If the spider scrapes
more than that amount if items and those items are passed by the item
pipeline, the spider will be closed with the reason
closespider_itemcount. If zero (or non set), spiders won’t be closed
by number of passed items.
I tried alecxe answer but I had to combine all 3 limits to make it work, so leaving it here just in case someone else is having the same issue:
class GenericWebsiteSpider(scrapy.Spider):
"""This generic website spider extracts text from websites"""
name = "generic_website"
custom_settings = {
'CLOSESPIDER_PAGECOUNT': 15,
'CONCURRENT_REQUESTS': 15,
'CLOSESPIDER_ITEMCOUNT': 15
}
...

Scrapy pipeline to export csv file in the right format

I made the improvement according to the suggestion from alexce below. What I need is like the picture below. However each row/line should be one review: with date, rating, review text and link.
I need to let item processor process each review of every page.
Currently TakeFirst() only takes the first review of the page. So 10 pages, I only have 10 lines/rows as in the picture below.
Spider code is below:
import scrapy
from amazon.items import AmazonItem
class AmazonSpider(scrapy.Spider):
name = "amazon"
allowed_domains = ['amazon.co.uk']
start_urls = [
'http://www.amazon.co.uk/product-reviews/B0042EU3A2/'.format(page) for page in xrange(1,114)
]
def parse(self, response):
for sel in response.xpath('//*[#id="productReviews"]//tr/td[1]'):
item = AmazonItem()
item['rating'] = sel.xpath('div/div[2]/span[1]/span/#title').extract()
item['date'] = sel.xpath('div/div[2]/span[2]/nobr/text()').extract()
item['review'] = sel.xpath('div/div[6]/text()').extract()
item['link'] = sel.xpath('div/div[7]/div[2]/div/div[1]/span[3]/a/#href').extract()
yield item
I started from scratch and the following spider should be run with
scrapy crawl amazon -t csv -o Amazon.csv --loglevel=INFO
so that opening the CSV-File with a spreadsheet shows for me
Hope this helps :-)
import scrapy
class AmazonItem(scrapy.Item):
rating = scrapy.Field()
date = scrapy.Field()
review = scrapy.Field()
link = scrapy.Field()
class AmazonSpider(scrapy.Spider):
name = "amazon"
allowed_domains = ['amazon.co.uk']
start_urls = ['http://www.amazon.co.uk/product-reviews/B0042EU3A2/' ]
def parse(self, response):
for sel in response.xpath('//table[#id="productReviews"]//tr/td/div'):
item = AmazonItem()
item['rating'] = sel.xpath('./div/span/span/span/text()').extract()
item['date'] = sel.xpath('./div/span/nobr/text()').extract()
item['review'] = sel.xpath('./div[#class="reviewText"]/text()').extract()
item['link'] = sel.xpath('.//a[contains(.,"Permalink")]/#href').extract()
yield item
xpath_Next_Page = './/table[#id="productReviews"]/following::*//span[#class="paging"]/a[contains(.,"Next")]/#href'
if response.xpath(xpath_Next_Page):
url_Next_Page = response.xpath(xpath_Next_Page).extract()[0]
request = scrapy.Request(url_Next_Page, callback=self.parse)
yield request
If using -t csv (as proposed by Frank in comments) does not work for you for some reason, you can always use built-in CsvItemExporter directly in the custom pipeline, e.g.:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
class AmazonPipeline(object):
#classmethod
def from_crawler(cls, crawler):
pipeline = cls()
crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
return pipeline
def spider_opened(self, spider):
self.file = open('output.csv', 'w+b')
self.exporter = CsvItemExporter(self.file)
self.exporter.start_exporting()
def spider_closed(self, spider):
self.exporter.finish_exporting()
self.file.close()
def process_item(self, item, spider):
self.exporter.export_item(item)
return item
which you need to add to ITEM_PIPELINES:
ITEM_PIPELINES = {
'amazon.pipelines.AmazonPipeline': 300
}
Also, I would use an Item Loader with input and output processors to join the review text and replace new lines with spaces. Create an ItemLoader class:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, Join, MapCompose
class AmazonItemLoader(ItemLoader):
default_output_processor = TakeFirst()
review_in = MapCompose(lambda x: x.replace("\n", " "))
review_out = Join()
Then, use it to construct an Item:
def parse(self, response):
for sel in response.xpath('//*[#id="productReviews"]//tr/td[1]'):
loader = AmazonItemLoader(item=AmazonItem(), selector=sel)
loader.add_xpath('rating', './/div/div[2]/span[1]/span/#title')
loader.add_xpath('date', './/div/div[2]/span[2]/nobr/text()')
loader.add_xpath('review', './/div/div[6]/text()')
loader.add_xpath('link', './/div/div[7]/div[2]/div/div[1]/span[3]/a/#href')
yield loader.load_item()

scrapy get start_urls from database

I have a question want to ask
If I sant to select the link from database and let it be the start_urls
What can I do ??
assum my database save these link and title
1: link : 'http://test.com/id=1' , title : 'math'
2: link : 'http://test.com/id=2' , title : 'english'
3: link : 'http://test.com/id=30' , title : 'sports'
and if I can get the title as a parameter in my spider would be better.
Here is my thought:
class MySpider(Spider):
name = "linktest"
for obj in Learning.objects.all():
print obj.link
#result: http://test.com/id=1
http://test.com/id=2
http://test.com/id=30
start_urls =[ 'http://test.com/id=1', #how to do this?
'http://test.com/id=2',
'http://test.com/id=30' ]
def parse(self, response):
#item['title']=math #when response.url == http://test.com/id=1,
#item['title']=english #when response.url == http://test.com/id=2,...
You need to override start_requests() method and yield/return Request instances from it:
This method must return an iterable with the first Requests to crawl
for this spider.
This is the method called by Scrapy when the spider is opened for
scraping when no particular URLs are specified.
class MySpider(Spider):
name = "linktest"
def start_requests(self):
for obj in Learning.objects.all():
yield Request(obj.link)
In order to pass an obj.title to the callback, use meta attribute:
class MySpider(Spider):
name = "linktest"
def start_requests(self):
for obj in Learning.objects.all():
yield Request(obj.link, meta={'title': obj.title})
def parse(self, response):
print response.url, response.meta['title']

Categories