Scrapy CrawlSpider unable to crawl first page - python

My problem is that the crawler crawls every page except the first one, and I can't understand why.
I'm sure there is no problem with my items.py or anything else.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from tutorial2.items import Tutorial2Item

class QuotesSpider(CrawlSpider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//li[@class="next"]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        for items in response.xpath('//div[@class="quote"]'):
            l = ItemLoader(item=Tutorial2Item(), selector=items)
            l.add_xpath('text', 'span[@class="text"]/text()')
            l.add_xpath('author', 'span/small[@class="author"]/text()')
            l.add_xpath('author_link', 'span/a/@href')
            l.add_xpath('tags', 'div[@class="tags"]/a[@class="tag"]/text()')
            l.add_xpath('tags_link', 'div[@class="tags"]/a[@class="tag"]/@href')
            yield l.load_item()

Just implement parse_start_url so the response from the start URL also goes through your callback (see parse_start_url in the CrawlSpider documentation for details):
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from tutorial2.items import Tutorial2Item

class QuotesSpider(CrawlSpider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//li[@class="next"]'), callback='parse_item', follow=True),
    )

    def parse_start_url(self, response):
        # CrawlSpider does not apply the rule callback to the start URLs,
        # so handle the first page here explicitly.
        return self.parse_item(response)

    def parse_item(self, response):
        for items in response.xpath('//div[@class="quote"]'):
            l = ItemLoader(item=Tutorial2Item(), selector=items)
            l.add_xpath('text', 'span[@class="text"]/text()')
            l.add_xpath('author', 'span/small[@class="author"]/text()')
            l.add_xpath('author_link', 'span/a/@href')
            l.add_xpath('tags', 'div[@class="tags"]/a[@class="tag"]/text()')
            l.add_xpath('tags_link', 'div[@class="tags"]/a[@class="tag"]/@href')
            yield l.load_item()

This may be due to a JS rendering issue; try using Splash, as discussed here.
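If the page really is JS-rendered, a minimal sketch with scrapy-splash could look like the following. This assumes scrapy-splash is installed, a Splash instance is running at localhost:8050, and its middlewares are enabled in settings.py per the scrapy-splash README; the spider name and the /js/ variant of the site are illustrative only:
import scrapy
from scrapy_splash import SplashRequest

class QuotesJsSpider(scrapy.Spider):
    name = 'quotes_js'
    start_urls = ['http://quotes.toscrape.com/js/']

    def start_requests(self):
        for url in self.start_urls:
            # Ask Splash to render the page before it reaches the callback
            yield SplashRequest(url, self.parse, args={'wait': 0.5})

    def parse(self, response):
        for quote in response.xpath('//div[@class="quote"]'):
            yield {
                'text': quote.xpath('span[@class="text"]/text()').extract_first(),
                'author': quote.xpath('span/small[@class="author"]/text()').extract_first(),
            }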

Related

How to use Scrapy for URL crawling

I want to crawl the link https://www.aparat.com/.
I crawl it correctly and get all the video links within the header tag, like this:
import scrapy

class BlogSpider(scrapy.Spider):
    name = 'aparatspider'
    start_urls = ['https://www.aparat.com/']

    def parse(self, response):
        print '=' * 80, 'latest-trend :'
        ul5 = response.css('.block-grid.xsmall-block-grid-2.small-block-grid-3.medium-block-grid-4.large-block-grid-5.is-not-center')
        ul5 = ul5.css('ul').css('li')
        latesttrend = []
        for li5 in ul5:
            latesttrend.append(li5.xpath('div/div[1]/a').xpath('@onmousedown').extract_first().encode('utf8'))
        print(latesttrend)
Now my question is this:
How can I get all the links from the داغ ترین ها ("hottest") tag, which has more than 1000 of them? Currently I get only 60, more or less.
I did this with the following code:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request

class aparat_hotnewsItem(scrapy.Item):
    videourl = scrapy.Field()

class aparat_hotnewsSpider(CrawlSpider):
    name = 'aparat_hotnews'
    allowed_domains = ['www.aparat.com']
    start_urls = ['http://www.aparat.com/']

    # Xpath for selecting links to follow
    xp = 'your xpath'

    rules = (
        Rule(LinkExtractor(restrict_xpaths=xp), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = aparat_hotnewsItem()
        item['videourl'] = response.xpath('your xpath').extract()
        yield item
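For illustration only, here is a hedged sketch of how those placeholders might be filled in; the XPaths below are hypothetical and would have to be replaced with the real selectors for the pagination links and video anchors on aparat.com:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class AparatHotnewsItem(scrapy.Item):
    videourl = scrapy.Field()

class AparatHotnewsSpider(CrawlSpider):
    name = 'aparat_hotnews_sketch'
    allowed_domains = ['aparat.com']
    start_urls = ['https://www.aparat.com/']

    rules = (
        # Hypothetical XPath: restrict the extractor to the pagination block
        # of the listing so the spider keeps following "next page" links.
        Rule(LinkExtractor(restrict_xpaths='//div[@class="pagination"]'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Hypothetical XPath: adjust to the real markup of the video anchors.
        for href in response.xpath('//h2/a/@href').extract():
            item = AparatHotnewsItem()
            item['videourl'] = response.urljoin(href)
            yield item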

How to recursively crawl whole website using scrapy

I want to crawl the complete website using scrapy, but right now it's only crawling a single page.
import scrapy
from scrapy.http import HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.exporter import JsonItemExporter

class IzodspiderSpider(scrapy.Spider):
    name = 'izodspider'
    allowed_domains = ['izod.com']
    start_urls = ['http://izod.com/']
    rules = [Rule(SgmlLinkExtractor(), callback='parse_item', follow=True)]

    def parse(self, response):
        hxs = scrapy.Selector(response)
        meta = hxs.xpath('//meta[@name=\'description\']/@content').extract()
        name = hxs.xpath('//div[@id=\'product-details\']/h5').extract()
        desc = hxs.xpath('//div[@id=\'product-details\']/p').extract()
Is there any way to extract meta tags using Portia?
There is an error in the rule definition and in the callback.
Since the parsing method you use is parse_items, you have to reference it in the callback instead of overriding parse.
You can find more information about the callback function in the documentation here: http://doc.scrapy.org/en/latest/topics/request-response.html?highlight=callback#topics-request-response-ref-request-callback-arguments
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class IzodspiderSpider(CrawlSpider):
    name = "izod"
    depth_limit = 0
    bot_name = 'izod'
    allowed_domains = ['izod.com']
    start_urls = ['http://www.izod.com']

    rules = (
        Rule(SgmlLinkExtractor(allow=('')), callback='parse_items', follow=True),
    )

    def parse_items(self, response):
        hxs = scrapy.Selector(response)
        meta = hxs.xpath('//meta[@name=\'description\']/@content').extract()
        name = hxs.xpath('//div[@id=\'product-details\']/h5').extract()
        desc = hxs.xpath('//div[@id=\'product-details\']/p').extract()
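Note that parse_items above only assigns the extracted values to local variables, so nothing reaches the item pipelines or feed exports. A minimal sketch of one way to finish it, assuming Scrapy 1.0+ where spiders may yield plain dicts:
    def parse_items(self, response):
        hxs = scrapy.Selector(response)
        # Yield the scraped fields so they appear in the crawl output / exports
        yield {
            'meta': hxs.xpath("//meta[@name='description']/@content").extract(),
            'name': hxs.xpath("//div[@id='product-details']/h5//text()").extract(),
            'desc': hxs.xpath("//div[@id='product-details']/p//text()").extract(),
        }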

Scrapy Spider does not enter parse_item method using SgmlLinkExtractor Rule

I am making a crawler to crawl the website recursively, but the problem is that the spider does not enter the parse_item method. The name of my spider file is example.py. The code is given below:
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.http.request import Request
from scrapy.utils.response import get_base_url

class CrawlSpider(CrawlSpider):
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    print allowed_domains
    start_urls = [
        "http://www.dmoz.org/Arts/"
    ]
    print start_urls

    rules = (
        Rule(SgmlLinkExtractor(allow=('/Arts', )), callback='parse_item', follow=True),
    )

    # The spider is not entering this parse_item method
    def parse_item(self, response):
        print "hello parse"
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        print title
Why are you trying to define and call a function explicitly?
Try this:
class CrawlSpider(CrawlSpider):
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    print allowed_domains
    start_urls = ["http://www.dmoz.org/Arts/"]

    def parse(self, response):
        print "hello parse"
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        print title

Constructing a regular expression for url in start_urls list in scrapy framework python

I am very new to Scrapy and I haven't used regular expressions before.
The following is my spider.py code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class ExampleSpider(BaseSpider):
    name = "test_code"
    allowed_domains = ["www.example.com"]
    start_urls = [
        "http://www.example.com/bookstore/new/1?filter=bookstore",
        "http://www.example.com/bookstore/new/2?filter=bookstore",
        "http://www.example.com/bookstore/new/3?filter=bookstore",
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
Now if we look at start_urls, all three URLs are the same except that they differ in the integer value 2?, 3? and so on, without limit, depending on the URLs present on the site. I know that we can use CrawlSpider and construct a regular expression for the URL like below,
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import re

class ExampleSpider(CrawlSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = [
        "http://www.example.com/bookstore/new/1?filter=bookstore",
        "http://www.example.com/bookstore/new/2?filter=bookstore",
        "http://www.example.com/bookstore/new/3?filter=bookstore",
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=(........),))),
    )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
Can you please guide me on how to construct a CrawlSpider Rule for the above start_urls list?
If I understand you correctly, you want a lot of start URLs with a certain pattern.
If so, you can override the BaseSpider.start_requests method:
class ExampleSpider(BaseSpider):
    name = "test_code"
    allowed_domains = ["www.example.com"]

    def start_requests(self):
        for i in xrange(1000):
            yield self.make_requests_from_url("http://www.example.com/bookstore/new/%d?filter=bookstore" % i)

    ...
If you are using CrawlSpider, it's usually not a good idea to override the parse method.
The Rule object can filter the URLs you are interested in from the ones you do not care about.
See CrawlSpider in the docs for reference.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import re

class ExampleSpider(CrawlSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/bookstore']

    rules = (
        Rule(SgmlLinkExtractor(allow=('\/new\/[0-9]\?',)), callback='parse_bookstore'),
    )

    def parse_bookstore(self, response):
        hxs = HtmlXPathSelector(response)

Parsing does not work in scrapy

It does not collect data from the title. I followed the sample, but it still refuses to work. Here is my code:
toster.py:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from toster.items import DjangoItem

class DjangoSpider(CrawlSpider):
    name = "django"
    allowed_domains = ["www.toster.ru"]
    start_urls = [
        'http://www.toster.ru/tag/django/questions',
    ]

    rules = [
        Rule(LinkExtractor(
            allow=['/tag/django/questions\?page=\d']),
            callback='parse_item',
            follow=True)
    ]

    def parse_item(self, response):
        selector_list = response.css('div.thing')
        for selector in selector_list:
            item = DjangoItem()
            item['title'] = selector.xpath('div/h2/a/text()').extract()
            yield item
any help?
Multiple issues in your code:
- remove www. from the allowed_domains
- fix your regular expression inside the LinkExtractor - replace \d with \d+
- set unique=False to let Scrapy handle the pagination pages
- fix your extracting logic in parse_item() - for instance, there are no elements with the thing class on these pages
Fixed version (works for me):
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from toster.items import DjangoItem

class DjangoSpider(CrawlSpider):
    name = "django"
    allowed_domains = ["toster.ru"]
    start_urls = [
        'http://www.toster.ru/tag/django/questions',
    ]

    rules = [
        Rule(LinkExtractor(
            allow=['/tag/django/questions\?page=\d+'], unique=False),
            callback='parse_item',
            follow=True,)
    ]

    def parse_item(self, response):
        selector_list = response.css('div.question__content')
        for selector in selector_list:
            item = DjangoItem()
            item['title'] = selector.css('a.question__title-link[itemprop=url]::text').extract_first().strip()
            yield item
