Scrapy Spider does not enter parse_item method using SgmlLinkExtractor Rule - python

I am making a crawler to crawl a website recursively, but the problem is the spider does not enter the parse_item method. The name of my spider is example.py. The code is given below:
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.http.request import Request
from scrapy.utils.response import get_base_url


class CrawlSpider(CrawlSpider):
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    print allowed_domains
    start_urls = [
        "http://www.dmoz.org/Arts/"
    ]
    print start_urls

    rules = (
        Rule(SgmlLinkExtractor(allow=('/Arts', )), callback='parse_item', follow=True),
    )

    # The spider is not entering this parse_item method
    def parse_item(self, response):
        print "hello parse"
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        print title

Why are you trying to define and call a function explicitly?
Try this:
class CrawlSpider(CrawlSpider):
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    print allowed_domains
    start_urls = ["http://www.dmoz.org/Arts/"]

    def parse(self, response):
        print "hello parse"
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        print title
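If you want to keep the Rule / parse_item approach instead of overriding parse, a sketch of what that can look like on a newer Scrapy install (1.0+, where the scrapy.contrib imports are replaced) is below; the class name ExampleSpider is chosen here only so that it does not shadow the imported CrawlSpider:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ExampleSpider(CrawlSpider):
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    start_urls = ["http://www.dmoz.org/Arts/"]

    # CrawlSpider uses parse() internally for rule processing,
    # so the rule callback must be a differently named method.
    rules = (
        Rule(LinkExtractor(allow=('/Arts', )), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        title = response.xpath('//title/text()').extract()
        self.logger.info('Title: %s', title)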

Related

Scrapy CrawlSpider unable to crawl first page

My problem is that the crawler crawls every page except the first one, and I can't understand why.
I'm sure there is no problem with my items.py or anything else.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from tutorial2.items import Tutorial2Item


class QuotesSpider(CrawlSpider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//li[@class="next"]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        for items in response.xpath('//div[@class="quote"]'):
            l = ItemLoader(item=Tutorial2Item(), selector=items)
            l.add_xpath('text', 'span[@class="text"]/text()')
            l.add_xpath('author', 'span/small[@class="author"]/text()')
            l.add_xpath('author_link', 'span/a/@href')
            l.add_xpath('tags', 'div[@class="tags"]/a[@class="tag"]/text()')
            l.add_xpath('tags_link', 'div[@class="tags"]/a[@class="tag"]/@href')
            yield l.load_item()
Just implement parse_start_url so it hands the start page response to your callback; see the CrawlSpider documentation for details:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from tutorial2.items import Tutorial2Item


class QuotesSpider(CrawlSpider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//li[@class="next"]'), callback='parse_item', follow=True),
    )

    # Responses for start_urls go to parse_start_url, not to the rule callbacks,
    # so forward the first page to parse_item here.
    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        for items in response.xpath('//div[@class="quote"]'):
            l = ItemLoader(item=Tutorial2Item(), selector=items)
            l.add_xpath('text', 'span[@class="text"]/text()')
            l.add_xpath('author', 'span/small[@class="author"]/text()')
            l.add_xpath('author_link', 'span/a/@href')
            l.add_xpath('tags', 'div[@class="tags"]/a[@class="tag"]/text()')
            l.add_xpath('tags_link', 'div[@class="tags"]/a[@class="tag"]/@href')
            yield l.load_item()
This may also be a JavaScript rendering issue; try using Splash, as discussed here.
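A minimal sketch of that approach, assuming the scrapy-splash package is installed and a Splash instance is running at localhost:8050 (both assumptions, not part of the original answer); quotes.toscrape.com/js is used here purely as an illustrative JS-rendered page:

# settings.py -- standard scrapy-splash wiring
SPLASH_URL = 'http://localhost:8050'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

# spider -- render the page in Splash before parsing it
import scrapy
from scrapy_splash import SplashRequest


class QuotesJsSpider(scrapy.Spider):
    name = 'quotes_js'

    def start_requests(self):
        yield SplashRequest('http://quotes.toscrape.com/js/', self.parse,
                            args={'wait': 0.5})

    def parse(self, response):
        for quote in response.xpath('//div[@class="quote"]'):
            yield {'text': quote.xpath('span[@class="text"]/text()').get()}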

How to use Scrapy for URL crawling

I want to crawl the link https://www.aparat.com/.
I crawl it correctly and get all the video links under the header tag, like this:
import scrapy


class BlogSpider(scrapy.Spider):
    name = 'aparatspider'
    start_urls = ['https://www.aparat.com/']

    def parse(self, response):
        print '=' * 80, 'latest-trend :'
        ul5 = response.css('.block-grid.xsmall-block-grid-2.small-block-grid-3.medium-block-grid-4.large-block-grid-5.is-not-center')
        ul5 = ul5.css('ul').css('li')
        latesttrend = []
        for li5 in ul5:
            latesttrend.append(li5.xpath('div/div[1]/a').xpath('@onmousedown').extract_first().encode('utf8'))
        print(latesttrend)
Now my question is this:
How can I get all the links from the داغ ترین ها ("hottest") tag, which has more than 1000 of them? Currently I get only about 60.
I did this with the following code:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request


class aparat_hotnewsItem(scrapy.Item):
    videourl = scrapy.Field()


class aparat_hotnewsSpider(CrawlSpider):
    name = 'aparat_hotnews'
    allowed_domains = ['www.aparat.com']
    start_urls = ['http://www.aparat.com/']

    # XPath for selecting links to follow
    xp = 'your xpath'

    rules = (
        Rule(LinkExtractor(restrict_xpaths=xp), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = aparat_hotnewsItem()
        item['videourl'] = response.xpath('your xpath').extract()
        yield item
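If the category listing is paginated, another option is to request the listing pages directly rather than relying on a single LinkExtractor pass. This is only a sketch: the page URL pattern and the "/v/" link test below are hypothetical and need to be checked against the real site.

import scrapy


class AparatHotListSpider(scrapy.Spider):
    name = 'aparat_hotlist'
    allowed_domains = ['www.aparat.com']
    # Hypothetical paginated category URL; replace with the real one.
    page_url = 'https://www.aparat.com/hottest/page/{}'

    def start_requests(self):
        # Walk the first 20 listing pages; widen the range once the pattern is confirmed.
        for page in range(1, 21):
            yield scrapy.Request(self.page_url.format(page), callback=self.parse)

    def parse(self, response):
        # Hypothetical test for video links; adjust to the markup you actually see.
        for href in response.xpath('//a[contains(@href, "/v/")]/@href').extract():
            yield {'videourl': response.urljoin(href)}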

How to recursively crawl whole website using scrapy

I want to crawl the complete website using Scrapy, but right now it is only crawling a single page.
import scrapy
from scrapy.http import HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.exporter import JsonItemExporter


class IzodspiderSpider(scrapy.Spider):
    name = 'izodspider'
    allowed_domains = ['izod.com']
    start_urls = ['http://izod.com/']
    rules = [Rule(SgmlLinkExtractor(), callback='parse_item', follow=True)]

    def parse(self, response):
        hxs = scrapy.Selector(response)
        meta = hxs.xpath('//meta[@name=\'description\']/@content').extract()
        name = hxs.xpath('//div[@id=\'product-details\']/h5').extract()
        desc = hxs.xpath('//div[@id=\'product-details\']/p').extract()
Is there any way to extract meta tags using Portia?
There is an error in the rule definition and in the callback.
The rule's callback points to parse_item, so that is the method you have to define and use instead of parse.
You can find more information about callback functions in the documentation: http://doc.scrapy.org/en/latest/topics/request-response.html?highlight=callback#topics-request-response-ref-request-callback-arguments
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class IzodspiderSpider(CrawlSpider):
    name = "izod"
    depth_limit = 0
    bot_name = 'izod'
    allowed_domains = ['izod.com']
    start_urls = ['http://www.izod.com']

    rules = (
        Rule(SgmlLinkExtractor(allow=('')), callback='parse_items', follow=True),
    )

    def parse_items(self, response):
        hxs = scrapy.Selector(response)
        meta = hxs.xpath('//meta[@name=\'description\']/@content').extract()
        name = hxs.xpath('//div[@id=\'product-details\']/h5').extract()
        desc = hxs.xpath('//div[@id=\'product-details\']/p').extract()

Get RSS links given a domain

I have a file which has a list of domains. I need to crawl each domain (i.e. the whole website) to get RSS links: recursively crawl each page of the website, collect the RSS links from each page, and write them to a JSON file corresponding to the domain. This is my code for just one website:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector


class AppleSpider(CrawlSpider):
    name = 'apple'
    allowed_domains = ['apple.com']
    start_urls = ['http://apple.com']
    #rules = [Rule(SgmlLinkExtractor(allow=()), follow=True, callback='parse_item')]

    def parse_item(self, response):
        sel = HtmlXPathSelector(response)
        sites = sel.select('/html/head/link[@type=application/rss+xml]/@href').extract()
        #items = []
        item = AppleItem()
        item['reference_link'] = response.url
        item['rss_link'] = sites
        #items.append(item)
        return item
I tried running:
scrapy crawl apple -o items.json -t json
but items.json only contains an opening bracket [.
This is my items.py file:
from scrapy.item import Item, Field


class AppleItem(Item):
    reference_link = Field()
    rss_link = Field()
Your XPath expression needs to have quotes around the "application/rss+xml" test value.
Try something like:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field


class AppleItem(Item):
    reference_link = Field()
    rss_link = Field()


class AppleSpider(CrawlSpider):
    name = 'apple'
    allowed_domains = ['apple.com']
    start_urls = ['http://apple.com']
    rules = [Rule(SgmlLinkExtractor(allow=()), follow=True, callback='parse_item')]

    def parse_item(self, response):
        sel = HtmlXPathSelector(response)
        rsslinks = sel.select('/html/head/link[@type="application/rss+xml"]/@href').extract()
        #items = []
        item = AppleItem()
        item['reference_link'] = response.url
        item['rss_link'] = rsslinks
        #items.append(item)
        return item
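For the multi-domain part of the question, here is a sketch written against the newer scrapy.spiders / scrapy.linkextractors imports (an assumption: a Scrapy 1.0+ install). The file name domains.txt is hypothetical, and splitting the output into one JSON file per domain would still need a pipeline or a post-processing step that this sketch leaves out:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class RssSpider(CrawlSpider):
    name = 'rss'
    rules = [Rule(LinkExtractor(), follow=True, callback='parse_item')]

    def __init__(self, domains_file='domains.txt', *args, **kwargs):
        super(RssSpider, self).__init__(*args, **kwargs)
        # domains.txt (hypothetical): one domain per line, e.g. "apple.com"
        with open(domains_file) as f:
            domains = [line.strip() for line in f if line.strip()]
        self.allowed_domains = domains
        self.start_urls = ['http://%s' % d for d in domains]

    def parse_item(self, response):
        rsslinks = response.xpath('//link[@type="application/rss+xml"]/@href').extract()
        yield {'reference_link': response.url, 'rss_link': rsslinks}

You would run it with something like scrapy crawl rss -a domains_file=domains.txt -o rss.json and then group the results by domain afterwards.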

Scrapy SgmlLinkExtractor Add an Arbitrary URL

How do I add a url to SgmlLinkExtractor? That is, how do I add an arbitrary url to run the callback on?
To elaborate, using dirbot as an example: https://github.com/scrapy/dirbot/blob/master/dirbot/spiders/googledir.py
parse_category only receives responses for links that match the SgmlLinkExtractor: SgmlLinkExtractor(allow='directory.google.com/[A-Z][a-zA-Z_/]+$')
Use BaseSpider instead of CrawlSpider, and add the URL to start_requests or to the start_urls list:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector


class MySpider(BaseSpider):
    name = "myspider"

    def start_requests(self):
        return [Request("https://www.example.com",
                        callback=self.parse)]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        ...
class ThemenHubSpider(CrawlSpider):
    name = 'themenHub'
    allowed_domains = ['themen.t-online.de']
    start_urls = ["http://themen.t-online.de/themen-a-z/a"]
    rules = [Rule(SgmlLinkExtractor(allow=['id_\d+']), 'parse_news')]
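If you want to keep the CrawlSpider and its rules and still run a callback on an arbitrary URL that the rules would not match, one option (a sketch, not part of the original answer; the extra URL below is hypothetical) is to add that URL to start_urls and override parse_start_url so its response is handed to the same callback:

class ThemenHubSpider(CrawlSpider):
    name = 'themenHub'
    allowed_domains = ['themen.t-online.de']
    start_urls = ["http://themen.t-online.de/themen-a-z/a",
                  "http://themen.t-online.de/some-arbitrary-page"]  # hypothetical extra URL
    rules = [Rule(SgmlLinkExtractor(allow=['id_\d+']), 'parse_news')]

    # Responses for start_urls bypass the rules, so hand them to the callback here.
    def parse_start_url(self, response):
        return self.parse_news(response)

    def parse_news(self, response):
        # ... extract the news data here ...
        pass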
