I am trying to crawl next page links using crawlspider but I am not able to get any result if I change parse function to something else. My Rule is not working.I am able to fetch only current page with parse function.Where am I going wrong.
This is my naukri_spider.py file
import scrapy
from scrapy import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from naukri.items import NaukriItem
class NaukriSpider(Spider):
name = "naukri"
allowed_domains = ["naukri.com"]
start_urls = ["http://www.naukri.com/information-technology-jobs?xt=catsrch&qf[]=24"]
rules = (
Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[#class="pagination"]/a/button[#class="grayBtn"]',)), callback="parse", follow= True),
)
def parse(self,response):
for sel in response.xpath('//*[#class="content"]'):
item = NaukriItem()
item['title'] = sel.xpath('span[#class="desig"]/text()').extract()
item['location'] = sel.xpath('span[#class="loc"]/span/text()').extract()
item['organization'] = sel.xpath('span[#class="org"]/text()').extract()
yield item
The parse method implemeted by CrawlSpider used to follow links. Change your rule callback to parse_start_url and override it.
This code works fine.
import scrapy
from scrapy import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from stackoverflow.items import NaukriItem
class NaukriSpider(CrawlSpider):
name = "naukri"
allowed_domains = ["naukri.com"]
start_urls = ["http://www.naukri.com/information-technology-jobs?xt=catsrch&qf[]=24"]
rules = (
Rule(SgmlLinkExtractor(allow=('information-technology-jobs.*', )), callback="parse_start_url", follow= True),
)
def parse_start_url(self,response):
for sel in response.xpath('//*[#class="content"]'):
item = NaukriItem()
item['title'] = sel.xpath('span[#class="desig"]/text()').extract()
item['location'] = sel.xpath('span[#class="loc"]/span/text()').extract()
item['organization'] = sel.xpath('span[#class="org"]/text()').extract()
yield item
Related
so im' trying to scrape the website in the SgmlLinkExtractor parameters below website with scrapy, and this is what my spider looks like:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from desidime_sample.items import DesidimeItem
import string
class DesidimeSpider(CrawlSpider):
name = "desidime"
allowed_domains = ["desidime.com"]
start_urls = ["http://www.desidime.com/forums/hot-deals-online"]
rules = (
Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('''//td[not(#*)]/div
[not(#*)]/a[not(#class)]/#href''')), callback="parse_items", follow=True),
)
def parse_items(self, response):
hxs = HtmlXPathSelector(response)
deals = hxs.select('''//div[#class='user-comment-text'][1]''')
items = []
for deals in deals:
item = DesidimeItem()
item["deal"] = deals.select("//div[#class='user-comment-text'][1]/p/text()").extract()
item["link"] = deals.select("//div[#class='user-comment-text'][1]/p[1]/a[1]/#href").extract()
items.append(item)
return items
It should be quite obvious what I'm trying to do, but for some reason when I tell the spider to crawl and export the text and links to the CVS file, I end up with:
link,deal http://wwww.facebook.com/desidime,
http://wwww.facebook.com/desidime,
(same thing for many more lines, then:)
",,"
, " same url" ,
(same thing for many more lines, then:)
"link,deals"
So, can anyone tell me what the problem is? If you run each of my above xpaths as reponse.xpath("xpath").extract() after scrapy shell "//corresponingcrawlruleurl", you'll get the right results.
The problem is inside the parse_items callback. When you iterate over the deals, the deal context-specific locators have to be relative. In other words, start your XPath expressions inside the loop with a dot:
def parse_items(self, response):
for deal in response.xpath("//div[#class='user-comment-text'][1]"):
item = DesidimeItem()
item["deal"] = deal.xpath(".//p/text()").extract()
item["link"] = deal.xpath(".//p[1]/a[1]/#href").extract()
yield item
(note that I've also simplified the code).
Here is the complete spider, I'm executing (it does scrape the text and links, though I don't know what is your desired output):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class DesidimeItem(scrapy.Item):
deal = scrapy.Field()
link = scrapy.Field()
class DesidimeSpider(CrawlSpider):
name = "desidime"
allowed_domains = ["desidime.com"]
start_urls = ["http://www.desidime.com/forums/hot-deals-online"]
rules = [
Rule(LinkExtractor(restrict_xpaths="//td[not(#*)]/div[not(#*)]/a[not(#class)]"),
callback="parse_items",
follow=True),
]
def parse_items(self, response):
for deal in response.xpath("//div[#class='user-comment-text'][1]"):
item = DesidimeItem()
item["deal"] = deal.xpath(".//p/text()").extract()
item["link"] = deal.xpath(".//p[1]/a[1]/#href").extract()
yield item
I want to get all external links from a given website using Scrapy. Using the following code the spider crawls external links as well:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from myproject.items import someItem
class someSpider(CrawlSpider):
name = 'crawltest'
allowed_domains = ['someurl.com']
start_urls = ['http://www.someurl.com/']
rules = (Rule (LinkExtractor(), callback="parse_obj", follow=True),
)
def parse_obj(self,response):
item = someItem()
item['url'] = response.url
return item
What am I missing? Doesn't "allowed_domains" prevent the external links to be crawled? If I set "allow_domains" for LinkExtractor it does not extract the external links. Just to clarify: I wan't to crawl internal links but extract external links. Any help appriciated!
You can also use the link extractor to pull all the links once you are parsing each page.
The link extractor will filter the links for you. In this example the link extractor will deny links in the allowed domain so it only gets outside links.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LxmlLinkExtractor
from myproject.items import someItem
class someSpider(CrawlSpider):
name = 'crawltest'
allowed_domains = ['someurl.com']
start_urls = ['http://www.someurl.com/']
rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)
def parse_obj(self,response):
for link in LxmlLinkExtractor(allow=(),deny = self.allowed_domains).extract_links(response):
item = someItem()
item['url'] = link.url
An updated code based on 12Ryan12's answer,
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.item import Item, Field
class MyItem(Item):
url= Field()
class someSpider(CrawlSpider):
name = 'crawltest'
allowed_domains = ['someurl.com']
start_urls = ['http://www.someurl.com/']
rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)
def parse_obj(self,response):
item = MyItem()
item['url'] = []
for link in LxmlLinkExtractor(allow=(),deny = self.allowed_domains).extract_links(response):
item['url'].append(link.url)
return item
A solution would be make usage a process_link function in the SgmlLinkExtractor
Documentation here http://doc.scrapy.org/en/latest/topics/link-extractors.html
class testSpider(CrawlSpider):
name = "test"
bot_name = 'test'
allowed_domains = ["news.google.com"]
start_urls = ["https://news.google.com/"]
rules = (
Rule(SgmlLinkExtractor(allow_domains=()), callback='parse_items',process_links="filter_links",follow= True) ,
)
def filter_links(self, links):
for link in links:
if self.allowed_domains[0] not in link.url:
print link.url
return links
def parse_items(self, response):
### ...
I want to crawl complete website using scrapy but right now its only crawling single page
import scrapy
from scrapy.http import HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.exporter import JsonItemExporter
class IzodspiderSpider(scrapy.Spider):
name = 'izodspider'
allowed_domains = ['izod.com']
start_urls = ['http://izod.com/']
rules = [Rule(SgmlLinkExtractor(), callback='parse_item', follow=True)]
def parse(self, response):
hxs = scrapy.Selector(response)
meta = hxs.xpath('//meta[#name=\'description\']/#content').extract()
name = hxs.xpath('//div[#id=\'product-details\']/h5').extract()
desc = hxs.xpath('//div[#id=\'product-details\']/p').extract()
is there any way to extract meta tags using portia ?
There is an error in the rule definition and inside the callback.
Since the parse function you use is parse_item you have to call it inside the callback instead of parse
You can find more information about the callback function on the documentation here http://doc.scrapy.org/en/latest/topics/request-response.html?highlight=callback#topics-request-response-ref-request-callback-arguments
class IzodspiderSpider(CrawlSpider):
name = "izod"
depth_limit= 0
bot_name = 'izod'
allowed_domains = ['izod.com']
start_urls = ['http://www.izod.com']
rules = (
Rule(SgmlLinkExtractor(allow=('')), callback='parse_items',follow= True),
)
def parse_items(self, response):
hxs = scrapy.Selector(response)
meta = hxs.xpath('//meta[#name=\'description\']/#content').extract()
name = hxs.xpath('//div[#id=\'product-details\']/h5').extract()
desc = hxs.xpath('//div[#id=\'product-details\']/p').extract()
sHey I've just started using scrapy and was trying it out on a website "diy.com" but i cant seem to get the CrawlSpider to follow links or scrape any data. I think it might be my regex but i cant see anything
any help will be appreciated
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from items import PartItem
class DIY_spider(CrawlSpider):
name = 'diy_cat'
allowed_domains = ['diy.com']
start_urls =[
"http://www.diy.com/nav/decor/tiles/wall-tiles"
]
rules = (
Rule(SgmlLinkExtractor(allow=(r'/(nav)/(decor)/(\w*)/(.*)(\d*)$', ),deny=(r'(.*)/(jsp)/(.*)')), callback='parse_item',follow = True),
def parse_items(self, response):
sel = Selector(response)
tests =[]
test = PartItem()
if sel.xpath('//*[#id="fullWidthContent"]/div[2]/dl/dd[1]/ul[1]/li[3]/text()') :
price = sel.xpath('//*[#id="fullWidthContent"]/div[2]/dl/dd[1]/ul[1]/li[3]/text()')
else:
price= sel.xpath('//dd[#class="item_cta"]/ul[#class="fright item_price"]/li/text()').extract()
if not price:
return test
return test
Besides, #Talvalin's note, you are not getting actual prices.
Try this version of parse_item:
def parse_item(self, response):
sel = Selector(response)
price_list = sel.xpath('//span[#class="onlyPrice"]/text()').extract()
for price in price_list:
if price:
item = PartItem()
item['price'] = price
yield item
Your rule states parse_item as the callback but the actual callback is named parse_items. Additionally, the indenting for the parse_items function is incorrect, but that could simply be a formatting issue when pasting the code in.
I try to do it with a CrawlSpider and this is the code but the spider didn't return a result (opened and closed after) :
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from torent.items import TorentItem
class MultiPagesSpider(CrawlSpider):
name = 'job'
allowed_domains = ['tanitjobs.com/']
start_urls = ['http://tanitjobs.com/browse-by-category/Nurse/?searchId=1393459812.065&action=search&page=1&view=list',]
rules = (
Rule (SgmlLinkExtractor(allow=('page=*',),restrict_xpaths=('//div[#class="pageNavigation"]',))
, callback='parse_item', follow= True),
)
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
items= hxs.select('//div[#class="offre"]/div[#class="detail"]')
scraped_items =[]
for item in items:
scraped_item = TorentItem()
scraped_item["title"] = item.select('a/strong/text()').extract()
scraped_items.append(scraped_item)
return items
What #paul t. said in the comment above, but additionally you need to return scraped_items rather than items otherwise you'll get a large number of errors that look like this:
2014-02-26 23:40:59+0000 [job] ERROR: Spider must return Request, BaseItem or None, got 'HtmlXPathSelector' in
<GET http://tanitjobs.com/browse-by-category/Nurse/?action=search&page=3&searchId=1393459812.065&view=list>