use CrawlSpider in scraping - python

I am trying to do this with a CrawlSpider, and this is the code, but the spider doesn't return any results (it opens and closes right away):
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from torent.items import TorentItem

class MultiPagesSpider(CrawlSpider):
    name = 'job'
    allowed_domains = ['tanitjobs.com/']
    start_urls = ['http://tanitjobs.com/browse-by-category/Nurse/?searchId=1393459812.065&action=search&page=1&view=list',]

    rules = (
        Rule(SgmlLinkExtractor(allow=('page=*',), restrict_xpaths=('//div[@class="pageNavigation"]',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = hxs.select('//div[@class="offre"]/div[@class="detail"]')
        scraped_items = []
        for item in items:
            scraped_item = TorentItem()
            scraped_item["title"] = item.select('a/strong/text()').extract()
            scraped_items.append(scraped_item)
        return items

What @paul t. said in the comment above, but additionally you need to return scraped_items rather than items; otherwise you'll get a large number of errors that look like this:
2014-02-26 23:40:59+0000 [job] ERROR: Spider must return Request, BaseItem or None, got 'HtmlXPathSelector' in
<GET http://tanitjobs.com/browse-by-category/Nurse/?action=search&page=3&searchId=1393459812.065&view=list>
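Putting both fixes together, a minimal corrected sketch of the spider could look like this (assuming @paul t.'s comment refers to the stray trailing slash in allowed_domains; everything else is left as in the question):

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from torent.items import TorentItem

class MultiPagesSpider(CrawlSpider):
    name = 'job'
    allowed_domains = ['tanitjobs.com']  # no trailing slash (assumed fix from the comment)
    start_urls = ['http://tanitjobs.com/browse-by-category/Nurse/?searchId=1393459812.065&action=search&page=1&view=list']

    rules = (
        Rule(SgmlLinkExtractor(allow=('page=*',), restrict_xpaths=('//div[@class="pageNavigation"]',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = hxs.select('//div[@class="offre"]/div[@class="detail"]')
        scraped_items = []
        for item in items:
            scraped_item = TorentItem()
            scraped_item["title"] = item.select('a/strong/text()').extract()
            scraped_items.append(scraped_item)
        return scraped_items  # return the items, not the selectors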

Related

Scrapy CrawlSpider unable to crawl first page

My problem is that the crawler crawls all pages except the first page, and I can't understand why.
I'm sure there is no problem with my items.py or anything else.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from tutorial2.items import Tutorial2Item

class QuotesSpider(CrawlSpider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//li[@class="next"]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        for items in response.xpath('//div[@class="quote"]'):
            l = ItemLoader(item=Tutorial2Item(), selector=items)
            l.add_xpath('text', 'span[@class="text"]/text()')
            l.add_xpath('author', 'span/small[@class="author"]/text()')
            l.add_xpath('author_link', 'span/a/@href')
            l.add_xpath('tags', 'div[@class="tags"]/a[@class="tag"]/text()')
            l.add_xpath('tags_link', 'div[@class="tags"]/a[@class="tag"]/@href')
            yield l.load_item()
Just implement parse_start_url so the start URL gets parsed as well (details url):
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from tutorial2.items import Tutorial2Item

class QuotesSpider(CrawlSpider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//li[@class="next"]'), callback='parse_item', follow=True),
    )

    def parse_start_url(self, response):
        # by default CrawlSpider only applies rule callbacks to extracted links,
        # so route the start URL's response to the same callback
        return self.parse_item(response)

    def parse_item(self, response):
        for items in response.xpath('//div[@class="quote"]'):
            l = ItemLoader(item=Tutorial2Item(), selector=items)
            l.add_xpath('text', 'span[@class="text"]/text()')
            l.add_xpath('author', 'span/small[@class="author"]/text()')
            l.add_xpath('author_link', 'span/a/@href')
            l.add_xpath('tags', 'div[@class="tags"]/a[@class="tag"]/text()')
            l.add_xpath('tags_link', 'div[@class="tags"]/a[@class="tag"]/@href')
            yield l.load_item()
This may be due to a JS rendering issue; try using Splash, as discussed here.
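If it is a rendering problem, a minimal sketch of wiring in scrapy-splash might look like the following (this assumes the scrapy-splash package is installed and a Splash instance is running at localhost:8050; the spider name and the /js/ start URL are only illustrative):

# settings.py
SPLASH_URL = 'http://localhost:8050'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

# spider: render each page in Splash before parsing it
import scrapy
from scrapy_splash import SplashRequest

class QuotesJsSpider(scrapy.Spider):
    name = 'quotes_js'
    start_urls = ['http://quotes.toscrape.com/js/']

    def start_requests(self):
        for url in self.start_urls:
            # give the JavaScript half a second to populate the page
            yield SplashRequest(url, self.parse_item, args={'wait': 0.5})

    def parse_item(self, response):
        for quote in response.xpath('//div[@class="quote"]'):
            yield {
                'text': quote.xpath('span[@class="text"]/text()').extract_first(),
                'author': quote.xpath('span/small[@class="author"]/text()').extract_first(),
            }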

Scrapy crawlspider rule not working for next pages

I am trying to crawl next-page links using CrawlSpider, but I am not able to get any results if I change the parse function to something else. My Rule is not working; I am only able to fetch the current page with the parse function. Where am I going wrong?
This is my naukri_spider.py file
import scrapy
from scrapy import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from naukri.items import NaukriItem

class NaukriSpider(Spider):
    name = "naukri"
    allowed_domains = ["naukri.com"]
    start_urls = ["http://www.naukri.com/information-technology-jobs?xt=catsrch&qf[]=24"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[@class="pagination"]/a/button[@class="grayBtn"]',)), callback="parse", follow=True),
    )

    def parse(self, response):
        for sel in response.xpath('//*[@class="content"]'):
            item = NaukriItem()
            item['title'] = sel.xpath('span[@class="desig"]/text()').extract()
            item['location'] = sel.xpath('span[@class="loc"]/span/text()').extract()
            item['organization'] = sel.xpath('span[@class="org"]/text()').extract()
            yield item
The parse method implemented by CrawlSpider is used internally to follow links, so you must not override it. Change your rule callback to parse_start_url and override that instead.
This code works fine:
import scrapy
from scrapy import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from stackoverflow.items import NaukriItem

class NaukriSpider(CrawlSpider):
    name = "naukri"
    allowed_domains = ["naukri.com"]
    start_urls = ["http://www.naukri.com/information-technology-jobs?xt=catsrch&qf[]=24"]

    rules = (
        Rule(SgmlLinkExtractor(allow=('information-technology-jobs.*', )), callback="parse_start_url", follow=True),
    )

    def parse_start_url(self, response):
        for sel in response.xpath('//*[@class="content"]'):
            item = NaukriItem()
            item['title'] = sel.xpath('span[@class="desig"]/text()').extract()
            item['location'] = sel.xpath('span[@class="loc"]/span/text()').extract()
            item['organization'] = sel.xpath('span[@class="org"]/text()').extract()
            yield item

scrapy spider code check

So I'm trying to scrape the website given in the SgmlLinkExtractor parameters below with Scrapy, and this is what my spider looks like:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from desidime_sample.items import DesidimeItem
import string

class DesidimeSpider(CrawlSpider):
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//td[not(@*)]/div[not(@*)]/a[not(@class)]/@href')), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        deals = hxs.select('''//div[@class='user-comment-text'][1]''')
        items = []
        for deals in deals:
            item = DesidimeItem()
            item["deal"] = deals.select("//div[@class='user-comment-text'][1]/p/text()").extract()
            item["link"] = deals.select("//div[@class='user-comment-text'][1]/p[1]/a[1]/@href").extract()
            items.append(item)
        return items
It should be quite obvious what I'm trying to do, but for some reason, when I tell the spider to crawl and export the text and links to the CSV file, I end up with:
link,deal http://wwww.facebook.com/desidime,
http://wwww.facebook.com/desidime,
(same thing for many more lines, then:)
",,"
, " same url" ,
(same thing for many more lines, then:)
"link,deals"
So, can anyone tell me what the problem is? If you run each of my XPaths above as response.xpath("xpath").extract() after scrapy shell "//correspondingcrawlruleurl", you'll get the right results.
The problem is inside the parse_items callback. When you iterate over the deals, the deal context-specific locators have to be relative. In other words, start your XPath expressions inside the loop with a dot:
def parse_items(self, response):
    for deal in response.xpath("//div[@class='user-comment-text'][1]"):
        item = DesidimeItem()
        item["deal"] = deal.xpath(".//p/text()").extract()
        item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
        yield item
(note that I've also simplified the code).
Here is the complete spider I'm executing (it does scrape the text and links, though I don't know what your desired output is):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class DesidimeItem(scrapy.Item):
    deal = scrapy.Field()
    link = scrapy.Field()

class DesidimeSpider(CrawlSpider):
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]

    rules = [
        Rule(LinkExtractor(restrict_xpaths="//td[not(@*)]/div[not(@*)]/a[not(@class)]"),
             callback="parse_items",
             follow=True),
    ]

    def parse_items(self, response):
        for deal in response.xpath("//div[@class='user-comment-text'][1]"):
            item = DesidimeItem()
            item["deal"] = deal.xpath(".//p/text()").extract()
            item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
            yield item

Scraping a website with scrapy

Hey, I've just started using Scrapy and was trying it out on the website "diy.com", but I can't seem to get the CrawlSpider to follow links or scrape any data. I think it might be my regex, but I can't see anything wrong.
Any help will be appreciated.
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from items import PartItem

class DIY_spider(CrawlSpider):
    name = 'diy_cat'
    allowed_domains = ['diy.com']
    start_urls = [
        "http://www.diy.com/nav/decor/tiles/wall-tiles"
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=(r'/(nav)/(decor)/(\w*)/(.*)(\d*)$', ), deny=(r'(.*)/(jsp)/(.*)')), callback='parse_item', follow=True),
    )

    def parse_items(self, response):
        sel = Selector(response)
        tests = []
        test = PartItem()
        if sel.xpath('//*[@id="fullWidthContent"]/div[2]/dl/dd[1]/ul[1]/li[3]/text()'):
            price = sel.xpath('//*[@id="fullWidthContent"]/div[2]/dl/dd[1]/ul[1]/li[3]/text()')
        else:
            price = sel.xpath('//dd[@class="item_cta"]/ul[@class="fright item_price"]/li/text()').extract()
            if not price:
                return test
        return test
Besides @Talvalin's note, you are not getting actual prices.
Try this version of parse_item:
def parse_item(self, response):
    sel = Selector(response)
    price_list = sel.xpath('//span[@class="onlyPrice"]/text()').extract()
    for price in price_list:
        if price:
            item = PartItem()
            item['price'] = price
            yield item
Your rule states parse_item as the callback but the actual callback is named parse_items. Additionally, the indenting for the parse_items function is incorrect, but that could simply be a formatting issue when pasting the code in.
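In other words, the two names simply have to agree, e.g. (a minimal sketch of only the relevant lines):

rules = (
    Rule(SgmlLinkExtractor(allow=(r'/(nav)/(decor)/(\w*)/(.*)(\d*)$', ), deny=(r'(.*)/(jsp)/(.*)')),
         callback='parse_items', follow=True),  # callback name here...
)

def parse_items(self, response):  # ...must match the method name here
    pass  # item extraction as before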

Scrapy: only parse from pages with meta noindex

I am trying to crawl a website and parse only pages that have a meta noindex tag.
What is happening is that the crawler crawls the first level, but finishes with the first page. It does not seem to follow the links.
The following is my code:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website

class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def _response_downloaded(self, response):
        sel = HtmlXPathSelector(response)
        if sel.xpath('//meta[@content="noindex"]'):
            return super(mydomainSpider, self)._response_downloaded(response)
        return

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        for site in sites:
            item = Website()
            item['url'] = response.url
            item['referer'] = response.request.headers.get('Referer')
            item['title'] = site.xpath('/html/head/title/text()').extract()
            item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
            items.append(item)
        yield items
The original _response_downloaded calls the _parse_response function which, besides calling the callback function, also follows links. From the Scrapy code:
def _parse_response(self, response, callback, cb_kwargs, follow=True):
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item
    if follow and self._follow_links:
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item
You could add that link-following part yourself, though I believe it's not the best way to go (the leading underscore may imply just that). Why not just check for the meta tag at the beginning of your parse_items function? And if you don't want to repeat this test, you could even write a Python decorator, as sketched below.
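For example, a minimal sketch of such a decorator, reusing the same HtmlXPathSelector check the spider already performs (the name require_noindex is just illustrative):

from functools import wraps
from scrapy.selector import HtmlXPathSelector

def require_noindex(callback):
    # run the wrapped callback only when the page carries a meta noindex tag
    @wraps(callback)
    def wrapper(self, response):
        hxs = HtmlXPathSelector(response)
        if hxs.select('//meta[@content="noindex"]'):
            return callback(self, response)
        return []
    return wrapper

Then decorate parse_items with @require_noindex and leave its body unchanged.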
I believe checking for the meta tag at the beginning of my parse_items, as @Guy Gavriely suggested, will be my best option. I will test out the following code to see:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website

class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        if hxs.xpath('//meta[@content="noindex"]'):
            for site in sites:
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['title'] = site.xpath('/html/head/title/text()').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                items.append(item)
        yield items
Working code update: I needed to return items instead of yielding them:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website

class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        if hxs.xpath('//meta[@content="noindex"]'):
            for site in sites:
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['title'] = site.xpath('/html/head/title/text()').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                items.append(item)
        return items