Scraping a website with scrapy - python

Hey, I've just started using Scrapy and was trying it out on the website "diy.com", but I can't seem to get the CrawlSpider to follow links or scrape any data. I think it might be my regex, but I can't see anything wrong with it.
Any help will be appreciated.
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from items import PartItem

class DIY_spider(CrawlSpider):
    name = 'diy_cat'
    allowed_domains = ['diy.com']
    start_urls = [
        "http://www.diy.com/nav/decor/tiles/wall-tiles"
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=(r'/(nav)/(decor)/(\w*)/(.*)(\d*)$', ), deny=(r'(.*)/(jsp)/(.*)')), callback='parse_item', follow=True),
    )

    def parse_items(self, response):
        sel = Selector(response)
        tests = []
        test = PartItem()
        if sel.xpath('//*[@id="fullWidthContent"]/div[2]/dl/dd[1]/ul[1]/li[3]/text()'):
            price = sel.xpath('//*[@id="fullWidthContent"]/div[2]/dl/dd[1]/ul[1]/li[3]/text()')
        else:
            price = sel.xpath('//dd[@class="item_cta"]/ul[@class="fright item_price"]/li/text()').extract()
        if not price:
            return test
        return test

Besides @Talvalin's note, you are not getting actual prices.
Try this version of parse_item:
def parse_item(self, response):
    sel = Selector(response)
    price_list = sel.xpath('//span[@class="onlyPrice"]/text()').extract()
    for price in price_list:
        if price:
            item = PartItem()
            item['price'] = price
            yield item
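Note that for item['price'] to work, PartItem must declare a price field. The original items.py isn't shown in the question, but a minimal definition would look something like this (an assumption, not the asker's actual file):
import scrapy

class PartItem(scrapy.Item):
    price = scrapy.Field()   # field assigned in parse_item above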

Your rule names parse_item as the callback, but the actual method is called parse_items, so the callback is never invoked. Additionally, the indentation of the parse_items function looks off, but that could simply be a formatting issue from pasting the code in.
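To illustrate the fix, here is a minimal sketch with the two names made to agree. It is not the asker's exact code: it assumes PartItem has a price field (as in the answer above) and reuses the asker's second price XPath.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from items import PartItem

class DIYSpider(CrawlSpider):
    name = 'diy_cat'
    allowed_domains = ['diy.com']
    start_urls = ["http://www.diy.com/nav/decor/tiles/wall-tiles"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(r'/(nav)/(decor)/(\w*)/(.*)(\d*)$',),
                               deny=(r'(.*)/(jsp)/(.*)',)),
             callback='parse_item',   # must match the method name below
             follow=True),
    )

    def parse_item(self, response):   # renamed from parse_items
        sel = Selector(response)
        item = PartItem()
        # assumes PartItem declares a 'price' field
        item['price'] = sel.xpath(
            '//dd[@class="item_cta"]/ul[@class="fright item_price"]/li/text()').extract()
        yield item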

Related

Scrapy neither shows any error nor fetches any data

I tried to parse product names and prices from a site using Scrapy. However, when I run my code it neither shows any error nor fetches any data. What I'm doing wrong is beyond my ability to find out. I hope someone can take a look at it.
"items.py" includes:
import scrapy

class SephoraItem(scrapy.Item):
    Name = scrapy.Field()
    Price = scrapy.Field()
The spider file, named "sephorasp.py", contains:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class SephoraspSpider(CrawlSpider):
    name = "sephorasp"
    allowed_domains = ['sephora.ae']
    start_urls = ["https://www.sephora.ae/en/stores/"]

    rules = [
        Rule(LinkExtractor(restrict_xpaths='//li[@class="level0 nav-1 active first touch-dd parent"]')),
        Rule(LinkExtractor(restrict_xpaths='//li[@class="level2 nav-1-1-1 active first"]'),
             callback="parse_item")
    ]

    def parse_item(self, response):
        page = response.xpath('//div[@class="product-info"]')
        for titles in page:
            Product = titles.xpath('.//a[@title]/text()').extract()
            Rate = titles.xpath('.//span[@class="price"]/text()').extract()
            yield {'Name': Product, 'Price': Rate}
Here is the link to the log:
"https://www.dropbox.com/s/8xktgh7lvj4uhbh/output.log?dl=0"
It works when I play around with BaseSpider:
from scrapy.spider import BaseSpider
from scrapy.http.request import Request

class SephoraspSpider(BaseSpider):
    name = "sephorasp"
    allowed_domains = ['sephora.ae']
    start_urls = [
        "https://www.sephora.ae/en/travel-size/make-up",
        "https://www.sephora.ae/en/perfume/women-perfume",
        "https://www.sephora.ae/en/makeup/eye/eyeshadow",
        "https://www.sephora.ae/en/skincare/moisturizers",
        "https://www.sephora.ae/en/gifts/palettes"
    ]

    def pro(self, response):
        item_links = response.xpath('//a[contains(@class, "level0")]/@href').extract()
        for a in item_links:
            yield Request(a, callback=self.end)

    def end(self, response):
        item_link = response.xpath('//a[@class="level2"]/@href').extract()
        for b in item_link:
            yield Request(b, callback=self.parse)

    def parse(self, response):
        page = response.xpath('//div[@class="product-info"]')
        for titles in page:
            Product = titles.xpath('.//a[@title]/text()').extract()
            Rate = titles.xpath('.//span[@class="price"]/text()').extract()
            yield {'Name': Product, 'Price': Rate}
Your XPaths are heavily flawed:
Rule(LinkExtractor(restrict_xpaths='//li[@class="level0 nav-1 active first touch-dd parent"]')),
Rule(LinkExtractor(restrict_xpaths='//li[@class="level2 nav-1-1-1 active first"]'),
You are matching full class strings, which can change at any point, and the order of the classes might differ from what Scrapy sees. Just pick one class; it's most likely unique enough:
Rule(LinkExtractor(restrict_xpaths='//li[contains(@class, "level0")]')),
Rule(LinkExtractor(restrict_xpaths='//li[contains(@class, "level2")]')),
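As a quick sanity check before wiring such expressions into rules, you can try them in the Scrapy shell. This is just a sketch of that workflow; the exact hrefs returned depend on the live page:
scrapy shell "https://www.sephora.ae/en/stores/"
>>> response.xpath('//li[contains(@class, "level0")]//a/@href').extract()[:5]
>>> response.xpath('//li[contains(@class, "level2")]//a/@href').extract()[:5]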

Scrapy crawlspider rule not working for next pages

I am trying to crawl next-page links using CrawlSpider, but I don't get any results if I change the parse function to something else. My Rule is not working; I am only able to fetch the current page with the parse function. Where am I going wrong?
This is my naukri_spider.py file
import scrapy
from scrapy import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from naukri.items import NaukriItem

class NaukriSpider(Spider):
    name = "naukri"
    allowed_domains = ["naukri.com"]
    start_urls = ["http://www.naukri.com/information-technology-jobs?xt=catsrch&qf[]=24"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[@class="pagination"]/a/button[@class="grayBtn"]',)), callback="parse", follow=True),
    )

    def parse(self, response):
        for sel in response.xpath('//*[@class="content"]'):
            item = NaukriItem()
            item['title'] = sel.xpath('span[@class="desig"]/text()').extract()
            item['location'] = sel.xpath('span[@class="loc"]/span/text()').extract()
            item['organization'] = sel.xpath('span[@class="org"]/text()').extract()
            yield item
The parse method implemented by CrawlSpider is used internally to follow links, so you should not override it. Change your rule callback to parse_start_url and override that instead.
This code works fine.
import scrapy
from scrapy import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from stackoverflow.items import NaukriItem

class NaukriSpider(CrawlSpider):
    name = "naukri"
    allowed_domains = ["naukri.com"]
    start_urls = ["http://www.naukri.com/information-technology-jobs?xt=catsrch&qf[]=24"]

    rules = (
        Rule(SgmlLinkExtractor(allow=('information-technology-jobs.*', )), callback="parse_start_url", follow=True),
    )

    def parse_start_url(self, response):
        for sel in response.xpath('//*[@class="content"]'):
            item = NaukriItem()
            item['title'] = sel.xpath('span[@class="desig"]/text()').extract()
            item['location'] = sel.xpath('span[@class="loc"]/span/text()').extract()
            item['organization'] = sel.xpath('span[@class="org"]/text()').extract()
            yield item

scrapy spider code check

So I'm trying to scrape the website given in the SgmlLinkExtractor parameters below with Scrapy, and this is what my spider looks like:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from desidime_sample.items import DesidimeItem
import string

class DesidimeSpider(CrawlSpider):
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//td[not(@*)]/div[not(@*)]/a[not(@class)]/@href')), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        deals = hxs.select('''//div[@class='user-comment-text'][1]''')
        items = []
        for deals in deals:
            item = DesidimeItem()
            item["deal"] = deals.select("//div[@class='user-comment-text'][1]/p/text()").extract()
            item["link"] = deals.select("//div[@class='user-comment-text'][1]/p[1]/a[1]/@href").extract()
            items.append(item)
        return items
It should be quite obvious what I'm trying to do, but for some reason, when I tell the spider to crawl and export the text and links to a CSV file, I end up with:
link,deal http://wwww.facebook.com/desidime,
http://wwww.facebook.com/desidime,
(same thing for many more lines, then:)
",,"
, " same url" ,
(same thing for many more lines, then:)
"link,deals"
So, can anyone tell me what the problem is? If you run each of my XPaths above as response.xpath("xpath").extract() after scrapy shell "//correspondingcrawlruleurl", you'll get the right results.
The problem is inside the parse_items callback. When you iterate over the deals, the deal context-specific locators have to be relative. In other words, start your XPath expressions inside the loop with a dot:
def parse_items(self, response):
    for deal in response.xpath("//div[@class='user-comment-text'][1]"):
        item = DesidimeItem()
        item["deal"] = deal.xpath(".//p/text()").extract()
        item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
        yield item
(note that I've also simplified the code).
Here is the complete spider I'm executing (it does scrape the text and links, though I don't know what your desired output is):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class DesidimeItem(scrapy.Item):
    deal = scrapy.Field()
    link = scrapy.Field()

class DesidimeSpider(CrawlSpider):
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]

    rules = [
        Rule(LinkExtractor(restrict_xpaths="//td[not(@*)]/div[not(@*)]/a[not(@class)]"),
             callback="parse_items",
             follow=True),
    ]

    def parse_items(self, response):
        for deal in response.xpath("//div[@class='user-comment-text'][1]"):
            item = DesidimeItem()
            item["deal"] = deal.xpath(".//p/text()").extract()
            item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
            yield item
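Because the item class is defined in the same file, this spider can be tried without a full project using scrapy runspider; the file name and output path below are just placeholders:
scrapy runspider desidime_spider.py -o deals.csv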

SgmlLinkExtractor not displaying results or following link

I am having problems fully understanding how the SGML link extractor works. When making a crawler with Scrapy, I can successfully extract data from links using specific URLs. The problem is using Rules to follow a next-page link from a particular URL.
I think the problem lies in the allow() parameter. When the Rule is added to the code, the results are not displayed in the command line and the link to the next page is not followed.
Any help is greatly appreciated.
Here is the code...
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider
from scrapy.contrib.spiders import Rule
from tutorial.items import TutorialItem

class AllGigsSpider(CrawlSpider):
    name = "allGigs"
    allowed_domains = ["http://www.allgigs.co.uk/"]
    start_urls = [
        "http://www.allgigs.co.uk/whats_on/London/clubbing-1.html",
        "http://www.allgigs.co.uk/whats_on/London/festivals-1.html",
        "http://www.allgigs.co.uk/whats_on/London/comedy-1.html",
        "http://www.allgigs.co.uk/whats_on/London/theatre_and_opera-1.html",
        "http://www.allgigs.co.uk/whats_on/London/dance_and_ballet-1.html"
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[@class="more"]',)), callback="parse_me", follow=True),
    )

    def parse_me(self, response):
        hxs = HtmlXPathSelector(response)
        infos = hxs.xpath('//div[@class="entry vevent"]')
        items = []
        for info in infos:
            item = TutorialItem()
            item['artist'] = hxs.xpath('//span[@class="summary"]//text()').extract()
            item['date'] = hxs.xpath('//abbr[@class="dtstart dtend"]//text()').extract()
            item['endDate'] = hxs.xpath('//abbr[@class="dtend"]//text()').extract()
            item['startDate'] = hxs.xpath('//abbr[@class="dtstart"]//text()').extract()
            items.append(item)
        return items
        print items
The problem is in restrict_xpaths - it should point to the block where the link extractor should look for links. Don't specify allow at all:
rules = [
    Rule(SgmlLinkExtractor(restrict_xpaths='//div[@class="more"]'),
         callback="parse_me",
         follow=True),
]
And you need to fix your allowed_domains:
allowed_domains = ["www.allgigs.co.uk"]
Also note that the print items in the parse_me() callback is not reachable, since it comes after the return statement. And, in the loop, you should not apply the XPath expressions using hxs; the expressions should be used in the info context. Finally, you can simplify parse_me():
def parse_me(self, response):
    for info in response.xpath('//div[@class="entry vevent"]'):
        item = TutorialItem()
        item['artist'] = info.xpath('.//span[@class="summary"]//text()').extract()
        item['date'] = info.xpath('.//abbr[@class="dtstart dtend"]//text()').extract()
        item['endDate'] = info.xpath('.//abbr[@class="dtend"]//text()').extract()
        item['startDate'] = info.xpath('.//abbr[@class="dtstart"]//text()').extract()
        yield item

Scrapy - Follow RSS links

I was wondering if anyone ever tried to extract/follow RSS item links using
SgmlLinkExtractor/CrawlSpider. I can't get it to work...
I am using the following rule:
rules = (
    Rule(SgmlLinkExtractor(tags=('link',), attrs=False),
         follow=True,
         callback='parse_article'),
)
(bearing in mind that RSS links are located in the link tag).
I am not sure how to tell SgmlLinkExtractor to extract the text() of the link rather than search its attributes...
Any help is welcome, thanks in advance.
CrawlSpider rules don't work that way. You'll probably need to subclass BaseSpider and implement your own link extraction in your spider callback. For example:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector

class MySpider(BaseSpider):
    name = 'myspider'

    def parse(self, response):
        xxs = XmlXPathSelector(response)
        links = xxs.select("//link/text()").extract()
        return [Request(x, callback=self.parse_link) for x in links]
You can also try the XPath in the shell, by running for example:
scrapy shell http://blog.scrapy.org/rss.xml
And then typing in the shell:
>>> xxs.select("//link/text()").extract()
[u'http://blog.scrapy.org',
u'http://blog.scrapy.org/new-bugfix-release-0101',
u'http://blog.scrapy.org/new-scrapy-blog-and-scrapy-010-release']
There's an XMLFeedSpider one can use nowadays.
I have done it using CrawlSpider:
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector, HtmlXPathSelector

class MySpider(CrawlSpider):
    domain_name = "xml.example.com"

    def parse(self, response):
        xxs = XmlXPathSelector(response)
        items = xxs.select('//channel/item')
        for i in items:
            urli = i.select('link/text()').extract()
            request = Request(url=urli[0], callback=self.parse1)
            yield request

    def parse1(self, response):
        hxs = HtmlXPathSelector(response)
        # ...
        yield MyItem()  # MyItem would come from the project's items module
but I am not sure that is a very proper solution...
XML example from the Scrapy docs for XMLFeedSpider:
from scrapy.spiders import XMLFeedSpider
from myproject.items import TestItem

class MySpider(XMLFeedSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/feed.xml']
    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
    itertag = 'item'

    def parse_node(self, response, node):
        self.logger.info('Hi, this is a <%s> node!: %s', self.itertag, ''.join(node.extract()))
        # item = TestItem()
        item = {}  # changed to a dict to avoid the "class not found" error
        item['id'] = node.xpath('@id').extract()
        item['name'] = node.xpath('name').extract()
        item['description'] = node.xpath('description').extract()
        return item
