SgmlLinkExtractor not displaying results or following link - python

I am having problems fully understanding how the SGML link extractor works. When making a crawler with Scrapy, I can successfully extract data from links using specific URLs. The problem is using Rules to follow a next-page link on a particular URL.
I think the problem lies in the allow() attribute. When the Rule is added to the code, the results do not display in the command line and the link to the next page is not followed.
Any help is greatly appreciated.
Here is the code...
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider
from scrapy.contrib.spiders import Rule
from tutorial.items import TutorialItem

class AllGigsSpider(CrawlSpider):
    name = "allGigs"
    allowed_domains = ["http://www.allgigs.co.uk/"]
    start_urls = [
        "http://www.allgigs.co.uk/whats_on/London/clubbing-1.html",
        "http://www.allgigs.co.uk/whats_on/London/festivals-1.html",
        "http://www.allgigs.co.uk/whats_on/London/comedy-1.html",
        "http://www.allgigs.co.uk/whats_on/London/theatre_and_opera-1.html",
        "http://www.allgigs.co.uk/whats_on/London/dance_and_ballet-1.html"
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[@class="more"]',)), callback="parse_me", follow=True),
    )

    def parse_me(self, response):
        hxs = HtmlXPathSelector(response)
        infos = hxs.xpath('//div[@class="entry vevent"]')
        items = []
        for info in infos:
            item = TutorialItem()
            item['artist'] = hxs.xpath('//span[@class="summary"]//text()').extract()
            item['date'] = hxs.xpath('//abbr[@class="dtstart dtend"]//text()').extract()
            item['endDate'] = hxs.xpath('//abbr[@class="dtend"]//text()').extract()
            item['startDate'] = hxs.xpath('//abbr[@class="dtstart"]//text()').extract()
            items.append(item)
        return items
        print items

The problem is in the restrict_xpaths: it should point to the block in which the link extractor should look for links. Don't specify allow at all:
rules = [
    Rule(SgmlLinkExtractor(restrict_xpaths='//div[@class="more"]'),
         callback="parse_me",
         follow=True),
]
And you need to fix your allowed_domains:
allowed_domains = ["www.allgigs.co.uk"]
Also note that the print items in the parse_me() callback is not reachable, since it comes after the return statement. And, in the loop, you should not apply the XPath expressions via hxs; they should be applied in the info context. You can also simplify parse_me():
def parse_me(self, response):
    for info in response.xpath('//div[@class="entry vevent"]'):
        item = TutorialItem()
        item['artist'] = info.xpath('.//span[@class="summary"]//text()').extract()
        item['date'] = info.xpath('.//abbr[@class="dtstart dtend"]//text()').extract()
        item['endDate'] = info.xpath('.//abbr[@class="dtend"]//text()').extract()
        item['startDate'] = info.xpath('.//abbr[@class="dtstart"]//text()').extract()
        yield item
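For reference, here is a sketch of how the corrected pieces might fit together, assuming the modern scrapy.spiders/scrapy.linkextractors import paths (on older Scrapy versions the scrapy.contrib paths from the question still apply) and the asker's TutorialItem class:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from tutorial.items import TutorialItem

class AllGigsSpider(CrawlSpider):
    name = "allGigs"
    # domain only, no scheme or trailing slash
    allowed_domains = ["www.allgigs.co.uk"]
    start_urls = [
        "http://www.allgigs.co.uk/whats_on/London/clubbing-1.html",
    ]

    rules = [
        # follow the "more" pagination links and parse each result page
        Rule(LinkExtractor(restrict_xpaths='//div[@class="more"]'),
             callback="parse_me",
             follow=True),
    ]

    def parse_me(self, response):
        for info in response.xpath('//div[@class="entry vevent"]'):
            item = TutorialItem()
            item['artist'] = info.xpath('.//span[@class="summary"]//text()').extract()
            item['date'] = info.xpath('.//abbr[@class="dtstart dtend"]//text()').extract()
            yield item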

Related

Scrapy: response.xpath prints None, but upon clicking into weblink, xPath is correct

I am trying to print out the h1 title of the item that I am trying to scrape. I have tried printing the result of
print(response.xpath('/html/body/div[2]/div/div[5]/div[2]/div[2]/div/h1').get())
from a product page like this: https://www.steinersports.com/football/tampa-bay-buccaneers/tom-brady-tampa-bay-buccaneers-super-bowl-lv-champions-autographed-white-nike-game-jersey-with-lv-mvp-inscription/o-8094+t-92602789+p-2679909745+z-8-2492872768?_ref=p-FALP:m-GRID:i-r20c0:po-60
I am not sure how to go about debugging this, since when I click into the links that return None and check the XPath, it is correct. Any help is appreciated; full code below:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request

class SteinerSportsCrawlSpiderSpider(CrawlSpider):
    name = 'steinersports_crawl_spider'
    allowed_domains = ['steinersports.com']
    start_urls = [
        'https://www.steinersports.com/football/signed/o-1383+fa-56+z-95296299-3058648695?_ref=m-TOPNAV',
    ]
    base_url = 'https://www.steinersports.com/football/signed/o-1383+fa-56+z-95296299-3058648695?_ref=m-TOPNAV'

    rules = (
        Rule(LinkExtractor(allow=r'/signed'), follow=True),
        Rule(LinkExtractor(allow=r'football/', deny=r'/signed'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        description_flag = True
        price_flag = True
        item_description = response.xpath('/html/body/div[2]/div/div[5]/div[2]/div[17]/div/div[2]/div').get()
        print(item)
        #item_price = response.xpath('//span[@class="product__price"]/text()').get()
        print(response.xpath('html/body/div[2]/div/div[5]/div[2]/div[2]/div/h1').get())
        item['item_name'] = response.xpath('html/body/div[2]/div/div[5]/div[2]/div[2]/div/h1').get()
        return item
You can directly access the h1 tag using the data-talos attribute. This xpath should get the title:
response.xpath("//h1[@data-talos='labelPdpProductTitle']/text()").extract_first()
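Applied to the asker's spider, parse_item might then look roughly like this (a sketch; the field name item_name is taken from the question, everything else is only illustrative):

def parse_item(self, response):
    item = {}
    # an attribute-based XPath is far more robust than an absolute /html/body/... path
    item['item_name'] = response.xpath(
        "//h1[@data-talos='labelPdpProductTitle']/text()").extract_first()
    return item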

Scrapy neither shows any error nor fetches any data

I tried to parse the product name and price from a site using scrapy. However, when I run my scrapy code, it neither shows any error nor fetches any data. What I'm doing wrong is beyond me; I hope someone can take a look at it.
"items.py" includes:
import scrapy

class SephoraItem(scrapy.Item):
    Name = scrapy.Field()
    Price = scrapy.Field()
The spider file named "sephorasp.py" contains:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class SephoraspSpider(CrawlSpider):
    name = "sephorasp"
    allowed_domains = ['sephora.ae']
    start_urls = ["https://www.sephora.ae/en/stores/"]
    rules = [
        Rule(LinkExtractor(restrict_xpaths='//li[@class="level0 nav-1 active first touch-dd parent"]')),
        Rule(LinkExtractor(restrict_xpaths='//li[@class="level2 nav-1-1-1 active first"]'),
             callback="parse_item")
    ]

    def parse_item(self, response):
        page = response.xpath('//div[@class="product-info"]')
        for titles in page:
            Product = titles.xpath('.//a[@title]/text()').extract()
            Rate = titles.xpath('.//span[@class="price"]/text()').extract()
            yield {'Name': Product, 'Price': Rate}
Here is the Link to the Log:
"https://www.dropbox.com/s/8xktgh7lvj4uhbh/output.log?dl=0"
It works when I play around with BaseSpider:
from scrapy.spider import BaseSpider
from scrapy.http.request import Request

class SephoraspSpider(BaseSpider):
    name = "sephorasp"
    allowed_domains = ['sephora.ae']
    start_urls = [
        "https://www.sephora.ae/en/travel-size/make-up",
        "https://www.sephora.ae/en/perfume/women-perfume",
        "https://www.sephora.ae/en/makeup/eye/eyeshadow",
        "https://www.sephora.ae/en/skincare/moisturizers",
        "https://www.sephora.ae/en/gifts/palettes"
    ]

    def pro(self, response):
        item_links = response.xpath('//a[contains(@class, "level0")]/@href').extract()
        for a in item_links:
            yield Request(a, callback=self.end)

    def end(self, response):
        item_link = response.xpath('//a[@class="level2"]/@href').extract()
        for b in item_link:
            yield Request(b, callback=self.parse)

    def parse(self, response):
        page = response.xpath('//div[@class="product-info"]')
        for titles in page:
            Product = titles.xpath('.//a[@title]/text()').extract()
            Rate = titles.xpath('.//span[@class="price"]/text()').extract()
            yield {'Name': Product, 'Price': Rate}
Your xpaths are heavily flawed.
Rule(LinkExtractor(restrict_xpaths='//li[@class="level0 nav-1 active first touch-dd parent"]')),
Rule(LinkExtractor(restrict_xpaths='//li[@class="level2 nav-1-1-1 active first"]'),
You are matching the whole class string, which can change at any point, and the order of the classes might be different in what Scrapy receives. Just pick one class; it's most likely unique enough:
Rule(LinkExtractor(restrict_xpaths='//li[contains(@class, "level0")]')),
Rule(LinkExtractor(restrict_xpaths='//li[contains(@class, "level2")]')),
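Applied to the asker's spider, the rules block might then look like this (a sketch; parse_item is the callback from the question, and follow=True on the first rule is an assumption about the intended crawl depth):

rules = [
    # category links: only follow, no callback
    Rule(LinkExtractor(restrict_xpaths='//li[contains(@class, "level0")]'), follow=True),
    # product-listing links: hand each page to parse_item
    Rule(LinkExtractor(restrict_xpaths='//li[contains(@class, "level2")]'),
         callback="parse_item"),
]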

scrapy spider code check

So I'm trying to scrape the website referenced in the SgmlLinkExtractor parameters below with scrapy, and this is what my spider looks like:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from desidime_sample.items import DesidimeItem
import string

class DesidimeSpider(CrawlSpider):
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]
    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('''//td[not(@*)]/div
        [not(@*)]/a[not(@class)]/@href''')), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        deals = hxs.select('''//div[@class='user-comment-text'][1]''')
        items = []
        for deals in deals:
            item = DesidimeItem()
            item["deal"] = deals.select("//div[@class='user-comment-text'][1]/p/text()").extract()
            item["link"] = deals.select("//div[@class='user-comment-text'][1]/p[1]/a[1]/@href").extract()
            items.append(item)
        return items
It should be quite obvious what I'm trying to do, but for some reason when I tell the spider to crawl and export the text and links to a CSV file, I end up with:
link,deal http://wwww.facebook.com/desidime,
http://wwww.facebook.com/desidime,
(same thing for many more lines, then:)
",,"
, " same url" ,
(same thing for many more lines, then:)
"link,deals"
So, can anyone tell me what the problem is? If you run each of my above XPaths as response.xpath("xpath").extract() after scrapy shell "//correspondingcrawlruleurl", you'll get the right results.
The problem is inside the parse_items callback. When you iterate over the deals, the deal context-specific locators have to be relative. In other words, start your XPath expressions inside the loop with a dot:
def parse_items(self, response):
    for deal in response.xpath("//div[@class='user-comment-text'][1]"):
        item = DesidimeItem()
        item["deal"] = deal.xpath(".//p/text()").extract()
        item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
        yield item
(note that I've also simplified the code).
Here is the complete spider I'm executing (it does scrape the text and links, though I don't know what your desired output is):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class DesidimeItem(scrapy.Item):
    deal = scrapy.Field()
    link = scrapy.Field()

class DesidimeSpider(CrawlSpider):
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]
    rules = [
        Rule(LinkExtractor(restrict_xpaths="//td[not(@*)]/div[not(@*)]/a[not(@class)]"),
             callback="parse_items",
             follow=True),
    ]

    def parse_items(self, response):
        for deal in response.xpath("//div[@class='user-comment-text'][1]"):
            item = DesidimeItem()
            item["deal"] = deal.xpath(".//p/text()").extract()
            item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
            yield item
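If the goal is the CSV output the asker describes, one way to get it is to configure the feed export in the project's settings.py (a sketch; the output file name is an arbitrary example):

# settings.py
# export every scraped item to a CSV file
FEED_FORMAT = 'csv'
FEED_URI = 'deals.csv'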

Scraping a website with scrapy

Hey, I've just started using scrapy and was trying it out on the website "diy.com", but I can't seem to get the CrawlSpider to follow links or scrape any data. I think it might be my regex, but I can't see anything wrong.
Any help will be appreciated.
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from items import PartItem

class DIY_spider(CrawlSpider):
    name = 'diy_cat'
    allowed_domains = ['diy.com']
    start_urls = [
        "http://www.diy.com/nav/decor/tiles/wall-tiles"
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'/(nav)/(decor)/(\w*)/(.*)(\d*)$', ), deny=(r'(.*)/(jsp)/(.*)')), callback='parse_item', follow=True),

    def parse_items(self, response):
        sel = Selector(response)
        tests = []
        test = PartItem()
        if sel.xpath('//*[@id="fullWidthContent"]/div[2]/dl/dd[1]/ul[1]/li[3]/text()'):
            price = sel.xpath('//*[@id="fullWidthContent"]/div[2]/dl/dd[1]/ul[1]/li[3]/text()')
        else:
            price = sel.xpath('//dd[@class="item_cta"]/ul[@class="fright item_price"]/li/text()').extract()
            if not price:
                return test
        return test
Besides @Talvalin's note, you are not getting actual prices.
Try this version of parse_item:
def parse_item(self, response):
    sel = Selector(response)
    price_list = sel.xpath('//span[@class="onlyPrice"]/text()').extract()
    for price in price_list:
        if price:
            item = PartItem()
            item['price'] = price
            yield item
Your rule states parse_item as the callback but the actual callback is named parse_items. Additionally, the indenting for the parse_items function is incorrect, but that could simply be a formatting issue when pasting the code in.
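Putting the two answers together, the relevant parts of the spider might look roughly like this (a sketch; the onlyPrice selector is taken from the answer above, and PartItem is assumed to have a price field):

class DIY_spider(CrawlSpider):
    name = 'diy_cat'
    allowed_domains = ['diy.com']
    start_urls = ["http://www.diy.com/nav/decor/tiles/wall-tiles"]

    rules = (
        # the callback name now matches the method below
        Rule(SgmlLinkExtractor(allow=(r'/(nav)/(decor)/(\w*)/(.*)(\d*)$',),
                               deny=(r'(.*)/(jsp)/(.*)',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        sel = Selector(response)
        for price in sel.xpath('//span[@class="onlyPrice"]/text()').extract():
            if price:
                item = PartItem()
                item['price'] = price
                yield item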

Scrapy - Follow RSS links

I was wondering if anyone ever tried to extract/follow RSS item links using
SgmlLinkExtractor/CrawlSpider. I can't get it to work...
I am using the following rule:
rules = (
    Rule(SgmlLinkExtractor(tags=('link',), attrs=False),
         follow=True,
         callback='parse_article'),
)
(bearing in mind that RSS links are located in the link tag).
I am not sure how to tell SgmlLinkExtractor to extract the text() of the link and not to search the attributes...
Any help is welcome. Thanks in advance.
CrawlSpider rules don't work that way. You'll probably need to subclass BaseSpider and implement your own link extraction in your spider callback. For example:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector

class MySpider(BaseSpider):
    name = 'myspider'

    def parse(self, response):
        xxs = XmlXPathSelector(response)
        links = xxs.select("//link/text()").extract()
        return [Request(x, callback=self.parse_link) for x in links]

    def parse_link(self, response):
        # parse each followed article page here
        pass
You can also try the XPath in the shell, by running for example:
scrapy shell http://blog.scrapy.org/rss.xml
And then typing in the shell:
>>> xxs.select("//link/text()").extract()
[u'http://blog.scrapy.org',
u'http://blog.scrapy.org/new-bugfix-release-0101',
u'http://blog.scrapy.org/new-scrapy-blog-and-scrapy-010-release']
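On recent Scrapy versions the same idea can be written with plain response.xpath and a scrapy.Spider subclass (a sketch, reusing the feed URL from the shell example above; parse_article is only a placeholder callback):

import scrapy

class RssLinksSpider(scrapy.Spider):
    name = 'rsslinks'
    start_urls = ['http://blog.scrapy.org/rss.xml']

    def parse(self, response):
        # each RSS <item>/<link> element holds the article URL as text
        for url in response.xpath('//item/link/text()').getall():
            yield scrapy.Request(url, callback=self.parse_article)

    def parse_article(self, response):
        # parse the article page here
        pass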
There's an XMLFeedSpider one can use nowadays.
I have done it using CrawlSpider:
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector, HtmlXPathSelector

class MySpider(CrawlSpider):
    domain_name = "xml.example.com"

    def parse(self, response):
        xxs = XmlXPathSelector(response)
        items = xxs.select('//channel/item')
        for i in items:
            urli = i.select('link/text()').extract()
            request = Request(url=urli[0], callback=self.parse1)
            yield request

    def parse1(self, response):
        hxs = HtmlXPathSelector(response)
        # ...
        # MyItem is the project's item class (its import was omitted in the original answer)
        yield(MyItem())
but I am not sure that is a very proper solution...
XML example from the Scrapy docs for XMLFeedSpider:
from scrapy.spiders import XMLFeedSpider
from myproject.items import TestItem

class MySpider(XMLFeedSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/feed.xml']
    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
    itertag = 'item'

    def parse_node(self, response, node):
        self.logger.info('Hi, this is a <%s> node!: %s', self.itertag, ''.join(node.extract()))
        #item = TestItem()
        item = {}  # changed to a dict to avoid the "class not found" error
        item['id'] = node.xpath('@id').extract()
        item['name'] = node.xpath('name').extract()
        item['description'] = node.xpath('description').extract()
        return item
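If you prefer the Item class over the plain dict, the "class not found" error just means TestItem has to actually exist in myproject/items.py; a minimal sketch of that definition, with fields matching what parse_node fills in:

import scrapy

class TestItem(scrapy.Item):
    # fields matching the keys assigned in parse_node
    id = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()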
