Scrapy XPath selector - python

I am scraping this site using Scrapy, but I'm having trouble with the XPath and I'm not entirely sure what is going on.
Why does this work:
def parse_item(self, response):
    item = BotItem()
    for title in response.xpath('//h1'):
        item['title'] = title.xpath('strong/text()').extract()
        item['wage'] = title.xpath('span[@class="price"]/text()').extract()
        yield item
and the following code does not?
def parse_item(self, response):
    item = BotItem()
    for title in response.xpath('//body'):
        item['title'] = title.xpath('h1/strong/text()').extract()
        item['wage'] = title.xpath('h1/span[@class="price"]/text()').extract()
        yield item
I also aim to extract the content matched by this XPath:
//div[@id="description"]/p
But I can't because it is outside the h1 node. How can I achieve this? My full code is:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from bot.items import BotItem


class MufmufSpider(CrawlSpider):
    name = 'mufmuf'
    allowed_domains = ['mufmuf.ro']
    start_urls = ['http://mufmuf.ro/locuri-de-munca/joburi-in-strainatate/']

    rules = (
        Rule(
            LinkExtractor(restrict_xpaths='//div[@class="paginate"][position() = last()]'),
            #callback='parse_start_url',
            follow=True
        ),
        Rule(
            LinkExtractor(restrict_xpaths='//h3/a'),
            callback='parse_item',
            follow=True
        ),
    )

    def parse_item(self, response):
        item = BotItem()
        for title in response.xpath('//h1'):
            item['title'] = title.xpath('strong/text()').extract()
            item['wage'] = title.xpath('span[@class="price"]/text()').extract()
            #item['description'] = title.xpath('div[@id="description"]/p/text()').extract()
            yield item

The for title in response.xpath('//body'): option does not work because the XPath expressions inside the loop are relative to the body element, so h1/strong/text() only matches an h1 that is a direct child of body, which is presumably not where it actually sits on this page.
Moreover, since there is only one desired entity to extract, you don't need a loop here at all:
def parse_item(self, response):
    item = BotItem()
    item["title"] = response.xpath('//h1/strong/text()').extract()
    item["wage"] = response.xpath('//h1/span[@class="price"]/text()').extract()
    item["description"] = response.xpath('//div[@id="description"]/p/text()').extract()
    return item
(this should also answer your second question about the description)
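For comparison, a body-rooted loop would also work if the inner expressions used the descendant axis. A minimal sketch, assuming the same markup as above:

def parse_item(self, response):
    item = BotItem()
    for body in response.xpath('//body'):
        # .// searches all descendants of the current node, not only direct children
        item['title'] = body.xpath('.//h1/strong/text()').extract()
        item['wage'] = body.xpath('.//h1/span[@class="price"]/text()').extract()
        item['description'] = body.xpath('.//div[@id="description"]/p/text()').extract()
        yield item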

Related

Scrapy: response.xpath prints None, but when clicking into the web link the XPath is correct

I am trying to print out the h1 title of the item that I am trying to scrape. I have tried printing the result of
print(response.xpath('/html/body/div[2]/div/div[5]/div[2]/div[2]/div/h1').get())
from a product page like this: https://www.steinersports.com/football/tampa-bay-buccaneers/tom-brady-tampa-bay-buccaneers-super-bowl-lv-champions-autographed-white-nike-game-jersey-with-lv-mvp-inscription/o-8094+t-92602789+p-2679909745+z-8-2492872768?_ref=p-FALP:m-GRID:i-r20c0:po-60
I am not sure how to go about debugging this, since when I click into the links that are returning None and check the XPath, it is correct. Any help is appreciated; full code below:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request


class SteinerSportsCrawlSpiderSpider(CrawlSpider):
    name = 'steinersports_crawl_spider'
    allowed_domains = ['steinersports.com']
    start_urls = [
        'https://www.steinersports.com/football/signed/o-1383+fa-56+z-95296299-3058648695?_ref=m-TOPNAV',
    ]
    base_url = 'https://www.steinersports.com/football/signed/o-1383+fa-56+z-95296299-3058648695?_ref=m-TOPNAV'

    rules = (
        Rule(LinkExtractor(allow=r'/signed'), follow=True),
        Rule(LinkExtractor(allow=r'football/', deny=r'/signed'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        description_flag = True
        price_flag = True
        item_description = response.xpath('/html/body/div[2]/div/div[5]/div[2]/div[17]/div/div[2]/div').get()
        print(item)
        #item_price = response.xpath('//span[@class="product__price"]/text()').get()
        print(response.xpath('html/body/div[2]/div/div[5]/div[2]/div[2]/div/h1').get())
        item['item_name'] = response.xpath('html/body/div[2]/div/div[5]/div[2]/div[2]/div/h1').get()
        return item
Absolute XPaths copied from the browser often fail to match the raw HTML that Scrapy actually downloads (the browser or JavaScript may rearrange the DOM). You can directly access the h1 tag using its data-talos attribute instead. This XPath should get the title:
response.xpath("//h1[#data-talos='labelPdpProductTitle']/text()").extract_first()

Scrapy: I get no results

I have a working program, but my JSON output file is empty. My program should get all the articles from The New York Times.
class ParseSpider(CrawlSpider):
    name = "new"
    allowed_domains = ["www.nytimes.com"]
    start_urls = ["https://www.nytimes.com/section/world?WT.nav=page&action=click&contentCollection=World&module=HPMiniNav&pgtype=Homepage&region=TopBar"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//*[@id="story"]/div[3]/div[1]',)), callback="parse_items", follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        l = parseLoader(parse_item(), hxs)
        l.add_value('url', response.url)
        l.add_xpath('name', '//*[@id="headline"]' % u"Название статьи:")
        l.add_xpath('text', '//*[@id="story"]/div[3]/div[1]' % u"Текст:")
I changed the program. Edit:
rules = (
    Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[contains(@id,"story")]')), callback='parse_item'),
)

def parse_item(self, response):
    l = parseLoader(response=response)
    l.add_value('url', response.url)
    l.add_xpath('name', '//*[@id="headline"]' % u"Название статьи:")
    l.add_xpath('text', '//*[@id="story"]/div[3]/div[1]' % u"Текст:")
    yield l.load_item()
It seems like you have infinite recursion going on in your parse_item method: it calls parse_item() inside itself when building the loader. You also don't need to create a selector yourself, and HtmlXPathSelector shouldn't even be used anymore. Try:
def parse_item(self, response):
    l = parseLoader(response=response)
    l.add_value('url', response.url)
    l.add_xpath('name', '//*[@id="headline"]' % u"Название статьи:")
    l.add_xpath('text', '//*[@id="story"]/div[3]/div[1]' % u"Текст:")
    yield l.load_item()
Edit: It seems like your link extractor rule is not extracting anything. First of all, you shouldn't use SgmlLinkExtractor; it's deprecated. Secondly, your XPath doesn't capture anything: the one you are using is too specific and incorrect in some cases. Try:
LinkExtractor(allow=(), restrict_xpaths=('//*[contains(@id,"story")]',))
You can debug and try this with the scrapy shell command:
$ scrapy shell "https://www.nytimes.com/section/..."
>>> from scrapy.linkextractors import LinkExtractor
>>> le = LinkExtractor(allow=(), restrict_xpaths=('//*[contains(@id,"story")]',))
>>> le.extract_links(response)
# 20 results will be printed
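The parseLoader used above is never shown in the question; a rough sketch of what it might look like (hypothetical item and loader definitions, with field names taken from the add_value/add_xpath calls):

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, TakeFirst

class ArticleItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    text = scrapy.Field()

class parseLoader(ItemLoader):
    default_item_class = ArticleItem
    default_output_processor = TakeFirst()
    text_out = Join()   # concatenate the article paragraphs into one string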

Scrapy Craigslist script

I want to create a Scrapy script to scrape all of the results for computer gigs in any craigslist subdomain:
for example here: http://losangeles.craigslist.org/search/cpg/
This query returns a list of many articles, and I've tried to scrape the title and href of each of these results (not only the ones on the first page) using CrawlSpider and LinkExtractor, to no avail: the script returns nothing.
I'll paste my script here, thanks.
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor


class CraigspiderSpider(CrawlSpider):
    name = "CraigSpider"
    allowed_domains = ["http://losangeles.craigslist.org"]
    start_urls = (
        'http://losangeles.craigslist.org/search/cpg/',
    )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_page", follow=True),)

    def parse_page(self, response):
        items = response.selector.xpath("//p[@class='row']")

    for i in items:
        link = i.xpath("./span[@class='txt']/span[@class='pl']/a/@href").extract()
        title = i.xpath("./span[@class='txt']/span[@class='pl']/a/span[@id='titletextonly']/text()").extract()
        print link, title
According to the code you pasted, parse_page:
1. does not return/yield anything, and
2. only contains one line: "items = response.selector..."
The reason for #2 above is that the for loop is not properly indented.
Try to indent the for loop:
class CraigspiderSpider(CrawlSpider):
    name = "CraigSpider"
    allowed_domains = ["http://losangeles.craigslist.org"]
    start_urls = ('http://losangeles.craigslist.org/search/cpg/',)

    rules = (Rule(
        LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)),
        callback="parse_page", follow=True),)

    def parse_page(self, response):
        items = response.selector.xpath("//p[@class='row']")
        for i in items:
            link = i.xpath("./span[@class='txt']/span[@class='pl']/a/@href").extract()
            title = i.xpath("./span[@class='txt']/span[@class='pl']/a/span[@id='titletextonly']/text()").extract()
            print link, title
            yield dict(link=link, title=title)
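The spider can then be run with the standard Scrapy CLI and its items written to a feed (the output filename below is just an example):

scrapy crawl CraigSpider -o gigs.json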

Getting data from multiple links using scrapy

I am new to Scrapy and Python. I was trying to retrieve the data from https://in.bookmyshow.com/movies since I need the information for all the movies, but there is something wrong with my code and I would like to know where I have gone wrong.
rules = (Rule(SgmlLinkExtractor(allow=('https://in\.bookmyshow\.com/movies/.*', )), callback="parse_items", follow=True),)

def parse_items(self, response):
    for sel in response.xpath('//div[contains(@class, "movie-card")]'):
        item = Ex1Item()
        item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
        item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
        item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
        item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
        item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
        yield item
Your code seems to be fine. Perhaps the problem is outside of the part you posted here.
This worked for me:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class BookmyshowSpider(CrawlSpider):
    name = "bookmyshow"
    start_urls = ['https://in.bookmyshow.com/movies']
    allowed_domains = ['bookmyshow.com']
    rules = (Rule(SgmlLinkExtractor(allow=('https://in\.bookmyshow\.com/movies/.*', )), callback="parse_items", follow=True),)

    def parse_items(self, response):
        for sel in response.xpath('//div[contains(@class, "movie-card")]'):
            item = Ex1Item()
            item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
            item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
            item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
            item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
            item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
            yield item
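For reference, Ex1Item comes from the question and is never defined in this thread; a minimal sketch of how it might look in the project's items module (field names taken from the keys assigned in parse_items):

import scrapy

class Ex1Item(scrapy.Item):
    Moviename = scrapy.Field()
    Language = scrapy.Field()
    Info = scrapy.Field()
    Synopsis = scrapy.Field()
    Release = scrapy.Field()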
EDIT: Version using the standard spider class scrapy.Spider()
import scrapy


class BookmyshowSpider(scrapy.Spider):
    name = "bookmyshow"
    start_urls = ['https://in.bookmyshow.com/movies']
    allowed_domains = ['bookmyshow.com']

    def parse(self, response):
        links = response.xpath('//a/@href').re('movies/[^\/]+\/.*$')
        for url in set(links):
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_movie)

    def parse_movie(self, response):
        for sel in response.xpath('//div[contains(@class, "movie-card")]'):
            item = {}
            item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
            item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
            item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
            item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
            item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
            yield item
parse() parses all links to movie pages from the start page. parse_movie() is used as a callback for all Requests to the specific movie pages. With this version you certainly have more control over the spider behavior.

Scrapy: only parse from pages with meta noindex

I am trying to crawl a website and parse only pages that contain a meta noindex tag.
What is happening is that the crawler crawls the first level, but finishes with the first page. It does not seem to follow the links.
The following is my code:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website


class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def _response_downloaded(self, response):
        sel = HtmlXPathSelector(response)
        if sel.xpath('//meta[@content="noindex"]'):
            return super(mydomainSpider, self).parse_items(response)
        return

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        for site in sites:
            item = Website()
            item['url'] = response.url
            item['referer'] = response.request.headers.get('Referer')
            item['title'] = site.xpath('/html/head/title/text()').extract()
            item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
            items.append(item)
        yield items
The original _response_downloaded calls the _parse_response function which, besides calling the callback function, also follows links. From the Scrapy source:
def _parse_response(self, response, callback, cb_kwargs, follow=True):
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item
    if follow and self._follow_links:
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item
You can add that link-following part, though I believe it's not the best way to go (the leading underscore may imply just that). Why not just check for the meta tag at the beginning of your parse_items function? And if you don't want to repeat this test, maybe even write a Python decorator.
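A rough sketch of what such a decorator could look like (hypothetical helper; it assumes a Scrapy version where response.xpath is available, otherwise wrap the response in a selector first):

from functools import wraps

def noindex_only(callback):
    # Run the wrapped callback only on pages that carry a noindex meta tag.
    @wraps(callback)
    def wrapper(self, response, *args, **kwargs):
        if not response.xpath('//meta[@content="noindex"]'):
            return []  # indexable page: nothing to parse
        return callback(self, response, *args, **kwargs)
    return wrapper

# usage inside the spider:
#     @noindex_only
#     def parse_items(self, response):
#         ...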
I believe checking for the meta at the beginning of my parse_items as @Guy Gavriely suggested will be my best option. I will test out the following code to see:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website


class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        if hxs.xpath('//meta[@content="noindex"]'):
            for site in sites:
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['title'] = site.xpath('/html/head/title/text()').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                items.append(item)
        yield items
Working code update: I needed to return items instead of yielding them (yielding the whole list makes the callback emit a single list object, which Scrapy rejects, whereas a returned list is iterated item by item):
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website


class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        if hxs.xpath('//meta[@content="noindex"]'):
            for site in sites:
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['title'] = site.xpath('/html/head/title/text()').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                items.append(item)
        return items
