I want to get the website addresses of some jobs, so I wrote a Scrapy spider. I want to extract every value matching the XPath //article/dl/dd/h2/a[@class="job-title"]/@href, but when I execute the spider with the command:
scrapy crawl auseek -a addsthreshold=3
the variable "urls" used to hold the values is empty. Can someone help me figure it out?
Here is my code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.conf import settings
from scrapy.mail import MailSender
from scrapy.xlib.pydispatch import dispatcher
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from scrapy import log
from scrapy import signals
from myProj.items import ADItem
import time
import urlparse


class AuSeekSpider(CrawlSpider):
    name = "auseek"
    result_address = []
    addressCount = int(0)
    addressThresh = int(0)
    allowed_domains = ["seek.com.au"]
    start_urls = [
        "http://www.seek.com.au/jobs/in-australia/"
    ]

    def __init__(self, **kwargs):
        super(AuSeekSpider, self).__init__()
        self.addressThresh = int(kwargs.get('addsthreshold'))
        print 'init finished...'

    def parse_start_url(self, response):
        print 'This is start url function'
        log.msg("Pipeline.spider_opened called", level=log.INFO)
        hxs = Selector(response)
        urls = hxs.xpath('//article/dl/dd/h2/a[@class="job-title"]/@href').extract()
        print 'urls is:', urls
        print 'test element:', urls[0].encode("ascii")
        for url in urls:
            url = urlparse.urljoin(response.url, url)
            yield Request(url, callback=self.parse_ad)
        return

    def parse_ad(self, response):
        print 'this is parse_ad function'
        hxs = Selector(response)
        item = ADItem()
        log.msg("Pipeline.parse_ad called", level=log.INFO)
        item['name'] = str(self.name)
        item['picNum'] = str(6)
        item['link'] = response.url
        item['date'] = time.strftime('%Y%m%d', time.localtime(time.time()))

        self.addressCount = self.addressCount + 1
        if self.addressCount > self.addressThresh:
            raise CloseSpider('Get enough website address')
        return item
The problem is this line:
urls = hxs.xpath('//article/dl/dd/h2/a[@class="job-title"]/@href').extract()
urls is empty when I try to print it out. I just can't figure out why it doesn't work or how to correct it. Thanks for your help.
Here is a working example using Selenium and the PhantomJS headless webdriver in a downloader middleware.
from scrapy.http import HtmlResponse
from selenium import webdriver


class JsDownload(object):

    @check_spider_middleware
    def process_request(self, request, spider):
        driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs.exe')
        driver.get(request.url)
        return HtmlResponse(request.url, encoding='utf-8', body=driver.page_source.encode('utf-8'))
I wanted the ability to tell different spiders which middleware to use, so I implemented this wrapper:
import functools

from scrapy import log


def check_spider_middleware(method):

    @functools.wraps(method)
    def wrapper(self, request, spider):
        msg = '%%s %s middleware step' % (self.__class__.__name__,)
        if self.__class__ in spider.middleware:
            spider.log(msg % 'executing', level=log.DEBUG)
            return method(self, request, spider)
        else:
            spider.log(msg % 'skipping', level=log.DEBUG)
            return None

    return wrapper
settings.py:
DOWNLOADER_MIDDLEWARES = {'MyProj.middleware.MiddleWareModule.MiddleWareClass': 500}
For the wrapper to work, all spiders must have at minimum:
middleware = set([])
To include a middleware:
middleware = set([MyProj.middleware.ModuleName.ClassName])
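As a minimal sketch, a spider that opts in might look like this (the import path mirrors the placeholder used in settings.py above; all names here are assumptions, adjust them to your project):

import scrapy
from MyProj.middleware.MiddleWareModule import MiddleWareClass


class OptInSpider(scrapy.Spider):
    name = "opt_in_example"
    start_urls = ["http://www.seek.com.au/jobs/in-australia/"]

    # the check_spider_middleware wrapper looks for the middleware class in this set
    middleware = set([MiddleWareClass])

    def parse(self, response):
        # response.body is now the Selenium-rendered HTML
        pass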
You could have implemented this in a request callback (in the spider), but then the HTTP request would happen twice. This isn't a foolproof solution, but it works for content that loads on .ready(). If you spend some time reading into Selenium you can wait for specific events to trigger before saving the page source, as sketched below.
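For example, an explicit wait might look like this (a sketch: it assumes PhantomJS is installed and that the job links render as a.job-title anchors, matching the question's XPath):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait for the job links to be present before grabbing page_source
driver = webdriver.PhantomJS()
driver.get("http://www.seek.com.au/jobs/in-australia/")
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "a.job-title"))
)
html = driver.page_source
driver.quit()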
Another example: https://github.com/scrapinghub/scrapyjs
More info: What's the best way of scraping data from a website?
Cheers!
Scrapy does not evaluate Javascript. If you run the following command, you will see that the raw HTML does not contain the anchors you are looking for.
curl http://www.seek.com.au/jobs/in-australia/ | grep job-title
You should try PhantomJS or Selenium instead.
After examining the network requests in Chrome, the job listings appear to originate from this JSONP request. It should be easy to retrieve whatever you need from it.
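A sketch of how that could look in a spider; the endpoint URL, the callback wrapper, and the "jobs"/"url" keys below are placeholders, not the real ones:

import json
import re

import scrapy


class SeekJsonSpider(scrapy.Spider):
    name = "seek_json"
    # placeholder endpoint; use the JSONP URL found in the network panel
    start_urls = ["http://www.seek.com.au/path/to/jsonp-endpoint?callback=cb"]

    def parse(self, response):
        # strip the "cb( ... )" callback wrapper to get plain JSON
        match = re.search(r'^[^(]*\((.*)\)\s*;?\s*$', response.text, re.DOTALL)
        data = json.loads(match.group(1))
        for job in data.get("jobs", []):   # key names assumed
            yield {"url": job.get("url")}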
Related
I have two spiders inheriting from a parent spider class as follows:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess


class SpiderOpTest(CrawlSpider):

    custom_settings = {
        "USER_AGENT": "*",
        "LOG_LEVEL": "WARNING",
        "DOWNLOADER_MIDDLEWARES": {'scraper_scrapy.odds.middlewares.SeleniumMiddleware': 543},
    }

    httperror_allowed_codes = [301]

    def parse_tournament(self, response):
        print(f"Parsing tournament - {response.url}")

    def parse_tournament_page(self, response):
        print(f"Parsing tournament page - {response.url}")


class SpiderOpTest1(SpiderOpTest):

    name = "test_1"
    start_urls = ["https://www.oddsportal.com/tennis/argentina/atp-buenos-aires/results/"]

    rules = (Rule(LinkExtractor(allow="/page/"), callback="parse_tournament_page"),)


class SpiderOpTest2(SpiderOpTest):

    name = "test_2"
    start_urls = ["https://www.oddsportal.com/tennis/results/"]

    rules = (
        Rule(LinkExtractor(allow="/atp-buenos-aires/results/"), callback="parse_tournament", follow=True),
        Rule(LinkExtractor(allow="/page/"), callback="parse_tournament_page"),
    )


process = CrawlerProcess()
process.crawl(<spider_class>)
process.start()
The parse_tournament_page callback for the Rule in the first spider works fine.
However, the second spider only runs the parse_tournament callback from the first Rule, despite the fact that the second Rule is the same as in the first spider and is operating on the same page.
I'm clearly missing something really simple, but for the life of me I can't figure out what it is...
As key bits of the pages load via JavaScript, it might be useful for me to include the Selenium middleware I'm using:
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        self.driver.get(request.url)
        return HtmlResponse(
            self.driver.current_url,
            body=self.driver.page_source,
            encoding='utf-8',
            request=request,
        )

    def spider_opened(self, spider):
        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")
        self.driver = webdriver.Firefox(options=options)

    def spider_closed(self, spider):
        self.driver.close()
Edit:
So I've managed to create a third spider which is able to execute the parse_tournament_page callback from inside parse_tournament:
class SpiderOpTest3(SpiderOpTest):

    name = "test_3"
    start_urls = ["https://www.oddsportal.com/tennis/results/"]
    httperror_allowed_codes = [301]

    rules = (
        Rule(
            LinkExtractor(allow="/atp-buenos-aires/results/"),
            callback="parse_tournament",
            follow=True,
        ),
    )

    def parse_tournament(self, response):
        print(f"Parsing tournament - {response.url}")

        xtr = LinkExtractor(allow="/page/")
        links = xtr.extract_links(response)

        for p in links:
            yield response.follow(p.url, dont_filter=True, callback=self.parse_tournament_page)

    def parse_tournament_page(self, response):
        print(f"Parsing tournament PAGE - {response.url}")
The key here seems to be dont_filter=True - if this is left as the default False then the parse_tournament_page callback isn't executed. This suggests Scrapy is somehow interpreting the second page as a duplicate, which as far as I can tell it isn't. That aside, from what I've read, if I want to get around this then I need to add unique=False to the LinkExtractor. However, doing this doesn't result in the parse_tournament_page callback executing :(
Update:
So I think I've found the source of the issue. From what I can tell the request_fingerprint method of RFPDupeFilter creates the same hash for https://www.oddsportal.com/tennis/argentina/atp-buenos-aires/results/ as https://www.oddsportal.com/tennis/argentina/atp-buenos-aires/results/#/page/2/.
From reading around, I gather I need to subclass RFPDupeFilter to reconfigure the way request_fingerprint works. Any advice on why the same hashes are being generated and/or tips on how to subclass correctly would be greatly appreciated!
The difference between the two URLs mentioned in the update is in the fragment #/page/2/. Scrapy ignores fragments by default: "Also, servers usually ignore fragments in urls when handling requests, so they are also ignored by default when calculating the fingerprint. If you want to include them, set the keep_fragments argument to True (for instance when handling requests with a headless browser)." (from scrapy/utils/request.py)
Check the DUPEFILTER_CLASS setting for more information.
The request_fingerprint function from scrapy.utils.request can already handle fragments; when subclassing the dupefilter, pass keep_fragments=True.
Add your class to the custom_settings of SpiderOpTest.
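A minimal sketch of such a dupefilter, assuming a Scrapy version whose request_fingerprint accepts keep_fragments (the module path is up to you):

from scrapy.dupefilters import RFPDupeFilter
from scrapy.utils.request import request_fingerprint


class FragmentAwareDupeFilter(RFPDupeFilter):
    # keep URL fragments (e.g. #/page/2/) when computing fingerprints
    def request_fingerprint(self, request):
        return request_fingerprint(request, keep_fragments=True)

It would then be referenced from SpiderOpTest, e.g. by adding "DUPEFILTER_CLASS": "myproject.dupefilters.FragmentAwareDupeFilter" to custom_settings (path assumed).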
I have two spiders in one Scrapy project. Spider1 crawls a list of pages or an entire website and analyzes the content. Spider2 uses Splash to fetch URLs on Google and passes that list to Spider1.
So Spider1 crawls and analyzes content, and can be used without being called by Spider2.
# coding: utf8
from scrapy.spiders import CrawlSpider
import scrapy


class Spider1(scrapy.Spider):
    name = "spider1"
    tokens = []
    query = ''

    def __init__(self, *args, **kwargs):
        '''
        This spider works with two modes,
        if only one URL it crawls the entire website,
        if a list of URLs only analyze the page
        '''
        super(Spider1, self).__init__(*args, **kwargs)
        start_url = kwargs.get('start_url') or ''
        start_urls = kwargs.get('start_urls') or []
        query = kwargs.get('q') or ''
        if query != '':
            self.query = query
        if start_url != '':
            self.start_urls = [start_url]
        if len(start_urls) > 0:
            self.start_urls = start_urls

    def parse(self, response):
        '''
        Analyze and store data
        '''
        if len(self.start_urls) == 1:
            for next_page in response.css('a::attr("href")'):
                yield response.follow(next_page, self.parse)

    def closed(self, reason):
        '''
        Finalize crawl
        '''
The code for Spider2
# coding: utf8
import scrapy
from scrapy_splash import SplashRequest
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class Spider2(scrapy.Spider):
    name = "spider2"
    urls = []
    page = 0

    def __init__(self, *args, **kwargs):
        super(Spider2, self).__init__(*args, **kwargs)
        self.query = kwargs.get('q')
        self.url = kwargs.get('url')
        self.start_urls = ['https://www.google.com/search?q=' + self.query]

    def start_requests(self):
        splash_args = {
            'wait': 2,
        }
        for url in self.start_urls:
            splash_args = {
                'wait': 1,
            }
            yield SplashRequest(url, self.parse, args=splash_args)

    def parse(self, response):
        '''
        Extract URLs to self.urls
        '''
        self.page += 1

    def closed(self, reason):
        process = CrawlerProcess(get_project_settings())
        for url in self.urls:
            print(url)
        if len(self.urls) > 0:
            process.crawl('lexi', start_urls=self.urls, q=self.query)
            process.start(False)
When running Spider2 I get this error: twisted.internet.error.ReactorAlreadyRunning, and Spider1 is called without the list of URLs.
I tried using CrawlerRunner as advised by the Scrapy documentation, but it's the same problem.
I tried using CrawlerProcess inside the parse method; it "works", but I still get the error message. When using CrawlerRunner inside the parse method, it doesn't work.
Currently it is not possible to start a spider from another spider if you're using the scrapy crawl command (see https://github.com/scrapy/scrapy/issues/1226). It is possible to start a spider from a spider if you write a startup script yourself - the trick is to use the same CrawlerProcess/CrawlerRunner instance.
I'd not do that though; you're fighting against the framework. It'd be nice to support this use case, but it is not really supported now.
An easier way is to either rewrite your code to use a single Spider class, or to create a script (bash, Makefile, luigi/airflow if you want to be fancy) which runs scrapy crawl spider1 -o items.jl followed by scrapy crawl spider2; the second spider can read the items created by the first spider and generate start_requests accordingly.
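For that chained-commands approach, a sketch of how the second spider could generate its requests from the first spider's output (the items.jl file name and the "url" field are assumptions):

import json

import scrapy


class FollowUpSpider(scrapy.Spider):
    name = "follow_up"

    def start_requests(self):
        # read the JSON-lines output written by the previous "scrapy crawl ... -o items.jl" run
        with open("items.jl") as f:
            for line in f:
                item = json.loads(line)
                yield scrapy.Request(item["url"], callback=self.parse)

    def parse(self, response):
        pass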
FTR: combining SplashRequests and regular scrapy.Requests in a single spider is fully supported (it should just work); you don't have to create separate spiders for them.
I am trying to crawl the forum category of craigslist.org (https://forums.craigslist.org/).
My spider:
import scrapy
from scrapy.http import Request


class CraigslistSpider(scrapy.Spider):
    name = "craigslist"
    allowed_domains = ["forums.craigslist.org"]
    start_urls = ['http://geo.craigslist.org/iso/us/']

    def error_handler(self, failure):
        print failure

    def parse(self, response):
        yield Request('https://forums.craigslist.org/',
                      self.getForumPage,
                      dont_filter=True,
                      errback=self.error_handler)

    def getForumPage(self, response):
        print "forum page"
I get this message from the error callback:
[Failure instance: Traceback: :
/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py:455:callback
/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py:563:_startRunCallbacks
/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py:649:_runCallbacks
/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py:1316:gotResult
--- ---
/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py:1258:_inlineCallbacks
/usr/local/lib/python2.7/site-packages/twisted/python/failure.py:389:throwExceptionIntoGenerator
/usr/local/lib/python2.7/site-packages/scrapy/core/downloader/middleware.py:37:process_request
/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py:649:_runCallbacks
/usr/local/lib/python2.7/site-packages/scrapy/downloadermiddlewares/robotstxt.py:46:process_request_2
]
But I have this problem only with the forum section of Craigslist. It might be because the forum section uses https, in contrast to the rest of the website.
So it seems impossible to get a response...
Any ideas?
I am posting a solution that I found to get around the problem.
I used the urllib2 library. Look:
import urllib2

import scrapy
from scrapy.http import HtmlResponse


class CraigslistSpider(scrapy.Spider):
    name = "craigslist"
    allowed_domains = ["forums.craigslist.org"]
    start_urls = ['http://geo.craigslist.org/iso/us/']

    def error_handler(self, failure):
        print failure

    def parse(self, response):
        # Get a valid request with urllib2
        req = urllib2.Request('https://forums.craigslist.org/')
        # Get the content of this request
        pageContent = urllib2.urlopen(req).read()
        # Parse the content in a HtmlResponse compatible with Scrapy
        response = HtmlResponse(url=response.url, body=pageContent)
        print response.css(".forumlistcolumns li").extract()
With this solution, you can wrap a good request in a valid Scrapy response and use it normally.
There is probably a better method but this one is functional.
I think you are dealing with robots.txt. Try running your spider with
custom_settings = {
    "ROBOTSTXT_OBEY": False
}
You can also test it using command line settings: scrapy crawl craigslist -s ROBOTSTXT_OBEY=False.
I am trying to scrape a link which has an AJAX call for pagination.
I am trying to crawl the http://www.demo.com link, and in the .py file I provided this code to restrict the XPath. The code is:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from sum.items import sumItem


class Sumspider1(CrawlSpider):
    name = 'sumDetailsUrls'
    allowed_domains = ['sum.com']
    start_urls = ['http://www.demo.com']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='.//ul[@id="pager"]/li[8]/a'), callback='parse_start_url', follow=True),
    )

    # use parse_start_url if your spider wants to crawl from the first page, so overriding
    def parse_start_url(self, response):
        print '********************************************1**********************************************'
        # //div[@class="showMoreCars hide"]/a
        # .//ul[@id="pager"]/li[8]/a/@href
        self.log('Inside - parse_item %s' % response.url)
        hxs = HtmlXPathSelector(response)
        item = sumItem()
        item['page'] = response.url
        title = hxs.xpath('.//h1[@class="page-heading"]/text()').extract()
        print '********************************************title**********************************************', title
        urls = hxs.xpath('.//a[@id="linkToDetails"]/@href').extract()
        print '**********************************************2***url*****************************************', urls
        finalurls = []
        for url in urls:
            print '---------url-------', url
            finalurls.append(url)
        item['urls'] = finalurls
        return item
My items.py file contains
from scrapy.item import Item, Field


class sumItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    page = Field()
    urls = Field()
Still, I'm not getting the expected output and am not able to fetch all pages when I crawl it.
I hope the below code will help.
somespider.py
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.spider import BaseSpider
from demo.items import DemoItem
from selenium import webdriver


def removeUnicodes(strData):
    if(strData):
        strData = strData.encode('utf-8').strip()
        strData = re.sub(r'[\n\r\t]', r' ', strData.strip())
    return strData


class demoSpider(scrapy.Spider):
    name = "domainurls"
    allowed_domains = ["domain.com"]
    start_urls = ['http://www.domain.com/used/cars-in-trichy/']

    def __init__(self):
        self.driver = webdriver.Remote("http://127.0.0.1:4444/wd/hub", webdriver.DesiredCapabilities.HTMLUNITWITHJS)

    def parse(self, response):
        self.driver.get(response.url)
        self.driver.implicitly_wait(5)
        hxs = Selector(response)
        item = DemoItem()
        finalurls = []
        while True:
            try:
                next = self.driver.find_element_by_xpath('//div[@class="showMoreCars hide"]/a')
                next.click()
                # get the data and write it to scrapy items
                item['pageurl'] = response.url
                item['title'] = removeUnicodes(hxs.xpath('.//h1[@class="page-heading"]/text()').extract()[0])
                urls = self.driver.find_elements_by_xpath('.//a[@id="linkToDetails"]')
                for url in urls:
                    url = url.get_attribute("href")
                    finalurls.append(removeUnicodes(url))
                item['urls'] = finalurls
            except:
                break

        self.driver.close()
        return item
items.py
from scrapy.item import Item, Field


class DemoItem(Item):
    page = Field()
    urls = Field()
    pageurl = Field()
    title = Field()
Note:
You need to have a Selenium RC server running, because HTMLUNITWITHJS works with Selenium RC only when using Python.
Run your Selenium RC server by issuing the command:
java -jar selenium-server-standalone-2.44.0.jar
Run your spider using the command:
scrapy crawl domainurls -o someoutput.json
You can check with your browser how the requests are made.
Behind the scenes, right after you click on that "show more cars" button, your browser requests JSON data to feed your next page. You can take advantage of this fact and deal directly with the JSON data, without the need for a JavaScript engine such as Selenium or PhantomJS.
In your case, as a first step you should simulate a user scrolling down the page given by your start_url parameter and profile your network requests at the same time, to discover the endpoint the browser uses to request that JSON. To discover this endpoint there is generally an XHR (XMLHttpRequest) section in the browser's developer tools, as in Safari, where you can navigate through all the resources/endpoints used to request the data.
Once you discover this endpoint it's a straightforward task: you give your spider the endpoint you just discovered as start_url, and as you process and navigate through the JSON responses you can discover whether there is a next page to request.
P.S.: I checked for you; the endpoint URL is http://www.carwale.com/webapi/classified/stockfilters/?city=194&kms=0-&year=0-&budget=0-&pn=2
In this case my browser requested the second page, as you can see in the pn parameter. It is important that you set some header parameters before you send the request. I noticed in your case the headers are:
Accept: text/plain, */*; q=0.01
Referer: http://www.carwale.com/used/cars-in-trichy/
X-Requested-With: XMLHttpRequest
sourceid: 1
User-Agent: Mozilla/5.0...
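A sketch of requesting that endpoint directly from a spider, using the headers above; the JSON keys ("stocks") and the stopping condition are assumptions:

import json

import scrapy


class CarwaleJsonSpider(scrapy.Spider):
    name = "carwale_json"
    # endpoint taken from the network profile; page number goes in "pn"
    endpoint = ("http://www.carwale.com/webapi/classified/stockfilters/"
                "?city=194&kms=0-&year=0-&budget=0-&pn=%d")

    def start_requests(self):
        yield self.page_request(1)

    def page_request(self, page):
        headers = {
            "Accept": "text/plain, */*; q=0.01",
            "Referer": "http://www.carwale.com/used/cars-in-trichy/",
            "X-Requested-With": "XMLHttpRequest",
            "sourceid": "1",
        }
        return scrapy.Request(self.endpoint % page, headers=headers,
                              callback=self.parse, meta={"page": page})

    def parse(self, response):
        data = json.loads(response.text)
        stocks = data.get("stocks", [])   # key name assumed
        for stock in stocks:
            yield stock
        if stocks:                        # stop when a page comes back empty (assumed)
            yield self.page_request(response.meta["page"] + 1)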
I am using a sitemap spider in Scrapy (Python).
The sitemap seems to have an unusual format, with '//' in front of the URLs:
<url>
<loc>//www.example.com/10/20-baby-names</loc>
</url>
<url>
<loc>//www.example.com/elizabeth/christmas</loc>
</url>
myspider.py
from scrapy.contrib.spiders import SitemapSpider
from myspider.items import *


class MySpider(SitemapSpider):
    name = "myspider"
    sitemap_urls = ["http://www.example.com/robots.txt"]

    def parse(self, response):
        item = PostItem()
        item['url'] = response.url
        item['title'] = response.xpath('//title/text()').extract()
        return item
I am getting this error:
raise ValueError('Missing scheme in request url: %s' % self._url)
exceptions.ValueError: Missing scheme in request url: //www.example.com/10/20-baby-names
How can I manually parse the url using sitemap spider?
If I see it correctly, you could (for a quick solution) override the default implementation of _parse_sitemap in SitemapSpider. It's not nice, because you will have to copy a lot of code, but should work.
You'll have to add a method to generate a URL with scheme.
"""if the URL starts with // take the current website scheme and make an absolute
URL with the same scheme"""
def _fix_url_bug(url, current_url):
if url.startswith('//'):
':'.join((urlparse.urlsplit(current_url).scheme, url))
else:
yield url
def _parse_sitemap(self, response):
if response.url.endswith('/robots.txt'):
for url in sitemap_urls_from_robots(response.body)
yield Request(url, callback=self._parse_sitemap)
else:
body = self._get_sitemap_body(response)
if body is None:
log.msg(format="Ignoring invalid sitemap: %(response)s",
level=log.WARNING, spider=self, response=response)
return
s = Sitemap(body)
if s.type == 'sitemapindex':
for loc in iterloc(s):
# added it before follow-test, to allow test to return true
# if it includes the scheme (yet do not know if this is the better solution)
loc = _fix_url_bug(loc, response.url)
if any(x.search(loc) for x in self._follow):
yield Request(loc, callback=self._parse_sitemap)
elif s.type == 'urlset':
for loc in iterloc(s):
loc = _fix_url_bug(loc, response.url) # same here
for r, c in self._cbs:
if r.search(loc):
yield Request(loc, callback=c)
break
This is just a general idea and untested. So it could both either totally not work or there could be syntax errors. Please respond via comments, so I can improve my answer.
The sitemap you are trying to parse seems to be wrong. Per the RFC, a missing scheme is perfectly fine in general, but sitemaps require URLs to begin with a scheme.
I think the nicest and cleanest solution would be to add a downloader middleware which changes the malformed URLs without the spider noticing.
import re
import urlparse

from scrapy.http import XmlResponse
from scrapy.utils.gz import gunzip, is_gzipped
from scrapy.contrib.spiders import SitemapSpider


# downloader middleware
class SitemapWithoutSchemeMiddleware(object):
    def process_response(self, request, response, spider):
        if isinstance(spider, SitemapSpider):
            body = self._get_sitemap_body(response)
            if body:
                scheme = urlparse.urlsplit(response.url).scheme
                body = re.sub(r'<loc>\/\/(.+)<\/loc>', r'<loc>%s://\1</loc>' % scheme, body)
                return response.replace(body=body)
        return response

    # this is from scrapy's Sitemap class, but sitemap is
    # only for internal use and its api can change without
    # notice
    def _get_sitemap_body(self, response):
        """Return the sitemap body contained in the given response, or None if the
        response is not a sitemap.
        """
        if isinstance(response, XmlResponse):
            return response.body
        elif is_gzipped(response):
            return gunzip(response.body)
        elif response.url.endswith('.xml'):
            return response.body
        elif response.url.endswith('.xml.gz'):
            return gunzip(response.body)
I used the trick by @alecxe to parse the URLs within the spider. I made it work, but I am not sure if it is the best way to do it.
from urlparse import urlparse
import re

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.utils.response import body_or_str
from example.items import *


class ExampleSpider(BaseSpider):
    name = "example"
    start_urls = ["http://www.example.com/sitemap.xml"]

    def parse(self, response):
        nodename = 'loc'
        text = body_or_str(response)
        r = re.compile(r"(<%s[\s>])(.*?)(</%s>)" % (nodename, nodename), re.DOTALL)
        for match in r.finditer(text):
            url = match.group(2)
            if url.startswith('//'):
                url = 'http:' + url
            yield Request(url, callback=self.parse_page)

    def parse_page(self, response):
        # print response.url
        item = PostItem()
        item['url'] = response.url
        item['title'] = response.xpath('//title/text()').extract()
        return item