I am having issues with pagination in the following code.
The spider starts but does not find any links on the first page. This is because the page actually returns a partial result. I know it sounds odd, but it's true: when I visit the page I see jobs listed, but when the bot visits, there are no jobs listed.
From what I understand, Scrapy will load the entire page regardless of JS or AJAX, but I am starting to wonder...
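A quick way to check what the spider actually downloads is to open the URL in scrapy shell; no JavaScript runs there, so if the job links are injected by AJAX the selector will come back empty. For example:

scrapy shell 'https://ngc.taleo.net/careersection/ngc_pro/jobsearch.ftl?lang=en#'
>>> view(response)   # opens the downloaded HTML, as the bot sees it, in a browser
>>> response.xpath('//th/div/div/span/a/@href').extract()   # likely empty if the listings are loaded via AJAX

Here is the spider: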
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.http.request import Request
from northrop.items import NorthropItem
from scrapy.http import HtmlResponse
from scrapy.exceptions import CloseSpider
import re


class NorthropSpider(CrawlSpider):
    name = "northropJobStart"
    start_urls = ['https://ngc.taleo.net/careersection/ngc_pro/jobsearch.ftl?lang=en#']
    allowed_domains = ["ngc.taleo.net"]

    rules = (
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="next"]/a',)), callback="parse_listings", follow=True),
    )

    def parse_start_url(self, response):
        return self.parse_listings(response)

    def parse_listings(self, response):
        sel = Selector(response)
        # There are no jobs listed.. I am lost.....
        jobs = sel.xpath('//th/div/div/span/a/@href').extract()
        for job_url in jobs:
            job_url = self.__normalise(job_url)
            job_url = self.__to_absolute_url(response.url, job_url)
            yield Request(job_url, callback=self.parse_details)

    def parse_details(self, response):
        sel = Selector(response)
        job = sel.xpath('//*[@id="mainbody-jobs"]')
        item = NorthropItem()
        # Populate job fields
        item['title'] = job.xpath('//*[@id="mainbody-jobs"]/h1/text()').extract()
        item['location'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[3]/div[2]/text()').extract()
        item['applink'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[1]/a/@href').extract()
        item['description'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[2]/div[1]/div[2]').extract()
        item['travel'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[5]/div[2]/text()').extract()
        item['job_category'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[2]/div[2]/text()').extract()
        item['clearance_have'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[8]/div[2]/text()').extract()
        item['clearance_get'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[8]/div[2]/text()').extract()
        item['job_number'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[1]/div[2]/text()').extract()
        item['page_url'] = response.url
        item = self.__normalise_item(item, response.url)
        return item

    def __normalise_item(self, item, base_url):
        '''
        Standardise and format item fields
        '''
        # Loop item fields to sanitise data and standardise data types
        for key, value in vars(item).values()[0].iteritems():
            item[key] = self.__normalise(item[key])
        # Convert job URL from relative to absolute URL
        #item['job_url'] = self.__to_absolute_url(base_url, item['job_url'])
        return item

    def __normalise(self, value):
        # Convert list to string
        value = value if type(value) is not list else ' '.join(value)
        # Trim leading and trailing special characters (whitespaces, newlines, spaces, tabs, carriage returns)
        value = value.strip()
        return value

    def __to_absolute_url(self, base_url, link):
        '''
        Convert relative URL to absolute URL
        '''
        import urlparse
        link = urlparse.urljoin(base_url, link)
        return link

    def __to_int(self, value):
        '''
        Convert value to integer type
        '''
        try:
            value = int(value)
        except ValueError:
            value = 0
        return value

    def __to_float(self, value):
        '''
        Convert value to float type
        '''
        try:
            value = float(value)
        except ValueError:
            value = 0.0
        return value
Unfortunately the search form is hidden quite deep, but you can see it in the Network tab of your browser's inspector.
It turns out the page sends a full JSON body of default search parameters, so you pretty much need to copy and paste it, only incrementing pageNo. I couldn't help but solve it, and before I knew it I had written a whole spider, so here it is; let me know if some parts are unclear:
import json
import scrapy


class TaleoSpider(scrapy.Spider):
    name = 'taleo'
    start_urls = ['https://ngc.taleo.net/careersection/ngc_pro/jobsearch.ftl?lang=en#']

    # base form with the default search values
    base_form = {
        'advancedSearchFiltersSelectionParam': {
            'searchFilterSelections': [
                {'id': 'ORGANIZATION', 'selectedValues': []},
                {'id': 'LOCATION', 'selectedValues': []},
                {'id': 'JOB_FIELD', 'selectedValues': []},
                {'id': 'URGENT_JOB', 'selectedValues': []},
                {'id': 'EMPLOYEE_STATUS', 'selectedValues': []},
                {'id': 'STUDY_LEVEL', 'selectedValues': []},
                {'id': 'WILL_TRAVEL', 'selectedValues': []},
                {'id': 'JOB_SHIFT', 'selectedValues': []},
                {'id': 'JOB_NUMBER', 'selectedValues': []}]},
        'fieldData': {'fields': {'JOB_TITLE': '', 'KEYWORD': '', 'LOCATION': ''},
                      'valid': True},
        'filterSelectionParam': {
            'searchFilterSelections': [
                {'id': 'POSTING_DATE', 'selectedValues': []},
                {'id': 'LOCATION', 'selectedValues': []},
                {'id': 'JOB_FIELD', 'selectedValues': []},
                {'id': 'JOB_TYPE', 'selectedValues': []},
                {'id': 'JOB_SCHEDULE', 'selectedValues': []},
                {'id': 'JOB_LEVEL', 'selectedValues': []}]},
        'multilineEnabled': False,
        'pageNo': 1,  # <--- change this for pagination
        'sortingSelection': {'ascendingSortingOrder': 'false',
                             'sortBySelectionParam': '3'}}

    def parse(self, response):
        # we got cookies from the first start url, now let's request the search api
        # copy the base form for the first request
        form = self.base_form.copy()
        yield scrapy.Request(
            'https://ngc.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=2160420105',
            body=json.dumps(self.base_form),
            # add headers to indicate we are sending a json package
            headers={'Content-Type': 'application/json',
                     'X-Requested-With': 'XMLHttpRequest'},
            # scrapy.Request defaults to 'GET', but we want 'POST' here
            method='POST',
            # load our form into meta so we can reuse it later
            meta={'form': form},
            callback=self.parse_items)

    def parse_items(self, response):
        data = json.loads(response.body)
        # scrape data
        for item in data['requisitionList']:
            yield item
        # next page
        # get our form back and update the page number in it
        form = response.meta['form']
        form['pageNo'] += 1
        # check if paging is over: is our next page higher than the maximum page?
        max_page = data['pagingData']['totalCount'] / data['pagingData']['pageSize']
        if form['pageNo'] > max_page:
            return
        yield scrapy.Request(
            'https://ngc.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=2160420105',
            body=json.dumps(form),
            headers={'Content-Type': 'application/json',
                     'X-Requested-With': 'XMLHttpRequest'},
            method='POST',
            meta={'form': form},
            callback=self.parse_items)
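One caveat with the paging check above: plain division can drop the last, partially filled page (and in Python 3 it yields a float). A defensive variant, assuming the same pagingData fields as in the response shown, would be:

import math

# round up so a final page with fewer than pageSize results is still requested
max_page = math.ceil(data['pagingData']['totalCount'] / data['pagingData']['pageSize'])
if form['pageNo'] > max_page:
    return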
Related question:
I am trying to pull data from this site: https://inform.alabama.gov/employeesearch.aspx. The current query that I have below works up to and including page 11. I think the issue resides with the "__VIEWSTATE" form element. It doesn't appear to change with each request. It should represent the current response page in the loop so the server knows how to interpret the subsequent response. It seems to only render the value present on the first response, so I believe that the server is rejecting it because page 12 is not a valid pathway from pages 1-10. If you take a look at the pagination it goes from 1 to ..., where the ... renders page 11. When page 11 is rendered, it changes the pagination to: 11 to ..., where the ... renders page 21.
Note that num_pages defines the total page count. Currently set to 15, it processes pages 1-11 and returns 302 errors for the other pages.
How should this be modified to yield the results for all 661 pages?
from scrapy import FormRequest, Spider
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError


class EmployeesSpider(Spider):
    name = 'employees'
    start_urls = ['https://inform.alabama.gov/employeesearch.aspx']
    num_pages = 15  # 661
    name_excludes = ['', ' ', '1']

    def parse(self, response):
        formdata = self.get_formdata(response, 0)
        formdata['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$btn_Search'
        formdata['__EVENTARGUMENT'] = ''
        yield FormRequest(
            url='https://inform.alabama.gov/employeesearch.aspx',
            method="POST",
            dont_filter=True,
            formdata=formdata,
            callback=self.perform_search,
            errback=self.failure)

    def perform_search(self, response):
        for employee in response.xpath('//*[@id="ContentPlaceHolder1_GridView1"]//tr'):
            emp_name = employee.xpath('.//td[1]//text()').get()
            if emp_name is not None and emp_name not in self.name_excludes:
                final_name = emp_name.strip()
                yield {
                    'name': final_name,
                    'email': employee.xpath('.//td[1]//span//a//text()').get(),
                    'org': employee.xpath('.//td[2]//text()').get(),
                    'phone': employee.xpath('.//td[3]//span//a//text()').get(),
                }

        # Download search pages starting from #2
        for i in range(2, self.num_pages):
            formdata = self.get_formdata(response, i)
            yield FormRequest(
                url='https://inform.alabama.gov/employeesearch.aspx',
                method="POST",
                dont_filter=True,
                formdata=formdata,
                callback=self.parse_results,
                errback=self.failure)

    def get_formdata(self, response, page_num):
        eventargument = 'Page$' + str(page_num)
        viewstate = response.css('input#__VIEWSTATE::attr(value)').get()
        if viewstate is None:
            viewstate = ''
        viewstategen = response.css('input#__VIEWSTATEGENERATOR::attr(value)').get()
        if viewstategen is None:
            viewstategen = ''
        eventvalidation = response.css('input#__EVENTVALIDATION::attr(value)').get()
        if eventvalidation is None:
            eventvalidation = ''
        formdata = {
            '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$GridView1',
            '__EVENTARGUMENT': eventargument,
            '__VIEWSTATE': viewstate,
            '__VIEWSTATEGENERATOR': viewstategen,
            '__EVENTVALIDATION': eventvalidation,
            'ctl00%24ContentPlaceHolder1%24txt_FirstName': '',
            'ctl00%24ContentPlaceHolder1%24txt_LastName': '',
            'ctl00%24ContentPlaceHolder1%24ddl_Agency': 'Not+Selected',
            'ctl00%24ContentPlaceHolder1%24txt_Phone': '',
        }
        return formdata

    def parse_results(self, response):
        for employee in response.xpath('//*[@id="ContentPlaceHolder1_GridView1"]//tr'):
            emp_name = employee.xpath('.//td[1]//text()').get()
            if emp_name is not None and emp_name not in self.name_excludes:
                final_name = emp_name.strip()
                yield {
                    'name': final_name,
                    'email': employee.xpath('.//td[1]//span//a//text()').get(),
                    'org': employee.xpath('.//td[2]//text()').get(),
                    'phone': employee.xpath('.//td[3]//span//a//text()').get(),
                }

    def failure(self, failure):
        # log all failures
        self.logger.error(repr(failure))
        # in case you want to do something special for some errors,
        # you may need the failure's type:
        if failure.check(HttpError):
            # these exceptions come from the HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)
First, you need to get the total number of pages.
Press Search, go to the last page, and save 661 to num_pages.
Then start from the beginning, now knowing how many pages you have:
from scrapy import FormRequest, Spider


class EmployeesSpider(Spider):
    name = 'employees'
    start_urls = ['https://inform.alabama.gov/employeesearch.aspx']
    num_pages = None
    name_excludes = ['', ' ', '1', None]

    def parse(self, response):
        yield FormRequest.from_response(
            response,
            dont_filter=True,
            formdata={
                '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$btn_Search',
                '__EVENTARGUMENT': ''
            },
            callback=self.perform_search
        )

    def perform_search(self, response):
        if not self.num_pages:
            # first get the total number of pages
            yield FormRequest.from_response(
                response,
                dont_filter=True,
                formdata={
                    '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$GridView1',
                    '__EVENTARGUMENT': 'Page$Last'
                },
                callback=self.parse_total_pages
            )
        else:
            for employee in response.xpath(
                '//table[@id="ContentPlaceHolder1_GridView1"]//'
                'tr[position() < last()]'
            ):
                emp_name = employee.xpath('.//td[1]//text()').get()
                if emp_name not in self.name_excludes:
                    final_name = emp_name.strip()
                    yield {
                        'name': final_name,
                        'email': employee.xpath('.//td[1]//span//a//text()').get(),
                        'org': employee.xpath('.//td[2]//text()').get(),
                        'phone': employee.xpath('.//td[3]//span//a//text()').get(),
                    }

            # go to the next page
            current_page = response.xpath(
                '//tr[@class="employeeSearchPagerStye"]/td//span/text()'
            ).get()
            current_page = int(current_page)
            if current_page < self.num_pages:
                yield FormRequest.from_response(
                    response,
                    dont_filter=True,
                    formdata={
                        '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$GridView1',
                        '__EVENTARGUMENT': f'Page${current_page + 1}'
                    },
                    callback=self.perform_search
                )

    def parse_total_pages(self, response):
        total_pages = response.xpath(
            '//tr[@class="employeeSearchPagerStye"]/td//span/text()'
        ).get()
        self.num_pages = int(total_pages)
        # back to search
        yield FormRequest.from_response(
            response,
            dont_filter=True,
            formdata={
                '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$btn_Search',
                '__EVENTARGUMENT': ''
            },
            callback=self.perform_search
        )
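Assuming the spider sits in a normal Scrapy project, you can run it and export the items in the usual way, for example:

scrapy crawl employees -o employees.csv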
I'm having trouble structuring scrapy data the way I want. My spider gets some data from one page, then follows a list of links on that page to extract a link from each of those pages.
def parse_page(self, response):
    links = response.css(LINK_SELECTOR).extract()
    data = {
        'name': response.css(NAME_SELECTOR).extract_first(),
        'date': response.css(DATE_SELECTOR).extract(),
    }
    for link in links:
        next_link = response.urljoin(link)
        yield scrapy.Request(next_link, callback=self.parse_url, meta={'data': data})

def parse_url(self, response):
    data = response.meta['data']
    data['url'] = response.css('a::attr(href)').get()
    yield data
What I would like is to get the data with the following structure:
{'name': name, 'date': date, 'url': [url1, url2, url3, url4]}
Instead of
{'name': name, 'date': date, 'url': url1}
{'name': name, 'date': date, 'url': url2}
{'name': name, 'date': date, 'url': url3}
{'name': name, 'date': date, 'url': url4}
I've tried using Items, but I don't see how to pass the data from parse_url back to parse_page. How would I do that?
Thanks in advance.
You can use scrapy's coroutine support to do this pretty easily.
The code would look something like this:
async def parse_page(self, response):
    ...
    for link in links:
        request = response.follow(link)
        response = await self.crawler.engine.download(request, self)
        urls.append(response.css('a::attr(href)').get())
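The snippet above is only a fragment. A more complete sketch of the same idea might look like the following; the selectors and start URL are placeholders, and the two-argument engine.download call matches the form used above (newer Scrapy releases take only the request):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://example.com/listing']   # placeholder URL

    async def parse(self, response):
        data = {
            'name': response.css('h1::text').get(),        # placeholder selectors
            'date': response.css('.date::text').getall(),
            'url': [],
        }
        for link in response.css('a.detail::attr(href)').getall():
            request = response.follow(link)
            # download the linked page inline and collect its URL field
            detail = await self.crawler.engine.download(request, self)
            data['url'].append(detail.css('a::attr(href)').get())
        yield data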
The following is one of the ways you can achieve that. The inline_requests library will help you get the expected output.
import scrapy
from scrapy.crawler import CrawlerProcess
from inline_requests import inline_requests


class YellowpagesSpider(scrapy.Spider):
    name = "yellowpages"
    start_urls = ["https://www.yellowpages.com/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771"]

    @inline_requests
    def parse(self, response):
        data = {
            'name': response.css(".sales-info > h1::text").get(),
            'phone': response.css(".contact > p.phone::text").get(),
            'target_link': []
        }
        for item_link in response.css(".review-info > a.author[href]::attr(href)").getall():
            resp = yield scrapy.Request(response.urljoin(item_link), meta={'handle_httpstatus_all': True})
            target_link = resp.css("a.review-business-name::attr(href)").get()
            data['target_link'].append(target_link)

        print(data)


if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'LOG_LEVEL': 'ERROR',
    })
    c.crawl(YellowpagesSpider)
    c.start()
Output it produces:
{'name': 'Honey Honey Cafe & Crepery', 'phone': '(415) 351-2423', 'target_link': ['/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771', '/walnut-ca/mip/akasaka-japanese-cuisine-455476824', '/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771']}
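(The decorator comes from the inline_requests library; if I recall the PyPI name correctly, it is installed with pip install scrapy-inline-requests.)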
I'm using scrapy to scrape data from a website. Here's my code:
import scrapy


class ShopSpider(scrapy.Spider):
    name = 'shop'
    allowed_domains = ['https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers']
    start_urls = ['http://https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers/']
    custom_settings = {
        'FEED_URI': 'tmp/shop.csv'
    }

    def parse(self, response):
        titles = response.css('img::attr(title)').extract()
        images = response.css('img::attr(data-img)').extract()
        prices = response.css('.p_price::text').extract()
        discounts = response.css('.prd_discount::text').extract()

        for item in zip(titles, prices, images, discounts):
            scraped_info = {
                'title': item[0],
                'price': item[1],
                'image_urls': [item[2]],  # Sets the url for scrapy to download images
                'discount': item[3]
            }

        yield scraped_info
Please check where I'm going wrong.
Also, I want to scrape all the data that keeps loading as the page is scrolled. How do I go about capturing everything that loads while scrolling?
You have problems with:
incorrect allowed_domains (only the domain is needed);
broken start_urls (http twice and a slash at the end);
wrong indents for yielding the item in the parse function.
Check the fixed code here:
import scrapy


class ShopSpider(scrapy.Spider):
    name = 'shop'
    allowed_domains = ['shopclues.com']
    start_urls = ['https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers']

    def parse(self, response):
        titles = response.css('img::attr(title)').extract()
        images = response.css('img::attr(data-img)').extract()
        prices = response.css('.p_price::text').extract()
        discounts = response.css('.prd_discount::text').extract()

        for item in zip(titles, prices, images, discounts):
            scraped_info = {
                'title': item[0],
                'price': item[1],
                'image_urls': [item[2]],  # Sets the url for scrapy to download images
                'discount': item[3]
            }
            yield scraped_info
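If the spider is saved as a standalone file, say shop.py (name assumed), it can be run with a feed export instead of the FEED_URI setting from the question:

scrapy runspider shop.py -o shop.csv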
When I execute this code I get results in the form {[text1, author1, tag1], [text2, author2, tag2], ...}:
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
But with the same approach for another URL (below) I get results as {[name1, name2, ...], [city1, city2, ...]}.
I want them in the form {[name1, city1], [name2, city2], ...}, as was happening with the code above.
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "student"
    start_urls = [
        'http://www.engineering.careers360.com/colleges/list-of-engineering-colleges-in-karnataka?sort_filter=alpha',
    ]

    def parse(self, response):
        for students in response.css('div.list-pages'):
            yield {
                'name': students.css('div.title a::text').extract(),
                'city': students.css('div.clg-state a::text').extract(),
            }
Your students selector is faulty:
for students in response.css('div.list-pages'):
This only selects the whole page.
What you are looking for here I think is:
for students in response.css('li.search-result'):
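From there, each field can be taken per result. A small sketch, assuming the div.title and div.clg-state classes from the question still apply inside each li.search-result:

def parse(self, response):
    for student in response.css('li.search-result'):
        yield {
            'name': student.css('div.title a::text').extract_first(),
            'city': student.css('div.clg-state a::text').extract_first(),
        }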
There is an example in the Scrapy documentation (Release 1.0.3) where, in the 7th row, the urljoin method is used when the link is relative. When the link is absolute, what should I do?
example code:
import scrapy


class StackOverflowSpider(scrapy.Spider):
    name = 'stackoverflow'
    start_urls = ['http://stackoverflow.com/questions?sort=votes']

    def parse(self, response):
        for href in response.css('.question-summary h3 a::attr(href)'):
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_question)

    def parse_question(self, response):
        yield {
            'title': response.css('h1 a::text').extract()[0],
            'votes': response.css('.question .vote-count-post::text').extract()[0],
            'body': response.css('.question .post-text').extract()[0],
            'tags': response.css('.question .post-tag::text').extract(),
            'link': response.url,
        }
You don't need to worry about it; urljoin() handles both cases properly:
In [1]: response.urljoin("http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery")
Out[1]: 'http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery'
In [2]: response.urljoin("/questions/426258/checking-a-checkbox-with-jquery")
Out[2]: 'http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery'