Scrapy & ASPX site - fails to pull pages beyond 11 - 302 errors - python

I am trying to pull data from this site: https://inform.alabama.gov/employeesearch.aspx. The current spider below works up to and including page 11. I think the issue lies with the "__VIEWSTATE" form field: it doesn't appear to change with each request, even though it should reflect the page of the current response so the server knows how to interpret the subsequent request. My requests seem to keep resending the value from the first response, so I believe the server rejects them because page 12 is not reachable from pages 1-10. If you look at the pagination, it goes from 1 to "...", where the "..." renders page 11. When page 11 is rendered, the pagination changes to 11 to "...", where the "..." renders page 21.
Note that num_pages defines the total page count. Currently set to 15, it processes pages 1-11 and returns 302 errors for the other pages.
How should this be modified to yield the results for all 661 pages?
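For reference, a quick way to test that theory is to log the hidden ASP.NET fields from every paginated response and check whether they actually change. A minimal sketch (the field names match the spider below; the helper method itself is hypothetical):

# Hypothetical debugging helper: log a prefix of each hidden ASP.NET field so
# you can see whether __VIEWSTATE / __EVENTVALIDATION differ between pages.
def log_hidden_fields(self, response):
    for name in ('__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION'):
        value = response.css('input#%s::attr(value)' % name).get() or ''
        self.logger.info('%s starts with %r', name, value[:40])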
from scrapy import FormRequest, Spider
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError


class EmployeesSpider(Spider):
    name = 'employees'
    start_urls = ['https://inform.alabama.gov/employeesearch.aspx']
    num_pages = 15  # 661
    name_excludes = ['', ' ', '1']

    def parse(self, response):
        formdata = self.get_formdata(response, 0)
        formdata['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$btn_Search'
        formdata['__EVENTARGUMENT'] = ''
        yield FormRequest(
            url='https://inform.alabama.gov/employeesearch.aspx',
            method="POST",
            dont_filter=True,
            formdata=formdata,
            callback=self.perform_search,
            errback=self.failure)

    def perform_search(self, response):
        for employee in response.xpath('//*[@id="ContentPlaceHolder1_GridView1"]//tr'):
            emp_name = employee.xpath('.//td[1]//text()').get()
            if emp_name is not None and emp_name not in self.name_excludes:
                final_name = emp_name.strip()
                yield {
                    'name': final_name,
                    'email': employee.xpath('.//td[1]//span//a//text()').get(),
                    'org': employee.xpath('.//td[2]//text()').get(),
                    'phone': employee.xpath('.//td[3]//span//a//text()').get(),
                }
        # Download search pages starting from #2
        for i in range(2, self.num_pages):
            formdata = self.get_formdata(response, i)
            yield FormRequest(
                url='https://inform.alabama.gov/employeesearch.aspx',
                method="POST",
                dont_filter=True,
                formdata=formdata,
                callback=self.parse_results,
                errback=self.failure)

    def get_formdata(self, response, page_num):
        eventargument = 'Page$' + str(page_num)
        viewstate = response.css(
            'input#__VIEWSTATE::attr(value)').get()
        if viewstate is None:
            viewstate = ''
        viewstategen = response.css(
            'input#__VIEWSTATEGENERATOR::attr(value)').get()
        if viewstategen is None:
            viewstategen = ''
        eventvalidation = response.css(
            'input#__EVENTVALIDATION::attr(value)').get()
        if eventvalidation is None:
            eventvalidation = ''
        formdata = {
            '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$GridView1',
            '__EVENTARGUMENT': eventargument,
            '__VIEWSTATE': viewstate,
            '__VIEWSTATEGENERATOR': viewstategen,
            '__EVENTVALIDATION': eventvalidation,
            'ctl00%24ContentPlaceHolder1%24txt_FirstName': '',
            'ctl00%24ContentPlaceHolder1%24txt_LastName': '',
            'ctl00%24ContentPlaceHolder1%24ddl_Agency': 'Not+Selected',
            'ctl00%24ContentPlaceHolder1%24txt_Phone': '',
        }
        return formdata

    def parse_results(self, response):
        for employee in response.xpath('//*[@id="ContentPlaceHolder1_GridView1"]//tr'):
            emp_name = employee.xpath('.//td[1]//text()').get()
            if emp_name is not None and emp_name not in self.name_excludes:
                final_name = emp_name.strip()
                yield {
                    'name': final_name,
                    'email': employee.xpath('.//td[1]//span//a//text()').get(),
                    'org': employee.xpath('.//td[2]//text()').get(),
                    'phone': employee.xpath('.//td[3]//span//a//text()').get(),
                }

    def failure(self, failure):
        # log all failures
        self.logger.error(repr(failure))
        # in case you want to do something special for some errors,
        # you may need the failure's type:
        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)

First, you need to get the total number of pages: press Search, go to the last page, and save 661 to num_pages.
Then start from the beginning, now knowing how many pages you have:
from scrapy import FormRequest, Spider


class EmployeesSpider(Spider):
    name = 'employees'
    start_urls = ['https://inform.alabama.gov/employeesearch.aspx']
    num_pages = None
    name_excludes = ['', ' ', '1', None]

    def parse(self, response):
        yield FormRequest.from_response(
            response,
            dont_filter=True,
            formdata={
                '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$btn_Search',
                '__EVENTARGUMENT': ''
            },
            callback=self.perform_search
        )

    def perform_search(self, response):
        if not self.num_pages:
            # first get the total number of pages
            yield FormRequest.from_response(
                response,
                dont_filter=True,
                formdata={
                    '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$GridView1',
                    '__EVENTARGUMENT': 'Page$Last'
                },
                callback=self.parse_total_pages
            )
        else:
            for employee in response.xpath(
                '//table[@id="ContentPlaceHolder1_GridView1"]//'
                'tr[position() < last()]'
            ):
                emp_name = employee.xpath('.//td[1]//text()').get()
                if emp_name not in self.name_excludes:
                    final_name = emp_name.strip()
                    yield {
                        'name': final_name,
                        'email': employee.xpath('.//td[1]//span//a//text()').get(),
                        'org': employee.xpath('.//td[2]//text()').get(),
                        'phone': employee.xpath('.//td[3]//span//a//text()').get(),
                    }
            # go to the next page
            current_page = response.xpath(
                '//tr[@class="employeeSearchPagerStye"]/td//span/text()'
            ).get()
            current_page = int(current_page)
            if current_page < self.num_pages:
                yield FormRequest.from_response(
                    response,
                    dont_filter=True,
                    formdata={
                        '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$GridView1',
                        '__EVENTARGUMENT': f'Page${current_page + 1}'
                    },
                    callback=self.perform_search
                )

    def parse_total_pages(self, response):
        total_pages = response.xpath(
            '//tr[@class="employeeSearchPagerStye"]/td//span/text()'
        ).get()
        self.num_pages = int(total_pages)
        # back to search
        yield FormRequest.from_response(
            response,
            dont_filter=True,
            formdata={
                '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$btn_Search',
                '__EVENTARGUMENT': ''
            },
            callback=self.perform_search
        )
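Why this works (a note based on documented FormRequest.from_response behaviour, not something stated in the original answer): from_response pre-fills the POST body from the <form> element of the current response, so the hidden ASP.NET fields are always the ones the server just issued for that page rather than the page-1 values. Conceptually, each pagination request it builds looks roughly like this (values are placeholders, not real data):

# Illustrative only: roughly what FormRequest.from_response assembles per page.
formdata = {
    '__VIEWSTATE': '<taken from the current page>',            # fresh value, not the page-1 one
    '__VIEWSTATEGENERATOR': '<taken from the current page>',
    '__EVENTVALIDATION': '<taken from the current page>',
    '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$GridView1',    # overridden via the formdata argument
    '__EVENTARGUMENT': 'Page$12',                              # overridden via the formdata argument
    # ...plus the page's other form fields with their current values
}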

Related

Scrapy: How to get the "requested URL" and the "Redirected URL" using scrapy?

I want to get both the "requested URL" and "Redirected URL" using Scrapy. My code is the following.
def parse(self, response):
    if response.request.meta.get('redirect_urls'):
        yield {
            'URL': response.request.meta.get('redirect_urls')[0],
            'Redireted URL': response.url,
            'Status': response.status
        }
    else:
        yield {
            'URL': response.url,
            'Redireted URL': response.url,
            'Status': response.status
        }
But I'm only getting the redirected URL.
Update
I previously solved this issue by using response.request.url and it worked fine at the time. But now I have found that it only works properly if I export the output to a JSON file or to the terminal. With CSV it does not get both the Redirected URL and the Requested URL. My updated code is given below.
Script
import scrapy
import pandas as pd
from twisted.internet.error import *


class CheckerSpider(scrapy.Spider):
    name = 'checker'

    def read_xl(df):
        df = pd.read_excel('url_data.xlsx')
        return df['Link'].tolist()

    def start_requests(self):
        for value in self.read_xl():
            yield scrapy.Request(
                url=value,
                # callback=self.parse,
                errback=self.parse_error,
                dont_filter=True
            )
        return super().start_requests()

    def parse_error(self, failure):
        if failure.check(DNSLookupError):
            request = failure.request
            yield {
                'URL': request.url,
                'Status': failure.value
            }
        elif failure.check(MulticastJoinError):
            request = failure.request
            yield {
                'URL': request.url,
                'Status': failure.value
            }

    def parse(self, response):
        if response.request.meta.get('redirect_urls'):
            yield {
                'Redireted URL': response.request.url,
                'Requested URL': response.request.meta['redirect_urls'][0],
                'Status': response.status
            }
        else:
            yield {
                'Redireted URL': response.request.url,
                'Requested URL': response.request.url,
                'Status': response.status
            }
Terminal output (screenshot omitted)
JSON output (screenshot omitted)
CSV output (screenshot omitted) - not getting all fields
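One likely explanation (an assumption on my part, not something stated in the question): Scrapy's CSV exporter takes its column headers from the first item it sees, so spiders that yield items with different key sets can end up with missing columns. Declaring the columns explicitly with the documented FEED_EXPORT_FIELDS setting avoids that; a minimal sketch:

# Sketch: pin the CSV columns so every exported row has the same fields,
# regardless of which item happens to be yielded first.
import scrapy

class CheckerSpider(scrapy.Spider):
    name = 'checker'
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['URL', 'Requested URL', 'Redireted URL', 'Status'],
    }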

Wait for Scrapy callback function

I am new to Scrapy and to Python in general.
Here is the code:
import scrapy
import json


class MOOCSpider(scrapy.Spider):
    name = 'mooc'
    start_urls = ['https://www.plurk.com/search?q=italy']
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }
    global_id = 1458122036

    def parse(self, response):
        url = 'https://www.plurk.com/Search/search2'
        headers = {
            # ...omitted...
        }
        for i in range(1, 10):
            formdata = {
                "after_id": str(self.global_id)
            }
            yield scrapy.FormRequest(url, callback=self.parse_api, formdata=formdata, headers=headers)

    def parse_api(self, response):
        raw = response.body
        data = json.loads(raw)
        posts = data["plurks"]
        users = data["users"]
        l = len(posts)
        i = 0
        for post in posts:
            i = i + 1
            if (i == l):
                self.global_id = post["plurk_id"]
            # ...omitted code...
            yield {
                'Author': user_name,
                'Body': post['content'],
                'app': 'plurk'
            }
The problem I have is that Scrapy first makes all the requests in the for loop and only then executes the code in parse_api.
What I would like is for Scrapy to do one iteration of the for loop, call the callback function, wait for it to return, and then do another iteration.
This is because the id that I need for the next request is set in the global_id variable by the callback function.
You can't achieve this by scheduling requests in a loop.
You can implement it only if you schedule a single (next) request per parse/parse_api method call:
import json
import scrapy


class MOOCSpider(scrapy.Spider):
    name = 'mooc'
    start_urls = ['https://www.plurk.com/search?q=italy']
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
        'DOWNLOAD_DELAY': 5,
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36",
    }

    def parse(self, response):
        # schedule only the first request (without a loop)
        formdata = {
            "query": 'italy',
            "start_date": "2019/12",
            "end_date": "2020/12",
            "after_id": '1458122036',  # <- your initial global_id
        }
        yield scrapy.FormRequest('https://www.plurk.com/Search/search2', callback=self.parse_api, formdata=formdata)

    def parse_api(self, response):
        data = json.loads(response.body)
        after_id = None
        for post in data["plurks"]:
            after_id = post["plurk_id"]
            yield {
                'Author': data["users"][str(post["owner_id"])]["nick_name"],  # instead of user_id?
                'Body': post["content"],
                'app': 'plurk'
            }
        # after the end of this loop, after_id contains the data required for the next request
        # instead of a separate loop variable, response.meta["depth"] is used to limit the number of requests
        if response.meta["depth"] <= 11 and after_id:  # schedule next request
            formdata = {
                "query": 'italy',
                "start_date": "2019/12",
                "end_date": "2020/12",
                "after_id": str(after_id),
            }
            yield scrapy.FormRequest('https://www.plurk.com/Search/search2', callback=self.parse_api, formdata=formdata)
Answering my own question:
Now the parse method makes just one request and calls the parse_api method once. parse_api processes the response and sets the global_id variable. Once it's done processing its own response, it makes another request, passing itself as the callback function.
By doing this you are guaranteed that the global_id variable will be properly set, since the new request is made only once parse_api has finished running.
request.cb_kwargs["loop_l"] is used to pass an additional argument to the callback function. This time it's a counter that controls the number of requests we want to make. When the counter reaches the limit checked in parse_api (200 in the code below), we stop the crawl.
import scrapy
import json

plurk_id = []


class MOOCSpider(scrapy.Spider):
    name = 'mooc'
    start_urls = ['https://www.plurk.com/search?q=']
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }
    global_id = 1455890167
    url = 'https://www.plurk.com/Search/search2'
    headers = {
        # ...OMITTED...
    }

    def parse(self, response):
        formdata = {
            "after_id": str(self.global_id)
        }
        request = scrapy.FormRequest(self.url, callback=self.parse_api, formdata=formdata, headers=self.headers)
        request.cb_kwargs["loop_l"] = str(0)
        yield request

    def parse_api(self, response, loop_l):
        int_loop_l = int(loop_l)
        int_loop_l = int_loop_l + 1
        if (int_loop_l == 200):
            return
        raw = response.body
        data = json.loads(raw)
        # ...omitted code...
        # ... GET AND SET THE NEW global_id FROM THE RESPONSE ...

        # make another request with the new id
        formdata = {
            "after_id": str(self.global_id)
        }
        request = scrapy.FormRequest(self.url, callback=self.parse_api, formdata=formdata, headers=self.headers)
        request.cb_kwargs["loop_l"] = str(int_loop_l)
        yield request
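A small side note (a stylistic suggestion of mine, not part of the original answer): cb_kwargs can also be passed directly to the FormRequest constructor instead of being assigned on the request afterwards, which reads a little more compactly:

# Equivalent to building the request and then setting request.cb_kwargs["loop_l"]:
yield scrapy.FormRequest(
    self.url,
    callback=self.parse_api,
    formdata=formdata,
    headers=self.headers,
    cb_kwargs={"loop_l": str(int_loop_l)},  # counter passed at construction time
)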

Scraping Infinite scroll page

I am trying to scrape an infinite-scroll AJAX page but I am not able to go to the next page and yield the items. I am able to get response.text. I tried debugging but could not find a solution. Can anyone help me out with this?
import json

import scrapy


class InfiniteScrollingSpider(scrapy.Spider):
    name = 'wegotthiscovered_review'
    scrolling_url = 'https://wegotthiscovered.com/wp-admin/admin-ajax.php'

    def start_requests(self):
        yield scrapy.FormRequest(
            self.scrolling_url,
            formdata={
                'action': "face3_infinite_scroll",
                'page': '1',
                'attrs': "{\"id\":\"1\",\"order\":\"\",\"orderby\":\"\",\"catnames\":\"movies+reviews\",\"postnotin\":\"905069,904520,904521,903475,901576,900303,893944,895136,891795,886876,884402,881283\",\"timestampbefore\":1591800990}"
            },
            callback=self.parse_page,
            meta={'page': 1},
        )

    def parse_page(self, response):
        next_page = response.meta.get('page') + 1
        print('next_page:', next_page)
        print(response.text)
        json_data = json.loads(response.text)
        print(json_data.keys())
        print('success:', json_data.get('success'))
        print('data:', json_data.get('data'))
        if not json_data.get('success') or not json_data.get('data') or not json_data['data'].get('content'):
            return
        articles = scrapy.Selector(text=json_data['data']['content']).css('article')
        for article in articles:
            yield {
                'page_title': article.css('h4 ::text').extract_first().strip(),
                'review_link': article.css('h4 ::attr(href)').extract_first().strip(),
            }
        print('next page >>>')
        yield scrapy.FormRequest(
            self.scrolling_url,
            formdata={
                'action': "face3_infinite_scroll",
                'page': str(next_page),
                "query_args": "{\"archive_type\":\"masonry\",\"show_first\":false,\"columns\":2,\"meta_cat\":false,\"meta\":true,\"summary\":true,\"standard_summary\":\"excerpt\",\"more_button\":false,\"reduce_margin\":false,\"orientation\":\"landscape\",\"list_width\":\"6\",\"widgets\":false,\"widgets_sidebar\":\"sidebar-archive\",\"widgets_after\":3,\"widgets_repeat\":false,\"highlight\":\"featured\",\"pagination_type\":\"ajax\",\"infinite_load\":true}"
            },
            callback=self.parse_page,
            meta={'page': next_page},
        )

How to scrape data on website if using Javascript with pagination

I have a website that I need to scrape data from:
"https://www.forever21.com/us/shop/catalog/category/f21/sale#pageno=1&pageSize=120&filter=price:0,250&sort=5" but I cannot retrieve all the data; the listing is paginated and uses JavaScript as well.
Any idea how I can scrape all the items? Here's my code:
def parse_2(self, response):
    for product_item_forever in response.css('div.pi_container'):
        item = GpdealsSpiderItem_f21()
        f21_title = product_item_forever.css('p.p_name::text').extract_first()
        f21_regular_price = product_item_forever.css('span.p_old_price::text').extract_first()
        f21_sale_price = product_item_forever.css('span.p_sale.t_pink::text').extract_first()
        f21_photo_url = product_item_forever.css('img::attr(data-original)').extract_first()
        f21_description_url = product_item_forever.css('a.item_slider.product_link::attr(href)').extract_first()
        item['f21_title'] = f21_title
        item['f21_regular_price'] = f21_regular_price
        item['f21_sale_price'] = f21_sale_price
        item['f21_photo_url'] = f21_photo_url
        item['f21_description_url'] = f21_description_url
        yield item
Please help. Thank you.
One of the first steps in a web scraping project should be to look for an API that the website uses to get its data. Not only does this save you from parsing HTML; using the API also saves the provider's bandwidth and server load. To look for an API, use your browser's developer tools and look for XHR requests in the network tab. In your case, the website makes POST requests to this URL:
https://www.forever21.com/eu/shop/Catalog/GetProducts
You can then simulate the XHR request in Scrapy to get the data in JSON format. Here's the code for the spider:
# -*- coding: utf-8 -*-
import json

import scrapy


class Forever21Spider(scrapy.Spider):
    name = 'forever21'

    url = 'https://www.forever21.com/eu/shop/Catalog/GetProducts'
    payload = {
        'brand': 'f21',
        'category': 'sale',
        'page': {'pageSize': 60},
        'filter': {
            'price': {'minPrice': 0, 'maxPrice': 250}
        },
        'sort': {'sortType': '5'}
    }

    def start_requests(self):
        # scrape the first page
        payload = self.payload.copy()
        payload['page']['pageNo'] = 1
        yield scrapy.Request(
            self.url, method='POST', body=json.dumps(payload),
            headers={'X-Requested-With': 'XMLHttpRequest',
                     'Content-Type': 'application/json; charset=UTF-8'},
            callback=self.parse, meta={'pageNo': 1}
        )

    def parse(self, response):
        # parse the JSON response and extract the data
        data = json.loads(response.text)
        for product in data['CatalogProducts']:
            item = {
                'title': product['DisplayName'],
                'regular_price': product['OriginalPrice'],
                'sale_price': product['ListPrice'],
                'photo_url': 'https://www.forever21.com/images/default_330/%s' % product['ImageFilename'],
                'description_url': product['ProductShareLinkUrl']
            }
            yield item

        # simulate pagination if we are not at the end
        if len(data['CatalogProducts']) == self.payload['page']['pageSize']:
            payload = self.payload.copy()
            payload['page']['pageNo'] = response.meta['pageNo'] + 1
            yield scrapy.Request(
                self.url, method='POST', body=json.dumps(payload),
                headers={'X-Requested-With': 'XMLHttpRequest',
                         'Content-Type': 'application/json; charset=UTF-8'},
                callback=self.parse, meta={'pageNo': payload['page']['pageNo']}
            )

Scrapy not loading entire page? Or I have bad code...

I am having issues with pagination in the following code.
The spider starts but does not find any links on the first page. This is because the page actually returns a partial result... I know it sounds odd, but it's true: when I visit the page I see jobs listed, but when the bot visits, there are no jobs listed.
From what I understand, Scrapy will load the entire page regardless of JS or AJAX, but I am starting to wonder...
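One way to check that suspicion (an editorial aside, not part of the original question): Scrapy does not execute JavaScript, so the quickest sanity check is to open exactly what the spider downloaded in a real browser from a scrapy shell <url> session:

# Inside `scrapy shell <url>`: open the body Scrapy actually received.
# If the job listings are missing there, they are injected client-side by JS/AJAX.
from scrapy.utils.response import open_in_browser
open_in_browser(response)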
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.http.request import Request
from northrop.items import NorthropItem
from scrapy.http import HtmlResponse
from scrapy.exceptions import CloseSpider
import re


class NorthropSpider(CrawlSpider):
    name = "northropJobStart"
    start_urls = ['https://ngc.taleo.net/careersection/ngc_pro/jobsearch.ftl?lang=en#']
    allowed_domains = ["ngc.taleo.net"]

    rules = (
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="next"]/a',)), callback="parse_listings", follow=True),
    )

    def parse_start_url(self, response):
        return self.parse_listings(response)

    def parse_listings(self, response):
        sel = Selector(response)
        # There are no jobs listed.. I am lost.....
        jobs = sel.xpath('//th/div/div/span/a/@href').extract()
        for job_url in jobs:
            job_url = self.__normalise(job_url)
            job_url = self.__to_absolute_url(response.url, job_url)
            yield Request(job_url, callback=self.parse_details)

    def parse_details(self, response):
        sel = Selector(response)
        job = sel.xpath('//*[@id="mainbody-jobs"]')
        item = NorthropItem()
        # Populate job fields
        item['title'] = job.xpath('//*[@id="mainbody-jobs"]/h1/text()').extract()
        item['location'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[3]/div[2]/text()').extract()
        item['applink'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[1]/a/@href').extract()
        item['description'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[2]/div[1]/div[2]').extract()
        item['travel'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[5]/div[2]/text()').extract()
        item['job_category'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[2]/div[2]/text()').extract()
        item['clearance_have'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[8]/div[2]/text()').extract()
        item['clearance_get'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[8]/div[2]/text()').extract()
        item['job_number'] = job.xpath('//*[@id="mainbody-jobs"]/div[3]/div[2]/div[1]/div/div[1]/div[2]/text()').extract()
        item['page_url'] = response.url
        item = self.__normalise_item(item, response.url)
        return item

    def __normalise_item(self, item, base_url):
        '''
        Standardise and format item fields
        '''
        # Loop item fields to sanitise data and standardise data types
        for key, value in vars(item).values()[0].iteritems():
            item[key] = self.__normalise(item[key])
        # Convert job URL from relative to absolute URL
        # item['job_url'] = self.__to_absolute_url(base_url, item['job_url'])
        return item

    def __normalise(self, value):
        # Convert list to string
        value = value if type(value) is not list else ' '.join(value)
        # Trim leading and trailing special characters (whitespaces, newlines, spaces, tabs, carriage returns)
        value = value.strip()
        return value

    def __to_absolute_url(self, base_url, link):
        '''
        Convert relative URL to absolute URL
        '''
        import urlparse
        link = urlparse.urljoin(base_url, link)
        return link

    def __to_int(self, value):
        '''
        Convert value to integer type
        '''
        try:
            value = int(value)
        except ValueError:
            value = 0
        return value

    def __to_float(self, value):
        '''
        Convert value to float type
        '''
        try:
            value = float(value)
        except ValueError:
            value = 0.0
        return value
Unfortunately the search form is hidden quite deep, but you can see it in the network tab of your browser's developer tools.
It turns out the page sends a full JSON document of default search parameters, so you pretty much need to copy and paste it, only incrementing pageNo. I couldn't help but solve it, and before I knew it I had written a whole spider, so here it is; let me know if some parts are unclear:
import json

import scrapy


class TaleoSpider(scrapy.Spider):
    name = 'taleo'
    start_urls = ['https://ngc.taleo.net/careersection/ngc_pro/jobsearch.ftl?lang=en#']

    # baseform with base search values
    base_form = {
        'advancedSearchFiltersSelectionParam': {
            'searchFilterSelections': [
                {'id': 'ORGANIZATION', 'selectedValues': []},
                {'id': 'LOCATION', 'selectedValues': []},
                {'id': 'JOB_FIELD', 'selectedValues': []},
                {'id': 'URGENT_JOB', 'selectedValues': []},
                {'id': 'EMPLOYEE_STATUS', 'selectedValues': []},
                {'id': 'STUDY_LEVEL', 'selectedValues': []},
                {'id': 'WILL_TRAVEL', 'selectedValues': []},
                {'id': 'JOB_SHIFT', 'selectedValues': []},
                {'id': 'JOB_NUMBER', 'selectedValues': []},
            ]
        },
        'fieldData': {
            'fields': {'JOB_TITLE': '', 'KEYWORD': '', 'LOCATION': ''},
            'valid': True,
        },
        'filterSelectionParam': {
            'searchFilterSelections': [
                {'id': 'POSTING_DATE', 'selectedValues': []},
                {'id': 'LOCATION', 'selectedValues': []},
                {'id': 'JOB_FIELD', 'selectedValues': []},
                {'id': 'JOB_TYPE', 'selectedValues': []},
                {'id': 'JOB_SCHEDULE', 'selectedValues': []},
                {'id': 'JOB_LEVEL', 'selectedValues': []},
            ]
        },
        'multilineEnabled': False,
        'pageNo': 1,  # <--- change this for pagination
        'sortingSelection': {
            'ascendingSortingOrder': 'false',
            'sortBySelectionParam': '3',
        },
    }

    def parse(self, response):
        # we got cookies from first start url now lets request into the search api
        # copy base form for the first request
        form = self.base_form.copy()
        yield scrapy.Request(
            'https://ngc.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=2160420105',
            body=json.dumps(self.base_form),
            # add headers to indicate we are sending a json package
            headers={'Content-Type': 'application/json',
                     'X-Requested-With': 'XMLHttpRequest'},
            # scrapy.Request defaults to 'GET', but we want 'POST' here
            method='POST',
            # load our form into meta so we can reuse it later
            meta={'form': form},
            callback=self.parse_items)

    def parse_items(self, response):
        data = json.loads(response.body)
        # scrape data
        for item in data['requisitionList']:
            yield item
        # next page
        # get our form back and update the page number in it
        form = response.meta['form']
        form['pageNo'] += 1
        # check if paging is over, is our next page higher than maximum page?
        max_page = data['pagingData']['totalCount'] / data['pagingData']['pageSize']
        if form['pageNo'] > max_page:
            return
        yield scrapy.Request(
            'https://ngc.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=2160420105',
            body=json.dumps(form),
            headers={'Content-Type': 'application/json',
                     'X-Requested-With': 'XMLHttpRequest'},
            method='POST',
            meta={'form': form},
            callback=self.parse_items)
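One small caveat (my observation, not from the original answer): when totalCount is not an exact multiple of pageSize, the max_page check can stop one page early and skip the final, partially filled page. A ceiling-style division is a safer stop condition:

# Ceiling division: request the last page even when it is only partially filled.
paging = data['pagingData']
max_page = -(-paging['totalCount'] // paging['pageSize'])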
