Scraping an infinite scroll page - Python

I am trying to scrape an infinite-scroll AJAX page, but I am not able to move on to the next page and yield the items. I am able to get response.text. I tried debugging but could not find a solution. Can anyone help me out with this?
import json

import scrapy


class InfiniteScrollingSpider(scrapy.Spider):
    name = 'wegotthiscovered_review'
    scrolling_url = 'https://wegotthiscovered.com/wp-admin/admin-ajax.php'

    def start_requests(self):
        yield scrapy.FormRequest(
            self.scrolling_url,
            formdata={
                'action': "face3_infinite_scroll",
                'page': '1',
                'attrs': "{\"id\":\"1\",\"order\":\"\",\"orderby\":\"\",\"catnames\":\"movies+reviews\",\"postnotin\":\"905069,904520,904521,903475,901576,900303,893944,895136,891795,886876,884402,881283\",\"timestampbefore\":1591800990}"
            },
            callback=self.parse_page,
            meta={'page': 1},
        )

    def parse_page(self, response):
        next_page = response.meta.get('page') + 1
        print('next_page:', next_page)
        print(response.text)
        json_data = json.loads(response.text)
        print(json_data.keys())
        print('success:', json_data.get('success'))
        print('data:', json_data.get('data'))
        if not json_data.get('success') or not json_data.get('data') or not json_data['data'].get('content'):
            return
        articles = scrapy.Selector(text=json_data['data']['content']).css('article')
        for article in articles:
            yield {
                'page_title': article.css('h4 ::text').extract_first().strip(),
                'review_link': article.css('h4 ::attr(href)').extract_first().strip(),
            }
        print('next page >>>')
        yield scrapy.FormRequest(
            self.scrolling_url,
            formdata={
                'action': "face3_infinite_scroll",
                'page': str(next_page),
                "query_args": "{\"archive_type\":\"masonry\",\"show_first\":false,\"columns\":2,\"meta_cat\":false,\"meta\":true,\"summary\":true,\"standard_summary\":\"excerpt\",\"more_button\":false,\"reduce_margin\":false,\"orientation\":\"landscape\",\"list_width\":\"6\",\"widgets\":false,\"widgets_sidebar\":\"sidebar-archive\",\"widgets_after\":3,\"widgets_repeat\":false,\"highlight\":\"featured\",\"pagination_type\":\"ajax\",\"infinite_load\":true}"
            },
            callback=self.parse_page,
            meta={'page': next_page},
        )
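One thing that stands out in the spider above: the first request sends an 'attrs' payload, but the follow-up request replaces it with 'query_args', so the endpoint may simply return an empty result for page 2 even though the first response parses fine. A minimal sketch of a follow-up request added to the spider above that keeps the original payload and only bumps the page number — assuming (untested here) that the endpoint accepts the same 'attrs' string for every page:

    # Hypothetical helper: keep the 'attrs' JSON string in one class attribute
    # (self.attrs) and resend it for every page, changing only 'page'.
    def request_page(self, page):
        return scrapy.FormRequest(
            self.scrolling_url,
            formdata={
                'action': 'face3_infinite_scroll',
                'page': str(page),
                'attrs': self.attrs,  # the same string used in start_requests
            },
            callback=self.parse_page,
            meta={'page': page},
        )

parse_page could then end with yield self.request_page(next_page) instead of rebuilding the formdata by hand.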

Related

Scrapy: How to get the "requested URL" and the "redirected URL"?

I want to get both the "requested URL" and "Redirected URL" using Scrapy. My code is the following.
def parse(self, response):
    if response.request.meta.get('redirect_urls'):
        yield {
            'URL': response.request.meta.get('redirect_urls')[0],
            'Redirected URL': response.url,
            'Status': response.status
        }
    else:
        yield {
            'URL': response.url,
            'Redirected URL': response.url,
            'Status': response.status
        }
But I'm only getting the redirected URL.
Update
I previously solved this issue by using response.request.url and it worked fine at that time. But now I've found that it only works properly when I export the output to a JSON file or print it in the terminal. With CSV it is not getting both the Redirected URL and the Requested URL. My updated code is given below.
Script
import scrapy
import pandas as pd
from twisted.internet.error import *


class CheckerSpider(scrapy.Spider):
    name = 'checker'

    def read_xl(self):
        df = pd.read_excel('url_data.xlsx')
        return df['Link'].tolist()

    def start_requests(self):
        for value in self.read_xl():
            yield scrapy.Request(
                url=value,
                # callback=self.parse,
                errback=self.parse_error,
                dont_filter=True
            )

    def parse_error(self, failure):
        if failure.check(DNSLookupError):
            request = failure.request
            yield {
                'URL': request.url,
                'Status': failure.value
            }
        elif failure.check(MulticastJoinError):
            request = failure.request
            yield {
                'URL': request.url,
                'Status': failure.value
            }

    def parse(self, response):
        if response.request.meta.get('redirect_urls'):
            yield {
                'Redirected URL': response.request.url,
                'Requested URL': response.request.meta['redirect_urls'][0],
                'Status': response.status
            }
        else:
            yield {
                'Redirected URL': response.request.url,
                'Requested URL': response.request.url,
                'Status': response.status
            }
Screenshots of the terminal output, JSON output, and CSV output were attached in the original post; the CSV output is not getting all the fields.
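A common cause of that CSV symptom is that Scrapy's CSV exporter takes its column headers from the first item it sees, and items yielded later with different keys (here, the error callback yields only 'URL' and 'Status') do not add new columns. A minimal sketch of a fix under that assumption: yield the same keys from every callback and pin the column order with the FEED_EXPORT_FIELDS setting:

import scrapy


class CheckerSpider(scrapy.Spider):
    name = 'checker'
    # fix the CSV header regardless of which item happens to be exported first
    custom_settings = {
        'FEED_EXPORT_FIELDS': ['Requested URL', 'Redirected URL', 'Status'],
    }

    def parse_error(self, failure):
        # yield the same keys as parse() so every row has all three columns
        yield {
            'Requested URL': failure.request.url,
            'Redirected URL': '',
            'Status': repr(failure.value),
        }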

Scrapy & ASPX site - fails to pull pages beyond 11 - 302 errors

I am trying to pull data from this site: https://inform.alabama.gov/employeesearch.aspx. The current query that I have below works up to and including page 11. I think the issue resides with the "__VIEWSTATE" form element. It doesn't appear to change with each request. It should represent the current response page in the loop so the server knows how to interpret the subsequent response. It seems to only render the value present on the first response, so I believe that the server is rejecting it because page 12 is not a valid pathway from pages 1-10. If you take a look at the pagination it goes from 1 to ..., where the ... renders page 11. When page 11 is rendered, it changes the pagination to: 11 to ..., where the ... renders page 21.
Note that num_pages defines the total page count. Currently set to 15, it processes pages 1-11 and returns 302 errors for the other pages.
How should this be modified to yield the results for all 661 pages?
from scrapy import FormRequest, Spider
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError


class EmployeesSpider(Spider):
    name = 'employees'
    start_urls = ['https://inform.alabama.gov/employeesearch.aspx']
    num_pages = 15  # 661
    name_excludes = ['', ' ', '1']

    def parse(self, response):
        formdata = self.get_formdata(response, 0)
        formdata['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$btn_Search'
        formdata['__EVENTARGUMENT'] = ''
        yield FormRequest(
            url='https://inform.alabama.gov/employeesearch.aspx',
            method="POST",
            dont_filter=True,
            formdata=formdata,
            callback=self.perform_search,
            errback=self.failure)

    def perform_search(self, response):
        for employee in response.xpath('//*[@id="ContentPlaceHolder1_GridView1"]//tr'):
            emp_name = employee.xpath('.//td[1]//text()').get()
            if emp_name is not None and emp_name not in self.name_excludes:
                final_name = emp_name.strip()
                yield {
                    'name': final_name,
                    'email': employee.xpath('.//td[1]//span//a//text()').get(),
                    'org': employee.xpath('.//td[2]//text()').get(),
                    'phone': employee.xpath('.//td[3]//span//a//text()').get(),
                }
        # Download search pages starting from #2
        for i in range(2, self.num_pages):
            formdata = self.get_formdata(response, i)
            yield FormRequest(
                url='https://inform.alabama.gov/employeesearch.aspx',
                method="POST",
                dont_filter=True,
                formdata=formdata,
                callback=self.parse_results,
                errback=self.failure)

    def get_formdata(self, response, page_num):
        eventargument = 'Page$' + str(page_num)
        viewstate = response.css(
            'input#__VIEWSTATE::attr(value)').get()
        if viewstate is None:
            viewstate = ''
        viewstategen = response.css(
            'input#__VIEWSTATEGENERATOR::attr(value)').get()
        if viewstategen is None:
            viewstategen = ''
        eventvalidation = response.css(
            'input#__EVENTVALIDATION::attr(value)').get()
        if eventvalidation is None:
            eventvalidation = ''
        formdata = {
            '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$GridView1',
            '__EVENTARGUMENT': eventargument,
            '__VIEWSTATE': viewstate,
            '__VIEWSTATEGENERATOR': viewstategen,
            '__EVENTVALIDATION': eventvalidation,
            'ctl00%24ContentPlaceHolder1%24txt_FirstName': '',
            'ctl00%24ContentPlaceHolder1%24txt_LastName': '',
            'ctl00%24ContentPlaceHolder1%24ddl_Agency': 'Not+Selected',
            'ctl00%24ContentPlaceHolder1%24txt_Phone': '',
        }
        return formdata

    def parse_results(self, response):
        for employee in response.xpath('//*[@id="ContentPlaceHolder1_GridView1"]//tr'):
            emp_name = employee.xpath('.//td[1]//text()').get()
            if emp_name is not None and emp_name not in self.name_excludes:
                final_name = emp_name.strip()
                yield {
                    'name': final_name,
                    'email': employee.xpath('.//td[1]//span//a//text()').get(),
                    'org': employee.xpath('.//td[2]//text()').get(),
                    'phone': employee.xpath('.//td[3]//span//a//text()').get(),
                }

    def failure(self, failure):
        # log all failures
        self.logger.error(repr(failure))
        # in case you want to do something special for some errors,
        # you may need the failure's type:
        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)
First, you need to get the total number of pages: press Search, jump to the last page, and save 661 into num_pages.
Then start from the beginning, now knowing how many pages there are. Note that every follow-up request is built with FormRequest.from_response from the page just received, so the current __VIEWSTATE and __EVENTVALIDATION values are always posted back — which is why this works where the original loop, reusing the first page's values, gets 302 responses:
from scrapy import FormRequest, Spider


class EmployeesSpider(Spider):
    name = 'employees'
    start_urls = ['https://inform.alabama.gov/employeesearch.aspx']
    num_pages = None
    name_excludes = ['', ' ', '1', None]

    def parse(self, response):
        yield FormRequest.from_response(
            response,
            dont_filter=True,
            formdata={
                '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$btn_Search',
                '__EVENTARGUMENT': ''
            },
            callback=self.perform_search
        )

    def perform_search(self, response):
        if not self.num_pages:
            # first get the total number of pages
            yield FormRequest.from_response(
                response,
                dont_filter=True,
                formdata={
                    '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$GridView1',
                    '__EVENTARGUMENT': 'Page$Last'
                },
                callback=self.parse_total_pages
            )
        else:
            for employee in response.xpath(
                '//table[@id="ContentPlaceHolder1_GridView1"]//'
                'tr[position() < last()]'
            ):
                emp_name = employee.xpath('.//td[1]//text()').get()
                if emp_name not in self.name_excludes:
                    final_name = emp_name.strip()
                    yield {
                        'name': final_name,
                        'email': employee.xpath('.//td[1]//span//a//text()').get(),
                        'org': employee.xpath('.//td[2]//text()').get(),
                        'phone': employee.xpath('.//td[3]//span//a//text()').get(),
                    }

            # go to the next page
            current_page = response.xpath(
                '//tr[@class="employeeSearchPagerStye"]/td//span/text()'
            ).get()
            current_page = int(current_page)
            if current_page < self.num_pages:
                yield FormRequest.from_response(
                    response,
                    dont_filter=True,
                    formdata={
                        '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$GridView1',
                        '__EVENTARGUMENT': f'Page${current_page + 1}'
                    },
                    callback=self.perform_search
                )

    def parse_total_pages(self, response):
        total_pages = response.xpath(
            '//tr[@class="employeeSearchPagerStye"]/td//span/text()'
        ).get()
        self.num_pages = int(total_pages)
        # back to search
        yield FormRequest.from_response(
            response,
            dont_filter=True,
            formdata={
                '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$btn_Search',
                '__EVENTARGUMENT': ''
            },
            callback=self.perform_search
        )

How to get data from a later function in Scrapy

I'm having trouble structuring Scrapy data the way I want. My spider gets some data from one page, then follows a list of links on that page to get a link from each of those pages.
def parse_page(self, response):
    links = response.css(LINK_SELECTOR).extract()
    data = {
        'name': response.css(NAME_SELECTOR).extract_first(),
        'date': response.css(DATE_SELECTOR).extract(),
    }
    for link in links:
        next_link = response.urljoin(link)
        yield scrapy.Request(next_link, callback=self.parse_url, meta={'data': data})

def parse_url(self, response):
    data = response.meta['data']
    data['url'] = response.css('a::attr(href)').get()
    yield data
What I would like is to get the data with the following structure:
{'name': name, 'date': date, 'url': [url1, url2, url3, url4]}
Instead of
{'name': name, 'date': date, 'url': url1}
{'name': name, 'date': date, 'url': url2}
{'name': name, 'date': date, 'url': url3}
{'name': name, 'date': date, 'url': url4}
I've tried to use Items, but I don't understand how to pass the data from parse_url back to parse_page. How would I do that?
Thanks in advance.
You can use Scrapy's coroutine support to do this pretty easily.
The code would look something like this:
async def parse_page(self, response):
    ...
    for link in links:
        request = response.follow(link)
        response = await self.crawler.engine.download(request, self)
        urls.append(response.css('a::attr(href)').get())
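A slightly fuller sketch of that idea with the pieces from the question filled in — the selector constants and start URL are placeholders, and it assumes a Scrapy version whose engine.download() still accepts the (request, spider) pair used above:

import scrapy

# Placeholder selectors standing in for the ones used in the question.
LINK_SELECTOR = 'a.follow::attr(href)'
NAME_SELECTOR = 'h1::text'
DATE_SELECTOR = 'time::text'


class PageSpider(scrapy.Spider):
    name = 'page_spider'
    start_urls = ['https://example.com/']  # placeholder

    async def parse(self, response):
        data = {
            'name': response.css(NAME_SELECTOR).get(),
            'date': response.css(DATE_SELECTOR).getall(),
            'url': [],
        }
        for link in response.css(LINK_SELECTOR).getall():
            request = response.follow(link)
            # fetch the linked page inline instead of scheduling a new callback
            sub_response = await self.crawler.engine.download(request, self)
            data['url'].append(sub_response.css('a::attr(href)').get())
        # a single item per page, carrying all the collected urls
        yield data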
The following is one of the ways you can achieve that. There is a library, inline_requests, which will help you get the expected output.
import scrapy
from scrapy.crawler import CrawlerProcess
from inline_requests import inline_requests


class YellowpagesSpider(scrapy.Spider):
    name = "yellowpages"
    start_urls = ["https://www.yellowpages.com/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771"]

    @inline_requests
    def parse(self, response):
        data = {
            'name': response.css(".sales-info > h1::text").get(),
            'phone': response.css(".contact > p.phone::text").get(),
            'target_link': []
        }
        for item_link in response.css(".review-info > a.author[href]::attr(href)").getall():
            resp = yield scrapy.Request(response.urljoin(item_link), meta={'handle_httpstatus_all': True})
            target_link = resp.css("a.review-business-name::attr(href)").get()
            data['target_link'].append(target_link)
        print(data)


if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'LOG_LEVEL': 'ERROR',
    })
    c.crawl(YellowpagesSpider)
    c.start()
Output it produces:
{'name': 'Honey Honey Cafe & Crepery', 'phone': '(415) 351-2423', 'target_link': ['/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771', '/walnut-ca/mip/akasaka-japanese-cuisine-455476824', '/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771']}

How do I scrape this kind of dynamically generated website data?

I'm trying to scrape an e-commerce website.
Example link: https://www.lazada.sg/products/esogoal-2-in-1-selfie-stick-tripod-bluetooth-selfie-stand-with-remote-shutter-foldable-tripod-monopod-i279432816-s436738661.html?mp=1
The data is rendered via React, and when I scrape a few links most of the data is returned as null. When I view the page source I cannot find the actual HTML that is visible via inspect element, just JSON inside JavaScript tags. I ran the Scrapy scraper a few times on the same links, and data that was not found before is sometimes returned, so it behaves somewhat randomly. I cannot figure out how I should scrape this kind of website.
I'm also using a pool of user agents and pauses between requests.
script = '''
function main(splash, args)
    assert(splash:go(args.url))
    assert(splash:wait(1.5))
    return splash:html()
end
'''

def start_requests(self):
    url = [
        'https://www.lazada.sg/products/esogoal-tactical-sling-bag-outdoor-chest-pack-shoulder-backpack-military-sport-bag-for-trekking-camping-hiking-rover-sling-daypack-for-men-women-i204814494-s353896924.html?mp=1',
        'https://www.lazada.sg/products/esogoal-2-in-1-selfie-stick-tripod-bluetooth-selfie-stand-with-remote-shutter-foldable-tripod-monopod-i279432816-s436738661.html?mp=1',
        'https://www.lazada.sg/products/esogoal-selfie-stick-tripod-extendable-selfie-stick-monopod-with-integrated-tripod-and-bluetooth-remote-shutter-wireless-selfie-stick-tripod-for-cellphonecameras-i205279097-s309050125.html?mp=1',
        'https://www.lazada.sg/products/esogoal-mini-umbrella-travel-umbrella-sun-rain-umbrella8-ribs-98cm-big-surface-lightweight-compact-parasol-uv-protection-for-men-women-i204815487-s308312226.html?mp=1',
        'https://www.lazada.sg/products/esogoal-2-in-1-selfie-stick-tripod-bluetooth-selfie-stand-with-remote-shutter-foldable-tripod-monopod-i279432816-s436738661.html?mp=1'
    ]
    for link in url:
        yield SplashRequest(url=link, callback=self.parse, endpoint='render.html', args={'wait': 0.5, 'lua_source': self.script}, dont_filter=True)

def parse(self, response):
    yield {
        'title': response.xpath("//span[@class='pdp-mod-product-badge-title']/text()").extract_first(),
        'price': response.xpath("//span[contains(@class, 'pdp-price')]/text()").extract_first(),
        'description': response.xpath("//div[@id='module_product_detail']").extract_first()
    }
I tried this: pass 'execute' as the Splash endpoint instead of 'render.html'.
import scrapy
from scrapy_splash import SplashRequest


class DynamicSpider(scrapy.Spider):
    name = 'products'
    url = [
        'https://www.lazada.sg/products/esogoal-tactical-sling-bag-outdoor-chest-pack-shoulder-backpack-military-sport-bag-for-trekking-camping-hiking-rover-sling-daypack-for-men-women-i204814494-s353896924.html?mp=1',
        'https://www.lazada.sg/products/esogoal-2-in-1-selfie-stick-tripod-bluetooth-selfie-stand-with-remote-shutter-foldable-tripod-monopod-i279432816-s436738661.html?mp=1',
        'https://www.lazada.sg/products/esogoal-selfie-stick-tripod-extendable-selfie-stick-monopod-with-integrated-tripod-and-bluetooth-remote-shutter-wireless-selfie-stick-tripod-for-cellphonecameras-i205279097-s309050125.html?mp=1',
        'https://www.lazada.sg/products/esogoal-mini-umbrella-travel-umbrella-sun-rain-umbrella8-ribs-98cm-big-surface-lightweight-compact-parasol-uv-protection-for-men-women-i204815487-s308312226.html?mp=1',
        'https://www.lazada.sg/products/esogoal-2-in-1-selfie-stick-tripod-bluetooth-selfie-stand-with-remote-shutter-foldable-tripod-monopod-i279432816-s436738661.html?mp=1',
    ]
    script = """
    function main(splash, args)
        assert(splash:go(args.url))
        assert(splash:wait(1.5))
        return {
            html = splash:html()
        }
    end
    """

    def start_requests(self):
        for link in self.url:
            yield SplashRequest(
                url=link,
                callback=self.parse,
                endpoint='execute',
                args={'wait': 0.5, 'lua_source': self.script},
                dont_filter=True,
            )

    def parse(self, response):
        yield {
            'title': response.xpath("//span[@class='pdp-mod-product-badge-title']/text()").extract_first(),
            'price': response.xpath("//span[contains(@class, 'pdp-price')]/text()").extract_first(),
            'description': response.xpath("//div[@id='module_product_detail']/h2/text()").extract_first()
        }
And this is the result (shown as a screenshot in the original post).
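Since the question notes that the product data is already present as JSON inside script tags in the raw HTML, another route worth trying is to parse that JSON directly and skip Splash altogether. A rough sketch only — the script-tag marker and the structure of the payload are assumptions and need to be checked against the actual page source:

import json
import re

def parse(self, response):
    # grab the first script tag that looks like it carries the product JSON;
    # 'pdp' is a guessed marker, adjust it after inspecting the page source
    raw = response.xpath("//script[contains(text(), 'pdp')]/text()").get()
    if not raw:
        self.logger.warning('no embedded JSON found on %s', response.url)
        return
    match = re.search(r'\{.*\}', raw, re.DOTALL)
    if not match:
        return
    try:
        data = json.loads(match.group(0))
    except ValueError:
        self.logger.warning('embedded blob was not valid JSON on %s', response.url)
        return
    # inspect data.keys() to find where title/price/description actually live
    yield {'keys': list(data.keys()), 'url': response.url}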

How to scrape data on a website that uses JavaScript with pagination

I have a website I need to scrape:
"https://www.forever21.com/us/shop/catalog/category/f21/sale#pageno=1&pageSize=120&filter=price:0,250&sort=5"
but I cannot retrieve all the data. It also has pagination and it uses JavaScript as well.
Any idea on how I can scrape all the items? Here's my code.
def parse_2(self, response):
    for product_item_forever in response.css('div.pi_container'):
        item = GpdealsSpiderItem_f21()
        f21_title = product_item_forever.css('p.p_name::text').extract_first()
        f21_regular_price = product_item_forever.css('span.p_old_price::text').extract_first()
        f21_sale_price = product_item_forever.css('span.p_sale.t_pink::text').extract_first()
        f21_photo_url = product_item_forever.css('img::attr(data-original)').extract_first()
        f21_description_url = product_item_forever.css('a.item_slider.product_link::attr(href)').extract_first()
        item['f21_title'] = f21_title
        item['f21_regular_price'] = f21_regular_price
        item['f21_sale_price'] = f21_sale_price
        item['f21_photo_url'] = f21_photo_url
        item['f21_description_url'] = f21_description_url
        yield item
Please help. Thank you.
One of the first steps in a web scraping project should be looking for an API that the website uses to get the data. Not only does it save you parsing HTML, using an API also saves the provider's bandwidth and server load. To look for an API, use your browser's developer tools and look for XHR requests in the Network tab. In your case, the website makes POST requests to this URL:
https://www.forever21.com/eu/shop/Catalog/GetProducts
You can then simulate the XHR request in Scrapy to get the data in JSON format. Here's the code for the spider:
# -*- coding: utf-8 -*-
import json

import scrapy


class Forever21Spider(scrapy.Spider):
    name = 'forever21'

    url = 'https://www.forever21.com/eu/shop/Catalog/GetProducts'
    payload = {
        'brand': 'f21',
        'category': 'sale',
        'page': {'pageSize': 60},
        'filter': {
            'price': {'minPrice': 0, 'maxPrice': 250}
        },
        'sort': {'sortType': '5'}
    }

    def start_requests(self):
        # scrape the first page
        payload = self.payload.copy()
        payload['page']['pageNo'] = 1
        yield scrapy.Request(
            self.url, method='POST', body=json.dumps(payload),
            headers={'X-Requested-With': 'XMLHttpRequest',
                     'Content-Type': 'application/json; charset=UTF-8'},
            callback=self.parse, meta={'pageNo': 1}
        )

    def parse(self, response):
        # parse the JSON response and extract the data
        data = json.loads(response.text)
        for product in data['CatalogProducts']:
            item = {
                'title': product['DisplayName'],
                'regular_price': product['OriginalPrice'],
                'sale_price': product['ListPrice'],
                'photo_url': 'https://www.forever21.com/images/default_330/%s' % product['ImageFilename'],
                'description_url': product['ProductShareLinkUrl']
            }
            yield item

        # simulate pagination if we are not at the end
        if len(data['CatalogProducts']) == self.payload['page']['pageSize']:
            payload = self.payload.copy()
            payload['page']['pageNo'] = response.meta['pageNo'] + 1
            yield scrapy.Request(
                self.url, method='POST', body=json.dumps(payload),
                headers={'X-Requested-With': 'XMLHttpRequest',
                         'Content-Type': 'application/json; charset=UTF-8'},
                callback=self.parse, meta={'pageNo': payload['page']['pageNo']}
            )
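To try it quickly without a full Scrapy project, the spider can be run from a single file with CrawlerProcess and the items written straight to CSV — a sketch assuming the class above is in the same file and a Scrapy version that supports the FEEDS setting (2.1 or later):

from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess(settings={
        'FEEDS': {'products.csv': {'format': 'csv'}},  # export items as CSV
    })
    process.crawl(Forever21Spider)
    process.start()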
