I have a website that I need to scrape:
"https://www.forever21.com/us/shop/catalog/category/f21/sale#pageno=1&pageSize=120&filter=price:0,250&sort=5" — but I cannot retrieve all the data; the site has pagination and uses JavaScript as well.
Any idea how I can scrape all the items? Here's my code:
def parse_2(self, response):
    """Yield one GpdealsSpiderItem_f21 per product card on a listing page."""
    for card in response.css('div.pi_container'):
        item = GpdealsSpiderItem_f21()
        # Each field comes straight from the card's markup; extract_first()
        # returns None when a selector matches nothing.
        item['f21_title'] = card.css('p.p_name::text').extract_first()
        item['f21_regular_price'] = card.css('span.p_old_price::text').extract_first()
        item['f21_sale_price'] = card.css('span.p_sale.t_pink::text').extract_first()
        item['f21_photo_url'] = card.css('img::attr(data-original)').extract_first()
        item['f21_description_url'] = card.css('a.item_slider.product_link::attr(href)').extract_first()
        yield item
Please help. Thank you!
One of the first steps in web scraping project should be looking for an API that the website uses to get the data. Not only does it save you parsing HTML, using an API also saves provider's bandwidth and server load. To look for an API, use your browser's developer tools and look for XHR requests in the network tab. In your case, the web site makes POST requests to this URL:
https://www.forever21.com/eu/shop/Catalog/GetProducts
You can then simulate the XHR request in Scrapy to get the data in JSON format. Here's the code for the spider:
# -*- coding: utf-8 -*-
import json
import scrapy
class Forever21Spider(scrapy.Spider):
    """Scrape the Forever 21 sale catalog through its JSON product API."""

    name = 'forever21'
    url = 'https://www.forever21.com/eu/shop/Catalog/GetProducts'

    # Template for the POST body. Treated as read-only: _page_request builds
    # a fresh copy per request instead of mutating this shared dict.
    payload = {
        'brand': 'f21',
        'category': 'sale',
        'page': {'pageSize': 60},
        'filter': {
            'price': {'minPrice': 0, 'maxPrice': 250}
        },
        'sort': {'sortType': '5'}
    }

    def _page_request(self, page_no):
        """Build the XHR-style POST request for one catalog page.

        The original code used ``self.payload.copy()`` — a *shallow* copy
        whose nested 'page' dict was shared by every request and mutated in
        place. Rebuilding both levels here avoids that shared-state bug.
        """
        payload = dict(self.payload, page=dict(self.payload['page'], pageNo=page_no))
        return scrapy.Request(
            self.url, method='POST', body=json.dumps(payload),
            headers={'X-Requested-With': 'XMLHttpRequest',
                     'Content-Type': 'application/json; charset=UTF-8'},
            callback=self.parse, meta={'pageNo': page_no}
        )

    def start_requests(self):
        # Scrape the first page only; parse() schedules the rest.
        yield self._page_request(1)

    def parse(self, response):
        """Parse the JSON product list, yield items, then paginate."""
        data = json.loads(response.text)
        for product in data['CatalogProducts']:
            yield {
                'title': product['DisplayName'],
                'regular_price': product['OriginalPrice'],
                'sale_price': product['ListPrice'],
                'photo_url': 'https://www.forever21.com/images/default_330/%s' % product['ImageFilename'],
                'description_url': product['ProductShareLinkUrl'],
            }
        # A full page suggests more results exist; request the next page.
        if len(data['CatalogProducts']) == self.payload['page']['pageSize']:
            yield self._page_request(response.meta['pageNo'] + 1)
Related
I am trying to scrape all cars from the website: www.webuycars.co.za
I am using scrapy to do this and each page has 24 vehicles that I want to send to a json file.
Through data analysis it seems that I am just scraping the first page only or overwriting the variable used to create the json file.
import json
import scrapy
from scrapy.crawler import CrawlerProcess
class carSpider(scrapy.Spider):
    # NOTE(review): this is the *broken* version discussed in the question.
    # parse() re-opens and rewrites webuycar.json on every call, and the
    # range() loop only mutates a local copy of the response, so only the
    # first API page is ever really scraped. The answer below fixes this.
    name = 'car'
    # Search payload POSTed to the API; "to"/"size" control the paging window.
    body = {"to":24,"size":24,"type":"All","filter_type":"all","subcategory":None,"q":"","Make":None,"Roadworthy":None,"Auctions":[],"Model":None,"Variant":None,"DealerKey":None,"FuelType":None,"BodyType":None,"Gearbox":None,"AxleConfiguration":None,"Colour":None,"FinanceGrade":None,"Priced_Amount_Gte":0,"Priced_Amount_Lte":0,"MonthlyInstallment_Amount_Gte":0,"MonthlyInstallment_Amount_Lte":0,"auctionDate":None,"auctionEndDate":None,"auctionDurationInSeconds":None,"Kilometers_Gte":0,"Kilometers_Lte":0,"Priced_Amount_Sort":"","Bid_Amount_Sort":"","Kilometers_Sort":"","Year_Sort":"","Auction_Date_Sort":"","Auction_Lot_Sort":"","Year":[],"Price_Update_Date_Sort":"","Online_Auction_Date_Sort":"","Online_Auction_In_Progress":""}
    def start_requests(self):
        # Single POST — pagination is never advanced, hence "first page only".
        yield scrapy.Request(
            url='https://website-elastic-api.webuycars.co.za/api/search',
            callback=self.parse,
            body=json.dumps(self.body),
            method="POST",
            headers= {
                "content-type": "application/json",
                "User-Agent":"mozilla/5.0"
            }
        )
    def parse(self, response):
        # Shadows the Response object with the decoded JSON dict.
        response = json.loads(response.body)
        cars = []
        filename = "webuycar.json"
        # BUG: only response['total']['value'] changes; response['data'] is the
        # same 24 records appended 272 times — redundant data in the file.
        # (Indentation reconstructed from the transcript — TODO confirm.)
        for item in range(0,6528,24):
            response['total']['value']=item
            cars.append(response['data'])
        # BUG: "w" mode truncates the file on every parse() call.
        with open(filename, "w") as f:
            json.dump(cars, f, indent=4)
        for resp in response['data']:
            yield {
                'Title': resp['OnlineDescription']
            }
# Run the spider in-process (script mode, no `scrapy crawl` CLI needed).
process = CrawlerProcess()
process.crawl(carSpider)
process.start()
I would like to fix this as it messes with the accuracy of the database I have created and makes redundant data prevalent.
I have looked at my json file to see if the issue was from extraction. It seems that my webscraper is the problem. I would appreciate some thoughts on this.
You shouldn't try to dump the data into a file from the parse method. You should either use command line arguments, or in the case when running as a script like in your example, you can use feed exports.
Like this:
import json
import scrapy
from scrapy.crawler import CrawlerProcess
class carSpider(scrapy.Spider):
    """First-pass fix: writing is delegated to a FEEDS export instead of
    open()/json.dump inside parse(), so the output file is appended to by
    the feed exporter rather than truncated on every callback."""
    name = 'car'
    # Search payload POSTed to the API; "to"/"size" control the paging window.
    body = {"to":24,"size":24,"type":"All","filter_type":"all","subcategory":None,"q":"","Make":None,"Roadworthy":None,"Auctions":[],"Model":None,"Variant":None,"DealerKey":None,"FuelType":None,"BodyType":None,"Gearbox":None,"AxleConfiguration":None,"Colour":None,"FinanceGrade":None,"Priced_Amount_Gte":0,"Priced_Amount_Lte":0,"MonthlyInstallment_Amount_Gte":0,"MonthlyInstallment_Amount_Lte":0,"auctionDate":None,"auctionEndDate":None,"auctionDurationInSeconds":None,"Kilometers_Gte":0,"Kilometers_Lte":0,"Priced_Amount_Sort":"","Bid_Amount_Sort":"","Kilometers_Sort":"","Year_Sort":"","Auction_Date_Sort":"","Auction_Lot_Sort":"","Year":[],"Price_Update_Date_Sort":"","Online_Auction_Date_Sort":"","Online_Auction_In_Progress":""}
    # Feed export replaces the hand-rolled file writing of the question code.
    custom_settings = {"FEEDS": {
        "webuycar.json":{
            'format': 'json',
            'encoding': 'utf8',
            'store_empty': False,
            'indent': 4
        }
    }}
    def start_requests(self):
        yield scrapy.Request(
            url='https://website-elastic-api.webuycars.co.za/api/search',
            callback=self.parse,
            body=json.dumps(self.body),
            method="POST",
            headers= {
                "content-type": "application/json",
                "User-Agent":"mozilla/5.0"
            }
        )
    def parse(self, response):
        data = response.json()
        # NOTE(review): mirrors the question's odd loop — the same dict is
        # yielded 272 times with only ['total']['value'] changed, which is why
        # the output file was ~7M lines. Indentation reconstructed as two
        # sequential loops — TODO confirm against the original answer.
        for item in range(0,6528,24):
            data['total']['value']=item
            yield data
        for item in data['data']:
            yield {'Title': item['OnlineDescription']}
# Run the spider in-process (script mode, no `scrapy crawl` CLI needed).
process = CrawlerProcess()
process.crawl(carSpider)
process.start()
I'm not totally sure this solves your problem, because you are still scraping a single URL, but it should avoid overwriting the file.
That said, I tested this and the output JSON file was 7,143,882 lines long.
Update:
After taking a closer look at your code, I think that this is closer to what you are actually trying to achieve. This makes many calls to the api and extracts all 24 OnlineDescription fields from each api call response.
import json
import scrapy
from scrapy.crawler import CrawlerProcess
class carSpider(scrapy.Spider):
    """Page through the webuycars search API, 24 results per POST."""

    name = 'car'
    # Template search payload; "to" is the paging offset, "size" the page size.
    # Treated as read-only — see start_requests.
    body = {"to":24,"size":24,"type":"All","filter_type":"all","subcategory":None,"q":"","Make":None,"Roadworthy":None,"Auctions":[],"Model":None,"Variant":None,"DealerKey":None,"FuelType":None,"BodyType":None,"Gearbox":None,"AxleConfiguration":None,"Colour":None,"FinanceGrade":None,"Priced_Amount_Gte":0,"Priced_Amount_Lte":0,"MonthlyInstallment_Amount_Gte":0,"MonthlyInstallment_Amount_Lte":0,"auctionDate":None,"auctionEndDate":None,"auctionDurationInSeconds":None,"Kilometers_Gte":0,"Kilometers_Lte":0,"Priced_Amount_Sort":"","Bid_Amount_Sort":"","Kilometers_Sort":"","Year_Sort":"","Auction_Date_Sort":"","Auction_Lot_Sort":"","Year":[],"Price_Update_Date_Sort":"","Online_Auction_Date_Sort":"","Online_Auction_In_Progress":""}
    # Let the feed exporter handle file output.
    custom_settings = {"FEEDS": {
        "webuycar.json":{
            'format': 'json',
            'encoding': 'utf8',
            'store_empty': False,
            'indent': 4
        }
    }}

    def start_requests(self):
        """Issue one POST per 24-result window.

        Builds a fresh payload per request instead of the original
        ``self.body["to"] = i``, which mutated the *class-level* dict shared
        by every request (and every spider instance).
        """
        for offset in range(24, 6528, 24):
            payload = dict(self.body, to=offset)
            yield scrapy.Request(
                url='https://website-elastic-api.webuycars.co.za/api/search',
                callback=self.parse,
                body=json.dumps(payload),
                method="POST",
                headers= {
                    "content-type": "application/json",
                    "User-Agent":"mozilla/5.0"
                }
            )

    def parse(self, response):
        """Yield the vehicle description of every record on this page."""
        data = response.json()
        for item in data['data']:
            yield {"Title": item['OnlineDescription']}
# Run the spider in-process (script mode, no `scrapy crawl` CLI needed).
process = CrawlerProcess()
process.crawl(carSpider)
process.start()
I'm writing a python script to automatically check dog re-homing sites for dogs that we might be able to adopt as they become available, however I'm stuck completing the form data on this site and can't figure out why.
The form attributes state it should have a post method and I've gone through all of the inputs for the form and created a payload.
I expect the page with the search results to be returned and the html scraped from the results page so I can start processing it, but the scrape is just the form page and never has the results.
I've tried using .get with the payload as params, the url with the payload and using the requests-html library to render any java script elements without success.
If you paste the url_w_payload into a browser it loads the page and says one of the fields is empty. If you then press enter in the url bar again to reload the page without modifying the url it loads... something to do with cookies maybe?
import requests
from requests_html import HTMLSession
# NOTE(review): this is the question's non-working attempt — the POST comes
# back with the form page, not the results. As the answer below shows, the
# site expects session cookies and specific headers before the search POST.
session = HTMLSession()
# Form action URL (query string carries the portlet routing parameters).
form_url = "https://www.rspca.org.uk/findapet?p_p_id=petSearch2016_WAR_ptlPetRehomingPortlets&p_p_lifecycle=1&p_p_state=normal&p_p_mode=view&_petSearch2016_WAR_ptlPetRehomingPortlets_action=search"
# Same URL with the form fields appended as query parameters (GET attempt).
url_w_payload = "https://www.rspca.org.uk/findapet?p_p_id=petSearch2016_WAR_ptlPetRehomingPortlets&p_p_lifecycle=1&p_p_state=normal&p_p_mode=view&_petSearch2016_WAR_ptlPetRehomingPortlets_action=search&noPageView=false&animalType=DOG&freshSearch=false&arrivalSort=false&previousAnimalType=&location=WC2N5DU&previousLocation=&prevSearchedPostcode=&postcode=WC2N5DU&searchedLongitude=-0.1282688&searchedLatitude=51.5072106"
# Form fields reproduced from the page's <input> elements.
payload = {'noPageView': 'false','animalType': 'DOG', 'freshSearch': 'false', 'arrivalSort': 'false', 'previousAnimalType': '', 'location': 'WC2N5DU', 'previousLocation': '','prevSearchedPostcode': '', 'postcode': 'WC2N5DU', 'searchedLongitude': '-0.1282688', 'searchedLatitude': '51.5072106'}
# Earlier plain-requests attempt, kept for reference:
#req = requests.post(form_url, data = payload)
#with open("requests_output.txt", "w") as f:
#    f.write(req.text)
ses = session.post(form_url, data = payload)
ses.html.render()  # render any JavaScript before saving the HTML
with open("session_output.txt", "w") as f:
    f.write(ses.text)
print("Done")
There's a few hoops to jump with cookies and headers but once you get those right, you'll get the proper response.
Here's how to do it:
import time
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
# Portlet routing parameters that go in the POST URL's query string.
query_string = {
    "p_p_id": "petSearch2016_WAR_ptlPetRehomingPortlets",
    "p_p_lifecycle": 1,
    "p_p_state": "normal",
    "p_p_mode": "view",
    "_petSearch2016_WAR_ptlPetRehomingPortlets_action": "search",
}
# Search form fields, form-urlencoded into the POST body.
payload = {
    'noPageView': 'false',
    'animalType': 'DOG',
    'freshSearch': 'false',
    'arrivalSort': 'false',
    'previousAnimalType': '',
    'location': 'WC2N5DU',
    'previousLocation': '',
    'prevSearchedPostcode': '',
    'postcode': 'WC2N5DU',
    'searchedLongitude': '-0.1282688',
    'searchedLatitude': '51.5072106',
}
def make_cookies(cookie_dict: dict) -> str:
    """Serialise a cookie mapping into a 'name=value; name=value' header string."""
    pairs = []
    for name, value in cookie_dict.items():
        pairs.append(f"{name}={value}")
    return "; ".join(pairs)
with requests.Session() as connection:
    main_url = "https://www.rspca.org.uk"
    # A browser-like User-Agent — the site rejects the default requests UA.
    connection.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) " \
                                       "AppleWebKit/537.36 (KHTML, like Gecko) " \
                                       "Chrome/90.0.4430.212 Safari/537.36"
    # First GET just collects the session cookies the search POST requires.
    r = connection.get(main_url)
    cookies = make_cookies(r.cookies.get_dict())
    # Extra cookies the browser sets client-side; LFR_SESSION_STATE_* is a
    # Liferay timestamp cookie — presumably any current epoch value works.
    additional_string = f"; cb-enabled=enabled; " \
                        f"LFR_SESSION_STATE_10110={int(time.time())}"
    post_url = f"https://www.rspca.org.uk/findapet?{urlencode(query_string)}"
    # The server checks cookie, referer and content-type before answering.
    connection.headers.update(
        {
            "cookie": cookies + additional_string,
            "referer": post_url,
            "content-type": "application/x-www-form-urlencoded",
        }
    )
    response = connection.post(post_url, data=urlencode(payload)).text
    # Each result links to a detail page via an <a class="detailLink">.
    dogs = BeautifulSoup(response, "lxml").find_all("a", class_="detailLink")
    print("\n".join(f"{main_url}{dog['href']}" for dog in dogs))
Output (shortened for brevity and no need to paginate the page as all dogs come in the response):
https://www.rspca.org.uk/findapet/details/-/Animal/JAY_JAY/ref/217747/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/STORM/ref/217054/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/DASHER/ref/205702/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/EVE/ref/205701/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/SEBASTIAN/ref/178975/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/FIJI/ref/169578/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/ELLA/ref/154419/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/BEN/ref/217605/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/SNOWY/ref/214416/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/BENSON/ref/215141/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/BELLA/ref/207716/rehome/
and much more ...
PS. I really enjoyed this challenge as I have two dogs from a shelter. Keep it up, man!
I am new to Scrapy and Python on general.
Here is the code:
import scrapy
import json
class MOOCSpider(scrapy.Spider):
    # NOTE(review): question code with elided sections (`...omitted...`), kept
    # verbatim. The core problem: the for-loop schedules all 9 requests
    # immediately, so parse_api never gets to update global_id between them —
    # every request uses the same after_id.
    name = 'mooc'
    start_urls = ['https://www.plurk.com/search?q=italy']
    custom_settings = {
        # Disable dupe filtering — every POST goes to the same URL.
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }
    # Cursor for the API: id of the last post seen.
    global_id = 1458122036
    def parse(self, response):
        url = 'https://www.plurk.com/Search/search2'
        headers = {
            ...omitted...
        }
        # BUG: all iterations read the same self.global_id — requests are
        # scheduled before any response is processed.
        for i in range(1,10):
            formdata = {
                "after_id": str(self.global_id)
            }
            yield scrapy.FormRequest(url, callback=self.parse_api, formdata=formdata, headers=headers)
    def parse_api(self, response):
        raw = response.body
        data = json.loads(raw)
        posts = data["plurks"]
        users = data["users"]
        l = len(posts)
        i = 0
        for post in posts:
            i = i + 1
            # Remember the id of the *last* post as the next cursor.
            if (i == l):
                self.global_id = post["plurk_id"]
            ...omitted code...
            yield {
                'Author': user_name,
                'Body': post['content'],
                'app': 'plurk'
            }
The problem that I have is that Scrapy is making first all the requests in the for loop and then it is executing the code in parse_api.
What I would like to do is let scrapy do one iteration of the for loop, call the callback function, wait for it to return and then do another iteration.
This because the id that I need for the next request will be set in the global_id variable by the callback function.
You can't achieve this by scheduling requests in loop.
You can implement this only if you will schedule only one (next) request per parse/parse_api method call:
class MOOCSpider(scrapy.Spider):
    """Walk the Plurk search API strictly one request at a time, so each
    call's last plurk_id can seed the next request's ``after_id`` cursor."""

    name = 'mooc'
    start_urls = ['https://www.plurk.com/search?q=italy']
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
        'DOWNLOAD_DELAY': 5,
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36",
    }

    def _search_request(self, after_id):
        """Build the POST for the page that starts after ``after_id``."""
        formdata = {
            "query": 'italy',
            "start_date": "2019/12",
            "end_date": "2020/12",
            "after_id": str(after_id),
        }
        return scrapy.FormRequest('https://www.plurk.com/Search/search2',
                                  callback=self.parse_api, formdata=formdata)

    def parse(self, response):
        # Schedule only the first API request; parse_api chains the rest.
        yield self._search_request('1458122036')

    def parse_api(self, response):
        payload = json.loads(response.body)
        last_id = None
        for post in payload["plurks"]:
            last_id = post["plurk_id"]
            yield {
                'Author': payload["users"][str(post["owner_id"])]["nick_name"],  # instead of user_id?
                'Body': post["content"],
                'app': 'plurk',
            }
        # last_id now holds the cursor for the next page. Scrapy's automatic
        # response.meta["depth"] caps the request chain instead of a counter.
        if response.meta["depth"] <= 11 and last_id:
            yield self._search_request(last_id)
Answering my own question:
Now the parse method does just one request and calls once the parse_api method. Parse_api processes the response and sets the global_id variable. Once it's done processing its own response it makes another request passing itself as the callback function.
By doing this you are guaranteed that the global_id variable will be properly set, since the new request will be made only once parse_api has finished running.
request.cb_kwargs["loop_l"] is used to pass an additional argument to the callback function. This time it's a counter that controls the number of requests we want to make. When the counter reaches 200 we stop the crawling.
import scrapy
import json
# NOTE(review): module-level list, apparently unused in the visible code.
plurk_id = []
class MOOCSpider(scrapy.Spider):
    # Self-answer version with elided sections (`...OMITTED...`), kept
    # verbatim: exactly one request is in flight at a time, and parse_api
    # re-schedules itself after updating global_id.
    name = 'mooc'
    start_urls = ['https://www.plurk.com/search?q=']
    custom_settings = {
        # Disable dupe filtering — every POST goes to the same URL.
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }
    # Cursor for the API: id of the last post seen so far.
    global_id = 1455890167
    url = 'https://www.plurk.com/Search/search2'
    headers = {
        ...OMITTED...
    }
    def parse(self, response):
        formdata = {
            "after_id": str(self.global_id)
        }
        request = scrapy.FormRequest(self.url, callback=self.parse_api, formdata=formdata, headers=self.headers)
        # loop_l counts how many chained requests have been made.
        request.cb_kwargs["loop_l"] = str(0)
        yield request
    def parse_api(self, response, loop_l):
        int_loop_l = int(loop_l)
        int_loop_l = int_loop_l + 1
        # Stop the chain after 200 calls.
        if (int_loop_l == 200):
            return
        raw = response.body
        data = json.loads(raw)
        ...omitted code...
        ... GET AND SET THE NEW global_id FROM THE RESPONSE ...
        # make another request with the new id
        formdata = {
            "after_id": str(self.global_id)
        }
        request = scrapy.FormRequest(self.url, callback=self.parse_api, formdata=formdata, headers=self.headers)
        request.cb_kwargs["loop_l"] = str(int_loop_l)
        yield request
I am trying to scrape an infinite-scroll AJAX page, but I am not able to go to the next page and yield its items. I am able to get the response text. I tried debugging but could not find a solution. Can anyone help me out with this?
import scrapy
class InfiniteScrollingSpider(scrapy.Spider):
    """Scrape an infinite-scroll article feed via the WordPress admin-ajax API."""

    name = 'wegotthiscovered_review'
    scrolling_url = 'https://wegotthiscovered.com/wp-admin/admin-ajax.php'

    def _scroll_request(self, page):
        """POST for one page of the feed.

        The same form fields are sent for every page — the original code sent
        a different 'query_args' blob (and dropped 'attrs') for pages > 1,
        which is the likely reason pagination returned no content.
        """
        return scrapy.FormRequest(
            self.scrolling_url,
            formdata={
                'action': "face3_infinite_scroll",
                'page': str(page),
                'attrs': "{\"id\":\"1\",\"order\":\"\",\"orderby\":\"\",\"catnames\":\"movies+reviews\",\"postnotin\":\"905069,904520,904521,903475,901576,900303,893944,895136,891795,886876,884402,881283\",\"timestampbefore\":1591800990}"
            },
            callback=self.parse_page,
            meta={'page': page},
        )

    def start_requests(self):
        yield self._scroll_request(1)

    def parse_page(self, response):
        """Parse one AJAX page: yield article data, then request the next page."""
        json_data = response.json()  # matches the sibling spiders' style
        # Server signals the end of the feed by success=false or empty content.
        if not json_data.get('success') or not json_data.get('data') or not json_data['data'].get('content'):
            return
        # 'content' is an HTML fragment; parse it with a throwaway Selector.
        for article in scrapy.Selector(text=json_data['data']['content']).css('article'):
            title = article.css('h4 ::text').extract_first()
            link = article.css('h4 ::attr(href)').extract_first()
            yield {
                # Guard: extract_first() returns None for cards without an <h4>;
                # the original .extract_first().strip() crashed on those.
                'page_title': title.strip() if title else None,
                'review_link': link.strip() if link else None,
            }
        yield self._scroll_request(response.meta.get('page') + 1)
I'm trying to scrape E-commerce website,
example link: https://www.lazada.sg/products/esogoal-2-in-1-selfie-stick-tripod-bluetooth-selfie-stand-with-remote-shutter-foldable-tripod-monopod-i279432816-s436738661.html?mp=1
Data is being rendered via React and when i perform scraping on few links most of the data is being returned as null, and when i view the page source i cannot find actually HTML that is available via inspect element, just a json inside Javascript tags. I tested few times running scrapy scraper on the same links and data which was not found before, actually returns content, so its somehow randomly. I cannot figure out how should i scrape this kind of website.
As well i'm using pool of useragents and breaks between requests.
# Lua script executed by Splash: load the page, wait 1.5 s for the
# JavaScript to render, then return the resulting HTML.
script = '''
function main(splash, args)
assert(splash:go(args.url))
assert(splash:wait(1.5))
return splash:html()
end
'''
def start_requests(self):
    """Queue a Splash render for every product page."""
    product_links = [
        'https://www.lazada.sg/products/esogoal-tactical-sling-bag-outdoor-chest-pack-shoulder-backpack-military-sport-bag-for-trekking-camping-hiking-rover-sling-daypack-for-men-women-i204814494-s353896924.html?mp=1',
        'https://www.lazada.sg/products/esogoal-2-in-1-selfie-stick-tripod-bluetooth-selfie-stand-with-remote-shutter-foldable-tripod-monopod-i279432816-s436738661.html?mp=1',
        'https://www.lazada.sg/products/esogoal-selfie-stick-tripod-extendable-selfie-stick-monopod-with-integrated-tripod-and-bluetooth-remote-shutter-wireless-selfie-stick-tripod-for-cellphonecameras-i205279097-s309050125.html?mp=1',
        'https://www.lazada.sg/products/esogoal-mini-umbrella-travel-umbrella-sun-rain-umbrella8-ribs-98cm-big-surface-lightweight-compact-parasol-uv-protection-for-men-women-i204815487-s308312226.html?mp=1',
        'https://www.lazada.sg/products/esogoal-2-in-1-selfie-stick-tripod-bluetooth-selfie-stand-with-remote-shutter-foldable-tripod-monopod-i279432816-s436738661.html?mp=1',
    ]
    # dont_filter=True keeps the duplicated link from being deduplicated.
    for product_url in product_links:
        yield SplashRequest(url=product_url, callback=self.parse, endpoint='render.html',
                            args={'wait' : 0.5, 'lua_source' : self.script}, dont_filter=True)
def parse(self, response):
    """Extract title, price and description from a rendered product page.

    Fix: XPath attribute tests use '@' — the '#' in the original snippet
    ('//span[#class=...]') is a markdown-mangling artefact and makes the
    expressions invalid, so extract_first() could never match.
    """
    yield {
        'title' : response.xpath("//span[@class='pdp-mod-product-badge-title']/text()").extract_first(),
        'price' : response.xpath("//span[contains(@class, 'pdp-price')]/text()").extract_first(),
        'description' : response.xpath("//div[@id='module_product_detail']").extract_first()
    }
I tried this:
Pass 'execute' as the Splash endpoint instead of 'render.html'.
from scrapy_splash import SplashRequest
class DynamicSpider(scrapy.Spider):
    """Render JS-heavy Lazada product pages with Splash, then scrape them."""

    name = 'products'

    # Product pages to scrape (the duplicate is intentional; requests are
    # sent with dont_filter=True).
    url = [
        'https://www.lazada.sg/products/esogoal-tactical-sling-bag-outdoor-chest-pack-shoulder-backpack-military-sport-bag-for-trekking-camping-hiking-rover-sling-daypack-for-men-women-i204814494-s353896924.html?mp=1',
        'https://www.lazada.sg/products/esogoal-2-in-1-selfie-stick-tripod-bluetooth-selfie-stand-with-remote-shutter-foldable-tripod-monopod-i279432816-s436738661.html?mp=1',
        'https://www.lazada.sg/products/esogoal-selfie-stick-tripod-extendable-selfie-stick-monopod-with-integrated-tripod-and-bluetooth-remote-shutter-wireless-selfie-stick-tripod-for-cellphonecameras-i205279097-s309050125.html?mp=1',
        'https://www.lazada.sg/products/esogoal-mini-umbrella-travel-umbrella-sun-rain-umbrella8-ribs-98cm-big-surface-lightweight-compact-parasol-uv-protection-for-men-women-i204815487-s308312226.html?mp=1',
        'https://www.lazada.sg/products/esogoal-2-in-1-selfie-stick-tripod-bluetooth-selfie-stand-with-remote-shutter-foldable-tripod-monopod-i279432816-s436738661.html?mp=1',
    ]

    # Lua script for Splash's 'execute' endpoint: open the page, wait 1.5 s
    # for React to render, return the final HTML.
    script = """
function main(splash, args)
assert(splash:go(args.url))
assert(splash:wait(1.5))
return {
html = splash:html()
}
end
"""

    def start_requests(self):
        for link in self.url:
            yield SplashRequest(
                url=link,
                callback=self.parse,
                endpoint='execute',
                args={'wait': 0.5, 'lua_source': self.script},
                dont_filter=True,
            )

    def parse(self, response):
        """Extract title, price and description from the rendered page.

        Fix: XPath attribute tests use '@' — the '#' in the original
        ('//span[#class=...]') is a markdown artefact and makes the
        expressions invalid XPath.
        """
        yield {
            'title': response.xpath("//span[@class='pdp-mod-product-badge-title']/text()").extract_first(),
            'price': response.xpath("//span[contains(@class, 'pdp-price')]/text()").extract_first(),
            'description': response.xpath("//div[@id='module_product_detail']/h2/text()").extract_first()
        }
And this is the result: