Scrapy-Splash Requested URL vs Real URL - python

I'm trying to make instantaneous reports for some of my webpages using the Splash and Scrapy-Splash python module.
The problem is that, when a website redirects, I can't obtain the correct final URL the way Splash's render.json reports it.
For example on localhost:8050/render.json the result for rendering is:
{"requestedUrl": "",
"url": "",
"title": "Google", "geometry": [0, 0, 1024, 768]}
But inside my python script I only manage to obtain ""
My code is:
def start_requests(self):
return [Request(self.url, callback=self.parse, dont_filter=True)]
def parse(self, response):
splash_args = { 'wait': 1 }
return SplashRequest(
def parse_link(self, response):
result = {
'response': response.request.url,
'splash_url': response.real_url
But all of these return:
{"requested_url": "",
"real_url": "",
"response": "",
"splash_url": ""}
Is there any way to overcome this?


Not getting any data scraped when running the following code using Scrapy on Python

This is the spider I am using to scrape email addresses and names of restaurants from tripadvisor
import scrapy
class RestaurantSpider(scrapy.Spider):
name = 'tripadvisorbot'
start_urls = [
def parse(self, response):
for listing in response.xpath('//div[contains(#class,"__cellContainer--")]'):
link = listing.xpath('.//a[contains(#class,"__restaurantName--")]/#href').get()
text = listing.xpath('.//a[contains(#class,"__restaurantName--")]/text()').get()
complete_url = response.urljoin(link)
yield scrapy.Request(
meta={'link': complete_url,'text': text}
next_url = response.xpath('//*[contains(#class,"pagination")]/*[contains(#class,"next")]/#href').get()
if next_url:
yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
def parse_listing(self, response):
link = response.meta['link']
text = response.meta['text']
email = response.xpath('//a[contains(#href, "mailto:")]/#href').get()
yield {'Link': link,'Text': text,'Email': email}
I run the following command line in the Anaconda prompt to run the above Spider and save it as a json file
scrapy crawl tripadvisorbot -O tripadvisor.json
No data gets scraped, a json file is created but it's empty.
I am not sure what the problem is; I am quite new to web scraping and to Python coding in general. All help would be much appreciated.
On my computer, there are no class names containing __cellContainer-- or __restaurantName-- in the HTML.
Page uses random chars as class names.
But every item is a div directly inside <div data-test-target="restaurants-list">, and I use this to get all items.
Then I take the first <a> (which contains an image instead of the name), skip text and complete_url, and directly run response.follow(link).
When I get the details page, I use response.url to get complete_url and the h1 to get text.
You can put all code in one file and run python without creating project.
import scrapy
class RestaurantSpider(scrapy.Spider):
name = 'tripadvisorbot'
start_urls = [
def parse(self, response):
for listing in response.xpath('//div[#data-test-target="restaurants-list"]/div'):
url = listing.xpath('.//a/#href').get()
print('link:', url)
if url:
yield response.follow(url, callback=self.parse_listing)
next_url = response.xpath('//*[contains(#class,"pagination")]/*[contains(#class,"next")]/#href').get()
if next_url:
yield response.follow(next_url)
def parse_listing(self, response):
print('url:', response.url)
link = response.url
text = response.xpath('//h1[#data-test-target]/text()').get()
email = response.xpath('//a[contains(#href, "mailto:")]/#href').get()
yield {'Link': link, 'Text': text, 'Email': email}
# --- run without project and save data in `output.json` ---
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
'FEEDS': {'output.json': {'format': 'json'}}, # new in 2.1
Part of result:
{"Link": "", "Text": "Bab mansour", "Email": null},
{"Link": "", "Text": "Milos", "Email": null},
{"Link": "", "Text": "Nefeli deli", "Email": ""},
{"Link": "", "Text": "Waterkant", "Email": ""},
{"Link": "", "Text": "Salero Minang", "Email": null},
{"Link": "", "Text": "Du Passage", "Email": ""},
{"Link": "", "Text": "Lee's Garden", "Email": null},
{"Link": "", "Text": "Warunee", "Email": ""},
{"Link": "", "Text": "Sallo's", "Email": ""},
{"Link": "", "Text": "Saravanaa Bhavan Den Haag", "Email": ""},

Scrapy - Splash fetch dynamic data

I am trying to fetch dynamic phone number from this page (among others):
The phone number appears after clicking the div element with the classes page-action click-tel. I am trying to get this data with scrapy_splash, using a Lua script to perform the click.
After pulling splash on my ubuntu:
sudo docker run -d -p 8050:8050 scrapinghub/splash
Here is my code so far (I am using a proxy service) :
class company(scrapy.Spider):
name = "company"
custom_settings = {
"FEEDS" : {
'/home/ubuntu/scraping/europages/data/company.json': {
'format': 'jsonlines',
'encoding': 'utf8'
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
"SPLASH_URL" : '',
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
"DUPEFILTER_CLASS" : 'scrapy_splash.SplashAwareDupeFilter',
"HTTPCACHE_STORAGE" : 'scrapy_splash.SplashAwareFSCacheStorage'
allowed_domains = ['']
def __init__(self, company_url):
self.company_url = "" ##forced
self.item = company_item()
self.script = """
function main(splash)
splash.private_mode_enabled = false
local element = splash:select('')
local bounds = element:bounds()
element:mouse_click{x=bounds.width/2, y=bounds.height/2}
return splash:html()
def start_requests(self):
yield scrapy.Request(
url = self.company_url,
callback = self.parse,
dont_filter = True,
meta = {
'splash': {
'endpoint': 'execute',
'url': self.company_url,
'args': {
'lua_source': self.script,
'proxy': 'http://usernamepassword#proxyhost:port',
def parse(self, response):
soup = BeautifulSoup(response.body, "lxml")
print(soup.find('div',{'class','page-action click-tel'}))
The problem is that it has no effect: I still get nothing, as if the button had never been clicked.
Shouldn't the return splash:html() return the results of element:mouse_click{x=bounds.width/2, y=bounds.height/2} (as element:mouse_click() waits for the changes to appear) in response.body ?
Am I missing something here ?
Most of the time, when sites load data dynamically, they do so via background XHR requests to the server. A close examination of the network tab when you click the 'telephone' button shows that the browser sends an XHR request to a separate URL. You can emulate the same request in your spider and avoid using Scrapy-Splash altogether. See the sample implementation below, which uses one URL:
import scrapy
from urllib.parse import urlparse
class Company(scrapy.Spider):
name = 'company'
allowed_domains = ['']
start_urls = ['']
def parse(self, response):
# obtain the id and uuid to make xhr request
uuid = urlparse(response.url).path.split('/')[-1].rstrip('.html')
id = response.xpath("//div[#itemprop='telephone']/a/#onclick").re_first(r"event,'(\d+)',")
yield scrapy.Request(f"{uuid}&id={id}", callback=self.parse_address)
def parse_address(self, response):
yield response.json()
I get the response
{'digits': '+49 220 69 53 30'}

Scraping Infinite scroll page

I am trying to scrape an infinite-scroll page that loads its content through AJAX requests, but I am not able to go to the next page and get the yielded items. I'm able to get response.text. I tried debugging but was not able to find a solution. Can anyone help me out with this?
import scrapy
class InfiniteScrollingSpider(scrapy.Spider):
name = 'wegotthiscovered_review'
scrolling_url = ''
def start_requests(self):
yield scrapy.FormRequest(
'action': "face3_infinite_scroll",
'page': '1',
'attrs': "{\"id\":\"1\",\"order\":\"\",\"orderby\":\"\",\"catnames\":\"movies+reviews\",\"postnotin\":\"905069,904520,904521,903475,901576,900303,893944,895136,891795,886876,884402,881283\",\"timestampbefore\":1591800990}"
meta={'page': 1},
def parse_page(self, response):
next_page = response.meta.get('page') + 1
print('next_page:', next_page)
json_data = json.loads(response.text)
print('success:', json_data.get('success'))
print('data:', json_data.get('data'))
if not json_data.get('success') or not json_data.get('data') or not json_data['data'].get('content'):
articles = scrapy.Selector(text=json_data['data']['content']).css('article')
for article in articles:
yield {
'page_title': article.css('h4 ::text').extract_first().strip(),
'review_link': article.css('h4 ::attr(href)').extract_first().strip(),
print('next page >>>')
yield scrapy.FormRequest(
'action': "face3_infinite_scroll",
'page': str(next_page),
meta={'page': next_page},

How do I scrape this kind of dynamically generated website data?

I'm trying to scrape E-commerce website,
example link:
The data is rendered via React, and when I scrape a few links most of the data is returned as null. When I view the page source I cannot find the actual HTML that is visible via inspect element, just JSON inside JavaScript tags. I ran the Scrapy scraper several times on the same links, and data that was not found before sometimes does return content, so it seems to happen randomly. I cannot figure out how I should scrape this kind of website.
I'm also using a pool of user agents and pauses between requests.
script = '''
function main(splash, args)
return splash:html()
def start_requests(self):
url= [
for link in url:
yield SplashRequest(url=link, callback=self.parse, endpoint='render.html', args={'wait' : 0.5, 'lua_source' : self.script}, dont_filter=True)
def parse(self, response):
yield {
'title' : response.xpath("//span[#class='pdp-mod-product-badge-title']/text()").extract_first(),
'price' : response.xpath("//span[contains(#class, 'pdp-price')]/text()").extract_first(),
'description' : response.xpath("//div[#id='module_product_detail']").extract_first()
I tried this:
Pass 'execute' as the endpoint argument of the Splash request instead of 'render.html'.
from scrapy_splash import SplashRequest
class DynamicSpider(scrapy.Spider):
name = 'products'
url = [
script = """
function main(splash, args)
return {
html = splash:html()
def start_requests(self):
for link in self.url:
yield SplashRequest(
args={'wait': 0.5, 'lua_source': self.script},
def parse(self, response):
yield {
'title': response.xpath("//span[#class='pdp-mod-product-badge-title']/text()").extract_first(),
'price': response.xpath("//span[contains(#class, 'pdp-price')]/text()").extract_first(),
'description': response.xpath("//div[#id='module_product_detail']/h2/text()").extract_first()
And this is the result

Scrapy multilayer crawl: append data to a list and yield the item

The website structure I'm trying to parse using Scrapy is the following:
I'd like the extracted data to have this format:
I tried this:
from item import Item
class Spider(scrapy.scrapy):
name = spider
def start_request(self):
url = "the main page's url"
yield scrapy.Request(url=url, callback=self.parseProjectList)
def parseProjectList(self, response):
for url in Selector(Project_list)
yield scrapy.Request(url=url, callback=self.parseProject)
def parseProject(self, response):
#scrap some data
myItem = Item()
yield scrapy.Request(url=SampleListPage, callback=self.parseSampleListPage,meta={'myItem':myItem})
def parseSampleListPage(self, response):
for url in Selector(Sample_list)
yield scrapy.Request(url=url, callback=self.parseSample,meta={'myItem':'myItem'})
def parseSample(self, response):
#parse some sample data
I tried to put yield response.meta['myItem'] at parseSampleListPage
def parseSampleListPage(self, response):
for url in Selector(Sample_list)
yield scrapy.Request(url=url, callback=self.parseSample,meta={'myItem':'myItem'})
yield response.meta['myItem']
and also yield response.meta['myItem'] in parseSample
def parseSample(self, response):
#parse some sample data
yield response.meta['myItem']
Both solutions failed.
The first one yields empty "samples" fields. The second one creates multiple data with same project like this:
[ {
"project": {
"projectname": "project2"
"samples": [
] }, {
"project": {
"projectname": "project2"
"samples": [
] }, {
"project": {
"projectname": "project2"
"samples": [
] } ]
I wonder whether there is any way to deal with this problem?
