Scrapy Splash Dynamic scraping with CrawlSpider - python

I am trying to get some data from a React-based website, but when I use CrawlSpider I can't parse the other pages.
For example, my first URL is parsed with Splash, but the other URLs are parsed normally, without the dynamic content.
This is my code:
class PageSpider(CrawlSpider):
    host = 'hooshmandsazeh.com'
    protocol = 'https'
    root_domain = 'hooshmandsazeh.com'
    name = 'page'
    allowed_domains = [host]
    #start_urls = [f'{protocol}://{host}',]

    def start_requests(self):
        url = f'{self.protocol}://{self.host}'
        yield SplashRequest(url, dont_process_response=True, args={'wait': 1}, meta={'real_url': url})

    custom_settings = {
        #'DEPTH_LIMIT': 9,
    }

    rules = (
        # Rule(LinkExtractor(allow=('node_\d+\.htm',)), follow=True),
        Rule(LinkExtractor(allow=(host), deny=('\.webp', '\.js', '\.css', '\.jpg', '\.png'), unique=True),
             callback='parse',
             follow=True,
             process_request='splash_request'
             ),
    )

    def splash_request(self, request):
        request.meta['real_url'] = request.url
        print("Aliii", request.meta['real_url'])
        return request

    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        newresponse = response.replace(url=response.meta.get('real_url'))
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(newresponse)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def parse(self, response):
        if len(LinkExtractor(deny=self.host).extract_links(response)) > 0:
            loader = ItemLoader(item=PageLevelItem(), response=response)
            loader.add_value('page_source_url', response.url)
            yield loader.load_item()

The code below worked for me:

def splash_request(self, request):
    # request = request.replace(url=RENDER_HTML_URL + request.url)
    request.meta['real_url'] = request.url
    return SplashRequest(request.meta['real_url'], dont_process_response=True, args={'wait': 0}, meta={'real_url': request.meta['real_url']})
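For what it's worth, on Scrapy 2.0+ Rule.process_request is called with both the request and the response, so the same idea would look roughly like this (a sketch, not tested against this site):

from scrapy_splash import SplashRequest

# Sketch assuming Scrapy >= 2.0, where Rule.process_request receives
# (request, response). Reusing the original callback and meta keeps the
# Rule's behaviour while routing every followed link through Splash.
def splash_request(self, request, response):
    return SplashRequest(request.url, callback=request.callback,
                         args={'wait': 1}, meta=request.meta)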

Related

scrapy-splash CrawlSpider

I tried to scrape the driver names with a scrapy-splash CrawlSpider on this site, but I constantly run into errors. After searching for ways to solve the problem, I came across GitHub and just copied the latest code.
start_urls = ['http://www.huananzhi.com/html/1/184/185/index.html']

def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url, callback=self.parse_item, args={'wait': 0.5}, meta={'real_url': url})

def _requests_to_follow(self, response):
    if not isinstance(
            response,
            (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
        return
    seen = set()
    for n, rule in enumerate(self._rules):
        links = [lnk for lnk in rule.link_extractor.extract_links(response)
                 if lnk not in seen]
        if links and rule.process_links:
            links = rule.process_links(links)
        for link in links:
            seen.add(link)
            r = self._build_request(n, link)
            yield rule.process_request(r, response)

def use_splash(self, request, response):
    request.meta.update(splash={
        'args': {
            'wait': 15,
        },
        'endpoint': 'render.html',
    })
    return request

linkRule = LinkExtractor(restrict_xpaths='//article/div[1]/div[1]/div[2]/a[1]')
itemRule = Rule(linkRule, callback='parse_item', follow=True, process_request='use_splash')
rules = (
    itemRule,
)

def parse_item(self, response):
    item = HuananzhiItem()
    item['name'] = response.xpath("//div[@class='tab-content']//div[1]/h2/text()").get()
    yield item
It didn't work, so I tried using scrapy.Spider:
def start_requests(self):
    url = 'http://www.huananzhi.com/html/1/184/185/index.html'
    yield SplashRequest(url=url, callback=self.parse)

def parse(self, response):
    links = response.xpath('//article/div[1]/div[1]/div[2]/a[1]/@href')
    for link in links:
        yield SplashRequest(url=link, callback=self.parse_item)
    next_page = response.xpath('//section//li[4]//a[1]')
    yield from response.follow(next_page, self.parse)

def parse_item(self, response):
    item = HuananzhiItem()
    item['name'] = response.xpath("//div[@class='tab-content']//div[1]/h2/text()").get()
    yield item
I also use scrapy-user-agent.
Can anyone tell me how to get the item? Sorry for such a stupid question, I'm a beginner.
Thanks
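One hedged sketch, borrowing the fix from the first question above (an assumption, not verified against this site): have use_splash return a SplashRequest instead of only updating request.meta, so every link the rule follows is actually rendered by Splash:

from scrapy_splash import SplashRequest

def use_splash(self, request, response):
    # Wrap the followed link in a SplashRequest so parse_item receives
    # the JavaScript-rendered HTML (the wait value is a guess).
    return SplashRequest(request.url, callback=self.parse_item,
                         endpoint='render.html', args={'wait': 5},
                         meta=request.meta)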

Scrapy Request are not crawling all urls return

I recently had this project of crawling Google Play Store apps for the Vietnam region, and realized that the request doesn't run the callback function for all URLs that have been returned.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http.request import Request
from urllib.parse import urlparse, parse_qsl, urlencode
import scrapy


class GooglePlayStoreSpider(CrawlSpider):
    name = 'google_play'
    allowed_domains = ['play.google.com']
    start_urls = ['http://play.google.com']
    rules = (
        Rule(LinkExtractor(allow=('https://play.google.com/store/apps/details')), follow=True,
             process_links='process_links',
             callback='parse_1'),
    )
    crawled_ids = []
    first_init = False

    def parse_start_url(self, response):
        # print("-------------- PRINTING SECTION START_URL --------------")
        if not self.first_init:
            self.first_init = True
            extractor = LinkExtractor(allow=('/store/apps/category/.*',))
            raw_links = extractor.extract_links(response)
            links = self.process_links(raw_links)
            return [
                scrapy.Request('{}'.format(link.url))
                for link in links
            ]
        else:
            # print("============ START_URL ELSE PART ============")
            pass

    def process_links(self, links):
        new_links = []
        for link in links:
            old_url = link.url
            if not old_url.startswith('https://play.google.com/store/apps/'):
                continue
            old_url_obj = urlparse(old_url)
            old_url_query = dict(parse_qsl(old_url_obj.query))
            if old_url_obj.path == '/store/apps/details':
                if old_url_query['id'] in self.crawled_ids:
                    continue
                else:
                    self.crawled_ids.append(old_url_query['id'])
            old_url_query['hl'] = 'en'
            old_url_query['gl'] = 'vn'
            link.url = '{}://{}{}?{}'.format(old_url_obj.scheme, old_url_obj.netloc, old_url_obj.path,
                                             urlencode(old_url_query))
            new_links.append(link)
        # print("LINKKSSS ====", links)
        # print("NEW_LINKKSSS ====", new_links)
        # print("-------------- PRINTING SECTION PROCESS_LINKS --------------")
        return new_links

    def parse_1(self, response):
        selector = scrapy.Selector(response)
        urls = selector.xpath('//a[@class="LkLjZd ScJHi U8Ww7d xjAeve nMZKrb id-track-click "]/@href').extract()
        links = []
        for url in urls:
            if not url.startswith('https://play.google.com/'):
                url = "https://play.google.com" + url
            links.append(url)
        link_flag = 0
        for url in urls:
            # yield links_list.append(scrapy.Request(url, callback=self.parse_next, dont_filter=True))
            yield Request(links[link_flag], callback=self.parse_next, dont_filter=True)
            link_flag += 1

    def parse_next(self, response):
        # print("PARSE_NEXT ===========", response.request.url)
        selector = scrapy.Selector(response)
        app_urls = selector.xpath('//div[@class="details"]/a[@class="title"]/@href').extract()
        urls = []
        for url in app_urls:
            url = "https://play.google.com" + url + '&hl=en&gl=vn'
            urls.append(url)
        url_list = []
        link_flag = 0
        for url in app_urls:
            yield Request(urls[link_flag], callback=self.parse_detail, dont_filter=True)
            link_flag += 1
        # return url_list

    def parse_detail(self, response):
        print("Parsed ======= ", response.request.url)
        item = dict()
        item['name'] = response.xpath('//div[@itemscope]//meta[@itemprop="name"]/@content').extract_first()
        item['category'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="applicationCategory"]/@content').extract_first()
        item['review_score'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="ratingValue"]/@content').extract_first()
        item['review_count'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="reviewCount"]/@content').extract_first()
        item['link'] = response.request.url
        item['id'] = dict(parse_qsl(urlparse(response.request.url).query))['id']
        item['content_rating'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="contentRating"]/@content').extract_first()
        item['image'] = response.xpath('//div[@itemscope]//meta[@itemprop="image"]/@content').extract_first()
        item['price'] = response.xpath('//div[@itemscope]//meta[@itemprop="price"]/@content').extract_first()
        item['price_currency'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="priceCurrency"]/@content').extract_first()
        # item['operating_system'] = response.xpath('//div[@itemscope]//meta[@itemprop="operatingSystem"]/@content').extract_first()
        return item
When I run it in the terminal, it says that it crawled 100 pages and scraped only 15 pages (numbers are estimates).
Please help
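A first debugging step (an assumption: the missing pages are being dropped by the duplicate filter or the offsite middleware rather than failing) is to turn on the relevant logging and watch which requests disappear:

# Hypothetical settings to add to GooglePlayStoreSpider; not part of the original code.
custom_settings = {
    'DUPEFILTER_DEBUG': True,  # log every request dropped as a duplicate
    'LOG_LEVEL': 'DEBUG',      # offsite-filtered requests are also logged at DEBUG
}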

Scrapy: How to pass an item between methods using meta

I'm new to Scrapy and Python and I'm trying to pass the item field item['author'] from parse_quotes to the next parse method, parse_bio.
I tried the request.meta and response.meta approach as shown in the Scrapy documentation, but without success. See the code below.
import scrapy
from tutorial.items import QuotesItem


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/login',
        #'http://quotes.toscrape.com/page/2',
    ]

    # Scraping a site with login
    # Important: Cookie settings must be "True" to keep the login session alive
    custom_settings = {'COOKIES_ENABLED': True}

    def parse(self, response):
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'john', 'password': 'secret'},
            callback=self.parse_quotes
        )

    def parse_quotes(self, response):
        for sel in response.css('div.quote'):
            item = QuotesItem()
            item['text'] = sel.css('span.text::text').get()
            item['author'] = sel.css('small.author::text').get()
            item['tags'] = sel.css('div.tags a.tag::text').getall()
            item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
            item['author_bio_link'] = sel.css('.author + a')
            yield item

            # follow the detail links # shortcut
            # vertical crawling
            for a in item['author_bio_link']:
                yield response.follow(a, callback=self.parse_bio)

    def parse_bio(self, response):
        item = QuotesItem()
        item['author_born'] = response.css('p span::text').getall()
        item['author_born'] = item['author_born'][:2]
        item['author_bio'] = response.css('div.author-description ::text').get().strip()
        yield item

        # follow pagination links # shortcut
        # horizontal crawling
        for a in response.css('li.next a'):
            yield response.follow(a, callback=self.parse_quotes)
I expect item['author'] from parse_quotes to be passed to parse_bio.
I suggest you use meta in this way:
def parse_quotes(self, response):
    for sel in response.css('div.quote'):
        item = QuotesItem()
        item['text'] = sel.css('span.text::text').get()
        item['author'] = sel.css('small.author::text').get()
        item['tags'] = sel.css('div.tags a.tag::text').getall()
        item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
        item['author_bio_link'] = sel.css('.author + a')
        yield item

        # follow the detail links # shortcut
        # vertical crawling
        for a in item['author_bio_link']:
            yield response.follow(a, self.parse_bio,
                                  meta={'author': item['author']})  # <- you set it here

def parse_bio(self, response):
    item = QuotesItem()
    item['author_born'] = response.css('p span::text').getall()
    item['author_born'] = item['author_born'][:2]
    item['author_data'] = response.meta.get('author')  # <- you get it here
    item['author_bio'] = response.css('div.author-description ::text').get().strip()
    yield item

    # follow pagination links # shortcut
    # horizontal crawling
    for a in response.css('li.next a'):
        yield response.follow(a, callback=self.parse_quotes)
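As a side note, on Scrapy 1.7+ the same hand-off can be done with cb_kwargs, which delivers the value straight into the callback's signature instead of going through response.meta. A minimal sketch based on the code above:

def parse_quotes(self, response):
    for sel in response.css('div.quote'):
        item = QuotesItem()
        item['author'] = sel.css('small.author::text').get()
        for a in sel.css('.author + a'):
            yield response.follow(a, self.parse_bio,
                                  cb_kwargs={'author': item['author']})  # <- set here

def parse_bio(self, response, author):  # <- received here as a named argument
    item = QuotesItem()
    item['author_data'] = author
    item['author_bio'] = response.css('div.author-description ::text').get().strip()
    yield item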

How to determine if a link is nofollow or dofollow in Scrapy?

So, that was the question. I have a Scrapy bot that follows the internal links of a given site and writes the links, status codes and anchor texts into a database. But I'm struggling to grab the link's follow status. Is there any way to grab that rel=nofollow/dofollow information? Here's my code, if anybody wonders:
class MySpider(CrawlSpider):
    name = 'spydiiiii'
    start_urls = [urlToScrape]
    rules = (
        Rule(
            LxmlLinkExtractor(
                allow=(urlToScrape),
                deny=(
                    "google.com",
                    "facebook.com",
                    "pinterest.com",
                    "facebook.com",
                    "digg.com",
                    "twitter.com",
                    "stumbleupon.com",
                    "linkedin.com"
                ),
                unique=True
            ),
            callback="parse_items",
            follow=True,
        ),
    )

    def parse_items(self, response):
        sel = Selector(response)
        items = []
        item = InternallinkItem()
        referring_url = response.request.headers.get('Referer').decode('utf-8')
        item["referring_url"] = referring_url
        anchor = response.meta.get('link_text')
        item["anchor_text"] = " ".join(anchor.split())
        item["current_url"] = response.url
        item['status'] = response.status
        items.append(item)
        return items
Thanks in advance
I use LxmlLinkExtractor manually to get Link objects, which carry the nofollow information.
In parse() I get the links from the first page, create an item with the 'nofollow' (and other) information, and use a Request with this URL (and with the item in meta) to get the status and referer.
The new Request uses parse_item() to get the item from meta and add the status.
parse_item() also uses the extractor to get the new links on this page, create a new item, and execute Requests with parse_item() again.
import scrapy
from scrapy.http import Request
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor


class MySpider(scrapy.Spider):
    name = 'myspider'
    #allowed_domains = ['http://quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com']
    #start_urls = ['http://127.0.0.1:5000/'] # for Flask example

    extractor = LxmlLinkExtractor(
        allow=('http://quotes.toscrape.com'),
        #allow=('http://127.0.0.1:5000'), # for Flask example
        deny=(
            'google.com',
            'facebook.com',
            'pinterest.com',
            'facebook.com',
            'digg.com',
            'twitter.com',
            'stumbleupon.com',
            'linkedin.com'
        ),
        unique=True,
    )

    def parse(self, response):
        print('parse url:', response.url)

        # use LxmlLinkExtractor manually
        for link in self.extractor.extract_links(response):
            #print('link:', link)
            item = {}
            item['nofollow'] = link.nofollow
            item['anchor_text'] = link.text
            item['current_url'] = link.url
            #item['referring_url'] = response.url
            yield Request(link.url, meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        print('parse_item url:', response.url)

        item = response.meta['item']
        item['referring_url'] = response.request.headers.get('Referer')
        #item['referring_url'] = response.request.url
        item['status'] = response.status
        yield item

        # use LxmlLinkExtractor manually with new links
        for link in self.extractor.extract_links(response):
            #print('link:', link)
            item = {}
            item['nofollow'] = link.nofollow
            item['anchor_text'] = link.text
            item['current_url'] = link.url
            #item['referring_url'] = response.url
            yield Request(link.url, meta={'item': item}, callback=self.parse_item)


# --- run spider without project ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'output.csv',
})
c.crawl(MySpider)
c.start()
EDIT:
Because I don't know any page with rel="nofollow", I created a simple Flask app to test the code.
from flask import Flask

app = Flask(__name__)

@app.route('/')
def index():
    # one link with rel="nofollow" and one without
    return '<a href="/test1" rel="nofollow">Test 1</a> | <a href="/test2">Test 2</a>'

@app.route('/test1')
def test1():
    return 'Main Page'

@app.route('/test2')
def test2():
    return 'Main Page'

if __name__ == '__main__':
    app.run(debug=True)

Scrapy + Splash + ScrapyJS

I am using Splash 2.0.2 + Scrapy 1.0.5 + scrapyjs 0.1.1 and I'm still not able to render JavaScript that requires a click. Here is an example URL: https://olx.pt/anuncio/loja-nova-com-250m2-garagem-em-box-fechada-para-arrumos-IDyTzAT.html#c49d3d94cf
I am still getting the page without the phone number rendered:
class OlxSpider(scrapy.Spider):
    name = "olx"
    rotate_user_agent = True
    allowed_domains = ["olx.pt"]
    start_urls = [
        "https://olx.pt/imoveis/"
    ]

    def parse(self, response):
        script = """
        function main(splash)
            splash:go(splash.args.url)
            splash:runjs('document.getElementById("contact_methods").getElementsByTagName("span")[1].click();')
            splash:wait(0.5)
            return splash:html()
        end
        """
        for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_house_contents, meta={
                'splash': {
                    'args': {'lua_source': script},
                    'endpoint': 'execute',
                }
            })
        for next_page in response.css('.pager .br3.brc8::attr(href)'):
            url = response.urljoin(next_page.extract())
            yield scrapy.Request(url, self.parse)

    def parse_house_contents(self, response):
        import ipdb; ipdb.set_trace()
How can I get this to work?
Add
splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")
to the Lua script and it will work.

function main(splash)
    splash:go(splash.args.url)
    splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")
    splash:runjs('document.getElementById("contact_methods").getElementsByTagName("span")[1].click();')
    splash:wait(0.5)
    return splash:html()
end

.click() is a jQuery function: https://api.jquery.com/click/
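For reference, scrapy-splash (the successor of scrapyjs) ships a SplashRequest helper that sends the same Lua script through the execute endpoint without building the splash meta dict by hand. A minimal sketch, assuming scrapy-splash is enabled in settings.py and LUA_SCRIPT stands for the Lua function above:

from scrapy_splash import SplashRequest

def parse(self, response):
    for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
        url = response.urljoin(href.extract())
        yield SplashRequest(url, callback=self.parse_house_contents,
                            endpoint='execute', args={'lua_source': LUA_SCRIPT})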
You can avoid having to use Splash in the first place and make the appropriate GET request to get the phone number yourself. Working spider:
import json
import re

import scrapy


class OlxSpider(scrapy.Spider):
    name = "olx"
    rotate_user_agent = True
    allowed_domains = ["olx.pt"]
    start_urls = [
        "https://olx.pt/imoveis/"
    ]

    def parse(self, response):
        for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_house_contents)
        for next_page in response.css('.pager .br3.brc8::attr(href)'):
            url = response.urljoin(next_page.extract())
            yield scrapy.Request(url, self.parse)

    def parse_house_contents(self, response):
        property_id = re.search(r"ID(\w+)\.", response.url).group(1)
        phone_url = "https://olx.pt/ajax/misc/contact/phone/%s/" % property_id
        yield scrapy.Request(phone_url, callback=self.parse_phone)

    def parse_phone(self, response):
        phone_number = json.loads(response.body)["value"]
        print(phone_number)
If there are more things to extract from this "dynamic" website, see if Splash is really enough and, if not, look into browser automation and selenium.
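If you do go the browser-automation route, here is a hedged Selenium sketch that clicks the same element the Lua script targets (the selector and wait time are assumptions and may have changed on the live site):

import time

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://olx.pt/anuncio/loja-nova-com-250m2-garagem-em-box-fechada-para-arrumos-IDyTzAT.html#c49d3d94cf")
# Click the second <span> inside #contact_methods, as the Lua script does.
spans = driver.find_elements(By.CSS_SELECTOR, "#contact_methods span")
spans[1].click()
time.sleep(0.5)  # give the phone number time to render
print(driver.page_source)
driver.quit()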
