CrawlSpider with Splash, only first link is crawled & processed - python

I am using Scrapy with Splash. Here is what I have in my spider:
import scrapy
from scrapy.http import HtmlResponse  # needed by the _requests_to_follow override below
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest, SplashJsonResponse, SplashTextResponse
import logging

class MainSpider(CrawlSpider):
    name = 'main'
    allowed_domains = ['www.somesite.com']

    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
        headers = {
            ['User-Agent'] = my_user_agent,
            ['Accept-Language'] = 'en-GB,en-US;q=0.9,en;q=0.8',
            ['Referer'] = 'https://www.google.com'
        }
        splash:set_custom_headers(headers)
        url = args.url
        assert(splash:go(url))
        assert(splash:wait(2))
        -- username input
        username_input = assert(splash:select('#username'))
        username_input:focus()
        username_input:send_text('myusername')
        assert(splash:wait(0.3))
        -- password input
        password_input = assert(splash:select('#password'))
        password_input:focus()
        password_input:send_text('mysecurepass')
        assert(splash:wait(0.3))
        -- the login button
        login_btn = assert(splash:select('#login_btn'))
        login_btn:mouse_click()
        assert(splash:wait(4))
        return splash:html()
    end
    '''

    rules = (
        Rule(LinkExtractor(restrict_xpaths="(//div[@id='sidebar']/ul/li)[7]/a"),
             callback='parse_item', follow=True, process_request='use_splash'),
    )

    def start_requests(self):
        yield SplashRequest(url='https://www.somesite.com/login', callback=self.post_login,
                            endpoint='execute', args={'lua_source': self.script})

    def use_splash(self, request):
        request.meta.update(splash={
            'args': {
                'wait': 1,
            },
            'endpoint': 'render.html',
        })
        return request

    def _requests_to_follow(self, response):
        if not isinstance(response, (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response) if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def post_login(self, response):
        logging.info('hey from login!')
        with open('post_login_response.txt', 'w') as f:
            f.write(response.text)

    def parse_item(self, response):
        logging.info('hey from parse_item!')
        with open('post_search_response.txt', 'w') as f:
            f.write(response.text)
I came across this and I've tried to implement things the same way, but still, parse_item is never run. In the logs, I never see hey from parse_item!
I'm not sure what I'm missing. The full log output can be found here.

I ditched the CrawlSpider and converted to a regular Spider, and things are working fine now.
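For reference, a minimal sketch of what that regular-Spider approach might look like (the sidebar XPath is the one from the rule above; the script placeholder stands for the same Lua login source, and the wait value is an assumption):
import scrapy
from scrapy_splash import SplashRequest

class MainSpider(scrapy.Spider):
    name = 'main'
    allowed_domains = ['www.somesite.com']
    script = '...'  # the same Lua login script from the CrawlSpider version

    def start_requests(self):
        # log in through Splash, exactly as before
        yield SplashRequest(url='https://www.somesite.com/login', callback=self.post_login,
                            endpoint='execute', args={'lua_source': self.script})

    def post_login(self, response):
        # follow the sidebar link by hand instead of relying on a CrawlSpider Rule
        link = response.urljoin(response.xpath("(//div[@id='sidebar']/ul/li)[7]/a/@href").get())
        yield SplashRequest(url=link, callback=self.parse_item, args={'wait': 1})

    def parse_item(self, response):
        self.logger.info('hey from parse_item!')
The difference is simply that the link extraction and the SplashRequest are done by hand, so none of the CrawlSpider rule machinery is involved.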

Related

Create Xpath using scrapy

import scrapy
from scrapy.http import Request
from scrapy.crawler import CrawlerProcess

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//td[@class='icon_link']//a//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        wev = {}
        d1 = response.xpath("//*[@class='line_list_K']//div//span")
        for i in range(len(d1)):
            if 'Status:' in d1[i].get():
                d2 = response.xpath("//div[" + str(i + 1) + "]//text()").get()
                print(d2)
I want to get the status value, but it gives me empty output. This is the page link: https://rejestradwokatow.pl/adwokat/abramska-danuta-51494
Why not select your element more specifically by its text and get the text from its following sibling:
//span[text()[contains(.,'Status')]]/following-sibling::div/text()
Example: http://xpather.com/ZUWI58a4
To get the email:
//span[text()[contains(.,'Email')]]/following-sibling::div/(concat(@data-ea,'@',@data-eb))
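Note that the parenthesised concat() form is XPath 2.0-style, so it may not work in Scrapy's XPath 1.0 engine (lxml). A minimal sketch that does the join in Python instead, assuming the data-ea/data-eb attributes hold the two halves of the address as in the expression above:
def parse_book(self, response):
    status = response.xpath(
        "//span[text()[contains(.,'Status')]]/following-sibling::div/text()"
    ).get(default='').strip()
    # the email is split across data-ea / data-eb attributes, so join it in Python
    email_div = response.xpath("//span[text()[contains(.,'Email')]]/following-sibling::div")
    email = '@'.join(filter(None, [email_div.xpath('@data-ea').get(),
                                   email_div.xpath('@data-eb').get()]))
    yield {'status': status, 'email': email}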
Your d2 xpath isn't targeting the correct div.
This should work:
def parse_book(self, response):
    wev = {}  # <- this is never used
    for child in response.xpath('//div[@class="line_list_K"]/*'):
        if 'Status:' in (child.xpath(".//span/text()").get() or ''):
            d2 = child.xpath(".//div/text()").get()
            print(d2)
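If the goal is to collect every label/value pair rather than print a single one, the same loop could fill the otherwise unused wev dict and yield it (a sketch, assuming the same row structure the loop above relies on):
def parse_book(self, response):
    wev = {}
    for child in response.xpath('//div[@class="line_list_K"]/*'):
        label = (child.xpath(".//span/text()").get() or '').strip().rstrip(':')
        value = (child.xpath(".//div/text()").get() or '').strip()
        if label:
            wev[label] = value
    yield wev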

Session handling in scrapy-splash with custom header

I am using Scrapy with Splash via Scrapy-Splash.
I am having issues persisting my logged-in status after the initial request.
Here's my whole spider class:
import scrapy
from scrapy_splash import SplashRequest
import logging

class MasterSpider(scrapy.Spider):
    name = 'master'
    allowed_domains = ['www.somesite.com']
    start_url = 'https://www.somesite.com/login'

    login_script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0'
        headers = {
            ['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            ['User-Agent'] = my_user_agent,
            ['Accept-Language'] = 'en-US;q=0.9,en;q=0.8',
        }
        splash:set_custom_headers(headers)
        url = args.url
        assert(splash:go(url))
        assert(splash:wait(2))
        -- username input
        username_input = assert(splash:select('#username'))
        username_input:focus()
        username_input:send_text('myusername')
        assert(splash:wait(0.3))
        -- password input
        password_input = assert(splash:select('#password'))
        password_input:focus()
        password_input:send_text('mysecurepass')
        assert(splash:wait(0.3))
        -- the login button
        login_btn = assert(splash:select('#login_btn'))
        login_btn:mouse_click()
        assert(splash:wait(4))
        return {
            html = splash:html(),
            cookies = splash:get_cookies(),
        }
    end
    '''

    fruit_selection_script = '''
    function main(splash, args)
        splash:init_cookies(splash.args.cookies)
        splash.private_mode_enabled = false
        my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0'
        headers = {
            ['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            ['User-Agent'] = my_user_agent,
            ['Accept-Language'] = 'en-US;q=0.9,en;q=0.8',
        }
        splash:set_custom_headers(headers)
        url = args.url
        assert(splash:go(url))
        assert(splash:wait(4))
        -- state select input
        state_select = assert(splash:select('select#fruits'))
        state_select:mouse_click()
        state_select:send_keys("<Down>")
        assert(splash:wait(0.2))
        state_select:send_keys("<Enter>")
        assert(splash:wait(0.2))
        -- game select input
        game_select = assert(splash:select('select#type'))
        game_select:mouse_click()
        game_select:send_keys("<Down>")
        assert(splash:wait(0.1))
        game_select:send_keys("<Up>")
        assert(splash:wait(0.1))
        -- the next button
        login_btn = assert(splash:select('input.submit'))
        login_btn:mouse_click()
        assert(splash:wait(4))
        return splash:html()
    end
    '''
    def start_requests(self):
        yield SplashRequest(url=self.start_url, callback=self.post_login,
                            endpoint='execute', args={'lua_source': self.login_script})

    def post_login(self, response):
        search_link = response.urljoin(response.xpath("(//div[@id='sidebar']/ul/li)[7]/a/@href").get())
        logging.info('about to fire up second splash request')
        with open('temp.html', 'w') as f:
            f.write(response.text)
        yield SplashRequest(url=search_link, callback=self.search, endpoint='execute',
                            args={'wait': 3, 'lua_source': self.fruit_selection_script})

    def search(self, response):
        logging.info('hey from search!')
        with open('post_search_response.html', 'w') as f:
            f.write(response.text)

    def post_search(self, response):
        logging.info('hey from post_search!')
        with open('post_search_response.html', 'w') as f:
            f.write(response.text)

    def parse(self, response):
        pass
The scrapy-splash docs say:
SplashRequest sets session_id automatically for /execute endpoint, i.e. cookie handling is enabled by default if you use SplashRequest, /execute endpoint and a compatible Lua rendering script.
If you want to start from the same set of cookies, but then 'fork' sessions set request.meta['splash']['new_session_id'] in addition to session_id. Request cookies will be fetched from cookiejar session_id, but response cookies will be merged back to the new_session_id cookiejar.
As you can see, I am always using the execute endpoint, so I should get cookie handling by default. Yet it isn't working, and I'm not sure why; I wonder if it is because I am setting custom headers for the user agent and language?
Right now, when the spider comes to run the second script (fruit_selection_script), I get a 403 Forbidden error.
What am I missing?
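One thing worth checking is whether the cookies returned by login_script ever reach the second script. A minimal sketch of forwarding them explicitly (this assumes the login response is the JSON result of the returned Lua table, so the cookies sit in response.data['cookies']; it is a workaround sketch, not necessarily the root cause of the 403):
def post_login(self, response):
    # login_script returns {html = ..., cookies = splash:get_cookies()},
    # so the cookie list is available as response.data['cookies']
    cookies = response.data.get('cookies', [])
    search_link = response.urljoin(
        response.xpath("(//div[@id='sidebar']/ul/li)[7]/a/@href").get()
    )
    yield SplashRequest(
        url=search_link,
        callback=self.search,
        endpoint='execute',
        args={
            'lua_source': self.fruit_selection_script,
            # becomes splash.args.cookies inside the script,
            # which splash:init_cookies() then restores
            'cookies': cookies,
        },
    )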

Scrapy .css in page using attribute

I am trying to use scrapy on this page: http://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/
But I can't get the image of the product; it can't find anything. What might I be missing?
I tried by attribute, by ID, by class, and nothing worked.
import scrapy
from scrapy import Request
import random

class BrickSetSpider(scrapy.Spider):
    name = 'spider'
    USER_AGENT_LIST = [
        'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0',
    ]
    start_urls = [
        'https://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/',
    ]
    download_delay = 5
    FEED_EXPORT_ENCODING = 'utf-8'

    def start_requests(self):
        for url in self.start_urls:
            headers = {'User-Agent': random.choice(self.USER_AGENT_LIST)}
            yield Request(url, headers=headers)

    def parse(self, response):
        SET_SELECTOR = '.content-left'
        for brickset in response.css(SET_SELECTOR):
            SEARCH_SELECTOR = response.url
            NAME_SELECTOR = 'span.keyValue span ::text'
            IMAGE_SELECTOR = 'img[itemprop="image"] ::attr(src)'
            yield {
                'search': SEARCH_SELECTOR,
                'name': brickset.css(NAME_SELECTOR).re('[^\t\n]+'),
                'link': brickset.css(IMAGE_SELECTOR).extract(),
            }
If you are using Chrome, you can test this in the console with $$(".images [data-test='zoom-wrap'] img") to get the image.
So, you can use this CSS selector in the Scrapy code. You will have to extract the src attribute.
I hope it helps!
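A minimal sketch of what that might look like in the spider's parse() method (this assumes the element is present in the downloaded HTML, which may not hold if it is injected by JavaScript, as the next answer points out):
def parse(self, response):
    # same selector as the console test above, with ::attr(src) to pull the URL
    image_urls = response.css(".images [data-test='zoom-wrap'] img::attr(src)").getall()
    yield {'image_urls': image_urls}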
The image is generated dynamically by JS. Try the following code.
from simplified_scrapy.spider import Spider, SimplifiedDoc
import re

class MySpider(Spider):
    name = 'rs-online.com'
    # allowed_domains = ['example.com']
    start_urls = [
        'https://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/'
    ]
    # refresh_urls = True  # For debugging. If refresh_urls = True, start_urls will be crawled again.

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        # print (doc.html)
        div = doc.getElementByClass('content-left')
        imgs = re.compile(u'largeImageURL: ".*"').findall(div.script.html)
        imgs = ['https:' + img[len('largeImageURL: "'):-1] for img in imgs]
        lis = doc.getElementByClass('keyDetailsLL').lis
        names = {}
        for li in lis:
            spans = li.spans
            names[spans[0].text] = spans[1].text
        data = [{'imgs': imgs, 'names': names}]
        print(data)
        return {"Urls": [], "Data": data}  # Return data to framework

from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(MySpider())  # Start crawling
Result:
[{'imgs': ['https://media.rs-online.com/t_large/F7858468-01.jpg', 'https://media.rs-online.com/t_large/F7858468-02.jpg'], 'names': {'Codice RS': '785-8468', 'Codice costruttore': 'E2E-S05S12-WC-B1 2M', 'Costruttore': 'Omron'}}]
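For comparison, the same largeImageURL trick can be done with plain Scrapy by reading the inline script text and reusing the regex idea from the answer above (a sketch; the spider name and the item key are placeholders):
import re
import scrapy

class RsImageSpider(scrapy.Spider):
    name = 'rs_images'
    start_urls = ['https://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/']

    def parse(self, response):
        # the gallery URLs live in inline JS as largeImageURL: "..." entries
        scripts = ' '.join(response.css('.content-left script::text').getall())
        image_urls = ['https:' + m for m in re.findall(r'largeImageURL: "(.*?)"', scripts)]
        yield {'image_urls': image_urls}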

Scrapy pagination is not working and optimized spider

Please help me optimize my Scrapy spider. In particular, the next-page pagination is not working. There are a lot of pages, and each page has 50 items.
I scrape the first page's 50 items (links) in parse_items, and the items from the next pages should also be scraped in parse_items.
import scrapy
from scrapy import Field
from fake_useragent import UserAgent

class DiscoItem(scrapy.Item):
    release = Field()
    images = Field()

class discoSpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['discogs.com']
    query = input('ENTER SEARCH MUSIC TYPE : ')
    start_urls = ['http://www.discogs.com/search?q=%s&type=release' % query]
    custome_settings = {
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        'handle_httpstatus_list': [301, 302, ],
        'download_delay': 10}

    def start_requests(self):
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

    def parse(self, response):
        print('START parse \n')
        print("*****", response.url)
        # next page pagination
        next_page = response.css('a.pagination_next::attr(href)').extract_first()
        next_page = response.urljoin(next_page)
        yield scrapy.Request(url=next_page, callback=self.parse_items2)
        headers = {}
        for link in response.css('a.search_result_title ::attr(href)').extract():
            ua = UserAgent()  # random user agent
            headers['User-Agent'] = ua.random
            yield scrapy.Request(response.urljoin(link), headers=headers, callback=self.parse_items)

    def parse_items2(self, response):
        print('parse_items2 *******', response.url)
        yield scrapy.Request(url=response.url, callback=self.parse)

    def parse_items(self, response):
        print("parse_items**********", response.url)
        items = DiscoItem()
        for imge in response.css('div#page_content'):
            img = imge.css("span.thumbnail_center img::attr(src)").extract()[0]
            items['images'] = img
            release = imge.css('div.content a ::text').extract()
            items['release'] = release[4]
            yield items
When I try running your code (after fixing the many indentation, spelling and letter case errors), this line is shown in scrapy's log:
2018-03-05 00:47:28 [scrapy.dupefilters] DEBUG: Filtered duplicate request: <GET https://www.discogs.com/search/?q=rock&type=release&page=2> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
Scrapy will filter duplicate requests by default, and your parse_items2() method does nothing but create duplicate requests. I fail to see any reason for that method existing.
What you should do instead is specify the parse() method as the callback for your requests, and avoid having an extra method that does nothing:
yield scrapy.Request(url=next_page, callback=self.parse)
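Putting it together, the parse() method could handle both the item links and the pagination itself (a sketch using the selectors from the question):
def parse(self, response):
    # follow each search result on the current page
    for link in response.css('a.search_result_title ::attr(href)').extract():
        yield scrapy.Request(response.urljoin(link), callback=self.parse_items)

    # then follow the pagination link back into parse() itself
    next_page = response.css('a.pagination_next::attr(href)').extract_first()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)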
Try this for pagination:
try:
    nextpage = response.urljoin(response.xpath("//*[contains(@rel,'next') and contains(@id,'next')]/@url")[0].extract())
    yield scrapy.Request(nextpage, callback=self.parse)
except:
    pass

How to send cookie with scrapy CrawlSpider requests?

I am trying to create this Reddit scraper using Python's Scrapy framework.
I have used the CrawlSpider to crawl through Reddit and its subreddits. But when I come across pages that have adult content, the site asks for a cookie over18=1.
So I have been trying to send a cookie with every request that the spider makes, but it's not working out.
Here is my spider code. As you can see, I tried to add a cookie to every spider request using the start_requests() method.
Could anyone here tell me how to do this? Or what I have been doing wrong?
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from reddit.items import RedditItem
from scrapy.http import Request, FormRequest

class MySpider(CrawlSpider):
    name = 'redditscraper'
    allowed_domains = ['reddit.com', 'imgur.com']
    start_urls = ['https://www.reddit.com/r/nsfw']

    rules = (
        Rule(LinkExtractor(
            allow=['/r/nsfw/\?count=\d*&after=\w*']),
            callback='parse_item',
            follow=True),
    )

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            print(url)
            yield Request(url, cookies={'over18': '1'}, callback=self.parse_item)

    def parse_item(self, response):
        titleList = response.css('a.title')
        for title in titleList:
            item = RedditItem()
            item['url'] = title.xpath('@href').extract()
            item['title'] = title.xpath('text()').extract()
            yield item
Okay. Try doing something like this.
def start_requests(self):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'}
    for i, url in enumerate(self.start_urls):
        yield Request(url, cookies={'over18': '1'}, callback=self.parse_item, headers=headers)
It's the User-Agent which blocks you.
Edit:
I don't know what's wrong with CrawlSpider, but a plain Spider works anyway.
#!/usr/bin/env python
# encoding: utf-8
import scrapy

class MySpider(scrapy.Spider):
    name = 'redditscraper'
    allowed_domains = ['reddit.com', 'imgur.com']
    start_urls = ['https://www.reddit.com/r/nsfw']

    def request(self, url, callback):
        """
        wrapper for scrapy.Request
        """
        request = scrapy.Request(url=url, callback=callback)
        request.cookies['over18'] = 1
        request.headers['User-Agent'] = (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, '
            'like Gecko) Chrome/45.0.2454.85 Safari/537.36')
        return request

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            yield self.request(url, self.parse_item)

    def parse_item(self, response):
        titleList = response.css('a.title')
        for title in titleList:
            item = {}
            item['url'] = title.xpath('@href').extract()
            item['title'] = title.xpath('text()').extract()
            yield item
        url = response.xpath('//a[@rel="nofollow next"]/@href').extract_first()
        if url:
            yield self.request(url, self.parse_item)
        # you may consider scrapy.pipelines.images.ImagesPipeline :D
From the Scrapy docs:
1. Using a dict:
request_with_cookies = Request(url="http://www.example.com",
                               cookies={'currency': 'USD', 'country': 'UY'})
2. Using a list of dicts:
request_with_cookies = Request(url="http://www.example.com",
                               cookies=[{'name': 'currency',
                                         'value': 'USD',
                                         'domain': 'example.com',
                                         'path': '/currency'}])
You can also send it via a header:
scrapy.Request(url=url, callback=callback, headers={'Cookie':my_cookie})
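Here, my_cookie is just the serialized cookie string; for instance, with the over18 flag from the question (a sketch):
my_cookie = 'over18=1'
yield scrapy.Request(url=url, callback=self.parse_item, headers={'Cookie': my_cookie})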
You could use the process_request parameter in the rule, something like:
rules = (
    Rule(LinkExtractor(
        allow=['/r/nsfw/\?count=\d*&after=\w*']),
        callback='parse_item',
        process_request='ammend_req_header',
        follow=True),
)

def ammend_req_header(self, request):
    request.cookies['over18'] = 1
    return request
I found a solution for CrawlSpider:
def start_requests(self):
    yield Request(url=self.start_urls[0], callback=self._parse, cookies={'beget': 'begetok'})
