I am using Scrapy with Splash. Here is what I have in my spider:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# HtmlResponse and the Splash response classes are needed by the
# _requests_to_follow override further down.
from scrapy.http import HtmlResponse
from scrapy_splash import SplashRequest, SplashJsonResponse, SplashTextResponse
import logging


class MainSpider(CrawlSpider):
    name = 'main'
    allowed_domains = ['www.somesite.com']

    script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
        headers = {
            ['User-Agent'] = my_user_agent,
            ['Accept-Language'] = 'en-GB,en-US;q=0.9,en;q=0.8',
            ['Referer'] = 'https://www.google.com'
        }
        splash:set_custom_headers(headers)
        url = args.url
        assert(splash:go(url))
        assert(splash:wait(2))
        -- username input
        username_input = assert(splash:select('#username'))
        username_input:focus()
        username_input:send_text('myusername')
        assert(splash:wait(0.3))
        -- password input
        password_input = assert(splash:select('#password'))
        password_input:focus()
        password_input:send_text('mysecurepass')
        assert(splash:wait(0.3))
        -- the login button
        login_btn = assert(splash:select('#login_btn'))
        login_btn:mouse_click()
        assert(splash:wait(4))
        return splash:html()
    end
    '''

    rules = (
        Rule(LinkExtractor(restrict_xpaths="(//div[@id='sidebar']/ul/li)[7]/a"),
             callback='parse_item', follow=True, process_request='use_splash'),
    )

    def start_requests(self):
        yield SplashRequest(url='https://www.somesite.com/login', callback=self.post_login,
                            endpoint='execute', args={'lua_source': self.script})

    def use_splash(self, request):
        request.meta.update(splash={
            'args': {
                'wait': 1,
            },
            'endpoint': 'render.html',
        })
        return request

    def _requests_to_follow(self, response):
        if not isinstance(response, (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response) if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def post_login(self, response):
        logging.info('hey from login!')
        with open('post_login_response.txt', 'w') as f:
            f.write(response.text)

    def parse_item(self, response):
        logging.info('hey from parse_item!')
        with open('post_search_response.txt', 'w') as f:
            f.write(response.text)
I came across this and I've tried to implement things the same way, but still, parse_item is never run. In the logs, I never see hey from parse_item!
I'm not sure what I'm missing. The full log output can be found here.
I ditched the CrawlSpider and converted it to a regular Spider, and things are working fine now.
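For reference, a minimal sketch of what the plain-Spider version can look like, assuming the same Lua login script, sidebar XPath and endpoints as in the code above. This is an outline of the approach rather than a verified spider:

import logging

import scrapy
from scrapy_splash import SplashRequest


class MainSpider(scrapy.Spider):
    name = 'main'
    allowed_domains = ['www.somesite.com']

    # Same Lua login script as in the CrawlSpider version above.
    script = '''...'''

    def start_requests(self):
        # Log in through Splash's /execute endpoint.
        yield SplashRequest(
            url='https://www.somesite.com/login',
            callback=self.post_login,
            endpoint='execute',
            args={'lua_source': self.script},
        )

    def post_login(self, response):
        logging.info('hey from login!')
        # Follow the sidebar link that the CrawlSpider rule was meant to extract.
        link = response.xpath("(//div[@id='sidebar']/ul/li)[7]/a/@href").get()
        if link:
            yield SplashRequest(
                url=response.urljoin(link),
                callback=self.parse_item,
                endpoint='render.html',
                args={'wait': 1},
            )

    def parse_item(self, response):
        logging.info('hey from parse_item!')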
import scrapy
from scrapy.http import Request
from scrapy.crawler import CrawlerProcess


class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://rejestradwokatow.pl/adwokat/list/strona/1/sta/2,3,9']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//td[@class='icon_link']//a//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        wev = {}
        d1 = response.xpath("//*[@class='line_list_K']//div//span")
        for i in range(len(d1)):
            if 'Status:' in d1[i].get():
                d2 = response.xpath("//div[" + str(i + 1) + "]//text()").get()
                print(d2)
I expect to get the status value, but it gives me empty output. This is the page link: https://rejestradwokatow.pl/adwokat/abramska-danuta-51494
Why not select your element more specifically by its text and get the text from its following sibling:
//span[text()[contains(.,'Status')]]/following-sibling::div/text()
Example: http://xpather.com/ZUWI58a4
To get the email:
//span[text()[contains(.,'Email')]]/following-sibling::div/(concat(@data-ea,'@',@data-eb))
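In case it helps, this is roughly how those expressions could be used from the spider's parse_book() callback. Scrapy's selectors are XPath 1.0, which does not support the parenthesised concat() step at the end of the email expression, so the two attributes are joined in Python here; the data-ea/data-eb attribute names and the '@' separator are assumptions taken from the expression above:

def parse_book(self, response):
    # Text of the <div> that follows the <span> containing "Status".
    status = response.xpath(
        "//span[text()[contains(.,'Status')]]/following-sibling::div/text()"
    ).get(default='').strip()

    # The email is split across two data attributes on the sibling <div>.
    email = None
    email_div = response.xpath("//span[text()[contains(.,'Email')]]/following-sibling::div")
    if email_div:
        ea = email_div.attrib.get('data-ea', '')
        eb = email_div.attrib.get('data-eb', '')
        if ea and eb:
            email = ea + '@' + eb

    yield {'status': status, 'email': email}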
Your d2 xpath isn't targeting the correct div.
This should work:
def parse_book(self, response):
    wev = {}  # <- this is never used
    for child in response.xpath('//div[@class="line_list_K"]/*'):
        if 'Status:' in child.xpath(".//span/text()").get():
            d2 = child.xpath(".//div/text()").get()
            print(d2)
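One caveat with the loop above: child.xpath(".//span/text()").get() returns None for any child that has no span, and 'Status:' in None raises a TypeError. Defaulting to an empty string keeps the same logic while avoiding that, e.g.:

def parse_book(self, response):
    for child in response.xpath('//div[@class="line_list_K"]/*'):
        # Empty string instead of None when the child has no <span>.
        label = child.xpath(".//span/text()").get(default='')
        if 'Status:' in label:
            print(child.xpath(".//div/text()").get())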
I am using Scrapy with Splash via Scrapy-Splash.
I am having issues persisting my logged-in status after the initial request.
Here's my whole spider class:
import scrapy
from scrapy_splash import SplashRequest
import logging


class MasterSpider(scrapy.Spider):
    name = 'master'
    allowed_domains = ['www.somesite.com']
    start_url = 'https://www.somesite.com/login'

    login_script = '''
    function main(splash, args)
        splash.private_mode_enabled = false
        my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0'
        headers = {
            ['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            ['User-Agent'] = my_user_agent,
            ['Accept-Language'] = 'en-US;q=0.9,en;q=0.8',
        }
        splash:set_custom_headers(headers)
        url = args.url
        assert(splash:go(url))
        assert(splash:wait(2))
        -- username input
        username_input = assert(splash:select('#username'))
        username_input:focus()
        username_input:send_text('myusername')
        assert(splash:wait(0.3))
        -- password input
        password_input = assert(splash:select('#password'))
        password_input:focus()
        password_input:send_text('mysecurepass')
        assert(splash:wait(0.3))
        -- the login button
        login_btn = assert(splash:select('#login_btn'))
        login_btn:mouse_click()
        assert(splash:wait(4))
        return {
            html = splash:html(),
            cookies = splash:get_cookies(),
        }
    end
    '''

    fruit_selection_script = '''
    function main(splash, args)
        splash:init_cookies(splash.args.cookies)
        splash.private_mode_enabled = false
        my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0'
        headers = {
            ['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            ['User-Agent'] = my_user_agent,
            ['Accept-Language'] = 'en-US;q=0.9,en;q=0.8',
        }
        splash:set_custom_headers(headers)
        url = args.url
        assert(splash:go(url))
        assert(splash:wait(4))
        -- state select input
        state_select = assert(splash:select('select#fruits'))
        state_select:mouse_click()
        state_select:send_keys("<Down>")
        assert(splash:wait(0.2))
        state_select:send_keys("<Enter>")
        assert(splash:wait(0.2))
        -- game select input
        game_select = assert(splash:select('select#type'))
        game_select:mouse_click()
        game_select:send_keys("<Down>")
        assert(splash:wait(0.1))
        game_select:send_keys("<Up>")
        assert(splash:wait(0.1))
        -- the next button
        login_btn = assert(splash:select('input.submit'))
        login_btn:mouse_click()
        assert(splash:wait(4))
        return splash:html()
    end
    '''

    def start_requests(self):
        yield SplashRequest(url=self.start_url, callback=self.post_login,
                            endpoint='execute', args={'lua_source': self.login_script})

    def post_login(self, response):
        search_link = response.urljoin(response.xpath("(//div[@id='sidebar']/ul/li)[7]/a/@href").get())
        logging.info('about to fire up second splash request')
        with open('temp.html', 'w') as f:
            f.write(response.text)
        yield SplashRequest(url=search_link, callback=self.search, endpoint='execute',
                            args={'wait': 3, 'lua_source': self.fruit_selection_script})

    def search(self, response):
        logging.info('hey from search!')
        with open('post_search_response.html', 'w') as f:
            f.write(response.text)

    def post_search(self, response):
        logging.info('hey from post_search!')
        with open('post_search_response.html', 'w') as f:
            f.write(response.text)

    def parse(self, response):
        pass
The scrapy-splash docs say:
SplashRequest sets session_id automatically for /execute endpoint, i.e. cookie handling is enabled by default if you use SplashRequest, /execute endpoint and a compatible Lua rendering script.
If you want to start from the same set of cookies, but then 'fork' sessions set request.meta['splash']['new_session_id'] in addition to session_id. Request cookies will be fetched from cookiejar session_id, but response cookies will be merged back to the new_session_id cookiejar.
As you can see, I am always using the execute endpoint, so I should get cookie handling by default. Yet it isn't working, and I'm not sure why; I wonder if it is because I am setting custom headers for the user agent and language.
Right now, when the spider comes to run the second script (fruit_selection_script), I get a 403 Forbidden error.
What am I missing?
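For what it's worth, here is a minimal sketch of the session-handling pattern the quoted docs describe, assuming the login script keeps returning cookies = splash:get_cookies() and the second script keeps calling splash:init_cookies(splash.args.cookies). With the execute endpoint and a script that returns a table, scrapy-splash exposes the decoded table as response.data, so the cookies from the first request can be forwarded explicitly on the next one; this at least takes the automatic session handling out of the list of unknowns, though whether it alone resolves the 403 is not certain:

def post_login(self, response):
    # The table returned by the login script is available as response.data.
    session_cookies = response.data.get('cookies', [])

    search_link = response.urljoin(
        response.xpath("(//div[@id='sidebar']/ul/li)[7]/a/@href").get()
    )

    yield SplashRequest(
        url=search_link,
        callback=self.search,
        endpoint='execute',
        args={
            'lua_source': self.fruit_selection_script,
            # Populate splash.args.cookies for splash:init_cookies()
            # in the second script.
            'cookies': session_cookies,
        },
    )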
I am trying to use Scrapy on this page: http://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/
But I can't get the product image; the spider can't find anything. What might I be missing?
I tried selecting by attribute, by ID, and by class, and nothing worked.
import scrapy
from scrapy import Request
import random


class BrickSetSpider(scrapy.Spider):
    name = 'spider'
    USER_AGENT_LIST = [
        'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0',
    ]
    start_urls = [
        'https://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/',
    ]
    download_delay = 5
    FEED_EXPORT_ENCODING = 'utf-8'

    def start_requests(self):
        for url in self.start_urls:
            headers = {'User-Agent': random.choice(self.USER_AGENT_LIST)}
            yield Request(url, headers=headers)

    def parse(self, response):
        SET_SELECTOR = '.content-left'
        for brickset in response.css(SET_SELECTOR):
            SEARCH_SELECTOR = response.url
            NAME_SELECTOR = 'span.keyValue span ::text'
            IMAGE_SELECTOR = 'img[itemprop="image"] ::attr(src)'
            yield {
                'search': SEARCH_SELECTOR,
                'name': brickset.css(NAME_SELECTOR).re('[^\t\n]+'),
                'link': brickset.css(IMAGE_SELECTOR).extract(),
            }
If you are using Chrome, you can test this in the console with $$(".images [data-test='zoom-wrap'] img") to get the image.
So you can use this CSS selector in the Scrapy code. You will have to extract the src attribute.
I hope it helps!
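A rough sketch of how that selector might slot into the parse() method, with ::attr(src) added to pull the URL. Whether the src attribute is present in the initial HTML rather than injected by JavaScript is an assumption here:

def parse(self, response):
    # CSS selector suggested above, plus ::attr(src) for the image URL.
    image_urls = response.css(".images [data-test='zoom-wrap'] img::attr(src)").getall()
    yield {
        'search': response.url,
        'link': image_urls,
    }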
The image is generated dynamically by JS. Try the following code.
from simplified_scrapy.spider import Spider, SimplifiedDoc
import re


class MySpider(Spider):
    name = 'rs-online.com'
    # allowed_domains = ['example.com']
    start_urls = [
        'https://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/'
    ]
    # refresh_urls = True  # For debug. If refresh_urls = True, start_urls will be crawled again.

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        # print(doc.html)
        div = doc.getElementByClass('content-left')
        imgs = re.compile(u'largeImageURL: ".*"').findall(div.script.html)
        imgs = ['https:' + img[len('largeImageURL: "'):-1] for img in imgs]
        lis = doc.getElementByClass('keyDetailsLL').lis
        names = {}
        for li in lis:
            spans = li.spans
            names[spans[0].text] = spans[1].text
        data = [{'imgs': imgs, 'names': names}]
        print(data)
        return {"Urls": [], "Data": data}  # Return data to framework


from simplified_scrapy.simplified_main import SimplifiedMain
SimplifiedMain.startThread(MySpider())  # Start crawling
Result:
[{'imgs': ['https://media.rs-online.com/t_large/F7858468-01.jpg', 'https://media.rs-online.com/t_large/F7858468-02.jpg'], 'names': {'Codice RS': '785-8468', 'Codice costruttore': 'E2E-S05S12-WC-B1 2M', 'Costruttore': 'Omron'}}]
Please help me optimize my Scrapy spider. In particular, the next-page pagination is not working. There are a lot of pages, and each page has 50 items.
I catch the first page's 50 items (links) in parse_items, and the next pages' items should also be scraped in parse_items.
import scrapy
from scrapy import Field
from fake_useragent import UserAgent


class DiscoItem(scrapy.Item):
    release = Field()
    images = Field()


class discoSpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['discogs.com']
    query = input('ENTER SEARCH MUSIC TYPE : ')
    start_urls = ['http://www.discogs.com/search?q=%s&type=release' % query]
    custome_settings = {
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        'handle_httpstatus_list': [301, 302, ],
        'download_delay': 10}

    def start_requests(self):
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

    def parse(self, response):
        print('START parse \n')
        print("*****", response.url)

        # next page pagination
        next_page = response.css('a.pagination_next::attr(href)').extract_first()
        next_page = response.urljoin(next_page)
        yield scrapy.Request(url=next_page, callback=self.parse_items2)

        headers = {}
        for link in response.css('a.search_result_title ::attr(href)').extract():
            ua = UserAgent()  # random user agent
            headers['User-Agent'] = ua.random
            yield scrapy.Request(response.urljoin(link), headers=headers, callback=self.parse_items)

    def parse_items2(self, response):
        print('parse_items2 *******', response.url)
        yield scrapy.Request(url=response.url, callback=self.parse)

    def parse_items(self, response):
        print("parse_items**********", response.url)
        items = DiscoItem()
        for imge in response.css('div#page_content'):
            img = imge.css("span.thumbnail_center img::attr(src)").extract()[0]
            items['images'] = img
            release = imge.css('div.content a ::text').extract()
            items['release'] = release[4]
            yield items
When I try running your code (after fixing the many indentation, spelling and letter case errors), this line is shown in scrapy's log:
2018-03-05 00:47:28 [scrapy.dupefilters] DEBUG: Filtered duplicate request: <GET https://www.discogs.com/search/?q=rock&type=release&page=2> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
Scrapy will filter duplicate requests by default, and your parse_items2() method does nothing but create duplicate requests. I fail to see any reason for that method existing.
What you should do instead is specify the parse() method as the callback for your requests, and avoid having an extra method that does nothing:
yield scrapy.Request(url=next_page, callback=self.parse)
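Put together, the parse() method could then look roughly like this (same selectors as in the question; the if next_page guard is an addition so the spider stops cleanly on the last page):

def parse(self, response):
    # One request per search result on the current page.
    for link in response.css('a.search_result_title ::attr(href)').extract():
        yield scrapy.Request(response.urljoin(link), callback=self.parse_items)

    # Pagination: feed the next page back into parse() itself.
    next_page = response.css('a.pagination_next::attr(href)').extract_first()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)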
Try this for pagination:
try:
    nextpage = response.urljoin(response.xpath("//*[contains(@rel,'next') and contains(@id,'next')]/@url")[0].extract())
    yield scrapy.Request(nextpage, callback=self.parse)
except:
    pass
I am trying to create a Reddit scraper using Python's Scrapy framework.
I have used CrawlSpider to crawl through Reddit and its subreddits. But when I come across pages that have adult content, the site asks for a cookie over18=1.
So I have been trying to send a cookie with every request that the spider makes, but it's not working out.
Here is my spider code. As you can see, I tried to add a cookie with every spider request using the start_requests() method.
Could anyone here tell me how to do this, or what I have been doing wrong?
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from reddit.items import RedditItem
from scrapy.http import Request, FormRequest


class MySpider(CrawlSpider):
    name = 'redditscraper'
    allowed_domains = ['reddit.com', 'imgur.com']
    start_urls = ['https://www.reddit.com/r/nsfw']

    rules = (
        Rule(LinkExtractor(
            allow=['/r/nsfw/\?count=\d*&after=\w*']),
            callback='parse_item',
            follow=True),
    )

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            print(url)
            yield Request(url, cookies={'over18': '1'}, callback=self.parse_item)

    def parse_item(self, response):
        titleList = response.css('a.title')
        for title in titleList:
            item = RedditItem()
            item['url'] = title.xpath('@href').extract()
            item['title'] = title.xpath('text()').extract()
            yield item
Okay. Try doing something like this.
def start_requests(self):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'}
    for i, url in enumerate(self.start_urls):
        yield Request(url, cookies={'over18': '1'}, callback=self.parse_item, headers=headers)
It's the User-Agent which blocks you.
Edit:
I don't know what's wrong with CrawlSpider, but a plain Spider works anyway.
#!/usr/bin/env python
# encoding: utf-8
import scrapy


class MySpider(scrapy.Spider):
    name = 'redditscraper'
    allowed_domains = ['reddit.com', 'imgur.com']
    start_urls = ['https://www.reddit.com/r/nsfw']

    def request(self, url, callback):
        """
        wrapper for scrapy.Request
        """
        request = scrapy.Request(url=url, callback=callback)
        request.cookies['over18'] = 1
        request.headers['User-Agent'] = (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, '
            'like Gecko) Chrome/45.0.2454.85 Safari/537.36')
        return request

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            yield self.request(url, self.parse_item)

    def parse_item(self, response):
        titleList = response.css('a.title')
        for title in titleList:
            item = {}
            item['url'] = title.xpath('@href').extract()
            item['title'] = title.xpath('text()').extract()
            yield item
        url = response.xpath('//a[@rel="nofollow next"]/@href').extract_first()
        if url:
            yield self.request(url, self.parse_item)
        # you may consider scrapy.pipelines.images.ImagesPipeline :D
The Scrapy docs cover this:
1. Using a dict:
request_with_cookies = Request(url="http://www.example.com",
                               cookies={'currency': 'USD', 'country': 'UY'})
2. Using a list of dicts:
request_with_cookies = Request(url="http://www.example.com",
                               cookies=[{'name': 'currency',
                                         'value': 'USD',
                                         'domain': 'example.com',
                                         'path': '/currency'}])
You can also send it via the Cookie header:
scrapy.Request(url=url, callback=callback, headers={'Cookie':my_cookie})
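For the over18 case that would mean something like the sketch below (the my_cookie value is only an illustration). Keep in mind that the Scrapy docs note that cookies set via the Cookie header are not considered by the CookiesMiddleware, so the cookies= parameter shown above is generally the safer route:

my_cookie = 'over18=1'  # assumed raw Cookie header value for this case
yield scrapy.Request(url=url, callback=self.parse_item, headers={'Cookie': my_cookie})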
You could use the process_request parameter in the rule, something like:
rules = (
    Rule(LinkExtractor(
        allow=['/r/nsfw/\?count=\d*&after=\w*']),
        callback='parse_item',
        process_request='ammend_req_header',
        follow=True),
)

def ammend_req_header(self, request):
    request.cookies['over18'] = 1
    return request
I found a solution for CrawlSpider:
def start_requests(self):
    yield Request(url=self.start_urls[0], callback=self._parse, cookies={'beget': 'begetok'})