I'm scraping a website that relies heavily on JavaScript. The main page, from which I need to extract the URLs that will be parsed, depends on JavaScript, so I have to modify start_requests.
I'm looking for a way to connect start_requests with the LinkExtractor rule and with process_match.
class MatchSpider(CrawlSpider):
    name = "match"
    allowed_domains = ["whoscored"]

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//*[contains(@class, "match-report")]//@href'), callback='process_match'),
    )

    def start_requests(self):
        url = 'https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/6335/Stages/13796/Fixtures/England-Premier-League-2016-2017'
        browser = Browser(browser='Chrome')
        browser.get(url)
        # should return a request with the html body from the Selenium driver so that the LinkExtractor rule can be applied

    def process_match(self, response):
        match_item = MatchItem()
        regex = re.compile(r"matchCentreData = \{.*?\};", re.S)
        match = re.search(regex, response.text).group()
        match = match.replace('matchCentreData =', '').replace(';', '')
        match_item['match'] = json.loads(match)
        match_item['url'] = response.url
        match_item['project'] = self.settings.get('BOT_NAME')
        match_item['spider'] = self.name
        match_item['server'] = socket.gethostname()
        match_item['date'] = datetime.datetime.now()
        yield match_item
A wrapper I'm using around Selenium:
class Browser:
    """
    Selenium on steroids: allows you to create different types of browsers
    plus adds methods for safer calls.
    """

    def __init__(self, browser='Firefox'):
        """
        browser: Chrome, Firefox or PhantomJS
        """
        self.browser = browser
        self._start()

    def _start(self):
        '''
        starts browser
        '''
        if self.browser == 'Chrome':
            chrome_options = webdriver.ChromeOptions()
            prefs = {"profile.managed_default_content_settings.images": 2}
            chrome_options.add_extension('./libcommon/adblockpluschrome-1.10.0.1526.crx')
            chrome_options.add_experimental_option("prefs", prefs)
            chrome_options.add_argument("user-agent={0}".format(random.choice(USER_AGENTS)))
            self.driver_ = webdriver.Chrome(executable_path='./libcommon/chromedriver', chrome_options=chrome_options)
        elif self.browser == 'Firefox':
            profile = webdriver.FirefoxProfile()
            profile.set_preference("general.useragent.override", random.choice(USER_AGENTS))
            profile.add_extension('./libcommon/adblock_plus-2.7.1-sm+tb+an+fx.xpi')
            profile.set_preference('permissions.default.image', 2)
            profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
            profile.set_preference("webdriver.load.strategy", "unstable")
            self.driver_ = webdriver.Firefox(profile)
        elif self.browser == 'PhantomJS':
            self.driver_ = webdriver.PhantomJS()
            self.driver_.set_window_size(1120, 550)

    def close(self):
        self.driver_.close()

    def return_when(self, condition, locator):
        """
        returns browser execution when condition is met
        """
        for _ in range(5):
            with suppress(Exception):
                wait = WebDriverWait(self.driver_, timeout=100, poll_frequency=0.1)
                wait.until(condition(locator))
                self.driver_.execute_script("return window.stop")
                return True
        return False

    def __getattr__(self, name):
        """
        Ruby-like method_missing: forward methods not implemented here to the
        attribute that holds the Selenium driver.
        """
        def _missing(*args, **kwargs):
            return getattr(self.driver_, name)(*args, **kwargs)
        return _missing
There are two problems I see after looking into this. Forgive any ignorance on my part, because it's been a while since I was last in the Python/Scrapy world.
First: How do we get the HTML from Selenium?
According to the Selenium docs, the driver should have a page_source attribute containing the contents of the page.
browser = Browser(browser='Chrome')
browser.get(url)
html = browser.driver_.page_source
browser.close()
You may want to make this a function in your browser class to avoid accessing browser.driver_ from MatchSpider.
# class Browser
def page_source(self):
    return self.driver_.page_source
# end class
browser.get(url)
html = browser.page_source()
Second: How do we override Scrapy's internal web requests?
It looks like Scrapy tries to decouple the behind-the-scenes web requests from the what-am-I-trying-to-parse functionality of each spider you write. start_requests() should "return an iterable with the first Requests to crawl", and make_requests_from_url(url) (which is called if you don't override start_requests()) takes "a URL and returns a Request object". When internally processing a Spider, Scrapy creates a plethora of Request objects that will be asynchronously executed, and the subsequent Response will be sent to parse(response)... the Spider never actually does the processing from Request to Response itself.
Long story short, this means you would need to create middleware for the Scrapy Downloader to use Selenium. Then you can remove your overridden start_requests() method and add a start_urls attribute. Specifically, your SeleniumDownloaderMiddleware should override the process_request(request, spider) method to use the above Selenium code.
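A minimal sketch of what that SeleniumDownloaderMiddleware could look like, reusing the Browser wrapper and the page_source() helper from above (the exact shape, including the try/finally, is my assumption rather than code from the answer):

from scrapy.http import HtmlResponse

class SeleniumDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # Fetch the page with Selenium instead of Scrapy's own downloader.
        browser = Browser(browser='Chrome')
        try:
            browser.get(request.url)
            html = browser.page_source()
            # Returning a Response here short-circuits the normal download for this request.
            return HtmlResponse(url=request.url, body=html, encoding='utf-8', request=request)
        finally:
            browser.close()

It would then be enabled via DOWNLOADER_MIDDLEWARES in settings.py, and the spider keeps its rules plus a plain start_urls attribute.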
TL;DR
In Scrapy, I want each Request to wait until the spider's parse callbacks finish, so the whole process needs to be sequential. Like this:
Request1 -> Crawl1 -> Request2 -> Crawl2 ...
But what is happening now:
Request1 -> Request2 -> Request3 ...
Crawl1
Crawl2
Crawl3 ...
Long version
I am new to scrapy + selenium web scraping.
I am trying to scrape a website whose contents are heavily updated with JavaScript. First I open the website with Selenium and log in. After that, I use a downloader middleware that handles the requests with Selenium and returns the responses. Below is the middleware's process_request implementation:
class XYZDownloaderMiddleware:
    '''Other functions are as is. I just changed this one'''

    def process_request(self, request, spider):
        driver = request.meta['driver']

        # We are opening a new link
        if request.meta['load_url']:
            driver.get(request.url)
            WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.XPATH, request.meta['wait_for_xpath'])))

        # We are clicking on an element to get new data using javascript.
        elif request.meta['click_bet']:
            element = request.meta['click_bet']
            element.click()
            WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.XPATH, request.meta['wait_for_xpath'])))

        body = driver.page_source
        return HtmlResponse(driver.current_url, body=body, encoding="utf-8", request=request)
In settings, I have also set CONCURRENT_REQUESTS = 1 so that multiple driver.get() calls are not made at once and Selenium can peacefully load responses one by one.
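For reference, that is a one-line change in the project settings:

# settings.py
CONCURRENT_REQUESTS = 1  # only one request in flight, so Selenium loads pages one by one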
Now what I see happening is: Selenium opens each URL, Scrapy lets Selenium wait for the page to finish loading, and then the middleware returns the response properly (it goes into the if request.meta['load_url'] block).
But after I get the response, I want to use the Selenium driver (in the parse(response) functions) to click on each of the elements by yielding a Request, and have the middleware return the updated HTML (the elif request.meta['click_bet'] block).
The Spider is minimally like this:
class XYZSpider(scrapy.Spider):
    def start_requests(self):
        start_urls = [
            'https://www.example.com/a',
            'https://www.example.com/b'
        ]

        self.driver = self.getSeleniumDriver()
        for url in start_urls:
            request = scrapy.Request(url=url, callback=self.parse)
            request.meta['driver'] = self.driver
            request.meta['load_url'] = True
            request.meta['wait_for_xpath'] = '/div/bla/bla'
            request.meta['click_bet'] = None
            yield request

    def parse(self, response):
        urls = response.xpath('//a/@href').getall()
        for url in urls:
            request = scrapy.Request(url=url, callback=self.rightSectionParse)
            request.meta['driver'] = self.driver
            request.meta['load_url'] = True
            request.meta['wait_for_xpath'] = '//div[contains(@class, "rightSection")]'
            request.meta['click_bet'] = None
            yield request

    def rightSectionParse(self, response):
        ...
So what is happening is that Scrapy is not waiting for the spider to finish parsing. Scrapy gets a response, and then calls the parse callback and fetches the next response in parallel. But the Selenium driver needs to be used by the parse callback before the next request is processed.
I want the requests to wait until the parse callback is finished.
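One pattern that enforces this ordering (a sketch under my own assumptions, not from the original post: it keeps CONCURRENT_REQUESTS = 1 and the middleware above, and the spider name is made up) is to hold the remaining URLs on the spider and only yield the next request from inside the callback, after the Selenium work for the current page is done:

import scrapy

class SequentialSpider(scrapy.Spider):
    name = 'sequential_example'  # hypothetical

    def start_requests(self):
        self.pending = ['https://www.example.com/a', 'https://www.example.com/b']
        yield scrapy.Request(self.pending.pop(0), callback=self.parse, dont_filter=True)

    def parse(self, response):
        # ... do all the Selenium clicking/parsing for this page here;
        # nothing else is in flight while this callback runs ...
        if self.pending:
            # Only now is the next request scheduled: Request1 -> Crawl1 -> Request2 -> Crawl2
            yield scrapy.Request(self.pending.pop(0), callback=self.parse, dont_filter=True)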
I am creating a web crawler using Scrapy and Selenium.
The code looks like this:
class MySpider(scrapy.Spider):
    urls = [/* a very long list of url */]

    def start_requests(self):
        for url in self.urls:
            yield scrapy.Request(url=url, callback=self.parse_item)

    def parse_item(self, response):
        item = Item()
        item['field1'] = response.xpath('some xpath').extract()[0]
        yield item

        sub_item_url = response.xpath('some another xpath').extract()[0]
        # Sub items are Javascript generated so it needs a web driver
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=options)
        driver.set_window_size(1920, 1080)
        sub_item_generator = self.get_sub_item_generator(driver, sub_item_url)
        while True:
            try:
                yield next(sub_item_generator)
            except StopIteration:
                break
        driver.close()

    def get_sub_item_generator(self, driver, url):
        # Crawling using the web driver goes here which takes a long time to finish
        yield sub_item
The problem is that the crawler runs for a while and then crashes because it runs out of memory. Scrapy keeps scheduling new URLs from the list, so there are too many web driver processes running.
Is there any way to tell the Scrapy scheduler not to schedule a new URL when some number of web driver processes are already running?
You could try setting CONCURRENT_REQUESTS to something lower than the default of 16 (as shown here):
class MySpider(scrapy.Spider):
    # urls = [/* a very long list of url */]
    custom_settings = {
        'CONCURRENT_REQUESTS': 5  # default of 16 seemed like it was too much?
    }
Try using driver.quit() instead of driver.close()
I had the same problem despite using driver.close(), so I did this: kill all Firefox instances before the script starts.
from subprocess import call
call(["killall", "firefox"])
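Applying the driver.quit() suggestion to the parse_item from the question could look like this (a sketch; the try/finally and the simplified for loop are my own rearrangement, not code from the answers):

def parse_item(self, response):
    item = Item()
    item['field1'] = response.xpath('some xpath').extract()[0]
    yield item

    sub_item_url = response.xpath('some another xpath').extract()[0]
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=options)
    try:
        driver.set_window_size(1920, 1080)
        for sub_item in self.get_sub_item_generator(driver, sub_item_url):
            yield sub_item
    finally:
        driver.quit()  # quit() tears down the whole browser process, unlike close()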
I am writing a Scrapy spider with Selenium to crawl a dynamic web page.
I am pretty sure the regular expression works fine, but 'page_link' from the LinkExtractor gets nothing and the program terminates before the parse_item function is called. I can't figure out what is wrong.
class OikotieSpider(CrawlSpider):
    name = 'oikotie'
    allowed_domains = [my_domain]
    start_urls = ['https://asunnot.oikotie.fi/myytavat-uudisasunnot?cardType=100&locations=%5B%22helsinki%22%5D&newDevelopment=1&buildingType%5B%5D=1&buildingType%5B%5D=256&pagination=1']

    def __init__(self):
        CrawlSpider.__init__(self)
        chrome_driver = 'mydriver_location'
        os.environ["webdriver.chrome.driver"] = chrome_driver
        chromeOptions = webdriver.ChromeOptions()
        prefs = {"profile.managed_default_content_settings.images": 2}
        chromeOptions.add_experimental_option("prefs", prefs)

        # driver instance and call
        self.driver = webdriver.Chrome(executable_path=chrome_driver, chrome_options=chromeOptions)
        self.driver.get('my_url')
        self.selector = Selector(text=self.driver.page_source)
        self.driver.close()
        self.driver.quit()

    page_link = LinkExtractor(allow=('myytavat-asunnot\/helsinki\/\d+',))
    rules = (
        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        Rule(page_link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)
        print("parse_item is called!!")
        self.driver.get(response.url)
        self.driver.implicitly_wait(30)
        return ....
I think you should use a downloader middleware to fetch the webpage. In the downloader middleware you can init a browser to get the page. Look at this:
https://doc.scrapy.org/en/latest/topics/downloader-middleware.html?highlight=DownloadMiddleware
It looks like your LinkExtractor allow argument is not an absolute regex. It needs to be; see https://doc.scrapy.org/en/latest/topics/link-extractors.html.
Now, part of being an absolute regex could just be prepending .* to your current regex... but that would be terrible :). Just make it absolute.
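For illustration, the kind of change that answer is asking for might look like this (the exact pattern is an assumption based on the start URL in the question):

page_link = LinkExtractor(allow=(r'https://asunnot\.oikotie\.fi/myytavat-asunnot/helsinki/\d+',))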
OS: Ubuntu 16.04
Stack - Scrapy 1.0.3 + Selenium
I'm pretty new to Scrapy and this might sound very basic, but in my spider only __init__ is being executed. Any code/function after that is not getting called and the spider just halts.
class CancerForumSpider(scrapy.Spider):
    name = "mainpage_spider"
    allowed_domains = ["cancerforums.net"]
    start_urls = [
        "http://www.cancerforums.net/forums/14-Prostate-Cancer-Forum"
    ]

    def __init__(self, *args, **kwargs):
        self.browser = webdriver.Firefox()
        self.browser.get("http://www.cancerforums.net/forums/14-Prostate-Cancer-Forum")
        print "----------------Going to sleep------------------"
        time.sleep(5)
        # self.parse()

    def __exit__(self):
        print "------------Exiting----------"
        self.browser.quit()

    def parse(self, response):
        print "----------------Inside Parse------------------"
        print "------------Exiting----------"
        self.browser.quit()
The spider gets the browser object, prints "Going to sleep" and just halts. It doesn't go inside the parse function.
Following are the contents of the run logs:
----------------inside init----------------
----------------Going to sleep------------------
There are a few problems you need to address or be aware of:
You're not calling super() during the __init__ method, so none of the inherited classes' initialization is going to happen. Scrapy won't do anything (like calling its parse() method), as all of that is set up in scrapy.Spider. (A sketch of the fix follows these points.)
After fixing the above, your parse() method will be called by Scrapy, but it won't be operating on your Selenium-fetched webpage. It will have no knowledge of it whatsoever and will go re-fetch the url (based on start_urls). It's very likely that these two sources will differ (often drastically).
You're going to be bypassing almost all of Scrapy's functionality by using Selenium the way you are. All of Selenium's get() calls will be executed outside of the Scrapy framework. Middleware won't be applied (cookies, throttling, filtering, etc.), nor will any of the expected/created objects (like request and response) be populated with the data you expect.
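For the first point, a minimal sketch of the corrected constructor (my own illustration of the fix, not code from the answer):

class CancerForumSpider(scrapy.Spider):
    ...
    def __init__(self, *args, **kwargs):
        # Let scrapy.Spider run its own initialization first (name, start_urls, crawler hooks, ...)
        super(CancerForumSpider, self).__init__(*args, **kwargs)
        self.browser = webdriver.Firefox()
        self.browser.get("http://www.cancerforums.net/forums/14-Prostate-Cancer-Forum")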
Before you fix all of that, you should consider a couple of better options/alternatives:
Create a downloader middleware that handles all "Selenium"-related functionality. Have it intercept request objects right before they hit the downloader, populate new response objects and return them for processing by the spider.
This isn't optimal, as you're effectively creating your own downloader, and short-circuiting Scrapy's. You'll have to re-implement the handling of any desired settings the downloader usually takes into account and make them work with Selenium.
Ditch Selenium and use the Splash HTTP API and the scrapy-splash middleware for handling JavaScript (a sketch follows this list).
Ditch Scrapy altogether and just use Selenium and BeautifulSoup.
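If you go the scrapy-splash route mentioned above, the spider-side change is roughly the following sketch; it assumes a running Splash instance and the scrapy-splash settings from that project's README, and the wait value is arbitrary:

import scrapy
from scrapy_splash import SplashRequest

class CancerForumSpider(scrapy.Spider):
    name = "mainpage_spider"

    def start_requests(self):
        # Splash renders the JavaScript server-side and returns the final HTML.
        yield SplashRequest("http://www.cancerforums.net/forums/14-Prostate-Cancer-Forum",
                            self.parse, args={'wait': 2})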
Scrapy is useful when you have to crawl a big amount of pages. Selenium is normally useful for scraping when you need to have the DOM source after the JS has loaded. If that's your situation, there are two main ways to combine Selenium and Scrapy. One is to write a download handler, like the one you can find here.
The code goes as:
# encoding: utf-8
from __future__ import unicode_literals

from scrapy import signals
from scrapy.signalmanager import SignalManager
from scrapy.responsetypes import responsetypes
from scrapy.xlib.pydispatch import dispatcher
from selenium import webdriver
from six.moves import queue
from twisted.internet import defer, threads
from twisted.python.failure import Failure


class PhantomJSDownloadHandler(object):

    def __init__(self, settings):
        self.options = settings.get('PHANTOMJS_OPTIONS', {})

        max_run = settings.get('PHANTOMJS_MAXRUN', 10)
        self.sem = defer.DeferredSemaphore(max_run)
        self.queue = queue.LifoQueue(max_run)

        SignalManager(dispatcher.Any).connect(self._close, signal=signals.spider_closed)

    def download_request(self, request, spider):
        """use semaphore to guard a phantomjs pool"""
        return self.sem.run(self._wait_request, request, spider)

    def _wait_request(self, request, spider):
        try:
            driver = self.queue.get_nowait()
        except queue.Empty:
            driver = webdriver.PhantomJS(**self.options)

        driver.get(request.url)
        # ghostdriver won't respond when switching windows until the page is loaded
        dfd = threads.deferToThread(lambda: driver.switch_to.window(driver.current_window_handle))
        dfd.addCallback(self._response, driver, spider)
        return dfd

    def _response(self, _, driver, spider):
        body = driver.execute_script("return document.documentElement.innerHTML")
        if body.startswith("<head></head>"):  # cannot access response header in Selenium
            body = driver.execute_script("return document.documentElement.textContent")
        url = driver.current_url
        respcls = responsetypes.from_args(url=url, body=body[:100].encode('utf8'))
        resp = respcls(url=url, body=body, encoding="utf-8")

        response_failed = getattr(spider, "response_failed", None)
        if response_failed and callable(response_failed) and response_failed(resp, driver):
            driver.close()
            return defer.fail(Failure())
        else:
            self.queue.put(driver)
            return defer.succeed(resp)

    def _close(self):
        while not self.queue.empty():
            driver = self.queue.get_nowait()
            driver.close()
Suppose your scraper is called "scraper". If you put the mentioned code inside a file called handlers.py on the root of the "scraper" folder, then you could add to your settings.py:
DOWNLOAD_HANDLERS = {
    'http': 'scraper.handlers.PhantomJSDownloadHandler',
    'https': 'scraper.handlers.PhantomJSDownloadHandler',
}
Another way is to write a download middleware, as described here. The download middleware has the downside of preventing some key features from working out of the box, such as cache and retries.
In any case, starting the Selenium webdriver at the init of the Scrapy spider is not the usual way to go.
I am learning Python and am trying to scrape this page for a specific value in the dropdown menu. After that I need to click each item in the resulting table to retrieve the specific information. I am able to select the item and retrieve the information with the webdriver, but I do not know how to pass the response url to the crawlspider.
driver = webdriver.Firefox()
driver.get('http://www.cppcc.gov.cn/CMS/icms/project1/cppcc/wylibary/wjWeiYuanList.jsp')
more_btn = WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located((By.ID, '_button_select'))
)
more_btn.click()
## select specific value from the dropdown
driver.find_element_by_css_selector("select#tabJcwyxt_jiebie > option[value='teyaoxgrs']").click()
driver.find_element_by_css_selector("select#tabJcwyxt_jieci > option[value='d11jie']").click()
search2 = driver.find_element_by_class_name('input_a2')
search2.click()
time.sleep(5)
## convert html to "nice format"
text_html=driver.page_source.encode('utf-8')
html_str=str(text_html)
## this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
resp_for_scrapy=TextResponse('none',200,{},html_str,[],None)
So this is where I am stuck. I was able to query using the above code. But how can I pass resp_for_scrapy to the crawlspider? I put resp_for_scrapy in place of item but that didn't work.
## spider
class ProfileSpider(CrawlSpider):
    name = 'pccprofile2'
    allowed_domains = ['cppcc.gov.cn']
    start_urls = ['http://www.cppcc.gov.cn/CMS/icms/project1/cppcc/wylibary/wjWeiYuanList.jsp']

    def parse(self, resp_for_scrapy):
        hxs = HtmlXPathSelector(resp_for_scrapy)
        for post in resp_for_scrapy.xpath('//div[@class="table"]//ul//li'):
            items = []
            item = Ppcprofile2Item()
            item["name"] = hxs.select("//h1/text()").extract()
            item["title"] = hxs.select("//div[@id='contentbody']//tr//td//text()").extract()
            items.append(item)

        ## click next page
        while True:
            next = self.driver.findElement(By.linkText("下一页"))
            try:
                next.click()
            except:
                break

        return(items)
Any suggestions would be greatly appreciated!!!!
EDIT: I included a middleware class to select from the dropdown before the spider class. But now there is no error and no results.
class JSMiddleware(object):
    def process_request(self, request, spider):
        driver = webdriver.PhantomJS()
        driver.get('http://www.cppcc.gov.cn/CMS/icms/project1/cppcc/wylibary/wjWeiYuanList.jsp')

        # select from the dropdown
        more_btn = WebDriverWait(driver, 20).until(
            EC.visibility_of_element_located((By.ID, '_button_select'))
        )
        more_btn.click()

        driver.find_element_by_css_selector("select#tabJcwyxt_jiebie > option[value='teyaoxgrs']").click()
        driver.find_element_by_css_selector("select#tabJcwyxt_jieci > option[value='d11jie']").click()
        search2 = driver.find_element_by_class_name('input_a2')
        search2.click()
        time.sleep(5)

        # get the response
        body = driver.page_source
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)


class ProfileSpider(CrawlSpider):
    name = 'pccprofile2'
    rules = [Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=("//div[@class='table']")), callback='parse_item')]

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        item = Ppcprofile2Item()
        item["name"] = hxs.select("//h1/text()").extract()
        item["title"] = hxs.select("//div[@id='contentbody']//tr//td//text()").extract()
        items.append(item)

        # click next page
        while True:
            next = response.findElement(By.linkText("下一页"))
            try:
                next.click()
            except:
                break

        return(items)
Use Downloader Middleware to catch selenium-required pages before you process them regularly with Scrapy:
The downloader middleware is a framework of hooks into Scrapy’s request/response processing. It’s a light, low-level system for globally altering Scrapy’s requests and responses.
Here's a very basic example using PhantomJS:
from scrapy.http import HtmlResponse
from selenium import webdriver


class JSMiddleware(object):
    def process_request(self, request, spider):
        driver = webdriver.PhantomJS()
        driver.get(request.url)

        body = driver.page_source
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
Once you return that HtmlResponse (or a TextResponse if that's what you really want), Scrapy will cease processing downloaders and drop into the spider's parse method:
If it returns a Response object, Scrapy won’t bother calling any other
process_request() or process_exception() methods, or the appropriate
download function; it’ll return that response. The process_response()
methods of installed middleware is always called on every response.
In this case, you can continue to use your spider's parse method as you normally would with HTML, except that the JS on the page has already been executed.
Tip: Since the Downloader Middleware's process_request method accepts the spider as an argument, you can add a conditional in the spider to check whether you need to process JS at all, and that will let you handle both JS and non-JS pages with the exact same spider class.
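That tip could look roughly like this (a sketch; the needs_js flag is a made-up spider attribute, not anything built into Scrapy):

from scrapy.http import HtmlResponse
from selenium import webdriver


class JSMiddleware(object):
    def process_request(self, request, spider):
        # Spiders that do not opt in fall through to Scrapy's normal downloader,
        # because returning None lets request processing continue as usual.
        if not getattr(spider, 'needs_js', False):
            return None
        driver = webdriver.PhantomJS()
        driver.get(request.url)
        body = driver.page_source
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)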
Here is a middleware for Scrapy and Selenium
from scrapy.http import HtmlResponse
from scrapy.utils.python import to_bytes
from selenium import webdriver
from scrapy import signals


class SeleniumMiddleware(object):

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        request.meta['driver'] = self.driver  # to access driver from response
        self.driver.get(request.url)
        body = to_bytes(self.driver.page_source)  # body must be of type bytes
        return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)

    def spider_opened(self, spider):
        self.driver = webdriver.Firefox()

    def spider_closed(self, spider):
        self.driver.close()
You also need to add this to settings.py:
DOWNLOADER_MIDDLEWARES = {
    'youproject.middlewares.selenium.SeleniumMiddleware': 200
}
Decide whether it's 200 or something else based on the docs.
Update: Firefox headless mode with Scrapy and Selenium
If you want to run Firefox in headless mode, then install xvfb
sudo apt-get install -y xvfb
and PyVirtualDisplay
sudo pip install pyvirtualdisplay
and use this middleware
from shutil import which

from pyvirtualdisplay import Display
from scrapy import signals
from scrapy.http import HtmlResponse
from scrapy.utils.project import get_project_settings
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

settings = get_project_settings()

HEADLESS = True


class SeleniumMiddleware(object):

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        self.driver.get(request.url)
        request.meta['driver'] = self.driver
        body = str.encode(self.driver.page_source)
        return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)

    def spider_opened(self, spider):
        if HEADLESS:
            self.display = Display(visible=0, size=(1280, 1024))
            self.display.start()
        binary = FirefoxBinary(settings.get('FIREFOX_EXE') or which('firefox'))
        self.driver = webdriver.Firefox(firefox_binary=binary)

    def spider_closed(self, spider):
        self.driver.close()
        if HEADLESS:
            self.display.stop()
where settings.py contains
FIREFOX_EXE = '/path/to/firefox.exe'
The problem is that some versions of Firefox don't work with Selenium.
To solve this, you can download Firefox version 47.0.1 (this version works flawlessly) from here, then extract it and put it inside your Selenium project. Afterwards, modify the Firefox path as:
FIREFOX_EXE = '/path/to/your/scrapyproject/firefox/firefox.exe'