I am writing a Scrapy spider with Selenium to crawl a dynamic web page.
I am pretty sure the regular expression works fine. But the 'page_link' from the LinkExtractor is getting nothing, and the program terminates before the parse_item function gets called. I can't figure out what is wrong.
class OikotieSpider(CrawlSpider):
    """Crawl Oikotie new-apartment listings.

    NOTE(review): the Selenium driver is created, used once and quit
    inside __init__; Scrapy's own downloader (not Selenium) fetches
    start_urls, so the LinkExtractor only ever sees the non-JS HTML.
    The rendering should live in a downloader middleware instead.
    """

    name = 'oikotie'
    allowed_domains = [my_domain]  # placeholder -- must be a bare domain, e.g. 'oikotie.fi'
    start_urls = ['https://asunnot.oikotie.fi/myytavat-uudisasunnot?cardType=100&locations=%5B%22helsinki%22%5D&newDevelopment=1&buildingType%5B%5D=1&buildingType%5B%5D=256&pagination=1']

    # Raw string keeps the backslashes out of Python's escape handling.
    page_link = LinkExtractor(allow=(r'myytavat-asunnot/helsinki/\d+',))
    rules = (
        # Extract links matching the listing pattern and parse them with parse_item.
        Rule(page_link, callback='parse_item', follow=True),
    )

    def __init__(self):
        super().__init__()
        chrome_driver = 'mydriver_location'  # placeholder: path to chromedriver
        os.environ["webdriver.chrome.driver"] = chrome_driver
        chrome_options = webdriver.ChromeOptions()
        # 2 == block image loading, which speeds up rendering.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        # driver instance and call
        self.driver = webdriver.Chrome(executable_path=chrome_driver,
                                       chrome_options=chrome_options)
        self.driver.get('my_url')  # placeholder URL
        self.selector = Selector(text=self.driver.page_source)
        # quit() tears down every window and the driver process;
        # the extra close() call before it was redundant.
        self.driver.quit()

    def parse_item(self, response):
        """Log the listing page; item extraction is still a TODO."""
        self.logger.info('Hi, this is an item page! %s', response.url)
        print("parse_item is called!!")
        # NOTE(review): the driver was already quit in __init__, so this
        # call would fail if parse_item were ever reached.
        self.driver.get(response.url)
        self.driver.implicitly_wait(30)
        # TODO: build and return the scraped item here.
I think you should use a 'DownloaderMiddleware' to fetch the web page. In the 'DownloaderMiddleware', you can initialize the browser to get the page. Look at this:
https://doc.scrapy.org/en/latest/topics/downloader-middleware.html?highlight=DownloadMiddleware
It looks like your LinkExtractor's allow argument is not an absolute regex. It needs to be one — see: https://doc.scrapy.org/en/latest/topics/link-extractors.html.
Now, an absolute regex could just be `.*` + your current regex... but that would be terrible :). Just make it properly absolute.
Related
I am creating a web crawler using Scrapy and Selenium.
The code looks like this:
class MySpider(scrapy.Spider):
    """Scrape items, then fetch their JS-generated sub-items with headless Chrome."""

    urls = []  # a very long list of urls

    def start_requests(self):
        # Fixed: the original iterated a bare `urls`, which raises
        # NameError -- the list is a class attribute, so use self.urls.
        for url in self.urls:
            yield scrapy.Request(url=url, callback=self.parse_item)

    def parse_item(self, response):
        item = Item()
        item['field1'] = response.xpath('some xpath').extract()[0]
        yield item
        sub_item_url = response.xpath('some another xpath').extract()[0]
        # Sub items are Javascript generated so it needs a web driver
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(chrome_options=options)
        try:
            driver.set_window_size(1920, 1080)
            # A plain for-loop replaces the while/next/StopIteration dance
            # and drains the generator the same way.
            for sub_item in self.get_sub_item_generator(driver, sub_item_url):
                yield sub_item
        finally:
            # quit() (not close()) tears down the whole browser process;
            # leaked chromedriver processes are what exhausted memory.
            # The finally block guarantees it even when crawling fails.
            driver.quit()

    def get_sub_item_generator(self, driver, url):
        # Fixed: `self` was missing, so calling this as a method raised
        # TypeError (driver was bound as self).
        # Crawling using the web driver goes here which takes a long time to finish
        yield sub_item
The problem is that the crawler runs for a while and then crashes because it runs out of memory. Scrapy keeps scheduling new URLs from the list, so there are too many web driver processes running.
Is there any way to control the Scrapy scheduler not to schedule a new URL when there is some number of web driver process running?
You could try setting CONCURRENT_REQUESTS to something lower than the default of 16 (as shown here):
class MySpider(scrapy.Spider):
    """Spider with throttled concurrency so fewer Selenium drivers are alive at once."""

    # urls = []  # a very long list of urls

    custom_settings = {
        'CONCURRENT_REQUESTS': 5,  # default of 16 seemed like it was too much?
    }
Try using driver.quit() instead of driver.close()
I have had the same problem despite using driver.close(), so I did this: kill all Firefox instances before the script starts.
from subprocess import call
call(["killall", "firefox"])
I'm scraping a website that strongly depends on Javascript. The main page from which I need to extract the urls that will be parsed depends on Javascript, so I have to modify start_requests.
I'm looking for a way to connect start_requests with the LinkExtractor and with process_match.
class MatchSpider(CrawlSpider):
    """Extract match-report data from WhoScored fixture pages."""

    name = "match"
    # NOTE(review): allowed_domains entries should be real domains,
    # e.g. "whoscored.com"; a bare word defeats the offsite filter.
    allowed_domains = ["whoscored"]
    rules = (
        # '@' restored in the XPath -- markdown had mangled it to '#'.
        # NOTE(review): this Rule's callback names 'parse_item', but the
        # method below is process_match -- one of the two must be renamed
        # for the callback ever to fire.
        Rule(LinkExtractor(restrict_xpaths='//*[contains(@class, "match-report")]//@href'),
             callback='parse_item'),
    )

    def start_requests(self):
        url = 'https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/6335/Stages/13796/Fixtures/England-Premier-League-2016-2017'
        browser = Browser(browser='Chrome')
        browser.get(url)
        # should return a request with the html body from Selenium driver so that LinkExtractor rule can be applied
        # TODO(review): nothing is yielded here yet -- wrap the driver's
        # page_source in a Response/Request, or move the Selenium fetch
        # into a downloader middleware.

    def process_match(self, response):
        """Parse the inline `matchCentreData` JSON blob into a MatchItem."""
        match_item = MatchItem()
        # re.S lets `.` span newlines, since the JSON object is multi-line.
        regex = re.compile(r"matchCentreData = \{.*?\};", re.S)
        match = re.search(regex, response.text).group()
        # Strip the JS assignment down to the bare JSON object.
        match = match.replace('matchCentreData =', '').replace(';', '')
        match_item['match'] = json.loads(match)
        match_item['url'] = response.url
        match_item['project'] = self.settings.get('BOT_NAME')
        match_item['spider'] = self.name
        match_item['server'] = socket.gethostname()
        match_item['date'] = datetime.datetime.now()
        yield match_item
A wrapper I'm using around Selenium:
class Browser:
    """
    selenium on steroids. allows you to create different types of browsers plus
    adds methods for safer calls
    """

    def __init__(self, browser='Firefox'):
        """
        browser: 'Chrome', 'Firefox' or 'PhantomJS'
        """
        self.browser = browser
        self._start()

    def _start(self):
        '''
        starts browser
        '''
        if self.browser == 'Chrome':
            chrome_options = webdriver.ChromeOptions()
            # 2 == never load images (saves bandwidth).
            prefs = {"profile.managed_default_content_settings.images": 2}
            chrome_options.add_extension('./libcommon/adblockpluschrome-1.10.0.1526.crx')
            chrome_options.add_experimental_option("prefs", prefs)
            chrome_options.add_argument("user-agent={0}".format(random.choice(USER_AGENTS)))
            self.driver_ = webdriver.Chrome(executable_path='./libcommon/chromedriver',
                                            chrome_options=chrome_options)
        elif self.browser == 'Firefox':
            profile = webdriver.FirefoxProfile()
            profile.set_preference("general.useragent.override", random.choice(USER_AGENTS))
            profile.add_extension('./libcommon/adblock_plus-2.7.1-sm+tb+an+fx.xpi')
            # 2 == never load images.
            profile.set_preference('permissions.default.image', 2)
            profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
            profile.set_preference("webdriver.load.strategy", "unstable")
            self.driver_ = webdriver.Firefox(profile)
        elif self.browser == 'PhantomJS':
            self.driver_ = webdriver.PhantomJS()
            self.driver_.set_window_size(1120, 550)

    def close(self):
        self.driver_.close()

    def return_when(self, condition, locator):
        """
        returns browser execution when condition is met
        """
        for _ in range(5):
            with suppress(Exception):
                wait = WebDriverWait(self.driver_, timeout=100, poll_frequency=0.1)
                wait.until(condition(locator))
                self.driver_.execute_script("return window.stop")
                return True
        return False

    def __getattr__(self, name):
        """
        ruby-like method missing: delegate anything not defined here to the
        wrapped selenium driver.

        Fixed: the old wrapper returned a function for *every* attribute,
        so data attributes such as ``page_source`` came back as callables.
        Plain getattr() keeps method calls working identically (a bound
        method is returned and then called) while returning data
        attributes directly.
        """
        return getattr(self.driver_, name)
There's two problems I see after looking into this. Forgive any ignorance on my part, because it's been a while since I was last in the Python/Scrapy world.
First: How do we get the HTML from Selenium?
According to the Selenium docs, the driver should have a page_source attribute containing the contents of the page.
browser = Browser(browser='Chrome')
browser.get(url)
# The wrapped Selenium driver exposes the rendered page via page_source.
html = browser.driver_.page_source
browser.close()
You may want to make this a function in your browser class to avoid accessing browser.driver_ from MatchSpider.
# class Browser
def page_source(self):
    """Expose the driver's rendered HTML so callers never touch driver_."""
    return self.driver_.page_source
# end class

browser.get(url)
html = browser.page_source()
Second: How do we override Scrapy's internal web requests?
It looks like Scrapy tries to decouple the behind-the-scenes web requests from the what-am-I-trying-to-parse functionality of each spider you write. start_requests() should "return an iterable with the first Requests to crawl" and make_requests_from_url(url) (which is called if you don't override start_requests()) takes "a URL and returns a Request object". When internally processing a Spider, Scrapy starts creating a plethora of Request objects that will be asynchronously executed and the subsequent Response will be sent to parse(response) — the Spider never actually does the processing from Request to Response itself.
Long story short, this means you would need to create middleware for the Scrapy Downloader to use Selenium. Then, you can remove your overridden start_requests() method and add a start_urls attribute. Specifically, your SeleniumDownloaderMiddleware should overwrite the process_request(request, spider) method to use the above Selenium code.
I want to ask how about (do crawling) clicking next button(change number page of website) (then do crawling more till the end of page number) from this site
I've try to combining scrape with selenium,but its still error and says "line 22
self.driver = webdriver.Firefox()
^
IndentationError: expected an indented block"
I don't know why it happens; I think my code is written well. Can anybody resolve this problem?
This my source :
from selenium import webdriver
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from now.items import NowItem
class MySpider(BaseSpider):
    """Scrape n0where.net article panels; pagination via Selenium (unwired)."""

    name = "nowhere"
    allowed_domains = ["n0where.net"]
    start_urls = ["https://n0where.net/"]

    def __init__(self):
        # Fixed: this body must be indented under __init__ -- the
        # un-indented line was the reported IndentationError.
        self.driver = webdriver.Firefox()

    def parse(self, response):
        """Yield one NowItem per '.loop-panel' article on the page."""
        for article in response.css('.loop-panel'):
            item = NowItem()
            item['title'] = article.css('.article-title::text').extract_first()
            item['link'] = article.css('.loop-panel>a::attr(href)').extract_first()
            item['body'] = ''.join(article.css('.excerpt p::text').extract()).strip()
            # item['date'] = article.css('[itemprop="datePublished"]::attr(content)').extract_first()
            yield item

    def parse2(self, response):
        """Click through pagination with Selenium.

        NOTE(review): nothing registers this as a callback, so it is
        never invoked as written.
        """
        self.driver.get(response.url)
        while True:
            # `next` renamed to avoid shadowing the builtin.
            next_btn = self.driver.find_element_by_xpath('/html/body/div[4]/div[3]/div/div/div/div/div[1]/div/div[6]/div/a[8]/span')
            try:
                next_btn.click()
                # get the data and write it to scrapy items
            except Exception:
                # Click failed -> assume we reached the last page.
                break
        self.driver.close()
This my capture of my program mate :
Ignoring the syntax and indentation errors, you have an issue with your code logic in general.
What you do is create webdriver and never use it. What your spider does here is:
Create webdriver object.
Schedule a request for every url in self.start_urls, in your case it's only one.
Download it, make Response object and pass it to the self.parse()
Your parse method seems to find some xpaths and makes some items, so scrapy yields you some items that were found if any
Done
Your parse2 was never called and so your selenium webdriver was never used.
Since you are not using scrapy to download anything in this case you can just override start_requests()(<- that's where your spider starts) method of your spider to do the whole logic.
Something like:
from selenium import webdriver
import scrapy
from scrapy import Selector
class MySpider(scrapy.Spider):
    """Drive Firefox through the pagination and parse each page's HTML with Scrapy selectors."""

    name = "nowhere"
    allowed_domains = ["n0where.net"]
    start_url = "https://n0where.net/"

    def start_requests(self):
        driver = webdriver.Firefox()
        try:
            driver.get(self.start_url)
            while True:
                try:
                    # parse the body your webdriver has
                    # Fixed: the original created this generator and threw
                    # it away, so no items were ever produced -- it has to
                    # be drained.
                    for item in self.parse(driver.page_source):
                        yield item
                    # click the button to go to next page
                    # Moved inside the try: a missing button now ends the
                    # loop instead of raising out of start_requests.
                    next_link = driver.find_element_by_xpath(
                        '/html/body/div[4]/div[3]/div/div/div/div/div[1]/div/div[6]/div/a[8]/span')
                    next_link.click()
                except Exception:
                    break
        finally:
            # Guarantee the browser is shut down even on errors.
            driver.close()

    def parse(self, body):
        """Yield one dict per '.loop-panel' article in the given HTML string."""
        # create Selector from html string
        sel = Selector(text=body)
        # parse it
        for article in sel.css('.loop-panel'):
            item = dict()
            item['title'] = article.css('.article-title::text').extract_first()
            item['link'] = article.css('.loop-panel>a::attr(href)').extract_first()
            item['body'] = ''.join(article.css('.excerpt p::text').extract()).strip()
            # item['date'] = article.css('[itemprop="datePublished"]::attr(content)').extract_first()
            yield item
This is an indentation error. Look at the lines near the error:
def parse2(self, response):
self.driver.get(response.url)
The first of these two lines ends with a colon. So, the second line should be more indented than the first one.
There are two possible fixes, depending on what you want to do. Either add an indentation level to the second one:
def parse2(self, response):
self.driver.get(response.url)
Or move the parse2 function out of the `__init__` function:
def parse2(self, response):
    self.driver.get(response.url)

def __init__(self):
    self.driver = webdriver.Firefox()
    # etc.
I'm trying to scrape with phantomJS and selenium (and scrapy) into a list of links inside a website. I'm new to PhantomJS and Selenium so I'll ask here.
I think the website has a session timeout because I can scrape just the first of those links. Then I get this error:
NoSuchWindowException: Message: {"errorMessage":"Currently Window
handle/name is invalid
(closed?)","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"460","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:33038","User-Agent":"Python-urllib/2.7"},"httpVersion":"1.1","method":"POST","post":"{\"url\":
That's part of my code:
class bllanguage(scrapy.Spider):
    """Fetch each URL listed in the 'link' file and render it with PhantomJS."""

    handle_httpstatus_list = [302]
    name = "bllanguage"
    download_delay = 1
    # Fixed: allowed_domains entries must be bare domains -- a scheme
    # ("http://...") prevents the offsite middleware from matching.
    allowed_domains = ["explore.com"]
    # Read start URLs once at class-definition time; the context manager
    # guarantees the file handle is closed.
    with open("link") as f:
        start_urls = [url.strip() for url in f.readlines()]

    def __init__(self):
        self.driver = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs')

    def start_requests(self):
        for u in self.start_urls:
            r = scrapy.Request(url=u, dont_filter=True, callback=self.parse)
            # presumably set so 302s reach parse() instead of being
            # followed -- pairs with handle_httpstatus_list above.
            r.meta['dont_redirect'] = True
            yield r

    def parse(self, response):
        self.driver.get(response.url)
        # print response.url
        search_field = []
Etc.
The session timeout problem is just my interpretation, I've seen other messages like that but none of them has a solution. What I would like to try is to "close" the request to each link inside the "link" file. I don't know if this is something PhantomJS does naturally or I have to insert something: I've seen there's a resourceTimeout setting. Is it the right thing to use and where can I put it inside my code?
I have read all the threads on using scrapy for AJAX pages and installed selenium webdrive to simplify the task, my spider can partially crawl but can't get any data into my Items.
My objectives are:
Crawl from this page to this page
Scrape each item(post)'s:
author_name (xpath: /html/body/div[8]/div/div[1]/div[3]/div[3]/ul/li[2]/div[2]/span[2]/ul/li[3]/a/text())
author_page_url (xpath: /html/body/div[8]/div/div[1]/div[3]/div[3]/ul/li[2]/div[2]/span[2]/ul/li[3]/a/@href)
post_title (xpath: //a[@class="title_txt"])
post_page_url (xpath: //a[@class="title_txt"]/@href)
post_text (xpath on a separate post page: //div[@id="a_NMContent"]/text())
This is my monkey code (since I am only making my first steps in Python as an aspiring natural language processing student, who majored in linguistics in the past):
import scrapy
import time
from selenium import webdriver
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import XPathSelector
class ItalkiSpider(CrawlSpider):
    """Crawl italki Korean entries; 'Show More' pagination via Selenium."""

    name = "italki"
    allowed_domains = ['italki.com']
    start_urls = ['http://www.italki.com/entries/korean']
    # not sure if the rule is set correctly
    # Fixed: "\entry" is an invalid regex escape; a raw 'entry' pattern
    # matches the post URLs.  NOTE(review): overriding parse() on a
    # CrawlSpider disables Rule processing, so this rule never fires.
    rules = (Rule(LxmlLinkExtractor(allow=r'entry'), callback="parse_post", follow=True),)

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        # adding necessary search parameters to the URL
        self.driver.get(response.url + "#language=korean&author-language=russian&marks-min=-5&sort=1&page=1")
        # pressing the "Show More" button at the bottom of the search results
        # page to load the next 15 posts; the button disappears when all
        # results are loaded.  '@' restored in the XPaths (markdown mangling).
        more_btn = self.driver.find_element_by_xpath('//a[@id="a_show_more"]')
        while more_btn:
            more_btn.click()
            # sometimes waiting 5 sec made the spider close prematurely,
            # so keep it long in case the server is slow
            time.sleep(10)
        # collect links to all posts loaded on the page
        links = self.driver.find_elements_by_xpath('/html/body/div[8]/div/div[1]/div[3]/div[3]/ul/li/div[2]/a')
        for link in links:
            link.click()
            time.sleep(3)

    def parse_post(self, response):
        """Save the post body into an item (never reached -- see rules note)."""
        hxs = Selector(response)
        item = ItalkiItem()
        item["post_item"] = hxs.xpath('//div[@id="a_NMContent"]/text()').extract()
        return item
Let's think about it a bit differently:
open the page in the browser and click "Show More" until you get to the desired page
initialize a scrapy TextResponse with the current page source (with all necessary posts loaded)
for every post initialize an Item, yield a Request to the post page and pass an item instance from a request to a response in the meta dictionary
Notes and changes I'm introducing:
use a normal Spider class
use Selenium Waits to wait for the "Show More" button to be visible
closing the driver instance in spider_closed signal dispatcher
The code:
import scrapy
from scrapy import signals
from scrapy.http import TextResponse
from scrapy.xlib.pydispatch import dispatcher
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class ItalkiItem(scrapy.Item):
    # Post title text.
    title = scrapy.Field()
    # Absolute URL of the post page.
    url = scrapy.Field()
    # Raw text nodes of the post body.
    text = scrapy.Field()
class ItalkiSpider(scrapy.Spider):
    """Load all Korean entries with Selenium, then let Scrapy parse them."""

    name = "italki"
    allowed_domains = ['italki.com']
    start_urls = ['http://www.italki.com/entries/korean']

    def __init__(self):
        self.driver = webdriver.Firefox()
        # Shut the browser down when the spider finishes.
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.driver.close()

    def parse(self, response):
        # selenium part of the job
        self.driver.get('http://www.italki.com/entries/korean')
        while True:
            more_btn = WebDriverWait(self.driver, 10).until(
                EC.visibility_of_element_located((By.ID, "a_show_more"))
            )
            more_btn.click()
            # stop when we reach the desired page
            if self.driver.current_url.endswith('page=52'):
                break
        # now scrapy should do the job; '@' restored in the XPaths below
        # (markdown had mangled them to '#').
        response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
        for post in response.xpath('//ul[@id="content"]/li'):
            item = ItalkiItem()
            item['title'] = post.xpath('.//a[@class="title_txt"]/text()').extract()[0]
            item['url'] = post.xpath('.//a[@class="title_txt"]/@href').extract()[0]
            # Pass the partially-filled item to the post page via meta.
            yield scrapy.Request(item['url'], meta={'item': item}, callback=self.parse_post)

    def parse_post(self, response):
        item = response.meta['item']
        item["text"] = response.xpath('//div[@id="a_NMContent"]/text()').extract()
        return item
This is something you should use as a base code and improve to fill out all other fields, like author or author_url. Hope that helps.