I want to ask how to crawl a site by clicking the "next" button (which changes the page number) and keep crawling until the last page, starting from this site.
I've tried combining Scrapy with Selenium, but it still fails with this error:
line 22
    self.driver = webdriver.Firefox()
    ^
IndentationError: expected an indented block
I don't know why this happens; I thought my code was fine. Can anybody resolve this problem?
This is my source:
from selenium import webdriver
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from now.items import NowItem

class MySpider(BaseSpider):
    name = "nowhere"
    allowed_domains = ["n0where.net"]
    start_urls = ["https://n0where.net/"]

    def parse(self, response):
        for article in response.css('.loop-panel'):
            item = NowItem()
            item['title'] = article.css('.article-title::text').extract_first()
            item['link'] = article.css('.loop-panel>a::attr(href)').extract_first()
            item['body'] = ''.join(article.css('.excerpt p::text').extract()).strip()
            #item['date'] = article.css('[itemprop="datePublished"]::attr(content)').extract_first()
            yield item

    def __init__(self):
    self.driver = webdriver.Firefox()

    def parse2(self, response):
    self.driver.get(response.url)
    while True:
        next = self.driver.find_element_by_xpath('/html/body/div[4]/div[3]/div/div/div/div/div[1]/div/div[6]/div/a[8]/span')
        try:
            next.click()
            # get the data and write it to scrapy items
        except:
            break
    self.driver.close()
Ignoring the syntax and indentation errors, you have an issue with your code logic in general.
What you do is create a webdriver and never use it. What your spider does here is:
1) Create a webdriver object.
2) Schedule a request for every url in self.start_urls; in your case it's only one.
3) Download it, make a Response object and pass it to self.parse().
4) Your parse method runs some selectors and builds some items, so Scrapy yields you whatever items were found, if any.
5) Done.
Your parse2 was never called, so your Selenium webdriver was never used.
Since you are not using Scrapy to download anything in this case, you can just override your spider's start_requests() method (that's where your spider starts) and put the whole logic there.
Something like:
from selenium import webdriver
import scrapy
from scrapy import Selector


class MySpider(scrapy.Spider):
    name = "nowhere"
    allowed_domains = ["n0where.net"]
    start_url = "https://n0where.net/"

    def start_requests(self):
        driver = webdriver.Firefox()
        driver.get(self.start_url)
        while True:
            next_url = driver.find_element_by_xpath(
                '/html/body/div[4]/div[3]/div/div/div/div/div[1]/div/div[6]/div/a[8]/span')
            try:
                # parse the body your webdriver has
                self.parse(driver.page_source)
                # click the button to go to next page
                next_url.click()
            except:
                break
        driver.close()

    def parse(self, body):
        # create Selector from html string
        sel = Selector(text=body)
        # parse it
        for article in sel.css('.loop-panel'):
            item = dict()
            item['title'] = article.css('.article-title::text').extract_first()
            item['link'] = article.css('.loop-panel>a::attr(href)').extract_first()
            item['body'] = ''.join(article.css('.excerpt p::text').extract()).strip()
            # item['date'] = article.css('[itemprop="datePublished"]::attr(content)').extract_first()
            yield item
This is an indentation error. Look at the lines near the error:
def parse2(self, response):
self.driver.get(response.url)
The first of these two lines ends with a colon. So, the second line should be more indented than the first one.
There are two possible fixes, depending on what you want to do. Either add an indentation level to the second one:
def parse2(self, response):
    self.driver.get(response.url)
Or move the parse2 function out of the __init__ function:
def parse2(self, response):
    self.driver.get(response.url)

def __init__(self):
    self.driver = webdriver.Firefox()
    # etc.
Related
I'm trying to scrape a list of links inside a website with PhantomJS and Selenium (and Scrapy). I'm new to PhantomJS and Selenium, so I'll ask here.
I think the website has a session timeout, because I can scrape only the first of those links. Then I get this error:
NoSuchWindowException: Message: {"errorMessage":"Currently Window
handle/name is invalid
(closed?)","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"460","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:33038","User-Agent":"Python-urllib/2.7"},"httpVersion":"1.1","method":"POST","post":"{\"url\":
That's part of my code:
class bllanguage(scrapy.Spider):
    handle_httpstatus_list = [302]
    name = "bllanguage"
    download_delay = 1
    allowed_domains = ["http://explore.com/"]

    f = open("link")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()

    def __init__(self):
        self.driver = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs')

    def start_requests(self):
        for u in self.start_urls:
            r = scrapy.Request(url=u, dont_filter=True, callback=self.parse)
            r.meta['dont_redirect'] = True
            yield r

    def parse(self, response):
        self.driver.get(response.url)
        #print response.url
        search_field = []
        # etc.
The session timeout problem is just my interpretation; I've seen other messages like that, but none of them had a solution. What I would like to try is to "close" the request to each link inside the "link" file. I don't know if this is something PhantomJS does naturally or whether I have to add something: I've seen there is a resourceTimeout setting. Is it the right thing to use, and where can I put it in my code?
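Whether a timeout is really the cause here is hard to say, but if you want to try resourceTimeout, one place it can be set is in the desired capabilities passed when the PhantomJS driver is created. A minimal sketch, assuming the phantomjs.page.settings.resourceTimeout capability key (double-check it and the value against the PhantomJS/GhostDriver docs):
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# copy PhantomJS's default capabilities and add a per-resource timeout (milliseconds)
caps = DesiredCapabilities.PHANTOMJS.copy()
caps['phantomjs.page.settings.resourceTimeout'] = 10000  # 10 s; the value is only an example

driver = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs',
                             desired_capabilities=caps)
In the spider, that creation would go inside __init__, assigned to self.driver as you already do.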
Can you please help me correct this script? I have a list of search-result links and I want to visit and crawl each one of them.
But this script clicks just the first link, and then my crawler stops.
Any help is appreciated.
Spider code:
from scrapy.contrib.spiders import CrawlSpider
from scrapy import Selector
from selenium import webdriver
from selenium.webdriver.support.select import Select
from time import sleep
import selenium.webdriver.support.ui as ui
from scrapy.xlib.pydispatch import dispatcher
from scrapy.http import HtmlResponse, TextResponse
from extraction.items import ProduitItem
from scrapy import log


class RunnerSpider(CrawlSpider):
    name = 'products_d'
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        sel = Selector(response)
        self.driver.get(response.url)

        recherche = self.driver.find_element_by_xpath('//*[@id="twotabsearchtextbox"]')
        recherche.send_keys("A")
        recherche.submit()

        resultat = self.driver.find_element_by_xpath('//ul[@id="s-results-list-atf"]')
        # Links
        resultas = resultat.find_elements_by_xpath('//li/div[@class="s-item-container"]/div/div/div[2]/div[1]/a')
        links = []
        for lien in resultas:
            l = lien.get_attribute('href')
            links.append(l)

        for result in links:
            item = ProduitItem()
            link = result
            self.driver.get(link)
            item['URL'] = link
            item['Title'] = self.driver.find_element_by_xpath('//h1[@id="aiv-content-title"]').text
            yield item

        self.driver.close()
So there are a few issues with your script.
1) Your parse function overrides CrawlSpider's implementation of the same function. That means that CrawlSpider's default behaviour, which is in charge of extracting links from the page for continued crawling, is not being called. That's not recommended when using CrawlSpider. See here for details:
http://doc.scrapy.org/en/latest/topics/spiders.html
2) You don't yield any follow-up URLs yourself. You only yield Items. If you want Scrapy to keep processing URLs, you have to yield some form of Request object alongside your items (see the sketch after this list).
3) You kill Selenium's driver at the end of the parse function. That will probably cause it to fail on a followup call anyway. There's no need to do that.
4) You're using Selenium & Scrapy's URL grabbing concurrently. That's not necessarily wrong, but keep in mind that it might result in some erratic behaviour.
5) Your script's indentation is definitely off, which makes it difficult to look at your code.
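To illustrate point 2, here is a minimal, untested sketch of yielding Requests alongside items. RunnerSpiderReworked and parse_product are hypothetical names, the XPath is copied from your code, and whether it matches a page downloaded by Scrapy (without JavaScript) is something you would have to verify:
import scrapy
from extraction.items import ProduitItem


class RunnerSpiderReworked(scrapy.Spider):  # plain Spider, since parse() is overridden anyway
    name = 'products_d'
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com']

    def parse(self, response):
        links = []  # ... fill this with the product links you collect via Selenium ...
        for link in links:
            # yield a Request for each product page so Scrapy keeps crawling,
            # instead of fetching every link with the webdriver inside this method
            yield scrapy.Request(link, callback=self.parse_product)

    def parse_product(self, response):
        # let Scrapy download the product page and extract from the response
        item = ProduitItem()
        item['URL'] = response.url
        item['Title'] = response.xpath('//h1[@id="aiv-content-title"]/text()').extract()
        yield item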
This Selenium code merged with Scrapy is working fine, with only one problem:
I need to update sites = response.xpath(...) every time with the new source code the page generates; otherwise it keeps returning the same repetitive results again and again.
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.http import TextResponse
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from selenium import webdriver
import time


class Product(scrapy.Item):
    title = scrapy.Field()


class FooSpider(CrawlSpider):
    name = 'foo'
    start_urls = ["https://www.example.com"]

    def __init__(self, *args, **kwargs):
        super(FooSpider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Chrome(executable_path="C:\chrm\chromedriver.exe")
        self.browser.implicitly_wait(60)

    def parse(self, response):
        self.browser.get(response.url)
        sites = response.xpath('//div[@class="single-review"]/div[@class="review-header"]')

        for i in range(0, 200):
            items = []
            time.sleep(20)
            button = self.browser.find_element_by_xpath("/html/body/div[4]/div[6]/div[1]/div[2]/div[2]/div[1]/div[2]/button[1]/div[2]/div/div")
            button.click()
            self.browser.implicitly_wait(30)
            for site in sites:
                item = Product()
                item['title'] = site.xpath('.//div[@class="review-info"]/span[@class="author-name"]/a/text()').extract()
                yield item
You need to create a new Selector instance inside the loop after the click, passing it the current page source from .page_source:
from scrapy.selector import Selector

self.browser.implicitly_wait(30)
for i in range(0, 200):
    time.sleep(20)  # TODO: a delay like this doesn't look good
    button = self.browser.find_element_by_xpath("/html/body/div[4]/div[6]/div[1]/div[2]/div[2]/div[1]/div[2]/button[1]/div[2]/div/div")
    button.click()

    sel = Selector(text=self.browser.page_source)
    sites = sel.xpath('//div[@class="single-review"]/div[@class="review-header"]')
    for site in sites:
        item = Product()
        item['title'] = site.xpath('.//div[@class="review-info"]/span[@class="author-name"]/a/text()').extract()
        yield item
Note that you need to call implicitly_wait() only once. It doesn't add an immediate delay; it only instructs Selenium to wait up to X seconds when searching for elements.
Also, I doubt you really need the time.sleep(20) call. Instead, you may want to start using Explicit Waits, as sketched below.
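For example, an explicit wait on the same button could replace the fixed sleep. This is an untested sketch; click_show_more is a hypothetical helper, and the XPath stays whatever you use in your spider:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def click_show_more(browser, xpath, timeout=30):
    # wait up to `timeout` seconds for the button to become clickable,
    # instead of sleeping a fixed 20 seconds on every iteration
    button = WebDriverWait(browser, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))
    button.click()
Called from inside the loop in parse() with self.browser and your button XPath, it would replace both the sleep and the repeated implicitly_wait() call.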
I have read all the threads on using Scrapy for AJAX pages and installed the Selenium webdriver to simplify the task. My spider can partially crawl, but it can't get any data into my Items.
My objectives are:
Crawl from this page to this page
Scrape each item's (post's):
author_name (xpath: /html/body/div[8]/div/div[1]/div[3]/div[3]/ul/li[2]/div[2]/span[2]/ul/li[3]/a/text())
author_page_url (xpath: /html/body/div[8]/div/div[1]/div[3]/div[3]/ul/li[2]/div[2]/span[2]/ul/li[3]/a/@href)
post_title (xpath: //a[@class="title_txt"])
post_page_url (xpath: //a[@class="title_txt"]/@href)
post_text (xpath, on a separate post page: //div[@id="a_NMContent"]/text())
This is my monkey code (I am only taking my first steps in Python, as an aspiring natural language processing student who majored in linguistics in the past):
import scrapy
import time
from selenium import webdriver
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import XPathSelector


class ItalkiSpider(CrawlSpider):
    name = "italki"
    allowed_domains = ['italki.com']
    start_urls = ['http://www.italki.com/entries/korean']
    # not sure if the rule is set correctly
    rules = (Rule(LxmlLinkExtractor(allow="\entry"), callback="parse_post", follow=True),)

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        # adding necessary search parameters to the URL
        self.driver.get(response.url + "#language=korean&author-language=russian&marks-min=-5&sort=1&page=1")
        # pressing the "Show More" button at the bottom of the search results page to show the next 15 posts,
        # when all results are loaded to the page, the button disappears
        more_btn = self.driver.find_element_by_xpath('//a[@id="a_show_more"]')
        while more_btn:
            more_btn.click()
            # sometimes waiting for 5 sec made spider close prematurely so keeping it long in case the server is slow
            time.sleep(10)
        # here is where the problem begins, I am making a list of links to all the posts on the big page,
        # but I am afraid links will contain only the first link, because selenium doesn't do the multiple
        # selection as one would expect from this xpath... how can I grab all the links and put them in
        # the links list (and should I?)
        links = self.driver.find_elements_by_xpath('/html/body/div[8]/div/div[1]/div[3]/div[3]/ul/li/div[2]/a')
        for link in links:
            link.click()
            time.sleep(3)

    # this is the function for parsing individual posts, called back by the *parse* method as specified in the
    # rule of the spider; if it is correct, it should have saved at least one post into an item... I don't really
    # understand how and where this callback function gets the response from the new page (the page of the post
    # in this case)... is it automatically loaded to drive and then passed on to the callback function as soon as
    # selenium has clicked on the link (link.click())? or is it all total nonsense...
    def parse_post(self, response):
        hxs = Selector(response)
        item = ItalkiItem()
        item["post_item"] = hxs.xpath('//div[@id="a_NMContent"]/text()').extract()
        return item
Let's think about it a bit differently:
open the page in the browser and click "Show More" until you get to the desired page
initialize a scrapy TextResponse with the current page source (with all necessary posts loaded)
for every post initialize an Item, yield a Request to the post page and pass an item instance from a request to a response in the meta dictionary
Notes and changes I'm introducing:
use a normal Spider class
use Selenium Waits to wait for the "Show More" button to be visible
close the driver instance in a spider_closed signal handler
The code:
import scrapy
from scrapy import signals
from scrapy.http import TextResponse
from scrapy.xlib.pydispatch import dispatcher

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class ItalkiItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    text = scrapy.Field()


class ItalkiSpider(scrapy.Spider):
    name = "italki"
    allowed_domains = ['italki.com']
    start_urls = ['http://www.italki.com/entries/korean']

    def __init__(self):
        self.driver = webdriver.Firefox()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.driver.close()

    def parse(self, response):
        # selenium part of the job
        self.driver.get('http://www.italki.com/entries/korean')
        while True:
            more_btn = WebDriverWait(self.driver, 10).until(
                EC.visibility_of_element_located((By.ID, "a_show_more"))
            )
            more_btn.click()

            # stop when we reach the desired page
            if self.driver.current_url.endswith('page=52'):
                break

        # now scrapy should do the job
        response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
        for post in response.xpath('//ul[@id="content"]/li'):
            item = ItalkiItem()
            item['title'] = post.xpath('.//a[@class="title_txt"]/text()').extract()[0]
            item['url'] = post.xpath('.//a[@class="title_txt"]/@href').extract()[0]

            yield scrapy.Request(item['url'], meta={'item': item}, callback=self.parse_post)

    def parse_post(self, response):
        item = response.meta['item']
        item["text"] = response.xpath('//div[@id="a_NMContent"]/text()').extract()
        return item
This is something you should use as base code and improve on to fill out the other fields, like author or author_url (see the sketch below). Hope that helps.
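For instance, the author fields could be filled in the same loop in parse(). This is only a guess: extract_author is a hypothetical helper, and the relative XPath is derived from the absolute one in your question, so verify it against the actual markup. ItalkiItem would also need author_name and author_url added as Fields.
def extract_author(post, item):
    # `post` is one of the li selectors from the loop in parse(); the relative
    # XPath below is guessed from the absolute XPath in the question -- unverified
    author_link = post.xpath('.//span[2]/ul/li[3]/a')
    item['author_name'] = author_link.xpath('text()').extract()
    item['author_url'] = author_link.xpath('@href').extract()
    return item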
As a first-time Scrapy user, I am hoping to be able to scrape deal information from Amazon.com, more specifically this page: http://www.amazon.com/Cyber-Monday/b/ref=sv_gb_2?ie=UTF8&node=5550342011&gb_hero_f_100=p:1,c:all,s:missed.
Sorry, I wish I could post a screenshot here, but I don't have enough reputation.
I want to extract all of the deal item information (the title, price and % off of each of the 7 deals, plus the other deals reached by clicking the "next" button on the page) under the "upcoming" and "missed deals" sections. I tried Scrapy with the code below, but had no luck. My thinking on the potential problems:
(1) I defined the wrong XPath in either "rules" or "parse_items" (which is possible, but not likely, because I copied the XPath using the Chrome developer tools).
(2) The site runs on AJAX, which would then prompt me to use Selenium, as other threads suggested.
Here is my code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import Selector, HtmlXPathSelector
from selenium import selenium
from deal.items import DealItem


class Dealspider(BaseSpider):
    name = 'deal'
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com/b/ref=br_imp_ara-1?_encoding=UTF8&node=5550342011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=desktop-hero-2&pf_rd_r=16WPRNKJ91B97JW7TQ27&pf_rd_t=36701&pf_rd_p=1990071642&pf_rd_i=desktop']

    rules = (
        Rule(SgmlLinkExtractor(allow=('//td[@id="missed_filter"]'),
                               restrict_xpaths=('//a[starts-with(@title,"Next ")]',)),
             callback='parse_items'),
        Rule(SgmlLinkExtractor(allow=('//td[@id="upcoming_filter"]'),
                               restrict_xpaths=('//a[starts-with(@title,"Next ")]',)),
             callback='parse_items_2'),
    )

    def __init__(self):
        CrawlSpider.__init__(self)
        self.verificationErrors = []
        self.selenium = selenium("localhost", 4444, "*chrome", "http://www.amazon.com")
        self.selenium.start()

    def __del__(self):
        self.selenium.stop()
        print self.verificationErrors
        CrawlSpider.__del__(self)
    # parse for missed deals
    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        pdt = hxs.select('//ul[@class="ulResized pealdshoveler"]')
        sel = self.selenium
        sel.open(response.url)  # I don't know where the url is
        items = []
        for t in pdt:
            item = dealItem()
            item["missedproduct"] = t.select('//li[contains(@id,"dealTitle")]/a/@title').extract()
            item["price"] = t.select('//li[contains(@id,"dealDealPrice")]/b').extract()
            item["percentoff"] = t.select('//li[contains(@id,"dealPercentOff")]/span').extract()
            items.append(item)
        return items
    # parse for upcoming deals
    def parse_items_2(self, response):
        hxs = HtmlXPathSelector(response)
        pdt = hxs.select('//ul[@class="ulResized pealdshoveler"]')
        itemscurrent = []
        for t in pdt:
            item = dealItem()
            item["c_product"] = t.select('//li[contains(@id,"dealTitle")]/a/text()').extract()
            item["c_price"] = t.select('//li[contains(@id,"dealDealPrice")]/b').extract()
            item["c_percentoff"] = t.select('//li[contains(@id,"dealPercentOff")]/span').extract()
            items.append(item)
        return itemscurrent
At this moment, Scrapy returns nothing, and I am simply desperate to work this problem out; I hope those of you who are smarter than me can help me out.
Whatever insights you have, please put them here; it would be greatly appreciated!! =) Thank you!
I can confirm that Selenium is a workable approach to scrape it.
Here is a partial solution you can build on; it finds the deals and prints their titles:
class AmazonSpider(CrawlSpider):
    name = "amazon"
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com/b/ref=br_imp_ara-1?_encoding=UTF8&node=5550342011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=desktop-hero-2&pf_rd_r=16WPRNKJ91B97JW7TQ27&pf_rd_t=36701&pf_rd_p=1990071642&pf_rd_i=desktop']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)
        for element in self.driver.find_elements_by_css_selector('a.titleLink'):
            print element.text
        self.driver.close()
The result would be:
Up to 50% Off Select Hasbro Toys
Over 45% Off the Canon PowerShot S110 Digital Camera
Up to 60% Off Digital Cameras for Kids
"Dragon Age Inquisition"
I suggest you read the Selenium documentation on action chains to simulate the user pressing the "next" link; a rough sketch of such a loop follows.
(http://selenium-python.readthedocs.org/en/latest/api.html#module-selenium.webdriver.common.action_chains)
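For instance, a loop along these lines would click through the pages and re-read the rendered source each time. It is untested: AmazonDealsSpider is a hypothetical variant of the spider above, and the locator for the "Next" link is borrowed from the restrict_xpaths in your rules, so inspect the live page to confirm it matches. A plain click loop may be enough; ActionChains are only needed if the element requires hovering first.
from scrapy.contrib.spiders import CrawlSpider
from scrapy.selector import Selector
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


class AmazonDealsSpider(CrawlSpider):
    name = "amazon_deals"
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com/b/ref=br_imp_ara-1?_encoding=UTF8&node=5550342011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=desktop-hero-2&pf_rd_r=16WPRNKJ91B97JW7TQ27&pf_rd_t=36701&pf_rd_p=1990071642&pf_rd_i=desktop']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)
        while True:
            # hand the currently rendered page to whatever extraction code you write
            sel = Selector(text=self.driver.page_source)
            # ... pull deal titles / prices / percent-off out of `sel` here ...
            try:
                # locator borrowed from the question's restrict_xpaths; verify it on the live page
                next_link = self.driver.find_element_by_xpath('//a[starts-with(@title, "Next")]')
                next_link.click()
            except NoSuchElementException:
                break
        self.driver.close()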