As a first-time Scrapy user, I am hoping to be able to scrape deal information from Amazon.com, specifically this page: http://www.amazon.com/Cyber-Monday/b/ref=sv_gb_2?ie=UTF8&node=5550342011&gb_hero_f_100=p:1,c:all,s:missed.
Sorry, I wish I could post a screenshot here, but I don't have enough reputation.
I want to extract all of the deal information under the "Upcoming" and "Missed Deals" sections: the title, price, and percent off for each of the 7 visible deals, plus the further deals reached by clicking the "Next" button on the page. I tried Scrapy with the code below, but had no luck. My thinking on the potential problems is:
(1) I defined the wrong XPath in either "rules" or "parse_items" (possible, but unlikely, because I copied the XPath using the Chrome developer tools).
(2) The site loads content via AJAX, which would push me toward using Selenium, as other threads have suggested.
Here is my code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import Selector, HtmlXPathSelector
from selenium import selenium
from deal.items import DealItem

class Dealspider(BaseSpider):
    name = 'deal'
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com/b/ref=br_imp_ara-1?_encoding=UTF8&node=5550342011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=desktop-hero-2&pf_rd_r=16WPRNKJ91B97JW7TQ27&pf_rd_t=36701&pf_rd_p=1990071642&pf_rd_i=desktop']

    rules = (
        Rule(SgmlLinkExtractor(allow=('//td[@id="missed_filter"]'),
                               restrict_xpaths=('//a[starts-with(@title,"Next ")]',)),
             callback='parse_items'),
        Rule(SgmlLinkExtractor(allow=('//td[@id="upcoming_filter"]'),
                               restrict_xpaths=('//a[starts-with(@title,"Next ")]',)),
             callback='parse_items_2'),
    )

    def __init__(self):
        CrawlSpider.__init__(self)
        self.verificationErrors = []
        self.selenium = selenium("localhost", 4444, "*chrome", "http://www.amazon.com")
        self.selenium.start()

    def __del__(self):
        self.selenium.stop()
        print self.verificationErrors
        CrawlSpider.__del__(self)

    # parse for missed deal
    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        pdt = hxs.select('//ul[@class="ulResized pealdshoveler"]')
        sel = self.selenium
        sel.open(response.url)  # I don't know where the url is
        items = []
        for t in pdt:
            item = dealItem()
            item["missedproduct"] = t.select('//li[contains(@id,"dealTitle")]/a/@title').extract()
            item["price"] = t.select('//li[contains(@id,"dealDealPrice")]/b').extract()
            item["percentoff"] = t.select('//li[contains(@id,"dealPercentOff")]/span').extract()
            items.append(item)
        return items

    # parse for upcoming deal
    def parse_items_2(self, response):
        hxs = HtmlXPathSelector(response)
        pdt = hxs.select('//ul[@class="ulResized pealdshoveler"]')
        itemscurrent = []
        for t in pdt:
            item = dealItem()
            item["c_product"] = t.select('//li[contains(@id,"dealTitle")]/a/text()').extract()
            item["c_price"] = t.select('//li[contains(@id,"dealDealPrice")]/b').extract()
            item["c_percentoff"] = t.select('//li[contains(@id,"dealPercentOff")]/span').extract()
            items.append(item)
        return itemscurrent
At the moment, Scrapy returns nothing, and I have not been able to work this problem out myself; I hope those of you with more experience can help me out.
Whatever insights you have, please share them here; it would be greatly appreciated! =) Thank you!
I can confirm that Selenium is a workable approach for scraping it.
Here is a partial solution you can build on, which finds the deals and prints their titles:
from scrapy.contrib.spiders import CrawlSpider
from selenium import webdriver

class AmazonSpider(CrawlSpider):
    name = "amazon"
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com/b/ref=br_imp_ara-1?_encoding=UTF8&node=5550342011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=desktop-hero-2&pf_rd_r=16WPRNKJ91B97JW7TQ27&pf_rd_t=36701&pf_rd_p=1990071642&pf_rd_i=desktop']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)
        # each deal title is rendered as an <a class="titleLink"> element
        for element in self.driver.find_elements_by_css_selector('a.titleLink'):
            print element.text
        self.driver.close()
The result would be:
Up to 50% Off Select Hasbro Toys
Over 45% Off the Canon PowerShot S110 Digital Camera
Up to 60% Off Digital Cameras for Kids
"Dragon Age Inquisition"
I suggest you read the Selenium documentation on action chains to simulate the user pressing the "Next" link:
http://selenium-python.readthedocs.org/en/latest/api.html#module-selenium.webdriver.common.action_chains
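As a rough, untested sketch of that idea (the XPath for the "Next" link is borrowed from the rules in your question and may need adjusting for the live page), the paging loop could look like this:

from selenium.common.exceptions import NoSuchElementException

def print_all_deal_pages(driver):
    while True:
        # print the deals visible on the current page
        for element in driver.find_elements_by_css_selector('a.titleLink'):
            print element.text
        # move on to the next page of deals, stopping when there is no "Next" link
        try:
            next_link = driver.find_element_by_xpath('//a[starts-with(@title, "Next ")]')
        except NoSuchElementException:
            break
        next_link.click()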
Related
I want to ask how to do crawling by clicking the "next" button (which changes the page number of the website), and then keep crawling until the last page number, on this site.
I've tried combining Scrapy with Selenium, but it still errors and says "line 22
self.driver = webdriver.Firefox()
^
IndentationError: expected an indented block"
I don't know why this happens; I think my code is fine. Can anybody resolve this problem?
This is my source:
from selenium import webdriver
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from now.items import NowItem

class MySpider(BaseSpider):
    name = "nowhere"
    allowed_domains = ["n0where.net"]
    start_urls = ["https://n0where.net/"]

    def parse(self, response):
        for article in response.css('.loop-panel'):
            item = NowItem()
            item['title'] = article.css('.article-title::text').extract_first()
            item['link'] = article.css('.loop-panel>a::attr(href)').extract_first()
            item['body'] = ''.join(article.css('.excerpt p::text').extract()).strip()
            #item['date'] = article.css('[itemprop="datePublished"]::attr(content)').extract_first()
            yield item

    def __init__(self):
    self.driver = webdriver.Firefox()

    def parse2(self, response):
    self.driver.get(response.url)
    while True:
        next = self.driver.find_element_by_xpath('/html/body/div[4]/div[3]/div/div/div/div/div[1]/div/div[6]/div/a[8]/span')
        try:
            next.click()
            # get the data and write it to scrapy items
        except:
            break
    self.driver.close()
This is a capture of my program:
Ignoring the syntax and indentation errors, you have an issue with your code logic in general.
What you do is create a webdriver and never use it. What your spider does here is:
Create the webdriver object.
Schedule a request for every url in self.start_urls; in your case it's only one.
Download it, make a Response object, and pass it to self.parse().
Your parse method finds some nodes and makes some items, so Scrapy yields whatever items were found, if any.
Done.
Your parse2 was never called, and so your Selenium webdriver was never used.
Since you are not using Scrapy to download anything in this case, you can just override the start_requests() method of your spider (that's where your spider starts) and do the whole logic there.
Something like:
from selenium import webdriver
import scrapy
from scrapy import Selector

class MySpider(scrapy.Spider):
    name = "nowhere"
    allowed_domains = ["n0where.net"]
    start_url = "https://n0where.net/"

    def start_requests(self):
        driver = webdriver.Firefox()
        driver.get(self.start_url)
        while True:
            next_url = driver.find_element_by_xpath(
                '/html/body/div[4]/div[3]/div/div/div/div/div[1]/div/div[6]/div/a[8]/span')
            try:
                # parse the body your webdriver has and yield the resulting items
                for item in self.parse(driver.page_source):
                    yield item
                # click the button to go to the next page
                next_url.click()
            except:
                break
        driver.close()

    def parse(self, body):
        # create a Selector from the html string
        sel = Selector(text=body)
        # parse it
        for article in sel.css('.loop-panel'):
            item = dict()
            item['title'] = article.css('.article-title::text').extract_first()
            item['link'] = article.css('.loop-panel>a::attr(href)').extract_first()
            item['body'] = ''.join(article.css('.excerpt p::text').extract()).strip()
            # item['date'] = article.css('[itemprop="datePublished"]::attr(content)').extract_first()
            yield item
This is an indentation error. Look at the lines near the error:
def parse2(self, response):
self.driver.get(response.url)
The first of these two lines ends with a colon. So, the second line should be more indented than the first one.
There are two possible fixes, depending on what you want to do. Either add an indentation level to the second one:
def parse2(self, response):
    self.driver.get(response.url)
Or move the parse2 function out of the __init__ function:
def parse2(self, response):
    self.driver.get(response.url)

def __init__(self):
    self.driver = webdriver.Firefox()
    # etc.
I'm using the latest version of Scrapy (http://doc.scrapy.org/en/latest/index.html) and am trying to figure out how to make Scrapy crawl only the URL(s) fed to it as part of the start_urls list. In most cases I want to crawl only one page, but in some cases there may be multiple pages that I will specify. I don't want it to crawl to other pages.
I've tried setting the depth limit to 1, but in my testing I'm not sure it accomplished what I was hoping to achieve.
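For reference, this is roughly what I tried; I am assuming here that the DEPTH_LIMIT setting is the right knob for this:

# settings.py -- restrict how many levels of links the crawl may follow
DEPTH_LIMIT = 1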
Any help will be greatly appreciated!
Thank you!
2015-12-22 - Code update:
# -*- coding: utf-8 -*-
import scrapy

from generic.items import GenericItem

class GenericspiderSpider(scrapy.Spider):
    name = "genericspider"

    def __init__(self, domain, start_url, entity_id):
        self.allowed_domains = [domain]
        self.start_urls = [start_url]
        self.entity_id = entity_id

    def parse(self, response):
        for href in response.css("a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath("//body//a"):
            item = GenericItem()
            item['entity_id'] = self.entity_id
            # gets the actual email address
            item['emails'] = response.xpath("//a[starts-with(@href, 'mailto')]").re(r'mailto:\s*(.*?)"')
            yield item
Below, in the first response, you mention using a generic spider; isn't that what I'm doing in my code? Also, are you suggesting I remove the
callback=self.parse_dir_contents
from the parse function?
Thank you.
It looks like you are using CrawlSpider, which is a special kind of Spider for crawling multiple categories of pages.
To crawl only the URLs specified in start_urls, just override the parse method, as that is the default callback of the start requests.
Below is code for a spider that will scrape the title from a blog (note: the XPath might not be the same for every blog).
Filename: /spiders/my_spider.py
import scrapy

class MySpider(scrapy.Spider):
    name = "craig"
    allowed_domains = ["www.blogtrepreneur.com"]
    start_urls = ["http://www.blogtrepreneur.com/the-best-juice-cleanse-for-weight-loss/"]

    def parse(self, response):
        items = []
        item = DmozItem()  # your item class from items.py
        item["title"] = response.xpath('//h1/text()').extract()
        item["article"] = response.xpath('//div[@id="tve_editor"]//p//text()').extract()
        items.append(item)
        return items
The above code will only fetch the title and the article body of the given article.
I had the same problem, because I was using
import scrapy
from scrapy.spiders import CrawlSpider
Then I changed it to
import scrapy
from scrapy.spiders import Spider
and changed the class to
class mySpider(Spider):
Can you please help me correct this script? I have a list of search-result links, and I want to visit and crawl each one of these links.
But this script clicks just the first link, and then my crawler stops.
Any help is appreciated.
Spider code:
from scrapy.contrib.spiders import CrawlSpider
from scrapy import Selector
from selenium import webdriver
from selenium.webdriver.support.select import Select
from time import sleep
import selenium.webdriver.support.ui as ui
from scrapy.xlib.pydispatch import dispatcher
from scrapy.http import HtmlResponse, TextResponse
from extraction.items import ProduitItem
from scrapy import log

class RunnerSpider(CrawlSpider):
    name = 'products_d'
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        sel = Selector(response)
        self.driver.get(response.url)
        recherche = self.driver.find_element_by_xpath('//*[@id="twotabsearchtextbox"]')
        recherche.send_keys("A")
        recherche.submit()
        resultat = self.driver.find_element_by_xpath('//ul[@id="s-results-list-atf"]')
        # Links
        resultas = resultat.find_elements_by_xpath('//li/div[@class="s-item-container"]/div/div/div[2]/div[1]/a')
        links = []
        for lien in resultas:
            l = lien.get_attribute('href')
            links.append(l)
        for result in links:
            item = ProduitItem()
            link = result
            self.driver.get(link)
            item['URL'] = link
            item['Title'] = self.driver.find_element_by_xpath('//h1[@id="aiv-content-title"]').text
            yield item
        self.driver.close()
So there are a few issues with your script.
1) Your parse function overrides CrawlSpider's implementation of the same function. That means that CrawlSpider's default behaviour, which is in charge of extracting links from the page for continued crawling, is not being called. That's not recommended when using CrawlSpider. See here for details:
http://doc.scrapy.org/en/latest/topics/spiders.html
2) You don't yield any followup URLs yourself; you only yield Items. If you want Scrapy to keep processing URLs, you have to yield some form of Request object alongside your items (see the sketch after this list).
3) You kill Selenium's driver at the end of the parse function. That will probably cause it to fail on a followup call anyway. There's no need to do that.
4) You're using Selenium & Scrapy's URL grabbing concurrently. That's not necessarily wrong, but keep in mind that it might result in some erratic behaviour.
5) Your script's indentation is definitely off, which makes it difficult to read your code.
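To illustrate points 2) and 3), here is a minimal, untested sketch that lets Scrapy follow the result links itself instead of loading every product page through Selenium. The search URL and the XPaths are assumptions adapted from your code, so treat them as placeholders:

import scrapy
from extraction.items import ProduitItem

class RunnerSpider(scrapy.Spider):
    name = 'products_d'
    allowed_domains = ['amazon.com']
    # assumed: a search-results URL you can request directly, instead of typing
    # into the search box with Selenium
    start_urls = ['http://www.amazon.com/s?field-keywords=A']

    def parse(self, response):
        # yield one Request per result link so Scrapy keeps crawling (point 2)
        links = response.xpath('//li/div[@class="s-item-container"]//a/@href').extract()
        for link in links:
            yield scrapy.Request(response.urljoin(link), callback=self.parse_product)

    def parse_product(self, response):
        # one item per product page; there is no Selenium driver to close (point 3)
        item = ProduitItem()
        item['URL'] = response.url
        item['Title'] = response.xpath('//h1[@id="aiv-content-title"]/text()').extract_first()
        yield item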
I'm using a Scrapy web crawler to extract a bunch of data, as I describe here. I've figured out a brute-force way to get the information I want, but it's really pretty crude: I just enumerate all the pages I want to scrape, which is a few hundred. I need to get this done, so I might just grit my teeth and bear it, but it would be so much nicer to automate this. How could this process be implemented with link extraction using Scrapy? I've looked at the documentation and made some experiments, as I describe in the question linked above, but nothing has worked yet. This is the brute-force code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from brute_force.items import BruteForceItem

class DmozSpider(BaseSpider):
    name = "brutus"
    allowed_domains = ["tool.httpcn.com"]
    start_urls = [
        "http://tool.httpcn.com/Html/Zi/21/PWAZAZAZXVILEPWXV.shtml",
        "http://tool.httpcn.com/Html/Zi/21/PWAZAZCQCQILEPWB.shtml",
        "http://tool.httpcn.com/Html/Zi/21/PWAZAZCQKOILEPWD.shtml",
        "http://tool.httpcn.com/Html/Zi/21/PWAZAZCQUYILEPWF.shtml",
        "http://tool.httpcn.com/Html/Zi/21/PWAZAZCQMEILEKOCQ.shtml",
        "http://tool.httpcn.com/Html/Zi/21/PWAZAZCQRNILEKOKO.shtml",
        "http://tool.httpcn.com/Html/Zi/22/PWCQKOILUYUYKOTBCQ.shtml",
        "http://tool.httpcn.com/Html/Zi/21/PWAZAZAZRNILEPWRN.shtml",
        "http://tool.httpcn.com/Html/Zi/21/PWAZAZCQPWILEPWC.shtml",
        "http://tool.httpcn.com/Html/Zi/21/PWAZAZCQILILEPWE.shtml",
        "http://tool.httpcn.com/Html/Zi/21/PWAZAZCQTBILEKOAZ.shtml",
        "http://tool.httpcn.com/Html/Zi/21/PWAZAZCQXVILEKOPW.shtml",
        "http://tool.httpcn.com/Html/Zi/21/PWAZAZPWAZILEKOIL.shtml",
        "http://tool.httpcn.com/Html/Zi/22/PWCQKOILRNUYKOTBUY.shtml",
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        item = BruteForceItem()
        item["the_strokes"] = hxs.xpath('//*[@id="div_a1"]/div[2]').extract()
        item["character"] = hxs.xpath('//*[@id="div_a1"]/div[3]').extract()
        items.append(item)
        return items
I think this is what you want:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from brute_force.items import BruteForceItem
from urlparse import urljoin

class DmozSpider(BaseSpider):
    name = "brutus"
    allowed_domains = ["tool.httpcn.com"]
    start_urls = ['http://tool.httpcn.com/Zi/BuShou.html']

    def parse(self, response):
        for url in response.css('td a::attr(href)').extract():
            cb = self.parse if '/zi/bushou' in url.lower() else self.parse_item
            yield Request(urljoin(response.url, url), callback=cb)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = BruteForceItem()
        item["the_strokes"] = hxs.xpath('//*[@id="div_a1"]/div[2]').extract()
        item["character"] = hxs.xpath('//*[@id="div_a1"]/div[3]').extract()
        return item
Try this. The spider works as follows:
1. It starts with the start_urls.
2. In self.parse, I just find all the a tags inside td tags. If a URL contains '/zi/bushou', the response is sent to self.parse again, because it is what you called the 'second layer'. If it does not contain '/zi/bushou' (I think using a more specific regex here would be better), it is one of the pages you want, and it goes to the parse_item function.
3. self.parse_item is the function that you use to get the information from the final page.
I have read all the threads on using Scrapy for AJAX pages and have installed Selenium WebDriver to simplify the task. My spider can partially crawl, but it can't get any data into my Items.
My objectives are:
Crawl from this page to this page
Scrape each item(post)'s:
author_name (xpath:/html/body/div[8]/div/div[1]/div[3]/div[3]/ul/li[2]/div[2]/span[2]/ul/li[3]/a/text())
author_page_url (xpath:/html/body/div[8]/div/div[1]/div[3]/div[3]/ul/li[2]/div[2]/span[2]/ul/li[3]/a/@href)
post_title (xpath://a[@class="title_txt"])
post_page_url (xpath://a[@class="title_txt"]/@href)
post_text (xpath on a separate post page: //div[@id="a_NMContent"]/text())
This is my monkey code (I am only taking my first steps in Python, as an aspiring natural language processing student who majored in linguistics):
import scrapy
import time
from selenium import webdriver
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import XPathSelector

class ItalkiSpider(CrawlSpider):
    name = "italki"
    allowed_domains = ['italki.com']
    start_urls = ['http://www.italki.com/entries/korean']
    # not sure if the rule is set correctly
    rules = (Rule(LxmlLinkExtractor(allow="\entry"), callback="parse_post", follow=True),)

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        # adding necessary search parameters to the URL
        self.driver.get(response.url + "#language=korean&author-language=russian&marks-min=-5&sort=1&page=1")
        # pressing the "Show More" button at the bottom of the search results page to show the next 15 posts;
        # when all results are loaded to the page, the button disappears
        more_btn = self.driver.find_element_by_xpath('//a[@id="a_show_more"]')
        while more_btn:
            more_btn.click()
            # sometimes waiting for 5 sec made the spider close prematurely, so keeping it long in case the server is slow
            time.sleep(10)
        # here is where the problem begins: I am making a list of links to all the posts on the big page,
        # but I am afraid links will contain only the first link, because selenium doesn't do the multiple
        # selection as one would expect from this xpath... how can I grab all the links and put them in the
        # links list (and should I?)
        links = self.driver.find_elements_by_xpath('/html/body/div[8]/div/div[1]/div[3]/div[3]/ul/li/div[2]/a')
        for link in links:
            link.click()
            time.sleep(3)

    # this is the function for parsing individual posts, called back by the *parse* method as specified in the
    # rule of the spider; if it is correct, it should have saved at least one post into an item... I don't really
    # understand how and where this callback function gets the response from the new page (the page of the post
    # in this case)... is it automatically loaded to the driver and then passed on to the callback function as
    # soon as selenium has clicked on the link (link.click())? or is it all total nonsense...
    def parse_post(self, response):
        hxs = Selector(response)
        item = ItalkiItem()
        item["post_item"] = hxs.xpath('//div[@id="a_NMContent"]/text()').extract()
        return item
Let's think about it a bit differently:
open the page in the browser and click "Show More" until you get to the desired page
initialize a scrapy TextResponse with the current page source (with all necessary posts loaded)
for every post initialize an Item, yield a Request to the post page and pass an item instance from a request to a response in the meta dictionary
Notes and changes I'm introducing:
use a normal Spider class
use Selenium Waits to wait for the "Show More" button to be visible
close the driver instance in a spider_closed signal handler
The code:
import scrapy
from scrapy import signals
from scrapy.http import TextResponse
from scrapy.xlib.pydispatch import dispatcher

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class ItalkiItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    text = scrapy.Field()

class ItalkiSpider(scrapy.Spider):
    name = "italki"
    allowed_domains = ['italki.com']
    start_urls = ['http://www.italki.com/entries/korean']

    def __init__(self):
        self.driver = webdriver.Firefox()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.driver.close()

    def parse(self, response):
        # selenium part of the job
        self.driver.get('http://www.italki.com/entries/korean')
        while True:
            more_btn = WebDriverWait(self.driver, 10).until(
                EC.visibility_of_element_located((By.ID, "a_show_more"))
            )
            more_btn.click()

            # stop when we reach the desired page
            if self.driver.current_url.endswith('page=52'):
                break

        # now scrapy should do the job
        response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
        for post in response.xpath('//ul[@id="content"]/li'):
            item = ItalkiItem()
            item['title'] = post.xpath('.//a[@class="title_txt"]/text()').extract()[0]
            item['url'] = post.xpath('.//a[@class="title_txt"]/@href').extract()[0]
            yield scrapy.Request(item['url'], meta={'item': item}, callback=self.parse_post)

    def parse_post(self, response):
        item = response.meta['item']
        item["text"] = response.xpath('//div[@id="a_NMContent"]/text()').extract()
        return item
This is something you should use as base code and improve on to fill out all the other fields, like author or author_url. Hope that helps.
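For example, the author fields could be filled in inside the same loop. A rough, untested sketch: the relative XPaths below are adapted from the absolute ones in the question and will likely need adjusting, and the two extra fields have to be added to ItalkiItem first:

class ItalkiItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    text = scrapy.Field()
    # assumed extra fields
    author_name = scrapy.Field()
    author_url = scrapy.Field()

# then, inside the "for post in response.xpath(...)" loop in parse():
item['author_name'] = post.xpath('.//div[2]/span[2]/ul/li[3]/a/text()').extract_first()
item['author_url'] = post.xpath('.//div[2]/span[2]/ul/li[3]/a/@href').extract_first()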