I'm trying to use Scrapy to crawl a page that contains a lot of links, but my code so far only shows the contents of the first link.
What mistake have I made?
from scrapy.spiders import BaseSpider
from scrapy.spiders import Spider
from scrapy.http.request import Request
from scrapy.selector import Selector
from Proje.items import ProjeItem
class ProjeSpider(BaseSpider):
    """Spider from the question: follow the links in the index list and scrape each page's title.

    NOTE: ``parse`` deliberately keeps the ``return`` inside its loop. That is
    the behaviour the question is asking about (only the first link is crawled).
    """

    name = "someweb"
    allowed_domains = ["someweb.com"]
    # The start URL(s) were not included in the original post.
    start_urls = []

    def parse(self, response):
        """Build a request for a link found inside the index container."""
        for sel in response.xpath('//ul[@id="indeks-container"]'):
            for tete in sel.xpath('//linkkk').re('//linkkk.*?(?=")'):
                links = 'http:' + str(tete)
                req = Request(links, callback=self.kontene)
                # BUG (subject of the question): ``return`` leaves ``parse`` on
                # the very first iteration, so only one request is scheduled.
                return req

    def kontene(self, response):
        """Yield one item holding the page title of a followed link."""
        for mbuh in response.xpath('//head'):
            Item = ProjeItem()
            Item['title'] = mbuh.xpath('//title/text()').extract()
            yield Item
According to the Scrapy docs, parse needs to return an iterable of Request objects, i.e. a list or a generator. Just change return to yield and it should work as expected:
def parse(self, response):
    """Yield one Request per link found inside the index container.

    Because this is a generator, the loops keep running after each request is
    produced, so every link is scheduled instead of only the first one.
    """
    for sel in response.xpath('//ul[@id="indeks-container"]'):
        for tete in sel.xpath('//linkkk').re('//linkkk.*?(?=")'):
            links = 'http:' + str(tete)
            yield Request(links, callback=self.kontene)
The issue is that you have a return statement within your for loop. In Python, return exits the function immediately, so you only get the first link's worth of content. Instead, consider collecting each req in a list and returning that list once the loops have finished.
def parse(self, response):
    """Collect a Request for every link in the index container and return them all.

    Returning the complete list after the loops (rather than returning from
    inside them) hands every request to Scrapy, not just the first one.
    """
    req_list = []
    for sel in response.xpath('//ul[@id="indeks-container"]'):
        for tete in sel.xpath('//linkkk').re('//linkkk.*?(?=")'):
            links = 'http:' + str(tete)
            # append(), not "+=": "list += x" iterates over x, and a Request
            # object is not iterable, so "+=" raises TypeError here.
            req_list.append(Request(links, callback=self.kontene))
    return req_list
I want to use Scrapy to crawl data from web pages, but the different pages can't be told apart by their URL. For example:
The url as above is the first page which I want to crawl data from, and it's easy to get data from it.
Here is my code:
__author__ = 'Rabbit'
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy_Data.items import EPGD
class EPGD_spider(Spider):
    """Spider from the question: scrape the gene table on the EPGD text-search result page."""

    name = "EPGD"
    allowed_domains = ["epgd.biosino.org"]
    term = "man"
    url_base = "http://epgd.biosino.org/EPGD/search/textsearch.jsp?textquery=man&submit=Feeling+Lucky"
    # The list must actually contain the search URL; an empty list means the
    # spider never requests anything.
    stmp = [url_base]
    start_urls = stmp

    # Item field -> XPath (relative to a table row) of the cell text that fills it.
    _COLUMNS = (
        ('genID', 'td[1]/a/text()'),
        ('taxID', 'td[2]/a/text()'),
        ('familyID', 'td[3]/a/text()'),
        ('chromosome', 'td[4]/text()'),
        ('symbol', 'td[5]/text()'),
        ('description', 'td[6]/text()'),
    )

    def parse(self, response):
        """Yield one EPGD item per result row (rows with class "odd" or "even")."""
        sel = Selector(response)
        sites = sel.xpath('//tr[@class="odd"]|//tr[@class="even"]')
        for site in sites:
            item = EPGD()
            for field, cell_xpath in self._COLUMNS:
                # Each field is a list of whitespace-stripped text nodes.
                item[field] = [text.strip() for text in site.xpath(cell_xpath).extract()]
            yield item
But the problem arises when I want to get data from page 2. I click "next page", and the URL of the second page looks like this:
As you can see, it doesn't have the keyword in its URL, so I don't know how to get data from the other pages. Maybe I should use cookies, but I don't know how to handle this situation. Can anyone help me?
Thanks a lot!
Once link parsing and Request yielding are added to your parse() function, your example just works for me, so the page may be relying on server-side cookies. However, it fails when run through a proxy service like Scrapy's Crawlera, which downloads from multiple IPs.
The solution is to enter the 'textquery' parameter manually into the request url:
import urlparse
from urllib import urlencode
from scrapy import Request
from scrapy.spiders import Spider
from scrapy.selector import Selector
class EPGD_spider(Spider):
    """Crawl EPGD text-search results and follow the pagination links.

    The search term is re-added to every followed URL through ``update_url``,
    so each page request carries the ``textquery`` parameter explicitly.
    """

    name = "EPGD"
    allowed_domains = ["epgd.biosino.org"]
    term = 'calb'
    base_url = "http://epgd.biosino.org/EPGD/search/textsearch.jsp?currentIndex=0&textquery=%s"
    start_urls = [base_url % term]

    def update_url(self, url, params):
        """Return ``url`` with ``params`` merged into its query string.

        Args:
            url: absolute URL whose query string should be extended.
            params: dict of query parameters to add; a key already present in
                the URL is overwritten.
        """
        url_parts = list(urlparse.urlparse(url))
        # Index 4 of the parsed 6-tuple is the query string.
        query = dict(urlparse.parse_qsl(url_parts[4]))
        # Without this merge the URL would be returned unchanged.
        query.update(params)
        url_parts[4] = urlencode(query)
        return urlparse.urlunparse(url_parts)

    def parse(self, response):
        """Yield one item per result row, then a Request for every pagination link."""
        sel = Selector(response)
        genes = sel.xpath('//tr[@class="odd"]|//tr[@class="even"]')
        for gene in genes:
            item = {}
            item['genID'] = [text.strip() for text in gene.xpath('td[1]/a/text()').extract()]
            # ...
            yield item

        urls = sel.xpath('//div[@id="nviRecords"]/span[@id="quickPage"]/a/@href').extract()
        for url in urls:
            # Resolve the href against the current page before adding the term.
            url = response.urljoin(url)
            url = self.update_url(url, params={'textquery': self.term})
            yield Request(url)
update_url() function details from Lukasz' solution:
Add params to given URL in Python
I am trying to parse a site, an e-store. I parse a page of products, which are loaded with AJAX, get the URLs of these products, and then parse additional info for each product by following those parsed URLs.
My script gets the list of the first 4 items on the page and their URLs, makes the requests, and parses the additional info, but then it does not return to the loop, so the spider closes.
Could somebody help me in solving this? I'm pretty new to this kind of stuff, and ask here when totally stuck.
Here is my code:
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http.request import Request
from scrapy_sokos.items import SokosItem
class SokosSpider(Spider):
    """Spider from the question: list products page by page, then visit each product page."""

    name = "sokos"
    allowed_domains = ["sokos.fi"]
    base_url = "http://www.sokos.fi/fi/SearchDisplay?searchTermScope=&searchType=&filterTerm=&orderBy=8&maxPrice=&showResultsPage=true&beginIndex=%s&langId=-11&sType=SimpleSearch&metaData=&pageSize=4&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&urlLangId=-11&categoryId=295401&storeId=10151"
    # One listing URL per page: beginIndex advances by the page size (4).
    start_urls = []
    for i in range(0, 8, 4):
        start_urls.append(base_url % str(i))

    def parse(self, response):
        """Yield a Request for each product on the listing page, carrying a partly filled item."""
        products = Selector(response).xpath('//div[@class="product-listing product-grid"]/article[@class="product product-thumbnail"]')
        for product in products:
            item = SokosItem()
            # NOTE(review): the leading "//" searches the whole document, not
            # just this product, so every iteration may pick the same link;
            # './/div[...]' is probably what was intended -- confirm.
            item['url'] = product.xpath('//div[@class="content"]/a[@class="image"]/@href').extract()[0]
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_additional_info)

    def parse_additional_info(self, response):
        """Fill in name, description, price and number from the product page and yield the item."""
        item = response.meta['item']
        sel = Selector(response)
        item['name'] = sel.xpath('//h1[@class="productTitle"]/text()').extract()[0].strip()
        item['description'] = sel.xpath('//div[@id="kuvaus"]/p/text()').extract()[0]
        euro = sel.xpath('//strong[@class="special-price"]/span[@class="euros"]/text()').extract()[0]
        cent = sel.xpath('//strong[@class="special-price"]/span[@class="cents"]/text()').extract()[0]
        # Join the two parts; joining the concatenated string would put a dot
        # between every single character.
        item['price'] = '.'.join([euro, cent])
        item['number'] = sel.xpath('//@data-productid').extract()[0]
        yield item
The AJAX requests you are simulating are caught by the Scrapy "duplicate url filter".
Set dont_filter to True when yielding a Request:
yield Request(url=item['url'],
meta={'item': item},
I practice scrapy and have a question:
I want to crawl the link I got from spider again and don't know how to do
Here is My code:
As you can see, the links I extract are stored in the variable movie_descriptionTW_URL.
I wrote yield Request(movie_descriptionTW, parse_detail) to send each result to the callback:
def parse_detail(self, response):
But there is an error : exceptions.NameError: global name 'parse_detail' is not defined
How to solve this?
Please teach me! Thank you
from scrapy.spider import Spider
from scrapy.selector import Selector
from yahoo.items import YahooItem
from scrapy.http.request import Request
class MySpider(Spider):
    """Spider from the question: read the Yahoo movie chart and try to follow each movie link.

    NOTE: the failing ``Request(...)`` line is kept as posted, because it is
    the error the question asks about.
    """

    name = "yahoogo"
    start_urls = ["https://tw.movies.yahoo.com/chart.html"]

    def parse(self, response):
        """Extract ranking, title and link from every table row of the chart."""
        sel = Selector(response)
        sites = sel.xpath("//tr")
        items = []
        for site in sites:
            item = YahooItem()
            ranking_list = site.xpath("td[@class='c1']/span/text()").extract()
            movie_descriptionTW = site.xpath("(td[@class='c3']/*//a)[position() < last()-1]/text() | td[@class='c3']/a[1]/text() ").extract()
            movie_descriptionTW_URL = site.xpath("(td[@class='c3']/*//a[2]/@href) | td[@class='c3']/a[1]/@href ").extract()
            # crawl again!
            # BUG (subject of the question): the bare name ``parse_detail`` is
            # not defined in this scope, which raises NameError. The first
            # argument is also the list of titles, not a URL.
            yield Request(movie_descriptionTW, parse_detail)
            if ranking_list:
                # NOTE(review): ``items`` is never filled and is a list, not an item.
                yield items

    def parse_detail(self, response):
        """Callback for a movie detail page; its body was not shown in the original post."""
        pass
Use self.parse_detail to refer to methods of the class, like this:
for url in movie_descriptionTW_URL:
yield Request(url=url, callback=self.parse_detail)
So when i try to scrape product review info from epinions.com, if the main review text is too long, it has a "read more" link to another page.
I took an example from "http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1" you'll see what i mean if you look at the first review.
I am wondering: is it possible to have a tiny spider in each iteration of the for loop to grab the url and scrape the review out of the new link? I have the following code, but it doesn't work for the tiny "spider".
Here is my code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from epinions_test.items import EpinionsTestItem
from scrapy.http import Response, HtmlResponse
class MySpider(BaseSpider):
    """Spider from the question: scrape review summaries and try to fetch each full review inline.

    NOTE: the ``HtmlResponse(url)`` part is kept as posted, because it is the
    behaviour the question asks about.
    """

    name = "epinions"
    allowed_domains = ["epinions.com"]
    start_urls = ['http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1']

    def parse(self, response):
        """Return one item per review block on the listing page."""
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="review_info"]')
        items = []
        for site in sites:
            item = EpinionsTestItem()
            item["title"] = site.select('h2/a/text()').extract()
            item["star"] = site.select('span/a/span/@title').extract()
            item["date"] = site.select('span/span/span/@title').extract()
            item["review"] = site.select('p/span/text()').extract()
            # Everything works fine and I do have those four columns beautifully printed out, until....
            url2 = site.select('p/span/a/@href').extract()
            url = str("http://www.epinions.com%s" % str(url2)[3:-2])
            # This url is a string. When I print it out, it's like "http://www.epinions.com/review/samsung-galaxy-note-16-gb-cell-phone/content_624031731332", which looks legit.
            # BUG (subject of the question): building an HtmlResponse from a
            # URL does not download the page; the response has no body.
            response2 = HtmlResponse(url)
            # I tried in a scrapy shell, it shows that this is an HtmlResponse...
            hxs2 = HtmlXPathSelector(response2)
            fullReview = hxs2.select('//div[@class = "user_review_full"]')
            item["url"] = fullReview.select('p/text()').extract()
            # The three lines above work in an independent spider, where start_urls is changed to the url just generated.
            # However, I get nothing from item["url"] in this code.
            items.append(item)
        return items
Why item["url"] returns nothing?
You should instantiate a new Request in the callback and pass your item in the meta dict:
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class EpinionsTestItem(Item):
    """Container for one scraped review: title, star rating, date and review text."""

    title = Field()
    star = Field()
    date = Field()
    review = Field()
class MySpider(BaseSpider):
    """Scrape review summaries, then follow each "read more" link for the full review text.

    The partly filled item travels to the second callback in the request's
    ``meta`` dict.
    """

    name = "epinions"
    allowed_domains = ["epinions.com"]
    start_urls = ['http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1']

    def parse(self, response):
        """Yield a follow-up Request per review, or the item itself when there is no link."""
        hxs = HtmlXPathSelector(response)
        for site in hxs.select('//div[@class="review_info"]'):
            item = EpinionsTestItem()
            item["title"] = site.select('h2/a/text()').extract()
            item["star"] = site.select('span/a/span/@title').extract()
            item["date"] = site.select('span/span/span/@title').extract()

            hrefs = site.select('p/span/a/@href').extract()
            if not hrefs:
                # No link to a full-review page (presumably a short review):
                # keep the text shown on the listing page instead of
                # requesting the bare site root.
                item["review"] = site.select('p/span/text()').extract()
                yield item
                continue

            yield Request(url="http://www.epinions.com%s" % hrefs[0],
                          callback=self.parse_url2,
                          meta={'item': item})

    def parse_url2(self, response):
        """Add the full review text to the item passed in ``meta`` and yield it."""
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        fullReview = hxs.select('//div[@class = "user_review_full"]')
        item["review"] = fullReview.select('p/text()').extract()
        yield item
Also see documentation.
Hope that helps.
I was wondering if anyone ever tried to extract/follow RSS item links using
SgmlLinkExtractor/CrawlSpider. I can't get it to work...
I am using the following rule:
# Rule as attempted in the question: extract links from <link> tags.
rules = (
    Rule(SgmlLinkExtractor(tags=('link',), attrs=False)),
)
(having in mind that rss links are located in the link tag).
I am not sure how to tell SgmlLinkExtractor to extract the text() of
the link and not to search the attributes ...
Any help is welcome,
Thanks in advance
CrawlSpider rules don't work that way. You'll probably need to subclass BaseSpider and implement your own link extraction in your spider callback. For example:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector
class MySpider(BaseSpider):
    """Follow the URL held in the text of every <link> element of an RSS feed."""

    name = 'myspider'

    def parse(self, response):
        """Return one Request per non-empty <link> text found in the feed.

        ``parse_link`` is the callback for the linked pages; it is not part of
        this snippet and has to be defined on the spider.
        """
        xxs = XmlXPathSelector(response)
        links = xxs.select("//link/text()").extract()
        # Strip surrounding whitespace and skip text nodes that are empty
        # after stripping, since they are not usable URLs.
        stripped = (x.strip() for x in links)
        return [Request(x, callback=self.parse_link) for x in stripped if x]
You can also try the XPath in the shell, by running for example:
scrapy shell http://blog.scrapy.org/rss.xml
And then typing in the shell:
>>> xxs.select("//link/text()").extract()
There's an XMLFeedSpider one can use nowadays.
I have done it using CrawlSpider:
class MySpider(CrawlSpider):
    """Follow the <link> of every <item> in an RSS channel.

    NOTE(review): no ``rules`` are defined and ``parse`` is overridden, so
    nothing CrawlSpider-specific appears to be used here -- confirm whether a
    plain spider base class would do.
    """

    domain_name = "xml.example.com"

    def parse(self, response):
        """Yield a Request for the first link text of each channel item."""
        xxs = XmlXPathSelector(response)
        items = xxs.select('//channel/item')
        for i in items:
            urli = i.select('link/text()').extract()
            if not urli:
                # Item without a <link>: nothing to follow.
                continue
            yield Request(url=urli[0], callback=self.parse1)

    def parse1(self, response):
        """Callback for a linked page; the extraction logic was elided in the original post."""
        hxs = HtmlXPathSelector(response)
        # ...
but I am not sure that is a very proper solution...
XML Example From scrapy doc XMLFeedSpider
from scrapy.spiders import XMLFeedSpider
from myproject.items import TestItem
class MySpider(XMLFeedSpider):
    """XMLFeedSpider example: ``parse_node`` receives one <item> node at a time."""

    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/feed.xml']
    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
    itertag = 'item'

    def parse_node(self, response, node):
        """Log the node and return its id attribute, name and description as a dict."""
        self.logger.info('Hi, this is a <%s> node!: %s', self.itertag, ''.join(node.extract()))

        # A plain dict is used instead of TestItem() so the example runs
        # without the project's item class.
        item = {}
        # '@id' selects the node's "id" attribute.
        item['id'] = node.xpath('@id').extract()
        item['name'] = node.xpath('name').extract()
        item['description'] = node.xpath('description').extract()
        return item