Scrapy spider not including all requested pages - python

I have a Scrapy script for Yelp that is, for the most part, working. Essentially I can supply it with a list of Yelp pages and it should return all reviews from all pages. The script so far is below:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re
from yelp2.items import YelpReviewItem
# Restaurant names to crawl; the spider formats each entry into a start URL.
RESTAURANTS = ['sixteen-chicago']
def createRestaurantPageLinks(self, response):
    """Build the requests for every review page after the first one.

    This is a module-level helper; the spider passes itself in explicitly as
    ``self`` so that the requests can use ``self.parse`` as their callback.

    The total number of reviews is read from the review counter on the page
    in ``response``. One ``Request`` is returned per additional page, each
    pointing at ``response.url + '?start=<offset>'``. An empty list is
    returned when the page carries no review counter.
    """
    reviewsPerPage = 40
    sel = Selector(response)
    # XPath addresses attributes with "@"; "#class" is not valid XPath.
    countText = sel.xpath(
        '//div[@class="rating-info clearfix"]'
        '//span[@itemprop="reviewCount"]/text()'
    ).extract()
    if not countText:
        # No counter found: nothing to paginate (previously an IndexError).
        return []
    totalReviews = int(countText[0].strip().split(' ')[0])
    # Walk the offsets directly instead of dividing: this works with integer
    # arithmetic on both Python 2 and 3 and does not request an empty extra
    # page when totalReviews is an exact multiple of reviewsPerPage.
    return [
        Request(url=response.url + '?start=' + str(offset), callback=self.parse)
        for offset in range(reviewsPerPage, totalReviews, reviewsPerPage)
    ]
# Spider "yelp2a" exactly as posted in the question.
# NOTE(review): indentation was lost in this copy, so the nesting of the
# statements below cannot be determined from the text alone.
# NOTE(review): "#" inside the XPath strings looks like a mangled "@"
# (attribute axis), and the URL strings are blank -- confirm against the
# original post.
class Yelp2aSpider(Spider):
name = "yelp2a"
allowed_domains = [""]
start_urls = ['' % s for s in RESTAURANTS]
# Callback: builds one YelpReviewItem per review <div> in the response and,
# for URLs that do not contain '?start=', adds the pagination requests
# produced by createRestaurantPageLinks.
def parse(self, response):
requests = []
sel = Selector(response)
reviews = sel.xpath('//div[#class="review review-with-no-actions"]')
# NOTE(review): nothing in the visible code appends to `items`.
items = []
for review in reviews:
item = YelpReviewItem()
item['venueName'] = sel.xpath('//meta[#property="og:title"]/#content').extract()
item['reviewer'] = review.xpath('.//li[#class="user-name"]/a/text()').extract()
item['reviewerLoc'] = review.xpath('.//li[#class="user-location"]/b/text()').extract()
item['rating'] = review.xpath('.//meta[#itemprop="ratingValue"]/#content').extract()
item['reviewDate'] = review.xpath('.//meta[#itemprop="datePublished"]/#content').extract()
item['reviewText'] = review.xpath('.//p[#itemprop="description"]/text()').extract()
item['url'] = response.url
# A plain `return` ends the callback, so whichever of the two returns below
# is reached first decides whether items or requests are handed back.
return items
# Only the first page (no '?start=' in its URL) schedules further pages.
if response.url.find('?start=') == -1:
requests += createRestaurantPageLinks(self, response)
return requests
However, the problem I'm running into is that this particular script scrapes every page of every requested review EXCEPT for the first page. If I comment out the last "if" statement, it only scrapes the FIRST page. I suspect all I need is a simple "else" command but I am stumped... help is greatly appreciated!
EDIT: This is the code as it currently stands based on assistance received...
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re
from yelp2.items import YelpReviewItem
# Restaurant names to crawl; the spider formats each entry into a start URL.
RESTAURANTS = ['sixteen-chicago']
def createRestaurantPageLinks(self, response):
    """Build the requests for every review page after the first one.

    This is a module-level helper; the spider passes itself in explicitly as
    ``self`` so that the requests can use ``self.parse`` as their callback.

    The total number of reviews is read from the review counter on the page
    in ``response``. One ``Request`` is returned per additional page, each
    pointing at ``response.url + '?start=<offset>'``. An empty list is
    returned when the page carries no review counter.
    """
    reviewsPerPage = 40
    sel = Selector(response)
    # XPath addresses attributes with "@"; "#class" is not valid XPath.
    countText = sel.xpath(
        '//div[@class="rating-info clearfix"]'
        '//span[@itemprop="reviewCount"]/text()'
    ).extract()
    if not countText:
        # No counter found: nothing to paginate (previously an IndexError).
        return []
    totalReviews = int(countText[0].strip().split(' ')[0])
    # Walk the offsets directly instead of dividing: this works with integer
    # arithmetic on both Python 2 and 3 and does not request an empty extra
    # page when totalReviews is an exact multiple of reviewsPerPage.
    return [
        Request(url=response.url + '?start=' + str(offset), callback=self.parse)
        for offset in range(reviewsPerPage, totalReviews, reviewsPerPage)
    ]
# Spider "yelp2a" as edited by the asker: the callback is now a generator.
# NOTE(review): indentation was lost in this copy. The accompanying text
# reports that this version returns only one review per page, which suggests
# `yield item` sat outside the `for` loop -- not verifiable from here.
# NOTE(review): "#" inside the XPath strings looks like a mangled "@"
# (attribute axis), and the URL strings are blank -- confirm against the
# original post.
class Yelp2aSpider(Spider):
name = "yelp2a"
allowed_domains = [""]
start_urls = ['' % s for s in RESTAURANTS]
# Callback: fills a YelpReviewItem from each review <div>, yields items, and
# for URLs without '?start=' also yields the pagination requests produced by
# createRestaurantPageLinks.
def parse(self, response):
requests = []
sel = Selector(response)
reviews = sel.xpath('//div[#class="review review-with-no-actions"]')
# NOTE(review): `items` is never read or appended to in the visible code.
items = []
for review in reviews:
item = YelpReviewItem()
item['venueName'] = sel.xpath('//meta[#property="og:title"]/#content').extract()
item['reviewer'] = review.xpath('.//li[#class="user-name"]/a/text()').extract()
item['reviewerLoc'] = review.xpath('.//li[#class="user-location"]/b/text()').extract()
item['rating'] = review.xpath('.//meta[#itemprop="ratingValue"]/#content').extract()
item['reviewDate'] = review.xpath('.//meta[#itemprop="datePublished"]/#content').extract()
item['reviewText'] = review.xpath('.//p[#itemprop="description"]/text()').extract()
item['url'] = response.url
yield item
# Only the first page (no '?start=' in its URL) schedules further pages.
if response.url.find('?start=') == -1:
requests += createRestaurantPageLinks(self, response)
for request in requests:
yield request
As mentioned in a comment below, running this code as-is crawls every desired page, but it only returns one review per page rather than all of them.
I tried changing `yield item` to `yield items`, but then the error `ERROR: Spider must return Request, BaseItem or None, got 'list' in <GET[...]>` is raised for every URL crawled.

You need to reorganize the methods a bit. First parse restaurant page in parse() method. Then, return requests for reviews and handle responses in another method, e.g. parse_review():
import re
from scrapy.item import Item, Field
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from yelp2.items import YelpReviewItem
# Restaurant names to crawl; the spider formats each entry into a start URL.
RESTAURANTS = ['sixteen-chicago']
class Yelp2aSpider(Spider):
    """Collect every review of each restaurant listed in ``RESTAURANTS``.

    ``parse`` handles a restaurant's landing page: it emits the reviews shown
    there and schedules one request per further review page.
    ``parse_review`` turns any review page into ``YelpReviewItem`` objects.
    """

    name = "yelp2a"
    # NOTE(review): the domain and URL template were blank in the posted
    # snippet ('' % s raises TypeError). They are reconstructed here from
    # Yelp's /biz/<slug> URL scheme -- confirm before use.
    allowed_domains = ["yelp.com"]
    start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

    def parse(self, response):
        """Yield the landing page's reviews, then requests for later pages."""
        # The landing page already shows the first batch of reviews; without
        # this loop the first page would never be scraped.
        for item in self.parse_review(response):
            yield item

        reviewsPerPage = 40
        sel = Selector(response)
        # XPath addresses attributes with "@"; "#class" is not valid XPath.
        countText = sel.xpath(
            '//div[@class="rating-info clearfix"]'
            '//span[@itemprop="reviewCount"]/text()'
        ).extract()
        if not countText:
            # No review counter on the page: nothing more to schedule.
            return
        totalReviews = int(countText[0].strip().split(' ')[0])
        # Step through the offsets directly: integer-only (works on Python 2
        # and 3) and no empty extra page when the total is a multiple of 40.
        for offset in range(reviewsPerPage, totalReviews, reviewsPerPage):
            yield Request(url=response.url + '?start=' + str(offset),
                          callback=self.parse_review)

    def parse_review(self, response):
        """Yield one ``YelpReviewItem`` per review block found in ``response``."""
        sel = Selector(response)
        for review in sel.xpath('//div[@class="review review-with-no-actions"]'):
            item = YelpReviewItem()
            item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
            item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
            item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
            item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
            item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
            item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
            item['url'] = response.url
            yield item

If you're returning items/requests in more than one place, you should replace your return statements with yield statements, which turn your function into a generator, which returns a new element each time it's generated (yields it), without exiting the function until they are all returned. Otherwise, as your code is now, your function will exit after the first return and won't get to sending the requests for the following pages.
Edit: Correction - you should yield one item/request at a time, so replace
for review in reviews:
    item = ...
return items
with
for review in reviews:
    item = ...
    yield item
and replace
return requests
with
for request in requests:
    yield request

The final answer did indeed lie in the indentation of one single yield line. This is the code that ended up doing what I needed it to do.
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re
from yelp2.items import YelpReviewItem
# Restaurant names to crawl; the spider formats each entry into a start URL.
RESTAURANTS = ['sixteen-chicago']
def createRestaurantPageLinks(self, response):
    """Build the requests for every review page after the first one.

    This is a module-level helper; the spider passes itself in explicitly as
    ``self`` so that the requests can use ``self.parse`` as their callback.

    The total number of reviews is read from the review counter on the page
    in ``response``. One ``Request`` is returned per additional page, each
    pointing at ``response.url + '?start=<offset>'``. An empty list is
    returned when the page carries no review counter.
    """
    reviewsPerPage = 40
    sel = Selector(response)
    # XPath addresses attributes with "@"; "#class" is not valid XPath.
    countText = sel.xpath(
        '//div[@class="rating-info clearfix"]'
        '//span[@itemprop="reviewCount"]/text()'
    ).extract()
    if not countText:
        # No counter found: nothing to paginate (previously an IndexError).
        return []
    totalReviews = int(countText[0].strip().split(' ')[0])
    # Walk the offsets directly instead of dividing: this works with integer
    # arithmetic on both Python 2 and 3 and does not request an empty extra
    # page when totalReviews is an exact multiple of reviewsPerPage.
    return [
        Request(url=response.url + '?start=' + str(offset), callback=self.parse)
        for offset in range(reviewsPerPage, totalReviews, reviewsPerPage)
    ]
class YelpXSpider(Spider):
    """Scrape all reviews, across all review pages, for ``RESTAURANTS``.

    Every response goes through ``parse``. It yields one ``YelpReviewItem``
    per review on the page and, for the first page of a restaurant only,
    also yields the requests for the remaining pages.
    """

    name = "yelpx"
    # NOTE(review): the domain and URL template were blank in the posted
    # snippet ('' % s raises TypeError). They are reconstructed here from
    # Yelp's /biz/<slug> URL scheme -- confirm before use.
    allowed_domains = ["yelp.com"]
    start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

    def parse(self, response):
        """Yield the page's review items, then any pagination requests."""
        sel = Selector(response)
        # XPath addresses attributes with "@"; "#class" is not valid XPath.
        for review in sel.xpath('//div[@class="review review-with-no-actions"]'):
            item = YelpReviewItem()
            item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
            item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
            item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
            item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
            item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
            item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
            item['url'] = response.url
            # Must stay inside the loop: dedenting this line yields only the
            # last review of each page.
            yield item

        # Paginated URLs carry '?start='; only the first page fans out, so
        # follow-up pages do not schedule the same requests again.
        if '?start=' not in response.url:
            for request in createRestaurantPageLinks(self, response):
                yield request
Thanks to everyone for helping out a noob!


Scrapy Request are not crawling all urls return

I recently had a project crawling Google Play Store apps for the Vietnam region, and realized that the callback function is not run for every URL that has been returned.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http.request import Request
from urllib.parse import urlparse, parse_qsl, urlencode
import scrapy
class GooglePlayStoreSpider(CrawlSpider):
name = 'google_play'
allowed_domains = ['']
start_urls = ['']
rules = (
Rule(LinkExtractor(allow=('')), follow=True,
crawled_ids = []
first_init = False
def parse_start_url(self, response):
# print("-------------- PRINTING SECTION START_URL --------------")
if not self.first_init:
self.first_init = True
extractor = LinkExtractor(allow=('/store/apps/category/.*',))
raw_links = extractor.extract_links(response)
links = self.process_links(raw_links)
return [
for link in links
# print("============ START_URL ELSE PART ============")
def process_links(self, links):
new_links = []
for link in links:
old_url = link.url
if not old_url.startswith(''):
old_url_obj = urlparse(old_url)
old_url_query = dict(parse_qsl(old_url_obj.query))
if old_url_obj.path == '/store/apps/details':
if old_url_query['id'] in self.crawled_ids:
old_url_query['hl'] = 'en'
old_url_query['gl'] = 'vn'
link.url = '{}://{}{}?{}'.format(old_url_obj.scheme, old_url_obj.netloc, old_url_obj.path,
# print("LINKKSSS ====", links)
# print("NEW_LINKKSSS ====", new_links)
# print("-------------- PRINTING SECTION PROCESS_LINKS --------------")
return new_links
def parse_1(self, response):
selector = scrapy.Selector(response)
urls = selector.xpath('//a[#class="LkLjZd ScJHi U8Ww7d xjAeve nMZKrb id-track-click "]/#href').extract()
links = []
for url in urls:
if not url.startswith(''):
url = "" + url
link_flag = 0
for url in urls:
# yield links_list.append(scrapy.Request(url, callback=self.parse_next, dont_filter=True))
yield Request(links[link_flag], callback=self.parse_next, dont_filter=True)
link_flag += 1
def parse_next(self, response):
# print("PARSE_NEXT ===========", response.request.url)
selector = scrapy.Selector(response)
app_urls = selector.xpath('//div[#class="details"]/a[#class="title"]/#href').extract()
urls = []
for url in app_urls:
url = "" + url + '&hl=en&gl=vn'
url_list = []
link_flag = 0
for url in app_urls:
yield Request(urls[link_flag], callback=self.parse_detail, dont_filter=True)
link_flag += 1
# return url_list
def parse_detail(self, response):
print("Parsed ======= ", response.request.url)
item = dict()
item['name'] = response.xpath('//div[#itemscope]//meta[#itemprop="name"]/#content').extract_first()
item['category'] = response.xpath(
item['review_score'] = response.xpath(
item['review_count'] = response.xpath(
item['link'] = response.request.url
item['id'] = dict(parse_qsl(urlparse(response.request.url).query))['id']
item['content_rating'] = response.xpath(
item['image'] = response.xpath('//div[#itemscope]//meta[#itemprop="image"]/#content').extract_first()
item['price'] = response.xpath('//div[#itemscope]//meta[#itemprop="price"]/#content').extract_first()
item['price_currency'] = response.xpath(
# item['operating_system'] = response.xpath('//div[#itemscope]//meta[#itemprop="operatingSystem"]/#content').extract_first()
return item
When I run it in the terminal, it reports that it crawled about 100 pages but scraped only about 15 (the numbers are estimates).
Please help

scrapy not following links

The following Scrapy code for returning medical treatment information does return the first set of results, but does not follow links. I'm still learning to code; I checked similar questions here on Stack Overflow, but integrating their answers did not work. Any pointers would be appreciated.
import urlparse
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
import w3lib.url
from yelp.items import YelpItem
class YelpSpider(BaseSpider):
name = "yelp"
download_delay = 10
concurrent_requests = 1
concurrent_requests_per_domain = 1
allowed_domains = [""]
start_urls = ["",
def parse(self, response):
selector = Selector(response)
for title in selector.css("span.indexed-biz-name"):
page_url = urlparse.urljoin(response.url,
self.log("page URL: %s" % page_url)
yield Request(page_url,
for next_page in selector.css(u'ul > li > a.prev-next:contains(\u2192)'):
next_url = urlparse.urljoin(response.url,
self.log("next URL: %s" % next_url)
yield Request(next_url,
def parse_page(self, response):
selector = Selector(response)
item = YelpItem()
item["name"] = selector.xpath('.//h1[#itemprop="name"]/text()').extract()[0].strip()
item["addresslocality"] = u"\n".join(
item["link"] = response.url
website = selector.css(' a')
if website:
website_url = website.xpath('#href').extract()[0]
item["website"] = w3lib.url.url_query_parameter(website_url, "url")
return item
Your next URL extraction and selection logic is not correct. Target the link element having next and pagination-links_anchor classes. The following works for me:
next_url = response.css('').extract_first()
if next_url:
next_url = urlparse.urljoin(response.url, next_url)
self.log("next URL: %s" % next_url)
yield Request(next_url, callback=self.parse)

Scrapy doesn't scrape data from a particular field in the same page which I have successfully scraped other data from

I am very new to Scrapy and I'm not sure why I am not getting the information I want. I am using Scrapy on the website and I want to extract the check-in and check-out times for all the hotels in New York. I have successfully scraped other data from the same page that contains the check-in and check-out times, but I couldn't scrape the data for these two fields.
The code I have is shown below:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from hotel_crawl.items import HotelCrawlItem
from bs4 import BeautifulSoup
import time
import urlparse
class MySpider(CrawlSpider):
name = "kayaksite"
allowed_domains = [""]
start_urls = [""]
rules = (
restrict_xpaths=("//a[#class='actionlink pagenumber' [contains(text(),'Next')]", )), callback="parse_item", follow=True),
def parse_start_url(self, response):
print "test"'Hi, this is an item page! %s', response.url)
item = HotelCrawlItem()
name = response.xpath("//a[#class='hotelname hotelresultsname']//text()").extract()
price = [BeautifulSoup(i).get_text() for i in response.xpath("//div[#class='pricerange']").extract()]
review = response.xpath("//a[#class='reviewsoverview']/strong/text()").extract()
url = response.xpath("//a[#class='hotelname hotelresultsname']//#href").extract()
alldata = zip(name, price, review, url)
for i in alldata:
item['name'] = i[0]
item['price'] = i[1]
item['review'] = i[2]
request = scrapy.Request(urlparse.urljoin(response.url, i[3]), callback=self.parse_item2)
request.meta['item'] = item
yield request
def parse_item(self, response):'Hi, this is an item page! %s', response.url)
item = HotelCrawlItem()
name = response.xpath("//a[#class='hotelname hotelresultsname']//text()").extract()
price = [BeautifulSoup(i).get_text() for i in response.xpath("//div[#class='pricerange']").extract()]
review = response.xpath("//a[#class='reviewsoverview']/strong/text()").extract()
url = response.xpath("//a[#class='hotelname hotelresultsname']//#href").extract()
alldata = zip(name, price, review, url)
for i in alldata:
item['name'] = i[0]
item['price'] = i[1]
item['review'] = i[2]
request = scrapy.Request(urlparse.urljoin(response.url, i[3]), callback=self.parse_item2)
request.meta['item'] = item
yield request
def parse_item2(self, response):
print "test--------------"'Hi, this is an item page! %s', response.url)
item = response.meta['item']
item['location'] = response.xpath("//*[#id='detailsOverviewContactInfo']/div/span/span[1]/text()").extract()
item['postcode'] = response.xpath("//*[#id='detailsOverviewContactInfo']/div/span/span[3]/text()").extract()
item['check_in'] = response.xpath("//*[#id='goodToKnow']/div/div[2]/div[2]/text()").extract()
item['check_out'] = response.xpath("//*[#id='goodToKnow']/div/div[2]/div[2]/text()").extract()
yield item

scrapy: A tiny "spider" in a spider?

When I try to scrape product review info from the site, if the main review text is too long, it has a "read more" link to another page.
I took an example from "" you'll see what i mean if you look at the first review.
I am wondering: is it possible to have a tiny spider in each iteration of the for loop to grab the url and scrape the review out of the new link? I have the following code, but it doesn't work for the tiny "spider".
Here is my code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from epinions_test.items import EpinionsTestItem
from scrapy.http import Response, HtmlResponse
class MySpider(BaseSpider):
name = "epinions"
allow_domains = [""]
start_urls = ['']
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites ='//div[#class="review_info"]')
items = []
for sites in sites:
item = EpinionsTestItem()
item["title"] ='h2/a/text()').extract()
item["star"] ='span/a/span/#title').extract()
item["date"] ='span/span/span/#title').extract()
item["review"] ='p/span/text()').extract()
# Everything works fine and i do have those four columns beautifully printed out, until....
url2 ='p/span/a/#href').extract()
url = str("" %str(url2)[3:-2])
# This url is a string. when i print it out, it's like "", which looks legit.
response2 = HtmlResponse(url)
# I tried in a scrapy shell, it shows that this is a htmlresponse...
hxs2 = HtmlXPathSelector(response2)
fullReview ='//div[#class = "user_review_full"]')
item["url"] ='p/text()').extract()
# The three lines above works in an independent spider, where start_url is changed to the url just generated and everything.
# However, i got nothing from item["url"] in this code.
return items
Why does item["url"] return nothing?
You should instantiate a new Request in the callback and pass your item in the meta dict:
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class EpinionsTestItem(Item):
    """Item holding the four fields scraped for one review."""

    title = Field()
    star = Field()
    date = Field()
    review = Field()
class MySpider(BaseSpider):
name = "epinions"
allow_domains = [""]
start_urls = ['']
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites ='//div[#class="review_info"]')
for sites in sites:
item = EpinionsTestItem()
item["title"] ='h2/a/text()').extract()
item["star"] ='span/a/span/#title').extract()
item["date"] ='span/span/span/#title').extract()
url ='p/span/a/#href').extract()
url = str("" % str(url)[3:-2])
yield Request(url=url, callback=self.parse_url2, meta={'item': item})
def parse_url2(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
fullReview ='//div[#class = "user_review_full"]')
item["review"] ='p/text()').extract()
yield item
Also see documentation.
Hope that helps.

How to call Parse_page2 method for every item

I am trying to call the parse_page2 method for every item, but every time I run this spider I only get a single item per page. How do I call parse_page2 for every item?
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider
class ESpider(CrawlSpider):
name = "atisundarSpider"
allowed_domains = [""]
URLSList = []
for n in range (1,20):
URLSList.append('' + str(n))
URLSList.append('' + str(n))
start_urls = URLSList
def parse(self, response):
item = EscraperItem()
hxs = HtmlXPathSelector(response)
sites ='//div[#class="block product size-medium"]')
items = []
for site in sites:
item = EscraperItem()
item['productDesc'] = ""
item['productSite'] = [""]
item['productTitle'] ='.//div[#class="main"]/a/#title').extract()
item['productURL'] = ["" +'.//div[#class="main"]/a/#href').extract()[0].encode('utf-8')]
item['productPrice'] ='.//p[#class="pricearea"]//span[#class="was-price"]/text()').extract() +'.//p[#class="pricearea"]//span[#class="price"]/text()').extract()
item['productImage'] = ['.//div[#class="main"]/a/img/#src').extract()[0].split('?')[0]] + ['.//div[#class="main"]/a/img/#src').extract()[0].split('?')[0].replace("medium","grande")]
item['image_urls'] = item['productImage']
secondURL = "" +'.//div[#class="main"]/a/text()').extract()[1].strip().split("#")[-1]
request = Request(secondURL,
request.meta['item'] = item
return request
def parse_page2(self, response):
item = response.meta['item']
#item['other_url'] = response.url
return item
1) You are not using any CrawlSpider functionality, so I would recommend inheriting your spider from BaseSpider.
2) in for loop
for site in sites:
use yield rather than return; otherwise the function exits on the first iteration of the loop.
yield request
3) In parse_page2, get the item from response.request.meta instead of from response.meta:
item = response.request.meta['item']
it should work now.
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
class ESpider(BaseSpider):
name = "atisundarSpider"
allowed_domains = [""]
URLSList = []
for n in range (1,20):
URLSList.append('' + str(n))
URLSList.append('' + str(n))
start_urls = URLSList
def parse(self, response):
item = EscraperItem()
hxs = HtmlXPathSelector(response)
sites ='//div[#class="block product size-medium"]')
for site in sites:
item = EscraperItem()
item['productDesc'] = ""
item['productSite'] = [""]
item['productTitle'] ='.//div[#class="main"]/a/#title').extract()
item['productURL'] = ["" +'.//div[#class="main"]/a/#href').extract()[0].encode('utf-8')]
item['productPrice'] ='.//p[#class="pricearea"]//span[#class="was-price"]/text()').extract() +'.//p[#class="pricearea"]//span[#class="price"]/text()').extract()
item['productImage'] = ['.//div[#class="main"]/a/img/#src').extract()[0].split('?')[0]] + ['.//div[#class="main"]/a/img/#src').extract()[0].split('?')[0].replace("medium","grande")]
item['image_urls'] = item['productImage']
secondURL = "" +'.//div[#class="main"]/a/text()').extract()[1].strip().split("#")[-1]
request = Request(secondURL,
request.meta['item'] = item
yield request
def parse_page2(self, response):
item = response.request.meta['item']
#item['other_url'] = response.url
return item
