scrapy not following links

scrapy not following links - python

The following scrapy code for returning medical treatment information does return the first set of returned results, but does not follow links. Learning code and checked similar results here on stackoverflow, but integrating them did not work. True, I'm learning. Any pointers would be appreciated.
import urlparse
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
import w3lib.url
from yelp.items import YelpItem
class YelpSpider(BaseSpider):
name = "yelp"
download_delay = 10
concurrent_requests = 1
concurrent_requests_per_domain = 1
allowed_domains = ["yelp.com"]
start_urls = ["http://www.yelp.com/search?find_desc=cancer+treatment&find_loc=manhattan%2Cny&start=0",
"http://www.yelp.com/search?find_desc=cancer+treatment&find_loc=manhattan%2Cny&start=20",
"http://www.yelp.com/search?find_desc=cancer+treatment&find_loc=manhattan%2Cny&start=30"]
def parse(self, response):
selector = Selector(response)
for title in selector.css("span.indexed-biz-name"):
page_url = urlparse.urljoin(response.url,
title.xpath("a/#href").extract()[0])
self.log("page URL: %s" % page_url)
#continue
yield Request(page_url,
callback=self.parse_page)
for next_page in selector.css(u'ul > li > a.prev-next:contains(\u2192)'):
next_url = urlparse.urljoin(response.url,
next_page.xpath('#href').extract()[0])
self.log("next URL: %s" % next_url)
#continue
yield Request(next_url,
callback=self.parse)
def parse_page(self, response):
selector = Selector(response)
item = YelpItem()
item["name"] = selector.xpath('.//h1[#itemprop="name"]/text()').extract()[0].strip()
item["addresslocality"] = u"\n".join(
selector.xpath('.//address[#itemprop="address"]//text()').extract()).strip()
item["link"] = response.url
website = selector.css('div.biz-website a')
if website:
website_url = website.xpath('#href').extract()[0]
item["website"] = w3lib.url.url_query_parameter(website_url, "url")
return item

Your next URL extraction and selection logic is not correct. Target the link element having next and pagination-links_anchor classes. The following works for me:
next_url = response.css('a.pagination-links_anchor.next::attr(href)').extract_first()
if next_url:
next_url = urlparse.urljoin(response.url, next_url)
self.log("next URL: %s" % next_url)
yield Request(next_url, callback=self.parse)

Related

Python Scrapy keep getting same page link from next page button

i am trying to scrape amazon.com for the link of products that has more than 800 reviews but i keep getting the same page link from the next page button it keeps returning page 2 over and over again where i should get page 3,4 and so on
I HAVE SET A IF CONDITION TO SPILT AND CONVERT REVIEW STRING LIKE 1,020 TO INTEGER AND COMPARE IF GREATER THAN 800 OR NOT THEN BASED ON THAT VISIT THE PAGE
here is the code
# -*- coding: utf-8 -*-
import scrapy
from amazon.items import AmazonItem
from urlparse import urljoin
class AmazonspiderSpider(scrapy.Spider):
name = "amazonspider"
DOWNLOAD_DELAY = 1
start_urls = ['https://www.amazon.com/s/ref=lp_165993011_nr_n_0?fst=as%3Aoff&rh=n%3A165793011%2Cn%3A%21165795011%2Cn%3A165993011%2Cn%3A2514571011&bbn=165993011&ie=UTF8&qid=1493778423&rnid=165993011']
def parse(self, response):
SET_SELECTOR = '.a-carousel-card.acswidget-carousel__card'
for attr in response.css(SET_SELECTOR):
#print '\n\n', attr
item = AmazonItem()
review_selector = './/*[#class="acs_product-rating__review-count"]/text()'
link_selector = './/*[#class="a-link-normal"]/#href'
if attr.xpath(review_selector).extract_first():
if int(''.join(attr.xpath(review_selector).extract_first().split(','))) >= 800:
url = urljoin(response.url, attr.xpath(link_selector).extract_first())
item['LINKS'] = url
if url:
yield scrapy.Request(url, callback=self.parse_link, meta={'item': item})
next_page = './/span[#class="pagnRA"]/a[#id="pagnNextLink"]/#href'
next_page = response.xpath(next_page).extract_first()
print '\n\n', urljoin(response.url, next_page)
if next_page:
yield scrapy.Request(
urljoin(response.url, next_page),
callback=self.parse
)
def parse_link(self, response):
item = AmazonItem(response.meta['item'])
catselector = '.cat-link ::text'
defaultcatselector = '.nav-search-label ::text'
cat = response.css(catselector).extract_first()
if cat:
item['CATAGORY'] = cat
else:
item['CATAGORY'] = response.css(defaultcatselector).extract_first()
return item
here is the output when i was printing the next page link before calling the parse function recursively
and
and here is the screenshot from the next page selector of the page
where am i going wrong ?

Move the next page code block outside the loop.
class AmazonspiderSpider(scrapy.Spider):
name = "amazonspider"
DOWNLOAD_DELAY = 1
start_urls = ['https://www.amazon.com/s/ref=lp_165993011_nr_n_0?fst=as%3Aoff&rh=n%3A165793011%2Cn%3A%21165795011%2Cn%3A165993011%2Cn%3A2514571011&bbn=165993011&ie=UTF8&qid=1493778423&rnid=165993011']
def parse(self, response):
SET_SELECTOR = '.a-carousel-card.acswidget-carousel__card'
for attr in response.css(SET_SELECTOR):
#print '\n\n', attr
review_selector = './/*[#class="acs_product-rating__review-count"]/text()'
link_selector = './/*[#class="a-link-normal"]/#href'
if attr.xpath(review_selector).extract_first():
if int(''.join(attr.xpath(review_selector).extract_first().split(','))) >= 800:
url = urljoin(response.url, attr.xpath(link_selector).extract_first())
next_page = './/span[#class="pagnRA"]/a[#id="pagnNextLink"]/#href'
next_page = response.xpath(next_page).extract_first()
print '\n\n', urljoin(response.url, next_page)
if next_page:
yield scrapy.Request(
urljoin(response.url, next_page),
callback=self.parse
)

Scraping/crawling multiple pages

Up to now I have found how to scrape one page or multiple pages with same url, but changing number. However, I could not find how to scrape pages with subcategories and their subcategories and finally get the content needed.
I am trying to scrape this website: http://www.askislam.org/index.html
I am using Scrapy, but I do not know where to start.
Or you can suggest a better option, I just use python and check from there.
Thanks
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider
from scrapy import Selector
from ask_islam.items import AskIslamItem
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
import re
class AskislamSpider(Spider):
name = "askislam"
allowed_domains = ["askislam.org"]
start_urls = ['http://www.askislam.org/']
rules = [Rule(LinkExtractor(allow = ()), callback = 'parse', follow=True)]
def parse(self, response):
hxs = Selector(response)
links = hxs.css('div[id="categories"] li a::attr(href)').extract()
for link in links:
url = 'http://www.askislam.org' + link.replace('index.html', '')
yield Request(url, callback=self.parse_page)
def parse_page(self, response):
hxs = Selector(response)
categories = hxs.css('div[id="categories"] li').extract()
questions = hxs.xpath('a').extract()
if(categories):
for categoryLink in categories:
url = 'http://www.askislam.org' + categoryLink.replace('index.html', '')
yield Request(url, callback=self.parse_page)
# print (question)
EDIT
def start_requests(self):
yield Request("http://www.askislam.org", callback=self.parse_page)
def parse_page(self, response):
hxs = Selector(response)
categories = hxs.css('#categories li')
for cat in categories:
item = AskIslamItem()
link = cat.css('a::attr(href)').extract()[0]
link = "http://www.askislam.org" + link
item['catLink'] = link
logging.info("Scraping Link: %s" % (link))
yield Request(link, callback=self.parse_page)
yield Request(link, callback=self.parse_categories)
def parse_categories(self, response):
logging.info("The Cat Url")

Read links from that http://www.askislam.org/index.html page using xPath or CSS Selectors of those sub-categories and then do another Request()
EDIT:
import logging
class AskislamSpider(Spider):
name = "askislam"
def start_requests(self):
yield Request("http://www.askislam.org/", callback=self.parse_page)
def parse_page(self, response):
categories = response.css('#categories li').extract()
for cat in categories:
link = cat.css("a::attr(href)").extract()[0]
link = "http://www.askislam.org/" + link
logging.info("Scraping Link: %s" % (link))
yield Request(link, callback=self.parse_page)

Scrapy spider not including all requested pages

I have a Scrapy script for Yelp that is, for the most part, working. Essentially I can supply it with a list of Yelp pages and it should return all reviews from all pages. The script so far is below:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re
from yelp2.items import YelpReviewItem
RESTAURANTS = ['sixteen-chicago']
def createRestaurantPageLinks(self, response):
reviewsPerPage = 40
sel = Selector(response)
totalReviews = int(sel.xpath('//div[#class="rating-info clearfix"]//span[#itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse) for n in range(totalReviews/reviewsPerPage)]
return pages
class Yelp2aSpider(Spider):
name = "yelp2a"
allowed_domains = ["yelp.com"]
start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]
def parse(self, response):
requests = []
sel = Selector(response)
reviews = sel.xpath('//div[#class="review review-with-no-actions"]')
items = []
for review in reviews:
item = YelpReviewItem()
item['venueName'] = sel.xpath('//meta[#property="og:title"]/#content').extract()
item['reviewer'] = review.xpath('.//li[#class="user-name"]/a/text()').extract()
item['reviewerLoc'] = review.xpath('.//li[#class="user-location"]/b/text()').extract()
item['rating'] = review.xpath('.//meta[#itemprop="ratingValue"]/#content').extract()
item['reviewDate'] = review.xpath('.//meta[#itemprop="datePublished"]/#content').extract()
item['reviewText'] = review.xpath('.//p[#itemprop="description"]/text()').extract()
item['url'] = response.url
items.append(item)
return items
if response.url.find('?start=') == -1:
requests += createRestaurantPageLinks(self, response)
return requests
However, the problem I'm running into is that this particular script scrapes every page of every requested review EXCEPT for the first page. If I comment out the last "if" statement, it only scrapes the FIRST page. I suspect all I need is a simple "else" command but I am stumped... help is greatly appreciated!
EDIT: This is the code as it currently stands based on assistance received...
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re
from yelp2.items import YelpReviewItem
RESTAURANTS = ['sixteen-chicago']
def createRestaurantPageLinks(self, response):
reviewsPerPage = 40
sel = Selector(response)
totalReviews = int(sel.xpath('//div[#class="rating-info clearfix"]//span[#itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse) for n in range(totalReviews/reviewsPerPage)]
return pages
class Yelp2aSpider(Spider):
name = "yelp2a"
allowed_domains = ["yelp.com"]
start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]
def parse(self, response):
requests = []
sel = Selector(response)
reviews = sel.xpath('//div[#class="review review-with-no-actions"]')
items = []
for review in reviews:
item = YelpReviewItem()
item['venueName'] = sel.xpath('//meta[#property="og:title"]/#content').extract()
item['reviewer'] = review.xpath('.//li[#class="user-name"]/a/text()').extract()
item['reviewerLoc'] = review.xpath('.//li[#class="user-location"]/b/text()').extract()
item['rating'] = review.xpath('.//meta[#itemprop="ratingValue"]/#content').extract()
item['reviewDate'] = review.xpath('.//meta[#itemprop="datePublished"]/#content').extract()
item['reviewText'] = review.xpath('.//p[#itemprop="description"]/text()').extract()
item['url'] = response.url
yield item
if response.url.find('?start=') == -1:
requests += createRestaurantPageLinks(self, response)
for request in requests:
yield request
As mentioned in a comment below, running this code as-is crawls every desired page, but it only returns one review per page rather than all of them.
I tried Changing yield item to yield items, but an error message of ERROR: Spider must return Request, BaseItem or None, got 'list' in <GET http://www.yelp.com/biz/[...]> is returned for every URL crawled.

You need to reorganize the methods a bit. First parse restaurant page in parse() method. Then, return requests for reviews and handle responses in another method, e.g. parse_review():
import re
from scrapy.item import Item, Field
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from yelp2.items import YelpReviewItem
RESTAURANTS = ['sixteen-chicago']
class Yelp2aSpider(Spider):
name = "yelp2a"
allowed_domains = ["yelp.com"]
start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]
def parse(self, response):
reviewsPerPage = 40
sel = Selector(response)
totalReviews = int(sel.xpath('//div[#class="rating-info clearfix"]//span[#itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse_review) for n in range(totalReviews/reviewsPerPage)]
return pages
def parse_review(self, response):
sel = Selector(response)
reviews = sel.xpath('//div[#class="review review-with-no-actions"]')
for review in reviews:
item = YelpReviewItem()
item['venueName'] = sel.xpath('//meta[#property="og:title"]/#content').extract()
item['reviewer'] = review.xpath('.//li[#class="user-name"]/a/text()').extract()
item['reviewerLoc'] = review.xpath('.//li[#class="user-location"]/b/text()').extract()
item['rating'] = review.xpath('.//meta[#itemprop="ratingValue"]/#content').extract()
item['reviewDate'] = review.xpath('.//meta[#itemprop="datePublished"]/#content').extract()
item['reviewText'] = review.xpath('.//p[#itemprop="description"]/text()').extract()
item['url'] = response.url
yield item

If you're returning items/requests in more than one place, you should replace your return statements with yield statements, which turn your function into a generator, which returns a new element each time it's generated (yields it), without exiting the function until they are all returned. Otherwise, as your code is now, your function will exit after the first return and won't get to sending the requests for the following pages.
Edit: Correction - you should yield one item/request at a time, so:
Replace
for review in reviews:
item = ...
return items
with
for review in reviews:
item = ...
yield item
and replace
return requests
with
for request in requests:
yield request

The final answer did indeed lie in the indentation of one single yield line. This is the code that ended up doing what I needed it to do.
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re
from yelp2.items import YelpReviewItem
RESTAURANTS = ['sixteen-chicago']
def createRestaurantPageLinks(self, response):
reviewsPerPage = 40
sel = Selector(response)
totalReviews = int(sel.xpath('//div[#class="rating-info clearfix"]//span[#itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse) for n in range(totalReviews/reviewsPerPage)]
return pages
class YelpXSpider(Spider):
name = "yelpx"
allowed_domains = ["yelp.com"]
start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]
def parse(self, response):
requests = []
sel = Selector(response)
reviews = sel.xpath('//div[#class="review review-with-no-actions"]')
items = []
for review in reviews:
item = YelpReviewItem()
item['venueName'] = sel.xpath('//meta[#property="og:title"]/#content').extract()
item['reviewer'] = review.xpath('.//li[#class="user-name"]/a/text()').extract()
item['reviewerLoc'] = review.xpath('.//li[#class="user-location"]/b/text()').extract()
item['rating'] = review.xpath('.//meta[#itemprop="ratingValue"]/#content').extract()
item['reviewDate'] = review.xpath('.//meta[#itemprop="datePublished"]/#content').extract()
item['reviewText'] = review.xpath('.//p[#itemprop="description"]/text()').extract()
item['url'] = response.url
yield item
if response.url.find('?start=') == -1:
requests += createRestaurantPageLinks(self, response)
for request in requests:
yield request
Thanks to everyone for helping out a noob!

Why scrapy is not going to next page and only getting first page items?

Earlier I had one rule also i.e.
if domains in departments.keys():rules = (Rule(SgmlLinkExtractor(allow=("?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=d_d" %(keyword,departments.get(domains)),),restrict_xpaths=('//li[#class="btn-nextResults"]'),),callback='parse',follow=True),),
but I removed it as it was calling parse method which is not recommended.
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from walmart_sample.items import WalmartSampleItem
class MySpider(CrawlSpider):
name = "my_spider"
domains = ['All Departments']
keyword = 'Laptop'
departments = {"All Departments": "0", "Apparel": "5438", "Auto": "91083", "Baby": "5427", "Beauty": "1085666","Books": "3920", "Electronics": "3944", "Gifts": "1094765", "Grocery": "976759", "Health": "976760","Home": "4044", "Home Improvement": "1072864", "Jwelery": "3891", "Movies": "4096", "Music": "4104","Party": "2637", "Patio": "5428", "Pets": "5440", "Pharmacy": "5431", "Photo Center": "5426","Sports": "4125", "Toys": "4171", "Video Games": "2636"}
allowed_domains = ['walmart.com']
denied_domains = ['reviews.walmart.com','facebook.com','twitter.com']
def start_requests(self):
for domain in self.domains:
if domain in self.departments:
url = 'http://www.walmart.com/search/search-ng.do?search_query=%s&ic=16_0&Find=Find&search_constraint=%s' % (self.keyword, self.departments.get(domain))
yield Request(url)
def parse(self, response):
hxs = HtmlXPathSelector(response)
links = hxs.select('//a[#class="prodLink ListItemLink"]/#href')
for link in links:
href = link.extract()
yield Request('http://www.walmart.com/' + href, self.parse_data)
next_link = hxs.select('//li[#class="btn-nextResults"]/#href').extract()
if next_link:
yield Request('http://www.walmart.com/search/search-ng.do' + next_link, self.parse)
else:
print "last Page"
def parse_data(self, response):
hxs = HtmlXPathSelector(response)
items=[]
walmart=WalmartSampleItem()
walmart['Title']=hxs.select('//h1[#class="productTitle"]/text()').extract()
walmart['Price']=hxs.select('//span[#class="bigPriceText1"]/text()').extract()+hxs.select('//span[#class="smallPriceText1"]/text()').extract()
walmart['Availability']=hxs.select('//span[#id="STORE_AVAIL"]/text()').extract()
walmart['Description']=hxs.select('//span[#class="ql-details-short-desc"]/text()').extract()
items.append(walmart)
return items

I think you're simply missing an "/a" step in your XPath for next page links:
def parse(self, response):
hxs = HtmlXPathSelector(response)
links = hxs.select('//a[#class="prodLink ListItemLink"]/#href')
for link in links:
href = link.extract()
yield Request('http://www.walmart.com/' + href, self.parse_data)
#
# here
# |
# v
next_link = hxs.select('//li[#class="btn-nextResults"]/a/#href').extract()
if next_link:
# and as hxs.select() will return a list, you should select the first element
yield Request('http://www.walmart.com/search/search-ng.do' + next_link[0], self.parse)
else:
print "last Page"

scrapy: A tiny "spider" in a spider?

So when i try to scrape product review info from epinions.com, if the main review text is too long, it has a "read more" link to another page.
I took an example from "http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1" you'll see what i mean if you look at the first review.
I am wondering: is it possible to have a tiny spider in each iteration of the for loop to grab the url and scrape the review out of the new link? I have the following code, but it doesn't work for the tiny "spider".
Here is my code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from epinions_test.items import EpinionsTestItem
from scrapy.http import Response, HtmlResponse
class MySpider(BaseSpider):
name = "epinions"
allow_domains = ["epinions.com"]
start_urls = ['http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1']
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[#class="review_info"]')
items = []
for sites in sites:
item = EpinionsTestItem()
item["title"] = sites.select('h2/a/text()').extract()
item["star"] = sites.select('span/a/span/#title').extract()
item["date"] = sites.select('span/span/span/#title').extract()
item["review"] = sites.select('p/span/text()').extract()
# Everything works fine and i do have those four columns beautifully printed out, until....
url2 = sites.select('p/span/a/#href').extract()
url = str("http://www.epinions.com%s" %str(url2)[3:-2])
# This url is a string. when i print it out, it's like "http://www.epinions.com/review/samsung-galaxy-note-16-gb-cell-phone/content_624031731332", which looks legit.
response2 = HtmlResponse(url)
# I tried in a scrapy shell, it shows that this is a htmlresponse...
hxs2 = HtmlXPathSelector(response2)
fullReview = hxs2.select('//div[#class = "user_review_full"]')
item["url"] = fullReview.select('p/text()').extract()
# The three lines above works in an independent spider, where start_url is changed to the url just generated and everything.
# However, i got nothing from item["url"] in this code.
items.append(item)
return items
Why item["url"] returns nothing?
Thanks!

You should instantiate a new Request in the callback and pass your item in the meta dict:
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class EpinionsTestItem(Item):
title = Field()
star = Field()
date = Field()
review = Field()
class MySpider(BaseSpider):
name = "epinions"
allow_domains = ["epinions.com"]
start_urls = ['http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1']
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[#class="review_info"]')
for sites in sites:
item = EpinionsTestItem()
item["title"] = sites.select('h2/a/text()').extract()
item["star"] = sites.select('span/a/span/#title').extract()
item["date"] = sites.select('span/span/span/#title').extract()
url = sites.select('p/span/a/#href').extract()
url = str("http://www.epinions.com%s" % str(url)[3:-2])
yield Request(url=url, callback=self.parse_url2, meta={'item': item})
def parse_url2(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
fullReview = hxs.select('//div[#class = "user_review_full"]')
item["review"] = fullReview.select('p/text()').extract()
yield item
Also see documentation.
Hope that helps.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

scrapy not following links - python

Related

Python Scrapy keep getting same page link from next page button

Scraping/crawling multiple pages

Scrapy spider not including all requested pages

Why scrapy is not going to next page and only getting first page items?

scrapy: A tiny "spider" in a spider?

Categories

Resources