Scrapy skipping urls

Scrapy skipping urls - python

I am using a set of urls that i scraped earlier and using scrapy to loop over it and send it to another function which will iterate through all the pages corresponding to that link and scrape amazon reviews till there is a next button present, but it is skipping many URLs from the start_requests function. Here is the code
import scrapy
from scrapy import Request
import logging
import json
from scrapy.selector import Selector
import pandas as pd
from nordvpn_switcher import initialize_VPN,rotate_VPN,terminate_VPN
class AmazonrSpider(scrapy.Spider):
name = 'amazonr'
allowed_domains = ['*']
start_urls = ['http://amazon.com/']
df = pd.read_csv(r'C:\Amazon Reviews scraper(part1)\Scraper\reviewsScraper\cleanedProducts.csv')
link_asin = list(zip(df.SeeAllReviews,df.asin))[:1000]
counter = 0
settings = initialize_VPN(save=1,area_input=['complete rotation'])
rotate_VPN(settings)
def start_requests(self):
for pair in self.link_asin: """these URLs are being skipped"""
request = Request(pair[0],callback=self.parseReview,dont_filter=True, meta={'item': pair[1]})
yield request
def parseReview(self, response):
asin = response.meta['item']
reviewTitle = response.xpath('//*[#class="a-section a-spacing-none review-views celwidget"]//*[#data-hook="review-title"]//text()').extract()
reviewTitle = [i.strip() for i in reviewTitle if i.strip() != '' ]
reviewRatings = response.xpath('//*[#class="a-section a-spacing-none review-views celwidget"]//*[#data-hook="review-star-rating"]//text()').extract()
reviewText = response.xpath('//*[#class="a-section a-spacing-none review-views celwidget"]//*[#data-hook="review-body"]//text()').extract()
reviewText = [i.strip() for i in reviewText if i.strip() != '']
reviewDate = response.xpath('//*[#data-hook="review-date"]/text()').extract()
all_v = list(zip(reviewTitle,reviewText,reviewRatings,reviewDate))
for i in all_v:
rtitle = i[0]
rtext = i[1]
rrating = i[2]
rdate = i[3]
dict_ = {"asin":asin, "reviewTitle":rtitle,"reviewText":rtext,"reviewRatings":rrating,"reviewDate":rdate}
with open("Reviews.json","a") as fl:
json.dump(dict_,fl)
fl.write('\n')
nextpage = response.xpath('//*[#class="a-last"]//#href').extract_first() # nextpage link
self.counter = self.counter + 1
if self.counter%250 == 0 : # to rotate proxy after every 250 rotations
settings = initialize_VPN(save=1,area_input=['complete rotation'])
rotate_VPN(settings)
if str(nextpage) != 'None': # go to next page
nextpage = "https://www.amazon.com" + nextpage
yield Request(nextpage,callback=self.parseReview,dont_filter=True,meta={'item': asin})
else:
pass```

Related

Scrapy Spider scraping same thing multiple times and missing other items

When I run my scrapy spider to scrape comments on the steam platform, It is missing a lot of comments and is scraping the same comments several times. What is wrong with my code?
import scrapy
from scrapy import Request, FormRequest
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
import json
from scrapy import Selector
import re
class Workshop_Item(Item):
app_id = Field()
workshop_id = Field()
game = Field()
workshop_name = Field()
user = Field()
comment = Field()
user_level = Field()
date_posted = Field()
user_location = Field()
number_of_badges = Field()
user_join_date = Field()
is_author = Field()
user_experience = Field()
class Workshop_Comment_Spider(scrapy.Spider):
name = "comments"
with open("output/workshop_comment_links.txt") as f:
urls = [line.rstrip("\n") for line in f]
start_urls = urls
def parse(self, response):
if int(max(response.css('span.tabCount::text').getall())) > 0 and "profiles" in response.css('a.commentthread_author_link::attr(href)').get():
contributor_id = re.search(r'Public_(.*?)_' , response.css('div.commentthread_footer a::attr(id)').get()).group(1)
elif int(max(response.css('span.tabCount::text').getall())) > 0:
contributor_id = re.search(r'Public_(.*?)_' , response.css('div.commentthread_footer a::attr(id)').get()).group(1)
workshop_id_number = response.css('form.smallForm > input::attr(value)').get()
if int(max(response.css('span.tabCount::text').getall())) > 50:
comment_number = max(response.css('span.tabCount::text').getall())
url = f'https://steamcommunity.com/comment/PublishedFile_Public/render/{contributor_id}/{workshop_id_number}/'
data = {
"start": "1",
"totalcount": comment_number,
"count": comment_number,
"sessionid": "d880ab2338b70926db0a9591",
"extended_data": "{\"contributors\":[\"" + contributor_id +"\",{}],\"appid\":289070,\"sharedfile\":{\"m_parentsDetails\":null,\"m_parentBundlesDetails\":null,\"m_bundledChildren\":[],\"m_ownedBundledItems\":[]},\"parent_item_reported\":false}",
"feature2": "-1"
}
app_id = response.css('div.apphub_HeaderTop a::attr(data-appid)').get()
game = response.css(".apphub_AppName::text").get()
workshop_id = response.css('form.smallForm input::attr(value)').get()
workshop_name = response.css(".workshopItemTitle::text").get()
yield FormRequest(url, formdata=data, callback=self.parse_paginated_comments, meta={'app_id': app_id, 'game': game, 'workshop_id': workshop_id, 'workshop_name': workshop_name})
else:
for comment in response.css(".commentthread_comment"):
item = Workshop_Item()
item['is_author'] = False
if "authorbadge" in comment.get():
item['is_author'] = True
item['app_id'] = response.css('div.apphub_HeaderTop a::attr(data-appid)').get()
item['workshop_id'] = response.css('form.smallForm input::attr(value)').get()
item['game'] = response.css(".apphub_AppName::text").get()
item['workshop_name'] = response.css(".workshopItemTitle::text").get()
item['user'] = comment.css("bdi::text").get()
item['comment'] = ",".join(comment.css(".commentthread_comment_text::text").getall()).replace('\n', ' ').replace('\t', '').replace('\r', ' ')
item['date_posted'] = comment.css(".commentthread_comment_timestamp::attr(title)").get()
item['user_level'] = -1
user_profile = comment.css(".commentthread_author_link::attr(href)").get()
request = Request(user_profile, callback=self.parse_user_info, meta={'item': item})
yield request
def parse_user_info(self, response):
item = response.meta['item']
if response.css('.profile_private_info'):
item['user_level'] = 'private'
item['user_location'] = 'private'
item['number_of_badges'] = 'private'
item['user_join_date'] = 'private'
item['user_experience'] = 'private'
return item
else:
item['user_level'] = response.css(".friendPlayerLevelNum::text").get()
if response.css('.header_real_name') and response.css("img.profile_flag"):
item['user_location'] = response.css('.header_real_name::text').getall()[2].strip()
else:
item['user_location'] = 'NA'
if response.css("div.profile_badges span.profile_count_link_total::text"):
item['number_of_badges'] = response.css("div.profile_badges span.profile_count_link_total::text").get().strip()
else:
item['number_of_badges'] = 'NA'
user_badge_page = response.css("div.profile_header_badgeinfo_badge_area > a::attr(href)").get() + "/1"
request = Request(user_badge_page, callback=self.parse_badge_info, meta={'item': item})
yield request
def parse_badge_info(self, response):
item = response.meta['item']
if response.css("div.badge_description"):
item['user_join_date'] = response.css("div.badge_description::text").get().strip()
experience_page = response.css('a.whiteLink.persona_name_text_content::attr(href)').get() + "/badges"
request = Request(experience_page, callback=self.parse_experience_page, meta={'item': item})
yield request
def parse_experience_page(self, response):
item = response.meta['item']
if response.css('span.profile_xp_block_xp'):
item['user_experience'] = response.css('span.profile_xp_block_xp::text').get()
return item
def parse_paginated_comments(self, response):
app_id = response.meta['app_id']
game = response.meta['game']
workshop_id = response.meta['workshop_id']
workshop_name = response.meta['workshop_name']
jsonresponse = json.loads(response.body.decode("utf-8"))
sel = Selector(text=jsonresponse['comments_html'])
for comment in sel.css(".commentthread_comment"):
item = Workshop_Item()
item['is_author'] = False
if "authorbadge" in comment.get():
item['is_author'] = True
item['app_id'] = app_id #sel.css('div.apphub_HeaderTop a::attr(data-appid)').get()
item['workshop_id'] = workshop_id #sel.css('form.smallForm input::attr(value)').get()
item['game'] = game #sel.css(".apphub_AppName::text").get()
item['workshop_name'] = workshop_name #sel.css(".workshopItemTitle::text").get()
item['user'] = comment.css("bdi::text").get()
item['comment'] = ",".join(comment.css(".commentthread_comment_text::text").getall()).replace('\n', ' ').replace('\t', '').replace('\r', ' ')
item['date_posted'] = comment.css(".commentthread_comment_timestamp::attr(title)").get()
item['user_level'] = -1
user_profile = sel.css(".commentthread_author_link::attr(href)").get()
request = Request(user_profile, callback=self.parse_user_info, meta={'item': item})
yield request
I am scraping a comment from a page, and then going to the user's profile to collect user data. If the page has pagination (>50 comments), I am sending a post request to the retrieve the json that contains the html for all of the comments, and then scraping that.

Fix it, the problem was here:
def parse_paginated_comments(self, response):
app_id = response.meta['app_id']
game = response.meta['game']
workshop_id = response.meta['workshop_id']
workshop_name = response.meta['workshop_name']
jsonresponse = json.loads(response.body.decode("utf-8"))
sel = Selector(text=jsonresponse['comments_html'])
for comment in sel.css(".commentthread_comment"):
item = Workshop_Item()
item['is_author'] = False
I needed to change
for comment in sel.css(".commentthread_comment"):
to
for comment in comment.css(".commentthread_comment"):
and I needed to add
DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'
To the settings.py file.

Scrapy Request are not crawling all urls return

recently had this project of crawling google play store apps, for vietnam region, and realized that the request doesn't run the callback function for all URLs that haven been return.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http.request import Request
from urllib.parse import urlparse, parse_qsl, urlencode
import scrapy
class GooglePlayStoreSpider(CrawlSpider):
name = 'google_play'
allowed_domains = ['play.google.com']
start_urls = ['http://play.google.com']
rules = (
Rule(LinkExtractor(allow=('https://play.google.com/store/apps/details')), follow=True,
process_links='process_links',
callback='parse_1'),
)
crawled_ids = []
first_init = False
def parse_start_url(self, response):
# print("-------------- PRINTING SECTION START_URL --------------")
if not self.first_init:
self.first_init = True
extractor = LinkExtractor(allow=('/store/apps/category/.*',))
raw_links = extractor.extract_links(response)
links = self.process_links(raw_links)
return [
scrapy.Request('{}'.format(link.url))
for link in links
]
else:
# print("============ START_URL ELSE PART ============")
pass
def process_links(self, links):
new_links = []
for link in links:
old_url = link.url
if not old_url.startswith('https://play.google.com/store/apps/'):
continue
old_url_obj = urlparse(old_url)
old_url_query = dict(parse_qsl(old_url_obj.query))
if old_url_obj.path == '/store/apps/details':
if old_url_query['id'] in self.crawled_ids:
continue
else:
self.crawled_ids.append(old_url_query['id'])
old_url_query['hl'] = 'en'
old_url_query['gl'] = 'vn'
link.url = '{}://{}{}?{}'.format(old_url_obj.scheme, old_url_obj.netloc, old_url_obj.path,
urlencode(old_url_query))
new_links.append(link)
# print("LINKKSSS ====", links)
# print("NEW_LINKKSSS ====", new_links)
# print("-------------- PRINTING SECTION PROCESS_LINKS --------------")
return new_links
def parse_1(self, response):
selector = scrapy.Selector(response)
urls = selector.xpath('//a[#class="LkLjZd ScJHi U8Ww7d xjAeve nMZKrb id-track-click "]/#href').extract()
links = []
for url in urls:
if not url.startswith('https://play.google.com/'):
url = "https://play.google.com" + url
links.append(url)
link_flag = 0
for url in urls:
# yield links_list.append(scrapy.Request(url, callback=self.parse_next, dont_filter=True))
yield Request(links[link_flag], callback=self.parse_next, dont_filter=True)
link_flag += 1
def parse_next(self, response):
# print("PARSE_NEXT ===========", response.request.url)
selector = scrapy.Selector(response)
app_urls = selector.xpath('//div[#class="details"]/a[#class="title"]/#href').extract()
urls = []
for url in app_urls:
url = "https://play.google.com" + url + '&hl=en&gl=vn'
urls.append(url)
url_list = []
link_flag = 0
for url in app_urls:
yield Request(urls[link_flag], callback=self.parse_detail, dont_filter=True)
link_flag += 1
# return url_list
def parse_detail(self, response):
print("Parsed ======= ", response.request.url)
item = dict()
item['name'] = response.xpath('//div[#itemscope]//meta[#itemprop="name"]/#content').extract_first()
item['category'] = response.xpath(
'//div[#itemscope]//meta[#itemprop="applicationCategory"]/#content').extract_first()
item['review_score'] = response.xpath(
'//div[#itemscope]//meta[#itemprop="ratingValue"]/#content').extract_first()
item['review_count'] = response.xpath(
'//div[#itemscope]//meta[#itemprop="reviewCount"]/#content').extract_first()
item['link'] = response.request.url
item['id'] = dict(parse_qsl(urlparse(response.request.url).query))['id']
item['content_rating'] = response.xpath(
'//div[#itemscope]//meta[#itemprop="contentRating"]/#content').extract_first()
item['image'] = response.xpath('//div[#itemscope]//meta[#itemprop="image"]/#content').extract_first()
item['price'] = response.xpath('//div[#itemscope]//meta[#itemprop="price"]/#content').extract_first()
item['price_currency'] = response.xpath(
'//div[#itemscope]//meta[#itemprop="priceCurrency"]/#content').extract_first()
# item['operating_system'] = response.xpath('//div[#itemscope]//meta[#itemprop="operatingSystem"]/#content').extract_first()
return item
As i run into the terminal, it says that it crawled 100 pages and scraped only 15 pages, (numbers are for estimate).
Please help

Scrapy spider not including all requested pages

I have a Scrapy script for Yelp that is, for the most part, working. Essentially I can supply it with a list of Yelp pages and it should return all reviews from all pages. The script so far is below:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re
from yelp2.items import YelpReviewItem
RESTAURANTS = ['sixteen-chicago']
def createRestaurantPageLinks(self, response):
reviewsPerPage = 40
sel = Selector(response)
totalReviews = int(sel.xpath('//div[#class="rating-info clearfix"]//span[#itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse) for n in range(totalReviews/reviewsPerPage)]
return pages
class Yelp2aSpider(Spider):
name = "yelp2a"
allowed_domains = ["yelp.com"]
start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]
def parse(self, response):
requests = []
sel = Selector(response)
reviews = sel.xpath('//div[#class="review review-with-no-actions"]')
items = []
for review in reviews:
item = YelpReviewItem()
item['venueName'] = sel.xpath('//meta[#property="og:title"]/#content').extract()
item['reviewer'] = review.xpath('.//li[#class="user-name"]/a/text()').extract()
item['reviewerLoc'] = review.xpath('.//li[#class="user-location"]/b/text()').extract()
item['rating'] = review.xpath('.//meta[#itemprop="ratingValue"]/#content').extract()
item['reviewDate'] = review.xpath('.//meta[#itemprop="datePublished"]/#content').extract()
item['reviewText'] = review.xpath('.//p[#itemprop="description"]/text()').extract()
item['url'] = response.url
items.append(item)
return items
if response.url.find('?start=') == -1:
requests += createRestaurantPageLinks(self, response)
return requests
However, the problem I'm running into is that this particular script scrapes every page of every requested review EXCEPT for the first page. If I comment out the last "if" statement, it only scrapes the FIRST page. I suspect all I need is a simple "else" command but I am stumped... help is greatly appreciated!
EDIT: This is the code as it currently stands based on assistance received...
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re
from yelp2.items import YelpReviewItem
RESTAURANTS = ['sixteen-chicago']
def createRestaurantPageLinks(self, response):
reviewsPerPage = 40
sel = Selector(response)
totalReviews = int(sel.xpath('//div[#class="rating-info clearfix"]//span[#itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse) for n in range(totalReviews/reviewsPerPage)]
return pages
class Yelp2aSpider(Spider):
name = "yelp2a"
allowed_domains = ["yelp.com"]
start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]
def parse(self, response):
requests = []
sel = Selector(response)
reviews = sel.xpath('//div[#class="review review-with-no-actions"]')
items = []
for review in reviews:
item = YelpReviewItem()
item['venueName'] = sel.xpath('//meta[#property="og:title"]/#content').extract()
item['reviewer'] = review.xpath('.//li[#class="user-name"]/a/text()').extract()
item['reviewerLoc'] = review.xpath('.//li[#class="user-location"]/b/text()').extract()
item['rating'] = review.xpath('.//meta[#itemprop="ratingValue"]/#content').extract()
item['reviewDate'] = review.xpath('.//meta[#itemprop="datePublished"]/#content').extract()
item['reviewText'] = review.xpath('.//p[#itemprop="description"]/text()').extract()
item['url'] = response.url
yield item
if response.url.find('?start=') == -1:
requests += createRestaurantPageLinks(self, response)
for request in requests:
yield request
As mentioned in a comment below, running this code as-is crawls every desired page, but it only returns one review per page rather than all of them.
I tried Changing yield item to yield items, but an error message of ERROR: Spider must return Request, BaseItem or None, got 'list' in <GET http://www.yelp.com/biz/[...]> is returned for every URL crawled.

You need to reorganize the methods a bit. First parse restaurant page in parse() method. Then, return requests for reviews and handle responses in another method, e.g. parse_review():
import re
from scrapy.item import Item, Field
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from yelp2.items import YelpReviewItem
RESTAURANTS = ['sixteen-chicago']
class Yelp2aSpider(Spider):
name = "yelp2a"
allowed_domains = ["yelp.com"]
start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]
def parse(self, response):
reviewsPerPage = 40
sel = Selector(response)
totalReviews = int(sel.xpath('//div[#class="rating-info clearfix"]//span[#itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse_review) for n in range(totalReviews/reviewsPerPage)]
return pages
def parse_review(self, response):
sel = Selector(response)
reviews = sel.xpath('//div[#class="review review-with-no-actions"]')
for review in reviews:
item = YelpReviewItem()
item['venueName'] = sel.xpath('//meta[#property="og:title"]/#content').extract()
item['reviewer'] = review.xpath('.//li[#class="user-name"]/a/text()').extract()
item['reviewerLoc'] = review.xpath('.//li[#class="user-location"]/b/text()').extract()
item['rating'] = review.xpath('.//meta[#itemprop="ratingValue"]/#content').extract()
item['reviewDate'] = review.xpath('.//meta[#itemprop="datePublished"]/#content').extract()
item['reviewText'] = review.xpath('.//p[#itemprop="description"]/text()').extract()
item['url'] = response.url
yield item

If you're returning items/requests in more than one place, you should replace your return statements with yield statements, which turn your function into a generator, which returns a new element each time it's generated (yields it), without exiting the function until they are all returned. Otherwise, as your code is now, your function will exit after the first return and won't get to sending the requests for the following pages.
Edit: Correction - you should yield one item/request at a time, so:
Replace
for review in reviews:
item = ...
return items
with
for review in reviews:
item = ...
yield item
and replace
return requests
with
for request in requests:
yield request

The final answer did indeed lie in the indentation of one single yield line. This is the code that ended up doing what I needed it to do.
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re
from yelp2.items import YelpReviewItem
RESTAURANTS = ['sixteen-chicago']
def createRestaurantPageLinks(self, response):
reviewsPerPage = 40
sel = Selector(response)
totalReviews = int(sel.xpath('//div[#class="rating-info clearfix"]//span[#itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse) for n in range(totalReviews/reviewsPerPage)]
return pages
class YelpXSpider(Spider):
name = "yelpx"
allowed_domains = ["yelp.com"]
start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]
def parse(self, response):
requests = []
sel = Selector(response)
reviews = sel.xpath('//div[#class="review review-with-no-actions"]')
items = []
for review in reviews:
item = YelpReviewItem()
item['venueName'] = sel.xpath('//meta[#property="og:title"]/#content').extract()
item['reviewer'] = review.xpath('.//li[#class="user-name"]/a/text()').extract()
item['reviewerLoc'] = review.xpath('.//li[#class="user-location"]/b/text()').extract()
item['rating'] = review.xpath('.//meta[#itemprop="ratingValue"]/#content').extract()
item['reviewDate'] = review.xpath('.//meta[#itemprop="datePublished"]/#content').extract()
item['reviewText'] = review.xpath('.//p[#itemprop="description"]/text()').extract()
item['url'] = response.url
yield item
if response.url.find('?start=') == -1:
requests += createRestaurantPageLinks(self, response)
for request in requests:
yield request
Thanks to everyone for helping out a noob!

how to extract certain string from URL

I am trying to extract certain strings from the below mentioned URL :
sample URL :
http://www.ladyblush.com/buy-sarees-online.html?p=1
http://www.ladyblush.com/buy-ladies-suits-online.html?p=1
http://www.ladyblush.com/buy-women-fashion-accessories.html?p=1
i want to extract :
productCategory = "sarees" productSubCategory = ""
productCategory = "ladies" productSubCategory = "suits"
productCategory = "women" productSubCategory = "fashion-accessories"
And so on. Actually i am writing a spider and i need to extract productCategory and productSubCategory from URL's like above mentioned..so i am trying to extract these fields inside parse method from response.url. Can someone help me out please
My code :
import re
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider
#------------------------------------------------------------------------------
class ESpider(CrawlSpider):
name = "ladyblushSpider"
allowed_domains = ["ladyblush.com"]
URLSList = []
for n in range (1,100):
URLSList.append('http://www.ladyblush.com/buy-sarees-online.html?p=' + str(n))
URLSList.append('http://www.ladyblush.com/buy-ladies-suits-online.html?p=' + str(n))
URLSList.append('http://www.ladyblush.com/buy-women-fashion-accessories.html?p=' + str(n))
URLSList.append('http://www.ladyblush.com/buy-nightwear-lingerie-online.html?p=' + str(n))
URLSList.append('http://www.ladyblush.com/buy-women-dress-online-skirts-suits-kurtis-tops.html?p=' + str(n))
URLSList.append('http://www.ladyblush.com/buy-decor-online-wallclock-bedsheets-cushions-bedcovers.html?p=' + str(n))
URLSList.append('http://www.ladyblush.com/buy-cosmetics-online-massage-oils-aromatherapy-perfumes-soaps.html?p=' + str(n))
URLSList.append('http://www.ladyblush.com/buy-jewelery-online-art-fashion-semi-precious-antique-junk-jewellery.html?p=' + str(n))
start_urls = URLSList
def parse(self, response):
item = EscraperItem()
hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[#class="third thumbnailSpillLarge"]')
items = []
for site in sites:
item = EscraperItem()
item['currency'] = 'INR'
item['productCategory'] = [""]
item['productSubCategory'] = [""]
item['productSite'] = ["http://ladyblush.com/"]
item['productImage'] = site.select('./a/div/img/#src').extract()
item['productTitle'] = site.select('./a/div/img/#title').extract()
item['productURL'] = [site.select('./a/#href').extract()[0].replace(" ","%20")]
productMRP = site.select('.//div[#class="salePrice"]//div[#class="price-box"]//p[#class="old-price"]//span[#class="price"]/text()').extract()
productPrice = site.select('.//div[#class="salePrice"]//div[#class="price-box"]//p[#class="special-price"]//span[#class="price"]/text()').extract()
if productMRP and productPrice:
price = [productMRP[1].strip()] + [productPrice[1].strip()]
else:
price = site.select('.//div[#class="salePrice"]//div[#class="price-box"]//span[#class="regular-price"]//span[#class="price"]/text()').extract()
item['productPrice'] = price
items.append(item)
secondURL = item['productURL'][0]
request = Request(secondURL,callback=self.parsePage2)
request.meta['item'] = item
yield request
def parsePage2(self, response):
hxs = HtmlXPathSelector(response)
item = response.meta['item']
if hxs.select('//div[#class="addtocart-container"]/div/text()').extract():
item['availability'] = False
else:
item['availability'] = True
if hxs.select('//label[#class="required"]/text()').extract():
item['hasVariants'] = True
else:
item['hasVariants'] = False
item['image_urls'] = list(set(item['productImage']))
item['productDesc'] = [" ".join([re.sub(r'[\t\n\r]',"",i.strip()) for i in hxs.select('//div[#class="std"]/text()').extract()])]
item['productImage'] = item['productImage'] + hxs.select('//div[#class="more-views"]/ul/li/a/img/#src').extract() + hxs.select('//div[#class="more-views"]/ul/li/a/#href').extract()
return item
#------------------------------------------------------------------------------

you can get the url from
response.url in the parse method. You could then parse that to just get the url path
import os
test = 'buy-women-fashion-accessories.html?p=1'
parts = os.path.splitext(test)
# ('buy-women-fashion-accessories', '.html?p=1')
parts[0].split('-')[1:]
# ['women', 'fashion', 'accessories']
This is rather flimsy solution though. Are you sure the data is not stored somewhere in the page's html that your are parsing, instead of looking at the url?

How to call Parse_page2 method for every item

I am trying to call parse_page2 method for every item. But every time i run this spider i am only getting single item per page so how do i call parse_page2 method for every item.
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider
#------------------------------------------------------------------------------
class ESpider(CrawlSpider):
name = "atisundarSpider"
allowed_domains = ["atisundar.com"]
URLSList = []
for n in range (1,20):
URLSList.append('http://atisundar.com/collections/sarees?page=' + str(n))
URLSList.append('http://atisundar.com/collections/salwar-suits?page=' + str(n))
start_urls = URLSList
def parse(self, response):
item = EscraperItem()
hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[#class="block product size-medium"]')
items = []
for site in sites:
item = EscraperItem()
item['productDesc'] = ""
item['productSite'] = ["http://atisundar.com/"]
item['productTitle'] = site.select('.//div[#class="main"]/a/#title').extract()
item['productURL'] = ["http://atisundar.com" + site.select('.//div[#class="main"]/a/#href').extract()[0].encode('utf-8')]
item['productPrice'] = site.select('.//p[#class="pricearea"]//span[#class="was-price"]/text()').extract() + site.select('.//p[#class="pricearea"]//span[#class="price"]/text()').extract()
item['productImage'] = [site.select('.//div[#class="main"]/a/img/#src').extract()[0].split('?')[0]] + [site.select('.//div[#class="main"]/a/img/#src').extract()[0].split('?')[0].replace("medium","grande")]
item['image_urls'] = item['productImage']
items.append(item)
secondURL = "http://admin.atisundar.com/store/skuDetails?product_id=" + site.select('.//div[#class="main"]/a/text()').extract()[1].strip().split("#")[-1]
request = Request(secondURL,
callback=self.parse_page2)
request.meta['item'] = item
return request
def parse_page2(self, response):
item = response.meta['item']
#item['other_url'] = response.url
return item

1) you are not using CrawlSpider functionality , i would recommend you to inherit your spider from BaseSpider
2) in for loop
for site in sites:
use yield rather then return , other wise it will break the loop in first iteration.
yield request
3) in parse_page2 get item from response.request.meta instead from response.meta
item = response.request.meta['item']
it should work now.
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
#------------------------------------------------------------------------------
from scrapy.spider import BaseSpider
class ESpider(BaseSpider):
name = "atisundarSpider"
allowed_domains = ["atisundar.com"]
URLSList = []
for n in range (1,20):
URLSList.append('http://atisundar.com/collections/sarees?page=' + str(n))
URLSList.append('http://atisundar.com/collections/salwar-suits?page=' + str(n))
start_urls = URLSList
def parse(self, response):
item = EscraperItem()
hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[#class="block product size-medium"]')
for site in sites:
item = EscraperItem()
item['productDesc'] = ""
item['productSite'] = ["http://atisundar.com/"]
item['productTitle'] = site.select('.//div[#class="main"]/a/#title').extract()
item['productURL'] = ["http://atisundar.com" + site.select('.//div[#class="main"]/a/#href').extract()[0].encode('utf-8')]
item['productPrice'] = site.select('.//p[#class="pricearea"]//span[#class="was-price"]/text()').extract() + site.select('.//p[#class="pricearea"]//span[#class="price"]/text()').extract()
item['productImage'] = [site.select('.//div[#class="main"]/a/img/#src').extract()[0].split('?')[0]] + [site.select('.//div[#class="main"]/a/img/#src').extract()[0].split('?')[0].replace("medium","grande")]
item['image_urls'] = item['productImage']
secondURL = "http://admin.atisundar.com/store/skuDetails?product_id=" + site.select('.//div[#class="main"]/a/text()').extract()[1].strip().split("#")[-1]
request = Request(secondURL,
callback=self.parse_page2)
request.meta['item'] = item
yield request
def parse_page2(self, response):
item = response.request.meta['item']
#item['other_url'] = response.url
return item

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrapy skipping urls - python

Related

Scrapy Spider scraping same thing multiple times and missing other items

Scrapy Request are not crawling all urls return

Scrapy spider not including all requested pages

how to extract certain string from URL

How to call Parse_page2 method for every item

Categories

Resources