Scrapy: calling another URL - Python

I am using Scrapy to scrape a website. I am getting all products from the listing page. Now I want to go to each product's URL, but I am not getting a satisfactory result.
Here is my code:
import scrapy
from scrapy.http import Request
from tutorial.items import DmozItem

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["test.com"]
    start_urls = [
        "http://www.test.com/?page=1"
    ]
    page_index = 1

    def parse(self, response):
        products = response.xpath('//li')
        items = []
        if products:
            for product in products:
                item = DmozItem()
                item['link'] = product.xpath('@data-url').extract()
                item['sku'] = product.xpath('@data-sku').extract()
                item['brand'] = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
                item['img'] = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
                page_url = "http://www.jabong.com/Lara-Karen-Black-Sweaters-893039.html"
                request = Request(url=page_url, callback=self.parse_page2,
                                  headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"})
                request.meta['item'] = item
                item['other'] = request  # <-- this stores the Request object itself on the item
                yield item
        else:
            return
        self.page_index += 1
        if self.page_index:
            yield Request(url="http://www.test.com/?page=%s" % (self.page_index),
                          headers={"Referer": "http://www.test.com/", "X-Requested-With": "XMLHttpRequest"},
                          callback=self.parse)

    def parse_page2(self, response):
        item = response.meta['item']
        item['title'] = response.xpath("//span[@id='before_price']/text()")
        yield item
The result I am getting is
    {"sku": [], "brand": [], "other": "<Request GET http://www.test.com/>", "link": [], "img": []}
Instead of the Request object, I need the data returned from the parse_page2 function. Where am I going wrong?

Your XPaths seem to be wrong here. Try this:
In [0]: products[0].xpath('./@data-url').extract()
Out[0]: [u'Sangria-Green-Kurtis-Kurtas-1081831.html']
In [1]: products[0].xpath('./a/@unbxdparam_sku').extract()
Out[1]: [u'SA038WA68OIXINDFAS']
In [2]: products[0].xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
Out[2]: [u'Sangria']
In [3]: products[0].xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
Out[3]: [u'http://static14.jassets.com/p/Sangria-Green--Kurtis-26-Kurtas-5520-1381801-1-catalog.jpg']
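These checks come from an interactive scrapy shell session; to reproduce them, a minimal sketch (the URL is the listing page used by the full spider below):

    scrapy shell "http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=1"
    >>> products = response.xpath('//li[@data-url]')
    >>> products[0].xpath('./@data-url').extract()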
so the code will be:
BASE_URL = 'http://www.jabong.com/'

for product in products:
    item = DmozItem()
    item_url = product.xpath('./@data-url').extract()
    item_url = self.BASE_URL + item_url[0] if item_url else ''
    item['link'] = product.xpath('./@data-url').extract()
    item['sku'] = product.xpath('./a/@unbxdparam_sku').extract()
    item['brand'] = product.xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
    item['img'] = product.xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
    if item_url:
        yield Request(url=item_url, callback=self.parse_page2,
                      headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"},
                      meta={'item': item})
EDIT
Complete spider code:
import scrapy
from scrapy.http import Request

class JabongItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    link = scrapy.Field()
    sku = scrapy.Field()
    brand = scrapy.Field()
    img = scrapy.Field()

class JabongSpider(scrapy.Spider):
    name = "jabong"
    allowed_domains = ["jabong.com"]
    start_urls = ["http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=1"]
    page_index = 1
    BASE_URL = 'http://www.jabong.com/'

    def parse(self, response):
        products = response.xpath("//li[@data-url]")
        if products:
            for product in products:
                link = product.xpath('@data-url').extract()
                link = self.BASE_URL + link[0] if link else ''
                sku = product.xpath('@data-sku').extract()
                sku = sku[0].strip() if sku else 'n/a'
                brand = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
                brand = brand[0].strip() if brand else 'n/a'
                img = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
                img = img[0].strip() if img else 'n/a'
                item = JabongItem()
                item['link'] = link
                item['sku'] = sku
                item['brand'] = brand
                item['img'] = img
                if link:
                    yield Request(url=link, callback=self.parse_page2, meta={'item': item})
        else:
            return
        self.page_index += 1
        yield Request(url="http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=%s" % self.page_index,
                      callback=self.parse, dont_filter=True)

    def parse_page2(self, response):
        item = response.meta['item']
        # add whatever extra details you want to item
        yield item
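With this layout the item is only yielded once it is complete, from parse_page2. To try it, assuming a standard Scrapy project layout:

    scrapy crawl jabong -o items.json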

Related

Scrapy Spider scraping same thing multiple times and missing other items

When I run my Scrapy spider to scrape comments on the Steam platform, it is missing a lot of comments and is scraping the same comments several times. What is wrong with my code?
import scrapy
from scrapy import Request, FormRequest
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
import json
from scrapy import Selector
import re

class Workshop_Item(Item):
    app_id = Field()
    workshop_id = Field()
    game = Field()
    workshop_name = Field()
    user = Field()
    comment = Field()
    user_level = Field()
    date_posted = Field()
    user_location = Field()
    number_of_badges = Field()
    user_join_date = Field()
    is_author = Field()
    user_experience = Field()

class Workshop_Comment_Spider(scrapy.Spider):
    name = "comments"

    with open("output/workshop_comment_links.txt") as f:
        urls = [line.rstrip("\n") for line in f]
    start_urls = urls

    def parse(self, response):
        if int(max(response.css('span.tabCount::text').getall())) > 0 and "profiles" in response.css('a.commentthread_author_link::attr(href)').get():
            contributor_id = re.search(r'Public_(.*?)_', response.css('div.commentthread_footer a::attr(id)').get()).group(1)
        elif int(max(response.css('span.tabCount::text').getall())) > 0:
            contributor_id = re.search(r'Public_(.*?)_', response.css('div.commentthread_footer a::attr(id)').get()).group(1)
        workshop_id_number = response.css('form.smallForm > input::attr(value)').get()
        if int(max(response.css('span.tabCount::text').getall())) > 50:
            comment_number = max(response.css('span.tabCount::text').getall())
            url = f'https://steamcommunity.com/comment/PublishedFile_Public/render/{contributor_id}/{workshop_id_number}/'
            data = {
                "start": "1",
                "totalcount": comment_number,
                "count": comment_number,
                "sessionid": "d880ab2338b70926db0a9591",
                "extended_data": "{\"contributors\":[\"" + contributor_id + "\",{}],\"appid\":289070,\"sharedfile\":{\"m_parentsDetails\":null,\"m_parentBundlesDetails\":null,\"m_bundledChildren\":[],\"m_ownedBundledItems\":[]},\"parent_item_reported\":false}",
                "feature2": "-1"
            }
            app_id = response.css('div.apphub_HeaderTop a::attr(data-appid)').get()
            game = response.css(".apphub_AppName::text").get()
            workshop_id = response.css('form.smallForm input::attr(value)').get()
            workshop_name = response.css(".workshopItemTitle::text").get()
            yield FormRequest(url, formdata=data, callback=self.parse_paginated_comments, meta={'app_id': app_id, 'game': game, 'workshop_id': workshop_id, 'workshop_name': workshop_name})
        else:
            for comment in response.css(".commentthread_comment"):
                item = Workshop_Item()
                item['is_author'] = False
                if "authorbadge" in comment.get():
                    item['is_author'] = True
                item['app_id'] = response.css('div.apphub_HeaderTop a::attr(data-appid)').get()
                item['workshop_id'] = response.css('form.smallForm input::attr(value)').get()
                item['game'] = response.css(".apphub_AppName::text").get()
                item['workshop_name'] = response.css(".workshopItemTitle::text").get()
                item['user'] = comment.css("bdi::text").get()
                item['comment'] = ",".join(comment.css(".commentthread_comment_text::text").getall()).replace('\n', ' ').replace('\t', '').replace('\r', ' ')
                item['date_posted'] = comment.css(".commentthread_comment_timestamp::attr(title)").get()
                item['user_level'] = -1
                user_profile = comment.css(".commentthread_author_link::attr(href)").get()
                request = Request(user_profile, callback=self.parse_user_info, meta={'item': item})
                yield request

    def parse_user_info(self, response):
        item = response.meta['item']
        if response.css('.profile_private_info'):
            item['user_level'] = 'private'
            item['user_location'] = 'private'
            item['number_of_badges'] = 'private'
            item['user_join_date'] = 'private'
            item['user_experience'] = 'private'
            return item
        else:
            item['user_level'] = response.css(".friendPlayerLevelNum::text").get()
            if response.css('.header_real_name') and response.css("img.profile_flag"):
                item['user_location'] = response.css('.header_real_name::text').getall()[2].strip()
            else:
                item['user_location'] = 'NA'
            if response.css("div.profile_badges span.profile_count_link_total::text"):
                item['number_of_badges'] = response.css("div.profile_badges span.profile_count_link_total::text").get().strip()
            else:
                item['number_of_badges'] = 'NA'
            user_badge_page = response.css("div.profile_header_badgeinfo_badge_area > a::attr(href)").get() + "/1"
            request = Request(user_badge_page, callback=self.parse_badge_info, meta={'item': item})
            yield request

    def parse_badge_info(self, response):
        item = response.meta['item']
        if response.css("div.badge_description"):
            item['user_join_date'] = response.css("div.badge_description::text").get().strip()
        experience_page = response.css('a.whiteLink.persona_name_text_content::attr(href)').get() + "/badges"
        request = Request(experience_page, callback=self.parse_experience_page, meta={'item': item})
        yield request

    def parse_experience_page(self, response):
        item = response.meta['item']
        if response.css('span.profile_xp_block_xp'):
            item['user_experience'] = response.css('span.profile_xp_block_xp::text').get()
        return item

    def parse_paginated_comments(self, response):
        app_id = response.meta['app_id']
        game = response.meta['game']
        workshop_id = response.meta['workshop_id']
        workshop_name = response.meta['workshop_name']
        jsonresponse = json.loads(response.body.decode("utf-8"))
        sel = Selector(text=jsonresponse['comments_html'])
        for comment in sel.css(".commentthread_comment"):
            item = Workshop_Item()
            item['is_author'] = False
            if "authorbadge" in comment.get():
                item['is_author'] = True
            item['app_id'] = app_id  # sel.css('div.apphub_HeaderTop a::attr(data-appid)').get()
            item['workshop_id'] = workshop_id  # sel.css('form.smallForm input::attr(value)').get()
            item['game'] = game  # sel.css(".apphub_AppName::text").get()
            item['workshop_name'] = workshop_name  # sel.css(".workshopItemTitle::text").get()
            item['user'] = comment.css("bdi::text").get()
            item['comment'] = ",".join(comment.css(".commentthread_comment_text::text").getall()).replace('\n', ' ').replace('\t', '').replace('\r', ' ')
            item['date_posted'] = comment.css(".commentthread_comment_timestamp::attr(title)").get()
            item['user_level'] = -1
            user_profile = sel.css(".commentthread_author_link::attr(href)").get()
            request = Request(user_profile, callback=self.parse_user_info, meta={'item': item})
            yield request
I am scraping a comment from a page, and then going to the user's profile to collect user data. If the page has pagination (>50 comments), I am sending a POST request to retrieve the JSON that contains the HTML for all of the comments, and then scraping that.
Fixed it; the problem was here:
def parse_paginated_comments(self, response):
    app_id = response.meta['app_id']
    game = response.meta['game']
    workshop_id = response.meta['workshop_id']
    workshop_name = response.meta['workshop_name']
    jsonresponse = json.loads(response.body.decode("utf-8"))
    sel = Selector(text=jsonresponse['comments_html'])
    for comment in sel.css(".commentthread_comment"):
        item = Workshop_Item()
        item['is_author'] = False
Inside that loop, I needed to change
    user_profile = sel.css(".commentthread_author_link::attr(href)").get()
to
    user_profile = comment.css(".commentthread_author_link::attr(href)").get()
so each comment's own author link is used instead of the first one on the page,
and I needed to add
    DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'
to the settings.py file.
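Disabling duplicate filtering globally is a blunt instrument; a lighter alternative (a sketch, not from the original post) is to exempt only the profile requests, since those are the URLs that legitimately repeat when several comments come from the same user:

    # keep the default dupe filter, but let repeated profile URLs through
    request = Request(user_profile, callback=self.parse_user_info,
                      meta={'item': item}, dont_filter=True)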

Scrapy Requests are not crawling all returned URLs

I recently had a project crawling Google Play Store apps for the Vietnam region, and realized that the requests don't run the callback function for all URLs that have been returned.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http.request import Request
from urllib.parse import urlparse, parse_qsl, urlencode
import scrapy

class GooglePlayStoreSpider(CrawlSpider):
    name = 'google_play'
    allowed_domains = ['play.google.com']
    start_urls = ['http://play.google.com']
    rules = (
        Rule(LinkExtractor(allow=('https://play.google.com/store/apps/details')), follow=True,
             process_links='process_links',
             callback='parse_1'),
    )
    crawled_ids = []
    first_init = False

    def parse_start_url(self, response):
        # print("-------------- PRINTING SECTION START_URL --------------")
        if not self.first_init:
            self.first_init = True
            extractor = LinkExtractor(allow=('/store/apps/category/.*',))
            raw_links = extractor.extract_links(response)
            links = self.process_links(raw_links)
            return [
                scrapy.Request('{}'.format(link.url))
                for link in links
            ]
        else:
            # print("============ START_URL ELSE PART ============")
            pass

    def process_links(self, links):
        new_links = []
        for link in links:
            old_url = link.url
            if not old_url.startswith('https://play.google.com/store/apps/'):
                continue
            old_url_obj = urlparse(old_url)
            old_url_query = dict(parse_qsl(old_url_obj.query))
            if old_url_obj.path == '/store/apps/details':
                if old_url_query['id'] in self.crawled_ids:
                    continue
                else:
                    self.crawled_ids.append(old_url_query['id'])
            old_url_query['hl'] = 'en'
            old_url_query['gl'] = 'vn'
            link.url = '{}://{}{}?{}'.format(old_url_obj.scheme, old_url_obj.netloc, old_url_obj.path,
                                             urlencode(old_url_query))
            new_links.append(link)
        # print("LINKKSSS ====", links)
        # print("NEW_LINKKSSS ====", new_links)
        # print("-------------- PRINTING SECTION PROCESS_LINKS --------------")
        return new_links

    def parse_1(self, response):
        selector = scrapy.Selector(response)
        urls = selector.xpath('//a[@class="LkLjZd ScJHi U8Ww7d xjAeve nMZKrb id-track-click "]/@href').extract()
        links = []
        for url in urls:
            if not url.startswith('https://play.google.com/'):
                url = "https://play.google.com" + url
            links.append(url)
        link_flag = 0
        for url in urls:
            # yield links_list.append(scrapy.Request(url, callback=self.parse_next, dont_filter=True))
            yield Request(links[link_flag], callback=self.parse_next, dont_filter=True)
            link_flag += 1

    def parse_next(self, response):
        # print("PARSE_NEXT ===========", response.request.url)
        selector = scrapy.Selector(response)
        app_urls = selector.xpath('//div[@class="details"]/a[@class="title"]/@href').extract()
        urls = []
        for url in app_urls:
            url = "https://play.google.com" + url + '&hl=en&gl=vn'
            urls.append(url)
        url_list = []
        link_flag = 0
        for url in app_urls:
            yield Request(urls[link_flag], callback=self.parse_detail, dont_filter=True)
            link_flag += 1
        # return url_list

    def parse_detail(self, response):
        print("Parsed ======= ", response.request.url)
        item = dict()
        item['name'] = response.xpath('//div[@itemscope]//meta[@itemprop="name"]/@content').extract_first()
        item['category'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="applicationCategory"]/@content').extract_first()
        item['review_score'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="ratingValue"]/@content').extract_first()
        item['review_count'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="reviewCount"]/@content').extract_first()
        item['link'] = response.request.url
        item['id'] = dict(parse_qsl(urlparse(response.request.url).query))['id']
        item['content_rating'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="contentRating"]/@content').extract_first()
        item['image'] = response.xpath('//div[@itemscope]//meta[@itemprop="image"]/@content').extract_first()
        item['price'] = response.xpath('//div[@itemscope]//meta[@itemprop="price"]/@content').extract_first()
        item['price_currency'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="priceCurrency"]/@content').extract_first()
        # item['operating_system'] = response.xpath('//div[@itemscope]//meta[@itemprop="operatingSystem"]/@content').extract_first()
        return item
When I run it in the terminal, it says that it crawled 100 pages but scraped only 15 items (numbers are estimates).
Please help.
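No answer is attached to this question here, but a hedged diagnostic for "crawled N, scraped fewer": Scrapy's duplicate filter silently drops requests whose fingerprints repeat, and dont_filter is not set on every request in this spider. Turning on dupe-filter debugging shows each dropped request in the log:

    # settings.py
    DUPEFILTER_DEBUG = True  # log all filtered duplicate requests, not just the first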

Scrapy: Crawled but not scraped any data

I wrote the following code to scrape Booking.com given the name of a city. Ideally, the program should find all the hotels available in the city and scrape all the reviews for each hotel. Unfortunately, it scrapes only a few hotels and only the first 75 reviews of each. Will you please tell me what I am doing wrong here?
import scrapy
from scrapy import Spider
from scrapy.loader import ItemLoader
from booking_spider.items import BookingSpiderItem

class PerhotelrevSpider(Spider):
    name = 'perhotelrev'
    allowed_domains = ['booking.com']
    # start_urls = ['https://booking.com/reviews/us/hotel/maison-st-charles-quality-inn-suites.html?/']
    start_urls = ['https://www.booking.com/searchresults.html?ss=New%20Orleans&']
    # handle_httpstatus_list = [301, 302]

    def parse(self, response):
        all_hotels = response.xpath('.//*[@class="sr-hotel__title \n"]')
        for ahotel in all_hotels:
            hotel_name = ahotel.xpath('.//*[@class="sr-hotel__name\n"]/text()').extract_first().replace('\n', '')
            hotel_url = ahotel.xpath('.//*[@class="hotel_name_link url"]/@href').extract_first().replace('\n', '')
            full_hotel_url = 'https://www.booking.com' + str(hotel_url)
            request = scrapy.Request(full_hotel_url, callback=self.parse_hotels)
            request.meta['adict'] = {'HotelName': hotel_name}
            yield request
        next_page = response.xpath('.//*[@class="bui-pagination__item bui-pagination__next-arrow"]/a/@href').extract_first()
        if next_page is not None:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_hotels(self, response):
        adict = response.meta['adict']
        hotel_name = adict['HotelName']
        # hotel_name = response.xpath('.//*[@class="hp__hotel-name"]/text()')[1].extract().replace('\n','')
        image_urls = response.xpath('.//*[@class="b_nha_hotel_small_images hp_thumbgallery_with_counter"]/a/@href').extract()
        all_facilities = response.xpath('.//*[@class="facilitiesChecklistSection"]/ul/li/span/text()').extract()
        all_facilities = [x.replace('\n', '') for x in all_facilities]
        important_facility = response.xpath('.//*[@class="important_facility "]/@data-name-en').extract()
        # print(hotel_name)
        all_review_url = response.xpath('.//*[@class="show_all_reviews_btn"]/@href').extract_first()
        adict = {'HotelName': hotel_name,
                 'ImageUrls': image_urls,
                 'Facilities': all_facilities,
                 'MainFacilities': important_facility
                 }
        if all_review_url is not None:
            review_url = "https://booking.com" + all_review_url
            request = scrapy.Request(review_url, callback=self.parse_review)
            request.meta['adict'] = adict
            yield request

    def parse_review(self, response):
        allreviewsinpage = response.xpath('.//*[@itemprop="review"]')
        adict = response.meta['adict']
        hotel_name = adict['HotelName']
        image_urls = adict['ImageUrls']
        all_facilities = adict['Facilities']
        important_facility = adict['MainFacilities']
        for eachreview in allreviewsinpage:
            username = eachreview.xpath('.//p[@class="reviewer_name"]/*[@itemprop="name"]/text()').extract_first()
            usercountry = eachreview.xpath('.//*[@itemprop="nationality"]/*[@itemprop="name"]/text()').extract_first()
            numreviewgiven = eachreview.xpath('.//*[@class="review_item_user_review_count"]/text()').extract_first()
            useragegroup = eachreview.xpath('.//*[@class="user_age_group"]/text()').extract_first()
            heading = eachreview.xpath('.//*[@class="review_item_header_content\n"]/*[@itemprop="name"]/text()').extract_first()
            neg_rev = eachreview.xpath('.//p[@class="review_neg "]/*[@itemprop="reviewBody"]/text()').extract_first()
            pos_rev = eachreview.xpath('.//p[@class="review_pos "]/*[@itemprop="reviewBody"]/text()').extract_first()
            tagging = eachreview.xpath('.//ul[@class="review_item_info_tags"]/*[@class="review_info_tag "]/text()').extract()
            stayedin = eachreview.xpath('.//p[@class="review_staydate "]/text()').extract_first()
            givenscore = eachreview.xpath('.//span[@class="review-score-badge"]/text()').extract_first()
            l = ItemLoader(item=BookingSpiderItem(), selector=response)
            l.add_value('HotelName', hotel_name)
            # l.add_value('ImageUrls', image_urls)
            l.add_value('Facilities', all_facilities)
            l.add_value('MainFacilities', important_facility)
            l.add_value('UserName', username)
            l.add_value('UserCountry', usercountry)
            l.add_value('NumReviewGiven', numreviewgiven)
            l.add_value('UserAgeGroup', useragegroup)
            l.add_value('Heading', heading)
            l.add_value('NegativeReview', neg_rev)
            l.add_value('PositiveReview', pos_rev)
            l.add_value('SelfTag', tagging)
            l.add_value('StayDate', stayedin)
            l.add_value('GivenScore', givenscore)
            yield l.load_item()
        next_page = response.xpath('.//*[@class="page_link review_next_page"]/a/@href').extract_first()
        if next_page is not None:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(next_page_url, callback=self.parse_review)
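One detail worth flagging (my observation, not part of the original question): the pagination request at the end of parse_review does not pass adict along, so when parse_review runs for page 2 it will fail at response.meta['adict'] and the chain stops after the first page of reviews, which matches the "first 75 reviews" symptom. A minimal sketch of the fix:

    if next_page is not None:
        next_page_url = response.urljoin(next_page)
        # carry the hotel-level dict forward so later pages can read it
        yield scrapy.Request(next_page_url, callback=self.parse_review,
                             meta={'adict': adict})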

Scrapy doesn't scrape data from a particular field in the same page which I have successfully scraped other data from

I am very new to Scrapy and I'm not sure why I am not getting the information I want. I am using Scrapy on www.kayak.com and I want to extract the check-in and check-out times for all the hotels in New York. I have successfully scraped other data from the same page the check-in and check-out times are on, but couldn't scrape these two fields.
The code I have is shown below:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from hotel_crawl.items import HotelCrawlItem
from bs4 import BeautifulSoup
import time
import urlparse

class MySpider(CrawlSpider):
    name = "kayaksite"
    allowed_domains = ["www.kayak.com"]
    start_urls = ["http://www.kayak.com/New-York-Hotels.15830.hotel.ksp"]
    rules = (
        Rule(LinkExtractor(
            restrict_xpaths=("//a[@class='actionlink pagenumber'][contains(text(),'Next')]",)),
            callback="parse_item", follow=True),
    )

    def parse_start_url(self, response):
        print "test"
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = HotelCrawlItem()
        name = response.xpath("//a[@class='hotelname hotelresultsname']//text()").extract()
        price = [BeautifulSoup(i).get_text() for i in response.xpath("//div[@class='pricerange']").extract()]
        review = response.xpath("//a[@class='reviewsoverview']/strong/text()").extract()
        url = response.xpath("//a[@class='hotelname hotelresultsname']//@href").extract()
        alldata = zip(name, price, review, url)
        for i in alldata:
            item['name'] = i[0]
            item['price'] = i[1]
            item['review'] = i[2]
            request = scrapy.Request(urlparse.urljoin(response.url, i[3]), callback=self.parse_item2)
            request.meta['item'] = item
            yield request

    def parse_item(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = HotelCrawlItem()
        name = response.xpath("//a[@class='hotelname hotelresultsname']//text()").extract()
        price = [BeautifulSoup(i).get_text() for i in response.xpath("//div[@class='pricerange']").extract()]
        review = response.xpath("//a[@class='reviewsoverview']/strong/text()").extract()
        url = response.xpath("//a[@class='hotelname hotelresultsname']//@href").extract()
        alldata = zip(name, price, review, url)
        for i in alldata:
            item['name'] = i[0]
            item['price'] = i[1]
            item['review'] = i[2]
            request = scrapy.Request(urlparse.urljoin(response.url, i[3]), callback=self.parse_item2)
            request.meta['item'] = item
            yield request

    def parse_item2(self, response):
        print "test--------------"
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = response.meta['item']
        item['location'] = response.xpath("//*[@id='detailsOverviewContactInfo']/div/span/span[1]/text()").extract()
        item['postcode'] = response.xpath("//*[@id='detailsOverviewContactInfo']/div/span/span[3]/text()").extract()
        item['check_in'] = response.xpath("//*[@id='goodToKnow']/div/div[2]/div[2]/text()").extract()
        item['check_out'] = response.xpath("//*[@id='goodToKnow']/div/div[2]/div[2]/text()").extract()
        yield item
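No answer is included here either, but note (my observation): check_in and check_out use the identical XPath, so even when the selector matches, both fields will always hold the same text. The two selectors presumably need to target different nodes; something along these lines, where the second index is hypothetical and must be read off the actual page:

    item['check_in'] = response.xpath("//*[@id='goodToKnow']/div/div[2]/div[2]/text()").extract()
    item['check_out'] = response.xpath("//*[@id='goodToKnow']/div/div[3]/div[2]/text()").extract()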

How to call the parse_page2 method for every item

I am trying to call the parse_page2 method for every item, but every time I run this spider I am only getting a single item per page. How do I call the parse_page2 method for every item?
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider
#------------------------------------------------------------------------------

class ESpider(CrawlSpider):
    name = "atisundarSpider"
    allowed_domains = ["atisundar.com"]
    URLSList = []
    for n in range(1, 20):
        URLSList.append('http://atisundar.com/collections/sarees?page=' + str(n))
        URLSList.append('http://atisundar.com/collections/salwar-suits?page=' + str(n))
    start_urls = URLSList

    def parse(self, response):
        item = EscraperItem()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="block product size-medium"]')
        items = []
        for site in sites:
            item = EscraperItem()
            item['productDesc'] = ""
            item['productSite'] = ["http://atisundar.com/"]
            item['productTitle'] = site.select('.//div[@class="main"]/a/@title').extract()
            item['productURL'] = ["http://atisundar.com" + site.select('.//div[@class="main"]/a/@href').extract()[0].encode('utf-8')]
            item['productPrice'] = site.select('.//p[@class="pricearea"]//span[@class="was-price"]/text()').extract() + site.select('.//p[@class="pricearea"]//span[@class="price"]/text()').extract()
            item['productImage'] = [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0]] + [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0].replace("medium", "grande")]
            item['image_urls'] = item['productImage']
            items.append(item)
            secondURL = "http://admin.atisundar.com/store/skuDetails?product_id=" + site.select('.//div[@class="main"]/a/text()').extract()[1].strip().split("#")[-1]
            request = Request(secondURL,
                              callback=self.parse_page2)
            request.meta['item'] = item
            return request

    def parse_page2(self, response):
        item = response.meta['item']
        # item['other_url'] = response.url
        return item
1) You are not using CrawlSpider functionality, so I would recommend you inherit your spider from BaseSpider.
2) In the for loop
    for site in sites:
use yield rather than return, otherwise it will break out of the loop in the first iteration:
    yield request
3) In parse_page2, get the item from response.request.meta instead of response.meta:
    item = response.request.meta['item']
(response.meta is an alias for response.request.meta, so either works; the decisive fix is point 2.)
It should work now.
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
#------------------------------------------------------------------------------
from scrapy.spider import BaseSpider

class ESpider(BaseSpider):
    name = "atisundarSpider"
    allowed_domains = ["atisundar.com"]
    URLSList = []
    for n in range(1, 20):
        URLSList.append('http://atisundar.com/collections/sarees?page=' + str(n))
        URLSList.append('http://atisundar.com/collections/salwar-suits?page=' + str(n))
    start_urls = URLSList

    def parse(self, response):
        item = EscraperItem()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="block product size-medium"]')
        for site in sites:
            item = EscraperItem()
            item['productDesc'] = ""
            item['productSite'] = ["http://atisundar.com/"]
            item['productTitle'] = site.select('.//div[@class="main"]/a/@title').extract()
            item['productURL'] = ["http://atisundar.com" + site.select('.//div[@class="main"]/a/@href').extract()[0].encode('utf-8')]
            item['productPrice'] = site.select('.//p[@class="pricearea"]//span[@class="was-price"]/text()').extract() + site.select('.//p[@class="pricearea"]//span[@class="price"]/text()').extract()
            item['productImage'] = [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0]] + [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0].replace("medium", "grande")]
            item['image_urls'] = item['productImage']
            secondURL = "http://admin.atisundar.com/store/skuDetails?product_id=" + site.select('.//div[@class="main"]/a/text()').extract()[1].strip().split("#")[-1]
            request = Request(secondURL,
                              callback=self.parse_page2)
            request.meta['item'] = item
            yield request

    def parse_page2(self, response):
        item = response.request.meta['item']
        # item['other_url'] = response.url
        return item
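A side note that postdates this answer: on Scrapy 1.7+, cb_kwargs is the recommended way to hand data to the next callback and avoids meta entirely. A sketch of the same hand-off:

    # in parse():
    request = Request(secondURL, callback=self.parse_page2,
                      cb_kwargs={'item': item})  # delivered to parse_page2 as a keyword argument
    yield request

    # the callback then receives it directly:
    def parse_page2(self, response, item):
        return item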
