I am trying to extract certain strings from the URLs mentioned below.
Sample URLs:
http://www.ladyblush.com/buy-sarees-online.html?p=1
http://www.ladyblush.com/buy-ladies-suits-online.html?p=1
http://www.ladyblush.com/buy-women-fashion-accessories.html?p=1
I want to extract:
productCategory = "sarees" productSubCategory = ""
productCategory = "ladies" productSubCategory = "suits"
productCategory = "women" productSubCategory = "fashion-accessories"
And so on. I am writing a spider and need to extract productCategory and productSubCategory from URLs like the ones above, so I am trying to extract these fields inside the parse method from response.url. Can someone help me out please?
My code:
import re

from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider

#------------------------------------------------------------------------------

class ESpider(CrawlSpider):
    name = "ladyblushSpider"
    allowed_domains = ["ladyblush.com"]

    URLSList = []
    for n in range(1, 100):
        URLSList.append('http://www.ladyblush.com/buy-sarees-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-ladies-suits-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-women-fashion-accessories.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-nightwear-lingerie-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-women-dress-online-skirts-suits-kurtis-tops.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-decor-online-wallclock-bedsheets-cushions-bedcovers.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-cosmetics-online-massage-oils-aromatherapy-perfumes-soaps.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-jewelery-online-art-fashion-semi-precious-antique-junk-jewellery.html?p=' + str(n))
    start_urls = URLSList

    def parse(self, response):
        item = EscraperItem()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="third thumbnailSpillLarge"]')
        items = []
        for site in sites:
            item = EscraperItem()
            item['currency'] = 'INR'
            item['productCategory'] = [""]
            item['productSubCategory'] = [""]
            item['productSite'] = ["http://ladyblush.com/"]
            item['productImage'] = site.select('./a/div/img/@src').extract()
            item['productTitle'] = site.select('./a/div/img/@title').extract()
            item['productURL'] = [site.select('./a/@href').extract()[0].replace(" ", "%20")]
            productMRP = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="old-price"]//span[@class="price"]/text()').extract()
            productPrice = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="special-price"]//span[@class="price"]/text()').extract()
            if productMRP and productPrice:
                price = [productMRP[1].strip()] + [productPrice[1].strip()]
            else:
                price = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//span[@class="regular-price"]//span[@class="price"]/text()').extract()
            item['productPrice'] = price
            items.append(item)
            secondURL = item['productURL'][0]
            request = Request(secondURL, callback=self.parsePage2)
            request.meta['item'] = item
            yield request

    def parsePage2(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        if hxs.select('//div[@class="addtocart-container"]/div/text()').extract():
            item['availability'] = False
        else:
            item['availability'] = True
        if hxs.select('//label[@class="required"]/text()').extract():
            item['hasVariants'] = True
        else:
            item['hasVariants'] = False
        item['image_urls'] = list(set(item['productImage']))
        item['productDesc'] = [" ".join([re.sub(r'[\t\n\r]', "", i.strip()) for i in hxs.select('//div[@class="std"]/text()').extract()])]
        item['productImage'] = item['productImage'] + hxs.select('//div[@class="more-views"]/ul/li/a/img/@src').extract() + hxs.select('//div[@class="more-views"]/ul/li/a/@href').extract()
        return item
#------------------------------------------------------------------------------
You can get the URL from response.url in the parse method. You could then parse that to get just the URL path:
import os
test = 'buy-women-fashion-accessories.html?p=1'
parts = os.path.splitext(test)
# ('buy-women-fashion-accessories', '.html?p=1')
parts[0].split('-')[1:]
# ['women', 'fashion', 'accessories']
This is a rather flimsy solution though. Are you sure the data is not stored somewhere in the page's HTML that you are parsing, rather than in the URL?
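If you do want to derive the two fields from the URL inside parse, here is a minimal sketch. It assumes the pattern of the three sample URLs (a leading "buy" segment and an optional trailing "online" segment around the category words), which is just a guess from those examples, and split_category is a hypothetical helper name:

    import os
    try:
        from urllib.parse import urlparse   # Python 3
    except ImportError:
        from urlparse import urlparse       # Python 2, which older Scrapy versions target

    def split_category(url):
        # 'http://www.ladyblush.com/buy-ladies-suits-online.html?p=1' -> 'buy-ladies-suits-online'
        path = urlparse(url).path
        slug = os.path.splitext(os.path.basename(path))[0]
        words = slug.split('-')
        # drop the leading 'buy' and a trailing 'online' if present -- an assumption
        # based only on the sample URLs, not a general rule
        if words and words[0] == 'buy':
            words = words[1:]
        if words and words[-1] == 'online':
            words = words[:-1]
        category = words[0] if words else ''
        sub_category = '-'.join(words[1:])
        return category, sub_category

    # inside parse():
    #     productCategory, productSubCategory = split_category(response.url)
    #     item['productCategory'] = [productCategory]
    #     item['productSubCategory'] = [productSubCategory]

With the sample URLs this yields ('sarees', ''), ('ladies', 'suits') and ('women', 'fashion-accessories'), matching the output in the question.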
Related
When I run my Scrapy spider to scrape comments on the Steam platform, it is missing a lot of comments and scraping the same comments several times. What is wrong with my code?
import scrapy
from scrapy import Request, FormRequest
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
import json
from scrapy import Selector
import re


class Workshop_Item(Item):
    app_id = Field()
    workshop_id = Field()
    game = Field()
    workshop_name = Field()
    user = Field()
    comment = Field()
    user_level = Field()
    date_posted = Field()
    user_location = Field()
    number_of_badges = Field()
    user_join_date = Field()
    is_author = Field()
    user_experience = Field()


class Workshop_Comment_Spider(scrapy.Spider):
    name = "comments"

    with open("output/workshop_comment_links.txt") as f:
        urls = [line.rstrip("\n") for line in f]
    start_urls = urls

    def parse(self, response):
        if int(max(response.css('span.tabCount::text').getall())) > 0 and "profiles" in response.css('a.commentthread_author_link::attr(href)').get():
            contributor_id = re.search(r'Public_(.*?)_', response.css('div.commentthread_footer a::attr(id)').get()).group(1)
        elif int(max(response.css('span.tabCount::text').getall())) > 0:
            contributor_id = re.search(r'Public_(.*?)_', response.css('div.commentthread_footer a::attr(id)').get()).group(1)
        workshop_id_number = response.css('form.smallForm > input::attr(value)').get()
        if int(max(response.css('span.tabCount::text').getall())) > 50:
            comment_number = max(response.css('span.tabCount::text').getall())
            url = f'https://steamcommunity.com/comment/PublishedFile_Public/render/{contributor_id}/{workshop_id_number}/'
            data = {
                "start": "1",
                "totalcount": comment_number,
                "count": comment_number,
                "sessionid": "d880ab2338b70926db0a9591",
                "extended_data": "{\"contributors\":[\"" + contributor_id + "\",{}],\"appid\":289070,\"sharedfile\":{\"m_parentsDetails\":null,\"m_parentBundlesDetails\":null,\"m_bundledChildren\":[],\"m_ownedBundledItems\":[]},\"parent_item_reported\":false}",
                "feature2": "-1"
            }
            app_id = response.css('div.apphub_HeaderTop a::attr(data-appid)').get()
            game = response.css(".apphub_AppName::text").get()
            workshop_id = response.css('form.smallForm input::attr(value)').get()
            workshop_name = response.css(".workshopItemTitle::text").get()
            yield FormRequest(url, formdata=data, callback=self.parse_paginated_comments, meta={'app_id': app_id, 'game': game, 'workshop_id': workshop_id, 'workshop_name': workshop_name})
        else:
            for comment in response.css(".commentthread_comment"):
                item = Workshop_Item()
                item['is_author'] = False
                if "authorbadge" in comment.get():
                    item['is_author'] = True
                item['app_id'] = response.css('div.apphub_HeaderTop a::attr(data-appid)').get()
                item['workshop_id'] = response.css('form.smallForm input::attr(value)').get()
                item['game'] = response.css(".apphub_AppName::text").get()
                item['workshop_name'] = response.css(".workshopItemTitle::text").get()
                item['user'] = comment.css("bdi::text").get()
                item['comment'] = ",".join(comment.css(".commentthread_comment_text::text").getall()).replace('\n', ' ').replace('\t', '').replace('\r', ' ')
                item['date_posted'] = comment.css(".commentthread_comment_timestamp::attr(title)").get()
                item['user_level'] = -1
                user_profile = comment.css(".commentthread_author_link::attr(href)").get()
                request = Request(user_profile, callback=self.parse_user_info, meta={'item': item})
                yield request

    def parse_user_info(self, response):
        item = response.meta['item']
        if response.css('.profile_private_info'):
            item['user_level'] = 'private'
            item['user_location'] = 'private'
            item['number_of_badges'] = 'private'
            item['user_join_date'] = 'private'
            item['user_experience'] = 'private'
            return item
        else:
            item['user_level'] = response.css(".friendPlayerLevelNum::text").get()
            if response.css('.header_real_name') and response.css("img.profile_flag"):
                item['user_location'] = response.css('.header_real_name::text').getall()[2].strip()
            else:
                item['user_location'] = 'NA'
            if response.css("div.profile_badges span.profile_count_link_total::text"):
                item['number_of_badges'] = response.css("div.profile_badges span.profile_count_link_total::text").get().strip()
            else:
                item['number_of_badges'] = 'NA'
            user_badge_page = response.css("div.profile_header_badgeinfo_badge_area > a::attr(href)").get() + "/1"
            request = Request(user_badge_page, callback=self.parse_badge_info, meta={'item': item})
            yield request

    def parse_badge_info(self, response):
        item = response.meta['item']
        if response.css("div.badge_description"):
            item['user_join_date'] = response.css("div.badge_description::text").get().strip()
        experience_page = response.css('a.whiteLink.persona_name_text_content::attr(href)').get() + "/badges"
        request = Request(experience_page, callback=self.parse_experience_page, meta={'item': item})
        yield request

    def parse_experience_page(self, response):
        item = response.meta['item']
        if response.css('span.profile_xp_block_xp'):
            item['user_experience'] = response.css('span.profile_xp_block_xp::text').get()
        return item

    def parse_paginated_comments(self, response):
        app_id = response.meta['app_id']
        game = response.meta['game']
        workshop_id = response.meta['workshop_id']
        workshop_name = response.meta['workshop_name']
        jsonresponse = json.loads(response.body.decode("utf-8"))
        sel = Selector(text=jsonresponse['comments_html'])
        for comment in sel.css(".commentthread_comment"):
            item = Workshop_Item()
            item['is_author'] = False
            if "authorbadge" in comment.get():
                item['is_author'] = True
            item['app_id'] = app_id  # sel.css('div.apphub_HeaderTop a::attr(data-appid)').get()
            item['workshop_id'] = workshop_id  # sel.css('form.smallForm input::attr(value)').get()
            item['game'] = game  # sel.css(".apphub_AppName::text").get()
            item['workshop_name'] = workshop_name  # sel.css(".workshopItemTitle::text").get()
            item['user'] = comment.css("bdi::text").get()
            item['comment'] = ",".join(comment.css(".commentthread_comment_text::text").getall()).replace('\n', ' ').replace('\t', '').replace('\r', ' ')
            item['date_posted'] = comment.css(".commentthread_comment_timestamp::attr(title)").get()
            item['user_level'] = -1
            user_profile = sel.css(".commentthread_author_link::attr(href)").get()
            request = Request(user_profile, callback=self.parse_user_info, meta={'item': item})
            yield request
I am scraping the comments from a page and then going to each user's profile to collect user data. If the page has pagination (more than 50 comments), I send a POST request to retrieve the JSON that contains the HTML for all of the comments, and then scrape that.
Fixed it; the problem was here:
    def parse_paginated_comments(self, response):
        app_id = response.meta['app_id']
        game = response.meta['game']
        workshop_id = response.meta['workshop_id']
        workshop_name = response.meta['workshop_name']
        jsonresponse = json.loads(response.body.decode("utf-8"))
        sel = Selector(text=jsonresponse['comments_html'])
        for comment in sel.css(".commentthread_comment"):
            item = Workshop_Item()
            item['is_author'] = False
I needed to change
for comment in sel.css(".commentthread_comment"):
to
for comment in comment.css(".commentthread_comment"):
and I needed to add
DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'
to the settings.py file.
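For reference, a minimal sketch of that settings.py change:

    # settings.py -- replace the default RFPDupeFilter so repeated URLs are not dropped
    DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'

Alternatively, individual requests can opt out of de-duplication with Request(url, callback=..., dont_filter=True) while keeping the default filter for everything else.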
I am using a set of URLs that I scraped earlier; I use Scrapy to loop over them and send each one to another function, which iterates through all the pages for that link and scrapes Amazon reviews as long as there is a next button. But it is skipping many URLs from the start_requests function. Here is the code:
import scrapy
from scrapy import Request
import logging
import json
from scrapy.selector import Selector
import pandas as pd
from nordvpn_switcher import initialize_VPN, rotate_VPN, terminate_VPN


class AmazonrSpider(scrapy.Spider):
    name = 'amazonr'
    allowed_domains = ['*']
    start_urls = ['http://amazon.com/']

    df = pd.read_csv(r'C:\Amazon Reviews scraper(part1)\Scraper\reviewsScraper\cleanedProducts.csv')
    link_asin = list(zip(df.SeeAllReviews, df.asin))[:1000]
    counter = 0
    settings = initialize_VPN(save=1, area_input=['complete rotation'])
    rotate_VPN(settings)

    def start_requests(self):
        for pair in self.link_asin:  # these URLs are being skipped
            request = Request(pair[0], callback=self.parseReview, dont_filter=True, meta={'item': pair[1]})
            yield request

    def parseReview(self, response):
        asin = response.meta['item']
        reviewTitle = response.xpath('//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-title"]//text()').extract()
        reviewTitle = [i.strip() for i in reviewTitle if i.strip() != '']
        reviewRatings = response.xpath('//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-star-rating"]//text()').extract()
        reviewText = response.xpath('//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-body"]//text()').extract()
        reviewText = [i.strip() for i in reviewText if i.strip() != '']
        reviewDate = response.xpath('//*[@data-hook="review-date"]/text()').extract()
        all_v = list(zip(reviewTitle, reviewText, reviewRatings, reviewDate))
        for i in all_v:
            rtitle = i[0]
            rtext = i[1]
            rrating = i[2]
            rdate = i[3]
            dict_ = {"asin": asin, "reviewTitle": rtitle, "reviewText": rtext, "reviewRatings": rrating, "reviewDate": rdate}
            with open("Reviews.json", "a") as fl:
                json.dump(dict_, fl)
                fl.write('\n')
        nextpage = response.xpath('//*[@class="a-last"]//@href').extract_first()  # next page link
        self.counter = self.counter + 1
        if self.counter % 250 == 0:  # to rotate proxy after every 250 requests
            settings = initialize_VPN(save=1, area_input=['complete rotation'])
            rotate_VPN(settings)
        if str(nextpage) != 'None':  # go to next page
            nextpage = "https://www.amazon.com" + nextpage
            yield Request(nextpage, callback=self.parseReview, dont_filter=True, meta={'item': asin})
        else:
            pass
I find that short_critic_content(self, response) never runs, and I can't find the reason.
I am not using allowed_domains; if I do include it, short_critic_content(self, response) still doesn't run.
Is allowed_domains = ["movie.mtime.com"] with start_urls = ['http://movie.mtime.com'] wrong or right?
What is wrong with the code that I am getting this error:
Scrapy: Unsupported URL scheme '': no handler available for that scheme
class YinPin(CrawlSpider):
    name = "yingping"
    #allowed_domains = ["movie.mtime.com"]
    start_urls = ['http://movie.mtime.com']
    rules = (
        #Rule(LinkExtractor(allow=())),
        Rule(LinkExtractor(allow=(r'http://movie.mtime.com/40677/'), ), callback='movie_info', follow=False),
    )

    def movie_info(self, response):
        selector = Selector(response)
        #for movieinfo in movie_info:
        movie_name = selector.xpath('//*[@id="db_head"]/div[2]/div/div[1]/h1/text()').extract()
        movie_url = response.url  # movieinfo.xpath('//*[@id="db_head"]/div[2]/div/div[2]/a[3]/@href').extract()
        number = re.compile(r'\d+')
        movie_num = int(number.search(str(movie_url)).group())
        movie_release_time = selector.xpath('//*[@id="db_head"]/div[2]/div/div[1]/p[1]/a/text()').extract()
        movie_place = selector.xpath('//*[@id="db_head"]/div[2]/div/div[2]/text()').extract()[3]
        movie_type = selector.xpath('//*[@id="db_head"]/div[2]/div/div[2]/a/text()').extract()
        movie_type_l = movie_type.pop()
        movie_type = ' '.join(movie_type)
        short_content = selector.css('#tweetRegion > dd > div > h3::text').extract()  # selector.xpath('//*[@id="tweetRegion"]').css('h3::text').extract()
        short_url = str(selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract())
        yield Request(short_url, callback=self.short_critic_content,
                      meta={'movie_num': movie_num,
                            'short_content': short_content})
        item = YingpingItem(
            movie_num=movie_num,
            movie_name=movie_name,
            movie_release_time=movie_release_time,
            movie_place=movie_place,
            movie_type=movie_type,
        )
        yield item

    def short_critic_content(self, response):
        selector = Selector(response)
        movie_num = response.meta['movie_num']
        short_contentft = response.meta['short_content']
        short_contentsd = selector.css('#tweetRegion > dd > div > h3::text').extract()
        short_contents = short_contentft + short_contentsd
        item = shortcriticItem(
            movie_num=movie_num,
            movie_scritic=short_contents
        )
        yield item
It's almost certain the problem is in this line of your movie_info function:
short_url = str(selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract())
The extract() method of Selector returns a list, which you then convert to a string. That won't give you the URL; it gives you a string representation of the list, which starts with the "[" character. That's why you get that error.
The correct way is either
short_url = selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract()[0]
or, even better, use extract_first() instead of extract():
short_url = selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract_first()
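Note that extract_first() returns None when nothing matches (it also accepts a default argument), so it is worth guarding the request before yielding it. A small sketch, using response.urljoin in case the extracted href is relative:

    short_url = selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract_first()
    if short_url:
        yield Request(response.urljoin(short_url), callback=self.short_critic_content,
                      meta={'movie_num': movie_num, 'short_content': short_content})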
I am using Scrapy to scrape a website. I am getting all the products from the listing page. Now I want to go to each product's URL, but I am not getting a satisfactory result.
Here is my code:
import scrapy
from scrapy.http import Request
from tutorial.items import DmozItem


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domain = ["test.com"]
    start_urls = [
        "http://www.test.com/?page=1"
    ]
    page_index = 1

    def parse(self, response):
        products = response.xpath('//li')
        items = []
        if products:
            for product in products:
                item = DmozItem()
                item['link'] = product.xpath('@data-url').extract()
                item['sku'] = product.xpath('@data-sku').extract()
                item['brand'] = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
                item['img'] = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
                page_url = "http://www.jabong.com/Lara-Karen-Black-Sweaters-893039.html"
                request = Request(url=page_url, callback=self.parse_page2,
                                  headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"})
                request.meta['item'] = item
                item['other'] = request
                yield item
        else:
            return
        self.page_index += 1
        if self.page_index:
            yield Request(url="http://www.test.com/?page=%s" % (self.page_index),
                          headers={"Referer": "http://www.test.com/", "X-Requested-With": "XMLHttpRequest"},
                          callback=self.parse)

    def parse_page2(self, response):
        item = response.meta['item']
        item['title'] = response.xpath("//span[@id='before_price']/text()")
        yield item
The result I am getting is
{"sku": [], "brand": [], "other": "<Request GET http://www.test.com/>", "link": [], "img": []},
Instead of the Request object, I need the data that I am returning from the parse_page2 function.
Where am I going wrong?
Your XPaths seem to be wrong here. Try this:
In [0]: products[0].xpath('./@data-url').extract()
Out[0]: [u'Sangria-Green-Kurtis-Kurtas-1081831.html']
In [1]: products[0].xpath('./a/@unbxdparam_sku').extract()
Out[1]: [u'SA038WA68OIXINDFAS']
In [2]: products[0].xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
Out[2]: [u'Sangria']
In [3]: products[0].xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
Out[3]: [u'http://static14.jassets.com/p/Sangria-Green--Kurtis-26-Kurtas-5520-1381801-1-catalog.jpg']
so the code will be:
BASE_URL = 'http://www.jabong.com/'

for product in products:
    item = DmozItem()
    item_url = product.xpath('./@data-url').extract()
    item_url = self.BASE_URL + item_url[0] if item_url else ''
    item['link'] = product.xpath('./@data-url').extract()
    item['sku'] = product.xpath('./a/@unbxdparam_sku').extract()
    item['brand'] = product.xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
    item['img'] = product.xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
    if item_url:
        yield Request(url=item_url, callback=self.parse_page2,
                      headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"},
                      meta={'item': item})
EDIT: complete spider code
import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.spider import Spider
from scrapy.http import Request


class JabongItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    link = scrapy.Field()
    sku = scrapy.Field()
    brand = scrapy.Field()
    img = scrapy.Field()


class JabongSpider(scrapy.Spider):
    name = "jabong"
    allowed_domains = ["jabong.com"]
    start_urls = ["http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=1"]
    page_index = 1
    BASE_URL = 'http://www.jabong.com/'

    def parse(self, response):
        products = response.xpath("//li[@data-url]")
        if products:
            for product in products:
                link = product.xpath('@data-url').extract()
                link = self.BASE_URL + link[0] if link else ''
                sku = product.xpath('@data-sku').extract()
                sku = sku[0].strip() if sku else 'n/a'
                brand = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
                brand = brand[0].strip() if brand else 'n/a'
                img = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
                img = img[0].strip() if img else 'n/a'
                item = JabongItem()
                item['link'] = link
                item['sku'] = sku
                item['brand'] = brand
                item['img'] = img
                if link:
                    yield Request(url=link, callback=self.parse_page2, meta={'item': item})
        else:
            return
        self.page_index += 1
        yield Request(url="http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=%s" % self.page_index,
                      callback=self.parse, dont_filter=True)
    def parse_page2(self, response):
        item = response.meta['item']
        # add whatever extra details you want to item
        yield item
I am trying to call the parse_page2 method for every item, but every time I run this spider I only get a single item per page. How do I call the parse_page2 method for every item?
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider

#------------------------------------------------------------------------------

class ESpider(CrawlSpider):
    name = "atisundarSpider"
    allowed_domains = ["atisundar.com"]

    URLSList = []
    for n in range(1, 20):
        URLSList.append('http://atisundar.com/collections/sarees?page=' + str(n))
        URLSList.append('http://atisundar.com/collections/salwar-suits?page=' + str(n))
    start_urls = URLSList

    def parse(self, response):
        item = EscraperItem()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="block product size-medium"]')
        items = []
        for site in sites:
            item = EscraperItem()
            item['productDesc'] = ""
            item['productSite'] = ["http://atisundar.com/"]
            item['productTitle'] = site.select('.//div[@class="main"]/a/@title').extract()
            item['productURL'] = ["http://atisundar.com" + site.select('.//div[@class="main"]/a/@href').extract()[0].encode('utf-8')]
            item['productPrice'] = site.select('.//p[@class="pricearea"]//span[@class="was-price"]/text()').extract() + site.select('.//p[@class="pricearea"]//span[@class="price"]/text()').extract()
            item['productImage'] = [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0]] + [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0].replace("medium", "grande")]
            item['image_urls'] = item['productImage']
            items.append(item)
            secondURL = "http://admin.atisundar.com/store/skuDetails?product_id=" + site.select('.//div[@class="main"]/a/text()').extract()[1].strip().split("#")[-1]
            request = Request(secondURL,
                              callback=self.parse_page2)
            request.meta['item'] = item
            return request

    def parse_page2(self, response):
        item = response.meta['item']
        #item['other_url'] = response.url
        return item
1) You are not using any CrawlSpider functionality, so I would recommend you inherit your spider from BaseSpider.
2) In the for loop
for site in sites:
use yield rather than return, otherwise it will break out of the loop on the first iteration:
yield request
3) In parse_page2, get the item from response.request.meta instead of response.meta:
item = response.request.meta['item']
It should work now.
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
#------------------------------------------------------------------------------
from scrapy.spider import BaseSpider


class ESpider(BaseSpider):
    name = "atisundarSpider"
    allowed_domains = ["atisundar.com"]

    URLSList = []
    for n in range(1, 20):
        URLSList.append('http://atisundar.com/collections/sarees?page=' + str(n))
        URLSList.append('http://atisundar.com/collections/salwar-suits?page=' + str(n))
    start_urls = URLSList

    def parse(self, response):
        item = EscraperItem()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="block product size-medium"]')
        for site in sites:
            item = EscraperItem()
            item['productDesc'] = ""
            item['productSite'] = ["http://atisundar.com/"]
            item['productTitle'] = site.select('.//div[@class="main"]/a/@title').extract()
            item['productURL'] = ["http://atisundar.com" + site.select('.//div[@class="main"]/a/@href').extract()[0].encode('utf-8')]
            item['productPrice'] = site.select('.//p[@class="pricearea"]//span[@class="was-price"]/text()').extract() + site.select('.//p[@class="pricearea"]//span[@class="price"]/text()').extract()
            item['productImage'] = [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0]] + [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0].replace("medium", "grande")]
            item['image_urls'] = item['productImage']
            secondURL = "http://admin.atisundar.com/store/skuDetails?product_id=" + site.select('.//div[@class="main"]/a/text()').extract()[1].strip().split("#")[-1]
            request = Request(secondURL,
                              callback=self.parse_page2)
            request.meta['item'] = item
            yield request

    def parse_page2(self, response):
        item = response.request.meta['item']
        #item['other_url'] = response.url
        return item