I am trying to call the parse_page2 method for every item, but every time I run this spider I only get a single item per page. How do I call parse_page2 for every item?
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider
#------------------------------------------------------------------------------

class ESpider(CrawlSpider):
    name = "atisundarSpider"
    allowed_domains = ["atisundar.com"]

    URLSList = []
    for n in range(1, 20):
        URLSList.append('http://atisundar.com/collections/sarees?page=' + str(n))
        URLSList.append('http://atisundar.com/collections/salwar-suits?page=' + str(n))
    start_urls = URLSList
    def parse(self, response):
        item = EscraperItem()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="block product size-medium"]')
        items = []
        for site in sites:
            item = EscraperItem()
            item['productDesc'] = ""
            item['productSite'] = ["http://atisundar.com/"]
            item['productTitle'] = site.select('.//div[@class="main"]/a/@title').extract()
            item['productURL'] = ["http://atisundar.com" + site.select('.//div[@class="main"]/a/@href').extract()[0].encode('utf-8')]
            item['productPrice'] = site.select('.//p[@class="pricearea"]//span[@class="was-price"]/text()').extract() + site.select('.//p[@class="pricearea"]//span[@class="price"]/text()').extract()
            item['productImage'] = [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0]] + [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0].replace("medium", "grande")]
            item['image_urls'] = item['productImage']
            items.append(item)
            secondURL = "http://admin.atisundar.com/store/skuDetails?product_id=" + site.select('.//div[@class="main"]/a/text()').extract()[1].strip().split("#")[-1]
            request = Request(secondURL,
                              callback=self.parse_page2)
            request.meta['item'] = item
            return request

    def parse_page2(self, response):
        item = response.meta['item']
        #item['other_url'] = response.url
        return item
1) You are not using any CrawlSpider functionality, so I would recommend you inherit your spider from BaseSpider instead.

2) In the for loop

for site in sites:

use yield rather than return, otherwise it will break out of the loop on the first iteration:

yield request

3) In parse_page2, get the item from response.request.meta instead of response.meta:

item = response.request.meta['item']

It should work now. Here is the full spider with those changes applied:
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
#------------------------------------------------------------------------------
from scrapy.spider import BaseSpider

class ESpider(BaseSpider):
    name = "atisundarSpider"
    allowed_domains = ["atisundar.com"]

    URLSList = []
    for n in range(1, 20):
        URLSList.append('http://atisundar.com/collections/sarees?page=' + str(n))
        URLSList.append('http://atisundar.com/collections/salwar-suits?page=' + str(n))
    start_urls = URLSList

    def parse(self, response):
        item = EscraperItem()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="block product size-medium"]')
        for site in sites:
            item = EscraperItem()
            item['productDesc'] = ""
            item['productSite'] = ["http://atisundar.com/"]
            item['productTitle'] = site.select('.//div[@class="main"]/a/@title').extract()
            item['productURL'] = ["http://atisundar.com" + site.select('.//div[@class="main"]/a/@href').extract()[0].encode('utf-8')]
            item['productPrice'] = site.select('.//p[@class="pricearea"]//span[@class="was-price"]/text()').extract() + site.select('.//p[@class="pricearea"]//span[@class="price"]/text()').extract()
            item['productImage'] = [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0]] + [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0].replace("medium", "grande")]
            item['image_urls'] = item['productImage']
            secondURL = "http://admin.atisundar.com/store/skuDetails?product_id=" + site.select('.//div[@class="main"]/a/text()').extract()[1].strip().split("#")[-1]
            request = Request(secondURL,
                              callback=self.parse_page2)
            request.meta['item'] = item
            yield request

    def parse_page2(self, response):
        item = response.request.meta['item']
        #item['other_url'] = response.url
        return item
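As an aside (my addition, not part of the original answer): Request also accepts meta directly in its constructor, which saves the extra assignment. A minimal sketch of the equivalent, assuming the same secondURL and item as above:

request = Request(secondURL,
                  callback=self.parse_page2,
                  meta={'item': item})  # same effect as setting request.meta['item'] afterwards
yield request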
Related
I am actually very new to Scrapy and I'm not sure why I am not getting the information I want. I am using Scrapy on the website www.kayak.com and I want to extract the check-in and check-out times for all the hotels in New York. I have successfully scraped other data from the same page that the check-in and check-out times are on, but I couldn't scrape those two fields.
The code I have is shown below:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from hotel_crawl.items import HotelCrawlItem
from bs4 import BeautifulSoup
import time
import urlparse

class MySpider(CrawlSpider):
    name = "kayaksite"
    allowed_domains = ["www.kayak.com"]
    start_urls = ["http://www.kayak.com/New-York-Hotels.15830.hotel.ksp"]
    rules = (
        Rule(LinkExtractor(
            restrict_xpaths=("//a[@class='actionlink pagenumber'][contains(text(),'Next')]",)),
            callback="parse_item", follow=True),
    )

    def parse_start_url(self, response):
        print "test"
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = HotelCrawlItem()
        name = response.xpath("//a[@class='hotelname hotelresultsname']//text()").extract()
        price = [BeautifulSoup(i).get_text() for i in response.xpath("//div[@class='pricerange']").extract()]
        review = response.xpath("//a[@class='reviewsoverview']/strong/text()").extract()
        url = response.xpath("//a[@class='hotelname hotelresultsname']//@href").extract()
        alldata = zip(name, price, review, url)
        for i in alldata:
            item['name'] = i[0]
            item['price'] = i[1]
            item['review'] = i[2]
            request = scrapy.Request(urlparse.urljoin(response.url, i[3]), callback=self.parse_item2)
            request.meta['item'] = item
            yield request

    def parse_item(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = HotelCrawlItem()
        name = response.xpath("//a[@class='hotelname hotelresultsname']//text()").extract()
        price = [BeautifulSoup(i).get_text() for i in response.xpath("//div[@class='pricerange']").extract()]
        review = response.xpath("//a[@class='reviewsoverview']/strong/text()").extract()
        url = response.xpath("//a[@class='hotelname hotelresultsname']//@href").extract()
        alldata = zip(name, price, review, url)
        for i in alldata:
            item['name'] = i[0]
            item['price'] = i[1]
            item['review'] = i[2]
            request = scrapy.Request(urlparse.urljoin(response.url, i[3]), callback=self.parse_item2)
            request.meta['item'] = item
            yield request

    def parse_item2(self, response):
        print "test--------------"
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = response.meta['item']
        item['location'] = response.xpath("//*[@id='detailsOverviewContactInfo']/div/span/span[1]/text()").extract()
        item['postcode'] = response.xpath("//*[@id='detailsOverviewContactInfo']/div/span/span[3]/text()").extract()
        item['check_in'] = response.xpath("//*[@id='goodToKnow']/div/div[2]/div[2]/text()").extract()
        item['check_out'] = response.xpath("//*[@id='goodToKnow']/div/div[2]/div[2]/text()").extract()
        yield item
I am using Scrapy to scrape a website. I am getting all the products from the listing page. Now I want to go to each product's URL, but I am not getting a satisfactory result.
Here is my code:
import scrapy
from scrapy.http import Request
from tutorial.items import DmozItem

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domain = ["test.com"]
    start_urls = [
        "http://www.test.com/?page=1"
    ]
    page_index = 1

    def parse(self, response):
        products = response.xpath('//li')
        items = []
        if products:
            for product in products:
                item = DmozItem()
                item['link'] = product.xpath('@data-url').extract()
                item['sku'] = product.xpath('@data-sku').extract()
                item['brand'] = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
                item['img'] = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
                page_url = "http://www.jabong.com/Lara-Karen-Black-Sweaters-893039.html"
                request = Request(url=page_url, callback=self.parse_page2,
                                  headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"})
                request.meta['item'] = item
                item['other'] = request
                yield item
        else:
            return
        self.page_index += 1
        if self.page_index:
            yield Request(url="http://www.test.com/?page=%s" % (self.page_index),
                          headers={"Referer": "http://www.test.com/", "X-Requested-With": "XMLHttpRequest"},
                          callback=self.parse)

    def parse_page2(self, response):
        item = response.meta['item']
        item['title'] = response.xpath("//span[@id='before_price']/text()")
        yield item
The result I am getting is

{"sku": [], "brand": [], "other": "<Request GET http://www.test.com/>", "link": [], "img": []},

Instead of the Request object, I need the data that I am returning from the parse_page2 function.
Where am I going wrong?
Your XPaths seem to be wrong here; try this:
In [0]: products[0].xpath('./@data-url').extract()
Out[0]: [u'Sangria-Green-Kurtis-Kurtas-1081831.html']

In [1]: products[0].xpath('./a/@unbxdparam_sku').extract()
Out[1]: [u'SA038WA68OIXINDFAS']

In [2]: products[0].xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
Out[2]: [u'Sangria']

In [3]: products[0].xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
Out[3]: [u'http://static14.jassets.com/p/Sangria-Green--Kurtis-26-Kurtas-5520-1381801-1-catalog.jpg']
so the code will be:

BASE_URL = 'http://www.jabong.com/'

for product in products:
    item = DmozItem()
    item_url = product.xpath('./@data-url').extract()
    item_url = self.BASE_URL + item_url[0] if item_url else ''
    item['link'] = product.xpath('./@data-url').extract()
    item['sku'] = product.xpath('./a/@unbxdparam_sku').extract()
    item['brand'] = product.xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
    item['img'] = product.xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
    if item_url:
        yield Request(url=item_url, callback=self.parse_page2,
                      headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"},
                      meta={'item': item})
EDIT:
Complete spider code:
import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.spider import Spider
from scrapy.http import Request

class JabongItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    link = scrapy.Field()
    sku = scrapy.Field()
    brand = scrapy.Field()
    img = scrapy.Field()

class JabongSpider(scrapy.Spider):
    name = "jabong"
    allowed_domains = ["jabong.com"]
    start_urls = ["http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=1"]
    page_index = 1
    BASE_URL = 'http://www.jabong.com/'

    def parse(self, response):
        products = response.xpath("//li[@data-url]")
        if products:
            for product in products:
                link = product.xpath('@data-url').extract()
                link = self.BASE_URL + link[0] if link else ''
                sku = product.xpath('@data-sku').extract()
                sku = sku[0].strip() if sku else 'n/a'
                brand = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
                brand = brand[0].strip() if brand else 'n/a'
                img = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
                img = img[0].strip() if img else 'n/a'
                item = JabongItem()
                item['link'] = link
                item['sku'] = sku
                item['brand'] = brand
                item['img'] = img
                if link:
                    yield Request(url=link, callback=self.parse_page2, meta={'item': item})
        else:
            return
        self.page_index += 1
        yield Request(url="http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=%s" % self.page_index,
                      callback=self.parse, dont_filter=True)

    def parse_page2(self, response):
        item = response.meta['item']
        # add whatever extra details you want to item
        yield item
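To try it out (assuming JabongItem stays defined alongside the spider as above), you would run it the usual way and dump the output, e.g.:

scrapy crawl jabong -o items.json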
I have a Scrapy script for Yelp that is, for the most part, working. Essentially I can supply it with a list of Yelp pages and it should return all reviews from all pages. The script so far is below:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re

from yelp2.items import YelpReviewItem

RESTAURANTS = ['sixteen-chicago']

def createRestaurantPageLinks(self, response):
    reviewsPerPage = 40
    sel = Selector(response)
    totalReviews = int(sel.xpath('//div[@class="rating-info clearfix"]//span[@itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
    pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse) for n in range(totalReviews/reviewsPerPage)]
    return pages

class Yelp2aSpider(Spider):
    name = "yelp2a"
    allowed_domains = ["yelp.com"]
    start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

    def parse(self, response):
        requests = []
        sel = Selector(response)
        reviews = sel.xpath('//div[@class="review review-with-no-actions"]')
        items = []
        for review in reviews:
            item = YelpReviewItem()
            item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
            item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
            item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
            item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
            item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
            item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
            item['url'] = response.url
            items.append(item)
            return items

        if response.url.find('?start=') == -1:
            requests += createRestaurantPageLinks(self, response)
            return requests
However, the problem I'm running into is that this particular script scrapes every page of every requested review EXCEPT for the first page. If I comment out the last "if" statement, it only scrapes the FIRST page. I suspect all I need is a simple "else" command but I am stumped... help is greatly appreciated!
EDIT: This is the code as it currently stands based on assistance received...
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re

from yelp2.items import YelpReviewItem

RESTAURANTS = ['sixteen-chicago']

def createRestaurantPageLinks(self, response):
    reviewsPerPage = 40
    sel = Selector(response)
    totalReviews = int(sel.xpath('//div[@class="rating-info clearfix"]//span[@itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
    pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse) for n in range(totalReviews/reviewsPerPage)]
    return pages

class Yelp2aSpider(Spider):
    name = "yelp2a"
    allowed_domains = ["yelp.com"]
    start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

    def parse(self, response):
        requests = []
        sel = Selector(response)
        reviews = sel.xpath('//div[@class="review review-with-no-actions"]')
        items = []
        for review in reviews:
            item = YelpReviewItem()
            item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
            item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
            item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
            item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
            item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
            item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
            item['url'] = response.url
        yield item

        if response.url.find('?start=') == -1:
            requests += createRestaurantPageLinks(self, response)
            for request in requests:
                yield request
As mentioned in a comment below, running this code as-is crawls every desired page, but it only returns one review per page rather than all of them.
I tried changing yield item to yield items, but an error of ERROR: Spider must return Request, BaseItem or None, got 'list' in <GET http://www.yelp.com/biz/[...]> is returned for every URL crawled.
You need to reorganize the methods a bit. First, parse the restaurant page in the parse() method. Then, return requests for the reviews and handle the responses in another method, e.g. parse_review():
import re

from scrapy.item import Item, Field
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request

from yelp2.items import YelpReviewItem

RESTAURANTS = ['sixteen-chicago']

class Yelp2aSpider(Spider):
    name = "yelp2a"
    allowed_domains = ["yelp.com"]
    start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

    def parse(self, response):
        reviewsPerPage = 40
        sel = Selector(response)
        totalReviews = int(sel.xpath('//div[@class="rating-info clearfix"]//span[@itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
        pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse_review) for n in range(totalReviews/reviewsPerPage)]
        return pages

    def parse_review(self, response):
        sel = Selector(response)
        reviews = sel.xpath('//div[@class="review review-with-no-actions"]')
        for review in reviews:
            item = YelpReviewItem()
            item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
            item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
            item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
            item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
            item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
            item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
            item['url'] = response.url
            yield item
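One caveat worth flagging (my note, not the original answerer's): range(totalReviews/reviewsPerPage) relies on Python 2's integer division, which these old-style Scrapy imports imply anyway; under Python 3 it would need to be range(totalReviews // reviewsPerPage), since range() rejects floats.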
If you're returning items/requests in more than one place, you should replace your return statements with yield statements. This turns your function into a generator, which produces a new element each time it is generated (yields it), without exiting the function until they are all returned. Otherwise, as your code stands now, the function will exit after the first return and won't get around to sending the requests for the following pages.
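To illustrate the difference with a generic Python sketch (not taken from the spider above):

def numbers_return():
    for n in range(3):
        return n   # exits the function on the first iteration; callers only ever see 0

def numbers_yield():
    for n in range(3):
        yield n    # suspends and resumes; produces 0, 1, 2

print(numbers_return())        # 0
print(list(numbers_yield()))   # [0, 1, 2]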
Edit: Correction - you should yield one item/request at a time, so:
Replace

for review in reviews:
    item = ...
    return items

with

for review in reviews:
    item = ...
    yield item

and replace

return requests

with

for request in requests:
    yield request
The final answer did indeed lie in the indentation of one single yield line. This is the code that ended up doing what I needed it to do.
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import re

from yelp2.items import YelpReviewItem

RESTAURANTS = ['sixteen-chicago']

def createRestaurantPageLinks(self, response):
    reviewsPerPage = 40
    sel = Selector(response)
    totalReviews = int(sel.xpath('//div[@class="rating-info clearfix"]//span[@itemprop="reviewCount"]/text()').extract()[0].strip().split(' ')[0])
    pages = [Request(url=response.url + '?start=' + str(reviewsPerPage*(n+1)), callback=self.parse) for n in range(totalReviews/reviewsPerPage)]
    return pages

class YelpXSpider(Spider):
    name = "yelpx"
    allowed_domains = ["yelp.com"]
    start_urls = ['http://www.yelp.com/biz/%s' % s for s in RESTAURANTS]

    def parse(self, response):
        requests = []
        sel = Selector(response)
        reviews = sel.xpath('//div[@class="review review-with-no-actions"]')
        items = []
        for review in reviews:
            item = YelpReviewItem()
            item['venueName'] = sel.xpath('//meta[@property="og:title"]/@content').extract()
            item['reviewer'] = review.xpath('.//li[@class="user-name"]/a/text()').extract()
            item['reviewerLoc'] = review.xpath('.//li[@class="user-location"]/b/text()').extract()
            item['rating'] = review.xpath('.//meta[@itemprop="ratingValue"]/@content').extract()
            item['reviewDate'] = review.xpath('.//meta[@itemprop="datePublished"]/@content').extract()
            item['reviewText'] = review.xpath('.//p[@itemprop="description"]/text()').extract()
            item['url'] = response.url
            yield item

        if response.url.find('?start=') == -1:
            requests += createRestaurantPageLinks(self, response)
            for request in requests:
                yield request
Thanks to everyone for helping out a noob!
I am trying to extract certain strings from the URLs below.
Sample URLs:

http://www.ladyblush.com/buy-sarees-online.html?p=1
http://www.ladyblush.com/buy-ladies-suits-online.html?p=1
http://www.ladyblush.com/buy-women-fashion-accessories.html?p=1

I want to extract:

productCategory = "sarees"  productSubCategory = ""
productCategory = "ladies"  productSubCategory = "suits"
productCategory = "women"   productSubCategory = "fashion-accessories"

and so on. I am writing a spider, and I need to extract productCategory and productSubCategory from URLs like those above, so I am trying to extract these fields inside the parse method from response.url. Can someone help me out?
My code :
import re

from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider
#------------------------------------------------------------------------------

class ESpider(CrawlSpider):
    name = "ladyblushSpider"
    allowed_domains = ["ladyblush.com"]

    URLSList = []
    for n in range(1, 100):
        URLSList.append('http://www.ladyblush.com/buy-sarees-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-ladies-suits-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-women-fashion-accessories.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-nightwear-lingerie-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-women-dress-online-skirts-suits-kurtis-tops.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-decor-online-wallclock-bedsheets-cushions-bedcovers.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-cosmetics-online-massage-oils-aromatherapy-perfumes-soaps.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-jewelery-online-art-fashion-semi-precious-antique-junk-jewellery.html?p=' + str(n))
    start_urls = URLSList

    def parse(self, response):
        item = EscraperItem()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="third thumbnailSpillLarge"]')
        items = []
        for site in sites:
            item = EscraperItem()
            item['currency'] = 'INR'
            item['productCategory'] = [""]
            item['productSubCategory'] = [""]
            item['productSite'] = ["http://ladyblush.com/"]
            item['productImage'] = site.select('./a/div/img/@src').extract()
            item['productTitle'] = site.select('./a/div/img/@title').extract()
            item['productURL'] = [site.select('./a/@href').extract()[0].replace(" ", "%20")]
            productMRP = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="old-price"]//span[@class="price"]/text()').extract()
            productPrice = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="special-price"]//span[@class="price"]/text()').extract()
            if productMRP and productPrice:
                price = [productMRP[1].strip()] + [productPrice[1].strip()]
            else:
                price = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//span[@class="regular-price"]//span[@class="price"]/text()').extract()
            item['productPrice'] = price
            items.append(item)
            secondURL = item['productURL'][0]
            request = Request(secondURL, callback=self.parsePage2)
            request.meta['item'] = item
            yield request

    def parsePage2(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        if hxs.select('//div[@class="addtocart-container"]/div/text()').extract():
            item['availability'] = False
        else:
            item['availability'] = True
        if hxs.select('//label[@class="required"]/text()').extract():
            item['hasVariants'] = True
        else:
            item['hasVariants'] = False
        item['image_urls'] = list(set(item['productImage']))
        item['productDesc'] = [" ".join([re.sub(r'[\t\n\r]', "", i.strip()) for i in hxs.select('//div[@class="std"]/text()').extract()])]
        item['productImage'] = item['productImage'] + hxs.select('//div[@class="more-views"]/ul/li/a/img/@src').extract() + hxs.select('//div[@class="more-views"]/ul/li/a/@href').extract()
        return item
#------------------------------------------------------------------------------
You can get the URL from response.url in the parse method. You could then parse that to get just the URL path:

import os

test = 'buy-women-fashion-accessories.html?p=1'
parts = os.path.splitext(test)
# ('buy-women-fashion-accessories', '.html?p=1')
parts[0].split('-')[1:]
# ['women', 'fashion', 'accessories']

This is a rather flimsy solution, though. Are you sure the data is not stored somewhere in the page's HTML that you are parsing, rather than in the URL?
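If the URL really is the only source, a small helper along these lines might be more explicit. This is an illustrative sketch only: it assumes every category page follows the '/buy-<tokens>.html?p=N' pattern from the question, and the handling of the 'online' filler token is my guess at the mapping you described:

import re

def categories_from_url(url):
    m = re.search(r'/buy-([a-z-]+)\.html', url)
    if not m:
        return "", ""
    # drop the 'online' filler so 'buy-ladies-suits-online' -> ['ladies', 'suits']
    tokens = [t for t in m.group(1).split('-') if t != 'online']
    productCategory = tokens[0] if tokens else ""
    productSubCategory = "-".join(tokens[1:])
    return productCategory, productSubCategory

print(categories_from_url('http://www.ladyblush.com/buy-ladies-suits-online.html?p=1'))
# ('ladies', 'suits')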
I just tried for the first time to populate an item while transporting it from page to page.
It works in each loop, and the gender information also arrives correctly in parse_3, but g2 doesn't match the category of the response URL, and g1 (the first category level) is always the last element of the list I loop through in parse_sub...
For sure I am doing something wrong, but I can't find the problem; it would be great if somebody could explain to me how it works.
Best,
Jack
class xspider(BaseSpider):
    name = 'x'
    allowed_domains = ['x.com']
    start_urls = ['http://www.x.com']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        maincats = hxs.select('//ul[@class="Nav"]/li/a/@href').extract()[1:3]
        for maincat in maincats:
            item = catItem()
            if 'men' in maincat:
                item['gender'] = 'men'
                maincat = 'http://www.x.com' + maincat
                request = Request(maincat, callback=self.parse_sub)
                request.meta['item'] = item
            if 'woman' in maincat:
                item['gender'] = []
                item['gender'] = 'woman'
                maincat = 'http://www.x.com' + maincat
                request = Request(maincat, callback=self.parse_sub)
                request.meta['item'] = item
            yield request

    def parse_sub(self, response):
        i = 0
        hxs = HtmlXPathSelector(response)
        subcats = hxs.select('//ul[@class="sub Sprite"]/li/a/@href').extract()[0:5]
        text = hxs.select('//ul[@class="sub Sprite"]/li/a/span/text()').extract()[0:5]
        for item in text:
            item = response.meta['item']
            subcat = 'http://www.x.com' + subcats[i]
            request = Request(subcat, callback=self.parse_subcat)
            item['g1'] = text[i]
            item['gender'] = response.request.meta['item']
            i = i + 1
            request.meta['item'] = item
            yield request

    def parse_subcat(self, response):
        hxs = HtmlXPathSelector(response)
        test = hxs.select('//ul[@class="sub"]/li/a').extract()
        for s in test:
            item = response.meta['item']
            item['g2'] = []
            item['g2'] = hxs.select('//span[@class="Active Sprite"]/text()').extract()[0]
            s = s.encode('utf-8', 'ignore')
            link = s[s.find('href="')+6:][:s[s.find('href="')+6:].find('/"')]
            link = 'http://www.x.com/' + str(link) + '/'
            request = Request(link, callback=self.parse_3)
            request.meta['item'] = item
            yield request

    def parse_3(self, response):
        item = response.meta['item']
        print item
def parse_subcat(self, response):
    hxs = HtmlXPathSelector(response)
    test = hxs.select('//ul[@class="sub"]/li/a').extract()
    for s in test:
        item = response.meta['item']
        item['g2'] = []
        item['g2'] = hxs.select('//span[@class="Active Sprite"]/text()').extract()[0]
        s = s.encode('utf-8', 'ignore')
        link = s[s.find('href="')+6:][:s[s.find('href="')+6:].find('/"')]
        link = 'http://www.x.com/' + str(link) + '/'
        request = Request(link, callback=self.parse_3)
        request.meta['item'] = item
        yield request
The response doesn't contain meta, but the request does, so instead of

item = response.meta['item']

it should be

item = response.request.meta['item']
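One more pitfall worth noting (my observation, not part of the answer above): each iteration of the loop in parse_sub pulls the same item object out of meta and mutates it, and since Scrapy schedules the requests asynchronously, every scheduled request ends up seeing the last mutation. That is exactly why g1 always holds the last element of the list. A sketch of one way around it, giving each request its own copy:

import copy

for i, label in enumerate(text):
    # fresh copy per request instead of one shared, mutated item
    item = copy.deepcopy(response.request.meta['item'])
    item['g1'] = label
    request = Request('http://www.x.com' + subcats[i], callback=self.parse_subcat)
    request.meta['item'] = item
    yield request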