Scrapy: transport an item over several parse callbacks and collect data - Python

I just tried for the first time to populate an item while passing it from page to page.
It works in each loop, and the gender information also arrives correctly in parse_3, but g2 doesn't match the category of the response URL, and g1 (the first category level) is always the last element of the list I loop through in parse_sub ...
I'm surely doing something wrong, but I can't find the problem; it would be great if somebody could explain how this works.
Best,
Jack
class xspider(BaseSpider):
    name = 'x'
    allowed_domains = ['x.com']
    start_urls = ['http://www.x.com']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        maincats = hxs.select('//ul[@class="Nav"]/li/a/@href').extract()[1:3]
        for maincat in maincats:
            item = catItem()
            if 'men' in maincat:
                item['gender'] = 'men'
                maincat = 'http://www.x.com' + maincat
                request = Request(maincat, callback=self.parse_sub)
                request.meta['item'] = item
            if 'woman' in maincat:
                item['gender'] = []
                item['gender'] = 'woman'
                maincat = 'http://www.x.com' + maincat
                request = Request(maincat, callback=self.parse_sub)
                request.meta['item'] = item
            yield request
    def parse_sub(self, response):
        i = 0
        hxs = HtmlXPathSelector(response)
        subcats = hxs.select('//ul[@class="sub Sprite"]/li/a/@href').extract()[0:5]
        text = hxs.select('//ul[@class="sub Sprite"]/li/a/span/text()').extract()[0:5]
        for item in text:
            item = response.meta['item']
            subcat = 'http://www.x.com' + subcats[i]
            request = Request(subcat, callback=self.parse_subcat)
            item['g1'] = text[i]
            item['gender'] = response.request.meta['item']
            i = i + 1
            request.meta['item'] = item
            yield request
    def parse_subcat(self, response):
        hxs = HtmlXPathSelector(response)
        test = hxs.select('//ul[@class="sub"]/li/a').extract()
        for s in test:
            item = response.meta['item']
            item['g2'] = []
            item['g2'] = hxs.select('//span[@class="Active Sprite"]/text()').extract()[0]
            s = s.encode('utf-8', 'ignore')
            link = s[s.find('href="')+6:][:s[s.find('href="')+6:].find('/"')]
            link = 'http://www.x.com/' + str(link) + '/'
            request = Request(link, callback=self.parse_3)
            request.meta['item'] = item
            yield request

    def parse_3(self, response):
        item = response.meta['item']
        print item

    def parse_subcat(self, response):
        hxs = HtmlXPathSelector(response)
        test = hxs.select('//ul[@class="sub"]/li/a').extract()
        for s in test:
            item = response.meta['item']
            item['g2'] = []
            item['g2'] = hxs.select('//span[@class="Active Sprite"]/text()').extract()[0]
            s = s.encode('utf-8', 'ignore')
            link = s[s.find('href="')+6:][:s[s.find('href="')+6:].find('/"')]
            link = 'http://www.x.com/' + str(link) + '/'
            request = Request(link, callback=self.parse_3)
            request.meta['item'] = item
            yield request
The response doesn't carry meta itself, but its request does, so instead of
item = response.meta['item']
it should be
item = response.request.meta['item']
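Beyond that detail, the symptom that g1 always holds the last element of the list usually means all of the yielded requests share one mutable item: parse_sub keeps mutating the same dict it got from meta, and by the time the scheduled callbacks actually run, they all see the final values. Also, item['gender'] = response.request.meta['item'] overwrites the gender string with the whole item. A minimal sketch of parse_sub that gives each request its own copy (assuming the same page structure; dict() is used for brevity, a fresh catItem with the fields copied over would work too):

    def parse_sub(self, response):
        hxs = HtmlXPathSelector(response)
        subcats = hxs.select('//ul[@class="sub Sprite"]/li/a/@href').extract()[0:5]
        text = hxs.select('//ul[@class="sub Sprite"]/li/a/span/text()').extract()[0:5]
        for href, label in zip(subcats, text):
            # Copy the parent item so every request carries independent data;
            # mutating one shared dict means all callbacks see the last values.
            item = dict(response.meta['item'])
            item['g1'] = label
            request = Request('http://www.x.com' + href, callback=self.parse_subcat)
            request.meta['item'] = item
            yield request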

Related

Scrapy: Crawled but not scraped any data

I wrote the following code to scrape Booking.com given the name of a city. Ideally, the program should find all the hotels available in the city and scrape all the reviews for each hotel. Unfortunately, it scrapes only a few hotels and only the first 75 reviews of those hotels. Will you please tell me what I am doing wrong here?
import scrapy
from scrapy import Spider
from scrapy.loader import ItemLoader
from booking_spider.items import BookingSpiderItem

class PerhotelrevSpider(Spider):
    name = 'perhotelrev'
    allowed_domains = ['booking.com']
    #start_urls = ['https://booking.com/reviews/us/hotel/maison-st-charles-quality-inn-suites.html?/']
    start_urls = ['https://www.booking.com/searchresults.html?ss=New%20Orleans&']
    #handle_httpstatus_list = [301, 302]

    def parse(self, response):
        all_hotels = response.xpath('.//*[@class="sr-hotel__title \n"]')
        for ahotel in all_hotels:
            hotel_name = ahotel.xpath('.//*[@class="sr-hotel__name\n"]/text()').extract_first().replace('\n','')
            hotel_url = ahotel.xpath('.//*[@class="hotel_name_link url"]/@href').extract_first().replace('\n','')
            full_hotel_url = 'https://www.booking.com' + str(hotel_url)
            request = scrapy.Request(full_hotel_url, callback=self.parse_hotels)
            request.meta['adict'] = {'HotelName': hotel_name}
            yield request
        next_page = response.xpath('.//*[@class="bui-pagination__item bui-pagination__next-arrow"]/a/@href').extract_first()
        if next_page is not None:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_hotels(self, response):
        adict = response.meta['adict']
        hotel_name = adict['HotelName']
        #hotel_name = response.xpath('.//*[@class="hp__hotel-name"]/text()')[1].extract().replace('\n','')
        image_urls = response.xpath('.//*[@class="b_nha_hotel_small_images hp_thumbgallery_with_counter"]/a/@href').extract()
        all_facilities = response.xpath('.//*[@class="facilitiesChecklistSection"]/ul/li/span/text()').extract()
        all_facilities = [x.replace('\n','') for x in all_facilities]
        important_facility = response.xpath('.//*[@class="important_facility "]/@data-name-en').extract()
        #print(hotel_name)
        all_review_url = response.xpath('.//*[@class="show_all_reviews_btn"]/@href').extract_first()
        adict = {
            'HotelName': hotel_name,
            'ImageUrls': image_urls,
            'Facilities': all_facilities,
            'MainFacilities': important_facility
        }
        if all_review_url is not None:
            review_url = "https://booking.com" + all_review_url
            request = scrapy.Request(review_url, callback=self.parse_review)
            request.meta['adict'] = adict
            yield request

    def parse_review(self, response):
        allreviewsinpage = response.xpath('.//*[@itemprop="review"]')
        adict = response.meta['adict']
        hotel_name = adict['HotelName']
        image_urls = adict['ImageUrls']
        all_facilities = adict['Facilities']
        important_facility = adict['MainFacilities']
        for eachreview in allreviewsinpage:
            username = eachreview.xpath('.//p[@class="reviewer_name"]/*[@itemprop="name"]/text()').extract_first()
            usercountry = eachreview.xpath('.//*[@itemprop="nationality"]/*[@itemprop="name"]/text()').extract_first()
            numreviewgiven = eachreview.xpath('.//*[@class="review_item_user_review_count"]/text()').extract_first()
            useragegroup = eachreview.xpath('.//*[@class="user_age_group"]/text()').extract_first()
            heading = eachreview.xpath('.//*[@class="review_item_header_content\n"]/*[@itemprop="name"]/text()').extract_first()
            neg_rev = eachreview.xpath('.//p[@class="review_neg "]/*[@itemprop="reviewBody"]/text()').extract_first()
            pos_rev = eachreview.xpath('.//p[@class="review_pos "]/*[@itemprop="reviewBody"]/text()').extract_first()
            tagging = eachreview.xpath('.//ul[@class="review_item_info_tags"]/*[@class="review_info_tag "]/text()').extract()
            stayedin = eachreview.xpath('.//p[@class="review_staydate "]/text()').extract_first()
            givenscore = eachreview.xpath('.//span[@class="review-score-badge"]/text()').extract_first()
            l = ItemLoader(item=BookingSpiderItem(), selector=response)
            l.add_value('HotelName', hotel_name)
            #l.add_value('ImageUrls', image_urls)
            l.add_value('Facilities', all_facilities)
            l.add_value('MainFacilities', important_facility)
            l.add_value('UserName', username)
            l.add_value('UserCountry', usercountry)
            l.add_value('NumReviewGiven', numreviewgiven)
            l.add_value('UserAgeGroup', useragegroup)
            l.add_value('Heading', heading)
            l.add_value('NegativeReview', neg_rev)
            l.add_value('PositiveReview', pos_rev)
            l.add_value('SelfTag', tagging)
            l.add_value('StayDate', stayedin)
            l.add_value('GivenScore', givenscore)
            yield l.load_item()
        next_page = response.xpath('.//*[@class="page_link review_next_page"]/a/@href').extract_first()
        if next_page is not None:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(next_page_url, callback=self.parse_review)

Multiple pages per item - using scraped links

My spider looks like this:
class ScrapeMovies(scrapy.Spider):
    start_urls = [
        'https://www.trekearth.com/members/page1.htm?sort_by=md'
    ]

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            item = loopitem()
            website = row.xpath('./td[2]//a/@href/text()').extract_first()
            item['name'] = row.xpath('./td[2]//a/text()').extract_first()
            yield item

        # This part is responsible for scraping all of the pages on a start url - commented out for convenience
        # next_page = response.xpath('//div[@class="page-nav-btm"]/ul/li[last()]/a/@href').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)
As of now it scrapes the table (see the starting URL). I want it to then follow the link in the member name column, extract some information from that page (the link is e.g. https://www.trekearth.com/members/monareng/), and then return everything as one item.
How should I approach this?
If anything is unclear, please do not hesitate to ask for clarification.
EDIT:
Now my code looks as follows (however, it still does not work):
class ScrapeMovies(scrapy.Spider):
    name = 'final'
    start_urls = [
        'https://www.trekearth.com/members/page1.htm?sort_by=md'
    ]

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            item = FinalItem()
            website = row.xpath('./td[2]//a/@href/text()').extract_first()
            item['name'] = row.xpath('./td[2]//a/text()').extract_first()
            request = scrapy.Request(website,
                                     callback=self.parse_page2)
            request.meta['item'] = item
            return request

    def parse_page2(self, response):
        item = response.meta['item']
        item['other_url'] = response.url
        item['groups'] = response.xpath('//div[@class="groups-btm"]/ul/li/text()').extract_first()
        return item
Use the meta field to carry the item forward to the next callback:
def parse_page1(self, response):
    item = MyItem(main_url=response.url)
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    return request

def parse_page2(self, response):
    item = response.meta['item']
    item['other_url'] = response.url
    return item
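This is the canonical pattern from the Scrapy documentation (the "Passing additional data to callback functions" section of the Requests and Responses docs).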
UPD: to process all rows, use yield in your loop:
for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
    item = FinalItem()
    website = row.xpath('./td[2]//a/@href/text()').extract_first()
    item['name'] = row.xpath('./td[2]//a/text()').extract_first()
    request = scrapy.Request(website,
                             callback=self.parse_page2)
    request.meta['item'] = item
    yield request
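One more thing worth checking, independent of the meta question: ./td[2]//a/@href/text() selects the text children of an attribute node, and attribute nodes have no children, so website will always be None and the Request constructor will raise an error. A small sketch of the extraction, assuming the same table layout:

website = row.xpath('./td[2]//a/@href').extract_first()
if website:
    # urljoin resolves hrefs that are relative to the current page
    yield scrapy.Request(response.urljoin(website), callback=self.parse_page2, meta={'item': item})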

Exporting scraped data to a CSV file

I'm trying to get data from a website that requires me to follow 2 URLs before scraping the data.
The goal is to get an exported file that looks like this:
My code is as follows:
import scrapy
from scrapy.item import Item, Field
from scrapy import Request

class myItems(Item):
    info1 = Field()
    info2 = Field()
    info3 = Field()
    info4 = Field()

class mySpider(scrapy.Spider):
    name = 'techbot'
    start_urls = ['']

    def parse(self, response):
        # Extracts first link
        items = []
        list1 = response.css("").extract()  # extract all info from here
        for i in list1:
            link1 = 'https:...' + str(i)
            request = Request(link1, self.parseInfo1, dont_filter=True)
            request.meta['item'] = items
            yield request
        yield items

    def parseInfo1(self, response):
        # Extracts second link
        item = myItems()
        items = response.meta['item']
        list1 = response.css("").extract()
        for i in list1:
            link1 = '' + str(i)
            request = Request(link1, self.parseInfo2, dont_filter=True)
            request.meta['item'] = items
            items.append(item)
        return request

    def parseInfo2(self, response):
        # Extracts all data
        item = myItems()
        items = response.meta['item']
        item['info1'] = response.css("").extract()
        item['info2'] = response.css("").extract()
        item['info3'] = response.css("").extract()
        item['info4'] = response.css("").extract()
        items.append(item)
        return items
I've executed the spider in the terminal with the command:
scrapy crawl techbot
The data I get is out of order, with gaps like this:
For example, it scrapes the first set of data multiple times, and the rest is out of order.
If anyone could point me in the direction of getting the results in the cleaner format shown at the beginning, that would be greatly appreciated.
Thanks
Solved it by consolidating the link-following for both links into one function instead of two. My spider now works as follows:
class mySpider(scrapy.Spider):
    name = 'techbot'
    start_urls = ['']

    def parse(self, response):
        # Extracts links
        items = []
        list1 = response.css("").extract()
        for i in list1:
            link1 = 'https:...' + str(i)
            request = Request(link1, self.parse, dont_filter=True)
            request.meta['item'] = items
            yield request
        list2 = response.css("").extract()
        for i in list2:
            link2 = '' + str(i)
            request = Request(link2, self.parseInfo2, dont_filter=True)
            request.meta['item'] = items
            yield request
        yield items

    def parseInfo2(self, response):
        # Extracts all data
        item = myItems()
        items = response.meta['item']
        item['info1'] = response.css("").extract()
        item['info2'] = response.css("").extract()
        item['info3'] = response.css("").extract()
        item['info4'] = response.css("").extract()
        items.append(item)
        return items
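For the CSV part of the question: once the spider yields items, Scrapy's built-in feed exports can write them straight to a file, so no extra export code is needed:
scrapy crawl techbot -o results.csv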

Scrapy: calling another URL

I am using Scrapy to scrape a website. I am getting all products from the listing page. Now I want to go to each product's URL, but I am not getting a satisfactory result.
Here is my code:
import scrapy
from scrapy.http import Request
from tutorial.items import DmozItem

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domain = ["test.com"]
    start_urls = [
        "http://www.test.com/?page=1"
    ]
    page_index = 1

    def parse(self, response):
        products = response.xpath('//li')
        items = []
        if products:
            for product in products:
                item = DmozItem()
                item['link'] = product.xpath('@data-url').extract()
                item['sku'] = product.xpath('@data-sku').extract()
                item['brand'] = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
                item['img'] = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
                page_url = "http://www.jabong.com/Lara-Karen-Black-Sweaters-893039.html"
                request = Request(url=page_url, callback=self.parse_page2,
                                  headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"})
                request.meta['item'] = item
                item['other'] = request
                yield item
        else:
            return
        self.page_index += 1
        if self.page_index:
            yield Request(url="http://www.test.com/?page=%s" % (self.page_index),
                          headers={"Referer": "http://www.test.com/", "X-Requested-With": "XMLHttpRequest"},
                          callback=self.parse)

    def parse_page2(self, response):
        item = response.meta['item']
        item['title'] = response.xpath("//span[@id='before_price']/text()")
        yield item
The result I am getting is:
{"sku": [], "brand": [], "other": "<Request GET http://www.test.com/>", "link": [], "img": []}
Instead of the request object, I need the data returned from the parse_page2 function.
Where am I going wrong?
Your XPaths seem to be wrong here; try this:
In [0]: products[0].xpath('./@data-url').extract()
Out[0]: [u'Sangria-Green-Kurtis-Kurtas-1081831.html']

In [1]: products[0].xpath('./a/@unbxdparam_sku').extract()
Out[1]: [u'SA038WA68OIXINDFAS']

In [2]: products[0].xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
Out[2]: [u'Sangria']

In [3]: products[0].xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
Out[3]: [u'http://static14.jassets.com/p/Sangria-Green--Kurtis-26-Kurtas-5520-1381801-1-catalog.jpg']
so the code will be:
BASE_URL = 'http://www.jabong.com/'

for product in products:
    item = DmozItem()
    item_url = product.xpath('./@data-url').extract()
    item_url = self.BASE_URL + item_url[0] if item_url else ''
    item['link'] = product.xpath('./@data-url').extract()
    item['sku'] = product.xpath('./a/@unbxdparam_sku').extract()
    item['brand'] = product.xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
    item['img'] = product.xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
    if item_url:
        yield Request(url=item_url, callback=self.parse_page2,
                      headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"},
                      meta={'item': item})
EDIT: complete spider code:
import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.spider import Spider
from scrapy.http import Request

class JabongItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    link = scrapy.Field()
    sku = scrapy.Field()
    brand = scrapy.Field()
    img = scrapy.Field()

class JabongSpider(scrapy.Spider):
    name = "jabong"
    allowed_domains = ["jabong.com"]
    start_urls = ["http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=1"]
    page_index = 1
    BASE_URL = 'http://www.jabong.com/'

    def parse(self, response):
        products = response.xpath("//li[@data-url]")
        if products:
            for product in products:
                link = product.xpath('@data-url').extract()
                link = self.BASE_URL + link[0] if link else ''
                sku = product.xpath('@data-sku').extract()
                sku = sku[0].strip() if sku else 'n/a'
                brand = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
                brand = brand[0].strip() if brand else 'n/a'
                img = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
                img = img[0].strip() if img else 'n/a'
                item = JabongItem()
                item['link'] = link
                item['sku'] = sku
                item['brand'] = brand
                item['img'] = img
                if link:
                    yield Request(url=link, callback=self.parse_page2, meta={'item': item})
        else:
            return
        self.page_index += 1
        yield Request(url="http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=%s" % self.page_index,
                      callback=self.parse, dont_filter=True)

    def parse_page2(self, response):
        item = response.meta['item']
        # add whatever extra details you want to item
        yield item
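Note that passing meta={'item': item} directly to the Request constructor, as this spider does, is equivalent to building the request first and then assigning request.meta['item'] = item, as in the earlier examples.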

How to call the parse_page2 method for every item

I am trying to call the parse_page2 method for every item, but every time I run this spider I am only getting a single item per page. How do I call the parse_page2 method for every item?
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider
#------------------------------------------------------------------------------

class ESpider(CrawlSpider):
    name = "atisundarSpider"
    allowed_domains = ["atisundar.com"]

    URLSList = []
    for n in range(1, 20):
        URLSList.append('http://atisundar.com/collections/sarees?page=' + str(n))
        URLSList.append('http://atisundar.com/collections/salwar-suits?page=' + str(n))
    start_urls = URLSList

    def parse(self, response):
        item = EscraperItem()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="block product size-medium"]')
        items = []
        for site in sites:
            item = EscraperItem()
            item['productDesc'] = ""
            item['productSite'] = ["http://atisundar.com/"]
            item['productTitle'] = site.select('.//div[@class="main"]/a/@title').extract()
            item['productURL'] = ["http://atisundar.com" + site.select('.//div[@class="main"]/a/@href').extract()[0].encode('utf-8')]
            item['productPrice'] = site.select('.//p[@class="pricearea"]//span[@class="was-price"]/text()').extract() + site.select('.//p[@class="pricearea"]//span[@class="price"]/text()').extract()
            item['productImage'] = [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0]] + [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0].replace("medium","grande")]
            item['image_urls'] = item['productImage']
            items.append(item)
            secondURL = "http://admin.atisundar.com/store/skuDetails?product_id=" + site.select('.//div[@class="main"]/a/text()').extract()[1].strip().split("#")[-1]
            request = Request(secondURL,
                              callback=self.parse_page2)
            request.meta['item'] = item
            return request

    def parse_page2(self, response):
        item = response.meta['item']
        #item['other_url'] = response.url
        return item
1) You are not using any CrawlSpider functionality, so I would recommend you inherit your spider from BaseSpider.
2) In the for loop
for site in sites:
use yield rather than return; otherwise return exits parse on the first iteration (see the small illustration after the code below):
yield request
3) In parse_page2, get the item from response.request.meta instead of response.meta:
item = response.request.meta['item']
It should work now.
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
#------------------------------------------------------------------------------
from scrapy.spider import BaseSpider

class ESpider(BaseSpider):
    name = "atisundarSpider"
    allowed_domains = ["atisundar.com"]

    URLSList = []
    for n in range(1, 20):
        URLSList.append('http://atisundar.com/collections/sarees?page=' + str(n))
        URLSList.append('http://atisundar.com/collections/salwar-suits?page=' + str(n))
    start_urls = URLSList

    def parse(self, response):
        item = EscraperItem()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="block product size-medium"]')
        for site in sites:
            item = EscraperItem()
            item['productDesc'] = ""
            item['productSite'] = ["http://atisundar.com/"]
            item['productTitle'] = site.select('.//div[@class="main"]/a/@title').extract()
            item['productURL'] = ["http://atisundar.com" + site.select('.//div[@class="main"]/a/@href').extract()[0].encode('utf-8')]
            item['productPrice'] = site.select('.//p[@class="pricearea"]//span[@class="was-price"]/text()').extract() + site.select('.//p[@class="pricearea"]//span[@class="price"]/text()').extract()
            item['productImage'] = [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0]] + [site.select('.//div[@class="main"]/a/img/@src').extract()[0].split('?')[0].replace("medium","grande")]
            item['image_urls'] = item['productImage']
            secondURL = "http://admin.atisundar.com/store/skuDetails?product_id=" + site.select('.//div[@class="main"]/a/text()').extract()[1].strip().split("#")[-1]
            request = Request(secondURL,
                              callback=self.parse_page2)
            request.meta['item'] = item
            yield request

    def parse_page2(self, response):
        item = response.request.meta['item']
        #item['other_url'] = response.url
        return item
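To illustrate point 2 above, here is a minimal, self-contained Python example (not Scrapy-specific) of why return produces only one result while yield produces one per loop iteration:

def with_return(values):
    for v in values:
        return v  # exits the function on the first iteration

def with_yield(values):
    for v in values:
        yield v  # produces every value, one per iteration

print(with_return([1, 2, 3]))       # 1
print(list(with_yield([1, 2, 3])))  # [1, 2, 3]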
