Scrapy Spider scraping same thing multiple times and missing other items - python

When I run my Scrapy spider to scrape comments on the Steam platform, it is missing a lot of comments and is scraping the same comments several times. What is wrong with my code?
import scrapy
from scrapy import Request, FormRequest
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
import json
from scrapy import Selector
import re


class Workshop_Item(Item):
    app_id = Field()
    workshop_id = Field()
    game = Field()
    workshop_name = Field()
    user = Field()
    comment = Field()
    user_level = Field()
    date_posted = Field()
    user_location = Field()
    number_of_badges = Field()
    user_join_date = Field()
    is_author = Field()
    user_experience = Field()


class Workshop_Comment_Spider(scrapy.Spider):
    name = "comments"

    with open("output/workshop_comment_links.txt") as f:
        urls = [line.rstrip("\n") for line in f]
    start_urls = urls

    def parse(self, response):
        if int(max(response.css('span.tabCount::text').getall())) > 0 and "profiles" in response.css('a.commentthread_author_link::attr(href)').get():
            contributor_id = re.search(r'Public_(.*?)_', response.css('div.commentthread_footer a::attr(id)').get()).group(1)
        elif int(max(response.css('span.tabCount::text').getall())) > 0:
            contributor_id = re.search(r'Public_(.*?)_', response.css('div.commentthread_footer a::attr(id)').get()).group(1)
        workshop_id_number = response.css('form.smallForm > input::attr(value)').get()
        if int(max(response.css('span.tabCount::text').getall())) > 50:
            comment_number = max(response.css('span.tabCount::text').getall())
            url = f'https://steamcommunity.com/comment/PublishedFile_Public/render/{contributor_id}/{workshop_id_number}/'
            data = {
                "start": "1",
                "totalcount": comment_number,
                "count": comment_number,
                "sessionid": "d880ab2338b70926db0a9591",
                "extended_data": "{\"contributors\":[\"" + contributor_id + "\",{}],\"appid\":289070,\"sharedfile\":{\"m_parentsDetails\":null,\"m_parentBundlesDetails\":null,\"m_bundledChildren\":[],\"m_ownedBundledItems\":[]},\"parent_item_reported\":false}",
                "feature2": "-1"
            }
            app_id = response.css('div.apphub_HeaderTop a::attr(data-appid)').get()
            game = response.css(".apphub_AppName::text").get()
            workshop_id = response.css('form.smallForm input::attr(value)').get()
            workshop_name = response.css(".workshopItemTitle::text").get()
            yield FormRequest(url, formdata=data, callback=self.parse_paginated_comments, meta={'app_id': app_id, 'game': game, 'workshop_id': workshop_id, 'workshop_name': workshop_name})
        else:
            for comment in response.css(".commentthread_comment"):
                item = Workshop_Item()
                item['is_author'] = False
                if "authorbadge" in comment.get():
                    item['is_author'] = True
                item['app_id'] = response.css('div.apphub_HeaderTop a::attr(data-appid)').get()
                item['workshop_id'] = response.css('form.smallForm input::attr(value)').get()
                item['game'] = response.css(".apphub_AppName::text").get()
                item['workshop_name'] = response.css(".workshopItemTitle::text").get()
                item['user'] = comment.css("bdi::text").get()
                item['comment'] = ",".join(comment.css(".commentthread_comment_text::text").getall()).replace('\n', ' ').replace('\t', '').replace('\r', ' ')
                item['date_posted'] = comment.css(".commentthread_comment_timestamp::attr(title)").get()
                item['user_level'] = -1
                user_profile = comment.css(".commentthread_author_link::attr(href)").get()
                request = Request(user_profile, callback=self.parse_user_info, meta={'item': item})
                yield request

    def parse_user_info(self, response):
        item = response.meta['item']
        if response.css('.profile_private_info'):
            item['user_level'] = 'private'
            item['user_location'] = 'private'
            item['number_of_badges'] = 'private'
            item['user_join_date'] = 'private'
            item['user_experience'] = 'private'
            return item
        else:
            item['user_level'] = response.css(".friendPlayerLevelNum::text").get()
            if response.css('.header_real_name') and response.css("img.profile_flag"):
                item['user_location'] = response.css('.header_real_name::text').getall()[2].strip()
            else:
                item['user_location'] = 'NA'
            if response.css("div.profile_badges span.profile_count_link_total::text"):
                item['number_of_badges'] = response.css("div.profile_badges span.profile_count_link_total::text").get().strip()
            else:
                item['number_of_badges'] = 'NA'
            user_badge_page = response.css("div.profile_header_badgeinfo_badge_area > a::attr(href)").get() + "/1"
            request = Request(user_badge_page, callback=self.parse_badge_info, meta={'item': item})
            yield request

    def parse_badge_info(self, response):
        item = response.meta['item']
        if response.css("div.badge_description"):
            item['user_join_date'] = response.css("div.badge_description::text").get().strip()
        experience_page = response.css('a.whiteLink.persona_name_text_content::attr(href)').get() + "/badges"
        request = Request(experience_page, callback=self.parse_experience_page, meta={'item': item})
        yield request

    def parse_experience_page(self, response):
        item = response.meta['item']
        if response.css('span.profile_xp_block_xp'):
            item['user_experience'] = response.css('span.profile_xp_block_xp::text').get()
        return item

    def parse_paginated_comments(self, response):
        app_id = response.meta['app_id']
        game = response.meta['game']
        workshop_id = response.meta['workshop_id']
        workshop_name = response.meta['workshop_name']
        jsonresponse = json.loads(response.body.decode("utf-8"))
        sel = Selector(text=jsonresponse['comments_html'])
        for comment in sel.css(".commentthread_comment"):
            item = Workshop_Item()
            item['is_author'] = False
            if "authorbadge" in comment.get():
                item['is_author'] = True
            item['app_id'] = app_id  # sel.css('div.apphub_HeaderTop a::attr(data-appid)').get()
            item['workshop_id'] = workshop_id  # sel.css('form.smallForm input::attr(value)').get()
            item['game'] = game  # sel.css(".apphub_AppName::text").get()
            item['workshop_name'] = workshop_name  # sel.css(".workshopItemTitle::text").get()
            item['user'] = comment.css("bdi::text").get()
            item['comment'] = ",".join(comment.css(".commentthread_comment_text::text").getall()).replace('\n', ' ').replace('\t', '').replace('\r', ' ')
            item['date_posted'] = comment.css(".commentthread_comment_timestamp::attr(title)").get()
            item['user_level'] = -1
            user_profile = sel.css(".commentthread_author_link::attr(href)").get()
            request = Request(user_profile, callback=self.parse_user_info, meta={'item': item})
            yield request
I am scraping a comment from a page and then going to the user's profile to collect user data. If the page has pagination (more than 50 comments), I send a POST request to retrieve the JSON that contains the HTML for all of the comments, and then scrape that.

Fixed it. The problem was here:
def parse_paginated_comments(self, response):
    app_id = response.meta['app_id']
    game = response.meta['game']
    workshop_id = response.meta['workshop_id']
    workshop_name = response.meta['workshop_name']
    jsonresponse = json.loads(response.body.decode("utf-8"))
    sel = Selector(text=jsonresponse['comments_html'])
    for comment in sel.css(".commentthread_comment"):
        item = Workshop_Item()
        item['is_author'] = False
I needed to change
for comment in sel.css(".commentthread_comment"):
to
for comment in comment.css(".commentthread_comment"):
and I needed to add
DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'
to the settings.py file.
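For context, BaseDupeFilter switches off Scrapy's default duplicate-request filtering, so repeated requests to the same profile URL are no longer silently dropped. The settings change is just:

# settings.py
# Replace the default RFPDupeFilter so that requests to already-seen URLs
# (e.g. the same user profile linked from several comments) are not dropped.
DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'

A narrower alternative (not what was done here) is to keep the default filter and pass dont_filter=True only on the per-profile requests, e.g. Request(user_profile, callback=self.parse_user_info, meta={'item': item}, dont_filter=True).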

Related

Scrapy downloading json-files from site?

I tried to create a Scrapy spider to download some JSON files from a site.
This is my Scrapy spider (first I tested the spider, so it only outputs the link to the JSON file, which works fine; see the commented code below).
But I want to download the JSON files to a folder on my PC.
import scrapy


class spiderWords(scrapy.Spider):
    name = 'spiderWords'
    allowed_domains = ['kaikki.org']
    start_urls = ['https://kaikki.org/dictionary/Spanish/words.html']

    def parse(self, response):
        tmpLinks = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l in tmpLinks:
            l = response.urljoin(l)
            request = scrapy.Request(l, callback=self.parseDetails)
            yield request

    def parseDetails(self, response):
        tmpLinks2 = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l2 in tmpLinks2:
            l2 = response.urljoin(l2)
            request = scrapy.Request(l2, callback=self.parseDownload)
            yield request

    def parseDownload(self, response):
        class DownfilesItem(scrapy.Item):
            file_urls = scrapy.Field()
            files = scrapy.Field

        tmpDownloadLink = response.xpath("//p[contains(text(), 'JSON')]/a/@href").get()
        tmpDownloadLink = response.urljoin(tmpDownloadLink)
        item = DownfilesItem()
        item['file_urls'] = tmpDownloadLink
        yield item
        # yield {
        #     "link": tmpDownloadLink,
        # }
And these are the changes I made in settings.py:
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
IMAGES_STORE = r'C:\DOWNLOAD\DATASETS\Spanish'
But unfortunately the download of the JSON files is not working.
How can I download the JSON files to the defined folder?
You have two problems:
1. item['file_urls'] should be a list.
2. IMAGES_STORE should be FILES_STORE.
import scrapy


class spiderWords(scrapy.Spider):
    name = 'spiderWords'
    allowed_domains = ['kaikki.org']
    start_urls = ['https://kaikki.org/dictionary/Spanish/words.html']

    def parse(self, response):
        tmpLinks = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l in tmpLinks:
            l = response.urljoin(l)
            request = scrapy.Request(l, callback=self.parseDetails)
            yield request

    def parseDetails(self, response):
        tmpLinks2 = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l2 in tmpLinks2:
            l2 = response.urljoin(l2)
            request = scrapy.Request(l2, callback=self.parseDownload)
            yield request

    def parseDownload(self, response):
        class DownfilesItem(scrapy.Item):
            file_urls = scrapy.Field()
            files = scrapy.Field()

        tmpDownloadLink = response.xpath("//p[contains(text(), 'JSON')]/a/@href").get()
        tmpDownloadLink = response.urljoin(tmpDownloadLink)
        item = DownfilesItem()
        item['file_urls'] = [tmpDownloadLink]
        yield item
        # yield {
        #     "link": tmpDownloadLink,
        # }
EDIT:
In order to set the file's name do this:
settings.py:
ITEM_PIPELINES = {
    'yourprojectname.pipelines.ProcessPipeline': 1,
}
FILES_STORE = r'C:\DOWNLOAD\DATASETS\Spanish'
pipelines.py:
import os
from urllib.parse import unquote

from scrapy.pipelines.files import FilesPipeline


class ProcessPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        file_name = os.path.basename(unquote(request.url))
        return file_name
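With this override, each file is saved directly under FILES_STORE using the last path segment of the download URL as its name, instead of FilesPipeline's default behaviour of storing it under a full/ subfolder with a SHA-1 hash of the URL as the filename.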
EDIT 2:
Writing additional information to a file:
import json

import scrapy


class spiderWords(scrapy.Spider):
    name = 'spiderWords'
    allowed_domains = ['kaikki.org']
    start_urls = ['https://kaikki.org/dictionary/Spanish/words.html']
    erg = {}

    def parse(self, response):
        tmpLinks = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l in tmpLinks:
            l = response.urljoin(l)
            request = scrapy.Request(l, callback=self.parseDetails)
            yield request

    def parseDetails(self, response):
        tmpLinks2 = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l2 in tmpLinks2:
            l2 = response.urljoin(l2)
            request = scrapy.Request(l2, callback=self.parseDownload)
            yield request

    def parseDownload(self, response):
        class DownfilesItem(scrapy.Item):
            file_urls = scrapy.Field()
            files = scrapy.Field()

        key = response.xpath('//ul[@class="breadcrumb"]/li[last()]/text()').get()
        self.erg[key] = response.url
        tmpDownloadLink = response.xpath("//p[contains(text(), 'JSON')]/a/@href").get()
        tmpDownloadLink = response.urljoin(tmpDownloadLink)
        item = DownfilesItem()
        item['file_urls'] = [tmpDownloadLink]
        yield item

    def close(self, reason):
        with open('erg.json', 'w') as f:
            f.write(json.dumps(self.erg, indent=4))

Scrapy: Crawled but not scraped any data

I wrote the following code to scrape Booking.com given the name of a city. Ideally, the program should find all the hotels that are available in the city and scrape all the reviews for each hotel. Unfortunately, it scrapes only a few hotels and only the first 75 reviews of those hotels. Will you please tell me what I am doing wrong here?
import scrapy
from scrapy import Spider
from scrapy.loader import ItemLoader

from booking_spider.items import BookingSpiderItem


class PerhotelrevSpider(Spider):
    name = 'perhotelrev'
    allowed_domains = ['booking.com']
    #start_urls = ['https://booking.com/reviews/us/hotel/maison-st-charles-quality-inn-suites.html?/']
    start_urls = ['https://www.booking.com/searchresults.html?ss=New%20Orleans&']
    #handle_httpstatus_list = [301, 302]

    def parse(self, response):
        all_hotels = response.xpath('.//*[@class="sr-hotel__title \n"]')
        for ahotel in all_hotels:
            hotel_name = ahotel.xpath('.//*[@class="sr-hotel__name\n"]/text()').extract_first().replace('\n', '')
            hotel_url = ahotel.xpath('.//*[@class="hotel_name_link url"]/@href').extract_first().replace('\n', '')
            full_hotel_url = 'https://www.booking.com' + str(hotel_url)
            request = scrapy.Request(full_hotel_url, callback=self.parse_hotels)
            request.meta['adict'] = {'HotelName': hotel_name}
            yield request
        next_page = response.xpath('.//*[@class="bui-pagination__item bui-pagination__next-arrow"]/a/@href').extract_first()
        if next_page is not None:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_hotels(self, response):
        adict = response.meta['adict']
        hotel_name = adict['HotelName']
        #hotel_name = response.xpath('.//*[@class="hp__hotel-name"]/text()')[1].extract().replace('\n','')
        image_urls = response.xpath('.//*[@class="b_nha_hotel_small_images hp_thumbgallery_with_counter"]/a/@href').extract()
        all_facilities = response.xpath('.//*[@class="facilitiesChecklistSection"]/ul/li/span/text()').extract()
        all_facilities = [x.replace('\n', '') for x in all_facilities]
        important_facility = response.xpath('.//*[@class="important_facility "]/@data-name-en').extract()
        #print(hotel_name)
        all_review_url = response.xpath('.//*[@class="show_all_reviews_btn"]/@href').extract_first()
        adict = {'HotelName': hotel_name,
                 'ImageUrls': image_urls,
                 'Facilities': all_facilities,
                 'MainFacilities': important_facility
                 }
        if all_review_url is not None:
            review_url = "https://booking.com" + all_review_url
            request = scrapy.Request(review_url, callback=self.parse_review)
            request.meta['adict'] = adict
            yield request

    def parse_review(self, response):
        allreviewsinpage = response.xpath('.//*[@itemprop="review"]')
        adict = response.meta['adict']
        hotel_name = adict['HotelName']
        image_urls = adict['ImageUrls']
        all_facilities = adict['Facilities']
        important_facility = adict['MainFacilities']
        for eachreview in allreviewsinpage:
            username = eachreview.xpath('.//p[@class="reviewer_name"]/*[@itemprop="name"]/text()').extract_first()
            usercountry = eachreview.xpath('.//*[@itemprop="nationality"]/*[@itemprop="name"]/text()').extract_first()
            numreviewgiven = eachreview.xpath('.//*[@class="review_item_user_review_count"]/text()').extract_first()
            useragegroup = eachreview.xpath('.//*[@class="user_age_group"]/text()').extract_first()
            heading = eachreview.xpath('.//*[@class="review_item_header_content\n"]/*[@itemprop="name"]/text()').extract_first()
            neg_rev = eachreview.xpath('.//p[@class="review_neg "]/*[@itemprop="reviewBody"]/text()').extract_first()
            pos_rev = eachreview.xpath('.//p[@class="review_pos "]/*[@itemprop="reviewBody"]/text()').extract_first()
            tagging = eachreview.xpath('.//ul[@class="review_item_info_tags"]/*[@class="review_info_tag "]/text()').extract()
            stayedin = eachreview.xpath('.//p[@class="review_staydate "]/text()').extract_first()
            givenscore = eachreview.xpath('.//span[@class="review-score-badge"]/text()').extract_first()
            l = ItemLoader(item=BookingSpiderItem(), selector=response)
            l.add_value('HotelName', hotel_name)
            #l.add_value('ImageUrls', image_urls)
            l.add_value('Facilities', all_facilities)
            l.add_value('MainFacilities', important_facility)
            l.add_value('UserName', username)
            l.add_value('UserCountry', usercountry)
            l.add_value('NumReviewGiven', numreviewgiven)
            l.add_value('UserAgeGroup', useragegroup)
            l.add_value('Heading', heading)
            l.add_value('NegativeReview', neg_rev)
            l.add_value('PositiveReview', pos_rev)
            l.add_value('SelfTag', tagging)
            l.add_value('StayDate', stayedin)
            l.add_value('GivenScore', givenscore)
            yield l.load_item()
        next_page = response.xpath('.//*[@class="page_link review_next_page"]/a/@href').extract_first()
        if next_page is not None:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(next_page_url, callback=self.parse_review)

Scrapy crawler crawls but does not scrape

Any idea why this isn't working? I'm a complete newbie to Scrapy, trying to extract the data to a CSV file, but I can't do that if it doesn't scrape. I'm thinking the problem could be in the xpaths, but all the paths under def parse_node are correct. Could there be another reason it isn't scraping?
Terminal output:
2017-01-10 10:31:16 [scrapy.extensions.logstats] INFO: Crawled 213 pages (at 23 pages/min), scraped 0 items (at 0 items/min)
Code:
#!/usr/bin/env python
import types
import time
from datetime import date, datetime, timedelta

import requests
import msgpack
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector, Selector
from resume_data.items import ResumeDataItem, ResultListItem, WorkItem, SchoolItem, ItemList
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from bs4.element import NavigableString


class ResumeIndeedSpider(CrawlSpider):
    name = "indeed_resume"
    allowed_domains = ["indeed.com"]
    start_urls = ['http://www.indeed.com/resumes/mechanical-engineer',
                  'http://www.indeed.com/resumes/mechanical-engineering',
                  'http://www.indeed.com/resumes/piping-engineer',
                  'http://www.indeed.com/resumes/design-engineer',
                  'http://www.indeed.com/resumes/project-engineer']

    #def __init__(self, filename=None):
    #    self.unis = list()

    rules = (Rule(SgmlLinkExtractor(restrict_xpaths=('//a[contains(@class,"app_link")]')), callback="parse_item", follow=True),)

    def parse_item(self, response):
        hxs = Selector(response)
        digest = hxs.xpath('//ol[@class="resultsList"]')
        records = ResumeDataItem()
        url_prefix = 'http://www.indeed.com'
        resume_links = digest.xpath('//li[@class="sre"]//div[@class="sre-entry"]')
        names = digest.xpath('//a[@target="_blank"]/text()').extract()
        links = digest.xpath('//a[@target="_blank"]/@href').extract()
        for name, link in zip(names, links):
            if name not in 'Feedback':
                records['name'] = name
                records['link'] = url_prefix + link
                yield Request(records['link'], meta={'item': records}, callback=self.parse_node)

    def parse_node(self, response):
        hxs = Selector(response)
        records = ResumeDataItem()
        # name = hxs.xpath('/text()').extract()
        name = hxs.xpath('//h1[@id="resume-contact"]/text()').extract()
        headline = hxs.xpath('//h2[@id="headline"]/text()').extract()
        # locale = hxs.xpath('//div[@class="addr" and @itemprop="address"]//p//text()').extract()
        rlocale = hxs.xpath('//p[@id="headline_location" and @class="locality"]//text()').extract()
        summary = hxs.xpath('//p[@id="res_summary" and @class="summary"]/text()').extract()
        skills = list()
        skill = hxs.xpath('//div[@id="skills-items" and @class="items-container"]//p//text()').extract()
        if len(skill) != 0:
            skills.append(''.join(skill).encode('utf-8'))
        skill = hxs.xpath('//div[@id="additionalinfo-section" and @class="last"]//div[@class="data_display"]//p//text()').extract()
        if len(skill) != 0:
            skills.append(''.join(skill).encode('utf-8'))
        resume_links = list()
        links = hxs.xpath('//div[@id="link-items" and @class="items-container"]//p//text()').extract()
        for link in links:
            resume_links.append(''.join(link).encode('utf-8'))
        workHistory = ItemList()
        experience = hxs.xpath('//div[@id="work-experience-items"]/div')
        for elem in experience:
            item = elem.xpath('div')
            for entry in item:
                workEntry = WorkItem()
                title = entry.xpath('p[@class="work_title title"]//text()').extract()
                workEntry['title'] = ''.join(title).encode('utf-8')
                company = entry.xpath('div[@class="work_company"]/span/text()').extract()
                workEntry['company'] = ''.join(company).encode('utf-8')
                location = entry.xpath('div[@class="work_company"]/div[@class="inline-block"]/span/text()').extract()
                workEntry['work_location'] = ''.join(company).encode('utf-8')
                dates = entry.xpath('p[@class="work_dates"]//text()').extract()
                dates_str = ''.join(dates).encode('utf-8').split(' to ')
                if len(dates) > 0:
                    if dates_str[0]:
                        workEntry['start_date'] = dates_str[0]
                    if dates_str[1]:
                        workEntry['end_date'] = dates_str[1]
                else:
                    workEntry['start_date'] = 'NULL'
                    workEntry['end_date'] = 'NULL'
                description = entry.xpath('p[@class="work_description"]//text()').extract()
                workEntry['description'] = ''.join(description).encode('utf-8')
                workHistory.container.append(workEntry)
        eduHistory = ItemList()
        education = hxs.xpath('//div[@id="education-items" and @class="items-container"]/div')
        for elem in education:
            item = elem.xpath('div')
            for entry in item:
                eduEntry = SchoolItem()
                degree = entry.xpath('p[@class="edu_title"]/text()').extract()
                degree = ''.join(degree).encode('utf-8')
                eduEntry['degree'] = degree
                school = entry.xpath('div[@class="edu_school"]/span//text()').extract()
                school = ''.join(school).encode('utf-8')
                eduEntry['school'] = school
                locale = entry.xpath('span[@itemprop="addressLocality"]/text()').extract()
                locale = ''.join(locale).encode('utf-8')
                eduEntry['locale'] = locale
                grad_date = entry.xpath('p[@class="edu_dates"]/text()').extract()
                dates_str = ''.join(grad_date).encode('utf-8').split(' to ')
                if len(grad_date) > 0:
                    if len(dates_str) == 2:
                        if dates_str[0]:
                            eduEntry['admit_date'] = dates_str[0]
                        try:
                            if dates_str[1]:
                                eduEntry['grad_date'] = dates_str[1]
                        except:
                            pass
                    elif len(dates_str) == 1:
                        if dates_str[0]:
                            eduEntry['grad_date'] = dates_str[0]
                            eduEntry['admit_date'] = 'NULL'
                else:
                    eduEntry['admit_date'] = 'NULL'
                    eduEntry['grad_date'] = 'NULL'
                eduHistory.container.append(eduEntry)
        records['url'] = response.url
        records['name'] = ''.join(name).encode('utf-8')
        records['headline'] = msgpack.packb(''.join(headline).encode('utf-8'))
        records['locale'] = ''.join(rlocale).encode('utf-8')
        records['summary'] = msgpack.packb(''.join(summary).encode('utf-8'))
        records['skills'] = msgpack.packb(skills)
        records['links'] = resume_links
        #records['experience'] = msgpack.packb(workHistory, default=workHistory.encode)
        records['experience'] = workHistory
        records['education'] = msgpack.packb(eduHistory, default=eduHistory.encode)
        #records['experience'] = workHistory
        #records['education'] = eduHistory
        return records

Scrapy Calling another Url

I am using Scrapy to scrape a website. I am getting all products from the listing page. Now I want to go to each product's URL, but I am not getting a satisfactory result.
Here is my code:
import scrapy
from scrapy.http import Request

from tutorial.items import DmozItem


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domain = ["test.com"]
    start_urls = [
        "http://www.test.com/?page=1"
    ]
    page_index = 1

    def parse(self, response):
        products = response.xpath('//li')
        items = []
        if products:
            for product in products:
                item = DmozItem()
                item['link'] = product.xpath('@data-url').extract()
                item['sku'] = product.xpath('@data-sku').extract()
                item['brand'] = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
                item['img'] = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
                page_url = "http://www.jabong.com/Lara-Karen-Black-Sweaters-893039.html"
                request = Request(url=page_url, callback=self.parse_page2,
                                  headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"})
                request.meta['item'] = item
                item['other'] = request
                yield item
        else:
            return
        self.page_index += 1
        if self.page_index:
            yield Request(url="http://www.test.com/?page=%s" % (self.page_index),
                          headers={"Referer": "http://www.test.com/", "X-Requested-With": "XMLHttpRequest"},
                          callback=self.parse)

    def parse_page2(self, response):
        item = response.meta['item']
        item['title'] = response.xpath("//span[@id='before_price']/text()")
        yield item
The result I am getting is:
{"sku": [], "brand": [], "other": "<Request GET http://www.test.com/>", "link": [], "img": []}
Instead of the Request object, I need the data that I am returning from the parse_page2 function.
Where am I going wrong?
Your xpaths seem to be wrong here. Try this:
In [0]: products[0].xpath('./@data-url').extract()
Out[0]: [u'Sangria-Green-Kurtis-Kurtas-1081831.html']
In [1]: products[0].xpath('./a/@unbxdparam_sku').extract()
Out[1]: [u'SA038WA68OIXINDFAS']
In [2]: products[0].xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
Out[2]: [u'Sangria']
In [3]: products[0].xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
Out[3]: [u'http://static14.jassets.com/p/Sangria-Green--Kurtis-26-Kurtas-5520-1381801-1-catalog.jpg']
So the code will be:
BASE_URL = 'http://www.jabong.com/'

for product in products:
    item = DmozItem()
    item_url = product.xpath('./@data-url').extract()
    item_url = self.BASE_URL + item_url[0] if item_url else ''
    item['link'] = product.xpath('./@data-url').extract()
    item['sku'] = product.xpath('./a/@unbxdparam_sku').extract()
    item['brand'] = product.xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
    item['img'] = product.xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
    if item_url:
        yield Request(url=item_url, callback=self.parse_page2,
                      headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"},
                      meta={'item': item})
EDIT:
Complete spider code:
import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.spider import Spider
from scrapy.http import Request


class JabongItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    link = scrapy.Field()
    sku = scrapy.Field()
    brand = scrapy.Field()
    img = scrapy.Field()


class JabongSpider(scrapy.Spider):
    name = "jabong"
    allowed_domains = ["jabong.com"]
    start_urls = ["http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=1"]
    page_index = 1
    BASE_URL = 'http://www.jabong.com/'

    def parse(self, response):
        products = response.xpath("//li[@data-url]")
        if products:
            for product in products:
                link = product.xpath('@data-url').extract()
                link = self.BASE_URL + link[0] if link else ''
                sku = product.xpath('@data-sku').extract()
                sku = sku[0].strip() if sku else 'n/a'
                brand = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
                brand = brand[0].strip() if brand else 'n/a'
                img = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
                img = img[0].strip() if img else 'n/a'
                item = JabongItem()
                item['link'] = link
                item['sku'] = sku
                item['brand'] = brand
                item['img'] = img
                if link:
                    yield Request(url=link, callback=self.parse_page2, meta={'item': item})
        else:
            return
        self.page_index += 1
        yield Request(url="http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=%s" % self.page_index,
                      callback=self.parse, dont_filter=True)

    def parse_page2(self, response):
        item = response.meta['item']
        # add whatever extra details you want to item
        yield item

how to extract certain string from URL

I am trying to extract certain strings from URLs like the ones mentioned below.
Sample URLs:
http://www.ladyblush.com/buy-sarees-online.html?p=1
http://www.ladyblush.com/buy-ladies-suits-online.html?p=1
http://www.ladyblush.com/buy-women-fashion-accessories.html?p=1
I want to extract:
productCategory = "sarees" productSubCategory = ""
productCategory = "ladies" productSubCategory = "suits"
productCategory = "women" productSubCategory = "fashion-accessories"
And so on. I am writing a spider and I need to extract productCategory and productSubCategory from URLs like those mentioned above, so I am trying to extract these fields inside the parse method from response.url. Can someone help me out, please?
My code:
import re

from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider

#------------------------------------------------------------------------------


class ESpider(CrawlSpider):
    name = "ladyblushSpider"
    allowed_domains = ["ladyblush.com"]

    URLSList = []
    for n in range(1, 100):
        URLSList.append('http://www.ladyblush.com/buy-sarees-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-ladies-suits-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-women-fashion-accessories.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-nightwear-lingerie-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-women-dress-online-skirts-suits-kurtis-tops.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-decor-online-wallclock-bedsheets-cushions-bedcovers.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-cosmetics-online-massage-oils-aromatherapy-perfumes-soaps.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-jewelery-online-art-fashion-semi-precious-antique-junk-jewellery.html?p=' + str(n))
    start_urls = URLSList

    def parse(self, response):
        item = EscraperItem()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="third thumbnailSpillLarge"]')
        items = []
        for site in sites:
            item = EscraperItem()
            item['currency'] = 'INR'
            item['productCategory'] = [""]
            item['productSubCategory'] = [""]
            item['productSite'] = ["http://ladyblush.com/"]
            item['productImage'] = site.select('./a/div/img/@src').extract()
            item['productTitle'] = site.select('./a/div/img/@title').extract()
            item['productURL'] = [site.select('./a/@href').extract()[0].replace(" ", "%20")]
            productMRP = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="old-price"]//span[@class="price"]/text()').extract()
            productPrice = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="special-price"]//span[@class="price"]/text()').extract()
            if productMRP and productPrice:
                price = [productMRP[1].strip()] + [productPrice[1].strip()]
            else:
                price = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//span[@class="regular-price"]//span[@class="price"]/text()').extract()
            item['productPrice'] = price
            items.append(item)
            secondURL = item['productURL'][0]
            request = Request(secondURL, callback=self.parsePage2)
            request.meta['item'] = item
            yield request

    def parsePage2(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        if hxs.select('//div[@class="addtocart-container"]/div/text()').extract():
            item['availability'] = False
        else:
            item['availability'] = True
        if hxs.select('//label[@class="required"]/text()').extract():
            item['hasVariants'] = True
        else:
            item['hasVariants'] = False
        item['image_urls'] = list(set(item['productImage']))
        item['productDesc'] = [" ".join([re.sub(r'[\t\n\r]', "", i.strip()) for i in hxs.select('//div[@class="std"]/text()').extract()])]
        item['productImage'] = item['productImage'] + hxs.select('//div[@class="more-views"]/ul/li/a/img/@src').extract() + hxs.select('//div[@class="more-views"]/ul/li/a/@href').extract()
        return item

#------------------------------------------------------------------------------
You can get the URL from response.url in the parse method. You could then parse that to extract just the URL path:
import os
test = 'buy-women-fashion-accessories.html?p=1'
parts = os.path.splitext(test)
# ('buy-women-fashion-accessories', '.html?p=1')
parts[0].split('-')[1:]
# ['women', 'fashion', 'accessories']
This is a rather flimsy solution, though. Are you sure the data is not stored somewhere in the page's HTML that you are parsing, instead of only in the URL?
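If you do stay with the URL-based approach, here is a minimal, self-contained sketch (not from the original post) of the same idea. It assumes the slug always starts with "buy" and may end with "online", which covers the three sample URLs above but not every page in URLSList:

from urllib.parse import urlparse  # on Python 2: from urlparse import urlparse

def split_category(url):
    # 'http://.../buy-ladies-suits-online.html?p=1' -> 'buy-ladies-suits-online'
    slug = urlparse(url).path.rsplit('/', 1)[-1].rsplit('.', 1)[0]
    words = slug.split('-')[1:]            # drop the leading 'buy'
    if words and words[-1] == 'online':    # a trailing 'online' is not a category
        words = words[:-1]
    category = words[0] if words else ''
    sub_category = '-'.join(words[1:])
    return category, sub_category

# split_category('http://www.ladyblush.com/buy-ladies-suits-online.html?p=1')
# -> ('ladies', 'suits')
# split_category('http://www.ladyblush.com/buy-women-fashion-accessories.html?p=1')
# -> ('women', 'fashion-accessories')

Inside the spider you could then call category, sub_category = split_category(response.url) in parse and fill item['productCategory'] and item['productSubCategory'] from the result.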
