I can't figure out why short_critic_content(self, response) never runs.
I didn't use allowed_domains; if I add it, short_critic_content(self, response) still doesn't run.
Is allowed_domains = ["movie.mtime.com"] together with start_urls = ['http://movie.mtime.com'] wrong or right?
What is wrong with it? I am getting this error:
Scrapy Unsupported URL scheme '': no handler available for that scheme
class YinPin(CrawlSpider):
    name = "yingping"
    #allowed_domains = ["movie.mtime.com"]
    start_urls = ['http://movie.mtime.com']
    rules = (
        #Rule(LinkExtractor(allow=())),
        Rule(LinkExtractor(allow=(r'http://movie.mtime.com/40677/'), ), callback='movie_info', follow=False),
    )

    def movie_info(self, response):
        selector = Selector(response)
        #for movieinfo in movie_info:
        movie_name = selector.xpath('//*[@id="db_head"]/div[2]/div/div[1]/h1/text()').extract()
        movie_url = response.url  # movieinfo.xpath('//*[@id="db_head"]/div[2]/div/div[2]/a[3]/@href').extract()
        number = re.compile(r'\d+')
        movie_num = int(number.search(str(movie_url)).group())
        movie_release_time = selector.xpath('//*[@id="db_head"]/div[2]/div/div[1]/p[1]/a/text()').extract()
        movie_place = selector.xpath('//*[@id="db_head"]/div[2]/div/div[2]/text()').extract()[3]
        movie_type = selector.xpath('//*[@id="db_head"]/div[2]/div/div[2]/a/text()').extract()
        movie_type_l = movie_type.pop()
        movie_type = ' '.join(movie_type)
        short_content = selector.css('#tweetRegion > dd > div > h3::text').extract()  # selector.xpath('//*[@id="tweetRegion"]').css('h3::text').extract()
        short_url = str(selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract())
        yield Request(short_url, callback=self.short_critic_content,
                      meta={'movie_num': movie_num,
                            'short_content': short_content})
        item = YingpingItem(
            movie_num=movie_num,
            movie_name=movie_name,
            movie_release_time=movie_release_time,
            movie_place=movie_place,
            movie_type=movie_type,
        )
        yield item

    def short_critic_content(self, response):
        selector = Selector(response)
        movie_num = response.meta['movie_num']
        short_contentft = response.meta['short_content']
        short_contentsd = selector.css('#tweetRegion > dd > div > h3::text').extract()
        short_contents = short_contentft + short_contentsd
        item = shortcriticItem(
            movie_num=movie_num,
            movie_scritic=short_contents
        )
        yield item
It's almost certain the problem is in this line of your movie_info function:
short_url = str(selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract())
The extract() method of Selector returns a list, which you then convert to a string. That won't give you the URL; it gives you a string representation of the list, which starts with the "[" character. That's why you get that "Unsupported URL scheme" error.
The correct way is either
short_url = selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract()[0]
or, even better, to use extract_first() instead of extract():
short_url = selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract_first()
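With that change in place, a minimal sketch of the corrected request-building part of movie_info could look like the following. Everything is taken from the question's own code except response.urljoin, which is added here only as a safeguard in case the extracted href turns out to be relative (an assumption about the page, not something confirmed by it):

# Sketch of the fix; the rest of movie_info stays as in the question.
short_url = selector.xpath('//*[@id="tweetBottomDiv"]/p[2]/a/@href').extract_first()
if short_url:
    # urljoin guards against a relative href; whether the page uses one is an assumption.
    yield Request(response.urljoin(short_url),
                  callback=self.short_critic_content,
                  meta={'movie_num': movie_num,
                        'short_content': short_content})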
Related
I'm trying to use ScrapyRT but I'm not getting anything from the spider
Here's my code:
class mercadoLibre(CrawlSpider):
    name = 'ml'
    allowed_domain = ['www.mercadolibre.com.co']
    rules = {
        Rule(LinkExtractor(allow = (), restrict_xpaths = ('//li[@class="andes-pagination__button andes-pagination__button--next"]/a'))),
        Rule(LinkExtractor(allow = (), restrict_xpaths = ('//div[@class="ui-search-item__group ui-search-item__group--title"]/a')),
             callback = "parse_item", follow = False),
    }

    def start_requests(self):
        yield scrapy.Request(f'https://listado.mercadolibre.com.co/{self.busqueda}')

    def parse_item(self, response):
        item = ProductoItem()
        item['titulo'] = response.xpath('//h1[@class="ui-pdp-title"]/text()').extract_first()
        item['urlImagen'] = response.xpath('//img[@class="ui-pdp-image ui-pdp-gallery__figure__image"]/@src').extract_first()
        p = response.xpath('//span/span[@class="andes-money-amount__fraction"]/text()').extract_first()
        ch = "."
        for x in range(len(ch)):
            p = p.replace(ch[x], "")
        item['precio'] = p
        item['url'] = response.request.url
        yield item
I think the problem is "rules", but I'm not sure, because I haven't seen any example of ScrapyRT that uses rules, so I don't know if I'm doing it right.
I don't know what else I have to do, but this code works in normal Scrapy (I mean, without ScrapyRT and without start_requests).
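For reference, a minimal sketch of how a spider like this is usually queried through ScrapyRT, assuming a default ScrapyRT instance listening on port 9080 in the project directory; the spider_name and the listing URL below are just this example's values, and whether this interacts cleanly with a spider that builds its start URL from a busqueda argument is part of what is unclear here:

# Sketch: hitting a locally running ScrapyRT endpoint (default port 9080).
# The url parameter is an illustrative listing page, not a verified value.
import requests

resp = requests.get(
    "http://localhost:9080/crawl.json",
    params={"spider_name": "ml", "url": "https://listado.mercadolibre.com.co/celulares"},
)
print(resp.json().get("items", []))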
import scrapy

class rlgSpider(scrapy.Spider):
    name = 'bot'
    start_urls = [
        'https://rocket-league.com/trading?filterItem=0&filterCertification=0&filterPaint=0&filterPlatform=1&filterSearchType=1&filterItemType=0&p=1']

    def parse(self, response):
        data = {}
        offers = response.xpath('//div[@class = "col-3-3"]')
        for offer in offers:
            for item in offer.xpath('//div[@class = "rlg-trade-display-container is--user"]/div[@class = "rlg-trade-display-items"]/div[@class = "col-1-2 rlg-trade-display-items-container"]/a'):
                data['name'] = item.xpath('//div/div[@position ="relative"]/h2').extract()
                yield data
Here is what I did so far; it doesn't work well. It scrapes the URL and not the h2 tag. How do I get the h2 when it's nested inside so many divs?
In order to select within an element in Scrapy you need to start your XPath with ".", otherwise you will be searching the whole response again. This is the correct way of doing it:
def parse(self, response):
    offers = response.xpath('//div[@class = "col-3-3"]')
    for offer in offers:
        for item in offer.xpath('.//div[@class = "rlg-trade-display-container is--user"]/div[@class = "rlg-trade-display-items"]/div[@class = "col-1-2 rlg-trade-display-items-container"]/a'):
            data = {}
            data['name'] = item.xpath('.//h2/text()').extract_first()
            yield data
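To see why the leading dot matters, here is a small self-contained sketch; the HTML snippet is made up purely for illustration. Without the dot, an XPath that starts with // is evaluated against the whole document even when it is called on a sub-selector:

# Relative vs absolute XPath on a sub-selector (toy HTML, not from the site above).
from scrapy.selector import Selector

html = '<div id="a"><h2>first</h2></div><div id="b"><h2>second</h2></div>'
sel = Selector(text=html)
box_b = sel.xpath('//div[@id="b"]')[0]

print(box_b.xpath('//h2/text()').extract())    # ['first', 'second'] - searches the whole document
print(box_b.xpath('.//h2/text()').extract())   # ['second'] - searches only inside div#b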
I'm trying to scrape the website of a prominent UK retailer but I am facing an issue with my CrawlSpider - I get the following error message:
AttributeError: 'NlCrawlerSpider' object has no attribute '_rules'
I used the example here to convert my regular spider into a crawl spider; I have also played around with the syntax for the Rules as suggested here, but I end up with the same error message. All your help would be much appreciated - thank you in advance!
# Scrapy
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# Other Packages
import time
from datetime import date
from selenium import webdriver

class NlCrawlerSpider(CrawlSpider):
    name = 'nl_crawler'
    allowed_domains = ['newlook.com']
    start_urls = ['http://www.newlook.com/uk/womens/clothing/c/uk-womens-clothing?comp=NavigationBar%7Cmn%7Cwomens%7Cclothing#/?q=:relevance&page=1&sort=relevance&content=false']
    rules = (
        Rule(LinkExtractor(allow=r'\?q=:relevance&page=[1-130]&sort=relevance&content=false', ), callback='parse_item', follow=True),
    )

    def __init__(self):
        self.driver = webdriver.Safari()
        self.driver.set_window_size(800, 600)
        time.sleep(2)

    def parse_item(self, response):
        driver = self.driver
        driver.get(response.url)
        time.sleep(2)

        # Collect products
        products = driver.find_elements_by_class_name('plp-item ng-scope')

        # Iterate over products; extract data and append individual features to NlScrapeItem
        for item in products:
            # Pull features
            desc = item.find_element_by_class_name('product-item__name link--nounderline ng-binding').text
            href = item.find_element_by_class_name('plp-carousel__img-link ng-scope').get_attribute('href')

            # Generate a product identifier
            identifier = href.split('/p/')[1].split('?comp')[0]
            identifier = int(identifier)

            # datetime
            dt = date.today()
            dt = dt.isoformat()

            # Price Symbol removal and integer conversion
            try:
                priceString = item.find_element_by_class_name('price ng-binding').text
            except:
                priceString = item.find_element_by_class_name('price price--previous-price product-item__price--previous-price ng-binding ng-scope').text
            priceInt = priceString.split('£')[1]
            originalPrice = float(priceInt)

            # discountedPrice Logic
            try:
                discountedPriceString = item.find_element_by_class_name('price ng-binding price--marked-down').text
                discountedPriceInt = discountedPriceString.split('£')[1]
                discountedPrice = float(discountedPriceInt)
            except:
                discountedPrice = 'N/A'

            # NlScrapeItem
            item = NlScrapeItem()

            # Append product to NlScrapeItem
            item['identifier'] = identifier
            item['href'] = href
            item['description'] = desc
            item['originalPrice'] = originalPrice
            item['discountedPrice'] = discountedPrice
            item['firstSighted'] = dt
            item['lastSighted'] = dt
            yield item
Additions:
So I tried to ignore the idea of using a CrawlSpider and follow @jabargas's thinking - see below:
def __init__(self):
    self.driver = webdriver.Safari()
    self.driver.set_window_size(800, 600)

def start_requests(self):
    n = 5
    urls = []
    for pageNumber in range(1, n):
        url = 'http://www.newlook.com/uk/womens/clothing/c/uk-womens-clothing?comp=NavigationBar%%7Cmn%%7Cwomens%%7Cclothing#/?q=:relevance&page=%d&sort=relevance&content=false' % pageNumber
        urls.append(url)
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)

def parse(self, response):
    driver = self.driver
    driver.get(response.url)
    time.sleep(2)

    # Collect products
    products = driver.find_elements_by_class_name('plp-item ng-scope')

    # Iterate over products; extract data and append individual features to NlScrapeItem
    for item in products:
        # Pull features
        desc = item.find_element_by_class_name('product-item__name link--nounderline ng-binding').text
        href = item.find_element_by_class_name('plp-carousel__img-link ng-scope').get_attribute('href')

        # Generate a product identifier
        identifier = href.split('/p/')[1].split('?comp')[0]
        identifier = int(identifier)

        # datetime
        dt = date.today()
        dt = dt.isoformat()

        # Price Symbol removal and integer conversion
        try:
            priceString = item.find_element_by_class_name('price ng-binding').text
        except:
            priceString = item.find_element_by_class_name('price price--previous-price product-item__price--previous-price ng-binding ng-scope').text
        priceInt = priceString.split('£')[1]
        originalPrice = float(priceInt)

        # discountedPrice Logic
        try:
            discountedPriceString = item.find_element_by_class_name('price ng-binding price--marked-down').text
            discountedPriceInt = discountedPriceString.split('£')[1]
            discountedPrice = float(discountedPriceInt)
        except:
            discountedPrice = 'N/A'

        # NlScrapeItem
        item = NlScrapeItem()

        # Append product to NlScrapeItem
        item['identifier'] = identifier
        item['href'] = href
        item['description'] = desc
        item['originalPrice'] = originalPrice
        item['discountedPrice'] = discountedPrice
        item['firstSighted'] = dt
        item['lastSighted'] = dt
        yield item
Unfortunately no luck: it pulls details for 48 items.
Another possible issue is that you have not called the superclass constructor in your __init__ method.
Add super(MySpider, self).__init__(*a, **kw) for it.
I got the same issue and fixed it that way.
So __init__ should look as follows:
def __init__(self, *a, **kw):
    super(MySpider, self).__init__(*a, **kw)
    # your initializations
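Applied to the spider in the question, a minimal sketch might look like the following. CrawlSpider's own __init__ is what compiles the rules into the _rules attribute the traceback complains about, so it has to run; the Selenium setup is kept exactly as in the question:

# Sketch: let CrawlSpider.__init__ run first so self._rules gets created,
# then do the Selenium setup from the question.
def __init__(self, *a, **kw):
    super(NlCrawlerSpider, self).__init__(*a, **kw)
    self.driver = webdriver.Safari()
    self.driver.set_window_size(800, 600)
    time.sleep(2)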
You could do it like this to scrape till page n:
start_urls = ['http://www.newlook.com/uk/womens/clothing/c/uk-womens-clothing?comp=NavigationBar%%7Cmn%%7Cwomens%%7Cclothing#/?q=:relevance&page=%d&sort=relevance&content=false' % page_number for page_number in range(1, n)]
where n is the last page + 1
Or you could use scrapy pagination - get the link to the next page and follow it as you can find here.
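A minimal sketch of that second approach is below; the XPath for the "next" link is only a placeholder, since the real selector depends on the site's markup:

def parse(self, response):
    # ... extract items from the current page here ...

    # Placeholder XPath: whatever locates the "next page" link on the actual site.
    next_page = response.xpath('//a[@rel="next"]/@href').extract_first()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)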
import scrapy
from ex.items import ExItem

class reddit(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["reddit.com"]
    start_urls = [
        "http://www.reddit.com/"]

    """docstring for reddit"""
    def parse(self, response):
        item = ExItem()
        item["title"] = response.xpath('//p[contains(@class,"title")]/a/text()').extract()
        item["rank"] = response.xpath('//span[contains(@class,"rank")]/text()').extract()
        item["votes_dislike"] = response.xpath('//div[contains(@class,"score dislikes")]/text()').extract()
        item["votes_unvoted"] = response.xpath('//div[contains(@class,"score unvoted")]/text()').extract()
        item["votes_likes"] = response.xpath('//div[contains(@class,"score likes")]/text()').extract()
        item["video_reference"] = response.xpath('//a[contains(@class,"thumbnail may-blank")]/@href').extract()
        item["image"] = response.xpath('//a[contains(@class,"thumbnail may-blank")]/img/@src').extract()
I am able to convert this into JSON, but in the output I am getting a bullet in the JSON. How do I remove that and still keep the JSON format?
There are hidden elements that you don't see in the browser. Scrapy sees them.
You just need to search for the data inside the relevant part of the page (div with id="siteTable"):
def parse(self, response):
    # make a selector and search the fields inside it
    sel = response.xpath('//div[@id="siteTable"]')

    item = ExItem()
    item["title"] = sel.xpath('.//p[contains(@class,"title")]/a/text()').extract()
    item["rank"] = sel.xpath('.//span[contains(@class,"rank")]/text()').extract()
    item["votes_dislike"] = sel.xpath('.//div[contains(@class,"score dislikes")]/text()').extract()
    item["votes_unvoted"] = sel.xpath('.//div[contains(@class,"score unvoted")]/text()').extract()
    item["votes_likes"] = sel.xpath('.//div[contains(@class,"score likes")]/text()').extract()
    item["video_reference"] = sel.xpath('.//a[contains(@class,"thumbnail may-blank")]/@href').extract()
    item["image"] = sel.xpath('.//a[contains(@class,"thumbnail may-blank")]/img/@src').extract()
    return item
Tested; here is what I get, for example, for votes_likes:
'votes_likes': [u'5340',
u'4041',
u'4080',
u'5055',
u'4385',
u'4784',
u'3842',
u'3734',
u'4081',
u'3731',
u'4580',
u'5279',
u'2540',
u'4345',
u'2068',
u'3715',
u'3249',
u'4232',
u'4025',
u'522',
u'2993',
u'2789',
u'3529',
u'3450',
u'3533'],
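If the goal is one JSON object per post rather than parallel lists, a variant of the same idea is to iterate over the individual post containers inside siteTable. The sketch below assumes each post is a child div carrying the "thing" class (how old reddit marked up posts); that class name is an assumption to verify, not something taken from the question:

def parse(self, response):
    # One item per post instead of parallel lists.
    # Assumes each post inside #siteTable is a div with class "thing" (old reddit markup).
    for post in response.xpath('//div[@id="siteTable"]/div[contains(@class,"thing")]'):
        item = ExItem()
        item["title"] = post.xpath('.//p[contains(@class,"title")]/a/text()').extract_first()
        item["rank"] = post.xpath('.//span[contains(@class,"rank")]/text()').extract_first()
        item["votes_likes"] = post.xpath('.//div[contains(@class,"score likes")]/text()').extract_first()
        yield item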
I am trying to extract certain strings from the below-mentioned URLs.
Sample URLs:
http://www.ladyblush.com/buy-sarees-online.html?p=1
http://www.ladyblush.com/buy-ladies-suits-online.html?p=1
http://www.ladyblush.com/buy-women-fashion-accessories.html?p=1
I want to extract:
productCategory = "sarees" productSubCategory = ""
productCategory = "ladies" productSubCategory = "suits"
productCategory = "women" productSubCategory = "fashion-accessories"
And so on. Actually, I am writing a spider and I need to extract productCategory and productSubCategory from URLs like those mentioned above, so I am trying to extract these fields inside the parse method from response.url. Can someone help me out, please?
My code:
import re
from scrapy.http import Request
from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider

#------------------------------------------------------------------------------

class ESpider(CrawlSpider):
    name = "ladyblushSpider"
    allowed_domains = ["ladyblush.com"]

    URLSList = []
    for n in range(1, 100):
        URLSList.append('http://www.ladyblush.com/buy-sarees-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-ladies-suits-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-women-fashion-accessories.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-nightwear-lingerie-online.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-women-dress-online-skirts-suits-kurtis-tops.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-decor-online-wallclock-bedsheets-cushions-bedcovers.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-cosmetics-online-massage-oils-aromatherapy-perfumes-soaps.html?p=' + str(n))
        URLSList.append('http://www.ladyblush.com/buy-jewelery-online-art-fashion-semi-precious-antique-junk-jewellery.html?p=' + str(n))
    start_urls = URLSList

    def parse(self, response):
        item = EscraperItem()
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="third thumbnailSpillLarge"]')
        items = []
        for site in sites:
            item = EscraperItem()
            item['currency'] = 'INR'
            item['productCategory'] = [""]
            item['productSubCategory'] = [""]
            item['productSite'] = ["http://ladyblush.com/"]
            item['productImage'] = site.select('./a/div/img/@src').extract()
            item['productTitle'] = site.select('./a/div/img/@title').extract()
            item['productURL'] = [site.select('./a/@href').extract()[0].replace(" ", "%20")]
            productMRP = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="old-price"]//span[@class="price"]/text()').extract()
            productPrice = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//p[@class="special-price"]//span[@class="price"]/text()').extract()
            if productMRP and productPrice:
                price = [productMRP[1].strip()] + [productPrice[1].strip()]
            else:
                price = site.select('.//div[@class="salePrice"]//div[@class="price-box"]//span[@class="regular-price"]//span[@class="price"]/text()').extract()
            item['productPrice'] = price
            items.append(item)
            secondURL = item['productURL'][0]
            request = Request(secondURL, callback=self.parsePage2)
            request.meta['item'] = item
            yield request

    def parsePage2(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        if hxs.select('//div[@class="addtocart-container"]/div/text()').extract():
            item['availability'] = False
        else:
            item['availability'] = True
        if hxs.select('//label[@class="required"]/text()').extract():
            item['hasVariants'] = True
        else:
            item['hasVariants'] = False
        item['image_urls'] = list(set(item['productImage']))
        item['productDesc'] = [" ".join([re.sub(r'[\t\n\r]', "", i.strip()) for i in hxs.select('//div[@class="std"]/text()').extract()])]
        item['productImage'] = item['productImage'] + hxs.select('//div[@class="more-views"]/ul/li/a/img/@src').extract() + hxs.select('//div[@class="more-views"]/ul/li/a/@href').extract()
        return item
#------------------------------------------------------------------------------
You can get the URL from response.url in the parse method. You could then parse that to get just the URL path:
import os
test = 'buy-women-fashion-accessories.html?p=1'
parts = os.path.splitext(test)
# ('buy-women-fashion-accessories', '.html?p=1')
parts[0].split('-')[1:]
# ['women', 'fashion', 'accessories']
This is a rather flimsy solution, though. Are you sure the data is not stored somewhere in the HTML of the page you are parsing, instead of looking at the URL?
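If you do stick with the URL, a minimal sketch of doing this inside parse might look like the following. The slug-to-category split is just the pattern visible in the sample URLs (drop the leading "buy-" and a trailing "-online", take the first word as the category and the rest as the subcategory); it is not a rule confirmed by the site:

import re

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2, which this spider's imports suggest

def categories_from_url(url):
    """Best-effort split of e.g. '/buy-ladies-suits-online.html?p=1' into ('ladies', 'suits')."""
    path = urlparse(url).path                      # '/buy-ladies-suits-online.html'
    slug = path.rsplit('/', 1)[-1].split('.')[0]   # 'buy-ladies-suits-online'
    slug = re.sub(r'^buy-', '', slug)              # drop the leading 'buy-'
    slug = re.sub(r'-online$', '', slug)           # drop a trailing '-online' if present
    parts = slug.split('-')
    category = parts[0]
    sub_category = '-'.join(parts[1:])             # '' when there is no subcategory
    return category, sub_category

# Inside parse, for example (field names taken from the question's item):
#   cat, sub = categories_from_url(response.url)
#   item['productCategory'] = [cat]
#   item['productSubCategory'] = [sub]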