I am trying to scrape TripAdvisor's attractions, but I cannot get the names and addresses of each attraction. I suspect I wrote product.css(...) wrong (there are jsons?).
Can anyone tell me how to correct the code to get the name and address of each attraction?
My current code:
import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = [
'https://www.tripadvisor.com/Attractions-g187427-Activities-oa90-Spain'
]
def parse(self, response):
for link in response.css('.EsZYd a::attr(href)'):
yield response.follow(link.get(), callback=self.parse_categories)
def parse_categories(self, response):
products = response.css('div.eeqnt')
for product in products:
yield {
'name' : product.css('h1.WlYyy cPsXC GeSzT::text').get().strip(),
'address' : product.css('span.WlYyy cacGK Wb::text').get().strip(),
}
Updated code (exporting infro from each atrraction on each page from list):
import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = [
'https://www.tripadvisor.com/Attractions-g274862-Activities-a_allAttractions.true-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa30-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa60-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa90-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa120-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa150-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa180-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa210-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa240-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa270-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa300-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa330-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa360-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa390-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa420-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa450-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa480-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa510-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa540-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa570-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa600-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa630-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa660-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa690-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa720-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa750-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa780-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa810-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa840-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa870-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa900-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa930-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa960-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa990-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1020-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1050-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1080-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1110-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1140-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1170-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1200-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1230-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1260-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1290-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1320-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1350-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1380-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1410-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1440-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1470-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1500-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1530-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1560-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1590-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1620-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1650-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1680-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1710-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1740-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1770-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1800-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1830-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1860-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1890-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1920-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1950-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1980-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2010-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2040-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2070-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2100-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2130-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2160-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2190-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2220-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2250-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2280-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2310-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2340-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2370-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2400-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2430-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2460-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2490-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2520-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2550-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2580-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2610-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2640-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2670-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2700-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2730-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2760-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2790-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2820-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2850-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2880-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2910-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2940-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2970-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3000-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3030-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3060-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3090-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3120-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3150-Slovenia.html'
]
def parse(self, response):
for link in response.css('.EsZYd a::attr(href)').getall():
yield response.follow(link, callback=self.parse_categories)
def parse_categories(self, response):
yield {
'name': response.css('h1.WlYyy.cPsXC.GeSzT::text').get(),
'reviews': response.xpath('(//*[#class="cfIVb"])[1]//text()').getall(),
'address': response.xpath('(//*[#class="dGWve"])//text()').getall(),
'url': response.url,
}
It's not really related to python, but css-selectors.
CSS classes should separate with dot and not space WlYyy.cPsXC.GeSzT.
Best suggestion would be to use chrome with dev-toolbar. It will give you an ability to get path to the specific element via css-selector or xpath, just right-click on the element in a DOM-tree and select copy menu-item.
Avoid using classes (especially one without semantic meaning) as an anchor point. They might change from page to page, or in time.
Better to use semantically meaningful nodes, like in your case:
XPath for the title would looks like this //main//header//div[#data-automation="main_h1"]//h1.
You can't use for loop in each listing page
from scrapy.crawler import CrawlerProcess
import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = [
'https://www.tripadvisor.com/Attractions-g187427-Activities-oa90-Spain'
]
def parse(self, response):
for link in response.css('.EsZYd a::attr(href)').getall():
#print(link)
yield response.follow(link, callback=self.parse_categories)
def parse_categories(self, response):
yield {
'name' : response.css('h1.WlYyy.cPsXC.GeSzT::text').get(),
'address' :''.join(response.xpath('(//*[#class="hxQKk"])[1]//text()').getall()[:-1]),
'url':response.url
}
if __name__ == "__main__":
process =CrawlerProcess(QuotesSpider)
process.crawl()
process.start()
I'm new to this scrapy concept. I have written a script for E-commerce website and need to scrape below mentioned details in that website. I facing issue with this script. please anyone help me to get out from this issue.
website:https://savedbythedress.com/collections/maternity-tops
import scrapy
class DressSpider(scrapy.Spider):
name = 'dress'
allowed_domains = ['savedbythedress.com']
start_urls = ['https://savedbythedress.com/collections/maternity-tops']
def parse(self, response):
#scraped all product links
domain = "https://savedbythedress.com"
link_products = response.css('div[class="product-info-inner"] ::attr(href)').get()
for link in link_products:
product_link = domain + link
yield{
'product_link': product_link.css('div[class="product-info-inner"] ::attr(href)').get(),
}
yield scrapy.Request(url=product_link, callback=self.parse_contents)
def parse_contents(self, response):
#scrape needed information
productlink = response.url
yield{
'product_title' : response.css('.sbtd-product-title ::text').get(),
'product_price' : response.css('.product-price ::text').get(),
'product_review' : response.css('.Natsob ::text').getall()
}
use yield response.follow(page_url, self.parse_contents) it will work for you
import scrapy
class DressSpider(scrapy.Spider):
name = 'dress'
allowed_domains = ['savedbythedress.com']
start_urls = ['https://savedbythedress.com/collections/maternity-tops']
def parse(self, response):
#scraped all product links
domain = "https://savedbythedress.com"
# link_products = response.css('div[class="product-info-inner"] ::attr(href)').get()
for link in response.css('div.product-info'):
page_url = link.css('div[class="product-info-inner"] ::attr(href)').get()
print('PAGE URL IS ', page_url)
yield response.follow(page_url, self.parse_contents)
# product_link = domain + link
# yield{
# 'product_link': link.css('div[class="product-info-inner"] ::attr(href)').get(),
# }
print(page_url)
# yield scrapy.Request(response.follow(page_url), callback=self.parse_contents)
def parse_contents(self, response):
print()
#scrape needed information
print(response.url)
productlink = response.url
yield{
'product_title' : response.css('.sbtd-product-title ::text').get(),
'product_price' : response.css('.product-price ::text').get(),
'product_review' : response.css('.Natsob ::text').getall()
}
I am having a problem with my scrapy program, I want to crawl information from following website
https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=>
I want to get the "Part No." information inside the "span id=resPartNum" TAG. I have already tried:
- NAME_SELECTOR = './/*[#id="resPartNum"]/text()'
- NAME_SELECTOR = './/span[#class="resPartNum"]/text()
- NAME_SELECTOR = './/tr/td/span[#class="resPartNum"]/a/text()'
Here is my full CODE:
import scrapy
class PartSpider(scrapy.Spider):
name = 'part_spider'
start_urls = ['https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=']
def parse(self, response):
SET_SELECTOR = '.set'
for part in response.css(SET_SELECTOR):
NAME_SELECTOR = './/*[#id="resPartNum"]/text()'
yield {
'name': part.css(NAME_SELECTOR).extract_first(),
}
I am not very advanced in scrapy and would appreciate ANY HELP!!
Use the css selector table.partlookup_table to collect the table item through loop partNum and partName.here extract() return list.
import scrapy
class PartSpider(scrapy.Spider):
name = 'part_spider'
start_urls = ['https://parts.cat.com/AjaxCATPartLookupResultsView?catalogId=10051&langId=-1&requestType=1&storeId=21801&serialNumber=KSN00190&keyword=&link=']
def parse(self, response):
SET_SELECTOR = 'table.partlookup_table'
for part in response.css(SET_SELECTOR):
#NAME_SELECTOR = './/*[#id="resPartNum"]/text()'
yield {
'name': part.css('span.resPartName a::text').extract(),
'partnumber': part.css('span.resPartNum a::text').extract()
}
process = CrawlerProcess()
process.crawl(PartSpider)
process.start()
I get this code from a website:
import scrapy
class BrickSetSpider(scrapy.Spider):
name = "brickset_spider"
start_urls = ['http://brickset.com/sets/year-2016']
def parse(self, response):
SET_SELECTOR = '.set'
for brickset in response.css(SET_SELECTOR):
NAME_SELECTOR = 'h1 a ::text'
yield {
'name': brickset.css(NAME_SELECTOR).extract(),
}
I use the code for crawling data. This is a sample result when I run the code:
The name is the result of extract() method. This is the inspect element (in chrome):
I want to ask about the way to get the result for name is 10805: Around the World or only Around the World. How to do that?
To get "10805: Around the World" change your yield to:
yield {
'name': " ".join(brickset.css(NAME_SELECTOR).extract()),
}
To get "Around the World" change your yield to:
yield {
'name': brickset.css(NAME_SELECTOR).extract()[-1],
}
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domains = ["dmoz.org"]
start_urls = [
"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
]
def parse(self, response):
for sel in response.xpath('//div[#id="site-list-content"]/div[#class="site-item "]'):
#for sel in response.xpath('//ul/li'):
item = DmozItem()
item['title'] = sel.xpath('div[#class="title-and-desc"]//div[#class="site-title"]/text()').extract()
item['link'] = sel.xpath('div[#class="title-and-desc"]//a/#href').extract()
item['desc'] = sel.xpath('normalize-space(div[#class="title-and-desc"]//div[#class="site-descr "]/text())').extract()
yield item
That is the code for my scraper. I followed the tutorial on the scrapy website, but the code was a little bit outdated so I had to change the code myself. The code works with the /python/books site, but not with the /resources one. Can anyone provide an explanation as to why this happens? Thank you.