I am trying to scrape TripAdvisor's attractions, but I cannot get the names and addresses of each attraction. I suspect I wrote product.css(...) wrong (there are jsons?).
Can anyone tell me how to correct the code to get the name and address of each attraction?
My current code:
import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = [
'https://www.tripadvisor.com/Attractions-g187427-Activities-oa90-Spain'
]
def parse(self, response):
for link in response.css('.EsZYd a::attr(href)'):
yield response.follow(link.get(), callback=self.parse_categories)
def parse_categories(self, response):
products = response.css('div.eeqnt')
for product in products:
yield {
'name' : product.css('h1.WlYyy cPsXC GeSzT::text').get().strip(),
'address' : product.css('span.WlYyy cacGK Wb::text').get().strip(),
}
Updated code (exporting infro from each atrraction on each page from list):
import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = [
'https://www.tripadvisor.com/Attractions-g274862-Activities-a_allAttractions.true-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa30-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa60-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa90-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa120-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa150-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa180-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa210-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa240-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa270-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa300-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa330-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa360-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa390-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa420-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa450-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa480-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa510-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa540-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa570-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa600-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa630-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa660-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa690-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa720-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa750-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa780-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa810-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa840-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa870-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa900-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa930-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa960-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa990-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1020-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1050-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1080-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1110-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1140-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1170-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1200-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1230-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1260-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1290-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1320-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1350-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1380-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1410-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1440-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1470-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1500-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1530-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1560-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1590-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1620-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1650-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1680-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1710-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1740-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1770-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1800-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1830-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1860-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1890-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1920-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1950-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa1980-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2010-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2040-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2070-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2100-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2130-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2160-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2190-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2220-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2250-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2280-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2310-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2340-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2370-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2400-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2430-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2460-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2490-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2520-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2550-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2580-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2610-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2640-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2670-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2700-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2730-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2760-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2790-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2820-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2850-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2880-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2910-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2940-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa2970-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3000-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3030-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3060-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3090-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3120-Slovenia.html',
'https://www.tripadvisor.com/Attractions-g274862-Activities-oa3150-Slovenia.html'
]
def parse(self, response):
for link in response.css('.EsZYd a::attr(href)').getall():
yield response.follow(link, callback=self.parse_categories)
def parse_categories(self, response):
yield {
'name': response.css('h1.WlYyy.cPsXC.GeSzT::text').get(),
'reviews': response.xpath('(//*[#class="cfIVb"])[1]//text()').getall(),
'address': response.xpath('(//*[#class="dGWve"])//text()').getall(),
'url': response.url,
}
It's not really related to python, but css-selectors.
CSS classes should separate with dot and not space WlYyy.cPsXC.GeSzT.
Best suggestion would be to use chrome with dev-toolbar. It will give you an ability to get path to the specific element via css-selector or xpath, just right-click on the element in a DOM-tree and select copy menu-item.
Avoid using classes (especially one without semantic meaning) as an anchor point. They might change from page to page, or in time.
Better to use semantically meaningful nodes, like in your case:
XPath for the title would looks like this //main//header//div[#data-automation="main_h1"]//h1.
You can't use for loop in each listing page
from scrapy.crawler import CrawlerProcess
import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = [
'https://www.tripadvisor.com/Attractions-g187427-Activities-oa90-Spain'
]
def parse(self, response):
for link in response.css('.EsZYd a::attr(href)').getall():
#print(link)
yield response.follow(link, callback=self.parse_categories)
def parse_categories(self, response):
yield {
'name' : response.css('h1.WlYyy.cPsXC.GeSzT::text').get(),
'address' :''.join(response.xpath('(//*[#class="hxQKk"])[1]//text()').getall()[:-1]),
'url':response.url
}
if __name__ == "__main__":
process =CrawlerProcess(QuotesSpider)
process.crawl()
process.start()
I need to scrape all of the items but only 1 item is scrape.
My code is working fine before but when I transfer it to other project which is same code this happens I don't know why
I need to get all of the items according to the page size in start_url
here's my working code
class HmSalesitemSpider(scrapy.Spider):
name = 'HM_salesitem'
allowed_domains = ['www2.hm.com']
start_urls = ['https://www2.hm.com/en_us/sale/shopbyproductladies/view-
all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-
size=3002']
def parse(self, response):
for product_item in response.css('li.product-item'):
url = "https://www2.hm.com/" + product_item.css('a::attr(href)').extract_first()
yield scrapy.Request(url=url, callback=self.parse_subpage)
def parse_subpage(self, response):
item = {
'title': response.xpath("normalize-space(.//h1[contains(#class, 'primary') and contains(#class, 'product-item-headline')]/text())").extract_first(),
'sale-price': response.xpath("normalize-space(.//span[#class='price-value']/text())").extract_first(),
'regular-price': response.xpath('//script[contains(text(), "whitePrice")]/text()').re_first("'whitePrice'\s?:\s?'([^']+)'"),
'photo-url': response.css('div.product-detail-main-image-container img::attr(src)').extract_first(),
'description': response.css('p.pdp-description-text::text').extract_first()
}
yield item
Please Help. Thank you
It seems you have problem with indents. Move yielding request to for loop:
def parse(self, response):
for product_item in response.css('li.product-item'):
url = "https://www2.hm.com/" + product_item.css('a::attr(href)').get()
yield scrapy.Request(url=url, callback=self.parse_subpage)
Or this is a bit cleared version:
def parse(self, response):
for link in response.css('li.product-item a::attr(href)').extract():
yield response.follow(link, self.parse_subpage)
So, that was the question. I have a Scrapy bot that follows internal links of a given site, writes its links, status code and anchor text into database. But I'm struggling to grab the link's follow status. Is there any way to grab that rel=nofollow/dofollow information? That's my code if anybody wonders;
class MySpider(CrawlSpider):
name = 'spydiiiii'
start_urls = [urlToScrape]
rules = (
Rule (
LxmlLinkExtractor(
allow=(urlToScrape),
deny=(
"google.com",
"facebook.com",
"pinterest.com",
"facebook.com",
"digg.com",
"twitter.com",
"stumbleupon.com",
"linkedin.com"
),
unique=True
),
callback="parse_items",
follow= True,
),
)
def parse_items(self, response):
sel = Selector(response)
items = []
item = InternallinkItem()
referring_url = response.request.headers.get('Referer').decode('utf-8')
item["referring_url"] = referring_url
anchor = response.meta.get('link_text')
item["anchor_text"] = " ".join(anchor.split())
item["current_url"] = response.url
item['status'] = response.status
items.append(item)
return items
Thanks in advance
I use LxmlLinkExtractor manually to get Link objects which have nofollow information.
In parse() I get links from first page and create item with 'nofollow' (and other) informations, and use Requests with this url (and with item in meta) to get status and referer.
New Requests uses parse_item() to get item from meta and add status.
parse_item() also uses extractor to get new links on this page and create new item and execute Requests with parse_item() again.
import scrapy
from scrapy.http import Request
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
class MySpider(scrapy.Spider):
name = 'myspider'
#allowed_domains = ['http://quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com']
#start_urls = ['http://127.0.0.1:5000/'] # for Flask example
extractor = LxmlLinkExtractor(
allow=('http://quotes.toscrape.com'),
#allow=('http://127.0.0.1:5000'), # for Flask example
deny=(
'google.com',
'facebook.com',
'pinterest.com',
'facebook.com',
'digg.com',
'twitter.com',
'stumbleupon.com',
'linkedin.com'
),
unique=True,
)
def parse(self, response):
print('parse url:', response.url)
# use LxmlLinkExtractor manually
for link in self.extractor.extract_links(response):
#print('link:', link)
item = {}
item['nofollow'] = link.nofollow
item['anchor_text'] = link.text
item['current_url'] = link.url
#item['referring_url'] = response.url
yield Request(link.url, meta={'item': item}, callback=self.parse_item)
def parse_item(self, response):
print('parse_item url:', response.url)
item = response.meta['item']
item['referring_url'] = response.request.headers.get('Referer')
#item['referring_url'] = response.request.url
item['status'] = response.status
yield item
# use LxmlLinkExtractor manually with new links
for link in self.extractor.extract_links(response):
#print('link:', link)
item = {}
item['nofollow'] = link.nofollow
item['anchor_text'] = link.text
item['current_url'] = link.url
#item['referring_url'] = response.url
yield Request(link.url, meta={'item': item}, callback=self.parse_item)
# --- run spider without project ---
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
'FEED_FORMAT': 'csv',
'FEED_URI': 'output.csv',
})
c.crawl(MySpider)
c.start()
EDIT:
Because I don't know any page with rel="nofollow" so I created simple code in Flask to test code.
from flask import Flask
app = Flask(__name__)
#app.route('/')
def index():
return 'Test 1 | Test 2'
#app.route('/test1')
def test1():
return 'Main Page'
#app.route('/test2')
def test2():
return 'Main Page'
if __name__ == '__main__':
app.run(debug=True)
Up to now I have found how to scrape one page or multiple pages with same url, but changing number. However, I could not find how to scrape pages with subcategories and their subcategories and finally get the content needed.
I am trying to scrape this website: http://www.askislam.org/index.html
I am using Scrapy, but I do not know where to start.
Or you can suggest a better option, I just use python and check from there.
Thanks
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider
from scrapy import Selector
from ask_islam.items import AskIslamItem
from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
import re
class AskislamSpider(Spider):
name = "askislam"
allowed_domains = ["askislam.org"]
start_urls = ['http://www.askislam.org/']
rules = [Rule(LinkExtractor(allow = ()), callback = 'parse', follow=True)]
def parse(self, response):
hxs = Selector(response)
links = hxs.css('div[id="categories"] li a::attr(href)').extract()
for link in links:
url = 'http://www.askislam.org' + link.replace('index.html', '')
yield Request(url, callback=self.parse_page)
def parse_page(self, response):
hxs = Selector(response)
categories = hxs.css('div[id="categories"] li').extract()
questions = hxs.xpath('a').extract()
if(categories):
for categoryLink in categories:
url = 'http://www.askislam.org' + categoryLink.replace('index.html', '')
yield Request(url, callback=self.parse_page)
# print (question)
EDIT
def start_requests(self):
yield Request("http://www.askislam.org", callback=self.parse_page)
def parse_page(self, response):
hxs = Selector(response)
categories = hxs.css('#categories li')
for cat in categories:
item = AskIslamItem()
link = cat.css('a::attr(href)').extract()[0]
link = "http://www.askislam.org" + link
item['catLink'] = link
logging.info("Scraping Link: %s" % (link))
yield Request(link, callback=self.parse_page)
yield Request(link, callback=self.parse_categories)
def parse_categories(self, response):
logging.info("The Cat Url")
Read links from that http://www.askislam.org/index.html page using xPath or CSS Selectors of those sub-categories and then do another Request()
EDIT:
import logging
class AskislamSpider(Spider):
name = "askislam"
def start_requests(self):
yield Request("http://www.askislam.org/", callback=self.parse_page)
def parse_page(self, response):
categories = response.css('#categories li').extract()
for cat in categories:
link = cat.css("a::attr(href)").extract()[0]
link = "http://www.askislam.org/" + link
logging.info("Scraping Link: %s" % (link))
yield Request(link, callback=self.parse_page)
i am using Splash 2.0.2 + Scrapy 1.0.5 + Scrapyjs 0.1.1 and im still not able to render javascript with a click. Here is an example url https://olx.pt/anuncio/loja-nova-com-250m2-garagem-em-box-fechada-para-arrumos-IDyTzAT.html#c49d3d94cf
I am still getting the page without the phone number rendered:
class OlxSpider(scrapy.Spider):
name = "olx"
rotate_user_agent = True
allowed_domains = ["olx.pt"]
start_urls = [
"https://olx.pt/imoveis/"
]
def parse(self, response):
script = """
function main(splash)
splash:go(splash.args.url)
splash:runjs('document.getElementById("contact_methods").getElementsByTagName("span")[1].click();')
splash:wait(0.5)
return splash:html()
end
"""
for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_house_contents, meta={
'splash': {
'args': {'lua_source': script},
'endpoint': 'execute',
}
})
for next_page in response.css('.pager .br3.brc8::attr(href)'):
url = response.urljoin(next_page.extract())
yield scrapy.Request(url, self.parse)
def parse_house_contents(self, response):
import ipdb;ipdb.set_trace()
how can i get this to work?
Add
splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")
to Lua script and it will work.
function main(splash)
splash:go(splash.args.url)
splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")
splash:runjs('document.getElementById("contact_methods").getElementsByTagName("span")[1].click();')
splash:wait(0.5)
return splash:html()
end
.click() is JQuery function https://api.jquery.com/click/
You can avoid having to use Splash in the first place and make the appropriate GET request to get the phone number yourself. Working spider:
import json
import re
import scrapy
class OlxSpider(scrapy.Spider):
name = "olx"
rotate_user_agent = True
allowed_domains = ["olx.pt"]
start_urls = [
"https://olx.pt/imoveis/"
]
def parse(self, response):
for href in response.css('.link.linkWithHash.detailsLink::attr(href)'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_house_contents)
for next_page in response.css('.pager .br3.brc8::attr(href)'):
url = response.urljoin(next_page.extract())
yield scrapy.Request(url, self.parse)
def parse_house_contents(self, response):
property_id = re.search(r"ID(\w+)\.", response.url).group(1)
phone_url = "https://olx.pt/ajax/misc/contact/phone/%s/" % property_id
yield scrapy.Request(phone_url, callback=self.parse_phone)
def parse_phone(self, response):
phone_number = json.loads(response.body)["value"]
print(phone_number)
If there are more things to extract from this "dynamic" website, see if Splash is really enough and, if not, look into browser automation and selenium.