I'm trying to scrape the website of a prominent UK retailer but I am facing an issue with my CrawlSpider - I get the following error message:
AttributeError: 'NlCrawlerSpider' object has no attribute '_rules'
I used the example here to convert my regular spider into a crawl spider; I have also played around with the syntax for the Rules as suggested here but end up with the same error msg. All your help would be much appreciated - thank you in advance!
# Scrapy
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# Other Packages
import time
from datetime import date
from selenium import webdriver
class NlCrawlerSpider(CrawlSpider):
name = 'nl_crawler'
allowed_domains = ['newlook.com']
start_urls = ['http://www.newlook.com/uk/womens/clothing/c/uk-womens-clothing?comp=NavigationBar%7Cmn%7Cwomens%7Cclothing#/?q=:relevance&page=1&sort=relevance&content=false']
rules = (
Rule(LinkExtractor(allow=r'\?q=:relevance&page=[1-130]&sort=relevance&content=false', ), callback='parse_item', follow=True),
)
def __init__(self):
self.driver = webdriver.Safari()
self.driver.set_window_size(800,600)
time.sleep(2)
def parse_item(self, response):
driver = self.driver
driver.get(response.url)
time.sleep(2)
# Collect products
products = driver.find_elements_by_class_name('plp-item ng-scope')
# Iterate over products; extract data and append individual features to NlScrapeItem
for item in products:
# Pull features
desc = item.find_element_by_class_name('product-item__name link--nounderline ng-binding').text
href = item.find_element_by_class_name('plp-carousel__img-link ng-scope').get_attribute('href')
# Generate a product identifier
identifier = href.split('/p/')[1].split('?comp')[0]
identifier = int(identifier)
# datetime
dt = date.today()
dt = dt.isoformat()
# Price Symbol removal and integer conversion
try:
priceString = item.find_element_by_class_name('price ng-binding').text
except:
priceString = item.find_element_by_class_name('price price--previous-price product-item__price--previous-price ng-binding ng-scope').text
priceInt = priceString.split('£')[1]
originalPrice = float(priceInt)
# discountedPrice Logic
try:
discountedPriceString = item.find_element_by_class_name('price ng-binding price--marked-down').text
discountedPriceInt = discountedPriceString.split('£')[1]
discountedPrice = float(discountedPriceInt)
except:
discountedPrice = 'N/A'
# NlScrapeItem
item = NlScrapeItem()
# Append product to NlScrapeItem
item['identifier'] = identifier
item['href'] = href
item['description'] = desc
item['originalPrice'] = originalPrice
item['discountedPrice'] = discountedPrice
item['firstSighted'] = dt
item['lastSighted'] = dt
yield item
Additions:
So I tried to ignore the idea of using a crawlSpider and follow #jabargas thinking - see below:
def __init__(self):
self.driver = webdriver.Safari()
self.driver.set_window_size(800,600)
def start_requests(self):
n = 5
urls= []
for pageNumber in range(1,n):
url = 'http://www.newlook.com/uk/womens/clothing/c/uk-womens-clothing?comp=NavigationBar%%7Cmn%%7Cwomens%%7Cclothing#/?q=:relevance&page=%d&sort=relevance&content=false' % pageNumber
urls.append(url)
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
driver = self.driver
driver.get(response.url)
time.sleep(2)
# Collect products
products = driver.find_elements_by_class_name('plp-item ng-scope')
# Iterate over products; extract data and append individual features to NlScrapeItem
for item in products:
# Pull features
desc = item.find_element_by_class_name('product-item__name link--nounderline ng-binding').text
href = item.find_element_by_class_name('plp-carousel__img-link ng-scope').get_attribute('href')
# Generate a product identifier
identifier = href.split('/p/')[1].split('?comp')[0]
identifier = int(identifier)
# datetime
dt = date.today()
dt = dt.isoformat()
# Price Symbol removal and integer conversion
try:
priceString = item.find_element_by_class_name('price ng-binding').text
except:
priceString = item.find_element_by_class_name('price price--previous-price product-item__price--previous-price ng-binding ng-scope').text
priceInt = priceString.split('£')[1]
originalPrice = float(priceInt)
# discountedPrice Logic
try:
discountedPriceString = item.find_element_by_class_name('price ng-binding price--marked-down').text
discountedPriceInt = discountedPriceString.split('£')[1]
discountedPrice = float(discountedPriceInt)
except:
discountedPrice = 'N/A'
# NlScrapeItem
item = NlScrapeItem()
# Append product to NlScrapeItem
item['identifier'] = identifier
item['href'] = href
item['description'] = desc
item['originalPrice'] = originalPrice
item['discountedPrice'] = discountedPrice
item['firstSighted'] = dt
item['lastSighted'] = dt
yield item
Unfortunately no luck: it pulls details for 48 items.
Another possible issue is that you have not added super constructor in your init method.
add "super(MySpider, self).init(*a, **kw)" for it.
I got the same issue and fixed it by that.
so init should look like follows
def __init__(self, *a, **kw):
super(MySpider, self).__init__(*a, **kw)
//your initializations
You could do it like this to scrape till page n:
start_urls = ['http://www.newlook.com/uk/womens/clothing/c/uk-womens-clothing?comp=NavigationBar%%7Cmn%%7Cwomens%%7Cclothing#/?q=:relevance&page=%d&sort=relevance&content=false' % page_number' for page_number in range(1,n)]
where n is the last page + 1
Or you could use scrapy pagination - get the link to the next page and follow it as you can find here.
Related
I'm trying to first crawl through the main page of this website for the links to a table for each year. Then I'd like to scrape each site, while maintaining record of each year.
So far I have my spider constructed as:
div = response.xpath('//*[#id="sidebar"]/div[1]/nav/ul/li[5]/div')
hrefs = div.xpath('*//a').extract()
splits = {}
for href in hrefs:
split = href.split('"')
link = split[1]
date = split[2]
clean_date = "".join(re.findall("[^><a/]",date))
clean_link = "http://www.ylioppilastutkinto.fi" + str(link)
splits[clean_date] = clean_link
I would then like to go through each link in this file and crawl through them, using the following logic:
table = resp.xpath('//*[#id="content"]/table/tbody')
rows = table.xpath('//tr')
data_dict = {"Category":
[w3lib.html.remove_tags(num.get()) for num in rows[0].xpath('td')[1:]]
}
for row in rows[1:]:
data = row.xpath('td')
title = w3lib.html.remove_tags(data[0].get())
nums = [w3lib.html.remove_tags(num.get()) for num in data[1:]]
data_dict[title] = nums
My problem is that I couldn't find a way to do this effectively. Calling scrapy.Request on the url returns a response with just the content <html></html>. If there was a way where the response object could resemble the one given by the fetch command in Scrapy shell that would be ideal, since I've based the selection logic on testing with that command.
Edit:
Here's the entire spider so far
The idea is the run the first for loop to get the link and then the second for loop to extract the tables from said links.
import scrapy
import regex as re
from scrapy.http import HtmlResponse
import w3lib.html
class MainSpider(scrapy.Spider):
name = 'links'
allowed_domains = ['www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat']
start_urls = ['https://www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat/']
def parse(self, response):
div = response.xpath('//*[#id="sidebar"]/div[1]/nav/ul/li[5]/div')
hrefs = div.xpath('*//a').extract()
splits = {}
for href in hrefs:
split = href.split('"')
link = split[1]
date = split[2]
clean_date = "".join(re.findall("[^><a/]",date))
clean_link = "http://www.ylioppilastutkinto.fi" + str(link)
splits[clean_date] = clean_link
for date,url in splits.items():
resp = HtmlResponse(url)
table = resp.xpath('//*[#id="content"]/table/tbody')
rows = table.xpath('//tr')
data_dict = {"Category":[w3lib.html.remove_tags(num.get()) for num in rows[0].xpath('td')[1:]]}
for row in rows[1:]:
data = row.xpath('td')
title = w3lib.html.remove_tags(data[0].get())
nums = [w3lib.html.remove_tags(num.get()) for num in data[1:]]
data_dict[title] = nums
yield {
'Date': date,
'Scores': data_dict}
Initializing a HtmlResponse(url) doesn't accomplish anything, since the class doesn't make the request itself.
To add a request to scrapy's scheduler, you need to yield one, eg: yield scrapy.Request(url, callback=self.parse).
That being said, there are many improvements you can make to your spider.
Use scrapy's builtin LinkExtractor instead of string splitting
use css selectors instead of the hardcoded xpaths
use selector.root.text instead of w3lib.remove_tags (to remove the dependency entirely)
Here is a working example:
import scrapy
from scrapy.linkextractors import LinkExtractor
class MainSpider(scrapy.Spider):
name = 'links'
allowed_domains = ['www.ylioppilastutkinto.fi']
start_urls = ['https://www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat/']
def parse(self, response):
le = LinkExtractor(
allow_domains=self.allowed_domains,
restrict_xpaths='//*[#id="sidebar"]/div[1]/nav/ul/li[5]/div',
)
for link in le.extract_links(response):
yield scrapy.Request(
url=link.url,
callback=self.parse_table,
cb_kwargs={ 'date': link.text },
)
def parse_table(self, response, date):
rows = response.css('#content table tbody tr')
if not rows:
print(f'No table found for url: {response.url}')
return
category = [char.root.text for char in rows[0].css('td strong')[1:]]
if not category:
category = [char.root.text for char in rows[0].css('td')[1:]]
for row in rows[1:]:
cols = row.css('td')
title = cols[0].root.text
nums = [col.root.text for col in cols[1:]]
yield {
'Date': date,
'Category': category,
title: nums
}
Note that your category parsing doesn't appear to work. I'm not exactly sure what you are trying to extract, so I'll leave that one for you.
I am running the scrapy spider on airbnb for academic purposes below. I scrape all listings first
(such as: https://www.airbnb.com/s/Berlin--Germany/homes?tab_id=all_tab&query=Berlin%2C%20Germany&place_id=ChIJAVkDPzdOqEcRcDteW0YgIQQ&checkin=2020-05-01&adults=1&refinement_paths%5B%5D=%2Fhomes&source=structured_search_input_header&search_type=search_query&checkout=2020-05-02)
to get their ids and then go to the listing's page
(such as: https://www.airbnb.de/rooms/20839690?location=Berlin&check_in=2020-05-01&check_out=2020-05-02&adults=1)
and get the geo-data from the details JSON. Ideally, I would like to have a final JSON nested like:
{{'ID': ID1, 'Title': Title1, 'Latitude': Lat1},{'ID': ID2, 'Title': Title2, 'Latitude': Lat2}}
Because of the recursive structure, I have the full list of title, price etc. already in the first go, while lng and lat are only one element per loop run.
{{Price1, Price2, Price3..., id1, id2...lng1, lat1}, {Price1, Price2, Price3..., id1, id2..., lng2, lat2}}
Any idea how I can restructure the code to get the above structure?
Cheers
marcello
Spider:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from airbnb.items import AirbnbItem
import json
import pprint
all_ids = []
detail = {}
class AirbnbSpider(scrapy.Spider):
name = 'airbnb_spider'
allowed_domains = ['airbnb.com', 'airbnb.de']
start_urls = ['https://www.airbnb.de/s/Berlin/homes?checkin=2020-05-01&checkout=2020-05-02&adults=1']
def parse(self, response):
item = AirbnbItem()
for listing in response.xpath('//div[#class = "_fhph4u"]'):
detail["title"] = listing.xpath('//a[#class = "_i24ijs"]/#aria-label').extract()
detail["price"] = listing.xpath('//span[#class = "_1p7iugi"]/text()').extract()
detail["rating"] = listing.xpath('//span[#class = "_3zgr580"]/text()').get()
detail["id"] = listing.xpath('//a[#class = "_i24ijs"]/#target').extract()
#item["link"] = listing.xpath('//a[#class = "_i24ijs"]/#href').extract()
x_id = [i.split('_')[1] for i in detail['id']]
detail['id'] = x_id
for i in x_id:
link = 'https://www.airbnb.de/api/v2/pdp_listing_details/'+i+'?_format=for_rooms_show&_p3_impression_id=p3_1587291065_1e%2FBlC2IefkrfTQe&adults=1&check_in=2020-05-01&check_out=2020-05-02&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&'
yield scrapy.Request(url = link, callback =self.parse_detail)
def parse_detail(self, response):
jsonresponse = json.loads(response.body_as_unicode())
detail["lat"] = jsonresponse["pdp_listing_detail"]["lat"]
detail["lng"] = jsonresponse["pdp_listing_detail"]["lng"]
return detail
Items
import scrapy
class AirbnbItem(scrapy.Item):
# define the fields for your item here like:
title = scrapy.Field()
price = scrapy.Field()
id = scrapy.Field()
rating = scrapy.Field()
lat = scrapy.Field()
lng = scrapy.Field()
pass
You can pass information to the to the parse_detail method and yield from there
def parse(self, response):
item = AirbnbItem()
for listing in response.xpath('//div[#class = "_fhph4u"]'):
detail["title"] = listing.xpath('//a[#class = "_i24ijs"]/#aria-label').get()
detail["price"] = listing.xpath('//span[#class = "_1p7iugi"]/text()').get()
detail["rating"] = listing.xpath('//span[#class = "_3zgr580"]/text()').get()
detail["id"] = listing.xpath('//a[#class = "_i24ijs"]/#target').get()
#item["link"] = listing.xpath('//a[#class = "_i24ijs"]/#href').get()
detail['id'] = detail['id'].split('_')[1]
link = 'https://www.airbnb.de/api/v2/pdp_listing_details/'+detail['id']+'?_format=for_rooms_show&_p3_impression_id=p3_1587291065_1e%2FBlC2IefkrfTQe&adults=1&check_in=2020-05-01&check_out=2020-05-02&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&'
yield scrapy.Request(url = link,
meta={'item': detail}, #pass information to the next method
callback =self.parse_detail)
def parse_detail(self, response):
jsonresponse = json.loads(response.body_as_unicode())
detail = response.meta['item']
detail["lat"] = jsonresponse["pdp_listing_detail"]["lat"]
detail["lng"] = jsonresponse["pdp_listing_detail"]["lng"]
yield detail
BTW, Item class is useless, do not use it.
import scrapy
class rlgSpider(scrapy.Spider):
name = 'bot'
start_urls = [
'https://rocket-league.com/trading?filterItem=0&filterCertification=0&filterPaint=0&filterPlatform=1&filterSearchType=1&filterItemType=0&p=1']
def parse(self, response):
data = {}
offers = response.xpath('//div[#class = "col-3-3"]')
for offer in offers:
for item in offer.xpath('//div[#class = "rlg-trade-display-container is--user"]/div[#class = "rlg-trade-display-items"]/div[#class = "col-1-2 rlg-trade-display-items-container"]/a'):
data['name'] = item.xpath('//div/div[#position ="relative"]/h2').extarct()
yield data
Here is what I did so far - it doesn't work well. It scrapes the url and not the h2 tag how do I do that when it's inside so many divs?
In order to parse though an element in scrapy you need to start your xpath with "." else you will be parsing through the response, this is the correct way of doing it.
def parse(self, response):
offers = response.xpath('//div[#class = "col-3-3"]')
for offer in offers:
for item in offer.xpath('.//div[#class = "rlg-trade-display-container is--user"]/div[#class = "rlg-trade-display-items"]/div[#class = "col-1-2 rlg-trade-display-items-container"]/a'):
data = {}
data['name'] = item.xpath('.//h2/text()').extarct_first()
yield data
This question already has an answer here:
scrapy xpath selector repeats data
(1 answer)
Closed 6 years ago.
I use scrapy for this link. I want crawl information movie from website imdb.com.
When I use code XPath
//td[#class="overview-top"]
I recieve a list information of movies.
Here is my code.
import scrapy
import sys
from imbd.items import ImbdItem
class ImbdSpiderSpider(scrapy.Spider):
name = "imbd_spider"
allowed_domains = ["imdb.com"]
start_urls = ()
def parse(self, response):
print response.url
title_movie= response.xpath('//td[#class="overview-top"]/h4/a/text()').extract()
length_title = len(title_movie)
if(length_title == 0):
return
# $x()
# read block
#print block[0].xpath('//*[#id="main"]/div/div[2]/div[4]/table/tbody/tr[1]/td[2]/h4/a').extract()
#print block[1]
#print block[1].xpath('//td[#class="overview-top"]/h4/a/text()').extract()
tree = response.xpath('//td[#class="overview-top"]')
i = 0
for block in tree:
#print table
title = block.xpath('//h4[#itemprop="name"]/a/text()').extract()
# author = block.xpath('//span[#itemprop="director"]/span/a/text()')[i].extract()
# rate = block.xpath('//div[#class="metascore no_ratings"]/strong/text()')[i].extract()
# time = block.xpath('//time[#itemprop="duration"]/text()')[i].extract()
# tag = block.xpath('//span[#itemprop="genre"]/text()').extract()
# des = block.xpath('//div[#class="outline"]/text()')[i].extract()
print title
# print author
# print rate
# print time
# print tag
# print des
#i = i + 1
# page = response.xpath('//div[#class="sort"]/a')
# page2 = page.xpath('//a[text()="Next"]/#href')
# nextpage = "http://www.imdb.com" + page2[0].extract() # /movie-coming-soon..
# yield scrapy.Request(nextpage, self.parse)
def start_requests(self):
start = "http://www.imdb.com/movies-coming-soon/2017-12/"
yield self.make_requests_from_url(start)
Here is my result
I want the terminal to show one title, but this shows all titles in the list.
My code : https://github.com/Takehashi/Scrapy-imbd.com/tree/master
Just add "." in first to avoid duplicate !
item['title'] = block.xpath('.//h4[#itemprop="name"]/a/text()').extract()
item['author'] = block.xpath('.//span[#itemprop="director"]/span/a/text()').extract()
item['rate'] = block.xpath('.//div[#class="metascore no_ratings"]/strong/text()').extract()
item['time'] = block.xpath('.//time[#itemprop="duration"]/text()').extract()
item['tag'] = block.xpath('.//span[#itemprop="genre"]/text()').extract()
item['des'] = block.xpath('.//div[#class="outline"]/text()').extract()
import scrapy
from ex.items import ExItem
class reddit(scrapy.Spider):
name = "dmoz"
allowed_domains = ["reddit.com"]
start_urls = [
"http://www.reddit.com/"]
"""docstring for reddit"""
def parse(self, response):
item = ExItem()
item ["title"] = response.xpath('//p[contains(#class,"title")]/a/text()').extract()
item ["rank"] = response.xpath('//span[contains(#class,"rank")]/text()').extract()
item ["votes_dislike"] = response.xpath('//div[contains(#class,"score dislikes")]/text()').extract()
item ["votes_unvoted"] = response.xpath('//div[contains(#class,"score unvoted")]/text()').extract()
item ["votes_likes"] = response.xpath('//div[contains(#class,"score likes")]/text()').extract()
item ["video_reference"] = response.xpath('//a[contains(#class,"thumbnail may-blank")]/#href').extract()
item ["image"] = response.xpath('//a[contains(#class,"thumbnail may-blank")]/img/#src').extract()
I am able to convert this into JSON but in the output i am getting a bullet in the JSON how to remove that and still have the JSON format?
There are hidden elements that you don't see in the browser. Scrapy sees them.
You just need to search for the data inside the relevant part of the page (div with id="siteTable"):
def parse(self, response):
# make a selector and search the fields inside it
sel = response.xpath('//div[#id="siteTable"]')
item = ExItem()
item["title"] = sel.xpath('.//p[contains(#class,"title")]/a/text()').extract()
item["rank"] = sel.xpath('.//span[contains(#class,"rank")]/text()').extract()
item["votes_dislike"] = sel.xpath('.//div[contains(#class,"score dislikes")]/text()').extract()
item["votes_unvoted"] = sel.xpath('.//div[contains(#class,"score unvoted")]/text()').extract()
item["votes_likes"] = sel.xpath('.//div[contains(#class,"score likes")]/text()').extract()
item["video_reference"] = sel.xpath('.//a[contains(#class,"thumbnail may-blank")]/#href').extract()
item["image"] = sel.xpath('.//a[contains(#class,"thumbnail may-blank")]/img/#src').extract()
return item
Tested, here is what I get for, for example, votes_likes:
'votes_likes': [u'5340',
u'4041',
u'4080',
u'5055',
u'4385',
u'4784',
u'3842',
u'3734',
u'4081',
u'3731',
u'4580',
u'5279',
u'2540',
u'4345',
u'2068',
u'3715',
u'3249',
u'4232',
u'4025',
u'522',
u'2993',
u'2789',
u'3529',
u'3450',
u'3533'],