Trying to scrape the internet archive website (Wayback Machine): https://web.archive.org/web/20150906222155mp_/https://www.zalando.co.uk/womens-clothing/.
I am successful in scraping the content of the first page, but I can't move to the next page. I have tried multiple XPath expressions to move to the next pages:
# 1
next_page_url = response.xpath("//li[a[contains(.,'>')]]//@href").extract_first() # does not work
# 2
next_page_url = response.xpath("(//a[@class='catalogPagination_page' and text() ='>'])[1]//@href").get() # does not work
I have tried converting to an absolute URL (and without), but again with no luck.
Can anyone help with new XPath or CSS selectors so that I can finally scrape the next pages?
Below you can see my full code:
# -*- coding: utf-8 -*-
import scrapy


class ZalandoWomenSpider(scrapy.Spider):
    name = 'zalando_women_historic_2015'
    allowed_domains = ['www.web.archive.org']
    start_urls = ['https://web.archive.org/web/20150906222155mp_/https://www.zalando.co.uk/womens-clothing/']

    def parse(self, response):
        products = response.xpath("//a[@class='catalogArticlesList_productBox']")
        for product in products:
            link = product.xpath(".//@href").get()
            absolute_url = f"https://web.archive.org{link}"
            yield scrapy.Request(url=absolute_url, callback=self.parse_product, dont_filter=True, meta={'link': link})
        # process next page
        next_page_url = response.xpath("//li[a[contains(.,'>')]]//@href").extract_first()  # (//a[@class='catalogPagination_page' and text() ='>'])[1]//@href
        absolute_next_page_url = f"https://web.archive.org{next_page_url}"
        # absolute_next_page_url = next_page_url
        # absolute_next_page_url = response.urljoin(next_page_url)
        if next_page_url:
            yield scrapy.Request(url=absolute_next_page_url, callback=self.parse)

    def parse_product(self, response):
        link = response.request.meta['link']
        brand = response.xpath("//span[@itemprop='brand']/text()").get()
        price = response.xpath("//span[@class='price oldPrice nowrap']/text()").get()
        price1 = response.xpath("//span[@itemprop='price']/text()").get()
        price2 = response.xpath("//div[@class='boxPrice']//span[contains(@class,'price')]/text()").get()
        disc_price = response.xpath("//span[@class='price specialPrice nowrap']/text()").get()
        product_type = response.xpath("//span[@itemprop='name']/text()").get()
        material = response.xpath("//div[@class='content']//li[contains(.,'material')]/text()").get()
        yield {
            'brand_name': brand,
            'product_price': price,
            'product_price1': price1,
            'product_price2': price2,
            'product_price_b4_disc': disc_price,
            'link': link,
            'product_type': product_type,
            'material': material}
next_page_url = response.xpath(".//a[@class='catalogPagination_page' and text() ='>']/@href").get()
This will get: '/web/20150906222155/https://www.zalando.co.uk/womens-clothing/?p=2'
You can then use split("/") to remove the "/web/201509..." bit.
Note 1: I used double quotes (" ") around the expression inside the parentheses.
Note 2: in Scrapy you can also use "response.follow" to save having to join a relative URL to a base URL.
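For example, the pagination step in parse could then look like this (a minimal sketch based on the selector above; the last line is only needed if you also want the original, non-archived Zalando URL, and it splits on "https://" rather than "/" for brevity):
next_page_url = response.xpath("//a[@class='catalogPagination_page' and text() ='>']/@href").get()
if next_page_url:
    # response.follow resolves the relative '/web/2015...' href against the current archive URL
    yield response.follow(next_page_url, callback=self.parse)
    # optional: recover the original page URL from the archived path
    original_url = "https://" + next_page_url.split("https://", 1)[-1]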
Check this post as well:
Scrapy response.follow query
I'm trying to first crawl through the main page of this website for the links to a table for each year. Then I'd like to scrape each page, while maintaining a record of each year.
So far I have my spider constructed as:
div = response.xpath('//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div')
hrefs = div.xpath('*//a').extract()
splits = {}

for href in hrefs:
    split = href.split('"')
    link = split[1]
    date = split[2]
    clean_date = "".join(re.findall("[^><a/]", date))
    clean_link = "http://www.ylioppilastutkinto.fi" + str(link)
    splits[clean_date] = clean_link
I would then like to go through each link in this file and crawl through them, using the following logic:
table = resp.xpath('//*[@id="content"]/table/tbody')
rows = table.xpath('//tr')

data_dict = {"Category":
             [w3lib.html.remove_tags(num.get()) for num in rows[0].xpath('td')[1:]]
             }

for row in rows[1:]:
    data = row.xpath('td')
    title = w3lib.html.remove_tags(data[0].get())
    nums = [w3lib.html.remove_tags(num.get()) for num in data[1:]]
    data_dict[title] = nums
My problem is that I couldn't find a way to do this effectively. Calling scrapy.Request on the URL returns a response with just the content <html></html>. If there were a way to make the response object resemble the one given by the fetch command in the Scrapy shell, that would be ideal, since I've based the selection logic on testing with that command.
Edit:
Here's the entire spider so far
The idea is to run the first for loop to get the links and then the second for loop to extract the tables from said links.
import scrapy
import regex as re
from scrapy.http import HtmlResponse
import w3lib.html


class MainSpider(scrapy.Spider):
    name = 'links'
    allowed_domains = ['www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat']
    start_urls = ['https://www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat/']

    def parse(self, response):
        div = response.xpath('//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div')
        hrefs = div.xpath('*//a').extract()
        splits = {}

        for href in hrefs:
            split = href.split('"')
            link = split[1]
            date = split[2]
            clean_date = "".join(re.findall("[^><a/]", date))
            clean_link = "http://www.ylioppilastutkinto.fi" + str(link)
            splits[clean_date] = clean_link

        for date, url in splits.items():
            resp = HtmlResponse(url)
            table = resp.xpath('//*[@id="content"]/table/tbody')
            rows = table.xpath('//tr')

            data_dict = {"Category": [w3lib.html.remove_tags(num.get()) for num in rows[0].xpath('td')[1:]]}

            for row in rows[1:]:
                data = row.xpath('td')
                title = w3lib.html.remove_tags(data[0].get())
                nums = [w3lib.html.remove_tags(num.get()) for num in data[1:]]
                data_dict[title] = nums

            yield {
                'Date': date,
                'Scores': data_dict}
Initializing an HtmlResponse(url) doesn't accomplish anything, since the class doesn't make the request itself.
To add a request to scrapy's scheduler, you need to yield one, eg: yield scrapy.Request(url, callback=self.parse).
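For instance, a minimal change to your original spider would be to replace the second for loop with something like this (a sketch only; parse_table is a hypothetical callback that would hold your table-parsing logic, and cb_kwargs passes the date along):
for date, url in splits.items():
    # yield a Request so Scrapy actually downloads the page;
    # cb_kwargs forwards the date to the callback
    yield scrapy.Request(url, callback=self.parse_table, cb_kwargs={'date': date})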
That being said, there are many improvements you can make to your spider.
Use scrapy's builtin LinkExtractor instead of string splitting.
Use css selectors instead of the hardcoded xpaths.
Use selector.root.text instead of w3lib.remove_tags (to remove the dependency entirely).
Here is a working example:
import scrapy
from scrapy.linkextractors import LinkExtractor


class MainSpider(scrapy.Spider):
    name = 'links'
    allowed_domains = ['www.ylioppilastutkinto.fi']
    start_urls = ['https://www.ylioppilastutkinto.fi/ylioppilastutkinto/pisterajat/']

    def parse(self, response):
        le = LinkExtractor(
            allow_domains=self.allowed_domains,
            restrict_xpaths='//*[@id="sidebar"]/div[1]/nav/ul/li[5]/div',
        )
        for link in le.extract_links(response):
            yield scrapy.Request(
                url=link.url,
                callback=self.parse_table,
                cb_kwargs={'date': link.text},
            )

    def parse_table(self, response, date):
        rows = response.css('#content table tbody tr')
        if not rows:
            print(f'No table found for url: {response.url}')
            return

        category = [char.root.text for char in rows[0].css('td strong')[1:]]
        if not category:
            category = [char.root.text for char in rows[0].css('td')[1:]]

        for row in rows[1:]:
            cols = row.css('td')
            title = cols[0].root.text
            nums = [col.root.text for col in cols[1:]]

            yield {
                'Date': date,
                'Category': category,
                title: nums
            }
Note that your category parsing doesn't appear to work. I'm not exactly sure what you are trying to extract, so I'll leave that one for you.
I'm trying to scrape a real-estate website: https://www.nepremicnine.net/oglasi-prodaja/slovenija/hisa/. I would like to get the href that is hidden in the a tag wrapping the house images.
I would like to get this for the whole page (and other pages). Here is the code I wrote, which returns nothing (i.e. an empty dictionary):
import scrapy
from ..items import RealEstateSloItem
import time

# first get all the URLs that have more info on the houses
# next crawl those URLs to get the desired information


class RealestateSpider(scrapy.Spider):
    # allowed_domains = ['nepremicnine.net']
    name = 'realestate'
    page_number = 2
    # page 1 url
    start_urls = ['https://www.nepremicnine.net/oglasi-prodaja/slovenija/hisa/1/']

    def parse(self, response):
        items = RealEstateSloItem()  # create it from items class --> need to store it down

        all_links = response.css('a.slika a::attr(href)').extract()

        items['house_links'] = all_links
        yield items

        next_page = 'https://www.nepremicnine.net/oglasi-prodaja/slovenija/hisa/' + str(RealestateSpider.page_number) + '/'
        # print(next_page)
        # if next_page is not None:  # for buttons
        if RealestateSpider.page_number < 180:  # then only make sure to go to the next page
            # if yes then increase it --> for paginations
            time.sleep(1)
            RealestateSpider.page_number += 1
            # parse automatically checks for response.follow if its there when its done with this page
            # this is a recursive function
            # follow next page and where should it go after following
            yield response.follow(next_page, self.parse)  # want it to go back to parse
Could you tell me what I am doing wrong here with css selectors?
Your selector is looking for an a element inside the a.slika. This should solve your issue:
all_links = response.css('a.slika ::attr(href)').extract()
Those will be relative URLs; you can use response.urljoin() to build the absolute URL, using your response URL as the base.
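A minimal sketch of how this could fit into the spider from the question (parse_house is a hypothetical callback for the per-listing parsing; the rest follows the question's code):
import scrapy


class RealestateSpider(scrapy.Spider):
    name = 'realestate'
    start_urls = ['https://www.nepremicnine.net/oglasi-prodaja/slovenija/hisa/1/']

    def parse(self, response):
        for link in response.css('a.slika ::attr(href)').extract():
            # response.urljoin resolves the relative href against the response URL
            yield scrapy.Request(response.urljoin(link), callback=self.parse_house)

    def parse_house(self, response):
        # hypothetical placeholder: extract the per-listing fields here
        yield {'url': response.url}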
I am having problems going through multiple pages. Here is my Scrapy spider class, called quotes.
class quotes(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://books.toscrape.com/?']

    def parse(self, response):
        all_links = response.css('.nav-list ul li')
        for links in all_links:
            link = links.css('a::attr(href)').get()
            yield response.follow(link, callback=self.books_detail)

    def books_detail(self, response):
        yas = {
            'title': [],
            'price': [],
            'availability': [],
            'category': []
        }
        yas['category'].append(response.css('h1::text').extract())
        all_divs = response.css('.col-lg-3')
        for div in all_divs:
            link = div.css('.product_pod a::attr(href)').get()
            title = response.follow(link, callback=self.get_title)
            yas['price'].append(div.css('.price_color::text').extract())
            yas['availability'].append(div.css('.availability::text')[1].extract())
        yield yas

    def get_title(self, response):
        print('testing')
        title = response.css('h1::text').extract()
        yield {"title": title}
So I use response.follow to go to the books_detail function, and in that function I again call response.follow to call get_title. I get the 'title' from get_title and the rest of the details from the main page.
I can scrape the information just fine from the books_detail function, and I can get the link of the title page just fine as well from this line:
link = div.css('.product_pod a::attr(href)').get()
But using response.follow I cannot get to the get_title function.
Any help would be appreciated. Thanks.
You should yield the request, not run it directly, and use meta= to send data to the next parser:
yield response.follow(link, callback=self.get_title, meta={'item': yas})
and in the next parser you can get it:
yas = response.meta['item']
and then you can add the new values and yield all the data:
yas["title"] = response.css('h1::text').extract()
yield yas
See another example in Scrapy yeild items from multiple requests
Doc: Request and Response, Request.meta special keys
Minimal working code which you can put in one file and run as a normal script (python script.py) without creating a project.
There are other changes.
You shouldn't put all books into one list but yield every book separately. Scrapy will keep all results, and when you use the option to save to CSV it will save all of them.
For every book you should create a new dictionary. If you use the same dictionary many times, it will overwrite data and you may get many results with the same data.
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        all_links = response.css('.nav-list ul li')
        for links in all_links:
            link = links.css('a::attr(href)').get()
            yield response.follow(link, callback=self.books_detail)

    def books_detail(self, response):
        all_divs = response.css('.col-lg-3')
        for div in all_divs:
            # every book in a separate dictionary, and it has to be a new dictionary - because reusing one could overwrite old data
            book = {
                'category': response.css('h1::text').extract(),
                'price': div.css('.price_color::text').extract()[0].strip(),
                'availability': div.css('.availability::text')[1].extract().strip(),
            }
            link = div.css('.product_pod a::attr(href)').get()
            yield response.follow(link, callback=self.get_title, meta={'item': book})

    def get_title(self, response):
        book = response.meta['item']
        print('testing:', response.url)
        book["title"] = response.css('h1::text').extract()[0].strip()
        yield book


# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEED_FORMAT': 'csv',  # csv, json, xml
    'FEED_URI': 'output.csv',
})
c.crawl(QuotesSpider)
c.start()
I built my first Scrapy spider in several hours over the last two days, but I am stuck right now. The main purpose I wanted to achieve is to extract all the data so I can later filter it in a CSV. The data that is really crucial for me (companies without(!) webpages) is dropped, because Scrapy can't find the XPath I provided unless an item has a homepage. I tried an if statement here, but it's not working.
Example website: https://www.achern.de/de/Wirtschaft/Unternehmen-A-Z/Unternehmen?view=publish&item=company&id=1345
I use the XPath selector: response.xpath("//div[@class='cCore_contactInformationBlockWithIcon cCore_wwwIcon']/a/@href").extract()
Example non-website: https://www.achern.de/de/Wirtschaft/Unternehmen-A-Z/Unternehmen?view=publish&item=company&id=1512
Spider Code:
# -*- coding: utf-8 -*-
import scrapy


class AchernSpider(scrapy.Spider):
    name = 'achern'
    allowed_domains = ['www.achern.de']
    start_urls = ['https://www.achern.de/de/Wirtschaft/Unternehmen-A-Z/']

    def parse(self, response):
        for href in response.xpath("//ul[@class='cCore_list cCore_customList']/li[*][*]/a/@href"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.scrape)

    def scrape(self, response):
        # Extracting the content using css selectors
        print("Processing:" + response.url)
        firma = response.css('div>#cMpu_publish_company>h2.cCore_headline::text').extract()
        anschrift = response.xpath("//div[contains(@class,'cCore_addressBlock_address')]/text()").extract()
        tel = response.xpath("//div[@class='cCore_contactInformationBlockWithIcon cCore_phoneIcon']/text()").extract()
        mail = response.xpath(".//div[@class='cCore_contactInformationBlock']//*[contains(text(), '@')]/text()").extract()
        web1 = response.xpath("//div[@class='cCore_contactInformationBlockWithIcon cCore_wwwIcon']/a/@href").extract()
        if "http:" not in web1:
            web = "na"
        else:
            web = web1

        row_data = zip(firma, anschrift, tel, mail, web1)  # web1 must be changed to web, but then it only gives out "n" for every link

        # Give the extracted content row wise
        for item in row_data:
            # create a dictionary to store the scraped info
            scraped_info = {
                'Firma': item[0],
                'Anschrift': item[1] + ' 77855 Achern',
                'Telefon': item[2],
                'Mail': item[3],
                'Web': item[4],
            }

            # yield or give the scraped info to scrapy
            yield scraped_info
So overall it should export the DROPPED items even if "web" is not there.
Hope someone can help, greetings S
Using
response.css(".cCore_wwwIcon > a::attr(href)").get()
gives you None or the website address; you can then use or to provide a default:
website = response.css(".cCore_wwwIcon > a::attr(href)").get() or 'na'
Also, I refactored your scraper to use css selectors. Note that I've used .get() instead of .extract() to get a single item, not a list, which cleans up the code quite a bit.
import scrapy
from scrapy.crawler import CrawlerProcess


class AchernSpider(scrapy.Spider):
    name = 'achern'
    allowed_domains = ['www.achern.de']
    start_urls = ['https://www.achern.de/de/Wirtschaft/Unternehmen-A-Z/']

    def parse(self, response):
        for url in response.css("[class*=cCore_listRow] > a::attr(href)").extract():
            yield scrapy.Request(url, callback=self.scrape)

    def scrape(self, response):
        # Extracting the content using css selectors
        firma = response.css('.cCore_headline::text').get()
        anschrift = response.css('.cCore_addressBlock_address::text').get()
        tel = response.css(".cCore_phoneIcon::text").get()
        mail = response.css("[href^=mailto]::attr(href)").get().replace('mailto:', '')
        website = response.css(".cCore_wwwIcon > a::attr(href)").get() or 'na'

        scraped_info = {
            'Firma': firma,
            'Anschrift': anschrift + ' 77855 Achern',
            'Telefon': tel,
            'Mail': mail,
            'Web': website,
        }
        yield scraped_info


if __name__ == "__main__":
    p = CrawlerProcess()
    p.crawl(AchernSpider)
    p.start()
output:
with website:
{'Firma': 'Wölfinger Fahrschule GmbH', 'Anschrift': 'Güterhallenstraße 8 77855 Achern', 'Telefon': '07841 6738132', 'Mail': 'info@woelfinger-fahrschule.de', 'Web': 'http://www.woelfinger-fahrschule.de'}
without website:
{'Firma': 'Zappenduster-RC Steffen Liepe', 'Anschrift': 'Am Kirchweg 16 77855 Achern', 'Telefon': '07841 6844700', 'Mail': 'Zappenduster-Rc@hotmail.de', 'Web': 'na'}
I am starting at one page on a website and trying to find a link that takes me to another page to scrape. However, for reasons that I can't work out, my XPath to the link on this first page (which should take me to the other page) is not working. I have tried to get to the link from several directions, though none seem to work.
import scrapy
from urllib.parse import urljoin
from autotrader1.items import Autotrader1Item


class Autotrader1Spider(scrapy.Spider):
    name = "autotrader1"
    allowed_domains = ["autotrader.co.uk"]
    start_urls = ["https://www.autotrader.co.uk/dealers/greater-manchester/manchester/liberty-cars-10000889"]

    def parse(self, response):
        for href1 in response.xpath('//*[@class="dealer__stock-header"]/a/@href'):
            url1 = urljoin('https://autotrader.co.uk/', href1.extract())
            yield scrapy.Request(url1, callback=self.parse_dir_contents1)

    def parse_dir_contents1(self, response):
        for sel in response.xpath('//div[@class="dealerStock"]'):
            URLs = sel.xpath('.//a[@class="external title"]/@href').extract()
            URLs = [URL.strip() for URL in URLs]
            descriptions = sel.xpath('.//a[@class="external title"]/text()').extract()
            descriptions = [description.strip() for description in descriptions]
            prices = sel.xpath('.//div/span[@class="deal-price"]/text()').extract()
            prices = [price.strip() for price in prices]
            result = zip(URLs, descriptions, prices)
            for URL, description, price in result:
                item = Autotrader1Item()
                item['URL'] = URL
                item['description'] = description
                item['price'] = price
                yield item