I'm scraping dior.com for its products. head/script gives me all the fields I need except for a product description. To scrape the description I need to follow the link (the url variable in the code below). The only way to do that I'm familiar with is by using BeautifulSoup. Can I parse it using only Scrapy?
class DiorSpider(CrawlSpider):
    """Crawl the dior.com men's new-arrivals pages and yield one record per product.

    Product data is read from the ``window.initialState`` script embedded in
    each matched page rather than from the rendered HTML.
    """

    name = 'dior'
    allowed_domains = ['www.dior.com']
    start_urls = ['https://www.dior.com/en_us/']
    rules = (
        Rule(
            LinkExtractor(allow=(r'^https?://www.dior.com/en_us/men/clothing/new-arrivals.*',)),
            callback='parse_file',
        ),
    )

    # Compiled once for the whole class; the capture group holds the value,
    # so a ':' inside a product URI cannot break the extraction.
    _SKU_RE = re.compile(r'"sku":"([a-zA-Z0-9_]*)"')
    _URI_RE = re.compile(r'"productLink":\{"uri":"([^"]*)')

    def parse_file(self, response):
        """Yield a ``{'sku', 'url'}`` dict for every product found in the page script.

        :param response: the listing page response
        :return: generator of dicts; nothing is yielded when the page has no
            ``window.initialState`` script
        """
        script_text = response.xpath("//script[contains(., 'window.initialState')]").extract_first()
        if script_text is None:
            return
        # extract_blocks is defined outside this snippet; it is assumed to
        # return an iterable of text chunks -- TODO confirm.
        for block in extract_blocks(script_text):
            matches = zip(self._SKU_RE.finditer(block), self._URI_RE.finditer(block))
            for sku_match, uri_match in matches:
                yield {
                    'sku': sku_match.group(1),
                    'url': 'https://www.dior.com' + uri_match.group(1),
                }
If you need to extract additional information from a second request, instead of yielding the data there, you should yield a request for the URL that includes the information you already extracted in the Request.meta attribute.
from scrapy import Request
# …
def parse_file(self, response):
    """Sketch: schedule a follow-up request per product instead of yielding the data.

    The partially filled ``scraped_info`` dict travels with the request in
    ``meta`` so the next callback can complete it.
    """
    # …
    for block in blocks:
        # …
        for item in zip(sku, url):
            # …
            # Use the URL string stored in scraped_info: `url` above is the
            # iterator being zipped, not an address.
            yield Request(
                scraped_info['url'],
                callback=self.parse_additional_information,
                meta={'scraped_info': scraped_info},
            )
def parse_additional_information(self, response):
    """Complete and yield the record that was attached to the request.

    :param response: response whose ``meta['scraped_info']`` holds the
        partially filled record
    :return: generator yielding that record
    """
    record = response.meta['scraped_info']
    # extract the additional information, add it to the record
    yield record
I wanted to scrape the feed of sitepoint.com, this is my code:
import scrapy
from urllib.parse import urljoin
class SitepointSpider(scrapy.Spider):
    """Collect article teasers from the SitePoint JavaScript listing page."""

    # TODO: Add url tags (like /javascript) to the spider based on class parameters
    name = "sitepoint"
    allowed_domains = ["sitepoint.com"]
    start_urls = ["http://sitepoint.com/javascript/"]

    def parse(self, response):
        """Build one dict per ``<article>`` element and yield the collected list."""
        data = []
        for article in response.css("article"):
            title = article.css("a.t12xxw3g::text").get()
            href = article.css("a.t12xxw3g::attr(href)").get()
            img = article.css("img.f13hvvvv::attr(src)").get()
            time = article.css("time::text").get()

            url = urljoin("https://sitepoint.com", href)
            # NOTE: this only constructs a Request object. It is stored in the
            # dict rather than yielded, so "text" ends up holding the Request
            # itself (shown as <GET ...> in the output).
            text = scrapy.Request(url, callback=self.parse_article)
            data.append(
                {"title": title, "href": href, "img": img, "time": time, "text": text}
            )
        yield data

    def parse_article(self, response):
        """Yield the selector result for the text nodes directly under the article section."""
        text = response.xpath(
            '//*[@id="main-content"]/article/div/div/div[1]/section/text()'
        )
        yield text
And this is the response I get:-
[{'title': 'How to Build an MVP with React and Firebase',
'href': '/react-firebase-build-mvp/',
'img': 'https://uploads.sitepoint.com/wp-content/uploads/2021/09/1632802723react-firebase-mvp-
'time': 'September 28, 2021',
'text': <GET https://sitepoint.com/react-firebase-build-mvp/>}]
It just does not scrape the urls. I followed everything said in this question but still could not make it work.
You have to visit the detail page from the listing to scrape the article.
In that case you have to yield a Request for the URL first, and then yield the data from the last callback.
Also, the XPath //*[@id="main-content"]/article/div/div/div[1]/section/text() won't return the article text, since it only selects text nodes directly under the section tag and most of the text sits inside nested HTML elements.
One solution is to scrape all the HTML elements inside the section tag and clean them afterwards to get the article text.
here is the full working code
import re
import scrapy
from urllib.parse import urljoin
class SitepointSpider(scrapy.Spider):
    """Scrape SitePoint JavaScript articles: teaser data from the listing, text from the detail page."""

    # TODO: Add url tags (like /javascript) to the spider based on class parameters
    name = "sitepoint"
    allowed_domains = ["sitepoint.com"]
    start_urls = ["http://sitepoint.com/javascript/"]

    # Matches HTML tags and character entities; compiled once for all calls.
    _TAG_OR_ENTITY = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

    def clean_text(self, raw_html):
        """
        :param raw_html: this will take raw html code
        :return: text without html tags (empty string when raw_html is None)
        """
        # .get() on an empty selection returns None; treat that as "no text".
        if raw_html is None:
            return ''
        return self._TAG_OR_ENTITY.sub('', raw_html)

    def parse(self, response):
        """Yield one detail-page request per listed article, carrying the teaser fields in meta."""
        for article in response.css("article"):
            href = article.css("a.t12xxw3g::attr(href)").get()
            if not href:
                # Without a link, urljoin would fall back to the site root.
                continue
            teaser = {
                "title": article.css("a.t12xxw3g::text").get(),
                "href": href,
                "img": article.css("img.f13hvvvv::attr(src)").get(),
                "time": article.css("time::text").get(),
            }
            url = urljoin("https://sitepoint.com", href)
            yield scrapy.Request(url, callback=self.parse_article, meta=teaser)

    def parse_article(self, response):
        """Merge the teaser fields from meta with the cleaned article text and yield the item."""
        meta = response.meta
        article_html = response.xpath(
            '//*[@id="main-content"]/article/div/div/div[1]/section'
        ).get()
        yield {
            "title": meta["title"],
            "href": meta["href"],
            "img": meta["img"],
            "time": meta["time"],
            "text": self.clean_text(article_html),
        }
def parse(self, response):
    """Collect the top-level category links and request each category page.

    Each list entry must contain exactly one link and one text node;
    otherwise the single-element unpacking raises ValueError.
    """
    category_names = []
    category_urls = []

    for item in response.css("#zg_browseRoot ul li"):
        raw_urls = item.css("a").css(self.CSS_URL).extract()
        raw_names = item.css("a").css(self.CSS_TEXT).extract()

        (category_url,) = [self.parse_url(raw_url, 4) for raw_url in raw_urls]
        (category_name,) = raw_names

        # Record the pair so the request loop below has something to iterate.
        category_urls.append(category_url)
        category_names.append(category_name)

    for c_name, url in zip(category_names, category_urls):
        # NOTE(review): self.c_name is shared by every pending request and is
        # reassigned on each iteration, so a callback may see a later
        # category's name. Passing the name with the request (cb_kwargs or
        # meta) avoids this.
        self.c_name = [c_name]
        yield scrapy.Request(url, callback=self.parse_categories)
def parse_url(self, url, number):
    """Rebuild *url* on ``self.BASE_URL``, keeping only its leading path segments.

    :param url: URL whose path is to be trimmed; query and fragment are dropped
    :param number: how many items of ``path.split("/")`` to keep; the empty
        string before the leading slash counts as the first item
    :return: ``self.BASE_URL`` followed by the trimmed path
    """
    segments = urlparse(url).path.split("/")[:number]
    return f'{self.BASE_URL}{"/".join(segments)}'
def parse_categories(self, response):
    """Collect the subcategory links of a category page and request each one.

    Each list entry must contain exactly one link and one text node;
    otherwise the single-element unpacking raises ValueError.
    """
    sub_names = []
    sub_urls = []

    for item in response.css("#zg_browseRoot ul ul li"):
        raw_names = item.css("a").css(self.CSS_TEXT).extract()
        raw_urls = item.css("a").css(self.CSS_URL).extract()

        (sub_url,) = [self.parse_url(raw_url, 5) for raw_url in raw_urls]
        (sub_name,) = raw_names

        # Record the pair so the request loop below has something to iterate.
        sub_urls.append(sub_url)
        sub_names.append(sub_name)

    for name, url in zip(sub_names, sub_urls):
        # NOTE(review): self.sub_name is shared by every pending request and
        # is reassigned on each iteration, so a callback may see a later
        # subcategory's name. Passing the name with the request (cb_kwargs or
        # meta) avoids this.
        self.sub_name = [name]
        # print("{}: {}, {}".format(url, self.sub_name, self.c_name))
        yield scrapy.Request(url, callback=self.parse_subcategories)
def parse_subcategories(self, response):
    """Print the stored category and subcategory names with the trimmed request URL."""
    trimmed_url = self.parse_url(response.request.url, 5)
    print(f"{self.c_name}, {self.sub_name}, {trimmed_url}")
I'm having an issue with my Scrapy approach. I'm trying to scrape page which has categories and subcategories in which are items. I want to include category and subcategory with each item scraped.
The problem is that Scrapy's callbacks run asynchronously, so zipping the URLs with the names doesn't seem to work: the for loop is processed first, the requests are queued by the generator, and the names are left behind. Can anyone help me work around this?
You can pass arbitrary data along with the requests by using the cb_kwargs parameter. You can read about the details here.
Here is a simplified example:
def parse(self, response):
    """Yield one request per row, handing the row's category text to the callback."""
    rows = response.xpath('//div[@id="some-element"]')
    for row in rows:
        request_url = row.xpath('a/@href').get()
        category = row.xpath('a/text()').get()
        yield Request(
            # urljoin resolves a relative href against the current page URL.
            url=response.urljoin(request_url),
            callback=self.parse_category,
            cb_kwargs={'category': category},
        )
def parse_category(self, response, category):  # Notice category arg in the func
    """Callback sketch: ``category`` arrives as a keyword argument from ``cb_kwargs``."""
    # Process here
    yield item
The data inserted in cb_kwargs is passed as keyword arguments to the callback function, so each key in the dict must match the name of an argument in the method definition.
cb_kwargs were introduced in Scrapy v1.7, if you are using an older version you should use the meta param. You can read about it here, notice that the use is slightly different.
I am having problems going through multiple pages. Here is my class for scrapy code called quotes.
class quotes(scrapy.Spider):
    """Follow the category links of books.toscrape.com and visit each category page."""

    name = 'quotes'
    start_urls = ['http://books.toscrape.com/?']

    def parse(self, response):
        """Yield a request for every category link in the navigation list."""
        for entry in response.css('.nav-list ul li'):
            link = entry.css('a::attr(href)').get()
            yield response.follow(link, callback=self.books_detail)

    def books_detail(self, response):
        """Walk the book tiles of a category page and yield the collected dict."""
        # NOTE(review): the initial contents of this dict are missing from
        # this snippet -- TODO restore the original fields.
        yas = {}
        all_divs = response.css('.col-lg-3')
        for div in all_divs:
            link = div.css('.product_pod a::attr(href)').get()
            # NOTE: response.follow only builds a Request. It is assigned, not
            # yielded, so it is never scheduled and get_title is never called.
            title = response.follow(link, callback=self.get_title)
        yield yas

    def get_title(self, response):
        """Yield the ``h1`` text of a book page under the ``title`` key."""
        title = response.css('h1::text').extract()
        yield {"title": title}
So I use a response.follow to goto function books_details and in that function, I again call response.follow to call get_title. I get the 'title' from get_title and the rest of the details from the main page.
I can scrape the information just fine from the books_details function and I can get the link of the title page just fine as well from the code line.
link = div.css('.product_pod a::attr(href)').get()
But using the response.follow I can not go to the get_title function.
Any help would be appreciated. Thanks.
You should yield request, not run it directly, and use meta= to send data to next parser
yield response.follow(link, callback=self.get_title, meta={'item': yas})
and in next parser you can get it
yas = response.meta['item']
and then you can add new values and yield all data
yas["title"] = response.css('h1::text').extract()
yield yas
See other example in Scrapy yeild items from multiple requests
Doc: Request and Response, Request.meta special keys
Minimal working code which you can put in one file and run as normal script (python script.py) without creating project.
There are other changes.
You shouldn't put all books into one list but yield every book separately. Scrapy will keep all results, and when you use the option to save to CSV it will save all of them.
For every book you should create a new dictionary. If you reuse the same dictionary many times, it will overwrite the data and you may get many results with the same data.
import scrapy
class QuotesSpider(scrapy.Spider):
    """Scrape books.toscrape.com: category, price and availability from the
    listing page, title from each book's detail page."""

    name = 'quotes'
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        """Yield a request for every category link in the navigation list."""
        for entry in response.css('.nav-list ul li'):
            link = entry.css('a::attr(href)').get()
            if link is None:
                # response.follow cannot take a missing URL.
                continue
            yield response.follow(link, callback=self.books_detail)

    def books_detail(self, response):
        """Yield one detail-page request per book tile, carrying the listing data in meta."""
        for div in response.css('.col-lg-3'):
            link = div.css('.product_pod a::attr(href)').get()
            if link is None:
                continue

            # every book in separated dictionary and it has to be new dictionary - because it could overwrite old data
            book = {
                'category': response.css('h1::text').extract(),
                'price': div.css('.price_color::text').get(default='').strip(),
                # Join all text nodes instead of indexing one of them, so a
                # different node count cannot raise IndexError.
                'availability': ''.join(div.css('.availability::text').getall()).strip(),
            }

            yield response.follow(link, callback=self.get_title, meta={'item': book})

    def get_title(self, response):
        """Add the page's ``h1`` text as ``title`` to the book from meta and yield it."""
        book = response.meta['item']
        self.logger.debug('testing: %s', response.url)
        book["title"] = response.css('h1::text').get(default='').strip()
        yield book
# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    # NOTE(review): newer Scrapy releases configure exports through the FEEDS
    # setting -- confirm against the installed version.
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv', #
})
# Creating the process does not run anything: register the spider, then start
# the crawl (blocks until it finishes).
c.crawl(QuotesSpider)
c.start()
I need a Scrapy spider to scrape the following page (https://www.phidgets.com/?tier=1&catid=64&pcid=57) for each URL (30 products, so 30 urls) and then go into each product via that url and scrape the data inside.
I have the second part working exactly as I want:
import scrapy
class ProductsSpider(scrapy.Spider):
    """Extract the product fields from every product container on the start page."""

    name = "products"
    start_urls = [
        'https://www.phidgets.com/?tier=1&catid=64&pcid=57',
    ]

    def parse(self, response):
        """Yield one dict of product fields per ``div.ph-product-container``."""
        for info in response.css('div.ph-product-container'):
            yield {
                'product_name': info.css('h2.ph-product-name::text').extract_first(),
                'product_image': info.css('div.ph-product-img-ctn a').xpath('@href').extract(),
                'sku': info.css('span.ph-pid').xpath('@prod-sku').extract_first(),
                'short_description': info.css('div.ph-product-summary::text').extract_first(),
                'price': info.css('h2.ph-product-price > span.price::text').extract_first(),
                'long_description': info.css('div#product_tab_1').extract_first(),
                'specs': info.css('div#product_tab_2').extract_first(),
            }

        # next_page = response.css('div.ph-summary-entry-ctn a::attr("href")').extract_first()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)
But I don't know how to do the first part. As you will see I have the main page (https://www.phidgets.com/?tier=1&catid=64&pcid=57) set as the start_url. But how do I get it to populate the start_urls list with all 30 urls I need crawled?
I am not able to test at this moment, so please let me know if this works for you so I can edit it should there be any bugs.
The idea here is that we find every link in the first page and yield new scrapy requests passing your product parsing method as a callback
import scrapy
from urllib.parse import urljoin
class ProductsSpider(scrapy.Spider):
    """Follow every product link on the listing page and extract the product fields."""

    name = "products"
    start_urls = [
        'https://www.phidgets.com/?tier=1&catid=64&pcid=57',
    ]

    def parse(self, response):
        """Yield a request to ``parse_product`` for each product link on the listing page."""
        hrefs = response.xpath("//*[contains(@class, 'ph-summary-entry-ctn')]/a/@href").extract()
        for href in hrefs:
            # Links may be relative; resolve them against the listing URL.
            url = urljoin(response.url, href)
            yield scrapy.Request(url, callback=self.parse_product)

    def parse_product(self, response):
        """Yield one dict of product fields per ``div.ph-product-container``."""
        for info in response.css('div.ph-product-container'):
            yield {
                'product_name': info.css('h2.ph-product-name::text').extract_first(),
                'product_image': info.css('div.ph-product-img-ctn a').xpath('@href').extract(),
                'sku': info.css('span.ph-pid').xpath('@prod-sku').extract_first(),
                'short_description': info.css('div.ph-product-summary::text').extract_first(),
                'price': info.css('h2.ph-product-price > span.price::text').extract_first(),
                'long_description': info.css('div#product_tab_1').extract_first(),
                'specs': info.css('div#product_tab_2').extract_first(),
            }
I am trying to parse a site, an e-store. I parse a page with products, which are loaded with AJAX, get the URLs of these products, and then parse additional info for each product by following these parsed URLs.
My script gets the list of the first 4 items on the page and their URLs, makes the request and parses the additional info, but then it does not return to the loop, so the spider closes.
Could somebody help me in solving this? I'm pretty new to this kind of stuff, and ask here when totally stuck.
Here is my code:
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http.request import Request
from scrapy_sokos.items import SokosItem
class SokosSpider(Spider):
    """Scrape sokos.fi search-result pages and enrich each product from its detail page."""

    name = "sokos"
    allowed_domains = ["sokos.fi"]
    base_url = "http://www.sokos.fi/fi/SearchDisplay?searchTermScope=&searchType=&filterTerm=&orderBy=8&maxPrice=&showResultsPage=true&beginIndex=%s&langId=-11&sType=SimpleSearch&metaData=&pageSize=4&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&urlLangId=-11&categoryId=295401&storeId=10151"

    start_urls = []

    # A plain loop rather than a comprehension: names defined in the class
    # body (base_url) are not visible inside a comprehension's own scope.
    for i in range(0, 8, 4):
        start_urls.append((base_url) % str(i))

    def parse(self, response):
        """Yield one detail-page request per product tile, carrying a new item in meta."""
        products = Selector(response).xpath('//div[@class="product-listing product-grid"]/article[@class="product product-thumbnail"]')

        for product in products:
            item = SokosItem()
            # The leading '.' keeps the query relative to this product. A bare
            # '//' searches the whole document and returns the first
            # product's link every time, so the repeated URL gets dropped by
            # the duplicate filter.
            item['url'] = product.xpath('.//div[@class="content"]/a[@class="image"]/@href').extract()[0]

            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_additional_info)

    def parse_additional_info(self, response):
        """Fill name, description, price and number on the item from meta and yield it.

        Raises IndexError when any of the expected nodes is missing from the page.
        """
        item = response.meta['item']
        page = Selector(response)

        item['name'] = page.xpath('//h1[@class="productTitle"]/text()').extract()[0].strip()
        item['description'] = page.xpath('//div[@id="kuvaus"]/p/text()').extract()[0]
        euro = page.xpath('//strong[@class="special-price"]/span[@class="euros"]/text()').extract()[0]
        cent = page.xpath('//strong[@class="special-price"]/span[@class="cents"]/text()').extract()[0]

        # Join the two parts, not the characters of their concatenation
        # ('.'.join("1299") would give "1.2.9.9").
        item['price'] = '.'.join((euro, cent))
        item['number'] = page.xpath('//@data-productid').extract()[0]

        yield item
The AJAX requests you are simulating are caught by the Scrapy "duplicate url filter".
Set dont_filter to True when yielding a Request:
yield Request(url=item['url'],
meta={'item': item},