How to Scrape Inner level url using scrapy? - python

I created a scrapy spider watching some online video. It scrapes profile url from a website. I want to extend this to scrape data like address, name, phone,website url from each profile url scraped.
I was thinking to create to separate scrapers. One for scraping Profile url. And the second one to scrape the data from the scraped first url.
Is there any other solution?
Here is my spider that scrapes profile urls.
# -*- coding: utf-8 -*-
import scrapy
from ..items import ...scraperItem
class SpiderSpider(scrapy.Spider):
name = 'spider'
start_urls = ['https:// ...']
page_number = 15
def parse(self, response):
items=...scraperItem()
..._url=response.css('a.header-5.text-unbold ::attr(href)').extract_first()
items['..._url']= ..._url
yield items
next_page = 'https:/...'+str(...SpiderSpider.page_number)
if ...SpiderSpider.page_number <= 150:
...SpiderSpider.page_number += 15
yield response.follow(next_page, callback = self.parse)

You can add another parse method (eg. parse_profile) to scrape the additional data. E.g.
def parse(self, response):
url = response.css('a.header-5.text-unbold ::attr(href)').extract_first()
yield response.follow(url, callback=self.parse_profile)
# next_page = ...
if self.page_number <= 150:
self.page_number += 15
yield response.follow(next_page, callback=self.parse)
def parse_profile(self, response)
item = HouzzscraperItem()
item['houzz_url'] = response.url
# item['address'] = ...
# item['name'] = ...
# item['phone'] = ...
yield item

Related

Failed to retrieve product listings pages from few categories

From this webpage I am trying to get that kind of link where different products are located. There are 6 categories having More info button which when I traverse recursively, I usually reach the target pages. This is one such product listings page I wish to get.
Please note that some of these pages have both product listing and more info buttons, which is why I failed to capture the product listing pages accurately.
Current spider looks like the following (fails to grab lots of product listings pages):
import scrapy
class NorgrenSpider(scrapy.Spider):
name = 'norgren'
start_urls = ['https://www.norgren.com/de/en/list']
def start_requests(self):
for start_url in self.start_urls:
yield scrapy.Request(start_url, callback=self.parse)
def parse(self, response):
link_list = []
for item in response.css(".match-height a.more-info::attr(href)").getall():
if not "/detail/" in item:
inner_page_link = response.urljoin(item)
link_list.append(inner_page_link)
yield {"target_url":inner_page_link}
for new_link in link_list:
yield scrapy.Request(new_link, callback=self.parse)
Expected output (randomly taken):
https://www.norgren.com/de/en/list/directional-control-valves/in-line-and-manifold-valves
https://www.norgren.com/de/en/list/pressure-switches/electro-mechanical-pressure-switches
https://www.norgren.com/de/en/list/pressure-switches/electronic-pressure-switches
https://www.norgren.com/de/en/list/directional-control-valves/sub-base-valves
https://www.norgren.com/de/en/list/directional-control-valves/non-return-valves
https://www.norgren.com/de/en/list/directional-control-valves/valve-islands
https://www.norgren.com/de/en/list/air-preparation/combination-units-frl
How to get all the product listings pages from the six categories?
import scrapy
class NorgrenSpider(scrapy.Spider):
name = 'norgren'
start_urls = ['https://www.norgren.com/de/en/list']
def start_requests(self):
for start_url in self.start_urls:
yield scrapy.Request(start_url)
def parse(self, response):
# check if there are items in the page
if response.xpath('//div[contains(#class, "item-list")]//div[#class="buttons"]/div[#class="more-information"]/a/#href'):
yield scrapy.Request(url=response.url, callback=self.get_links, dont_filter=True)
# follow "more info" buttons
for url in response.xpath('//a[text()="More info"]/#href').getall():
yield response.follow(url)
def get_links(self, response):
yield {"target_url": response.url}
next_page = response.xpath('//a[#class="next-button"]/#href').get()
if next_page:
yield response.follow(url=next_page, callback=self.get_links)
Maybe filter only pages that have at least one link to details? Here is an example of how to identify if a page meets the criteria you are searching for:
import scrapy
class NorgrenSpider(scrapy.Spider):
name = 'norgren'
start_urls = ['https://www.norgren.com/de/en/list']
def start_requests(self):
for start_url in self.start_urls:
yield scrapy.Request(start_url, callback=self.parse)
def parse(self, response):
link_list = []
more_info_items = response.css(
".match-height a.more-info::attr(href)").getall()
detail_items = [item for item in more_info_items if '/detail/' in item]
if len(detail_items) > 0:
print(f'This is a link you are searching for: {response.url}')
for item in more_info_items:
if not "/detail/" in item:
inner_page_link = response.urljoin(item)
link_list.append(inner_page_link)
yield {"target_url": inner_page_link}
for new_link in link_list:
yield scrapy.Request(new_link, callback=self.parse)
I only printed the link to the console, but you can figure out how to log it to where you need.

Scrapy - Can not do multiple callbacks

I am having problems going through multiple pages. Here is my class for scrapy code called quotes.
class quotes(scrapy.Spider):
name = 'quotes'
start_urls = ['http://books.toscrape.com/?']
def parse(self, response):
all_links = response.css('.nav-list ul li')
for links in all_links:
link = links.css('a::attr(href)').get()
yield response.follow(link, callback = self.books_detail)
def books_detail(self, response):
yas = {
'title':[],
'price':[],
'availability':[],
'category':[]
}
yas['category'].append(response.css('h1::text').extract())
all_divs = response.css('.col-lg-3')
for div in all_divs:
link = div.css('.product_pod a::attr(href)').get()
title = response.follow(link, callback = self.get_title)
yas['price'].append(div.css('.price_color::text').extract())
yas['availability'].append(div.css('.availability::text')[1].extract())
yield yas
def get_title(self,response):
print('testing')
title = response.css('h1::text').extract()
yield {"title":title}
So I use a response.follow to goto function books_details and in that function, I again call response.follow to call get_title. I get the 'title' from get_title and the rest of the details from the main page.
I can scrape the information just fine from the books_details function and I can get the link of the title page just fine as well from the code line.
link = div.css('.product_pod a::attr(href)').get()
But using the response.follow I can not go to the get_title function.
Any help would be appreciated. Thanks.
You should yield request, not run it directly, and use meta= to send data to next parser
yield response.follow(link, callback=self.get_title, meta={'item': yas})
and in next parser you can get it
yas = response.meta['item']
and then you can add new values and yield all data
yas["title"] = response.css('h1::text').extract()
yield yas
See other example in Scrapy yeild items from multiple requests
Doc: Request and Response, Request.meta special keys
Minimal working code which you can put in one file and run as normal script (python script.py) without creating project.
There are other changes.
You shouldn't put all books to one list but yield every book separatelly. Scrapy will keep all results and when you use option to save in csv then it will save all results.
For every book you should create new dictionary. If you use the same dictionary many time then it will ovewrite data and you may get many result with the same data.
import scrapy
class QuotesSpider(scrapy.Spider):
name = 'quotes'
start_urls = ['http://books.toscrape.com/']
def parse(self, response):
all_links = response.css('.nav-list ul li')
for links in all_links:
link = links.css('a::attr(href)').get()
yield response.follow(link, callback=self.books_detail)
def books_detail(self, response):
all_divs = response.css('.col-lg-3')
for div in all_divs:
# every book in separated dictionary and it has to be new dictionary - because it could overwrite old data
book = {
'category': response.css('h1::text').extract(),
'price': div.css('.price_color::text').extract()[0].strip(),
'availability': div.css('.availability::text')[1].extract().strip(),
}
link = div.css('.product_pod a::attr(href)').get()
yield response.follow(link, callback=self.get_title, meta={'item': book})
def get_title(self, response):
book = response.meta['item']
print('testing:', response.url)
book["title"] = response.css('h1::text').extract()[0].strip()
yield book
# --- run without project and save in `output.csv` ---
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
# save in file CSV, JSON or XML
'FEED_FORMAT': 'csv', # csv, json, xml
'FEED_URI': 'output.csv', #
})
c.crawl(QuotesSpider)
c.start()

1: my spider is giving me all the results in one liners on csv file

In the first place, If I use extract_first, scrapy gives me the first element of each page and if I run it like this it returns all the content I want but in one-liners.
In Second place, I can't make scrapy go to the links I just scraped and get information from inside these links, returning an empty csv file.
from scrapy import Spider
from companies.items import CompaniesItem
import re
class companiesSpider(Spider):
name = "companies"
allowed_domains = ['http://startup.miami',]
# Defining the list of pages to scrape
start_urls = ["http://startup.miami/category/startups/page/" + str(1*i) + "/" for i in range(0, 10)]
def parse(self, response):
rows = response.xpath('//*[#id="datafetch"]')
for row in rows:
link = row.xpath('.//h2/a/#href').extract()
name = row.xpath('.//header/h2/a/text()').extract()
item = CompaniesItem()
item['link'] = link
item['name'] = name
yield item
Your parse-method is not yielding any requests or items. In the part below we go through the pages and get the urls & names. In the parse_detail you can add additional data to the item.
Instead of hardcoding to 10 pages we check if there is a next page, and go through the parse again if it's the case.
from scrapy import Spider
from ..items import CompaniesItem
import scrapy
class CompaniesSpider(Spider):
name = "companies"
allowed_domains = ['startup.miami']
# Defining the list of pages to scrape
start_urls = ["http://startup.miami/category/startups/"]
def parse(self, response):
# get link & name and send item to parse_detail in meta
rows = response.xpath('//*[#id="datafetch"]/article')
for row in rows:
link = row.xpath('.//#href').extract_first()
name = row.xpath(
'.//*[#class="textoCoworking"]/text()').extract_first()
item = CompaniesItem()
item['link'] = link
item['name'] = name.strip()
yield scrapy.Request(link,
callback=self.parse_detail,
meta={'item': item})
# get the next page
next_page = response.xpath(
'//*[#class="next page-numbers"]/#href').extract_first()
if next_page:
yield scrapy.Request(next_page, callback=self.parse)
def parse_detail(self, response):
item = response.meta['item']
# add other details to the item here
yield item
To put the results in a csv file you can launch the scraper like this: scrapy crawl companies -o test_companies.csv

Python Scrapy keep getting same page link from next page button

i am trying to scrape amazon.com for the link of products that has more than 800 reviews but i keep getting the same page link from the next page button it keeps returning page 2 over and over again where i should get page 3,4 and so on
I HAVE SET A IF CONDITION TO SPILT AND CONVERT REVIEW STRING LIKE 1,020 TO INTEGER AND COMPARE IF GREATER THAN 800 OR NOT THEN BASED ON THAT VISIT THE PAGE
here is the code
# -*- coding: utf-8 -*-
import scrapy
from amazon.items import AmazonItem
from urlparse import urljoin
class AmazonspiderSpider(scrapy.Spider):
name = "amazonspider"
DOWNLOAD_DELAY = 1
start_urls = ['https://www.amazon.com/s/ref=lp_165993011_nr_n_0?fst=as%3Aoff&rh=n%3A165793011%2Cn%3A%21165795011%2Cn%3A165993011%2Cn%3A2514571011&bbn=165993011&ie=UTF8&qid=1493778423&rnid=165993011']
def parse(self, response):
SET_SELECTOR = '.a-carousel-card.acswidget-carousel__card'
for attr in response.css(SET_SELECTOR):
#print '\n\n', attr
item = AmazonItem()
review_selector = './/*[#class="acs_product-rating__review-count"]/text()'
link_selector = './/*[#class="a-link-normal"]/#href'
if attr.xpath(review_selector).extract_first():
if int(''.join(attr.xpath(review_selector).extract_first().split(','))) >= 800:
url = urljoin(response.url, attr.xpath(link_selector).extract_first())
item['LINKS'] = url
if url:
yield scrapy.Request(url, callback=self.parse_link, meta={'item': item})
next_page = './/span[#class="pagnRA"]/a[#id="pagnNextLink"]/#href'
next_page = response.xpath(next_page).extract_first()
print '\n\n', urljoin(response.url, next_page)
if next_page:
yield scrapy.Request(
urljoin(response.url, next_page),
callback=self.parse
)
def parse_link(self, response):
item = AmazonItem(response.meta['item'])
catselector = '.cat-link ::text'
defaultcatselector = '.nav-search-label ::text'
cat = response.css(catselector).extract_first()
if cat:
item['CATAGORY'] = cat
else:
item['CATAGORY'] = response.css(defaultcatselector).extract_first()
return item
here is the output when i was printing the next page link before calling the parse function recursively
and
and here is the screenshot from the next page selector of the page
where am i going wrong ?
Move the next page code block outside the loop.
class AmazonspiderSpider(scrapy.Spider):
name = "amazonspider"
DOWNLOAD_DELAY = 1
start_urls = ['https://www.amazon.com/s/ref=lp_165993011_nr_n_0?fst=as%3Aoff&rh=n%3A165793011%2Cn%3A%21165795011%2Cn%3A165993011%2Cn%3A2514571011&bbn=165993011&ie=UTF8&qid=1493778423&rnid=165993011']
def parse(self, response):
SET_SELECTOR = '.a-carousel-card.acswidget-carousel__card'
for attr in response.css(SET_SELECTOR):
#print '\n\n', attr
review_selector = './/*[#class="acs_product-rating__review-count"]/text()'
link_selector = './/*[#class="a-link-normal"]/#href'
if attr.xpath(review_selector).extract_first():
if int(''.join(attr.xpath(review_selector).extract_first().split(','))) >= 800:
url = urljoin(response.url, attr.xpath(link_selector).extract_first())
next_page = './/span[#class="pagnRA"]/a[#id="pagnNextLink"]/#href'
next_page = response.xpath(next_page).extract_first()
print '\n\n', urljoin(response.url, next_page)
if next_page:
yield scrapy.Request(
urljoin(response.url, next_page),
callback=self.parse
)

Scrapy parse list of urls, open one by one and parse additional data

I am trying to parse a site, an e-store. I parse a page with products, which are loaded with ajax, get urls of these products,and then parse additional info of each product following these parced urls.
My script gets the list of first 4 items on the page, their urls, makes the request, parses add info, but then not returning into the loop and so spider closes.
Could somebody help me in solving this? I'm pretty new to this kind of stuff, and ask here when totally stuck.
Here is my code:
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http.request import Request
from scrapy_sokos.items import SokosItem
class SokosSpider(Spider):
name = "sokos"
allowed_domains = ["sokos.fi"]
base_url = "http://www.sokos.fi/fi/SearchDisplay?searchTermScope=&searchType=&filterTerm=&orderBy=8&maxPrice=&showResultsPage=true&beginIndex=%s&langId=-11&sType=SimpleSearch&metaData=&pageSize=4&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&urlLangId=-11&categoryId=295401&storeId=10151"
start_urls = [
"http://www.sokos.fi/fi/SearchDisplay?searchTermScope=&searchType=&filterTerm=&orderBy=8&maxPrice=&showResultsPage=true&beginIndex=0&langId=-11&sType=SimpleSearch&metaData=&pageSize=4&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&urlLangId=-11&categoryId=295401&storeId=10151",
]
for i in range(0, 8, 4):
start_urls.append((base_url) % str(i))
def parse(self, response):
products = Selector(response).xpath('//div[#class="product-listing product-grid"]/article[#class="product product-thumbnail"]')
for product in products:
item = SokosItem()
item['url'] = product.xpath('//div[#class="content"]/a[#class="image"]/#href').extract()[0]
yield Request(url = item['url'], meta = {'item': item}, callback=self.parse_additional_info)
def parse_additional_info(self, response):
item = response.meta['item']
item['name'] = Selector(response).xpath('//h1[#class="productTitle"]/text()').extract()[0].strip()
item['description'] = Selector(response).xpath('//div[#id="kuvaus"]/p/text()').extract()[0]
euro = Selector(response).xpath('//strong[#class="special-price"]/span[#class="euros"]/text()').extract()[0]
cent = Selector(response).xpath('//strong[#class="special-price"]/span[#class="cents"]/text()').extract()[0]
item['price'] = '.'.join(euro + cent)
item['number'] = Selector(response).xpath('//#data-productid').extract()[0]
yield item
The AJAX requests you are simulating are caught by the Scrapy "duplicate url filter".
Set dont_filter to True when yielding a Request:
yield Request(url=item['url'],
meta={'item': item},
callback=self.parse_additional_info,
dont_filter=True)

Categories