Scraping the second level of URLs - Python

In the code below, the parse function executes approximately 32 times (the for loop finds 32 hrefs). In the same way, each sublink should be followed and its data scraped by the parse_next function (32 individual URLs). But parse_next executes only once / is never called, and the output CSV file is empty. Can anyone help me find where I made a mistake?
import scrapy
import logging
logger = logging.getLogger('mycustomlogger')
from ScrapyTestProject.items import ScrapytestprojectItem

class QuotesSpider(scrapy.Spider):
    name = "nestedurl"
    allowed_domains = ['www.grohe.in']
    start_urls = [
        'https://www.grohe.com/in/7780/bathroom/bathroom-faucets/essence/',

    def parse(self, response):
        logger.info("Parse function called on %s", response.url)
        for divs in response.css('div.viewport div.workspace div.float-box'):
            item = {'producturl': divs.css('a::attr(href)').extract_first(),
                    'imageurl': divs.css('a img::attr(src)').extract_first(),
                    'description': divs.css('a div.text::text').extract() + divs.css('a span.nowrap::text').extract()}
            next_page = response.urljoin(item['producturl'])
            #logger.info("This is an information %s", next_page)
            yield scrapy.Request(next_page, callback=self.parse_next, meta={'item': item})
            #yield item

    def parse_next(self, response):
        item = response.meta['item']
        logger.info("Parse function called on2 %s", response.url)
        item['headline'] = response.css('div#content a.headline::text').extract()
        return item
        #response.css('div#product-variants a::attr(href)').extract()

OK, so a few things go wrong:
Indentation
The start_urls list is not closed with a ]
allowed_domains uses the domain extension .in while you want to scrape .com
Working code below:
import scrapy
import logging

class QuotesSpider(scrapy.Spider):
    name = "nestedurl"
    allowed_domains = ['www.grohe.com']
    start_urls = [
        'https://www.grohe.com/in/7780/bathroom/bathroom-faucets/essence/'
    ]

    def parse(self, response):
        # logger.info("Parse function called on %s", response.url)
        for divs in response.css('div.viewport div.workspace div.float-box'):
            item = {'producturl': divs.css('a::attr(href)').extract_first(),
                    'imageurl': divs.css('a img::attr(src)').extract_first(),
                    'description': divs.css('a div.text::text').extract() + divs.css('a span.nowrap::text').extract()}
            next_page = response.urljoin(item['producturl'])
            # logger.info("This is an information %s", next_page)
            yield scrapy.Request(next_page, callback=self.parse_next, meta={'item': item})
            # yield item

    def parse_next(self, response):
        item = response.meta['item']
        # logger.info("Parse function called on2 %s", response.url)
        item['headline'] = response.css('div#content a.headline::text').extract()
        return item
        # response.css('div#product-variants a::attr(href)').extract()
Note: deleted some logging / item pipelines as these are not defined on my machine.
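If you do want to keep the ScrapytestprojectItem import from the question, a minimal ScrapyTestProject/items.py along these lines should be enough. The field names below are only assumed from the keys the spider fills in, not taken from the real project:

# Hypothetical ScrapyTestProject/items.py sketch -- field names are assumed
# from the dict keys the spider populates; adjust to match your project.
import scrapy

class ScrapytestprojectItem(scrapy.Item):
    producturl = scrapy.Field()
    imageurl = scrapy.Field()
    description = scrapy.Field()
    headline = scrapy.Field()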

Related

next page crawl in Scrapy

I am trying to get some data from the website, but my spider is not crawling to the next page even though there is a proper pagination link.
import scrapy

class NspiderSpider(scrapy.Spider):
    name = "nspider"
    allowed_domains = ["elimelechlab.yale.edu/"]
    start_urls = ["https://elimelechlab.yale.edu/pub"]

    def parse(self, response):
        title = response.xpath(
            '//*[@class="views-field views-field-title"]/span/text()'
        ).extract()
        doi_link = response.xpath(
            '//*[@class="views-field views-field-field-doi-link"]//a[1]/@href'
        ).extract()

        yield {"paper_title": title, "doi_link": doi_link}

        next_page = response.xpath(
            '//*[@title="Go to next page"]/@href'
        ).extract_first()  # extracting next page link

        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
PS: I don't want to use LinkExtractor.
Any help would be appreciated.
Nothing wrong with your next_page logic; the code just never reaches it because the yield for the item is at the same indentation level. Try the following approach:
import scrapy

class NspiderSpider(scrapy.Spider):
    name = "nspider"
    allowed_domains = ["elimelechlab.yale.edu"]
    start_urls = ["https://elimelechlab.yale.edu/pub"]

    def parse(self, response):
        for view in response.css('div.views-row'):
            yield {
                'paper_title': view.css('div.views-field-title span.field-content::text').get(),
                'doi_link': view.css('div.views-field-field-doi-link div.field-content a::attr(href)').get()
            }

        next_page = response.xpath(
            '//*[@title="Go to next page"]/@href'
        ).extract_first()  # extracting next page link

        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
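Not part of the original answer, but if you want to run this spider outside a full Scrapy project and still get a CSV out of it, the same CrawlerProcess pattern used in the nofollow answer further down works here too. A minimal sketch, assuming NspiderSpider is defined in the same file and the feed settings are up to you:

# Minimal standalone runner sketch: crawls NspiderSpider and writes the
# yielded dicts to papers.csv (file name and settings are assumptions).
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'papers.csv',
})
process.crawl(NspiderSpider)
process.start()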

Scrapy - scraping all of the items instead of just 1 item

I need to scrape all of the items, but only 1 item is scraped.
My code was working fine before, but when I transferred it to another project with the same code, this happens and I don't know why.
I need to get all of the items according to the page size in start_urls.
Here's my code, which was working before:
class HmSalesitemSpider(scrapy.Spider):
    name = 'HM_salesitem'
    allowed_domains = ['www2.hm.com']
    start_urls = ['https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=3002']

    def parse(self, response):
        for product_item in response.css('li.product-item'):
            url = "https://www2.hm.com/" + product_item.css('a::attr(href)').extract_first()
        yield scrapy.Request(url=url, callback=self.parse_subpage)

    def parse_subpage(self, response):
        item = {
            'title': response.xpath("normalize-space(.//h1[contains(@class, 'primary') and contains(@class, 'product-item-headline')]/text())").extract_first(),
            'sale-price': response.xpath("normalize-space(.//span[@class='price-value']/text())").extract_first(),
            'regular-price': response.xpath('//script[contains(text(), "whitePrice")]/text()').re_first("'whitePrice'\s?:\s?'([^']+)'"),
            'photo-url': response.css('div.product-detail-main-image-container img::attr(src)').extract_first(),
            'description': response.css('p.pdp-description-text::text').extract_first()
        }
        yield item
Please Help. Thank you
It seems you have a problem with indents. Move the request yield into the for loop:
def parse(self, response):
    for product_item in response.css('li.product-item'):
        url = "https://www2.hm.com/" + product_item.css('a::attr(href)').get()
        yield scrapy.Request(url=url, callback=self.parse_subpage)
Or here is a slightly cleaner version:
def parse(self, response):
    for link in response.css('li.product-item a::attr(href)').extract():
        yield response.follow(link, self.parse_subpage)
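Worth adding: response.follow resolves relative URLs against response.url, so the manual "https://www2.hm.com/" prefix from the original parse is unnecessary, and it can even take the <a> selector itself. A small sketch of that variant:

def parse(self, response):
    # response.follow accepts the <a> selector directly and resolves the
    # relative href against response.url, so no manual domain prefix is needed
    for anchor in response.css('li.product-item a'):
        yield response.follow(anchor, callback=self.parse_subpage)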

How to determine if a link is nofollow or dofollow in Scrapy?

So, that was the question. I have a Scrapy bot that follows internal links of a given site and writes their links, status codes and anchor texts into a database. But I'm struggling to grab each link's follow status. Is there any way to grab that rel=nofollow/dofollow information? Here's my code, if anybody wonders:
class MySpider(CrawlSpider):
    name = 'spydiiiii'
    start_urls = [urlToScrape]
    rules = (
        Rule(
            LxmlLinkExtractor(
                allow=(urlToScrape),
                deny=(
                    "google.com",
                    "facebook.com",
                    "pinterest.com",
                    "facebook.com",
                    "digg.com",
                    "twitter.com",
                    "stumbleupon.com",
                    "linkedin.com"
                ),
                unique=True
            ),
            callback="parse_items",
            follow=True,
        ),
    )

    def parse_items(self, response):
        sel = Selector(response)
        items = []
        item = InternallinkItem()
        referring_url = response.request.headers.get('Referer').decode('utf-8')
        item["referring_url"] = referring_url
        anchor = response.meta.get('link_text')
        item["anchor_text"] = " ".join(anchor.split())
        item["current_url"] = response.url
        item['status'] = response.status
        items.append(item)
        return items
Thanks in advance
I use LxmlLinkExtractor manually to get Link objects, which carry the nofollow information.
In parse() I get the links from the first page, create an item with the 'nofollow' (and other) information, and issue a Request for each URL (with the item in meta) to get its status and referer.
The new Requests use parse_item() to get the item from meta and add the status.
parse_item() also uses the extractor to get the new links on that page, creates new items, and issues Requests with parse_item() again.
import scrapy
from scrapy.http import Request
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

class MySpider(scrapy.Spider):
    name = 'myspider'
    #allowed_domains = ['http://quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com']
    #start_urls = ['http://127.0.0.1:5000/'] # for Flask example

    extractor = LxmlLinkExtractor(
        allow=('http://quotes.toscrape.com'),
        #allow=('http://127.0.0.1:5000'), # for Flask example
        deny=(
            'google.com',
            'facebook.com',
            'pinterest.com',
            'facebook.com',
            'digg.com',
            'twitter.com',
            'stumbleupon.com',
            'linkedin.com'
        ),
        unique=True,
    )

    def parse(self, response):
        print('parse url:', response.url)

        # use LxmlLinkExtractor manually
        for link in self.extractor.extract_links(response):
            #print('link:', link)
            item = {}
            item['nofollow'] = link.nofollow
            item['anchor_text'] = link.text
            item['current_url'] = link.url
            #item['referring_url'] = response.url
            yield Request(link.url, meta={'item': item}, callback=self.parse_item)

    def parse_item(self, response):
        print('parse_item url:', response.url)

        item = response.meta['item']
        item['referring_url'] = response.request.headers.get('Referer')
        #item['referring_url'] = response.request.url
        item['status'] = response.status
        yield item

        # use LxmlLinkExtractor manually with new links
        for link in self.extractor.extract_links(response):
            #print('link:', link)
            item = {}
            item['nofollow'] = link.nofollow
            item['anchor_text'] = link.text
            item['current_url'] = link.url
            #item['referring_url'] = response.url
            yield Request(link.url, meta={'item': item}, callback=self.parse_item)

# --- run spider without project ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'output.csv',
})
c.crawl(MySpider)
c.start()
EDIT:
Because I don't know any page with rel="nofollow", I created a simple Flask app to test the code against.
from flask import Flask

app = Flask(__name__)

@app.route('/')
def index():
    # one plain link and one rel="nofollow" link for the spider to classify
    return '<a href="/test1">Test 1</a> | <a href="/test2" rel="nofollow">Test 2</a>'

@app.route('/test1')
def test1():
    return 'Main Page'

@app.route('/test2')
def test2():
    return 'Main Page'

if __name__ == '__main__':
    app.run(debug=True)
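Not part of the original answer, but a quick standalone illustration of what the extractor exposes: each Link object returned by LxmlLinkExtractor carries url, text, fragment and nofollow attributes, which is where item['nofollow'] above comes from. A small sketch against inline HTML (the markup mirrors the Flask test page):

# Standalone sketch (not from the original answer): inspect the nofollow
# flag on the Link objects that LxmlLinkExtractor returns.
from scrapy.http import HtmlResponse
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

html = b'<a href="/test1">Test 1</a> | <a href="/test2" rel="nofollow">Test 2</a>'
response = HtmlResponse(url='http://127.0.0.1:5000/', body=html, encoding='utf-8')

for link in LxmlLinkExtractor(unique=True).extract_links(response):
    # prints e.g. "http://127.0.0.1:5000/test1 False" and "http://127.0.0.1:5000/test2 True"
    print(link.url, link.nofollow)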

Scrape nested URLs using Scrapy

I am trying to scrape this web page:
https://www.grohe.com/in/7780/bathroom/bathroom-faucets/essence/
I tried different ways, but every time it gives me a syntax error. I don't know much Python or Scrapy. Can anyone help me?
My requirements are:
In the header section of the page there is a background image, some description and 2 product-related images.
In the Product Range section there are a number of images. I would like to go through all of the images and scrape the individual product details.
The structure is like this:
Here is my code so far:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "plumber"
    start_urls = [
        'https://www.grohe.com/in/7780/bathroom/bathroom-faucets/essence/',
    ]

    def parse(self, response):
        for divs in response.css('div#product-variants div.viewport div.workspace div.float-box'):
            yield {
                #response.css('div#product-variants a::attr(href)').extract()
                'producturl': divs.css('a::attr(href)').extract(),
                'imageurl': divs.css('a img::attr(src)').extract(),
                'description' : divs.css('a div.text::text').extract() + divs.css('a span.nowrap::text').extract(),
                next_page = producturl
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse)
            }
You should take the next_page yield out of your item.
In general you can iterate through the products, build some payload and carry it over in your request's meta parameter, like so:
def parse(self, response):
    for divs in response.css('div#product-variants div.viewport div.workspace div.float-box'):
        # extract_first() for producturl so urljoin() below gets a string, not a list
        item = {'producturl': divs.css('a::attr(href)').extract_first(),
                'imageurl': divs.css('a img::attr(src)').extract(),
                'description' : divs.css('a div.text::text').extract() + divs.css('a span.nowrap::text').extract()}
        next_page = response.urljoin(item['producturl'])
        yield scrapy.Request(next_page, callback=self.parse_page, meta={'item': item})

def parse_page(self, response):
    """This is the individual product page"""
    item = response.meta['item']
    item['something_new'] = 'some_value'
    return item
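As a side note (not part of the original answer): recent Scrapy releases (1.7 and later) also support cb_kwargs, which hands the carried data to the callback as a real keyword argument instead of going through meta. A minimal sketch of the same pattern:

def parse(self, response):
    for divs in response.css('div#product-variants div.viewport div.workspace div.float-box'):
        # same payload as above, trimmed to one field for brevity
        item = {'producturl': divs.css('a::attr(href)').extract_first()}
        next_page = response.urljoin(item['producturl'])
        # cb_kwargs delivers `item` to parse_page as a keyword argument
        yield scrapy.Request(next_page, callback=self.parse_page, cb_kwargs={'item': item})

def parse_page(self, response, item):
    """This is the individual product page"""
    item['something_new'] = 'some_value'
    return item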

Why is Scrapy not going to the next page and only getting the first page's items?

Earlier I also had one rule, i.e.:
if domains in departments.keys(): rules = (Rule(SgmlLinkExtractor(allow=("?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=d_d" % (keyword, departments.get(domains)),), restrict_xpaths=('//li[@class="btn-nextResults"]'),), callback='parse', follow=True),)
but I removed it, as it was calling the parse method, which is not recommended.
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from walmart_sample.items import WalmartSampleItem

class MySpider(CrawlSpider):
    name = "my_spider"
    domains = ['All Departments']
    keyword = 'Laptop'
    departments = {"All Departments": "0", "Apparel": "5438", "Auto": "91083", "Baby": "5427", "Beauty": "1085666", "Books": "3920", "Electronics": "3944", "Gifts": "1094765", "Grocery": "976759", "Health": "976760", "Home": "4044", "Home Improvement": "1072864", "Jwelery": "3891", "Movies": "4096", "Music": "4104", "Party": "2637", "Patio": "5428", "Pets": "5440", "Pharmacy": "5431", "Photo Center": "5426", "Sports": "4125", "Toys": "4171", "Video Games": "2636"}
    allowed_domains = ['walmart.com']
    denied_domains = ['reviews.walmart.com', 'facebook.com', 'twitter.com']

    def start_requests(self):
        for domain in self.domains:
            if domain in self.departments:
                url = 'http://www.walmart.com/search/search-ng.do?search_query=%s&ic=16_0&Find=Find&search_constraint=%s' % (self.keyword, self.departments.get(domain))
                yield Request(url)

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a[@class="prodLink ListItemLink"]/@href')
        for link in links:
            href = link.extract()
            yield Request('http://www.walmart.com/' + href, self.parse_data)
        next_link = hxs.select('//li[@class="btn-nextResults"]/@href').extract()
        if next_link:
            yield Request('http://www.walmart.com/search/search-ng.do' + next_link, self.parse)
        else:
            print "last Page"

    def parse_data(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        walmart = WalmartSampleItem()
        walmart['Title'] = hxs.select('//h1[@class="productTitle"]/text()').extract()
        walmart['Price'] = hxs.select('//span[@class="bigPriceText1"]/text()').extract() + hxs.select('//span[@class="smallPriceText1"]/text()').extract()
        walmart['Availability'] = hxs.select('//span[@id="STORE_AVAIL"]/text()').extract()
        walmart['Description'] = hxs.select('//span[@class="ql-details-short-desc"]/text()').extract()
        items.append(walmart)
        return items
I think you're simply missing an "/a" step in your XPath for next page links:
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    links = hxs.select('//a[@class="prodLink ListItemLink"]/@href')
    for link in links:
        href = link.extract()
        yield Request('http://www.walmart.com/' + href, self.parse_data)
    #
    # here
    # |
    # v
    next_link = hxs.select('//li[@class="btn-nextResults"]/a/@href').extract()
    if next_link:
        # and as hxs.select() will return a list, you should select the first element
        yield Request('http://www.walmart.com/search/search-ng.do' + next_link[0], self.parse)
    else:
        print "last Page"
