scrapy - parsing multiple times - python

I am trying a parse a domain that whose contents are as follows
Page 1 - contains links to 10 articles
Page 2 - contains links to 10 articles
Page 3 - contains links to 10 articles and so on...
My job is to parse all the articles on all pages.
My thought - Parse all the pages and store links to all the articles in a list and then iterate the list and parse the links.
So far I have been able to iterate through the pages, parse and collect links to the articles. I am stuck on how to start parsing this list.
My Code so far...
import scrapy
class DhoniSpider(scrapy.Spider):
name = "test"
start_urls = [
"https://www.news18.com/cricketnext/newstopics/ms-dhoni.html"
]
count = 0
def __init__(self, *a, **kw):
super(DhoniSpider, self).__init__(*a, **kw)
self.headers= {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
self.seed_urls = []
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(url, headers=self.headers, callback=self.parse)
def parse(self, response):
DhoniSpider.count += 1
if DhoniSpider.count > 2 :
# there are many pages, this is just to stop parsing after 2 pages
return
for ul in response.css('div.t_newswrap'):
ref_links = ul.css('div.t_videos_box a.t_videosimg::attr(href)').getall()
self.seed_urls.extend(ref_links)
next_page = response.css('ul.pagination li a.nxt::attr(href)').get()
if next_page is not None:
next_page = response.urljoin(next_page)
yield scrapy.Request(next_page, headers=self.headers, callback=self.parse)
def iterate_urls(self):
for link in self.seed_urls:
link = response.urljoin(link)
yield scrapy.Request(link, headers=self.headers, callback=self.parse_page)
def parse_page(self, response):
print("called")
how to iterate my self.seed_urls list and parse them? From where should I call my iterate_urls function?

Usually in this cases there is no need to make external function like your iterate_urls:
def parse(self, response):
DhoniSpider.count += 1
if DhoniSpider.count > 2 :
# there are many pages, this is just to stop parsing after 2 pages
return
for ul in response.css('div.t_newswrap'):
for ref_link in ul.css('div.t_videos_box a.t_videosimg::attr(href)').getall():
yield scrapy.Request(response.urljoin(ref_link), headers=self.headers, callback=self.parse_page, priority = 5)
next_page = response.css('ul.pagination li a.nxt::attr(href)').get()
if next_page is not None:
next_page = response.urljoin(next_page)
yield scrapy.Request(next_page, headers=self.headers, callback=self.parse)
def parse_page(self, response):
print("called")

You don't have to collect the links to an array, you can just yield a scrapy.Request right after you parsed them. So instead of self.seed_urls.extend(ref_links), you can modify the following function:
def iterate_urls(self, seed_urls):
for link in seed_urls:
link = response.urljoin(link)
yield scrapy.Request(link, headers=self.headers, callback=self.parse_page)
and call it:
...
for ul in response.css('div.t_newswrap'):
ref_links = ul.css('div.t_videos_box a.t_videosimg::attr(href)').getall()
yield iterate_urls(ref_links)
...

Related

scrapy-splash CrawlSpider

I tried to scraping the driver names through scrapy-splash CrawlSpider on this site, but I constantly come across errors. After searching for ways to solve the problem, I came across github and just copied the latest code.
start_urls = ['http://www.huananzhi.com/html/1/184/185/index.html']
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url,callback=self.parse_item, args={'wait': 0.5}, meta={'real_url': url})
def _requests_to_follow(self, response):
if not isinstance(
response,
(HtmlResponse, SplashJsonResponse, SplashTextResponse)):
return
seen = set()
for n, rule in enumerate(self._rules):
links = [lnk for lnk in rule.link_extractor.extract_links(response)
if lnk not in seen]
if links and rule.process_links:
links = rule.process_links(links)
for link in links:
seen.add(link)
r = self._build_request(n, link)
yield rule.process_request(r, response)
def use_splash(self, request, response):
request.meta.update(splash={
'args': {
'wait': 15,
},
'endpoint': 'render.html',
})
return request
linkRule = LinkExtractor(restrict_xpaths='//article /div[1]/div[1]/div[2]/a[1]')
itemRule = Rule(linkRule, callback='parse_item', follow=True, process_request='use_splash')
rules = (
itemRule,
)
def parse_item(self, response):
item = HuananzhiItem()
item['name'] = response.xpath("//div[#class='tab-content']//div[1]/h2/text()").get()
yield item
It didn't work so I tried using scrapy.Spider
def start_requests(self):
url = 'http://www.huananzhi.com/html/1/184/185/index.html'
yield SplashRequest(url=url, callback=self.parse)
def parse(self, response):
links = response.xpath('//article/div[1]/div[1]/div[2]/a[1]/#href')
for link in links:
yield SplashRequest(url=link, callback=self.parse_item)
next_page = response.xpath('//section//li[4]//a[1]')
yield from response.follow(next_page, self.parse)
def parse_item(self, response):
item = HuananzhiItem()
item['name'] = response.xpath("//div[#class='tab-content']//div[1]/h2/text()").get()
yield item
i also use scrapy-user-agent
can anyone tell me how to get the item ? Sorry for such a stupid question, I'm a beginner
Thanks

Interpreting callbacks and cb_kwargs with scrapy

I'm in reach of a personal milestone with scrapy. The aim is to properly understand the callback and cb_kwargs, I've read the documentation countless times but I learn best with visual code, practice and an explanation.
I have an example scraper, the aim is to grab the book name, price and go into each book page and extract a single piece of information. I'm trying to understand how to properly get information on the next few pages also, which I know is dependent on understanding the operation of callbacks.
When I run my script It returns results only for the first page, how do I get the additional pages?
Here's my scraper:
class BooksItem(scrapy.Item):
items = Field(output_processor = TakeFirst())
price = Field(output_processor = TakeFirst())
availability = Field(output_processor = TakeFirst())
class BookSpider(scrapy.Spider):
name = "books"
start_urls = ['https://books.toscrape.com']
def start_request(self):
for url in self.start_url:
yield scrapy.Request(
url,
callback = self.parse)
def parse(self, response):
data = response.xpath('//div[#class = "col-sm-8 col-md-9"]')
for books in data:
loader = ItemLoader(BooksItem(), selector = books)
loader.add_xpath('items','.//article[#class="product_pod"]/h3/a//text()')
loader.add_xpath('price','.//p[#class="price_color"]//text()')
for url in [books.xpath('.//a//#href').get()]:
yield scrapy.Request(
response.urljoin(url),
callback = self.parse_book,
cb_kwargs = {'loader':loader})
for next_page in [response.xpath('.//div/ul[#class="pager"]/li[#class="next"]/a//#href').get()]:
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
def parse_book(self, response, loader):
book_quote = response.xpath('//p[#class="instock availability"]//text()').get()
loader.add_value('availability', book_quote)
yield loader.load_item()
I believe the issue is with the part where I try to grab the next few pages. I have tried an alternative approach using the following:
def start_request(self):
for url in self.start_url:
yield scrapy.Request(
url,
callback = self.parse,
cb_kwargs = {'page_count':0}
)
def parse(self, response, next_page):
if page_count > 3:
return
...
...
page_count += 1
for next_page in [response.xpath('.//div/ul[#class="pager"]/li[#class="next"]/a//#href').get()]:
yield response.follow(next_page, callback=self.parse, cb_kwargs = {'page_count': page_count})
However, I get the following error with this approach:
TypeError: parse() missing 1 required positional argument: 'page_cntr'
It should be start_requests, and self.start_urls (inside the function).
get() will return the first result, what you want is getall() in order to return a list.
There is no need for a for loop for the "next_page" part, it's not a mistake just unnecessary.
In the line for url in books.xpath you're getting every url twice, again not a mistake but still...
Here data = response.xpath('//div[#class = "col-sm-8 col-md-9"]') you don't select the books one by one, you select the whole books container, you can check that len(data.getall()) == 1.
book_quote = response.xpath('//p[#class="instock availability"]//text()').get() will return \n, look at the source at try to find out why (hint: 'i' tag).
Compare your code to this and see what I changed:
import scrapy
from scrapy import Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
class BooksItem(scrapy.Item):
items = Field(output_processor=TakeFirst())
price = Field(output_processor=TakeFirst())
availability = Field(output_processor=TakeFirst())
class BookSpider(scrapy.Spider):
name = "books"
start_urls = ['https://books.toscrape.com']
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url,
callback=self.parse)
def parse(self, response):
data = response.xpath('//div[#class = "col-sm-8 col-md-9"]//li')
for books in data:
loader = ItemLoader(BooksItem(), selector=books)
loader.add_xpath('items', './/article[#class="product_pod"]/h3/a//text()')
loader.add_xpath('price', './/p[#class="price_color"]//text()')
for url in books.xpath('.//h3/a//#href').getall():
yield scrapy.Request(
response.urljoin(url),
callback=self.parse_book,
cb_kwargs={'loader': loader})
next_page = response.xpath('.//div/ul[#class="pager"]/li[#class="next"]/a//#href').get()
if next_page:
yield response.follow(next_page, callback=self.parse)
def parse_book(self, response, loader):
# option 1:
book_quote = response.xpath('//p[#class="instock availability"]/i/following-sibling::text()').get().strip()
# option 2:
# book_quote = ''.join(response.xpath('//div[contains(#class, "product_main")]//p[#class="instock availability"]//text()').getall()).strip()
loader.add_value('availability', book_quote)
yield loader.load_item()

Failed to retrieve product listings pages from few categories

From this webpage I am trying to get that kind of link where different products are located. There are 6 categories having More info button which when I traverse recursively, I usually reach the target pages. This is one such product listings page I wish to get.
Please note that some of these pages have both product listing and more info buttons, which is why I failed to capture the product listing pages accurately.
Current spider looks like the following (fails to grab lots of product listings pages):
import scrapy
class NorgrenSpider(scrapy.Spider):
name = 'norgren'
start_urls = ['https://www.norgren.com/de/en/list']
def start_requests(self):
for start_url in self.start_urls:
yield scrapy.Request(start_url, callback=self.parse)
def parse(self, response):
link_list = []
for item in response.css(".match-height a.more-info::attr(href)").getall():
if not "/detail/" in item:
inner_page_link = response.urljoin(item)
link_list.append(inner_page_link)
yield {"target_url":inner_page_link}
for new_link in link_list:
yield scrapy.Request(new_link, callback=self.parse)
Expected output (randomly taken):
https://www.norgren.com/de/en/list/directional-control-valves/in-line-and-manifold-valves
https://www.norgren.com/de/en/list/pressure-switches/electro-mechanical-pressure-switches
https://www.norgren.com/de/en/list/pressure-switches/electronic-pressure-switches
https://www.norgren.com/de/en/list/directional-control-valves/sub-base-valves
https://www.norgren.com/de/en/list/directional-control-valves/non-return-valves
https://www.norgren.com/de/en/list/directional-control-valves/valve-islands
https://www.norgren.com/de/en/list/air-preparation/combination-units-frl
How to get all the product listings pages from the six categories?
import scrapy
class NorgrenSpider(scrapy.Spider):
name = 'norgren'
start_urls = ['https://www.norgren.com/de/en/list']
def start_requests(self):
for start_url in self.start_urls:
yield scrapy.Request(start_url)
def parse(self, response):
# check if there are items in the page
if response.xpath('//div[contains(#class, "item-list")]//div[#class="buttons"]/div[#class="more-information"]/a/#href'):
yield scrapy.Request(url=response.url, callback=self.get_links, dont_filter=True)
# follow "more info" buttons
for url in response.xpath('//a[text()="More info"]/#href').getall():
yield response.follow(url)
def get_links(self, response):
yield {"target_url": response.url}
next_page = response.xpath('//a[#class="next-button"]/#href').get()
if next_page:
yield response.follow(url=next_page, callback=self.get_links)
Maybe filter only pages that have at least one link to details? Here is an example of how to identify if a page meets the criteria you are searching for:
import scrapy
class NorgrenSpider(scrapy.Spider):
name = 'norgren'
start_urls = ['https://www.norgren.com/de/en/list']
def start_requests(self):
for start_url in self.start_urls:
yield scrapy.Request(start_url, callback=self.parse)
def parse(self, response):
link_list = []
more_info_items = response.css(
".match-height a.more-info::attr(href)").getall()
detail_items = [item for item in more_info_items if '/detail/' in item]
if len(detail_items) > 0:
print(f'This is a link you are searching for: {response.url}')
for item in more_info_items:
if not "/detail/" in item:
inner_page_link = response.urljoin(item)
link_list.append(inner_page_link)
yield {"target_url": inner_page_link}
for new_link in link_list:
yield scrapy.Request(new_link, callback=self.parse)
I only printed the link to the console, but you can figure out how to log it to where you need.

How to reverse list order in Python and stop yield upon return of None?

I am generating pagination links which I suspect exists with Python 3.x:
start_urls = [
'https://...',
'https://...' # list full of URLs
]
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url = url,
meta={'handle_httpstatus_list': [301]},
callback=self.parse,
)
def parse(self, response):
for i in range(1, 6):
url = response.url + '&pn='+str(i)
yield scrapy.Request(url, self.parse_item)
def parse_item(self, response):
# check if no results page
if response.xpath('//*[#id="searchList"]/div[1]').extract_first():
self.logger.info('No results found on %s', response.url)
return None
...
Those URLs will be processed by scrapy in parse_item. Now there are 2 problems:
The order is reverse and I do not understand why. It will request pagen umbers: 5,4,3,2,1 instead of 1,2,3,4,5
If the no results are found on page 1, the entire series could be stoped. Parse Item returns already "None", but the I guess I need to adapt the method "parse" to exit the for loop and continue. How?
The scrapy.Request you generate are running in parallel - In other words, there is guarantee for the order how you get the response as it depends on the server.
If some of the requests, depends on the response of of a request, you should yield those requests in its parse callback.
For example:
def parse(self, response):
url = response.url + '&pn='+str(1)
yield scrapy.Request(url, self.parse_item, cb_kwargs=dict(page=1, base_url=response.url))
def parse_item(self, response,page, base_url):
# check if no results page
if response.xpath('//*[#id="searchList"]/div[1]').extract_first():
if page < 6:
url = base_url + '&pn='+str(page+1)
yield scrapy.Request(url, self.parse_item, cb_kwargs=dict(base_url=base_url,page=page+1))
else:
# your code
yield ...

Scrapy - scrape of all of the item instead of 1 item

I need to scrape all of the items but only 1 item is scrape.
My code is working fine before but when I transfer it to other project which is same code this happens I don't know why
I need to get all of the items according to the page size in start_url
here's my working code
class HmSalesitemSpider(scrapy.Spider):
name = 'HM_salesitem'
allowed_domains = ['www2.hm.com']
start_urls = ['https://www2.hm.com/en_us/sale/shopbyproductladies/view-
all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-
size=3002']
def parse(self, response):
for product_item in response.css('li.product-item'):
url = "https://www2.hm.com/" + product_item.css('a::attr(href)').extract_first()
yield scrapy.Request(url=url, callback=self.parse_subpage)
def parse_subpage(self, response):
item = {
'title': response.xpath("normalize-space(.//h1[contains(#class, 'primary') and contains(#class, 'product-item-headline')]/text())").extract_first(),
'sale-price': response.xpath("normalize-space(.//span[#class='price-value']/text())").extract_first(),
'regular-price': response.xpath('//script[contains(text(), "whitePrice")]/text()').re_first("'whitePrice'\s?:\s?'([^']+)'"),
'photo-url': response.css('div.product-detail-main-image-container img::attr(src)').extract_first(),
'description': response.css('p.pdp-description-text::text').extract_first()
}
yield item
Please Help. Thank you
It seems you have problem with indents. Move yielding request to for loop:
def parse(self, response):
for product_item in response.css('li.product-item'):
url = "https://www2.hm.com/" + product_item.css('a::attr(href)').get()
yield scrapy.Request(url=url, callback=self.parse_subpage)
Or this is a bit cleared version:
def parse(self, response):
for link in response.css('li.product-item a::attr(href)').extract():
yield response.follow(link, self.parse_subpage)

Categories