Scrapy: How to pass an item between methods using meta - python

I'm new to scrapy and python and I'm trying to pass the item item['author'] in parse_quotes to the next parse method parse_bio
I tried the request.meta and response.meta approach as shown in the scrapy documentation but without succes. see code as per below.
import scrapy
from tutorial.items import QuotesItem
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = [
'http://quotes.toscrape.com/login',
#'http://quotes.toscrape.com/page/2',
]
# Scraping a site with login
# Important: Cookie settings must be "True" to keep the login session alive
custom_settings = {'COOKIES_ENABLED': True}
def parse(self, response):
return scrapy.FormRequest.from_response(
response,
formdata={'username': 'john', 'password': 'secret'},
callback=self.parse_quotes
)
def parse_quotes(self, response):
for sel in response.css('div.quote'):
item = QuotesItem()
item['text'] = sel.css('span.text::text').get()
item['author'] = sel.css('small.author::text').get()
item['tags'] = sel.css('div.tags a.tag::text').getall()
item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
item['author_bio_link'] = sel.css('.author + a')
yield item
# follow the detail links # shortcut
# vertical crawling
for a in item['author_bio_link']:
yield response.follow(a, callback = self.parse_bio)
def parse_bio(self, response):
item = QuotesItem()
item['author_born'] = response.css('p span::text').getall()
item['author_born'] = item['author_born'][:2]
item['author_bio'] = response.css('div.author-description ::text').get().strip()
yield item
# follow pagination links # shortcut
# horizontal crawling
for a in response.css('li.next a'):
yield response.follow(a, callback = self.parse_quotes)
I expect to get item['author'] from parse_quotes passed to parse_bio

I suggest you to use meta in this way:
def parse_quotes(self, response):
for sel in response.css('div.quote'):
item = QuotesItem()
item['text'] = sel.css('span.text::text').get()
item['author'] = sel.css('small.author::text').get()
item['tags'] = sel.css('div.tags a.tag::text').getall()
item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
item['author_bio_link'] = sel.css('.author + a')
yield item
# follow the detail links # shortcut
# vertical crawling
for a in item['author_bio_link']:
yield response.follow(a, self.parse_bio,
meta={'author': item['author']}) # <- you set it here
def parse_bio(self, response):
item = QuotesItem()
item['author_born'] = response.css('p span::text').getall()
item['author_born'] = item['author_born'][:2]
item['author_data'] = response.meta.get('author') # <- you get it here
item['author_bio'] = response.css('div.author-description ::text').get().strip()
yield item
# follow pagination links # shortcut
# horizontal crawling
for a in response.css('li.next a'):
yield response.follow(a, callback = self.parse_quotes)

Related

Interpreting callbacks and cb_kwargs with scrapy

I'm in reach of a personal milestone with scrapy. The aim is to properly understand the callback and cb_kwargs, I've read the documentation countless times but I learn best with visual code, practice and an explanation.
I have an example scraper, the aim is to grab the book name, price and go into each book page and extract a single piece of information. I'm trying to understand how to properly get information on the next few pages also, which I know is dependent on understanding the operation of callbacks.
When I run my script It returns results only for the first page, how do I get the additional pages?
Here's my scraper:
class BooksItem(scrapy.Item):
items = Field(output_processor = TakeFirst())
price = Field(output_processor = TakeFirst())
availability = Field(output_processor = TakeFirst())
class BookSpider(scrapy.Spider):
name = "books"
start_urls = ['https://books.toscrape.com']
def start_request(self):
for url in self.start_url:
yield scrapy.Request(
url,
callback = self.parse)
def parse(self, response):
data = response.xpath('//div[#class = "col-sm-8 col-md-9"]')
for books in data:
loader = ItemLoader(BooksItem(), selector = books)
loader.add_xpath('items','.//article[#class="product_pod"]/h3/a//text()')
loader.add_xpath('price','.//p[#class="price_color"]//text()')
for url in [books.xpath('.//a//#href').get()]:
yield scrapy.Request(
response.urljoin(url),
callback = self.parse_book,
cb_kwargs = {'loader':loader})
for next_page in [response.xpath('.//div/ul[#class="pager"]/li[#class="next"]/a//#href').get()]:
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
def parse_book(self, response, loader):
book_quote = response.xpath('//p[#class="instock availability"]//text()').get()
loader.add_value('availability', book_quote)
yield loader.load_item()
I believe the issue is with the part where I try to grab the next few pages. I have tried an alternative approach using the following:
def start_request(self):
for url in self.start_url:
yield scrapy.Request(
url,
callback = self.parse,
cb_kwargs = {'page_count':0}
)
def parse(self, response, next_page):
if page_count > 3:
return
...
...
page_count += 1
for next_page in [response.xpath('.//div/ul[#class="pager"]/li[#class="next"]/a//#href').get()]:
yield response.follow(next_page, callback=self.parse, cb_kwargs = {'page_count': page_count})
However, I get the following error with this approach:
TypeError: parse() missing 1 required positional argument: 'page_cntr'
It should be start_requests, and self.start_urls (inside the function).
get() will return the first result, what you want is getall() in order to return a list.
There is no need for a for loop for the "next_page" part, it's not a mistake just unnecessary.
In the line for url in books.xpath you're getting every url twice, again not a mistake but still...
Here data = response.xpath('//div[#class = "col-sm-8 col-md-9"]') you don't select the books one by one, you select the whole books container, you can check that len(data.getall()) == 1.
book_quote = response.xpath('//p[#class="instock availability"]//text()').get() will return \n, look at the source at try to find out why (hint: 'i' tag).
Compare your code to this and see what I changed:
import scrapy
from scrapy import Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
class BooksItem(scrapy.Item):
items = Field(output_processor=TakeFirst())
price = Field(output_processor=TakeFirst())
availability = Field(output_processor=TakeFirst())
class BookSpider(scrapy.Spider):
name = "books"
start_urls = ['https://books.toscrape.com']
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url,
callback=self.parse)
def parse(self, response):
data = response.xpath('//div[#class = "col-sm-8 col-md-9"]//li')
for books in data:
loader = ItemLoader(BooksItem(), selector=books)
loader.add_xpath('items', './/article[#class="product_pod"]/h3/a//text()')
loader.add_xpath('price', './/p[#class="price_color"]//text()')
for url in books.xpath('.//h3/a//#href').getall():
yield scrapy.Request(
response.urljoin(url),
callback=self.parse_book,
cb_kwargs={'loader': loader})
next_page = response.xpath('.//div/ul[#class="pager"]/li[#class="next"]/a//#href').get()
if next_page:
yield response.follow(next_page, callback=self.parse)
def parse_book(self, response, loader):
# option 1:
book_quote = response.xpath('//p[#class="instock availability"]/i/following-sibling::text()').get().strip()
# option 2:
# book_quote = ''.join(response.xpath('//div[contains(#class, "product_main")]//p[#class="instock availability"]//text()').getall()).strip()
loader.add_value('availability', book_quote)
yield loader.load_item()

Scrapy callback asynchronous

def parse(self, response):
category_names = []
category_urls = []
for item in response.css("#zg_browseRoot ul li"):
category_url = item.css("a").css(self.CSS_URL).extract()
category_name = item.css("a").css(self.CSS_TEXT).extract()
category_url = [
self.parse_url(category_url, 4) for category_url in category_url
]
(category_url,) = category_url
(category_name,) = category_name
category_names.append(category_name)
category_urls.append(category_url)
for c_name, url in zip(category_names, category_urls):
self.c_name = [c_name]
yield scrapy.Request(url, callback=self.parse_categories)
def parse_url(self, url, number):
parse = urlparse(url)
split = parse.path.split("/")[:number]
return f'{self.BASE_URL}{"/".join(split)}'
def parse_categories(self, response):
sub_names = []
sub_urls = []
for item in response.css("#zg_browseRoot ul ul li"):
sub_name = item.css("a").css(self.CSS_TEXT).extract()
sub_url = item.css("a").css(self.CSS_URL).extract()
sub_url = [self.parse_url(sub_url, 5) for sub_url in sub_url]
(sub_url,) = sub_url
(sub_name,) = sub_name
sub_names.append(sub_name)
sub_urls.append(sub_url)
for sub_name, url in zip(sub_names, sub_urls):
self.sub_name = [sub_name]
# print("{}: {}, {}".format(url, self.sub_name, self.c_name))
yield scrapy.Request(url, callback=self.parse_subcategories)
def parse_subcategories(self, response):
url = self.parse_url(response.request.url, 5)
print(f"{self.c_name}, {self.sub_name}, {url}")
Hello everyone,
I'm having an issue with my Scrapy approach. I'm trying to scrape page which has categories and subcategories in which are items. I want to include category and subcategory with each item scraped.
The problem is that the Scrapys callback function is asynchronous and zipping the URLs with names doesn't seem to work, because the for loop is processed first, URLs are stored in a generator and names are staying behind. Can anyone help me to work around this?
Thanks in advance,
Daniel.
You can pass arbitrary data along with the requests by using th cb_kwargs parameter. You can read about the details here.
Here is a simplified example:
def parse(self, response):
rows = response.xpath('//div[#id="some-element"]')
for row in rows:
request_url = row.xpath('a/#href').get()
category = row.xpath('a/text()').get()
yield Request(
url=request_url,
callback=self.parse_category,
cb_kwargs={'category': category}
)
def parse_category(self, response, category): # Notice category arg in the func
# Process here
yield item
The data inserted in cb_kwargs is passed as a keyword arg into the callback function, so the key in the dict must match the name of the argument in the method definiton.
cb_kwargs were introduced in Scrapy v1.7, if you are using an older version you should use the meta param. You can read about it here, notice that the use is slightly different.

How to get the proxy used for each request in an item with Scrapy?

I'm using a DOWNLOADER_MIDDLEWARES for rotating proxies with an scrapy.Spider and I would like to get an item , i.e. item['proxy_used'], for the proxy used for each request.
I guess it could be possible to get the Proxy over the "Stats Collector" but I'm new to Python and Scrapy and till now I haven't been able to come across with a solution.
import scrapy
from tutorial.items import QuotesItem
class QuotesSpider(scrapy.Spider):
name = "quotes"
allowed_domains = ["quotes.toscrape.com"]
start_urls = [
'http://quotes.toscrape.com/',
]
def parse_quotes(self, response):
for sel in response.css('div.quote'):
item = QuotesItem()
item['text'] = sel.css('span.text::text').get()
item['author'] = sel.css('small.author::text').get()
item['tags'] = sel.css('div.tags a.tag::text').getall()
item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
item['proxy_used'] = ??? <-- PROXY USED BY REQUEST - "HOW TO???"
yield item
# follow pagination links #shortcut
for a in response.css('li.next a'):
yield response.follow(a, callback = self.parse_quotes)
You can use the response object to access the proxy used. Like below
response.meta.get("proxy")
Updated in your code too.
import scrapy
from tutorial.items import QuotesItem
class QuotesSpider(scrapy.Spider):
name = "quotes"
allowed_domains = ["quotes.toscrape.com"]
start_urls = [
'http://quotes.toscrape.com/',
]
def parse_quotes(self, response):
for sel in response.css('div.quote'):
item = QuotesItem()
item['text'] = sel.css('span.text::text').get()
item['author'] = sel.css('small.author::text').get()
item['tags'] = sel.css('div.tags a.tag::text').getall()
item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
item['proxy_used'] = response.meta.get("proxy")
yield item
# follow pagination links #shortcut
for a in response.css('li.next a'):
yield response.follow(a, callback = self.parse_quotes)

Multiple pages per item - using scraped links

My spide looks like this/;
class ScrapeMovies(scrapy.Spider):
start_urls = [
'https://www.trekearth.com/members/page1.htm?sort_by=md'
]
def parse(self, response):
for row in response.xpath('//table[#class="member-table"]//tr[position() > 1]'):
item = loopitem()
website = row.xpath('./td[2]//a/#href/text()').extract_first()
item['name'] = row.xpath('./td[2]//a/text()').extract_first()
yield item
# This part is responsible for scraping all of the pages on a start url commented out for convinience
# next_page=response.xpath('//div[#class="page-nav-btm"]/ul/li[last()]/a/#href').extract_first()
# if next_page is not None:
# next_page=response.urljoin(next_page)
# yield scrapy.Request(next_page, callback=self.parse)
What it does as of know it scrapes the table (see the starting url). I want it to then go the link (members name column) and then extract some informations from this link (link is e.g. https://www.trekearth.com/members/monareng/) and the return this as an item.
How should i approach this?
If anything is unclear please do not hesitate to ask for clarification.
EDIT:
nowy my code looks as follows (however still does not work):
class ScrapeMovies(scrapy.Spider):
name='final'
start_urls = [
'https://www.trekearth.com/members/page1.htm?sort_by=md'
]
def parse(self, response):
for row in response.xpath('//table[#class="member-table"]//tr[position() > 1]'):
item = FinalItem()
website = row.xpath('./td[2]//a/#href/text()').extract_first()
item['name'] = row.xpath('./td[2]//a/text()').extract_first()
request = scrapy.Request(website,
callback=self.parse_page2)
request.meta['item'] = item
return request
def parse_page2(self, response):
item = response.meta['item']
item['other_url'] = response.url
item['groups'] = response.xpath('//div[#class="groups-btm"]/ul/li/text()').extract_first()
return item
Use meta field to put item forward to next callback
def parse_page1(self, response):
item = MyItem(main_url=response.url)
request = scrapy.Request("http://www.example.com/some_page.html",
callback=self.parse_page2)
request.meta['item'] = item
return request
def parse_page2(self, response):
item = response.meta['item']
item['other_url'] = response.url
return item
UPD: to process all rows use a yield in your loop
for row in response.xpath('//table[#class="member-table"]//tr[position() > 1]'):
item = FinalItem()
website = row.xpath('./td[2]//a/#href/text()').extract_first()
item['name'] = row.xpath('./td[2]//a/text()').extract_first()
request = scrapy.Request(website,
callback=self.parse_page2)
request.meta['item'] = item
yield request

How to determine if a link is nofollow or dofollow in Scrapy?

So, that was the question. I have a Scrapy bot that follows internal links of a given site, writes its links, status code and anchor text into database. But I'm struggling to grab the link's follow status. Is there any way to grab that rel=nofollow/dofollow information? That's my code if anybody wonders;
class MySpider(CrawlSpider):
name = 'spydiiiii'
start_urls = [urlToScrape]
rules = (
Rule (
LxmlLinkExtractor(
allow=(urlToScrape),
deny=(
"google.com",
"facebook.com",
"pinterest.com",
"facebook.com",
"digg.com",
"twitter.com",
"stumbleupon.com",
"linkedin.com"
),
unique=True
),
callback="parse_items",
follow= True,
),
)
def parse_items(self, response):
sel = Selector(response)
items = []
item = InternallinkItem()
referring_url = response.request.headers.get('Referer').decode('utf-8')
item["referring_url"] = referring_url
anchor = response.meta.get('link_text')
item["anchor_text"] = " ".join(anchor.split())
item["current_url"] = response.url
item['status'] = response.status
items.append(item)
return items
Thanks in advance
I use LxmlLinkExtractor manually to get Link objects which have nofollow information.
In parse() I get links from first page and create item with 'nofollow' (and other) informations, and use Requests with this url (and with item in meta) to get status and referer.
New Requests uses parse_item() to get item from meta and add status.
parse_item() also uses extractor to get new links on this page and create new item and execute Requests with parse_item() again.
import scrapy
from scrapy.http import Request
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
class MySpider(scrapy.Spider):
name = 'myspider'
#allowed_domains = ['http://quotes.toscrape.com']
start_urls = ['http://quotes.toscrape.com']
#start_urls = ['http://127.0.0.1:5000/'] # for Flask example
extractor = LxmlLinkExtractor(
allow=('http://quotes.toscrape.com'),
#allow=('http://127.0.0.1:5000'), # for Flask example
deny=(
'google.com',
'facebook.com',
'pinterest.com',
'facebook.com',
'digg.com',
'twitter.com',
'stumbleupon.com',
'linkedin.com'
),
unique=True,
)
def parse(self, response):
print('parse url:', response.url)
# use LxmlLinkExtractor manually
for link in self.extractor.extract_links(response):
#print('link:', link)
item = {}
item['nofollow'] = link.nofollow
item['anchor_text'] = link.text
item['current_url'] = link.url
#item['referring_url'] = response.url
yield Request(link.url, meta={'item': item}, callback=self.parse_item)
def parse_item(self, response):
print('parse_item url:', response.url)
item = response.meta['item']
item['referring_url'] = response.request.headers.get('Referer')
#item['referring_url'] = response.request.url
item['status'] = response.status
yield item
# use LxmlLinkExtractor manually with new links
for link in self.extractor.extract_links(response):
#print('link:', link)
item = {}
item['nofollow'] = link.nofollow
item['anchor_text'] = link.text
item['current_url'] = link.url
#item['referring_url'] = response.url
yield Request(link.url, meta={'item': item}, callback=self.parse_item)
# --- run spider without project ---
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
'FEED_FORMAT': 'csv',
'FEED_URI': 'output.csv',
})
c.crawl(MySpider)
c.start()
EDIT:
Because I don't know any page with rel="nofollow" so I created simple code in Flask to test code.
from flask import Flask
app = Flask(__name__)
#app.route('/')
def index():
return 'Test 1 | Test 2'
#app.route('/test1')
def test1():
return 'Main Page'
#app.route('/test2')
def test2():
return 'Main Page'
if __name__ == '__main__':
app.run(debug=True)

Categories