I'm trying to populate an item using ItemLoader, parsing data from multiple pages. But as far as I can see, I can't change the selector that I used when I initialized the ItemLoader. The documentation says about the selector attribute:
selector
The Selector object to extract data from. It's either the selector given in the constructor or one created from the response given in the constructor using the default_selector_class. This attribute is meant to be read-only.
Here's the example code:
def parse(self, response):
    sel = Selector(response)
    videos = sel.xpath('//div[@class="video"]')
    for video in videos:
        loader = ItemLoader(VideoItem(), video)
        loader.add_xpath('original_title', './/u/text()')
        loader.add_xpath('original_id', './/a[@class="hRotator"]/@href', re=r'movies/(\d+)/.+\.html')
        try:
            url = video.xpath('.//a[@class="hRotator"]/@href').extract()[0]
            request = Request(url,
                              callback=self.parse_video_page)
        except IndexError:
            pass
        request.meta['loader'] = loader
        yield request
    pages = sel.xpath('//div[@class="pager"]//a/@href').extract()
    for page in pages:
        url = urlparse.urljoin('http://www.mysite.com/', page)
        request = Request(url, callback=self.parse)
        yield request
def parse_video_page(self, response):
    loader = response.meta['loader']
    sel = Selector(response)
    loader.add_xpath('original_description', '//*[@id="videoInfo"]//td[@class="desc"]/h2/text()')
    loader.add_xpath('duration', '//*[@id="video-info"]/div[2]/text()')
    loader.add_xpath('tags', '//*[@id="tags"]//a/text()')
    item = loader.load_item()
    return item
As it is now, I can't scrape info from the second page.
Answering your question directly: to change the selector for an ItemLoader, you can assign a new Selector object to the loader.selector attribute.
def parse_video_page(self, response):
    loader = response.meta['loader']
    sel = Selector(response)
    loader.selector = sel
    loader.add_xpath(
        'original_description',
        '//*[@id="videoInfo"]//td[@class="desc"]/h2/text()'
    )
    # ...
But this way of working with loader objects is unexpected, and thus unsupported: library updates can break this code or produce subtle bugs. Also, passing the loader through the request meta is a bad idea, because the loader object holds a reference to the response object, and this can cause memory problems in some situations.
A much more correct way of collecting item fields across several callbacks is as follows (note the comments):
def parse(self, response):
    sel = Selector(response)
    videos = sel.xpath('//div[@class="video"]')
    for video in videos:
        try:
            url = video.xpath('.//a[@class="hRotator"]/@href').extract()[0]
        except IndexError:
            continue
        loader = ItemLoader(VideoItem(), video)
        loader.add_xpath('original_title', './/u/text()')
        loader.add_xpath(
            'original_id',
            './/a[@class="hRotator"]/@href',
            re=r'movies/(\d+)/.+\.html'
        )
        item = loader.load_item()
        yield Request(
            urlparse.urljoin(response.url, url),
            callback=self.parse_video_page,
            # Note: the item is passed in the meta dict, not the loader itself
            meta={'item': item}
        )
    pages = sel.xpath('//div[@class="pager"]//a/@href').extract()
    for page in pages:
        url = urlparse.urljoin('http://www.mysite.com/', page)
        yield Request(url, callback=self.parse)
def parse_video_page(self, response):
    item = response.meta['item']
    # Note: a new loader object is created;
    # the item from response.meta is passed to its constructor
    loader = ItemLoader(item, response=response)
    loader.add_xpath(
        'original_description',
        '//*[@id="videoInfo"]//td[@class="desc"]/h2/text()'
    )
    loader.add_xpath(
        'duration',
        '//*[@id="video-info"]/div[2]/text()'
    )
    loader.add_xpath('tags', '//*[@id="tags"]//a/text()')
    return loader.load_item()
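As an aside, on newer Scrapy versions (1.7+) the same hand-off can be done with cb_kwargs instead of meta; a minimal sketch under that assumption, keeping the names from the code above:

def parse(self, response):
    # ... build the partial item with the ItemLoader exactly as above ...
    yield Request(
        urlparse.urljoin(response.url, url),
        callback=self.parse_video_page,
        # cb_kwargs entries become keyword arguments of the callback
        cb_kwargs={'item': item}
    )

def parse_video_page(self, response, item):
    loader = ItemLoader(item, response=response)
    # ... add_xpath calls as above ...
    return loader.load_item()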
I'm within reach of a personal milestone with Scrapy. The aim is to properly understand callbacks and cb_kwargs; I've read the documentation countless times, but I learn best with visual code, practice, and an explanation.
I have an example scraper. The aim is to grab the book name and price, and go into each book page to extract a single piece of information. I'm also trying to understand how to properly get information from the next few pages, which I know depends on understanding how callbacks operate.
When I run my script, it returns results only for the first page. How do I get the additional pages?
Here's my scraper:
class BooksItem(scrapy.Item):
    items = Field(output_processor=TakeFirst())
    price = Field(output_processor=TakeFirst())
    availability = Field(output_processor=TakeFirst())

class BookSpider(scrapy.Spider):
    name = "books"
    start_urls = ['https://books.toscrape.com']

    def start_request(self):
        for url in self.start_url:
            yield scrapy.Request(
                url,
                callback=self.parse)

    def parse(self, response):
        data = response.xpath('//div[@class = "col-sm-8 col-md-9"]')
        for books in data:
            loader = ItemLoader(BooksItem(), selector=books)
            loader.add_xpath('items', './/article[@class="product_pod"]/h3/a//text()')
            loader.add_xpath('price', './/p[@class="price_color"]//text()')
            for url in [books.xpath('.//a//@href').get()]:
                yield scrapy.Request(
                    response.urljoin(url),
                    callback=self.parse_book,
                    cb_kwargs={'loader': loader})
        for next_page in [response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()]:
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response, loader):
        book_quote = response.xpath('//p[@class="instock availability"]//text()').get()
        loader.add_value('availability', book_quote)
        yield loader.load_item()
I believe the issue is with the part where I try to grab the next few pages. I have tried an alternative approach using the following:
def start_request(self):
    for url in self.start_url:
        yield scrapy.Request(
            url,
            callback=self.parse,
            cb_kwargs={'page_count': 0}
        )

def parse(self, response, page_count):
    if page_count > 3:
        return
    ...
    ...
    page_count += 1
    for next_page in [response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()]:
        yield response.follow(next_page, callback=self.parse, cb_kwargs={'page_count': page_count})
However, I get the following error with this approach:
TypeError: parse() missing 1 required positional argument: 'page_count'
It should be start_requests, and self.start_urls (inside the function).
get() returns only the first result; what you want is getall(), which returns a list (see the short sketch after these points).
There is no need for a for loop in the "next_page" part; it's not a mistake, just unnecessary.
In the line for url in books.xpath you're getting every url twice; again, not a mistake, but still...
With data = response.xpath('//div[@class = "col-sm-8 col-md-9"]') you don't select the books one by one, you select the whole books container; you can check that len(data.getall()) == 1.
book_quote = response.xpath('//p[@class="instock availability"]//text()').get() will return '\n'; look at the source and try to find out why (hint: the 'i' tag).
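To illustrate the get()/getall() difference from the second point, here is a tiny standalone sketch using parsel, the library behind Scrapy's selectors (the HTML is made up):

from parsel import Selector  # Scrapy's selectors are built on parsel

sel = Selector(text='<a href="/page-1">a</a><a href="/page-2">b</a>')
print(sel.xpath('//a/@href').get())     # '/page-1' - only the first match
print(sel.xpath('//a/@href').getall())  # ['/page-1', '/page-2'] - every match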
Compare your code to this and see what I changed:
import scrapy
from scrapy import Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

class BooksItem(scrapy.Item):
    items = Field(output_processor=TakeFirst())
    price = Field(output_processor=TakeFirst())
    availability = Field(output_processor=TakeFirst())

class BookSpider(scrapy.Spider):
    name = "books"
    start_urls = ['https://books.toscrape.com']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse)

    def parse(self, response):
        data = response.xpath('//div[@class = "col-sm-8 col-md-9"]//li')
        for books in data:
            loader = ItemLoader(BooksItem(), selector=books)
            loader.add_xpath('items', './/article[@class="product_pod"]/h3/a//text()')
            loader.add_xpath('price', './/p[@class="price_color"]//text()')
            for url in books.xpath('.//h3/a//@href').getall():
                yield scrapy.Request(
                    response.urljoin(url),
                    callback=self.parse_book,
                    cb_kwargs={'loader': loader})
        next_page = response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response, loader):
        # option 1:
        book_quote = response.xpath('//p[@class="instock availability"]/i/following-sibling::text()').get().strip()
        # option 2:
        # book_quote = ''.join(response.xpath('//div[contains(@class, "product_main")]//p[@class="instock availability"]//text()').getall()).strip()
        loader.add_value('availability', book_quote)
        yield loader.load_item()
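If you still want the page limit from your second attempt, the counter idea works once start_requests and the cb_kwargs names are consistent; a minimal sketch of just the pagination part, inside the spider class (the limit of 3 is taken from your code):

def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(url, callback=self.parse,
                             cb_kwargs={'page_count': 0})

def parse(self, response, page_count):
    if page_count > 3:
        return
    # ... extract the items exactly as above ...
    next_page = response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()
    if next_page:
        # the incremented counter travels with the next request
        yield response.follow(next_page, callback=self.parse,
                              cb_kwargs={'page_count': page_count + 1})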
I need to parse data from a site, and after parsing, the data must be saved to disk. I am using Scrapy. While parsing a page, I need to get data from another page. How can I do that?
class MySpider(scrapy.Spider):
    name = "my_spyder"

    start_urls = [
        'https://www.example.com/title/1',
        'https://www.example.com/title/2',
        'https://www.example.com/title/3',
    ]

    def parse(self, response):
        item = MyItem()
        main_page_selector = Selector(response)
        ...
        tagline_url = os.path.join(response.url, 'taglines')
        request = Request(url=tagline_url, callback=self.get_tags)
        item['tags'] = yield request
        ...
        yield item

    def get_tags(self, response):
        tagline_selector = Selector(response)
        taglines = []
        for tag in tagline_selector.xpath('//div[@class="soda even"]/text()').getall():
            taglines.append(tag.strip())
        return taglines
How do I write the taglines collected in the get_tags function into the item's 'tags' field?
These requests are executed asynchronously, so you can't assign the result of yield request to an item field directly. Instead, pass the item along with the request:
request = Request(url=tagline_url, callback=self.get_tags)
request.meta["item"] = item
yield request
The code above goes in the parse method.
item = response.meta["item"]
#...
item["tags"] = taglines
yield item
The second snippet goes at the end of the get_tags method; note that get_tags now yields the finished item itself instead of returning the taglines.
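Put together, an untested sketch of both methods (inside MySpider, with the XPath and URL handling taken from the question; os and Request are assumed to be imported at module level):

def parse(self, response):
    item = MyItem()
    # ... fill the other item fields from the main page ...
    tagline_url = os.path.join(response.url, 'taglines')  # as in the question
    request = Request(url=tagline_url, callback=self.get_tags)
    request.meta['item'] = item  # the partially filled item travels with the request
    yield request

def get_tags(self, response):
    item = response.meta['item']
    taglines = [tag.strip() for tag in
                response.xpath('//div[@class="soda even"]/text()').getall()]
    item['tags'] = taglines
    yield item  # the finished item is yielded here, not in parse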
I am having problems going through multiple pages. Here is my spider class, called quotes:
class quotes(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://books.toscrape.com/?']

    def parse(self, response):
        all_links = response.css('.nav-list ul li')
        for links in all_links:
            link = links.css('a::attr(href)').get()
            yield response.follow(link, callback=self.books_detail)

    def books_detail(self, response):
        yas = {
            'title': [],
            'price': [],
            'availability': [],
            'category': []
        }
        yas['category'].append(response.css('h1::text').extract())
        all_divs = response.css('.col-lg-3')
        for div in all_divs:
            link = div.css('.product_pod a::attr(href)').get()
            title = response.follow(link, callback=self.get_title)
            yas['price'].append(div.css('.price_color::text').extract())
            yas['availability'].append(div.css('.availability::text')[1].extract())
        yield yas

    def get_title(self, response):
        print('testing')
        title = response.css('h1::text').extract()
        yield {"title": title}
So I use response.follow to go to the books_detail function, and in that function I again call response.follow to call get_title. I get the 'title' from get_title and the rest of the details from the main page.
I can scrape the information just fine from the books_detail function, and I can get the link of the title page just fine as well from this line:
link = div.css('.product_pod a::attr(href)').get()
But using response.follow I cannot get to the get_title function.
Any help would be appreciated. Thanks.
You should yield the request, not run it directly, and use meta= to send data to the next parser:
yield response.follow(link, callback=self.get_title, meta={'item': yas})
and in the next parser you can get it:
yas = response.meta['item']
and then you can add new values and yield all the data:
yas["title"] = response.css('h1::text').extract()
yield yas
See another example in Scrapy yeild items from multiple requests.
Doc: Request and Response, Request.meta special keys
Below is minimal working code which you can put in one file and run as a normal script (python script.py) without creating a project.
There are a few other changes.
You shouldn't put all books into one list; yield every book separately. Scrapy will collect all the results, and when you use the option to save to CSV it will save all of them.
For every book you should create a new dictionary. If you reuse the same dictionary many times, it will overwrite data and you may get many results with the same data.
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        all_links = response.css('.nav-list ul li')
        for links in all_links:
            link = links.css('a::attr(href)').get()
            yield response.follow(link, callback=self.books_detail)

    def books_detail(self, response):
        all_divs = response.css('.col-lg-3')
        for div in all_divs:
            # every book in a separate dictionary, and it has to be a new dictionary -
            # otherwise it would overwrite old data
            book = {
                'category': response.css('h1::text').extract(),
                'price': div.css('.price_color::text').extract()[0].strip(),
                'availability': div.css('.availability::text')[1].extract().strip(),
            }
            link = div.css('.product_pod a::attr(href)').get()
            yield response.follow(link, callback=self.get_title, meta={'item': book})

    def get_title(self, response):
        book = response.meta['item']
        print('testing:', response.url)
        book["title"] = response.css('h1::text').extract()[0].strip()
        yield book

# --- run without a project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in a CSV, JSON or XML file
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv',
})
c.crawl(QuotesSpider)
c.start()
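One caveat: FEED_FORMAT and FEED_URI are deprecated as of Scrapy 2.1 in favour of the FEEDS setting, so on a newer version the CrawlerProcess settings would look like this instead:

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # one FEEDS dict replaces FEED_FORMAT and FEED_URI
    'FEEDS': {'output.csv': {'format': 'csv'}},
})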
I am trying to scrape a website using Scrapy. The example link is: Here.
I am able to get some data using CSS selectors. I also need to fetch all image URLs of each item. An item can have multiple colours, and when we click on another colour, the browser actually fetches the images from another URL. So I need to generate manual requests (due to the multiple colours) and attach "meta" to store image URLs from those other URLs into a SINGLE ITEM FIELD.
Here is my Scrapy code:
def get_image_urls(self, response):
    item = response.meta['item']
    if 'image_urls' in item:
        urls = item['image_urls']
    else:
        urls = []
    urls.extend(response.css('.product-image-link::attr(href)').extract())
    item['image_urls'] = urls
    next_url = response.css('.va-color .emptyswatch a::attr(href)').extract()
    #print(item['image_urls'])
    yield Request(next_url[0], callback=self.get_image_urls, meta={'item': item})

def parse(self, response):
    output = JulesProduct()
    output['name'] = self.get_name(response)
    # Now get the recursive img urls
    response.meta['item'] = output
    self.get_image_urls(response)
    return output
Ideally, I should return the output object with all of the required data. My question is: why am I not getting output['image_urls']? When I uncomment the print statement in the get_image_urls function, I see 3 crawled URLs and 3 print statements with the URLs appended after each other, but I need them in the parse function. I'm not sure I'm explaining the issue clearly. Can anybody help?
Your parse method is returning the output before the get_image_urls requests are done.
You should only yield or return your final item at the end of your recursive logic. Something like this should work (note dont_filter=True on the first request, since it re-requests a URL that Scrapy has already seen):
def parse(self, response):
    output = JulesProduct()
    output['name'] = self.get_name(response)
    yield Request(response.url, callback=self.get_image_urls, meta={'item': output}, dont_filter=True)

def get_image_urls(self, response):
    item = response.meta['item']
    if 'image_urls' in item:
        urls = item['image_urls']
    else:
        urls = []
    urls.extend(response.css('.product-image-link::attr(href)').extract())
    item['image_urls'] = urls
    next_url = response.css('.va-color .emptyswatch a::attr(href)').extract()
    if len(next_url) > 0:
        yield Request(next_url[0], callback=self.get_image_urls, meta={'item': item})
    else:
        yield item
I am building a simple(ish) parser in Scrapy, and I am blissfully ignorant when it comes to Scrapy and Python :-) In the file items.py I have a definition of thisItem(), which I assign to item in the code below. All worked rather swimmingly: parse uses a callback to get to parse_dir_contents... But then I realized I needed to scrape an extra bit of data and created another function, parse_other_content. How do I get what is already in item into parse_other_content?
import scrapy
from this-site.items import *
import re
import json

class DmozSpider(scrapy.Spider):
    name = "ABB"
    allowed_domains = ["this-site.com.au"]
    start_urls = [
        "https://www.this-site.com.au?page=1",
        "https://www.this-site.com.au?page=2",
    ]

    def parse(self, response):
        for href in response.xpath('//h3/a/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath('//h1[@itemprop="name"]'):
            item = thisItem()
            item['title'] = sel.xpath('text()').extract()
            item['rate'] = response.xpath('//div[@class="rate"]/div/span/text()').extract()
            so = re.search(r'\d+', response.url)
            propID = so.group()
            item['propid'] = propID
            item['link'] = response.url
            yield scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID, callback=self.parse_other_content)
            #yield item

    def parse_other_content(self, response):
        sel = json.loads(response.body)
        item['rate_detail'] = sel["this"][0]["that"]
        yield item
I know I am missing something simple here, but I can't seem to figure it out.
Per the scrapy documentation (http://doc.scrapy.org/en/1.0/topics/request-response.html#topics-request-response-ref-request-callback-arguments):
In some cases you may be interested in passing arguments to those callback functions so you can receive the arguments later, in the second callback. You can use the Request.meta attribute for that.
In your case I would do something like this:
def parse_dir_contents(self, response):
    for sel in response.xpath('//h1[@itemprop="name"]'):
        item = thisItem()
        ...
        request = scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID, callback=self.parse_other_content)
        request.meta['item'] = item
        yield request

def parse_other_content(self, response):
    item = response.meta['item']
    # do something with the item
    return item
According to Steve (see comments) you can also pass a dictionary of meta data as a keyword argument to the Request constructor like so:
def parse_dir_contents(self, response):
    for sel in response.xpath('//h1[@itemprop="name"]'):
        item = thisItem()
        ...
        request = scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID, callback=self.parse_other_content, meta={'item': item})
        yield request
You can either make item visible to parse_other_content() by changing it to self.item, or send it as a parameter to the function. (The first one might be easier.)
For the first solution, just add self. to every reference to the item variable. This makes it visible to the entire class.
def parse_dir_contents(self, response):
    for sel in response.xpath('//h1[@itemprop="name"]'):
        self.item = thisItem()
        self.item['title'] = sel.xpath('text()').extract()
        self.item['rate'] = response.xpath('//div[@class="rate"]/div/span/text()').extract()
        so = re.search(r'\d+', response.url)
        propID = so.group()
        self.item['propid'] = propID
        self.item['link'] = response.url
        yield scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID, callback=self.parse_other_content)
        #yield item

def parse_other_content(self, response):
    sel = json.loads(response.body)
    self.item['rate_detail'] = sel["this"][0]["that"]
    yield self.item