I am having problems going through multiple pages. Here is my scrapy spider class, called quotes.
class quotes(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://books.toscrape.com/?']

    def parse(self, response):
        all_links = response.css('.nav-list ul li')
        for links in all_links:
            link = links.css('a::attr(href)').get()
            yield response.follow(link, callback=self.books_detail)

    def books_detail(self, response):
        yas = {
            'title': [],
            'price': [],
            'availability': [],
            'category': []
        }
        yas['category'].append(response.css('h1::text').extract())
        all_divs = response.css('.col-lg-3')
        for div in all_divs:
            link = div.css('.product_pod a::attr(href)').get()
            title = response.follow(link, callback=self.get_title)
            yas['price'].append(div.css('.price_color::text').extract())
            yas['availability'].append(div.css('.availability::text')[1].extract())
        yield yas

    def get_title(self, response):
        print('testing')
        title = response.css('h1::text').extract()
        yield {"title": title}
So I use response.follow to go to the books_detail function, and in that function I again call response.follow to call get_title. I get the 'title' from get_title and the rest of the details from the main page.
I can scrape the information just fine from the books_detail function, and I can get the link of the title page just fine as well from this line:
link = div.css('.product_pod a::attr(href)').get()
But using response.follow I cannot get to the get_title function.
Any help would be appreciated. Thanks.
You should yield the request, not run it directly, and use meta= to send data to the next parser:
yield response.follow(link, callback=self.get_title, meta={'item': yas})
and in the next parser you can get it:
yas = response.meta['item']
and then you can add new values and yield all the data:
yas["title"] = response.css('h1::text').extract()
yield yas
See another example in Scrapy yield items from multiple requests.
Doc: Request and Response, Request.meta special keys
Minimal working code which you can put in one file and run as a normal script (python script.py) without creating a project.
There are other changes.
You shouldn't put all books into one list but yield every book separately. Scrapy will keep all results, and when you use the option to save to CSV it will save all of them.
For every book you should create a new dictionary. If you use the same dictionary many times, it will overwrite data and you may get many results with the same data.
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        all_links = response.css('.nav-list ul li')
        for links in all_links:
            link = links.css('a::attr(href)').get()
            yield response.follow(link, callback=self.books_detail)

    def books_detail(self, response):
        all_divs = response.css('.col-lg-3')
        for div in all_divs:
            # every book goes in a separate dictionary, and it has to be a new dictionary -
            # reusing the same one would overwrite old data
            book = {
                'category': response.css('h1::text').extract(),
                'price': div.css('.price_color::text').extract()[0].strip(),
                'availability': div.css('.availability::text')[1].extract().strip(),
            }
            link = div.css('.product_pod a::attr(href)').get()
            yield response.follow(link, callback=self.get_title, meta={'item': book})

    def get_title(self, response):
        book = response.meta['item']
        print('testing:', response.url)
        book["title"] = response.css('h1::text').extract()[0].strip()
        yield book


# --- run without a project and save to `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save to a CSV, JSON or XML file
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv',
})
c.crawl(QuotesSpider)
c.start()
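As a side note, Scrapy 1.7+ also offers cb_kwargs for passing data between callbacks, which avoids going through response.meta. The same hand-off could be sketched like this (only the two changed pieces of the spider above are shown):

            # in books_detail(): pass the dict as a keyword argument instead of using meta
            yield response.follow(link, callback=self.get_title, cb_kwargs={'book': book})

    def get_title(self, response, book):
        # `book` arrives directly as a parameter, no response.meta lookup needed
        book["title"] = response.css('h1::text').extract()[0].strip()
        yield book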
I'm within reach of a personal milestone with scrapy. The aim is to properly understand callback and cb_kwargs; I've read the documentation countless times, but I learn best with visual code, practice and an explanation.
I have an example scraper. The aim is to grab the book name and price, then go into each book page and extract a single piece of information. I'm also trying to understand how to properly get information from the next few pages, which I know depends on understanding how callbacks operate.
When I run my script it returns results only for the first page. How do I get the additional pages?
Here's my scraper:
class BooksItem(scrapy.Item):
    items = Field(output_processor=TakeFirst())
    price = Field(output_processor=TakeFirst())
    availability = Field(output_processor=TakeFirst())


class BookSpider(scrapy.Spider):
    name = "books"
    start_urls = ['https://books.toscrape.com']

    def start_request(self):
        for url in self.start_url:
            yield scrapy.Request(
                url,
                callback=self.parse)

    def parse(self, response):
        data = response.xpath('//div[@class = "col-sm-8 col-md-9"]')
        for books in data:
            loader = ItemLoader(BooksItem(), selector=books)
            loader.add_xpath('items', './/article[@class="product_pod"]/h3/a//text()')
            loader.add_xpath('price', './/p[@class="price_color"]//text()')
            for url in [books.xpath('.//a//@href').get()]:
                yield scrapy.Request(
                    response.urljoin(url),
                    callback=self.parse_book,
                    cb_kwargs={'loader': loader})
            for next_page in [response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()]:
                if next_page is not None:
                    yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response, loader):
        book_quote = response.xpath('//p[@class="instock availability"]//text()').get()
        loader.add_value('availability', book_quote)
        yield loader.load_item()
I believe the issue is with the part where I try to grab the next few pages. I have tried an alternative approach using the following:
def start_request(self):
    for url in self.start_url:
        yield scrapy.Request(
            url,
            callback=self.parse,
            cb_kwargs={'page_count': 0}
        )

def parse(self, response, next_page):
    if page_count > 3:
        return
    ...
    ...
    page_count += 1
    for next_page in [response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()]:
        yield response.follow(next_page, callback=self.parse, cb_kwargs={'page_count': page_count})
However, I get the following error with this approach:
TypeError: parse() missing 1 required positional argument: 'page_cntr'
It should be start_requests, and self.start_urls (inside the function).
get() will return the first result; what you want is getall(), which returns a list.
There is no need for a for loop for the "next_page" part; it's not a mistake, just unnecessary.
In the line for url in books.xpath you're getting every url twice, again not a mistake but still...
Here, data = response.xpath('//div[@class = "col-sm-8 col-md-9"]') doesn't select the books one by one; it selects the whole books container. You can check that len(data.getall()) == 1.
book_quote = response.xpath('//p[@class="instock availability"]//text()').get() will return \n; look at the source and try to find out why (hint: the 'i' tag).
Compare your code to this and see what I changed:
import scrapy
from scrapy import Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


class BooksItem(scrapy.Item):
    items = Field(output_processor=TakeFirst())
    price = Field(output_processor=TakeFirst())
    availability = Field(output_processor=TakeFirst())


class BookSpider(scrapy.Spider):
    name = "books"
    start_urls = ['https://books.toscrape.com']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse)

    def parse(self, response):
        data = response.xpath('//div[@class = "col-sm-8 col-md-9"]//li')
        for books in data:
            loader = ItemLoader(BooksItem(), selector=books)
            loader.add_xpath('items', './/article[@class="product_pod"]/h3/a//text()')
            loader.add_xpath('price', './/p[@class="price_color"]//text()')
            for url in books.xpath('.//h3/a//@href').getall():
                yield scrapy.Request(
                    response.urljoin(url),
                    callback=self.parse_book,
                    cb_kwargs={'loader': loader})

        next_page = response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response, loader):
        # option 1:
        book_quote = response.xpath('//p[@class="instock availability"]/i/following-sibling::text()').get().strip()
        # option 2:
        # book_quote = ''.join(response.xpath('//div[contains(@class, "product_main")]//p[@class="instock availability"]//text()').getall()).strip()
        loader.add_value('availability', book_quote)
        yield loader.load_item()
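If you also want to cap the crawl at a few pages with cb_kwargs, as in the second attempt above, the key you pass must match the parameter name in the callback's signature (the TypeError came from that mismatch). A minimal sketch of that idea, assuming the usual books.toscrape.com markup:

import scrapy


class PagedBookSpider(scrapy.Spider):
    # hypothetical spider, only to illustrate the cb_kwargs page counter
    name = "books_paged"
    start_urls = ['https://books.toscrape.com']

    def start_requests(self):
        for url in self.start_urls:
            # the key 'page_count' must match the parameter name in parse()
            yield scrapy.Request(url, callback=self.parse, cb_kwargs={'page_count': 0})

    def parse(self, response, page_count):
        if page_count > 3:
            return  # stop after a few pages
        for title in response.xpath('//article[@class="product_pod"]/h3/a//text()').getall():
            yield {'title': title}
        next_page = response.xpath('//li[@class="next"]/a/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse,
                                  cb_kwargs={'page_count': page_count + 1})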
In the first place, if I use extract_first, scrapy gives me the first element of each page, and if I run it like this it returns all the content I want, but in one-liners.
In the second place, I can't make scrapy go to the links I just scraped and get information from inside these links; it returns an empty csv file.
from scrapy import Spider
from companies.items import CompaniesItem
import re


class companiesSpider(Spider):
    name = "companies"
    allowed_domains = ['http://startup.miami', ]

    # Defining the list of pages to scrape
    start_urls = ["http://startup.miami/category/startups/page/" + str(1*i) + "/" for i in range(0, 10)]

    def parse(self, response):
        rows = response.xpath('//*[@id="datafetch"]')
        for row in rows:
            link = row.xpath('.//h2/a/@href').extract()
            name = row.xpath('.//header/h2/a/text()').extract()

            item = CompaniesItem()
            item['link'] = link
            item['name'] = name

            yield item
Your parse method is not yielding any requests or items. In the code below we go through the pages and get the urls & names; in parse_detail you can add additional data to the item.
Instead of hardcoding 10 pages, we check whether there is a next page and go through parse again if that's the case.
from scrapy import Spider
from ..items import CompaniesItem
import scrapy


class CompaniesSpider(Spider):
    name = "companies"
    allowed_domains = ['startup.miami']

    # Defining the list of pages to scrape
    start_urls = ["http://startup.miami/category/startups/"]

    def parse(self, response):
        # get link & name and send the item to parse_detail in meta
        rows = response.xpath('//*[@id="datafetch"]/article')
        for row in rows:
            link = row.xpath('.//@href').extract_first()
            name = row.xpath(
                './/*[@class="textoCoworking"]/text()').extract_first()
            item = CompaniesItem()
            item['link'] = link
            item['name'] = name.strip()
            yield scrapy.Request(link,
                                 callback=self.parse_detail,
                                 meta={'item': item})

        # get the next page
        next_page = response.xpath(
            '//*[@class="next page-numbers"]/@href').extract_first()
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['item']
        # add other details to the item here
        yield item
To put the results in a csv file you can launch the scraper like this: scrapy crawl companies -o test_companies.csv
I want to crawl earning call transcripts from the website https://www.seekingalpha.com with scrapy.
The spider should behave as follows: 1) In the beginning a list of company codes ccodes is provided. 2) For each company, all available transcript urls are parsed from https://www.seekingalpha.com/symbol/A/earnings/transcripts. 3) From each transcript url the associated content is parsed.
The difficulty is that https://www.seekingalpha.com/symbol/A/earnings/transcripts contains an infinite scrolling mechanism. Therefore, the idea is to iterate individually through the json files https://www.seekingalpha.com/symbol/A/earnings/more_transcripts?page=1 with page=1,2,3,... that are called by javascript. The json files contain the keys html and count. The key html should be used to parse transcript urls; the key count should be used to stop when there are no further urls. The criterion for that is count=0.
Here is my code so far. I have already managed to successfully parse the first json page for each company code. But I have no idea how to iterate through the json files and stop when there are no more urls.
import scrapy
import re
import json
from scrapy.http import FormRequest
from scrapy.selector import Selector


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["https://seekingalpha.com/account/login"]
    custom_settings = {'DOWNLOAD_DELAY': 2}

    loginData = {
        'slugs[]': "",
        'rt': "",
        'user[url_source]': 'https://seekingalpha.com/account/login',
        'user[location_source]': 'orthodox_login',
        'user[email]': 'abc',
        'user[password]': 'xyz'
    }

    def parse(self, response):
        return scrapy.FormRequest.from_response(
            response=response,
            formdata=self.loginData,
            formid='orthodox_login',
            callback=self.verify_login
        )

    def verify_login(self, response):
        pass
        return self.make_initial_requests()

    def make_initial_requests(self):
        ccodes = ["A", "AB", "GOOGL"]
        for ccode in ccodes:
            yield scrapy.Request(
                url="https://seekingalpha.com/symbol/" + ccode + "/earnings/more_transcripts?page=1",
                callback=self.parse_link_page,
                meta={"ccode": ccode, "page": 1}
            )

    def parse_link_page(self, response):
        ccode = response.meta.get("ccode")
        page = response.meta.get("page")
        data = json.loads(response.text)
        condition = "//a[contains(text(),'Results - Earnings Call Transcript')]/@href"
        transcript_urls = Selector(text=data["html"]).xpath(condition).getall()
        for transcript_url in transcript_urls:
            yield scrapy.Request(
                url="https://seekingalpha.com" + transcript_url,
                callback=self.save_contents,
                meta={"ccode": ccode}
            )

    def save_contents(self, response):
        pass
You should be able to execute the code without authentication. The expected result is that all urls from https://www.seekingalpha.com/symbol/A/earnings/transcripts are crawled. Therefore it is necessary to access https://www.seekingalpha.com/symbol/A/earnings/more_transcripts?page=page with page = 1,2,3,... until all available urls are parsed.
Adding the below after looping through the transcript_urls seems to work. It yields a new request with a callback to parse_link_page if there were transcript_urls found on the current page.
# requires: from urllib.parse import urlparse, urlencode, urlunparse
if transcript_urls:
    next_page = page + 1
    parsed_url = urlparse(response.url)
    new_query = urlencode({"page": next_page})
    next_url = urlunparse(parsed_url._replace(query=new_query))
    yield scrapy.Request(
        url=next_url,
        callback=self.parse_link_page,
        meta={"ccode": ccode, "page": next_page},
    )
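Alternatively, since the question notes that each json response also carries a count key and that count=0 means there are no further urls, the stop condition could be based on that value instead; a sketch under that assumption, using the same variables available in parse_link_page:

if data.get("count", 0) > 0:
    next_page = page + 1
    # request the next json page for the same company code
    yield scrapy.Request(
        url="https://seekingalpha.com/symbol/" + ccode + "/earnings/more_transcripts?page=" + str(next_page),
        callback=self.parse_link_page,
        meta={"ccode": ccode, "page": next_page},
    )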
CODE:
import scrapy
from scrapy.spiders import CrawlSpider
from scrapy import Request


class TestSpider(CrawlSpider):
    name = "test_spyder"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ['https://stackoverflow.com/tags']

    def parse(self, response):
        title_1 = response.xpath('//h1/text()').extract_first()
        next_url = 'https://stackoverflow.com/users'
        title_2 = Request(url=next_url, callback=self.parse_some)
        yield {'title_1': title_1, 'title_2': title_2}

    def parse_some(self, response):
        return response.xpath('//h1/text()').extract_first()
I don't understand why, instead of the second page title (Users), I get another value (https://stackoverflow.com/users>).
Scrapy should return the values Tags + Users, but it returns Tags + <Request GET htt... at least I think so.
Where is the error and how do I fix it?
To crawl a url you need to yield a Request object. So your parse callbacks should either:
1. Yield a dictionary/Item - this is the end of the crawl chain. The item is generated, sent through the pipelines and finally saved somewhere if you have that set up.
2. Yield a Request object - this continues the crawl chain to another callback.
Example of this process:
crawl url1 (2)
crawl url2 (2)
yield item (1)
Your spider in this case should look like this:
def parse(self, response):
    title = response.xpath('//h1/text()').extract_first()
    yield {'title': title}

    next_url = 'https://stackoverflow.com/users'
    yield Request(url=next_url, callback=self.parse_some)
And your end result when crawling with scrapy crawl spider -o output.json:
# output.json
[
{'title': 'title1'},
{'title': 'title2'}
]
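If the goal is really a single item holding both titles (Tags + Users), the first title can be carried to the second callback and yielded once there; a minimal sketch of that variation (not part of the original answer), using meta:

import scrapy
from scrapy import Request


class TitlesSpider(scrapy.Spider):
    # hypothetical spider combining both page titles in one item
    name = "titles"
    allowed_domains = ["stackoverflow.com"]
    start_urls = ['https://stackoverflow.com/tags']

    def parse(self, response):
        title_1 = response.xpath('//h1/text()').extract_first()
        # pass the first title along instead of putting the Request object inside an item
        yield Request(url='https://stackoverflow.com/users',
                      callback=self.parse_some,
                      meta={'title_1': title_1})

    def parse_some(self, response):
        title_2 = response.xpath('//h1/text()').extract_first()
        yield {'title_1': response.meta['title_1'], 'title_2': title_2}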
I'm scraping dior.com for its products. head/script gives me all the fields I need except for the product description. To scrape the description I need to follow the link (the url variable in the code below). The only way I know to do that is with BeautifulSoup. Can I parse it using only Scrapy?
Thx guys.
class DiorSpider(CrawlSpider):
    name = 'dior'
    allowed_domains = ['www.dior.com']
    start_urls = ['https://www.dior.com/en_us/']
    rules = (
        Rule(LinkExtractor(allow=(r'^https?://www.dior.com/en_us/men/clothing/new-arrivals.*',)), callback='parse_file')
    )

    def parse_file(self, response):
        script_text = response.xpath("//script[contains(., 'window.initialState')]").extract_first()
        blocks = extract_blocks(script_text)
        for block in blocks:
            sku = re.compile(r'("sku":)"[a-zA-Z0-9_]*"').finditer(block)
            url = re.compile(r'("productLink":{"uri":)"[^"]*').finditer(block)
            for item in zip(sku, url):
                scraped_info = {
                    'sku': item[0].group(0).split(':')[1].replace('"', ''),
                    'url': 'https://www.dior.com' + item[1].group(0).split(':')[2].replace('"', '')
                }
                yield scraped_info
If you need to extract additional information from a second request, instead of yielding the data right there, you should yield a request for that URL and include the information you already extracted in the Request.meta attribute.
from scrapy import Request

# …

def parse_file(self, response):
    # …
    for block in blocks:
        # …
        for item in zip(sku, url):
            # …
            # follow the product url collected above and carry the data in meta
            yield Request(
                scraped_info['url'],
                callback=self.parse_additional_information,
                meta={'scraped_info': scraped_info})

def parse_additional_information(self, response):
    scraped_info = response.meta['scraped_info']
    # extract the additional information, add it to scraped_info
    yield scraped_info
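For the description itself, the second callback just needs a selector for the product page; a purely illustrative sketch (the .product-description selector is a guess, not dior.com's actual markup):

def parse_additional_information(self, response):
    scraped_info = response.meta['scraped_info']
    # hypothetical selector - inspect the real product page to find the correct one
    description = ' '.join(
        response.css('.product-description ::text').getall()).strip()
    scraped_info['description'] = description
    yield scraped_info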