Scrape infinite scrolling websites with scrapy

Scrape infinite scrolling websites with scrapy - python

I want to crawl earning call transcripts from the website https://www.seekingalpha.com with scrapy.
The spider should behave as followed: 1) In the beginning a list of company codes ccodes is provided. 2) For each company all available transcript urls are parsed from https://www.seekingalpha.com/symbol/A/earnings/transcripts. 3) From each transcript url the associated content is parsed.
The difficulty is that https://www.seekingalpha.com/symbol/A/earnings/transcripts contain an infinite scrolling mechanism. Therefore, the idea is to individually iterate through the json files https://www.seekingalpha.com/symbol/A/earnings/more_transcripts?page=1 with page=1,2,3.. that are called by javascript. The json files contain the keys html and count. The key html should be used to parse transcript urls, the key count should be used to stop when there are no further urls. The criteria for that is count=0.
Here is my code so far. I have already managed to successfully parse the first json page for each company code. But I have no idea how I could iterate through the json files and stop when there are no more urls.
import scrapy
import re
import json
from scrapy.http import FormRequest
from scrapy.selector import Selector
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = ["https://seekingalpha.com/account/login"]
custom_settings = { 'DOWNLOAD_DELAY': 2 }
loginData = {
'slugs[]': "",
'rt': "",
'user[url_source]': 'https://seekingalpha.com/account/login',
'user[location_source]': 'orthodox_login',
'user[email]': 'abc',
'user[password]': 'xyz'
}
def parse(self, response):
return scrapy.FormRequest.from_response(
response = response,
formdata = self.loginData,
formid = 'orthodox_login',
callback = self.verify_login
)
def verify_login(self, response):
pass
return self.make_initial_requests()
def make_initial_requests(self):
ccodes = ["A", "AB", "GOOGL"]
for ccode in ccodes:
yield scrapy.Request(
url = "https://seekingalpha.com/symbol/"+ccode+"/earnings/more_transcripts?page=1",
callback = self.parse_link_page,
meta = {"ccode": ccode, "page": 1}
)
def parse_link_page(self, response):
ccode = response.meta.get("ccode")
page = response.meta.get("page")
data = json.loads(response.text)
condition = "//a[contains(text(),'Results - Earnings Call Transcript')]/#href"
transcript_urls = Selector(text=data["html"]).xpath(condition).getall()
for transcript_url in transcript_urls:
yield scrapy.Request(
url = "https://seekingalpha.com"+transcript_url,
callback = self.save_contents,
meta = {"ccode": ccode}
)
def save_contents(self, response):
pass
You should be able to execute the code without authentification. The expected result is that all urls from https://www.seekingalpha.com/symbol/A/earnings/transcripts are crawled. Therefore it is necessary to access https://www.seekingalpha.com/symbol/A/earnings/more_transcripts?page=page with page = 1,2,3.. until all available urls are parsed.

Adding the below after looping through the transcript_urls seems to work. It yields a new request with a callback to parse_link_page if there were transcript_urls found on the current page.
if transcript_urls:
next_page = page + 1
parsed_url = urlparse(response.url)
new_query = urlencode({"page": next_page})
next_url = urlunparse(parsed_url._replace(query=new_query))
yield scrapy.Request(
url=next_url,
callback=self.parse_link_page,
meta={"ccode": ccode, "page": next_page},
)

Related

Scrapy. Every time i yield request another function is triggered as well. Cant see why

Here is my spider
It is supposed to assign a list attained from google sheet to global variable denied. In the code this function is called just once , but in the logs it is executed as many times as post request to endpoint is executed (send_to_endpoint()). Where is the error?
import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
import json
from datetime import datetime
import json
import logging
import requests
# from scrapy.utils.project import get_project_settings
class Code1Spider(scrapy.Spider):
name = 'c_cointelegraph'
allowed_domains = ['cointelegraph.com']
start_urls = ['https://cointelegraph.com/press-releases/']
id = int(str(datetime.now().timestamp()).split('.')[0])
denied=[]
gs_id = ''
endpoint_url = ''
def parse(self, response):
#Returns settings values as dict
settings=self.settings.copy_to_dict()
self.gs_id = settings.get('GS_ID')
self.endpoint_url = settings.get('ENDPOINT_URL')
#assigns a list of stop words from GS to global variable
self.denied = self.load_gsheet()
for i in response.xpath('//a[#class="post-card-inline__title-link"]/#href').getall():
yield Request(response.urljoin(i), callback = self.parsed)
def parsed(self, response):
#set deny_domains to current domain so we could get all external urls
denied_domains = self.allowed_domains[0]
links = LinkExtractor(deny_domains=denied_domains,restrict_xpaths=('//article[#class="post__article"]'))
links = links.extract_links(response)
links = [i.url for i in links]
#checks the list of external links agains the list of stop words
links = [i for i in links if not any(b in i for b in self.denied)]
company = response.xpath('//h2//text()').getall()
if company: company = [i.split('About ')[-1].strip() for i in company if 'About ' in i.strip()]
if company: company = company[0]
else: company = ''
d = {'heading' : response.xpath('//h1[#class="post__title"]/text()').get().strip(),
'url' : response.url,
'pubDate' : self.get_pub_date(response.xpath('//script[contains(text(),"datePublished")]/text()').get()),
'links' : links,
'company_name' : company,
'ScrapeID' : self.id,
}
# is used for debuging. just to see printed item.
yield d
#create post request to endpoint
req = self.send_to_endpoint(d)
#send request to endpoint
yield req
def get_pub_date(self, d):
d = json.loads(d)
pub_date = d['datePublished']
return pub_date
def load_gsheet(self):
#Loads a list of stop words from predefined google sheet
gs_id=self.gs_id
url = 'https://docs.google.com/spreadsheets/d/{}/export?format=csv'.format(gs_id)
r = requests.get(url)
denied = r.text.splitlines()[1:]
logging.info(denied)
return denied
def send_to_endpoint(self, d):
url = self.endpoint_url
r = scrapy.Request( url, method='POST',
body=json.dumps(d),
headers={'Content-Type':'application/json'},
dont_filter = True)
return r
Whenever I yield req, load_gsheet() function is running as well triggering google sheets. If I comment out yield req, load_gsheet() is called just once as it is supposed to be.
Why does this happen? I have triple check the code line by line, added comments. Have no idea what i miss.

This is happening because you don't assign a callback to the request object that you construct in the send_to_endpoint() method.
The default callback is the parse method so all of the requests created in the send_to_endpoint method are automatically being sent to the parse method which calls the load_gsheet method for every single one of those post requests.
The solution is to either take the load_gsheet call out of the parse method, or explicitly assign a callback to all of the POST requests that isn't self.parse.

Scrapy.Request returns <GET url> without scraping anything

I wanted to scrape the feed of sitepoint.com, this is my code:
import scrapy
from urllib.parse import urljoin
class SitepointSpider(scrapy.Spider):
# TODO: Add url tags (like /javascript) to the spider based on class paraneters
name = "sitepoint"
allowed_domains = ["sitepoint.com"]
start_urls = ["http://sitepoint.com/javascript/"]
def parse(self, response):
data = []
for article in response.css("article"):
title = article.css("a.t12xxw3g::text").get()
href = article.css("a.t12xxw3g::attr(href)").get()
img = article.css("img.f13hvvvv::attr(src)").get()
time = article.css("time::text").get()
url = urljoin("https://sitepoint.com", href)
text = scrapy.Request(url, callback=self.parse_article)
data.append(
{"title": title, "href": href, "img": img, "time": time, "text": text}
)
yield data
def parse_article(self, response):
text = response.xpath(
'//*[#id="main-content"]/article/div/div/div[1]/section/text()'
).extract()
yield text
And this is the response I get:-
[{'title': 'How to Build an MVP with React and Firebase',
'href': '/react-firebase-build-mvp/',
'img': 'https://uploads.sitepoint.com/wp-content/uploads/2021/09/1632802723react-firebase-mvp-
app.jpg',
'time': 'September 28, 2021',
'text': <GET https://sitepoint.com/react-firebase-build-mvp/>}]
It just does not scrape the urls. I followed everything said in this question but still could not make it work.

You have to visit the detail page from the listing to scrape the article.
In that case you have to yield the URL first then yield the data in the last spider
Also, the //*[#id="main-content"]/article/div/div/div[1]/section/text() won't return you any text since there are lots of HTML elements under the section tag
One solution is you can scrape all the HTML element inside section tag and clean them later to get your article text data
here is the full working code
import re
import scrapy
from urllib.parse import urljoin
class SitepointSpider(scrapy.Spider):
# TODO: Add url tags (like /javascript) to the spider based on class paraneters
name = "sitepoint"
allowed_domains = ["sitepoint.com"]
start_urls = ["http://sitepoint.com/javascript/"]
def clean_text(self, raw_html):
"""
:param raw_html: this will take raw html code
:return: text without html tags
"""
cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
return re.sub(cleaner, '', raw_html)
def parse(self, response):
for article in response.css("article"):
title = article.css("a.t12xxw3g::text").get()
href = article.css("a.t12xxw3g::attr(href)").get()
img = article.css("img.f13hvvvv::attr(src)").get()
time = article.css("time::text").get()
url = urljoin("https://sitepoint.com", href)
yield scrapy.Request(url, callback=self.parse_article, meta={"title": title,
"href": href,
"img": img,
"time": time})
def parse_article(self, response):
title = response.request.meta["title"]
href = response.request.meta["href"]
img = response.request.meta["img"]
time = response.request.meta["time"]
all_data = {}
article_html = response.xpath('//*[#id="main-content"]/article/div/div/div[1]/section').get()
all_data["title"] = title
all_data["href"] = href
all_data["img"] = img
all_data["time"] = time
all_data["text"] = self.clean_text(article_html)
yield all_data

Scrapy - Can not do multiple callbacks

I am having problems going through multiple pages. Here is my class for scrapy code called quotes.
class quotes(scrapy.Spider):
name = 'quotes'
start_urls = ['http://books.toscrape.com/?']
def parse(self, response):
all_links = response.css('.nav-list ul li')
for links in all_links:
link = links.css('a::attr(href)').get()
yield response.follow(link, callback = self.books_detail)
def books_detail(self, response):
yas = {
'title':[],
'price':[],
'availability':[],
'category':[]
}
yas['category'].append(response.css('h1::text').extract())
all_divs = response.css('.col-lg-3')
for div in all_divs:
link = div.css('.product_pod a::attr(href)').get()
title = response.follow(link, callback = self.get_title)
yas['price'].append(div.css('.price_color::text').extract())
yas['availability'].append(div.css('.availability::text')[1].extract())
yield yas
def get_title(self,response):
print('testing')
title = response.css('h1::text').extract()
yield {"title":title}
So I use a response.follow to goto function books_details and in that function, I again call response.follow to call get_title. I get the 'title' from get_title and the rest of the details from the main page.
I can scrape the information just fine from the books_details function and I can get the link of the title page just fine as well from the code line.
link = div.css('.product_pod a::attr(href)').get()
But using the response.follow I can not go to the get_title function.
Any help would be appreciated. Thanks.

You should yield request, not run it directly, and use meta= to send data to next parser
yield response.follow(link, callback=self.get_title, meta={'item': yas})
and in next parser you can get it
yas = response.meta['item']
and then you can add new values and yield all data
yas["title"] = response.css('h1::text').extract()
yield yas
See other example in Scrapy yeild items from multiple requests
Doc: Request and Response, Request.meta special keys
Minimal working code which you can put in one file and run as normal script (python script.py) without creating project.
There are other changes.
You shouldn't put all books to one list but yield every book separatelly. Scrapy will keep all results and when you use option to save in csv then it will save all results.
For every book you should create new dictionary. If you use the same dictionary many time then it will ovewrite data and you may get many result with the same data.
import scrapy
class QuotesSpider(scrapy.Spider):
name = 'quotes'
start_urls = ['http://books.toscrape.com/']
def parse(self, response):
all_links = response.css('.nav-list ul li')
for links in all_links:
link = links.css('a::attr(href)').get()
yield response.follow(link, callback=self.books_detail)
def books_detail(self, response):
all_divs = response.css('.col-lg-3')
for div in all_divs:
# every book in separated dictionary and it has to be new dictionary - because it could overwrite old data
book = {
'category': response.css('h1::text').extract(),
'price': div.css('.price_color::text').extract()[0].strip(),
'availability': div.css('.availability::text')[1].extract().strip(),
}
link = div.css('.product_pod a::attr(href)').get()
yield response.follow(link, callback=self.get_title, meta={'item': book})
def get_title(self, response):
book = response.meta['item']
print('testing:', response.url)
book["title"] = response.css('h1::text').extract()[0].strip()
yield book
# --- run without project and save in `output.csv` ---
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': 'Mozilla/5.0',
# save in file CSV, JSON or XML
'FEED_FORMAT': 'csv', # csv, json, xml
'FEED_URI': 'output.csv', #
})
c.crawl(QuotesSpider)
c.start()

Scrape a single link with Scrapy

I'm scraping dior.com for its products. head/script gives me all the fields I need except for a product description. To scrape the description I need to follow the link (the url variable in the code below). The only way to do that I'm familiar with is by using BeautifulSoup. Can I parse it using only Scrapy?
Thx guys.
class DiorSpider(CrawlSpider):
name = 'dior'
allowed_domains = ['www.dior.com']
start_urls = ['https://www.dior.com/en_us/']
rules = (
Rule(LinkExtractor(allow=(r'^https?://www.dior.com/en_us/men/clothing/new-arrivals.*',)), callback='parse_file')
)
def parse_file(self, response):
script_text = response.xpath("//script[contains(., 'window.initialState')]").extract_first()
blocks = extract_blocks(script_text)
for block in blocks:
sku = re.compile(r'("sku":)"[a-zA-Z0-9_]*"').finditer(block)
url = re.compile(r'("productLink":{"uri":)"[^"]*').finditer(block)
for item in zip(sku, url):
scraped_info = {
'sku': item[0].group(0).split(':')[1].replace('"', ''),
'url': 'https://www.dior.com' + item[1].group(0).split(':')[2].replace('"', '')
}
yield scraped_info

If you need to extract additional information from a second request, instead of yielding the data there, you should yield a request for the URL that includes the information you already extracted in the Request.meta attribute.
from scrapy import Request
# …
def parse_file(self, response):
# …
for block in blocks:
# …
for item in zip(sku, url):
# …
yield Request(url, callback=self.parse_additional_information, meta={'scraped_info': scraped_info}
def parse_additional_information(self, response):
scraped_info = response.meta['scraped_info']
# extract the additional information, add it to scraped_info
yield scraped_info

Scrapy parse list of urls, open one by one and parse additional data

I am trying to parse a site, an e-store. I parse a page with products, which are loaded with ajax, get urls of these products,and then parse additional info of each product following these parced urls.
My script gets the list of first 4 items on the page, their urls, makes the request, parses add info, but then not returning into the loop and so spider closes.
Could somebody help me in solving this? I'm pretty new to this kind of stuff, and ask here when totally stuck.
Here is my code:
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http.request import Request
from scrapy_sokos.items import SokosItem
class SokosSpider(Spider):
name = "sokos"
allowed_domains = ["sokos.fi"]
base_url = "http://www.sokos.fi/fi/SearchDisplay?searchTermScope=&searchType=&filterTerm=&orderBy=8&maxPrice=&showResultsPage=true&beginIndex=%s&langId=-11&sType=SimpleSearch&metaData=&pageSize=4&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&urlLangId=-11&categoryId=295401&storeId=10151"
start_urls = [
"http://www.sokos.fi/fi/SearchDisplay?searchTermScope=&searchType=&filterTerm=&orderBy=8&maxPrice=&showResultsPage=true&beginIndex=0&langId=-11&sType=SimpleSearch&metaData=&pageSize=4&manufacturer=&resultCatEntryType=&catalogId=10051&pageView=image&searchTerm=&minPrice=&urlLangId=-11&categoryId=295401&storeId=10151",
]
for i in range(0, 8, 4):
start_urls.append((base_url) % str(i))
def parse(self, response):
products = Selector(response).xpath('//div[#class="product-listing product-grid"]/article[#class="product product-thumbnail"]')
for product in products:
item = SokosItem()
item['url'] = product.xpath('//div[#class="content"]/a[#class="image"]/#href').extract()[0]
yield Request(url = item['url'], meta = {'item': item}, callback=self.parse_additional_info)
def parse_additional_info(self, response):
item = response.meta['item']
item['name'] = Selector(response).xpath('//h1[#class="productTitle"]/text()').extract()[0].strip()
item['description'] = Selector(response).xpath('//div[#id="kuvaus"]/p/text()').extract()[0]
euro = Selector(response).xpath('//strong[#class="special-price"]/span[#class="euros"]/text()').extract()[0]
cent = Selector(response).xpath('//strong[#class="special-price"]/span[#class="cents"]/text()').extract()[0]
item['price'] = '.'.join(euro + cent)
item['number'] = Selector(response).xpath('//#data-productid').extract()[0]
yield item

The AJAX requests you are simulating are caught by the Scrapy "duplicate url filter".
Set dont_filter to True when yielding a Request:
yield Request(url=item['url'],
meta={'item': item},
callback=self.parse_additional_info,
dont_filter=True)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrape infinite scrolling websites with scrapy - python

Related

Scrapy. Every time i yield request another function is triggered as well. Cant see why

Scrapy.Request returns <GET url> without scraping anything

Scrapy - Can not do multiple callbacks

Scrape a single link with Scrapy

Scrapy parse list of urls, open one by one and parse additional data

Categories

Resources