I've been trying to scrape PDFs from pages like these:
https://www.oecd-ilibrary.org/science-and-technology/oecd-digital-economy-papers_20716826?page=4
... using BeautifulSoup to no avail.
How does one scrape the actual PDF documents?
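Judging from the selectors used in the code below, each listing page exposes the PDF links as a.action-pdf anchors inside the #collectionsort container, so the job is: fetch each listing page, collect those hrefs, and download each file. A minimal single-page sketch (the site may require extra headers or cookies before it actually serves the files):
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

mainurl = 'https://www.oecd-ilibrary.org/science-and-technology/oecd-digital-economy-papers_20716826'

# Fetch one listing page and download every PDF it links to.
r = requests.get(mainurl, params={'page': 4})
soup = BeautifulSoup(r.text, 'lxml')
for a in soup.select('#collectionsort a.action-pdf'):
    pdf_url = urljoin(mainurl, a['href'])
    fname = urlparse(pdf_url).path.split('/')[-1]
    with open(fname, 'wb') as f:
        f.write(requests.get(pdf_url).content)
A concurrent version with trio and httpx that walks listing pages 1-18: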
import trio
import httpx
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urljoin, urlparse
mainurl = 'https://www.oecd-ilibrary.org/science-and-technology/oecd-digital-economy-papers_20716826'
async def downloader(client, link, channel):
    fname = urlparse(link)[2].split('/')[-1]
    async with channel, await trio.open_file(fname, 'wb') as f:
        r = await client.get(link)
        await f.write(r.content)
        print(f'Downloaded: {link}')

async def get_links(content):
    return (urljoin(mainurl, x['href']) for x in BeautifulSoup(content, 'lxml', parse_only=SoupStrainer(
        id='collectionsort')).select('a.action-pdf'))

async def worker(channel):
    async with channel:
        async for client, page, nurse in channel:
            params = {
                'page': page
            }
            r = await client.get(mainurl, params=params)
            links = await get_links(r.text)
            for link in links:
                nurse.start_soon(downloader, client, link, channel.clone())

async def main():
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        sender, receiver = trio.open_memory_channel(0)
        async with receiver:
            for _ in range(5):
                nurse.start_soon(worker, receiver.clone())
        async with sender:
            for page in range(1, 19):
                await sender.send([client, page, nurse])

if __name__ == "__main__":
    try:
        trio.run(main)
    except KeyboardInterrupt:
        exit('Bye!')
Related
I am learning web scraping using asyncio and aiohttp with BeautifulSoup. I want to create a RESTful API that takes user input, scrapes the data, and then shows the response in JSON format. This is what my scraper code looks like:
import asyncio
import aiohttp
from bs4 import BeautifulSoup, SoupStrainer
class TestScraper:
    def __init__(self, query):
        self.query = query

    async def main(self):
        urls = [
            f"https://books.toscrape.com/catalogue/page-{self.query}.html",
            f"https://quotes.toscrape.com/page/{self.query}/",
        ]

        def get_urls(session):
            tasks = []
            for url in urls:
                tasks.append(session.get(url))
            return tasks

        async with aiohttp.ClientSession() as session:
            tasks = get_urls(session)
            responses = await asyncio.gather(*tasks)
            for r in responses:
                if (str(r.url).split(".")[0][8:]) == "books":
                    soup = BeautifulSoup(
                        await r.read(), "lxml", parse_only=SoupStrainer("article")
                    )
                    books_list = []
                    for books in soup.find_all("article"):
                        book_name = books.find("h3").find("a").get("title")
                        book_price = books.find("p", class_="price_color").text
                        books_item = {
                            "book_name": book_name,
                            "book_price": book_price,
                        }
                        books_list.append(books_item)
                    yield books_list
                elif (str(r.url).split(".")[0][8:]) == "quotes":
                    soup = BeautifulSoup(
                        await r.read(),
                        "lxml",
                        parse_only=SoupStrainer("div", {"class": "quote"}),
                    )
                    quotes_list = []
                    for quotes in soup.find_all("div", class_="quote"):
                        quote_text = quotes.find("span", class_="text").get_text()
                        quote_author = quotes.find("small", class_="author").get_text()
                        quotes_item = {
                            "quote_text": quote_text,
                            "quote_author": quote_author,
                        }
                        quotes_list.append(quotes_item)
                    yield quotes_list
                else:
                    yield "No results found"
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(TestScraper(6).main())
# asyncio.run(TestScraper({query}).main())
It works fine on its own, but when I try to use it with FastAPI it returns errors. Even after making some changes I found on the web, the errors still appear. Here is my FastAPI code:
import asyncio
from fastapi import FastAPI
from scrapers.books_quotes import TestScraper
app = FastAPI()
#app.get("/")
def root():
return {"message": "Hello World"}
#app.get("/test/{test_query}")
async def read_test_items(test_query: str):
return asyncio.run(TestScraper(test_query).main())
And the error I get:
asyncio.run() cannot be called from a running event loop
How do I solve it?
asyncio.run is meant as the top-level entry point for the async code, which the FastAPI app (or some other framework which you use to run it) should already call for you.
Normally, to run an async def function (= coroutine) from within async code, simply await it.
#app.get("/test/{test_query}")
async def read_test_items(test_query: str):
return await TestScraper(test_query).main()
In your case, TestScraper.main is not a normal coroutine but an asynchronous generator (because it uses yield statements). You run it by using it in an async for loop.
#app.get("/test/{test_query}")
async def read_test_items(test_query: str):
async for result in TestScraper(test_query).main():
# do something with result
Instead of creating a separate list for each URL in the TestScraper code, I created a single list for all URLs.
# same code as before
async with aiohttp.ClientSession() as session:
    tasks = get_urls(session)
    responses = await asyncio.gather(*tasks)
    results = []
    for r in responses:
        if (str(r.url).split(".")[0][8:]) == "books":
            soup = BeautifulSoup(
                await r.read(), "lxml", parse_only=SoupStrainer("article")
            )
            for books in soup.find_all("article"):
                book_name = books.find("h3").find("a").get("title")
                book_price = books.find("p", class_="price_color").text
                books_item = {
                    "book_name": book_name,
                    "book_price": book_price,
                }
                results.append(books_item)
        elif (str(r.url).split(".")[0][8:]) == "quotes":
            soup = BeautifulSoup(
                await r.read(),
                "lxml",
                parse_only=SoupStrainer("div", {"class": "quote"}),
            )
            for quotes in soup.find_all("div", class_="quote"):
                quote_text = quotes.find("span", class_="text").get_text()
                quote_author = quotes.find("small", class_="author").get_text()
                quotes_item = {
                    "quote_text": quote_text,
                    "quote_author": quote_author,
                }
                results.append(quotes_item)
        else:
            results.append({"error": f"No results found for {r.url}"})
    yield results
    # print(results)
# same code as before
And thanks to @mkrieger1, I changed the FastAPI file, i.e. main.py, as shown below:
# same code as before
@app.get("/test/{test_query}")
async def read_test_items(test_query: str):
    async for results in TestScraper(test_query).main():
        return results
And now everything works fine. Thanks for reading and have a nice day.
When I run:
from bs4 import BeautifulSoup
import requests
import discord
from discord.ext import tasks
client = discord.Client()
@tasks.loop(minutes=1)
async def test():
    channel = client.get_channel(973939538357522474)
    await channel.send(takip)

@client.event
async def on_ready():
    test.start()

async def takip():
    url = ""
    R = requests.get(url)
    Soup = BeautifulSoup(R.text, "html5lib")
    Title = Soup.find("h1", {"class": "pr-new-br"}).getText()
    List = Soup.find("div", {"class": "pr-bx-nm with-org-prc"})
    fiyat = List.find("span", {"class": "prc-dsc"}).getText()
    degisenfiyat = float(fiyat.replace(",", ".").replace(" TL", ""))
    if (degisenfiyat <= 200):
        print("Fiyat düştü.")

client.run("")
I get:
A "<function takip at 0x00000244A7A440D0>" message in the discord channel
I want to use channel.send with the takip function. How do I do this?
takip is a function; takip() is what your function returns.
For example, if you have this code:
def my_sum(a, b):
    return a + b

print(my_sum)
# This one prints the function
# expected result: <function my_sum at 0x7f4dc82b7d30>
print(my_sum(1, 2))
# This one prints what my function returns (so 3 here)
# expected result: 3
In your code, you're sending your function to your Discord channel. If you want to send "Fiyat düştü." when (degisenfiyat <= 200), you have to edit your code like this:
from bs4 import BeautifulSoup
import requests
import discord
from discord.ext import tasks
client = discord.Client()
@tasks.loop(minutes=1)
async def test():
    channel = client.get_channel(973939538357522474)
    await channel.send(takip())  # Change here

@client.event
async def on_ready():
    test.start()

def takip():
    url = ""
    R = requests.get(url)
    Soup = BeautifulSoup(R.text, "html5lib")
    Title = Soup.find("h1", {"class": "pr-new-br"}).getText()
    List = Soup.find("div", {"class": "pr-bx-nm with-org-prc"})
    fiyat = List.find("span", {"class": "prc-dsc"}).getText()
    degisenfiyat = float(fiyat.replace(",", ".").replace(" TL", ""))
    if (degisenfiyat <= 200):
        return "Fiyat düştü."  # Change here
    else:
        return "degisenfiyat > 200"

client.run("")
However, without the else branch, if (degisenfiyat > 200) the function won't return anything, so you would be sending None to your Discord channel. I recommend adding an else statement that returns an error or more information (e.g. "error: degisenfiyat > 200"), as done above.
You're sending the function object. You need to call the function with parentheses: ()
This line:
await channel.send(takip)
Should be
await channel.send(takip())
takip() also needs to return a value to the caller, not print() it to the terminal. Use return instead of print:
if (degisenfiyat <= 200):
    return "Fiyat düştü."
Consider this example of the function object, vs the returned value:
>>> def f():
... return "Hello"
...
>>> f()
'Hello'
>>> f
<function f at 0x103149fc0>
And finally, you need to remove the async from your function definition, as your call does not need to be asynchronous with the bot. This leaves your code at:
# ...
@tasks.loop(minutes=1)
async def test():
    channel = client.get_channel(973939538357522474)
    # Define x as the function's result before awaiting
    x = takip()
    await channel.send(x)

# ...

def takip():
    url = ""
    R = requests.get(url)
    Soup = BeautifulSoup(R.text, "html5lib")
    Title = Soup.find("h1", {"class": "pr-new-br"}).getText()
    List = Soup.find("div", {"class": "pr-bx-nm with-org-prc"})
    fiyat = List.find("span", {"class": "prc-dsc"}).getText()
    degisenfiyat = float(fiyat.replace(",", ".").replace(" TL", ""))
    if (degisenfiyat <= 200):
        return "Fiyat düştü."
    else:
        return f"Error, degisenfiyat > 200. degisenfiyat = {degisenfiyat}"

client.run("")
I made this code to extract lyrics from a website, given the artist and the song name.
The code works; the problem is that I have a DataFrame (named years_1920_2020) with 10,000 songs, and it took 1:30 h to retrieve all these lyrics.
Is there a way to do it faster?
import pandas as pd
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

def url_lyric(music, artist):
    url_list = ("https://www.letras.mus.br/", str(artist), "/", str(music), "/")
    url = ''.join(url_list)
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        webpage = urlopen(req).read()
        bs = BeautifulSoup(webpage, 'html.parser')
        lines = bs.find('div', {'class': 'cnt-letra p402_premium'})
        final_lines = lines.find_all('p')
        return final_lines
    except:
        return 0

final_lyric_series = pd.Series(name="lyrics")
for year in range(1920, 2021):
    lyrics_serie = lyrics_from_year(year)
    final_lyric_series = pd.concat([final_lyric_series, lyrics_serie])
    print(year)
The function lyrics_from_year(year) uses the function url_lyric, performs some regex (re) tasks, and returns a pd.Series with all the lyrics for the chosen year.
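For context, here is a hypothetical sketch of what lyrics_from_year might look like; it assumes the years_1920_2020 DataFrame has 'year', 'artist' and 'music' columns and it skips the regex cleanup mentioned above:
def lyrics_from_year(year):
    # Hypothetical: select the songs of one year and scrape each lyric.
    rows = years_1920_2020[years_1920_2020['year'] == year]
    lyrics = [url_lyric(row['music'], row['artist']) for _, row in rows.iterrows()]
    # The real function also does some regex cleanup on the scraped <p> tags.
    return pd.Series(lyrics, name="lyrics")
Each call to url_lyric blocks on one HTTP request, which is why the loop over 10,000 songs is slow and why the answers below parallelize the requests.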
We can get a solution using Python's asyncio module. Please refer to this article; it's not an exact solution, but it is similar to your problem.
import asyncio
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

def url_lyric(music, artist):
    pass

def lyrics_from_year(year):
    music = None
    artist = None
    return url_lyric(music, artist)

async def get_work_done():
    with ThreadPoolExecutor(max_workers=10) as executor:
        loop = asyncio.get_event_loop()
        tasks = [
            loop.run_in_executor(
                executor,
                lyrics_from_year,
                year  # argument passed to `lyrics_from_year`
            )
            for year in range(1920, 2021)
        ]
        return await asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(get_work_done())
results = loop.run_until_complete(future)

final_lyric_series = pd.Series(name="lyrics")
for result in results:
    final_lyric_series = pd.concat([final_lyric_series, result])
    print(result)
Here is a simple example of how you could do it:
import aiohttp
import asyncio
import requests, bs4

async def main():
    async with aiohttp.ClientSession() as session:
        urls = [f"https://www.letras.mus.br{x['href']}" for x in bs4.BeautifulSoup(requests.get(
            url='https://www.letras.mus.br/adele/mais-tocadas.html'
        ).content, 'html.parser').find_all('a', {'class': 'song-name'})]
        for url in urls:
            async with session.get(url) as r:
                lyrics = bs4.BeautifulSoup(await r.text(), 'html.parser').find('div', {'class': 'cnt-letra'}).text
                print('\n'.join(x.strip() for x in lyrics.strip().split('\n')))

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
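Note that the loop above still awaits each page one at a time, so the requests do not overlap. A hedged sketch of the same idea using asyncio.gather so all song pages are fetched concurrently (same site and selectors as above; the div lookup may return None if a page has a different layout):
import aiohttp
import asyncio
import requests, bs4

async def fetch_lyrics(session, url):
    # One request per song page; runs concurrently with the others.
    async with session.get(url) as r:
        soup = bs4.BeautifulSoup(await r.text(), 'html.parser')
        return soup.find('div', {'class': 'cnt-letra'}).text.strip()

async def main():
    # Collect the song links synchronously, as in the example above.
    listing = requests.get('https://www.letras.mus.br/adele/mais-tocadas.html')
    urls = [f"https://www.letras.mus.br{x['href']}"
            for x in bs4.BeautifulSoup(listing.content, 'html.parser').find_all('a', {'class': 'song-name'})]
    async with aiohttp.ClientSession() as session:
        # Fire all requests at once and wait for them together.
        return await asyncio.gather(*(fetch_lyrics(session, url) for url in urls))

all_lyrics = asyncio.run(main())
print(len(all_lyrics))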
I have a list of URLs that I want to scrape:
urls = ['https://www.netflix.com/ar/title/81488310',
        'https://www.netflix.com/ar/title/81486358',
        'https://www.netflix.com/ar/title/81486558',
        'https://www.netflix.com/ar/title/81488216',
        'https://www.netflix.com/ar/title/81488090',
        'https://www.netflix.com/ar/title/81489616',
        'https://www.netflix.com/ar/title/81487371',
        'https://www.netflix.com/ar/title/81487432',
        'https://www.netflix.com/ar/title/81485995',
        'https://www.netflix.com/ar/title/81489610',
        'https://www.netflix.com/ar/title/81488496']
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(self.main(urls)) # here i call my main function
The following methods to scrape all the URLs are:
def parse(self, html):
    ''' Normal method to parse response.text into soup '''
    soup = BeautifulSoup(html, 'html.parser')
    status = soup.find('div', class_='error-page not-found')  # contents that are not available (404)
    if not status:
        scripts = soup.find_all('script')
        return scripts
    else:
        return None

async def get_response(self, session, url):
    ''' Asynchronous requests with ClientSession '''
    try:
        async with session.get(url=url) as response:
            print(f'Successfully got url {url}.')
            return await response.text()
    except Exception as e:
        print(f"Unable to get url {url} due to {e.__class__}.")

async def response_soup(self, session, url):
    ''' Calls methods to get the response and parse it into soup
        in a separate thread '''
    html = await self.get_response(session, url)
    loop = asyncio.get_event_loop()
    soup = await loop.run_in_executor(None, self.parse, html)  # run parse(html) in a separate thread
    return soup

async def main(self, urls):
    ''' Main where we set up the semaphore and run the threads
        for all urls with ClientSession '''
    async with asyncio.BoundedSemaphore(3), aiohttp.ClientSession() as session:
        results = await asyncio.gather(*[self.response_soup(session, url) for url in urls])
        # final = [result for result in results if result != None]
        print(results)
        print("Finalized all. Return is a list of len {} outputs.".format(len(results)))
But my output is a list of empty lists, like this:
[[], [], [], [], [], ...]
(console output; there are around 17,000 URLs in the real script)
What am I doing wrong?
I cannot make async requests to the URL and get a response. From the error:
File "D:\Dev\Scripts\ol_as.py", line 28, in main
async with requests_html.AsyncHTMLSession() as session:
AttributeError: aexit
import asyncio
import requests_html
from time import time
from bs4 import BeautifulSoup

async def fetch_content(url, session):
    async with session.get(url, allow_redirects=True) as response:
        data = await response.read()
        response.html.render()
        soup = BeautifulSoup(response.html.html, 'lxml')
        txt = soup.find_all('span', {'class': 'text'})
        print(txt)

async def main():
    url = 'http://quotes.toscrape.com/js/'
    tasks = []
    async with requests_html.AsyncHTMLSession() as session:
        for i in range(10):
            tasks.append(asyncio.create_task(fetch_content(url, session)))
        await asyncio.gather(*tasks)

if __name__ == '__main__':
    t0 = time()
    asyncio.run(main())
    print(time() - t0)
You're pretty close. From experimenting, AsyncHTMLSession doesn't like being used as a context manager and passed around to different coroutines. You also need r.html.arender instead of just render.
Here's what I came up with if you want a list of quotes from a specified number of pages:
from requests_html import AsyncHTMLSession
import asyncio
import json
from itertools import chain

async def get_quotes(s, url):
    r = await s.get(url)
    await r.html.arender()
    var_data = r.html.find('script', containing='var data', first=True).text
    # this part could be improved, I'm basically isolating the json-rendered bit:
    *_, var_data = var_data.split('var data =')
    var_data, *_ = var_data.split('; for (var i in data)')
    data = json.loads(var_data)
    quotes = [post['text'] for post in data]
    return quotes

async def main(max_pages=1):
    s = AsyncHTMLSession()
    tasks = []
    for page in range(1, max_pages + 1):
        url = f'http://quotes.toscrape.com/js/page/{page}'
        tasks.append(get_quotes(s, url))
    results = await asyncio.gather(*tasks)
    return list(chain(*(res for res in results)))
all_quotes = asyncio.run(main(5))
print(all_quotes)