How to make Web Scraping faster? - python

I wrote this code to extract lyrics from a website, given the artist and the song name.
The code works, but I have a DataFrame (named years_1920_2020) with 10,000 songs, and it took about an hour and a half to retrieve all the lyrics.
Is there a way to do it faster?
from urllib.request import Request, urlopen

import pandas as pd
from bs4 import BeautifulSoup


def url_lyric(music, artist):
    url_list = ("https://www.letras.mus.br/", str(artist), "/", str(music), "/")
    url = ''.join(url_list)
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        webpage = urlopen(req).read()
        bs = BeautifulSoup(webpage, 'html.parser')
        lines = bs.find('div', {'class': 'cnt-letra p402_premium'})
        final_lines = lines.find_all('p')
        return final_lines
    except:
        return 0
final_lyric_series = pd.Series(name="lyrics")
for year in range(1920, 2021):
    lyrics_serie = lyrics_from_year(year)
    final_lyric_series = pd.concat([final_lyric_series, lyrics_serie])
    print(year)
The function lyrics_from_year(year) uses url_lyric, performs some re tasks, and returns a pd.Series with all the lyrics of the chosen year.

We can solve this with Python's asyncio module. Please refer to this article; it's not an exact solution, but it tackles a problem similar to yours.
import asyncio
from concurrent.futures import ThreadPoolExecutor

import pandas as pd


def url_lyric(music, artist):
    pass


def lyrics_from_year(year):
    music = None
    artist = None
    return url_lyric(music, artist)


async def get_work_done():
    with ThreadPoolExecutor(max_workers=10) as executor:
        loop = asyncio.get_event_loop()
        tasks = [
            loop.run_in_executor(
                executor,
                lyrics_from_year,
                year,  # passes the year as the argument to `lyrics_from_year`
            )
            for year in range(1920, 2021)
        ]
        return await asyncio.gather(*tasks)


loop = asyncio.get_event_loop()
future = asyncio.ensure_future(get_work_done())
results = loop.run_until_complete(future)

final_lyric_series = pd.Series(name="lyrics")
for result in results:
    final_lyric_series = pd.concat([final_lyric_series, result])
    print(result)
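Since lyrics_from_year is plain blocking I/O, a bare ThreadPoolExecutor achieves the same speed-up without the asyncio plumbing. A minimal sketch, assuming lyrics_from_year from the question returns a pd.Series:

from concurrent.futures import ThreadPoolExecutor

import pandas as pd

# lyrics_from_year(year) -> pd.Series is assumed to be the question's function.
with ThreadPoolExecutor(max_workers=10) as executor:
    # executor.map preserves input order, so the results line up with the years
    yearly_series = list(executor.map(lyrics_from_year, range(1920, 2021)))

final_lyric_series = pd.concat(yearly_series)
final_lyric_series.name = "lyrics"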

Here is a simple example of how you could do it:
import asyncio

import aiohttp
import requests, bs4


async def main():
    async with aiohttp.ClientSession() as session:
        urls = [
            f"https://www.letras.mus.br{x['href']}"
            for x in bs4.BeautifulSoup(
                requests.get('https://www.letras.mus.br/adele/mais-tocadas.html').content,
                'html.parser',
            ).find_all('a', {'class': 'song-name'})
        ]
        for url in urls:
            async with session.get(url) as r:
                lyrics = bs4.BeautifulSoup(await r.text(), 'html.parser').find('div', {'class': 'cnt-letra'}).text
                print('\n'.join(x.strip() for x in lyrics.strip().split('\n')))


loop = asyncio.get_event_loop()
loop.run_until_complete(main())
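Note that the loop above still awaits the lyric pages one at a time. A sketch of a fully concurrent variant, using the same selectors but hypothetical fetch_lyrics/fetch_all helper names:

import asyncio

import aiohttp
import bs4


async def fetch_lyrics(session, url):
    # Download one lyrics page and return its text, or None if the block is missing.
    async with session.get(url) as r:
        soup = bs4.BeautifulSoup(await r.text(), 'html.parser')
        block = soup.find('div', {'class': 'cnt-letra'})
        return block.text.strip() if block else None


async def fetch_all(urls):
    # Fire off every request at once and gather the results in input order.
    async with aiohttp.ClientSession(headers={'User-Agent': 'Mozilla/5.0'}) as session:
        return await asyncio.gather(*(fetch_lyrics(session, url) for url in urls))

# usage: all_lyrics = asyncio.run(fetch_all(urls))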

Related

How to use asyncio, aiohttp web scraper with fastapi?

I am learning web scraping using asyncio and aiohttp with BeautifulSoup. I want to create a RESTful API that takes user input, scrapes the data, and then returns the response in JSON format. This is what my scraper code looks like:
import asyncio

import aiohttp
from bs4 import BeautifulSoup, SoupStrainer


class TestScraper:
    def __init__(self, query):
        self.query = query

    async def main(self):
        urls = [
            f"https://books.toscrape.com/catalogue/page-{self.query}.html",
            f"https://quotes.toscrape.com/page/{self.query}/",
        ]

        def get_urls(session):
            tasks = []
            for url in urls:
                tasks.append(session.get(url))
            return tasks

        async with aiohttp.ClientSession() as session:
            tasks = get_urls(session)
            responses = await asyncio.gather(*tasks)
            for r in responses:
                if (str(r.url).split(".")[0][8:]) == "books":
                    soup = BeautifulSoup(
                        await r.read(), "lxml", parse_only=SoupStrainer("article")
                    )
                    books_list = []
                    for books in soup.find_all("article"):
                        book_name = books.find("h3").find("a").get("title")
                        book_price = books.find("p", class_="price_color").text
                        books_item = {
                            "book_name": book_name,
                            "book_price": book_price,
                        }
                        books_list.append(books_item)
                    yield books_list
                elif (str(r.url).split(".")[0][8:]) == "quotes":
                    soup = BeautifulSoup(
                        await r.read(),
                        "lxml",
                        parse_only=SoupStrainer("div", {"class": "quote"}),
                    )
                    quotes_list = []
                    for quotes in soup.find_all("div", class_="quote"):
                        quote_text = quotes.find("span", class_="text").get_text()
                        quote_author = quotes.find("small", class_="author").get_text()
                        quotes_item = {
                            "quote_text": quote_text,
                            "quote_author": quote_author,
                        }
                        quotes_list.append(quotes_item)
                    yield quotes_list
                else:
                    yield "No results found"


asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(TestScraper(6).main())
# asyncio.run(TestScraper({query}).main())
It works fine on its own, but when I try to use it with FastAPI it returns errors. Even after making some changes I found on the web, the errors still appear. Here is my FastAPI code:
import asyncio

from fastapi import FastAPI
from scrapers.books_quotes import TestScraper

app = FastAPI()


@app.get("/")
def root():
    return {"message": "Hello World"}


@app.get("/test/{test_query}")
async def read_test_items(test_query: str):
    return asyncio.run(TestScraper(test_query).main())
And the error I get:
asyncio.run() cannot be called from a running event loop
How do I solve it?
asyncio.run is meant as the top-level entry point for the async code, which the FastAPI app (or some other framework which you use to run it) should already call for you.
Normally, to run an async def function (= coroutine) from within async code, simply await it.
@app.get("/test/{test_query}")
async def read_test_items(test_query: str):
    return await TestScraper(test_query).main()
In your case, TestScraper.main is not a normal coroutine but an asynchronous generator (because it uses yield statements). You run it by using it in an async for loop.
@app.get("/test/{test_query}")
async def read_test_items(test_query: str):
    async for result in TestScraper(test_query).main():
        ...  # do something with result
Instead of creating a separate list for each URL in the TestScraper code, I created a single list for all URLs:
# same code as before
async with aiohttp.ClientSession() as session:
    tasks = get_urls(session)
    responses = await asyncio.gather(*tasks)

    results = []
    for r in responses:
        if (str(r.url).split(".")[0][8:]) == "books":
            soup = BeautifulSoup(
                await r.read(), "lxml", parse_only=SoupStrainer("article")
            )
            for books in soup.find_all("article"):
                book_name = books.find("h3").find("a").get("title")
                book_price = books.find("p", class_="price_color").text
                books_item = {
                    "book_name": book_name,
                    "book_price": book_price,
                }
                results.append(books_item)
        elif (str(r.url).split(".")[0][8:]) == "quotes":
            soup = BeautifulSoup(
                await r.read(),
                "lxml",
                parse_only=SoupStrainer("div", {"class": "quote"}),
            )
            for quotes in soup.find_all("div", class_="quote"):
                quote_text = quotes.find("span", class_="text").get_text()
                quote_author = quotes.find("small", class_="author").get_text()
                quotes_item = {
                    "quote_text": quote_text,
                    "quote_author": quote_author,
                }
                results.append(quotes_item)
        else:
            results.append({"error": f"No results found for {r.url}"})
    yield results
    # print(results)
# same code as before
And thanks to @mkrieger1, I changed the FastAPI file (main.py) as shown below:
# same code as before
@app.get("/test/{test_query}")
async def read_test_items(test_query: str):
    async for results in TestScraper(test_query).main():
        return results
And now everything works fine. Thanks for reading and have a nice day.
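One caveat (my note, not from the original post): the endpoint above returns only the first batch the async generator yields, which is enough here because the modified TestScraper.main yields a single combined list. If it ever yielded more than once, you could collect everything instead; a small sketch assuming the same TestScraper:

@app.get("/test/{test_query}")
async def read_test_items(test_query: str):
    # Gather every batch the async generator yields, not just the first one.
    all_results = []
    async for batch in TestScraper(test_query).main():
        all_results.extend(batch)
    return all_results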

Why is my asyncio func returning a list of lists (null)?

I have a list of URLs that I want to scrape:
urls = [
    'https://www.netflix.com/ar/title/81488310',
    'https://www.netflix.com/ar/title/81486358',
    'https://www.netflix.com/ar/title/81486558',
    'https://www.netflix.com/ar/title/81488216',
    'https://www.netflix.com/ar/title/81488090',
    'https://www.netflix.com/ar/title/81489616',
    'https://www.netflix.com/ar/title/81487371',
    'https://www.netflix.com/ar/title/81487432',
    'https://www.netflix.com/ar/title/81485995',
    'https://www.netflix.com/ar/title/81489610',
    'https://www.netflix.com/ar/title/81488496',
]

asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(self.main(urls))  # here I call my main function
The methods I use to scrape all the URLs are the following:
def parse(self, html):
    ''' Normal method to parse response.text into soup '''
    soup = BeautifulSoup(html, 'html.parser')
    status = soup.find('div', class_='error-page not-found')  # content that is not available (404)
    if not status:
        scripts = soup.find_all('script')
        return scripts
    else:
        return None

async def get_response(self, session, url):
    ''' Asynchronous requests with ClientSession '''
    try:
        async with session.get(url=url) as response:
            print(f'Successfully got url {url}.')
            return await response.text()
    except Exception as e:
        print(f"Unable to get url {url} due to {e.__class__}.")

async def response_soup(self, session, url):
    ''' Calls methods to get the response and parse it into soup
        in a separate thread '''
    html = await self.get_response(session, url)
    loop = asyncio.get_event_loop()
    soup = await loop.run_in_executor(None, self.parse, html)  # run parse(html) in a separate thread
    return soup

async def main(self, urls):
    ''' Main where we set up the semaphore and run the threads
        for all urls with ClientSession '''
    async with asyncio.BoundedSemaphore(3), aiohttp.ClientSession() as session:
        results = await asyncio.gather(*[self.response_soup(session, url) for url in urls])
        # final = [result for result in results if result != None]
        print(results)
        print("Finalized all. Return is a list of len {} outputs.".format(len(results)))
But my output is a list of empty lists, like this:
[[], [], [], [], [], [], ...]
(console output; there are about 17,000 URLs in the real script)
What am I doing wrong?

How to order the result while web scraping with multiprocessing?

I am writing a program that scrapes data from multiple URLs using multiprocessing. I store all the URLs in the bonds_url list. It works and I get output, but the problem is that the output comes back in random order. I want the scraped data to be in the same order as the URLs in bonds_url.
Is there any solution for that?
from multiprocessing import Pool

from requests_html import HTMLSession
import constants

bonds_url = []


def f(url):
    session = HTMLSession()
    response = session.get(url)
    try:
        data = [i.text.strip() for i in response.html.find(".value")]
        bonds_values.append(float(data[0]))
        print(data[0])
    except:
        data = [i.text.strip() for i in response.html.find("div>span[data-reactid='31']")]
        bonds_values.append(float(data[0]))
        print(data[0])


if __name__ == '__main__':
    with Pool(len(bonds_url)) as p:
        p.map(f, bonds_url)
Solution
Change the prints in f to returns; multiprocessing.Pool.map then gives you the results in the same order as the input URLs.
from multiprocessing import Pool

from requests_html import HTMLSession
import constants

bonds_url = []


def f(url):
    session = HTMLSession()
    response = session.get(url)
    try:
        data = [i.text.strip() for i in response.html.find(".value")]
        return float(data[0])
    except:
        data = [i.text.strip() for i in response.html.find("div>span[data-reactid='31']")]
        return float(data[0])


if __name__ == '__main__':
    with Pool(len(bonds_url)) as p:
        bond_values = p.map(f, bonds_url)
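As a quick sanity check (a toy example, not part of the original code), Pool.map keeps results in input order even when the workers finish out of order:

from multiprocessing import Pool
import random
import time


def work(x):
    time.sleep(random.random())  # finish at a random time
    return x * x


if __name__ == '__main__':
    with Pool(4) as p:
        print(p.map(work, [3, 1, 2]))  # always prints [9, 1, 4]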

How to pass arguments to an async function while using requests-html

I want to build the link in the async function by passing arguments from my getDaraz function.
from requests_html import AsyncHTMLSession

asession = AsyncHTMLSession()


async def get_daraz_page(keyword, page_no):
    template_link = 'https://www.daraz.com.np/catalog/?_keyori=ss&from=input&page={page_no}&q={keyword}&spm=a2a0e.11779170.search.go.287d2d2bVToBsh'
    r = await asession.get(template_link)
    return r


def getDaraz(search):
    results = asession.run(get_daraz_page(search, 1))
    print(results)


getDaraz("Mouse")
It gives me the following error:
TypeError: 'coroutine' object is not callable
sys:1: RuntimeWarning: coroutine 'get_daraz_page' was never awaited
Thank you.
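For what it's worth, the TypeError most likely comes from AsyncHTMLSession.run expecting callables rather than already-created coroutine objects. A minimal sketch of a fix, keeping the names from the question and also turning template_link into an f-string so the arguments are actually interpolated:

from requests_html import AsyncHTMLSession

asession = AsyncHTMLSession()


async def get_daraz_page(keyword, page_no):
    # f-string so page_no and keyword are actually substituted into the URL
    link = (f'https://www.daraz.com.np/catalog/?_keyori=ss&from=input'
            f'&page={page_no}&q={keyword}')
    return await asession.get(link)


def getDaraz(search):
    # run() wants callables that return coroutines, so wrap the call in a lambda
    results = asession.run(lambda: get_daraz_page(search, 1))
    print(results)


getDaraz("Mouse")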
With requests-html I got no links at all, so I suggest BeautifulSoup, Requests, and json instead. Here is my code.
'''
Build links list.
With BeautifulSoup, Requests, and Json.
To avoid ban/block, delay for 10 to 20 seconds randomly after requests.
'''
import requests
from bs4 import BeautifulSoup
import random
import time
import json


def delay():
    # Delay for 10 to 20 seconds randomly.
    sleep = random.randint(10, 20)
    time.sleep(sleep)


def make_soup(url, parser):
    response = requests.get(url)
    delay()
    data = response.text
    if parser == 'html.parser':
        soup = BeautifulSoup(data, 'html.parser')
    else:
        soup = BeautifulSoup(data, 'lxml')
    return soup


def build_list():
    url = 'https://www.daraz.com.np/catalog/?q=mouse&_keyori=ss&from=input&spm=a2a0e.11779170.search.go.287d2d2bd0IOUA'
    parser = 'html.parser'
    soup = make_soup(url, parser)

    json_tags = soup.find_all('script', {'type': 'application/ld+json'})[1].string
    json_data = json.loads(json_tags)
    links = []
    for item in json_data['itemListElement']:
        links.append(item['url'])
        print(item['url'])


if __name__ == '__main__':
    build_list()
And this is the result:
https://www.daraz.com.np/products/micropack-excalibur-gaming-wired-mouse-g-860-i104220875.html
https://www.daraz.com.np/products/fantech-x15-phantom-wired-gaming-mouse-i100928540.html
https://www.daraz.com.np/products/redragon-m801-pc-gaming-mouse-led-rgb-backlit-mmo-9-programmable-buttons-mouse-with-macro-recording-side-buttons-rapid-fire-button-for-windows-computer-gamer-wired-black-i104116249.html
https://www.daraz.com.np/products/fantech-x16-gaming-mouse-4200-dpi-adjustable-optical-cable-mouse-6-button-macro-for-mouse-gamer-fps-lol-ergonomic-mouse-i103259161.html
https://www.daraz.com.np/products/fantech-t532-wired-mouse-i100928918.html
https://www.daraz.com.np/products/fantech-w188-24ghz-wireless-mouse-i100934719.html
https://www.daraz.com.np/products/fantech-x5s-zeus-computer-wired-mouse-4800-dpi-usb-optical-pc-gaming-mouse-6d-for-pclaptop-i100007184.html
https://www.daraz.com.np/products/dell-optical-mouse-black-i104330431.html
https://www.daraz.com.np/products/fantech-raigor-ii-wg10-gaming-mouse-i103261633.html
https://www.daraz.com.np/products/fantech-w189-24ghz-wireless-mouse-i100924993.html
https://www.daraz.com.np/products/jedel-6d-optical-gaming-mouse-for-computerpc-laptop-with-led-infrared-micro-6d-dpi-adjustment-i103209858.html
https://www.daraz.com.np/products/jedel-usb-optical-mouse-i100366102.html
https://www.daraz.com.np/products/generic-24ghz-1200dpi-wireless-optical-mouse-usb-rolling-car-model-mouse-for-tablet-pc-i103147712.html
https://www.daraz.com.np/products/sunsonny-s-m3s-gaming-mouse-i103218451.html
https://www.daraz.com.np/products/razer-deathadder-multi-color-ergonomic-gaming-mouse-comfortable-grip-worlds-most-popular-gaming-mouse-i104160830.html
https://www.daraz.com.np/products/logitech-b170-wireless-optical-mouse-910-004659-i14400.html
https://www.daraz.com.np/products/micropack-m101-mouse-optical-i30608.html
https://www.daraz.com.np/products/dell-usb-optical-wired-mouse-m5111-i103237876.html
https://www.daraz.com.np/products/redragon-ranger-m910-wired-gaming-mouse-12400-dpi-i104256717.html
https://www.daraz.com.np/products/dell-usb-optical-wired-mouse-m360-i104838908.html
https://www.daraz.com.np/products/limeidi-x1-24ghz-2400dpi-wireless-rechargeable-gaming-mouse-backlight-i103299466.html
https://www.daraz.com.np/products/24g-best-quality-wireless-optical-mouse-assorted-color-i103331286.html
https://www.daraz.com.np/products/redragon-m705-high-performance-wired-gaming-mouse-i104278047.html
https://www.daraz.com.np/products/fantech-wgc1-wireless-mouse-charging-design-rgb-and-2400dpi-adjustable-gaming-mouse-pixart-3212-game-chips-for-mouse-gamer-i103255259.html
https://www.daraz.com.np/products/jeqang-wired-usb-gaming-mouse-i101114219.html
https://www.daraz.com.np/products/dell-24g-best-quality-wireless-optical-mouse-i104032175.html
https://www.daraz.com.np/products/gloross-g501-gaming-mouse-with-mouse-pad-i101672317.html
https://www.daraz.com.np/products/fantech-hive-ux2-gaming-mouse-i104210128.html
https://www.daraz.com.np/products/r8-a6-wireless-bluetooth-charging-mouse-with-rgb-i104816388.html
https://www.daraz.com.np/products/dell-usb-optical-wired-mouse-m5111-i104862052.html
https://www.daraz.com.np/products/lenovo-mini-optical-mouse-i100824202.html
https://www.daraz.com.np/products/jedel-gaming-mouse-gm740-original-i104798245.html
https://www.daraz.com.np/products/jedel-w450-wireless-optical-mouse-1000-dpi-i104776750.html
https://www.daraz.com.np/products/r8-1611-led-accurate-gaming-mouse-i404316.html
https://www.daraz.com.np/products/jedel-mst-1080g-2-usb-optical-mouse-black-i176104.html
https://www.daraz.com.np/products/dell-usb-optical-wired-mouse-m5111-i104546789.html
https://www.daraz.com.np/products/prolink-wireless-optical-mouse-pmw6005-i100838336.html
https://www.daraz.com.np/products/24-ghz-wireless-mouse-with-usb-20-reciever-i100680189.html
https://www.daraz.com.np/products/wiwu-wm101-bluetooth-wireless-rechargeable-mouse-i104868803.html
https://www.daraz.com.np/products/black-wireless-mouse-i112671.html

Requests-html: AttributeError: __aexit__ error with asyncio, how to fix?

I cannot make async requests to the URL; instead of a response I get this error:
File "D:\Dev\Scripts\ol_as.py", line 28, in main
async with requests_html.AsyncHTMLSession() as session:
AttributeError: aexit
import asyncio
from time import time

import requests_html
from bs4 import BeautifulSoup


async def fetch_content(url, session):
    async with session.get(url, allow_redirects=True) as response:
        data = await response.read()
        response.html.render()
        soup = BeautifulSoup(response.html.html, 'lxml')
        txt = soup.find_all('span', {'class': 'text'})
        print(txt)


async def main():
    url = 'http://quotes.toscrape.com/js/'
    tasks = []
    async with requests_html.AsyncHTMLSession() as session:
        for i in range(10):
            tasks.append(asyncio.create_task(fetch_content(url, session)))
        await asyncio.gather(*tasks)


if __name__ == '__main__':
    t0 = time()
    asyncio.run(main())
    print(time() - t0)
You're pretty close. From experimenting, AsyncHTMLSession doesn't like to be used as a context manager and passed around to different coroutines. You also need r.html.arender instead of just render.
Here's what I came up with if you want a list of quotes from a specified number of pages:
from requests_html import AsyncHTMLSession
import asyncio
import json
from itertools import chain


async def get_quotes(s, url):
    r = await s.get(url)
    await r.html.arender()
    var_data = r.html.find('script', containing='var data', first=True).text
    # this part could be improved, I'm basically isolating the json rendered bit:
    *_, var_data = var_data.split('var data =')
    var_data, *_ = var_data.split('; for (var i in data)')
    data = json.loads(var_data)
    quotes = [post['text'] for post in data]
    return quotes


async def main(max_pages=1):
    s = AsyncHTMLSession()
    tasks = []
    for page in range(1, max_pages + 1):
        url = f'http://quotes.toscrape.com/js/page/{page}'
        tasks.append(get_quotes(s, url))
    results = await asyncio.gather(*tasks)
    return list(chain(*results))


all_quotes = asyncio.run(main(5))
print(all_quotes)
