How to pass arguments to an async function while using requests-html - python

I want to build the link in the async function by passing arguments from my getDaraz function.
from requests_html import AsyncHTMLSession

asession = AsyncHTMLSession()

async def get_daraz_page(keyword, page_no):
    template_link = 'https://www.daraz.com.np/catalog/?_keyori=ss&from=input&page={page_no}&q={keyword}&spm=a2a0e.11779170.search.go.287d2d2bVToBsh'
    r = await asession.get(template_link)
    return r

def getDaraz(search):
    results = asession.run(get_daraz_page(search, 1))
    print(results)

getDaraz("Mouse")
It gives me the following error:
TypeError: 'coroutine' object is not callable
sys:1: RuntimeWarning: coroutine 'get_daraz_page' was never awaited
Thank you.
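For what it's worth, the TypeError happens because AsyncHTMLSession.run calls each argument it receives, so it expects coroutine functions (callables), not coroutine objects. A minimal sketch of one fix, assuming that behaviour, is to wrap the call in a lambda (or functools.partial) and make the URL an f-string so the arguments are actually substituted:

from requests_html import AsyncHTMLSession

asession = AsyncHTMLSession()

async def get_daraz_page(keyword, page_no):
    # f-string so keyword and page_no end up in the URL
    template_link = f'https://www.daraz.com.np/catalog/?_keyori=ss&from=input&page={page_no}&q={keyword}&spm=a2a0e.11779170.search.go.287d2d2bVToBsh'
    r = await asession.get(template_link)
    return r

def getDaraz(search):
    # run() expects callables; wrap the coroutine call in a lambda
    results = asession.run(lambda: get_daraz_page(search, 1))
    print(results)

getDaraz("Mouse")

Note that run() returns a list of results, one per coroutine passed in, so results here should be a list containing the single response.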

Using requests-html, I got no links at all.
I suggest BeautifulSoup, Requests, and json. Here is my code.
'''
Build links list.
With BeautifulSoup, Requests, and json.
To avoid a ban/block, delay for 10 to 20 seconds randomly after each request.
'''
import requests
from bs4 import BeautifulSoup
import random
import time
import json

def delay():
    # Delay for 10 to 20 seconds randomly.
    sleep = random.randint(10, 20)
    time.sleep(sleep)

def make_soup(url, parser):
    response = requests.get(url)
    delay()
    data = response.text
    if parser == 'html.parser':
        soup = BeautifulSoup(data, 'html.parser')
    else:
        soup = BeautifulSoup(data, 'lxml')
    return soup

def build_list():
    url = 'https://www.daraz.com.np/catalog/?q=mouse&_keyori=ss&from=input&spm=a2a0e.11779170.search.go.287d2d2bd0IOUA'
    parser = 'html.parser'
    soup = make_soup(url, parser)
    # The product data is embedded in the second application/ld+json script tag.
    json_tags = soup.find_all('script', {'type': 'application/ld+json'})[1].string
    json_data = json.loads(json_tags)
    links = []
    for item in json_data['itemListElement']:
        links.append(item['url'])
        print(item['url'])
    return links

if __name__ == '__main__':
    build_list()
And this is the result.
https://www.daraz.com.np/products/micropack-excalibur-gaming-wired-mouse-g-860-i104220875.html
https://www.daraz.com.np/products/fantech-x15-phantom-wired-gaming-mouse-i100928540.html
https://www.daraz.com.np/products/redragon-m801-pc-gaming-mouse-led-rgb-backlit-mmo-9-programmable-buttons-mouse-with-macro-recording-side-buttons-rapid-fire-button-for-windows-computer-gamer-wired-black-i104116249.html
https://www.daraz.com.np/products/fantech-x16-gaming-mouse-4200-dpi-adjustable-optical-cable-mouse-6-button-macro-for-mouse-gamer-fps-lol-ergonomic-mouse-i103259161.html
https://www.daraz.com.np/products/fantech-t532-wired-mouse-i100928918.html
https://www.daraz.com.np/products/fantech-w188-24ghz-wireless-mouse-i100934719.html
https://www.daraz.com.np/products/fantech-x5s-zeus-computer-wired-mouse-4800-dpi-usb-optical-pc-gaming-mouse-6d-for-pclaptop-i100007184.html
https://www.daraz.com.np/products/dell-optical-mouse-black-i104330431.html
https://www.daraz.com.np/products/fantech-raigor-ii-wg10-gaming-mouse-i103261633.html
https://www.daraz.com.np/products/fantech-w189-24ghz-wireless-mouse-i100924993.html
https://www.daraz.com.np/products/jedel-6d-optical-gaming-mouse-for-computerpc-laptop-with-led-infrared-micro-6d-dpi-adjustment-i103209858.html
https://www.daraz.com.np/products/jedel-usb-optical-mouse-i100366102.html
https://www.daraz.com.np/products/generic-24ghz-1200dpi-wireless-optical-mouse-usb-rolling-car-model-mouse-for-tablet-pc-i103147712.html
https://www.daraz.com.np/products/sunsonny-s-m3s-gaming-mouse-i103218451.html
https://www.daraz.com.np/products/razer-deathadder-multi-color-ergonomic-gaming-mouse-comfortable-grip-worlds-most-popular-gaming-mouse-i104160830.html
https://www.daraz.com.np/products/logitech-b170-wireless-optical-mouse-910-004659-i14400.html
https://www.daraz.com.np/products/micropack-m101-mouse-optical-i30608.html
https://www.daraz.com.np/products/dell-usb-optical-wired-mouse-m5111-i103237876.html
https://www.daraz.com.np/products/redragon-ranger-m910-wired-gaming-mouse-12400-dpi-i104256717.html
https://www.daraz.com.np/products/dell-usb-optical-wired-mouse-m360-i104838908.html
https://www.daraz.com.np/products/limeidi-x1-24ghz-2400dpi-wireless-rechargeable-gaming-mouse-backlight-i103299466.html
https://www.daraz.com.np/products/24g-best-quality-wireless-optical-mouse-assorted-color-i103331286.html
https://www.daraz.com.np/products/redragon-m705-high-performance-wired-gaming-mouse-i104278047.html
https://www.daraz.com.np/products/fantech-wgc1-wireless-mouse-charging-design-rgb-and-2400dpi-adjustable-gaming-mouse-pixart-3212-game-chips-for-mouse-gamer-i103255259.html
https://www.daraz.com.np/products/jeqang-wired-usb-gaming-mouse-i101114219.html
https://www.daraz.com.np/products/dell-24g-best-quality-wireless-optical-mouse-i104032175.html
https://www.daraz.com.np/products/gloross-g501-gaming-mouse-with-mouse-pad-i101672317.html
https://www.daraz.com.np/products/fantech-hive-ux2-gaming-mouse-i104210128.html
https://www.daraz.com.np/products/r8-a6-wireless-bluetooth-charging-mouse-with-rgb-i104816388.html
https://www.daraz.com.np/products/dell-usb-optical-wired-mouse-m5111-i104862052.html
https://www.daraz.com.np/products/lenovo-mini-optical-mouse-i100824202.html
https://www.daraz.com.np/products/jedel-gaming-mouse-gm740-original-i104798245.html
https://www.daraz.com.np/products/jedel-w450-wireless-optical-mouse-1000-dpi-i104776750.html
https://www.daraz.com.np/products/r8-1611-led-accurate-gaming-mouse-i404316.html
https://www.daraz.com.np/products/jedel-mst-1080g-2-usb-optical-mouse-black-i176104.html
https://www.daraz.com.np/products/dell-usb-optical-wired-mouse-m5111-i104546789.html
https://www.daraz.com.np/products/prolink-wireless-optical-mouse-pmw6005-i100838336.html
https://www.daraz.com.np/products/24-ghz-wireless-mouse-with-usb-20-reciever-i100680189.html
https://www.daraz.com.np/products/wiwu-wm101-bluetooth-wireless-rechargeable-mouse-i104868803.html
https://www.daraz.com.np/products/black-wireless-mouse-i112671.html
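Not part of the original answer, but to tie this back to the question of passing keyword and page_no: the hard-coded URL above can be parameterized with an f-string. A minimal sketch, reusing make_soup from the code above (build_list_for is a hypothetical helper name):

from urllib.parse import quote_plus

def build_list_for(keyword, page_no=1):
    # Same approach as build_list(), but the query and page come from arguments.
    url = f'https://www.daraz.com.np/catalog/?q={quote_plus(keyword)}&page={page_no}&_keyori=ss&from=input'
    soup = make_soup(url, 'html.parser')
    json_tags = soup.find_all('script', {'type': 'application/ld+json'})[1].string
    json_data = json.loads(json_tags)
    return [item['url'] for item in json_data['itemListElement']]

# usage:
# for link in build_list_for('Mouse', 1):
#     print(link)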


How to make Web Scraping faster?

I made this code to extract lyrics from a website, given the artist and the song name.
The code is working; the problem is that I have a DataFrame (named years_1920_2020) with 10000 songs, and it took 1.5 hours to retrieve all these lyrics.
Is there a way to do it faster?
import pandas as pd
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

def url_lyric(music, artist):
    url_list = ("https://www.letras.mus.br/", str(artist), "/", str(music), "/")
    url = ''.join(url_list)
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        webpage = urlopen(req).read()
        bs = BeautifulSoup(webpage, 'html.parser')
        lines = bs.find('div', {'class': 'cnt-letra p402_premium'})
        final_lines = lines.find_all('p')
        return final_lines
    except:
        return 0

final_lyric_series = pd.Series(name="lyrics")
for year in range(1920, 2021):
    lyrics_serie = lyrics_from_year(year)
    final_lyric_series = pd.concat([final_lyric_series, lyrics_serie])
    print(year)
The function lyrics_from_year(year) uses url_lyric, performs some re tasks, and returns a pd.Series with all the lyrics for the chosen year.
We can get a solution using Python's asyncio module. Please refer to this article; it's not an exact solution, but it is similar to your problem.
import asyncio
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

def url_lyric(music, artist):
    pass

def lyrics_from_year(year):
    music = None
    artist = None
    return url_lyric(music, artist)

async def get_work_done():
    with ThreadPoolExecutor(max_workers=10) as executor:
        loop = asyncio.get_event_loop()
        tasks = [
            loop.run_in_executor(
                executor,
                lyrics_from_year,
                year  # Allows us to pass in arguments to `lyrics_from_year`
            )
            for year in range(1920, 2021)
        ]
        return await asyncio.gather(*tasks)

loop = asyncio.get_event_loop()
results = loop.run_until_complete(get_work_done())

final_lyric_series = pd.Series(name="lyrics")
for result in results:
    final_lyric_series = pd.concat([final_lyric_series, result])
    print(result)
Here is a simple example of how you could do it:
import aiohttp
import asyncio
import requests, bs4

async def main():
    async with aiohttp.ClientSession() as session:
        urls = [f"https://www.letras.mus.br{x['href']}" for x in bs4.BeautifulSoup(requests.get(
            url = 'https://www.letras.mus.br/adele/mais-tocadas.html'
        ).content, 'html.parser').find_all('a', {'class':'song-name'})]
        for url in urls:
            async with session.get(url) as r:
                lyrics = bs4.BeautifulSoup(await r.text(), 'html.parser').find('div', {'class':'cnt-letra'}).text
                print('\n'.join(x.strip() for x in lyrics.strip().split('\n')))

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
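Note that the loop above still awaits each page one at a time. A sketch of how the fetches could actually overlap, assuming the same letras.mus.br markup (fetch_lyrics is an illustrative helper name, not from the answer above), is to gather the requests:

import aiohttp
import asyncio
import bs4

async def fetch_lyrics(session, url):
    # Fetch one song page and pull the lyrics block out of the HTML.
    async with session.get(url) as r:
        soup = bs4.BeautifulSoup(await r.text(), 'html.parser')
        block = soup.find('div', {'class': 'cnt-letra'})
        return block.text.strip() if block else ''

async def main(urls):
    async with aiohttp.ClientSession() as session:
        # Schedule every request at once and wait for all of them.
        return await asyncio.gather(*(fetch_lyrics(session, url) for url in urls))

# usage: lyrics = asyncio.get_event_loop().run_until_complete(main(urls))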

Unable to print results from a function while using concurrent.futures in some customized way

I've created a script using the concurrent.futures library to print the results from the fetch_links function. When I use a print statement inside the function, I get the results accordingly. What I wish to do now is print the results from that function using a yield statement.
Is there any way I can modify things under the main block in order to print the results from fetch_links while keeping it as is, meaning keeping the yield statement?
import requests
from bs4 import BeautifulSoup
import concurrent.futures as cf

links = [
    "https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=2&pagesize=50",
    "https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=3&pagesize=50",
    "https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=4&pagesize=50"
]

base = 'https://stackoverflow.com{}'

def fetch_links(s, link):
    r = s.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    for item in soup.select(".summary .question-hyperlink"):
        # print(base.format(item.get("href")))
        yield base.format(item.get("href"))

if __name__ == '__main__':
    with requests.Session() as s:
        with cf.ThreadPoolExecutor(max_workers=5) as exe:
            future_to_url = {exe.submit(fetch_links, s, url): url for url in links}
            cf.as_completed(future_to_url)
Your fetch_links is a generator, so you have to loop over it as well to get the results:
import requests
from bs4 import BeautifulSoup
import concurrent.futures as cf

links = [
    "https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=2&pagesize=50",
    "https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=3&pagesize=50",
    "https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=4&pagesize=50"
]

base = 'https://stackoverflow.com{}'

def fetch_links(s, link):
    r = s.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    for item in soup.select(".summary .question-hyperlink"):
        yield base.format(item.get("href"))

if __name__ == '__main__':
    with requests.Session() as s:
        with cf.ThreadPoolExecutor(max_workers=5) as exe:
            future_to_url = {exe.submit(fetch_links, s, url): url for url in links}
            for future in cf.as_completed(future_to_url):
                for result in future.result():
                    print(result)
Output:
https://stackoverflow.com/questions/64298886/rvest-webscraping-in-r-with-form-inputs
https://stackoverflow.com/questions/64298879/is-this-site-not-suited-for-web-scraping-using-beautifulsoup
https://stackoverflow.com/questions/64297907/python-3-extract-html-data-from-sports-site
https://stackoverflow.com/questions/64297728/cant-get-the-fully-loaded-html-for-a-page-using-puppeteer
https://stackoverflow.com/questions/64296859/scrape-text-from-a-span-tag-containing-nested-span-tag-in-beautifulsoup
https://stackoverflow.com/questions/64296656/scrapy-nameerror-name-items-is-not-defined
https://stackoverflow.com/questions/64296201/missing-values-while-scraping-using-beautifulsoup-in-python
https://stackoverflow.com/questions/64296130/how-can-i-identify-the-element-containing-the-link-to-my-linkedin-profile-after
https://stackoverflow.com/questions/64295959/why-use-scrapy-or-beautifulsoup-vs-just-parsing-html-with-regex-v2
https://stackoverflow.com/questions/64295842/how-to-retreive-scrapping-data-from-web-to-json-like-format
https://stackoverflow.com/questions/64295559/how-to-iterate-through-a-supermarket-website-and-getting-the-product-name-and-pr
https://stackoverflow.com/questions/64295509/cant-stop-asyncio-request-for-some-delay
https://stackoverflow.com/questions/64295244/paginate-with-network-requests-scraper
and so on ...
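One caveat worth noting (an observation about how generators interact with executors, not part of the original answer): because fetch_links only yields, submitting it to the executor returns almost immediately with a generator object, and the actual HTTP request does not run until the generator is iterated in the main thread. If the goal is to do the network work in the worker threads, a small variant is to materialize the generator inside the submitted call:

if __name__ == '__main__':
    with requests.Session() as s:
        with cf.ThreadPoolExecutor(max_workers=5) as exe:
            # list(...) forces the generator to run inside the worker thread,
            # so the request and parsing happen concurrently across links.
            future_to_url = {exe.submit(lambda u: list(fetch_links(s, u)), url): url for url in links}
            for future in cf.as_completed(future_to_url):
                for result in future.result():
                    print(result)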

Trouble fetching results using yield while going for multiprocessing

I'm trying to create a script in Python, applying concurrency via concurrent.futures, to fetch the links of different users from a webpage. Although the links of the users are available on its landing page, I'm trying to dig them out from their inner pages. When I use yield within the get_links() function and print() within get_target_link(), I get the results as expected.
My question is: how can I achieve the same using yield within both of the functions?
I've tried:
import requests
import concurrent.futures
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def get_links(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".summary .question-hyperlink"):
        yield urljoin(base, item.get("href"))

def get_target_link(targeturl):
    res = requests.get(targeturl)
    soup = BeautifulSoup(res.text, "lxml")
    name_link = urljoin(base, soup.select_one(".user-details > a").get("href"))
    yield name_link

if __name__ == '__main__':
    base = 'https://stackoverflow.com'
    mlink = "https://stackoverflow.com/questions/tagged/web-scraping"
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(get_target_link, url): url for url in get_links(mlink)}
        concurrent.futures.as_completed(future_to_url)
The above script produces no result at all.
A few problems with your initial approach are causing the "no result at all":
BeautifulSoup(res.text, "lxml") - change the parser to html.parser (you are parsing HTML web pages).
There's no benefit in making get_target_link a generator, since it isn't meant to be an iterator and it already produces a single result at a time.
concurrent.futures.as_completed returns an iterator over the Future instances, not the final results.
The corrected approach would look like this:
import requests
import concurrent.futures as futures
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def get_links(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    for link in soup.select(".summary .question-hyperlink"):
        yield urljoin(base, link.get("href"))

def get_target_link(target_url):
    res = requests.get(target_url)
    soup = BeautifulSoup(res.text, "html.parser")
    name_link = urljoin(base, soup.select_one(".user-details a").get("href"))
    return name_link

if __name__ == '__main__':
    base = 'https://stackoverflow.com'
    mlink = "https://stackoverflow.com/questions/tagged/web-scraping"
    with futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(get_target_link, url): url for url in get_links(mlink)}
        for future in futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as ex:
                print(f'Failed to extract user details from url: {url}')
            else:
                print(data)
The output:
https://stackoverflow.com/users/10035985/andrej-kesely
https://stackoverflow.com/users/11520568/rachit-gupta
https://stackoverflow.com/users/10568531/robots-txt
https://stackoverflow.com/users/10664939/logan-anderson
https://stackoverflow.com/users/688393/c%c3%a9sar
https://stackoverflow.com/users/903061/gregor
https://stackoverflow.com/users/9950503/saraherceg
https://stackoverflow.com/users/80851/gmile
https://stackoverflow.com/users/11793150/saurabh-rawat
https://stackoverflow.com/users/11793061/xzatar
https://stackoverflow.com/users/11759292/rachel9866
https://stackoverflow.com/users/2628114/user2628114
https://stackoverflow.com/users/9810397/bart
https://stackoverflow.com/users/838355/ir2pid
https://stackoverflow.com/users/10629482/shreya
https://stackoverflow.com/users/11669928/thor-is
https://stackoverflow.com/users/7660288/acro2142
https://stackoverflow.com/users/3342430/freddiev4
https://stackoverflow.com/users/11767045/k-%c3%96sterlund
https://stackoverflow.com/users/11781213/mohamed-shire
https://stackoverflow.com/users/5412619/a-nonymous
https://stackoverflow.com/users/4354477/forcebru
https://stackoverflow.com/users/10568531/robots-txt
https://stackoverflow.com/users/6622587/eyllanesc
https://stackoverflow.com/users/10568531/robots-txt
https://stackoverflow.com/users/3273177/casabonita
https://stackoverflow.com/users/1540328/dipesh-parmar
https://stackoverflow.com/users/6231957/perth
https://stackoverflow.com/users/11400264/workin-4weekend
https://stackoverflow.com/users/1000551/vadim-kotov
https://stackoverflow.com/users/331508/brock-adams
https://stackoverflow.com/users/11300154/helloworld1990
https://stackoverflow.com/users/11786268/mohsine-jirou
https://stackoverflow.com/users/9707561/fatima-tt
https://stackoverflow.com/users/11759292/rachel9866
https://stackoverflow.com/users/6622587/eyllanesc
https://stackoverflow.com/users/11485683/titan
https://stackoverflow.com/users/11593630/supek
https://stackoverflow.com/users/11717116/raja-kishore-patnayakuni
https://stackoverflow.com/users/975887/madushan
https://stackoverflow.com/users/10568531/robots-txt
https://stackoverflow.com/users/283366/phil
https://stackoverflow.com/users/8677101/bpdesilva
https://stackoverflow.com/users/3504096/programmerper
https://stackoverflow.com/users/6303216/akhlaq-ahmed
https://stackoverflow.com/users/11457578/sh-student
https://stackoverflow.com/users/11783947/alexis-cruz-cruz
https://stackoverflow.com/users/3579212/adnanmuttaleb
https://stackoverflow.com/users/1060350/anony-mousse
https://stackoverflow.com/users/8100732/khadija-saeed

Using BeautifulSoup to find links related to specific keyword

I have to modify this code so the scraping keeps only the links that contain a specific keyword. In my case, I'm scraping a newspaper page to find news related to the term 'Brexit'.
I've tried modifying the method parse_links so it only keeps the links (or 'a' tags) that contain 'Brexit' in them, but it doesn't seem to work.
Where should I place the condition?
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

class MultiThreadScraper:

    def __init__(self, base_url):
        self.base_url = base_url
        self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=20)
        self.scraped_pages = set([])
        self.to_crawl = Queue(10)
        self.to_crawl.put(self.base_url)

    def parse_links(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)

    def scrape_info(self, html):
        return

    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        try:
            res = requests.get(url, timeout=(3, 30))
            return res
        except requests.RequestException:
            return

    def run_scraper(self):
        while True:
            try:
                target_url = self.to_crawl.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue

if __name__ == '__main__':
    s = MultiThreadScraper("https://elpais.com/")
    s.run_scraper()
You need to import the re module to match on the specific text value. Try the code below.
import re
links = soup.find_all('a', text=re.compile("Brexit"))
This should return links which contains only Brexit.
You can get the text of the element by using the getText() method and check whether the string actually contains "Brexit":
if "Brexit" in link.getText().split():
url = link["href"]
I added a check in this function. See if that does the trick for you:
def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', href=True)
    for link in links:
        if 'BREXIT' in link.text.upper():  # <------ new if statement
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)
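A small addition, not from any of the answers above: anchor text on news sites is sometimes just a headline fragment or an image, so it can help to also check the URL itself for the keyword. A sketch of the loop body inside parse_links under that assumption:

keyword = 'brexit'
for link in links:
    url = link['href']
    # Keep the link if the keyword appears in the anchor text or in the URL itself.
    if keyword in link.get_text().lower() or keyword in url.lower():
        if url.startswith('/') or url.startswith(self.root_url):
            url = urljoin(self.root_url, url)
            if url not in self.scraped_pages:
                self.to_crawl.put(url)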

Scraping multiple webpages with Python

from bs4 import BeautifulSoup
import urllib.request, time

class scrape(object):

    def __init__(self):
        self.urls = ['https://www.onthemarket.com/for-sale/property/wigan/', 'https://www.onthemarket.com/for-sale/property/wigan/?page=1', 'https://www.onthemarket.com/for-sale/property/wigan/?page=2', 'https://www.onthemarket.com/for-sale/property/wigan/?page=3', 'https://www.onthemarket.com/for-sale/property/wigan/?page=4', 'https://www.onthemarket.com/for-sale/property/wigan/?page=6']
        self.telephones = []

    def extract_info(self):
        for link in self.urls:
            data = urllib.request.urlopen(link).read()
            soup = BeautifulSoup(data, "lxml")
            for tel in soup.findAll("span", {"class": "call"}):
                self.telephones.append(tel.text.strip())
            time.sleep(1)
        return self.telephones

to = scrape()
print(to.extract_info())
What is wrong? This code hangs after the second website. It should extract phone numbers from each webpage in the list self.urls.
All you need to do is pass a headers parameter in your request and give it a go. Try this:
from bs4 import BeautifulSoup
import requests, time

class scrape(object):

    def __init__(self):
        self.urls = ['https://www.onthemarket.com/for-sale/property/wigan/', 'https://www.onthemarket.com/for-sale/property/wigan/?page=1', 'https://www.onthemarket.com/for-sale/property/wigan/?page=2', 'https://www.onthemarket.com/for-sale/property/wigan/?page=3', 'https://www.onthemarket.com/for-sale/property/wigan/?page=4', 'https://www.onthemarket.com/for-sale/property/wigan/?page=6']
        self.telephones = []

    def extract_info(self):
        for link in self.urls:
            data = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})  # it should do the trick
            soup = BeautifulSoup(data.text, "lxml")
            for tel in soup.find_all("span", {"class": "call"}):
                self.telephones.append(tel.text.strip())
            time.sleep(1)
        return self.telephones

crawl = scrape()
print(crawl.extract_info())
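If you would rather stick with urllib from the original snippet, the same fix presumably applies there: the site appears to stall requests that carry the default urllib User-Agent, so sending a browser-like header should help. A minimal sketch of just the fetch step:

import urllib.request

def fetch(url):
    # Send a browser-like User-Agent, mirroring the requests-based answer above.
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req, timeout=30) as resp:
        return resp.read()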
