Error when trying to get data with asynchronous functions (web scraping) - python

I want to improve my code by changing the synchronous functions to asynchronous ones for faster data extraction, but every time I run the program I hit the print("Error") branch.
async def soup(html):
    soup = BeautifulSoup(html, 'html.parser')
    return soup

async def title_bs4(html, tag, classes):
    soup = await soup(html)
    title = soup.findAll(tag, attrs={"class": classes})
    title = [i.text for i in title]
    return title

async def url_bs4(html, tag, classes):
    soup = await soup(html)
    url = soup.findAll(tag, attrs={"class": classes})
    url = [i.text for i in url]
    return url

async def price_xpath(html):
    soup = await soup(html)
    dom = etree.HTML(str(soup))
    price = dom.xpath('//li[@class="ui-search-layout__item shops__layout-item"]//div[@class="ui-search-result__content-columns shops__content-columns"]/div[@class="ui-search-result__content-column ui-search-result__content-column--left shops__content-columns-left"]/div[1]/div//div[@class="ui-search-price__second-line shops__price-second-line"]//span[@class="price-tag-amount"]/span[2]')
    price = [i.text.replace('.', '') for i in price]
    return price

async def page_number_bs4(html, tag, classes):
    soup = await soup(html)
    page_number = soup.find(tag, attrs={"class": classes}).text
    page_number = int(page_number)
    return page_number

async def number_of_pages_bs4(html, tag, classes):
    soup = await soup(html)
    number_of_pages = soup.find(tag, attrs={"class": classes}).text
    number_of_pages = int(number_of_pages.split(" ")[1])
    return number_of_pages

async def next_xpath(html):
    soup = await soup(html)
    dom = etree.HTML(str(soup))
    next = dom.xpath(
        '//div[@class="ui-search-pagination shops__pagination-content"]/ul/li[contains(@class,"--next")]/a')[0].get('href')
    return next

async def main(product):
    web = "Mercado libre"
    list_titles = []
    list_urls = []
    list_prices = []
    next = 'https://listado.mercadolibre.com.co/' + str(product)
    async with aiohttp.ClientSession() as session:
        async with session.get(next) as response:
            while True:
                try:
                    title = await title_bs4(response, 'h2', 'ui-search-item__title shops__item-title')
                    list_titles.extend(title)
                    url = await url_bs4(response, 'a', 'ui-search-item__group__element shops__items-group-details ui-search-link')
                    list_titles.extend(url)
                    price = await price_xpath(response)
                    list_titles.extend(price)
                    page_number = await page_number_bs4(response, 'span', 'andes-pagination__link')
                    number_of_pages = await number_of_pages_bs4(response, 'li', 'andes-pagination__page-count')
                except:
                    print("Error")
                    break
                if page_number == number_of_pages:
                    break
                next = await next_xpath(response)
    df = pd.DataFrame({"shop": web, "titles": list_titles,
                       "links": list_urls, "prices": list_prices})
    df.prices = df.prices.map(
        lambda x: float(re.search(r"\d+", x).group(0)))
    df.to_json("templates/product.json", orient='records')
    return df

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
    asyncio.run(main('samsung'))
except KeyboardInterrupt:
    pass
My synchronous functions work well, but they are very slow when extracting data across the paginated results. I would like to know what makes this version fail, or whether there is a better alternative to the approach above.
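Several problems are visible in the code, so here is a hedged sketch of the likely fixes rather than a confirmed solution: session.get() yields a ClientResponse object whose text must be awaited before parsing; soup = await soup(html) raises UnboundLocalError because the local name shadows the coroutine; two of the three extend calls go to list_titles instead of list_urls and list_prices; and the loop computes next but never requests it. The selector strings are the asker's and untested:

async def make_soup(html):
    # renamed: inside the helpers, 'soup = await soup(html)' raises
    # UnboundLocalError because the local name shadows the coroutine
    return BeautifulSoup(html, 'html.parser')

async def title_bs4(html, tag, classes):
    soup = await make_soup(html)  # the other helpers need the same one-line change
    return [i.text for i in soup.find_all(tag, attrs={"class": classes})]

async def main(product):
    web = "Mercado libre"
    list_titles, list_urls, list_prices = [], [], []
    next_url = 'https://listado.mercadolibre.com.co/' + str(product)
    async with aiohttp.ClientSession() as session:
        while True:
            async with session.get(next_url) as response:
                html = await response.text()  # parse the HTML text, not the ClientResponse
            try:
                list_titles.extend(await title_bs4(html, 'h2', 'ui-search-item__title shops__item-title'))
                list_urls.extend(await url_bs4(html, 'a', 'ui-search-item__group__element shops__items-group-details ui-search-link'))
                list_prices.extend(await price_xpath(html))
                page_number = await page_number_bs4(html, 'span', 'andes-pagination__link')
                number_of_pages = await number_of_pages_bs4(html, 'li', 'andes-pagination__page-count')
            except Exception as e:
                print(f"Error: {e!r}")  # surface the real exception instead of a bare "Error"
                break
            if page_number == number_of_pages:
                break
            next_url = await next_xpath(html)  # actually fetch the next page on the next pass
    df = pd.DataFrame({"shop": web, "titles": list_titles,
                       "links": list_urls, "prices": list_prices})
    return df

asyncio.run(main('samsung'))  # asyncio.run creates and closes its own loop

Note also that async alone will not speed up this crawl: each next-page URL comes from the previous page, so the requests are inherently sequential. The real gain comes when independent pages (for example, several product searches) are fetched concurrently with asyncio.gather.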


Is there any method to scrape the data by XPath selectors in Playwright

I scraped the data using Playwright with CSS selectors, but when I tried to scrape it with XPath via the page.evaluate method, all the data became 'Not Available'. Is there any method to scrape data with XPath selectors in Playwright?
Using Playwright with CSS selectors:
import asyncio
import pandas as pd
from playwright.async_api import async_playwright

# Extract the Product links
async def get_product_links(page):
    all_items = await page.query_selector_all('.a-link-normal.DealCardDynamic-module__linkOutlineOffset_2XU8RDGmNg2HG1E-ESseNq')
    product_links = []
    for item in all_items:
        link = await item.get_attribute('href')
        product_links.append(link)
    return product_links

# Extract the Product name
async def get_product_name(page):
    try:
        product_name = await (await page.query_selector("#productTitle")).text_content()
    except:
        product_name = "Not Available"
    return product_name

async def main():
    async with async_playwright() as pw:
        browser = await pw.chromium.launch()
        page = await browser.new_page()
        await page.goto('https://www.amazon.in/gp/goldbox?deals-widget=%257B%2522version%2522%253A1%252C%2522viewIndex%2522%253A0%252C%2522presetId%2522%253A%252215C82F45284EDD496F94A2C368D1B4BD%2522%252C%2522sorting%2522%253A%2522BY_SCORE%2522%257D')
        product_links = await get_product_links(page)
        data = []
        # Get the product data
        for link in product_links:
            await perform_request_with_retry(page, link)
            product_name = await get_product_name(page)
            data.append((link, product_name))
        # Save the extracted data to a dataframe
        df = pd.DataFrame(data, columns=['Product Link', 'Product Name'])
        # Save the extracted data to a csv file
        df.to_csv('product_details.csv', index=False)
        print('CSV file has been written successfully.')
        await browser.close()

# Execute the scraping and saving of Amazon today's deals - musical instruments
if __name__ == '__main__':
    asyncio.run(main())
Using XPath via the page.evaluate method:
async def get_product_name(page):
    try:
        product_name = await page.evaluate("//span[@id='productTitle']/text()")
    except:
        product_name = "Not Available"
    return product_name
Sample Link :
https://www.amazon.in/gp/goldbox?deals-widget=%257B%2522version%2522%253A1%252C%2522viewIndex%2522%253A0%252C%2522presetId%2522%253A%252215C82F45284EDD496F94A2C368D1B4BD%2522%252C%2522sorting%2522%253A%2522BY_SCORE%2522%257D
Is there any method to scrape the data by XPath selectors in Playwright?
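One likely answer, offered as a sketch rather than a confirmed fix: page.evaluate executes JavaScript, and a string beginning with // is parsed by JavaScript as a comment, so nothing is ever selected and the except fallback fires. Playwright's own selector engine does support XPath natively; any selector beginning with // (or carrying an explicit xpath= prefix) is routed to it:

async def get_product_name(page):
    try:
        # selectors starting with // (or prefixed with xpath=) use
        # Playwright's built-in XPath engine; no JavaScript involved
        product_name = await page.locator("xpath=//span[@id='productTitle']").text_content()
    except Exception:
        product_name = "Not Available"
    return product_name

The /text() step is also unnecessary here: select the element and read text_content() instead.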

discord.py channel.send function takip error

When I run:
from bs4 import BeautifulSoup
import requests
import discord
from discord.ext import tasks

client = discord.Client()

@tasks.loop(minutes=1)
async def test():
    channel = client.get_channel(973939538357522474)
    await channel.send(takip)

@client.event
async def on_ready():
    test.start()

async def takip():
    url = ""
    R = requests.get(url)
    Soup = BeautifulSoup(R.text, "html5lib")
    Title = Soup.find("h1", {"class": "pr-new-br"}).getText()
    List = Soup.find("div", {"class": "pr-bx-nm with-org-prc"})
    fiyat = List.find("span", {"class": "prc-dsc"}).getText()
    degisenfiyat = float(fiyat.replace(",", ".").replace(" TL", ""))
    if (degisenfiyat <= 200):
        print("Fiyat düştü.")

client.run("")
I get:
A "<function takip at 0x00000244A7A440D0>" message in the discord channel
I want to use channel.send with the takip function. How do I do this?
takip is the function object; takip() is what your function returns.
For example, if you have this code:
def my_sum(a, b):
    return a + b

print(my_sum)
# This one prints the function itself
# expected result: <function my_sum at 0x7f4dc82b7d30>

print(my_sum(1, 2))
# This one prints what the function returns (so 3 here)
# expected result: 3
In your code, you're sending the function itself to your Discord channel. If you want to send "Fiyat düştü." when degisenfiyat <= 200, you have to edit your code like this:
from bs4 import BeautifulSoup
import requests
import discord
from discord.ext import tasks

client = discord.Client()

@tasks.loop(minutes=1)
async def test():
    channel = client.get_channel(973939538357522474)
    await channel.send(takip())  # Change here

@client.event
async def on_ready():
    test.start()

def takip():
    url = ""
    R = requests.get(url)
    Soup = BeautifulSoup(R.text, "html5lib")
    Title = Soup.find("h1", {"class": "pr-new-br"}).getText()
    List = Soup.find("div", {"class": "pr-bx-nm with-org-prc"})
    fiyat = List.find("span", {"class": "prc-dsc"}).getText()
    degisenfiyat = float(fiyat.replace(",", ".").replace(" TL", ""))
    if (degisenfiyat <= 200):
        return "Fiyat düştü."  # Change here
    else:
        return "degisenfiyat > 200"

client.run("")
However, without the else branch, takip() returns nothing when degisenfiyat > 200, so you would be sending None to your Discord channel. That is why I recommend adding an else statement that returns an error or more information (i.e. "error: degisenfiyat > 200").
You're sending the function object. You need to call the function with parentheses ().
This line:
await channel.send(takip)
Should be
await channel.send(takip())
takip() also needs to return a value to the caller, not print() it to the terminal. Use return instead of print:
if (degisenfiyat <= 200):
    return "Fiyat düştü."
Consider this example of the function object versus the returned value:
>>> def f():
...     return "Hello"
...
>>> f()
'Hello'
>>> f
<function f at 0x103149fc0>
And finally, you need to remove the async from your function definition, as this call does not need to be asynchronous with the bot. This leaves your code at:
# ...
@tasks.loop(minutes=1)
async def test():
    channel = client.get_channel(973939538357522474)
    # Define x as the function's result before awaiting
    x = takip()
    await channel.send(x)

# ...
def takip():
    url = ""
    R = requests.get(url)
    Soup = BeautifulSoup(R.text, "html5lib")
    Title = Soup.find("h1", {"class": "pr-new-br"}).getText()
    List = Soup.find("div", {"class": "pr-bx-nm with-org-prc"})
    fiyat = List.find("span", {"class": "prc-dsc"}).getText()
    degisenfiyat = float(fiyat.replace(",", ".").replace(" TL", ""))
    if (degisenfiyat <= 200):
        return "Fiyat düştü."
    else:
        return f"Error, degisenfiyat > 200. degisenfiyat = {degisenfiyat}"

client.run("")
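A side note beyond the original answers: if you would rather keep takip asynchronous (for instance, to replace the blocking requests call with aiohttp, since blocking calls stall the bot's event loop), the task would await the coroutine instead. A sketch under that assumption, continuing the code above:

import aiohttp
from bs4 import BeautifulSoup

@tasks.loop(minutes=1)
async def test():
    channel = client.get_channel(973939538357522474)
    await channel.send(await takip())  # await the coroutine, then send its result

async def takip():
    url = ""  # left blank in the original post
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            html = await resp.text()
    soup = BeautifulSoup(html, "html5lib")
    fiyat = soup.find("span", {"class": "prc-dsc"}).getText()
    degisenfiyat = float(fiyat.replace(",", ".").replace(" TL", ""))
    return "Fiyat düştü." if degisenfiyat <= 200 else f"degisenfiyat = {degisenfiyat}"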

Why is my asyncio func returning a list of empty lists?

I have a list of URLs that I want to scrape:
urls = ['https://www.netflix.com/ar/title/81488310',
        'https://www.netflix.com/ar/title/81486358',
        'https://www.netflix.com/ar/title/81486558',
        'https://www.netflix.com/ar/title/81488216',
        'https://www.netflix.com/ar/title/81488090',
        'https://www.netflix.com/ar/title/81489616',
        'https://www.netflix.com/ar/title/81487371',
        'https://www.netflix.com/ar/title/81487432',
        'https://www.netflix.com/ar/title/81485995',
        'https://www.netflix.com/ar/title/81489610',
        'https://www.netflix.com/ar/title/81488496']
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(self.main(urls)) # here i call my main function
The methods used to scrape all the URLs are the following:
def parse(self, html):
    ''' Normal method to parse response.text into soup '''
    soup = BeautifulSoup(html, 'html.parser')
    status = soup.find('div', class_='error-page not-found')  # content that is not available (404)
    if not status:
        scripts = soup.find_all('script')
        return scripts
    else:
        return None

async def get_response(self, session, url):
    ''' Asynchronous requests with ClientSession '''
    try:
        async with session.get(url=url) as response:
            print(f'Successfully got url {url}.')
            return await response.text()
    except Exception as e:
        print(f"Unable to get url {url} due to {e.__class__}.")

async def response_soup(self, session, url):
    ''' Calls methods to get the response and parse it into soup
        in a separate thread '''
    html = await self.get_response(session, url)
    loop = asyncio.get_event_loop()
    soup = await loop.run_in_executor(None, self.parse, html)  # run parse(html) in a separate thread
    return soup

async def main(self, urls):
    ''' Main, where we set up the semaphore and run the threads
        for all urls with ClientSession '''
    async with asyncio.BoundedSemaphore(3), aiohttp.ClientSession() as session:
        results = await asyncio.gather(*[self.response_soup(session, url) for url in urls])
        # final = [result for result in results if result != None]
        print(results)
        print("Finalized all. Return is a list of len {} outputs.".format(len(results)))
But my output is a list of empty lists, like this:
[[], [], [], [], [], [], [], [], [], [], []]
(Console output omitted; there are around 17,000 URLs in the real script.)
What am I doing wrong?
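Two things stand out, offered here as a hedged diagnosis rather than a confirmed answer. First, async with asyncio.BoundedSemaphore(3), ... acquires the semaphore once around the whole gather, so it never limits concurrency: all requests fire at once, and a site may well answer such a burst with blocked or empty pages whose soup contains no script tags, hence the empty ResultSets. Second, when get_response fails it returns None, which then flows into BeautifulSoup. A sketch that acquires the semaphore per request, keeping the question's names:

async def response_soup(self, session, semaphore, url):
    ''' Acquire the semaphore per request: at most 3 downloads in flight '''
    async with semaphore:
        html = await self.get_response(session, url)
    if html is None:
        return None  # get_response already printed why it failed
    loop = asyncio.get_event_loop()
    return await loop.run_in_executor(None, self.parse, html)

async def main(self, urls):
    semaphore = asyncio.BoundedSemaphore(3)
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(
            *[self.response_soup(session, semaphore, url) for url in urls])
    print("Finalized all. Return is a list of len {} outputs.".format(len(results)))

Printing response.status and a slice of the returned text for a single URL would also show whether the site is actually serving page markup to a script at all.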

Python Discord Bot Embed

async def on_message(message):
    if message.content.startswith(prefix):
        msg = message.content[20:]
    else:
        return None
    if msg == "bitcoin" or "BITCOIN" or "btc" or "BTC":
        url = "https://coinmarketcap.com/currencies/bitcoin/"
        hdr = {'User-Agent': 'Mozilla/5.0'}
        req = Request(url, headers=hdr)
        res = urlopen(req).read()
        soup = BeautifulSoup(res, 'html.parser')
        btc_t1 = soup.find_all("div", class_="priceValue___11gHJ")
        btc_t1 = [each_line.get_text().strip() for each_line in btc_t1[:20]]
        btc_t1 = " ".join(btc_t1)
        time.sleep(0.1)
        btc_t2 = soup.find_all("span", class_="sc-1v2ivon-0 fiaaIx")
        btc_t2 = [each_line.get_text().strip() for each_line in btc_t2[:20]]
        btc_t2 = " ".join(btc_t2)
        time.sleep(0.1)
        btc_t3 = soup.find_all("div", class_="statsValue___2iaoZ")
        btc_t3 = [each_line.get_text().strip() for each_line in btc_t3[:1]]
        btc_t3 = " ".join(btc_t3)
        time.sleep(0.1)
        btcem = discord.Embed(title="Bitcoin", description="BTC market price\nPowered by Coinmarketcap", color=0xF7931A)
        btcem.set_thumbnail(url="https://s2.coinmarketcap.com/static/img/coins/64x64/1.png")
        btcem.add_field(name="Market Price", value="Price: " + str(btc_t1) + " | " + str(btc_t2), inline=False)
        btcem.add_field(name="Market Cap", value="Price: " + str(btc_t3), inline=False)
        # embed.set_footer(text="Market", icon_url="")
        await message.channel.send(embed=btcem)
    if msg == "ethereum" or "ETHEREUM" or "eth" or "ETH":
        url = "https://coinmarketcap.com/currencies/ethereum/"
        hdr = {'User-Agent': 'Mozilla/5.0'}
        req = Request(url, headers=hdr)
        res = urlopen(req).read()
        soup = BeautifulSoup(res, 'html.parser')
        eth_t1 = soup.find_all("div", class_="priceValue___11gHJ")
        eth_t1 = [each_line.get_text().strip() for each_line in eth_t1[:20]]
        eth_t1 = " ".join(eth_t1)
        time.sleep(0.1)
        eth_t2 = soup.find_all("span", class_="sc-1v2ivon-0 fiaaIx")
        eth_t2 = [each_line.get_text().strip() for each_line in eth_t2[:20]]
        eth_t2 = " ".join(eth_t2)
        time.sleep(0.1)
        eth_t3 = soup.find_all("div", class_="statsValue___2iaoZ")
        eth_t3 = [each_line.get_text().strip() for each_line in eth_t3[:1]]
        eth_t3 = " ".join(eth_t3)
        time.sleep(0.1)
        ethem = discord.Embed(title="Ethereum", description="ETH market price\nPowered by Coinmarketcap", color=0x131313)
        ethem.set_thumbnail(url="https://s2.coinmarketcap.com/static/img/coins/64x64/1027.png")
        ethem.add_field(name="Market Price", value="Price: " + str(eth_t1) + " | " + str(eth_t2), inline=False)
        ethem.add_field(name="Market Cap", value="Price: " + str(eth_t3), inline=False)
        # embed.set_footer(text="Market", icon_url="")
        await message.channel.send(embed=ethem)
I'm trying to make Discord Coin, a stock bot in Python. All the modules used in the code are installed, and I want to send the scraped data as an embed message, but when I send %bitcoin (prefix = %), the Ethereum embed comes out along with the Bitcoin embed.
Your if is completely messed up.
msg == "bitcoin" or "BITCOIN" or "btc" or "BTC" is always true, because or chains the bare string literals instead of comparing each of them to msg (a short demonstration follows below).
Your check should be:
if msg in ('bitcoin', 'BITCOIN', 'btc', 'BTC'):
And even this wouldn't work in your case, since you are doing msg = message.content[20:]; for a one-character prefix it should be msg = message.content[1:].
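Not from the original answer, but a REPL check makes the precedence issue concrete: the expression groups as (msg == "bitcoin") or "BITCOIN" or ..., and or returns the first truthy operand, so any non-empty string literal short-circuits the whole test to true:

>>> msg = "ethereum"
>>> msg == "bitcoin" or "BITCOIN" or "btc" or "BTC"
'BITCOIN'
>>> bool("BITCOIN")   # any non-empty string is truthy
True
>>> msg in ('bitcoin', 'BITCOIN', 'btc', 'BTC')
False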
Now, I directly debugged your code, but this isn't the way to ask a question on SO. You should be able to debug your own code; questions on SO should be about your algorithm, technique, or documentation.
See: debugging
Wouldn't it be easier to make commands out of this? Like this (I haven't tested it, but it should work and it would be prettier):
bot = commands.Bot(command_prefix="%")

@bot.command(aliases=["BITCOIN", "btc", "BTC"])
async def bitcoin(ctx):
    # your bitcoin code here

@bot.command(aliases=["ETHEREUM", "eth", "ETH"])
async def ethereum(ctx):
    # your ethereum code here
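Building on that answer, here is a hedged sketch of what the filled-in commands could look like, with the duplicated scraping logic moved into one helper. It is untested and makes assumptions: the CSS class names (priceValue___11gHJ, statsValue___2iaoZ) are carried over from the asker's code and change frequently on coinmarketcap.com, and on discord.py 2.x commands.Bot would additionally need an intents argument:

from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
import discord
from discord.ext import commands

def coin_embed(slug, title, thumb, color):
    # Fetch the coin page and build an embed; the selectors below are
    # assumptions copied from the question and may be outdated.
    url = f"https://coinmarketcap.com/currencies/{slug}/"
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(urlopen(req).read(), 'html.parser')
    price = " ".join(el.get_text().strip() for el in soup.find_all("div", class_="priceValue___11gHJ")[:20])
    cap = " ".join(el.get_text().strip() for el in soup.find_all("div", class_="statsValue___2iaoZ")[:1])
    em = discord.Embed(title=title, description=f"{title} market price\nPowered by Coinmarketcap", color=color)
    em.set_thumbnail(url=thumb)
    em.add_field(name="Market Price", value=f"Price: {price}", inline=False)
    em.add_field(name="Market Cap", value=f"Price: {cap}", inline=False)
    return em

bot = commands.Bot(command_prefix="%")

@bot.command(aliases=["BITCOIN", "btc", "BTC"])
async def bitcoin(ctx):
    await ctx.send(embed=coin_embed("bitcoin", "Bitcoin",
                                    "https://s2.coinmarketcap.com/static/img/coins/64x64/1.png", 0xF7931A))

@bot.command(aliases=["ETHEREUM", "eth", "ETH"])
async def ethereum(ctx):
    await ctx.send(embed=coin_embed("ethereum", "Ethereum",
                                    "https://s2.coinmarketcap.com/static/img/coins/64x64/1027.png", 0x131313))

bot.run("")  # token elided, as in the question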

access list in a 2x-nested function

def get_html(url):
    response = urllib.request.urlopen(url)
    return response.read()

def parse_main(html):
    webpage = BeautifulSoup(html, features="html.parser")
    table = webpage.find('table', id='itemList')
    for a_tag in table.find_all('a', class_='all'):
        parse_movie(get_html('https://www.somerandommovieswebsite.com' + a_tag['href']))

def parse_movie(html):
    web_page = BeautifulSoup(html, features="html.parser")
    info = web_page.find('h1', class_="moviename")
    movies.append(info.text)

def main():
    movies = []
    parse_main(get_html('https://www.somerandommovieswebsite.com'))
    print(movies)

if __name__ == '__main__':
    main()
How do I access the movies list (defined in the main() function) from parse_movie, which is called inside parse_main? I can't append anything to the list because of an "unresolved reference 'movies'" error. Using nonlocal didn't help.
I think you should neither use a global variable here nor pass it as an argument:
def get_html(url):
    response = urllib.request.urlopen(url)
    return response.read()

def parse_main(html):
    movies = []
    webpage = BeautifulSoup(html, features="html.parser")
    table = webpage.find('table', id='itemList')
    for a_tag in table.find_all('a', class_='all'):
        movies.append(
            parse_movie(get_html('https://www.somerandommovieswebsite.com' + a_tag['href']))
        )
    return movies

def parse_movie(html):
    web_page = BeautifulSoup(html, features="html.parser")
    info = web_page.find('h1', class_="moviename")
    return info.text

def main():
    movies = parse_main(get_html('https://www.somerandommovieswebsite.com'))
    print(movies)

if __name__ == '__main__':
    main()
There are several ways to do it.
First, you can define movies globally (a sketch of that version appears after the third approach below).
Second, you can simply pass the list as a parameter, like this. Since lists are passed by reference, we append to the list defined in the main function and don't need to return anything to it:
def parse_main(html, movies):
    webpage = BeautifulSoup(html, features="html.parser")
    table = webpage.find('table', id='itemList')
    for a_tag in table.find_all('a', class_='all'):
        parse_movie(get_html('https://www.somerandommovieswebsite.com' + a_tag['href']), movies)

def parse_movie(html, movies):
    web_page = BeautifulSoup(html, features="html.parser")
    info = web_page.find('h1', class_="moviename")
    movies.append(info.text)

def main():
    movies = []
    parse_main(get_html('https://www.somerandommovieswebsite.com'), movies)
    print(movies)
The third approach is to build the list inside the function and return it:
def parse_main(html):
    webpage = BeautifulSoup(html, features="html.parser")
    table = webpage.find('table', id='itemList')
    movies = []
    for a_tag in table.find_all('a', class_='all'):
        movies.append(parse_movie(get_html('https://www.somerandommovieswebsite.com' + a_tag['href'])))
    return movies

def parse_movie(html):
    web_page = BeautifulSoup(html, features="html.parser")
    info = web_page.find('h1', class_="moviename")
    return info.text

def main():
    movies = parse_main(get_html('https://www.somerandommovieswebsite.com'))
    print(movies)
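For completeness, a minimal sketch of the first (global) approach mentioned above, reusing get_html and parse_main from the question; globals work, but as noted they are usually best avoided:

movies = []  # module-level, visible to every function in this file

def parse_movie(html):
    web_page = BeautifulSoup(html, features="html.parser")
    info = web_page.find('h1', class_="moviename")
    movies.append(info.text)  # appending mutates the global list; no 'global' statement needed

def main():
    parse_main(get_html('https://www.somerandommovieswebsite.com'))
    print(movies)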
The easiest approach would be a global variable, but you should avoid globals whenever possible. You can change your code to something like this and avoid both global variables and passing the variable as a parameter.
def get_html(url):
    response = urllib.request.urlopen(url)
    return response.read()

def parse_main(html):
    parse_movies = []
    webpage = BeautifulSoup(html, features="html.parser")
    table = webpage.find('table', id='itemList')
    for a_tag in table.find_all('a', class_='all'):
        parse_movies.append(parse_movie(get_html('https://www.somerandommovieswebsite.com' + a_tag['href'])))
    return parse_movies  # returning 'movies' here would raise a NameError

def parse_movie(html):
    web_page = BeautifulSoup(html, features="html.parser")
    info = web_page.find('h1', class_="moviename")
    return info.text

def main():
    movies = parse_main(get_html('https://www.somerandommovieswebsite.com'))
    print(movies)

if __name__ == '__main__':
    main()
Pass the movies list as an argument and avoid using global variables; in most cases that's better.
The issue is that movies is a local variable of main, so it's a name parse_movie can't see. In the version below, the movies list lives in parse_main, and parse_movie returns the value to append instead of touching the list directly:
def get_html(url):
    response = urllib.request.urlopen(url)
    return response.read()

def parse_main(html):
    movies = []
    webpage = BeautifulSoup(html, features="html.parser")
    table = webpage.find('table', id='itemList')
    for a_tag in table.find_all('a', class_='all'):
        movies.append(parse_movie(get_html('https://www.somerandommovieswebsite.com' + a_tag['href'])))
    return movies

def parse_movie(html):
    web_page = BeautifulSoup(html, features="html.parser")
    info = web_page.find('h1', class_="moviename")
    return info.text

def main():
    movies = parse_main(get_html('https://www.somerandommovieswebsite.com'))
    print(movies)

if __name__ == '__main__':
    main()
movies is a local variable inside your main function, so it's normal that your other functions can't find it; either make it global (not always a good idea) or pass it as an argument. A minimal demonstration follows below.
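A small standalone demonstration of that scoping rule, independent of the scraping code (all names here are illustrative):

def broken():
    movies.append("x")       # NameError: 'movies' only exists inside main()

def fixed(movies):
    movies.append("x")       # fine: the caller's own list object is passed in

def main():
    movies = []              # local to main; invisible to other functions
    fixed(movies)
    print(movies)            # ['x']

main()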
