Grabbing all data fields from a div in Python with BeautifulSoup

The snippet below worked fine until the other day. Is there an easy way to extract all the data inside this div class="row mb-4"? The idea is that even if further changes are made to the page, the script will still not be affected.
import re
import requests
from bs4 import BeautifulSoup

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}

url = "https://bscscan.com/token/"
token = "0x4ce1a5cb12151423ea479cfd0c52ec5021d108d8"
tokenurl = url + token

contractpage = requests.get(tokenurl, headers=header)
ca = BeautifulSoup(contractpage.content, 'html.parser')

tokenholders = ca.find(id='ContentPlaceHolder1_tr_tokenHolders').get_text()
tholders = tokenholders.strip().strip("Holders:").strip().strip(" a ").strip()
tokenaname = ca.find('span', class_='text-secondary small').get_text().strip()

def get_transfer_count(token: str) -> str:
    with requests.Session() as s:
        s.headers = {'User-Agent': 'Mozilla/5.0'}
        r = s.get(f'https://bscscan.com/token/{token}')
        try:
            sid = re.search(r"var sid = '(.*?)'", r.text).group(1)
            r = s.get(f'https://bscscan.com/token/generic-tokentxns2?m=normal&contractAddress={token}&a=&sid={sid}&p=1')
            return re.search(r"var totaltxns = '(.*?)'", r.text).group(1)
        except AttributeError:
            pass

transcount = get_transfer_count(token)

print("Token: ", tokenaname)
print("Holders: ", tholders)
print("Transfers: ", transcount)
Previous Output:
Token: Binemon
Holders: 27,099
Transfers: 439,636
Wanted Improved Output:
Token: Binemon
PRICE: $0.01 # 0.000037 BNB (-22.41%)
Fully Diluted Market Cap: $14,011,783.50
Total Supply: 975,000,000 BIN
Holders: 27,099 addresses
Transfers: 439,636
Contract: 0xe56842ed550ff2794f010738554db45e60730371
Decimals: 18
Official Site: https://binemon.io/
Social Profiles:
https://twitter.com/binemonnft
https://t.me/binemonchat
https://docs.binemon.io/
https://coinmarketcap.com/currencies/binemon/
https://www.coingecko.com/en/coins/binemon/

Try:
import requests
from bs4 import BeautifulSoup
header = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0",
}
tokenurl = "https://bscscan.com/token/0x7083609fce4d1d8dc0c979aab8c869ea2c873402"
contractpage = requests.get(tokenurl, headers=header)
ca = BeautifulSoup(contractpage.content, "html.parser")
name = ca.h1.span.get_text(strip=True)
price = ca.select_one(".card-body .d-block").get_text(strip=True)
cap = ca.select_one("#pricebutton").get_text(strip=True)
print("Token:", name)
print("PRICE:", price)
print("Fully Diluted Market Cap:", cap)
print()
for c in ca.select(".row .col-md-8"):
    pt = c.find_previous(class_="col-md-4").get_text(strip=True)
    t = c.get_text(strip=True, separator=" ").split("(")[0]
    if pt == "Social Profiles:":
        links = [a["href"].strip() for a in c.select("a")]
        print(pt, *links, sep="\n\t")
    else:
        print(pt, t)
Prints:
Token: Binance-Peg Polkadot Token
PRICE: $30.35# 0.079643 BNB(-10.39%)
Fully Diluted Market Cap: $485,657,455.49
Total Supply: 15,999,999.991309 DOT
Holders: 80,065 addresses
Transfers: -
Contract: 0x7083609fce4d1d8dc0c979aab8c869ea2c873402
Decimals: 18
Official Site: https://polkadot.network/
Social Profiles:
https://polkadot.network/blog
https://reddit.com/r/dot
https://twitter.com/polkadotnetwork
https://github.com/w3f
https://polkadot.network/PolkaDotPaper.pdf
https://coinmarketcap.com/currencies/polkadot-new/
https://www.coingecko.com/en/coins/polkadot/
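As a follow-up thought (not part of the original answer): if you would rather work with the scraped fields as a data structure than with printed lines, a minimal sketch reusing the same .col-md-4 / .col-md-8 selectors could collect them into a dict:

fields = {}
for c in ca.select(".row .col-md-8"):
    label = c.find_previous(class_="col-md-4").get_text(strip=True)
    if label == "Social Profiles:":
        # keep the profile links as a list
        fields[label] = [a["href"].strip() for a in c.select("a")]
    else:
        fields[label] = c.get_text(strip=True, separator=" ").split("(")[0]

print(fields.get("Holders:"))   # e.g. the holders count
print(fields.get("Contract:"))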

Related

Grabbing top table data returns a blank display

I am really puzzled and in panic mode, as quite a few snippets I have been using are problematic. This one used to produce the desired output, but all of a sudden a blank output is displayed.
import requests, random, re
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0",
}

url = "https://bscscan.com/token/generic-tokenholders2?m=normal&a="
token = "0xe56842ed550ff2794f010738554db45e60730371"
holdersurl = str(url) + str(token)

s = requests.Session()
r = requests.get(holdersurl, headers=headers)
soupblockdetails = BeautifulSoup(r.content, 'html.parser')
rowsblockdetails = soupblockdetails.findAll('table')[0].findAll('tr', limit=6)

for row in rowsblockdetails[1:]:
    rank = row.find_all('td')[0].text.strip()
    address = row.find_all('td')[1].text.strip()
    amount = row.find_all('td')[2].text.strip()
    percentage = row.find_all('td')[3].text
    value = row.find_all('td')[4].text
    try:
        calock = row.select_one("i.far.fa-file-alt.text-secondary")['title']
        calock = "Contract"
    except:
        calock = "Unlocked"
    print("{} {}-% {:>8} {:>23} {} {} ".format(rank, address, amount, percentage, value, calock))
Previous Output: #-- Working output previously
1 0xbbda05ea467ad348212dade5c38c11910c14e83e 202,543,296.764175197290329227 20.77% $3,054,207.08 Unlocked
2 UniCrypt: Token Vesting 150,000,000 15.38% $2,261,892.00 Contract
3 PancakeSwap V2: BIN 17 99,217,566.546415684406759104 10.17% $1,496,129.47 Contract
4 Binemon: BIN Token 27,010,296.006072937294814033 2.77% $407,295.82 Contract
5 0x81da471feb4a45438053dc05e709be056ec26c39 14,865,062 1.52% $224,154.43 Unlocked
Now, about this code: if you run it multiple times, the website will block you. Once we add cookies, it runs fine:
import requests, random, re
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0",
}
cookies = {'session': '17ab96bd8ffbe8ca58a78657a918558'}

url = "https://bscscan.com/token/generic-tokenholders2?m=normal&a="
token = "0xe56842ed550ff2794f010738554db45e60730371"
holdersurl = str(url) + str(token)

s = requests.Session()
r = requests.get(holdersurl, headers=headers, cookies=cookies)
soupblockdetails = BeautifulSoup(r.content, 'html.parser')
rowsblockdetails = soupblockdetails.findAll('table')[0].findAll('tr', limit=6)

for row in rowsblockdetails[1:]:
    rank = row.find_all('td')[0].text.strip()
    address = row.find_all('td')[1].text.strip()
    amount = row.find_all('td')[2].text.strip()
    percentage = row.find_all('td')[3].text
    value = row.find_all('td')[4].text
    try:
        calock = row.select_one("i.far.fa-file-alt.text-secondary")['title']
        calock = "Contract"
    except:
        calock = "Unlocked"
    print("{} {}-% {:>8} {:>23} {} {} ".format(rank, address, amount, percentage, value, calock))
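A hedged variant of the same idea: rather than hard-coding a session cookie, you can let a requests.Session pick up whatever cookies the site sets on a first page load and reuse them for the holders request. This assumes bscscan issues the needed cookie on the token page; heavy traffic may still get blocked.

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0",
}
token = "0xe56842ed550ff2794f010738554db45e60730371"

with requests.Session() as s:
    s.headers.update(headers)
    # the first request stores whatever cookies the site sets on the session
    s.get("https://bscscan.com/token/" + token)
    r = s.get("https://bscscan.com/token/generic-tokenholders2?m=normal&a=" + token)
    soup = BeautifulSoup(r.content, "html.parser")
    # print the first holder row as a quick sanity check
    print(soup.find("table").find_all("tr", limit=2)[-1].get_text(" ", strip=True))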

Scraping multiple pages on this site (help needed)

Hi, I would love to be able to scrape multiple pages of this website.
Can someone help me scrape through all the pages? I am only able to get information from one page.
import re
from requests import get
from bs4 import BeautifulSoup as bs

headers = {'User-Agent':
           'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}

for i in range(2000):
    # note: this URL contains no {} placeholder, so .format(i) never changes the page requested
    Centris = 'https://www.centris.ca/en/commercial-units~for-rent~montreal-ville-marie/26349148?view=Summary'.format(i)
    r = get(Centris, headers=headers)
    soup = bs(r.text, 'html.parser')
    results = soup.find_all('div', attrs={'id': 'divMainResult'})

    data = []
    for result in results:
        titre = result.find('span', attrs={'data-id': 'PageTitle'})
        titre = [str(titre.string).strip() for titre in titre]
        superficie = result.find('div', attrs={'class': 'carac-value'}, string=re.compile('sqft'))
        superficie = [str(superficie.string).strip() for superficie in superficie]
        emplacement = result.find_all('h2', attrs={'class': 'pt-1'})
        emplacement = [str(emplacement.string).strip() for emplacement in emplacement]
        prix = result.find_all('span', attrs={'class': 'text-nowrap'})
        prix = [(prix.text).strip('\w.') for prix in prix]
        description = result.find_all('div', attrs={'itemprop': 'description'})
        description = [str(description.string).strip() for description in description]
        lien = result.find_all('a', attrs={'class': 'dropdown-item js-copy-clipboard'})
To get pagination working, you can simulate the Ajax requests with the requests module:
import json
import requests
from bs4 import BeautifulSoup

url = "https://www.centris.ca/Property/GetInscriptions"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
}
json_data = {"startPosition": 0}

with requests.session() as s:
    # load cookies:
    s.get(
        "https://www.centris.ca/en/commercial-units~for-rent?uc=0",
        headers=headers,
    )
    for page in range(0, 100, 20):  # <-- increase number of pages here
        json_data["startPosition"] = page
        data = s.post(url, headers=headers, json=json_data).json()
        soup = BeautifulSoup(data["d"]["Result"]["html"], "html.parser")
        for a in soup.select(".a-more-detail"):
            print(a.select_one(".category").get_text(strip=True))
            print(a.select_one(".address").get_text(strip=True, separator="\n"))
            print("https://www.centris.ca" + a["href"])
            print("-" * 80)
Prints:
Commercial unit for rent
6560, Avenue de l'Esplanade, suite 105
Montréal (Rosemont/La Petite-Patrie)
Neighbourhood La Petite-Patrie
https://www.centris.ca/en/commercial-units~for-rent~montreal-rosemont-la-petite-patrie/16168393?view=Summary
--------------------------------------------------------------------------------
Commercial unit for rent
75, Rue Principale
Gatineau (Aylmer)
Neighbourhood Vieux Aylmer, Des Cèdres, Marina
https://www.centris.ca/en/commercial-units~for-rent~gatineau-aylmer/22414903?view=Summary
--------------------------------------------------------------------------------
Commercial building for rent
53, Rue Saint-Pierre, suite D
Saint-Pie
https://www.centris.ca/en/commercial-buildings~for-rent~saint-pie/15771470?view=Summary
--------------------------------------------------------------------------------
...and so on.
Thank you so much! I came up with this and it worked perfectly:
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.centris.ca/Property/GetInscriptions"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
}
json_data = {"startPosition": 0}

with requests.session() as s:
    Centris = []
    # load cookies:
    s.get(
        "https://www.centris.ca/en/commercial-units~for-rent?uc=0",
        headers=headers,
    )
    for page in range(0, 100, 20):  # <-- increase number of pages here
        json_data["startPosition"] = page
        data = s.post(url, headers=headers, json=json_data).json()
        soup = BeautifulSoup(data["d"]["Result"]["html"], "html.parser")
        for a in soup.select(".a-more-detail"):
            titre = a.select_one(".category").get_text(strip=True)
            emplacement = a.select_one(".address").get_text(strip=True, separator="\n")
            lien = "https://www.centris.ca" + a["href"]
            prix = a.select_one(".price").get_text(strip=True)
            Centris.append((titre, emplacement, lien, prix))

df = pd.DataFrame(Centris, columns=['Titre', 'Emplacement', 'Lien', 'Prix'])
writer = pd.ExcelWriter('Centris.xlsx')
df.to_excel(writer)
writer.save()
print('Data Saved To excel')
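Minor side note: for a single sheet you can also skip ExcelWriter and write the file directly (assuming openpyxl or another Excel engine is installed):

df.to_excel('Centris.xlsx', index=False)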

Extracting the company name and other information from all URLs present on a webpage using BeautifulSoup

<li>
<strong>Company Name</strong>
":"
<span itemprop="name">PT ERA MURNI BUSANA</span>
</li>
In the above HTML code, I am trying to extract the company name, which is PT ERA MURNI BUSANA.
If I use a single test link, I can get the name using the one-line code I wrote:
soup.find_all("span",attrs={"itemprop":"name"})[3].get_text()
But I want to extract the information from all such pages linked from a single web page.
So I wrote a for loop, but it isn't fetching the details correctly. I am pasting the part of the code that I have been trying, which needs some modification.
Code:-
for link in supplierlinks:  # links have been extracted and merged with the base url
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    companyname = soup.find_all("span", attrs={"itemprop": "name"})[2].get_text()
Output looks like:
{'Company Name': 'AIRINDO SAKTI GARMENT PT'}
{'Company Name': 'Garments'}
{'Company Name': 'Garments'}
Instead of "Garments" popping up in the output, I need the company name. How do I modify the code within the for loop?
Link:https://idn.bizdirlib.com/node/5290
Try this code:
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0'}
r = requests.get('https://idn.bizdirlib.com/node/5290', headers=headers).text
soup = BeautifulSoup(r, 'html5lib')
print(soup.find_all("span", attrs={"itemprop": "name"})[-1].get_text())

div = soup.find('div', class_="content clearfix")
li_tags = div.div.find_all('fieldset')[1].find_all('div')[-1].ul.find_all('li')

supplierlinks = []
for li in li_tags:
    try:
        supplierlinks.append("https://idn.bizdirlib.com/" + li.a['href'])
    except:
        pass

for link in supplierlinks:
    r = requests.get(link, headers=headers).text
    soup = BeautifulSoup(r, 'html5lib')
    print(soup.find_all("span", attrs={"itemprop": "name"})[-1].get_text())
Output:
PT ERA MURNI BUSANA
PT ELKA SURYA ABADI
PT EMPANG BESAR MAKMUR
PT EMS
PT ENERON
PT ENPE JAYA
PT ERIDANI TOUR AND TRAVEL
PT EURO ASIA TRADE & INDUSTRY
PT EUROKARS CHRISDECO UTAMA
PT EVERAGE VALVES METAL
PT EVICO
This code prints the company names for all the links on the page.
You can select the sibling element of the <strong> element that contains the text "Company Name" (also, don't forget to set the User-Agent HTTP header):
import requests
from bs4 import BeautifulSoup
url = 'https://idn.bizdirlib.com/node/5290'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0'}
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
print( soup.select_one('strong:contains("Company Name") + *').text )
Prints:
PT ERA MURNI BUSANA
EDIT: To get contact person:
import requests
from bs4 import BeautifulSoup
url = 'https://idn.bizdirlib.com/node/5290'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0'}
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
print( soup.select_one('strong:contains("Company Name") + *').text )
print( soup.select_one('strong:contains("Contact") + *').text )
Prints:
PT ERA MURNI BUSANA
Mr. Yohan Kustanto
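Side note: newer Beautiful Soup / soupsieve releases deprecate the :contains() pseudo-class in favour of :-soup-contains(), so on current versions the same lookup can be written as:

print( soup.select_one('strong:-soup-contains("Company Name") + *').text )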

Python Web Scraping / Beautiful Soup, with list of keywords at the end of URL

I'm trying to build a web scraper to get wine reviews off Vivino.com. I have a large list of wines and want it to search
url = ("https://www.vivino.com/search/wines?q=")
and then cycle through the list, scraping the rating text "4.5 - 203 reviews", the name of the wine, and the link to its page.
I found about 20 lines of code at https://www.kashifaziz.me/web-scraping-python-beautifulsoup.html/ for building a web scraper. I was trying to combine it with:
url = ("https://www.vivino.com/search/wines?q=")
#list having the keywords (made by splitting input with space as its delimiter)
keywords = input().split()
#go through the keywords
for key in keywords:
    #everything else is same logic
    r = requests.get(url + key)
    print("URL :", url + key)
    if 'The specified profile could not be found.' in r.text:
        print("This is available")
    else:
        print('\nSorry that one is taken')
Also, where would I include the list of keywords?
I'd love any help with this! I'm trying to get better at python but not sure I'm at this level yet haha.
Thank you.
This script traverses all pages for the selected keyword and extracts the title, price, rating, number of reviews, and link for each wine:
import re
import requests
from time import sleep
from bs4 import BeautifulSoup

url = 'https://www.vivino.com/search/wines?q={kw}&start={page}'
prices_url = 'https://www.vivino.com/prices'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0'}

def get_wines(kw):
    with requests.session() as s:
        page = 1
        while True:
            soup = BeautifulSoup(s.get(url.format(kw=kw, page=page), headers=headers).content, 'html.parser')

            if not soup.select('.default-wine-card'):
                break

            params = {'vintages[]': [wc['data-vintage'] for wc in soup.select('.default-wine-card')]}
            prices_js = s.get(prices_url, params=params, headers={
                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0',
                'X-Requested-With': 'XMLHttpRequest',
                'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01'
            }).text
            wine_prices = dict(re.findall(r"\$\('\.vintage-price-id-(\d+)'\)\.find\( '\.wine-price-value' \)\.text\( '(.*?)' \);", prices_js))

            for wine_card in soup.select('.default-wine-card'):
                title = wine_card.select_one('.header-smaller').get_text(strip=True, separator=' ')
                price = wine_prices.get(wine_card['data-vintage'], '-')
                average = wine_card.select_one('.average__number')
                average = average.get_text(strip=True) if average else '-'
                ratings = wine_card.select_one('.text-micro')
                ratings = ratings.get_text(strip=True) if ratings else '-'
                link = 'https://www.vivino.com' + wine_card.a['href']
                yield title, price, average, ratings, link

            sleep(3)
            page += 1

kw = 'angel'
for title, price, average, ratings, link in get_wines(kw):
    print(title)
    print(price)
    print(average + ' / ' + ratings)
    print(link)
    print('-' * 80)
Prints:
Angél ica Zapata Malbec Alta
-
4,4 / 61369 ratings
https://www.vivino.com/wines/1469874
--------------------------------------------------------------------------------
Château d'Esclans Whispering Angel Rosé
16,66
4,1 / 38949 ratings
https://www.vivino.com/wines/1473981
--------------------------------------------------------------------------------
Angél ica Zapata Cabernet Sauvignon Alta
-
4,3 / 27699 ratings
https://www.vivino.com/wines/1471376
--------------------------------------------------------------------------------
... and so on.
EDIT: To search for specific wines, you can put the keywords in a list and then check each wine in a loop:
import re
import requests
from time import sleep
from bs4 import BeautifulSoup

url = 'https://www.vivino.com/search/wines?q={kw}&start={page}'
prices_url = 'https://www.vivino.com/prices'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0'}

def get_wines(kw):
    with requests.session() as s:
        page = 1
        while True:
            soup = BeautifulSoup(s.get(url.format(kw=kw, page=page), headers=headers).content, 'html.parser')

            if not soup.select('.default-wine-card'):
                break

            params = {'vintages[]': [wc['data-vintage'] for wc in soup.select('.default-wine-card')]}
            prices_js = s.get(prices_url, params=params, headers={
                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0',
                'X-Requested-With': 'XMLHttpRequest',
                'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01'
            }).text
            wine_prices = dict(re.findall(r"\$\('\.vintage-price-id-(\d+)'\)\.find\( '\.wine-price-value' \)\.text\( '(.*?)' \);", prices_js))

            no = 1
            for no, wine_card in enumerate(soup.select('.default-wine-card'), 1):
                title = wine_card.select_one('.header-smaller').get_text(strip=True, separator=' ')
                price = wine_prices.get(wine_card['data-vintage'], '-')
                average = wine_card.select_one('.average__number')
                average = average.get_text(strip=True) if average else '-'
                ratings = wine_card.select_one('.text-micro')
                ratings = ratings.get_text(strip=True) if ratings else '-'
                link = 'https://www.vivino.com' + wine_card.a['href']
                yield title, price, average, ratings, link

            # if no < 20:
            #     break
            # sleep(3)

            page += 1

wines = ['10 SPAN VINEYARDS CABERNET SAUVIGNON CENTRAL COAST',
         '10 SPAN VINEYARDS CHARDONNAY CENTRAL COAST']

for wine in wines:
    for title, price, average, ratings, link in get_wines(wine):
        print(title)
        print(price)
        print(average + ' / ' + ratings)
        print(link)
        print('-' * 80)
Prints:
10 Span Vineyards Cabernet Sauvignon
-
3,7 / 557 ratings
https://www.vivino.com/wines/4535453
--------------------------------------------------------------------------------
10 Span Vineyards Chardonnay
-
3,7 / 150 ratings
https://www.vivino.com/wines/5815131
--------------------------------------------------------------------------------
import requests

#list having the keywords (made by splitting input with space as its delimiter)
keywords = input().split()

#go through the keywords
for key in keywords:
    url = "https://www.vivino.com/search/wines?q={}".format(key)
    #everything else is same logic
    r = requests.get(url)
    print("URL :", url)
    if 'The specified profile could not be found.' in r.text:
        print("This is available")
    else:
        print('\nSorry that one is taken')
For the list of keywords, you can use a text file with one keyword per line, as shown in the sketch below.
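A minimal sketch of that idea (keywords.txt is just an assumed filename, one keyword per line), reusing the get_wines() generator from the answer above:

# read one keyword per line; keywords.txt is an assumed filename
with open('keywords.txt', encoding='utf-8') as f:
    wines = [line.strip() for line in f if line.strip()]

for wine in wines:
    for title, price, average, ratings, link in get_wines(wine):
        print(title, price, average, ratings, link)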

Using lists with Python

I have a new question for you. My idea is to check whether the price from the first link is <= the first price in the list prices = [90.70, 90.20, 50.80]; if it isn't, the code should jump to the second link and check whether its price is <= the second price in the list, and so on.
I've tried to do that, but this code checks whether the first price is >= every price in the list. I'm using >= to see how many emails it sends, and it sends 7 emails instead of 2.
How can I fix that?
import requests
from bs4 import BeautifulSoup
import smtplib
import time

urls = ["https://www.amazon.it/Corsair-Vengeance-Memorie-Desktop-Prestazioni/dp/B0143UM4TC",
        "https://www.amazon.it/AMD-Ryzen-5-3600-Processori/dp/B07STGGQ18",
        "https://www.amazon.it/Apple-iPhone-Grigio-Siderale-Ricondizionato/dp/B07985C44N"]
prices = [90.70, 90.20, 50.80]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0 Chrome/83.0.4103.97 Safari/537.36'}

all_product = []

def check_price():
    for url in urls:
        soup = BeautifulSoup(requests.get(url, headers=headers).content, 'lxml')
        title = soup.find(id='productTitle').get_text(strip=True)
        try:
            products = soup.find(id='priceblock_ourprice').get_text()
            fix_string = products.replace(",", ".")
            converted_price = float(fix_string[0:5])
            all_product.append(converted_price)
            #print(all_product)
            for price in prices:
                if converted_price >= price:
                    pass  # send email
        except AttributeError:
            print("Price not found, check if the product has an exposed price")

print(all_product)
What your code does now is fetch the price from each URL and check that price against every price in the prices list.
What you want is to check the n-th URL against the n-th price. This can be achieved by either:
Zipping the two lists together:
urls = [
    "https://www.amazon.it/Corsair-Vengeance-Memorie-Desktop-Prestazioni/dp/B0143UM4TC",
    "https://www.amazon.it/AMD-Ryzen-5-3600-Processori/dp/B07STGGQ18",
    "https://www.amazon.it/Apple-iPhone-Grigio-Siderale-Ricondizionato/dp/B07985C44N",
]
prices = [90.70, 90.20, 50.80]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0 Chrome/83.0.4103.97 Safari/537.36'}

all_product = []

def check_price():
    for url, price in zip(urls, prices):
        soup = BeautifulSoup(requests.get(url, headers=headers).content, 'lxml')
        title = soup.find(id='productTitle').get_text(strip=True)
        try:
            products = soup.find(id='priceblock_ourprice').get_text()
            fix_string = products.replace(",", ".")
            converted_price = float(fix_string[0:5])
            all_product.append(converted_price)
            if converted_price >= price:
                pass  # send email
        except AttributeError:
            print("Price not found, check if the product has an exposed price")
Or by adding the prices as part of your url list:
urls_and_prices = [
    ("https://www.amazon.it/Corsair-Vengeance-Memorie-Desktop-Prestazioni/dp/B0143UM4TC", 90.70),
    ("https://www.amazon.it/AMD-Ryzen-5-3600-Processori/dp/B07STGGQ18", 90.20),
    ("https://www.amazon.it/Apple-iPhone-Grigio-Siderale-Ricondizionato/dp/B07985C44N", 50.80),
]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0 Chrome/83.0.4103.97 Safari/537.36'}

all_product = []

def check_price():
    for url, price in urls_and_prices:
        soup = BeautifulSoup(requests.get(url, headers=headers).content, 'lxml')
        title = soup.find(id='productTitle').get_text(strip=True)
        try:
            products = soup.find(id='priceblock_ourprice').get_text()
            fix_string = products.replace(",", ".")
            converted_price = float(fix_string[0:5])
            all_product.append(converted_price)
            if converted_price >= price:
                pass  # send email
        except AttributeError:
            print("Price not found, check if the product has an exposed price")
I think the latter might be better for you, since it is easier to see the paired items when they are on the same line together.
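As a side note, the # send email placeholder could be filled with a small smtplib sketch along these lines; the SMTP host, credentials, and addresses below are placeholders, not values from the question:

import smtplib
from email.message import EmailMessage

def send_price_alert(title, price):
    # all connection details here are placeholder assumptions -- replace with your own
    msg = EmailMessage()
    msg["Subject"] = f"Price alert: {title}"
    msg["From"] = "you@example.com"
    msg["To"] = "you@example.com"
    msg.set_content(f"{title} is currently at {price} EUR")
    with smtplib.SMTP_SSL("smtp.example.com", 465) as server:
        server.login("you@example.com", "app-password")
        server.send_message(msg)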
