I have a new question for you. My idea is to check whether the price at the first link is <= the first price in the list prices = [90.70, 90.20, 50.80], and if it isn't, the code has to jump to the second link and check whether its price is <= the second price in the list called prices, and so on.
I've tried to do that, but this code checks whether the first price is >= every price in the list. I'm using >= to see how many emails it sends, and it sends 7 emails instead of 2.
How can I fix that?
import requests
from bs4 import BeautifulSoup
import smtplib
import time

urls = ["https://www.amazon.it/Corsair-Vengeance-Memorie-Desktop-Prestazioni/dp/B0143UM4TC",
        "https://www.amazon.it/AMD-Ryzen-5-3600-Processori/dp/B07STGGQ18",
        "https://www.amazon.it/Apple-iPhone-Grigio-Siderale-Ricondizionato/dp/B07985C44N"]
prices = [90.70, 90.20, 50.80]
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0 Chrome/83.0.4103.97 Safari/537.36'}
all_product = []

def check_price():
    for url in urls:
        soup = BeautifulSoup(requests.get(url, headers=headers).content, 'lxml')
        title = soup.find(id='productTitle').get_text(strip=True)
        try:
            products = soup.find(id='priceblock_ourprice').get_text()
            fix_string = products.replace(",", ".")
            converted_price = float(fix_string[0:5])
            all_product.append(converted_price)
            #print(all_product)
            for price in prices:
                if converted_price >= price:
                    pass  # send email
        except AttributeError:
            print("Price not found, check if the product has an exposed price")

print(all_product)
What your code does now is fetch the price from each URL and check that price against every price in the prices list.
What you want is to check the n-th URL against the n-th price. This can be achieved by either:
Zipping the two lists together:
urls = [
    "https://www.amazon.it/Corsair-Vengeance-Memorie-Desktop-Prestazioni/dp/B0143UM4TC",
    "https://www.amazon.it/AMD-Ryzen-5-3600-Processori/dp/B07STGGQ18",
    "https://www.amazon.it/Apple-iPhone-Grigio-Siderale-Ricondizionato/dp/B07985C44N",
]
prices = [90.70, 90.20, 50.80]
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0 Chrome/83.0.4103.97 Safari/537.36'}
all_product = []

def check_price():
    # zip pairs the n-th URL with the n-th target price
    for url, price in zip(urls, prices):
        soup = BeautifulSoup(requests.get(url, headers=headers).content, 'lxml')
        title = soup.find(id='productTitle').get_text(strip=True)
        try:
            products = soup.find(id='priceblock_ourprice').get_text()
            fix_string = products.replace(",", ".")
            converted_price = float(fix_string[0:5])
            all_product.append(converted_price)
            if converted_price >= price:
                pass  # send email
        except AttributeError:
            print("Price not found, check if the product has an exposed price")
Or by adding the prices as part of your url list:
urls_and_prices = [
    ("https://www.amazon.it/Corsair-Vengeance-Memorie-Desktop-Prestazioni/dp/B0143UM4TC", 90.70),
    ("https://www.amazon.it/AMD-Ryzen-5-3600-Processori/dp/B07STGGQ18", 90.20),
    ("https://www.amazon.it/Apple-iPhone-Grigio-Siderale-Ricondizionato/dp/B07985C44N", 50.80),
]
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0 Chrome/83.0.4103.97 Safari/537.36'}
all_product = []

def check_price():
    for url, price in urls_and_prices:
        soup = BeautifulSoup(requests.get(url, headers=headers).content, 'lxml')
        title = soup.find(id='productTitle').get_text(strip=True)
        try:
            products = soup.find(id='priceblock_ourprice').get_text()
            fix_string = products.replace(",", ".")
            converted_price = float(fix_string[0:5])
            all_product.append(converted_price)
            if converted_price >= price:
                pass  # send email
        except AttributeError:
            print("Price not found, check if the product has an exposed price")
I think the latter might be better for you: the paired items sit on the same line, which makes it easier to see which price belongs to which URL.
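If you also want to fill in the # send email placeholder, here is a minimal sketch using smtplib (which your script already imports); the SMTP server, sender address, app password and recipient are placeholders you would have to replace:

import smtplib
from email.message import EmailMessage

def send_email(title, price):
    # placeholder credentials -- replace with your own account and app password
    sender = "you@example.com"
    password = "your-app-password"
    recipient = "you@example.com"
    msg = EmailMessage()
    msg["Subject"] = f"Price alert: {title}"
    msg["From"] = sender
    msg["To"] = recipient
    msg.set_content(f"{title} is now at {price} EUR")
    # smtp.gmail.com is an assumption -- use your own provider's SMTP host
    with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
        server.login(sender, password)
        server.send_message(msg)

You could then call send_email(title, converted_price) where the # send email comment sits.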
Related
I'm trying to get every category, sub-category, sub-sub-category and so on of author URLs from the dmoz website using BeautifulSoup.
I'm getting the following output:
# Missing the every 2nd option/URL in first step
/Arts/Literature/Authors/A
/Arts/Literature/Authors/C
/Arts/Literature/Authors/E
/Arts/Literature/Authors/G
/Arts/Literature/Authors/Horror
. . .
# Missing the every 1st option/URL in second step
/Arts/Literature/Authors/A/Abbey,_Lynn
/Top/Arts/Literature/Authors/A/Abe,_Kobo
In the above output, every 2nd element is missing in the 1st step and every 1st element in the 2nd step.
Here is my code:
import requests
from bs4 import BeautifulSoup as bs

session = requests.Session()
sub_cat = []
records = []

scrape_url = "http://dmoz.org/Arts/Literature/Authors"
page = session.get(scrape_url, headers={
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
})
soup = bs(page.text, 'html.parser')
find_row = soup.find_all('div', attrs={'class': 'row'})[6:7]

# get all the root category author list
for test in find_row:
    if test.find('div', attrs={'class': 'panel-body'}):
        test_link = test.find_all('a')
        for link in test_link:
            sub_cat.append(link['href'])

# now get the sub or sub-sub category author URL list
for cat in sub_cat:
    scrape_cat_url = "http://dmoz.org%s" % (cat)
    print('scraping...', scrape_cat_url)
    page = session.get(scrape_cat_url, headers={
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    })
    soup = bs(page.text, 'html.parser')
    find_row = soup.find_all('div', attrs={'class': 'row'})[6:7]
    # if sub category go next level or restart
    for row in find_row:
        if row.find('div', attrs={'class': 'panel-body'}):
            test_link = row.find_all('a')
            for link in test_link:
                sub_cat.append(link['href'])
            records.append(scrape_cat_url)
        else:
            records.append(scrape_cat_url)
    # remove the category url from the sub_cat list
    sub_cat.remove(cat)
Can anybody suggest a better way to get all the category, sub category and sub sub category URL of authors?
Try this streamlined version of your code:
import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}

scrape_url = "http://dmozlive.com/Top/Arts/Literature/Authors"
page = requests.get(scrape_url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
find_rows = soup.find_all('div', attrs={'class': 'row'})[6:7]

cats = []
for row in find_rows:
    links = row.find_all('a')
    for link in links:
        cats.append(link['href'])

cats
Print out:
['/Top/Arts/Literature/Authors/A',
'/Top/Arts/Literature/Authors/B',
'/Top/Arts/Literature/Authors/C',
'/Top/Arts/Literature/Authors/D',
'/Top/Arts/Literature/Authors/E',
'/Top/Arts/Literature/Authors/F',
…
Now get the subcategories:
sub_cats = []
for cat in cats:
    scrape_url = f"http://dmozlive.com{cat}"
    page = requests.get(scrape_url, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    find_rows = soup.find_all('div', attrs={'class': 'row'})[6:7]
    for row in find_rows:
        links = row.find_all('a')
        for link in links:
            sub_cats.append(link['href'])

sub_cats
Print out:
['/Top/Arts/Literature/Authors/A/Abbey,_Edward',
'/Top/Arts/Literature/Authors/A/Abbey,_Lynn',
'/Top/Arts/Literature/Authors/A/Abbott,_Edwin_A.',
'/Top/Arts/Literature/Authors/A/Abe,_Kobo',
'/Top/Arts/Literature/Authors/A/Achebe,_Chinua',
'/Top/Arts/Literature/Authors/A/Ackroyd,_Peter',
'/Top/Arts/Literature/Authors/A/Adams,_Douglas',
…
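If you also need the sub-sub-categories and deeper levels, here is a minimal sketch (assuming every level keeps the same div.row layout as above) that walks the tree breadth-first with a queue instead of appending to the list you are iterating over:

import requests
from collections import deque
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}

def get_links(path):
    # fetch one category page and return the hrefs found in the same row block
    page = requests.get(f"http://dmozlive.com{path}", headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    rows = soup.find_all('div', attrs={'class': 'row'})[6:7]
    return [a['href'] for row in rows for a in row.find_all('a')]

def crawl(start_path):
    # breadth-first walk: every link found is recorded once and queued for its own visit
    seen = set()
    queue = deque([start_path])
    records = []
    while queue:
        path = queue.popleft()
        for href in get_links(path):
            if href not in seen:
                seen.add(href)
                records.append(href)
                queue.append(href)
    return records

# all_urls = crawl('/Top/Arts/Literature/Authors')  # slow: it visits every page once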
The following code may meet your expectation; it pulls all the category and sub-category URLs.
import requests
from bs4 import BeautifulSoup

url = 'http://dmozlive.com/Top/Arts/Literature/Authors'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"}
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'html.parser')

for cat_url in soup.select('.list-group.col-md-6 a'):
    cat_url = 'http://dmozlive.com' + cat_url.get('href')
    #print(cat_url)
    req2 = requests.get(cat_url, headers=headers)
    soup2 = BeautifulSoup(req2.text, 'html.parser')
    for author_url in soup2.select('.list-group-item'):
        author_url = 'http://dmozlive.com' + str(author_url.get('href'))
        print(author_url)
<img class="no-img" data-src="https://im1.dineout.co.in/images/uploads/restaurant/sharpen/4/h/u/p4059-15500352575c63a9394c209.jpg?tr=tr:n-medium" alt="Biryani By Kilo" data-gatype="RestaurantImageClick" data-url="/delhi/biryani-by-kilo-connaught-place-central-delhi-40178" data-w-onclick="cardClickHandler" src="https://im1.dineout.co.in/images/uploads/restaurant/sharpen/4/h/u/p4059-15500352575c63a9394c209.jpg?tr=tr:n-medium">
page url - https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p=1
This page contains some restaurant cards. While scraping the page in the loop, I want to go inside the restaurant card URL, which is in the data-url attribute in the HTML code above, and scrape the number of reviews from inside it. I don't know how to do it; my current code for scraping the normal front page is:
import re
import requests
from bs4 import BeautifulSoup

def extract(page):
    url = f"https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p={page}"  # URL of the website
    header = {'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}  # Temporary user agent
    r = requests.get(url, headers=header)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def transform(soup):  # function to scrape the page
    divs = soup.find_all('div', class_='restnt-card restaurant')
    for item in divs:
        title = item.find('a').text.strip()  # restaurant name
        loc = item.find('div', class_='restnt-loc ellipsis').text.strip()  # restaurant location
        try:  # some restaurants are unrated; scraping those would raise an error
            rating = item.find('div', class_="img-wrap").text
            rating = re.sub("[^0-9,.]", "", rating)
        except:
            rating = None
        pricce = item.find('span', class_="double-line-ellipsis").text.strip()  # price for biryani
        price = re.sub("[^0-9]", "", pricce)[:-1]
        biry_del = {
            'name': title,
            'location': loc,
            'rating': rating,
            'price': price
        }
        rest_list.append(biry_del)

rest_list = []
for i in range(1, 18):
    print(f'getting page, {i}')
    c = extract(i)
    transform(c)
I hope you understood; please ask in a comment if anything is unclear.
It's not very fast, but it looks like you can get all the details you want, including the review count (not 232!), if you hit this backend API endpoint:
https://www.dineout.co.in/get_rdp_data_main/delhi/69676/restaurant_detail_main
import requests
from bs4 import BeautifulSoup
import pandas as pd

rest_list = []
for page in range(1, 3):
    print(f'getting page, {page}')
    s = requests.Session()
    url = f"https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p={page}"  # URL of the website
    header = {'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}  # Temporary user agent
    r = s.get(url, headers=header)
    soup = BeautifulSoup(r.content, 'html.parser')
    divs = soup.find_all('div', class_='restnt-card restaurant')
    for item in divs:
        code = item.find('a')['href'].split('-')[-1]  # restaurant code
        print(f'Getting details for {code}')
        data = s.get(f'https://www.dineout.co.in/get_rdp_data_main/delhi/{code}/restaurant_detail_main').json()
        info = data['header']
        info.pop('share')  # clean up csv
        info.pop('options')
        rest_list.append(info)

df = pd.DataFrame(rest_list)
df.to_csv('dehli_rest.csv', index=False)
The snippet below worked fine until the other day. Is there any way to extract all the data inside this div class="row mb-4" easily? What I am thinking is that, even if additional changes are made to the page, the script will still not be affected.
import re
import requests
from bs4 import BeautifulSoup

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}
url = "https://bscscan.com/token/"
token = "0x4ce1a5cb12151423ea479cfd0c52ec5021d108d8"
tokenurl = str(url) + str(token)

contractpage = requests.get(tokenurl, headers=header)
ca = BeautifulSoup(contractpage.content, 'html.parser')
tokenholders = ca.find(id='ContentPlaceHolder1_tr_tokenHolders').get_text()
tokenholdersa = (((tokenholders.strip().strip("Holders:")).strip()).strip(" a ")).strip()
tholders = ((((tokenholders.strip()).strip("Holders:")).strip()).strip(" a ")).strip()
tokenaname = ca.find('span', class_='text-secondary small').get_text().strip()

def get_transfer_count(token: str) -> str:
    with requests.Session() as s:
        s.headers = {'User-Agent': 'Mozilla/5.0'}
        r = s.get(f'https://bscscan.com/token/{token}')
        try:
            sid = re.search(r"var sid = '(.*?)'", r.text).group(1)
            r = s.get(f'https://bscscan.com/token/generic-tokentxns2?m=normal&contractAddress={token}&a=&sid={sid}&p=1')
            return re.search(r"var totaltxns = '(.*?)'", r.text).group(1)
        except:
            pass

transcount = get_transfer_count(token)
print("Token: ", tokenaname)
print("Holders: ", tholders)
print("Transfers: ", transcount)
Previous Output:
Token: Binemon
Holders: 27,099
Transfers: 439,636
Wanted Improved Output:
Token: Binemon
PRICE: $0.01 # 0.000037 BNB (-22.41%)
Fully Diluted Market Cap: $14,011,783.50
Total Supply: 975,000,000 BIN
Holders: 27,099 addresses
Transfers: 439,636
Contract: 0xe56842ed550ff2794f010738554db45e60730371
Decimals: 18
Official Site: https://binemon.io/
Social Profiles:
https://twitter.com/binemonnft
https://t.me/binemonchat
https://docs.binemon.io/
https://coinmarketcap.com/currencies/binemon/
https://www.coingecko.com/en/coins/binemon/
Try:
import requests
from bs4 import BeautifulSoup

header = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0",
}

tokenurl = (
    "https://bscscan.com/token/0x7083609fce4d1d8dc0c979aab8c869ea2c873402"
)

contractpage = requests.get(tokenurl, headers=header)
ca = BeautifulSoup(contractpage.content, "html.parser")

name = ca.h1.span.get_text(strip=True)
price = ca.select_one(".card-body .d-block").get_text(strip=True)
cap = ca.select_one("#pricebutton").get_text(strip=True)

print("Token:", name)
print("PRICE:", price)
print("Fully Diluted Market Cap:", cap)
print()

for c in ca.select(".row .col-md-8"):
    pt = c.find_previous(class_="col-md-4").get_text(strip=True)
    t = c.get_text(strip=True, separator=" ").split("(")[0]
    if pt == "Social Profiles:":
        links = [a["href"].strip() for a in c.select("a")]
        print(pt, *links, sep="\n\t")
    else:
        print(pt, t)
Prints:
Token: Binance-Peg Polkadot Token
PRICE: $30.35# 0.079643 BNB(-10.39%)
Fully Diluted Market Cap: $485,657,455.49
Total Supply: 15,999,999.991309 DOT
Holders: 80,065 addresses
Transfers: -
Contract: 0x7083609fce4d1d8dc0c979aab8c869ea2c873402
Decimals: 18
Official Site: https://polkadot.network/
Social Profiles:
https://polkadot.network/blog
https://reddit.com/r/dot
https://twitter.com/polkadotnetwork
https://github.com/w3f
https://polkadot.network/PolkaDotPaper.pdf
https://coinmarketcap.com/currencies/polkadot-new/
https://www.coingecko.com/en/coins/polkadot/
I am really wondering and in panic mode, as quite a few snippets I have been using are problematic. This one used to produce the desired output, but all of a sudden a blank output is displayed.
import requests, random, re
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0",
}

url = "https://bscscan.com/token/generic-tokenholders2?m=normal&a="
token = "0xe56842ed550ff2794f010738554db45e60730371"
holdersurl = str(url) + str(token)

s = requests.Session()
r = requests.get(holdersurl, headers=headers)
soupblockdetails = BeautifulSoup(r.content, 'html.parser')
rowsblockdetails = soupblockdetails.findAll('table')[0].findAll('tr', limit=6)
for row in rowsblockdetails[1:]:
    rank = row.find_all('td')[0].text[0:].strip()
    address = row.find_all('td')[1].text[0:].strip()
    amount = row.find_all('td')[2].text[0:].strip()
    percentage = row.find_all('td')[3].text[0:]
    value = row.find_all('td')[4].text[0:]
    try:
        calock = row.select_one("i.far.fa-file-alt.text-secondary")['title']
        calock = "Contract"
    except:
        calock = "Unlocked"
    print("{} {}-% {:>8} {:>23} {} {} ".format(rank, address, amount, percentage, value, calock))
Previous Output: #-- Working output previously
1 0xbbda05ea467ad348212dade5c38c11910c14e83e 202,543,296.764175197290329227 20.77% $3,054,207.08 Unlocked
2 UniCrypt: Token Vesting 150,000,000 15.38% $2,261,892.00 Contract
3 PancakeSwap V2: BIN 17 99,217,566.546415684406759104 10.17% $1,496,129.47 Contract
4 Binemon: BIN Token 27,010,296.006072937294814033 2.77% $407,295.82 Contract
5 0x81da471feb4a45438053dc05e709be056ec26c39 14,865,062 1.52% $224,154.43 Unlocked
Now try this code. If you run the original code multiple times, the website will block you; once we add cookies, it runs perfectly.
import requests, random, re
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0",
}
cookies = {'session': '17ab96bd8ffbe8ca58a78657a918558'}

url = "https://bscscan.com/token/generic-tokenholders2?m=normal&a="
token = "0xe56842ed550ff2794f010738554db45e60730371"
holdersurl = str(url) + str(token)

s = requests.Session()
# pass the session cookie along with the request so the site does not block us
r = s.get(holdersurl, headers=headers, cookies=cookies)
soupblockdetails = BeautifulSoup(r.content, 'html.parser')
rowsblockdetails = soupblockdetails.findAll('table')[0].findAll('tr', limit=6)
for row in rowsblockdetails[1:]:
    rank = row.find_all('td')[0].text[0:].strip()
    address = row.find_all('td')[1].text[0:].strip()
    amount = row.find_all('td')[2].text[0:].strip()
    percentage = row.find_all('td')[3].text[0:]
    value = row.find_all('td')[4].text[0:]
    try:
        calock = row.select_one("i.far.fa-file-alt.text-secondary")['title']
        calock = "Contract"
    except:
        calock = "Unlocked"
    print("{} {}-% {:>8} {:>23} {} {} ".format(rank, address, amount, percentage, value, calock))
I'm trying to build a web scraper to get the reviews of wines off Vivino.com. I have a large list of wines and want it to search
url = ("https://www.vivino.com/search/wines?q=")
then cycle through the list, scraping the rating text ("4.5 - 203 reviews"), the name of the wine and the attached link to the page.
I found about 20 lines of code at https://www.kashifaziz.me/web-scraping-python-beautifulsoup.html/ for building a web scraper and was trying to combine it with:
import requests

url = ("https://www.vivino.com/search/wines?q=")
#list having the keywords (made by splitting input with space as its delimiter)
keywords = input().split()
#go through the keywords
for key in keywords:
    #everything else is same logic
    r = requests.get(url + key)
    print("URL :", url + key)
    if 'The specified profile could not be found.' in r.text:
        print("This is available")
    else:
        print('\nSorry that one is taken')
Also, where would I include the list of keywords?
I'd love any help with this! I'm trying to get better at python but not sure I'm at this level yet haha.
Thank you.
This script traverses all result pages for the selected keyword and collects the title, price, rating, reviews and link for each wine:
import re
import requests
from time import sleep
from bs4 import BeautifulSoup

url = 'https://www.vivino.com/search/wines?q={kw}&start={page}'
prices_url = 'https://www.vivino.com/prices'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0'}

def get_wines(kw):
    with requests.session() as s:
        page = 1
        while True:
            soup = BeautifulSoup(s.get(url.format(kw=kw, page=page), headers=headers).content, 'html.parser')
            if not soup.select('.default-wine-card'):
                break
            params = {'vintages[]': [wc['data-vintage'] for wc in soup.select('.default-wine-card')]}
            prices_js = s.get(prices_url, params=params, headers={
                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0',
                'X-Requested-With': 'XMLHttpRequest',
                'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01'
            }).text
            wine_prices = dict(re.findall(r"\$\('\.vintage-price-id-(\d+)'\)\.find\( '\.wine-price-value' \)\.text\( '(.*?)' \);", prices_js))
            for wine_card in soup.select('.default-wine-card'):
                title = wine_card.select_one('.header-smaller').get_text(strip=True, separator=' ')
                price = wine_prices.get(wine_card['data-vintage'], '-')
                average = wine_card.select_one('.average__number')
                average = average.get_text(strip=True) if average else '-'
                ratings = wine_card.select_one('.text-micro')
                ratings = ratings.get_text(strip=True) if ratings else '-'
                link = 'https://www.vivino.com' + wine_card.a['href']
                yield title, price, average, ratings, link
            sleep(3)
            page += 1

kw = 'angel'
for title, price, average, ratings, link in get_wines(kw):
    print(title)
    print(price)
    print(average + ' / ' + ratings)
    print(link)
    print('-' * 80)
Prints:
Angél ica Zapata Malbec Alta
-
4,4 / 61369 ratings
https://www.vivino.com/wines/1469874
--------------------------------------------------------------------------------
Château d'Esclans Whispering Angel Rosé
16,66
4,1 / 38949 ratings
https://www.vivino.com/wines/1473981
--------------------------------------------------------------------------------
Angél ica Zapata Cabernet Sauvignon Alta
-
4,3 / 27699 ratings
https://www.vivino.com/wines/1471376
--------------------------------------------------------------------------------
... and so on.
EDIT: To select only specific wines, you can put the keywords inside a list and then check each wine in a loop:
import re
import requests
from time import sleep
from bs4 import BeautifulSoup

url = 'https://www.vivino.com/search/wines?q={kw}&start={page}'
prices_url = 'https://www.vivino.com/prices'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0'}

def get_wines(kw):
    with requests.session() as s:
        page = 1
        while True:
            soup = BeautifulSoup(s.get(url.format(kw=kw, page=page), headers=headers).content, 'html.parser')
            if not soup.select('.default-wine-card'):
                break
            params = {'vintages[]': [wc['data-vintage'] for wc in soup.select('.default-wine-card')]}
            prices_js = s.get(prices_url, params=params, headers={
                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0',
                'X-Requested-With': 'XMLHttpRequest',
                'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01'
            }).text
            wine_prices = dict(re.findall(r"\$\('\.vintage-price-id-(\d+)'\)\.find\( '\.wine-price-value' \)\.text\( '(.*?)' \);", prices_js))

            no = 1
            for no, wine_card in enumerate(soup.select('.default-wine-card'), 1):
                title = wine_card.select_one('.header-smaller').get_text(strip=True, separator=' ')
                price = wine_prices.get(wine_card['data-vintage'], '-')
                average = wine_card.select_one('.average__number')
                average = average.get_text(strip=True) if average else '-'
                ratings = wine_card.select_one('.text-micro')
                ratings = ratings.get_text(strip=True) if ratings else '-'
                link = 'https://www.vivino.com' + wine_card.a['href']
                yield title, price, average, ratings, link

            # if no < 20:
            #     break

            # sleep(3)
            page += 1

wines = ['10 SPAN VINEYARDS CABERNET SAUVIGNON CENTRAL COAST',
         '10 SPAN VINEYARDS CHARDONNAY CENTRAL COAST']

for wine in wines:
    for title, price, average, ratings, link in get_wines(wine):
        print(title)
        print(price)
        print(average + ' / ' + ratings)
        print(link)
        print('-' * 80)
Prints:
10 Span Vineyards Cabernet Sauvignon
-
3,7 / 557 ratings
https://www.vivino.com/wines/4535453
--------------------------------------------------------------------------------
10 Span Vineyards Chardonnay
-
3,7 / 150 ratings
https://www.vivino.com/wines/5815131
--------------------------------------------------------------------------------
import requests

#list having the keywords (made by splitting input with space as its delimiter)
keywords = input().split()
#go through the keywords
for key in keywords:
    url = "https://www.vivino.com/search/wines?q={}".format(key)
    #everything else is same logic
    r = requests.get(url)
    print("URL :", url)
    if 'The specified profile could not be found.' in r.text:
        print("This is available")
    else:
        print('\nSorry that one is taken')
For the list of keywords, you can use a text file with one keyword per line.
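For example, a minimal sketch of reading the keywords from such a file (keywords.txt is a hypothetical file name):

import requests

# keywords.txt is a hypothetical file with one search keyword per line
with open('keywords.txt') as f:
    keywords = [line.strip() for line in f if line.strip()]

for key in keywords:
    url = "https://www.vivino.com/search/wines?q={}".format(key)
    r = requests.get(url)
    print("URL :", url, "->", r.status_code)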