How do I parse two elements that are stuck together? - python

I want to get rating and numVotes from zomato.com, but unfortunately it seems like the elements are stuck together. It's hard to explain, but I made a quick video showcasing what I mean.
https://streamable.com/sdh0w
entire code: https://pastebin.com/JFKNuK2a
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
response = requests.get("https://www.zomato.com/san-francisco/restaurants?q=restaurants&page=1", headers=headers)
content = response.content
bs = BeautifulSoup(content, "html.parser")

zomato_containers = bs.find_all("div", {"class": "search-snippet-card"})
for zomato_container in zomato_containers:
    rating = zomato_container.find('div', {'class': 'search_result_rating'})
    # numVotes = zomato_container.find("div", {"class": "rating-votes-div"})
    print("rating: ", rating.get_text().strip())
    # print("numVotes: ", numVotes.text())

You can use the re module to parse the voting count:
import re
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
response = requests.get("https://www.zomato.com/san-francisco/restaurants?q=restaurants&page=1", headers=headers)
content = response.content
bs = BeautifulSoup(content, "html.parser")

zomato_containers = bs.find_all("div", {"class": "search-snippet-card"})
for zomato_container in zomato_containers:
    print('name:', zomato_container.select_one('.result-title').get_text(strip=True))
    print('rating:', zomato_container.select_one('.rating-popup').get_text(strip=True))
    # keep only the digits from the votes element
    votes = ''.join(re.findall(r'\d', zomato_container.select_one('[class^="rating-votes"]').text))
    print('votes:', votes)
    print('*' * 80)
Prints:
name: The Original Ghirardelli Ice Cream and Chocolate...
rating: 4.9
votes: 344
********************************************************************************
name: Tadich Grill
rating: 4.6
votes: 430
********************************************************************************
name: Delfina
rating: 4.8
votes: 718
********************************************************************************
...and so on.
OR:
If you don't want to use re, you can use str.split():
votes = zomato_container.select_one('[class^="rating-votes"]').get_text(strip=True).split()[0]
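For completeness, here is a minimal sketch of the split-based version inside the same loop (reusing zomato_containers from the snippet above):

for zomato_container in zomato_containers:
    # "344 votes" -> take the first whitespace-separated token
    votes = zomato_container.select_one('[class^="rating-votes"]').get_text(strip=True).split()[0]
    print('votes:', votes)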

According to the requirements in your clip, you should alter your selectors to be more specific so as to target the appropriate child elements (rather than the parent). At present, by targeting the parent you are getting the unwanted extra child. To get the appropriate ratings element you can use a CSS attribute = value selector with the starts-with operator.
This
[class^=rating-votes-div]
says match on elements with a class attribute whose value starts with rating-votes-div.
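To see the operator in isolation, here is a tiny self-contained sketch (the HTML string is made up for illustration):

from bs4 import BeautifulSoup

# hypothetical markup mimicking the pattern on the page
html = '<div class="rating-votes-div-12345">344 votes</div>'
soup = BeautifulSoup(html, 'html.parser')

# matches because the class value starts with "rating-votes-div"
print(soup.select_one('[class^=rating-votes-div]').text)  # 344 votes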
Applied to the page:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
response = requests.get("https://www.zomato.com/san-francisco/restaurants?q=restaurants&page=1", headers=headers)
content = response.content
bs = BeautifulSoup(content, "html.parser")

zomato_containers = bs.find_all("div", {"class": "search-snippet-card"})
for zomato_container in zomato_containers:
    name = zomato_container.select_one('.result-title').text.strip()
    rating = zomato_container.select_one('.rating-popup').text.strip()
    numVotes = zomato_container.select_one('[class^=rating-votes-div]').text
    print('name: ', name)
    print('rating: ', rating)
    print('votes: ', numVotes)

Related

getting an empty list when trying to extract urls from google with beautifulsoup

I am trying to extract the first 100 URLs returned from a location search on Google;
however, I am getting an empty list every time ("no results found").
import requests
from bs4 import BeautifulSoup

def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all("div", class_="r")
    websites = []
    if results:
        counter = 0
        for result in results:
            websites.append(result.find("a")["href"])
            counter += 1
            if counter == 100:
                break
    else:
        print("No search results found.")
    return websites

location = "Athens"
print(get_location_info(location))
No search results found.
[]
I have also tried this approach:

import requests
from bs4 import BeautifulSoup

def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all("div", class_="r")
    websites = [result.find("a")["href"] for result in results][:10]
    return websites

location = "sifnos"
print(get_location_info(location))

and I get an empty list. I think I am doing everything suggested in similar posts, but I still get nothing.
Always and first of all, take a look at your soup to see if all the expected ingredients are in place.
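For example, a quick sanity check before writing any selectors (a minimal sketch):

import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.google.com/search?q=sifnos+information',
                 headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(r.text, 'html.parser')

# a consent or redirect page betrays itself in the title and the first chunk of HTML
print(soup.title)
print(soup.prettify()[:1000])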
Select your elements more specifically, in this case for example with a CSS selector:
[a.get('href') for a in soup.select('a:has(>h3)')]
To avoid the consent banner, also send some cookies:
cookies={'CONSENT':'YES+'}
Example
import requests
from bs4 import BeautifulSoup

def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers, cookies={'CONSENT': 'YES+'})
    soup = BeautifulSoup(response.text, 'html.parser')
    # pick only links whose direct child is an <h3> (i.e. the result titles)
    websites = [a.get('href') for a in soup.select('a:has(>h3)')]
    return websites

location = "sifnos"
print(get_location_info(location))
Output
['https://www.griechenland.de/sifnos/', 'http://de.sifnos-greece.com/plan-trip-to-sifnos/travel-information.php', 'https://www.sifnosisland.gr/', 'https://www.visitgreece.gr/islands/cyclades/sifnos/', 'http://www.griechenland-insel.de/Hauptseiten/sifnos.htm', 'https://worldonabudget.de/sifnos-griechenland/', 'https://goodmorningworld.de/sifnos-griechenland/', 'https://de.wikipedia.org/wiki/Sifnos', 'https://sifnos.gr/en/sifnos/', 'https://www.discovergreece.com/de/cyclades/sifnos']

How to extract key info from <script> tag

I'm trying to extract the user id from this link
https://www.instagram.com/design.kaf/
using bs4 and regex.
I found a JSON key inside a script tag called "profile_id",
but I can't even search for that script tag.
You can find my regex attempt here:
https://regex101.com/r/WmlAEc/1
Also, I can't find a way to pull out this particular <script> tag.
my code:

import re
import requests
from bs4 import BeautifulSoup

url = "https://www.instagram.com/design.kaf/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36'
}
response = requests.request("GET", url, headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
a = str(soup.findAll("script"))
x = re.findall('profile_id":"-?\d+"', a)
id = int(x[0])
print(id)
Here is another answer using the re approach:
import requests
from bs4 import BeautifulSoup
import re, ast

url = 'https://www.instagram.com/design.kaf/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36'
}
r = requests.request("GET", url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
s = soup.findAll('script')
s = str(s)

# this will find "profile_id":"5172989370"
to_be_find_string = re.findall('"profile_id":"-?\d+"', s)[0]  # changed your regex by adding a double quote at the beginning
string_formatted_as_dict = '{' + to_be_find_string + '}'
# convert the dict-formatted <str> to an actual <dict>
profile_dict = ast.literal_eval(string_formatted_as_dict)
print(profile_dict['profile_id'])  # prints your user id, i.e. 5172989370
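Since the extracted fragment is valid JSON, json.loads would arguably be a more natural fit than ast.literal_eval here; a minimal sketch, reusing to_be_find_string from above:

import json

# '{"profile_id":"5172989370"}' is valid JSON, so json.loads parses it directly
profile_dict = json.loads('{' + to_be_find_string + '}')
print(profile_dict['profile_id'])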
You can try this code; it is an approach using a loop and string search:
import requests
from bs4 import BeautifulSoup

url = 'https://www.instagram.com/design.kaf/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36'
}
r = requests.request("GET", url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
s = soup.findAll('script')
s = str(s)

id_str = ''
counter = 0
while True:
    # our required string format: "profile_id":"0123456789...",
    str_to_find = '"profile_id":"'
    index_p = s.find(str_to_find)  # index of the first character, i.e. the opening double quote
    # the first digit of the id starts at index_p + length of the searched string
    if s[index_p + len(str_to_find) + counter] == '"':
        break  # iteration stops when we reach the closing double quote
    else:
        id_str += s[index_p + len(str_to_find) + counter]
        counter += 1
print(id_str)  # prints 5172989370 in this case
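The same search can be done without the character-by-character loop by locating the closing quote with str.find and slicing; a minimal sketch, reusing the s string from above:

str_to_find = '"profile_id":"'
start = s.find(str_to_find) + len(str_to_find)  # index of the first digit
end = s.find('"', start)                        # index of the closing quote
print(s[start:end])                             # 5172989370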

How to extract text from different ids with BeautifulSoup

I want to extract text from the id, but every id has a different value. Check it:
div', id='statement80863
div', id='statement26092
and so on...
CODE
import requests
from bs4 import BeautifulSoup
import re

limit = 100
url = f'https://www.counselingcalifornia.com/cc/cgi-bin/utilities.dll/customlist?FIRSTNAME=~&LASTNAME=~&ZIP=&DONORCLASSSTT=&_MULTIPLE_INSURANCE=&HASPHOTOFLG=&_MULTIPLE_EMPHASIS=&ETHNIC=&_MULTIPLE_LANGUAGE=ENG&QNAME=THERAPISTLIST&WMT=NONE&WNR=NONE&WHP=therapistHeader.htm&WBP=therapistList.htm&RANGE=1%2F{limit}&SORT=LASTNAME'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

rows = soup.find_all('div', {'class': 'row'})
for row in rows:
    des = row.find('div', id='statement80863').text
    print(des)
You can use regular expressions to select only such <div> tags.
row.find('div', {'id': re.compile('^statement.*')}) - this will select the <div> tags that have an id starting with the word statement.
import re
import requests
from bs4 import BeautifulSoup

url = 'https://www.counselingcalifornia.com/cc/cgi-bin/utilities.dll/customlist?FIRSTNAME=~&LASTNAME=~&ZIP=&DONORCLASSSTT=&_MULTIPLE_INSURANCE=&HASPHOTOFLG=&_MULTIPLE_EMPHASIS=&ETHNIC=&_MULTIPLE_LANGUAGE=ENG&QNAME=THERAPISTLIST&WMT=NONE&WNR=NONE&WHP=therapistHeader.htm&WBP=therapistList.htm&RANGE=1%2F100&SORT=LASTNAME'
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'lxml')

rows = soup.find_all('div', class_='row')
for row in rows:
    d = row.find('div', {'id': re.compile('^statement.*')})
    if d:
        # your scraping code here, e.g. print the statement text:
        print(d.get_text(strip=True))

Scrape and save the data into CSV in Beautiful Soup

Below is the url to scrape
https://www.agtta.co.in/individuals.php
I need to extract Name, Mobile number, and Email
I need to save it into a CSV after that.
I am able to scrape the full data, extracting using a user agent; below is the code:
from bs4 import BeautifulSoup
import urllib.request

urls = ['https://www.agtta.co.in/individuals.php']
for url in urls:
    req = urllib.request.Request(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        }
    )
    resp = urllib.request.urlopen(req)
    soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'), features='html.parser')
    scrape_data = soup.find('section', class_='b-branches')
    to_list = scrape_data.find_all_next(string=True)
I tried with
for biz in results:
    # print(biz)
    title = biz.findAll('h3', {'class': 'b-branches__title ui-title-inner ui-title-inner_lg'})
    print(title)
I'm getting [<h3 class="b-branches__title ui-title-inner ui-title-inner_lg">SHRI RAMESHBHAI P. SAKARIYA</h3>].
The tag comes along while extracting; how do I remove the tag?
My expected output:
Name, Mobilenumber, Email
A, 333, mm#gmail.com
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd

urls = ['https://www.agtta.co.in/individuals.php']
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}

result = []
for url in urls:
    req = urllib.request.Request(url, headers=headers)
    resp = urllib.request.urlopen(req)
    soup = BeautifulSoup(resp, from_encoding=resp.info().get_param('charset'), features='html.parser')
    for individual in soup.findAll("section", {"class": "b-branches"}):
        name = individual.h3.text
        phone_data = individual.find('p')
        phone = phone_data.text.replace("Mobile No", "").strip() if phone_data else ""
        email_data = individual.select('div:contains("Email")')
        email = email_data[0].text.replace("Email", "").strip() if email_data else ""
        result.append({"Name": name, "Phone": phone, "Email": email})

output = pd.DataFrame(result)
output.to_csv("Details.csv", index=False)
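If you'd rather not depend on pandas, the same rows can be written with the standard library's csv module; a minimal sketch, assuming the result list of dicts built above:

import csv

with open("Details.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["Name", "Phone", "Email"])
    writer.writeheader()
    writer.writerows(result)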
Here is the full code to do it:
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
}
r = requests.get('https://www.agtta.co.in/individuals.php', headers=headers).text
soup = BeautifulSoup(r, 'html5lib')
sections = soup.find_all('section', class_="b-branches")

names = []
phone_numbers = []
emails = []
for section in sections:
    name = section.h3.text
    names.append(name)
    phone_number = section.p.text
    phone_number = phone_number.split('Mobile No ')[1]
    phone_numbers.append(phone_number)
    try:
        email = section.find_all('div')[3].text
        email = email.split('Email ')[1]
        emails.append(email)
    except IndexError:  # some entries have no email div
        emails.append(None)

details_dict = {"Names": names,
                "Phone Numbers": phone_numbers,
                "Emails": emails}
df = pd.DataFrame(details_dict)
df.to_csv("Details.csv", index=False)
Output: a Details.csv file with Names, Phone Numbers, and Emails columns.
Hope that this helps!

Web-scraping: Accessing text information within a large list

Example: https://www.realtor.com/realestateandhomes-detail/20013-Hazeltine-Pl_Ashburn_VA_20147_M65748-31771
I am trying to access the number of garage spaces for several real estate listings. The only problem is that the number of garage spaces isn't always at the 9th position of the list. On some pages it is earlier, and on other pages it is later.
garage = info[9].strip().replace('\n','')[15]
where
info = soup.find_all('ul', {'class': "list-default"})
info = [t.text for t in info]
and
header = {"user agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.2 Safari/605.1.15"}
page = requests.get(url, headers = header)
page.reason
requests.utils.default_user_agent()
soup = bs4.BeautifulSoup(page.text, 'html5lib')
What is the best way for me to obtain how many garage spaces a house listing has?
You can use the CSS selector li:contains("Garage Spaces:"), which will find the <li> tag containing the text "Garage Spaces:".
For example:
import requests
from bs4 import BeautifulSoup

url = 'https://www.realtor.com/realestateandhomes-detail/20013-Hazeltine-Pl_Ashburn_VA_20147_M65748-31771'
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.2 Safari/605.1.15"}
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')

garage_spaces = soup.select_one('li:contains("Garage Spaces:")')
if garage_spaces:
    garage_spaces = garage_spaces.text.split()[-1]
    print('Found Garage spaces! num =', garage_spaces)
Prints:
Found Garage spaces! num = 2
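Note: newer versions of soupsieve (the CSS engine behind BeautifulSoup's select) deprecate :contains in favor of the :-soup-contains spelling; a tiny self-contained sketch with made-up HTML:

from bs4 import BeautifulSoup

html = '<ul class="list-default"><li>Beds: 4</li><li>Garage Spaces: 2</li></ul>'
soup = BeautifulSoup(html, 'html.parser')

# ':-soup-contains' is the non-deprecated spelling of ':contains'
li = soup.select_one('li:-soup-contains("Garage Spaces:")')
print(li.text.split()[-1])  # 2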
