Scrape table fields from html with specific class - python

I want to build a simple scraper for Google Shopping and I've run into some problems.
This is the HTML from my request (to https://www.google.es/shopping/product/7541391777504770249/online), where I'm trying to query the element with class sh-osd__total-price inside the row with class sh-osd__offer-row.
My code is currently:
from bs4 import BeautifulSoup
from requests import get
url = 'https://www.google.es/shopping/product/7541391777504770249/online'
response = get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')
r = html_soup.findAll('tr', {'class': 'sh-osd__offer-row'}) #Returns empty
print(r)
r = html_soup.findAll('tr', {'class': 'sh-osd__total-price'}) #Returns empty
print(r)
Both r are empty; BeautifulSoup doesn't find anything.
Is there any way to find these two classes with BeautifulSoup?

You need to add a user agent to the headers:
from bs4 import BeautifulSoup
from requests import get
url = 'https://www.google.es/shopping/product/7541391777504770249/online'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'} #<-- added line
response = get(url, headers=headers) #<--- include here
html_soup = BeautifulSoup(response.text, 'html.parser')
r = html_soup.find_all('tr', {'class': 'sh-osd__offer-row'}) # now returns the offer rows
print(r)
r = html_soup.find_all('tr', {'class': 'sh-osd__total-price'}) # may still be empty if that class sits on a cell inside the row rather than on a <tr>
print(r)
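With the rows found, here is a minimal sketch of pulling the total price out of each one (assuming the sh-osd__total-price class sits on some element inside each row; searching by class alone means the tag name doesn't matter):
for row in r:
    total = row.find(class_='sh-osd__total-price')
    if total: # skip rows without a price element
        print(total.get_text(strip=True))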
But since it's a <table> tag, you can use pandas (which uses BeautifulSoup under the hood) to do the hard work for you. pd.read_html returns a list of DataFrames, one per <table> on the page:
import pandas as pd
url = 'https://www.google.es/shopping/product/7541391777504770249/online'
dfs = pd.read_html(url)
print(dfs[-1])
Output:
Sellers Seller Rating ... Base Price Total Price
0 One Fragance No rating ... £30.95 +£8.76 delivery £39.71
1 eBay No rating ... £46.81 £46.81
2 Carethy.co.uk No rating ... £34.46 +£3.99 delivery £38.45
3 fruugo.co.uk No rating ... £36.95 +£9.30 delivery £46.25
4 cosmeticsmegastore.com/gb No rating ... £36.95 +£9.30 delivery £46.25
5 Perfumes Club UK No rating ... £30.39 +£5.99 delivery £36.38
[6 rows x 5 columns]
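If the direct pd.read_html(url) call gets blocked the same way the plain request was, you can feed it the HTML you already fetched with the browser User-Agent; read_html accepts a file-like object, so wrapping the text in StringIO works across pandas versions:
from io import StringIO
import pandas as pd
dfs = pd.read_html(StringIO(response.text)) # reuse the response fetched above with the UA header
print(dfs[-1])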

Related

how to scrape page inside the result card using Bs4?

<img class="no-img" data-src="https://im1.dineout.co.in/images/uploads/restaurant/sharpen/4/h/u/p4059-15500352575c63a9394c209.jpg?tr=tr:n-medium" alt="Biryani By Kilo" data-gatype="RestaurantImageClick" data-url="/delhi/biryani-by-kilo-connaught-place-central-delhi-40178" data-w-onclick="cardClickHandler" src="https://im1.dineout.co.in/images/uploads/restaurant/sharpen/4/h/u/p4059-15500352575c63a9394c209.jpg?tr=tr:n-medium">
page url - https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p=1
This page contains several restaurant cards. While scraping the page in a loop, I want to follow each card's URL (the data-url attribute in the HTML above), open it, and scrape the number of reviews from inside it. I don't know how to do that; my current code, which scrapes only the front page, is:
import requests
import re
from bs4 import BeautifulSoup

def extract(page):
    url = f"https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p={page}" # URL of the website
    header = {'User-Agent':'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'} # temporary user agent
    r = requests.get(url, headers=header)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def transform(soup): # function to scrape one results page
    divs = soup.find_all('div', class_='restnt-card restaurant')
    for item in divs:
        title = item.find('a').text.strip() # restaurant name
        loc = item.find('div', class_='restnt-loc ellipsis').text.strip() # restaurant location
        try: # some restaurants are unrated; scraping those would raise an error
            rating = item.find('div', class_="img-wrap").text
            rating = re.sub("[^0-9,.]", "", rating)
        except:
            rating = None
        price_text = item.find('span', class_="double-line-ellipsis").text.strip() # price for biryani
        price = re.sub("[^0-9]", "", price_text)[:-1]
        biry_del = {
            'name': title,
            'location': loc,
            'rating': rating,
            'price': price
        }
        rest_list.append(biry_del)

rest_list = []
for i in range(1, 18):
    print(f'getting page, {i}')
    c = extract(i)
    transform(c)
I hope that's clear; please ask in the comments if anything is confusing.
It's not very fast, but it looks like you can get all the details you want, including the review count, if you hit this backend API endpoint:
https://www.dineout.co.in/get_rdp_data_main/delhi/69676/restaurant_detail_main
import requests
from bs4 import BeautifulSoup
import pandas as pd

rest_list = []
for page in range(1, 3):
    print(f'getting page, {page}')
    s = requests.Session()
    url = f"https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p={page}" # URL of the website
    header = {'User-Agent':'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'} # temporary user agent
    r = s.get(url, headers=header)
    soup = BeautifulSoup(r.content, 'html.parser')
    divs = soup.find_all('div', class_='restnt-card restaurant')
    for item in divs:
        code = item.find('a')['href'].split('-')[-1] # restaurant code
        print(f'Getting details for {code}')
        data = s.get(f'https://www.dineout.co.in/get_rdp_data_main/delhi/{code}/restaurant_detail_main').json()
        info = data['header']
        info.pop('share') # clean up fields that don't belong in a CSV
        info.pop('options')
        rest_list.append(info)

df = pd.DataFrame(rest_list)
df.to_csv('dehli_rest.csv', index=False)
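The column names come straight from the JSON, so before trimming anything it's worth seeing what the endpoint actually returned; a quick check that makes no assumptions about specific key names:
print(df.columns.tolist()) # every field the API sent back
print(df.head())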

Finding a field in parsed HTML content

I was trying to get the table from https://www.innovatoretfs.com/etf/default.aspx?ticker=kjul
I just want the top-right table
I wrote the code in Python and got as far as the parsed HTML content.
import requests
from bs4 import BeautifulSoup

url = "https://www.innovatoretfs.com/etf/default.aspx?ticker=kjul"
headers = {'User-Agent': 'Mozilla/5.0'} # browser-like UA; referenced below but previously never defined
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, "html.parser")
table_it = soup.find_all(class_="al_fund")
value_it = soup.find_all(class_="ar_fund")
print(table_it)
print(value_it)
But I am not able to assemble the final values into a table because of the HTML format. Can someone please help?
Another approach would be to grab all the values for given divs and then zip them all together.
Here's how:
import requests
from bs4 import BeautifulSoup

def extractor(s: BeautifulSoup, tag: str, cls: str) -> list:
    return [i.getText() for i in s.find_all(tag, class_=cls)]

url = "https://www.innovatoretfs.com/etf/default.aspx?ticker=kjul"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
row_names = extractor(soup, "div", "al_fund")
row_values = extractor(soup, "div", "ar_fund")
for k, v in list(zip(row_names, row_values))[:6]:
    print(f"{k}\t{v}")
Output:
Ticker KJUL
Listing Date 7/1/2020
Number of Holdings 7
Expense Ratio 0.79%
Intraday NAV KJUL.IV
Exchange Cboe BZX
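If you'd rather have the pairs as a lookup table than printed lines, zipping them into a dict is a small extension (a sketch; the keys are whatever labels the page shows in the left column):
info = dict(zip(row_names, row_values))
print(info.get("Ticker")) # KJUL, per the output above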
A quick and easy way to get the desired content is the following:
import requests
from bs4 import BeautifulSoup

link = 'https://www.innovatoretfs.com/etf/default.aspx?ticker=kjul'
with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    r = s.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    for item in soup.select(".noshadecol")[:6]:
        title = item.select_one(".al_fund").text
        value = item.select_one(".ar_fund").text
        print(title, value)
Output:
Ticker KJUL
Listing Date 7/1/2020
Number of Holdings 7
Expense Ratio 0.79%
Intraday NAV KJUL.IV
Exchange Cboe BZX
With respect to the structure of the HTML, you did right to extract al_fund and ar_fund. You just need to add this code to print the table:
for i in range(6):
    print(f'{table_it[i].text} : {value_it[i].text}')
Ticker : KJUL
Listing Date : 7/1/2020
Number of Holdings : 7
Expense Ratio : 0.79%
Intraday NAV : KJUL.IV
Exchange : Cboe BZX

Only scrape a portion of the page

I am using Python/requests to gather data from a website. Ideally I only want the latest 'banking' information, which is always at the top of the page.
The code I have currently does that, but then it attempts to keep going and hits an index-out-of-range error. I am not very good with .aspx pages, but is it possible to gather only the data under the 'banking' heading?
Here's what I have so far:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
print('Scraping South Dakota Banking Activity Actions...')
url2 = 'https://dlr.sd.gov/banking/monthly_activity_reports/monthly_activity_reports.aspx'
r2 = requests.get(url2, headers=headers)
soup = BeautifulSoup(r2.text, 'html.parser')
mylist5 = []
for tr in soup.find_all('tr')[2:]:
    tds = tr.find_all('td')
    print(tds[0].text, tds[1].text) # IndexError once a row has fewer than two cells
Ideally I'd be able to slice the information as well so I can only show the activity or approval status, etc.
With bs4 4.7.1+ you can use :contains to isolate the latest month by filtering out the months further down the page. I explain the principle of filtering out later general siblings using :not in this SO answer. In short: find the row containing "August 2019" (the month is determined dynamically) and grab it together with all its following siblings, then find the row containing "July 2019" together with all of its following siblings, and subtract the latter set from the former.
import requests, re
from bs4 import BeautifulSoup as bs
import pandas as pd

r = requests.get('https://dlr.sd.gov/banking/monthly_activity_reports/monthly_activity_reports.aspx')
soup = bs(r.content, 'lxml')
months = [i.text for i in soup.select('[colspan="2"]:has(a)')][0::2]
latest_month = months[0]
next_month = months[1]
rows_of_interest = soup.select(f'tr:contains("{latest_month}"), tr:contains("{latest_month}") ~ tr:not(:contains("{next_month}"), :contains("{next_month}") ~ tr)')
results = []
for row in rows_of_interest:
    data = [re.sub('\xa0|\s{2,}', ' ', td.text) for td in row.select('td')]
    if len(data) == 1:
        data.extend(['']) # pad single-cell heading rows to two columns
    results.append(data)
df = pd.DataFrame(results)
print(df)
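One caveat with newer releases: soupsieve (the CSS engine BeautifulSoup uses) has deprecated :contains in favour of :-soup-contains, so if you see a deprecation warning the same selector can be spelled:
rows_of_interest = soup.select(
    f'tr:-soup-contains("{latest_month}"), '
    f'tr:-soup-contains("{latest_month}") ~ tr:not(:-soup-contains("{next_month}"), '
    f':-soup-contains("{next_month}") ~ tr)'
)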
Same setup as before:
import requests
from bs4 import BeautifulSoup, Tag
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
url = 'https://dlr.sd.gov/banking/monthly_activity_reports/monthly_activity_reports.aspx'
print('Scraping South Dakota Banking Activity Actions...')
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
Inspecting the page source, we can find the id of the element you need (the table of values).
banking = soup.find(id='secondarycontent')
After this, we filter out the elements of the soup that aren't tags (such as NavigableString). You can see how to get the texts too (for other options, check the Tag docs).
blocks = [b for b in banking.table.contents if type(b) is Tag] # filter out NavigableString
texts = [b.text for b in blocks]
Now, if by "latest" you mean the most recent month, we must determine which month is the latest and which month comes before it.
current_month_idx, last_month_idx = None, None
current_month, last_month = 'August 2019', 'July 2019' # can parse with datetime too
for i, b in enumerate(blocks):
    if current_month in b.text:
        current_month_idx = i
    elif last_month in b.text:
        last_month_idx = i
    if all(idx is not None for idx in (current_month_idx, last_month_idx)):
        break # stop once both indices have been found
assert current_month_idx < last_month_idx
curr_month_blocks = [b for i, b in enumerate(blocks) if current_month_idx < i < last_month_idx]
curr_month_texts = [b.text for b in curr_month_blocks]
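The comment above mentions parsing with datetime; a minimal sketch of deriving the two labels dynamically, assuming the page always leads with the current calendar month:
from datetime import date, timedelta

first_of_month = date.today().replace(day=1)
end_of_prev_month = first_of_month - timedelta(days=1)
current_month = first_of_month.strftime('%B %Y') # e.g. 'August 2019'
last_month = end_of_prev_month.strftime('%B %Y') # e.g. 'July 2019'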

BeautifulSoup parsing error

I am trying to extract some information about an App on Google Play and BeautifulSoup doesn't seem to work.
The link is this (say):
https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts
My code:
url = "https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts"
r = requests.get(url)
html = r.content
soup = BeautifulSoup(html)
l = soup.find_all("div", { "class" : "document-subtitles"})
print len(l)
0 #How is this 0?! There is clearly a div with that class
I decided to go all in with a full selector path; that didn't work either:
i = soup.select('html body.no-focus-outline.sidebar-visible.user-has-no-subscription div#wrapper.wrapper.wrapper-with-footer div#body-content.body-content div.outer-container div.inner-container div.main-content div div.details-wrapper.apps.square-cover.id-track-partial-impression.id-deep-link-item div.details-info div.info-container div.info-box-top')
print i
What am I doing wrong?
You need to pretend to be a real browser by supplying the User-Agent header:
import requests
from bs4 import BeautifulSoup
url = "https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts"
r = requests.get(url, headers={
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
})
html = r.content
soup = BeautifulSoup(html, "html.parser")
title = soup.find(class_="id-app-title").get_text()
rating = soup.select_one(".document-subtitle .star-rating-non-editable-container")["aria-label"].strip()
print(title)
print(rating)
Prints the title and the current rating:
Weird Facts
Rated 4.3 stars out of five stars
To get the additional information field values, you can use the following generic function:
def get_info(soup, text):
    return soup.find("div", class_="title", text=lambda t: t and t.strip() == text).\
        find_next_sibling("div", class_="content").get_text(strip=True)
Then, if you do:
print(get_info(soup, "Size"))
print(get_info(soup, "Developer"))
You will see printed:
1.4M
Email email#here.com
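Recent BeautifulSoup releases deprecate the text= keyword in favour of string=; the same helper with the newer spelling (behaviour unchanged) would look like:
def get_info(soup, text):
    title_div = soup.find("div", class_="title", string=lambda t: t and t.strip() == text)
    return title_div.find_next_sibling("div", class_="content").get_text(strip=True)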

python nested for to retrieve css tags values

The tags from a web page are as follows:
<div class="lg_col MT5">
<p>
<span class="sp starGryB">4.4</span>
</p>
<p class="MT5 UC">
<span class="gd10gb">141 Ratings</span>
</p>
</div>
I am trying to retrieve the values "4.4" and "141 Ratings" for every div with class "lg_col MT5".
The nested for loop I'm using isn't working as expected; it seems the hierarchy of the tags isn't taken into account.
import requests
import sys
from bs4 import BeautifulSoup

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0"}

def test_function():
    url = "http://www.burrp.com/chennai/search.html?q=buffet"
    source_code = requests.get(url, headers=HEADERS)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for tag in soup.select('div.lg_col.MT5'):
        for tag1 in soup.select('span.sp.starGryB'):
            try:
                print(tag1.string)
            except KeyError:
                pass
        for tag2 in soup.select('span.gd10gb'):
            try:
                print(tag2.string)
            except KeyError:
                pass

test_function()
The expected output is 4.4 followed by 141 Ratings for each of the div tags on the webpage.
But the actual output is all the starGryB values followed by all the gd10gb values, repeated over and over.
Use tag.select instead of soup.select if you want to look in just tag and not the entire soup.
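A minimal sketch of the corrected loop, scoping both lookups to each card (select_one returns None when a card lacks the element, hence the guards):
for tag in soup.select('div.lg_col.MT5'):
    score = tag.select_one('span.sp.starGryB')
    rating = tag.select_one('span.gd10gb')
    print(score.string if score else 'NA', rating.string if rating else 'NA')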
Not for points.
This is another way to scrape it to avoid having to deal with loops.
import requests
from bs4 import BeautifulSoup

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0"}
url = "http://www.burrp.com/chennai/search.html?q=buffet"
source_code = requests.get(url, headers=HEADERS)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
tags_1 = soup.find_all('span', class_='sp starGryB')
tags_2 = [tag.parent.parent.select('span.gd10gb') for tag in tags_1]
tags_3 = [tag.parent.parent.parent.select('a.gr24mb.UC') for tag in tags_1]
scores = [score.get_text() for score in tags_1]
ratings = [rating[0].get_text() if len(rating) > 0 else 'NA' for rating in tags_2]
names = [name[0].get_text().strip() for name in tags_3]
tags = zip(names, scores, ratings)
for a, b, c in tags:
    print(a, b, c)
Result:
Wild Amazon 2.9 27 Ratings
European Buffet NA NA
Flamingo 2.3 17 Ratings
The Holy Smoke 2.9 13 Ratings
Snow Park 2.6 14 Ratings
Dhabba Express 2.7 11 Ratings
The Yellow Chilli 2.7 6 Ratings
The Piano, The Savera Hotel 2.5 6 Ratings
Roasts & Grills, Green Park Hotel 2.3 6 Ratings
