I'm trying to scrape this page for all the offers, and I want to iterate over <p class="white-strip">, but page_soup.find_all("p", "white-strip") returns an empty list [].
My code so far:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.sbicard.com/en/personal/offers.page#all-offers'
# Opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# html parsing
page_soup = soup(page_html, "lxml")
# this is the call that returns an empty list
containers = page_soup.find_all("p", "white-strip")
Edit: I got it working using Selenium, and below is the code I used. However, I still can't figure out how the same can be done without a browser.
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome("C:\chromedriver_win32\chromedriver.exe")
driver.get('https://www.sbicard.com/en/personal/offers.page#all-offers')
# html parsing
page_soup = BeautifulSoup(driver.page_source, 'lxml')
# grabs each offer
containers = page_soup.find_all("p", {'class':"white-strip"})
filename = "offers.csv"
f = open(filename, "w")
header = "offer-list\n"
f.write(header)
for container in containers:
    offer = container.span.text
    f.write(offer + "\n")
f.close()
driver.close()
If you look for either of the items, you can find them within a script tag containing var offerData. To get the desired content out of that script, you can try the following.
import re
import json
import requests
url = "https://www.sbicard.com/en/personal/offers.page#all-offers"
res = requests.get(url)
p = re.compile(r"var offerData=(.*?);", re.DOTALL)
script = p.findall(res.text)[0].strip()
items = json.loads(script)
for item in items['offers']['offer']:
    print(item['text'])
Output is like:
Upto Rs 8000 off on flights at Yatra
Electricity Bill payment – Phonepe Offer
25% off on online food ordering
Get 5% cashback at Best Price stores
Get 5% cashback
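If you want to rebuild the offers.csv from the question with this requests-only approach, here is a minimal sketch (same regex and JSON keys as above; using the csv module is my own choice of convenience):
import re
import json
import csv
import requests

url = "https://www.sbicard.com/en/personal/offers.page#all-offers"
res = requests.get(url)

# pull the JSON assigned to var offerData out of the script tag
script = re.compile(r"var offerData=(.*?);", re.DOTALL).findall(res.text)[0].strip()
items = json.loads(script)

with open("offers.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["offer-list"])
    for item in items['offers']['offer']:
        writer.writerow([item['text']])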
The website renders its data dynamically (via JS or AJAX requests).
You should try the Selenium automation library; it allows you to scrape dynamically rendered page data.
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome("/usr/bin/chromedriver")
driver.get('https://www.sbicard.com/en/personal/offers.page#all-offers')
page_soup = BeautifulSoup(driver.page_source, 'lxml')
p_list = page_soup.find_all("p", {'class':"white-strip"})
print(p_list)
where '/usr/bin/chromedriver' is the Selenium web driver path.
Download the Selenium web driver for the Chrome browser:
http://chromedriver.chromium.org/downloads
Install the web driver for the Chrome browser:
https://christopher.su/2015/selenium-chromedriver-ubuntu/
Selenium tutorial:
https://selenium-python.readthedocs.io/
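If you don't want a visible browser window, Chrome can also run headless; a minimal sketch, assuming the same driver path and page as above:
from bs4 import BeautifulSoup
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome('/usr/bin/chromedriver', options=options)
driver.get('https://www.sbicard.com/en/personal/offers.page#all-offers')

page_soup = BeautifulSoup(driver.page_source, 'lxml')
print(page_soup.find_all("p", {'class': "white-strip"}))
driver.quit()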
Related
To start, this is my first time using Stack Overflow!
I started my Python journey yesterday, and I'm trying to extract values from some pages automatically.
This is my code:
import requests
from bs4 import BeautifulSoup
url = 'https://www.jpg.store/collection/chilledkongs'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
div = soup.find('div', class_ = 'stat-title')
print(div)
I'm getting nothing, and my objective is to get the floor price. At the moment it is 888.
The floor price is loaded via JavaScript from an external source. To get it via requests, use the next example:
import json
import requests
from bs4 import BeautifulSoup

url = "https://www.jpg.store/collection/chilledkongs"
api_url = "https://server.jpgstoreapis.com/collection/{}/floor"

# the Next.js page embeds its data as JSON inside the __NEXT_DATA__ script tag
soup = BeautifulSoup(requests.get(url).content, "html.parser")
data = soup.select_one("#__NEXT_DATA__").contents[0]
data = json.loads(data)

# the policy_id identifies the collection in the floor-price API
policy_id = data["props"]["pageProps"]["collection"]["policy_id"]
data = requests.get(api_url.format(policy_id)).json()
print(data["floor"] / 1_000_000)
Prints:
888.0
As @Bao Huynh Lamn stated, the website is being dynamically generated/rendered using JavaScript, so you can use an automation tool like Selenium.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://www.jpg.store/collection/chilledkongs'
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.close()
# the floor price is in the second-to-last "stat-title" div
print(soup.find_all('div', class_='stat-title')[-2].text)
Output:
888
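Note that the [-2] index is positional and can break if the page layout changes. A quick debugging sketch to see which index holds the floor price (reusing the soup from above):
for i, div in enumerate(soup.find_all('div', class_='stat-title')):
    print(i, div.get_text(strip=True))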
I'm trying to get an item's price from the Target website. I did some examples for this website using Selenium and the Redsky API, but now I have tried to write the bs4 code below:
import requests
from bs4 import BeautifulSoup

url = "https://www.target.com/p/ruffles-cheddar-38-sour-cream-potato-chips-2-5oz/-/A-14930847#lnk=sametab"
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
price = soup.find("div", class_="web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price)
But it returns None.
I tried soup.find("div",{'class': "web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp"})
What am I missing?
I can accept any Selenium code or Redsky API code, but my priority is bs4.
The page is dynamic: the data is rendered after the initial request is made. You can use Selenium to load the page, and once it's rendered, you can pull out the relevant tag. An API, though, is always the preferred way to go if one is available.
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
# If you don't want to open a browser, comment out the line above and uncomment below
#options = webdriver.ChromeOptions()
#options.add_argument('headless')
#driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe', options=options)
url = "https://www.target.com/p/ruffles-cheddar-38-sour-cream-potato-chips-2-5oz/-/A-14930847#lnk=sametab"
driver.get(url)
r = driver.page_source
soup = BeautifulSoup(r, "lxml")
price = soup.find("div", class_="web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price.text)
Output:
$1.99
You are simply using the wrong locator.
Try this:
price_css_locator = 'div[data-test=product-price]'
or in XPath style
price_xpath_locator = '//div[@data-test="product-price"]'
With bs4 it should be something like this:
soup.select('div[data-test="product-price"]')
Note that select() returns a list of matches; to get a single element's text, use select_one() and add .text:
price = soup.select_one('div[data-test="product-price"]').text
print(price)
Use .text:
price = soup.find("div", class_="web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price.text)
Note that this only works on rendered page source (e.g. from Selenium); with plain requests, find() returns None and .text raises an AttributeError.
For context, I am pretty new to Python. I am trying to use bs4 to parse some data out of https://bigfuture.collegeboard.org/college-university-search/university-of-california-los-angeles
To be exact, I want to obtain the 57% number in the "paying" section of the webpage.
My problem is that bs4 will only return the first layer of the HTML, while the data I want is deeply nested in the code. I think it's under 17 divs.
Here is my Python code:
import requests
import bs4
url = 'https://bigfuture.collegeboard.org/college-university-search/university-of-california-los-angeles'
res = requests.get(url)
soup = bs4.BeautifulSoup(res.text, "html.parser")
print(soup.find_all("div", {"id": "gwtDiv"}))
(This returns [<div class="clearfix margin60 marginBottomOnly" id="gwtDiv" style="min-height: 300px;height: 300px;height: auto;"></div>] None of the elements inside it are shown.)
If the page uses JS to render content inside the element, then requests will not be able to get that content, since it is rendered client-side in the browser. I'd recommend using ChromeDriver and Selenium along with BeautifulSoup.
You can download the Chrome driver from here: https://chromedriver.chromium.org/
Put it in the same folder in which you're running your program.
Try something like this:
from selenium import webdriver
from bs4 import BeautifulSoup
url = 'https://bigfuture.collegeboard.org/college-university-search/university-of-california-los-angeles'
driver = webdriver.Chrome()
driver.get(url)
html = driver.execute_script("return document.documentElement.outerHTML")
sel_soup = BeautifulSoup(html, 'html.parser')
print(sel_soup.find_all("div", {"id": "gwtDiv"}))
I know that this code works for other websites that end in .com.
However, I noticed that the code doesn't work if I try to parse websites that end in .kr.
Can somebody help me find why this is happening, and an alternate solution to parse these types of websites?
Following is my code.
import requests
from bs4 import BeautifulSoup
URL = 'https://everytime.kr/#nN4K1XC0weHnnM9VB5Qe'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='container')
print(results)
The URL here is a link to my timetable. I need to parse this website so that I can easily collect the information for the subjects and data relevant to the subject (duration, location, professor's name, etc.).
Thanks
The website is serving dynamic content, and you get an empty response back; you may use Selenium.
Example
from selenium import webdriver
from bs4 import BeautifulSoup
import time
driver = webdriver.Chrome(executable_path=r'C:\Program Files\ChromeDriver\chromedriver.exe')
url = 'https://everytime.kr/#nN4K1XC0weHnnM9VB5Qe'
driver.get(url)
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
results = soup.find(id='container')
print(results)
driver.close()
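Instead of a fixed time.sleep(5), you can wait explicitly for the container element; a minimal sketch using Selenium's WebDriverWait with the same URL and driver path as above:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

driver = webdriver.Chrome(executable_path=r'C:\Program Files\ChromeDriver\chromedriver.exe')
driver.get('https://everytime.kr/#nN4K1XC0weHnnM9VB5Qe')

# wait up to 10 seconds for the element with id="container" to be present
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'container')))

soup = BeautifulSoup(driver.page_source, 'html.parser')
print(soup.find(id='container'))
driver.close()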
From the URL in the code, I am ultimately trying to gather all of the players' names from the page. However, when I use .findAll to get all of the list elements, I have yet to be successful. Please advise.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
players_url = 'https://stats.nba.com/players/list/?Historic=Y'
# Opening up the Connection and grabbing the page
uClient = uReq(players_url)
page_html = uClient.read()
players_soup = soup(page_html, "html.parser")
# Taking all of the elements from the unordered list that contains all of the players.
list_elements = players_soup.findAll('li', {'class': 'players-list__name'})
As @Oluwafemi Sule suggested, it is better to use Selenium together with BS:
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Firefox()
driver.get('https://stats.nba.com/players/list/?Historic=Y')
soup = BeautifulSoup(driver.page_source, 'lxml')
for div in soup.findAll('li', {'class': 'players-list__name'}):
    print(div.find('a').contents[0])
Output:
Abdelnaby, Alaa
Abdul-Aziz, Zaid
Abdul-Jabbar, Kareem
Abdul-Rauf, Mahmoud
Abdul-Wahad, Tariq
etc.
You can do this with requests alone by pulling directly from the JS script which provides the names.
import requests
import json

r = requests.get('https://stats.nba.com/js/data/ptsd/stats_ptsd.js')
# strip the JS variable assignment and trailing semicolon to leave pure JSON
s = r.text.replace('var stats_ptsd = ', '').replace('};', '}')
data = json.loads(s)['data']['players']
players = [item[1] for item in data]
print(players)
As @Oluwafemi Sule mentioned in the comment:
The list of players generated in the page is done with javascript.
Instead of using Selenium, I recommend the requests-html package, created by the author of the very popular requests library. It uses Chromium under the hood to render JavaScript content.
from requests_html import HTMLSession
session = HTMLSession()
r = session.get('https://stats.nba.com/players/list/?Historic=Y')
r.html.render()
for anchor in r.html.find('.players-list__name > a'):
    print(anchor.text)
Output:
Abdelnaby, Alaa
Abdul-Aziz, Zaid
Abdul-Jabbar, Kareem
Abdul-Rauf, Mahmoud
Abdul-Wahad, Tariq
...