Beautifulsoup can not find table containing specific class

Beautifulsoup can not find table containing specific class - python

from bs4 import BeautifulSoup
import requests
url="https://www.calculator.net/currency-calculator.html"
# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text
# Parse the html content
soup = BeautifulSoup(html_content,'html5lib')
print(soup.prettify()) # print the parsed data of html
conv_table = soup.find("table", attrs={"class":"cinfoT "})
conv_data = gdp_table.tbody.find_all("tr")
I have written the above script to get the table listed on this particular website.
when i run the same conv_table comes as None type object.
If you visit the website, basically i want to extract the 2nd table bigger table and its class name contains "cinfoT ". Also i have checked that there are some blank spaces in the class name.
Please help me out.
Thanks in advance.

It is because this data is loaded by javascript. Try selenium. requests will give you plain html file
from selenium import webdriver
from bs4 import BeautifulSoup
DRIVER_PATH="Your selenium chrome driver path"
url = 'https://www.calculator.net/currency-calculator.html'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.get(url)
html_soup = BeautifulSoup(driver.page_source, 'html.parser')
table = html_soup.find_all('table', class_ = 'cinfoT ')
driver.quit()
print(table[0].tbody)

Related

Extracting the titles of the websites mentioned in the link

https://www.g2.com/categories/marketing-automation
I am trying webscrap the above link that has list of 350+ websites i need to extract the title of the websites mentioned
But I am failing to get any results i have tried with using requests and beautiful soup
then with selenium and all i am getting is empty list "[]" or none
import requests
from bs4 import BeautifulSoup
# Send a GET request to the URL and parse the HTML content
url = 'https://www.g2.com/categories/marketing-automation'
response = requests.get(url).text
soup = BeautifulSoup(response, 'html.parser')
name = soup.find(class_ = "product-card__product-name")
print(name)
This above code is just a test code to check if the data is being pulled or not and the response is 'None'
From this code i am expecting to see the results of the class mentioned upon calling print

I kind of got this code to get something. Im still working on it.
from selenium import webdriver
from selenium.webdriver.common.by import By
# Create a new instance of the Chrome driver
driver = webdriver.Chrome()
# Navigate to the webpage
driver.get('https://www.g2.com/categories/marketing-automation')
# Wait for the page to load
driver.implicitly_wait(10)
# Find all the product cards on the page
product_cards = driver.find_elements(By.CLASS_NAME, 'product-card__product-name')
# Iterate over the product cards and extract the title from each one
for product_card in product_cards:
title = product_card.text
print(title)
# Close the browser
driver.quit()

Python - Item Price Web Scraping for Target

I'm trying to get any item's price from Target website. I did some examples for this website using selenium and Redsky API but now I tried to wrote bs4 code below:
import requests
from bs4 import BeautifulSoup
url = "https://www.target.com/p/ruffles-cheddar-38-sour-cream-potato-chips-2-5oz/-/A-14930847#lnk=sametab"
r= requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
price = soup.find("div",class_= "web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price)
But it returns me None .
I tried soup.find("div",{'class': "web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp"})
What am I missing?
I can accept any selenium code or Redsky API code but my priority is bs4

The page is dynamic. The data is rendered after the initial request is made. You can use selenium to load the page, and once it's rendered, then you can pull out the relevant tag. API though is always the preferred way to go if it's available.
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
# If you don't want to open a browser, comment out the line above and uncomment below
#options = webdriver.ChromeOptions()
#options.add_argument('headless')
#driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe', options=options)
url = "https://www.target.com/p/ruffles-cheddar-38-sour-cream-potato-chips-2-5oz/-/A-14930847#lnk=sametab"
driver.get(url)
r = driver.page_source
soup = BeautifulSoup(r, "lxml")
price = soup.find("div",class_= "web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price.text)
Output:
$1.99

You are simply using wrong locator.
Try this
price_css_locator = 'div[data-test=product-price]'
or in XPath style
price_xpath_locator = '//div[#data-test="product-price"]'
With bs4 it should be something like this:
soup.select('div[data-test="product-price"]')
to get the element get you just need to add .text
price = soup.select('div[data-test="product-price"]').text
print(price)

use .text
price = soup.find("div",class_= "web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price.text)

Can't identify a table when scraping

Beginner question.. I'm attempting to scrape data from a table but I can't seem to recognize it, I've tried using the class and the id to identify it but my result is 0. The code and output are below.
# Import necessary packages
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
# Site URL
url="https://fbref.com/en/comps/9/stats/Premier-League-Stats"
# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text
# Parse HTML code for the entire site
soup = BeautifulSoup(html_content, "lxml")
#print(soup.prettify()) # print the parsed data of html
gdp = soup.find_all("table", attrs={"id": "stats_standard"})
print("Number of tables on site: ",len(gdp))
Output - 'Number of tables on site: 0'

I suggest you to use selenium for such scraping, its performance is very reliable.
This code will work for you:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
option = Options()
option.add_argument('--headless')
url = 'https://fbref.com/en/comps/9/stats/Premier-League-Stats'
driver = webdriver.Chrome(options=option)
driver.get(url)
bs = BeautifulSoup(driver.page_source, 'html.parser')
gdp = bs.find_all('table', {'id': 'stats_standard'})
driver.quit()
print("Number of tables on site: ",len(gdp))
Output
Number of tables on site: 1

Can you find the table(s) without using attrs={"id": "stats_standard"}?
I have checked and indeed I cannot find any table whose ID is stats_standard (but there is one with ID stats_standard_sh, for example). So I guess you might be using the wrong ID.

HTML parsing with BeautifulSoup in Python unknown error

I know that this code works for other websites that end in .com
However I noticed that the code doesn't work if I try to parse websites that end in .kr
Can somebody help to find why this is happening and an alternate solution to parse these types of websites?
Following is my code.
import requests
from bs4 import BeautifulSoup
URL = 'https://everytime.kr/#nN4K1XC0weHnnM9VB5Qe'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='container')
print(results)
The URL here is a link to my timetable. I need to parse this website so that I can easily collect the information for the subjects and data relevant to the subject (duration, location, professor's name, etc.).
Thanks

Website is serving dynamic content and you get an empty response back - you may use selenium.
Example
from selenium import webdriver
from bs4 import BeautifulSoup
import time
driver = webdriver.Chrome(executable_path=r'C:\Program Files\ChromeDriver\chromedriver.exe')
url = 'https://everytime.kr/#nN4K1XC0weHnnM9VB5Qe'
driver.get(url)
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
results = soup.find(id='container')
print(results)
driver.close()

Not able to scrape specific information off BSE website using bs4

I'm trying to scrape the previous close and open stock price from this website. Here's an image as a reference to where the information to scrape is located.
It looks like the particular table is a child of a div tag with class="col-lg-13", but bs4 just returns None on all attempts to find it.
I've tried the following:
from bs4 import BeautifulSoup
import requests
link = "https://bseindia.com/stock-share-price/bharat-gears-ltd/bharatgear/505688/"
resp = requests.get(link).content
soup = BeautifulSoup(resp, "lxml")
box = soup.find('div', class_="col-lg-13")
table = box.find('table')
print(table)
>>> None
I've also tried:
container = soup.find('div', attr={'ng-init': "fnStockTrading()"})
tables = container.find_all('table')
print(tables)
>>> []

Use the same url (the API) the page is using for the data. This can be found in the network tab
import requests
r = requests.get('https://api.bseindia.com/BseIndiaAPI/api/getScripHeaderData/w?Debtflag=&scripcode=505688&seriesid=').json()
prev_close = r['Header']['PrevClose']
prev_open = r['Header']['Open']
print(prev_close, prev_open)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Beautifulsoup can not find table containing specific class - python

Related

Extracting the titles of the websites mentioned in the link

Python - Item Price Web Scraping for Target

Can't identify a table when scraping

HTML parsing with BeautifulSoup in Python unknown error

Not able to scrape specific information off BSE website using bs4

Categories

Resources