Extracting HTML Table data using Beautiful Soup - python

I'm looking to extract all of the brands from this page using Beautiful Soup. My program so far is:
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
def main():
    opts = Options()
    opts.headless = True
    assert opts.headless  # Operating in headless mode
    browser = Firefox(options=opts)
    browser.get('https://neighborhoodgoods.com/pages/brands')
    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')
    brand = []
    for tag in soup.find('table'):
        brand.append(tag.contents.text)
    print(brand)
    browser.close()
    print('This program is terminated.')
I'm struggling with figuring out the right tag to use as the data is nested in tr/td. Any advice? Thanks so much!

If I understand your question correctly, you only want to get the company name (the first <td> of each table row).
Try using the CSS selector td:nth-of-type(1), which selects the first <td> within every row.
import requests
from bs4 import BeautifulSoup
URL = "https://neighborhoodgoods.com/pages/brands"
soup = BeautifulSoup(requests.get(URL).content, "html.parser")
print([tag.text for tag in soup.select("td:nth-of-type(1)")])
Output:
['A.N Other', 'Act + Acre', ..., 'Wild One']
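If you'd rather keep the Selenium-based approach from the question, here is a minimal sketch that iterates the table rows and takes the first cell of each (assuming the brands are still rendered as a single table, one brand per row):
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

opts = Options()
opts.add_argument("--headless")  # same effect as opts.headless = True in the question
browser = Firefox(options=opts)
browser.get('https://neighborhoodgoods.com/pages/brands')
soup = BeautifulSoup(browser.page_source, 'html.parser')
browser.close()

brands = []
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if cells:  # skip header rows that contain no <td> cells
        brands.append(cells[0].get_text(strip=True))
print(brands)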

Related

Web Scraping with Beautiful Soup

To start, this is my first time using Stack Overflow!
I started my journey with Python yesterday and I'm trying to extract values from some pages automatically.
This is my code:
import requests
from bs4 import BeautifulSoup
url = 'https://www.jpg.store/collection/chilledkongs'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
div = soup.find('div', class_ = 'stat-title')
print(div)
I'm getting nothing, and my objective is to get the floor price. At the moment it's 888.
The floor price is loaded via JavaScript from an external source. To get it via requests, use the following example:
import json
import requests
from bs4 import BeautifulSoup
url = "https://www.jpg.store/collection/chilledkongs"
api_url = "https://server.jpgstoreapis.com/collection/{}/floor"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
data = soup.select_one("#__NEXT_DATA__").contents[0]
data = json.loads(data)
policy_id = data["props"]["pageProps"]["collection"]["policy_id"]
data = requests.get(api_url.format(policy_id)).json()
print(data["floor"] / 1_000_000)
Prints:
888.0
As @Bao Huynh Lamn stated, the website is being dynamically generated/rendered using JavaScript. So you can use an automation tool like Selenium.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://www.jpg.store/collection/chilledkongs'
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.close()
for div in soup.find_all('div', class_='stat-title')[-2]:
    print(div.text)
Output:
888

Python - Item Price Web Scraping for Target

I'm trying to get any item's price from the Target website. I did some examples for this website using Selenium and the Redsky API, but now I've tried to write the bs4 code below:
import requests
from bs4 import BeautifulSoup
url = "https://www.target.com/p/ruffles-cheddar-38-sour-cream-potato-chips-2-5oz/-/A-14930847#lnk=sametab"
r= requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
price = soup.find("div",class_= "web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price)
But it returns None.
I tried soup.find("div",{'class': "web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp"})
What am I missing?
I can accept any Selenium or Redsky API code, but my priority is bs4.
The page is dynamic; the data is rendered after the initial request is made. You can use Selenium to load the page and, once it's rendered, pull out the relevant tag. An API, though, is always the preferred way to go if one is available.
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
# If you don't want to open a browser, comment out the line above and uncomment below
#options = webdriver.ChromeOptions()
#options.add_argument('headless')
#driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe', options=options)
url = "https://www.target.com/p/ruffles-cheddar-38-sour-cream-potato-chips-2-5oz/-/A-14930847#lnk=sametab"
driver.get(url)
r = driver.page_source
soup = BeautifulSoup(r, "lxml")
price = soup.find("div",class_= "web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price.text)
Output:
$1.99
You are simply using the wrong locator.
Try this
price_css_locator = 'div[data-test=product-price]'
or in XPath style
price_xpath_locator = '//div[@data-test="product-price"]'
With bs4 it should be something like this:
soup.select('div[data-test="product-price"]')
To get the element's text you just need to add .text:
price = soup.select_one('div[data-test="product-price"]').text
print(price)
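If you are already driving the page with Selenium (as in the answer above), the same data-test locator can also be used directly on the driver. A minimal sketch, assuming driver is a Selenium WebDriver that already has the product page loaded:
from selenium.webdriver.common.by import By

# CSS form of the locator
price_css = driver.find_element(By.CSS_SELECTOR, 'div[data-test="product-price"]').text
# equivalent XPath form
price_xpath = driver.find_element(By.XPATH, '//div[@data-test="product-price"]').text
print(price_css, price_xpath)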
Use .text:
price = soup.find("div",class_= "web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price.text)

BeautifulSoup find() returns "None" with any name/attributes

I'm trying to get some information about a product I'm interested in on Amazon.
I'm using the BeautifulSoup library for web scraping:
import requests
from bs4 import BeautifulSoup

# headers is assumed to be defined elsewhere (e.g. a dict with a User-Agent string)
URL = 'https://www.amazon.it/gp/offer-listing/B08KHL2J5X/ref=dp_olp_unknown_mbc'
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content,'html.parser')
title = soup.find('span',class_='a-size-large a-color-price olpOfferPrice a-text-bold')
print(title)
In the picture, the highlighted row is the one I want to select, but when I run my script I get None every time. (Printing the entire output after the BeautifulSoup call gives me the whole HTML source, so I'm using the right URL.)
Any solutions?
You need to use .text to get the text of an element.
so change:
print(title)
to:
print(title.text)
Output:
EUR 1.153,00
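Note that if find() does not match anything (which is what the question reports), title will be None and .text will raise an AttributeError. A small guard makes that failure explicit:
title = soup.find('span', class_='a-size-large a-color-price olpOfferPrice a-text-bold')
if title is not None:
    print(title.text.strip())
else:
    print('Price element not found; the page may be rendered by JavaScript or served differently to scripts')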
I wouldn't use BS alone in this case. You can easily add Selenium to scrape the website:
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium import webdriver
url = 'https://www.amazon.it/gp/offer-listing/B08KHL2J5X/ref=dp_olp_unknown_mbc'
driver = webdriver.Safari()
driver.get(url)
html_content = driver.page_source
soup = BeautifulSoup(html_content, "html.parser")
title = soup.find('span',class_='a-size-large a-color-price olpOfferPrice a-text-bold')
print(title)
If you can't use Safari, you have to download the webdriver for Chrome, Firefox, etc., but there is plenty of reading material on this topic.
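The answer above already imports FirefoxOptions but never uses it; if you go the Firefox route instead of Safari, a minimal sketch (assuming geckodriver is installed and on your PATH) could look like this:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions

options = FirefoxOptions()
options.add_argument('--headless')  # run without opening a browser window
driver = webdriver.Firefox(options=options)
driver.get('https://www.amazon.it/gp/offer-listing/B08KHL2J5X/ref=dp_olp_unknown_mbc')
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

title = soup.find('span', class_='a-size-large a-color-price olpOfferPrice a-text-bold')
print(title.text.strip() if title else 'not found')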

Output None with the correct tags while scraping with BeautifulSoup

I am trying to web scrape from Zalora for 3 things:
1. item brand
2. item name
3. item price(old)
Below is my initial attempt:
from bs4 import BeautifulSoup
import requests
def make_soup(url):
    html = requests.get(url)
    bsObj = BeautifulSoup(html.text, 'html.parser')
    return bsObj
soup = make_soup('https://www.zalora.com.hk/men/clothing/shirt/?gender=men&dir=desc&sort=popularity&category_id=31&enable_visual_sort=1')
itemBrand = soup.find("span",{"class":"b-catalogList__itmBrand fsm txtDark uc js-catalogProductTitle"})
itemName = soup.find("em",{"class":"b-catalogList__itmTitle fss"})
itemPrice = soup.find("span",{"class":"b-catalogList__itmPrice old"})
print(itemBrand, itemName, itemPrice)
Output:
None None None
Then I do further investigation:
productsCatalog = soup.find("ul",{"id":"productsCatalog"})
print(productsCatalog)
Output:
<ul class="b-catalogList__wrapper clearfix" id="productsCatalog">
This is the weird thing that puzzles me: there should be many tags nested within the ul tag (the 3 things I need are inside those nested tags), so why are they not showing up?
As a matter of fact, everything I try to scrape with BeautifulSoup within the ul tag gives an output of None.
Since this content is rendered by JavaScript, you can't access it using the requests module. You should use selenium to automate your browser and then use BeautifulSoup to parse the actual html.
This is how you do it using selenium with chromedriver:
from selenium import webdriver
from bs4 import BeautifulSoup
chrome_driver = "path\\to\\chromedriver.exe"
driver = webdriver.Chrome(executable_path=chrome_driver)
target = 'https://www.zalora.com.hk/men/clothing/shirt/?gender=men&dir=desc&sort=popularity&category_id=31&enable_visual_sort=1'
driver.get(target)
soup = BeautifulSoup(driver.page_source, "lxml")
print(soup.find("span",{"class":"b-catalogList__itmBrand fsm txtDark uc js-catalogProductTitle"}).get_text().strip())
print(soup.find("span", {'class': 'b-catalogList__itmPrice old'}).get_text().strip())
print(soup.find("em",{"class":"b-catalogList__itmTitle fss"}).get_text().strip())
Output:
JAXON
HK$ 149.00
EMBROIDERY SHORT SLEEVE SHIRT
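The find() calls above return only the first product on the page. If you want every product, a sketch using the same class names could collect the three lists and zip them together (this assumes each product exposes all three tags; items without an old price would shift the alignment, in which case iterating per-product containers is safer):
brands = [t.get_text(strip=True) for t in soup.find_all("span", {"class": "b-catalogList__itmBrand fsm txtDark uc js-catalogProductTitle"})]
names = [t.get_text(strip=True) for t in soup.find_all("em", {"class": "b-catalogList__itmTitle fss"})]
prices = [t.get_text(strip=True) for t in soup.find_all("span", {"class": "b-catalogList__itmPrice old"})]

for brand, name, price in zip(brands, names, prices):
    print(brand, name, price)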

Python scraping href links

My goal is to scrape href links on the base_url site.
My code:
from bs4 import BeautifulSoup
from selenium import webdriver
import requests, csv, re
game_links = []
link_pages = []
base_url = "http://www.basket.fi/sarjat/ohjelma_tulokset/?season_id=93783&league_id=4#mbt:2-303$f&stage=177155:$p&0="
browser = webdriver.PhantomJS()
browser.get(base_url)
table = BeautifulSoup(browser.page_source, 'lxml')
for game in table.find_all("a", {'game_id': re.compile(r'\d+')}):
    href = game.get("href")
    print(href)
Result:
http://www.basket.fi/sarjat/ottelu/?game_id=3502579&season_id=93783&league_id=4
http://www.basket.fi/sarjat/ottelu/?game_id=3502579&season_id=93783&league_id=4
http://www.basket.fi/sarjat/ottelu/?game_id=3502523&season_id=93783&league_id=4
http://www.basket.fi/sarjat/ottelu/?game_id=3502523&season_id=93783&league_id=4
......
The problem is that I can't understand why the href links always show up twice in the result.
As you can see in the image, there are two links with the same game_id.
Modified code:
This should help you to get only one link per game:
for game in table.find_all("a", {'game_id': re.compile(r'\d+')}):
    if game.contents:  # keep only the anchors that actually wrap content
        href = game.get("href")
        print(href)
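If the duplicated anchors are otherwise identical, another option is simply to de-duplicate the collected hrefs while keeping their order; a minimal sketch:
seen = set()
for game in table.find_all("a", {'game_id': re.compile(r'\d+')}):
    href = game.get("href")
    if href not in seen:  # skip hrefs that were already printed
        seen.add(href)
        print(href)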
