Scrape a specific div value with BeautifulSoup in nested divs - Python

I'm currently trying to scrape a value from this specific website for a school project: https://data.census.gov/cedsci/table?q=53706%20income&tid=ACSST5Y2020.S1901
It's the first value below if you search for Median income (dollars), which should be the median income of the area; the comp-id keeps changing for some reason.
This median income estimate is what I'm looking for.
I tried several methods on the site to walk over the nested divs, but I'm not able to get any results after running them. Below is the code I tried to use, but it just keeps returning nothing.
Any help will be appreciated, thanks!
import csv
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
from bs4 import BeautifulSoup
DRIVER_PATH = 'chromedriver_107.exe'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
url = 'https://data.census.gov/cedsci/table?q=' + '53706' + '%20income&tid=ACSST5Y2020.S1901'
driver.get(url)
page = requests.get(url)
content = driver.page_source
soup = BeautifulSoup(content, 'lxml')
a = soup.findAll("div", {"comp-id":"1539"})
print(a)

Try with this:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#set up Chrome driver
options=webdriver.ChromeOptions()
#Define web driver as a Chrome driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://data.census.gov/cedsci/table?q=53703%20income&tid=ACSST5Y2020.S1901'
driver.get(url)
# We print the label of row 11 (Which is the median)
label = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "(//div[@row-id='11'])[1]")))
print(label.text)
# We print the values of row 11 (Which is the median)
values = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "(//div[@row-id='11'])[2]")))
print(values.text)
Output:
Median income (dollars)
42,153
±3,200
114,643
±28,572
139,694
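If you want to stay with the BeautifulSoup parsing from the question, you can wait for the table to render first and then parse the page source. Here is a minimal sketch, assuming the rows still carry the row-id attribute used in the XPath above (the markup on this page is dynamic and attribute names may change):
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://data.census.gov/cedsci/table?q=53706%20income&tid=ACSST5Y2020.S1901')

# Wait until the median row has rendered, then hand the HTML to BeautifulSoup
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.XPATH, "(//div[@row-id='11'])[2]")))
soup = BeautifulSoup(driver.page_source, 'lxml')

# Grab every cell of the median row instead of filtering on the ever-changing comp-id
cells = soup.find_all("div", attrs={"row-id": "11"})
print([cell.get_text(" ", strip=True) for cell in cells])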

Get the total number of reviews of a permanently closed place on Google Maps without the API

I am learning Python as well as web scraping, and I want to get the number of reviews from Google Maps for a permanently closed restaurant, but I cannot do it. Would you please help? Thank you.
from bs4 import BeautifulSoup
import requests

url = 'https://www.google.com/maps?q=asia+halal+restaurant+aichi+japan+open+date&safe=strict&rlz=1C1GCEA_enID892ID892&sxsrf=ALeKk01NqaBLM8bXeVVS6M6tv9kAy0G6qQ:1616997971678&gs_lcp=Cgdnd3Mtd2l6EAM6BwgjELADECc6BQghEKABOgQIIRAVOgcIIRAKEKABUIUIWKojYOckaABwAHgAgAHHAogB7RGSAQcxLjUuNC4ymAEAoAEBqgEHZ3dzLXdpesgBAcABAQ&uact=5&um=1&ie=UTF-8&sa=X&ved=2ahUKEwjbhef-7NTvAhWa93MBHaFHCzYQ_AUoAXoECAEQAw'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
ps = soup.find_all(string='クチコミ')
ps
I also tried find with 'class' and with 'span aria-label', based on the Chrome developer tools shown below, but I still cannot do it.
(screenshot of the HTML class in the Chrome developer tools)
#ps = soup.find_all(class_='h0ySl-wcwwM-E70qVe-list')
#ps = soup.find_all('span aria-label')
#total_rev = ps.get_text()
#total_rev
Here is the code that I tried using Selenium:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
driver = webdriver.Chrome('F:/Download/SW/chromedriver_win32/chromedriver.exe')
url = 'https://www.google.com/maps/place/%E3%82%A2%E3%83%83%E3%83%90%E3%82%B7+%E3%82%B9%E3%82%A4%E3%83%BC%E3%83%84/#35.0903185,136.8551766,17z/data=!3m1!4b1!4m5!3m4!1s0x600378381c4bb1f7:0x8e9d356b9ded5bcc!8m2!3d35.0903185!4d136.8573653'
driver.get(url)
I have tried to get the number of reviews using this code on a "still operating" restaurant, but when it comes to a permanently closed one I cannot get the number of reviews:
span_review = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "section-star")]'))).click()
# Find the total number of reviews
total_number_of_reviews = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[2]/div[2]/div/div[2]/div[2]').text.split(" ")[0]
total_number_of_reviews = int(total_number_of_reviews.replace(',', '')) if ',' in total_number_of_reviews else int(total_number_of_reviews)
# Find scroll layout
total_reviews = driver.find_element_by_class_name("h0ySl-wcwwM-E70qVe-list")
total_reviews  # = driver.get('aria-label')
total_reviews = total_reviews.get_text('aria-label')
total_reviews
total_number_of_reviews = total_reviews.text[0:]
total_number_of_reviews
Hopefully I can learn from this. Thanks!
I can't find your XPath in the HTML. There is no <button> with the text section-star, only <li class="section-star">.
Also, aria-label is not text but an attribute, so you have to use .get_attribute('aria-label').
But I found another XPath, //button[@jsaction="pane.rating.moreReviews"], and it works for me for both permanently closed and still-operating places.
Tested on Firefox and Chrome, Linux.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
driver = webdriver.Chrome('F:/Download/SW/chromedriver_win32/chromedriver.exe')
#driver = webdriver.Chrome()
#driver = webdriver.Firefox()
all_urls = [
    # permanently closed
    'https://www.google.com/maps/place/%E3%82%A2%E3%83%83%E3%83%90%E3%82%B7+%E3%82%B9%E3%82%A4%E3%83%BC%E3%83%84/@35.0903185,136.8551766,17z/data=!3m1!4b1!4m5!3m4!1s0x600378381c4bb1f7:0x8e9d356b9ded5bcc!8m2!3d35.0903185!4d136.8573653',
    # still operating
    'https://www.google.com/maps/place/Seaside+Restaurant+Higashiyama+Garden+-+Port+Bldg./@35.0841323,136.8474088,14z/data=!3m1!5s0x6003790a61e056e7:0x7f307de064680a96!4m9!1m2!2m1!1srestaurants!3m5!1s0x600379a07cd9fcc7:0x89f84cc9f0422e30!8m2!3d35.0895485!4d136.8809243!15sCgtyZXN0YXVyYW50c1oNIgtyZXN0YXVyYW50c5IBCnJlc3RhdXJhbnQ',
]

for url in all_urls:
    driver.get(url)
    total_reviews = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//button[@jsaction="pane.rating.moreReviews"]')))
    total_reviews = total_reviews.get_attribute('aria-label')
    print(total_reviews)
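If you want just the numeric count, you can pull the digits out of the aria-label string. A minimal sketch, assuming the label contains the review count as its first number (the exact wording of the label depends on the locale):
import re

# total_reviews is the string returned by get_attribute('aria-label') above
match = re.search(r'[\d,]+', total_reviews or '')
review_count = int(match.group().replace(',', '')) if match else None
print(review_count)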

While using beautifulsoup4 and Selenium, output a None value if an element is absent from the page

Good time of the day,
I am currently working on a scraping project whose end goal is to create a DataFrame.
While navigating from page to page, I have to gather different criteria. If a criterion is not present on the page, I would like to receive a "None" instead.
import re
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
import time
import random
from bs4 import BeautifulSoup
start_time = time.time()
url='https://www.immoweb.be/en/search/house/for-sale?countries=BE&page=1&orderBy=relevance'
driver = webdriver.Chrome()
driver.implicitly_wait(30)
driver.get(url)
time.sleep(random.uniform(1.0, 3.0))
python_button = driver.find_elements_by_xpath('//*[#id="uc-btn-accept-banner"]')[0]
python_button.click()
time.sleep(random.uniform(1.0, 3.0))
python_button = driver.find_elements_by_xpath('//*[#id="classified_9312278"]')[0]
python_button.click()
soup = BeautifulSoup(driver.page_source)
area = list()
for i in range(15):
    python_button = driver.find_elements_by_xpath('//*[@id="classifiedNavigation"]/ul/li[2]/a')[0]
    python_button.click()
    time.sleep(random.uniform(1.0, 3.0))
    soup = BeautifulSoup(driver.page_source)
    try:
        for table in soup.findAll("th", text=re.compile("Living area")):
            if table:
                area.append(table.find_next("td").next_element.strip())
            else:
                area.append(None)
    except:
        area.append(None)

houses = {"Area": area}
print(houses)
However, with the current code only existing values are appended to the list; whatever is not found does not even leave a blank.
And here is a link to the search
Thank you very much in advance!
It is pretty much obvious to me now:
if soup.findAll("th", text=re.compile("Living area")):
    for table in soup.findAll("th", text=re.compile("Living area")):
        area.append(table.find_next("td").next_element.strip())
else:
    area.append(None)
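Another option is to wrap the lookup in a small helper that returns None when the row is missing. A minimal sketch, assuming the same "Living area" table layout as above (the helper name is made up for illustration):
import re

def get_living_area(soup):
    """Return the 'Living area' value from the parsed page, or None if it is absent."""
    th = soup.find("th", text=re.compile("Living area"))  # same locator as above
    if th is None:
        return None
    return th.find_next("td").next_element.strip()

# Inside the page loop:
# area.append(get_living_area(soup))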

This code for web scraping is not working and gives an empty list for td tags (for other tags it works fine); how do I use an index to get the 4th td tag with this class?

Please help me figure out how to scrape the entity type from this page. This code for web scraping is not working and gives an empty list for td tags, while for other tags it works fine. Also, how do I use an index? I want to get the 7th td tag with this class.
INPUT:
import bs4 as bs
import requests as req
import selenium
from selenium import webdriver
driver = webdriver.Chrome()
url= "https://portal.unifiedpatents.com/litigation/caselist?case_no=1%3A18-CV-01956"
#driver.maximize_window()
driver.get(url)
content = driver.page_source.encode('utf-8').strip()
soup = bs.BeautifulSoup(content,"html.parser")
a=soup.find_all("td",{"class":"ant-table-row-cell-break-word"})
print(a)
driver.quit()
OUTPUT: "C:\Users\Lumenci 3\PycharmProjects\untitled6\venv\Scripts\python.exe" "C:/Users/Lumenci 3/.PyCharmCE2019.3/config/scratches/scratch_2.py"
[]
Process finished with exit code 0
You can just use Selenium without bs4.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
url= "https://portal.unifiedpatents.com/litigation/caselist?case_no=1%3A18-CV-01956"
driver.get(url)
elements = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'td.ant-table-row-cell-break-word')))
print([element.text for element in elements])
driver.quit()
Output:
['1:18-cv-01956', '2018-12-11', 'Open', 'Delaware District Court', 'Axcess International, Inc.', 'Lenel Systems International, Inc.', 'Infringement', 'NPE (Individual)', 'High-Tech']
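If you also need the entity type by index, you can pick it out of the list of cells. A minimal sketch, assuming the column order matches the output above (there the entity type is the 8th cell, i.e. index 7 with zero-based indexing):
# elements is the list returned by WebDriverWait(...).until(...) above
cells = [element.text for element in elements]
entity_type = cells[7]  # zero-based index 7 -> 'NPE (Individual)' in the output above
print(entity_type)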

Python in Selenium/BeautifulSoup

I'm trying to extract real estate listing info from a site using Selenium and Beautiful Soup, following this tutorial: https://medium.com/@ben.sturm/scraping-house-listing-data-using-selenium-and-beautiful-soup-1cbb94ba9492
The aim is to gather all the href links from the first page, then find the 'next page' button, navigate to the next page, collect all the links on that page, and so on.
I tried to achieve this with a single function that repeats for each page, but I can't figure out why it's not working. I'm new to coding and this seems too trivial a thing not to find an answer for. I would appreciate any help.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import sys
import numpy as np
import pandas as pd
import regex as re
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(url)

try:
    wait = WebDriverWait(driver, 3)
    wait.until(EC.presence_of_element_located((By.ID, "body1")))
    print("Page is Ready!")
except TimeoutException:
    print("page took too long to load")

def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        page_data = [row['href'] for row in listings]
        house_links.append(page_data)
        time.sleep(np.random.lognormal(0, 1))
        next_button = soup.find_all("a", class_="pageingBlock darkBorder")
        next_button_link = ['http://property.shw.co.uk' + row['href'] for row in next_button]
        if i < 3:
            driver.get(next_button_link[0])
    return house_links

get_house_links(url, driver)
class_="pageingBlock darkBorder" match the previous page button as well, so next_button_link[0] send you back to previous page. You need more precise locator
next_button = soup.select('img[src*="propNext"]')
if next_button:
    next_button = next_button[0].find_parent('a')
    next_button_link = 'http://property.shw.co.uk' + next_button['href']
    driver.get(next_button_link)
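For context, here is how that locator could slot into the page loop from the question. A minimal sketch under the same assumptions (the class names and the propNext image come from the answer above and may change on the site):
def get_house_links(url, driver, pages=3):
    """Collect listing hrefs page by page, following the 'next' arrow image."""
    house_links = []
    driver.get(url)
    for _ in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        house_links.append([row['href'] for row in listings])
        next_button = soup.select('img[src*="propNext"]')  # the 'next page' arrow image
        if not next_button:
            break  # no further pages
        next_link = 'http://property.shw.co.uk' + next_button[0].find_parent('a')['href']
        driver.get(next_link)
    return house_links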

Python crawler finds desired tags in only first few divs

I am trying to scrape some images from a shopping site (https://www.grailed.com/shop/EkpEBRw4rw), but I am having some trouble with it since the listings update as you scroll. I am trying to get the image source in the HTML tag below:
Now the code I have been using is shown below:
from bs4 import BeautifulSoup
from selenium import webdriver
url = 'https://www.grailed.com/shop/EkpEBRw4rw'
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
listing = soup.select('.listing-cover-photo ')
for item in listing:
    print(item.select('img'))
The problem is that although it finds the listing container for every listing, it can only find the img tag for the first 6 listings. The output from my code is shown below:
OUTPUT:
[<img alt="Off-White Off White Caravaggio Hoodie" src="https://process.fs.grailed.com/AJdAgnqCST4iPtnUxiGtTz/cache=expiry:max/rotate=deg:exif/resize=width:480,height:640,fit:crop/output=format:webp,quality:70/compress/https://cdn.fs.grailed.com/api/file/yX8vvvBsTaugadX0jssT"/>]
(...a few more of these...)
[<img alt="Off-White Off-White Arrows Hoodie Black" src="https://process.fs.grailed.com/AJdAgnqCST4iPtnUxiGtTz/cache=expiry:max/rotate=deg:exif/resize=width:480,height:640,fit:crop/output=format:webp,quality:70/compress/https://cdn.fs.grailed.com/api/file/9CMvJoQIRaqgtK0u9ov0"/>]
[]
[]
[]
[]
(...many more empty lists...)
This persists even if I loop through all pages on the site (adding '?page=n' to the URL); it shows only the first 6 entries of each page.
To scrape the src attributes of the <img> tags within the shopping site using Selenium, you need to induce WebDriverWait for visibility_of_all_elements_located(), and you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = webdriver.ChromeOptions()
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument('--disable-extensions')
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
driver.get('https://www.grailed.com/shop/EkpEBRw4rw')
print([my_image.get_attribute("src") for my_image in WebDriverWait(driver, 5).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.listing-cover-photo>img")))])
Console Output:
['https://process.fs.grailed.com/AJdAgnqCST4iPtnUxiGtTz/cache=expiry:max/rotate=deg:exif/resize=width:240,height:320,fit:crop/output=format:webp,quality:70/compress/https://cdn.fs.grailed.com/api/file/yX8vvvBsTaugadX0jssT', 'https://process.fs.grailed.com/AJdAgnqCST4iPtnUxiGtTz/cache=expiry:max/rotate=deg:exif/resize=width:240,height:320,fit:crop/output=format:webp,quality:70/compress/https://cdn.fs.grailed.com/api/file/YjiErjJNQrarKGDuGr3S', 'https://process.fs.grailed.com/AJdAgnqCST4iPtnUxiGtTz/cache=expiry:max/rotate=deg:exif/resize=width:240,height:320,fit:crop/output=format:webp,quality:70/compress/https://cdn.fs.grailed.com/api/file/G9CwIli8QUW3uGgZeirk', 'https://process.fs.grailed.com/AJdAgnqCST4iPtnUxiGtTz/cache=expiry:max/rotate=deg:exif/resize=width:240,height:320,fit:crop/output=format:webp,quality:70/compress/https://cdn.fs.grailed.com/api/file/Ta9DAxg4SeKAT6kBLyJo', 'https://process.fs.grailed.com/AJdAgnqCST4iPtnUxiGtTz/cache=expiry:max/rotate=deg:exif/resize=width:240,height:320,fit:crop/output=format:webp,quality:70/compress/https://cdn.fs.grailed.com/api/file/QglmTKyTxu31PeDFWFnw', 'https://process.fs.grailed.com/AJdAgnqCST4iPtnUxiGtTz/cache=expiry:max/rotate=deg:exif/resize=width:240,height:320,fit:crop/output=format:webp,quality:70/compress/https://cdn.fs.grailed.com/api/file/9CMvJoQIRaqgtK0u9ov0', 'https://process.fs.grailed.com/AJdAgnqCST4iPtnUxiGtTz/cache=expiry:max/rotate=deg:exif/resize=width:240,height:320,fit:crop/output=format:webp,quality:70/compress/https://cdn.fs.grailed.com/api/file/MCJY9cSQsiSU4TlSTcD7', 'https://process.fs.grailed.com/AJdAgnqCST4iPtnUxiGtTz/cache=expiry:max/rotate=deg:exif/resize=width:240,height:320,fit:crop/output=format:webp,quality:70/compress/https://cdn.fs.grailed.com/api/file/L4NHu1ByT3Kwn8dRsdBX']
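Since the listings lazy-load their images as you scroll, you may still only get the first batch of real src values. A minimal sketch of one way to trigger more loading before collecting the attributes (the number of scroll steps and the pause length are arbitrary assumptions):
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.grailed.com/shop/EkpEBRw4rw')

# Scroll down a few times so the lazy-loaded images get real src attributes
for _ in range(5):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # arbitrary pause to let new images load

images = driver.find_elements(By.CSS_SELECTOR, "div.listing-cover-photo > img")
print([img.get_attribute("src") for img in images])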
