I'm trying to extract real estate listing info from a site using selenium and beautiful soup using this tutorial: https://medium.com/#ben.sturm/scraping-house-listing-data-using-selenium-and-beautiful-soup-1cbb94ba9492
Aim is to gather all the href links from the first page before finding the 'next page' button, navigating to next and collecting all links on that page and so on.
Tried with a single function to achieve this and repeat for each page but can't figure out why it's not working. New to learning code and have seems too trivial to find an answer yet. Would appreciate any help
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import sys
import numpy as np
import pandas as pd
import regex as re
driver = webdriver.Chrome
url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(url)
try:
wait = WebDriverWait(driver, 3)
wait.until(EC.presence_of_element_located((By.ID, "body1")))
print("Page is Ready!")
except TimeoutException:
print("page took too long to load")
def get_house_links(url, driver, pages=3):
house_links = []
driver.get(url)
for i in range(pages):
soup = BeautifulSoup(driver.page_source, 'html.parser')
listings = soup.find_all("a", class_="L")
page_data = [row['href'] for row in listings]
house_links.append(page_data)
time.sleep(np.random.lognormal(0, 1))
next_button = soup.find_all("a", class_="pageingBlock darkBorder")
next_button_link = ['http://property.shw.co.uk'+row['href'] for row in next_button]
if i < 3:
driver.get(next_button_link[0])
return house_links
get_house_links(url, driver)
class_="pageingBlock darkBorder" match the previous page button as well, so next_button_link[0] send you back to previous page. You need more precise locator
next_button = soup.select('img[src*="propNext"]')
if next_button:
next_button = next_button[0].find_parent('a')
next_button_link = 'http://property.shw.co.uk' + next_button['href']
driver.get(next_button_link)
Related
I currently try scrape a value at this specific website for a school project https://data.census.gov/cedsci/table?q=53706%20income&tid=ACSST5Y2020.S1901
it's the first one below if you search Median income (dollars), which should be the median income of the area, the comp-id keep changing for some reason
This median income estimate is what I'm looking for
I tried serveral method on the sites to go over the nested divs but I'm not able to get any results after runned, below is a code that I tried to use, but it just kept returning nothing to me.
Any help will be appreciate, thanks!
import csv
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
from bs4 import BeautifulSoup
DRIVER_PATH = 'chromedriver_107.exe'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
url = 'https://data.census.gov/cedsci/table?q=' + '53706' + '%20income&tid=ACSST5Y2020.S1901'
driver.get(url)
page = requests.get(url)
content = driver.page_source
soup = BeautifulSoup(content, 'lxml')
a = soup.findAll("div", {"comp-id":"1539"})
print(a)
Try with this:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#set up Chrome driver
options=webdriver.ChromeOptions()
#Define web driver as a Chrome driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://data.census.gov/cedsci/table?q=53703%20income&tid=ACSST5Y2020.S1901'
driver.get(url)
# We print the label of row 11 (Which is the median)
label = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "(//div[#row-id='11'])[1]")))
print(label.text)
# We print the values of row 11 (Which is the median)
values = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "(//div[#row-id='11'])[2]")))
print(values.text)
Output:
Median income (dollars)
42,153
±3,200
114,643
±28,572
139,694
I am learning python as well as web scrapping and I want to get number of review from google map of a permanently closed restaurant but I cannot do that, would you please help? Thank you
from bs4 import BeautifulSoup
url = 'https://www.google.com/maps?q=asia+halal+restaurant+aichi+japan+open+date&safe=strict&rlz=1C1GCEA_enID892ID892&sxsrf=ALeKk01NqaBLM8bXeVVS6M6tv9kAy0G6qQ:1616997971678&gs_lcp=Cgdnd3Mtd2l6EAM6BwgjELADECc6BQghEKABOgQIIRAVOgcIIRAKEKABUIUIWKojYOckaABwAHgAgAHHAogB7RGSAQcxLjUuNC4ymAEAoAEBqgEHZ3dzLXdpesgBAcABAQ&uact=5&um=1&ie=UTF-8&sa=X&ved=2ahUKEwjbhef-7NTvAhWa93MBHaFHCzYQ_AUoAXoECAEQAw'
import requests
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
ps = soup.find_all(string = 'クチコミ')
ps
I also tried to use find 'class' and 'span aria-label' based on developer tool of chrome below but still cannot do that
browser picture for html class
#ps = soup.find_all(class_='h0ySl-wcwwM-E70qVe-list')
#ps = soup.find_all('span aria-label')
#total_rev = ps.get_text()
#total_rev
Here is the code that I tried using selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
driver = webdriver.Chrome('F:/Download/SW/chromedriver_win32/chromedriver.exe')
url = 'https://www.google.com/maps/place/%E3%82%A2%E3%83%83%E3%83%90%E3%82%B7+%E3%82%B9%E3%82%A4%E3%83%BC%E3%83%84/#35.0903185,136.8551766,17z/data=!3m1!4b1!4m5!3m4!1s0x600378381c4bb1f7:0x8e9d356b9ded5bcc!8m2!3d35.0903185!4d136.8573653'
driver.get(url)
I have tried to get number of review using this code in "still operating" restaurant, but when it comes to permanently closed one I cannot get the number of review
span_review = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "section-star")]'))).click()
#Find the total number of reviews
total_number_of_reviews = driver.find_element_by_xpath('//*[#id="pane"]/div/div[1]/div/div/div[2]/div[2]/div/div[2]/div[2]').text.split(" ")[0]
total_number_of_reviews = int(total_number_of_reviews.replace(',','')) if ',' in total_number_of_reviews else int(total_number_of_reviews)#Find scroll layout
total_reviews = driver.find_element_by_class_name("h0ySl-wcwwM-E70qVe-list")
total_reviews #= driver.get('aria-label')
total_reviews = total_reviews.get_text('aria-label')
total_reviews
total_reviews
total_number_of_reviews = total_reviews.text[0:]
total_number_of_reviews
Hopefully I can learn
Thanks!
I can't find your xpath in HTML. There is no <button> with text section-star but <li class="section-star">.
And aria-label is not text but attribute and you have to use .get_attribute('aria-label')
But I found other xpath //button[jsaction="pane.rating.moreReviews"] and it works for me for permanent closed and still operating
Tested on Firefox and Chrome, Linux.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
driver = webdriver.Chrome('F:/Download/SW/chromedriver_win32/chromedriver.exe')
#driver = webdriver.Chrome()
#driver = webdriver.Firefox()
all_urls = [
# permanent closed
'https://www.google.com/maps/place/%E3%82%A2%E3%83%83%E3%83%90%E3%82%B7+%E3%82%B9%E3%82%A4%E3%83%BC%E3%83%84/#35.0903185,136.8551766,17z/data=!3m1!4b1!4m5!3m4!1s0x600378381c4bb1f7:0x8e9d356b9ded5bcc!8m2!3d35.0903185!4d136.8573653',
# still operating
'https://www.google.com/maps/place/Seaside+Restaurant+Higashiyama+Garden+-+Port+Bldg./#35.0841323,136.8474088,14z/data=!3m1!5s0x6003790a61e056e7:0x7f307de064680a96!4m9!1m2!2m1!1srestaurants!3m5!1s0x600379a07cd9fcc7:0x89f84cc9f0422e30!8m2!3d35.0895485!4d136.8809243!15sCgtyZXN0YXVyYW50c1oNIgtyZXN0YXVyYW50c5IBCnJlc3RhdXJhbnQ',
]
for url in all_urls:
driver.get(url)
total_reviews = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//button[#jsaction="pane.rating.moreReviews"]')))
total_reviews = total_reviews.get_attribute('aria-label')
print(total_reviews)
I tried using Selenium to crawl the web data, it loads all 346 products after clicking on the load more button for a few times on the browser, however, it only shows 96 / 346 product instead of 346 / 346 product, any idea how to fix it? I have already put the crawling code right after the while true loop for clicking the load more button
screen capture of the result
from urllib.request import urlopen
import requests
import ast
from selenium import webdriver
driver=webdriver.Chrome('e:/Users/fungc1/Documents/chromedriver.exe')
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
options=Options()
from bs4 import BeautifulSoup
url="https://www.toysrus.com.sg/lego"
#data = soup.findAll('div',attrs={'class':'card-image-wrapper'})
#toc = soup.find_all('div',attrs={'class':'result-count text-center'})
driver.get(url)
driver.maximize_window()
time.sleep(5)
driver.find_element_by_link_text("STAY ON THE SINGAPORE SITE").click()
while True:
try:
driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
wait=WebDriverWait(driver, 10)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.btn[data-url*='www.toysrus.com']"))).click()
time.sleep(5)
except Exception as e:
print(e)
break
time.sleep(5)
response = requests.get(url)
response_text = response.text
soup = BeautifulSoup(response_text, 'lxml')
text = urlopen(url).read()
soup = BeautifulSoup(text)
data = soup.findAll('div',attrs={'class':'card-image-wrapper'})
toc = soup.find_all('div',attrs={'class':'result-count text-center'})
emptylist2=[]
for item in toc:
print((item).text.strip()[:-1])
for div in data:
links = div.findAll('a')
for a in links:
catalogueresult=ast.literal_eval("" + a['href'][1:-5][-7:])
emptylist2.append(catalogueresult)
print (emptylist2)
You are mixing few things.
You opened the browser with Selenium and loaded all the items by clicking on load button. But after that you use requests library to request new html again from the url which has nothing to do with Selenium. So, you are doing two separate things. In your case even you remove the Selenium code you will get the same thing because you are not utilizing Selenium after loading all the products.
Now, what you need to do is to ask the Selenium to return the html code of all 396 products so that you can give it to BeautifulSoup for further parsing.
To do that, you don't need first 4 lines after your while loop ends. Do something like this:
html = driver.page_source #will return the html code with all products
soup = BeautifulSoup(html, 'lxml')
With this you will get all 396 products.
Good time of the day,
Currently I work on scraping project with the end goal is to create a DataFrame.
While I navigate from page to page, I have to gather different criterias. Though In case if the criteria is not present on the page, I would like to receive a "None"
import re
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
import time
import random
from bs4 import BeautifulSoup
start_time = time.time()
url='https://www.immoweb.be/en/search/house/for-sale?countries=BE&page=1&orderBy=relevance'
driver = webdriver.Chrome()
driver.implicitly_wait(30)
driver.get(url)
time.sleep(random.uniform(1.0, 3.0))
python_button = driver.find_elements_by_xpath('//*[#id="uc-btn-accept-banner"]')[0]
python_button.click()
time.sleep(random.uniform(1.0, 3.0))
python_button = driver.find_elements_by_xpath('//*[#id="classified_9312278"]')[0]
python_button.click()
soup = BeautifulSoup(driver.page_source)
area = list()
for i in range(15):
python_button = driver.find_elements_by_xpath('//*[#id="classifiedNavigation"]/ul/li[2]/a')[0]
python_button.click()
time.sleep(random.uniform(1.0, 3.0))
soup = BeautifulSoup(driver.page_source)
try:
for table in soup.findAll("th",text=re.compile("Living area")):
if table:
area.append(table.find_next("td").next_element.strip())
else:
area.append(None)
except:
area.append(None)
houses = {"Area":area}
print(houses)
However with the current code, only exisiting value appends to the list - whatever is not added does not even leave a blank.
And here is a link to the search
Thank you very much in advance!
It is pretty much obvious to me now
if soup.findAll("th",text=re.compile("Living area")):
for table in soup.findAll("th",text=re.compile("Living area")):
area.append(table.find_next("td").next_element.strip())
else:
area.append(None)
I am scraping public linkedIn data from specific people.
here is the code inside the while loop. For you to know, I used time.sleep() for the first 400 profils urls and it worked. However, it is not working anymore as it makes my firefox browser crash. I am pretty sure that the bug comes from the time.sleep() function that I tried to modify using implictly_wait() and WebdriverWait. However, none of this tries worked ;(
Here the code inside the while loop with the time.sleep() function that worked for around 400urls:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
browser = webdriver.Firefox()
browser.get("https://www.linkedin.com/uas/login")
time.sleep(4)
username = browser.find_element_by_id("session_key-login")
password = browser.find_element_by_id("session_password-login")
username.send_keys("yourmail")
password.send_keys("yourpassword")
login_attempt = browser.find_element_by_xpath("//*[#type='submit']")
login_attempt.submit()
time.sleep(4)
browser.get(the profile link I wanna scrap)
html = browser.page_source
soup = BeautifulSoup(html,"html.parser")
formation = soup.find_all('div', {'class': "education"})
nom = soup.find_all('span', {'class': "full-name"})
for a in nom:
for b in formation:
print(a.text,b.text)
time.sleep(4)
browser.close()
I tried to replace the time.sleep() by Implicitly_wait() but it is not working. The browser does not wait at all.
I also tried this
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
browser = webdriver.Firefox()
browser.get("the profile url I wanna scrap")
delay = 30 # seconds
try:
WebDriverWait(browser, delay).until(EC.presence_of_element_located(browser.find_element_by('education'))
print("Page is ready!")
except TimeoutException:
print("Loading took too much time!")
But it is still not working.
Do you have any idea on how to solve the issue ?
If I could make the browser wait without using time.sleep() (which makes my browser crash) without any conditions that would be amazing !
other question ? If I use chrome instead of firefox, do I have a chance to overcome the problem ?
Thanks for your answers,
Raphaël
With WebDriverWait, the browser waits but firefox crashes again: here the code
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
browser = webdriver.Firefox()
browser.get("https://www.linkedin.com/uas/login")
username = browser.find_element_by_id("session_key-login")
password = browser.find_element_by_id("session_password-login")
username.send_keys("mail")
password.send_keys("password")
login_attempt = browser.find_element_by_xpath("//*[#type='submit']")
login_attempt.submit()
try:
element = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.ID, "content")))
finally:
browser.get("profil linkedin to scrap")
html = browser.page_source
soup = BeautifulSoup(html,"html.parser")
formation = soup.find_all('div', {'class': "education"})
nom = soup.find_all('span', {'class': "full-name"})
for a in nom:
for b in formation:
print(a.text,b.text)
browser.close()