I'm scraping a website where the search bar auto-fills with the last thing I searched. If I use .clear(), the search bar still appends the last-searched item.
for sku in skus_to_find:
    search_bar = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.XPATH, "//input[contains(@accesskey, 'S')]")))
    search_bar.clear()
    search_bar.send_keys(sku)
    search_bar.send_keys(Keys.RETURN)
How can I better clear out the search bar on a web page prior to .send_keys()?
I added logic to click on the cross icon, so it clears the text box before you add a new SKU.
from bs4 import BeautifulSoup
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
#PATH = "C:\Program Files (x86)\chromedriver_win32\chromedriver.exe"
baseurl = "http://www.zoro.com"
driver = webdriver.Chrome()
driver.get(baseurl)
def use_driver_current_html(driver):
    soup = BeautifulSoup(driver.page_source, 'lxml')
    return soup
file = open('zoro_skus.txt')
skus_to_find = []
product_list = []
for line in file:
    line = line.replace('\n', '')
    skus_to_find.append(line)
for sku in skus_to_find:
    search_bar = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.XPATH, "//input[contains(@accesskey, 'S')]")))
    time.sleep(2)
    search_bar.send_keys(sku)
    search_bar.send_keys(Keys.RETURN)
    # below try/except added to click the clear (cross) icon
    try:
        cross_click = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "(//*[@class='svg-icon zcl-icon zcl-icon--small'])[2]")))
        cross_click.click()
    except Exception as e:
        print(e)
        pass
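If clicking the cross icon proves brittle, another common way to clear a field that keeps re-filling is to select the existing text and delete it before typing. A minimal sketch, assuming the same search_bar element as above (on macOS you would use Keys.COMMAND instead of Keys.CONTROL):
# select whatever text is already in the field, then delete it
search_bar.send_keys(Keys.CONTROL, "a")
search_bar.send_keys(Keys.DELETE)
# type the new SKU and submit
search_bar.send_keys(sku)
search_bar.send_keys(Keys.RETURN)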
Good day,
I am currently working on a scraping project whose end goal is to create a DataFrame.
While I navigate from page to page, I have to gather different criteria. If a criterion is not present on the page, I would like to receive a None instead.
import re
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
import time
import random
from bs4 import BeautifulSoup
start_time = time.time()
url='https://www.immoweb.be/en/search/house/for-sale?countries=BE&page=1&orderBy=relevance'
driver = webdriver.Chrome()
driver.implicitly_wait(30)
driver.get(url)
time.sleep(random.uniform(1.0, 3.0))
python_button = driver.find_elements_by_xpath('//*[#id="uc-btn-accept-banner"]')[0]
python_button.click()
time.sleep(random.uniform(1.0, 3.0))
python_button = driver.find_elements_by_xpath('//*[#id="classified_9312278"]')[0]
python_button.click()
soup = BeautifulSoup(driver.page_source)
area = list()
for i in range(15):
    python_button = driver.find_elements_by_xpath('//*[@id="classifiedNavigation"]/ul/li[2]/a')[0]
    python_button.click()
    time.sleep(random.uniform(1.0, 3.0))
    soup = BeautifulSoup(driver.page_source)
    try:
        for table in soup.findAll("th", text=re.compile("Living area")):
            if table:
                area.append(table.find_next("td").next_element.strip())
            else:
                area.append(None)
    except:
        area.append(None)
houses = {"Area":area}
print(houses)
However, with the current code only existing values are appended to the list; when a criterion is missing, nothing is appended, not even a blank.
And here is a link to the search
Thank you very much in advance!
It is pretty obvious to me now:
if soup.findAll("th", text=re.compile("Living area")):
    for table in soup.findAll("th", text=re.compile("Living area")):
        area.append(table.find_next("td").next_element.strip())
else:
    area.append(None)
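The same idea can be wrapped in a small helper that returns None when the header is missing. A sketch, assuming the same soup object (the helper name extract_criterion is made up, and it only takes the first matching row):
def extract_criterion(soup, label):
    # return the cell next to the matching header, or None if the header is absent
    header = soup.find("th", text=re.compile(label))
    if header is None:
        return None
    return header.find_next("td").next_element.strip()

area.append(extract_criterion(soup, "Living area"))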
I've been having trouble trying to extract the phone number after clicking the "llamar" button. So far I've used the XPath method with Selenium and also tried using Beautiful Soup to extract the number, but unfortunately nothing has worked. I usually get an invalid selector error (if I use an XPath selector with Selenium), and with BS4 I get: AttributeError: 'NoneType' object has no attribute 'text' ...
I hope you can help me out!
Here is the url to the link - https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm
Here's the code that I tried:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import UnexpectedAlertPresentException
url = 'https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm'
path = r'C:\Users\WL-133\anaconda3\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe'
path1 = r'C:\Users\WL-133\anaconda3\Lib\site-packages\selenium\webdriver\firefox'
# driver = webdriver.Chrome(path)
options = Options()
driver = webdriver.Chrome(path)
driver.get(url)
a = []
mah_div = driver.page_source
soup = BeautifulSoup(mah_div, features='lxml')
cookie_button = '//*[@id="sui-TcfFirstLayerModal"]/div/div/footer/div/button[2]'
btn_press = driver.find_element_by_xpath(cookie_button)
btn_press.click()
llam_button = '//*[@id="ad-detail-contact"]/a[2]'
llam_press = driver.find_element_by_xpath(llam_button)
llam_press.click()
time.sleep(10)
for item in soup.find_all("div", {"class": "contenido"}):
    a.append(item.find("div", {"class": "plaincontenido"}).text)
print(a)
The phone number is stored inside JavaScript. You can use the re module to extract it:
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm"
phone_url = "https://www.milanuncios.com/datos-contacto/?usePhoneProxy=0&from=detail&includeEmail=false&id={}"
ad_id = re.search(r"(\d+)\.htm", url).group(1)
html_text = requests.get(phone_url.format(ad_id)).text
soup = BeautifulSoup(html_text, "html.parser")
phone = re.search(r"getTrackingPhone\((.*?)\)", html_text).group(1)
print(soup.select_one(".texto").get_text(strip=True), phone)
Prints:
ana (Particular) 639....
With Selenium you will need to click the button and switch to the iframe.
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 10)  # wait was not defined in the original snippet; the 10 s timeout is an assumption
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".def-btn.phone-btn")))
tel_button = driver.find_element_by_css_selector(".def-btn.phone-btn")
tel_button.click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, "ifrw")))
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".texto>.telefonos")))
tel_number = driver.find_element_by_css_selector(".texto>.telefonos").text
Please note, I used more stable locators.
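If you need to keep interacting with the rest of the page after reading the number, you would normally switch back out of the iframe first; a one-line sketch:
# return from the "ifrw" iframe to the top-level document
driver.switch_to.default_content()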
So I am trying to parse data from a dynamic table with Selenium, but it keeps getting the old data from page 1 when I try to gather page 2's data. I've searched for other answers but haven't found any; some say I need to add a wait period, and I did, however that didn't work.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Firefox()
browser.get('https://www.nyse.com/listings_directory/stock')
symbol_list=[]
table_data = browser.find_elements_by_xpath("//td")
def append_to_list(data):
    for element in data:
        symbol_list.append(element.text)
append_to_list(table_data)
pages = browser.find_elements_by_xpath('//a[@href="#"]')
for page in pages:
    if page.get_attribute("rel") == "next":
        if page.text == "NEXT ›":
            page.click()
            browser.implicitly_wait(100)
            for elem in browser.find_elements_by_xpath("//td"):  # still fetches the data from page 1
                print(elem.text)
#print(symbol_list)
I modified your script as below.
You should retrieve the elements inside the loop, or it will cause a stale element reference exception.
And use WebDriverWait to wait for the elements to be visible before finding them.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from time import sleep
browser = webdriver.Chrome()
browser.get('https://www.nyse.com/listings_directory/stock')
symbol_list = []
while True:
    try:
        table_data = WebDriverWait(browser, 10).until(EC.visibility_of_all_elements_located((By.XPATH, "//table//td")))
        for i in range(1, len(table_data) + 1):
            td_text = browser.find_element_by_xpath("(//table//td)[" + str(i) + "]").text
            print(td_text)
            symbol_list.append(td_text)
        next_page = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//a[@href="#" and contains(text(),"Next")]')))
        next_clickable = next_page.find_element_by_xpath("..").get_attribute("class")  # parent li element
        if next_clickable == 'disabled':
            break
        print("Go to next page ...")
        next_page.click()
        sleep(3)
    except Exception as e:
        print(e)
        break
I'm trying to extract real estate listing info from a site using Selenium and Beautiful Soup, following this tutorial: https://medium.com/@ben.sturm/scraping-house-listing-data-using-selenium-and-beautiful-soup-1cbb94ba9492
Aim is to gather all the href links from the first page before finding the 'next page' button, navigating to next and collecting all links on that page and so on.
I tried a single function to achieve this and repeat it for each page, but I can't figure out why it's not working. I'm new to coding and this seems too trivial to have found an answer for yet. Would appreciate any help.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import sys
import numpy as np
import pandas as pd
import regex as re
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(url)
try:
    wait = WebDriverWait(driver, 3)
    wait.until(EC.presence_of_element_located((By.ID, "body1")))
    print("Page is Ready!")
except TimeoutException:
    print("page took too long to load")
def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        page_data = [row['href'] for row in listings]
        house_links.append(page_data)
        time.sleep(np.random.lognormal(0, 1))
        next_button = soup.find_all("a", class_="pageingBlock darkBorder")
        next_button_link = ['http://property.shw.co.uk' + row['href'] for row in next_button]
        if i < 3:
            driver.get(next_button_link[0])
    return house_links
get_house_links(url, driver)
class_="pageingBlock darkBorder" match the previous page button as well, so next_button_link[0] send you back to previous page. You need more precise locator
next_button = soup.select('img[src*="propNext"]')
if next_button:
    next_button = next_button[0].find_parent('a')
    next_button_link = 'http://property.shw.co.uk' + next_button['href']
    driver.get(next_button_link)
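Integrated into your get_house_links loop, a sketch could look like this (same page structure assumed; it simply stops when no next arrow is found):
def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        house_links.append([row['href'] for row in listings])
        # locate the "next" arrow by its image, then walk up to the enclosing <a>
        next_button = soup.select('img[src*="propNext"]')
        if not next_button:
            break  # no next page found, stop early
        next_link = 'http://property.shw.co.uk' + next_button[0].find_parent('a')['href']
        driver.get(next_link)
    return house_links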
According to my code, I have tried to click on the View button which contains the hidden document; I need to download that document using Selenium WebDriver in Python. When I inspect, I get the stream-url = chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai/85967fa5-7853-412e-bbe5-c96406308ec6
I found this stream-url in the embed tag. I am not sure how to download that document.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import urllib.request
from bs4 import BeautifulSoup
import os
from selenium.webdriver.support.select import Select
import time
import pandas as pd
url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()
Registered_Project_radio = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "Promoter")))
driver.execute_script("arguments[0].click();", Registered_Project_radio)
Application = driver.find_element_by_id("CertiNo")
Application.send_keys("P50500000005")
Search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "btnSearch")))
driver.execute_script("arguments[0].click();", Search)
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a") if item.get_attribute('href') is not None]
View = View[0]
request = urllib.request.Request(View)
driver.get(View)
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html, 'html.parser')
divPInfo = soup.find("div", {"id": "DivDocument"})
title = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find("div", {'class': 'x_title'}).find("h2").text.strip()
print(title)
with open("uploads.csv", "a") as csv_file:
    csv_file.write(title + "\n")
table = pd.read_html(driver.page_source)[11]
print(table)
table.to_csv("uploads.csv", sep=',', index=False)
btn = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[@class='btn btn-info btn-xs' and @id='btnShow_10']")))
driver.execute_script("arguments[0].click();", btn)
In Firefox the page uses <object data="..."> to display the scanned PDF. There are buttons in the "Uploaded Documents" section to display the other scans.
This code uses those buttons to display the scans, gets the data from <object>, and saves it in the files document-0.pdf, document-1.pdf, etc.
I use the same code you could see in my answer to your previous question:
Save the pdf using the selenium webdriver in python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import base64
url = 'https://maharerait.mahaonline.gov.in'
#chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
#driver = webdriver.Chrome(executable_path=chrome_path)
driver = webdriver.Firefox()
driver.get(url)
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()
registered_project_radio = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID,"Promoter")))
driver.execute_script("arguments[0].click();", registered_project_radio)
application = driver.find_element_by_id("CertiNo")
application.send_keys("P50500000005")
search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID,"btnSearch")))
driver.execute_script("arguments[0].click();", search)
time.sleep(5)
View = [item.get_attribute('href')
        for item in driver.find_elements_by_tag_name("a")
        if item.get_attribute('href') is not None]
# if there is list then get first element
# if there is a list then get the first element
if View:
    View = View[0]
#-----------------------------------------------------------------------------
# load page
driver.get(View)
# find buttons in section `Uploaded Documents`
buttons = driver.find_elements_by_xpath('//div[@id="DivDocument"]//button')
# work with all buttons
for i, button in enumerate(buttons):
    # click button
    button.click()
    # wait till page displays the scan
    print('wait for object:', i)
    search = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, "object")))
    # get base64 data from the object element
    print('get data:', i)
    obj = driver.find_element_by_tag_name('object')
    data = obj.get_attribute('data')
    text = data.split(',')[1]
    bytes = base64.b64decode(text)
    # save scan in next PDF
    print('save: document-{}.pdf'.format(i))
    with open('document-{}.pdf'.format(i), 'wb') as fp:
        fp.write(bytes)
    # close scan
    print('close document:', i)
    driver.find_element_by_xpath('//button[text()="Close"]').click()
# --- end ---
driver.close()