I want to download U.S. Department of Housing and Urban Development data using Python's Selenium. Here's my code.
import os
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
options = webdriver.ChromeOptions()
preferences= {"download.default_directory": os.getcwd(), "directory_upgrade": True}
options.add_experimental_option("prefs", preferences)
#options.headless = True
options.add_experimental_option('excludeSwitches', ['enable-logging'])
url = "https://hudgis-hud.opendata.arcgis.com/datasets/deteriorated-paint-index-by-county/explore"
# Path of my WebDriver
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
wait = WebDriverWait(driver, 60)
# to maximize the browser window
driver.maximize_window()
#get method to launch the URL
driver.get(url)
paths = ["#ember97", "calcite-card > div > calcite-button"]
for x in paths:
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, x))).click()
I can click the button to expand the side panel, where the CSV file button is located, but I cannot click the CSV button itself to download the file. My first thought was to check whether the side panel existed within an iframe, so I did
seq = driver.find_elements_by_tag_name('iframe')
seq
And it returned nothing. The content is nested in a class called side-panel-ref. Is there a way to switch to this somehow so I can click that content when there are no iframes? What might I be missing?
Your button is inside a shadow root.
You can see this when you inspect the element in devtools: the #shadow-root node sits inside the hub-download-card element, and the calcite-button lives underneath it.
The quickest and easiest way to handle this is with some JS. This is your script slightly refactored, plus the JS call:
url = "https://hudgis-hud.opendata.arcgis.com/datasets/deteriorated-paint-index-by-county/explore"
wait = WebDriverWait(driver, 60)
driver.maximize_window()
driver.get(url)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#ember97'))).click()
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div.dataset-download-card > hub-download-card")))
driver.execute_script('document.querySelector("div.dataset-download-card > hub-download-card").shadowRoot.querySelector("calcite-card > div > calcite-button").click()')
It's a fairly lengthy JS call, but it's reasonably self-explanatory if you read it; there are five parts to it: document, .querySelector(..), .shadowRoot, .querySelector(..) and .click(). Just ask if you need more support.
Please also be aware that Selenium is not great at downloading files: there's no API that exposes the download's progress. You'll need to ensure your browser remains open while the file downloads.
It seems a pretty quick download, so you might get away with a hard-coded sleep.
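If you'd rather not rely on a fixed sleep, one common workaround is to poll the download folder until the CSV appears and Chrome's temporary .crdownload file is gone. A minimal sketch, assuming the export lands as a .csv in the directory you set in the prefs:
import os
import time

def wait_for_csv(directory, timeout=60):
    # Poll `directory` until a .csv exists and no .crdownload temp files remain.
    deadline = time.time() + timeout
    while time.time() < deadline:
        files = os.listdir(directory)
        downloading = any(f.endswith(".crdownload") for f in files)
        csv_files = [f for f in files if f.endswith(".csv")]
        if csv_files and not downloading:
            return csv_files
        time.sleep(1)
    raise TimeoutError("Download did not finish in time")

# e.g. after triggering the download:
# wait_for_csv(os.getcwd())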
Also worth a mention: if you're not a fan of the long JS, you can break it down like so:
container = driver.find_element_by_css_selector("div.dataset-download-card > hub-download-card")
shadowRoot = driver.execute_script("return arguments[0].shadowRoot", container)
shadowRoot.find_element_by_css_selector("calcite-card > div > calcite-button").click()
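As a side note, if you're on Selenium 4, the WebElement.shadow_root property gives you the same thing without execute_script. A small sketch, assuming the rest of your setup stays the same (this needs a reasonably recent Chrome/chromedriver):
card = driver.find_element(By.CSS_SELECTOR, "div.dataset-download-card > hub-download-card")
card.shadow_root.find_element(By.CSS_SELECTOR, "calcite-card > div > calcite-button").click()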
I'm trying to create a script that shows only Pikachus on the Singapore Poké Map; the rest of the code goes over the elements, gets their coordinates, and prints the list.
I've been trying many suggestions I've seen here for a long time, but I'm still unable to get the checkbox checked with the latest code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time
def find_pokemon():
    links = []
    service = ChromeService(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    driver.get('https://sgpokemap.com/index.html?fbclid=IwAR2p_93Ll6K9b923VlyfaiTglgeog4uWHOsQksvzQejxo2fkOj4JN_t-MN8')
    driver.find_element(By.ID, 'filter_link').click()
    driver.find_element(By.ID, 'deselect_all_btn').click()
    driver.find_element(By.ID, 'search_pokemon').send_keys("pika")
    driver.switch_to.frame(driver.find_elements(By.ID, "filter"))
    driver.find_element(By.ID, 'checkbox_25').click()
The second part of the code works when I check the box manually after putting a breakpoint and ignoring the checkbox click() exception.
Do you have any suggestions for what I can try?
Bonus question: how can I detect and close the donation view?
There are several problems with your code:
There is no element with ID = 'search_pokemon'
There is no frame there to switch into.
You need to use WebDriverWait with expected_conditions to wait for elements to be clickable.
And generally you need to learn how to create correct locators.
The following code works:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, service=webdriver_service)
wait = WebDriverWait(driver, 30)
url = "https://sgpokemap.com/index.html?fbclid=IwAR2p_93Ll6K9b923VlyfaiTglgeog4uWHOsQksvzQejxo2fkOj4JN_t-MN8"
driver.get(url)
try:
    wait.until(EC.element_to_be_clickable((By.ID, 'close_donation_button'))).click()
except:
    pass
wait.until(EC.element_to_be_clickable((By.ID, 'filter_link'))).click()
wait.until(EC.element_to_be_clickable((By.ID, "deselect_all_btn"))).click()
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "[name='search_pokemon']"))).send_keys("pika")
wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='filter_checkbox'][not(@style)]//label"))).click()
The result is:
UPD
This time I saw the donation dialog so I added the mechanism to close it.
I still can't see an element with ID = 'search_pokemon' there, as you mentioned.
As for the XPath used to find the relevant checkbox: when the pokemon name is entered, you can see in the dev tools that there are a lot of checkboxes there, but all of them are invisible while only one (in our case) is visible. The invisible elements all have the attribute style="display: none;", while the enabled element has no style attribute at all. This is why [not(@style)] comes in. So, I'm looking for a parent element //div[@class='filter_checkbox'] that also has no style attribute, in XPath words //div[@class='filter_checkbox'][not(@style)], and then I'm just looking for its label child to click it. This can also be done with CSS Selectors as well.
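For example, the same "visible checkbox" condition could be written as a CSS selector roughly like this (a sketch, relying on the same assumption that the hidden rows carry an inline style attribute and the visible one doesn't):
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div.filter_checkbox:not([style]) label"))).click()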
The list of invisible elements with the enabled one:
With the help and answers from @Prophet, here is the current code for crawling the map and getting all of the Pikachu coordinates:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from keep import saveToKeep
def find_pokemon():
    links = []
    options = Options()
    options.add_argument("--headless")
    options.add_argument("disable-infobars")
    webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
    driver = webdriver.Chrome(options=options, service=webdriver_service)
    wait = WebDriverWait(driver, 30)
    driver.get('https://sgpokemap.com')
    try:
        wait.until(EC.element_to_be_clickable((By.ID, 'close_donation_button'))).click()
    except:
        pass
    wait.until(EC.element_to_be_clickable((By.ID, 'filter_link'))).click()
    wait.until(EC.element_to_be_clickable((By.ID, "deselect_all_btn"))).click()
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "[name='search_pokemon']"))).send_keys("pika")
    wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='filter_checkbox'][not(@style)]//label"))).click()
    # count = 0
    wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'pokemon_icon_img')))
    pokeList = driver.find_elements(By.CLASS_NAME, 'pokemon_icon_img')
    for poke in pokeList:
        # count += 1
        try:
            poke.click()
            links.append(driver.find_element(By.LINK_TEXT, "Maps").get_attribute('href'))
        except Exception:
            pass
        # if count > 300:
        #     break
    res = []
    for link in links:
        res.append(link.split("=")[1].replace("'", ""))
    # for item in res:
    #     print(item)
    if len(res) > 1:
        saveToKeep(res)
        print("success")
    else:
        print("unsuccessful")
        find_pokemon()


if __name__ == '__main__':
    find_pokemon()
Used the headless Chrome option in the hope of achieving better performance.
Commented out 'count' in case I want to limit the list of results (currently I'm getting like 15 results tops when unlimited, although there are many more... weird :( ).
The line wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'pokemon_icon_img'))) is needed for now since the icons don't always show right away, so it's either that or adding a constant time delay.
Made this method recursive in case it's unsuccessful (sometimes it still throws exceptions).
Lastly, saveToKeep(res) is a simple method I'm using to open and write results into my Google Keep notes. I needed to get an app password within the Google security settings, and I'm using it with my Google account credentials for login.
Any comments or suggestions for improvement are welcome :D
I'm trying to download, or rather read, a CSV from a website. The CSV is hidden behind a button; here is the website link: https://defillama.com/chains.
The CSV I'm trying to save is furthest to the right and is denoted by a blue button labelled 'Download all data in .csv'.
Here is a sample code I tried to use to download it, but the download fails:
# import the required libraries
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
options = webdriver.ChromeOptions()
prefs = {"download.default_directory" : "D:/Profession/Data Extraction and Web Scraping/Stocks Data Extraction - Core Scientific/Output"}
#example: prefs = {"download.default_directory" : "C:\Tutorial\down"};
options.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome(executable_path="D:/Software/Selenium WebDrivers/chromedriver_win32/chromedriver", options=options)
try:
    driver.implicitly_wait(5)
    driver.get("https://defillama.com/chains")
    downloadcsv = driver.find_element(By.CLASS_NAME, 'sc-8f0f10aa-1')
    download_button = downloadcsv.find_element(By.TAG_NAME, 'button')
    download_button.click()  # this should save it to the folder I specified in prefs
    time.sleep(3)
    driver.close()
except:
    print('There is an Error!')
This code tries to download the CSV, but the download fails. Is there a better way to download a CSV, especially one that sits under a button? Thanks!
Looks like you need to use a proper wait method.
implicitly_wait waits for element presence, while here you need to wait for the element to be clickable, which is a more mature element state. WebDriverWait with expected_conditions is the best practice in such (and almost all other) cases.
The following code works:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, service=webdriver_service)
wait = WebDriverWait(driver, 10)
url = "https://defillama.com/chains"
driver.get(url)
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".sc-8f0f10aa-1 button"))).click()
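Note that this only clicks the button; if you also want the file to land in a specific folder, you can keep the download-directory prefs from your original script when building the driver. A sketch (the path is a placeholder):
options.add_experimental_option("prefs", {
    "download.default_directory": r"D:\downloads",  # placeholder path
    "download.prompt_for_download": False,
})
driver = webdriver.Chrome(options=options, service=webdriver_service)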
I am playing around with Selenium on https://www.autozone.com/, trying to fill out the add vehicle form using Selenium and Python.
First, I click on the add vehicle button:
URL = "https://www.autozone.com/"
ADD_VEHICLE_XPATH = "/html/body/div[1]/div/div[2]/div[2]/header/div[2]/div/div/div[2]/div/button"
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.implicitly_wait(WAIT_TIME)
driver.get(URL)
add_vehicle_button = driver.find_element(By.XPATH, ADD_VEHICLE_XPATH)
add_vehicle_button.click()
Then a pop-up window appears, and I try to locate the button for the year dropdown:
YEAR_BUTTON_XPATH = "/html/body/div[4]/div[3]/div/div[2]/div/div/div[1]/div/div[2]/div[1]/div/div/div[1]/div/button"
year_button = driver.find_element(By.XPATH, YEAR_BUTTON_XPATH)
This throws a NoSuchElementException.
According to this script:
def is_in_iframe():
    driver.execute_script("""function iniFrame() {
        if (window.location !== window.parent.location) {
            // The page is in an iFrame
            document.write("The page is in an iFrame");
        } else {
            // The page is not in an iFrame
            document.write("The page is not in an iFrame");
        }
    }
    // Calling the iniFrame function
    iniFrame();""")
I am not in an iframe after clicking on add vehicle
I have also checked the names of all windows before and after clicking add vehicle; there is only ever one window, and it is the same before and after clicking.
Some things to note:
I have tried adding both Python's sleep() and waiting in Selenium.
The div that contains the pop-up and all the buttons does not show up until I click add vehicle, and after that it shows up near the bottom.
The XPath I'm using is unique to that button.
What are some other ways I can try to locate the button? Please let me know if I need to add any more code or description.
You need to improve your locators.
Also you need to wait for elements to become clickable before clicking them.
The following code opens the "Add Vehicle" dialog and selects the year 2020.
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
caps = DesiredCapabilities().CHROME
caps["pageLoadStrategy"] = "eager"
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, desired_capabilities=caps, service=webdriver_service)
wait = WebDriverWait(driver, 10)
url = "https://www.autozone.com/"
driver.get(url)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-testid='deskTopVehicle-menu-lg']"))).click()
wait.until(EC.element_to_be_clickable((By.ID, "yearheader"))).click()
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-testid='yearheader-dropdown-list-item-4']"))).click()
This is the screenshot of web page state after applying the code above:
That site contains several JavaScript scripts that make the page load time long, so I added a special setting so the driver does not wait for the full page load. This is what
caps = DesiredCapabilities().CHROME
caps["pageLoadStrategy"] = "eager"
is there for.
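As a side note, in newer Selenium 4 releases where desired_capabilities is deprecated, the same behaviour can be set directly on the options object instead; a sketch:
options = Options()
options.add_argument("start-maximized")
options.page_load_strategy = "eager"  # don't wait for the full page load
driver = webdriver.Chrome(options=options, service=webdriver_service)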
I'm working on a scraping project for AliExpress, and I want to change the ship-to country using Selenium: for example, change Spain to Australia, click the Save button, and then scrape the page. I already found an answer that worked; I just don't know how to save the change by clicking the Save button with Selenium. Any help is highly appreciated. This is the code I'm using for this task:
country_button = driver.find_element_by_class_name('ship-to')
country_button.click()
country_buttonn = driver.find_element_by_class_name('shipping-text')
country_buttonn.click()
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//li[@class='address-select-item ']//span[@class='shipping-text' and text()='Australia']"))).click()
Well, there are 2 pop-ups there you need to close first in order to access any other elements. Then you can select the desired shipment destination. I used WebDriverWait for all those commands to make the code stable. Also, I used scrolling to bring the desired destination button into view before clicking on it, and finally clicked the Save button.
The code below works.
Just pay attention that after selecting a new destination, pop-ups can appear again.
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
options = Options()
options.add_argument("--start-maximized")
s = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, service=s)
url = 'https://www.aliexpress.com/'
wait = WebDriverWait(driver, 10)
actions = ActionChains(driver)
driver.get(url)
try:
    wait.until(EC.element_to_be_clickable((By.XPATH, "//div[contains(@style,'display: block')]//img[contains(@src,'TB1')]"))).click()
except:
    pass
try:
    wait.until(EC.element_to_be_clickable((By.XPATH, "//img[@class='_24EHh']"))).click()
except:
    pass
wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "ship-to"))).click()
wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "shipping-text"))).click()
ship_to_australia_element = driver.find_element(By.XPATH, "//li[@class='address-select-item ']//span[@class='shipping-text' and text()='Australia']")
actions.move_to_element(ship_to_australia_element).perform()
time.sleep(0.5)
ship_to_australia_element.click()
wait.until(EC.element_to_be_clickable((By.XPATH, "//button[@data-role='save']"))).click()
I mostly used XPath locators here. CSS Selectors could be used as well.
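For instance, the Save button click could be written with an attribute-based CSS selector instead of XPath, something like:
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-role='save']"))).click()
The one place XPath is genuinely more convenient here is the destination entry itself, since CSS selectors have no text()-style predicate for matching 'Australia'.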
Fellows,
I'm doing some webscraping and need to download multiple PDFs from the www1.hkexnews.hk website.
However, I encountered a problem while trying to make my Selenium chromedriver tick the box that appears every time one wants to download a PDF on said website. The code executes, but the box still appears unclicked.
Please refer to my source code below - would appreciate any advice!
driver = webdriver.Chrome('/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/chromedriver',options=chrome_options)
driver.implicitly_wait(10)
driver.maximize_window()
start_address = "https://www1.hkexnews.hk/app/appyearlyindex.html?lang=en&board=mainBoard&year=2021"
driver.get(start_address)
PDF_link = driver.find_element_by_xpath("//a[contains(text(),'Full Version')]")
print("Now clicking...'", PDF_link.text,"'")
PDF_link.click()
checkbox = driver.find_element_by_id('warning-statement-accept')
print("Now clicking...", checkbox.text)
checkbox.click
Edit: Thank you guys! The downloading works fine now. Just one small follow-up question: how can I modify the downloading code to save each PDF according to its company name, which is available through all_names = driver.find_elements_by_xpath("//div[@class='applicant-name']")?
At the moment I am using the automatic download options below; I guess the downloading logic would have to be adjusted (I would rather download the PDFs with the correct names already, rather than employ the dirty workaround of using Python to change their names once they're saved...).
chrome_options.add_experimental_option('prefs', {
    "download.default_directory": "/Users/XXX/Downloads",  # Change default directory for downloads
    "download.prompt_for_download": False,  # To auto download the file
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True  # It will not show the PDF directly in Chrome
})
This should do it:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
link = "https://www1.hkexnews.hk/app/appyearlyindex.html?lang=en&board=mainBoard&year=2021"
driver = webdriver.Chrome()
wait = WebDriverWait(driver,10)
driver.get(link)
elem = wait.until(EC.presence_of_element_located((By.XPATH, "//tr[@class='record-ap-phip']//a[contains(.,'Full Version')]")))
elem.click()
wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='warning-statement-dialog']//label[@for='warning-statement-accept']"))).click()
wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='warning-statement-dialog']//a[contains(@class,'btn-ok')]"))).click()
Here goes the modified version of the script which will kick out the newly opened tabs. I didn't include the downloading logic within the script. I suppose you can do that yourself.
driver.get(link)
current = driver.current_window_handle
for elem in wait.until(EC.presence_of_all_elements_located((By.XPATH, "//tr[@class='record-ap-phip']//a[contains(.,'Full Version')]"))):
    handles_before = driver.window_handles
    elem.click()
    wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='warning-statement-dialog']//label[@for='warning-statement-accept']"))).click()
    wait.until(EC.presence_of_element_located((By.XPATH, "//*[@id='warning-statement-dialog']//a[contains(@class,'btn-ok')]"))).click()
    # wait until the PDF tab has actually opened before switching to it
    wait.until(EC.new_window_is_opened(handles_before))
    driver.switch_to.window([window for window in driver.window_handles if window != current][0])
    print(driver.current_url)
    driver.close()
    driver.switch_to.window(current)
driver.quit()
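If you do want the downloading step inside that loop, one option (a sketch, not tested against the site) is to fetch the PDF URL you just printed with requests, reusing the browser's cookies, right after print(driver.current_url):
import requests

session = requests.Session()
# copy the Selenium session cookies so the request is made as the same "user"
for cookie in driver.get_cookies():
    session.cookies.set(cookie["name"], cookie["value"])
pdf_url = driver.current_url
response = session.get(pdf_url, timeout=60)
with open(pdf_url.rstrip("/").split("/")[-1], "wb") as f:
    f.write(response.content)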
There are several issues here:
"checkbox" locator is wrong.
Your current code will download the first PDF file only.
It is preferable to use expected_conditions explicit waits instead of an implicit wait.
This should work better:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome('/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/chromedriver',options=chrome_options)
wait = WebDriverWait(driver, 20)
driver.maximize_window()
start_address = "https://www1.hkexnews.hk/app/appyearlyindex.html?lang=en&board=mainBoard&year=2021"
driver.get(start_address)
PDF_link = wait.until(EC.visibility_of_element_located((By.XPATH, "//a[contains(text(),'Full Version')]")))
print("Now clicking...'", PDF_link.text,"'")
PDF_link.click()
checkbox = wait.until(EC.visibility_of_element_located((By.XPATH, "//div[./label[@for='warning-statement-accept']]//input")))
print("Now clicking...", checkbox.text)
checkbox.click()