I'm trying to scrape information from clickable popups in a table on a website into a pandas DataFrame, using Selenium in Python, and it works as long as the popups contain information.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import pandas as pd
import time
driver = webdriver.Chrome()
driver.get('https://mspotrace.org.my/Sccs_list')
time.sleep(20)
# Select maximum number of entries
elem = driver.find_element_by_css_selector('select[name=dTable_length]')
select = Select(elem)
select.select_by_value('500')
time.sleep(15)
# Get list of elements
elements = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.XPATH, "//a[@title='View on Map']")))
# Loop through element popups and pull details of facilities into DF
pos = 0
df = pd.DataFrame(columns=['facility_name','other_details'])
try:
    for element in elements:
        data = []
        element.click()
        time.sleep(3)
        facility_name = driver.find_element_by_xpath('//h4[@class="modal-title"]').text
        other_details = driver.find_element_by_xpath('//div[@class="modal-body"]').text
        data.append(facility_name)
        data.append(other_details)
        df.loc[pos] = data
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Close'] > span"))).click()  # close popup window
        time.sleep(10)
        pos += 1
except:
    print("No geo location information")
    pass
print(df)
However, there are cases when a window like the one below appears, and I need to click 'OK' on it to resume scraping the other rows on the page, but I can't find the element to click to do this.
For Python, you can try:
driver.switch_to.alert.accept()
Ideally, your scenario should be deterministic, so you know where this popup appears. If you don't know, and it really is random, you can add a hook that checks for the alert after each step, as sketched below.
The Selenium driver provides a way to switch to the alert context and work with it:
driver.switch_to.alert
After that, you can do whatever you want, depending on the alert type. To simulate clicking "OK":
driver.switch_to.alert.accept()
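If the alert can appear at random points, one way to implement that hook (a sketch of my own; the helper name is made up) is to wrap the accept in a try/except and call it after every click:

from selenium.common.exceptions import NoAlertPresentException

def accept_alert_if_present(driver):
    # Accept an unexpected alert if one is open; otherwise do nothing.
    try:
        driver.switch_to.alert.accept()
        return True
    except NoAlertPresentException:
        return False

Calling accept_alert_if_present(driver) right after each element.click() in the scraping loop would let the loop continue past the 'OK' window.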
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome import options
import unittest
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
#link to website
website = 'https://www.ncei.noaa.gov/access/monitoring/climate-at-a-glance'
path = '../chromedriver'  # Folder where the chromedriver is located
driver = webdriver.Chrome(path)
driver.get(website)
driver.maximize_window()
#Selection of the sections where the information I am looking for is located
state_click = driver.find_element(by=By.XPATH, value='//*[@id="show-statewide"]/a').click()
time_series_click = driver.find_element(by=By.XPATH, value='.//*[@id="time-series"]/div[3]/button').click()
#selection of the years (for all files the same range of 1950 - 2021)
start_year_dropdown = Select(driver.find_element(by=By.ID, value='begyear'))
start_year_dropdown.select_by_visible_text('1950')
end_year_dropdown = Select(driver.find_element(by=By.ID, value='endyear'))
end_year_dropdown.select_by_visible_text('2021')
#selection of the parameter to download: Average temperature
parameter_dropdown = Select(driver.find_element(by=By.ID, value='parameter'))
parameter_dropdown.select_by_visible_text('Average Temperature')
#Creating a loop to loop through all the states and all the months:
#state selection
select_state = driver.find_element(by=By.XPATH, value='.//*[@id="state"]')
opcion_state = select_state.find_elements(by=By.TAG_NAME, value='option')
#month selection
select_month = driver.find_element(by=By.XPATH, value='//*[@id="month"]')
opcion_month = select_month.find_elements(by = By.TAG_NAME, value='option')
for option in opcion_month:
    option.click()
    for option in opcion_state:
        option.click()
        time.sleep(3)
        plot = driver.find_element(by=By.XPATH, value='.//input[@id="submit"]').click()
        download = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="csv-download"]'))).click()
        time.sleep(3)
The code runs fine until the plot and download clicks at the end, where it throws an error I cannot resolve. I think it is because, at the moment that command executes, the page does not show the button to plot the graph and download the CSV on screen. I have tried increasing the wait time, but that does not work. Let's see if someone can help me. Thanks in advance!
So the problem you are having with downloading the CSV file is not that the command is wrong (it does work); the issue is that the download button is not visible on the page, which prevents you from clicking it.
A way to get around having to see the element on the page and still click it is the following:
driver.execute_script("arguments[0].click();", driver.find_element(By.XPATH, '//*[@id="csv-download"]'))
This would be the preferred method; otherwise you would have to scroll until the button is visible on the page and then click it. The page scrolls when you try to click a button that is out of view, which you can handle as follows (but the previous method is preferred, as it is cleaner and does not take any additional time):
from selenium.common.exceptions import ElementClickInterceptedException
download_btn = driver.find_element(By.XPATH, '//*[@id="csv-download"]')
try:
    download_btn.click()  # the first click may throw while the page scrolls the button into view
except ElementClickInterceptedException:  # catch the exception and click again
    download_btn.click()
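Another way to bring the button into view first (my own suggestion, not part of the answer above) is to scroll it there with JavaScript before clicking:

download_btn = driver.find_element(By.XPATH, '//*[@id="csv-download"]')
driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", download_btn)  # center the button in the viewport
download_btn.click()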
Other comments on your code
You have option.click() twice in your code because both loops use the same iterator name, so it is unclear whether the month or the state option is being clicked. I would suggest renaming your iterator variables appropriately so that you know which options are being clicked, as in the sketch below.
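For example, a minimal rename of your nested loops (same logic as your code, just clearer names):

for month_option in opcion_month:
    month_option.click()
    for state_option in opcion_state:
        state_option.click()
        time.sleep(3)
        driver.find_element(by=By.XPATH, value='.//input[@id="submit"]').click()
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="csv-download"]'))).click()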
The goal: select an option from a dropdown menu; then, when a list gets populated below it, click each entry iteratively and scrape all the given data. Thankfully the classes have proper ID names, so it should be doable, but I am facing some issues, described below.
You can understand it better if you visit the website: www.psx.com.pk/psx/resources-and-tools/listings/listed-companies
Messy code:
chromedriver = "chromedriver.exe"
driver = webdriver.Chrome(chromedriver)
driver.get("https://www.psx.com.pk/psx/resources-and-tools/listings/listed-companies")
select = Select(driver.find_element_by_id("sector"))
for opt in select.options:  # this will loop through all the dropdown options from the site
    opt.click()  # in the source code, the table class gets populated here
    table = driver.find_elements_by_class_name("addressbook")
    for index in range(len(table)):
        # if index % 2 == 0:
        elem = table[index].text
        print(elem)
        elem.click()
        data = driver.find_elements_by_class_name("addressbookdata")
        print(data)
If you run this code, the output is very erratic. If everything worked correctly I would get index/company names in my table text variable, so I thought a quick and dirty way to get just the IDs would be to take every second index (% 2) instead of populating a DataFrame first and then dropping the duplicates. After I've gotten all the IDs, I need to click on each of them, then extract the data from the addressbookdata elements and append it to a DataFrame. I don't think there is any logical problem in my code, but I can't make it work. It's also my first time using Selenium; I am much more comfortable with BeautifulSoup.
Select the dropdown option by value, then pull the table data with Selenium and pandas:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import pandas as pd
driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://www.psx.com.pk/psx/resources-and-tools/listings/listed-companies'
driver.get(url)
driver.maximize_window()
wait = WebDriverWait(driver,30)
#select from dropdown pop up option
Select(wait.until(EC.visibility_of_element_located((By.XPATH, "//select[@id='sector']")))).select_by_value("0801")
dptable = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@class="table-responsive"]'))).get_attribute("outerHTML")
df = pd.read_html(dptable)[0]  # read_html returns a list of DataFrames
print(df)
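If you want every sector rather than just value "0801", you can iterate the dropdown values the same way (a sketch along the same lines as above; the variable names are my own):

sector_select = Select(wait.until(EC.visibility_of_element_located((By.XPATH, "//select[@id='sector']"))))
values = [o.get_attribute("value") for o in sector_select.options if o.get_attribute("value")]
frames = []
for value in values:
    sector_select.select_by_value(value)
    table_html = wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@class="table-responsive"]'))).get_attribute("outerHTML")
    frames.append(pd.read_html(table_html)[0])
df = pd.concat(frames, ignore_index=True)
print(df)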
My code goes into this website, and clicks on each row of a table.
These rows open a window containing a field "keywords" at the bottom, which I am trying to get.
I don't think I have the XPath for this field right, though all I did was right-click and "Copy XPath".
Expected output is to print the value in the keywords.
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import requests
driver = webdriver.Chrome()
driver.get('https://aaaai.planion.com/Web.User/SearchSessions?ACCOUNT=AAAAI&CONF=AM2021&USERPID=PUBLIC&ssoOverride=OFF')
time.sleep(3)
page_source = driver.page_source
soup = BeautifulSoup(page_source,'html.parser')
eachRow = driver.find_elements_by_class_name('clickdiv')
for item in eachRow:
    item.click()  # opens the new window per each row
    time.sleep(2)
    faculty = driver.find_element_by_xpath("//td[@valign='MIDDLE']/b")  # this works fine
    keywords = item.find_element_by_xpath('//*[@id="W1"]/div/div/div/div[2]/div[2]/table/tbody/tr[5]/td/text()')  # this does not
    print(keywords.text)
    print(faculty.text)
    driver.find_element_by_class_name('XX').click()  # closes window
ERROR message - selenium.common.exceptions.InvalidSelectorException: Message: invalid selector: The result of the xpath expression "//*[@id="W1"]/div/div/div/div[2]/div[2]/table/tbody/tr[5]/td/text()" is: [object Text]. It should be an element.
Your XPath '//*[@id="W1"]/div/div/div/div[2]/div[2]/table/tbody/tr[5]/td/text()' is getting the actual text of the element, not the element itself. Just remove text() and it should work. XPath is notoriously finicky, so be careful how you use it and how you define your top-level root.
I reworked your code very slightly here to use a better option than time.sleep(): explicit waits. It isn't strictly necessary, but it tends to make code more reliable and speeds up processing; it doesn't pause the process unless it has to, and only for as long as necessary.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
driver.get('https://aaaai.planion.com/Web.User/SearchSessions?ACCOUNT=AAAAI&CONF=AM2021&USERPID=PUBLIC&ssoOverride=OFF')
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'clickdiv')))
eachRow = driver.find_elements_by_class_name('clickdiv')
for item in eachRow:
    item.click()  # opens the new window per each row
    faculty = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//td[@valign='MIDDLE']/b")))
    keywords = item.find_element_by_xpath('//*[@id="W1"]/div/div/div/div[2]/div[2]/table/tbody/tr[5]/td')
    print(keywords.text)
    print(faculty.text)
    driver.find_element_by_class_name('XX').click()  # closes window
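One extra hardening step you might add (my addition, not part of the rework above): wait for the popup to disappear before the next row is clicked, so the following item.click() isn't intercepted by the closing modal:

driver.find_element_by_class_name('XX').click()  # closes window
WebDriverWait(driver, 10).until(EC.invisibility_of_element_located((By.CLASS_NAME, 'XX')))  # wait until the close button is gone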
Sup, I'm trying to extract some data tables from a website (https://www.anbima.com.br/pt_br/informar/curvas-de-juros-fechamento.htm), but as we can see, the data is inside an iframe. Since I'm not an expert at web scraping, it took me a while to click the button "Consultar" and reach the page that I want. Basically, it loads the data (4 tables), which is inside an iframe too.
The problem is that I still haven't managed to get the tables; maybe it's because of the iframe.
For example, I tried to use XPath on the first table, without success:
driver.find_element_by_xpath('//*[@id="Parametros"]/table').text
Here's the code to reach the page that I mentioned:
from selenium import webdriver
import time
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as expectedCondition
from selenium.webdriver.chrome.options import Options
import pandas as pd
import numpy as np
#----------------------- SCRAPING INITIALIZATION -----------------------------#
want_to_scrape = True
if want_to_scrape:
    options = Options()
    #options.add_argument('--headless')
    driver = webdriver.Chrome("C:\\Users\\......\\chromedriver.exe", options=options)
    now = time.time()
    dataset_list = []
    url = 'https://www.anbima.com.br/pt_br/informar/curvas-de-juros-fechamento.htm'
    driver.get(url)
    #element = driver.find_element_by_class_name('full')
    #driver.switch_to.frame(element)
    driver.switch_to.frame(0)
    element = driver.find_elements_by_name('Consultar')
    element[0].click()
    time.sleep(1)
    try:
        alert = driver.switch_to.alert
        alert.accept()
        print("alert accepted")
    except:
        print("no alert")
    time.sleep(1)
    driver.switch_to.frame(0)
    driver.find_element_by_xpath
Try replacing your driver.switch_to.frame(0) line with this:
# Get the iframe element - note, you may need to use a more specialized selector here
iframe = driver.find_element_by_tag_name('iframe')
driver.switch_to.frame(iframe)
That will get your driver into the frame context so you can fetch the tables. You may need to use a different selector to get the iframe element. If you post the iframe HTML here, I can help you write a selector.
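If the iframe itself loads late, a common pattern (my suggestion, not from the answer above) is to wait for it and switch in one step:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

WebDriverWait(driver, 10).until(
    EC.frame_to_be_available_and_switch_to_it((By.TAG_NAME, 'iframe'))
)
tables = driver.find_elements_by_tag_name('table')  # lookups now resolve inside the frame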
I've written a script in Python, in combination with Selenium, to get some information from a webpage. To get the content it is necessary to do some clicks and fill in an input box to produce the result. The result gets displayed in a new tab, so it is necessary to switch to that particular window to parse the information. My script can do all this very efficiently.
It's a follow-up post of this one.
The problem I'm facing is that I'm using a few keywords to get information for. When a single keyword is used, driver.close() might do the trick, but since I'm using multiple keywords, after driver.close() there is no window left to move along with.
Question: how can I make the scraper close the new tab (when it grabs information from there) and switch back to main window to do the process cyclically until there is no more keyword to check?
This is my script so far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
link = "https://officialrecords.broward.org/AcclaimWeb/search/SearchTypeName"
def get_information(driver,url):
    for keyword in ['HMC DESIGN GROUP','HMC DESIGN GROUP']:
        driver.get(url)
        current = driver.current_window_handle
        wait.until(EC.element_to_be_clickable((By.ID, "btnButton"))).click()
        wait.until(EC.presence_of_element_located((By.ID,"SearchOnName"))).send_keys(keyword)
        wait.until(EC.presence_of_element_located((By.ID, "btnSearch"))).click()
        wait.until(EC.element_to_be_clickable((By.XPATH, "//td[contains(., 'HMC DESIGN GROUP')]"))).click()
        wait.until(EC.new_window_is_opened)
        driver.switch_to.window([window for window in driver.window_handles if window != current][0])
        for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,".listDocDetails"))):
            print(items.text)
        # driver.switch_to.default_content()
        # driver.close()
if __name__ == "__main__":
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver,10)
    try:
        get_information(driver,link)
    finally:
        driver.quit()
Try to implement the solution below:
def get_information(driver,url):
    driver.get(url)
    current = driver.current_window_handle
    wait.until(EC.element_to_be_clickable((By.ID, "btnButton"))).click()
    for keyword in ['HMC DESIGN GROUP','HMC DESIGN GROUP']:
        input_field = wait.until(EC.presence_of_element_located((By.ID,"SearchOnName")))
        input_field.clear()
        input_field.send_keys(keyword)
        wait.until(EC.presence_of_element_located((By.ID, "btnSearch"))).click()
        wait.until_not(EC.visibility_of_element_located((By.ID, "SearchingWaitImg")))
        wait.until(EC.element_to_be_clickable((By.XPATH, "//td[contains(., '%s')]" % keyword))).click()
        driver.switch_to.window([window for window in driver.window_handles if window != current][0])
        for items in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,".listDocDetails"))):
            print(items.text)
        driver.close()
        driver.switch_to.window(current)
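The key points: driver.close() closes only the current tab, and driver.switch_to.window(current) returns to the saved main handle, so each keyword iteration starts from the main window again. If you like, the pattern factors out into a small helper (a sketch of my own, not part of the solution above):

from contextlib import contextmanager

@contextmanager
def in_new_window(driver, main_handle):
    # Switch to the window that is not the main one, yield to the caller, then clean up.
    driver.switch_to.window([w for w in driver.window_handles if w != main_handle][0])
    try:
        yield
    finally:
        driver.close()  # closes only the current tab
        driver.switch_to.window(main_handle)  # back to the main window

With that, the loop body after the row click becomes: with in_new_window(driver, current): print each .listDocDetails element's text.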