I'm trying to use a multi-threaded strategy with Selenium. In short, I'm trying to fill in an input field with ids.
This is my script :
from concurrent.futures import ThreadPoolExecutor
from selenium.webdriver.common.by import By
import numpy as np
import sys
from selenium import webdriver
def driver_setup():
    """Launch Firefox through the local geckodriver binary and return the driver."""
    firefox_options = webdriver.FirefoxOptions()
    # firefox_options.add_argument('--headless')
    return webdriver.Firefox(options=firefox_options, executable_path="geckodriver.exe")
def fetcher(id, driver):
    """Type *id* into the ``30_user_id`` input of the page open in *driver*.

    Args:
        id: The id to enter into the field.
        driver: Selenium WebDriver used to locate the input.
    """
    print(id) #this works
    # this doesnt work
    # XPath selects attributes with '@'; '#name' is not valid XPath.
    driver.find_element(By.XPATH, '//input[@name="30_user_id"]').send_keys(id)
    # NOTE: 'i' is not defined in this function, so this line raises
    # NameError. It is left as posted because it is the error the answer
    # to this question diagnoses.
    print(i, " sent")
    #return data
def crawler(ids):
    """Call fetcher once per id, starting a new browser for every id."""
    for current_id in ids:
        browser = driver_setup()
        results = fetcher(current_id, browser)
# Four browsers are started up front; crawler opens its own sessions, so
# these are only closed again at the end of the script.
drivers = []
for _ in range(4):
    drivers.append(driver_setup())
ids = list(range(50))  # generates ids
chunks = np.array_split(np.array(ids), 4)  # splits the id list into 4 chunks
with ThreadPoolExecutor(max_workers=4) as executor:
    bucket = executor.map(crawler, chunks)
    #results = [item for block in bucket for item in block]
for driver in drivers:
    driver.quit()
Everything seems to work except the send_keys method. Both print() calls work, so it seems the ids are passed to both functions. Weirdly, I don't get an error message (I only get PyCharm's "Process finished with exit code 0" notice), so I don't know what I'm doing wrong.
Any idea what is missing ?
I used this example : https://blog.devgenius.io/multi-threaded-web-scraping-with-selenium-dbcfb0635e83 if it helps
When using threading, watch out for exceptions: they are stored with the worker's result instead of being raised right away.
For example, change your code to the tweaked version below (don't change any other line yet):
with ThreadPoolExecutor(max_workers=4) as executor:
    bucket = executor.map(crawler, chunks)
    # executor.map() returns an iterator over the workers' results; an
    # exception raised inside a worker is re-raised only when that result
    # is retrieved from the iterator
    for e_buck in bucket: # consuming the results surfaces the hidden errors
        print(e_buck) #
You will see that you get exception errors like:
"i is not defined" - look at the statement print(i, " sent") and print(i) in crawler.
Once you fix the above error, the next error will come from the id passed to send_keys: in send_keys(id), id is of type numpy.int64. Convert it to a string with str(), i.e. send_keys(str(id)).
So your code, after the fixes, will look like this:
from concurrent.futures import ThreadPoolExecutor
from selenium.webdriver.common.by import By
import numpy as np
import sys
from selenium import webdriver
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains as AC
from selenium.webdriver.common.keys import Keys
import time
def driver_setup():
    """Create and return a new Firefox WebDriver session.

    Returns:
        A ``webdriver.Firefox`` instance driven by the ``geckodriver.exe``
        binary resolved relative to the current working directory.
    """
    # Imported here so the function does not depend on an extra file-level import.
    from selenium.webdriver.firefox.service import Service

    options = webdriver.FirefoxOptions()
    # options.add_argument('--headless')
    # Selenium 4 takes the driver location through a Service object; the
    # ``executable_path`` keyword of webdriver.Firefox was deprecated in 4.0
    # and later removed.
    service = Service(executable_path="geckodriver.exe")
    return webdriver.Firefox(options=options, service=service)
def fetcher(id, driver):
    """Type *id* into the ``30_user_id`` input of the page open in *driver*.

    Args:
        id: Id to enter. It is converted with str() because the ids arrive
            as numpy.int64 values and send_keys expects text.
        driver: Selenium WebDriver used to locate the input.
    """
    print(id) #this works
    # XPath selects attributes with '@'; '#name' is not valid XPath.
    field = driver.find_element(By.XPATH, '//input[@name="30_user_id"]')
    field.send_keys(str(id))
    print(id, " sent")
    #return data
def crawler(ids):
    """Run fetcher for every id in *ids*, each in its own browser session.

    Args:
        ids: Iterable of ids assigned to the calling worker thread.
    """
    for current_id in ids:
        driver = driver_setup()
        try:
            fetcher(current_id, driver)
        finally:
            # Close the browser even when fetcher raises; otherwise every
            # id leaves a Firefox process running.
            driver.quit()
#drivers = [driver_setup() for _ in range(4)]
ids = list(range(0,50)) # generates ids
chunks = np.array_split(np.array(ids),4) #splits the id list into 4 chunks
with ThreadPoolExecutor(max_workers=4) as executor:
    bucket = executor.map(crawler, chunks)
    # executor.map() returns an iterator over the workers' results; an
    # exception raised inside a worker is re-raised only when that result
    # is retrieved, so iterate over it to see the errors
    for e_buck in bucket: # added for the demo
        print(e_buck) # check what is printed; with the question's code the first run reports
        # "i is not defined" - look at the statement print(i, " sent") and print(i) in crawler.
        # Once that is fixed, the next error comes from send_keys(id): id is of type
        # numpy.int64, so convert it with str(), i.e. send_keys(str(id))
#results = [item for block in bucket for item in block]
#[driver.quit() for driver in drivers]
Possibly you are trying to invoke send_keys() too early, before the <input> field has rendered completely.
Ideally, to send a character sequence to the element, you need to induce WebDriverWait for element_to_be_clickable(), and you can use any of the following locator strategies:
Using NAME:
# Waits up to 20 seconds for the input to become clickable, then types the id
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.NAME, "30_user_id"))).send_keys(id)
# Using CSS_SELECTOR:
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='30_user_id']"))).send_keys(id)
Using XPATH:
# XPath selects attributes with '@'; '#name' is not valid XPath
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//input[@name='30_user_id']"))).send_keys(id)
Note: You have to add the following imports :
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
I'm trying to create a script that shows only Pikachus on the Singapore Poke Map; the rest of the code goes over the elements, gets the coordinates of each one and prints the list.
I have been trying many of the suggestions I've seen here for a long time, but I am still unable to get the checkbox set with the latest code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time
def find_pokemon():
    """Open Chrome and try to filter the map down to the Pikachu checkbox."""
    links = []
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    # Open the filter panel, then clear every selection
    for element_id in ('filter_link', 'deselect_all_btn'):
        driver.find_element(By.ID, element_id).click()
    search_box = driver.find_element(By.ID, 'search_pokemon')
    search_box.send_keys("pika")
    filter_frames = driver.find_elements(By.ID, "filter")
    driver.switch_to.frame(filter_frames)
    driver.find_element(By.ID, 'checkbox_25').click()
The second part of the code works when I check the box manually after putting a breakpoint and ignoring the exception from the checkbox click().
Do you have any suggestions for what I can try?
Bonus question: how can I detect and close the donate view:
There are several problems with your code:
There is no element with ID = 'search_pokemon'
There is no frame there to switch into it.
You need to use WebDriverWait expected_conditions to wait for elements to be clickable.
And generally you need to learn how to create correct locators.
The following code works:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
# Raw string: '\w' and '\c' are not valid escape sequences in a normal literal
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, service=webdriver_service)
wait = WebDriverWait(driver, 30)
url = "https://sgpokemap.com/index.html?fbclid=IwAR2p_93Ll6K9b923VlyfaiTglgeog4uWHOsQksvzQejxo2fkOj4JN_t-MN8"
# Load the map first; without this every wait below runs against a blank page
driver.get(url)
# Dismiss the donation dialog, open the filter panel and clear all selections
wait.until(EC.element_to_be_clickable((By.ID, 'close_donation_button'))).click()
wait.until(EC.element_to_be_clickable((By.ID, 'filter_link'))).click()
wait.until(EC.element_to_be_clickable((By.ID, "deselect_all_btn"))).click()
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "[name='search_pokemon']"))).send_keys("pika")
# Only the checkbox row without a style attribute is selected; click its label
wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='filter_checkbox'][not(@style)]//label"))).click()
The result is:
This time I saw the donation dialog, so I added the mechanism to close it.
I still can't see an element with ID = 'search_pokemon' there, as you mentioned.
As for the XPath used to find the relevant checkbox: when a pokemon name is entered you can see in the dev tools that there are a lot of checkboxes there, but all of them are invisible while only one, in our case, is visible. The invisible elements all have the attribute style="display: none;" while the enabled element does not have a style attribute. This is why [not(@style)] is there. So, I'm looking for the parent element //div[@class='filter_checkbox'] that also has no style attribute - in XPath words, //div[@class='filter_checkbox'][not(@style)] - and then I just look for its label child to click it. This can be done with CSS selectors as well.
The list of invisible elements with the enabled one:
With the help and answers from @Prophet, the current code for crawling the map and getting all of the Pikachu coordinates:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from keep import saveToKeep
def find_pokemon():
    """Collect the coordinates behind the Pikachu markers shown on the map.

    Closes the donation dialog, filters the map down to Pikachu, reads the
    href of the "Maps" link once per marker icon and hands the extracted
    coordinate strings to saveToKeep when more than one was found.
    """
    links = []
    options = Options()
    # Raw string: '\w' and '\c' are not valid escape sequences in a normal literal
    webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
    driver = webdriver.Chrome(options=options, service=webdriver_service)
    wait = WebDriverWait(driver, 30)
    # NOTE(review): no driver.get(...) call appears in this snippet; the map
    # page has to be loaded before the waits below can succeed.
    wait.until(EC.element_to_be_clickable((By.ID, 'close_donation_button'))).click()
    wait.until(EC.element_to_be_clickable((By.ID, 'filter_link'))).click()
    wait.until(EC.element_to_be_clickable((By.ID, "deselect_all_btn"))).click()
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "[name='search_pokemon']"))).send_keys("pika")
    # XPath selects attributes with '@'; only the row without a style attribute is selected
    wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='filter_checkbox'][not(@style)]//label"))).click()
    # count = 0
    # The icons are not always shown right away, so wait for the first one
    wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'pokemon_icon_img')))
    pokeList = driver.find_elements(By.CLASS_NAME, 'pokemon_icon_img')
    for poke in pokeList:
        # count += 1
        # NOTE(review): 'poke' is not used in the visible code - presumably
        # each icon has to be activated before its "Maps" link exists; confirm.
        try:
            links.append(driver.find_element(By.LINK_TEXT, "Maps").get_attribute('href'))
        except Exception:
            # The handler body is missing from the snippet; skipping the
            # icon keeps the remaining ones processable.
            continue
        # if count > 300:
        #     break
    # The coordinates are the part of the href after the first '='
    res = []
    for link in links:
        res.append(link.split("=")[1].replace("'", ""))
    # for item in res:
    #     print(item)
    if len(res) > 1:
        saveToKeep(res)
if __name__ == '__main__':
    # Run the crawl when the file is executed as a script
    find_pokemon()
Used the headless Chrome option in the hope of achieving better performance.
Commented out 'count' in case I want to limit the list of results
(currently I'm getting about 15 results at most when unlimited,
although there are many more... weird :( ).
The call wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'pokemon_icon_img'))) is needed for now since the icons are not always
shown right away, so it's either that or adding a constant
time delay.
I have made this method recursive in case it's unsuccessful (sometimes it still throws exceptions).
Lastly, saveToKeep(res) is a simple method I'm using to open
my Google Keep notes and write the results into them. I needed to get an app
password from the Google security settings, and I'm using it with my Google account credentials to log in.
Any comments or regards for improvements are welcomed :D
Apologies in advance if this long question seems quite basic!
search query link in a library website:
url = 'https://digi.kansalliskirjasto.fi/search?query=economic%20crisis&orderBy=RELEVANCE'
I'd like to extract all useful information for each individual search result (total 20 in 1 page) of this specific query as depicted by red rectangles in this figure:
currently, I have the following code:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
def run_selenium(URL):
    """Open *URL* in headless Chrome and print the inner HTML of each result node.

    Args:
        URL: Address of the search-results page to load.
    """
    options = Options()
    # The ``headless`` attribute of Options was deprecated and later removed
    # in Selenium 4; the command-line switch works on every version.
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    # Load the page; without this the lookup below runs against a blank tab
    driver.get(URL)
    pt = "//app-digiweb/ng-component/section/div/div/app-binding-search-results/div/div"
    medias = driver.find_elements(By.XPATH, pt) # expect to obtain a list with 20 elements!!
    print(medias) # >>>>>> result: []
    for i, v in enumerate(medias):
        print(i, v.get_attribute("innerHTML"))
if __name__ == '__main__':
    url = 'https://digi.kansalliskirjasto.fi/search?query=economic%20crisis&orderBy=RELEVANCE'
    # Run the scrape for the search query above
    run_selenium(URL=url)
Having a look at part of the inspect in chrome:
I have tried several XPaths generated by the Chrome extensions XPath Helper and SelectorsHub, using each one as the pt variable in my Python code for this library search engine, but the result is [] or simply nothing.
Using SelectorsHub and hovering the mouse over Rel XPath, I get this warning: id & class both look dynamic. Uncheck id & class checkbox to generate rel xpath without them if it is generated with them.
Assuming selenium as a tool for web scraping of a page containing dynamic attributes instead of BeautifulSoup as recommended here and here, shouldn't driver.find_elements(), return a list of 20 elements each of which containing all info and to be extracted?
>>>>> UPDATE <<<<< Working Solution (although time inefficient)
As recommended by @JaSON in the solution, I now use WebDriverWait in a try/except block as follows:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import exceptions
def get_all_search_details(URL):
    """Scrape every search-result row of *URL* and store the parsed results.

    The outer HTML of each ``result-row`` element is passed to
    scrap_newspaper and the return value is stored in SEARCH_RESULTS under
    the key ``result_<index>``. Selenium errors are printed, not raised.

    Args:
        URL: Address of the search-results page to load.
    """
    st_t = time.time()
    options = Options()
    # The ``headless`` attribute of Options was deprecated and later removed
    # in Selenium 4; the command-line switch works on every version.
    options.add_argument("--headless")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    # Bound before the try so the summary line also works when the wait fails
    medias = []
    try:
        # Load the page before waiting for its dynamically rendered rows
        driver.get(URL)
        print(f"Scraping {driver.current_url}")
        medias = WebDriverWait(driver, timeout=10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'result-row')))
        for media_idx, media_elem in enumerate(medias):
            outer_html = media_elem.get_attribute('outerHTML')
            result = scrap_newspaper(outer_html) # some function to retrieve results
            # NOTE(review): SEARCH_RESULTS is not defined in this snippet;
            # presumably a module-level dict - confirm.
            SEARCH_RESULTS[f"result_{media_idx}"] = result
    except exceptions.WebDriverException as e:
        # StaleElementReference, NoSuchElement, Timeout and SessionNotCreated
        # all derive from WebDriverException and printed this same message,
        # so one handler covers them (the SessionNotCreated branch placed
        # after WebDriverException could never run).
        print(f"Selenium: {type(e).__name__}: {e.args}")
    except Exception as e:
        print(f"Selenium: {type(e).__name__} line {e.__traceback__.tb_lineno} of {__file__}: {e.args}")
        print(f"Selenium General Exception: {URL}")
    finally:
        # Always release the browser process
        driver.quit()
    print(f"\t\tFound {len(medias)} media(s) => {len(SEARCH_RESULTS)} search result(s)\tElapsed_t: {time.time()-st_t:.2f} s")
if __name__ == '__main__':
    # Same search query as in the first version of the script
    url = 'https://digi.kansalliskirjasto.fi/search?query=economic%20crisis&orderBy=RELEVANCE'
    get_all_search_details(URL=url)
This approach works but seems to be very time consuming and inefficient:
Found 20 media(s) => 20 search result(s) Elapsed_t: 15.22 s
This is an answer to Question #2 only, since #1 and #3 (as Prophet has already said in a comment) are not valid for SO.
Since you're dealing with dynamic content, a plain find_elements call is not what you need. Try to wait for the required data to appear:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Block for up to 10 seconds until elements with class 'media' are present
medias = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located(
        (By.CLASS_NAME, 'media')
    )
)
Above the search results there is an option to download them as an Excel file, which contains the newspaper/journal metadata and the text surrounding the search hit. Could that be easier to use than scraping individual elements? (The Excel file contains only the first 10,000 hits, though...)
I am new to coding, using stackoverflow for the first time. Wondering if I can get some help on this here.
I am trying to scrape the total no.of jobs given on this link.
Following is my code.
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
# Append the virtualenv's bin directory to PATH (presumably where the
# browser driver lives). os.pathsep keeps it separate from the existing
# last entry instead of gluing the two paths together.
os.environ['PATH'] += os.pathsep + "/Users/monicayadav/PycharmProjects/pythonProject4/selenium/venv/bin"
driver = webdriver.Firefox()
# NOTE(review): no driver.get(...) call appears in this snippet; the jobs
# page has to be loaded before the wait below can succeed.
wait = WebDriverWait(driver, 10)
# XPath selects attributes with '@'; '#class' is not valid XPath
JobCountBESTBUY = wait.until(ec.presence_of_element_located((By.XPATH, "//p[contains(@class, 'font-wt-500 ng-binding')]"))).text
# Produces the output quoted below the script
print(JobCountBESTBUY)
Output I am getting
jobs found
Process finished with exit code 0
I am getting only "jobs found" as a result, but I need the number (1,925) in front of it as well.
Solution 1 - The easier one
Use time.sleep(seconds) to wait for the page to load the results completely. It's going to be something like the following. Don't forget to import time.
import time
# ... Removed code for simplicity ...
# Give the page time to fill in the job count before reading the text.
# NOTE(review): the delay is not shown in this snippet; 5 seconds is a
# placeholder to tune for the page.
time.sleep(5)
wait = WebDriverWait(driver, 10)
# XPath selects attributes with '@'; '#class' is not valid XPath
JobCountBESTBUY = wait.until(ec.presence_of_element_located((By.XPATH, "//p[contains(@class, 'font-wt-500 ng-binding')]"))).text
Solution 2 - The faster one
On the other hand, time.sleep spends too much time waiting even though the text is ready already. Another approach is to search for the text itself like the following. The advantage is that as soon as a match is found the wait is over and is possible to return the number directly.
import re
# ... Removed code for simplicity ...
# XPath selects attributes with '@'; '#class' is not valid XPath
count_locator = (By.XPATH, "//p[contains(@class, 'font-wt-500 ng-binding')]")
WebDriverWait(driver, 10).until(ec.presence_of_element_located(count_locator))
# Matches `1,234`, `1`, `12`, `1,234,567`
r = re.compile(r'^([0-9,]+).*$')
# The lambda returns a falsy value until the element text starts with a
# number, so the wait keeps polling; once it matches, the number is returned.
JobCountBESTBUY = WebDriverWait(driver, 10).until(
    lambda _: (e := driver.find_element(*count_locator))
    and (m := r.match(e.text))
    and m.group(1)
)
I have done a little web scraping before, but I have no knowledge of JavaScript. I want to scrape the "Company Name" and the "description of the company" from https://www.ces.tech/Show-Floor/Exhibitor-Directory.aspx. I am using Selenium for scraping, but I don't want a browser running in the background. I wrote some code here:
from selenium.webdriver.common.by import By
from selenium import webdriver
import os
# Starts Chrome and looks up the name/description nodes of every exhibitor card.
op = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=op)
# NOTE(review): no driver.get(...) call appears in this snippet before the
# lookup below - confirm whether the page load was dropped.
company = []
items = driver.find_elements(By.CLASS_NAME, "exhibitorCardModal")
for item in items:
    # find_elements returns a list of matches, so comp and desc are lists
    comp=item.find_elements(By.CLASS_NAME, "company-name")
    desc = item.find_elements(By.CLASS_NAME, "description")
    # NOTE(review): the dict literal is cut off at this point in the snippet
    result_dict = {
But got empty list. Can someone tell me what is wrong here. I also try to use there api https://www.ces.tech/api/Exhibitors?searchTerm=&sortBy=alpha&alpha=&state=&country=&venue=&exhibitorType=&pageNo=1&pageSize=30 but got this error :
{"error":{"code":"ApiVersionUnspecified","message":"An API version is required, but was not specified."}}
You also have to add a wait / delay before accessing the elements, to let the page load completely before you try to access them.
You should use find_element instead of find_elements for the loop internal commands:
# The two lookups from the question's loop; find_elements returns a list of matches
comp=item.find_elements(By.CLASS_NAME, "company-name")
desc = item.find_elements(By.CLASS_NAME, "description")
So your code should be something like this:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
import os
import time
op = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=op)
# NOTE(review): no driver.get(...) call appears in this snippet; the page
# has to be loaded before the wait below can succeed.
wait = WebDriverWait(driver, 20)
# Block (up to 20 seconds) until an exhibitor card is present in the DOM
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "exhibitorCardModal")))
company = []
items = driver.find_elements(By.CLASS_NAME, "exhibitorCardModal")
for item in items:
    # find_element returns the single matching child of this card
    comp=item.find_element(By.CLASS_NAME, "company-name")
    desc = item.find_element(By.CLASS_NAME, "description")
    # NOTE(review): the dict literal is cut off at this point in the snippet
    result_dict = {
I've been trying to scrape data from a table using selenium, but when I run the code, it only gets the header of the table.
from selenium import webdriver
driver = webdriver.Chrome()
# NOTE(review): no driver.get(...) call appears in this snippet.
# Absolute XPath to the table's <tbody>
table = driver.find_element_by_xpath('/html/body/div[1]/div[2]/div/div[2]/div/div/div[2]/div[2]/div[3]/table/tbody')
I also tried finding element by tag name using table, without luck.
you should try this:
from selenium import webdriver
driver = webdriver.Chrome()
# NOTE(review): no driver.get(...) call appears in this snippet; the page
# has to be loaded before the lookups below.
table = driver.find_element_by_xpath('/html/body/div[1]/div[2]/div/div[2]/div/div/div[2]/div[2]/div[3]/table/tbody')
# The rows are addressed as .../tr[2], .../tr[3], ...; walk them one at a
# time. 'number' was used without being defined - the loop follows the
# description given with this snippet (number < 12).
number = 2
while number < 12:
    # XPath selects attributes with '@'; '#id' is not valid XPath
    content = driver.find_element_by_xpath('//*[@id="body"]/div/div[2]/div/div/div[2]/div[2]/div[3]/table/tbody/tr['+str(number)+']')
    print(content.text)
    number += 1
The XPath in 'table' matches just the header; the actual content is at '//*[@id="body"]/div/div[2]/div/div/div[2]/div[2]/div[3]/table/tbody/tr['+str(number)+']', which is why you are not getting any content other than the header. Since the XPaths of the rows look like ...../tr[2], ...../tr[3], ...../tr[4], etc., I'm looping while number < 12 to get all the rows; you can also try 50 rows at a time, it's up to you.
I would use requests and mimic the POST request by the page as much faster
import requests
# Form fields the page itself sends when it loads the table
data = {'METHOD': '0','VALUE': '{"BusquedaRubros":"true","IdRubro":"41","Inicio":0}'}
# 's' was used without ever being created; a Session is opened here and
# closed again when the block ends
with requests.Session() as s:
    r = s.post('http://www.panamacompra.gob.pa/Security/AmbientePublico.asmx/cargarActosOportunidadesDeNegocio', data=data).json()
You need to wait until the loader disappears; you can use invisibility_of_element_located together with WebDriverWait and expected_conditions. For the table you can use a CSS selector instead of your XPath.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
driver = webdriver.Chrome()
# NOTE(review): no driver.get(...) call appears in this snippet; the page
# has to be loaded before the wait below.
# Wait (up to 50 seconds) until the loading image is no longer visible.
# XPath selects attributes with '@'; '#src' is not valid XPath.
WebDriverWait(driver, 50).until(EC.invisibility_of_element_located((By.XPATH, '//img[@src="images/loading.gif"]')))
# find_element(By.CSS_SELECTOR, ...) replaces find_element_by_css_selector,
# which was removed in Selenium 4
table = driver.find_element(By.CSS_SELECTOR, '.table_asearch.table.table-bordered.table-striped.table-hover.table-condensed')
Selenium is loading the table (happens fairly quickly) and then assuming it is done, since it's never given a chance to load the table rows (happens more slowly). One way around this is to repeatedly try to find an element that won't appear until the table is finished loading.
This is FAR from the most elegant solution (and there's probably Selenium libraries that do it better), but you can wait for the table by checking to see if a new table row can be found, and if not, sleep for 1 second before trying again.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time
driver = webdriver.Chrome()
# NOTE(review): no driver.get(...) call appears in this snippet; the page
# has to be loaded before polling for the rows.
wvar = 0
while(wvar == 0):
    try:
        #try loading one of the elements we want to read
        el = driver.find_element_by_xpath('/html/body/div[1]/div[2]/div/div[2]/div/div/div[2]/div[2]/div[3]/table/tbody/tr[3]')
        wvar = 1
    except NoSuchElementException:
        #not loaded yet
        print('table body empty, waiting...')
        # Sleep for 1 second before trying again, as described above
        time.sleep(1)
print('table loaded!')
#element got loaded; reload the table
table = driver.find_element_by_xpath('/html/body/div[1]/div[2]/div/div[2]/div/div/div[2]/div[2]/div[3]/table/tbody')