I have a list of pages to crawl using selenium
Let's say the website is example.com/1...N (up to unknown size)
from concurrent.futures import ThreadPoolExecutor, as_completed

from selenium import webdriver  # needed for webdriver.Chrome below
from webdriver_manager.chrome import ChromeDriverManager


def crawl_example(page):
    """Open example.com/<page> in its own browser and return the scraped data.

    Each call creates a fresh driver, so the function is safe to run from a
    thread pool (Selenium drivers are not thread safe).
    """
    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        # driver.get needs a full URL including the scheme.
        driver.get(f"https://example.com/{page}")
        # Do some processing
        result = "Fetched data"
        return result
    finally:
        driver.quit()  # don't leak one browser process per page


N_THREAD = 10
MAX_SIZE = 100

with ThreadPoolExecutor(N_THREAD) as ex:
    futures = [ex.submit(crawl_example, page) for page in range(MAX_SIZE)]
Setting MAX_SIZE causes unnecessary requests for pages after N, so I wanted to find a better solution.
I could only think of creating a global variable (is_done) or add another parameter to the function.
What would be the most pythonic approach to solve the above issue?
Initialize a last_page variable to be infinity (preferably in a class variable)
And update and crawl with the following logic would be good enough
Since two threads can update last_page at the same time,
prevent a higher page number from overwriting a last_page value that a lower page already set
from threading import Lock

last_page = float("inf")  # lowest page index found empty so far; inf = none yet
last_page_lock = Lock()


def crawl_page(page):
    """Crawl *page*, skipping it if a lower page was already found empty."""
    global last_page
    # Unlocked read is fine here: a stale value only costs one extra request.
    if page > last_page:
        return  # `continue` is only legal inside a loop; return skips the page
    if page_empty():  # placeholder from the question: "page is past the end"
        # A Lock is itself a context manager -- calling it (last_page_lock())
        # raises TypeError.  min() keeps a lower empty page from being
        # overwritten by a higher one whose thread finished later.
        with last_page_lock:
            last_page = min(last_page, page)
        return
    ...
I'm using Selenium to capture screenshots of a web page. It works great on sites like stackoverflow but I'm trying to use it on a page that never stops loading. Is there a way to grab the screenshot after x seconds regardless if it's done or not?
Current code:
import os
from selenium import webdriver


def main():
    """Save a screenshot of every URL listed (one per line) in test.txt."""
    os.makedirs('Screenshots', exist_ok=True)  # save_screenshot fails if missing
    driver = webdriver.Chrome()
    try:
        with open('test.txt', 'r') as f:
            for line in f:
                url = line.strip()
                if not url:
                    continue  # skip blank lines
                # The original passed the raw readlines() entry, so the
                # trailing newline ended up inside the URL given to get().
                driver.get('http://' + url)
                sn_name = os.path.join('Screenshots',
                                       url.replace('/', '-') + '.png')
                print('Attempting to save:', sn_name)
                if not driver.save_screenshot(sn_name):
                    raise Exception('Could not save screen shot: ' + sn_name)
    finally:
        # quit() even when save_screenshot raises, so no browser is leaked.
        driver.quit()


if __name__ == '__main__':
    main()
I think it doesn't work like that.
WebDriver will implicitly wait for the page to finish loading until it times out.
It should give you a timeout exception.
I think you should use try-except to catch that and then take a screenshot.
Otherwise, you should do a multithreading programming for another thread to take a screenshot.
# NOTE(review): this is the question's code, pasted with its indentation
# flattened, so it is not runnable as shown.  Problems visible in the snippet:
import threading
def rand_function1():
#random actions
def rand_function2():
#random actions
# BUG: missing colon after main's parameter list.
def main()
# BUG: rand_function1 / rand_function2 are referenced without parentheses,
# so they are never actually called.
rand_function1
rand_function2
return
if __name__ == '__main__':
url_list = "https://www.rand_urls.com/"
# A single driver is created here, outside the loop, so all ten threads
# share this one browser -- exactly the symptom described below.  Also
# needs `from selenium import webdriver`, which isn't shown.
driver = webdriver.Firefox()
# The loop index `t` is immediately overwritten by the Thread handle,
# and the threads are never joined.
for t in range(10):
t = threading.Thread(target=main)
t.start()
I have this simple program that is trying to open URLs using 10 Firefox web drivers. However, all it does is use one browser and continue to cycle through the URLs in that single browser. I will be using a unique proxy for each browser, so opening tabs won't be an option.
How do I get n threads to run the main function individually using its own Firefox web driver?
According to this and this previous question, selenium is not thread safe.
You should create drivers inside your main, so that every thread has its own driver.
import threading


def rand_function1():
    pass  # random actions


def rand_function2():
    pass  # random actions


def main():
    # Use a different driver for each thread: Selenium drivers are not
    # thread safe, so every worker gets its own browser (and its own proxy).
    driver = webdriver.Firefox()
    try:
        driver.get(url_list)
        rand_function1()  # note the parentheses -- the original referenced
        rand_function2()  # the functions without actually calling them
    finally:
        driver.quit()  # close this thread's browser when its work is done


if __name__ == '__main__':
    url_list = "https://www.rand_urls.com/"
    threads = [threading.Thread(target=main) for _ in range(10)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # wait for all ten browsers to finish
I am using Python 3.4 to make a webscraper that logins to my bank account, clicks into each account copying the balance , adding the total then pasting into google sheets.
I got it working but as you can see from the code, it is repetitive, ugly and long winded.
I have identified a few issues:
I believe I should be using a function to loop through the different account pages to get the balance and then assigning values to a different variable. However I couldn't think of a way of getting this done.
converting the string to float seems messy, what I am trying to do is to make a string ie. $1,000.00 into a float by stripping the '$' and ',' , is there a more elegant way?
# NOTE(review): question code pasted with its indentation flattened;
# annotated with the issues visible in the snippet itself.
from selenium import webdriver
import time
import bs4
import gspread
# Looks like a typo: the oauth2client class is ServiceAccountCredentials
# (capital S) -- confirm against the library.  gspread and this import are
# unused in the excerpt shown.
from oauth2client.service_account import serviceAccountCredentials
driver = webdriver.Chrome()
# `bank url` is a placeholder and not valid Python syntax.
driver.get(bank url)
inputElement = driver.find_element_by_id("dUsername")
inputElement.send_keys('username')
pwdElement = driver.find_element_by_id("password")
pwdElement.send_keys('password')
driver.find_element_by_id('loginBtn').click()
time.sleep(3)
#copies saving account balance
driver.find_element_by_link_text('Savings').click()
time.sleep(3)
html = driver.page_source
# BeautifulSoup without an explicit parser argument emits a warning; these
# four lines are duplicated per account below -- the repetition the asker
# wants to remove.
soup = bs4.BeautifulSoup(html)
elems=soup.select('#CurrentBalanceAmount')
# "$1,000.00" -> 1000.0 by stripping '$' and removing the ',' separators.
SavingsAcc = float(elems[0].getText().strip('$').replace(',',''))
driver.back()
#copy cheque balance
driver.find_element_by_link_text('cheque').click()
time.sleep(3)
html = driver.page_source
soup = bs4.BeautifulSoup(html)
elems=soup.select('#CurrentBalanceAmount')
ChequeAcc = float(elems[0].getText().strip('$').replace(',',''))
# BUG: the line above defines ChequeAcc but this one reads ChequeACC
# (different capitalisation) -- NameError at runtime.
Total = SavingsAcc+ ChequeACC
driver.back()
try the following code:
from selenium import webdriver
import time
import bs4
import gspread
# The oauth2client class name is CamelCase: ServiceAccountCredentials.
from oauth2client.service_account import ServiceAccountCredentials

driver = webdriver.Chrome()
driver.get(bank_url)  # placeholder: substitute the real bank login URL
inputElement = driver.find_element_by_id("dUsername")
inputElement.send_keys('username')
pwdElement = driver.find_element_by_id("password")
pwdElement.send_keys('password')
driver.find_element_by_id('loginBtn').click()
time.sleep(3)


def getBalance(accountType):
    """Open the page for *accountType* and return its balance as a float."""
    driver.find_element_by_link_text(accountType).click()
    time.sleep(3)
    html = driver.page_source
    # Name the parser explicitly to avoid bs4's GuessedAtParserWarning.
    soup = bs4.BeautifulSoup(html, 'html.parser')
    elems = soup.select('#CurrentBalanceAmount')
    # "$1,000.00" -> 1000.0: drop the currency symbol and thousands commas.
    return float(elems[0].getText().strip('$').replace(',', ''))


# copies saving account balance
SavingsAcc = getBalance('Savings')
driver.back()
# copy cheque balance
ChequeACC = getBalance('cheque')
Total = SavingsAcc + ChequeACC
driver.back()
Made a method getBalance, where you have to pass the account type, which returns the balance amount.
Note: you can keep driver.back call in getBalance as per your convenience, but before return statement.
Related to converting the string to a float, I don't know a better way than the existing logic. As it is now moved into a method, I hope it won't trouble you much. The built-in float() converts a string to a float, but characters such as '$' and ',' are not accepted; more details here.
Note: If #CurrentBalanceAmount value changes every time for different account types, you can parameterize like accountType.
I would use several python idioms to clean up the code:
Wrap all code in functions
Generally speaking, putting your code in functions makes it easier to read and follow
When you run a python script (python foo.py), the python interpreter runs every line it can, in order, one by one. When it encounters a function definition, it only runs the definition line (def bar():), and not the code within the function.
This article seems like a good place to get more info on it: Understanding Python's Execution Model
Use the if __name__ == "__main__": idiom to make it an importable module
Similar to the above bullet, this gives you more control on how and when your code executes, how portable it is, and how reusable it is.
"Importable module" means you can write your code in one file, and then import that code in another module.
More info on if __name__ == "__main__" here: What does if name == “main”: do?
Use try/finally to make sure your driver instances get cleaned up
Use explicit waits to interact with the page so you don't need to use sleep
By default, Selenium tries to find and return things immediately. If the element hasn't loaded yet, Selenium throws an exception because it isn't smart enough to wait for it to load.
Explicit waits are built into Selenium, and allow your code to wait for an element to load into the page. By default it checks every half a second to see if the element loaded in. If it hasn't, it simply tries again in another half second. If it has, it returns the element. If it doesn't ever load in, the Wait object throws a TimeoutException.
More here: Explicit and Implicit Waits
And here: WAIT IN SELENIUM PYTHON
Code (untested for obvious reasons):
from selenium import webdriver
from explicit import waiter, ID # This package makes explicit waits easier to use
# pip install explicit
from selenium.webdriver.common.by import By
# Are any of these needed?
# import time
# import bs4
# import gspread
# from oauth2client.service_account import serviceAccountCredentials
def bank_login(driver, username, password):
    """Type the credentials into the bank's login form and submit with Enter."""
    user_field, pwd_field = 'dUsername', 'password'
    waiter.find_write(driver, user_field, username, by=ID)
    # send_enter submits the form after typing the password.
    waiter.find_write(driver, pwd_field, password, by=ID, send_enter=True)
def get_amount(driver, source):
    """Click the link named *source* and return that page's balance as a float.

    Uses explicit waits instead of time.sleep, then navigates back so the
    caller can request the next account page.
    """
    # Click the page in question
    waiter.find_element(driver, source, by=By.LINK_TEXT).click()
    # When using explicit waits there is no need to explicitly sleep
    amount_str = waiter.find_element(driver, "CurrentBalanceAmount", by=ID).text
    # Keep only digits and the decimal point: "$1,000.00" -> "1000.00".
    # The original tested `char in ["1234567890."]` -- membership in a list
    # whose single element is the whole 12-character string, which no single
    # character ever equals, so the filtered string was always empty and
    # float("") raised ValueError.
    amount = float("".join(char for char in amount_str if char in "1234567890."))
    driver.back()
    return amount
def main():
    """Log in, scrape both account balances, and print their total."""
    driver = webdriver.Chrome()
    try:
        driver.get(bank_url)
        bank_login(driver, 'username', 'password')
        total = sum(get_amount(driver, account)
                    for account in ['Savings', 'cheque'])
        print(total)
    finally:
        # Always quit, even on error, so dead browser instances don't pile up.
        driver.quit()


if __name__ == "__main__":
    main()
Full disclosure: I maintain the explicit package. You could replace the waiter calls above with relatively short Wait calls if you would prefer. If you are using Selenium with any regularity it is worth investing the time to understand and use explicit waits.
I'm using webbrowser so I can open an HTML page for a performance test I'm currently doing.
This small piece of code is the begin of the automation. The goal of the function perf_measure is to return how long took to load the page in url entirely.
# NOTE(review): question code, written in Python 2 (print statement and
# `except ..., e` syntax) and pasted with its indentation flattened.
import webbrowser
def perf_measure(url=""):
try:
# webbrowser.open() returns as soon as the browser is asked to open the
# URL -- it provides no way to measure how long the page took to load.
webbrowser.open(url)
except webbrowser.Error, e:
print "It couldn't open the url: ", url
url = "www.google.com"
# BUG: the function defined above is perf_measure, but open_browser is
# called here -- NameError.
open_browser(url)
How can I accomplish that? I just need the value, in seconds, like:
www.google.com Total time to load page in (secs): 2.641
Do you need to use the web browser? As in do you need to view the result?
Otherwise you could do this.
import urllib2
from time import time

# Start the clock before urlopen(): opening the connection and receiving the
# response headers is part of the page-load time being measured.  The
# original only timed stream.read(), missing connection setup entirely.
start_time = time()
stream = urllib2.urlopen('http://www.rarlab.com/rar/winrar-x64-420.exe')
output = stream.read()
end_time = time()
stream.close()
print(end_time - start_time)
If you want a more human-readable result you can use round.
print(round(end_time-start_time, 3))
Output
0.865000009537 # Without Round
0.865 # With Round
A fancy way using a decorator
import time
import functools


def time_it(func):
    """Decorator: report how long each call to *func* takes.

    The wrapped function returns a tuple of
    (elapsed_seconds, original_result, function_name).
    """
    @functools.wraps(func)  # preserve the wrapped function's name/docstring
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        elapsed = time.time() - start
        # func.func_name existed only in Python 2; __name__ works everywhere.
        return elapsed, result, func.__name__
    return wrapper


@time_it  # the original "#time_it" was the @ decorator mangled by formatting
def perf_measure(url=""):
    # whatever you want
    pass
If you want to time the page load (including all of the resources it loads, rendering time etc.) in a real browser you can use Selenium Webdriver. This will open your browser of choice, load the URL and then extract timings:
from selenium import webdriver
def time_url(driver, url):
    """Load *url* and print how many milliseconds the browser spent on it."""
    driver.get(url)
    # Use the browser Navigation Timing API to get some numbers:
    # https://developer.mozilla.org/en-US/docs/Web/API/Navigation_timing_API
    read_timing = "return window.performance.timing.{}".format
    navigation_start = driver.execute_script(read_timing("navigationStart"))
    dom_complete = driver.execute_script(read_timing("domComplete"))
    total_time = dom_complete - navigation_start
    print(f"Time {total_time}ms")
driver = webdriver.Chrome()
try:
    url = "https://httpbin.org/delay/"
    time_url(driver, url + '1')
    time_url(driver, url + '2')
finally:
    # quit() ends the whole WebDriver session and its browser process;
    # close() only closes the current window and can leave the driver running.
    driver.quit()
There are many other metrics you can load if you want to know the render-time separately from the loading time etc.