I'm developing a python app that sets form componentes of a certain webpage (developed using vue3.js)
I'm able to set a datepicker's value, but, after that, next operation clears the dapicker away.
I must be doing something really fool, but I'm out of ideas.
Here's my code:
import sys
from selenium import webdriver
#-----------------------------------------------------------------------------------------------------------------------
driver_headfull = None
try:
driver_headfull = webdriver.Firefox()
firefox_options = webdriver.FirefoxOptions()
except Exception as e:
print('ERROR WHEN CREATING webdriver.Firefox()')
print("Please, verify you have installed: firefox and that geckodriver.exe's is in %PATH%")
print(e)
sys.exit(5)
#navigate to url
driver_headfull.get('http://a_certain_ip')
#set a datepicker
element_to_interact_with_headfull = driver_headfull.find_element_by_id('datesPolicyEffectiveDate')
driver_headfull.execute_script("arguments[0].value = '2020-07-01';", element_to_interact_with_headfull)
#set a <div> that behaves like a <select>.
element_to_interact_with_headfull = driver_headfull.find_element_by_id('industryDescription')
driver_headfull.execute_script("arguments[0].click();", element_to_interact_with_headfull)
element_pseudo_select_option_headfull = driver_headfull.find_element_by_id('descriptionIndustryoption0')
driver_headfull.execute_script("arguments[0].click();", element_pseudo_select_option_headfull)
# this very last instruction resets value of html_id=datesPolicyEffectiveDate (datepicker)
while(True):
pass
Any ideas will be so welcome!
Well, this was a pain. I'll post it in case it's of any use for someone.
It seems the component was reloaded, and I was setting the son of the component by means of
arguments[0].value = '2020-07-01';
so the parent wouldn't see the change, and would automatically reload the child with a default (empty) value.
Adding the following snippet solved my trouble:
driver_headfull.execute_script("arguments[0].value = '2021-07-01';", element_to_interact_with_headfull)
driver_headfull.execute_script("arguments[0].dispatchEvent(new Event('input', { bubbles: true }));", element_to_interact_with_headfull)
Related
I want to write a Python file that contains functions which we want to use in our project. We are working on a Selenium web scraping bot fot Instagram. Right now we write all the functions in the scripts but we want to make a "function" file which we will import and use for our scripts. But the thing is that VS code does not use autocompletion when I want to use a webdrivers function like driver.find_element_by_xpath(cookies_button_xpath).click().
The function file (not finished yet) looks like this:
import time
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
# set constants for functions to run
WEBSITE_PRE_FIX = 'https://www.instagram.com/'
FORBIDDEN_CAPTION_WORDS = ['link in bio','buy now','limited time']
def open_ig(driver: webdriver):
# opens the website and waits till it is loaded
driver.get(WEBSITE_PRE_FIX)
time.sleep(2)
# accept cookies
cookies_button_xpath = "/html/body/div[4]/div/div/button[1]"
driver.find_element_by_xpath(cookies_button_xpath).click()
def login(driver: webdriver, username, password):
time.sleep(2)
# fill in user name and password and log in
username_box_xpath = '/html/body/div[1]/section/main/article/div[2]/div[1]/div/form/div/div[1]/div/label/input'
username_element = driver.find_element_by_xpath(username_box_xpath)
username_element.send_keys(username)
password_box_xpath = '/html/body/div[1]/section/main/article/div[2]/div[1]/div/form/div/div[2]/div/label/input'
password_element = driver.find_element_by_xpath(password_box_xpath)
password_element.send_keys(password)
password_element.send_keys(Keys.ENTER)
# click on do not save username and password + do not turn on notifications
time.sleep(3)
dont_save_username_button_password_xpath = '/html/body/div[1]/section/main/div/div/div/div/button'
dont_save_username_button_element = driver.find_element_by_xpath(dont_save_username_button_password_xpath)
dont_save_username_button_element.click()
So the code does work (as in it runs and does what I want) but I would like to know if we can write the function file another way so things like autocompletion en the color filters work. I'm not completely sure if it is possible. If there is any other way to write the functions file, all recommendations are welcome.
Have you tried writing the functions file as a simple class?
class FunctionsFile():
def __init__(self):
self.website_pre_fix = 'https://www.instagram.com/'
self.forbidden_capture_words = ['link in bio','buy now','limited time']
def open_ig(self, driver: webdriver):
# opens the website and waits till it is loaded
driver.get(WEBSITE_PRE_FIX)
time.sleep(2)
# accept cookies
cookies_button_xpath = "/html/body/div[4]/div/div/button[1]"
driver.find_element_by_xpath(cookies_button_xpath).click()
def login(self, driver: webdriver, username, password):
time.sleep(2)
# fill in user name and password and log in
username_box_xpath = '/html/body/div[1]/section/main/article/div[2]/div[1]/div/form/div/div[1]/div/label/input'
username_element = driver.find_element_by_xpath(username_box_xpath)
username_element.send_keys(username)
password_box_xpath = '/html/body/div[1]/section/main/article/div[2]/div[1]/div/form/div/div[2]/div/label/input'
password_element = driver.find_element_by_xpath(password_box_xpath)
password_element.send_keys(password)
password_element.send_keys(Keys.ENTER)
# click on do not save username and password + do not turn on notifications
time.sleep(3)
dont_save_username_button_password_xpath = '/html/body/div[1]/section/main/div/div/div/div/button'
dont_save_username_button_element = driver.find_element_by_xpath(dont_save_username_button_password_xpath)
dont_save_username_button_element.click()
You can then instantiate the class in any file. If in same directory:
from FunctionsFile import FunctionsFile
funcs = FunctionsFile()
funcs.open_ig(driver)
That should use the standard VS Code color schemes and autocompletion. (I think anyway).
I've tried running a script on Windows and on Ubuntu, both using Python 3 and the latest versions of geckodriver, resulting in differing behavior. The full script is given below.
I'm trying to get the data for several different tests from a test prep site. There are different subjects, each of which has a specialization, each of which has a practice-test, each of which has several questions. The scrape function walks through the steps to get data of each type.
subject <--- specialization <---- practice-test *------ question
The get_questions function is where the difference shows up:
In Windows, it behaves as expected. After the last question's choice is clicked, it goes on to a results page.
In Ubuntu, when a choice is clicked on the last question, it reloads the last question and keeps clicking the same choice and reloading the same question.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pathlib
import time
import json
import os
driver=webdriver.Firefox(executable_path="./geckodriver.exe")
wait = WebDriverWait(driver, 15)
data=[]
def setup():
driver.get('https://www.varsitytutors.com/practice-tests')
try:
go_away_1= driver.find_element_by_class_name("ub-emb-iframe")
driver.execute_script("arguments[0].style.visibility='hidden'", go_away_1)
go_away_2= driver.find_element_by_class_name("ub-emb-iframe-wrapper")
driver.execute_script("arguments[0].style.visibility='hidden'", go_away_2)
go_away_3= driver.find_element_by_class_name("ub-emb-visible")
driver.execute_script("arguments[0].style.visibility='hidden'", go_away_3)
except:
pass
def get_subjects(subs=[]):
subject_clickables_xpath="/html/body/div[3]/div[9]/div/*/div[#data-subject]/div[1]"
subject_clickables=driver.find_elements_by_xpath(subject_clickables_xpath)
subject_names=map(lambda x : x.find_element_by_xpath('..').get_attribute('data-subject'), subject_clickables)
subject_pairs=zip(subject_names, subject_clickables)
return subject_pairs
def get_specializations(subject):
specialization_clickables_xpath="//div//div[#data-subject='"+subject+"']/following-sibling::div//div[#class='public_problem_set']//a[contains(.,'Practice Tests')]"
specialization_names_xpath="//div//div[#data-subject='"+subject+"']/following-sibling::div//div[#class='public_problem_set']//a[contains(.,'Practice Tests')]/../.."
specialization_names=map(lambda x : x.get_attribute('data-subject'), driver.find_elements_by_xpath(specialization_names_xpath))
specialization_clickables = driver.find_elements_by_xpath(specialization_clickables_xpath)
specialization_pairs=zip(specialization_names, specialization_clickables)
return specialization_pairs
def get_practices(subject, specialization):
practice_clickables_xpath="/html/body/div[3]/div[8]/div[3]/*/div[1]/a[1]"
practice_names_xpath="//*/h3[#class='subject_header']"
lengths_xpath="/html/body/div[3]/div[8]/div[3]/*/div[2]"
lengths=map(lambda x : x.text, driver.find_elements_by_xpath(lengths_xpath))
print(lengths)
practice_names=map(lambda x : x.text, driver.find_elements_by_xpath(practice_names_xpath))
practice_clickables = driver.find_elements_by_xpath(practice_clickables_xpath)
practice_pairs=zip(practice_names, practice_clickables)
return practice_pairs
def remove_popup():
try:
button=wait.until(EC.element_to_be_clickable((By.XPATH,"//button[contains(.,'No Thanks')]")))
button.location_once_scrolled_into_view
button.click()
except:
print('could not find the popup')
def get_questions(subject, specialization, practice):
remove_popup()
questions=[]
current_question=None
while True:
question={}
try:
WebDriverWait(driver,5).until(EC.presence_of_element_located((By.XPATH,"/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[1]")))
question_number=driver.find_element_by_xpath('/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[1]').text.replace('.','')
question_pre=driver.find_element_by_class_name('question_pre')
question_body=driver.find_element_by_xpath('/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[2]/p')
answer_choices=driver.find_elements_by_class_name('question_row')
answers=map(lambda x : x.text, answer_choices)
question['id']=question_number
question['pre']=question_pre.text
question['body']=question_body.text
question['answers']=list(answers)
questions.append(question)
choice=WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"input.test_button")))
driver.execute_script("arguments[0].click();", choice[3])
time.sleep(3)
except Exception as e:
if 'results' in driver.current_url:
driver.get(driver.current_url.replace('http://', 'https://'))
# last question has been answered; record results
remove_popup()
pathlib.Path('data/'+subject+'/'+specialization).mkdir(parents=True, exist_ok=True)
with open('data/'+subject+'/'+specialization+'/questions.json', 'w') as outfile:
json.dump(list(questions), outfile)
break
else:
driver.get(driver.current_url.replace('http://', 'https://'))
return questions
def scrape():
setup()
subjects=get_subjects()
for subject_name, subject_clickable in subjects:
subject={}
subject['name']=subject_name
subject['specializations']=[]
subject_clickable.click()
subject_url=driver.current_url.replace('http://', 'https://')
specializations=get_specializations(subject_name)
for specialization_name, specialization_clickable in specializations:
specialization={}
specialization['name']=specialization_name
specialization['practices']=[]
specialization_clickable.click()
specialization_url=driver.current_url.replace('http://', 'https://')
practices=get_practices(subject_name, specialization_name)
for practice_name, practice_clickable in practices:
practice={}
practice['name']=practice_name
practice_clickable.click()
questions=get_questions(subject_name, specialization_name, practice_name)
practice['questions']=questions
driver.get(specialization_url)
driver.get(subject_url)
data.append(subject)
print(data)
scrape()
Can anyone help me figure out what may be causing this?
It's just timing. The last question will take much longer than the 3 second sleep until it loads the next page. Waiting for the page to be gone fixes this and speeds up the script execution.
from selenium.common.exceptions import StaleElementReferenceException
<snip>
choice=WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"input.test_button")))
choice[3].click()
try:
while choice[3].is_displayed():
time.sleep(1)
except StaleElementReferenceException as e:
continue
The first problem here is that you are using an Exception to break the loop. The proper way is to use a condition test, for example if result not in url, continue the loop, else, break the loop. The Exception can come in as a backup execution step.
The second is that just using sleep to wait for the results page is not enough, you need to test for the presence of an element on the result page. Or you could just watch for the title change:
wait = WebDriverWait(driver, 10)
wait.until(EC.title_contains("Results"))
To scrape a pool of URLs, I am paralell processing selenium with joblib. In this context, I am facing two challenges:
Challenge 1 is to speed up this process. In the moment, my code opens and closes a driver instance for every URL (ideally would be one for every process)
Challenge 2 is to get rid of the CPU-intensive while loop that I think I need to continue on empty results (I know that this is most likely wrong)
Pseudocode:
URL_list = [URL1, URL2, URL3, ..., URL100000] # List of URLs to be scraped
def scrape(URL):
while True: # Loop needed to use continue
try: # Try scraping
driver = webdriver.Firefox(executable_path=path) # Set up driver
website = driver.get(URL) # Get URL
results = do_something(website) # Get results from URL content
driver.close() # Close worker
if len(results) == 0: # If do_something() failed:
continue # THEN Worker to skip URL
else: # If do_something() worked:
safe_results("results.csv") # THEN Save results
break # Go to next worker/URL
except Exception as e: # If something weird happens:
save_exception(URL, e) # THEN Save error message
break # Go to next worker/URL
Parallel(n_jobs = 40)(delayed(scrape)(URL) for URL in URL_list))) # Run in 40 processes
My understanding is that in order to re-use a driver instance across iterations, the # Set up driver-line needs to be placed outside scrape(URL). However, everything outside scrape(URL) will not find its way to joblib's Parallel(n_jobs = 40). This would imply that you can't reuse driver instances while scraping with joblib which can't be true.
Q1: How to reuse driver instances during parallel processing in the above example?
Q2: How to get rid of the while-loop while maintaining functionality in the above-mentioned example?
Note: Flash and image loading is disabled in firefox_profile (code not shown)
1) You should first create a bunch of drivers: one for each process. And pass an instance to the worker. I don't know how to pass drivers to an Prallel object, but you could use threading.current_thread().name key to identify drivers. To do that, use backend="threading". So now each thread will has its own driver.
2) You don't need a loop at all. Parallel object itself iter all your urls (I hope I realy understend your intentions to use a loop)
import threading
from joblib import Parallel, delayed
from selenium import webdriver
def scrape(URL):
try:
driver = drivers[threading.current_thread().name]
except KeyError:
drivers[threading.current_thread().name] = webdriver.Firefox()
driver = drivers[threading.current_thread().name]
driver.get(URL)
results = do_something(driver)
if results:
safe_results("results.csv")
drivers = {}
Parallel(n_jobs=-1, backend="threading")(delayed(scrape)(URL) for URL in URL_list)
for driver in drivers.values():
driver.quit()
But I don't realy think you get profit in using n_job more than you have CPUs. So n_jobs=-1 is the best (of course I may be wrong, try it).
I am trying to scrape webpages using python and selenium. I have a url which takes a single parameter and a list of valid parameters. I navigate to that url with a single parameter at a time and click on a link, a pop up window opens with a page.
The pop window automatically opens a print dialogue on page load.
Also the url bar is disabled for that popup.
My code:
def packAmazonOrders(self, order_ids):
order_window_handle = self.driver.current_window_handle
for each in order_ids:
self.driver.find_element_by_id('sc-search-field').send_keys(Keys.CONTROL, "a")
self.driver.find_element_by_id('sc-search-field').send_keys(Keys.DELETE)
self.driver.find_element_by_id('sc-search-field').send_keys(each)
self.driver.find_element_by_class_name('sc-search-button').click()
src = self.driver.page_source.encode('utf-8')
if 'Unshipped' in src and 'Easy Ship - Schedule pickup' in src:
is_valid = True
else:
is_valid = False
if is_valid:
print 'Packing Slip Start - %s' %each
self.driver.find_element_by_link_text('Print order packing slip').click()
handles = self.driver.window_handles
print handles
try:
handles.remove(order_window_handle)
except:
pass
self.driver.switch_to_window(handles.pop())
print handles
packing_slip_page = ''
packing_slip_page = self.driver.page_source.encode('utf-8')
if each in packing_slip_page:
print 'Packing Slip Window'
else:
print 'not found'
self.driver.close()
self.driver.switch_to_window(order_window_handle)
Now I have two questions:
How can I download that pop up page as pdf?
For first parameter every thing works fine. But for another parameters in the list the packing_slip_page does not update (which i think because of the disabled url bar. But not sure though.) I tried the print the handle (print handles) for each parametre but it always print the same value. So how to access the correct page source for other parameters?
I have built a web scraper. The program enters searchterm into a searchbox and grabs the results. Pandas goes through a spreadsheet line-by-line in a column to retrieve each searchterm.
Sometimes the page doesn't load properly, prompting a refresh.
I need a way for it to repeat the function and try the same searchterm if it fails. Right now, if I return, it would go on to the next line in the spreadsheet.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
df = pd.read_csv(searchterms.csv, delimiter=",")
def scrape(searchterm):
#Loads url
searchbox = driver.find_element_by_name("searchbox")
searchbox.clear()
searchbox.send_keys(searchterm)
print "Searching for %s ..." % searchterm
no_result = True
while no_result is True:
try:
#Find results, grab them
no_result = False
except:
#Refresh page and do the above again for the current searchterm - How?
driver.refresh()
return pd.Series([col1, col2])
df[["Column 1", "Column 2"]] = df["searchterm"].apply(scrape)
#Executes crawl for each line in csv
The try except construct comes with else clause. The else block is executed if everything goes OK. :
def scrape(searchterm):
#Loads url
no_result = True
while no_result:
#Find results, grab them
searchbox = driver.find_element_by_name("searchbox")
searchbox.clear()
try: #assumes that an exception is thrown if there is no results
searchbox.send_keys(searchterm)
print "Searching for %s ..." % searchterm
except:
#Refresh page and do the above again for the current searchterm
driver.refresh()
else: # executed if no exceptions were thrown
no_results = False
# .. some post-processing code here
return pd.Series([col1, col2])
(There is also a finally block that is executed no matter what, which is useful for cleanup tasks that don't depend on the success or failure of the preceding code)
Also, note that empty except catches any exceptions and is almost never a good idea. I'm not familiar with how selenium handles errors, but when catching exceptions, you should specify which exception you are expecting to handle. This how, if an unexpected exception occurs, your code will abort and you'll know that something bad happened.
That is why you should also try keeping as few lines a possible within the try block.