Python WebScraping - Sleep oscillate in slow websites

Python WebScraping - Sleep oscillate in slow websites - python

I have a webscraping, but the site I'm using in some days is slow and sometimes not. Using the fixed SLEEP, it gives an error in a few days. How to fix this?
I use SLEEP in the intervals of the tasks that I have placed, because the site is sometimes slow and does not return the result giving me an error.
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox import options
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import Select
import pandas as pd
import json
from time import sleep
options = Options()
options.headless = True
navegador = webdriver.Firefox(options = options)
link = '****************************'
navegador.get(url = link)
sleep(1)
usuario = navegador.find_element(by=By.ID, value='ctl00_ctl00_Content_Content_txtLogin')
usuario.send_keys('****************************')
sleep(1)
senha = navegador.find_element(by=By.ID, value='ctl00_ctl00_Content_Content_txtSenha')
senha.send_keys('****************************')
sleep(2.5)
botaologin = navegador.find_element(by=By.ID, value='ctl00_ctl00_Content_Content_btnEnviar')
botaologin.click()
sleep(40)
agendamento = navegador.find_element(by=By.ID, value='ctl00_ctl00_Content_Content_TreeView2t8')
agendamento.click()
sleep(2)
selecdia = navegador.find_element(By.CSS_SELECTOR, "a[title='06 de dezembro']")
selecdia.click()
sleep(2)
selecterminal = navegador.find_element(by=By.ID, value='ctl00_ctl00_Content_Content_ddlVagasTerminalEmpresa')
selecterminal.click()
sleep(1)
select = Select(navegador.find_element(by=By.ID, value='ctl00_ctl00_Content_Content_ddlVagasTerminalEmpresa'))
select.select_by_index(1)
sleep(10)
buscalink = navegador.find_elements(by=By.XPATH, value='//*[#id="divScroll"]')
for element in buscalink:
teste3 = element.get_attribute('innerHTML')
soup = BeautifulSoup(teste3, "html.parser")
Vagas = soup.find_all(title="Vaga disponível.")
print(Vagas)
temp=[]
for i in Vagas:
on_click = i.get('onclick')
temp.append(on_click)
df = pd.DataFrame(temp)
df.to_csv('test.csv', mode='a', header=False, index=False)
It returns an error because the page does not load in time and it cannot get the data, but this time is variable

Instead of all these hardcoded sleeps you need to use WebDriverWait expected_conditions explicit waits.
With it you can set some timeout period so Selenium will poll the page periodically until the expected condition is fulfilled.
For example if you need to click a button you will wait for that element clickability. Once this condition is found Selenium will return you that element and you will be able to click it.
This will reduce all the redundant delays on the one hand and will keep waiting until the condition is matched on the other hand (until it is inside the defined timeout).
So, your code can be modified as following:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
#-----
wait = WebDriverWait(navegador, 30)
navegador.get(link)
wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ctl00_Content_Content_txtLogin"))).send_keys('****************************')
wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ctl00_Content_Content_txtSenha"))).send_keys('****************************')
wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ctl00_Content_Content_btnEnviar"))).click()
wait.until(EC.element_to_be_clickable((By.ID, "ctl00_ctl00_Content_Content_TreeView2t8"))).click()
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[title='06 de dezembro']"))).click()
etc.

Related

Reading weblink from dataframe throws "stale element reference: element is not attached to the page document" error

I got a dataframe that contains links to google reviews of two restaurants. I wanted to load all reviews of two restaurants (one by one) into the browser and then save them into a new data frame. I wrote a script that reads and load all reviews into the browser as follow:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
link_df = Link
0 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
1 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
i = 0
driver = webdriver.Chrome()
for index, i in link_df.iterrows():
base_url = i['Link'] #link_df['Link'][i]
driver.get(base_url)
WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//div[./span[text()='Newest']]"))).click()
print('Restaurant number is ',index)
title = driver.find_element_by_xpath("//div[#class='P5Bobd']").text
address = driver.find_element_by_xpath("//div[#class='T6pBCe']").text
overall_rating = driver.find_element_by_xpath("//div[#class='review-score-container']//span[#class='Aq14fc']").text
total_reviews_text =driver.find_element_by_xpath("//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int (total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
time.sleep(2)
total_reviews = len(all_reviews)
while total_reviews < num_reviews:
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
time.sleep(5)
all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
print(total_reviews)
total_reviews +=1
reviews_info = driver.find_elements_by_xpath("//div[#class='jxjCjc']")
review_information = pd.DataFrame(columns=["Restaurant title","Restaurant rating","Total reviews","Reviewer Name","Rating", "Review"])
name= ''
rating = ''
text = ''
for index,review_info in enumerate(reviews_info):
name = review_info.find_element_by_xpath("./div/div/a").text
rating = review_info.find_element_by_xpath(".//div[#class='PuaHbe']//g-review-stars//span").get_attribute('aria-label')
text = review_info.find_element_by_xpath(".//div[#class='Jtu6Td']//span").text
review_information.at[len(review_information)] = [title,overall_rating,num_reviews,name,rating,text]
filename = 'Google_reviews' + ' ' +pd.to_datetime("now").strftime("%Y_%m_%d")+'.csv'
files_present = glob.glob(filename)
if files_present:
review_information.to_csv(filename,index=False,mode='a',header=False)
else:
review_information.to_csv(filename,index=False)
driver.get('https:ww.google.com')
time.sleep(3)
The problem is that script throws an error when it reaches the following line.
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
It throws following error:
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=95.0.4638.69)
When I tried the same program without storing google links in dataframe (i.e. no for loop and instead of base_url = i['Link'], I wrote base_url = google review link) it works fine.
I am not sure where I am making the mistake. Any suggestion or help to fix the issue would be highly appreciated?

EDIT
you put the creation of driver outside the for loop
you cant launch the new url with gps data when the first popup is always in front, if you launch it, it stays in backdoor, the easier way is to launch a new url without gps data -> https:ww.google.com and wait 3 dec before to follow your loop:
your count is not good, i have changed your selector and change the total and set some lines in comment
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.options import Options
import time
link_df = ["https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
i = 0
binary = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
options = Options()
options.binary = binary
driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
# i have to launch one time to accept the cookies manually
#by setting a breakpoint after, but you dont have that i think
#driver.get(link_df[0])
print ("Headless Firefox Initialized")
print(link_df)
for url in link_df:
base_url = url # i['Link'] # link_df['Link'][i]
print(base_url)
driver.get(base_url)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Avis les plus récents']]"))).click()
title = driver.find_element_by_xpath("//div[#class='P5Bobd']").text
address = driver.find_element_by_xpath("//div[#class='T6pBCe']").text
overall_rating = driver.find_element_by_xpath("//div[#class='review-score-container']//span[#class='Aq14fc']").text
total_reviews_text = driver.find_element_by_xpath(
"//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int(total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
# time.sleep(2)
total_reviews = 0
while total_reviews < num_reviews:
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
all_reviews = WebDriverWait(driver, 5).until(
EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
total_reviews = len(all_reviews)
print(total_reviews, len(all_reviews))
driver.get('https:ww.google.com') # or driver.close() if no bugs
time.sleep(3)
driver.close()
driver.quit()
it seems the solution for chrome needs some fixes:
org.openqa.selenium.StaleElementReferenceException: stale element reference: element is not attached to the page document
The literal meaning is about , The referenced element is out of date , No longer attached to the current page . Usually , This is because the page has been refreshed or skipped , The solution is , Reuse findElement or findElements Method to locate the element .
so its seems for chrome there is a problem of refreshing, so i suggest to load the number of record before to scroll, to have a fresh copy of DOM items, and i have to add a wait 1sec at the end of while loop
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
#from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
import time
link_df = [
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
i = 0
binaryfirefox = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
binarychrome = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
options = Options()
#cap = DesiredCapabilities().CHROME
#cap["marionette"] = True
#cap = DesiredCapabilities().FIREFOX
#options.binary = binaryfirefox
#driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
options.binary_location = binarychrome
driver = webdriver.Chrome(options=options, executable_path="E:\\Téléchargement\\chromedriver.exe" )
# same reason tha Firefox i have to load one time
# an url to accept manually the cookies
#driver.get(link_df[0])
print(link_df)
for url in link_df:
base_url = url # i['Link'] # link_df['Link'][i]
print(base_url)
driver.get(base_url)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()
title = driver.find_element_by_xpath("//div[#class='P5Bobd']").text
address = driver.find_element_by_xpath("//div[#class='T6pBCe']").text
overall_rating = driver.find_element_by_xpath("//div[#class='review-score-container']//span[#class='Aq14fc']").text
total_reviews_text = driver.find_element_by_xpath(
"//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int(total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
# time.sleep(2)
total_reviews = 0
while total_reviews < num_reviews:
#reload to avoid exception, or trap scroll with try/except but more expznsive
all_reviews = WebDriverWait(driver, 20).until(
EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
total_reviews = len(all_reviews)
print(total_reviews, len(all_reviews))
time.sleep(1)
driver.get('https:ww.google.com') # or driver.close() if no bugs
time.sleep(3)
driver.close()
driver.quit()

Conditional dropdown for loop is not working in the expected way

I had posted in Stack Exchange earlier; however, did not get much response from that yet; hence, posting it here.
I am trying to scrape some data using the following code. When I run the code line by line, it works fine. However, when I want to run all code at one go, the dropdown options go blank and as a result, the last line returns error. Your help would be much appreciated. The code is below.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import os
path = os.path.join(r"D:\ScrapedData\TN\SocialAudit")
path_to_chromedriver = 'D:\ScrapedData/chromedriver'
options = webdriver.ChromeOptions()
prefs = {'download.default_directory' : path}
options.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome(chrome_options=options ,executable_path=path_to_chromedriver)
url = "http://mnregaweb4.nic.in/netnrega/SocialAudit/StateList.aspx"
browser.get(url)
browser.set_page_load_timeout(45)
browser.maximize_window()
browser.find_element_by_link_text("BIHAR").click()
browser.implicitly_wait(5)
year=['2016-2017', '2017-2018', '2018-2019', '2019-2020']
elem2 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlFin")
elem2.send_keys(year[0])
browser.implicitly_wait(5)
select_dist = browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddldist")
options = [x for x in select_dist.find_elements_by_tag_name("option")]
dist=[]
for e in range(len(options)):
select_dist = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddldist"))
select_dist.select_by_index(e)
select_block = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlblock"))
options1 = select_block.options
for f in range(len(options1)):
select_block = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlblock"))
select_block.select_by_index(f)
select_gp = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlpanchayat"))
options2 = select_gp.options
for g in range(len(options2)):
select_gp = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlpanchayat"))
select_gp.select_by_index(g)
browser.find_element_by_css_selector("#ctl00_ContentPlaceHolder1_rbLoginLevel_1").click()
browser.implicitly_wait(10)
elem6 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$txtperiodFrom")
elem6.send_keys('01/04/2016')
browser.implicitly_wait(10)
elem7 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$txtperiodTo")
elem7.send_keys('31/03/2017')
browser.implicitly_wait(10)
browser.find_element_by_css_selector("#ctl00_ContentPlaceHolder1_login").click()
browser.implicitly_wait(10)
browser.find_element_by_link_text("Download All Reports").click()

Besides that the target page is slower than an aged snail, and those 10 second waits are barely enough for anything, there are two things you missed an those caused your troubles:
you did not take account that the first element of the select options are "select an option" types. So if you try to cycle trough all of them, you must ignore the option at the first index, else it will look like "nothing is selected"
wait for that spinner. After the spinner is gone, page will be refreshed. Do not grab the elements before page refresh is complete, wait until the spinner is gone.
With these two helper functions it is possible to press the "Get Reports" button without issues:
def is_spinner_gone(arg):
loaded_spinner = browser.find_element_by_xpath('//div[//div[#class="loader"]]')
if loaded_spinner:
return loaded_spinner.get_attribute('style') == 'display: none;'
return True
def wait_for_element(xpath):
# this is necessary because the spinner does not pop up instantly
time.sleep(1)
no_spinner = WebDriverWait(browser, 500).until(is_spinner_gone)
element = WebDriverWait(browser, 500).until(
EC.element_to_be_clickable((By.XPATH, xpath)))
return element
If you get your elements via the wait_for_element call then you'll be able to interact with them without error. I guess you know that pressing that button is not the end of the road yet, you'll have to choose the report format and who knows what later on.
Adjusted code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import os
import time
path = os.path.join(r"D:\ScrapedData\TN\SocialAudit")
path_to_chromedriver = 'D:\ScrapedData/chromedriver'
options = webdriver.ChromeOptions()
prefs = {'download.default_directory' : path}
options.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome(chrome_options=options ,executable_path=path_to_chromedriver)
start = time.time()
url = "http://mnregaweb4.nic.in/netnrega/SocialAudit/StateList.aspx"
browser.get(url)
browser.set_page_load_timeout(45)
browser.maximize_window()
loaded = time.time()
print(f'PAGE LOADED IN {loaded-start} seconds')
browser.find_element_by_link_text("BIHAR").click()
def is_spinner_gone(arg):
loaded_spinner = browser.find_element_by_xpath('//div[//div[#class="loader"]]')
if loaded_spinner:
return loaded_spinner.get_attribute('style') == 'display: none;'
return True
def wait_for_element(xpath):
# this is necessary because the spinner does not pop up instantly
time.sleep(1)
no_spinner = WebDriverWait(browser, 500).until(is_spinner_gone)
element = WebDriverWait(browser, 500).until(
EC.element_to_be_clickable((By.XPATH, xpath)))
return element
year=['2016-2017', '2017-2018', '2018-2019', '2019-2020']
elem2 = wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$ddlFin"]')
selector_page_loaded = time.time()
print(f'WORK AREA LOADED IN {selector_page_loaded-loaded} seconds')
elem2.send_keys(year[0])
select_dist = wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$ddldist"]')
options = [x for x in select_dist.find_elements_by_tag_name("option")]
dist=[]
# ISSUE: default fields are included in the options!
for e in range(1,len(options)):
select_dist = Select(wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$ddldist"]'))
select_dist.select_by_index(e)
select_block = Select(wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$ddlblock"]'))
options1 = select_block.options
for f in range(1, len(options1)):
select_block = Select(wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$ddlblock"]'))
select_block.select_by_index(f)
select_gp = Select(wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$ddlpanchayat"]'))
options2 = select_gp.options
for g in range(1, len(options2)):
select_gp = Select(wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$ddlpanchayat"]'))
select_gp.select_by_index(g)
wait_for_element('//*[#id="ctl00_ContentPlaceHolder1_rbLoginLevel_1"]').click()
elem6 = wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$txtperiodFrom"]')
elem6.send_keys('01/04/2016')
elem7 = wait_for_element('//*[#name="ctl00$ContentPlaceHolder1$txtperiodTo"]')
elem7.send_keys('31/03/2017')
wait_for_element('//*[#value="Get Reports"]').click()
print(f'FIRST RUN IN {time.time()-selector_page_loaded}')

Python Selenium wedriver iterate over form

I'm trying to automate a form assertion with a list of data, however I'm struggling on when and how to use "WebDriverWait" or driver implict wait. My list is 1000 strings. When I run a sample of a 100, less than 100 are captured correctly. The code below catches ElementNotSelectableException\StaleElementReferenceException but doesn't address them correctly.
import unittest
from selenium.common.exceptions import ElementNotVisibleException, ElementNotSelectableException, \
ElementNotInteractableException, StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait
driver = webdriver.Chrome()
oar_order_id = '#oar-order-id'
oar_zip_code = '#oar_zip'
#order confirmation list
order_id_list = ["WO-34284975","WO-50002804","WO-50004302","WO-34282964","WO-34214963"]
#zip code confirmation list
zip_code_list = ["23508","99338","62036","75074-7520","37763","98034","89406-4361"]
submit_button = 'body.sales-guest-form.page-layout-1column:nth-child(2) div.page-wrapper:nth-child(10) main.page-main:nth-child(4) div.columns:nth-child(4) div.column.main form.form.form-orders-search:nth-child(4) div.actions-toolbar div.primary > button.action.submit.primary'
# Enter orderid & Zip & click enter
found = []
notfound = []
length = len(order_id_list)
driver.implicitly_wait(10)
wait = WebDriverWait(driver, 15, poll_frequency=1,
ignored_exceptions=[ElementNotVisibleException ,ElementNotSelectableException])
driver.get('https:/www.ecklers.com/sales/orderlookup/index/')
# Sample of 10 from list
for i in range(10):
try:
wait
element1 = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, oar_order_id)))
element1.send_keys(order_id_list[i])
element2 = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, oar_zip_code)))
element2.send_keys(zip_code_list[i])
element3 = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, submit_button)))
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, submit_button)))
element3.click()
wait
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"body.sales-guest-view.page-layout-1column:nth-child(2) div.page-wrapper:nth-child(10) main.page-main:nth-child(4) div.page-title-wrapper:nth-child(3) h1.page-title > span.base")))
title = driver.title
#driver.implicitly_wait(60)
except(StaleElementReferenceException):
print("StaleElementReferenceException")
except(ElementNotInteractableException):
print("ElementNotInteractableException")
try:
text = order_id_list[i]
assert(title.endswith(text[3:]))
found.append(text)
except(AssertionError):
notfound.append((order_id_list[i]))
driver.get('https:/www.ecklers.com/sales/orderlookup/index/')
wait
print(found,notfound)```

I found an answer that helped me. I used: time.sleep(.500) after driver.get('https:/www.ecklers.com/sales/orderlookup/index/'

Create a loop for web scraping with selenium in python

I want to create a loop so that I can scrape the individual time figures for each horse on all eight races from the at the races website.
Below is an example of the first race (17:15) of eight:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
url = 'http://www.attheraces.com/racecard/Wolverhampton/6-October-2018/1715'
driver = webdriver.Chrome()
driver.get(url)
driver.implicitly_wait(2)
driver.find_element_by_xpath('//*[#id="racecard-tabs 1061960"]/div[1]/div/div[1]/ul/li[2]/a').click()
WebDriverWait(driver, 5).until(expected_conditions.presence_of_element_located((By.XPATH, '//*[#id="tab-racecard-sectional-times"]/div/div[1]/div[1]/div[2]/div/button')))
The next race (17:45) would have the following url:
url = 'http://www.attheraces.com/racecard/Wolverhampton/6-October-2018/1745'
And the id in the following code keeps changing with the url
driver.find_element_by_xpath('//*[#id="racecard-tabs 1061961"]/div[1]/div/div[1]/ul/li[2]/a').click()
So for the 17:15, the racecard-tabs becomes 1061960
for the 17:45, the racecard-tabs becomes 1061961
the 18:15, raecard-tabs becomes 1061963, and so on.
Any help or advice is greatly appreciated.

This will work. You can even change the date and the rest will be automatically be executed for you!
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
def races(main_url):
driver = webdriver.Chrome()
driver.get(main_url)
driver.implicitly_wait(2)
races = driver.find_elements_by_class_name('time-location')
races = [race.text[:5] for race in races]
races = [race.replace(':', '') for race in races]
driver.close()
return races
def scrape(url):
driver = webdriver.Chrome()
driver.get(url)
driver.implicitly_wait(2)
driver.find_elements_by_class_name('racecard-ajax-link')[1].click()
WebDriverWait(driver, 5).until(expected_conditions.presence_of_element_located((By.XPATH, '//*[#id="tab-racecard-sectional-times"]/div/div[1]/div[1]/div[2]/div/button')))
for horse in driver.find_elements_by_class_name('card-item'):
horseName = horse.find_element_by_class_name('form-link').text
times = horse.find_elements_by_class_name('sectionals-time')
times = [time.text for time in times]
print('{}: {}'.format(horseName, times))
print()
driver.close()
def main():
date = '6-October-2018'
main_url = 'http://www.attheraces.com/racecard/Wolverhampton/' + date
for race in races(main_url):
url = main_url + '/' + race
print(url)
scrape(url)
if __name__ == '__main__':
main()

Parsing a site where URL doesn't change with Selenium Python

I'm trying to scrape [this][1] site its URL doesnt change when next page is clicked on. So, I used Selenium to click on the next page, but doing that doesnt help. As my driver keeps getting the old page even after next page is clicked on. Is there any other way to get to the next page and scrape it?
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
driver = webdriver.Safari()
store_pages = []
#10306 is total number of pages.
for i in range (10306):
Starting_url = 'site'
driver.get(Starting_url)
html = driver.page_source
soup = BeautifulSoup(html, "lxml")
print (store_pages.append(i))
timeout = 20
try:
WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[#id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_lblDisclaimerMsg']")))
except TimeoutException:
print("Timed out waiting for page to load")
driver.quit()
nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
timeout = 20
wait = WebDriverWait(driver, 10).until(EC.text_to_be_present_in_element_value((By.CSS_SELECTOR, '#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a > div.act_search_results > div.act_search_header'), "206113 Record(s) | Page [2 of 10306]"))
NGO_element = driver.find_element_by_class_name("faq-sub-content exempted-result")
NGO_name = NGO_element.find_elements_by_tag_name("h1")
NGO_name_pancard = driver.find_elements_by_class_name("pan-id")
NGO_data = NGO_element.find_elements_by_tag_name("ul")
NGO_sub_data = NGO_element.find_elements_by_tag_name("li")
for i, p, t in zip(NGO_name, NGO_name_pancard, NGO_data):
n_name = i.text.replace(p.text, '')
n_data = t.text
n_pan = p.text
print ("Name of NGO:", n_name, "Fields of NGO:", n_data, "Pancard number:", n_pan)
nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
#timeout = 2

You need to make sure when you reach the next page, the content of the earlier page has become stale otherwise, you will have stale element error or get the same thing repeatedly. Try the below approach, it should get you there. The rest you can modify yourself.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("http://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")
while True:
for elem in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"[id^='arrowex']"))):
print(elem.text)
try:
wait.until(EC.presence_of_element_located((By.ID, "ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_imgbtnNext"))).click()
wait.until(EC.staleness_of(elem))
except:
break
driver.quit()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python WebScraping - Sleep oscillate in slow websites - python

Related

Reading weblink from dataframe throws "stale element reference: element is not attached to the page document" error

Conditional dropdown for loop is not working in the expected way

Python Selenium wedriver iterate over form

Create a loop for web scraping with selenium in python

Parsing a site where URL doesn't change with Selenium Python

Categories

Resources