I'm using selenium with ChromeDriver to crawl images. To speed up the progress, I execute javascript window.stop() to stop loading the whole page as soon as the elements of interest are loaded. I ran the same code several times but get different outcomes (sometimes the NoSuchElementException occurred, sometimes not).
Could anybody explain what's wrong with my code? Here is my sample code:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
def get_driver(quick_load=True):
chromeOptions = webdriver.ChromeOptions()
# chromeOptions.add_argument('headless')
prefs = {"profile.managed_default_content_settings.images": 2}
chromeOptions.add_experimental_option("prefs", prefs)
if quick_load:
capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"
driver = webdriver.Chrome(desired_capabilities=capa,
chrome_options=chromeOptions)
else:
driver = webdriver.Chrome(chrome_options=chromeOptions)
wait = WebDriverWait(driver, 20)
return driver, wait
url = [
"https://www.redbubble.com/people/captainkita/works/27361999-no-words?grid_pos=2&p=lightweight-hoodie&rbs=84d3442a-6f22-4dc1-980d-9a707104c791&ref=shop_grid",
"https://www.redbubble.com/people/zeeteesapparel/works/27338911-game-of-thrones-waiting-for-a-bastard-with-direwolf?grid_pos=44&p=t-shirt&rbs=337dcf4c-3ff9-4507-9cbd-c6e80f78d022&ref=shop_grid&style=vneck",
"https://www.redbubble.com/people/zeeteesapparel/works/27338911-game-of-thrones-waiting-for-a-bastard-with-direwolf?grid_pos=44&p=t-shirt&rbs=337dcf4c-3ff9-4507-9cbd-c6e80f78d022&ref=shop_grid&style=vneck"
]
driver, wait = get_driver()
for u in url:
driver.execute_script("window.open('%s');" % u)
driver.switch_to_window(driver.window_handles[-1])
time.sleep(2)
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'input[type="image"]')))
driver.execute_script("window.stop();")
print "Stopped window"
images = driver.find_elements_by_css_selector('input[type="image"]')
print len(images)
candidates = [t for t in images if "750x1000" in t.get_attribute('src')]
print candidates[-1].get_attribute('src')
# driver.execute_script("window.close();")
# driver.switch_to_window(driver.window_handles[-1])
driver.quit()
Related
The question is about selenium webdriver's explicitly wait feature which is same locating action with a timeout. Webdriver waits element to be visible or present. In my situation, I locate elements with explicit waiting and without it. When it is without the explicit wait, it founds and assigns it. But below, I also try to locate element explicitly waiting, that variable is unassigned, and in such form passes to for iteration, and my for iteration gives UnboundLocalError
I know that error is not selenium but python based. I can get rid of that error by using instead of pass y_priceNewProds = x_priceNewProds` but I need to understand why selenium webdriver cannot locate the element with wait if it already a line above found it. There must be cookies also.
code is below;
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#reading from csv file url-s
def readCSV(path_csv):
df=pd.read_csv(path_csv)
return df
fileCSV=readCSV(r'C:\Users\Admin\Downloads\urls.csv')
length_of_column_urls=fileCSV['linkamazon'].last_valid_index()
def create_driver():
chrome_options = Options()
chrome_options.headless = True
chrome_options.add_argument("start-maximized")
# options.add_experimental_option("detach", True)
chrome_options.add_argument("--no-sandbox")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
webdriver_service = Service(r'C:\pythonPro\w_crawl\AmznScrpBot\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
return driver
#going to urls 1-by-1
def goToUrl_Se(driver):
global counter
counter = 0
for i in range(0, length_of_column_urls + 1):
xUrl = fileCSV.iloc[i, 1]
print(xUrl,i)
# going to url(amazn) via Selenium WebDriver
driver.get(xUrl)
parse_data()
counter+=1
driver.quit()
#fetch-parse the data from url page
def parse_data():
global asin, title, bookform, priceNewProd
wait=WebDriverWait(driver,timeout=20)
try:
#trying code snippets where locating elements occur by XPATH, in some ofthem
#there is also WebDriver explicit wait, that element has both methods
#with presence_of element wait method and without explicit wait method. Just trying to understand
#why sometimes elements cannot be located on page
y_data_index=wait.until(EC.presence_of_element_located((By.XPATH,"//div[#data-index='1']")))
print(y_data_index.text,'y_index')
x_data_index=wait.until(EC.visibility_of_element_located((By.XPATH,"//div[#data-index='1']")))
print(x_data_index.text,'x_index')
x_titles=driver.find_elements(By.XPATH,'//span[contains(#class,"a-size-medium")]')
x_bookforms = driver.find_elements(By.XPATH,'//div[contains(#class,"a-section a-spacing-none a-spacing-top-micro s-price-instructions-style")]//div[contains(#class,"a-row a-size-base a-color-base")]//a[contains(#class,"a-size-base a-link-normal puis-medium-weight-text s-underline-text s-underline-link-text s-link-style s-link-centralized-style a-text-bold")]')
x_priceNewProds = driver.find_elements(By.XPATH,'//div[contains(#class,"a-row a-size-base a-color-base")]//span[contains(#data-a-color,"base")]//span[contains(#class,"a-offscreen")][1]')
y_priceNewProds=wait.until(EC.visibility_of_all_elements_located((By.XPATH,'//div[contains(#class,"a-row a-size-base a-color-base")]//span[contains(#data-a-color,"base")]//span[contains(#class,"a-offscreen")][1]')))
except:
pass
for i in range(len(x_titles)):
x_title=x_titles[i].text
x_priceNewProd = x_priceNewProds[i].text
print(x_priceNewProd)
y_priceNewProd=y_priceNewProds[i].text
print(y_priceNewProd)
try:
x_bookform=x_bookforms[i].text
print(x_bookform)
except:
x_bookform='-'
title = x_title
bookform = x_bookform
priceNewProd = x_priceNewProd
write_to_csv()
def write_to_csv():
if counter==0:
df=pd.DataFrame({'Products':title,'Bookform':bookform,'Price':priceNewProd},index=[0])
wr=df.to_csv('results_new00.csv',index=False,mode='w')
else:
df=pd.DataFrame({'Products':title,'Bookform':bookform,'Price':priceNewProd},index=[0])
wr=df.to_csv('results_new00.csv',index=False,mode='a',header=False)
driver=create_driver()
goToUrl_Se(driver)
y_priceNewProd=y_priceNewProds[i].text UnboundLocalError: local variable 'y_priceNewProds' referenced before assignment
Since you didn't share all the code including the link we can't know what exactly happens there, but I guess that this line
wait.until(EC.visibility_of_all_elements_located((By.XPATH,'//div[contains(#class,"a-row a-size-base a-color-base")]//span[contains(#data-a-color,"base")]//span[contains(#class,"a-offscreen")][1]')))
Throws exception since that element(s) are / is not visible.
This is why y_priceNewProds remains not assigned any value.
While if you use simple driver.find_elements method it can find those elements since they are existing but, again, not visible.
UPD
After you shared the link I tested that and now I can say: Yes, I was right!
Those elements are not visible.
The following code throws TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service('C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
url = 'https://www.amazon.com/s?i=stripbooks&bbn=1&rh=n%3A1%2Cp_n_feature_eighteen_browse-bin%3A8622846011&dc&fs=true&ds=v1%3AgAO0%2BZc%2BC6RthRkqyWzOHmDVufv7JbuCK96Grvjle68&qid=1665559431&rnid=8622845011&ref=sr_nr_p_n_feature_eighteen_browse-bin_5'
driver.get(url)
wait = WebDriverWait(driver, 20)
m_list = wait.until(EC.visibility_of_all_elements_located((By.XPATH, '//div[contains(#class,"a-row a-size-base a-color-base")]//span[contains(#data-a-color,"base")]//span[contains(#class,"a-offscreen")][1]')))
print(m_list)
UPD2
But if you will take the direct parent of that element it will work correctly since it is visible.
For the following code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service('C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
url = 'https://www.amazon.com/s?i=stripbooks&bbn=1&rh=n%3A1%2Cp_n_feature_eighteen_browse-bin%3A8622846011&dc&fs=true&ds=v1%3AgAO0%2BZc%2BC6RthRkqyWzOHmDVufv7JbuCK96Grvjle68&qid=1665559431&rnid=8622845011&ref=sr_nr_p_n_feature_eighteen_browse-bin_5'
driver.get(url)
wait = WebDriverWait(driver, 20)
m_list = wait.until(EC.visibility_of_all_elements_located((By.XPATH, '//div[contains(#class,"a-row a-size-base a-color-base")]//span[contains(#data-a-color,"base")]')))
print(len(m_list))
the output is:
27
I want to create a script that grabs the info from the website using selenium.
However, if it doesn't find the info and shows an error message, it skips that request and continues to the next one.
from selenium import webdriver
import pandas as pd
import undetected_chromedriver as uc
list1 = [6019306,6049500,6051161,6022230,5776662,6151430]
for x in range(0, list1.count()):
while True:
try:
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
driver = uc.Chrome(options=options)
url = 'https://www.axie.tech/axie-pricing/'+str(list1[x])
driver.get(url)
driver.implicitly_wait(10)
test = driver.find_element_by_xpath('//*[#id="root"]/div[1]/div[2]/div[2]/div/div/div[1]/div/div[1]/div[4]/div/div[3]/div/span').text
test = float(test[1:])
print(test)
driver.close()
except NoSuchElementException:
'This Value doesnt exist'
driver.close()
A bit unclear what exactly you are trying to do through the line test = float(test[1:]).
However to extract the desired text from the list of websites you need to induce WebDriverWait for visibility_of_element_located() and you can use the following locator strategy:
Code Block:
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
driver = uc.Chrome(options=options)
list1 = [6019306, 6049500, 6051161, 6022230, 5776662, 6151430]
for x in range(0, len(list1)):
try:
url = 'https://www.axie.tech/axie-pricing/'+str(list1[x])
driver.get(url)
print(WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//span[text()='Reasonable']//following::div[1]//span[contains(#class, 'MuiTypography-root')]"))).text)
except TimeoutException:
continue
driver.quit()
Console Output:
Ξ0.01
Ξ0.012
Ξ0.0162
Ξ0.026
Note : You have to add the following imports :
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import sys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver=webdriver.Chrome('chromedriver',options=options)
driver.get("https://rentry.co/wftw8/edit")
driver.implicitly_wait(20)
#print (driver.page_source)
try:
# here I selected the **span** element that I talk above
span = driver.find_element_by_xpath('//*[#id="text"]/div/div[5]/div[1]/div/div/div/div[5]/pre/span')
# change the innerText thwough js
driver.execute_script('arguments[0].innerText="Hello boy"', span)
# just wait for the id_edit_code to be present
edit = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "id_edit_code")))
edit.send_keys("iRfiNq6M")
# and same as you had it
#driver.find_element_by_id("submitButton").send_keys(Keys.ENTER)
#driver.find_element_by_link_text("Save").click()
driver.find_element_by_id("submitButton").click()
except:
print("Oops!", sys.exc_info()[0], "occurred.")
finally:
driver.close()
print("done")
There is no exception but still updating of the text is not reflecting in the url?
Even though there is timer which is enough for the whole code to get processed.
Then also there is no updation.
Currently the text is Hello if u visit the URL but I want it to be Hello boy using selenium , which is done by the below code line:
span = driver.find_element_by_xpath('//*[#id="text"]/div/div[5]/div[1]/div/div/div/div[5]/pre/span')
# change the innerText thwough js
driver.execute_script('arguments[0].innerText="Hello boy"', span)
But no updation!!?
I want to download files by clicking on Download icon on Chrome browser.
I tried several ways like Xpath and CSS but it doesn't worked. Please let me know if there is any solution on this using Python 3.x and selenium.
Below is code that I have tried,
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
class TEAutomation:
def automateTask(self):
chromeOptions = Options()
chromeOptions.add_experimental_option("prefs",{"download.default_directory": "/home/vishal/Documents/PythonProgram/"})
baseUrl = "https://www.te.com/commerce/DocumentDelivery/DDEController?Action=showdoc&DocId=Customer+Drawing%7F160743%7FM2%7Fpdf%7FEnglish%7FENG_CD_160743_M2.pdf%7F160743-1"
driver = webdriver.Chrome(executable_path="/home/vishal/PycharmProjects/VSProgramming/drivers/chromedriver",chrome_options=chromeOptions)
driver.maximize_window()
driver.get(baseUrl)
driver.implicitly_wait(10)
driver.find_element(By.XPATH,'//*[#id="download"]').click()
#driver.find_element(By.CSS_SELECTOR, '#download').click()
time.sleep(5)
driver.quit()
molexAuto = TEAutomation()
molexAuto.automateTask()
Thank you in advance.
Maybe the element is still not loaded when you try to click it, try waiting for it with WebDriverWait, I don't have chrome so you will have to test this yourself:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
class TEAutomation:
def automateTask(self):
chromeOptions = Options()
prefs = {
"download.default_directory": "/home/vishal/Documents/PythonProgram/",
"plugins.always_open_pdf_externally": True
}
chromeOptions.add_experimental_option("prefs", prefs)
baseUrl = "https://www.te.com/commerce/DocumentDelivery/DDEController?Action=showdoc&DocId=Customer+Drawing%7F160743%7FM2%7Fpdf%7FEnglish%7FENG_CD_160743_M2.pdf%7F160743-1"
driver = webdriver.Chrome(executable_path="/home/vishal/PycharmProjects/VSProgramming/drivers/chromedriver",chrome_options=chromeOptions)
driver.implicitly_wait(10)
driver.maximize_window()
driver.get(baseUrl)
time.sleep(5)
driver.quit()
molexAuto = TEAutomation()
molexAuto.automateTask()
zillow picture
I have the above image at https://www.zillow.com/homes/for_sale/7132668_zpid/globalrelevanceex_sort/60.780619,-65.522461,4.521666,-125.551758_rect/3_zm/
I cant seem to find the selector for tax history.
I tried to use driver wait but the table that is output is the price history not tax history.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
options = webdriver.ChromeOptions()
options.add_argument(f"user-agent={useragent[0]}")
options.add_argument('--proxy-server=%s' % ips[0])
options.add_argument('--incognito')
chromedriver = '~/Downloads/chromedriver'
chromedriver = os.path.expanduser(chromedriver)
driver = webdriver.Chrome(chromedriver, chrome_options=options)
driver.get('https://www.zillow.com/homes/for_sale/7132668_zpid/globalrelevanceex_sort/60.673178,-74.663086,4.653079,-116.323243_rect/3_zm/2_p/')
wait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, "hdp-collapse"))).click()
table = wait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div#hdp-tax-history")))
Looks like you need some waits (and maybe some clicks to get that tab visible. You can write out the table. The below is just to show how you can access
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
url = 'https://www.zillow.com/homes/for_sale/7132668_zpid/globalrelevanceex_sort/53.566414,-73.081055,17.434511,-118.081055_rect/3_zm/'
d = webdriver.Chrome()
d.get(url)
WebDriverWait(d,20).until(EC.presence_of_element_located((By.ID , 'price-and-tax-history'))).click()
tabs = WebDriverWait(d,5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".zsg-tab-link")))
tabs[1].click()
print(d.find_element_by_css_selector('#hdp-tax-history table').text) # just to show present