Unclickable element with selenium Python - python

I used web scrapping through Python with selenium in order to get daily price values for EEX French Power futures at the url "https://www.eex.com/en/market-data/power/futures#%7B%22snippetpicker%22%3A%2221%22%7D".
I guess they updated their website as the url changed recently, and now my script doesn't work properly anymore as I can't find a way to click on each displayed product button (Year, Quarter, Month, Weekend, Day).
Here is my code until the step that doesn't work (it simply doesn't click, it doesn't fail) :
import time
import datetime
from datetime import date
from dateutil.relativedelta import relativedelta
import pyodbc
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
url = "https://www.eex.com/en/market-data/power/futures#%7B%22snippetpicker%22%3A%2221%22%7D"
dico_product = ('Day', 'Weekend', 'Week', 'Month', 'Quarter', 'Year')
now = datetime.datetime.now()
date_prx = now.date()
options=Options()
d = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
d.get(url)
time.sleep(6)
d.maximize_window()
cookies_button_str = "//input[#class='btn bordered uo_cookie_btn_type_1']"
d.find_element(By.XPATH, cookies_button_str).click()
time.sleep(4)
dateinput_button_str = "//div[#class = 'mv-date-input']//div[#class = 'mv-stack-block']//input[#class = 'mv-input-box']"
Date_input = date_prx
Date_input_str = str(Date_input.year) + '-' + str(Date_input.month) + '-' + str(Date_input.day)
element_view = d.find_element(By.CLASS_NAME, 'collapsed')
d.execute_script("arguments[0].scrollIntoView()", element_view)
WebDriverWait(d, 20).until(EC.presence_of_element_located((By.XPATH, dateinput_button_str)))
element = d.find_element(By.XPATH, dateinput_button_str)
time.sleep(2)
d.execute_script('arguments[0].value = "' + str(Date_input_str) + '";', element)
time.sleep(2)
element_button_str = './/div[contains(#class, "mv-button-base mv-hyperlink-button")]'
containers = d.find_elements(By.XPATH, element_button_str)
for item in containers:
if item.text in dico_product:
print('Traitement ' + str(item.text) + ' pour la date ' + str(Date_input_str) + '.')
element_button_str = './/div[contains(#class, "' + str(item.get_attribute("class")) + '") and contains(., "' + str(item.text) + '")]'
product_button = d.find_element(By.XPATH, element_button_str)
d.execute_script("arguments[0].click()", product_button)
It does find the element to click on, but it doesn't click.
What is suprising is that if you take the old url, that get you to the Austrian futures by default, it works fine. But if you take the proper url, it doesn't.
I don't know if it can be done or if it's no use, but honestly I tried everything I could think of. Could you gently help me ?
Thank you

Related

Selenium - selecting date picker from skyscanner

I couldn't find a way to automate the date picker using Selenium.
from selenium import webdriver
from getpass import getpass
import pandas as pd
import numpy as np
import requests
import lxml
url = "https://www.skyscanner.ca"
driver = webdriver.Chrome("chromedriver")
driver.maximize_window()
driver.get(url)
trip_type_id = "fsc-trip-type-selector-one-way"
trip_type_select = driver.find_element(by="id", value=trip_type_id )
trip_type_select.click()
origin_textbox = driver.find_element(by="id", value="fsc-origin-search" )
origin_value = "Vancouver (Any)"
destin_textbox = driver.find_element(by="id", value="fsc-destination-search")
dest_value = "Dubai (Any)"
destin_textbox.send_keys(dest_value)
I am stuck at the datepicker
driver.find_element(by="id", value="depart-fsc-datepicker-button" ).click()
opens the datepicker as
enter image description here
Is there a way to automate by passing a specific date ("December 12, 2022") with selenium?.
After analyzing the DOM of the page I found that the calendar class has dropdown for month_year and is available in the option tab. The element can be selected either on the basis of text or on the basis class name. I have used text here.
Similarly the date part can be selected by selecting the span on text which is present inside the button.
Here is the code that selects the given date on the page.
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# REPLACE YOUR CHROME PATH HERE
chrome_path = r"C:\Users\hpoddar\Desktop\Tools\chromedriver_win32\chromedriver.exe"
s = Service(chrome_path)
driver = webdriver.Chrome(service=s)
driver.get('https://www.skyscanner.ca')
driver.maximize_window()
date = 'December 12, 2022' # ENTER YOUR DATE HERE
# Extracting day and month_year from the date
m = re.search(r'([A-Za-z]+) (\d{2}), (\d{4})', date)
day, month_year = m.group(2), m.group(1) + " " + m.group(3)
driver.find_element(by="id", value="depart-fsc-datepicker-button" ).click()
click_on_depart_dropdown = driver.find_element(by="id", value="depart-calendar__bpk_calendar_nav_select" )
click_on_depart_dropdown.click()
monthyear = click_on_depart_dropdown.find_element(By.XPATH, f'//option[contains(text(), "{month_year}")]')
monthyear.click()
day_element = click_on_depart_dropdown.find_element(By.XPATH, f'//button[contains(#class, "BpkCalendarDate_bpk-calendar-date__MTdlO")]//span[contains(text(), "{day}")]')
day_element.click()
The above example was for Depart, similarly it can be done for Return.

Getting first sibling of second instance using Xpath

I'm trying to extract information from this page:
I'm trying to extract the time (6:30 PM).
My strategy is to find the second instance of the date (Mar. 31st, 2022), and then get the first sibling of that. Photo here (I want the part boxed in yellow):
Here's what I've tried:
#Get First Date (Date at top of the page)
try:
first_date = driver.find_elements_by_css_selector('a[href^="https://www.bandsintown.com/a/"] + div + div')
first_date = first_date[0].text
except (ElementNotVisibleException, NoSuchElementException, TimeoutException):
print ("first_date doesn't exist")
continue
#Get time. This will the first sibling of the second instance of date
try:
event_time = driver.find_elements_by_xpath("//div[text()='" + first_date + "'][1]/following-sibling::div")
print(event_time[0].text)
except (ElementNotVisibleException, NoSuchElementException, TimeoutException):
continue
However, this is not getting me what I want. What am I doing wrong here? I'm looking for a way to get the first sibling of the second instance using Xpath.
It seems it is first element with PM / AM so I would use find_element with
'//div[contains(text(), " PM") or contains(text(), " AM")]'
like this
item = driver.find_element(By.XPATH, '//div[contains(text(), " PM") or contains(text(), " AM")]')
print(item.text)
I use space before PM/AM to make sure it is not inside word.
Your xpath works when I add ( ) so it first gets divs and later select by index.
Without () it may treats [text()="..."][1] like [text()="..." and 1].
And it needs [2] instead of [1] because xpath start counting at 1, not 0
"(//div[text()='" + first_date + "'])[2]/following-sibling::div"
Full working example
from selenium import webdriver
from selenium.webdriver.common.by import By
#from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
import time
url = 'https://www.bandsintown.com/e/103275458-nayo-jones-at-promise-of-justice-initiative?came_from=253&utm_medium=web&utm_source=city_page&utm_campaign=event'
#driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
driver.get(url)
time.sleep(5)
item = driver.find_element(By.XPATH, '//div[contains(text(), " PM") or contains(text(), " AM")]')
print(item.text)
print('---')
first_date = driver.find_elements(By.CSS_SELECTOR, 'a[href^="https://www.bandsintown.com/a/"] + div + div')
first_date = first_date[0].text
event_time = driver.find_elements(By.XPATH, "(//div[text()='" + first_date + "'])[2]/following-sibling::div")
print(event_time[0].text)
The following xpath will give you date and time.
date:
print(driver.find_element_by_xpath("//a[text()='Promise of Justice Initiative']/following::div[4]").text)
time:
print(driver.find_element_by_xpath("//a[text()='Promise of Justice Initiative']/following::div[5]").text)
or your use.
print(driver.find_element_by_xpath("
//a[contains(#href,'https://www.bandsintown.com/v/')]/following::div[contains(text(), 'PM') or contains(text(), 'AM')]").text)

URL not updating when iterating through URLs

The following code does the following:
1 Opens a specific URL (for the first date YYYY-MM-DD);
2 getURL() generates all URLs with all dates in a specific date range (starting from the second day);
3 Opens new tab with the first date generated by getURL();
4 Goes back to previous tab and closes it;
5 Repeat steps 3 and 4.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from datetime import datetime, timedelta
# Load Chrome driver and movement.uber.com/cities website
PATH = 'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(PATH)
# Attributing the city name and the center-most zone code (or origin) to variables so they can be inserted in the URL later
city = 'atlanta'
origin_code = '1074'
coordinates = '&lat.=33.7489&lng.=-84.4234622&z.=12'
# Open URL for the first day in the desired city (change coordinates depending on city)
driver.get('https://movement.uber.com/explore/' + city + '/travel-times/query?si' + origin_code + '&ti=&ag=taz&dt[tpb]=ALL_DAY&dt[wd;]=1,2,3,4,5,6,7&dt[dr][sd]=' +
'2016-01-02' + '&dt[dr][ed]=' + '2016-01-02' + '&cd=&sa;=&sdn=' + coordinates + '&lang=en-US')
# Generating the correct URLs for each date
def getURL():
date = datetime(2016,1,4)
while date <= datetime(2020,3,31):
yield ('https://movement.uber.com/explore/' + city + '/travel-times/query?si' + origin_code + '&ti=&ag=taz&dt[tpb]=ALL_DAY&dt[wd;]=1,2,3,4,5,6,7&dt[dr][sd]=' +
date.strftime('%Y-%m-%d') + '&dt[dr][ed]=' + date.strftime('%Y-%m-%d') + '&cd=&sa;=&sdn=&lat.=33.7489&lng.=-84.4234622&z.=12&lang=en-US')
date += timedelta(days=1)
# Open new tab
i = 0
for url in getURL():
i += 1
if i < 3:
driver.execute_script("window.open(url)")
# Switch to previous tab and close it (leaving us with the newly above opened tab)
tabs = driver.window_handles
if len(tabs) > 1:
driver.switch_to.window(tabs[0])
driver.close()
driver.switch_to.window(tabs[1])
The problem: every time a new tab/"window" is opened, the code opens the URL with the first date YYYY-MM-DD, completely ignoring the URLs generated by getURL().
The question: how do I open a new tab with the next date, close the previous one, repeat?
My ultimate goal: to download datasets that are inside each distinct URL (but the code for that is irrelevant for the problem here). Obs.: I use the Selenium library for that.
you can maybe try to put all the urls you create in a list and then return it
like this :
def getURL():
tab = []
date = datetime(2016,1,4)
while date <= datetime(2020,3,31):
url ='https://movement.uber.com/explore/' + city + '/travel-times/query?si' + origin_code + '&ti=&ag=taz&dt[tpb]=ALL_DAY&dt[wd;]=1,2,3,4,5,6,7&dt[dr][sd]=' +
date.strftime('%Y-%m-%d') + '&dt[dr][ed]=' + date.strftime('%Y-%m-%d') + '&cd=&sa;=&sdn=&lat.=33.7489&lng.=-84.4234622&z.=12&lang=en-US'
tab.append(url)
date += timedelta(days=1)
return tab
The error lies on how you're launching a tab.
When i changed FROM:
driver.execute_script("window.open(url)")
TO:
driver.execute_script("window.open('"+url+"','_blank')")
The script executed perfectly on my machine.
It looks like url inside your approach is not updated on the iteration. If you make it into a parameter of the for loop it is resolved each time.
Have a look here for more info on javascript on how to open a window (FYI - you can also do _self instead of _blank to replace the current window - this might mitigate your need for tab management).
Results of my test...
This is the first iteration:
This is the second iteration:
For reference This is the entire script i ran: (note that i updated i for more iterations, the chromedriver PATH for my machine + added couple of prints)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from datetime import datetime, timedelta
# Load Chrome driver and movement.uber.com/cities website
#PATH = ''# - mine lives local -> 'C:\Program Files (x86)\chromedriver.exe'
#driver = webdriver.Chrome(PATH)
driver = webdriver.Chrome()
# Attributing the city name and the center-most zone code (or origin) to variables so they can be inserted in the URL later
city = 'atlanta'
origin_code = '1074'
coordinates = '&lat.=33.7489&lng.=-84.4234622&z.=12'
# Open URL for the first day in the desired city (change coordinates depending on city)
driver.get('https://movement.uber.com/explore/' + city + '/travel-times/query?si' + origin_code + '&ti=&ag=taz&dt[tpb]=ALL_DAY&dt[wd;]=1,2,3,4,5,6,7&dt[dr][sd]=' +
'2016-01-02' + '&dt[dr][ed]=' + '2016-01-02' + '&cd=&sa;=&sdn=' + coordinates + '&lang=en-US')
# Generating the correct URLs for each date
def getURL():
date = datetime(2016,1,4)
while date <= datetime(2020,3,31):
yield ('https://movement.uber.com/explore/' + city + '/travel-times/query?si' + origin_code + '&ti=&ag=taz&dt[tpb]=ALL_DAY&dt[wd;]=1,2,3,4,5,6,7&dt[dr][sd]=' +
date.strftime('%Y-%m-%d') + '&dt[dr][ed]=' + date.strftime('%Y-%m-%d') + '&cd=&sa;=&sdn=&lat.=33.7489&lng.=-84.4234622&z.=12&lang=en-US')
date += timedelta(days=1)
# Open new tab
i = 0
print("urls: %i", len(list(getURL())))
for url in getURL():
i += 1
if i < 10:
driver.execute_script("window.open('"+url+"','_blank')")
print (url)
# Switch to previous tab and close it (leaving us with the newly above opened tab)
tabs = driver.window_handles
if len(tabs) > 1:
driver.switch_to.window(tabs[0])
driver.close()
driver.switch_to.window(tabs[1])

Automation of stock updates

I'm building a bot to print prizes of bonds from hour to hour. However, I am incurring in two errors:
Since the task requires the browser to open from time to time, it ruins the experience while using the notebook. Is there a way to keep this task as a 'background' rule?
I am using schedule library to set the update, but I am not quite sure it is right (even though I read the manual). Or the time set is not respected (I set to 10 minutes and the code is read from 5 to 5) or the function time it is not updated (it repeats minutes/hours/seconds).
The code is below:
import sys
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
import time
import datetime
import schedule
clock = datetime.datetime.now()
def preço():
os.chdir('C:/Users/Thiago/Desktop/Backup/Python')
options = webdriver.ChromeOptions()
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options)
driver.get("https://www.google.com/")
elem = driver.find_element_by_name("q")
elem.clear()
elem.send_keys("cvcb3")
time.sleep(1)
elem = driver.find_element_by_name("btnK")
elem.click()
time.sleep(2)
cvcb3 = driver.find_element_by_xpath(".//span[#jsname = 'vWLAgc']")
preço_cvcb3 = open('preço_cvcb3.txt', 'a')
preço_cvcb3.write('O preço da ação da CVC é ' + cvcb3.get_attribute("innerHTML") + ' - Extração feita ás ' + clock.strftime("%I:%M:%S %p") + '.\n')
preço_cvcb3.close()
print('O preço da ação da CVC é ' + cvcb3.get_attribute("innerHTML") + ' - Extração feita ás ' + clock.strftime("%I:%M:%S %p") + '.\n')
driver.close()
schedule.every(1).minutes.do(preço)
while True:
schedule.run_pending()
time.sleep(1)

loop with BeautifulSoup to web scrape multiple pages by timestamps

I am trying to retrive the day temperature of a local weather site.
I built this loop using BeautifulSoup.
Unfortunately the loop breaks after the first round.
this is my code and the result:
code:
#coding: latin-1
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# create a file zam-data.txt
# seperated with komma
f = open('zamg-data.txt','w')
# start webdriver
driver = webdriver.Chrome("/usr/local/bin/chromedriver")
#loop through month and days
for m in range(1,13):
for d in range (1, 32):
# was the last day in a month
if (m==2 and d>28):
break
elif (m in [4,6,9,11] and d>30):
break
#open zamg site
timestamp = '2019' +'-'+ str(m) +'-'+ str(d)
print("call page of "+timestamp)
url = "https://www.zamg.ac.at/cms/de/klima/klima-aktuell/klimamonitoring/?param=t&period=period-ymd-"+timestamp
driver.get(url)
# extract temprature
html = driver.execute_script("return document.documentElement.outerHTML")
soup = BeautifulSoup(html, "html.parser")
data = soup.find_all(class_='u-txt--big')[1].string
print(len(data))
print(data + '...okay')
# format month for timestamp
if(len(str(m)) < 2):
mStamp = '0'+str(m)
else:
mStamp = str(m)
# format day for timestamp
if(len(str(d)) < 2):
dStamp = '0'+ str(d)
else:
dStamp = str(d)
# timestamp
timestamp = '2019' + mStamp + dStamp
# write time and value
f.write(timestamp + ',' + data + '\n')
# data is extracted - close
f.close()
my result:
➜ weather-app python get-data-02.py
call page of 2019-1-1
5
+3,9 ...okay
call page of 2019-1-2
Traceback (most recent call last):
File "get-data-02.py", line 37, in <module>
data = soup.find_all(class_='u-txt--big')[1].string
IndexError: list index out of range
➜ weather-app
I don't understand what is wrong here. the 2nd page is loaded in the browser but then it breaks
any Ideas?
#coding: latin-1
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import datetime
import time
base = datetime.datetime(2019,1,1).date()
date_list = [base + datetime.timedelta(days=x) for x in range(365)]
# start webdriver
driver = webdriver.Chrome("/usr/local/bin/chromedriver")
base_url = "https://www.zamg.ac.at/cms/de/klima/klima-aktuell/klimamonitoring/?param=t&period=period-ymd-"
with open('zamg-data.txt','w') as file:
for dt in date_list:
timestamp = dt.strftime("%Y-%m-%d")
print("call page of "+timestamp)
url = f"{base_url}{timestamp}"
driver.get(url)
WebDriverWait(driver, timeout=40).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "u-txt--big")))
# extract temprature
html = driver.execute_script("return document.documentElement.outerHTML")
soup = BeautifulSoup(html, "html.parser")
data = soup.find_all(class_='u-txt--big')[1].string
print(len(data))
print(data + '...okay')
# timestamp
timestamp_1 = dt.strftime("%Y%m%d")
# write time and value
file.write(timestamp_1 + ',' + data + '\n')
time.sleep(3)
driver.quit()
print("Done!!!")
As someone from the comment section mentioned, you need to make the browser wait till all elements of that class are detected. I've added an explicit time delay after each page load so that the website is not overwhelmed with requests. It is a potential way to get your IP blocked. It's best to always use a context manager, whenever you can.

Categories