I'm learning web scraping, and I need the webdriver to wait until the user selects a start date and an end date from an existing calendar, then read the selection so I can process the availabilities in that period. I hope somebody can help me!
here's the part of the code:
# Wait until the calendar grid tables are present, then scan the first
# month's day cells until the user-selected start date is found and click it.
# NOTE: XPath attribute selectors use '@', not '#'.
tables = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//table[@role='grid']")))
table_first_month = tables[0].find_element(By.TAG_NAME, "tbody")
# ".//td" scopes the search to this tbody; a leading "//td" on an element
# would still search the entire document and return BOTH months' cells.
all_dates = table_first_month.find_elements(By.XPATH, ".//td[@role='gridcell']")
for date in all_dates:
    date_span = date.find_element(By.TAG_NAME, "span")
    aria_label_span = date_span.get_attribute("aria-label")
    print(aria_label_span)
    # userStartDate must already hold the user's chosen date in the same
    # text form as the aria-label before this loop runs.
    if aria_label_span == str(userStartDate):
        date_span.click()
        time.sleep(4)
        break
This code gets the available dates in the calendar for the two months shown, and verifies that the given date (which the user will select) exists, with the help of this function:
def press_right_arrow_until_date_is_found(date):
    """Advance the calendar with the right-arrow button until *date* is visible.

    Args:
        date: the target date; date_formater(date) must produce the text
            form in which the calendar view displays it.
    """
    # Absolute XPath of the calendar view container (two visible months);
    # hoisted to one name so it is not duplicated below.
    calendar_xpath = ("/html[1]/body[1]/div[2]/div[1]/div[3]/div[1]/div[2]/div[1]"
                      "/div[1]/div[1]/div[1]/form[1]/div[1]/div[3]/div[4]/div[1]/div[1]")
    # get the text of the initial calendar
    current_calendar = driver.find_element(By.XPATH, calendar_xpath).text
    # while the date does not appear in the calendar view press right arrow until it does
    # (XPath attribute selectors use '@', not '#')
    while date_formater(date) not in current_calendar:
        right_arrow = driver.find_element(
            By.XPATH,
            "//button[@class='fc63351294 a822bdf511 e3c025e003 fa565176a8 cfb238afa1 ae1678b153 c9fa5fc96d be298b15fa']")
        right_arrow.click()
        current_calendar = driver.find_element(By.XPATH, calendar_xpath).text
Using Selenium I'm downloading some files from a webpage. On Mondays I need to download the info for Friday, Saturday, and Sunday. Every other day I only need yesterday's. I wrote an if/else statement to accomplish this and just copy-pasted the code into the else statement. There must be a more Pythonic way to write this, but I'm still new to this.
today = datetime.date.today()
yesterday = str(today - timedelta(days=1))
if today.weekday() == 0:
    # Monday: pick up Friday and Saturday as well as yesterday (Sunday).
    fri = str(today - timedelta(days=3))
    sat = str(today - timedelta(days=2))
    weekend = [fri, sat, yesterday]
    for day in weekend:
        # Needs to go first otherwise page won't load
        # (XPath attribute selectors use '@', not '#')
        date_field = driver.find_element_by_xpath(
            """//*[@id="id blah blah"]""")
        date_field.send_keys(day)
        org_list = driver.find_element_by_xpath(
            """//*[@id="id blah blah"]/option[text()="string"]""").click()
        delay = 5
        try:
            # Wait for the results table before exporting.
            table_chk = WebDriverWait(driver, delay).until(
                EC.presence_of_element_located((By.XPATH, """//*[@id="id blah blah"]""")))
            export_btn = driver.find_element_by_xpath(
                """//*[@id="id blah blah"]""")
            export_btn.click()
            # Reset the form so the next day in the loop starts clean.
            date_field = driver.find_element_by_xpath(
                """//*[@id="id blah blah"]""")
            date_field.clear()
            org_list = driver.find_element_by_xpath(
                """//*[@id="id blah blah"]/option[1]""").click()
        except TimeoutException:
            print("Loading took too much time!")
        time.sleep(2)
else:
    # Any other weekday: only yesterday's data is needed.
    # Needs to go first otherwise it doesn't work
    date_field = driver.find_element_by_xpath(
        """//*[@id="id blah blah"]""")
    date_field.send_keys(yesterday)
    org_list = driver.find_element_by_xpath(
        """//*[@id="id blah blah"]/option[text()="string"]""").click()
    delay = 5
    try:
        table_chk = WebDriverWait(driver, delay).until(
            EC.presence_of_element_located((By.XPATH, """//*[@id="id blah blah"]""")))
        export_btn = driver.find_element_by_xpath(
            """//*[@id="id blah blah"]""")
        export_btn.click()
    except TimeoutException:
        print("Loading took too much time!")
How can I efficiently repeat the code but have it run multiple times on Monday for Fri, Sat, Sun and just once for the day before, every other day of the week?
Make it always loop, but programmatically define the collection to loop over as a single element most of the time, and multiple days when needed:
today = datetime.date.today()
# No separate `yesterday` variable is needed; the offsets below cover it.
# Mondays must cover the whole weekend (Fri, Sat, Sun); every other
# weekday only needs the single previous day.
offsets = (3, 2, 1) if today.weekday() == 0 else (1,)
days = [today - timedelta(days=offset) for offset in offsets]
# `days` is now either a one-element list (just yesterday) or the three
# weekend days, so a single loop body handles both cases unchanged.
for day in days:
    ...  # complete body of the original `for day in weekend:` loop goes here
If you really want to get code duplication to a minimum, you could reduce the code before the loop to:
today = datetime.date.today()
# Look back over 3 prior days on Mondays (the weekend), otherwise just 1;
# counting down keeps the resulting list in chronological order.
lookback = 3 if today.weekday() == 0 else 1
days = [today - timedelta(days=delta) for delta in range(lookback, 0, -1)]
since really, all that differs is how many prior days you need to check, 1 or 3, so the conditional can simplify to a one-liner choosing between the two, and the rest is just based on that initial decision point.
I would like to select the popup date table/calendar from the below website by using selenium. i tried to add double click function in it, but it was failed to select the date that i wanted.
from selenium.webdriver.common.action_chains import ActionChains

# ActionChains must be instantiated before double_click() can be used;
# the original referenced `actions` without ever creating it.
actions = ActionChains(driver)

driver.get('http://www.hkexnews.hk/sdw/search/searchsdw_c.aspx')
ticker = '00001'
# find_element_by_xpath needs an XPath expression; "#date-picker-popup"
# is a CSS id selector, so address the element by @id in XPath instead.
menu = driver.find_element_by_xpath('//*[@id="date-picker-popup"]')
menu.click()
# Keep the element reference separate from the click: .click() returns
# None, so passing its result to double_click() can never work.  Each
# chained action also needs .perform() to actually execute.
ccass_search_year = driver.find_element_by_xpath('//*[@id="date-picker"]/div[1]/b[1]/ul/li[2]/button')
actions.double_click(ccass_search_year).perform()
ccass_search_month = driver.find_element_by_xpath('//*[@id="date-picker"]/div[1]/b[2]/ul/li[4]/button')
actions.double_click(ccass_search_month).perform()
ccass_search_day = driver.find_element_by_xpath('//*[@id="date-picker"]/div[1]/b[3]/ul/li[4]/button')
actions.double_click(ccass_search_day).perform()
driver.find_element_by_xpath('//*[@id="txtStockCode"]').send_keys(ticker)
driver.find_element_by_xpath('//*[@id="btnSearch"]').click()
The date you are trying to select is disabled. You can't select 3rd April 2018. You can only select from 10th April and I'm guessing that it will be disabled tomorrow.
Sorry to say, you are too late. Note the class name used for the disabled dates as well.
I edited it a little bit; the problem was that although I found all those buttons in the popup date picker, I could not get the result for the date that I selected:
# XPath lookup tables for the date-picker buttons.  They must be defined
# BEFORE they are indexed -- the original looked them up first, which
# raises NameError -- and XPath attribute selectors use '@', not '#'.
# The month/day tables follow one pattern, so build them with a
# comprehension instead of 40+ hand-written entries.
year_list = {
    '2018': '//*[@id="date-picker"]/div[1]/b[1]/ul/li[1]/button',
    '2019': '//*[@id="date-picker"]/div[1]/b[1]/ul/li[2]/button',
}
month_list = {str(m): '//*[@id="date-picker"]/div[1]/b[2]/ul/li[{}]/button'.format(m)
              for m in range(1, 13)}
day_list = {str(d): '//*[@id="date-picker"]/div[1]/b[3]/ul/li[{}]/button'.format(d)
            for d in range(1, 32)}

year = year_list['2019']
month = month_list['4']
day = day_list['4']

driver.get('http://www.hkexnews.hk/sdw/search/searchsdw_c.aspx')
driver.find_element_by_xpath('//*[@id="txtShareholdingDate"]').click()
# Keep each element reference: .click() returns None, and double_click()
# needs both a real element and a trailing .perform() to execute.
ccass_search_year = driver.find_element_by_xpath(year)
actions.double_click(ccass_search_year).perform()
ccass_search_month = driver.find_element_by_xpath(month)
actions.double_click(ccass_search_month).perform()
ccass_search_day = driver.find_element_by_xpath(day)
actions.double_click(ccass_search_day).perform()
driver.find_element_by_xpath('//*[@id="txtStockCode"]').send_keys(ticker)
driver.find_element_by_xpath('//*[@id="btnSearch"]').click()
I created a web scraping program that opens several URLs, checks which of the URLs has information related to tomorrow's date, and then prints some specific information that is on that URL. My problem is that sometimes none of the URLs in the list has information concerning tomorrow. In that case, I would like the program to print other information like "no data found". How could I accomplish that? Another doubt I have: do I need the while loop at the beginning? Thanks.
My code is:
from datetime import datetime, timedelta

# Tomorrow's date in the same dd-mm-YYYY text form shown on the pages.
tomorrow = datetime.now() + timedelta(days=1)
tomorrow = tomorrow.strftime('%d-%m-%Y')

day = ""
# NOTE(review): this outer loop never terminates if no URL carries
# tomorrow's date -- see the for-else construct suggested below.
while day != tomorrow:
    for url in list_urls:
        browser.get(url)
        time.sleep(1)
        # The page date is the last 10 characters of the element text
        # (dd-mm-YYYY).  XPath attribute selectors use '@', not '#'.
        dia_page = browser.find_element_by_xpath("//*[@id='item2']/b").text
        dia_page = dia_page[-10:]
        # Round-trip through strptime to normalise the text into the
        # exact '%d-%m-%Y' format used for the comparison.
        day_uns = datetime.strptime(dia_page, "%d-%m-%Y")
        day = day_uns.strftime('%d-%m-%Y')
        if day == tomorrow:
            meals = browser.find_elements_by_xpath("//*[@id='item2']/span")
            meal_reg = browser.find_element_by_xpath("//*[@id='item_frm']/span[1]").text
            sopa2 = (meals[0].text)
            refeicao2 = (meals[1].text)
            sobremesa2 = (meals[2].text)
            print(meal_reg)
            print(sopa2)
            print(refeicao2)
            print(sobremesa2)
            break
No need for a while loop, you can use the for-else Python construct for this:
for url in list_urls:
    # do stuff
    if day == tomorrow:
        # do and print stuff
        break
else:  # break never encountered: the loop finished without a match
    print("no data found")
I came across a very useful set of scripts on Shane Lynn's blog for the analysis of weather data. The first script, used to scrape data from Weather Underground, is as follows:
import requests
import pandas as pd
from dateutil import parser, rrule
from datetime import datetime, time, date
import time
def getRainfallData(station, day, month, year):
    """
    Function to return a data frame of minute-level weather data for a single Wunderground PWS station.

    Args:
        station (string): Station code from the Wunderground website
        day (int): Day of month for which data is requested
        month (int): Month for which data is requested
        year (int): Year for which data is requested

    Returns:
        Pandas Dataframe with weather data for specified station and date,
        or None if the response could not be parsed.
    """
    # `io` is not imported at the top of this script; without it the
    # pd.read_csv call below raises NameError, which the broad except
    # silently converted into "Issue with date" for every single day.
    import io

    url = "http://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID={station}&day={day}&month={month}&year={year}&graphspan=day&format=1"
    full_url = url.format(station=station, day=day, month=month, year=year)
    # Request data from wunderground data
    response = requests.get(full_url, headers={'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})
    data = response.text
    # remove the excess <br> from the text data
    data = data.replace('<br>', '')
    # Convert to pandas dataframe (fails if issues with weather station)
    try:
        dataframe = pd.read_csv(io.StringIO(data), index_col=False)
        dataframe['station'] = station
    except Exception as e:
        # Include the exception so the real cause is visible instead of
        # being reduced to a generic one-line message.
        print("Issue with date: {}-{}-{} for station {}: {}".format(day, month, year, station, e))
        return None
    return dataframe
# Generate a list of all of the dates we want data for
start_date = "2016-08-01"
end_date = "2016-08-31"
start = parser.parse(start_date)
end = parser.parse(end_date)
dates = list(rrule.rrule(rrule.DAILY, dtstart=start, until=end))
# Create a list of stations here to download data for
# NOTE(review): the error log shows station ILONDONL28 but this code
# requests ILONDON28 -- verify the station code (out by one character).
stations = ["ILONDON28"]
# Set a backoff time in seconds if a request fails
backoff_time = 10
data = {}
# Gather data for each station in turn and save to CSV.
for station in stations:
    print("Working on {}".format(station))
    data[station] = []
    for date in dates:
        # Print period status update messages
        if date.day % 10 == 0:
            print("Working on date: {} for station {}".format(date, station))
        done = False
        while not done:
            try:
                weather_data = getRainfallData(station, date.day, date.month, date.year)
                done = True
            except ConnectionError as e:
                # May get rate limited by Wunderground.com, backoff if so.
                print("Got connection error on {}".format(date))
                print("Will retry in {} seconds".format(backoff_time))
                # BUG FIX: sleep for the configured backoff_time, not a
                # hard-coded 10, so the wait matches the printed message.
                time.sleep(backoff_time)
        # Add each processed date to the overall data
        data[station].append(weather_data)
    # Finally combine all of the individual days and output to CSV for analysis.
    pd.concat(data[station]).to_csv("data/{}_weather.csv".format(station))
However, I get the error:
Working on ILONDONL28
Issue with date: 1-8-2016 for station ILONDONL28
Issue with date: 2-8-2016 for station ILONDONL28
Issue with date: 3-8-2016 for station ILONDONL28
Issue with date: 4-8-2016 for station ILONDONL28
Issue with date: 5-8-2016 for station ILONDONL28
Issue with date: 6-8-2016 for station ILONDONL28
Can anyone help me with this error?
The data for the chosen station and the time period is available, as shown at this link.
The output you are getting is because an exception is being raised. If you added a `print(e)` you would see that this is because `import io` was missing from the top of the script. Secondly, the station name you gave was off by one character. Try the following:
import io
import requests
import pandas as pd
from dateutil import parser, rrule
from datetime import datetime, time, date
import time
def getRainfallData(station, day, month, year):
    """
    Function to return a data frame of minute-level weather data for a single Wunderground PWS station.

    Args:
        station (string): Station code from the Wunderground website
        day (int): Day of month for which data is requested
        month (int): Month for which data is requested
        year (int): Year for which data is requested

    Returns:
        Pandas Dataframe with weather data for specified station and date,
        or None if the station's data could not be parsed.
    """
    url = "http://www.wunderground.com/weatherstation/WXDailyHistory.asp?ID={station}&day={day}&month={month}&year={year}&graphspan=day&format=1"
    full_url = url.format(station=station, day=day, month=month, year=year)
    # Request data from wunderground data
    response = requests.get(full_url)
    data = response.text
    # remove the excess <br> from the text data
    data = data.replace('<br>', '')
    # Convert to pandas dataframe (fails if issues with weather station)
    try:
        dataframe = pd.read_csv(io.StringIO(data), index_col=False)
        dataframe['station'] = station
    except Exception as e:
        # Print the exception itself -- otherwise `e` is captured and
        # dropped, hiding the actual cause (exactly the debugging gap
        # that made the original missing-import bug hard to find).
        print("Issue with date: {}-{}-{} for station {}: {}".format(day, month, year, station, e))
        return None
    return dataframe
# Generate a list of all of the dates we want data for
start_date = "2016-08-01"
end_date = "2016-08-31"
start = parser.parse(start_date)
end = parser.parse(end_date)
dates = list(rrule.rrule(rrule.DAILY, dtstart=start, until=end))
# Create a list of stations here to download data for
stations = ["ILONDONL28"]
# Set a backoff time in seconds if a request fails
backoff_time = 10
data = {}
# Gather data for each station in turn and save to CSV.
for station in stations:
    print("Working on {}".format(station))
    data[station] = []
    for date in dates:
        # Print period status update messages
        if date.day % 10 == 0:
            print("Working on date: {} for station {}".format(date, station))
        done = False
        while not done:
            try:
                weather_data = getRainfallData(station, date.day, date.month, date.year)
                done = True
            except ConnectionError as e:
                # May get rate limited by Wunderground.com, backoff if so.
                print("Got connection error on {}".format(date))
                print("Will retry in {} seconds".format(backoff_time))
                # BUG FIX: sleep for the configured backoff_time, not a
                # hard-coded 10, so the wait matches the printed message.
                time.sleep(backoff_time)
        # Add each processed date to the overall data
        data[station].append(weather_data)
    # Finally combine all of the individual days and output to CSV for analysis.
    pd.concat(data[station]).to_csv(r"data/{}_weather.csv".format(station))
Giving you an output CSV file starting as follows:
,Time,TemperatureC,DewpointC,PressurehPa,WindDirection,WindDirectionDegrees,WindSpeedKMH,WindSpeedGustKMH,Humidity,HourlyPrecipMM,Conditions,Clouds,dailyrainMM,SoftwareType,DateUTC,station
0,2016-08-01 00:05:00,17.8,11.6,1017.5,ESE,120,0.0,0.0,67,0.0,,,0.0,WeatherCatV2.31B93,2016-07-31 23:05:00,ILONDONL28
1,2016-08-01 00:20:00,17.7,11.0,1017.5,SE,141,0.0,0.0,65,0.0,,,0.0,WeatherCatV2.31B93,2016-07-31 23:20:00,ILONDONL28
2,2016-08-01 00:35:00,17.5,10.8,1017.5,South,174,0.0,0.0,65,0.0,,,0.0,WeatherCatV2.31B93,2016-07-31 23:35:00,ILONDONL28
If you are not getting a CSV file, I suggest you add a full path to the output filename.