Using Selenium I'm downloading some files from a webpage. On Mondays I need to download the info for Friday, Saturday, and Sunday. Every other day I only need yesterday. I wrote an if/else statement to accomplish this and just copied and pasted the code into the else statement. There must be a more Pythonic way to write this, but I'm still new to this.
# Export the report for every day that still needs it: on a Monday that is
# Friday, Saturday and Sunday; on any other day it is only yesterday.
today = datetime.date.today()
yesterday = str(today - timedelta(days=1))
if today.weekday() == 0:
    # weekday() == 0 is Monday: catch up on the whole weekend, oldest first.
    fri = str(today - timedelta(days=3))
    sat = str(today - timedelta(days=2))
    weekend = [fri, sat, yesterday]
    for day in weekend:
        # Needs to go first otherwise page won't load
        date_field = driver.find_element_by_xpath(
            """//*[@id="id blah blah"]""")
        date_field.send_keys(day)
        org_list = driver.find_element_by_xpath(
            """//*[@id="id blah blah"]/option[text()=\"string\"]""").click()
        delay = 5
        try:
            # Wait up to `delay` seconds for the results table to be present
            # before exporting.
            table_chk = WebDriverWait(driver, delay).until(
                EC.presence_of_element_located((By.XPATH, """//*[@id="id blah blah"]""")))
            export_btn = driver.find_element_by_xpath(
                """//*[@id="id blah blah"]""")
            export_btn.click()
            # Reset the form so the next day starts from a clean state.
            date_field = driver.find_element_by_xpath(
                """//*[@id="id blah blah"]""")
            date_field.clear()
            org_list = driver.find_element_by_xpath(
                """//*[@id="id blah blah"]/option[1]""").click()
        except TimeoutException:
            print("Loading took too much time!")
        time.sleep(2)
else:
    # Needs to go first otherwise it doesn't work
    date_field = driver.find_element_by_xpath(
        """//*[@id="id blah blah"]""")
    date_field.send_keys(yesterday)
    org_list = driver.find_element_by_xpath(
        """//*[@id="id blah blah"]/option[text()=\"string\"]""").click()
    delay = 5
    try:
        # Wait up to `delay` seconds for the results table to be present
        # before exporting.
        table_chk = WebDriverWait(driver, delay).until(
            EC.presence_of_element_located((By.XPATH, """//*[@id="id blah blah"]""")))
        export_btn = driver.find_element_by_xpath(
            """//*[@id="id blah blah"]""")
        export_btn.click()
    except TimeoutException:
        print("Loading took too much time!")
How can I efficiently repeat the code but have it run multiple times on Monday for Fri, Sat, Sun and just once for the day before, every other day of the week?
Make it always loop, but programmatically define the collection to loop over as a single element most of the time, and multiple days when needed:
today = datetime.date.today()
# No need to define yesterday; we'll make it as needed next
if today.weekday() == 0:
    # Today is Monday, quickly get the days for Friday-Sunday
    # (str() keeps them in the same form the original code typed into the page)
    days = [str(today - timedelta(days=i)) for i in (3, 2, 1)]
else:
    # Today is not Monday, just check yesterday
    days = [str(today - timedelta(days=1))]
# days is now either one element list of just yesterday, or the whole weekend
# loop runs once or three times, as needed, with the same code
for day in days:
    # Complete body of original for day in weekend: loop goes here
    pass
If you really want to get code duplication to a minimum, you could reduce the code before the loop to:
today = datetime.date.today()
# Monday (weekday() == 0) looks back three days, every other day just one.
num_days_to_check = 3 if today.weekday() == 0 else 1
# Oldest day first; str() keeps the dates in the form the original code sent to the page.
days = [str(today - timedelta(days=i)) for i in range(num_days_to_check, 0, -1)]
since really, all that differs is how many prior days you need to check, 1 or 3, so the conditional can simplify to a one-liner choosing between the two, and the rest is just based on that initial decision point.
Related
I'm learning web scraping and I need the webdriver to wait until the user selects a start date and end date off an existing calendar from here and read it so I can process the availabilities in that given period. I hope somebody can help me!
here's the part of the code:
# Walk the calendar cells and click the one whose aria-label matches the
# start date chosen by the user.
tables = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//table[@role='grid']")))
table_first_month = tables[0].find_element(By.TAG_NAME, "tbody")
# NOTE: an XPath that starts with "//" is evaluated against the whole
# document even when called on an element, so this collects the grid cells of
# every calendar table on the page, not only those of the first month.
all_dates = table_first_month.find_elements(By.XPATH, "//td[@role='gridcell']")
for date in all_dates:
    date_span = date.find_element(By.TAG_NAME, "span")
    aria_label_span = date_span.get_attribute("aria-label")
    print(aria_label_span)
    #userStartDate = wait.until(EC.element_to_be_clickable((this is where i need help)))
    if aria_label_span == str(userStartDate):
        date_span.click()
        time.sleep(4)
        break
This code gets the available dates in the calendar for the two months shown and checks whether the given date (which the user will select) exists, with the help of this function:
def press_right_arrow_until_date_is_found(date):
    """Click the calendar's right arrow until *date* appears in the calendar.

    The text of the calendar container is re-read after every click and
    searched for ``date_formater(date)``; the function returns as soon as the
    formatted date is contained in that text.
    """
    calendar_xpath = "/html[1]/body[1]/div[2]/div[1]/div[3]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/form[1]/div[1]/div[3]/div[4]/div[1]/div[1]"
    right_arrow_xpath = "//button[@class='fc63351294 a822bdf511 e3c025e003 fa565176a8 cfb238afa1 ae1678b153 c9fa5fc96d be298b15fa']"
    # The formatted date does not change between iterations, so compute it once.
    wanted = date_formater(date)

    # get the text of the initial calendar
    current_calendar = driver.find_element(By.XPATH, calendar_xpath).text
    # while the date does not appear in the calendar view press right arrow until it does
    while wanted not in current_calendar:
        right_arrow = driver.find_element(By.XPATH, right_arrow_xpath)
        right_arrow.click()
        current_calendar = driver.find_element(By.XPATH, calendar_xpath).text
I'm parsing all the news from a site. The first problem is to collect all the links, titles, times and dates from the archive. This is my code:
# Build one archive URL for every year/month/day combination from 2005-01-01
# through 2022-12-31. Month and day are zero-padded to two digits.
# NOTE: every month is given 31 days, so the list also contains dates that do
# not exist (e.g. 02/31).
years = list(range(2005, 2023))
months = [f'{month:02d}' for month in range(1, 13)]
days = [f'{day:02d}' for day in range(1, 32)]
lst_of_URL = [
    f'https://*******/archive/{year}/{month}/{day}'
    for year in years
    for month in months
    for day in days
]
This gives me the links to each day's news. I then go through this list and parse each day separately: from the HTML I extract the links, titles, publication dates, etc. and write them into MySQL. But it takes a lot of time — about 6 hours for the 6,600 entries in the list. How can I speed things up? I was thinking about running a couple of parallel parsers on different parts of the URL list, but I don't know if that's possible. What would you recommend? Here is the code:
# Scrape every archive page and store one row per news card in MySQL.
add_salary = ("INSERT INTO data_tsn "
              "(link_href, link_name, date) "
              "VALUES (%(link_href)s, %(link_name)s, %(date)s)")

# Open the database connection once and reuse it for every page instead of
# reconnecting for each card.
cnx = mysql.connector.connect(user='root', password='aaaa1111',
                              host='127.0.0.1',
                              database='data')
cursor = cnx.cursor()
try:
    for URL in tqdm(lst_of_URL):
        try:
            html = urlopen(URL)
        except HTTPError as e:
            # Skip pages that cannot be fetched and carry on with the next day.
            print('cant connect or server not found')
            continue
        bsObj = BeautifulSoup(html, 'lxml')
        for link in bsObj.find_all('div', {'class':'c-card__body'}):
            data_salary = {}
            data_salary['link_href'] = link.a.get('href')  # get links
            data_salary['link_name'] = link.a.get_text()  # get text from links
            for sibling in link.find('div', 'u-fx u-fx--wrap'):
                data_salary['date'] = sibling.get_text()  # get post dates and view counts
            cursor.execute(add_salary, data_salary)
        # One commit per page rather than one per inserted row.
        cnx.commit()
finally:
    # Always release the cursor and the connection, even if a page fails.
    cursor.close()
    cnx.close()
Yesterday I started to change my code a little.
One part apparently was very tricky.
Its only use was to click on the continue button so that the next click on the car symbol won't raise an error. As I learned, the error is raised because the form is in front of the button. (Apparently it sometimes works nevertheless.)
This code snipped worked perfectly,
but changing it resulted in some errors I didn't see coming.
This code should work for itself if anyone wants to test it.
The variables *_strasse, (street name '+' separated)
*_hausnummer, (House number)
*_plz, (Post Code)
*_stadt (city) worked for me.
try:
#Getting the HTML
link = f"https://www.google.de/maps/dir/{start_strasse}+{start_hausnummer},+{start_plz}+{start_stadt},+Deutschland/{end_strasse}+{end_hausnummer},+D-{end_plz}+{end_stadt},+Deutschland"
driver.get(link)
driver.implicitly_wait(6)
#Wait for the button to appear
try:
elem = driver.find_element_by_xpath('//*[#id="introAgreeButton"]').click()
except :
continue
#Find the button for routes by car
list(filter(lambda x: x.get_attribute('jstcache') == '508', driver.find_elements_by_tag_name('button')))[0].click()
#Parse the fastest times
times = [ re.findall(r'[0-9]+', x.text[:-3]) for x in list(filter(lambda x: x.get_attribute('jstcache') == '265', driver.find_elements_by_tag_name('span')))]
print('Elemente:', times )
#Select the fastest time
length = times[0][0]
#Convert hh:mm format to minutes
length = int(length[0]) if len(length) == 1 else 60*int(length[0]) + int(length[1])
print(length)
except:
#used for debugging
#print(str(runde) + f'/{len(adresses.keys())}', link)
raise
My new code looks like this, but it's not able to find the continue button by XPATH.
# Look up the driving time between two addresses on Google Maps and append
# "start,end,minutes" to the output file.
try:
    link = f"https://www.google.de/maps/dir/{start_strasse}+{start_hausnummer},+{start_plz}+{start_stadt},+Deutschland/{end_strasse}+{end_hausnummer},+D-{end_plz}+{end_stadt},+Deutschland"
    driver.get(link)
    driver.implicitly_wait(6)
    elem = driver.find_element_by_xpath('//*[@id="introAgreeButton"]').click()
    #element = driver.find_element_by_css('div[class*="U26fgb"]')
    #driver.execute_script("arguments[0].click();", element)
    #list(filter(lambda x: x.get_attribute('jstcache') == '508', driver.find_elements_by_tag_name('button')))[0].click()
    driver.find_element_by_xpath("//button[@jstcache='508']").click()
    #times = [ re.findall(r'[0-9]+', x.text[:-3]) for x in list(filter(lambda x: x.get_attribute('jstcache') == '265', driver.find_elements_by_tag_name('span')))]
    #print('Elemente:', times )
    #length = re.findall(r'[0-9]+', list(filter(lambda x: x.get_attribute('jstcache') == '265', driver.find_elements_by_tag_name('span')))[0].text[:-3])
    #length = times[0][0]
    elem = driver.find_element_by_xpath("//*[@jstcache='265']")
    # All numbers in the element text with its last three characters dropped:
    # one number is taken as minutes, two as hours and minutes.
    zeit = re.findall(r'[0-9]+', elem.text[:-3])
    length = int(zeit[0]) if len(zeit) == 1 else 60*int(zeit[0]) + int(zeit[1])
    verbindung = ','.join([start, end, str(length)]) + '\n'
    datei.write(verbindung)
    print(verbindung)
except:
    # Show which URL failed before re-raising.
    print(link)
    raise
I tested countless ideas from the web including invoking some java script functions instead of clicking, switching to an iframe, or to the active element or the active alert.
I think one of the ideas was actually right but I implemented it wrong.
Apparently implicit waits worked way better for me than any wait until what I found strange.
So I thought that my code could already have the problem inside.
I appreciate all help and comments!
It's my first post here so if I missed something pls tell me!
How the Browser looks like
The HTML of the button
I created a web scraping program that opens several URLs, checks which one of the URLs has information related to tomorrow's date, and then prints some specific information that is on that URL. My problem is that sometimes none of the URLs in that list has information concerning "tomorrow". In that case I would like the program to print something else, like "no data found". How could I accomplish that? Another doubt I have: do I need the while loop at the beginning? Thanks.
My code is:
from datetime import datetime, timedelta
# Find the page whose date is tomorrow and print its menu.
tomorrow = datetime.now() + timedelta(days=1)
tomorrow = tomorrow.strftime('%d-%m-%Y')
day = ""
# NOTE: if no URL in list_urls carries tomorrow's date, `day` never equals
# `tomorrow` and this while loop starts the for loop over again indefinitely.
while day != tomorrow:
    for url in list_urls:
        browser.get(url)
        time.sleep(1)
        dia_page = browser.find_element_by_xpath("//*[@id='item2']/b").text
        # The last ten characters of the heading are parsed as dd-mm-YYYY.
        dia_page = dia_page[-10:]
        day_uns = datetime.strptime(dia_page, "%d-%m-%Y")
        day = day_uns.strftime('%d-%m-%Y')
        if day == tomorrow:
            meals = browser.find_elements_by_xpath("//*[@id='item2']/span")
            meal_reg = browser.find_element_by_xpath("//*[@id='item_frm']/span[1]").text
            sopa2 = (meals[0].text)
            refeicao2 = (meals[1].text)
            sobremesa2 = (meals[2].text)
            print(meal_reg)
            print(sopa2)
            print(refeicao2)
            print(sobremesa2)
            break
No need for a while loop, you can use the for-else Python construct for this:
for url in list_urls:
    # do stuff
    if day == tomorrow:
        # do and print stuff
        break
else:  # break never encountered
    print("no data found")
I'm having a python issue which I cannot seem to understand. Not sure if I need to use if statements but because I'm new to python, I'm not actually sure how to code this little issue.
Virtually this is the issue I have. For the departure calendar, I want python to be able to do the following:
View 'Your date'. If there's a flight (doesn't matter if lowfare or normal), click it. If not then move onto the next available date that does have a flight and click that.
Will need to be able to move to the next month if no date is available in the current month (I have an example code for this).
For the return calendar, I want it to do the same thing but ensure it selects a date at least 7 days after the selected departure date.
That's virtually my question, how to do that?
Below is the html of the depature calendar (return calendar is exactly the same except it's inboundsearchresults rather than outbound search results):
Below I have a sample code which works when selecting from an ordinary date picker (this is used in the page before the url) if you want to use that template and manipulate it:
# select depart date
datepicker = driver.find_element_by_id("departure-date-selector")
actions.move_to_element(datepicker).click().perform()

# find the calendar, month and year picker and the current date
calendar = driver.find_element_by_id("departureDateContainer")
month_picker = Select(calendar.find_element_by_class_name("ui-datepicker-month"))
year_picker = Select(calendar.find_element_by_class_name("ui-datepicker-year"))
current_date = calendar.find_element_by_class_name("ui-datepicker-current-day")

# printing out current date
month = month_picker.first_selected_option.text
year = year_picker.first_selected_option.text
print("Current departure date: {day} {month} {year}".format(day=current_date.text, month=month, year=year))

# see if we have an available date in this month
try:
    # first selectable day cell after the current date, inside the departure calendar
    next_available_date = current_date.find_element_by_xpath("following::td[@data-handler='selectDay' and ancestor::div/@id='departureDateContainer']")
    print("Found an available departure date: {day} {month} {year}".format(day=next_available_date.text, month=month, year=year))
    next_available_date.click()
except NoSuchElementException:
    # looping over until the next available date found
    while True:
        # click next, if not found, select the next year
        try:
            calendar.find_element_by_class_name("ui-datepicker-next").click()
        except NoSuchElementException:
            # select next year
            year = Select(calendar.find_element_by_class_name("ui-datepicker-year"))
            year.select_by_visible_text(str(int(year.first_selected_option.text) + 1))

        # reporting current processed month and year
        month = Select(calendar.find_element_by_class_name("ui-datepicker-month")).first_selected_option.text
        year = Select(calendar.find_element_by_class_name("ui-datepicker-year")).first_selected_option.text
        print("Processing {month} {year}".format(month=month, year=year))

        try:
            next_available_date = calendar.find_element_by_xpath(".//td[@data-handler='selectDay']")
            print("Found an available departure date: {day} {month} {year}".format(day=next_available_date.text, month=month, year=year))
            next_available_date.click()
            break
        except NoSuchElementException:
            continue
The idea is to define a reusable function - calling it select_date() that receives a "calendar" WebElement and an optional minimum date. This function would first look for the Your date in the calendar and if it is there and it is more than minimum (if given) click it and return the date. If there is no Your date, look for the available "flight" days and, if minimum date is given and the date is more than or equal to it, click it and return the date.
Working implementation:
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def select_date(calendar, mininum_date=None):
    """Click a date in *calendar* and return it as a ``datetime``.

    The "Your Date" cell is preferred: if it exists and is not earlier than
    *mininum_date* (when given), it is clicked and returned. Otherwise the
    calendar is searched, month by month, for the first "flights" cell that
    satisfies the minimum date, which is then clicked and returned.

    Args:
        calendar: element containing the calendar; it is searched with
            ``find_element`` / ``find_elements``.
        mininum_date: optional ``datetime``; dates before it are not selected.

    Returns:
        The selected date, parsed from the cell's ``data-date`` attribute.

    Raises:
        NoSuchElementException: if no matching date is found and there is no
            "Next month" link left to click.
    """
    try:
        # check if "Your Date" is there
        your_date_elm = calendar.find_element(By.CLASS_NAME, "your-date")
        your_date_text = your_date_elm.get_attribute("data-date")
        print("Found 'Your Date': " + your_date_text)

        # check "Your Date" against the minimum date (if given) BEFORE
        # clicking, so a date that is too early is never selected
        your_date = datetime.strptime(your_date_text, "%Y-%m-%d")
        if mininum_date and your_date < mininum_date:
            raise NoSuchElementException("Minimum date violation")

        your_date_elm.click()
        return your_date
    except NoSuchElementException:
        # no usable "Your Date": fall back to the available flight days below
        pass

    while True:
        print("Processing " + calendar.find_element(By.CSS_SELECTOR, "div.subheader > p").text)

        try:
            if mininum_date:
                candidates = calendar.find_elements(By.CLASS_NAME, "flights")
                # first flight day on or after the minimum date; next() raises
                # StopIteration when this month has none
                flight_date_elm = next(
                    elm for elm in candidates
                    if datetime.strptime(elm.get_attribute("data-date"), "%Y-%m-%d") >= mininum_date)
            else:
                flight_date_elm = calendar.find_element(By.CLASS_NAME, "flights")
        except (StopIteration, NoSuchElementException):
            # nothing suitable in this month: advance and try again
            calendar.find_element(By.PARTIAL_LINK_TEXT, "Next month").click()
            continue

        # found - print out the date, click and return it
        flight_date = flight_date_elm.get_attribute("data-date")
        print("Found 'Flight Date': " + flight_date)
        flight_date_elm.click()
        return datetime.strptime(flight_date, "%Y-%m-%d")
# Open the search results page, pick an outbound date, then pick an inbound
# date at least seven days later.
driver = webdriver.Firefox()
try:
    driver.get("http://www.jet2.com/cheap-flights/leeds-bradford/antalya/2016-03-01/2016-04-12?adults=2&children=2&infants=1&childages=4%2c6")

    wait = WebDriverWait(driver, 10)

    # get the outbound date
    outbound = wait.until(EC.visibility_of_element_located((By.ID, "outboundsearchresults")))
    outbound_date = select_date(outbound)

    # get the inbound date
    inbound = driver.find_element(By.ID, "inboundsearchresults")
    inbound_minimum_date = outbound_date + timedelta(days=7)
    inbound_date = select_date(inbound, mininum_date=inbound_minimum_date)

    print(outbound_date, inbound_date)
finally:
    # Shut the browser down even if one of the steps above raised.
    driver.quit()
For the URL provided in the question, it prints:
Processing March 2016
Found 'Flight Date': 2016-03-28
Processing April 2016
Found 'Flight Date': 2016-04-04
2016-03-28 00:00:00 2016-04-04 00:00:00
The two dates printed at the end are the departure and the return dates.
Let me know if you need any clarifications and hope it helps.