URL not updating when iterating through URLs - python

The code below does the following:
1. Opens a specific URL (for the first date, YYYY-MM-DD);
2. getURL() generates the URLs for every date in a specific date range (starting from the second day);
3. Opens a new tab with the first date generated by getURL();
4. Goes back to the previous tab and closes it;
5. Repeats steps 3 and 4.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from datetime import datetime, timedelta

# Load Chrome driver and movement.uber.com/cities website
PATH = r'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(PATH)

# Attributing the city name and the center-most zone code (or origin) to variables so they can be inserted in the URL later
city = 'atlanta'
origin_code = '1074'
coordinates = '&lat.=33.7489&lng.=-84.4234622&z.=12'

# Open URL for the first day in the desired city (change coordinates depending on city)
driver.get('https://movement.uber.com/explore/' + city + '/travel-times/query?si' + origin_code +
           '&ti=&ag=taz&dt[tpb]=ALL_DAY&dt[wd;]=1,2,3,4,5,6,7&dt[dr][sd]=' + '2016-01-02' +
           '&dt[dr][ed]=' + '2016-01-02' + '&cd=&sa;=&sdn=' + coordinates + '&lang=en-US')

# Generating the correct URLs for each date
def getURL():
    date = datetime(2016, 1, 4)
    while date <= datetime(2020, 3, 31):
        yield ('https://movement.uber.com/explore/' + city + '/travel-times/query?si' + origin_code +
               '&ti=&ag=taz&dt[tpb]=ALL_DAY&dt[wd;]=1,2,3,4,5,6,7&dt[dr][sd]=' + date.strftime('%Y-%m-%d') +
               '&dt[dr][ed]=' + date.strftime('%Y-%m-%d') + '&cd=&sa;=&sdn=&lat.=33.7489&lng.=-84.4234622&z.=12&lang=en-US')
        date += timedelta(days=1)

# Open new tab
i = 0
for url in getURL():
    i += 1
    if i < 3:
        driver.execute_script("window.open(url)")
        # Switch to previous tab and close it (leaving us with the newly opened tab)
        tabs = driver.window_handles
        if len(tabs) > 1:
            driver.switch_to.window(tabs[0])
            driver.close()
            driver.switch_to.window(tabs[1])
The problem: every time a new tab/"window" is opened, the code opens the URL with the first date YYYY-MM-DD, completely ignoring the URLs generated by getURL().
The question: how do I open a new tab with the next date, close the previous one, repeat?
My ultimate goal: to download the datasets behind each distinct URL (the code for that is irrelevant to the problem here). Note: I use the Selenium library for that.

You could try putting all the URLs you create in a list and returning it, like this:
def getURL():
    tab = []
    date = datetime(2016, 1, 4)
    while date <= datetime(2020, 3, 31):
        url = ('https://movement.uber.com/explore/' + city + '/travel-times/query?si' + origin_code +
               '&ti=&ag=taz&dt[tpb]=ALL_DAY&dt[wd;]=1,2,3,4,5,6,7&dt[dr][sd]=' + date.strftime('%Y-%m-%d') +
               '&dt[dr][ed]=' + date.strftime('%Y-%m-%d') + '&cd=&sa;=&sdn=&lat.=33.7489&lng.=-84.4234622&z.=12&lang=en-US')
        tab.append(url)
        date += timedelta(days=1)
    return tab
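One side benefit of returning a list instead of a generator (a minimal sketch, nothing beyond standard Python): you can count and slice the URLs before committing to the full run:
urls = getURL()        # now a plain list, built once
print(len(urls))       # total number of dates in the range
for url in urls[:10]:  # dry-run the first ten before the full range
    print(url)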

The error lies in how you're launching the tab.
When I changed FROM:
driver.execute_script("window.open(url)")
TO:
driver.execute_script("window.open('" + url + "','_blank')")
the script executed perfectly on my machine.
The reason: in your version, the JavaScript string "window.open(url)" refers to a JavaScript variable named url in the page, not to your Python variable, so the Python value is never substituted on any iteration. Building the value into the script string resolves it each time.
Have a look at the JavaScript documentation for window.open() for more info (FYI: you can also pass _self instead of _blank to replace the current window, which might remove the need for tab management altogether).
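An alternative sketch that sidesteps quoting and escaping entirely: Selenium's execute_script() forwards extra Python arguments to the script, where they appear as arguments[0], arguments[1], and so on:
# The URL is serialized by Selenium and handed to the script as
# arguments[0], so no string concatenation or escaping is needed.
driver.execute_script("window.open(arguments[0], '_blank');", url)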
Results of my test (screenshots of the first and second iterations omitted): each iteration opened a tab with a different date in the URL.
For reference, this is the entire script I ran (note that I raised the i cutoff for more iterations, changed the chromedriver PATH for my machine, and added a couple of prints):
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from datetime import datetime, timedelta

# Load Chrome driver and movement.uber.com/cities website
#PATH = ''# - mine lives local -> 'C:\Program Files (x86)\chromedriver.exe'
#driver = webdriver.Chrome(PATH)
driver = webdriver.Chrome()

# Attributing the city name and the center-most zone code (or origin) to variables so they can be inserted in the URL later
city = 'atlanta'
origin_code = '1074'
coordinates = '&lat.=33.7489&lng.=-84.4234622&z.=12'

# Open URL for the first day in the desired city (change coordinates depending on city)
driver.get('https://movement.uber.com/explore/' + city + '/travel-times/query?si' + origin_code +
           '&ti=&ag=taz&dt[tpb]=ALL_DAY&dt[wd;]=1,2,3,4,5,6,7&dt[dr][sd]=' + '2016-01-02' +
           '&dt[dr][ed]=' + '2016-01-02' + '&cd=&sa;=&sdn=' + coordinates + '&lang=en-US')

# Generating the correct URLs for each date
def getURL():
    date = datetime(2016, 1, 4)
    while date <= datetime(2020, 3, 31):
        yield ('https://movement.uber.com/explore/' + city + '/travel-times/query?si' + origin_code +
               '&ti=&ag=taz&dt[tpb]=ALL_DAY&dt[wd;]=1,2,3,4,5,6,7&dt[dr][sd]=' + date.strftime('%Y-%m-%d') +
               '&dt[dr][ed]=' + date.strftime('%Y-%m-%d') + '&cd=&sa;=&sdn=&lat.=33.7489&lng.=-84.4234622&z.=12&lang=en-US')
        date += timedelta(days=1)

# Open new tab
i = 0
print("urls: %i" % len(list(getURL())))
for url in getURL():
    i += 1
    if i < 10:
        driver.execute_script("window.open('" + url + "','_blank')")
        print(url)
        # Switch to previous tab and close it (leaving us with the newly opened tab)
        tabs = driver.window_handles
        if len(tabs) > 1:
            driver.switch_to.window(tabs[0])
            driver.close()
            driver.switch_to.window(tabs[1])

Related

Unclickable element with selenium Python

I used web scraping through Python with Selenium to get daily price values for EEX French Power futures at the URL "https://www.eex.com/en/market-data/power/futures#%7B%22snippetpicker%22%3A%2221%22%7D".
I guess they updated their website, as the URL changed recently, and now my script doesn't work properly anymore: I can't find a way to click on each displayed product button (Year, Quarter, Month, Weekend, Day).
Here is my code up to the step that doesn't work (it simply doesn't click; it doesn't fail):
import time
import datetime
from datetime import date
from dateutil.relativedelta import relativedelta
import pyodbc
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

url = "https://www.eex.com/en/market-data/power/futures#%7B%22snippetpicker%22%3A%2221%22%7D"
dico_product = ('Day', 'Weekend', 'Week', 'Month', 'Quarter', 'Year')
now = datetime.datetime.now()
date_prx = now.date()

options = Options()
d = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
d.get(url)
time.sleep(6)
d.maximize_window()

# Accept the cookie banner
cookies_button_str = "//input[@class='btn bordered uo_cookie_btn_type_1']"
d.find_element(By.XPATH, cookies_button_str).click()
time.sleep(4)

# Fill the date input
dateinput_button_str = "//div[@class = 'mv-date-input']//div[@class = 'mv-stack-block']//input[@class = 'mv-input-box']"
Date_input = date_prx
Date_input_str = str(Date_input.year) + '-' + str(Date_input.month) + '-' + str(Date_input.day)
element_view = d.find_element(By.CLASS_NAME, 'collapsed')
d.execute_script("arguments[0].scrollIntoView()", element_view)
WebDriverWait(d, 20).until(EC.presence_of_element_located((By.XPATH, dateinput_button_str)))
element = d.find_element(By.XPATH, dateinput_button_str)
time.sleep(2)
d.execute_script('arguments[0].value = "' + str(Date_input_str) + '";', element)
time.sleep(2)

# Click each product button
element_button_str = './/div[contains(@class, "mv-button-base mv-hyperlink-button")]'
containers = d.find_elements(By.XPATH, element_button_str)
for item in containers:
    if item.text in dico_product:
        print('Processing ' + str(item.text) + ' for date ' + str(Date_input_str) + '.')
        element_button_str = './/div[contains(@class, "' + str(item.get_attribute("class")) + '") and contains(., "' + str(item.text) + '")]'
        product_button = d.find_element(By.XPATH, element_button_str)
        d.execute_script("arguments[0].click()", product_button)
It does find the element to click on, but it doesn't click.
What is surprising is that if you take the old URL, which gets you to the Austrian futures by default, it works fine. But with the proper URL, it doesn't.
I don't know if it can be done or if it's no use, but honestly I tried everything I could think of. Could you kindly help me?
Thank you

Selenium - selecting date picker from skyscanner

I couldn't find a way to automate the date picker using Selenium.
from selenium import webdriver
from getpass import getpass
import pandas as pd
import numpy as np
import requests
import lxml

url = "https://www.skyscanner.ca"
driver = webdriver.Chrome("chromedriver")
driver.maximize_window()
driver.get(url)

# Select one-way trip
trip_type_id = "fsc-trip-type-selector-one-way"
trip_type_select = driver.find_element(by="id", value=trip_type_id)
trip_type_select.click()

# Fill origin and destination
origin_textbox = driver.find_element(by="id", value="fsc-origin-search")
origin_value = "Vancouver (Any)"
origin_textbox.send_keys(origin_value)
destin_textbox = driver.find_element(by="id", value="fsc-destination-search")
dest_value = "Dubai (Any)"
destin_textbox.send_keys(dest_value)
I am stuck at the datepicker:
driver.find_element(by="id", value="depart-fsc-datepicker-button").click()
opens the datepicker (screenshot omitted).
Is there a way to automate it by passing a specific date ("December 12, 2022") with Selenium?
After analyzing the DOM of the page, I found that the calendar has a month/year dropdown whose values are available as option tags. The element can be selected either by its text or by its class name; I have used text here.
Similarly, the date itself can be selected via the span text inside the day button.
Here is the code that selects the given date on the page:
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# REPLACE YOUR CHROME PATH HERE
chrome_path = r"C:\Users\hpoddar\Desktop\Tools\chromedriver_win32\chromedriver.exe"
s = Service(chrome_path)
driver = webdriver.Chrome(service=s)
driver.get('https://www.skyscanner.ca')
driver.maximize_window()

date = 'December 12, 2022'  # ENTER YOUR DATE HERE

# Extracting day and month_year from the date
m = re.search(r'([A-Za-z]+) (\d{2}), (\d{4})', date)
day, month_year = m.group(2), m.group(1) + " " + m.group(3)

driver.find_element(by="id", value="depart-fsc-datepicker-button").click()
click_on_depart_dropdown = driver.find_element(by="id", value="depart-calendar__bpk_calendar_nav_select")
click_on_depart_dropdown.click()
monthyear = click_on_depart_dropdown.find_element(By.XPATH, f'//option[contains(text(), "{month_year}")]')
monthyear.click()
day_element = click_on_depart_dropdown.find_element(By.XPATH, f'//button[contains(@class, "BpkCalendarDate_bpk-calendar-date__MTdlO")]//span[contains(text(), "{day}")]')
day_element.click()
The above example was for Depart; the same approach works for Return, as in the sketch below.
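For example, a sketch for the Return picker, assuming its element ids simply swap the depart- prefix for return- (a guess; verify the actual ids in the DOM before relying on them):
return_date = 'December 19, 2022'
m = re.search(r'([A-Za-z]+) (\d{2}), (\d{4})', return_date)
day, month_year = m.group(2), m.group(1) + " " + m.group(3)

# Hypothetical ids, mirroring the Depart ones above.
driver.find_element(by="id", value="return-fsc-datepicker-button").click()
return_dropdown = driver.find_element(by="id", value="return-calendar__bpk_calendar_nav_select")
return_dropdown.click()
return_dropdown.find_element(By.XPATH, f'//option[contains(text(), "{month_year}")]').click()
return_dropdown.find_element(By.XPATH, f'//button[contains(@class, "BpkCalendarDate_bpk-calendar-date__MTdlO")]//span[contains(text(), "{day}")]').click()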

How to continue a script even if there is a missing element on the current page?

I am working on a scraping project and am trying to scrape many different profiles. Not all of the profiles have the same information, so I want to skip that piece of data if the current profile does not have it. Here is my current code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep

driver = webdriver.Chrome("MY DIRECTORY")
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))

body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")
while len(profile_count) < count:  # Get links up to "count"
    body.send_keys(Keys.END)
    sleep(1)
    profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

for link in profile_count:  # Calling up links
    temp = link.get_attribute('href')  # the profile URL
    driver.execute_script("window.open('');")  # open new tab
    driver.switch_to.window(driver.window_handles[1])  # focus new tab
    driver.get(temp)
    ##### SCRAPE CODE #####
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div')
    IssuedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[1]/div[2]')
    CertificationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]')
    CertfiedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]')
    RecertificationCycle = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[3]/div[2]')
    Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]')
    AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a')
    print(Name.text + " : " + IssuedBy.text + " : " + CertificationNumber.text + " : " + CertfiedSince.text + " : " + RecertificationCycle.text + " : " + Expires.text + " : " + AccreditedBy.text)
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

driver.close()
Please let me know how I would be able to skip an element if it is not present on the current profile.
According to the docs, find_element_by_xpath() raises a NoSuchElementException if the element you're looking for couldn't be found.
I suggest handling potential NoSuchElementExceptions accordingly. What proper exception handling looks like depends on what you're trying to achieve: you might want to log an error, assign default values, skip certain follow-up actions, and so on.
from selenium.common.exceptions import NoSuchElementException

try:
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div')
except NoSuchElementException:
    Name = "Default Name"
You could even wrap multiple find_element_by_xpath() calls in your try block.
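If several fields can be missing independently, a small helper keeps the scrape code readable. A sketch; safe_text is a name introduced here for illustration, not part of Selenium:
from selenium.common.exceptions import NoSuchElementException

def safe_text(driver, xpath, default=""):
    # Return the element's text, or `default` if the element is absent.
    try:
        return driver.find_element_by_xpath(xpath).text
    except NoSuchElementException:
        return default

name = safe_text(driver, '/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div',
                 default="Default Name")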
That try/except will fix the missing-element problem, but you have some other errors too. I fixed them all.
Code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep

driver = webdriver.Chrome('chromedriver')
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))

body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

c = 1
while c <= count:
    for link in profile_count:  # Calling up links
        temp = link.get_attribute('href')  # the profile URL
        driver.execute_script("window.open('');")  # open new tab
        driver.switch_to.window(driver.window_handles[1])  # focus new tab
        driver.get(temp)
        sleep(1)
        ##### SCRAPE CODE #####
        try:
            Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div')
            IssuedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[1]/div[2]')
            CertificationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]')
            CertfiedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]')
            RecertificationCycle = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[3]/div[2]')
        except:
            c -= 1
        driver.close()  # close the profile tab so window_handles[1] is fresh next iteration
        driver.switch_to.window(driver.window_handles[0])
        c += 1
        if c > count:
            break

driver.quit()

loop with BeautifulSoup to web scrape multiple pages by timestamps

I am trying to retrieve the day temperature from a local weather site.
I built this loop using BeautifulSoup.
Unfortunately, the loop breaks after the first round.
This is my code and the result:
Code:
#coding: latin-1
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# create a file zamg-data.txt, comma-separated
f = open('zamg-data.txt', 'w')

# start webdriver
driver = webdriver.Chrome("/usr/local/bin/chromedriver")

# loop through months and days
for m in range(1, 13):
    for d in range(1, 32):
        # skip days past the last day of the month
        if (m == 2 and d > 28):
            break
        elif (m in [4, 6, 9, 11] and d > 30):
            break

        # open zamg site
        timestamp = '2019' + '-' + str(m) + '-' + str(d)
        print("call page of " + timestamp)
        url = "https://www.zamg.ac.at/cms/de/klima/klima-aktuell/klimamonitoring/?param=t&period=period-ymd-" + timestamp
        driver.get(url)

        # extract temperature
        html = driver.execute_script("return document.documentElement.outerHTML")
        soup = BeautifulSoup(html, "html.parser")
        data = soup.find_all(class_='u-txt--big')[1].string
        print(len(data))
        print(data + '...okay')

        # zero-pad month for the timestamp
        if len(str(m)) < 2:
            mStamp = '0' + str(m)
        else:
            mStamp = str(m)
        # zero-pad day for the timestamp
        if len(str(d)) < 2:
            dStamp = '0' + str(d)
        else:
            dStamp = str(d)

        # timestamp
        timestamp = '2019' + mStamp + dStamp
        # write time and value
        f.write(timestamp + ',' + data + '\n')

# data is extracted - close
f.close()
My result:
➜ weather-app python get-data-02.py
call page of 2019-1-1
5
+3,9 ...okay
call page of 2019-1-2
Traceback (most recent call last):
File "get-data-02.py", line 37, in <module>
data = soup.find_all(class_='u-txt--big')[1].string
IndexError: list index out of range
➜ weather-app
I don't understand what is wrong here. The second page loads in the browser, but then the script breaks.
Any ideas?
#coding: latin-1
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import datetime
import time

base = datetime.datetime(2019, 1, 1).date()
date_list = [base + datetime.timedelta(days=x) for x in range(365)]

# start webdriver
driver = webdriver.Chrome("/usr/local/bin/chromedriver")
base_url = "https://www.zamg.ac.at/cms/de/klima/klima-aktuell/klimamonitoring/?param=t&period=period-ymd-"

with open('zamg-data.txt', 'w') as file:
    for dt in date_list:
        timestamp = dt.strftime("%Y-%m-%d")
        print("call page of " + timestamp)
        url = f"{base_url}{timestamp}"
        driver.get(url)
        # wait until the temperature elements are present
        WebDriverWait(driver, timeout=40).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "u-txt--big")))
        # extract temperature
        html = driver.execute_script("return document.documentElement.outerHTML")
        soup = BeautifulSoup(html, "html.parser")
        data = soup.find_all(class_='u-txt--big')[1].string
        print(len(data))
        print(data + '...okay')
        # timestamp
        timestamp_1 = dt.strftime("%Y%m%d")
        # write time and value
        file.write(timestamp_1 + ',' + data + '\n')
        # pause so the site isn't hammered
        time.sleep(3)

driver.quit()
print("Done!!!")
As someone in the comments mentioned, you need to make the browser wait until all elements of that class are present before parsing. I've also added an explicit delay after each page load so that the website is not overwhelmed with requests; hammering a site like that is a potential way to get your IP blocked. And it's best to always use a context manager whenever you can.
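On that last point: recent Selenium releases (4.x) also let the driver itself be used as a context manager, so quit() runs even if the scrape raises an exception. A minimal sketch, assuming Selenium 4:
from selenium import webdriver

# Selenium 4.x: leaving the with-block calls driver.quit() automatically,
# so the browser is cleaned up even on an exception mid-loop.
with webdriver.Chrome() as driver:
    driver.get("https://www.zamg.ac.at/cms/de/klima/klima-aktuell/klimamonitoring/")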

Python: Issue with difference between webdriver Firefox and PhantomJS

I've been working on this Python script for the past day or two, and all is working fine when I use the Firefox webdriver, but when I switch to a headless browser like PhantomJS it fails on the line setNumber = parseSetNumber(setName[0]) with Error: list index out of range, because setName is empty.
The line before it, setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()"), returns nothing with the PhantomJS webdriver only; with the Firefox webdriver it returns a value fine.
The error only happens when I switch the webdriver from Firefox to PhantomJS. I use PhantomJS because the script runs on a Linux server.
import time
import os.path
import lxml.html as LH
import re
import sys
from selenium import webdriver
from random import randint
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

PARAMS = sys.argv
URL = PARAMS[1]
BASEURL = URL[:URL.rfind('/')+1]

# Parses the set name for the set number
def parseSetNumber(string):
    string = string.split(' ')
    stringLength = len(string)
    string = string[(stringLength - 1)]
    if string.replace('.', '').isdigit():
        return string
    else:
        return ""

# Returns set reference for this site
def parseRefId(string):
    string = string.split('_')
    return str(string[2])

try:
    PAGE_NUMBER = 1
    #--------------------------------------------------
    ## Get initial page
    driver = webdriver.PhantomJS()
    driver.get(PARAMS[1])
    #--------------------------------------------------
    ## Get page count
    # Give page time to load
    time.sleep(2)
    PAGE_RAW = driver.page_source
    PAGE_RAW = LH.fromstring(PAGE_RAW)
    PAGE_COUNT_RAW = PAGE_RAW.xpath("//div[contains(@class, 'pageControlMenu')]/div/ul/li")
    PAGE_COUNT = len(PAGE_COUNT_RAW) - 2
    #--------------------------------------------------
    ## Get page if it's not page one
    while PAGE_NUMBER <= PAGE_COUNT:
        #--------------------------------------------------
        ## Create empty file
        FILE_NAME = PARAMS[3] + 'json/' + time.strftime("%Y%m%d%H") + '_' + str(PARAMS[2]) + '_' + str(PAGE_NUMBER) + '.json'
        #--------------------------------------------------
        ## Create JSON file if it doesn't exist
        if not os.path.exists(FILE_NAME):
            JSON_FILE = open(FILE_NAME, "a+", encoding="utf-8")
        else:
            JSON_FILE = open(FILE_NAME, "w", encoding="utf-8")
        JSON_FILE.write("{")
        #--------------------------------------------------
        # Click page for next page if not page 1
        if PAGE_NUMBER > 1:
            index = 0
            for atag in PAGE_COUNT_RAW:
                if index == PAGE_NUMBER:
                    elements = driver.find_elements_by_xpath("//div[contains(@class, 'pageControlMenu')]/div/ul/li")
                    if elements:
                        element = elements[index].find_elements_by_xpath("./a")
                        if element:
                            element[0].click()
                            time.sleep(randint(3, 5))
                index += 1
        #--------------------------------------------------
        ## Remove survey box if it pops up and log
        try:
            surveyBox = driver.find_element_by_link_text("No, thanks")
            if surveyBox:
                surveyBox.click()
                print("Store[" + str(PARAMS[2]) + "]: Survey box found on page - " + str(PAGE_NUMBER))
        except:
            print("Store[" + str(PARAMS[2]) + "]: No survey box on page - " + str(PAGE_NUMBER))
        #--------------------------------------------------
        ## Process page
        # If page is greater than 1, get the page source of the new page.
        if PAGE_NUMBER > 1:
            PAGE_RAW = driver.page_source
            PAGE_RAW = LH.fromstring(PAGE_RAW)
        PAGE_RAW = PAGE_RAW.xpath("//div[contains(@class, 'estore_product_container')]")
        index = 0
        size = len(PAGE_RAW)
        for atag in PAGE_RAW:
            if PAGE_NUMBER > 1 and index == 0:
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a")))
            setStore = PARAMS[2]
            setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()")
            setNumber = parseSetNumber(setName[0])
            setPrice = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_price')]/text()")
            setLink = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/@href")
            setRef = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_price')]/@id")
            if setRef:
                setRef = parseRefId(setRef[0])
            if re.search('[0-9\.]+', setPrice[0]) is not None:
                JSON_FILE.write("\"" + str(index) + "\":{\"store\":\"" + str(setStore) + "\",\"name\":\"" + str(setName[0]) + "\",\"number\":\"" + str(setNumber) + "\",\"price\":\"" + re.search('[0-9\.]+', setPrice[0]).group() + "\",\"ref\":\"" + str(setRef) + "\",\"link\":\"" + str(setLink[0]) + "\"}")
            if index + 1 < size:
                JSON_FILE.write(",")
            index += 1
        #--------------------------------------------------
        ## Close JSON file
        JSON_FILE.write("}")
        JSON_FILE.close()
        #--------------------------------------------------
        ## Increment page number
        PAGE_NUMBER += 1
        #--------------------------------------------------
    #--------------------------------------------------
    ## Close webdriver
    driver.quit()
    #--------------------------------------------------
except Exception as e:
    print('Error: ' + str(e.args[0]))

# Remove ghostdriver.log file
GHOSTDRIVER_FILE = str(PARAMS[3]) + 'jobs/ghostdriver.log'
if os.path.exists(GHOSTDRIVER_FILE):
    os.remove(GHOSTDRIVER_FILE)
Update
It looks like these are the only two lines not working with PhantomJS; they both return an empty value.
setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()")
setLink = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/@href")
OK, it looks like I've solved the issue: I had to call set_window_size on the webdriver when using PhantomJS.
Originally:
driver = webdriver.PhantomJS()
driver.get(PARAMS[1])
Solution:
driver = webdriver.PhantomJS()
driver.set_window_size(1024, 768)
driver.get(PARAMS[1])
Now the PhantomJS webdriver works as expected, in the same way the Firefox webdriver does.
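Worth noting for anyone reading this later: PhantomJS is no longer maintained, and newer Selenium versions dropped support for it. Headless Chrome accepts the same kind of fixed viewport; a sketch, assuming a chromedriver setup similar to the rest of this script:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
# Same fix as with PhantomJS: give the headless browser an explicit
# viewport so responsive layouts render the product elements.
options.add_argument("--window-size=1024,768")
driver = webdriver.Chrome(options=options)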
