How can I get the playlist URLs stored like
here: https://www.youtube.com/watch?v=VpTRlS7EO6E&list=RDOIhVs0FQ8xc&index=5
with bs4?
Using
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1')
page = r.text
soup = bs(page, 'html.parser')
#print(soup)
res = soup.find_all('ytd-playlist-panel-video-renderer')
print(res)
doesn't return anything. Even the printed soup itself doesn't contain the link I'm looking for (like href="/watch?v=puNOG62lf-Y&list=RDOIhVs0FQ8xc&index=2").
It is a JavaScript-rendered page, so you have to use Selenium.
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
url = 'https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.set_window_size(1024, 600)
driver.maximize_window()
driver.get(url)
time.sleep(2)
soup = bs(driver.page_source, 'html.parser')
res = soup.find_all('ytd-playlist-panel-video-renderer')
print(res)
Install the required package using pip install webdriver-manager
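Side note: if you are on Selenium 4.6 or newer, the bundled Selenium Manager can resolve a matching driver for you, so webdriver-manager becomes optional. A minimal sketch, assuming Selenium >= 4.6 and Chrome installed locally:
from selenium import webdriver

driver = webdriver.Chrome()  # Selenium Manager fetches a matching chromedriver automatically
driver.get('https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1')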
Thank you!
Here is some dirty code that works for me:
#---------------------------------
# import modules
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
import re
#---------------------------------
#
from webdriver_manager.firefox import GeckoDriverManager
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
#---------------------------------
# get links from url
def get_links(driver, sleep_time):
    # open driver window
    driver.set_window_size(1024, 600)
    driver.maximize_window()
    driver.get(url)
    # wait some seconds
    time.sleep(sleep_time)
    # get information from url
    soup = bs(driver.page_source, 'html.parser')
    res = soup.find_all('ytd-playlist-panel-video-renderer')
    # check if there is information
    if len(res) > 0:
        main_url = 'https://www.youtube.com/watch?v='
        urls = re.findall('watch.*list', str(res))
        links = [main_url + str(a[8:-9]) for a in urls[::2]]
    # if there is no information return False
    else:
        links = False
    return links
#---------------------------------
# set the playlist url (the url from the question) and the sleep timer
url = 'https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1'
sleep_time = 10
# call function to get links
links = get_links(driver, sleep_time)
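If you want to avoid the regex step, the same links can also be read straight from the anchors inside each renderer. A small sketch, assuming res holds the ytd-playlist-panel-video-renderer tags found above:
links = []
for renderer in res:
    anchor = renderer.find('a', href=True)  # first anchor that carries an href
    if anchor and anchor['href'].startswith('/watch'):
        links.append('https://www.youtube.com' + anchor['href'])
print(links)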
This works for me:
from selenium import webdriver  # pip install selenium
import time

# make sure you download chromedriver from https://chromedriver.chromium.org/downloads and put it in folder 'driver'
driver = webdriver.Chrome(r'driver\chromedriver.exe')
driver.get('https://www.youtube.com/playlist?list=PLxvodScTx2RtAOoajGSu6ad4p8P8uXKQk')  # put here your link

# scroll page down
old_position = 0
new_position = None
position_script = """return (window.pageYOffset !== undefined) ?
    window.pageYOffset : (document.documentElement ||
    document.body.parentNode || document.body);"""
while new_position != old_position:
    old_position = driver.execute_script(position_script)
    time.sleep(1)
    driver.execute_script(
        """var scrollingElement = (document.scrollingElement ||
        document.body);scrollingElement.scrollTop =
        scrollingElement.scrollHeight;""")
    new_position = driver.execute_script(position_script)
source_page = driver.page_source
driver.quit()

# extract the urls and names
counter = 1
element_to_find = 'amp;index={}" ar'
video_index = source_page.find(element_to_find.format(counter))  # 'amp;index=1" ar'
while video_index != -1:
    title_element = ''
    count_name = video_index
    while title_element != 'title="':
        title_element = source_page[count_name: count_name + 7]
        count_name += 1
    count_name += 6
    start_title_position = count_name
    end_title = ''
    while end_title != '>':
        end_title = source_page[count_name]  # exit loop if end_title == '>'
        count_name += 1
    name = source_page[start_title_position:count_name - 2]  # extract the name of the video
    name = name.replace('&quot;', '"')  # un-escape quotes in the title
    video_id = source_page[video_index - 56: video_index - 45]  # extract the video id
    print(str(counter)
          + '. link: ' + 'https://www.youtube.com/watch?v=' + video_id +
          ', name: ' + name)
    counter += 1
    video_index = source_page.find(element_to_find.format(counter))  # continue with the next video
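The same extraction can also be done in one pass with a regular expression instead of manual index arithmetic. This is only a sketch; YouTube's markup changes often, so the attribute order assumed in the pattern is something to verify against your own source_page:
import re

# assumes each playlist entry looks like href="/watch?v=<11-char id>&amp;list=..." ... title="<name>"
pattern = re.compile(r'href="/watch\?v=([\w-]{11})&amp;list=[^"]*"[^>]*title="([^"]+)"')
for video_id, name in pattern.findall(source_page):
    print('https://www.youtube.com/watch?v=' + video_id + ', name: ' + name)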
The easiest solution is:
from pytube import Playlist
URL_PLAYLIST = "https://www.youtube.com/playlist?list=YOUR-LINK"
# Retrieve URLs of videos from playlist
playlist = Playlist(URL_PLAYLIST)
print('Number Of Videos In playlist: %s' % len(playlist.video_urls))
urls = []
for url in playlist:
urls.append(url)
print(urls)
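If you only need the video IDs rather than the full URLs, they can be derived from the same video_urls attribute (a small sketch; it assumes each entry has the usual watch?v=<id> form):
video_ids = [u.split("v=")[-1] for u in playlist.video_urls]
print(video_ids)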
Related
Can anyone help with scraping https://www.whed.net/home.php?
The code I'm using is giving me an empty df. I would love to have the universities with their websites and maybe the fields of study. My scraping skills are weak, so it would be great if you could guide me through this. Thanks, guys.
# imports needed by this snippet
import time
import pandas as pd
from selenium import webdriver as wb
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException

begin = time.time()
countries = ['Emirates', 'United States of America (all)']
result = []  # List to store all data
univ_links = []  # Links for all universities
fields = ['Street:', 'City:', 'Province:', 'Post Code:', 'WWW:', 'Fields of study:', 'Job title:']
webD = wb.Chrome(executable_path=r'C:\Users\Admin\OneDrive\Sagasit\chromedriver.exe')  # To launch chrome and run script
# Trigger the target website
webD.get("https://www.whed.net/results_institutions.php")
webD.implicitly_wait(5)
#all_countries=[]
cntry_el = webD.find_elements_by_xpath('//*[@id="Chp1"]/option')
#cntry_grp = webD.find_elements_by_xpath('//*[@id="Chp1"]/optgroup')
grps = webD.find_elements_by_xpath('//*[@id="Chp1"]/optgroup/option[1]')
for c in cntry_el: countries.append(c.text)
for g in grps: countries.append(g.text)

for cntry in countries:
    select = Select(webD.find_element_by_id('Chp1'))  # select country dropdown
    select.select_by_visible_text(cntry)  # choosing country
    Btn_GO = webD.find_element_by_xpath('//*[@id="fsearch"]/p/input')
    Btn_GO.click()
    select_rpp = Select(webD.find_element_by_name('nbr_ref_pge'))  # select results-per-page dropdown
    select_rpp.select_by_visible_text('100')  # choosing the 100 results per page option
    university_form = webD.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results')
    university_list = university_form.find_elements_by_xpath('//*[@id="results"]/li')  # list of university elements
    for univ in range(len(university_list)):
        href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href')  # University details link
        univ_links.append(href)
    while True:
        try:
            webD.find_element_by_partial_link_text('Next').click()
            university_form = webD.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results')
            university_list = university_form.find_elements_by_xpath('//*[@id="results"]/li')
            for univ in range(len(university_list)):
                href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href')  # University details link
                univ_links.append(href)
        except NoSuchElementException:
            break

for l in univ_links:
    webD.get(l)
    webD.implicitly_wait(2)
    title = webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[1]').text
    title_detailed = webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[2]').text
    cntry_name = webD.find_element_by_xpath('//*[@id="contenu"]/p[2]').text
    t1 = webD.find_elements_by_class_name('dt')
    t2 = webD.find_elements_by_class_name('dd')
    labels = webD.find_elements_by_class_name('libelle')
    content = webD.find_elements_by_class_name('contenu')
    temp = {}
    fos = ''
    fos1 = ''
    temp.update({'Title': title, 'Detailed Title': title_detailed, 'Country': cntry_name})
    for i in range(len(t1)):
        if t1[i].text == '' or t1[i].text == 'Address':
            continue
        else:
            value = t2[i].text
            temp.update({t1[i].text: value.replace('\n', ',')})
    for j in range(len(content)):
        if labels[j].text in fields:
            if labels[j].text == 'Fields of study:':
                info = content[j].text
                fos = fos + ',' + info
            elif labels[j].text == 'Job title:':
                info1 = content[j].text
                fos1 = fos1 + ',' + info1
            else:
                key = labels[j].text
                temp.update({key[:-1]: content[j].text})
    temp.update({'Fields of study': fos.lstrip(','), 'Job titles': fos1.lstrip(',')})
    result.append(temp)

data = pd.DataFrame(result)
data
end = time.time()
print("Time taken : " + str(end - begin) + "s")
data.to_csv("WHED1.csv", index=False)
This code is what I could use, taken from a GitHub project.
It would be great if I could re-create the data and save it; I want this to be used as a dropdown in a web application, just to make sure there are no mistakes in how the university someone studied at is written.
Update 1/12/22 - Async
Found a much better solution using aiohttp; it also runs through the entire list of countries in ~30 seconds instead of 3 hours.
import json
import time
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


def main():
    print("Init")
    driver = init_driver()

    print("Opening Homepage")
    url = "https://www.whed.net/results_institutions.php"
    driver.get(url)
    time.sleep(1)

    print("Gathering Countries")
    countries = get_countries(driver)
    driver.quit()

    print("Scraping")
    start = time.time()
    institution_list = asyncio.run(fetch_all(countries))

    print("Writing out")
    f = open('output.json', 'w')
    f.write(json.dumps(institution_list))
    f.close()

    end = time.time()
    print(f"Total time: {end - start}s")


def init_driver():
    chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)
    return driver


def get_countries(driver):
    select = Select(driver.find_element(By.ID, "Chp1"))
    countries = list(map(lambda c: c.get_attribute('value'), select.options))
    countries.pop(0)
    return countries


def extract_institutions(html, country):
    soup = BeautifulSoup(html, 'html.parser')
    page = soup.find('p', {'class': 'infos'}).text
    print(str(page))
    number_of_institutions = str(page).split()[0]
    if number_of_institutions == 'No':
        print(f"No results for {country}")
        return []
    results = []
    inst_index = 0
    raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
    for i in raw:
        results.append({
            'name': str(i.text).strip(),
            'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
            'country': country
        })
        inst_index += 1
    return {
        'country': country,
        'count': number_of_institutions,
        'records': results
    }


async def get_institutions(country, session):
    try:
        async with session.post(
            url='https://www.whed.net/results_institutions.php',
            data={"Chp1": country, "nbr_ref_pge": 10000}
        ) as response:
            html = await response.read()
            print(f"Successfully got {country}")
            return extract_institutions(html, country)
    except Exception as e:
        print(f"Unable to get {country} due to {e.__class__}.")


async def fetch_all(countries):
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*[get_institutions(country, session) for country in countries])


# Main call
main()
Old answer using synchronous algorithm
Improving on @Mithun's answer, since it doesn't really work as written: it gets stuck on the same page.
Also added direct access to the name and URL to make it easier in case you want to access those.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

print("Init")
chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)

print("Opening Homepage")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)

print("Selecting country")
select = Select(driver.find_element(By.ID, "Chp1"))
country = "Albania"
select.select_by_visible_text(country)
time.sleep(.5)

print("Searching")
driver.find_element(By.XPATH, "//input[@value='Go']").click()
time.sleep(1)

print("Parsing")
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]

counter = 10
results = []
while True:
    raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
    for i in raw:
        results.append({
            'name': str(i.text).strip(),
            'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
            'country': country
        })
    print(f'{len(results)}/{number_of_pages}')
    if counter >= int(number_of_pages):
        break
    counter += 10
    driver.find_element(By.LINK_TEXT, "Next page").click()
    time.sleep(0.5)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
print(results)
You can use Selenium to scrape the data. The following code will help you scrape the university names for "United States of America (all)". Similarly, you can scrape the other countries using a loop or by entering the names manually. If you need the field of study for every university, you can scrape each university's href with bs4 and then read its fields of study from the detail page (a rough sketch for that step follows the code below).
import time  # needed for the time.sleep calls below
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

driver = webdriver.Chrome(r"chromedriver.exe")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
select = Select(driver.find_element(By.ID, "Chp1"))
select.select_by_visible_text("United States of America (all)")
time.sleep(1)
driver.find_element(By.XPATH, "//input[@value='Go']").click()
time.sleep(1)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10
while counter < int(number_of_pages):
    raw = soup.find_all('div', {'class': 'details'})
    for i in raw:
        i = (str(i.text).lstrip())
        i = i.replace("\n", "")
        i = i.replace("\r", "")
        i = i.replace("\t", "")
        print(i)
    next_page = driver.find_element(By.LINK_TEXT, "Next page").click()
    counter += 10
driver.quit()
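For the fields of study mentioned above, here is a rough sketch of the follow-up step: open one university's detail page and read its "Fields of study:" entry. The class names ('libelle', 'contenu') are taken from the question's own code, and the sketch assumes the detail pages render without JavaScript; treat both as things to verify against the live site.
import requests
from bs4 import BeautifulSoup

def get_fields_of_study(detail_url):
    # detail_url: one of the hrefs collected from the results page above
    soup = BeautifulSoup(requests.get(detail_url).text, 'html.parser')
    labels = soup.find_all(class_='libelle')
    values = soup.find_all(class_='contenu')
    for label, value in zip(labels, values):
        if label.get_text(strip=True) == 'Fields of study:':
            return value.get_text(strip=True)
    return None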
I'm working with Python and Selenium. I type a keyword, which is then searched on Google. In the results section, I am trying to open the URLs one by one and store the data of the p tags.
But in my script, it is storing the data of only one site. Can anyone help me store the data of the p tags of all the opened sites?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

"""
Taking input from user
"""
search_input = input("Input the keyword you want to search for:")
search_input = search_input.replace(' ', '+')
driver = webdriver.Chrome(executable_path=r"E:\chromedriver\chromedriver.exe")

for i in range(1):
    matched_elements = driver.get("https://www.google.com/search?q=" +
                                  search_input + "&start=" + str(i))
    print(driver.title)
    driver.maximize_window()
    time.sleep(5)
    links_url = driver.find_elements_by_xpath("//div[@class='yuRUbf']/a[@href]")
    links = []
    for x in links_url:
        links.append(x.get_attribute('href'))
    link_data = []
    for new_url in links:
        print('new url : ', new_url)
        driver.get(new_url)
        link_data.append(driver.page_source)
        """
        Getting the data from the site
        """
        content = driver.find_elements(By.TAG_NAME, "p")
        for data in content:
            print(data.text)
        driver.back()
driver.close()
Here is the edited answer; I first misunderstood your question:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

"""
Taking input from user
"""
search_input = input("Input the keyword you want to search for:")
search_input = search_input.replace(' ', '+')
driver = webdriver.Chrome(executable_path=r"E:\chromedriver\chromedriver.exe")

for i in range(1):
    matched_elements = driver.get("https://www.google.com/search?q=" +
                                  search_input + "&start=" + str(i))
    print(driver.title)
    driver.maximize_window()
    time.sleep(5)
    links_url = driver.find_elements_by_xpath("//div[@class='yuRUbf']/a[@href]")
    links = []
    for x in links_url:
        links.append(x.get_attribute('href'))
    link_data = []
    for new_url in links:
        print('\nnew url : ', new_url)
        driver.get(new_url)
        # Getting the data from the site
        try:
            link = driver.find_elements(By.TAG_NAME, "p")
            for p in link:
                print(p.get_attribute("innerText"))
        except:
            continue
driver.quit()
I am trying to start a new thread for each page, but this way it only starts a new thread after the previous thread/function has finished.
Can anyone help me run them independently of each other?
Example:
Thread 1:
Open page 1
Thread 2:
Open page 2
And do this for X amount of pages.
I am a beginner in Python, so excuse my messy code.
import random
import string
import threading
from time import sleep

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

# driver.find_element_by_css_selector("a[onclick*='if (!window.__cfRLUnblockHandlers) return false; bail()']")


def randomStringDigits(stringLength=6):
    """Generate a random string of letters and digits """
    lettersAndDigits = string.ascii_letters + string.digits
    return ''.join(random.choice(lettersAndDigits) for i in range(stringLength))


def startscrape(url):
    driver = webdriver.Chrome(executable_path='chromedriver.exe')
    driver.get("urlhere")
    cookies_list = driver.get_cookies()
    cookies_dict = {}  # create dictionary
    usrelem = driver.find_element_by_name("login")
    usrelem.send_keys("user")
    pwdelem = driver.find_element_by_name("password")
    pwdelem.send_keys("pass")
    pwdelem.send_keys(Keys.RETURN)
    sleep(1)
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    xx = soup.find("input",
                   {"class": "input input--number js-numberBoxTextInput input input--numberNarrow js-pageJumpPage"})
    driver.get(page)
    wait = WebDriverWait(driver, 10)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    xxx = soup.findAll("a", {"class": "js-lbImage"})
    # find all thumbs
    for link in xxx:
        xxx = soup.find("a", {"href": link.get('href')})
        dlfullimg = driver.find_element_by_xpath("//a[@href='" + xxx.get('href') + "']")
        wait = WebDriverWait(driver, 10)
        dlfullimg.click()
        thumbs = soup.findAll("div", {"class": "lg-thumb-item"})
        dlfullimg = driver.find_element_by_id('lg-download').click()
        close = driver.find_element_by_xpath("//span[@class='lg-close lg-icon']").click()
        sleep(1)
    assert "No results found." not in driver.page_source


url = input("Main URL: ")
driver = webdriver.Chrome(executable_path='chromedriver.exe')
driver.get("urlhere")
cookies_list = driver.get_cookies()
cookies_dict = {}  # create dictionary
usrelem = driver.find_element_by_name("login")
usrelem.send_keys("user")
pwdelem = driver.find_element_by_name("password")
pwdelem.send_keys("pass")
pwdelem.send_keys(Keys.RETURN)
sleep(1)
driver.get(url)
wait = WebDriverWait(driver, 10)
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Find page number with soup.find
xx = soup.find("input",
               {"class": "input input--number js-numberBoxTextInput input input--numberNarrow js-pageJumpPage"})
driver.close()

threads = []
for i in range(int(xx.get('max'))):
    page = url + "page-" + str(i + 1)
    t = threading.Thread(target=startscrape(url), args=[])
    threads.append(t)
for t in threads:
    t.start()
for t in threads:
    t.join()
You can use concurrent.futures to handle the heavy lifting for you.
Here's some pseudo-code to do it:
import concurrent.futures
from selenium import webdriver

def process_url(url):
    driver = webdriver.Chrome()
    driver.get(url)
    # process page
    driver.close()

# Find number of pages here
driver = webdriver.Chrome()
driver.get(url)
# urls = find list of urls
driver.close()

threads_count = 10
with concurrent.futures.ThreadPoolExecutor(threads_count) as executor:
    executor.map(process_url, urls)
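If process_url returns something for each page, executor.map hands the results back in input order, so collecting them is just (same assumptions as the sketch above):
with concurrent.futures.ThreadPoolExecutor(threads_count) as executor:
    results = list(executor.map(process_url, urls))  # one result per url, in input order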
This code will crawl again from the beginning every time an error occurs. I want to change this so it crawls only new text instead of starting over from the beginning.
I would also like to ask for further advice.
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup, Comment
import pandas as pd

#Setting up Chrome webdriver Options
#chrome_options = webdriver.ChromeOptions()
#setting up local path of chrome binary file
#chrome_options.binary_location = "/Users/Norefly/chromedriver2/chromedriver.exec"
#creating Chrome webdriver instance with the set chrome_options
driver = webdriver.PhantomJS("C:/Python/phantomjs-2.1.1-windows/bin/phantomjs.exe")

link = "https://play.google.com/store/apps/details?id=com.supercell.clashofclans&hl=en"
driver.get(link)
#driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
Ptitle = driver.find_element_by_class_name('id-app-title').text.replace(' ', '')
print(Ptitle)
#driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]').click()
sleep(1)
driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
#select_newest.select_by_visible_text('Newest')
#driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
sleep(2)
#driver.find_element_by_css_selector('.review-filter.id-review-sort-filter.dropdown-menu-container').click()
driver.find_element_by_css_selector('.displayed-child').click()
#driver.find_element_by_xpath("//button[@data-dropdown-value='1']").click()
driver.execute_script("document.querySelectorAll('button.dropdown-child')[0].click()")

reviews_df = []
for i in range(1, 10):
    try:
        for elem in driver.find_elements_by_class_name('single-review'):
            print(str(i))
            content = elem.get_attribute('outerHTML')
            soup = BeautifulSoup(content, "html.parser")
            #print(soup.prettify())
            date = soup.find('span', class_='review-date').get_text()
            rating = soup.find('div', class_='tiny-star')['aria-label'][6:7]
            title = soup.find('span', class_='review-title').get_text()
            txt = soup.find('div', class_='review-body').get_text().replace('Full Review', '')[len(title) + 1:]
            print(soup.get_text())
            temp = pd.DataFrame({'Date': date, 'Rating': rating, 'Review Title': title, 'Review Text': txt}, index=[0])
            print('-' * 10)
            reviews_df.append(temp)
            #print(elem)
    except:
        print('what i can do?')
    driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
    #driver.execute_script("document.querySelectorAll('button.dropdown-child')[0].click()")
    #driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()

reviews_df = pd.concat(reviews_df, ignore_index=True)
reviews_df.to_csv(Ptitle + 'review_google.csv', encoding='utf-8')
driver.close()
And I wonder if this is a problem with PhantomJS.
I've been working on this Python script for the past day or two and all is working fine when I use the Firefox webdriver, but when I switch to a headless browser like PhantomJS it fails on the line setNumber = parseSetNumber(setName[0]) with the error Error: list index out of range, due to setName being empty.
The line before it, setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()"), returns nothing when using the PhantomJS webdriver; with the Firefox webdriver it returns a value fine.
The error only happens when I switch the webdriver from Firefox to PhantomJS. I use PhantomJS because the script runs on a Linux server.
import time
import os.path
import lxml.html as LH
import re
import sys
from selenium import webdriver
from random import randint
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

PARAMS = sys.argv
URL = PARAMS[1]
BASEURL = URL[:URL.rfind('/') + 1]


# Parses the set name for the set number
def parseSetNumber(string):
    string = string.split(' ')
    stringLength = len(string)
    string = string[(stringLength - 1)]
    if string.replace('.', '').isdigit():
        return string
    else:
        return ""


# Returns set reference for this site
def parseRefId(string):
    string = string.split('_')
    return str(string[2])


try:
    PAGE_NUMBER = 1
    #--------------------------------------------------
    ## Get initial page
    driver = webdriver.PhantomJS()
    driver.get(PARAMS[1])
    #--------------------------------------------------
    ## Get page count
    # Give page time to load
    time.sleep(2)
    PAGE_RAW = driver.page_source
    PAGE_RAW = LH.fromstring(PAGE_RAW)
    PAGE_COUNT_RAW = PAGE_RAW.xpath("//div[contains(@class, 'pageControlMenu')]/div/ul/li")
    PAGE_COUNT = len(PAGE_COUNT_RAW) - 2
    #--------------------------------------------------
    ## Get page if its not page one
    while PAGE_NUMBER <= PAGE_COUNT:
        #--------------------------------------------------
        ## Create empty file
        FILE_NAME = PARAMS[3] + 'json/' + time.strftime("%Y%m%d%H") + '_' + str(PARAMS[2]) + '_' + str(PAGE_NUMBER) + '.json'
        #--------------------------------------------------
        ## Create JSON file if it doesnt exist
        if os.path.exists(FILE_NAME) == False:
            JSON_FILE = open(FILE_NAME, "a+", encoding="utf-8")
        else:
            JSON_FILE = open(FILE_NAME, "w", encoding="utf-8")
        JSON_FILE.write("{")
        #--------------------------------------------------
        # Click page for next page if not page 1
        if PAGE_NUMBER > 1:
            index = 0
            for atag in PAGE_COUNT_RAW:
                if index == PAGE_NUMBER:
                    elements = driver.find_elements_by_xpath("//div[contains(@class, 'pageControlMenu')]/div/ul/li")
                    if elements:
                        element = elements[index].find_elements_by_xpath("./a")
                        if element:
                            element[0].click()
                            time.sleep(randint(3, 5))
                index += 1
        #--------------------------------------------------
        ## Remove survey box if it pops up and log
        try:
            surveyBox = driver.find_element_by_link_text("No, thanks")
            if surveyBox:
                surveyBox.click()
                print("Store[" + str(PARAMS[2]) + "]: Survey box found on page - " + str(PAGE_NUMBER))
        except:
            print("Store[" + str(PARAMS[2]) + "]: No survey box on page - " + str(PAGE_NUMBER))
        #--------------------------------------------------
        ## Process page
        # If page is greater than 1 then get the page source of the new page.
        if PAGE_NUMBER > 1:
            PAGE_RAW = driver.page_source
            PAGE_RAW = LH.fromstring(PAGE_RAW)
        PAGE_RAW = PAGE_RAW.xpath("//div[contains(@class, 'estore_product_container')]")
        index = 0
        size = len(PAGE_RAW)
        for atag in PAGE_RAW:
            if PAGE_NUMBER > 1 and index == 0:
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a")))
            setStore = PARAMS[2]
            setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()")
            setNumber = parseSetNumber(setName[0])
            setPrice = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_price')]/text()")
            setLink = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/@href")
            setRef = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_price')]/@id")
            if setRef:
                setRef = parseRefId(setRef[0])
            if re.search('[0-9\.]+', setPrice[0]) is not None:
                JSON_FILE.write("\"" + str(index) + "\":{\"store\":\"" + str(setStore) + "\",\"name\":\"" + str(setName[0]) + "\",\"number\":\"" + str(setNumber) + "\",\"price\":\"" + re.search('[0-9\.]+', setPrice[0]).group() + "\",\"ref\":\"" + str(setRef) + "\",\"link\":\"" + str(setLink[0]) + "\"}")
            if index + 1 < size:
                JSON_FILE.write(",")
            index += 1
        #--------------------------------------------------
        ## Close JSON file
        JSON_FILE.write("}")
        JSON_FILE.close()
        #--------------------------------------------------
        ## Increment page number
        PAGE_NUMBER += 1
        #--------------------------------------------------
    #--------------------------------------------------
    ## Close webdriver
    driver.quit()
    #--------------------------------------------------
except Exception as e:
    print('Error: ' + str(e.args[0]))

# Remove ghostdriver.log file
GHOSTDRIVER_FILE = str(PARAMS[3]) + 'jobs/ghostdriver.log'
if os.path.exists(GHOSTDRIVER_FILE) == True:
    os.remove(GHOSTDRIVER_FILE)
Update
It looks like these are the only two lines not working with PhantomJS; they both return an empty value.
setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()")
setLink = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/@href")
OK, it looks like I've solved this issue: I had to add a set_window_size call for the webdriver when using PhantomJS.
Originally:
driver = webdriver.PhantomJS()
driver.get(PARAMS[1])
Solution:
driver = webdriver.PhantomJS()
driver.set_window_size(1024, 768)
driver.get(PARAMS[1])
Now the PhantomJS webdriver works as expected, the same way the Firefox webdriver does. This is likely because PhantomJS defaults to a very small viewport, so without an explicit window size the page can render a layout in which those product elements never appear.