I've been working on this Python script for the past day or two. Everything works fine with the Firefox webdriver, but when I switch to a headless browser like PhantomJS it fails on the line setNumber = parseSetNumber(setName[0]) with Error: list index out of range, because setName is empty.
The line before it, setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()"), returns nothing only when using the PhantomJS webdriver; with the Firefox webdriver it returns a value fine.
The error only happens when I switch the webdriver from Firefox to PhantomJS. I use PhantomJS because the script runs on a Linux server.
import time
import os.path
import lxml.html as LH
import re
import sys
from selenium import webdriver
from random import randint
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
PARAMS = sys.argv
URL = PARAMS[1]
BASEURL = URL[:URL.rfind('/')+1]
# Parses the set name for the set number
def parseSetNumber(string):
    string = string.split(' ')
    stringLength = len(string)
    string = string[(stringLength - 1)]
    if string.replace('.', '').isdigit():
        return string
    else:
        return ""

# Returns set reference for this site
def parseRefId(string):
    string = string.split('_')
    return str(string[2])
try:
    PAGE_NUMBER = 1
    #--------------------------------------------------
    ## Get initial page
    driver = webdriver.PhantomJS()
    driver.get(PARAMS[1])
    #--------------------------------------------------
    ## Get page count
    # Give page time to load
    time.sleep(2)
    PAGE_RAW = driver.page_source
    PAGE_RAW = LH.fromstring(PAGE_RAW)
    PAGE_COUNT_RAW = PAGE_RAW.xpath("//div[contains(@class, 'pageControlMenu')]/div/ul/li")
    PAGE_COUNT = len(PAGE_COUNT_RAW) - 2
    #--------------------------------------------------
    ## Get page if it's not page one
    while PAGE_NUMBER <= PAGE_COUNT:
        #--------------------------------------------------
        ## Create empty file
        FILE_NAME = PARAMS[3] + 'json/' + time.strftime("%Y%m%d%H") + '_' + str(PARAMS[2]) + '_' + str(PAGE_NUMBER) + '.json'
        #--------------------------------------------------
        ## Create JSON file if it doesn't exist
        if os.path.exists(FILE_NAME)==False:
            JSON_FILE = open(FILE_NAME, "a+", encoding="utf-8")
        else:
            JSON_FILE = open(FILE_NAME, "w", encoding="utf-8")
        JSON_FILE.write("{")
        #--------------------------------------------------
        # Click page for next page if not page 1
        if PAGE_NUMBER > 1:
            index = 0
            for atag in PAGE_COUNT_RAW:
                if index == PAGE_NUMBER:
                    elements = driver.find_elements_by_xpath("//div[contains(@class, 'pageControlMenu')]/div/ul/li")
                    if elements:
                        element = elements[index].find_elements_by_xpath("./a")
                        if element:
                            element[0].click()
                            time.sleep(randint(3,5))
                index += 1
        #--------------------------------------------------
        ## Remove survey box if it pops up and log
        try:
            surveyBox = driver.find_element_by_link_text("No, thanks")
            if surveyBox:
                surveyBox.click()
                print("Store[" + str(PARAMS[2]) + "]: Survey box found on page - " + str(PAGE_NUMBER))
        except:
            print("Store[" + str(PARAMS[2]) + "]: No survey box on page - " + str(PAGE_NUMBER))
        #--------------------------------------------------
        ## Process page
        # If the page number is greater than 1, get the page source of the new page.
        if PAGE_NUMBER > 1:
            PAGE_RAW = driver.page_source
            PAGE_RAW = LH.fromstring(PAGE_RAW)
        PAGE_RAW = PAGE_RAW.xpath("//div[contains(@class, 'estore_product_container')]")
        index = 0
        size = len(PAGE_RAW)
        for atag in PAGE_RAW:
            if PAGE_NUMBER > 1 and index == 0:
                WebDriverWait(driver,10).until(EC.presence_of_element_located((By.XPATH, "./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a")))
            setStore = PARAMS[2]
            setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()")
            setNumber = parseSetNumber(setName[0])
            setPrice = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_price')]/text()")
            setLink = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/@href")
            setRef = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_price')]/@id")
            if setRef:
                setRef = parseRefId(setRef[0])
            if re.search('[0-9\.]+', setPrice[0]) is not None:
                JSON_FILE.write("\"" + str(index) + "\":{\"store\":\"" + str(setStore) + "\",\"name\":\"" + str(setName[0]) + "\",\"number\":\"" + str(setNumber) + "\",\"price\":\"" + re.search('[0-9\.]+', setPrice[0]).group() + "\",\"ref\":\"" + str(setRef) + "\",\"link\":\"" + str(setLink[0]) + "\"}")
                if index+1 < size:
                    JSON_FILE.write(",")
            index += 1
        #--------------------------------------------------
        ## Close JSON file
        JSON_FILE.write("}")
        JSON_FILE.close()
        #--------------------------------------------------
        ## Increment page number
        PAGE_NUMBER += 1
        #--------------------------------------------------
    #--------------------------------------------------
    ## Close webdriver
    driver.quit()
    #--------------------------------------------------
except Exception as e:
    print('Error: ' + str(e.args[0]))

# Remove ghostdriver.log file
GHOSTDRIVER_FILE = str(PARAMS[3]) + 'jobs/ghostdriver.log'
if os.path.exists(GHOSTDRIVER_FILE)==True:
    os.remove(GHOSTDRIVER_FILE)
Update
It looks like these are the only two lines not working with PhantomJS; they both return an empty value.
setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()")
setLink = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/@href")
OK, it looks like I've solved the issue: I had to call set_window_size on the webdriver when using PhantomJS.
Originally:
driver = webdriver.PhantomJS()
driver.get(PARAMS[1])
Solution:
driver = webdriver.PhantomJS()
driver.set_window_size(1024, 768)
driver.get(PARAMS[1])
Now the PhantomJS webdriver works as expected, the same way the Firefox webdriver does.
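Even with the window size fix in place, it may be worth guarding against an empty XPath result before indexing it, so a missing product name can't raise the IndexError again. A minimal sketch using the same XPath (the empty-string fallback is just an assumption about what makes sense for the JSON output):

setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()")
if setName:
    setNumber = parseSetNumber(setName[0])
else:
    # nothing matched for this product block - fall back to empty values instead of indexing an empty list
    setName = [""]
    setNumber = ""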
Related
I'm working with Python and Selenium. I type a keyword, which is then searched on Google. In the results section, I am trying to open the URLs one by one and store the data of the p tag.
But my script stores the data of only one site. Can anyone help me store the data of the p tag for all the opened sites?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

"""
Taking input from user
"""
search_input = input("Input the keyword you want to search for:")
search_input = search_input.replace(' ', '+')
driver = webdriver.Chrome(executable_path="E:\chromedriver\chromedriver.exe")

for i in range(1):
    matched_elements = driver.get("https://www.google.com/search?q=" +
                                  search_input + "&start=" + str(i))
    print(driver.title)
    driver.maximize_window()
    time.sleep(5)
    links_url = driver.find_elements_by_xpath("//div[@class='yuRUbf']/a[@href]")
    links = []
    for x in links_url:
        links.append(x.get_attribute('href'))
    link_data = []
    for new_url in links:
        print('new url : ', new_url)
        driver.get(new_url)
        link_data.append(driver.page_source)
        """
        Getting the data from the site
        """
        content = driver.find_elements(By.TAG_NAME, "p")
        for data in content:
            print(data.text)
        driver.back()
driver.close()
Here is the edited answer; I misunderstood your question at first:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

"""
Taking input from user
"""
search_input = input("Input the keyword you want to search for:")
search_input = search_input.replace(' ', '+')
driver = webdriver.Chrome(executable_path="E:\chromedriver\chromedriver.exe")

for i in range(1):
    matched_elements = driver.get("https://www.google.com/search?q=" +
                                  search_input + "&start=" + str(i))
    print(driver.title)
    driver.maximize_window()
    time.sleep(5)
    links_url = driver.find_elements_by_xpath("//div[@class='yuRUbf']/a[@href]")
    links = []
    for x in links_url:
        links.append(x.get_attribute('href'))
    link_data = []
    for new_url in links:
        print('\nnew url : ', new_url)
        driver.get(new_url)
        # Getting the data from the site
        try:
            link = driver.find_elements(By.TAG_NAME, "p")
            for p in link:
                print(p.get_attribute("innerText"))
        except:
            continue
driver.quit()
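As a side note, the fixed time.sleep(5) could arguably be replaced with an explicit wait, so each article is only held as long as needed. A rough sketch of the same loop using WebDriverWait (the 10-second timeout is an arbitrary choice, and driver/links are the variables from the code above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)
for new_url in links:
    driver.get(new_url)
    try:
        # wait until at least one <p> element is present instead of sleeping a fixed time
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "p")))
        for p in driver.find_elements(By.TAG_NAME, "p"):
            print(p.get_attribute("innerText"))
    except Exception:
        continue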
How can I get the playlist URLs stored like here: https://www.youtube.com/watch?v=VpTRlS7EO6E&list=RDOIhVs0FQ8xc&index=5 with bs4?
Using
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1')
page = r.text
soup=bs(page,'html.parser')
#print(soup)
res=soup.find_all('ytd-playlist-panel-video-renderer')
print(res)
doesn't return anything. Even the printed soup itself doesn't contain the link I'm looking for (like href="/watch?v=puNOG62lf-Y&list=RDOIhVs0FQ8xc&index=2").
It is a JavaScript-rendered page. You have to use Selenium.
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
url = 'https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.set_window_size(1024, 600)
driver.maximize_window()
driver.get(url)
time.sleep(2)
soup=bs(driver.page_source,'html.parser')
res=soup.find_all('ytd-playlist-panel-video-renderer')
print(res)
Install the required package using pip install webdriver-manager
Thank you!
Here is some dirty code that works for me:
#---------------------------------
# import modules
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
import re
#---------------------------------
#
from webdriver_manager.firefox import GeckoDriverManager
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
#---------------------------------
# url of the playlist page (the same url as in the question)
url = 'https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1'
#---------------------------------
# get links from url
def get_links(driver, sleep_time):
    # open driver window
    driver.set_window_size(1024, 600)
    driver.maximize_window()
    driver.get(url)
    # wait some seconds
    time.sleep(sleep_time)
    # get information from url
    soup = bs(driver.page_source, 'html.parser')
    res = soup.find_all('ytd-playlist-panel-video-renderer')
    # check if there is information
    if len(res) > 0:
        main_url = 'https://www.youtube.com/watch?v='
        urls = re.findall('watch.*list', str(res))
        links = [main_url + str(a[8:-9]) for a in urls[::2]]
    # if there is no information return false
    else:
        links = False
    return links
#---------------------------------
# set sleep timer
sleep_time = 10
# call function to get links
links = get_links(driver, sleep_time)
This works for me:
from selenium import webdriver  # pip install selenium
import time

# make sure you download chrome driver from https://chromedriver.chromium.org/downloads and put it in folder 'driver'
driver = webdriver.Chrome('driver\chromedriver.exe')
driver.get('https://www.youtube.com/playlist?list=PLxvodScTx2RtAOoajGSu6ad4p8P8uXKQk')  # put here your link

# scroll page down
old_position = 0
new_position = None
position_script = """return (window.pageYOffset !== undefined) ?
        window.pageYOffset : (document.documentElement ||
        document.body.parentNode || document.body);"""
while new_position != old_position:
    old_position = driver.execute_script(position_script)
    time.sleep(1)
    driver.execute_script(
        """var scrollingElement = (document.scrollingElement ||
        document.body);scrollingElement.scrollTop =
        scrollingElement.scrollHeight;""")
    new_position = driver.execute_script(position_script)
source_page = driver.page_source
driver.quit()

# extract the url's and name's
counter = 1
element_to_find = 'amp;index={}" ar'
video_index = source_page.find(element_to_find.format(counter))  # 'amp;index=1" ar'
while video_index != -1:
    title_element = ''
    count_name = video_index
    while title_element != 'title="':
        title_element = source_page[count_name: count_name + 7]
        count_name += 1
    count_name += 6
    start_title_position = count_name
    end_title = ''
    while end_title != '>':
        end_title = source_page[count_name]  # exit loop if end_title == '>'
        count_name += 1
    name = source_page[start_title_position:count_name - 2]  # extract the name of the video
    name = name.replace('&quot;', '"')
    video_id = source_page[video_index - 56: video_index - 45]  # extract video id
    print(str(counter)
          + '. link: ' + 'https://www.youtube.com/watch?v=' + video_id +
          ', name: ' + name)
    counter += 1
    video_index = source_page.find(element_to_find.format(counter))  # continue with the next video
The easiest solution is:
from pytube import Playlist

URL_PLAYLIST = "https://www.youtube.com/playlist?list=YOUR-LINK"

# Retrieve URLs of videos from playlist
playlist = Playlist(URL_PLAYLIST)
print('Number Of Videos In playlist: %s' % len(playlist.video_urls))

urls = []
for url in playlist:
    urls.append(url)

print(urls)
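If I'm not mistaken, playlist.video_urls can also be turned into a list directly, which makes the append loop unnecessary:

from pytube import Playlist

playlist = Playlist("https://www.youtube.com/playlist?list=YOUR-LINK")
urls = list(playlist.video_urls)  # same result as appending each url in a loop
print(urls)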
I am working on a scraping project and am trying to scrape many different profiles. Not all of the profiles have the same information, so I want to skip that piece of data if the current profile does not have it. Here is my current code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep

driver = webdriver.Chrome("MY DIRECTORY")
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))

body = driver.find_element_by_xpath("//body") #
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

while len(profile_count) < count:  # Get links up to "count"
    body.send_keys(Keys.END)
    sleep(1)
    profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

for link in profile_count:  # Calling up links
    temp = link.get_attribute('href')  # temp for
    driver.execute_script("window.open('');")  # open new tab
    driver.switch_to.window(driver.window_handles[1])  # focus new tab
    driver.get(temp)

    ##### SCRAPE CODE #####
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div')
    IssuedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[1]/div[2]')
    CertificationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]')
    CertfiedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]')
    RecertificationCycle = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[3]/div[2]')
    Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]')
    AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a')
    print(Name.text + " : " + IssuedBy.text + " : " + CertificationNumber.text + " : " + CertfiedSince.text + " : " + RecertificationCycle.text + " : " + Expires.text + " : " + AccreditedBy.text)

    driver.close()
    driver.switch_to.window(driver.window_handles[0])

driver.close()
Please let me know how I would be able to skip an element if it is not present on the current profile.
According to the docs, find_element_by_xpath() raises a NoSuchElementException if the element you're looking for couldn't be found.
I suggest handling potential NoSuchElementExceptions accordingly. What proper exception handling looks like depends on what you're trying to achieve: you might want to log an error, assign default values, skip certain follow-up actions, and so on.
from selenium.common.exceptions import NoSuchElementException

try:
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div')
except NoSuchElementException:
    Name = "Default Name"
You could even wrap multiple find_element_by_xpath() calls in your try block.
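For instance, a small helper can wrap the lookup and return a default when the element is missing. This is just a sketch; the helper name and default value are placeholders of my own:

from selenium.common.exceptions import NoSuchElementException

def find_text_or_default(driver, xpath, default=""):
    # return the element's text, or the default when the profile lacks that field
    try:
        return driver.find_element_by_xpath(xpath).text
    except NoSuchElementException:
        return default

Name = find_text_or_default(driver, '/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div')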
That will fix the try/except, but you have some other errors too; I fixed them all.
Code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep

driver = webdriver.Chrome('chromedriver')
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))

body = driver.find_element_by_xpath("//body") #
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

c = 1
while c <= count:
    for link in profile_count:  # Calling up links
        temp = link.get_attribute('href')  # temp for
        driver.execute_script("window.open('');")  # open new tab
        driver.switch_to.window(driver.window_handles[1])  # focus new tab
        driver.get(temp)
        sleep(1)

        ##### SCRAPE CODE #####
        try:
            Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div')
            IssuedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[1]/div[2]')
            CertificationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]')
            CertfiedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]')
            RecertificationCycle = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[3]/div[2]')
        except:
            c -= 1

        driver.switch_to.window(driver.window_handles[0])
        c += 1
        if c > count:
            break

driver.quit()
I wanted to learn Python, so I started with web scraping. I know my code is all over the place, but as I learn more I will clean it up and make it more efficient. However, I want to know how to print from several different websites and get the output on the same line.
Example:
output : output : output : output : output
Here is my redacted code:
from selenium import webdriver

def bond(x):
    driver = webdriver.Chrome()
    url = 'website'
    driver.get(url)
    year_10_bond = driver.find_elements_by_xpath('element')[0].text
    print(year_10_bond)
    driver.close()

b = bond(print)

def stocks(s):
    driver = webdriver.Chrome()
    for i in range(0, 7661):
        page_num = ('&r=' + str(i * 20 + 1))
        url = 'website'
        driver.get(url)
        tickers = driver.find_elements_by_class_name('element')
        company = driver.find_elements_by_xpath('element')
        price = driver.find_elements_by_xpath('element')
        num_of_tickers = len(tickers)
        for i in range(num_of_tickers):
            print(tickers[i].text + " : " + company[i].text + " : " + price[i].text)

s = stocks(print)

def outstanding(o):
    driver = webdriver.Chrome()
    for i in range(0, 7661):
        page_num = ('&r=' + str(i * 20 + 1))
        url = ('element')
        driver.get(url)
        shares_outstanding = driver.find_elements_by_xpath('element')
        num_of_tickers = len(shares_outstanding)
        for i in range(num_of_tickers):
            print(shares_outstanding[i].text)

o = outstanding(print)
driver.close()
Here is the solution that worked for me; I was just putting it in the wrong place at first:
end = ' '
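In other words, print() normally ends each call with a newline; passing end=' : ' (or end=' ') keeps the cursor on the same line so the next print continues it. A minimal sketch with placeholder values:

year_10_bond = '1.45'
ticker = 'AAPL'
price = '150.00'
print(year_10_bond, end=' : ')
print(ticker, end=' : ')
print(price)  # the last call ends the line
# output: 1.45 : AAPL : 150.00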
I'm having an issue trying to click on an a (href) tag from an XPath query. The line in question is element = atag.xpath("./a"), and I get an error saying Error: 'list' object has no attribute 'click'.
Any help greatly appreciated.
import time
import os.path
import lxml.html as LH
import re
import sys
from selenium import webdriver
from random import randint

PARAMS = sys.argv
URL = PARAMS[1]
BASEURL = URL[:URL.rfind('/')+1]

try:
    PAGE_NUMBER = 1
    #--------------------------------------------------
    ## Get initial page
    driver = webdriver.Firefox()
    driver.get(PARAMS[1])
    #--------------------------------------------------
    ## Get page count
    # Give page time to load
    time.sleep(2)
    PAGE_RAW = driver.page_source
    PAGE_RAW = LH.fromstring(PAGE_RAW)
    PAGE_COUNT_RAW = PAGE_RAW.xpath("//div[contains(@class, 'menu')]/div/ul/li")
    PAGE_COUNT = len(PAGE_COUNT_RAW) - 2
    #--------------------------------------------------
    ## Get page if it's not page one
    while PAGE_NUMBER <= PAGE_COUNT:
        #--------------------------------------------------
        # Delay page processing for a random number of seconds from 2-5
        time.sleep(randint(2,5))
        #--------------------------------------------------
        ## Create empty file
        FILE_NAME = PARAMS[3] + 'json/' + time.strftime("%Y%m%d%H") + '_' + str(PARAMS[2]) + '_' + str(PAGE_NUMBER) + '.json'
        #--------------------------------------------------
        ## Create JSON file if it doesn't exist
        if os.path.exists(FILE_NAME)==False:
            JSON_FILE = open(FILE_NAME, "a+", encoding="utf-8")
        else:
            JSON_FILE = open(FILE_NAME, "w", encoding="utf-8")
        JSON_FILE.write("{")
        #--------------------------------------------------
        # Click page for next page if not page 1
        if PAGE_NUMBER > 1:
            index = 1
            for atag in PAGE_COUNT_RAW:
                if index == (PAGE_NUMBER + 1):
                    element = atag.xpath("./a")
                    element.click()
                index += 1
        #--------------------------------------------------
        ## Process page
        #TODO
        #--------------------------------------------------
        ## Close webdriver
        driver.quit()
        #--------------------------------------------------
        ## Close JSON file
        JSON_FILE.write("}")
        JSON_FILE.close()
        #--------------------------------------------------
        ## Increment page number
        PAGE_NUMBER += 1
        #--------------------------------------------------
except Exception as e:
    print('Error: ' + str(e.args[0]))
You mixed lxml code with selenium code. Your element is a list returned by lxml; it is not a WebElement or a list of WebElements, and you can't call click() on it, even if you try element[0].click().
I'd suggest avoiding lxml, as it seems redundant in this case. Just try to parse the page source with Selenium's built-in methods.
If you need to get the list of div elements, you can use:
PAGE_COUNT_RAW = driver.find_elements_by_xpath("//div[contains(@class, 'menu')]/div/ul/li")
To find child a element:
for div in PAGE_COUNT_RAW:
    element = div.find_element_by_xpath('./a')
Note that if you defined PAGE_COUNT_RAW on the first page, it will not be accessible on the next page, so you can scrape just a list of links and then get each link in a loop. Something like:
links = [link.get_attribute('href') for link in driver.find_elements_by_xpath("//div[contains(@class, 'menu')]/div/ul/li/a")]
for link in links:
    driver.get(link)
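A slightly fuller sketch of that idea, assuming the same menu markup as above (the sleep is just a stand-in for whatever wait the page needs):

import time

# collect the pagination links once, then visit each page by URL instead of re-clicking stale elements
links = [link.get_attribute('href') for link in driver.find_elements_by_xpath("//div[contains(@class, 'menu')]/div/ul/li/a")]
for link in links:
    driver.get(link)
    time.sleep(2)
    page_source = driver.page_source
    # ... parse page_source and write the JSON for this page here ...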
If you need more details, please update your question with a more specific description, as your problem is not quite clear right now.