import csv
import time
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from csv import reader
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException
# --- Browser setup -----------------------------------------------------------
chrome_options = Options()
scroll = 5  # NOTE(review): unused below — confirm before removing
# Hide the "Chrome is being controlled by automated software" banner.
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
header_added = False   # set True once the CSV header row has been written
header_added1 = False  # NOTE(review): unused below — confirm before removing
url = "url"  # TODO: replace placeholder with the real site URL
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe', options=chrome_options)
driver.maximize_window()
driver.get(url)
time.sleep(3)

search_city = input("Enter the city :")
res_n = input("Enter the Restaurant's name :")

# Type the city into the location box.
# ('@name', not '#name' — the '#' in the pasted snippet is invalid XPath syntax.)
search = driver.find_element_by_xpath('//input[@name="location"]').send_keys(search_city)
time.sleep(2)
# Pick the first suggestion from the autocomplete dropdown.
driver.find_element_by_xpath('//*[@id="root"]/div[1]/div[1]/div/div[1]/div[1]/div/div[2]/div/div[3]/div[1]/span[2]').click()
time.sleep(3)
# Open the search page from the header menu.
driver.find_element_by_xpath('/html/body/div[1]/div[1]/header/div/div/ul/li[5]/div/a/span[1]').click()
time.sleep(1)
# Search for the restaurant by name and submit.
search_res = driver.find_element_by_class_name('_2BJMh').send_keys(res_n.lower())
time.sleep(5)
driver.find_element_by_class_name('_2BJMh').send_keys(Keys.RETURN)
time.sleep(5)
try:
    # Open the first matching restaurant card.
    driver.find_element_by_class_name('_3FR5S').click()
    time.sleep(5)
except NoSuchElementException:  # was a bare `except:` — catch only "not found"
    print("restaurant not open")
    driver.quit()
html = driver.find_element_by_tag_name('html')
def get_items():
    """Open the second menu category and append every item's name, price,
    add-ons and description to '<city>_<restaurant>.csv'.

    Relies on module globals: driver, search_city, res_n, header_added.
    """
    global header_added
    global item_dvs
    cats = driver.find_elements_by_class_name('D_TFT')
    cats[1].click()
    time.sleep(3)
    item_dvs = driver.find_elements_by_class_name('_2wg_t')
    for div in item_dvs:
        name = div.find_element_by_class_name('styles_itemNameText__3bcKX')
        print(name.text)
        price = div.find_element_by_class_name('rupee')
        print(price.text)
        # find_elements (plural) returns [] instead of raising when absent.
        if div.find_elements_by_class_name('styles_itemDesc__MTsVd'):
            desc = div.find_element_by_class_name('styles_itemDesc__MTsVd').text
        else:
            desc = None
        # Same idiom for the "customisable" badge. (Fixed: the original used
        # find_element here, which raises NoSuchElementException for plain
        # items instead of ever reaching the else branch.)
        if div.find_elements_by_css_selector('div._1C1Fl._23qjy'):
            element = div.find_element_by_css_selector('div._1C1Fl._23qjy')
            print("found")
            driver.execute_script("arguments[0].scrollIntoView();", element)
            add = div.find_element_by_css_selector('._1RPOp')
            driver.execute_script("arguments[0].click();", add)
            time.sleep(1)
            add_ons = driver.find_element_by_class_name('_3UzO2').text
            print(add_ons)
            # Close the add-ons modal again.
            driver.find_element_by_css_selector('#modal-placeholder > div:nth-child(3) > div > div._1Kr-y._3EeZR > div > div._1EZLh > div > button').click()
        else:
            add_ons = None
        dict1 = {'Item Name': name.text, "Price": price.text, "Add Ons :": add_ons, "Description": desc}
        # newline='' keeps csv from writing blank rows on Windows.
        with open(f'{search_city}_{res_n}.csv', 'a+', encoding='utf-8-sig', newline='') as f:
            w = csv.DictWriter(f, dict1.keys())
            if not header_added:
                w.writeheader()
                header_added = True
            w.writerow(dict1)

get_items()
The is_cust loop keeps running over and over again opening the same element, while the rest of the code moves on to the next divs. What is wrong here?
An XPath that starts with '//' searches the whole document even when called on a specific element, rather than only inside that element — that is probably the cause here.
Try this code using cssSelector:
for div in item_dvs:
    # Do something with the plain item fields here.
    try:
        # A scoped CSS selector only matches inside `div`, so each iteration
        # inspects its own card (an XPath starting with '//' would search the
        # whole document and keep hitting the same first element).
        is_cust = div.find_element_by_css_selector('._1C1Fl._23qjy')
        print("found")
    except NoSuchElementException:
        continue
    driver.execute_script("arguments[0].scrollIntoView();", is_cust)
    add = div.find_element_by_css_selector('._1RPOp')
    driver.execute_script("arguments[0].click();", add)
    time.sleep(1)
    # `div` (not `driver`) so the add-on text is read from this item's own card.
    add_ons = div.find_element_by_class_name('_26cJ9').text
    div.find_element_by_css_selector('#modal-placeholder > div:nth-child(3) > div > div._1Kr-y._3EeZR > div > div._1EZLh > div > button').click()
UPDATE
From your updated code, you are using lot of hardcoded sleep. I will suggest to use the WebDriverWait with expected_conditions.
More info here: Wait from Selenium
Imports needed:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
Code to be added post driver creation:
wait_time = 5
wait = WebDriverWait(driver, wait_time)
Instead of using sleep like this:
time.sleep(5)
driver.find_element_by_class_name('_2BJMh').send_keys(Keys.RETURN)
time.sleep(5)
Use:
wait.until(EC.presence_of_element_located((By.CLASS_NAME, '_2BJMh'))).send_keys(res_n.lower())
Don't gather the element twice.. use find_elements_by* then validate the length:
# NOTE(review): wait.until raises TimeoutException when no element ever
# appears, so the else branch only runs if that exception is handled upstream
# — confirm, or use div.find_elements_* for a non-raising existence check.
descs = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'styles_itemDesc__MTsVd')))
if len(descs) > 0:
    desc = descs[0].text
else:
    desc = None
Related
I'm trying to display a list of links that I parsed, but it only finds 1: the variable linkvideo2 ends up holding just the first link to a video. Maybe CSS_SELECTOR can't be used with a loop?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
import time
name = 'hello world'
profile_path = r"C:/Users/Федорчик/Desktop"
options = Options()
options.set_preference('profile', profile_path)
driver = webdriver.Firefox(options=options)
# driver = webdriver.Firefox(r"C:/Users/Федорчик/Desktop")
driver.get('https://www.youtube.com')

# Type the query and run the search.
id_serth = driver.find_element(By.NAME, "search_query")
id_serth.send_keys(name)
button_serth = driver.find_element(By.ID, "search-icon-legacy")
time.sleep(4)
button_serth.click()
time.sleep(4)

# Open the filters panel and pick a filter entry.
button_filtr = driver.find_element(By.CLASS_NAME, "ytd-toggle-button-renderer")
button_filtr.click()
time.sleep(4)
button_filtrtode = driver.find_element(By.CLASS_NAME, "ytd-search-filter-renderer")
button_filtrtode.click()
time.sleep(4)

urltek = driver.current_url
# NOTE(review): ':nth-child(1)' pins this selector to the FIRST result row,
# so find_elements can only ever match one element — hence len(links) == 1.
linkvideo2 = driver.find_elements(By.CSS_SELECTOR, 'ytd-video-renderer.style-scope:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1) > h3:nth-child(1) > a:nth-child(2)')
links = []
for i in linkvideo2:
    links.append(i.get_attribute('href'))
print(len(links))
print(urltek)
Answer:
1
https://www.youtube.com/results?search_query=hello+world&sp=EgIIAQ%253D%253D
I will be very grateful for your help
better find by ID
# 'video-title' is the id carried by every result's title anchor, so this
# collects all results instead of one nth-child-pinned element.
linkvideo2 = driver.find_elements(By.ID, 'video-title')
links = []
for i in linkvideo2:
    links.append(i.get_attribute('href'))
print(len(links))
print(urltek)
print(links)
output of "links" will be something like this:
[None, None, None, None, None, 'https://www.youtube.com/shorts/uXMjlXBW_fg', 'https://www.youtube.com/watch?v=s0a7AWgo6CY', 'https://www.youtube.com/watch?v=SEgOG1fZwoM', 'https://www.youtube.com/watch?v=XXFdprzIxvc', 'https://www.youtube.com/watch?v=82YAs_viROM', 'https://www.youtube.com/watch?v=IgRObNaGhAg', 'https://www.youtube.com/watch?v=sFLPNXJU4KY', 'https://www.youtube.com/watch?v=64Wo6zQDgQ0', 'https://www.youtube.com/watch?v=SpvI9648RrU', 'https://www.youtube.com/watch?v=wUw-rh97fso', 'https://www.youtube.com/watch?v=C7XDmQsf1-A', 'https://www.youtube.com/watch?v=3md7eviZC_Y', 'https://www.youtube.com/watch?v=wklkXDhb6MQ', 'https://www.youtube.com/watch?v=82wxikm2CwI', 'https://www.youtube.com/watch?v=D1DVUYcE43A', 'https://www.youtube.com/watch?v=MvFgCJqpxWI', 'https://www.youtube.com/shorts/8cfy5DjIS10', 'https://www.youtube.com/watch?v=5RV3unTmyTA', 'https://www.youtube.com/watch?v=Sjgee-HCCxs', 'https://www.youtube.com/watch?v=vUGQsCQNUgs']
I am trying to paginate through and scrape each table's details from this site.
https://www.cyprusbar.org/CypriotAdvocateMembersPage.aspx
Screenshot
I need to click each details box, get directed to a new window and do it for the other records in each page. Then paginate. Here is my selenium code
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
PATH = 'chromedriver.exe'
options = Options()
# Suppress DevTools logging noise in the console.
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_argument("--lang=en")
driver = webdriver.Chrome(executable_path=PATH, options=options)
driver.maximize_window()
driver.get('https://www.cyprusbar.org/CypriotAdvocateMembersPage.aspx')
# '@id', not '#id' — the '#' in the pasted snippet is invalid XPath syntax.
driver.find_element_by_xpath('//*[@id="Div1"]/input').click()
def wait(locator, id):
    """Wait up to 50s for all elements matching (locator, id) and return them.

    NOTE(review): despite the singular name this returns a LIST of elements
    (presence_of_all_elements_located); the parameter also shadows builtin `id`.
    """
    element = WebDriverWait(driver, 50).until(
        EC.presence_of_all_elements_located((locator, id))
    )
    return element
DATA = []  # accumulated row dicts, one per lawyer

# XPaths of the read-only detail fields on the practitioner page.
# ('@id', not '#id' — the '#' in the pasted snippet is invalid XPath syntax.)
name = '//*[@id="ctl00_ContentPlaceHolder1_TxtName_I"]'
postal = '//*[@id="ctl00_ContentPlaceHolder1_TxtPostalCode_I"]'
fax = '//*[@id="ctl00_ContentPlaceHolder1_TxtFax_I"]'
province = '//*[@id="ctl00_ContentPlaceHolder1_TxtDistrict_I"]'
email = '//*[@id="ctl00_ContentPlaceHolder1_TxtEmail_I"]'
address = '//*[@id="ctl00_ContentPlaceHolder1_TxtAddress_I"]'
phone = '//*[@id="ctl00_ContentPlaceHolder1_TxtPhone_I"]'
courtroom = '//*[@id="ctl00_ContentPlaceHolder1_TxtCourtBox_I"]'
webpage = '//*[@id="ctl00_ContentPlaceHolder1_TxtUrl_I"]'
details = ['Postal Code', 'Fax', 'Calendar Province', 'Email', 'Address', 'Phone', 'Courtroom', 'Webpage']
def gotopage(page):
    """Reach `page` from page 1 by clicking the 'next' arrow (page-1) times."""
    for p in range(page - 1):
        next_page = driver.find_element_by_class_name('dxWeb_pNext_Material')
        action = ActionChains(driver)
        action.click(next_page)
        action.perform()
        time.sleep(4)
def each_page(page, new):
    """Click every 'details' link on `page` (up to 80 rows), scrape the
    practitioner fields into DATA, and navigate back after each row."""
    global DATA
    curr = 0
    while curr < 80:
        if page > 1 and new:
            # Re-navigate to `page` and set the pager's page-size dropdown.
            # NOTE(review): `new` is never cleared, so this runs on every
            # loop iteration for page > 1 — confirm that is intended.
            gotopage(page)
            action = ActionChains(driver)
            action.move_to_element(driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_LawyersGrid_DXPagerBottom_PSI"]')).click()
            action.perform()
            action.send_keys(Keys.ARROW_UP, Keys.RETURN)
            action.perform()
            time.sleep(17)
        data = {}
        action = ActionChains(driver)
        detail_list = wait(By.CLASS_NAME, 'dxb-hbc')
        try:
            action.click(detail_list[curr])
            action.perform()
        except IndexError:
            # Fewer detail buttons than expected — go back and realign.
            print(curr)
            driver.back()
            gotopage(page)
        data['Name'] = wait(By.XPATH, name)[0].get_attribute('value')
        for i, d in enumerate([postal, fax, province, email, address, phone, courtroom, webpage]):
            info = driver.find_element_by_xpath(d).get_attribute(('value'))
            data[details[i]] = info
        DATA.append(data)
        curr += 1
        driver.back()
print('============SCRAPING===============')
page = 1
new = True
while page <= 50:
    try:
        each_page(page, new)
        page += 1
    except Exception as err:
        # NOTE(review): broad catch — any failure is printed and the same
        # page is retried on the next loop iteration.
        print(err)
        print(page)
The problem here is that this is incredibly slow because each time you say
driver.back()
It goes back to page 1, so to resume it would have to navigate all the way forward again to the page it was on.
Is there anyway i can achieve this with something like BeautifulSoup?
Trying to get the tyres' details from this page. https://eurawheels.com/fr/catalogue/BBS
# '@class', not '#class' — the '#' in the pasted snippet is invalid XPath syntax.
links = driver.find_elements_by_xpath('//div[@class="col-xs-1 col-md-3"]//a')
parent_window = driver.current_window_handle
x = 0
for j in range(len(links)):
    driver.execute_script('window.open(arguments[0]);', links[j])
    # scraping here
    if x == 0:
        driver.close()
        driver.switch_to.window(parent_window)
        x += 1
    else:
        driver.back()
        driver.refresh()  # refresh page
    # redefine links after navigating, otherwise the references go stale
    tyres = WebDriverWait(driver, 25).until(EC.visibility_of_all_elements_located((By.XPATH, '//div[@class="card-body text-center"]//a')))
    time.sleep(4)
It works for 10 links but then the links go stale. Cannot figure out what needs to be changed. Any help is welcome.
You need to add scroll element into the view before executing driver.execute_script('window.open(arguments[0]);', links[j]) since not all the elements are initially loaded on the page.
So your code should look like following:
from selenium.webdriver.common.action_chains import ActionChains
actions = ActionChains(driver)

links = driver.find_elements_by_xpath('//div[@class="col-xs-1 col-md-3"]//a')
parent_window = driver.current_window_handle
x = 0
for j in range(len(links)):
    # Scroll each link into view first — not all elements are loaded on the
    # page initially. (Fixed: the original passed the int index `j` to
    # move_to_element instead of the element `links[j]`.)
    actions.move_to_element(links[j]).perform()
    driver.execute_script('window.open(arguments[0]);', links[j])
    # scraping here
    if x == 0:
        driver.close()
        driver.switch_to.window(parent_window)
        x += 1
    else:
        driver.back()
        driver.refresh()  # refresh page
    # redefine links after navigating, otherwise the references go stale
    tyres = WebDriverWait(driver, 25).until(EC.visibility_of_all_elements_located((By.XPATH, '//div[@class="card-body text-center"]//a')))
    time.sleep(4)
Try this:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
link = 'https://eurawheels.com/fr/catalogue/BBS'

# `with` quits the browser automatically, even on an exception.
with webdriver.Chrome() as driver:
    wait = WebDriverWait(driver, 15)
    driver.get(link)
    linklist = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".card-body > a")))
    for i, elem in enumerate(linklist):
        linklist[i].click()
        # Wait for the loading spinner to disappear before reading the page.
        wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, ".spinner-border[role='status']")))
        time.sleep(2)  # if you kick out this delay, the script runs very fast but you may get the same result multiple times
        item = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "h3"))).text
        print(item)
        # Dismiss the modal, then navigate back to the listing.
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "h1.modal-title + button[class='close'][data-dismiss='modal']"))).click()
        driver.back()
from time import sleep
from webbrowser import Chrome
import selenium
from bs4 import BeautifulSoup as bsoup
import pandas as pd
from selenium import webdriver
class FindByXpathCss():
    def test(self):
        """Open the Play Store reviews page, expand one 'Full review' and
        print its text."""
        baseUrl = ("https://play.google.com/store/apps/details?"
                   "id=com.delta.mobile.android&hl=en_US&showAllReviews=true")
        driver = webdriver.Chrome("F:\\Chrome-webdriver\\chromedriver")
        driver.maximize_window()
        driver.get(baseUrl)
        # Click one (Full review) button to view the full review text.
        fullReviewbtn = driver.find_element_by_css_selector(
            '#fcxH9b > div.WpDbMd > c-wiz > div > div.ZfcPIb > div > div.JNury.Ekdcne > div > div > div.W4P4ne > div:nth-child(2) > div > div:nth-child(2) > div > div.d15Mdf.bAhLNe > div.UD7Dzf > span:nth-child(1) > div > button'
        ).click()
        sleep(1)
        # Read that full review text via XPath. TODO: iterate over all ~1200
        # reviews of the app instead of just this one.
        # ('@id', not '#id' — the '#' in the pasted snippet is invalid XPath.)
        elementByXpath = driver.find_element_by_xpath(
            '//*[@id="fcxH9b"]/div[4]/c-wiz/div/div[2]/div/div[1]/div/div/div[1]/div[2]/div/div[2]/div/div[2]/div[2]'
        ).text
        if elementByXpath is not None:
            print("We found an element using Xpath")
            # Review = elementByXpath.get_attribute("Review")
            print(elementByXpath)
        driver.close()

ff = FindByXpathCss()
ff.test()
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
class FindByXpathCss():
    # NOTE(review): everything below runs at class-definition time; it
    # probably belongs in a method or at module level.
    driver = webdriver.Chrome(executable_path=r"C:\New folder\chromedriver.exe")
    driver.maximize_window()
    baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"
    driver.get(baseUrl)

    # Scroll to the bottom repeatedly so more reviews lazy-load.
    scrolls = 15
    while True:
        scrolls -= 1
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(3)
        if scrolls < 0:
            break
    # Click the "Show More" span once it is clickable.
    # ('@class', not '#class' — the '#' in the pasted snippet is invalid XPath.)
    elemtn = WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable((By.XPATH, "//span[contains(@class,'RveJvd snByac')]")))
    elemtn.click()

    # Second round of scrolling + "Show More" to load yet more reviews.
    scrolls = 5
    while True:
        scrolls -= 1
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(3)
        if scrolls < 0:
            break
    elemtn = WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable((By.XPATH, "//span[contains(@class,'RveJvd snByac')]")))
    elemtn.click()

    reviewText = WebDriverWait(driver, 30).until(
        EC.presence_of_all_elements_located((By.XPATH, "//*[@class='UD7Dzf']")))
    # reviewText = driver.find_elements_by_xpath("//*[@class='UD7Dzf']")
    for textreview in reviewText:
        print(textreview.text)  # fixed: was Python-2 `print textreview.text`
The website I am scrapping is:
http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx
I am getting to page 10 with my code that is looking at the pagination numbers and iterating over them but it is failing when it wants to get past page 10 because there are three dots (...) that, if you click in the browser, it loads page 11 (Same for after page 20, page 30 etc). How can I update my code below so that it can deal with this error without breaking?
The code I am using is:
import re
import string
import urlparse
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
class DoctorScraper(object):
def __init__(self):
self.url = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"
self.driver = webdriver.PhantomJS()
self.driver.set_window_size(1120, 550)
def scrape(self):
self.driver.get(self.url)
# choose to search using the region
try:
self.driver.find_element_by_id('SearchChkb_5').click()
except NoSuchElementException:
pass
#get the provinces that are available
select = Select(self.driver.find_element_by_id('ddlProvince'))
option_indexes = range(1, len(select.options))
#iterate through the provinces
for index in option_indexes[:3]:
select.select_by_index(index)
#click the search button
self.driver.find_element_by_id('cmdSearch').click()
pageno = 2
while True:
#create a beautiful soup of the page source code
s = BeautifulSoup(self.driver.page_source)
#get all links that match seeing practitioner profile
r1 = re.compile(r'^PractitionerView\.aspx\?FILENO=([A-Z0-9-]+)$')
#create a dictionary of the attributes
x = {'href': r1}
#so in the page source, find all links that have the attributes stated in x
for a in s.findAll('a', attrs=x):
print 'View Doctor URL: ', urlparse.urljoin(self.driver.current_url, a['href'])
print
# Pagination
try:
next_page_elem = self.driver.find_element_by_xpath("//a[text()='%d']" % pageno)
print "Next page: ", next_page_elem
except NoSuchElementException:
break # no more pages
print 'page ', pageno, '\n'
next_page_elem.click()
pageno += 1
self.driver.quit()
if __name__ == '__main__':
scraper = DoctorScraper()
scraper.scrape()
I am getting this error:
StaleElementReferenceException: {"errorMessage":"Element does not exist in cache","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"121","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:63135","User-Agent":"Python http auth"},"httpVersion":"1.1","method":"POST","post":"{\"using\": \"tag name\", \"sessionId\": \"ef6d0590-a2d6-11e7-91fa-5773b3326267\", \"id\": \":wdc:1506442969197\", \"value\": \"option\"}","url":"/elements","urlParsed":{"anchor":"","query":"","file":"elements","directory":"/","path":"/elements","relative":"/elements","port":"","host":"","password":"","user":"","userInfo":"","authority":"","protocol":"","source":"/elements","queryKey":{},"chunks":["elements"]},"urlOriginal":"/session/ef6d0590-a2d6-11e7-91fa-5773b3326267/element/:wdc:1506442969197/elements"}}
The main problem with this site is that the clickable elements frequently goes beyond the sight and it throws element not clickable error. However, I've already fixed it. If you have ChromeDriver installed in your machine, just run it and see the magic. It will flawlessly traverse all the pages no matter how many they are. I've checked it.
from selenium import webdriver ; import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
main_link = 'http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx'

def get_content(driver, wait, link):
    """Search WESTERN CAPE practitioners, then walk every result page,
    clicking the last '...' whenever the next page number (11, 21, ...)
    is not yet rendered as a direct link."""
    driver.get(link)
    driver.find_element_by_id('SearchChkb_5').click()
    select = Select(driver.find_element_by_id('ddlProvince'))
    select.select_by_visible_text('WESTERN CAPE')
    # Scroll down so the search button is in view / clickable.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    elem = wait.until(EC.visibility_of_element_located((By.ID, 'cmdSearch')))
    elem.click()
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    page_counter = 2
    while True:
        try:
            # Page numbers of the form 10k+1 hide behind the trailing '...'.
            if not page_counter % 10 == 1:
                driver.find_element_by_link_text(str(page_counter)).click()
                page_counter += 1
            else:
                driver.find_elements_by_link_text("...")[-1].click()
                time.sleep(2)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                page_counter += 1
        except NoSuchElementException:
            break  # no more pages

if __name__ == '__main__':
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    try:
        get_content(driver, wait, main_link)
    finally:
        driver.close()
And using Class:
from selenium import webdriver ; import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class DoctorScraper(object):
    """Same pagination strategy as above, wrapped in a class."""

    def __init__(self):
        self.url = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def __del__(self):
        self.driver.close()

    def controlling_pagination(self):
        """Walk every result page; '...' reveals each next block of ten."""
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        page_counter = 2
        while True:
            try:
                # Page numbers of the form 10k+1 hide behind the trailing '...'.
                if not page_counter % 10 == 1:
                    self.driver.find_element_by_link_text(str(page_counter)).click()
                    page_counter += 1
                else:
                    self.driver.find_elements_by_link_text("...")[-1].click()
                    time.sleep(2)
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    page_counter += 1
            except NoSuchElementException:
                break  # no more pages

    def get_content(self):
        """Run the WESTERN CAPE search, then paginate through the results."""
        self.driver.get(self.url)
        self.driver.find_element_by_id('SearchChkb_5').click()
        select = Select(self.driver.find_element_by_id('ddlProvince'))
        select.select_by_visible_text('WESTERN CAPE')
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        elem = self.wait.until(EC.visibility_of_element_located((By.ID, 'cmdSearch')))
        elem.click()
        self.controlling_pagination()

if __name__ == '__main__':
    scraper = DoctorScraper()
    scraper.get_content()
Btw, take a look at the bottom of the image where you can see the changes of pages: