I'm trying to automate the scraping of links from here:
https://thegoodpubguide.co.uk/pubs/?paged=1&order_by=category&search=pubs&pub_name=&postal_code=&region=london
Once I have the first page, I want to click the right chevron at the bottom to move to the second page, then the third, and so on, scraping the links on each page along the way.
Unfortunately nothing I try will allow me to send chrome to the next page.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from datetime import datetime
import csv
from selenium.webdriver.common.action_chains import ActionChains
# Page counter shared with findlinks(); scraping starts on page 1.
pagenum = 1

# Creates the Chrome driver used for the whole session
path_to_chromedriver = '/Users/abc/Downloads/chromedriver 2'  # change path as needed
driver = webdriver.Chrome(executable_path=path_to_chromedriver)

# First page of the search results to scrape
url = 'https://thegoodpubguide.co.uk/pubs/?paged=1&order_by=category&search=pubs&pub_name=&postal_code=&region=london'


def findlinks(address):
    """Scrape result links page by page, starting from ``address``.

    Loads ``address`` and then, while the global ``pagenum`` is at most 2,
    reads the href of 20 result entries, appends the collected links as one
    row to ``links.csv``, and clicks the next-page link.
    """
    global pagenum
    links = []  # was named `list`, which shadowed the builtin
    driver.get(address)
    while pagenum <= 2:
        for i in range(20):  # Scrapes available links
            # XPath positions are 1-based, hence i + 1.
            xref = '//*[@id="search-results"]/div[1]/div[' + str(i + 1) + ']/div/div/div[2]/div[1]/p/a'
            link = driver.find_element_by_xpath(xref).get_attribute('href')
            print(link)
            links.append(link)
        # NOTE: `links` is not reset per page, so each row also repeats the
        # links gathered on earlier pages.
        with open("links.csv", "a") as fp:  # Saves list to file
            wr = csv.writer(fp, dialect='excel')
            wr.writerow(links)
        print(pagenum)
        pagenum = pagenum + 1
        # NOTE: positional locator kept exactly as posted; this is the lookup
        # that raises the NoSuchElementException quoted below.
        element = driver.find_element_by_xpath('//*[@id="search-results"]/div[2]/div/div/ul/li[8]/a')
        element.click()


findlinks(url)
Is something blocking the button that I'm not seeing?
The error printed in my terminal:
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="search-results"]/div[2]/div/div/ul/li[8]/a"}
try this :
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the next-page link to become clickable, then
# click it. The link is matched by its class attribute instead of by its
# position in the pagination list.
element = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, "a[class='next-page btn']"))
)
element.click()
EDIT :
The xpath that you're specifying for the chevron is variable between pages, and is not exactly correct. Note the li[6] and li[8] and li[9].
On page 1: the xpath is //*[@id="search-results"]/div[2]/div/div/ul/li[6]/a/i
On page 2: the xpath is //*[@id="search-results"]/div[2]/div/div/ul/li[8]/a/i
On page 3: the xpath is //*[@id="search-results"]/div[2]/div/div/ul/li[9]/a/i
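You'll have to come up with some way of determining which xpath to use. Here's a hint: the last li under //*[@id="search-results"]/div[2]/div/div/ul/ seems to hold the chevron, so an expression such as //*[@id="search-results"]/div[2]/div/div/ul/li[last()]/a should select it on every page.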
ORIGINAL POST :
You may want to try waiting for the page to load before you try to find and click the chevron. I usually just do a time.sleep(...) when I'm testing my automation script, but for (possibly) more sophisticated functions, try Waits. See the documentation here.
Related
I'm currently trying to figure out how to loop through a set of studios on a fitness class website.
On the search results page of this website, it lists 50 studios on each page and there are about 26 pages. https://classpass.com/search if you want to take a look.
My code parses the search result page, and selenium gets the link for each studio on the page(In my full code selenium opens goes to the link and scrapes data on the page).
After looping through all the results on page 1, I want to click the next page button and repeat on results page 2. I get the error Message: no such element: Unable to locate element: but I know the element is definitely on the results page and can be clicked. I tested this with a simplified script to confirm.
What could I be doing wrong? I've tried many suggestions but none have worked so far.
from selenium import webdriver
from bs4 import BeautifulSoup as soup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as browser_wait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import re
import csv
# initialize the chrome browser
browser = webdriver.Chrome(executable_path=r'./chromedriver')

# Base URL that each studio's relative href is appended to
class_pass_url = 'https://www.classpass.com'

# Create file and writes the first row, added encoding type as write was giving errors
#f = open('ClassPass.csv', 'w', encoding='utf-8')
#headers = 'URL, Studio, Class Name, Description, Image, Address, Phone, Website, instagram, facebook, twitter\n'
#f.write(headers)

# classpass results page
page = "https://classpass.com/search"
browser.get(page)

# Wait up to 10 seconds until an element with class "line" is visible
browser_wait(browser, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, "line")))

# Scrolls to bottom of page to reveal all classes
# browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Extract page source and parse
search_source = browser.page_source
search_soup = soup(search_source, "html.parser")

pageCounter = 0
maxpagecount = 27

# Looks through results and gets link to class page
studios = search_soup.findAll('li', {'class': '_3vk1F9nlSJQIGcIG420bsK'})

while (pageCounter < maxpagecount):
    # Re-parse the page the browser is currently showing.
    search_source = browser.page_source
    search_soup = soup(search_source, "html.parser")
    studios = search_soup.findAll('li', {'class': '_3vk1F9nlSJQIGcIG420bsK'})

    # Visit every studio linked from this results page.
    for studio in studios:
        studio_link = class_pass_url + studio.a['href']
        browser.get(studio_link)
        browser_wait(browser, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, "line")))

    # NOTE: kept as posted. After the loop above the browser is on the last
    # studio page rather than the results page, and this lookup is where the
    # "no such element" error described in the question is raised.
    element = browser.find_element_by_xpath('//*[@id="Search_Results"]/div[1]/div/div/nav/button[2]')
    # Click through JavaScript instead of WebElement.click().
    browser.execute_script("arguments[0].click();", element)

    # The posted loop never advanced its counter, so it could not terminate.
    pageCounter += 1
You have to return to the main results page before looking for the next-page button. You could solve the problem by replacing the following line with the loop below it, which first collects the studio entries from every results page.
studios = search_soup.findAll('li', {'class': '_3vk1F9nlSJQIGcIG420bsK'})
to
# Collect the studio <li> entries from every results page before visiting any
# of them. num_pages is the number of result pages to walk and must be
# defined before this loop.
studios = []
for page in range(num_pages):
    # Re-parse on each pass: a BeautifulSoup object is a snapshot of the HTML
    # it was built from and does not change when the browser moves on.
    search_soup = soup(browser.page_source, "html.parser")
    # extend() keeps `studios` a flat list that `for studio in studios` can use.
    studios.extend(search_soup.findAll('li', {'class': '_3vk1F9nlSJQIGcIG420bsK'}))
    element = browser.find_element_by_xpath('//*[@id="Search_Results"]/div[1]/div/div/nav/button[2]')
    browser.execute_script("arguments[0].click();", element)
    # NOTE(review): the next page may need a moment to render before the next
    # pass reads page_source - confirm whether an explicit wait is needed here.
and remove the code clicking the next page button element.
I am scraping an angular.js site. My initial link has a search button. I find by xpath and click with no issues. After I click search, I want to be able to click each of the athletes in the table to go to their info pages, but I am not having success with the click method. The links are attached to their names.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
# Page-load timeout in seconds
TIMEOUT = 5

driver = webdriver.Firefox()
driver.set_page_load_timeout(TIMEOUT)

url = 'https://n.rivals.com/search#?formValues=%7B%22sport%22:%22Football%22,%22recruit_year%22:2021,%22offer_and_visit_type%22:%5B%22Offer%22%5D,%22prospect_profiles.prospect_colleges.offer%22:true,%22page_number%22:1,%22page_size%22:50%7D'
try:
    driver.get(url)
except TimeoutException:
    # A page-load timeout is ignored on purpose; the script carries on with
    # whatever has loaded within TIMEOUT seconds.
    pass

search_button = driver.find_element_by_xpath('//*[@id="articles"]/div/div[2]/div/div/div[1]/form/div[2]/div[5]/button')
search_button.click()

#below is where I tried, but could not get to click
# NOTE: locator kept as posted; this is the click the question is about.
first_athlete = driver.find_element_by_xpath('//*[@id="content_"]/td[1]/div[2]/a')
first_athlete.click()
Works if you remove the last /a in the xpath:
# Target the enclosing div rather than the <a> inside it.
first_athlete = driver.find_element_by_xpath('//*[@id="content_"]/td[1]/div[2]')
first_athlete.click()
If you want to search for all athletes and you have the name of athletes with you, you can use CSS selector as well.
# Match the link whose href contains the athlete's name. find_element_*
# (singular) returns a single WebElement; find_elements_* returns a list,
# which has no click() method.
athlete = driver.find_element_by_css_selector('#content_ > td > div > a[href*="donovan-jackson"]')
athlete.click()
This code will give you a unique web element for each player.
Thanks
Error message :
selenium.common.exceptions.NoSuchElementException: Message: Unable to locate element: input.ytd-searchbox
I keep getting this error even though I added a sleep call, as suggested in other solutions, to give the page time to load its JavaScript-generated content. Why can it still not find the element?
import time
from selenium import webdriver
# Start Firefox and load the YouTube home page.
firefox = webdriver.Firefox()
firefox.get("https://www.youtube.com")

# Open the subscriptions entry of the mini guide.
firefox.find_element_by_css_selector(
    "ytd-mini-guide-entry-renderer.style-scope:nth-child(3) > a:nth-child(1)"
).click()

# Fixed pause so the page can load before the search bar is looked up.
time.sleep(10)

# Type the query into the search bar and submit it.
search_box = firefox.find_element_by_css_selector('input.ytd-searchbox')
search_box.send_keys("Cute Puppies")
search_box.submit()
I just changed the CSS selector; the one you used was wrong.
How did I work it out? There's an easy trick for writing CSS selectors.
Type the tag name first. In your case it's input.
If the element has an ID, append the ID name prefixed with #, as
I did: #search.
If the element has a class, put a . before its name, for
example .search.
Try this. It's working :
import time
from selenium import webdriver
# Start Firefox through an explicit geckodriver binary and load YouTube.
firefox = webdriver.Firefox(
    executable_path=r'C:\Users\intel\Downloads\Setups\geckodriver.exe'
)
firefox.get("https://www.youtube.com")

# Open the subscriptions entry via its title element.
subscriptions_css = ".style-scope:nth-child(1) > #items > .style-scope:nth-child(3) > #endpoint .title"
firefox.find_element_by_css_selector(subscriptions_css).click()

# Fixed pause so the page can load before the search bar is looked up.
time.sleep(10)

# The search bar is the <input> whose id is "search".
search_box = firefox.find_element_by_css_selector('input#search')
search_box.send_keys("Cute Puppies")
search_box.submit()
I would like to write a Python program that automatically downloads historical stock data from a web page. The corresponding HTML code of the element I would like to select is shown in the following picture:
There are two iframes. One is inside the other. I switch to the second iframe, but the element I would like to click can't be found. I get the following error: "Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id=":cu"]"} (Session info: chrome=75.0.3770.100)"
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import ctypes # An included library with Python install.
import time
user = ""
pwd = ""

driver = webdriver.Chrome()
driver.get("https://www.dukascopy.com/trading-tools/widgets/quotes/historical_data_feed")
driver.maximize_window()

## Give time for iframe to load ##
time.sleep(1)

# get the list of iframes present on the web page using tag "iframe"
seq = driver.find_elements_by_tag_name('iframe')
print("No of frames present in the web page are: ", len(seq))

#switch to correct iFrame
# switch_to.default_content() replaces the deprecated
# switch_to_default_content() and matches the switch_to.frame() call below.
driver.switch_to.default_content()
iframe = driver.find_elements_by_tag_name('iframe')[1]
driver.switch_to.frame(iframe)
driver.implicitly_wait(5)
# NOTE: lookup kept as posted; this is the call that fails with the
# "no such element" message quoted in the question.
elem = driver.find_element_by_id(':cu')
elem.click()

# Windows-only message box (ctypes.windll) shown before the window is closed.
ctypes.windll.user32.MessageBoxW(0, "Test", "Test MsgBox", 1)
driver.close()
If my code would be correct the element "EUR/TRY" in the List would be selected.
There are total 4 iframes.
The table you want to interact with is in iframe[src^='https://freeserv'] and parent iframe is widget-container. One by one you have to switch to it like this :
Code :
# Explicit-wait helper with a 10-second timeout.
waiter = WebDriverWait(driver, 10)
driver.maximize_window()
driver.get("https://www.dukascopy.com/trading-tools/widgets/quotes/historical_data_feed")

# Enter the outer frame first, then the frame nested inside it.
outer_frame = (By.ID, "widget-container")
inner_frame = (By.CSS_SELECTOR, "iframe[src^='https://freeserv']")
for frame_locator in (outer_frame, inner_frame):
    waiter.until(EC.frame_to_be_available_and_switch_to_it(frame_locator))

# The target is the span in front of the row labelled EUR/TRY.
checkbox_locator = (By.XPATH, "//strong[text()='EUR/TRY']/../preceding-sibling::span/span")
checkbox = waiter.until(EC.visibility_of_element_located(checkbox_locator))

# Move the pointer onto the element before clicking it.
ActionChains(driver).move_to_element(checkbox).perform()
checkbox.click()
I'm trying to scrape this website: http://data.eastmoney.com/xg/xg/
So far I've used selenium to execute the javascript and get the table scraped. However, my code right now only gets me the first page. I was wondering if there's a way to access the other 17 pages, because when I click on next page the URL does not change, so I cannot just iterate over a different URL each time
Below is my code so far:
from selenium import webdriver
import lxml
from bs4 import BeautifulSoup
import time
def scrape():
    """Print the first results page of the eastmoney table, pipe-delimited.

    Loads the page in PhantomJS, parses the rendered HTML, prints the header
    cells joined by ``|`` and then one ``|``-joined line per table row.
    Column index 2 is skipped in both the header and the rows.
    """
    url = 'http://data.eastmoney.com/xg/xg/'
    d = {}
    lst = list(range(0, 25))  # column indexes to read

    # The posted code opened this file and never wrote to or closed it; the
    # open is kept (so the file is still created) but is now closed on exit.
    with open('east.txt', 'a'):
        driver = webdriver.PhantomJS()
        try:
            driver.get(url)
            htmlsource = driver.page_source
        finally:
            # Shut the PhantomJS process down even if loading fails.
            driver.quit()

        # Name the parser explicitly instead of letting bs4 pick one.
        bs = BeautifulSoup(htmlsource, 'lxml')

        heading = bs.find_all('thead')[0]
        hlist = []
        for header in heading.find_all('tr'):
            head = header.find_all('th')
        # NOTE(review): indentation was lost in the post. This assumes the
        # column loop runs once, on the cells of the last header row - confirm.
        for i in lst:
            if i != 2:
                hlist.append(head[i].get_text().strip())
        h = '|'.join(hlist)
        print(h)

        table = bs.find_all('tbody')[0]
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            # Rows are keyed by the text of their first cell.
            d[cells[0].get_text()] = [y.get_text() for y in cells]

        for key in d:
            ret = []
            for i in lst:
                if i != 2:
                    ret.append(d.get(key)[i])
            s = '|'.join(ret)
            print(s)


if __name__ == "__main__":
    scrape()
Or is it possible for me to click next through the browser if I use webdriver.Chrome() instead of PhantomJS and then the Python run on the new page, after I click each time?
This is not a trivial page to interact with and would require the use of Explicit Waits to wait for invisibility of "loading" indicators.
Here is the complete and working implementation that you may use as a starting point:
# -*- coding: utf-8 -*-
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
import time
url = "http://data.eastmoney.com/xg/xg/"
driver = webdriver.PhantomJS()
driver.get(url)


def get_table_results(driver):
    """Print the cell texts of every classed row of table ``dt_1`` as a list."""
    for row in driver.find_elements_by_css_selector("table#dt_1 tr[class]"):
        print([cell.text for cell in row.find_elements_by_tag_name("td")])


# initial wait for results: block until the "loading" header cell is gone
WebDriverWait(driver, 10).until(EC.invisibility_of_element_located((By.XPATH, u"//th[. = '加载中......']")))

while True:
    # print current page number
    page_number = driver.find_element_by_id("gopage").get_attribute("value")
    print("Page #" + page_number)

    get_table_results(driver)

    # Stop once the "next page" link carries the "nolink" class.
    next_link = driver.find_element_by_link_text("下一页")
    if "nolink" in next_link.get_attribute("class"):
        break

    next_link.click()
    time.sleep(2)  # TODO: fix?

    # wait for results to load: block until the loading image is gone
    WebDriverWait(driver, 10).until(EC.invisibility_of_element_located((By.XPATH, u"//img[contains(@src, 'loading')]")))

    print("------")
The idea is to have an endless loop which we would exit only if the "Next Page" link becomes disabled (no more pages available). On every iteration, get the table results (printing on the console for the sake of an example), click the next link and wait for invisibility of the "loading" spinning circle appearing on top of the grid.
I found another way to do this in C# using ChromeDriver and Selenium. All you have to do is add the Selenium references to your project and make chromedriver.exe available to it.
In your code you can navigate to the url using
// The using block disposes the driver when it exits.
// C# is case-sensitive: the Selenium class is ChromeDriver.
using (var driver = new ChromeDriver())
{
    driver.Navigate().GoToUrl(pathofurl);
    // Find your element with FindElementByXPath, for example:
    // var element = driver.FindElementByXPath("--XPath--").Text;
}
Finding an XPath is easy: install a scraper or XPath extension from the Chrome Web Store. Once you get the hang of XPaths for elements, you can find the XPath of the next button and use it in your code to navigate through the pages in a loop. Hope this helps.