I've been at this for hours and haven't made any progress. I'm trying to click on the next button on this page here
Here's my code:
#!/usr/local/bin python3
import sys
import time
import re
import logging
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as options
from bs4 import BeautifulSoup as bs
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
_USE_VIRTUAL_DISPLAY = False
_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
# logging.basicConfig(filename=LOG_FILENAME,level=logging.DEBUG)
logging.basicConfig(format=_FORMAT, level=logging.INFO)
_LOGGER = logging.getLogger(sys.argv[0])
_DEFAULT_SLEEP = 0.5
try:
options = options()
# options.headless = True
driver = webdriver.Firefox(options=options, executable_path=r"/usr/local/bin/geckodriver")
print("Started Browser and Driver")
except:
_LOGGER.info("Can not run headless mode.")
url = 'https://www.govinfo.gov/app/collection/uscourts/district/alsd/2021/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D'
driver.get(url)
time.sleep(5)
page = driver.page_source
soup = bs(page, "html.parser")
next_page = WebDriverWait(driver,5).until(EC.element_to_be_clickable((By.XPATH,'//*[#id="collapseOne1690"]/div/span[1]/div/ul/li[8]/a')))
if next_page:
print('*****getting next page*****')
# driver.execute_script('arguments[0].click()', next_page)
next_page.click()
time.sleep(3)
else:
print('no next page')
driver.quit()
I get a timeout error. I've tried changing the XPath. I've tried ActionChains to scroll into view and none have worked. Any help appreciated.
1 Your XPATH does not work because it uses dynamic class name collapseOne1690, as was mentioned earlier.
Also, it's not very stable even if you used a part of this class name.
If you prefer XPaths, I'd suggest this one: //span[#class='custom-paginator']//li[#class='next fw-pagination-btn']/a or just //li[#class='next fw-pagination-btn']/a. You can also use css selector: .next.fw-pagination-btn
2 I got rid of logging code because it also has some issues, re-check it.
3 5 seconds explicit wait is too small. Make it at least 10 seconds, better 15. It's just a suggestion.
The smallest reproducible code which clicks the button and uses Firefox is:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as options
from bs4 import BeautifulSoup as bs
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
options = options()
# options.headless = True
driver = webdriver.Firefox(options=options)
print("Started Browser and Driver")
url = 'https://www.govinfo.gov/app/collection/uscourts/district/alsd/2021/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D'
driver.get(url)
page = driver.page_source
soup = bs(page, "html.parser")
print(soup)
next_page = WebDriverWait(driver, 15).until(
EC.element_to_be_clickable((By.XPATH, "//span[#class='custom-paginator']//li[#class='next fw-pagination-btn']/a")))
next_page.click()
# driver.quit()
It appears when I load up this page that the div id's are assigned dynamically. The first time I loaded the page, the id was collapseOne5168, the second time it was collapseOne1136
You might consider using find_element_by_class_name("next fw-pagination-btn") instead?
Related
Hey i would like to be able to access a link for example from the following html code
(to access each profile on the url in the code)
<div class="fancyCompLabel" onclick="window.open('https://www.techpilot.de/servlets/supplier/perfect_profile.jsp?lngCode=de&ckey=A4gxuEGikU16YXWt6RMd','_blank')" style="cursor:pointer;">Rathberger GmbH</div>
basically i want to access each profile get on the profile do stuff an go to the next profile page.
the following code ive written/ got helped by on stack is able to access the relevant html code but im not able to get the link.
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
time.sleep(3)
# Set some Selenium Options
options = webdriver.ChromeOptions()
# options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Webdriver
wd = webdriver.Chrome(executable_path='/usr/bin/chromedriver', options=options)
# URL
url = 'https://www.techpilot.de/zulieferer-suchen?laserschneiden%202d%20(laserstrahlschneiden)'
# Load URL
wd.get(url)
# Get HTML
soup = BeautifulSoup(wd.page_source, 'html.parser')
wait = WebDriverWait(wd, 15)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#bodyJSP #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "#efficientSearchIframe")))
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".hideFunctionalScrollbar #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
#wd.switch_to.default_content() # you do not need to switch to default content because iframe is closed already
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".fancyCompLabel")))
results = wd.find_elements_by_css_selector(".fancyCompLabel")
''' #prints text (e.g. Rathberger) here i would like to acess the link instead
for profil in results:
print(profil)
'''
wd.close()
'''
To get the link in the onclick attribute you can use .get_attribute("onclick"). To parse the text from onclick attribute you could split the string into an array on the ' character and return the index that contains the url, .split("'")[1].
See below:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
time.sleep(3)
# Set some Selenium Options
options = webdriver.ChromeOptions()
# options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Webdriver
wd = webdriver.Chrome(executable_path='/usr/bin/chromedriver', options=options)
# URL
url = 'https://www.techpilot.de/zulieferer-suchen?laserschneiden%202d%20(laserstrahlschneiden)'
# Load URL
wd.get(url)
# Get HTML
soup = BeautifulSoup(wd.page_source, 'html.parser')
wait = WebDriverWait(wd, 15)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#bodyJSP #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "#efficientSearchIframe")))
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".hideFunctionalScrollbar #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
#wd.switch_to.default_content() # you do not need to switch to default content because iframe is closed already
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".fancyCompLabel")))
results = wd.find_elements_by_css_selector(".fancyCompLabel")
''' #prints text (e.g. Rathberger) here i would like to acess the link instead
for profil in results:
print(profil)
'''
for profil in results:
print(profil.get_attribute("onclick").split("'")[1])
wd.close()
Good time of the day,
Currently I work on scraping project with the end goal is to create a DataFrame.
While I navigate from page to page, I have to gather different criterias. Though In case if the criteria is not present on the page, I would like to receive a "None"
import re
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
import time
import random
from bs4 import BeautifulSoup
start_time = time.time()
url='https://www.immoweb.be/en/search/house/for-sale?countries=BE&page=1&orderBy=relevance'
driver = webdriver.Chrome()
driver.implicitly_wait(30)
driver.get(url)
time.sleep(random.uniform(1.0, 3.0))
python_button = driver.find_elements_by_xpath('//*[#id="uc-btn-accept-banner"]')[0]
python_button.click()
time.sleep(random.uniform(1.0, 3.0))
python_button = driver.find_elements_by_xpath('//*[#id="classified_9312278"]')[0]
python_button.click()
soup = BeautifulSoup(driver.page_source)
area = list()
for i in range(15):
python_button = driver.find_elements_by_xpath('//*[#id="classifiedNavigation"]/ul/li[2]/a')[0]
python_button.click()
time.sleep(random.uniform(1.0, 3.0))
soup = BeautifulSoup(driver.page_source)
try:
for table in soup.findAll("th",text=re.compile("Living area")):
if table:
area.append(table.find_next("td").next_element.strip())
else:
area.append(None)
except:
area.append(None)
houses = {"Area":area}
print(houses)
However with the current code, only exisiting value appends to the list - whatever is not added does not even leave a blank.
And here is a link to the search
Thank you very much in advance!
It is pretty much obvious to me now
if soup.findAll("th",text=re.compile("Living area")):
for table in soup.findAll("th",text=re.compile("Living area")):
area.append(table.find_next("td").next_element.strip())
else:
area.append(None)
I've been having trouble trying to extract the phone number after clicking the "llamar" button. So far I've used the xpath method with selenium and also tried using beautiful soup to extract the number but unfortunately nothing has worked. I usually get an invalid selector error (if I use an xpath selector with selenium) and with BS4 I get a - AttributeError: 'NoneType' object has no attribute 'text' ...
I hope you can help me out!
Here is the url to the link - https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm
Heres the code that I tried:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import UnexpectedAlertPresentException
url = 'https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque - 386352344.htm'
path = r'C:\Users\WL-133\anaconda3\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe'
path1 = r'C:\Users\WL-133\anaconda3\Lib\site-packages\selenium\webdriver\firefox'
# driver = webdriver.Chrome(path)
options = Options()
driver = webdriver.Chrome(path)
driver.get(url)
a = []
mah_div = driver.page_source
soup = BeautifulSoup(mah_div, features='lxml')
cookie_button = '//*[#id="sui-TcfFirstLayerModal"]/div/div/footer/div/button[2]'
btn_press = driver.find_element_by_xpath(cookie_button)
btn_press.click()
llam_button = '//*[#id="ad-detail-contact"]/a[2]'
llam_press = driver.find_element_by_xpath(llam_button)
llam_press.click()
time.sleep(10)
for item in soup.find_all("div", {"class": "contenido"}):
a.append(item.find("div", {"class": "plaincontenido"}).text)
print(a)
The phone is stored inside Javascript. You can use re module to extract it:
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm"
phone_url = "https://www.milanuncios.com/datos-contacto/?usePhoneProxy=0&from=detail&includeEmail=false&id={}"
ad_id = re.search(r"(\d+)\.htm", url).group(1)
html_text = requests.get(phone_url.format(ad_id)).text
soup = BeautifulSoup(html_text, "html.parser")
phone = re.search(r"getTrackingPhone\((.*?)\)", html_text).group(1)
print(soup.select_one(".texto").get_text(strip=True), phone)
Prints:
ana (Particular) 639....
With Selenium you will need to click the button and to switch to iframe.
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
wait.until(EC.element_to_be_clickable(
(By.CSS_SELECTOR, ".def-btn.phone-btn")))
tel_button = driver.find_element_by_css_selector(".def-btn.phone-btn")
tel_button.click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, "ifrw")))
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR,".texto>.telefonos")))
tel_number = driver.find_element_by_css_selector(".texto>.telefonos").text
Please note, I used much stable locators.
I am scraping public linkedIn data from specific people.
here is the code inside the while loop. For you to know, I used time.sleep() for the first 400 profils urls and it worked. However, it is not working anymore as it makes my firefox browser crash. I am pretty sure that the bug comes from the time.sleep() function that I tried to modify using implictly_wait() and WebdriverWait. However, none of this tries worked ;(
Here the code inside the while loop with the time.sleep() function that worked for around 400urls:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
browser = webdriver.Firefox()
browser.get("https://www.linkedin.com/uas/login")
time.sleep(4)
username = browser.find_element_by_id("session_key-login")
password = browser.find_element_by_id("session_password-login")
username.send_keys("yourmail")
password.send_keys("yourpassword")
login_attempt = browser.find_element_by_xpath("//*[#type='submit']")
login_attempt.submit()
time.sleep(4)
browser.get(the profile link I wanna scrap)
html = browser.page_source
soup = BeautifulSoup(html,"html.parser")
formation = soup.find_all('div', {'class': "education"})
nom = soup.find_all('span', {'class': "full-name"})
for a in nom:
for b in formation:
print(a.text,b.text)
time.sleep(4)
browser.close()
I tried to replace the time.sleep() by Implicitly_wait() but it is not working. The browser does not wait at all.
I also tried this
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
browser = webdriver.Firefox()
browser.get("the profile url I wanna scrap")
delay = 30 # seconds
try:
WebDriverWait(browser, delay).until(EC.presence_of_element_located(browser.find_element_by('education'))
print("Page is ready!")
except TimeoutException:
print("Loading took too much time!")
But it is still not working.
Do you have any idea on how to solve the issue ?
If I could make the browser wait without using time.sleep() (which makes my browser crash) without any conditions that would be amazing !
other question ? If I use chrome instead of firefox, do I have a chance to overcome the problem ?
Thanks for your answers,
Raphaƫl
With WebDriverWait, the browser waits but firefox crashes again: here the code
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
browser = webdriver.Firefox()
browser.get("https://www.linkedin.com/uas/login")
username = browser.find_element_by_id("session_key-login")
password = browser.find_element_by_id("session_password-login")
username.send_keys("mail")
password.send_keys("password")
login_attempt = browser.find_element_by_xpath("//*[#type='submit']")
login_attempt.submit()
try:
element = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.ID, "content")))
finally:
browser.get("profil linkedin to scrap")
html = browser.page_source
soup = BeautifulSoup(html,"html.parser")
formation = soup.find_all('div', {'class': "education"})
nom = soup.find_all('span', {'class': "full-name"})
for a in nom:
for b in formation:
print(a.text,b.text)
browser.close()
I am scraping one of the URLs like this which loads data via Ajax. When using Firefox it's able to scrape HTML but upon using PhantomJS it return:
<html><head></head><body></body></html>
Code is below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import selenium.webdriver.support.ui as ui
import sys
import os
from time import sleep
driver = None
url = 'https://sports.bovada.lv/live-betting/event/2391243'
driver = webdriver.PhantomJS('/Setups/phantomjs-1.9.8-macosx/bin/phantomjs')
driver.set_window_size(1128, 768) # optional
driver.get(url)
wait = ui.WebDriverWait(driver, 3000)
sleep(40)
#wait.until(EC.staleness_of(driver.find_element_by_id("coupon")), 'visible')
html = driver.page_source
#userElement = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "coupon")))
print(html)
Update
Ok this is happening with every URL regardless of ajax or non-Ajax