Hey, I would like to be able to access a link, for example from the following HTML code (to access each profile via the URL in the code):
<div class="fancyCompLabel" onclick="window.open('https://www.techpilot.de/servlets/supplier/perfect_profile.jsp?lngCode=de&ckey=A4gxuEGikU16YXWt6RMd','_blank')" style="cursor:pointer;">Rathberger GmbH</div>
Basically I want to access each profile, do stuff on the profile, and go to the next profile page.
The following code, which I wrote with some help from Stack Overflow, is able to access the relevant HTML code, but I'm not able to get the link.
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
time.sleep(3)
# Set some Selenium Options
options = webdriver.ChromeOptions()
# options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Webdriver
wd = webdriver.Chrome(executable_path='/usr/bin/chromedriver', options=options)
# URL
url = 'https://www.techpilot.de/zulieferer-suchen?laserschneiden%202d%20(laserstrahlschneiden)'
# Load URL
wd.get(url)
# Get HTML
soup = BeautifulSoup(wd.page_source, 'html.parser')
wait = WebDriverWait(wd, 15)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#bodyJSP #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "#efficientSearchIframe")))
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".hideFunctionalScrollbar #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
#wd.switch_to.default_content() # you do not need to switch to default content because iframe is closed already
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".fancyCompLabel")))
results = wd.find_elements_by_css_selector(".fancyCompLabel")
''' # prints the text (e.g. Rathberger); here I would like to access the link instead
for profil in results:
    print(profil)
'''
wd.close()
To get the link in the onclick attribute you can use .get_attribute("onclick"). To parse the URL out of the onclick string, you can split it on the ' character and take the index that contains the URL: .split("'")[1].
See below:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
time.sleep(3)
# Set some Selenium Options
options = webdriver.ChromeOptions()
# options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Webdriver
wd = webdriver.Chrome(executable_path='/usr/bin/chromedriver', options=options)
# URL
url = 'https://www.techpilot.de/zulieferer-suchen?laserschneiden%202d%20(laserstrahlschneiden)'
# Load URL
wd.get(url)
# Get HTML
soup = BeautifulSoup(wd.page_source, 'html.parser')
wait = WebDriverWait(wd, 15)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#bodyJSP #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "#efficientSearchIframe")))
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".hideFunctionalScrollbar #CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll"))).click()
#wd.switch_to.default_content() # you do not need to switch to default content because iframe is closed already
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".fancyCompLabel")))
results = wd.find_elements_by_css_selector(".fancyCompLabel")
''' # prints the text (e.g. Rathberger); here I would like to access the link instead
for profil in results:
    print(profil)
'''
for profil in results:
    print(profil.get_attribute("onclick").split("'")[1])
wd.close()
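If you also want to visit each profile and scrape it, one approach (a sketch, assuming the strings extracted above are complete URLs as in the example) is to collect every link first and only then navigate, so the element references cannot go stale. Run this before wd.close():

# Collect all profile URLs first; navigating away would make `results` stale
profile_urls = [p.get_attribute("onclick").split("'")[1] for p in results]

for profile_url in profile_urls:
    wd.get(profile_url)  # open the profile page
    # ... do stuff on the profile here ...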
Related
This is the website I am trying to scrape for data: https://tis.nhai.gov.in/tollplazasataglance.aspx?language=en#
The links in the 4th column of the site open a popup window with certain info, along with an href behind the "More Information" tab. That leads to links such as https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID=236
from selenium import webdriver
driver = webdriver.Firefox()
driver.maximize_window()
driver.get("https://tis.nhai.gov.in/tollplazasataglance.aspx?language=en#")
b = driver.find_element("xpath", '//*[@id="tollList"]/table/tbody/tr[2]/td[4]/a')
c = driver.execute_script("arguments[0].click();", b)
At this point I am stuck, as I am unable to capture the href or URL of the popup window. Kindly help me get past the popup window to the other page.
Those pop-ups are the result of POST requests, where the payload is each location ID. Here is a way to get the locations IDs:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time as t
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
wait = WebDriverWait(driver, 25)
url = 'https://tis.nhai.gov.in/tollplazasataglance.aspx?language=en#'
driver.get(url)
places = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//div[@id="tollList"]//tbody/tr/td[4]')))
for p in places:
    p_id = p.find_element(By.XPATH, './/a').get_attribute('onclick').split('(')[1].split(')')[0]
    print(p.text, p_id)
Result in terminal:
Aganampudi 236
Amakathadu 258
Badava 4486
Bandapalli 5697
Bandlapalli 5952
Basapuram 4542
Bathalapalli 5753
Bolapalli 252
[...]
Once you have the IDs, you can go to each place's page with https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID={place_id}.
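For example, once you have the IDs you could fetch each place's page directly (a sketch using requests; whether these pages render fully without a browser session is an assumption worth verifying):

import requests

base = 'https://tis.nhai.gov.in/TollInformation.aspx?TollPlazaID={}'
for place_id in ('236', '258', '4486'):  # IDs collected above
    resp = requests.get(base.format(place_id), timeout=30)
    print(place_id, resp.status_code, len(resp.text))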
I am learning Python as well as web scraping, and I want to get the number of reviews from Google Maps for a permanently closed restaurant, but I cannot do that. Would you please help? Thank you.
from bs4 import BeautifulSoup
import requests

url = 'https://www.google.com/maps?q=asia+halal+restaurant+aichi+japan+open+date&safe=strict&rlz=1C1GCEA_enID892ID892&sxsrf=ALeKk01NqaBLM8bXeVVS6M6tv9kAy0G6qQ:1616997971678&gs_lcp=Cgdnd3Mtd2l6EAM6BwgjELADECc6BQghEKABOgQIIRAVOgcIIRAKEKABUIUIWKojYOckaABwAHgAgAHHAogB7RGSAQcxLjUuNC4ymAEAoAEBqgEHZ3dzLXdpesgBAcABAQ&uact=5&um=1&ie=UTF-8&sa=X&ved=2ahUKEwjbhef-7NTvAhWa93MBHaFHCzYQ_AUoAXoECAEQAw'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
ps = soup.find_all(string='クチコミ')
ps
I also tried to find it by 'class' and by 'span aria-label' based on Chrome's developer tools (below), but still cannot get it.
[screenshot: browser view of the HTML class]
#ps = soup.find_all(class_='h0ySl-wcwwM-E70qVe-list')
#ps = soup.find_all('span aria-label')
#total_rev = ps.get_text()
#total_rev
Here is the code that I tried using selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
driver = webdriver.Chrome('F:/Download/SW/chromedriver_win32/chromedriver.exe')
url = 'https://www.google.com/maps/place/%E3%82%A2%E3%83%83%E3%83%90%E3%82%B7+%E3%82%B9%E3%82%A4%E3%83%BC%E3%83%84/@35.0903185,136.8551766,17z/data=!3m1!4b1!4m5!3m4!1s0x600378381c4bb1f7:0x8e9d356b9ded5bcc!8m2!3d35.0903185!4d136.8573653'
driver.get(url)
I have managed to get the number of reviews with the code below for a "still operating" restaurant, but when it comes to the permanently closed one I cannot get the number of reviews:
span_review = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "section-star")]'))).click()
# Find the total number of reviews
total_number_of_reviews = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[2]/div[2]/div/div[2]/div[2]').text.split(" ")[0]
total_number_of_reviews = int(total_number_of_reviews.replace(',', '')) if ',' in total_number_of_reviews else int(total_number_of_reviews)
# Find scroll layout
total_reviews = driver.find_element_by_class_name("h0ySl-wcwwM-E70qVe-list")
total_reviews  # = driver.get('aria-label')
total_reviews = total_reviews.get_text('aria-label')
total_reviews
total_number_of_reviews = total_reviews.text[0:]
total_number_of_reviews
Hopefully I can learn. Thanks!
I can't find your XPath in the HTML. There is no <button> with text section-star, only <li class="section-star">.
Also, aria-label is not text but an attribute, so you have to use .get_attribute('aria-label').
But I found another XPath, //button[@jsaction="pane.rating.moreReviews"], and it works for me for both the permanently closed and the still-operating restaurant.
Tested on Firefox and Chrome, Linux.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
driver = webdriver.Chrome('F:/Download/SW/chromedriver_win32/chromedriver.exe')
#driver = webdriver.Chrome()
#driver = webdriver.Firefox()
all_urls = [
    # permanently closed
    'https://www.google.com/maps/place/%E3%82%A2%E3%83%83%E3%83%90%E3%82%B7+%E3%82%B9%E3%82%A4%E3%83%BC%E3%83%84/@35.0903185,136.8551766,17z/data=!3m1!4b1!4m5!3m4!1s0x600378381c4bb1f7:0x8e9d356b9ded5bcc!8m2!3d35.0903185!4d136.8573653',
    # still operating
    'https://www.google.com/maps/place/Seaside+Restaurant+Higashiyama+Garden+-+Port+Bldg./@35.0841323,136.8474088,14z/data=!3m1!5s0x6003790a61e056e7:0x7f307de064680a96!4m9!1m2!2m1!1srestaurants!3m5!1s0x600379a07cd9fcc7:0x89f84cc9f0422e30!8m2!3d35.0895485!4d136.8809243!15sCgtyZXN0YXVyYW50c1oNIgtyZXN0YXVyYW50c5IBCnJlc3RhdXJhbnQ',
]

for url in all_urls:
    driver.get(url)
    total_reviews = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//button[@jsaction="pane.rating.moreReviews"]')))
    total_reviews = total_reviews.get_attribute('aria-label')
    print(total_reviews)
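If you need the count as an integer, note that the aria-label is usually a localized string such as "1,234 reviews" (the exact wording is an assumption and varies by language); stripping everything but digits is a simple way to extract it:

import re

digits = re.sub(r'\D', '', total_reviews)  # keep only the digits
review_count = int(digits) if digits else 0
print(review_count)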
I've been at this for hours and haven't made any progress. I'm trying to click on the next button on this page.
Here's my code:
#!/usr/bin/env python3
import sys
import time
import re
import logging
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as options
from bs4 import BeautifulSoup as bs
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
_USE_VIRTUAL_DISPLAY = False
_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
# logging.basicConfig(filename=LOG_FILENAME,level=logging.DEBUG)
logging.basicConfig(format=_FORMAT, level=logging.INFO)
_LOGGER = logging.getLogger(sys.argv[0])
_DEFAULT_SLEEP = 0.5
try:
    options = options()
    # options.headless = True
    driver = webdriver.Firefox(options=options, executable_path=r"/usr/local/bin/geckodriver")
    print("Started Browser and Driver")
except:
    _LOGGER.info("Can not run headless mode.")
url = 'https://www.govinfo.gov/app/collection/uscourts/district/alsd/2021/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D'
driver.get(url)
time.sleep(5)
page = driver.page_source
soup = bs(page, "html.parser")
next_page = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="collapseOne1690"]/div/span[1]/div/ul/li[8]/a')))
if next_page:
    print('*****getting next page*****')
    # driver.execute_script('arguments[0].click()', next_page)
    next_page.click()
    time.sleep(3)
else:
    print('no next page')
driver.quit()
I get a timeout error. I've tried changing the XPath. I've tried ActionChains to scroll into view and none have worked. Any help appreciated.
1. Your XPath does not work because it uses the dynamic class name collapseOne1690, as was mentioned earlier. It's also not very stable even if you use only a part of that class name. If you prefer XPaths, I'd suggest this one: //span[@class='custom-paginator']//li[@class='next fw-pagination-btn']/a, or just //li[@class='next fw-pagination-btn']/a. You can also use a CSS selector: .next.fw-pagination-btn
2. I got rid of the logging code because it also has some issues; re-check it.
3. A 5-second explicit wait is too small. Make it at least 10 seconds, better 15. It's just a suggestion.
The smallest reproducible code which clicks the button and uses Firefox is:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as options
from bs4 import BeautifulSoup as bs
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
options = options()
# options.headless = True
driver = webdriver.Firefox(options=options)
print("Started Browser and Driver")
url = 'https://www.govinfo.gov/app/collection/uscourts/district/alsd/2021/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D'
driver.get(url)
page = driver.page_source
soup = bs(page, "html.parser")
print(soup)
next_page = WebDriverWait(driver, 15).until(
    EC.element_to_be_clickable((By.XPATH, "//span[@class='custom-paginator']//li[@class='next fw-pagination-btn']/a")))
next_page.click()
# driver.quit()
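To walk through all result pages instead of clicking once, you can repeat the click until the wait times out (a sketch; it assumes the next button stops being clickable on the last page):

from selenium.common.exceptions import TimeoutException

while True:
    try:
        next_page = WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, ".next.fw-pagination-btn")))
    except TimeoutException:
        print('no next page')
        break
    next_page.click()
    # ... scrape the freshly loaded page here ...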
It appears when I load this page that the div IDs are assigned dynamically. The first time I loaded the page the ID was collapseOne5168; the second time it was collapseOne1136.
You might consider using find_element_by_class_name("next fw-pagination-btn") instead?
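One caveat with that suggestion: By.CLASS_NAME expects a single class name, and Selenium translates it to a CSS selector, so a value containing a space will not match the button as intended. The CSS selector from the answer above is the safer equivalent:

# likely fails or matches nothing: the space makes it a compound selector
# driver.find_element(By.CLASS_NAME, "next fw-pagination-btn")

# matches an element carrying both classes
driver.find_element(By.CSS_SELECTOR, ".next.fw-pagination-btn")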
In my code I have tried to click on the View button which contains the hidden document; I need to download that document using Selenium WebDriver in Python. When I inspect, I get the stream-url = chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai/85967fa5-7853-412e-bbe5-c96406308ec6
I found this stream-url in the embed tag. I am not sure how to download that document.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import urllib.request
from bs4 import BeautifulSoup
import os
from selenium.webdriver.support.select import Select
import time
import pandas as pd
url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()
Registered_Project_radio = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "Promoter")))
driver.execute_script("arguments[0].click();",Registered_Project_radio)
Application = driver.find_element_by_id("CertiNo")
Application.send_keys("P50500000005")
Search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "btnSearch")))
driver.execute_script("arguments[0].click();",Search)
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a") if item.get_attribute('href') is not None]
View = View[0]
request = urllib.request.Request(View)
driver.get(View)
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html, 'html.parser')
divPInfo = soup.find("div", {"id": "DivDocument"})
title = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find("div", {'class': 'x_title'}).find("h2").text.strip()
print(title)
with open("uploads.csv" , "a") as csv_file:
csv_file.write(title + "\n")
csv_file.close()
table = pd.read_html(driver.page_source)[11]
print(table)
table.to_csv("uploads.csv" , sep=',',index = False)
btn = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[@class='btn btn-info btn-xs' and @id='btnShow_10']")))
driver.execute_script("arguments[0].click();",btn)
In Firefox the page uses <object data="..."> to display the PDF with the scan. There are buttons in the "Uploaded Documents" section to display the other scans.
This code uses these buttons to display scans, get data from <object> and save in files document-0.pdf, document-1.pdf, etc.
I use the same code you could see in my answer to your previous question:
Save the pdf using the selenium webdriver in python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import base64  # used to decode the PDF data URI below
url = 'https://maharerait.mahaonline.gov.in'
#chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
#driver = webdriver.Chrome(executable_path=chrome_path)
driver = webdriver.Firefox()
driver.get(url)
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()
registered_project_radio = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID,"Promoter")))
driver.execute_script("arguments[0].click();", registered_project_radio)
application = driver.find_element_by_id("CertiNo")
application.send_keys("P50500000005")
search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID,"btnSearch")))
driver.execute_script("arguments[0].click();", search)
time.sleep(5)
View = [item.get_attribute('href')
        for item in driver.find_elements_by_tag_name("a")
        if item.get_attribute('href') is not None]

# if there is a list then get the first element
if View:
    View = View[0]
#-----------------------------------------------------------------------------
# load page
driver.get(View)
# find buttons in section `Uploaded Documents`
buttons = driver.find_elements_by_xpath('//div[@id="DivDocument"]//button')
# work with all buttons
for i, button in enumerate(buttons):
    # click the button to display the scan
    button.click()
    # wait till the page displays the scan
    print('wait for object:', i)
    search = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, "object")))
    # get the base64 data URI from the <object>
    print('get data:', i)
    obj = driver.find_element_by_tag_name('object')
    data = obj.get_attribute('data')
    text = data.split(',')[1]
    pdf_bytes = base64.b64decode(text)
    # save the scan in the next PDF
    print('save: document-{}.pdf'.format(i))
    with open('document-{}.pdf'.format(i), 'wb') as fp:
        fp.write(pdf_bytes)
    # close the scan
    print('close document:', i)
    driver.find_element_by_xpath('//button[text()="Close"]').click()
# --- end ---
driver.close()
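The decode step above assumes the object's data attribute is a base64 data URI (data:application/pdf;base64,...). A slightly more defensive version of that step could check the header before decoding:

import base64

def data_uri_to_bytes(uri):
    # Decode a data URI like 'data:application/pdf;base64,JVBERi0...'
    header, _, payload = uri.partition(',')
    if 'base64' not in header:
        raise ValueError('not a base64 data URI: {!r}'.format(header))
    return base64.b64decode(payload)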
Using Selenium and Python, I am trying to get a URL and save it by doing this:
driver = webdriver.Firefox()
driver.get("https://google.com")
elem = driver.find_element(By.XPATH, "/html/body/div/div[3]/div[1]/div/div/div/div[1]/div[1]/a")
elem.click()
url = driver.current_url
print(url)
The URL that prints is google.com and not the newly clicked link, which is Gmail.
My question is: how can I get the second URL and save it?
You are getting the current url before the new page is loaded. Add an Explicit Wait to, for instance, wait for the page title to contain "Gmail":
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get("https://google.com")
# click "Gmail" link
elem = driver.find_element_by_link_text("Gmail")
elem.click()
# wait for the page to load
wait = WebDriverWait(driver, 10)
wait.until(EC.title_contains("Gmail"))
url = driver.current_url
print(url)
Also note how I've improved the way to locate the Gmail link.
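If the new page's title is not predictable, Selenium also provides expected conditions for the URL itself; for example, EC.url_changes (available in modern Selenium releases) waits until the current URL differs from a given one:

old_url = driver.current_url
elem.click()
WebDriverWait(driver, 10).until(EC.url_changes(old_url))  # wait for navigation
print(driver.current_url)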