For example, on this site I need to script clicking the "close" button on a frame that appears. I have already tried XPath and CSS selectors, still with no luck, because there is no "a" tag to target. I need to do this with a headless browser, like HtmlUnit.
from selenium import webdriver
from lxml import etree, html
url = "http://2gis.ru/moscow/search/%D1%81%D0%BF%D0%BE%D1%80%D1%82%D0%B8%D0%B2%D0%BD%D1%8B%D0%B5%20%D1%81%D0%B5%D0%BA%D1%86%D0%B8%D0%B8/center/37.437286%2C55.753395/tab/firms/zoom/11"
driver = webdriver.Firefox()
#driver = webdriver.Remote(desired_capabilities=webdriver.DesiredCapabilities.HTMLUNIT)
driver.get(url)
content = (driver.page_source).encode('utf-8')
doc = html.fromstring(content)
elems = doc.xpath('//article[@data-module="miniCard"]')
elem = elems[0]
# get element id to click on
el1_id = elem.attrib['id']
#simulate click to open frame
el1_to_click = driver.find_element_by_xpath('//article[@id="{0}"]//\
a[contains(@class, "miniCard__headerTitle")]'.format(el1_id))
el1_to_click.click()
#some stuff
pass
#now need to close this
close = driver.find_element_by_xpath('//html/body/div[1]/div/div[1]/div[2]/div[2]/div[2]/div/div[2]/div/div[2]/div[3]/div[1]/div[2]/svg/use')
close.click()
But the last part isn't working (it can't close the frame). How can I do this?
Try this; it should work.
from selenium import webdriver
from lxml import etree, html
url = "http://2gis.ru/moscow/search/%D1%81%D0%BF%D0%BE%D1%80%D1%82%D0%B8%D0%B2%D0%BD%D1%8B%D0%B5%20%D1%81%D0%B5%D0%BA%D1%86%D0%B8%D0%B8/center/37.437286%2C55.753395/tab/firms/zoom/11"
driver = webdriver.Firefox()
driver.implicitly_wait(10)
# driver = webdriver.Remote(desired_capabilities=webdriver.DesiredCapabilities.HTMLUNIT)
driver.get(url)
content = (driver.page_source).encode('utf-8')
doc = html.fromstring(content)
elems = doc.xpath('//article[@data-module="miniCard"]')
elem = elems[0]
# get element id to click on
el1_id = elem.attrib['id']
# simulate click to open frame
el1_to_click = driver.find_element_by_xpath('//article[@id="{0}"]//\
a[contains(@class, "miniCard__headerTitle")]'.format(el1_id))
el1_to_click.click()
# some stuff
pass
# now need to close this
close = driver.find_element_by_xpath(
    '//div[@class="frame _num_2 _pos_last _moverDir_left _active _show _state_visible _ready _cont_firmCard _mover"]/div/div/div[@class="frame__controlsButton _close"]')
close.click()
# alternative, less brittle selector for the same close button
close = driver.find_element_by_xpath(
    '//div[@data-module="frame"]'
    '/div[@class="frame__content"]/div[@class="frame__controls"]'
    '/div[contains(@class, "frame__controlsButton") and contains(@class, "_close")]')
That is the answer for any driver.set_window_size(). But you still need to find a headless option.
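For the headless part: modern Firefox can run headless on its own, so HtmlUnit may not be needed. A minimal sketch, assuming a reasonably recent Firefox/geckodriver pair:

from selenium import webdriver

# run Firefox without a visible window ("-headless" is Firefox's own flag)
options = webdriver.FirefoxOptions()
options.add_argument("-headless")
driver = webdriver.Firefox(options=options)
# give the page a realistic viewport, since layout-dependent selectors
# (like the frame markup above) can change with window size
driver.set_window_size(1920, 1080)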
Related
I want to extract text from multiple pages. Currently I am able to extract data from the first page, but I want to follow the pagination, go to the remaining pages (whose number is dynamic), and append their data as well. I have written this simple code, which extracts data from the first page; I am not able to extract the data from the other pages.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

element_list = []
opts = webdriver.ChromeOptions()
opts.headless = True
# pass the options in, otherwise the headless flag set above is never used
driver = webdriver.Chrome(ChromeDriverManager().install(), options=opts)
base_url = "XYZ"
driver.maximize_window()
driver.get(base_url)
driver.set_page_load_timeout(50)
element = WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.ID, 'all-my-groups')))
l = driver.find_elements_by_xpath("//div[contains(@class, 'alias-wrapper sim-ellipsis sim-list--shortId')]")
for i in l:
    print(i.text)
I have shared images of the pagination class in case that helps. If we could automate extraction from all the pages, that would be awesome. Also, I am new, so please pardon me for asking silly questions. Thanks in advance.
You have only provided the code for the previous-page button. I guess you need to keep going to the next page while a next page exists. As I don't know what site we are talking about, I can only guess its behavior, so I'm assuming the 'Next page' button disappears when no next page exists. If so, it can be done like this:
element_list = []
opts = webdriver.ChromeOptions()
opts.headless = True
driver = webdriver.Chrome(ChromeDriverManager().install(), options=opts)
base_url = "XYZ"
driver.maximize_window()
driver.get(base_url)
driver.set_page_load_timeout(50)
element = WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.ID, 'all-my-groups')))
l = driver.find_elements_by_xpath("//div[contains(@class, 'alias-wrapper sim-ellipsis sim-list--shortId')]")
while True:
    try:
        next_page = driver.find_element(By.XPATH, '//button[@label="Next page"]')
    except NoSuchElementException:
        break
    next_page.click()
    l.extend(driver.find_elements(By.XPATH, "//div[contains(@class, 'alias-wrapper sim-ellipsis sim-list--shortId')]"))
for i in l:
    print(i.text)
To be able to catch the exception this import has to be added:
from selenium.common.exceptions import NoSuchElementException
Also note that the method find_elements_by_xpath is deprecated, so it would be better to replace this line:
l = driver.find_elements_by_xpath("//div[contains(@class, 'alias-wrapper sim-ellipsis sim-list--shortId')]")
with this one:
l = driver.find_elements(By.XPATH, "//div[contains(@class, 'alias-wrapper sim-ellipsis sim-list--shortId')]")
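One more caveat: WebElement references collected on earlier pages can go stale once the DOM changes, so reading i.text only after the loop may raise StaleElementReferenceException. A variant that stores the text immediately is safer; a sketch with the same selectors and the imports already shown:

texts = [e.text for e in driver.find_elements(By.XPATH, "//div[contains(@class, 'alias-wrapper sim-ellipsis sim-list--shortId')]")]
while True:
    try:
        next_page = driver.find_element(By.XPATH, '//button[@label="Next page"]')
    except NoSuchElementException:
        break
    next_page.click()
    # grab the text right away, before the next click invalidates the nodes
    texts.extend(e.text for e in driver.find_elements(By.XPATH, "//div[contains(@class, 'alias-wrapper sim-ellipsis sim-list--shortId')]"))
for t in texts:
    print(t)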
I want to retrieve data from the link below. The first page can be retrieved, but I have a problem building the loop that goes through the next pages to the end. Could you help me and complete my code?
My link is:
https://www150.statcan.gc.ca/n1/pub/71-607-x/2021004/exp-eng.htm?r1=(1)&r2=0&r3=0&r4=12&r5=0&r7=0&r8=2022-02-01&r9=2022-02-01
from selenium import webdriver
import time
url = "https://www150.statcan.gc.ca/n1/pub/71-607-x/2021004/exp-eng.htm?r1=(1)&r2=0&r3=0&r4=12&r5=0&r7=0&r8=2022-02-01&r9=2022-02-01"
driver = webdriver.Chrome(r"C:\Program Files\Python310\chromedriver.exe")  # raw string keeps the backslashes literal
driver.get(url)
table = driver.find_element_by_id('report_table')
body = table.find_element_by_tag_name('tbody')
cells = body.find_elements_by_tag_name('td')
for cell in cells:
    print(cell.text)
It brings in the first page's data, but I don't know how to retrieve the other pages.
Look for the next-page selector and iterate over it: if it is there, click it after your extraction part. You want to do that in a while loop, for example, which you break out of when the selector can't be found.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

url = "https://www150.statcan.gc.ca/n1/pub/71-607-x/2021004/exp-eng.htm?r1=(1)&r2=0&r3=0&r4=12&r5=0&r7=0&r8=2022-02-01&r9=2022-02-01"
driver = webdriver.Chrome(r"C:\Program Files\Python310\chromedriver.exe")
driver.get(url)

table = driver.find_element_by_id('report_table')
body = table.find_element_by_tag_name('tbody')
cells = body.find_elements_by_tag_name('td')
for cell in cells:
    print(cell.text)

while True:
    try:
        # the pager entry is a link inside the page, so click it instead of driver.get()
        next_page = driver.find_element(By.XPATH, '//a[@id="report_results_next"]')
    except NoSuchElementException:
        # stop when there is no next-page link anymore
        break
    next_page.click()
    # steps to extract the next page
    table = driver.find_element_by_id('report_table')
    body = table.find_element_by_tag_name('tbody')
    cells = body.find_elements_by_tag_name('td')
    for cell in cells:
        print(cell.text)
This is not tested.
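If the table is refreshed by JavaScript rather than a full page load, the find right after next_page.click() can race the refresh. An explicit wait in between should help; a sketch assuming the table keeps its report_table id on every page:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# after next_page.click(), wait until the table is back in the DOM
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'report_table')))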
This is the form link: https://docs.google.com/forms/d/e/1FAIpQLSe61r6TNx4JvRg2gVu3Eu8-KYKCvd1dJCAmYJFnNw4EU9llMw/viewform and this is my code. I want to select DHAOUI MOHAMED AZIZ. What am I missing?
import time
from selenium import webdriver
def sleep():
    time.sleep(5)
Chrome = webdriver.Chrome(executable_path='C:/Users/dhaou/Desktop/chromedriver.exe')
url = "https://docs.google.com/forms/d/e/1FAIpQLSe61r6TNx4JvRg2gVu3Eu8-KYKCvd1dJCAmYJFnNw4EU9llMw/viewform"
Chrome.get(url)
first_div = Chrome.find_element_by_xpath('//*[@id="mG61Hd"]/div[2]/div/div[2]/div[1]/div/div/div[2]/div')
first_div.click()
sleep()
second = Chrome.find_element_by_xpath('//*[@id="mG61Hd"]/div[2]/div/div[2]/div[1]/div/div/div[2]/div/div[1]/div[1]/div[16]')
second.click()
This seems to work for me for the url you had provided.
from selenium import webdriver

PATH = "./chromedriver"
driver = webdriver.Chrome(PATH)
driver.implicitly_wait(5)
url = "https://docs.google.com/forms/d/e/1FAIpQLSe61r6TNx4JvRg2gVu3Eu8-KYKCvd1dJCAmYJFnNw4EU9llMw/viewform"
driver.get(url)
element = ".quantumWizMenuPaperselectOption.appsMaterialWizMenuPaperselectOption.freebirdThemedSelectOptionDarkerDisabled.exportOption.isSelected.isPlaceholder"
dropdown = driver.find_element_by_css_selector(element)
dropdown.click()
name = "ALI GHANMI"
list_element = "//div[@class='exportSelectPopup quantumWizMenuPaperselectPopup appsMaterialWizMenuPaperselectPopup']//span[text()='"+name+"']"
dropdown_element = driver.find_element_by_xpath(list_element)
dropdown_element.click()
By the way, the form at the URL you provided does not have DHAOUI MOHAMED AZIZ, so I tried a different name.
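Since Google generates those long class names and can change them between visits, a variant keyed off role attributes may be more stable. A sketch, where the role="listbox"/role="option" markup is an assumption about the form's HTML rather than something verified here:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)
# open the dropdown via its ARIA role instead of the generated class names
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div[role='listbox']"))).click()
name = "ALI GHANMI"  # any name that actually appears in the list
option_xpath = "//div[@role='option']//span[text()='" + name + "']"
wait.until(EC.element_to_be_clickable((By.XPATH, option_xpath))).click()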
I need to search for a flight with Python Selenium, but I wasn't able to select my desired date.
import time
import selenium
from selenium import webdriver
browser = webdriver.Chrome()
browser.get("https://www.spicejet.com/")
departureButton = browser.find_element_by_id("ctl00_mainContent_ddl_originStation1_CTXT")
departureButton.click()
browser.find_element_by_partial_link_text("Kolkata").click()
arivalButton = browser.find_element_by_id("ctl00_mainContent_ddl_destinationStation1_CTXT")
arivalButton.click()
browser.find_element_by_partial_link_text("Goa").click()
date_position = browser.find_element_by_id("ctl00_mainContent_view_date1")
date_position.click()
search_date = "10-September 2019"
dep_date = search_date.split("-")
dep_month = dep_date[1]
dep_day = dep_date[0]
cal_head = browser.find_elements_by_class_name("ui-datepicker-title")
for month_hd in cal_head:
    month_year = month_hd.text
    if dep_month == month_year:
        pass
    else:
        nxt = browser.find_element_by_class_name("ui-icon-circle-triangle-e").click()
        print(month_year)
time.sleep(2)
browser.close()
The problem with your code is that when you click on the next button, the DOM changes and the element references saved in your variables are not updated. That is why it gives you a stale element reference exception. Instead of using variables, use the locator every time you access the calendar elements and it will work.
Try this:
import time
import selenium
from selenium import webdriver
browser = webdriver.Chrome()
browser.get("https://www.spicejet.com/")
departureButton = browser.find_element_by_id("ctl00_mainContent_ddl_originStation1_CTXT")
departureButton.click()
browser.find_element_by_partial_link_text("Kolkata").click()
arivalButton = browser.find_element_by_id("ctl00_mainContent_ddl_destinationStation1_CTXT")
arivalButton.click()
browser.find_element_by_partial_link_text("Goa").click()
date_position = browser.find_element_by_id("ctl00_mainContent_view_date1")
date_position.click()
search_date = "10-September 2019"
dep_date = search_date.split("-")
dep_month = dep_date[1]
dep_day = dep_date[0]
while browser.find_element_by_class_name("ui-datepicker-title").text != dep_month:
    browser.find_element_by_css_selector("a[data-handler='next']").click()
browser.find_element_by_xpath("//table//a[text()='"+dep_day+"']").click()
time.sleep(2)
browser.close()
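One caveat: if dep_month never matches the calendar title (a typo in search_date, or a month past what the widget offers), the while loop above will click "next" forever. A bounded loop is a cheap safeguard; a sketch, where the 24-click cap is an arbitrary assumption:

for _ in range(24):  # at most two years of "next" clicks
    if browser.find_element_by_class_name("ui-datepicker-title").text == dep_month:
        break
    browser.find_element_by_css_selector("a[data-handler='next']").click()
else:
    # for/else: runs only if the loop never hit break
    raise RuntimeError("month not found in the date picker: " + dep_month)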
I am trying to make a scraping application to scrape Hants.gov.uk, and right now I am working on just clicking through the pages instead of scraping. When it gets to the last row on page 1 it just stops, so what I did was make it click the "Next Page" button, but first it has to go back to the original URL. It clicks page 2, but after page 2 is scraped it doesn't go to page 3; it just restarts page 2.
Can somebody help me fix this issue?
Code:
import time
import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome(executable_path=r"C:\Users\Goten\Desktop\chromedriver.exe")
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result = []
    for link in links:
        if link not in result:
            result.append(link)
        else:
            driver.get(link)
            goUrl = urllib.request.urlopen(link)
            soup = BeautifulSoup(goUrl.read(), "html.parser")
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            for i in range(20):
                pass # Don't worry about all this commented code, it isn't relevant right now
                #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
                #print(table.text)
                # div = soup.select("div.applicationDetails")
                # getDiv = div[i].split(":")[1].get_text()
                # log = open("log.txt", "a")
                # log.write(getDiv + "\n")
            #log.write("\n")
start()
driver.get(url)
for i in range(5):
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    url = driver.current_url
    start()
    driver.get(url)
driver.close()
Try this:
import time
# import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome()
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
result = []

def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result.extend(links)

def start2():
    for link in result:
        # if link not in result:
        #     result.append(link)
        # else:
        driver.get(link)
        goUrl = urllib.request.urlopen(link)
        soup = BeautifulSoup(goUrl.read(), "html.parser")
        #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
        for i in range(20):
            pass # Don't worry about all this commented code, it isn't relevant right now
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            #print(table.text)
            # div = soup.select("div.applicationDetails")
            # getDiv = div[i].split(":")[1].get_text()
            # log = open("log.txt", "a")
            # log.write(getDiv + "\n")
        #log.write("\n")

while True:
    start()
    element = driver.find_element_by_class_name('rdpPageNext')
    try:
        check = element.get_attribute('onclick')
        # the "next" button is disabled ("return false;") on the last page
        if check != "return false;":
            element.click()
        else:
            break
    except:
        break

print(result)
start2()
driver.get(url)
As per the URL https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True, to click through all the pages you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get('https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "mainContentPlaceHolder_btnAccept"))).click()
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div#ctl00_mainContentPlaceHolder_lvResults_topPager div.rdpWrap.rdpNumPart>a"))))
print(numLinks)
for i in range(numLinks):
    print("Perform your scraping here on page {}".format(str(i+1)))
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='ctl00_mainContentPlaceHolder_lvResults_topPager']//div[@class='rdpWrap rdpNumPart']//a[@class='rdpCurrentPage']/span//following::span[1]"))).click()
driver.quit()
Console Output:
8
Perform your scraping here on page 1
Perform your scraping here on page 2
Perform your scraping here on page 3
Perform your scraping here on page 4
Perform your scraping here on page 5
Perform your scraping here on page 6
Perform your scraping here on page 7
Perform your scraping here on page 8
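A side note on the code block above: newer Selenium 3.x releases deprecate the chrome_options keyword in favor of options, so the driver construction would become (same local chromedriver path assumed):

driver = webdriver.Chrome(options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')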
Hi @Feitan Portor, you have written the code absolutely perfectly. The only reason you are redirected back to the first page is that you have url = driver.current_url in the last for loop, where the URL stays static and only the JavaScript click event instigates the next page. So just remove url = driver.current_url and driver.get(url) and you are good to go; I have tested it myself.
Also, to get the current page your scraper is on, just add this part in the for loop so you will know where your scraper is:
ss = driver.find_element_by_class_name('rdpCurrentPage').text
print(ss)
Hope this solves your confusion