How to copy an entire website with Selenium in Python?

I'm using webdriver because I need to copy the site after authentication.
from selenium import webdriver
import myconnutils
import re
from time import sleep
connection = myconnutils.getConnection()
# use Chrome
driver = webdriver.Chrome("/Users/User/Documents/sender/chromedriver")
# log in to the site
driver.get("https://example.com/en/account")
driver.find_element_by_id("user").send_keys("userlogin")
driver.find_element_by_id("password").send_keys("passwordinput")
driver.find_element_by_id("submit").click()
What's next? How can I copy the whole page, including the CSS, JS, and images?

Try using Selenium together with BeautifulSoup. Once you are logged in, you can parse the rendered source like this:
example_soup = BeautifulSoup(driver.page_source, 'html.parser')
This blog post may also help.
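To capture the CSS, JS, and images as well, one approach is to walk the parsed source and download every referenced asset, reusing the session cookies Selenium holds. Below is a rough, untested sketch of that idea; the site_copy output directory, the set of tags handled, and the cookie transfer are my own assumptions, not part of the original answer.
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Assumes `driver` is the already-authenticated webdriver from the question.
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Reuse Selenium's session cookies so assets behind the login also download.
session = requests.Session()
for cookie in driver.get_cookies():
    session.cookies.set(cookie['name'], cookie['value'])

os.makedirs('site_copy', exist_ok=True)

# Download every stylesheet, script, and image the page references.
for tag, attr in (('link', 'href'), ('script', 'src'), ('img', 'src')):
    for element in soup.find_all(tag):
        url = element.get(attr)
        if not url:
            continue
        absolute = urljoin(driver.current_url, url)
        filename = os.path.basename(urlparse(absolute).path) or 'index'
        with open(os.path.join('site_copy', filename), 'wb') as f:
            f.write(session.get(absolute).content)

# Save the rendered HTML itself.
with open(os.path.join('site_copy', 'page.html'), 'w', encoding='utf-8') as f:
    f.write(driver.page_source)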

Related

How do I go about scraping some data from chrome browser?

The webpage I am trying to scrape can only be seen after login, so using a direct URL won't work. I need to scrape the data while I am logged in through my Chrome browser.
Then I need to get the value of the element.
I have tried the following code.
import requests
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())
lastdatadate=[]
lastprocesseddate=[]
source = requests.get('webpage.com').text
content = driver.page_source
soup = bs(content, 'lxml')
#print(soup.prettify())
price = soup.find('span', attrs={'id':'calculatedMinRate'})
print(price.text)
You could still perform the login on the opened webdriver by filling in the input fields, as explained here: How to locate and insert a value in a text box (input) using Python Selenium?
Steps:
Fill in the input fields.
Find the submit button and trigger a click event.
Add a sleep command afterwards; a few seconds should be enough.
You should then be able to get the data, as in the sketch below.
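Here is a minimal, untested sketch of those steps; the login URL and the username/password/submit element IDs are placeholders you would need to adapt, while the calculatedMinRate span is taken from the question:
from time import sleep

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(ChromeDriverManager().install())

# Open the login page (placeholder URL).
driver.get('https://example.com/login')

# Fill in the input fields (placeholder element IDs).
driver.find_element_by_id('username').send_keys('your_username')
driver.find_element_by_id('password').send_keys('your_password')

# Find the submit button and trigger a click event.
driver.find_element_by_id('submit').click()

# A few seconds should be enough for the page to load.
sleep(5)

# Afterwards the rendered source contains the logged-in page.
soup = bs(driver.page_source, 'lxml')
price = soup.find('span', attrs={'id': 'calculatedMinRate'})
print(price.text)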

Using Selenium in Python, how do I select HTML page element content with XPath?

In the XPath Helper plugin, I was able to get the HTML tag content:
QUERY: //div[@id="cardModel"]/div[@class="modal-dialog"]/div[@class="modal-content"]//tr[1]/td[1]//tr/td[2]/div/span/text()
RESULTS (1): Enrico
The result is:
Enrico
But in Python:
import time

from lxml import etree
from selenium import webdriver

driver = webdriver.Chrome()
detailUrl = 'https://www.enf.com.cn/3d-energy-1?directory=panel&utm_source=ENF&utm_medium=perc&utm_content=22196&utm_campaign=profiles_panel'
driver.get(detailUrl)
time.sleep(5)  # wait before grabbing the page source
html_ele_detail = etree.HTML(driver.page_source)
companyPhone = html_ele_detail.xpath('//div[@id="cardModel"]/div[@class="modal-dialog"]/div[@class="modal-content"]//tr[1]/td[1]//tr/td[2]/div/span/text()')
print("companyPhone = ", companyPhone)
companyPhone comes back empty. What's wrong? Thanks in advance for any help with this problem.
As you are already using the Selenium library, you do not need the etree library; Selenium alone is enough for this application.
See the example below and adapt it for your purpose:
from selenium import webdriver
driver = webdriver.Chrome()
detailUrl = 'your url here'
driver.get(detailUrl)
web_element_text = driver.find_element_by_xpath('your xpath directory here').text
print(web_element_text)
See some other examples in this related topic.
Let me know if this was helpful.
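If the text still comes back empty, the modal content may be rendered by JavaScript only after the page loads, so an explicit wait can help. Here is a small sketch using WebDriverWait; the URL and XPath are the same placeholders as above:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
driver.get('your url here')

# Wait up to 10 seconds for the element to appear in the DOM.
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, 'your xpath directory here'))
)
print(element.text)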

Selenium browser is getting an "enable cookies" page, not the page I am sending it to

I am trying to scrape a JS website with Selenium. When BeautifulSoup reads what Selenium retrieved, I get an HTML page that says: "Cookies must be enabled in order to view this page."
If anyone could help me past this stumbling block, I would appreciate it. Here is my code:
# import libraries and specify URL
import lxml as lxml
import pandas as pd
from bs4 import BeautifulSoup
import html5lib
from selenium import webdriver
import urllib.request
import csv
url = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2020/06/09"
#new chrome session
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--incognito")
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(executable_path='/Users/susanwhite/PycharmProjects/Horse Racing/chromedriver', chrome_options=chrome_options)
# Wait for the page to fully load
driver.implicitly_wait(time_to_wait=10)
# Load the web page
driver.get(url)
cookies = driver.get_cookies()
# Parse HTML code and grab tables with Beautiful Soup
soup = BeautifulSoup(driver.page_source, 'html5lib')
print(soup)
Try removing this line: chrome_options.add_argument("--incognito"). There's no need for it, as Selenium doesn't save cookies or any other website data between sessions anyway.
Removing the line below solved it for me, but headless mode will then be disabled and the browser window will be visible.
chrome_options.add_argument("--headless")
Your issue might also be with the specific website you're accessing. I had the same problem, and after poking around, it looks like something in the way the HKJC website loads makes Selenium think the page has finished loading prematurely. I was able to get good page_source objects by putting a time.sleep(30) after the get statement, so my code looks like:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import time
options = Options()
options.headless = True
driver = webdriver.Firefox(options=options, executable_path=r'C:\Python\webdrivers\geckodriver.exe')
driver.get("https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2023/01/01&RaceNo=1")
time.sleep(30)
html = driver.page_source
with open('Date_2023-01-01_Race1.html', 'wb') as f:
    f.write(html.encode('utf-8'))
You might not have to sleep that long; manually loading these pages takes 20+ seconds for me because I have slow internet over a VPN. It also works headless for me, as above.
You do have to make sure the Firefox geckodriver is the latest (at least according to other posts; I only tried this over ~2 days, not long enough for my installed Firefox and geckodriver to get out of sync).

Selenium get() redirects to another url

I'm trying to navigate to the following page and extract the HTML: https://www.automobile.it/annunci?b=data&d=DESC. But every time I call the get() method, the website redirects me to another page, always the same one: https://www.automobile.it/torrenova?radius=100&b=data&d=DESC.
Here's the simple code I'm running:
from selenium import webdriver
driver = webdriver.Chrome(executable_path=ex_path)
driver.get("https://www.automobile.it/annunci?b=data&d=DESC")
html=driver.page_source
If I do the same thing using the requests module, I don't get redirected:
import requests
html=requests.get("https://www.automobile.it/annunci?b=data&d=DESC")
I don't understand why it's behaving like this. Any ideas?
Use driver.delete_all_cookies()
from selenium import webdriver
driver = webdriver.Chrome(executable_path=ex_path)
driver.delete_all_cookies()
driver.get("https://www.automobile.it/annunci?b=data&d=DESC")
html=driver.page_source
PS: also be warned that page_source will not give you the complete DOM as rendered.
Alternatively, you can clear the browser cache with the code below. I am assuming that you are using Chrome:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
driver = webdriver.Chrome(executable_path=ex_path)
driver.get('chrome://settings/clearBrowserData')
driver.find_element_by_xpath('//settings-ui').send_keys(Keys.ENTER)
driver.get("https://www.automobile.it/annunci?b=data&d=DESC")

Can someone help me get live data from a website named Suncalc.org?

I used beautifulsoup to get data from this website.
My code:
import bs4
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.suncalc.org/#/12.98,80.188,10/2020.02.21/15:51/1/3')
soup = BeautifulSoup(r.content,'html.parser')
week = soup.find(id='clickSunrise')
print(week)
Result:
<span class="sunriseX Bold sunrise-time" id="clickSunrise" style="white-space:nowrap;">...</span>
Those three dots are actually numbers, and I need those numbers.
Hello, I tested your code, and it seems the website doesn't load the data until a browser renders the page. Since you are using the requests module, there is no browser.
You need a browser-automation tool like the selenium module to get that data.
It will open a browser for you, and you can program it to navigate to the website, wait until everything is loaded, and fetch the information for you.
Steps:
1. Install selenium.
2. Download chromedriver and put it somewhere (maybe in your project): https://chromedriver.chromium.org/downloads
3. Learn selenium (it is an amazing tool for automating the web). Below is an untested example just so you can get an idea; it might work for you right away, but it might not:
import time
from selenium import webdriver

driver = webdriver.Chrome('/path/to/chromedriver')  # Change this to your chromedriver path.
driver.get('https://www.suncalc.org/#/12.98,80.188,10/2020.02.21/15:51/1/3')
time.sleep(5)  # Give the page's JavaScript time to load the live values.
clickSunrise = driver.find_element_by_id('clickSunrise')
print(clickSunrise.text)
I hope this helps!
import time

from selenium import webdriver

driver = webdriver.Chrome(executable_path=r"D:\download\chromedriver_win32\chromedriver.exe")
driver.get("https://suncalc.org/#/12.05,80.04,17/null/null/324.0/2")
time.sleep(5)  # let the page's JavaScript render the values
altitude = driver.find_element_by_id("sunhoehe")
print(altitude.text)
