I'm attempting to write a Python script that logs in to a website that runs JavaScript and scrape an element from the dashboard page. I'm using mechanize to login to the website and Requests-HTML to scape the data.
I can successfully login to the accounts page using mechanize. But I cannot pass the cookie data to Requests-HTML and continue the session to the dashboard page so I can scrape the data. I can't seem to format the data right to get the website (through Requests-HTML) to accept it.
I did get a version of this script running entirely with Selenium (the code is at the bottom), but I'd prefer to run a script that doesn't require a browser driver that opens a window.
from requests_html import HTMLSession
import mechanize
username = "me#example.com"
password = "12345678"
accts_url = "https://accounts.website.com"
dash_url = "https://dashboard.website.com"
browser = mechanize.Browser()
browser.open(accts_url)
browser.select_form(nr=0)
browser.form['email'] = username
browser.form['password'] = password
browser.submit()
response = browser.open(dash_url)
cookiejar_token = browser.cookiejar
print("mechanize, response:\n", response.read())
print("mechanize, browser.cookiejar:\n", cookiejar_token)
if str(cookiejar_token).startswith('<CookieJar['):
cookiejar_token_str_list = str(cookiejar_token).split(' ')
LBSERVERID_accts = cookiejar_token_str_list[1].lstrip('LBSERVERID=')
accounts_domain = cookiejar_token_str_list[3].rstrip('/>,')
session = cookiejar_token_str_list[5].lstrip('session=')
session_domain = cookiejar_token_str_list[7].rstrip('/>,')
LBSERVERID_dash = cookiejar_token_str_list[9].lstrip('LBSERVERID=')
dashboard_domain = cookiejar_token_str_list[11].rstrip('/>]>')
print("cookiejar_token_str_list:\n", cookiejar_token_str_list)
print("accounts 'LBSERVERID': %s for %s" % (LBSERVERID_accts, accounts_domain))
print("accounts 'session': %s for %s" % (session, session_domain))
print("dashboard 'LBSERVERID': %s for %s" % (LBSERVERID_dash, dashboard_domain))
else:
print("Incompatible token!\n")
# *****Requests_HTML does not communicate with mechanize!
session = HTMLSession()
print ("session.cookies:\n", session.cookies)
# I also made accounts_cookie_dict and session_cookie_dict
dash_cookie_dict = {
'name': 'LBSERVERID',
'value': LBSERVERID_dash,
'domain': dashboard_domain,
'path': '/'
}
# I attempt to manually create the correct cookie and assign it to dash_token, below
dash_token = browser.set_simple_cookie(dash_cookie_dict['name'], dash_cookie_dict['value'], dash_cookie_dict['domain'], dash_cookie_dict['path'])
print("dash_token:", dash_token)
print("cookiejar_token:", cookiejar_token)
print("dash_cookie_dict:\n", dash_cookie_dict)
# *****Attempting to pass the cookie to Requests-HTML below FAILS! :'(
response_obj = session.post(dash_url, cookies=dash_token)
print("response_obj:\n", response_obj)
print("response_obj.cookies from session.post:\n", response_obj.cookies)
response_obj.html.render(sleep=0.5)
print("requests_html, r.html.find('input'):\n", response_obj.html.find('input'))
Terminal Output:
mechanize, response:
b'<!doctype html><html lang="en"><head><script>!function(e***shortened by OP***</html>' ### Output in this field tells me the login by mechanize was successful
mechanize, browser.cookiejar:
<CookieJar[<Cookie LBSERVERID=3**************8 for accounts.example.com/>, <Cookie session=.e***shortened by OP***Y for accounts.example.com/>, <Cookie LBSERVERID=0**************a for dashboard.example.com/>]>
cookiejar_token_str_list:
['<CookieJar[<Cookie', 'LBSERVERID=3************8', 'for', 'accounts.example.com/>,', '<Cookie', 'session=.e***shortened by OP***Y', 'for', 'accounts.example.com/>,', '<Cookie', 'LBSERVERID=0**************a', 'for', 'dashboard.example.com/>]>']
accounts 'LBSERVERID': 3************8 for accounts.example.com
accounts 'session': .e***shortened by OP***Y for accounts.example.com
dashboard 'LBSERVERID': 0**************a for dashboard.example.com
session.cookies:
<RequestsCookieJar[]>
dash_token: None
cookiejar_token: <CookieJar[<Cookie LBSERVERID=3************8 for accounts.example.com/>, <Cookie session=.e***shortened by OP***Y for accounts.example.com/>, <Cookie LBSERVERID=0**************a for dashboard.example.com/>]>
dash_cookie_dict:
{'name': 'LBSERVERID', 'value': '0**************a', 'domain': 'dashboard.example.com', 'path': '/'}
response_obj:
<Response [403]> ### Access denied and it issues a new cookie below
response_obj.cookies from session.post:
<RequestsCookieJar[<Cookie LBSERVERID=a**************3 for dashboard.example.com/>]>
requests_html, r.html.find('input'): ### The output below tells me I'm back on the login page
[<Element 'input' class=('form-control',) id='email' name='email' required='' type='text' value=''>, <Element 'input' class=('form-control',) id='password' name='password' required='' type='password' value=''>, <Element 'input' id='csrf_token' name='csrf_token' type='hidden' value='I***shortened by OP***Y'>]
My Selenium code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import time
login_post_url = "https://accounts.example.com"
internal_url = "https://dashboard.example.com"
username = "user#email.com"
password = "12345678"
driver = webdriver.Safari(executable_path='/usr/bin/safaridriver') # initialize the Safari driver for Mac
driver.get(login_post_url) # head to login page
driver.find_element("id", "email").send_keys(username)
driver.find_element("id", "password").send_keys(password)
driver.find_element("id", "submit_form").click()
WebDriverWait(driver=driver, timeout=10).until( # wait the ready state to be complete
lambda x: x.execute_script("return document.readyState === 'complete'"))
error_message = "Incorrect username or password."
errors = driver.find_elements(By.CLASS_NAME, "flash-error") # get the errors (if there are)
# print the errors optionally
# for e in errors:
# print(e.text)
if any(error_message in e.text for e in errors): # if we find that error message within errors, then login is failed
print("[!] Login failed")
else:
print("[+] Login successful")
time.sleep(5)
driver.get(internal_url)
time.sleep(5)
element = driver.find_element(By.XPATH, '/html/........./div/p')
scraped_variable = element.get_attribute('innerHTML')
print("scraped_variable:", scraped_variable)
Related
I am working on a python web scraping project. The website I am trying to scrape data from contains info about all the medicines sold in India. The website requires a user to login before giving access to this information.
I want to access all the links in this url https://mims.com/india/browse/alphabet/a?cat=drug&tab=brand and store it in an array.
Here is my code for logging into the website
##################################### Method 1
import mechanize
import http.cookiejar as cookielib
from bs4 import BeautifulSoup
import html2text
br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Chrome')]
br.open('https://sso.mims.com/Account/SignIn')
# View available forms
for f in br.forms():
print(f)
br.select_form(nr=0)
# User credentials
br.form['EmailAddress'] = <USERNAME>
br.form['Password'] = <PASSWORD>
# Login
br.submit()
print(br.open('https://mims.com/india/browse/alphabet/a?cat=drug&tab=brand').read())
But the problem is that when the credentials are submitted, a middle page pops up with the following information.
You will be redirected to your destination shortly.
This page submits a hidden form and only then is the required end page shown. I want to access the end page. But br.open('https://mims.com/india/browse/alphabet/a?cat=drug&tab=brand').read() accesses the middle page and prints the results.
How do I wait for the middle page to submit the hidden form and then access the contents of the end page?
I've posted a selenium solution below, which works, but after understanding a bit more about the login process, it's possible to login using BeautifulSoup and requests only. Please read the comments on the code.
BeautifulSoup / requests solution
import requests
from bs4 import BeautifulSoup
d = {
"EmailAddress": "your#email.tld",
"Password": "password",
"RememberMe": True,
"SubscriberId": "",
"LicenseNumber": "",
"CountryCode": "SG"
}
req = requests.Session()
login_u = "https://sso.mims.com/"
html = req.post(login_u, data=d)
products_url = "https://mims.com/india/browse/alphabet/a?cat=drug"
html = req.get(products_url) # The cookies generated on the previous request will be use on this one automatically because we use Sessions
# Here's the tricky part. The site uses 2 intermediary "relogin" pages that (theoretically) are only available with JavaScript enabled, but we can bypass that, i.e.:
soup = BeautifulSoup(html.text, "html.parser")
form = soup.find('form', {"id": "openid_message"})
form_url = form['action'] # used on the next post request
inputs = form.find_all('input')
form_dict = {}
for input in inputs:
if input.get('name'):
form_dict[input.get('name')] = input.get('value')
form_dict['submit_button'] = "Continue"
relogin = req.post(form_url, data=form_dict)
soup = BeautifulSoup(relogin.text, "html.parser")
form = soup.find('form', {"id": "openid_message"})
form_url = form['action'] # used
inputs = form.find_all('input')
form_dict = {}
for input in inputs:
if input.get('name'):
form_dict[input.get('name')] = input.get('value')
products_a = req.post(form_url, data=form_dict)
print(products_a.text)
# You can now request any url normally because the necessary cookies are already present on the current Session()
products_url = "https://mims.com/india/browse/alphabet/c?cat=drug"
products_c = req.get(products_url)
print(products_c.text)
Selenium solution
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from time import sleep
driver = webdriver.Firefox()
wait = WebDriverWait(driver, 10)
driver.maximize_window()
driver.get("https://sso.mims.com/")
el = wait.until(EC.element_to_be_clickable((By.ID, "EmailAddress")))
el.send_keys("your#email.com")
el = wait.until(EC.element_to_be_clickable((By.ID, "Password")))
el.send_keys("password")
el = wait.until(EC.element_to_be_clickable((By.ID, "btnSubmit")))
el.click()
wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "profile-section-header"))) # we logged in successfully
driver.get("http://mims.com/india/browse/alphabet/a?cat=drug")
wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "searchicon")))
print(driver.page_source)
# do what you need with the source code
I've to login into a site (for exemple I will use facebook.com). I can manage the login process using selenium, but I need to do it with a POST. I've tried to use requests but I'm not able to pass the info needed to the selenium webdriver in order to enter in the site as logged user. I've found on-line that exists a library that integrates selenium and requests https://pypi.org/project/selenium-requests/ , but the problem is that there is no documentation and I'm blocked in the same story.
With selenium-requests
webdriver = Chrome()
url = "https://www.facebook.com"
webdriver.get(url)
params = {
'email': 'my_email',
'pass': 'my_password'
}
resp = webdriver.request('POST','https://www.facebook.com/login/device-based/regular/login/?login_attempt=1&lwv=110', params)
webdriver.get(url)
# I hoped that the new page open was the one with me logged in but it did not works
With Selenium and requests passing the cookies
driver = webdriver.Chrome()
webdriver = Chrome()
url = "https://www.facebook.com"
driver.get(url)
#storing the cookies generated by the browser
request_cookies_browser = driver.get_cookies()
#making a persistent connection using the requests library
params = {
'email': 'my_email',
'pass': 'my_password'
}
s = requests.Session()
#passing the cookies generated from the browser to the session
c = [s.cookies.set(c['name'], c['value']) for c in request_cookies_browser]
resp = s.post('https://www.facebook.com/login/device-based/regular/login/?login_attempt=1&lwv=110', params) #I get a 200 status_code
#passing the cookie of the response to the browser
dict_resp_cookies = resp.cookies.get_dict()
response_cookies_browser = [{'name':name, 'value':value} for name, value in dict_resp_cookies.items()]
c = [driver.add_cookie(c) for c in response_cookies_browser]
driver.get(url)
In both the cases if in the end I print the cookies seems that something as changed from the beginning, but the page remains the one with the login form.
This is the codes I've tried, I put both the attempts but it is sufficient to find the solution to one of these two.
Someone can help me and know what I've to do or to change to open the page with me logged in?
Thank you in advance!
I have the same problem.
In your code, you just pass the params as is.
In this example the code would be data=params in :
resp = webdriver.request('POST','https://www.facebook.com/login/device-based/regular/login/?login_attempt=1&lwv=110', params)
I'm new in python. Want to login on my account
My code is:
driver = webdriver.Firefox()
session = requests.Session()
url = "http://tterminal.info/"
response = session.post(url)
# <div class="auth">
authForm = driver.find_element_by_class_name("auth")
# <div class="login">
loginForm = authForm.find_element_by_class_name("login")
# Enter login
login = loginForm.find_element_by_name("login")
login.clear()
login.send_keys("mylogin")
# Enter pass
pswd = loginForm.find_element_by_name("pass")
pswd.send_keys("mypassword")
# Click login
loginForm.find_element_by_class_name("submit").click()
But it givesme Unable to find element .auth
Where is my wrong?
While working with Selenium you don't have to use requests.Session() module and your code block can be optimized as follows :
from selenium import webdriver
driver = webdriver.Firefox(executable_path=r'C:\path\to\geckodriver.exe')
url = "http://tterminal.info"
driver.get(url)
login = driver.find_element_by_css_selector("form[name='login'] input.text[name='login']")
login.clear()
login.send_keys("mylogin")
pswd = driver.find_element_by_css_selector("form[name='login'] input.text[name='pass']")
pswd.send_keys("mypassword")
driver.find_element_by_css_selector("form[name='login'] input.submit").click()
I have been trying to login to my instagram account with mechanize python for the past while and for some reason it is not working.
To check if I have logged in correctly, I decided to check the url with "br.geturl()", which should read "https://www.instagram.com/" once the login is successful, but after I run the program it is just:
"https://www.instagram.com/accounts/login/username=username_here&password=password_here"
Anyone know how to fix this?
Note: I know forsure my login info is correct.
Here is my code:
import mechanize
br = mechanize.Browser()
url = "https://www.instagram.com/accounts/login/"
br.set_handle_robots(False)
response = br.open(url)
f = list(br.forms())
br.form = f[0]
print br.form
br.form["username"] = 'username_goes_here'
br.form["password"] = 'password_goes_here'
br.submit()
print br.geturl()
Since the form is generated via JavaScript. So it could not be found on the html. One of the way would be using Selenium Webdriver.
from selenium import webdriver
driver = webdriver.Chrome('/Usr/chromedriver')
driver.get("https://www.instagram.com/accounts/login/?source=auth_switcher")
username = driver.find_element_by_xpath('//*[#name="username"]')
password = driver.find_element_by_xpath('//*[#name="password"]')
login_btn = driver.find_element_by_xpath('//*[#class="oF4XW sqdOP L3NKy "]')
username.send_keys("username")
password.send_keys("password")
#test to see if input values are reflecting
print(username.get_attribute('value'))
print(password.get_attribute('value'))
#login
login_btn.click()
logged_in_class = driver.find_elements_by_class_name("logged-in")
not_logged_in_class = driver.find_elements_by_class_name("not-logged-in")
#to check if logged-in or not-logged-in
print(len(logged_in_class))
print(len(not_logged_in_class))
driver.quit()
I have a Python programme that I have stored my log in details on, How would I get the python programme to connect to the Facebook login page and input my log in details for me and then log in or return something?
E.g:
my details are in my main programme(Email and password), then I want to connect to Facebook, and have my program enter the details and send that off to Facebook.
Main Python File:
import urllib
import urllib2
def facebookDetails():
url = 'https://www.facebook.com/'
values = {'email' : 'somebody#facebook.com',
'pass' : 'password',
}
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req)
the_page = response.read()
Here's an example using selenium:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
if __name__ == '__main__':
iuser = 'username'
ipass = 'password'
# You will also need to download PhantomJS
driver = webdriver.PhantomJS(pathToPhantomJS.exe)
driver.get('https://www.facebook.com/')
email = driver.find_element_by_xpath('//*[#id="email"]')
email.send_keys(iuser)
password = driver.find_element_by_xpath('//*[#id="pass"]')
password.send_keys(ipass)
login_button = driver.find_element_by_xpath('//*[#id="u_0_n"]')
login_button.click()