Python beautifulsoup4 - Web scraping (pfSense) - Create a user - python

I'm trying to log into the website and create a user in pfSense.
Currently, I can log in and browse pfSense, but I cannot create users.
import requests
from bs4 import BeautifulSoup

with requests.Session() as s:
    # First CSRF token, for the login
    URL1 = "http://myipaddress/index.php"
    html_text1 = requests.get(URL1).text
    soup1 = BeautifulSoup(html_text1, 'lxml')
    token1 = soup1.find('input', {'name': '__csrf_magic'})['value']
    payload1 = {'__csrf_magic': token1, 'usernamefld': 'myusername', 'passwordfld': 'mypassword', 'login': 'Sign In'}
    s.post(URL1, data=payload1)

    # Second CSRF token, for creating a user
    URL2 = "http://myipaddress/system_usermanager.php?act=new"
    html_text2 = requests.get(URL2).text
    soup2 = BeautifulSoup(html_text2, 'lxml')
    token2 = soup2.find('input', {'name': '__csrf_magic'})['value']
    payload2 = {'__csrf_magic': token2, 'usernamefld': 'Robert', 'passwordfld1': 'root', 'passwordfld2': 'root', 'descr': 'Robert DELAFONDU', 'groups[]': 'Simple', 'utype': 'user', 'save': 'Save'}
    s.post(URL2, data=payload2)

    # Print the users of a group
    URL = "http://myipaddress/system_groupmanager.php?act=edit&groupid=0"
    html_text = s.get(URL).text
    soup = BeautifulSoup(html_text, 'lxml')
    users = soup.find_all("option")
    for user in users:
        print(user.text)
This script prints all of the users.
I also tried adding every field from the user-creation payload (payload image), but that didn't work either.
I log in to pfSense and get the CSRF token for the login, and then I expect to be able to create a user, but I don't know how.
Can anyone help me understand how this works?
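Not a definitive fix, but one likely culprit stands out: both token requests use requests.get() instead of s.get(), so each CSRF token comes from a fresh anonymous session rather than the session you log in with (the same point is made in the answer to the next question below). A minimal sketch of the flow with every request going through the session, reusing the pfSense field names from your own payloads:

import requests
from bs4 import BeautifulSoup

with requests.Session() as s:
    # Log in: fetch the CSRF token with the SAME session that will post the form
    login_url = "http://myipaddress/index.php"
    soup = BeautifulSoup(s.get(login_url).text, 'lxml')
    token = soup.find('input', {'name': '__csrf_magic'})['value']
    s.post(login_url, data={'__csrf_magic': token,
                            'usernamefld': 'myusername',
                            'passwordfld': 'mypassword',
                            'login': 'Sign In'})

    # Create the user: fetch the form (and its own token) with the same session
    new_user_url = "http://myipaddress/system_usermanager.php?act=new"
    soup = BeautifulSoup(s.get(new_user_url).text, 'lxml')
    token = soup.find('input', {'name': '__csrf_magic'})['value']
    r = s.post(new_user_url, data={'__csrf_magic': token,
                                   'usernamefld': 'Robert',
                                   'passwordfld1': 'root',
                                   'passwordfld2': 'root',
                                   'descr': 'Robert DELAFONDU',
                                   'groups[]': 'Simple',
                                   'utype': 'user',
                                   'save': 'Save'})
    print(r.status_code)  # then check the user manager page to confirm the account exists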

Related

BeautifulSoup issues logging in to a website with requests session

I am using the code below to log in to a website so I can scrape data from my own profile page.
However, even after I GET the profile URL, the soup only returns data from the login page.
I still haven't been able to find the reason for that.
import requests
from bs4 import BeautifulSoup

login_url = 'https://caicara.pizzanapoles.com.br/Account/Login'
url_perfil = 'https://caicara.pizzanapoles.com.br/AdminCliente'

payload = {
    'username': 'MY_USERNAME',
    'password': 'MY_PASSWORD'
}

with requests.Session() as s:
    s.post(login_url, data=payload)
    r = requests.get(url_perfil)
    soup = BeautifulSoup(r.content, 'html.parser')
    print(soup.title)
Firstly you need to use your session object s for all the requests.
r = requests.get(url_perfil)
changes to
r = s.get(url_perfil)
A __RequestVerificationToken is sent in the POST data when you try to log in - you may need to send it too.
It is present inside the HTML of the login_url
<input name="__RequestVerificationToken" value="..."
This means you .get() the login page - extract the token - then send your .post()
r = s.get(login_url)                              # load the login page with the session
soup = BeautifulSoup(r.content, 'html.parser')
token = soup.find('input', {'name': '__RequestVerificationToken'})['value']
payload['__RequestVerificationToken'] = token     # add the token to the login payload
r1 = s.post(login_url, data=payload)              # log in
r2 = s.get(url_perfil)                            # fetch the profile with the same session
You may want to save each request into its own variable for further debugging.
Thank you Karl for your reply, but it didn't work.
I changed my code using the tips you mentioned above.
import requests
from bs4 import BeautifulSoup

login_url = 'https://caicara.pizzanapoles.com.br/Account/Login'
url = 'https://caicara.pizzanapoles.com.br/AdminCliente'

data = {
    'username': 'myuser',
    'password': 'mypass',
}

with requests.Session() as s:
    r = s.get(login_url)
    soup = BeautifulSoup(r.content, 'html.parser')
    token = soup.find('input', name='__RequestVerificationToken')['value_of_my_token']
    payload['__RequestVerificationToken'] = token
    r1 = s.post(login_url, data=payload)
    r2 = s.get(url_perfil)
However, it returns the error below.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-140-760e35f7b327> in <module>
13
14 soup = BeautifulSoup(r.content, 'html.parser')
---> 15 token = soup.find('input', name='__RequestVerificationToken')['QHlUQaro9sNo4lefL59lQRtbuziHnHtolV7Xm_Et_3tvnZKZnS4gjBBJZakw7crW0dyXy_lok44RozrMAvWm61XXGla5tC3AuZlgXC4GukA1']
16
17 payload['__RequestVerificationToken'] = token
TypeError: find() got multiple values for argument 'name'
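The TypeError happens because find()'s first parameter is already called name (it is the tag name, 'input' here), so the keyword argument name='__RequestVerificationToken' collides with it. Pass the attribute filter as a dict, as shown in the answer above, and subscript the result with 'value' rather than the token string itself. A corrected fragment of that snippet (it also keeps the variable names data and url actually defined there, which the lines using payload and url_perfil should refer to):

token = soup.find('input', {'name': '__RequestVerificationToken'})['value']  # attribute filter as a dict
data['__RequestVerificationToken'] = token    # the dict in this snippet is called data, not payload
r1 = s.post(login_url, data=data)
r2 = s.get(url)                               # likewise, the profile URL here is called url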

Problem sending data through a POST request in Python

I am trying to input a decision start date and end date into two input boxes on the Gosport Council website by sending a POST request. Whenever I print out the text received after I send the request, it gives me the info shown on the input page, not the results page.
import requests

payload = {
    "applicationDecisionStart": "1/8/2018",
    "applicationDecisionEnd": "1/10/2018",
}

with requests.Session() as session:
    r = session.get("https://publicaccess.gosport.gov.uk/online-applications/search.do?action=advanced", timeout=10, data=payload)
    print(r.text)
When I execute it, I want it to print out the HTML containing the href links, for example:
<a href="/online-applications/applicationDetails.do?keyVal=PEA12JHO07E00&activeTab=summary">
But my code won't show anything like this.
Observing the traffic, the search is a POST, not the GET you are doing, and the POST data is as follows (ignoring empty fields):
from bs4 import BeautifulSoup as bs
import requests

payload = {
    'caseAddressType': 'Application',
    'date(applicationDecisionStart)': '1/8/2018',
    'date(applicationDecisionEnd)': '1/10/2018',
    'searchType': 'Application'
}

with requests.Session() as s:
    r = s.post('https://publicaccess.gosport.gov.uk/online-applications/advancedSearchResults.do?action=firstPage', data=payload)
    soup = bs(r.content, 'lxml')
    info = [(item.text.strip(), item['href']) for item in soup.select('#searchresults a')]
    print(info)
    ## later pages
    # https://publicaccess.gosport.gov.uk/online-applications/pagedSearchResults.do?action=page&searchCriteria.page=2
Loop over pages:
from bs4 import BeautifulSoup as bs
import requests

payload = {
    'caseAddressType': 'Application',
    'date(applicationDecisionStart)': '1/8/2018',
    'date(applicationDecisionEnd)': '1/10/2018',
    'searchType': 'Application'
}

with requests.Session() as s:
    r = s.post('https://publicaccess.gosport.gov.uk/online-applications/advancedSearchResults.do?action=firstPage', data=payload)
    soup = bs(r.content, 'lxml')
    info = [(item.text.strip(), item['href']) for item in soup.select('#searchresults a')]
    print(info)

    pages = int(soup.select('span + a.page')[-1].text)

    for page in range(2, pages + 1):
        r = s.get('https://publicaccess.gosport.gov.uk/online-applications/pagedSearchResults.do?action=page&searchCriteria.page={}'.format(page))
        soup = bs(r.content, 'lxml')
        info = [(item.text.strip(), item['href']) for item in soup.select('#searchresults a')]
        print(info)
The URL and data are incorrect.
Use Chrome to analyse the request: press F12 to open Developer Tools and switch to the Network tab, then submit your search and analyse the first request initiated by Chrome.
What you need:
Headers - General - Request URL
Headers - Request Headers
Headers - Form Data
You also need a package such as bs4 to parse the HTML; see the sketch below.
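As a concrete illustration of turning what you read in the Network tab into a requests call, here is a minimal sketch using the request URL and form fields already shown in the answer above (the User-Agent header is only a placeholder; copy whatever headers your own capture shows):

from bs4 import BeautifulSoup
import requests

# Headers - General - Request URL
url = 'https://publicaccess.gosport.gov.uk/online-applications/advancedSearchResults.do?action=firstPage'

# Headers - Request Headers (example value; use the ones from your capture)
headers = {'User-Agent': 'Mozilla/5.0'}

# Headers - Form Data
data = {
    'caseAddressType': 'Application',
    'date(applicationDecisionStart)': '1/8/2018',
    'date(applicationDecisionEnd)': '1/10/2018',
    'searchType': 'Application'
}

with requests.Session() as s:
    r = s.post(url, headers=headers, data=data)
    soup = BeautifulSoup(r.content, 'lxml')
    print(soup.title)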

Authentication results in 404 code

There is a website I need to scrape, but before I do I need to log in.
There seem to be three things I need: the username, the password, and an authenticity token. The username and password I know, but I am not sure how to access the token.
This is what I have tried:
import requests
from lxml import html

login_url = "https://urs.earthdata.nasa.gov/home"

session_requests = requests.session()
result = session_requests.get(login_url)
tree = html.fromstring(result.text)
authenticity_token = list(set(tree.xpath("//input[@name='authenticity_token']/@value")))[0]

payload = {"username": "my_name",
           "password": "my_password",
           "authenticity_token": authenticity_token}

result = session_requests.post(
    login_url,
    data=payload,
    headers=dict(referer=login_url)
)
print(result)
This results in :
<Response [404]>
My name and password are entered correctly so it is the token that must be going wrong. I think the problem is this line:
authenticity_token = list(set(tree.xpath("//input[@name='authenticity_token']/@value")))[0]
or this line:
payload = {"username": "my_name",
"password": "my_password",
"authenticity_token": authenticity_token}
By looking at the source code of the webpage I noticed there is an authenticity_token, a csrf-token, and a csrf-param. So it's possible these are in the wrong order, but I tried all the combinations.
EDIT:
Here is a BeautifulSoup approach that results in a 404 again.
from bs4 import BeautifulSoup

s = requests.session()
response = s.get(login_url)
soup = BeautifulSoup(response.text, "lxml")

for n in soup('input'):
    if n['name'] == 'authenticity_token':
        token = n['value']
    if n['name'] == 'utf8':
        utf8 = n['value']
        break

auth = {
    'username': 'my_username',
    'password': 'my_password',
    'authenticity_token': token,
    'utf8': utf8
}

s.post(login_url, data=auth)
If you inspect the page you'll notice that the form action value is '/login', so you have to submit your data to https://urs.earthdata.nasa.gov/login.
login_url = "https://urs.earthdata.nasa.gov/login"
home_url = "https://urs.earthdata.nasa.gov/home"
s = requests.session()
soup = BeautifulSoup(s.get(home_url).text, "lxml")
data = {i['name']:i.get('value', '') for i in soup.find_all('input')}
data['username'] = 'my_username'
data['password'] = 'my_password'
result = s.post(login_url, data=data)
print(result)
< Response [200]>
A quick example with selenium:
from selenium import webdriver
driver = webdriver.Firefox()
url = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD10A1.006/'
driver.get(url)
driver.find_element_by_name('username').send_keys('my_username')
driver.find_element_by_name('password').send_keys('my_password')
driver.find_element_by_id('login').submit()
html = driver.page_source
driver.quit()

Unable to create Wiki page using python requests

I am trying to edit a page on a wiki that uses MediaWiki software, but it is not working. I am able to successfully log in, but unable to edit pages. I am unsure what is causing this problem, as I included the edit token in my request. Here is my code:
import requests
from bs4 import BeautifulSoup as bs

def get_login_token(raw_resp):
    soup = bs(raw_resp.text, 'lxml')
    token = [n.get('value', '') for n in soup.find_all('input')
             if n.get('name', '') == 'wpLoginToken']
    return token[0]

def get_edit_token(raw_resp):
    soup = bs(raw_resp.text, 'lxml')
    token = [n.get('value', '') for n in soup.find_all('input')
             if n.get('name', '') == 'wpEditToken']
    return token[0]

# login
s = requests.Session()
values = {'wpName': 'username',
          'wpPassword': 'password',
          'wpLoginAttempt': 'Log in',
          'wpForceHttps': '1',
          'wpLoginToken': ''}

url = '.....'
resp = s.get(url)
values['wpLoginToken'] = get_login_token(resp)
req = s.post(url, values)

# edit page
url1 = '.....'
editing = {'wpTextbox1': 'hi there',
           'wpSave': 'Save page',
           'wpSummary': 'hi',
           'wpEditToken': ''}

resp = s.get(url1)
editing['wpEditToken'] = get_edit_token(resp)
edit = s.post(url1, editing)
print(edit.url)
print(edit.content)
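No answer was posted here, but one thing worth checking (a guess, not a confirmed fix): a MediaWiki edit form carries more hidden fields than just wpEditToken (typically wpStarttime, wpEdittime, and friends), and the save is posted to the form's own action URL (usually ...&action=submit) rather than to the edit URL. Below is a sketch that reuses the "collect every form field" approach from the Earthdata answer above; the editform name and the extra field names are assumptions about a standard MediaWiki install:

from urllib.parse import urljoin

# ... log in as above, keeping the session s ...

url1 = '.....'                      # the page's ...&action=edit URL, as in the question
resp = s.get(url1)
soup = bs(resp.text, 'lxml')

# assumed: the standard MediaWiki edit form is named 'editform'
form = soup.find('form', {'name': 'editform'})

# take every field the form already contains (wpEditToken, wpStarttime, wpEdittime, ...)
editing = {i['name']: i.get('value', '') for i in form.find_all('input') if i.get('name')}
editing['wpTextbox1'] = 'hi there'
editing['wpSummary'] = 'hi'
editing['wpSave'] = 'Save page'

# post to the form's own action URL instead of the edit URL
edit = s.post(urljoin(url1, form['action']), data=editing)
print(edit.url)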

Beautiful Soup parses in some cases but not in others. Why?

I am using Beautiful Soup to parse some JSON out of an HTML file.
Basically, I am using it to get all employee profiles out of a LinkedIn search result.
However, for some reason it does not work for companies that have more than 10 employees.
Here is my code:
import requests, json
from bs4 import BeautifulSoup

s = requests.session()

def get_csrf_tokens():
    url = "https://www.linkedin.com/"
    req = s.get(url).text
    csrf_token = req.split('name="csrfToken" value=')[1].split('" id="')[0]
    login_csrf_token = req.split('name="loginCsrfParam" value="')[1].split('" id="')[0]
    return csrf_token, login_csrf_token

def login(username, password):
    url = "https://www.linkedin.com/uas/login-submit"
    csrfToken, loginCsrfParam = get_csrf_tokens()
    data = {
        'session_key': username,
        'session_password': password,
        'csrfToken': csrfToken,
        'loginCsrfParam': loginCsrfParam
    }
    req = s.post(url, data=data)
    print "success"

login(USERNAME, PASSWORD)

def get_all_json(company_link):
    r = s.get(company_link)
    html = r.content
    soup = BeautifulSoup(html)
    html_file = open("html_file.html", 'w')
    html_file.write(html)
    html_file.close()
    Json_stuff = soup.find('code', id="voltron_srp_main-content")
    print Json_stuff
    return remove_tags(Json_stuff)

def remove_tags(p):
    # strip the surrounding <code ...> tag by slicing off its fixed-length prefix and suffix
    p = str(p)
    return p[62:-10]

def list_of_employes():
    jsons = get_all_json('https://www.linkedin.com/vsearch/p?f_CC=2409087')
    print jsons
    loaded_json = json.loads(jsons.replace(r'\u002d', '-'))
    employes = loaded_json['content']['page']['voltron_unified_search_json']['search']['results']
    return employes

def get_employee_link(employes):
    profiles = []
    for employee in employes:
        print employee['person']['link_nprofile_view_3']
        profiles.append(employee['person']['link_nprofile_view_3'])
    return profiles, len(profiles)

print get_employee_link(list_of_employes())
It will not work for the link that is in place; however it will work for this company search: https://www.linkedin.com/vsearch/p?f_CC=3003796
EDIT:
I am pretty sure that this is an error with the get_all_json() function. If
you take a look, it does not correctly fetch the JSON for companies with more than 10 employees.
This is because the results are paginated. You need to go over all the pages defined inside the JSON data at:
data['content']['page']['voltron_unified_search_json']['search']['baseData']['resultPagination']['pages']
pages is a list; for company 2409087 it is:
[{u'isCurrentPage': True, u'pageNum': 1, u'pageURL': u'http://www.linkedin.com/vsearch/p?f_CC=2409087&page_num=1'},
{u'isCurrentPage': False, u'pageNum': 2, u'pageURL': u'http://www.linkedin.com/vsearch/p?f_CC=2409087&page_num=2', u'page_number_i18n': u'Page 2'},
{u'isCurrentPage': False, u'pageNum': 3, u'pageURL': u'http://www.linkedin.com/vsearch/p?f_CC=2409087&page_num=3', u'page_number_i18n': u'Page 3'}]
This is basically a list of URLs you need to get over and get the data.
Here's what you need to do (omitting the code for the login):
def get_results(json_code):
    return json_code['content']['page']['voltron_unified_search_json']['search']['results']

url = "https://www.linkedin.com/vsearch/p?f_CC=2409087"
soup = BeautifulSoup(s.get(url).text)
code = soup.find('code', id="voltron_srp_main-content").contents[0].replace(r'\u002d', '-')
json_code = json.loads(code)

results = get_results(json_code)

pages = json_code['content']['page']['voltron_unified_search_json']['search']['baseData']['resultPagination']['pages']

for page in pages[1:]:
    soup = BeautifulSoup(s.get(page['pageURL']).text)
    code = soup.find('code', id="voltron_srp_main-content").contents[0].replace(r'\u002d', '-')
    json_code = json.loads(code)
    results += get_results(json_code)

print len(results)
It prints 25 for https://www.linkedin.com/vsearch/p?f_CC=2409087 - exactly how many you see in the browser.
It turns out it was a problem with the default BeautifulSoup parser.
I changed it to html5lib by doing this:
Install it from the console:
pip install html5lib
Then change the parser you choose when first creating the soup object:
soup = BeautifulSoup(html, 'html5lib')
This is documented in the BeautifulSoup docs.
