how to automatically use the session? - python

please help to deal with the authorization through the script. the problem is that it is impossible to automatically insert a get-request invbf_session_id.
import pprint
import requests
import re
import shelve
import bs4
def scanning_posts():
print('------------------------enter begin--------------------')
url = 'http://forum.saransk.ru/'
html = requests.get(url)
pprint.pprint(html.headers)
rawCookie = html.headers['Set-Cookie']
cookie = re.search(r"invbf_session_id=(.*?);", rawCookie).group(1)
pprint.pprint(cookie) # cookie != zzzzzzzzzzzzzzzzzzzzzz
html = requests.get(url)
soup = bs4.BeautifulSoup(html.text)
loginForm = soup.find('form', {'id': 'login'})
hiddenAuthKey = soup.find('input', {'name': 'auth_key'})['value']
authData = {
'ips_username': 'xxxxxx',
'ips_password': 'yyyyyy',
'auth_key': hiddenAuthKey,
'rememberMe': 1,
'referer': 'http://forum.saransk.ru/'
}
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
'Referer': 'http://forum.saransk.ru/forum/'
}
#pprint.pprint(authData)
print('\tlogin: ', authData['ips_username'])
cookie = dict(invbf_session_id='zzzzzzzzzzzzzzzzzzzzzz')
req = requests.get(url, params=authData, cookies=cookie, headers=header)
soup = bs4.BeautifulSoup(req.text)
signLinkNotLogged = soup.find('a', {'id': 'sign_in'})
if signLinkNotLogged:
print('------------------------enter failed--------------------')
else:
print('------------------------enter successful--------------------')
scanning_posts()
After running the script displays the wrong value invbf_session_id, as seen in FF firebug. respectively, authorization is not obtained.
if the value invbf_session_id copy of FF firebug and paste the script, the authorization is successful

Related

Instagram login with py requests | bs lib / how do i login 2fac auth?

Im trying to make a Instagramlogin script with python.
Here is my full code
import requests,os,sys,json,time
from datetime import datetime
from bs4 import BeautifulSoup as BS
os.system("cls")
main = "https://instagram.com/"
login = "https://www.instagram.com/api/v1/web/accounts/login/ajax/"
t = int(datetime.now().timestamp())
udata = {
"enc_password":f"#PWD_INSTAGRAM_BROWSER:0:{t}:mypassword",
"username":"myusername",
"queryParams": {},
"optIntoOneTap":"false"
}
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 11.0; WOW64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5222.187 Safari/537.36",
"x-requested-with":"XMLHttpRequest",
"referer":"https://www.instagram.com/",
"x-csrftoken": "token"
}
with requests.Session() as ses:
ses.get(main)
headers["x-csrftoken"] = ses.cookies["ig_did"]
response = requests.post(login, data=udata, headers=headers,allow_redirects=True)
url = "https://www.instagram.com/api/v1/web/accounts/login/ajax/two_factor/"
two_fac = {
"username":"myusername",
"verificationCode":"code",
"verification_method":"3",
"queryParams": {"next":"/"}
}
headers["referer"] = "https://www.instagram.com/accounts/login/two_factor?next=/"
r = requests.post(url)
headers["x-csrftoken"] = r.cookies["ig_did"]
TwoFacCode = input("-->> ")
two_fac["verificationCode"] = TwoFacCode
response = requests.post(url,headers=headers,data=two_fac)
print(response.text)
# json_data = json.loads(response.text)
# if json_data["authenticated"]:
# print("TRUE")
output --> Oops, an error occurred.
the 2fac addr (url) returns that output.
how can i countinue to 2fac screen like i want to write the auth code and pass but it isnt workin
Idk what's wrong here , it should be clean but idk why this error comes out

Log in to a website with csrfmiddlewaretoken verification using python

I am using the following code to log in to a website with csrfmiddlewaretoken verification, but it throws me the following error:
"csrfmiddlewaretoken = HTML.find_all('input')[1]['value']
IndexError: list index out of range"
What do you think is the problem, I'm new using python :)
import requests
from bs4 import BeautifulSoup
request_url = 'https://text.gob.pe/accounts/login/'
with requests.session() as session:
get_url = session.get('https://text.gob.pe/accounts/login/')
HTML = BeautifulSoup(get_url.text, 'html.parser')
csrfmiddlewaretoken = HTML.find_all('input')[1]['value']
#logging in
payload = {
'next' : '/ profile /',
'username' : 'secret',
'password' : 'secret',
'next': '/ profile /',
'csrfmiddlewaretoken': csrfmiddlewaretoken
}
headers = {
'Referer': 'https://text.gob.pe/accounts/login/'
}
login_request = session.post(request_url,payload, headers=headers)
home_page = session.get("https://text.gob.pe/ficha/buscar/")
print(home_page.content)
Without a username and password, it is difficult to show how to proceed. At the moment your mistake is that you didn't specify headers
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'content-type': 'application/x-www-form-urlencoded',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}
url = "https://logincovid19.minsa.gob.pe/accounts/login/"
session = requests.Session()
response = session.get(url, headers=headers)
csrfmiddlewaretoken = BeautifulSoup(response.text, 'lxml').find('input', {'name': 'csrfmiddlewaretoken'}).get('value')
Now csrfmiddlewaretoken is something like this - eyHLYFv7HOYxglzFS9a3JDxOT38u8mrakdwhatOnkvcJJzwN9dNi6olBxJxD1HZi
I think further the code will look like this, but you need to check it:
user = 'YourUserNmaeHere'
password = 'UserPaswordHere'
payload = f'csrfmiddlewaretoken={csrfmiddlewaretoken}&username={user}&password={password}'
response = session.post(url, data=payload, headers=headers)

How to get past the consent page of google shopping using requests and python

For a small project I tried to get past the google consent page to webscrape the prices found by it. However i could not get the to the good stuff. Sofar i tried the following code, as also proposed by Using python requests with google search.
First try:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
#%% get acces
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
"cookie": "CONSENT=YES"}
cookie = {
'set-cookie': 'CONSENT=YES+cb.20211212-16-p1.nl+FX+671'
}
url = 'https://www.google.com/search?q=kitesurf+bar&rlz=1C1CAFA_enNL603NL674&sxsrf=AOaemvJF3BPrjeczkCI1e1YotCSJYcKjug:1641909060258&source=lnms&tbm=shop&sa=X&ved=2ahUKEwiBwqjy66n1AhXO-KQKHUKVBSoQ_AUoAXoECAEQAw'
s = requests.Session()
s.get(url,headers=headers)
purl = 'https://consent.google.com/s'
payload = {
'gl': 'NL',
'm': 'false',
'pc': 'srp',
'continue': 'https://www.google.com/search?q=kitesurf+bar&rlz=1C1CAFA_enNL603NL674&sxsrf=AOaemvJF3BPrjeczkCI1e1YotCSJYcKjug:1641909060258&source=lnms&tbm=shop&sa=X&ved=2ahUKEwiBwqjy66n1AhXO-KQKHUKVBSoQ_AUoAXoECAEQAw',
'ca': 'r',
'x': '6',
'v': 'cb.20211212-16-p1.nl+FX+092',
't': 'ADw3F8jHa3HqOqq133-wOkXCYf4K_r-AIA:1641909071268',
'hl': 'nl',
'src': '1'
}
s.post(purl,params=payload,headers=headers)
page = s.get(url, headers=headers, cookies=cookie)
Second try:
import requests
from bs4 import BeautifulSoup
with requests.Session() as s:
# url = f"https://www.google.com/search?q=fitness+wear"
headers = {
"referer":"referer: https://www.google.com/",
"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
}
s.post(url, headers=headers)
response = s.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup)```

Scraping Data after filling form in python of a website

I have tried to scrape data from http://www.educationboardresults.gov.bd/ with python and BeautifulSoup.
Firstly, website need to fill the form. After filling the form the website provide results. I have attached two image here.
Before Submitting Form: https://prnt.sc/w4lo7i
After Submission: https://prnt.sc/w4lqd0
I have tried with following code
import requests
from bs4 import BeautifulSoup as bs
resultdata = {
'sr': '3',
'et': '2',
'exam': 'ssc',
'year': 2012,
'board': 'chittagong',
'roll': 102275,
'reg': 626948,
'button2': 'Submit',
}
headers ={
'user-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
'cookie': 'PHPSESSID=24vp2g7ll9utu1p2ob5bniq263; tcount_unique_eb_log=1',
'Origin': 'http://www.educationboardresults.gov.bd',
'Referer': 'http://www.educationboardresults.gov.bd/',
'Request URL': 'http://www.educationboardresults.gov.bd/result.php'
}
with requests.Session() as s:
url = 'http://www.educationboardresults.gov.bd'
r = s.get(url, headers=headers)
soup = bs(r.content,'html5lib')
#Scraping and by passing Captcha
alltable =soup.findAll('td')
captcha = alltable[56].text.split('+')
for digit in captcha:
value_one, value_two = int(captcha[0]), int(captcha[1])
resultdata['value_s'] = value_one+value_two
r=s.post(url, data=resultdata, headers= headers)
While printing r.content it is showing first page's code. I want to scrape the second page.
Thanks in Advance
You are making post requests to the wrong url. Moreover, you are supposed to add the value of two numbers and use the result right next to value_s. If you are using bs4 version 3.7 or later, the following selector will work for you as I've used pseudo css selector. The bottom line is your issue is solved. Try the following:
import requests
from bs4 import BeautifulSoup
link = 'http://www.educationboardresults.gov.bd/'
result_url = 'http://www.educationboardresults.gov.bd/result.php'
resultdata = {
'sr': '3',
'et': '2',
'exam': 'ssc',
'year': 2012,
'board': 'chittagong',
'roll': 102275,
'reg': 626948,
'button2': 'Submit',
}
def get_number(s,link):
r = s.get(link)
soup = BeautifulSoup(r.text,"html5lib")
num = 0
captcha_numbers = soup.select_one("tr:has(> td > #value_s) > td + td").text.split("+")
for i in captcha_numbers:
num+=int(i)
return num
with requests.Session() as s:
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
resultdata['value_s'] = get_number(s,link)
r = s.post(result_url, data=resultdata)
print(r.text)
I am also trying.
import requests
from bs4 import BeautifulSoup as bs
resultdata = {
'sr': '3',
'et': '2',
'exam': 'ssc',
'year': "2012",
'board': 'chittagong',
'roll': "102275",
'reg': "626948",
}
headers ={
'user-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
'cookie': 'PHPSESSID=24vp2g7ll9utu1p2ob5bniq263; tcount_unique_eb_log=1',
'Origin': 'http://www.educationboardresults.gov.bd',
'Referer': 'http://www.educationboardresults.gov.bd/',
'Request URL': 'http://www.educationboardresults.gov.bd/result.php'
}
with requests.Session() as s:
url = 'http://www.educationboardresults.gov.bd/index.php'
r = s.get(url, headers=headers)
soup = bs(r.content,'lxml')
# print(soup.prettify())
#Scraping and by passing Captcha
alltable =soup.findAll('td')
captcha = alltable[56].text.split('+')
print(captcha)
value_one, value_two = int(captcha[0]), int(captcha[1])
print(value_one, value_one)
resultdata['value_s'] = value_one+value_two
resultdata['button2'] = 'Submit'
print(resultdata)
r=s.post("http://www.educationboardresults.gov.bd/result.php", data=resultdata, headers= headers)
soup = bs(r.content, 'lxml')
print(soup.prettify())

Can't parse some names and and their concerning urls from a webpage

I've created a python script using requests and BeautifulSoup to parse the profile names and the links to their profile names from a webpage. The content seems to generate dynamically but they are present in page source. So, I tried with the following but unfortunately I get nothing.
SiteLink
My attempt so far:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.century21.com/real-estate-agents/Dallas,TX'
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
'cache-control': 'max-age=0',
'cookie': 'JSESSIONID=8BF2F6FB5603A416DCFBAB8A3BB5A79E.app09-c21-id8; website_user_id=1255553501;',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
def get_info(link):
res = requests.get(link,headers=headers)
soup = BeautifulSoup(res.text,"lxml")
for item in soup.select(".media__content"):
profileUrl = item.get("href")
profileName = item.select_one("[itemprop='name']").get_text()
print(profileUrl,profileName)
if __name__ == '__main__':
get_info(URL)
How can I fetch the content from that page?
The required content does available in page source. The site is very good at discarding requests when it is made using the same user-agent. So, I used fake_useragent to supply the same randomly with requests. It works if you don't use it incessantly.
Working solution:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from fake_useragent import UserAgent
URL = 'https://www.century21.com/real-estate-agents/Dallas,TX'
def get_info(s,link):
s.headers["User-Agent"] = ua.random
res = s.get(link)
soup = BeautifulSoup(res.text,"lxml")
for item in soup.select(".media__content a[itemprop='url']"):
profileUrl = urljoin(link,item.get("href"))
profileName = item.select_one("span[itemprop='name']").get_text()
print(profileUrl,profileName)
if __name__ == '__main__':
ua = UserAgent()
with requests.Session() as s:
get_info(s,URL)
Partial output:
https://www.century21.com/CENTURY-21-Judge-Fite-Company-14501c/Stewart-Kipness-2657107a Stewart Kipness
https://www.century21.com/CENTURY-21-Judge-Fite-Company-14501c/Andrea-Anglin-Bulin-2631495a Andrea Anglin Bulin
https://www.century21.com/CENTURY-21-Judge-Fite-Company-14501c/Betty-DeVinney-2631507a Betty DeVinney
https://www.century21.com/CENTURY-21-Judge-Fite-Company-14501c/Sabra-Waldman-2657945a Sabra Waldman
https://www.century21.com/CENTURY-21-Judge-Fite-Company-14501c/Russell-Berry-2631447a Russell Berry
The page content is NOT rendered via javascript. Your code is fine in my case.
You have just some issue to find the profileUrl and to handle nonetype exception. You have to focus to the a tag to get the data
You should try this:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.century21.com/real-estate-agents/Dallas,TX'
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
'cache-control': 'max-age=0',
'cookie': 'JSESSIONID=8BF2F6FB5603A416DCFBAB8A3BB5A79E.app09-c21-id8; website_user_id=1255553501;',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
def get_info(link):
res = requests.get(link,headers=headers)
soup = BeautifulSoup(res.text,"lxml")
results = []
for item in soup.select(".media__content"):
a_link = item.find('a')
if a_link:
result = {
'profileUrl': a_link.get('href'),
'profileName' : a_link.get_text()
}
results.append(result)
return results
if __name__ == '__main__':
info = get_info(URL)
print(info)
print(len(info))
OUTPUT:
[{'profileName': 'Stewart Kipness',
'profileUrl': '/CENTURY-21-Judge-Fite-Company-14501c/Stewart-Kipness-2657107a'},
....,
{'profileName': 'Courtney Melkus',
'profileUrl': '/CENTURY-21-Realty-Advisors-47551c/Courtney-Melkus-7389925a'}]
941
It looks like you can construct the url as well (Though does seem easier to just grab it)
import requests
from bs4 import BeautifulSoup as bs
URL = 'https://www.century21.com/real-estate-agents/Dallas,TX'
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
'cache-control': 'max-age=0',
'cookie': 'JSESSIONID=8BF2F6FB5603A416DCFBAB8A3BB5A79E.app09-c21-id8; website_user_id=1255553501;',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
r = requests.get(URL, headers = headers)
soup = bs(r.content, 'lxml')
items = soup.select('.media')
ids = []
names = []
urls = []
for item in items:
if item.select_one('[data-agent-id]') is not None:
anId = item.select_one('[data-agent-id]')['data-agent-id']
ids.append(anId)
name = item.select_one('[itemprop=name]').text.replace(' ','-')
names.append(name)
url = 'https://www.century21.com/CENTURY-21-Judge-Fite-Company-14501c/' + name + '-' + anId + 'a'
urls.append(url)
results = list(zip(names, urls))
print(results)
Please try:
profileUrl = "https://www.century21.com/" + item.select("a")[0].get("href")

Categories