Just to clarify from the beginning: I'm a total beginner (I wrote something in Python for the first time today). This was more applying from a guide and trying to remember what I did 7 years ago when I tried learning java than anything else.
I wanted to scrape the image tags from a website (to plot them later) but have to stay logged in to view all images. After I got the scraping down I noticed that there were some tags blocked so the issue with the login came up. I now managed to log in but it doesn't work outside of the session itself which makes the rest of my code useless. Can I get this to work or do I have to give up?
This is the working login:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
# Form fields submitted to the login endpoint.
login_data = {
    'user': 'theusername',
    'pass': 'thepassword',
    'op': 'Log in',
}

# The session object is closed when this `with` block exits, so any request
# that has to reuse the login must be issued inside the block.
with requests.Session() as s:
    url = "https://thatwebsite.com/index.php?page=account&s=login&code=00"
    r = s.get(url)  # initial GET of the login page before posting the form
    r = s.post(url, data=login_data)  # submit the credentials
And what I had working before to scrape the website but with the login missing:
filename = "taglist.txt"
headers = "tags\n"
pid = 0
actual_page = 1

# `with` guarantees the output file is closed even if a request or the parsing
# of a page raises part-way through the crawl.
with open(filename, "w", encoding="utf-8") as f:
    f.write(headers)
    while pid < 150:
        url = "https://thatwebsite.com/index.php?page=post&s=list&tags=absurdres&pid=" + str(pid)
        print(url)
        # Release the HTTP response as soon as its body has been read.
        with urlopen(url) as client:
            page_html = client.read()
        page_soup = soup(page_html, "html.parser")
        containers = page_soup.findAll("div", {"class": "thumbnail-preview"})
        print("Current pid: " + str(pid))
        for container in containers:
            # The thumbnail's title attribute is split on spaces and written
            # one entry per line.
            tags = container.span.a.img["title"]
            f.write(tags.replace(" ", "\n") + "\n")
        pid = pid + 42  # offset of the next listing page
        print("Current page: " + str(actual_page))
        actual_page += 1
print("Done.")
Out comes a list of every tag used by high res images.
I hope I don't offend anyone with this.
Edit: The code is working now, had a cookie typo:
import requests
from bs4 import BeautifulSoup as soup
# Login form fields; not used by the code below, which authenticates with
# cookies instead.
login_data = {
    'user': 'myusername',
    'pass': 'mypassword',
    'op': 'Log in',
}
# Cookies sent with every listing request (presumably copied from a logged-in
# browser session). Built once instead of on every loop iteration.
auth_cookies = {'duid': 'somehash', 'user_id': 'my userid', 'pass_hash': 'somehash'}

s = requests.Session()
print("\n\n\n\n\n")
filename = "taglist.txt"
headers = "tags\n"
pid = 0
actual_page = 1

# `with` closes the output file even if a request or the parsing fails.
with open(filename, "w", encoding="utf-8") as f:
    f.write(headers)
    while pid < 42:
        url2 = "https://thiswebsite.com/index.php?page=post&s=list&tags=rating:questionable&pid=" + str(pid)
        r = s.get(url2, cookies=auth_cookies)
        # Hand the raw bytes to the parser. str(r.content) would produce the
        # "b'...'" repr of the bytes, turning newlines and non-ASCII bytes
        # into backslash escapes inside the extracted tags.
        page_html = r.content
        page_soup = soup(page_html, "html.parser")
        containers = page_soup.findAll("div", {"class": "thumbnail-preview"})
        for container in containers:
            # Split the title attribute on spaces: one entry per output line.
            tags = container.span.a.img["title"]
            f.write(tags.replace(" ", "\n") + "\n")
        print("\nCurrent page: " + str(actual_page) + " Current pid: " + str(pid) + "\nDone.")
        actual_page += 1
        pid = pid + 42  # offset of the next listing page
You use two different libraries for doing web requests right now. requests and urllib. I would opt for using only requests.
Also, don't use the Session() context manager. Context managers are used to do some cleanup after leaving the indented block; they are what the with ... as x syntax you use on the requests.Session() object invokes. In the context of requests, this will clear the cookies as you leave the session. (I assume login is managed by cookies at this site.)
Keep the session in a variable instead that you can use for subsequent requests as this stores your cookies at login. You need them for subsequent requests.
# Keep the session in a plain variable (no `with` block) so the same object
# can be reused for the requests that follow the login.
s = requests.Session()
url = "https://thatwebsite.com/index.php?page=account&s=login&code=00"
r = s.get(url) # do you need this request?
# Submit the login form; `login_data` is the dict of form fields.
r = s.post(url, data=login_data)
Also make the subsequent call in the loop with requests:
client = s.get(url)
I set up this code to extract the links from the following website. The problem is that it stops at record 19 and doesn't continue with the listing.
Can you help me?
import urllib.request
import os
# BeautifulSoup was used below without ever being imported.
from bs4 import BeautifulSoup

tematica = 'fun'
url = "https://www.shutterstock.com/es/search/" + tematica
request = urllib.request.Request(url)
# Close the HTTP response as soon as the body has been read.
with urllib.request.urlopen(request) as response:
    data_content = response.read()

# Keep a copy of the raw page on disk; `with` closes the file even on error.
with open("html_file.html", "wb") as Html_file:
    Html_file.write(data_content)

# Decode the downloaded bytes directly instead of re-reading the file that was
# just written (the original did this through the never-imported `codecs`).
html = data_content.decode('utf-8')
soup = BeautifulSoup(html, "html.parser")

for i, img_element in enumerate(soup.findAll('img')):
    # <img> tags without a src attribute are skipped explicitly; the former
    # bare `except: pass` hid every other error as well.
    img_src = img_element.get('src')
    if img_src is None:
        continue
    print(i, img_src)
I am not very experienced with this type of thing, but I cannot seem to log into this webpage via Python: https://ravenpack.com/discovery/login/
I have tried solutions from other StackOverflow posts, but nothing seems to work. It could be that it is not possible or I just do not know what I'm doing - either are likely possible
I have tried:
import requests
LOGIN_URL = 'https://ravenpack.com/discovery/login/'
DATA_URL = 'https://ravenpack.com/discovery/news_analytics_story/FFF4BFD4F4D4FF803852899BD1F02077/'

# Form fields posted to the login URL.
payload = {
    'username': 'uname',
    'password': 'pword',
}

with requests.Session() as s:
    # Post the credentials, then request the protected page with the same
    # session so cookies set by the login response are sent along.
    s.post(LOGIN_URL, data=payload)
    r = s.get(DATA_URL)
    # Function-call form prints the same text on Python 2 and Python 3.
    print(r.text)
this:
from twill.commands import *
# Open the login page in twill's browser.
go('https://ravenpack.com/discovery/login/')
# Set the "username" and "password" fields on the form referred to as "2".
fv("2", "username", "uname")
fv("2", "password", "pword")
# Submit the form; '1' presumably selects the submit button — TODO confirm.
submit('1')
this:
import mechanize
br = mechanize.Browser()
br.set_handle_robots(False)
br.open("https://ravenpack.com/discovery/login/") #Url that contains signin form
br.select_form()
br['username'] = "uname" #see what is the name of txt input in form
br['password'] = 'pword'
result = br.submit().read()
# Save the response body. `file()` exists only on Python 2; `open` in binary
# mode works on both versions, and `with` closes the file even if the write
# fails.
with open('s.html', 'wb') as f:
    f.write(result)
and this:
from robobrowser import RoboBrowser
# Browser created with history enabled and a Mozilla user-agent string.
browser = RoboBrowser(history=True,user_agent='Mozilla/5.0')
login_url = 'https://ravenpack.com/discovery/login/'
browser.open(login_url)
# Look the login form up by its HTML id and fill in the credentials.
form = browser.get_form(id='login_form')
form['username'].value = 'uname'
form['password'].value = 'pword'
browser.submit_form(form)
Any help is appreciated.
import requests
# HTTPBasicAuth was used below without being imported.
from requests.auth import HTTPBasicAuth

LOGIN_URL = 'https://ravenpack.com/discovery/login/'
DATA_URL = 'https://ravenpack.com/discovery/news_analytics_story/FFF4BFD4F4D4FF803852899BD1F02077/'
username = 'user'
password = 'password'

with requests.Session() as s:
    # Authenticate with HTTP Basic auth, then fetch the protected page through
    # the same session.
    s.post(LOGIN_URL, auth=HTTPBasicAuth(username, password))
    r = s.get(DATA_URL)
    # Function-call form prints the same text on Python 2 and Python 3.
    print(r.text)
def get_main_page_url(urlparameter, strDestPath, strMD5):
    """Log in to malwr.com, search for an MD5 and download the last result.

    The original signature had a string literal in place of the first
    parameter (a syntax error) while the body read ``urlparameter`` and an
    undefined ``arg``; the parameters are now named after how the body uses
    them.

    Args:
        urlparameter: URL of the search page, e.g.
            "https://malwr.com/analysis/search/".
        strDestPath: Prefix prepended to the name of the downloaded file
            (the body previously referred to it as ``arg``).
        strMD5: MD5 hash submitted in the search form.

    Returns:
        None. Nothing is downloaded when the page has no result link.
    """
    base_url = 'https://malwr.com/'
    url = 'https://malwr.com/account/login/'
    username = 'myname'
    password = 'pswd'
    session = requests.Session()

    # getting csrf value
    response = session.get(url)
    soup = bs4.BeautifulSoup(response.content)
    form = soup.form
    csrf = form.find('input', attrs={'name': 'csrfmiddlewaretoken'}).get('value')

    # logging in
    data = {
        'username': username,
        'password': password,
        'csrfmiddlewaretoken': csrf
    }
    session.post(url, data=data)

    # getting analysis data: the search form carries its own CSRF token
    response = session.get(urlparameter)
    soup = bs4.BeautifulSoup(response.content)
    form = soup.form
    csrf = form.find('input', attrs={'name': 'csrfmiddlewaretoken'}).get('value')
    data = {
        'search': strMD5,
        'csrfmiddlewaretoken': csrf
    }
    # NOTE(review): the response of this POST is discarded and the page is
    # fetched again with a plain GET below — confirm whether the search
    # results are expected in the POST response instead.
    session.post(urlparameter, data = data)
    response = session.get(urlparameter)
    soup = bs4.BeautifulSoup(response.content)
    print(soup)

    # Walk section#file -> table -> last row -> link one step at a time, so a
    # page without results ends the function instead of raising
    # AttributeError/IndexError from a chained lookup.
    section = soup.find('section', id='file')
    table = section.find('table') if section is not None else None
    rows = table('tr') if table is not None else []
    anchor = rows[-1].a if rows else None
    if anchor is None:
        return

    link = urljoin(base_url, anchor.get('href'))
    webFile = session.get(link)
    # The second-to-last path component of the link is used as the file name.
    filename = strDestPath + link.split('/')[-2]
    try:
        # `with` closes the local file even if the write fails.
        with open(filename, 'wb') as localFile:
            localFile.write(webFile.content)
    finally:
        webFile.close()
I am able to log in by looking up the CSRF token. I then try to send an MD5 to the search on malwr.com; however, I am not able to get the page with the search results for the MD5 I sent.
I want to search for the MD5 that is submitted together with the CSRF token.
Please let me know what is wrong with the code.
You've done almost everything correctly. Except that you need to pass the result of the POST request to BeautifulSoup. Replace:
session.post(urlparameter, data = data)
response = session.get(urlparameter)
with:
response = session.post(urlparameter, data=data)
Worked for me (I had an account at malwr).
I'm trying to log into LinkedIn using Python requests:
import sys
import requests
from BeautifulSoup import BeautifulSoup
# Form fields posted to the login-submit endpoint.
payload={
'session-key' : 'user#email.com',
'session-password' : 'password'
}
URL='https://www.linkedin.com/uas/login-submit'
# One session for both requests so cookies from the login response are reused.
s=requests.session()
s.post(URL,data=payload)
# Fetch the home page and print its <title> to check whether the login worked.
r=s.get('http://www.linkedin.com/nhome')
soup = BeautifulSoup(r.text)
print soup.find('title')
I can't seem to log in using this method. I even tried playing with csrf etc. in the payload, but aren't sessions supposed to take care of that for you?
Note about the last line: I use the title to check if I've successfully logged in. (I should see "Welcome! | LinkedIn" if I have signed in, instead I see "World's Largest Professional Network | LinkedIn"
Am I missing something?
I modified a web-scraping template I use for most of my Python-based scraping needs to fit your needs. Verified it worked with my own login info.
The way it works is by mimic-ing a browser and maintaining a cookieJar that stores your user session. Got it to work with BeautifulSoup for you as well.
Note: This is a Python2 version. I added a working Python3 example further below by request.
import cookielib
import os
import urllib
import urllib2
import re
import string
from BeautifulSoup import BeautifulSoup
username = "user#email.com"
password = "password"
cookie_filename = "parser.cookies.txt"
class LinkedInParser(object):
    """Log in to LinkedIn through a cookie-aware urllib2 opener.

    Constructing an instance runs the whole flow: load previously saved
    cookies, log in, fetch the feed page, print its <title> and save the
    cookie jar to ``cookie_filename``.
    """

    def __init__(self, login, password):
        """ Start up... """
        self.login = login
        self.password = password

        # Simulate browser with cookies enabled
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib2.build_opener(
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                            'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]

        # Login
        self.loginPage()

        title = self.loadTitle()
        # Function-call form prints the same text on Python 2 and Python 3.
        print(title)

        self.cj.save()

    def loadPage(self, url, data=None, retries=3):
        """
        Load ``url`` and return the response body as one string.

        ``data``, when given, is sent as the request body (which makes the
        request a POST). A failed load is retried up to ``retries`` more
        times; after that the last exception is re-raised. The previous
        version retried by unbounded recursion under a bare ``except``, which
        could not terminate on a persistent error and also swallowed
        KeyboardInterrupt.
        """
        attempt = 0
        while True:
            try:
                if data is not None:
                    response = self.opener.open(url, data)
                else:
                    response = self.opener.open(url)
                return ''.join(response.readlines())
            except Exception:
                attempt += 1
                if attempt > retries:
                    raise

    def loginPage(self):
        """
        Handle login. This should populate our cookie jar.
        """
        # The CSRF token embedded in the home page has to be posted back
        # together with the credentials.
        html = self.loadPage("https://www.linkedin.com/")
        soup = BeautifulSoup(html)
        csrf = soup.find(id="loginCsrfParam-login")['value']

        login_data = urllib.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': csrf,
        })

        # Only the cookies set by this response matter; the body is not used.
        self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)

    def loadTitle(self):
        """Return the <title> element of the feed page."""
        html = self.loadPage("https://www.linkedin.com/feed/")
        soup = BeautifulSoup(html)
        return soup.find("title")
parser = LinkedInParser(username, password)
Update June 19, 2014: Added parsing for CSRF token from homepage for use in updated login process.
Update July 23, 2015: Adding a Python 3 example here. Basically requires substituting library locations and removing deprecated methods. It's not perfectly formatted or anything, but it functions. Sorry for the rush job. In the end the principles and steps are identical.
import http.cookiejar as cookielib
import os
import urllib
import re
import string
from bs4 import BeautifulSoup
username = "user#email.com"
password = "password"
cookie_filename = "parser.cookies.txt"
class LinkedInParser(object):
    """Log in to LinkedIn through a cookie-aware urllib opener.

    Constructing an instance runs the whole flow: load previously saved
    cookies, log in, fetch the feed page, print its <title> and save the
    cookie jar to ``cookie_filename``.
    """

    def __init__(self, login, password):
        """ Start up... """
        # A bare `import urllib` does not load the `request` submodule;
        # import it explicitly instead of relying on another module having
        # done so.
        import urllib.request

        self.login = login
        self.password = password

        # Simulate browser with cookies enabled
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPRedirectHandler(),
            urllib.request.HTTPHandler(debuglevel=0),
            urllib.request.HTTPSHandler(debuglevel=0),
            urllib.request.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                            'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]

        # Login
        self.loginPage()

        title = self.loadTitle()
        print(title)

        self.cj.save()

    def loadPage(self, url, data=None, retries=3):
        """
        Load ``url`` and return the response body decoded as text.

        ``data``, when given, is sent as the request body (which makes the
        request a POST). A failed load is retried up to ``retries`` more
        times; after that the last exception is re-raised. The previous
        version retried by unbounded recursion, which could not terminate on
        a persistent error.
        """
        attempt = 0
        while True:
            try:
                if data is not None:
                    response = self.opener.open(url, data)
                else:
                    response = self.opener.open(url)
                # Decode the bytes. str() on each bytes line produced
                # "b'...'" reprs with escaped newlines and non-ASCII bytes.
                return response.read().decode('utf-8', errors='replace')
            except Exception:
                attempt += 1
                if attempt > retries:
                    raise

    def loadSoup(self, url, data=None):
        """
        Combine loading of URL, HTML, and parsing with BeautifulSoup
        """
        html = self.loadPage(url, data)
        soup = BeautifulSoup(html, "html5lib")
        return soup

    def loginPage(self):
        """
        Handle login. This should populate our cookie jar.
        """
        # See __init__: the submodule must be imported explicitly.
        import urllib.parse

        # The CSRF token embedded in the home page has to be posted back
        # together with the credentials.
        soup = self.loadSoup("https://www.linkedin.com/")
        csrf = soup.find(id="loginCsrfParam-login")['value']
        login_data = urllib.parse.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': csrf,
        }).encode('utf8')

        # Only the cookies set by this response matter; the body is not used.
        self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)

    def loadTitle(self):
        """Return the <title> element of the feed page."""
        soup = self.loadSoup("https://www.linkedin.com/feed/")
        return soup.find("title")
parser = LinkedInParser(username, password)
This is a much simpler version.
import requests
from bs4 import BeautifulSoup
# One session for every request so cookies set by earlier responses are reused.
client = requests.Session()
HOMEPAGE_URL = 'https://www.linkedin.com'
LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'
# Read the CSRF token out of the home page; it is posted back with the login.
html = client.get(HOMEPAGE_URL).content
soup = BeautifulSoup(html, "html.parser")
csrf = soup.find(id="loginCsrfParam-login")['value']
# 'Login' and 'Password' are placeholders for the real credentials.
login_information = {
'session_key':'Login',
'session_password':'Password',
'loginCsrfParam': csrf,
}
client.post(LOGIN_URL, data=login_information)
# 'Any_Linkedin_URL' is a placeholder for the page to fetch once logged in.
client.get('Any_Linkedin_URL')
2019 Version.
Slightly revised version working that takes into account the new structure of the page to find the connection cookie and adds the trk parameter.
import requests
from bs4 import BeautifulSoup
# Credentials are left blank here and must be filled in.
email = ""
password = ""
# One session for every request so cookies set by earlier responses are reused.
client = requests.Session()
HOMEPAGE_URL = 'https://www.linkedin.com'
LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'
# Read the CSRF token from the home page's hidden "loginCsrfParam" input.
html = client.get(HOMEPAGE_URL).content
soup = BeautifulSoup(html, "html.parser")
csrf = soup.find('input', {'name': 'loginCsrfParam'}).get('value')
login_information = {
'session_key': email,
'session_password': password,
'loginCsrfParam': csrf,
'trk': 'guest_homepage-basic_sign-in-submit'
}
client.post(LOGIN_URL, data=login_information)
# NOTE(review): the URL is empty here — presumably a placeholder for the page
# to fetch after logging in; an empty URL cannot be requested as written.
response = client.get('')
2020 version of @garromark's accepted solution:
import http.cookiejar as cookielib
import os
import urllib
import re
import string
from bs4 import BeautifulSoup
username = ""
password = ""
cookie_filename = "parser.cookies.txt"
class LinkedInParser(object):
    """Log in to LinkedIn through a cookie-aware urllib opener.

    Constructing an instance loads previously saved cookies, logs in, fetches
    the feed page and prints its <title>. The cookie jar is not saved back to
    disk in this version.
    """

    def __init__(self, login, password):
        """ Start up... """
        # A bare `import urllib` does not load the `request` submodule;
        # import it explicitly instead of relying on another module having
        # done so.
        import urllib.request

        self.login = login
        self.password = password

        # Simulate browser with cookies enabled
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPRedirectHandler(),
            urllib.request.HTTPHandler(debuglevel=0),
            urllib.request.HTTPSHandler(debuglevel=0),
            urllib.request.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', 'Mozilla/5.0')
        ]

        # Login
        self.loginPage()

        title = self.loadTitle()
        print(title)

    def loadPage(self, url, data=None):
        """
        Load ``url`` and return the response body decoded as text.

        ``data``, when given, is sent as the request body (which makes the
        request a POST). The loaded page is echoed to stdout. On any error
        the exception is printed and ``None`` is returned; the load is not
        retried.
        """
        try:
            if data is not None:
                response = self.opener.open(url, data)
            else:
                response = self.opener.open(url)
            # Decode the bytes. str() on each bytes line produced "b'...'"
            # reprs with escaped newlines and non-ASCII bytes.
            content = response.read().decode('utf-8', errors='replace')
            print("Page loaded: %s \n Content: %s \n" % (url, content))
            return content
        except Exception as e:
            print("Exception on %s load: %s" % (url, e))
            return None

    def loadSoup(self, url, data=None):
        """
        Combine loading of URL, HTML, and parsing with BeautifulSoup

        Raises RuntimeError when the page could not be loaded, instead of
        handing ``None`` to BeautifulSoup and failing there with an unrelated
        TypeError.
        """
        html = self.loadPage(url, data)
        if html is None:
            raise RuntimeError("Could not load %s" % url)
        soup = BeautifulSoup(html, "html5lib")
        return soup

    def loginPage(self):
        """
        Handle login. This should populate our cookie jar.
        """
        # See __init__: the submodule must be imported explicitly.
        import urllib.parse

        # The login page carries three hidden inputs that are posted back
        # together with the credentials.
        soup = self.loadSoup("https://www.linkedin.com/login")
        loginCsrfParam = soup.find("input", {"name": "loginCsrfParam"})['value']
        csrfToken = soup.find("input", {"name": "csrfToken"})['value']
        sIdString = soup.find("input", {"name": "sIdString"})['value']
        print("loginCsrfParam: %s" % loginCsrfParam)
        print("csrfToken: %s" % csrfToken)
        print("sIdString: %s" % sIdString)
        login_data = urllib.parse.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': loginCsrfParam,
            'csrfToken': csrfToken,
            'sIdString': sIdString
        }).encode('utf8')

        # Only the cookies set by this response matter; the body is not used.
        self.loadPage("https://www.linkedin.com/checkpoint/lg/login-submit", login_data)

    def loadTitle(self):
        """Return the <title> element of the feed page."""
        soup = self.loadSoup("https://www.linkedin.com/feed/")
        return soup.find("title")
parser = LinkedInParser(username, password)
The OP's solution worked for me with only a very slight modification.
Change 'session-key' to 'session_key' and change 'session-password' to 'session_password'.
Other than that, the code is good as it stands.