Session handling in scrapy-splash with custom header

Session handling in scrapy-splash with custom header - python

I am using Scrapy with Splash via Scrapy-Splash.
I am having issues persisting my logged-in status after the initial request.
Here's my whole spider class:
import scrapy
from scrapy_splash import SplashRequest
import logging
class MasterSpider(scrapy.Spider):
    """Log in through Splash, then submit the fruit/type selection form.

    Both requests are sent to Splash's ``execute`` endpoint with a Lua script.
    The login script returns the cookies it ends up with; the selection script
    restores the cookies it is given via ``splash:init_cookies`` before it
    loads its page.
    """

    name = 'master'
    allowed_domains = ['www.somesite.com']
    start_url = 'https://www.somesite.com/login'

    # Lua: set custom headers, open the login page, type the credentials,
    # click the login button and return the rendered HTML plus the cookies.
    login_script = '''
function main(splash, args)
splash.private_mode_enabled = false
my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0'
headers = {
['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
['User-Agent'] = my_user_agent,
['Accept-Language'] = 'en-US;q=0.9,en;q=0.8',
}
splash:set_custom_headers(headers)
url = args.url
assert(splash:go(url))
assert(splash:wait(2))
-- username input
username_input = assert(splash:select('#username'))
username_input:focus()
username_input:send_text('myusername')
assert(splash:wait(0.3))
-- password input
password_input = assert(splash:select('#password'))
password_input:focus()
password_input:send_text('mysecurepass')
assert(splash:wait(0.3))
-- the login button
login_btn = assert(splash:select('#login_btn'))
login_btn:mouse_click()
assert(splash:wait(4))
return {
html = splash:html(),
cookies = splash:get_cookies(),
}
end
'''

    # Lua: restore the cookies passed in the request arguments, set the same
    # custom headers, pick entries in the two <select> inputs with key
    # presses, submit the form and return the rendered HTML.
    fruit_selection_script = '''
function main(splash, args)
splash:init_cookies(splash.args.cookies)
splash.private_mode_enabled = false
my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0'
headers = {
['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
['User-Agent'] = my_user_agent,
['Accept-Language'] = 'en-US;q=0.9,en;q=0.8',
}
splash:set_custom_headers(headers)
url = args.url
assert(splash:go(url))
assert(splash:wait(4))
-- state select input
state_select = assert(splash:select('select#fruits'))
state_select:mouse_click()
state_select:send_keys("<Down>")
assert(splash:wait(0.2))
state_select:send_keys("<Enter>")
assert(splash:wait(0.2))
-- game select input
game_select = assert(splash:select('select#type'))
game_select:mouse_click()
game_select:send_keys("<Down>")
assert(splash:wait(0.1))
game_select:send_keys("<Up>")
assert(splash:wait(0.1))
-- the next button
login_btn = assert(splash:select('input.submit'))
login_btn:mouse_click()
assert(splash:wait(4))
return splash:html()
end
'''

    def start_requests(self):
        """Yield the single login request, rendered with ``login_script``."""
        yield SplashRequest(
            url=self.start_url,
            callback=self.post_login,
            endpoint='execute',
            args={'lua_source': self.login_script},
        )

    def post_login(self, response):
        """Dump the post-login page and request the page behind the 7th sidebar link."""
        # XPath attribute tests are written with '@' (the '#' seen previously
        # was not valid XPath).
        href = response.xpath("(//div[@id='sidebar']/ul/li)[7]/a/@href").get()
        search_link = response.urljoin(href)
        logging.info('about to fire up second splash request')
        # The with-block closes the file; no explicit close() is needed.
        with open('temp.html', 'w') as f:
            f.write(response.text)
        # The class defines ``fruit_selection_script``; there is no
        # ``game_selection_script`` attribute.
        yield SplashRequest(
            url=search_link,
            callback=self.search,
            endpoint='execute',
            args={'wait': 3, 'lua_source': self.fruit_selection_script},
        )

    def search(self, response):
        """Write the response received after the selection form to disk."""
        logging.info('hey from search!')
        with open('post_search_response.html', 'w') as f:
            f.write(response.text)

    def post_search(self, response):
        """Write a response to the same dump file as ``search``.

        No request in this class uses this method as its callback.
        """
        logging.info('hey from post_search!')
        with open('post_search_response.html', 'w') as f:
            f.write(response.text)

    def parse(self, response):
        """Unused default callback; every request names an explicit callback."""
        pass
The scrapy-splash docs say:
SplashRequest sets session_id automatically for /execute endpoint, i.e. cookie handling is enabled by default if you use SplashRequest, /execute endpoint and a compatible Lua rendering script.
If you want to start from the same set of cookies, but then 'fork' sessions set request.meta['splash']['new_session_id'] in addition to session_id. Request cookies will be fetched from cookiejar session_id, but response cookies will be merged back to the new_session_id cookiejar.
As you can see, I am always using the execute endpoint, so I should get cookie handling by default? Yet it isn't working, I'm not sure why, but I wonder if it is because I am setting the custom header for the user-agent and language?
Right now, when the spider comes to run the 2nd script (fruit_selection_script) I get a 403 Forbidden error.
What am I missing?

Related

CrawlSpider with Splash, only first link is crawled & processed

I am using Scrapy with Splash. Here is what I have in my spider:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest
import logging
class MainSpider(CrawlSpider):
    """CrawlSpider that logs in through Splash and follows one sidebar link.

    The start request runs the Lua login script on Splash's ``execute``
    endpoint; requests built from the rule are sent to ``render.html``.
    """

    name = 'main'
    allowed_domains = ['www.somesite.com']

    # Lua: set custom headers, open the page, type the credentials, click the
    # login button and return the rendered HTML.
    script = '''
function main(splash, args)
splash.private_mode_enabled = false
my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
headers = {
['User-Agent'] = my_user_agent,
['Accept-Language'] = 'en-GB,en-US;q=0.9,en;q=0.8',
['Referer'] = 'https://www.google.com'
}
splash:set_custom_headers(headers)
url = args.url
assert(splash:go(url))
assert(splash:wait(2))
-- username input
username_input = assert(splash:select('#username'))
username_input:focus()
username_input:send_text('myusername')
assert(splash:wait(0.3))
-- password input
password_input = assert(splash:select('#password'))
password_input:focus()
password_input:send_text('mysecurepass')
assert(splash:wait(0.3))
-- the login button
login_btn = assert(splash:select('#login_btn'))
login_btn:mouse_click()
assert(splash:wait(4))
return splash:html()
end
'''

    rules = (
        # XPath attribute tests are written with '@' ('#id' is not valid XPath).
        Rule(
            LinkExtractor(restrict_xpaths="(//div[@id='sidebar']/ul/li)[7]/a"),
            callback='parse_item',
            follow=True,
            process_request='use_splash',
        ),
    )

    def start_requests(self):
        """Yield the login request, rendered with the Lua ``script``."""
        yield SplashRequest(
            url='https://www.somesite.com/login',
            callback=self.post_login,
            endpoint='execute',
            args={
                'lua_source': self.script,
            },
        )

    def use_splash(self, request):
        """Mark a rule-generated request for rendering via Splash ``render.html``."""
        request.meta.update(splash={
            'args': {
                'wait': 1,
            },
            'endpoint': 'render.html',
        })
        return request

    def _requests_to_follow(self, response):
        """Yield one processed request per not-yet-seen link matched by the rules.

        Responses that are neither plain HTML responses nor Splash JSON/text
        responses produce no requests.
        """
        # Imported here because the module-level imports do not provide these
        # names; without them the isinstance() check raises NameError.
        from scrapy.http import HtmlResponse
        from scrapy_splash import SplashJsonResponse, SplashTextResponse

        if not isinstance(response, (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response) if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                # NOTE(review): this passes only the request; some Scrapy
                # releases call process_request with (request, response) --
                # confirm against the installed version.
                yield rule.process_request(r)

    def post_login(self, response):
        """Write the post-login page to disk."""
        # NOTE(review): the start request names this method as its callback
        # and this method yields nothing, so no further requests are produced
        # from the login response. Presumably this is why the rule's
        # ``parse_item`` callback never runs -- confirm against the
        # CrawlSpider documentation.
        logging.info('hey from login!')
        # The with-block closes the file; no explicit close() is needed.
        with open('post_login_response.txt', 'w') as f:
            f.write(response.text)

    def parse_item(self, response):
        """Write a rule-followed page to disk."""
        logging.info('hey from parse_item!')
        with open('post_search_response.txt', 'w') as f:
            f.write(response.text)
I came across this and I've tried to implement things the same way, but still, parse_item is never run. In the logs, I never get hey from parse_item!
I'm not sure what I'm missing. The full log output can be found here

I ditched the Crawl Spider and converted to a regular spider, and things are working fine now.

Unable to expand more... python

I can scrape all the reviews from the web page, but I am not getting the full content: only about half of each review's text is scraped. I need to scrape the full content.
from bs4 import BeautifulSoup
import requests
import re

# Shared HTTP session used by every request in this script.
s = requests.Session()
def get_soup(url):
    """Fetch *url* with the shared session and return the parsed page.

    Returns a ``BeautifulSoup`` document when the server answers 200.  For
    any other status the code is printed and ``None`` is returned, which
    callers treat as "no response".
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
    r = s.get(url, headers=headers)
    if r.status_code != 200:
        print('status code:', r.status_code)
        return None
    return BeautifulSoup(r.text, 'html.parser')
def parse(url, response, num_reviews=20, page_size=5):
    """Walk the paginated review pages of one hotel and parse each of them.

    Args:
        url: URL of the hotel's first review page.
        response: parsed first page; a falsy value means the download failed
            and nothing is done.
        num_reviews: how many reviews to cover.  It is a fixed number rather
            than the count shown on the page (the original count-scraping
            code was disabled).
        page_size: step between page offsets.
    """
    if not response:
        print('no response:', url)
        return

    print('num_reviews:', num_reviews, type(num_reviews))

    # create template for urls to pages with reviews
    # (the replaced file name is specific to this one hotel)
    url = url.replace('Hilton_New_York_Grand_Central-New_York_City_New_York.html', 'or{}-Hilton_New_York_Grand_Central-New_York_City_New_York.html')
    print('template:', url)

    # request and parse every page of reviews
    for offset in range(0, num_reviews, page_size):
        url_ = url.format(offset)
        print('url:', url_)
        parse_reviews(url_, get_soup(url_))
def parse_reviews(url, response):
    """Extract every review on one parsed page into the global ``results`` list.

    Args:
        url: URL of the page, used only for progress output.
        response: parsed page; a falsy value means the download failed and
            nothing is extracted.

    Each review becomes a dict that is appended to ``results`` and printed.
    The body is read from the ``partial_entry`` paragraph.
    """
    print('review:', url)
    if not response:
        print('no response:', url)
        return

    for review in response.find_all('div', class_='review-container'):
        item = {
            'hotel_name': response.find('h1', class_='heading_title').text,
            'review_title': review.find('span', class_='noQuotes').text,
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='relativeDate')['title'],
            'reviewer_name': review.find('span', class_='scrname').text,
            # second CSS class of the rating span with its first 7 characters dropped
            'bubble_rating': review.select_one('div.reviewItemInline span.ui_bubble_rating')['class'][1][7:],
        }
        results.append(item)
        for key, val in item.items():
            print(key, ':', val)
        print('----')
# Hotels to scrape; the commented-out entries are kept for reference.
start_urls = [
    'https://www.tripadvisor.in/Hotel_Review-g60763-d93339-Reviews-Hilton_New_York_Grand_Central-New_York_City_New_York.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d102542-Reviews-Courtyard_Philadelphia_Airport-Philadelphia_Pennsylvania.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d122332-Reviews-The_Ritz_Carlton_Philadelphia-Philadelphia_Pennsylvania.html',
]

# Filled by parse_reviews() with one dict per review.
results = []

for url in start_urls:
    parse(url, get_soup(url))

import pandas as pd

df = pd.DataFrame(results)  # <--- convert list to DataFrame
df.to_csv('output.csv')
I am getting an output sample in csv file from review like:
I went on a family trip and it was amazing, I hope to come back soon. The room was small but what can you expect from New York. It was close to many things and the staff was perfect.I will come back again soon.More...
I just want to expand that "More..." part so I get the whole review. I need help; I really have no clue how to do it. Please help.
I have written one more piece of code, but I am unable to pull the id from the next page. The code is given below:
import re
import urllib
#import webbrowser

import requests
from bs4 import BeautifulSoup

s = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}

# Matches element ids of the form "UID_<x>-SRC_<y>"; group 2 is the part
# that is inserted into the single-review URL below.
pattern = re.compile(r"UID_(\w+)\-SRC_(\w+)")

for i in range(0, 10, 5):
    url = ("https://www.tripadvisor.in/Hotel_Review-g60763-d93339-Reviews-or{}-Hilton_New_York_Grand_Central-New_York_City_New_York.html").format(i)
    print(url)
    r = s.get(url, headers=headers)
    html = BeautifulSoup(r.text, 'html.parser')

    # Search the document that was just parsed (``html``); there is no
    # ``soup`` variable in this script.
    review_div = html.find("div", id=pattern)
    if review_div is None:
        print('no review id found on:', url)
        continue

    # ``element_id`` instead of ``id`` so the builtin is not shadowed.
    element_id = review_div["id"]
    uid = pattern.search(element_id).group(2)
    print(uid)

    url1 = "https://www.tripadvisor.in/ShowUserReviews-g60763-d93339-r" + str(uid) + "-Hilton_New_York_Grand_Central-New_York_City_New_York.html#CHECK_RATES_CONT"
    print(url1)
    url2 = ('"' + url1 + '"')
    print(url2)

The site uses ajax to expand the review content. The full content is not downloaded until the More link is clicked.
One way to access the content would be to figure out the ajax request format and then issue a HTTP request for the same. That might be difficult, perhaps not.
Another, easier, way is by noticing that the review title is a clickable link which loads the full review in a new page. You can therefore scrape the URL for each review and send a similar GET request. Then scrape the data from the response.

Python3 crawl info after login

I am trying to crawl my personal information from a website that requires login.
My work is below:
#coding:utf-8
import os, time
import urllib.request
import http.cookiejar
import requests
from bs4 import BeautifulSoup
# =================== urls ===================
login_url = 'https://passport.jd.com/uc/login'
info_url = 'http://i.jd.com/user/info'

# =================== session ===================
# Every request below goes through this one session so that cookies set by
# the login page and by the login POST are sent with the later requests.
# Fetching pages with urllib.request.urlopen would not send these cookies.
session = requests.session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
}

# =================== post_data gathering ===================
login = session.get(login_url, headers=headers)
loginSoup = BeautifulSoup(login.text, 'html.parser')
form_inputs = loginSoup.find_all('form')[0].find_all('input')
uuid = form_inputs[0]['value']
# The 7th input carries a name/value pair that is echoed back on login.
clrName = form_inputs[6]['name']
clrValue = form_inputs[6]['value']

# jd login page captcha
checkPicUrl = loginSoup.find_all('div', id='o-authcode')[0].find_all('img')[0]['src2']
print(checkPicUrl)
# Download the captcha once and save exactly the image that was received.
image = session.get('http:' + checkPicUrl + '&yys=' + str(int(time.time() * 1000)), headers=headers)
if image.status_code == 200:
    captcha_path = os.path.abspath('checkPic.jpg')
    with open(captcha_path, 'wb') as f:
        f.write(image.content)
    # Open the file that was just written (os.startfile exists on Windows only).
    os.startfile(captcha_path)
else:
    print('unable to get image!')
checkCode = input('enter captcha: ')

# =================== login ===================
login_info = {
    'chkRememberMe': 'on',
    'loginname': '<USERSNAME>',
    'nloginpwd': '<PASSWORD>',
    'loginpwd': '<PASSWORD>',
    'machineNet': '',
    'machineCpu': '',
    'machineDisk': '',
    str(clrName): str(clrValue),
    'uuid': uuid,
    'authcode': checkCode,
}

# Pass the dict itself: requests form-encodes it and sets the form
# content type, which a pre-encoded byte string would not get.
passport = session.post(login_url, data=login_info, headers=headers)
print(passport)  # <Response [200]>

# ============== page after login ==============
info_page = session.get(info_url, headers=headers)
info = info_page.content
print(info)
The program runs fine with whatever captcha I enter; it returns <Response [200]>. I expected to see the raw HTML page after login, but it shows nothing.
I don't think this program really logs me into the account. Can anyone tell me how to do that?

Logging in to LinkedIn with python requests sessions

I'm trying to log into LinkedIn using Python requests:
import sys
import requests
from BeautifulSoup import BeautifulSoup
# Form fields posted to the login endpoint.
payload = {
    'session-key': 'user@email.com',
    'session-password': 'password',
}

URL = 'https://www.linkedin.com/uas/login-submit'

# One session for both requests so cookies from the POST are sent with the GET.
s = requests.session()
s.post(URL, data=payload)

r = s.get('http://www.linkedin.com/nhome')
soup = BeautifulSoup(r.text)
# The page title is used as the indicator of whether the login succeeded.
print(soup.find('title'))
I can't seem to log in using this method. I even tried playing with csrf etc. in the payload, but aren't sessions supposed to take care of that for you?
Note about the last line: I use the title to check if I've successfully logged in. (I should see "Welcome! | LinkedIn" if I have signed in, instead I see "World's Largest Professional Network | LinkedIn"
Am I missing something?

I modified a web-scraping template I use for most of my Python-based scraping needs to fit your needs. Verified it worked with my own login info.
The way it works is by mimic-ing a browser and maintaining a cookieJar that stores your user session. Got it to work with BeautifulSoup for you as well.
Note: This is a Python2 version. I added a working Python3 example further below by request.
import cookielib
import os
import urllib
import urllib2
import re
import string
from BeautifulSoup import BeautifulSoup
# Placeholder credentials and the file in which the cookie jar is persisted.
username = "user@email.com"
password = "password"
cookie_filename = "parser.cookies.txt"
class LinkedInParser(object):
    """Log in to LinkedIn with a cookie-aware urllib2 opener (Python 2).

    Constructing an instance loads any saved cookies, performs the login,
    prints the title of the feed page and saves the cookie jar.
    """

    def __init__(self, login, password):
        """ Start up... """
        self.login = login
        self.password = password

        # Simulate browser with cookies enabled
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib2.build_opener(
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                            'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]

        # Login
        self.loginPage()

        title = self.loadTitle()
        print(title)

        self.cj.save()

    def loadPage(self, url, data=None, retries=3):
        """
        Utility function to load HTML from URLs for us, retrying on failure.

        Args:
            url: address to open.
            data: optional encoded body; when given it is sent with the request.
            retries: how many additional attempts to make after the first
                one fails.

        Returns the page content as one string.  Raises the last error once
        all attempts have failed, instead of retrying forever.
        """
        last_error = None
        for _ in range(max(retries, 0) + 1):
            try:
                if data is not None:
                    response = self.opener.open(url, data)
                else:
                    response = self.opener.open(url)
                return ''.join(response.readlines())
            except Exception as e:
                # Only Exception is caught, so Ctrl-C still interrupts the loop.
                last_error = e
        raise last_error

    def loginPage(self):
        """
        Handle login. This should populate our cookie jar.
        """
        html = self.loadPage("https://www.linkedin.com/")
        soup = BeautifulSoup(html)
        # The CSRF value embedded in the home page is posted back with the
        # credentials.
        csrf = soup.find(id="loginCsrfParam-login")['value']

        login_data = urllib.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': csrf,
        })

        self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)
        return

    def loadTitle(self):
        """Return the <title> element of the feed page."""
        html = self.loadPage("https://www.linkedin.com/feed/")
        soup = BeautifulSoup(html)
        return soup.find("title")
parser = LinkedInParser(username, password)
Update June 19, 2014: Added parsing for CSRF token from homepage for use in updated login process.
Update July 23, 2015: Adding a Python 3 example here. Basically requires substituting library locations and removing deprecated methods. It's not perfectly formatted or anything, but it functions. Sorry for the rush job. In the end the principles and steps are identical.
import http.cookiejar as cookielib
import os
import urllib
import re
import string
from bs4 import BeautifulSoup
# Placeholder credentials and the file in which the cookie jar is persisted.
username = "user@email.com"
password = "password"
cookie_filename = "parser.cookies.txt"
class LinkedInParser(object):
    """Log in to LinkedIn with a cookie-aware urllib opener (Python 3).

    Constructing an instance loads any saved cookies, performs the login,
    prints the title of the feed page and saves the cookie jar.
    """

    def __init__(self, login, password):
        """ Start up... """
        # ``import urllib`` alone does not load the ``request`` submodule.
        import urllib.request

        self.login = login
        self.password = password

        # Simulate browser with cookies enabled
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPRedirectHandler(),
            urllib.request.HTTPHandler(debuglevel=0),
            urllib.request.HTTPSHandler(debuglevel=0),
            urllib.request.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                            'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]

        # Login
        self.loginPage()

        title = self.loadTitle()
        print(title)

        self.cj.save()

    def loadPage(self, url, data=None, retries=3):
        """
        Utility function to load HTML from URLs for us, retrying on failure.

        Args:
            url: address to open.
            data: optional encoded body (bytes); when given it is sent with
                the request.
            retries: how many additional attempts to make after the first
                one fails.

        Returns the page as text.  Raises the last error once all attempts
        have failed, instead of retrying forever.
        """
        last_error = None
        for _ in range(max(retries, 0) + 1):
            try:
                if data is not None:
                    response = self.opener.open(url, data)
                else:
                    response = self.opener.open(url)
                # The opener yields bytes; decode them rather than calling
                # str() on each line, which would produce "b'...'" text.
                return response.read().decode('utf-8', errors='replace')
            except Exception as e:
                last_error = e
        raise last_error

    def loadSoup(self, url, data=None):
        """
        Combine loading of URL, HTML, and parsing with BeautifulSoup
        """
        html = self.loadPage(url, data)
        soup = BeautifulSoup(html, "html5lib")
        return soup

    def loginPage(self):
        """
        Handle login. This should populate our cookie jar.
        """
        import urllib.parse

        soup = self.loadSoup("https://www.linkedin.com/")
        # The CSRF value embedded in the home page is posted back with the
        # credentials.
        csrf = soup.find(id="loginCsrfParam-login")['value']
        login_data = urllib.parse.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': csrf,
        }).encode('utf8')

        self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)
        return

    def loadTitle(self):
        """Return the <title> element of the feed page."""
        soup = self.loadSoup("https://www.linkedin.com/feed/")
        return soup.find("title")
parser = LinkedInParser(username, password)

This is a much simpler version.
import requests
from bs4 import BeautifulSoup
# A single session keeps the cookies from the home page and the login POST.
client = requests.Session()

HOMEPAGE_URL = 'https://www.linkedin.com'
LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'

# Fetch the home page and read the CSRF value embedded in it.
homepage_response = client.get(HOMEPAGE_URL)
html = homepage_response.content
soup = BeautifulSoup(html, "html.parser")
csrf_input = soup.find(id="loginCsrfParam-login")
csrf = csrf_input['value']

# Credentials plus the CSRF value form the login payload.
login_information = dict(
    session_key='Login',
    session_password='Password',
    loginCsrfParam=csrf,
)

client.post(LOGIN_URL, data=login_information)

# Placeholder: substitute the page to fetch once logged in.
client.get('Any_Linkedin_URL')

2019 Version.
Slightly revised working version that takes into account the new structure of the page to find the connection cookie, and adds the trk parameter.
import requests
from bs4 import BeautifulSoup
# Fill in the account credentials before running.
email = ""
password = ""

# A single session keeps the cookies from the home page and the login POST.
client = requests.Session()

HOMEPAGE_URL = 'https://www.linkedin.com'
LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'
# Page requested once logged in; replace with the page you actually need.
TARGET_URL = HOMEPAGE_URL + '/feed/'

html = client.get(HOMEPAGE_URL).content
soup = BeautifulSoup(html, "html.parser")

# The CSRF value lives in a hidden input on the home page; fail with a clear
# message if the page no longer contains it.
csrf_input = soup.find('input', {'name': 'loginCsrfParam'})
if csrf_input is None:
    raise RuntimeError('loginCsrfParam input not found on ' + HOMEPAGE_URL)
csrf = csrf_input.get('value')

login_information = {
    'session_key': email,
    'session_password': password,
    'loginCsrfParam': csrf,
    'trk': 'guest_homepage-basic_sign-in-submit'
}

client.post(LOGIN_URL, data=login_information)

# An empty URL makes requests raise before anything is sent, so a concrete
# address is requested instead.
response = client.get(TARGET_URL)

2020 version of @garromark's accepted solution:
import http.cookiejar as cookielib
import os
import urllib
import re
import string
from bs4 import BeautifulSoup
username = ""
password = ""
cookie_filename = "parser.cookies.txt"
class LinkedInParser(object):
    """Log in to LinkedIn with a cookie-aware urllib opener (Python 3).

    Constructing an instance loads any saved cookies, performs the login
    and prints the title of the feed page.  The cookie jar is not saved.
    """

    def __init__(self, login, password):
        """ Start up... """
        # ``import urllib`` alone does not load the ``request`` submodule.
        import urllib.request

        self.login = login
        self.password = password

        # Simulate browser with cookies enabled
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPRedirectHandler(),
            urllib.request.HTTPHandler(debuglevel=0),
            urllib.request.HTTPSHandler(debuglevel=0),
            urllib.request.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', 'Mozilla/5.0')
        ]

        # Login
        self.loginPage()

        title = self.loadTitle()
        print(title)

        # self.cj.save()

    def loadPage(self, url, data=None):
        """
        Utility function to load HTML from URLs for us.

        Returns the page as text.  If the page cannot be loaded the error is
        printed and None is returned; no retry is attempted.
        """
        try:
            if data is not None:
                response = self.opener.open(url, data)
            else:
                response = self.opener.open(url)
            # The opener yields bytes; decode them rather than calling str()
            # on each line, which would produce "b'...'" text.
            content = response.read().decode('utf-8', errors='replace')
            print("Page loaded: %s \n Content: %s \n" % (url, content))
            return content
        except Exception as e:
            print("Exception on %s load: %s" % (url, e))
            return None

    def loadSoup(self, url, data=None):
        """
        Combine loading of URL, HTML, and parsing with BeautifulSoup

        Raises RuntimeError when the page could not be loaded, so the failure
        is reported here rather than by the parser or a later lookup.
        """
        html = self.loadPage(url, data)
        if html is None:
            raise RuntimeError("could not load %s" % url)
        soup = BeautifulSoup(html, "html5lib")
        return soup

    def loginPage(self):
        """
        Handle login. This should populate our cookie jar.
        """
        import urllib.parse

        soup = self.loadSoup("https://www.linkedin.com/login")

        # Three hidden inputs of the login form are posted back with the
        # credentials.
        loginCsrfParam = soup.find("input", {"name": "loginCsrfParam"})['value']
        csrfToken = soup.find("input", {"name": "csrfToken"})['value']
        sIdString = soup.find("input", {"name": "sIdString"})['value']

        print("loginCsrfParam: %s" % loginCsrfParam)
        print("csrfToken: %s" % csrfToken)
        print("sIdString: %s" % sIdString)

        login_data = urllib.parse.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': loginCsrfParam,
            'csrfToken': csrfToken,
            'sIdString': sIdString
        }).encode('utf8')

        self.loadPage("https://www.linkedin.com/checkpoint/lg/login-submit", login_data)

    def loadTitle(self):
        """Return the <title> element of the feed page."""
        soup = self.loadSoup("https://www.linkedin.com/feed/")
        return soup.find("title")
parser = LinkedInParser(username, password)

The OP's solution worked for me with only a very slight modification.
Change 'session-key' to 'session_key' and change 'session-password' to 'session_password'.
Other than that, the code is good as it stands.

Facebook Python login script not using API

Ok, so I found a basic script to log in to facebook using python a while back. It didn't work - but after some tweaking (mainly around updating the post strings) it worked well for quite a while. Now it's stopped again - I suspect because facebook have changed their site a little.
I've tried making further tweaks having captured a login in Firefox and making sure I mimic as many of the post values etc as possible.
I need to log in to the site directly as I have a bunch of scripts that collect data that's available through a browser, but not through the API.
Having spent days trying to fix this I'm still drawing a blank... what am I missing?
import sys
import re
import urllib
import urllib2
import cookielib
import json
def main():
    """Log in to Facebook's mobile site and print an access token (Python 2).

    Reads the e-mail address and password from the command line, scrapes the
    hidden form fields from the mobile login page, submits them together
    with the credentials, then looks for ``access_token=`` in the API
    reference page.  That page is also written to ``debug.html``.
    """
    # Check the arguments
    if len(sys.argv) < 3:
        sys.exit('usage: %s EMAIL PASSWORD' % sys.argv[0])
    user = sys.argv[1]
    passw = sys.argv[2]

    # Initialize the needed modules
    CHandler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
    browser = urllib2.build_opener(CHandler)
    browser.addheaders = [('Referer', 'http://login.facebook.com'),
                          ('Content-Type', 'application/x-www-form-urlencoded'),
                          ('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7 (.NET CLR 3.5.30729)')]
    # Installing the opener makes urllib2.urlopen() below use it as well.
    urllib2.install_opener(browser)

    res = browser.open('http://m.facebook.com/index.php')
    pg = res.read()
    res.close()

    def hidden_value(name, required=True):
        """Return the value of the named form field found in the login page."""
        found = re.search(r'name="%s" value="(\w+)"' % name, pg)
        if found:
            return found.group(1)
        if required:
            raise ValueError('login page has no form field %r' % name)
        return None

    li = hidden_value('li')
    m_ts = hidden_value('m_ts')
    lsd = hidden_value('lsd')
    locale = hidden_value('locale')
    # ``pfi`` was never assigned before being used; it is now read from the
    # page like the other fields and left out when the page has no such field.
    pfi = hidden_value('post_form_id', required=False)

    # Initialize the POST data
    fields = {
        'lsd': lsd,
        'charset_test': urllib.unquote_plus('%E2%82%AC%2C%C2%B4%2C%E2%82%AC%2C%C2%B4%2C%E6%B0%B4%2C%D0%94%2C%D0%84'),
        'version': '1',
        'm_ts': m_ts,
        'li': li,
        'locale': locale,
        'signup_layout': 'header_button',
        'laststage': 'first',
        'email': user,
        'pass': passw,
        'login': 'Log in',
    }
    if pfi is not None:
        fields['post_form_id'] = pfi
    data = urllib.urlencode(fields)

    # NOTE(review): the encoded fields, including the password, are appended
    # to the URL and that URL is printed below -- confirm this is intended.
    url = 'https://login.facebook.com/login.php?login_attempt=1&non_com_login=&' + data
    res = urllib2.urlopen(url)
    print('%s' % url)
    res.close()

    # Get Access Token
    res = browser.open('http://developers.facebook.com/docs/reference/api')
    conft = res.read()

    # For Debugging
    # (the with-block closes the file; ``fh.close`` without parentheses
    # never called close())
    with open('debug.html', 'w') as fh:
        fh.write(conft)

    mat = re.search(r'access_token=(.*?)"', conft)
    if mat is None:
        print('No access token found; see debug.html')
        return
    acct = mat.group(1)
    print('Using access token: %s' % acct)

For the record, here is the working answer for the above.
#!/usr/bin/python
import mechanize
browser = mechanize.Browser()
browser.set_handle_robots(False)

# Attach the jar to the browser; creating it alone does not make the
# browser use it.
cookies = mechanize.CookieJar()
browser.set_cookiejar(cookies)

browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.41 Safari/534.7')]

# Open the mobile site, fill in the first form on the page and submit it.
browser.open("http://m.facebook.com/")
browser.select_form(nr=0)
browser.form['email'] = 'YOUR_LOGIN'
browser.form['pass'] = 'YOUR_PASSWORD'
response = browser.submit()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Session handling in scrapy-splash with custom header - python

Related

CrawlSpider with Splash, only first link is crawled & processed

Unable to expand more... python

Python3 crawl info after login

Logging in to LinkedIn with python requests sessions

Facebook Python login script not using API

Categories

Resources