I am working on a small project that retrieves the list of accounts a given Instagram user follows. I have this working flawlessly as a script using a function, but since I plan to turn it into an actual program I decided to write a class. I believe I am using "self" in all the right places, yet I am failing to see why I am getting this NameError. Here is my code:
# Library imports
import requests
import json
import time
# Class declaration
class NodesCursor:
    # Class variables
    # Login url
    LOGIN_URL = 'https://www.instagram.com/accounts/login/ajax/'
    # Referer url
    REFERER_URL = 'https://www.instagram.com/accounts/login/'
    # User agent
    USER_AGENT = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'

    # Class constructor
    def __init__(self, USERNAME, PASSWD):
        # Login username
        self.USERNAME = USERNAME
        # Login password
        self.PASSWD = PASSWD
        # Creating a session
        self.session = requests.Session()
        # Get request to login url
        self.req = session.get(LOGIN_URL)
        # Setting user agent for session header
        self.session.headers = {'user-agent': USER_AGENT}
        # Setting referer url for session header
        self.session.headers.update({'Referer': REFERER_URL})
        # Updating session header with x-csrftoken cookie
        self.session.headers.update({'x-csrftoken': self.req.cookies['csrftoken']})
        # Login data for website
        self.login_data = {'username': self.USERNAME, 'password': self.PASSWD}
        # Login with a post request
        self.login = session.post(LOGIN_URL, data=self.login_data, allow_redirects=True)
        # Updating the session with x-csrftoken cookie
        self.session.headers.update({'x-csrftoken': self.login.cookies['csrftoken']})
    # Function to parse following
    def parse(self):
        # An array of usernames
        usernames = []
        # Variable to handle continuous scrolling
        has_next_page = True
        # Variable to handle continuous scrolling
        end_cursor = None
        # Loop to handle the scrolling to get the needed data
        while has_next_page == True:
            # Sleep for 30 seconds to not get rate limited
            #time.sleep(30)
            # Query url
            queryUrl = "https://www.instagram.com/graphql/query/"
            # Parameters for the get request
            payload = {"query_hash":"9335e35a1b280f082a47b98c5aa10fa4", "id":"8164444379","first":24, "after": end_cursor}
            # Variable for GETting all of the user's following
            following = self.session.get(queryUrl, params=payload).json()
            # Parsing the node to check what has_next_page is set to
            has_next_page = following['data']['user']['edge_follow']['page_info']['has_next_page']
            # Parse all user followings until there are no more
            if has_next_page == True or has_next_page == False:
                # Parsing end cursor id
                end_cursor = following['data']['user']['edge_follow']['page_info']['end_cursor']
                # Sleep for 30 seconds to not get rate limited
                time.sleep(30)
            # Parsing to get to username node
            userList = following['data']['user']['edge_follow']
            # Loop to iterate through all of the names
            for eachName in userList['edges']:
                # Add each name to the array
                usernames.append(eachName['node']['username'])
        # Print the array of usernames, along with the length
        print(usernames)
        print(len(usernames))
if __name__ == '__main__':
    checkFollowing = NodesCursor('username', 'password')
    checkFollowing().parse()
Error:
Traceback (most recent call last):
  File "test.py", line 115, in <module>
    turboOne = NodesCursor('moola.ig', 'yeet1234')
  File "test.py", line 42, in __init__
    self.req = session.get(LOGIN_URL)
NameError: name 'session' is not defined
As I said, I think I am using "self" correctly, but it's possible that's where the error is coming from; I'm not sure. Any help is greatly appreciated.
You’re missing the self. when accessing session:
# Creating a session
self.session = requests.Session()
# Get request to login url
self.req = self.session.get(LOGIN_URL)
To fix the error with LOGIN_URL (it is a class attribute, so access it through the class, or through self, which falls back to the class):
self.req = self.session.get(NodesCursor.LOGIN_URL)
Try replacing
self.req = session.get(LOGIN_URL)
With
self.req = self.session.get(LOGIN_URL)
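Putting the two fixes together, the constructor would look roughly like this (the same code as in the question, only with the session and the class attributes accessed through self / NodesCursor):

def __init__(self, USERNAME, PASSWD):
    # Login username
    self.USERNAME = USERNAME
    # Login password
    self.PASSWD = PASSWD
    # Creating a session
    self.session = requests.Session()
    # Get request to login url, made through the session stored on self
    self.req = self.session.get(NodesCursor.LOGIN_URL)
    # Setting user agent and referer for the session header
    self.session.headers = {'user-agent': NodesCursor.USER_AGENT}
    self.session.headers.update({'Referer': NodesCursor.REFERER_URL})
    # Updating session header with x-csrftoken cookie
    self.session.headers.update({'x-csrftoken': self.req.cookies['csrftoken']})
    # Login data for website
    self.login_data = {'username': self.USERNAME, 'password': self.PASSWD}
    # Login with a post request, again through self.session
    self.login = self.session.post(NodesCursor.LOGIN_URL, data=self.login_data, allow_redirects=True)
    # Updating the session with x-csrftoken cookie
    self.session.headers.update({'x-csrftoken': self.login.cookies['csrftoken']})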
Related
I'd like to check for new posts at a set interval (using apscheduler) on a site that requires login, and be notified about them through a Telegram bot.
import requests
from bs4 import BeautifulSoup
import os
import telegram
import sys
from apscheduler.schedulers.blocking import BlockingScheduler

BASE_DIR = os.path.dirname(os.path.abspath(__file__))

def scraping():
    headers = {'User-Agent':'Mozilla/5.0'}
    LOGIN_URL = 'Login page url'
    LOGIN_DATA = {
        "user_id":"id",
        "password":"pw",
        "keep_signed":"Y"
    }

    with requests.Session() as s:
        login_req = s.post(LOGIN_URL, data=LOGIN_DATA, headers=headers)
        url = "address"
        req = s.get(url, headers=headers)

    html = req.text
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.select('#css values')
    latest_title = title[0].text

    token = "certain value"
    bot = telegram.Bot(token=token)
    chat_id = 'id'

    with open(os.path.join(BASE_DIR, 'latest.txt'), 'r+') as f_read:
        before = f_read.readline()
        if before != latest_title:
            bot.sendMessage(chat_id=chat_id, text=latest_title)
        f_read.close()

    with open(os.path.join(BASE_DIR, 'latest.txt'), 'w+') as f_write:
        f_write.write(latest_title)
        f_write.close()

scheduler = BlockingScheduler()
scheduler.add_job(scraping, 'interval', seconds=30)
scheduler.start()
With this code, the login is repeated on every interval, which is inefficient.
How can I check the posts repeatedly while keeping the session alive with only one login?
I've had a similar issue before, and solved it by storing the session as a pickled object in redis.
When you try to login, get the pickled session, unpickle, and then try to use it. If it is no longer a valid session (for example, they time out your login session on the api), then create a new session.
something along these lines might work:
import pickle
import time

import redis

redis_client = redis.Redis(host='localhost', port=6379, db=0)
conn = None

def connect():
    global conn
    if conn is None:
        conn = # your login code here
        redis_client.set(
            "connection", pickle.dumps(conn)  # your session object
        )

connection = redis_client.get("connection")
conn = pickle.loads(connection) if connection else None
connect()

# Make sure the connection is actually usable; retry for a while if it is not.
timeout = time.time() + 60 * 3  # 3 mins from now
while True:
    try:
        connected = # code to check if you are connected.. for example get a url.
        if not connected:
            raise AssertionError()
        break
    except (AssertionError, ConnectionResetError) as e:
        if time.time() <= timeout:
            time.sleep(30)  # wait 30 sec before retrying
            # recreate login
            connect()
            continue
        elif time.time() > timeout:
            raise ValueError("Connection failed after timeout.")
        else:
            raise e
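To make that idea concrete for the requests.Session used in the question, a minimal sketch could look like the following (the login URL, credentials, probe URL and redis settings are placeholders modelled on the question, and whether a restored session is still valid ultimately depends on the site's cookie lifetime):

import pickle

import redis
import requests

redis_client = redis.Redis(host='localhost', port=6379, db=0)

LOGIN_URL = 'Login page url'   # placeholder, as in the question
LOGIN_DATA = {"user_id": "id", "password": "pw", "keep_signed": "Y"}
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def get_session():
    # Reuse a previously pickled session if one exists and still works.
    raw = redis_client.get('session')
    if raw is not None:
        session = pickle.loads(raw)
        # Probe a page that requires login; the exact "am I still logged in"
        # check is site-specific (status code, redirect, page content, ...).
        probe = session.get('address', headers=HEADERS)   # placeholder URL
        if probe.ok:
            return session
    # No stored session, or it no longer works: log in again and store it.
    session = requests.Session()
    session.post(LOGIN_URL, data=LOGIN_DATA, headers=HEADERS)
    redis_client.set('session', pickle.dumps(session))
    return session

The scheduled scraping() job can then call get_session() instead of logging in itself, so a fresh login only happens when the stored session has gone stale.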
I want to get a webpage's resource content using Python via the Chrome Debugging Protocol. On this page (method getResourceContent) I noticed the method getResourceContent, which needs the params frameId and url. I think this method is what I need.
So this is what I did:
1. Start Chrome as a server: .\chrome.exe --remote-debugging-port=9222
2. Write Python test code:
# coding=utf-8
"""
chrome --remote-debugging api test
"""
import json
import requests
import websocket
import pdb

def send():
    geturl = requests.get('http://localhost:9222/json')
    websocketURL = json.loads(geturl.content)[0]['webSocketDebuggerUrl']

    request = {}
    request['id'] = 1
    request['method'] = 'Page.navigate'
    request['params'] = {"url": 'http://global.bing.com'}
    ws = websocket.create_connection(websocketURL)
    ws.send(json.dumps(request))
    res = ws.recv()
    ws.close()
    print res

    frameId = json.loads(res)['result']['frameId']
    print frameId

    geturl = requests.get('http://localhost:9222/json')
    websocketURL = json.loads(geturl.content)[0]['webSocketDebuggerUrl']
    req = {}
    req['id'] = 1
    req['method'] = 'Page.getResourceContent'
    req['params'] = {"frameId":frameId,"url": 'http://global.bing.com'}
    header = ["User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"]
    pdb.set_trace()
    ws = websocket.create_connection(websocketURL,header=header)
    ws.send(json.dumps(req))
    ress = ws.recv()
    ws.close()
    print ress

if __name__ == '__main__':
    send()
3. Page.navigate works fine; I got something like this:
{"id":1,"result":{"frameId":"8504.2"}}
4. When I try Page.getResourceContent, this error comes back:
{"error":{"code":-32000,"message":"Agent is not enabled."},"id":1}
I tried adding a User-Agent header, but it still doesn't work.
Thanks.
The error message "Agent is not enabled" has nothing to do with the HTTP User-Agent header but refers to an agent within chrome that needs to be enabled in order to retrieve page contents.
The term "agent" is a bit misleading since the protocol documentation speaks about domains which need to be enabled in order to debug them (the term "agent" refers to the way this is implemented in Chrome internally, I suppose)
So, the question is which domain does need to be enabled in order to access the page contents? In hindsight it is quite obvious: the Page domain needs to be enabled as we are calling a method in this domain. I only found this out after stumbling over this example, though.
Once I added the Page.enable request to the script to activate the Page domain, the error message disappeared. However, I encountered two other problems:
The websockets connection needs to be kept open between requests as Chrome keeps some state between invocations (such as whether the agent is enabled)
When navigating to http://global.bing.com/ the browser is redirected to http://www.bing.com/ (at least it is on my computer). This causes Page.getResourceContent to fail to retrieve the resource because the requested resource http://global.bing.com/ is not available.
After fixing these issues I was able to retrieve the page content. This is my code:
# coding=utf-8
"""
chrome --remote-debugging api test
"""
import json
import requests
import websocket

def send():
    # Setup websocket connection:
    geturl = requests.get('http://localhost:9222/json')
    websocketURL = json.loads(geturl.content)[0]['webSocketDebuggerUrl']
    ws = websocket.create_connection(websocketURL)

    # Navigate to global.bing.com:
    request = {}
    request['id'] = 1
    request['method'] = 'Page.navigate'
    request['params'] = {"url": 'http://global.bing.com'}
    ws.send(json.dumps(request))
    result = ws.recv()
    print "Page.navigate: ", result
    frameId = json.loads(result)['result']['frameId']

    # Enable page agent:
    request = {}
    request['id'] = 1
    request['method'] = 'Page.enable'
    request['params'] = {}
    ws.send(json.dumps(request))
    print 'Page.enable: ', ws.recv()

    # Retrieve resource contents:
    request = {}
    request['id'] = 1
    request['method'] = 'Page.getResourceContent'
    request['params'] = {"frameId": frameId, "url": 'http://www.bing.com'}
    ws.send(json.dumps(request))
    result = ws.recv()
    print "Page.getResourceContent: ", result

    # Close websocket connection
    ws.close()

if __name__ == '__main__':
    send()
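If you need the resource body itself rather than the raw JSON reply, the result of Page.getResourceContent carries content and base64Encoded fields (per the protocol documentation), so a small follow-up along these lines should work (a sketch in the same Python 2 style as the script above; result is the string returned by the last ws.recv()):

import base64

reply = json.loads(result)['result']
body = reply['content']
# Binary resources come back base64-encoded; text resources do not.
if reply.get('base64Encoded'):
    body = base64.b64decode(body)
print body[:200]  # first 200 characters of the page source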
The server I'm trying to log on to and download a file from uses Basic Auth, as I can confirm from Chrome Dev Tools and some tests. So I wrote the code below; a bad example of OOP perhaps, but it should make sense.
import urllib2

class Utils(object):
    def __init__(self, username, password):
        self.username = username
        self.password = password
        self.top_level_url = 'http://test.com/'
        password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
        password_mgr.add_password(None, self.top_level_url, self.username, self.password)
        basic_auth_handler = urllib2.HTTPBasicAuthHandler(password_mgr)
        opener = urllib2.build_opener(basic_auth_handler)
        urllib2.install_opener(opener)

    def download(self, filename):
        url = self.top_level_url + filename
        req = urllib2.Request(url)
        try:
            response = urllib2.urlopen(req)
            return response
        except urllib2.HTTPError as e:
            print e.headers
            raise
Strange things happen when I initialize a Utils object and download the file repeatedly:
u = Utils('username', 'password')
index = 0
while 1:
    resp = u.download('file.txt')
    index += 1
    time.sleep(1)
The script works for the first 5 downloads, but on the 6th it raises HTTPError 401. However, if I change the code to add a request header containing 'Authorization: Basic ***' instead of using HTTPBasicAuthHandler, it works every time... So is something wrong with my code, or with the server-side setup?
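For comparison, the manual-header variant mentioned above would look roughly like this (a sketch only; the URL and credentials are placeholders, not code from the question):

import base64
import urllib2

url = 'http://test.com/file.txt'  # placeholder: top_level_url + filename
# Build the Basic Auth header by hand instead of relying on HTTPBasicAuthHandler
token = base64.b64encode('username:password')
req = urllib2.Request(url)
req.add_header('Authorization', 'Basic %s' % token)
response = urllib2.urlopen(req)

One relevant difference: HTTPBasicAuthHandler only sends the credentials after the server answers with a 401 challenge, while this variant sends them pre-emptively on every request, which may explain why the two behave differently against the same server.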
I tried logging into Quora using Python, but it gives me the following error.
urllib2.HTTPError: HTTP Error 500: Internal Server Error
This is my code so far. I also work behind a proxy.
import urllib2
import urllib
import re
import cookielib

class Quora:
    def __init__(self):
        '''Initialising and authentication'''
        auth = 'http://name:password#proxy:port'
        cj = cookielib.CookieJar()
        logindata = urllib.urlencode({'email' : 'email' , 'password' : 'password'})
        handler = urllib2.ProxyHandler({'http' : auth})
        opener = urllib2.build_opener(handler , urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        a = urllib2.urlopen('http://www.quora.com/' , logindata)

def main():
    Quora()

if __name__ == '__main__':
    main()

Can someone please point out what is wrong?
Try something like this:
import random
import urllib
import urllib2
import cookielib

# Load proxies
proxies = []
proxies_fp = open('proxies.txt', 'r')  # A list of proxies
for proxy in proxies_fp:
    proxies.append(proxy)

cookiejar = cookielib.CookieJar()

def perform_request(url, opener, credentials):
    # Instantiate our request object
    request = urllib2.Request(url)
    # Perform the request, returning a pointer to the result set.
    result = opener.open(request, credentials)
    return result

credentials = {
    'username' : 'username',
    'password' : 'password'
}
encoded_credentials = urllib.urlencode(credentials)

def main():
    # Get random proxy
    proxy = random.choice(proxies)
    # Install our proxy
    opener = urllib2.build_opener(
        urllib2.ProxyHandler({'http': proxy}),
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
        urllib2.HTTPSHandler(debuglevel=0),
        urllib2.HTTPCookieProcessor(cookiejar),
    )
    urllib2.install_opener(opener)

    url = 'http://www.quora.com/'  # the page from the question
    a = perform_request(url, opener, encoded_credentials)
--untested--
I've had to do something similar to this, and it worked for me this way. (Please note that this is NOT an exact copy of the code I used. I had to manipulate it a bit, and did NOT test it.)
Please take a look at the following Python code snippet:
import cookielib, urllib, urllib2

def login(username, password):
    cookie_jar = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    login = urllib.urlencode({'username': username, 'password': password})
    try:
        login_data = opener.open("http://www.example.com/login.php", login).read()
    except IOError:
        return 'Network Error'
    # on successful login I'm storing the 'SESSION COOKIE'
    # that the site sends in a local file called cookie.txt
    cookie_jar.save('./cookie.txt', True, True)
    return login_data

# this method is called quite some time
# after calling the above method "login"
def re_accessing_the_site():
    cookie_jar = cookielib.LWPCookieJar()
    # Here I'm re-loading the saved cookie
    # from the file into the cookie jar
    cookie_jar.revert('./cookie.txt', True, True)
    # there's only 1 cookie in the cookie jar
    for Cookie in cookie_jar:
        print 'Expires : ', Cookie.expires         ## prints None
        print 'Discard : ', Cookie.discard         ## prints True, meaning the cookie is a
                                                   ## session cookie
        print 'Is Expired : ', Cookie.is_expired() ## prints False
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    try:
        data = opener.open("http://www.example.com/send.php")
        # Sometimes the opening fails as the cookie has expired
        # & sometimes it doesn't. Here I want a way to determine
        # whether the (session) cookie is still alive
    except IOError:
        return False
    return True
First I'm calling the method login and saving the retrieved cookie (which is a session cookie) to a local file called cookie.txt.
Then, after quite some time (15-20 mins), I'm calling the other method re_accessing_the_site. This time I'm also reloading the previously saved cookie into the cookie jar.
Sometimes it works fine, but sometimes it blocks me from accessing the site (as the session cookie has expired). So all I need is a way to check whether the cookie is still alive during the call...
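Since a session cookie carries no expiry information on the client side (Cookie.expires is None, as the printout above shows), only the server can say whether it is still valid. One common workaround, sketched below (the probe URL and the "redirected to the login page" heuristic are assumptions about the site, not something from the snippet above), is to make a cheap request to a page that requires login and check where you end up:

def session_is_alive(cookie_jar):
    # Probe a page that requires login (placeholder URL from the snippet above).
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    try:
        response = opener.open("http://www.example.com/send.php")
    except IOError:
        return False
    # Assumption: when the session has expired, the site redirects to
    # (or serves) its login page instead of the requested one.
    return 'login' not in response.geturl()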