How to get webpage resource content via Chrome remote debugging - Python

I want to get webpage resource content with Python via the Chrome Debugging Protocol. On the protocol documentation page for Page.getResourceContent I noticed that this method needs the params frameId and url. I think this method is what I need.
So I did the following:
1. Start Chrome as a debugging server: .\chrome.exe --remote-debugging-port=9222
2. Write Python test code:
# coding=utf-8
"""
chrome --remote-debugging api test
"""
import json
import requests
import websocket
import pdb

def send():
    geturl = requests.get('http://localhost:9222/json')
    websocketURL = json.loads(geturl.content)[0]['webSocketDebuggerUrl']
    request = {}
    request['id'] = 1
    request['method'] = 'Page.navigate'
    request['params'] = {"url": 'http://global.bing.com'}
    ws = websocket.create_connection(websocketURL)
    ws.send(json.dumps(request))
    res = ws.recv()
    ws.close()
    print res
    frameId = json.loads(res)['result']['frameId']
    print frameId
    geturl = requests.get('http://localhost:9222/json')
    websocketURL = json.loads(geturl.content)[0]['webSocketDebuggerUrl']
    req = {}
    req['id'] = 1
    req['method'] = 'Page.getResourceContent'
    req['params'] = {"frameId": frameId, "url": 'http://global.bing.com'}
    header = ["User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"]
    pdb.set_trace()
    ws = websocket.create_connection(websocketURL, header=header)
    ws.send(json.dumps(req))
    ress = ws.recv()
    ws.close()
    print ress

if __name__ == '__main__':
    send()
3. Page.navigate works fine; I got something like this:
{"id":1,"result":{"frameId":"8504.2"}}
4. When I try Page.getResourceContent, an error comes back:
{"error":{"code":-32000,"message":"Agent is not enabled."},"id":1}
I tried adding a User-Agent header, but it still doesn't work.
Thanks.

The error message "Agent is not enabled" has nothing to do with the HTTP User-Agent header but refers to an agent within Chrome that needs to be enabled in order to retrieve page contents.
The term "agent" is a bit misleading, since the protocol documentation speaks about domains which need to be enabled in order to debug them (the term "agent" refers to the way this is implemented in Chrome internally, I suppose).
So, which domain needs to be enabled in order to access the page contents? In hindsight it is quite obvious: the Page domain needs to be enabled, as we are calling a method in this domain. I only found this out after stumbling over this example, though.
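In protocol terms, enabling the domain is just one more JSON command sent over the same websocket connection, along these lines (a minimal sketch, assuming ws is the already-open connection used for Page.navigate):
# Enable the Page domain before calling Page.getResourceContent
ws.send(json.dumps({'id': 1, 'method': 'Page.enable', 'params': {}}))
print ws.recv()  # acknowledgement from Chrome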
Once I added the Page.enable request to the script to activate the Page domain, the error message disappeared. However, I encountered two other problems:
The websocket connection needs to be kept open between requests, as Chrome keeps some state between invocations (such as whether the agent is enabled).
When navigating to http://global.bing.com/ the browser is redirected to http://www.bing.com/ (at least it is on my computer). This causes Page.getResourceContent to fail to retrieve the resource because the requested resource http://global.bing.com/ is not available.
After fixing these issues I was able to retrieve the page content. This is my code:
# coding=utf-8
"""
chrome --remote-debugging api test
"""
import json
import requests
import websocket

def send():
    # Setup websocket connection:
    geturl = requests.get('http://localhost:9222/json')
    websocketURL = json.loads(geturl.content)[0]['webSocketDebuggerUrl']
    ws = websocket.create_connection(websocketURL)
    # Navigate to global.bing.com:
    request = {}
    request['id'] = 1
    request['method'] = 'Page.navigate'
    request['params'] = {"url": 'http://global.bing.com'}
    ws.send(json.dumps(request))
    result = ws.recv()
    print "Page.navigate: ", result
    frameId = json.loads(result)['result']['frameId']
    # Enable page agent:
    request = {}
    request['id'] = 1
    request['method'] = 'Page.enable'
    request['params'] = {}
    ws.send(json.dumps(request))
    print 'Page.enable: ', ws.recv()
    # Retrieve resource contents:
    request = {}
    request['id'] = 1
    request['method'] = 'Page.getResourceContent'
    request['params'] = {"frameId": frameId, "url": 'http://www.bing.com'}
    ws.send(json.dumps(request))
    result = ws.recv()
    print "Page.getResourceContent: ", result
    # Close websocket connection
    ws.close()

if __name__ == '__main__':
    send()

Related

How to conditionally disable proxies for some requests in Playwright (Python)

try:
    with sync_playwright() as p:
        driver = p.firefox.launch(headless=headless, proxy={
            "server": 'fa****y.com:10000',
            'username': 'iu***53cpeuytpna0cc7bd7',
            "password": '**W7**Fm5',
        })
        context = driver.new_context()
        page = context.new_page()
        page.route("**/*", lambda route: route.abort()
                   if route.request.resource_type == "image"
                   or route.request.resource_type == "stylesheet"
                   or route.request.resource_type == "svg"
                   else route.continue_()
                   )
        #page.set_default_timeout(100000)
        try:
            url = 'https://accounts.google.com/v3/signin/identifier?dsh=S-536847501%3A1663770960047319&continue=https%3A%2F%2Fplay.google.com%2Fconsole%2Fsignup&followup=https%3A%2F%2Fplay.google.com%2Fconsole%2Fsignup&passive=1209600&service=androiddeveloper&flowName=GlifWebSignIn&flowEntry=ServiceLogin&ifkv=AQDHYWoj7hmeMm5YT3PrA0sojYcd3nnuAx2JkCLnedM0A9sCEUG9nrlRYD-grtVE1CcBagVSvXOG'
            page.goto(url)
        except Exception:
            print(" [+] Time out Error ")
        print(Fore.LIGHTBLUE_EX + " [+] Start >>> " + self.gmail)
        page.fill('id=identifierId', self.gmail)
        btn = '#identifierNext > div > button'
        page.click(btn)
        page.wait_for_timeout(3000)
        inbpass = '#password > div.aCsJod.oJeWuf > div > div.Xb9hP > input'
        page.fill(inbpass, self.password)
        btnpass = '#passwordNext > div > button'
        page.click(btnpass)
        time.sleep(3)
        page.wait_for_timeout(3000)
        try:
            page.locator("text=I understand").click(timeout=10000)
            page.wait_for_timeout(1500)
            sleep(1)
        except:
            pass
        try:
            page.locator("div[role=\"link\"]:has-text(\"Confirm your recovery email\")").click()
            page.wait_for_timeout(3000)
            page.locator("[aria-label=\"Enter recovery email address\"]").fill(self.recvery)
            page.wait_for_timeout(3000)
            time.sleep(3)
            # Click button:has-text("Next")
            page.locator("button:has-text(\"Next\")").click()
            time.sleep(10)
        except:
            pass
        page.locator("text=YourselfChoose if your account is for personal >> button").click()
        time.sleep(3)
        page.wait_for_timeout(1500)
At this point (from this line to the end) I want to stop using the proxy, because it isn't unlimited, I am paying per GB, and I want to use only my local internet connection for the rest of the run. Is there any way to do that?
I want to stop using the proxy once execution reaches that last click, for two reasons:
One: proxy usage is high and I don't need the proxy anymore for the next steps.
Two: I want to make it faster, because the proxy is slow by the time it reaches that page.
Please help me with code or anything; I don't know how to solve this.
There is no direct way to achieve this, but there are workarounds you can use to achieve the same results using routing. Basically, after you no longer require the proxy, you route all requests playwright makes to a handler, which bypasses the proxy and forwards the response back to playwright. However, this approach will only work with the async version of playwright (otherwise, executing a blocking call from a route handler would block further network traffic too). Consider the code below:
import aiohttp, asyncio
from playwright.async_api import Route

def no_proxy_route(session):
    """
    Bypasses any proxy and routes the request directly. It's a factory function to store the value of
    session so we don't need to create a new one every time.
    """
    async def route_handler(route: Route):
        request = route.request
        try:
            body, status, headers = await fetch(session, request)
        # Abort route in case of error
        except (aiohttp.ClientConnectionError, asyncio.TimeoutError):
            await route.abort()
            return
        # If no error, fulfill route with the given body and headers
        await route.fulfill(status=status, headers=headers, body=body)
    return route_handler

async def fetch(session, request, timeout=5000):
    """
    Fetch the given request using aiohttp (note: aiohttp timeouts are given in seconds)
    """
    assert timeout > 0, "timeout must be positive"
    try:
        async with session.request(request.method, request.url, headers=request.headers,
                                   data=request.post_data, timeout=timeout) as response:
            body = await response.read()
            status = response.status
            headers = response.headers
    except Exception:
        raise
    else:
        return body, status, headers
Now if you want to conditionally disable proxies for some requests, you simply create a matching pattern and route all those requests through no_proxy_route. An example is given below:
### All your previous code
# .
# .
# .
# .
### which required a proxy
# Create an aiohttp session with a dummy cookie jar. This is because we will be passing the cookies
# explicitly and don't want previous requests/responses cookies to persist and interfere
session = aiohttp.ClientSession(cookie_jar=aiohttp.DummyCookieJar())
# Create a route to our handler with an appropriate pattern. The below pattern will route ALL subsequent requests to
# handler. Remember that you can use regex for patterns as well.
await page.route('**/*', no_proxy_route(session))
After this is done, all requests that match the pattern, with which you created the route, will not use a proxy, even if the context has one set. But again, this is only useful if you are using the async API for playwright.
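For completeness, here is a minimal async skeleton showing where the route would plug into your flow. This is only a sketch: the proxy settings and placement are placeholders, and it assumes the no_proxy_route and fetch helpers defined above.
import asyncio
import aiohttp
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        # Placeholder proxy settings; substitute your real server and credentials.
        browser = await p.firefox.launch(headless=True,
                                         proxy={"server": "http://proxy.example:10000"})
        context = await browser.new_context()
        page = await context.new_page()

        # ... the proxied part of your flow (login, clicks, ...) goes here ...

        # From this point on, matching requests bypass the proxy and use the local connection.
        session = aiohttp.ClientSession(cookie_jar=aiohttp.DummyCookieJar())
        await page.route('**/*', no_proxy_route(session))

        # ... the rest of the flow now runs without the proxy ...

        await session.close()
        await browser.close()

asyncio.run(main())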

How to connect to Splunk API via Python, receiving javascript error

I am trying to connect to Splunk via API using Python. I can connect and get a 200 status code, but when I read the response, it doesn't contain the content of the page.
Here is my code:
import json
import requests
import re
baseurl = 'https://my_splunk_url:8888'
username = 'my_username'
password = 'my_password'
headers={"Content-Type": "application/json"}
s = requests.Session()
s.proxies = {"http": "my_proxy"}
r = s.get(baseurl, auth=(username, password), verify=False, headers=None, data=None)
print(r.status_code)
print(r.text)
I am new to Splunk and python so any ideas or suggestions as to why this is happening would help.
You need to authenticate first to get a token, then you'll be able to hit the rest of the REST endpoints. The auth endpoint is at /servicesNS/admin/search/auth/login, which will give you the session_key, which you then provide to subsequent requests.
Here is some code that uses requests to authenticate to a Splunk instance, then start a search. It then checks to see if the search is complete, if not, wait a second and then check again. Keep checking and sleeping until the search is done, then print out the results.
import time  # need for sleep
from xml.dom import minidom
import json, pprint

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

base_url = 'https://localhost:8089'
username = 'admin'
password = 'changeme'
search_query = "search=search index=*"

r = requests.get(base_url + "/servicesNS/admin/search/auth/login",
                 data={'username': username, 'password': password}, verify=False)
session_key = minidom.parseString(r.text).getElementsByTagName('sessionKey')[0].firstChild.nodeValue
print ("Session Key:", session_key)

r = requests.post(base_url + '/services/search/jobs/', data=search_query,
                  headers = { 'Authorization': ('Splunk %s' %session_key)},
                  verify = False)
sid = minidom.parseString(r.text).getElementsByTagName('sid')[0].firstChild.nodeValue
print ("Search ID", sid)

done = False
while not done:
    r = requests.get(base_url + '/services/search/jobs/' + sid,
                     headers = { 'Authorization': ('Splunk %s' %session_key)},
                     verify = False)
    response = minidom.parseString(r.text)
    for node in response.getElementsByTagName("s:key"):
        if node.hasAttribute("name") and node.getAttribute("name") == "dispatchState":
            dispatchState = node.firstChild.nodeValue
    print ("Search Status: ", dispatchState)
    if dispatchState == "DONE":
        done = True
    else:
        time.sleep(1)

r = requests.get(base_url + '/services/search/jobs/' + sid + '/results/',
                 headers = { 'Authorization': ('Splunk %s' %session_key)},
                 data={'output_mode': 'json'},
                 verify = False)
pprint.pprint(json.loads(r.text))
Many of the request calls used here include the flag verify=False to avoid issues with the default self-signed SSL certs, but you can drop that if you have legitimate certificates.
Published a while ago at https://gist.github.com/sduff/aca550a8df636fdc07326225de380a91
Nice piece of coding. One of the wonderful aspects of Python is the ability to use other people's well written packages. In this case, why not use Splunk's Python packages to do all of that work, with a lot less coding around it.
pip install splunklib.
Then add the following to your import block
import splunklib.client as client
import splunklib.results as results
pypi.org has documentation on some of the usage, and Splunk has an excellent set of how-to documents. Remember, be lazy: use someone else's work to make your work look better.
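For illustration, here is a rough sketch of what a search might look like with the SDK. The connection details are placeholders, and the exact calls can differ between SDK versions, so check the docs.
import splunklib.client as client
import splunklib.results as results

# Placeholder connection details for a local Splunk instance
service = client.connect(host='localhost', port=8089,
                         username='admin', password='changeme')

# Run a blocking one-shot search and print each result
stream = service.jobs.oneshot('search index=* | head 5')
for result in results.ResultsReader(stream):
    print(result)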

Python 3 NameError in __init__, 'session' not defined

I am working on a small project that gets the accounts a given user follows on Instagram. I have this working flawlessly as a script using a function; however, I plan to make this into an actual program, so I decided to write a class. I believe I am using "self" correctly in all the right places, but I am failing to see why I am getting this name error. Here is my code:
# Library imports
import requests
import json
import time

# Class declaration
class NodesCursor:

    # Class variables
    # Login url
    LOGIN_URL = 'https://www.instagram.com/accounts/login/ajax/'
    # Referer url
    REFERER_URL = 'https://www.instagram.com/accounts/login/'
    # User agent
    USER_AGENT = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'

    # Class constructor
    def __init__(self, USERNAME, PASSWD):
        # Login username
        self.USERNAME = USERNAME
        # Login password
        self.PASSWD = PASSWD
        # Creating a session
        self.session = requests.Session()
        # Get request to login url
        self.req = session.get(LOGIN_URL)
        # Setting user agent for session header
        self.session.headers = {'user-agent': USER_AGENT}
        # Setting referer url for session header
        self.session.headers.update({'Referer': REFERER_URL})
        # Updating session header with x-csrftoken cookie
        self.session.headers.update({'x-csrftoken': self.req.cookies['csrftoken']})
        # Login data for website
        self.login_data = {'username': self.USERNAME, 'password': self.PASSWD}
        # Login with a post request
        self.login = session.post(LOGIN_URL, data=self.login_data, allow_redirects=True)
        # Updating the session with x-csrftoken cookie
        self.session.headers.update({'x-csrftoken': self.login.cookies['csrftoken']})

    # Function to parse following
    def parse(self):
        # An array of usernames
        usernames = []
        # Variable to handle continuous scrolling
        has_next_page = True
        # Variable to handle continuous scrolling
        end_cursor = None
        # Loop to handle the scrolling to get the needed data
        while has_next_page == True:
            # Sleep for 30 seconds to not get rate limited
            #time.sleep(30)
            # Query url
            queryUrl = "https://www.instagram.com/graphql/query/"
            # Parameters for the get request
            payload = {"query_hash": "9335e35a1b280f082a47b98c5aa10fa4", "id": "8164444379", "first": 24, "after": end_cursor}
            # Variable for GETting all of the user's following
            following = self.session.get(queryUrl, params=payload).json()
            # Parsing the node to check to see what has_next_page equals to
            has_next_page = following['data']['user']['edge_follow']['page_info']['has_next_page']
            # Parse all user followings until there are no more
            if has_next_page == True or has_next_page == False:
                # Parsing end cursor id
                end_cursor = following['data']['user']['edge_follow']['page_info']['end_cursor']
                # Sleep for 30 seconds to not get rate limited
                time.sleep(30)
            # Parsing to get to username node
            userList = following['data']['user']['edge_follow']
            # Loop to iterate through all of the names
            for eachName in userList['edges']:
                # Add each name to the array
                usernames.append(eachName['node']['username'])
        # Print the array of usernames, along with the length
        print(usernames)
        print(len(usernames))

if __name__ == '__main__':
    checkFollowing = NodesCursor('username', 'password')
    checkFollowing().parse()
Error:
Traceback (most recent call last):
  File "test.py", line 115, in <module>
    turboOne = NodesCursor('moola.ig', 'yeet1234')
  File "test.py", line 42, in __init__
    self.req = session.get(LOGIN_URL)
NameError: name 'session' is not defined
Though, as I stated earlier, I think I am using "self" correctly, it is possible that that is where my error is coming from, but I'm unsure. Any help is greatly appreciated.
You’re missing the self. when accessing session:
# Creating a session
self.session = requests.Session()
# Get request to login url
self.req = self.session.get(LOGIN_URL)
To fix the error with LOGIN_URL:
self.req = self.session.get(NodesCursor.LOGIN_URL)
Try replacing
self.req = session.get(LOGIN_URL)
With
self.req = self.session.get(LOGIN_URL)
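Note that the later self.login = session.post(LOGIN_URL, ...) line in __init__ will hit the same NameError once the first one is fixed. As a minimal, hypothetical illustration of the pattern (Demo is a made-up class, not the asker's code): instance attributes are reached through self, and class constants through self or the class name.
import requests

class Demo:
    LOGIN_URL = 'https://www.instagram.com/accounts/login/ajax/'

    def __init__(self):
        # The session is an instance attribute, so it must be accessed via self;
        # LOGIN_URL is a class variable, so qualify it with the class (or self).
        self.session = requests.Session()
        self.req = self.session.get(Demo.LOGIN_URL)
        self.login = self.session.post(Demo.LOGIN_URL, data={}, allow_redirects=True)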

Python requests session keeps creating connection without reusing the connection

I have the following code:
protocol = "http"
if self.protocol == PROTOCOL_HTTPS:
protocol = "https"
if self.session is None:
self.session = Session()
self.session.get(protocol+'://'+self.ip)
url = protocol+"://"+self.ip+requestURL
response = None
if requestType == GET_METHOD:
response = self.session.get(url, headers=(header),stream=False)
elif requestType == POST_METHOD:
response = self.session.post(url, payload, headers=(header), stream=False)
This code works but it opens too many connections to the device. I want only one connection to be opened in the session. I tried the following code but it doesn't seem to work. It is still creating more than 1 connection.
adapter = requests.adapters.HTTPAdapter(pool_connections=1, pool_maxsize=1)
self.session.mount('http://', adapter)
url = protocol+"://"+self.ip+requestURL
resp = self.session.get(url)
What am I doing wrong?
How do I ensure that only one connection is opened in the session?
You don't need to use an HTTPAdapter for one connection.
You may just try:
>>> import requests
>>> s = requests.Session()
>>> s.get('http://httpbin.org/get')
from http://docs.python-requests.org/en/latest/api/#requests.Session
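Applied to your snippet, the key point is to create the Session once and reuse it for every call; requests will then pool and reuse the underlying TCP connection via keep-alive whenever the server allows it. A rough sketch (httpbin.org is just a placeholder for protocol + "://" + self.ip):
import requests

# Create the session once (e.g. in __init__) and reuse it for every request
session = requests.Session()

base = 'http://httpbin.org'
first = session.get(base + '/get', stream=False)
second = session.post(base + '/post', data={'key': 'value'}, stream=False)

# Both requests go through the same session and its connection pool
print(first.status_code, second.status_code)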

Adding Cookie to SOAPpy Request

I'm trying to send a SOAP request using SOAPpy as the client. I've found some documentation stating how to add a cookie by extending SOAPpy.HTTPTransport, but I can't seem to get it to work.
I tried to use the example here,
but the server I'm trying to send the request to started throwing 415 errors, so I'm trying to accomplish this without using ClientCookie, or by figuring out why the server is throwing 415's when I do use it. I suspect it might be because ClientCookie uses urllib2 & http/1.1, whereas SOAPpy uses urllib & http/1.0
Does someone know how to make ClientCookie use HTTP/1.0, if that is even the problem, or a way to add a cookie to the SOAPpy headers without using ClientCookie? I've tried this code with other services, and it only seems to throw errors when sending requests to Microsoft servers.
I'm still finding my footing with python, so it could just be me doing something dumb.
import sys, os, string
from SOAPpy import WSDL, HTTPTransport, Config, SOAPAddress, Types
import ClientCookie

Config.cookieJar = ClientCookie.MozillaCookieJar()

class CookieTransport(HTTPTransport):
    def call(self, addr, data, namespace, soapaction = None, encoding = None,
             http_proxy = None, config = Config):
        if not isinstance(addr, SOAPAddress):
            addr = SOAPAddress(addr, config)
        cookie_cutter = ClientCookie.HTTPCookieProcessor(config.cookieJar)
        hh = ClientCookie.HTTPHandler()
        hh.set_http_debuglevel(1)
        # TODO proxy support
        opener = ClientCookie.build_opener(cookie_cutter, hh)
        t = 'text/xml';
        if encoding != None:
            t += '; charset="%s"' % encoding
        opener.addheaders = [("Content-Type", t),
                             ("Cookie", "Username=foobar"),  # ClientCookie should handle
                             ("SOAPAction", "%s" % (soapaction))]
        response = opener.open(addr.proto + "://" + addr.host + addr.path, data)
        data = response.read()
        # get the new namespace
        if namespace is None:
            new_ns = None
        else:
            new_ns = self.getNS(namespace, data)
        print '\n' * 4, '-'*50
        # return response payload
        return data, new_ns

url = 'http://www.authorstream.com/Services/Test.asmx?WSDL'
proxy = WSDL.Proxy(url, transport=CookieTransport)
print proxy.GetList()
Error 415 is because of an incorrect Content-Type header.
Install HttpFox for Firefox, or whatever tool you prefer (Wireshark, Charles or Fiddler), to track what headers you are sending. Try Content-Type: application/xml.
...
t = 'application/xml';
if encoding != None:
    t += '; charset="%s"' % encoding
...
If you are trying to send a file to the web server, use Content-Type: application/x-www-form-urlencoded.
A nice hack to use cookies with SOAPpy calls
Using Cookies with SOAPpy calls
