Now I am working on tests on python. I am using BrowserMob Proxy and Selenium to capture HTTP
requests.
# Shared fixtures: populated once by robot_setup() and read by each
# test case's setUp().
robot_globals = {'proxy': None, 'selenium': None}


def robot_setup():
    """Start BrowserMob Proxy and a browser routed through it.

    Returns:
        tuple: ``(selenium, proxy)`` — the Selenium WebDriver instance and
        the BrowserMob proxy client used to capture HAR data.
    """
    server = Server(settings.BROWSERMOB_PROXY_PATH,
                    options={'port': settings.BROWSERMOB_PROXY_PORT})
    server.start()
    proxy = server.create_proxy()
    # BUG FIX: the original called proxy.selenium_proxy() and discarded the
    # result, then passed the raw BrowserMob client as Firefox's proxy.
    # Keep the selenium Proxy wrapper and pass it instead.
    selenium_proxy = proxy.selenium_proxy()
    if settings.BROWSER_TO_TEST == 'FIREFOX':
        from selenium.webdriver.firefox.webdriver import WebDriver
        selenium = WebDriver(proxy=selenium_proxy, timeout=10)
    elif settings.BROWSER_TO_TEST == 'CHROME':
        from selenium.webdriver.chrome.webdriver import WebDriver
        from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
        # Copy the shared class-level dict so we don't mutate it globally.
        capabilities = DesiredCapabilities.CHROME.copy()
        proxy.add_to_capabilities(capabilities)
        selenium = WebDriver(executable_path=settings.CHROME_DRIVER_PATH,
                             desired_capabilities=capabilities,
                             service_log_path=settings.DRIVER_LOG_PATH)
    selenium.maximize_window()
    return (selenium, proxy)
class BaseGATestCase(unittest.TestCase):
    """Base class for GA tests; pulls the shared browser and proxy fixtures
    from ``robot_globals`` (which robot_setup() must have populated first)."""

    def setUp(self):
        self.proxy = robot_globals['proxy']
        self.selenium = robot_globals['selenium']

    # ... (remaining shared helpers elided in the original excerpt) ...
class TestHomePage(BaseGATestCase):
    def test_01_homepage_utme_vars(self):
        """Start a HAR capture named 'home_page', then load the site root."""
        self.proxy.new_har('home_page')
        self.selenium.get('%s%s' % (settings.SERVER_URL_TO_TEST, '/'))
This code usually works correctly. But once or twice a month the system launches the browser
without loading the URL in it. The browser just waits, and the page never loads at all.
However, the browser can load the page without self.proxy.new_har('..'). Code like this works:
class TestHomePage(BaseGATestCase):
    def test_01_homepage_utme_vars(self):
        # Same test without proxy.new_har(): the page loads reliably here.
        self.selenium.get('%s%s' % (settings.SERVER_URL_TO_TEST, '/'))
server.log:
INFO 10/09 03:09:18 n.l.b.p.j.h.HttpSer~ - Version Jetty/5.1.x
INFO 10/09 03:09:18 n.l.b.p.j.u.Contain~ - Started HttpContext[/,/]
INFO 10/09 03:09:18 n.l.b.p.j.h.SocketL~ - Started SocketListener on 0.0.0.0:9159
INFO 10/09 03:09:18 n.l.b.p.j.u.Contain~ - Started net.lightbody.bmp.proxy.jetty.jetty.Server#6a1192e9
INFO 10/09 03:10:25 n.l.b.p.j.u.Threade~ - Stopping Acceptor ServerSocket[addr=0.0.0.0/0.0.0.0,localport=9154]
It is really weird for me because last time I couldn't fix this problem, but it was fixed itself the next day. I do not understand why. And now I have the same problem. It would be great if anyone knows how I can fix this problem. Thanks!
Related
I am trying to scrape data from this url with Python-Selenium.
» https://shopee.co.id/PANCI-PRESTO-24cm-3.5L-TEFLON-i.323047288.19137193916?sp_atk=7e8e7abc-834c-4f4a-9234-19da9ddb2445&xptdk=7e8e7abc-834c-4f4a-9234-19da9ddb2445
If you watch the network stream you will see that it returns an api on the back end like this https://shopee.co.id/api/v4/item/get?itemid=19137193916&shopid=323047288. How can I get the response returned by this api with selenium?
Solved!
import json
import time

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Set up the Selenium webdriver with performance logging enabled so
# network events appear in driver.get_log("performance").
# Copy the shared class-level dict so we don't mutate it globally.
capabilities = DesiredCapabilities.CHROME.copy()
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
options = webdriver.ChromeOptions()
options.binary_location = "/usr/bin/brave"
options.add_argument("--ignore-certificate-errors")
driver = webdriver.Chrome(desired_capabilities=capabilities, options=options)

# Navigate to URL and monitor network flow
url = "https://shopee.co.id/PANCI-PRESTO-24cm-3.5L-TEFLON-i.323047288.19137193916?sp_atk=7e8e7abc-834c-4f4a-9234-19da9ddb2445&xptdk=7e8e7abc-834c-4f4a-9234-19da9ddb2445"
driver.get(url)
time.sleep(3)  # Wait for the page to load

# Find the API request and write its response body to disk.
logs = driver.get_log("performance")
for entry in logs:
    message = entry.get("message", {})
    parsed_message = json.loads(message)
    message_dict = parsed_message.get("message", {})
    method = message_dict.get("method")
    if method == "Network.requestWillBeSent":
        request = message_dict.get("params", {}).get("request", {})
        # Renamed from `url` so the page URL above is not shadowed.
        request_url = request.get("url")
        if "https://shopee.co.id/api/v4/item/get?itemid=19137193916&shopid=323047288" in request_url:
            # (The original built an unused `response_url` here; removed.)
            response = driver.execute_cdp_cmd(
                "Network.getResponseBody",
                {"requestId": message_dict.get("params", {}).get("requestId")},
            )
            with open("response.json", "w") as f:
                f.write(response.get("body", ""))
I use selenium wire for this. You can do pip install selenium-wire to get it and then import it into your project and use it like so:
from seleniumwire import webdriver

# Selenium Wire options: disable response encoding so bodies come back as
# plain bytes instead of gzip/brotli-encoded payloads.
sw_options = {
    'disable_encoding': True
}

# Create the driver with the selected options.
driver = webdriver.Chrome(seleniumwire_options=sw_options)

# NOTE: the original assigned `driver.request_interceptor = interceptor`,
# but `interceptor` was never defined (NameError), and no interceptor is
# needed just to read `driver.requests` — so that line is dropped.

# Navigate to page
driver.get('https://shopee.co.id/PANCI-PRESTO-24cm-3.5L-TEFLON-i.323047288.19137193916?sp_atk=7e8e7abc-834c-4f4a-9234-19da9ddb2445&xptdk=7e8e7abc-834c-4f4a-9234-19da9ddb2445')

# Iterate through captured requests and find the API endpoint we need.
for a in driver.requests:
    if("/api/v4/item/get?itemid=19137193916&shopid=323047288" in a.url):
        body = a.response.body
        print(body)
We add disable_encoding to the options; otherwise the body would come back encoded and you'd have to decode it manually, which can be done like so:
body = decode(response.body, response.headers.get('Content-Encoding', 'identity'))
Or done in the browser options as I did.
You can find more information here:
https://pypi.org/project/selenium-wire/#response-objects
How do I use driver.get to open several URLs in Chrome.
My code:
import requests
import json
import pandas as pd
from selenium import webdriver

chromeOptions = webdriver.ChromeOptions()
chromedriver = r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chromeOptions)

# Collect the self-links of next-to-go races in the wanted jurisdictions.
links = []
request1 = requests.get('https://api.beta.tab.com.au/v1/recommendation-service/featured-events?jurisdiction=NSW')
json1 = request1.json()
for n in json1['nextToGoRaces']:
    if n['meeting']['location'] in ['VIC', 'NSW', 'QLD', 'SA', 'WA', 'TAS', 'IRL']:
        links.append(n['_links']['self'])

# BUG FIX: the original called driver.get('links') with the literal string
# 'links'. Navigate to each collected URL instead.
for link in links:
    driver.get(link)
Based on the comments - you'll want a class to manage your browsers, a class for your tests, then a runner to run in parallel.
Try this:
import unittest
import time
import testtools
from selenium import webdriver
class BrowserManager:
    """Owns a pool of WebDriver instances, retrievable by partial URL."""

    def __init__(self):
        # Instance attribute instead of the original class-level list, so
        # two managers no longer share (and close) each other's browsers.
        self.browsers = []

    def createBrowser(self, url):
        """Open a new Chrome window, navigate it to `url`, and track it."""
        browser = webdriver.Chrome()
        browser.get(url)
        self.browsers.append(browser)

    def getBrowserByPartialURL(self, url):
        """Return the first tracked browser whose current URL contains `url`,
        or None if no tracked browser matches."""
        for browser in self.browsers:
            if url in browser.current_url:
                return browser

    def CloseItAllDown(self):
        """Close the window of every tracked browser."""
        for browser in self.browsers:
            browser.close()
class UnitTest1(unittest.TestCase):
    """Two tests whose interleaved output demonstrates a parallel run."""

    def test_DoStuffOnGoogle(self):
        browser = b.getBrowserByPartialURL("google")
        # Watch the output: these prints interleave with the yahoo test,
        # proving the suite runs in parallel.
        for _ in range(10):
            print(browser.current_url)
            time.sleep(1)

    def test_DoStuffOnYahoo(self):
        browser = b.getBrowserByPartialURL("yahoo")
        # Watch the output: these prints interleave with the google test,
        # proving the suite runs in parallel.
        for _ in range(10):
            print(browser.current_url)
            time.sleep(1)
# Create a global browser manager shared by the tests.
b = BrowserManager()

# To run the tests:
if __name__ == "__main__":
    # TODO: move browser creation into an init/setup step.
    b.createBrowser("https://www.google.com")
    b.createBrowser("https://www.yahoo.com")
    time.sleep(5)  # So you can see both browsers open at the same time
    suite = unittest.TestLoader().loadTestsFromTestCase(UnitTest1)
    concurrent_suite = testtools.ConcurrentStreamTestSuite(lambda: ((case, None) for case in suite))
    concurrent_suite.run(testtools.StreamResult())
This code doesn't do anything exciting - it's an example of how to manage multiple browsers and run tests in parallel. It goes to the specified urls (which you should move to an init/setup), then prints out the URL it's on 10 times.
This is how you add a browser to the manager: b.createBrowser("https://www.google.com")
This is how you retrieve your browser: browser = b.getBrowserByPartialURL("google") - note it's a partial URL so you can use the domain as a keyword.
This is the output (just the first few lines- not all of it...) - It's a print URL for google then yahoo, then google then yahoo - showing that they're running at the same time:
PS C:\Git\PythonSelenium\BrowserManager> cd 'c:\Git\PythonSelenium'; & 'C:\Python38\python.exe' 'c:\Users\User\.vscode\extensions\ms-python.python-2020.7.96456\pythonFiles\lib\python\debugpy\launcher' '62426' '--' 'c:\Git\PythonSelenium\BrowserManager\BrowserManager.py'
DevTools listening on ws://127.0.0.1:62436/devtools/browser/7260dee3-368c-4f21-bd59-2932f3122b2e
DevTools listening on ws://127.0.0.1:62463/devtools/browser/9a7ce919-23bd-4fee-b302-8d7481c4afcd
https://www.google.com/
https://consent.yahoo.com/collectConsent?sessionId=3_cc-session_d548b656-8315-4eef-bb1d-82fd4c6469f8&lang=en-GB&inline=false
https://www.google.com/
https://consent.yahoo.com/collectConsent?sessionId=3_cc-session_d548b656-8315-4eef-bb1d-82fd4c6469f8&lang=en-GB&inline=false
https://www.google.com/
My test environment is behind a corporate proxy ("proxy.ptbc.std.com:2538"). I want to open a particular video on YouTube for a period of time (e.g. 200 seconds) and capture the HAR file for each visit; the process is repeated several times for a larger test. I have tried different examples found here, but the Firefox/Chrome browsers do not connect to the internet because they are behind the proxy.
How can run "python-selenium + browsermobproxy" behind a corporate proxy and capture the har file for each instance.
Example code:
from browsermobproxy import Server
# Start the local BrowserMob Proxy server from its install path.
server = Server("C:\\Utility\\browsermob-proxy-2.1.4\\bin\\browsermob-proxy")
server.start()
# Create a proxy port that the browser will route its traffic through.
proxy = server.create_proxy()
from selenium import webdriver
# Point a Firefox profile at the BrowserMob proxy.
profile = webdriver.FirefoxProfile()
profile.set_proxy(proxy.selenium_proxy())
driver = webdriver.Firefox(firefox_profile=profile)
# Start a new HAR capture labelled "google", then load the page.
proxy.new_har("google")
driver.get("http://www.google.co.in")
proxy.har # returns a HAR JSON blob
server.stop()
driver.quit()
Any help would be appreciated
According to browsermob-proxy documentation:
Sometimes you will want to route requests through an upstream proxy
server. In this case specify your proxy server by adding the httpProxy
parameter to your create proxy request:
[~]$ curl -X POST http://localhost:8080/proxy?httpProxy=yourproxyserver.com:8080
{"port":8081}
According to source code of browsermob-proxy API for Python
def create_proxy(self, params=None):
    """
    Gets a client class that allow to set all the proxy details that you
    may need to.
    :param dict params: Dictionary where you can specify params
    like httpProxy and httpsProxy
    """
    # Fresh dict per call — avoids the mutable-default-argument pitfall.
    params = params if params is not None else {}
    # self.url[7:] — presumably strips a leading "http://" scheme before
    # handing the host:port to Client; verify against the Server class.
    client = Client(self.url[7:], params)
    return client
So, everything you need is to specify params in create_proxy depending on what proxy you use (http or https):
from browsermobproxy import Server
from selenium import webdriver
import json
# Start the local BrowserMob Proxy server.
server = Server("C:\\Utility\\browsermob-proxy-2.1.4\\bin\\browsermob-proxy")
server.start()
# httpProxy or httpsProxy: route BrowserMob's outbound traffic through
# the upstream corporate proxy.
proxy = server.create_proxy(params={'httpProxy': 'proxy.ptbc.std.com:2538'})
# Point a Firefox profile at the BrowserMob proxy.
profile = webdriver.FirefoxProfile()
profile.set_proxy(proxy.selenium_proxy())
driver = webdriver.Firefox(firefox_profile=profile)
# Capture a HAR named "google" for the page visit.
proxy.new_har("google")
driver.get("http://www.google.co.in")
# Serialize the captured HAR to a JSON string and print it.
result = json.dumps(proxy.har, ensure_ascii=False)
print(result)
server.stop()
driver.quit()
This is my setup:
Raspberry Pi 3 Model B Plus Rev 1.3
Linux 4.19.66-v7+ (RaspbianGNU/Linux 9 (stretch))
Selenium 3.141.0
Browsermob-Proxy 2.1.4
Chromium 72.0.3626.121
ChromeDriver 72.0.3626.121
Python 3.5.3
I would like to record the network traffic when I visit an https page. So far, it actually works quite well. The problem is, the content of the packages that browsermob proxy records are encrypted.
Here my code
import pprint
import time
from selenium import webdriver
from pyvirtualdisplay import Display
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from browsermobproxy import Server
# Source: https://github.com/ArturSpirin/YouTube-WebDriver-Tutorials/blob/master/proxy/BmpProxy.py
class ProxyManger:
    """Wraps the BrowserMob Proxy server and its client for a test session.

    (Name kept as in the original source, including the typo, so existing
    callers continue to work.)
    """

    __BMP = "/usr/local/bin/browsermob-proxy-2.1.4/bin/browsermob-proxy"

    def __init__(self):
        self.__server = Server(ProxyManger.__BMP, options={'port': 8089})
        self.__client = None

    def start_server(self):
        """Start the BrowserMob Proxy java process and return the server."""
        self.__server.start()
        return self.__server

    def start_client(self):
        # trustAllServers lets the proxy MITM sites whose certificates the
        # proxy would otherwise reject.
        self.__client = self.__server.create_proxy(params={"trustAllServers": "true"})
        return self.__client

    # BUG FIX: the pasted original had "#property" comments where the
    # @property decorators belong, and `client` was mangled onto one line.
    @property
    def client(self):
        return self.__client

    @property
    def server(self):
        return self.__server
# Set up a virtual display so Chromium can run without an X server.
display = Display(visible=0, size=(800, 600))
display.start()

# Set up browsermob-proxy and start a HAR capture for the target URL.
# BUG FIX: the pasted original merged two statements onto one line and
# never defined `url` — set it to the page you want to record.
url = "https://www.google.com/"
proxy = ProxyManger()
server = proxy.start_server()
client = proxy.start_client()
client.new_har(url)

# Chrome options: route traffic through the proxy and ignore certificate
# errors so the MITM proxy's certificate is accepted.
opts = webdriver.ChromeOptions()
opts.add_argument("--proxy-server={}".format(client.proxy))
opts.add_argument("--disable-dev-shm-usage")
opts.add_argument("--no-sandbox")
opts.add_argument("--ignore-certificate-errors")

browser = webdriver.Chrome(options=opts)
browser.get(url)
time.sleep(10)  # let the page finish loading before dumping the HAR

pprint.pprint(client.har)

browser.quit()
server.stop()
display.stop()
The code works quite well so far. I receive the packages i want.
The problem is the encrypted content. It is clear to me that the browsermob-proxy acts as a MITM and cannot read the contents of these packages due to the end-to-end encryption.
...
'content': {'comment': '',
'encoding': 'base64',
'mimeType': 'application/json',
'size': 10493,
'text': 'IUQHACBHdln10z6SWSgCD9DkLZ0OUL9H9+NwllhRXLaI+7nOI023mVdkr5uCJV115AeolXUwyJUgklGU8z/0tYu/n/iuQCnAQJIG8JwmwaOcwRRLTheZ8abRSDFM/gQTqc6nP03QiSiJ/ZuxVZTkH/6SKKpir/SsMAt5+RMiPU+eJ3fN+U8JBjguGdWoNCGCrSqOw9gBeKORKcY4Ek014310aXl3BUqBnJ01VqPyeaJQasKY1hxRkkYTfFGAefuYQ5pbF1588ghm1VDPrdoKB1lERMVl/j0Y2HWEt+tbdHYe3t9fCrtSN+5Nq++ejmp/pg9UUuyVF8FlWvJiA6YB'},
...
I run the Raspberry Pi headless. That means I only have access via ssh and no x. According to the github page of Browsermob-proxy, it is possible to add a certificate to my browser.
According to some internet research, this usually works in chrome via the GUI.
After doing some more research, I found this:
https://github.com/ThomasLeister/root-certificate-deployment
I ran linux-browser-import.sh, but unfortunately this had no effect on that.
Where is my mistake? Does someone have a solution to my problem? How is it possible to read packages decrypted from an ssl connection?
Is there any other method known how I can read xhr packages?
Thanks,
Mike
I will start by describing the infrastructure I am working within. It contains multiple proxy servers that uses a load balancer to forward user authentications to the appropriate proxy that are directly tied to an active directory. The authentication uses the credentials and source IP that was used to log into the computer the request is coming from. The server caches the IP and credentials for 60 minutes. I am using a test account specifically for this process and is only used on the unit testing server.
I am working on some automation with selenium webdriver on a remote server using a docker container. I am using python as the scripting language. I am trying to run tests on both internal and external webpages/applications. I was able to get a basic test on an internal website with the following script:
Note: 10.1.54.118 is the server hosting the docker container with the selenium web driver
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# Connect to the remote Selenium hub running in the docker container.
browser = webdriver.Remote(command_executor='http://10.1.54.118:4444/wd/hub',
                           desired_capabilities=DesiredCapabilities.CHROME)
browser.get("http://10.0.0.2")

# Fetch the body text once instead of querying the element twice.
bodyText = browser.find_element_by_tag_name('body').text
print(bodyText)
if 'Hello' in bodyText:
    print('Found hello in body')
else:
    print('Hello not found in body')
browser.quit()
The script is able to access the internal webpage and print all the text on it.
However, I am experiencing problems trying to run test scripts against external websites.
I have tried the following articles and tutorials and it doesn't seem to work for me.
The articles and tutorials I have tried:
https://www.seleniumhq.org/docs/04_webdriver_advanced.jsp
Pass driver ChromeOptions and DesiredCapabilities?
https://www.programcreek.com/python/example/100023/selenium.webdriver.Remote
https://github.com/webdriverio/webdriverio/issues/324
https://www.programcreek.com/python/example/96010/selenium.webdriver.common.desired_capabilities.DesiredCapabilities.CHROME
Running Selenium Webdriver with a proxy in Python
how do i set proxy for chrome in python webdriver
https://docs.proxymesh.com/article/4-python-proxy-configuration
I have tried creating 4 versions of a script to access an external site i.e. google.com and simply print the text off of it. Every script returns a time out error. I apologize for posting a lot of code but maybe the community is able to see where I am going wrong with the coding aspect.
Code 1:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

PROXY = "10.32.51.169:3128"  # IP:PORT or HOST:PORT

# Build the proxy capability on a copy of the shared capabilities dict.
desired_capabilities = webdriver.DesiredCapabilities.CHROME.copy()
desired_capabilities['proxy'] = {
    "httpProxy": PROXY,
    "ftpProxy": PROXY,
    "sslProxy": PROXY,
    # NOTE(review): socks* credentials alongside http/ssl proxy entries
    # looks inconsistent — confirm which proxy type is actually in use.
    "socksUsername": "myusername",
    "socksPassword": "mypassword",
    "noProxy": None,
    "proxyType": "MANUAL",
    "class": "org.openqa.selenium.Proxy",
    "autodetect": False
}

browser = webdriver.Remote('http://10.1.54.118:4444/wd/hub', desired_capabilities)
browser.get("https://www.google.com/")

# Fetch the body text once instead of querying the element twice.
bodyText = browser.find_element_by_tag_name('body').text
print(bodyText)
if 'Hello' in bodyText:
    print('Found hello in body')
else:
    print('Hello not found in body')
browser.quit()
Is my code incorrect in any way? Am I able to pass configuration parameters to the docker chrome selenium webdriver or do I need to build the docker container with the proxy settings preconfigured before building it? I look forward to your replies and any help that can point me in the right direction.
A little late on this one, but a couple ideas + improvements:
Remove the user/pass from the socks proxy config and add them to your Proxy connection uri.
Use the selenium Proxy object to help abstract some of the other bits of the proxy capability.
Add the scheme to the proxy connection string.
Use a try/finally block to make sure the browser quits despite any failures
Note... I'm using Python3, selenium version 3.141.0, and I'm leaving out the FTP config for brevity/simplicity:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy

# Note the addition of the scheme (http) and the user/pass in the
# connection string. FIX: a URI separates credentials from the host with
# '@' — the pasted original had '#', a common markup mangling.
PROXY = 'http://myusername:mypassword@10.32.51.169:3128'

# Use the selenium Proxy object to add proxy capabilities.
proxy_config = {'httpProxy': PROXY, 'sslProxy': PROXY}
proxy_object = Proxy(raw=proxy_config)
capabilities = DesiredCapabilities.CHROME.copy()
proxy_object.add_to_capabilities(capabilities)

browser = webdriver.Remote('http://10.1.54.118:4444/wd/hub', desired_capabilities=capabilities)

# Use try/finally so the browser quits even if there is an exception.
try:
    browser.get("https://www.google.com/")
    # Fetch the body text once instead of querying the element twice.
    bodyText = browser.find_element_by_tag_name('body').text
    print(bodyText)
    if 'Hello' in bodyText:
        print('Found hello in body')
    else:
        print('Hello not found in body')
finally:
    browser.quit()