Monkey patching _ssl_wrap_socket in Python requests library isn't executing - python

We are trying to add HTTPS support to a web server virtual host scanning tool. The tool uses the Python 3 requests library, which uses urllib3 under the hood.
We need a way to provide our own SNI hostname, so we are attempting to monkey patch the _ssl_wrap_socket function of urllib3 to control server_hostname, but we aren't having much success.
Here is the full code:
from urllib3.util import ssl_

_target_host = None

_orig_wrap_socket = ssl_.ssl_wrap_socket


def _ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None,
                     ca_certs=None, server_hostname=None,
                     ssl_version=None, ciphers=None, ssl_context=None,
                     ca_cert_dir=None):
    _orig_wrap_socket(sock, keyfile=keyfile, certfile=certfile,
                      cert_reqs=cert_reqs, ca_certs=ca_certs,
                      server_hostname=_target_host, ssl_version=ssl_version,
                      ciphers=ciphers, ssl_context=ssl_context,
                      ca_cert_dir=ca_cert_dir)


ssl_.ssl_wrap_socket = _ssl_wrap_socket
We then call requests.get() further down in the code. The full context can be found on Github (here).
Unfortunately, this isn't working: our code never appears to be reached, and we're not sure why. Is there something obvious that we're missing, or a better way to approach this issue?
Further Explanation
The following is the full class:
import os
import random
import requests
import hashlib
import pandas as pd
import time
from lib.core.discovered_host import *
import urllib3

DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '\
                     'AppleWebKit/537.36 (KHTML, like Gecko) '\
                     'Chrome/61.0.3163.100 Safari/537.36'

urllib3.disable_warnings()
from urllib3.util import ssl_


class virtual_host_scanner(object):
    """Virtual host scanning class

    Virtual host scanner has the following properties:

    Attributes:
        wordlist: location to a wordlist file to use with scans
        target: the target for scanning
        port: the port to scan. Defaults to 80
        ignore_http_codes: comma-separated list of http codes to ignore
        ignore_content_length: integer value of content length to ignore
        output: folder to write output file to
    """

    def __init__(self, target, wordlist, **kwargs):
        self.target = target
        self.wordlist = wordlist
        self.base_host = kwargs.get('base_host')
        self.rate_limit = int(kwargs.get('rate_limit', 0))
        self.port = int(kwargs.get('port', 80))
        self.real_port = int(kwargs.get('real_port', 80))
        self.ssl = kwargs.get('ssl', False)
        self.fuzzy_logic = kwargs.get('fuzzy_logic', False)
        self.unique_depth = int(kwargs.get('unique_depth', 1))
        self.ignore_http_codes = kwargs.get('ignore_http_codes', '404')
        self.first_hit = kwargs.get('first_hit')
        self.ignore_content_length = int(
            kwargs.get('ignore_content_length', 0)
        )
        self.add_waf_bypass_headers = kwargs.get(
            'add_waf_bypass_headers',
            False
        )

        # this can be made redundant in future with better exceptions
        self.completed_scan = False

        # this is maintained until likely-matches is refactored to use
        # new class
        self.results = []

        # store associated data for discovered hosts
        # in array for oN, oJ, etc'
        self.hosts = []

        # available user-agents
        self.user_agents = list(kwargs.get('user_agents')) \
            or [DEFAULT_USER_AGENT]

    @property
    def ignore_http_codes(self):
        return self._ignore_http_codes

    @ignore_http_codes.setter
    def ignore_http_codes(self, codes):
        self._ignore_http_codes = [
            int(code) for code in codes.replace(' ', '').split(',')
        ]

    _target_host = None
    _orig_wrap_socket = ssl_.ssl_wrap_socket

    def _ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None,
                         ca_certs=None, server_hostname=None,
                         ssl_version=None, ciphers=None, ssl_context=None,
                         ca_cert_dir=None):
        print('SHOULD BE PRINTED')
        _orig_wrap_socket(sock, keyfile=keyfile, certfile=certfile,
                          cert_reqs=cert_reqs, ca_certs=ca_certs,
                          server_hostname=_target_host, ssl_version=ssl_version,
                          ciphers=ciphers, ssl_context=ssl_context,
                          ca_cert_dir=ca_cert_dir)

    def scan(self):
        print('fdsa')
        ssl_.ssl_wrap_socket = self._ssl_wrap_socket

        if not self.base_host:
            self.base_host = self.target

        if not self.real_port:
            self.real_port = self.port

        for virtual_host in self.wordlist:
            hostname = virtual_host.replace('%s', self.base_host)

            if self.real_port == 80:
                host_header = hostname
            else:
                host_header = '{}:{}'.format(hostname, self.real_port)

            headers = {
                'User-Agent': random.choice(self.user_agents),
                'Host': host_header,
                'Accept': '*/*'
            }

            if self.add_waf_bypass_headers:
                headers.update({
                    'X-Originating-IP': '127.0.0.1',
                    'X-Forwarded-For': '127.0.0.1',
                    'X-Remote-IP': '127.0.0.1',
                    'X-Remote-Addr': '127.0.0.1'
                })

            dest_url = '{}://{}:{}/'.format(
                'https' if self.ssl else 'http',
                self.target,
                self.port
            )

            _target_host = hostname

            try:
                res = requests.get(dest_url, headers=headers, verify=False)
            except requests.exceptions.RequestException:
                continue

            if res.status_code in self.ignore_http_codes:
                continue

            response_length = int(res.headers.get('content-length', 0))
            if self.ignore_content_length and \
                    self.ignore_content_length == response_length:
                continue

            # hash the page results to aid in identifying unique content
            page_hash = hashlib.sha256(res.text.encode('utf-8')).hexdigest()
            self.hosts.append(self.create_host(res, hostname, page_hash))

            # add url and hash into array for likely matches
            self.results.append(hostname + ',' + page_hash)

            if len(self.hosts) >= 1 and self.first_hit:
                break

            # rate limit the connection, if the int is 0 it is ignored
            time.sleep(self.rate_limit)

        self.completed_scan = True

    def likely_matches(self):
        if self.completed_scan is False:
            print("[!] Likely matches cannot be printed "
                  "as a scan has not yet been run.")
            return

        # segment results from previous scan into usable results
        segmented_data = {}
        for item in self.results:
            result = item.split(",")
            segmented_data[result[0]] = result[1]

        dataframe = pd.DataFrame([
            [key, value] for key, value in segmented_data.items()],
            columns=["key_col", "val_col"]
        )

        segmented_data = dataframe.groupby("val_col").filter(
            lambda x: len(x) <= self.unique_depth
        )

        return segmented_data["key_col"].values.tolist()

    def create_host(self, response, hostname, page_hash):
        """
        Creates a host using the response and the hash.
        Prints current result in real time.
        """
        output = '[#] Found: {} (code: {}, length: {}, hash: {})\n'.format(
            hostname,
            response.status_code,
            response.headers.get('content-length'),
            page_hash
        )

        host = discovered_host()
        host.hostname = hostname
        host.response_code = response.status_code
        host.hash = page_hash
        host.contnet = response.content

        for key, val in response.headers.items():
            output += '  {}: {}\n'.format(key, val)
            host.keys.append('{}: {}'.format(key, val))

        print(output)
        return host
In this case the following line is never being hit:
print('SHOULD BE PRINTED')
This also results in the following log entry on the web server:
[Wed Oct 25 16:37:23.654321 2017] [ssl:error] [pid 1355] AH02032:
Hostname provided via SNI and hostname test.test provided via
HTTP are different
This also indicates that our code was never run.

Edit-1: No reload needed
Thanks to @MartijnPieters for helping me improve this answer. There is no reload needed if we patch urllib3.connection directly. However, the requests package has changed in recent versions, which made the original answer stop working on some versions of requests.
Here is an updated version of the code, which handles all of these cases:
import requests

try:
    assert requests.__version__ != "2.18.0"
    import requests.packages.urllib3.util.ssl_ as ssl_
    import requests.packages.urllib3.connection as connection
except (ImportError, AssertionError, AttributeError):
    import urllib3.util.ssl_ as ssl_
    import urllib3.connection as connection

print("Using " + requests.__version__)


def _ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None,
                     ca_certs=None, server_hostname=None,
                     ssl_version=None, ciphers=None, ssl_context=None,
                     ca_cert_dir=None):
    print('SHOULD BE PRINTED')
    return ssl_.ssl_wrap_socket(sock, keyfile=keyfile, certfile=certfile,
                                cert_reqs=cert_reqs, ca_certs=ca_certs,
                                server_hostname=server_hostname, ssl_version=ssl_version,
                                ciphers=ciphers, ssl_context=ssl_context,
                                ca_cert_dir=ca_cert_dir)


connection.ssl_wrap_socket = _ssl_wrap_socket

res = requests.get("https://www.google.com", verify=True)
The code is also available on
https://github.com/tarunlalwani/monkey-patch-ssl_wrap_socket
Original Answer
So there are two issues in your code.
requests doesn't actually import urllib3 directly; it does so through its own namespace, requests.packages.
So the function you want to override is:
requests.packages.urllib3.util.ssl_.ssl_wrap_socket
Next, if you look at urllib3/connection.py, you'll see:

from .util.ssl_ import (
    resolve_cert_reqs,
    resolve_ssl_version,
    ssl_wrap_socket,
    assert_fingerprint,
)
This is a local import, so connection.py holds its own module-level binding to the function. That binding is created as soon as import requests loads the module, so patching ssl_ afterwards doesn't change it. You can easily confirm this by putting a breakpoint there and walking the stack trace back to the parent module.
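A quick way to see the stale binding (a sketch, not part of the original answer; _patched below is just a stand-in for the real replacement function):

import requests

ssl_mod = requests.packages.urllib3.util.ssl_
conn_mod = requests.packages.urllib3.connection

original = ssl_mod.ssl_wrap_socket

def _patched(*args, **kwargs):
    # stand-in for the real replacement; never called in this check
    return original(*args, **kwargs)

ssl_mod.ssl_wrap_socket = _patched  # patch the module attribute only

print(conn_mod.ssl_wrap_socket is original)  # True: connection.py kept the old binding
print(conn_mod.ssl_wrap_socket is _patched)  # False until the module is reloaded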
So for the monkey patch to work, we need to reload the urllib3.connection module once the patching is done, so that it picks up our patched function.
Below is minimal code showing that the interception works this way:
try:
    reload  # Python 2.7
except NameError:
    try:
        from importlib import reload  # Python 3.4+
    except ImportError:
        from imp import reload  # Python 3.0 - 3.3

_target_host = None  # set this to the SNI hostname you want to send


def _ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None,
                     ca_certs=None, server_hostname=None,
                     ssl_version=None, ciphers=None, ssl_context=None,
                     ca_cert_dir=None):
    print('SHOULD BE PRINTED')
    return _orig_wrap_socket(sock, keyfile=keyfile, certfile=certfile,
                             cert_reqs=cert_reqs, ca_certs=ca_certs,
                             server_hostname=_target_host, ssl_version=ssl_version,
                             ciphers=ciphers, ssl_context=ssl_context,
                             ca_cert_dir=ca_cert_dir)


import requests

_orig_wrap_socket = requests.packages.urllib3.util.ssl_.ssl_wrap_socket
requests.packages.urllib3.util.ssl_.ssl_wrap_socket = _ssl_wrap_socket
reload(requests.packages.urllib3.connection)

res = requests.get("https://www.google.com", verify=True)

Related

How to get a list of MQ queues with pymqi?

I want to get a list of the queues for a specific queue manager. I seem to understand how to do this, but when I try, I get an error.
Traceback (most recent call last):
  File "D:/project/Work-Project/queue list.py", line 23, in <module>
    response = pcf.MQCMD_INQUIRE_Q(args)
  File "C:\Users\ShevcovAA\AppData\Local\Programs\Python\Python37\lib\site-packages\pymqi\__init__.py", line 2769, in __call__
    message = self._pcf.reply_queue.get(None, get_md, get_opts)
  File "C:\Users\ShevcovAA\AppData\Local\Programs\Python\Python37\lib\site-packages\pymqi\__init__.py", line 2021, in get
    raise MQMIError(rv[-2], rv[-1], message=rv[0], original_length=rv[-3])
pymqi.MQMIError: MQI Error. Comp: 2, Reason 2033: FAILED: MQRC_NO_MSG_AVAILABLE
My Code:
import logging
import re

import pymqi

logging.basicConfig(level=logging.INFO)

queue_manager = 'QM1'
channel = 'DEV.APP.SVRCONN'
host = '127.0.0.1'
port = '1414'
conn_info = '%s(%s)' % (host, port)

prefix = "*"
queue_type = pymqi.CMQC.MQQT_LOCAL

args = {pymqi.CMQC.MQCA_Q_NAME: prefix,
        pymqi.CMQC.MQIA_Q_TYPE: queue_type}

qmgr = pymqi.connect(queue_manager, channel, conn_info)

pcf = pymqi.PCFExecute(qmgr)
response = pcf.MQCMD_INQUIRE_Q(args)

for queue_info in response:
    queue_name = queue_info[pymqi.CMQC.MQCA_Q_NAME]
    if (re.match('^SYSTEM', queue_name) or re.match('^AMQ', queue_name) or re.match('^MQ', queue_name)):
        pass
    else:
        q = pymqi.Queue(qmgr, queue_name)
        print(queue_name.strip() + ':' + 'Queue depth:', q.inquire(pymqi.CMQC.MQIA_CURRENT_Q_DEPTH))
        q.close()

qmgr.disconnect()
pymqi v1.12.0 uses different logic to get PCF response messages from the response queue.
By default, a timeout of 5 seconds is used to wait for a response.
As a result, if you have a lot of queues or your queue manager is under heavy load, this may not be enough.
To fix this, you can increase the interval using the response_wait_interval parameter of the PCFExecute constructor:
pcf = pymqi.PCFExecute(qmgr, response_wait_interval=30000) # 30 seconds
v1.11.0 does not have this parameter and uses a default interval of 30 seconds.
Also, avoid opening each queue just to ask for its depth; instead, request the MQIA_CURRENT_Q_DEPTH attribute in the inquiry itself.
In the new notation, supported in v1.12+, it will be something like:
attrs = []  # type: List[pymqi.MQOpts]
attrs.append(pymqi.CFST(Parameter=pymqi.CMQC.MQCA_Q_NAME,
                        String=pymqi.ensure_bytes(prefix)))
attrs.append(pymqi.CFIN(Parameter=pymqi.CMQC.MQIA_Q_TYPE,
                        Value=queue_type))
attrs.append(pymqi.CFIL(Parameter=pymqi.CMQCFC.MQIACF_Q_ATTRS,
                        Values=[pymqi.CMQC.MQIA_CURRENT_Q_DEPTH]))

object_filters = []
# object_filters.append(
#     pymqi.CFIF(Parameter=pymqi.CMQC.MQIA_CURRENT_Q_DEPTH,
#                Operator=pymqi.CMQCFC.MQCFOP_GREATER,
#                FilterValue=0))

response = pcf.MQCMD_INQUIRE_Q(attrs, object_filters)

for queue_info in response:
    queue_name = queue_info[pymqi.CMQC.MQCA_Q_NAME]
    queue_depth = queue_info[pymqi.CMQC.MQIA_CURRENT_Q_DEPTH]
    print('{}: {} message(s)'.format(queue_name.strip().decode(), queue_depth))
I solved this error by simply downgrading: I had pymqi 1.12.0 installed, and now it is pymqi 1.11.0 (for example, pip install pymqi==1.11.0).
My Code:
import pymqi
import date_conn

qmgr = pymqi.connect(date_conn.queue_manager, date_conn.channel, date_conn.conn_info)
pcf = pymqi.PCFExecute(qmgr)
c = 0

attrs = {
    pymqi.CMQC.MQCA_Q_NAME: '*'
}

result = pcf.MQCMD_INQUIRE_Q(attrs)

for queue_info in result:
    queue_name = queue_info[pymqi.CMQC.MQCA_Q_NAME]
    print(queue_name)
    c += 1

print(c)

qmgr.disconnect()

Python, websocket auto-close on some machines

I've written an API to communicate with a website using WebSocketApp. It works fine on only 2 PCs; if I put my code on any other PC, the websocket doesn't receive any messages and closes. I've tried a lot of different machines and operating systems, many versions of Python (including the one that works), and both wireless and wired connections, but nothing changed. There's no error or exception. What can it be?
EDIT: I don't own the website or the server. All the other methods send messages and parse the response in on_socket_message.
import requests
import websocket
import time
from threading import Thread
from datetime import datetime
import json
from position import Position
from constants import ACTIVES


class IQOption():
    practice_balance = 0
    real_balance = 0
    server_time = 0
    positions = {}
    instruments_categories = ["cfd", "forex", "crypto"]
    top_assets_categories = ["forex", "crypto", "fx-option"]
    instruments_to_id = ACTIVES
    id_to_instruments = {y: x for x, y in ACTIVES.items()}
    market_data = {}
    binary_expiration_list = {}
    open_markets = {}
    digital_strike_list = {}
    candle_data = []
    latest_candle = 0
    position_id = 0
    quotes = []
    position_id_list = []

    def __init__(self, username, password, host="iqoption.com"):
        self.username = username
        self.password = password
        self.host = host
        self.session = requests.Session()
        self.generate_urls()
        self.socket = websocket.WebSocketApp(self.socket_url, on_open=self.on_socket_connect, on_message=self.on_socket_message, on_close=self.on_socket_close, on_error=self.on_socket_error)

    def generate_urls(self):
        """Generates Required Urls to operate the API"""
        # https://auth.iqoption.com/api/v1.0/login
        self.api_url = "https://{}/api/".format(self.host)
        self.socket_url = "wss://{}/echo/websocket".format(self.host)
        self.login_url = self.api_url + "v1.0/login"
        self.profile_url = self.api_url + "profile"
        self.change_account_url = self.profile_url + "/" + "changebalance"
        self.getprofile_url = self.api_url + "getprofile"

    def login(self):
        """Login and set Session Cookies"""
        print("LOGIN")
        data = {"email": self.username, "password": self.password}
        self.log_resp = self.session.request(url="https://auth.iqoption.com/api/v1.0/login", data=data, method="POST")
        requests.utils.add_dict_to_cookiejar(self.session.cookies, dict(platform="9"))
        self.__ssid = self.log_resp.cookies.get("ssid")
        print(self.__ssid)
        self.start_socket_connection()
        time.sleep(1)  # artificial delay to complete socket connection
        self.log_resp2 = self.session.request(url="https://eu.iqoption.com/api/getprofile", method="GET")
        ss = self.log_resp2._content.decode('utf-8')
        js_ss = json.loads(ss)
        self.parse_account_info(js_ss)
        self.balance_id = js_ss["result"]["balance_id"]
        self.get_instruments()
        self.get_top_assets()
        self.setOptions()
        # self.getFeatures()
        time.sleep(1)
        print(js_ss["isSuccessful"])
        return js_ss["isSuccessful"]

    def on_socket_message(self, socket, message):
        # do things
        pass

    def on_socket_connect(self, socket):
        """Called on Socket Connection"""
        self.initial_subscriptions()
        print("On connect")

    def initial_subscriptions(self):
        self.send_socket_message("ssid", self.__ssid)
        self.send_socket_message("subscribe", "tradersPulse")

    def on_socket_error(self, socket, error):
        """Called on Socket Error"""
        print(error)

    def on_socket_close(self, socket):
        """Called on Socket Close, does nothing"""

    def start_socket_connection(self):
        """Start Socket Connection"""
        self.socket_thread = Thread(target=self.socket.run_forever)
        self.socket_thread.start()

    def send_socket_message(self, name, msg):
        # print(msg)
        data = {"name": name, "msg": msg}
        self.socket.send(json.dumps(data))
Here is an example running under Gevent Websockets. This makes it ASYNC (which I suspect is part of your problem) and allows for bidirectional communication.
import gevent
from gevent import monkey, signal, Timeout, sleep, spawn as gspawn
monkey.patch_all()
from gevent.pywsgi import WSGIServer
from geventwebsocket.handler import WebSocketHandler
from geventwebsocket import WebSocketError
import bottle
from bottle import get, route, template, request, response, abort, static_file
import ujson as json
#route('/static/<filepath:path>')
def server_static(filepath):
return static_file(filepath, root='static')
#route('/ws/remote')
def handle_websocket():
wsock = request.environ.get('wsgi.websocket')
if not wsock:
abort(400, 'Expected WebSocket request.')
while 1:
try:
message = ''
with Timeout(2, False) as timeout:
message = wsock.receive()
if message:
message = json.loads(message)
if 'command' in message:
r.command(message['command'])
except WebSocketError:
break
except Exception as exc:
print(str(exc))
#get('/')
def remote():
return template('templates/remote.tpl', title='WebsocketTest', websocket=WEBSOCKET, command='command', status=status)
if __name__ == '__main__':
r=None
status="Connecting..."
gspawn(initialize)
print 'Started...'
HOST = socket.gethostbyname(socket.gethostname())
HOST = 'localhost'
WEBSOCKET = 'ws://{}/ws/remote'.format(HOST)
botapp = bottle.app()
server = WSGIServer(("0.0.0.0", 80), botapp, handler_class=WebSocketHandler)
def shutdown():
print('Shutting down ...')
server.stop(timeout=60)
exit(signal.SIGTERM)
gevent.signal(signal.SIGTERM, shutdown)
gevent.signal(signal.SIGINT, shutdown) #CTRL C
server.serve_forever()
Then in your HTML you really should use the reconnecting-websocket library:
https://github.com/joewalnes/reconnecting-websocket
<button id="TRIGGERED" type="button" class="btn btn-outline-primary">TRIGGER</button>
<script type="text/javascript" src="/static/reconnecting-websocket.min.js"></script>
<script>
var ws = new ReconnectingWebSocket('{{websocket}}');
ws.reconnectInterval = 3000;
ws.maxReconnectAttempts = 10;
ws.onmessage = function (evt) {
var wsmsg = JSON.parse(evt.data);
console.log(evt.data)
};
$("button").click(function() {
<!--console.log(this.id);-->
ws.send(JSON.stringify({'{{command}}': this.id}));
});
</script>

Scraping the metadata of 10,000 websites is too slow (Python)

Hi all,
I'm trying to parse the metadata of 10,000 websites into a Pandas dataframe for an SEO / analytics application, but the code is taking ages. I tried it on 1,000 websites and the code has been running for the last 3 hours (it works without problems on 10-50 websites).
Here's the sample data:
index site
0 http://www.google.com
1 http://www.youtube.com
2 http://www.facebook.com
3 http://www.cnn.com
... ...
10000 http://www.sony.com
Here's my Python (2.7) code:
# Importing dependencies
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import metadata_parser

# Loading the Pandas dataframe
df = pd.read_csv('final_urls')

# Utility functions
def meta(website, metadata):
    full_url = website
    parser = metadata_parser.MetadataParser(url=full_url)
    if metadata == 'all':
        return parser.metadata
    else:
        return parser.metadata[metadata]

def meta_all(website):
    try:
        result = meta(website, 'all')
    except BaseException:
        result = 'Exception'
    return result

# Main
df['site'].apply(meta_all)
I'd like the code to be much faster. I've been using the metadata_parser library (https://github.com/jvanasco/metadata_parser), which relies heavily on requests and BeautifulSoup.
I understand I might be able to switch the parser to lxml to make the code faster. It's already installed on my machine, so BeautifulSoup should pick it as the primary choice.
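For reference, this is what forcing the lxml parser looks like when calling BeautifulSoup directly; it is only a sketch of the parser choice, not of metadata_parser's own API (which drives BeautifulSoup internally):

from bs4 import BeautifulSoup

html = "<html><head><title>Example</title></head><body></body></html>"

# Relying on auto-detection picks the "best" installed parser, which can vary
# between machines; naming 'lxml' explicitly removes that ambiguity.
soup = BeautifulSoup(html, 'lxml')
print(soup.title.string)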
Do you have any suggestion to get this code to run faster?
Thanks!
You can use Twisted (an event-driven networking engine written in Python). You will need to install a few packages with pip: twisted, pyopenssl and service_identity, and maybe others. This code works on Python 2.7, which you say you are using.
from twisted.internet import defer, reactor
from twisted.web.client import getPage
import metadata_parser
import pandas as pd
import numpy as np
from multiprocessing import Process

def pageCallback(result, url):
    data = {
        'content': result,
        'url': url,
    }
    return data

def getPageData(url):
    d = getPage(url)
    d.addCallback(pageCallback, url)
    return d

def listCallback(result):
    for isSuccess, data in result:
        if isSuccess:
            print("Call to %s succeeded " % (data['url']))
            parser = metadata_parser.MetadataParser(html=data['content'], search_head_only=False)
            print(parser.metadata)  # do something with it here

def finish(ign):
    reactor.stop()

def start(urls):
    data = []
    for url in urls:
        data.append(getPageData(url))
    dl = defer.DeferredList(data)
    dl.addCallback(listCallback)
    dl.addCallback(finish)

def processStart(chunk):
    start(chunk)
    reactor.run()

df = pd.read_csv('final_urls')
urls = df['site'].values.tolist()

chunkCounter = 0
chunkLength = 1000

for chunk in np.array_split(urls, len(urls)/chunkLength):
    p = Process(target=processStart, args=(chunk,))
    p.start()
    p.join()
    chunkCounter += 1
    print("Finished chunk %s of %s URLs" % (str(chunkCounter), str(chunkLength)))
I have run it on 10,000 URLs and it took less than 16 minutes.
Updated
Normally you would process the data you generated where I added the comment "# do something with it here". If you want the generated data returned back for processing, you can do something like this (I have also updated the code to use treq):
from twisted.internet import defer, reactor
import treq
import metadata_parser
import pandas as pd
import numpy as np
import multiprocessing
from twisted.python import log
import sys

# log.startLogging(sys.stdout)

results = []

def pageCallback(result, url):
    content = result.content()
    data = {
        'content': content,
        'url': url,
    }
    return data

def getPageData(url):
    d = treq.get(url, timeout=60, headers={'User-Agent': ["Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv'\:'57.0) Gecko/20100101 Firefox/57.0"]})
    d.addCallback(pageCallback, url)
    return d

def listCallback(result):
    global results
    for isSuccess, data in result:
        if isSuccess:
            print("Call to %s succeeded " % (data['url']))
            parser = metadata_parser.MetadataParser(html=str(data['content']), search_head_only=False)
            # print(parser.metadata)  # do something with it here
            results.append((data['url'], parser.metadata))

def finish(ign):
    reactor.stop()

def start(urls):
    data = []
    for url in urls:
        data.append(getPageData(url))
    dl = defer.DeferredList(data)
    dl.addCallback(listCallback)
    dl.addCallback(finish)

def processStart(chunk, returnList):
    start(chunk)
    reactor.run()
    returnList.extend(results)

df = pd.read_csv('final_urls')
urls = df['site'].values.tolist()

chunkCounter = 0
chunkLength = 1000

manager = multiprocessing.Manager()
returnList = manager.list()

for chunk in np.array_split(urls, len(urls)/chunkLength):
    p = multiprocessing.Process(target=processStart, args=(chunk, returnList))
    p.start()
    p.join()
    chunkCounter += 1
    print("Finished chunk %s of %s URLs" % (str(chunkCounter), str(chunkLength)))

for res in returnList:
    print(res)

print(len(returnList))
You may also want to add some error handling; to help with that, you can uncomment the line reading "log.startLogging(sys.stdout)", but that is too much detail for one answer. If you get failures for some URLs, I would generally retry them by running the code again with just the failed URLs, possibly a few times if necessary.
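One way to collect the failed URLs for such a retry pass, reusing the structures from the script above (the names succeeded and failed_urls are my own, not part of the original answer):

# after the chunk loop has finished, work out which URLs never produced a result
succeeded = set(url for url, _ in returnList)
failed_urls = [url for url in urls if url not in succeeded]

if failed_urls:
    print("%d URLs failed, retrying..." % len(failed_urls))
    retry_list = manager.list()
    p = multiprocessing.Process(target=processStart, args=(failed_urls, retry_list))
    p.start()
    p.join()
    returnList.extend(retry_list)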

http fuzzing with boofuzz

I was looking for a fuzzing library and I happened to see "boofuzz",
though there are no examples of how to use the library for HTTP fuzzing.
This is the only code I see on their GitHub page, but they say it was taken from Sulley (an older fuzzing library):
import sys
sys.path.insert(0, '../')

from boofuzz.primitives import String, Static, Delim


class Group(object):
    blocks = []

    def __init__(self, name, definition=None):
        self.name = name
        if definition:
            self.definition = definition

    def add_definition(self, definition):
        assert isinstance(definition, (list, tuple)), "Definition must be a list or a tuple!"
        self.definition = definition

    def render(self):
        return "".join([x.value for x in self.definition])

    def exhaust(self):
        for item in self.definition:
            while item.mutate():
                current_value = item.value
                self.log_send(current_value)
                recv_data = self.send_buffer(current_value)
                self.log_recv(recv_data)

    def __repr__(self):
        return '<%s [%s items]>' % (self.__class__.__name__, len(self.definition))

    # noinspection PyMethodMayBeStatic
    def send_buffer(self, current_value):
        return "Sent %s!" % current_value

    def log_send(self, current_value):
        pass

    def log_recv(self, recv_data):
        pass


s_static = Static
s_delim = Delim
s_string = String

CloseHeader = Group(
    "HTTP Close Header",
    definition=[
        # GET / HTTP/1.1\r\n
        s_static("GET / HTTP/1.1\r\n"),
        # Connection: close
        s_static("Connection"), s_delim(":"), s_delim(" "), s_string("close"),
        s_static("\r\n\r\n")
    ]
)

OpenHeader = Group(
    "HTTP Open Header",
    definition=[
        # GET / HTTP/1.1\r\n
        Static("GET / HTTP/1.1\r\n"),
        # Connection: close
        Static("Connection"), Delim(":"), Delim(" "), String("open"),
        Static("\r\n\r\n")
    ]
)

# CloseHeader = Group("HTTP Close Header")
# CloseHeader.add_definition([
#     # GET / HTTP/1.1\r\n
#     s_static("GET / HTTP/1.1\r\n"),
#     # Connection: close
#     s_static("Connection"), s_delim(":"), s_delim(" "), s_string("close"),
#     s_static("\r\n\r\n")
# ])
Why would they post it if it's another library's code? And is there a good explanation of how to work with the boofuzz library?
If you Google "http protocol format", the first result right now is this HTTP tutorial. If you read a few pages there, you can get a pretty good description of the protocol format. Based on that, I wrote the following fuzz script, source code here:
#!/usr/bin/env python
# Designed for use with boofuzz v0.0.9
from boofuzz import *


def main():
    session = Session(
        target=Target(
            connection=SocketConnection("127.0.0.1", 80, proto='tcp')
        ),
    )

    s_initialize(name="Request")
    with s_block("Request-Line"):
        s_group("Method", ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE'])
        s_delim(" ", name='space-1')
        s_string("/index.html", name='Request-URI')
        s_delim(" ", name='space-2')
        s_string('HTTP/1.1', name='HTTP-Version')
        s_static("\r\n", name="Request-Line-CRLF")
    s_static("\r\n", "Request-CRLF")

    session.connect(s_get("Request"))

    session.fuzz()


if __name__ == "__main__":
    main()
Although I got tripped up for a while because I only had one CRLF. After checking RFC 2616 (Section 5), it's pretty clear that this example should end with two CRLFs.
Request       = Request-Line              ; Section 5.1
                *(( general-header        ; Section 4.5
                 | request-header         ; Section 5.3
                 | entity-header ) CRLF)  ; Section 7.1
                CRLF
                [ message-body ]          ; Section 4.3

[...]

Request-Line  = Method SP Request-URI SP HTTP-Version CRLF
Obviously, this fuzz script doesn't come close to covering the whole protocol. Just a few things that could be added:
HTTP headers (there are a lot; see the sketch after this list for one way to start)
Specialized formats for each HTTP Method
A message body (e.g. on POST)
Some way to choose valid URIs for the particular target server
Report warnings based on server response (could get noisy, but server errors do tend to indicate... errors)
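For the first item, a header can be added as another s_block inside the same s_initialize("Request") definition, between the "Request-Line" block and the final CRLF. This is only a sketch using the same boofuzz v0.0.9 primitives as the script above; the header name and value are arbitrary examples:

    # inside main(), after the "Request-Line" block and before the final "\r\n"
    with s_block("Host-Line"):
        s_static("Host", name='Host-Name')
        s_delim(":", name='host-colon')
        s_delim(" ", name='host-space')
        s_string("example.com", name='Host-Value')
        s_static("\r\n", name='Host-Line-CRLF')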

Non blocking python sockets

I'd like to write a small Bluetooth server application for my Nokia phone in PyS60. It needs to be able to send a response to the client's request and be able to push data to the client as well.
option 1:
If I use socket.recv(1024), the program waits until something is received, so the server can't push data to the client. The Python for S60 implementation is missing the socket.settimeout() method, so I couldn't write proper non-blocking code.
option 2:
The socket.makefile() approach looked good, but I couldn't make it work. When I replaced conn.recv(1024) with fd = socket.makefile(); fd.readline(), it didn't read a thing.
option 3:
I looked into the select() function, but had no luck with it. When I changed conn.recv() to r, w, e = select.select([conn], [], []) as has been suggested, the client doesn't even connect. It hangs at "Waiting for the client...". Strange...
I know that there are pretty nice server implementations and asynchronous APIs as well, but I only need really basic stuff here. Thanks in advance!
Here's what I have:
sock = btsocket.socket(btsocket.AF_BT, btsocket.SOCK_STREAM)

channel = btsocket.bt_rfcomm_get_available_server_channel(sock)
sock.bind(("", channel))
sock.listen(1)
btsocket.bt_advertise_service(u"name", sock, True, btsocket.RFCOMM)

print "Waiting for the client..."
conn, client_mac = sock.accept()
print "connected: " + client_mac

while True:
    try:
        data = conn.recv(1024)
        if len(data) != 0:
            print "received [%s]" % data
            if data.startswith("something"):
                conn.send("something\r\n")
            else:
                conn.send("some other data \r\n")
    except:
        pass
It's obviously blocking, so the "some other data" is never sent, but it's the best I've got so far. At least I can send something in reply to the client.
Found the solution finally!
The select function wasn't working with the btsocket module of the newer PyS60 ports.
Someone wrote a new_btsocket (available here) with a working select function.
Here is a simple example based on an echo server
#!/usr/bin/python

import socket
import select

server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.bind(('localhost', 12556))
server.listen(5)

toread = [server]
running = 1

# we will shut down when all clients disconnect
while running:
    rready, wready, err = select.select(toread, [], [])

    for s in rready:
        if s == server:
            # accepting the socket, which the OS passes off to another
            # socket so we can go back to selecting. We'll append this
            # new socket to the read list we select on next pass
            client, address = server.accept()
            toread.append(client)  # select on this socket next time
        else:
            # Not the server's socket, so we'll read
            data = s.recv(1024)
            if data:
                print "Received %s" % (data)
            else:
                print "Client disconnected"
                s.close()
                # remove socket so we don't watch an invalid
                # descriptor, decrement client count
                toread.remove(s)
                running = len(toread) - 1

# clean up
server.close()
That said, I still find SocketServer cleaner and easier: subclass a request handler, implement handle(), and call serve_forever().
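A minimal sketch of that approach using the standard library's SocketServer module ('socketserver' on Python 3); the echo behaviour, host and port are just placeholders:

import SocketServer  # 'socketserver' on Python 3

class EchoHandler(SocketServer.BaseRequestHandler):
    def handle(self):
        # called once per connection; echo until the client disconnects
        while True:
            data = self.request.recv(1024)
            if not data:
                break
            self.request.sendall(data)

server = SocketServer.ThreadingTCPServer(('localhost', 12556), EchoHandler)
server.serve_forever()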
Here's an epoll server implementation (non-blocking):
http://pastebin.com/vP6KPTwH (same thing as below; I felt this might be easier to copy)
Use python epollserver.py to start the server.
Test it using wget localhost:8888
import sys
import socket, select
import fcntl
import email.parser
import StringIO
import datetime

"""
See:
    http://docs.python.org/library/socket.html
"""

__author__ = ['Caleb Burns', 'Ben DeMott']


def main(argv=None):
    EOL1 = '\n\n'
    EOL2 = '\n\r\n'

    response = 'HTTP/1.0 200 OK\r\nDate: Mon, 1 Jan 1996 01:01:01 GMT\r\n'
    response += 'Content-Type: text/plain\r\nContent-Length: 13\r\n\r\n'
    response += 'Hello, world!'

    serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    # Tell the server socket file descriptor to destroy itself when this program ends.
    socketFlags = fcntl.fcntl(serversocket.fileno(), fcntl.F_GETFD)
    socketFlags |= fcntl.FD_CLOEXEC
    fcntl.fcntl(serversocket.fileno(), fcntl.F_SETFD, socketFlags)

    serversocket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    serversocket.bind(('0.0.0.0', 8888))
    serversocket.listen(1)

    # Use asynchronous sockets.
    serversocket.setblocking(0)

    # Allow a queue of up to 128 requests (connections).
    serversocket.listen(128)

    # Listen to socket events on the server socket defined by the above bind() call.
    epoll = select.epoll()
    epoll.register(serversocket.fileno(), select.EPOLLIN)

    print "Epoll Server Started..."

    try:
        # The connection dictionary maps file descriptors (integers) to their corresponding network connection objects.
        connections = {}
        requests = {}
        responses = {}

        while True:
            # Ask epoll if any sockets have events and wait up to 1 second if no events are present.
            events = epoll.poll(1)

            # fileno is a file descriptor.
            # event is the event code (type).
            for fileno, event in events:
                # Check for a read event on the socket because a new connection may be present.
                if fileno == serversocket.fileno():
                    # connection is a new socket object.
                    # address is client IP address. The format of address depends on the address family of the socket (i.e., AF_INET).
                    connection, address = serversocket.accept()

                    # Set new socket-connection to non-blocking mode.
                    connection.setblocking(0)

                    # Listen for read events on the new socket-connection.
                    epoll.register(connection.fileno(), select.EPOLLIN)
                    connections[connection.fileno()] = connection
                    requests[connection.fileno()] = b''
                    responses[connection.fileno()] = response

                # If a read event occurred, then read the new data sent from the client.
                elif event & select.EPOLLIN:
                    requests[fileno] += connections[fileno].recv(1024)

                    # Once we're done reading, stop listening for read events and start listening for EPOLLOUT events
                    # (this will tell us when we can start sending data back to the client).
                    if EOL1 in requests[fileno] or EOL2 in requests[fileno]:
                        epoll.modify(fileno, select.EPOLLOUT)

                        # Print request data to the console.
                        epoll.modify(fileno, select.EPOLLOUT)
                        data = requests[fileno]
                        eol = data.find("\r\n")   # this is the end of the FIRST line
                        start_line = data[:eol]   # get the contents of the first line (which is the protocol information)

                        # method is POST|GET, etc
                        method, uri, http_version = start_line.split(" ")

                        # re-used facebook's httputil library (works well to normalize and parse headers)
                        headers = HTTPHeaders.parse(data[eol:])

                        print "\nCLIENT: FD:%s %s: '%s' %s" % (fileno, method, uri, datetime.datetime.now())

                # If the client is ready to receive data, send it our response.
                elif event & select.EPOLLOUT:
                    # Send response a single bit at a time until the complete response is sent.
                    # NOTE: This is where we are going to use sendfile().
                    byteswritten = connections[fileno].send(responses[fileno])
                    responses[fileno] = responses[fileno][byteswritten:]
                    if len(responses[fileno]) == 0:
                        # Tell the socket we are no longer interested in read/write events.
                        epoll.modify(fileno, 0)

                        # Tell the client we are done sending data and it can close the connection. (good form)
                        connections[fileno].shutdown(socket.SHUT_RDWR)

                # EPOLLHUP (hang-up) events mean the client has disconnected so clean-up/close the socket.
                elif event & select.EPOLLHUP:
                    epoll.unregister(fileno)
                    connections[fileno].close()
                    del connections[fileno]

    finally:
        # Close remaining open socket upon program completion.
        epoll.unregister(serversocket.fileno())
        epoll.close()
        serversocket.close()
#!/usr/bin/env python
#
# Copyright 2009 Facebook
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""HTTP utility code shared by clients and servers."""
class HTTPHeaders(dict):
    """A dictionary that maintains Http-Header-Case for all keys.

    Supports multiple values per key via a pair of new methods,
    add() and get_list(). The regular dictionary interface returns a single
    value per key, with multiple values joined by a comma.

    >>> h = HTTPHeaders({"content-type": "text/html"})
    >>> h.keys()
    ['Content-Type']
    >>> h["Content-Type"]
    'text/html'

    >>> h.add("Set-Cookie", "A=B")
    >>> h.add("Set-Cookie", "C=D")
    >>> h["set-cookie"]
    'A=B,C=D'
    >>> h.get_list("set-cookie")
    ['A=B', 'C=D']

    >>> for (k,v) in sorted(h.get_all()):
    ...    print '%s: %s' % (k,v)
    ...
    Content-Type: text/html
    Set-Cookie: A=B
    Set-Cookie: C=D
    """
    def __init__(self, *args, **kwargs):
        # Don't pass args or kwargs to dict.__init__, as it will bypass
        # our __setitem__
        dict.__init__(self)
        self._as_list = {}
        self.update(*args, **kwargs)

    # new public methods

    def add(self, name, value):
        """Adds a new value for the given key."""
        norm_name = HTTPHeaders._normalize_name(name)
        if norm_name in self:
            # bypass our override of __setitem__ since it modifies _as_list
            dict.__setitem__(self, norm_name, self[norm_name] + ',' + value)
            self._as_list[norm_name].append(value)
        else:
            self[norm_name] = value

    def get_list(self, name):
        """Returns all values for the given header as a list."""
        norm_name = HTTPHeaders._normalize_name(name)
        return self._as_list.get(norm_name, [])

    def get_all(self):
        """Returns an iterable of all (name, value) pairs.

        If a header has multiple values, multiple pairs will be
        returned with the same name.
        """
        for name, list in self._as_list.iteritems():
            for value in list:
                yield (name, value)

    def items(self):
        return [{key: value[0]} for key, value in self._as_list.iteritems()]

    def get_content_type(self):
        return dict.get(self, HTTPHeaders._normalize_name('content-type'), None)

    def parse_line(self, line):
        """Updates the dictionary with a single header line.

        >>> h = HTTPHeaders()
        >>> h.parse_line("Content-Type: text/html")
        >>> h.get('content-type')
        'text/html'
        """
        name, value = line.split(":", 1)
        self.add(name, value.strip())

    @classmethod
    def parse(cls, headers):
        """Returns a dictionary from HTTP header text.

        >>> h = HTTPHeaders.parse("Content-Type: text/html\\r\\nContent-Length: 42\\r\\n")
        >>> sorted(h.iteritems())
        [('Content-Length', '42'), ('Content-Type', 'text/html')]
        """
        h = cls()
        for line in headers.splitlines():
            if line:
                h.parse_line(line)
        return h

    # dict implementation overrides

    def __setitem__(self, name, value):
        norm_name = HTTPHeaders._normalize_name(name)
        dict.__setitem__(self, norm_name, value)
        self._as_list[norm_name] = [value]

    def __getitem__(self, name):
        return dict.__getitem__(self, HTTPHeaders._normalize_name(name))

    def __delitem__(self, name):
        norm_name = HTTPHeaders._normalize_name(name)
        dict.__delitem__(self, norm_name)
        del self._as_list[norm_name]

    def get(self, name, default=None):
        return dict.get(self, HTTPHeaders._normalize_name(name), default)

    def update(self, *args, **kwargs):
        # dict.update bypasses our __setitem__
        for k, v in dict(*args, **kwargs).iteritems():
            self[k] = v

    @staticmethod
    def _normalize_name(name):
        """Converts a name to Http-Header-Case.

        >>> HTTPHeaders._normalize_name("coNtent-TYPE")
        'Content-Type'
        """
        return "-".join([w.capitalize() for w in name.split("-")])


if (__name__ == '__main__'):
    sys.exit(main(sys.argv))
