Benchmarking tool using twisted - python

I am trying to write a web benchmarking tool based on Twisted. Twisted is a fantastic asynchronous framework for web applications. Because I started using this framework only two weeks ago, I have run into a problem. Here it is:
When I compare this benchmarking tool with ApacheBench, the results differ greatly at the same concurrency. Here is the result of my tool:
python pyab.py 50000 50 http://xx.com/a.txt
speed:1063(q/s), worker:50, interval:7, req_made:7493, req_done:7443, req_error:0
And Here is the result of Apache Bench:
ab -c 50 -n 50000 http://xx.com/a.txt
Server Software: nginx/1.4.1
Server Hostname: 203.90.245.26
Server Port: 8080
Document Path: /a.txt
Document Length: 6 bytes
Concurrency Level: 50
Time taken for tests: 6.89937 seconds
Complete requests: 50000
Failed requests: 0
Write errors: 0
Total transferred: 12501750 bytes
HTML transferred: 300042 bytes
Requests per second: 8210.27 [#/sec] (mean)
Time per request: 6.090 [ms] (mean)
Time per request: 0.122 [ms] (mean, across all concurrent requests)
Transfer rate: 2004.62 [Kbytes/sec] received
Connection Times (ms)
min mean[+/-sd] median max
Connect: 0 0 0.8 0 4
Processing: 1 5 3.4 5 110
Waiting: 0 2 3.6 2 109
Total: 1 5 3.5 5 110
Percentage of the requests served within a certain time (ms)
50% 5
66% 6
75% 6
80% 6
90% 7
95% 7
98% 8
99% 8
100% 110 (longest request)
On the same url and concurrency, ApacheBench can go up to 8000 req/sec, while pyab only 1000 req/sec.
Here is my code(pyab.py):
from twisted.internet import reactor,threads
from twisted.internet.protocol import Protocol
from twisted.internet.defer import Deferred
from twisted.web.client import Agent
from twisted.web.client import HTTPConnectionPool
from twisted.web.http_headers import Headers
from twisted.python import log
import time, os, stat, logging, sys
from collections import Counter
# Send stdlib logging output to stdout; only WARNING and above is emitted.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.WARNING,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
# Install Twisted's PythonLoggingObserver so log.msg() calls go through the
# stdlib logging module configured above.
observer = log.PythonLoggingObserver()
observer.start()
class IgnoreBody(Protocol):
    """Body consumer that discards the payload and signals when it ends.

    ``deferred`` is fired with ``None`` from ``connectionLost``.  ``tl`` is
    stored on the instance but is not otherwise used by this class.
    """

    def __init__(self, deferred, tl):
        self.tl = tl
        self.deferred = deferred

    def dataReceived(self, bytes):
        # The payload is intentionally thrown away.
        return None

    def connectionLost(self, reason):
        # Fired whatever ``reason`` is; the reason itself is not inspected.
        self.deferred.callback(None)
class Pyab:
    """ApacheBench-like HTTP GET load generator driven by the Twisted reactor.

    Issues at most ``n`` requests against ``url``, keeping at most
    ``concurrency`` of them in flight, and stops the reactor once every
    issued request has either finished or failed.

    Counters kept in ``self.cnt``:

    * ``worker``    -- requests currently in flight
    * ``req_made``  -- requests issued so far (never exceeds ``n``)
    * ``req_done``  -- requests whose body delivery has ended
    * ``req_error`` -- requests that failed
    """

    def __init__(self, n=50000, concurrency=100, url='http://203.90.245.26:8080/a.txt'):
        """Prepare the agent and connection pool; nothing is sent yet.

        :param n: total number of requests to issue.
        :param concurrency: maximum number of requests in flight; also used
            as the pool's ``maxPersistentPerHost``.
        :param url: URL to fetch with GET.
        """
        self.n = n
        self.url = url
        self.pool = HTTPConnectionPool(reactor, persistent=True)
        self.pool.maxPersistentPerHost = concurrency
        self.agent = Agent(reactor, connectTimeout=5, pool=self.pool)
        self.time_start = time.time()
        self.max_worker = concurrency
        self.keeprunning = False
        # Guards against re-entering readMore() when a request fails
        # synchronously while readMore() is still issuing requests.
        self._filling = False
        # Ensures reactor.stop() is requested only once.
        self._halted = False
        self.cnt = Counter({
            'worker': 0,
            'req_made': 0,
            'req_done': 0,
            'req_error': 0,
        })

    def monitor(self):
        """Log the current counters and reschedule itself one second later."""
        interval = int(time.time() - self.time_start)
        speed = 0
        if interval != 0:
            speed = int(self.cnt['req_done'] / interval)
        log.msg("speed:%d(q/s), worker:%d, interval:%d, req_made:%d, req_done:%d, req_error:%d"
                % (speed, self.cnt['worker'], interval, self.cnt['req_made'],
                   self.cnt['req_done'], self.cnt['req_error']),
                logLevel=logging.WARNING)
        reactor.callLater(1, self.monitor)

    def start(self):
        """Start the periodic monitor and issue the first batch of requests."""
        self.keeprunning = True
        self.monitor()
        self.readMore()

    def stop(self):
        """Abort a started run: no new requests are issued, the reactor is stopped."""
        was_running = self.keeprunning
        self.keeprunning = False
        if was_running:
            self._halt_reactor()

    def _halt_reactor(self):
        """Request reactor shutdown exactly once, even before it is running."""
        if self._halted:
            return
        self._halted = True
        # callWhenRunning defers the stop until the reactor has started, so
        # finishing before reactor.run() (e.g. n <= 0) does not raise.
        reactor.callWhenRunning(reactor.stop)

    def readMore(self):
        """Top up in-flight requests to the concurrency limit, or finish.

        Called once from ``start()`` and again each time a request settles,
        so no timer-based polling is needed.
        """
        if self._filling:
            # A request settled synchronously inside the loop below; that
            # loop will notice the freed slot by itself.
            return
        self._filling = True
        try:
            # Bound by req_made (not req_done) so no more than n requests
            # are ever issued.
            while (self.keeprunning
                   and self.cnt['worker'] < self.max_worker
                   and self.cnt['req_made'] < self.n):
                self.make_request()
        finally:
            self._filling = False
        settled = self.cnt['req_done'] + self.cnt['req_error']
        if not self.keeprunning or settled >= self.n:
            self._halt_reactor()

    def make_request(self):
        """Issue one GET request and wire up its accounting callbacks."""
        d = self.agent.request(
            'GET',
            self.url,
            Headers({'User-Agent': ['Twisted Web Client Example']}),
            None)
        self.cnt['worker'] += 1
        self.cnt['req_made'] += 1

        def cbResponse(resp):
            log.msg('response received')
            finished = Deferred()
            resp.deliverBody(IgnoreBody(finished, self))
            # Returning the Deferred makes the chain wait for the body, so
            # the slot is only freed once body delivery has ended.
            return finished

        def cbDone(_):
            self.cnt['worker'] -= 1
            self.cnt['req_done'] += 1

        def cbError(error):
            self.cnt['worker'] -= 1
            self.cnt['req_error'] += 1
            log.msg(error, logLevel=logging.ERROR)

        d.addCallback(cbResponse)
        # addCallbacks runs exactly one of the two, so the in-flight count
        # is decremented once per request.
        d.addCallbacks(cbDone, cbError)
        d.addBoth(lambda _: self.readMore())
if __name__ == '__main__':
    if len(sys.argv) < 4:
        # sys.exit(message) writes the message to stderr and exits with
        # status 1, so wrappers can tell misuse from a successful run.
        sys.exit("Usage: %s <n> <concurrency> <url>" % (sys.argv[0]))
    ab = Pyab(n=int(sys.argv[1]), concurrency=int(sys.argv[2]), url=sys.argv[3])
    ab.start()
    reactor.run()
Is there anything wrong with my code? Thanks!

When I last used it, ab was known to have dozens of serious bugs. Sometimes that would cause it to report massively inflated results. Sometimes it would report negative results. Sometimes it would crash. I'd try another tool, like httperf, as a sanity check.
However, if your server is actually that fast, then you might have another issue.
Even if ab has been fixed, you're talking here about a C program versus a Python program running on CPython. 8x slower than C in Python is not actually all that bad, so I don't expect there is actually anything wrong with your program, except that it doesn't make use of spawnProcess and multi-core concurrency.
For starters, see if you get any better results on PyPy.

Related

Creating a Delayed Response in a Python 2 HTTP Server

I'm creating a very simple HTTP server in python 2 for testing.
I would like to randomly delay by a fixed amount the reply to a GET request without closing the connection to the server or shutting down the server first.
Here is the current code I would like to modify:
# Only runs under Python 2
import BaseHTTPServer
import time
from datetime import datetime
class SimpleRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Answer every GET with the canned text returned by ``modes``."""

    def do_GET(self):
        """Send a well-formed 200 response whose body is ``modes(self.path)``."""
        print("incoming request: " + self.path)
        body = modes(self.path)
        # Let the base class write the status line and headers instead of
        # hand-writing them ("HTTP-1.0 200 Okay" is not a valid status line).
        self.send_response(200)
        self.send_header('Content-Type', 'text/plain')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)
def run(server_class=BaseHTTPServer.HTTPServer,
        handler_class=SimpleRequestHandler):
    """Create a server bound to ``('', 80)`` and serve requests forever.

    :param server_class: class instantiated with ``(address, handler_class)``.
    :param handler_class: request handler class handed to the server.
    """
    httpd = server_class(('', 80), handler_class)
    httpd.serve_forever()
def modes(argument):
    """Return a canned reply chosen by the seconds field of the current time.

    ``argument`` is accepted but not used.  Seconds 0-19 give ``"reply1"``,
    20-39 give ``"reply2"`` and 40-59 give ``"reply3"``.
    """
    second = datetime.now().second
    if second >= 40:
        return "reply3"
    if second >= 20:
        return "reply2"
    return "reply1"
# Start serving immediately with the default server and handler classes.
run()
The delay would be added every, say, 3rd or 4th time a GET is received, but when the delay expires, a properly formed response would be sent.
Thanks.

Tornado AsyncHTTPClient performance degradation

Setup: Python 2.7.15, Tornado 5.1
I have a web-server machine that handles ~40 /recommend requests per second.
The average response time is 25ms, but there's a big divergence (some requests can take more than 500ms).
Each request generates between 1-8 Elasticsearch queries (HTTP requests) internally.
Each Elasticsearch query can take between 1-150ms.
The Elasticsearch requests are handled synchronously via elasticsearch-dsl library.
The goal is to reduce the i/o waiting time (queries to Elasticsearch) and handle more requests per second so I can reduce the number of machines.
One thing is unacceptable - I don't want to increase the average handle time (25ms).
I found some tornado-elasticsearch implementations on the web, but since I need to use only one endpoint to Elasticsearch (/_search) I am trying to do that alone.
Below there's a degenerated implementation of my web-server. With the same load (~40 request per second) the average request response time increased to 200ms!
Digging in, I see that the internal async handling time (queries to Elasticsearch) is not stable: the time each fetch call takes varies, and the total average (in the ab load test) is high.
I'm using ab to simulate the load and measure it internally by printing the current fetch handle time, average fetch handle time and maximum handle time.
When doing one request at a time (concurrency 1):
ab -p es-query-rcom.txt -T application/json -n 1000 -c 1 -k 'http://localhost:5002/recommend'
my prints looks like: [avg req_time: 3, dur: 3] [current req_time: 2, dur: 3] [max req_time: 125, dur: 125] reqs: 8000
But when I try to increase the concurrency (up to 8): ab -p es-query-rcom.txt -T application/json -n 1000 -c 8 -k 'http://localhost:5002/recommend'
now my prints looks like: [avg req_time: 6, dur: 13] [current req_time: 4, dur: 4] [max req_time: 73, dur: 84] reqs: 8000
The average req is now x2 slower (or x4 by my measurements)!
What do I miss here? why do I see this degradation?
web_server.py:
import tornado
from tornado.httpclient import AsyncHTTPClient
from tornado.options import define, options
from tornado.httpserver import HTTPServer
from web_handler import WebHandler
# Name shown in the startup message printed by main().
SERVICE_NAME = 'web_server'
# Value passed to HTTPServer.start() in RcomService.start().
NUM_OF_PROCESSES = 1
class Statistics(object):
    """Mutable bag of running totals and maxima, all starting at zero."""

    def __init__(self):
        # Running totals.
        self.total_requests = self.total_requests_time = self.total_duration = 0
        # Largest single values seen so far.
        self.max_time = self.max_duration = 0
class RcomService(object):
    """Builds and runs the Tornado application serving ``/recommend``."""

    def __init__(self):
        """Configure the shared async HTTP client and create the statistics holder."""
        print('initializing RcomService...')
        # NOTE(review): max_clients=3 presumably caps the number of concurrent
        # outgoing fetches, with further fetches waiting in the client -- confirm
        # against the Tornado docs when comparing "dur" with "req_time" under load.
        AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient", max_clients=3)
        self.stats = Statistics()

    def start(self, port):
        """Define the ``port`` option, bind the HTTP server and run the IOLoop.

        :param port: default value for the ``port`` command-line option.
        """
        # Import the submodules explicitly instead of relying on another
        # module having imported them as a side effect.
        import tornado.ioloop
        import tornado.web

        define("port", default=port, type=int)
        db = self.get_db(self.stats)
        routes = self.generate_routes(db)
        app = tornado.web.Application(routes)
        http_server = HTTPServer(app, xheaders=True)
        http_server.bind(options.port)
        http_server.start(NUM_OF_PROCESSES)
        tornado.ioloop.IOLoop.current().start()

    @staticmethod
    def generate_routes(db):
        """Return the route table; ``db`` is passed to ``WebHandler.initialize``."""
        return [
            (r"/recommend", WebHandler, dict(db=db))
        ]

    @staticmethod
    def get_db(stats):
        """Wrap ``stats`` in the dict shape that ``WebHandler.initialize`` reads."""
        return {
            'stats': stats
        }
def main():
    """Create the service and run it on port 5002."""
    port = 5002
    # Apply the format arguments explicitly; print() does not interpolate
    # %-placeholders from extra arguments.
    print('starting %s on port %s' % (SERVICE_NAME, port))
    rcom_service = RcomService()
    rcom_service.start(port)


if __name__ == '__main__':
    main()
web_handler.py:
import time
import ujson
from tornado import gen
from tornado.gen import coroutine
from tornado.httpclient import AsyncHTTPClient
from tornado.web import RequestHandler
class WebHandler(RequestHandler):
    """POST handler that runs a batch of search queries and reports timings."""

    def initialize(self, db):
        """Keep a reference to the shared statistics object from ``db['stats']``."""
        self.stats = db['stats']

    @coroutine
    def post(self, *args, **kwargs):
        """Run eight empty dummy queries and write their results as ``{'res': [...]}``."""
        result = yield self.wrapper_innear_loop([{}, {}, {}, {}, {}, {}, {}, {}])  # dummy queries (empty)
        self.write({
            'res': result
        })

    @coroutine
    def wrapper_innear_loop(self, queries):
        """Fetch every query one after another and return the list of parsed bodies."""
        result = []
        for q in queries:  # queries are performed serially
            res = yield self.async_fetch_gen(q)
            result.append(res)
        raise gen.Return(result)

    @coroutine
    def async_fetch_gen(self, query):
        """POST ``query`` to the search endpoint, update the stats, return the parsed body.

        Two timings in milliseconds are recorded per fetch: ``duration`` is
        the wall-clock time measured around the ``yield``, ``request_time``
        is what the HTTP client reports on the response.
        """
        url = 'http://localhost:9200/my_index/_search'
        headers = {
            'Content-Type': 'application/json',
            'Connection': 'keep-alive'
        }
        http_client = AsyncHTTPClient()
        start_time = int(round(time.time() * 1000))
        response = yield http_client.fetch(url, method='POST', body=ujson.dumps(query), headers=headers)
        end_time = int(round(time.time() * 1000))
        duration = end_time - start_time
        body = ujson.loads(response.body)
        request_time = int(round(response.request_time * 1000))

        stats = self.stats
        stats.total_requests += 1
        stats.total_requests_time += request_time
        stats.total_duration += duration
        if stats.max_time < request_time:
            stats.max_time = request_time
        if stats.max_duration < duration:
            stats.max_duration = duration
        duration_avg = stats.total_duration / stats.total_requests
        time_avg = stats.total_requests_time / stats.total_requests
        print("[avg req_time: " + str(time_avg) + ", dur: " + str(duration_avg) +
              "] [current req_time: " + str(request_time) + ", dur: " + str(duration) + "] [max req_time: " +
              str(stats.max_time) + ", dur: " + str(stats.max_duration) + "] reqs: " +
              str(stats.total_requests))
        raise gen.Return(body)
I tried to play a bit with the async class (Simple vs curl), the max_clients size, but I don't understand what is the best tune in my case.
But
Increased time may be because with concurrency==1, CPU was under-utilized and with c==8 it's 100%+ utilized and is unable to catch up with all requests. Example, abstract CPU can process 1000 operations/sec, to send a request it takes 50 CPU ops and to read a request result it takes 50 CPU ops too. When you have 5 RPS your CPU is 50% utilized and average request time is 50 ms (to send a req.) + request time + 50 ms (to read a req.). But when you have, for example, 40 RPS (8 times more than 5 RPS), your CPU would be over-utilized by 400% and some finished requests would be waiting to be parsed, so average request time now is 50 ms + request time + CPU wait time + 50 ms.
To sum up, my advice would be to check CPU utilization under both loads and, to be sure, to profile how much time it takes to send a request and parse a response; the CPU may be your bottleneck.

Python Tornado rate limiting AsyncHttpClient fetch

I am currently using an API that rate-limits me to 3000 requests per 10 seconds. I have 10,000 URLs that are fetched using Tornado due to its asynchronous I/O nature.
How do I go about implementing a rate limit to reflect the API limit?
from tornado import ioloop, httpclient
# Number of fetches that have been queued but not yet answered.
i = 0


def handle_request(response):
    """Print the response code; stop the IOLoop when the pending count hits zero."""
    global i
    print(response.code)
    i -= 1
    if i != 0:
        return
    ioloop.IOLoop.instance().stop()
http_client = httpclient.AsyncHTTPClient()
# Use a context manager so the URL file is closed as soon as it has been read.
with open('urls.txt') as url_file:
    for url in url_file:
        url = url.strip()
        if not url:
            continue  # a blank line is not a URL
        i += 1
        http_client.fetch(url, handle_request, method='HEAD')
# Only run the loop when something was queued; with nothing pending no
# response would ever arrive to stop it.
if i:
    ioloop.IOLoop.instance().start()
You can check where the value of i lies relative to the interval of 3000 requests. For example, if i is between 3000 and 6000, you can set a timeout of 10 seconds on every request until 6000. After 6000, just double the timeout. And so on.
import functools  # needed for functools.partial below

# Use the name the question's code actually imports ("from tornado import
# ioloop, httpclient"); a bare AsyncHTTPClient is not bound there.
http_client = httpclient.AsyncHTTPClient()
timeout = 10
interval = 3000
loop = ioloop.IOLoop.current()
# Use a context manager so the URL file is closed once it has been read.
with open('urls.txt') as url_file:
    for url in url_file:
        i += 1
        fetch = functools.partial(http_client.fetch, url.strip(), handle_request, method='GET')
        if i <= interval:
            # i is at most 3000:
            # fetch right away, without any delay
            fetch()
            continue  # skip the rest of the loop
        if i % interval == 1:
            # i is now 3001, or 6001, and so on ...
            timeout += timeout  # double the timeout for the next 3000 calls
        loop.call_later(timeout, fetch)
Note: I only tested this code with small number of requests. It might be possible that the value of i would change because you're subtracting i in handle_request function. If that's the case, you should maintain another variable similar to i and perform subtraction on that.

Redis: # of channels degrading latency. How to prevent degradation?

pub.py
import redis
import datetime
import time
import json
import sys
import threading
import gevent
from gevent import monkey
monkey.patch_all()
def main(chan):
    """Publish a JSON package carrying the current time to ``chan`` every second, forever."""
    redis_host = '10.235.13.29'
    r = redis.client.StrictRedis(host=redis_host, port=6379)

    def getpkg():
        return {'time': time.time(), 'signature': 'content'}

    while True:
        # test 2: complex data
        now = json.dumps(getpkg())
        # send it
        r.publish(chan, now)
        print('Sending {0}'.format(now))
        print('data type is %s' % type(now))
        time.sleep(1)
def zerg_rush(n):
    """Start ``n`` daemon threads, each running ``main`` with its index as the channel."""
    for chan in range(n):
        publisher = threading.Thread(target=main, args=(chan,))
        publisher.daemon = True
        publisher.start()
if __name__ == '__main__':
    num_of_chan = 10
    stop_cnt = 21
    zerg_rush(num_of_chan)
    # Keep the main thread alive for the daemon publishers: print 'Waiting'
    # every 30 seconds and exit on the stop_cnt-th round.
    cnt = 0
    while True:
        print('Waiting')
        cnt += 1
        if cnt != stop_cnt:
            time.sleep(30)
            continue
        sys.exit(0)
sub.py
import redis
import threading
import time
import json
import gevent
from gevent import monkey
monkey.patch_all()
def callback(ind):
    """Subscribe to channel ``str(ind)`` and log the latency of every message.

    Each published payload is expected to be JSON carrying a ``time`` field
    holding the sender's ``time.time()``.  The latency and the running
    average are appended to ``logs/sub_<ind>``.
    """
    redis_host = '10.235.13.29'
    r = redis.client.StrictRedis(host=redis_host, port=6379)
    sub = r.pubsub()
    sub.subscribe(str(ind))
    avg = 0
    tot = 0
    latency_sum = 0  # named so the builtin sum() is not shadowed
    file_name = 'logs/sub_%s' % ind
    while True:
        for m in sub.listen():
            # Skip anything that is not a published message (for example the
            # subscribe confirmation, whose data is not JSON).  Unlike a
            # one-shot "first message" flag this stays correct if listen()
            # is ever restarted.
            if m['type'] != 'message':
                continue
            got_time = time.time()
            decoded = json.loads(m['data'])
            sent_time = float(decoded['time'])
            dur = got_time - sent_time
            tot += 1
            latency_sum += dur
            avg = latency_sum / tot
            print(decoded)
            # The context manager closes the log file even if a write fails.
            with open(file_name, 'a') as f:
                f.write('processing no. %s' % tot)
                f.write('it took %s' % dur)
                f.write('current avg: %s\n' % avg)
def zerg_rush(n):
    """Start ``n`` daemon threads, each running ``callback`` with its index."""
    for index in range(n):
        listener = threading.Thread(target=callback, args=(index,))
        listener.daemon = True
        listener.start()
def main():
    """Start ten subscriber threads, then keep the main thread alive forever."""
    channel_count = 10
    zerg_rush(channel_count)
    # The subscribers are daemon threads, so the main thread has to stay alive.
    while True:
        print('Waiting')
        time.sleep(30)


if __name__ == '__main__':
    main()
I am testing redis pubsub to replace the use of rsh to communicate with remote boxes.
One of the things I have tested for was the number of channels affecting latency of publish and pubsub.listen().
Test: One publisher and one subscriber per channel (each publisher publishes once per second). I incremented the number of channels and observed the latency (the duration from the moment the publisher publishes a message to the moment the subscriber gets the message via listen).
num of chan--------------avg latency in seconds
10:----------------------------------0.004453
50:----------------------------------0.005246
100:---------------------------------0.0155
200:---------------------------------0.0221
300:---------------------------------0.0621
Note: tested on a 2 CPU + 4 GB RAM + 1 NIC RHEL 6.4 VM.
What can I do to maintain low latency with high number of channels?
Redis is single-threaded, so adding more CPUs won't help. Maybe more RAM? If so, how much more?
Anything I can do code-wise or bottleneck is in Redis itself?
Maybe the limitation comes from the way my test codes are written with threading?
EDIT:
Redis Cluster vs ZeroMQ in Pub/Sub, for horizontally scaled distributed systems
Accepted answer says "You want to minimize latency, I guess. The number of channels is irrelevant. The key factors are the number of publishers and number of subscribers, message size, number of messages per second per publisher, number of messages received by each subscriber, roughly. ZeroMQ can do several million small messages per second from one node to another; your bottleneck will be the network long before it's the software. Most high-volume pubsub architectures therefore use something like PGM multicast, which ZeroMQ supports."
From my testings, i dont know if this is true. (The claim that the number of channels is irrelevant)
For example, i did a testing.
1) One channel. 100 publishers publishing to a channel with 1 subscriber listening. Publisher publishing one second at a time. latency was 0.00965 seconds
2) Same testing except 1000 publishers. latency was 0.00808 seconds
Now during my channel testing:
300 channels with 1 pub - 1 sub resulted in 0.0621 and this is only 600 connections which is less than above testing yet significantly slow in latency

Network traffic monitor with pcapy in python

I have written a simple network traffic monitor to get the transfer rate in B/s and/or the total data transferred (in B). However, when I test it by transferring a file with FTP (using Total Commander), I just cannot make it measure the total transfer as the size of the file. It always gives a much lower size than the actual one.
I am not sure if I am doing something wrong..
The BPF filter I set is
dst <IP of ftp server pc>
Below is my source code :
import threading
import sys
import pcapy
import time
import logging as logger
class NetMonitor(threading.Thread):
    """Background thread that measures traffic matching a BPF filter.

    While ``active`` is true the thread keeps sampling: each ``update()``
    drains the capture for one sampling interval and refreshes
    ``total_transfer`` (bytes), ``current_rate`` and ``avg_rate`` (B/s).
    """

    _timeout = 1  # length of one sampling interval, in seconds

    @classmethod
    def get_net_interfaces(cls):
        """Return the capture device names reported by pcapy."""
        return pcapy.findalldevs()

    def __init__(self, device, bpf_filter):
        """Open ``device`` for live capture and install ``bpf_filter``.

        :param device: capture device name, as listed by ``get_net_interfaces``.
        :param bpf_filter: BPF filter expression selecting packets to count.
        """
        threading.Thread.__init__(self)
        self.active = True
        self._net_monitor = pcapy.open_live(device, 65535, 0, 1000)
        self._net_monitor.setfilter(bpf_filter)
        self._current_bytes_rate = 0
        self.total_transfer = 0  # total number of bytes transferred
        # The next two fields are used to compute the average B/s rate.
        self._tmp_bytes_per_sec_sum = 0  # sum of the per-interval B/s values
        self._inc = 0  # number of completed sampling intervals
        self._dispatch_bytes_sum = 0  # bytes seen during the current interval

    def __handle_packet(self, header, data):
        """Per-packet callback invoked by ``dispatch``; accumulates packet sizes."""
        # Count the length recorded in the packet header rather than the
        # number of captured bytes, so truncated captures are not under-counted.
        self._dispatch_bytes_sum += header.getlen()

    def update(self):
        """Sample one interval and return ``(bytes_per_second, packets_seen)``.

        The capture is drained repeatedly until ``_timeout`` seconds have
        passed.  Sleeping between single dispatch calls would leave packets
        queued in the capture buffer, where they can be dropped, and would
        make the "per second" assumption wrong.
        """
        self._dispatch_bytes_sum = 0
        packets_nr = 0
        started = time.time()
        deadline = started + self._timeout
        while True:
            handled = self._net_monitor.dispatch(-1, self.__handle_packet)
            packets_nr += handled
            now = time.time()
            if now >= deadline or not self.active:
                break
            if not handled:
                # Nothing was pending; yield briefly so the loop cannot spin
                # if dispatch returns without waiting.
                time.sleep(0.01)
        elapsed = now - started
        self.total_transfer += self._dispatch_bytes_sum
        self._inc += 1
        # Divide by the real elapsed time: the last dispatch may overrun the
        # nominal interval.
        if elapsed > 0:
            self._current_bytes_rate = self._dispatch_bytes_sum / elapsed
        else:
            self._current_bytes_rate = 0
        self._tmp_bytes_per_sec_sum += self._current_bytes_rate
        logger.debug('inc:{}, current rate: {} B/s, avg rate: {} B/s, total:{} B'.format(
            self._inc, self.current_rate, self.avg_rate, self.total_transfer))
        return self._current_bytes_rate, packets_nr

    def get_avg_bytes_rate(self):
        """Return the mean of the per-interval rates, or 0 before the first interval."""
        if self._inc:
            return self._tmp_bytes_per_sec_sum / self._inc
        else:
            return 0

    def get_current_bytes_rate(self):
        """Return the rate measured during the most recent interval."""
        return self._current_bytes_rate

    def run(self):
        """Thread body: keep sampling until ``active`` is set to false."""
        while self.active:
            self.update()

    # average B/s rate
    avg_rate = property(get_avg_bytes_rate)
    # current B/s rate
    current_rate = property(get_current_bytes_rate)
if __name__ == '__main__':
    # Validate and parse the arguments before the capture thread starts, so
    # a bad invocation cannot leave a running thread behind.
    if len(sys.argv) < 2:
        sys.exit("Usage: %s <seconds> [bpf filter ...]" % sys.argv[0])
    duration = int(sys.argv[1])
    bpf_filter = ' '.join(sys.argv[2:])
    print(bpf_filter)
    nm1 = NetMonitor(pcapy.findalldevs()[1], bpf_filter)
    nm1.start()
    try:
        start_time = time.time()
        while time.time() - start_time < duration:
            print("current {} B/s, avg {} B/s, total transfer {} B".format(
                nm1.current_rate, nm1.avg_rate, nm1.total_transfer))
            time.sleep(1)
    finally:
        # Always stop the monitor thread, even if the loop is interrupted.
        nm1.active = False
        nm1.join()
    print("++++++ total: {}, avg: {}".format(nm1.total_transfer, nm1.avg_rate))
Any advice is much appreciated.
Cheers.
Use a filter to capture only the useful tcp streams, ftp-data :
port ftp-data
I also suggest capturing in promiscuous mode, and only the packet headers (you don't need the full data to know the length):
open_live( device, 4096, True, 100 )
In your handler, it is correct to use header.getlen().

Categories