timing a python function timeit vs time.clock disparity

timing a python function timeit vs time.clock disparity - python

import time
import logging
from functools import reduce
logging.basicConfig(filename='debug.log', level=logging.DEBUG)
def read_large_file(file_object):
    """Lazily yield the lines of ``file_object`` one at a time.

    Calls ``file_object.readline()`` repeatedly and stops at the first
    empty result, so nothing is yielded for an empty file.
    """
    while True:
        data = file_object.readline()
        if not data:
            break
        yield data
def process_file_1(file_path):
    """Read ``file_path`` through ``read_large_file`` and log every line.

    Each line is written with ``logging.debug``. If opening or reading the
    file raises ``IOError``/``OSError`` a message is printed and the
    function returns normally.
    """
    try:
        with open(file_path) as fp:
            for line in read_large_file(fp):
                logging.debug(line)
    except (IOError, OSError):
        print('Error Opening or Processing file')
def process_file_2(file_path):
    """Read ``file_path`` by calling ``next()`` on the file object and log every line.

    Each line is written with ``logging.debug``. ``StopIteration`` marks the
    end of the file and is swallowed; ``IOError``/``OSError`` prints a
    message and the function returns normally.
    """
    try:
        # Open the parameter, not the module-level ``path`` global that the
        # original referenced by mistake.
        with open(file_path) as file_handler:
            while True:
                logging.debug(next(file_handler))
    except (IOError, OSError):
        print("Error opening / processing file")
    except StopIteration:
        pass
if __name__ == "__main__":
    path = "TB_data_dictionary_2016-04-15.csv"

    # time.clock() was removed in Python 3.8; time.perf_counter() is the
    # replacement for measuring short intervals.
    # Each figure printed below is the MEAN duration of one call, averaged
    # over the nine iterations of range(1, 10).
    l1 = []
    for i in range(1, 10):
        start = time.perf_counter()
        process_file_1(path)
        end = time.perf_counter()
        l1.append(end - start)
    avg = sum(l1) / len(l1)
    print('processing time (with generators) {}'.format(avg))

    l2 = []
    for i in range(1, 10):
        start = time.perf_counter()
        process_file_2(path)
        end = time.perf_counter()
        l2.append(end - start)
    avg = sum(l2) / len(l2)
    print('processing time (with iterators) {}'.format(avg))
Output of the program:
C:\Python34\python.exe C:/pypen/data_structures/generators/generators1.py
processing time (with generators) 0.028033358176432314
processing time (with iterators) 0.02699498330810426
Another approach that I used is as follows.
def wrapper(func, *args, **kwargs):
    """Bind ``args``/``kwargs`` to ``func`` and return a zero-argument callable.

    The returned callable invokes ``func(*args, **kwargs)`` and returns its
    result, which makes it suitable for ``timeit.timeit``.
    """
    def wrapped():
        return func(*args, **kwargs)
    return wrapped
if __name__ == "__main__":
    import timeit  # not imported anywhere in the visible snippet

    path = "TB_data_dictionary_2016-04-15.csv"

    # timeit.timeit() returns the TOTAL time for all ``number`` calls
    # (100 here), not the mean time of one call.
    wrapped = wrapper(process_file_1, path)
    t = timeit.timeit(wrapped, number=100)
    print('processing time (with generators) {}'.format(t))

    wrapped = wrapper(process_file_2, path)
    t = timeit.timeit(wrapped, number=100)
    print('processing time (with iterators) {}'.format(t))
This gives me a different result.
C:\Python34\python.exe C:/pypen/data_structures/generators/generators1.py
processing time (with generators) 3.0999817624283916
processing time (with iterators) 3.2149597826018304
I would expect the implementation that uses generators to be faster than the one that uses iterators (which is what I get when I use timeit).
Why am I not getting the same result with the other method?

Related

Multiprocessing with conditional process spawning in python

I have a single-threaded function that I would like to parallelize. The code is a bit too complex to show you, but here is a model of its behaviour:
R = list(range(4))
def compute(val):
    """Return 2 when ``sum(val)`` is divisible by 4, otherwise 1.

    Each divisibility check that passes is followed by a 0.5 s ``sleep``
    standing in for an expensive operation, so an odd sum returns
    immediately, a sum divisible by 2 but not 4 costs one sleep, and a sum
    divisible by 4 costs two.
    """
    res = sum(val)
    if res % 2 != 0:  # first condition on res
        return 1
    sleep(0.5)  # expensive operation
    if res % 4 != 0:  # second condition on res
        return 1
    sleep(0.5)  # expensive operation
    return 2
def f_single(idx, val):
    """Recursively walk ``R`` from ``idx`` and count the explored paths.

    At every index the current ``R[idx]`` is appended to a copy of ``val``
    and passed to ``compute``. A result of 1 continues along a single path;
    any other result follows the same continuation twice and adds the two
    counts. Reaching ``len(R)`` counts as one finished path.
    """
    if idx == len(R):
        return 1
    val = val + [R[idx]]
    ret = compute(val)
    if ret == 1:
        return f_single(idx + 1, val)
    # DISJUNCT: both branches continue from the same state.
    return f_single(idx + 1, val) + f_single(idx + 1, val)
Basically, each recursion performs an update of the variable val, and I may need to perform a double call under certain conditions depending on the result of compute(val), which is an expensive computation.
(Sidenote: this implementation does not scale up to large lists, since I will hit a stack overflow quite rapidly; the multiprocessing effort is also an excuse to rewrite this code).
Ideally, I would like to spawn a new process for computing the new call to f_single.
I started refactoring the code like this:
# list of indices
def process_idxs():
    """Return the list of valid indices into ``R`` (``0 .. len(R) - 1``)."""
    return list(range(len(R)))
def are_two_path(idx, val):
    """Tell whether extending ``val`` with ``R[idx]`` opens two paths.

    Returns ``False`` when ``compute`` yields 1 (only one path) and
    ``True`` for any other result (two paths available). ``val`` itself is
    not modified; a new list is built for the call.
    """
    val = val + [R[idx]]
    return compute(val) != 1
# Driver for the (unfinished) parallel attempt: runs the recursive version
# once, then submits one are_two_path job per index to a process pool.
if __name__ == '__main__':
ret = f_single(0,[])
now = time()
idxs = process_idxs()
# start a job when the job queue is not full
# when the job is complete, return the results (solvers with call stacks)
# add the new results to the job queue
# program terminates when the job queue is done
# TODO: how to do this?
with ProcessPoolExecutor(max_workers=12) as executor:
for idx in idxs:
# NOTE(review): `val` is never assigned in this block, so this call
# presumably raises NameError as written -- confirm the intended value.
f = executor.submit(are_two_path, idx, val)
# f.result() waits for the job just submitted, so the jobs effectively
# run one after another rather than in parallel.
print(f.result())
print("multi proc: ", time()-now, "s")
I don't know how to write my parallelization routine so that it returns the same value as f_single (the last few lines are an attempt to do just that).
When looking at concurrent.futures and multiprocessing, I did not find an easy way to collect the results of the computation for the current index, conditionally spawn a process, and proceed to the next recursion while passing the updated value of val.
I don't have any shared state, except R which is read-only so it shouldn't be an issue here.
Do you have any suggestions or guides on how to convert f_single to a multiprocessing function?

A possible way to do it is to do the following:
import os
from time import time, sleep
from multiprocessing import Queue, Process
from queue import Empty
R = list(range(16))
NUMBER_OF_PROCESSES = 32
TIMEOUT = 1
def compute(val):
    """Return 2 when ``sum(val)`` is divisible by 4, otherwise 1.

    Each divisibility check that passes is followed by a 1 s ``sleep``
    standing in for an expensive operation, so an odd sum returns
    immediately and a sum divisible by 4 costs two sleeps.
    """
    res = sum(val)
    if res % 2 != 0:  # first condition on res
        return 1
    sleep(1)  # expensive operation
    if res % 4 != 0:  # second condition on res
        return 1
    sleep(1)  # expensive operation
    return 2
def are_two_path(idx, val):
    """Tell whether extending ``val`` with ``R[idx]`` opens two paths.

    Returns ``False`` when ``compute`` yields 1 (only one path) and
    ``True`` otherwise (two paths available). The caller's ``val`` list is
    left untouched.
    """
    val = val + [R[idx]]
    return compute(val) != 1
def worker(q, r, start_val, start_idx):
    """Walk ``R`` iteratively from ``start_idx``, starting with ``start_val``.

    Intended to run in its own process. For every index a task
    ``(idx + 1, extended_val)`` is put on ``q`` whenever ``are_two_path``
    reports a second path, so that another worker can follow it. When the
    end of ``R`` is reached a ``1`` is put on ``r`` to record one finished
    path.

    Args:
        q: tasks queue receiving ``(next_idx, val)`` tuples.
        r: results queue receiving one item per finished path.
        start_val: list of values accumulated before ``start_idx``.
        start_idx: first index of ``R`` this worker handles.
    """
    val = start_val
    for idx in range(start_idx, len(R) + 1):
        if idx == len(R):
            r.put(1)
        else:
            extended = val + [R[idx]]
            if are_two_path(idx, val):
                q.put((idx + 1, extended))
            # This worker keeps following the first path itself.
            val = extended
def overseer():
    """Spawn worker processes until the tasks queue runs dry.

    Runs in the initial process: creates the tasks and results queues,
    starts a first worker at index 0 with an empty value list, then keeps
    turning queued tasks into new worker processes while fewer than
    ``NUMBER_OF_PROCESSES`` are counted as running. The loop ends when no
    task arrives within the ``get`` timeout.

    Returns:
        The results queue; it holds one item per finished path.
    """
    tasks = Queue()
    results = Queue()

    init_p = Process(target=worker,
                     args=(tasks, results, [], 0))
    init_p.start()
    working = 1
    completed_last_cycle = 0

    while True:
        completed_tasks = results.qsize()
        if working < NUMBER_OF_PROCESSES:
            # if there is enough room in the working queue,
            # spawn a new process and add it
            try:
                # NOTE(review): the module-level TIMEOUT constant is not
                # used here; the literal 5 is. Confirm which is intended.
                (idx, val) = tasks.get(timeout=5)
            except Empty:
                break
            p = Process(target=worker, args=(tasks, results, val, idx))
            p.start()
            working += 1
        if completed_tasks > completed_last_cycle:
            # if some processes terminated during last cycle,
            # update the working counter
            working -= (completed_tasks - completed_last_cycle)
            completed_last_cycle = completed_tasks

    tasks.close()
    tasks.join_thread()
    results.close()
    results.join_thread()
    return results
def test():
    """Run ``overseer`` and print how many results it collected."""
    res = overseer()
    print("Number of case splits: ", res.qsize())
if __name__ == '__main__':
    # Time one full run of the multi-process version.
    now = time()
    test()
    print("multi proc: ", time()-now, "s")

Python iterator that precomputes return (enqueue)

I have a python iterator that solves a time-consuming task each iteration. It would be nice if the return values of the iterator could be precomputed in the background, such that when the iterator is called, the result can be yielded right away.
eg
import numpy as np
def sample_iterator(size=int(1e8)):
    """Yield, forever, the mean of ``size`` fresh uniform random samples.

    Args:
        size: number of samples drawn per yielded value. The default keeps
            the original deliberately slow workload of 1e8 samples.
    """
    while True:
        x = np.random.rand(size).mean()
        yield x

Here is an iterator (precomputing_iterator) that takes an iterator (sample_iterator) as input. precomputing_iterator precomputes the return values of sample_iterator. When precomputing_iterator is created, the precomputation of the return values of sample_iterator is started right away. The return values are stored in a multiprocessing.Queue object. If there are values in the queue, precomputing_iterator can yield them right away.
from multiprocessing import Process, Queue
import numpy as np
import time
def sample_iterator(size=int(1e8)):
    """Yield, forever, the mean of ``size`` fresh uniform random samples.

    Args:
        size: number of samples drawn per yielded value. The default keeps
            the original deliberately slow workload of 1e8 samples.
    """
    while True:
        x = np.random.rand(size).mean()
        yield x
def precomputing_iterator(iterator, maxsize=5):
    """Yield the items of ``iterator`` while a child process computes ahead.

    A child process pulls items from ``iterator`` and pushes them onto a
    bounded queue; this generator yields them in order. Up to ``maxsize``
    items are buffered ahead of the consumer.

    Unlike the original, a finite ``iterator`` ends the generator instead
    of blocking forever on an empty queue, and the child is a daemon so an
    abandoned generator does not keep the interpreter alive.

    NOTE(review): the child needs access to ``iterator`` and to the nested
    ``enqueue`` function; presumably this only works with the "fork" start
    method -- verify on platforms that spawn.

    Args:
        iterator: source iterator consumed in the child process.
        maxsize: maximum number of precomputed items held in the queue.
    """
    def enqueue(q):
        # Runs in the child. Every item is tagged with a "done" flag so the
        # parent can tell a real value from the end-of-stream marker.
        try:
            for item in iterator:
                q.put((False, item))
        finally:
            q.put((True, None))

    q = Queue(maxsize=maxsize)
    p = Process(target=enqueue, args=(q,))
    p.daemon = True
    p.start()
    while True:
        done, item = q.get()
        if done:
            p.join()
            return
        yield item
# Demo: the first value has to be computed on demand; after a pause the
# next one is already waiting in the queue.
i1 = sample_iterator()
i2 = precomputing_iterator(i1)
t = time.time()
next(i2)  # next() works on Python 2.6+ and 3; the .next() method is Python 2 only
print("execution time:", time.time() - t)
time.sleep(3)
t = time.time()
next(i2)
print("execution time:", time.time() - t)
Here for me the first execution time is 1.4 seconds (queue is empty. No return values precomputed). The second execution time is 0.00031 seconds (the precomputed result is just returned)

Python multiprocessing pool is slower than sequential

I have used a multiprocessing Pool to get some performance benefit over my sequential approach. However, the result is just the opposite: the Pool takes more time than the sequential version:
import multiprocessing as mp
import datetime
class A:
    """Benchmark harness comparing a process pool with a sequential loop."""

    def __init__(self):
        # Filled by log_result with one entry per finished chunk.
        self.result_list = []

    # parallel processing function
    def foo_pool(self, data):
        """Multiply the first element of every row in ``data`` by 10, in place, and return ``data``."""
        for d in data:
            d[0] = d[0] * 10
        return data

    # sequential function
    def foo_seq(self, data):
        """Multiply the first element of the single row ``data`` by 10, in place, and return it."""
        data[0] = data[0] * 10
        return data

    def log_result(self, result):
        """Store ``result`` as one entry of ``result_list``."""
        # This is called whenever foo_pool(i) returns a result.
        self.result_list.append(result)

    def apply_async_with_callback(self):
        """Build 100000 rows, process them in chunks of 1000 on a pool of 8, and print timings."""
        pool = mp.Pool(8)

        # Data Creation
        lst = []
        for i in range(100000):
            lst.append([i, i + 1, i + 2])

        print('length of data ', len(lst))
        dtStart = datetime.datetime.now()
        print('start time:', str(datetime.datetime.now()))

        # Multiprocessing takes 2 secs
        for data in self.chunks(lst, 1000):
            pool.apply_async(self.foo_pool, args=(data,),
                             callback=self.log_result)

        # Sequential. It is 10x faster than pool
        # for d in lst:
        #     self.result_list.extend([self.foo_seq(d)])

        pool.close()
        pool.join()
        # NOTE: result_list holds one entry per chunk, not per row.
        print('output data length:', len(self.result_list))

        dtEnd = datetime.datetime.now()
        print('end time:', str(datetime.datetime.now()))
        print('Time taken:', str(dtEnd - dtStart))

    # Divide big data into chunks
    def chunks(self, data, n):
        """Yield consecutive slices of ``data`` of length ``n`` (the last may be shorter)."""
        for i in range(0, len(data), n):
            res = data[i:i + n]
            yield res
if __name__ == '__main__':
    # Run the pool benchmark once.
    a = A()
    a.apply_async_with_callback()
In the above Python code, in apply_async_with_callback(), if you un-comment the sequential code and run it, the result is about 10 times faster than the multiprocessing Pool code.
Can someone help me understand what I am doing wrong?
Edit:
After applying the code provided in Why is multiprocessed code in given code taking more time than usual sequential execution?
sequential is now only 2 times faster than parallel processing code. Updated code below:
import multiprocessing as mp
import datetime
class A:
    """Benchmark harness timing ``Pool.map`` over pre-chunked data."""

    def __init__(self):
        self.result_list = []

    # parallel processing function
    def foo_pool(self, data):
        """Replace the first element of every row with ``x*10.0 + 10*(x/100)``, in place, and return ``data``."""
        for d in data:
            d[0] = d[0] * float(10) + 10 * (float(d[0]) / 100)
        return data

    def log_result(self, result):
        """Store ``result`` as one entry of ``result_list``."""
        # This is called whenever foo_pool(i) returns a result.
        self.result_list.extend([result])

    def flatten(self, ll):
        """Concatenate the lists in ``ll`` into one flat list."""
        lst = []
        for l in ll:
            lst.extend(l)
        return lst

    def square(self, x):
        """Return ``x`` squared."""
        return x * x

    def squareChunk(self, chunk):
        """Apply ``foo_pool`` to one chunk; this is the function mapped over the pool."""
        return self.foo_pool(chunk)  # [self.foo_pool(x) for x in chunk]

    def apply_async_with_callback(self):
        """Build 1000000 rows, map them in chunks of 10000 over a pool of 2, and print timings."""
        # Data Creation
        lst = []
        for i in range(1000000):
            lst.append([i, i + 1, i + 2])

        print('length of data ', len(lst))

        chunked = self.chunks(lst, 10000)  # split original list in decent sized chunks
        pool = mp.Pool(2)
        dtStart = datetime.datetime.now()
        print('start time:', str(datetime.datetime.now()))

        results = self.flatten(pool.map(self.squareChunk, chunked))

        pool.close()
        pool.join()
        print('output data length:', len(results))

        dtEnd = datetime.datetime.now()
        print('end time:', str(datetime.datetime.now()))
        print('multi proc Time taken:', str(dtEnd - dtStart))

    def chunks(self, l, n):
        """Return a generator of consecutive slices of ``l`` of length ``n`` (at least 1)."""
        n = max(1, n)
        return (l[i:i + n] for i in range(0, len(l), n))
if __name__ == '__main__':
    # Run the Pool.map benchmark once.
    a = A()
    a.apply_async_with_callback()
I can see the difference between using Pool.map and Pool.apply_async. The code is faster now: earlier it was 10 times slower than sequential, now it is 2 times slower. But it is still slower.
Is this how multiprocessing behaves? Then what is the point of using multiprocessing? Or am I still doing something wrong?

Multiprocessing code hangs when input list is beyond a certain length

I've been playing with the multiprocessing module to gain a better understanding of it from the implementation side. The code below does the following, first serially and then in parallel:
A set of random numbers are generated. Each number is used as a constant in an exponential function. The goal is to find, for each random number, a scalar needed such that the integral of the exponential function is 20.
The code below seems to work. However, once the value of num is set to 500, the code will just hang and I have no idea why. For what it's worth, this is on a Windows machine with everything running in Spyder.
from scipy import optimize as op
from scipy.integrate import trapz as intg
import numpy as np
import multiprocessing as mp
import random
import timeit
import time
def to_solve(a=None, x=None, y=None):
    """Return the integral of ``a*y`` over ``x`` minus 20.

    The integral is computed with ``intg``. The root of this function in
    ``a`` is the scalar that makes the integral equal to 20.
    """
    return intg(a*y, x) - 20
def worker(lst, x, out_q):
    """Solve ``to_solve`` for every constant in ``lst`` and put the results on ``out_q``.

    Builds one ``(len(lst), 2)`` array whose first column is the constant
    ``a`` and whose second column is the root found by ``op.newton``
    starting from 1, then puts that single array on ``out_q``.
    """
    ans = np.zeros(shape=(len(lst), 2))
    for i, a in enumerate(lst):
        y = func(a=a, x=x)
        ans[i, 0] = a
        ans[i, 1] = op.newton(func=to_solve, x0=1, args=(x, y))
    out_q.put(ans)
def func(a=None, x=None):
    """Return ``1 - exp(-a*x)``, evaluated with NumPy."""
    return 1 - np.exp(-a*x)
def main_p(nums):
    """Solve for every constant in ``nums`` using two worker processes.

    ``nums`` is split into ``num_procs`` contiguous slices (the last slice
    takes the remainder), one ``worker`` process is started per slice, and
    the per-process result arrays are stacked and sorted by their first
    column. The elapsed time is printed.

    Returns:
        Array with one row ``[a, root]`` per input, sorted by ``a``.
    """
    start = timeit.default_timer()
    x = np.linspace(0, 100)
    procs = []
    out_q = mp.Queue()
    num_procs = 2
    step = int(len(nums) / num_procs)
    last = 0
    for i in range(num_procs):
        first = last
        last = first + step
        if i == num_procs - 1:
            out = nums[first:]
        else:
            out = nums[first:last]
        p = mp.Process(target=worker, args=(out, x, out_q))
        procs.append(p)
        p.start()

    # Drain the queue BEFORE joining. A child that has put a large object
    # on a Queue does not exit until that data has been consumed, so
    # joining first deadlocks once the result arrays get big enough.
    parts = [out_q.get() for _ in procs]
    for p in procs:
        p.join()

    results = np.vstack(parts)
    results = results[results[:, 0].argsort()]
    print(timeit.default_timer() - start)
    return results
def main_s(nums):
    """Solve for every constant in ``nums`` sequentially.

    Prints the elapsed time and returns an array with one row
    ``[a, root]`` per input, sorted by ``a``.
    """
    start = timeit.default_timer()
    results = np.zeros(shape=(len(nums), 2))
    x = np.linspace(0, 100)
    for i, a in enumerate(nums):
        results[i, 0] = a
        y = func(a=a, x=x)
        results[i, 1] = op.newton(func=to_solve, x0=1, args=(x, y))
    results = results[results[:, 0].argsort()]
    print(timeit.default_timer() - start)
    return results
if __name__ == '__main__':
    # Draw `num` random constants and solve them serially, then in parallel.
    num = 400
    nums = np.random.rand(1, num)
    nums = nums.tolist()[0]
    a = main_s(nums)
    b = main_p(nums)

The object you are putting on the Queue is too large. The worker process will not terminate until the buffer feeding the Queue is empty, which in turn will not happen until the parent process reads from the queue, which only happens after the join(), which is waiting for the worker -> deadlock.
Here is the description:
"An example which will deadlock is the following."
It follows that the problem goes away if you move the loop
for p in procs:
p.join()
after the for i in range(len(procs)): - loop.

Decorator to time specific lines of the code instead of whole method?

Lets assume a simple method :
def test_method():
    """Return ``(sum(range(1, 10000)), sum(range(10000, 20000)))``."""
    a = 1
    b = 10000
    c = 20000
    sum1 = sum(range(a, b))
    sum2 = sum(range(b, c))
    return (sum1, sum2)
To time this method using a decorator, a simple decorator would be :
from functools import wraps
def timed_decorator(f):
    """Decorate ``f`` so each call logs its duration in milliseconds.

    The wrapper forwards all arguments, returns ``f``'s result unchanged,
    and writes ``f``'s name and the elapsed time to ``logger.debug``.
    ``functools.wraps`` keeps ``f``'s name and docstring on the wrapper
    (the extracted text had the decorator mangled into a comment).
    """
    @wraps(f)
    def wrapper(*args, **kwds):
        start = time.time()
        result = f(*args, **kwds)
        elapsed = (time.time() - start)*1000
        logger.debug("f::{0} t::{1:0.2f} ms".format(f.__name__, elapsed))
        return result
    return wrapper
Now if I want to time specific lines of test_method say line 4 sum1 = sum(range(a,b)) , the current implementation involves inline coding like:
def test_method():
a = 1
b = 10000
c = 20000
start = time.time()
sum1 = sum(range(a,b)) # timing specific line or lines
elapsed = (time.time() - start)*1000
logger.debug("This part took::{1:0.2f} ms".format(elapsed))
sum2 = sum(range(b,c))
return (sum1,sum2)
The intention is to use the decorator to time lines M to N of a specific method without modifying the code in the method.
Is it possible to inject such logic using a decorator ?

You can use a context manager.
import contextlib
#contextlib.contextmanager
def time_measure(ident):
tstart = time.time()
yield
elapsed = time.time() - tstart
logger.debug("{0}: {1} ms".format(ident, elapsed))
In your code, you use it like
# Only the statements inside the with-block are timed.
with time_measure('test_method:sum1'):
    sum1 = sum(range(a, b))
By the way, if you want to improve your code, you can use the Gaussian Sum Formula (explained here) instead of sum(range(a, b)).
def sum_range(a, b):
    """Return ``sum(range(a, b))`` for integers in constant time.

    Uses the closed form ``b*(b-1)/2 - a*(a-1)/2``. Integer division keeps
    the result an exact ``int`` (``/`` returns a float on Python 3 and
    loses precision for large values). An empty range (``b <= a``) sums
    to 0, matching ``sum(range(a, b))``.
    """
    if b <= a:
        return 0
    r_a = (a ** 2 + a) // 2 - a  # == sum(range(a)) for a >= 0
    r_b = (b ** 2 + b) // 2 - b
    return r_b - r_a

Very simple solution with a custom context manager:
class elapsed:
    """Context manager that prints the time spent in its ``with`` body in ms."""

    def __enter__(self):
        self.start = time.time()

    def __exit__(self, *args):
        print("%.1f ms" % ((time.time() - self.start)*1000))
Example usage:
with elapsed():
    sum1 = sum(x ** 2 for x in range(1, 1000000))
# 547.0 ms
More about this: Decorator-like syntax for a specific line of code
Another solution: here is a slight variation of @NiklasR's answer that uses print instead of a logger, with a ready-to-run example:
import contextlib, time
@contextlib.contextmanager
def time_measure(ident):
    """Context manager that prints how long its ``with`` body took.

    On exit, ``ident`` and the elapsed wall-clock time in milliseconds are
    printed. The time is printed even when the body raises; the exception
    then propagates as usual.
    """
    tstart = time.time()
    try:
        yield
    finally:
        # time.time() differences are seconds; scale to match the "ms" label.
        elapsed = (time.time() - tstart) * 1000
        print("{0}: {1} ms".format(ident, elapsed))


with time_measure('hello'):
    sum1 = sum(x ** 2 for x in range(1, 1000000))
# hello: 577.033281326294 ms

One way I can think of is to use sys.settrace() and record time when handling "line" event in the tracer function. But one caveat is, the practice of setting a tracer may cause the time recorded to be inaccurate.
The general idea is:
Set a tracer function in the decorator that wraps the target method.
Get the line number for the first line of this method, with FLN = inspect.currentframe().f_lineno.
In the tracer function, handle "call" event and return a local tracer function to trace the "line" events in the scope. Read this if you are confused.
Within the local tracer function, get the current line number LN,
if LN-FLN == M, record the start time; if LN-FLN == N, record the end time, the time taken to execute lines M to N is endtime - starttime.
code:
import sys
from functools import wraps
import time
import linecache
_func_name_ = None
_func_ln_ = 0
_start_ = 0
_end_ = 0
_timestamp_ = 0
def trace_calls(frame, event, arg):
    """Global trace function for ``sys.settrace`` that times lines of one function.

    Only "call" events for the function named by the module-level
    ``_func_name_`` are handled; for those, the entry line number is
    stored in ``_func_ln_`` and the local ``trace_lines`` tracer is
    returned. The local tracer starts the clock on the line at offset
    ``_start_`` from the entry line and reports the elapsed time on the
    line at offset ``_end_``.
    """
    global _func_name_, _func_ln_

    def trace_lines(frame, event, arg):
        global _timestamp_
        if event != 'line':
            return
        line_no = frame.f_lineno
        filename = frame.f_code.co_filename
        if line_no - _func_ln_ == _start_:
            _timestamp_ = time.time()
            print("%d %s TS:%d" % (line_no, linecache.getline(filename, line_no)[:-1], _timestamp_))
        elif line_no - _func_ln_ == _end_:
            # _timestamp_ is reused: it now holds the elapsed seconds.
            _timestamp_ = time.time() - _timestamp_
            print("%d %s" % (line_no, linecache.getline(filename, line_no)[:-1]))
            print("Lines %d to %d of %s takes %d seconds." % (_start_, _end_, _func_name_, _timestamp_))

    if event != 'call':
        return
    co = frame.f_code
    func_name = co.co_name
    if func_name != _func_name_:
        return
    # Record the entry line only for the target function. The original
    # stored it before the name check, so a call to any other function
    # overwrote the reference line and shifted the offsets.
    _func_ln_ = frame.f_lineno
    return trace_lines
def time_lines(start, end):
    """Decorator factory: time lines ``start`` to ``end`` of the decorated function.

    Stores the line offsets in the module-level ``_start_``/``_end_`` and
    returns a decorator whose wrapper installs ``trace_calls`` via
    ``sys.settrace`` for the duration of each call.

    Changes from the original: the wrapper returns the wrapped function's
    result, always removes the tracer (even if the call raises), and
    keeps the function's metadata through ``functools.wraps``.
    """
    global _start_, _end_
    _start_, _end_ = start+1, end+2  # function name takes a line, end is inclusive

    def inner(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            global _func_name_
            _func_name_ = f.__name__
            sys.settrace(trace_calls)
            try:
                return f(*args, **kwargs)
            finally:
                sys.settrace(None)
        return wrapper
    return inner
@time_lines(2, 4)
def tested_func():
    """Example target: lines 2 to 4 are the three sleeps below."""
    print("Enter target function")
    time.sleep(2)
    time.sleep(1)
    time.sleep(3)
    print("Exit target function")
if __name__=="__main__":
tested_func()

It's pretty ugly and not very stable code, but the only way I found to do this task is to exec the code of the function again, after injecting your code.
Something like this:
import inspect
import re
import time
def inject_timer(f, n, m):
    """Redefine ``f`` with timing code wrapped around its source lines ``n`` to ``m``.

    The source of ``f`` is fetched with ``inspect``, a start timestamp is
    inserted before line ``n`` and an elapsed-time ``print`` after line
    ``m`` (line 0 is the ``def`` line), and the patched source is executed
    in this module's globals, which rebinds ``f``'s name there.

    Changes from the original: ``exec`` uses the call form valid on both
    Python 2 and 3, the source is dedented so nested functions can be
    patched, and the function name is injected as a quoted string (it was
    injected bare, so the function object's repr was printed).

    NOTE(review): this executes source text; only use it on functions
    whose source you trust.
    """
    import textwrap

    source = "".join(inspect.getsourcelines(f)[0])
    codelines = textwrap.dedent(source).splitlines(True)
    ident_lvl = re.search(r"^[ \t]*", codelines[n]).group(0)
    codelines.insert(n, ident_lvl + "start_longJibrishTo_preventCollision = time.time()\n")
    # Line m sits at index m+1 after the first insert, so m+2 is just after it.
    codelines.insert(m+2, ident_lvl + "elapsed_longJibrishTo_preventCollision = (time.time() - start_longJibrishTo_preventCollision)*1000\n")
    codelines.insert(m+3, ident_lvl + """print("f::{0} t::{1:0.2f} ms".format(""" + repr(f.__name__) + """, elapsed_longJibrishTo_preventCollision))\n""")
    exec("".join(codelines), globals())
def test_method():
    """Example target: a 2 s sleep followed by two range sums."""
    a = 1
    b = 10000
    time.sleep(2)
    c = 20000
    sum1 = sum(range(a, b))
    sum2 = sum(range(b, c))
    return (sum1, sum2)


# Counting the def line as 0, lines 3-5 are the sleep, c = 20000 and sum1.
inject_timer(test_method, 3, 5)

A decorator can only decorate callables (e.g. functions, methods, classes). A single line or a group of lines are not callable as long as you do not wrap them in their own callable.
For timing a unit of your code you should choose an appropriate number of repetitions. The goal is to make sure that the execution time is longer than just a few micro or milliseconds, otherwise the measurement error will be too large.
Did you have a look at the timeit module?

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

timing a python function timeit vs time.clock disparity - python

Related

Multiprocessing with conditional process spawning in python

Python iterator that precomputes return (enqueue)

Python multiprocessing pool is slower than sequential

Multiprocessing code hangs when input list is beyond a certain length

Decorator to time specific lines of the code instead of whole method?

Categories

Resources