cpython vs cython vs numpy array performance - python

I am running some performance tests on a variant of the prime-number generator from http://docs.cython.org/src/tutorial/numpy.html.
The below performance measures are with kmax=1000
Pure Python implementation, running in CPython: 0.15s
Pure Python implementation, running in Cython: 0.07s
def primes(kmax):
    """Return a list of the first ``kmax`` prime numbers.

    Each candidate ``n`` is trial-divided by the primes collected so far and
    is appended only when none of them divides it.  A ``kmax`` of zero or
    less yields an empty list.
    """
    p = []  # primes found so far
    k = 0   # number of primes found (always equal to len(p))
    n = 2   # next candidate
    while k < kmax:
        i = 0
        # Skip every known prime that does not divide n.
        while i < k and n % p[i] != 0:
            i = i + 1
        # Running off the end of the list means no known prime divides n.
        if i == k:
            p.append(n)
            k = k + 1
        n = n + 1
    return p
Pure Python+Numpy implementation, running in CPython: 1.25s
import numpy
def primes(kmax):
    """Return a NumPy integer array holding the first ``kmax`` prime numbers.

    Same trial-division loop as the list version, but the primes are written
    into a pre-allocated ``numpy.empty(kmax, dtype=int)`` array and read back
    element by element.
    """
    p = numpy.empty(kmax, dtype=int)  # slots 0..k-1 hold the primes found so far
    k = 0
    n = 2
    while k < kmax:
        i = 0
        # Skip every stored prime that does not divide n.
        while i < k and n % p[i] != 0:
            i = i + 1
        # No stored prime divides n: record it in the next free slot.
        if i == k:
            p[k] = n
            k = k + 1
        n = n + 1
    return p
Cython implementation using int*: 0.003s
from libc.stdlib cimport malloc, free
def primes(int kmax):
    """Return a Python list of the first ``kmax`` primes, using a C ``int`` scratch buffer.

    The primes are kept in a malloc'ed ``int`` array for the trial division
    and mirrored into a Python list that is returned to the caller.

    Raises MemoryError if the scratch buffer cannot be allocated.
    """
    cdef int n, k, i
    cdef int *p
    # Nothing to compute; also avoids passing a negative size to malloc.
    if kmax <= 0:
        return []
    p = <int *>malloc(kmax * sizeof(int))
    if p == NULL:
        raise MemoryError()
    result = []
    try:
        k = 0
        n = 2
        while k < kmax:
            i = 0
            while i < k and n % p[i] != 0:
                i = i + 1
            if i == k:
                p[k] = n
                k = k + 1
                result.append(n)
            n = n + 1
    finally:
        # Release the buffer even if list.append raises.
        free(p)
    return result
The above performs great but looks horrible, as it holds two copies of the data... so I tried reimplementing it:
Cython + Numpy: 1.01s
import numpy as np
cimport numpy as np
cimport cython
# Runtime dtype of the result array and the C-level typedef used for the locals.
# np.int_ replaces the bare np.int alias, which no longer exists in NumPy >= 1.24.
DTYPE = np.int_
ctypedef np.int_t DTYPE_t


@cython.boundscheck(False)
def primes(DTYPE_t kmax):
    """Return an ndarray holding the first ``kmax`` primes.

    The loop counters are C integers, but ``p`` is declared as a plain
    ``np.ndarray`` with no element type or dimensionality.  This is the
    declaration the question asks about; it is intentionally left as posted.
    """
    cdef DTYPE_t n, k, i
    cdef np.ndarray p = np.empty(kmax, dtype=DTYPE)
    k = 0
    n = 2
    while k < kmax:
        i = 0
        while i < k and n % p[i] != 0:
            i = i + 1
        if i == k:
            p[k] = n
            k = k + 1
        n = n + 1
    return p
Questions:
why is the numpy array so incredibly slower than a python list, when running on CPython?
what did I do wrong in the Cython+Numpy implementation? cython is obviously NOT treating the numpy array as an int[] as it should.
how do I cast a numpy array to a int*? The below doesn't work
# NOTE(review): `numpy.nparray` is not the name of NumPy's array class (`ndarray`); kept verbatim as the asker's non-working attempt.
cdef numpy.nparray a = numpy.zeros(100, dtype=int)
# Reinterprets the array's data pointer as `int *`.
# NOTE(review): an answer further down says dtype=int is a C long, so the element width may not match a C int - confirm.
cdef int * p = <int *>a.data

# One-dimensional typed memoryview over p, with DTYPE_t elements.
cdef DTYPE_t [:] p_view = p
Using this instead of p in the calculations reduced the runtime from 580 ms down to 2.8 ms for me. That is about exactly the same runtime as the implementation using int*, and that's about the maximum you can expect from this.
# Runtime dtype of the result array and the C-level typedef used for the locals.
# np.int_ replaces the bare np.int alias, which no longer exists in NumPy >= 1.24.
DTYPE = np.int_
ctypedef np.int_t DTYPE_t


@cython.boundscheck(False)
def primes(DTYPE_t kmax):
    """Return an ndarray holding the first ``kmax`` primes.

    All element reads and writes go through ``p_view``, a typed
    one-dimensional memoryview over ``p``; the ndarray itself is only
    allocated and returned.
    """
    cdef DTYPE_t n, k, i
    cdef np.ndarray p = np.empty(kmax, dtype=DTYPE)
    cdef DTYPE_t [:] p_view = p
    k = 0
    n = 2
    while k < kmax:
        i = 0
        while i < k and n % p_view[i] != 0:
            i = i + 1
        if i == k:
            p_view[k] = n
            k = k + 1
        n = n + 1
    return p

why is the numpy array so incredibly slower than a python list, when running on CPython?
Because you didn't fully type it. Use
# Buffer-typed declaration: states the element type and the number of dimensions of p.
# NOTE(review): the buffer dtype normally names a C-level typedef such as np.int_t - confirm this compiles as written.
cdef np.ndarray[dtype=np.int, ndim=1] p = np.empty(kmax, dtype=DTYPE)
how do I cast a numpy array to a int*?
By using np.intc as the dtype, not np.int (which is a C long). That's
# Allocates with np.intc so the array's element type corresponds to the C `int` named in the buffer declaration.
cdef np.ndarray[dtype=int, ndim=1] p = np.empty(kmax, dtype=np.intc)
(But really, use a memoryview, they're much cleaner and the Cython folks want to get rid of the NumPy array syntax in the long run.)

Best syntax I found so far:
import numpy
cimport numpy
cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
def primes(int kmax):
    """Return a 32-bit integer ndarray holding the first ``kmax`` primes.

    ``p`` is declared with the buffer syntax ``numpy.ndarray[int]`` and
    allocated with ``dtype=numpy.int32``; bounds checking and negative-index
    wraparound are switched off for this function by the decorators.
    """
    cdef int n, k, i
    cdef numpy.ndarray[int] p = numpy.empty(kmax, dtype=numpy.int32)
    k = 0
    n = 2
    while k < kmax:
        i = 0
        while i < k and n % p[i] != 0:
            i = i + 1
        if i == k:
            p[k] = n
            k = k + 1
        n = n + 1
    return p
Note where I used numpy.int32 instead of int. Anything on the left side of a cdef is a C type (thus int = int32 and float = float32), while anything on the RIGHT side of it (or outside of a cdef) is a python type (int = int64 and float = float64)

Related

Mean of numbers in Cython

I'm new to Cython, and have made a mean.pyx file with
#cython: language_level=3
def cmean(list arr, int length):
    """Return the mean of the first ``length`` items of ``arr``.

    Both the running sum and each element are held in C ``float`` variables,
    so the arithmetic is carried out in that type rather than in Python
    floats.
    """
    cdef float running_total
    cdef float item
    running_total = 0
    for idx in range(length):
        item = arr[idx]
        running_total += item
    running_total /= length
    return running_total
I then call this from a Python file main.py:
import pyximport
pyximport.install()
from mean import cmean
# Compare the Cython mean against the pure-Python mean of the same data.
arr = [1, 2, 4]
cres = cmean(arr, len(arr))
pyres = sum(arr) / len(arr)
# Print both results, then whether they compare equal.
for value in (cres, pyres, cres == pyres):
    print(value)
which outputs
2.3333332538604736
2.3333333333333335
False
Why are the results not the same?
I'm using Cython==0.29.30 and Python 3.9.2

Internal function int, decreases the size when converting from str to int in Python

This is my code:
import trng
# 2 ** 8 == 256: used by rn() as the bound for the modulus draw and as the iteration cap.
bitn: int = 2 ** (2 ** 3)
# Scales q by trng.randbelow(1001) / 1000; not referenced by the rest of the code shown here.
w: "function" = lambda q=1: q * (trng.randbelow(1001) / 1000)
def rn(x0: int = None):
    """Return one character picked from the decimal digits of a generated sum.

    A sequence ``x -> (a * x + c) % m`` is iterated from ``x0`` with randomly
    drawn ``m``, ``a`` and ``c``.  The collected values are summed, the sum is
    rendered as a decimal string, and ``trng.choice`` picks one character of
    that string.

    x0: optional seed value; drawn with ``trng.randbelow(m)`` when ``None``.

    NOTE(review): assumes ``trng.randbelow(n)`` returns an int in ``[0, n)``
    and ``trng.choice`` picks one element of a sequence - confirm against trng.
    """
    # Modulus: draws of 0 or 1 are replaced by 2.
    if (m := trng.randbelow(bitn)) <= 1:
        m = 2
    # Both values are drawn as in the original; c is then redrawn below.
    a, c = (trng.randbelow(m) for _ in range(2))
    if (c := trng.randbelow(m)) <= 1:
        c = 2
    if x0 is None:
        x0 = trng.randbelow(m)
    xs = [x0]
    for k in range(1, bitn):
        x = (a * xs[k - 1] + c) % m
        # Stop once the next value is already present more than once.
        if xs.count(x) > 1:
            break
        xs.append(x)
    return trng.choice(str(sum(xs)))
def sm():
    """Return the int obtained by joining 64 characters from ``rn()``.

    The 64 characters are concatenated into one string and passed to
    ``int``.  Because ``int`` discards leading zeros, the resulting number
    can have fewer than 64 digits.
    """
    digits = "".join(rn() for _ in range(64))
    return int(digits)
# Print generated numbers forever; the loop has no exit condition.
while True: print(sm())
But when I run this, Python returns a number with fewer than 64 digits, even though the sm() function above joins 64 characters, and I can't understand why. I've tried calling int outside the function (in the while loop), and I've tried converting on the same line where I turn the list into a string with the join method and an empty string. Can someone help me, please?
Thanks for your time and attention!
I'm using Windows 10 1903 x86 and Python 3.8.3

Is my Python/Cython iteration benchmark representative?

I want to iterate through a large data structure in a Python program and perform a task for each element. For simplicity, let's say the elements are integers and the task is just an incrementation. In the end, the last incremented element is returned as (dummy) result. In search of the best structure/method to do this I compared timings in pure Python and Cython for these structures (I could not find a direct comparison of them elsewhere):
Python list
NumPy array / typed memory view
Cython extension type with underlying C++ vector
The iterations I timed are:
Python foreach in list iteration (it_list)
Cython list iteration with explicit element access (cit_list)
Python foreach in array iteration (it_nparray)
Python NumPy vectorised operation (vec_nparray)
Cython memory view iteration with explicit element access (cit_memview)
Python foreach in underlying vector iteration (it_pyvector)
Python foreach in underlying vector iteration via __iter__ (it_pyvector_iterator)
Cython vector iteration with explicit element access (cit_pyvector)
Cython vector iteration via vector.iterator (cit_pyvector_iterator)
I am concluding from this (timings are below):
plain Python iteration over the NumPy array is extremely slow (about 10 times slower than the Python list iteration) -> not a good idea
Python iteration over the wrapped C++ vector is slow, too (about 1.5 times slower than the Python list iteration) -> not a good idea
Cython iteration over the wrapped C++ vector is the fastest option, approximately equal to the C contiguous memory view
The iteration over the vector using explicit element access is slightly faster than using an iterator -> why bother to use an iterator?
The memory view approach has comparably larger overhead than the extension type approach
My question is now: Are my numbers reliable (did I do something wrong or miss anything here)? Is this in line with your experience with real-world examples? Is there anything else I could do to improve the iteration? Below the code that I used and the timings. I am using this in a Jupyter notebook by the way. Suggestions and comments are highly appreciated!
Relative timings (minimum value 1.000), for different data structure sizes n:
================================================================================
Timings for n = 1:
--------------------------------------------------------------------------------
cit_pyvector_iterator: 1.000
cit_pyvector: 1.005
cit_list: 1.023
it_list: 3.064
it_pyvector: 4.230
it_pyvector_iterator: 4.937
cit_memview: 8.196
vec_nparray: 20.187
it_nparray: 25.310
================================================================================
================================================================================
Timings for n = 1000:
--------------------------------------------------------------------------------
cit_pyvector_iterator: 1.000
cit_pyvector: 1.001
cit_memview: 2.453
vec_nparray: 5.845
cit_list: 9.944
it_list: 137.694
it_pyvector: 199.702
it_pyvector_iterator: 218.699
it_nparray: 1516.080
================================================================================
================================================================================
Timings for n = 1000000:
--------------------------------------------------------------------------------
cit_pyvector: 1.000
cit_memview: 1.056
cit_pyvector_iterator: 1.197
vec_nparray: 2.516
cit_list: 7.089
it_list: 87.099
it_pyvector_iterator: 143.232
it_pyvector: 162.374
it_nparray: 897.602
================================================================================
================================================================================
Timings for n = 10000000:
--------------------------------------------------------------------------------
cit_pyvector: 1.000
cit_memview: 1.004
cit_pyvector_iterator: 1.060
vec_nparray: 2.721
cit_list: 7.714
it_list: 88.792
it_pyvector_iterator: 130.116
it_pyvector: 149.497
it_nparray: 872.798
================================================================================
Cython code:
%%cython --annotate
# distutils: language = c++
# cython: boundscheck = False
# cython: wraparound = False
from libcpp.vector cimport vector
from cython.operator cimport dereference as deref, preincrement as princ
# Extension type wrapping a vector
cdef class pyvector:
    """Extension type that owns a C++ ``vector[long]`` in ``_data``."""
    cdef vector[long] _data

    cpdef void push_back(self, long x):
        """Append ``x`` to the underlying vector."""
        self._data.push_back(x)

    def __iter__(self):
        """Yield the stored values in index order."""
        cdef size_t i, n = self._data.size()
        for i in range(n):
            yield self._data[i]

    # Exposed as a property because callers iterate ``v.data`` as an attribute.
    @property
    def data(self):
        """Return the contents of the underlying vector as a Python object."""
        return self._data
# Cython iteration over Python list
cpdef long cit_list(list l):
    """Return the last element of ``l`` plus one, visiting every element by index.

    Returns 0 for an empty list, where ``j`` would otherwise be read
    uninitialised.
    """
    cdef:
        long j = 0, ii
        size_t i, n = len(l)
    for i in range(n):
        ii = l[i]
        j = ii + 1
    return j
# Cython iteration over NumPy array
cpdef long cit_memview(long[::1] v) nogil:
    """Return the last element of the contiguous ``long`` memoryview plus one.

    Every element is read by index.  Returns 0 for an empty view, where
    ``j`` would otherwise be read uninitialised.
    """
    cdef:
        size_t i, n = v.shape[0]
        long j = 0
    for i in range(n):
        j = v[i] + 1
    return j
# Iterate over pyvector
cpdef long cit_pyvector(pyvector v) nogil:
    """Return the last element of ``v._data`` plus one, reading each element by index.

    Returns 0 for an empty vector, where ``j`` would otherwise be read
    uninitialised.
    """
    cdef:
        size_t i, n = v._data.size()
        long j = 0
    for i in range(n):
        j = v._data[i] + 1
    return j
cpdef long cit_pyvector_iterator(pyvector v) nogil:
    """Return the last element of ``v._data`` plus one, walking it with a C++ iterator.

    Returns 0 for an empty vector, where ``j`` would otherwise be read
    uninitialised.
    """
    cdef:
        vector[long].iterator it = v._data.begin()
        long j = 0
    while it != v._data.end():
        j = deref(it) + 1
        princ(it)
    return j
Python code:
# Python iteration over Python list
def it_list(l):
    """Return the last element of ``l`` plus one, iterating with a plain ``for`` loop.

    ``j`` is only bound inside the loop, so an empty ``l`` raises
    ``UnboundLocalError``.
    """
    for i in l:
        j = i + 1
    return j
# Python iteration over NumPy array
def it_nparray(a):
    """Return the last element of ``a`` plus one, iterating element by element.

    ``j`` is only bound inside the loop, so an empty ``a`` raises
    ``UnboundLocalError``.
    """
    for i in a:
        j = i + 1
    return j
# Vectorised NumPy operation
def vec_nparray(a):
    """Return the last element of ``a + 1``.

    The whole array is incremented in one vectorised expression and the last
    element of that result is returned.  Previously the incremented array was
    discarded and the un-incremented ``a[-1]`` came back.
    """
    incremented = a + 1
    return incremented[-1]
# Python iteration over C++ vector extension type
def it_pyvector_iterator(v):
    """Return the last value yielded by iterating ``v`` plus one.

    ``j`` is only bound inside the loop, so an empty ``v`` raises
    ``UnboundLocalError``.
    """
    for i in v:
        j = i + 1
    return j
def it_pyvector(v):
    """Return the last element of ``v.data`` plus one.

    ``v.data`` is read as an attribute and iterated.  ``j`` is only bound
    inside the loop, so empty data raises ``UnboundLocalError``.
    """
    for i in v.data:
        j = i + 1
    return j
And for the benchmark:
import numpy as np
from operator import itemgetter
def bm(sizes):
"""Call functions with data structures of varying length"""
Timings = {}
for n in sizes:
Timings[n] = {}
# Python list
list_ = list(range(n))
# NumPy array
a = np.arange(n, dtype=np.int64)
# C++ vector extension type
pyv = pyvector()
for i in range(n):
pyv.push_back(i)
calls = [
(it_list, list_),
(cit_list, list_),
(it_nparray, a),
(vec_nparray, a),
(cit_memview, a),
(it_pyvector, pyv),
(it_pyvector_iterator, pyv),
(cit_pyvector, pyv),
(cit_pyvector_iterator, pyv),
]
for fxn, arg in calls:
Timings[n][fxn.__name__] = %timeit -o fxn(arg)
return Timings
def ratios(timings, base=None):
    """Show relative performance of runs based on `timings` dict

    timings: mapping of run name -> object with an ``average`` attribute.
    base: name of the run to normalise against; when ``None`` the smallest
        average among all runs is used.

    Returns a list of ``(name, average / base_average)`` tuples sorted by
    ascending ratio.
    """
    if base is not None:
        base = timings[base].average
    else:
        base = min(x.average for x in timings.values())
    return sorted(
        ((k, v.average / base) for k, v in timings.items()),
        key=itemgetter(1),
    )
# Run the benchmark for every size and print one table of relative timings per size.
Timings = {}
sizes = [1, 1000, 1000000, 10000000]
Timings.update(bm(sizes))
for s in sizes:
    print("=" * 80)
    print(f"Timings for n = {s}:")
    print("-" * 80)
    # ratios() returns (name, ratio) pairs sorted by ratio, smallest first.
    for x in ratios(Timings[s]):
        print(f"{x[0]:>25}: {x[1]:7.3f}")
    print("=" * 80, "\n")

Does numpy.zeros() allow correct numba caching?

Numba does not cache when numpy.zeros() is called in the function. However, caching works properly with numpy.zeros_like().
I cannot have numba to cache the function when numpy.zeros() is called in the function. The same function is successfully cached when numpy.zeros() is replaced by numpy.zeros_like(), and properly changing the argument.
@jit('UniTuple(float64[:],4)(int32[:],int32[:],int32[:],int32[:],float64[:])',
     nopython=True, parallel=True, cache=True, fastmath=True, nogil=True)
def fun_rec(a: np.ndarray, b: np.ndarray, m: np.ndarray, n: np.ndarray, x: np.ndarray):
    """Pair up entries whose (a, b) and (m, n) indices are swapped.

    For each ``i`` not yet paired, the entries after it are searched for a
    ``j`` with ``a[i] == m[j]``, ``b[i] == n[j]``, ``m[i] == a[j]`` and
    ``n[i] == b[j]``.  For a match, both positions receive the partner's
    1-based index, the average of ``x[i]`` and ``x[j]``, their difference
    relative to that average, and a found flag of 1.

    Returns ``(rec_num, rec_avg, rec_err, rec_fnd)``, each shaped like ``x``.
    """
    l = int(len(x))
    rec_num = np.zeros_like(x)
    # not caching with rec_num = np.zeros(l)
    rec_avg = np.zeros_like(x)
    rec_err = np.zeros_like(x)
    rec_fnd = np.zeros_like(x)
    for i in range(l):
        # Already paired by an earlier iteration.
        if rec_num[i] != 0:
            continue
        # NOTE(review): `break` inside a `prange` loop - confirm this is supported with parallel=True.
        for j in prange(i + 1, l):
            if (a[i] == m[j] and b[i] == n[j] and m[i] == a[j] and n[i] == b[j]):
                avg = (x[i] + x[j]) / 2
                err = abs(x[i] - x[j]) / abs(avg)
                rec_num[i] = j + 1
                rec_num[j] = i + 1
                rec_avg[i] = avg
                rec_avg[j] = avg
                rec_err[i] = err
                rec_err[j] = err
                rec_fnd[i] = 1
                rec_fnd[j] = 1
                break
    return rec_num, rec_avg, rec_err, rec_fnd
I was expecting numba to support numpy.zeros() as described in the documentation; no difference between numpy.zeros() and numpy.zeros_like() is described there. I would like to know whether the incorrect caching is related to possible errors in my code.

Error using scipy.weave.inline

I am using several techniques (NumPy, Weave, Cython, Numba) to perform a Python performance benchmark. The code takes two numpy arrays of size NxN and multiplies them element-wise and stores the values in another array C.
My weave.inline() code gives me a scipy.weave.build_tools.CompileError. I have created a minimalist piece of code which generates the same error. Could someone please help?
import time
import numpy as np
from scipy import weave
from scipy.weave import converters
def benchmark():
N = np.array(5000, dtype=np.int)
A = np.random.rand(N, N)
B = np.random.rand(N, N)
C = np.zeros([N, N], dtype=float)
t = time.clock()
weave_inline_loop(A, B, C, N)
print time.clock() - t
def weave_inline_loop(A, B, C, N):
    """Compute ``C(i, j) = A(i, j) * B(i, j)`` for all ``i, j < N`` in inline C++ via ``weave.inline``.

    Kept as posted, including the ``return_val = C;`` line and the
    rebinding of ``C``: this is the snippet reported to fail with a
    CompileError.
    """
    code = """
int i, j;
for (i = 0; i < N; ++i)
{
for (j = 0; j < N; ++j)
{
C(i, j) = A(i, j) * B(i, j);
}
}
return_val = C;
"""
    C = weave.inline(code, ['A', 'B', 'C', 'N'], type_converters=converters.blitz, compiler='gcc')
# Run the benchmark when the script is executed.
benchmark()
Three small changes are needed:
N can't be a 0D-numpy array (it has to be an integer so that i < N works in the C code). You should write N = 5000 instead of N = np.array(5000, dtype=np.int).
The C array is being modified in-place, so it doesn't have to be returned. I don't know the restrictions on the kinds of objects that return_val can handle, but if you try to keep return_val = C; compilation fails with: don't know how to convert ‘blitz::Array<double, 2>’ to ‘const py::object&’.
After that, weave.inline returns None. Keeping the assignment C = weave.inline(... makes the code look confusing, even if it works fine and the array named C will hold the result in the benchmark scope.
This is the end result:
import time
import numpy as np
from scipy import weave
from scipy.weave import converters
def benchmark():
N = 5000
A = np.random.rand(N, N)
B = np.random.rand(N, N)
C = np.zeros([N, N], dtype=float)
t = time.clock()
weave_inline_loop(A, B, C, N)
print time.clock() - t
def weave_inline_loop(A, B, C, N):
    """Compute ``C(i, j) = A(i, j) * B(i, j)`` for all ``i, j < N`` in inline C++ via ``weave.inline``.

    The inline code assigns into ``C`` directly; nothing is returned.
    """
    code = """
int i, j;
for (i = 0; i < N; ++i)
{
for (j = 0; j < N; ++j)
{
C(i, j) = A(i, j) * B(i, j);
}
}
"""
    weave.inline(code, ['A', 'B', 'C', 'N'], type_converters=converters.blitz, compiler='gcc')
Two issues. First, you don't need the line return_val = C. You are directly manipulating the data in the variable C in your inlined code, so its already available to python and there's no need to explicitly return it to the environment (and trying to do so is causing errors when trying to do the appropriate type conversions). So change your function to:
def weave_inline_loop(A, B, C, N):
    """Compute ``C(i, j) = A(i, j) * B(i, j)`` for all ``i, j < N`` in inline C++ and return ``C``.

    The inline code assigns into ``C`` directly; the same ``C`` object that
    was passed in is returned.
    """
    code = """
int i, j;
for (i = 0; i < N; ++i)
{
for (j = 0; j < N; ++j)
{
C(i, j) = A(i, j) * B(i, j);
}
}
"""
    weave.inline(code, ['A', 'B', 'C', 'N'], type_converters=converters.blitz, compiler='gcc')
    return C
Second issue. You are comparing i and j (both ints), to N an array of length 1. This also generated an error. But if you call your code as:
def benchmark():
N = np.array(5000, dtype=np.int)
A = np.random.rand(N, N)
B = np.random.rand(N, N)
C = np.zeros([N, N], dtype=float)
t = time.clock()
print weave_inline_loop(A, B, C, int(N))
# I added a print statement so you can see that C is being
# populated with the new 2d array
print time.clock() - t

Categories