Cython - Optimize read binary file - python

I am trying to improve this Cython code (which works). Please note that I don't want to use numpy.fromfile, because I want to be able to parse binary structures that are not fixed.
from libc.stdio cimport *
import struct

cpdef inline cimport_td(char* f, double[:] dates, double[:] tpx, int[:] tvo):
    f_b = open(f.replace('\\','/'), 'rb').read()
    cdef int B = len(f_b), bb = 0, dd = 0
    while bb < B:
        dates[dd], tpx[dd], tvo[dd] = struct.unpack('ddi', f_b[bb:bb+20])
        bb += 20
        dd += 1
    del f_b
    return dates, tpx, tvo
Is there anything better than open/read and struct.unpack?
Thank you.
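(A sketch of one commonly suggested alternative, not part of the original question: struct.Struct precompiles the format and iter_unpack walks the buffer without manual slicing; it assumes the same fixed 20-byte 'ddi' records as above, and the function name is hypothetical.)

import struct

rec = struct.Struct('ddi')  # 8 + 8 + 4 = 20 bytes per record

def read_td(path, dates, tpx, tvo):
    with open(path, 'rb') as fh:
        buf = fh.read()
    # iter_unpack (Python 3.4+) yields one tuple per 20-byte record
    for dd, (date, px, vo) in enumerate(rec.iter_unpack(buf)):
        dates[dd], tpx[dd], tvo[dd] = date, px, vo
    return dates, tpx, tvo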

Related

How to get rid of for loop in my function? [duplicate]

T(i) = Tm(i) + (T(i-1)-Tm(i))**(-tau(i))
Tm and tau are NumPy vectors of the same length that have been previously calculated, and the desire is to create a new vector T. The i is included only to indicate the element index for what is desired.
Is a for loop necessary for this case?
You might think this would work:
import numpy as np
n = len(Tm)
t = np.empty(n)
t[0] = 0 # or whatever the initial condition is
t[1:] = Tm[1:] + (t[0:n-1] - Tm[1:])**(-tau[1:])
but it doesn't: you can't actually do recursion in numpy this way (since numpy calculates the whole RHS and then assigns it to the LHS).
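(A tiny illustration of that point, not from the original answer: the RHS is evaluated once, from the old values.)

import numpy as np

x = np.zeros(4)
x[0] = 1
x[1:] = x[:-1] + 1   # the RHS is computed from the old values in one shot
print(x)             # [1. 2. 1. 1.], not the running count [1. 2. 3. 4.]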
So unless you can come up with a non-recursive version of this formula, you're stuck with an explicit loop:
tt = np.empty(n)
tt[0] = 0.
for i in range(1,n):
    tt[i] = Tm[i] + (tt[i-1] - Tm[i])**(-tau[i])
2019 update: the Numba code broke with a new version of Numba; changing dtype="float32" to dtype=np.float32 solved it.
I performed some benchmarks, and in 2019 Numba is the first option people should try to accelerate recursive functions in NumPy (adjusted proposal of Aronstef). Numba comes preinstalled in the Anaconda distribution and has one of the fastest times (about 20 times faster than plain Python). In 2019, @numba.jit decorators work without additional steps (at least on Python 3.6, 3.7, and 3.8). Here are three benchmarks, performed on 2019-12-05, 2018-10-20, and 2016-05-18.
And, as mentioned by Jaffe, in 2018 it was still not possible to vectorize recursive functions. I checked Aronstef's vectorization and it does NOT work.
Benchmarks sorted by execution time:
-------------------------------------------------
| Variant         | 2019-12 | 2018-10 | 2016-05 |
-------------------------------------------------
| Pure C          |      na |      na | 2.75 ms |
| C extension     |      na |      na | 6.22 ms |
| Cython float32  | 0.55 ms | 1.01 ms |      na |
| Cython float64  | 0.54 ms | 1.05 ms | 6.26 ms |
| Fortran f2py    | 4.65 ms |      na | 6.78 ms |
| Numba float32   | 73.0 ms | 2.81 ms |      na |
|   (Aronstef)    |         |         |         |
| Numba float32v2 | 1.82 ms | 2.81 ms |      na |
| Numba float64   | 78.9 ms | 5.28 ms |      na |
| Numba float64v2 | 4.49 ms | 5.28 ms |      na |
| Append to list  | 73.3 ms | 48.2 ms | 91.0 ms |
| Using a.item()  | 36.9 ms | 58.3 ms | 74.4 ms |
| np.fromiter()   | 60.8 ms | 60.0 ms | 78.1 ms |
| Loop over Numpy | 71.3 ms | 71.9 ms | 87.9 ms |
|   (Jaffe)       |         |         |         |
| Loop over Numpy | 74.6 ms | 74.4 ms |      na |
|   (Aronstef)    |         |         |         |
-------------------------------------------------
Corresponding code is provided at the end of the answer.
It seems that Numba and Cython times improve over time. Both are now faster than Fortran f2py: Cython is 8.6 times faster now, and 32-bit Numba 2.5 times faster. Fortran was very hard to debug and compile in 2016, so now there is no reason to use Fortran at all.
I did not check Pure C and the C extension in 2019 and 2018, because they are not easy to compile in Jupyter notebooks.
I had the following setup in 2019:
Processor: Intel i5-9600K 3.70GHz
Versions:
Python: 3.8.0
Numba: 0.46.0
Cython: 0.29.14
Numpy: 1.17.4
I had the following setup in 2018:
Processor: Intel i7-7500U 2.7GHz
Versions:
Python: 3.7.0
Numba: 0.39.0
Cython: 0.28.5
Numpy: 1.15.1
The recommended Numba code using float32 (adjusted from Aronstef):
import numpy as np
import numba

@numba.jit("float32[:](float32[:], float32[:])", nopython=True, nogil=True)
def calc_py_jit32v2(Tm_, tau_):
    tt = np.empty(len(Tm_), dtype=np.float32)
    tt[0] = Tm_[0]
    for i in range(1, len(Tm_)):
        tt[i] = Tm_[i] - (tt[i-1] + Tm_[i])**(-tau_[i])
    return tt[1:]
All the other code:
Data creation (like Aronstef + Mike T comment):
import numpy as np

np.random.seed(0)
n = 100000
Tm = np.cumsum(np.random.uniform(0.1, 1, size=n).astype('float64'))
tau = np.random.uniform(-1, 0, size=n).astype('float64')
ar = np.column_stack([Tm,tau])
Tm32 = Tm.astype('float32')
tau32 = tau.astype('float32')
Tm_l = list(Tm)
tau_l = list(tau)
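(Not part of the original answer: a sketch of how the timings above can be reproduced, assuming the Numba and Cython functions from this answer are compiled in the same Jupyter session.)

%timeit calc_py_jit32v2(Tm32, tau32)   # Numba float32v2
%timeit cy_loop32(Tm32, tau32, n)      # Cython float32
%timeit cy_loop(Tm, tau, n)            # Cython float64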
The code in 2016 was slightly different, as I used the abs() function to prevent NaNs rather than Mike T's variant. In 2018 the function is exactly the same as the OP (Original Poster) wrote.
Cython float32 using the Jupyter %%cython magic. The function can be used directly in Python. Cython needs the same C compiler that Python was compiled with; installing the right version of the Visual C++ compiler (on Windows) can be problematic:
%%cython
import cython
import numpy as np
cimport numpy as np
from numpy cimport ndarray

cdef extern from "math.h":
    np.float32_t exp(np.float32_t m)

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.infer_types(True)
@cython.initializedcheck(False)
def cy_loop32(np.float32_t[:] Tm, np.float32_t[:] tau, int alen):
    cdef np.float32_t[:] T = np.empty(alen, dtype=np.float32)
    cdef int i
    T[0] = 0.0
    for i in range(1, alen):
        T[i] = Tm[i] + (T[i-1] - Tm[i])**(-tau[i])
    return T
Cython float64 using the Jupyter %%cython magic. The function can be used directly in Python:
%%cython
cdef extern from "math.h":
    double exp(double m)
import cython
import numpy as np
cimport numpy as np
from numpy cimport ndarray

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.infer_types(True)
@cython.initializedcheck(False)
def cy_loop(double[:] Tm, double[:] tau, int alen):
    cdef double[:] T = np.empty(alen)
    cdef int i
    T[0] = 0.0
    for i in range(1, alen):
        T[i] = Tm[i] + (T[i-1] - Tm[i])**(-tau[i])
    return T
Numba float64:
@numba.jit("float64[:](float64[:], float64[:])", nopython=False, nogil=True)
def calc_py_jitv2(Tm_, tau_):
    tt = np.empty(len(Tm_), dtype=np.float64)
    tt[0] = Tm_[0]
    for i in range(1, len(Tm_)):
        tt[i] = Tm_[i] - (tt[i-1] + Tm_[i])**(-tau_[i])
    return tt[1:]
Append to list. Fastest non-compiled solution:
def rec_py_loop(Tm, tau, alen):
    T = [Tm[0]]
    for i in range(1, alen):
        T.append(Tm[i] - (T[i-1] + Tm[i])**(-tau[i]))
    return np.array(T)
Using a.item() (note that ndarray.itemset was removed in NumPy 2.0, so this variant requires an older NumPy):
def rec_numpy_loop_item(Tm_, tau_):
    n_ = len(Tm_)
    tt = np.empty(n_)
    Ti = tt.item
    Tis = tt.itemset
    Tmi = Tm_.item
    taui = tau_.item
    Tis(0, Tm_[0])
    for i in range(1, n_):
        Tis(i, Tmi(i) - (Ti(i-1) + Tmi(i))**(-taui(i)))
    return tt[1:]
np.fromiter():
def it(Tm, tau):
    T = Tm[0]
    i = 0
    while True:
        yield T
        i += 1
        T = Tm[i] - (T + Tm[i])**(-tau[i])

def rec_numpy_iter(Tm, tau, alen):
    return np.fromiter(it(Tm, tau), np.float64, alen)[1:]
Loop over Numpy (based on Jaffe's idea):
def rec_numpy_loop(Tm, tau, alen):
    tt = np.empty(alen)
    tt[0] = Tm[0]
    for i in range(1, alen):
        tt[i] = Tm[i] - (tt[i-1] + Tm[i])**(-tau[i])
    return tt[1:]
Loop over Numpy (Aronstef's code). On my computer float64 is the default type for np.empty.
def calc_py(Tm_, tau_):
    tt = np.empty(len(Tm_), dtype="float64")
    tt[0] = Tm_[0]
    for i in range(1, len(Tm_)):
        tt[i] = (Tm_[i] - (tt[i-1] + Tm_[i])**(-tau_[i]))
    return tt[1:]
Pure C without using Python at all. Version from 2016 (with the fabs() function):
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <windows.h>
#include <sys\timeb.h>

double randn() {
    /* normalize rand() to (0, 1); without this, log() would see a negative argument */
    double u = rand() / (double)RAND_MAX;
    if (u > 0.5) {
        return sqrt(-1.57079632679 * log(1.0 - pow(2.0 * u - 1, 2)));
    }
    else {
        return -sqrt(-1.57079632679 * log(1.0 - pow(1 - 2.0 * u, 2)));
    }
}

void rec_pure_c(double *Tm, double *tau, int alen, double *T)
{
    for (int i = 1; i < alen; i++)
    {
        T[i] = Tm[i] + pow(fabs(T[i - 1] - Tm[i]), (-tau[i]));
    }
}

int main() {
    int N = 100000;
    double *Tm = calloc(N, sizeof *Tm);
    double *tau = calloc(N, sizeof *tau);
    double *T = calloc(N, sizeof *T);
    double time = 0;
    double sumtime = 0;
    for (int i = 0; i < N; i++)
    {
        Tm[i] = randn();
        tau[i] = randn();
    }
    LARGE_INTEGER StartingTime, EndingTime, ElapsedMicroseconds;
    LARGE_INTEGER Frequency;
    for (int j = 0; j < 1000; j++)
    {
        for (int i = 0; i < 3; i++)
        {
            QueryPerformanceFrequency(&Frequency);
            QueryPerformanceCounter(&StartingTime);
            rec_pure_c(Tm, tau, N, T);
            QueryPerformanceCounter(&EndingTime);
            ElapsedMicroseconds.QuadPart = EndingTime.QuadPart - StartingTime.QuadPart;
            ElapsedMicroseconds.QuadPart *= 1000000;
            ElapsedMicroseconds.QuadPart /= Frequency.QuadPart;
            if (i == 0)
                time = (double)ElapsedMicroseconds.QuadPart / 1000;
            else {
                if (time > (double)ElapsedMicroseconds.QuadPart / 1000)
                    time = (double)ElapsedMicroseconds.QuadPart / 1000;
            }
        }
        sumtime += time;
    }
    printf("1000 loops, best of 3: %.3f ms per loop\n", sumtime / 1000);
    free(Tm);
    free(tau);
    free(T);
}
Fortran f2py. The function can be used from Python. Version from 2016 (with the abs() function):
subroutine rec_fortran(tm, tau, alen, result)
    integer*8, intent(in) :: alen
    real*8, dimension(alen), intent(in) :: tm
    real*8, dimension(alen), intent(in) :: tau
    real*8, dimension(alen) :: res
    real*8, dimension(alen), intent(out) :: result
    res(1) = 0
    do i = 2, alen
        res(i) = tm(i) + (abs(res(i-1) - tm(i)))**(-tau(i))
    end do
    result = res
end subroutine rec_fortran
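(A minimal build-and-use sketch, not from the original answer; the file name rec.f90 is hypothetical.)

# build the extension from the Fortran source:
#   f2py -c rec.f90 -m rec_fortran
from rec_fortran import rec_fortran

# f2py typically makes alen optional, inferring it from the array length
result = rec_fortran(Tm, tau)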
Update: 21-10-2018
I have corrected my answer based on comments.
It is possible to vectorize operations on vectors as long as the calculation is not recursive. Because a recursive operation depends on the previously calculated value, it is not possible to parallelize it.
This does therefore not work:
def calc_vect(Tm_, tau_):
    return Tm_[1:] - (Tm_[:-1] + Tm_[1:]) ** (-tau_[1:])
Since serial processing (a loop) is necessary, the best performance is gained by moving as close as possible to optimized machine code; therefore Numba and Cython are the best answers here.
A Numba approach can be achieved as follows:
init_string = """
from math import pow
import numpy as np
from numba import jit, float32

np.random.seed(0)
n = 100000
Tm = np.cumsum(np.random.uniform(0.1, 1, size=n).astype('float32'))
tau = np.random.uniform(-1, 0, size=n).astype('float32')

def calc_python(Tm_, tau_):
    tt = np.empty(len(Tm_))
    tt[0] = Tm_[0]
    for i in range(1, len(Tm_)):
        tt[i] = Tm_[i] - pow(tt[i-1] + Tm_[i], -tau_[i])
    return tt

@jit(float32[:](float32[:], float32[:]), nopython=False, nogil=True)
def calc_numba(Tm_, tau_):
    tt = np.empty(len(Tm_))
    tt[0] = Tm_[0]
    for i in range(1, len(Tm_)):
        tt[i] = Tm_[i] - pow(tt[i-1] + Tm_[i], -tau_[i])
    return tt
"""

import timeit
py_time = timeit.timeit('calc_python(Tm, tau)', init_string, number=100)
numba_time = timeit.timeit('calc_numba(Tm, tau)', init_string, number=100)
print("Python Solution: {}".format(py_time))
print("Numba Solution: {}".format(numba_time))
Timeit comparison of the Python and Numba functions:
Python Solution: 54.58057559299999
Numba Solution: 1.1389029540000024
This is a good question. I am also interested to know if this is possible but so far I have not found a way to do it except in some simple cases.
Option 1. numpy.ufunc.accumulate
This seems to be a promising option, as mentioned by @Karl Knechtel. You need to create a ufunc first; this web page explains how.
In the simple case of a recurrent function that takes two scalars as input and outputs one scalar, it seems to work:
import numpy as np

def test_add(x, data):
    return x + data

assert test_add(1, 2) == 3
assert test_add(2, 3) == 5
# Make a Numpy ufunc from my test_add function
test_add_ufunc = np.frompyfunc(test_add, 2, 1)
assert test_add_ufunc(1, 2) == 3
assert test_add_ufunc(2, 3) == 5
assert np.all(test_add_ufunc([1, 2], [2, 3]) == [3, 5])
data_sequence = np.array([1, 2, 3, 4])
f_out = test_add_ufunc.accumulate(data_sequence, dtype=object)
assert np.array_equal(f_out, [1, 3, 6, 10])
[Note the dtype=object argument which is necessary as explained on the web page linked above].
But in your case (and mine) we want to compute a recurrent equation that has more than one data input (and potentially more than one state variable too).
When I tried this using the ufunc.accumulate approach above I got ValueError: accumulate only supported for binary functions.
If anyone knows a way round that constraint I would be very interested.
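(For concreteness, a sketch of the failing attempt, not from the original answer: a step function with one state input and two data inputs gives a ternary ufunc, and accumulate rejects it.)

import numpy as np

def step(t, Tm, tau):                    # state plus two data inputs
    return Tm + (t - Tm)**tau

step_ufunc = np.frompyfunc(step, 3, 1)   # ternary ufunc: nin=3, nout=1
# step_ufunc.accumulate(...) raises:
#   ValueError: accumulate only supported for binary functions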
Option 2. Python's builtin accumulate function
In the meantime, this solution doesn't quite achieve what you wanted in terms of a vectorized calculation in numpy, but it does at least avoid a for loop.
import numpy as np
from itertools import accumulate, chain

def t_next(t, data):
    Tm, tau = data  # Unpack more than one data input
    return Tm + (t - Tm)**tau

assert t_next(2, (0.38, 0)) == 1.38
t0 = 2 # Initial t
Tm_values = np.array([0.38, 0.88, 0.56, 0.67, 0.45, 0.98, 0.58, 0.72, 0.92, 0.82])
tau_values = np.linspace(0, 0.9, 10)
# Combine the input data into a 2D array
data_sequence = np.vstack([Tm_values, tau_values]).T
t_out = np.fromiter(accumulate(chain([t0], data_sequence), t_next), dtype=float)
print(t_out)
# [2. 1.38 1.81303299 1.60614649 1.65039964 1.52579703
# 1.71878078 1.66109554 1.67839293 1.72152195 1.73091672]
# Slightly more readable version possible in Python 3.8+
t_out = np.fromiter(accumulate(data_sequence, t_next, initial=t0), dtype=float)
print(t_out)
# [2. 1.38 1.81303299 1.60614649 1.65039964 1.52579703
# 1.71878078 1.66109554 1.67839293 1.72152195 1.73091672]
To build on NPE's answer, I agree that there has to be a loop somewhere. Perhaps your goal is to avoid the overhead associated with a Python for loop? In that case, numpy.fromiter does beat out a for loop, but only by a little:
Using the very simple recursion relation,
x[i+1] = x[i] + 0.1
I get
#FOR LOOP
def loopit(n):
    x = [0.0]
    for i in range(n-1):
        x.append(x[-1] + 0.1)
    return np.array(x)
#FROMITER
#define an iterator (a better way probably exists -- I'm a novice)
def it():
    x = 0.0
    while True:
        yield x
        x += 0.1
#use the iterator with np.fromiter
def fi_it(n):
    return np.fromiter(it(), np.float64, n)
%timeit -n 100 loopit(100000)
#100 loops, best of 3: 31.7 ms per loop
%timeit -n 100 fi_it(100000)
#100 loops, best of 3: 18.6 ms per loop
Interestingly, pre-allocating a numpy array results in a substantial loss in performance. This is a mystery to me, though I would guess that there must be more overhead associated with accessing an array element than with appending to a list.
def loopit(n):
x = np.zeros(n)
for i in range(n-1): x[i+1] = x[i] + 0.1
return x
%timeit -n 100 loopit(100000)
#100 loops, best of 3: 50.1 ms per loop

Binary reading with python gives unexpected results

I'm trying to read some binary files with Python for my analysis, generated with Zemax OpticStudio. The structure of the file is supposed to be the following:
2 x 32-bit integer as header
n chunks of data
Each chunk is made of:
a 32-bit integer indicating the number of C structs that follow
m C structures
The structures' definition is the following:
typedef struct
{
    unsigned int status;
    int level;
    int hit_object;
    int hit_face;
    int unused;
    int in_object;
    int parent;
    int storage;
    int xybin, lmbin;
    double index, starting_phase;
    double x, y, z;
    double l, m, n;
    double nx, ny, nz;
    double path_to, intensity;
    double phase_of, phase_at;
    double exr, exi, eyr, eyi, ezr, ezi;
}
which has a size of 208 bytes, for your convenience.
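(As a quick cross-check, a sketch not from the original post: the equivalent struct format string, assuming little-endian and no padding, does come out at 208 bytes.)

import struct

# 1 unsigned int + 9 ints (40 bytes) + 21 doubles (168 bytes) = 208 bytes
assert struct.calcsize('<I9i21d') == 208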
Here is the code that I wrote with some research and a couple of brilliant answers from here.
from pathlib import Path
from functools import partial
from io import DEFAULT_BUFFER_SIZE
import struct

def little_endian_int(x):
    return int.from_bytes(x, 'little')

def file_byte_iterator(path):
    """iterator over lazily loaded file
    """
    path = Path(path)
    with path.open('rb') as file:
        reader = partial(file.read1, DEFAULT_BUFFER_SIZE)
        file_iterator = iter(reader, bytes())
        for chunk in file_iterator:
            yield from chunk

def ray_tell(rays_idcs: list, ray_idx: int, seg_idx: int):
    idx = rays_idcs[ray_idx][0]
    idx += 4 + 208*seg_idx
    return idx

def read_header(bytearr: bytearray):
    version = int.from_bytes(bytearr[0:4], 'little')
    zrd_format = version//10000
    version = version % 10000
    num_seg_max = int.from_bytes(bytearr[4:8], 'little')
    return zrd_format, version, num_seg_max

def rays_indices(bytearr: bytearray):
    index = 8
    rays = []
    while index < len(bytearr):
        num_seg = int.from_bytes(bytearr[index:index+4], 'little')
        rays.append((index, num_seg))
        index = index + 4 + 208*num_seg
    return rays

def read_ray(bytearr: bytearray, ray):
    ray_idx, num_seg = ray
    data = []
    ray_idx = ray_idx + 4
    seg_idx = 0
    for ray_idx in range(8, 8 + num_seg*208, 208):
        offsets = [0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 48, 56, 64, 72,
                   80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168,
                   176, 184, 192, 200]
        int_vars = offsets[0:11]
        doubl_vars = offsets[11:]
        data_integ = [bytearr[ray_idx+offset:ray_idx+offset+4] for offset in int_vars]
        data_doubl = [bytearr[ray_idx+offset:ray_idx+offset+8] for offset in doubl_vars]
        data.append([seg_idx, data_integ, data_doubl])
        seg_idx += 1
    return data
file="test_uncompressed.ZRD"
raypath = {}
filebin = bytearray(file_byte_iterator(file))
header = read_header(filebin)
print(header)
rays_idcs = rays_indices(filebin)
rays = []
for ray in rays_idcs:
rays.append(read_ray(filebin,ray))
ray = rays[1] #Random ray
segm = ray[2] #Random segm
ints = segm[1]
doub = segm[2]
print("integer vars:")
for x in ints:
print(x,little_endian_int(x))
print("double vars:")
for x in doub:
print(x,struct.unpack('<d',x))
I have verified that all of the structures have the right size and number of chunks and structures (my reading matches the number of segments and rays that I read with Zemax), and thanks to the header I verified the endianness of the file (little endian).
My output is the following:
(0, 2002)
integer vars:
bytearray(b'\x1f\xd8\x9c?') 1067243551
bytearray(b'\x06\x80\x00\x00') 32774
bytearray(b'\x02\x00\x00\x00') 2
bytearray(b'\x11\x00\x00\x00') 17
bytearray(b'\x02\x00\x00\x00') 2
bytearray(b'\x00\x00\x00\x00') 0
bytearray(b'\x11\x00\x00\x00') 17
bytearray(b'\x01\x00\x00\x00') 1
bytearray(b'\x00\x00\x00\x00') 0
bytearray(b'\x00\x00\x00\x00') 0
double vars:
bytearray(b'\x00\x00\x00\x00# \xac\xe8') (-1.6425098109028998e+196,)
bytearray(b'\xe8\xe3\xf9?\x00\x00\x00\x00') (5.3030112e-315,)
bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00') (0.0,)
bytearray(b'\x00\x00\x00\x00p_\xb4\xec') (-4.389425605765071e+215,)
bytearray(b'5\xe3\x9d\xbf\xf0\xbd"\xa2') (-3.001836066957746e-144,)
bytearray(b'z"\xc0?\x00\x00\x00\x00') (5.28431047e-315,)
bytearray(b'\x00\x00\x00\x00 \xc9+\xa3') (-2.9165705864036956e-139,)
bytearray(b'g\xd4\xcd?\x9ch{ ') (3.2707669223572687e-152,)
bytearray(b'q\x1e\xef?\x00\x00\x00\x00') (5.299523535e-315,)
bytearray(b'\x00\x00\x00\x00%\x0c\xb4A') (336340224.0,)
bytearray(b'\t\xf2u\xbf\\3L\xe6') (-5.991371249309652e+184,)
bytearray(b'\xe1\xff\xef\xbf1\x8dV\x1e') (1.5664573023148095e-162,)
bytearray(b'\xa1\xe9\xe8?\x9c\x9a6\xfc') (-2.202825582975923e+290,)
bytearray(b'qV\xb9?\x00\x00\x00\x00') (5.28210966e-315,)
bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00') (0.0,)
bytearray(b'\x00\x00\x00\x00\xc6\xfd\x0c\xa1') (-1.7713316840526727e-149,)
bytearray(b'\x96\x94\x8d?\xad\xf9(\xcc') (-7.838624888507203e+58,)
bytearray(b'yN\xb2\xbff.\\\x1a') (1.0611651097687064e-181,)
bytearray(b'\xb9*\xae?\xac\xaf\xe5\xe1') (-3.90257774261585e+163,)
bytearray(b'c\xab\xd2\xbf\xccQ\x8bj') (1.7130904564012918e+205,)
bytearray(b'\xc8\xea\x8c\xbf\xdf\xdc\xe49') (8.22891935818188e-30,)
Only the int values are being read correctly; I don't understand why I get those binary values for all the other variables.
EDIT
I want to highlight that the bytearrays contain non-hexadecimal digits, and I'm sure the binary files are not corrupted, since I can read them in Zemax.
Solved.
It was just an error in my pointer arithmetic in the read_ray function. Thanks to Mad Physicist for his suggestion to unpack the whole structure, which put me in the right direction.
def read_ray(bytearr: bytearray, ray):
    ray_idx, num_seg = ray
    data = []
    assert num_seg == little_endian_int(bytearr[ray_idx:ray_idx+4])
    ray_idx = ray_idx + 4
    for seg_ptr in range(ray_idx, ray_idx + num_seg*208, 208):
        ...
        data_integ = [bytearr[seg_ptr+offset:seg_ptr+offset+4] for offset in int_vars]
        data_doubl = [bytearr[seg_ptr+offset:seg_ptr+offset+8] for offset in doubl_vars]
        ...
    return data
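(For completeness, a sketch of the whole-struct unpacking Mad Physicist suggested, assuming the '<I9i21d' layout derived from the struct definition above; the function name is hypothetical.)

import struct

seg_struct = struct.Struct('<I9i21d')   # one 208-byte segment, little endian

def read_ray_unpacked(bytearr, ray):
    ray_idx, num_seg = ray
    ray_idx += 4                         # skip the per-ray segment count
    return [seg_struct.unpack_from(bytearr, ray_idx + 208*i)
            for i in range(num_seg)]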

Cython typed memory view of struct

I have this code
import random
from random import randint
from cython cimport boundscheck, wraparound

cdef char * flip(float p):
    cdef:
        char* head = 'h'
        char* tail = 't'
    return head if random.random() < p else tail

cdef struct v_bag:
    float v1
    float v_rand
    float v_min

cdef v_bag[:] flip(float num_flips, float num_coins_flipped, float num_experiments):
    cdef:
        float[:] head_count_coins
        v_bag[:] results
        int N, M, i, j, l
        v_bag vs
    N = num_experiments.shape[0]; M = num_coins_flipped.shape[0]
    with boundscheck(False), wraparound(False):
        for i in range(N):
            for j in range(M):
                flips = [flip(0.5) for k in xrange(num_flips)]
                count = float(flips.count('h'))/num_flips
                head_count_coins[j] = count
            l = randint(0, 999)
            results[i] = v_bag(v1=head_count_coins[0],
                               v_rand=head_count_coins[l],
                               v_min=float(min(head_count_coins)))
    return results
And I keep getting this error: 'Function signature does not match previous declaration'.
I can't seem to figure out how to get Cython to compile this with a typed memory view; any help or suggestions would be greatly appreciated. The issue seems to be the memory view of structs.
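(An observation, not from the original post: the code declares two cdef functions that are both named flip with different signatures, which is precisely what 'Function signature does not match previous declaration' complains about, since Cython does not allow overloading. A minimal sketch of one fix is to rename the coin-flip helper.)

# hypothetical rename so the two cdef declarations no longer clash
cdef char* flip_coin(float p):
    cdef char* head = 'h'
    cdef char* tail = 't'
    return head if random.random() < p else tail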

Wrapping a LAPACKE function using Cython

I'm trying to wrap the LAPACK function dgtsv (a solver for tridiagonal systems of equations) using Cython.
I came across this previous answer, but since dgtsv is not one of the LAPACK functions that are wrapped in scipy.linalg I don't think I can use this particular approach. Instead I've been trying to follow this example.
Here's the contents of my lapacke.pxd file:
ctypedef int lapack_int

cdef extern from "lapacke.h" nogil:
    int LAPACK_ROW_MAJOR
    int LAPACK_COL_MAJOR
    lapack_int LAPACKE_dgtsv(int matrix_order,
                             lapack_int n,
                             lapack_int nrhs,
                             double * dl,
                             double * d,
                             double * du,
                             double * b,
                             lapack_int ldb)
...here's my thin Cython wrapper in _solvers.pyx:
#!python
cimport cython
from lapacke cimport *

cpdef TDMA_lapacke(double[::1] DL, double[::1] D, double[::1] DU,
                   double[:, ::1] B):
    cdef:
        lapack_int n = D.shape[0]
        lapack_int nrhs = B.shape[1]
        lapack_int ldb = B.shape[0]
        double * dl = &DL[0]
        double * d = &D[0]
        double * du = &DU[0]
        double * b = &B[0, 0]
        lapack_int info
    info = LAPACKE_dgtsv(LAPACK_ROW_MAJOR, n, nrhs, dl, d, du, b, ldb)
    return info
...and here's a Python wrapper and test script:
import numpy as np
from scipy import sparse
from cymodules import _solvers

def trisolve_lapacke(dl, d, du, b, inplace=False):
    if (dl.shape[0] != du.shape[0] or dl.shape[0] != d.shape[0] - 1
            or b.shape != d.shape):
        raise ValueError('Invalid diagonal shapes')
    if b.ndim == 1:
        # b is (LDB, NRHS)
        b = b[:, None]
    # be sure to force a copy of d and b if we're not solving in place
    if not inplace:
        d = d.copy()
        b = b.copy()
    # this may also force copies if arrays are improperly typed/noncontiguous
    dl, d, du, b = (np.ascontiguousarray(v, dtype=np.float64)
                    for v in (dl, d, du, b))
    # b will now be modified in place to contain the solution
    info = _solvers.TDMA_lapacke(dl, d, du, b)
    print info
    return b.ravel()

def test_trisolve(n=20000):
    dl = np.random.randn(n - 1)
    d = np.random.randn(n)
    du = np.random.randn(n - 1)
    M = sparse.diags((dl, d, du), (-1, 0, 1), format='csc')
    x = np.random.randn(n)
    b = M.dot(x)
    x_hat = trisolve_lapacke(dl, d, du, b)
    print "||x - x_hat|| = ", np.linalg.norm(x - x_hat)
Unfortunately, test_trisolve just segfaults on the call to _solvers.TDMA_lapacke.
I'm pretty sure my setup.py is correct - ldd _solvers.so shows that _solvers.so is being linked to the correct shared libraries at runtime.
I'm not really sure how to proceed from here - any ideas?
A brief update:
for smaller values of n I tend not to get segfaults immediately, but I do get nonsense results (||x - x_hat|| ought to be very close to 0):
In [28]: test_trisolve2.test_trisolve(10)
0
||x - x_hat|| = 6.23202576396
In [29]: test_trisolve2.test_trisolve(10)
-7
||x - x_hat|| = 3.88623414288
In [30]: test_trisolve2.test_trisolve(10)
0
||x - x_hat|| = 2.60190676562
In [31]: test_trisolve2.test_trisolve(10)
0
||x - x_hat|| = 3.86631743386
In [32]: test_trisolve2.test_trisolve(10)
Segmentation fault
Usually LAPACKE_dgtsv returns with code 0 (which should indicate success), but occasionally I get -7, which means that argument 7 (b) had an illegal value. What's happening is that only the first value of b is actually being modified in place. If I keep on calling test_trisolve I will eventually hit a segfault even when n is small.
OK, I figured it out eventually - it seems I've misunderstood what row- and column-major refer to in this case.
Since C-contiguous arrays follow row-major order, I assumed that I ought to specify LAPACK_ROW_MAJOR as the first argument to LAPACKE_dgtsv.
In fact, if I change
info = LAPACKE_dgtsv(LAPACK_ROW_MAJOR, ...)
to
info = LAPACKE_dgtsv(LAPACK_COL_MAJOR, ...)
then my function works:
test_trisolve2.test_trisolve()
0
||x - x_hat|| = 6.67064747632e-12
This seems pretty counter-intuitive to me - can anyone explain why this is the case?
Although rather old, the question still seems to be relevant.
The observed behavior is the result of a misinterpretation of the parameter LDB:
Fortran arrays are column major, and the leading dimension of the array B corresponds to N. Therefore LDB >= max(1,N).
With row major, LDB corresponds to NRHS, so the condition LDB >= max(1,NRHS) must be met.
The comment # b is (LDB, NRHS) is not correct, since b has the dimension (LDB,N) and LDB should be 1 in this case.
Switching from LAPACK_ROW_MAJOR to LAPACK_COL_MAJOR fixes the issue as long as NRHS is equal to 1: the memory layout of a col-major (N,1) array is the same as that of a row-major (1,N) array. It will fail, however, if NRHS is greater than 1.
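(Under that reading, a sketch of the corrected row-major call, based on the wrapper above and not part of the original answer, would pass the row stride NRHS as LDB.)

# hypothetical fix inside TDMA_lapacke: with LAPACK_ROW_MAJOR the leading
# dimension is the row stride of B, i.e. NRHS, not the number of rows
info = LAPACKE_dgtsv(LAPACK_ROW_MAJOR, n, nrhs, dl, d, du, b, nrhs)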

Python Tuple to Cython Struct

Scipy's splprep (spline preparation) produces a tuple tckp:
tckp : tuple (t, c, k) - a tuple containing the vector of knots,
the B-spline coefficients, and the degree of the spline.
tckp = [array[double, double, ..., double],
        [array[double, double, ..., double],
         array[double, double, ..., double],
         array[double, double, ..., double]], int]
How can I construct and fill an equivalent Cython structure to be able to use splev (spline evaluation) within Cython?
As discussed in the comments, it depends on how you will pass tckp to other functions. One way to store this information and pass it to other functions is using a struct.
In the example below you pass the tckp list, via a struct, to a cdef function that takes a void * as input, simulating a C function. This example function adds 1 to all the arrays, assuming that int0 is the size of the arrays.
import numpy as np
cimport numpy as np

cdef struct type_tckp_struct:
    double *array0
    double *array1
    double *array2
    double *array3
    int *int0

def main():
    cdef type_tckp_struct tckp_struct
    cdef np.ndarray[np.float64_t, ndim=1] barray0, barray1, barray2, barray3
    cdef int bint
    tckp = [np.arange(1, 11).astype(np.float64),
            2*np.arange(1, 11).astype(np.float64),
            3*np.arange(1, 11).astype(np.float64),
            4*np.arange(1, 11).astype(np.float64), 10]
    barray0 = tckp[0]
    barray1 = tckp[1]
    barray2 = tckp[2]
    barray3 = tckp[3]
    bint = tckp[4]
    tckp_struct.array0 = &barray0[0]
    tckp_struct.array1 = &barray1[0]
    tckp_struct.array2 = &barray2[0]
    tckp_struct.array3 = &barray3[0]
    tckp_struct.int0 = &bint
    intern_func(&tckp_struct)

cdef void intern_func(void *args):
    cdef type_tckp_struct *args_in = <type_tckp_struct *>args
    cdef double *array0
    cdef double *array1
    cdef double *array2
    cdef double *array3
    cdef int int0, i
    array0 = args_in.array0
    array1 = args_in.array1
    array2 = args_in.array2
    array3 = args_in.array3
    int0 = args_in.int0[0]
    for i in range(int0):
        array0[i] += 1
        array1[i] += 1
        array2[i] += 1
        array3[i] += 1
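(A minimal usage sketch, assuming the code above is saved as tckp_demo.pyx, a hypothetical module name, and compiled in place.)

# build: cythonize -i tckp_demo.pyx
import tckp_demo
tckp_demo.main()   # wires the struct pointers and bumps every array element by 1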
