Fast string array - Cython

Fast string array - Cython - python

Having following hypothetical code:
# Hypothetical sketch from the question: the `??` placeholders mark the missing
# conversion from a Python list of strings to a C array, so this does not
# compile as written.
# Declare the C library's strcmp so it can be called on raw char pointers.
cdef extern from "string.h":
int strcmp(char* str1, char* str2)
# Compare every string in list_str1 with every string in list_str2.
def foo(list_str1, list_str2):
cdef unsigned int i, j
c_arr1 = ??
c_arr2 = ??
for i in xrange(len(list_str1)):
for j in xrange(len(list_str2)):
# strcmp returns 0 for equal strings, so `not strcmp(...)` selects matches.
if not strcmp(c_arr1[i], c_arr2[j]):
do some funny stuff
Is there some way to convert the lists to C arrays?
I have read and tried Cython - converting list of strings to char ** but that only throws errors.

Try the following code. The to_cstring_array function in it is what you want.
from libc.stdlib cimport malloc, free
from libc.string cimport strcmp
from cpython.string cimport PyString_AsString
cdef char ** to_cstring_array(list_str) except NULL:
    """Build a C array of char pointers aliasing the strings in list_str.

    The pointers are borrowed from the Python string objects, so list_str
    must stay alive for as long as the returned array is used.  The caller
    owns the array itself and must release it with free(); the individual
    strings must not be freed.

    Raises MemoryError if the array cannot be allocated and propagates the
    error raised by PyString_AsString when an item is not a string.
    """
    cdef Py_ssize_t i
    cdef Py_ssize_t n = len(list_str)
    # Always request at least one slot: malloc(0) may legally return NULL,
    # which would be indistinguishable from the error return value.
    cdef char **ret = <char **>malloc((n if n > 0 else 1) * sizeof(char *))
    if ret == NULL:
        raise MemoryError()
    try:
        for i in range(n):
            ret[i] = PyString_AsString(list_str[i])
    except:
        # Do not leak the array when a conversion fails part-way through.
        free(ret)
        raise
    return ret
def foo(list_str1, list_str2):
    """Print i, j and the string for every equal pair found at i != j.

    Both lists are converted to C string arrays once, compared with strcmp,
    and the arrays are released again even if an error occurs on the way.
    """
    cdef unsigned int i, j
    cdef char **c_arr1 = NULL
    cdef char **c_arr2 = NULL
    try:
        c_arr1 = to_cstring_array(list_str1)
        c_arr2 = to_cstring_array(list_str2)
        for i in xrange(len(list_str1)):
            for j in xrange(len(list_str2)):
                if i != j and strcmp(c_arr1[i], c_arr2[j]) == 0:
                    print i, j, list_str1[i]
    finally:
        # free(NULL) is a no-op, so this is safe even when a conversion
        # failed before one of the arrays was assigned.
        free(c_arr1)
        free(c_arr2)
foo(['hello', 'python', 'world'], ['python', 'rules'])

If you're on Python 3, here's an update to @falsetru's answer (untested on Python 2).
cdef extern from "Python.h":
char* PyUnicode_AsUTF8(object unicode)
from libc.stdlib cimport malloc, free
from libc.string cimport strcmp
cdef char ** to_cstring_array(list_str) except NULL:
    """Build a C array of UTF-8 char pointers for the str objects in list_str.

    Each pointer is the buffer returned by PyUnicode_AsUTF8 for the
    corresponding item, so list_str must stay alive for as long as the
    returned array is used.  The caller owns the array itself and must
    release it with free(); the individual strings must not be freed.

    Raises MemoryError if the array cannot be allocated and propagates the
    error set by PyUnicode_AsUTF8 when an item cannot be converted.
    """
    cdef Py_ssize_t i
    cdef Py_ssize_t n = len(list_str)
    # Always request at least one slot: malloc(0) may legally return NULL,
    # which would be indistinguishable from the error return value.
    cdef char **ret = <char **>malloc((n if n > 0 else 1) * sizeof(char *))
    if ret == NULL:
        raise MemoryError()
    for i in range(n):
        ret[i] = PyUnicode_AsUTF8(list_str[i])
        if ret[i] == NULL:
            # PyUnicode_AsUTF8 has already set the Python exception; release
            # the array and report the failure through the NULL return.
            free(ret)
            return NULL
    return ret
def foo(list_str1, list_str2):
    """Print i, j and the string for every equal pair found at i != j.

    Both lists are converted to C string arrays once, compared with strcmp,
    and the arrays are released again even if an error occurs on the way.
    """
    cdef unsigned int i, j
    cdef char **c_arr1 = NULL
    cdef char **c_arr2 = NULL
    try:
        c_arr1 = to_cstring_array(list_str1)
        c_arr2 = to_cstring_array(list_str2)
        for i in range(len(list_str1)):
            for j in range(len(list_str2)):
                if i != j and strcmp(c_arr1[i], c_arr2[j]) == 0:
                    print(i, j, list_str1[i])
    finally:
        # free(NULL) is a no-op, so this is safe even when a conversion
        # failed before one of the arrays was assigned.
        free(c_arr1)
        free(c_arr2)
foo(['hello', 'python', 'world'], ['python', 'rules'])
Warning: The pointer returned by PyUnicode_AsUTF8 is cached in the parent unicode object, which has two consequences:
This pointer is only valid as long as the parent unicode object is alive. Accessing it afterwards leads to undefined behavior (e.g. a possible segmentation fault).
The caller of PyUnicode_AsUTF8 isn't responsible for freeing the memory.

Related

How to copy a 2D array (matrix) from python with a C function (and do some computer heavy computation) which return a 2D array (matrix) in python?

I want to copy a 2D numpy array (matrix) in a C function and get it back in Python (and then do some calculation on it in C, taking advantage of C's speed). Therefore I need the C function matrix_copy to return a 2D array (or, I guess, a pointer to it). I tried the following code, but I get the following output (where one can see that the second dimension of the array is lost).
matrix_in.shape:
(300, 200)
matrix_out.shape:
(300,)
How could I change the code (I guess the matrix_copy.c adding some pointer magic) so I could obtain an exact copy of the matrix_in in matrix_out?
Here is the main.py script:
from ctypes import c_void_p, c_double, c_int, cdll
from numpy.ctypeslib import ndpointer
import numpy as np
import pdb
# Dimensions of the test matrix.
n = 300
m = 200
matrix_in = np.random.randn(n, m)
# Load the compiled C library and look up the function to call.
lib = cdll.LoadLibrary("matrix_copy.so")
matrix_copy = lib.matrix_copy
# The return type is declared as a one-dimensional array of n doubles; m does
# not appear in the shape, which matches the (300,) output shown above.
matrix_copy.restype = ndpointer(dtype=c_double,
shape=(n,))
# Pass the raw address of the input buffer together with both dimensions.
matrix_out = matrix_copy(c_void_p(matrix_in.ctypes.data),
c_int(n),
c_int(m))
print("matrix_in.shape:")
print(matrix_in.shape)
print("matrix_out.shape:")
print(matrix_out.shape)
Here is the matrix_copy.c script:
#include <stdlib.h>
#include <stdio.h>
/* Allocate a buffer of n*m doubles, copy values from matrix_in into it and
 * return it; the buffer comes from malloc and is not released here. */
double * matrix_copy(const double * matrix_in, int n, int m){
double * matrix_out = (double *)malloc(sizeof(double) * (n*m));
/* index is declared but never used below. */
int index = 0;
for(int i=0; i< n; i++){
for(int j=0; j<m; j++){
/* NOTE(review): i+j only reaches offsets 0 .. n+m-2, so most of the n*m
 * elements are never written; it is not a unique flat index for (i, j). */
matrix_out[i+j] = matrix_in[i+j];
//matrix_out[i][j] = matrix_in[i][j];
// some heavy computations not yet implemented
}
}
return matrix_out;
}
which I compile with the command
cc -fPIC -shared -o matrix_copy.so matrix_copy.c
And as a side note, why does the notation matrix_out[i][j] = matrix_in[i][j]; throw an error on compilation?
matrix_copy.c:10:26: error: subscripted value is not an array, pointer, or vector
matrix_out[i][j] = matrix_in[i][j];
~~~~~~~~~~~~~^~
matrix_copy.c:10:44: error: subscripted value is not an array, pointer, or vector
matrix_out[i][j] = matrix_in[i][j];

The second dimension is 'lost' because you explicitly omit it in the named shape argument of ndpointer. Change:
matrix_copy.restype = ndpointer(dtype=c_double, shape=(n,))
to
matrix_copy.restype = ndpointer(dtype=c_double, shape=(n,m), flags='C')
Where flags='C' additionally notes that the returned data is stored contiguously in row major order.
With regards to matrix_out[i][j] = matrix_in[i][j]; throwing an error, consider that matrix_in is of type const double *. matrix_in[i] would yield a value of type const double - how do you further index this value (i.e., with [j])?
If you want to emulate accessing a 2-dimensional array via a single pointer, you must calculate offsets manually. matrix_out[i+j] is not sufficient, as you must account for the span of each sub array:
matrix_out[i * m + j] = matrix_in[i * m + j];
Note that in C, size_t is the generally preferred type to use when dealing with memory sizes or array lengths.
matrix_copy.c, simplified:
#include <stdlib.h>
/*
 * Return a newly allocated copy of the row-major n x m matrix matrix_in.
 *
 * matrix_in  pointer to n * m contiguous doubles
 * n, m       number of rows and columns
 *
 * Returns a malloc'ed buffer of n * m doubles that the caller must release
 * with free(), or NULL if the size overflows size_t or allocation fails.
 */
double *matrix_copy(const double *matrix_in, size_t n, size_t m)
{
    /* Reject dimensions whose byte count would wrap around size_t. */
    if (m != 0 && n > (size_t)-1 / sizeof(double) / m)
        return NULL;

    double *matrix_out = malloc(sizeof *matrix_out * n * m);
    if (matrix_out == NULL)
        return NULL;

    /* Element (i, j) of a row-major matrix lives at offset i * m + j. */
    for (size_t i = 0; i < n; i++)
        for (size_t j = 0; j < m; j++)
            matrix_out[i * m + j] = matrix_in[i * m + j];
    return matrix_out;
}
matrix.py, with more explicit typing:
from ctypes import c_void_p, c_double, c_size_t, cdll, POINTER
from numpy.ctypeslib import ndpointer
import numpy as np
# Pointer type used for the input matrix argument.
c_double_p = POINTER(c_double)

# Dimensions of the test matrix.
n = 300
m = 200
matrix_in = np.random.randn(n, m).astype(c_double)

lib = cdll.LoadLibrary("matrix_copy.so")
matrix_copy = lib.matrix_copy
# Declare the full C signature so ctypes converts and checks the arguments.
matrix_copy.argtypes = c_double_p, c_size_t, c_size_t
# Both dimensions are needed here; flags='C' marks the result as row-major.
matrix_copy.restype = ndpointer(
    dtype=c_double,
    shape=(n, m),
    flags='C')

# NOTE(review): the returned buffer is allocated inside the C library and is
# never released by this script.
matrix_out = matrix_copy(
    matrix_in.ctypes.data_as(c_double_p),
    c_size_t(n),
    c_size_t(m))

print("matrix_in.shape:", matrix_in.shape)
print("matrix_out.shape:", matrix_out.shape)
# Report one overall verdict instead of an elementwise 300 x 200 boolean array.
print("in == out", np.array_equal(matrix_in, matrix_out))

The incoming data is probably a single block of memory. You need to create the substructure.
In my C++ code I have to do the following on data (block) coming in via swig:
void divide2DDoubleArray(double * &block, double ** &subblockdividers, int noofsubblocks, int subblocksize){
    /* The starting address of a block of doubles is used to generate
     * pointers to subblocks.
     *
     * block: memory containing the original block of data
     * subblockdividers: receives a new[]-allocated array of subblock
     *                   addresses (release it with delete[]), or a null
     *                   pointer when noofsubblocks is not positive
     * noofsubblocks: specify the number of subblocks produced
     * subblocksize: specify the size of the subblocks produced
     *
     * Design by contract: application should make sure the memory
     * in block is allocated and initialized properly.
     */
    // Nothing to divide: do not allocate and do not write past an empty array.
    if (noofsubblocks <= 0) {
        subblockdividers = 0;
        return;
    }
    // Build 2D matrix for cols
    subblockdividers = new double *[noofsubblocks];
    subblockdividers[0] = block;
    // Each subblock starts subblocksize doubles after the previous one.
    for (int i = 1; i < noofsubblocks; ++i) {
        subblockdividers[i] = &subblockdividers[i-1][subblocksize];
    }
}
Now the pointer returned in subblockdividers can be used the way you would like to.
Don't forget to release subblockdividers (with delete[], since it was allocated with new[]) when you're done. (Note: adjustments might be needed to compile this as C code.)

Writing my own ufunc; ufunc not supported for the input types

I'm trying to write a C function that converts a numpy string array to a float array. How can I receive numpy's array in C as const char *?
/*
 * ufunc inner loop: for each of the dimensions[0] elements, read one value
 * from each of the two inputs, pass them to to_float() and store the double
 * result in the output.
 *
 * args       args[0] and args[1] are the input buffers, args[2] the output
 * dimensions dimensions[0] is the number of elements to process
 * steps      byte stride of each buffer, in the same order as args
 * data       unused
 */
static void double_logitprod(char **args, npy_intp *dimensions,
                             npy_intp* steps, void* data)
{
    npy_intp i;
    npy_intp n = dimensions[0];
    char *in1 = args[0], *in2 = args[1];
    char *out = args[2];
    npy_intp in1_step = steps[0];
    npy_intp in2_step = steps[1];
    npy_intp out_step = steps[2];

    for (i = 0; i < n; i++) {
        /*BEGIN main ufunc computation*/
        /* NOTE(review): this reads each in1 element as a stored char
         * pointer; confirm that matches the memory layout of the input
         * dtype registered for this loop. */
        char *tmp1 = *((char **) in1);
        double tmp2 = *((double *)in2);
        *((double *) out) = to_float(tmp1, tmp2);
        /*END main ufunc computation*/

        /* Advance every buffer by its own stride; the second input must
         * move too, otherwise its first element is reused each iteration. */
        in1 += in1_step;
        in2 += in2_step;
        out += out_step;
    }
}
/* Loop table with a single entry: the inner loop double_logitprod above. */
PyUFuncGenericFunction funcs[1] = {&double_logitprod};
/* Type codes for that loop, in argument order: two inputs (NPY_STRING,
 * NPY_DOUBLE) followed by one NPY_DOUBLE output. */
static char types[3] = {NPY_STRING, NPY_DOUBLE,
NPY_DOUBLE};
How to accept a numpy string array in C? NPY_STRING or NPY_UNICODE gives error.
Numpy string array is like this:
x = np.array(['1.0', '2.0', 'N/A'])

Cython typed memory view of struct

I have this code
import random
from random import randint
from cython cimport boundscheck, wraparound
# Simulate one coin flip: return the C string 'h' when a random draw is below
# p, otherwise the C string 't'.
cdef char * flip(float p):
cdef:
char* head = 'h'
char* tail = 't'
# random.random() is called through the Python-level random module.
return head if random.random() < p else tail
# C struct holding the three floats that the experiment code below stores per
# experiment.
cdef struct v_bag:
# head fraction of the first coin (filled from head_count_coins[0])
float v1
# head fraction of a randomly indexed coin (filled from head_count_coins[l])
float v_rand
# smallest head fraction (filled from min(head_count_coins))
float v_min
# Run the coin-flip experiment and return one v_bag per experiment.
# NOTE(review): this reuses the name `flip` of the cdef function defined
# earlier in this file with a different signature, which matches the reported
# "Function signature does not match previous declaration" error.
cdef v_bag[:] flip(float num_flips,float num_coins_flipped,float num_experiments):
cdef:
# NOTE(review): both memoryviews are declared but never bound to a buffer
# before they are indexed below.
float[:] head_count_coins
v_bag[:] results
int N,M,i,j,l
v_bag vs
# NOTE(review): the parameters are declared as float, yet .shape[0] is read
# from them here.
N=num_experiments.shape[0]; M=num_coins_flipped.shape[0]
with boundscheck(False), wraparound(False):
for i in range(N):
for j in range(M):
# Flip one coin num_flips times and record its fraction of heads.
flips = [flip(0.5) for k in xrange(num_flips)]
count = float(flips.count('h'))/num_flips
head_count_coins[j]=count
# Pick a random coin index in 0..999; presumably M is expected to be 1000 - TODO confirm.
l = randint(0, 999)
results[i]= v_bag(v1=head_count_coins[0],
v_rand=head_count_coins[l],
v_min=float(min(head_count_coins)))
return results
And I keep getting this error: ' Function signature does not match previous declaration'
I can't seem to figure out how to get cython to compile this with a typed memory view, any help or suggestions would be greatly appreciated, the issue seems to be the memory view of structs.

Swig and multidimensional arrays

I am using Swig to interface python with C code.
I want to call a C function that takes for argument a struct containing an int** var:
typedef struct
{
(...)
int** my2Darray;
} myStruct;
void myCFunction( myStruct struct );
I am struggling with multi dimensional arrays.
My code looks like this:
In the interface file, I am using carray like this:
%include carrays.i
%array_class( int, intArray );
%array_class( intArray, intArrayArray );
In python, I have:
myStruct = myModule.myStruct()
var = myModule.intArrayArray(28)
for j in range(28):
var1 = myModule.intArray(28)
for i in range(28):
var1[i] = (...) filling var1 (...)
var[j] = var1
myStruct.my2Darray = var
myCFonction( myStruct )
I get an error on the line myStruct.my2Darray = var:
TypeError: in method 'maStruct_monTableau2D_set', argument 2 of type 'int **'
I have doubts about the line %array_class( intArray, intArrayArray ).
I tried using a typedef for int* to create my array like this:
%array_class( myTypeDef, intArrayArray );
But it didn't seem to work.
Do you know how to handle multidimensional arrays in Swig ?
Thanks for your help.

Have you considered using numpy for this? I have used numpy with my SWIG-wrapped C++ project for 1D, 2D, and 3D arrays of double and std::complex elements with a lot of success.
You would need to get numpy.i and install numpy in your python environment.
Here is an example of how you would structure it:
.i file:
// Numpy Related Includes:
%{
#define SWIG_FILE_WITH_INIT
%}
// numpy arrays
%include "numpy.i"
%init %{
import_array(); // This is essential. We will get a crash in Python without it.
%}
// These names must exactly match the function declaration.
%apply (int* INPLACE_ARRAY2, int DIM1, int DIM2) \
{(int* npyArray2D, int npyLength1D, int npyLength2D)}
%include "yourheader.h"
%clear (int* npyArray2D, int npyLength1D, int npyLength2D);
.h file:
/// Get the data in a 2D Array.
void arrayFunction(int* npyArray2D, int npyLength1D, int npyLength2D);
.cpp file:
void arrayFunction(int* npyArray2D, int npyLength1D, int npyLength2D)
{
    // Visit every element of the flat row-major buffer: element (row, col)
    // is stored at offset row * npyLength2D + col.
    for (int row = 0; row < npyLength1D; ++row)
    {
        for (int col = 0; col < npyLength2D; ++col)
        {
            const int flatIndex = col + row * npyLength2D;
            // operate on array
            npyArray2D[flatIndex];
        }
    }
}
.py file:
def makeArray(rows, cols):
    """Return a zero-filled (rows, cols) array whose elements are C ints.

    numpy.intc is used because the wrapped function receives the data as
    ``int*``; the former ``numpy.int`` alias stood for Python's ``int``
    (not C ``int``) and is no longer available in current NumPy releases.
    """
    return numpy.zeros((rows, cols), dtype=numpy.intc)
arr2D = makeArray(28, 28)
myModule.arrayFunction(arr2D)

This is how I handled 2d arrays. The trick I used was to write some inline code to handle the creation and mutation of an array. Once that is done, I can use those functions to do my bidding.
Below is the sample code.
ddarray.i
%module ddarray
%inline %{
// Helper function to create a 2d array
/*
 * Create a rows x cols array of ints as an array of row pointers.
 *
 * Every element starts out as 0.  Each row and the row-pointer array are
 * separate heap allocations that the caller must release with free().
 * Returns NULL if a dimension is negative or an allocation fails; nothing
 * is leaked in that case.
 */
int **int_array(int rows, int cols) {
    int i;
    int **arr;

    if (rows < 0 || cols < 0)
        return NULL;
    arr = (int **)malloc((size_t)rows * sizeof(int *));
    if (arr == NULL)
        return NULL;
    for (i = 0; i < rows; i++) {
        /* Request at least one element so that a NULL result always means
         * failure, even when cols is 0. */
        arr[i] = (int *)calloc((size_t)(cols > 0 ? cols : 1), sizeof(int));
        if (arr[i] == NULL) {
            /* Undo the rows allocated so far before giving up. */
            while (i-- > 0)
                free(arr[i]);
            free(arr);
            return NULL;
        }
    }
    return arr;
}
/* Store value at array[row][col].  Declared void because nothing is
 * returned; the indices are not range-checked. */
void setitem(int **array, int row, int col, int value) {
    array[row][col] = value;
}
%}
ddarray.c
/* Return the sum over all rows of the product of each row's first cols
 * elements; a row with no columns contributes 1. */
int calculate(int **arr, int rows, int cols) {
    int total = 0;
    int r = 0;

    while (r < rows) {
        int row_product = 1;
        int c = 0;

        while (c < cols) {
            row_product *= arr[r][c];
            c++;
        }
        total += row_product;
        r++;
    }
    return total;
}
Sample Python script
import ddarray
# Create a 2 x 3 array through the wrapped helper.
a = ddarray.int_array(2, 3)
# Fill row i with the value i + 1.
for i in xrange(2):
for j in xrange(3):
ddarray.setitem(a, i, j, i + 1)
# Sum of the row products: 1*1*1 + 2*2*2 = 9.
# NOTE(review): the array is never released; ddarray.i defines no helper for that.
print ddarray.calculate(a, 2, 3)

Python Tuple to Cython Struct

Scipy splprep (spline preparation) produces a tuple tckp
tckp : tuple (t,c,k) a tuple containing the vector of knots,
the B-spline coefficients, and the degree of the spline.
tckp = [array[double,double ,..,double],
[array[double,double ,..,double],
array[double,double ,..... ,double],
array[double,double ,..... ,double]], int]
How can I construct and fill an equivalent Cython Structure to be able to use
splev (spline evaluation) within Cython

As discussed in the comments, it depends on how you will pass tckp to other functions. One way to store this information and pass to other functions is using a struct.
In the example below you pass the tckp list using a struct to a cdef function that takes a void * as input, simulating a C function... this example function adds 1 to all the arrays assuming that int0 is the size of the arrays.
import numpy as np
cimport numpy as np
# C struct carrying raw pointers to the four arrays and to the integer taken
# from the tckp list, so they can be handed to a function as one void pointer.
cdef struct type_tckp_struct:
double *array0
double *array1
double *array2
double *array3
# pointer to the integer; the example function reads it as the array length
int *int0
# Build an example tckp list, expose its contents through a type_tckp_struct
# and pass that struct to intern_func.
def main():
cdef type_tckp_struct tckp_struct
cdef np.ndarray[np.float64_t, ndim=1] barray0, barray1, barray2, barray3
cdef int bint
# Four float64 arrays of 10 elements each, followed by the integer 10.
tckp = [np.arange(1,11).astype(np.float64),
2*np.arange(1,11).astype(np.float64),
3*np.arange(1,11).astype(np.float64),
4*np.arange(1,11).astype(np.float64), 10]
# Bind each list item to a typed local so its data can be addressed from C.
barray0 = tckp[0]
barray1 = tckp[1]
barray2 = tckp[2]
barray3 = tckp[3]
bint = tckp[4]
# Store the address of each array's first element; no data is copied.
tckp_struct.array0 = &barray0[0]
tckp_struct.array1 = &barray1[0]
tckp_struct.array2 = &barray2[0]
tckp_struct.array3 = &barray3[0]
# Points at the local bint, so the pointer is only meaningful while main() runs.
tckp_struct.int0 = &bint
intern_func(&tckp_struct)
cdef void intern_func(void *args):
    """Add 1 to the leading elements of the four arrays referenced by args.

    args is reinterpreted as a pointer to a type_tckp_struct; the integer
    that its int0 member points to gives the number of elements updated in
    each array.  The arrays are modified in place.
    """
    cdef type_tckp_struct *tckp = <type_tckp_struct *>args
    cdef int length = tckp.int0[0]
    cdef int k
    for k in range(length):
        tckp.array0[k] += 1
        tckp.array1[k] += 1
        tckp.array2[k] += 1
        tckp.array3[k] += 1

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Fast string array - Cython - python

Related

How to copy a 2D array (matrix) from python with a C function (and do some computer heavy computation) which return a 2D array (matrix) in python?

Writing my own ufunc; ufunc not supported for the input types

Cython typed memory view of struct

Swig and multidimensional arrays

Python Tuple to Cython Struct

Categories

Resources