I want to Cythonize portion of a pyx script which involves work with numpy arrays with complex numbers. The relevant portion of the python script looks like this:
M = np.dot(N , Q)
In my work, N, Q and M are numpy arrays with complex number entries.
Specifically, I want to transfer the matrices N and Q to a C++ code and do the matrix multiplication in C++.
While I know the method to transfer real valued numpy arrays using pointers to C++ script, followed by use of cython, I am a bit confused about how I should approach things for numpy arrays with complex values.
This is how I am trying to transfer the array from pyx to C++ presently.
import numpy as np
cimport numpy as np
cdef extern from "./matmult.h" nogil:
void mult(double* M, double* N, double* Q)
def sim():
cdef:
np.ndarray[np.complex128_t,ndim=2] N = np.zeros(( 2 , 2 ), dtype=np.float64)
np.ndarray[np.complex128_t,ndim=2] Q = np.zeros(( 2 , 2 ), dtype=np.float64)
np.ndarray[np.complex128_t,ndim=2] M = np.zeros(( 2 , 2 ), dtype=np.float64)
N = np.array([[1.1 + 2j,2.2],[3.3,4.4]])
Q = np.array([[3.3,4.4+5j],[5.5,6.6]])
mult(&M[0,0], &N[0,0], &Q[0,0])
print M
This is my C++ code:
#include "matmult.h"
using namespace std;
int main(){}
void mult(double *M, double *N, double *Q)
{
double P[2][2], A[2][2], B[2][2];
for (int i=0; i<2; i++)
{
for (int j=0; j<2; j++)
{
A[i][j] = *( N + ((2*i) + j) );
B[i][j] = *( Q + ((2*i) + j) );
P[i][j] = 0;
}
}
for (int i=0; i<2; i++)
{
for (int j=0; j<2; j++)
{
for (int k=0; k<2; k++)
{
P[i][j] += A[i][k]*B[k][i];
}
}
}
for (int i=0; i<2; i++)
{
for (int j=0; j<2; j++)
{
*( M + ((2*i) + j) ) = P[i][j];
}
}
}
When I compile this using cython, I get the following error
mat.pyx:17:27: Cannot assign type 'double complex *' to 'double *'
I will be grateful to have some help here.
This error message is telling you what's wrong:
mat.pyx:17:27: Cannot assign type 'double complex *' to 'double *'
That is, you have a double complex pointer from numpy (pointer to complex128 numpy dtype) and you're trying to pass that into the C++ function using double pointers. C++ needs to be able to deal with the complex numbers, so if you change your double* -> std::complex this should fix your problem
void mult(double *M, double *N, double *Q)
becomes
#include <complex>
void mult(std::complex<double> *M, std::complex<double> *N, std::complex<double> *Q)
Does numpy matrix multiply not suffice for your use case? Cython might be overkill.
Edit: Ok I finally got something, there's something a bit weird dealing with C++ std::complex and C double _Complex types.
cppmul.pyx:
import numpy as np
cimport numpy as np
cdef extern from "./matmult.h" nogil:
void mult(np.complex128_t* M, np.complex128_t* N, np.complex128_t* Q)
def sim():
cdef:
np.ndarray[np.complex128_t,ndim=2] N = np.zeros(( 2 , 2 ), dtype=np.complex128)
np.ndarray[np.complex128_t,ndim=2] Q = np.zeros(( 2 , 2 ), dtype=np.complex128)
np.ndarray[np.complex128_t,ndim=2] M = np.zeros(( 2 , 2 ), dtype=np.complex128)
N = np.array([[1.1 + 2j,2.2],[3.3,4.4]])
Q = np.array([[3.3,4.4+5j],[5.5,6.6]])
mult(&M[0,0], &N[0,0], &Q[0,0])
print M
matmul.c:
#include "matmult.h"
void mult(complex_t *M, complex_t *N, complex_t *Q)
{
complex_t P[2][2], A[2][2], B[2][2];
for (int i=0; i<2; i++)
{
for (int j=0; j<2; j++)
{
A[i][j] = *( N + ((2*i) + j) );
B[i][j] = *( Q + ((2*i) + j) );
P[i][j] = 0;
}
}
for (int i=0; i<2; i++)
{
for (int j=0; j<2; j++)
{
for (int k=0; k<2; k++)
{
P[i][j] += A[i][k]*B[k][i];
}
}
}
for (int i=0; i<2; i++)
{
for (int j=0; j<2; j++)
{
*( M + ((2*i) + j) ) = P[i][j];
}
}
}
matmult.h:
#include <complex.h>
typedef double _Complex complex_t;
void mult(complex_t *M, complex_t *N, complex_t *Q);
setup.py:
from distutils.core import setup
from Cython.Build import cythonize
from distutils.extension import Extension
import numpy as np
sourcefiles = ['cppmul.pyx', 'matmult.c']
extensions = [Extension("cppmul",
sourcefiles,
include_dirs=[np.get_include()],
extra_compile_args=['-O3']
)]
setup(
ext_modules = cythonize(extensions)
)
after running python setup.py build_ext --inplace it imports and runs as expected
import cppmul
cppmul.sim()
result:
[[15.73 +6.6j 15.73 +6.6j]
[43.56+16.5j 43.56+16.5j]]
try this
#include "matmult.h"
using namespace std;
int main(){}
void mult(double *M, double *N, double *Q)
{
double P[2][2], A[2][2], B[2][2];
for (int i=0; i<2; i++)
{
for (int j=0; j<2; j++)
{
A[i][j] = *( N + ((2*i) + j) );
B[i][j] = *( Q + ((2*i) + j) );
P[i][j] = 0;
}
}
for (int i=0; i<2; i++)
{
for (int j=0; j<2; j++)
{
for (int k=0; k<2; k++)
{
P[i][j] += A[i][k]*B[k][i];
}
}
}
for (int i=0; i<2; i++)
{
for (int j=0; j<2; j++)
{
*( ((2*i) + j) )+ M = P[i][j];
}
}
}
Related
Hello I am trying to write a simple C function that takes two inputs (m,n) and creates a 2D - array of pointers. Now I want to call that function in Ctypes and create a numpy array from the pointers. I am however not sure how to proceed - and run into an error when calling the np.frombuffer - function. Any help is apprechiated
c- file
#include <stdio.h>
#include <stdlib.h>
#define RANDOM_RANGE 50
typedef struct {
float val;
} cell;
cell **matrixMake(int m, int n){
// the rows
cell **pRow = (cell **)malloc(m* sizeof(cell *));
// the cols
for (int i = 0; i < m; i++){
pRow[i] = (cell *)malloc(n * sizeof(cell));
}
for (int i = 0; i < m; i++){
for (int j = 0; j < n; j++){
pRow[i][j].val = (float) (rand() % RANDOM_RANGE);
}
}
return pRow;
}
Corresponding Python File
import numpy as np
from numpy.ctypeslib import ndpointer
from ctypes import *
class CELL(Structure):
_fields_ = [ ('val', c_float) ]
libc = CDLL("c_arr_multi.so")
libc.matrixMake.argtypes = [ c_int, c_int ]
libc.matrixMake.restype = POINTER(POINTER(CELL))
res = libc.matrixMake(6, 3)
x = np.frombuffer((c_float * 6 * 3).from_address(libc.matrixMake(6, 3)), np.float32).copy()
print(x)
I am simply not shure how to proceed
I am hoping to mimic a Python for loop with the range() function in C. I'd like to accomplish a task an increasing number of times each loop until I reach the value of a given variable, in this case 5 (for the variable h). Here it is in Python:
x = 5
y = 0
while x > y:
for i in range(y+1):
print("#",end='')
print('')
y+=1
Output:
#
##
###
####
#####
I was able to accomplish the opposite (executing something a decreasing number of times) in C, as below:
{
int h = 5;
while (h > 0)
{
for (int i = 0; i < h; i++)
{
printf("#");
}
printf("\n");
h--;
}
}
Output:
#####
####
###
##
#
When I've attempted the top version in C, with the increasing number of executions, I run into the problem of not knowing how to control the various incrementing and decrementing variables.
I suggest you should think simply:
Increment up the number of # to print
Use loop to print that number of #
#include <stdio.h>
int main(void)
{
int h = 5;
for (int c = 1; c <= h; c++) // the number of # to print
{
for (int i = 0; i < c; i++)
{
printf("#");
}
printf("\n");
}
return 0;
}
Another way is simply writing in just the same way as the Python version:
#include <stdio.h>
int main(void)
{
int x = 5;
int y = 0;
while (x > y)
{
for (int i = 0; i < y+1; i++)
{
printf("#");
}
printf("\n");
y += 1;
}
return 0;
}
The solution in C:
#include <stdio.h>
int main ()
{
int x = 5;
int y = 0;
while (x > y)
{
for (int i=0;i<y+1;i++)
{
printf("#");
}
printf("\n");
}
return 0;
}
In Python, in the for loop, the variable is initialized as zero and increments by 1 by default. But in C, you need to do it explicitly.
I'm making a Game Engine in Python. Currently it runs at 5-7 FPS, and I want to increase it. So I thought to use C/C++ for putting shapes onto frames. My frames are 3-Dimensional arrays. I want to transport theses arrays between python and C++. But When using np.ctypeslib.ndpointer the array contains garbage.
Here's the code of C++ file:
#include <iostream>
using namespace std;
int*** zeros_3d_cpp(int n1, int n2, int n3) {
int ***arr = new int**[n1];
for (int i = 0; i < n1; i++) {
arr[i] = new int*[n2];
for (int j = 0; j < n2; j++) {
arr[i][j] = new int[n3];
for (int k = 0; k < n3; k++) {
arr[i][j][k] = 0;
}
}
}
return arr;
}
extern "C" {
__declspec(dllexport) int*** zeros_3d(int n1, int n2, int n3) { return zeros_3d_cpp(n1, n2, n3); }
}
Here's the python code:
from ctypes import *
import numpy as np
lib = cdll.LoadLibrary('./bin/main.dll')
lib.zeros_3d.argtypes = [c_int, c_int, c_int]
lib.zeros_3d.restype = np.ctypeslib.ndpointer(dtype=c_int, ndim=3, shape=(3, 2, 4))
print(lib.zeros_3d(3, 2, 4))
And the generated output is:
[[[ 4490944 0 4491040 0]
[ 4491136 0 387641098 268496650]]
[[ 4490976 0 4491008 0]
[ 4456792 0 387641098 268496650]]
[[ 0 0 0 0]
[ 4456792 0 387641098 268496650]]]
While it should be 0 all over.
Please Help me fast I want to finish this project before my school opens.
I don't think its a good idea to store frames in a Multi-Dimensional array as you mentioned. It would be easier to store the data in a linear 1-D array.
int*** zeros_3d_cpp(int n1, int n2, int n3) {
int ***arr = new int**[n1];
for (int i = 0; i < n1; i++) {
arr[i] = new int*[n2];
for (int j = 0; j < n2; j++) {
arr[i][j] = new int[n3];
for (int k = 0; k < n3; k++) {
arr[i][j][k] = 0;
}
}
}
return arr;
}
I would split the code into 2 parts, first allocating Array and then fill it up.
int* zeros_3d_cpp(int width, int height, int breadth) {
//Allocating Memory
int *arr = new int[width*height*breadth];
for (int i = 0; i < breadth; i++)
{
//Iterates over 2D Image
for (int j = 0; j < height; j++)
{
//Iterates over a Single Row
for (int k = 0; k < width; k++)
{
arr[i*width*height + j*width +k] = 0;
}
}
}
return arr;
}
Make sure to bind the memory (ie) the returned array ptr to some object in python.
Memory
Also you are using Raw Pointers, use extra precaution to see who is managing the memory of the resource. You can allocate the memory (Dynamic Array in your case) from python side using numpy or allocate from C++ and track the object lifetime in Python to make sure it's freed correctly.
You can also create the array in python using numpy and then return the pointer into C++ and so your C++ function would take the starting pointer address as input
void zeros_3d_cpp(int* start_address, int width, int height, int breadth) {
//Only fill in with zeros
}
I am benchmarking matrix multiplication for different libraries as I am thinking of rewriting some cython code to native c++. However the simple tests seem to imply that numpy is faster than BLAS or eigen for simple matrix multiplications.
I have written the following files:
#!test_blas.cpp
#include <random>
#include <cstdio>
#include <stdlib.h>
#include <iostream>
#include <cblas.h>
int main ( int argc, char* argv[] ) {
// Random numbers
std::mt19937_64 rnd;
std::uniform_real_distribution<double> doubleDist(0, 1);
// Create arrays that represent the matrices A,B,C
const int n = 2000;
double* A = new double[n*n];
double* B = new double[n*n];
double* C = new double[n*n];
// Fill A and B with random numbers
for(uint i =0; i <n; i++){
for(uint j=0; j<n; j++){
A[i*n+j] = doubleDist(rnd);
B[i*n+j] = doubleDist(rnd);
}
}
// Calculate A*B=C
clock_t start = clock();
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, A, n, B, n, 0.0, C, n);
clock_t end = clock();
double time = double(end - start)/ CLOCKS_PER_SEC;
std::cout<< "Time taken : " << time << std::endl;
// Clean up
delete[] A;
delete[] B;
delete[] C;
return 0;
}
#!test_eigen.cpp
#include <iostream>
#include <Eigen/Dense>
using namespace Eigen;
int main()
{
int n_a_rows = 2000;
int n_a_cols = 2000;
int n_b_rows = n_a_cols;
int n_b_cols = 2000;
MatrixXd a(n_a_rows, n_a_cols);
for (int i = 0; i < n_a_rows; ++ i)
for (int j = 0; j < n_a_cols; ++ j)
a (i, j) = n_a_cols * i + j;
MatrixXd b (n_b_rows, n_b_cols);
for (int i = 0; i < n_b_rows; ++ i)
for (int j = 0; j < n_b_cols; ++ j)
b (i, j) = n_b_cols * i + j;
MatrixXd d (n_a_rows, n_b_cols);
clock_t begin = clock();
d = a * b;
clock_t end = clock();
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
std::cout << "Time taken : " << elapsed_secs << std::endl;
}
#!test_numpy.py
import numpy as np
import time
N = 2000
a = np.random.rand(N, N)
b = np.random.rand(N, N)
start = time.time()
c = a.dot(b)
print(f"Time taken : {time.time() - start}")
Finally I created the following test file
#!test_matrix.sh
c++ -O2 -march=native -std=c++11 -I /usr/include/eigen3 test_eigen.cpp -o eigen
c++ -O2 -march=native -std=c++11 test_blas.cpp -o blas -lcblas
echo "testing BLAS"
./blas
echo "testing Eigen"
./eigen
echo "testing numpy"
python test_numpy.py
which yields the output
testing BLAS
Time taken : 1.63807
testing Eigen
Time taken : 0.795115
testing numpy
Time taken : 0.28397703170776367
Now my question is, how come numpy is the fastest of these tests? Am I missing something with regards to optimizations?
One thing could be that numpy uses threading to compute the matrix product. Adding the compiler flag -fopenmp however yields worse performance for eigen and BLAS.
I am using g++ version 9.0.3-1. Numpy is version 1.18.1 using python 3.8.2. Thanks in advance.
The example in Simple wrapping of C code with cython describes nicely how to evaluate a function written in C on an array passed from numpy and return the result in a numpy array.
How would one go about doing the same thing but returning a 2D array? I.e. I'd like to evaluate a C function on a grid defined by two numpy arrays, and return the result as a numpy 2D array.
It would be something like this (using same functions as in the link above). Obviously one can't use double z[] now, but I'm not sure how to pass a 2D numpy array to C.
/* fc.cpp */
int fc( int N, const double a[], const double b[], double z[] )
{
for( int i = 0; i < N; i ++ ){
for( int j = 0; j < N; j ++ ){
z[i][j] = somefunction(a[i],b[j]);
}
return N;
}
This is the original .pyx file (see below).
import numpy as np
cimport numpy as np
cdef extern from "fc.h":
int fc( int N, double* a, double* b, double* z ) # z = a + b
def fpy( N,
np.ndarray[np.double_t,ndim=1] A,
np.ndarray[np.double_t,ndim=1] B,
np.ndarray[np.double_t,ndim=1] Z ):
""" wrap np arrays to fc( a.data ... ) """
assert N <= len(A) == len(B) == len(Z)
fcret = fc( N, <double*> A.data, <double*> B.data, <double*> Z.data )
return fcret
Many thanks.
You can use a normal array for a 2D Matrix. You need only give the length of the dimension to the function.
In the C file do something as that:
(z is now an array of length N*N)
int fc( int N, const double a[], const double b[], double z[] )
{
for( int i = 0; i < N; i++ ){
for( int j = 0; j < N; j ++ ){
z[(i*N)+j] = somefunction(a[i],b[j]);
}
return N;
}
In Python you need to do the same, so you can use a 1D Array with N*N elements instead of an 2D Matrix.
Update 3D case
(z is now an array of length N*N*N)
int fc( int N, const double a[], const double b[],const double c[], double z[] )
{
for( int i = 0; i < N; i++ ){
for( int j = 0; j < N; j ++ ){
for( int k = 0; k < N; k ++ ){
z[((i*N)+j)*N+k] = somefunction(a[i],b[j],c[k]);
}
return N;
}