Let us consider the CUDA code at CUDA's Mersenne Twister for an arbitrary number of threads and suppose that I want to convert it to a pyCUDA application.
I know that I can use ctypes and CDLL, namely,
cudart = CDLL("/usr/local/cuda/lib64/libcudart.so")
to use the cudart routines.
However, I would also need to allocate, for example, a curandStateMtgp32 array whose definition is in curand_mtgp32.h, or else call
curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, devKernelParams);
and use mtgp32dc_params_fast_11213 whose definition is in curand_mtgp32_host.h.
How to deal with CUDA type definitions and values in pyCUDA?
I solved the problem with reference to device side APIs as follows:
I created a .dll containing two functions: MTGP32Setup() to setup the Mersenne Twister Generator and MTGP32Generation() to generate the random numbers;
I called the above functions using ctypes.
Source code for the .dll
// --- Generate random numbers with cuRAND's Mersenne Twister
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <cuda.h>
#include <curand_kernel.h>
/* include MTGP host helper functions */
#include <curand_mtgp32_host.h>
#define BLOCKSIZE 256
#define GRIDSIZE 64
curandStateMtgp32 *devMTGPStates;
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/*************************/
/* CURAND ERROR CHECKING */
/*************************/
static const char *_curandGetErrorEnum(curandStatus_t error)
{
switch (error)
{
case CURAND_STATUS_SUCCESS:
return "CURAND_SUCCESS";
case CURAND_STATUS_VERSION_MISMATCH:
return "CURAND_STATUS_VERSION_MISMATCH";
case CURAND_STATUS_NOT_INITIALIZED:
return "CURAND_STATUS_NOT_INITIALIZED";
case CURAND_STATUS_ALLOCATION_FAILED:
return "CURAND_STATUS_ALLOCATION_FAILED";
case CURAND_STATUS_TYPE_ERROR:
return "CURAND_STATUS_TYPE_ERROR";
case CURAND_STATUS_OUT_OF_RANGE:
return "CURAND_STATUS_OUT_OF_RANGE";
case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case CURAND_STATUS_LAUNCH_FAILURE:
return "CURAND_STATUS_LAUNCH_FAILURE";
case CURAND_STATUS_PREEXISTING_FAILURE:
return "CURAND_STATUS_PREEXISTING_FAILURE";
case CURAND_STATUS_INITIALIZATION_FAILED:
return "CURAND_STATUS_INITIALIZATION_FAILED";
case CURAND_STATUS_ARCH_MISMATCH:
return "CURAND_STATUS_ARCH_MISMATCH";
case CURAND_STATUS_INTERNAL_ERROR:
return "CURAND_STATUS_INTERNAL_ERROR";
}
return "<unknown>";
}
inline void __curandSafeCall(curandStatus_t err, const char *file, const int line)
{
if (CURAND_STATUS_SUCCESS != err) {
fprintf(stderr, "CURAND error in file '%s', line %d, error: %s \nterminating!\n", __FILE__, __LINE__, \
_curandGetErrorEnum(err)); \
assert(0); \
}
}
void curandSafeCall(curandStatus_t err) { __curandSafeCall(err, __FILE__, __LINE__); }
/*******************/
/* iDivUp FUNCTION */
/*******************/
__host__ __device__ int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/*********************/
/* GENERATION KERNEL */
/*********************/
__global__ void generate_kernel(curandStateMtgp32 * __restrict__ state, float * __restrict__ result, const int N)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int k = tid; k < N; k += blockDim.x * gridDim.x)
result[k] = curand_uniform(&state[blockIdx.x]);
}
extern "C" {
/**************************/
/* MERSENNE TWISTER SETUP */
/**************************/
__declspec(dllexport)
void MTGP32Setup() {
// --- Setup the pseudorandom number generator
gpuErrchk(cudaMalloc(&devMTGPStates, GRIDSIZE * sizeof(curandStateMtgp32)));
mtgp32_kernel_params *devKernelParams; gpuErrchk(cudaMalloc(&devKernelParams, sizeof(mtgp32_kernel_params)));
curandSafeCall(curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, devKernelParams));
curandSafeCall(curandMakeMTGP32KernelState(devMTGPStates, mtgp32dc_params_fast_11213, devKernelParams, GRIDSIZE, time(NULL)));
}
/*******************************/
/* MERSENNE TWISTER GENERATION */
/*******************************/
__declspec(dllexport)
void MTGP32Generation(float * __restrict__ devResults, const int N) {
// --- Generate pseudo-random sequence and copy to the host
generate_kernel << <GRIDSIZE, BLOCKSIZE >> > (devMTGPStates, devResults, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
}
} //
Source code for the PyCUDA caller
import os
import sys
import numpy as np
import ctypes
from ctypes import *
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
lib = cdll.LoadLibrary('D:\\Project\\cuRAND\\mersenneTwisterDLL\\x64\\Release\\mersenneTwisterDLL.dll')
N = 10
d_x = gpuarray.zeros((N, 1), dtype = np.float32)
lib.MTGP32Setup()
lib.MTGP32Generation(ctypes.cast(d_x.ptr, POINTER(c_float)), N)
print(d_x)
Host side APIs can be dealt with in a way similar to Calling host functions in PyCUDA.
Related
I'm working on the python with ctypes to call the c so file, but the c file define the structure with function pointer
// mem ==================================================================
typedef struct StdMemFunc
{
void* (*const malloc) (unsigned long size);
void (*const free) (void* ptr);
void* (*const realloc) (void* ptr, unsigned long size);
void* (*const calloc) (unsigned long count, unsigned long size);
void* (*const set) (void* ptr, int value, unsigned long num);
void* (*const copy) (void* dest, const void* src, unsigned long num);
}*StdMemFunc;
typedef struct StdLib
{
const uint32_t version;
bool (*const is_version_compatible) (uint32_t version, uint32_t func_mask);
void (*const delay) (int32_t milli_sec);
const StdMemFunc mem;
}*StdLib;
and mock the function in another file as below
void *std_malloc(unsigned long size)
{
return malloc(size);
}
void std_free(void *ptr)
{
free(ptr);
}
void *std_realloc(void *ptr, unsigned long size)
{
return realloc(ptr, size);
}
void *std_calloc(unsigned long count, unsigned long size)
{
return calloc(count, size);
}
void *std_memset(void *ptr, int value, unsigned long num)
{
return memset(ptr, value, num);
}
void *std_memcopy(void *dest, const void *src, unsigned long num)
{
return memcpy(dest, src, num);
}
struct StdMemFunc mem_func =
{
.malloc = std_malloc,
.free = std_free,
.realloc = std_realloc,
.calloc = std_calloc,
.set = std_memset,
.copy = std_memcopy
};
then the python need to call another method with std_lib as paramater, the std_lib with call mem->malloc() method in C part, so how to define the class in the python with ctypes?
I have tried the below one, but it was not work
class StdMemFunc(Structure):
_fields_ = [
("malloc", ctypes.CFUNCTYPE(c_void_p, c_ulong)),
("free", ctypes.CFUNCTYPE(None, c_void_p)),
("realloc", ctypes.CFUNCTYPE(c_void_p, c_void_p, c_ulong)),
("calloc", ctypes.CFUNCTYPE(c_void_p, c_ulong, c_ulong)),
("set", ctypes.CFUNCTYPE(c_void_p, c_void_p, c_int, c_ulong)),
("copy", ctypes.CFUNCTYPE(c_void_p, c_void_p, c_ulong))
]
class StdLib(Structure):
_fields_ = [
("version", c_uint32),
("is_version_compatible", c_bool),
("delay", c_void_p),
("mem", POINTER(StdMemFunc)),
]
libc_std_lib = CDLL('/home/linus/code/galileo/mock_std_lib.so')
std_lib = StdLib()
std_lib.mem.malloc = libc_std_lib.std_malloc
libc_modbus.modbus_create_server_station.argtypes = [POINTER(ModbusNodeDef), c_int, StdLib, PlcDrvAccessor]
libc_modbus.modbus_create_server_station.restype = POINTER(ModbusStation)
libc_modbus.modbus_create_server_station(node_def, node_num, std_lib, plc_drv_accessor)
It looks like there are two problems here:
The is_version_compatible and delay fields in the StdLib struct are functions, but you are defining them as constants.
You are not instantiating all the fields in the struct, meaning that the program might be trying to dereference a null pointer, as null pointers are the default value for pointer types.
The StdLib struct definition should look something like this:
class StdLib(Structure):
_fields_ = [
("version", c_uint32),
("is_version_compatible", CFUNCTYPE(POINTER(c_bool), c_uint32, _uint32)),
("delay", CFUNCTYPE(c_void_p, c_int32)),
("mem", POINTER(StdMemFunc)),
]
For the instantiation, I would do something like this:
libc_std_lib = CDLL('/home/linus/code/galileo/mock_std_lib.so')
std_mem_func = StdMemFunc(
libc_std_lib.std_malloc,
libc_std_lib.std_free,
libc_std_lib.std_realloc,
libc_std_lib.std_calloc,
libc_std_lib.std_set,
libc_std_lib.std_copy
)
std_lib = StdLib(
1,
reference_to_is_version_compatible_func,
reference_to_delay_func,
std_mem_func
)
Of course, you need to pass the correct params/function references to StdLib. Maybe you will need to mock the is_version_compatible and delay functions as well.
Disclaimer: this is entirely untested, so I don't guarantee it will work.
The OP's code isn't quite reproducible, but I was able to get the same error message on the following line:
std_lib.mem.malloc = libc_std_lib.std_malloc
If I am following correctly, the OP wants to initialize the C structure with functions that are provided in C, but libc.std_lib.std_malloc isn't wrapped properly to do that. It is a function that wraps a C function that is callable from Python, not C.
ctypes function prototypes can be instantiated a number of ways, and the one that works is:
prototype(func_spec[, paramflags])
Returns a foreign function exported by a shared library. func_spec must be a 2-tuple
(name_or_ordinal, library). The first item is the name of the exported
function as string, or the ordinal of the exported function as small
integer. The second item is the shared library instance.
For example:
std_lib.mem.malloc = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_ulong)(('std_malloc',libc_std_lib))
Here's a working set of files:
test.cpp
#include <stdlib.h>
#include <stdint.h>
#include <memory.h>
#include <stdio.h>
#ifdef _WIN32
# define API __declspec(dllexport)
#else
# define API
#endif
extern "C" {
typedef struct StdMemFunc {
void* (*const malloc)(unsigned long size);
void (*const free)(void* ptr);
void* (*const realloc)(void* ptr, unsigned long size);
void* (*const calloc)(unsigned long count, unsigned long size);
void* (*const set)(void* ptr, int value, unsigned long num);
void* (*const copy)(void* dest, const void* src, unsigned long num);
} *StdMemFuncPtr;
typedef struct StdLib {
const uint32_t version;
bool (*const is_version_compatible)(uint32_t version, uint32_t func_mask);
void (*const delay)(int32_t milli_sec);
const StdMemFunc mem;
} *StdLibPtr;
API void* std_malloc(unsigned long size) {
return malloc(size);
}
API void std_free(void* ptr) {
free(ptr);
}
API void* std_realloc(void* ptr, unsigned long size) {
return realloc(ptr, size);
}
API void* std_calloc(unsigned long count, unsigned long size) {
return calloc(count, size);
}
API void* std_memset(void* ptr, int value, unsigned long num) {
return memset(ptr, value, num);
}
API void* std_memcopy(void* dest, const void* src, unsigned long num) {
return memcpy(dest, src, num);
}
// A couple of test functions that accepts the initialized structure
// and calls sum of the function pointers.
API char* testit(StdLib* test) {
// This is how I debugged this, by comparing the *actual*
// function pointer value to the one received from Python.
// Once they matched the code worked.
printf("%p %p\n", std_malloc, test->mem.malloc);
char* p = static_cast<char*>(test->mem.malloc(10));
test->mem.set(p, 'A', 9);
p[9] = 0;
return p;
}
API void freeit(StdLib* test, char* p) {
test->mem.free(p);
}
}
test.py
import ctypes as ct
# prototypes
MALLOC = ct.CFUNCTYPE(ct.c_void_p,ct.c_ulong)
FREE = ct.CFUNCTYPE(None,ct.c_void_p)
REALLOC = ct.CFUNCTYPE(ct.c_void_p, ct.c_void_p, ct.c_ulong)
CALLOC = ct.CFUNCTYPE(ct.c_void_p, ct.c_ulong, ct.c_ulong)
SET = ct.CFUNCTYPE(ct.c_void_p,ct.c_void_p,ct.c_int,ct.c_ulong)
COPY = ct.CFUNCTYPE(ct.c_void_p, ct.c_void_p, ct.c_ulong)
class StdMemFunc(ct.Structure):
_fields_ = [("malloc", MALLOC),
("free", FREE),
("realloc", REALLOC),
("calloc", CALLOC),
("set", SET),
("copy", COPY)]
class StdLib(ct.Structure):
_fields_ = [("version", ct.c_uint32),
# Note these two fields were function pointers as well.
# Declared correctly now.
("is_version_compatible", ct.CFUNCTYPE(ct.c_bool, ct.c_uint32, ct.c_uint32)),
("delay", ct.CFUNCTYPE(None, ct.c_int32)),
("mem", StdMemFunc)]
dll = ct.CDLL('./test')
dll.testit.argtypes = ct.POINTER(StdLib),
dll.testit.restype = ct.POINTER(ct.c_char)
dll.freeit.argtypes = ct.POINTER(StdLib), ct.c_char_p
dll.freeit.restype = None
lib = StdLib()
lib.mem.malloc = MALLOC(('std_malloc', dll))
lib.mem.realloc = REALLOC(('std_realloc', dll))
lib.mem.calloc = CALLOC(('std_calloc', dll))
lib.mem.free = FREE(('std_free', dll))
lib.mem.set = SET(('std_memset', dll))
lib.mem.copy = COPY(('std_memcopy', dll))
p = dll.testit(lib)
# One way to access the data in the returned pointer is to slice it to the known length
print(p[:10])
# If known to be null-terminated, can also cast to c_char_p, which expects
# null-terminated data, and extract the value.
print(ct.cast(p,ct.c_char_p).value)
dll.freeit(lib,p)
Output:
b'AAAAAAAAA\x00'
b'AAAAAAAAA'
I want to implement the following function in c. I have written three different programming language's implementations of the function:
Python
def get_add(x):
def add(y):
return x + y
return add
add5 = get_add(5)
add5(10) # 15
add10 = get_add(10)
add10(10) # 20
JS
function get_add(x) {
return (y)=>{
return x + y
}
}
add5 = get_add(5)
add5(10) // 15
add10 = get_add(10)
add10(10) // 20
lua
function get_add(num)
return function(a) return a+num end
end
add5 = get_add(5)
add5(10) -- 15
add10 = get_add(10)
add10(10) -- 20
I cannot think of a way to implement this. Maybe this can be implemented using hash table in some manner? Or function pointers?
I would be fairly confident in the claim that there is no way to do this portably in strictly conforming C. That said, if you're willing to make some generous assumptions about how the specific implementation you're using works, you could do this by allocating memory for new code (or at least the data the closure would capture), marking it as executable, and then engaging in some standards-violating pointer casting. An example that at least works on my machine (x86-64 Linux) would be:
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
uint8_t (*add(uint8_t x))(uint8_t) {
// lea eax, [rdi + x]
// ret
char code[] = { 0x8D, 0x47, x, 0xC3 };
char *p = mmap(0, sizeof code, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS,
-1, 0);
memcpy(p, code, sizeof code);
return (uint8_t(*)(uint8_t))p;
}
int main(void) {
uint8_t (*add5)(uint8_t) = add(5);
printf(" 5 + 10 = %" PRIu8 "\n", add5(10));
printf("10 + 10 = %" PRIu8 "\n", add(10)(10));
return 0;
}
But as noted, this is non-portable at best, and is definitely not anywhere close to idiomatic C.
There are ways to do equivalent things to this in a standards-conforming manner, like storing the data you would capture in a struct and passing that to a different function, but in terms of doing it transparently with functions, I think this is about the best you can do.
So is there any other way to implement it?
Yes, but without external dependencies it can begin to get a little unwieldy.
One option is something like this, where for each different function you create a struct with all the variables you would want to capture and pass that as an argument:
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
struct add {
uint8_t x;
};
struct add get_add(uint8_t x) {
return (struct add) {
.x = x,
};
}
uint8_t add(struct add info, uint8_t y) {
return info.x + y;
}
int main(void) {
struct add add5 = get_add(5);
printf(" 5 + 10 = %" PRIu8 "\n", add(add5, 10));
printf("10 + 10 = %" PRIu8 "\n", add(get_add(10), 10));
return 0;
}
This becomes a bit verbose if you want a function that has more than a couple curried arguments.
#SteveSummit's suggestion in the comments about libffi is also good. With their closures API, here's an example of what you might want:
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <ffi.h>
void _add(ffi_cif *cif, void *ret, void *args[], void *x) {
*(ffi_arg*)ret = *(uint64_t*)x + *(uint64_t*)args[0];
}
int main(void) {
ffi_cif cif;
ffi_type *args[1];
void *add_code;
ffi_closure *add_closure = ffi_closure_alloc(sizeof *add_closure, &add_code);
uint64_t x = 5;
if (add_closure) {
args[0] = &ffi_type_uint64;
if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, 1, &ffi_type_uint64, args) == FFI_OK) {
if (ffi_prep_closure_loc(add_closure, &cif, _add, &x, add_code) == FFI_OK) {
printf(" 5 + 10 = %" PRIu64 "\n", ((uint64_t (*)(uint64_t))add_code)(10));
printf(" 5 + 15 = %" PRIu64 "\n", ((uint64_t (*)(uint64_t))add_code)(15));
}
}
}
ffi_closure_free(add_closure);
return 0;
}
I haven't checked, but my guess is their implementation is probably just a more robust wrapper around the first example I give (with additional platform support).
I'm figuring out the Python/C API for a more complex task. Initially, I wrote a simple example of adding two ndarrays of shape = (2,3) and type = float32.
I am able to pass two numpy arrays into c functions, read their dimensions and data and perform custom addion on data. But when I try to wrap the resulting data using PyArray_SimpleNewFromData, code hangs (returns NULL?)
To replicate the issue, create three files: mymath.c, setup.py, test.py in a folder as follows and run test.py (it runs setup.py to compile and install the module and then runs a simple test).
I'm using python in windows, inside an anaconda environment. I'm new to the Python/C API. So, any help would be much appreciated.
// mymath.c
#include <Python.h>
#include <stdio.h>
#include "numpy/arrayobject.h"
#include "numpy/npy_math.h"
#include <math.h>
#include <omp.h>
/*
C functions
*/
float* arr_add(float* d1, float* d2, int M, int N){
float * result = (float *) malloc(sizeof(float)*M*N);
for (int m=0; m<M; m++)
for (int n=0; n<N; n++)
result [m*N+ n] = d1[m*N+ n] + d2[m*N+ n];
return result;
}
/*
Unwrap apply and wrap pyObjects
*/
void capsule_cleanup(PyObject *capsule) {
void *memory = PyCapsule_GetPointer(capsule, NULL);
free(memory);
}
// add two 2d arrays (float32)
static PyObject *arr_add_fn(PyObject *self, PyObject *args)
{
PyArrayObject *arr1, *arr2;
if (!PyArg_ParseTuple(args, "OO", &arr1, &arr2))
return NULL;
// get data as flat list
float *d1, *d2;
d1 = (float *) arr1->data;
d2 = (float *) arr2->data;
int M, N;
M = (int)arr1->dimensions[0];
N = (int)arr1->dimensions[1];
printf("Dimensions, %d, %d \n\n", M,N);
PyObject *result, *capsule;
npy_intp dim[2];
dim[0] = M;
dim[1] = N;
float * d3 = arr_add(d1, d2, M, N);
result = PyArray_SimpleNewFromData(2, dim, NPY_FLOAT, (void *)d3);
if (result == NULL)
return NULL;
// -----------This is not executed. code hangs--------------------
for (int m=0; m<M; m++)
for (int n=0; n<N; n++)
printf("%f \n", d3[m*N+n]);
capsule = PyCapsule_New(d3, NULL, capsule_cleanup);
PyArray_SetBaseObject((PyArrayObject *) result, capsule);
return result;
}
/*
Bundle functions into module
*/
static PyMethodDef MyMethods [] ={
{"arr_add", arr_add_fn, METH_VARARGS, "Array Add two numbers"},
{NULL,NULL,0,NULL}
};
/*
Create module
*/
static struct PyModuleDef mymathmodule = {
PyModuleDef_HEAD_INIT,
"mymath", "My doc of mymath", -1, MyMethods
};
PyMODINIT_FUNC PyInit_mymath(void){
return PyModule_Create(&mymathmodule);
}
# setup.py
from distutils.core import setup, Extension
import numpy
module1 = Extension('mymath',
sources = ['mymath.c'],
# define_macros = [('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')],
include_dirs=[numpy.get_include()],
extra_compile_args = ['-fopenmp'],
extra_link_args = ['-lgomp'])
setup (name = 'mymath',
version = '1.0',
description = 'My math',
ext_modules = [module1])
# test.py
import os
os.system("python .\setup.py install")
import numpy as np
import mymath
a = np.arange(6,dtype=np.float32).reshape(2,3)
b = np.arange(6,dtype=np.float32).reshape(2,3)
c = mymath.arr_add(a,b)
print(c)
I wrote a dll library in c, compile with vs2017 64-bit, and try to load it with python3.6 64-bit. However the object's member variable's address got truncated to 32-bit.
Here's my sim.c file, which is compiled to sim.dll:
class Detector {
public:
Detector();
void process(int* pin, int* pout, int n);
private:
int member_var;
};
Detector::Detector()
{
memset(&member_var, 0, sizeof(member_var));
myfile.open("addr_debug.txt");
myfile << "member_var init address: " << &member_var << endl;
}
void Detector::process(int* pin, int* pout, int n);
{
myfile << "member_var process address: " << &member_var << endl;
myfile.close();
}
#define DllExport __declspec( dllexport )
extern "C" {
DllExport Detector* Detector_new() { return new Detector(); }
DllExport void Detector_process(Detector* det, int* pin, int* pout, int n)
{
det->process(pin, pout, n);
}
}
Here's my python script:
from ctypes import cdll
lib = cdll.LoadLibrary(r'sim.dll')
class Detector(object):
def __init__(self):
self.obj = lib.Detector_new()
def process(self,pin, pout, n):
lib.Detector_process(self.obj,pin, pout, n)
detector = Detector()
n = 1024
a = np.arange(n, dtype=np.uint32)
b = np.zeros(n, dtype=np.int32)
aptr = a.ctypes.data_as(ctypes.POINTER(ctypes.c_int))
bptr = b.ctypes.data_as(ctypes.POINTER(ctypes.c_int))
detector.process(aptr, bptr, n)
Here's the address of the member_var in addr_debug.txt:
member_var init address: 0000025259E123C4
member_var process address: 0000000059E123C4
So accessing it trigger memory access error:
OSError: exception: access violation reading 0000000059E123C4
Some attempts I tried to understand the issue:
Define member_var as public instead of private, not help, address still truncated.
Define member_var as global variable, then the address is ok. So I guess the member_var address truncation happens either when returning the object to python, or passing the object back to dll.
Always (CORRECTLY) specify argtypes and restype for functions defined in C, otherwise (C89 style) they will default to int (generally 32bit), generating !!! Undefined Behavior !!!. On 64bit, addresses (larger than 2 GiB) will be truncated (which is exactly what you're experiencing). Check [SO]: C function called from Python via ctypes returns incorrect value (#CristiFati's answer) for more details.
Also, when running into issues, don't forget about [Python.Docs]: ctypes - A foreign function library for Python.
Below it's an adapted version of your code.
detector.cpp:
#include <stdio.h>
#include <memory.h>
#include <fstream>
#define SIM_EXPORT __declspec(dllexport)
#define C_TAG "From C"
#define PRINT_MSG_3SPI(ARG0, ARG1, ARG2) printf("%s - [%s] (%d) - [%s]: %s: 0x%0p(%d)\n", C_TAG, __FILE__, __LINE__, __FUNCTION__, ARG0, ARG1, ARG2)
using std::endl;
std::ofstream outFile;
class Detector {
public:
Detector();
void process(int *pIn, int *pOut, int n);
private:
int m_var;
};
Detector::Detector()
: m_var(25) {
outFile.open("addr_debug.txt");
outFile << "m_var init address: " << &m_var << endl;
PRINT_MSG_3SPI("&m_var(m_var)", &m_var, m_var);
}
void Detector::process(int *pIn, int *pOut, int n)
{
outFile << "m_var process address: " << &m_var << endl;
outFile.close();
PRINT_MSG_3SPI("&m_var(m_var)", &m_var, m_var);
}
#if defined(__cplusplus)
extern "C" {
#endif
SIM_EXPORT Detector* DetectorNew() { return new Detector(); }
SIM_EXPORT void DetectorProcess(Detector *pDet, int *pIn, int *pOut, int n)
{
pDet->process(pIn, pOut, n);
}
SIM_EXPORT void DetectorDelete(Detector *pDet) { delete pDet; }
#if defined(__cplusplus)
}
#endif
code00.py:
#!/usr/bin/env python
import ctypes as ct
import sys
import numpy as np
IntPtr = ct.POINTER(ct.c_int)
sim_dll = ct.CDLL("./sim.dll")
detector_new_func = sim_dll.DetectorNew
detector_new_func.argtypes = ()
detector_new_func.restype = ct.c_void_p
detector_process_func = sim_dll.DetectorProcess
detector_process_func.argtypes = (ct.c_void_p, IntPtr, IntPtr, ct.c_int)
detector_process_func.restype = None
detector_delete_func = sim_dll.DetectorDelete
detector_delete_func.argtypes = (ct.c_void_p,)
detector_delete_func.restype = None
class Detector():
def __init__(self):
self.obj = detector_new_func()
def process(self, pin, pout, n):
detector_process_func(self.obj, pin, pout, n)
def __del__(self):
detector_delete_func(self.obj)
def main(*argv):
detector = Detector()
n = 1024
a = np.arange(n, dtype=np.uint32)
b = np.zeros(n, dtype=np.int32)
aptr = a.ctypes.data_as(IntPtr)
bptr = b.ctypes.data_as(IntPtr)
detector.process(aptr, bptr, n)
if __name__ == "__main__":
print("Python {:s} {:03d}bit on {:s}\n".format(" ".join(elem.strip() for elem in sys.version.split("\n")),
64 if sys.maxsize > 0x100000000 else 32, sys.platform))
rc = main(*sys.argv[1:])
print("\nDone.")
sys.exit(rc)
Notes:
As I stated at the beginning, the problem was argtypes and restype not being specified (e.g. for DetectorNew: comment detector_new_func.restype = ct.c_void_p, and you'll run into the problem again)
Code in the question is missing parts (#includes, imports, ...), also there are some syntax errors, so it doesn't compile, and therefore doesn't follow [SO]: How to create a Minimal, Complete, and Verifiable example (mcve) guidelines. Please when make sure to have MCVE when asking
The object that you allocate (new Detector()), must also be deallocated (otherwise, it will generate a memory leak), so I added a function (DetectorDelete - to do that), which is called from (Python) Detector's destructor
Other (non critical) changes (identifiers renaming, a bit of refactoring, printing to stdout, ...)
Output:
(py35x64_tes1) e:\Work\Dev\StackOverflow\q052268294>"c:\Install\x86\Microsoft\Visual Studio Community\2015\vc\vcvarsall.bat" x64
(py35x64_test) e:\Work\Dev\StackOverflow\q052268294>dir /b
code00.py
detector.cpp
(py35x64_test) e:\Work\Dev\StackOverflow\q052268294>cl /nologo /DDLL /EHsc detector.cpp /link /DLL /OUT:sim.dll
detector.cpp
Creating library sim.lib and object sim.exp
(py35x64_test) e:\Work\Dev\StackOverflow\q052268294>dir /b
code00.py
detector.cpp
detector.obj
sim.dll
sim.exp
sim.lib
(py35x64_test) e:\Work\Dev\StackOverflow\q052268294>"e:\Work\Dev\VEnvs\py35x64_test\Scripts\python.exe" ./code.py
Python 3.5.4 (v3.5.4:3f56838, Aug 8 2017, 02:17:05) [MSC v.1900 64 bit (AMD64)] 064bit on win32
From C - [detector.cpp] (28) - [Detector::Detector]: &m_var: 0x0000020CE366E270
From C - [detector.cpp] (34) - [Detector::process]: &m_var: 0x0000020CE366E270
Done.
I have the following C file that I am compiling to a shared object. I then load the .so shared object via ctypes in python. I can call the function from ctypes, and the function prints the correct temp and humidity, however I can't seem to get the struct back from the main code. How can I get the struct back from the C function and how can I retrieve the fields from it within python.
#!/bin/python
from ctypes import *
class HMTEMP(Structure):
_fields_ = [ ("temp", c_double) , ("humidity", c_double) ]
dhtlib = 'libdht4py.so'
hlibc = CDLL(dhtlib)
HMTEMP = hlibc.readDHT()
print HMTEMP.temp
#define BCM2708_PERI_BASE 0x20000000
#define GPIO_BASE (BCM2708_PERI_BASE + 0x200000) /* GPIO controller */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <dirent.h>
#include <fcntl.h>
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <bcm2835.h>
#include <unistd.h>
#define MAXTIMINGS 100
struct DHStruct {
double temp;
double humidity;
} ;
struct DHStruct readDHT();
int bits[250], data[100];
int bitidx = 0;
struct DHStruct readDHT() {
bcm2835_init() ;
int type = 11 ;
int pin = 4 ;
struct DHStruct dhts;
int counter = 0;
int laststate = HIGH;
int j=0;
// Set GPIO pin to output
bcm2835_gpio_fsel(pin, BCM2835_GPIO_FSEL_OUTP);
bcm2835_gpio_write(pin, HIGH);
usleep(500000); // 500 ms
bcm2835_gpio_write(pin, LOW);
usleep(20000);
bcm2835_gpio_fsel(pin, BCM2835_GPIO_FSEL_INPT);
data[0] = data[1] = data[2] = data[3] = data[4] = 0;
// wait for pin to drop?
while (bcm2835_gpio_lev(pin) == 1) {
usleep(1);
} //while
// read data!
for (int i=0; i< MAXTIMINGS; i++) {
counter = 0;
while ( bcm2835_gpio_lev(pin) == laststate) {
counter++;
//nanosleep(1); // overclocking might change this?
if (counter == 1000)
break;
}//while
laststate = bcm2835_gpio_lev(pin);
if (counter == 1000) break;
bits[bitidx++] = counter;
if ((i>3) && (i%2 == 0)) {
// shove each bit into the storage bytes
data[j/8] <<= 1;
if (counter > 200)
data[j/8] |= 1;
j++;
}//if
} //for
dhts.temp = data[2] ;
dhts.humidity = data[0] ;
printf("Temp = %5.2f *C, Hum = %5.2f \%\n", dhts.temp , dhts.humidity );
return dhts;
}//function
Ok I got it - and using ctypes was very fast. The python code:
#!/bin/python
from ctypes import *
# define the struct and it's fields
class DHStruct(Structure):
_fields_ = [("temp",c_double),("humidity",c_double)]
#reference the library
dhtlib = CDLL("libdht4py.so")
# set the return type as the object above
dhtlib.readDHT.restype = POINTER(DHStruct)
# dereference the pointer using ctype's -contents and access the struct fields.
print ( dhtlib.readDHT().contents.temp , dhtlib.readDHT().contents.humidity )
The C code : the key was to convert the function to return a pointer.
#define BCM2708_PERI_BASE 0x20000000
#define GPIO_BASE (BCM2708_PERI_BASE + 0x200000) /* GPIO controller */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <dirent.h>
#include <fcntl.h>
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <bcm2835.h>
#include <unistd.h>
#define MAXTIMINGS 100
//define the struct
struct DHStruct {
double temp;
double humidity;
} ;
struct DHStruct *readDHT(); // define the function prototype to return the pointer
int bits[250], data[100];
int bitidx = 0;
//make sure to return a POINTER!!
struct DHStruct *readDHT() {
bcm2835_init() ;
int type = 11 ;
int pin = 4 ;
struct DHStruct *dhts; // here is the key - define the pointer to the struct
int counter = 0;
int laststate = HIGH;
int j=0;
// Set GPIO pin to output
bcm2835_gpio_fsel(pin, BCM2835_GPIO_FSEL_OUTP);
bcm2835_gpio_write(pin, HIGH);
usleep(500000); // 500 ms
bcm2835_gpio_write(pin, LOW);
usleep(20000);
bcm2835_gpio_fsel(pin, BCM2835_GPIO_FSEL_INPT);
data[0] = data[1] = data[2] = data[3] = data[4] = 0;
// wait for pin to drop?
while (bcm2835_gpio_lev(pin) == 1) {
usleep(1);
} //while
// read data!
for (int i=0; i< MAXTIMINGS; i++) {
counter = 0;
while ( bcm2835_gpio_lev(pin) == laststate) {
counter++;
//nanosleep(1); // overclocking might change this?
if (counter == 1000)
break;
}//while
laststate = bcm2835_gpio_lev(pin);
if (counter == 1000) break;
bits[bitidx++] = counter;
if ((i>3) && (i%2 == 0)) {
// shove each bit into the storage bytes
data[j/8] <<= 1;
if (counter > 200)
data[j/8] |= 1;
j++;
}//if
} //for
dhts->temp = data[2] ;
dhts->humidity = data[0] ;
//for debug printf("Temp = %5.2f *C, Hum = %5.2f \%\n", dhts->temp , dhts->humidity );
return dhts;
}//function
To combine C/C++ and Python I would recommend to use Cython.
With Cython you are able to pass objects (eg. numpy arrays) to C/C++, fill it with your data and get it back to your python-code.
Here is a minmal example:
The C-skript: (c_example.c)
#include <stdlib.h>
#include <math.h>
void c_claculate(double *x, int N) {
int i;
for (i = 0; i<N;i++) {
x[i]+=i*i;
}
}
The python-skript: (example.py)
from numpy import *
from example import *
data=zeros(10)
calculate(data)
print data
The .pyx file: (example.pyx)
import cython
import numpy
cimport numpy
# declare the interface to the C code
cdef extern void c_claculate(double *x, int N)
# Cython interface to C function
def calculate(numpy.ndarray[double, ndim=1, mode='c'] x not None):
cdef int N = x.shape[0]
c_claculate(&x[0],N)
return x
and the setup file: (setup.py)
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
import numpy
setup(
cmdclass = {'build_ext': build_ext},
ext_modules = [
Extension("example",
sources=["example.pyx", "c_example.c"],
include_dirs=[numpy.get_include()]
)
],
)
Now you can compile the skript by running
python setup.py build_ext -fi
and then execute the python skript.
Cython should be available via pip on your PI.