I'm trying to embed Python in a C++ multithreaded program.
What I do is calling two statistical functions from the Python C API to perform the Two Sample Kolmogorov-Smirnov Test and the Two Sample Anderson-Darling Test on some data that I collect. So I'm just embedding Python in my code, I'm not extending it or using my own Python functions.
I recently found out that in order to run a multithreaded program that uses the Python C API you need to properly handle the Global Interpreter Lock (GIL): whenever you use a Python C API function you need to acquire the GIL first, and then release it when you're done using the API functions.
The thing that I still don't understand is how to properly release the GIL from the main thread in order to let the others execute the Python code.
I tried this (option 1):
// Option 1: initializes the interpreter, calls PyEval_SaveThread(), runs n worker
// threads that each execute exec(i), joins them, and then finalizes the interpreter.
int main(int argc, const char * argv[]) {
int n = 4;
std::thread threads[n];
Py_Initialize();
PyEval_InitThreads();
// NOTE(review): the value returned by PyEval_SaveThread() is discarded here, so
// nothing is available to pass to PyEval_RestoreThread() before finalization.
PyEval_SaveThread();
// Start one worker per slot; every worker runs exec() with its index.
for (int i = 0; i < n; i++) {
threads[i] = std::thread(exec, i);
}
// Wait for all workers before shutting the interpreter down.
for (int i = 0; i < n; i++) {
threads[i].join();
}
// This is the call that is reported to crash with a segmentation fault.
Py_Finalize();
return 0;
}
But it gives me a segmentation fault when calling Py_Finalize().
So I tried this (option 2):
// Option 2: same flow as option 1, but the worker threads are created and joined
// between the Py_UNBLOCK_THREADS and Py_BLOCK_THREADS macros.
int main(int argc, const char * argv[]) {
int n = 4;
std::thread threads[n];
Py_Initialize();
PyEval_InitThreads();
// NOTE(review): presumably the macro expands to an assignment to a variable that
// Py_BLOCK_THREADS reads back, and the leading `PyThreadState *` turns it into
// that variable's declaration; confirm against the CPython headers.
PyThreadState * Py_UNBLOCK_THREADS
// Start one worker per slot; every worker runs exec() with its index.
for (int i = 0; i < n; i++) {
threads[i] = std::thread(exec, i);
}
// Wait for all workers before the macro below and Py_Finalize() run.
for (int i = 0; i < n; i++) {
threads[i].join();
}
Py_BLOCK_THREADS
// Reported to run to completion but to end with an "Exception ignored in:
// <module 'threading' ...>" AssertionError.
Py_Finalize();
return 0;
}
and this (option 3):
// Option 3: same flow as option 1, but the worker threads are created and joined
// inside a Py_BEGIN_ALLOW_THREADS / Py_END_ALLOW_THREADS region.
int main(int argc, const char * argv[]) {
int n = 4;
std::thread threads[n];
Py_Initialize();
PyEval_InitThreads();
Py_BEGIN_ALLOW_THREADS
// Start one worker per slot; every worker runs exec() with its index.
for (int i = 0; i < n; i++) {
threads[i] = std::thread(exec, i);
}
// Wait for all workers before leaving the allow-threads region.
for (int i = 0; i < n; i++) {
threads[i].join();
}
Py_END_ALLOW_THREADS
// Reported to run to completion but to end with an "Exception ignored in:
// <module 'threading' ...>" AssertionError.
Py_Finalize();
return 0;
}
With both these last two options the code runs but ends with this error:
Exception ignored in: <module 'threading' from '/usr/local/opt/python3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py'>
Traceback (most recent call last):
File "/usr/local/opt/python3/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py", line 1289, in _shutdown
assert tlock.locked()
AssertionError:
EDIT:
The code that is executed by the spawned threads is this:
// Maps one rand() draw linearly onto the interval that starts at lower_bound
// and spans (upper_bound - lower_bound).
double limited_rand(double lower_bound, double upper_bound) {
    const double width = upper_bound - lower_bound;
    // Number of rand() steps that correspond to one unit of the target interval.
    const double steps_per_unit = RAND_MAX / width;
    const double offset = rand() / steps_per_unit;
    return lower_bound + offset;
}
// Calls scipy.stats.ks_2samp on the two samples and returns element 1 of the tuple
// it yields, which the code treats as the p-value. Every Python C API call is made
// between PyGILState_Ensure() and PyGILState_Release().
// NOTE(review): no API result is checked for NULL/failure; the accompanying text
// says error checking was left out on purpose for readability.
double exec_1(std::vector<int> &left_sample, std::vector<int> &right_sample) {
PyGILState_STATE gstate = PyGILState_Ensure(); // Acquire the GIL before using the Python C API
PyObject* scipy_stats_module = PyImport_ImportModule("scipy.stats"); // Import the "scipy.stats" module
// NOTE(review): import_array() runs on every call of exec_1; presumably once per
// process is enough. Confirm against the NumPy C-API documentation.
import_array();
npy_intp left_nparray_shape[] = {(npy_intp)left_sample.size()}; // Length of the left array's single dimension
// 1-D NPY_INT array built directly on left_sample's storage (&left_sample[0]).
PyObject* left_sample_nparray = PyArray_SimpleNewFromData(1, left_nparray_shape, NPY_INT, &left_sample[0]);
npy_intp right_nparray_shape[] = {(npy_intp)right_sample.size()}; // Length of the right array's single dimension
// 1-D NPY_INT array built directly on right_sample's storage (&right_sample[0]).
PyObject* right_sample_nparray = PyArray_SimpleNewFromData(1, right_nparray_shape, NPY_INT, &right_sample[0]);
// Look up the function, then drop the module reference that is no longer needed.
PyObject* ks_2samp = PyObject_GetAttrString(scipy_stats_module, "ks_2samp");
Py_DecRef(scipy_stats_module);
// Call ks_2samp(left, right); the argument list is terminated by NULL.
PyObject* ks_2samp_return_val = PyObject_CallFunctionObjArgs(ks_2samp, left_sample_nparray, right_sample_nparray, NULL);
Py_DecRef(ks_2samp);
Py_DecRef(right_sample_nparray);
Py_DecRef(left_sample_nparray);
// Read item 1 of the result as a C double before the result is released.
double p_value = PyFloat_AsDouble(PyTuple_GetItem(ks_2samp_return_val, 1));
Py_DecRef(ks_2samp_return_val);
PyGILState_Release(gstate); // Release the GIL
return p_value;
}
// Fills a caller-allocated buffer with two rows laid out back to back:
// c_array[0 .. len) receives row1 and c_array[len .. 2*len) receives row2,
// where len is row_length_c_array.
//
// c_array             buffer with room for at least 2 * row_length_c_array ints
// row_length_c_array  number of elements copied from each row
// row1, row2          sources; each must hold at least row_length_c_array elements
void initialize_c_2d_int_array(int*& c_array, unsigned long row_length_c_array, std::vector<int> &row1, std::vector<int> &row2) {
    // The counter has the same type as the bound. A narrower `unsigned int`
    // counter wraps around before reaching a bound above UINT_MAX, so the
    // loop would never terminate.
    for (unsigned long i = 0; i < row_length_c_array; i++) {
        c_array[i] = row1[i];
        c_array[row_length_c_array + i] = row2[i];
    }
}
// Calls scipy.stats.anderson_ksamp on a 2 x n_cols array built from the two samples
// and returns element 2 of the tuple it yields, which the code treats as the
// p-value. Every Python C API call is made between PyGILState_Ensure() and
// PyGILState_Release().
// NOTE(review): no API or malloc result is checked for NULL/failure; the
// accompanying text says error checking was left out on purpose for readability.
double exec_2(std::vector<int> &left_sample, std::vector<int> &right_sample){
PyGILState_STATE gstate = PyGILState_Ensure(); // Acquire the GIL before using the Python C API
PyObject* scipy_stats_module = PyImport_ImportModule("scipy.stats"); // Import the "scipy.stats" module
// import_array();
// Both rows are truncated to the shorter of the two samples.
unsigned long n_cols = std::min(left_sample.size(), right_sample.size());
// Scratch buffer holding the left sample followed by the right sample.
int* both_samples = (int*) (malloc(2 * n_cols * sizeof(int)));
initialize_c_2d_int_array(both_samples, n_cols, left_sample, right_sample);
npy_intp dim3[] = {2, (npy_intp) n_cols};
// 2-D NPY_INT array built directly on the scratch buffer.
PyObject* both_samples_nparray = PyArray_SimpleNewFromData(2, dim3, NPY_INT, both_samples);
// Look up the function, then drop the module reference that is no longer needed.
PyObject* anderson_ksamp = PyObject_GetAttrString(scipy_stats_module, "anderson_ksamp");
Py_DecRef(scipy_stats_module);
// Call anderson_ksamp(both_samples); the argument list is terminated by NULL.
PyObject* anderson_2samp_return_val = PyObject_CallFunctionObjArgs(anderson_ksamp, both_samples_nparray, NULL);
Py_DecRef(anderson_ksamp);
// The array reference is dropped before the buffer underneath it is freed.
Py_DecRef(both_samples_nparray);
free(both_samples);
// Read item 2 of the result as a C double before the result is released.
double p_value = PyFloat_AsDouble(PyTuple_GetItem(anderson_2samp_return_val, 2));
Py_DecRef(anderson_2samp_return_val);
PyGILState_Release(gstate); // Release the GIL
return p_value;
}
// Worker body: for 50 rounds, appends 100 fresh random values (drawn with
// limited_rand(0, 100)) to each sample and runs both tests on the samples.
// The vectors are never cleared, so they grow by 100 elements per round.
// thread_id is accepted but not used.
void exec(int thread_id) {
    const int rounds = 50;
    const int values_per_round = 100;
    std::vector<int> left_sample;
    std::vector<int> right_sample;

    int round = 0;
    while (round < rounds) {
        // Left and right draws alternate, one pair per step.
        for (int remaining = values_per_round; remaining > 0; --remaining) {
            left_sample.push_back(limited_rand(0, 100));
            right_sample.push_back(limited_rand(0, 100));
        }
        exec_1(left_sample, right_sample);
        exec_2(left_sample, right_sample);
        ++round;
    }
}
The functions where I use the Python C API are only exec_1 and exec_2, while exec just has the job of calling them repeatedly on new random data. This is the simplest code I could think of that mimics the behavior of my real code. I've also left out every kind of error checking when using the Python APIs for better readability.
Without any other choice I'll run my code like option 2 or option 3 and forget about the error, but I would really like to understand what's going on. Can you help me?
P.S. I'm running Python 3.6.1 under a macOS 10.12.5 system using Xcode 8.3.3. If you need more details let me know.
option1:
I think it is giving you a segmentation fault because you called PyEval_SaveThread() (which releases the GIL, returns the saved thread state, and sets the current thread state to NULL) without keeping the thread state it returns.
Py_Finalize will try to free all memory associated with the interpreter, and I guess this includes the main thread state. So you can either capture this state with:
PyEval_InitThreads(); // initialize and acquire the GIL
// release the GIL, store the thread state, set the current thread state to NULL
PyThreadState *mainThreadState = PyEval_SaveThread();
*main code segment*
// re-acquire the GIL (restore the current thread state)
PyEval_RestoreThread(mainThreadState);
Py_Finalize();
return 0;
Or you can call PyEval_ReleaseLock() immediately after calling PyEval_InitThreads(), since it looks like the main code segment does not use any embedded Python. I had a similar problem and that seemed to fix it.
NOTE: Other threads will still need to acquire/release the GIL wherever necessary.
Related
I am working on my Project which implies the use of Empirical Mode Decomposition in C++ for EEG Signals. The input Data is Eigen::MatrixXd, where the rows are the Channels and the columns are the samples.
I did not find a good C++ library for EMD, so I want to use a Python one (dsatools). I installed the package with pip from its setup.py file on Xubuntu, so it is a system package now.
The problem is that the program can't import the module.
this is the code:
// For every row (channel) of `array`: starts the Python interpreter, tries to import
// the module containing `emd`, calls it with the row, order, the method "cubic" and
// the iteration count, and appends the returned array to the result as an
// Eigen::MatrixXd. The `locality` parameter is not used in this body.
std::vector <Eigen::MatrixXd> DataAquisition::EMD (Eigen::MatrixXd array, int order, int iterations, int locality) {
std::vector <Eigen::MatrixXd> IMFs;
for (int i = 0; i < array.rows(); i++) {
Eigen::MatrixXd Kanals = array.row(i);
Eigen::MatrixXd IMFs_Cpp;
// NOTE(review): the interpreter is initialized and finalized once per row, inside
// the loop.
Py_Initialize();
//PyRun_SimpleString("from dsatools._base._imf_decomposition import * ");
PyObject* sys_path = PySys_GetObject("path");
// NOTE(review): the string appended to sys.path is the path of the _emd.py file
// itself; sys.path entries are presumably expected to be directories. Confirm.
PyObject* ProgrammName = PyUnicode_FromString("/home/user/Schreibtisch/mne-cpp-main/applications/mne_bci/MNE-BCI-QT/dsatools-master/dsatools/_base/_imf_decomposition/_emd.py");
PyList_Append(sys_path, ProgrammName);
// NOTE(review): the module name passed to the import includes the ".py" suffix;
// import names normally omit it, which may be why the module is not found.
PyObject* pModuleString = PyUnicode_FromString ((char*)"_emd.py");
PyObject* pModule = PyImport_Import(pModuleString);
// NOTE(review): pModule is used without a NULL check.
PyObject* pFunction = PyObject_GetAttrString(pModule,(char*)"emd");
//PyObject* pDict = PyModule_GetDict(pModule);
//PyObject* pFunc = PyDict_GetItemString(pDict, (char*)"emd");
if (PyCallable_Check(pFunction))
{
// NOTE(review): a double* is passed for the "d" format unit, which presumably
// expects a single C double rather than a pointer to the channel data. Confirm.
PyObject* Signal = Py_BuildValue("(d)",(double*)Kanals.data());
PyObject* Order = Py_BuildValue("(i)",order);
PyObject* method = Py_BuildValue("(z)",(char*)"cubic");
PyObject* max_itter = Py_BuildValue("(i)",iterations);
// Each argument above is already a 1-tuple; they are packed into a 4-tuple here.
PyObject* args = PyTuple_Pack(4,Signal,Order,method,max_itter);
PyErr_Print();
PyObject* IMFs_Py = PyObject_CallObject(pFunction,args);
PyErr_Print();
// NOTE(review): the "NOT Array" message is printed when PyArray_Check succeeds,
// so the condition looks inverted. The `if` has no braces, so the lines that
// follow run regardless of the check and without a NULL check on IMFs_Py.
if (PyArray_Check(IMFs_Py))
std::cout << "EMD Output is NOT Array \n";
PyArrayObject *np_ret = reinterpret_cast <PyArrayObject*> (IMFs_Py);
// The first two shape entries are read as the row and column counts.
int Rows = PyArray_SHAPE(np_ret)[0];
int Cols = PyArray_SHAPE(np_ret)[1];
double* c_out = reinterpret_cast<double*>(PyArray_DATA(np_ret));
// NOTE(review): this declaration shadows the IMFs_Cpp declared at the top of the
// loop body.
Eigen::MatrixXd IMFs_Cpp = Eigen::Map <Eigen::MatrixXd> (c_out,Rows,Cols);
IMFs.push_back(IMFs_Cpp);
}
else
std::cout << "Python did not call the function \n";
Py_Finalize();
}
return IMFs;}
This is what the code should look like in Python; I just want to call the emd function:
I'm figuring out the Python/C API for a more complex task. Initially, I wrote a simple example of adding two ndarrays of shape = (2,3) and type = float32.
I am able to pass two numpy arrays into C functions, read their dimensions and data, and perform a custom addition on the data. But when I try to wrap the resulting data using PyArray_SimpleNewFromData, the code hangs (returns NULL?).
To replicate the issue, create three files: mymath.c, setup.py, test.py in a folder as follows and run test.py (it runs setup.py to compile and install the module and then runs a simple test).
I'm using python in windows, inside an anaconda environment. I'm new to the Python/C API. So, any help would be much appreciated.
// mymath.c
#include <Python.h>
#include <stdio.h>
#include "numpy/arrayobject.h"
#include "numpy/npy_math.h"
#include <math.h>
#include <omp.h>
/*
C functions
*/
/*
 * Element-wise sum of two M x N float matrices stored contiguously.
 *
 * d1, d2  inputs; each must hold at least M*N floats
 * M, N    matrix dimensions
 *
 * Returns a newly malloc'ed buffer of M*N floats that the caller must free(),
 * or NULL when a dimension is negative or the allocation fails.
 */
float* arr_add(float* d1, float* d2, int M, int N){
    if (M < 0 || N < 0)
        return NULL;

    /* Do the size and index arithmetic in size_t so that m*N+n cannot
       overflow an int for large matrices. */
    size_t count = (size_t)M * (size_t)N;
    float *result = (float *) malloc(sizeof(float) * count);
    if (result == NULL)
        return NULL;   /* never write through a failed allocation */

    /* Both inputs share the output layout, so one flat pass covers every
       (m, n) pair. */
    for (size_t i = 0; i < count; i++)
        result[i] = d1[i] + d2[i];
    return result;
}
/*
Unwrap apply and wrap pyObjects
*/
/* Capsule destructor: frees the pointer stored in the capsule, which is looked
   up with a NULL name. */
void capsule_cleanup(PyObject *capsule) {
    free(PyCapsule_GetPointer(capsule, NULL));
}
// add two 2d arrays (float32)
/*
 * Python-callable wrapper around arr_add(): takes two array arguments, adds them
 * element-wise and returns the sum as a new array backed by the malloc'ed buffer,
 * with a capsule set as the array's base object so capsule_cleanup() frees it.
 *
 * NOTE(review): nothing visible in this file calls import_array(). NumPy's C API
 * presumably has to be initialised with it (normally in the module init function)
 * before PyArray_SimpleNewFromData is used. Confirm against the NumPy C-API docs.
 */
static PyObject *arr_add_fn(PyObject *self, PyObject *args)
{
PyArrayObject *arr1, *arr2;
/* NOTE(review): "OO" performs no type check, so any two objects are accepted and
   then treated as PyArrayObject. */
if (!PyArg_ParseTuple(args, "OO", &arr1, &arr2))
return NULL;
// get data as flat list
float *d1, *d2;
d1 = (float *) arr1->data;
d2 = (float *) arr2->data;
/* Only arr1's dimensions are read; arr2 is assumed to have the same shape. */
int M, N;
M = (int)arr1->dimensions[0];
N = (int)arr1->dimensions[1];
printf("Dimensions, %d, %d \n\n", M,N);
PyObject *result, *capsule;
npy_intp dim[2];
dim[0] = M;
dim[1] = N;
float * d3 = arr_add(d1, d2, M, N);
/* Wrap the freshly allocated buffer in a 2-D NPY_FLOAT array. */
result = PyArray_SimpleNewFromData(2, dim, NPY_FLOAT, (void *)d3);
/* NOTE(review): d3 is not freed on this early return. */
if (result == NULL)
return NULL;
// -----------This is not executed. code hangs--------------------
for (int m=0; m<M; m++)
for (int n=0; n<N; n++)
printf("%f \n", d3[m*N+n]);
/* Hand the buffer to a capsule and make that capsule the array's base object.
   NOTE(review): neither return value is checked. */
capsule = PyCapsule_New(d3, NULL, capsule_cleanup);
PyArray_SetBaseObject((PyArrayObject *) result, capsule);
return result;
}
/*
Bundle functions into module
*/
/* Method table: exposes arr_add_fn to Python under the name "arr_add" as a
   METH_VARARGS function; the all-NULL entry terminates the table. */
static PyMethodDef MyMethods [] ={
{"arr_add", arr_add_fn, METH_VARARGS, "Array Add two numbers"},
{NULL,NULL,0,NULL}
};
/*
Create module
*/
/* Module definition: name "mymath", its doc string, -1 in the size slot, and the
   MyMethods table. */
static struct PyModuleDef mymathmodule = {
PyModuleDef_HEAD_INIT,
"mymath", "My doc of mymath", -1, MyMethods
};
/* Module init function: creates the module from mymathmodule and returns it.
   NOTE(review): import_array() is not called here even though the module uses
   PyArray_* functions; that call is presumably required before any NumPy C-API
   use and is a likely cause of the reported hang. Confirm against the NumPy
   C-API documentation. */
PyMODINIT_FUNC PyInit_mymath(void){
return PyModule_Create(&mymathmodule);
}
# setup.py
# Build script for the "mymath" C extension.
from distutils.core import setup, Extension
import numpy

# Options for the extension: the single C source, NumPy's header directory and
# the OpenMP compiler/linker flags.
ext_options = dict(
    sources=['mymath.c'],
    # define_macros = [('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION')],
    include_dirs=[numpy.get_include()],
    extra_compile_args=['-fopenmp'],
    extra_link_args=['-lgomp'],
)
module1 = Extension('mymath', **ext_options)

setup(
    name='mymath',
    version='1.0',
    description='My math',
    ext_modules=[module1],
)
# test.py
import os
# Build and install the extension, then add two float32 arrays with it.
#
# The build runs with the interpreter that is executing this script, so the
# module lands in the active environment rather than whatever "python" is first
# on PATH. check_call raises if the build fails, instead of continuing to import
# a missing or stale module. The previous command string contained "\s", which
# is an invalid escape sequence, and a backslash path that only works on Windows.
import subprocess
import sys

subprocess.check_call([sys.executable, "setup.py", "install"])

import numpy as np
import mymath

# Two identical 2x3 float32 inputs holding the values 0..5.
a = np.arange(6, dtype=np.float32).reshape(2, 3)
b = np.arange(6, dtype=np.float32).reshape(2, 3)
c = mymath.arr_add(a, b)
print(c)
I would like to integrate C modules into Python, so I chose the Python.h interface. Everything compiled without errors or warnings, so I cannot understand what the problem is.
C side:
#include <python3.5m/Python.h>
...
/* Maps the name PyInt_AsLong onto PyLong_AsLong. */
#define PyInt_AsLong(x) (PyLong_AsLong((x)))
/* Shorthand alias for PyObject*. */
typedef PyObject* Py;
/*
 * Python-callable function: parses one object argument (nums), walks it as a list
 * and returns the sum of its items converted with PyInt_AsLong.
 */
static Py getSumma(Py self, Py args){
Py nums;
if (!PyArg_ParseTuple(args, "O", &nums)){
return NULL;
}
/* NOTE(review): PyList_Size is called on `args` (the argument tuple) rather than
   on `nums`, and its result is stored in an unsigned size_t without an error
   check, so a -1 failure value becomes a huge loop bound. */
size_t numsAmount = PyList_Size(args);
int32_t summa = 0;
for (size_t i = 0; i < numsAmount; i++){
/* NOTE(review): neither the item lookup nor the conversion is checked. */
Py temp = PyList_GetItem(nums, i);
int32_t num = PyInt_AsLong(temp);
summa += num;
}
/* NOTE(review): "l" presumably expects a C long while summa is an int32_t.
   Confirm against the Py_BuildValue documentation. */
return Py_BuildValue("l", summa);
}
/* Method table: exposes getSumma to Python as a METH_VARARGS function with no
   doc string; the all-NULL entry terminates the table. */
static PyMethodDef moduleMethods[] = {
{"getSumma", (PyCFunction)getSumma, METH_VARARGS, NULL},
{NULL, NULL, 0, NULL}
};
/* Module definition: name "SummaLogic", an empty doc string, -1 in the size slot,
   and the moduleMethods table. */
static PyModuleDef SummaLogic = {
PyModuleDef_HEAD_INIT,
"SummaLogic",
"",
-1,
moduleMethods
};
/* Module init function: creates the module from the SummaLogic definition and
   returns it. */
PyMODINIT_FUNC PyInit_SummaLogic(void){
return PyModule_Create(&SummaLogic);
}
setup.py:
# Build script for the "SummaLogic" C extension (single source file).
from distutils.core import Extension, setup

SummaLogic = Extension(name="SummaLogic", sources=['SummaLogic.c'])

setup(
    ext_modules=[SummaLogic],
)
Python side:
# Small driver: sums a three-element list with the extension and prints the result.
from SummaLogic import getSumma

if __name__ == "__main__":
    print(getSumma([1, 2, 3]))
It seems right, but when I start it in terminal - nothing happens, just hanging without any activity. What could I miss?
It boils down to PyList_Size and that you don't check for errors there.
You probably wanted to use it on nums, not args as argument. However you used on args and a very interesting thing happened:
args is a tuple,
therefore PyList_Size failed and returned -1
that -1 was then cast to an unsigned size_t, which resulted in a very large number, probably 2**64-1
therefore your iteration runs a "very long time" because it takes quite a while to iterate over 2**64-1 items (apart from all the out-of-bound memory accesses).
The quick fix would be to use:
Py_ssize_t listlength = PyList_Size(nums); /* nums instead of args */
if (listlength == -1) { /* check for errors */
    return NULL;
}
size_t numsAmount = (size_t)listlength; /* cast to size_t only after you checked for errors */
However you should check what the error conditions are and test for them after every python C API function call otherwise you'll get a lot of undefined behaviours. Also I probably would stick to the defined return types instead of int32_t (PyInt_AsLong returns long so you might get weird casting errors there as well!), size_t, ... and the typedef PyObject* Py; makes things really tricky for someone who regularly writes C extensions.
I am learning to embed Python in C++ code. I am having trouble using multi-threading to parallelize two C++ functions that embed Python.
My sample codes are shown below:
thread_test.py
import time


def test1():
    """Wait five seconds, print 1935 and return the string 'happy'."""
    time.sleep(5)
    print(1935)
    return 'happy'


def test2():
    """Wait ten seconds and print 3000; nothing is returned."""
    time.sleep(10)
    print(3000)
py_thread.h
// Puts file_dir at the front of sys.path, imports the Python module "thread_test",
// calls its test1() function and returns the string it yields.
//
// Returns an empty string when the import, the attribute lookup, the call or the
// string conversion fails. In those cases the Python error indicator is left set,
// as before, so callers may want to print or clear it.
//
// NOTE(review): this function does not acquire the GIL itself; the caller is
// responsible for calling it from a thread that is allowed to use the Python API.
string test_func1(string file_dir){
    string result_dir;

    // Make file_dir importable.
    string str = "import sys; sys.path.insert(0," "\'"+file_dir+"\'"+")";
    PyRun_SimpleString(str.c_str());

    PyObject* pName = PyString_FromString("thread_test");
    if (pName == NULL) {
        printf("pName returned NULL\n");
        return result_dir;
    }
    PyObject* pModule = PyImport_Import(pName);
    Py_DECREF(pName);
    if (pModule == NULL) {
        // Without a module there is nothing to look up or release.
        printf("pModule returned NULL\n");
        return result_dir;
    }

    PyObject* pFunc = PyObject_GetAttrString(pModule, "test1");
    if (pFunc != NULL) {
        PyObject* presult = PyObject_CallObject(pFunc, NULL);
        if (presult != NULL) {
            // Only build the std::string from a non-NULL char*.
            const char* text = PyString_AsString(presult);
            if (text != NULL) {
                result_dir = text;
            }
            Py_DECREF(presult);   // the call result was previously leaked
        }
        else {
            printf("presult returned NULL\n");
        }
        Py_DECREF(pFunc);         // released only when the lookup succeeded
    }
    else {
        printf("pFunc returned NULL\n");
    }
    Py_DECREF(pModule);
    return result_dir;
}
// Placeholder: the body is elided in the post and only described by the comment below.
void test_func2(string file_dir){
// Almost the same as test_func1, except that "test1" is replaced with "test2" and there is no result_dir return value
}
In the main function, if I don't use multi-threading and just run the two functions and other normal C++ functions in series, it works. But if I use a C++ threading technique such as OpenMP, it gives me a segmentation fault (code shown below).
main.cpp
// Initializes the interpreter once, then runs a two-thread OpenMP region in which
// thread 0 calls test_func1 and thread 1 (or any thread, when the team does not
// have exactly two threads) calls the C++-only function followed by test_func2.
// NOTE(review): both branches call into the Python API from OpenMP threads, and no
// GIL acquire/release call is visible in this snippet.
int main(){
Py_Initialize();
#pragma omp parallel num_threads(2)
{
int i = omp_get_thread_num();
if(i == 0)
{
test_func1("../");
}
// The second condition also runs this branch when fewer or more than two
// threads were actually started.
if(i == 1 || omp_get_num_threads() != 2)
{
// Placeholder name from the post; it is not a valid C++ identifier.
ANOTHER_C++_ONLY_SIMPLE_FUNCTION();
test_func2("../");
}
}
Py_Finalize();
return 0;
}
I also have tried thread in c++11 and pthread. They all give me segmentation fault. So how can I parallelize the two function???
Thank you!
I have a Python script embedded in C which I run in a thread. I need to pass the variable 'a' from the Python class 'DetectMotion' to my C program continuously (not as a return value).
I know I could do this with a fifo or something like that, but is there a way to pass it directly to C, maybe by calling a C function?
C:
#include <Python.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <pthread.h>
/* Handle of the worker thread created in main(). */
pthread_t mythread;
/* Thread entry point, defined after main(). */
void *ThreadProc();
/* Python objects shared between main() and ThreadProc(); pDict and pFunc2 are not
   read by ThreadProc in the visible code. */
PyObject *pName, *pModule, *pDict, *pFunc, *pFunc2;
/*
 * Initializes the interpreter, imports the "motion" module, looks up its
 * "get_values" function, runs it on a separate POSIX thread via ThreadProc, and
 * prints a message once per second for ten seconds before joining that thread.
 */
int main(int argc, char *argv[])
{
/* NOTE(review): py_callback and callback_descr are not declared anywhere in the
   visible code, and this call happens before Py_Initialize(). */
py_callback = PyCFunction_New(&callback_descr, NULL);
char *script = "motion";
char *functionUse = "get_values";
Py_Initialize();
/* NOTE(review): none of the results below is checked for NULL. */
pName = PyString_FromString(script);
pModule = PyImport_Import(pName);
// pDict and pFunc are borrowed references
pDict = PyModule_GetDict(pModule);
pFunc = PyDict_GetItemString(pDict, functionUse);
// POSIX code
pthread_create( &mythread, NULL, ThreadProc, NULL);
// Random testing code
for(int i = 0; i < 10; i++)
{
printf("Printed from the main thread.\n");
sleep(1);
}
printf("Main Thread waiting for My Thread to complete...\n");
// Join and wait for the created thread to complete...
// POSIX code
pthread_join(mythread, NULL);
printf("Main thread finished gracefully.\n");
return 0;
}
/*
 * Thread entry point: calls the Python function stored in the global pFunc (when
 * it is callable), then releases the module and name references, finalizes the
 * interpreter and returns.
 *
 * Returns NULL. The function is declared to return void*, so it must return a
 * value; previously control fell off the end of the function.
 *
 * NOTE(review): the object returned by PyObject_CallObject is discarded without
 * being released or checked; the interpreter is finalized right afterwards.
 */
void *ThreadProc()
{
    if (PyCallable_Check(pFunc))
    {
        PyObject_CallObject(pFunc, NULL);
    }
    else {
        PyErr_Print();
    }
    // Clean up
    Py_DECREF(pModule);
    Py_DECREF(pName);
    Py_Finalize();
    printf("My thread is finishing...\n");
    return NULL;
}
Python:
import numpy as np
import picamera
import picamera.array
class DetectMotion(picamera.array.PiMotionAnalysis):
    """Motion analyser that prints the per-vector magnitudes of each frame."""

    def analyse(self, a):
        # Magnitude of every motion vector, clipped to 0..255 and stored as uint8.
        x_squared = np.square(a['x'].astype(np.float))
        y_squared = np.square(a['y'].astype(np.float))
        a = np.sqrt(x_squared + y_squared).clip(0, 255).astype(np.uint8)
        # If there're more than 10 vectors with a magnitude greater
        # than 60, then say we've detected motion
        print(a)
        strong_vectors = (a > 60).sum()
        if strong_vectors > 10:
            print('Motion detected!')
def get_values():
    """Record ten seconds of 640x480 H.264 video to /dev/null with motion analysis.

    The motion data of the recording is routed to a DetectMotion instance.
    """
    with picamera.PiCamera() as camera, DetectMotion(camera) as output:
        camera.resolution = (640, 480)
        camera.start_preview()
        camera.start_recording(
            '/dev/null',
            format='h264',
            motion_output=output,
        )
        camera.wait_recording(10)
        camera.stop_recording()
        camera.stop_preview()