Related
I am a beginner in C++ and python. I have installed boost-1.78.0, OPENCV-4.5.5 and added a new Dynamic-Link Library (DLL) project in Visual Studio 2019, and then successfully build the sln and generated testSqaure.dll.
My code is as follows:
#define BOOST_PYTHON_STATIC_LIB
#include "pch.h"
#include<boost/python.hpp>
#include<boost/python/numpy.hpp>
#include<opencv2/highgui/highgui.hpp>
#include<opencv2/imgproc.hpp>
#include<cmath>
#include<algorithm>
using boost::python::list;
using boost::python::numpy::ndarray;
using boost::python::object;
using boost::python::extract;
using std::vector;
using std::array;
using std::sqrt;
using std::min;
using cv::Point2f;
using cv::Point;
using cv::pointPolygonTest;
vector<double> ndarray2vec(const ndarray& arr)
{
int input_size = static_cast<int>(arr.shape(0));
double* input_ptr = reinterpret_cast<double*>(arr.get_data());
vector<double> v_arr(input_size);
for (int i = 0; i < input_size; ++i)
{
v_arr[i] = *(input_ptr + i);
}
return v_arr;
}
bool two_line_segment_test(double x1, double y1, double x2, double y2, double x3, double y3, double x4, double y4)
{
if (((x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4)) == 0) return false;
const double px = ((x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4)) / ((x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4));
const double py = ((x1 * y2 - y1 * x2) * (y3 - y4) - (y1 - y2) * (x3 * y4 - y3 * x4)) / ((x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4));
if ((((x1 <= px) && (px <= x2)) || ((x2 <= px) && (px <= x1))) && (((x3 <= px) && (px <= x4)) || ((x4 <= px) && (px <= x3))) && (((y1 <= py) && (py <= y2)) || ((y2 <= py) && (py <= y1))) && (((y3 <= py) && (py <= y4)) || ((y4 <= py) && (py <= y3))))
{
return true;
}
else return false;
}
object two_triangle_test(ndarray& tri1_side1, ndarray& tri1_side2, ndarray& tri1_side3, ndarray& tri2_side1, ndarray& tri2_side2, ndarray& tri2_side3)
{
vector<double> v_tri1_side1 = ndarray2vec(tri1_side1),
v_tri1_side2 = ndarray2vec(tri1_side2),
v_tri1_side3 = ndarray2vec(tri1_side3),
v_tri2_side1 = ndarray2vec(tri2_side1),
v_tri2_side2 = ndarray2vec(tri2_side2),
v_tri2_side3 = ndarray2vec(tri2_side3);
array<vector<double>, 3> v_tri1_sides = { v_tri1_side1, v_tri1_side2, v_tri1_side3 };
array<vector<double>, 3> v_tri2_sides = { v_tri2_side1, v_tri2_side2, v_tri2_side3 };
for (int i = 0; i < 3; ++i)
{
for (int j = 0; j < 3; ++j)
{
if (two_line_segment_test(v_tri1_sides[i][0], v_tri1_sides[i][1], v_tri1_sides[i][2], v_tri1_sides[i][3], v_tri2_sides[j][0], v_tri2_sides[j][1], v_tri2_sides[j][2], v_tri2_sides[j][3]))
{
return object(true);
}
}
}
vector<Point2f> tri1(3);
tri1[0] = Point(v_tri1_side1[0], v_tri1_side1[1]);
tri1[1] = Point(v_tri1_side1[2], v_tri1_side1[3]);
tri1[2] = Point(v_tri1_side2[2], v_tri1_side2[3]);
for (auto& side : v_tri2_sides)
{
const int retval = static_cast<int>(pointPolygonTest(tri1, Point(side[0], side[1]), false));
if (retval == 1 || retval == 0) return object(true);
}
vector<Point2f> tri2(3);
tri2[0] = Point(v_tri2_side1[0], v_tri2_side1[1]);
tri2[1] = Point(v_tri2_side1[2], v_tri2_side1[3]);
tri2[2] = Point(v_tri2_side2[2], v_tri2_side2[3]);
for (auto& side : v_tri1_sides)
{
const int retval = static_cast<int>(pointPolygonTest(tri2, Point(side[0], side[1]), false));
if (retval == 1 || retval == 0) return object(true);
}
return object(false);
}
array<double, 2> get_foot_point(const vector<double>& v_point, const vector<double>& v_line_p1, const vector<double>& v_line_p2)
{
const double k = -((v_line_p1[0] - v_point[0]) * (v_line_p2[0] - v_line_p1[0]) + (v_line_p1[1] - v_point[0]) * (v_line_p2[1] - v_line_p1[1]) + 0) / ((v_line_p2[0] - v_line_p1[0]) * (v_line_p2[0] - v_line_p1[0]) + (v_line_p2[1] - v_line_p1[1]) * (v_line_p2[1] - v_line_p1[1]) + 0) * 1.0;
const double xn = k * (v_line_p2[0] - v_line_p1[0]) + v_line_p1[0];
const double yn = k * (v_line_p2[1] - v_line_p1[1]) + v_line_p1[1];
return array<double, 2>{xn, yn};
}
double get_dis_point2line(const vector<double>& v_point, const vector<double>& v_line_p1, const vector<double>& v_line_p2)
{
array<double, 2> footP = get_foot_point(v_point, v_line_p1, v_line_p2);
double dist = 0;
if (((footP[0] - v_line_p1[0]) > 0) ^ ((footP[0] - v_line_p2[0]) > 0))
{
dist = sqrt((footP[0] - v_point[0]) * (footP[0] - v_point[0]) + (footP[1] - v_point[1]) * (footP[1] - v_point[1]));
}
else
{
dist = min(sqrt((v_line_p1[0] - v_point[0]) * (v_line_p1[0] - v_point[0]) + (v_line_p1[1] - v_point[1]) * (v_line_p1[1] - v_point[1])),
sqrt((v_line_p2[0] - v_point[0]) * (v_line_p2[0] - v_point[0]) + (v_line_p2[1] - v_point[1]) * (v_line_p2[1] - v_point[1])));
}
return dist;
}
object circle_triangle_test(const ndarray& center, const object& radius, const ndarray& tri_side1, const ndarray& tri_side2, const ndarray& tri_side3)
{
const double val_radius = extract<double>(radius);
vector<double> v_tri_side1 = ndarray2vec(tri_side1),
v_tri_side2 = ndarray2vec(tri_side2),
v_tri_side3 = ndarray2vec(tri_side3),
v_center = ndarray2vec(center);
array<vector<double>, 3> v_tri_sides = { v_tri_side1, v_tri_side2, v_tri_side3 };
for (int i = 0; i < 3; ++i)
{
if (sqrt((v_tri_sides[i][0] - v_center[0]) * (v_tri_sides[i][0] - v_center[0]) + (v_tri_sides[i][1] - v_center[1]) * (v_tri_sides[i][1] - v_center[1])) <= val_radius)
{
return object(true);
}
}
for (int i = 0; i < 3; ++i)
{
if (get_dis_point2line(v_center, { v_tri_sides[i][0], v_tri_sides[i][1] }, { v_tri_sides[i][2], v_tri_sides[i][3] }) <= val_radius)
{
return object(true);
}
}
return object(false);
}
BOOST_PYTHON_MODULE(testSquare)
{
using namespace boost::python;
def("two_triangle_test", two_triangle_test);
def("circle_triangle_test", circle_triangle_test);
}
This file mainly writes two functions, which are used to detect whether two triangles overlap, and whether a circle overlaps another triangle.
But when I rename testSquare.dll to testSquare.pyd and import it in a .py file, I found that there is a error: "ImportError: DLL load failed: The specified module could not be found."
The DLL file generated by boost example.cpp like this can be imported and used correctly:
#include "pch.h"
#define BOOST_PYTHON_STATIC_LIB
#include<boost/python.hpp>
using boost::python::list;
list Square(list& data)
{
list ret;
for (int i = 0; i < len(data); ++i)
{
ret.append(data[i] * data[i]);
}
return ret;
}
BOOST_PYTHON_MODULE(testSquare)
{
using namespace boost::python;
def("Square", Square);
}
I have solved this problem by moving boost_python37-vc142-mt-gd-x64-1_78.dll and boost_numpy37-vc142-mt-gd-x64-1_78.dll to the project folder, or by putting the paths like D:\Data\dev\boost_1_78_0\stage\lib where they are in PATH.
The routine for configuring boost doesn't tell me to put this path into PATH.
I am trying to convert this C++ function into Python code but I am having trouble in doing so for the pointer and turning the python variables into uint32_t and uint8_t respectively. I am not sure how to declare the functions such that it returns uint32_t and also what to do with the pointer uint8_t *buf. Please help me in figuring out on how to convert the function, from C++ to python.
This is my C++ code:
uint32_t Functions::Do_calc(uint8_t *buf, uint32_t len){
return Do_calc(25, buf, len);
}
uint32_t Functions::Do_calc(uint32_t val, uint8_t *buf, uint32_t len){
uint32_t temp_int, c = val;
uint32_t ip_buf[128];
uint32_t j, rem = 0, tf = 0, p = 0;
rem = len;
while(rem > 0){
if(rem <= 512){
tf = rem;
}
else{
tf = 512;
}
for(j = 0; j < 128; j++){
ip_buf[j]=0;
}
for(j = 0; j < tf; j += 2){
temp = ((buf[p * 512 + (j + 3)]<<24) +
(buf[p * 512 + (j + 2)]<<16) +
(buf[p * 512 + (j + 1)]<<8) +
buf[p * 512 + j]);
ip_buf[j / 4] = temp;
}
c = c_cal(ip_buf, tf, c, 0x04C22AB9, 2, true);
p++
}
return c;
}
uint32_t Functions::c_cal(uint32_t *d_base, uint32_t d_size, uint32_t c, uint32_t poly, uint8_t c_size, bool b_r_ip)
{
unsigned long d_offset;
unsigned long d, d_temp;
unsigned char c_bit;
d = 0;
for(d_offset = 0; d_offset < d_size; (d_offset += c_size))
{
u32_d_temp = 0;
d_temp = d_base[d_offset/c_size];
if(FALSE == b_r_ip)
{
d = d_temp;
}
else
{
d = 0;
for(c_bit = 0; c_bit < (c_size << 3); c_bit++)
{
d <<= 1;
d |= (d_temp & 1);
d_temp >>= 1;
}
}
for(c_bit = 0; c_bit < (c_size << 3); c_bit++)
{
if(((c >> ((c_size << 3) - 1)) ^ d) & 1)
{
c <<= 1;
d >>= 1;
c ^= poly;
}
else
{
c <<= 1;
d >>= 1;
}
}
}
return (c & (0xFFFFFFFF >> (32 - (c_size << 3))));
}
This is my attempted Python implementation. As you can see I just did a basic implementation as I did not worry about the pointer and the size of the integer which is very much needed:
def Do_calc(buf, len):
return Do_calc(0, buf, len)
def Do_calc(val, buf, len):
ip_buf = []
c = val
p = 0
rem = len
while rem > 0:
if rem <= 512:
tf = rem
else:
tf = 512
for j in range(128):
ip_buf[j].append = 0
for j in xrange(0, tf, 2):
temp_int = ((buffer[packet * 512 + (i + 3)] << 24) +
(buffer[packet * 512 + (i + 2)] << 16) +
(buffer[packet * 512 + (i + 1)] << 8) +
buffer[packet * 512 + i])
ip_buf[j/4] = temp
c = c_cal(ip_buf, tf, c, 0x04C22AB9, 2, true)
p += 1
return c
How do I properly do the conversion after taking care of all the aspects?
I have a Python code (for implementing RayTracing) that I'm running in parallel with PyCuda.
import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
from stl import mesh
import time
my_mesh = mesh.Mesh.from_file('test_solid_py.stl')
n = my_mesh.normals
v0 = my_mesh.v0
v1 = my_mesh.v1
v2 = my_mesh.v2
v0_x = v0[:,0]
v0_x = np.ascontiguousarray(v0_x)
v0_y = v0[:,1]
v0_y = np.ascontiguousarray(v0_y)
v0_z = v0[:,2]
v0_z = np.ascontiguousarray(v0_z)
v1_x = v1[:,0]
v1_x = np.ascontiguousarray(v1_x)
v1_y = v1[:,1]
v1_y = np.ascontiguousarray(v1_y)
v1_z = v1[:,2]
v1_z = np.ascontiguousarray(v1_z)
v2_x = v2[:,0]
v2_x = np.ascontiguousarray(v2_x)
v2_y = v2[:,1]
v2_y = np.ascontiguousarray(v2_y)
v2_z = v2[:,2]
v2_z = np.ascontiguousarray(v2_z)
mod = SourceModule("""
#include <math.h>
__global__ void intersect(float *origin,float *dir_x,float *dir_y,float *dir_z,float *v0_x,float *v0_y,float *v0_z,float *v1_x,float *v1_y,float *v1_z,float *v2_x,float *v2_y,float *v2_z,float *int_point_real_x, float *int_point_real_y,float *int_point_real_z)
{
using namespace std;
unsigned int idx = blockDim.x*blockIdx.x + threadIdx.x;
int count = 0;
float v0_current[3];
float v1_current[3];
float v2_current[3];
float dir_current[3] = {dir_x[idx],dir_y[idx],dir_z[idx]};
float int_point[3];
float int_pointS[2][3];
int int_faces[2];
float dist[2];
dist[0] = -999;
int n_tri = 105500;
for(int i = 0; i<n_tri; i++) {
v0_current[0] = v0_x[i];
v0_current[1] = v0_y[i];
v0_current[2] = v0_z[i];
v1_current[0] = v1_x[i];
v1_current[1] = v1_y[i];
v1_current[2] = v1_z[i];
v2_current[0] = v2_x[i];
v2_current[1] = v2_y[i];
v2_current[2] = v2_z[i];
double eps = 0.0000001;
float E1[3];
float E2[3];
float s[3];
for (int j = 0; j < 3; j++) {
E1[j] = v1_current[j] - v0_current[j];
E2[j] = v2_current[j] - v0_current[j];
s[j] = origin[j] - v0_current[j];
}
float h[3];
h[0] = dir_current[1] * E2[2] - dir_current[2] * E2[1];
h[1] = -(dir_current[0] * E2[2] - dir_current[2] * E2[0]);
h[2] = dir_current[0] * E2[1] - dir_current[1] * E2[0];
float a;
a = E1[0] * h[0] + E1[1] * h[1] + E1[2] * h[2];
if (a > -eps && a < eps) {
int_point[0] = false;
}
else {
double f = 1 / a;
float u;
u = f * (s[0] * h[0] + s[1] * h[1] + s[2] * h[2]);
if (u < 0 || u > 1) {
int_point[0] = false;
}
else {
float q[3];
q[0] = s[1] * E1[2] - s[2] * E1[1];
q[1] = -(s[0] * E1[2] - s[2] * E1[0]);
q[2] = s[0] * E1[1] - s[1] * E1[0];
float v;
v = f * (dir_current[0] * q[0] + dir_current[1] * q[1] + dir_current[2] * q[2]);
if (v < 0 || (u + v)>1) {
int_point[0] = false;
}
else {
float t;
t = f * (E2[0] * q[0] + E2[1] * q[1] + E2[2] * q[2]);
if (t > eps) {
for (int j = 0; j < 3; j++) {
int_point[j] = origin[j] + dir_current[j] * t;
}
//return t;
}
}
}
}
if (int_point[0] != false) {
count = count+1;
int_faces[count-1] = i;
dist[count-1] = sqrt(pow((origin[0] - int_point[0]), 2) + pow((origin[1] - int_point[1]), 2) + pow((origin[2] - int_point[2]), 2));
for (int j = 0; j<3; j++) {
int_pointS[count-1][j] = int_point[j];
}
}
}
double min = dist[0];
int ind_min = 0;
for (int i = 0; i < 2; i++){
if (min > dist[i]) {
min = dist[i];
ind_min = i;
}
}
if (dist[0] < -998){
int_point_real_x[idx] = -999;
int_point_real_y[idx] = -999;
int_point_real_z[idx] = -999;
}
else{
int_point_real_x[idx] = int_pointS[ind_min][0];
int_point_real_y[idx] = int_pointS[ind_min][1];
int_point_real_z[idx] = int_pointS[ind_min][2];
}
}
""")
n_rays = 20000
num_threads = 1024
num_blocks = int(n_rays/num_threads)
origin = np.asarray([-2, -2, -2]).astype(np.float32)
origin = np.ascontiguousarray(origin)
rand_x = np.random.randn(n_rays)
rand_y = np.random.randn(n_rays)
rand_z = np.random.randn(n_rays)
direction_x = np.ones((n_rays, 1)) * 3
direction_x = direction_x.astype(np.float32)
direction_x = np.ascontiguousarray(direction_x)
direction_y = np.ones((n_rays, 1)) * 4
direction_y = direction_y.astype(np.float32)
direction_y = np.ascontiguousarray(direction_y)
direction_z = np.ones((n_rays, 1)) * 5
direction_z = direction_z.astype(np.float32)
direction_z = np.ascontiguousarray(direction_z)
int_point_real_x = np.zeros((n_rays, 1)).astype(np.float32)
int_point_real_x = np.ascontiguousarray(int_point_real_x)
int_point_real_y = np.zeros((n_rays, 1)).astype(np.float32)
int_point_real_y = np.ascontiguousarray(int_point_real_y)
int_point_real_z = np.zeros((n_rays, 1)).astype(np.float32)
int_point_real_z = np.ascontiguousarray(int_point_real_z)
intersect = mod.get_function("intersect")
start = time.time()
intersect(drv.In(origin), drv.In(direction_x),drv.In(direction_y),drv.In(direction_z),drv.In(v0_x),drv.In(v0_y),drv.In(v0_z), drv.In(v1_x),drv.In(v1_y),drv.In(v1_z), drv.In(v2_x), drv.In(v2_y), drv.In(v2_z), drv.Out(int_point_real_x),drv.Out(int_point_real_y),drv.Out(int_point_real_z), block=(num_threads, 1, 1), grid=((num_blocks+0), 1, 1))
finish = time.time()
print(finish-start)
I give as input some arrays whose size is 20k (dir_x, dir_y, dir_z) and I have as output 3 arrays (int_point_real_x,int_point_real_y,int_point_real_z) that have the same size as the above mentioned arrays (20k).
If n_rays is a multiple of num_threads, e.g. n_rays=19456 and num_threads=1024, then int_point_real_x_y_z are correctly filled by the kernel.
Otherwise, if n_rays is NOT a multiple of num_threads, e.g. n_rays=20000 (what I really need) and num_threads=1024, then int_point_real_x_y_z are filled by the kernel up to position 19455 and the 544 spots left in the array are not filled.
Does anyone know if this is a rule of CUDA?
If it's not, how could I modify my code in order to use an arbitrary size of input array (and not only multiple of num_threads)?
Thanks
your int(n_rays/num_threads) is rounding down
to fix this, you need to round up and then put a condition into the kernel to enforce that idx is valid and "do nothing" if it's not. this will cause some cores to waste time, but your code looks pretty suboptimal anyway so it probably won't matter much
I am writing some code that takes binary data from Python, Pipes it to C++, does some processing on the data, (in this case calculating a mutual information metric) and then pipes the results back to python. While testing I have found that everything works fine if the data I send is a set of 2 arrays with dimensions less than 1500 X 1500, but if I send 2 arrays that are 2K X 2K I get back a lot of corrupted nonsense.
I currently believe the algorithmic portion of the code is fine because it provides the expected answers during testing with small (<=1500 X1500) arrays. That leads me to believe that this is an issue with either the stdin or stdout piping. That maybe I’m passing some intrinsic limit somewhere.
The Python Code and C++ code are below.
Python Code:
import subprocess
import struct
import sys
import numpy as np
#set up the variables needed
bytesPerDouble = 8
sizeX = 2000
sizeY = 2000
offset = sizeX*sizeY
totalBytesPerArray = sizeX*sizeY*bytesPerDouble
totalBytes = totalBytesPerArray*2 #the 2 is because we pass 2 different versions of the 2D array
#setup the testing data array
a = np.zeros(sizeX*sizeY*2, dtype='d')
for i in range(sizeX):
for j in range(sizeY):
a[j+i*sizeY] = i
a[j+i*sizeY+offset] = i
if i % 10 == 0:
a[j+i*sizeY+offset] = j
data = a.tobytes('C')
strTotalBytes = str(totalBytes)
strLineBytes = str(sizeY*bytesPerDouble)
#communicate with c++ code
print("starting C++ code")
command = "C:\Python27\PythonPipes.exe"
proc = subprocess.Popen([command, strTotalBytes, strLineBytes, str(sizeY), str(sizeX)], stdin=subprocess.PIPE,stderr=subprocess.PIPE,stdout=subprocess.PIPE)
ByteBuffer = (data)
proc.stdin.write(ByteBuffer)
print("Reading results back from C++")
for i in range(sizeX):
returnvalues = proc.stdout.read(sizeY*bytesPerDouble)
a = buffer(returnvalues)
b = struct.unpack_from(str(sizeY)+'d', a)
print str(b) + " " + str(i)
print('done')
C++ Code:
Main function:
int main(int argc, char **argv) {
int count = 0;
long totalbytes = stoi(argv[argc-4], nullptr,10); //bytes being transfered
long bytechunk = stoi(argv[argc - 3], nullptr, 10); //bytes being transfered at a time
long height = stoi(argv[argc-2], nullptr, 10); //bytes being transfered at a time
long width = stoi(argv[argc-1], nullptr, 10); //bytes being transfered at a time
long offset = totalbytes / sizeof(double) / 2;
data = new double[totalbytes/sizeof(double)];
int columnindex = 0;
//read in data from pipe
while (count<totalbytes) {
fread(&(data[columnindex]), 1, bytechunk, stdin);
columnindex += bytechunk / sizeof(double);
count += bytechunk;
}
//calculate the data transform
MutualInformation MI = MutualInformation();
MI.Initialize(data, height, width, offset);
MI.calcMI();
count = 0;
//*
//write out data to pipe
columnindex = 0;
while (count<totalbytes/2) {
fwrite(&(MI.getOutput()[columnindex]), 1, bytechunk, stdout);
fflush(stdout);
count += bytechunk;
columnindex += bytechunk/sizeof(double);
}
//*/
delete [] data;
return 0;
}
and in case you need it the actual processing code:
double MutualInformation::calcMI(){
double rvalue = 0.0;
std::map<int, map<int, double>> lHistXY = map<int, map<int, double>>();
std::map<int, double> lHistX = map<int, double>();
std::map<int, double> lHistY = map<int, double>();
typedef std::map<int, std::map<int, double>>::iterator HistXY_iter;
typedef std::map<int, double>::iterator HistY_iter;
//calculate Entropys and MI
double MI = 0.0;
double Hx = 0.0;
double Hy = 0.0;
double Px = 0.0;
double Py = 0.0;
double Pxy = 0.0;
//scan through the image
int ip = 0;
int jp = 0;
int chipsize = 3;
//setup zero array
double * zeros = new double[this->mHeight];
for (int j = 0; j < this->mHeight; j++){
zeros[j] = 0.0;
}
//zero out Output array
for (int i = 0; i < this->mWidth; i++){
memcpy(&(this->mOutput[i*this->mHeight]), zeros, this->mHeight*8);
}
double index = 0.0;
for (int ioutter = chipsize; ioutter < (this->mWidth - chipsize); ioutter++){
//write out processing status
//index = (double)ioutter;
//fwrite(&index, 8, 1, stdout);
//fflush(stdout);
//*
for (int j = chipsize; j < (this->mHeight - chipsize); j++){
//clear the histograms
lHistX.clear();
lHistY.clear();
lHistXY.clear();
//chip out a section of the image
for (int k = -chipsize; k <= chipsize; k++){
for (int l = -chipsize; l <= chipsize; l++){
ip = ioutter + k;
jp = j + l;
//update X histogram
if (lHistX.count(int(this->mData[ip*this->mHeight + jp]))){
lHistX[int(this->mData[ip*this->mHeight + jp])] += 1.0;
}else{
lHistX[int(this->mData[ip*this->mHeight + jp])] = 1.0;
}
//update Y histogram
if (lHistY.count(int(this->mData[ip*this->mHeight + jp+this->mOffset]))){
lHistY[int(this->mData[ip*this->mHeight + jp+this->mOffset])] += 1.0;
}
else{
lHistY[int(this->mData[ip*this->mHeight + jp+this->mOffset])] = 1.0;
}
//update X and Y Histogram
if (lHistXY.count(int(this->mData[ip*this->mHeight + jp]))){
//X Key exists check if Y key exists
if (lHistXY[int(this->mData[ip*this->mHeight + jp])].count(int(this->mData[ip*this->mHeight + jp + this->mOffset]))){
//X & Y keys exist
lHistXY[int(this->mData[ip*this->mHeight + jp])][int(this->mData[ip*this->mHeight + jp + this->mOffset])] += 1;
}else{
//X exist but Y doesn't
lHistXY[int(this->mData[ip*this->mHeight + jp])][int(this->mData[ip*this->mHeight + jp + this->mOffset])] = 1;
}
}else{
//X Key Didn't exist
lHistXY[int(this->mData[ip*this->mHeight + jp])][int(this->mData[ip*this->mHeight + jp + this->mOffset])] = 1;
};
}
}
//calculate PMI, Hx, Hy
// iterator->first = key
// iterator->second = value
MI = 0.0;
Hx = 0.0;
Hy = 0.0;
for (HistXY_iter Hist2D_iter = lHistXY.begin(); Hist2D_iter != lHistXY.end(); Hist2D_iter++) {
Px = lHistX[Hist2D_iter->first] / ((double) this->mOffset);
Hx -= Px*log(Px);
for (HistY_iter HistY_iter = Hist2D_iter->second.begin(); HistY_iter != Hist2D_iter->second.end(); HistY_iter++) {
Py = lHistY[HistY_iter->first] / ((double) this->mOffset);
Hy -= Py*log(Py);
Pxy = HistY_iter->second / ((double) this->mOffset);
MI += Pxy*log(Pxy / Py / Px);
}
}
//normalize PMI to max(Hx,Hy) so that the PMI value runs from 0 to 1
if (Hx >= Hy && Hx > 0.0){
MI /= Hx;
}else if(Hy > Hx && Hy > 0.0){
MI /= Hy;
}
else{
MI = 0.0;
}
//write PMI to data output array
if (MI < 1.1){
this->mOutput[ioutter*this->mHeight + j] = MI;
}
else{
this->mOutput[ioutter*this->mHeight + j] = 0.0;
}
}
}
return rvalue;
}
with arrays that return something that makes sense I get output bounded between 0 and 1 like this:
(0.0, 0.0, 0.0, 0.7160627908692593, 0.6376472316395495, 0.5728801401524277,...
with the 2Kx2K or higher arrays I get nonesense like this (even though the code clamps the values between 0 and 1):
(-2.2491400820412374e+228, -2.2491400820412374e+228, -2.2491400820412374e+228, -2.2491400820412374e+228, -2.2491400820412374e+228,...
I would like to know why this code is corrupting the data set after it is assigned between 0.0 and 1, and whether or not it is a piping issue, a stdin/stdout issue, a buffer issue of some sort, or a coding issue I am simply not seeing.
Update I tried passing the data in smaller chunks using the code that Chris suggested with no luck. also of note is that I added a catch for ferror on stdout and it never got tripped so I am pretty sure that the bytes are at least making it to stdout. Is it possible that something else is writing to stdout somehow? maybe an extra byte making its way into stdout while my program is running? I find this doubtful as the errors are appearing consistently on the 4th fwrite read in the 10th entry.
Per Craig's request here is the full C++ code (the full Python Code is already posted): it is sitting in 3 files:
main.cpp
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include "./MutualInformation.h"
double * data;
using namespace std;
void
xxwrite(unsigned char *buf, size_t wlen, FILE *fo)
{
size_t xlen;
for (; wlen > 0; wlen -= xlen, buf += xlen) {
xlen = wlen;
if (xlen > 1024)
xlen = 1024;
xlen = fwrite(buf, 1, xlen, fo);
fflush(fo);
}
}
int main(int argc, char **argv) {
int count = 0;
long totalbytes = stoi(argv[argc-4], nullptr,10); //bytes being transfered
long bytechunk = stoi(argv[argc - 3], nullptr, 10); //bytes being transfered at a time
long height = stoi(argv[argc-2], nullptr, 10); //bytes being transfered at a time
long width = stoi(argv[argc-1], nullptr, 10); //bytes being transfered at a time
long offset = totalbytes / sizeof(double) / 2;
data = new double[totalbytes/sizeof(double)];
int columnindex = 0;
//read in data from pipe
while (count<totalbytes) {
fread(&(data[columnindex]), 1, bytechunk, stdin);
columnindex += bytechunk / sizeof(double);
count += bytechunk;
}
//calculate the data transform
MutualInformation MI = MutualInformation();
MI.Initialize(data, height, width, offset);
MI.calcMI();
count = 0;
columnindex = 0;
while (count<totalbytes/2) {
xxwrite((unsigned char*)&(MI.getOutput()[columnindex]), bytechunk, stdout);
count += bytechunk;
columnindex += bytechunk/sizeof(double);
}
delete [] data;
return 0;
}
MutualInformation.h
#include <map>
using namespace std;
class MutualInformation
{
private:
double * mData;
double * mOutput;
long mHeight;
long mWidth;
long mOffset;
public:
MutualInformation();
~MutualInformation();
bool Initialize(double * data, long Height, long Width, long Offset);
const double * getOutput();
double calcMI();
};
MutualInformation.cpp
#include "MutualInformation.h"
MutualInformation::MutualInformation()
{
this->mData = nullptr;
this->mOutput = nullptr;
this->mHeight = 0;
this->mWidth = 0;
}
MutualInformation::~MutualInformation()
{
delete[] this->mOutput;
}
bool MutualInformation::Initialize(double * data, long Height, long Width, long Offset){
bool rvalue = false;
this->mData = data;
this->mHeight = Height;
this->mWidth = Width;
this->mOffset = Offset;
//allocate output data
this->mOutput = new double[this->mHeight*this->mWidth];
return rvalue;
}
const double * MutualInformation::getOutput(){
return this->mOutput;
}
double MutualInformation::calcMI(){
double rvalue = 0.0;
std::map<int, map<int, double>> lHistXY = map<int, map<int, double>>();
std::map<int, double> lHistX = map<int, double>();
std::map<int, double> lHistY = map<int, double>();
typedef std::map<int, std::map<int, double>>::iterator HistXY_iter;
typedef std::map<int, double>::iterator HistY_iter;
//calculate Entropys and MI
double MI = 0.0;
double Hx = 0.0;
double Hy = 0.0;
double Px = 0.0;
double Py = 0.0;
double Pxy = 0.0;
//scan through the image
int ip = 0;
int jp = 0;
int chipsize = 3;
//setup zero array
double * zeros = new double[this->mHeight];
for (int j = 0; j < this->mHeight; j++){
zeros[j] = 0.0;
}
//zero out Output array
for (int i = 0; i < this->mWidth; i++){
memcpy(&(this->mOutput[i*this->mHeight]), zeros, this->mHeight*8);
}
double index = 0.0;
for (int ioutter = chipsize; ioutter < (this->mWidth - chipsize); ioutter++){
for (int j = chipsize; j < (this->mHeight - chipsize); j++){
//clear the histograms
lHistX.clear();
lHistY.clear();
lHistXY.clear();
//chip out a section of the image
for (int k = -chipsize; k <= chipsize; k++){
for (int l = -chipsize; l <= chipsize; l++){
ip = ioutter + k;
jp = j + l;
//update X histogram
if (lHistX.count(int(this->mData[ip*this->mHeight + jp]))){
lHistX[int(this->mData[ip*this->mHeight + jp])] += 1.0;
}else{
lHistX[int(this->mData[ip*this->mHeight + jp])] = 1.0;
}
//update Y histogram
if (lHistY.count(int(this->mData[ip*this->mHeight + jp+this->mOffset]))){
lHistY[int(this->mData[ip*this->mHeight + jp+this->mOffset])] += 1.0;
}
else{
lHistY[int(this->mData[ip*this->mHeight + jp+this->mOffset])] = 1.0;
}
//update X and Y Histogram
if (lHistXY.count(int(this->mData[ip*this->mHeight + jp]))){
//X Key exists check if Y key exists
if (lHistXY[int(this->mData[ip*this->mHeight + jp])].count(int(this->mData[ip*this->mHeight + jp + this->mOffset]))){
//X & Y keys exist
lHistXY[int(this->mData[ip*this->mHeight + jp])][int(this->mData[ip*this->mHeight + jp + this->mOffset])] += 1;
}else{
//X exist but Y doesn't
lHistXY[int(this->mData[ip*this->mHeight + jp])][int(this->mData[ip*this->mHeight + jp + this->mOffset])] = 1;
}
}else{
//X Key Didn't exist
lHistXY[int(this->mData[ip*this->mHeight + jp])][int(this->mData[ip*this->mHeight + jp + this->mOffset])] = 1;
};
}
}
//calculate PMI, Hx, Hy
// iterator->first = key
// iterator->second = value
MI = 0.0;
Hx = 0.0;
Hy = 0.0;
for (HistXY_iter Hist2D_iter = lHistXY.begin(); Hist2D_iter != lHistXY.end(); Hist2D_iter++) {
Px = lHistX[Hist2D_iter->first] / ((double) this->mOffset);
Hx -= Px*log(Px);
for (HistY_iter HistY_iter = Hist2D_iter->second.begin(); HistY_iter != Hist2D_iter->second.end(); HistY_iter++) {
Py = lHistY[HistY_iter->first] / ((double) this->mOffset);
Hy -= Py*log(Py);
Pxy = HistY_iter->second / ((double) this->mOffset);
MI += Pxy*log(Pxy / Py / Px);
}
}
//normalize PMI to max(Hx,Hy) so that the PMI value runs from 0 to 1
if (Hx >= Hy && Hx > 0.0){
MI /= Hx;
}else if(Hy > Hx && Hy > 0.0){
MI /= Hy;
}
else{
MI = 0.0;
}
//write PMI to data output array
if (MI < 1.1){
this->mOutput[ioutter*this->mHeight + j] = MI;
}
else{
this->mOutput[ioutter*this->mHeight + j] = 0.0;
//cout << "problem with output";
}
}
}
//*/
return rvalue;
}
SOLVED By 6502
6502's answer below solved my problem. I needed to explicitly tell Windows to use a binary mode for stdin / stdout. to do that I had to include 2 new header files in my main cpp file.
#include <fcntl.h>
#include <io.h>
add the following lines of code (modified away from 6502's POSIX versions because Visual Studio complained) to the beginning of my main function
_setmode(_fileno(stdout), O_BINARY);
_setmode(_fileno(stdin), O_BINARY);
and then add these lines to my Python code:
import os, msvcrt
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
The problem is that stdin/stdout in windows are opened in text mode, not in binary mode and therefore will mess up when the character 13 (\r) is sent.
You can set for example binary mode in Python with
import os, msvcrt
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
and in C++ with
_setmode(fileno(stdout), O_BINARY);
_setmode(fileno(stdin), O_BINARY);
See https://msdn.microsoft.com/en-us/library/tw4k6df8.aspx
Your C++ fwrite code does not account for getting a "short" transfer.
Here's a slight tweak:
//write out data to pipe
columnindex = 0;
while (count < totalbytes / 2) {
wlen = fwrite(&(MI.getOutput()[columnindex]), 1, bytechunk, stdout);
fflush(stdout);
count += wlen;
columnindex += wlen / sizeof(double);
}
Note: You still need to be careful as this would still have issues if wlen returns and it's not a multiple of sizeof(double). For example, if bytechunk were 16 and wlen came back with 14, you'd need an additional fwrite with length 2 before continuing the loop. A generalization of this is just to treat the entire data matrix as a giant byte buffer and loop on that.
Actually, you'll get about the same efficiency with many much smaller transfers that are capped by a fixed (i.e. "known safe amount") of [say] 1024 bytes. This works because the output is a byte stream.
Here's a slightly more general solution that I've often used:
void
xxwrite(void *buf,size_t wlen,FILE *fo)
{
size_t xlen;
for (; wlen > 0; wlen -= xlen, buf += xlen) {
xlen = wlen;
if (xlen > 1024)
xlen = 1024;
xlen = fwrite(buf,1,xlen,fo);
fflush(fo);
}
}
//write out data to pipe
columnindex = 0;
while (count < totalbytes / 2) {
xxwrite(&(MI.getOutput()[columnindex]), bytechunk, stdout);
count += bytechunk;
columnindex += bytechunk / sizeof(double);
}
UPDATE:
I've downloaded all your code and run it. I've got good news and bad news: The code runs fine here, even for a matrix size above 3000. I ran it both using xxwrite and without and the results were the same.
Using my limited python skills, I added some pretty print to your python script (e.g. some line wrap) and had it check every value for range and annotate any bad values. There were none found by the script. Also, visual inspection of the values turned up nothing [this was true before the pretty print, so it hasn't introduced anything]. Just lots of zeros and then blocks in the 0.9 range.
The only difference I can see is that I'm using gcc [and, of course, python] on linux. But, from your script it seems your using Windows [based on the C:\... path for your C++ executable. This shouldn't matter for this application, but I mention it anyway.
So, pipes work here. One thing you might try is to direct the C++ output to a file. Then, have the script read back from the file (i.e. no pipe) and see if that makes a difference. I tend to think not, but ...
Also, I don't know what compiler and python implementation you're using under Windows. Whenever I have to do this, I usually have Cygwin installed as it gives one of the closest implementations of linux/Unix-like environment (i.e. pipes are more likely to work as advertised).
Anyway, here's the modified script. Also note that I added os.getenv to grab alternate matrix sizes and an alternate place for the C++ executable, so that it would work for both of us with minimal pain
#!/usr/bin/python
import subprocess
import struct
import sys
import os
import numpy as np
val = os.getenv("MTX","2000")
sizeX = int(val)
sizeY = sizeX
print "sizeX=%d sizeY=%d" % (sizeX,sizeY)
#set up the variables needed
bytesPerDouble = 8
offset = sizeX*sizeY
totalBytesPerArray = sizeX*sizeY*bytesPerDouble
totalBytes = totalBytesPerArray*2 #the 2 is because we pass 2 different versions of the 2D array
#setup the testing data array
a = np.zeros(sizeX*sizeY*2, dtype='d')
for i in range(sizeX):
for j in range(sizeY):
a[j+i*sizeY] = i
a[j+i*sizeY+offset] = i
if i % 10 == 0:
a[j+i*sizeY+offset] = j
data = a.tobytes('C')
strTotalBytes = str(totalBytes)
strLineBytes = str(sizeY*bytesPerDouble)
#communicate with c++ code
print("starting C++ code")
command = os.getenv("CPGM",None);
if command is None:
command = "C:\Python27\PythonPipes.exe"
proc = subprocess.Popen([command, strTotalBytes, strLineBytes, str(sizeY), str(sizeX)], stdin=subprocess.PIPE,stderr=subprocess.PIPE,stdout=subprocess.PIPE)
ByteBuffer = (data)
proc.stdin.write(ByteBuffer)
def prt(i,b):
hangflg = 0
per = 8
for j in range(0,len(b)):
if ((j % per) == 0):
print("[%d,%d]" % (i,j)),
q = b[j]
print(q),
hangflg = 1
if (q < 0.0) or (q > 1.0):
print("=WTF"),
if ((j % per) == (per - 1)):
print("")
hangflg = 0
if (hangflg):
print("")
print("Reading results back from C++")
for i in range(sizeX):
returnvalues = proc.stdout.read(sizeY*bytesPerDouble)
a = buffer(returnvalues)
b = struct.unpack_from(str(sizeY)+'d', a)
prt(i,b)
###print str(b) + " " + str(i)
###print str(i) + ": " + str(b)
print('done')
I got the following code in javascript for RSA implementionhttp://www-cs-students.stanford.edu/~tjw/jsbn/:
// Return the PKCS#1 RSA encryption of "text" as an even-length hex string
function RSAEncrypt(text) {
var m = pkcs1pad2(text,(this.n.bitLength()+7)>>3);
if(m == null) return null;
var c = this.doPublic(m);
if(c == null) return null;
var h = c.toString(16);
if((h.length & 1) == 0) return h; else return "0" + h;
}
// PKCS#1 (type 2, random) pad input string s to n bytes, and return a bigint
function pkcs1pad2(s,n) {
if(n < s.length + 11) { // TODO: fix for utf-8
alert("Message too long for RSA");
return null;
}
var ba = new Array();
var i = s.length - 1;
while(i >= 0 && n > 0) {
var c = s.charCodeAt(i--);
if(c < 128) { // encode using utf-8
ba[--n] = c;
}
else if((c > 127) && (c < 2048)) {
ba[--n] = (c & 63) | 128;
ba[--n] = (c >> 6) | 192;
}
else {
ba[--n] = (c & 63) | 128;
ba[--n] = ((c >> 6) & 63) | 128;
ba[--n] = (c >> 12) | 224;
}
}
ba[--n] = 0;
var rng = new SecureRandom();
var x = new Array();
while(n > 2) { // random non-zero pad
x[0] = 0;
while(x[0] == 0) rng.nextBytes(x);
ba[--n] = x[0];
}
ba[--n] = 2;
ba[--n] = 0;
return new BigInteger(ba);
}
In the snippets above, it seems that the pkcs1pad2 function is used for padding the message with some random bytes(maybe sth like 0|2|random|0 ) in front of the message.
I'm using the python rsa package (http://stuvel.eu/rsa) for imitating the javascript result, i'm a newbie to python world and have no idea to traslate javascript algorithm code to the python code.
Any help would be appreciated.
Jiee
I know it's a bit late, but in a few days I'll release a new version of my Python-RSA package. That version will include PKCS#1 v1.5 padding, so it should be compatible with your JavaScript code ;-)