OpenCV Inverse Matrix - python

somehow I get different results using numpy's linalg.inv and OpenCv's Mat::inv, e.g. given a matrix S:
[747.8561839552103, 359.9317804054358, 204.6165451419812, 241.7376332155144, 126.132733370211, 81.57610562583466;
359.9317804054358, 204.6165451419812, 140.4788277943211, 126.132733370211, 80.55127348381332, 47.7506303038364;
204.6165451419812, 140.4788277943211, 116.7472083846913, 80.55127348381332, 63.86033402962857, 35.56970826526131;
241.7376332155144, 126.132733370211, 80.55127348381332, 81.57610562583466, 47.7506303038364, 29.55106507912703;
126.132733370211, 80.55127348381332, 63.86033402962857, 47.7506303038364, 35.56970826526131, 20.31193086803655;
81.57610562583466, 47.7506303038364, 35.56970826526131, 29.55106507912703, 20.31193086803655, 12]
OpenCV S.inv() results in
[-19562262532715.65, 137.3094415699439, -44015090698406.93, 78249050130618.84, 88030181396160.73, -78249050129617.47;
124.2797501878658, 19.14093204142484, 301.2737811201993, -531.0713785028685, -686.3332686269306, 655.5356524828615;
-44015090698424.45, 330.6065529853437, -99033954070961.39, 176060362793110.9, 198067908140346.2, -176060362790691.7;
78249050130642.06, -583.2397919514979, 176060362793093.2, -312996200521532.5, -352120725583424.9, 312996200517305.3;
88030181396256.19, -744.9706630355842, 198067908140482.2, -352120725583702.1, -396135816277430.2, 352120725578288.9;
-78249050129732.52, 707.7008280168318, -176060362790880.5, 312996200517672.6, 352120725578424.8, -312996200512573.6]
numpy's linalg.inv results in:
[[2685109332201.37, -23.46, 6041495997420.42, -10740437328763.82, -12082991994731.51, 10740437328597.41]
[-32.56, 19.14, -51.60, 96.23, 19.43, 28.24]
[6041495997407.60, -31.13, 13593365994129.85, -24165983989574.97, -27186731988120.73, 24165983989366.80]
[-10740437328747.65, 59.84, -24165983989589.86, 42961749314884.59, 48331967978891.43, -42961749314440.71]
[-12082991994663.29, -21.50, -27186731988024.93, 48331967978691.32, 54373463976152.90, -48331967978849.60]
[10740437328516.33, 64.62, 24165983989235.66, -42961749314181.06, -48331967978757.62, 42961749314608.99]]
Okay, now some detail of how I stumbled across this:
I am working on a routine to fit an ellipse given a point set; I have transferred code from: http://nicky.vanforeest.com/misc/fitEllipse/fitEllipse.html to C++:
template <typename InputIt>
std::array<double, 6> fit_ellipse(InputIt first, InputIt last) {
typedef double value_type;
auto num_points = std::distance(first, last);
cv::Mat_<value_type> D(num_points, 6);
size_t row(0);
for (auto it(first); it != last; ++it, ++row) {
auto &point = *it;
auto &x = std::get<0>(point);
auto &y = std::get<1>(point);
D(row, 0) = x * x;
D(row, 1) = x * y;
D(row, 2) = y * y;
D(row, 3) = x;
D(row, 4) = y;
D(row, 5) = 1.f;
}
auto S = D.t() * D;
cv::Mat_<value_type> C = cv::Mat_<value_type>::zeros(6, 6);
C(0, 2) = C(2, 0) = 2; C(1, 1) = -1;
cv::Mat Sinv = S.inv();
std::cout << "Sinv:" << Sinv << std::endl;
cv::Mat_<value_type> E, V; cv::eigen(Sinv * C, E, V);
int n;
cv::minMaxIdx(cv::abs(E), nullptr, nullptr, nullptr, &n);
return {{V(n, 0), V(n, 1), V(n, 2), V(n, 3), V(n, 4), V(n, 5)}};
}
To comparison the Python code:
import numpy as np
from numpy.linalg import eig, inv
def fitEllipse(x,y):
x = x[:,np.newaxis]
y = y[:,np.newaxis]
D = np.hstack((x*x, x*y, y*y, x, y, np.ones_like(x)))
S = np.dot(D.T,D)
C = np.zeros([6,6])
C[0,2] = C[2,0] = 2; C[1,1] = -1
Sinv = inv(S)
print("Sinv: %s" % Sinv)
E, V = eig(np.dot(Sinv, C))
n = np.argmax(np.abs(E))
a = V[:,n]
return a
The code is run initializing following points:
TEST(fit_ellipse, test_example) {
namespace bmc = boost::math::constants;
typedef std::array<double, 2> point_type;
std::vector<point_type> points;
auto arc = .8;
for (size_t i(0); i < 12; ++i) {
auto r = i * arc * bmc::pi<double>() / 12;
points.push_back({1.5 * std::cos(r) + 2, std::sin(r) + 1});
}
auto a = fit_ellipse(points.begin(), points.end());
std::cout << "Parameters: "
for (auto &p : a)
std::cout << p << ' ';
std::cout << std::endl;
}
Same in Python:
R = np.arange(0, arc * np.pi, arc * np.pi / 12)
x = 1.5 * np.cos(R) + 2
y = np.sin(R) + 1.
print("Parameters: %s" % fitEllipse(x,y))
Any Idea what I am missing out?

Related

How to convert integer sizes and pointer declarations from C++ to Python?

I am trying to convert this C++ function into Python code but I am having trouble in doing so for the pointer and turning the python variables into uint32_t and uint8_t respectively. I am not sure how to declare the functions such that it returns uint32_t and also what to do with the pointer uint8_t *buf. Please help me in figuring out on how to convert the function, from C++ to python.
This is my C++ code:
uint32_t Functions::Do_calc(uint8_t *buf, uint32_t len){
return Do_calc(25, buf, len);
}
uint32_t Functions::Do_calc(uint32_t val, uint8_t *buf, uint32_t len){
uint32_t temp_int, c = val;
uint32_t ip_buf[128];
uint32_t j, rem = 0, tf = 0, p = 0;
rem = len;
while(rem > 0){
if(rem <= 512){
tf = rem;
}
else{
tf = 512;
}
for(j = 0; j < 128; j++){
ip_buf[j]=0;
}
for(j = 0; j < tf; j += 2){
temp = ((buf[p * 512 + (j + 3)]<<24) +
(buf[p * 512 + (j + 2)]<<16) +
(buf[p * 512 + (j + 1)]<<8) +
buf[p * 512 + j]);
ip_buf[j / 4] = temp;
}
c = c_cal(ip_buf, tf, c, 0x04C22AB9, 2, true);
p++
}
return c;
}
uint32_t Functions::c_cal(uint32_t *d_base, uint32_t d_size, uint32_t c, uint32_t poly, uint8_t c_size, bool b_r_ip)
{
unsigned long d_offset;
unsigned long d, d_temp;
unsigned char c_bit;
d = 0;
for(d_offset = 0; d_offset < d_size; (d_offset += c_size))
{
u32_d_temp = 0;
d_temp = d_base[d_offset/c_size];
if(FALSE == b_r_ip)
{
d = d_temp;
}
else
{
d = 0;
for(c_bit = 0; c_bit < (c_size << 3); c_bit++)
{
d <<= 1;
d |= (d_temp & 1);
d_temp >>= 1;
}
}
for(c_bit = 0; c_bit < (c_size << 3); c_bit++)
{
if(((c >> ((c_size << 3) - 1)) ^ d) & 1)
{
c <<= 1;
d >>= 1;
c ^= poly;
}
else
{
c <<= 1;
d >>= 1;
}
}
}
return (c & (0xFFFFFFFF >> (32 - (c_size << 3))));
}
This is my attempted Python implementation. As you can see I just did a basic implementation as I did not worry about the pointer and the size of the integer which is very much needed:
def Do_calc(buf, len):
return Do_calc(0, buf, len)
def Do_calc(val, buf, len):
ip_buf = []
c = val
p = 0
rem = len
while rem > 0:
if rem <= 512:
tf = rem
else:
tf = 512
for j in range(128):
ip_buf[j].append = 0
for j in xrange(0, tf, 2):
temp_int = ((buffer[packet * 512 + (i + 3)] << 24) +
(buffer[packet * 512 + (i + 2)] << 16) +
(buffer[packet * 512 + (i + 1)] << 8) +
buffer[packet * 512 + i])
ip_buf[j/4] = temp
c = c_cal(ip_buf, tf, c, 0x04C22AB9, 2, true)
p += 1
return c
How do I properly do the conversion after taking care of all the aspects?

OSError: exception: access violation reading 0x000002369EF14000

I have the same error as many of the questions here: OSError: exception: access violation reading 0x000002369EF14000. However, the situation is different and I couldn't figure out why this happens. Moreover, the address appears to be different every time I run the code.
I ran the following python code and get the error above. How do I fix this issue? (Or what is the general cause for the OSError: exception: access violation reading error?)
from ctypes import *
import numpy as np
splines = CDLL('splines.dll')
splines.spline_basis.argtypes = [POINTER(c_double), POINTER(c_int), POINTER(c_int), POINTER(c_double), POINTER(c_int),
POINTER(c_int), POINTER(c_double), POINTER(c_int)]
x = np.random.rand(10)
ord = 3
knots = np.linspace(0, 1, 5)
nk = len(knots)
nx = len(x)
ind = sorted(range(x.shape[0]), key=lambda k: x[k])
sortx = x[ind]
derivs = nx
derivs = [derivs if i == 0 else np.nan for i in ind]
ncoef = nk - ord
temp = splines.spline_basis(np.array(knots).ctypes.data_as(POINTER(c_double)), POINTER(c_int)(c_int(ncoef)),
POINTER(c_int)(c_int(ord)),
sortx.ctypes.data_as(POINTER(c_double)), np.array(derivs).ctypes.data_as(POINTER(c_int)),
POINTER(c_int)(c_int(nx)), np.zeros([ord, nx]).ctypes.data_as(POINTER(c_double)),
POINTER(c_int)(c_int(nx)))
print(temp)
The dll file is compiled from the c code below, which builds a spline basis. The main function used is the spline_basis function
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "splines.h"
double* ldel, * rdel;
long orderm1;
void
diff_table(double* ti, double x, int ndiff)
{
register double* r = rdel, * l = ldel, * dpt = ti;
while (ndiff--) {
*r++ = *dpt++ - x;
*l++ = x - *--ti;
}
}
void
basis_funcs(double* ti, double x, double* b)
{
int j, r;
double saved, term;
diff_table(ti, x, orderm1);
b[0] = 1.;
for (j = 1; j <= orderm1; j++) {
saved = 0.;
for (r = 0; r < j; r++) {
term = b[r] / (rdel[r] + ldel[j - 1 - r]);
b[r] = saved + rdel[r] * term;
saved = ldel[j - 1 - r] * term;
}
b[j] = saved;
}
}
double
evaluate(double* ti, double x, double* a, int nder)
{
register double* lpt, * rpt, * apt;
register int inner;
int outer = orderm1;
while (nder--) {
for (inner = outer, apt = a, lpt = ti - outer; inner--; apt++, lpt++)
*apt = outer * (*(apt + 1) - *apt) / (*(lpt + outer) - *lpt);
outer--;
}
diff_table(ti, x, (long)outer);
while (outer--)
for (apt = a, lpt = ldel + outer, rpt = rdel, inner = outer + 1;
inner--; lpt--, rpt++, apt++)
*apt = (*(apt + 1) * *lpt + *apt * *rpt) / (*rpt + *lpt);
return(*a);
}
void
spline_value(double* knots, double* coeff, int * ncoeff,
int * order, double* x, int * nx, int * deriv,
double* y)
{
long n = *nx;
double* a, * last = knots + *ncoeff;
a = calloc(*order, sizeof(double));
orderm1 = *order - 1L; /* allocate difference tables */
rdel = calloc(orderm1, sizeof(double));
ldel = calloc(orderm1, sizeof(double));
knots += *order; /* First *order knots must be <= all x's */
while (n--) {
while (knots <= last && *knots <= *x) { knots++; coeff++; }
memcpy(a, coeff, *order);
*y++ = evaluate(knots, *x++, a, (int)*deriv);
}
free(ldel); free(rdel); free(a);
}
void spline_basis(double* knots, int * ncoeff, int * order,
double* xvals, int * derivs, int * nx,
double* basis, int * offsets)
{ /* evaluate the non-zero B-spline basis */
/* functions (or their derivatives) at */
/* xvals. */
int n = *nx, i, j;
double* dpt, * coeff, * last = knots + *ncoeff;
orderm1 = *order - 1L;
rdel = calloc(orderm1, sizeof(double));
ldel = calloc(orderm1, sizeof(double));
coeff = calloc(*order, sizeof(double));
dpt = (knots += *order); /* first *order knots must be <= all xvals */
for (; n--; xvals++, derivs++) {
while (dpt < last && *dpt <= *xvals) dpt++;
if (*derivs) { /* slow method for derivatives */
for (i = 0; i < *order; i++) {
for (j = 0; j < *order; j++) coeff[j] = 0;
coeff[i] = 1;
*basis++ = evaluate(dpt, *xvals, coeff, (int)*derivs);
}
}
else {
basis_funcs(dpt, *xvals, basis); /* fast method for value */
basis += *order;
}
*offsets++ = (long)(dpt - knots);
}
free(ldel); free(rdel); free(coeff);
}
void lin_interp(double* x, double* y, double* x0, double* y0, int * nvals)
{
int n = *nvals;
double* firstx = x;
while (n--) {
while (*x < *x0) { x++; y++; }
if (x > firstx) { x--; y--; }
if (*x > * x0) *y0++ = *y + (*(y + 1) - *y) * (*x0 - *x) / (*(x + 1) - *x);
else *y0++ = *y;
x0++;
}
}
The header file includes declaration of functions only.
void diff_table(double* ti, double x, int ndiff);
void basis_funcs(double* ti, double x, double* b);
double evaluate(double* ti, double x, double* a, int nder);
void spline_value(double* knots, double* coeff, int* ncoeff, int* order, double* x, int* nx, int* deriv, double* y);
__declspec(dllexport) void spline_basis(double* knots, int* ncoeff, int* order, double* xvals, int* derivs, int* nx, double* basis, int* offsets);
void lin_interp(double* x, double* y, double* x0, double* y0, int* nvals);

How to get the output from YOLO model using tensorflow with C++ correctly?

I'm trying to write an inference program with YOLO model in C++. I've searched for some info about darknet, but it has to use .cfg file to import the model structure(which is a bit too complicated for me...), thus I want to do the program with tensorflow.
(My model weight is converted from .hdf5(used in python) to .pb(used in C++))
I've found some examples written in python, it seems like they have done some work before the inference process... Source
def yolo_eval(yolo_outputs,
anchors,
num_classes,
image_shape,
max_boxes=50,
score_threshold=.6,
iou_threshold=.5):
"""Evaluate YOLO model on given input and return filtered boxes."""
num_layers = len(yolo_outputs)
anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]] # default setting
input_shape = K.shape(yolo_outputs[0])[1:3] * 32
boxes = []
box_scores = []
for l in range(num_layers):
_boxes, _box_scores = yolo_boxes_and_scores(yolo_outputs[l],
anchors[anchor_mask[l]], num_classes, input_shape, image_shape)
boxes.append(_boxes)
box_scores.append(_box_scores)
boxes = K.concatenate(boxes, axis=0)
box_scores = K.concatenate(box_scores, axis=0)
mask = box_scores >= score_threshold
max_boxes_tensor = K.constant(max_boxes, dtype='int32')
boxes_ = []
scores_ = []
classes_ = []
for c in range(num_classes):
# TODO: use keras backend instead of tf.
class_boxes = tf.boolean_mask(boxes, mask[:, c])
class_box_scores = tf.boolean_mask(box_scores[:, c], mask[:, c])
nms_index = tf.image.non_max_suppression(
class_boxes, class_box_scores, max_boxes_tensor, iou_threshold=iou_threshold)
class_boxes = K.gather(class_boxes, nms_index)
class_box_scores = K.gather(class_box_scores, nms_index)
classes = K.ones_like(class_box_scores, 'int32') * c
boxes_.append(class_boxes)
scores_.append(class_box_scores)
classes_.append(classes)
boxes_ = K.concatenate(boxes_, axis=0)
scores_ = K.concatenate(scores_, axis=0)
classes_ = K.concatenate(classes_, axis=0)
return boxes_, scores_, classes_
I've printed out the return value
and it looks like this
boxes-> Tensor("concat_11:0", shape=(?, 4), dtype=float32)
scores-> Tensor("concat_12:0", shape=(?,), dtype=float32)
classes-> Tensor("concat_13:0", shape=(?,), dtype=int32)
the original output of my YOLO model(.hdf5) is (I got this by printed out model.output)
tf.Tensor 'conv2d_59_1/BiasAdd:0' shape=(?, ?, ?, 21) dtype=float32
tf.Tensor 'conv2d_67_1/BiasAdd:0' shape=(?, ?, ?, 21) dtype=float32
tf.Tensor 'conv2d_75_1/BiasAdd:0' shape=(?, ?, ?, 21) dtype=float32
And the inference part of the python code is
out_boxes, out_scores, out_classes = sess.run(
[boxes, scores, classes],
feed_dict={
yolo_model.input: image_data,
input_image_shape: [image.size[1], image.size[0]],
K.learning_phase(): 0
})
Compare to the python version of inference code,
C++ part is... (Reference)
int main()
{
string image = "test.jpg";
string graph = "yolo_weight.pb";
string labels = "coco.names";
int32 input_width = 416;
int32 input_height = 416;
float input_mean = 0;
float input_std = 255;
string input_layer = "input_1:0";
std::vector<std::string> output_layer = {"conv2d_59/BiasAdd:0", "conv2d_67/BiasAdd:0", "conv2d_75/BiasAdd:0" };
std::unique_ptr<tensorflow::Session> session;
string graph_path = tensorflow::io::JoinPath(root_dir, graph);
Status load_graph_status = LoadGraph(graph_path, &session);
std::vector<Tensor> resized_tensors;
string image_path = tensorflow::io::JoinPath(root_dir, image);
Status read_tensor_status = ReadTensorFromImageFile(image_path, input_height, input_width,
input_mean, input_std, &resized_tensors);
Tensor inpTensor = Tensor(DT_FLOAT, TensorShape({ 1, input_height, input_width, 3 }));
std::vector<Tensor> outputs;
cv::Mat srcImage = cv::imread(image);
cv::resize(srcImage, srcImage, cv::Size(input_width, input_height));
srcImage.convertTo(srcImage, CV_32FC3);
srcImage = srcImage / 255;
string ty = type2str(srcImage.type());
float *p = (&inpTensor)->flat<float>().data();
cv::Mat tensorMat(input_height, input_width, CV_32FC3, p);
srcImage.convertTo(tensorMat, CV_32FC3);
Status run_status = session->Run({{ input_layer, inpTensor }}, { output_layer }, {}, &outputs);
int cc = 1;
auto output_detection_class = outputs[0].tensor<float, 4>();
std::cout << "detection scores" << std::endl;
std::cout << "typeid(output_detection_scoreclass).name->" << typeid(output_detection_class).name() << std::endl;
for (int i = 0; i < 13; ++i)
{
for (int j = 0; j < 13; ++j)
{
for (int k = 0; k < 21; ++k)
{
// using (index_1, index_2, index_3) to access the element in a tensor
printf("i->%d, j->%d, k->%d\t", i, j, k);
std::cout << output_detection_class(1, i, j, k) << "\t";
cc += 1;
if (cc % 4 == 0)
{
std::cout << "\n";
}
}
}
std::cout << std::endl;
}
return 0;
}
The output of c++ version inference part is
outputs.size()-> 3
outputs[0].shape()-> [1,13,13,21]
outputs[1].shape()-> [1,26,26,21]
outputs[2].shape()-> [1,52,52,21]
But the output I get is pretty weird...
(The output value of outputs[0] doesn't seems like any one of score, class, or coordinates...)
So I'm wondering is it because I miss the part written in python before its inference? Or I use the wrong way to get my output data?
I've checked some related questions and answers...
1.Yolo v3 model output clarification with keras
2.Convert YoloV3 output to coordinates of bounding box, label and confidence
3.How to access tensorflow::Tensor C++
But I still can't figure out how to make it :(
I also found a repo which might be helpful,
I've taken a look at its yolo.cpp, but its model output tensor's shape is different from mine, I'm not sure if I can revise the code directly, its output tensor is
tf.Tensor 'import/output:0' shape=(?, 735) dtype = float32
Any help or advice is appreciated...
In case you're still struggling with this, I don't see where you are applying the Sigmoid and Exp to the output layer values.
You might look at this paper, which describes how to handle the output.
https://medium.com/analytics-vidhya/yolo-v3-theory-explained-33100f6d193
As Bryan said, there're still some actions need to be done with the output layer.
So in my case (according to this repo), I add this to the YOLO class (at file yolo.py) for adding those post-processing when saving model:
def output_pb(self, out_dir, out_pb):
out_bx = self.boxes.name.split(":")[0]
out_sc = self.scores.name.split(":")[0]
out_cs = self.classes.name.split(":")[0]
print(out_bx, out_sc, out_cs)
frozen_graph = tf.graph_util.remove_training_nodes(tf.graph_util.convert_variables_to_constants(self.sess, self.sess.graph.as_graph_def(), [out_bx, out_sc, out_cs]))
tf.io.write_graph(frozen_graph, out_dir, out_pb, as_text=False)
print("===== FINISH saving new pb file =====")
When saving model, I called the function like this:
yolo = YOLO(**config)
yolo.output_pb(output_dir, output_pb_name)
And when doing inference in C++,
the whole process goes like this:
// initialize model
YOLO* YOLO_data = (YOLO*)Init_DllODM_object(config);
// do some stuff to set data in YOLO_data
cv::Mat input_pic = "whatever_pic.png";
predict(YOLO_data, input_pic, YOLO_data ->bbox_res, YOLO_data ->score_res, YOLO_data ->class_res);
// draw result on pic
cv::Mat res = show_result(YOLO_data, input_pic);
Detailed code is here:
// yolo_cpp.h
struct YOLO
{
float score_thres;
std::vector<int> class_res;
std::vector<float> bbox_res, score_res;
std::string inp_tensor_name;
std::string placeholder_name;
std::vector<std::string> out_tensors;
Session* session;
Tensor t, inpTensor;
std::vector<tensorflow::Tensor> outTensor;
std::vector<int> MD_size;
std::vector<int> inp_pic_size;
std::vector<std::string> md_class_list;
std::vector<cv::Scalar> color_list;
int show_score;
int score_type;
int return_origin;
};
// yolo_cpp.cpp
void* Init_DllODM_object(json config)
{
std::string model_path = config["model"].get<std::string>();
YOLO* YOLO_data = new YOLO();
auto options = tensorflow::SessionOptions();
GraphDef graphdef;
// loading model to graph
Status status_load = ReadBinaryProto(Env::Default(), model_path, &graphdef);
options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.7);
options.config.mutable_gpu_options()->set_allow_growth(true);
int node_count = graphdef.node_size();
for (int i = 0; i < node_count; i++)
{
auto n = graphdef.node(i);
if (n.name().find("input_") != string::npos)
{
YOLO_data->inp_tensor_name = n.name();
}
else if (n.name().find("Placeholder_") != string::npos)
{
YOLO_data->placeholder_name = n.name();
}
else if (i == node_count - 5)
{
YOLO_data->out_tensors.push_back(n.name());
}
else if (i == node_count - 3)
{
YOLO_data->out_tensors.push_back(n.name());
}
else if (i == node_count - 1)
{
YOLO_data->out_tensors.push_back(n.name());
}
}
if (!status_load.ok()) {
std::cout << "ERROR: Loading model failed..." << std::endl;
std::cout << model_path << status_load.ToString() << "\n";
}
std::vector<int> MD_size_ = config["input_size"];
YOLO_data->MD_size = MD_size_;
std::vector<int> inp_pic_size_ = config["input_pic_size"];
YOLO_data->inp_pic_size = inp_pic_size_;
YOLO_data->inpTensor = Tensor(DT_FLOAT, TensorShape({ 1, YOLO_data->MD_size[0], YOLO_data->MD_size[1], 3 })); // input tensor
YOLO_data->t = Tensor(DT_FLOAT, TensorShape({ 2 }));
//ref: https://stackoverflow.com/questions/36804714/define-a-feed-dict-in-c-for-tensorflow-models
auto t_matrix = YOLO_data->t.tensor<float, 1>();
t_matrix(0) = YOLO_data->inp_pic_size[0];
t_matrix(1) = YOLO_data->inp_pic_size[1];
// create session
Status status_newsess = NewSession(options, &YOLO_data->session); //for the usage of gpu setting
Status status_create = YOLO_data->session->Create(graphdef);
if (!status_create.ok()) {
std::cout << "ERROR: Creating graph in session failed.." << status_create.ToString() << std::endl;
}
else {
std::cout << "----------- Successfully created session and load graph -------------" << std::endl;
}
return YOLO_data;
}
int predict(YOLO* YOLO_, cv::Mat srcImage, std::vector<float>& bbox_res, std::vector<float>& score_res, std::vector<int>& class_res)
{
// read image -> input image
if (srcImage.empty()) // check if image can open correctly
{
std::cout << "can't open the image!!!!!!!" << std::endl;
int res = -1;
return res;
}
// ref: https://ppt.cc/f7ERNx
std::vector<std::pair<string, tensorflow::Tensor>> inputs = {
{ YOLO_->inp_tensor_name, YOLO_->inpTensor },
{ YOLO_->placeholder_name, YOLO_->t },
};
srcImage = letterbox_image(srcImage, YOLO_->MD_size[0], YOLO_->MD_size[1]);
convertCVMatToTensor(YOLO_, srcImage);
Status status_run = YOLO_->session->Run({ inputs }, { YOLO_->out_tensors }, {}, &YOLO_->outTensor);
if (!status_run.ok()) {
std::cout << "ERROR: RUN failed..." << std::endl;
std::cout << status_run.ToString() << "\n";
int res = -1;
return res;
}
TTypes<float>::Flat pp1 = YOLO_->outTensor[0].flat<float>();
TTypes<float>::Flat pp2 = YOLO_->outTensor[1].flat<float>();
TTypes<int>::Flat pp3 = YOLO_->outTensor[2].flat<int>();
int pp1_idx;
for (int i = 0; i < pp2.size(); i++)
{
pp1_idx = i * 4;
bbox_res.push_back(pp1(pp1_idx));
bbox_res.push_back(pp1(pp1_idx + 1));
bbox_res.push_back(pp1(pp1_idx + 2));
bbox_res.push_back(pp1(pp1_idx + 3));
score_res.push_back(pp2(i));
class_res.push_back(pp3(i));
}
return 0;
}
cv::Mat show_result(YOLO* inf_obj, cv::Mat inp_pic)
{
int bbox_idx;
std::string plot_str;
bool under_thresh = false;
std::vector<int> del_idx;
for (int i = 0; i < inf_obj->class_res.size(); i++)
{
int y_min, y_max, x_min, x_max;
bbox_idx = i * 4;
y_min = std::max(0, (int)floor(inf_obj->bbox_res[bbox_idx] + 0.5));
x_min = std::max(0, (int)floor(inf_obj->bbox_res[bbox_idx + 1] + 0.5));
y_max = std::max(0, (int)floor(inf_obj->bbox_res[bbox_idx + 2] + 0.5));
x_max = std::max(0, (int)floor(inf_obj->bbox_res[bbox_idx + 3] + 0.5));
//std::cout << md_class_list[class_res[i]] << ", ";
//std::cout << score_res[i] << ",";
//std::cout << "[" << x_min << ", " << y_min << ", " << x_max << ", " << y_max << "]\n";
if (inf_obj->show_score)
{
if (inf_obj->score_type)
plot_str = inf_obj->md_class_list[inf_obj->class_res[i]] + ", " + std::to_string(rounding(inf_obj->score_res[i] * 100, 2)).substr(0, 5) + "%";
else
plot_str = inf_obj->md_class_list[inf_obj->class_res[i]] + ", " + std::to_string(rounding(inf_obj->score_res[i], 2)).substr(0, 4);
}
else
plot_str = inf_obj->md_class_list[inf_obj->class_res[i]];
if (inf_obj->score_res[i] >= inf_obj->score_thres)
{
inp_pic = plot_one_box(inp_pic, x_min, y_min, x_max, y_max, plot_str, inf_obj->color_list[inf_obj->class_res[i]]);
}
else
{
//std::cout << "score_res[i]->" << score_res[i] << "under thresh!!" << std::endl;
under_thresh = true;
del_idx.push_back(i);
}
}
if (under_thresh)
{
//std::cout << "*** deleting element" << std::endl;
for (int x = 0; x < del_idx.size(); x++)
{
bbox_idx = (del_idx[x] - x) * 4;
inf_obj->bbox_res.erase(inf_obj->bbox_res.begin() + bbox_idx + 3);
inf_obj->bbox_res.erase(inf_obj->bbox_res.begin() + bbox_idx + 2);
inf_obj->bbox_res.erase(inf_obj->bbox_res.begin() + bbox_idx + 1);
inf_obj->bbox_res.erase(inf_obj->bbox_res.begin() + bbox_idx);
inf_obj->score_res.erase(inf_obj->score_res.begin() + del_idx[x] - x);
inf_obj->class_res.erase(inf_obj->class_res.begin() + del_idx[x] - x);
}
del_idx.clear();
}
return inp_pic;
}
Since my code is used for dll, I arranged in this way.
There are still some redundant code I didn't delete,
but I think the whole process can be done with these provided code so far.
Hope this help :D

Recoding from C to Python

I'm pretty much a layman in C and I'm learning Python. I need to write the routine described below (in C) for Python:
#include <stdio.h>
#include <math.h>
main()
{
float hold[26], hnew[26];
float dt, dx;
float t, s;
float ho;
float time;
float f1, d2h;
int i;
int nx, nlx;
int n, nend;
int kount, kprint;
dt = 5.0;
dx = 10.0;
t = 0.02;
s = 0.002;
nx = 11;
nlx = nx-1;
ho = 16.0;
for( i = 1; i <= nx; i++ )
{
hold[i] = ho;
hnew[i] = ho;
}
hold[nx] = 11.0;
printf("\t\t\t\thead\t\t\t\t time\n\n");
kount = 1;
kprint = 2;
time = dt;
nend = 100;
for( n = 1; n <= nend; n++ )
{
/* update solution */
for( i = 2; i <= nlx; i++ )
{
f1 = dt*t/s;
d2h = ( hold[i+1] - 2.0*hold[i] + hold[i-1])/(dx*dx);
hnew[i] = hold[i] + (f1*d2h);
}
for( i = 1; i <= nlx; i++ )
{
hold[i] = hnew[i];
}
if( kount == kprint )
{
for( i = 1; i <= nx; i++ )
{
printf(" %.2f",hold[i]);
}
printf(" %6.2f\n",time);
kount = 0;
}
time = time + dt;
kount = kount + 1;
}
}
This is my attempt at Python:
import numpy as np
dt = 5.0
dx = 10.0
t = 0.02
s = 0.002
nx = 11
nlx = nx - 1
ho = 16.0
hold = np.zeros(nx+1)
hnew = np.zeros(nx+1)
for i in range(nx):
hold[i] = ho
hnew[i] = ho
hold[nx] = 11.0
However, I can't get over this because I don't know the Python correspondent of the printf function. What would be the correct form of this function in Python? What does it reffer to?
Just print() in Python with .format.
For example:
x, y = 1, 2
print("x = {0}, y = {1}".format(x, y))
Here's the doc
To print similar to C's printf, the following is an example:
f = 3.25645
g = 3.14159265358979
for fl in (f,g):
print(f'{fl:.2f}')
3.26
3.14
The first f in the print is the format specifier. The f in the braces says to consider the number as a float.
it just print() (see a small program below)
squares = []
for x in range(14):
squares.append(x**2)
squares
squares2 = [x**2 for x in range(100)]
print (squares2)

CUDA: does size of input/output data have to be a multiple of the number of threads per block?

I have a Python code (for implementing RayTracing) that I'm running in parallel with PyCuda.
import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
from stl import mesh
import time
my_mesh = mesh.Mesh.from_file('test_solid_py.stl')
n = my_mesh.normals
v0 = my_mesh.v0
v1 = my_mesh.v1
v2 = my_mesh.v2
v0_x = v0[:,0]
v0_x = np.ascontiguousarray(v0_x)
v0_y = v0[:,1]
v0_y = np.ascontiguousarray(v0_y)
v0_z = v0[:,2]
v0_z = np.ascontiguousarray(v0_z)
v1_x = v1[:,0]
v1_x = np.ascontiguousarray(v1_x)
v1_y = v1[:,1]
v1_y = np.ascontiguousarray(v1_y)
v1_z = v1[:,2]
v1_z = np.ascontiguousarray(v1_z)
v2_x = v2[:,0]
v2_x = np.ascontiguousarray(v2_x)
v2_y = v2[:,1]
v2_y = np.ascontiguousarray(v2_y)
v2_z = v2[:,2]
v2_z = np.ascontiguousarray(v2_z)
mod = SourceModule("""
#include <math.h>
__global__ void intersect(float *origin,float *dir_x,float *dir_y,float *dir_z,float *v0_x,float *v0_y,float *v0_z,float *v1_x,float *v1_y,float *v1_z,float *v2_x,float *v2_y,float *v2_z,float *int_point_real_x, float *int_point_real_y,float *int_point_real_z)
{
using namespace std;
unsigned int idx = blockDim.x*blockIdx.x + threadIdx.x;
int count = 0;
float v0_current[3];
float v1_current[3];
float v2_current[3];
float dir_current[3] = {dir_x[idx],dir_y[idx],dir_z[idx]};
float int_point[3];
float int_pointS[2][3];
int int_faces[2];
float dist[2];
dist[0] = -999;
int n_tri = 105500;
for(int i = 0; i<n_tri; i++) {
v0_current[0] = v0_x[i];
v0_current[1] = v0_y[i];
v0_current[2] = v0_z[i];
v1_current[0] = v1_x[i];
v1_current[1] = v1_y[i];
v1_current[2] = v1_z[i];
v2_current[0] = v2_x[i];
v2_current[1] = v2_y[i];
v2_current[2] = v2_z[i];
double eps = 0.0000001;
float E1[3];
float E2[3];
float s[3];
for (int j = 0; j < 3; j++) {
E1[j] = v1_current[j] - v0_current[j];
E2[j] = v2_current[j] - v0_current[j];
s[j] = origin[j] - v0_current[j];
}
float h[3];
h[0] = dir_current[1] * E2[2] - dir_current[2] * E2[1];
h[1] = -(dir_current[0] * E2[2] - dir_current[2] * E2[0]);
h[2] = dir_current[0] * E2[1] - dir_current[1] * E2[0];
float a;
a = E1[0] * h[0] + E1[1] * h[1] + E1[2] * h[2];
if (a > -eps && a < eps) {
int_point[0] = false;
}
else {
double f = 1 / a;
float u;
u = f * (s[0] * h[0] + s[1] * h[1] + s[2] * h[2]);
if (u < 0 || u > 1) {
int_point[0] = false;
}
else {
float q[3];
q[0] = s[1] * E1[2] - s[2] * E1[1];
q[1] = -(s[0] * E1[2] - s[2] * E1[0]);
q[2] = s[0] * E1[1] - s[1] * E1[0];
float v;
v = f * (dir_current[0] * q[0] + dir_current[1] * q[1] + dir_current[2] * q[2]);
if (v < 0 || (u + v)>1) {
int_point[0] = false;
}
else {
float t;
t = f * (E2[0] * q[0] + E2[1] * q[1] + E2[2] * q[2]);
if (t > eps) {
for (int j = 0; j < 3; j++) {
int_point[j] = origin[j] + dir_current[j] * t;
}
//return t;
}
}
}
}
if (int_point[0] != false) {
count = count+1;
int_faces[count-1] = i;
dist[count-1] = sqrt(pow((origin[0] - int_point[0]), 2) + pow((origin[1] - int_point[1]), 2) + pow((origin[2] - int_point[2]), 2));
for (int j = 0; j<3; j++) {
int_pointS[count-1][j] = int_point[j];
}
}
}
double min = dist[0];
int ind_min = 0;
for (int i = 0; i < 2; i++){
if (min > dist[i]) {
min = dist[i];
ind_min = i;
}
}
if (dist[0] < -998){
int_point_real_x[idx] = -999;
int_point_real_y[idx] = -999;
int_point_real_z[idx] = -999;
}
else{
int_point_real_x[idx] = int_pointS[ind_min][0];
int_point_real_y[idx] = int_pointS[ind_min][1];
int_point_real_z[idx] = int_pointS[ind_min][2];
}
}
""")
n_rays = 20000
num_threads = 1024
num_blocks = int(n_rays/num_threads)
origin = np.asarray([-2, -2, -2]).astype(np.float32)
origin = np.ascontiguousarray(origin)
rand_x = np.random.randn(n_rays)
rand_y = np.random.randn(n_rays)
rand_z = np.random.randn(n_rays)
direction_x = np.ones((n_rays, 1)) * 3
direction_x = direction_x.astype(np.float32)
direction_x = np.ascontiguousarray(direction_x)
direction_y = np.ones((n_rays, 1)) * 4
direction_y = direction_y.astype(np.float32)
direction_y = np.ascontiguousarray(direction_y)
direction_z = np.ones((n_rays, 1)) * 5
direction_z = direction_z.astype(np.float32)
direction_z = np.ascontiguousarray(direction_z)
int_point_real_x = np.zeros((n_rays, 1)).astype(np.float32)
int_point_real_x = np.ascontiguousarray(int_point_real_x)
int_point_real_y = np.zeros((n_rays, 1)).astype(np.float32)
int_point_real_y = np.ascontiguousarray(int_point_real_y)
int_point_real_z = np.zeros((n_rays, 1)).astype(np.float32)
int_point_real_z = np.ascontiguousarray(int_point_real_z)
intersect = mod.get_function("intersect")
start = time.time()
intersect(drv.In(origin), drv.In(direction_x),drv.In(direction_y),drv.In(direction_z),drv.In(v0_x),drv.In(v0_y),drv.In(v0_z), drv.In(v1_x),drv.In(v1_y),drv.In(v1_z), drv.In(v2_x), drv.In(v2_y), drv.In(v2_z), drv.Out(int_point_real_x),drv.Out(int_point_real_y),drv.Out(int_point_real_z), block=(num_threads, 1, 1), grid=((num_blocks+0), 1, 1))
finish = time.time()
print(finish-start)
I give as input some arrays whose size is 20k (dir_x, dir_y, dir_z) and I have as output 3 arrays (int_point_real_x,int_point_real_y,int_point_real_z) that have the same size as the above mentioned arrays (20k).
If n_rays is a multiple of num_threads, e.g. n_rays=19456 and num_threads=1024, then int_point_real_x_y_z are correctly filled by the kernel.
Otherwise, if n_rays is NOT a multiple of num_threads, e.g. n_rays=20000 (what I really need) and num_threads=1024, then int_point_real_x_y_z are filled by the kernel up to position 19455 and the 544 spots left in the array are not filled.
Does anyone know if this is a rule of CUDA?
If it's not, how could I modify my code in order to use an arbitrary size of input array (and not only multiple of num_threads)?
Thanks
your int(n_rays/num_threads) is rounding down
to fix this, you need to round up and then put a condition into the kernel to enforce that idx is valid and "do nothing" if it's not. this will cause some cores to waste time, but your code looks pretty suboptimal anyway so it probably won't matter much

Categories