I would like to implement a variant of convolution in PyCUDA.
For simplicity, I'll demonstrate with a rectangular (box) interpolation kernel.
The standard convolution can be applied as follows:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule
mod = SourceModule("""
#include <stdio.h>

__global__ void func(float *dest, float *a)
{
    const int img_size = 64;
    const int kernel_size = 3;
    const int kernel_size_half = kernel_size/2;
    const int tx = blockIdx.x * blockDim.x + threadIdx.x;
    const int ty = blockIdx.y * blockDim.y + threadIdx.y;
    int tx_kernel;

    // gather: each thread averages its horizontal neighbours into its own pixel
    for (int idx = -kernel_size_half; idx <= kernel_size_half; idx++)
    {
        tx_kernel = tx + idx;
        if ((tx_kernel < 0) || (tx_kernel > img_size - 1))
            continue;
        dest[ty * img_size + tx] += a[ty * img_size + tx_kernel] / ((float) kernel_size);
    }
}
""")
Instead of calculating the current position from its neighbours, I would like to do the opposite:
add the value of the current pixel to its neighbours.
I.e.:
to change the line:
dest[ty * img_size + tx] += a[ty * img_size + tx_kernel] / ((float) kernel_size);
to:
dest[ty * img_size + tx_kernel] += a[ty * img_size + tx] / ((float) kernel_size);
However, while the first works fine, the second does not: it fails to update the neighbours correctly.
Is there a way around this?
Note:
I simplified the question to focus on what I need;
the general problem is to use a different convolution kernel for each pixel, instead of the same one everywhere as asked above.
The first method is preferred from a performance perspective. However, if you wish to "update the neighbors", then it should be possible to recast the second operation as:
atomicAdd(&(dest[ty * img_size + tx_kernel]), a[ty * img_size + tx] / ((float) kernel_size));
Related
What is a correct way to do matrix multiplication using ctypes?
In my current implementation, data going back and forth consumes a lot of time. Is there any way to do it optimally, i.e. by passing the array address and getting a pointer in return, instead of regenerating the entire array via the .contents method?
cpp_function.cpp
compile using g++ -shared -fPIC cpp_function.cpp -o cpp_function.so
#include <iostream>

extern "C" {
double* mult_matrix(double *a1, double *a2, size_t a1_h, size_t a1_w,
                    size_t a2_h, size_t a2_w, int size)
{
    double* ret_arr = new double[size];
    // row-major layout: element (i, j) of an h x w matrix lives at [i * w + j]
    for(size_t i = 0; i < a1_h; i++){
        for (size_t j = 0; j < a2_w; j++) {
            double val = 0;
            for (size_t k = 0; k < a2_h; k++){
                val += a1[i * a1_w + k] * a2[k * a2_w + j];
            }
            ret_arr[i * a2_w + j] = val;
        }
    }
    return ret_arr;
}
}
Python file to call the so file
main.py
import ctypes
import numpy
from time import time

libmatmult = ctypes.CDLL("./cpp_function.so")

ND_POINTER_1 = numpy.ctypeslib.ndpointer(dtype=numpy.float64,
                                         ndim=2,
                                         flags="C")
ND_POINTER_2 = numpy.ctypeslib.ndpointer(dtype=numpy.float64,
                                         ndim=2,
                                         flags="C")
# one argtype per C parameter: two matrices, four size_t dimensions, one int size
libmatmult.mult_matrix.argtypes = [ND_POINTER_1, ND_POINTER_2,
                                   ctypes.c_size_t, ctypes.c_size_t,
                                   ctypes.c_size_t, ctypes.c_size_t,
                                   ctypes.c_int]

def mult_matrix_cpp(a, b):
    shape = a.shape[0] * a.shape[1]
    libmatmult.mult_matrix.restype = ctypes.POINTER(ctypes.c_double * shape)
    ret_cpp = libmatmult.mult_matrix(a, b, *a.shape, *b.shape, a.shape[0] * a.shape[1])
    out_list_c = [i for i in ret_cpp.contents]  # <---- regenerating the list, which is time consuming
    return out_list_c
size_a = (300,300)
size_b = size_a
a = numpy.random.uniform(low=1, high=255, size=size_a)
b = numpy.random.uniform(low=1, high=255, size=size_b)
t2 = time()
out_cpp = mult_matrix_cpp(a,b)
print("cpp time taken:{:.2f} ms".format((time() - t2) * 1000))
out_cpp = numpy.array(out_cpp).reshape(size_a[0], size_a[1])
t3 = time()
out_np = numpy.dot(a,b)
# print(out_np)
print("Numpy dot() time taken:{:.2f} ms".format((time() - t3) * 1000))
This solution works, but it is time consuming. Is there any way to make it faster?
One reason for the time consumption is not using an ndpointer for the return value and instead copying it into a Python list. Use the following restype; you won't need the subsequent reshape either. But take the commenters' advice and don't reinvent the wheel.
def mult_matrix_cpp(a, b):
    libmatmult.mult_matrix.restype = numpy.ctypeslib.ndpointer(dtype=numpy.float64, ndim=2, shape=a.shape, flags="C")
    return libmatmult.mult_matrix(a, b, *a.shape, *b.shape, a.shape[0] * a.shape[1])
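As a quick check, the result can be compared against numpy directly (a sketch reusing the a and b arrays defined above):

out_cpp = mult_matrix_cpp(a, b)
out_np = numpy.dot(a, b)
print(numpy.allclose(out_cpp, out_np))  # should print True

Note that the C++ side still allocates the result with new on every call and nothing ever frees it; for repeated calls, passing in a caller-allocated output array would avoid the leak.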
So I'm trying to implement the Hough transform using Python and C++ (using PyBind11 for interfacing between the two languages). When I'm plotting the Hough space it seems alright, but I just can't get a line from the maximum of the voting matrix.
Here is the C++ code (it looks a bit different because I use PyBind11):
py::array_t<int> houghTransform(py::array_t<int> image, int angleStep, int angleAmount) {
    auto imageBuf = image.mutable_unchecked<3>();
    int height = imageBuf.shape(0);
    int width = imageBuf.shape(1);

    py::array_t<int> edgeMatrix = edgeDetect(imageBuf, height, width);
    auto edgeMatrixBuf = edgeMatrix.mutable_unchecked<2>();

    int distanceAxis = 2 * sqrt(pow((float) height, 2.0) + pow((float) width, 2.0));
    int angleAxis = angleAmount;
    int angleDim = (int) angleAxis / angleStep;
    int distanceDim = (int) distanceAxis / 2;

    py::array_t<int> votingMatrix = py::array_t<int>({distanceAxis, angleDim});
    auto votingMatrixBuf = votingMatrix.mutable_unchecked<2>();

    // fill voting matrix with zeros
    for(int i=0; i<distanceDim; i++) {
        for(int j=0; j<angleDim; j++) {
            votingMatrixBuf(i, j) = 0;
        }
    }

    // vote
    for(int x=0; x<edgeMatrixBuf.shape(0); x++) {
        for(int y=0; y<edgeMatrixBuf.shape(1); y++) {
            if(edgeMatrixBuf(x, y) == 1) {
                int counter = 0;
                float theta;
                float ro;
                for(int thetaIdx=0; thetaIdx<=angleAxis; thetaIdx++) {
                    if(thetaIdx % angleStep == 0) {
                        counter++;
                        theta = (float) (thetaIdx) * (M_PI / 180);
                        ro = distanceDim + std::round((x * cos(theta)) + (y * sin(theta)));
                        votingMatrixBuf(ro, counter) += 1;
                    }
                }
            }
        }
    }
    return votingMatrix;
}
As you can see, the arguments of the function are the image matrix (which I transform into a matrix where the edges are 1 and the rest 0, so I get my pixels of interest), int angleAmount, which is the angle range I want to try out, and int angleStep, which controls how many of those angles I really want to use (for example, skip every second theta). For this example I will use angleAmount = 360 and angleStep = 1, so I will use all angles from 1 to 360.
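For reference, the voting step implements the usual line parametrisation rho = x*cos(theta) + y*sin(theta); in plain numpy it might look roughly like this (a sketch, assuming an edge image of 0s and 1s and the same rho offset of distanceDim as in the C++ code):

import numpy as np

def hough_vote(edges, angle_amount=360, angle_step=1):
    # edges: 2D array of 0s and 1s marking the pixels of interest
    height, width = edges.shape
    distance_axis = int(2 * np.sqrt(height**2 + width**2))
    distance_dim = distance_axis // 2
    thetas = np.deg2rad(np.arange(0, angle_amount + 1, angle_step))
    votes = np.zeros((distance_axis, len(thetas)), dtype=int)
    for x, y in zip(*np.nonzero(edges)):  # x indexes rows, y columns, as in the C++
        # one rho per theta, shifted by distance_dim so negative rhos stay in range
        rhos = distance_dim + np.round(x * np.cos(thetas) + y * np.sin(thetas)).astype(int)
        votes[rhos, np.arange(len(thetas))] += 1
    return votes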
Here is the python code:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import time
from houghTransform import houghTransform

def apply_hough_transform(image_path: str=""):
    image = np.array(Image.open(image_path))
    lines = houghTransform(image, 1, 360)
    p = np.unravel_index(lines.argmax(), lines.shape)
    max_distance = 2 * np.sqrt(pow(image.shape[0], 2) + pow(image.shape[1], 2))
    ro = p[0] - (max_distance / 2)
    theta = p[1] * (np.pi / 180)
    a = np.cos(theta)
    b = np.sin(theta)
    x = a * ro
    y = b * ro
    pt1 = (int(x + 1000*(-b)), int(y + 1000*(a)))
    pt2 = (int(x - 1000*(-b)), int(y - 1000*(a)))
    fig, axs = plt.subplots(2)
    axs[0].matshow(lines)
    axs[0].scatter(p[1], p[0], facecolors="none", edgecolors="r")
    axs[1].plot([pt1[0], pt2[0]], [pt1[1], pt2[1]])
    axs[1].imshow(image)
    plt.show()

apply_hough_transform(image_path="images/black_line.png")
The function houghTransform is the one from the C++ code, which I exported to Python using PyBind11.
Here are the images:
I also tried to create the line using this function:
def line(x):
    return -(1 / np.arctan(theta)) * (x - ro * np.cos(theta)) + ro * np.sin(theta)
But it also didn't work.
Can you spot my error? I've been sitting on this for quite some time, so help is really appreciated!
Recently I have been trying to do some image processing for my work.
Unfortunately, I keep trying to port my old C++ code (which uses cv) to Python code using cv2,
and it doesn't work very well... Can anyone help me?
Original C++ Code:
#define IMAGE_WIDE 40
#define IMAGE_LENGTH 30
#define CHANNELS 3

DNN_image_out = cvCreateImage(cvSize(IMAGE_WIDE, IMAGE_LENGTH), IPL_DEPTH_8U, 3);
for(int k = 0; k < IMAGE_LENGTH; k++){      // vertical
    for(int l = 0; l < IMAGE_WIDE; l++){    // horizontal
        DNN_image_out[i]->imageData[(k * IMAGE_WIDE + l)*3 + 0] = DNN_image_tmp[(k * IMAGE_WIDE + l)*3 + 0];
        DNN_image_out[i]->imageData[(k * IMAGE_WIDE + l)*3 + 1] = DNN_image_tmp[(k * IMAGE_WIDE + l)*3 + 1];
        DNN_image_out[i]->imageData[(k * IMAGE_WIDE + l)*3 + 2] = DNN_image_tmp[(k * IMAGE_WIDE + l)*3 + 2];
    }
}
My Python CV2 code:
import numpy as np
import cv2

def split_channel3(array, width, height):
    R = []
    G = []
    B = []
    for k in range(height):
        for l in range(width):
            R.append(array[(k * width + l)*3 + 0])
            G.append(array[(k * width + l)*3 + 1])
            B.append(array[(k * width + l)*3 + 2])
    R = np.asarray(R)
    G = np.asarray(G)
    B = np.asarray(B)
    return [R, G, B]

[R, G, B] = split_channel3(img, 40, 30)
R = R.reshape(40, 30, 1)
G = G.reshape(40, 30, 1)
B = B.reshape(40, 30, 1)
Color_img = np.dstack((R, G))
Color_img = np.dstack((Color_img, B))
cv2.imshow('image', Color_img)
cv2.waitKey(0)
Is my logic wrong? Or what should I change in the Python code?
You can simply use cv2.split, without the need for your custom function or the reshape:
B,G,R = cv2.split(img)
and then eventually:
Color_img = cv2.merge((B,G,R))
Remember that the channels are B,G,R by default in OpenCV, not R,G,B.
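If the starting point is a flat interleaved buffer (as in the C++ code) rather than an already-shaped image, a plain numpy reshape may be all that is needed before calling any cv2 function; a minimal sketch, assuming the buffer (called raw here, a hypothetical name) holds 30 rows x 40 columns x 3 interleaved channels in B,G,R order:

import numpy as np
import cv2

# 'raw' is assumed: a flat uint8 buffer of length 30*40*3, interleaved BGR
img = np.frombuffer(raw, dtype=np.uint8).reshape(30, 40, 3)  # (height, width, channels)
cv2.imshow('image', img)
cv2.waitKey(0)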
I'm attempting to implement the discrete time wave equation in OpenCL. I think I'm pretty close, but the results look like what I would expect from the heat equation. I know they're very similar, but when I've implemented the 2D wave equation (not using OpenCL) I got distinct wavefronts and reflections. With the OpenCL kernel below everything diffuses until it is a wash.
__kernel void wave_calc(
    __global float* height,
    __global float* height_old,
    const unsigned int len_x,
    const unsigned int len_y,
    const unsigned int len_z,
    const float dtxc_term)
{
    unsigned int x = get_global_id(0);
    unsigned int y = get_global_id(1);
    unsigned int z = get_global_id(2);
    int this_cell = x + len_y * (y + len_x * z);
    float laplacian;

    if (x==0 || x==(len_x-1) || y==0 || y==(len_y-1) || z==0 || z==(len_z-1)) {
        // boundary cells are clamped to zero
        laplacian = 0;
        height_old[this_cell] = height[this_cell];
        height[this_cell] = 0;
    }
    else if ( x < len_x-1 && y < len_y-1 && z < len_z-1 ){
        // indices of the six face neighbours
        int n1 = x - 1 + len_y * (y + len_x * z);
        int n2 = x + 1 + len_y * (y + len_x * z);
        int n3 = x + len_y * (y - 1 + len_x * z);
        int n4 = x + len_y * (y + 1 + len_x * z);
        int n5 = x + len_y * (y + len_x * (z - 1));
        int n6 = x + len_y * (y + len_x * (z + 1));
        laplacian = -6 * height[this_cell] +
                    height[n1] + height[n2] + height[n3] +
                    height[n4] + height[n5] + height[n6];
        height_old[this_cell] = height[this_cell];
        height[this_cell] = (dtxc_term*laplacian + 2*height[this_cell]) - height_old[this_cell];
    }
}
(DTXC is the result of ((DT * DT)/(DX * DX)) * C passed from the host)
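For concreteness, a minimal sketch of how that host-side constant might be computed (the DT, DX and C values here are illustrative assumptions, not taken from the original code):

import numpy as np

DT = 0.001    # assumed time step
DX = 1.0/64   # assumed grid spacing
C = 1.0       # assumed wave speed
DTXC = np.float32(((DT * DT) / (DX * DX)) * C)  # matches the kernel's dtxc_term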
Every step I copy height back to the host for plotting, and then call the function again.
for i in np.arange(steps):
    # copy height from host to device
    cl.enqueue_copy(queue, d_height, h_height)
    # step once
    wave_calc(queue, field_3d.shape, None, d_height, d_height_old, LEN_X, LEN_Y, LEN_Z, DTXC)
    queue.finish()
    # copy height back
    cl.enqueue_copy(queue, h_height, d_height)
    # do my plotting
Any thoughts/suggestions/condescending remarks? All would be appreciated. :)
Here is an update to answer Joel's question:
I'm not much good when it comes to calculus, but I'm taking a working 2D C++ implementation and trying to adapt it for 3D. Below is the C++; the only modification I made was to the loop, since there are 6 neighbour cells in 3D instead of 4. In both cases the outer walls of the plane/cube are set to 0:
for(int x=1; x<field.xRes()-1; x++) {
    for (int y=1; y<field.yRes()-1; y++) {
        laplacian(x,y) = -4 * height(x,y) +
                         height(x-1,y) +
                         height(x+1,y) +
                         height(x,y-1) +
                         height(x,y+1);
    }
}

const float dt = 0.001;
const float xLen = 1.0;
const float C = 1.0;
const float dx = xLen/xRes;

// the old field is saved to backup before height is overwritten,
// and heightOld is only replaced after the subtraction
backup = height;
height = ((dt*dt)/(dx*dx))*C*laplacian + 2*height;
height = height - heightOld;
heightOld = backup;
I have this code for matrix multiplication using pyopencl.
My problem is that the result is wrong for some matrices, and I don't understand why.
After some research I think it's related to the global size or something like that, but I don't understand how to set those values.
For example:
matrices using numpy dtype = float32
matrix 1:
[[ 0.99114645 0.09327769 0.90075564 0.8913309 ]
[ 0.59739089 0.13906649 0.94246316 0.65673178]
[ 0.24535166 0.68942326 0.41361505 0.5789603 ]
[ 0.31962237 0.17714553 0.49025267 0.21861202]]
matrix2:
[[ 0.41509482 0.82779616 0.74143827 0.37681136]
[ 0.88058949 0.01039944 0.4342753 0.45752665]
[ 0.60375261 0.21243185 0.88312167 0.97394323]
[ 0.60855824 0.69482827 0.61627114 0.57155776]]
expected result:
[[ 1.57981943 1.63210835 2.12016045 1.80288424]
[ 1.3391085 1.15248911 1.7403561 1.58199609]
[ 1.31099532 0.70041376 1.20338154 1.14162762]
[ 0.71769556 0.52246746 0.88158722 0.8039138 ]]
script result:
[[ 1.20828819 0.73175305 1.64546931 1.42526579]
[ 1.13179159 0.46403384 1.20692348 1.14317513]
[ 1.25328159 0.86723316 1.58679342 1.40186214]
[ 1.35214019 0.6795128 1.73811913 1.48048854]]
script:
def openCL_multiplication(matrix1, matrix2, res):
    import pyopencl as cl
    import numpy as np
    import numpy.linalg as la
    import datetime

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=matrix1)
    b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=matrix2)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, matrix1.nbytes)

    prg = cl.Program(ctx, """
    __kernel void multiplymatrices(const unsigned int size, __global float * matrix1, __global float * matrix2, __global float * res) {
        int i = get_global_id(1);
        int j = get_global_id(0);
        res[i + size * j] = 0;
        for (int k = 0; k < size; k++)
        {
            res[i + size * j] += matrix1[i + size * k] * matrix2[k + size * j];
        }
    }
    """).build()

    t0 = datetime.datetime.now()
    prg.multiplymatrices(queue, matrix1.shape, None, np.int32(len(matrix1)), a_buf, b_buf, dest_buf)

    final_matrix = np.empty_like(matrix1)
    cl.enqueue_copy(queue, final_matrix, dest_buf)
    print(final_matrix)

    delta_t = datetime.datetime.now() - t0
    print('OpenCL Multiplication: ' + str(delta_t))

    return final_matrix
Thank you!
Well, I think the kernel is all right.
I could even call the script result correct. It all depends on how you treat your matrices :-)
If you want your expected result, I'd change this:
res[i + size * j] += matrix1[i + size * k] * matrix2[k + size * j];
to this:
res[i + size * j] += matrix1[k + size * i] * matrix2[j + size * k];
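To see why the index order matters, note that for a C-contiguous numpy array the flat index i + size * j addresses row j, column i; a small sketch to verify the mapping (the 4x4 size is just for illustration):

import numpy as np

size = 4
m = np.arange(size * size, dtype=np.float32).reshape(size, size)
flat = m.ravel()  # C-contiguous flattening
i, j = 1, 2
print(flat[i + size * j] == m[j, i])  # True: i + size*j is row j, column i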
Hope this helps.