PyOpenCL Matrix multiplication

PyOpenCL Matrix multiplication - python

I have this code for matrix multiplication using PyOpenCL.
My problem is that the result is wrong for some matrices, and I don't understand why.
After some research, I think it is related to the global size or something like that, but I don't understand how to set those values.
For example:
matrices using numpy dtype = float32
matrix 1:
[[ 0.99114645 0.09327769 0.90075564 0.8913309 ]
[ 0.59739089 0.13906649 0.94246316 0.65673178]
[ 0.24535166 0.68942326 0.41361505 0.5789603 ]
[ 0.31962237 0.17714553 0.49025267 0.21861202]]
matrix2:
[[ 0.41509482 0.82779616 0.74143827 0.37681136]
[ 0.88058949 0.01039944 0.4342753 0.45752665]
[ 0.60375261 0.21243185 0.88312167 0.97394323]
[ 0.60855824 0.69482827 0.61627114 0.57155776]]
expected result:
[[ 1.57981943 1.63210835 2.12016045 1.80288424]
[ 1.3391085 1.15248911 1.7403561 1.58199609]
[ 1.31099532 0.70041376 1.20338154 1.14162762]
[ 0.71769556 0.52246746 0.88158722 0.8039138 ]]
script result:
[[ 1.20828819 0.73175305 1.64546931 1.42526579]
[ 1.13179159 0.46403384 1.20692348 1.14317513]
[ 1.25328159 0.86723316 1.58679342 1.40186214]
[ 1.35214019 0.6795128 1.73811913 1.48048854]]
script:
def openCL_multiplication(matrix1, matrix2, res):
    """Run the ``multiplymatrices`` OpenCL kernel on two square matrices.

    Args:
        matrix1: NumPy array uploaded as the kernel's ``matrix1`` argument.
            The kernel declares ``float`` pointers, so float32 data is
            expected; ``len(matrix1)`` is passed as ``size`` for both
            dimensions, so the matrix is expected to be square.
        matrix2: NumPy array uploaded as the kernel's ``matrix2`` argument.
        res: Unused; kept so existing callers keep working.

    Returns:
        A new array shaped like ``matrix1`` holding the kernel output.

    NOTE(review): the kernel source is deliberately left unchanged. With
    ``i = get_global_id(1)`` and ``j = get_global_id(0)`` it accumulates
    ``matrix1[i + size * k] * matrix2[k + size * j]`` into
    ``res[i + size * j]``. For C-ordered (row-major) input that reads element
    (k, i) of matrix1 and element (j, k) of matrix2 and writes row j,
    column i, so the result is matrix2 . matrix1, not matrix1 . matrix2.
    """
    # Imported locally, like the other dependencies of this function, so the
    # timing code below does not rely on a module-level import.
    import datetime

    import numpy as np
    import pyopencl as cl

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=matrix1)
    b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=matrix2)
    # The kernel updates `res` with `+=`, i.e. it reads the buffer back.
    # Reading a WRITE_ONLY buffer inside a kernel is undefined in OpenCL, so
    # the destination has to be READ_WRITE.
    dest_buf = cl.Buffer(ctx, mf.READ_WRITE, matrix1.nbytes)
    prg = cl.Program(ctx, """
__kernel void multiplymatrices(const unsigned int size, __global float * matrix1, __global float * matrix2, __global float * res) {
int i = get_global_id(1);
int j = get_global_id(0);
res[i + size * j] = 0;
for (int k = 0; k < size; k++)
{
res[i + size * j] += matrix1[i + size * k] * matrix2[k + size * j];
}
}
""").build()
    t0 = datetime.datetime.now()
    # One work-item per output element: the global size is the matrix shape.
    prg.multiplymatrices(queue, matrix1.shape, None, np.int32(len(matrix1)), a_buf, b_buf, dest_buf)
    final_matrix = np.empty_like(matrix1)
    cl.enqueue_copy(queue, final_matrix, dest_buf)
    # Single-argument print calls behave the same under Python 2 and 3.
    print(final_matrix)
    delta_t = datetime.datetime.now() - t0
    print('OpenCL Multiplication: ' + str(delta_t))
    return final_matrix
Thank you!

Well, I think the kernel itself does what it was written to do.
I would even call the script result correct: it all depends on how you treat your matrices (e.g. row-major vs. column-major storage) :-)
If you want your expected result, I'd change this:
res[i + size * j] += matrix1[i + size * k] * matrix2[k + size * j];
to this:
res[i + size * j] += matrix1[k + size * i] * matrix2[j + size * k];
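Hope this helps. (Note: because `i = get_global_id(1)` and `j = get_global_id(0)`, `res[i + size * j]` is row `j`, column `i` of a C-ordered NumPy array, so the line above stores the product transposed; using `matrix1[k + size * j] * matrix2[i + size * k]` on the right-hand side yields matrix1 · matrix2 directly.)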

Related

Matrix multiplication of a 2d numpy array to cpp using ctypes

What is the correct way to do matrix multiplication using ctypes?
In my current implementation the data goes back and forth, which consumes a lot of time. Is there any way to do it optimally, by passing the array address and getting a pointer in return instead of generating the entire array using the .contents method?
cpp_function.cpp
compile using g++ -shared -fPIC cpp_function.cpp -o cpp_function.so
#include <iostream>
extern "C" {
// Multiply two dense row-major matrices: (a1_h x a1_w) * (a2_h x a2_w).
//
// a1, a2      : pointers to the row-major input data.
// a1_h, a1_w  : rows / columns of the left operand.
// a2_h, a2_w  : rows / columns of the right operand; a2_h must equal a1_w.
// size        : element count the caller expects in the result buffer. The
//               buffer is never smaller than a1_h * a2_w, so an undersized
//               request cannot cause out-of-bounds writes.
//
// Returns a buffer allocated with new[] holding the a1_h x a2_w product in
// row-major order (extra elements, if any, are zero), or nullptr when the
// inner dimensions do not match. The caller owns the buffer; nothing in
// this function releases it.
double* mult_matrix(double *a1, double *a2, size_t a1_h, size_t a1_w,
                    size_t a2_h, size_t a2_w, int size)
{
    if (a1_w != a2_h) {
        return nullptr;
    }

    const size_t needed = a1_h * a2_w;
    const size_t requested = size > 0 ? static_cast<size_t>(size) : 0;
    const size_t out_len = requested > needed ? requested : needed;
    double* ret_arr = new double[out_len]();

    for (size_t i = 0; i < a1_h; i++) {
        for (size_t j = 0; j < a2_w; j++) {
            double val = 0;
            for (size_t k = 0; k < a1_w; k++) {
                // The row stride of a row-major matrix is its width, not its
                // height; using the height only works for square matrices.
                val += a1[i * a1_w + k] * a2[k * a2_w + j];
            }
            ret_arr[i * a2_w + j] = val;
        }
    }
    return ret_arr;
}
}
Python file to call the so file
main.py
import ctypes
import numpy
from time import time
# Load the shared library built from cpp_function.cpp.
libmatmult = ctypes.CDLL("./cpp_function.so")

# Both operands are passed as C-contiguous 2-D float64 arrays.
ND_POINTER_1 = numpy.ctypeslib.ndpointer(dtype=numpy.float64,
                                         ndim=2,
                                         flags="C")
ND_POINTER_2 = numpy.ctypeslib.ndpointer(dtype=numpy.float64,
                                         ndim=2,
                                         flags="C")

# Declare every parameter of
#   mult_matrix(double*, double*, size_t, size_t, size_t, size_t, int)
# so ctypes converts each dimension to size_t rather than falling back to
# its default C int conversion for the undeclared trailing arguments.
libmatmult.mult_matrix.argtypes = [
    ND_POINTER_1,     # a1
    ND_POINTER_2,     # a2
    ctypes.c_size_t,  # a1_h
    ctypes.c_size_t,  # a1_w
    ctypes.c_size_t,  # a2_h
    ctypes.c_size_t,  # a2_w
    ctypes.c_int,     # size
]
def mult_matrix_cpp(a, b):
    """Multiply ``a`` by ``b`` via the native ``mult_matrix`` routine.

    The native result is exposed as a pointer to a ``c_double`` array with
    ``a.shape[0] * a.shape[1]`` elements and is returned as a flat Python
    list of floats.
    """
    n_elements = a.shape[0] * a.shape[1]
    native_fn = libmatmult.mult_matrix
    native_fn.restype = ctypes.POINTER(ctypes.c_double * n_elements)
    result_ptr = native_fn(a, b, *a.shape, *b.shape, n_elements)
    # Copying element by element into a Python list is the time-consuming
    # part of the round trip.
    return list(result_ptr.contents)
# Benchmark: multiply two random 300x300 float matrices with the C++ routine
# and with numpy.dot, printing the wall-clock time of each.
size_a = (300,300)
size_b = size_a
a = numpy.random.uniform(low=1, high=255, size=size_a)
b = numpy.random.uniform(low=1, high=255, size=size_b)
# Time the ctypes call, including the conversion done inside mult_matrix_cpp.
t2 = time()
out_cpp = mult_matrix_cpp(a,b)
print("cpp time taken:{:.2f} ms".format((time() - t2) * 1000))
# The reshape back to 2-D happens after the timer is read, so it is not part
# of the reported C++ time.
out_cpp = numpy.array(out_cpp).reshape(size_a[0], size_a[1])
# Reference timing with NumPy's own matrix product.
t3 = time()
out_np = numpy.dot(a,b)
# print(out_np)
print("Numpy dot() time taken:{:.2f} ms".format((time() - t3) * 1000))
This solution works but is time-consuming. Is there any way to make it faster?

One reason for the time consumption is not using an ndpointer for the return value and copying it into a Python list. Instead use the following restype. You won't need the later reshape as well. But take the commenters' advice and don't reinvent the wheel.
def mult_matrix_cpp(a, b):
    """Multiply ``a`` by ``b`` via the native ``mult_matrix`` routine.

    The return type is declared as an ``ndpointer``, so ctypes hands back a
    NumPy array of shape ``a.shape`` directly; no Python list is built and
    no reshape is needed afterwards.

    NOTE(review): the native routine allocates its result with ``new[]`` and
    nothing here releases it; confirm how that buffer is meant to be freed.
    """
    # Imported locally so the function works whether the surrounding module
    # uses ``import numpy`` or ``import numpy as np``.
    import numpy

    n_elements = a.shape[0] * a.shape[1]
    libmatmult.mult_matrix.restype = numpy.ctypeslib.ndpointer(
        dtype=numpy.float64, ndim=2, shape=a.shape, flags="C")
    return libmatmult.mult_matrix(a, b, *a.shape, *b.shape, n_elements)

use restype
def mult_matrix_cpp(a, b):
    """Multiply ``a`` by ``b`` via the native ``mult_matrix`` routine.

    The return type is declared as an ``ndpointer``, so ctypes hands back a
    NumPy array of shape ``a.shape`` directly; no Python list is built and
    no reshape is needed afterwards.

    NOTE(review): the native routine allocates its result with ``new[]`` and
    nothing here releases it; confirm how that buffer is meant to be freed.
    """
    # Imported locally so the function works whether the surrounding module
    # uses ``import numpy`` or ``import numpy as np``.
    import numpy

    n_elements = a.shape[0] * a.shape[1]
    libmatmult.mult_matrix.restype = numpy.ctypeslib.ndpointer(
        dtype=numpy.float64, ndim=2, shape=a.shape, flags="C")
    return libmatmult.mult_matrix(a, b, *a.shape, *b.shape, n_elements)

plus equal (+=) operator in pycuda

I would like to implement a variant of convolution in PyCUDA.
For simplicity, I'll show a rectangular (box) kernel for the interpolation.
The standard convolution can be applied as follows:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule
# Compile the CUDA kernel `func(dest, a)`.
# Each thread handles one pixel (tx, ty) of an image whose row stride is the
# hard-coded img_size (64). It walks a 3-tap window along x, skips taps that
# fall outside [0, img_size - 1], and adds a[...] / kernel_size for each
# remaining tap to its own dest element.
# NOTE(review): dest is only ever updated with `+=`, so it presumably has to
# be zero-initialised by the caller; tx and ty themselves are not checked
# against the image bounds.
mod = SourceModule("""
#include <stdio.h>
__global__ void func(float *dest, float *a)
{
const int img_size = 64;
const int kernel_size = 3;
const int kernel_size_half = kernel_size/2;
const int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int ty = blockIdx.y * blockDim.y + threadIdx.y;
int tx_kernel;
tx_kernel = tx - kernel_size_half;
for (int idx=-kernel_size_half; idx <= kernel_size_half; idx++)
{
tx_kernel = tx + idx ;
if ((tx_kernel < 0) || (tx_kernel > img_size-1))
continue;
dest[ty * img_size + tx] += a[ty * img_size + tx_kernel] / ((float) kernel_size);
}
}
""")
Instead of calculating the current position wrt neighbours, I would like to do the opposite,
to add the value of the current pixel to the neighbours.
I.e:
to change the line:
dest[ty * img_size + tx] += a[ty * img_size + tx_kernel] / ((float) kernel_size);
to:
dest[ty * img_size + tx_kernel] += a[ty * img_size + tx] / ((float) kernel_size);
However, the first works fine but the second does not: it fails when updating the neighbours.
Is there a way to work around it?
Note:
I simplified the question to focus on what I need,
the general problem is to use a different convolution kernel for each pixel instead of same one as I asked in the question.

to change the line:
dest[ty * img_size + tx] += a[ty * img_size + tx_kernel] / ((float) kernel_size);
to:
dest[ty * img_size + tx_kernel] += a[ty * img_size + tx] / ((float) kernel_size);
However, The first works fine but the second is not, it fails by updating the neighbours. Is there a way to bypass it?
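The first method is preferred from a performance perspective. The second one fails because several threads (those for `tx - 1`, `tx` and `tx + 1`) perform an unsynchronised read-modify-write on the same `dest` element. However, if you wish to "update the neighbors", it should be possible to recast the second operation as: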
atomicAdd(&(dest[ty * img_size + tx_kernel]), a[ty * img_size + tx] / ((float) kernel_size));

Hough transform: How to get lines from voting-matrix?

I'm trying to implement the Hough transform using Python and C++ (using pybind11 to interface between the two languages). When I plot the Hough space it looks all right, but I just can't get a line from the maximum of the voting matrix.
Here is the C++ code (it looks a bit different because I use pybind11):
// Build the Hough voting matrix (accumulator) for the edges of `image`.
//
// image       : 3-D integer array; shape(0) is used as height and shape(1)
//               as width.
// angleStep   : spacing in degrees between sampled angles (must be > 0).
// angleAmount : size of the angle range in degrees (must be > 0).
//
// Returns a 2-D array of shape {distanceAxis, angleAmount / angleStep}.
// Column `b` holds the votes for theta = b * angleStep degrees, and row `r`
// holds the votes for distance r - distanceAxis / 2, with
// ro = x * cos(theta) + y * sin(theta), where x indexes dimension 0 and y
// dimension 1 of the edge matrix.
//
// Throws py::value_error (ValueError in Python) for non-positive arguments.
py::array_t<int> houghTransform(py::array_t<int> image, int angleStep, int angleAmount) {
    if (angleStep <= 0 || angleAmount <= 0) {
        throw py::value_error("angleStep and angleAmount must be positive");
    }

    auto imageBuf = image.mutable_unchecked<3>();
    int height = imageBuf.shape(0);
    int width = imageBuf.shape(1);

    py::array_t<int> edgeMatrix = edgeDetect(imageBuf, height, width);
    auto edgeMatrixBuf = edgeMatrix.mutable_unchecked<2>();

    // Distances range over [-diagonal, +diagonal], hence twice the diagonal.
    int distanceAxis = 2 * sqrt(pow((float) height, 2.0) + pow((float) width, 2.0));
    int angleDim = angleAmount / angleStep;
    int distanceDim = distanceAxis / 2;

    py::array_t<int> votingMatrix = py::array_t<int>({distanceAxis, angleDim});
    auto votingMatrixBuf = votingMatrix.mutable_unchecked<2>();

    // Zero the WHOLE accumulator. The array has distanceAxis rows, so
    // clearing only the first distanceDim rows leaves the upper half
    // uninitialised and can make the argmax land on garbage.
    for (int i = 0; i < distanceAxis; i++) {
        for (int j = 0; j < angleDim; j++) {
            votingMatrixBuf(i, j) = 0;
        }
    }

    // Vote: every edge pixel contributes one vote per sampled angle.
    for (int x = 0; x < edgeMatrixBuf.shape(0); x++) {
        for (int y = 0; y < edgeMatrixBuf.shape(1); y++) {
            if (edgeMatrixBuf(x, y) != 1) {
                continue;
            }
            // Bins are 0-based and stay below angleDim, so bin b maps to
            // b * angleStep degrees and never indexes past the last column.
            for (int bin = 0; bin < angleDim; bin++) {
                float theta = (float) (bin * angleStep) * (M_PI / 180);
                int ro = distanceDim + (int) std::round((x * cos(theta)) + (y * sin(theta)));
                // Rounding can push a corner pixel just outside the matrix.
                if (ro >= 0 && ro < distanceAxis) {
                    votingMatrixBuf(ro, bin) += 1;
                }
            }
        }
    }
    return votingMatrix;
}
As you can see, the arguments of the function are the image matrix, which I transform into a matrix where the edges are 1 and the rest 0, so I get my pixels of interest.
int angleAmount is the angle range I want to try out, and int angleStep controls how many of the angles in that range I actually use (for example, to skip every second theta). For this example I will use angleAmount = 360 and angleStep = 1, so I will use all angles from 1 to 360.
Here is the python code:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import time
from houghTransform import houghTransform
def apply_hough_transform(image_path: str=""):
    """Run the Hough transform on an image and plot the strongest line.

    Loads ``image_path``, builds the voting matrix with ``houghTransform``,
    picks its maximum and draws both the voting matrix (with the maximum
    circled) and the detected line over the image.

    Args:
        image_path: Path of the image file to analyse.
    """
    angle_step = 1  # degrees per accumulator column, as passed to C++
    image = np.array(Image.open(image_path))
    lines = houghTransform(image, angle_step, 360)
    p = np.unravel_index(lines.argmax(), lines.shape)

    # The C++ side stores distance d in row d + distanceAxis / 2 (integer
    # division), and distanceAxis is the accumulator's row count. Deriving
    # the offset from the shape avoids a floating-point recomputation that
    # can differ from the truncated C++ value.
    ro = p[0] - lines.shape[0] // 2
    theta = p[1] * angle_step * (np.pi / 180)
    a = np.cos(theta)
    b = np.sin(theta)

    # The C++ voting loop uses x for dimension 0 (rows) and y for dimension 1
    # (columns) of the edge matrix, so this point and the direction (-b, a)
    # are in (row, col) order.
    # NOTE(review): assumes edgeDetect returns a (height, width) array;
    # confirm against its implementation.
    row0 = a * ro
    col0 = b * ro
    pt1 = (int(row0 + 1000*(-b)), int(col0 + 1000*(a)))
    pt2 = (int(row0 - 1000*(-b)), int(col0 - 1000*(a)))

    fig, axs = plt.subplots(2)
    axs[0].matshow(lines)
    axs[0].scatter(p[1], p[0], facecolors="none", edgecolors="r")
    # Matplotlib expects (horizontal, vertical) = (col, row).
    axs[1].plot([pt1[1], pt2[1]], [pt1[0], pt2[0]])
    axs[1].imshow(image)
    plt.show()
apply_hough_transform(image_path="images/black_line.png")
The function houghTransform is the same as in the c++ code which I exported to Python using PyBind11.
Here are the images:
I also tried to create the line using this function:
def line(x):
    """Return y on the line ``x*cos(theta) + y*sin(theta) == ro``.

    Reads the module-level ``theta`` (radians) and ``ro``. The slope of a
    line in normal form is ``-1 / tan(theta)``, so ``np.tan`` is required
    here; ``np.arctan`` is the inverse function and gives a wrong slope.
    For ``sin(theta) == 0`` the line is vertical and has no finite y.
    """
    return -(1 / np.tan(theta)) * (x - ro * np.cos(theta)) + ro * np.sin(theta)
But it also didn't work.
Can you spot my error? I've been sitting on this for quite some time, so help is really appreciated!

Cython code 3x slower than corresponding NumPy version

I'm currently writing my thesis on the use of particle filters for filtering out latent states in stochastic volatility models. To improve the filtering results I've added option prices as an observed process. This means that for a given time series, I have to calculate the option prices at each time step - a "normal" time series is 100-200 points.
Without going too deep into the algorithm, I'm having a serious problem with performance. The last for-loop loops over all of the particles that I use, which is somewhere around a 1,000 (as determined by M). Running this code for only one particle takes 0.25 seconds - which means that it takes around 4 minutes per time step to run using 1,000 particles (which is rather infeasible).
from __future__ import division
import numpy as np
import numexpr as ne
from fftInC import fft
import time
import math
import pyfftw
def HestonCallPrice(M, N, S, V, t, T, strikes, r, param, b, NFFT, inp, v, alphaC, eta, k, weights):
"""
This will be the pricing function for the European call option. Since we found the
quadrature procedure to be too slow we shall move on to use FFT instead.
So, we begin defining all of the constants etc.
"""
# Benchmark harness: the same pricing step is run twice, first through the
# Cython `fft` function and then through the NumPy/numexpr loop below, and
# the wall-clock time of each is printed. The value returned is the `prices`
# array filled by the NumPy loop; the Cython result is overwritten.
# Transposed / imaginary-scaled views of the inputs. vT and inpJT (below) are
# not referenced by name anywhere else in this function.
vT, weightsT, inpJ, vJT = v.T, weights.T, inp * 1j, v.T * 1j
# Rows 1-4 of `param`; p3 is the square root of row 3 (p3_2).
p1, p2, p3_2, p3, p4 = param[1,:], param[2,:], param[3,:], np.sqrt(param[3,:]), param[4,:]
"""
Next we move on to the calculations. These have been found to be rather fast, and hence do not
need any Cythonization.
"""
# numexpr evaluates each expression string against the local variables of the
# same names.
gamma = p3_2 / 2
beta = ne.evaluate("p1 - p4 * p3 * 1j * inp")
alpha = ne.evaluate("(-inp**2 - inpJ)/2")
d = ne.evaluate("sqrt(beta**2 - 4 * alpha * gamma)")
r_pos, r_neg = ne.evaluate("(beta + d)/(2 * gamma)"), ne.evaluate("(beta - d)/(2 * gamma)")
g, inpJT = ne.evaluate("r_neg / r_pos"), inpJ.T
D = ne.evaluate("r_neg * (1 - exp( -d * (T - t) ) ) / (1 - g * exp( -d * (T - t) ) )" )
C = ne.evaluate("p1 * (r_neg*(T - t) - 2 / p3_2 * log( (1 - g*exp(-d*(T - t)))/(1 - g) ) )")
A = 1j * inp.T * (math.log(S) + r * (T - t))
C_tmp = (C * p2).T
"""
The matrices and vectors that are sent into the Cython version of the code are
A = (1, 2048)
C_tmp = (4, 2048)
D.T = (4, 2048)
V = (4, 1000)
vJT[0, :] = (2048,)
k[:, 0] = (2048,)
weights.T[0, :] = (2048,)
This is now where we call the Cython script.
"""
# Timed run of the Cython implementation. NOTE: `prices` is rebound a few
# lines below, so only the timing of this call is actually used.
start = time.time()
prices = fft(A, float(r), float(t), float(T), C_tmp, D.T, V, float(alphaC), vJT[0, :], k[:, 0],
float(b), strikes, float(eta), weights.T[0, :])
print 'Cythonized version: ', time.time() - start, ' seconds'
"""
The below code is the original code which has been "cythonized".
"""
# Timed run of the NumPy/numexpr reference implementation.
start = time.time()
outPrices = np.empty( (M, N) )
prices = np.empty( (M * N, len(strikes)) )
"""
Regularly I use pyFFTW since it's a bit faster, but I couldn't figure out how to use the C
version of this, so to be fair when comparing speeds I disable pyFFTW. However, turning this on
using the below settings it's 20-30% faster.
"""
# fftIn = pyfftw.n_byte_align_empty((N, NFFT), 16, 'complex128')
#
# fftOut = fftIn.copy()
#
# fft_object = pyfftw.FFTW(fftIn, fftOut, nthreads=8)
for j in range( len(strikes) ):
# Fractional index of log(strike) on the grid k; the grid points at
# floor(position) and ceil(position) are used for interpolation.
position = (np.log(strikes[j]) + b) / ( 2 * b / NFFT)
x_1 = np.exp( k[ int(math.floor(position)) ] )
x_2 = np.exp( k[ int(math.ceil(position)) ] )
for m in range(M):
# Row m of C_tmp, column m of D and row m of V (as a column vector, so it
# can broadcast against the row vectors).
C_m, D_m, V_m = C_tmp[m, :], D[:, m].T, V[m, :][:, np.newaxis]
F_cT = ne.evaluate("exp( -r*(T - t) ) * exp(C_m + D_m * V_m + A) / \
( (alphaC + vJT) * (alphaC + 1 + vJT) )")
toFFT = ne.evaluate("exp( b * vJT ) * F_cT * eta / 3 * weightsT")
price = np.exp( -alphaC * k.T ) / math.pi * np.real ( np.fft.fft(toFFT) )
# Linear interpolation between the two neighbouring grid points.
y_1 = price[ :, int(math.floor(position)) ]
y_2 = price[ :, int(math.ceil(position)) ]
dydx = (y_2 - y_1)/(x_2 - x_1)
outPrices[m, :] = dydx * (strikes[j] - x_1) + y_1
# Flatten the (M, N) block into column j of the result.
prices[:, j] = outPrices.reshape(M * N)
print 'Non-cythonized version: ', time.time() - start, ' seconds'
return prices
" ------ Defining constants etc, nothing to say really ----- "
# Driver: builds the inputs and runs HestonCallPrice once.
M, N, S, t, T, r, NFFT, alphaC = 1, 1000, 1000, 0, 1, 0, 2048, 1.5
strikes = np.array([900, 1100])
# V is drawn as an (M, N) array of normal samples.
c, V = 600, np.random.normal(loc=0.2, scale=0.05, size=(M, N))
# Five parameter values, each repeated M times, giving a (5, M) array.
param = np.repeat(np.array([0.05, 0.5, 0.15, 0.15**2, 0]), M).reshape((5, M))
# True division here relies on `from __future__ import division`.
eta = c / NFFT
b = np.pi / eta
# Column vector 1..NFFT; v and k are the two grids derived from it.
j = np.arange(1, NFFT+1)[:, np.newaxis]
v, k = eta * (j - 1), -b + 2 * b/ NFFT*(j - 1)
inp = v - (alphaC + 1)*1j
# NOTE(review): this looks like Simpson's-rule weighting (it is later
# combined with eta / 3) - confirm.
weights = 3 + (-1)**j - np.array([1] + [0]*(NFFT-1))[:, np.newaxis]
" ------------------------------------------------------------- "
HestonCallPrice(M, N, S, V, t, T, strikes, r, param, b, NFFT, inp, v, alphaC, eta, k, weights)
I found that the bottleneck is the last for-loop. I got a tip to rewrite the for-loop in Cython instead, see below
" --------------------------------- C IMPORTED PACKAGES ------------------------------------------ "
from __future__ import division
import cython
cimport cython
import math
cimport numpy as np
import numpy as np
import pyfftw
" ------------------------------------------------------------------------------------------------ "
"""
I heard that the boundscheck and wraparound functions could improve the performance, but I didn't
notice any performance gain whatsoever.
"""
#cython.profile(False)
#cython.boundscheck(False)
#cython.wraparound(False)
# Cython port of the inner pricing loop of HestonCallPrice.
# For every strike j and every row m it fills toFFT row by row, applies
# np.fft.fft, and linearly interpolates the result at the strike. Returns
# `price`, an array of shape (M * N, len(strikes)).
# NOTE(review): `float` in a Cython signature is a C single-precision float,
# so r, t, T, alphaC, b and eta are narrowed on entry - confirm that is
# intended.
def fft(np.ndarray[double complex, ndim=2] A, float r, float t, float T,
np.ndarray[double complex, ndim=2] C, np.ndarray[double complex, ndim=2] D,
np.ndarray[double, ndim=2] V, float alphaC, np.ndarray[double complex, ndim=1] vJT,
np.ndarray[double, ndim=1] k, float b,
np.ndarray[long, ndim=1] strikes, float eta,
np.ndarray[long, ndim=1] weightsT):
# Problem sizes are taken from the array shapes.
cdef int M = V.shape[0]
cdef int N = V.shape[1]
cdef int NFFT = D.shape[1]
cdef np.ndarray[double complex, ndim=1] F_cT
cdef np.ndarray[double complex, ndim=2] toFFT = np.empty( (N, NFFT), dtype=complex)
cdef np.ndarray[double, ndim=2] prices
cdef float x_1, x_2, position
cdef np.ndarray[double, ndim=1] y_1
cdef np.ndarray[double, ndim=1] y_2
cdef np.ndarray[double, ndim=1] dydx
cdef int m, j, n
cdef np.ndarray[double, ndim=2] price = np.empty( (M * N, len(strikes)) )
# Only the first row of A is used.
cdef np.ndarray[double complex, ndim=1] A_inp = A[0, :]
for j in range( len(strikes) ):
# Fractional index of log(strike) on the grid k, as in the NumPy version.
position = (math.log(strikes[j]) + b) / ( 2 * b / NFFT)
x_1 = math.exp ( k[ int(math.floor(position)) ] )
x_2 = math.exp ( k[ int(math.ceil(position)) ] )
for m in range(M):
"""
M is the number of rows we have in A, C, D and V, so we need to loop over all of those.
"""
for n in range(N):
"""
Next we loop over all of the elements for each row in V, corresponding to N. For
us this corresponds to 1000 (if you haven't changed to N in the main program).
Each of the rows of A, C and D are 2048 in length. So I tried to loop over all of
those as well as for each n, but this made the code 4 times slower.
"""
# Each iteration evaluates whole-row NumPy expressions (np.exp on arrays of
# length NFFT), so this loop makes N separate NumPy calls per (j, m) where
# the NumPy version issues one broadcast expression over all N rows.
F_cT = math.exp( -r*(T - t) ) * np.exp (A_inp + C[m, :] + D[m, :] * V[m, n]) / \
( (alphaC + vJT) * (alphaC + 1 + vJT) )
toFFT[n, :] = np.exp (b * vJT) * F_cT * eta / 3 * weightsT
"""
I'm guessing FFT'ing is rather slow using NumPy in Cython?
"""
# np.fft.fft is called on the whole (N, NFFT) array at once.
prices = np.exp ( -alphaC * k ) / math.pi * np.real ( np.fft.fft(toFFT) )
# Linear interpolation between the two neighbouring grid points.
y_1 = prices[ :, int(math.floor(position)) ]
y_2 = prices[ :, int(math.ceil(position)) ]
dydx = (y_2 - y_1)/(x_2 - x_1)
# Rows m*N .. (m+1)*N - 1 of column j hold the results for row m of V.
price[m * N:(m + 1) * N, j] = dydx * (strikes[j] - x_1) + y_1
return price
I'm compiling the code as
from distutils.core import setup, Extension
from Cython.Distutils import build_ext
import numpy.distutils.misc_util
# The NumPy header directories are passed to the extension as include paths.
include_dirs = numpy.distutils.misc_util.get_numpy_include_dirs()

# Single extension module built from the Cython source file.
fft_extension = Extension('fftInC', ['fftInC.pyx'], include_dirs=include_dirs)

setup(
    name='fftInC',
    ext_modules=[fft_extension],
    cmdclass={'build_ext': build_ext},
)
But to my surprise, the Cython version is about 3x slower than the original one. And I can't really figure out where I'm going wrong. I think I've defined the input types correctly (which I understand should give a considerable performance boost).
My question is therefore: Can you identify where I'm going wrong? Is it the type definition, for-loops or FFT'ing (or something else)?

Stuck Trying to Implement 3D Wave Equation in PyOpenCL

I'm attempting to implement the discrete time wave equation in OpenCL. I think I'm pretty close, but the results look like what I would expect from the heat equation. I know they're very similar, but when I've implemented the 2D wave equation (not using OpenCL) I got distinct wavefronts and reflections. With the OpenCL kernel below everything diffuses until it is a wash.
// One explicit time step of the discrete 3-D wave equation.
//
// height      : field at the current step; overwritten with the next step.
// height_old  : field at the previous step; overwritten with the current one.
// len_x/y/z   : grid extents; cell (x, y, z) lives at x + len_x * (y + len_y * z).
// dtxc_term   : (dt*dt)/(dx*dx) * C, precomputed on the host.
//
// Cells on the outer faces are clamped to 0. Interior cells get
//   next = dtxc_term * laplacian + 2 * current - previous
// which needs the PREVIOUS value read before height_old is overwritten.
// Overwriting it first makes the subtraction cancel one `current`, leaving
// next = current + dtxc_term * laplacian, which is a diffusion step.
//
// NOTE(review): neighbours are read from `height` while other work-items of
// the same launch overwrite it in place, so a neighbour may already hold its
// next-step value. Removing that race needs a separate output buffer, which
// would change the kernel signature.
__kernel void wave_calc(
    __global float* height,
    __global float* height_old,
    const unsigned int len_x,
    const unsigned int len_y,
    const unsigned int len_z,
    const float dtxc_term)
{
    unsigned int x = get_global_id(0);
    unsigned int y = get_global_id(1);
    unsigned int z = get_global_id(2);

    // Ignore work-items outside the grid (e.g. a padded global size).
    if (x >= len_x || y >= len_y || z >= len_z) {
        return;
    }

    // x is the fastest-varying index, so one step in y spans len_x cells and
    // one step in z spans len_x * len_y cells. Mixing up len_x and len_y
    // here maps distinct cells to the same index on non-cubic grids.
    const unsigned int stride_y = len_x;
    const unsigned int stride_z = len_x * len_y;
    const unsigned int this_cell = x + stride_y * y + stride_z * z;

    const float current = height[this_cell];
    const float previous = height_old[this_cell];
    height_old[this_cell] = current;

    if (x == 0 || x == (len_x - 1) || y == 0 || y == (len_y - 1) || z == 0 || z == (len_z - 1)) {
        // Fixed (zero) boundary.
        height[this_cell] = 0;
        return;
    }

    // 7-point stencil over the six face neighbours.
    const float laplacian = -6 * current +
        height[this_cell - 1] +
        height[this_cell + 1] +
        height[this_cell - stride_y] +
        height[this_cell + stride_y] +
        height[this_cell - stride_z] +
        height[this_cell + stride_z];

    height[this_cell] = (dtxc_term * laplacian + 2 * current) - previous;
}
(DTXC is the result of ((DT * DT)/(DX * DX)) * C passed from the host)
Every step I copy height back to the host for plotting, and then call the function again.
for i in np.arange(steps):
#copy height from host to device
cl.enqueue_copy(queue, d_height, h_height)
#step once
wave_calc(queue, field_3d.shape, None, d_height, d_height_old, LEN_X, LEN_Y, LEN_Z, DTXC)
queue.finish()
#copy height back
cl.enqueue_copy(queue, h_height, d_height)
#do my plotting
Any thoughts/suggestions/condescending remarks? All would be appreciated. :)
Here is an update to answer Joel's question:
I'm not much good when it comes to calculus, but I'm taking a working C++ implementation in 2D and trying to adapt it for 3D. Below is the C++. The only modification I made was to the loop, since there are 6 neighbor cells in 3D instead of 4. In both cases the outer walls of the plane/cube are set to 0:
for(int x=1; x<field.xRes()-1;x++) {
for (int y=1; y<field.yRes()-1; y++) {
laplacian(x,y) = -4 * height(x,y) +
height(x-1,y) +
height(x+1,y) +
height(x,y-1) +
height(x,y+1);
}
}
const float dt = 0.001;
const float xLen = 1.0;
const float C = 1.0;
const float dx = xLen/xRes;
backup = height;
height = ((dt*dt)/(dx*dx))*C*laplacian+2*height;
height = height - heightOld;
heightOld = backup;

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

PyOpenCL Matrix multiplication - python

Related

Matrix multiplication of a 2d numpy array to cpp using ctypes

plus equal (+=) operator in pycuda

Hough transform: How to get lines from voting-matrix?

Cython code 3x slower than corresponding NumPy version

Stuck Trying to Implement 3D Wave Equation in PyOpenCL

Categories

Resources