I'm attempting to implement the discrete time wave equation in OpenCL. I think I'm pretty close, but the results look like what I would expect from the heat equation. I know they're very similar, but when I've implemented the 2D wave equation (not using OpenCL) I got distinct wavefronts and reflections. With the OpenCL kernel below everything diffuses until it is a wash.
__kernel void wave_calc(
    __global float* height,
    __global float* height_old,
    const unsigned int len_x,
    const unsigned int len_y,
    const unsigned int len_z,
    const float dtxc_term)
{
    unsigned int x = get_global_id(0);
    unsigned int y = get_global_id(1);
    unsigned int z = get_global_id(2);
    int this_cell = x + len_y * (y + len_x * z);
    float laplacian;
    if (x==0 || x==(len_x-1) || y==0 || y==(len_y-1) || z==0 || z==(len_z-1)) {
        laplacian = 0;
        height_old[this_cell] = height[this_cell];
        height[this_cell] = 0;
    }
    else if ( x < len_x-1 && y < len_y-1 && z < len_z-1 ){
        int n1 = x - 1 + len_y * (y + len_x * z);
        int n2 = x + 1 + len_y * (y + len_x * z);
        int n3 = x + len_y * (y - 1 + len_x * z);
        int n4 = x + len_y * (y + 1 + len_x * z);
        int n5 = x + len_y * (y + len_x * (z - 1));
        int n6 = x + len_y * (y + len_x * (z + 1));
        laplacian = -6 * height[this_cell] +
                    height[n1] +
                    height[n2] +
                    height[n3] +
                    height[n4] +
                    height[n5] +
                    height[n6];
        height_old[this_cell] = height[this_cell];
        height[this_cell] = (dtxc_term*laplacian + 2*height[this_cell]) - height_old[this_cell];
    }
}
(DTXC is the result of ((DT * DT)/(DX * DX)) * C passed from the host)
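For reference, the update I'm aiming for (this is just my reading of the 2D reference code further down, not something taken from elsewhere) is

$$h^{t+1} = 2\,h^{t} - h^{t-1} + \frac{\Delta t^{2}}{\Delta x^{2}}\,C\,\nabla^{2} h^{t},$$

where the discrete Laplacian $\nabla^{2} h$ is the "-6 times the centre plus the six neighbours" sum computed in the kernel.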
Every step I copy height back to the host for plotting, and then call the function again.
for i in np.arange(steps):
    # copy height from host to device
    cl.enqueue_copy(queue, d_height, h_height)
    # step once
    wave_calc(queue, field_3d.shape, None, d_height, d_height_old, LEN_X, LEN_Y, LEN_Z, DTXC)
    queue.finish()
    # copy height back
    cl.enqueue_copy(queue, h_height, d_height)
    # do my plotting
Any thoughts/suggestions/condescending remarks? All would be appreciated. :)
Here is an update to answer Joel's question:
I'm not much good when it comes to calculus, but I'm taking a working C++ implementation in 2D and trying to adapt it for 3D. Below is the C++. The only modification I made was to the loop, since there are 6 neighbor cells in 3D instead of 4. In both cases the outer walls of the plane/cube are set to 0:
for (int x=1; x<field.xRes()-1; x++) {
    for (int y=1; y<field.yRes()-1; y++) {
        laplacian(x,y) = -4 * height(x,y) +
                         height(x-1,y) +
                         height(x+1,y) +
                         height(x,y-1) +
                         height(x,y+1);
    }
}
const float dt = 0.001;
const float xLen = 1.0;
const float C = 1.0;
const float dx = xLen/xRes;
backup = height;
height = ((dt*dt)/(dx*dx))*C*laplacian+2*height;
height = height - heightOld;
heightOld = backup;
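For completeness, here is a minimal NumPy sketch of that same 2D step (my own translation of the snippet above; the default dt, xLen and C values are just the ones from the snippet, and deriving dx from the array shape is an assumption):

import numpy as np

def step_2d(height, height_old, dt=0.001, xLen=1.0, C=1.0):
    # dx = xLen / xRes, taking xRes to be the grid resolution (an assumption here)
    dx = xLen / height.shape[0]
    # 5-point Laplacian on the interior; the outer walls stay 0
    laplacian = np.zeros_like(height)
    laplacian[1:-1, 1:-1] = (-4 * height[1:-1, 1:-1]
                             + height[:-2, 1:-1] + height[2:, 1:-1]
                             + height[1:-1, :-2] + height[1:-1, 2:])
    # new height uses the height_old from the previous step, then the buffers rotate
    backup = height.copy()
    height = ((dt * dt) / (dx * dx)) * C * laplacian + 2 * height - height_old
    height_old = backup
    return height, height_old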
I'm trying to implement a radix-5 / radix-3 FFT in C++. I already managed to write a radix-2, but I have some kind of bug with radix 3 or 5: an FFT of 3 samples gives the correct results, but an FFT of 9 samples (which is 3 * 3) does not.
The code originally comes from Python, where it works, and I'm trying to simply "copy" it to C++. Here is the original Python code:
import numpy as np

def fft(x):
    """
    radix-2,3,5 FFT algorithm
    """
    N = len(x)
    if N <= 1:
        return x
    elif N % 2 == 0:
        # For multiples of 2 this formula works
        even = fft(x[0::2])
        odd = fft(x[1::2])
        T = [np.exp(-2j*np.pi*k/N)*odd[k] for k in range(N//2)]
        return [even[k] + T[k] for k in range(N//2)] + \
               [even[k] - T[k] for k in range(N//2)]
    elif N % 3 == 0:
        # Optional, implementing factor 3 decimation
        p0 = fft(x[0::3])
        p1 = fft(x[1::3])
        p2 = fft(x[2::3])
        # This will construct the output without the simplifications
        # you can do exploring symmetry
        for k in range(N):
            return [p0[k % (N//3)] +
                    p1[k % (N//3)] * np.exp(-2j*np.pi*k/N) +
                    p2[k % (N//3)] * np.exp(-4j*np.pi*k/N)]
    elif N % 5 == 0:
        # factor 5 decimation
        p0 = fft(x[0::5])
        p1 = fft(x[1::5])
        p2 = fft(x[2::5])
        p3 = fft(x[3::5])
        p4 = fft(x[4::5])
        return [p0[k % (N//5)] +
                p1[k % (N//5)] * np.exp(-2j*np.pi*k/N) +
                p2[k % (N//5)] * np.exp(-4j*np.pi*k/N) +
                p3[k % (N//5)] * np.exp(-6j*np.pi*k/N) +
                p4[k % (N//5)] * np.exp(-8j*np.pi*k/N)
                for k in range(N)]

x = [1, 1.00071, 1.00135, 1.00193, 1.00245, 1.0029, 1.00329, 1.00361, 1.00387]
assert(np.allclose(fft(x), np.fft.fft(x)))
And here is my code in C++ (fft.hpp):
#define _USE_MATH_DEFINES
#pragma once
#include <cmath>
#include <vector>
#include <complex>

using std::vector;
using std::complex;

vector<complex<float>> slicing(vector<complex<float>> vec, unsigned int X, unsigned int Y, unsigned int stride)
{
    // To store the sliced vector
    vector<complex<float>> result;
    // Copy vector using copy function()
    int i = X;
    while (result.size() < Y)
    {
        result.push_back(vec[i]);
        i = i + stride;
    }
    // Return the final sliced vector
    return result;
}

void fft(vector<complex<float>>& x)
{
    // Check if it is splitted enough
    const size_t N = x.size();
    if (N <= 1)
        return;
    else if (N % 2 == 0)
    {
        // Radix-2
        vector<complex<float>> even = slicing(x, 0, N / 2, 2); // split the inputs in even / odd indices subarrays
        vector<complex<float>> odd = slicing(x, 1, N / 2, 2);
        // conquer
        fft(even);
        fft(odd);
        // combine
        for (size_t k = 0; k < N / 2; ++k)
        {
            complex<float> t = std::polar<float>(1.0, -2 * M_PI * k / N) * odd[k];
            x[k] = even[k] + t;
            x[k + N / 2] = even[k] - t;
        }
    }
    else if (N % 3 == 0)
    {
        // Radix-3
        // factor 3 decimation
        vector<complex<float>> p0 = slicing(x, 0, N / 3, 3);
        vector<complex<float>> p1 = slicing(x, 1, N / 3, 3);
        vector<complex<float>> p2 = slicing(x, 2, N / 3, 3);
        fft(p0);
        fft(p1);
        fft(p2);
        for (int i = 0; i < N; i++)
        {
            complex<float> temp = p0[i % (int)N / 3];
            temp += (p1[i % (int)N / 3] * std::polar<float>(1.0, -2 * M_PI * i / N));
            temp += (p2[i % (int)N / 3] * std::polar<float>(1.0, -4 * M_PI * i / N));
            x[i] = temp;
        }
    }
    else if (N % 5 == 0)
    {
        // Radix-5
        // factor 5 decimation
        vector<complex<float>> p0 = slicing(x, 0, N / 5, 5);
        vector<complex<float>> p1 = slicing(x, 1, N / 5, 5);
        vector<complex<float>> p2 = slicing(x, 2, N / 5, 5);
        vector<complex<float>> p3 = slicing(x, 3, N / 5, 5);
        vector<complex<float>> p4 = slicing(x, 4, N / 5, 5);
        fft(p0);
        fft(p1);
        fft(p2);
        fft(p3);
        fft(p4);
        for (int i = 0; i < N; i++)
        {
            complex<float> temp = p0[i % (int)N / 5];
            temp += (p1[i % (int)N / 5] * std::polar<float>(1.0, -2 * M_PI * i / N));
            temp += (p2[i % (int)N / 5] * std::polar<float>(1.0, -4 * M_PI * i / N));
            temp += (p3[i % (int)N / 5] * std::polar<float>(1.0, -6 * M_PI * i / N));
            temp += (p4[i % (int)N / 5] * std::polar<float>(1.0, -8 * M_PI * i / N));
            x[i] = temp;
        }
    }
}
And main.cpp:
#define _USE_MATH_DEFINES
#include <stdio.h>
#include <iostream>
#include "fft.hpp"

typedef vector<complex<float>> complexSignal;

int main()
{
    complexSignal abit;
    int N = 9;
    abit.push_back({ 1, 0 });
    abit.push_back({ 1.00071, 0 });
    abit.push_back({ 1.00135, 0 });
    abit.push_back({ 1.00193, 0 });
    abit.push_back({ 1.00245, 0 });
    abit.push_back({ 1.0029, 0 });
    abit.push_back({ 1.00329, 0 });
    abit.push_back({ 1.00361, 0 });
    abit.push_back({ 1.00387, 0 });
    //abit.push_back({ 1.0029, 0 });
    std::cout << "Before:" << std::endl;
    for (int i = 0; i < N; i++)
    {
        //abit.push_back(data[0][i]);
        std::cout << abit[i] << std::endl;
    }
    std::cout << "After:" << std::endl;
    fft(abit);
    for (int i = 0; i < N; i++)
    {
        std::cout << abit[i] << std::endl;
    }
    return 0;
}
I'm getting the following output from CPP:
(9.02011,0)
(5.83089,-4.89513)
(0.700632,-3.98993)
(-0.000289979,0.000502368)
(-0.00218513,0.000362784)
(-0.00179241,0.00139188)
(-0.000289979,-0.000502368)
(0.000175771,-0.00354373)
(-0.003268,-0.00558837)
While I should get:
(9.020109999999999+0j)
(-0.0032675770104925446+0.005588577982060319j)
(-0.0023772289746976797+0.0024179090499282354j)
(-0.0022250000000012538+0.0011691342951078987j)
(-0.002185194014811494+0.00036271471530890747j)
(-0.0021851940148113033-0.00036271471530980844j)
(-0.0022249999999994774-0.0011691342951105632j)
(-0.002377228974696629-0.0024179090499291786j)
(-0.00326757701049002-0.005588577982061138j)
As you can see only the first result is correct.
In Python it says k % (N//3), in C++ it says i % (int)N / 3. These are not the same! See operator precedence in C++. You need to use parentheses to execute the operations in the right order: i % ((int)N / 3).
Pepijn Kramer’s comment about changing
-2 * M_PI * i / N
into
-2.0 * M_PI * static_cast<double>(i)/static_cast<double>(N)
is good practice, in my opinion, but shouldn't make a difference in this case because of the left-to-right evaluation and the automatic promotion of integers to floats in a float context. Still, ensuring all your constants are floats where needed doesn't cost much effort and can prevent difficult-to-spot bugs.
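To make the precedence point concrete, here is a small, purely illustrative Python check: % and // share precedence and evaluate left to right, so dropping the parentheses gives (k % N) // 3, which is exactly what the C++ expression currently computes.

N = 9
for k in range(N):
    print(k, k % (N // 3), k % N // 3)
# e.g. k = 7: k % (N // 3) = 1, but (k % N) // 3 = 2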
I would like to implement a variant of convolution in PyCUDA.
For simplicity, I'll show a rectangular kernel for the interpolation.
The standard convolution can be applied as follows:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule
mod = SourceModule("""
#include <stdio.h>

__global__ void func(float *dest, float *a)
{
    const int img_size = 64;
    const int kernel_size = 3;
    const int kernel_size_half = kernel_size/2;
    const int tx = blockIdx.x * blockDim.x + threadIdx.x;
    const int ty = blockIdx.y * blockDim.y + threadIdx.y;
    int tx_kernel;
    tx_kernel = tx - kernel_size_half;

    for (int idx=-kernel_size_half; idx <= kernel_size_half; idx++)
    {
        tx_kernel = tx + idx;
        if ((tx_kernel < 0) || (tx_kernel > img_size-1))
            continue;
        dest[ty * img_size + tx] += a[ty * img_size + tx_kernel] / ((float) kernel_size);
    }
}
""")
Instead of computing the current position from its neighbours, I would like to do the opposite: add the value of the current pixel to its neighbours.
That is, to change the line:
dest[ty * img_size + tx] += a[ty * img_size + tx_kernel] / ((float) kernel_size);
to:
dest[ty * img_size + tx_kernel] += a[ty * img_size + tx] / ((float) kernel_size);
However, while the first works fine, the second does not: it fails to update the neighbours.
Is there a way around this?
Note:
I simplified the question to focus on what I need; the general problem is to use a different convolution kernel for each pixel instead of the same one for all of them, as I asked in the question.
to change the line:
dest[ty * img_size + tx] += a[ty * img_size + tx_kernel] / ((float) kernel_size);
to:
dest[ty * img_size + tx_kernel] += a[ty * img_size + tx] / ((float) kernel_size);
However, while the first works fine, the second does not: it fails to update the neighbours. Is there a way around this?
The first method is preferred from a performance perspective. However, if you wish to "update the neighbors", then it should be possible to recast the second operation as:
atomicAdd(&(dest[ty * img_size + tx_kernel]), a[ty * img_size + tx] / ((float) kernel_size));
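For illustration only, a minimal PyCUDA sketch of that scatter variant might look like this (I'm assuming the same 64x64 image and 3-wide kernel as in your question; the kernel name and the launch configuration are placeholders, not taken from your code):

import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void func_scatter(float *dest, float *a)
{
    const int img_size = 64;
    const int kernel_size = 3;
    const int kernel_size_half = kernel_size/2;
    const int tx = blockIdx.x * blockDim.x + threadIdx.x;
    const int ty = blockIdx.y * blockDim.y + threadIdx.y;

    for (int idx=-kernel_size_half; idx <= kernel_size_half; idx++)
    {
        int tx_kernel = tx + idx;
        if ((tx_kernel < 0) || (tx_kernel > img_size-1))
            continue;
        // atomicAdd serializes the colliding writes from neighbouring threads
        atomicAdd(&(dest[ty * img_size + tx_kernel]), a[ty * img_size + tx] / ((float) kernel_size));
    }
}
""")

func_scatter = mod.get_function("func_scatter")
a = np.random.rand(64, 64).astype(np.float32)
dest = np.zeros_like(a)
# 64x64 threads in total: 16x16 per block, 4x4 blocks
func_scatter(drv.InOut(dest), drv.In(a), block=(16, 16, 1), grid=(4, 4, 1))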
I'm trying to implement the Hough transform using Python and C++ (using PyBind11 to interface between the two languages). When I plot the Hough space it looks alright, but I just can't get a line out of the maximum of the voting matrix.
Here is the C++ code (it looks a bit different because I use PyBind11):
py::array_t<int> houghTransform(py::array_t<int> image, int angleStep, int angleAmount) {
    auto imageBuf = image.mutable_unchecked<3>();
    int height = imageBuf.shape(0);
    int width = imageBuf.shape(1);

    py::array_t<int> edgeMatrix = edgeDetect(imageBuf, height, width);
    auto edgeMatrixBuf = edgeMatrix.mutable_unchecked<2>();

    int distanceAxis = 2 * sqrt(pow((float) height, 2.0) + pow((float) width, 2.0));
    int angleAxis = angleAmount;
    int angleDim = (int) angleAxis / angleStep;
    int distanceDim = (int) distanceAxis / 2;

    py::array_t<int> votingMatrix = py::array_t<int>({distanceAxis, angleDim});
    auto votingMatrixBuf = votingMatrix.mutable_unchecked<2>();

    // fill voting matrices with zeros
    for(int i=0; i<distanceDim; i++) {
        for(int j=0; j<angleDim; j++) {
            votingMatrixBuf(i, j) = 0;
        }
    }

    // vote
    for(int x=0; x<edgeMatrixBuf.shape(0); x++) {
        for(int y=0; y<edgeMatrixBuf.shape(1); y++) {
            if(edgeMatrixBuf(x, y) == 1) {
                int counter = 0;
                float theta;
                float ro;
                for(int thetaIdx=0; thetaIdx<=angleAxis; thetaIdx++) {
                    if(thetaIdx % angleStep == 0) {
                        counter++;
                        theta = (float) (thetaIdx) * (M_PI / 180);
                        ro = distanceDim + std::round((x * cos(theta)) + (y * sin(theta)));
                        votingMatrixBuf(ro, counter) += 1;
                    }
                }
            }
        }
    }
    return votingMatrix;
}
As you can see, the arguments of the function are the image matrix, which I transform into a matrix where the edges are 1 and the rest 0, so I get my pixels of interest.
int angleAmount is the angle range I want to try out, and int angleStep controls how many of those angles I actually use (for example, skipping every second theta). But for this example I will use angleAmount = 360 and angleStep = 1, so I will use all angles from 1 to 360.
Here is the python code:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import time
from houghTransform import houghTransform

def apply_hough_transform(image_path: str=""):
    image = np.array(Image.open(image_path))
    lines = houghTransform(image, 1, 360)
    p = np.unravel_index(lines.argmax(), lines.shape)
    max_distance = 2 * np.sqrt(pow(image.shape[0], 2) + pow(image.shape[1], 2))
    ro = p[0] - (max_distance / 2)
    theta = p[1] * (np.pi / 180)
    a = np.cos(theta)
    b = np.sin(theta)
    x = a * ro
    y = b * ro
    pt1 = (int(x + 1000*(-b)), int(y + 1000*(a)))
    pt2 = (int(x - 1000*(-b)), int(y - 1000*(a)))
    fig, axs = plt.subplots(2)
    axs[0].matshow(lines)
    axs[0].scatter(p[1], p[0], facecolors="none", edgecolors="r")
    axs[1].plot([pt1[0], pt2[0]], [pt1[1], pt2[1]])
    axs[1].imshow(image)
    plt.show()

apply_hough_transform(image_path="images/black_line.png")
The function houghTransform is the same as in the C++ code above, which I exported to Python using PyBind11.
Here are the images:
I also tried to create the line using this function:
def line(x):
    return -(1 / np.arctan(theta)) * (x - ro * np.cos(theta)) + ro * np.sin(theta)
But it also didn't work.
Can you spot my error? I've been sitting on this for quite some time, so help is really appreciated!
I have written a function that calculates the area of a polygon with the shoelace formula.
It works perfectly, but now I'm wondering whether there is a faster way to get the same result.
I ask because this function needs to stay fast for polygons with a lot of coordinates.
My function:
def shoelace_formula(polygonBoundary, absoluteValue=True):
    nbCoordinates = len(polygonBoundary)
    nbSegment = nbCoordinates - 1
    l = [(polygonBoundary[i+1][0] - polygonBoundary[i][0]) * (polygonBoundary[i+1][1] + polygonBoundary[i][1]) for i in xrange(nbSegment)]
    if absoluteValue:
        return abs(sum(l) / 2.)
    else:
        return sum(l) / 2.
My polygon:
polygonBoundary = ((5, 0), (6, 4), (4, 5), (1, 5), (1, 0))
Result:
22.
Any ideas?
I tried with NumPy:
It's the fastest, but you have to convert your coordinates first.
import numpy as np

x, y = zip(*polygonBoundary)

def shoelace_formula_3(x, y, absoluteValue=True):
    result = 0.5 * np.array(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))
    if absoluteValue:
        return abs(result)
    else:
        return result
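For example, on the polygon above it returns the expected value:

print(shoelace_formula_3(x, y))   # 22.0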
For me the fastest way would be using NumPy, where you pass a NumPy array of (x, y) coordinates as an argument to the shoelace method:
import numpy as np

def shoelace(x_y):
    x_y = np.array(x_y)
    x_y = x_y.reshape(-1, 2)

    x = x_y[:, 0]
    y = x_y[:, 1]

    S1 = np.sum(x*np.roll(y, -1))
    S2 = np.sum(y*np.roll(x, -1))

    area = .5*np.absolute(S1 - S2)

    return area
Take a look at the Rosetta Code example using Numpy. Numpy gives a fast solution.
For your convenience, I paste below the Rosetta Code snippet:
import numpy as np
# x,y are arrays containing coordinates of the polygon vertices
x=np.array([3,5,12,9,5])
y=np.array([4,11,8,5,6])
i=np.arange(len(x))
#Area=np.sum(x[i-1]*y[i]-x[i]*y[i-1])*0.5 # signed area, positive if the vertex sequence is counterclockwise
Area=np.abs(np.sum(x[i-1]*y[i]-x[i]*y[i-1])*0.5) # one line of code for the shoelace formula
EDIT
You can now find the Shoelace formula implemented in the Scikit-Spatial library.
Here's a version that uses 1/2 as many multiplications: https://stackoverflow.com/a/717367/763269
If you need even greater performance, you could consider doing this in a Python C extension. C can be dramatically faster than Python, especially for math operations -- sometimes 100-1000x.
Another interesting approach (although slower):

from numpy.lib.stride_tricks import as_strided

m = np.vstack([x, y])
result = 0.5 * np.abs(np.linalg.det(as_strided(m, (m.shape[1]-1, 2, 2), (m.itemsize, m.itemsize*m.shape[1], m.itemsize))).sum())
class Point // a new class for any point a(X,Y), b(X,Y), c(X,Y), d(X,Y)
{
    //private int x, y;
    public int X { get; set; }
    public int Y { get; set; }
}

static class Geometry
{
    public static void GerArea(Point a, Point b, Point c)
    {
        double area = 0.5 * ((a.X * b.Y) + (b.X * c.Y) + (c.X * a.Y) - (b.X * a.Y) - (c.X * b.Y) - (a.X * c.Y));
        Console.WriteLine(Math.Abs(area));
    }

    public static void GerArea(Point a, Point b, Point c, Point d)
    {
        double area = 0.5 * ((a.X * b.Y) + (b.X * c.Y) + (c.X * d.Y) + (d.X * a.Y) - (b.X * a.Y) - (c.X * b.Y) - (d.X * c.Y) - (a.X * d.Y));
        Console.WriteLine(Math.Abs(area));
    }
}

class Program
{
    static void Main(string[] args)
    {
        Point a = new Point() { X = -12, Y = 12 };
        Point b = new Point() { X = 15, Y = 15 };
        Point c = new Point() { X = -15, Y = -16 };
        Point d = new Point() { X = 16, Y = -15 };

        Console.WriteLine("****Shoelace formula****\n");

        Console.Write("Area of triangle: ");
        Geometry.GerArea(a, b, c);

        Console.Write("Area of quad: ");
        Geometry.GerArea(a, b, c, d);

        Console.ReadLine();
    }
}
This is a very simple implementation of the shoelace formula in Python:
class Polygon:
    def __init__(self, arr):
        self.arr = arr

    def area(self):
        total = 0
        i = 0
        while i != len(self.arr)-1:
            total += self.arr[i][0]*self.arr[i+1][1]
            total -= self.arr[i+1][0]*self.arr[i][1]
            i += 1
        return abs(total + self.arr[-1][0]*self.arr[0][1] - self.arr[-1][-1]*self.arr[0][0])/2
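For example, with the polygon from the question:

poly = Polygon([(5, 0), (6, 4), (4, 5), (1, 5), (1, 0)])
print(poly.area())   # 22.0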
Try the simplest way, the raw shoelace formula for triangles and polygons:
def shoelace_formula(x1, y1, x2, y2, x3, y3, x4, y4, x5, y5):
    return abs(0.5 * (x1*y2 + x2*y3 + x3*y4 + x4*y5 + x5*y1
                      - x2*y1 - x3*y2 - x4*y3 - x5*y4 - x1*y5))

print(shoelace_formula(5, 0, 6, 4, 4, 5, 1, 5, 1, 0))
I am trying to adapt the code here: http://code.activestate.com/recipes/577166-newton-fractals/ into C and am having some trouble. I am using C99's complex type.
I have tried a few different approaches which haven't worked. On every pixel it goes all the way to the maximum iteration every time, so the image comes out as a solid color.
Is there something I have fundamentally wrong about the way the types work in C? It seems like it should work; I reconstructed the algorithm pretty exactly.
//newt.c
#include <stdio.h>
#include <stdlib.h>
#include <complex.h>
#include <float.h>
#include <math.h>

complex f(complex z);

const int WIDTH = 512, HEIGHT = 512;
const double SCALED_X_MAX = 1.0;
const double SCALED_X_MIN = -1.0;
const double SCALED_Y_MAX = 1.0;
const double SCALED_Y_MIN = -1.0;
const int MAX_ITERATIONS = 20;
const int EPSILON = 1e-3;

int main(int argc, char **argv) {
    const double SCALED_WIDTH = SCALED_X_MAX - SCALED_X_MIN;
    const double SCALED_HEIGHT = SCALED_Y_MAX - SCALED_Y_MIN;

    FILE * image = fopen("newton.ppm", "w");
    fprintf(image, "P3\n");
    fprintf(image, "%d %d\n", WIDTH, HEIGHT);
    fprintf(image, "%d\n", 255);

    for (int y = 0; y < HEIGHT; ++y) {
        double zy = y * (SCALED_HEIGHT)/(HEIGHT-1) + SCALED_Y_MIN;
        for (int x = 0; x < WIDTH; ++x) {
            double zx = x * (SCALED_WIDTH)/(WIDTH-1) + SCALED_X_MIN;
            complex z = zx + zy*I;
            int iteration = 0;
            while (iteration < MAX_ITERATIONS) {
                // complex h = sqrt(DBL_EPSILON) + sqrt(DBL_EPSILON)*I;
                double h = 1e-6;
                /*
                complex zph = z + h;
                complex dz = zph - z;
                complex slope = (f(zph) - f(z))/dz;
                */
                complex volatile dz = (f(z + (h+h*I)) - f(z)) / (h+I*h);
                complex z0 = z - f(z) / dz;
                //fprintf(stdout, "%f \n", cabs(z0 - z));
                if (cabs(z0 - z) < EPSILON) {
                    break;
                }
                z = z0;
                iteration++;
            }
            if (iteration != MAX_ITERATIONS) fprintf(stdout, "%d ", iteration);
            fprintf(image, "%3d %3d %3d ", iteration % 4 * 64,
                                           iteration % 8 * 32,
                                           iteration % 16 * 16);
        }
        fprintf(image, "\n");
    }
    fclose(image);
    exit(0);
}

complex f(complex z) {
    return cpow(z, 3) - 1.0;
}
After checking over the complex maths and not seeing any problems I noticed that your error is simply due to the integer division in the line
zy = y * (SCALED_HEIGHT)/(HEIGHT-1) + SCALED_Y_MIN;
and similarly for zx. Because (SCALED_HEIGHT) and (HEIGHT-1) are both integers this is not going to give you the floating point result you require. Try using:
zy = y * SCALED_HEIGHT * 1.0/(HEIGHT-1) + SCALED_Y_MIN;
and similarly for zx.
EDIT: Sorry, the above was in error. Your SCALED_HEIGHT was in fact a double, so the above was actually fine. The real problem is simply the line
const int EPSILON = 1e-3;
This will in fact always be zero, because it's declared as an integer. You need to make EPSILON a floating point type, e.g. const double EPSILON = 1e-3;.