cuda code error within numbapro

cuda code error within numbapro - python

import numpy
import numpy as np
from numbapro import cuda
#cuda.autojit
def foo(aryA, aryB,out):
d_ary1 = cuda.to_device(aryA)
d_ary2 = cuda.to_device(aryB)
#dd = numpy.empty(10, dtype=np.int32)
d_ary1.copy_to_host(out)
griddim = 1, 2
blockdim = 3, 4
aryA = numpy.arange(10, dtype=np.int32)
aryB = numpy.arange(10, dtype=np.int32)
out = numpy.empty(10, dtype=np.int32)
foo[griddim, blockdim](aryA, aryB,out)
Exception: Caused by input line 11:
can only get attribute from globals, complex numbers or arrays
I am new to numbapro, hints are needed!

The #cuda.autotjit marks and compiles foo() as a CUDA kernel. The memory transfer operations should be placed outside of the kernel. It should look like the following code:
import numpy
from numbapro import cuda
#cuda.autojit
def foo(aryA, aryB ,out):
# do something here
i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
out[i] = aryA[i] + aryB[i]
griddim = 1, 2
blockdim = 3, 4
aryA = numpy.arange(10, dtype=numpy.int32)
aryB = numpy.arange(10, dtype=numpy.int32)
out = numpy.empty(10, dtype=numpy.int32)
# transfer memory
d_ary1 = cuda.to_device(aryA)
d_ary2 = cuda.to_device(aryB)
d_out = cuda.device_array_like(aryA) # like numpy.empty_like() but for GPU
# launch kernel
foo[griddim, blockdim](aryA, aryB, d_out)
# transfer memory device to host
d_out.copy_to_host(out)
print out
I recommend new NumbaPro users to look at the examples in https://github.com/ContinuumIO/numbapro-examples.

Related

How to use ArrayFire batched 2D convolution

Reading through ArrayFire documentation, I noticed that the library supports batched operations when using 2D convolution. Therefore, I need to apply N filters to an image using the C++ API.
For easy testing, I decided to create a simple Python script to assert the convolution results. However, I couldn't get proper results when using >1 filters and comparing them to OpenCV's 2D convolution separately. Following is my Python script:
import arrayfire as af
import cv2
import numpy as np
np.random.seed(1)
np.set_printoptions(precision=3)
af.set_backend('cuda')
n_kernels = 2
image = np.random.randn(512,512).astype(np.float32)
kernels_list = [np.random.randn(7,7).astype(np.float32) for _ in range(n_kernels)]
conv_cv_list = [cv2.filter2D(image, -1, cv2.flip(kernel,-1), borderType=cv2.BORDER_CONSTANT) for kernel in kernels_list]
image_gpu = af.array.Array(image.ctypes.data, image.shape, image.dtype.char)
kernels = np.stack(kernels_list, axis=-1) if n_kernels > 1 else kernels_list[0]
kernels_gpu = af.array.Array(kernels.ctypes.data, kernels.shape, kernels.dtype.char)
conv_af_gpu = af.convolve2(image_gpu, kernels_gpu)
conv_af = conv_af_gpu.to_ndarray()
if n_kernels == 1:
conv_af = conv_af[..., None]
for kernel_idx in range(n_kernels):
print("CV conv:", conv_cv_list[kernel_idx][0, 0])
print("AF conv", conv_af[0, 0, kernel_idx])
That said, I would like to know how properly use ArrayFire batched support.

ArrayFire is column-major, whereas OpenCV and NumPy are row-major. This can cause issues. We provide an interop function to handle this for you, as follows.
import arrayfire as af
import cv2
import numpy as np
np.random.seed(1)
np.set_printoptions(precision=3)
af.set_backend('cuda')
n_kernels = 2
image = np.random.randn(512,512).astype(np.float32)
kernels_list = [np.random.randn(7,7).astype(np.float32) for _ in range(n_kernels)]
conv_cv_list = [cv2.filter2D(image, -1, cv2.flip(kernel,-1), borderType=cv2.BORDER_CONSTANT) for kernel in kernels_list]
image_gpu = af.interop.from_ndarray(image) # CHECK OUT THIS
kernels = np.stack(kernels_list, axis=-1) if n_kernels > 1 else kernels_list[0]
kernels_gpu = af.interop.from_ndarray(kernels) # CHECK OUT THIS
conv_af_gpu = af.convolve2(image_gpu, kernels_gpu)
conv_af = conv_af_gpu.to_ndarray()
if n_kernels == 1:
conv_af = conv_af[..., None]
for kernel_idx in range(n_kernels):
print("CV conv:", conv_cv_list[kernel_idx][0, 0])
print("AF conv", conv_af[0, 0, kernel_idx])
We are working on a newer version of ArrayFire Python which will address this issue.
Good luck!

Numba function fails when imported from another script, but works well when compiled manually

Here is a reproducible example. The numba function "is_in_set_pnb" doesn't work when is imported from "functions.py" script. However, it works just fine when is defined in the same script.
functions.py script
import numpy as np
import numba as nb
import pandas as pd
# corrFilter function
def corrFilter(df, threshold):
corr_matrix = df.corr().abs()
flat_matrix = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
.stack()
.sort_values(ascending=False))
flat_matrix = pd.DataFrame(flat_matrix).reset_index()
flat_matrix.columns=["var1", "var2", "correlation"]
flat_matrix = flat_matrix.reindex(flat_matrix.correlation.sort_values(ascending=False).index).reset_index().drop(["index"],axis=1)
filtered_matrix = flat_matrix[flat_matrix.correlation > threshold]
pairs_to_remove = filtered_matrix[["var1", "var2"]].to_numpy()
return pairs_to_remove
# Numba function / numpy.isin() improved version
#nb.jit(parallel=True)
def is_in_set_pnb(a, b):
shape = a.shape
a = a.ravel()
n = len(a)
result = np.full(n, False)
set_b = set(b)
for i in nb.prange(n):
if a[i] in set_b:
result[i] = True
return result.reshape(shape)
main.py script
from sklearn import datasets
import pandas as pd
import numpy as np
import itertools
from functions import *
# Loading data
iris = datasets.load_iris()
data = iris.data
data = pd.DataFrame(data)
data.columns = ["var1", "var2", "var3", "var4"]
# Get pairs with a correlation higher than 0.5
remove_pairs = corrFilter(data, 0.5)
remove_pairs_check = np.sum(remove_pairs, axis=1)
# Remove triplets with pairs with a higher correlation than 0.5
triplets = [list(k) for k in itertools.combinations(data.columns, 3)]
array_triplets = np.vstack(triplets).astype(object)
n, d = array_triplets.shape
pair1 = np.sum(array_triplets[:,[0,1]], axis=1).reshape(n, 1)
pair2 = np.sum(array_triplets[:,[1,2]], axis=1).reshape(n, 1)
pair3 = np.sum(array_triplets[:,[0,2]], axis=1).reshape(n, 1)
array_triplets_check = np.concatenate((pair1, pair2, pair3), axis =1)
# Check if each pair is in the remove_pairs_check list
array_triplets_check_v1 = np.in1d(array_triplets_check, remove_pairs_check).reshape(n, d)
# Using numba function from "functions.py" script"
# This fails!!
array_triplets_check_v2 = is_in_set_pnb(array_triplets_check, remove_pairs_check)
However, if we define the numba function within the main.py script:
import numba as nb
#nb.jit(parallel=True)
def is_in_set_pnb(a, b):
shape = a.shape
a = a.ravel()
n = len(a)
result = np.full(n, False)
set_b = set(b)
for i in nb.prange(n):
if a[i] in set_b:
result[i] = True
return result.reshape(shape)
# This works properly!!!
array_triplets_check_v2 = is_in_set_pnb(array_triplets_check, remove_pairs_check)
What I am missing?? I am using:
numba 0.50.1 py38h47e9c7a_0

How to submit map_blocks operation to a Client

I am trying to chunk-wise evaluate a funtion on a dask array. I can successfully do the following:
import numpy as np
import dask.array as da
arr = np.random.randint(0, 9, (4, 4))
darr = da.from_array(arr, chunks=(2, 2))
def block_mean(block):
return np.array([[block.mean()]])
r = darr.map_blocks(block_mean)
out_arr = r.compute()
How can I replicate this in a Client? I tried the following:
import numpy as np
import dask.array as da
arr = np.random.randint(0, 9, (4, 4))
darr = da.from_array(arr, chunks=(2, 2))
def block_mean(block):
return np.array([[block.mean()]])
r = darr.map_blocks(block_mean)
c = Client()
# The following does not work
out = c.submit(r)

To keep this very short: you need not do anything at all; if you define a Client, it will become the default scheduler, and .compute() uses the default scheduler (unless specified).
r = darr.map_blocks(block_mean)
c = Client()
out_arr = r.compute()
If you read the docs for submit, it expects a function - there are plenty of examples of how to use it.

n-dimensional sliding window operation in python using gpu accelerated libraries, preferably Tensorflow?

I am looking for a gpu-accelerated n-dimensional sliding window operation implementation in Python using Tensorflow. You can post your implementation in Torch, Caffe or Theano, but I'll choose the Tensorflow implementation as the accepted answer. Please post working code snippet that performs a 2d median filter operation (hopefully, with no code change or minimal code change, can be applied to n-dimensional images)
With my limited knowledge on Tensorflow, I believe the 2 potential modules to start with are sliding_window_batch or extract_image_patches and then with some map,apply,reshape magic?
My failed attempt is posted below, for entertainment. Please note I have posted a similar question 2 years ago, asking for a Theano implementation, nowadays, most people are using tf/keras or torch.
import time
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import tensorflow as tf
from tensorflow.contrib.data.python.ops import sliding
from skimage import img_as_float, data
from scipy.signal import medfilt
imgs = img_as_float(data.camera())
### SCIPY median ###
stime = time.time()
scipysmoothed = medfilt(imgs,(9,9))
etime = time.time()
print('scipy smoothed: {:1.4f} seconds'.format(etime-stime))
### Failed attempt of TF median ###
method = 'Tensorflow'
stime = time.time()
window_func = lambda x: tf.contrib.distributions.percentile(x, 50.0)
# create TensorFlow Dataset object
data = tf.data.Dataset.from_tensor_slices(imgs)
# sliding window - only 1d is allowed?
window = 3
stride = 1
data = data.apply(sliding.sliding_window_batch(window, stride)).map(lambda x: window_func(x))
# create TensorFlow Iterator object
iterator = tf.data.Iterator.from_structure(data.output_types)
next_element = iterator.get_next()
# create initialization ops
init_op = iterator.make_initializer(data)
c=0
smoothed = np.zeros(imgs.shape)
with tf.Session() as sess:
# initialize the iterator on the data
sess.run(init_op)
while True:
try:
elem = sess.run(next_element)
smoothed[c,:]=elem
# obviously WRONG.
c+=1
except tf.errors.OutOfRangeError:
#print("End of dataset.")
break
#print(c)
etime = time.time()
print('tf smoothed: {:1.4f} seconds'.format(etime-stime))
plt.figure(figsize=(20,20))
plt.subplot(131)
plt.imshow(imgs,cmap='gray',interpolation='none')
plt.title('original')
plt.subplot(132)
plt.imshow(smoothed,cmap='gray',interpolation='none')
plt.title('actual smoothed\nwith {}'.format(method))
plt.subplot(133)
plt.imshow(scipysmoothed,cmap='gray',interpolation='none')
_=plt.title('expected smoothed')
.
scipy smoothed: 1.1899 seconds
tf smoothed: 0.7485 seconds

Proposal 1: My attempt is the below, since it just uses tf.image.extract_image_patches and tf.extract_volume_patches, the implementation supports only 2d and 3d images.
Proposal 2: one could just format the data as a preprocessing step (via tf.data.Dataset.map), however this also takes alot of time, I am not sure why yet ( example https://gist.github.com/pangyuteng/ca5cb07fe383ebe59b521c832f2e2918 ).
Proposal 3: use convolutional blocks to parallelize processing, see "Hypercolumns for Object Segmentation and Fine-grained Localization" https://arxiv.org/abs/1411.5752 .
--
Proposal 1 code:
import time
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import tensorflow as tf
from tensorflow.contrib.data.python.ops import sliding
from skimage import img_as_float, data
from scipy.signal import medfilt
dtype = 2
if dtype==2:
imgs = img_as_float(data.camera())
elif dtype==3:
imgs = np.random.rand(28,28,28)
imgs = img_as_float(data.camera())
### SCIPY median ###
stime = time.time()
scipysmoothed = medfilt(imgs,(9,9))
etime = time.time()
print('scipy smoothed: {:1.4f} seconds'.format(etime-stime))
### TF median ###
method = 'Tensorflow'
imgs = np.expand_dims(imgs,axis=-1)
imgs = np.expand_dims(imgs,axis=0)
print('imgs.shape:{}'.format(imgs.shape))
imgs = tf.cast(imgs,tf.float32)
stime = time.time()
if len(imgs.shape) == 4:
kernel=(1,9,9,1)
stride=(1,1,1,1)
rates=(1,1,1,1)
padding='SAME'
patches=tf.image.extract_image_patches(
imgs,kernel,stride,rates,padding,
)
_,x,y,n = patches.shape
_,sx,sy,_ = kernel
window_func = lambda x: tf.contrib.distributions.percentile(x, 50.0)
patches = tf.reshape(patches,[x*y,sx,sy])
smoothed = tf.map_fn(lambda x: window_func(patches[x,:,:]), tf.range(x*y), dtype=tf.float32)
smoothed = tf.reshape(smoothed,[x,y])
elif len(imgs.shape) == 5:
kernel=(1,12,12,12,1)
stride=(1,1,1,1,1)
padding='SAME'
patches=tf.extract_volume_patches(
imgs,kernel,stride,padding,
)
_,x,y,z,n = patches.shape
_,sx,sy,sz,_ = kernel
window_func = lambda x: tf.contrib.distributions.percentile(x, 50.0)
patches = tf.reshape(patches,[x*y*z,sx,sy,sz])
smoothed = tf.map_fn(lambda x: window_func(patches[x,:,:]), tf.range(x*y*z), dtype=tf.float32)
smoothed = tf.reshape(smoothed,[x,y,z])
else:
raise NotImplemented()
with tf.Session() as sess:
output = sess.run(smoothed)
etime = time.time()
print('tf smoothed: {:1.4f} seconds'.format(etime-stime))
print(output.shape)
plt.figure(figsize=(20,20))
plt.subplot(131)
imgs = img_as_float(data.camera())
plt.imshow(imgs.squeeze(),cmap='gray',interpolation='none')
plt.title('original')
plt.subplot(132)
plt.imshow(output.squeeze(),cmap='gray',interpolation='none')
plt.title('actual smoothed\nwith {}'.format(method))
plt.subplot(133)
plt.imshow(scipysmoothed,cmap='gray',interpolation='none')
_=plt.title('expected smoothed')

Problems faced by Manually Implementing a FIFOQueue in tensorflow

I am trying to come up with a method that could implement a FIFOQueue in tensorflow. So on every iteration, the purpose is to assign a placeholder a certain number, then store it in a Variable named: buffer. After each assignment, I am incrementing an index. The buffer size is [5], so that index should range from 0 to 4. Finally, after the buffer is full, I would set buffer[0:4] to be buffer[1:5], and then add the new value to buffer[4]. So here is my
code:
import tensorflow as tf
import numpy as np
import random
dim = 30
lst = []
for i in range(dim):
lst.append(random.randint(1, 10))
data = np.reshape(lst, [dim, 1])
print(lst)
# create a buffer:
buffer_input = tf.placeholder(tf.int32, shape=[1])
buffer = tf.Variable(tf.zeros([5], tf.int32))
index = tf.Variable(tf.constant(0))
def fillBufferBeforeFilled():
update_op1 = tf.scatter_update(buffer, indices=[index], updates=buffer_input)
index_assign_add = tf.assign_add(index, 1)
return update_op1, index_assign_add
def fillBufferAfterFilled():
tmp = tf.slice(buffer, begin=[0], size=[4])
update_op2 = tf.scatter_update(buffer, indices=[0, 1, 2, 3], updates=tmp)
update_op3 = tf.scatter_update(buffer, indices=[index], updates=buffer_input)
return update_op2, update_op3
cond = tf.cond(tf.equal(index, 4), lambda: fillBufferBeforeFilled(), lambda: fillBufferAfterFilled())
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(dim):
cond_ = sess.run(cond, feed_dict={buffer_input: data[i]})
buf = sess.run(buffer, feed_dict={buffer_input: data[i]})
print('buf: ', buf)
Problem: The index Variable is not incremented after each call, while the first element of the buffer is being assigned to the value passed to the placeholder.
I would like to know why I'm getting this behavior and what is the solution to this problem.
any help is much appreciated!!

You've mixed up the order of the conditions in tf.cond; it should be
cond = tf.cond(tf.equal(index, 4), lambda: fillBufferAfterFilled(), lambda: fillBufferBeforeFilled())
I can get your code running and it mostly works, but the updates aren't quite right; I suspect you'll need to add some tf.control_dependencies calls to force things to happen in the right order.

Here is the solution:
import tensorflow as tf
import numpy as np
import random
dim = 30
lst = []
for i in range(dim):
lst.append(random.randint(1, 10))
data = np.reshape(lst, [dim, 1])
print(lst)
# create a buffer:
buffer_input = tf.placeholder(tf.int32, shape=[1])
buffer = tf.Variable(tf.zeros([5], tf.int32))
index = tf.Variable(-1, tf.int32)
def fillBufferBeforeFilled():
index_assign_add = tf.assign_add(index, 1)
with tf.control_dependencies([index_assign_add]):
update_op1 = tf.scatter_update(buffer, indices=[index], updates=buffer_input)
return update_op1, index_assign_add
def fillBufferAfterFilled():
tmp = tf.slice(buffer, begin=[1], size=[4])
update_op2 = tf.scatter_update(buffer, indices=[0, 1, 2, 3], updates=tmp)
with tf.control_dependencies([update_op2]):
update_op3 = tf.scatter_update(buffer, indices=[index], updates=buffer_input)
return update_op2, update_op3
cond = tf.cond(tf.equal(index, 4), lambda: fillBufferAfterFilled(), lambda: fillBufferBeforeFilled())
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(dim):
cond_ = sess.run(cond, feed_dict={buffer_input: data[i]})
buf = sess.run(buffer, feed_dict={buffer_input: data[i]})
print('buf: ', buf)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

cuda code error within numbapro - python

Related

How to use ArrayFire batched 2D convolution

Numba function fails when imported from another script, but works well when compiled manually

How to submit map_blocks operation to a Client

n-dimensional sliding window operation in python using gpu accelerated libraries, preferably Tensorflow?

Problems faced by Manually Implementing a FIFOQueue in tensorflow

Categories

Resources