CUDA out of resources error when running Python NumbaPro

I am trying to run a CUDA kernel in NumbaPro Python, but I keep getting an out of resources error.
I then tried to execute the kernel in a loop with smaller arrays, but that still gave me the same error.
Here is my error message:
Traceback (most recent call last):
File "./predict.py", line 418, in <module>
predict[griddim, blockdim, stream](callResult_d, catCount, numWords, counts_d, indptr_d, indices_d, probtcArray_d, priorC_d)
File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/compiler.py", line 228, in __call__
sharedmem=self.sharedmem)
File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/compiler.py", line 268, in _kernel_call
cu_func(*args)
File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.py", line 1044, in __call__
self.sharedmem, streamhandle, args)
File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.py", line 1088, in launch_kernel
None)
File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.py", line 215, in safe_cuda_api_call
self._check_error(fname, retcode)
File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.py", line 245, in _check_error
raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: Call to cuLaunchKernel results in CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES
Here is my source code:
from numbapro.cudalib import cusparse
from numba import *
from numbapro import cuda
import math
import time
import numpy as np

@cuda.jit(argtypes=(double[:], int64, int64, double[:], int64[:], int64[:], double[:,:], double[:]))
def predict(callResult, catCount, wordCount, counts, indptr, indices, probtcArray, priorC):
    i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    correct = 0
    wrong = 0
    lastDocIndex = -1
    maxProb = -1e6
    picked = -1
    for cat in range(catCount):
        probSum = 0.0
        for j in range(indptr[i], indptr[i+1]):
            wordIndex = indices[j]
            probSum += (counts[j] * math.log(probtcArray[cat, wordIndex]))
        probSum += math.log(priorC[cat])
        if probSum > maxProb:
            maxProb = probSum
            picked = cat
    callResult[i] = picked

predictions = []
counter = 1000

for i in range(int(math.ceil(numDocs / (counter * 1.0)))):
    docTestSliceList = docTestList[i*counter:(i+1)*counter]
    numDocsSlice = len(docTestSliceList)
    docTestArray = np.zeros((numDocsSlice, numWords))
    for j, doc in enumerate(docTestSliceList):
        for ind in doc:
            docTestArray[j, ind['term']] = ind['count']
    docTestArraySparse = cusparse.ss.csr_matrix(docTestArray)

    start = time.time()
    OPT_N = numDocsSlice
    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1

    catCount = len(music_categories)
    callResult = np.zeros(numDocsSlice)
    stream = cuda.stream()
    with stream.auto_synchronize():
        probtcArray_d = cuda.to_device(np.asarray(probtcArray), stream)
        priorC_d = cuda.to_device(np.asarray(priorC), stream)
        callResult_d = cuda.to_device(callResult, stream)
        counts_d = cuda.to_device(docTestArraySparse.data, stream)
        indptr_d = cuda.to_device(docTestArraySparse.indptr, stream)
        indices_d = cuda.to_device(docTestArraySparse.indices, stream)
        predict[griddim, blockdim, stream](callResult_d, catCount, numWords, counts_d, indptr_d, indices_d, probtcArray_d, priorC_d)
        callResult_d.to_host(stream)
    #stream.synchronize()

    predictions += list(callResult)
    print "prediction %d: %f" % (i, time.time() - start)

I found out the problem was in the CUDA kernel.
When you call predict, blockdim is set to 1024:
predict[griddim, blockdim, stream](callResult_d, catCount, numWords, counts_d, indptr_d, indices_d, probtcArray_d, priorC_d)
But the kernel is called iteratively on slices of 1000 elements, not 1024, so 24 threads attempt to write out of bounds in the result array.
Passing a number-of-elements parameter (n_el) and adding a bounds check in the kernel solves it.
@cuda.jit(argtypes=(double[:], int64, int64, int64, double[:], int64[:], int64[:], double[:,:], double[:]))
def predict(callResult, n_el, catCount, wordCount, counts, indptr, indices, probtcArray, priorC):
    i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    if i < n_el:
        ....
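For illustration, here is a self-contained sketch of the same guard pattern with a toy kernel. It is written against the current numba.cuda API rather than the old NumbaPro one, and the names scale and n_el are just for this example:

from numba import cuda
import numpy as np
import math

@cuda.jit
def scale(out, n_el):
    i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    if i < n_el:                    # threads beyond the slice length do nothing
        out[i] = i * 2.0

n_el = 1000                         # slice size, as in the question
blockdim = 1024
griddim = int(math.ceil(float(n_el) / blockdim))
out = np.zeros(n_el)
scale[griddim, blockdim](out, n_el)   # 1024 threads launched, only 1000 write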

Related

"argmax_cuda" not implemented for 'Bool'

I am facing this error in Google Colab. I tried other data types, like a bool tensor, but that didn't work. Please help.
Code
def _mask(prev_generated_seq):
    prev_mask = torch.eq(prev_generated_seq, 1)
    lengths = torch.argmax(prev_mask, dim=1)
    #test = torch.max(prev_mask, dim=1)
    #lengths = torch.FloatTensor(test)
    max_len = prev_generated_seq.size(1)
    mask = []
    for i in range(prev_generated_seq.size(0)):
        if lengths[i] == 0:
            mask_line = [0] * max_len
        else:
            mask_line = [0] * lengths[i].item()
            mask_line.extend([1] * (max_len - lengths[i].item()))
        mask.append(mask_line)
    mask = torch.ByteTensor(mask)
    if args.cuda:
        mask = mask.cuda()
    return prev_generated_seq.data.masked_fill_(mask, 0)
Error
File "main.py", line 179, in <module>
train_epoches(abstracts, model, config.epochs, teacher_forcing_ratio=1)
File "main.py", line 155, in train_epoches
target_variables, model, teacher_forcing_ratio)
File "main.py", line 139, in train_batch
prev_generated_seq = _mask(prev_generated_seq)
File "main.py", line 101, in _mask
lengths = torch.argmax(prev_mask,dim=1)
RuntimeError: "argmax_cuda" not implemented for 'Bool'
Your prev_mask is a bool tensor containing True and False values, since you compare prev_generated_seq == 1 via torch.eq(prev_generated_seq, 1). Afterwards you call torch.argmax(prev_mask, dim=1), which is not implemented for bool tensors.
You can look at this example:
import torch
a = torch.zeros((2,2), dtype=torch.bool, device='cuda')
a.argmax(1)
RuntimeError: "argmax_cuda" not implemented for 'Bool'
If your data is on the CPU rather than the GPU, you get the same error, just for the CPU:
RuntimeError: "argmax_cpu" not implemented for 'Bool'
Solution: cast the bool tensor to another type, e.g. long:
prev_mask = torch.eq(prev_generated_seq, 1) * 1
or:
prev_mask = torch.eq(prev_generated_seq, 1).to(torch.long)
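For example, reusing the toy tensor from above, the cast makes the argmax call valid:

import torch

a = torch.zeros((2, 2), dtype=torch.bool, device='cuda')
print(a.to(torch.long).argmax(1))   # tensor([0, 0], device='cuda:0') -- no error after the cast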

Python TypeError: 'float' object cannot be interpreted as an index

The following code uses audio files to create a matrix of features in tensorflow:
import tensorflow as tf

directory = "audio_dataset/*.wav"
filenames = tf.train.match_filenames_once(directory)
init = (tf.global_variables_initializer(), tf.local_variables_initializer())
count_num_files = tf.size(filenames)
filename_queue = tf.train.string_input_producer(filenames)
reader = tf.WholeFileReader()
filename, file_contents = reader.read(filename_queue)

with tf.Session() as sess:
    sess.run(init)
    num_files = sess.run(count_num_files)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    for i in range(num_files):
        audio_file = sess.run(filename)
        print(audio_file)
This is a toolkit that converts audio from the time domain to the frequency domain:
from bregman.suite import *

chromo = tf.placeholder(tf.float32)
max_freqs = tf.argmax(chromo, 0)

def get_next_chromogram(sess):
    audio_file = sess.run(filename)
    F = Chromagram(audio_file, nfft=16384, wfft=8192, nhop=2205)
    return F.X

def extract_feature_vector(sess, chromo_data):
    num_features, num_samples = np.shape(chromo_data)
    freq_vals = sess.run(max_freqs, feed_dict={chromo: chromo_data})
    hist, bins = np.histogram(freq_vals, bins=range(num_features + 1))
    return hist.astype(float) / num_samples

def get_dataset(sess):
    num_files = sess.run(count_num_files)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    xs = []
    for _ in range(num_files):
        chromo_data = get_next_chromogram(sess)
        x = [extract_feature_vector(sess, chromo_data)]
        x = np.matrix(x)
        if len(xs) == 0:
            xs = x
        else:
            xs = np.vstack((xs, x))
    return xs
This clusters the data around two centroids:
k = 2
max_iterations = 100

def initial_cluster_centroids(X, k):
    return X[0:k, :]

def assign_cluster(X, centroids):
    expanded_vectors = tf.expand_dims(X, 0)
    expanded_centroids = tf.expand_dims(centroids, 1)
    distances = tf.reduce_sum(tf.square(tf.subtract(expanded_vectors, expanded_centroids)), 2)
    mins = tf.argmin(distances, 0)
    return mins

def recompute_centroids(X, Y):
    sums = tf.unsorted_segment_sum(X, Y, k)
    counts = tf.unsorted_segment_sum(tf.ones_like(X), Y, k)
    return sums / counts

with tf.Session() as sess:
    sess.run(init)
    X = get_dataset(sess)
    centroids = initial_cluster_centroids(X, k)
    i, converged = 0, False
    while not converged and i < max_iterations:
        i += 1
        Y = assign_cluster(X, centroids)
        centroids = sess.run(recompute_centroids(X, Y))
    print(centroids)
But I'm getting the following traceback:
Traceback (most recent call last):
File "components.py", line 776, in <module>
X = get_dataset(sess)
File "ccomponents.py", line 745, in get_dataset
chromo_data = get_next_chromogram(sess)
File "coffee_components.py", line 728, in get_next_chromogram
F = Chromagram(audio_file, nfft=16384, wfft=8192, nhop=2205)
File "/Volumes/Dados/Documents/Education/Programming/Machine Learning/Manning/book/BregmanToolkit-master/bregman/features.py", line 143, in __init__
Features.__init__(self, arg, feature_params)
File "/Volumes/Dados/Documents/Education/Programming/Machine Learning/Manning/book/BregmanToolkit-master/bregman/features_base.py", line 70, in __init__
self.extract()
File "/Volumes/Dados/Documents/Education/Programming/Machine Learning/Manning/book/BregmanToolkit-master/bregman/features_base.py", line 213, in extract
self.extract_funs.get(f, self._extract_error)()
File "/Volumes/Dados/Documents/Education/Programming/Machine Learning/Manning/book/BregmanToolkit-master/bregman/features_base.py", line 711, in _chroma
if not self._cqft():
File "/Volumes/Dados/Documents/Education/Programming/Machine Learning/Manning/book/BregmanToolkit-master/bregman/features_base.py", line 588, in _cqft
self._make_log_freq_map()
File "/Volumes/Dados/Documents/Education/Programming/Machine Learning/Manning/book/BregmanToolkit-master/bregman/features_base.py", line 353, in _make_log_freq_map
mxnorm = P.empty(self._cqtN) # Normalization coefficients
TypeError: 'float' object cannot be interpreted as an index
As far as I'm concerned, range takes an int and not a float.
Could someone please point out the error here?
The problem is that you're using Python 3, but the Bregman Toolkit was written in Python 2. The error comes from this line:
mxnorm = P.empty(self._cqtN)
self._cqtN is a float. In Python 2, the pylab library accepts floats as input:
pylab.empty(5.0)
__main__:1: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
array([ 0., 0., 0., 0., 0.])
However, in Python 3 it raises the same error you are seeing:
pylab.empty(5.0)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: 'float' object cannot be interpreted as an integer
You should be able to fix this error by editing that line in bregman/features_base.py and casting the argument to an int:
mxnorm = P.empty(int(self._cqtN))
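For illustration (using plain NumPy, which pylab re-exports as pylab.empty), the cast is what makes the call valid under Python 3:

import numpy as np

# np.empty(5.0)                  # Python 3: TypeError: 'float' object cannot be interpreted as an integer
print(np.empty(int(5.0)).shape)  # (5,) -- casting to int first works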
However, I'd be surprised if there weren't any other errors due to the incompatible versions. You might want to try using Python 2 or look for an alternative to the Bregman Toolkit.
You need to cast self._cqtN to int on lines 353 and 357 of features_base.py.
The lines become:
mxnorm = P.empty(int(self._cqtN))
and
for i in P.arange(int(self._cqtN))])

How to specify constraints in pulp dynamically?

I want to check whether my data is linearly separable or not. For that I am using the equations mentioned at this link. Below is the code that I am using:
try:
    import os
    #import random
    import traceback
    import datetime
    #import numpy as np
    import scipy.io as sio
    import pulp

    os.system('cls')
    dicA = sio.loadmat('A1.mat')
    A = dicA.get('A1')
    var = pulp.LpVariable.dicts("var", range(11), pulp.LpContinuous)
    A = A[:, 0:10]
    model = pulp.LpProblem("Data linearly seaparable", pulp.LpMinimize)
    model += 0
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    for i in range(len(A)):
        expr = pulp.LpAffineExpression()
        for j in range(len(A[i])):
            expr += var[j] * A[i][j]
        expr += var[10] <= -1
        model += expr
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    model.solve()
    print(pulp.LpStatus[model.status])
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
except:
    print('exception')
    tb = traceback.format_exc()
    print(tb)
finally:
    print('reached finally')
And here is the output that I am getting:
C:\Users\puneet\Anaconda3\lib\site-packages\pulp\pulp.py:1348: UserWarning: Overwriting previously set objective.
warnings.warn("Overwriting previously set objective.")
2017-08-29 10:06:21
exception
Traceback (most recent call last):
File "C:/Hackerearth Challenge/Machine Learning #3/LInearlySeaparblePulp.py", line 31, in <module>
model.solve()
File "C:\Users\puneet\Anaconda3\lib\site-packages\pulp\pulp.py", line 1664, in solve
status = solver.actualSolve(self, **kwargs)
File "C:\Users\puneet\Anaconda3\lib\site-packages\pulp\solvers.py", line 1362, in actualSolve
return self.solve_CBC(lp, **kwargs)
File "C:\Users\puneet\Anaconda3\lib\site-packages\pulp\solvers.py", line 1384, in solve_CBC
tmpMps, rename = 1)
File "C:\Users\puneet\Anaconda3\lib\site-packages\pulp\pulp.py", line 1484, in writeMPS
f.write(" LO BND %-8s % .12e\n" % (n, v.lowBound))
TypeError: must be real number, not str
reached finally
I am adding 0 to specify that there is no objective function, as mentioned in the link. Also, since there are about 12000 rows in the A variable, I am trying to create the constraints dynamically, but there seems to be some problem with that. What is it that I am doing wrong?
var = pulp.LpVariable.dicts("var",range(11),pulp.LpContinuous)
needs to be
var = pulp.LpVariable.dicts("var",range(11),cat=pulp.LpContinuous)
because the LpVariable.dicts function signature looks like this:
def dicts(self, name, indexs, lowBound = None, upBound = None, cat = LpContinuous, indexStart = []):
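So in the question, pulp.LpContinuous was silently being consumed as lowBound, which is why writeMPS later failed on a string bound. A minimal sketch of the corrected call with a dynamically built constraint, using a tiny 2-column matrix standing in for the real A (names here are illustrative):

import pulp

A = [[1.0, 2.0], [3.0, -1.0]]        # toy stand-in for the 12000-row matrix
var = pulp.LpVariable.dicts("var", range(3), cat=pulp.LpContinuous)

model = pulp.LpProblem("separability_check", pulp.LpMinimize)
model += 0                           # dummy objective: only feasibility matters

for row in A:
    expr = pulp.lpSum(var[j] * row[j] for j in range(len(row)))
    model += expr + var[2] <= -1     # w.x + b <= -1, as in the question's constraints

model.solve()
print(pulp.LpStatus[model.status])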

How to correctly initialize pool worker and run method

Background:
I am trying to get a pool of workers going to solve a task. My issue is that I am trying to pass it a shared variable, but I get an error. I have written an initializer method for the workers that expects my variables, but I can't seem to get it to work.
Here is my code:
from matplotlib import pyplot
import time
import multiprocessing

# initialize some multiprocessing stuff
num_processes = 8
y = multiprocessing.Array('d', 1000, lock=False)
new_y = multiprocessing.Array('d', 1000, lock=False)
dt = multiprocessing.Value('d', 0, lock=False)
y_len = multiprocessing.Value('i', len(y), lock=False)

def init(y_to_share, new_y_to_share):
    global y, new_y
    y = y_to_share
    new_y = new_y_to_share

y[480:520] = [1] * 40
dt.value = 0.01

# our rule for reaction-diffusion
def advance():
    global y, new_y
    n = len(y)
    new_y = list(y)
    for j in xrange(n):
        new_y[j] += dt * (20 * (y[j - 1] - 2 * y[j] + y[(j + 1) % n])
                          - y[j] * (1 - y[j]) * (0.3 - y[j]))
    y = new_y
    return y

# advance through t (t = i * dt) is at least 100; plot
# every 20
chunks = len(y) / num_processes
y_range = range(len(y))
y_range = [y_range[i:i+chunks] for i in range(0, len(y_range), chunks)]

p = multiprocessing.Pool(num_processes, initializer=init, initargs=(y, new_y))
i = 0
start = time.time()
while i * dt.value <= 100:
    if i * dt.value % 20 == 0:
        pyplot.plot(y, label='t = %g' % (i * dt.value))
    arr = p.map(advance, (y, new_y))  # hand in an array of indices
    i += 1
    #print i * dt.value

end = time.time()
elapsed = end - start
print elapsed
pyplot.legend()
pyplot.show()
Edit: posting the actual error.
The error:
runfile('/home/kevin/Downloads/cbb750_parallel_hw/propagating-signal-parallel.py', wdir='/home/kevin/Downloads/cbb750_parallel_hw')
Traceback (most recent call last):
File "<ipython-input-64-5ad3fdf93b59>", line 1, in <module>
runfile('/home/kevin/Downloads/cbb750_parallel_hw/propagating-signal-parallel.py', wdir='/home/kevin/Downloads/cbb750_parallel_hw')
File "/usr/local/lib/python2.7/dist-packages/spyder/utils/site/sitecustomize.py", line 866, in runfile
execfile(filename, namespace)
File "/usr/local/lib/python2.7/dist-packages/spyder/utils/site/sitecustomize.py", line 94, in execfile
builtins.execfile(filename, *where)
File "/home/kevin/Downloads/cbb750_parallel_hw/propagating-signal-parallel.py", line 45, in <module>
arr = p.map(advance, (y, new_y))#hand in an array of indices
File "/usr/lib/python2.7/multiprocessing/pool.py", line 251, in map
return self.map_async(func, iterable, chunksize).get()
File "/usr/lib/python2.7/multiprocessing/pool.py", line 567, in get
raise self._value
PicklingError: Can't pickle <class 'multiprocessing.sharedctypes.c_double_Array_1000'>: attribute lookup multiprocessing.sharedctypes.c_double_Array_1000 failed
Can anyone help me solve this? I am not sure what I am doing wrong, but I would like to use pool.map.
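For comparison, here is a minimal sketch of the initializer/initargs pattern being attempted (illustrative names, assuming a Unix fork start method as in the question's environment): the shared array reaches the workers once through initargs, while map only receives small picklable arguments such as indices.

import multiprocessing

def init(shared_arr):
    # runs once per worker; keep a reference to the inherited shared array
    global arr
    arr = shared_arr

def work(i):
    # workers receive a plain int, never the Array itself, so nothing unusual is pickled
    return arr[i] * 2

if __name__ == '__main__':
    shared = multiprocessing.Array('d', 10, lock=False)
    shared[:] = [float(v) for v in range(10)]
    pool = multiprocessing.Pool(2, initializer=init, initargs=(shared,))
    print(pool.map(work, range(10)))    # [0.0, 2.0, 4.0, ...]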

Scipy minimize erroring for large array of variables

Hi, I am trying to use scipy for optimization. The minimize function with method 'COBYLA' works fine for small array sizes but errors out for larger arrays. I tried the 'COBYLA' and 'SLSQP' methods since I have a constrained optimization problem with non-linear functions.
Code snippet:
import scipy as sp
import scipy.optimize
import random

def mytest7obj(x, x_d, y_d, z_d, power):
    for x_i in x:
        if x_i < 0:
            return 0.
    sum = 0.
    for i in range(x_d):
        for j in range(z_d):
            term = 1.
            for k in range(y_d):
                term *= (x[i*(y_d*z_d) + j*(y_d) + k] ** power[k])
            sum += term
    return 0. - sum

def mytest7():
    x_d = 30
    y_d = 10
    z_d = 100
    goal = 1000000.
    constraints = []
    power = []
    for i in range(y_d):
        power.append(random.uniform(0., 0.3))
    constraints.append({'type': 'ineq', 'fun': lambda x: goal - sum(x)})
    print 'power: %s\n' % (power,)
    result = sp.optimize.minimize(fun=mytest7obj, x0=[30.] * (x_d*y_d*z_d), method='COBYLA',
                                  args=(x_d, y_d, z_d, power), jac=False, constraints=constraints,
                                  options={'disp': True, 'rhobeg': 3., 'maxiter': 10000})
    print 'goal attained: %s' % (sum(result.x),)

if __name__ == "__main__":
    mytest7()
The traceback of the error with method 'COBYLA' is:
Traceback (most recent call last):
File "opt_test.py", line 584, in <module>
print 'mytest7'; mytest7()
File "opt_test.py", line 571, in mytest7
result = sp.optimize.minimize(fun = mytest7obj, x0 = [30.] * (x_d*y_d*z_d), method = 'COBYLA', args = (x_d, y_d, z_d, power), jac=False, constraints=constraints, options={'disp':True, 'rhobeg':3., 'maxiter': 10000})
File "/usr/lib64/python2.7/site-packages/scipy/optimize/_minimize.py", line 385, in minimize
return _minimize_cobyla(fun, x0, args, constraints, **options)
File "/usr/lib64/python2.7/site-packages/scipy/optimize/cobyla.py", line 238, in _minimize_cobyla
dinfo=info)
ValueError: failed to create intent(cache|hide)|optional array-- must have defined dimensions but got (-1594577286,)
With 'SLSQP', the error is:
File "opt_test.py", line 586, in <module>
print 'test'; test()
File "opt_test.py", line 454, in test
x = get_optimal(base, budget, initial_values, x_elas, y_elas, x_history, y_history, constraint_coeffs, opt_method = 'SLSQP')
File "opt_test.py", line 355, in get_optimal
constraints=constraints, options=opts)
File "/usr/lib64/python2.7/site-packages/scipy/optimize/_minimize.py", line 388, in minimize
constraints, **options)
File "/usr/lib64/python2.7/site-packages/scipy/optimize/slsqp.py", line 316, in _minimize_slsqp
w = zeros(len_w)
MemoryError
I am using Python 2.7.5, scipy version 0.14.0rc1, numpy version 1.8.1.
