Related
Apologies first for any improper layout, as I am new to Stack Overflow. My issue is as follows:
I am currently creating a custom module inside an existing CNN using PyTorch. I am doing this as part of my school's research, so I have access to a supercomputer with multiple GPU devices. When I train my model, after running through the first validation set, I run into the following error:
"RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument weight in method wrapper__cudnn_convolution)"
Now, every similar issue I have seen involves people mixing tensors between the CPU and GPU; mine, however, seems to be strictly an issue between different CUDA devices. For some context: when the model is initially loaded, it is wrapped with nn.DataParallel. When I run the training without my custom module, it works fine with multiple GPUs. When I do include the custom module and use just the CPU or a single CUDA device, training goes fine (albeit very slowly) until the device runs out of memory from the number of parameters I have to train.
Here is the code for my custom module:
class DivisiveNormBlock(nn.Module):
    def __init__(self, channel_num=512, size=56, ksize=4):
        super().__init__()
        self.channel_num = channel_num
        self.size = size
        self.ksize = ksize
        scale = 90
        # 512 thetas for a channel, 512 channels; same goes for the others.
        self.theta = torch.nn.Parameter(
            scale * torch.abs(torch.randn(self.channel_num, self.channel_num, device="cuda", requires_grad=True)))
        self.p = torch.nn.Parameter(
            scale * torch.abs(torch.randn(self.channel_num, self.channel_num, device="cuda", requires_grad=True)))
        self.sig = torch.nn.Parameter(
            scale * torch.abs(torch.randn(self.channel_num, self.channel_num, device="cuda", requires_grad=True)))
        self.a = torch.nn.Parameter(
            scale * torch.abs(torch.randn(self.channel_num, self.channel_num, device="cuda", requires_grad=True)))
        self.nI = torch.nn.Parameter(
            torch.abs(torch.randn(self.channel_num, self.channel_num, device="cuda", requires_grad=True)))
        self.nU = torch.nn.Parameter(torch.abs(torch.randn(self.channel_num, device="cuda", requires_grad=True)))
        self.bias = torch.nn.Parameter(torch.abs(torch.randn(self.channel_num, device="cuda", requires_grad=True)))
        self.gaussian_bank = torch.zeros(self.channel_num, self.channel_num, self.ksize * 2 + 1, self.ksize * 2 + 1,
                                         device="cuda")
        self.x = torch.linspace(-self.ksize, self.ksize, self.ksize * 2 + 1, device="cuda")
        self.y = torch.linspace(-self.ksize, self.ksize, self.ksize * 2 + 1, device="cuda")
        self.xv, self.yv = torch.meshgrid(self.x, self.y)
        for i in range(self.channel_num):
            for u in range(self.channel_num):
                self.gaussian_bank[i, u, :, :] = self.get_gaussian(i, u)

    def get_gaussian(self, cc, oc):
        xrot = self.xv * torch.cos(self.theta[cc, oc]) + self.yv * torch.sin(self.theta[cc, oc])
        yrot = -self.xv * torch.sin(self.theta[cc, oc]) + self.yv * torch.cos(self.theta[cc, oc])
        g_kernel = (self.a[cc, oc] /
                    (2 * torch.pi * self.p[cc, oc] * self.sig[cc, oc])) * \
                   torch.exp(-0.5 * ((xrot ** 2) / self.p[cc, oc] ** 2 + (yrot ** 2) / self.sig[cc, oc] ** 2))
        return g_kernel

    def forward(self, x):
        x_test = self.dn_f(x)
        return x_test

    def dn_f(self, x):
        batch_size = x.shape[0]
        under_sum = torch.zeros((self.channel_num, self.size, self.size), device="cuda")
        normalized_channels = torch.zeros((batch_size, self.channel_num, self.size, self.size), device="cuda")
        for b in tqdm(range(batch_size)):
            for i in range(self.channel_num):
                for u in range(self.channel_num):
                    under_sum[u] = self.conv_gauss(torch.pow(x[b, i], self.nI[i, u]), self.gaussian_bank[i, u])
                normalized_channels[b, i] = torch.pow(x[b, i], self.nU[i]) / (
                        torch.pow(self.bias[i], self.nU[i]) + torch.sum(under_sum, 0))
        return normalized_channels

    def conv_gauss(self, x_conv, gauss_conv):
        x_conv = torch.reshape(x_conv, (1, 1, self.size, self.size))
        gauss_conv = torch.reshape(gauss_conv, (1, 1, self.ksize * 2 + 1, self.ksize * 2 + 1))
        p = int((self.ksize * 2) / 2)
        output = F.conv2d(x_conv, gauss_conv, padding=p, stride=1)
        output = torch.reshape(output, (self.size, self.size))
        return output
And here is the full error traceback, with the main error in question at the bottom:
Traceback (most recent call last):
  File "train.py", line 390, in <module>
    fire.Fire(command=FIRE_FLAGS)
  File "/home/andrewc/.local/lib/python3.8/site-packages/fire/core.py", line 141, in Fire
    component_trace = _Fire(component, args, parsed_flag_args, context, name)
  File "/home/andrewc/.local/lib/python3.8/site-packages/fire/core.py", line 466, in _Fire
    component, remaining_args = _CallAndUpdateTrace(
  File "/home/andrewc/.local/lib/python3.8/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
    component = fn(*varargs, **kwargs)
  File "train.py", line 220, in train
    results[validator.name] = validator()
  File "train.py", line 366, in __call__
    output = self.model(inp)
  File "/home/andrewc/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/andrewc/.local/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 168, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/andrewc/.local/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 178, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/andrewc/.local/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
    output.reraise()
  File "/home/andrewc/.local/lib/python3.8/site-packages/torch/_utils.py", line 434, in reraise
    raise exception
RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/home/andrewc/.local/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/andrewc/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/andrewc/.local/lib/python3.8/site-packages/torch/nn/modules/container.py", line 141, in forward
    input = module(input)
  File "/home/andrewc/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/andrewc/Surround-Suppresion-for-VOneNet/modules.py", line 177, in forward
    x_test = self.dn_f(x)
  File "/home/andrewc/Surround-Suppresion-for-VOneNet/modules.py", line 190, in dn_f
    under_sum[u] = self.conv_gauss(torch.pow(x[b, i], self.nI[i, u]), self.gaussian_bank[i, u])
  File "/home/andrewc/Surround-Suppresion-for-VOneNet/modules.py", line 200, in conv_gauss
    output = F.conv2d(x_conv, gauss_conv, padding=p, stride=1)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument weight in method wrapper__cudnn_convolution)
It seems like it may have to do with the convolution at the bottom of the module. Any help on how to get this to work with multiple GPUs is greatly appreciated!
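The usual culprit with nn.DataParallel is creating tensors with a hard-coded device="cuda" in __init__: parameters and registered buffers are copied to each replica's GPU, but plain tensor attributes stay on cuda:0, so replica 1 mixes cuda:0 and cuda:1 tensors. A minimal sketch of the device-agnostic pattern (illustrative names only, and not a verified fix for this exact model):
import torch
import torch.nn as nn

class DeviceAgnosticBlock(nn.Module):
    def __init__(self, channel_num=512, ksize=4):
        super().__init__()
        # Parameters are created on the CPU; DataParallel copies them to
        # the correct GPU for each replica.
        self.theta = nn.Parameter(torch.abs(torch.randn(channel_num, channel_num)))
        # register_buffer makes a non-trainable tensor move with the module,
        # unlike a plain attribute created with device="cuda".
        self.register_buffer("gaussian_bank",
                             torch.zeros(channel_num, channel_num,
                                         ksize * 2 + 1, ksize * 2 + 1))

    def forward(self, x):
        # Tensors created inside forward should follow the input's device.
        scratch = torch.zeros_like(x)  # zeros_like inherits x's device
        return x + scratch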
I am trying to run the command python run.py --mode MLE
and I get the following error. I have not been able to find the correct solution for it.
Traceback (most recent call last):
  File "run.py", line 208, in <module>
    train_MLE()
  File "run.py", line 94, in train_MLE
    encoder_input, decoder_input, weight = model.get_batch(d_valid, i)
  File "C:\Users\Kriti Gupta\Desktop\GitHub_repo\Seq2seq-Chatbot-With-Deep-Reinforcement-Learning\seq2seq_model.py", line 342, in get_batch
    encoder_input, decoder_input = random.choice(data[bucket_id])
  File "C:\Users\Kriti Gupta\AppData\Local\Programs\Python\Python37\lib\random.py", line 261, in choice
    raise IndexError('Cannot choose from an empty sequence') from None
IndexError: Cannot choose from an empty sequence
Below is the code that contains the function:
def get_batch(self, data, bucket_id, rand=True, order=False):
    # data should be [whole_data_length x (source, target)]
    # decoder_input should contain "GO" symbol and target should contain "EOS" symbol
    encoder_size, decoder_size = self.buckets[bucket_id]
    encoder_inputs, decoder_inputs = [], []
    #print(bucket_id)
    print(random.choice(data[bucket_id]))
    encoder_input, decoder_input = random.choice(data[bucket_id])
    c = 0
    for i in xrange(self.batch_size):
        if rand:
            encoder_input, decoder_input = random.choice(data[bucket_id])
        if order:
            encoder_input, decoder_input = data[bucket_id][i]
        c += 1
Please help!!
random.choice always raises IndexError on an empty sequence, so I would suggest checking the data you are passing to get_batch().
You can also add a check inside get_batch() to verify that the data passed in is not empty, as sketched below.
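For example, a minimal guard (choose_pair is a hypothetical helper; this assumes data[bucket_id] is a list of (source, target) pairs):
import random

def choose_pair(bucket):
    # random.choice raises IndexError on an empty sequence, so verify
    # the bucket has data before sampling from it.
    if not bucket:
        raise ValueError("bucket is empty; check the data passed to get_batch()")
    return random.choice(bucket)

# inside get_batch:
# encoder_input, decoder_input = choose_pair(data[bucket_id])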
Reference:
Python Bug Tracker
I am trying to run the following code:
import matplotlib.pylab as plt
import numpy as np
import torch
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, input_shape, n_actions):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(input_shape, 12)
        self.hidden2tag = nn.Linear(12, n_actions)

    def forward(self, x):
        out = self.lstm(x)
        out = self.hidden2tag(out)
        return out
state = [(1,2,3,4,5),(2,3,4,5,6),(3,4,5,6,7),(4,5,6,7,8),(5,6,7,8,9),(6,7,8,9,0)]
device = torch.device("cuda")
net = LSTM(5, 3).to(device)
state_v = torch.FloatTensor(state).to(device)
q_vals_v = net(state_v.view(1, state_v.shape[0], state_v.shape[1]))
_, action = int(torch.max(q_vals_v, dim=1).item())
And that returns this error:
Traceback (most recent call last):
  File "/home/dikkerj/Documents/PycharmProjects/LSTMReactor/QuestionStackoverflow.py", line 26, in <module>
    q_vals_v = net(state_v.view(1, state_v.shape[0], state_v.shape[1]))
  File "/home/dikkerj/.local/lib/python3.5/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/dikkerj/Documents/PycharmProjects/LSTMReactor/QuestionStackoverflow.py", line 15, in forward
    out = self.hidden2tag(out)
  File "/home/dikkerj/.local/lib/python3.5/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/dikkerj/.local/lib/python3.5/site-packages/torch/nn/modules/linear.py", line 55, in forward
    return F.linear(input, self.weight, self.bias)
  File "/home/dikkerj/.local/lib/python3.5/site-packages/torch/nn/functional.py", line 1022, in linear
    if input.dim() == 2 and bias is not None:
AttributeError: 'tuple' object has no attribute 'dim'
Does anyone know how to fix this (to get rid of the tensor being a tuple so that it can be fed into the LSTM network)?
The PyTorch LSTM returns a tuple, so you get this error because your linear layer self.hidden2tag cannot handle this tuple.
So change:
out = self.lstm(x)
to
out, states = self.lstm(x)
This will fix your error by splitting up the tuple so that out is just your output tensor.
out then stores the output (the hidden state at every time step), while states is another tuple that contains the last hidden and cell state.
You can also take a look here:
https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
You will get another error for the last line, as max() returns a tuple as well. But this should be easy to fix and is yet a different error :)
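A minimal sketch of that second fix (assuming you want the action with the highest Q-value; the exact dim and indexing here are illustrative):
# torch.max with a dim argument returns a (values, indices) tuple
values, indices = torch.max(q_vals_v, dim=-1)
action = int(indices[0, -1].item())  # e.g. the argmax for the last time step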
Transform your state into a NumPy array first:
state = np.array(state)
PyTorch is probably missing an np.asarray in their API.
The following code uses audio files to create a matrix of features in TensorFlow:
import tensorflow as tf

directory = "audio_dataset/*.wav"
filenames = tf.train.match_filenames_once(directory)
init = (tf.global_variables_initializer(), tf.local_variables_initializer())
count_num_files = tf.size(filenames)
filename_queue = tf.train.string_input_producer(filenames)
reader = tf.WholeFileReader()
filename, file_contents = reader.read(filename_queue)

with tf.Session() as sess:
    sess.run(init)
    num_files = sess.run(count_num_files)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    for i in range(num_files):
        audio_file = sess.run(filename)
        print(audio_file)
This uses a toolkit that converts audio from the time domain to the frequency domain:
from bregman.suite import *

chromo = tf.placeholder(tf.float32)
max_freqs = tf.argmax(chromo, 0)

def get_next_chromogram(sess):
    audio_file = sess.run(filename)
    F = Chromagram(audio_file, nfft=16384, wfft=8192, nhop=2205)
    return F.X

def extract_feature_vector(sess, chromo_data):
    num_features, num_samples = np.shape(chromo_data)
    freq_vals = sess.run(max_freqs, feed_dict={chromo: chromo_data})
    hist, bins = np.histogram(freq_vals, bins=range(num_features + 1))
    return hist.astype(float) / num_samples

def get_dataset(sess):
    num_files = sess.run(count_num_files)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    xs = []
    for _ in range(num_files):
        chromo_data = get_next_chromogram(sess)
        x = [extract_feature_vector(sess, chromo_data)]
        x = np.matrix(x)
        if len(xs) == 0:
            xs = x
        else:
            xs = np.vstack((xs, x))
    return xs
This clusters the data around two centroids:
k = 2
max_iterations = 100

def initial_cluster_centroids(X, k):
    return X[0:k, :]

def assign_cluster(X, centroids):
    expanded_vectors = tf.expand_dims(X, 0)
    expanded_centroids = tf.expand_dims(centroids, 1)
    distances = tf.reduce_sum(tf.square(tf.subtract(expanded_vectors, expanded_centroids)), 2)
    mins = tf.argmin(distances, 0)
    return mins

def recompute_centroids(X, Y):
    sums = tf.unsorted_segment_sum(X, Y, k)
    counts = tf.unsorted_segment_sum(tf.ones_like(X), Y, k)
    return sums / counts

with tf.Session() as sess:
    sess.run(init)
    X = get_dataset(sess)
    centroids = initial_cluster_centroids(X, k)
    i, converged = 0, False
    while not converged and i < max_iterations:
        i += 1
        Y = assign_cluster(X, centroids)
        centroids = sess.run(recompute_centroids(X, Y))
    print(centroids)
But I'm getting the following traceback:
Traceback (most recent call last):
  File "components.py", line 776, in <module>
    X = get_dataset(sess)
  File "ccomponents.py", line 745, in get_dataset
    chromo_data = get_next_chromogram(sess)
  File "coffee_components.py", line 728, in get_next_chromogram
    F = Chromagram(audio_file, nfft=16384, wfft=8192, nhop=2205)
  File "/Volumes/Dados/Documents/Education/Programming/Machine Learning/Manning/book/BregmanToolkit-master/bregman/features.py", line 143, in __init__
    Features.__init__(self, arg, feature_params)
  File "/Volumes/Dados/Documents/Education/Programming/Machine Learning/Manning/book/BregmanToolkit-master/bregman/features_base.py", line 70, in __init__
    self.extract()
  File "/Volumes/Dados/Documents/Education/Programming/Machine Learning/Manning/book/BregmanToolkit-master/bregman/features_base.py", line 213, in extract
    self.extract_funs.get(f, self._extract_error)()
  File "/Volumes/Dados/Documents/Education/Programming/Machine Learning/Manning/book/BregmanToolkit-master/bregman/features_base.py", line 711, in _chroma
    if not self._cqft():
  File "/Volumes/Dados/Documents/Education/Programming/Machine Learning/Manning/book/BregmanToolkit-master/bregman/features_base.py", line 588, in _cqft
    self._make_log_freq_map()
  File "/Volumes/Dados/Documents/Education/Programming/Machine Learning/Manning/book/BregmanToolkit-master/bregman/features_base.py", line 353, in _make_log_freq_map
    mxnorm = P.empty(self._cqtN)  # Normalization coefficients
TypeError: 'float' object cannot be interpreted as an index
As far as I'm concerned, range takes an int and not a float.
Could someone please point out the error here?
The problem is that you're using Python 3, but the Bregman Toolkit was written in Python 2. The error comes from this line:
mxnorm = P.empty(self._cqtN)
self._cqtN is a float. In Python 2, the pylab library accepts floats as input:
pylab.empty(5.0)
__main__:1: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
array([ 0., 0., 0., 0., 0.])
However, in Python 3 you get the error you're seeing:
pylab.empty(5.0)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: 'float' object cannot be interpreted as an integer
You should be able to fix this error by just editing the line in the file I linked above and casting it to an int:
mxnorm = P.empty(int(self._cqtN))
However, I'd be surprised if there weren't any other errors due to the incompatible versions. You might want to try using Python 2 or look for an alternative to the Bregman Toolkit.
You need to cast self._cqtN to int on lines 353 and 357 in features_base.py.
They are:
mxnorm = P.empty(int(self._cqtN))
and
for i in P.arange(int(self._cqtN))])
I am trying to run a CUDA kernel in NumbaPro Python, but I keep getting an out-of-resources error.
I then tried executing the kernel in a loop and sending smaller arrays, but that still gave me the same error.
Here is my error message:
Traceback (most recent call last):
  File "./predict.py", line 418, in <module>
    predict[griddim, blockdim, stream](callResult_d, catCount, numWords, counts_d, indptr_d, indices_d, probtcArray_d, priorC_d)
  File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/compiler.py", line 228, in __call__
    sharedmem=self.sharedmem)
  File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/compiler.py", line 268, in _kernel_call
    cu_func(*args)
  File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.py", line 1044, in __call__
    self.sharedmem, streamhandle, args)
  File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.py", line 1088, in launch_kernel
    None)
  File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.py", line 215, in safe_cuda_api_call
    self._check_error(fname, retcode)
  File "/home/mhagen/Developer/anaconda/lib/python2.7/site-packages/numba/cuda/cudadrv/driver.py", line 245, in _check_error
    raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: Call to cuLaunchKernel results in CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES
Here is my source code:
import math
import time
import numpy
import numpy as np

from numbapro.cudalib import cusparse
from numba import *
from numbapro import cuda

@cuda.jit(argtypes=(double[:], int64, int64, double[:], int64[:], int64[:], double[:,:], double[:]))
def predict(callResult, catCount, wordCount, counts, indptr, indices, probtcArray, priorC):
    i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    correct = 0
    wrong = 0
    lastDocIndex = -1
    maxProb = -1e6
    picked = -1
    for cat in range(catCount):
        probSum = 0.0
        for j in range(indptr[i], indptr[i+1]):
            wordIndex = indices[j]
            probSum += (counts[j] * math.log(probtcArray[cat, wordIndex]))
        probSum += math.log(priorC[cat])
        if probSum > maxProb:
            maxProb = probSum
            picked = cat
    callResult[i] = picked
predictions = []
counter = 1000

for i in range(int(math.ceil(numDocs/(counter*1.0)))):
    docTestSliceList = docTestList[i*counter:(i+1)*counter]
    numDocsSlice = len(docTestSliceList)
    docTestArray = np.zeros((numDocsSlice, numWords))
    for j, doc in enumerate(docTestSliceList):
        for ind in doc:
            docTestArray[j, ind['term']] = ind['count']
    docTestArraySparse = cusparse.ss.csr_matrix(docTestArray)

    start = time.time()
    OPT_N = numDocsSlice
    blockdim = 1024, 1
    griddim = int(math.ceil(float(OPT_N)/blockdim[0])), 1

    catCount = len(music_categories)
    callResult = np.zeros(numDocsSlice)
    stream = cuda.stream()
    with stream.auto_synchronize():
        probtcArray_d = cuda.to_device(numpy.asarray(probtcArray), stream)
        priorC_d = cuda.to_device(numpy.asarray(priorC), stream)
        callResult_d = cuda.to_device(callResult, stream)
        counts_d = cuda.to_device(docTestArraySparse.data, stream)
        indptr_d = cuda.to_device(docTestArraySparse.indptr, stream)
        indices_d = cuda.to_device(docTestArraySparse.indices, stream)
        predict[griddim, blockdim, stream](callResult_d, catCount, numWords, counts_d, indptr_d, indices_d, probtcArray_d, priorC_d)
        callResult_d.to_host(stream)
    #stream.synchronize()
    predictions += list(callResult)
    print "prediction %d: %f" % (i, time.time()-start)
I found out the problem was in the CUDA procedure.
When you call predict, the blockdim is set to 1024:
predict[griddim, blockdim, stream](callResult_d, catCount, numWords, counts_d, indptr_d, indices_d, probtcArray_d, priorC_d)
But the procedure is called iteratively with slice sizes of 1000 elements, not 1024.
So, in the procedure, it will attempt to write 24 elements that are out of bounds in the return array.
Passing the number of elements as a parameter (n_el) and placing a bounds check in the CUDA procedure solves it.
@cuda.jit(argtypes=(double[:], int64, int64, int64, double[:], int64[:], int64[:], double[:,:], double[:]))
def predict(callResult, n_el, catCount, wordCount, counts, indptr, indices, probtcArray, priorC):
    i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    if i < n_el:
        ....
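For illustration, here is a minimal self-contained version of the same guard pattern written against the current numba.cuda API (the kernel and all names here are hypothetical, not from the post above):
import math
import numpy as np
from numba import cuda

@cuda.jit
def scale_kernel(out, n_el):
    # One thread per element; threads with i >= n_el do nothing, which
    # prevents out-of-bounds writes when the grid is rounded up to a
    # multiple of the block size.
    i = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    if i < n_el:
        out[i] = i * 2.0

n_el = 1000
blockdim = 256
griddim = math.ceil(n_el / blockdim)  # 4 blocks -> 1024 threads for 1000 elements
out = np.zeros(n_el)
scale_kernel[griddim, blockdim](out, n_el)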