I'm just getting started with PyTorch and wanted to run through a few toy problems. In the following case, I'm noticing a significant difference in how long it takes the model to complete training and issue one batch of predictions.
This is the PyTorch implementation. On the GPU, it takes ~17 seconds on my machine. The same model on the CPU takes ~11 seconds.
class LR(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(2, 20)
        self.linear2 = torch.nn.Linear(20, 1)

    def forward(self, x):
        x = torch.nn.functional.relu(self.linear1(x))
        x = torch.nn.functional.relu(self.linear2(x))
        return x
def fit_torch(df_train, df_test):
    sampler_tr = torch.utils.data.SubsetRandomSampler(df_train.index)
    train = torch.utils.data.DataLoader(
        torch.tensor(df_train.values, dtype=torch.float),
        batch_size=batch_size, sampler=sampler_tr)
    sampler_te = torch.utils.data.SubsetRandomSampler(df_test.index)
    test = torch.utils.data.DataLoader(
        torch.tensor(df_test.values, dtype=torch.float),
        batch_size=batch_size, sampler=sampler_te)

    model = LR()
    model = model.to(device)

    loss = torch.nn.MSELoss()
    optim = torch.optim.Adam(model.parameters(), lr=0.001)

    model.train()
    for _ in range(1000):
        for train_data in train:
            train_data = train_data.to(device)
            x_train = train_data[:, :2]
            y_train = train_data[:, 2]
            optim.zero_grad()
            pred = model(x_train)
            loss_val = loss(pred.squeeze(), y_train)
            loss_val.backward()
            optim.step()

    model.eval()
    with torch.no_grad():
        for test_data in test:
            test_data = test_data.to(device)
            pred = model(test_data[:, :2].float())
            break
This is the Keras implementation. It takes approximately 9 seconds to run.
def fit_tf(df_train, df_test):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(20, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='relu'))
    model.compile(loss='mse', optimizer='adam')
    model.fit(
        df_train.values[:, :2],
        df_train.values[:, 2],
        batch_size=batch_size, epochs=1000, verbose=0)
    model.predict(df_test.iloc[:batch_size].values[:, :2])
The dataset and main functions.
device = torch.device('cuda:0')
scaler = MinMaxScaler()
batch_size = 64
def create_dataset():
    dataset = []
    random_x = np.random.randint(10, 1000, 1000)
    random_y = np.random.randint(10, 1000, 1000)
    for x, y in zip(random_x, random_y):
        dataset.append((x, y, 4 * x + 3 * y + 10))
    np.random.shuffle(dataset)
    df = pd.DataFrame(dataset)
    df = pd.DataFrame(scaler.fit_transform(df))
    return df

def __main__():
    df = create_dataset()
    df_train, df_test = train_test_split(df)
    start_time = time.time()
    fit_tf(df_train.reset_index(drop=True), df_test.reset_index(drop=True))
    print(time.time() - start_time)
PyTorch uses a dynamic computational graph by default, which is more flexible while you are developing a network, since errors surface eagerly and give more straightforward debugging messages. TensorFlow, in contrast, builds a static computational graph, which is why you compile the model before using it. The compiler can optimize your model, but the trade-off is that the network becomes harder to debug. This can cause a minor performance difference between the two frameworks, but it should not be a big deal.
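As a small illustration of that difference (my own sketch, not from the original answer): in TensorFlow 2.x, wrapping a Python function in tf.function traces it into a static graph that can be optimized, while plain Python functions over tensors run eagerly, much like PyTorch.

import tensorflow as tf

def eager_square(x):
    # eager mode: each op runs immediately, so errors point at the exact Python line
    return x * x

@tf.function
def graph_square(x):
    # tf.function traces this code into a static graph on the first call,
    # which TensorFlow can optimize and re-run without Python overhead
    return x * x

x = tf.constant(3.0)
print(eager_square(x))  # runs op by op
print(graph_square(x))  # same result, via the compiled graph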
Since your network is quite small, the overhead of copying data between CPU and GPU memory and of initializing the CUDA subsystem outweighs any speedup from the GPU. If you try a more complex network such as AlexNet, ResNet, or even GoogLeNet, the benefit will be much more obvious.
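One way to shrink that overhead for a toy problem like this (a sketch of my own, reusing model, loss, optim, batch_size, and device from fit_torch above) is to move the whole training tensor to the GPU once, so each mini-batch avoids a separate host-to-device copy:

# keep the entire (small) dataset resident on the GPU
train_tensor = torch.tensor(df_train.values, dtype=torch.float).to(device)

model.train()
for _ in range(1000):
    # shuffle indices on the GPU instead of using a CPU-side sampler
    perm = torch.randperm(train_tensor.size(0), device=device)
    for start in range(0, train_tensor.size(0), batch_size):
        batch = train_tensor[perm[start:start + batch_size]]
        x_train, y_train = batch[:, :2], batch[:, 2]
        optim.zero_grad()
        loss_val = loss(model(x_train).squeeze(), y_train)
        loss_val.backward()
        optim.step()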
Related
I am trying to make an MLP classifier in PyTorch. The error is produced by the code in the final chunk. I'm not sure why numpy is even involved with this; can someone please point me in the right direction?
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data = ImageFolder(data_dir, transform=transforms.Compose([transforms.Resize((224,224)),transforms.ToTensor()]))
trainloader = torch.utils.data.DataLoader(data, batch_size=600,shuffle=True, num_workers=2)
testloader = torch.utils.data.DataLoader(data, batch_size=150,shuffle=True, num_workers=2)
dataiter = iter(trainloader)
x_train, y_train = dataiter.next()
x_train = x_train.view(600,-1).to('cpu').to(device)
y_train = y_train.to('cpu').to(device)
class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = torch.nn.Linear(150528, 9408)
        self.layer2 = torch.nn.Linear(9408, 3)

    def forward(self, x):
        # here we define the (forward) computational graph,
        # in terms of the tensors, and elt-wise non-linearities
        x = F.relu(self.layer1(x))
        x = self.layer2(x)
        return x
def train_show(network, data, targ, lossFunc, optimiser, epochs):
    lossHistory = []  # just to show a plot later...
    accuHistory = []

    for t in range(epochs):
        optimiser.zero_grad()       # Gradients accumulate by default, so don't forget to do this.
        y = network.forward(data)   # the forward pass
        loss = lossFunc(y, targ)    # recompute the loss
        loss.backward()             # runs autograd, to get the gradients needed by optimiser
        optimiser.step()            # take a step

        # just housekeeping and reporting
        accuracy = torch.mean((torch.argmax(y, dim=1) == targ).float())
        lossHistory.append(loss.detach().item())
        accuHistory.append(accuracy.detach())

    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(lossHistory, 'r'); plt.title("loss"); plt.xlabel("epochs")
    plt.subplot(1, 2, 2)
    plt.plot(accuHistory, 'b'); plt.title("accuracy")
net = Net().to('cpu').to(device)
lossFunction = torch.nn.CrossEntropyLoss().to('cpu').to(device)
optimiser = torch.optim.SGD(net.parameters(), lr=0.01)
train_show(net, x_train, y_train, lossFunction, optimiser, 50)
The plotting function you are using, plt.plot, works on numpy arrays, not on torch tensors. accuHistory is therefore being converted to a numpy array, and that conversion is what fails.
Please see this answer for more details.
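A minimal fix along those lines (a sketch, keeping the rest of train_show as above) is to move the values to the CPU and convert them to plain Python floats before they reach matplotlib:

# inside the epoch loop of train_show:
lossHistory.append(loss.detach().cpu().item())      # .item() yields a plain float
accuHistory.append(accuracy.detach().cpu().item())  # move off the GPU before plotting

# plt.plot(lossHistory) and plt.plot(accuHistory) now receive lists of floats,
# so no tensor-to-numpy conversion is attempted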
I'm building a model for image deblurring based on the model described in this paper using Keras. I train the model on Colab using the following training code:
x_train, y_train = load_h5_dataset()
def train(batch_size=16, epoch_num=5, critic_updates=5, log_dir='drive/MyDrive/train_logs'):
    g = make_resnet_generator_model()
    d = make_discriminator_model()
    gan = make_gan(g, d)

    d_opt = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
    gan_opt = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-8)

    d.trainable = True
    d.compile(optimizer=d_opt, loss=wasserstein_loss)
    d.trainable = False
    loss = [perceptual_loss, wasserstein_loss]
    loss_weights = [100, 1]
    gan.compile(optimizer=gan_opt, loss=loss, loss_weights=loss_weights)
    d.trainable = True

    output_true_batch, output_false_batch = np.ones((batch_size, 1)), -np.ones((batch_size, 1))
    writer = tf.summary.create_file_writer(log_dir)

    for epoch in tqdm(range(epoch_num)):
        print(f"Epoch {epoch + 1}/{epoch_num}...")
        permuted_indexes = np.random.permutation(x_train.shape[0])
        d_losses = []
        gan_losses = []
        x_train = dataset['sharp_img']
        for index in range(int(x_train.shape[0] / batch_size)):
            batch_indexes = permuted_indexes[index * batch_size:(index + 1) * batch_size]
            image_blur_batch = x_train[batch_indexes]
            image_full_batch = y_train[batch_indexes]
            generated_images = g.predict(x=image_blur_batch, batch_size=batch_size)

            for _ in range(critic_updates):
                d_loss_real = d.train_on_batch(image_full_batch, output_true_batch)
                d_loss_fake = d.train_on_batch(generated_images, output_false_batch)
                d_loss = 0.5 * np.add(d_loss_fake, d_loss_real)
                d_losses.append(d_loss)

            d.trainable = False
            gan_loss = gan.train_on_batch(image_blur_batch, [image_full_batch, output_true_batch])
            gan_losses.append(gan_loss)
            d.trainable = True

        write_logs(writer, ['d_loss', 'gan_loss'], [np.mean(d_losses), np.mean(gan_losses)], epoch)
        save_weights(d, g, epoch, int(np.mean(gan_losses)))
In the training code above, the perceptual loss is calculated using a VGG16 network, pretrained on ImageNet. The function load_h5_dataset() is used to load a dataset saved as a .hdf5 file. I encounter two problems when executing this code:
When I run it on Colab, it keeps running out of RAM and the execution stops. However, the dataset is only 6GB, well below Colab's available RAM.
When I run this code on my local machine (which has 16GB of RAM and a NVIDIA GeForce GTX 1660 Ti with 6GB capacity), I encounter this error: tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[16,256,128,128] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Conv2D]
Can someone have a look at my code and see what's going wrong here? Thank you very much.
Can you check this issue: https://github.com/tensorflow/models/issues/1993
You can also do
del whatevervariable
and then the RAM will be freed.
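A sketch of that idea applied to the training loop above (my own illustration, using the variable names from the question): drop the large per-batch arrays once they have been used and ask the garbage collector to reclaim them.

import gc

# at the end of each batch iteration, after the discriminator/GAN updates:
del generated_images, image_blur_batch, image_full_batch  # large arrays no longer needed
gc.collect()  # reclaim the memory now rather than waiting for the next collection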
As a trial, I'm implementing Xception in TensorFlow to classify images without using pretrained weights.
However, the accuracy is too low compared to the original paper.
Could somebody share any advice on how to address this problem?
I prepared 500 of the 1000 ImageNet classes and trained the ready-made Xception model on this data from scratch.
I used the same learning rate and optimizer as in the original paper:
– Optimizer: SGD
– Momentum: 0.9
– Initial learning rate: 0.045
– Learning rate decay: decay of rate 0.94 every 2 epochs
However, this did not work well.
I know it would be better to use all 1000 classes rather than only 500, but I couldn't prepare the storage for it.
Could that have affected the performance of my code?
Here is my code.
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers, losses, models, optimizers, callbacks, applications, preprocessing
# scheduler
def scheduler(epoch, lr):
    return 0.045 * 0.94 ** (epoch / 2.0)

lr_decay = callbacks.LearningRateScheduler(scheduler)

# early stopping
EarlyStopping = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=500, verbose=0, mode='auto', restore_best_weights=True)

# build xception
inputs = tf.keras.Input(shape=(224, 224, 3))
x = tf.cast(inputs, tf.float32)
x = tf.keras.applications.xception.preprocess_input(x)  # preprocess image
x = applications.xception.Xception(weights=None, include_top=False,)(x, training=True)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dense(nb_class)(x)
outputs = layers.Softmax()(x)
model = tf.keras.Model(inputs, outputs)

model.compile(optimizer=optimizers.SGD(momentum=0.9, nesterov=True),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# fitting data
history = model.fit(image_gen(df_train_chunk, 224, 224, ),  # feed images with a generator
                    batch_size=32,
                    steps_per_epoch=64,
                    epochs=1000000000,
                    validation_data=image_gen(df_valid_chunk, 224, 224, ),  # feed images with a generator
                    validation_steps=64,
                    callbacks=[lr_decay, EarlyStopping],
                    )
My results are below. In the original paper, accuracy reached around 0.8; in contrast, my model's performance is far lower.
P.S.
Some might wonder whether my generator is wrong, so I include my generator code and its output below.
from PIL import Image, ImageEnhance, ImageOps

def image_gen(df_data, h, w, shuffle=True):
    nb_class = len(np.unique(df_data['Class']))
    while True:
        if shuffle:
            df_data = df_data.sample(frac=1)
        for i in range(len(df_data)):
            X = Image.open((df_data.iloc[i]).loc['Path'])
            X = X.convert('RGB')
            X = X.resize((w, h))
            X = preprocessing.image.img_to_array(X)
            X = np.expand_dims(X, axis=0)

            klass = (df_data.iloc[i]).loc['Class']
            y = np.zeros(nb_class)
            y[klass] = 1
            y = np.expand_dims(y, axis=0)

            yield X, y

train_gen = image_gen(df_train_chunk, 224, 224, )
for i in range(5):
    X, y = next(train_gen)
    print('\n\n class: ', y.argmax(-1))
    display(Image.fromarray(X.squeeze(0).astype(np.uint8)))
The result is below.
When you chose only 500 classes, did you choose the first 500? The softmax output indexes classes starting from 0, so make sure your labels also run from 0 to 499.
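A small sketch of that remapping (my own illustration, assuming the df_train_chunk / df_valid_chunk frames and the 'Class' column used by the generator above): map whichever 500 ImageNet class IDs were kept onto a contiguous 0-499 range before one-hot encoding.

import numpy as np

# build a mapping from the original class IDs to 0..499
unique_classes = np.sort(df_train_chunk['Class'].unique())
class_to_index = {c: i for i, c in enumerate(unique_classes)}

# apply it to both the training and validation frames
df_train_chunk['Class'] = df_train_chunk['Class'].map(class_to_index)
df_valid_chunk['Class'] = df_valid_chunk['Class'].map(class_to_index)

# y[klass] = 1 in image_gen now indexes a valid slot of the length-500 one-hot vector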
I am training on a large dataset with Keras. I have about 500,000 npy files (2x 240000) that are loaded batch-wise with a data generator. During training, GPU utilization is high at the very beginning, then it settles at 0%, occasionally spiking to 20-30%.
When testing the generator in a for-loop, it shows the same tendency: it works perfectly for a short time at the beginning, then suddenly slows down considerably (approximately 100 times slower).
I have narrowed it down to np.load(). As suggested in other relevant posts, I have tried gc.collect() and opening and closing the file for each load. Neither solved the problem.
I would like to avoid rebuilding the dataset.
I'm quite new to this; does anyone have suggestions for how to improve the performance of the data generator?
The data generator:
def data_generator(train=True, batch_size=BATCH_SIZE, dim_x=(300, 69), dim_y=(300, 66)):
    if train:
        filenames = x_train_fn
    else:
        filenames = x_val_fn

    batch_i = 0
    while True:
        if batch_i >= (len(filenames) // batch_size):
            batch_i = 0
            np.random.shuffle(filenames)

        file_chunk = filenames[batch_i * batch_size:(batch_i + 1) * batch_size]
        X = np.zeros((batch_size, *dim_x))
        y = np.zeros((batch_size, *dim_y))
        for i, ID in enumerate(file_chunk):
            X[i,] = np.load(data_dir_x + ID)
            y[i,] = np.load(data_dir_y + ID.replace("x", "y"))
        yield X, y
        batch_i += 1
train_dataset = tf.data.Dataset.from_generator(lambda: data_generator(train=True),(tf.float32, tf.float32))
validation_dataset = tf.data.Dataset.from_generator(lambda: data_generator(train=False), (tf.float32, tf.float32))
train_dataset = train_dataset.prefetch(buffer_size = tf.data.experimental.AUTOTUNE)
validation_dataset = validation_dataset.prefetch(buffer_size = tf.data.experimental.AUTOTUNE)
The model used for training:
model = keras.models.Sequential()
model.add(keras.layers.Input(shape=(SEQ_LEN,INPUT_DIMS)))
model.add(keras.layers.LSTM(HIDDEN_UNITS1, return_sequences=True, dropout=d, name= 'lstm1'))
model.add(keras.layers.LSTM(HIDDEN_UNITS2, return_sequences=True, dropout=d,name= 'lstm2'))
model.add(keras.layers.LSTM(HIDDEN_UNITS3, return_sequences=True, dropout=d, name= 'lstm3'))
model.add(keras.layers.TimeDistributed(mdn.MDN(OUTPUT_DIMS, N_MIXES, name='mdn_outputs'),name='td_mdn'))
model.compile(loss=mdn.get_mixture_loss_func(OUTPUT_DIMS,N_MIXES), optimizer=opt)
model.summary()
model.fit(train_dataset,
          steps_per_epoch=int(num_train_ex // BATCH_SIZE),
          epochs=EPOCHS,
          verbose=1,
          validation_data=validation_dataset,
          validation_steps=int(num_val_ex // BATCH_SIZE),
          callbacks=callbacks, use_multiprocessing=True, workers=8)
Have been using this for-loop for testing the generator:
dataset_check = tf.data.Dataset.from_generator(lambda: data_generator(train=True), (tf.float32, tf.float32))
for x, y in dataset_check.take(1000):
    print('ok')
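For reference, a minimal benchmark sketch (not from the original post, and assuming the data_dir_x and x_train_fn names used above) that isolates np.load from the rest of the pipeline, to check whether file loading itself is what slows down:

import time
import numpy as np

# time np.load alone, reading files in the same order the generator would
t0 = time.time()
for k, fn in enumerate(x_train_fn[:5000]):
    _ = np.load(data_dir_x + fn)
    if k > 0 and k % 1000 == 0:
        print(f"{k} files, {(time.time() - t0) / k * 1000:.2f} ms/file on average")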
I'm trying to build a class to quickly initialize and train an autoencoder for rapid prototyping. One thing I'd like to be able to do is quickly adjust the number of epochs I train for. However, it seems like no matter what I do, the model trains each layer for 100 epochs! I'm using the tensorflow backend.
Here is the code from the two offending methods.
def pretrain(self, X_train, nb_epoch=10):
    data = X_train
    for ae in self.pretrains:
        ae.fit(data, data, nb_epoch=nb_epoch)
        ae.layers[0].output_reconstruction = False
        ae.compile(optimizer='sgd', loss='mse')
        data = ae.predict(data)
.........
def fine_train(self, X_train, nb_epoch):
    weights = [ae.layers[0].get_weights() for ae in self.pretrains]

    dims = self.dims
    encoder = containers.Sequential()
    decoder = containers.Sequential()

    ## add special input encoder
    encoder.add(Dense(output_dim=dims[1], input_dim=dims[0],
                      weights=weights[0][0:2], activation='linear'))

    ## add the rest of the encoders
    for i in range(1, len(dims) - 1):
        encoder.add(Dense(output_dim=dims[i+1],
                          weights=weights[i][0:2], activation=self.act))

    ## add the decoders from the end
    decoder.add(Dense(output_dim=dims[len(dims) - 2], input_dim=dims[len(dims) - 1],
                      weights=weights[len(dims) - 2][2:4], activation=self.act))
    for i in range(len(dims) - 2, 1, -1):
        decoder.add(Dense(output_dim=dims[i - 1],
                          weights=weights[i-1][2:4], activation=self.act))

    ## add the output layer decoder
    decoder.add(Dense(output_dim=dims[0],
                      weights=weights[0][2:4], activation='linear'))

    masterAE = AutoEncoder(encoder=encoder, decoder=decoder)
    masterModel = models.Sequential()
    masterModel.add(masterAE)
    masterModel.compile(optimizer='sgd', loss='mse')
    masterModel.fit(X_train, X_train, nb_epoch=nb_epoch)
    self.model = masterModel
Any suggestions on how to fix the problem would be appreciated. My original suspicion was that it had something to do with TensorFlow, so I tried running with the Theano backend but encountered the same problem.
Here is a link to the full program.
Following the Keras doc, the fit method uses a default of 100 training epochs (nb_epoch=100):
fit(X, y, batch_size=128, nb_epoch=100, verbose=1, callbacks=[], validation_split=0.0, validation_data=None, shuffle=True, show_accuracy=False, class_weight=None, sample_weight=None)
I'm not sure how you are running these methods, but following the "Typical usage" from the original code, you should be able to run something like the following (adjusting the variable num_epoch as required):
#Typical usage:
num_epoch = 10
ae = JPAutoEncoder(dims)
ae.pretrain(X_train, nb_epoch = num_epoch)
ae.train(X_train, nb_epoch = num_epoch)
ae.predict(X_val)