I want to implement a text-to-image neural network like the one in the image below (please see the image), using CNNs and transposed CNNs with an embedding layer.
import torch
from torch import nn
Input text:
text = "A cat wearing glasses and playing the guitar "
# Simple text preprocessing
word_to_ix = {"A": 0, "cat": 1, "wearing": 2, "glasses": 3, "and": 4, "playing": 5, "the": 6, "guitar":7}
lookup_tensor = torch.tensor(list(word_to_ix.values()), dtype = torch.long) # a tensor representing words by integers
vocab_size = len(lookup_tensor)
Architecture implementation:
class TextToImage(nn.Module):
    def __init__(self, vocab_size):
        super(TextToImage, self).__init__()
        self.vocab_size = vocab_size
        self.noise = torch.rand((56, 64))

        # DEFINE the layers
        # Embedding
        self.embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=64)
        # Conv
        self.conv2d_1 = nn.Conv2d(in_channels=64, out_channels=3, kernel_size=(3, 3), stride=(2, 2), padding='valid')
        self.conv2d_2 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=(3, 3), stride=(2, 2), padding='valid')
        # Transposed CNNs
        self.conv2dTran_1 = nn.ConvTranspose2d(in_channels=16, out_channels=16, kernel_size=(3, 3), stride=(1, 1), padding=1)
        self.conv2dTran_2 = nn.ConvTranspose2d(in_channels=16, out_channels=3, kernel_size=(3, 3), stride=(2, 2), padding=0)
        self.conv2dTran_3 = nn.ConvTranspose2d(in_channels=6, out_channels=3, kernel_size=(4, 4), stride=(2, 2), padding=0)

        self.relu = torch.nn.ReLU(inplace=False)
        self.dropout = torch.nn.Dropout(0.4)

    def forward(self, text_tensor):
        # SEND the input text tensor to the embedding layer
        emb = self.embed(text_tensor)

        # COMBINE the embedding with the noise tensor. Make it have 3 dimensions
        combine1 = torch.cat((emb, self.noise), dim=1, out=None)

        # SEND the noisy embedding to the convolutional and transposed convolutional layers
        conv2d_1 = self.conv2d_1(combine1)
        conv2d_2 = self.conv2d_2(conv2d_1)
        dropout = self.dropout(conv2d_2)
        conv2dTran_1 = self.conv2dTran_1(dropout)
        conv2dTran_2 = self.conv2dTran_2(conv2dTran_1)

        # COMBINE the outputs, having a skip connection as in the image of the architecture
        combine2 = torch.cat((conv2d_1, conv2dTran_2), dim=1, out=None)
        conv2dTran_3 = self.conv2dTran_3(combine2)

        # SEND the combined outputs to the final layer. Please name your final output variable "image" so that it can be returned
        image = self.relu(conv2dTran_3)
        return image
Expected output:
torch.Size([3, 64, 64])
texttoimage = TextToImage(vocab_size=vocab_size)
output = texttoimage(lookup_tensor)
output.size()
Generated random noisy image:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.imshow(np.moveaxis(output.detach().numpy(), 0,-1))
The error I got:
RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 8 but got size 56 for tensor number 1 in the list.
Does anyone know how to solve this issue? I think it comes from concatenating the noise with the embedding.
After changing to dim=0 and expanding to 3 dimensions. In addition, there was an issue with the input channels of the first conv layer (conv2d_1), which I changed from 64 to 1:
class TextToImage(nn.Module):
    def __init__(self, vocab_size):
        super(TextToImage, self).__init__()
        self.vocab_size = vocab_size
        self.noise = torch.rand((56, 64))

        # DEFINE the layers
        # Embedding
        self.embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=64)
        # Conv
        self.conv2d_1 = nn.Conv2d(in_channels=1, out_channels=3, kernel_size=(3, 3), stride=(2, 2), padding='valid')
        self.conv2d_2 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=(3, 3), stride=(2, 2), padding='valid')
        # Transposed CNNs
        self.conv2dTran_1 = nn.ConvTranspose2d(in_channels=16, out_channels=16, kernel_size=(3, 3), stride=(1, 1), padding=1)
        self.conv2dTran_2 = nn.ConvTranspose2d(in_channels=16, out_channels=3, kernel_size=(3, 3), stride=(2, 2), padding=0)
        self.conv2dTran_3 = nn.ConvTranspose2d(in_channels=6, out_channels=3, kernel_size=(4, 4), stride=(2, 2), padding=0)

        self.relu = torch.nn.ReLU(inplace=False)
        self.dropout = torch.nn.Dropout(0.4)

    def forward(self, text_tensor):
        # SEND the input text tensor to the embedding layer
        emb = self.embed(text_tensor)

        # COMBINE the embedding with the noise tensor. Make it have 3 dimensions
        combined = torch.cat((emb, self.noise), dim=0)
        print(combined.shape)
        combined_3d = combined[None, :]
        print(combined_3d.shape)

        # SEND the noisy embedding to the convolutional and transposed convolutional layers
        conv2d_1 = self.conv2d_1(combined_3d)
        conv2d_2 = self.conv2d_2(conv2d_1)
        dropout = self.dropout(conv2d_2)
        conv2dTran_1 = self.conv2dTran_1(dropout)
        conv2dTran_2 = self.conv2dTran_2(conv2dTran_1)

        # COMBINE the outputs, having a skip connection as in the image of the architecture
        combined_2 = torch.cat((conv2d_1, conv2dTran_2), dim=0)
        conv2dTran_3 = self.conv2dTran_3(combined_2)

        # SEND the combined outputs to the final layer. Please name your final output variable "image" so that it can be returned
        image = self.relu(conv2dTran_3)
        return image
The cat function requires the tensor shapes to match except in the dimension you're concatenating along, so to concatenate (8, 64) and (56, 64) your cat should look like this, using dim 0 instead of 1:
combine1 = torch.cat((emb, self.noise), dim=0, out=None)
After that, I'm not seeing where you give combine1 a 3rd dimension.
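For illustration, here is a minimal sketch of that part (assuming the shapes from the question: an (8, 64) embedding and a (56, 64) noise tensor); unsqueeze(0) is one way to give the combined tensor the third dimension that conv2d_1 with in_channels=1 expects:

import torch

emb = torch.rand(8, 64)     # stands in for self.embed(text_tensor)
noise = torch.rand(56, 64)  # stands in for self.noise

combined = torch.cat((emb, noise), dim=0)  # (64, 64) -- shapes now match in dim 1
combined_3d = combined.unsqueeze(0)        # (1, 64, 64) -- channel, height, width
print(combined.shape, combined_3d.shape)

From there the layer shapes work out to the expected torch.Size([3, 64, 64]): conv2d_1 gives (3, 31, 31), conv2d_2 gives (16, 15, 15), the two transposed convs bring it back to (3, 31, 31), the skip-connection concatenation makes it (6, 31, 31), and conv2dTran_3 produces (3, 64, 64). Note that passing an unbatched 3-D tensor to Conv2d requires a fairly recent PyTorch; on older versions you would add a batch dimension with one more unsqueeze(0) and get [1, 3, 64, 64].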
Related
I would like to train a convolutional neural network autoencoder on a CSV file. The CSV file contains the pixel neighborhood positions of an original 1024x1024 image.
When I try to train it, I have the following error that I don't manage to resolve.
ValueError: Input 0 of layer max_pooling2d is incompatible with the layer: expected ndim=4, found ndim=5. Full shape received: (None, 1, 1024, 1024, 16). Any idea, what I am doing wrong in my coding?
Let me explain my code:
My csv file has this structure:
0 0 1.875223e+01
1 0 1.875223e+01
2 0 2.637685e+01
3 0 2.637685e+01
4 0 2.637685e+01
I managed to load my dataset and extract the x, y, and value columns as NumPy arrays:
x = data[0].values
y = data[1].values
values = data[2].values
Then I create an empty image with the correct dimensions and fill it in with the pixel values:
image = np.empty((1024, 1024))
for i, (xi, yi, value) in enumerate(zip(x, y, values)):
    image[xi.astype(int), yi.astype(int)] = value
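As a side note (a sketch, not part of the original code): since x, y, and values are whole NumPy arrays, the same fill can be done without the Python loop, assuming the coordinates are valid integer indices:

image = np.empty((1024, 1024))
image[x.astype(int), y.astype(int)] = values  # vectorized fancy-indexing assignment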
To use this array as input to my convolutional autoencoder, I reshaped it to a 4D array:
# Reshape the image array to a 4D tensor
image = image.reshape((1, image.shape[0], image.shape[1], 1))
Finally, I declare the convolutional autoencoder structure; at this stage I get the error `incompatible with the layer: expected ndim=4, found ndim=5. Full shape received: (None, 1, 1024, 1024, 16)`:
import keras
from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D
from keras.models import Model
# Define the input layer
input_layer = Input(shape=(1,image.shape[1], image.shape[2], 1))
# Define the encoder layers
x = Conv2D(16, (3, 3), activation='relu', padding='same')(input_layer)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
encoded = MaxPooling2D((2, 2), padding='same')(x)
# Define the decoder layers
x = Conv2D(8, (3, 3), activation='relu', padding='same')(encoded)
x = UpSampling2D((2, 2))(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
x = Conv2D(16, (3, 3), activation='relu')(x)
x = UpSampling2D((2, 2))(x)
decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)
# Define the autoencoder model
autoencoder = Model(input_layer, decoded)
# Compile the model
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
# Reshape the image array to a 4D tensor
image = image.reshape((1, image.shape[0], image.shape[1], 1))
# Train the model
autoencoder.fit(image, image, epochs=50, batch_size=1, shuffle=True)
You should drop the first dimension in the input layer:
input_layer = Input(shape=(image.shape[1], image.shape[2], 1))
The training pool shape and the input layer shape should be different: the input layer shape describes the shape of a single data point, while the training pool shape describes the whole dataset, so it has one extra dimension whose size equals the number of data points in your dataset.
You should also drop the second image.reshape, since at that point image.shape == (1, 1024, 1024, 1) and doing image = image.reshape((1, image.shape[0], image.shape[1], 1)) would try to reshape that into (1, 1, 1024, 1), which is impossible.
And lastly, you forgot to add padding='same' to one of the Conv2D layers. Add it to make the output layer shape match your training label shape.
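Putting those three fixes together, a minimal sketch of the corrected script could look like this (a random array stands in for the CSV-built image, and epochs is reduced to 1 just to keep the sketch light):

import numpy as np
from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D
from keras.models import Model

image = np.random.rand(1024, 1024)                              # stand-in for the CSV-built image
image = image.reshape((1, image.shape[0], image.shape[1], 1))   # (1, 1024, 1024, 1): one sample

input_layer = Input(shape=(image.shape[1], image.shape[2], 1))  # no leading 1: shape of a single sample

x = Conv2D(16, (3, 3), activation='relu', padding='same')(input_layer)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
encoded = MaxPooling2D((2, 2), padding='same')(x)

x = Conv2D(8, (3, 3), activation='relu', padding='same')(encoded)
x = UpSampling2D((2, 2))(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
x = Conv2D(16, (3, 3), activation='relu', padding='same')(x)  # padding='same' added here
x = UpSampling2D((2, 2))(x)
decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
# no second reshape: image is already (1, 1024, 1024, 1)
autoencoder.fit(image, image, epochs=1, batch_size=1, shuffle=True)

The decoder now upsamples 128 -> 256 -> 512 -> 1024, so the output shape matches the (1024, 1024, 1) target.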
I'm having a problem implementing a super-resolution model:
class SRNet(Model):
    def __init__(self, scale=4):
        super(SRNet, self).__init__()
        self.scale = scale

        self.conv1 = Sequential([
            layers.Conv2D(filters=64, kernel_size=3,
                          strides=(1, 1), padding="same", data_format="channels_first"),
            layers.ReLU(),
        ])

        self.residualBlocks = Sequential(
            [ResidualBlock() for _ in range(16)])

        self.convUp = Sequential([
            layers.Conv2DTranspose(filters=64, kernel_size=3, strides=(2, 2),
                                   padding="same", data_format="channels_first"),
            layers.ReLU(),
            layers.Conv2DTranspose(filters=64, kernel_size=3, strides=(2, 2),
                                   padding="same", data_format="channels_first"),
            layers.ReLU(),
        ])

        self.reluAfterPixleShuffle = layers.ReLU()

        self.convOut = layers.Conv2D(
            filters=3, kernel_size=3, strides=(1, 1), padding="same", data_format="channels_first",
            input_shape=(4, 1440, 2560))  # (kernel, kernel, channel, output)

    def call(self, lrCur_hrPrevTran):
        lrCur, hrPrevTran = lrCur_hrPrevTran
        x = tf.concat([lrCur, hrPrevTran], axis=1)
        x = self.conv1(x)
        x = self.residualBlocks(x)
        x = self.convUp(x)

        # pixel shuffle
        Subpixel_layer = Lambda(lambda x: tf.nn.depth_to_space(
            x, self.scale, data_format="NCHW"))
        x = Subpixel_layer(inputs=x)
        x = self.reluAfterPixleShuffle(x)

        x = self.convOut(x)
        return x
Error
/usr/src/app/generator.py:164 call *
x = self.convOut(x)
ValueError: Tensor's shape (3, 3, 64, 3) is not compatible with supplied shape (3, 3, 4, 3)
After reading the error I know that (3, 3, 4, 3) is (kernel size, kernel size, channels, output), which means that only the number of input channels is incorrect,
so I printed out the shape of the input:
# after pixel shuffle, before convOut
print(x.shape)
>>> (1, 4, 1440, 2560)  # (batch size, channels, height, width)
So the shape of x after the pixel shuffle (depth_to_space) is (1, 4, 1440, 2560); the channel value is 4, which is exactly what convOut needs.
The question is: why does the input's channel count change from 4 to 64 according to the error?
I have found a solution.
First of all, I'm using checkpoints to save the model weights during training. While implementing and testing the model I changed some of the layers, so the input size changed too, but the weights still remembered the input size from the previous checkpoint.
So I deleted the checkpoints folder, and then everything worked again.
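For reference, here is a small sketch of one way to avoid restoring stale weights after changing layer shapes; the directory name and the Checkpoint/CheckpointManager wiring are assumptions, since the original training loop isn't shown:

import tensorflow as tf

model = SRNet()  # the (changed) model from above
ckpt = tf.train.Checkpoint(model=model)
# point the manager at a NEW directory, so checkpoints written for the old
# layer shapes are never picked up by latest_checkpoint
manager = tf.train.CheckpointManager(ckpt, directory="./checkpoints_v2", max_to_keep=3)

if manager.latest_checkpoint:  # empty on the first run with the new directory
    ckpt.restore(manager.latest_checkpoint)

# ... training loop ...
manager.save()  # subsequent checkpoints now match the new shapes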
Working on a university exercise, I used the model sub-classing API of TF 2.0. Here's my code (it's the AlexNet architecture, in case you're wondering...):
class MyModel(Model):
    def __init__(self):
        super(MyModel, self).__init__()
        # OPS
        self.relu = Activation('relu', name='ReLU')
        self.maxpool = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='valid', name='MaxPool')
        self.softmax = Activation('softmax', name='Softmax')

        # Conv layers
        self.conv1 = Conv2D(filters=96, input_shape=(224, 224, 3), kernel_size=(11, 11), strides=(4, 4), padding='same',
                            name='conv1')
        self.conv2a = Conv2D(filters=128, kernel_size=(5, 5), strides=(1, 1), padding='same', name='conv2a')
        self.conv2b = Conv2D(filters=128, kernel_size=(5, 5), strides=(1, 1), padding='same', name='conv2b')
        self.conv3 = Conv2D(filters=384, kernel_size=(3, 3), strides=(1, 1), padding='same', name='conv3')
        self.conv4a = Conv2D(filters=192, kernel_size=(3, 3), strides=(1, 1), padding='same', name='conv4a')
        self.conv4b = Conv2D(filters=192, kernel_size=(3, 3), strides=(1, 1), padding='same', name='conv4b')
        self.conv5a = Conv2D(filters=128, kernel_size=(3, 3), strides=(1, 1), padding='same', name='conv5a')
        self.conv5b = Conv2D(filters=128, kernel_size=(3, 3), strides=(1, 1), padding='same', name='conv5b')

        # Fully-connected layers
        self.flatten = Flatten()
        self.dense1 = Dense(4096, input_shape=(100,), name='FC_4096_1')
        self.dense2 = Dense(4096, name='FC_4096_2')
        self.dense3 = Dense(1000, name='FC_1000')

    # Network definition
    def call(self, x, **kwargs):
        x = self.conv1(x)
        x = self.relu(x)
        x = tf.nn.local_response_normalization(x, depth_radius=2, alpha=2e-05, beta=0.75, bias=1.0)
        x = self.maxpool(x)

        x = tf.concat((self.conv2a(x[:, :, :, :48]), self.conv2b(x[:, :, :, 48:])), 3)
        x = self.relu(x)
        x = tf.nn.local_response_normalization(x, depth_radius=2, alpha=2e-05, beta=0.75, bias=1.0)
        x = self.maxpool(x)

        x = self.conv3(x)
        x = self.relu(x)

        x = tf.concat((self.conv4a(x[:, :, :, :192]), self.conv4b(x[:, :, :, 192:])), 3)
        x = self.relu(x)

        x = tf.concat((self.conv5a(x[:, :, :, :192]), self.conv5b(x[:, :, :, 192:])), 3)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.flatten(x)
        x = self.dense1(x)
        x = self.relu(x)
        x = self.dense2(x)
        x = self.relu(x)
        x = self.dense3(x)
        return self.softmax(x)
My goal is to access an arbitrary layer's output (in order to maximize a specific neuron's activation, if you have to know exactly :) ). The problem is that when I try to access any layer's output, I get an attribute error. For example:
model = MyModel()
print(model.get_layer('conv1').output)
# => AttributeError: Layer conv1 has no inbound nodes.
I found some questions with this error here in SO, and all of them claim that I have to define the input shape in the first layer, but as you can see - it's already done (see the definition of self.conv1 in the __init__ function)!
I did find that if I define a keras.layers.Input object, I do manage to get the output of conv1, but trying to access deeper layers fails, for example:
model = MyModel()
I = tf.keras.Input(shape=(224, 224, 3))
model(I)
print(model.get_layer('conv1').output)
# prints Tensor("my_model/conv1/Identity:0", shape=(None, 56, 56, 96), dtype=float32)
print(model.get_layer('FC_1000').output)
# => AttributeError: Layer FC_1000 has no inbound nodes.
I googled every exception that I got on the way, but found no answer. How can I access any layer's input/output (or input/output _shape attributes, by the way) in this case?
In a sub-classed model there is no graph of layers; it's just a piece of code (the model's call function). Layer connections are not defined while creating an instance of the Model class, so we need to build the model first by calling the call method.
Try this:
model = MyModel()
inputs = tf.keras.Input(shape=(224,224,3))
model.call(inputs)
# instead of model(I) in your code.
After doing this, the model graph is created.
for i in model.layers:
    print(i.output)
# output
# Tensor("ReLU_7/Relu:0", shape=(?, 56, 56, 96), dtype=float32)
# Tensor("MaxPool_3/MaxPool:0", shape=(?, 27, 27, 96), dtype=float32)
# Tensor("Softmax_1/Softmax:0", shape=(?, 1000), dtype=float32)
# ...
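Building on that, one way to then read a single layer's output (for example to maximize a particular neuron's activation, as the question asks) is to wrap the built graph in a functional model. This is a sketch on top of the answer above, not part of the original posts, and it assumes a reasonably recent TF 2.x:

model = MyModel()
inputs = tf.keras.Input(shape=(224, 224, 3))
model.call(inputs)  # build the graph first, as shown above

# a functional model mapping the same input to conv1's activations
conv1_extractor = tf.keras.Model(inputs=inputs, outputs=model.get_layer('conv1').output)

features = conv1_extractor(tf.random.uniform((1, 224, 224, 3)))
print(features.shape)  # (1, 56, 56, 96), matching the conv1 shape printed in the question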
Hello, I am currently using the tensorflow.org Variational Autoencoder implementation:
https://www.tensorflow.org/tutorials/generative/cvae
I just tried to change the architecture so that it accepts batches of 6 images.
I tried to adapt the code by myself and just changed the Conv2D layers to Conv3D, but that did not really work.
The original images are batches of 6 x 299 x 299 OCT images.
I reshaped them to 64 x 64.
class CVAE(tf.keras.Model):
def __init__(self, latent_dim):
super(CVAE, self).__init__()
self.latent_dim = latent_dim
# defines an approximate posterior distribution q(z|x)
# outputs mean and log-variance of a factorized Gaussian
self.inference_net = tf.keras.Sequential(
[
tf.keras.layers.InputLayer(input_shape=(6, 64, 64, 1)), # (28, 28, 1)
tf.keras.layers.Conv3D(
filters=32, kernel_size=3, strides=(2, 2, 2), activation='relu'),
tf.keras.layers.Conv3D(
filters=64, kernel_size=3, strides=(2, 2, 2), activation='relu'),
tf.keras.layers.Flatten(),
# No activation
tf.keras.layers.Dense(latent_dim + latent_dim),
]
)
# outputs p(x|z)
self.generative_net = tf.keras.Sequential(
[
tf.keras.layers.InputLayer(input_shape=(latent_dim,)),
tf.keras.layers.Dense(units=6*16*16*32, activation=tf.nn.relu), # change with img_size (7,7,32)
tf.keras.layers.Reshape(target_shape=(6, 16, 16, 32)), # change with image size # (7,7,32)
tf.keras.layers.Conv3DTranspose(
filters=64,
kernel_size=3,
strides=(2, 2, 2),
padding="SAME",
activation='relu'),
tf.keras.layers.Conv3DTranspose(
filters=32,
kernel_size=3,
strides=(2, 2, 2),
padding="SAME",
activation='relu'),
# No activation
tf.keras.layers.Conv3DTranspose(
filters=1, kernel_size=3, strides=(1, 1, 1), padding="SAME"),
]
)
InvalidArgumentError: Negative dimension size caused by subtracting 3 from 2 for 'conv3d_5/Conv3D' (op: 'Conv3D') with input shapes: [?,2,13,13,32], [3,3,3,32,64].
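To see where the negative dimension comes from, here is the 'valid'-padding output-size arithmetic for the depth axis (a worked illustration, not code from the question): with only 6 slices in the depth dimension, the first stride-2 Conv3D leaves a depth of 2, and a second kernel of size 3 no longer fits.

def conv_out_size(n, kernel=3, stride=2):
    # length of one dimension after a 'valid' convolution
    return (n - kernel) // stride + 1

depth_after_conv1 = conv_out_size(6)  # 2: only two slices left in the depth axis
print(depth_after_conv1)
# the second Conv3D then computes (2 - 3) // 2 + 1, i.e. it subtracts 3 from 2 -> negative dimension

Possible ways around this (assumptions, since no fix is shown here) would be kernel_size=(1, 3, 3) or strides=(1, 2, 2) so the depth axis is not reduced, or padding='same' on the Conv3D layers. The _parser function that builds the (6, 64, 64, 1) batches follows below.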
def _parser(self, example_proto):
    # define a dict with the data-names and types we
    # expect to find
    features = {'image_raw': tf.io.FixedLenFeature([], tf.string)}

    # parse the serialized data
    parsed_features = tf.io.parse_single_example(example_proto, features)

    # decode the raw bytes so it becomes a tensor with type
    ima = tf.io.decode_raw(parsed_features['image_raw'], tf.float64)
    ima = tf.reshape(ima, (6, 299, 299))  # (6, 299, 299)

    ## custom; ima is already a tensor
    ima = tf.expand_dims(ima, -1)  # (6, 299, 299, 1)
    ima = tf.image.resize(ima, (64, 64))
    #ima = ima[0,:] # only 1st scan
    #ima = tf.squeeze(ima)
    #ima = tf.reshape(ima, (1,784)) #(6, 784)
    ima = tf.cast(ima, 'float32')

    # normalizing images
    ima = ima / 255
    print("Parser Format: {}".format(ima))
    return ima  # (28, 28, 1)
Any kind of help is highly appreciated. I am kind of new to neural networks.
Thank you very much in advance.
My input is 299 x 299 x 3.
My graphics card is a 1070 (8 GB of RAM).
Other specs: Python 3.6, Keras 2.xx, Tensorflow-backend (1.4), Windows 7.
Even a batch size of 1 isn't working.
I feel like my card should handle a batch of size one.
Here is my code:
def full_model():
    # model layers
    input_img = Input(shape=(299, 299, 3))

    tower_1 = Conv2D(64, (1, 1), padding='same', activation='relu')(input_img)
    tower_1 = Conv2D(64, (3, 3), padding='same', activation='relu')(tower_1)

    tower_2 = Conv2D(64, (1, 1), padding='same', activation='relu')(input_img)
    tower_2 = Conv2D(64, (5, 5), padding='same', activation='relu')(tower_2)

    concatenated_layer = keras.layers.concatenate([tower_1, tower_2], axis=3)

    bottleneck = MaxPooling2D((2, 2), strides=(2, 2), padding='same')(concatenated_layer)
    flatten = Flatten()(bottleneck)
    dense_1 = Dense(500, activation='relu')(flatten)
    predictions = Dense(12, activation='softmax')(dense_1)

    model = Model(inputs=input_img, outputs=predictions)
    SGD = keras.optimizers.SGD(lr=0.1, momentum=0.0, decay=0.0, nesterov=False)
    model.compile(optimizer=SGD,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


hdf5_path = r'C:\Users\Moondra\Desktop\Keras Applications\training.hdf5'
model = full_model()


def run_model(hdf5_path,
              epochs=10,
              steps_per_epoch=8,
              classes=12,
              batch_size=1, model=model):
    for i in range(epochs):
        batches = loading_hdf5_files.load_batches(batch_size=1,
                                                  hdf5_path=hdf5_path,
                                                  classes=classes)
        for i in range(steps_per_epoch):
            x, y = next(batches)
            # plt.imshow(x[0])
            # plt.show()
            x = (x / 255).astype('float32')
            print(x.shape)
            data = model.train_on_batch(x, y)
            print('loss : {:.5}, accuracy : {:.2%}'.format(*data))
    return model
I can't seem to handle even a batch of size one.
Here is the last part of the error:
ResourceExhaustedError (see above for traceback): OOM when allocating tensor of shape [] and type float
[[Node: conv2d_4/random_uniform/sub = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [] values: 0.0866025388>, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]
It turns out I had way too many parameters.
After running print(model.summary()), I saw I had over a billion parameters.
I increased the size of the MaxPooling and there were no more problems.
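As an illustration of why the pooling change helps (the exact pool size used isn't stated, so the (7, 7) below is an assumption): with the original (2, 2) pooling the Flatten output is 150 x 150 x 128, and the following Dense(500) alone needs about 1.4 billion weights; a larger pool shrinks that dramatically.

import math

# weights contributed by Flatten -> Dense(500), ignoring biases
def dense_params(side, channels=128, units=500):
    return side * side * channels * units

side = math.ceil(299 / 2)  # 150 with the original MaxPooling2D((2, 2), strides=(2, 2), padding='same')
print(dense_params(side))  # 1,440,000,000 -- the "billion plus" parameters

side = math.ceil(299 / 7)  # 43 with a hypothetical larger MaxPooling2D((7, 7), strides=(7, 7), padding='same')
print(dense_params(side))  # 118,336,000 -- far fewer weights for the GPU to allocate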