I am trying to create a transform that shuffles the patches of each image in a batch.
I aim to use it in the same manner as the rest of the transformations in torchvision:
trans = transforms.Compose([
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
ShufflePatches(patch_size=(16,16)) # our new transform
])
More specifically, the input is a BxCxHxW tensor. I want to split each image in the batch into non-overlapping patches of size patch_size, shuffle them, and regroup into a single image.
Given the image (of size 224x224):
Using ShufflePatches(patch_size=(112,112)) I would like to produce the output image:
I think the solution has to do with torch.unfold and torch.fold, but didn't manage to get any further.
Any help would be appreciated!
Indeed unfold and fold seem appropriate in this case.
import torch
import torch.nn.functional as nnf
class ShufflePatches(object):
def __init__(self, patch_size):
self.ps = patch_size
def __call__(self, x):
# divide the batch of images into non-overlapping patches
u = nnf.unfold(x, kernel_size=self.ps, stride=self.ps, padding=0)
# permute the patches of each image in the batch
pu = torch.cat([b_[:, torch.randperm(b_.shape[-1])][None,...] for b_ in u], dim=0)
# fold the permuted patches back together
f = nnf.fold(pu, x.shape[-2:], kernel_size=self.ps, stride=self.ps, padding=0)
return f
Here's an example with patch size=16:
Related
Using the Fashion mnist dataset, I don't want to just split a single image into patches but rather all of images.
I've seen the function unfold() but I think this only works for a single image
mnist_train = torchvision.datasets.FashionMNIST(
root="../data", train=True,
transform=transforms.Compose([transforms.ToTensor()]), download=True)
x = mnist_train[0][0][-1, :, :]
x = x.unfold(0, 7, 7).unfold(1, 7, 7)
x.shape
How do I make non-overlapping patches (of any number to keep it simple) for all images?
Would appreciate any help. Thanks!
You can create a custom transform to split all images into multiple patches. Something along the lines of:
class Patch(object):
"""
Creates patches from images
"""
def __init__(self, patch_size=7):
self.patch_size = patch_size
def __call__(self, image):
patched_image = # ... your code here
return patched_image
mnist_train = torchvision.datasets.FashionMNIST(
root="../data", train=True,
transform=transforms.Compose([transforms.ToTensor(), Patch()]), download=True)
In tf keras, it is possible to have a data augmentation layer that performs rotation on each given image during training, in the following way as the docs say:
tf.keras.layers.RandomRotation(
factor, fill_mode='reflect', interpolation='bilinear',
seed=None, fill_value=0.0, **kwargs
)
The factor argument indicates the value of maximum rotation if a float is given and indicates lower and upper limits if a tuple is given.
For my specific application only specific rotations are allowed, say 0°, 90°, 180° and 270°.
Is there any way I can achieve this using the RandomRotation class or a good alternative to this or should I just augment the whole dataset before training?
You can do this by creating a custom PreprocessingLayer.
import tensorflow as tf
class Rotate90Randomly(tf.keras.layers.experimental.preprocessing.PreprocessingLayer):
def __init__(self):
super(Rotate90Randomly, self).__init__()
def call(self, x, training=False):
def random_rotate():
rotation_factor = tf.random.uniform([], minval=0,
maxval=4, dtype=tf.int32)
return tf.image.rot90(x, k=rotation_factor)
training = tf.constant(training, dtype=tf.bool)
rotated = tf.cond(training, random_rotate, lambda: x)
rotated.set_shape(rotated.shape)
return rotated
One thing to consider, if the inputs' height and width are not the same, in other words they are not square you need to define input_shape as (None, None, channels) while creating the model.
Examples:
model = tf.keras.Sequential([
tf.keras.Input((180,180,3)),
Rotate90Randomly()])
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 10))
for images, labels in train_ds.take(1):
for i in range(9):
ax = plt.subplot(3, 3, i + 1)
images = model(images, training = True)
plt.imshow(images[i].numpy().astype("uint8"))
plt.title(class_names[labels[i]])
plt.axis("off")
With training = False, they remain the same so this layer is not active during inference.
I'm Trying to implement of Faster-RCNN model with Pytorch.
In the structure, First element of model is Transform.
from torchvision.models.detection import fasterrcnn_resnet50_fpn
model = fasterrcnn_resnet50_fpn(pretrained=True)
print(model.transform)
GeneralizedRCNNTransform(
Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
Resize(min_size=(800,), max_size=1333, mode='bilinear')
)
When images pass forward of Resize(), They come out with (800,h) or (w, 1333) according to ratio of Width and Height.
for i in range(2):
_, image, target = testset.__getitem__(i)
img = image.unsqueeze(0)
output, _ = model.transform(img)
Before Transform : torch.Size([512, 640])
After Transform : [(800, 1000)]
Before Transform : torch.Size([315, 640])
After Transform : [(656, 1333)]
My question is how to get those resized output and why they use This method? I can't find the information in the paper and I can't understand the source code about transform in fasterrcnn_resnet50_fpn.
Sorry for my English
GeneralizedRCNN data transform:
https://github.com/pytorch/vision/blob/922db3086e654871c35cd80c2c01eabb65d78475/torchvision/models/detection/generalized_rcnn.py#L15
performs the data transformation on the inputs to feed into the model
min_size: minimum size of the image to be rescaled before feeding it to the backbone.
max_size: maximum size of the image to be rescaled before feeding it to the backbone
https://github.com/pytorch/vision/blob/main/torchvision/models/detection/faster_rcnn.py#L256
I couldn't either find out why it was generalize for min 800 and max 1333, didn't find anything in research paper either.
but as the 1st layer is a Conv layer, the input to the network is fixed size, I apply many other augmentations such as mirror, random cropping etc, inspired by SSD based networks. Hence I would prefer to do all augmentation in a separate place once instead of twice.
I would assume the model should work the best during validation using images with shapes and other properties as close as possible to the training data.
though you can experiment with custom min_size and max_size...
`
from .transform import GeneralizedRCNNTransform
min_size = 900 #changed from default
max_size = 1433 #changed from default
image_mean = [0.485, 0.456, 0.406]
image_std = [0.229, 0.224, 0.225]
model = fasterrcnn_resnet50_fpn(pretrained=True, min_size, max_size, image_mean, image_std)
#batch of 4 image, 4 bboxes
images, boxes = torch.rand(4, 3, 600, 1200), torch.rand(4, 11, 4)
labels = torch.randint(1, 91, (4, 11))
images = list(image for image in images)
targets = []
for i in range(len(images)):
d = {}
d['boxes'] = boxes[i]
d['labels'] = labels[i]
targets.append(d)
output = model(images, targets)
`
or you can completely write your transforms
https://pytorch.org/vision/stable/transforms.html
'
from torchvision.transforms import transforms as T
model = fasterrcnn_resnet50_rpn()
model.transform = T.Compose([*check torchvision.transforms for more*])
'
Hope this helps.
I am using image_dataset_from_directory to load a very large RGB imagery dataset from disk into a Dataset. For example,
dataset = tf.keras.preprocessing.image_dataset_from_directory(
<directory>,
label_mode=None,
seed=1,
subset='training',
validation_split=0.1)
The Dataset has, say, 100000 images grouped into batches of size 32 yielding a tf.data.Dataset with spec (batch=32, width=256, height=256, channels=3)
I would like to extract patches from the images to create a new tf.data.Dataset with image spatial dimensions of, say, 64x64.
Therefore, I would like to create a new Dataset with 400000 patches still in batches of 32 with a tf.data.Dataset with spec (batch=32, width=64, height=64, channels=3)
I've looked at the window method and the extract_patches function but it's not clear from the documentation how to use them to create a new Dataset I need to start training on the patches. The window seems to be geared toward 1D tensors and the extract_patches seems to work with arrays and not with Datasets.
Any suggestions on how to accomplish this?
UPDATE:
Just to clarify my needs. I am trying to avoid manually creating the patches on disk. One, that would be untenable disk wise. Two, the patch size is not fixed. The experiments will be conducted over several patch sizes. So, I do not want to manually perform the patch creation either on disk or manually load the images in memory and perform the patching. I would prefer to have tensorflow handle the patch creation as part of the pipeline workflow to minimize disk and memory usage.
What you're looking for is tf.image.extract_patches. Here's an example:
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import numpy as np
data = tfds.load('mnist', split='test', as_supervised=True)
get_patches = lambda x, y: (tf.reshape(
tf.image.extract_patches(
images=tf.expand_dims(x, 0),
sizes=[1, 14, 14, 1],
strides=[1, 14, 14, 1],
rates=[1, 1, 1, 1],
padding='VALID'), (4, 14, 14, 1)), y)
data = data.map(get_patches)
fig = plt.figure()
plt.subplots_adjust(wspace=.1, hspace=.2)
images, labels = next(iter(data))
for index, image in enumerate(images):
ax = plt.subplot(2, 2, index + 1)
ax.set_xticks([])
ax.set_yticks([])
ax.imshow(image)
plt.show()
I believe you can use a python class generator. You can pass this generator to model.fit function if you want. I actually used it once for labels preprocessing.
I wrote the following dataset generator that loads a batch from your dataset, splits the images from the batch into multiple images based on the tile_shape parameter. If there are enough images, the next batch is returned.
In the example, I used a simple dataset from_tensor_slices for simplification. You can, of course, replace it with yours.
import tensorflow as tf
class TileDatasetGenerator:
def __init__(self, dataset, batch_size, tile_shape):
self.dataset_iterator = iter(dataset)
self.batch_size = batch_size
self.tile_shape = tile_shape
self.image_queue = None
def __iter__(self):
return self
def __next__(self):
if self._has_queued_enough_for_batch():
return self._dequeue_batch()
batch = next(self.dataset_iterator)
self._split_images(batch)
return self.__next__()
def _has_queued_enough_for_batch(self):
return self.image_queue is not None and tf.shape(self.image_queue)[0] >= self.batch_size
def _dequeue_batch(self):
batch, remainder = tf.split(self.image_queue, [self.batch_size, -1], axis=0)
self.image_queue = remainder
return batch
def _split_images(self, batch):
batch_shape = tf.shape(batch)
batch_splitted = tf.reshape(batch, shape=[-1, self.tile_shape[0], self.tile_shape[1], batch_shape[-1]])
if self.image_queue is None:
self.image_queue = batch_splitted
else:
self.image_queue = tf.concat([self.image_queue, batch_splitted], axis=0)
dataset = tf.data.Dataset.from_tensor_slices(tf.ones(shape=[128, 64, 64, 3]))
dataset.batch(32)
generator = TileDatasetGenerator(dataset, batch_size = 16, tile_shape = [32,32])
for batch in generator:
tf.print(tf.shape(batch))
Edit:
It is possible to convert the generator to tf.data.Dataset if you want, but it requires that you add a __call__ function to the generator returning an iterator (self in this case).
new_dataset = tf.data.Dataset.from_generator(generator, output_types=(tf.int64))
I am trying to filter a TensorFlow tensor of shape (N_batch, N_data), where N_batch is the batch size (e.g. 32), and N_data is the size of the (noisy) timeseries array. I have a Gaussian kernel (taken from here), which is one-dimensional. I then want to use tensorflow.nn.conv1d to convolve this kernel with my signal.
I have been trying for most of the morning to get the dimensions of the input signal and the kernel right, but obviously with no success. From what I gathered from the interwebs, the dimensions of both the input signal and the kernel need to be aligned in some finicky way, and I just can't figure out which way that is. The TensorFlow error messages aren't particularly meaningful either (Shape must be rank 4 but is rank 3 for 'conv1d/Conv2D' (op: 'Conv2D') with input shapes: [?,1,1000], [1,81]). Below I've included a little piece of code to reproduce the situation:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
# Based on: https://stackoverflow.com/a/52012658/1510542
# Credits to #zephyrus
def gaussian_kernel(size, mean, std):
d = tf.distributions.Normal(tf.cast(mean, tf.float32), tf.cast(std, tf.float32))
vals = d.prob(tf.range(start=-size, limit=size+1, dtype=tf.float32))
kernel = vals # Some reshaping is required here
return kernel / tf.reduce_sum(kernel)
def gaussian_filter(input, sigma):
size = int(4*sigma + 0.5)
x = input # Some reshaping is required here
kernel = gaussian_kernel(size=size, mean=0.0, std=sigma)
conv = tf.nn.conv1d(x, kernel, stride=1, padding="SAME")
return conv
def run_filter():
tf.reset_default_graph()
# Define size of data, batch sizes
N_batch = 32
N_data = 1000
noise = 0.2 * (np.random.rand(N_batch, N_data) - 0.5)
x = np.linspace(0, 2*np.pi, N_data)
y = np.tile(np.sin(x), N_batch).reshape(N_batch, N_data)
y_noisy = y + noise
input = tf.placeholder(tf.float32, shape=[None, N_data])
smooth_input = gaussian_filter(input, sigma=10)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
y_smooth = smooth_input.eval(feed_dict={input: y_noisy})
plt.plot(y_noisy[0])
plt.plot(y_smooth[0])
plt.show()
if __name__ == "__main__":
run_filter()
Any ideas?
You need to add channel dimensions to your input/kernel, since TF convolutions are generally used for multi-channel inputs/outputs. As you are working with simple 1-channel input/output this amounts to just adding some size-1 "dummy" axes.
Since by default convolution expects channels to come last, your placeholder should have shape [None, N_data, 1] and your input be modified like
y_noisy = y + noise
y_noisy = y_noisy[:, :, np.newaxis]
Similarly, you need to add input and output channel dimensions to your filter:
kernel = gaussian_kernel(size=size, mean=0.0, std=sigma)
kernel = kernel[:, tf.newaxis, tf.newaxis]
That is, the filter is expected to have shape [width, in_channels, out_cannels].