I have a dataset of images that looks like this:
array([[[[0.35980392, 0.26078431, 0.14313725],
[0.38137255, 0.26470588, 0.15196078],
[0.51960784, 0.3745098 , 0.26176471],
...,
[0.34313725, 0.22352941, 0.15 ],
[0.30784314, 0.2254902 , 0.15686275],
[0.28823529, 0.22843137, 0.16862745]],
[[0.38627451, 0.28235294, 0.16764706],
[0.45098039, 0.32843137, 0.21666667],
[0.62254902, 0.47254902, 0.36470588],
...,
[0.34607843, 0.22745098, 0.15490196],
[0.30686275, 0.2245098 , 0.15588235],
[0.27843137, 0.21960784, 0.16176471]],
[[0.41568627, 0.30098039, 0.18431373],
[0.51862745, 0.38529412, 0.27352941],
[0.67745098, 0.52058824, 0.40980392],
...,
[0.34901961, 0.22941176, 0.15588235],
[0.29901961, 0.21666667, 0.14901961],
[0.26078431, 0.20098039, 0.14313725]],
...,
This is how I download it:
data, attrs = fetch_dataset()
This is how the fetch_dataset() function works:
import numpy as np
import os
from skimage.transform import resize
import skimage.io
import pandas as pd
def fetch_dataset(attrs_name="lfw_attributes.txt",
                  images_name="lfw-deepfunneled",
                  dx=80, dy=80,
                  dimx=45, dimy=45):
    # download if not exists
    if not os.path.exists(images_name):
        print("images not found, downloading...")
        os.system("wget http://vis-www.cs.umass.edu/lfw/lfw-deepfunneled.tgz -O tmp.tgz")
        print("extracting...")
        os.system("tar xvzf tmp.tgz && rm tmp.tgz")
        print("done")
        assert os.path.exists(images_name)

    if not os.path.exists(attrs_name):
        print("attributes not found, downloading...")
        os.system("wget http://www.cs.columbia.edu/CAVE/databases/pubfig/download/%s" % attrs_name)
        print("done")

    # read attrs
    df_attrs = pd.read_csv("lfw_attributes.txt", sep='\t', skiprows=1)
    df_attrs = pd.DataFrame(df_attrs.iloc[:, :-1].values, columns=df_attrs.columns[1:])

    # read photos
    photo_ids = []
    for dirpath, dirnames, filenames in os.walk(images_name):
        for fname in filenames:
            if fname.endswith(".jpg"):
                fpath = os.path.join(dirpath, fname)
                photo_id = fname[:-4].replace('_', ' ').split()
                person_id = ' '.join(photo_id[:-1])
                photo_number = int(photo_id[-1])
                photo_ids.append({'person': person_id, 'imagenum': photo_number, 'photo_path': fpath})

    photo_ids = pd.DataFrame(photo_ids)
    # print(photo_ids)

    # mass-merge (photos now have same order as attributes)
    df = pd.merge(df_attrs, photo_ids, on=('person', 'imagenum'))
    assert len(df) == len(df_attrs), "lost some data when merging dataframes"
    # print(df.shape)

    # image preprocessing
    all_photos = df['photo_path'].apply(skimage.io.imread)\
        .apply(lambda img: img[dy:-dy, dx:-dx])\
        .apply(lambda img: resize(img, [dimx, dimy]))
    all_photos = np.stack(all_photos.values)  # .astype('uint8')
    all_attrs = df.drop(["photo_path", "person", "imagenum"], axis=1)

    return all_photos, all_attrs
Next I'm trying to convert my dataset to Tensor:
import torch
import torchvision.transforms
from torchvision import transforms as transforms
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.dataset = data

    def __getitem__(self, idx):
        sample = self.dataset[idx]
        data, label = sample[0], sample[1]
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        return transform(data), torch.tensor(label)

    def __len__(self):
        return len(self.dataset)
Then:
from sklearn.model_selection import train_test_split
train, val = train_test_split(data, test_size=0.25, random_state=42)
train_dataset = MyDataset(train)
val_dataset = MyDataset(val)
Autoencoder class:
from copy import deepcopy
class Autoencoder(nn.Module):
    def __init__(self):
        # <define the encoder and decoder architectures>
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 6, kernel_size=5),
            nn.ReLU(True),
            nn.Conv2d(6, 16, kernel_size=5),
            nn.ReLU(True))
        self.decoder = nn.Sequential(
            nn.Conv2d(16, 6, kernel_size=5),
            nn.ReLU(True),
            nn.Conv2d(6, 3, kernel_size=5),
            nn.ReLU(True))

    def forward(self, x):
        latent_code = self.encoder(x)
        reconstruction = self.decoder(latent_code)
        return reconstruction, latent_code
Then:
criterion = nn.BCELoss()
autoencoder = Autoencoder()
optimizer = torch.optim.Adam(autoencoder.parameters())
The training itself:
# <your autoencoder training code goes here>
num_epochs = 5
for epoch in range(num_epochs):
    for x in train_dataset:
        img, _ = x
        img = Variable(img).cpu()
        output = autoencoder(img)
        loss = criterion(output, img)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print('epoch [{}/{}], loss:{:.4f}'.format(epoch + 1, num_epochs, loss.data()))
And I get this error:
RuntimeError: output with shape [1, 45, 3] doesn't match the broadcast shape [3, 45, 3]
I think I'm making a mistake somewhere in preparing my data. Could someone please explain how to do it properly in my case? Thanks for any help.
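The mismatch appears to come from `sample[0]`: each element of `data` is a whole (45, 45, 3) image, so `sample[0]` is only its first row, and Normalize with three channel statistics then sees a single-channel tensor, hence [1, 45, 3] vs [3, 45, 3]. For comparison, here is a minimal sketch (the class name and details are illustrative, not the original code) of a dataset that hands the whole float image to the transforms:

import numpy as np
import torch
from torchvision import transforms

class ImageDataset(torch.utils.data.Dataset):
    """Hands a full HxWxC image (not a single row) to the transforms."""
    def __init__(self, images):
        self.images = images  # float array of shape (N, 45, 45, 3) in [0, 1]
        self.transform = transforms.Compose([
            transforms.ToTensor(),  # HWC float array -> CHW float tensor (no extra rescaling for float input)
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ])

    def __getitem__(self, idx):
        img = self.images[idx].astype(np.float32)  # shape (45, 45, 3)
        return self.transform(img)                 # shape (3, 45, 45)

    def __len__(self):
        return len(self.images)

# usage sketch:
# train_dataset = ImageDataset(train)
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)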
I am trying to produce a model that will generate a caption for an image, using ResNet as the encoder, a Transformer as the decoder, and COCO as the dataset.
After training my model for 10 epochs, it fails to produce anything other than the word <pad>: every token the model outputs is 0, which corresponds to <pad>.
Using the debugger, the problem seems to occur at the argmax, where the output is always zero rather than anything else, but I don't know how to fix it. Is it an issue with my model, or with the way it is trained?
I based my model on this GitHub repository, if it helps.
The script to download the COCO data is here:
Download.sh
mkdir data
wget http://msvocds.blob.core.windows.net/annotations-1-0-3/captions_train-val2014.zip -P ./data/
wget http://images.cocodataset.org/zips/train2014.zip -P ./data/
wget http://images.cocodataset.org/zips/val2014.zip -P ./data/
unzip ./data/captions_train-val2014.zip -d ./data/
rm ./data/captions_train-val2014.zip
unzip ./data/train2014.zip -d ./data/
rm ./data/train2014.zip
unzip ./data/val2014.zip -d ./data/
rm ./data/val2014.zip
Any help is much appreciated.
Here is my code:
model.py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
import torchvision.models as models
from torch.nn import TransformerDecoderLayer, TransformerDecoder
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Variable


class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        self.resnet = nn.Sequential(*list(resnet.children())[:-2])
        self.conv1 = nn.Conv2d(2048, embed_size, 1)
        self.embed_size = embed_size
        self.fine_tune()

    def forward(self, images):
        features = self.resnet(images)
        batch_size, _, _, _ = features.shape
        features = self.conv1(features)
        features = features.view(batch_size, self.embed_size, -1)
        features = features.permute(2, 0, 1)
        return features

    def fine_tune(self, fine_tune=True):
        for p in self.resnet.parameters():
            p.requires_grad = False
        # If fine-tuning, only fine-tune convolutional blocks 2 through 4
        for c in list(self.resnet.children())[5:]:
            for p in c.parameters():
                p.requires_grad = fine_tune


class PositionEncoder(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionEncoder, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)


class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, h, num_hidden, N, device, dropout_dec=0.1, dropout_pos=0.1):
        super(Transformer, self).__init__()
        decoder_layers = TransformerDecoderLayer(d_model, h, num_hidden, dropout_dec)
        self.source_mask = None
        self.device = device
        self.d_model = d_model
        self.pos_decoder = PositionEncoder(d_model, dropout_pos)
        self.decoder = TransformerDecoder(decoder_layers, N)
        self.embed = Embedder(vocab_size, d_model)
        self.linear = nn.Linear(d_model, vocab_size)
        self.init_weights()

    def forward(self, source, mem):
        source = source.permute(1, 0)
        if self.source_mask is None or self.source_mask.size(0) != len(source):
            self.source_mask = nn.Transformer.generate_square_subsequent_mask(self=self, sz=len(source)).to(self.device)
        source = self.embed(source)
        source = source * math.sqrt(self.d_model)
        source = self.pos_decoder(source)
        output = self.decoder(source, mem, self.source_mask)
        output = self.linear(output)
        return output

    def init_weights(self):
        initrange = 0.1
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def pred(self, memory, pred_len):
        batch_size = memory.size(1)
        src = torch.ones((pred_len, batch_size), dtype=int) * 2
        if self.source_mask is None or self.source_mask.size(0) != len(src):
            self.source_mask = nn.Transformer.generate_square_subsequent_mask(self=self, sz=len(src)).to(self.device)
        output = torch.ones((pred_len, batch_size), dtype=int)
        src, output = src.cuda(), output.cuda()
        for i in range(pred_len):
            src_emb = self.embed(src)  # src_len * batch size * embed size
            src_emb = src_emb * math.sqrt(self.d_model)
            src_emb = self.pos_decoder(src_emb)
            out = self.decoder(src_emb, memory, self.source_mask)
            out = out[i]
            out = self.linear(out)  # batch_size * vocab_size
            out = out.argmax(dim=1)
            if i < pred_len - 1:
                src[i + 1] = out
            output[i] = out
        return output
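For comparison, here is a hedged sketch (not the repository's code) of the greedy-decoding pattern that pred() approximates: start the target sequence from the <start> token and grow it by feeding each argmax back in, instead of seeding every position with the same token. The token indices (1 = <start>) and the decoder's forward(source, mem) signature are assumptions carried over from the code above. Growing the sequence this way means each step only conditions on tokens the model has actually produced so far.

@torch.no_grad()
def greedy_decode(decoder, memory, max_len, start_idx=1, device='cuda'):
    batch_size = memory.size(1)
    # Start every sequence with the <start> token: shape (1, batch_size).
    ys = torch.full((1, batch_size), start_idx, dtype=torch.long, device=device)
    for _ in range(max_len - 1):
        # forward() expects (batch, seq_len) and returns (seq_len, batch, vocab_size).
        logits = decoder(ys.permute(1, 0), memory)
        next_token = logits[-1].argmax(dim=-1)           # greedy pick for the newest position
        ys = torch.cat([ys, next_token.unsqueeze(0)], dim=0)
    return ys                                            # (max_len, batch_size) token ids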
Data_Loader.py
import torch
import torchvision.transforms as transforms
import torch.utils.data as data
import os
import pickle
import numpy as np
import nltk
from PIL import Image
from build_vocab import Vocabulary
from pycocotools.coco import COCO


class CocoDataset(data.Dataset):
    """COCO Custom Dataset compatible with torch.utils.data.DataLoader."""

    def __init__(self, root, json, vocab, transform=None):
        """Set the path for images, captions and vocabulary wrapper.

        Args:
            root: image directory.
            json: coco annotation file path.
            vocab: vocabulary wrapper.
            transform: image transformer.
        """
        self.root = root
        self.coco = COCO(json)
        self.ids = list(self.coco.anns.keys())
        self.vocab = vocab
        self.transform = transform

    def __getitem__(self, index):
        """Returns one data pair (image and caption)."""
        coco = self.coco
        vocab = self.vocab
        ann_id = self.ids[index]
        caption = coco.anns[ann_id]['caption']
        img_id = coco.anns[ann_id]['image_id']
        path = coco.loadImgs(img_id)[0]['file_name']

        image = Image.open(os.path.join(self.root, path)).convert('RGB')
        if self.transform is not None:
            image = self.transform(image)

        # Convert caption (string) to word ids.
        tokens = nltk.tokenize.word_tokenize(str(caption).lower())
        caption = []
        caption.append(vocab('<start>'))
        caption.extend([vocab(token) for token in tokens])
        caption.append(vocab('<end>'))
        target = torch.Tensor(caption)
        return image, target

    def __len__(self):
        return len(self.ids)


def collate_fn(data):
    """Creates mini-batch tensors from the list of tuples (image, caption).

    We should build a custom collate_fn rather than using the default collate_fn,
    because merging captions (including padding) is not supported by default.

    Args:
        data: list of tuple (image, caption).
            - image: torch tensor of shape (3, 256, 256).
            - caption: torch tensor of shape (?); variable length.

    Returns:
        images: torch tensor of shape (batch_size, 3, 256, 256).
        targets: torch tensor of shape (batch_size, padded_length).
        lengths: list; valid length for each padded caption.
    """
    # Sort a data list by caption length (descending order).
    data.sort(key=lambda x: len(x[1]), reverse=True)
    images, captions = zip(*data)

    # Merge images (from tuple of 3D tensor to 4D tensor).
    images = torch.stack(images, 0)

    # Merge captions (from tuple of 1D tensor to 2D tensor).
    lengths = [len(cap) for cap in captions]
    targets = torch.zeros(len(captions), max(lengths)).long()
    for i, cap in enumerate(captions):
        end = lengths[i]
        targets[i, :end] = cap[:end]
    return images, targets, lengths


def get_loader(root, json, vocab, transform, batch_size, shuffle, num_workers):
    """Returns torch.utils.data.DataLoader for custom coco dataset."""
    # COCO caption dataset
    coco = CocoDataset(root=root,
                       json=json,
                       vocab=vocab,
                       transform=transform)
    # Data loader for COCO dataset
    # This will return (images, captions, lengths) for each iteration.
    # images: a tensor of shape (batch_size, 3, 224, 224).
    # captions: a tensor of shape (batch_size, padded_length).
    # lengths: a list indicating valid length for each caption. length is (batch_size).
    data_loader = torch.utils.data.DataLoader(dataset=coco,
                                              batch_size=batch_size,
                                              shuffle=shuffle,
                                              num_workers=num_workers,
                                              collate_fn=collate_fn)
    return data_loader
Build_vocab.py
import nltk
import pickle
import argparse
from collections import Counter
from pycocotools.coco import COCO


class Vocabulary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        if not word in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)


def build_vocab(json, threshold):
    coco = COCO(json)
    counter = Counter()
    ids = coco.anns.keys()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)
        if (i + 1) % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(i + 1, len(ids)))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab


def main(args):
    vocab = build_vocab(json=args.caption_path, threshold=args.threshold)
    vocab_path = args.vocab_path
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)
    print("Total vocabulary size: {}".format(len(vocab)))
    print("Saved the vocabulary wrapper to '{}'".format(vocab_path))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--caption_path', type=str,
                        default='./data/annotations/captions_train2014.json',
                        help='path for train annotation file')
    parser.add_argument('--vocab_path', type=str, default='./data/vocab.pkl',
                        help='path for saving vocabulary wrapper')
    parser.add_argument('--threshold', type=int, default=4,
                        help='minimum word count threshold')
    args = parser.parse_args()
    main(args)
train.py
import argparse
import torch
import torch.nn as nn
import numpy as np
import os
import pickle
import math
from tqdm import tqdm
from data_loader import get_loader
from build_vocab import Vocabulary
from model import EncoderCNN, Decoder
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def main(args):
    batch_size = 64
    embed_size = 512
    num_heads = 8
    num_layers = 6
    num_workers = 2
    num_epoch = 5
    lr = 1e-3
    load = False

    # Create model directory
    if not os.path.exists('models/'):
        os.makedirs('models/')

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader('data/resized2014', 'data/annotations/captions_train2014.json', vocab,
                             transform, batch_size,
                             shuffle=True, num_workers=num_workers)

    encoder = EncoderCNN(embed_size).to(device)
    encoder.fine_tune(False)
    decoder = Decoder(len(vocab), embed_size, num_heads, embed_size, num_layers).to(device)

    if load:
        encoder.load_state_dict(torch.load(os.path.join('models/', 'encoder-{}-{}.ckpt'.format(5, 5000))))
        decoder.load_state_dict(torch.load(os.path.join('models/', 'decoder-{}-{}.ckpt'.format(5, 5000))))
        print("Load Successful")

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    encoder_optim = torch.optim.Adam(encoder.parameters(), lr=lr)
    decoder_optim = torch.optim.Adam(decoder.parameters(), lr=lr)

    # Train the models
    for epoch in range(num_epoch):
        encoder.train()
        decoder.train()
        for i, (images, captions, lengths) in tqdm(enumerate(data_loader), total=len(data_loader), leave=False):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)

            # Forward, backward and optimize
            features = encoder(images)
            cap_input = captions[:, :-1]
            cap_target = captions[:, 1:]
            outputs = decoder(cap_input, features)
            outputs = outputs.permute(1, 0, 2)
            outputs_shape = outputs.reshape(-1, len(vocab))
            loss = criterion(outputs_shape, cap_target.reshape(-1))
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            encoder_optim.step()
            decoder_optim.step()

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    'models/', 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    'models/', 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--log_step', type=int, default=10, help='step size for printing log info')
    parser.add_argument('--save_step', type=int, default=1000, help='step size for saving trained models')
    args = parser.parse_args()
    print(args)
    main(args)
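One detail that is often checked when a caption model collapses to all-<pad> output: since the loss above is computed over every position of the padded targets, the <pad> class can dominate the gradient. A hedged sketch of the usual mitigation (a suggestion, not a confirmed fix for this code) is to ignore the pad index in the loss, which nn.CrossEntropyLoss supports directly:

# Assumption: <pad> is index 0, as in the vocabulary built by build_vocab.py above.
pad_idx = vocab('<pad>')
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)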
sample.py
import torch
import matplotlib.pyplot as plt
import numpy as np
import argparse
import pickle
import os
from torchvision import transforms
from build_vocab import Vocabulary
from data_loader import get_loader
from model import EncoderCNN, Decoder
from PIL import Image

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def token_sentence(decoder_out, itos):
    tokens = decoder_out
    tokens = tokens.transpose(1, 0)
    tokens = tokens.cpu().numpy()
    results = []
    for instance in tokens:
        result = ' '.join([itos[x] for x in instance])
        results.append(''.join(result.partition('<eos>')[0]))  # Cut before '<eos>'
    return results


def load_image(image_path, transform=None):
    image = Image.open(image_path).convert('RGB')
    image = image.resize([224, 224], Image.LANCZOS)
    if transform is not None:
        image = transform(image).unsqueeze(0)
    return image


def main(args):
    batch_size = 64
    embed_size = 512
    num_heads = 8
    num_layers = 6
    num_workers = 2

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    data_loader = get_loader('data/resized2014', 'data/annotations/captions_train2014.json', vocab,
                             transform, batch_size,
                             shuffle=True, num_workers=num_workers)

    # Build models
    encoder = EncoderCNN(embed_size).to(device)
    encoder.fine_tune(False)
    decoder = Decoder(len(vocab), embed_size, num_heads, embed_size, num_layers).to(device)

    # Load trained models
    encoder.load_state_dict(torch.load(os.path.join('models/', 'encoder-{}-{}.ckpt'.format(1, 4000))))
    decoder.load_state_dict(torch.load(os.path.join('models/', 'decoder-{}-{}.ckpt'.format(1, 4000))))
    encoder.eval()
    decoder.eval()

    itos = vocab.idx2word
    pred_len = 100
    result_collection = []

    # Decode with greedy
    # with torch.no_grad():
    #     for i, (images, captions, lengths) in enumerate(data_loader):
    #         images = images.to(device)
    #         features = encoder(images)
    #         output = decoder.generator(features, pred_len)
    #         result_caption = token_sentence(output, itos)
    #         result_collection.extend(result_caption)

    # Decode with greedy
    with torch.no_grad():
        for batch_index, (inputs, captions, caplens) in enumerate(data_loader):
            inputs, captions = inputs.cuda(), captions.cuda()
            enc_out = encoder(inputs)
            captions_input = captions[:, :-1]
            captions_target = captions[:, 1:]
            output = decoder.pred(enc_out, pred_len)
            result_caption = token_sentence(output, itos)
            result_collection.extend(result_caption)

    print("Prediction-greedy:", result_collection[1])
    print("Prediction-greedy:", result_collection[2])
    print("Prediction-greedy:", result_collection[3])
    print("Prediction-greedy:", result_collection[4])
    print("Prediction-greedy:", result_collection[5])
    print("Prediction-greedy:", result_collection[6])
    print("Prediction-greedy:", result_collection[7])
    print("Prediction-greedy:", result_collection[8])
    print("Prediction-greedy:", result_collection[9])
    print("Prediction-greedy:", result_collection[10])
    print("Prediction-greedy:", result_collection[11])

    # # Prepare an image
    # image = load_image(args.image, transform)
    # image_tensor = image.to(device)

    # # Generate a caption from the image
    # feature = encoder(image_tensor)
    # sampled_ids = decoder.generator(feature, pred_len)
    # sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # # Convert word_ids to words
    # sampled_caption = []
    # for word_id in sampled_ids:
    #     word = vocab.idx2word[word_id]
    #     sampled_caption.append(word)
    #     if word == '<end>':
    #         break
    # sentence = ' '.join(sampled_caption)

    # # Print out the image and the generated caption
    # print(sentence)
    # image = Image.open(args.image)
    # plt.imshow(np.asarray(image))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--image', type=str, required=False, help='input image for generating caption')
    parser.add_argument('--vocab_path', type=str, default='data/vocab.pkl', help='path for vocabulary wrapper')
    args = parser.parse_args()
    main(args)
resize.py
import argparse
import os
from PIL import Image


def resize_image(image, size):
    """Resize an image to the given size."""
    return image.resize(size, Image.ANTIALIAS)


def resize_images(image_dir, output_dir, size):
    """Resize the images in 'image_dir' and save into 'output_dir'."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    images = os.listdir(image_dir)
    num_images = len(images)
    for i, image in enumerate(images):
        with open(os.path.join(image_dir, image), 'r+b') as f:
            with Image.open(f) as img:
                img = resize_image(img, size)
                img.save(os.path.join(output_dir, image), img.format)
        if (i + 1) % 100 == 0:
            print("[{}/{}] Resized the images and saved into '{}'."
                  .format(i + 1, num_images, output_dir))


def main(args):
    image_dir = args.image_dir
    output_dir = args.output_dir
    image_size = [args.image_size, args.image_size]
    resize_images(image_dir, output_dir, image_size)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_dir', type=str, default='./data/train2014/',
                        help='directory for train images')
    parser.add_argument('--output_dir', type=str, default='./data/resized2014/',
                        help='directory for saving resized images')
    parser.add_argument('--image_size', type=int, default=256,
                        help='size for image after processing')
    args = parser.parse_args()
    main(args)
I have images in a folder (train) and a CSV file (train.csv) containing the image names and labels.
How do I map the images in one folder to the labels in the CSV file, and how can I create a data frame with the image data and labels? This is for multiclass classification.
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing import image
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import SGD
from sklearn.utils import shuffle


class MyTrainingData(keras.utils.Sequence):
    def __init__(self, file, labels, batchSize):
        self.file = file
        self.label = labels
        self.batchSize = batchSize
        self.n_batches = int(len(self.file) / self.batchSize)

    def on_epoch_end(self):  # called after every epoch
        self.file, self.label = shuffle(self.file, self.label)
        for i in range(50):
            print(self.file[i], self.label[i], 'file-label')

    def __len__(self):
        return self.n_batches

    # called for every batch to get the next batch of 32 images and labels
    def __getitem__(self, idx):
        # this method is called by fit() with idx ranging from 0 to len(training_examples) / batch_size
        batchX = self.file[idx * self.batchSize: (idx + 1) * self.batchSize]
        batchY = self.label[idx * self.batchSize: (idx + 1) * self.batchSize]
        imgFiles = [image.load_img(name, target_size=(224, 224, 3)) for name in batchX]  # loading 32 images
        imgFiles = [image.img_to_array(img) for img in imgFiles]  # preprocessing
        imgFiles = [img / 255 for img in imgFiles]
        batchY = to_categorical(batchY, 4)  # 4 represents the number of classes (4 in this case)
        return np.array(imgFiles), np.array(batchY)


def getfilePath(filenames):
    path = './train/'  # or any other path according to the directory structure
    filePaths = []
    for name in filenames:
        filePaths.append(path + name)  # './train/' + 'img1.jpg' = './train/img1.jpg'
    return filePaths


df = pd.read_csv('train.csv')
img_names = df['img_names']          # e.g. ['img1.jpg', 'img2.jpg', ...]
labels = df['labels']                # e.g. [3, 1, 2, 0, ...]
img_names = getfilePath(img_names)   # ['./train/img1.jpg', './train/img2.jpg', ...]

batch_size = 32
data = MyTrainingData(img_names, labels, batch_size)

model = defineModel()  # defineModel() is assumed to build and return your Keras model
sgd = SGD(learning_rate=0.0001, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(data, epochs=10, verbose=1)
The above code does much more than mapping. In the case of a large dataset (too big to fit in RAM), this technique will help you load, preprocess, and generate input data dynamically. Hope you find it useful. Give feedback so that I can improve it further.
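For the narrower "how do I map images to labels" part of the question, a minimal sketch of building the data frame with pandas (the column names 'img_names' and 'labels' and the ./train/ folder are assumptions carried over from the snippet above):

import os
import pandas as pd

# Assumes train.csv has columns 'img_names' and 'labels', and the images live in ./train/
df = pd.read_csv('train.csv')
df['img_path'] = df['img_names'].apply(lambda name: os.path.join('./train', name))
print(df[['img_path', 'labels']].head())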
I have two dataset folders of tif images: one folder is called BMMCdata, and the other contains the masks of the BMMCdata images and is called BMMCmasks (the image names correspond). I am trying to make a customised dataset and also split the data randomly into train and test. At the moment I am getting this error:
self.filenames.append(fn)
AttributeError: 'CustomDataset' object has no attribute 'filenames'
Any comment will be appreciated a lot.
import torch
from torch.utils.data.dataset import Dataset # For custom data-sets
from torchvision import transforms
from PIL import Image
import os.path as osp
import glob
folder_data = "/Users/parto/PycharmProjects/U-net/BMMCdata/data"
class CustomDataset(Dataset):
    def __init__(self, root):
        self.filename = folder_data
        self.root = root
        self.to_tensor = transforms.ToTensor()
        filenames = glob.glob(osp.join(folder_data, '*.tif'))
        for fn in filenames:
            self.filenames.append(fn)
        self.len = len(self.filenames)
        print(fn)

    def __getitem__(self, index):
        image = Image.open(self.filenames[index])
        return self.transform(image)

    def __len__(self):
        return self.len
custom_img = CustomDataset(folder_data)
# total images in set
print(custom_img.len)
train_len = int(0.6*custom_img.len)
test_len = custom_img.len - train_len
train_set, test_set = CustomDataset.random_split(custom_img, lengths=[train_len, test_len])
# check lens of subset
len(train_set), len(test_set)
train_set = CustomDataset(folder_data)
train_set = torch.utils.data.TensorDataset(train_set, train=True, batch_size=4)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=4, shuffle=True, num_workers=1)
print(train_set)
print(train_loader)
test_set = torch.utils.data.DataLoader(Dataset, batch_size=4, sampler= train_sampler)
test_loader = torch.utils.data.DataLoader(Dataset, batch_size=4)
Answer given by ptrblck in the PyTorch community, thank you.
# get all the image and mask paths and the number of images
folder_data = glob.glob("D:\\Neda\\Pytorch\\U-net\\BMMCdata\\data\\*.tif")
folder_mask = glob.glob("D:\\Neda\\Pytorch\\U-net\\BMMCmasks\\masks\\*.tif")

# split these paths using a certain percentage
len_data = len(folder_data)
print(len_data)
train_size = 0.6

train_image_paths = folder_data[:int(len_data * train_size)]
test_image_paths = folder_data[int(len_data * train_size):]

train_mask_paths = folder_mask[:int(len_data * train_size)]
test_mask_paths = folder_mask[int(len_data * train_size):]


class CustomDataset(Dataset):
    def __init__(self, image_paths, target_paths, train=True):  # initial logic happens here, like transforms
        self.image_paths = image_paths
        self.target_paths = target_paths
        self.transforms = transforms.ToTensor()

    def __getitem__(self, index):
        image = Image.open(self.image_paths[index])
        mask = Image.open(self.target_paths[index])
        t_image = self.transforms(image)
        return t_image, mask

    def __len__(self):  # return count of samples we have
        return len(self.image_paths)


train_dataset = CustomDataset(train_image_paths, train_mask_paths, train=True)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=1)

test_dataset = CustomDataset(test_image_paths, test_mask_paths, train=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=1)
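One small follow-up, since the original question asked for a random split: the path lists above are sliced in order, so a random split usually shuffles the image/mask pairs together before slicing. This is a hedged sketch, not part of the quoted answer:

import random

# Shuffle image and mask paths jointly so corresponding pairs stay aligned.
paired = list(zip(folder_data, folder_mask))
random.seed(42)
random.shuffle(paired)
folder_data, folder_mask = zip(*paired)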
This question mainly concerns the return value of __getitem__ in a pytorch Dataset which I've seen as both a tuple and a dict in the source code.
I have been following this tutorial for creating a dataset class within my code, which is following this tutorial on transfer learning. It has the following definition of a dataset.
class FaceLandmarksDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.landmarks_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        img_name = os.path.join(self.root_dir,
                                self.landmarks_frame.iloc[idx, 0])
        image = io.imread(img_name)
        landmarks = self.landmarks_frame.iloc[idx, 1:].as_matrix()
        landmarks = landmarks.astype('float').reshape(-1, 2)
        sample = {'image': image, 'landmarks': landmarks}

        if self.transform:
            sample = self.transform(sample)

        return sample
As you can see, __getitem__ returns a dictionary with two entries.
In the transfer learning tutorial, the following calls are made to transform a dataset:
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}
data_dir = 'hymenoptera_data'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
                                              shuffle=True, num_workers=4)
               for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes

use_gpu = torch.cuda.is_available()

inputs, classes = next(iter(dataloaders['train']))
That last line of code causes an error in my code when it attempts to run the transform on a sample from my custom dataset.
'dict' object has no attribute 'size'
But if the tutorial dataset is implemented correctly, shouldn't it function correctly with a transform? My own hybrid implementation is below:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
from torch.utils.data import *
from skimage import io, transform

plt.ion()


class NumsDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.docs = []
        for file in os.listdir(root_dir):
            # print(file)
            if file.endswith(".txt"):
                path = os.path.join(root_dir, file)
                with open(path, 'r') as f:
                    self.docs.append((file, list(f.read())))  # tuple containing file, image values pairs
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):  # returns number of images
        i = 0
        for j in self.docs:
            i += len(j[1])
        return i

    def len2(self):  # returns number of batches
        return len(self.docs)

    def __getitem__(self, idx):
        idx1 = idx // self.len2()
        idx2 = idx % self.len2()
        imglabel = self.docs[idx1][0]  # label with filename for batch error calculation later
        imgdir = os.path.join(self.root_dir, self.docs[idx1][0].strip(".txt"))
        img = None
        l = idx2
        for file in os.listdir(imgdir):
            file = os.path.join(imgdir, file)
            if (l == 0):
                img = io.imread(file)
            l -= 1
        sample = (img, imglabel)
        sample = {'image': img, 'label': imglabel}
        if self.transform:
            sample = self.transform(sample)
        return sample


data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}
data_dir = "images"
image_datasets = {x: NumsDataset(os.path.join(data_dir, x),
                                 data_transforms[x])
                  for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=5)
               for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = ["one", "two", "four"]

use_gpu = torch.cuda.is_available()

# Get a batch of training data
inputs, classes = next(iter(dataloaders['train']))
directory structure:
images
    /train
        /file1
            *.jpg
        /file2...
            *.jpg
        file1.txt
        file2.txt...
    /val
        /file1
            *.jpg
        /file2...
            *.jpg
        file1.txt
        file2.txt...
Is the sample I'm returning formatted incorrectly?
The problem below occurs when you pass a dict instead of an image to the transforms. The custom transforms mentioned in the example can handle that, but the default transforms cannot; instead, you can pass only the image to the transform. That solves half of the problem.
'dict' object has no attribute 'size'
The rest of the problem lies with the image handling code in the example, so I had to dig through to transforms.py in torchvision; it uses PIL images, unlike the skimage used in the example, so I replaced the code with PIL and it works perfectly fine.
site-packages/torchvision/transforms/transforms.py
Original Code:
def __getitem__(self, idx):
    if torch.is_tensor(idx):
        idx = idx.tolist()
    img_name = os.path.join(self.root_dir, self.anb_frame.iloc[idx, 0])
    image = io.imread(img_name)
    labels = self.anb_frame.iloc[idx, 1:]
    labels = np.array([labels])
    sample = {'image': image, 'labels': labels}
    if self.transform:
        image = self.transform(image)
    return sample
Modified:
def __getitem__(self, idx):
    if torch.is_tensor(idx):
        idx = idx.tolist()
    img_name = os.path.join(self.root_dir, self.anb_frame.iloc[idx, 0])
    image = Image.open(img_name)
    if self.transform:
        image = self.transform(image)
    labels = self.anb_frame.iloc[idx, 1:]
    labels = np.array([labels])
    sample = {'image': image, 'labels': labels}
    return sample
The data-loading tutorial uses its custom dataset with self-defined transforms, and the transforms must be designed to fit the dataset. So either the dataset must output samples compatible with the library transform functions, or transforms must be defined for the particular sample format. Choosing the latter, among other things, resulted in completely functional code.
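To illustrate that second option, here is a minimal, hedged sketch of a dict-aware transform in the style of the dataloading tutorial; the key names ('image', 'label') follow the sample format used in the question above, and this is not the answerer's exact code:

from torchvision.transforms import functional as TF

class SampleToTensor:
    """Pulls the image out of a {'image': ..., 'label': ...} sample, converts it
    to a tensor, and reassembles the dict, so Compose can operate on dict samples."""
    def __call__(self, sample):
        image, label = sample['image'], sample['label']
        return {'image': TF.to_tensor(image), 'label': label}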