I am trying to build a model that generates a caption for an image, using ResNet as the encoder, a Transformer as the decoder, and COCO as the dataset.
After training my model for 10 epochs, it failed to produce anything other than the word <pad>, which implies that the only tokens coming out of the model are 0s, the index that corresponds to <pad>.
Using the debugger, the problem seems to occur at the argmax, where the output becomes all zeros rather than anything else, but I don't know how to fix it. Is it an issue with my model, or with the way it is trained?
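To illustrate, here is a minimal sketch of how I inspect the collapse at the argmax (decoder and vocab refer to the code below, and memory is assumed to be the encoder output; this is only a debugging aid, not a fix):
import torch

# Debugging sketch: look at which token ids come out of greedy decoding.
with torch.no_grad():
    out = decoder.pred(memory, pred_len=20)      # (pred_len, batch) token ids
    print(torch.bincount(out.flatten().cpu()))   # frequency of each token id
    # If index 0 (<pad>) dominates, the decoder has collapsed onto padding.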
I based my model off of this GitHub repo, if that helps.
The script to download the COCO dataset is here:
download.sh
mkdir data
wget http://msvocds.blob.core.windows.net/annotations-1-0-3/captions_train-val2014.zip -P ./data/
wget http://images.cocodataset.org/zips/train2014.zip -P ./data/
wget http://images.cocodataset.org/zips/val2014.zip -P ./data/
unzip ./data/captions_train-val2014.zip -d ./data/
rm ./data/captions_train-val2014.zip
unzip ./data/train2014.zip -d ./data/
rm ./data/train2014.zip
unzip ./data/val2014.zip -d ./data/
rm ./data/val2014.zip
Any help is much appreciated.
Here is my code:
model.py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
import torchvision.models as models
from torch.nn import TransformerDecoderLayer, TransformerDecoder
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Variable
class EncoderCNN(nn.Module):
def __init__(self, embed_size):
super(EncoderCNN, self).__init__()
resnet = models.resnet152(pretrained=True)
self.resnet = nn.Sequential(*list(resnet.children())[:-2])
self.conv1 = nn.Conv2d(2048, embed_size, 1)
self.embed_size = embed_size
self.fine_tune()
def forward(self, images):
features = self.resnet(images)
batch_size, _,_,_ = features.shape
features = self.conv1(features)
features = features.view(batch_size, self.embed_size, -1)
features = features.permute(2, 0, 1)
return features
def fine_tune(self, fine_tune=True):
for p in self.resnet.parameters():
p.requires_grad = False
# If fine-tuning, only fine-tune convolutional blocks 2 through 4
for c in list(self.resnet.children())[5:]:
for p in c.parameters():
p.requires_grad = fine_tune
class PositionEncoder(nn.Module):
def __init__(self, d_model, dropout, max_len=5000):
super(PositionEncoder, self).__init__()
self.dropout = nn.Dropout(p=dropout)
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer('pe', pe)
def forward(self, x):
x = x + self.pe[:x.size(0), :]
return self.dropout(x)
class Embedder(nn.Module):
def __init__(self, vocab_size, d_model):
super().__init__()
self.embed = nn.Embedding(vocab_size, d_model)
def forward(self, x):
return self.embed(x)
class Transformer(nn.Module):
def __init__(self, vocab_size, d_model, h, num_hidden, N, device, dropout_dec=0.1, dropout_pos=0.1):
super(Transformer, self).__init__()
decoder_layers = TransformerDecoderLayer(d_model, h, num_hidden, dropout_dec)
self.source_mask = None
self.device = device
self.d_model = d_model
self.pos_decoder = PositionEncoder(d_model, dropout_pos)
self.decoder = TransformerDecoder(decoder_layers, N)
self.embed = Embedder(vocab_size, d_model)
self.linear = nn.Linear(d_model, vocab_size)
self.init_weights()
def forward(self, source, mem):
source = source.permute(1,0)
if self.source_mask is None or self.source_mask.size(0) != len(source):
self.source_mask = nn.Transformer.generate_square_subsequent_mask(self=self, sz=len(source)).to(self.device)
source = self.embed(source)
source = source*math.sqrt(self.d_model)
source = self.pos_decoder(source)
output = self.decoder(source, mem, self.source_mask)
output = self.linear(output)
return output
def init_weights(self):
initrange = 0.1
self.linear.bias.data.zero_()
self.linear.weight.data.uniform_(-initrange, initrange)
def pred(self, memory, pred_len):
batch_size = memory.size(1)
src = torch.ones((pred_len, batch_size), dtype=int) * 2
if self.source_mask is None or self.source_mask.size(0) != len(src):
self.source_mask = nn.Transformer.generate_square_subsequent_mask(self=self, sz=len(src)).to(self.device)
output = torch.ones((pred_len, batch_size), dtype=int)
src, output = src.cuda(), output.cuda()
for i in range(pred_len):
src_emb = self.embed(src) # src_len * batch size * embed size
src_emb = src_emb*math.sqrt(self.d_model)
src_emb = self.pos_decoder(src_emb)
out = self.decoder(src_emb, memory, self.source_mask)
out = out[i]
out = self.linear(out) # batch_size * vocab_size
out = out.argmax(dim=1)
if i < pred_len-1:
src[i+1] = out
output[i] = out
return output
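For reference, here are the shapes I expect to flow between the two modules (a sketch with dummy tensors; the 224x224 crop size, the vocab size of 10000, and the CPU device are assumptions):
import torch

# Shape sanity check: ResNet-152 on a 224x224 crop yields a 7x7 grid,
# so the encoder memory has shape (49, batch, embed_size).
encoder = EncoderCNN(embed_size=512)
decoder = Transformer(vocab_size=10000, d_model=512, h=8, num_hidden=512, N=6, device='cpu')
images = torch.randn(2, 3, 224, 224)
captions = torch.randint(0, 10000, (2, 12))  # (batch, seq_len) token ids
memory = encoder(images)                     # (49, 2, 512)
logits = decoder(captions, memory)           # (12, 2, 10000)
print(memory.shape, logits.shape)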
data_loader.py
import torch
import torchvision.transforms as transforms
import torch.utils.data as data
import os
import pickle
import numpy as np
import nltk
from PIL import Image
from build_vocab import Vocabulary
from pycocotools.coco import COCO
class CocoDataset(data.Dataset):
"""COCO Custom Dataset compatible with torch.utils.data.DataLoader."""
def __init__(self, root, json, vocab, transform=None):
"""Set the path for images, captions and vocabulary wrapper.
Args:
root: image directory.
json: coco annotation file path.
vocab: vocabulary wrapper.
transform: image transformer.
"""
self.root = root
self.coco = COCO(json)
self.ids = list(self.coco.anns.keys())
self.vocab = vocab
self.transform = transform
def __getitem__(self, index):
"""Returns one data pair (image and caption)."""
coco = self.coco
vocab = self.vocab
ann_id = self.ids[index]
caption = coco.anns[ann_id]['caption']
img_id = coco.anns[ann_id]['image_id']
path = coco.loadImgs(img_id)[0]['file_name']
image = Image.open(os.path.join(self.root, path)).convert('RGB')
if self.transform is not None:
image = self.transform(image)
# Convert caption (string) to word ids.
tokens = nltk.tokenize.word_tokenize(str(caption).lower())
caption = []
caption.append(vocab('<start>'))
caption.extend([vocab(token) for token in tokens])
caption.append(vocab('<end>'))
target = torch.Tensor(caption)
return image, target
def __len__(self):
return len(self.ids)
def collate_fn(data):
"""Creates mini-batch tensors from the list of tuples (image, caption).
We should build custom collate_fn rather than using default collate_fn,
because merging caption (including padding) is not supported in default.
Args:
data: list of tuple (image, caption).
- image: torch tensor of shape (3, 256, 256).
- caption: torch tensor of shape (?); variable length.
Returns:
images: torch tensor of shape (batch_size, 3, 256, 256).
targets: torch tensor of shape (batch_size, padded_length).
lengths: list; valid length for each padded caption.
"""
# Sort a data list by caption length (descending order).
data.sort(key=lambda x: len(x[1]), reverse=True)
images, captions = zip(*data)
# Merge images (from tuple of 3D tensor to 4D tensor).
images = torch.stack(images, 0)
# Merge captions (from tuple of 1D tensor to 2D tensor).
lengths = [len(cap) for cap in captions]
targets = torch.zeros(len(captions), max(lengths)).long()
for i, cap in enumerate(captions):
end = lengths[i]
targets[i, :end] = cap[:end]
return images, targets, lengths
def get_loader(root, json, vocab, transform, batch_size, shuffle, num_workers):
"""Returns torch.utils.data.DataLoader for custom coco dataset."""
# COCO caption dataset
coco = CocoDataset(root=root,
json=json,
vocab=vocab,
transform=transform)
# Data loader for COCO dataset
# This will return (images, captions, lengths) for each iteration.
# images: a tensor of shape (batch_size, 3, 224, 224).
# captions: a tensor of shape (batch_size, padded_length).
# lengths: a list indicating valid length for each caption. length is (batch_size).
data_loader = torch.utils.data.DataLoader(dataset=coco,
batch_size=batch_size,
shuffle=shuffle,
num_workers=num_workers,
collate_fn=collate_fn)
return data_loader
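As the collate_fn docstring says, every caption in a batch is zero-padded to the longest one, and 0 is exactly the <pad> index in the vocabulary built below. A quick sketch of what it produces:
import torch

# Two captions of lengths 5 and 3: the shorter one comes back padded with 0s.
fake = [(torch.zeros(3, 256, 256), torch.Tensor([1, 7, 9, 4, 2])),
        (torch.zeros(3, 256, 256), torch.Tensor([1, 8, 2]))]
images, targets, lengths = collate_fn(fake)
print(targets)  # tensor([[1, 7, 9, 4, 2], [1, 8, 2, 0, 0]])
print(lengths)  # [5, 3]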
build_vocab.py
import nltk
import pickle
import argparse
from collections import Counter
from pycocotools.coco import COCO
class Vocabulary(object):
def __init__(self):
self.word2idx = {}
self.idx2word = {}
self.idx = 0
def add_word(self, word):
if not word in self.word2idx:
self.word2idx[word] = self.idx
self.idx2word[self.idx] = word
self.idx += 1
def __call__(self, word):
if not word in self.word2idx:
return self.word2idx['<unk>']
return self.word2idx[word]
def __len__(self):
return len(self.word2idx)
def build_vocab(json, threshold):
coco = COCO(json)
counter = Counter()
ids = coco.anns.keys()
for i, id in enumerate(ids):
caption = str(coco.anns[id]['caption'])
tokens = nltk.tokenize.word_tokenize(caption.lower())
counter.update(tokens)
if (i+1) % 1000 == 0:
print("[{}/{}] Tokenized the captions.".format(i+1, len(ids)))
# If the word frequency is less than 'threshold', then the word is discarded.
words = [word for word, cnt in counter.items() if cnt >= threshold]
# Create a vocab wrapper and add some special tokens.
vocab = Vocabulary()
vocab.add_word('<pad>')
vocab.add_word('<start>')
vocab.add_word('<end>')
vocab.add_word('<unk>')
# Add the words to the vocabulary.
for i, word in enumerate(words):
vocab.add_word(word)
return vocab
def main(args):
vocab = build_vocab(json=args.caption_path, threshold=args.threshold)
vocab_path = args.vocab_path
with open(vocab_path, 'wb') as f:
pickle.dump(vocab, f)
print("Total vocabulary size: {}".format(len(vocab)))
print("Saved the vocabulary wrapper to '{}'".format(vocab_path))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--caption_path', type=str,
default='./data/annotations/captions_train2014.json',
help='path for train annotation file')
parser.add_argument('--vocab_path', type=str, default='./data/vocab.pkl',
help='path for saving vocabulary wrapper')
parser.add_argument('--threshold', type=int, default=4,
help='minimum word count threshold')
args = parser.parse_args()
main(args)
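Because the special tokens are added before anything else, their indices are fixed; a quick check (sketch):
# The insertion order above pins the special-token ids.
vocab = Vocabulary()
for w in ('<pad>', '<start>', '<end>', '<unk>'):
    vocab.add_word(w)
print(vocab('<pad>'), vocab('<start>'), vocab('<end>'), vocab('<unk>'))  # 0 1 2 3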
train.py
import argparse
import torch
import torch.nn as nn
import numpy as np
import os
import pickle
import math
from tqdm import tqdm
from data_loader import get_loader
from build_vocab import Vocabulary
from model import EncoderCNN, Transformer as Decoder
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def main(args):
batch_size = 64
embed_size = 512
num_heads = 8
num_layers = 6
num_workers = 2
num_epoch = 5
lr = 1e-3
load = False
# Create model directory
if not os.path.exists('models/'):
os.makedirs('models/')
# Image preprocessing, normalization for the pretrained resnet
transform = transforms.Compose([
transforms.RandomCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406),
(0.229, 0.224, 0.225))])
# Load vocabulary wrapper
with open('data/vocab.pkl', 'rb') as f:
vocab = pickle.load(f)
# Build data loader
data_loader = get_loader('data/resized2014', 'data/annotations/captions_train2014.json', vocab,
transform, batch_size,
shuffle=True, num_workers=num_workers)
encoder = EncoderCNN(embed_size).to(device)
encoder.fine_tune(False)
decoder = Decoder(len(vocab), embed_size, num_heads, embed_size, num_layers, device).to(device)
if(load):
encoder.load_state_dict(torch.load(os.path.join('models/', 'encoder-{}-{}.ckpt'.format(5, 5000))))
decoder.load_state_dict(torch.load(os.path.join('models/', 'decoder-{}-{}.ckpt'.format(5, 5000))))
print("Load Successful")
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
encoder_optim = torch.optim.Adam(encoder.parameters(), lr=lr)
decoder_optim = torch.optim.Adam(decoder.parameters(), lr=lr)
# Train the models
for epoch in range(num_epoch):
encoder.train()
decoder.train()
for i, (images, captions, lengths) in tqdm(enumerate(data_loader), total=len(data_loader), leave=False):
# Set mini-batch dataset
images = images.to(device)
captions = captions.to(device)
# Forward, backward and optimize
features = encoder(images)
cap_input = captions[:, :-1]
cap_target = captions[:, 1:]
outputs = decoder(cap_input, features)
outputs = outputs.permute(1,0,2)
outputs_shape = outputs.reshape(-1, len(vocab))
loss = criterion(outputs_shape, cap_target.reshape(-1))
decoder.zero_grad()
encoder.zero_grad()
loss.backward()
encoder_optim.step()
decoder_optim.step()
# Save the model checkpoints
if (i+1) % args.save_step == 0:
torch.save(decoder.state_dict(), os.path.join(
'models/', 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)))
torch.save(encoder.state_dict(), os.path.join(
'models/', 'encoder-{}-{}.ckpt'.format(epoch+1, i+1)))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--log_step', type=int , default=10, help='step size for printing log info')
parser.add_argument('--save_step', type=int , default=1000, help='step size for saving trained models')
args = parser.parse_args()
print(args)
main(args)
sample.py
import torch
import matplotlib.pyplot as plt
import numpy as np
import argparse
import pickle
import os
from torchvision import transforms
from build_vocab import Vocabulary
from data_loader import get_loader
from model import EncoderCNN, Transformer as Decoder
from PIL import Image
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#
def token_sentence(decoder_out, itos):
tokens = decoder_out
tokens = tokens.transpose(1, 0)
tokens = tokens.cpu().numpy()
results = []
for instance in tokens:
result = ' '.join([itos[x] for x in instance])
results.append(''.join(result.partition('<end>')[0])) # Cut before '<end>'
return results
def load_image(image_path, transform=None):
image = Image.open(image_path).convert('RGB')
image = image.resize([224, 224], Image.LANCZOS)
if transform is not None:
image = transform(image).unsqueeze(0)
return image
def main(args):
batch_size = 64
embed_size = 512
num_heads = 8
num_layers = 6
num_workers = 2
# Image preprocessing
transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406),
(0.229, 0.224, 0.225))])
# Load vocabulary wrapper
with open(args.vocab_path, 'rb') as f:
vocab = pickle.load(f)
data_loader = get_loader('data/resized2014', 'data/annotations/captions_train2014.json', vocab,
transform, batch_size,
shuffle=True, num_workers=num_workers)
# Build models
encoder = EncoderCNN(embed_size).to(device)
encoder.fine_tune(False)
decoder = Decoder(len(vocab), embed_size, num_heads, embed_size, num_layers, device).to(device)
# Load trained models
encoder.load_state_dict(torch.load(os.path.join('models/', 'encoder-{}-{}.ckpt'.format(1, 4000))))
decoder.load_state_dict(torch.load(os.path.join('models/', 'decoder-{}-{}.ckpt'.format(1, 4000))))
encoder.eval()
decoder.eval()
itos = vocab.idx2word
pred_len = 100
result_collection = []
# Decode with greedy
# with torch.no_grad():
# for i, (images, captions, lengths) in enumerate(data_loader):
# images = images.to(device)
# features = encoder(images)
# output = decoder.generator(features, pred_len)
# result_caption = token_sentence(output, itos)
# result_collection.extend(result_caption)
# Decode with greedy
with torch.no_grad():
for batch_index, (inputs, captions, caplens) in enumerate(data_loader):
inputs, captions = inputs.cuda(), captions.cuda()
enc_out = encoder(inputs)
captions_input = captions[:, :-1]
captions_target = captions[:, 1:]
output = decoder.pred(enc_out, pred_len)
result_caption = token_sentence(output, itos)
result_collection.extend(result_caption)
print("Prediction-greedy:", result_collection[1])
print("Prediction-greedy:", result_collection[2])
print("Prediction-greedy:", result_collection[3])
print("Prediction-greedy:", result_collection[4])
print("Prediction-greedy:", result_collection[5])
print("Prediction-greedy:", result_collection[6])
print("Prediction-greedy:", result_collection[7])
print("Prediction-greedy:", result_collection[8])
print("Prediction-greedy:", result_collection[9])
print("Prediction-greedy:", result_collection[10])
print("Prediction-greedy:", result_collection[11])
# # Prepare an image
# image = load_image(args.image, transform)
# image_tensor = image.to(device)
# # Generate an caption from the image
# feature = encoder(image_tensor)
# sampled_ids = decoder.generator(feature, pred_len)
# sampled_ids = sampled_ids[0].cpu().numpy() # (1, max_seq_length) -> (max_seq_length)
# # Convert word_ids to words
# sampled_caption = []
# for word_id in sampled_ids:
# word = vocab.idx2word[word_id]
# sampled_caption.append(word)
# if word == '<end>':
# break
# sentence = ' '.join(sampled_caption)
# # Print out the image and the generated caption
# print (sentence)
# image = Image.open(args.image)
# plt.imshow(np.asarray(image))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--image', type=str, required=False, help='input image for generating caption')
parser.add_argument('--vocab_path', type=str, default='data/vocab.pkl', help='path for vocabulary wrapper')
args = parser.parse_args()
main(args)
resize.py
import argparse
import os
from PIL import Image
def resize_image(image, size):
"""Resize an image to the given size."""
return image.resize(size, Image.LANCZOS)
def resize_images(image_dir, output_dir, size):
"""Resize the images in 'image_dir' and save into 'output_dir'."""
if not os.path.exists(output_dir):
os.makedirs(output_dir)
images = os.listdir(image_dir)
num_images = len(images)
for i, image in enumerate(images):
with open(os.path.join(image_dir, image), 'r+b') as f:
with Image.open(f) as img:
img = resize_image(img, size)
img.save(os.path.join(output_dir, image), img.format)
if (i+1) % 100 == 0:
print ("[{}/{}] Resized the images and saved into '{}'."
.format(i+1, num_images, output_dir))
def main(args):
image_dir = args.image_dir
output_dir = args.output_dir
image_size = [args.image_size, args.image_size]
resize_images(image_dir, output_dir, image_size)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--image_dir', type=str, default='./data/train2014/',
help='directory for train images')
parser.add_argument('--output_dir', type=str, default='./data/resized2014/',
help='directory for saving resized images')
parser.add_argument('--image_size', type=int, default=256,
help='size for image after processing')
args = parser.parse_args()
main(args)
Related
I'm following this object detection tutorial in PyTorch for Mask R-CNN: https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html and at the end there are some verification images.
How can I get these prediction images? I'm working with object detection.
Is there some way to output images from the trained model so I can see whether my network is learning anything?
This is my code:
import os
import numpy as np
import torch
from PIL import Image
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
#from engine import train_one_epoch, evaluate
from vision.references.detection.engine import train_one_epoch, evaluate
import utils
import transforms as T
import matplotlib.pyplot as plt
class Moj_Dataset_ArT(object):
def __init__(self, root, transforms):
self.root = root
self.transforms = transforms
# load all image files, sorting them to
# ensure that they are aligned
self.imgs = list(sorted(os.listdir(os.path.join(root, "NEW_train"))))
self.masks = list(sorted(os.listdir(os.path.join(root, "train_masks"))))
def __getitem__(self, idx):
# load images and masks
img_path = os.path.join(self.root, "NEW_train", self.imgs[idx])
mask_path = os.path.join(self.root, "train_masks", self.masks[idx])
img = Image.open(img_path).convert("RGB")
# note that we haven't converted the mask to RGB,
# because each color corresponds to a different instance
# with 0 being background
mask = Image.open(mask_path)
mask = np.array(mask)
# instances are encoded as different colors
obj_ids = np.unique(mask)
# first id is the background, so remove it
obj_ids = obj_ids[1:]
# split the color-encoded mask into a set
# of binary masks
masks = mask == obj_ids[:, None, None]
# get bounding box coordinates for each mask
num_objs = len(obj_ids)
boxes = []
for i in range(num_objs):
pos = np.where(masks[i])
xmin = np.min(pos[1])
xmax = np.max(pos[1])
ymin = np.min(pos[0])
ymax = np.max(pos[0])
boxes.append([xmin, ymin, xmax, ymax])
boxes = torch.as_tensor(boxes, dtype=torch.float32)
# there is only one class
labels = torch.ones((num_objs,), dtype=torch.int64)
masks = torch.as_tensor(masks, dtype=torch.uint8)
image_id = torch.tensor([idx])
area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
# suppose all instances are not crowd
iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
target = {}
target["boxes"] = boxes
target["labels"] = labels
target["masks"] = masks
target["image_id"] = image_id
target["area"] = area
target["iscrowd"] = iscrowd
if self.transforms is not None:
img, target = self.transforms(img, target)
return img, target
def __len__(self):
return len(self.imgs)
def get_model_instance_segmentation(num_classes):
# load an instance segmentation model pre-trained on COCO
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=False) # was True
# get number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
# now get the number of input features for the mask classifier
in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
hidden_layer = 256
# and replace the mask predictor with a new one
model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
hidden_layer,
num_classes)
return model
def get_transform(train):
transforms = []
transforms.append(T.ToTensor())
#if train:
#transforms.append(T.RandomHorizontalFlip(0.5))
return T.Compose(transforms)
def draw_loss(ml):
plt.figure(figsize=(10,5))
plt.title("Training Loss")
#plt.plot(val_losses,label="val")
plt.plot(ml,label="train")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.xlim([0,80])
plt.ylim([0, 1.2])
plt.show()
def main():
# train on the GPU or on the CPU, if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# our dataset has two classes only - background and person
num_classes = 2
# use our dataset and defined transformations
dataset = Moj_Dataset_ArT('Train/ArT', get_transform(train=True))
dataset_test = Moj_Dataset_ArT('Train/ArT', get_transform(train=False))
# split the dataset in train and test set
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-500])
dataset_test = torch.utils.data.Subset(dataset_test, indices[-500:])
# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(
dataset, batch_size=4, shuffle=True, num_workers=4,
collate_fn=utils.collate_fn)
data_loader_test = torch.utils.data.DataLoader(
dataset_test, batch_size=1, shuffle=False, num_workers=4,
collate_fn=utils.collate_fn)
# get the model using our helper function
model = get_model_instance_segmentation(num_classes)
# move model to the right device
model.to(device)
# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.0005,
momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
step_size=30,
gamma=0.1) # gamma was 0.5
# let's train it for 200 epochs
num_epochs = 200
PATH = 'home//Train/ArT/models/'
ml =[]
for epoch in range(num_epochs):
# train for one epoch, printing every 10 iterations
loss_value = train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
ml.append(loss_value)
# update the learning rate
lr_scheduler.step()
torch.save(model, PATH)
# evaluate on the test dataset
#evaluate(model, data_loader_test, device=device)
print(ml)
draw_loss(ml)
if __name__ == "__main__":
main()
Something like this
When I run this after evaluate():
img, _ = dataset_test[20]
with torch.no_grad():
prediction = model([img.to(device)])
imag = Image.fromarray(img.mul(255).permute(1, 2, 0).byte().numpy())
imaag = Image.fromarray(prediction[0]['masks'][0, 0].mul(255).byte().cpu().numpy())
imag.show()
imaag.show()
I get this (not a good image):
I also tried Detectron2,
but then I would need to make a cfg file and train a different way, and I'm following this PyTorch tutorial.
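For what it's worth, here is a sketch of how one might render all predicted masks on top of the input in a single figure (it assumes img and prediction from the snippet above; the 0.5 score and mask thresholds are guesses):
import numpy as np
import matplotlib.pyplot as plt

# Overlay every mask whose score clears the threshold on the input image.
im = img.mul(255).permute(1, 2, 0).byte().numpy()
plt.imshow(im)
for mask, score in zip(prediction[0]['masks'], prediction[0]['scores']):
    if score > 0.5:                # confidence threshold (a guess)
        m = mask[0].cpu().numpy()
        plt.imshow(np.where(m > 0.5, m, np.nan), alpha=0.4, cmap='jet')
plt.axis('off')
plt.show()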
I'm just getting started with PyTorch. I am trying to do a simple binary classification project with the cats and dogs dataset. After much fumbling around, I was able to get the model to train, but I'm not getting the expected results.
First, the loss starts out way too low. To me, that seems to indicate I’m not measuring loss correctly.
Second, the model just predicts everything as 0.
I’m sure there are many mistakes here, but I would appreciate it if someone could take a look and let me know what I’m doing wrong. Thank you!
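(For reference, a quick sketch of how to confirm the "everything is predicted as 0" symptom numerically; net and dataloader are defined in the code below:)
import torch

# Count how often each class is predicted over one batch.
with torch.no_grad():
    inputs, labels = next(iter(dataloader))
    preds = net(inputs.float()).argmax(dim=1)
    print(torch.bincount(preds, minlength=2))  # e.g. tensor([4, 0]) means all 0s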
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from torchvision.io import read_image
from torch.utils.data import Dataset, DataLoader
from torchvision.utils import make_grid
from torchvision.utils import save_image
from sklearn.model_selection import train_test_split
import os
import numpy as np
from sklearn import preprocessing
import glob
import cv2
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
IMAGE_SIZE = 64
DATA_DIR = "C:\\Users\\user\\source\\repos\\pytorch-youtube\\data\\catsdogs\\PetImages\\"
LABELS = ('cat', 'dog')
# custom dataset class
# expects the root folder to have sub folders with class names
# and pictures of classes inside folder
class CustomImageDataset(Dataset):
def __init__(self):
self.imgs_path = DATA_DIR
file_list = glob.glob(self.imgs_path + "*")
self.data = []
for class_path in file_list:
class_name = class_path.split("\\")[-1]
for img_path in glob.glob(class_path + "\\*.jpg"):
self.data.append([img_path, class_name])
self.class_map = {"Dog": 0, "Cat": 1}
self.img_dim = (IMAGE_SIZE, IMAGE_SIZE)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
img_path, class_name = self.data[idx]
# this is to handle corrupt images in the dataset
# could probably be handled better
try:
img = cv2.imread(img_path)
img = cv2.resize(img, self.img_dim)
except:
img_path, class_name = self.data[idx+1]
img = cv2.imread(img_path)
img = cv2.resize(img, self.img_dim)
class_id = self.class_map[class_name]
img_tensor = torch.from_numpy(img)
img_tensor = img_tensor.permute(2, 0, 1) # reorder HxWxC (OpenCV layout) to CxHxW, which Conv2d expects
class_id = torch.tensor([class_id])
return img_tensor, class_id
# as is, we aren't using these
transform = transforms.Compose(
[transforms.Resize((64, 64)),
transforms.ConvertImageDtype(torch.float32),
transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5)),
]
)
dataset = CustomImageDataset()
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
dataiter = iter(dataloader)
train_features, train_labels = next(dataiter)
class Net(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(2704, 128) # 2704 = 16 channels * 13 * 13 after two 5x5 convs and 2x2 pools on a 64x64 input
self.fc2 = nn.Linear(128, 64)
self.fc3 = nn.Linear(64, 2)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = torch.flatten(x, 1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
net = Net()
# net.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.001)
for epoch in range(2):
running_loss = 0.0
for i, data in enumerate(dataloader, 0):
inputs, labels = data
# this is the fix for "expected scalar type Byte but found Float"
# this seems to completely destroy the features in the image to just white
inputs = inputs.float()
optimizer.zero_grad()
outputs = net(inputs)
loss = criterion(outputs, torch.max(labels,1)[1])
loss.backward()
optimizer.step()
running_loss += loss.item()
if i % 2000 == 1999: # print every 2000 mini-batches
print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.10f}')
running_loss = 0.0
print("finished")
# save the model
PATH = './custom_trained_model_dogs_cats.pth'
torch.save(net.state_dict(), PATH)
It seems I was passing in the wrong thing to my loss function. I changed this line
loss = criterion(outputs, torch.max(labels,1)[1])
to this
loss = criterion(outputs, torch.max(labels,1)[0])
and everything seems to be working. I'm able to correctly classify the cats and dogs.
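That works because __getitem__ returns a class id of shape (1,), so the collated labels have shape (batch, 1): torch.max(labels, 1)[1] is the argmax over a single column, which is always 0 (and is why everything was predicted as 0), while torch.max(labels, 1)[0] reads the actual value back out. An arguably clearer equivalent (a sketch):
# CrossEntropyLoss wants targets of shape (batch,) with dtype long, so
# squeezing the extra dimension does the same job as the torch.max trick.
loss = criterion(outputs, labels.squeeze(1).long())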
I am new to PyTorch.
The task: create train, validation, and test classes.
Data:
A CSV file with 2 columns, where id is the name of the picture stored in the train and test1 directories.
Directories with train and test data images.
My code so far:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from PIL import Image
##transforms
transforms = transforms.Compose([
transforms.Resize(64),
transforms.ToTensor(),
transforms.Normalize(mean = [0.485, 0.456, 0.406],
std = [0.229, 0.224, 0.225])
])
##dataloader
dataset_path = "C:/Users/nikit/OneDrive/Desktop/PyTorch/train/train"
dataset = torchvision.datasets.ImageFolder(root = dataset_path, transform = transforms)
val_split = 0.2
dataset_size = len(dataset)
val_size = int(val_split * dataset_size)
train_size = dataset_size - val_size
train_data, val_data = torch.utils.data.random_split(dataset, [train_size, val_size])
##define test
test_data_path = "C:/Users/nikit/OneDrive/Desktop/PyTorch/test1/test1"
test_data = torchvision.datasets.ImageFolder(root = test_data_path, transform = transforms)
##data load
batch_size = 64
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size = batch_size)
val_data_loader = torch.utils.data.DataLoader(val_data, batch_size = batch_size)
test_data_loader = torch.utils.data.DataLoader(test_data, batch_size = batch_size)
Please help me connect the values with the CSV data:
import os
import pandas as pd
train_df = pd.DataFrame(columns=["img_name","label"])
train_df["img_name"] = os.listdir(path_train)
for idx, i in enumerate(os.listdir(path_train)):
if "cat" in i:
train_df["label"][idx] = 0
if "dog" in i:
train_df["label"][idx] = 1
train_df.to_csv(r'train_csv.csv', index = False, header=True)
Creating a Custom Dataset for your files
A custom Dataset class must implement three functions: __init__, __len__, and __getitem__. Take a look at this implementation; the FashionMNIST images are stored in a directory img_dir, and their labels are stored separately in a CSV file annotations_file.
In the next sections, we’ll break down what’s happening in each of these functions:
import os
import pandas as pd
from torch.utils.data import Dataset
from torchvision.io import read_image
class CustomImageDataset(Dataset):
def __init__(self, annotations_file, img_dir, transform=None,
target_transform=None):
self.img_labels = pd.read_csv(annotations_file)
self.img_dir = img_dir
self.transform = transform
self.target_transform = target_transform
def __len__(self):
return len(self.img_labels)
def __getitem__(self, idx):
img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
image = read_image(img_path)
label = self.img_labels.iloc[idx, 1]
if self.transform:
image = self.transform(image)
if self.target_transform:
label = self.target_transform(label)
return image, label
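To tie this back to the CSV generated above, instantiation would look roughly like this (a sketch; it reuses train_csv.csv and path_train from the earlier snippet, and batching assumes the images share a size or a resizing transform is supplied):
from torch.utils.data import DataLoader

# read_image already returns a uint8 tensor, so the ToTensor-based Compose
# defined earlier would not apply cleanly here; pass a tensor-friendly
# transform (or None) instead.
train_dataset = CustomImageDataset(annotations_file='train_csv.csv',
                                   img_dir=path_train,
                                   transform=None)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
images, labels = next(iter(train_loader))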
I have a dataset of images that looks like this:
array([[[[0.35980392, 0.26078431, 0.14313725],
[0.38137255, 0.26470588, 0.15196078],
[0.51960784, 0.3745098 , 0.26176471],
...,
[0.34313725, 0.22352941, 0.15 ],
[0.30784314, 0.2254902 , 0.15686275],
[0.28823529, 0.22843137, 0.16862745]],
[[0.38627451, 0.28235294, 0.16764706],
[0.45098039, 0.32843137, 0.21666667],
[0.62254902, 0.47254902, 0.36470588],
...,
[0.34607843, 0.22745098, 0.15490196],
[0.30686275, 0.2245098 , 0.15588235],
[0.27843137, 0.21960784, 0.16176471]],
[[0.41568627, 0.30098039, 0.18431373],
[0.51862745, 0.38529412, 0.27352941],
[0.67745098, 0.52058824, 0.40980392],
...,
[0.34901961, 0.22941176, 0.15588235],
[0.29901961, 0.21666667, 0.14901961],
[0.26078431, 0.20098039, 0.14313725]],
...,
This is how I download it:
data, attrs = fetch_dataset()
This is how the fetch_dataset() function works:
import numpy as np
import os
from skimage.transform import resize
import skimage.io
import pandas as pd
def fetch_dataset(attrs_name = "lfw_attributes.txt",
images_name = "lfw-deepfunneled",
dx=80,dy=80,
dimx=45,dimy=45
):
#download if not exists
if not os.path.exists(images_name):
print("images not found, donwloading...")
os.system("wget http://vis-www.cs.umass.edu/lfw/lfw-deepfunneled.tgz -O tmp.tgz")
print("extracting...")
os.system("tar xvzf tmp.tgz && rm tmp.tgz")
print("done")
assert os.path.exists(images_name)
if not os.path.exists(attrs_name):
print("attributes not found, downloading...")
os.system("wget http://www.cs.columbia.edu/CAVE/databases/pubfig/download/%s" % attrs_name)
print("done")
#read attrs
df_attrs = pd.read_csv("lfw_attributes.txt",sep='\t',skiprows=1,)
df_attrs = pd.DataFrame(df_attrs.iloc[:,:-1].values, columns = df_attrs.columns[1:])
#read photos
photo_ids = []
for dirpath, dirnames, filenames in os.walk(images_name):
for fname in filenames:
if fname.endswith(".jpg"):
fpath = os.path.join(dirpath,fname)
photo_id = fname[:-4].replace('_',' ').split()
person_id = ' '.join(photo_id[:-1])
photo_number = int(photo_id[-1])
photo_ids.append({'person':person_id,'imagenum':photo_number,'photo_path':fpath})
photo_ids = pd.DataFrame(photo_ids)
# print(photo_ids)
#mass-merge
#(photos now have same order as attributes)
df = pd.merge(df_attrs,photo_ids,on=('person','imagenum'))
assert len(df)==len(df_attrs),"lost some data when merging dataframes"
# print(df.shape)
#image preprocessing
all_photos =df['photo_path'].apply(skimage.io.imread)\
.apply(lambda img:img[dy:-dy,dx:-dx])\
.apply(lambda img: resize(img,[dimx,dimy]))
all_photos = np.stack(all_photos.values)#.astype('uint8')
all_attrs = df.drop(["photo_path","person","imagenum"],axis=1)
return all_photos,all_attrs
Next I'm trying to convert my dataset to Tensor:
import torch
import torchvision.transforms
from torchvision import transforms as transforms
class MyDataset(torch.utils.data.Dataset):
def __init__(self, data):
self.dataset = data
def __getitem__(self, idx):
sample = self.dataset[idx]
data, label = sample[0], sample[1]
transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
return transform(data), torch.tensor(label)
def __len__(self):
return len(self.dataset)
Then:
from sklearn.model_selection import train_test_split
train, val = train_test_split(data, test_size=0.25, random_state=42)
train_dataset = MyDataset(train)
val_dataset = MyDataset(val)
Autoencoder class:
from copy import deepcopy
import torch.nn as nn
class Autoencoder(nn.Module):
def __init__(self):
#<define the encoder and decoder architectures>
super(Autoencoder,self).__init__()
self.encoder = nn.Sequential(
nn.Conv2d(3, 6, kernel_size=5),
nn.ReLU(True),
nn.Conv2d(6,16,kernel_size=5),
nn.ReLU(True))
self.decoder = nn.Sequential(
nn.Conv2d(16, 6, kernel_size=5),
nn.ReLU(True),
nn.Conv2d(6,3,kernel_size=5),
nn.ReLU(True))
def forward(self, x):
latent_code = self.encoder(x)
reconstruction = self.decoder(latent_code)
return reconstruction, latent_code
Then:
criterion = nn.BCELoss()
autoencoder = Autoencoder()
optimizer = torch.optim.Adam(autoencoder.parameters())
The training itself:
#<your autoencoder training code here>
num_epochs = 5
for epoch in range(num_epochs):
for x in train_dataset:
img, _ = x
img = img.cpu()
output = autoencoder(img)
loss = criterion(output, img)
optimizer.zero_grad()
loss.backward()
optimizer.step()
print('epoch [{}/{}], loss:{:.4f}'.format(epoch+1, num_epochs, loss.item()))
And I get this error:
RuntimeError: output with shape [1, 45, 3] doesn't match the broadcast shape [3, 45, 3]
I think I'm making mistakes somewhere in preparing my data. Could someone please explain how to do it properly in my case? Thanks for any help.
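A shape check along these lines (a sketch; data is the array returned by fetch_dataset) narrows down where the stray leading 1 comes from:
# data is (N, 45, 45, 3), so dataset[idx] is a single image, and sample[0] /
# sample[1] in __getitem__ slice its first two ROWS (shape (45, 3)) rather
# than an (image, label) pair. ToTensor on a (45, 3) array yields (1, 45, 3),
# which Normalize with three channel means cannot broadcast against.
print(data.shape)        # (N, 45, 45, 3)
print(data[0].shape)     # (45, 45, 3) -- one image
print(data[0][0].shape)  # (45, 3)     -- just the first row of that image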
I am currently working on a project using GANs.
Searching through several models, I found Conditional GAN and the Face-Aging-with-Identity-Preserved-Conditional-Generative-Adversarial-Networks project on GitHub.
https://github.com/dawei6875797/Face-Aging-with-Identity-Preserved-Conditional-Generative-Adversarial-Networks
The README.md for this project describes how to download the data and how to train the model.
Viewing the results using the data and trained model provided by this project was a success, but I failed to train on the data I wanted.
I have tried many things, such as changing the path of the data, but have not been successful. I'm not sure what I missed.
This is my code:
import os.path
import os
os.environ['CUDA_DEVICE_ORDER'] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
import numpy as np
import tensorflow as tf
from datetime import datetime
from models import FaceAging
import sys
sys.path.append('./tools/')
from source_input import load_source_batch3
from utils import save_images, save_source
from data_generator import ImageDataGenerator
flags = tf.app.flags
flags.DEFINE_float("learning_rate", 0.001, "Learning rate")
flags.DEFINE_integer("batch_size", 32, "The size of batch images")
flags.DEFINE_integer("image_size", 128, "the size of the generated image")
flags.DEFINE_integer("noise_dim", 256, "the length of the noise vector")
flags.DEFINE_integer("feature_size", 128, "image size after stride 2 conv")
flags.DEFINE_integer("age_groups", 5, "the number of different age groups")
flags.DEFINE_integer('max_steps', 200000, 'Number of batches to run')
flags.DEFINE_string("alexnet_pretrained_model", "pre_trained/alexnet.model-292000",
"Directory name to save the checkpoints")
flags.DEFINE_string("age_pretrained_model", "pre_trained/age_classifier.model-300000",
"Directory name to save the checkpoints")
flags.DEFINE_integer('model_index', None, 'the index of trained model')
flags.DEFINE_float("gan_loss_weight", 75, "gan_loss_weight")
flags.DEFINE_float("fea_loss_weight", 0.5e-4, "fea_loss_weight")
flags.DEFINE_float("age_loss_weight", 30, "age_loss_weight")
flags.DEFINE_float("tv_loss_weight", None, "face_loss_weight")
flags.DEFINE_string("checkpoint_dir", "checkpoints/age/0_conv5_lsgan_transfer_g75_0.5f-4_a30", "Directory name to save the checkpoints")
flags.DEFINE_string("source_checkpoint_dir", ' ', "Directory name to save the checkpoints")
flags.DEFINE_string("sample_dir", "age/0_conv5_lsgan_transfer_g75_0.5f-4_a30 ", "Directory name to save the sample images")
flags.DEFINE_string("fea_layer_name", 'conv5', "which layer to use for fea_loss")
flags.DEFINE_string("source_file", "train/", "source file path")
flags.DEFINE_string("root_folder", "CACD_cropped_400/", "folder that contains images")
FLAGS = flags.FLAGS
# How often to run a batch through the validation model.
VAL_INTERVAL = 5000
# How often to save a model checkpoint
SAVE_INTERVAL = 10000
d_iter = 1
g_iter = 1
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# Initialize the data generator separately for the training and validation set
train_generator = ImageDataGenerator(batch_size=FLAGS.batch_size, height=FLAGS.feature_size, width=FLAGS.feature_size,
z_dim=FLAGS.noise_dim, scale_size=(FLAGS.image_size, FLAGS.image_size), mode='train')
def my_train():
with tf.Graph().as_default():
sess = tf.Session(config=config)
model = FaceAging(sess=sess, lr=FLAGS.learning_rate, keep_prob=1., model_num=FLAGS.model_index, batch_size=FLAGS.batch_size,
age_loss_weight=FLAGS.age_loss_weight, gan_loss_weight=FLAGS.gan_loss_weight,
fea_loss_weight=FLAGS.fea_loss_weight, tv_loss_weight=FLAGS.tv_loss_weight)
imgs = tf.placeholder(tf.float32, [FLAGS.batch_size, FLAGS.image_size, FLAGS.image_size, 3])
true_label_features_128 = tf.placeholder(tf.float32, [FLAGS.batch_size, 128, 128, FLAGS.age_groups])
true_label_features_64 = tf.placeholder(tf.float32, [FLAGS.batch_size, 64, 64, FLAGS.age_groups])
false_label_features_64 = tf.placeholder(tf.float32, [FLAGS.batch_size, 64, 64, FLAGS.age_groups])
age_label = tf.placeholder(tf.int32, [FLAGS.batch_size])
source_img_227, source_img_128, face_label = load_source_batch3(FLAGS.source_file, FLAGS.root_folder, FLAGS.batch_size)
model.train_age_lsgan_transfer(source_img_227, source_img_128, imgs, true_label_features_128,
true_label_features_64, false_label_features_64, FLAGS.fea_layer_name, age_label)
ge_samples = model.generate_images(imgs, true_label_features_128, reuse=True, mode='train')
# Create a saver.
model.saver = tf.train.Saver(model.save_d_vars + model.save_g_vars, max_to_keep=200)
model.alexnet_saver = tf.train.Saver(model.alexnet_vars)
model.age_saver = tf.train.Saver(model.age_vars)
d_error = model.d_loss/model.gan_loss_weight
g_error = model.g_loss/model.gan_loss_weight
fea_error = model.fea_loss/model.fea_loss_weight
age_error = model.age_loss/model.age_loss_weight
# Start running operations on the Graph.
sess.run(tf.global_variables_initializer())
tf.train.start_queue_runners(sess)
model.alexnet_saver.restore(sess, FLAGS.alexnet_pretrained_model)
model.age_saver.restore(sess, FLAGS.age_pretrained_model)
if model.load(FLAGS.checkpoint_dir, model.saver):
print(" [*] Load SUCCESS")
else:
print(" [!] Load failed...")
print("{} Start training...")
# Loop over max_steps
for step in range(FLAGS.max_steps):
images, t_label_features_128, t_label_features_64, f_label_features_64, age_labels = \
train_generator.next_target_batch_transfer2()
dict = {imgs: images,
true_label_features_128: t_label_features_128,
true_label_features_64: t_label_features_64,
false_label_features_64: f_label_features_64,
age_label: age_labels
}
for i in range(d_iter):
_, d_loss = sess.run([model.d_optim, d_error], feed_dict=dict)
for i in range(g_iter):
_, g_loss, fea_loss, age_loss = sess.run([model.g_optim, g_error, fea_error, age_error],
feed_dict=dict)
format_str = ('%s: step %d, d_loss = %.3f, g_loss = %.3f, fea_loss=%.3f, age_loss=%.3f')
print(format_str % (datetime.now(), step, d_loss, g_loss, fea_loss, age_loss))
# Save the model checkpoint periodically.
if step % SAVE_INTERVAL == SAVE_INTERVAL-1 or (step + 1) == FLAGS.max_steps:
checkpoint_path = os.path.join(FLAGS.checkpoint_dir)
model.save(checkpoint_path, step, 'acgan')
if step % VAL_INTERVAL == VAL_INTERVAL-1:
if not os.path.exists(FLAGS.sample_dir):
os.makedirs(FLAGS.sample_dir)
path = os.path.join(FLAGS.sample_dir, str(step))
if not os.path.exists(path):
os.makedirs(path)
source = sess.run(source_img_128)
save_source(source, [4, 8], os.path.join(path, 'source.jpg'))
for j in range(train_generator.n_classes):
true_label_fea = train_generator.label_features_128[j]
dict = {
imgs: source,
true_label_features_128: true_label_fea
}
samples = sess.run(ge_samples, feed_dict=dict)
save_images(samples, [4, 8], './{}/test_{:01d}.jpg'.format(path, j))
def main(argv=None):
my_train()
if __name__ == '__main__':
tf.app.run()
This is the error message:
Traceback (most recent call last):
File "C:/Users/admin/Desktop/Face/Face-Aging-with-Identity-Preserved-Conditional-Generative-Adversarial-Networks-master/age_lsgan_transfer.py", line 167, in <module>
tf.app.run()
File "C:\Users\admin\PycharmProjects\Deep\venv\lib\site-packages\tensorflow\python\platform\app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "C:\Users\admin\PycharmProjects\Deep\venv\lib\site-packages\absl\app.py", line 300, in run
_run_main(main, args)
File "C:\Users\admin\PycharmProjects\Deep\venv\lib\site-packages\absl\app.py", line 251, in _run_main
sys.exit(main(argv))
File "C:/Users/admin/Desktop/Face/Face-Aging-with-Identity-Preserved-Conditional-Generative-Adversarial-Networks-master/age_lsgan_transfer.py", line 163, in main
my_train()
File "C:/Users/admin/Desktop/Face/Face-Aging-with-Identity-Preserved-Conditional-Generative-Adversarial-Networks-master/age_lsgan_transfer.py", line 88, in my_train
source_img_227, source_img_128, face_label = load_source_batch3(FLAGS.source_file, FLAGS.root_folder, FLAGS.batch_size)
File "C:\Users\admin\Desktop\Face\Face-Aging-with-Identity-Preserved-Conditional-Generative-Adversarial-Networks-master\source_input.py", line 128, in load_source_batch3
img_list, label_list = get_imgAndlabel_list2(filename, img_folder)
File "C:\Users\admin\Desktop\Face\Face-Aging-with-Identity-Preserved-Conditional-Generative-Adversarial-Networks-master\source_input.py", line 173, in get_imgAndlabel_list2
f = open(filename, 'r')
PermissionError: [Errno 13] Permission denied: 'train/'
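For context, get_imgAndlabel_list2 (in source_input.py below) opens source_file as a text file with one "img_name label" pair per line, so pointing the --source_file flag at the directory train/ makes open() fail with exactly this error. A sketch of building such a listing file (the file name train_list.txt and the 0 placeholder label are assumptions, not the project's tooling):
import os

# Write one "<image name> <integer label>" line per training image.
with open('train_list.txt', 'w') as f:
    for name in sorted(os.listdir('CACD_cropped_400/')):
        if name.endswith('.jpg'):
            f.write('{} {}\n'.format(name, 0))  # replace 0 with the real label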
This is source_input.py:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import sys
import tensorflow as tf
#from read_image import *
from tensorflow.python.platform import gfile
from tensorflow.python.platform import flags
import numpy as np
import scipy.io as scio
from tensorflow.python.framework import ops
from PIL import Image
FLAGS = flags.FLAGS
T = 1
IM_HEIGHT = 400
IM_WIDTH = 400
IM_CHANNELS = 3
def _int64_feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
def _bytes_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def read_images(filename_queue, new_height=None, new_width=None):
reader = tf.WholeFileReader()
key, value = reader.read(filename_queue)
image = tf.image.decode_jpeg(value) # use png or jpeg decoder based on your files
image = tf.reshape(image, [IM_HEIGHT, IM_WIDTH, IM_CHANNELS])
if new_height and new_width:
image = tf.image.resize_images(image, [new_height, new_width])
image = tf.cast(image, tf.float32) - np.array([104., 117., 124.])
return image
def read_images2(filename_queue):
reader = tf.WholeFileReader()
key, value = reader.read(filename_queue)
image = tf.image.decode_jpeg(value) # use png or jpeg decoder based on your files
image = tf.reshape(image, [IM_HEIGHT, IM_WIDTH, IM_CHANNELS])
image_227 = tf.image.resize_images(image, [227, 227])
image_227 = tf.cast(image_227, tf.float32) - np.array([104., 117., 124.])
image_128 = tf.image.resize_images(image, [128, 128])
image_128 = tf.cast(image_128, tf.float32) - np.array([104., 117., 124.])
return image_227, image_128
def read_images3(input_queue):
label = input_queue[1]
file_contents = tf.read_file(input_queue[0])
image = tf.image.decode_image(file_contents, channels=3)
image = tf.reshape(image, [IM_HEIGHT, IM_WIDTH, IM_CHANNELS])
image_227 = tf.image.resize_images(image, [227, 227])
image_227 = tf.cast(image_227, tf.float32) - np.array([104., 117., 124.])
image_128 = tf.image.resize_images(image, [128, 128])
# image_128 = tf.cast(image_128, tf.float32)
image_128 = tf.cast(image_128, tf.float32) - np.array([104., 117., 124.])
return image_227, image_128, label
def load_source_batch(filename, img_folder, batch_size, img_size, shuffle=True):
filenames = get_imgAndlabel_list(filename, img_folder)
print('%d images to train' %(len(filenames)))
if not filenames:
raise RuntimeError('No data files found.')
with tf.name_scope('input'):
filename_queue = tf.train.string_input_producer(filenames, shuffle=shuffle)
# Even when reading in multiple threads, share the filename queue.
image = read_images(filename_queue, new_height=img_size, new_width=img_size)
image_batch = tf.train.shuffle_batch(
[image],
batch_size=batch_size,
num_threads=4,
capacity=1280,
min_after_dequeue=640)
# image_batch = tf.train.batch(
# [image],
# batch_size=batch_size,
# num_threads=4,
# capacity=1280)
#
return image_batch
def load_source_batch2(filename, img_folder, batch_size, shuffle=True):
filenames = get_imgAndlabel_list(filename, img_folder)
print('%d images to train' % (len(filenames)))
if not filenames:
raise RuntimeError('No data files found.')
with tf.name_scope('input'):
filename_queue = tf.train.string_input_producer(filenames, shuffle=shuffle)
# Even when reading in multiple threads, share the filename queue.
image_227, image_128 = read_images2(filename_queue)
image_227_batch, image_128_batch = tf.train.shuffle_batch(
[image_227, image_128],
batch_size=batch_size,
num_threads=4,
capacity=1280,
min_after_dequeue=640)
return image_227_batch, image_128_batch
def load_source_batch3(filename, img_folder, batch_size, shuffle=True):
img_list, label_list = get_imgAndlabel_list2(filename, img_folder)
print('%d images to train' % (len(img_list)))
images = ops.convert_to_tensor(img_list, dtype=tf.string)
labels = ops.convert_to_tensor(label_list, dtype=tf.int32)
# Makes an input queue
input_queue = tf.train.slice_input_producer([images, labels], shuffle=shuffle)
# Even when reading in multiple threads, share the filename queue.
image_227, image_128, label = read_images3(input_queue)
image_227_batch, image_128_batch, label_batch = tf.train.shuffle_batch(
[image_227, image_128, label],
batch_size=batch_size,
num_threads=4,
capacity=1280,
min_after_dequeue=640)
return image_227_batch, image_128_batch, label_batch
def get_imgAndlabel_list(filename, img_folder):
"""
:param filename:
each line in filename is img_name \space label
:return:
list of image paths
"""
f = open(filename, 'r')
lines = f.readlines()
f.close()
imgname_lists = []
for i in range(len(lines)):
img_name = lines[i].split()[0]
imgname_lists.append(os.path.join(img_folder, img_name))
return imgname_lists
def get_imgAndlabel_list2(filename, img_folder):
"""
:param filename:
each line in filename is img_name \space label
:return:
img names list
label list
"""
f = open(filename, 'r')
lines = f.readlines()
f.close()
imgname_lists = []
label_lists = []
for i in range(len(lines)):
img_name, label = lines[i].split()
imgname_lists.append(os.path.join(img_folder, img_name))
label_lists.append(int(label))
return imgname_lists, label_lists