While adapting my code from TF1 to TF2.6, I ran into trouble.
I am trying to add some custom layers to an Inception-ResNet, save the model, and then load and run it.
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
import tensorflow as tf
import numpy as np
from PIL import Image
export_path = "./save_test"
# Get model without top and add two layers
base_model = InceptionResNetV2(weights='imagenet', input_tensor=None, include_top=False)
out = base_model.output
out = GlobalAveragePooling2D()(out)
predictions = Dense(7, activation='softmax', name="output")(out)
# Make new model using inputs from base model and custom outputs
model = Model(inputs=base_model.input, outputs=[predictions])
# save model
tf.saved_model.save(model, export_path)
# load model and run
with tf.compat.v1.Session(graph=tf.Graph()) as sess:
    tf.compat.v1.saved_model.loader.load(sess, ['serve'], export_path)
    graph = tf.compat.v1.get_default_graph()
    img = Image.new('RGB', (299, 299))
    x = tf.keras.preprocessing.image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = x[..., :3]
    x /= 255.0
    x = (x - 0.5) * 2.0
    y_pred = sess.run('output/Softmax:0', feed_dict={'serving_default_input_1:0': x})
Error:
KeyError: "The name 'output/Softmax:0' refers to a Tensor which does not exist. The operation, 'output/Softmax', does not exist in the graph."
What I don't understand:
predictions.name is 'output/Softmax:0', but
graph.get_tensor_by_name('output/Softmax:0') tells me it does not exist!
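One way to see which names actually do exist in the loaded graph (a debugging sketch, not part of my original script) is to list the graph's operations after loading:
# Hypothetical debugging step: list exported operation names that look related to the Dense layer
print([op.name for op in graph.get_operations() if 'output' in op.name or 'Softmax' in op.name])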
Note: I am aware that I can save and load with TF2's tf.keras.models.save_model and tf.keras.models.load_model and then run the model with model(x). However, in my application I keep multiple models in memory, and I have found that inference takes much longer than in my TF1 code using the session object. I would therefore like to use the TF1 approach with the session object in compatibility mode.
How can I control the names of input/output when saving? What am I missing?
Tested on TF 2.0, 2.6, and 2.7:
If you haven't already, you could try something like the following, as I believe you are referencing the wrong keys in SignatureDef:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
import tensorflow as tf
import numpy as np
from PIL import Image
export_path = "./save_test"
base_model = InceptionResNetV2(weights='imagenet', input_tensor=None, include_top=False)
out = base_model.output
out = GlobalAveragePooling2D()(out)
predictions = Dense(7, activation='softmax', name="output")(out)
model = Model(inputs=base_model.input, outputs=[predictions])
tf.saved_model.save(model, export_path)
with tf.compat.v1.Session(graph=tf.Graph()) as sess:
    meta_graph = tf.compat.v1.saved_model.loader.load(sess, ["serve"], export_path)
    sig_def = meta_graph.signature_def[tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    input_key = list(dict(sig_def.inputs).keys())[0]
    input_name = sig_def.inputs[input_key].name
    output_name = sig_def.outputs['output'].name
    img = Image.new('RGB', (299, 299))
    x = tf.keras.preprocessing.image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = x[..., :3]
    x /= 255.0
    x = (x - 0.5) * 2.0
    y_pred = sess.run(output_name, feed_dict={input_name: x})
    print(y_pred)
INFO:tensorflow:Restoring parameters from ./save_test/variables/variables
[[0.14001141 0.13356228 0.14509581 0.22432518 0.16313255 0.11899492
0.07487784]]
You could also take a look at the SignatureDef for input and output information:
print(meta_graph.signature_def)
{'serving_default': inputs {
  key: "input_2"
  value {
    name: "serving_default_input_2:0"
    dtype: DT_FLOAT
    tensor_shape {
      dim {
        size: -1
      }
      dim {
        size: -1
      }
      dim {
        size: -1
      }
      dim {
        size: 3
      }
    }
  }
}
outputs {
  key: "output"
  value {
    name: "StatefulPartitionedCall:0"
    dtype: DT_FLOAT
    tensor_shape {
      dim {
        size: -1
      }
      dim {
        size: 7
      }
    }
  }
}
method_name: "tensorflow/serving/predict"
, '__saved_model_init_op': outputs {
  key: "__saved_model_init_op"
  value {
    name: "NoOp"
    tensor_shape {
      unknown_rank: true
    }
  }
}
}
If you remove the first layer of your base_model and add a new Input layer, you can use static key names sig_def.inputs['input'].name and sig_def.outputs['output'].name:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
import tensorflow as tf
import numpy as np
from PIL import Image
export_path = "./save_test"
base_model = InceptionResNetV2(weights='imagenet', input_tensor=None, include_top=False)
base_model.layers.pop(0)
new_input = tf.keras.layers.Input(shape=(299,299,3), name='input')
out = base_model(new_input)
out = GlobalAveragePooling2D()(out)
predictions = Dense(7, activation='softmax', name="output")(out)
model = Model(inputs=new_input, outputs=[predictions])
tf.saved_model.save(model, export_path)
with tf.compat.v1.Session(graph=tf.Graph()) as sess:
    meta_graph = tf.compat.v1.saved_model.loader.load(sess, ["serve"], export_path)
    sig_def = meta_graph.signature_def[tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    input_name = sig_def.inputs['input'].name
    output_name = sig_def.outputs['output'].name
    img = Image.new('RGB', (299, 299))
    x = tf.keras.preprocessing.image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = x[..., :3]
    x /= 255.0
    x = (x - 0.5) * 2.0
    y_pred = sess.run(output_name, feed_dict={input_name: x})
    print(y_pred)
INFO:tensorflow:Restoring parameters from ./save_test/variables/variables
[[0.21079363 0.10773096 0.07287834 0.06983061 0.10538215 0.09172108
0.34166315]]
Note that changing the name of the first layer of base_model does not work with the syntax model.layers[0]._name = 'input' because the model configuration itself will not be updated.
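If you want full control over the exposed names instead of reading them back from the SignatureDef, one option (a sketch, not tested against your exact setup) is to export an explicit serving signature through a tf.function; the argument and output names below ('image' and 'scores') are arbitrary choices for this sketch:
# Export an explicit serving signature so the SignatureDef keys are predictable.
# 'image' and 'scores' are names chosen for illustration, not from the original code.
@tf.function(input_signature=[tf.TensorSpec([None, 299, 299, 3], tf.float32, name='image')])
def serve_fn(image):
    return {'scores': model(image)}

tf.saved_model.save(model, export_path, signatures={'serving_default': serve_fn.get_concrete_function()})
After loading, sig_def.inputs['image'].name and sig_def.outputs['scores'].name should then resolve to the actual graph tensor names, regardless of how Keras numbered the input layers.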
While trying to use the VGG19 model, the code below generates an error about non-tensor inputs, even though I am following this code snippet here.
Code:
from keras.applications.vgg19 import VGG19
import keras.backend as K
from keras.models import Model
import imageio as iio
image_shape = (384,384,3)
vgg19 = VGG19(include_top=False, weights='imagenet', input_shape=image_shape)
vgg19.trainable = False
# Make trainable as False
for l in vgg19.layers:
    l.trainable = False
model = Model(inputs=vgg19.input, outputs=vgg19.get_layer('block5_conv4').output)
model.trainable = False
img1 = iio.imread('img1.jpg')
img2 = iio.imread('img2.jpg')
mean = K.mean(K.square(model(img1) - model(img2)))
Error:
...,
[164, 90, 0, 255],
[164, 90, 0, 255],
[164, 90, 0, 255]]]], dtype=uint8)]. All inputs to the layer should be tensors.
I am unable to figure out why.
Maybe try converting your images to tensors:
import numpy
import tensorflow as tf
from PIL import Image
from keras.applications.vgg19 import VGG19
import keras.backend as K
from keras.models import Model
import imageio as iio

# Create random images
for n in range(2):
    a = numpy.random.rand(384, 384, 3) * 255
    im = Image.fromarray(a.astype('uint8')).convert('RGB')
    im.save('test%0d.jpg' % n)

image_shape = (384, 384, 3)
vgg19 = VGG19(include_top=False, weights='imagenet', input_shape=image_shape)
vgg19.trainable = False
# Make trainable as False
for l in vgg19.layers:
    l.trainable = False
model = Model(inputs=vgg19.input, outputs=vgg19.get_layer('block5_conv4').output)
model.trainable = False
img1 = iio.imread('test0.jpg')
img2 = iio.imread('test1.jpg')
img1 = tf.expand_dims(tf.constant(img1), axis=0)
img2 = tf.expand_dims(tf.constant(img2), axis=0)
mean = K.mean(K.square(model(img1) - model(img2)))
print(mean)
tf.Tensor(5.283036, shape=(), dtype=float32)
Instead of tf.expand_dims, you could also just do this:
img1 = tf.constant([img1])
img2 = tf.constant([img2])
There is also an option to load your images with tf.keras.preprocessing.image.load_img:
img1 = tf.keras.preprocessing.image.load_img('test0.jpg')
img2 = tf.keras.preprocessing.image.load_img('test1.jpg')
img1 = tf.constant([tf.keras.preprocessing.image.img_to_array(img1)])
img2 = tf.constant([tf.keras.preprocessing.image.img_to_array(img2)])
mean = K.mean(K.square(model(img1) - model(img2)))
print(mean)
The code from this page works fine; I changed it a little bit.
image_shape = (384,384,3)
base_model = VGG19(include_top=False, weights='imagenet', input_shape=image_shape)
model = Model(inputs=base_model.input, outputs=base_model.get_layer('block5_conv4').output)
img01 = iio.imread('test0.jpg').astype('float32')
img11 = iio.imread('test1.jpg').astype('float32')
imgx1 = normalize(img01)
imgx2 = normalize(img11)
img1 = np.expand_dims(imgx1, axis=0)
img2 = np.expand_dims(imgx2, axis=0)
mean = np.mean((model.predict(img1) - model.predict(img2))**2)
print(mean)
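Note that normalize is not defined in the snippet above; a minimal stand-in that just scales pixel values into [0, 1] could look like this (my assumption, not necessarily what the original page uses):
def normalize(img):
    # Hypothetical helper: scale uint8/float pixel values into the [0, 1] range.
    return img / 255.0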
I tried to create a Python script that detects the object in a picture and returns similar images, but it always returns an error:
TypeError: only integer scalar arrays can be converted to a scalar index
ids is the matrix returned after the detection, and it comes back fine; the problem is only in the last line: scores = [img_paths[id] for id in ids].
My code:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
import numpy as np
from PIL import Image
#from feature_extractor import FeatureExtractor
from datetime import datetime
from flask import Flask, request, render_template
from pathlib import Path
from keras.optimizers import Adam
from tensorflow.keras.layers import Dropout, Dense, Activation, Flatten
class FeatureExtractor:
    def __init__(self):
        input_shape = (224, 224, 3)
        base_model = VGG16(weights='imagenet', include_top=False, input_shape=input_shape)
        for layer in base_model.layers:
            layer.trainable = False
        last = base_model.layers[-1].output
        x = Flatten()(last)
        x = Dense(1000, activation='relu', name='fc1')(x)
        x = Dropout(0.3)(x)
        x = Dense(10, activation='softmax', name='predictions')(x)
        model = Model(base_model.input, x)
        model.compile(optimizer=Adam(lr=0.001),
                      loss='categorical_crossentropy', metrics=['accuracy'])
        self.model = Model(inputs=base_model.input, outputs=base_model.layers[-1].output)

    def extract(self, img):
        """
        Extract a deep feature from an input image
        Args:
            img: from PIL.Image.open(path) or tensorflow.keras.preprocessing.image.load_img(path)
        Returns:
            feature (np.ndarray): deep feature with the shape=(4096, )
        """
        img = img.resize((224, 224))  # VGG must take a 224x224 img as an input
        img = img.convert('RGB')  # Make sure img is color
        x = image.img_to_array(img)  # To np.array. Height x Width x Channel. dtype=float32
        x = np.expand_dims(x, axis=0)  # (H, W, C)->(1, H, W, C), where the first elem is the number of img
        x = preprocess_input(x)  # Subtracting avg values for each pixel
        feature = self.model.predict(x)[0]  # (1, 4096) -> (4096, )
        return feature / np.linalg.norm(feature)  # Normalize

path = "/home/virtuag/www/storage/searchSCB.jpg"
img = Image.open(path)
app = Flask(__name__)
fe = FeatureExtractor()
features = []
img_paths = []
for feature_path in Path("/home/virtuag/www/storage/images_article").glob("*.npy"):
    features.append(np.load(feature_path))
    img_paths.append(Path("/home/virtuag/www/storage/images_article") / (feature_path.stem + ".jpg"))
features = np.array(features)
query = fe.extract(img)
dists = np.linalg.norm(features - query, axis=1)
ids = np.argsort(dists)[:30]
scores = [img_paths[id] for id in ids]
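For reference, a minimal example of how this TypeError can arise (with made-up values, not my actual data): it happens when a plain Python list is indexed with a whole NumPy array rather than a scalar, for instance if ids ends up two-dimensional so that each id in the loop is itself an array.
import numpy as np

img_paths_demo = ['a.jpg', 'b.jpg', 'c.jpg']  # plain Python list (hypothetical values)
ids_demo = np.argsort(np.array([0.3, 0.1, 0.2]))

# Indexing the list with the whole array raises the error from the question:
# img_paths_demo[ids_demo]  -> TypeError: only integer scalar arrays can be converted to a scalar index

# Indexing with one plain integer at a time works:
scores_demo = [img_paths_demo[int(i)] for i in ids_demo]
print(scores_demo)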
I am trying to produce a model that generates a caption for an image, using ResNet as the encoder, a Transformer as the decoder, and COCO as the dataset.
After training my model for 10 epochs, it fails to produce anything other than the word <pad>, which implies that the model only ever outputs token 0, which corresponds to <pad>.
Using the debugger, it seems that the problem occurs at the argmax, where the output just becomes zero rather than anything else, but I don't know how to fix it. Is it an issue with my model, or with the way it is trained?
I based my model on this github, if it helps.
The script to download the COCO data is here:
Download.sh
mkdir data
wget http://msvocds.blob.core.windows.net/annotations-1-0-3/captions_train-val2014.zip -P ./data/
wget http://images.cocodataset.org/zips/train2014.zip -P ./data/
wget http://images.cocodataset.org/zips/val2014.zip -P ./data/
unzip ./data/captions_train-val2014.zip -d ./data/
rm ./data/captions_train-val2014.zip
unzip ./data/train2014.zip -d ./data/
rm ./data/train2014.zip
unzip ./data/val2014.zip -d ./data/
rm ./data/val2014.zip
Any help is much appreciated.
Here is my code:
model.py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
import torchvision.models as models
from torch.nn import TransformerDecoderLayer, TransformerDecoder
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Variable

class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        self.resnet = nn.Sequential(*list(resnet.children())[:-2])
        self.conv1 = nn.Conv2d(2048, embed_size, 1)
        self.embed_size = embed_size
        self.fine_tune()

    def forward(self, images):
        features = self.resnet(images)
        batch_size, _, _, _ = features.shape
        features = self.conv1(features)
        features = features.view(batch_size, self.embed_size, -1)
        features = features.permute(2, 0, 1)
        return features

    def fine_tune(self, fine_tune=True):
        for p in self.resnet.parameters():
            p.requires_grad = False
        # If fine-tuning, only fine-tune convolutional blocks 2 through 4
        for c in list(self.resnet.children())[5:]:
            for p in c.parameters():
                p.requires_grad = fine_tune

class PositionEncoder(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionEncoder, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)
class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, h, num_hidden, N, device, dropout_dec=0.1, dropout_pos=0.1):
        super(Transformer, self).__init__()
        decoder_layers = TransformerDecoderLayer(d_model, h, num_hidden, dropout_dec)
        self.source_mask = None
        self.device = device
        self.d_model = d_model
        self.pos_decoder = PositionEncoder(d_model, dropout_pos)  # matches the class defined above
        self.decoder = TransformerDecoder(decoder_layers, N)
        self.embed = Embedder(vocab_size, d_model)
        self.linear = nn.Linear(d_model, vocab_size)
        self.init_weights()

    def forward(self, source, mem):
        source = source.permute(1, 0)
        if self.source_mask is None or self.source_mask.size(0) != len(source):
            self.source_mask = nn.Transformer.generate_square_subsequent_mask(self=self, sz=len(source)).to(self.device)
        source = self.embed(source)
        source = source * math.sqrt(self.d_model)
        source = self.pos_decoder(source)
        output = self.decoder(source, mem, self.source_mask)
        output = self.linear(output)
        return output

    def init_weights(self):
        initrange = 0.1
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def pred(self, memory, pred_len):
        batch_size = memory.size(1)
        src = torch.ones((pred_len, batch_size), dtype=int) * 2
        if self.source_mask is None or self.source_mask.size(0) != len(src):
            self.source_mask = nn.Transformer.generate_square_subsequent_mask(self=self, sz=len(src)).to(self.device)
        output = torch.ones((pred_len, batch_size), dtype=int)
        src, output = src.cuda(), output.cuda()
        for i in range(pred_len):
            src_emb = self.embed(src)  # src_len * batch size * embed size
            src_emb = src_emb * math.sqrt(self.d_model)
            src_emb = self.pos_decoder(src_emb)
            out = self.decoder(src_emb, memory, self.source_mask)
            out = out[i]
            out = self.linear(out)  # batch_size * vocab_size
            out = out.argmax(dim=1)
            if i < pred_len - 1:
                src[i + 1] = out
            output[i] = out
        return output
data_loader.py
import torch
import torchvision.transforms as transforms
import torch.utils.data as data
import os
import pickle
import numpy as np
import nltk
from PIL import Image
from build_vocab import Vocabulary
from pycocotools.coco import COCO

class CocoDataset(data.Dataset):
    """COCO Custom Dataset compatible with torch.utils.data.DataLoader."""
    def __init__(self, root, json, vocab, transform=None):
        """Set the path for images, captions and vocabulary wrapper.
        Args:
            root: image directory.
            json: coco annotation file path.
            vocab: vocabulary wrapper.
            transform: image transformer.
        """
        self.root = root
        self.coco = COCO(json)
        self.ids = list(self.coco.anns.keys())
        self.vocab = vocab
        self.transform = transform

    def __getitem__(self, index):
        """Returns one data pair (image and caption)."""
        coco = self.coco
        vocab = self.vocab
        ann_id = self.ids[index]
        caption = coco.anns[ann_id]['caption']
        img_id = coco.anns[ann_id]['image_id']
        path = coco.loadImgs(img_id)[0]['file_name']
        image = Image.open(os.path.join(self.root, path)).convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        # Convert caption (string) to word ids.
        tokens = nltk.tokenize.word_tokenize(str(caption).lower())
        caption = []
        caption.append(vocab('<start>'))
        caption.extend([vocab(token) for token in tokens])
        caption.append(vocab('<end>'))
        target = torch.Tensor(caption)
        return image, target

    def __len__(self):
        return len(self.ids)

def collate_fn(data):
    """Creates mini-batch tensors from the list of tuples (image, caption).
    We should build custom collate_fn rather than using default collate_fn,
    because merging caption (including padding) is not supported in default.
    Args:
        data: list of tuple (image, caption).
            - image: torch tensor of shape (3, 256, 256).
            - caption: torch tensor of shape (?); variable length.
    Returns:
        images: torch tensor of shape (batch_size, 3, 256, 256).
        targets: torch tensor of shape (batch_size, padded_length).
        lengths: list; valid length for each padded caption.
    """
    # Sort a data list by caption length (descending order).
    data.sort(key=lambda x: len(x[1]), reverse=True)
    images, captions = zip(*data)
    # Merge images (from tuple of 3D tensor to 4D tensor).
    images = torch.stack(images, 0)
    # Merge captions (from tuple of 1D tensor to 2D tensor).
    lengths = [len(cap) for cap in captions]
    targets = torch.zeros(len(captions), max(lengths)).long()
    for i, cap in enumerate(captions):
        end = lengths[i]
        targets[i, :end] = cap[:end]
    return images, targets, lengths

def get_loader(root, json, vocab, transform, batch_size, shuffle, num_workers):
    """Returns torch.utils.data.DataLoader for custom coco dataset."""
    # COCO caption dataset
    coco = CocoDataset(root=root,
                       json=json,
                       vocab=vocab,
                       transform=transform)
    # Data loader for COCO dataset
    # This will return (images, captions, lengths) for each iteration.
    # images: a tensor of shape (batch_size, 3, 224, 224).
    # captions: a tensor of shape (batch_size, padded_length).
    # lengths: a list indicating valid length for each caption. length is (batch_size).
    data_loader = torch.utils.data.DataLoader(dataset=coco,
                                              batch_size=batch_size,
                                              shuffle=shuffle,
                                              num_workers=num_workers,
                                              collate_fn=collate_fn)
    return data_loader
build_vocab.py
import nltk
import pickle
import argparse
from collections import Counter
from pycocotools.coco import COCO

class Vocabulary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        if not word in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)

def build_vocab(json, threshold):
    coco = COCO(json)
    counter = Counter()
    ids = coco.anns.keys()
    for i, id in enumerate(ids):
        caption = str(coco.anns[id]['caption'])
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)
        if (i + 1) % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(i + 1, len(ids)))
    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]
    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')
    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab

def main(args):
    vocab = build_vocab(json=args.caption_path, threshold=args.threshold)
    vocab_path = args.vocab_path
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)
    print("Total vocabulary size: {}".format(len(vocab)))
    print("Saved the vocabulary wrapper to '{}'".format(vocab_path))

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--caption_path', type=str,
                        default='./data/annotations/captions_train2014.json',
                        help='path for train annotation file')
    parser.add_argument('--vocab_path', type=str, default='./data/vocab.pkl',
                        help='path for saving vocabulary wrapper')
    parser.add_argument('--threshold', type=int, default=4,
                        help='minimum word count threshold')
    args = parser.parse_args()
    main(args)
train.py
import argparse
import torch
import torch.nn as nn
import numpy as np
import os
import pickle
import math
from tqdm import tqdm
from data_loader import get_loader
from build_vocab import Vocabulary
from model import EncoderCNN, Decoder
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def main(args):
    batch_size = 64
    embed_size = 512
    num_heads = 8
    num_layers = 6
    num_workers = 2
    num_epoch = 5
    lr = 1e-3
    load = False
    # Create model directory
    if not os.path.exists('models/'):
        os.makedirs('models/')
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])
    # Load vocabulary wrapper
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    # Build data loader
    data_loader = get_loader('data/resized2014', 'data/annotations/captions_train2014.json', vocab,
                             transform, batch_size,
                             shuffle=True, num_workers=num_workers)
    encoder = EncoderCNN(embed_size).to(device)
    encoder.fine_tune(False)
    decoder = Decoder(len(vocab), embed_size, num_heads, embed_size, num_layers).to(device)
    if load:
        encoder.load_state_dict(torch.load(os.path.join('models/', 'encoder-{}-{}.ckpt'.format(5, 5000))))
        decoder.load_state_dict(torch.load(os.path.join('models/', 'decoder-{}-{}.ckpt'.format(5, 5000))))
        print("Load Successful")
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    encoder_optim = torch.optim.Adam(encoder.parameters(), lr=lr)
    decoder_optim = torch.optim.Adam(decoder.parameters(), lr=lr)
    # Train the models
    for epoch in range(num_epoch):
        encoder.train()
        decoder.train()
        for i, (images, captions, lengths) in tqdm(enumerate(data_loader), total=len(data_loader), leave=False):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            # Forward, backward and optimize
            features = encoder(images)
            cap_input = captions[:, :-1]
            cap_target = captions[:, 1:]
            outputs = decoder(cap_input, features)
            outputs = outputs.permute(1, 0, 2)
            outputs_shape = outputs.reshape(-1, len(vocab))
            loss = criterion(outputs_shape, cap_target.reshape(-1))
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            encoder_optim.step()
            decoder_optim.step()
            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    'models/', 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    'models/', 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--log_step', type=int, default=10, help='step size for printing log info')
    parser.add_argument('--save_step', type=int, default=1000, help='step size for saving trained models')
    args = parser.parse_args()
    print(args)
    main(args)
sample.py
import torch
import matplotlib.pyplot as plt
import numpy as np
import argparse
import pickle
import os
from torchvision import transforms
from build_vocab import Vocabulary
from data_loader import get_loader
from model import EncoderCNN, Decoder
from PIL import Image

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def token_sentence(decoder_out, itos):
    tokens = decoder_out
    tokens = tokens.transpose(1, 0)
    tokens = tokens.cpu().numpy()
    results = []
    for instance in tokens:
        result = ' '.join([itos[x] for x in instance])
        results.append(''.join(result.partition('<eos>')[0]))  # Cut before '<eos>'
    return results

def load_image(image_path, transform=None):
    image = Image.open(image_path).convert('RGB')
    image = image.resize([224, 224], Image.LANCZOS)
    if transform is not None:
        image = transform(image).unsqueeze(0)
    return image

def main(args):
    batch_size = 64
    embed_size = 512
    num_heads = 8
    num_layers = 6
    num_workers = 2
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    data_loader = get_loader('data/resized2014', 'data/annotations/captions_train2014.json', vocab,
                             transform, batch_size,
                             shuffle=True, num_workers=num_workers)
    # Build models
    encoder = EncoderCNN(embed_size).to(device)
    encoder.fine_tune(False)
    decoder = Decoder(len(vocab), embed_size, num_heads, embed_size, num_layers).to(device)
    # Load trained models
    encoder.load_state_dict(torch.load(os.path.join('models/', 'encoder-{}-{}.ckpt'.format(1, 4000))))
    decoder.load_state_dict(torch.load(os.path.join('models/', 'decoder-{}-{}.ckpt'.format(1, 4000))))
    encoder.eval()
    decoder.eval()
    itos = vocab.idx2word
    pred_len = 100
    result_collection = []
    # Decode with greedy
    # with torch.no_grad():
    #     for i, (images, captions, lengths) in enumerate(data_loader):
    #         images = images.to(device)
    #         features = encoder(images)
    #         output = decoder.generator(features, pred_len)
    #         result_caption = token_sentence(output, itos)
    #         result_collection.extend(result_caption)
    # Decode with greedy
    with torch.no_grad():
        for batch_index, (inputs, captions, caplens) in enumerate(data_loader):
            inputs, captions = inputs.cuda(), captions.cuda()
            enc_out = encoder(inputs)
            captions_input = captions[:, :-1]
            captions_target = captions[:, 1:]
            output = decoder.pred(enc_out, pred_len)
            result_caption = token_sentence(output, itos)
            result_collection.extend(result_caption)
    print("Prediction-greedy:", result_collection[1])
    print("Prediction-greedy:", result_collection[2])
    print("Prediction-greedy:", result_collection[3])
    print("Prediction-greedy:", result_collection[4])
    print("Prediction-greedy:", result_collection[5])
    print("Prediction-greedy:", result_collection[6])
    print("Prediction-greedy:", result_collection[7])
    print("Prediction-greedy:", result_collection[8])
    print("Prediction-greedy:", result_collection[9])
    print("Prediction-greedy:", result_collection[10])
    print("Prediction-greedy:", result_collection[11])
    # # Prepare an image
    # image = load_image(args.image, transform)
    # image_tensor = image.to(device)
    # # Generate a caption from the image
    # feature = encoder(image_tensor)
    # sampled_ids = decoder.generator(feature, pred_len)
    # sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)
    # # Convert word_ids to words
    # sampled_caption = []
    # for word_id in sampled_ids:
    #     word = vocab.idx2word[word_id]
    #     sampled_caption.append(word)
    #     if word == '<end>':
    #         break
    # sentence = ' '.join(sampled_caption)
    # # Print out the image and the generated caption
    # print(sentence)
    # image = Image.open(args.image)
    # plt.imshow(np.asarray(image))

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--image', type=str, required=False, help='input image for generating caption')
    parser.add_argument('--vocab_path', type=str, default='data/vocab.pkl', help='path for vocabulary wrapper')
    args = parser.parse_args()
    main(args)
resize.py
import argparse
import os
from PIL import Image

def resize_image(image, size):
    """Resize an image to the given size."""
    return image.resize(size, Image.ANTIALIAS)

def resize_images(image_dir, output_dir, size):
    """Resize the images in 'image_dir' and save into 'output_dir'."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    images = os.listdir(image_dir)
    num_images = len(images)
    for i, image in enumerate(images):
        with open(os.path.join(image_dir, image), 'r+b') as f:
            with Image.open(f) as img:
                img = resize_image(img, size)
                img.save(os.path.join(output_dir, image), img.format)
        if (i + 1) % 100 == 0:
            print("[{}/{}] Resized the images and saved into '{}'."
                  .format(i + 1, num_images, output_dir))

def main(args):
    image_dir = args.image_dir
    output_dir = args.output_dir
    image_size = [args.image_size, args.image_size]
    resize_images(image_dir, output_dir, image_size)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--image_dir', type=str, default='./data/train2014/',
                        help='directory for train images')
    parser.add_argument('--output_dir', type=str, default='./data/resized2014/',
                        help='directory for saving resized images')
    parser.add_argument('--image_size', type=int, default=256,
                        help='size for image after processing')
    args = parser.parse_args()
    main(args)
I made a convolutional neural network and trained the model, and now it can accurately determine which product is in a photo.
But now I need it to determine all of the products that appear in one photo, so I need to somehow cut the original photo into parts and then identify each one.
What is the best way to do this? I am using Python 3 and Keras. Training and recognition code:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import argparse
import random
import pickle
import cv2
import os
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers.core import Dense, Flatten
from keras.layers import Dropout
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import to_categorical
from keras.optimizers import SGD
from keras.models import load_model
from imutils import paths

matplotlib.use("Agg")

categories = ['bread', 'chicken', 'cucumbers', 'dry_peas', 'eggs',
              'green_peas', 'kolbasa', 'potato', 'raw_beef',
              'spaghetti', 'tomatoes']
dataset = "ingredients"
model_path = "test_model.model"
label_bin = "test_model.pickle"
plot = "output/test_model_plot.png"

print("[INFO] loading images ...")
data = []
labels = []
imagePaths = sorted(list(paths.list_images(dataset)))
random.seed(42)
random.shuffle(imagePaths)
for imagePath in imagePaths:
    try:
        image = cv2.imread(imagePath)
        image = cv2.resize(image, (32, 32))
        data.append(image / 255)
        label = imagePath.split(os.path.sep)[-2]
        labels.append(categories.index(label))
    except Exception as e:
        print("[WARNING]", e)
data = np.array(data)
print(labels)
labels = to_categorical(np.array(labels))
print(labels)
(trainX, testX, trainY, testY) = train_test_split(data, labels,
                                                  test_size=0.2,
                                                  random_state=42)
lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)
'''
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(512, activation="sigmoid"))
model.add(Dense(len(lb.classes_), activation="softmax"))
'''
model = load_model('test_model.model')
INIT_LR = 0.01
EPOCHS = 150
print("[INFO] training network ...")
opt = SGD(lr=INIT_LR)
model.compile(loss="categorical_crossentropy", optimizer=opt,
              metrics=["accuracy"])
H = model.fit(trainX, trainY, validation_data=(testX, testY),
              epochs=EPOCHS, batch_size=32)
print("[INFO] evaluating network ...")
predictions = model.predict(testX, batch_size=32)
print(predictions)
#print(classification_report(testY.argmax(axis=1),
#                            predictions.argmax(axis=1), target_names=lb.classes_))
N = np.arange(0, EPOCHS)
plt.style.use("ggplot")
plt.figure()
print(H.history.keys())
plt.plot(N, H.history["loss"], label="train_loss")
plt.plot(N, H.history["val_loss"], label="val_loss")
plt.plot(N, H.history["accuracy"], label="train_acc")
plt.plot(N, H.history["val_accuracy"], label="val_acc")
plt.title("Training Loss and Accuracy (Simple NN)")
plt.xlabel("Epoch #")
plt.ylabel("Loss / Accuracy")
plt.legend()
plt.savefig(plot)
print("[INFO] serializing network and label binarizer ...")
model.save(model_path)
f = open(label_bin, "wb")
f.write(pickle.dumps(lb))
f.close()
and the recognition code:
import argparse
import pickle
import cv2
import flask
import werkzeug
from keras.models import load_model
import keras
import tensorflow as tf
import keras.backend.tensorflow_backend as tb
from tensorflow.python.keras.backend import set_session
from tensorflow.python.keras.models import load_model
import sys
sys.modules['keras'] = keras

class FoodRecognizer:
    def __init__(self, model, label_bin, size, flatten):
        self.label_bin = label_bin
        self.size = size
        self.width, self.height = self.size
        self.flatten = flatten
        print("[INFO] loading network and label binarizer...")
        set_session(sess)
        self.model = load_model(model)
        self.lb = pickle.loads(open(self.label_bin, "rb").read())

    def load_image(self, image_file):
        image = cv2.imread(image_file)
        output = image.copy()
        image = cv2.resize(image, self.size)
        image = image / 255.0
        image = image.reshape(1, *image.shape)
        self.image = image

    def recognize(self):
        preds = self.model.predict(self.image)
        result = list(preds[0])
        for i in range(len(result)):
            print(categories[i].ljust(10, " "), result[i], sep='\t')
        i = preds.argmax(axis=1)[0]
        print()
        out = sorted(result)[-3:][::-1]
        print("Скорее всего, на фотографии:")  # "Most likely, the photo shows:"
        for o in out:
            print(categories[result.index(o)], f"{round(o * 100, 2)}%")
        label = self.lb.classes_[i]
        return label

app = flask.Flask(__name__)

@app.route('/', methods=['POST'])
def handle_request():
    print(flask.request.files.to_dict())
    imagefile = flask.request.files['image']
    filename = werkzeug.utils.secure_filename(imagefile.filename)
    print("\nReceived image File name : " + imagefile.filename)
    imagefile.save('images/' + str(filename))
    MFR.load_image('images/' + filename)
    print(MFR.image)
    print('images/' + filename)
    with graph.as_default():
        set_session(sess)
        result = MFR.recognize()
    print(result)
    return categories[result]

categories = ['bread', 'chicken', 'cucumbers', 'dry_peas', 'eggs',
              'green_peas', 'kolbasa', 'potato', 'raw_beef',
              'spaghetti', 'tomatoes']

if __name__ == "__main__":
    model_path = "test_model.model"
    label_path = "test_model.pickle"
    size = (32, 32)
    flatten = 1
    sess = tf.Session()
    graph = tf.get_default_graph()
    MFR = FoodRecognizer(model_path, label_path, size, flatten)
    # MFR.load_image("images/test_image.jpg")
    # print(categories[MFR.recognize()])
    set_session(sess)
    app.run(host='10.61.4.238', debug=True, threaded=False)
I think you are referring to image segmentation; check out this article to find out more information.
In short, it is similar to classification, but instead of giving one class for the whole image it draws bounding boxes around the detected objects.
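If a full detection or segmentation model is more than you need, a simpler (cruder) option that matches the "cut the photo into parts" idea is a sliding-window crop: split the photo into tiles and run your existing classifier on each tile. A rough sketch follows; the window size, stride, and confidence threshold are arbitrary choices here, while test_model.model, the 32x32 input size, and the category list are taken from your code above.
import cv2
import numpy as np
from keras.models import load_model

model = load_model('test_model.model')  # the classifier trained above
categories = ['bread', 'chicken', 'cucumbers', 'dry_peas', 'eggs',
              'green_peas', 'kolbasa', 'potato', 'raw_beef',
              'spaghetti', 'tomatoes']

def classify_tiles(image_path, window=128, stride=64):
    """Slide a window over the photo and classify every crop with the existing model."""
    image = cv2.imread(image_path)
    found = []
    for y in range(0, image.shape[0] - window + 1, stride):
        for x in range(0, image.shape[1] - window + 1, stride):
            crop = image[y:y + window, x:x + window]
            crop = cv2.resize(crop, (32, 32)) / 255.0  # same preprocessing as training
            preds = model.predict(np.expand_dims(crop, axis=0))[0]
            i = int(np.argmax(preds))
            if preds[i] > 0.9:  # arbitrary confidence threshold
                found.append((x, y, categories[i], float(preds[i])))
    return found

# print(classify_tiles('photo.jpg'))
A trained object detector or segmentation model will usually work much better than fixed tiles, but this shows the basic idea without retraining anything.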