I'm modifying code from this GitHub repository (https://github.com/hehefan/Video-Classification) so it can accept my own input. When I run it one video at a time for 5 epochs, it runs with no problem. But when I try to run it on multiple videos in succession, it hits a MemoryError at the sess.run() line.
Code:
import sys
import os
import numpy as np
import tensorflow as tf
import gzip
#import cPickle
import _pickle as cPickle
import random
from config import FLAGS
from models import DynamicRNN
from models import AveragePooling

filename = ['D2N2Sur', 'H2N2A', 'H2N2C', 'H2N2D', 'H2N2S', 'N2A', 'N2C', 'N2D', 'N2H', 'N2S', 'N2Sur', 'S2N2H']

# TRAINING LABEL
batch_label = list(range(12))

# DATA PROCESSING
data = []
batch_length = []
for name in filename:
    # READ DATA
    counter = 0
    frame = 0
    video = []
    l = ""
    f = open("Train1/" + name + ".txt", "r")
    for line in f:
        l = l + line[:-1]
        counter += 1
        if (counter == 365):
            l = list(l)
            video.append(l)
            l = ""
            counter = 0
            frame += 1
    # MAKE SURE ALL VIDEOS HAVE THE SAME LENGTH
    # PAD BY 0
    frame = FLAGS.max_video_length - frame
    for number in range(frame):
        video.append([0]*FLAGS.feature_size)
    # APPEND VIDEO TO DATA
    data.append(video)
    batch_length.append(FLAGS.max_video_length)

training_steps_per_epoch = len(data) // FLAGS.batch_size

if not os.path.exists(FLAGS.checkpoint_dir):
    os.makedirs(FLAGS.checkpoint_dir)

model = AveragePooling(feature_size=FLAGS.feature_size, max_video_length=FLAGS.max_video_length,
                       num_classes=FLAGS.num_classes, cell_size=FLAGS.size, use_lstm=FLAGS.use_lstm,
                       learning_rate=FLAGS.learning_rate, learning_rate_decay_factor=FLAGS.learning_rate_decay_factor,
                       min_learning_rate=FLAGS.min_learning_rate, training_steps_per_epoch=training_steps_per_epoch,
                       max_gradient_norm=FLAGS.max_gradient_norm, keep_prob=FLAGS.keep_prob, is_training=True)

with tf.Session() as sess:
    ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        step = int(ckpt.model_checkpoint_path.split('-')[1])
    else:
        sess.run(tf.global_variables_initializer())
        step = 0
    for epoch in range(1, FLAGS.num_epochs+1):
        random.shuffle(data)
        batch_feature = []
        batch_feature.append(data)
        feed_dict = {model.frame_feature_ph: batch_feature, model.video_length_ph: batch_length, model.video_label_ph: batch_label}
        loss, _ = sess.run([model.loss, model.train_op], feed_dict=feed_dict)
        step += 1
        if step % FLAGS.steps_per_checkpoint == 0:
            checkpoint_path = os.path.join(FLAGS.checkpoint_dir, "ckpt")
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)
            print("%5d: %3d, %.3f" % (step, epoch, loss))
            sys.stdout.flush()
Error:
Traceback (most recent call last):
File "/root/Documents/EmotionRecognition/masstrain.py", line 114, in <module>
loss, _ = sess.run([model.loss, model.train_op], feed_dict=feed_dict)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 895, in run
run_metadata_ptr)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1093, in _run
np_val = np.asarray(subfeed_val, dtype=subfeed_dtype)
File "/usr/local/lib/python3.5/dist-packages/numpy/core/numeric.py", line 482, in asarray
return array(a, dtype, copy=False, order=order)
MemoryError
Process finished with exit code 1
Does anyone have a clue about this?
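One thing worth checking (a sketch under assumptions, not a confirmed fix): the traceback shows the MemoryError happening while np.asarray converts the feed value, and at that point batch_feature is a deeply nested Python list of single characters covering every video. Converting the data to a single float32 array once, before calling sess.run, keeps the feed compact and cheap to convert. This assumes the placeholder expects shape [batch, num_videos, max_video_length, feature_size] as in the original feed and that every row has the same length:

import numpy as np

# Sketch: build the feed as one compact float32 array instead of nested Python lists.
batch_feature = np.asarray(data, dtype=np.float32)      # (num_videos, max_video_length, feature_size)
batch_feature = np.expand_dims(batch_feature, axis=0)   # add the leading batch dimension
print(batch_feature.shape, batch_feature.nbytes)        # sanity-check the memory footprint

feed_dict = {model.frame_feature_ph: batch_feature,
             model.video_length_ph: batch_length,
             model.video_label_ph: batch_label}

Whether this removes the MemoryError depends on the actual data sizes, but it avoids np.asarray having to materialize the array from millions of Python string objects.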
Related
I have a TensorFlow graph that I managed to partition into two subgraphs, using the following code.
# graph_split.py
import os
import sys
import copy
import six  # needed for the string-type check below
import tensorflow as tf
from tensorflow.core.framework import graph_pb2
from tensorflow.python.framework.graph_util_impl import _extract_graph_summary, _assert_nodes_are_present, _bfs_for_reachable_nodes


def extract_sub_graph(graph_def, dest_nodes):
    if not isinstance(graph_def, graph_pb2.GraphDef):
        raise TypeError("graph_def must be a graph_pb2.GraphDef proto.")
    if isinstance(dest_nodes, six.string_types):
        raise TypeError("dest_nodes must be a list.")
    name_to_input_name, name_to_node, name_to_seq_num = _extract_graph_summary(graph_def)
    _assert_nodes_are_present(name_to_node, dest_nodes)
    nodes_to_keep = _bfs_for_reachable_nodes(dest_nodes, name_to_input_name)
    nodes_to_keep_copy = copy.deepcopy(nodes_to_keep)
    for node in nodes_to_keep_copy:
        if node not in dest_nodes:
            nodes_to_keep.remove(node)
    nodes_to_keep_list = sorted(
        list(nodes_to_keep), key=lambda n: name_to_seq_num[n])
    # Now construct the output GraphDef
    out = graph_pb2.GraphDef()
    for n in nodes_to_keep_list:
        out.node.extend([copy.deepcopy(name_to_node[n])])
    out.library.CopyFrom(graph_def.library)
    out.versions.CopyFrom(graph_def.versions)
    return out


def split_model(graph_def):
    subgraphs = []
    graph_nodes = [n for n in graph_def.node]
    node_names = []
    for t in graph_nodes:
        node_names.append(t.name)
    middle_node_index = int(len(graph_nodes) / 2)
    subgraph_1_nodes = []
    subgraph_2_nodes = []
    for i in range(0, middle_node_index):
        subgraph_1_nodes.append(node_names[i])
    for i in range(middle_node_index, len(node_names)):  # remaining node names go to the second subgraph
        subgraph_2_nodes.append(node_names[i])
    subgraph_1 = extract_sub_graph(graph_def, subgraph_1_nodes)
    subgraph_2 = extract_sub_graph(graph_def, subgraph_2_nodes)
    subgraphs = [subgraph_1, subgraph_2]
    return subgraphs


if __name__ == "__main__":
    weights_path = "model.pb"
    pbtxt_path = "protobuf_text.pbtxt"
    with tf.gfile.FastGFile(weights_path, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        tf.import_graph_def(graph_def, name='')
    subgraphs = split_model(graph_def)
Now, I have to convert the frozen inference graphs into SavedModel objects and export them.
# frozen_to_saved.py
import tensorflow as tf
import os
import shutil
from tensorflow.python.saved_model import signature_constants
from tensorflow.python.saved_model import tag_constants


def frozen_to_saved(graph_def, export_folder="saved"):
    export_dir = os.path.join(os.getcwd(), export_folder)
    if os.path.exists(export_dir):
        shutil.rmtree(export_dir)
    os.mkdir(export_dir)
    tf_version = tf.__version__.split('.')
    if int(tf_version[0]) == 2:
        builder = tf.compat.v1.saved_model.builder.SavedModelBuilder(export_dir)
    elif int(tf_version[0]) == 1:
        builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
    sigs = {}
    if int(tf_version[0]) == 2:
        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            # name="" is important to ensure we don't get spurious prefixing
            tf.import_graph_def(graph_def, name="")
            g = tf.get_default_graph()
            inp = g.get_tensor_by_name(graph_def.node[0].name + ":0")
            out = g.get_tensor_by_name(graph_def.node[-1].name + ":0")
            sigs[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = \
                tf.saved_model.signature_def_utils.predict_signature_def(
                    {"in": inp}, {"out": out})
            builder.add_meta_graph_and_variables(sess,
                                                 [tag_constants.SERVING],
                                                 signature_def_map=sigs)
    elif int(tf_version[0]) == 1:
        with tf.Session(graph=tf.Graph()) as sess:
            # name="" is important to ensure we don't get spurious prefixing
            tf.import_graph_def(graph_def, name="")
            g = tf.get_default_graph()
            inp = g.get_tensor_by_name(graph_def.node[0].name + ":0")
            out = g.get_tensor_by_name(graph_def.node[-1].name + ":0")
            sigs[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = \
                tf.saved_model.signature_def_utils.predict_signature_def(
                    {"in": inp}, {"out": out})
            builder.add_meta_graph_and_variables(sess,
                                                 [tag_constants.SERVING],
                                                 signature_def_map=sigs)
    builder.save()
The problem occurs when tf.import_graph_def(graph_def, name="") in frozen_to_saved.py is called.
Traceback (most recent call last):
File "model_split.py", line 142, in <module>
save_graph(subgraphs)
File "model_split.py", line 92, in save_graph
frozen_to_saved.frozen_to_saved(graph, export_folder="subgraph{}_saved".format(idx))
File "F:\model_split\frozen_to_saved.py", line 50, in frozen_to_saved
tf.import_graph_def(graph_def, name="")
File "C:\Users\Deployment\.conda\envs\sw_2021\lib\site-packages\tensorflow\python\util\deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "C:\Users\Deployment\.conda\envs\sw_2021\lib\site-packages\tensorflow\python\framework\importer.py", line 431, in import_graph_def
raise ValueError(str(e))
ValueError: Node 'BatchMultiClassNonMaxSuppression/map/while/Merge': Unknown input node 'BatchMultiClassNonMaxSuppression/map/while/NextIteration'
Apparently, when the node names are partitioned, some of them are missing from the splits. I have verified that the node names are partitioned equally.
I also noticed that, for the partitions, the nodes missing from one are present in the other, and vice versa.
My suspicion is that the problem lies in how I delete the actual nodes and reconstruct the graphs in extract_sub_graph().
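For what it's worth, extract_sub_graph() above removes every reachable node that is not in dest_nodes, so a kept node can end up referring to an input (such as the NextIteration node in the error) that was dropped from the output GraphDef. A minimal sketch of a variant that instead keeps all reachable ancestor nodes (my assumption about the intended behavior, not code from the question's repository) would be:

def extract_sub_graph_keep_ancestors(graph_def, dest_nodes):
    # Sketch: keep every node reachable from dest_nodes, so inputs such as
    # while-loop NextIteration nodes remain present when the subgraph is re-imported.
    name_to_input_name, name_to_node, name_to_seq_num = _extract_graph_summary(graph_def)
    _assert_nodes_are_present(name_to_node, dest_nodes)
    nodes_to_keep = _bfs_for_reachable_nodes(dest_nodes, name_to_input_name)
    nodes_to_keep_list = sorted(list(nodes_to_keep), key=lambda n: name_to_seq_num[n])
    out = graph_pb2.GraphDef()
    for n in nodes_to_keep_list:
        out.node.extend([copy.deepcopy(name_to_node[n])])
    out.library.CopyFrom(graph_def.library)
    out.versions.CopyFrom(graph_def.versions)
    return out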
I am a beginner in machine learning and am trying to train a model to count how many numbers in a 1D vector of length 10 are below 0.5. The input vectors contain numbers between 0 and 1. I generate the input data and the labels in my script instead of having them in a separate file, because the data is so simple.
This is the Code:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        self.lin1 = nn.Linear(10, 10)
        self.lin2 = nn.Linear(10, 1)

    def forward(self, x):
        x = self.lin1(x)
        x = F.relu(x)
        x = self.lin2(x)
        return x


net = MyNet()
net.to(device)


def train():
    criterion = nn.MSELoss()
    optimizer = optim.SGD(net.parameters(), lr=0.1)
    for epochs in range(100):
        target = 0
        data = torch.rand(10)
        for entry in data:
            if entry < 0.5:
                target += 1
        # print(target)
        # print(data)
        data = data.to(device)
        out = net(data)
        # print(out)
        target = torch.Tensor(target)
        target = target.to(device)
        loss = criterion(out, target)
        print(loss)
        net.zero_grad()
        loss.backward()
        optimizer.step()


def test():
    acc_error = 0
    for i in range(100):
        test_data = torch.rand(10)
        test_data.to(device)
        test_target = 0
        for entry in test_data:
            if entry < 0.5:
                test_target += 1
        out = net(test_data)
        error = test_target - out
        if error < 0:
            error *= -1
        acc_error += error
    overall_error = acc_error / 100
    print(overall_error)


train()
test()
This is the error:
Traceback (most recent call last):
File "test1.py", line 70, in <module>
test()
File "test1.py", line 59, in test
out = net(test_data)
File "/vol/fob-vol7/mi18/radtklau/SP/sem_project/lib64/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "test1.py", line 15, in forward
x = self.lin1(x)
File "/vol/fob-vol7/mi18/radtklau/SP/sem_project/lib64/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/vol/fob-vol7/mi18/radtklau/SP/sem_project/lib64/python3.6/site-packages/torch/nn/modules/linear.py", line 94, in forward
return F.linear(input, self.weight, self.bias)
File "/vol/fob-vol7/mi18/radtklau/SP/sem_project/lib64/python3.6/site-packages/torch/nn/functional.py", line 1753, in linear
return torch._C._nn.linear(input, weight, bias)
RuntimeError: Tensor for 'out' is on CPU, Tensor for argument #1 'self' is on CPU, but expected them to be on GPU (while checking arguments for addmm)
The other posts regarding the topic have not solved my problem. Maybe somebody can help. Thanks!
Notice how your error message traces back to test, while train works fine.
You've transferred your data correctly in train:
data = data.to(device)
But not in test:
test_data.to(device)
Instead it should be reassigned to test_data, since torch.Tensor.to makes a copy:
test_data = test_data.to(device)
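In context, the loop body in test() would then read (a sketch; everything else in the function stays as in the question):

for i in range(100):
    test_data = torch.rand(10)
    test_data = test_data.to(device)  # reassign so the tensor actually lives on `device`
    out = net(test_data)              # input and model weights are now on the same device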
I am implementing the following Keras model for an image captioning network using the Keras functional API (tf.keras):
import glob
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
from tqdm import tqdm
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Input
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.preprocessing import image
from tensorflow.keras import Model
import nltk

from google.colab import drive
drive.mount('/content/drive')


def data_generator(batch_size=32):
    partial_caps = []
    next_words = []
    images = []
    df = pd.read_csv(folder_drive + 'flickr8k_training_dataset.txt', delimiter='\t')
    df = df.sample(frac=1)  # shuffle rows
    iter = df.iterrows()
    c = []
    imgs = []
    for i in range(df.shape[0]):
        x = next(iter)
        c.append(x[1][1])
        imgs.append(x[1][0])
    count = 0
    while True:
        for j, text in enumerate(c):
            current_image = encoding_train[imgs[j]]
            for i in range(len(text.split()) - 1):
                count += 1
                partial = [word2idx[txt] for txt in text.split()[:i + 1]]
                partial_caps.append(partial)
                # Initializing with zeros to create a one-hot encoding matrix
                # This is what we have to predict
                # Hence initializing it with vocab_size length
                n = np.zeros(vocab_size)
                # Setting the next word to 1 in the one-hot encoded matrix
                n[word2idx[text.split()[i + 1]]] = 1
                next_words.append(n)
                images.append(current_image)
                if count >= batch_size:
                    next_words = np.asarray(next_words)
                    images = np.asarray(images)
                    partial_caps = sequence.pad_sequences(partial_caps, maxlen=max_len, padding='post')
                    yield [[images, partial_caps], next_words]
                    partial_caps = []
                    next_words = []
                    images = []
                    count = 0


image_input = Input(shape=(2048,))
x = layers.Dense(embedding_size, activation='relu')(image_input)
image_output = layers.RepeatVector(max_len)(x)
image_model = Model(inputs=image_input, outputs=image_output)
image_model.summary()

caption_input = Input(shape=(max_len,))
y = layers.Embedding(vocab_size, embedding_size, input_length=max_len)(caption_input)
y = layers.LSTM(256, return_sequences=True)(y)
caption_output = layers.TimeDistributed(layers.Dense(embedding_size))(y)
caption_model = Model(inputs=caption_input, outputs=caption_output)
caption_model.summary()

conca = layers.Concatenate(axis=1)([image_model.output, caption_model.output])
z = layers.Bidirectional(layers.LSTM(256, input_shape=(max_len, 300), return_sequences=False))(conca)
z = layers.Dense(vocab_size)(z)
final_output = layers.Activation('softmax')(z)
final_model = Model(inputs=[image_model.input, caption_model.input], outputs=final_output)
final_model.summary()
Plot of the model
final_model.compile(loss='categorical_crossentropy', optimizer="rmsprop", metrics=['accuracy'])
final_model.fit_generator(data_generator(batch_size=2048), steps_per_epoch=samples_per_epoch // 2048,
                          verbose=1, epochs=50)
When running the fit_generator method, I always get the following error:
Epoch 1/50
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-225-9cb298634256> in <module>()
1 final_model.fit_generator(data_generator(batch_size=2048), steps_per_epoch = samples_per_epoch//2048,
----> 2 verbose=1,epochs = 50)
12 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
966 except Exception as e: # pylint:disable=broad-except
967 if hasattr(e, "ag_error_metadata"):
--> 968 raise e.ag_error_metadata.to_exception(e)
969 else:
970 raise
ValueError: in user code:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:571 train_function *
outputs = self.distribute_strategy.run(
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/distribute_lib.py:951 run **
return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
return self._call_for_each_replica(fn, args, kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
return fn(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:541 train_step **
self.trainable_variables)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:1804 _minimize
trainable_variables))
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:521 _aggregate_gradients
filtered_grads_and_vars = _filter_grads(grads_and_vars)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:1219 _filter_grads
([v.name for _, v in grads_and_vars],))
ValueError: No gradients provided for any variable: ['embedding_6/embeddings:0', 'dense_14/kernel:0', 'dense_14/bias:0', 'lstm_10/lstm_cell_18/kernel:0', 'lstm_10/lstm_cell_18/recurrent_kernel:0', 'lstm_10/lstm_cell_18/bias:0', 'time_distributed_6/kernel:0', 'time_distributed_6/bias:0', 'bidirectional_4/forward_lstm_11/lstm_cell_20/kernel:0', 'bidirectional_4/forward_lstm_11/lstm_cell_20/recurrent_kernel:0', 'bidirectional_4/forward_lstm_11/lstm_cell_20/bias:0', 'bidirectional_4/backward_lstm_11/lstm_cell_21/kernel:0', 'bidirectional_4/backward_lstm_11/lstm_cell_21/recurrent_kernel:0', 'bidirectional_4/backward_lstm_11/lstm_cell_21/bias:0', 'dense_17/kernel:0', 'dense_17/bias:0'
Can anyone help me identify where the error is? I have never seen it before, and I have been checking similar posts on SO, but none of the solutions there has worked for me.
Try this:
final_model = Model(inputs = [image_input ,caption_input ], outputs = final_output)
I'm following the steps provided by the author of a research paper on training an outdoor image classifier.
(Github: https://github.com/yuxiaoz/SGSN)
However, this is the error that I'm getting in my Ubuntu terminal:
Traceback (most recent call last):
File "./train.py", line 165, in <module>
main()
File "./train.py", line 63, in main
x_datalists = get_data_lists(args.x_data_txt_path) # a list of x images
File "./train.py", line 47, in get_data_lists
f = open(data_path, 'r')
FileNotFoundError: [Errno 2] No such file or directory: './datasets/x_traindata.txt'
Here is the training Python code provided by the author:
import argparse
from datetime import datetime
from random import shuffle
import os
import sys
import time
import math
import tensorflow as tf
import numpy as np

from utils import *
from train_image_reader import *
from net import *

parser = argparse.ArgumentParser(description='')
parser.add_argument("--snapshot_dir", default='./snapshots', help="path of snapshots")
parser.add_argument("--image_size", type=int, default=256, help="load image size")
parser.add_argument("--x_data_txt_path", default='./datasets/x_traindata.txt', help="txt of x images")
parser.add_argument("--y_data_txt_path", default='./datasets/y_traindata.txt', help="txt of y images")
parser.add_argument("--random_seed", type=int, default=1234, help="random seed")
parser.add_argument('--base_lr', type=float, default=0.0002, help='initial learning rate for adam')
parser.add_argument('--epoch', dest='epoch', type=int, default=50, help='# of epoch')
parser.add_argument('--epoch_step', dest='epoch_step', type=int, default=20, help='# of epoch to decay lr')
parser.add_argument("--lamda", type=float, default=10.0, help="L1 lamda")
parser.add_argument('--beta1', dest='beta1', type=float, default=0.5, help='momentum term of adam')
parser.add_argument("--summary_pred_every", type=int, default=200, help="times to summary.")
parser.add_argument("--save_pred_every", type=int, default=8000, help="times to save.")
parser.add_argument("--x_image_forpath", default='./datasets/train/X/images/', help="forpath of x training datas.")
parser.add_argument("--x_label_forpath", default='./datasets/train/X/labels/', help="forpath of x training labels.")
parser.add_argument("--y_image_forpath", default='./datasets/train/Y/images/', help="forpath of y training datas.")
parser.add_argument("--y_label_forpath", default='./datasets/train/Y/labels/', help="forpath of y training labels.")
args = parser.parse_args()


def save(saver, sess, logdir, step):
    model_name = 'model'
    checkpoint_path = os.path.join(logdir, model_name)
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    saver.save(sess, checkpoint_path, global_step=step)
    print('The checkpoint has been created.')


def get_data_lists(data_path):
    f = open(data_path, 'r')
    datas = []
    for line in f:
        data = line.strip("\n")
        datas.append(data)
    return datas


def l1_loss(src, dst):
    return tf.reduce_mean(tf.abs(src - dst))


def gan_loss(src, dst):
    return tf.reduce_mean((src - dst)**2)


def main():
    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)
    x_datalists = get_data_lists(args.x_data_txt_path)  # a list of x images
    y_datalists = get_data_lists(args.y_data_txt_path)  # a list of y images
    tf.set_random_seed(args.random_seed)
    x_img = tf.placeholder(tf.float32, shape=[1, args.image_size, args.image_size, 3], name='x_img')
    x_label = tf.placeholder(tf.float32, shape=[1, args.image_size, args.image_size, 3], name='x_label')
    y_img = tf.placeholder(tf.float32, shape=[1, args.image_size, args.image_size, 3], name='y_img')
    y_label = tf.placeholder(tf.float32, shape=[1, args.image_size, args.image_size, 3], name='y_label')

    fake_y = generator(image=x_img, reuse=False, name='generator_x2y')  # G
    fake_x_ = generator(image=fake_y, reuse=False, name='generator_y2x')  # S
    fake_x = generator(image=y_img, reuse=True, name='generator_y2x')  # G'
    fake_y_ = generator(image=fake_x, reuse=True, name='generator_x2y')  # S'
    dy_fake = discriminator(image=fake_y, gen_label=x_label, reuse=False, name='discriminator_y')  # D
    dx_fake = discriminator(image=fake_x, gen_label=y_label, reuse=False, name='discriminator_x')  # D'
    dy_real = discriminator(image=y_img, gen_label=y_label, reuse=True, name='discriminator_y')  # D
    dx_real = discriminator(image=x_img, gen_label=x_label, reuse=True, name='discriminator_x')  # D'

    final_loss = gan_loss(dy_fake, tf.ones_like(dy_fake)) + gan_loss(dx_fake, tf.ones_like(dx_fake)) + args.lamda*l1_loss(x_label, fake_x_) + args.lamda*l1_loss(y_label, fake_y_)  # final objective function

    dy_loss_real = gan_loss(dy_real, tf.ones_like(dy_real))
    dy_loss_fake = gan_loss(dy_fake, tf.zeros_like(dy_fake))
    dy_loss = (dy_loss_real + dy_loss_fake) / 2
    dx_loss_real = gan_loss(dx_real, tf.ones_like(dx_real))
    dx_loss_fake = gan_loss(dx_fake, tf.zeros_like(dx_fake))
    dx_loss = (dx_loss_real + dx_loss_fake) / 2
    dis_loss = dy_loss + dx_loss  # discriminator loss

    final_loss_sum = tf.summary.scalar("final_objective", final_loss)
    dx_loss_sum = tf.summary.scalar("dx_loss", dx_loss)
    dy_loss_sum = tf.summary.scalar("dy_loss", dy_loss)
    dis_loss_sum = tf.summary.scalar("dis_loss", dis_loss)
    discriminator_sum = tf.summary.merge([dx_loss_sum, dy_loss_sum, dis_loss_sum])

    x_images_summary = tf.py_func(cv_inv_proc, [x_img], tf.float32)  # (1, 256, 256, 3) float32
    y_fake_cv2inv_images_summary = tf.py_func(cv_inv_proc, [fake_y], tf.float32)  # (1, 256, 256, 3) float32
    x_label_summary = tf.py_func(label_proc, [x_label], tf.float32)  # (1, 256, 256, 3) float32
    x_gen_label_summary = tf.py_func(label_inv_proc, [fake_x_], tf.float32)  # (1, 256, 256, 3) float32
    image_summary = tf.summary.image('images', tf.concat(axis=2, values=[x_images_summary, y_fake_cv2inv_images_summary, x_label_summary, x_gen_label_summary]), max_outputs=3)
    summary_writer = tf.summary.FileWriter(args.snapshot_dir, graph=tf.get_default_graph())

    g_vars = [v for v in tf.trainable_variables() if 'generator' in v.name]
    d_vars = [v for v in tf.trainable_variables() if 'discriminator' in v.name]

    lr = tf.placeholder(tf.float32, None, name='learning_rate')
    d_optim = tf.train.AdamOptimizer(lr, beta1=args.beta1)
    g_optim = tf.train.AdamOptimizer(lr, beta1=args.beta1)
    d_grads_and_vars = d_optim.compute_gradients(dis_loss, var_list=d_vars)
    d_train = d_optim.apply_gradients(d_grads_and_vars)  # update weights of D and D'
    g_grads_and_vars = g_optim.compute_gradients(final_loss, var_list=g_vars)
    g_train = g_optim.apply_gradients(g_grads_and_vars)  # update weights of G, G', S and S'
    train_op = tf.group(d_train, g_train)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)
    saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=50)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    counter = 0  # training step
    for epoch in range(args.epoch):
        shuffle(x_datalists)  # change the order of x images
        shuffle(y_datalists)  # change the order of y images
        lrate = args.base_lr if epoch < args.epoch_step else args.base_lr*(args.epoch-epoch)/(args.epoch-args.epoch_step)
        for step in range(len(x_datalists)):
            counter += 1
            x_image_resize, x_label_resize, y_image_resize, y_label_resize = TrainImageReader(args.x_image_forpath, args.x_label_forpath, args.y_image_forpath, args.y_label_forpath, x_datalists, y_datalists, step, args.image_size)
            batch_x_image = np.expand_dims(np.array(x_image_resize).astype(np.float32), axis=0)
            batch_x_label = np.expand_dims(np.array(x_label_resize).astype(np.float32), axis=0)
            batch_y_image = np.expand_dims(np.array(y_image_resize).astype(np.float32), axis=0)
            batch_y_label = np.expand_dims(np.array(y_label_resize).astype(np.float32), axis=0)
            start_time = time.time()
            feed_dict = {lr: lrate, x_img: batch_x_image, x_label: batch_x_label, y_img: batch_y_image, y_label: batch_y_label}
            if counter % args.save_pred_every == 0:
                final_loss_value, dis_loss_value, _ = sess.run([final_loss, dis_loss, train_op], feed_dict=feed_dict)
                save(saver, sess, args.snapshot_dir, counter)
            elif counter % args.summary_pred_every == 0:
                final_loss_value, dis_loss_value, final_loss_sum_value, discriminator_sum_value, image_summary_value, _ = \
                    sess.run([final_loss, dis_loss, final_loss_sum, discriminator_sum, image_summary, train_op], feed_dict=feed_dict)
                summary_writer.add_summary(final_loss_sum_value, counter)
                summary_writer.add_summary(discriminator_sum_value, counter)
                summary_writer.add_summary(image_summary_value, counter)
            else:
                final_loss_value, dis_loss_value, _ = \
                    sess.run([final_loss, dis_loss, train_op], feed_dict=feed_dict)
            print('epoch {:d} step {:d} \t final_loss = {:.3f}, dis_loss = {:.3f}'.format(epoch, step, final_loss_value, dis_loss_value))
    coord.request_stop()
    coord.join(threads)


if __name__ == '__main__':
    main()
Note:
I stumbled upon a somewhat similar problem and a proposed solution here (IOError: [Errno 2] No such file or directory (when it really exist) Python). But I'm not quite sure how or where to implement it in the author's code.
Your call is correct; as I can see from the log, the file being run is ./train.py.
Now, in the repository I can't see any x_traindata.txt file. The only things available inside that folder are 3 Python files, so unless you have created that file yourself, the error is justified: there are no txt files inside it.
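If the txt file is simply expected to list the training image filenames, one per line (which is what get_data_lists() reads back and TrainImageReader() later combines with --x_image_forpath), a minimal sketch for generating it yourself could look like this. The exact expected format is an assumption on my part, so check the repository's data-preparation instructions:

import os

image_dir = './datasets/train/X/images/'   # the --x_image_forpath default
list_path = './datasets/x_traindata.txt'   # the --x_data_txt_path default

# Write one image filename per line; get_data_lists() strips the newline on read.
with open(list_path, 'w') as f:
    for name in sorted(os.listdir(image_dir)):
        f.write(name + '\n')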
When I try to load training data from an HDF5 file using fit_generator and generator functions, I get a ValueError, which results from a PointSelection error in h5py:
Epoch 1/10
Exception in thread Thread-1:
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 763, in run
self.__target(*self.__args, **self.__kwargs)
File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 429, in data_generator_task
generator_output = next(self._generator)
File "osr.py", line 108, in generate_training_sequences
X = training_save_file["X"][batch_idxs]
File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper (/tmp/pip-4rPeHA-build/h5py/_objects.c:2684)
File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper (/tmp/pip-4rPeHA-build/h5py/_objects.c:2642)
File "/usr/local/lib/python2.7/dist-packages/h5py/_hl/dataset.py", line 462, in __getitem__
selection = sel.select(self.shape, args, dsid=self.id)
File "/usr/local/lib/python2.7/dist-packages/h5py/_hl/selections.py", line 72, in select
sel[arg]
File "/usr/local/lib/python2.7/dist-packages/h5py/_hl/selections.py", line 210, in __getitem__
raise TypeError("PointSelection __getitem__ only works with bool arrays")
TypeError: PointSelection __getitem__ only works with bool arrays
Traceback (most recent call last):
File "osr.py", line 359, in <module>
osr.train_osr_model()
File "osr.py", line 89, in train_osr_model
nb_worker=1)
File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 1532, in fit_generator
str(generator_output))
ValueError: output of generator should be a tuple (x, y, sample_weight) or (x, y). Found: None
I researched the error and it was mentioned that it could be due to duplicate indices, but that does not seem to be true in my case. Here are the row indices that were accessed:
[581 305 67 510 631 832 340 663 689 801 579 701 831 879 382 844 15 798
342 329 118 657 503 129 602 2 528 157 341 299 731 539]
Here are the training and generation functions:
def train_osr_model(self):
    """ Train the optical speech recognizer
    """
    print "\nTraining OSR"
    validation_ratio = 0.3
    batch_size = 32
    with h5py.File(self.training_save_fn, "r") as training_save_file:
        sample_count = int(training_save_file.attrs["sample_count"])
        sample_idxs = range(0, sample_count)
        sample_idxs = np.random.permutation(sample_idxs)
        training_sample_idxs = sample_idxs[0:int((1-validation_ratio)*sample_count)]
        validation_sample_idxs = sample_idxs[int((1-validation_ratio)*sample_count):]
        training_sequence_generator = self.generate_training_sequences(batch_size=batch_size,
                                                                       training_save_file=training_save_file,
                                                                       training_sample_idxs=training_sample_idxs)
        validation_sequence_generator = self.generate_validation_sequences(batch_size=batch_size,
                                                                           training_save_file=training_save_file,
                                                                           validation_sample_idxs=validation_sample_idxs)
        print "Sample Idxs: {0}\n".format(sample_idxs)  # FOR DEBUG ONLY
        print "Training Idxs: {0}\n".format(training_sample_idxs)  # FOR DEBUG ONLY
        print "Validation Idxs: {0}\n".format(validation_sample_idxs)  # FOR DEBUG ONLY
        pbi = ProgressDisplay()
        self.osr.fit_generator(generator=training_sequence_generator,
                               validation_data=validation_sequence_generator,
                               samples_per_epoch=len(training_sample_idxs),
                               nb_val_samples=len(validation_sample_idxs),
                               nb_epoch=10,
                               max_q_size=1,
                               verbose=2,
                               callbacks=[pbi],
                               class_weight=None,
                               nb_worker=1)

def generate_training_sequences(self, batch_size, training_save_file, training_sample_idxs):
    """ Generates training sequences from HDF5 file on demand
    """
    while True:
        # generate sequences for training
        training_sample_count = len(training_sample_idxs)
        batches = int(training_sample_count/batch_size)
        remainder_samples = training_sample_count%batch_size
        if remainder_samples:
            batches = batches + 1
        # generate batches of samples
        for idx in xrange(0, batches):
            if idx == batches - 1:
                batch_idxs = training_sample_idxs[idx*batch_size:]
            else:
                batch_idxs = training_sample_idxs[idx*batch_size:idx*batch_size+batch_size]
            print batch_idxs  # FOR DEBUG ONLY
            X = training_save_file["X"][batch_idxs]
            Y = training_save_file["Y"][batch_idxs]
            yield (np.array(X), np.array(Y))

def generate_validation_sequences(self, batch_size, training_save_file, validation_sample_idxs):
    while True:
        # generate sequences for validation
        validation_sample_count = len(validation_sample_idxs)
        batches = int(validation_sample_count/batch_size)
        remainder_samples = validation_sample_count%batch_size
        if remainder_samples:
            batches = batches + 1
        # generate batches of samples
        for idx in xrange(0, batches):
            if idx == batches - 1:
                batch_idxs = validation_sample_idxs[idx*batch_size:]
            else:
                batch_idxs = validation_sample_idxs[idx*batch_size:idx*batch_size+batch_size]
            print batch_idxs  # FOR DEBUG ONLY
            X = training_save_file["X"][batch_idxs]
            Y = training_save_file["Y"][batch_idxs]
            yield (np.array(X), np.array(Y))
Here are the functions that preprocess and save the training data into an HDF5 file:
def process_training_data(self):
    """ Preprocesses training data and saves them into an HDF5 file
    """
    # load training metadata from config file
    training_metadata = {}
    training_classes = []
    with open(self.config_file) as training_config:
        training_metadata = json.load(training_config)
        training_classes = sorted(list(training_metadata.keys()))
    print "".join(["\n",
                   "Found {0} training classes!\n".format(len(training_classes)),
                   "-"*40])
    for class_label, training_class in enumerate(training_classes):
        print "{0:<4d} {1:<10s} {2:<30s}".format(class_label, training_class, training_metadata[training_class])
    print ""
    # count number of samples
    sample_count = 0
    sample_count_by_class = [0]*len(training_classes)
    for class_label, training_class in enumerate(training_classes):
        # get training class sequence paths
        training_class_data_path = training_metadata[training_class]
        training_class_sequence_paths = [os.path.join(training_class_data_path, file_name)
                                         for file_name in os.listdir(training_class_data_path)
                                         if (os.path.isfile(os.path.join(training_class_data_path, file_name))
                                             and ".mov" in file_name)]
        # update sample count
        sample_count += len(training_class_sequence_paths)
        sample_count_by_class[class_label] = len(training_class_sequence_paths)
    print "".join(["\n",
                   "Found {0} training samples!\n".format(sample_count),
                   "-"*40])
    for class_label, training_class in enumerate(training_classes):
        print "{0:<4d} {1:<10s} {2:<6d}".format(class_label, training_class, sample_count_by_class[class_label])
    print ""
    # initialize HDF5 save file, but clear older duplicate first if it exists
    try:
        print "Saved file \"{0}\" already exists! Overwriting previous saved file.\n".format(self.training_save_fn)
        os.remove(self.training_save_fn)
    except OSError:
        pass
    # process and save training data into HDF5 file
    print "Generating {0} samples from {1} samples via data augmentation\n".format(sample_count*self.samples_generated_per_sample,
                                                                                   sample_count)
    sample_count = sample_count*self.samples_generated_per_sample
    with h5py.File(self.training_save_fn, "w") as training_save_file:
        training_save_file.attrs["training_classes"] = np.string_(",".join(training_classes))
        training_save_file.attrs["sample_count"] = sample_count
        x_training_dataset = training_save_file.create_dataset("X",
                                                               shape=(sample_count, self.frames_per_sequence, 3, self.rows, self.columns),
                                                               dtype="f")
        y_training_dataset = training_save_file.create_dataset("Y",
                                                               shape=(sample_count, len(training_classes)),
                                                               dtype="i")
        # iterate through each class data
        sample_idx = 0
        for class_label, training_class in enumerate(training_classes):
            # get training class sequence paths
            training_class_data_path = training_metadata[training_class]
            training_class_sequence_paths = [os.path.join(training_class_data_path, file_name)
                                             for file_name in os.listdir(training_class_data_path)
                                             if (os.path.isfile(os.path.join(training_class_data_path, file_name))
                                                 and ".mov" in file_name)]
            # iterate through each sequence
            for idx, training_class_sequence_path in enumerate(training_class_sequence_paths):
                sys.stdout.write("Processing training data for class \"{0}\": {1}/{2} sequences\r"
                                 .format(training_class, idx+1, len(training_class_sequence_paths)))
                sys.stdout.flush()
                # accumulate samples and labels
                samples_batch = self.process_frames(training_class_sequence_path)
                label = [0]*len(training_classes)
                label[class_label] = 1
                label = np.array(label).astype("int32")
                for sample in samples_batch:
                    x_training_dataset[sample_idx] = sample
                    y_training_dataset[sample_idx] = label
                    # update sample index
                    sample_idx += 1
        print "\n"
        training_save_file.close()
    print "Training data processed and saved to {0}".format(self.training_save_fn)

def process_frames(self, video_file_path):
    """ Preprocesses sequence frames
    """
    # haar cascades for localizing oral region
    face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
    mouth_cascade = cv2.CascadeClassifier('haarcascade_mcs_mouth.xml')
    video = cv2.VideoCapture(video_file_path)
    success, frame = video.read()
    frames = []
    success = True
    # convert to grayscale, localize oral region, equalize frame dimensions, and accumulate valid frames
    while success:
        success, frame = video.read()
        if success:
            # convert to grayscale
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            # localize single facial region
            faces_coords = face_cascade.detectMultiScale(frame, 1.3, 5)
            if len(faces_coords) == 1:
                face_x, face_y, face_w, face_h = faces_coords[0]
                frame = frame[face_y:face_y + face_h, face_x:face_x + face_w]
                # localize oral region
                mouth_coords = mouth_cascade.detectMultiScale(frame, 1.3, 5)
                threshold = 0
                for (mouth_x, mouth_y, mouth_w, mouth_h) in mouth_coords:
                    if (mouth_y > threshold):
                        threshold = mouth_y
                        valid_mouth_coords = (mouth_x, mouth_y, mouth_w, mouth_h)
                    else:
                        pass
                mouth_x, mouth_y, mouth_w, mouth_h = valid_mouth_coords
                frame = frame[mouth_y:mouth_y + mouth_h, mouth_x:mouth_x + mouth_w]
                # equalize frame dimensions
                frame = cv2.resize(frame, (self.columns, self.rows)).astype('float32')
                # accumulate frames
                frames.append(frame)
            # ignore multiple facial region detections
            else:
                pass
    # equalize sequence lengths
    if len(frames) < self.frames_per_sequence:
        frames = [frames[0]]*(self.frames_per_sequence - len(frames)) + frames
    frames = np.array(frames[-self.frames_per_sequence:])
    # function to normalize and add channel dimension to each frame
    proc_frame = lambda frame: np.array([frame / 255.0]*3)
    samples_batch = [np.array(map(proc_frame, frames))]
    # random transformations for data augmentation
    for _ in xrange(0, self.samples_generated_per_sample-1):
        rotated_frames = random_rotation(frames, rg=4.5)
        shifted_frames = random_shift(rotated_frames, wrg=0.05, hrg=0.05)
        sheared_frames = random_shear(shifted_frames, intensity=0.08)
        zoomed_frames = random_zoom(sheared_frames, zoom_range=(1.05, 1.05))
        samples_batch.append(np.array(map(proc_frame, zoomed_frames)))
    return samples_batch
The error comes from two things:
The one you are reading comes from the fact that batch_idxs is a NumPy array, not a list. h5py datasets accept indexing with lists. But even if you change it to
X = training_save_file["X"][list(batch_idxs)]
you will still get an error. This comes from some restrictions on list indexing, which brings us to the second point.
If you read the documentation you sent me, this is written:
The following restrictions exist:
List selections may not be empty
Selection coordinates must be given in increasing order
Duplicate selections are ignored
Very long lists (> 1000 elements) may produce poor performance
The second bullet is our issue: your random shuffling when creating training_sample_idxs makes the index order random, and the dataset expects the indices in increasing order. This is a limitation you will have to deal with, but it is not too constraining, as the order within a batch doesn't matter; the model will be optimized on the whole batch anyway.
Does that help?
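As a concrete illustration (a minimal, self-contained sketch rather than the question's code), sorting each batch's indices and passing them as a plain list satisfies both the type and the increasing-order restrictions:

import h5py
import numpy as np

# Tiny demo dataset standing in for training_save_file["X"].
with h5py.File("demo.h5", "w") as f:
    f.create_dataset("X", data=np.arange(100).reshape(100, 1))

with h5py.File("demo.h5", "r") as f:
    batch_idxs = np.random.permutation(100)[:32]   # shuffled NumPy array, like a training_sample_idxs slice
    batch_idxs = sorted(batch_idxs.tolist())       # plain Python list in increasing order
    X = f["X"][batch_idxs]                         # fancy indexing now satisfies h5py's rules
    print(X.shape)

The same change (sorted(list(batch_idxs)) before indexing) applies inside generate_training_sequences() and generate_validation_sequences().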