Using tf.train.string_input_producer and tf.image.decode_jpeg I manage to read from disk and decode a single image.
This is the code:
# -------- Graph
# Queue that yields the listed file paths; here the same path appears twice.
filename_queue = tf.train.string_input_producer(
[img_path, img_path])
image_reader = tf.WholeFileReader()
# read() is evaluated once per sess.run, so `image` below is a single decoded
# image rather than everything in the queue.
key, image_file = image_reader.read(filename_queue)
image = tf.image.decode_jpeg(image_file, channels=3)
# Run my network
logits = network.get_logits(image)
# -------- Session
sess = tf.Session()
coord = tf.train.Coordinator()
# Start the threads that feed filename_queue before anything reads from it.
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
# A single run yields the logits for a single image.
logits_output = sess.run(logits)
The thing is that when I look at the shape of logits_output I get only 1 value, even though the queue is 2 images long.
How can I read and decode the entire queue?
tf.WholeFileReader(), along with tf.train.string_input_producer(), works as an iterator, and thus does not have an easy way to evaluate the size of the complete dataset it is handling.
To obtain batches of N samples out of it, you could instead use image_reader.read_up_to(filename_queue, N).
Note: you can achieve the same using the newer tf.data pipeline:
def _parse_function(filename):
    """Read the file named by `filename` and return it decoded as an image tensor."""
    raw_bytes = tf.read_file(filename)
    return tf.image.decode_image(raw_bytes)
# A vector of filenames.
filenames = tf.constant([img_path, img_path])
# Slice the vector into one element per file, decode each, then group N at a time.
dataset = (
    tf.data.Dataset.from_tensor_slices(filenames)
    .map(_parse_function)
    .batch(N)
)
iterator = dataset.make_one_shot_iterator()
next_image_batch = iterator.get_next()
logits = network.get_logits(next_image_batch)
# ...
I am having trouble reading TFRecord format image data using the "new" (TensorFlow v1.4) Dataset API. I believe the problem is that I am somehow consuming the whole dataset instead of a single batch when trying to read. I have a working example of doing this using the batch/file-queue API here: https://github.com/gnperdue/TFExperiments/tree/master/conv (well, in the example I am running a classifier, but the code to read the TFRecord images is in the DataReaders.py class).
The problem functions are, I believe, these:
def parse_mnist_tfrec(tfrecord, features_shape):
    """Decode one serialized example into a (features, targets) pair.

    Args:
        tfrecord: a serialized example holding byte-string fields
            'features' and 'targets'.
        features_shape: shape the decoded feature bytes are reshaped to.

    Returns:
        (features, targets), both cast to float32; targets are one-hot
        encoded with depth 10.
    """
    feature_spec = {
        'features': tf.FixedLenFeature([], tf.string),
        'targets': tf.FixedLenFeature([], tf.string),
    }
    parsed = tf.parse_single_example(tfrecord, features=feature_spec)

    # Raw bytes -> uint8 -> requested shape -> float32.
    raw_features = tf.decode_raw(parsed['features'], tf.uint8)
    features = tf.cast(tf.reshape(raw_features, features_shape), tf.float32)

    # Raw bytes -> uint8 class ids -> one-hot -> float32.
    raw_targets = tf.decode_raw(parsed['targets'], tf.uint8)
    one_hot = tf.one_hot(indices=raw_targets, depth=10, on_value=1, off_value=0)
    targets = tf.cast(one_hot, tf.float32)
    return features, targets
class MNISTDataReaderDset:
    """Serve batches of parsed MNIST examples from TFRecord files via tf.data."""

    def __init__(self, data_reader_dict):
        # doesn't matter here
        # NOTE(review): the constructor body was elided in the original snippet.
        # batch_generator reads self.filenames_list, self.compression_type,
        # self.features_shape and self.batch_size, so they must be set here.
        pass

    def batch_generator(self, num_epochs=1):
        """Return (batch_features, batch_labels) tensors for the dataset.

        Args:
            num_epochs: how many times the dataset is repeated.

        Returns:
            The pair of tensors produced by a one-shot iterator; each
            evaluation yields the next batch of `self.batch_size` examples.
        """
        def parse_fn(tfrecord):
            # parse_mnist_tfrec takes (tfrecord, features_shape); the original
            # call also passed self.name, which does not match that signature.
            return parse_mnist_tfrec(tfrecord, self.features_shape)

        dataset = tf.data.TFRecordDataset(
            self.filenames_list, compression_type=self.compression_type
        )
        dataset = dataset.map(parse_fn)
        dataset = dataset.repeat(num_epochs)
        dataset = dataset.batch(self.batch_size)
        iterator = dataset.make_one_shot_iterator()
        batch_features, batch_labels = iterator.get_next()
        return batch_features, batch_labels
Then, in use:
# Build the dataset tensors once; each sess.run on them pulls the next batch.
batch_features, batch_labels = \
data_reader.batch_generator(num_epochs=1)
sess.run(tf.local_variables_initializer())
# NOTE(review): the Coordinator / queue runners look like leftovers from the
# queue-based reader; nothing visible here creates a queue -- confirm.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
try:
# look at 3 batches only
for _ in range(3):
# Note the fetch order: labels first, then features.
labels, feats = sess.run([
batch_labels, batch_features
])
This generates an error like:
[[Node: Reshape_1 = Reshape[T=DT_UINT8, Tshape=DT_INT32](DecodeRaw_1, Reshape_1/shape)]]
Input to reshape is a tensor with 50000 values, but the requested shape has 1
[[Node: Reshape_1 = Reshape[T=DT_UINT8, Tshape=DT_INT32](DecodeRaw_1, Reshape_1/shape)]]
[[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[?,28,28,1], [?,10]], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](OneShotIterator)]]
Does anyone have any ideas?
I have a gist with the full code in the reader example and a link to the TFRecord files (our old, good friend MNIST, in TFRecord form) here:
https://gist.github.com/gnperdue/56092626d611ae23370a21fdeeb2abe8
Thanks!
Edit - I also tried a flat_map, e.g.:
def batch_generator(self, num_epochs=1):
    """Return (batch_features, batch_labels) tensors, reading file by file.

    Each filename is mapped to its own TFRecordDataset, which is parsed and
    batched; flat_map then concatenates the per-file batches, and the
    result is repeated `num_epochs` times.

    TODO - we can use placeholders for the list of file names and
    init with a feed_dict when we call `sess.run` - give this a
    try with one list for training and one for validation
    """
    def parse_fn(tfrecord):
        # parse_mnist_tfrec takes (tfrecord, features_shape); the original
        # call also passed self.name, which does not match that signature.
        return parse_mnist_tfrec(tfrecord, self.features_shape)

    def batches_from_file(filename):
        # Batches never span two files because batching happens per file.
        records = tf.data.TFRecordDataset(
            filename, compression_type=self.compression_type
        )
        return records.map(parse_fn).batch(self.batch_size)

    dataset = tf.data.Dataset.from_tensor_slices(self.filenames_list)
    dataset = dataset.flat_map(batches_from_file)
    dataset = dataset.repeat(num_epochs)
    iterator = dataset.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    return batch_features, batch_labels
I also tried using just one file and not a list (in my first way of approaching this above). No matter what, it seems TF always wants to eat the entire file into the TFRecordDataset and won't operate on single records.
Okay, I figured this out - the code above is fine. The problem was my script for creating the TFRecords. Basically, I had a block like this
def write_tfrecord(reader, start_idx, stop_idx, tfrecord_file):
# Writes the whole start_idx..stop_idx range to `tfrecord_file` as ONE record:
# get_binary_data is called once for the full range and writer.write is
# called once, so the file ends up holding a single serialized Example.
writer = tf.python_io.TFRecordWriter(tfrecord_file)
tfeat, ttarg = get_binary_data(reader, start_idx, stop_idx)
example = tf.train.Example(
features=tf.train.Features(
feature={
# Both fields are stored as a single byte string each.
'features': tf.train.Feature(
bytes_list=tf.train.BytesList(value=[tfeat])
),
'targets': tf.train.Feature(
bytes_list=tf.train.BytesList(value=[ttarg])
)
}
)
)
writer.write(example.SerializeToString())
writer.close()
and I needed a block like this instead:
def write_tfrecord(reader, start_idx, stop_idx, tfrecord_file):
    """Write one serialized tf.train.Example per index to `tfrecord_file`.

    Args:
        reader: data source handed through to `get_binary_data`.
        start_idx: first index to write (inclusive).
        stop_idx: index at which to stop (exclusive, as in `range`).
        tfrecord_file: path of the TFRecord file to create.
    """
    writer = tf.python_io.TFRecordWriter(tfrecord_file)
    try:
        for idx in range(start_idx, stop_idx):
            tfeat, ttarg = get_binary_data(reader, idx)
            example = tf.train.Example(
                features=tf.train.Features(
                    feature={
                        'features': tf.train.Feature(
                            bytes_list=tf.train.BytesList(value=[tfeat])
                        ),
                        'targets': tf.train.Feature(
                            bytes_list=tf.train.BytesList(value=[ttarg])
                        ),
                    }
                )
            )
            # One record per example, so readers can consume them singly.
            writer.write(example.SerializeToString())
    finally:
        # Close the file even if reading or serializing an example fails.
        writer.close()
Which is to say - I was basically writing my entire block of data as one giant TFRecord when I needed to be making one per example in the data.
It turns out that if you do it either way in the old file and batch-queue API, everything works - functions like tf.train.batch are auto-magically 'smart' enough to either carve the big block up or concatenate lots of single-example records into a batch, depending on what you give them. When I fixed my code that made the TFRecords file, I didn't need to change anything in my old file and batch-queue code and it still consumed the TFRecords file just fine. However, the Dataset API is sensitive to this difference. That is why in my code above it always appeared to be consuming the entire file - it's because the entire file really was one big TFRecord.
I have some basic function that takes in the URL of an image and transforms it via a VGG-16 CNN:
def convert_url(_id, url):
    """Return `_id` paired with the squeezed 'vgg_16/fc7' output for the image at `url`."""
    im = get_image(url)
    fc7_activations = sess.run(
        end_points['vgg_16/fc7'], feed_dict={input_tensor: im}
    )
    return _id, np.squeeze(fc7_activations)
I have a large set of URLs (~60,000) on which I'd like to perform this function. Each iteration takes longer than a second, which is far too slow. I'd like to speed it up by using multiple processes in parallel. There is no shared state to worry about, so the usual pitfalls of multithreading aren't an issue.
However, I'm not exactly sure how to actually get tensorflow to work with the multiprocessing package. I know that you can't pass a tensorflow session to a Pool variable. So instead, I tried to initialize multiple instances of session:
def init():
    """Pool initializer: bind a new tf.Session to the module-level name `sess`."""
    global sess
    sess = tf.Session()
But when I actually launch the process, it just hangs indefinitely:
# Submit only the first five (id, url) pairs.
first_items = list(id_img_dict.items())[:5]
with Pool(processes=3, initializer=init) as pool:
    results = pool.starmap(convert_url, first_items)
Note that the tensorflow graph is defined globally. I think that's the right way to do it but I'm not sure:
# Graph input: a batch of 224x224 3-channel images as float32.
input_tensor = tf.placeholder(tf.float32, shape=(None,224,224,3), name='input_image')
# Rescale in three steps: x/255, then -0.5, then *2
# (maps [0, 255] to [-1, 1], assuming 0-255 pixel values).
scaled_input_tensor = tf.scalar_mul((1.0/255), input_tensor)
scaled_input_tensor = tf.subtract(scaled_input_tensor, 0.5)
scaled_input_tensor = tf.multiply(scaled_input_tensor, 2.0)
arg_scope = vgg_arg_scope()
with slim.arg_scope(arg_scope):
_, end_points = vgg_16(scaled_input_tensor, is_training=False)
# Restore the checkpoint into the module-level `sess`, which must already exist here.
saver = tf.train.Saver()
saver.restore(sess, checkpoint_file)
Can anyone help me get this working? Much obliged.
Forget about python's normal multithreading tools and use a tensorflow.contrib.data.Dataset. Try something like the following.
urls = ['img1.jpg', 'img2.jpg', ...]
batch_size = 16
# Ceiling division: Dataset.batch() also yields the final, smaller batch, so
# it has to be counted or the last images are never evaluated.
n_batches = (len(urls) + batch_size - 1) // batch_size
def load_img(url):
    """Read the file at `url` and decode it as a 3-channel JPEG tensor."""
    raw_bytes = tf.read_file(url, name='image_data')
    return tf.image.decode_jpeg(raw_bytes, channels=3, name='image')
def preprocess(img_tensor):
    """Cast to float32, rescale with (x / 255 - 0.5) * 2, and fix the static shape."""
    as_float = tf.cast(img_tensor, tf.float32)
    rescaled = (as_float / 255 - 0.5) * 2
    rescaled.set_shape((256, 256, 3))  # whatever shape
    return rescaled
# Input pipeline: file names -> decoded images -> preprocessed images -> batches.
dataset = tf.contrib.data.Dataset.from_tensor_slices(urls)
dataset = dataset.map(load_img)
dataset = dataset.map(preprocess)
iterator = dataset.batch(batch_size).make_one_shot_iterator()
preprocessed_images = iterator.get_next()

# Build the network directly on top of the pipeline output.
arg_scope = vgg_arg_scope()
with slim.arg_scope(arg_scope):
    _, end_points = vgg_16(preprocessed_images, is_training=False)
output = end_points['vgg_16/fc7']

results = []
with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess, checkpoint_file)
    for i in range(n_batches):
        # Each run pulls the next batch from the iterator and evaluates fc7 on it.
        batch_results = sess.run(output)
        results.extend(batch_results)
        print('Done batch %d / %d' % (i+1, n_batches))
I'm experimenting with tensorflow and I'm trying to read from a csv file and print out a batch of its data via shuffle_batch. I've gone through the decode_csv docs and the shuffle_batch docs, but I'm still unable to get it working.
Here's what I have:
import tensorflow as tf
sess = tf.InteractiveSession()
# num_epochs=1: per the string_input_producer docs this creates a local epoch
# counter, and nothing in this snippet runs tf.local_variables_initializer().
filename_queue = tf.train.string_input_producer(
["./data/train.csv"], num_epochs=1, shuffle=True) # total record count in csv is 30K
reader = tf.TextLineReader()
key, value = reader.read(filename_queue)
record_defaults = [["1"], ["2"]] # irrelevant for this discussion
input, outcome = tf.decode_csv(value, record_defaults=record_defaults)
# Queue sizing: capacity leaves room for three batches on top of min_after_dequeue.
min_after_dequeue = 1000
batch_size = 10
capacity = min_after_dequeue + 3 * batch_size
example_batch = tf.train.shuffle_batch([outcome], batch_size, capacity, min_after_dequeue)
coord = tf.train.Coordinator()
tf.train.start_queue_runners(sess, coord=coord)
# Pull one shuffled batch of `outcome` values.
example_batch.eval(session = sess)
Running this will generate this exception:
OutOfRangeError: RandomShuffleQueue
'_3_shuffle_batch_1/random_shuffle_queue' is closed
and has insufficient elements (requested 10, current size 0)
I'm not sure what the issue is. I have a feeling it's due to the session and the way I'm handling it; I'm probably not doing it properly.
Try removing the num_epochs=1 from your string_input_producer initializer.
"Note: if num_epochs is not None, this function creates local counter epochs. Use local_variables_initializer() to initialize local variables." see from: https://www.tensorflow.org/api_docs/python/tf/train/string_input_producer
I'm relatively new to the world of TensorFlow, and pretty perplexed by how you'd actually read CSV data into a usable example/label tensors in TensorFlow. The example from the TensorFlow tutorial on reading CSV data is pretty fragmented and only gets you part of the way to being able to train on CSV data.
Here's my code that I've pieced together, based off that CSV tutorial:
from __future__ import print_function
import tensorflow as tf
def file_len(fname):
    """Return the number of lines in the file at `fname` (0 for an empty file)."""
    with open(fname) as f:
        # Counting lazily avoids the unbound loop variable the enumerate
        # version hits when the file has no lines at all.
        return sum(1 for _ in f)
filename = "csv_test_data.csv"
# setup text reader
file_length = file_len(filename)
filename_queue = tf.train.string_input_producer([filename])
# The first line of the file is skipped as a header.
reader = tf.TextLineReader(skip_header_lines=1)
_, csv_row = reader.read(filename_queue)
# setup CSV decoding
# Five integer columns, each defaulting to 0.
record_defaults = [[0],[0],[0],[0],[0]]
col1,col2,col3,col4,col5 = tf.decode_csv(csv_row, record_defaults=record_defaults)
# turn features back into a tensor
# The first four columns are the features; col5 is fetched separately as the label.
features = tf.stack([col1,col2,col3,col4])
print("loading, " + str(file_length) + " line(s)\n")
with tf.Session() as sess:
tf.initialize_all_variables().run()
# start populating filename queue
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
# NOTE(review): file_length counts every line in the file while the reader
# skips one header line, so this loop appears to request one row more than
# the file holds per pass -- confirm.
for i in range(file_length):
# retrieve a single instance
example, label = sess.run([features, col5])
print(example, label)
coord.request_stop()
coord.join(threads)
print("\ndone loading")
And here is a brief example from the CSV file I'm loading - pretty basic data - 4 feature columns, and 1 label column:
0,0,0,0,0
0,15,0,0,0
0,30,0,0,0
0,45,0,0,0
All the code above does is print each example from the CSV file, one by one, which, while nice, is pretty darn useless for training.
What I'm struggling with here is how you'd actually turn those individual examples, loaded one-by-one, into a training dataset. For example, here's a notebook I was working on in the Udacity Deep Learning course. I basically want to take the CSV data I'm loading, and plop it into something like train_dataset and train_labels:
def reformat(dataset, labels):
    """Flatten each image to a float32 row and one-hot encode the labels as float32."""
    flat_size = image_size * image_size
    dataset = dataset.reshape((-1, flat_size)).astype(np.float32)
    # Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
    class_ids = np.arange(num_labels)
    labels = (class_ids == labels[:, None]).astype(np.float32)
    return dataset, labels
# Reformat every split in place, then report the resulting shapes.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
for split_name, split_data, split_labels in (
        ('Training set', train_dataset, train_labels),
        ('Validation set', valid_dataset, valid_labels),
        ('Test set', test_dataset, test_labels)):
    print(split_name, split_data.shape, split_labels.shape)
I've tried using tf.train.shuffle_batch, like this, but it just inexplicably hangs:
for i in range(file_length):
# retrieve a single instance
example, label = sess.run([features, colRelevant])
# NOTE: shuffle_batch is a graph-construction call; invoking it here adds a new
# batching queue on every iteration, and no queue runners are started for them.
example_batch, label_batch = tf.train.shuffle_batch([example, label], batch_size=file_length, capacity=file_length, min_after_dequeue=10000)
print(example, label)
So to sum up, here are my questions:
What am I missing about this process?
It feels like there is some key intuition that I'm missing about how to properly build an input pipeline.
Is there a way to avoid having to know the length of the CSV file?
It feels pretty inelegant to have to know the number of lines you want to process (the for i in range(file_length) line of code above)
Edit:
As soon as Yaroslav pointed out that I was likely mixing up imperative and graph-construction parts here, it started to become clearer. I was able to pull together the following code, which I think is closer to what would typically be done when training a model from CSV (excluding any model training code):
from __future__ import print_function
import numpy as np
import tensorflow as tf
import math as math
import argparse
# Command line: a single positional argument, the dataset path (read as args.dataset).
parser = argparse.ArgumentParser()
parser.add_argument('dataset')
args = parser.parse_args()
def file_len(fname):
    """Return the number of lines in the file at `fname` (0 for an empty file)."""
    with open(fname) as f:
        # Counting lazily avoids the unbound loop variable the enumerate
        # version hits when the file has no lines at all.
        return sum(1 for _ in f)
def read_from_csv(filename_queue):
    """Read one CSV row from the queue and split it into (features, label).

    The header line of each file is skipped. A row has five integer
    columns: the first four are stacked into `features`, the fifth is
    wrapped in a one-element `label` tensor.
    """
    reader = tf.TextLineReader(skip_header_lines=1)
    _, csv_row = reader.read(filename_queue)
    # One integer default (0) per column.
    columns = tf.decode_csv(csv_row, record_defaults=[[0], [0], [0], [0], [0]])
    features = tf.stack(columns[:4])
    label = tf.stack(columns[4:])
    return features, label
def input_pipeline(batch_size, num_epochs=None):
    """Return shuffled (example_batch, label_batch) tensors read from args.dataset.

    Args:
        batch_size: number of rows per batch.
        num_epochs: passed to string_input_producer; None means no limit.
    """
    filename_queue = tf.train.string_input_producer(
        [args.dataset], num_epochs=num_epochs, shuffle=True)
    single_example, single_label = read_from_csv(filename_queue)

    # Keep room for three batches on top of the shuffling minimum.
    min_after_dequeue = 10000
    queue_capacity = min_after_dequeue + 3 * batch_size
    batched_examples, batched_labels = tf.train.shuffle_batch(
        [single_example, single_label],
        batch_size=batch_size,
        capacity=queue_capacity,
        min_after_dequeue=min_after_dequeue)
    return batched_examples, batched_labels
# Number of data rows: every line except the header.
file_length = file_len(args.dataset) - 1
# One batch holding the whole file, read for exactly one epoch.
examples, labels = input_pipeline(file_length, 1)

with tf.Session() as sess:
    tf.initialize_all_variables().run()
    # num_epochs is set, so string_input_producer keeps its epoch counter in a
    # local variable; those are not covered by initialize_all_variables().
    tf.local_variables_initializer().run()

    # start populating filename queue
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        while not coord.should_stop():
            example_batch, label_batch = sess.run([examples, labels])
            print(example_batch)
    except tf.errors.OutOfRangeError:
        # Raised once the single epoch has been consumed.
        print('Done training, epoch reached')
    finally:
        coord.request_stop()

    coord.join(threads)
I think you are mixing up imperative and graph-construction parts here. The operation tf.train.shuffle_batch creates a new queue node, and a single node can be used to process the entire dataset. So I think you are hanging because you created a bunch of shuffle_batch queues in your for loop and didn't start queue runners for them.
Normal input pipeline usage looks like this:
Add nodes like shuffle_batch to input pipeline
(optional, to prevent unintentional graph modification) finalize graph
--- end of graph construction, beginning of imperative programming --
tf.start_queue_runners
while(True): session.run()
To be more scalable (to avoid Python GIL), you could generate all of your data using TensorFlow pipeline. However, if performance is not critical, you can hook up a numpy array to an input pipeline by using slice_input_producer. Here's an example with some Print nodes to see what's going on (messages in Print go to stdout when node is run)
tf.reset_default_graph()

num_examples = 5
num_features = 2
data = np.reshape(np.arange(num_examples*num_features), (num_examples, num_features))
print(data)

# The queue helpers live under tf.train. Rows are sliced off `data` one at a
# time, for a single epoch and in order.
(data_node,) = tf.train.slice_input_producer([tf.constant(data)], num_epochs=1, shuffle=False)
data_node_debug = tf.Print(data_node, [data_node], "Dequeueing from data_node ")
data_batch = tf.train.batch([data_node_debug], batch_size=2)
data_batch_debug = tf.Print(data_batch, [data_batch], "Dequeueing from data_batch ")

sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())
# num_epochs is set, so the producer's epoch counter is a local variable and
# needs its own initializer.
sess.run(tf.local_variables_initializer())
# Freeze the graph so nothing below can add nodes by accident.
tf.get_default_graph().finalize()
tf.train.start_queue_runners()

try:
    while True:
        print(sess.run(data_batch_debug))
except tf.errors.OutOfRangeError:
    # The producer ran out of rows after its single epoch.
    print("No more inputs.")
You should see something like this
[[0 1]
[2 3]
[4 5]
[6 7]
[8 9]]
[[0 1]
[2 3]]
[[4 5]
[6 7]]
No more inputs.
The "8, 9" numbers didn't fill up a full batch, so they didn't get produced. Also, the tf.Print messages are printed to sys.stdout, so they show up separately in the Terminal for me.
PS: a minimal example of connecting batch to a manually initialized queue is in GitHub issue 2193
Also, for debugging purposes you might want to set timeout on your session so that your IPython notebook doesn't hang on empty queue dequeues. I use this helper function for my sessions
def create_session():
    """Return an InteractiveSession with capped GPU memory and an operation timeout."""
    session_config = tf.ConfigProto(log_device_placement=True)
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.3  # don't hog all vRAM
    session_config.operation_timeout_in_ms = 60000  # terminate on long hangs
    # create interactive session to register a default session
    return tf.InteractiveSession("", config=session_config)
Scalability Notes:
tf.constant inlines copy of your data into the Graph. There's a fundamental limit of 2GB on size of Graph definition so that's an upper limit on size of data
You could get around that limit by using v=tf.Variable and saving the data into there by running v.assign_op with a tf.placeholder on right-hand side and feeding numpy array to the placeholder (feed_dict)
That still creates two copies of data, so to save memory you could make your own version of slice_input_producer which operates on numpy arrays, and uploads rows one at a time using feed_dict
Or you could try this: the code loads the Iris dataset into TensorFlow using pandas and numpy, and the output of a simple single-layer network is printed in the session. Hope it helps for a basic understanding... [I haven't added a way of one-hot encoding the labels].
import tensorflow as tf
import numpy
import pandas as pd
# Columns 0-4 are loaded as numeric data and column 5 as the label; the
# header row is skipped in both reads so data and labels stay row-aligned.
df = pd.read_csv('/home/nagarjun/Desktop/Iris.csv', usecols=[0, 1, 2, 3, 4], skiprows=[0], header=None)
d = df.values
l = pd.read_csv('/home/nagarjun/Desktop/Iris.csv', usecols=[5], skiprows=[0], header=None)
data = numpy.float32(d)
labels = numpy.array(l, 'str')

# tensorflow
# The placeholder is fed with `data` at run time instead of being overwritten
# by it, so the graph input stays a real input. Assumes 150 data rows.
x = tf.placeholder(tf.float32, shape=(150, 5))
w = tf.random_normal([100, 150], mean=0.0, stddev=1.0, dtype=tf.float32)
y = tf.nn.softmax(tf.matmul(w, x))

with tf.Session() as sess:
    print(sess.run(y, feed_dict={x: data}))
You can use the latest tf.data API:
# make_csv_dataset requires a batch size; 32 is an arbitrary example value.
dataset = tf.contrib.data.make_csv_dataset(filepath, batch_size=32)
iterator = dataset.make_initializable_iterator()
columns = iterator.get_next()
with tf.Session() as sess:
    # An initializable iterator must be initialized before `columns` is evaluated.
    sess.run([iterator.initializer])
If anyone came here searching for a simple way to read very large, sharded CSV files with the tf.estimator API, please see my code below:
# Column names, in file order.
CSV_COLUMNS = ['ID', 'text', 'class']
# Name of the column used as the label.
LABEL_COLUMN = 'class'
# Default values, one single-element list per column.
DEFAULTS = [['x'], ['no'], [0]]
def read_dataset(filename, mode, batch_size = 512):
    """Build an Estimator input function that reads batched CSV data.

    Args:
        filename: file pattern; every file matching it is read.
        mode: a tf.estimator.ModeKeys value. In TRAIN mode the data repeats
            indefinitely and batches are shuffled; otherwise the input
            ends after a single pass.
        batch_size: number of rows per batch.

    Returns:
        A function `_input_fn(v_test=False)` returning
        `(add_engineered(batch_features), batch_labels)`. With
        `v_test` set, one batch of features is also evaluated and printed.
    """
    def _input_fn(v_test=False):
        # Create list of files that match pattern
        file_list = tf.gfile.Glob(filename)

        is_training = mode == tf.estimator.ModeKeys.TRAIN
        # None repeats indefinitely; 1 signals end-of-input after one pass.
        # The original computed this value but never handed it to the dataset.
        num_epochs = None if is_training else 1

        # Create dataset from file list
        dataset = tf.contrib.data.make_csv_dataset(
            file_list,
            batch_size=batch_size,
            column_names=CSV_COLUMNS,
            column_defaults=DEFAULTS,
            label_name=LABEL_COLUMN,
            num_epochs=num_epochs)
        if is_training:
            dataset = dataset.shuffle(buffer_size=10 * batch_size)

        batch_features, batch_labels = dataset.make_one_shot_iterator().get_next()

        # For testing only: evaluate and print one batch of features.
        if v_test:
            with tf.Session() as sess:
                print(sess.run(batch_features))

        return add_engineered(batch_features), batch_labels
    return _input_fn
Example usage in TF.estimator:
# Training input: batches of 128 rows from train_file.
train_input_fn = read_dataset(
    filename=train_file,
    mode=tf.estimator.ModeKeys.TRAIN,
    batch_size=128)
train_spec = tf.estimator.TrainSpec(
    input_fn=train_input_fn,
    max_steps=num_train_steps)
2.0 Compatible Solution: This Answer might be provided by others in the above thread but I will provide additional links which will help the community.
# Build a batched CSV dataset that makes a single pass over the input (num_epochs=1).
# NOTE(review): file_path, LABEL_COLUMN and kwargs are not defined in this
# snippet; presumably it is the body of a helper taking (file_path, **kwargs) -- confirm.
dataset = tf.data.experimental.make_csv_dataset(
file_path,
batch_size=5, # Artificially small to make examples easier to show.
label_name=LABEL_COLUMN,
na_value="?",
num_epochs=1,
ignore_errors=True,
**kwargs)
For more information, please refer to this TensorFlow tutorial.