tensorflow multiprocessing for image feature extraction - python

I have some basic function that takes in the URL of an image and transforms it via a VGG-16 CNN:
def convert_url(_id, url):
im = get_image(url)
return _id, np.squeeze(sess.run(end_points['vgg_16/fc7'], feed_dict={input_tensor: im}))
I have a large set of URLs (~60,000) on which I'd like to perform this function. Each iteration takes longer than a second, which is far too slow. I'd like to speed it up by using multiple processes in parallel. There is no shared state to worry about, so the usual pitfalls of multithreading aren't an issue.
However, I'm not exactly sure how to actually get tensorflow to work with the multiprocessing package. I know that you can't pass a tensorflow session to a Pool variable. So instead, I tried to initialize multiple instances of session:
def init():
global sess;
sess = tf.Session()
But when I actually launch the process, it just hangs indefinitely:
with Pool(processes=3,initializer=init) as pool:
results = pool.starmap(convert_url, list(id_img_dict.items())[0:5])
Note that the tensorflow graph is defined globally. I think that's the right way to do it but I'm not sure:
input_tensor = tf.placeholder(tf.float32, shape=(None,224,224,3), name='input_image')
scaled_input_tensor = tf.scalar_mul((1.0/255), input_tensor)
scaled_input_tensor = tf.subtract(scaled_input_tensor, 0.5)
scaled_input_tensor = tf.multiply(scaled_input_tensor, 2.0)
arg_scope = vgg_arg_scope()
with slim.arg_scope(arg_scope):
_, end_points = vgg_16(scaled_input_tensor, is_training=False)
saver = tf.train.Saver()
saver.restore(sess, checkpoint_file)
Can anyone help me get this working? Much obliged.

Forget about python's normal multithreading tools and use a tensorflow.contrib.data.Dataset. Try something like the following.
urls = ['img1.jpg', 'img2.jpg', ...]
batch_size = 16
n_batches = len(urls) // batch_size # do something more elegant for remainder
def load_img(url):
image = tf.read_file(url, name='image_data')
image = tf.image.decode_jpeg(image, channels=3, name='image')
return image
def preprocess(img_tensor):
img_tensor = (tf.cast(img_tensor, tf.float32) / 255 - 0.5)*2
img_tensor.set_shape((256, 256, 3)) # whatever shape
return img_tensor
dataset = tf.contrib.data.Dataset.from_tensor_slices(urls)
dataset = dataset.map(load_img).map(preprocess)
preprocessed_images = dataset.batch(
batch_size).make_one_shot_iterator().get_next()
arg_scope = vgg_arg_scope()
with slim.arg_scope(arg_scope):
_, end_points = vgg_16(preprocessed_images, is_training=False)
output = end_points['vgg_16/fc7']
results = []
with tf.Session() as sess:
tf.train.Saver().restore(sess, checkpoint_file)
for i in range(n_batches):
batch_results = sess.run(output)
results.extend(batch_results)
print('Done batch %d / %d' % (i+1, n_batches))

Related

Manually Get Next Batch or Use Identical Batch with TensorFlow Data API

I am trying to use the tf.Data API to accelerate my code and prevent GPU data starvation but there is one thing that is stopping me for being comfortable with it and it's the ability to use the same batch when calling the training op multiple times.
Suppose I have my dataset set up as
dataset = tf.data.TextLineDataset("textfile.txt")
dataset = dataset.shuffle(dataset_size)
dataset = dataset.padded_batch(batch_size, ...)
dataset = dataset.repeat()
iterator = dataset.make_one_shot_iterator()
x_batch = iterator.get_next()
loss1 = someFunctionOf(x_batch)
loss2 = someOtherFunctionOf(x_batch)
train_op1 = someOptimizerOf(loss1)
train_op2 = someOtherOptimizerOf(loss2)
but now whenever I call train_op1, iterator.get_next() is called and so when calling train_op2, I am training on the next batch.
From this question, I am aware that I can use a combination of flat_map and repeat(n) where n is the number of times I wanna repeat the same batch but this n would depend on the number of train_ops that I call which I have to count manually. Also, I need these two train_ops because they optimize different parts of my graph.
Thank you for your help!
Try the code below. It creates a copy of input and target so hopefully they will not change when you switch optimizer/loss_op. They are persistent between sess.run calls as long as you do not pass is_new:True flag.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
def ds_train(batch_size, num_epochs):
ds = (tf.data.Dataset.from_tensor_slices(([1.0,2.0,3.0,4.0,5.0], [-1,-2,-3,-4,-5]))
.batch(batch_size)
.repeat(num_epochs)
)
return ds
batch_size = 1
input_size = 1
num_epochs = 2
with tf.variable_scope("dataset"):
ds_t = ds_train(batch_size, num_epochs)
with tf.variable_scope("iterator"):
iterator_t = ds_t.make_initializable_iterator()
iterator_handle = tf.placeholder(tf.string, shape=[], name="iterator_handle")
iterator = tf.data.Iterator.from_string_handle(iterator_handle,
iterator_t.output_types,
iterator_t.output_shapes)
def next_item():
next_elem = iterator.get_next(name="next_element")
x, y = tf.cast(next_elem[0], tf.float32), next_elem[1]# tf.cast(next_elem[1], tf.int32)
return x, y
inputs = tf.Variable(tf.zeros(shape=[batch_size,input_size]), dtype=tf.float32, name="inputs", trainable=False, use_resource=True)
target = tf.Variable(tf.zeros(shape=[batch_size], dtype=tf.int32), dtype=tf.int32, name="target", trainable=False,use_resource=True)
is_new = tf.placeholder_with_default(tf.constant(False), shape=[], name="new_item_flag")
def new_data(batch_size, input_size):
# run the data layer to generate a new batch
next_inputs, next_target = next_item()
next_inputs = tf.reshape(next_inputs, shape=[batch_size, input_size])
with tf.control_dependencies([tf.assign(inputs, next_inputs), tf.assign(target, next_target)]):
return tf.identity(inputs), tf.identity(target)
def old_data():
# just forward the existing batch
return inputs, target
next_inputs, next_target = next_item()
inputs, target = tf.cond(is_new, lambda:new_data(batch_size, input_size), old_data)
with tf.Session() as sess:
sess.run([tf.global_variables_initializer(),tf.local_variables_initializer()])
handle_t = sess.run(iterator_t.string_handle())
sess.run(iterator_t.initializer)
while True:
try:
print(sess.run([inputs, target], feed_dict={iterator_handle:handle_t, is_new: False}))
print(sess.run([inputs, target], feed_dict={iterator_handle:handle_t, is_new: False}))
print(sess.run([inputs, target], feed_dict={iterator_handle:handle_t, is_new: True}))
except tf.errors.OutOfRangeError:
print("End of training dataset.")
break

TensorFlow : Eval Never ends despite starting queue runners

My code hangs on the following statement:
print('Beginning Eval...')
feed_dict_train = {img_data: images_batch.eval(session=session),
img_labels: labels_batch.eval(session=session)}
I have tried to eval with both a single example and as a batch. The code to understand how images_batch and labels_batch are created is as follows:
Creating a single example:
def read_single_example(filename):
#Mini-init
image_size = 256
filenames = tf.train.string_input_producer([filename], num_epochs = None)
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filenames)
features = tf.parse_single_example(
serialized_example,
features={
'label':tf.FixedLenFeature([], tf.float32),
'image':tf.FixedLenFeature([image_size*image_size*3], tf.float32)
})
label = features['label']
#label = tf.get_default_session().run(label)
image_data = features['image']
#image_data = tf.get_default_session().run(image_data)
image = tf.reshape(image_data,(256,256,3))
return label, image
Reading a single example:
label_train, image_train = read_single_example(path)
Batching the examples inside the training loop:
print('Getting Batch')
images_batch, labels_batch = tf.train.shuffle_batch(
[image_train, label_train], batch_size=batch_size,
capacity=2000,
min_after_dequeue=1000)
The records read from disk were created with the code found here but in essence are just a bunch of 256x256x3 images with a 1x41 tensor for labels. Thats what I think I did anyway, hoping the records arent where I made the error or I have like 100GB of dead data.
This is how the Tf session is started:
session = tf.Session()
init = tf.global_variables_initializer()
session.run(init)
tf.train.start_queue_runners(sess=session)
Which to my understanding was all I needed to do to make everything work as intended. A gist of the full code for the network can be found here. Hope I didnt do anything stupid, and thanks for the help!

How to use dataset in TensorFlow session for training

I like to perform image classification on our own large image libary (millions of labeled images) with tensorflow. I´m new to stackoverflow, python and tensorflow and worked myself through a few tutorials (mnist etc.) and got to the point, where i was able to prepare a TensorFlow datset from a dictionary including the absolute path to the images and the according labels. However, i´m stuck at the point using the dataset in a TensorFlow session. Here is my (example) code:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
import time
import mymodule # I build my module to read the images and labels
from tensorflow.python.framework import ops
from tensorflow.python.framework import dtypes
from tensorflow.contrib.data import Iterator
beginTime = time.time()
batch_size = 100
learning_rate = 0.005
max_steps = 2
NUM_CLASSES = 25
def input_parser(img_path, label):
one_hot = tf.one_hot(label, NUM_CLASSES)
img_file = tf.read_file(img_path)
img_decoded = tf.image.decode_jpeg(img_file, channels = 3)
return img_decoded, one_hot
#Import Training data (returns the dicitonary with paths and labels)
train_dict = mymodule.getFileMap(labelList, imageList)
#Import Test data
test_dict = mymodule.getFileMap(labelList, imageList)
#Get train data
train_file_list, train_label_list = get_file_label_list(train_dict)
train_images_tensor = ops.convert_to_tensor(train_file_list, dtype=dtypes.string)
train_labels_tensor = ops.convert_to_tensor(train_label_list, dtype=dtypes.int64)
#Get test data
test_file_list, test_label_list = get_file_label_list(test_dict)
test_images_tensor = ops.convert_to_tensor(test_file_list, dtype=dtypes.string)
test_labels_tensor = ops.convert_to_tensor(test_label_list, dtype=dtypes.int64)
#Create TensorFlow Datset object
train_data = tf.data.Dataset.from_tensor_slices((train_images_tensor, train_labels_tensor))
test_data = tf.data.Dataset.from_tensor_slices((test_images_tensor, test_labels_tensor))
# Transform the datset so that it contains decoded images
# and one-hot vector labels
train_data = train_data.map(input_parser)
test_data = test_data.map(input_parser)
# Batching --> How to do it right?
#train_data = train_data.batch(batch_size = 100)
#test_data = train_data.batch(batch_size = 100)
#Define input placeholders
image_size = 990*990*3
images_placeholder = tf.placeholder(tf.float32, shape=[None, image_size])
labels_placeholder = tf.placeholder(tf.int64, shape=[None])
# Define variables (these afe the values we want to optimize)
weigths = tf.Variable(tf.zeros([image_size, NUM_CLASSES]))
biases = tf.Variable(tf.zeros([NUM_CLASSES]))
# Define the classifier´s result
logits = tf.matmul(images_placeholder, weigths) + biases
# Define the loss function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits, labels = labels_placeholder))
# Define the training operation
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
# Operation comparing prediciton with true label
correct_prediciton = tf.equal(tf.argmax(logits, 1), labels_placeholder)
# Operation calculating the accuracy of our predicitons
accuracy = tf.reduce_mean(tf.cast(correct_prediciton, tf.float32))
#Create TensorFlow Iterator object
iterator = Iterator.from_structure(train_data.output_types,
train_data.output_shapes)
next_element = iterator.get_next()
#Create two initialization ops to switch between the datasets
train_init_op = iterator.make_initializer(train_data)
test_init_op = iterator.make_initializer(test_data)
with tf.Session() as sess:
#Initialize variables
sess.run(tf.global_variables_initializer())
sess.run(train_init_op)
for _ in range(10):
try:
elem = sess.run(next_element)
print(elem)
except tf.errors.OutOfRangeError:
print("End of training datset.")
break
Following this and this tutorial i could not solve the problem of how to use the (image and label) dataset in a tensorflow session for training. I was able to print out the datset by iterating through it, but wasn´t able to use it for learning.
I don´t understand how to access the images and labels seperately after they have been merged in the train_data = tf.data.Dataset.from_tensor_slices((train_images_tensor, train_labels_tensor)) operation, as requried by the 2nd tutorial. Also i don´t know how to implement batching correctly.
What i want to do in the session is basically this (from the 2nd tutorial):
# Generate input data batch
indices = np.random.choice(data_sets['images_train'].shape[0], batch_size)
images_batch = data_sets['images_train'][indices]
labels_batch = data_sets['labels_train'][indices]
# Periodically print out the model's current accuracy
if i % 100 == 0:
train_accuracy = sess.run(accuracy, feed_dict={
images_placeholder: images_batch, labels_placeholder: labels_batch})
print('Step {:5d}: training accuracy {:g}'.format(i, train_accuracy))
# Perform a single training step
sess.run(train_step, feed_dict={images_placeholder: images_batch,
labels_placeholder: labels_batch})
# After finishing the training, evaluate on the test set
test_accuracy = sess.run(accuracy, feed_dict={
images_placeholder: data_sets['images_test'],
labels_placeholder: data_sets['labels_test']})
print('Test accuracy {:g}'.format(test_accuracy))
endTime = time.time()
print('Total time: {:5.2f}s'.format(endTime - beginTime))
If anyone can tell me, how to access images and labels in the dataset sepearately and use it for training, i would be really thankful. Also a tip where and how to do the batching would be appreciated.
Thank you.
In your code, next_element is a tuple of two tensors, matching the structure of your datasets: i.e. it is a tuple whose first element is an image, and second element is a label. To access the individual tensors, you can do the following:
next_element = iterator.get_next()
next_image = next_element[0]
next_label = next_element[1]
# Or, in a single line:
next_image, next_label = iterator.get_next()
To batch a tf.data.Dataset, you can use the Dataset.batch() transformation. Your commented out code for this should simply work:
train_data = train_data.batch(batch_size = 100)
test_data = train_data.batch(batch_size = 100)

Tensorflow input pipeline with queues workflow - feeding data

I am using the Tensorflow input pipeline via queues.
Therefore, I do:
images, volumes = utils.inputs(FLAGS.train_file_path, FLAGS.batch_size, FLAGS.num_epochs)
v_images, v_volumes = utils.inputs(FLAGS.val_file_path, FLAGS.batch_size)
To read the data.
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
with tf.Session() as sess:
# Start input enqueue threads.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
try:
step = 0
while not coord.should_stop():
_, train_loss = sess.run([train_optimizer, loss])
if step % FLAGS.val_interval == 0:
for val_step in range(0, FLAGS.val_iter):
val_loss = sess.run(v_loss)
How can I make sure that I use the images, volumes and volumes for the training and the v_images, v_volumes for validation?
EDIT
logits = utils.inference(images, FLAGS.stacks, True)
v_logits = utils.inference(v_images, FLAGS.stacks, False)
loss = utils.get_loss(logits, volumes, FLAGS.stacks, 'train')
v_loss = utils.get_loss(v_logits, v_volumes, FLAGS.stacks, 'val')
EDIT 2
summary_train_op = tf.summary.merge_all('train')
summary_val_op = tf.summary.merge_all('val')
with tf.Session() as sess:
summary_writer = tf.summary.FileWriter(FLAGS.train_dir + FLAGS.run, sess.graph)
summary_writer_val = tf.summary.FileWriter(FLAGS.train_dir + FLAGS.run + FLAGS.run_val)
One technique is to build graph nodes individually for training and validation.
Consider "net" is your model graph.
logits_train, volumes = net(images,volumes)
logits_val, v_volumes = net(v_images, v_volumes)
loss_train = some_loss(logits_train,volumes)
train_optimizer = some_optimizer(loss_train)
Now, if you do,
1. sess.run([train_optimizer]) #It uses training data (Dequeues train data)
2. sess.run([logits_val]) #It uses validation data (Dequeues validation data )
I hope this helps.
Update:
_,loss_train_value, logits_train_value = sess.run([train_optimizer, loss_train,logits_train])
Update 2
I am doing everything explicitly as shown below. May be there is a better way to do it.
loss_train_s = tf.summary.scalar("loss_train", loss_train)
acc_train_s = tf.summary.scalar("acc_train", acc_train)
loss_val_s = tf.summary.scalar("loss_val", loss_val)
train_summary_op = tf.summary.merge([loss_train_s,acc_train_s])
val_summary_op = tf.summary.merge([loss_val_s])

How to *actually* read CSV data in TensorFlow?

I'm relatively new to the world of TensorFlow, and pretty perplexed by how you'd actually read CSV data into a usable example/label tensors in TensorFlow. The example from the TensorFlow tutorial on reading CSV data is pretty fragmented and only gets you part of the way to being able to train on CSV data.
Here's my code that I've pieced together, based off that CSV tutorial:
from __future__ import print_function
import tensorflow as tf
def file_len(fname):
with open(fname) as f:
for i, l in enumerate(f):
pass
return i + 1
filename = "csv_test_data.csv"
# setup text reader
file_length = file_len(filename)
filename_queue = tf.train.string_input_producer([filename])
reader = tf.TextLineReader(skip_header_lines=1)
_, csv_row = reader.read(filename_queue)
# setup CSV decoding
record_defaults = [[0],[0],[0],[0],[0]]
col1,col2,col3,col4,col5 = tf.decode_csv(csv_row, record_defaults=record_defaults)
# turn features back into a tensor
features = tf.stack([col1,col2,col3,col4])
print("loading, " + str(file_length) + " line(s)\n")
with tf.Session() as sess:
tf.initialize_all_variables().run()
# start populating filename queue
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
for i in range(file_length):
# retrieve a single instance
example, label = sess.run([features, col5])
print(example, label)
coord.request_stop()
coord.join(threads)
print("\ndone loading")
And here is an brief example from the CSV file I'm loading - pretty basic data - 4 feature columns, and 1 label column:
0,0,0,0,0
0,15,0,0,0
0,30,0,0,0
0,45,0,0,0
All the code above does is print each example from the CSV file, one by one, which, while nice, is pretty darn useless for training.
What I'm struggling with here is how you'd actually turn those individual examples, loaded one-by-one, into a training dataset. For example, here's a notebook I was working on in the Udacity Deep Learning course. I basically want to take the CSV data I'm loading, and plop it into something like train_dataset and train_labels:
def reformat(dataset, labels):
dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
# Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
I've tried using tf.train.shuffle_batch, like this, but it just inexplicably hangs:
for i in range(file_length):
# retrieve a single instance
example, label = sess.run([features, colRelevant])
example_batch, label_batch = tf.train.shuffle_batch([example, label], batch_size=file_length, capacity=file_length, min_after_dequeue=10000)
print(example, label)
So to sum up, here are my questions:
What am I missing about this process?
It feels like there is some key intuition that I'm missing about how to properly build an input pipeline.
Is there a way to avoid having to know the length of the CSV file?
It feels pretty inelegant to have to know the number of lines you want to process (the for i in range(file_length) line of code above)
Edit:
As soon as Yaroslav pointed out that I was likely mixing up imperative and graph-construction parts here, it started to become clearer. I was able to pull together the following code, which I think is closer to what would typically done when training a model from CSV (excluding any model training code):
from __future__ import print_function
import numpy as np
import tensorflow as tf
import math as math
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('dataset')
args = parser.parse_args()
def file_len(fname):
with open(fname) as f:
for i, l in enumerate(f):
pass
return i + 1
def read_from_csv(filename_queue):
reader = tf.TextLineReader(skip_header_lines=1)
_, csv_row = reader.read(filename_queue)
record_defaults = [[0],[0],[0],[0],[0]]
colHour,colQuarter,colAction,colUser,colLabel = tf.decode_csv(csv_row, record_defaults=record_defaults)
features = tf.stack([colHour,colQuarter,colAction,colUser])
label = tf.stack([colLabel])
return features, label
def input_pipeline(batch_size, num_epochs=None):
filename_queue = tf.train.string_input_producer([args.dataset], num_epochs=num_epochs, shuffle=True)
example, label = read_from_csv(filename_queue)
min_after_dequeue = 10000
capacity = min_after_dequeue + 3 * batch_size
example_batch, label_batch = tf.train.shuffle_batch(
[example, label], batch_size=batch_size, capacity=capacity,
min_after_dequeue=min_after_dequeue)
return example_batch, label_batch
file_length = file_len(args.dataset) - 1
examples, labels = input_pipeline(file_length, 1)
with tf.Session() as sess:
tf.initialize_all_variables().run()
# start populating filename queue
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
try:
while not coord.should_stop():
example_batch, label_batch = sess.run([examples, labels])
print(example_batch)
except tf.errors.OutOfRangeError:
print('Done training, epoch reached')
finally:
coord.request_stop()
coord.join(threads)
I think you are mixing up imperative and graph-construction parts here. The operation tf.train.shuffle_batch creates a new queue node, and a single node can be used to process the entire dataset. So I think you are hanging because you created a bunch of shuffle_batch queues in your for loop and didn't start queue runners for them.
Normal input pipeline usage looks like this:
Add nodes like shuffle_batch to input pipeline
(optional, to prevent unintentional graph modification) finalize graph
--- end of graph construction, beginning of imperative programming --
tf.start_queue_runners
while(True): session.run()
To be more scalable (to avoid Python GIL), you could generate all of your data using TensorFlow pipeline. However, if performance is not critical, you can hook up a numpy array to an input pipeline by using slice_input_producer. Here's an example with some Print nodes to see what's going on (messages in Print go to stdout when node is run)
tf.reset_default_graph()
num_examples = 5
num_features = 2
data = np.reshape(np.arange(num_examples*num_features), (num_examples, num_features))
print data
(data_node,) = tf.slice_input_producer([tf.constant(data)], num_epochs=1, shuffle=False)
data_node_debug = tf.Print(data_node, [data_node], "Dequeueing from data_node ")
data_batch = tf.batch([data_node_debug], batch_size=2)
data_batch_debug = tf.Print(data_batch, [data_batch], "Dequeueing from data_batch ")
sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())
tf.get_default_graph().finalize()
tf.start_queue_runners()
try:
while True:
print sess.run(data_batch_debug)
except tf.errors.OutOfRangeError as e:
print "No more inputs."
You should see something like this
[[0 1]
[2 3]
[4 5]
[6 7]
[8 9]]
[[0 1]
[2 3]]
[[4 5]
[6 7]]
No more inputs.
The "8, 9" numbers didn't fill up the full batch, so they didn't get produced. Also tf.Print are printed to sys.stdout, so they show up in separately in Terminal for me.
PS: a minimal of connecting batch to a manually initialized queue is in github issue 2193
Also, for debugging purposes you might want to set timeout on your session so that your IPython notebook doesn't hang on empty queue dequeues. I use this helper function for my sessions
def create_session():
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.per_process_gpu_memory_fraction=0.3 # don't hog all vRAM
config.operation_timeout_in_ms=60000 # terminate on long hangs
# create interactive session to register a default session
sess = tf.InteractiveSession("", config=config)
return sess
Scalability Notes:
tf.constant inlines copy of your data into the Graph. There's a fundamental limit of 2GB on size of Graph definition so that's an upper limit on size of data
You could get around that limit by using v=tf.Variable and saving the data into there by running v.assign_op with a tf.placeholder on right-hand side and feeding numpy array to the placeholder (feed_dict)
That still creates two copies of data, so to save memory you could make your own version of slice_input_producer which operates on numpy arrays, and uploads rows one at a time using feed_dict
Or you could try this, the code loads the Iris dataset into tensorflow using pandas and numpy and a simple one neuron output is printed in the session. Hope it helps for a basic understanding.... [ I havent added the way of one hot decoding labels].
import tensorflow as tf
import numpy
import pandas as pd
df=pd.read_csv('/home/nagarjun/Desktop/Iris.csv',usecols = [0,1,2,3,4],skiprows = [0],header=None)
d = df.values
l = pd.read_csv('/home/nagarjun/Desktop/Iris.csv',usecols = [5] ,header=None)
labels = l.values
data = numpy.float32(d)
labels = numpy.array(l,'str')
#print data, labels
#tensorflow
x = tf.placeholder(tf.float32,shape=(150,5))
x = data
w = tf.random_normal([100,150],mean=0.0, stddev=1.0, dtype=tf.float32)
y = tf.nn.softmax(tf.matmul(w,x))
with tf.Session() as sess:
print sess.run(y)
You can use latest tf.data API :
dataset = tf.contrib.data.make_csv_dataset(filepath)
iterator = dataset.make_initializable_iterator()
columns = iterator.get_next()
with tf.Session() as sess:
sess.run([iteator.initializer])
If anyone came here searching for a simple way to read absolutely large and sharded CSV files in tf.estimator API then , please see below my code
CSV_COLUMNS = ['ID','text','class']
LABEL_COLUMN = 'class'
DEFAULTS = [['x'],['no'],[0]] #Default values
def read_dataset(filename, mode, batch_size = 512):
def _input_fn(v_test=False):
# def decode_csv(value_column):
# columns = tf.decode_csv(value_column, record_defaults = DEFAULTS)
# features = dict(zip(CSV_COLUMNS, columns))
# label = features.pop(LABEL_COLUMN)
# return add_engineered(features), label
# Create list of files that match pattern
file_list = tf.gfile.Glob(filename)
# Create dataset from file list
#dataset = tf.data.TextLineDataset(file_list).map(decode_csv)
dataset = tf.contrib.data.make_csv_dataset(file_list,
batch_size=batch_size,
column_names=CSV_COLUMNS,
column_defaults=DEFAULTS,
label_name=LABEL_COLUMN)
if mode == tf.estimator.ModeKeys.TRAIN:
num_epochs = None # indefinitely
dataset = dataset.shuffle(buffer_size = 10 * batch_size)
else:
num_epochs = 1 # end-of-input after this
batch_features, batch_labels = dataset.make_one_shot_iterator().get_next()
#Begins - Uncomment for testing only -----------------------------------------------------<
if v_test == True:
with tf.Session() as sess:
print(sess.run(batch_features))
#End - Uncomment for testing only -----------------------------------------------------<
return add_engineered(batch_features), batch_labels
return _input_fn
Example usage in TF.estimator:
train_spec = tf.estimator.TrainSpec(input_fn = read_dataset(
filename = train_file,
mode = tf.estimator.ModeKeys.TRAIN,
batch_size = 128),
max_steps = num_train_steps)
2.0 Compatible Solution: This Answer might be provided by others in the above thread but I will provide additional links which will help the community.
dataset = tf.data.experimental.make_csv_dataset(
file_path,
batch_size=5, # Artificially small to make examples easier to show.
label_name=LABEL_COLUMN,
na_value="?",
num_epochs=1,
ignore_errors=True,
**kwargs)
For more information, please refer this Tensorflow Tutorial.

Categories