I asked a previous question about the same code here: where and how to put the filename in this tensorflow code? I'm not sure if I should merge that into this question or leave it as is.
The following code is from Sirajology's GitHub. I have not found a super straightforward tutorial on how to get one's own .csv file into a simple TensorFlow neural network, so my hope is that this thread might provide that instruction for future searchers.
The code is as follows:
import tensorflow.python.platform
import numpy as np
import tensorflow as tf
# Global variables.
NUM_LABELS = 2 # The number of labels.
BATCH_SIZE = 5 # The number of training examples to use per training step.
# Define the flags useable from the command line.
tf.app.flags.DEFINE_string('train', None,
'File containing the training data (labels & features).')
tf.app.flags.DEFINE_string('test', None,
'File containing the test data (labels & features).')
tf.app.flags.DEFINE_integer('num_epochs', 1,
'Number of examples to separate from the training '
'data for the validation set.')
tf.app.flags.DEFINE_boolean('verbose', False, 'Produce verbose output.')
FLAGS = tf.app.flags.FLAGS
# Extract numpy representations of the labels and features given rows consisting of:
# label, feat_0, feat_1, ..., feat_n
def extract_data(filename):
# Arrays to hold the labels and feature vectors.
labels = []
fvecs = []
# Iterate over the rows, splitting the label from the features. Convert labels
# to integers and features to floats.
for line in file(filename):
row = line.split(",")
labels.append(int(row[0]))
fvecs.append([float(x) for x in row[1:]])
# Convert the array of float arrays into a numpy float matrix.
fvecs_np = np.matrix(fvecs).astype(np.float32)
# Convert the array of int labels into a numpy array.
labels_np = np.array(labels).astype(dtype=np.uint8)
# Convert the int numpy array into a one-hot matrix.
labels_onehot = (np.arange(NUM_LABELS) == labels_np[:, None]).astype(np.float32)
# Return a pair of the feature matrix and the one-hot label matrix.
return fvecs_np,labels_onehot
def main(argv=None):
# Be verbose?
verbose = FLAGS.verbose
# Get the data.
train_data_filename = FLAGS.train
test_data_filename = FLAGS.test
# Extract it into numpy matrices.
train_data,train_labels = extract_data(train_data_filename)
test_data, test_labels = extract_data(test_data_filename)
# Get the shape of the training data.
train_size,num_features = train_data.shape
# Get the number of epochs for training.
num_epochs = FLAGS.num_epochs
# This is where training samples and labels are fed to the graph.
# These placeholder nodes will be fed a batch of training data at each
# training step using the {feed_dict} argument to the Run() call below.
x = tf.placeholder("float", shape=[None, num_features])
y_ = tf.placeholder("float", shape=[None, NUM_LABELS])
# For the test data, hold the entire dataset in one constant node.
test_data_node = tf.constant(test_data)
# Define and initialize the network.
# These are the weights that inform how much each feature contributes to
# the classification.
W = tf.Variable(tf.zeros([num_features,NUM_LABELS]))
b = tf.Variable(tf.zeros([NUM_LABELS]))
y = tf.nn.softmax(tf.matmul(x,W) + b)
# Optimization.
cross_entropy = -tf.reduce_sum(y_*tf.log(y))
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
# Evaluation.
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
# Create a local session to run this computation.
with tf.Session() as s:
# Run all the initializers to prepare the trainable parameters.
tf.initialize_all_variables().run()
if verbose:
print ('Initialized!')
print
print ('Training.')
# Iterate and train.
for step in xrange(num_epochs * train_size // BATCH_SIZE):
if verbose:
print (step,)
offset = (step * BATCH_SIZE) % train_size
batch_data = train_data[offset:(offset + BATCH_SIZE), :]
batch_labels = train_labels[offset:(offset + BATCH_SIZE)]
train_step.run(feed_dict={x: batch_data, y_: batch_labels})
if verbose and offset >= train_size-BATCH_SIZE:
print
# Give very detailed output.
if verbose:
print
print ('Weight matrix.')
print (s.run(W))
print
print ('Bias vector.')
print (s.run(b))
print
print ("Applying model to first test instance.")
first = test_data[:1]
print ("Point =", first)
print ("Wx+b = ", s.run(tf.matmul(first,W)+b))
print ("softmax(Wx+b) = ", s.run(tf.nn.softmax(tf.matmul(first,W)+b)))
print
print ("Accuracy:", accuracy.eval(feed_dict={x: test_data, y_: test_labels}))
if __name__ == '__main__':
tf.app.run()
When I run the code from the terminal (Windows 10 command line) with the following command, I get the errors below. Any help is greatly appreciated!
python YourScript.py --train FileName.csv --test TestName.csv --num_epochs 5 --verbose True
Error #1
File "softmax.py", line 133, in <module>
    tf.app.run()
Error #2
File "C:\app.py", line 43, in run
    sys.exit(main(sys.argv[:1] + flags_passthrough))
Error #3
File "softmax.py", line 57, in main
    train_data, train_labels = extract_data(train_data_filename)
Error #4
File "softmax.py", line 31, in extract_data
    for line in file(filename):
NameError: name 'file' is not defined
The relevant lines of extract_data in my script are:
for line in file(filename):
    row = line.split(",")
    labels.append(int(row[7]))
    fvecs.append([float(x) for x in row[1:6]])
It looks like the problem stems from this line, which uses a built-in function (file()) that is not available in Python 3.5:
for line in file(filename):
Replacing it with the following line should fix the error:
for line in open(filename):
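For completeness, here is a sketch of extract_data with that one-line change applied (using a with block so the file is closed); everything else is unchanged from the code posted above:
def extract_data(filename):
    labels = []
    fvecs = []
    # open() exists in both Python 2 and 3; file() was removed in Python 3.
    with open(filename) as f:
        for line in f:
            row = line.split(",")
            labels.append(int(row[0]))
            fvecs.append([float(x) for x in row[1:]])
    fvecs_np = np.matrix(fvecs).astype(np.float32)
    labels_np = np.array(labels).astype(dtype=np.uint8)
    labels_onehot = (np.arange(NUM_LABELS) == labels_np[:, None]).astype(np.float32)
    return fvecs_np, labels_onehot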
I am trying to implement linear regression using TensorFlow. The following is the code I am using:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
rng = np.random
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# reading data from a csv file
file1 = pd.read_csv('out.csv')
x_data=file1['^GSPC']
# converting dataframe into array
x_data=x_data.values
y_data=file1['FB']
#converting dataframe into array
y_data=y_data.values
n_steps = 1000 #Total number of steps
n_iterations = [] #Nth iteration value
n_loss = [] #Loss at nth iteration
learned_weight = [] #weight at nth iteration
learned_bias = [] #bias value at nth iteration
# Try to find values for W and b that compute y_data = W * x_data + b
W = tf.Variable(rng.randn())
b = tf.Variable(rng.rand())
y = W * x_data + b
# Minimize the mean squared errors.
loss=tf.reduce_sum(tf.pow(y-y_data, 2))/(2*28)
optimizer = tf.train.GradientDescentOptimizer(0.01)
train = optimizer.minimize(loss)
with tf.Session() as sess:
# Before starting, initialize the variables. We will 'run' this first.
sess.run(tf.global_variables_initializer())
for step in range(n_steps):
sess.run(train)
n_iterations.append(step)
n_loss.append(loss.eval())
learned_weight.append(W.eval())
learned_bias.append(b.eval())
print("Final Weight: "+str(learned_weight[-1])+", Final Bias: "+str(learned_bias[-1]) + ", Final cost:"+str(n_loss[-1]))
The problem is that every time I run the code I get different results (weights, bias, and cost/loss). I have read in a few resources that the weights, bias, and cost should be approximately the same in every run.
Secondly, the fitted line (y = weights * x_data + bias) does not fit the training data very well.
Thirdly, I have to convert the dataframes x_data and y_data to arrays as follows:
x_data=x_data.values
y_data=y_data.values
If I don't do that, my code raises the following error:
Traceback (most recent call last):
  File "python", line 33, in <module>
  File "tensorflow/python/framework/fast_tensor_util.pyx", line 120, in tensorflow.python.framework.fast_tensor_util.AppendObjectArrayToTensorProto
TypeError: Expected binary or unicode string, got <tf.Tensor 'sub:0' shape=(28,) dtype=float32>
Please help me understand what I am doing wrong!
P.S.: My questions may sound stupid because I am new to TensorFlow and machine learning.
The code is implemented incorrectly:
Use tf.placeholder for the data that will be passed into the model.
Use the feed_dict argument of sess.run to pass data to the placeholders when executing the graph.
Here's an updated example:
Build the Graph
import numpy as np
import tensorflow as tf
# dataset
X_data = np.random.randn(100,3)
y_data = 2*np.sum(X_data, 1)+0.01
# reshape y to be a column vector
y_data = np.reshape(y_data, [-1, 1])
# parameters
n_steps = 1000 #Total number of steps
batch_size = 20
input_length = X_data.shape[0] # => 100
display_cost = 500
# data placeholders
X = tf.placeholder(shape=[None, 3],dtype = tf.float32)
y = tf.placeholder(shape=[None, 1],dtype = tf.float32)
# build the model
W = tf.Variable(initial_value = tf.random_normal([3,1]))
b = tf.Variable(np.random.rand())
y_fitted = tf.add(tf.matmul(X, W), b)
# Minimize the mean squared errors
loss=tf.losses.mean_squared_error(labels=y, predictions=y_fitted)
optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
Execute in Session
# execute in Session
with tf.Session() as sess:
# initialize all variables
tf.global_variables_initializer().run()
# Train the model
for steps in range(n_steps):
mini_batch = zip(range(0, input_length, batch_size),
range(batch_size, input_length+1, batch_size))
# train data in mini-batches
for (start, end) in mini_batch:
sess.run(optimizer, feed_dict = {X: X_data[start:end],
y: y_data[start:end]})
# print training performance
if (steps+1) % display_cost == 0:
print('Step: {}'.format((steps+1)))
# evaluate loss function
cost = sess.run(loss, feed_dict = {X: X_data,
y: y_data})
print('Cost: {}'.format(cost))
# report rmse for training and test data
print('\nFinal Weight: {}'.format(W.eval()))
print('\nFinal Bias: {}'.format(b.eval()))
Output for 2 runs
# Run 1
Step: 500
Cost: 3.1569701713918263e-11
Step: 1000
Cost: 3.1569701713918263e-11
Final Weight: [[2.0000048]
[2.0000024]
[1.9999973]]
Final Bias: 0.010000854730606079
# Run 2
Step: 500
Cost: 7.017615221566187e-12
Step: 1000
Cost: 7.017615221566187e-12
Final Weight: [[1.9999975]
[1.9999989]
[1.9999999]]
Final Bias: 0.0099998963996768
Indeed, the weights and bias are approximately the same across multiple runs when the model is built on the same dataset. Also, when doing numerical computations, NumPy ndarrays are usually the preferred data format, hence the conversion using .values.
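As a small illustration of that last point, the pandas columns from the question can be converted to NumPy arrays up front (a sketch reusing the column names from the question, assuming out.csv exists as described there):
import pandas as pd

file1 = pd.read_csv('out.csv')
x_data = file1['^GSPC'].values   # pandas Series -> numpy.ndarray
y_data = file1['FB'].values      # ready to be used in a feed_dict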
I would like to perform image classification on our own large image library (millions of labeled images) with TensorFlow. I'm new to Stack Overflow, Python, and TensorFlow; I have worked my way through a few tutorials (MNIST, etc.) and got to the point where I was able to prepare a TensorFlow dataset from a dictionary containing the absolute paths to the images and the corresponding labels. However, I'm stuck at the point of using the dataset in a TensorFlow session. Here is my (example) code:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import numpy as np
import time
import mymodule # I built this module to read the images and labels
from tensorflow.python.framework import ops
from tensorflow.python.framework import dtypes
from tensorflow.contrib.data import Iterator
beginTime = time.time()
batch_size = 100
learning_rate = 0.005
max_steps = 2
NUM_CLASSES = 25
def input_parser(img_path, label):
one_hot = tf.one_hot(label, NUM_CLASSES)
img_file = tf.read_file(img_path)
img_decoded = tf.image.decode_jpeg(img_file, channels = 3)
return img_decoded, one_hot
#Import Training data (returns the dictionary with paths and labels)
train_dict = mymodule.getFileMap(labelList, imageList)
#Import Test data
test_dict = mymodule.getFileMap(labelList, imageList)
#Get train data
train_file_list, train_label_list = get_file_label_list(train_dict)
train_images_tensor = ops.convert_to_tensor(train_file_list, dtype=dtypes.string)
train_labels_tensor = ops.convert_to_tensor(train_label_list, dtype=dtypes.int64)
#Get test data
test_file_list, test_label_list = get_file_label_list(test_dict)
test_images_tensor = ops.convert_to_tensor(test_file_list, dtype=dtypes.string)
test_labels_tensor = ops.convert_to_tensor(test_label_list, dtype=dtypes.int64)
#Create TensorFlow Datset object
train_data = tf.data.Dataset.from_tensor_slices((train_images_tensor, train_labels_tensor))
test_data = tf.data.Dataset.from_tensor_slices((test_images_tensor, test_labels_tensor))
# Transform the dataset so that it contains decoded images
# and one-hot vector labels
train_data = train_data.map(input_parser)
test_data = test_data.map(input_parser)
# Batching --> How to do it right?
#train_data = train_data.batch(batch_size = 100)
#test_data = train_data.batch(batch_size = 100)
#Define input placeholders
image_size = 990*990*3
images_placeholder = tf.placeholder(tf.float32, shape=[None, image_size])
labels_placeholder = tf.placeholder(tf.int64, shape=[None])
# Define variables (these are the values we want to optimize)
weigths = tf.Variable(tf.zeros([image_size, NUM_CLASSES]))
biases = tf.Variable(tf.zeros([NUM_CLASSES]))
# Define the classifier's result
logits = tf.matmul(images_placeholder, weigths) + biases
# Define the loss function
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits, labels = labels_placeholder))
# Define the training operation
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
# Operation comparing prediction with true label
correct_prediciton = tf.equal(tf.argmax(logits, 1), labels_placeholder)
# Operation calculating the accuracy of our predictions
accuracy = tf.reduce_mean(tf.cast(correct_prediciton, tf.float32))
#Create TensorFlow Iterator object
iterator = Iterator.from_structure(train_data.output_types,
train_data.output_shapes)
next_element = iterator.get_next()
#Create two initialization ops to switch between the datasets
train_init_op = iterator.make_initializer(train_data)
test_init_op = iterator.make_initializer(test_data)
with tf.Session() as sess:
#Initialize variables
sess.run(tf.global_variables_initializer())
sess.run(train_init_op)
for _ in range(10):
try:
elem = sess.run(next_element)
print(elem)
except tf.errors.OutOfRangeError:
print("End of training datset.")
break
Following this and this tutorial, I could not solve the problem of how to use the (image and label) dataset in a TensorFlow session for training. I was able to print out the dataset by iterating through it, but wasn't able to use it for learning.
I don't understand how to access the images and labels separately after they have been merged in the train_data = tf.data.Dataset.from_tensor_slices((train_images_tensor, train_labels_tensor)) operation, as required by the second tutorial. Also, I don't know how to implement batching correctly.
What I want to do in the session is basically this (from the second tutorial):
# Generate input data batch
indices = np.random.choice(data_sets['images_train'].shape[0], batch_size)
images_batch = data_sets['images_train'][indices]
labels_batch = data_sets['labels_train'][indices]
# Periodically print out the model's current accuracy
if i % 100 == 0:
train_accuracy = sess.run(accuracy, feed_dict={
images_placeholder: images_batch, labels_placeholder: labels_batch})
print('Step {:5d}: training accuracy {:g}'.format(i, train_accuracy))
# Perform a single training step
sess.run(train_step, feed_dict={images_placeholder: images_batch,
labels_placeholder: labels_batch})
# After finishing the training, evaluate on the test set
test_accuracy = sess.run(accuracy, feed_dict={
images_placeholder: data_sets['images_test'],
labels_placeholder: data_sets['labels_test']})
print('Test accuracy {:g}'.format(test_accuracy))
endTime = time.time()
print('Total time: {:5.2f}s'.format(endTime - beginTime))
If anyone can tell me how to access the images and labels in the dataset separately and use them for training, I would be really thankful. Also, a tip on where and how to do the batching would be appreciated.
Thank you.
In your code, next_element is a tuple of two tensors, matching the structure of your datasets: i.e. it is a tuple whose first element is an image, and second element is a label. To access the individual tensors, you can do the following:
next_element = iterator.get_next()
next_image = next_element[0]
next_label = next_element[1]
# Or, in a single line:
next_image, next_label = iterator.get_next()
To batch a tf.data.Dataset, you can use the Dataset.batch() transformation. Your commented-out code for this should simply work (note that the test dataset should be batched from test_data, not train_data):
train_data = train_data.batch(batch_size=100)
test_data = test_data.batch(batch_size=100)
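Putting the two pieces together, here is a minimal sketch of training directly from the iterator instead of the placeholders. It assumes the .batch() calls above are applied before the iterator is created, and that the decoded images really are image_size = 990*990*3 values each, which decode_jpeg alone does not guarantee:
next_image, next_label = iterator.get_next()
flat_images = tf.reshape(tf.cast(next_image, tf.float32), [-1, image_size])
logits = tf.matmul(flat_images, weigths) + biases
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=next_label, logits=logits))
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_init_op)               # point the iterator at the training set
    while True:
        try:
            _, current_loss = sess.run([train_step, loss])
        except tf.errors.OutOfRangeError:  # one pass over the batched dataset
            break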
I'm trying to fit an exponentially decaying model (y = Ax^b + C) to some data but have yet to get a value other than 0 for b. I have two "working" sets of code right now: one steps through each X,Y pair, and the other attempts to use the entire [X,Y] array, but I'm not sure that I have implemented that correctly. For now I'd like it to correctly fit a curve. The linear model works fine, so I'm not sure where this is going south.
Data is here - PASTEBIN
#!/usr/bin/python
import numpy as np
import tensorflow as tf
import sys
import matplotlib.pyplot as plt
k=0
xdata= []
ydata = []
# Open the data and read it in, ignore the header.
with open('curvedata_full_formatted.csv') as f:
for line in f:
k+=1
if k==1:continue
items = line.split(',')
xdata.append(float(items[0]))
ydata.append(float(items[1]))
# Model linear regression y = A*x^B+C
# x - data to be fed into the model - 1 feature
x = tf.placeholder(tf.float32, [None, 1])
# A - training variable - 1 feature, 1 output
A = tf.Variable(tf.zeros([1,1]))
# B - training variable - 1 output
B = tf.Variable(tf.zeros([1,1]))
# C - training variable - 1 output
C = tf.Variable(tf.zeros([1]))
# x^B
xb = tf.exp(B)
# A*x^b
product = tf.mul(A,xb)
# Prediction
y = tf.add(product,C)
# Actual value ybar
y_ = tf.placeholder(tf.float32)
# Cost function sum((y_-y)**2)
cost = tf.reduce_mean(tf.square(y_-y))
# Training using Gradient Descent to minimize cost
train_step = tf.train.GradientDescentOptimizer(1*10**-9).minimize(cost)
sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)
steps = 150
for i in range(steps):
# Read in data from log file and use as x,y
for (X,Y) in zip(xdata,ydata):
#xs = np.array([[xdata]])
#ys = np.array([[ydata]])
# Train
# Feed dict x placeholder xs, y_ placeholder ys
X = np.array([[X]])
Y = np.array([[Y]])
feed = { x: X, y_: Y }
sess.run(train_step, feed_dict=feed)
sys.stdout.write("\rIteration %i " %i +"cost %.15f" % sess.run(cost, feed_dict=feed))
sys.stdout.flush()
print ''
print 'A: %f'%sess.run(A)
print 'B: %f'%sess.run(B)
print 'C: %f'%sess.run(C)
As a test, try starting the optimizer with initial values close to the expected final parameters. This test will tell you whether or not the problem is in the selection of initial parameter values.
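For example, a minimal sketch of that test on the code above would replace the zero initializers with rough guesses (the numbers below are placeholders, not values fitted to your data):
# Rough initial guesses instead of zeros (values are illustrative only)
A = tf.Variable(tf.constant([[50.0]]))    # guess for the scale A
B = tf.Variable(tf.constant([[-0.5]]))    # guess for the exponent b
C = tf.Variable(tf.constant([10.0]))      # guess for the offset C
If the fit then converges, the problem is in the initial parameter values.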
I'm relatively new to the world of TensorFlow, and pretty perplexed by how you'd actually read CSV data into usable example/label tensors in TensorFlow. The example from the TensorFlow tutorial on reading CSV data is pretty fragmented and only gets you part of the way to being able to train on CSV data.
Here's my code that I've pieced together, based off that CSV tutorial:
from __future__ import print_function
import tensorflow as tf
def file_len(fname):
with open(fname) as f:
for i, l in enumerate(f):
pass
return i + 1
filename = "csv_test_data.csv"
# setup text reader
file_length = file_len(filename)
filename_queue = tf.train.string_input_producer([filename])
reader = tf.TextLineReader(skip_header_lines=1)
_, csv_row = reader.read(filename_queue)
# setup CSV decoding
record_defaults = [[0],[0],[0],[0],[0]]
col1,col2,col3,col4,col5 = tf.decode_csv(csv_row, record_defaults=record_defaults)
# turn features back into a tensor
features = tf.stack([col1,col2,col3,col4])
print("loading, " + str(file_length) + " line(s)\n")
with tf.Session() as sess:
tf.initialize_all_variables().run()
# start populating filename queue
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
for i in range(file_length):
# retrieve a single instance
example, label = sess.run([features, col5])
print(example, label)
coord.request_stop()
coord.join(threads)
print("\ndone loading")
And here is a brief example from the CSV file I'm loading. It's pretty basic data: 4 feature columns and 1 label column:
0,0,0,0,0
0,15,0,0,0
0,30,0,0,0
0,45,0,0,0
All the code above does is print each example from the CSV file, one by one, which, while nice, is pretty darn useless for training.
What I'm struggling with here is how you'd actually turn those individual examples, loaded one-by-one, into a training dataset. For example, here's a notebook I was working on in the Udacity Deep Learning course. I basically want to take the CSV data I'm loading, and plop it into something like train_dataset and train_labels:
def reformat(dataset, labels):
dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
# Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)
I've tried using tf.train.shuffle_batch, like this, but it just inexplicably hangs:
for i in range(file_length):
# retrieve a single instance
example, label = sess.run([features, colRelevant])
example_batch, label_batch = tf.train.shuffle_batch([example, label], batch_size=file_length, capacity=file_length, min_after_dequeue=10000)
print(example, label)
So to sum up, here are my questions:
1. What am I missing about this process? It feels like there is some key intuition I'm missing about how to properly build an input pipeline.
2. Is there a way to avoid having to know the length of the CSV file? It feels pretty inelegant to have to know the number of lines you want to process (the for i in range(file_length) line of code above).
Edit:
As soon as Yaroslav pointed out that I was likely mixing up the imperative and graph-construction parts here, it started to become clearer. I was able to pull together the following code, which I think is closer to what would typically be done when training a model from CSV (excluding any model training code):
from __future__ import print_function
import numpy as np
import tensorflow as tf
import math as math
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('dataset')
args = parser.parse_args()
def file_len(fname):
with open(fname) as f:
for i, l in enumerate(f):
pass
return i + 1
def read_from_csv(filename_queue):
reader = tf.TextLineReader(skip_header_lines=1)
_, csv_row = reader.read(filename_queue)
record_defaults = [[0],[0],[0],[0],[0]]
colHour,colQuarter,colAction,colUser,colLabel = tf.decode_csv(csv_row, record_defaults=record_defaults)
features = tf.stack([colHour,colQuarter,colAction,colUser])
label = tf.stack([colLabel])
return features, label
def input_pipeline(batch_size, num_epochs=None):
filename_queue = tf.train.string_input_producer([args.dataset], num_epochs=num_epochs, shuffle=True)
example, label = read_from_csv(filename_queue)
min_after_dequeue = 10000
capacity = min_after_dequeue + 3 * batch_size
example_batch, label_batch = tf.train.shuffle_batch(
[example, label], batch_size=batch_size, capacity=capacity,
min_after_dequeue=min_after_dequeue)
return example_batch, label_batch
file_length = file_len(args.dataset) - 1
examples, labels = input_pipeline(file_length, 1)
with tf.Session() as sess:
tf.initialize_all_variables().run()
# start populating filename queue
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
try:
while not coord.should_stop():
example_batch, label_batch = sess.run([examples, labels])
print(example_batch)
except tf.errors.OutOfRangeError:
print('Done training, epoch reached')
finally:
coord.request_stop()
coord.join(threads)
I think you are mixing up imperative and graph-construction parts here. The operation tf.train.shuffle_batch creates a new queue node, and a single node can be used to process the entire dataset. So I think you are hanging because you created a bunch of shuffle_batch queues in your for loop and didn't start queue runners for them.
Normal input pipeline usage looks like this:
Add nodes like shuffle_batch to input pipeline
(optional, to prevent unintentional graph modification) finalize graph
--- end of graph construction, beginning of imperative programming --
tf.train.start_queue_runners
while(True): session.run()
To be more scalable (to avoid the Python GIL), you could generate all of your data using the TensorFlow pipeline. However, if performance is not critical, you can hook up a numpy array to an input pipeline by using slice_input_producer. Here's an example with some Print nodes to see what's going on (messages in Print go to stdout when the node is run):
tf.reset_default_graph()
num_examples = 5
num_features = 2
data = np.reshape(np.arange(num_examples*num_features), (num_examples, num_features))
print data
(data_node,) = tf.train.slice_input_producer([tf.constant(data)], num_epochs=1, shuffle=False)
data_node_debug = tf.Print(data_node, [data_node], "Dequeueing from data_node ")
data_batch = tf.train.batch([data_node_debug], batch_size=2)
data_batch_debug = tf.Print(data_batch, [data_batch], "Dequeueing from data_batch ")
sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())
sess.run(tf.initialize_local_variables())  # num_epochs above uses a local variable
tf.get_default_graph().finalize()
tf.train.start_queue_runners()
try:
while True:
print sess.run(data_batch_debug)
except tf.errors.OutOfRangeError as e:
print "No more inputs."
You should see something like this
[[0 1]
[2 3]
[4 5]
[6 7]
[8 9]]
[[0 1]
[2 3]]
[[4 5]
[6 7]]
No more inputs.
The "8, 9" numbers didn't fill up a full batch, so they didn't get produced. Also, tf.Print messages go to sys.stdout, so they show up separately in the terminal for me.
PS: a minimal example of connecting batch to a manually initialized queue is in GitHub issue 2193.
Also, for debugging purposes you might want to set a timeout on your session so that your IPython notebook doesn't hang on empty queue dequeues. I use this helper function for my sessions:
def create_session():
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.per_process_gpu_memory_fraction=0.3 # don't hog all vRAM
config.operation_timeout_in_ms=60000 # terminate on long hangs
# create interactive session to register a default session
sess = tf.InteractiveSession("", config=config)
return sess
Scalability Notes:
tf.constant inlines a copy of your data into the graph. There's a fundamental limit of 2 GB on the size of the graph definition, so that's an upper limit on the size of your data.
You could get around that limit by using v = tf.Variable and saving the data into it by running an assign op with a tf.placeholder on the right-hand side and feeding the numpy array to the placeholder via feed_dict (see the sketch after this list).
That still creates two copies of the data, so to save memory you could make your own version of slice_input_producer which operates on numpy arrays and uploads rows one at a time using feed_dict.
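A minimal sketch of that Variable/placeholder trick (the variable names are illustrative, not from the original answer):
# Load a large numpy array into a Variable without inlining it into the graph
data_initializer = tf.placeholder(dtype=data.dtype, shape=data.shape)
data_var = tf.Variable(data_initializer, trainable=False, collections=[])

sess = tf.Session()
sess.run(data_var.initializer, feed_dict={data_initializer: data})
Passing collections=[] keeps the variable out of GLOBAL_VARIABLES, so a blanket initialize_all_variables() call won't try to re-initialize it without the feed.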
Or you could try this: the code loads the Iris dataset into TensorFlow using pandas and NumPy, and a simple one-neuron output is printed in the session. Hope it helps with a basic understanding. (I haven't added one-hot encoding of the labels.)
import tensorflow as tf
import numpy
import pandas as pd
df=pd.read_csv('/home/nagarjun/Desktop/Iris.csv',usecols = [0,1,2,3,4],skiprows = [0],header=None)
d = df.values
l = pd.read_csv('/home/nagarjun/Desktop/Iris.csv',usecols = [5] ,header=None)
labels = l.values
data = numpy.float32(d)
labels = numpy.array(l,'str')
#print data, labels
#tensorflow
x = tf.placeholder(tf.float32,shape=(150,5))
x = data
w = tf.random_normal([100,150],mean=0.0, stddev=1.0, dtype=tf.float32)
y = tf.nn.softmax(tf.matmul(w,x))
with tf.Session() as sess:
print sess.run(y)
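Since the snippet above leaves out one-hot encoding, here is one possible sketch of it in plain NumPy (the label_to_index mapping and the variable names are mine, not part of the answer):
# One possible way to one-hot encode the string labels read above
label_names = sorted(set(labels.flatten()))                       # e.g. the Iris species names
label_to_index = {name: i for i, name in enumerate(label_names)}
indices = numpy.array([label_to_index[name] for name in labels.flatten()])
one_hot = numpy.eye(len(label_names), dtype=numpy.float32)[indices]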
You can use the latest tf.data API:
dataset = tf.contrib.data.make_csv_dataset(filepath)
iterator = dataset.make_initializable_iterator()
columns = iterator.get_next()
with tf.Session() as sess:
sess.run([iterator.initializer])
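To actually pull a batch of parsed rows out of that iterator, a short sketch (columns comes back as a dictionary keyed by the CSV column names):
with tf.Session() as sess:
    sess.run(iterator.initializer)
    batch = sess.run(columns)   # dict: column name -> batch of values
    print(batch)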
If anyone came here searching for a simple way to read very large, sharded CSV files with the tf.estimator API, please see my code below:
CSV_COLUMNS = ['ID','text','class']
LABEL_COLUMN = 'class'
DEFAULTS = [['x'],['no'],[0]] #Default values
def read_dataset(filename, mode, batch_size = 512):
def _input_fn(v_test=False):
# def decode_csv(value_column):
# columns = tf.decode_csv(value_column, record_defaults = DEFAULTS)
# features = dict(zip(CSV_COLUMNS, columns))
# label = features.pop(LABEL_COLUMN)
# return add_engineered(features), label
# Create list of files that match pattern
file_list = tf.gfile.Glob(filename)
# Create dataset from file list
#dataset = tf.data.TextLineDataset(file_list).map(decode_csv)
dataset = tf.contrib.data.make_csv_dataset(file_list,
batch_size=batch_size,
column_names=CSV_COLUMNS,
column_defaults=DEFAULTS,
label_name=LABEL_COLUMN)
if mode == tf.estimator.ModeKeys.TRAIN:
num_epochs = None # indefinitely
dataset = dataset.shuffle(buffer_size = 10 * batch_size)
else:
num_epochs = 1 # end-of-input after this
batch_features, batch_labels = dataset.make_one_shot_iterator().get_next()
#Begins - Uncomment for testing only -----------------------------------------------------<
if v_test == True:
with tf.Session() as sess:
print(sess.run(batch_features))
#End - Uncomment for testing only -----------------------------------------------------<
return add_engineered(batch_features), batch_labels
return _input_fn
Example usage with tf.estimator:
train_spec = tf.estimator.TrainSpec(input_fn = read_dataset(
filename = train_file,
mode = tf.estimator.ModeKeys.TRAIN,
batch_size = 128),
max_steps = num_train_steps)
TensorFlow 2.0 compatible solution: this answer may already be covered by others in the thread above, but I will provide additional links that will help the community.
dataset = tf.data.experimental.make_csv_dataset(
file_path,
batch_size=5, # Artificially small to make examples easier to show.
label_name=LABEL_COLUMN,
na_value="?",
num_epochs=1,
ignore_errors=True,
**kwargs)
For more information, please refer to this TensorFlow tutorial.
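A quick usage sketch in TF 2.x eager mode (assuming file_path and LABEL_COLUMN are set as above), iterating the dataset directly:
for features, labels in dataset.take(1):
    print("labels:", labels.numpy())
    for name, value in features.items():   # features is a dict: column name -> batch of values
        print(name, value.numpy())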
So I have this great bit of code that comes out with approximately a 93% accuracy rate on its predictions. What I'm wondering now is how to take the trained program, make it look at actual test data that doesn't have the answer column filled in, and make it fill in the answer regardless of the accuracy. Here's the code I have that predicts with a ~93% accuracy rate:
import tensorflow as tf
import numpy as np
from numpy import genfromtxt
import sklearn
# Convert to one hot
def convertOneHot(data):
y=np.array([int(i[0]) for i in data])
y_onehot=[0]*len(y)
for i,j in enumerate(y):
y_onehot[i]=[0]*(y.max() + 1)
y_onehot[i][j]=1
return (y,y_onehot)
data = genfromtxt('cs-training.csv',delimiter=',') # Training data
test_data = genfromtxt('cs-test.csv',delimiter=',') # Test data
x_train=np.array([ i[1::] for i in data])
y_train,y_train_onehot = convertOneHot(data)
x_test=np.array([ i[1::] for i in test_data])
y_test,y_test_onehot = convertOneHot(test_data)
A=data.shape[1]-1 # Number of features, Note first is y
B=len(y_train_onehot[0])
tf_in = tf.placeholder("float", [None, A]) # Features
tf_weight = tf.Variable(tf.zeros([A,B]))
tf_bias = tf.Variable(tf.zeros([B]))
tf_softmax = tf.nn.softmax(tf.matmul(tf_in,tf_weight) + tf_bias)
# Training via backpropagation
tf_softmax_correct = tf.placeholder("float", [None,B])
tf_cross_entropy = -tf.reduce_sum(tf_softmax_correct*tf.log(tf_softmax))
# Train using tf.train.GradientDescentOptimizer
tf_train_step = tf.train.GradientDescentOptimizer(0.01).minimize(tf_cross_entropy)
# Add accuracy checking nodes
tf_correct_prediction = tf.equal(tf.argmax(tf_softmax,1), tf.argmax(tf_softmax_correct,1))
tf_accuracy = tf.reduce_mean(tf.cast(tf_correct_prediction, "float"))
saver = tf.train.Saver([tf_weight,tf_bias])
# Initialize and run
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
print("...")
# Run the training
for i in range(100):
sess.run(tf_train_step, feed_dict={tf_in: x_train, tf_softmax_correct: y_train_onehot})
#Print accuracy
result = sess.run(tf_accuracy, feed_dict={tf_in: x_test, tf_softmax_correct: y_test_onehot})
print result
Now I have the actual test set cs-test-actual.csv where the first column is entirely empty and I need to fill it in with a predicted 1 or 0. How do I go about doing that?
The program above doesn't appear to be saving the trained session. I think you want to do this in two steps:
Train and save the session.
Restore the saved session, and run the test data through it.
Step 1:
#!/usr/bin/env python
import tensorflow as tf
import numpy as np
from numpy import genfromtxt
import sklearn
# Convert to one hot
def convertOneHot(data):
y=np.array([int(i[0]) for i in data])
y_onehot=[0]*len(y)
for i,j in enumerate(y):
y_onehot[i]=[0]*(y.max() + 1)
y_onehot[i][j]=1
return (y,y_onehot)
# Build Example Data is CSV format, but use Iris data
from sklearn import datasets
from sklearn.model_selection import train_test_split
def buildDataFromIris():
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.33, random_state=42)
f=open('cs-training.csv','w')
for i,j in enumerate(X_train):
k=np.append(np.array(y_train[i]),j )
f.write(",".join([str(s) for s in k]) + '\n')
f.close()
f=open('cs-test.csv','w')
for i,j in enumerate(X_test):
k=np.append(np.array(y_test[i]),j )
f.write(",".join([str(s) for s in k]) + '\n')
f.close()
# Recreate logging and save dir
# Seems the tensorflow won't always overwrite
import shutil, os, sys
TMPDir='./tensorTMP'
try:
shutil.rmtree(TMPDir)
except:
print "Tmp Dir did not exist...that's okay"
os.mkdir(TMPDir, 0755 )
# Populate the data
buildDataFromIris()
data = genfromtxt('cs-training.csv',delimiter=',') # Training data
test_data = genfromtxt('cs-test.csv',delimiter=',') # Test data
x_train=np.array([ i[1::] for i in data])
y_train,y_train_onehot = convertOneHot(data)
x_test=np.array([ i[1::] for i in test_data])
y_test,y_test_onehot = convertOneHot(test_data)
A=data.shape[1]-1 # Number of features, Note first is y
B=len(y_train_onehot[0])
tf_in = tf.placeholder("float", [None, A]) # Features
tf_weight = tf.Variable(tf.zeros([A,B]))
tf_bias = tf.Variable(tf.zeros([B]))
tf_softmax = tf.nn.softmax(tf.matmul(tf_in,tf_weight) + tf_bias)
# Training via backpropagation
tf_softmax_correct = tf.placeholder("float", [None,B])
tf_cross_entropy = -tf.reduce_sum(tf_softmax_correct*tf.log(tf_softmax))
# Train using tf.train.GradientDescentOptimizer
tf_train_step = tf.train.GradientDescentOptimizer(0.01).minimize(tf_cross_entropy)
# Add accuracy checking nodes
tf_correct_prediction = tf.equal(tf.argmax(tf_softmax,1), tf.argmax(tf_softmax_correct,1))
tf_accuracy = tf.reduce_mean(tf.cast(tf_correct_prediction, "float"))
saver = tf.train.Saver([tf_weight,tf_bias])
# Initialize and run
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
THRESHOLD = 0.98
saved = False
print("...")
# Run the training
for i in range(100):
sess.run(tf_train_step, feed_dict={tf_in: x_train, tf_softmax_correct: y_train_onehot})
result = sess.run(tf_accuracy, feed_dict={tf_in: x_test, tf_softmax_correct: y_test_onehot})
# If it's well trained on this iteration, save it. We just need one save.
if result > THRESHOLD and saved == False:
saved = True
print "saving result {}".format(result)
saver.save(sess,TMPDir +"/savedSess")
The only modifications made were generating sample data using Iris and establishing a THRESHOLD (an accuracy cutoff) for saving the session. If the accuracy is over that THRESHOLD, the session is saved. After running step one, the model should be trained and saved.
Step 2:
Restore the saved session, and run the test data through it.
#!/usr/bin/env python
import tensorflow as tf
import numpy as np
from numpy import genfromtxt
import sklearn
# Convert to one hot
def convertOneHot(data):
y=np.array([int(i[0]) for i in data])
y_onehot=[0]*len(y)
for i,j in enumerate(y):
y_onehot[i]=[0]*(y.max() + 1)
y_onehot[i][j]=1
return (y,y_onehot)
data = genfromtxt('cs-training.csv',delimiter=',') # Training data
test_data = genfromtxt('cs-test.csv',delimiter=',') # Test data
x_train=np.array([ i[1::] for i in data])
y_train,y_train_onehot = convertOneHot(data)
x_test=np.array([ i[1::] for i in test_data])
y_test,y_test_onehot = convertOneHot(test_data)
A=data.shape[1]-1 # Number of features, Note first is y
B=len(y_train_onehot[0])
tf_in = tf.placeholder("float", [None, A]) # Features
tf_weight = tf.Variable(tf.zeros([A,B]))
tf_bias = tf.Variable(tf.zeros([B]))
tf_softmax = tf.nn.softmax(tf.matmul(tf_in,tf_weight) + tf_bias)
# Training via backpropagation
tf_softmax_correct = tf.placeholder("float", [None,B])
tf_cross_entropy = -tf.reduce_sum(tf_softmax_correct*tf.log(tf_softmax))
# Train using tf.train.GradientDescentOptimizer
tf_train_step = tf.train.GradientDescentOptimizer(0.01).minimize(tf_cross_entropy)
# Add accuracy checking nodes
tf_correct_prediction = tf.equal(tf.argmax(tf_softmax,1), tf.argmax(tf_softmax_correct,1))
tf_accuracy = tf.reduce_mean(tf.cast(tf_correct_prediction, "float"))
saver = tf.train.Saver([tf_weight,tf_bias])
# Initialize and run
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
TMPDir='./tensorTMP'
saver.restore(sess, TMPDir + '/savedSess')
ans = sess.run(tf_softmax, feed_dict={tf_in: x_test, tf_softmax_correct: y_test_onehot})
print ans
Note, your output will look like the following...
[[ 6.17585704e-02 8.63590300e-01 7.46511072e-02]
[ 9.98804331e-01 1.19561062e-03 3.25832108e-13]
[ 1.52018686e-07 4.49650863e-04 9.99550164e-01]