The picture above shows the distribution of the binary output from a neural network that I wrote (y-axis is the percentage of Yes outputs, x-axis is the iteration).
I trained the NN on a binary classification dataset with labels 0 = No and 1 = Yes.
However much I try to change the parameters of the NN (different layer counts, layer sizes, activation functions), its output keeps oscillating between all zeros and all ones.
I am using TensorFlow to build the NN, with this as the cost function:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=prediction))
I am 90% sure that the NN is sound in terms of architecture, as it is the same one I used for the MNIST dataset. I have also tried it on many other datasets and it seems able to classify them well.
The training dataset contains 58% No labels and 42% Yes labels.
I suspect that the features fed into the NN are not informative enough for it to make a decision, which is why defaulting to a single answer works out better for it.
What kind of cost function would better penalize this behavior of the network?
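For reference, one option that is sometimes suggested for imbalanced labels is to weight the cross-entropy by class, so that collapsing onto the majority class costs more. A minimal sketch (the 58/42 split is taken from the description above; the weighting scheme is only illustrative, not necessarily the fix here):
# prediction: logits of shape [batch, 2]; y: one-hot labels of shape [batch, 2]
# Per-class weights, e.g. inverse class frequency (58% No / 42% Yes).
class_weights = tf.constant([1.0 / 0.58, 1.0 / 0.42])
per_example_ce = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=prediction)
per_example_weight = tf.reduce_sum(y * class_weights, axis=1)
cost = tf.reduce_mean(per_example_weight * per_example_ce)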
Code:
import tensorflow as tf

# features, label, val_features, val_label, nextbatch and correct are assumed
# to be defined elsewhere in the script.

def neural_network_model(data):
    w1 = tf.Variable(tf.random_normal([n_input, n_nodes_hl1]), name="w1", collections=["weights", tf.GraphKeys.GLOBAL_VARIABLES])
    b1 = tf.Variable(tf.random_normal([n_nodes_hl1]), name="b1", collections=["bias", tf.GraphKeys.GLOBAL_VARIABLES])
    w2 = tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2]), name="w2", collections=["weights", tf.GraphKeys.GLOBAL_VARIABLES])
    b2 = tf.Variable(tf.random_normal([n_nodes_hl2]), name="b2", collections=["bias", tf.GraphKeys.GLOBAL_VARIABLES])
    wout = tf.Variable(tf.random_normal([n_nodes_hl2, n_output]), name="wout", collections=["weights", tf.GraphKeys.GLOBAL_VARIABLES])
    bout = tf.Variable(tf.random_normal([n_output]), name="bout", collections=["bias", tf.GraphKeys.GLOBAL_VARIABLES])

    l1 = tf.add(tf.matmul(data, w1), b1)
    l1 = tf.nn.relu(l1)
    l2 = tf.add(tf.matmul(l1, w2), b2)
    l2 = tf.nn.relu(l2)
    output = tf.matmul(l2, wout) + bout
    return output

n_input = len(features[1, :])
n_nodes_hl1 = 2000
n_nodes_hl2 = 2000
n_output = 2
batch_size = 100
iterations = 10000000

x = tf.placeholder("float", name="x")
y = tf.placeholder("float", name="y")

prediction = neural_network_model(x)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=prediction))
optimizer = tf.train.AdamOptimizer().minimize(cost)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for iteration in range(iterations):
        iter_x, iter_y = nextbatch(features, label, batch_size, iteration)
        _, c = sess.run([optimizer, cost], feed_dict={x: iter_x, y: iter_y})
        if iteration % 500 == 0:
            print("Iter:", iteration, "Cost:", c)
            print("Count:", correct.eval({x: val_features, y: val_label}) / len(val_label))
EDIT:
After shuffling the time-series input as suggested by @FlorentinHennecker, the oscillation seems to be reduced.
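For reference, a minimal sketch of that shuffling step (assuming features and label are NumPy arrays of the same length):
import numpy as np

# Shuffle features and labels with the same permutation so pairs stay aligned.
perm = np.random.permutation(len(features))
features = features[perm]
label = label[perm]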
Related
I am new to TensorFlow and I am tasked with designing a feedforward neural network that consists of an input layer, one hidden perceptron layer of 10 neurons, and an output softmax layer. Assume a learning rate of 0.01, L2 regularization with a weight decay parameter of 0.000001, and a batch size of 32.
I would like to know if there is any way to check that the network I have created is what I intend to create, for example a graph showing the nodes.
The following is my attempt at the task, but I am not sure if it is correct.
import math
import tensorflow as tf
import numpy as np
import pylab as plt

# scale data
def scale(X, X_min, X_max):
    return (X - X_min) / (X_max - X_min)

def tfvariables(start_nodes, end_nodes):
    W = tf.Variable(tf.truncated_normal([start_nodes, end_nodes], stddev=1.0 / math.sqrt(float(start_nodes))))
    b = tf.Variable(tf.zeros([end_nodes]))
    return W, b

NUM_FEATURES = 36
NUM_CLASSES = 6

learning_rate = 0.01
beta = 10 ** -6
epochs = 10000
batch_size = 32
num_neurons = 10
seed = 10
np.random.seed(seed)

# read train data
train_input = np.loadtxt('sat_train.txt', delimiter=' ')
trainX, train_Y = train_input[:, :36], train_input[:, -1].astype(int)
trainX = scale(trainX, np.min(trainX, axis=0), np.max(trainX, axis=0))

# There are 6 class labels: 1, 2, 3, 4, 5, 7
train_Y[train_Y == 7] = 6
trainY = np.zeros((train_Y.shape[0], NUM_CLASSES))
trainY[np.arange(train_Y.shape[0]), train_Y - 1] = 1  # one-hot matrix

# experiment with small datasets
trainX = trainX[:1000]
trainY = trainY[:1000]
n = trainX.shape[0]

# Create the model
x = tf.placeholder(tf.float32, [None, NUM_FEATURES])
y_ = tf.placeholder(tf.float32, [None, NUM_CLASSES])

# Build the graph for the deep net
W1, b1 = tfvariables(NUM_FEATURES, num_neurons)
W2, b2 = tfvariables(num_neurons, NUM_CLASSES)

logits_1 = tf.matmul(x, W1) + b1
perceptron_layer = tf.nn.sigmoid(logits_1)
logits_2 = tf.matmul(perceptron_layer, W2) + b2

cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_, logits=logits_2)
# Standard loss
loss = tf.reduce_mean(cross_entropy)
# Loss function with L2 regularization with beta
regularizers = tf.nn.l2_loss(W1) + tf.nn.l2_loss(W2)
loss = tf.reduce_mean(loss + beta * regularizers)

# Create the gradient descent optimizer with the given learning rate.
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.minimize(cross_entropy)

correct_prediction = tf.cast(tf.equal(tf.argmax(logits_2, 1), tf.argmax(y_, 1)), tf.float32)
accuracy = tf.reduce_mean(correct_prediction)

config = tf.ConfigProto()
config.gpu_options.allow_growth = True

with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())

    train_acc = []
    train_loss = []
    for i in range(epochs):
        train_op.run(feed_dict={x: trainX, y_: trainY})
        train_acc.append(accuracy.eval(feed_dict={x: trainX, y_: trainY}))
        train_loss.append(loss.eval(feed_dict={x: trainX, y_: trainY}))
        if i % 500 == 0:
            print('iter %d: accuracy %g loss %g' % (i, train_acc[i], train_loss[i]))

# plot learning curves
plt.figure(1)
plt.plot(range(epochs), train_acc)
plt.xlabel(str(epochs) + ' iterations')
plt.ylabel('Train accuracy')

# plot learning curves
plt.figure(2)
plt.plot(range(epochs), train_loss)
plt.xlabel(str(epochs) + ' iterations')
plt.ylabel('Train loss')
plt.show()
You can use TensorBoard to visualize the graph you created. Basically, you have to follow a few steps to do this:
declare a writer as writer = tf.summary.FileWriter('PATH/TO/A/LOGDIR')
add the graph to the writer with writer.add_graph(sess.graph), where sess is the current tf.Session() in which you execute the graph
possibly you have to call writer.flush() to write it to disk immediately
Note that you have to add these lines AFTER building your graph.
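Putting those steps together, a minimal sketch (the log directory path is just a placeholder):
import tensorflow as tf

# ... build your graph here (placeholders, layers, loss, train_op) ...

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Write the graph definition so TensorBoard can render it.
    writer = tf.summary.FileWriter('PATH/TO/A/LOGDIR')
    writer.add_graph(sess.graph)
    writer.flush()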
You can view the graph by executing this command in your shell:
tensorboard --logdir=PATH/TO/A/LOGDIR
Then you are presented with an address (usually something like localhost:6006) at which you can view the graph in your browser (Chrome and Firefox are guaranteed to work).
TensorBoard (shipped with TensorFlow) is a useful tool.
Use tf.summary.FileWriter to write the graph into a folder and run tensorboard from the corresponding directory.
Check the following links:
https://www.tensorflow.org/guide/graphs
https://www.tensorflow.org/guide/summaries_and_tensorboard
After playing with TensorFlow, I was finally able to make my code run, but the loss seems to go negative, even with few epochs. Why is this happening? I have made simple neural nets with NumPy only and never had this problem; can someone explain what I am doing wrong?
Here is the input file if anyone wants to test it: input file
Additional information: the labels are not one-hot arrays. Is my loss function the correct one, given that the labels are not one-hot? I followed a tutorial that used one-hot labels.
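For reference, when labels are integer class indices rather than one-hot vectors, TensorFlow also provides a sparse variant of the softmax cross-entropy. A minimal sketch, assuming integer labels of shape [batch_size] (the names here are illustrative):
# int_labels: int32/int64 tensor of shape [batch_size] with values in [0, n_classes)
# logits:     float tensor of shape [batch_size, n_classes]
cost = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=int_labels, logits=logits))
The full code in question is below.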
import tensorflow as tf
import numpy as np

data = np.load("test_data.npz")
trng_input = np.array(data['Input'], dtype=np.float64)
trng_output = np.array(data['Output'], dtype=np.float64)

nhl1 = 12
nhl2 = 8
n_classes = 4

x = tf.placeholder(dtype=tf.float64, shape=[len(trng_input), 24])
y = tf.placeholder(dtype=tf.float64, shape=[len(trng_output), n_classes])

def NN(data):
    hl1 = {"weights": tf.Variable(tf.random_normal([24, nhl1], dtype=tf.float64)),
           "biases": tf.Variable(tf.random_normal([nhl1], dtype=tf.float64))}

    hl2 = {"weights": tf.Variable(tf.random_normal([nhl1, nhl2], dtype=tf.float64)),
           "biases": tf.Variable(tf.random_normal([nhl2], dtype=tf.float64))}

    output_layer = {"weights": tf.Variable(tf.random_normal([nhl2, n_classes], dtype=tf.float64)),
                    "biases": tf.Variable(tf.random_normal([n_classes], dtype=tf.float64))}

    l1 = tf.add(tf.matmul(data, hl1["weights"]), hl1["biases"])
    l1 = tf.nn.relu(l1)

    l2 = tf.add(tf.matmul(l1, hl2["weights"]), hl2["biases"])
    l2 = tf.nn.relu(l2)

    output = tf.add(tf.matmul(l2, output_layer["weights"]), output_layer["biases"])
    output = tf.nn.relu(output)

    return output

def train(data, epochs):
    prediction = NN(data)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=prediction, labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(epochs):
            _, c = sess.run([optimizer, cost], feed_dict={x: trng_input, y: trng_output})
            if not epoch % 20:
                print(F"Epoch {epoch} completed out of {epochs}. Loss:{c}")

        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, "float"))
        Eval = accuracy.eval({x: trng_input, y: trng_output})
        print(F"Accuracy:{Eval}")

train(trng_input, 200)
I am new to TensorFlow and I am trying to implement a simple feed-forward network for regression, just for learning purposes. The complete executable code is below.
The regression mean squared error is around 6, which is quite large. It is a little unexpected because the function to regress is linear and simple, 2*x + y, and I expected better performance.
I am asking for help to check whether I did anything wrong in the code. I carefully checked the matrix dimensions, so that should be fine, but it is possible that I misunderstood something and the network or the session is not properly configured. For example, should I run the training step multiple times instead of just once (the code below enclosed by the # TRAINING markers)? I see that in some examples data is fed in piece by piece and training runs progressively, whereas I run training just once and feed in all the data.
If the code is fine, maybe this is a modeling issue, but I really would not expect to need a complicated network for such a simple regression.
import tensorflow as tf
import numpy as np
from sklearn.metrics import mean_squared_error
# inputs are points from a 100x100 grid in domain [-2,2]x[-2,2], total 10000 points
lsp = np.linspace(-2,2,100)
gridx,gridy = np.meshgrid(lsp,lsp)
inputs = np.dstack((gridx,gridy))
inputs = inputs.reshape(-1, inputs.shape[-1]) # reshapes the grid into a 10000x2 matrix
feature_size = inputs.shape[1] # feature_size is 2, features are the 2D coordinates of each point
input_size = inputs.shape[0] # input_size is 10000
# a simple function f(x)=2*x[0]+x[1] to regress
f = lambda x: 2 * x[0] + x[1]
label_size = 1
labels = f(inputs.transpose()).reshape(-1,1) # reshapes labels as a column vector
ph_inputs = tf.placeholder(tf.float32, shape=(None, feature_size), name='inputs')
ph_labels = tf.placeholder(tf.float32, shape=(None, label_size), name='labels')
# just one hidden layer with 16 units
hid1_size = 16
w1 = tf.Variable(tf.random_normal([hid1_size, feature_size], stddev=0.01), name='w1')
b1 = tf.Variable(tf.random_normal([hid1_size, label_size]), name='b1')
y1 = tf.nn.relu(tf.add(tf.matmul(w1, tf.transpose(ph_inputs)), b1))
# the output layer
wo = tf.Variable(tf.random_normal([label_size, hid1_size], stddev=0.01), name='wo')
bo = tf.Variable(tf.random_normal([label_size, label_size]), name='bo')
yo = tf.transpose(tf.add(tf.matmul(wo, y1), bo))
# defines optimizer and predictor
lr = tf.placeholder(tf.float32, shape=(), name='learning_rate')
loss = tf.losses.mean_squared_error(ph_labels,yo)
optimizer = tf.train.GradientDescentOptimizer(lr).minimize(loss)
predictor = tf.identity(yo)
# TRAINING
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
_, c = sess.run([optimizer, loss], feed_dict={lr:0.05, ph_inputs: inputs, ph_labels: labels})
# TRAINING
# gets the regression results
predictions = np.zeros((input_size,1))
for i in range(input_size):
    predictions[i] = sess.run(predictor, feed_dict={ph_inputs: inputs[i, None]}).squeeze()
# prints regression MSE
print(mean_squared_error(predictions, labels))
You're right: you identified the problem yourself.
The problem is, in fact, that you're running the optimization step only once. Hence you're doing a single update step of your network parameters, and therefore the cost won't decrease.
I just changed the training section of your code to make it work as expected (100 training steps):
# TRAINING
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

for i in range(100):
    _, c = sess.run(
        [optimizer, loss],
        feed_dict={
            lr: 0.05,
            ph_inputs: inputs,
            ph_labels: labels
        })
    print("Train step {} loss value {}".format(i, c))
# TRAINING
and at the end of training I get:
Train step 99 loss value 0.04462708160281181
0.044106700712455045
I am trying to write a two-layer neural network to train a class labeler. The input to the network is a 150-feature list of about 1000 examples; all features on all examples have been L2 normalized.
I only have two outputs, and they should be disjoint: I am just attempting to predict whether the example is a one or a zero.
My code is relatively simple; I am feeding the input data into the hidden layer, and then the hidden layer into the output. As I really just want to see this working in action, I am training on the entire dataset at each step.
My code is below. Based on the other NN implementations I have referred to, I believe that the performance of this network should improve over time. However, regardless of the number of epochs I set, I get back an accuracy of about 20%. The accuracy does not change when the number of steps is changed, so I don't believe my weights and biases are being updated.
Is there something obvious I am missing with my model? Thanks!
import numpy as np
import tensorflow as tf

sess = tf.InteractiveSession()

# generate data
np.random.seed(10)
inputs = np.random.normal(size=[1000, 150]).astype('float32') * 1.5
label = np.round(np.random.uniform(low=0, high=1, size=[1000, 1]) * 0.8)
reverse_label = 1 - label
labels = np.append(label, reverse_label, 1)

# parameters
learn_rate = 0.01
epochs = 200
n_input = 150
n_hidden = 75
n_output = 2

# set weights/biases
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
b0 = tf.Variable(tf.truncated_normal([n_hidden]))
b1 = tf.Variable(tf.truncated_normal([n_output]))
w0 = tf.Variable(tf.truncated_normal([n_input, n_hidden]))
w1 = tf.Variable(tf.truncated_normal([n_hidden, n_output]))

# step function
def returnPred(x, w0, w1, b0, b1):
    z1 = tf.add(tf.matmul(x, w0), b0)
    a2 = tf.nn.relu(z1)
    z2 = tf.add(tf.matmul(a2, w1), b1)
    h = tf.nn.relu(z2)
    return h  # return the output-layer response vector

y_ = returnPred(x, w0, w1, b0, b1)  # predict operation

loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=y_, labels=y)  # calculate loss between prediction and actual
model = tf.train.GradientDescentOptimizer(learning_rate=learn_rate).minimize(loss)  # apply gradient descent based on loss

init = tf.global_variables_initializer()
tf.Session = sess
sess.run(init)  # initialize graph

for step in range(0, epochs):
    sess.run(model, feed_dict={x: inputs, y: labels})  # train model

correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(sess.run(accuracy, feed_dict={x: inputs, y: labels}))  # print accuracy
I changed your optimizer to AdamOptimizer (in many cases it performs better than GradientDescentOptimizer).
I also played a bit with the parameters. In particular, I used a smaller standard deviation for the variable initialization, decreased the learning rate (as your loss was unstable and "jumped around"), and increased the number of epochs (as I noticed that your loss continued to decrease).
I also reduced the size of the hidden layer. It is harder to train networks with large hidden layers when you don't have much data.
Regarding your loss, it is better to apply tf.reduce_mean to it so that the loss is a scalar. In addition, following the answer of ml4294, I used softmax instead of sigmoid, so the loss looks like:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_,labels=y))
The code below achieves an accuracy of around 99.9% on the training data:
import numpy as np
import tensorflow as tf

sess = tf.InteractiveSession()

# generate data
np.random.seed(10)
inputs = np.random.normal(size=[1000, 150]).astype('float32') * 1.5
label = np.round(np.random.uniform(low=0, high=1, size=[1000, 1]) * 0.8)
reverse_label = 1 - label
labels = np.append(label, reverse_label, 1)

# parameters
learn_rate = 0.002
epochs = 400
n_input = 150
n_hidden = 60
n_output = 2

# set weights/biases
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
b0 = tf.Variable(tf.truncated_normal([n_hidden], stddev=0.2, seed=0))
b1 = tf.Variable(tf.truncated_normal([n_output], stddev=0.2, seed=0))
w0 = tf.Variable(tf.truncated_normal([n_input, n_hidden], stddev=0.2, seed=0))
w1 = tf.Variable(tf.truncated_normal([n_hidden, n_output], stddev=0.2, seed=0))

# step function
def returnPred(x, w0, w1, b0, b1):
    z1 = tf.add(tf.matmul(x, w0), b0)
    a2 = tf.nn.relu(z1)
    z2 = tf.add(tf.matmul(a2, w1), b1)
    h = tf.nn.relu(z2)
    return h  # return the output-layer response vector

y_ = returnPred(x, w0, w1, b0, b1)  # predict operation

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_, labels=y))  # calculate loss between prediction and actual
model = tf.train.AdamOptimizer(learning_rate=learn_rate).minimize(loss)  # apply gradient descent based on loss

init = tf.global_variables_initializer()
tf.Session = sess
sess.run(init)  # initialize graph

for step in range(0, epochs):
    sess.run([model, loss], feed_dict={x: inputs, y: labels})  # train model

correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(sess.run(accuracy, feed_dict={x: inputs, y: labels}))  # print accuracy
Just a suggestion in addition to the answer provided by Miriam Farber:
You use a multi-dimensional output label ([0., 1.]) for the classification. I suggest using the softmax cross entropy tf.nn.softmax_cross_entropy_with_logits() instead of the sigmoid cross entropy, since you assume the outputs to be disjoint (see softmax on Wikipedia). I achieved much faster convergence with this small modification.
This should also improve your performance once you decide to increase your output dimensionality from 2 to a higher number.
I guess you have some problem here:
loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=y_,labels=y) # calculate loss between prediction and actual
It should look something like this:
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_,labels=y))
I didn't look at your code much, so if this doesn't work out you can check the Udacity deep learning course or forums; they have good examples of what you are trying to do.
GL
My bio dataset has 20K rows and 170 features. I'm doing DNN regression to predict bioactivity (a one-unit output layer with linear activation and two hidden layers). It ran very slowly on my CPU and produced a really bad r-squared (negative). Then I ran it with skflow using the same network architecture. It was much faster (more than 100x) and the r2 was much better than the previous run (r2 = 0.3), although still not a great result. Does anyone know why? Is anything wrong with my code? What is the difference between my code and the underlying skflow code? Is my loss function defined correctly?
Help is very much appreciated.
Below is the code:
# with scikit flow
dnn_reg = skflow.TensorFlowDNNRegressor(hidden_units=[200,500], steps=3000, learning_rate=0.5)
dnn_reg.fit(x_train, y_train)
pred_train = dnn_reg.predict (x_train)
pred_valid = dnn_reg.predict (x_valid)
print ('r-square for training set', r2_score(y_train, pred_train))
print ('r-square for validation set',r2_score(y_valid, pred_valid))
# tensorflow code
n_samples = 15000
n_features = 171
batch_size = 1000
num_batch = n_samples / batch_size
hidden1 = 200
hidden2 = 100
learning_rate = 0.01
n_epoch = 3000

graph = tf.Graph()
with graph.as_default():
    # constants and placeholders
    tf_train_data = tf.placeholder(tf.float32, shape=(batch_size, n_features))
    tf_train_act = tf.placeholder(tf.float32, shape=(batch_size))
    tf_valid_data = tf.constant(x_valid.astype(np.float32))

    # variables
    w1 = tf.Variable(tf.truncated_normal([n_features, hidden1]), name='weight1')
    b1 = tf.Variable(tf.zeros([hidden1]), name='bias1')
    w2 = tf.Variable(tf.truncated_normal([hidden1, hidden2]), name='weight2')
    b2 = tf.Variable(tf.zeros([hidden2]), name='bias2')
    w3 = tf.Variable(tf.truncated_normal([hidden2, 1]), name='weight3')
    b3 = tf.Variable(tf.zeros([1]), name='bias3')

    # parameter histograms
    w1_hist = tf.histogram_summary('weight_input', w1)
    w2_hist = tf.histogram_summary('weight2', w2)
    w3_hist = tf.histogram_summary('weight3', w3)
    b1_hist = tf.histogram_summary('bias1', b1)
    b2_hist = tf.histogram_summary('bias2', b2)
    b3_hist = tf.histogram_summary('bias3', b3)
    # y_hist = tf.histogram_summary('y', y_train)

    # training computation
    def forward_prop(input):
        with tf.name_scope('hidden_1') as scope:
            h1 = tf.nn.relu(tf.matmul(input, w1) + b1)
        with tf.name_scope('hidden_2') as scope:
            h2 = tf.nn.relu(tf.matmul(h1, w2) + b2)
        with tf.name_scope('output') as scope:
            output = tf.matmul(h2, w3) + b3
        return output

    # forward propagation
    output = forward_prop(tf_train_data)
    with tf.name_scope('cost') as scope:
        loss = tf.sqrt(tf.reduce_mean(tf.square(tf.sub(tf_train_act, output))))
        cost_summary = tf.scalar_summary('cost', loss)

    # optimizer
    with tf.name_scope('train') as scope:
        optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)

    # predictions
    train_prediction = output
    valid_prediction = forward_prop(tf_valid_data)

with tf.Session(graph=graph) as session:
    session.run(tf.initialize_all_variables())
    print('initialized')
    merged = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter('./logs/log1', session.graph)

    for epoch in range(n_epoch):
        mini = np.array_split(range(y_train.shape[0]), num_batch)
        for idx in mini[:-1]:
            batch_x = x_train[idx]
            batch_y = y_train[idx]
            feed_dict = {tf_train_data: batch_x, tf_train_act: batch_y}
            _, l, pred_train = session.run([optimizer, loss, output], feed_dict=feed_dict)

        if epoch % 100 == 0:
            print('minibatch loss at step %d: %f' % (epoch, l))
            print('minibatch r2: %0.1f' % r2_score(batch_y, pred_train))
            print('validation r2: %0.1f' % r2_score(y_valid, valid_prediction.eval()))
There are a lot of parameters that differ between your TensorFlowDNNRegressor and your vanilla TensorFlow model, including:
hidden2 = 100
learning_rate=0.01
batch_size=1000, while the default batch_size for TensorFlowDNNRegressor is 32. I think that's the main reason why TensorFlowDNNRegressor runs much faster.
Also, TensorFlowDNNRegressor uses SGD as its default optimizer.
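For an apples-to-apples comparison, one option is to pass the same hyperparameters to the estimator. A rough sketch, assuming this version of skflow exposes batch_size and an optimizer name as constructor arguments alongside the ones already used above (not verified against your installation):
# Hypothetical configuration mirroring the hand-written model:
# hidden layers of 200 and 100 units, Adagrad, learning rate 0.01, batch size 1000.
dnn_reg = skflow.TensorFlowDNNRegressor(
    hidden_units=[200, 100],
    steps=3000,
    learning_rate=0.01,
    batch_size=1000,       # assumption: batch_size is configurable (default 32)
    optimizer='Adagrad')   # assumption: the optimizer can be selected by name
dnn_reg.fit(x_train, y_train)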