I am running this code (https://github.com/ayu-22/BPPNet-Back-Projected-Pyramid-Network/blob/master/Single_Image_Dehazing.ipynb) on a custom dataset but I am running into this error.
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [1, 512, 4, 4]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
Please refer to the code link above for clarification of where the error is occurring.
I am running this model on a custom dataset; the data loader part is pasted below.
import torchvision.transforms as transforms

train_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    # transforms.RandomResizedCrop(256),
    # transforms.RandomHorizontalFlip(),
    # transforms.ColorJitter(),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])
class Flare(Dataset):
    def __init__(self, flare_dir, wf_dir, transform=None):
        self.flare_dir = flare_dir
        self.wf_dir = wf_dir
        self.transform = transform
        self.flare_img = os.listdir(flare_dir)
        self.wf_img = os.listdir(wf_dir)

    def __len__(self):
        return len(self.flare_img)

    def __getitem__(self, idx):
        f_img = Image.open(os.path.join(self.flare_dir, self.flare_img[idx])).convert("RGB")
        for i in self.wf_img:
            if (self.flare_img[idx].split('.')[0][4:] == i.split('.')[0]):
                wf_img = Image.open(os.path.join(self.wf_dir, i)).convert("RGB")
                break
        f_img = self.transform(f_img)
        wf_img = self.transform(wf_img)
        return f_img, wf_img
flare_dir = '../input/flaredataset/Flare/Flare_img'
wf_dir = '../input/flaredataset/Flare/Without_Flare_'
flare_img = os.listdir(flare_dir)
wf_img = os.listdir(wf_dir)
wf_img.sort()
flare_img.sort()
print(wf_img[0])
train_ds = Flare(flare_dir, wf_dir, train_transform)
train_loader = torch.utils.data.DataLoader(dataset=train_ds,
                                           batch_size=BATCH_SIZE,
                                           shuffle=True)
To get a better idea of the dataset class, you can compare it with the one in the notebook linked above.
Your code is getting stuck in the backpropagation step of your GAN. You have defined your backward pass as follows:
def backward(self, unet_loss, dis_loss):
    dis_loss.backward(retain_graph=True)
    self.dis_optimizer.step()
    unet_loss.backward()
    self.unet_optimizer.step()
So in your backward pass you first propagate dis_loss, which combines the discriminator and adversarial losses, and only afterwards propagate unet_loss, which combines the UNet, SSIM and content losses. But unet_loss is also connected to the discriminator's output, so PyTorch raises this error: the dis_loss optimizer step modifies the discriminator's weights in place before the backward graph for unet_loss has been used. I would recommend changing the code as follows:
def backward(self, unet_loss, dis_loss):
    dis_loss.backward(retain_graph=True)
    unet_loss.backward()
    self.dis_optimizer.step()
    self.unet_optimizer.step()
This should get your training running! You can still experiment with retain_graph=True.
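For reference, here is a minimal, self-contained sketch of the same pattern; the two Linear layers and the losses below are made-up stand-ins, not the actual BPPNet modules. The point is simply that every backward() runs before any optimizer step when the losses share parts of the graph.
import torch

gen = torch.nn.Linear(4, 4)   # stands in for the UNet / generator
dis = torch.nn.Linear(4, 1)   # stands in for the discriminator
gen_opt = torch.optim.Adam(gen.parameters())
dis_opt = torch.optim.Adam(dis.parameters())

x = torch.randn(8, 4)
fake = gen(x)
dis_loss = dis(fake.detach()).mean()   # discriminator-side loss
unet_loss = -dis(fake).mean()          # generator-side loss, also goes through dis

dis_loss.backward(retain_graph=True)
unet_loss.backward()                   # both backward passes first...
dis_opt.step()                         # ...then both optimizer steps
gen_opt.step()
Calling dis_opt.step() between the two backward passes would modify the discriminator's parameters in place and trigger exactly the version-mismatch error you are seeing.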
And great work on BPPNet!
Working in Colab, with the default tensorflow and keras versions (which print tensorflow 2.2.0-rc2, keras 2.3.0-tf).
I've got a super weird error. Basically, the results of model.evaluate() depend on the batch size I'm using, and they change after I shuffle the data, which makes no sense. I've been able to reproduce this in a minimal working example. In my full program (which works in 3D with bigger datasets) the variations are even more significant. I don't know whether this might depend on batch normalization, but I expect it to be fixed when I'm predicting! My full program does multiclass segmentation; my minimal example takes a black image with a white square in a random position, with a little noise, and tries to segment the same white square out of it.
I'm using a Keras Sequence as a generator to feed data to the model, which I guess might be relevant, as I don't see this behaviour when evaluating the data directly.
Here's the code with its output:
# environment setup
%tensorflow_version 2.x
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, Activation, BatchNormalization
from tensorflow.keras import metrics

# set up a toy model
K.set_image_data_format("channels_last")
inputL = Input([64, 64, 1])
l1 = Conv2D(4, [3, 3], padding='same')(inputL)
l1N = BatchNormalization(axis=-1, momentum=0.9)(l1)
l2 = Activation('relu')(l1N)
l3 = Conv2D(32, [3, 3], padding='same')(l2)
l3N = BatchNormalization(axis=-1, momentum=0.9)(l3)
l4 = Activation('relu')(l3N)
l5 = Conv2D(1, [1, 1], padding='same', dtype='float32')(l4)
l6 = Activation('sigmoid')(l5)
model = Model(inputs=inputL, outputs=l6)
model.compile(optimizer='sgd', loss='mse', metrics='accuracy')
# Create random images
import numpy as np
import random

X_train = np.zeros([96, 64, 64, 1])
for imIdx in range(96):
    centPoin = random.randrange(7, 50)
    X_train[imIdx, centPoin-5:centPoin+5, centPoin-5:centPoin+5, 0] = 1
X_val = X_train[:32, :, :, :]
X_train = X_train[32:, :, :, :]
Y_train = X_train.copy()
X_train = np.random.normal(0., 0.1, size=X_train.shape) + X_train
for imIdx in range(64):
    X_train[imIdx, :, :, :] = X_train[imIdx, :, :, :] + np.random.normal(0, 0.2, size=1)
from tensorflow.keras.utils import Sequence
import random
import tensorflow as tf
# setup the data generator
class dataGen(Sequence):
    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        nSamples = self.x.shape[0]
        patList = np.array(range(nSamples), dtype='int16')
        patList = patList.reshape(nSamples, 1)
        np.random.shuffle(patList)
        self.patList = patList

    def __len__(self):
        return round(self.patList.shape[0] / self.batch_size)

    def __getitem__(self, idx):
        patStart = idx
        batchS = self.batch_size
        listLen = self.patList.shape[0]
        Xout = np.zeros((batchS, 64, 64, 1))
        Yout = np.zeros((batchS, 64, 64, 1))
        for patIdx in range(batchS):
            curPat = (patStart + patIdx) % listLen
            patInd = self.patList[curPat]
            Xout[patIdx, :, :] = self.x[patInd, :, :, :]
            Yout[patIdx, :, :] = self.y[patInd, :, :, :]
        return Xout, Yout

    def on_epoch_end(self):
        np.random.shuffle(self.patList)

    def setBatchSize(self, batchS):
        self.batch_size = batchS
#load the data in the generator
trainGen = dataGen(X_train,Y_train,16)
valGen = dataGen(X_val,X_val,16)
# train the model for two epochs, so that the loss is bad
trainSteps = len(trainGen)
model.fit(trainGen,steps_per_epoch=trainSteps,epochs=32,validation_data=valGen,validation_steps=len(valGen))
trainGen.setBatchSize(4)
model.evaluate(trainGen)
[0.16259156167507172, 0.9870567321777344]
trainGen.setBatchSize(16)
model.evaluate(trainGen)
[0.17035068571567535, 0.9617958068847656]
trainGen.on_epoch_end()
trainGen.setBatchSize(16)
model.evaluate(trainGen)
[0.16663715243339539, 0.9710426330566406]
If I do model.evaluate(Xtrain, Ytrain, batch_size=16) instead, the result does not depend on the batch size.
If I train the model until convergence, where the loss gets down to 0.05, the same thing still happens, with the accuracy fluctuating between 0.95 and 0.99 from one evaluation to the next.
Why would this happen?
I'd expect the prediction to be super easy, am I wrong?
You made a small mistake inside the __getitem__ function.
curPat = (patStart+patIdx)
should be changed to
curPat = (patStart*batchS+patIdx)
patStart is equal to idx, the current batch number. If your data set contains 64 samples and your batch size is set to 16, the possible values for idx will be 0, 1, 2 and 3.
curPat on the other hand refers to the index of the current sample number in the shuffled list of sample numbers. curPat should therefore be able to take on all values from 0 to 63. In your code, that is not the case. By making the aforementioned change, this issue is fixed.
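For clarity, here is a sketch of the corrected __getitem__ with only that line changed (everything else stays as in your class):
def __getitem__(self, idx):
    patStart = idx
    batchS = self.batch_size
    listLen = self.patList.shape[0]
    Xout = np.zeros((batchS, 64, 64, 1))
    Yout = np.zeros((batchS, 64, 64, 1))
    for patIdx in range(batchS):
        # step through the shuffled list in whole batches, so every index 0..listLen-1 is reachable
        curPat = (patStart * batchS + patIdx) % listLen
        patInd = self.patList[curPat]
        Xout[patIdx, :, :] = self.x[patInd, :, :, :]
        Yout[patIdx, :, :] = self.y[patInd, :, :, :]
    return Xout, Yout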
I built a network that attempts to predict raster images of surface temperatures.
The output of the network is a (1000, 1000) size array, representing a raster image. For training and testing these are compared to the real raster of their respective samples.
I understand how to add the training image to my TensorBoard callback but I'd like to also add the network's output image to the callback, so that I could compare them visually. Is this possible?
x = Input(shape = (2))
x = Dense(4)(x)
x = Reshape((2, 2))(x)
Where Reshape would be the last layer (or one before some deconvolution layer).
Depending on the TensorFlow version you are using, I have two different suggestions. I will assume you use > 2.0 and post the code I use for that version for image-to-image models. I basically initialize a callback with a noisy image (I am doing denoising, but you can easily adapt this to your problem) and the corresponding ground-truth image. I then use the model to run inference after each epoch.
"""Inspired by https://github.com/sicara/tf-explain/blob/master/tf_explain/callbacks/grad_cam.py"""
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import Callback
class TensorBoardImage(Callback):
def __init__(self, log_dir, image, noisy_image):
super().__init__()
self.log_dir = log_dir
self.image = image
self.noisy_image = noisy_image
def set_model(self, model):
self.model = model
self.writer = tf.summary.create_file_writer(self.log_dir, filename_suffix='images')
def on_train_begin(self, _):
self.write_image(self.image, 'Original Image', 0)
def on_train_end(self, _):
self.writer.close()
def write_image(self, image, tag, epoch):
image_to_write = np.copy(image)
image_to_write -= image_to_write.min()
image_to_write /= image_to_write.max()
with self.writer.as_default():
tf.summary.image(tag, image_to_write, step=epoch)
def on_epoch_end(self, epoch, logs={}):
denoised_image = self.model.predict_on_batch(self.noisy_image)
self.write_image(denoised_image, 'Denoised Image', epoch)
So typically you would use it the following way:
# define the model
model = Model(inputs, outputs)

# define the callback
image_tboard_cback = TensorBoardImage(
    log_dir=log_dir + '/images',
    image=val_gt[0:1],
    noisy_image=val_noisy[0:1],
)

# fit the model
model.fit(
    x,
    y,
    callbacks=[image_tboard_cback,],
)
If you use versions prior to 2.0, I can direct you to this gist I wrote (which is a bit more intricate).
I am trying to predict several million images with my trained model using a predict_generator in python 3 with keras and tensorflow as backend. The generator and the model predictions work, however, some images in the directory are broken or corrupted and cause the predict_generator to stop and throw an error. Once the image is removed it works again until the next corrupted/broken image gets fed through the function.
Since there are so many images it is not feasible to run a script to open every image and delete the ones that are throwing an error. Is there a way to incorporate a "skip image if broken" argument into the generator or flow from directory function?
Any help is greatly appreciated!
There's no such argument in ImageDataGenerator, nor in the flow_from_directory method, as you can see in the Keras docs for both (here and here). One workaround would be to extend the ImageDataGenerator class and override the flow_from_directory method to check whether the image is corrupted or not before yielding it in the generator. Here you can find its source code.
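As a rough sketch of the kind of check such an overridden method could run on each file before yielding it (is_valid_image is just an illustrative helper name, not part of Keras):
from PIL import Image

def is_valid_image(path):
    """Return True only if PIL can fully decode the file."""
    try:
        with Image.open(path) as img:
            img.load()  # forces full decoding; corrupted/truncated files typically raise OSError here
        return True
    except OSError:
        return False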
Since it happens during prediction, if you skip any image or batch, you need to keep track of which images are skipped, so that you can correctly map the prediction scores to the image file name.
Based on this idea, my DataGenerator is implemented with a valid-image index tracker. In particular, focus on the variable valid_index, where the indices of valid images are tracked.
class DataGenerator(keras.utils.Sequence):
    def __init__(self, df, batch_size, verbose=False, **kwargs):
        self.verbose = verbose
        self.df = df
        self.batch_size = batch_size
        self.valid_index = kwargs['valid_index']
        self.success_count = self.total_count = 0

    def __len__(self):
        return int(np.ceil(self.df.shape[0] / float(self.batch_size)))

    def __getitem__(self, idx):
        print('generator is loading batch ', idx)
        batch_df = self.df.iloc[idx * self.batch_size:(idx + 1) * self.batch_size]
        self.total_count += batch_df.shape[0]
        # return a list whose elements are either an image array (when the image is valid)
        # or None (when the image is corrupted)
        x = load_batch_image_to_arrays(batch_df['image_file_names'])
        # filter out corrupted images
        tmp = [(u, i) for u, i in zip(x, batch_df.index.values.tolist()) if
               u is not None]
        # boundary case: all images failed, return another random batch
        if len(tmp) == 0:
            print('[ERROR] All images loading failed')
            # based on https://github.com/keras-team/keras/blob/master/keras/utils/data_utils.py#L621,
            # Keras will automatically find the next batch if it returns None
            return None
        print('successfully loaded image in {}th batch {}/{}'.format(str(idx), len(tmp), self.batch_size))
        self.success_count += len(tmp)
        x, batch_index = zip(*tmp)
        x = np.stack(x)  # list to np.array
        self.valid_index[idx] = batch_index
        # follow the preprocess input function provided by keras
        x = resnet50_preprocess(np.array(x, dtype=np.float))
        return x

    def on_epoch_end(self):
        print('total image count', self.total_count)
        print('successful images count', self.success_count)
        self.success_count = self.total_count = 0  # reset counts after one epoch ends
During prediction:
predictions = model.predict_generator(
    generator=data_gen,
    workers=10,
    use_multiprocessing=False,
    max_queue_size=20,
    verbose=1
).squeeze()

indexes = []
for i in sorted(data_gen.valid_index.keys()):
    indexes.extend(data_gen.valid_index[i])

result_df = df.loc[indexes]
result_df['score'] = predictions
I'm doing some numerical experiments in TensorFlow that involve creating some simple networks and seeing how well they can approximate various functions.
My first thought was to make a different Python class for each type of network I'm comparing. But I think the way I'm writing my classes is flawed. For instance, my first one is called AffineNetwork. It has a method for training the network, and a method for simply feeding an input through the network.
import numpy as np
import tensorflow as tf

class AffineNetwork:
    def __init__(self, trainingData, targets, initialParams):
        self.trainingData = trainingData
        self.targets = targets
        self.weights = tf.get_variable("weights", initializer=tf.constant(initialParams[0]))
        self.bias = tf.get_variable("bias", initializer=tf.constant(initialParams[1]))
        self.outputs = tf.expand_dims(self.weights, 0) @ trainingData + self.bias
        self.loss = tf.reduce_mean(tf.square(self.targets - self.outputs))
        self.optimizer = tf.train.GradientDescentOptimizer(0.001)
        self.train_step = self.optimizer.minimize(self.loss)
        self.init = tf.global_variables_initializer()
        self.loss_summary = tf.summary.scalar("loss", self.loss)

    def train(self, max_iter, directory_name):
        with tf.Session() as sess:
            sess.run(self.init)
            writer = tf.summary.FileWriter("Tensorboard/" + directory_name)
            for step in range(max_iter):
                sess.run(self.train_step)
                summ = sess.run(self.loss_summary)
                writer.add_summary(summ, step)
            writer.close()

    def feedforward(self, x):
        if len(x.shape) == 1:
            x = tf.expand_dims(x, 1)
        with tf.Session() as sess:
            sess.run(self.init)
            return sess.run(tf.expand_dims(self.weights, 0) @ x + self.bias)
I think this way of organizing things is flawed, because:
Suppose I train my network parameters, and then I want to use these to make predictions. I have to do this within the same TensorFlow session, or else everything will be wiped out, and I'll be starting fresh. (Question: Is this statement correct?)
If that is correct, it would seem that it wouldn't make sense to split up these functions between two methods, because the session would close in between them.
For reference, what I'm trying to do with this class is something like the following script. It trains the network on a constant function, and then compares its first prediction with its prediction after training.
dim = 3
dataSetSize = 5
trainingData = np.array(np.arange(15).astype(np.float32).reshape(dim, dataSetSize))
targets = np.ones(dataSetSize, dtype=np.float32)
initialParams = [np.random.uniform(0.0, 1.0, dim).astype(np.float32), np.float32(0)]
myAffineNetwork = nt.AffineNetwork(trainingData, targets, initialParams)
print(myAffineNetwork.feedforward(np.array([1, 2, 3], dtype=np.float32)))
myAffineNetwork.train(50, output_folder_name)
print(myAffineNetwork.feedforward(np.array([1, 2, 3], dtype=np.float32)))
But it's spitting out the same thing before and after training.
Question: What would be the usual, smarter way of organizing these things? My idea about having a different class for each architecture of network: do I need to abandon that idea altogether? Or just write the methods in a smarter way?
Defining a class for a network is not a bad idea. Danijar (LINK) states that all models in the TensorFlow codebase are defined this way.
To tackle your problem, you could do several things. Firstly, when you define the __init__() method, you could define a session for the object:
class AffineNetwork:
    def __init__(self, trainingData, targets, initialParams):
        self.trainingData = trainingData
        self.targets = targets
        self.weights = tf.get_variable("weights", initializer=tf.constant(initialParams[0]))
        self.bias = tf.get_variable("bias", initializer=tf.constant(initialParams[1]))
        self.outputs = tf.expand_dims(self.weights, 0) @ trainingData + self.bias
        self.loss = tf.reduce_mean(tf.square(self.targets - self.outputs))
        self.optimizer = tf.train.GradientDescentOptimizer(0.001)
        self.train_step = self.optimizer.minimize(self.loss)
        self.init = tf.global_variables_initializer()
        self.loss_summary = tf.summary.scalar("loss", self.loss)
        self.sess = tf.Session()
This way, you have the same session available for all your operations. You have to adapt the rest of your code though, e.g. use self.sess.run() for evaluating your operations.
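A rough sketch of how train() and feedforward() might then be adapted (keeping your variable names; this is only one way to do it):
def train(self, max_iter, directory_name):
    self.sess.run(self.init)  # initialize once, before training starts
    writer = tf.summary.FileWriter("Tensorboard/" + directory_name)
    for step in range(max_iter):
        _, summ = self.sess.run([self.train_step, self.loss_summary])
        writer.add_summary(summ, step)
    writer.close()

def feedforward(self, x):
    if len(x.shape) == 1:
        x = tf.expand_dims(x, 1)
    # no re-initialization here, so the weights learned in train() are kept
    return self.sess.run(tf.expand_dims(self.weights, 0) @ x + self.bias)
This would also address the symptom you describe: in your original feedforward(), sess.run(self.init) re-initializes the variables, which is likely why the prediction looks the same before and after training.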
Another option would be to train the model in one session, save it to disk, and restore it in another session to do inference. That answers your first question: different sessions do wipe out information, but it can be stored on disk (which makes sense considering that you don't want to train every time before you do a single inference).
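A minimal sketch of that second option with tf.train.Saver (the checkpoint path is just an example):
saver = tf.train.Saver()

# training session: train, then persist the variables to disk
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # ... run training steps here ...
    saver.save(sess, "./checkpoints/affine_net")

# later, in a fresh session: restore the trained variables and run inference
with tf.Session() as sess:
    saver.restore(sess, "./checkpoints/affine_net")
    # ... run inference ops here ...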
Problem statement
I am trying to train a dynamic RNN in TensorFlow v1.0.1 on Linux RedHat 7.3 (problem also manifests on Windows 7), and no matter what I try, I get the exact same training and validation error at every epoch, i.e. my weights are not updating.
I appreciate any help you can offer.
Example
I tried to reduce this to a minimal example that shows my issue, but the minimal example is still pretty large. I based the network structure largely on this gist.
Network definition
import functools
import numpy as np
import tensorflow as tf

def lazy_property(function):
    attribute = '_' + function.__name__

    @property
    @functools.wraps(function)
    def wrapper(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return wrapper
class MyNetwork:
    """
    Class defining an RNN for labeling a time series.
    """
    def __init__(self, data, target, num_hidden=64):
        self.data = data
        self.target = target
        self._num_hidden = num_hidden
        self._num_steps = int(self.target.get_shape()[1])
        self._num_classes = int(self.target.get_shape()[2])
        self._weight_and_bias()  # create weight and bias tensors
        self.prediction
        self.error
        self.optimize

    @lazy_property
    def prediction(self):
        """Defines the recurrent neural network prediction scheme."""
        # Dynamic LSTM.
        network = tf.contrib.rnn.BasicLSTMCell(self._num_hidden)
        output, _ = tf.nn.dynamic_rnn(network, data, dtype=tf.float32)
        # Flatten and apply same weights to all time steps.
        output = tf.reshape(output, [-1, self._num_hidden])
        prediction = tf.nn.softmax(tf.matmul(output, self.weight) + self.bias)
        prediction = tf.reshape(prediction,
                                [-1, self._num_steps, self._num_classes])
        return prediction

    @lazy_property
    def cost(self):
        """Defines the cost function for the network."""
        cross_entropy = -tf.reduce_sum(self.target * tf.log(self.prediction),
                                       axis=[1, 2])
        cross_entropy = tf.reduce_mean(cross_entropy)
        return cross_entropy

    @lazy_property
    def optimize(self):
        """Defines the optimization scheme."""
        learning_rate = 0.003
        optimizer = tf.train.RMSPropOptimizer(learning_rate)
        return optimizer.minimize(self.cost)

    @lazy_property
    def error(self):
        """Defines a measure of prediction error."""
        mistakes = tf.not_equal(tf.argmax(self.target, 2),
                                tf.argmax(self.prediction, 2))
        return tf.reduce_mean(tf.cast(mistakes, tf.float32))

    def _weight_and_bias(self):
        """Returns appropriately sized weight and bias tensors for the output layer."""
        self.weight = tf.Variable(tf.truncated_normal(
            [self._num_hidden, self._num_classes],
            mean=0.0,
            stddev=0.01,
            dtype=tf.float32))
        self.bias = tf.Variable(tf.constant(0.1, shape=[self._num_classes]))
Training
Here is my training process. The all_data class just holds my data and labels, and uses a batch generator class to spit out batches for training when I call all_data.train.next() and all_data.train_labels.next(). You can reproduce with any batch generation scheme you like, and I can add the code if you think it is relevant; I felt like this was getting too long as it is.
tf.reset_default_graph()
data = tf.placeholder(tf.float32,
                      [None, all_data.num_steps, all_data.num_features])
target = tf.placeholder(tf.float32,
                        [None, all_data.num_steps, all_data.num_outputs])
model = MyNetwork(data, target, NUM_HIDDEN)
print('Training the model...')
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print('Initialized.')
    for epoch in range(3):
        print('Epoch {} |'.format(epoch), end='', flush=True)
        for step in range(all_data.train_size // BATCH_SIZE):
            # Generate the next training batch and train.
            d = all_data.train.next()
            t = all_data.train_labels.next()
            sess.run(model.optimize,
                     feed_dict={data: d, target: t})
            # Update the user periodically.
            if step % summary_frequency == 0:
                print('.', end='', flush=True)
        # Show training and validation error at the end of each epoch.
        print('|', flush=True)
        train_error = sess.run(model.error,
                               feed_dict={data: d, target: t})
        valid_error = sess.run(model.error,
                               feed_dict={
                                   data: all_data.valid,
                                   target: all_data.valid_labels
                               })
        print('Training error: {}%'.format(100 * train_error))
        print('Validation error: {}%'.format(100 * valid_error))
    # Check testing error after everything.
    test_error = sess.run(model.error,
                          feed_dict={
                              data: all_data.test,
                              target: all_data.test_labels
                          })
    print('Testing error after {} epochs: {}%'.format(epoch + 1, 100 * test_error))
For a simple example, I generated random data and labels, where data has shape [num_samples, num_steps, num_features], and each sample has a single label associated with the whole thing:
data = np.random.rand(5000, 1000, 2)
labels = np.random.randint(low=0, high=2, size=[5000])
I then converted my labels to one-hot vectors and tiled them so that the resulting labels tensor was the same size as the data tensor.
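A sketch of that conversion for the two-class random labels above, just to make the shapes explicit:
# one-hot encode the per-sample labels: (5000,) -> (5000, 2)
one_hot = np.eye(2, dtype=np.float32)[labels]
# tile each label across all 1000 time steps: (5000, 2) -> (5000, 1000, 2)
tiled_labels = np.tile(one_hot[:, np.newaxis, :], (1, 1000, 1))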
Results
No matter what I do, I get results like this:
Training the model...
Initialized.
Epoch 0 |.......................................................|
Training error: 56.25%
Validation error: 53.39999794960022%
Epoch 1 |.......................................................|
Training error: 56.25%
Validation error: 53.39999794960022%
Epoch 2 |.......................................................|
Training error: 56.25%
Validation error: 53.39999794960022%
Testing error after 3 epochs: 49.000000953674316%
I get exactly the same error at every epoch. Even if my weights were just randomly walking around, this should change. For the example shown here, I used random data with random labels, so I do not expect much improvement, but I do expect some change, and I am getting exactly the same results every epoch. When I do this with my actual data set, I get the same behavior.
Insight
I hesitate to include this in case it proves to be a red herring, but I believe that my optimizer is calculating cost function gradients of None. When I tried a different optimizer and attempted to clip the gradients, I went ahead and used tf.Print to output the gradients as well. The network crashed with an error that tf.Print could not handle None-type values.
Attempted fixes
I have tried the following things, and the problem persists in all cases:
Using different optimizers, e.g. AdamOptimizer with and without modifications to the gradients (clipping).
Adjusting batch sizes.
Using many more and many fewer hidden nodes.
Running for more epochs.
Initializing my weights with different values assigned to stddev.
Initializing my biases to zeros (using tf.zeros) and to different constants.
Using weights and biases that are defined within the prediction method and are not member variables of the class, and a _weight_and_bias method that is defined as a @staticmethod like in this gist.
Determining logits in the prediction function instead of softmax predictions, i.e. predictions = tf.matmul(output, self.weights) + self.bias, and then using tf.nn.softmax_cross_entropy_with_logits. This requires some reshaping because the method wants its labels and logits given with shape [batch_size, num_classes], so the cost method becomes:
@lazy_property
def cost(self):
    """Defines the cost function for the network."""
    targs = tf.reshape(self.target, [-1, self._num_classes])
    logits = tf.reshape(self.predictions, [-1, self._num_classes])
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=targs, logits=logits)
    cross_entropy = tf.reduce_mean(cross_entropy)
    return cross_entropy
Changing which size dimension I leave as None when I create my placeholders as suggested in this answer, which requires a bit of rewriting in the network definition. Basically setting size = [all_data.batch_size, -1, all_data.num_features] and size = [all_data.batch_size, -1, all_data.num_classes].
Using tf.contrib.rnn.DropoutWrapper in my network definition and passing a dropout value set to 0.5 in training and 1.0 in validation and testing.
The problem went away when I used
output = tf.contrib.layers.flatten(output)
logits = tf.contrib.layers.fully_connected(output, some_size, activation_fn=None)
instead of flattening my network output, defining weights, and performing the tf.matmul(output, weight) + bias manually. I then used logits (instead of predictions in the question) in my cost function with
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=target,
                                                        logits=logits)
If you want to get the network prediction, you will still need to do prediction = tf.nn.softmax(logits).
I have no idea why this helped, but the network would not train even on random made-up data until I made these changes.