Here is my code for reading a TFRecord batch and passing it through a network.
import os, sys
import tensorflow as tf
def read_and_decode_single_example(filename_queue):
# Unlike the TFRecordWriter, the TFRecordReader is symbolic
reader = tf.TFRecordReader()
# One can read a single serialized example from a filename
# serialized_example is a Tensor of type string.
_, serialized_example = reader.read(filename_queue)
# The serialized example is converted back to actual values.
# One needs to describe the format of the objects to be returned
features = tf.parse_single_example(
serialized_example,
features={
# We know the length of both fields. If not the
# tf.VarLenFeature could be used
'click': tf.FixedLenFeature([], tf.int64),
'title': tf.FixedLenFeature([25], tf.int64)
# maybe others, e.g. 'data1': tf.FixedLenFeature([], tf.float32)
})
# now return the converted data
lbl = features['click']
ttl = features['title']
return lbl, ttl
def read_batch_data(files, b_s):
min_after_dequeue = 8
num_threads = 2
batch_size = b_s
capacity = min_after_dequeue + (num_threads + 2) * batch_size
filename_queue = tf.train.string_input_producer(files, num_epochs=1)
c_n_c, tit = read_and_decode_single_example(filename_queue)
label_batch, title_batch = tf.train.shuffle_batch([c_n_c, tit], batch_size=batch_size, capacity=capacity,num_threads=num_threads, min_after_dequeue=min_after_dequeue)
return label_batch, title_batch
And the network code:
import math
import os,sys
import subprocess
import pickle
import load_data_labels
import numpy as np
import tensorflow as tf
import shutil
LOG_DIR = './log_dir'
def init_weights(shape, name):
return tf.Variable(tf.random_normal(shape,stddev=0.01,dtype=tf.float64), name=name)
def init_biases(shape, name):
return tf.Variable(tf.random_normal(shape,dtype=tf.float64),name=name)
def model(titles, w_h, w_h2, w_o, vocab_size,embd_layer):
# Add layer name scopes for better graph visualization
# Embedding layer
with tf.device('/cpu:0'), tf.name_scope("embedding"):
W_em = tf.Variable(embd_layer,name="word_embeddings")
embed_l = tf.nn.embedding_lookup(W_em, titles)
# can be reduce sum
embedding = tf.reduce_mean(embed_l, [1])
with tf.name_scope("layer1"):
h = tf.nn.relu(tf.add(tf.matmul(embedding, w_h), b_h))
with tf.name_scope("layer2"):
h2 = tf.nn.relu(tf.add(tf.matmul(h, w_h2), b_h2))
with tf.name_scope("layer3"):
return tf.add(tf.matmul(h2, w_o), b_o)
def init_word_embedding_with_w2v(w2v_dict, word_map, emb_dim, voc_len):
initW = np.random.uniform(-1.0,1.0,(voc_len+1, emb_dim))
for word in word_map:
vec = w2v_dict.get(word)
idx = word_map[word]
if vec is not None:
initW[idx,:] = vec
return initW
with open('./data/word_map.pickle', 'rb') as word_map_file:
word_map = pickle.load(word_map_file)
with open('./data/word_2_vec_dict.pickle', 'rb') as w2vec_file:
w2vec = pickle.load(w2vec_file)
dataset_file= "./data/file000000000000_1000lines.tfrecords"
batch_size=4
trY,trX = load_data_labels.read_batch_data([dataset_file],batch_size)
trY=tf.one_hot(trY,depth=2,axis = -1)
trY=tf.reshape(trY,[4,2])
print trY.get_shape()
print trX.get_shape()
w_h = init_weights([300, 625], "w_h")
w_h2 = init_weights([625, 625], "w_h2")
w_o = init_weights([625, 2], "w_o")
vocabulary_length=len(w2vec)
any_vector_in_dict = w2vec.itervalues().next()
emb_dim = len(any_vector_in_dict)
embd_layer=init_word_embedding_with_w2v(w2vec,word_map,emb_dim,vocabulary_length)
b_h = init_biases([625], "b_h")
b_h2 = init_biases([625], "b_h2")
b_o = init_biases([2],"b_o")
tf.summary.histogram("w_h_summar", w_h)
tf.summary.histogram("w_h2_summar", w_h2)
tf.summary.histogram("w_o_summar", w_o)
tf.summary.histogram("embedding_layer", embd_layer)
py_x = model(trX, w_h, w_h2, w_o, vocabulary_length,embd_layer)
with tf.name_scope("cost"):
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=trY, logits=py_x))
train_op = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)
tf.summary.scalar("cost", cost)
with tf.name_scope("accuracy"):
correct_pred = tf.equal(tf.argmax(trY, 1), tf.argmax(py_x, 1))
acc_op = tf.reduce_mean(tf.cast(correct_pred, "float"))
tf.summary.scalar("accuracy", acc_op)
with tf.Session() as sess:
writer = tf.summary.FileWriter(LOG_DIR, sess.graph)
merged = tf.summary.merge_all()
tf.global_variables_initializer().run()
for i in range(10):
sess.run(train_op)
summary, acc = sess.run([merged, acc_op])
writer.add_summary(summary, i) # Write summary
The problem is that the program crashes because it fills all of the RAM. The crash starts at the merge_all statement of the network, and it hangs in global_variables_initializer, which never returns while the memory gradually fills up. Maybe a queue is left open? I can't find anything relevant and specific, and TensorFlow's docs are bad, to say the least. I've been searching for more than a week and I'm starting to get extremely tired. Could anyone help?
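For reference, a minimal sketch of how a queue-based pipeline like this is usually driven (names reused from the code above; the coordinator setup here is illustrative rather than my exact run script). Since string_input_producer is given num_epochs, the local variables also have to be initialized:

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    # Start the queue runners that feed shuffle_batch, then drive training.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        for i in range(10):
            sess.run(train_op)
    finally:
        coord.request_stop()
        coord.join(threads)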
I am trying to build the autoencoder structure detailed in this IEEE article. The autoencoder uses a separable loss function, which requires me to create a custom loss for its "cluster loss" term as a function of the average output of the encoder. I created my own layer called RffConnected that calculates the cluster loss and uses the add_loss method. Otherwise, this RffConnected layer should behave like an ordinary dense layer.
Here are my relevant code snippets:
import matplotlib.pyplot as plot
from mpl_toolkits.axes_grid1 import ImageGrid
import numpy as np
import math
from matplotlib.figure import Figure
import tensorflow as tf
import keras
from keras import layers
import random
import time
from os import listdir
#loads data from a text file
def loadData(basePath, samplesPerFile, sampleRate):
real = []
imag = []
fileOrder = []
for file in listdir(basePath):
if((file != "READ_ME") and ((file != "READ_ME.txt"))):
fid = open(basePath + "\\" + file, "r")
fileOrder.append(file)
t = 0
sampleEvery = samplesPerFile / sampleRate
temp1 = []
temp2 = []
times = []
for line in fid.readlines():
times.append(t)
samples = line.split("\t")
temp1.append(float(samples[0]))
temp2.append(float(samples[1]))
t = t + sampleEvery
real.append(temp1)
imag.append(temp2)
fid.close()
real = np.array(real)
imag = np.array(imag)
return real, imag, times, fileOrder
#####################################################################################################
#Breaks up and randomizes data
def breakUpData(real, imag, times, numPartitions, basePath):
if(len(real) % numPartitions != 0):
raise ValueError("Error: The length of the dataset must be divisible by the number of partitions.")
newReal = []
newImag = []
newTimes = []
fileOrder = listdir(basePath)
dataFiles = []
interval = int(len(real[0]) / numPartitions)
for i in range(0, interval):
newTimes.append(times[i])
for i in range(0, len(real)):
tempI = []
tempQ = []
for j in range(0, len(real[0])):
tempI.append(real[i, j])
tempQ.append(imag[i, j])
if((j + 1) % interval == 0):
newReal.append(tempI)
newImag.append(tempQ)
#fileName = fileOrder[i][0: fileOrder[i].find("_") + 3]
dataFiles.append(fileOrder[i])
tempI = []
tempQ = []
#randomizes the broken up dataset and the file list
for i in range(0, len(newReal)):
r = random.randint(0, len(newReal) - 1)
tempReal = newReal[i]
tempImag = newImag[i]
newReal[i] = newReal[r]
newImag[i] = newImag[r]
newReal[r] = tempReal
newImag[r] = tempImag
tempFile = dataFiles[i]
dataFiles[i] = dataFiles[r]
dataFiles[r] = tempFile
#return np.array(newReal), np.array(newImag), newTimes, dataFiles
return newReal, newImag, newTimes, dataFiles
#####################################################################################################
#custom loss layer for the RffAe-S that calculates the clustering loss term
class RffConnected(layers.Layer):
def __init__(self, output_dim, batchSize, beta, alpha):
super(RffConnected, self).__init__()
# self.total = tf.Variable(initial_value=tf.zeros((input_dim,)), trainable=False)
#array = np.zeros(output_dim)
self.iters = 0.0
self.beta = beta
self.alpha = alpha
self.batchSize = batchSize
self.output_dim = output_dim
self.sum = tf.zeros(output_dim, tf.float64)
self.moving_average = tf.zeros(output_dim, tf.float64)
self.clusterloss = tf.zeros(output_dim, tf.float64)
self.sum = tf.cast(self.sum, tf.float32)
self.moving_average = tf.cast(self.moving_average, tf.float32)
self.clusterloss = tf.cast(self.clusterloss, tf.float32)
# self.sum = keras.Input(shape=(self.output_dim,))
# self.moving_average = keras.Input(shape=(self.output_dim,))
# self.clusterloss = keras.Input(shape=(self.output_dim,))
def build(self, input_shape):
self.kernel = self.add_weight(name = 'kernel', \
shape = (int(input_shape[-1]), self.output_dim), \
initializer = 'normal', trainable = True)
#self.kernel = tf.cast(self.kernel, tf.float64)
super(RffConnected, self).build(int(input_shape[-1]))
def call(self, inputs):
#keeps track of training epochs
self.iters = self.iters + 1
#inputs = tf.cast(inputs, tf.float64)
#where this custom layer acts as a normal layer- the loss then uses this
#calc = keras.backend.dot(inputs, self.kernel)
calc = tf.matmul(inputs, self.kernel)
#cumulative sum of deep encoded features
#self.sum = state_ops.assign(self.sum, tf.reshape(tf.math.add(self.sum, calc), tf.shape(self.sum)))
#self.sum = tf.ops.state_ops.assign(self.sum, tf.math.add(self.sum, calc))
#self.sum.assign_add(calc)
self.sum = tf.math.add(self.sum, calc)
#calculate the moving average and loss if we have already trained one batch
if(self.iters >= self.batchSize):
self.moving_average = tf.math.divide(self.sum, self.iters)
self.clusterloss = tf.math.exp(\
tf.math.multiply(-1 * self.beta, tf.math.reduce_sum(tf.math.square(tf.math.subtract(inputs, self.moving_average)))))
#self.add_loss(tf.math.multiply(self.clusterloss, self.alpha))
self.add_loss(self.clusterloss.numpy() * self.alpha)
return calc
#####################################################################################################
def customloss(y_true, y_pred):
loss = tf.square(y_true - y_pred)
print(loss)
return loss
#####################################################################################################
realTraining = np.array(real[0:2200])
realTesting = np.array(real[2200:-1])
imagTraining = np.array(imag[0:2200])
imagTesting = np.array(imag[2200:-1])
numInputs = len(realTraining[0])
i_sig = keras.Input(shape=(numInputs,))
q_sig = keras.Input(shape=(numInputs,))
iRff = tf.keras.layers.experimental.RandomFourierFeatures(numInputs, \
kernel_initializer='gaussian', scale=9.0)(i_sig)
rff1 = keras.Model(inputs=i_sig, outputs=iRff)
qRff = tf.keras.layers.experimental.RandomFourierFeatures(numInputs, \
kernel_initializer='gaussian', scale=9.0)(q_sig)
rff2 = keras.Model(inputs=q_sig, outputs=qRff)
combined = layers.Concatenate()([iRff, qRff])
combineRff = tf.keras.layers.experimental.RandomFourierFeatures(4 * numInputs, \
kernel_initializer='gaussian', scale=10.0)(combined)
preprocess = keras.Model(inputs=[iRff, qRff], outputs=combineRff)
#print(realTraining[0:5])
preprocessedTraining = preprocess.predict([realTraining, imagTraining])
preprocessedTesting = preprocess.predict([realTesting, imagTesting])
################## Entering Encoder ######################
encoderIn = keras.Input(shape=(4*numInputs,))
#connected1 = layers.Dense(100, activation="sigmoid")(encoderIn)
clusterLossLayer = RffConnected(100, 30, 1.00, 100.00)(encoderIn)
#clusterLossLayer = myRffConnected(256)(connected1)
encoder = keras.Model(inputs=encoderIn, outputs=clusterLossLayer)
################## Entering Decoder ######################
connected2 = layers.Dense(125, activation="sigmoid")(clusterLossLayer)
relu1 = layers.ReLU()(connected2)
dropout = layers.Dropout(0.2)(relu1)
reshape1 = layers.Reshape((25, 5, 1))(dropout)
bn1 = layers.BatchNormalization()(reshape1)
trans1 = layers.Conv2DTranspose(1, (4, 2))(bn1)
ups1 = layers.UpSampling2D(size=(2, 1))(trans1)
relu2 = layers.ReLU()(ups1)
bn2 = layers.BatchNormalization()(relu2)
trans2 = layers.Conv2DTranspose(1, (4, 2))(bn2)
ups2 = layers.UpSampling2D(size=(2, 1))(trans2)
relu3 = layers.ReLU()(ups2)
bn3 = layers.BatchNormalization()(relu3)
trans3 = layers.Conv2DTranspose(1, (5, 2))(bn3)
ups3 = layers.UpSampling2D(size=(2, 1))(trans3)
relu4 = layers.ReLU()(ups3)
bn4 = layers.BatchNormalization()(relu4)
trans4 = layers.Conv2DTranspose(1, (7, 1))(bn4)
reshape2 = layers.Reshape((4*numInputs, 1, 1))(trans4)
autoencoder = keras.Model(inputs=encoderIn, outputs=reshape2)
encoded_input = keras.Input(shape=(None, 100))
decoder_layer = autoencoder.layers[-1]
#autoencoder.summary()
autoencoder.compile(optimizer='adam', loss=[autoencoder.losses[-1], customloss], metrics=['accuracy', 'accuracy'])
autoencoder.fit(preprocessedTraining, preprocessedTraining, epochs=100, batch_size=20, shuffle=True, validation_data=(preprocessedTesting, preprocessedTesting))
It seems to run for two training epochs and then fails. This is the error I get:
ValueError: Could not interpret loss function identifier: Tensor("rff_connected_137/Const:0", shape=(100,), dtype=float32)
I've already spent a considerable amount of time debugging this, so if you spot any other errors I would appreciate a heads-up. Thank you in advance.
According to the Keras documentation on model training (Keras Model Training-Loss), the 'loss' argument expects a loss function that returns a float tensor (except for sparse loss functions, which return integer arrays) with a specific shape.
If you need to combine two loss functions, it is better to perform the mathematical calculations inside your custom loss function and return a single float tensor. This reference might help: Keras CustomLoss definition.
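For example, a single custom loss that combines a reconstruction term with a weighted extra term and returns one float tensor could look roughly like this (the alpha weighting and the L2 penalty are only placeholders standing in for your cluster-loss term):

import tensorflow as tf

def combined_loss(alpha=0.01):
    # Keras-compatible loss: reconstruction MSE plus a weighted extra term.
    # The extra term is an illustrative stand-in for the cluster loss.
    def loss(y_true, y_pred):
        recon = tf.reduce_mean(tf.square(y_true - y_pred), axis=-1)
        extra = alpha * tf.reduce_mean(tf.square(y_pred), axis=-1)
        return recon + extra
    return loss

# autoencoder.compile(optimizer='adam', loss=combined_loss(alpha=0.01), metrics=['accuracy'])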
I am trying to train a custom UNet model for multiclass segmentation with 25,000 training images. I am running the code on a remote Ubuntu machine via a PuTTY SSH connection. The script starts running, and after some epochs the entire PuTTY session crashes, making it impossible to see the actual error that caused the crash.
I don't believe the problem is in my main training script; more likely there is a problem in the script that creates the Datasets and DataLoaders. It looks like this:
import pandas as pd
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
from datetime import datetime
import torch
import torch.nn as nn
from torch.nn import ConvTranspose2d
from torch.nn import Conv2d
from torch.nn import MaxPool2d
from torch.nn import Module
from torch.nn import ModuleList
from torch.nn import ReLU
from torch.nn import functional as F
from torchvision.transforms import CenterCrop
import torchvision.transforms as T
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import time
from tqdm import tqdm
import random
from PIL import Image
to_tensor = T.ToTensor()
# determine the device to be used for training and evaluation
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# determine if we will be pinning memory during data loading
PIN_MEMORY = True if DEVICE == "cuda" else False
BATCH_SIZE = 64
# # Data importing
df_train = pd.read_csv("../../datasets/unet_cropped/df_train.csv")
df_val = pd.read_csv("../../datasets/unet_cropped/df_val.csv")
# Dictionaries
d_train = {}
for row, data in df_train.iterrows():
d_train[data['image_path']] = [data['class_1_path'], data['class_3_path'], data['class_4_path']]
d_val = {}
for row, data in df_val.iterrows():
d_val[data['image_path']] = [data['class_1_path'], data['class_3_path'], data['class_4_path']]
def image_to_gray(image_path):
# Open image
img = Image.open(image_path)
# Apply transformation and convert to Pytorch tensor
img_tensor = to_tensor(img)
# Convert into Gray-scale
img_torch_gray = T.functional.rgb_to_grayscale(img_tensor, num_output_channels= 1)
return img_torch_gray
def show(images, folder_path, title, subtitles = True):
""" images : list of images
"""
fig,axes = plt.subplots(nrows = len(images), ncols = 1, figsize=(6,7), constrained_layout=True)
fig.suptitle(title, fontsize=10)
if len(images) >1:
for i, image_id in enumerate(images):
image_path = folder_path + image_id
im_for_plot = image_to_gray(image_path)
if subtitles:
axes[i-1].set_title(image_id, fontsize=8)
axes[i-1].imshow(im_for_plot, cmap = "gray")
else:
image_id = images[0]
image_path = folder_path + image_id
im_for_plot = image_to_gray(image_path)
if subtitles:
axes.set_title(image_id, fontsize=8)
axes.imshow(im_for_plot.permute((1,2,0)), cmap = "gray")
plt.show()
def mask_to_gray4(mask_path):
""" For given list of mask_paths create 3-dim tensor.
Example:
mask_path = [mask_path_class_1, NaN, NaN]
final_mask = [ gray_scale_mask_class_1, torch_zeros(1, W, H), torch_zeros(1, W, H) ]
"""
final_mask = torch.zeros((3, 256, 256))
for i, sample in enumerate(mask_path):
# if sample is not NaN, continue
if sample!=sample:
continue
# Open image
img = Image.open(sample)
# Apply transformation and convert to Pytorch tensor
img_tensor = to_tensor(img)
# Convert into Gray-scale
img_torch_gray = T.functional.rgb_to_grayscale(img_tensor, num_output_channels= 1)
img_torch_gray[img_torch_gray>0] = 1
final_mask[i, :, :] = img_torch_gray
return final_mask
image_train_paths = []
mask_train_paths = []
for key, value in d_train.items():
image_train_paths.append(key)
mask_train_paths.append(value)
image_val_paths = []
mask_val_paths = []
for key, value in d_val.items():
image_val_paths.append(key)
mask_val_paths.append(value)
del d_train, d_val
# main function
class SegmentationDataset(Dataset):
def __init__(self, image_paths, mask_paths):
# store the image and mask filepaths
self.image_paths = image_paths
self.mask_paths = mask_paths
def __len__(self):
# return the number of total samples contained in the dataset
return len(self.image_paths)
def __getitem__(self, idx):
""" Loads and returns a sample from the dataset at the given index idx. """
# grab the image path from the current index
image_path = self.image_paths[idx]
image = image_to_gray(image_path)
# grab the mask path from the current index
mask_path = self.mask_paths[idx]
mask = mask_to_gray4(mask_path)
return (image, mask)
# # Data Loading
trainDS = SegmentationDataset(image_train_paths, mask_train_paths)
trainLoader = DataLoader(trainDS, shuffle = True, batch_size = BATCH_SIZE,
pin_memory = PIN_MEMORY)
valDS = SegmentationDataset(image_val_paths, mask_val_paths)
valLoader = DataLoader(valDS, shuffle = True, batch_size = BATCH_SIZE,
pin_memory = PIN_MEMORY)
# calculate steps per epoch for training and validation set
trainSteps = len(trainDS) // BATCH_SIZE
valSteps = len(valDS) // BATCH_SIZE
Maybe the problem has to do with some sort of memory leak, or with RAM usage constantly increasing during training, but I can't spot where exactly that could happen in this script.
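One way to narrow this down is to log the process memory while iterating the DataLoader on its own, which shows whether RAM grows in the data pipeline before training even starts. A minimal sketch, assuming psutil is available on the machine:

import os
import psutil  # assumption: psutil is installed on the remote machine

process = psutil.Process(os.getpid())
for epoch in range(3):
    for i, (image, mask) in enumerate(trainLoader):
        if i % 50 == 0:
            rss_mb = process.memory_info().rss / 1024 ** 2
            print(f"epoch {epoch}, batch {i}: RSS = {rss_mb:.0f} MB", flush=True)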
The training script looks like this:
#!/usr/bin/env python
# coding: utf-8
# Import libraries
import pandas as pd
import numpy as np
import os
from datetime import datetime
import time
import random
from PIL import Image
import torch
import torch.nn as nn
from torch.nn import ConvTranspose2d
from torch.nn import Conv2d
from torch.nn import MaxPool2d, BatchNorm2d
from torch.nn import Module
from torch.nn import ModuleList
from torch.nn import ReLU
from torch.nn import functional as F
from torchvision.transforms import CenterCrop
import torchvision.transforms as T
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
torch.manual_seed(1)
from dataset import (
trainLoader,
valLoader,
trainSteps,
valSteps
)
from model import UNet
# determine the device to be used for training and evaluation
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# determine if we will be pinning memory during data loading
PIN_MEMORY = True if DEVICE == "cuda" else False
#----------------------------------------------------------------------------
NUM_CHANNELS = 1 # number of channels in the input - grayscale image
NUM_CLASSES = 3 # number of classes
NUM_LEVELS = 3 # number of levels in the U-Net model
# initialize learning rate, number of epochs to train for, and the batch size
INIT_LR = 1e-3
NUM_EPOCHS = 100
BATCH_SIZE = 64
threshold = 0.4 ######## --------------------------
dt_string = time.ctime()
model_name = "models/tuesday_25/unet" + "_th_" + str(threshold) + "_" + dt_string.replace(" ", "_")
path_model = model_name + '.pth'
path_param = model_name + '_model_param.txt'
eval_txt = model_name + "_eval.txt"
FILEPRINT = True
if FILEPRINT:
EVAL_FILE = open(eval_txt, "a+")
# print("[INFO] follow training and validation loss in last 'n' epochs by running:")
# print(f" >watch tail -n {eval_txt}")
def fileprint(*args):
if FILEPRINT:
print(*args, file = EVAL_FILE)
EVAL_FILE.flush()
else:
print(*args)
# # Training UNet model
class EarlyStopping():
# https://stackoverflow.com/questions/71998978/early-stopping-in-pytorch
def __init__(self, tolerance=50, min_delta=0):
self.tolerance = tolerance
self.min_delta = min_delta
self.counter = 0
self.early_stop = False
def __call__(self, train_loss, validation_loss):
if (validation_loss - train_loss) > self.min_delta:
# if self.counter == 0:
# print("Validation loss increase detected")
self.counter +=1
if self.counter >= self.tolerance:
self.early_stop = True
# def bce_dice_loss(predicted, truth, threshold):
# batch_size = len(truth)
# # BCE
# bce_loss = lossFunc(predicted, truth)
# # DICE
# predicted = torch.sigmoid(predicted.detach())
# predicted[predicted > threshold] = 1
# predicted[predicted <= threshold] = 0
# predicted = predicted.view(batch_size, -1)
# truth = truth.view(batch_size, -1)
# assert(predicted.shape == truth.shape)
# tp = (predicted * truth).sum(-1)
# fp = ((truth == 0.0).float() * predicted).sum(-1)
# fn = ((truth >= 1.0).float() * (predicted == 0.0).float()).sum(-1)
# dice_score = 2*tp / (2*tp + fp + fn)
# # BCE DICE
# bce_dice = 0.75 * bce_loss + 0.25 * (1 - dice_score)
# batch_bce_dice_loss = torch.nanmean(bce_dice)
# return batch_bce_dice_loss, (torch.nanmean(dice_score)).item()
# ---------------------------- Initialize UNet model -------------------------
model = UNet(nbClasses = NUM_CLASSES).to(DEVICE)
lossFunc = nn.BCEWithLogitsLoss()
opt = torch.optim.RAdam(model.parameters(), lr=INIT_LR)
scaler = torch.cuda.amp.GradScaler()
torch.autograd.set_detect_anomaly(True)
early_stopping = EarlyStopping(tolerance=500, min_delta=1e-5)
# initialize a dictionary to store training history
H = {"train_loss": [], "val_loss": [], "dice_score": []}
# ----------------------------- Training UNet model ----------------------------
# print("[INFO] training the network...")
startTime = time.time()
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
# print(f"Num of params: {params}")
# for e in tqdm(range(NUM_EPOCHS)):
for e in range(NUM_EPOCHS):
model.train() # set the model in training mode
totalTrainLoss = 0 # initialize the total training and validation loss
totalValLoss = 0
# totalValDiceScore = 0
# totalTrainDiceScore = 0
# loop over the training set
for (i, (x, y)) in enumerate(trainLoader):
opt.zero_grad() # first, zero out any previously accumulated gradients,
(x, y) = (x.to(DEVICE), y.to(DEVICE)) # send the input to the device
pred = model(x) # perform a forward pass
# print("pred :", pred.size())
# print("y:" , y.size())
# loss, dice_score = bce_dice_loss(pred, y, threshold)
loss = lossFunc(pred, y.float()) # calculate the training loss
scaler.scale(loss).backward() # perform backpropagation
scaler.step(opt)
scaler.update() # update model parameters
totalTrainLoss += loss.item() # add the loss to the total training loss
# totalTrainDiceScore += float(dice_score)
# switch off autograd
with torch.no_grad():
model.eval() # set the model in evaluation mode
# loop over the validation set
for (x, y) in valLoader:
(x, y) = (x.to(DEVICE), y.to(DEVICE))
pred = model(x)
loss = lossFunc(pred, y.float())
# loss, dice_score = bce_dice_loss(pred, y, threshold)
totalValLoss += loss.item()
# totalValDiceScore += float(dice_score)
# calculate the average training and validation loss
avgTrainLoss = totalTrainLoss / trainSteps
# avgTrainDiceScore = totalTrainDiceScore/ trainSteps
avgValLoss = totalValLoss / valSteps
# avgValDiceScore = totalValDiceScore/ valSteps
# early_stopping(avgTrainLoss, avgValLoss)
# if early_stopping.early_stop:
if (e>0) & ((e%5) == 0):
# print("We are at epoch: ", e)
path_model = model_name + ".pth"
path_param = model_name + '_model_param.txt'
torch.save(model.state_dict(), path_model)
with open(path_param, 'wt') as f:
f.write(f"Batch size used: {BATCH_SIZE}")
f.write(f"\nNumber of epochs: {NUM_EPOCHS}")
f.write(f"\nINIT_LR: {INIT_LR}")
f.write("\nModel parameters: \n")
f.write(str(model.eval()))
# break
text1 = "[INFO] EPOCH: {}/{}\n".format(e + 1, NUM_EPOCHS)
# text2 = "Train loss: {:.4f}, Train dice score: {:.4f}, Val loss: {:.4f}, Val dice score: {:.4f}\n".format(avgTrainLoss, avgTrainDiceScore, avgValLoss, avgValDiceScore)
text2 = "Train loss: {:.4f}, Val loss: {:.4f}\n".format(avgTrainLoss, avgValLoss)
fileprint(text1)
fileprint(text2)
# display the total time needed to perform the training
endTime = time.time()
# print("[INFO] total time taken to train the model: {:.2f}s".format(endTime - startTime))
# ------------------------------- Saving UNet model --------------------------------------------------
path_model = model_name + '.pth'
path_param = model_name + '_model_param.txt'
torch.save(model.state_dict(), path_model)
with open(path_param, 'wt') as f:
f.write(f"Batch size used: {BATCH_SIZE}")
f.write(f"\nNumber of epochs: {NUM_EPOCHS}")
f.write(f"\nINIT_LR: {INIT_LR}")
f.write("\nModel parameters: \n")
f.write(str(model.eval()))
Does anyone have any ideas how to solve this?
Thank you a lot
TensorFlow: tensorflow-gpu 0.12
Anaconda: Anaconda 4.2.9 (Python 3.5)
GPU: Nvidia 940M (notebook, 2 GB)
OS: Windows 7 64-bit SP1
CUDA: 8.0
cuDNN: 5.0
IDE: PyCharm
The MNIST test runs fine on the GPU (CNNs), but when it comes to my own project, Python crashes. I debugged my code and found that the call to **session.run()** leads to this problem. The error is:
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:586] Could not identify NUMA node of /job:localhost/replica:0/task:0/gpu:0, defaulting to 0. Your kernel may not have been built with NUMA support.
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_event.cc:49] Error polling for event status: failed to query event: CUDA_ERROR_LAUNCH_FAILED
F c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_event_mgr.cc:198] Unexpected Event status: 1
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_dnn.cc:385] could not create cudnn handle: **CUDNN_STATUS_INTERNAL_ERROR**
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_dnn.cc:352] could not destroy cudnn handle: **CUDNN_STATUS_BAD_PARAM**
F c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\kernels\conv_ops.cc:532] **Check failed: stream->parent()->GetConvolveAlgorithms(&algorithms)**
Since MNIST runs correctly, there should be no defect in my GPU driver, CUDA, or cuDNN. I really don't know where this problem comes from.
This is my code:
import cv2
import os
import tensorflow as tf
import data_trans as dt
with tf.variable_scope('weights'):
weights={
# 60*60*3->60*60*32->30*30*32
'conv1':tf.get_variable('conv1',[5,5,3,32],initializer=tf.contrib.layers.xavier_initializer_conv2d()),
# 30*30*32->30*30*64->15*15*64
'conv2':tf.get_variable('conv2',[5,5,32,64],initializer=tf.contrib.layers.xavier_initializer_conv2d()),
# 15*15*64->12*12*128->6*6*128
'conv3':tf.get_variable('conv3',[4,4,64,128],initializer=tf.contrib.layers.xavier_initializer_conv2d()),
# 6*6*128->256
'fc1':tf.get_variable('fc1',[6*6*128,256],initializer=tf.contrib.layers.xavier_initializer()),
# 256->2
'fc2':tf.get_variable('fc2',[256,2],initializer=tf.contrib.layers.xavier_initializer())
}
with tf.variable_scope('biases'):
biases = {
'conv1':tf.get_variable('conv1',[32,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32)),
'conv2':tf.get_variable('conv2',[64,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32)),
'conv3':tf.get_variable('conv3',[128,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32)),
'fc1':tf.get_variable('fc1',[256,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32)),
'fc2':tf.get_variable('fc2',[2,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32))
}
def inference(images):
images = (tf.cast(images,tf.float32)/255)
conv1 = tf.nn.bias_add(tf.nn.conv2d(images,weights['conv1'],strides=[1,1,1,1],padding='SAME'),biases['conv1'])
relu1 = tf.nn.relu(conv1)
pool1 = tf.nn.max_pool(relu1,ksize=[1,2,2,1],strides=[1,2,2,1],padding='SAME')
conv2 = tf.nn.bias_add(tf.nn.conv2d(pool1,weights['conv2'],strides=[1,1,1,1],padding='SAME'),biases['conv2'])
relu1 = tf.nn.relu(conv2)
pool2 = tf.nn.max_pool(relu1,ksize=[1,2,2,1],strides=[1,2,2,1],padding='SAME')
conv3 = tf.nn.bias_add(tf.nn.conv2d(pool2,weights['conv3'],strides=[1,1,1,1],padding='VALID'),biases['conv3'])
relu3 = tf.nn.relu(conv3)
pool3 = tf.nn.max_pool(relu3,ksize=[1,2,2,1],strides=[1,2,2,1],padding='VALID')
flatten = tf.reshape(pool3,[-1,weights['fc1'].get_shape().as_list()[0]])
drop = tf.nn.dropout(flatten,0.5)
fc1 = tf.matmul(drop,weights['fc1']) + biases['fc1']
fc_relu1 = tf.nn.relu(fc1)
fc2 = tf.matmul(fc_relu1,weights['fc2']) + biases['fc2']
return fc2
def train():
dt.encode_to_tfrecords('../train_data/train.txt','../train_data','data.tfrecords',(60,60))
image,label = dt.decode_from_tfrecords('../train_data/data.tfrecords')
batch_image,batch_label = dt.get_batch(image,label,batch_size=10,crop_size=60)
inf = inference(batch_image)
predicts = tf.nn.softmax(inf)
cross_entropy = -tf.reduce_mean(batch_label * tf.log(predicts))
train_step = tf.train.GradientDescentOptimizer(1e-2).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(predicts, 1), tf.argmax(batch_label, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
#if os.path.exists(os.path.join('model','model.ckpt')) is True:
# tf.train.Saver(max_to_keep=None).restore(sess,os.path.join('model','model.ckpt'))
for epoch in range(8):
print(sess.run(accuracy))
print('here!')
coord.request_stop()
coord.join(threads)
train()
data_trans.py contains three functions used to transform images into TFRecords:
import cv2
import tensorflow as tf
def encode_to_tfrecords(label_file,data_root,new_name='data.tfrecords',resize=None):
writer = tf.python_io.TFRecordWriter(data_root + '/' + new_name)
num_example = 0
with open(label_file,'r') as f:
for l in f.readlines():
l = l.split()
path = data_root+'/'+l[0]
image = cv2.imread(path)
if resize is not None:
image = cv2.resize(image,resize)
height,width,nchannel = image.shape
label = int(l[1])
example = tf.train.Example(features=tf.train.Features(feature={
'height':tf.train.Feature(int64_list=tf.train.Int64List(value=[height])),
'width':tf.train.Feature(int64_list=tf.train.Int64List(value=[width])),
'nchannel':tf.train.Feature(int64_list=tf.train.Int64List(value=[nchannel])),
'image':tf.train.Feature(bytes_list=tf.train.BytesList(value=[image.tobytes()])),
'label':tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))
}))
serialized = example.SerializeToString()
writer.write(serialized)
num_example += 1
print(label_file,'Sample_Num:',num_example)
writer.close()
#encode_to_tfrecords('../train_data/train.txt','../train_data')
def decode_from_tfrecords(filename,num_epoch=None):
filename_queue = tf.train.string_input_producer([filename],num_epoch)
reader = tf.TFRecordReader()
_,serialized = reader.read(filename_queue)
example = tf.parse_single_example(serialized,features={
'height':tf.FixedLenFeature([],tf.int64),
'width':tf.FixedLenFeature([],tf.int64),
'nchannel':tf.FixedLenFeature([],tf.int64),
'image':tf.FixedLenFeature([],tf.string),
'label':tf.FixedLenFeature([],tf.int64)
})
label = tf.cast(example['label'],tf.int32)
image = tf.decode_raw(example['image'],tf.uint8)
image = tf.reshape(image,tf.stack([
tf.cast(example['height'],tf.int32),
tf.cast(example['width'],tf.int32),
tf.cast(example['nchannel'],tf.int32)
]))
return image, label
#encode_to_tfrecords("../train_data/train.txt","../train_data",'data.tfrecords')
#image,label=decode_from_tfrecords('../train_data/data.tfrecords')
#print image[0]
def get_batch(image,label,batch_size,crop_size):
distorted_image = tf.random_crop(image,[crop_size, crop_size, 3])
distorted_image = tf.image.random_flip_up_down(distorted_image)
images,label_batch = tf.train.shuffle_batch([distorted_image,label],batch_size=batch_size,capacity=130,min_after_dequeue=100)
return images,tf.one_hot(tf.reshape(label_batch,[batch_size]), 2)
Thanks all, I have solved this problem. It seems to be a bug in tensorflow-gpu on Windows (7/10): the function tf.one_hot() cannot execute correctly under Win7 (maybe only with tensorflow-gpu 0.12 on Win7), so we must explicitly place this op on the CPU, like this:
with tf.device('/cpu:0'):
    tf.one_hot(...)
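Applied to the get_batch function from the question, that looks roughly like this (only the one-hot op is moved to the CPU; everything else is unchanged):

def get_batch(image, label, batch_size, crop_size):
    distorted_image = tf.random_crop(image, [crop_size, crop_size, 3])
    distorted_image = tf.image.random_flip_up_down(distorted_image)
    images, label_batch = tf.train.shuffle_batch(
        [distorted_image, label], batch_size=batch_size,
        capacity=130, min_after_dequeue=100)
    # Pin only the one-hot encoding to the CPU to work around the GPU kernel issue.
    with tf.device('/cpu:0'):
        one_hot_labels = tf.one_hot(tf.reshape(label_batch, [batch_size]), 2)
    return images, one_hot_labels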
Trying to merge all my summaries, I get an error saying that the inputs of Merge/MergeSummary come from different frames. So, first of all: what is a frame? Could you please point me to the relevant part of the TF documentation? Of course, I googled a bit but could find almost nothing. How can I fix this issue? Below is the code to reproduce the error. Thanks in advance.
import numpy as np
import tensorflow as tf
tf.reset_default_graph()
tf.set_random_seed(23)
BATCH = 2
LENGTH = 4
SIZE = 5
ATT_SIZE = 3
NUM_QUERIES = 2
def linear(inputs, output_size, use_bias=True, activation_fn=None):
"""Linear projection."""
input_shape = inputs.get_shape().as_list()
input_size = input_shape[-1]
output_shape = input_shape[:-1] + [output_size]
if len(output_shape) > 2:
output_shape_tensor = tf.unstack(tf.shape(inputs))
output_shape_tensor[-1] = output_size
output_shape_tensor = tf.stack(output_shape_tensor)
inputs = tf.reshape(inputs, [-1, input_size])
kernel = tf.get_variable("kernel", [input_size, output_size])
output = tf.matmul(inputs, kernel)
if use_bias:
output = output + tf.get_variable('bias', [output_size])
if len(output_shape) > 2:
output = tf.reshape(output, output_shape_tensor)
output.set_shape(output_shape) # pylint: disable=I0011,E1101
if activation_fn is not None:
return activation_fn(output)
return output
class Attention(object):
"""Attention mechanism implementation."""
def __init__(self, attention_states, attention_size):
"""Initializes a new instance of the Attention class."""
self._states = attention_states
self._attention_size = attention_size
self._batch = tf.shape(self._states)[0]
self._length = tf.shape(self._states)[1]
self._size = self._states.get_shape()[2].value
self._features = None
def _init_features(self):
states = tf.reshape(
self._states, [self._batch, self._length, 1, self._size])
weights = tf.get_variable(
"kernel", [1, 1, self._size, self._attention_size])
self._features = tf.nn.conv2d(states, weights, [1, 1, 1, 1], "SAME")
def get_weights(self, query, scope=None):
"""Reurns the attention weights for the given query."""
with tf.variable_scope(scope or "Attention"):
if self._features is None:
self._init_features()
else:
tf.get_variable_scope().reuse_variables()
vect = tf.get_variable("Vector", [self._attention_size])
with tf.variable_scope("Query"):
query_features = linear(query, self._attention_size, False)
query_features = tf.reshape(
query_features, [-1, 1, 1, self._attention_size])
activations = vect * tf.tanh(self._features + query_features)
activations = tf.reduce_sum(activations, [2, 3])
with tf.name_scope('summaries'):
tf.summary.histogram('histogram', activations)
return tf.nn.softmax(activations)
states = tf.placeholder(tf.float32, shape=[BATCH, None, SIZE]) # unknown length
queries = tf.placeholder(tf.float32, shape=[NUM_QUERIES, BATCH, ATT_SIZE])
attention = Attention(states, ATT_SIZE)
func = lambda x: attention.get_weights(x, "Softmax")
weights = tf.map_fn(func, queries)
for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
name = var.name.replace(':', '_')
tf.summary.histogram(name, var)
summary_op = tf.summary.merge_all()
states_np = np.random.rand(BATCH, LENGTH, SIZE)
queries_np = np.random.rand(NUM_QUERIES, BATCH, ATT_SIZE)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
weights_np, summary_str = sess.run([weights, summary_op], {states: states_np, queries: queries_np})
print weights_np
The error message was indeed not user friendly. It has been updated to
ValueError: Cannot use 'map/while/summaries/histogram' as input to 'Merge/MergeSummary' because 'map/while/summaries/histogram' is in a while loop. See info log for more details.
As the new message says, the problem is that you cannot produce summaries from inside of the while loop. The frame that the original message referred to is the "execution frame" of the while loop - all the state for each iteration of the while loop is kept in a frame.
In this case, the while_loop is created by tf.map_fn and the summary inside it is tf.summary.histogram('histogram', activations).
There are a couple of ways to deal with this. You can take the summary out of get_weights, have get_weights return the activations as well, and create the summary from the activations returned by the tf.map_fn call.
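A rough sketch of that first option, assuming get_weights is changed to return the activations alongside the softmax weights and no longer creates the summary itself:

func = lambda x: attention.get_weights(x, "Softmax")
# get_weights now returns (weights, activations); tell map_fn the output dtypes.
weights, activations = tf.map_fn(func, queries, dtype=(tf.float32, tf.float32))
# The histogram is created outside the while loop built by tf.map_fn.
tf.summary.histogram('activations_histogram', activations)
summary_op = tf.summary.merge_all()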
Another approach, if NUM_QUERIES is constant and small, can be to statically unroll the loop instead of using tf.map_fn. Here is the code to do this:
# TOP PART OF THE CODE IS THE SAME
states = tf.placeholder(tf.float32, shape=[BATCH, None, SIZE]) # unknown length
queries = tf.placeholder(tf.float32, shape=[NUM_QUERIES, BATCH, ATT_SIZE])
attention = Attention(states, ATT_SIZE)
func = lambda x: attention.get_weights(x, "Softmax")
# NEW CODE BEGIN
split_queries = tf.split(queries, NUM_QUERIES)
weights = []
for query in split_queries:
weights.append(func(query))
# NEW CODE END
for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
name = var.name.replace(':', '_')
tf.summary.histogram(name, var)
summary_op = tf.summary.merge_all()
states_np = np.random.rand(BATCH, LENGTH, SIZE)
queries_np = np.random.rand(NUM_QUERIES, BATCH, ATT_SIZE)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
# NEW CODE BEGIN
results = sess.run(weights + [summary_op], {states: states_np, queries: queries_np})
weights_np, summary_str = results[:-1], results[-1]
# NEW CODE END
print weights_np
tl;dr: I input a word to my model, and am supposed to get a list of similar words and their associated measures of similarity back. I get an error: Aborted (core dumped).
My goal is to determine which words are similar to an input word, based on their feature vectors. I have an already-trained model; I load it and call two functions:
def main(argv=None):
model = NVDM(args)
sess_saver = tf.train.Saver()
sess = tf.Session()
init = tf.initialize_all_variables()
sess.run(init)
loaded = load_for_similar(sess, sess_saver) #my function
wm = word_match(sess, loaded[0], loaded[1], "bottle", loaded[2], loaded[3], topN=5)
My problem is that I can't print out the words which are similar and the associated similarity measure. I tried (in main):
sess.run(wm)
wm[0].eval(session=sess)
print(wm)
All of which gave me the error:
F tensorflow/core/kernels/strided_slice_op.cc:316] Check failed: tmp.CopyFrom(input.Slice(begin[0], end[0]), final_shape)
Aborted (core dumped)
This tells me I'm not running the session properly. What am I doing wrong?
Details on the functions, just in case:
The function 'load_for_similar' restores the weights and bias of the decoder in my model (a variational autoencoder) and normalizes them. It also inverts my vocabulary dictionary (swapping keys and values) for later use:
def load_for_similar(sess, saver_obj):
saver_obj.restore(sess, "./CA_checkpoints/saved_model.ckpt")
vocab_file = '/path/to/vocab.pkl'
t1 = loader_object(vocab_file)
v1 = t1.get_vocab()
v1_rev = {k:v for v, k in v1.iteritems()}
decoder_mat = tf.get_collection(tf.GraphKeys.VARIABLES, scope='decoder')[0]
decoder_bias = tf.get_collection(tf.GraphKeys.VARIABLES, scope='decoder')[1]
return (find_norm(decoder_mat), find_norm(decoder_bias), v1, v1_rev)
To find similar words, I pass the normalized weight matrix and bias into a new function, along with the feature vector of my word (vec):
def find_similar(sess, Weights, vec, bias):
dists = tf.add(tf.reduce_sum(tf.mul(Weights, vec)), bias)
best = argsort(sess, dists, reverse=True)
dist_sort = tf.nn.top_k(dists, k=dists.get_shape().as_list()[0], sorted=True).values
return dist_sort, best
Finally, I want to match the words that are closest to my supplied word, "bottle":
def word_match(sess, norm_mat , norm_bias, word_ , vocab, vocab_inverse , topN = 10):
idx = vocab[word_]
similarity_meas , indexes = find_similar(sess, norm_mat , norm_mat[idx], norm_bias)
words = tf.gather(vocab_inverse.keys(), indexes[:topN])
return (words, similarity_meas[:topN])
EDIT: in response to mrry's comment, here is the model (I hope this is what you wanted). This code depends on utils.py, a separate utilities file, which I include as well. Please note that this code is heavily based on Yishu Miao's and Sarath Nair's code.
class NVDM(object):
""" Neural Variational Document Model -- BOW VAE.
"""
def __init__(self,
vocab_size=15000, #was 2000
n_hidden=500,
n_topic=50,
n_sample=1,
learning_rate=1e-5,
batch_size=100, #was 64
non_linearity=tf.nn.tanh):
self.vocab_size = vocab_size
self.n_hidden = n_hidden
self.n_topic = n_topic
self.n_sample = n_sample
self.non_linearity = non_linearity
self.learning_rate = learning_rate/batch_size #CA
self.batch_size = batch_size
self.x = tf.placeholder(tf.float32, [None, vocab_size], name='input')
self.mask = tf.placeholder(tf.float32, [None], name='mask') # mask paddings
# encoder
with tf.variable_scope('encoder'):
self.enc_vec = utils.mlp(self.x, [self.n_hidden, self.n_hidden])
self.mean = utils.linear(self.enc_vec, self.n_topic, scope='mean')
self.logsigm = utils.linear(self.enc_vec,
self.n_topic,
bias_start_zero=True,
matrix_start_zero=False,
scope='logsigm')
self.kld = -0.5 * tf.reduce_sum(1 - tf.square(self.mean) + 2 * self.logsigm - tf.exp(2 * self.logsigm), 1)
self.kld = self.mask*self.kld # mask paddings
with tf.variable_scope('decoder'):
if self.n_sample ==1: # single sample
p1 = tf.cast(tf.reduce_sum(self.mask), tf.int32) #needed for random normal generation
eps = tf.random_normal((p1, self.n_topic), 0, 1)
doc_vec = tf.mul(tf.exp(self.logsigm), eps) + self.mean
logits = tf.nn.log_softmax(utils.linear(doc_vec, self.vocab_size, scope='projection'))
self.recons_loss = -tf.reduce_sum(tf.mul(logits, self.x), 1)
# multiple samples
else:
eps = tf.random_normal((self.n_sample*batch_size, self.n_topic), 0, 1)
eps_list = tf.split(0, self.n_sample, eps)
recons_loss_list = []
for i in xrange(self.n_sample):
if i > 0: tf.get_variable_scope().reuse_variables()
curr_eps = eps_list[i]
doc_vec = tf.mul(tf.exp(self.logsigm), curr_eps) + self.mean
logits = tf.nn.log_softmax(utils.linear(doc_vec, self.vocab_size, scope='projection'))
recons_loss_list.append(-tf.reduce_sum(tf.mul(logits, self.x), 1))
self.recons_loss = tf.add_n(recons_loss_list) / self.n_sample
self.objective = self.recons_loss + self.kld
optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
fullvars = tf.trainable_variables()
enc_vars = utils.variable_parser(fullvars, 'encoder')
dec_vars = utils.variable_parser(fullvars, 'decoder')
enc_grads = tf.gradients(self.objective, enc_vars)
dec_grads = tf.gradients(self.objective, dec_vars)
self.optim_enc = optimizer.apply_gradients(zip(enc_grads, enc_vars))
self.optim_dec = optimizer.apply_gradients(zip(dec_grads, dec_vars))
def minibatch_bow(it1, Instance1, n_samples, batch_size, used_ints = set()):
available = set(np.arange(n_samples)) - used_ints #
if len(available) < batch_size:
indices = np.array(list(available))
else:
indices = np.random.choice(tuple(available), batch_size, replace=False)
used = used_ints
mb = itemgetter(*indices)(it1)
batch_xs = Instance1._bag_of_words(mb, vocab_size=15000)
batch_flattened = np.ravel(batch_xs)
index_positions = np.where(batch_flattened > 0)[0]
return (batch_xs, index_positions, set(indices)) #batch_xs[0] is the bag of words; batch_xs[1] is the 0/1 word used/not;
def train(sess, model, train_file, vocab_file, saver_obj, training_epochs, alternate_epochs, batch_size):
Instance1 = testchunk_Nov23.testLoader(train_file, vocab_file)
data_set = Instance1.get_batch(batch_size) #get all minibatches of size 100
n_samples = Instance1.num_reviews()
train_batches = list(data_set) #this is an itertools.chain object
it1_train = list(itertools.chain(*train_batches)) #length is 732,356. This is all the reviews.
if len(it1_train) % batch_size != 0:
total_batch = int(len(it1_train)/batch_size) + 1
else:
total_batch = int(len(it1_train)/batch_size)
trainfilesave = "train_ELBO_and_perplexity_Dec1.txt"
#Training
train_time = time.time()
for epoch in range(training_epochs):
for switch in xrange(0, 2):
if switch == 0:
optim = model.optim_dec
print_mode = 'updating decoder'
else:
optim = model.optim_enc
print_mode = 'updating encoder'
with open(trainfilesave, 'w') as f:
for i in xrange(alternate_epochs):
loss_sum = 0.0
kld_sum = 0.0
word_count = 0
used_indices = set()
for idx_batch in range(total_batch): #train_batches:
mb = minibatch_bow(it1_train, Instance1, n_samples, batch_size, used_ints=used_indices)
print('minibatch', idx_batch)
used_indices.update(mb[2])
num_mb = np.ones(mb[0][0].shape[0])
input_feed = {model.x.name: mb[0][0], model.mask: num_mb}
_, (loss, kld) = sess.run((optim,[model.objective, model.kld]) , input_feed)
loss_sum += np.sum(loss)
And the utils.py file:
def linear(inputs,
output_size,
no_bias=False,
bias_start_zero=False,
matrix_start_zero=False,
scope=None):
"""Define a linear connection."""
with tf.variable_scope(scope or 'Linear'):
if matrix_start_zero:
matrix_initializer = tf.constant_initializer(0)
else:
matrix_initializer = None
if bias_start_zero:
bias_initializer = tf.constant_initializer(0)
else:
bias_initializer = None
input_size = inputs.get_shape()[1].value
matrix = tf.get_variable('Matrix', [input_size, output_size],
initializer=matrix_initializer)
bias_term = tf.get_variable('Bias', [output_size],
initializer=bias_initializer)
output = tf.matmul(inputs, matrix)
if not no_bias:
output = output + bias_term
return output
def mlp(inputs,
mlp_hidden=[],
mlp_nonlinearity=tf.nn.tanh,
scope=None):
"""Define an MLP."""
with tf.variable_scope(scope or 'Linear'):
mlp_layer = len(mlp_hidden)
res = inputs
for l in xrange(mlp_layer):
res = mlp_nonlinearity(linear(res, mlp_hidden[l], scope='l'+str(l)))
return res