fit_generator in Keras -- One minibatch at a time - python

I have a model that operates on a large dataset - not all that large by Big Data standards but significantly more than my home server can hold in memory. As such, I'm using fit_generator to load it a chunk at a time, so that it never has to hold more than one minibatch in memory at once.
... At least, that's the theory. But when Keras hung at Epoch 1/10 without even starting the "training" animation and I (eventually) got an Out of Memory exception -- the minibatches are large but I can still hold a couple of them in memory at once without trouble -- I got suspicious and threw in a bunch of testing print statements into my generator. Lo and behold, Keras was invoking the generator three or four times before even starting (appearing to start?).
So... what's going on here? Is this normal, or did I implement my generator wrong somehow? How can I get it not to try and load more than one batch at once?
Code follows, in case it helps:
def data_gen(directory):
    """Endlessly yield one batch per file found under ``directory + "/data"``.

    On every pass the data directory is listed again and its files are visited
    in a fresh random order. For each file, the same-named file under
    ``directory + "/labels"`` is parsed as well. Progress markers are printed
    after each stage.

    Yields:
        ``[[l, r], [mask_l, mask_r]]`` where ``l``/``r`` come from the data
        file and each mask is the epsilon-stabilised ratio label / data.
    """
    def epsilon_div(x, y):
        # Offsetting both operands by K.epsilon() avoids dividing by zero.
        return (x + K.epsilon()) / (y + K.epsilon())

    axes = [0, 2, 1]  # swap the last two axes of every extracted array
    while True:
        names = os.listdir(directory + "/data")
        for idx in np.random.permutation(len(names)):
            name = names[idx]
            with open(directory + "/data/" + name, 'rb') as handle:
                databook = sb.Songbook.FromString(handle.read())
            with open(directory + "/labels/" + name, 'rb') as handle:
                labelbook = sb.Songbook.FromString(handle.read())
            print('Booked')

            l, _, r, _ = sb_np_extract(databook)
            ll, _, lr, _ = sb_np_extract(labelbook)
            # Drop the parsed books so they are not kept alive while waiting
            # at the yield below.
            databook = labelbook = None
            print('Extracted')

            l, r, ll, lr = [arr.transpose(axes) for arr in (l, r, ll, lr)]
            print('Chosen')

            mask_l = epsilon_div(ll, l)
            mask_r = epsilon_div(lr, r)
            print('Done')
            yield [[l, r], [mask_l, mask_r]]

I suggest you save the filenames in an array then shuffle it. I call the file list with the full path and shuffled listname_data and listname_labels.
steps = number of files
def generator(steps):
    """Endlessly yield batches, reshuffling the file lists after each full pass.

    Reads the module-level sequences ``listname_data`` and ``listname_labels``
    (full paths, same length, already shuffled) and rebinds both after every
    ``steps`` files.

    Args:
        steps: number of files to visit before reshuffling (the number of
            files in each list).

    Yields:
        ``[[l, r], [mask_l, mask_r]]`` built from the current file pair.
    """
    # Both lists are rebound below, so they must be declared global; without
    # this the first read would raise UnboundLocalError.
    global listname_data, listname_labels
    i = 0  # 0-based so the first file is not skipped and the last is in range
    while True:
        dataf = listname_data[i]
        labelf = listname_labels[i]
        ...  # load dataf / labelf and build l, r, mask_l, mask_r (elided in the post)
        i += 1
        if i == steps:
            i = 0
            # Shuffle data and label paths together so the pairs stay aligned.
            # NOTE(review): `shuffle` is presumably random.shuffle - confirm.
            c = list(zip(listname_data, listname_labels))
            shuffle(c)
            listname_data, listname_labels = zip(*c)
        yield [[l, r], [mask_l, mask_r]]

Related

Using FOR loop to initialise weights using TensorFlow

I am facing the following problem. I want to have a function that given the number of points in each hidden layer, creates the weights for a simple NN.
def initialize_parameters(hidden):
    """Build the ``W<k>`` / ``b<k>`` TensorFlow variables for a simple network.

    Args:
        hidden: sequence of layer sizes; variable ``k`` connects
            ``hidden[k-1]`` to ``hidden[k]``.

    Returns:
        dict mapping ``'W1'``, ``'b1'``, ``'W2'``, ... to the created variables.
    """
    def make_weight(rows, cols, idx):
        return tf.get_variable(
            "W" + str(idx + 1),
            [rows, cols],
            initializer=tf.contrib.layers.xavier_initializer(seed=1),
        )

    def make_bias(rows, idx):
        return tf.get_variable(
            'b' + str(idx + 1),
            [rows, 1],
            initializer=tf.zeros_initializer(),
        )

    parameters = {}
    # As posted, the loop covers every index of `hidden`, so the final
    # iteration reads hidden[l + 1] one past the end and raises IndexError.
    for l in range(len(hidden)):
        suffix = str(l + 1)
        parameters['W' + suffix] = make_weight(hidden[l + 1], hidden[l], l)
        parameters['b' + suffix] = make_bias(hidden[l + 1], l)
    return parameters
I call this function then using
# Start from an empty graph, create the parameters and print the first two layers.
tf.reset_default_graph()
with tf.Session() as sess:
    # NOTE(review): as posted, the call passes no `hidden` argument although
    # initialize_parameters requires one - confirm the intended layer sizes.
    parameters = initialize_parameters()
    for key in ("W1", "b1", "W2", "b2"):
        print(key + " = " + str(parameters[key]))
to check everything is fine and an IndexError is raised.
29 for l in range(len(hidden)):
---> 30 W = W_creator(hidden[l+1],hidden[l],l)
31 parameters['W'+str(l+1)] = W
32
IndexError: list index out of range
Can anyone help with this?
As you use hidden[l+1], your loop has to stop one step earlier, to avoid "out of range" IndexError:
for l in range(len(hidden)-1):

Why is Z3 slow for tiny search space?

I'm trying to make a Z3 program (in Python) that generates boolean circuits that do certain tasks (e.g. adding two n-bit numbers) but the performance is terrible to the point where a brute-force search of the entire solution space would be faster. This is my first time using Z3 so I could be doing something that impacts my performance, but my code seems fine.
The following is copied from my code here:
from z3 import *
BITLEN = 1  # Number of bits in input
STEPS = 1  # How many steps to take (e.g. time)
WIDTH = 2  # How many operations/values can be stored in parallel, has to be at least BITLEN * #inputs

# Input variables
x = BitVec('x', BITLEN)
y = BitVec('y', BITLEN)

# Define operations used: binary operators first, then each unary operator
# wrapped so it accepts (and ignores) a second operand.
op_list = [BitVecRef.__and__, BitVecRef.__or__, BitVecRef.__xor__, BitVecRef.__xor__]
unary_op_list = [BitVecRef.__invert__]
for uop in unary_op_list:
    op_list.append(lambda x, y: uop(x))


def chooseFunc(i, x, y):
    """Choose the operation with index ``i`` by setting all others to 0.

    Every entry of ``op_list`` is applied to ``(x, y)``, multiplied by the
    condition ``ind == i``, and the products are summed starting from 0.
    """
    return sum((ind == i) * op(x, y) for ind, op in enumerate(op_list))


s = Solver()


def _bounded_int(name, upper):
    """Create the Int ``name`` and constrain it to ``0 <= name < upper``."""
    var = Int(name)
    s.add(var >= 0, var < upper)
    return var


# First step is just the bits of the input padded with constants
firststep = Array("firststep", IntSort(), BitVecSort(1))
for i in range(BITLEN):
    firststep = Store(firststep, i * 2, Extract(i, i, x))
    firststep = Store(firststep, i * 2 + 1, Extract(i, i, y))
for i in range(BITLEN * 2, WIDTH):
    firststep = Store(firststep, i, BitVec("const_0_%d" % i, 1))
steps = [firststep]

# Generate remaining steps: each slot of a step picks one operation and two
# slots of the previous step to feed it.
for i in range(1, STEPS + 1):
    this_step = Array("step_%d" % i, IntSort(), BitVecSort(1))
    last_step = steps[-1]
    for j in range(WIDTH):
        func_ind = _bounded_int("func_%d_%d" % (i, j), len(op_list))
        x_ind = _bounded_int("x_%d_%d" % (i, j), WIDTH)
        y_ind = _bounded_int("y_%d_%d" % (i, j), WIDTH)
        node = chooseFunc(func_ind, Select(last_step, x_ind), Select(last_step, y_ind))
        this_step = Store(this_step, j, node)
    steps.append(this_step)

# Set the result to the first BITLEN bits of the last step
final_step = steps[-1]
result = (
    Select(final_step, 0)
    if BITLEN == 1
    else Concat(*[Select(final_step, i) for i in range(BITLEN)])
)

# Set goal
goal = x | y
s.add(ForAll([x, y], goal == result))

print(s)
print(s.check())
print(s.model())
The code basically lays out the inputs as individual bits, then at each "step" one of 5 boolean functions can operate on the values from the previous step, where the final step represents the end result.
In this example, I generate a circuit to calculate the boolean OR of two 1-bit inputs, and an OR function is available in the circuit, so the solution is trivial.
I have a solution space of only 5*5*2*2*2*2=400:
5 Possible functions (two function nodes)
2 Inputs for each function, each of which has two possible values
This code takes a few seconds to run and provides a correct answer, but I feel like it should run instantaneously as there are only 400 possible solutions, of which quite a few are valid. If I increase the inputs to be two bits long, the solution space has a size of (5^4)*(4^8)=40,960,000 and never finishes on my computer, though I feel this should be easily doable with Z3.
I also tried effectively the same code but substituted Arrays/Store/Select for Python lists and "selected" the variables by using the same trick I used in chooseFunc(). The code is here and it runs in around the same time the original code does, so no speedup.
Am I doing something that would drastically slow down the solver? Thanks!
You have a duplicated __xor__ in your op_list; but that's not really the major problem. The slowdown is inevitable as you increase bit-size, but on a first look you can (and should) avoid mixing integer reasoning with booleans here. I'd code your chooseFunc as follows:
def chooseFunc(i, x, y):
    """Select the result of operation number ``i`` applied to ``(x, y)``.

    Builds a nested ``If`` chain over ``op_list``: each level returns its own
    operation's result when its position equals ``i`` and otherwise falls
    through to the chain built so far, which starts at ``False``.
    """
    selected = False
    for position, operation in enumerate(op_list):
        selected = If(position == i, operation(x, y), selected)
    return selected
See if that improves run-times in any meaningful way. If not, the next thing to do would be to get rid of arrays as much as possible.

Numpy Array Manipulation Based off of Internal Values

I am trying to accomplish a weird task.
I need to complete the following without the use of sklearn, and preferably with numpy:
Given a dataset, split the data into 5 equal "folds", or partitions
Within each partition, split the data into a "training" and "testing" set, with an 80/20 split
Here is the catch: Your dataset is labeled for classes. So take for example a dataset with 100 instances, and class A with 33 samples and class B with 67 samples. I should create 5 folds of 20 data instances, where in each fold, class A has something like 6 or 7 (1/3) values and class B has the rest
My issue is that:
I do not know how to properly return a test and training set for each fold, despite being able to split it appropriately, and, more important, I do not know how to incorporate the proper division of # of elements per class.
My current code is here. It is commented where I am stuck:
import numpy
def csv_to_array(file):
    """Load a comma-separated file of numbers into a NumPy float array.

    Args:
        file: path of the CSV file to read.

    Returns:
        The array produced by ``numpy.loadtxt``.

    Raises:
        ValueError: raised by ``numpy.loadtxt`` when a field is not numeric.
    """
    # loadtxt already converts every field to float, so no per-row conversion
    # is needed. The old loop also replaced the values of 1-D input (a single
    # row or column) with 0, because iterating a scalar raised inside its
    # catch-all handler. The `with` block closes the file.
    with open(file, 'r') as handle:
        return numpy.loadtxt(handle, delimiter=',')
def five_cross_fold_validation(dataset):
    """Shuffle ``dataset`` and cut it into five folds, each split 80/20.

    Args:
        dataset: NumPy array with one sample per row. The rows are shuffled
            in place.

    Returns:
        A list of five ``[training, testing]`` pairs. The folds are
        consecutive slices of the shuffled array (the last one absorbs any
        remainder); within a fold the first 80% of the rows form the training
        part and the rest the testing part.

    Note:
        Class balance relies on the random shuffle only; no explicit
        per-class stratification is performed.
    """
    numpy.random.shuffle(dataset)
    num_rows = dataset.shape[0]
    fold_size = num_rows // 5
    # Fold boundaries: four equal cuts, then everything left over.
    bounds = [fold_size * k for k in range(5)] + [num_rows]

    folds = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        fold = dataset[start:stop]
        split_mark = int(fold.shape[0] * .8)
        # ndarrays support neither `del` nor `append`, so each pair is
        # collected in a new list instead of rewriting the fold in place.
        folds.append([fold[:split_mark], fold[split_mark:]])
    return folds


def confirm_size(folds):
    """Return the total number of rows held by ``folds``.

    A fold given as a list/tuple of arrays (such as a ``[training, testing]``
    pair) contributes the rows of all its parts; any other fold contributes
    ``len(fold)``.
    """
    total = 0
    for fold in folds:
        is_split = isinstance(fold, (list, tuple)) and all(
            isinstance(part, numpy.ndarray) for part in fold
        )
        if is_split:
            total += sum(len(part) for part in fold)
        else:
            total += len(fold)
    return total
def main():
    """Load the ecoli CSV, build the folds and print the size checks."""
    print("BEGINNING CFV")
    ecoli = csv_to_array('Classification/ecoli.csv')
    print(len(ecoli))
    # Printed so the fold total can be compared with the row count above.
    print(confirm_size(five_cross_fold_validation(ecoli)))
main()
Additionally, for reference, I have attached my csv I am working with (it is a modification of the UCI Ecoli Dataset.) The classes here are the values in the last column. So 0, 1, 2, 3, 4. It is important to note that there are not equal amounts of each class.
0.61,0.45,0.48,0.5,0.48,0.35,0.41,0
0.17,0.38,0.48,0.5,0.45,0.42,0.5,0
0.44,0.35,0.48,0.5,0.55,0.55,0.61,0
0.43,0.4,0.48,0.5,0.39,0.28,0.39,0
0.42,0.35,0.48,0.5,0.58,0.15,0.27,0
0.23,0.33,0.48,0.5,0.43,0.33,0.43,0
0.37,0.52,0.48,0.5,0.42,0.42,0.36,0
0.29,0.3,0.48,0.5,0.45,0.03,0.17,0
0.22,0.36,0.48,0.5,0.35,0.39,0.47,0
0.23,0.58,0.48,0.5,0.37,0.53,0.59,0
0.47,0.47,0.48,0.5,0.22,0.16,0.26,0
0.54,0.47,0.48,0.5,0.28,0.33,0.42,0
0.51,0.37,0.48,0.5,0.35,0.36,0.45,0
0.4,0.35,0.48,0.5,0.45,0.33,0.42,0
0.44,0.34,0.48,0.5,0.3,0.33,0.43,0
0.44,0.49,0.48,0.5,0.39,0.38,0.4,0
0.43,0.32,0.48,0.5,0.33,0.45,0.52,0
0.49,0.43,0.48,0.5,0.49,0.3,0.4,0
0.47,0.28,0.48,0.5,0.56,0.2,0.25,0
0.32,0.33,0.48,0.5,0.6,0.06,0.2,0
0.34,0.35,0.48,0.5,0.51,0.49,0.56,0
0.35,0.34,0.48,0.5,0.46,0.3,0.27,0
0.38,0.3,0.48,0.5,0.43,0.29,0.39,0
0.38,0.44,0.48,0.5,0.43,0.2,0.31,0
0.41,0.51,0.48,0.5,0.58,0.2,0.31,0
0.34,0.42,0.48,0.5,0.41,0.34,0.43,0
0.51,0.49,0.48,0.5,0.53,0.14,0.26,0
0.25,0.51,0.48,0.5,0.37,0.42,0.5,0
0.29,0.28,0.48,0.5,0.5,0.42,0.5,0
0.25,0.26,0.48,0.5,0.39,0.32,0.42,0
0.24,0.41,0.48,0.5,0.49,0.23,0.34,0
0.17,0.39,0.48,0.5,0.53,0.3,0.39,0
0.04,0.31,0.48,0.5,0.41,0.29,0.39,0
0.61,0.36,0.48,0.5,0.49,0.35,0.44,0
0.34,0.51,0.48,0.5,0.44,0.37,0.46,0
0.28,0.33,0.48,0.5,0.45,0.22,0.33,0
0.4,0.46,0.48,0.5,0.42,0.35,0.44,0
0.23,0.34,0.48,0.5,0.43,0.26,0.37,0
0.37,0.44,0.48,0.5,0.42,0.39,0.47,0
0,0.38,0.48,0.5,0.42,0.48,0.55,0
0.39,0.31,0.48,0.5,0.38,0.34,0.43,0
0.3,0.44,0.48,0.5,0.49,0.22,0.33,0
0.27,0.3,0.48,0.5,0.71,0.28,0.39,0
0.17,0.52,0.48,0.5,0.49,0.37,0.46,0
0.36,0.42,0.48,0.5,0.53,0.32,0.41,0
0.3,0.37,0.48,0.5,0.43,0.18,0.3,0
0.26,0.4,0.48,0.5,0.36,0.26,0.37,0
0.4,0.41,0.48,0.5,0.55,0.22,0.33,0
0.22,0.34,0.48,0.5,0.42,0.29,0.39,0
0.44,0.35,0.48,0.5,0.44,0.52,0.59,0
0.27,0.42,0.48,0.5,0.37,0.38,0.43,0
0.16,0.43,0.48,0.5,0.54,0.27,0.37,0
0.06,0.61,0.48,0.5,0.49,0.92,0.37,1
0.44,0.52,0.48,0.5,0.43,0.47,0.54,1
0.63,0.47,0.48,0.5,0.51,0.82,0.84,1
0.23,0.48,0.48,0.5,0.59,0.88,0.89,1
0.34,0.49,0.48,0.5,0.58,0.85,0.8,1
0.43,0.4,0.48,0.5,0.58,0.75,0.78,1
0.46,0.61,0.48,0.5,0.48,0.86,0.87,1
0.27,0.35,0.48,0.5,0.51,0.77,0.79,1
Edit I replaced np.random.shuffle(A) by A = np.random.permutation(A), the only difference is that it doesn't mutate the input array. This doesn't make any difference in this code, but it is safer in general.
The idea is to randomly sample the input by using numpy.random.permutation. Once the rows are shuffled we just need to iterate over all the possible tests sets (sliding window of the desired size, here 20% of the input size). The corresponding training sets are just composed of all remaining elements.
This will preserve the original classes distribution on all subsets even though we pick them in order because we shuffled the input.
The following code iterates over the test/train set combinations:
import numpy as np
def csv_to_array(file):
    """Read the comma-separated file at ``file`` and return it as a NumPy array."""
    with open(file, 'r') as handle:
        return np.loadtxt(handle, delimiter=',')
def classes_distribution(A):
    """Print the class distributions of array A.

    The class label of each row is read from the last column. One line is
    printed per distinct label, in ascending label order, giving the fraction
    of rows carrying that label.

    Args:
        A: 2-D NumPy array whose last column holds the class labels.
    """
    # Count the labels that actually occur, so label sets that are not
    # exactly 0..k-1 are reported correctly.
    labels, counts = np.unique(A[:, -1], return_counts=True)
    total_size = A.shape[0]
    for label, count in zip(labels, counts):
        value = float(label)
        # Whole-number labels print as "class_0" rather than "class_0.0".
        shown = int(value) if value.is_integer() else value
        class_p = count / total_size
        print(f"\t P(class_{shown}) = {class_p:.3f}")
def random_samples(A, test_set_p=0.2):
    """Yield successive test/training splits of a shuffled copy of A.

    The rows of A are permuted once (A itself is not modified). A window of
    ``int(test_set_p * rows)`` rows, at least one, then slides over the
    shuffled array: each window is a test set and all remaining rows form the
    matching training set. Iteration stops once every row has been part of a
    test set exactly once. The last window is shorter when the row count is
    not a multiple of the window size.

    Args:
        A: 2-D NumPy array with one sample per row.
        test_set_p: fraction of the rows to use for each test set.

    Yields:
        dict with the keys ``"test"`` and ``"train"``.
    """
    A = np.random.permutation(A)
    # Very small inputs would give a window of 0 rows, and range() rejects a
    # step of 0, so the window is kept at one row minimum.
    sample_size = max(1, int(test_set_p * A.shape[0]))
    for start in range(0, A.shape[0], sample_size):
        end = start + sample_size
        yield {
            "test": A[start:end],
            "train": np.concatenate((A[:start], A[end:]), axis=0),
        }
def main():
    """Load ``ecoli.csv`` and print class distributions for every training set."""
    ecoli = csv_to_array('ecoli.csv')
    print("Input set shape: ", ecoli.shape)

    print("Input set class distribution:")
    classes_distribution(ecoli)

    print("Training sets class distributions:")
    for split in random_samples(ecoli):
        test_set, training_set = split["test"], split["train"]
        classes_distribution(training_set)
        print("---")
        # ... Do what ever with these two sets
main()
It produces an output of the form:
Input set shape: (169, 8)
Input set class distribution:
P(class_0) = 0.308
P(class_1) = 0.213
P(class_2) = 0.207
P(class_3) = 0.118
P(class_4) = 0.154
Training sets class distributions:
P(class_0) = 0.316
P(class_1) = 0.206
P(class_2) = 0.199
P(class_3) = 0.118
P(class_4) = 0.162
...

Proper way to stop a TensorFlow Dataset `from_generator`?

I would like to use a TensorFlow Dataset built with from_generator to access a formatted file. Most everything works except I don't know how to stop the Dataset iterator when the generator runs out of data (the generator just returns empty lists forever when you go out of range).
My actual code is very complex, but I can mock up the situation with this short program:
import tensorflow as tf
def make_batch_generator_fn(batch_size=10, dset_size=100):
    """Return a generator function producing ``(features, targets)`` batches.

    Features are ``range(dset_size)`` and targets are the same values shifted
    up by one. The generator never terminates: once the window moves past the
    end of the data it keeps yielding empty slices.
    """
    feats = range(dset_size)
    targs = range(1, dset_size + 1)

    def batch_generator_fn():
        lo = 0
        while True:
            hi = lo + batch_size
            # if hi > dset_size: --- stop action?
            yield feats[lo:hi], targs[lo:hi]
            lo = hi

    return batch_generator_fn
def test(batch_size=10):
    """Pull batches from a generator-backed Dataset and print them.

    Stops after 16 batches, or earlier when the iterator raises
    ``tf.errors.OutOfRangeError``, in which case the batch count is reported.
    """
    ds = tf.data.Dataset.from_generator(
        make_batch_generator_fn(batch_size),
        (tf.int32, tf.int32),
        (tf.TensorShape([None]), tf.TensorShape([None])),
    )
    feats, targs = ds.make_one_shot_iterator().get_next()
    with tf.Session() as sess:
        counter = 0
        try:
            # Hard cap, because the generator itself may never signal the end.
            while counter <= 15:
                f, t = sess.run([feats, targs])
                print(f, t)
                counter += 1
        except tf.errors.OutOfRangeError:
            print('end of dataset at counter = {}'.format(counter))
if __name__ == '__main__':
test()
If I know the number of records in advance, I can tune the number of batches, but I don't always know. I've tried putting some code in the snippet above where I have a comment line like stop action?. In particular, I've tried raising an IndexError, but TensorFlow doesn't like this, even if I explicitly catch it in my execution code. I also tried raising a tf.errors.OutOfRangeError, but I'm not sure how to instantiate it: the constructor requires three arguments - 'node_def', 'op', and 'message', and I'm not quite sure what to use for 'node_def' and 'op' in general.
I'd appreciate any thoughts or comments on this issue. Thanks!
Return when you meet your stop criteria:
def make_batch_generator_fn(batch_size=10, dset_size=100):
    """Return a generator function producing ``(features, targets)`` batches.

    Features are ``range(dset_size)`` and targets are the same values shifted
    up by one. The generator finishes (raising ``StopIteration`` to its
    consumer) once all data has been yielded. When ``dset_size`` is not a
    multiple of ``batch_size`` the final batch is shorter; it is not dropped.

    Args:
        batch_size: number of items per batch; must be non-zero, since
            ``range`` rejects a step of 0.
        dset_size: total number of items.
    """
    feats, targs = range(dset_size), range(1, dset_size + 1)

    def batch_generator_fn():
        # Stopping on the start index (not the stop index) keeps the trailing
        # partial batch; falling off the end of the loop ends the generator.
        for start_idx in range(0, dset_size, batch_size):
            stop_idx = start_idx + batch_size
            yield feats[start_idx:stop_idx], targs[start_idx:stop_idx]

    return batch_generator_fn
This is in line with the behavior specified in the Python 3 documentation:
In a generator function, the return statement indicates that the generator is done and will cause StopIteration to be raised. The returned value (if any) is used as an argument to construct StopIteration and becomes the StopIteration.value attribute.
It works with the following lines:
dataset_size = your dataset size
batch_size = your batch size
dataset = your tf.data.Dataset
steps_per_epoch = dataset_size // batch_size
for data, _ in zip(dataset, range(steps_per_epoch)):
# your train_step
The iteration will stop when it's through.

Implementing a simple gaussian naive bayes algorithm in python

So im a real amateur, trying to implement something you may call a sort of 'simplified' version of the naive bayes algorithm in python, and seem to have a lot of trouble [the reason for which is perhaps the fact that im not too sure i completely understand the way the algorithm works..]. I would appreciate any help/suggestions very much though. This is the code I have:
class GaussianNB(object):
    """Simplified Gaussian naive Bayes classifier with equal class priors.

    Samples are ``(class, features)`` pairs in which ``features`` is an
    iterable of ``(feature_name, value)`` pairs.
    """

    def __init__(self):
        """Create an untrained model."""
        # Number of training samples seen per class.
        self.classmodels_count = {}
        # class -> feature -> list of observed values.
        self.classmodels = {}
        # class -> feature -> result of findMeanSD over the observed values;
        # i.e. one distribution per class for each feature variable.
        self.classmodelsMeanAndVariance = {}
        self.featureTokenCount = 0
        self.featureTypeCount = 0

    def train(self, trainingdata):
        """Collect per-class feature values and summarise them.

        Args:
            trainingdata: iterable of ``(class, features)`` samples.
        """
        for sample in trainingdata:
            current_class = sample[0]
            features = sample[1]

            # `in` replaces dict.has_key, which no longer exists in Python 3.
            if current_class in self.classmodels:
                current_class_model = self.classmodels[current_class]
                self.classmodels_count[current_class] += 1
            else:
                current_class_model = {}
                self.classmodels_count[current_class] = 1

            for f in features:
                feature, value = f[0], f[1]
                current_class_model.setdefault(feature, []).append(value)
            self.classmodels[current_class] = current_class_model

        for a_class, a_class_model in self.classmodels.items():
            # NOTE(review): findMeanSD is defined outside this block; judging
            # by how classify() indexes its result it returns (mean, std) -
            # confirm.
            self.classmodelsMeanAndVariance[a_class] = {
                feature: findMeanSD(np.array(values))
                for feature, values in a_class_model.items()
            }

    def classify(self, testing_vecs):
        """Return the highest-scoring class for each vector.

        Args:
            testing_vecs: iterable of samples; only element ``[1]`` (the
                feature/value pairs) of each sample is read.

        Returns:
            list with one predicted class per input vector.

        Raises:
            IndexError: if a vector is classified before any class was trained.
        """
        outputs = []
        for vec in testing_vecs:
            features = vec[1]
            # Maps score -> class; classes with an identical score overwrite
            # each other, as in the original implementation.
            class_model_output_prob = {}
            for a_class, stats in self.classmodelsMeanAndVariance.items():
                a_class_output_prob = 0.0
                for feature_value in features:
                    feature, value = feature_value[0], feature_value[1]
                    # simply ignore a feature if its not seen in training
                    if feature in stats:
                        feature_mean = stats[feature][0]
                        feature_std = stats[feature][1]
                        # NOTE(review): `norm` is defined outside this block;
                        # presumably it returns the Gaussian density of
                        # `value` - confirm.
                        a_class_output_prob += math.log10(
                            norm(value, feature_mean, feature_std)
                        )
                # ignoring P(class) prior.. assuming equal priors
                class_model_output_prob[a_class_output_prob] = a_class

            # keys() is a view in Python 3, so copy it before sorting; the
            # single-argument print() prints the same text on Python 2 and 3.
            probs = list(class_model_output_prob.keys())
            print(probs)
            probs.sort()
            max_prob = probs[-1]
            outputs.append(class_model_output_prob[max_prob])
        return outputs
When running on some data, the error I get is
Traceback (most recent call last):
File "C:\Users\Toshiba\workspace\Assignment6\src\gnb_test.py", line 34, in
gaussian = Model.train(testData)
File "C:\Users\Toshiba\workspace\Assignment6\src\gnb.py", line 91, in train
for f in features:
TypeError: 'numpy.float64' object is not iterable
And I dont really [at all] understand what it means
Your traceback suggests that the problem is that you are trying to iterate through features, but features is a float and not a list or tuple - basically, it can't be broken into individual elements. I think it is a float because the lines
for i in trainingdata:
current_class = i[0]
features = i[1]
suggest features keeps getting rewritten as a successive series of numbers, when what you seem to want is to save the numbers into an iterable type. Try
features = []
for i in trainingdata:
current_class = i[0]
features.append(i[1])

Categories