filtering "empty" values from Tensorflow - python

I wrote this code to keep only the values from a Dataset that are <= 6.
import tensorflow as tf
import tensorflow.contrib.data as ds

def make_graph():
    inits = []
    filter_value = tf.constant([6], dtype=tf.int64)
    source = ds.Dataset.range(10)
    batched = source.batch(3)
    batched_iter = batched.make_initializable_iterator()
    batched_next = batched_iter.get_next()
    inits.append(batched_iter.initializer)
    predicate = tf.less_equal(batched_next, filter_value, name="less_than_filter")
    true_coordinates = tf.where(predicate)
    reshaped = tf.reshape(true_coordinates, [-1])
    # need to turn bools into 1 and 0 elsewhere
    found = tf.gather(params=batched_next, indices=reshaped)
    return found, inits  # prepend final tensor
def run_graph(final_tensor, initializers, rounds):
    with tf.Session() as sess:
        init_ops = (tf.local_variables_initializer(), tf.global_variables_initializer())
        sess.run(init_ops)
        summary_writer = tf.summary.FileWriter(graph=sess.graph, logdir=".")
        while rounds > 0:
            for i in initializers:
                sess.run(i)
            try:
                while True:
                    final_result = sess.run(final_tensor)
                    print("Got result: {r}".format(r=final_result))
            except tf.errors.OutOfRangeError:
                print("Got out of range error")
            rounds -= 1
        summary_writer.flush()

def run():
    final_tensor, initializers = make_graph()
    run_graph(final_tensor=final_tensor,
              initializers=initializers,
              rounds=1)

if __name__ == "__main__":
    run()
However, the results are as follows:
Got result: [0 1 2]
Got result: [3 4 5]
Got result: [6]
Got result: []
Got out of range error
Is there a way to filter out this empty Tensor? I tried to brainstorm ways to do this, maybe with a tf.while_loop, but I'm not sure whether I'm missing something or whether such an operation (i.e. an OpKernel "dropping" an input by not producing output based on its value) is not possible in TensorFlow.

Keeping only values <= 6 BEFORE batching:
dataset = ds.Dataset.range(10)
dataset = dataset.filter(lambda v: v <= 6)
dataset = dataset.batch(3)
batched_iter = dataset.make_initializable_iterator()
This will generate batches containing only the data you want. Note that it's generally better to filter out the unwanted data before building the batches. This way, empty tensors will not be generated by the iterator.
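For comparison, here is a minimal sketch of the same pipeline under TF 2.x eager execution (an assumption; the question uses the TF 1.x contrib API). With the filter applied before batching, the iterator never yields an empty tensor:
import tensorflow as tf  # sketch assumes TF 2.x eager execution

dataset = tf.data.Dataset.range(10).filter(lambda v: v <= 6).batch(3)
for batch in dataset:
    print(batch.numpy())  # prints [0 1 2], then [3 4 5], then [6] -- no empty batch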

Related

PyTorch expects each tensor to be equal size

When running this code: embedding_matrix = torch.stack(embeddings)
I got this error:
RuntimeError: stack expects each tensor to be equal size, but got [7, 768] at entry 0 and [8, 768] at entry 1
I'm trying to get embeddings using BERT via:
split_sent = sent.split()
tokens_embedding = []
j = 0
for full_token in split_sent:
    curr_token = ''
    x = 0
    for i, _ in enumerate(tokenized_sent[1:]):
        token = tokenized_sent[i + j]
        piece_embedding = bert_embedding[i + j]
        if token == full_token and curr_token == '':
            tokens_embedding.append(piece_embedding)
            j += 1
            break
sent_embedding = torch.stack(tokens_embedding)
embeddings.append(sent_embedding)
embedding_matrix = torch.stack(embeddings)
Does anyone know how I can fix this?
As per the PyTorch docs, the torch.stack() function needs the input tensors to have the same shape. I don't know how you will be using the embedding_matrix, but you can either pad your tensors (append zeros at the end up to a certain user-defined length; recommended if you will train with this stacked tensor, refer to this tutorial) to make them equidimensional, or simply use something like torch.cat(data, dim=0).
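A rough sketch of both options, using the tensor sizes from the error message (torch.nn.utils.rnn.pad_sequence is one common way to do the padding; the tensors here are random placeholders):
import torch
from torch.nn.utils.rnn import pad_sequence

# two hypothetical sentence embeddings with unequal token counts
a = torch.randn(7, 768)
b = torch.randn(8, 768)

padded = pad_sequence([a, b], batch_first=True)  # zero-pads a -> shape [2, 8, 768]
concatenated = torch.cat([a, b], dim=0)          # no padding   -> shape [15, 768]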

Update Tensor at index in a tf.while_loop

OK, so I want to replicate this PyTorch code, keep[count] = i, in TensorFlow. This is happening within a tf.while_loop, and both count and i are tensors.
keep = tf.Variable(tf.zeros(tf.size(scores), tf.int64))
count = 0
idx = tf.argsort(scores, axis=0)  # scores.sort(0): sort in ascending order
idx = idx[-top_k:]
...

def loop_body(idx, keep, count, ...):
    i = idx[-1]  # index of current largest val
    keep = tf.scatter_update(keep, count, i)

tf.while_loop(loop_cond, loop_body, loop_vars)
This is what I'm trying to do: keep = tf.scatter_update(keep, count, i). Unfortunately, it raises AttributeError: 'Tensor' object has no attribute '_lazy_read'.
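The error likely occurs because tf.while_loop passes its loop variables around as plain tensors, while tf.scatter_update expects a mutable tf.Variable. One possible workaround is the functional scatter op, which returns a new tensor instead of mutating a variable. This is only a sketch, assuming a TF version that provides tf.argsort and tf.tensor_scatter_nd_update (roughly TF 1.14+/2.x), with made-up scores:
import tensorflow as tf  # sketch assumes TF 1.14+ / 2.x

scores = tf.constant([0.3, 0.9, 0.1, 0.7])  # hypothetical scores
top_k = 2
idx = tf.argsort(scores, axis=0)  # ascending order, as in the question
idx = idx[-top_k:]

keep = tf.zeros_like(scores, dtype=tf.int32)

def loop_cond(keep, count):
    return count < top_k

def loop_body(keep, count):
    i = idx[top_k - 1 - count]  # index of the current largest value
    # functional equivalent of keep[count] = i: returns a new tensor
    keep = tf.tensor_scatter_nd_update(keep,
                                       tf.reshape(count, [1, 1]),
                                       tf.reshape(i, [1]))
    return keep, count + 1

keep, count = tf.while_loop(loop_cond, loop_body, [keep, tf.constant(0)])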

Numpy Array Manipulation Based off of Internal Values

I am trying to accomplish a weird task.
I need to complete the following without the use of sklearn, and preferably with numpy:
Given a dataset, split the data into 5 equal "folds", or partitions
Within each partition, split the data into a "training" and "testing" set, with an 80/20 split
Here is the catch: your dataset is labeled by class. So take, for example, a dataset with 100 instances, where class A has 33 samples and class B has 67. I should create 5 folds of 20 data instances, where in each fold class A has something like 6 or 7 (1/3) values and class B has the rest.
My issue is that:
I do not know how to properly return a test and training set for each fold, despite being able to split the data appropriately; and, more importantly, I do not know how to incorporate the proper division of the number of elements per class.
My current code is here. It is commented where I am stuck:
import numpy

def csv_to_array(file):
    # Open the file, and load it in, delimiting on the ',' for a comma-separated-value file
    data = open(file, 'r')
    data = numpy.loadtxt(data, delimiter=',')
    # Loop through the data in the array
    for index in range(len(data)):
        # Try to convert each row to float; if a value can't convert, zero the row
        try:
            data[index] = [float(x) for x in data[index]]
        except ValueError:
            data[index] = 0
    # Return the now type-formatted data
    return data

def five_cross_fold_validation(dataset):
    # print("DATASET", dataset)
    numpy.random.shuffle(dataset)
    num_rows = dataset.shape[0]
    split_mark = int(num_rows / 5)
    folds = []
    temp1 = dataset[:split_mark]
    # print("TEMP1", temp1)
    temp2 = dataset[split_mark:split_mark*2]
    # print("TEMP2", temp2)
    temp3 = dataset[split_mark*2:split_mark*3]
    # print("TEMP3", temp3)
    temp4 = dataset[split_mark*3:split_mark*4]
    # print("TEMP4", temp4)
    temp5 = dataset[split_mark*4:]
    # print("TEMP5", temp5)
    folds.append(temp1)
    folds.append(temp2)
    folds.append(temp3)
    folds.append(temp4)
    folds.append(temp5)
    # folds = numpy.asarray(folds)
    for fold in folds:
        # fold = numpy.asarray(fold)
        num_rows = fold.shape[0]
        split_mark = int(num_rows * .8)
        fold_training = fold[split_mark:]
        fold_testing = fold[:split_mark]
        print(type(fold))
        # fold.tolist()
        list(fold)
        print(type(fold))
        del fold[0:len(fold)]
        fold.append(fold_training)
        fold.append(fold_testing)
        fold = numpy.asarray(fold)
        # Somehow, return a testing and training set within each fold
    # print(folds)
    return folds

def confirm_size(folds):
    total = 0
    for fold in folds:
        curr = len(fold)
        total = total + curr
    return total

def main():
    print("BEGINNING CFV")
    ecoli = csv_to_array('Classification/ecoli.csv')
    print(len(ecoli))
    folds = five_cross_fold_validation(ecoli)
    size = confirm_size(folds)
    print(size)

main()
Additionally, for reference, I have attached the CSV I am working with (it is a modification of the UCI Ecoli dataset). The classes here are the values in the last column: 0, 1, 2, 3, 4. It is important to note that there are not equal amounts of each class.
0.61,0.45,0.48,0.5,0.48,0.35,0.41,0
0.17,0.38,0.48,0.5,0.45,0.42,0.5,0
0.44,0.35,0.48,0.5,0.55,0.55,0.61,0
0.43,0.4,0.48,0.5,0.39,0.28,0.39,0
0.42,0.35,0.48,0.5,0.58,0.15,0.27,0
0.23,0.33,0.48,0.5,0.43,0.33,0.43,0
0.37,0.52,0.48,0.5,0.42,0.42,0.36,0
0.29,0.3,0.48,0.5,0.45,0.03,0.17,0
0.22,0.36,0.48,0.5,0.35,0.39,0.47,0
0.23,0.58,0.48,0.5,0.37,0.53,0.59,0
0.47,0.47,0.48,0.5,0.22,0.16,0.26,0
0.54,0.47,0.48,0.5,0.28,0.33,0.42,0
0.51,0.37,0.48,0.5,0.35,0.36,0.45,0
0.4,0.35,0.48,0.5,0.45,0.33,0.42,0
0.44,0.34,0.48,0.5,0.3,0.33,0.43,0
0.44,0.49,0.48,0.5,0.39,0.38,0.4,0
0.43,0.32,0.48,0.5,0.33,0.45,0.52,0
0.49,0.43,0.48,0.5,0.49,0.3,0.4,0
0.47,0.28,0.48,0.5,0.56,0.2,0.25,0
0.32,0.33,0.48,0.5,0.6,0.06,0.2,0
0.34,0.35,0.48,0.5,0.51,0.49,0.56,0
0.35,0.34,0.48,0.5,0.46,0.3,0.27,0
0.38,0.3,0.48,0.5,0.43,0.29,0.39,0
0.38,0.44,0.48,0.5,0.43,0.2,0.31,0
0.41,0.51,0.48,0.5,0.58,0.2,0.31,0
0.34,0.42,0.48,0.5,0.41,0.34,0.43,0
0.51,0.49,0.48,0.5,0.53,0.14,0.26,0
0.25,0.51,0.48,0.5,0.37,0.42,0.5,0
0.29,0.28,0.48,0.5,0.5,0.42,0.5,0
0.25,0.26,0.48,0.5,0.39,0.32,0.42,0
0.24,0.41,0.48,0.5,0.49,0.23,0.34,0
0.17,0.39,0.48,0.5,0.53,0.3,0.39,0
0.04,0.31,0.48,0.5,0.41,0.29,0.39,0
0.61,0.36,0.48,0.5,0.49,0.35,0.44,0
0.34,0.51,0.48,0.5,0.44,0.37,0.46,0
0.28,0.33,0.48,0.5,0.45,0.22,0.33,0
0.4,0.46,0.48,0.5,0.42,0.35,0.44,0
0.23,0.34,0.48,0.5,0.43,0.26,0.37,0
0.37,0.44,0.48,0.5,0.42,0.39,0.47,0
0,0.38,0.48,0.5,0.42,0.48,0.55,0
0.39,0.31,0.48,0.5,0.38,0.34,0.43,0
0.3,0.44,0.48,0.5,0.49,0.22,0.33,0
0.27,0.3,0.48,0.5,0.71,0.28,0.39,0
0.17,0.52,0.48,0.5,0.49,0.37,0.46,0
0.36,0.42,0.48,0.5,0.53,0.32,0.41,0
0.3,0.37,0.48,0.5,0.43,0.18,0.3,0
0.26,0.4,0.48,0.5,0.36,0.26,0.37,0
0.4,0.41,0.48,0.5,0.55,0.22,0.33,0
0.22,0.34,0.48,0.5,0.42,0.29,0.39,0
0.44,0.35,0.48,0.5,0.44,0.52,0.59,0
0.27,0.42,0.48,0.5,0.37,0.38,0.43,0
0.16,0.43,0.48,0.5,0.54,0.27,0.37,0
0.06,0.61,0.48,0.5,0.49,0.92,0.37,1
0.44,0.52,0.48,0.5,0.43,0.47,0.54,1
0.63,0.47,0.48,0.5,0.51,0.82,0.84,1
0.23,0.48,0.48,0.5,0.59,0.88,0.89,1
0.34,0.49,0.48,0.5,0.58,0.85,0.8,1
0.43,0.4,0.48,0.5,0.58,0.75,0.78,1
0.46,0.61,0.48,0.5,0.48,0.86,0.87,1
0.27,0.35,0.48,0.5,0.51,0.77,0.79,1
Edit: I replaced np.random.shuffle(A) with A = np.random.permutation(A); the only difference is that it doesn't mutate the input array. This doesn't make any difference in this code, but it is safer in general.
The idea is to randomly sample the input using numpy.random.permutation. Once the rows are shuffled, we just need to iterate over all the possible test sets (a sliding window of the desired size, here 20% of the input size). The corresponding training sets are simply composed of all the remaining elements.
Because the input was shuffled, picking the subsets in order still approximately preserves the original class distribution in each subset.
The following code iterates over the test/train set combinations:
import numpy as np

def csv_to_array(file):
    with open(file, 'r') as f:
        data = np.loadtxt(f, delimiter=',')
    return data

def classes_distribution(A):
    """Print the class distribution of array A."""
    nb_classes = np.unique(A[:,-1]).shape[0]
    total_size = A.shape[0]
    for i in range(nb_classes):
        class_size = sum(row[-1] == i for row in A)
        class_p = class_size/total_size
        print(f"\t P(class_{i}) = {class_p:.3f}")

def random_samples(A, test_set_p=0.2):
    """Split the input array A in two uniformly chosen
    random sets: test/training.
    Repeat this until all rows have been yielded at least
    once as a test set."""
    A = np.random.permutation(A)
    sample_size = int(test_set_p*A.shape[0])
    for start in range(0, A.shape[0], sample_size):
        end = start + sample_size
        yield {
            "test": A[start:end,],
            "train": np.append(A[:start,], A[end:,], 0)
        }

def main():
    ecoli = csv_to_array('ecoli.csv')
    print("Input set shape: ", ecoli.shape)
    print("Input set class distribution:")
    classes_distribution(ecoli)
    print("Training sets class distributions:")
    for iteration in random_samples(ecoli):
        test_set = iteration["test"]
        training_set = iteration["train"]
        classes_distribution(training_set)
        print("---")
        # ... Do whatever with these two sets

main()
It produces an output of the form:
Input set shape: (169, 8)
Input set class distribution:
P(class_0) = 0.308
P(class_1) = 0.213
P(class_2) = 0.207
P(class_3) = 0.118
P(class_4) = 0.154
Training sets class distributions:
P(class_0) = 0.316
P(class_1) = 0.206
P(class_2) = 0.199
P(class_3) = 0.118
P(class_4) = 0.162
...
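If you need the class proportions to hold exactly in every fold, rather than approximately through shuffling, one option is to permute and split each class separately. This stratified_folds helper is hypothetical, a sketch rather than part of the answer above:
import numpy as np

def stratified_folds(A, k=5):
    """Split the rows of A into k folds, preserving each class's
    proportion (class label assumed to be in the last column)."""
    parts = [[] for _ in range(k)]
    for c in np.unique(A[:, -1]):
        rows = np.random.permutation(A[A[:, -1] == c])  # this class's rows, shuffled
        for i, chunk in enumerate(np.array_split(rows, k)):
            parts[i].append(chunk)
    return [np.concatenate(p) for p in parts]
Each returned fold can then be split 80/20 into training and testing sets as in the question.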

Proper way to stop a TensorFlow Dataset `from_generator`?

I would like to use a TensorFlow Dataset built with from_generator to access a formatted file. Almost everything works, except I don't know how to stop the Dataset iterator when the generator runs out of data (the generator just returns empty lists forever once you go out of range).
My actual code is very complex, but I can mock up the situation with this short program:
import tensorflow as tf

def make_batch_generator_fn(batch_size=10, dset_size=100):
    feats, targs = range(dset_size), range(1, dset_size + 1)
    def batch_generator_fn():
        start_idx, stop_idx = 0, batch_size
        while True:
            # if stop_idx > dset_size: --- stop action?
            yield feats[start_idx: stop_idx], targs[start_idx: stop_idx]
            start_idx, stop_idx = start_idx + batch_size, stop_idx + batch_size
    return batch_generator_fn

def test(batch_size=10):
    dgen = make_batch_generator_fn(batch_size)
    features_shape, targets_shape = [None], [None]
    ds = tf.data.Dataset.from_generator(
        dgen, (tf.int32, tf.int32),
        (tf.TensorShape(features_shape), tf.TensorShape(targets_shape))
    )
    feats, targs = ds.make_one_shot_iterator().get_next()
    with tf.Session() as sess:
        counter = 0
        try:
            while True:
                f, t = sess.run([feats, targs])
                print(f, t)
                counter += 1
                if counter > 15:
                    break
        except tf.errors.OutOfRangeError:
            print('end of dataset at counter = {}'.format(counter))

if __name__ == '__main__':
    test()
If I know the number of records in advance, I can tune the number of batches, but I don't always know. I've tried putting some code in the snippet above where I have a comment line like stop action?. In particular, I've tried raising an IndexError, but TensorFlow doesn't like this, even if I explicitly catch it in my execution code. I also tried raising a tf.errors.OutOfRangeError, but I'm not sure how to instantiate it: the constructor requires three arguments - 'node_def', 'op', and 'message', and I'm not quite sure what to use for 'node_def' and 'op' in general.
I'd appreciate any thoughts or comments on this issue. Thanks!
Return when you meet your stop criterion:
def make_batch_generator_fn(batch_size=10, dset_size=100):
    feats, targs = range(dset_size), range(1, dset_size + 1)
    def batch_generator_fn():
        start_idx, stop_idx = 0, batch_size
        while True:
            if stop_idx > dset_size:
                return
            else:
                yield feats[start_idx: stop_idx], targs[start_idx: stop_idx]
                start_idx, stop_idx = start_idx + batch_size, stop_idx + batch_size
    return batch_generator_fn
This is in line with the behavior specified in the Python 3 documentation:
In a generator function, the return statement indicates that the generator is done and will cause StopIteration to be raised. The returned value (if any) is used as an argument to construct StopIteration and becomes the StopIteration.value attribute.
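With this change, the mock program above should print ten batches and then end of dataset at counter = 10: the generator returns after the tenth batch, and the resulting StopIteration surfaces in the session as a tf.errors.OutOfRangeError.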
It works with the following lines:
dataset_size = ...  # your dataset size
batch_size = ...    # your batch size
dataset = ...       # your tf.data.Dataset
steps_per_epoch = dataset_size // batch_size
for data, _ in zip(dataset, range(steps_per_epoch)):
    ...  # your train_step
The iteration will stop when it's through.
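A concrete, self-contained sketch of that pattern (assuming TF 2.x, where a tf.data.Dataset is directly iterable; the train step is a placeholder):
import tensorflow as tf  # sketch assumes TF 2.x

dataset_size, batch_size = 100, 10
dataset = tf.data.Dataset.range(dataset_size).batch(batch_size)
steps_per_epoch = dataset_size // batch_size  # 10 steps

for data, _ in zip(dataset, range(steps_per_epoch)):
    print(data.numpy())  # placeholder for your train_step
The zip stops after steps_per_epoch iterations even if the dataset could yield more.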

TypeError for predict_proba(np.array(test))

from sklearn.linear_model import LogisticRegression
import numpy as np

# X and y (the training features and labels) are assumed to be defined elsewhere
model = LogisticRegression()
model = model.fit(X, y)
test_data = [1,2,3,4,5,6,7,8,9,10,11,12,13]
test_prediction = model.predict_proba(np.array(test_data))
max = -1.0
res = 0
for i in range(test_prediction):
    if test_prediction[i] > max:
        max = test_prediction[i]
        res = i
if res == 0:
    print('A')
elif res == 1:
    print('B')
else:
    print('C')
Using the above python code I have to predict the probabilities of the 3 possible results (A, B, C).
The probabilities are saved in test_prediction and it can be printed as:
Output: [[ 0.82882588 0.08641236 0.08476175]]
But the remaining part gives an error:
for i in range(test_prediction):
TypeError: only integer scalar arrays can be converted to a scalar index
I want to find the max probability and then display the event that is likely to occur the most (A/B/C).
How to go about this?
You can also use numpy.argmax which will directly give you the index of the largest value.
import numpy as np

# test_prediction is most probably an np array already
pred = np.array(test_prediction)
classes_val = np.argmax(pred, axis=1)
for res in classes_val:
    if res == 0:
        print('A')
    elif res == 1:
        print('B')
    else:
        print('C')
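For instance, plugging in the probabilities printed in the question:
import numpy as np

pred = np.array([[0.82882588, 0.08641236, 0.08476175]])
print(np.argmax(pred, axis=1))  # [0], i.e. class 'A'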
The problem is in using an array with range.
In this case you should use the length of the array: range(len(test_prediction)).
Also you may simplify your code:
import operator
#...
enum_predict = enumerate(test_prediction)
res = max(enum_predict, key=operator.itemgetter(1))[0]
enumerate converts the array into a sequence of (index, item) tuples.
key=operator.itemgetter(1) makes the max function compare the tuples by their second value (the probability).
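A small demonstration of this pattern on a hypothetical 1-D list of probabilities:
import operator

probs = [0.82882588, 0.08641236, 0.08476175]
res = max(enumerate(probs), key=operator.itemgetter(1))[0]
print(res)  # 0, i.e. class 'A'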
You can do something like this:
import pandas as pd

predict_prob_df = pd.DataFrame(model.predict_proba(test_data))
max_prob = predict_prob_df.apply(max, axis=1)
predicted_output = pd.DataFrame(model.predict(test_data))
Then you can concat them:
final_frame = pd.concat([max_prob, predicted_output], axis=1)
This way you do not need to use the for loop, which was causing the error.
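Alternatively, predict_prob_df.idxmax(axis=1) returns the column index of each row's maximum directly, playing the same role as np.argmax in the answer above.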
I came up with another solution:
for i in range(3):
    if np.take(test_prediction, i) > max:
        max = np.take(test_prediction, i)
        res = i
if res == 0:
    .....
This worked by accessing each index in test_prediction using np.take.
But the solution specified by @Vivek_Kumar seems more correct and efficient.
