How to generate a dynamic number of samples from tensorflow dataset - python

My goal is for my TensorFlow Dataset pipeline to accept inputs of near-arbitrary size and convert them into uniformly sized samples (with sizes known at 'compile' time) that outnumber the original inputs. So I have a function (similar in idea to a one-to-many mapping) which aims to return a dataset for use in flat_map:
def split_fn(x, y):
    """ Splits X into a number of subsamples, each labeled y"""
    full_width = x.shape[1]
    full_height = x.shape[0]
    print(full_width)
    print(full_height)
    slice_width = SLICE_WIDTH
    slice_height = SLICE_HEIGHT
    # The splits created by these offset cover the complete input image
    offsets1 = [[x, 0] for x in range(0, full_width - slice_width, slice_width)]
    if full_width % slice_width != 0:
        offsets1.append([full_width - slice_width, 0])
    # The splits from these offsets are random, intended for data augmentation
    offsets2 = [[x, 0] for x in random.sample(range(0, full_width - slice_width), 5)]
    # Combine the two lists of offsets
    offsets = offsets1 + offsets2
    image = x.reshape(1, full_height, full_width, 1)
    # This creates a list of the slices corresponding to the offsets
    ts = list(map(lambda offset: tf.image.crop_to_bounding_box(image,
                                                               offset[1],
                                                               offset[0],
                                                               slice_height,
                                                               slice_width),
                  offsets))
    # Create and concatenate a dataset for each of the samples
    datasets = map(lambda d: tf.data.Dataset.from_tensors((d, y)), ts)
    ds = reduce((lambda x, y: x.concatenate(y)), datasets)
    return ds
However, at the line where I define offsets1, I get:
TypeError: __index__ returned non-int (type NoneType)
I've tried to fix this by wrapping the function in a py_func that returns a dataset:
dataset = dataset.flat_map(
    lambda image, label: tuple(tf.py_func(
        split_fn, [image, label], [tf.data.Dataset])))
However, I can't seem to get this to work correctly:
TypeError: Expected DataType for argument 'Tout' not <class 'tensorflow.python.data.ops.dataset_ops.Dataset'>.
What can I do to get this to work?
Thank you
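One way to avoid both errors is to build the slices with TensorFlow ops inside the function passed to flat_map, so the function can return a Dataset directly and no py_func is needed (py_func's Tout must be a list of DataTypes, which is why passing tf.data.Dataset there fails). Below is a minimal sketch along those lines; it assumes each element is a 2-D image tensor plus a scalar numeric label and that SLICE_HEIGHT and SLICE_WIDTH are fixed Python constants, and it only produces the non-overlapping covering slices (the remainder slice and the random augmentation offsets from split_fn are omitted):

def split_fn_graph(image, label):
    """Build fixed-size horizontal slices with TF ops so flat_map can flatten them."""
    width = tf.shape(image)[1]
    # Non-overlapping horizontal offsets that cover the width.
    offsets = tf.range(0, width - SLICE_WIDTH + 1, SLICE_WIDTH)
    slices = tf.map_fn(
        lambda x0: tf.slice(image, [0, x0], [SLICE_HEIGHT, SLICE_WIDTH]),
        offsets, dtype=image.dtype)
    labels = tf.fill([tf.shape(offsets)[0]], label)
    return tf.data.Dataset.from_tensor_slices((slices, labels))

dataset = dataset.flat_map(split_fn_graph)

Because everything here is a graph op, the dynamic image width is read with tf.shape at run time rather than from x.shape, whose None dimension is what triggered the __index__ error in range().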

Related

Apache Beam: Expected 2D array, got 1D array instead in distributed kmeans

So I have this code:
class distKmeans(beam.DoFn):
    # i will do an init function to add the kmeans parameters
    def __init__(self, n_clusters, rseed=2):
        self.n_clusters = n_clusters
        self.rseed = rseed
        self.centers = None

    # The function "process" implements the main functionality of the K-means algorithm
    def process(self, element):
        if self.centers is None:
            rng = np.random.RandomState(self.rseed)
            # we use len instead of shape because element is a PCOLLECTION
            i = rng.permutation(element.shape[0])[:self.n_clusters]
            self.centers = element[i]
        # b1. Calculate the closest center μ to xi
        labels = pairwise_distances_argmin(element, self.centers)
        # b2. Update the center
        new_centers = np.array([element[labels == i].mean(0)
                                for i in range(self.n_clusters)])
        # c.
        if np.all(self.centers == new_centers):
            return
        self.centers = new_centers
        yield self.centers, labels


with beam.Pipeline() as pipeline:
    mydata = pipeline | beam.Create(X)
    mydata = mydata | beam.ParDo(distKmeans(3))
    mydata | "write" >> beam.io.WriteToText("sample_data/output.txt")
As I'm trying to create a distributed k-means with Apache Beam, my data was generated using this code:
n_samples=200
n_features=2
X, y = make_blobs(n_samples=n_samples,centers=3, n_features=n_features)
data = np.c_[X,y]
plt.scatter(data[:, 0], data[:, 1], s=50);
and then X is:
X = data[['X1','X2']].to_numpy()
X = X[1:]
Its shape is (200, 2).
The code seems correct, but I always get the following error even though my data is a 2D array:
Expected 2D array, got 1D array instead:
array=[-6.03120913 11.30181549].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample. [while running '[54]: ParDo(distKmeans)']
and this error occurs at this line:
labels = pairwise_distances_argmin(element, self.centers)
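One detail worth checking here (a sketch with made-up data, not from the original post): beam.Create(X) produces a PCollection whose elements are the individual rows of X, so process() receives one 1-D row at a time, which matches the 1-D array shown in the traceback and is what pairwise_distances_argmin rejects:

import numpy as np

X = np.random.rand(200, 2)
element = X[0]                        # what a single PCollection element looks like
print(element.shape)                  # (2,)  -- 1-D, hence "Expected 2D array, got 1D array instead"
print(element.reshape(1, -1).shape)   # (1, 2) -- the 2-D shape pairwise_distances_argmin expects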

Sklearn logistic regression shape error, but x, y shapes are consistent

I get a ValueError: Found input variables with inconsistent numbers of samples: [20000, 1] when I run the following, even though the numbers of rows in x and y are consistent. I load in the RCV1 dataset, get the indices of the categories with the top x documents, create a list of tuples with an equal number of randomly selected positives and negatives for each category, and then finally attempt to run a logistic regression on one of the categories.
import sklearn.datasets
import numpy as np
from sklearn import model_selection, preprocessing
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from scipy import sparse

rcv1 = sklearn.datasets.fetch_rcv1()

def get_top_cat_indices(target_matrix, num_cats):
    cat_counts = target_matrix.sum(axis=0)
    #cat_counts = cat_counts.reshape((1,103)).tolist()[0]
    cat_counts = cat_counts.reshape((103,))
    #b = sorted(cat_counts, reverse=True)
    ind_temp = np.argsort(cat_counts)[::-1].tolist()[0]
    ind = [ind_temp[i] for i in range(5)]
    return ind

def prepare_data(x, y, top_cat_indices, sample_size):
    res_lst = []
    for i in top_cat_indices:
        # get column of indices with relevant cat
        temp = y.tocsc()[:, i]
        # all docs with labeled category
        cat_present = x.tocsr()[np.where(temp.sum(axis=1)>0)[0],:]
        # all docs other than labelled category
        cat_notpresent = x.tocsr()[np.where(temp.sum(axis=1)==0)[0],:]
        # get indices equal to 1/2 of sample size
        idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size/2))
        idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size/2))
        # concatenate the ids
        sampled_x_pos = cat_present.tocsr()[idx_cat,:]
        sampled_x_neg = cat_notpresent.tocsr()[idx_nocat,:]
        sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
        sampled_y_pos = temp.tocsr()[idx_cat,:]
        sampled_y_neg = temp.tocsr()[idx_nocat,:]
        sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
        res_lst.append((sampled_x, sampled_y))
    return res_lst

ind = get_top_cat_indices(rcv1.target, 5)
test_res = prepare_data(train_x, train_y, ind, 20000)
x, y = test_res[0]
print(x.shape)
print(y.shape)
LogisticRegression().fit(x, y)
Could it be an issue with the sparse matrices, or a problem with dimensionality (there are 20K samples and 47K features)?
When I run your code, I get the following error:
AttributeError: 'bool' object has no attribute 'any'
That's because y for LogisticRegression needs to be a numpy array. So I changed the last line to:
LogisticRegression().fit(x, y.A.flatten())
Then I get the following error:
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0
This is because your sampling code has a bug. You need to subset the y array to the rows having that category before applying the sampling indices. See the code below:
def prepare_data(x, y, top_cat_indices, sample_size):
    res_lst = []
    for i in top_cat_indices:
        # get column of indices with relevant cat
        temp = y.tocsc()[:, i]
        # all docs with labeled category
        c1 = np.where(temp.sum(axis=1)>0)[0]
        c2 = np.where(temp.sum(axis=1)==0)[0]
        cat_present = x.tocsr()[c1,:]
        # all docs other than labelled category
        cat_notpresent = x.tocsr()[c2,:]
        # get indices equal to 1/2 of sample size
        idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size/2))
        idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size/2))
        # concatenate the ids
        sampled_x_pos = cat_present.tocsr()[idx_cat,:]
        sampled_x_neg = cat_notpresent.tocsr()[idx_nocat,:]
        sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
        sampled_y_pos = temp.tocsr()[c1][idx_cat,:]
        print(sampled_y_pos.nnz)
        sampled_y_neg = temp.tocsr()[c2][idx_nocat,:]
        print(sampled_y_neg.nnz)
        sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
        res_lst.append((sampled_x, sampled_y))
    return res_lst
Now everything works like a charm.
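Putting both fixes together, the final call from the question becomes (variable names as in the question):

x, y = test_res[0]
# y is a sparse column vector, so convert it to a dense 1-D label array
LogisticRegression().fit(x, y.A.flatten())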

How to fix 'need at least one array to concatenate' error?

I have read through the various posts on this ValueError, but I haven't found a satisfactory solution. Please, can anyone tell me what I am doing wrong?
Code:
assert(type(images) == list)
# assert(type(images[0]) == np.ndarray)
# assert(len(images[0].shape) == 3)
# assert(np.max(images[0]) > 10)
# assert(np.min(images[0]) >= 0.0)
inps = []
for img in images:
    img = img.astype(np.float32)
    inps.append(np.expand_dims(img, 0))
bs = 100
with tf.Session() as sess:
    preds = []
    n_batches = int(math.ceil(float(len(inps)) / float(bs)))
    for i in range(n_batches):
        sys.stdout.write(".")
        sys.stdout.flush()
        inp = inps[(i * bs):min((i + 1) * bs, len(inps))]
        inp = np.concatenate(inp, 0)
        pred = sess.run(softmax, {'ExpandDims:0': inp})
        preds.append(pred)
    preds = np.concatenate(preds, 0)
    scores = []
    for i in range(splits):
        part = preds[(i * preds.shape[0] // splits):((i + 1) * preds.shape[0] // splits), :]
        kl = part * (np.log(part) - np.log(np.expand_dims(np.mean(part, 0), 0)))
        kl = np.mean(np.sum(kl, 1))
        scores.append(np.exp(kl))
    return np.mean(scores), np.std(scores)
Error:
File "/content/Inception-Score/inception_score.py", line 45, in get_inception_score
    preds = np.concatenate(preds, 0)
ValueError: need at least one array to concatenate
It appears that the sequence of arrays you are passing to np.concatenate ends up empty, hence "need at least one array to concatenate".
np.concatenate() needs at least one array in the sequence given as its first argument, as detailed in the documentation. It looks like preds contains no arrays at all. I am not sure what you are trying to do, but maybe concatenate is not what you want?
The problem seems to be in np.concatenate: it expects a sequence of arrays, and you are not providing that.
#syntax
numpy.concatenate((a1, a2, ...), axis=0, out=None)
Parameters:
a1, a2, … : sequence of array_like The arrays must have the same shape, except in the dimension corresponding to axis (the first, by default).
axis : int, optional The axis along which the arrays will be joined. If axis is None, arrays are flattened before use. Default is 0.
out : ndarray, optional If provided, the destination to place the result. The shape must be correct, matching that of what concatenate would have returned if no out argument were specified.
Returns: ndarray The concatenated array.
Check what preds actually contains.
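For reference, a small sketch (with made-up arrays) of when this error is raised and when it is not:

import numpy as np

preds = []  # e.g. the batch loop never appended anything
try:
    np.concatenate(preds, 0)
except ValueError as e:
    print(e)  # need at least one array to concatenate

preds = [np.zeros((2, 3)), np.ones((2, 3))]
print(np.concatenate(preds, 0).shape)  # (4, 3)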

Produce a dataset of strided slices from a tfrecords dataset

Continuing from this question and the discussion here - I am trying to use the Dataset API to take a dataset of variable length tensors and cut them into slices (segments) of equal length. Something like:
Dataset = tf.contrib.data.Dataset
segment_len = 6
batch_size = 16

with tf.Graph().as_default() as g:
    # get the tfrecords dataset
    dataset = tf.contrib.data.TFRecordDataset(filenames).map(
        partial(record_type.parse_single_example, graph=g)).batch(batch_size)
    # zip it with the number of segments we need to slice each tensor
    dataset2 = Dataset.zip((dataset, Dataset.from_tensor_slices(
        tf.constant(num_segments, dtype=tf.int64))))
    it2 = dataset2.make_initializable_iterator()

    def _dataset_generator():
        with g.as_default():
            while True:
                try:
                    (im, length), count = sess.run(it2.get_next())
                    dataset3 = Dataset.zip((
                        # repeat each tensor then use map to take a stridded slice
                        Dataset.from_tensors((im, length)).repeat(count),
                        Dataset.range(count))).map(lambda x, c: (
                            x[0][:, c: c + segment_len],
                            x[0][:, c + 1: (c + 1) + segment_len],
                        ))
                    it = dataset3.make_initializable_iterator()
                    it_init = it.initializer
                    try:
                        yield it_init
                        while True:
                            yield sess.run(it.get_next())
                    except tf.errors.OutOfRangeError:
                        continue
                except tf.errors.OutOfRangeError:
                    return

    # Dataset.from_generator need tensorflow > 1.3 !
    das_dataset = Dataset.from_generator(
        _dataset_generator,
        (tf.float32, tf.float32),
        # (tf.TensorShape([]), tf.TensorShape([]))
    )
    das_dataset_it = das_dataset.make_one_shot_iterator()

with tf.Session(graph=g) as sess:
    while True:
        print(sess.run(it2.initializer))
        print(sess.run(das_dataset_it.get_next()))
Of course I do not want to pass the session into the generator, but this should be worked around by the trick given in the link (create a dummy dataset and map the iterator of the other). The code above fails with the biblical:
tensorflow.python.framework.errors_impl.InvalidArgumentError: TypeError: If shallow structure is a sequence, input must also be a sequence. Input has type: <class 'tensorflow.python.framework.ops.Operation'>.
[[Node: PyFunc = PyFunc[Tin=[DT_INT64], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_1"](arg0)]]
[[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](OneShotIterator)]]
which I guess is because I try to yield the initializer of the iterator, but my question is basically whether I can achieve what I am trying to do at all using the Dataset API.
The easiest way to build a Dataset from a nested Dataset is to use the Dataset.flat_map() transformation. This transformation applies a function to each element of the input dataset (dataset2 in your example), that function returns a nested Dataset (most likely dataset3 in your example), and then the transformation flattens all the nested datasets into a single Dataset.
dataset2 = ...  # As above.

def get_slices(im_and_length, count):
    im, length = im_and_length
    # Repeat each tensor then use map to take a strided slice.
    return Dataset.zip((
        Dataset.from_tensors((im, length)).repeat(count),
        Dataset.range(count))).map(lambda x, c: (
            x[0][:, c: c + segment_len],
            x[0][:, c + 1: (c + 1) + segment_len],
        ))

das_dataset = dataset2.flat_map(get_slices)
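A possible way to consume the flattened dataset, sketched with the contrib-era iterator API used in the question:

it = das_dataset.make_one_shot_iterator()
next_slices = it.get_next()
with tf.Session(graph=g) as sess:
    first_pair = sess.run(next_slices)  # a (segment, next-segment) pair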

Theano Dimshuffle equivalent in Google's TensorFlow?

I have seen that transpose and reshape together can help, but I don't know how to use them.
E.g. dimshuffle(0, 'x')
What is its equivalent using transpose and reshape? Or is there a better way?
Thank you.
There are three relevant ops for implementing Theano's dimshuffle in TensorFlow:
tf.transpose() is used to permute the dimensions of a tensor. If the pattern specified in the arguments to dimshuffle is a permutation of the input tensor's dimensions (i.e. there is no 'x' or missing dimension) you can use tf.transpose() to implement dimshuffle().
tf.expand_dims() is used to add one or more size-1 dimensions to a tensor. This handles the case where 'x' is specified as part of the dimshuffle() pattern, but does not reorder the existing dimensions.
tf.squeeze() is used to remove one or more size-1 dimensions from a tensor. This handles the case where a dimension is omitted from a dimshuffle() pattern, but it does not reorder the existing dimensions.
Assuming that the input is a vector, your example (dimshuffle(0, 'x')) can be expressed using tf.expand_dims() only:
input = tf.placeholder(tf.float32, [None]) # Defines an arbitrary-sized vector.
result = tf.expand_dims(input, 1)
print result.get_shape() # ==> TensorShape([Dimension(None), Dimension(1)])
Taking a more complicated example, dimshuffle(1, 'x', 0) applied to a matrix would be:
input = tf.placeholder(tf.float32, [128, 32]) # Defines a matrix.
output = tf.expand_dims(tf.transpose(input, [1, 0]), 1)
print output.get_shape()
# ==> TensorShape([Dimension(32), Dimension(1), Dimension(128)])
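The squeeze case can be sketched in the same style: a dimension omitted from the pattern must have size 1, and tf.squeeze() removes it. For example, a rough equivalent of dimshuffle(0) applied to a column vector:

input = tf.placeholder(tf.float32, [None, 1])  # Defines a column vector.
result = tf.squeeze(input, [1])  # Drops the size-1 dimension.
print(result.get_shape())  # ==> TensorShape([Dimension(None)])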
I implemented dimshuffle for TensorFlow in our framework Returnn (here). The code is this:
def expand_multiple_dims(x, axes, name="expand_multiple_dims"):
    """
    :param tf.Tensor x:
    :param list[int]|tuple[int] axes: after completion, tf.shape(y)[axis] == 1 for axis in axes
    :param str name: scope name
    :return: y where we have a new broadcast axis for each axis in axes
    :rtype: tf.Tensor
    """
    with tf.name_scope(name):
        for i in sorted(axes):
            x = tf.expand_dims(x, axis=i, name="expand_axis_%i" % i)
        return x


def dimshuffle(x, axes, name="dimshuffle"):
    """
    Like Theanos dimshuffle.
    Combines tf.transpose, tf.expand_dims and tf.squeeze.

    :param tf.Tensor x:
    :param list[int|str]|tuple[int|str] axes:
    :param str name: scope name
    :rtype: tf.Tensor
    """
    with tf.name_scope(name):
        assert all([i == "x" or isinstance(i, int) for i in axes])
        real_axes = [i for i in axes if isinstance(i, int)]
        bc_axes = [i for (i, j) in enumerate(axes) if j == "x"]
        if x.get_shape().ndims is None:
            x_shape = tf.shape(x)
            x = tf.reshape(x, [x_shape[i] for i in range(max(real_axes) + 1)])  # will have static ndims
            assert x.get_shape().ndims is not None

        # First squeeze missing axes.
        i = 0
        while i < x.get_shape().ndims:
            if i not in real_axes:
                x = tf.squeeze(x, axis=i)
                real_axes = [(j if (j < i) else (j - 1)) for j in real_axes]
            else:
                i += 1

        # Now permute.
        assert list(sorted(real_axes)) == list(range(x.get_shape().ndims))
        if real_axes != list(range(x.get_shape().ndims)):
            x = tf.transpose(x, real_axes)

        # Now add broadcast dimensions.
        if bc_axes:
            x = expand_multiple_dims(x, bc_axes)

        assert len(axes) == x.get_shape().ndims
        return x
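As a quick check of the helper above against the earlier dimshuffle(1, 'x', 0) example (a sketch; the shapes follow that example):

x = tf.placeholder(tf.float32, [128, 32])
y = dimshuffle(x, (1, "x", 0))
print(y.get_shape())  # ==> (32, 1, 128)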
If TensorFlow is your backend:
from keras import backend as K
K.permute_dimensions should do it.
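A rough sketch of that route, assuming x is a 2-D Keras tensor and combining it with expand_dims for the 'x' part of the pattern:

from keras import backend as K

# Roughly dimshuffle(1, 'x', 0): permute the axes, then insert a broadcast axis.
y = K.expand_dims(K.permute_dimensions(x, (1, 0)), 1)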
tf.transpose is probably what you are looking for. It takes an arbitrary permutation.
