I looked into anomaly detection using both PCA and an autoencoder, following the code from this link: Machine learning for anomaly detection and condition monitoring. I tried to run the part that uses PCA with the Mahalanobis distance, but I always get the exception message. It turns out the problem is in the covariance matrix function, where the error 'numpy.ndarray' object is not callable appears. I tried creating new variables and converting the DataFrame to a NumPy array, but nothing worked. What is causing this error?
Code:
def cov_matrix(data, verbose=False):
    # data = pd.DataFrame(data).to_numpy()
    print('Calculating the covariance matrix')
    covariance_matrix = np.cov(data, rowvar=False)
    print('Done calculating the covariance matrix')
    if is_pos_def(covariance_matrix):
        inv_covariance_matrix = np.linalg.inv(covariance_matrix)
        if is_pos_def(inv_covariance_matrix):
            return covariance_matrix, inv_covariance_matrix
        else:
            print("Error: Inverse of Covariance Matrix is not positive definite!")
    else:
        print("Error: Covariance Matrix is not positive definite!")
def MahalanobisDist(inv_cov_matrix, mean_distr, data, verbose=False):
    inv_covariance_matrix = inv_cov_matrix
    vars_mean = mean_distr
    diff = data - vars_mean
    md = []
    for i in range(len(diff)):
        md.append(np.sqrt(diff[i].dot(inv_covariance_matrix).dot(diff[i])))
    return md
def MD_detectOutliers(dist, extreme=False, verbose=False):
    k = 3. if extreme else 2.
    threshold = np.mean(dist) * k
    outliers = []
    for i in range(len(dist)):
        if dist[i] >= threshold:
            outliers.append(i)  # index of the outlier
    return np.array(outliers)
def MD_threshold(dist, extreme=False, verbose=False):
    k = 3. if extreme else 2.
    threshold = np.mean(dist) * k
    return threshold
#### Main code:
try:
    # Inputting the training and test dataframes:
    data_train = np.array(principalDf_C0.values)
    data_test_C1 = np.array(principalDf_C1.values)
    data_test_C2 = np.array(principalDf_C2.values)
    data_test_C3 = np.array(principalDf_C4.values)
    data_test_C4 = np.array(principalDf_C5.values)
    print('Training Dataframe: ', data_train[:,])
    print('Test1 Dataframe: ', data_test_C1)
    print('Test2 Dataframe: ', data_test_C2)
    print('Test3 Dataframe: ', data_test_C3)
    print('Test4 Dataframe: ', data_test_C4)
    data_train_df = pd.DataFrame(principalDf_C0.values)
    data_test_df_C1 = pd.DataFrame(principalDf_C1.values)
    data_test_df_C2 = pd.DataFrame(principalDf_C2.values)
    data_test_df_C3 = pd.DataFrame(principalDf_C4.values)
    data_test_df_C4 = pd.DataFrame(principalDf_C5.values)
    # Calculating the covariance matrix:
    cov_matrix, inv_cov_matrix = cov_matrix(data=data_train)
    # Calculating the mean value for the input variables:
    mean_distr = data_train_df.mean(axis=0)
    # Calculating the Mahalanobis distance and threshold value to flag datapoints as an anomaly:
    dist_test_C1 = MahalanobisDist(inv_cov_matrix, mean_distr, data_test_df_C1, verbose=True)
    dist_test_C2 = MahalanobisDist(inv_cov_matrix, mean_distr, data_test_df_C2, verbose=True)
    dist_test_C3 = MahalanobisDist(inv_cov_matrix, mean_distr, data_test_df_C3, verbose=True)
    dist_test_C4 = MahalanobisDist(inv_cov_matrix, mean_distr, data_test_df_C4, verbose=True)
    dist_train = MahalanobisDist(inv_cov_matrix, mean_distr, data_train_df, verbose=True)
    threshold = MD_threshold(dist_train, extreme=True)
    # Distribution of Threshold value for flagging an anomaly:
    plt.figure()
    sns.distplot(np.square(dist_train), bins=10, kde=False)
    # plt.xlim([0.0,15])
    plt.show()
    plt.figure()
    sns.distplot(dist_train, bins=10, kde=True, color='green')
    # plt.xlim([0.0,5])
    plt.xlabel('Mahalanobis dist')
    plt.show()
    anomaly_train = pd.DataFrame(index=data_train_df.index)
    anomaly_train['Mob dist'] = dist_train
    anomaly_train['Thresh'] = threshold
    # If Mob dist above threshold: Flag as anomaly
    anomaly_train['Anomaly'] = anomaly_train['Mob dist'] > anomaly_train['Thresh']
    anomaly_train.index = X_train_PCA.index
    anomaly_C1 = pd.DataFrame(index=data_test_df_C1.index)
    anomaly_C1['Mob dist'] = dist_test_C1
    anomaly_C1['Thresh'] = threshold
    # If Mob dist above threshold: Flag as anomaly
    anomaly_C1['Anomaly'] = anomaly_C1['Mob dist'] > anomaly_C1['Thresh']
    anomaly_C1.index = data_test_df_C1.index
    anomaly_C1.head()
    anomaly_C2 = pd.DataFrame(index=data_test_df_C2.index)
    anomaly_C2['Mob dist'] = dist_test_C2
    anomaly_C2['Thresh'] = threshold
    # If Mob dist above threshold: Flag as anomaly
    anomaly_C2['Anomaly'] = anomaly_C2['Mob dist'] > anomaly_C2['Thresh']
    anomaly_C2.index = data_test_df_C2.index
    anomaly_C2.head()
    anomaly_C3 = pd.DataFrame(index=data_test_df_C3.index)
    anomaly_C3['Mob dist'] = dist_test_C3
    anomaly_C3['Thresh'] = threshold
    # If Mob dist above threshold: Flag as anomaly
    anomaly_C3['Anomaly'] = anomaly_C3['Mob dist'] > anomaly_C3['Thresh']
    anomaly_C3.index = data_test_df_C3.index
    anomaly_C3.head()
    anomaly_C4 = pd.DataFrame(index=data_test_df_C4.index)
    anomaly_C4['Mob dist'] = dist_test_C4
    anomaly_C4['Thresh'] = threshold
    # If Mob dist above threshold: Flag as anomaly
    anomaly_C4['Anomaly'] = anomaly_C4['Mob dist'] > anomaly_C4['Thresh']
    anomaly_C4.index = data_test_df_C4.index
    anomaly_C4.head()
    final_scored = pd.concat([anomaly_train, anomaly_C1, anomaly_C2, anomaly_C3, anomaly_C4])
    print(final_scored)
except Exception:
    print('Cannot implement Anomaly detection using Mahalanobis distance metric')
    pass
Per your comment, you have a namespace collision between a variable cov_matrix and a function cov_matrix().
Change that line to e.g.
matrix, inv_matrix = cov_matrix(data=data_train)
And update your code accordingly, or rename cov_matrix(). A good convention is that functions which return things should have verbs in their name, e.g. generate_cov_matrix() or calculate_cov_matrix().*
(Yes, as written the code should run once, since AFAICS you don't call cov_matrix() again after that, but I'm guessing you're using a persistent interpreter session and evaluating the code again once cov_matrix() has been overwritten.)
*This convention assumes that functions are there to have side effects, and return things exceptionally. Of course if you are writing functionally, and having side effects is the exception not the rule, you would likely want to invert it, or follow another convention entirely.
My guess is that you are running into an issue where you have a variable named cov_matrix and a function named cov_matrix. At some point you overwrote the function with the variable, which is a numpy.ndarray. Later, when you call cov_matrix(), the name refers to the variable, i.e. the NumPy array, not the function.
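For illustration, a minimal sketch (with made-up data) of how that shadowing produces exactly this error in a persistent session:

import numpy as np

def cov_matrix(data):
    # returns the covariance matrix and its inverse
    c = np.cov(data, rowvar=False)
    return c, np.linalg.inv(c)

data_train = np.random.rand(100, 3)

# The first call works, but the name cov_matrix is now rebound to an ndarray
cov_matrix, inv_cov_matrix = cov_matrix(data=data_train)

# Evaluating the same line again (e.g. re-running the cell) now raises
# TypeError: 'numpy.ndarray' object is not callable
cov_matrix, inv_cov_matrix = cov_matrix(data=data_train)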
I have this code, but right now it only works for continuous data from a normal distribution. I want to use it for a binomial distribution (i.e. conversion rates). How would I change it to make it fit?
def bootstrap_ci(df, variable, classes, repetitions=1000, alpha=0.05, random_state=None):
    df = df[[variable, classes]]
    bootstrap_sample_size = len(df)
    mean_diffs = []
    for i in range(repetitions):
        bootstrap_sample = df.sample(n=bootstrap_sample_size, replace=True, random_state=random_state)
        mean_diff = bootstrap_sample.groupby(classes).mean().iloc[1, 0] - bootstrap_sample.groupby(classes).mean().iloc[0, 0]
        mean_diffs.append(mean_diff)
    # confidence interval
    left = np.percentile(mean_diffs, alpha/2*100)
    right = np.percentile(mean_diffs, 100 - alpha/2*100)
    # point estimate
    point_est = df.groupby(classes).mean().iloc[1, 0] - df.groupby(classes).mean().iloc[0, 0]
    print('Point estimate of difference between means:', round(point_est, 2))
    print((1-alpha)*100, '%', 'confidence interval for the difference between means:', (round(left, 2), round(right, 2)))

bootstrap_ci(df, 'conversion_rate', 'group')
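Not from the original post, but for reference: since the mean of a 0/1 column is the conversion rate, the same mean-difference bootstrap can be fed binary conversion data directly. A minimal usage sketch with made-up data (the column and group names are hypothetical):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
# made-up A/B data: 1 = converted, 0 = did not convert
df = pd.DataFrame({
    'conversion_rate': np.concatenate([rng.binomial(1, 0.10, 1000),
                                       rng.binomial(1, 0.12, 1000)]),
    'group': ['A'] * 1000 + ['B'] * 1000,
})
bootstrap_ci(df, 'conversion_rate', 'group')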
I want to update a Python function that was originally written to process 1D data so that it works with 2D data. I tried different ways but still got errors, and if I flatten my 2D data it loses its meaning, since it is voice data. Below is made-up data and the function to reproduce the error.
import numpy as np
from sklearn.linear_model import LinearRegression

x = np.random.normal(0, 1, (40, 2))
print(cpp_function(x=x, fs=44100, pitch_range=[75, 300], trendline_quefrency_range=[0.001, 0.05]))
def cpp_function(x, fs, pitch_range, trendline_quefrency_range, smooth=False, time_smooth_len=None, quefrency_smooth_len=None):
    """
    Computes cepstral peak prominence for a given signal

    Parameters
    -----------
    x: ndarray
        The audio signal
    fs: integer
        The sampling frequency
    pitch_range: list of 2 elements
        The pitch range where a peak is searched for
    trendline_quefrency_range: list of 2 elements
        The quefrency range for which the amplitudes will be modelled by a straight line

    Returns
    -----------
    integer
        The cepstral peak prominence of the audio signal
    """
    # Cepstrum
    x = np.hamming(len(x))*x
    spectrum = np.fft.rfft(x)
    spectrum = 20*np.log10(np.abs(spectrum))
    ceps = np.fft.rfft(spectrum)
    ceps = 20*np.log10(np.abs(ceps))
    # Smoothing
    if smooth == True:
        def smooth(y, box_pts):
            box = np.ones(box_pts)/box_pts
            y_smooth = np.convolve(y, box, mode='same')
            return y_smooth
        ceps = smooth(ceps.T, time_smooth_len).T
        ceps = smooth(ceps, quefrency_smooth_len)
    # Quefrency
    dt = 1/fs
    freq_vector = np.fft.rfftfreq(len(x), d=dt)
    df = freq_vector[1] - freq_vector[0]
    quefrency_vector = np.fft.rfftfreq(2*ceps.size-2, df)
    # Selecting part of cepstrum
    quefrency_range = [1/pitch_range[1], 1/pitch_range[0]]
    index_range = np.where((quefrency_vector >= quefrency_range[0]) & (quefrency_vector <= quefrency_range[1]))
    # For trend line
    index_range_tl = np.where((quefrency_vector >= trendline_quefrency_range[0]) & (quefrency_vector <= trendline_quefrency_range[1]))
    # Linear regression
    linear_regressor = LinearRegression()
    linear_regressor.fit(quefrency_vector[index_range_tl].reshape(-1, 1), ceps[index_range_tl].reshape(-1, 1))
    Y_pred = linear_regressor.predict(quefrency_vector.reshape(-1, 1))
    # CPP
    peak_value = np.max(ceps[index_range])
    peak_index = np.argmax(ceps[index_range])
    cpp = peak_value - Y_pred[index_range][peak_index][0]
    return cpp
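For what it's worth, one possible way (not from the original post) to use the 1D function on 2D audio is to apply it per channel, column by column. A sketch with a longer made-up signal (one second of two-channel noise at 44.1 kHz, so the searched quefrency range is non-empty):

import numpy as np

x2d = np.random.normal(0, 1, (44100, 2))  # made-up two-channel signal, 1 s at 44.1 kHz
cpp_per_channel = [
    cpp_function(x=x2d[:, ch], fs=44100, pitch_range=[75, 300],
                 trendline_quefrency_range=[0.001, 0.05])
    for ch in range(x2d.shape[1])
]
print(cpp_per_channel)  # one CPP value per channel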
I am trying to write a function that properly calculates the entropy of a given dataset. However, I am getting very weird entropy values.
I am following the understanding that all entropy calculations must fall between 0 and 1, yet I am consistently getting values above 2.
Note: I must use log base 2 for this
Can someone explain why I am getting incorrect entropy results?
The dataset I am testing with is the ecoli dataset from the UCI Machine Learning Repository.
import numpy
import math

#################### DATA HANDLING LIBRARY ####################
def csv_to_array(file):
    # Open the file, and load it in delimiting on the ',' for a comma separated value file
    data = open(file, 'r')
    data = numpy.loadtxt(data, delimiter=',')
    # Loop through the data in the array
    for index in range(len(data)):
        # Utilize a try catch to try and convert to float, if it can't convert to float, converts to 0
        try:
            data[index] = [float(x) for x in data[index]]
        except Exception:
            data[index] = 0
        except ValueError:
            data[index] = 0
    # Return the now type-formatted data
    return data

# Function that utilizes the numpy library to randomize the dataset.
def randomize_data(csv):
    csv = numpy.random.shuffle(csv)
    return csv
# Function to split the data into test, training set, and validation sets
def split_data(csv):
    # Call the randomize data function
    randomize_data(csv)
    # Grab the number of rows and calculate where to split
    num_rows = csv.shape[0]
    validation_split = int(num_rows * 0.10)
    training_split = int(num_rows * 0.72)
    testing_split = int(num_rows * 0.18)
    # Validation set as the first 10% of the data
    validation_set = csv[:validation_split]
    # Training set as the next 72%
    training_set = csv[validation_split:training_split + validation_split]
    # Testing set as the last 18%
    testing_set = csv[training_split + validation_split:]
    # Split the data into classes vs actual data
    training_cols = training_set.shape[1]
    testing_cols = testing_set.shape[1]
    validation_cols = validation_set.shape[1]
    training_classes = training_set[:, training_cols - 1]
    testing_classes = testing_set[:, testing_cols - 1]
    validation_classes = validation_set[:, validation_cols - 1]
    # Take the sets and remove the last (classification) column
    training_set = training_set[:-1]
    testing_set = testing_set[:-1]
    validation_set = validation_set[:-1]
    # Return the datasets
    return testing_set, testing_classes, training_set, training_classes, validation_set, validation_classes
#################### DATA HANDLING LIBRARY ####################

# This function returns the list of classes, and their associated weights (i.e. distributions)
# for a given dataset
def class_distribution(dataset):
    # Ensure the dataset is a numpy array
    dataset = numpy.asarray(dataset)
    # Collect # of total rows and columns, using numpy
    num_total_rows = dataset.shape[0]
    num_columns = dataset.shape[1]
    # Create a numpy array of just the classes
    classes = dataset[:, num_columns - 1]
    # Use numpy.unique to remove duplicates
    classes = numpy.unique(classes)
    # Create an empty array for the class weights
    class_weights = []
    # Loop through the classes one by one
    for aclass in classes:
        # Create storage variables
        total = 0
        weight = 0
        # Now loop through the dataset
        for row in dataset:
            # If the class of the dataset is equal to the current class you are evaluating, increase the total
            if numpy.array_equal(aclass, row[-1]):
                total = total + 1
            # If not, continue
            else:
                continue
        # Divide the # of occurences by total rows
        weight = float((total / num_total_rows))
        # Add that weight to the list of class weights
        class_weights.append(weight)
    # Turn the weights into a numpy array
    class_weights = numpy.asarray(class_weights)
    # Return the array
    return classes, class_weights
# This function returns the entropy for a given dataset
# Can be used across an entire csv, or just for a column of data (feature)
def get_entropy(dataset):
    # Set initial entropy
    entropy = 0.0
    # Determine the classes and their frequencies (weights) of the dataset
    classes, class_freq = class_distribution(dataset)
    # Utilize numpy's quicksort to test the most occurring class first
    numpy.sort(class_freq)
    # Determine the max entropy for the dataset
    max_entropy = math.log(len(classes), 2)
    print("MAX ENTROPY FOR THIS DATASET: ", max_entropy)
    # Loop through the frequencies and use given formula to calculate entropy
    # For...Each simulates the sequence operator
    for freq in class_freq:
        entropy += float(-freq * math.log(freq, 2))
    # Return the entropy value
    return entropy

def main():
    ecol = csv_to_array('ecoli.csv')
    testing_set, testing_classes, training_set, training_classes, validation_set, validation_classes = split_data(ecol)
    entropy = get_entropy(ecol)
    print(entropy)

main()
The following function was used to calculate entropy:
# Function to return Shannon's Entropy
def entropy(attributes, dataset, targetAttr):
    freq = {}
    entropy = 0.0
    index = 0
    for item in attributes:
        if (targetAttr == item):
            break
        else:
            index = index + 1
    index = index - 1
    for item in dataset:
        if ((item[index]) in freq):
            # Increase the frequency count
            freq[item[index]] += 1.0
        else:
            # Initialize it by setting it to 1
            freq[item[index]] = 1.0
    for freq in freq.values():
        entropy = entropy + (-freq / len(dataset)) * math.log(freq / len(dataset), 2)
    return entropy
As @MattTimmermans indicated, entropy's value actually depends on the number of classes. For strictly 2 classes, it is contained in the 0 to 1 (inclusive) range. However, for more than 2 classes (which is what was being tested), entropy is calculated with a different formula (converted to Pythonic code above). This post here explains the mathematics and calculations in a bit more detail.
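A quick sanity check of that point, with made-up class frequencies: the maximum entropy for k classes is log2(k), so any dataset with more than two classes can legitimately exceed 1.

import math

def shannon_entropy(probs):
    # entropy in bits of a discrete distribution
    return -sum(p * math.log(p, 2) for p in probs if p > 0)

print(shannon_entropy([0.5, 0.5]))            # 2 classes -> 1.0 (max = log2(2) = 1)
print(shannon_entropy([0.25] * 4))            # 4 classes -> 2.0 (max = log2(4) = 2)
print(shannon_entropy([0.7, 0.1, 0.1, 0.1]))  # 4 skewed classes -> ~1.36, still above 1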
I am trying to create a multi-peak Voigt/Gaussian/Lorentzian fit function with lmfit.
Therefore, I wrote the following function:
def apply_fit_mix_multy(data,modelPeak,peakPos,amplitud,**kwargs):
    peakPos=np.array(peakPos)
    Start=kwargs.get('Start',data[0,0])
    length_data=len(data)-1
    End=kwargs.get('End',data[length_data,0])
    StartPeak=kwargs.get('StartPeak',data[0,0])
    EndPeak=kwargs.get('EndPeak',data[length_data,0])
    BackFunc=kwargs.get('BackFunc',False)
    BackCut=kwargs.get('BackCut',False)
    dataN=data_intervall(data,Start,End)
    y=dataN[:, 1]
    x=dataN[:, 0]
    amplitud=amplitud
    center=peakPos
    mod = None
    for i in range(len(peakPos)):
        this_mod = make_model(i,amplitud,center,modelPeak)
        if mod is None:
            mod = this_mod
        else:
            mod = mod + this_mod
    bgy=[list() for f in range(len(x))]
    if(BackFunc==True):
        bg,bgx=BackFunc
        for i in range(len(x)):
            bgy[i]=bg.best_values.get('c')
    elif(BackCut!=False):
        slope,intercept=back_ground_cut(data,BackCut[0],BackCut[1])
        for i in range(len(x)):
            bgy[i]=slope*x[i]+intercept
    if(BackCut!=False):
        print('Background subtraction model is used! (Sign=Sign-backgr(linear between two points))')
        y=y-bgy
        out = mod.fit(y, x=x)
    else:
        print('Combination model is used! (offset+Gauss/Lor/Voig)')
        offset=ConstantModel()
        mod=mod+offset
        out = mod.fit(y, x=x)  # out is the fitted function
    area=[list() for f in range(len(peakPos))]
    comps=out.eval_components(x=x)
    if(BackCut!=False):
        for i in range(len(peakPos)):
            area[i]=simps(comps['peak'+str(i)+'_'],x=x,even='avg')-simps(bgy,x=x,even='avg')
        fit_dict={'signal':y, 'convol':out.best_fit,'x':x,'peak_area':area,'backgr':bgy,'comps':comps}
    else:
        for i in range(len(peakPos)):
            area[i]=simps(comps['peak'+str(i)+'_'],x=x,even='avg')
        fit_dict={'convol':out.best_fit,'x':x,'peak_area':area,'comps':comps}  # comps is info on the separate peaks
    return fit_dict
The function reads in a data set, the modelPeak (e.g. GaussianModel), and an initial guess of the peak positions and amplitudes (peakPos, amplitud).
In the first part I initialize the model of the peaks (how many peaks, ...):
for i in range(len(peakPos)):
    this_mod = make_model(i,amplitud,center,modelPeak)
    if mod is None:
        mod = this_mod
    else:
        mod = mod + this_mod
with the make_model function:
def make_model(num,amplitud,center,mod):
    pref = "peak{0}_".format(num)
    model = mod(prefix = pref)
    model.set_param_hint(pref+'amplitud', value=amplitud[num], min=0, max=5*amplitud[num])
    model.set_param_hint(pref+'center', value=center[num], min=center[num]-0.5, max=center[num]+0.5)
    if(num==0):
        model.set_param_hint(pref+'sigma', value=0.3, min=0.01, max=1)
    else:
        model.set_param_hint(pref+'sigma', value=0.3, min=0.01, max=1)
    #print('Jetzt',center[num],amplitud[num])
    return model
Here is now my problem: if I want to fit e.g. 3 peaks, I want the sigma of the first peak to vary during the fit, while the sigmas of the other peaks depend on the sigma of the first peak!
Any idea?
Thanks!
FYI, this is what a fit looks like: (plot image omitted)
If I understand your long question (it would be helpful to remove the extraneous stuff - and there is quite a lot of it), you want to create a Model with multiple peaks, allowing sigma from the 1st peak to vary freely, and constraining sigma for the other peaks to depend on this.
To do that, you can either use parameter hints (as you use in your make_model() function) or set expressions for the parameters after the Parameters object is created. For the first approach, something like this
def make_model(num,amplitud,center,mod):
    pref = "peak{0}_".format(num)
    model = mod(prefix = pref)
    model.set_param_hint(pref+'amplitud', value=amplitud[num], min=0, max=5*amplitud[num])
    model.set_param_hint(pref+'center', value=center[num], min=center[num]-0.5, max=center[num]+0.5)
    if(num==0):
        model.set_param_hint(pref+'sigma', value=0.3, min=0.01, max=1)
    else:
        ## instead of
        # model.set_param_hint(pref+'sigma', value=0.3, min=0.01, max=1)
        ## set peakN_sigma == peak0_sigma
        model.set_param_hint(pref+'sigma', expr='peak0_sigma')
        ## or maybe set peakN_sigma == N * peak0_sigma:
        # model.set_param_hint(pref+'sigma', expr='%d*peak0_sigma' % num)
    return model
You could also make the full model (simplified somewhat from your code, but the same idea):
model = (VoigtModel(prefix='peak0_') + VoigtModel(prefix='peak1_') +
VoigtModel(prefix='peak2_') + LinearModel(prefix='const_'))
# create parameters with default values
params = model.make_params(peak0_amplitude=10, peak0_sigma=2, ....)
# set constraints for `sigma` params:
params['peak1_sigma'].expr = 'peak0_sigma'
params['peak2_sigma'].expr = 'peak0_sigma'
# similarly, set bounds as needed:
params['peak1_sigma'].min = 0
params['peak1_amplitude'].min = 0
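To make that second approach concrete, here is a minimal end-to-end sketch on synthetic data (all values below are made up); only peak0_sigma varies freely and the other widths are tied to it:

import numpy as np
from lmfit.models import GaussianModel

# synthetic data: three Gaussian peaks sharing the same width, plus noise
x = np.linspace(0, 10, 500)
def gauss(x, area, cen, sig):
    return area / (sig * np.sqrt(2 * np.pi)) * np.exp(-(x - cen)**2 / (2 * sig**2))

np.random.seed(0)
y = gauss(x, 5, 2.0, 0.3) + gauss(x, 3, 5.0, 0.3) + gauss(x, 4, 8.0, 0.3)
y += np.random.normal(0, 0.05, x.size)

model = (GaussianModel(prefix='peak0_') + GaussianModel(prefix='peak1_') +
         GaussianModel(prefix='peak2_'))
params = model.make_params(peak0_amplitude=4, peak0_center=2.0, peak0_sigma=0.5,
                           peak1_amplitude=4, peak1_center=5.0,
                           peak2_amplitude=4, peak2_center=8.0)
# constrain the widths of peaks 1 and 2 to follow peak 0
params['peak1_sigma'].expr = 'peak0_sigma'
params['peak2_sigma'].expr = 'peak0_sigma'

result = model.fit(y, params, x=x)
print(result.fit_report())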
Hope that helps...
I'm looking for a way to cluster a set of features with the DBSCAN algorithm in TensorFlow, however I'm unable to find anything related.
TensorFlow offers K-Means clustering (tf.contrib.learn.KMeansClustering), but I need the DBSCAN algorithm.
Can anybody suggest any existing wrappers written in Python/Java?
Any pointers on how to implement it from scratch?
P.S. I am aware of sklearn and similar libraries that have DBSCAN, but I specifically need it in TensorFlow.
I know I'm like a year late to the party, but for any future reference:
here is my implementation of a DBSCAN-like algorithm. It might give results slightly different from what you'd get from the algorithm implemented in sklearn, for example, especially for observations that may belong to more than one cluster.
I know it's probably not optimal.
I know that TF is not the best choice when it comes to implementing the algorithm.
But maybe someone will find the code valuable.
Relevant code:
import tensorflow as tf
import numpy as np

def run(vals, epsilon=4, min_points=4):

    def merge_core_points_into_clusters(elems):
        row = elems
        mat = core_points_connection_matrix
        nonempty_intersection_inds = tf.where(tf.reduce_any(tf.logical_and(row, mat), axis=1))
        cumul = tf.logical_or(row, mat)
        subcumul = tf.gather_nd(cumul, nonempty_intersection_inds)
        return tf.reduce_any(subcumul, axis=0)

    def label_clusters(elems):
        return tf.reduce_min(tf.where(elems))

    def get_subsets_for_labels(elems):
        val = elems[0]
        labels = elems[1]
        conn = relation_matrix
        inds = tf.where(tf.equal(labels, val))
        masks = tf.gather_nd(conn, inds)
        return tf.reduce_any(masks, axis=0)

    def scatter_labels(elems):
        label = tf.expand_dims(elems[0], 0)
        mask = elems[1]
        return label*tf.cast(mask, dtype=tf.int64)

    data_np = np.array(vals)
    eps = epsilon
    min_pts = min_points

    in_set = tf.placeholder(tf.float64)

    # distance matrix
    r = tf.reduce_sum(in_set*in_set, 1)
    # turn r into column vector
    r = tf.reshape(r, [-1, 1])
    dist_mat = tf.sqrt(r - 2*tf.matmul(in_set, tf.transpose(in_set)) + tf.transpose(r))

    # for every point show, which points are within eps distance of that point (including that point)
    relation_matrix = dist_mat <= eps

    # number of points within eps-ball for each point
    num_neighbors = tf.reduce_sum(tf.cast(relation_matrix, tf.int64), axis=1)

    # for each point show, whether this point is core point
    core_points_mask = num_neighbors >= min_pts

    # indices of core points
    core_points_indices = tf.where(core_points_mask)

    core_points_connection_matrix = tf.cast(core_points_mask, dtype=tf.int64) * tf.cast(relation_matrix, dtype=tf.int64)
    core_points_connection_matrix = tf.cast(core_points_connection_matrix, dtype=tf.bool)
    core_points_connection_matrix = tf.logical_and(core_points_connection_matrix, core_points_mask)

    merged = tf.map_fn(
        merge_core_points_into_clusters,
        core_points_connection_matrix,
        dtype=tf.bool
    )

    nonempty_clusters_records = tf.gather_nd(merged, core_points_indices)

    marked_core_points = tf.map_fn(label_clusters, nonempty_clusters_records, dtype=tf.int64)

    _, labels_core_points = tf.unique(marked_core_points, out_idx=tf.int64)

    labels_core_points = labels_core_points+1

    unique_labels, _ = tf.unique(labels_core_points)

    labels_all = tf.scatter_nd(
        tf.cast(core_points_indices, tf.int64),
        labels_core_points,
        shape=tf.cast(tf.shape(core_points_mask), tf.int64)
    )

    # for each label return mask, which points should have this label
    ul_shape = tf.shape(unique_labels)
    labels_tiled = tf.maximum(tf.zeros([ul_shape[0], 1], dtype=tf.int64), labels_all)

    labels_subsets = tf.map_fn(
        get_subsets_for_labels,
        (unique_labels, labels_tiled),
        dtype=tf.bool
    )

    final_labels = tf.map_fn(
        scatter_labels,
        elems=(tf.expand_dims(unique_labels, 1), labels_subsets),
        dtype=tf.int64
    )
    final_labels = tf.reduce_max(final_labels, axis=0)

    with tf.Session() as sess:
        results = (sess.run(final_labels, feed_dict={in_set:data_np})).reshape((1, -1))

    results = results.reshape((-1, 1))

    return results
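A small usage sketch with made-up 2D points (this assumes TensorFlow 1.x, since the code relies on tf.placeholder and tf.Session); in the returned labels, 0 appears to mark points not assigned to any cluster:

import numpy as np

# two well-separated blobs plus one far-away outlier
pts = np.vstack([
    np.random.normal(loc=[0, 0], scale=0.5, size=(20, 2)),
    np.random.normal(loc=[10, 10], scale=0.5, size=(20, 2)),
    [[50.0, 50.0]],
])
labels = run(pts, epsilon=2, min_points=4)
print(labels.ravel())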