singular value decomposition changing results - python

I am trying to perform text summarization using svds but the summary result is changing everytime I run the function. Can someone please let me know the reason and also a solution for it ?
I even checked the indivudual arrays u, s and v even they are changing after every run. How to make them static ?
The sentence matrix has been calculated as follows after that svds code. The dataset is some legal document from australian supreme court.
def _compute_matrix(sentences, weighting, norm):
if weighting.lower() == 'binary':
vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 1),
binary=True, stop_words=None)
elif weighting.lower() == 'frequency':
vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 1),
binary=False, stop_words=None)
elif weighting.lower() == 'tfidf':
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1),
stop_words=None)
else:
raise ValueError('Parameter "method" must take one of the values
"binary", "frequency" or "tfidf".')
# Extract word features from sentences using sparse vectorizer
frequency_matrix = vectorizer.fit_transform(sentences).astype(float)
terms = vectorizer.get_feature_names()
if norm in ('l1', 'l2'):
frequency_matrix = normalize(frequency_matrix, norm=norm, axis=1)
elif norm is not None:
raise ValueError('Parameter "norm" can only take values "l1", "l2"
or None')
return frequency_matrix, terms
processed_sentences = _createsentences(raw_content)
sentence_matrix, feature_names = _compute_matrix(processed_sentences,
weighting='tfidf', norm='l2')
sentence_matrix = sentence_matrix.transpose()
sentence_matrix = sentence_matrix.multiply(sentence_matrix > 0)
print(sentence_matrix.shape)
u, s, v = svds(sentence_matrix, k=20)
topic_sigma_threshold = 0.5
topic_averages = v.mean(axis=1)
for topic_ndx, topic_avg in enumerate(topic_averages):
v[topic_ndx, v[topic_ndx, :] <= topic_avg] = 0
if 1 <= topic_sigma_threshold < 0:
raise ValueError('Parameter topic_sigma_threshold must take a value
between 0 and 1')
sigma_threshold = max(s) * topic_sigma_threshold
s[s < sigma_threshold] = 0
saliency_vec = np.dot(np.square(s), np.square(v))
top_sentences = saliency_vec.argsort()[-25:][::-1]
top_sentences.sort()
[processed_sentences[i] for i in top_sentences]

I found a solution by playing with the parameters of svd and understanding the source code of svds. svds uses random intial vector from dimension N of the sparse matrix. So to set the initial vector to a constant choice we must use the v0 parameter and the code is mentioned below.
np.random.seed(0)
v0 = np.random.rand(min(sentence_matrix.shape))
u, s, v = svds(sentence_matrix, k=20, v0=v0)

Related

how to overcome the "'numpy.ndarray' object is not callable" error?

I looked into the anomaly detection using both PCA and Autoencoder using the codes from the following link: Machine learning for anomaly detection and condition monitoring and I try to run the code part for using PCA with Mahalanobis Distance, however, if I run the code I always get the exception message and it turns out the problem is with the covariance matrix function part where the error 'numpy.ndarray' object is not callable appears. I tried to create new variables, change the dataframe into NumPy but nothing worked what is causing this error?
Code:
def cov_matrix(data, verbose=False):
# data = pd.DataFrame(data).to_numpy()
print('calculating the covaraince matrix')
covariance_matrix = np.cov(data, rowvar=False)
print('Done the covaraince matrix')
if is_pos_def(covariance_matrix):
inv_covariance_matrix = np.linalg.inv(covariance_matrix)
if is_pos_def(inv_covariance_matrix):
return covariance_matrix, inv_covariance_matrix
else:
print("Error: Inverse of Covariance Matrix is not positive definite!")
else:
print("Error: Covariance Matrix is not positive definite!")
def MahalanobisDist(inv_cov_matrix, mean_distr, data, verbose=False):
inv_covariance_matrix = inv_cov_matrix
vars_mean = mean_distr
diff = data - vars_mean
md = []
for i in range(len(diff)):
md.append(np.sqrt(diff[i].dot(inv_covariance_matrix).dot(diff[i])))
return md
def MD_detectOutliers(dist, extreme=False, verbose=False):
k = 3. if extreme else 2.
threshold = np.mean(dist) * k
outliers = []
for i in range(len(dist)):
if dist[i] >= threshold:
outliers.append(i) # index of the outlier
return np.array(outliers)
def MD_threshold(dist, extreme=False, verbose=False):
k = 3. if extreme else 2.
threshold = np.mean(dist) * k
return threshold
#### Main code:
# Inputting the training and test dataframes:
data_train = np.array(principalDf_C0.values)
data_test_C1 = np.array(principalDf_C1.values)
data_test_C2 = np.array(principalDf_C2.values)
data_test_C3 = np.array(principalDf_C4.values)
data_test_C4 = np.array(principalDf_C5.values)
print('Training Dataframe: ', data_train[:,])
print('Test1 Dataframe: ', data_test_C1)
print('Test2 Dataframe: ', data_test_C2)
print('Test3 Dataframe: ', data_test_C3)
print('Test4 Dataframe: ', data_test_C4)
data_train_df = pd.DataFrame(principalDf_C0.values)
data_test_df_C1 = pd.DataFrame(principalDf_C1.values)
data_test_df_C2 = pd.DataFrame(principalDf_C2.values)
data_test_df_C3 = pd.DataFrame(principalDf_C4.values)
data_test_df_C4 = pd.DataFrame(principalDf_C5.values)
# Calculating the covariance matrix:
cov_matrix, inv_cov_matrix = cov_matrix(data=data_train)
# Calculating the mean value for the input variables:
mean_distr = data_train_df.mean(axis=0)
# Calculating the Mahalanobis distance and threshold value to flag datapoints as an anomaly:
dist_test_C1 = MahalanobisDist(inv_cov_matrix, mean_distr, data_test_df_C1, verbose=True)
dist_test_C2 = MahalanobisDist(inv_cov_matrix, mean_distr, data_test_df_C2, verbose=True)
dist_test_C3 = MahalanobisDist(inv_cov_matrix, mean_distr, data_test_df_C3, verbose=True)
dist_test_C4 = MahalanobisDist(inv_cov_matrix, mean_distr, data_test_df_C4, verbose=True)
dist_train = MahalanobisDist(inv_cov_matrix, mean_distr, data_train_df, verbose=True)
threshold = MD_threshold(dist_train, extreme = True)
# Distribution of Threshold value for flagging an anomaly:
plt.figure()
sns.distplot(np.square(dist_train),bins = 10, kde= False)
# plt.xlim([0.0,15])
plt.show()
plt.figure()
sns.distplot(dist_train, bins = 10, kde= True, color = 'green');
# plt.xlim([0.0,5])
plt.xlabel('Mahalanobis dist')
plt.show()
anomaly_train = pd.DataFrame(index=data_train_df.index)
anomaly_train['Mob dist']= dist_train
anomaly_train['Thresh'] = threshold
# If Mob dist above threshold: Flag as anomaly
anomaly_train['Anomaly'] = anomaly_train['Mob dist'] > anomaly_train['Thresh']
anomaly_train.index = X_train_PCA.index
anomaly_C1 = pd.DataFrame(index=data_test_df_C1.index)
anomaly_C1['Mob dist']= dist_test_C1
anomaly_C1['Thresh'] = threshold
# If Mob dist above threshold: Flag as anomaly
anomaly_C1['Anomaly'] = anomaly_C1['Mob dist'] > anomaly_C1['Thresh']
anomaly_C1.index = data_test_df_C1.index
anomaly_C1.head()
anomaly_C2 = pd.DataFrame(index=data_test_df_C2.index)
anomaly_C2['Mob dist']= dist_test_C2
anomaly_C2['Thresh'] = threshold
# If Mob dist above threshold: Flag as anomaly
anomaly_C2['Anomaly'] = anomaly_C2['Mob dist'] > anomaly_C2['Thresh']
anomaly_C2.index = data_test_df_C2.index
anomaly_C2.head()
anomaly_C3 = pd.DataFrame(index=data_test_df_C3.index)
anomaly_C3['Mob dist']= dist_test_C3
anomaly_C3['Thresh'] = threshold
# If Mob dist above threshold: Flag as anomaly
anomaly_C3['Anomaly'] = anomaly_C3['Mob dist'] > anomaly_C3['Thresh']
anomaly_C3.index = data_test_df_C3.index
anomaly_C3.head()
anomaly_C4 = pd.DataFrame(index=data_test_df_C4.index)
anomaly_C4['Mob dist']= dist_test_C4
anomaly_C4['Thresh'] = threshold
# If Mob dist above threshold: Flag as anomaly
anomaly_C4['Anomaly'] = anomaly_C4['Mob dist'] > anomaly_C4['Thresh']
anomaly_C4.index = data_test_df_C4.index
anomaly_C4.head()
final_scored = pd.concat([anomaly_train, anomaly_C1, anomaly_C2, anomaly_C3, anomaly_C4])
print(final_scored)
except Exception:
print('Cannot implement Anomaly detection using Mahalanobis distance metric')
pass
Per your comment, you have a namespace collision between a var cov_matrix and a function cov_matrix()
Change that line to e.g.
matrix, inv_matrix = cov_matrix(data=data_train)
And update your code accordingly, or rename cov_matrix(). A good convention is that functions which return things should have verbs in their name, e.g. generate_cov_matrix() or calculate_cov_matrix().*
(Yes, as written the code should run once, since AFAICS you don't call cov_matrix() again after that, but I'm guessing you're using a persistent interpreter session and evaluating the code again once cov_matrix() has been overwritten.)
*This convention assumes that functions are there to have side effects, and return things exceptionally. Of course if you are writing functionally, and having side effects is the exception not the rule, you would likely want to invert it, or follow another convention entirely.
My guess is that you are running into an issue where you have a variable named cov_matrix and a function named cov_matrix. At some point I think you overwrote the function with the variable, which is a numpy.ndarray. Later you try calling the function cov_matrix(), but the object is actually the variable, i.e. the numpy array.

I'm having trouble with content based recommendation system prediction (NOT TDIDF)

I keep getting the following error --> Exception: Dim. mismatch: Test data contains 3 items, while Content contains 1526 items. Please make sure the columns of test and content match.
Can someone help me? I've been working on this code for a few days. My entire body of code is below.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IESEGRecSys.Functions import *
from sklearn.model_selection import train_test_split
from surprise import KNNBasic
from surprise import Dataset, Reader
user_artists = pd.read_table("user_artists.dat")
user_artists['ratings'] = 0
user_artists.loc[user_artists['weight'] <= user_artists['weight'].quantile(1), 'ratings'] = 5
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.8), 'ratings'] = 4
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.6), 'ratings'] = 3
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.4), 'ratings'] = 2
user_artists.loc[user_artists['weight'] < user_artists['weight'].quantile(0.2), 'ratings'] = 1
data = user_artists[['userID','artistID','ratings']]
data.head()
data.shape
# train-test split
train, test = train_test_split(data, test_size=0.3, random_state=42)
# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print(data.shape)
print(train.shape)
print(test.shape)
tags = pd.read_table("tags.dat", encoding = 'unicode_escape')
user_taggedartists = pd.read_table("user_taggedartists.dat")
user_tag_merged = pd.merge(user_taggedartists, tags, on="tagID", how="inner")
user_tag_merged_updated = pd.merge(user_tag_merged, data, on=(["userID","artistID"]),how="inner")
movie=user_tag_merged_updated
movie
data2 = data[['userID','artistID','ratings']]
# train-test split
train, test2 = train_test_split(data2, test_size=0.3, random_state=42)
# reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
print(data2.shape)
print(train.shape)
print(test.shape)
data_pivot2 = data2.pivot_table(index='artistID', values='ratings', columns='userID').fillna(0)
data_pivot2.head()
movie2 = [['tagID','artistID','year']]
movie2 = user_tag_merged_updated.pivot_table(index='tagID', values='year', columns='userID').fillna(0)
movie2.head()
# Content based as a function
from numpy.linalg import norm
def simil_cosine(a,b):
return np.dot(a, b)/(norm(a)*norm(b))
def ContentBased(content_data, test_data, NN):
cdata = content_data.reset_index(drop=True).copy()
# store user and item dimensions
dim = cdata.shape[0]
nr_user = cdata.shape[0]
if test_data.shape[1] != dim:
raise Exception('Dim. mismatch: Test data contains {} items, while Content contains {} items. Please make sure the columns of test and content match.'\
.format(test_data.shape[1], dim))
# similarity matrices
matrix = np.zeros(shape=(dim, dim), dtype=np.float)
matrixNN = np.zeros(shape=(dim, dim), dtype=np.float)
# compute similarity
for i, row in cdata.iterrows():
for j, col in cdata.iterrows():
if i <= j: continue
else: matrix[i][j] = simil_cosine(np.array(row),np.array(col))
# copy values to other diagonal
matrix = matrix + matrix.T - np.diag(np.diag(matrix))
print('Similarity calculation done...')
# mask all values that are not nearest neighbors
cutoff = lambda x,cv: x if x >= cv else 0.0
v_cutoff = np.vectorize(cutoff)
for i in range(dim):
crit_val = -np.sort(-matrix[i])[NN-1]
matrixNN[i] = v_cutoff(matrix[i], crit_val)
print('Nearest neighbor selection done...')
# predict user-item ratings in test_data
prediction = np.zeros(shape=(nr_user, dim), dtype=np.float)
for i in range(nr_user):
num = np.matmul(np.array(test_data.iloc[i,:]), matrixNN)
denom = matrixNN.sum(axis=0) # column sums
prediction[i] = num/denom
print('Prediction done...')
# return DataFrame
return pd.DataFrame(prediction, index=test_data.index, columns=test_data.columns)
cb_pred = ContentBased(movie2,data_pivot2, 10)
# Content Based as a Class
from numpy.linalg import norm
class ContentBased:
def simil_cosine(self, a,b):
return np.dot(a, b)/(norm(a)*norm(b))
def __init__(self, NN):
self.NN = NN
def fit(self, content_data):
cdata = content_data.reset_index(drop=True).copy()
self.item_dim = cdata.shape[0]
self.matrix = np.zeros(shape=(self.item_dim, self.item_dim), dtype=np.float)
self.matrixNN = np.zeros(shape=(self.item_dim, self.item_dim), dtype=np.float)
# compute similarity
for i, row in cdata.iterrows():
for j, col in cdata.iterrows():
if i <= j: continue
else: self.matrix[i][j] = self.simil_cosine(np.array(row),np.array(col))
# copy values to other diagonal
self.matrix = self.matrix + self.matrix.T - np.diag(np.diag(self.matrix))
cutoff = lambda x,cv: x if x >= cv else 0.0
v_cutoff = np.vectorize(cutoff)
for i in range(self.item_dim):
crit_val = -np.sort(-self.matrix[i])[self.NN-1]
self.matrixNN[i] = v_cutoff(self.matrix[i], crit_val)
def predict(self, test_data):
if test_data.shape[1] != self.item_dim:
raise Exception('Dim. mismatch: Test data contains {} items, while Content contains {} items. Please make sure the columns of test and content match.'\
.format(test_data.shape[1], self.item_dim))
I keep getting the following error --> Exception: Dim. mismatch: Test data contains 3 items, while Content contains 1526 items. Please make sure the columns of test and content match.

Why does it work when columns are larger than rows in Python Sklearn (Linear Regression) [duplicate]

it's known that when the number of variables (p) is larger than the number of samples (n) the least square estimator is not defined.
In sklearn I receive this values:
In [30]: lm = LinearRegression().fit(xx,y_train)
In [31]: lm.coef_
Out[31]:
array([[ 0.20092363, -0.14378298, -0.33504391, ..., -0.40695124,
0.08619906, -0.08108713]])
In [32]: xx.shape
Out[32]: (1097, 3419)
Call [30] should return an error. How does sklearn work when p>n like in this case?
EDIT:
It seems that the matrix is filled with some values
if n > m:
# need to extend b matrix as it will be filled with
# a larger solution matrix
if len(b1.shape) == 2:
b2 = np.zeros((n, nrhs), dtype=gelss.dtype)
b2[:m,:] = b1
else:
b2 = np.zeros(n, dtype=gelss.dtype)
b2[:m] = b1
b1 = b2
When the linear system is underdetermined, then the sklearn.linear_model.LinearRegression finds the minimum L2 norm solution, i.e.
argmin_w l2_norm(w) subject to Xw = y
This is always well defined and obtainable by applying the pseudoinverse of X to y, i.e.
w = np.linalg.pinv(X).dot(y)
The specific implementation of scipy.linalg.lstsq, which is used by LinearRegression uses get_lapack_funcs(('gelss',), ... which is precisely a solver that finds the minimum norm solution via singular value decomposition (provided by LAPACK).
Check out this example
import numpy as np
rng = np.random.RandomState(42)
X = rng.randn(5, 10)
y = rng.randn(5)
from sklearn.linear_model import LinearRegression
lr = LinearRegression(fit_intercept=False)
coef1 = lr.fit(X, y).coef_
coef2 = np.linalg.pinv(X).dot(y)
print(coef1)
print(coef2)
And you will see that coef1 == coef2. (Note that fit_intercept=False is specified in the constructor of the sklearn estimator, because otherwise it would subtract the mean of each feature before fitting the model, yielding different coefficients)

Sklearn logistic regression shape error, but x, y shapes are consistent

I get a ValueError: Found input variables with inconsistent numbers of samples: [20000, 1] when I run the following even though the row values of x and y are correct. I load in the RCV1 dataset, get indices of the categories with the top x documents, create list of tuples with equal number of randomly-selected positives and negatives for each category, and then finally attempt to run a logistic regression on one of the categories.
import sklearn.datasets
from sklearn import model_selection, preprocessing
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from scipy import sparse
rcv1 = sklearn.datasets.fetch_rcv1()
def get_top_cat_indices(target_matrix, num_cats):
cat_counts = target_matrix.sum(axis=0)
#cat_counts = cat_counts.reshape((1,103)).tolist()[0]
cat_counts = cat_counts.reshape((103,))
#b = sorted(cat_counts, reverse=True)
ind_temp = np.argsort(cat_counts)[::-1].tolist()[0]
ind = [ind_temp[i] for i in range(5)]
return ind
def prepare_data(x, y, top_cat_indices, sample_size):
res_lst = []
for i in top_cat_indices:
# get column of indices with relevant cat
temp = y.tocsc()[:, i]
# all docs with labeled category
cat_present = x.tocsr()[np.where(temp.sum(axis=1)>0)[0],:]
# all docs other than labelled category
cat_notpresent = x.tocsr()[np.where(temp.sum(axis=1)==0)[0],:]
# get indices equal to 1/2 of sample size
idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size/2))
idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size/2))
# concatenate the ids
sampled_x_pos = cat_present.tocsr()[idx_cat,:]
sampled_x_neg = cat_notpresent.tocsr()[idx_nocat,:]
sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
sampled_y_pos = temp.tocsr()[idx_cat,:]
sampled_y_neg = temp.tocsr()[idx_nocat,:]
sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
res_lst.append((sampled_x, sampled_y))
return res_lst
ind = get_top_cat_indices(rcv1.target, 5)
test_res = prepare_data(train_x, train_y, ind, 20000)
x, y = test_res[0]
print(x.shape)
print(y.shape)
LogisticRegression().fit(x, y)
Could it be an issue with the sparse matrices, or problem with dimensionality (there are 20K samples and 47K features)
When I run your code, I get following error:
AttributeError: 'bool' object has no attribute 'any'
That's because y for LogisticRegression needs to numpy array. So, I changed last line to:
LogisticRegression().fit(x, y.A.flatten())
Then I get following error:
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0
This is because your sampling code has a bug. You need to subset y array with rows having that category before using sampling indices. See code below:
def prepare_data(x, y, top_cat_indices, sample_size):
res_lst = []
for i in top_cat_indices:
# get column of indices with relevant cat
temp = y.tocsc()[:, i]
# all docs with labeled category
c1 = np.where(temp.sum(axis=1)>0)[0]
c2 = np.where(temp.sum(axis=1)==0)[0]
cat_present = x.tocsr()[c1,:]
# all docs other than labelled category
cat_notpresent = x.tocsr()[c2,:]
# get indices equal to 1/2 of sample size
idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size/2))
idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size/2))
# concatenate the ids
sampled_x_pos = cat_present.tocsr()[idx_cat,:]
sampled_x_neg = cat_notpresent.tocsr()[idx_nocat,:]
sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
sampled_y_pos = temp.tocsr()[c1][idx_cat,:]
print(sampled_y_pos.nnz)
sampled_y_neg = temp.tocsr()[c2][idx_nocat,:]
print(sampled_y_neg.nnz)
sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
res_lst.append((sampled_x, sampled_y))
return res_lst
Now, Everything works like a charm

How to implement DBSCAN clustering in tensorflow?

I'm looking for a way to cluster set of features with DBSCAN algorithm in tensorflow however I'm unable to find anything related.
TensorFlow offers K-Means clustering (tf.contrib.learn.KMeansClustering), but I need DBSCAN algorithm.
Can anybody suggest me any existing wrappers written in python/java?
Any pointers on how to implement it from scratch?
P.S. I am aware of sklearn and similar libraries that has DBSCAN, but I specifically need in TensorFlow.
I know I'm like a year late to the party, but for any future reference:
here is my implementation of DBSCAN-like algorithm. It might give results slightly different from what you'd get from algorithm implemented in sklearn, for example, especially for observations that may belong to more than one clusters.
I know it's probably not optimal.
I know, that TF is not the best choice when it comes to implementing the algorithm.
But maybe someone will find the code valuable.
Relevant code:
import tensorflow as tf
import numpy as np
def run(vals, epsilon=4, min_points=4):
def merge_core_points_into_clusters(elems):
row = elems
mat = core_points_connection_matrix
nonempty_intersection_inds = tf.where(tf.reduce_any(tf.logical_and(row, mat), axis=1))
cumul = tf.logical_or(row, mat)
subcumul = tf.gather_nd(cumul, nonempty_intersection_inds)
return tf.reduce_any(subcumul, axis=0)
def label_clusters(elems):
return tf.reduce_min(tf.where(elems))
def get_subsets_for_labels(elems):
val = elems[0]
labels = elems[1]
conn = relation_matrix
inds = tf.where(tf.equal(labels, val))
masks = tf.gather_nd(conn, inds)
return tf.reduce_any(masks, axis=0)
def scatter_labels(elems):
label = tf.expand_dims(elems[0], 0)
mask = elems[1]
return label*tf.cast(mask, dtype=tf.int64)
data_np = np.array(vals)
eps = epsilon
min_pts = min_points
in_set = tf.placeholder(tf.float64)
# distance matrix
r = tf.reduce_sum(in_set*in_set, 1)
# turn r into column vector
r = tf.reshape(r, [-1, 1])
dist_mat = tf.sqrt(r - 2*tf.matmul(in_set, tf.transpose(in_set)) + tf.transpose(r))
# for every point show, which points are within eps distance of that point (including that point)
relation_matrix = dist_mat <= eps
# number of points within eps-ball for each point
num_neighbors = tf.reduce_sum(tf.cast(relation_matrix, tf.int64), axis=1)
# for each point show, whether this point is core point
core_points_mask = num_neighbors >= min_pts
# indices of core points
core_points_indices = tf.where(core_points_mask)
core_points_connection_matrix = tf.cast(core_points_mask, dtype=tf.int64) * tf.cast(relation_matrix, dtype=tf.int64)
core_points_connection_matrix = tf.cast(core_points_connection_matrix, dtype=tf.bool)
core_points_connection_matrix = tf.logical_and(core_points_connection_matrix, core_points_mask)
merged = tf.map_fn(
merge_core_points_into_clusters,
core_points_connection_matrix,
dtype=tf.bool
)
nonempty_clusters_records = tf.gather_nd(merged, core_points_indices)
marked_core_points = tf.map_fn(label_clusters, nonempty_clusters_records, dtype=tf.int64)
_, labels_core_points = tf.unique(marked_core_points, out_idx=tf.int64)
labels_core_points = labels_core_points+1
unique_labels, _ = tf.unique(labels_core_points)
labels_all = tf.scatter_nd(
tf.cast(core_points_indices, tf.int64),
labels_core_points,
shape=tf.cast(tf.shape(core_points_mask), tf.int64)
)
# for each label return mask, which points should have this label
ul_shape = tf.shape(unique_labels)
labels_tiled = tf.maximum(tf.zeros([ul_shape[0], 1], dtype=tf.int64), labels_all)
labels_subsets = tf.map_fn(
get_subsets_for_labels,
(unique_labels, labels_tiled),
dtype=tf.bool
)
final_labels = tf.map_fn(
scatter_labels,
elems=(tf.expand_dims(unique_labels, 1), labels_subsets),
dtype=tf.int64
)
final_labels = tf.reduce_max(final_labels, axis=0)
with tf.Session() as sess:
results = (sess.run(final_labels, feed_dict={in_set:data_np})).reshape((1, -1))
results = results.reshape((-1, 1))
return results

Categories