How to implement DBSCAN clustering in tensorflow? - python

I'm looking for a way to cluster a set of features with the DBSCAN algorithm in TensorFlow; however, I'm unable to find anything related.
TensorFlow offers K-Means clustering (tf.contrib.learn.KMeansClustering), but I need the DBSCAN algorithm.
Can anybody suggest any existing wrappers written in Python/Java?
Any pointers on how to implement it from scratch?
P.S. I am aware of sklearn and similar libraries that have DBSCAN, but I specifically need it in TensorFlow.

I know I'm like a year late to the party, but for any future reference:
here is my implementation of a DBSCAN-like algorithm. It might give results slightly different from what you'd get from the sklearn implementation, for example, especially for observations that may belong to more than one cluster.
I know it's probably not optimal.
I know that TF is not the best choice for implementing this algorithm.
But maybe someone will find the code valuable.
Relevant code:
import tensorflow as tf
import numpy as np

def run(vals, epsilon=4, min_points=4):

    def merge_core_points_into_clusters(elems):
        row = elems
        mat = core_points_connection_matrix
        nonempty_intersection_inds = tf.where(tf.reduce_any(tf.logical_and(row, mat), axis=1))
        cumul = tf.logical_or(row, mat)
        subcumul = tf.gather_nd(cumul, nonempty_intersection_inds)
        return tf.reduce_any(subcumul, axis=0)

    def label_clusters(elems):
        return tf.reduce_min(tf.where(elems))

    def get_subsets_for_labels(elems):
        val = elems[0]
        labels = elems[1]
        conn = relation_matrix
        inds = tf.where(tf.equal(labels, val))
        masks = tf.gather_nd(conn, inds)
        return tf.reduce_any(masks, axis=0)

    def scatter_labels(elems):
        label = tf.expand_dims(elems[0], 0)
        mask = elems[1]
        return label * tf.cast(mask, dtype=tf.int64)

    data_np = np.array(vals)
    eps = epsilon
    min_pts = min_points

    in_set = tf.placeholder(tf.float64)

    # distance matrix
    r = tf.reduce_sum(in_set * in_set, 1)
    # turn r into a column vector
    r = tf.reshape(r, [-1, 1])
    dist_mat = tf.sqrt(r - 2 * tf.matmul(in_set, tf.transpose(in_set)) + tf.transpose(r))

    # for every point, show which points are within eps distance of that point (including the point itself)
    relation_matrix = dist_mat <= eps

    # number of points within the eps-ball for each point
    num_neighbors = tf.reduce_sum(tf.cast(relation_matrix, tf.int64), axis=1)

    # for each point, show whether it is a core point
    core_points_mask = num_neighbors >= min_pts

    # indices of core points
    core_points_indices = tf.where(core_points_mask)

    core_points_connection_matrix = tf.cast(core_points_mask, dtype=tf.int64) * tf.cast(relation_matrix, dtype=tf.int64)
    core_points_connection_matrix = tf.cast(core_points_connection_matrix, dtype=tf.bool)
    core_points_connection_matrix = tf.logical_and(core_points_connection_matrix, core_points_mask)

    merged = tf.map_fn(
        merge_core_points_into_clusters,
        core_points_connection_matrix,
        dtype=tf.bool
    )

    nonempty_clusters_records = tf.gather_nd(merged, core_points_indices)

    marked_core_points = tf.map_fn(label_clusters, nonempty_clusters_records, dtype=tf.int64)
    _, labels_core_points = tf.unique(marked_core_points, out_idx=tf.int64)
    labels_core_points = labels_core_points + 1
    unique_labels, _ = tf.unique(labels_core_points)

    labels_all = tf.scatter_nd(
        tf.cast(core_points_indices, tf.int64),
        labels_core_points,
        shape=tf.cast(tf.shape(core_points_mask), tf.int64)
    )

    # for each label, return a mask of which points should have this label
    ul_shape = tf.shape(unique_labels)
    labels_tiled = tf.maximum(tf.zeros([ul_shape[0], 1], dtype=tf.int64), labels_all)

    labels_subsets = tf.map_fn(
        get_subsets_for_labels,
        (unique_labels, labels_tiled),
        dtype=tf.bool
    )

    final_labels = tf.map_fn(
        scatter_labels,
        elems=(tf.expand_dims(unique_labels, 1), labels_subsets),
        dtype=tf.int64
    )
    final_labels = tf.reduce_max(final_labels, axis=0)

    with tf.Session() as sess:
        results = (sess.run(final_labels, feed_dict={in_set: data_np})).reshape((1, -1))

    results = results.reshape((-1, 1))
    return results
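For reference, a minimal usage sketch (the point set and the epsilon/min_points values below are made up for illustration; points that end up unassigned keep label 0):
import numpy as np
# two dense blobs plus one isolated outlier
points = np.array([
    [0.0, 0.0], [0.5, 0.0], [0.0, 0.5], [0.5, 0.5],
    [10.0, 10.0], [10.5, 10.0], [10.0, 10.5], [10.5, 10.5],
    [50.0, 50.0],
])
labels = run(points, epsilon=1.0, min_points=3)
print(labels.ravel())  # something like [1 1 1 1 2 2 2 2 0]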

Related

Python + Image Processing: Efficiently Assign Pixel Values to Nearest Predefined Value

I implemented an algorithm that uses OpenCV's kmeans to quantize the unique brightness values present in a greyscale image. Quantizing the unique values helped avoid biases towards image backgrounds, which are typically all the same value.
However, I struggled to find a way to use this result to quantize a given input image.
I implemented a very naive solution, but it is unusably slow for the required input sizes (4000x4000):
for x in range(W):
    for y in range(H):
        center_id = np.argmin([(arr[y,x]-center)**2 for center in centers])
        ret_labels2D[y,x] = sortorder.index(center_id)
        ret_qimg[y,x] = centers[center_id]
Basically, I am simply adjusting each pixel to the predefined level with the minimum squared error.
Is there any way to do this faster? I was trying to process an image of size 4000x4000 and this implementation was completely unusable.
Full code:
import cv2
import numpy as np

def unique_quantize(arr, K, eps=0.05, max_iter=100, max_tries=20):
    """#param arr: 2D numpy array of floats"""
    H, W = arr.shape
    unique_values = np.squeeze(np.unique(arr.copy()))
    unique_values = np.array(unique_values, float)
    if unique_values.ndim == 0:
        unique_values = np.array([unique_values], float)
    unique_values = np.ravel(unique_values)
    unique_values = np.expand_dims(unique_values, 1)
    Z = unique_values.astype(np.float32)
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, max_iter, eps)
    compactness, labels, centers = cv2.kmeans(Z, K, None, criteria, max_tries, cv2.KMEANS_RANDOM_CENTERS)
    labels = np.ravel(np.squeeze(labels))
    centers = np.ravel(np.squeeze(centers))
    sortorder = list(np.argsort(centers))  # old index --> index into sortorder
    ret_center = centers[sortorder]
    ret_labels2D = np.zeros((H, W), int)
    ret_qimg = np.zeros((H, W), float)
    for x in range(W):
        for y in range(H):
            center_id = np.argmin([(arr[y, x] - center)**2 for center in centers])
            ret_labels2D[y, x] = sortorder.index(center_id)
            ret_qimg[y, x] = centers[center_id]
    return ret_center, ret_labels2D, ret_qimg
EDIT: I looked at the input file again. The size was actually 12000x12000.
As your image is grayscale (presumably 8 bits), a lookup table is an efficient solution. It suffices to map all 256 gray levels to their nearest center once and for all, then use this as a conversion table. Even a 16-bit range (65536 entries) would be significantly accelerated.
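For example, a minimal sketch of this idea (assuming an 8-bit image and the centers/sortorder produced by the kmeans step above; quantize_with_lut is just an illustrative helper name):
import numpy as np

def quantize_with_lut(arr_uint8, centers, sortorder):
    levels = np.arange(256, dtype=np.float32)
    # nearest center for every possible gray level, computed once
    nearest = np.argmin((levels[:, None] - centers[None, :])**2, axis=1)
    inverse_sortorder = np.argsort(sortorder)
    labels2D = inverse_sortorder[nearest][arr_uint8]  # gray level -> sorted label
    qimg = centers[nearest][arr_uint8]                # gray level -> quantized value
    return labels2D, qimg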
I recently thought of a much better answer. This code is not extensively tested, but it worked for the use case in my project.
I made use of obscure fancy-indexing techniques in order to keep the entire algorithm contained within numpy functions.
def unique_quantize(arr, K, eps=0.05, max_iter=100, max_tries=20):
    """#param arr: 2D numpy array of floats"""
    H, W = arr.shape
    unique_values = np.squeeze(np.unique(arr.copy()))
    unique_values = np.array(unique_values, float)
    if unique_values.ndim == 0:
        unique_values = np.array([unique_values], float)
    unique_values = np.ravel(unique_values)
    unique_values = np.expand_dims(unique_values, 1)
    Z = unique_values.astype(np.float32)
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, max_iter, eps)
    compactness, labels, centers = cv2.kmeans(Z, K, None, criteria, max_tries, cv2.KMEANS_RANDOM_CENTERS)
    labels = np.ravel(np.squeeze(labels))
    centers = np.ravel(np.squeeze(centers))
    sortorder = np.argsort(centers)  # old index --> index into sortorder
    inverse_sortorder = np.array([list(sortorder).index(i) for i in range(len(centers))], int)
    ret_center = centers[sortorder]
    ret_labels2D = np.zeros((H, W), int)
    ret_qimg = np.zeros((H, W), float)
    errors = [np.power((arr - center), 2) for center in centers]
    errors = np.array(errors, float)
    classification = np.squeeze(np.argmin(errors, axis=0))
    ret_labels2D = inverse_sortorder[classification]
    ret_qimg = centers[classification]
    return np.array(ret_center, float), np.array(ret_labels2D, int), np.array(ret_qimg, float)
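As a side note, the per-center list of error arrays can also be written as a single broadcasted argmin; a sketch of the equivalent classification step (assuming arr is the HxW image and centers is the length-K center array from above):
classification = np.argmin((arr[..., None] - centers[None, None, :])**2, axis=-1)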

singular value decomposition changing results

I am trying to perform text summarization using svds, but the summary result changes every time I run the function. Can someone please let me know the reason, and also a solution for it?
I even checked the individual arrays u, s and v; they also change after every run. How can I make them deterministic?
The sentence matrix is computed as follows, and the svds code comes after that. The dataset is a legal document from the Australian Supreme Court.
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize

def _compute_matrix(sentences, weighting, norm):
    if weighting.lower() == 'binary':
        vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 1),
                                     binary=True, stop_words=None)
    elif weighting.lower() == 'frequency':
        vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 1),
                                     binary=False, stop_words=None)
    elif weighting.lower() == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1),
                                     stop_words=None)
    else:
        raise ValueError('Parameter "method" must take one of the values '
                         '"binary", "frequency" or "tfidf".')
    # Extract word features from sentences using sparse vectorizer
    frequency_matrix = vectorizer.fit_transform(sentences).astype(float)
    terms = vectorizer.get_feature_names()
    if norm in ('l1', 'l2'):
        frequency_matrix = normalize(frequency_matrix, norm=norm, axis=1)
    elif norm is not None:
        raise ValueError('Parameter "norm" can only take values "l1", "l2" or None')
    return frequency_matrix, terms

processed_sentences = _createsentences(raw_content)
sentence_matrix, feature_names = _compute_matrix(processed_sentences,
                                                 weighting='tfidf', norm='l2')
sentence_matrix = sentence_matrix.transpose()
sentence_matrix = sentence_matrix.multiply(sentence_matrix > 0)
print(sentence_matrix.shape)

u, s, v = svds(sentence_matrix, k=20)

topic_sigma_threshold = 0.5
topic_averages = v.mean(axis=1)

for topic_ndx, topic_avg in enumerate(topic_averages):
    v[topic_ndx, v[topic_ndx, :] <= topic_avg] = 0

if topic_sigma_threshold < 0 or topic_sigma_threshold > 1:
    raise ValueError('Parameter topic_sigma_threshold must take a value between 0 and 1')

sigma_threshold = max(s) * topic_sigma_threshold
s[s < sigma_threshold] = 0

saliency_vec = np.dot(np.square(s), np.square(v))
top_sentences = saliency_vec.argsort()[-25:][::-1]
top_sentences.sort()

[processed_sentences[i] for i in top_sentences]
I found a solution by playing with the parameters of svds and reading its source code. svds uses a random initial vector whose length matches the smaller dimension of the sparse matrix. To fix the initial vector to a constant choice, we must use the v0 parameter, as in the code below.
np.random.seed(0)
v0 = np.random.rand(min(sentence_matrix.shape))
u, s, v = svds(sentence_matrix, k=20, v0=v0)
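As a quick sanity check (a sketch assuming the sentence_matrix from above), two runs with the same v0 should now produce identical factors:
u1, s1, v1 = svds(sentence_matrix, k=20, v0=v0)
u2, s2, v2 = svds(sentence_matrix, k=20, v0=v0)
assert np.allclose(s1, s2) and np.allclose(v1, v2)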

How to subtract two tensors with a mask in TensorFlow?

I am implementing a YOLO network with a self-defined loss.
Say there are two tensors, GT and PD (ground truth and predictions). Both are 2-D matrices of size 4x4.
Assume GT is:
0,0,0,0
0,1,0,0
0,0,1,0
0,0,0,0
PD has the same size, with some random numbers.
Here I need to calculate the Mean Squared Error separately:
the MSE over the positions where GT is 1, and the MSE over the positions where GT is 0.
I prefer to use a mask to cover the unrelated elements, so that the calculation only involves the related elements. I already implemented this in numpy, but I don't know how to do it with tf (v1.14):
import numpy as np
import numpy.ma as ma
conf = y_true[...,0]
conf = np.expand_dims(conf,-1)
conf_pred = y_pred[...,0]
conf_pred = np.expand_dims(conf_pred,-1)
noobj_conf = ma.masked_equal(conf,1) #cover grid with objects
obj_conf = ma.masked_equal(conf,0) #cover grid without objects
loss_obj = np.sum(np.square(obj_conf - conf_pred))
loss_noobj = np.sum(np.square(noobj_conf - conf_pred))
Any suggestions about how to implement this in tensorflow?
If I understand you correctly, you want to calculate the mean squared errors over the 0 and 1 positions separately.
You can do something like below:
y_true = tf.constant([[0,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,0]], dtype=tf.float32)
y_pred = tf.random.uniform([4, 4], minval=0, maxval=1)
# find indices where 0 is present in y_true
indices0 = tf.where(tf.equal(y_true, tf.zeros([1.])))
# find indices where 1 is present in y_true
indices1 = tf.where(tf.equal(y_true, tf.ones([1.])))
# find all values in y_pred which are present at indices0
y_pred_indices0 = tf.gather_nd(y_pred, indices0)
# find all values in y_pred which are present at indices1
y_pred_indices1 = tf.gather_nd(y_pred, indices1)
# mse loss calculations
mse0 = tf.losses.mean_squared_error(labels=tf.gather_nd(y_true, indices0), predictions=y_pred_indices0)
mse1 = tf.losses.mean_squared_error(labels=tf.gather_nd(y_true, indices1), predictions=y_pred_indices1)
# mse0 = tf.reduce_sum(tf.squared_difference(tf.gather_nd(y_true, indices0), y_pred_indices0))
# mse1 = tf.reduce_sum(tf.squared_difference(tf.gather_nd(y_true, indices1), y_pred_indices1))
with tf.Session() as sess:
    y_, loss0, loss1 = sess.run([y_pred, mse0, mse1])
    print(y_)
    print(loss0, loss1)
output:
[[0.12770343 0.43467927 0.9362457 0.09105921]
[0.46243036 0.8838414 0.92655015 0.9347118 ]
[0.14018488 0.14527774 0.8395766 0.14391887]
[0.1209656 0.7793218 0.70543754 0.749542 ]]
0.341359 0.019614244
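For reference, here is a sketch of the same split using tf.boolean_mask instead of tf.where/tf.gather_nd (assuming the same y_true and y_pred as above); whether this reads better is a matter of taste:
mask1 = tf.equal(y_true, 1.0)   # grid cells with objects
mask0 = tf.logical_not(mask1)   # grid cells without objects
mse1_alt = tf.reduce_mean(tf.squared_difference(
    tf.boolean_mask(y_true, mask1), tf.boolean_mask(y_pred, mask1)))
mse0_alt = tf.reduce_mean(tf.squared_difference(
    tf.boolean_mask(y_true, mask0), tf.boolean_mask(y_pred, mask0)))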

How to use for-loops and arrays in python to calculate error propagation

I am trying to write a for loop to calculate error bars using the derivative method. The formula is relatively simple; however, I seem to be running into errors in my code with respect to vector/array sizes. There are a lot of defined vectors in my code, and I have checked the length of all of them. All of the inputs to the for loop are 1x25 arrays.
I've tried changing the indices in the for loop from range(1,25) to range(0,24), but that doesn't seem to work.
# Creating vectors
dfdvg = np.zeros(25)
dfdxi0 = np.zeros(25)
sigsquare = np.zeros(25)
vgerr = vrs
xi0err = xi0s
Asq = np.zeros(25)
Bsq= np.zeros(25)
sig = np.zeros(25)
# calculating derivatives and error vectors
for i in range(0,24):
    dfdvg[i] = (np.multiply(rms[:,i],delta[:,i]))**-1
    dfdxi0[i] = -vr[:,i]/(vr[:,i]*(np.power(delta[:,i],2)))
    Asq[i] = np.power(np.multiply(dfdvg[i],vgerr[i]),2)
    Bsq[i] = np.power(np.multiply(dfdxi0[i],xi0err[i]),2)
    sigsquare[i] = Asq[i] + Bsq[i]
    sig[i] = np.power(sigsquare[i],0.5)
q = np.power(np.multiply(rms,delta),-1)
left = np.multiply(vg,q)
right = -(beta*H)/(3*(1+zeff))
What I want is the "sig" vector, representing the propagated error for each index.
The problem is not in the dimensions of the arrays; the problem is in their shape. Unfortunately you didn't show how all of your arrays are defined. The point is, if you just use arrays of shape (25,) instead of (1, 25), everything works fine:
vrs = np.random.rand(25)
vr = np.random.rand(25)
xi0s = np.random.rand(25)
rms = np.random.rand(25)
delta = np.random.rand(25)
vg = np.random.rand(25)
# Creating vectors
dfdvg = np.zeros(25)
dfdxi0 = np.zeros(25)
sigsquare = np.zeros(25)
vgerr = vrs
xi0err = xi0s
Asq = np.zeros(25)
Bsq= np.zeros(25)
sig = np.zeros(25)
# calculating derivatives and error vectors
for i in range(0,24):
    dfdvg[i] = (np.multiply(rms[i],delta[i]))**-1
    dfdxi0[i] = -vr[i]/(vr[i]*(np.power(delta[i],2)))
    Asq[i] = np.power(np.multiply(dfdvg[i],vgerr[i]),2)
    Bsq[i] = np.power(np.multiply(dfdxi0[i],xi0err[i]),2)
    sigsquare[i] = Asq[i] + Bsq[i]
    sig[i] = np.power(sigsquare[i],0.5)
q = np.power(np.multiply(rms,delta),-1)
left = np.multiply(vg,q)
(your last line of code seems unrelated)
So, in my opinion, your best option is to reshape your arrays:
vrs = vrs.reshape(25)
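Equivalently, a minimal sketch that flattens every (1, 25) input to shape (25,) before the loop (using the variable names from the question):
vrs, vr, xi0s, rms, delta, vg = (np.ravel(a) for a in (vrs, vr, xi0s, rms, delta, vg))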

Python Information gain implementation

I am currently using scikit-learn for text classification on the 20ng dataset. I want to calculate the information gain for a vectorized dataset. It has been suggested to me that this can be accomplished using mutual_info_classif from sklearn. However, this method is really slow, so I was trying to implement information gain myself based on this post.
I came up with the following solution:
from scipy.stats import entropy
import numpy as np

def information_gain(X, y):

    def _entropy(labels):
        counts = np.bincount(labels)
        return entropy(counts, base=None)

    def _ig(x, y):
        # indices where x is set/not set
        x_set = np.nonzero(x)[1]
        x_not_set = np.delete(np.arange(x.shape[1]), x_set)
        h_x_set = _entropy(y[x_set])
        h_x_not_set = _entropy(y[x_not_set])
        return entropy_full - (((len(x_set) / f_size) * h_x_set)
                               + ((len(x_not_set) / f_size) * h_x_not_set))

    entropy_full = _entropy(y)
    f_size = float(X.shape[0])
    scores = np.array([_ig(x, y) for x in X.T])
    return scores
Using a very small dataset, most scores from sklearn and my implementation are equal. However, sklearn seems to take frequencies into account, which my algorithm clearly doesn't. For example
from time import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif

categories = ['talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
X, y = newsgroups_train.data, newsgroups_train.target

cv = CountVectorizer(max_df=0.95, min_df=2,
                     max_features=100,
                     stop_words='english')
X_vec = cv.fit_transform(X)

t0 = time()
res_sk = mutual_info_classif(X_vec, y, discrete_features=True)
print("Time passed for sklearn method: %3f" % (time() - t0))
t0 = time()
res_ig = information_gain(X_vec, y)
print("Time passed for ig: %3f" % (time() - t0))

for name, res_mi, res_ig in zip(cv.get_feature_names(), res_sk, res_ig):
    print("%s: mi=%f, ig=%f" % (name, res_mi, res_ig))
sample output:
center: mi=0.011824, ig=0.003548
christian: mi=0.128629, ig=0.127122
color: mi=0.028413, ig=0.026397
com: mi=0.041184, ig=0.030458
computer: mi=0.020590, ig=0.012327
cs: mi=0.007291, ig=0.001574
data: mi=0.020734, ig=0.008986
did: mi=0.035613, ig=0.024604
different: mi=0.011432, ig=0.005492
distribution: mi=0.007175, ig=0.004675
does: mi=0.019564, ig=0.006162
don: mi=0.024000, ig=0.017605
earth: mi=0.039409, ig=0.032981
edu: mi=0.023659, ig=0.008442
file: mi=0.048056, ig=0.045746
files: mi=0.041367, ig=0.037860
ftp: mi=0.031302, ig=0.026949
gif: mi=0.028128, ig=0.023744
god: mi=0.122525, ig=0.113637
good: mi=0.016181, ig=0.008511
gov: mi=0.053547, ig=0.048207
So I was wondering whether my implementation is wrong, or whether it is correct but scikit-learn uses a different variation of the mutual information algorithm.
A little late with my answer, but you should look at Orange's implementation. Within their app it is used as a behind-the-scenes processor to help inform the dynamic model parameter building process.
The implementation itself looks fairly straightforward and could most likely be ported out. The entropy calculation comes first.
The section starting at https://github.com/biolab/orange3/blob/master/Orange/preprocess/score.py#L233
def _entropy(dist):
    """Entropy of class-distribution matrix"""
    p = dist / np.sum(dist, axis=0)
    pc = np.clip(p, 1e-15, 1)
    return np.sum(np.sum(- p * np.log2(pc), axis=0) * np.sum(dist, axis=0) / np.sum(dist))
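For intuition, a small usage sketch of this helper (the 2x2 class-by-value contingency matrix below is made up; rows are classes, columns are feature values):
import numpy as np

cont = np.array([[8.0, 2.0],
                 [2.0, 8.0]])
print(_entropy(cont))  # weighted average entropy of the per-column class distributions, ~0.72 bits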
Then the second portion.
https://github.com/biolab/orange3/blob/master/Orange/preprocess/score.py#L305
class GainRatio(ClassificationScorer):
    """
    Information gain ratio is the ratio between information gain and
    the entropy of the feature's value distribution. The score was
    introduced in [Quinlan1986]_ to alleviate overestimation for
    multi-valued features. See `Wikipedia entry on gain ratio
    <http://en.wikipedia.org/wiki/Information_gain_ratio>`_.

    .. [Quinlan1986] J R Quinlan: Induction of Decision Trees, Machine Learning, 1986.
    """
    def from_contingency(self, cont, nan_adjustment):
        h_class = _entropy(np.sum(cont, axis=1))
        h_residual = _entropy(np.compress(np.sum(cont, axis=0), cont, axis=1))
        h_attribute = _entropy(np.sum(cont, axis=0))
        if h_attribute == 0:
            h_attribute = 1
        return nan_adjustment * (h_class - h_residual) / h_attribute
The actual scoring process happens at https://github.com/biolab/orange3/blob/master/Orange/preprocess/score.py#L218
