I am trying to adjust the following code so that it draws bootstrap samples with replacement, each the same size as the original data set:
rng = np.random.RandomState(1)


def cluster_stability(X, est, n_iter=20, random_state=None):
    """Estimate how stable a clustering is under bootstrap resampling.

    ``n_iter`` bootstrap samples of ``X.shape[0]`` rows are drawn with
    replacement, a fresh clone of ``est`` is fitted on each, and the mean
    adjusted Rand score over all pairs of runs (restricted to the rows both
    runs sampled) is returned.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Data to cluster, one sample per row.
    est : estimator
        Clustering estimator exposing ``fit`` and ``labels_``; it is cloned
        before every fit.
    n_iter : int
        Number of bootstrap rounds.
    random_state : None, int or numpy.random.RandomState
        ``None`` keeps using the shared module-level ``rng``; an int seeds a
        private generator; a ``RandomState`` instance is used as-is.

    Returns
    -------
    float
        Mean pairwise adjusted Rand score.
    """
    # The parameter used to be ignored; honour it, but fall back to the
    # module-level generator so the default behaviour is unchanged.
    if random_state is None:
        local_rng = rng
    elif isinstance(random_state, np.random.RandomState):
        local_rng = random_state
    else:
        local_rng = np.random.RandomState(random_state)

    n_samples = X.shape[0]
    labels = []
    indices = []
    for _ in range(n_iter):
        # draw bootstrap samples (with replacement), store indices
        sample_indices = local_rng.randint(0, n_samples, n_samples)
        indices.append(sample_indices)
        est = clone(est)
        if hasattr(est, "random_state"):
            # randomize estimator if possible (integer bound, not the float 1e5)
            est.random_state = local_rng.randint(10 ** 5)
        # DataFrames need positional .iloc; plain arrays are indexed directly
        if hasattr(X, "iloc"):
            X_bootstrap = X.iloc[sample_indices]
        else:
            X_bootstrap = X[sample_indices]
        est.fit(X_bootstrap)
        # store clustering outcome using original indices; -1 marks rows
        # that were not drawn in this round
        relabel = -np.ones(n_samples, dtype=int)
        relabel[sample_indices] = est.labels_
        labels.append(relabel)

    scores = []
    for l, i in zip(labels, indices):
        for k, j in zip(labels, indices):
            # the diagonal (a run compared with itself) is included as well
            in_both = np.intersect1d(i, j)
            scores.append(metrics.adjusted_rand_score(l[in_both], k[in_both]))
    return np.mean(scores)
Hence, sample_indices should be of size len(X). However, this adjustment crashes the code.
Thanks in advance!
Related
I implemented an algorithm that uses opencv kmeans to quantize the unique brightness values present in a greyscale image. Quantizing the unique values helped avoid biases towards image backgrounds which are typically all the same value.
However, I struggled to find a way to utilize this data to quantize a given input image.
I implemented a very naive solution, but it is unusably slow for the required input sizes (4000x4000):
# Naive per-pixel quantisation: for every pixel, pick the center with the
# smallest squared error, then record its rank and its value.
for x in range(W):
    for y in range(H):
        squared_errors = [(arr[y, x] - center) ** 2 for center in centers]
        center_id = np.argmin(squared_errors)
        ret_labels2D[y, x] = sortorder.index(center_id)
        ret_qimg[y, x] = centers[center_id]
Basically, I am simply adjusting each pixel to the predefined level with the minimum squared error.
Is there any way to do this faster? I was trying to process an image of size 4000x4000 and this implementation was completely unusable.
Full code:
def unique_quantize(arr, K, eps = 0.05, max_iter = 100, max_tries = 20):
    """Quantize a 2D array to K levels found by k-means on its unique values.

    :param arr: 2D numpy array of floats
    :param K: number of clusters passed to cv2.kmeans
    :param eps: epsilon of the k-means termination criteria
    :param max_iter: iteration cap of the k-means termination criteria
    :param max_tries: number of k-means attempts
    :return: (centers sorted ascending, per-pixel rank of the nearest center
        in that sorted order, per-pixel value of the nearest center)
    """
    n_rows, n_cols = arr.shape

    # k-means runs on the distinct values only, as an (n, 1) float32 column
    distinct = np.unique(arr).astype(float).reshape(-1, 1)
    samples = distinct.astype(np.float32)
    stop_rule = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, max_iter, eps)
    _, _, centers = cv2.kmeans(samples, K, None, stop_rule, max_tries,
                               cv2.KMEANS_RANDOM_CENTERS)
    centers = np.ravel(centers)

    # rank_order[r] is the original index of the r-th smallest center
    rank_order = list(np.argsort(centers))

    label_img = np.zeros((n_rows, n_cols), int)
    quant_img = np.zeros((n_rows, n_cols), float)
    # Deliberately naive: one Python-level nearest-center search per pixel
    for col in range(n_cols):
        for row in range(n_rows):
            pixel = arr[row, col]
            nearest = np.argmin([(pixel - c) ** 2 for c in centers])
            label_img[row, col] = rank_order.index(nearest)
            quant_img[row, col] = centers[nearest]
    return centers[rank_order], label_img, quant_img
EDIT: I looked at the input file again. The size was actually 12000x12000.
As your image is grayscale (presumably 8 bits), a lookup-table will be an efficient solution. It suffices to map all 256 gray-levels to the nearest center once for all, then use this as a conversion table. Even a 16 bits range (65536 entries) would be significantly accelerated.
I recently thought of a much better answer. This code is not extensively tested, but it worked for the use case in my project.
I made use of obscure fancy-indexing techniques in order to keep the entire algorithm contained within numpy functions.
def unique_quantize(arr, K, eps = 0.05, max_iter = 100, max_tries = 20):
    """Quantize a 2D array to K levels found by k-means on its unique values.

    :param arr: 2D numpy array of floats
    :param K: number of clusters passed to cv2.kmeans
    :param eps: epsilon of the k-means termination criteria
    :param max_iter: iteration cap of the k-means termination criteria
    :param max_tries: number of k-means attempts
    :return: tuple ``(centers, labels2D, qimg)``: the centers sorted
        ascending (float), the per-pixel rank of the nearest center in that
        sorted order (int, same shape as ``arr``), and the per-pixel value of
        the nearest center (float, same shape as ``arr``)
    :raises ValueError: if ``arr`` is not two-dimensional
    """
    arr = np.asarray(arr)
    if arr.ndim != 2:
        raise ValueError("arr must be a 2D array")

    # k-means runs on the distinct values only, as an (n, 1) float32 column
    unique_values = np.unique(arr).astype(float).reshape(-1, 1)
    Z = unique_values.astype(np.float32)
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, max_iter, eps)
    _, _, centers = cv2.kmeans(Z, K, None, criteria, max_tries,
                               cv2.KMEANS_RANDOM_CENTERS)
    centers = np.ravel(centers)

    # sortorder[r] = original index of the r-th smallest center;
    # inverse_sortorder[i] = rank of original center i (inverse permutation,
    # built in O(K) instead of repeated list.index lookups)
    sortorder = np.argsort(centers)
    inverse_sortorder = np.empty_like(sortorder)
    inverse_sortorder[sortorder] = np.arange(sortorder.size)

    # (K, H, W) stack of squared errors; note this needs K times the memory
    # of the image
    squared_errors = (arr[np.newaxis, :, :] - centers[:, np.newaxis, np.newaxis]) ** 2
    # No squeeze here: squeezing dropped length-1 image axes, so a 1xW or Hx1
    # input came back with the wrong shape.
    classification = np.argmin(squared_errors, axis=0)

    ret_center = centers[sortorder]
    ret_labels2D = inverse_sortorder[classification]
    ret_qimg = centers[classification]
    return np.array(ret_center, float), np.array(ret_labels2D, int), np.array(ret_qimg, float)
(Edited to include dataset and model code)
I'm training a Keras CNN on 2D matrices. I'm creating my own training dataset, in which each matrix cell has the shape [[list], int]. The cell's first item is a class label that I convert to a list (using tf.keras.utils.to_categorical):
# First cell slot: the categorical encoding of the type; rnd_type - 1 shifts
# the type id down by one before it is used as the class index.
cell[0] = to_categorical(
rnd_type-1, num_classes=num_types)
the second is a simple int:
# Second cell slot: a plain integer; random.randint includes both end points.
cell[1] = random.randint(0, max_val)
The dataset creation function:
def make_data(num_of_samples, num_types, max_height, grid_x, grid_y):
    """Generate random grids and one scalar target per grid.

    Every cell is the pair ``[cat, rnd_height]`` where ``cat`` is the
    ``to_categorical`` encoding of a random type in ``1..num_types`` and
    ``rnd_height`` is a random int in ``0..max_height``.

    Returns
    -------
    t : numpy.ndarray
        One target per sample.
    g : numpy.ndarray
        Object array holding the nested ``[cat, rnd_height]`` cells. Because
        each cell mixes an array with an int, this cannot be a regular
        numeric array.
    """
    grids_list = []
    target_list = []
    for _ in range(num_of_samples):
        # Reset per sample: the target used to keep accumulating across
        # samples, so it did not describe the grid it was stored with.
        target = 0
        grid = []
        for _i in range(grid_x):
            row = []
            for _j in range(grid_y):
                # random.randint includes both ends, so the old lower bound
                # of 0 produced class index -1 after the "- 1" shift.
                rnd_type = random.randint(1, num_types)
                # convert the type to a categorical list
                cat = to_categorical(rnd_type - 1, num_classes=num_types)
                rnd_height = random.randint(0, max_height)
                # inject the two values into the cell
                row.append([cat, rnd_height])
                # get some target value
                target += rnd_type * 5 + random.random() * 5
            grid.append(row)
        target_list.append(target)
        grids_list.append(grid)
    # make np arrs out of the lists; the cells are ragged, so the object
    # dtype has to be requested explicitly
    t = np.array(target_list)
    g = np.array(grids_list, dtype=object)
    return t, g
my model is created using model = models.create_cnn(grid_size, grid_size, 2, regress=True) in which (I assumed) the Input depth is 2.
The model creation code:
# Experiment configuration.
num_types = 20
max_height = 50
num_of_samples = 10
grid_size = 10
epochs = 5000
# get n results of X x Y grid with target (targets first, grids second)
targets_list, grids_list = datasets.make_data(
num_of_samples, num_types, max_height, grid_size, grid_size)
# NOTE(review): presumably sklearn's train_test_split, which returns a
# train/test pair per input in argument order — so the first two names below
# hold targets and the last two hold grids, despite the "_X" suffix. Confirm.
split = train_test_split(targets_list, grids_list,
test_size=0.25, random_state=42)
(train_attr_X, test_attr_X, train_grids_X, test_grids_X) = split
# find the largest value in the training set and use it to
# scale values to the range [0, 1]
max_target = train_attr_X.max()
train_attr_Y = train_attr_X / max_target
test_attr_Y = test_attr_X / max_target
# The third argument (2) is the input depth handed to the project's CNN builder.
model = models.create_cnn(grid_size, grid_size, 2, regress=True)
I however cannot train it given this error: ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
Answer my own question:
The model can only accept an int as the depth. Therefore, each cell of my matrix must be a flat list whose length is that int, not a nested [list, int] pair. For that reason, the way to merge the class data with the continuous field rnd_height is:
class => cat = to_categorical
cell = np.append(cat, [rnd_height])
This way, cat list is added with the rnd_height value.
The whole dataset function now look like this:
def make_data(num_of_samples, num_types, max_height, grid_x, grid_y):
    """Generate random grids with flat numeric cells and one target per grid.

    Every cell is a 1-D array: the ``to_categorical`` encoding of a random
    type in ``1..num_types`` with the random height (``0..max_height``)
    appended as the last entry.

    Returns
    -------
    g : numpy.ndarray
        The grids, one per sample.
    t : numpy.ndarray
        One target per sample.
    """
    grids_list = []
    target_list = []
    for _ in range(num_of_samples):
        # Reset per sample: the target used to keep accumulating across
        # samples, so it did not describe the grid it was stored with.
        target = 0
        grid = []
        for _i in range(grid_x):
            row = []
            for _j in range(grid_y):
                # random.randint includes both ends, so the old lower bound
                # of 0 produced class index -1 after the "- 1" shift.
                rnd_type = random.randint(1, num_types)
                cat = to_categorical(rnd_type - 1, num_classes=num_types)
                rnd_height = random.randint(0, max_height)
                # flat cell: category entries followed by the height
                row.append(np.append(cat, [rnd_height]))
                # simulate simple objective function
                if rnd_type < num_types / 5:
                    target += rnd_height * 5
            grid.append(row)
        target_list.append(target)
        grids_list.append(grid)
    t = np.array(target_list)
    g = np.array(grids_list)
    # return grids and targets
    return g, t
I want to read a grayscale image, say something with shape (248, 480, 3), and use each of its elements as the lam value for drawing a Poisson random value, producing a new data set with the same shape. I want to repeat this nscan times, add all the results together, and plot the total to get something similar to the image I started with. This code works, but it is extremely slow. Is there any way to make it faster?
import numpy as np
import matplotlib.pyplot as plt
# Load the source image as an array; it is passed to get_total below.
my_image = plt.imread('myimage.png')
def genP(data):
    """Draw one Poisson sample per element of ``data``.

    Each element of ``data`` is used as the rate (``lam``) of its own draw.

    Parameters
    ----------
    data : array-like of non-negative numbers, any shape.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``data``.
    """
    # np.random.poisson draws element-wise when lam is an array, so no
    # Python-level loop is needed and any number of dimensions works.
    return np.asarray(np.random.poisson(lam=data), dtype=float)
def get_total(data, nscan = 1):
    """Average ``nscan`` Poisson realisations of ``data`` and display the result.

    Parameters
    ----------
    data : array of Poisson rates (the image).
    nscan : int
        Number of realisations to draw; must be at least 1.

    Raises
    ------
    ValueError
        If ``nscan`` is smaller than 1.
    """
    if nscan < 1:
        raise ValueError("nscan must be a positive integer")
    # Start from zero: seeding the sum with an extra draw added nscan + 1
    # samples while still dividing by nscan.
    total = np.zeros(np.shape(data))
    for _ in range(nscan):
        total += genP(data)
    total = total / nscan
    plt.imshow(total)
    plt.show()
# Run the simulation on the loaded image with nscan = 100.
get_total(my_image, 100)
numpy.random.poisson can entirely replace your genP() function... This is basically guaranteed to be much faster.
If size is None (default), a single value is returned if lam is a scalar. Otherwise, np.array(lam).size samples are drawn
def get_total(data, nscan = 1):
    """Average ``nscan`` Poisson realisations of ``data`` and display the result.

    ``np.random.poisson`` draws one value per element when ``lam`` is an
    array, so each realisation is a single vectorised call.

    Parameters
    ----------
    data : array of Poisson rates (the image).
    nscan : int
        Number of realisations to draw; must be at least 1.

    Raises
    ------
    ValueError
        If ``nscan`` is smaller than 1.
    """
    if nscan < 1:
        raise ValueError("nscan must be a positive integer")
    # Start from zero: seeding the sum with an extra draw added nscan + 1
    # samples while still dividing by nscan.
    total = np.zeros(np.shape(data))
    for _ in range(nscan):
        total += np.random.poisson(lam=data)
    total = total / nscan
    plt.imshow(total)
    plt.show()
I'm trying to vectorize some code with numpy so that I can run it using multiprocessing, but I can't understand how numpy.apply_along_axis works. This is an example of the code, vectorized using map:
import numpy
from scipy import sparse
import multiprocessing
from matplotlib import pyplot
# First build a random 0/1 matrix and keep only its non-zero entries in
# coordinate form: row index -> x position, column index -> time, plus one
# random weight per entry.
matrix = numpy.random.randint(2, size = 100).astype(float).reshape(10,10)
x = numpy.nonzero(matrix)[0]
times = numpy.nonzero(matrix)[1]
weights = numpy.random.rand(x.size)
# Then define the array of y positions: 1, 2, ..., nStepsY.
nStepsY = 5
y = numpy.arange(1,nStepsY+1)
# Build one image row per y step from the x/times coordinates and the weights.
def mapIt(ithStep):
    """Return the image row (length 80) for y step ``ithStep``.

    Each point lands in column ``round(x - y[ithStep] * times) + 50`` and
    contributes its weight there; weights falling in the same column are
    summed. Reads the module-level ``x``, ``times``, ``weights`` and ``y``.

    Raises
    ------
    IndexError
        If a position falls at or beyond the last column.
    """
    ncolumns = 80
    yTimed = y[ithStep] * times
    positions = (numpy.round(x - yTimed) + 50).astype(int)
    # bincount with minlength yields the full row directly. Pairing the
    # non-zero bincount values with unique(positions) went wrong whenever an
    # occupied column summed to exactly 0.
    image = numpy.bincount(positions, weights, minlength=ncolumns)
    if image.size > ncolumns:
        raise IndexError("position out of range for %d columns" % ncolumns)
    return image
# Compute one row per y step, stack the rows into a 2D array and display it.
image = list(map(mapIt, range(nStepsY)))
image = numpy.array(image)
a = pyplot.imshow(image, aspect = 10)
Here the output plot
I tried to use numpy.apply_along_axis, but this function allows me to iterate only along the rows of image, while i need to iterate along the ithStep index too. E.g.:
# Now build an image using the x/y/times coordinates and the weights.
# `matrix` is rebound here to an all-zero (nrows, ncolumns) array that only
# serves as the input apply_along_axis iterates over.
nrows = nStepsY
ncolumns = 80
matrix = numpy.zeros(nrows*ncolumns).reshape(nrows,ncolumns)
def applyIt(image):
    """Build one image row; the incoming ``image`` row is discarded.

    NOTE(review): ``ithStep`` is not a parameter or local of this function,
    so it is looked up as a global and nothing here advances it — confirm
    where it is expected to come from.
    """
    image = numpy.zeros(ncolumns)
    shifted = (numpy.round(x - y[ithStep] * times) + 50).astype(int)
    summed = numpy.bincount(shifted, weights)
    # non-zero bin sums are written at the sorted unique positions
    image[numpy.unique(shifted)] = summed[numpy.nonzero(summed)]
    return image
# Call applyIt once per row of `matrix` (axis 1) and display the stacked result.
imageApplied = numpy.apply_along_axis(applyIt,1,matrix)
a = pyplot.imshow(imageApplied, aspect = 10)
It obviously returns only the first row, repeated nrows times, since nothing iterates ithStep:
And here the wrong plot
There is a way to iterate an index, or to use an index while numpy.apply_along_axis iterates?
Here the code with only matricial operations: it's quite faster than map or apply_along_axis but uses so much memory.
(in this function i use a trick with scipy.sparse, which works more intuitively than numpy arrays when you try to sum numbers on a same element)
def fullmatrix(nRows, nColumns):
    """Build the whole image in one shot with a sparse COO matrix.

    Row ``i`` corresponds to y step ``i`` (y value ``i + 1``); a point lands
    in column ``round(x - y * times) + 50``. Duplicate (row, column) entries
    are summed when the matrix is densified. Reads the module-level
    ``nStepsY``, ``x``, ``times`` and ``weights``.

    Parameters
    ----------
    nRows, nColumns : int
        Shape of the returned matrix.

    Returns
    -------
    numpy.matrix of shape (nRows, nColumns).

    Raises
    ------
    ValueError
        If a row or column index does not fit in the requested shape.
    """
    y = numpy.arange(1, nStepsY + 1)
    # (nStepsY, n_points) table of column positions, one row per y step
    positions = (numpy.round(x - numpy.outer(y, times)) + 50).astype(int)
    # Integer, 0-based row indices. The previous float, 1-based indices left
    # an empty row 0 and ignored the requested shape.
    rows = numpy.repeat(numpy.arange(nStepsY), x.size)
    data = numpy.tile(numpy.asarray(weights, dtype=float), nStepsY)
    matrix = sparse.coo_matrix(
        (data, (rows, numpy.ravel(positions))), shape=(nRows, nColumns)
    ).todense()
    return matrix
# Build the nStepsY x 80 image in a single call and display it.
image = fullmatrix(nStepsY, 80)
a = pyplot.imshow(image, aspect = 10)
This way is simpler and very fast! Thank you so much.
# Image dimensions: one row per y step, 80 columns.
nStepsY = 5
nRows = nStepsY
nColumns = 80
y = numpy.arange(1,nStepsY+1)
image = numpy.zeros((nRows, nColumns))
# NOTE(review): `positions` is not defined in this snippet; this line
# presumably relies on a `positions` array left over from earlier code — confirm.
fakeRow = numpy.zeros(positions.size)
def itermatrix(ithStep):
    """Return the image row (length ``nColumns``) for y step ``ithStep``.

    A one-row sparse COO matrix is used so that weights landing in the same
    column are summed when it is densified. Reads the module-level ``x``,
    ``times``, ``weights``, ``y`` and ``nColumns``.

    Raises
    ------
    ValueError
        If a position does not fit in ``nColumns`` columns.
    """
    yTimed = y[ithStep] * times
    positions = (numpy.round(x - yTimed) + 50).astype(int)
    # Integer row indices built locally, instead of the float module-level
    # `fakeRow` that had to match positions.size by accident.
    rows = numpy.zeros(positions.size, dtype=int)
    # An explicit shape replaces the manual zero padding up to nColumns.
    row_matrix = sparse.coo_matrix((weights, (rows, positions)), shape=(1, nColumns))
    return row_matrix.toarray().ravel()
# Fill the pre-allocated image one row at a time.
for i in numpy.arange(nStepsY):
    image[i] = itermatrix(i)
# or, without initialization of image:
imageMapped = numpy.array([itermatrix(step) for step in range(nStepsY)])
It feels like attempting to use map or apply_along_axis is obscuring the essentially iterative nature of the problem.
I rewrote your code as an explicit loop on y:
nStepsY = 5
y = numpy.arange(1, nStepsY + 1)
image = numpy.zeros((nStepsY, 80))
# One explicit pass per y value; each pass fills one image row.
for i, yi in enumerate(y):
    yTimed = yi * times
    positions = (numpy.round(x - yTimed) + 50).astype(int)
    # per-column weight sums, keeping only the non-zero ones
    values = numpy.bincount(positions, weights)
    values = values[values != 0]
    # sorted distinct columns, in the same order as the kept sums
    positions = numpy.unique(positions)
    image[i, positions] = values
a = pyplot.imshow(image, aspect=10)
pyplot.show()
Looking at the code, I think I could calculate positions for all y values making a (y.shape[0],times.shape[0]) array. But the rest, the bincount and unique still have to work row by row.
apply_along_axis when working with a 2d array, and axis=1 essentially does:
res = np.zeros_like(arr)
for i in range....:
res[i,:] = func1d(arr[i,:])
If the input array has more dimensions it constructs a more elaborate indexing object [i,j,k,:]. And it can handle cases where func1d returns a different size array than the input. But in any case it is just a generalized iteration tool.
Moving the initial positions creation outside the loop:
# All positions at once: row i holds the column positions for y[i].
yTimed = y[:, numpy.newaxis] * times
positions = (numpy.round(x - yTimed) + 50).astype(int)
image = numpy.zeros((positions.shape[0], 80))
# bincount/unique still run once per row
for i, pos in enumerate(positions):
    values = numpy.bincount(pos, weights)
    values = values[values != 0]
    pos = numpy.unique(pos)
    image[i, pos] = values
Now I can cast this as an apply_along_axis problem, with an applyIt that takes a positions vector (with all the yTimed information) rather than blank image vector.
def applyIt(pos, size, weights):
    """Return a length-``size`` row with ``weights`` summed at columns ``pos``.

    Parameters
    ----------
    pos : 1-D array of non-negative ints
        Column index of each point.
    size : int
        Length of the returned row.
    weights : 1-D array
        One weight per entry of ``pos``.

    Returns
    -------
    numpy.ndarray of length ``size``.

    Raises
    ------
    IndexError
        If any position is ``>= size``.
    """
    # bincount with minlength yields the full row directly. Pairing the
    # non-zero bincount values with unique(pos) went wrong whenever an
    # occupied column summed to exactly 0.
    acolumn = numpy.bincount(pos, weights, minlength=size)
    if acolumn.size > size:
        raise IndexError("position out of range for a row of length %d" % size)
    return acolumn
# Apply applyIt to every row of `positions`; 80 and weights are forwarded to
# it as its extra arguments.
image = numpy.apply_along_axis(applyIt, 1, positions, 80, weights)
Timing wise I expect it's a bit slower than my explicit iteration. It has to do more setup work, including a test call applyIt(positions[0,:],...) to determine the size of its return array (i.e image has different shape than positions.)
def csrmatrix(y, times, x, weights):
    """Build the image as a CSR matrix for the given y values.

    A point lands in row ``y`` (the y value itself is the row index) and
    column ``round(x - y * times) + 50``; entries sharing a cell are summed.
    The shape is inferred from the largest row and column index.

    Parameters
    ----------
    y : 1-D array of y values (used directly as row indices).
    times, x, weights : 1-D arrays of equal length, one entry per point.

    Returns
    -------
    scipy.sparse.csr_matrix
    """
    y = numpy.asarray(y)
    n = y.shape[0]
    # (n, n_points) table of column positions, one row per y value
    positions = (numpy.round(x - numpy.outer(y, times)) + 50).astype(int)
    # Index arrays must be integers; the outer-product construction used
    # before produced float row indices.
    rows = numpy.repeat(y.astype(int), positions.shape[1])
    data = numpy.tile(numpy.asarray(weights, dtype=float), n)
    matrix = sparse.csr_matrix((data, (rows, numpy.ravel(positions))))
    return matrix
# one call: all y values at once
image = csrmatrix(y, times, x, weights)
# iterative call: one matrix per y value, collected in a list
alist = []
for yi in numpy.arange(1, nStepsY + 1):
    single_y = numpy.array([yi])
    alist.append(csrmatrix(single_y, times, x, weights))
def mystack(alist):
    """Merge sparse matrices into one CSR matrix without offsetting indices.

    The (row, col, data) triplets of every matrix are pooled as they are, so
    entries that share a cell are summed by the CSR constructor. The three
    pooled lengths are printed before the result is returned.
    """
    coo_parts = [A.tocoo() for A in alist]
    row = [r for part in coo_parts for r in part.row]
    col = [c for part in coo_parts for c in part.col]
    data = [v for part in coo_parts for v in part.data]
    print(len(row),len(col),len(data))
    return sparse.csr_matrix((data, (row, col)))
# Combine the per-y matrices into a single sparse image.
vimage = mystack(alist)
I have the geodesic distances of a graph in .csv format.
I want to reduce them to 2D using Multidimensional Scaling (MDS) and cluster the result using k-medoids.
This is my code:
# coding: utf-8
import numpy as np
import csv
from sklearn import manifold
from sklearn.metrics.pairwise import pairwise_distances
import kmedoidss
# Process the CSV into a 2D list, omitting the header row. The file is opened
# in a `with` block so the handle is closed once reading is done.
data, labels = [], []
with open('data.csv', 'r', newline='') as csv_file:
    rawdata = csv.reader(csv_file)
    next(rawdata, None)  # skip the header row
    for row in rawdata:
        # Column 0 is taken as the row label; reading it from row[1]
        # duplicated the first distance value, which is also part of the data.
        labels.append(row[0])
        data.append([int(i) for i in row[1:]])
#print data
# Now run very basic MDS on the precomputed dissimilarities to get 2D coordinates.
# Documentation here: http://scikit-learn.org/dev/modules/generated/sklearn.manifold.MDS.html#sklearn.manifold.MDS
mds = manifold.MDS(n_components=2, dissimilarity="precomputed")
pos = mds.fit_transform(data)
# Euclidean distance matrix between the embedded 2D points
D = pairwise_distances(pos, metric='euclidean')
# split into 3 clusters: M holds the medoid indices, C maps cluster id -> member indices
M, C = kmedoidss.kMedoids(D, 3)
# Initial data: every embedded point with its 1-based position.
print('Data awal : ')
for index, point_idx in enumerate(pos, 1):
    print(index, point_idx)

# Medoids: coordinates plus 1-based index of each medoid.
print('\n medoids:')
for point_idx in M:
    medoid_line = '{} index ke - {} '.format(pos[point_idx], point_idx+1)
    print(medoid_line)
print('')

# Cluster membership: one line per point, grouped by cluster id.
print('clustering result:')
for label, members in C.items():
    for point_idx in members:
        member_line = 'cluster- {}:{} index- {}'.format(label, pos[point_idx], point_idx+1)
        print(member_line)
kmedoidss.py
import numpy as np
import random
def kMedoids(D, k, tmax=100):
    """Cluster points into ``k`` groups with k-medoids on a distance matrix.

    Parameters
    ----------
    D : numpy.ndarray
        Pairwise distance matrix; columns are indexed by point.
    k : int
        Number of clusters.
    tmax : int
        Maximum number of update iterations.

    Returns
    -------
    M : numpy.ndarray
        Indices of the ``k`` medoids; ``M[kappa]`` is the medoid of cluster
        ``kappa``.
    C : dict
        Maps each cluster id ``0..k-1`` to the array of its member indices.

    Raises
    ------
    ValueError
        If ``k`` is larger than the number of points (raised by
        ``numpy.random.choice``).
    """
    # determine dimensions of distance matrix D
    m, n = D.shape
    # randomly initialize k *distinct* medoid indices; sampling with
    # replacement could pick the same point twice and leave a cluster empty
    M = np.sort(np.random.choice(n, k, replace=False))
    # create a copy of the array of medoid indices
    Mnew = np.copy(M)
    # initialize a dictionary to represent clusters
    C = {}
    for t in range(tmax):
        # determine clusters, i.e. arrays of data indices
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]
        # update cluster medoids
        for kappa in range(k):
            if C[kappa].size == 0:
                # nothing to average over; keep the previous medoid
                continue
            J = np.mean(D[np.ix_(C[kappa], C[kappa])], axis=1)
            j = np.argmin(J)
            Mnew[kappa] = C[kappa][j]
        # The former `np.sort(Mnew)` call discarded its result, so the
        # medoids were never reordered; that behaviour is kept and the dead
        # call removed.
        # check for convergence
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # iteration budget exhausted: final update of cluster memberships
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]
    # return results
    return M, C
How can I visualize the clustering result as a graph, with each node colored according to its cluster?
You don't need MDS to run kMedoids - just run it on the original distance matrix (kMedoids can also be made to work on a similarity matrix by switching min for max).
Use MDS only for plotting.
The usual approach for visualization is to use a loop over clusters, and plot each cluster in a different color; or to use a color predicate. There are many examples in the scikit-learn documentation.
http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html
# One colour code per cluster id, cycling through seven single-letter codes.
colors = np.array(list('bgrcmykbgrcmykbgrcmykbgrcmyk'))
colors = np.hstack([colors] * 20)
# The builtin int replaces the np.int alias, which recent NumPy releases no
# longer provide.
y_pred = np.asarray(labels).astype(int)
plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
where X is your pos variable (the 2D MDS result) and labels is an integer cluster number for every point. Since you don't have your data in this "labels" layout, consider using a loop instead:
# Draw each cluster's points in its own colour, then show the figure once.
for label, pts in C.items():
    cluster_xy = pos[pts]
    plt.scatter(cluster_xy[:, 0], cluster_xy[:, 1], color=colors[label])
plt.show()