binning data live into a 2D array - python

I am calculating two distances for every pair of points and binning them in intervals of 0.1 in a 2D array. Currently I am doing this, but it takes a long time for a large number of points:
import numpy as np
from scipy.spatial import distance as d

dat = np.random.rand(100, 3)
dd2d = np.zeros((10, 10))

while len(dat) > 0:
    i = len(dat) - 1
    while i > 0:
        dist0 = d.euclidean(dat[0], dat[i])
        dist1 = d.cosine(dat[0], dat[i])
        ind0 = int(dist0 / 0.1)
        ind1 = int(dist1 / 0.1)
        if ind0 > 9 or ind1 > 9:
            pass
        else:
            dd2d[ind0, ind1] += 1
        i -= 1
    dat = np.delete(dat, 0, axis=0)
    print(len(dat))
What is the most efficient way of doing this?
Also, how can I convert the while loops in my code into for loops, so that I can add a progress bar (tqdm) to keep track of the run time?

If you are already importing scipy.spatial.distance, might as well use pdist. And then you're just making a 2d histogram. Use np.histogram2d.
def binDists2d(dat, f1='euclidean', f2='cosine'):
    dist0 = d.pdist(dat, f1)
    dist1 = d.pdist(dat, f2)
    rng = np.array([[0, 1], [0, 1]])
    return np.histogram2d(dist0, dist1, bins=10, range=rng)
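Since binDists2d returns the np.histogram2d result directly, you can unpack the counts and the bin edges. A quick usage sketch, with random data as in the question:

dat = np.random.rand(100, 3)
hist, xedges, yedges = binDists2d(dat)
print(hist.sum())  # how many of the 4950 pairs landed inside the [0, 1] x [0, 1] range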
pdist only returns the upper triangular elements. If you want to do this manually, use np.triu_indices to generate the index pairs; this also lets you compute the distances yourself if scipy is unavailable.
def cosdist(u, v):
    # row-wise cosine distance for 2D arrays of vectors
    return 1 - (u * v).sum(axis=-1) / (np.linalg.norm(u, axis=-1) * np.linalg.norm(v, axis=-1))

def binDists2d(dat, f0=lambda u, v: np.linalg.norm(u - v, axis=-1), f1=cosdist):
    i, j = np.triu_indices(dat.shape[0], 1)
    dist0 = f0(dat[i], dat[j])
    dist1 = f1(dat[i], dat[j])
    rng = np.array([[0, 1], [0, 1]])
    return np.histogram2d(dist0, dist1, bins=10, range=rng)
EDIT: Less memory-hungry version:
def binDists2d(dat, f0, f1, n=1, bins=10, rng=np.array([[0, 1], [0, 1]])):
    i_, j_ = np.triu_indices(dat.shape[0], 1)
    out = np.zeros((bins, bins))
    i_, j_ = np.array_split(i_, n), np.array_split(j_, n)
    for k, (i, j) in enumerate(zip(i_, j_)):
        dist0 = f0(dat[i], dat[j])
        dist1 = f1(dat[i], dat[j])
        # histogram2d returns (counts, xedges, yedges); accumulate the counts
        out += np.histogram2d(dist0, dist1, bins=bins, range=rng)[0]
        print(str(k + 1) + " of " + str(n) + " completed")
    return out
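As for the progress-bar part of the question: the chunked version already uses a for loop, so instead of printing you can wrap the iterator with tqdm. A minimal sketch, assuming tqdm is installed:

from tqdm import tqdm

def binDists2d_tqdm(dat, f0, f1, n=10, bins=10, rng=np.array([[0, 1], [0, 1]])):
    i_, j_ = np.triu_indices(dat.shape[0], 1)
    i_, j_ = np.array_split(i_, n), np.array_split(j_, n)
    out = np.zeros((bins, bins))
    # tqdm wraps any iterable and renders a progress bar; zip has no len, so pass total
    for i, j in tqdm(zip(i_, j_), total=n):
        out += np.histogram2d(f0(dat[i], dat[j]), f1(dat[i], dat[j]), bins=bins, range=rng)[0]
    return out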

Related

Finding a set of K points from N points such that the minimum distance between points is maximized?

Assume there is a set A of N points. I'd like to select K points to form a subset A' of A such that the minimum distance between the points of A' is maximized over all subsets.
The distance between two points is defined as y(x, x'), x∈A, x'∈A.
The minimum distance is defined as k(A') = min_{x∈A', x'∈A', x≠x'}(y(x, x')).
C = argmax_{A'}(k(A')) where A'⊂A, |A'| = K
N = 100
K = 10
I want to solve this and obtain the set C. Is there an efficient algorithm?
A sample brute-force solution is shown below; it is quite heavy, on the order of O(N^K).
import numpy as np
from matplotlib import pyplot as plt
from itertools import combinations

N = 30
K = 5

# generate sample points
mean = [0, 0]
cov = [[1, 0], [0, 1]]
rs = np.random.multivariate_normal(mean, cov, N)

def calc_min_dist(arr):
    dist_arr = ((arr[None, :, :] - arr[:, None, :])**2).sum(axis=2)**.5
    min_dist = min(dist_arr[dist_arr > 0])
    return min_dist

def min_points(arr, K):
    max_value = 0
    max_index = list(range(K))
    index = list(range(len(arr)))
    for sub_index in combinations(index, K):
        sub_arr = arr[list(sub_index)]
        dist = calc_min_dist(sub_arr)
        if max_value < dist:
            max_value = dist
            max_index = sub_index
    return max_value, max_index

# solve
max_value, sub_index = min_points(rs, K)

# plot result
fig, ax = plt.subplots()
ax.scatter(*rs.T)
ax.scatter(*rs[list(sub_index)].T)
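The exhaustive search above is exponential, and the max-min dispersion problem is NP-hard in general, so for larger N a heuristic is the practical route. Here is a sketch (not from the original post) of the classic greedy farthest-point strategy, a well-known approximation for this objective:

def greedy_max_min(arr, K):
    # Greedy farthest-point selection: heuristic only, no optimality guarantee.
    dist = ((arr[None, :, :] - arr[:, None, :])**2).sum(axis=2)**.5
    chosen = [0]  # could also restart from every point and keep the best run
    for _ in range(K - 1):
        # distance from each point to its nearest already-chosen point
        d_to_chosen = dist[:, chosen].min(axis=1)
        chosen.append(int(d_to_chosen.argmax()))
    return chosen

# e.g. sub_index = greedy_max_min(rs, K)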

How do I create a better resolution on my plot with linspace?

I need to create a higher-resolution version of my plot with the linspace function, but I can't figure out how to implement it in my code. Maybe someone has a better understanding of this and can help me.
import numpy as np
import matplotlib.pyplot as plt

N = np.array([1, 2, 3, 4])
c = np.array([1359, 2136.6, 2617.74, 2630.16])
ct = c/1000
ct0 = 103.8348/1000
cmax = 2630.16/1000
n = N.size
A = np.zeros(n)
k = np.zeros(n)

for j in range(0, n):
    A[j] = (ct[j] - ct0)/(cmax - ct0)

for j in range(0, n):
    if j < 3:
        k[j] = -(np.log(1 - A[j])) / N[j]
    else:
        k[j] = 1

MWk = np.mean(k)
Amod = np.zeros(n)
for j in range(0, n):
    Amod[j] = 1 - np.exp((-N[j]) * MWk)

print(ct)
print(A)
print(k)

plt.xlabel("N")
plt.ylabel("Aufschlussgrad")
plt.plot(N, A, "g", label="Aufschlussgrad")
plt.plot(N, Amod, "k", label="Modelfunktion")
plt.title("Hochdruckhomogenisator")
plt.legend()
plt.show()
There is no need to interpolate Amod, since it is a function you defined. On the other hand, it is necessary to perform an interpolation (either on A or on the original data c) in order to add more points to the graph. With only 4 or 5 points, the interpolation will not be very meaningful. In this case I choose to interpolate A.
The code was not taking advantage of numpy's arrays, so I pythonized it a little (it looked like C).
import numpy as np
import matplotlib.pyplot as plt

def Amod(x, MWk):
    return 1 - np.exp((-x) * MWk)

def k(x, A):
    rv = -np.log(1 - A) / x
    rv[np.nonzero(A == 1)] = 1
    return rv

# No real changes here: only a little 'pythonization'
N = np.array([1, 2, 3, 4])
c = np.array([1359, 2136.6, 2617.74, 2630.16])
ct = c/1000
ct0 = 103.8348/1000
cmax = 2630.16/1000
n = N.size
A = (ct - ct0) / (cmax - ct0)
MWk = np.mean(k(N, A))

print(ct)
print(A)
print(k(N, A))  # k is now a function, so call it to see the values

# we now interpolate A
# numpy's interpolation is linear... it is not useful for this case
from scipy.interpolate import interp1d
# interp1d returns a function that interpolates the data we provide
# in this case, I choose quadratic interpolation
A_interp_fun = interp1d(N, A, 'quadratic')

# Let's increase the number of points
new_x = np.linspace(N[0], N[-1])
A_xtra = A_interp_fun(new_x)

plt.xlabel("N")
plt.ylabel("Aufschlussgrad")
plt.scatter(N, A, label="Aufschlussgrad - data")
plt.plot(new_x, A_xtra, "g", label="Aufschlussgrad - interpolation")
plt.scatter(N, Amod(N, MWk), label="Modelfunktion - data")
plt.plot(new_x, Amod(new_x, MWk), "k", label="Modelfunktion - function")
plt.title("Hochdruckhomogenisator")
plt.legend()
plt.show()

Generating a random line on a 2d array in python

I've got a 2D array of zeros, 250 by 250, and I want to generate a straight random line of a specific length (I haven't yet decided the length). Since it's a line, the values that turn from zero to one must be connected in some way, vertically, horizontally, or diagonally, and it also has to be straight. How could I do this? I'm quite stuck with this problem; any help would be appreciated.
We can do:
import numpy as np

SIZE = 250
arr = np.zeros((SIZE, SIZE))

# candidate slopes: nonzero integers and their reciprocals (0 excluded to avoid division by zero)
M_POS = np.arange(-SIZE, SIZE)
M_POS = M_POS[M_POS != 0]
M_POS = np.r_[M_POS, 1 / M_POS]
M = np.random.choice(M_POS, 1)[0]                   # slope
N = np.random.choice(np.arange(-SIZE, SIZE), 1)[0]  # intercept
L = 50                                              # number of points on the line

P0 = np.array([0, N])
X_Y = np.array([1, 1 / M]) if abs(M) < 1 else np.array([1, M])
draw_in = np.add(np.repeat([P0], L, axis=0),
                 np.repeat([X_Y], L, axis=0) * np.arange(L)[:, np.newaxis]).astype(int)
# keep only the points that fall inside the array bounds
draw_in = draw_in[((draw_in < SIZE) & (draw_in >= 0)).all(axis=1)]
arr[draw_in[:, 0], draw_in[:, 1]] = 1
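To sanity-check the result, you can render the array; a quick sketch using matplotlib:

import matplotlib.pyplot as plt
plt.imshow(arr, origin="lower")  # the nonzero cells trace the generated line
plt.show()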

Find closest k points for every point in row of numpy array

I have a numpy array X of size 1000 x 1000 where each element is a real number. I want to find the 5 closest points for every point in each row of this array. Here the distance metric can just be abs(x-y). I have tried to do
from sklearn.neighbors import NearestNeighbors

for i in range(X.shape[0]):
    knn = NearestNeighbors(n_neighbors=5)
    knn.fit(X[i])
    for j in range(X.shape[1]):
        d = knn.kneighbors(X[i, j], return_distance=False)
However, this does not work for me, and I am not sure how efficient it is. Is there a way around this? I have seen a lot of methods for comparing vectors but not any for comparing single elements. I know that I could loop over each row and find the k smallest values, but this would be computationally expensive. Could a KD tree work for this? I have tried a method similar to
Finding index of nearest point in numpy arrays of x and y coordinates
However, I cannot get this to work. Is there some numpy function I don't know about that could accomplish this?
Construct a kdtree with scipy.spatial.cKDTree for each row of your data.
import numpy as np
import scipy.spatial

def nearest_neighbors(arr, k):
    # query ranks 2..k+1, skipping rank 1 (each point is its own nearest neighbor)
    k_lst = list(range(2, k + 2))  # e.g. [2, 3, 4, 5, 6] for k = 5
    neighbors = []
    for row in arr:
        # stack the data so each element is in its own row
        data = np.vstack(row)
        # construct a kd-tree
        tree = scipy.spatial.cKDTree(data)
        # find the k nearest neighbors for each element of data
        dd, ii = tree.query(data, k=k_lst)
        # apply an index filter on data to get the nearest neighbor elements
        closest = data[ii].reshape(-1, k)
        neighbors.append(closest)
    return np.stack(neighbors)

N = 1000
k = 5
A = np.random.random((N, N))
nearest_neighbors(A, k)
I'm not really sure how you want the final results. But this definitely gets you what you need.
np.random.seed([3,1415])
X = np.random.rand(1000, 1000)
Grab upper triangle indices to track every combination of points per row
x1, x2 = np.triu_indices(X.shape[1], 1)
generate an array of all distances
d = np.abs(X[:, x1] - X[:, x2])
Find the closest 5 for every row
tpos = np.argpartition(d, 5)[:, :5]
Then x1[tpos] gives the row-wise positions of the first point in the closest pairs while x2[tpos] gives the second position of the closest pairs.
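For instance, to collect those pairs and their distances explicitly, a small sketch building on the arrays above:

pairs = np.stack([x1[tpos], x2[tpos]], axis=-1)  # shape (1000, 5, 2): index pairs per row
closest_d = np.take_along_axis(d, tpos, axis=1)  # shape (1000, 5): the distances themselves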
Here is an argsorting solution that strives to take advantage of the simple metric:
import numpy as np
from numpy.lib import stride_tricks
from timeit import timeit

def nn(A, k):
    out = np.zeros((A.shape[0], A.shape[1] + 2*k), dtype=int)
    out[:, k:-k] = np.argsort(A, axis=-1)
    out[:, :k] = out[:, -k-1, None]
    out[:, -k:] = out[:, k, None]
    strd = stride_tricks.as_strided(
        out, strides=out.strides + (out.strides[-1],), shape=A.shape + (2*k+1,))
    delta = A[np.arange(A.shape[0])[:, None, None], strd]
    delta -= delta[..., k, None]
    delta = np.abs(delta)
    s = np.argpartition(delta, (0, k), axis=-1)[..., 1:k+1]
    inds = tuple(np.ogrid[:strd.shape[0], :strd.shape[1], :0][:2])
    res = np.empty(A.shape + (k,), dtype=int)
    res[np.arange(strd.shape[0])[:, None, None], out[:, k:-k, None],
        np.arange(k)[None, None, :]] = strd[inds + (s,)]
    return res

N = 1000
k = 5
r = 10
A = np.random.random((N, N))
res = nn(A, k)

# crude test
print(np.abs(A[np.arange(N)[:, None, None], res] - A[..., None]).mean())

# timings
print(timeit(lambda: nn(A, k), number=r) / r)
Output:
# 0.00150537172454
# 0.4567880852999224

3D distance vectorization

I need help vectorizing this code. Right now, with N=100, it takes a minute or so to run. I would like to speed that up. I have done something like this for a double loop, but never with a 3D loop, and I am having difficulties.
import numpy as np

N = 100
n = 12
r = np.sqrt(2)
x = np.arange(-N, N+1)
y = np.arange(-N, N+1)
z = np.arange(-N, N+1)
C = 0
for i in x:
    for j in y:
        for k in z:
            if (i+j+k) % 2 == 0 and (i*i+j*j+k*k != 0):
                p = np.sqrt(i*i+j*j+k*k)
                p = p/r
                q = (1/p)**n
                C += q
print('\n')
print(C)
The meshgrid/where/indexing solution is already extremely fast. I made it about 65 % faster. This is not too much, but I explain it anyway, step by step:
It was easiest for me to approach this problem with all 3D vectors in the grid being columns in one large 2D 3 x M array. meshgrid is the right tool for creating all the combinations (note that numpy version >= 1.7 is required for a 3D meshgrid), and vstack + reshape bring the data into the desired form. Example:
>>> np.vstack(np.meshgrid(*[np.arange(0, 2)]*3)).reshape(3,-1)
array([[0, 0, 1, 1, 0, 0, 1, 1],
[0, 0, 0, 0, 1, 1, 1, 1],
[0, 1, 0, 1, 0, 1, 0, 1]])
Each column is one 3D vector. Each of these eight vectors represents one corner of a 1x1x1 cube (a 3D grid with step size 1 and length 1 in all dimensions).
Let's call this array vectors (it contains all 3D vectors representing all points in the grid). Then, prepare a bool mask for selecting those vectors fulfilling your mod2 criterion:
mod2bool = np.sum(vectors, axis=0) % 2 == 0
np.sum(vectors, axis=0) creates a 1 x M array containing the element sum for each column vector. Hence, mod2bool is a 1 x M array with a bool value for each column vector. Now use this bool mask:
vectorsubset = vectors[:,mod2bool]
This selects all rows (:) and uses boolean indexing for filtering the columns; both are fast operations in numpy. Calculate the lengths of the remaining vectors using the native numpy approach:
lengths = np.sqrt(np.sum(vectorsubset**2, axis=0))
This is quite fast -- however, scipy.stats.ss and bottleneck.ss can perform the squared sum calculation even faster than this.
Transform the lengths using your instructions:
with np.errstate(divide='ignore'):
    p = (r/lengths)**n
This involves division by zero (for the zero-length vector), resulting in Infs in the output array. This is entirely fine. We use numpy's errstate context manager to make sure these zero divisions do not throw an exception or a runtime warning.
Now sum up the finite elements (ignore the infs) and return the sum:
return np.sum(p[np.isfinite(p)])
I have implemented this method twice below: once exactly as just explained, and once using bottleneck's ss and nansum functions. I have also added your method for comparison, plus a modified version of your method that skips the np.where((x*x+y*y+z*z)!=0) indexing and instead creates Infs, finally summing the isfinite way.
import sys
import numpy as np
import bottleneck as bn

N = 100
n = 12
r = np.sqrt(2)
x, y, z = np.meshgrid(*[np.arange(-N, N+1)]*3)
gridvectors = np.vstack((x, y, z)).reshape(3, -1)

def measure_time(func):
    import time
    def modified_func(*args, **kwargs):
        t0 = time.time()
        result = func(*args, **kwargs)
        duration = time.time() - t0
        print("%s duration: %.3f s" % (func.__name__, duration))
        return result
    return modified_func

@measure_time
def method_columnvecs(vectors):
    mod2bool = np.sum(vectors, axis=0) % 2 == 0
    vectorsubset = vectors[:, mod2bool]
    lengths = np.sqrt(np.sum(vectorsubset**2, axis=0))
    with np.errstate(divide='ignore'):
        p = (r/lengths)**n
    return np.sum(p[np.isfinite(p)])

@measure_time
def method_columnvecs_opt(vectors):
    # On my system, bn.nansum is even slightly faster than np.sum.
    mod2bool = bn.nansum(vectors, axis=0) % 2 == 0
    # Use ss from bottleneck or scipy.stats (axis=0 is default).
    lengths = np.sqrt(bn.ss(vectors[:, mod2bool]))
    with np.errstate(divide='ignore'):
        p = (r/lengths)**n
    return bn.nansum(p[np.isfinite(p)])

@measure_time
def method_original(x, y, z):
    ind = np.where((x+y+z) % 2 == 0)
    x = x[ind]
    y = y[ind]
    z = z[ind]
    ind = np.where((x*x+y*y+z*z) != 0)
    x = x[ind]
    y = y[ind]
    z = z[ind]
    p = np.sqrt(x*x+y*y+z*z)/r
    return np.sum((1/p)**n)

@measure_time
def method_original_finitesum(x, y, z):
    ind = np.where((x+y+z) % 2 == 0)
    x = x[ind]
    y = y[ind]
    z = z[ind]
    lengths = np.sqrt(x*x+y*y+z*z)
    with np.errstate(divide='ignore'):
        p = (r/lengths)**n
    return np.sum(p[np.isfinite(p)])

print(method_columnvecs(gridvectors))
print(method_columnvecs_opt(gridvectors))
print(method_original(x, y, z))
print(method_original_finitesum(x, y, z))
This is the output:
$ python test.py
method_columnvecs duration: 1.295 s
12.1318801965
method_columnvecs_opt duration: 1.162 s
12.1318801965
method_original duration: 1.936 s
12.1318801965
method_original_finitesum duration: 1.714 s
12.1318801965
All methods produce the same result. Your method becomes a bit faster when doing the isfinite style sum. My methods are faster, but I would say that this is an exercise of academic nature rather than an important improvement :-)
I have one question left: you were saying that for N=3, the calculation should produce a 12. Even yours doesn't do this. All methods above produce 12.1317530867 for N=3. Is this expected?
Thanks to @Bill, I was able to get this to work. It is very fast now. Perhaps it could be done better, especially with the two masks used to replace the two conditions that I originally handled with for loops.
from __future__ import division
import numpy as np

N = 100
n = 12
r = np.sqrt(2)
x, y, z = np.meshgrid(*[np.arange(-N, N+1)]*3)
ind = np.where((x+y+z) % 2 == 0)
x = x[ind]
y = y[ind]
z = z[ind]
ind = np.where((x*x+y*y+z*z) != 0)
x = x[ind]
y = y[ind]
z = z[ind]
p = np.sqrt(x*x+y*y+z*z)/r
ans = (1/p)**n
ans = np.sum(ans)
print('ans')
print(ans)
