Related
I am writing a code to simulate Continuous Time Random Walk phenomena with a function in python. My code works correctly so far, but I would like to exploit the indexing abilities of NumPy arrays and improve the speed. In the code, below I am generating an ensemble of trajectories, so I have to loop over each of them while generating it. Is it somehow possible to index the NumPy array x in such a way that I can get rid of the loop on Nens (the for loop in the below code snippet)
for k in range(Nens):
#start building the trajectory
stop = 0
i = 0
while (stop < Nt):
#reset the the stop time
start = stop
#increment the stop till the next waiting time
stop += int(trand[i,k]) # A 2D numpy array
#update the trajectory
x[start:stop,k] = x[start-1,k] \ #x is also a 2D array
+ (1-int(abs((x[start-1,k]+xrand[i,k])/(xmax))))* xrand[i,k] \
- int(abs((x[start-1,k]+xrand[i,k])/(xmax)))*np.sign(x[start-1,k]/xrand[i,k])* xrand[i,k]
i += 1
print i
return T, x
A plausible method that I can look to is as follows.
In this code, start and stop are scalar integers. However, I would like to index this in a way in which both start and stop are 1D NumPy integer arrays.
But I have seen that I can only use a single scalar start/stop pair to slice a NumPy array; slicing with an array (or tuple) of start indices and a matching array of stop indices is not possible.
EDIT 1 (MWE):
The following is the function that I have written, which produces random walk trajectory if given the appropriate parameters,
def ctrw_ens2d(sig,tau,sig2,tau2,alpha,xmax,Nens,Nt=1000,dt=1.0):
    """Build an ensemble of 2-D continuous-time random-walk trajectories.

    Two piecewise-constant step processes are generated per ensemble member
    and summed:

    * process 1 uses waiting times drawn from ``exponential(tau)`` and jumps
      from ``normal(0, sig)``; each jump is modified by an integer factor
      ``int(|(r1 + rr) / Xdist[k]|)`` where ``r1`` is the current distance
      from the origin, ``rr`` the jump length and ``Xdist[k]`` a per-member
      length drawn from a clipped log-normal distribution;
    * process 2 uses waiting times from ``exponential(tau2)`` and unmodified
      jumps from ``normal(0, sig2)``.

    Waiting times are truncated to whole time steps with ``int``.
    ``alpha`` and ``xmax`` are accepted but not used by this function.

    Returns
    -------
    T : ndarray, shape (Nt,)
        Time axis ``arange(Nt) * dt``.
    x, y : ndarray, shape (Nt, Nens)
        Summed x and y coordinates of both processes.
    """
    T = np.arange(0, Nt, 1) * dt

    # Draw all random numbers up front (2*Nt per member is assumed to be
    # more than enough waiting times to fill Nt steps).
    trand = np.random.exponential(tau, (2 * Nt, Nens, 1))
    xrand = np.random.normal(0.0, sig, (2 * Nt, Nens, 2))
    Xdist = np.random.lognormal(-1, 0.9, (Nens))
    Xdist = np.clip(Xdist, 2 * sig, 12 * sig)
    trand2 = np.random.exponential(tau2, (2 * Nt, Nens, 1))
    xrand2 = np.random.normal(0.0, sig2, (2 * Nt, Nens, 2))

    x1 = np.zeros((Nt, Nens))
    x2 = np.zeros((Nt, Nens))
    y1 = np.zeros((Nt, Nens))
    y2 = np.zeros((Nt, Nens))

    for k in range(Nens):
        limit = Xdist[k]

        # Process 1: jumps modified by the integer factor described above.
        stop = 0
        i = 0
        while stop < Nt:
            start = stop
            stop += int(trand[i, k, 0])
            # Values held just before this segment (index -1 wraps to the
            # last row on the very first segment, exactly as written).
            prev_x = x1[start - 1, k]
            prev_y = y1[start - 1, k]
            dx = xrand[i, k, 0]
            dy = xrand[i, k, 1]
            r1 = np.sqrt(prev_x**2 + prev_y**2)
            rr = np.linalg.norm(xrand[i, k])
            factor = int(abs((r1 + rr) / (limit)))
            x1[start:stop, k] = prev_x \
                + (1 - factor) * dx \
                - factor * np.sign(prev_x / dx) * dx
            y1[start:stop, k] = prev_y \
                + (1 - factor) * dy \
                - factor * np.sign(prev_y / dy) * dy
            i += 1

        # Process 2: plain jumps, starting from time index 1.
        stop = 1
        i = 0
        while stop < Nt:
            start = stop
            stop += int(trand2[i, k, 0])
            x2[start:stop, k] = x2[start - 1, k] + xrand2[i, k, 0]
            y2[start:stop, k] = y2[start - 1, k] + xrand2[i, k, 1]
            i += 1

    return T, (x1 + x2), (y1 + y2)
A simple run of the above function is given below,
# Example run of ctrw_ens2d: set up the time grid, convert the physical
# parameters into time-step units, and generate 100 trajectories.
Tmin = 0.61 # in ps
Tmax = 1000 # in ps
NT = int(Tmax/Tmin)*10  # number of time steps
delt = (Tmax-0.0)/NT  # length of one time step, in ps
print "Delta T, No. of timesteps:",delt,NT
Dint = 0.21 # in Ang^2/ps
sig = 0.3 # in Ang
xmax = 5.*sig
tau = sig**2/(2*Dint)/delt # waiting time in ps, converted to units of delt
print "Waiting time for confined motion (in Delta T units)",tau
Dj = 0.03 # in Ang^2/ps
tau2 = 10 # in ps
sig2 = np.sqrt(2*Dj*tau2)  # computed while tau2 is still in ps
print "Sigma 2:", sig2
tau2 = tau2/delt  # now in units of delt, as passed to ctrw_ens2d
alpha = 1
# 100 is the ensemble size (Nens)
tim, xtall, ytall = ctrw_ens2d(sig,tau,sig2,tau2,alpha,xmax,100,Nt=NT,dt=delt)
The generated trajectories can be plotted as follows,
# Stack x and y into one array with a trailing axis of length 2,
# print a few shapes as a sanity check, then plot one trajectory.
rall = np.stack((xtall,ytall),axis=-1)
print rall.shape
print xtall.shape
print rall[:,99,:].shape
k = 19  # index of the ensemble member to plot
plt.plot(xtall[:,k],ytall[:,k])
Starting with a zero array, the loop
while stop < Nt:
start = stop
stop += randint();
x[start:stop] = x[start-1] + rand();
will create a series of steps.
A step can be achieved with the cumulative sum of an impulse
while stop < Nt:
start = stop
stop += randint();
x[start] = any();
np.cumsum(x, out=x)
This applies to both the first and second loop.
The (x2, y2) are more easily vectorized because the increments do not depend on the previous values
The (x1, y1) still require a while loop, because each increment depends on the previous position, but each iteration can be vectorized across the ensemble.
The final result is like this
def ctrw_ens2d_vectorized(sig,tau,sig2,tau2,alpha,xmax,Nens,Nt=1000,dt=1.0):
    """Vectorized variant of the 2-D continuous-time random-walk generator.

    Jumps are written as impulses at the time index where they occur and the
    trajectories are then obtained with a cumulative sum along the time axis.
    The second (free) process is built without any Python loop; the first
    process still loops over jump number but handles all ensemble members at
    once. ``alpha`` and ``xmax`` are accepted but not used.

    Returns
    -------
    T : ndarray, shape (Nt,)
        Time axis ``arange(Nt) * dt``.
    x, y : ndarray, shape (Nt, Nens)
        Summed x and y coordinates of both processes.
    """
    T = np.arange(0, Nt, 1) * dt

    # Random draws, in the same order as written originally.
    trand = np.random.exponential(tau, (2 * Nt, Nens, 1))
    xrand = np.random.normal(0.0, sig, (2 * Nt, Nens, 2))
    Xdist = np.random.lognormal(-1, 0.9, (Nens))
    Xdist = np.clip(Xdist, 2 * sig, 12 * sig)
    trand2 = np.random.exponential(tau2, (2 * Nt, Nens, 1)).astype(np.int64)
    xrand2 = np.random.normal(0.0, sig2, (2 * Nt, Nens, 2))

    x1 = np.zeros((Nt, Nens))
    x2 = np.zeros((Nt, Nens))
    y1 = np.zeros((Nt, Nens))
    y2 = np.zeros((Nt, Nens))

    # ---- free process: impulses at the cumulative waiting times ----------
    stop = 1 + np.cumsum(trand2[:, :, 0], axis=0)
    jump_idx, ens_idx = np.indices(stop.shape)
    inside = stop < Nt  # vectorized form of the loop stopping condition
    jump_idx = jump_idx[inside]
    ens_idx = ens_idx[inside]
    when = stop[jump_idx, ens_idx]
    x2[when, ens_idx] = xrand2[jump_idx, ens_idx, 0]
    y2[when, ens_idx] = xrand2[jump_idx, ens_idx, 1]
    np.cumsum(x2, axis=0, out=x2)
    np.cumsum(y2, axis=0, out=y2)

    # ---- position-dependent process: loop over jumps, vectorized on axis 1
    stop = np.zeros(Nens, dtype=np.int64)
    start = np.zeros(Nens, dtype=np.int64)
    members = np.arange(Nens)
    # Running position of every member (what the cumulative sum will give).
    zx1 = np.zeros_like(x1[0])
    zy1 = np.zeros_like(y1[0])
    assert(np.all(trand > 0))
    m = members
    i = 0
    while np.any(stop < Nt):
        start[:] = stop
        stop[m] += trand[i, m, 0].astype(np.int64)
        m = members[stop < Nt]  # members that still have time left
        dx = xrand[i, m, 0]
        dy = xrand[i, m, 1]
        r1 = np.sqrt(zx1[m]**2 + zy1[m]**2)
        rr = np.linalg.norm(xrand[i, m, :], axis=-1)  # axis requires numpy 1.8
        factor = (abs((r1 + rr) / (Xdist[m]))).astype(np.int64)
        tx = (1 - factor) * dx \
            - factor * np.sign(zx1[m] / dx) * dx
        ty = (1 - factor) * dy \
            - factor * np.sign(zy1[m] / dy) * dy
        # Only a waiting time of at least one step advances the running
        # position; a zero-length segment is overwritten by the next jump.
        advanced = start[m] < stop[m]
        zx1[m] += tx * advanced
        zy1[m] += ty * advanced
        x1[start[m], m] = tx
        y1[start[m], m] = ty
        i += 1
    np.cumsum(x1, axis=0, out=x1)
    np.cumsum(y1, axis=0, out=y1)
    return T, (x1 + x2), (y1 + y2)
This runs ~8x faster than the original code here.
I am trying to teach my students about Chi-Square while trapped here at home. I have made a video that should be mostly helpful, however I have been having trouble making a graph with the specific properties of the Chi-Square distribution. The shape is right, however there is a lot of noise. This is simulation data, so it will never be perfectly smooth, however this is a bit much.
I have been trying to smooth the data. I have gone as far as to round the data to the nearest tenth and perform a moving average (k = 3) in order to get a graph as presentable as this:
Chi-Squared Simulation df = 3, sample size = 100, samples = 100000, rounded and smoothed
Chi-Squared Simulation df = 3, sample size = 100, samples = 100000, not rounded, smoothed
A few things I have noticed while working on this problem. First, the spikes and dips seem to occur at predictable locations. Second, without the rounding, the graph seems to alternate back and forth between a spike and a dip regularly. I think it may be possible that this is due to some sort of binary precision problem. I have tried to account for this by switching to using numpy for my operations and forcing the data to be float64. This had no effect.
What I would like to know is either:
If this problem is caused by binary precision, how can I properly mitigate that?
If this cannot be solved in that way, is there a better smoothing operation I could use?
Thank you for the assistance. Code is below.
# Draw n samples of size sampleSize and record each sample's Chi-Square statistic
chiSqrList = []
n = 100000
sampleSize = 100
# The expected counts are identical for every sample, so build them once.
expectedBalls = np.array([sampleSize*redBalls, sampleSize*greenBalls, sampleSize*yellowBalls, sampleSize*blueBalls], dtype=np.float64)
for j in range(n):
    redTotal = 0
    greenTotal = 0
    yellowTotal = 0
    blueTotal = 0
    # Classify each draw by the cumulative colour limits.
    for i in range(sampleSize):
        x = random.random()
        if x < redLim:
            redTotal += 1
        elif x < greenLim:
            greenTotal += 1
        elif x < yellowLim:
            yellowTotal += 1
        else:
            blueTotal += 1
    observedBalls = np.array([redTotal, greenTotal, yellowTotal, blueTotal], dtype=np.float64)
    chiSqr = np.sum(np.power((observedBalls - expectedBalls), 2)/expectedBalls)
    # Round to one decimal so equal statistics fall into the same bucket.
    chiSqrList.append(round(chiSqr, 1))

# Make count data
# Equal values are merged by comparing with the previous entry, which only
# produces a histogram when the values are visited in sorted order.
avgSqrDist = []
count = []
for value in sorted(chiSqrList):
    if avgSqrDist and avgSqrDist[-1] == value:
        count[-1] += 1
    else:
        avgSqrDist.append(value)
        count.append(1)

# Smooth curve
# Centred moving average over three neighbouring buckets; the first and last
# bucket have no full window and are dropped. Dividing by 3.0 keeps the
# average fractional under Python 2 integer division as well.
smoothAvgSqrDist = avgSqrDist[1:-1]
smoothCount = [(left + mid + right)/3.0
               for left, mid, right in zip(count, count[1:], count[2:])]
I am trying to optimize a snippet that gets called a lot (millions of times) so any type of speed improvement (hopefully removing the for-loop) would be great.
I am computing a correlation function of some j'th particle with all others
C_j(|r-r'|) = sqrt(E((s_j(r')-s_k(r))^2)) averaged over k.
My idea is to have a variable corrfun which bins data into some bins (the r, defined elsewhere). I find what bin of r each s_k belongs to and this is stored in ind. So ind[0] is the index of r (and thus the corrfun) for which the j=0 point corresponds to. Multiple points can fall into the same bin (in fact I want bins to be big enough to contain multiple points) so I sum together all of the (s_j(r')-s_k(r))^2 and then divide by number of points in that bin (stored in variable rw). The code I ended up making for this is the following (np is for numpy):
# Accumulate, per distance bin, the squared difference between particle j
# and every other particle k (ind[k] is the bin that particle k falls into).
for k, v in enumerate(ind):
    if j == k:
        # skip the self-interaction
        continue
    corrfun[v] += (s[k] - s[j])**2
    rw[v] += 1
# Divide by the bin counts, treating empty bins as 1 to avoid 0/0.
# np.maximum builds a new array, so rw itself keeps its zeros and callers
# can still tell empty bins (0) from bins holding one pair (1).
rw2 = np.maximum(rw, 1)
corrfun = np.sqrt(np.divide(corrfun, rw2))
Note, the rw2 business was because I want to avoid divide by 0 problems but I do return the rw array and I want to be able to differentiate between the rw=0 and rw=1 elements. Perhaps there is a more elegant solution for this as well.
Is there a way to make the for-loop faster? While I would like to not add the self interaction (j==k) I am even ok with having self interaction if it means I can get significantly faster calculation (length of ind ~ 1E6 so self interaction is probably insignificant anyways).
Thank you!
Ilya
Edit:
Here is the full code. Note, in the full code I am averaging over j as well.
import numpy as np
def twopointcorr(x,y,s,dr):
    """Binned two-point correlation of ``s`` over the point set ``(x, y)``.

    For every ordered pair of distinct points (j, k) the squared difference
    ``(s[k] - s[j])**2`` is added to the distance bin whose centre in ``r``
    is nearest to the pair's separation. Each bin is then divided by its
    pair count and the square root is taken.

    Parameters
    ----------
    x, y : array_like, shape (n,)
        Point coordinates.
    s : array_like, shape (n,)
        Value attached to each point.
    dr : float
        Spacing of the distance bins.

    Returns
    -------
    r : ndarray
        Bin positions ``arange(0, maxR, dr)`` where ``maxR`` is half the
        diagonal of the bounding box of the points.
    corrfun : ndarray
        Root-mean-square difference per bin (0 for empty bins).
    rw : ndarray
        Number of ordered pairs per bin; empty bins stay at 0.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    s = np.asarray(s)
    width = np.max(x)-np.min(x)
    height = np.max(y)-np.min(y)
    n = len(x)
    maxR = np.sqrt((width/2)**2 + (height/2)**2)
    r = np.arange(0, maxR, dr)
    print(r)
    corrfun = np.zeros_like(r)
    rw = np.zeros_like(r)
    print(maxR)
    point_ids = np.arange(n)
    # Go through all points; every point, including the last one, acts as
    # the reference point j once.
    for j in range(n):
        hypot = np.sqrt((x[j]-x)**2 + (y[j]-y)**2)
        # Index of the bin centre nearest to each separation.
        ind = np.abs(r[np.newaxis, :] - hypot[:, np.newaxis]).argmin(axis=1)
        others = point_ids != j  # leave out the self-pair
        np.add.at(corrfun, ind[others], (s[others] - s[j])**2)
        np.add.at(rw, ind[others], 1)
    # Clamp only a temporary copy of the counts for the division, so the
    # returned rw still distinguishes empty bins (0) from single pairs (1).
    corrfun = np.sqrt(np.divide(corrfun, np.maximum(rw, 1)))
    return r, corrfun, rw
I debug test it the following way
# Driver script: time twopointcorr on n random points and plot the results.
from twopointcorr import twopointcorr
import numpy as np
import matplotlib.pyplot as plt
import time
n=1000  # number of random test points
x = np.random.rand(n)
y = np.random.rand(n)
s = np.random.rand(n)
print('running two point corr functinon')
start_time = time.time()
r,corrfun,rw = twopointcorr(x,y,s,0.1)  # 0.1 is the bin spacing dr
print("--- Execution time is %s seconds ---" % (time.time() - start_time))
# First figure: correlation function versus r
fig1=plt.figure()
plt.plot(r, corrfun,'-x')
# Second figure: per-bin counts versus r
fig2=plt.figure()
plt.plot(r, rw,'-x')
plt.show()
Again, the main issue is that in the real dataset n~1E6. I can resample to make it smaller, of course, but I would love to actually crank through the dataset.
Here is the code that use broadcast, hypot, round, bincount to remove all the loops:
def twopointcorr2(x, y, s, dr):
    """Loop-free pair binning using outer differences and ``np.bincount``.

    All pairwise separations are computed at once, converted to a bin index
    by rounding ``distance / dr`` (clipped to the valid range of ``r``), and
    histogrammed.

    Returns
    -------
    r : ndarray
        Bin positions ``arange(0, maxR, dr)``.
    corrfun : ndarray
        Sum of squared differences of ``s`` per bin (not normalised).
    rw : ndarray of int
        Number of ordered pairs per bin, with the ``len(x)`` self-pairs
        removed from bin 0.
    """
    def pairwise_diff(values):
        # Matrix of values[i] - values[j] for all i, j.
        return np.subtract.outer(values, values)

    half_width = (np.max(x) - np.min(x)) / 2
    half_height = (np.max(y) - np.min(y)) / 2
    maxR = np.sqrt(half_width**2 + half_height**2)
    r = np.arange(0, maxR, dr)

    separation = np.hypot(pairwise_diff(x), pairwise_diff(y))
    ind = np.clip(np.round(separation / dr), 0, len(r) - 1).astype(int)
    flat_ind = ind.ravel()

    rw = np.bincount(flat_ind)
    rw[0] -= len(x)  # the diagonal (self-pairs) always lands in bin 0
    squared_diff = pairwise_diff(s)**2
    corrfun = np.bincount(flat_ind, squared_diff.ravel())
    return r, corrfun, rw
to compare, I modified your code as follows:
def twopointcorr(x,y,s,dr):
    """Reference (looped) pair binning, returning raw sums and counts.

    For every ordered pair of distinct points the squared difference of
    ``s`` is added to the bin of ``r`` nearest to the pair separation.
    No normalisation is applied.

    Returns
    -------
    r : ndarray
        Bin positions ``arange(0, maxR, dr)``.
    corrfun : ndarray
        Sum of squared differences per bin.
    rw : ndarray
        Number of ordered pairs per bin.
    """
    half_width = (np.max(x) - np.min(x)) / 2
    half_height = (np.max(y) - np.min(y)) / 2
    n = len(x)
    maxR = np.sqrt(half_width**2 + half_height**2)
    r = np.arange(0, maxR, dr)
    corrfun = np.zeros_like(r)
    rw = np.zeros_like(r)
    for j in range(n):
        separation = np.sqrt((x[j] - x)**2 + (y[j] - y)**2)
        # Nearest bin centre for every point as seen from point j.
        nearest_bin = np.abs(r[np.newaxis, :] - separation[:, np.newaxis]).argmin(axis=1)
        for k, v in enumerate(nearest_bin):
            if k != j:
                corrfun[v] += (s[k] - s[j])**2
                rw[v] += 1
    return r, corrfun, rw
and here is the code to check the results:
# Cross-check: the looped and the vectorized implementation must agree
# on the bins, the accumulated sums and the pair counts.
import numpy as np
n=1000  # number of random test points
x = np.random.rand(n)
y = np.random.rand(n)
s = np.random.rand(n)
r1, corrfun1, rw1 = twopointcorr(x,y,s,0.1)
r2, corrfun2, rw2 = twopointcorr2(x,y,s,0.1)
assert np.allclose(r1, r2)
assert np.allclose(corrfun1, corrfun2)
assert np.allclose(rw1, rw2)
and the %timeit results:
%timeit twopointcorr(x,y,s,0.1)
%timeit twopointcorr2(x,y,s,0.1)
outputs:
1 loop, best of 3: 5.16 s per loop
10 loops, best of 3: 134 ms per loop
Your original code on my system runs in about 5.7 seconds. I fully vectorized the inner loop and got it to run in 0.39 seconds. Simply replace your "go through all points" loop with this:
# All pairwise distances in a single call.
points = np.column_stack((x,y))
hypots = scipy.spatial.distance.cdist(points, points)
# Convert distances to bin indices. The builtin int replaces the removed
# np.int alias, and the index is capped at the last bin because
# rint(maxR / dr) can otherwise equal len(corrfun).
inds = np.rint(hypots.clip(max=maxR) / dr).astype(int).clip(max=len(corrfun) - 1)
# go through all points
for j in range(n): # n.b. previously n-1, not sure why
    ind = inds[j]
    np.add.at(corrfun, ind, (s - s[j])**2)
    np.add.at(rw, ind, 1)
    rw[ind[j]] -= 1 # subtract self
The first observation was that your hypot code was computing 2D distances, so I replaced that with cdist from SciPy to do it all in a single call. The second was that the inner for loop was slow, and thanks to an insightful comment from @hpaulj I vectorized that as well using np.add.at().
Since you asked how to vectorize the inner loop as well, I did that later. It now takes 0.25 seconds to run, for a total speedup of over 20x. Here's the final code:
# All pairwise distances in a single call.
points = np.column_stack((x,y))
hypots = scipy.spatial.distance.cdist(points, points)
# Convert distances to bin indices. The builtin int replaces the removed
# np.int alias, and the index is capped at the last bin because
# rint(maxR / dr) can otherwise equal len(corrfun).
inds = np.rint(hypots.clip(max=maxR) / dr).astype(int).clip(max=len(corrfun) - 1)
sn = np.tile(s, (n,1)) # n copies of s
diffs = (sn - sn.T)**2 # squares of pairwise differences
np.add.at(corrfun, inds, diffs)
rw = np.bincount(inds.flatten(), minlength=len(r))
np.subtract.at(rw, inds.diagonal(), 1) # subtract self
This uses more memory but does produce a substantial speedup vs. the single-loop version above.
Ok, so as it turns out, outer products are incredibly memory-expensive. However, using the answers from @HYRY and @JohnZwinck I was able to write code that is still roughly linear in n in memory and computes fast (0.5 seconds for the test case):
import numpy as np
def twopointcorr(x,y,s,dr,maxR=-1):
    """Memory-light binned two-point correlation of ``s`` over ``(x, y)``.

    One reference point is processed at a time, so only O(n) temporary
    memory is needed. Separations are converted to a bin index by rounding
    ``distance / dr``; anything beyond the last regular bin is collected in
    one extra overflow bin that is removed before returning.

    Parameters
    ----------
    x, y, s : ndarray, shape (n,)
        Point coordinates and the value attached to each point.
    dr : float
        Bin spacing.
    maxR : float, optional
        Largest separation of interest. When smaller than ``dr`` (the
        default) half the diagonal of the bounding box is used.

    Returns
    -------
    r, corrfun, rw : ndarray
        Bin positions, root-mean-square difference per bin, and ordered
        pair count per bin (self-pairs removed).
    """
    width = np.max(x) - np.min(x)
    height = np.max(y) - np.min(y)
    n = len(x)
    if maxR < dr:
        maxR = np.sqrt((width / 2)**2 + (height / 2)**2)
    # One extra bin at the end acts as the overflow bin.
    r = np.arange(0, maxR + dr, dr)
    last_bin = len(r) - 1
    corrfun = np.zeros_like(r)
    rw = np.zeros_like(r)
    for j in range(n):
        separation = np.hypot(x[j] - x, y[j] - y)
        ind = np.clip(np.round(separation / dr), 0, last_bin).astype(int)
        np.add.at(corrfun, ind, (s - s[j])**2)
        np.add.at(rw, ind, 1)
    # Each point paired with itself once, always in bin 0.
    rw[0] -= n
    corrfun = np.sqrt(np.divide(corrfun, np.maximum(rw, 1)))
    # Drop the overflow bin from all three outputs.
    return r[:-1], corrfun[:-1], rw[:-1]
I'm writing a code in Python that predicts the energy levels of Hydrogen which I will use as a template for research into quarkonium energy levels. I'm using the scipy.integrate.odeint() function to solve the Shroedinger equation and it works fine for the lower energy levels up to n=6. I don't expect I'll have much need to go beyond that, but odeint returns Excess work done on this call (perhaps wrong Dfun type). which only encourages me to extend what I can predict.
The Shroedinger equation substitution I'm using is:
u'' - (l*(l+1)/r**2 - 2mu_e(E-V_emag(r))) * u = 0
=>
u' = v
v' = ((l*(l+1))/(r**2) - 2.0*mu_e*(E - V_emag(r)))*u
I'm then using scipy.integrate.odeint() on it and iterating through energies and using other functions I've defined that assess turning points and nodes in the result. The way I find the energy levels is finding the lowest possible E value where the number of turning points and nodes matches what it should; then incrementing L by 1 and finding the new ground energy, e.g. if L=0 I'll find n=1 energy and if L=3, I'll find the n=2 energy.
Once the code increments to L=7 it doesn't return anything useful. The range of r has been extended but I've tried with keeping it the same to reduce the number of steps but to no avail. The code is self-taught so in my research I've read about Jacobians. I'm afraid I haven't worked out what they are or if I need one yet. Any ideas?
def v_emag(r):
    """Return the potential ``-alpha / r`` at radius ``r``.

    ``alpha`` is read from module scope.
    """
    return -alpha/r
def s_e(y,r,l,E): #Shroedinger equation for electromagntism
    """Right-hand side of the radial equation as a first-order system.

    With ``y = (u, v)`` the system is ``u' = v`` and
    ``v' = (l*(l+1)/r**2 - 2*mu_e*(E - v_emag(r))) * u``.

    Parameters
    ----------
    y : array_like of length 2
        Current values ``(u, v)``.
    r : float
        Radius at which the derivatives are evaluated.
    l : int
        Angular momentum quantum number.
    E : float
        Trial energy.

    Returns
    -------
    ndarray
        ``(u', v')`` with the same shape and dtype as ``y``.
    """
    x = numpy.zeros_like(y)
    x[0] = y[1]
    # The potential is defined in this file as v_emag (lower-case v);
    # the previous spelling V_emag did not match that definition.
    x[1] = ((l*(l+1))/(r**2) - 2.0*mu_e*(E - v_emag(r)))*y[0]
    return x
def i_s_e(l,E,start=0.001,stop=None,step=(0.005*bohr)):
    """Integrate the radial system ``s_e`` with ``odeint`` on a uniform grid.

    The grid is ``numpy.arange(start, stop, step)``; when ``stop`` is not
    given it defaults to ``((l+1)*30-10)*bohr``. The initial condition
    ``y0`` is read from module scope.

    Returns the ``odeint`` result: one row per grid point, column 0 holding
    ``u`` and column 1 holding ``u'``.
    """
    if stop is None:
        stop = ((l+1)*30-10)*bohr
    grid = numpy.arange(start, stop, step)
    return odeint(s_e, y0, grid, args=(l, E))
def inormalise_e(l,E,start=0.001,stop=None,step=(0.005*bohr)):
    """Return ``u(r)`` from ``i_s_e`` scaled so that the trapezoidal
    integral of ``u**2`` over the grid equals one.

    The grid and the default ``stop`` are the same as in ``i_s_e``.
    """
    if stop is None:
        stop = ((l+1)*30-10)*bohr
    grid = numpy.arange(start, stop, step)
    u = i_s_e(l, E, start, stop, step)[:, 0]
    norm_sq = numpy.trapz(u**2, x=grid)
    return u/(numpy.sqrt(norm_sq))
def inodes_e(l,E,start=0.001,stop=None,step=(0.005*bohr)):
    """Count the sign changes of ``u`` along the integration grid.

    A sign change between two neighbouring grid points (their product is
    negative) is counted as one node of the solution returned by ``i_s_e``.
    """
    if stop is None:
        stop = ((l+1)*30-10)*bohr
    u = i_s_e(l, E, start, stop, step)[:, 0]
    # Neighbouring values with a negative product straddle u = 0.
    crossings = u[:-1] * u[1:] < 0
    return int(numpy.count_nonzero(crossings))
def iturns_e(l,E,start=0.001,stop=None,step=(0.005*bohr)):
    """Count the sign changes of ``du/dr`` along the integration grid.

    A sign change of the derivative between two neighbouring grid points
    (their product is negative) is counted as one maximum or minimum of the
    solution returned by ``i_s_e``.
    """
    if stop is None:
        stop = ((l+1)*30-10)*bohr
    du = i_s_e(l, E, start, stop, step)[:, 1]
    # Neighbouring derivative values with a negative product straddle du/dr = 0.
    crossings = du[:-1] * du[1:] < 0
    return int(numpy.count_nonzero(crossings))
# Search for the lowest bound-state energy at each angular momentum l by
# bisecting an energy bracket [E1, E3] on the node count of the solution.
# The state found for a given l is reported below as n = l + 1.
l = 0
while l < 10:
E1 = -1.5e-08 # lower end of the energy bracket
E3 = 0 # upper end of the energy bracket
E2 = 0.5*(E1+E3) # midpoint of the bracket
i = 0
while i < 40: # at most 40 bisection steps
N1 = inodes_e(l,E1)
N2 = inodes_e(l,E2)
N3 = inodes_e(l,E3)
# NOTE(review): T1, T2, T3 are only referenced in the commented-out parts
# of the conditions below, so these three integrations look unused.
T1 = iturns_e(l,E1)
T2 = iturns_e(l,E2)
T3 = iturns_e(l,E3)
if N1 != N2:# and T1 != T2: #Looks in lower half first, therefore will tend to ground state
E3 = E2
E2 = 0.5*(E1+E3)
elif N2 != N3:# and T2 != T3:
E1 = E2
E2 = 0.5*(E1+E3)
else:
# same node count at both ends and the midpoint: nothing to bisect on
print "Can't find satisfactory E in range"
break
i += 1
x = inormalise_e(l,E2)
# NOTE(review): the two indices below are floats (division by 0.005);
# recent NumPy versions reject non-integer indices - confirm and wrap in int().
if x[((l+1)**2)/0.005] > (x[2*((l+1)**2)/0.005]) and iturns_e(l,E2+1e-20)==1:
print 'Energy of state: n =',(l+1),'is: ',(E2*(10**9)),'eV'
l += 1
else:
# NOTE(review): if this branch belongs to the outer while loop, E1 is reset
# to -1.5e-08 at the top of the next pass, so this assignment has no effect
# and the same l would be retried with the same bracket - confirm intent.
E1 = E2+10e-20
I don't know exactly what is wrong with your code and I'm not entirely sure what your while i<40: loop is doing, so perhaps you can correct the following if I'm wrong.
If you want the wavefunctions for a certain n, l for this system you can calculate the energy directly as E = -RH/n^2, where RH is the Rydberg energy (the Rydberg constant times hc, corrected for the reduced mass), so you don't need to count nodes. If you do need to count nodes, then the number corresponding to (n, l) is n-l-1, so you can vary E and watch the number of nodes change for fixed l.
The main problem, it seems to me, is that your r range isn't large enough to encompass all of the nodes (for large n ~ l), and that odeint doesn't know to stay away from the other (unphysical) asymptotic solution, psi ~ exp(+cr), and so under some conditions sends psi off to ±infinity for large r.
If it's at all helpful, this is what I came up with to find numerical solutions to the SE equation: you need to vary the r-range according to n,l though to avoid the above problems (eg see what happens if you ask for n, l = 10, 9).
import numpy as np
import scipy as sp
from scipy.integrate import odeint
# Physical constants taken from scipy.constants.
m_e, m_p, hbar = sp.constants.m_e, sp.constants.m_p, sp.constants.hbar
# Reduced mass of the electron-proton system.
mu_e = m_e*m_p/(m_e + m_p)
bohr = sp.constants.physical_constants['Bohr radius'][0]
Rinfhc = sp.constants.physical_constants['Rydberg constant times hc in J'][0]
# Rydberg energy scaled by the reduced mass.
RHhc = Rinfhc * mu_e / m_e
# Prefactor e^2 / (4 pi epsilon_0) of the Coulomb potential. pi comes from
# scipy.constants because the top-level scipy namespace no longer exports it.
fac = sp.constants.e**2/4/sp.constants.pi/sp.constants.epsilon_0
def V(r):
    """Return the potential ``-fac / r`` at radius ``r``.

    ``fac`` is read from module scope.
    """
    potential = -fac/r
    return potential
def deriv(y, r, l, E):
    """Right-hand side of the radial equation as a first-order system.

    ``y`` holds the function value and its first derivative at radius ``r``.
    Returns the pair (first derivative, second derivative), where the second
    derivative is
    ``-2*y2/r - (2*mu_e/hbar**2*(E - V(r)) - l*(l+1)/r**2)*y1``.
    """
    psi, dpsi = y
    energy_term = 2*mu_e/hbar**2*(E - V(r))
    centrifugal_term = l*(l+1)/r**2
    d2psi = -2*dpsi/r - (energy_term - centrifugal_term)*psi
    return dpsi, d2psi
def solveSE(l, E, y0):
    """Integrate ``deriv`` outward on a uniform radial grid with ``odeint``.

    Parameters
    ----------
    l : int
        Angular momentum quantum number.
    E : float
        Energy passed through to ``deriv``.
    y0 : sequence of two floats
        Function value and derivative at the first grid point.

    Returns
    -------
    r : ndarray
        Radial grid from ``0.001*bohr`` in steps of ``0.001*bohr``.
    y : ndarray
        Solution on the grid.
    dydt : ndarray
        Derivative of the solution on the grid.
    """
    rstep = 0.001 * bohr
    rmin = rstep
    # The outer radius scales with l. For l = 0 that product is zero and the
    # grid would be empty, so the l = 1 extent is used as a lower bound.
    rmax = 200*max(l, 1) * bohr
    r = np.arange(rmin, rmax, rstep)
    y, dydt = odeint(deriv, y0, r, args=(l,E)).T
    return r, y, dydt
# Example: integrate for n = 10, l = 2 at the energy -RHhc / n**2 and plot
# the resulting solution.
n = 10
l = 2
y0 = (bohr, -bohr)  # initial value and derivative handed to odeint
E = -RHhc / n**2
r, psi, dpsi_dr = solveSE(l, E, y0)
import pylab
pylab.plot(r, psi)
pylab.show()
I want to build a grid from sampled data. I could use a machine learning - clustering algorithm, like k-means, but I want to restrict the centres to be roughly uniformly distributed.
I have come up with an approach using the scikit-learn nearest neighbours search: pick a point at random, delete all points within radius r then repeat. This works well, but wondering if anyone has a better (faster) way of doing this.
In response to comments I have tried two alternate methods, one turns out much slower the other is about the same...
Method 0 (my first attempt):
def get_centers0(X, r):
    """Pick centres by repeated random selection and radius deletion.

    While rows remain in ``X``: refit the neighbour index, pick one
    remaining row at random, record it as a centre, and delete every row
    returned by the radius-``r`` neighbour query for it.

    Returns an array with one picked centre per row and ``X.shape[1]``
    columns.
    """
    n_dims = X.shape[1]
    # Starting with an empty float block keeps the stacked result's dtype
    # the same as stacking onto np.zeros([0, D]) one row at a time.
    picked = [np.zeros([0, n_dims])]
    nearest = near.NearestNeighbors(radius=r, algorithm='auto')
    remaining = X.shape[0]
    while remaining > 0:
        nearest.fit(X)
        x = X[int(random() * remaining), :]
        _, del_x = nearest.radius_neighbors(x)
        X = np.delete(X, del_x[0], axis=0)
        picked.append(x)
        remaining = X.shape[0]
    return np.vstack(picked)
Method 1 (using the precomputed graph):
def get_centers1(X, r):
    """Fit a radius-``r`` neighbour index on ``X`` and build its
    radius-neighbours graph.

    Only the graph construction is performed (no pruning step follows), and
    nothing is returned.
    """
    n_dims = X.shape[1]
    grid = np.zeros([0, n_dims])
    nearest = near.NearestNeighbors(radius=r, algorithm='auto')
    nearest.fit(X)
    graph = nearest.radius_neighbors_graph(X)
    # This method is very slow even before doing any 'pruning'
Method 2:
def get_centers2(X, r, k):
    """Pick centres in batches of up to ``k`` random candidates per pass.

    While rows remain in ``X``: refit the neighbour index, draw ``k`` random
    rows (with replacement), keep those whose ``min_dist`` value from the
    external ``dist`` helper is below 0.1, record them, and delete the rows
    returned by the radius-``r`` neighbour query.

    Returns an array with one recorded candidate per row.
    """
    n_dims = X.shape[1]
    batches = [np.zeros([0, n_dims])]
    nearest = near.NearestNeighbors(radius=r, algorithm='auto')
    remaining = X.shape[0]
    while remaining > 0:
        nearest.fit(X)
        x = X[np.random.randint(0, remaining, k), :]
        # Alternative without the compiled helper:
        # min_dist = near.NearestNeighbors().fit(x).kneighbors(x, n_neighbors = 1, return_distance = True)
        min_dist = dist(x, k, 2, np.ones(k))  # where dist is a cython compiled function
        x = x[min_dist < 0.1, :]
        _, del_x = nearest.radius_neighbors(x)
        # NOTE(review): only the neighbours of the first kept candidate
        # (del_x[0]) are deleted, although all kept candidates are recorded
        # below - confirm whether every entry of del_x should be removed.
        X = np.delete(X, del_x[0], axis=0)
        batches.append(x)
        remaining = X.shape[0]
    return np.vstack(batches)
Running these as follows:
N = 50000
r = 0.1
x1 = np.random.rand(N)
x2 = np.random.rand(N)
X = np.vstack([x1, x2]).T
tic = time.time()
grid0 = get_centers0(X, r)
toc = time.time()
print 'Method 0: ' + str(toc - tic)
tic = time.time()
get_centers1(X, r)
toc = time.time()
print 'Method 1: ' + str(toc - tic)
tic = time.time()
grid2 = get_centers2(X, r)
toc = time.time()
print 'Method 1: ' + str(toc - tic)
Method 0 and 2 are about the same...
Method 0: 0.840130090714
Method 1: 2.23365592957
Method 2: 0.774812936783
I'm not sure from the question exactly what you are trying to do. You mention wanting to create an "approximate grid", or a "uniform distribution", while the code you provide selects a subset of points such that no pairwise distance is less than r.
A couple possible suggestions:
if what you want is an approximate grid, I would construct the grid you want to approximate, and then query for the nearest neighbor of each grid point. Depending on your application, you might further trim these results to cut-out points whose distance from the grid point is larger than is useful for you.
if what you want is an approximately uniform distribution drawn from among the points, I would do a kernel density estimate (sklearn.neighbors.KernelDensity) at each point, and do a randomized sub-selection from the dataset weighted by the inverse of the local density at each point.
if what you want is a subset of points such that no pairwise distance is less than r, I would start by constructing a radius_neighbors_graph with radius r, which will, in one go, give you a list of all points which are too close together. You can then use a pruning algorithm similar to the one you wrote above to remove points based on these sparse graph distances.
I hope that helps!
I have come up with a very simple method which is much more efficient than my previous attempts.
This one simply loops over the data set and adds the current point to the list of grid points only if it is greater than r distance from all existing centers. This method is around 20 times faster than my previous attempts. Because there are no external libraries involved I can run this all in cython...
#cython.boundscheck(False)
#cython.wraparound(False)
#cython.nonecheck(False)
def get_centers_fast(np.ndarray[DTYPE_t, ndim = 2] x, double radius):
    """Greedy single-pass selection of centres that are more than ``radius`` apart.

    Rows of ``x`` are visited in order; a row becomes a new centre when its
    Euclidean distance to every centre chosen so far is greater than
    ``radius``. The first row is therefore always a centre.

    Parameters
    ----------
    x : 2-D array, one point per row
    radius : minimum separation between any two returned centres

    Returns
    -------
    2-D array holding the chosen centres, one per row, in the order they
    were found (empty when ``x`` has no rows).
    """
    cdef int N = x.shape[0]
    cdef int D = x.shape[1]
    cdef int m = 0  # number of centres found so far
    # Every input row can become a centre at most once, so N rows is always
    # enough room; a fixed-size buffer could be overrun.
    cdef np.ndarray[DTYPE_t, ndim = 2] xc = np.zeros([N, D], dtype=x.dtype)
    cdef double r = 0
    cdef bint isolated
    cdef int i, j, k
    for i in range(N):
        # A point is accepted only if no existing centre lies within radius.
        isolated = True
        for j in range(m):
            r = 0
            for k in range(D):
                r += (x[i, k] - xc[j, k])**2
            r = r**0.5
            if r <= radius:
                isolated = False
                break
        if isolated:
            for k in range(D):
                xc[m, k] = x[i, k]
            m = m + 1
    # Return exactly the filled rows; filtering on non-zero first coordinates
    # would drop a legitimate centre whose first coordinate is 0.
    return xc[:m, :]
Running these methods as follows:
# Benchmark of methods 0, 2 and the Cython version on N normally distributed
# 2-D points, each coordinate rescaled to the range [0, 1].
N = 40000
r = 0.1
x1 = np.random.normal(size = N)
x1 = (x1 - min(x1)) / (max(x1)-min(x1))  # min-max rescaling
x2 = np.random.normal(size = N)
x2 = (x2 - min(x2)) / (max(x2)-min(x2))  # min-max rescaling
X = np.vstack([x1, x2]).T
tic = time.time()
grid0 = gt.get_centers0(X, r)
toc = time.time()
print 'Method 0: ' + str(toc - tic)
tic = time.time()
grid2 = gt.get_centers2(X, r, 10)  # 10 candidates per pass
toc = time.time()
print 'Method 2: ' + str(toc - tic)
tic = time.time()
grid3 = gt.get_centers_fast(X, r)
toc = time.time()
print 'Method 3: ' + str(toc - tic)
The new method is around 20 times faster. It could be made even faster, if I stopped looping early (e.g. if k successive iterations fail to produce a new center).
Method 0: 0.219595909119
Method 2: 0.191949129105
Method 3: 0.0127329826355
Maybe you could only re-fit the nearest object every k << N deletions to speedup the process. Most of the time the neighborhood structure should not change much.
Sounds like you are trying to reinvent one of the following:
cluster features (see BIRCH)
data bubbles (see "Data bubbles: Quality preserving performance boosting for hierarchical clustering")
canopy pre-clustering
i.e. this concept has already been invented at least three times with small variations.
Technically, it is not clustering. K-means isn't really clustering either.
It is much more adequately described as vector quantization.