Speed up DTAIDistance key function with Numba - Python
The DTAIDistance package can be used to find the k best matches of an input query, but it cannot handle a multi-dimensional input query. Moreover, I want to find the k best matches of many input queries in one run.
I modified the DTAIDistance function so that it can search subsequences for multi-dimensional, multi-query input. I use njit with parallel=True to speed up the process, i.e. the p_calc function, which runs a Numba parallel loop over the input queries. But I find that the parallel calculation does not seem to speed things up compared to simply looping over the input queries one by one, i.e. the calc function.
import time
from tqdm import tqdm
from numba import njit, prange
import numpy as np
inf = np.inf
argmin=np.argmin
@njit(fastmath=True, nogil=True, error_model="numpy", cache=True, parallel=True)
def p_calc(d, dtw, s1, s2, r, c, psi_1b, psi_1e, psi_2b, psi_2e, window, max_step, max_dist, penalty, psi_neg):
    n_series = s1.shape[1]
    ndim = s1.shape[2]
    # allocations hoisted out of the function (now passed in as arguments):
    # s1 = np.ascontiguousarray(s1)
    # s2 = np.ascontiguousarray(s2)
    # dtw = np.full((n_series, r + 1, c + 1), np.inf, dtype=s1.dtype)
    # d = np.full((n_series), np.inf, dtype=s1.dtype)
    for i in range(psi_2b + 1):
        dtw[:, 0, i] = 0
    for i in range(psi_1b + 1):
        dtw[:, i, 0] = 0
    for nn in prange(n_series):  # parallel loop over the queries
        # print('im alive...')  # debug print; slows down the parallel loop
        i0 = 1
        i1 = 0
        sc = 0
        ec = 0
        smaller_found = False
        ec_next = 0
        for i in range(r):
            i0 = i
            i1 = i + 1
            j_start = max(0, i - max(0, r - c) - window + 1)
            j_end = min(c, i + max(0, c - r) + window)
            if sc > j_start:
                j_start = sc
            smaller_found = False
            ec_next = i
            for j in range(j_start, j_end):
                val = 0
                tmp = (s1[i, nn] - s2[j]) ** 2
                # tmp = np.abs(s1[i, nn] - s2[j, 0])
                for nd in range(ndim):
                    val += tmp[nd]
                d[nn] = val
                # d = np.sum(np.abs(s1[i] - s2[j]))  # multi-d
                if max_step is not None and d[nn] > max_step:
                    continue
                dtw[nn, i1, j + 1] = d[nn] + min(dtw[nn, i0, j],
                                                 dtw[nn, i0, j + 1] + penalty,
                                                 dtw[nn, i1, j] + penalty)
                if dtw[nn, i1, j + 1] > max_dist:
                    if not smaller_found:
                        sc = j + 1
                    if j >= ec:
                        break
                else:
                    smaller_found = True
                    ec_next = j + 1
            ec = ec_next
        # Decide which d to return
        dtw[nn] = np.sqrt(dtw[nn])
        if psi_1e == 0 and psi_2e == 0:
            d[nn] = dtw[nn, i1, min(c, c + window - 1)]
        else:
            ir = i1
            ic = min(c, c + window - 1)
            if psi_1e != 0:
                vr = dtw[nn, ir:max(0, ir - psi_1e - 1):-1, ic]
                mir = np.argmin(vr)
                vr_mir = vr[mir]
            else:
                mir = ir
                vr_mir = inf
            if psi_2e != 0:
                vc = dtw[nn, ir, ic:max(0, ic - psi_2e - 1):-1]
                mic = np.argmin(vc)
                vc_mic = vc[mic]
            else:
                mic = ic
                vc_mic = inf
            if vr_mir < vc_mic:
                if psi_neg:
                    dtw[nn, ir:ir - mir:-1, ic] = -1
                d[nn] = vr_mir
            else:
                if psi_neg:
                    dtw[nn, ir, ic:ic - mic:-1] = -1
                d[nn] = vc_mic
        if max_dist and d[nn] ** 2 > max_dist:
            d[nn] = inf
    return d, dtw
@njit(fastmath=True, nogil=True)  # "nopython" mode for best performance
def calc(s1, s2, r, c, psi_1b, psi_1e, psi_2b, psi_2e, window, max_step, max_dist, penalty, psi_neg):
    dtw = np.full((r + 1, c + 1), np.inf)
    for i in range(psi_2b + 1):
        dtw[0, i] = 0
    for i in range(psi_1b + 1):
        dtw[i, 0] = 0
    i0 = 1
    i1 = 0
    sc = 0
    ec = 0
    smaller_found = False
    ec_next = 0
    for i in range(r):
        i0 = i
        i1 = i + 1
        j_start = max(0, i - max(0, r - c) - window + 1)
        j_end = min(c, i + max(0, c - r) + window)
        if sc > j_start:
            j_start = sc
        smaller_found = False
        ec_next = i
        for j in range(j_start, j_end):
            # d = (s1[i] - s2[j]) ** 2  # 1-d
            d = np.sum((s1[i] - s2[j]) ** 2)  # multi-d
            # d = np.sum(np.abs(s1[i] - s2[j]))  # multi-d
            if max_step is not None and d > max_step:
                continue
            dtw[i1, j + 1] = d + min(dtw[i0, j],
                                     dtw[i0, j + 1] + penalty,
                                     dtw[i1, j] + penalty)
            if dtw[i1, j + 1] > max_dist:
                if not smaller_found:
                    sc = j + 1
                if j >= ec:
                    break
            else:
                smaller_found = True
                ec_next = j + 1
        ec = ec_next
    # Decide which d to return
    dtw = np.sqrt(dtw)
    if psi_1e == 0 and psi_2e == 0:
        d = dtw[i1, min(c, c + window - 1)]
    else:
        ir = i1
        ic = min(c, c + window - 1)
        if psi_1e != 0:
            vr = dtw[ir:max(0, ir - psi_1e - 1):-1, ic]
            mir = argmin(vr)
            vr_mir = vr[mir]
        else:
            mir = ir
            vr_mir = inf
        if psi_2e != 0:
            vc = dtw[ir, ic:max(0, ic - psi_2e - 1):-1]
            mic = argmin(vc)
            vc_mic = vc[mic]
        else:
            mic = ic
            vc_mic = inf
        if vr_mir < vc_mic:
            if psi_neg:
                dtw[ir:ir - mir:-1, ic] = -1
            d = vr_mir
        else:
            if psi_neg:
                dtw[ir, ic:ic - mic:-1] = -1
            d = vc_mic
    if max_dist and d * d > max_dist:
        d = inf
    return d, dtw
mydtype = np.float32
series1 = np.random.random((16, 30, 2)).astype(mydtype)
series2 = np.random.random((100000, 2)).astype(mydtype)
n_series = series1.shape[1]
r = series1.shape[0]
c = series2.shape[0]
dtw = np.full((n_series, r + 1, c + 1), np.inf, dtype=mydtype) # cmath.inf
d = np.full((n_series), np.inf, dtype=mydtype) # cmath.inf
time1 = time.time()
d, dtw1 = p_calc(d, dtw, series1, series2, series1.shape[0], series2.shape[0], 0, 0,
                 series2.shape[0], series2.shape[0], series2.shape[0], np.inf, np.inf, 0.01, False)
print(time.time() - time1)
time1 = time.time()
for ii in tqdm(range(series1.shape[1])):
    d, dtw1 = calc(series1[:, ii, :], series2, series1.shape[0], series2.shape[0], 0, 0,
                   series2.shape[0], series2.shape[0], series2.shape[0], np.inf, np.inf, 0.01, False)
print(time.time() - time1)  # this one is faster
How can I speed up the calc function or the p_calc function so that I can calculate the dynamic time warping paths of multi-dimensional, multi-query inputs?
Thanks for the answer. I then modified the code for simplification: I deleted the np.sum part and used an explicit loop instead, which gives another speedup. Any suggestions for further speedups? (One memory-layout idea is sketched after the code below.)
import time
from numba import njit, prange
import numpy as np
inf = np.inf
argmin=np.argmin
@njit(fastmath=True, nogil=True, error_model="numpy", cache=False, parallel=True)
def p_calc(d, dtw, s1, s2, r, c, psi_1b, psi_1e, psi_2b, psi_2e, window, max_step, max_dist, penalty, psi_neg):
    n_series = s1.shape[1]
    ndim = s1.shape[2]
    for nn in prange(n_series):
        for i in range(r):
            j_start = 0
            j_end = c
            for j in range(j_start, j_end):
                val = 0
                # tmp = (s1[i, nn] - s2[j]) ** 2
                # tmp = np.abs(s1[i, nn] - s2[j, 0])
                for nd in range(ndim):
                    tmp = (s1[i, nn, nd] - s2[j, nd]) ** 2
                    val += tmp
                d[nn] = val
    return d, dtw

@njit(fastmath=True, nogil=True)  # "nopython" mode for best performance
def calc(dtw, s1, s2, r, c, psi_1b, psi_1e, psi_2b, psi_2e, window, max_step, max_dist, penalty, psi_neg):
    ndim = s1.shape[-1]
    for i in range(r):
        j_start = 0
        j_end = c
        for j in range(j_start, j_end):
            d = 0
            for kk in range(ndim):
                d += (s1[i, kk] - s2[j, kk]) ** 2
    return d, dtw
mydtype = np.float32
series1 = np.random.random((16, 300, 2)).astype(mydtype)
series2 = np.random.random((1000000, 2)).astype(mydtype)
n_series = series1.shape[1]
r = series1.shape[0]
c = series2.shape[0]
dtw = np.full((n_series, r + 1, c + 1), np.inf, dtype=mydtype) # cmath.inf
d = np.full((n_series), np.inf, dtype=mydtype) # cmath.inf
time1 = time.time()
d1, dtw1 = p_calc(d, dtw, series1, series2, series1.shape[0], series2.shape[0], 0, 0,
                  series2.shape[0], series2.shape[0], series2.shape[0], np.inf, np.inf, 0.01, False)
print(time.time() - time1)
time1 = time.time()
dtw = np.full((r + 1, c + 1), np.inf, dtype=mydtype)
for ii in range(series1.shape[1]):
    d2, dtw2 = calc(dtw, series1[:, ii, :], series2, series1.shape[0], series2.shape[0], 0, 0,
                    series2.shape[0], series2.shape[0], series2.shape[0], np.inf, np.inf, 0.01, False)
print(time.time() - time1)  # this one is faster
print(np.allclose(dtw1[-1], dtw2))
print(np.allclose(d1[-1], d2))
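One further idea, offered only as a hedged sketch (it is not from the answer below, and whether it pays off here is an assumption to benchmark): series1[:, ii, :] and s1[i, nn] index strided views of the (r, n_series, ndim) array, so the inner loops touch memory with a large stride. Reordering the queries once so each one is a C-contiguous block gives unit-stride access:

import numpy as np

# Hypothetical helper (the name is made up for illustration): series1 has
# shape (r, n_series, ndim) above, so transposing to (n_series, r, ndim)
# and copying makes each query queries[nn] a contiguous (r, ndim) block.
def contiguous_queries(series1):
    return np.ascontiguousarray(series1.transpose(1, 0, 2))

# Assumed usage: queries = contiguous_queries(series1), then index
# queries[nn, i, nd] inside p_calc instead of s1[i, nn, nd].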
EDIT:
I found that the following code's performance is very different depending on whether pass or break is used. I don't understand why.
@njit(fastmath=True, nogil=True)
def kbest_matches(matching, k=4000):
    ki = 0
    while ki < k:
        best_idx = np.argmin(matching)  # np.argmin(np.arange(10000000))
        if best_idx == 0:
            # pass
            break
        ki += 1
    return 0

ss = np.random.random((1575822,))
time1 = time.time()
pp = kbest_matches(ss)
print(time.time() - time1)
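One plausible, unverified explanation for this EDIT: with break, the result of np.argmin controls the loop exit, so it must actually be computed every iteration; with pass, best_idx is never used, and LLVM (which Numba compiles through) may eliminate the argmin call, or even the whole loop, as dead code. A hedged way to probe this hypothesis is to keep the result live so it cannot be optimized away:

import numpy as np
from numba import njit

# Probe sketch (assumption, not a fix): accumulating best_idx forces the
# compiler to keep the argmin computation. If this version is about as slow
# as the break version, dead-code elimination likely explains the gap.
@njit(fastmath=True, nogil=True)
def kbest_matches_probe(matching, k=4000):
    ki = 0
    acc = 0
    while ki < k:
        best_idx = np.argmin(matching)
        acc += best_idx  # keep the value observable
        ki += 1
    return acc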
I assume the code of both implementations is correct and has been carefully checked (otherwise the benchmark would be pointless).
The issue likely comes from the compilation time of the function. Indeed, the first call is significantly slower than subsequent calls, even with cache=True. This is especially important for the parallel implementation, as compiling parallel Numba code is often slower (since it is more complex). The best solution to avoid this is to compile Numba functions ahead of time by providing explicit types to Numba.
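As a minimal sketch of what "providing types" means, on a toy function rather than the question's full p_calc signature: passing an explicit signature to @njit makes Numba compile at decoration time, so the first timed call no longer pays the compilation cost.

import numpy as np
from numba import njit, float32

# Toy example (not the question's function): the signature
# float32(float32[:, ::1]) tells Numba the argument is a C-contiguous 2D
# float32 array, so compilation happens eagerly, here, at import time.
@njit(float32(float32[:, ::1]), fastmath=True, nogil=True)
def total_sq(a):
    s = np.float32(0.0)
    for i in range(a.shape[0]):
        for j in range(a.shape[1]):
            s += a[i, j] * a[i, j]
    return s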
Besides this, benchmarking a computation only once is usually considered bad practice. Good benchmarks perform multiple iterations and discard the first ones (or consider them separately). Indeed, several other problems can appear when code is executed for the first time: the CPU caches (and the TLB) are cold, the CPU frequency can change during the execution and is likely lower when the program has just started, page faults may occur, etc.
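A sketch of that practice (the repetition count of 5 is arbitrary): run one untimed warm-up call to absorb compilation and cache warm-up, then time several repetitions and report the minimum.

import time

# Minimal benchmark-harness sketch; fn is any function already checked
# for correctness. The first call absorbs JIT compilation, cold caches,
# and page faults; only the later repetitions are timed.
def bench(fn, *args, reps=5):
    fn(*args)  # warm-up call, not timed
    best = float("inf")
    for _ in range(reps):
        t0 = time.perf_counter()
        fn(*args)
        best = min(best, time.perf_counter() - t0)
    return best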
In practice, I cannot reproduce the issue. Actually, p_calc is 3.3 times faster on my 6-core machine. When the benchmark is done in a loop of 5 iterations, the measured speedup of the parallel implementation is much larger: about 13 times (which is actually suspicious for a parallel implementation using 6 threads on a 6-core machine).
Read about Jenks Natural Breaks. Function in Python found the link from the article: def get_jenks_breaks(data_list, number_class): data_list.sort() mat1 = [] for i in range(len(data_list) + 1): temp = [] for j in range(number_class + 1): temp.append(0) mat1.append(temp) mat2 = [] for i in range(len(data_list) + 1): temp = [] for j in range(number_class + 1): temp.append(0) mat2.append(temp) for i in range(1, number_class + 1): mat1[1][i] = 1 mat2[1][i] = 0 for j in range(2, len(data_list) + 1): mat2[j][i] = float('inf') v = 0.0 for l in range(2, len(data_list) + 1): s1 = 0.0 s2 = 0.0 w = 0.0 for m in range(1, l + 1): i3 = l - m + 1 val = float(data_list[i3 - 1]) s2 += val * val s1 += val w += 1 v = s2 - (s1 * s1) / w i4 = i3 - 1 if i4 != 0: for j in range(2, number_class + 1): if mat2[l][j] >= (v + mat2[i4][j - 1]): mat1[l][j] = i3 mat2[l][j] = v + mat2[i4][j - 1] mat1[l][1] = 1 mat2[l][1] = v k = len(data_list) kclass = [] for i in range(number_class + 1): kclass.append(min(data_list)) kclass[number_class] = float(data_list[len(data_list) - 1]) count_num = number_class while count_num >= 2: # print "rank = " + str(mat1[k][count_num]) idx = int((mat1[k][count_num]) - 2) # print "val = " + str(data_list[idx]) kclass[count_num - 1] = data_list[idx] k = int((mat1[k][count_num] - 1)) count_num -= 1 return kclass Use and visualization: import numpy as np import matplotlib.pyplot as plt def get_jenks_breaks(...):... x = np.random.random(30) breaks = get_jenks_breaks(x, 5) for line in breaks: plt.plot([line for _ in range(len(x))], 'k--') plt.plot(x) plt.grid(True) plt.show() Result: