Is there a lexicographical version of searchsorted in numpy? - python

I have two arrays which are lex-sorted.
In [2]: a = np.array([1,1,1,2,2,3,5,6,6])
In [3]: b = np.array([10,20,30,5,10,100,10,30,40])
In [4]: ind = np.lexsort((b, a)) # sorts elements first by a and then by b
In [5]: print a[ind]
[1 1 1 2 2 3 5 6 6]
In [7]: print b[ind]
[ 10 20 30 5 10 100 10 30 40]
I want to do a binary search for (2, 7) and (5, 150) expecting (4, 7) as the answer.
In [6]: np.lexsearchsorted((a,b), ([2, 5], [7,150]))
NumPy has a searchsorted function, but it works only on 1D arrays.
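Side note (an illustration, not a general answer): if both keys are small non-negative integers, plain np.searchsorted can still be used by packing each (a, b) pair into a single scalar key, e.g.:
import numpy as np
a = np.array([1,1,1,2,2,3,5,6,6])
b = np.array([10,20,30,5,10,100,10,30,40])
# Pack both keys into one integer; only valid when b < SCALE and the values
# are non-negative, so this is a workaround rather than a general solution.
SCALE = 1000
keys = a * SCALE + b                     # already sorted because (a, b) is lexsorted
queries = np.array([2, 5]) * SCALE + np.array([7, 150])
print(np.searchsorted(keys, queries))    # [4 7]
This obviously breaks down for floats, negative values, or an unbounded second key, which is why a real lexicographical searchsorted would still be useful.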

EDIT: Edited to reflect comment.
def comp_leq(t1,t2):
    if (t1[0] > t2[0]) or ((t1[0] == t2[0]) and (t1[1] > t2[1])):
        return 0
    else:
        return 1
def bin_search(L,item):
    from math import floor
    x = L[:]
    while len(x) > 1:
        index = int(floor(len(x)/2) - 1)
        #Check item
        if comp_leq(x[index], item):
            x = x[index+1:]
        else:
            x = x[:index+1]
    out = L.index(x[0])
    #If greater than all
    if item >= L[-1]:
        return len(L)
    else:
        return out
def lexsearch(a,b,items):
    z = zip(a,b)
    return [bin_search(z,item) for item in items]
if __name__ == '__main__':
    a = [1,1,1,2,2,3,5,6,6]
    b = [10,20,30,5,10,100,10,30,40]
    print lexsearch(a,b,([2,7],[5,150])) #prints [4,7]

This code seems to do it for a set of (exactly) 2 lexsorted arrays.
You might be able to make it faster if you create a set of the values in values[-1] and then a dictionary with the boundaries for them (a sketch of this idea follows the usage example below).
I haven't checked other cases apart from the posted one, so please verify it's not bugged.
def lexsearchsorted_2(arrays, values, side='left'):
    assert len(arrays) == 2
    assert (np.lexsort(arrays) == range(len(arrays[0]))).all()
    # here it will be faster to work on all equal values in 'values[-1]' in one go
    boundries_l = np.searchsorted(arrays[-1], values[-1], side='left')
    boundries_r = np.searchsorted(arrays[-1], values[-1], side='right')
    # a recursive definition here will make it work for more than 2 lexsorted arrays
    return tuple([boundries_l[i] +
                  np.searchsorted(arrays[-2][boundries_l[i]:boundries_r[i]],
                                  values[-2][i],
                                  side=side)
                  for i in range(len(boundries_l))])
Usage:
import numpy as np
a = np.array([1,1,1,2,2,3,5,6,6])
b = np.array([10,20,30,5,10,100,10,30,40])
lexsearchsorted_2((b, a), ([7,150], [2, 5])) # return (4, 7)
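A minimal sketch of the speed-up hinted at above (my own illustration, not from the original answer): compute the left/right boundaries only once per distinct value of values[-1] and cache them in a dictionary. The helper name is hypothetical.
import numpy as np
def lexsearchsorted_2_cached(arrays, values, side='left'):
    last_keys, sub_keys = arrays[-1], arrays[-2]
    # one searchsorted pair per unique primary-key value in the queries
    bounds = {v: (np.searchsorted(last_keys, v, side='left'),
                  np.searchsorted(last_keys, v, side='right'))
              for v in np.unique(values[-1])}
    out = []
    for v_last, v_sub in zip(values[-1], values[-2]):
        lo, hi = bounds[v_last]
        out.append(lo + np.searchsorted(sub_keys[lo:hi], v_sub, side=side))
    return tuple(out)
# lexsearchsorted_2_cached((b, a), ([7, 150], [2, 5]))  # -> (4, 7), same as above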

I ran into the same issue and came up with a different solution. You can treat the multi-column data instead as single entries using a structured data type. A structured data type will allow one to use argsort/sort on the data (instead of lexsort, although lexsort appears faster at this stage) and then use the standard searchsorted. Here is an example:
import numpy as np
from itertools import repeat
# Setup our input data
# Every row is an entry, every column what we want to sort by
# Unlike lexsort, this takes columns in decreasing priority, not increasing
a = np.array([1,1,1,2,2,3,5,6,6])
b = np.array([10,20,30,5,10,100,10,30,40])
data = np.transpose([a,b])
# Sort the data
data = data[np.lexsort(data.T[::-1])]
# Convert to a structured data-type
dt = np.dtype(zip(repeat(''), repeat(data.dtype, data.shape[1]))) # the structured dtype
data = np.ascontiguousarray(data).view(dt).squeeze(-1) # the dtype change leaves a trailing 1 dimension; ascontiguousarray is required for the dtype change
# You can also first convert to the structured data-type with the two lines above then use data.sort()/data.argsort()/np.sort(data)
# Search the data
values = np.array([(2,7),(5,150)], dtype=dt) # note: when using structured data types the rows must be a tuple
pos = np.searchsorted(data, values)
# pos is (4,7) in this example, exactly what you would want
This works for any number of columns, uses the built-in numpy functions, the columns remain in the "logical" order (decreasing priority), and it should be quite fast.
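One practical note (my addition, not part of the original answer): if the query points start out as plain arrays rather than literal tuples, the structured values array can be built as below; the explicit list(...) calls also make the dtype construction work on Python 3, where zip returns an iterator.
from itertools import repeat
import numpy as np
# assuming `data` (the sorted structured view) and `dt` from the example above
qa = np.array([2, 5])     # first-priority query column
qb = np.array([7, 150])   # second-priority query column
values = np.array(list(zip(qa, qb)), dtype=dt)
pos = np.searchsorted(data, values)
print(pos)  # expected [4 7], matching the example above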
I compared the two numpy-based methods time-wise.
#1 is the recursive method from @j0ker5 (the one below extends his example with his suggestion of recursion and works with any number of lexsorted rows)
#2 is the structured array from me
They both take the same inputs, basically like searchsorted except a and v are as per lexsort.
import numpy as np
def lexsearch1(a, v, side='left', sorter=None):
    def _recurse(a, v):
        if a.shape[1] == 0: return 0
        if a.shape[0] == 1: return a.squeeze(0).searchsorted(v.squeeze(0), side)
        bl = np.searchsorted(a[-1,:], v[-1], side='left')
        br = np.searchsorted(a[-1,:], v[-1], side='right')
        return bl + _recurse(a[:-1,bl:br], v[:-1])
    a,v = np.asarray(a), np.asarray(v)
    if v.ndim == 1: v = v[:,np.newaxis]
    assert a.ndim == 2 and v.ndim == 2 and a.shape[0] == v.shape[0] and a.shape[0] > 1
    if sorter is not None: a = a[:,sorter]
    bl = np.searchsorted(a[-1,:], v[-1,:], side='left')
    br = np.searchsorted(a[-1,:], v[-1,:], side='right')
    for i in xrange(len(bl)): bl[i] += _recurse(a[:-1,bl[i]:br[i]], v[:-1,i])
    return bl
def lexsearch2(a, v, side='left', sorter=None):
    from itertools import repeat
    a,v = np.asarray(a), np.asarray(v)
    if v.ndim == 1: v = v[:,np.newaxis]
    assert a.ndim == 2 and v.ndim == 2 and a.shape[0] == v.shape[0] and a.shape[0] > 1
    a_dt = np.dtype(zip(repeat(''), repeat(a.dtype, a.shape[0])))
    v_dt = np.dtype(zip(a_dt.names, repeat(v.dtype, a.shape[0])))
    a = np.asfortranarray(a[::-1,:]).view(a_dt).squeeze(0)
    v = np.asfortranarray(v[::-1,:]).view(v_dt).squeeze(0)
    return a.searchsorted(v, side, sorter).ravel()
a = np.random.randint(100, size=(2,10000)) # Values to sort, rows in increasing priority
v = np.random.randint(100, size=(2,10000)) # Values to search for, rows in increasing priority
sorted_idx = np.lexsort(a)
a_sorted = a[:,sorted_idx]
And the timing results (in iPython):
# 2 rows
%timeit lexsearch1(a_sorted, v)
10 loops, best of 3: 33.4 ms per loop
%timeit lexsearch2(a_sorted, v)
100 loops, best of 3: 14 ms per loop
# 10 rows
%timeit lexsearch1(a_sorted, v)
10 loops, best of 3: 103 ms per loop
%timeit lexsearch2(a_sorted, v)
100 loops, best of 3: 14.7 ms per loop
Overall the structured array approach is faster, and it can be made faster still if you design it to work with the flipped and transposed versions of a and v. Its advantage grows as the number of rows/keys goes up: it barely slows down when going from 2 rows to 10 rows.
I did not notice any significant timing difference between using a_sorted and using a with sorter=sorted_idx, so I left those out for clarity.
I believe that a really fast method could be made using Cython, but this is as fast as it is going to get with pure Python and numpy.
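If you want to exploit that "flipped and transposed" point, here is a minimal sketch (my own, not part of the benchmark above) that performs the structured-view conversion of a once up front, so each query call only converts v; the helper names are hypothetical and it assumes the queries share the key dtype.
from itertools import repeat
import numpy as np
def make_lex_table(a):
    # one-time conversion of the sorted key rows (increasing priority, as in
    # lexsearch2) into a 1-D structured array
    a = np.asarray(a)
    dt = np.dtype(list(zip(repeat(''), repeat(a.dtype, a.shape[0]))))
    table = np.ascontiguousarray(a[::-1, :].T).view(dt).squeeze(-1)
    return table, dt
def lexsearch2_pre(table, dt, v, side='left'):
    # per-query work: only the query points get converted here
    v = np.asarray(v)
    if v.ndim == 1:
        v = v[:, np.newaxis]
    vq = np.ascontiguousarray(v[::-1, :].T, dtype=dt[0]).view(dt).squeeze(-1)
    return table.searchsorted(vq, side=side)
# table, dt = make_lex_table(a_sorted)
# lexsearch2_pre(table, dt, v)   # same result as lexsearch2(a_sorted, v)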

Related

Pandas: drop rows lower than others in all columns

I have a dataframe with a lot of rows with numerical columns, such as:
A   B   C   D
12  7   1   0
7   1   2   0
1   1   1   1
2   2   0   0
I need to reduce the size of the dataframe by removing any row for which some other row has all values bigger.
In the previous example I need to remove the last row, because the first row has all values bigger (in case of duplicate rows I need to keep one of them).
And return this:
A   B   C   D
12  7   1   0
7   1   2   0
1   1   1   1
My fastest solution so far is the following:
import numpy
def complete_reduction(df, columns):
    def _single_reduction(row):
        df["check"] = True
        for col in columns:
            df["check"] = df["check"] & (df[col] >= row[col])
        drop_index.append(df["check"].sum() == 1)
    df = df.drop_duplicates(subset=columns)
    drop_index = []
    df.apply(lambda x: _single_reduction(x), axis=1)
    df = df[numpy.array(drop_index).astype(bool)]
    return df
Any better ideas?
Update:
A new solution has been found here
https://stackoverflow.com/a/68528943/11327160
but I hope for something faster.
A more memory-efficient and faster solution than the ones proposed so far is to use Numba. There is no need to create a huge temporary array with Numba. Moreover, it is easy to write a parallel implementation that makes use of all CPU cores. Here is the implementation:
import numba as nb
import numpy as np
@nb.njit
def is_dominated(arr, k):
    n, m = arr.shape
    for i in range(n):
        if i != k:
            dominated = True
            for j in range(m):
                if arr[i, j] < arr[k, j]:
                    dominated = False
            if dominated:
                return True
    return False
# Precompile the function to native code for the most common types
@nb.njit(['(i4[:,::1],)', '(i8[:,::1],)'], parallel=True, cache=True)
def dominated_rows(arr):
    n, m = arr.shape
    toRemove = np.empty(n, dtype=np.bool_)
    for i in nb.prange(n):
        toRemove[i] = is_dominated(arr, i)
    return toRemove
# Special case
df2 = df.drop_duplicates()
# Main computation
result = df2[~dominated_rows(np.ascontiguousarray(df2.values))]
Benchmark
The input test is two random dataframes of shape 20000x5 and 5000x100 containing small integers (i.e. in [0;100[). Tests have been done on a (6-core) i5-9600KF processor with 16 GiB of RAM on Windows. The version of @BingWang is the updated one of 2022-05-24. Here are the performance results of the proposed approaches so far:
Dataframe with shape 5000x100
- Initial code: 114_340 ms
- BENY: 2_716 ms (consume few GiB of RAM)
- Bing Wang: 2_619 ms
- Numba: 303 ms <----
Dataframe with shape 20000x5
- Initial code: (too long)
- BENY: 8.775 ms (consume few GiB of RAM)
- Bing Wang: 578 ms
- Numba: 21 ms <----
This solution is respectively about 9 to 28 times faster than the fastest of the others (the one of @BingWang). It also has the benefit of consuming far less memory: the @BENY implementation consumes a few GiB of RAM, while this one (and the one of @BingWang) consumes no more than a few MiB for this use case. The speed gain over the @BingWang implementation is due to the early stop, parallelism and native execution.
One can see that this Numba implementation and the one of @BingWang are quite efficient when the number of columns is small. This makes sense for the @BingWang one, since the complexity should be O(N(logN)^(d-2)) where d is the number of columns. As for Numba, it is significantly faster because most rows are dominated in the second random dataset, which makes the early stop very effective in practice. I think the @BingWang algorithm might be faster when most rows are not dominated. However, this case should be very uncommon on dataframes with few columns and a lot of rows (at least, clearly on uniformly random ones).
We can use numpy broadcasting:
s = df.values
out = df[np.sum(np.all(s>=s[:,None],-1),1)==1]
Out[44]:
A B C D
0 12 7 1 0
1 7 1 2 0
2 1 1 1 1
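For readers puzzling over that one-liner, here is the same computation unpacked step by step (my annotations, not part of the original answer):
import numpy as np
import pandas as pd
df = pd.DataFrame([[12, 7, 1, 0], [7, 1, 2, 0], [1, 1, 1, 1], [2, 2, 0, 0]],
                  columns=list("ABCD"))
s = df.values                      # (n, m) array of rows
ge = np.all(s >= s[:, None], -1)   # ge[i, j] is True when row j >= row i in every column
counts = ge.sum(1)                 # how many rows dominate-or-equal row i (itself included)
out = df[counts == 1]              # keep rows that are only dominated by themselves
Note that the intermediate s >= s[:, None] array has shape (n, n, m), which is why this approach can consume a few GiB of RAM on larger inputs, as noted in the benchmark above.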
Here is an attempt based on Kung et al. 1975
http://www.eecs.harvard.edu/~htk/publication/1975-jacm-kung-luccio-preparata.pdf
The brute-force solution is from https://stackoverflow.com/a/68528943/11327160
I didn't robustly test it, but with these parameters it gives the same answer.
There is no guarantee it is correct, or that I am even following the paper. Please test thoroughly. In addition, there is very likely a commercial solution for calculating this.
D=5 #dimension, or number of columns
N=2000 #number of data rows
M=1000 #upper bound for random integers
Changing to D=20 and N=20000, you can see Kung75 completes in under 1 minute while brute force takes more than 10x as long.
Even at Dimension=1000, Rows=20000, value range 0~999, it still completes in slightly over 1 minute.
This can be revised to work like merge sort (compute small chunks by brute force, then merge up with Filter), which makes it easier to switch to parallel computing.
Another way to speed it up is to turn off array bounds checking once you are comfortable with the code, since there is heavy array indexing here. I would recommend C# if you want to try that path.
import pandas as pd
import numpy as np
import datetime
#generate fake data
D=1000 #dimension, or number of columns
N=20000 #number of data rows
M=1000 #upper bound for random integers
np.random.seed(12345) #set seed so this is reproducible
data=np.random.randint(0,M,(N,D))
for i in range(0,12):
    print(i,data[i])
#Compare w and v starting at dimension d
def Compare(w,v,d=0):
    cmp=3 #0x11, low bit is GE, high bit is LE, together means EQ
    while d<D:
        if w[d]>v[d]:
            cmp&=1
        elif w[d]<v[d]:
            cmp&=2
        if cmp>0:
            d+=1
        else:
            break
    return cmp # 0=uncomparable, 1=GT, 2=LT, 3=EQ
#unit test:
#print(Compare(data[0],data[1]))
#print(Compare(data[0],data[1],4))
#print(Compare(data[1],data[11]))
#print(Compare(data[11],data[1]))
#print(Compare(data[1],data[1]))
def AuxSort(d,ndxArray): #stable sort desc by dimension d
    return [x[1] for x in sorted([(-data[n][d],n) for n in ndxArray])]
#unit test
#print(AuxSort(data,0,[0,4,3]))
#print(AuxSort(data,2,[0,1,2]))
#cumulatively find the pareto front. Time O(N^2), space O(N)
def N2BrutalForce(data,ndxArray=None,d=0):
    if len(data)==0:
        return []
    if not ndxArray: #by default check the entire data
        ndxArray=list(range(len(data)))
    #up to this point ndxArray is not empty
    result={ndxArray[0]:data[ndxArray[0]]}
    for i in range(1,len(ndxArray)):
        dominated=[]
        j=ndxArray[i]
        for k,v in result.items():
            c=Compare(data[j],v,d)
            if c>1:
                break
            elif c==1:
                dominated.append(k)
        else:
            for o in dominated:
                del result[o]
            result[j]=data[j]
    return [r for r in result]
def resultPrinter(res, ShowCountOnly=False):
    if not ShowCountOnly:
        for r in sorted(res):
            print(r,data[r])
    print(len(res),'results found',datetime.datetime.today())
#unit test
#resultPrinter(N2BrutalForce(data),True)
#resultPrinter(N2BrutalForce(data,list(range(15))))
def FindT(R1,R2,S1,S2,d):
    S1R1=set(Filter(data,d,R1,S1))
    T1=[s for s in S1 if s in S1R1]
    S2R1=Filter(data,d+1,R1,S2)
    S2R2=set(Filter(data,d,R2,S2))
    T2=[s for s in S2R1 if s in S2R2]
    return T1+T2
def BreakAtPseudoMedian(sArray,d):
    sArray=AuxSort(d,sArray) #this could speed up by moving the sort to the caller and avoiding re-sorting
    if data[sArray[0]][d]==data[sArray[-1]][d]:
        return [],sArray
    L=len(sArray)
    mHigh=mLow=L//2
    while mLow>0 and data[sArray[mLow]][d]==data[sArray[mLow-1]][d]:
        mLow-=1
    if mLow>0:
        return sArray[:mLow],sArray[mLow:]
    while mHigh<L-1 and data[sArray[mHigh]][d]==data[sArray[mHigh+1]][d]:
        mHigh+=1
    return sArray[:mHigh],sArray[mHigh:]
def Filter(data,d,rArray,sArray):
    L=len(rArray)+len(sArray)
    if d==D-1 and rArray:
        R=max(data[r][d] for r in rArray)
        return [s for s in sArray if data[s][d]>R]
    elif len(rArray)*len(sArray)<=30 or len(rArray)<=2 or len(sArray)<=2:
        nonDominated=[]
        for s in sArray:
            for r in rArray:
                c=Compare(data[s],data[r],d)
                if c>1:
                    break
            else:
                nonDominated.append(s)
        return nonDominated
    S1,S2=BreakAtPseudoMedian(sArray,d)
    R1,R2=BreakAtRefValue(rArray,d,data[S2[0]][d])
    if not S1 and not R1:
        return Filter(data,d+1,rArray,sArray)
    return FindT(R1,R2,S1,S2,d)
#Filter(data,0,[0,1,2,3,4,5,6,7,8,9],[11])
def BreakAtRefValue(rArray,d,br):
    rArray=AuxSort(d,rArray)
    if data[rArray[0]][d]<=br:
        return [],rArray
    if data[rArray[-1]][d]>br:
        return rArray,[]
    mLow,mHigh=0,len(rArray)-1
    while mLow<mHigh-1 and data[rArray[mLow]][d]>br and data[rArray[mHigh]][d]<br:
        mid=(mLow+mHigh)//2
        if data[rArray[mid]][d]>br:
            mLow=mid
        elif data[rArray[mid]][d]<br:
            mHigh=mid
        else:
            mLow=mid
            break
    if data[rArray[mLow]][d]>br and data[rArray[mHigh]][d]<br:
        return rArray[:mHigh],rArray[mHigh:]
    if data[rArray[mLow]][d]==br:
        while data[rArray[mLow-1]][d]==br:
            mLow-=1
        return rArray[:mLow],rArray[mLow:]
    while data[rArray[mHigh-1]][d]==br:
        mHigh-=1
    return rArray[:mHigh],rArray[mHigh:]
def Kung75(data,d,ndxArray):
    L=len(ndxArray)
    if L<10:
        return N2BrutalForce(data,ndxArray,d)
    elif d==D-1:
        x,y=-1,-1
        for n in ndxArray:
            if y<0 or data[n][d]>x:
                x,y=data[n][d],n
        return [y]
    if data[ndxArray[0]][d]==data[ndxArray[-1]][d]:
        return Kung75(data,d+1,AuxSort(d+1,ndxArray))
    R,S=BreakAtPseudoMedian(ndxArray,d)
    R=Kung75(data,d,R)
    S=Kung75(data,d,S)
    T=Filter(data,d+1,R,S)
    return R+T
print('started at',datetime.datetime.today())
resultPrinter(Kung75(data,0,AuxSort(0,list(range(len(data))))),True)
We take the cumulative maximum value per column in the dataframe.
We keep all rows that have at least one column value equal to that cumulative maximum, then drop duplicates using pandas drop_duplicates.
In [14]: df = pd.DataFrame(
...: [[12, 7, 1, 0], [7, 1, 2, 0], [1, 1, 1, 1], [2, 2, 0, 0]],
...: columns=["A", "B", "C", "D"],
...: )
In [15]: df[(df == df.cummax(axis=0)).any(axis=1)].drop_duplicates()
Out[15]:
A B C D
0 12 7 1 0
1 7 1 2 0
2 1 1 1 1
df.sort_values(by=['A', 'B', 'C', 'D'], ascending=False, inplace=True)
df = df.iloc[:cutoff]
If this takes too long you could do it on subsets of the df until it is small enough.

Fastest way to find the maximum minimum value of 'connected' matrices

The answer for three matrices was given in this question, but I'm not sure how to apply this logic to an arbitrary number of pairwise connected matrices:
f(i, j, k, l, ...) = min(A(i, j), B(i,k), C(i,l), D(j,k), E(j,l), F(k,l), ...)
Where A, B, ... are matrices and i, j, ... are indices that range up to the respective dimensions of the matrices. If we consider n indices, there are n(n-1)/2 pairs and thus n(n-1)/2 matrices. I would like to find (i,j,k,...) such that f(i,j,k,l,...) is maximized. I am currently doing that as follows:
import numpy as np
import itertools
# i j k l ...
dimensions = [50,50,50,50]
n_dims = len(dimensions)
pairs = list(itertools.combinations(range(n_dims), 2))
# Construct the matrices A(i,j), B(i,k), ...
matrices = []
for pair in pairs:
    matrices.append(np.random.rand(dimensions[pair[0]], dimensions[pair[1]]))
# All the different i,j,k,l... combinations
combinations = itertools.product(*list(map(np.arange,dimensions)))
combinations = np.asarray(list(combinations))
# Find the maximum minimum
vals = []
for i in range(len(pairs)):
    pair = pairs[i]
    matrix = matrices[i]
    vals.append(matrix[combinations[:,pair[0]], combinations[:,pair[1]]])
f = np.min(vals,axis=0)
best_indices = combinations[np.argmax(f)]
print(best_indices, np.max(f))
[5 17 17 18] 0.932985854758534
This is faster than iterating over all (i, j, k, l, ...), but a lot of time is spent constructing the combinations and vals matrices. Is there an alternative way to do this where (1) the speed of numpy's matrix computation can be preserved and (2) I don't have to construct the memory-intensive vals matrices?
Here is a generalisation of the 3D solution. I assume there are other (better?) ways of organising the recursion, but this works well enough. It does a 6D example (product of dims 9x10^6) in <10 ms.
Sample run; note that occasionally the indices returned by the two methods do not match. This is because they are not always unique: sometimes different index combinations yield the same maximum of minima. Also note that at the very end we do a single run of a huge 6D 9x10^12 example. Brute force is no longer viable on that; the smart method takes about 10 seconds.
trial 1
results identical True
results compatible True
brute force 276.8830654968042 ms
branch cut 9.971900499658659 ms
trial 2
results identical True
results compatible True
brute force 273.444719001418 ms
branch cut 9.236706099909497 ms
trial 3
results identical True
results compatible True
brute force 274.2998780013295 ms
branch cut 7.31226220013923 ms
trial 4
results identical True
results compatible True
brute force 273.0268925006385 ms
branch cut 6.956217200058745 ms
HUGE (100, 150, 200, 100, 150, 200) 9000000000000
branch cut 10246.754082996631 ms
Code:
import numpy as np
import itertools as it
import functools as ft
def bf(dims,pairs):
    dims,pairs = np.array(dims),np.array(pairs,object)
    n,m = len(dims),len(pairs)
    IDX = np.empty((m,n),object)
    Y,X = np.triu_indices(n,1)
    IDX[np.arange(m),Y] = slice(None)
    IDX[np.arange(m),X] = slice(None)
    idx = np.unravel_index(
        ft.reduce(np.minimum,(p[(*i,)] for p,i in zip(pairs,IDX))).argmax(),dims)
    return ft.reduce(np.minimum,(
        p[I] for p,I in zip(pairs,it.combinations(idx,2)))),idx
def cut(dims,pairs,offs=None):
    n = len(dims)
    if n<3:
        if n==2:
            A = pairs[0] if offs is None else np.minimum(
                pairs[0],np.minimum.outer(offs[0],offs[1]))
            idx = np.unravel_index(A.argmax(),dims)
            return A[idx],idx
        else:
            idx = offs[0].argmax()
            return offs[0][idx],(idx,)
    gmx = min(map(np.min,pairs))
    gidx = n * (0,)
    A = pairs[0] if offs is None else np.minimum(
        pairs[0],np.minimum.outer(offs[0],offs[1]))
    Y,X = np.unravel_index(A.argsort(axis=None)[::-1],dims[:2])
    for y,x in zip(Y,X):
        if A[y,x] <= gmx:
            return gmx,gidx
        coffs = [np.minimum(p1[y],p2[x])
                 for p1,p2 in zip(pairs[1:n-1],pairs[n-1:])]
        if not offs is None:
            coffs = [*map(np.minimum,coffs,offs[2:])]
        cmx,cidx = cut(dims[2:],pairs[2*n-3:],coffs)
        if cmx >= A[y,x]:
            return A[y,x],(y,x,*cidx)
        if gmx < cmx:
            gmx = min(A[y,x],cmx)
            gidx = y,x,*cidx
    return gmx,gidx
from timeit import timeit
IDX = 10,15,20,10,15,20
for rep in range(4):
    print("trial",rep+1)
    pairs = [np.random.rand(i,j) for i,j in it.combinations(IDX,2)]
    print("results identical",cut(IDX,pairs)==bf(IDX,pairs))
    print("results compatible",cut(IDX,pairs)[1]==bf(IDX,pairs)[1])
    print("brute force",timeit(lambda:bf(IDX,pairs),number=2)*500,"ms")
    print("branch cut",timeit(lambda:cut(IDX,pairs),number=10)*100,"ms")
IDX = 100,150,200,100,150,200
pairs = [np.random.rand(i,j) for i,j in it.combinations(IDX,2)]
print("HUGE",IDX,np.prod(IDX))
print("branch cut",timeit(lambda:cut(IDX,pairs),number=1)*1000,"ms")

How to replace a list of values in a numpy array?

I have an unsorted array of numbers.
I need to replace certain numbers (given in a list) with specific alternatives (also given in a corresponding list)
I wrote the following code (which seems to work):
import numpy as np
numbers = np.arange(0,40)
np.random.shuffle(numbers)
problem_numbers = [33, 23, 15] # table, night_stand, plant
alternative_numbers = [12, 14, 26] # desk, dresser, flower_pot
for i in range(len(problem_numbers)):
    idx = numbers == problem_numbers[i]
    numbers[idx] = alternative_numbers[i]
However, this seems highly inefficient (this needs to be done several millions of times for much larger arrays).
I found this question, which answers a similar problem; however, in my case the numbers are not sorted and they need to maintain their original location.
Note: numbers may contain multiple or no occurrences of elements in problem_numbers
EDIT: I implemented a TensorFlow version of this in this answer (almost exactly the same, except replacements are a dict).
Here is a simple way to do it:
import numpy as np
numbers = np.arange(0,40)
np.random.shuffle(numbers)
problem_numbers = [33, 23, 15] # table, night_stand, plant
alternative_numbers = [12, 14, 26] # desk, dresser, flower_pot
# Replace values
problem_numbers = np.asarray(problem_numbers)
alternative_numbers = np.asarray(alternative_numbers)
n_min, n_max = numbers.min(), numbers.max()
replacer = np.arange(n_min, n_max + 1)
# Mask replacements out of range
mask = (problem_numbers >= n_min) & (problem_numbers <= n_max)
replacer[problem_numbers[mask] - n_min] = alternative_numbers[mask]
numbers = replacer[numbers - n_min]
This works well and should be efficient as long as the range of the values in numbers (the difference between the smallest and the biggest) is not huge (e.g. you don't have something like 1, 7 and 10000000000).
Benchmarking
I've compared the code in the OP with the three (as of now) proposed solutions with this code:
import numpy as np
def method_itzik(numbers, problem_numbers, alternative_numbers):
    numbers = np.asarray(numbers)
    for i in range(len(problem_numbers)):
        idx = numbers == problem_numbers[i]
        numbers[idx] = alternative_numbers[i]
    return numbers
def method_mseifert(numbers, problem_numbers, alternative_numbers):
    numbers = np.asarray(numbers)
    replacer = dict(zip(problem_numbers, alternative_numbers))
    numbers_list = numbers.tolist()
    numbers = np.array(list(map(replacer.get, numbers_list, numbers_list)))
    return numbers
def method_divakar(numbers, problem_numbers, alternative_numbers):
    numbers = np.asarray(numbers)
    problem_numbers = np.asarray(problem_numbers)
    alternative_numbers = np.asarray(alternative_numbers)
    # Pre-process problem_numbers and correspondingly alternative_numbers
    # such that repeats and no matches are taken care of
    sidx_pn = problem_numbers.argsort()
    pn = problem_numbers[sidx_pn]
    mask = np.concatenate(([True],pn[1:] != pn[:-1]))
    an = alternative_numbers[sidx_pn]
    minN, maxN = numbers.min(), numbers.max()
    mask &= (pn >= minN) & (pn <= maxN)
    pn = pn[mask]
    an = an[mask]
    # Pre-processing done. Now, we need to use pn and an in place of
    # problem_numbers and alternative_numbers respectively. Map, index and assign.
    sidx = numbers.argsort()
    idx = sidx[np.searchsorted(numbers, pn, sorter=sidx)]
    valid_mask = numbers[idx] == pn
    numbers[idx[valid_mask]] = an[valid_mask]
def method_jdehesa(numbers, problem_numbers, alternative_numbers):
    numbers = np.asarray(numbers)
    problem_numbers = np.asarray(problem_numbers)
    alternative_numbers = np.asarray(alternative_numbers)
    n_min, n_max = numbers.min(), numbers.max()
    replacer = np.arange(n_min, n_max + 1)
    # Mask replacements out of range
    mask = (problem_numbers >= n_min) & (problem_numbers <= n_max)
    replacer[problem_numbers[mask] - n_min] = alternative_numbers[mask]
    numbers = replacer[numbers - n_min]
    return numbers
The results:
import numpy as np
np.random.seed(100)
MAX_NUM = 100000
numbers = np.random.randint(0, MAX_NUM, size=100000)
problem_numbers = np.unique(np.random.randint(0, MAX_NUM, size=500))
alternative_numbers = np.random.randint(0, MAX_NUM, size=len(problem_numbers))
%timeit method_itzik(numbers, problem_numbers, alternative_numbers)
10 loops, best of 3: 63.3 ms per loop
# This method expects lists
problem_numbers_l = list(problem_numbers)
alternative_numbers_l = list(alternative_numbers)
%timeit method_mseifert(numbers, problem_numbers_l, alternative_numbers_l)
10 loops, best of 3: 20.5 ms per loop
%timeit method_divakar(numbers, problem_numbers, alternative_numbers)
100 loops, best of 3: 9.45 ms per loop
%timeit method_jdehesa(numbers, problem_numbers, alternative_numbers)
1000 loops, best of 3: 822 µs per loop
In case not all problem_values are in numbers and they may even occur multiple times:
In that case I would just use a dict to keep the values to be replaced and use dict.get to translate problematic numbers:
replacer = dict(zip(problem_numbers, alternative_numbers))
numbers_list = numbers.tolist()
numbers = np.array(list(map(replacer.get, numbers_list, numbers_list)))
Even though it has to go "through Python" this is almost self-explaining and it's not much slower than a NumPy solution (probably).
In case every problem_value is actually present in the numbers array and only once:
If you have the numpy_indexed package you could simply use numpy_indexed.indices:
>>> import numpy_indexed as ni
>>> numbers[ni.indices(numbers, problem_numbers)] = alternative_numbers
That should be pretty efficient even for big arrays.

why is len so much more efficient on DataFrame than on underlying numpy array?

I've noticed that using len on a DataFrame is far quicker than using len on the underlying numpy array. I don't understand why. Accessing the same information via shape isn't any help either. This is more relevant as I try to get at the number of columns and number of rows. I was always debating which method to use.
I put together the following experiment and it's very clear that I will be using len on the dataframe. But can someone explain why?
from timeit import timeit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
ns = np.power(10, np.arange(6))
results = pd.DataFrame(
    columns=ns,
    index=pd.MultiIndex.from_product(
        [['len', 'len(values)', 'shape'],
         ns]))
dfs = {(n, m): pd.DataFrame(np.zeros((n, m))) for n in ns for m in ns}
for n, m in dfs.keys():
    df = dfs[(n, m)]
    results.loc[('len', n), m] = timeit('len(df)', 'from __main__ import df', number=10000)
    results.loc[('len(values)', n), m] = timeit('len(df.values)', 'from __main__ import df', number=10000)
    results.loc[('shape', n), m] = timeit('df.values.shape', 'from __main__ import df', number=10000)
fig, axes = plt.subplots(2, 3, figsize=(9, 6), sharex=True, sharey=True)
for i, (m, col) in enumerate(results.iteritems()):
    r, c = i // 3, i % 3
    col.unstack(0).plot.bar(ax=axes[r, c], title=m)
From looking at the various methods, the main reason is that constructing the numpy array df.values takes the lion's share of the time.
len(df) and df.shape
These two are fast because they are essentially
len(df.index._data)
and
(len(df.index._data), len(df.columns._data))
where _data is a numpy.ndarray. Thus, using df.shape should be half as fast as len(df) because it's finding the length of both df.index and df.columns (both of type pd.Index)
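For illustration, here is a small self-contained check of that claim (my addition; the absolute numbers depend on the pandas version and machine, so none are quoted):
from timeit import timeit
import numpy as np
import pandas as pd
df = pd.DataFrame(np.zeros((100000, 10)))
# len(df) and df.shape only touch the axis objects; len(df.values) has to
# materialize the ndarray first.
print(timeit(lambda: len(df), number=10000))
print(timeit(lambda: df.shape, number=10000))
print(timeit(lambda: len(df.values), number=100))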
len(df.values) and df.values.shape
Let's say you had already extracted vals = df.values. Then
In [1]: df = pd.DataFrame(np.random.rand(1000, 10), columns=range(10))
In [2]: vals = df.values
In [3]: %timeit len(vals)
10000000 loops, best of 3: 35.4 ns per loop
In [4]: %timeit vals.shape
10000000 loops, best of 3: 51.7 ns per loop
Compared to:
In [5]: %timeit len(df.values)
100000 loops, best of 3: 3.55 µs per loop
So the bottleneck is not len but how df.values is constructed. If you examine pandas.DataFrame.values(), you'll find the (roughly equivalent) methods:
def values(self):
    return self.as_matrix()

def as_matrix(self, columns=None):
    self._consolidate_inplace()
    if self._AXIS_REVERSED:
        return self._data.as_matrix(columns).T
    if len(self._data.blocks) == 0:
        return np.empty(self._data.shape, dtype=float)
    if columns is not None:
        mgr = self._data.reindex_axis(columns, axis=0)
    else:
        mgr = self._data
    if self._data._is_single_block or not self._data.is_mixed_type:
        return mgr.blocks[0].get_values()
    else:
        dtype = _interleaved_dtype(self.blocks)
        result = np.empty(self.shape, dtype=dtype)
        if result.shape[0] == 0:
            return result
        itemmask = np.zeros(self.shape[0])
        for blk in self.blocks:
            rl = blk.mgr_locs
            result[rl.indexer] = blk.get_values(dtype)
            itemmask[rl.indexer] = 1
        # vvv here is your final array assuming you actually have data
        return result

def _consolidate_inplace(self):
    def f():
        if self._data.is_consolidated():
            return self._data
        bm = self._data.__class__(self._data.blocks, self._data.axes)
        bm._is_consolidated = False
        bm._consolidate_inplace()
        return bm
    self._protect_consolidate(f)

def _protect_consolidate(self, f):
    blocks_before = len(self._data.blocks)
    result = f()
    if len(self._data.blocks) != blocks_before:
        if i is not None:
            self._item_cache.pop(i, None)
        else:
            self._item_cache.clear()
    return result
Note that df._data is a pandas.core.internals.BlockManager, not a numpy.ndarray.
If you look at __len__ for pd.DataFrame, they actually just call len(df.index):
https://github.com/pandas-dev/pandas/blob/master/pandas/core/frame.py#L770
For a RangeIndex, this is a really fast operation since it's just a subtraction and division of values stored within the index object:
return max(0, -(-(self._stop - self._start) // self._step))
https://github.com/pandas-dev/pandas/blob/master/pandas/indexes/range.py#L458
I suspect that if you tested with a non-RangeIndex, the difference in times would be much more similar. I'll probably try modifying what you have to see if that's the case.
EDIT: After a quick check, the speed difference still seems to hold even with a standard Index, so there must still be some other optimization there.
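A sketch of that quick check (my addition): force a plain Index and re-time; no numbers are claimed here.
from timeit import timeit
import numpy as np
import pandas as pd
df = pd.DataFrame(np.zeros((100000, 10)))   # RangeIndex
df_obj = df.copy()
df_obj.index = df.index.astype(str)         # ordinary object Index
print(timeit(lambda: len(df), number=100000))
print(timeit(lambda: len(df_obj), number=100000))
print(timeit(lambda: len(df.values), number=100))
print(timeit(lambda: len(df_obj.values), number=100))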

Kinect + Python - Fill depth for shadows

The Kinect camera returns a depth image for the whole view. Due to the way the image is captured, some small areas are invisible to the camera. For those areas 2047 is returned.
I want to fill those areas with the value that is left of them - which is the most likely value for that area. I have the depth as numpy uint16 array. The trivial solution would be:
for x in xrange(depth.shape[1]):
    for y in xrange(depth.shape[0]):
        if depth[y,x] == 2047 and x > 0:
            depth[y,x] = depth[y,x-1]
This takes around 16 seconds to execute per 640 x 480 frame (Raspberry Pi 2).
I came up with a solution using indexes:
w = numpy.where(depth == 2047)
w = zip(w[0], w[1])
for index in w:
    if depth[index] == 2047 and index[1] > 0:
        depth[index] = depth[index[0],index[1] - 1]
This takes around 0.6 seconds to execute for a test frame. Much faster but still far from perfect. Index computation and zip only take 0.04 seconds, so the main performance killer is the loop.
I reduced it to 0.3 seconds by using item():
for index in w:
    if depth.item(index) == 2047 and index[1] > 0:
        depth.itemset(index, depth.item(index[0],index[1] - 1))
Can this be improved further using only python (+numpy/opencv)? Compared to how fast simple filtering is, it should be possible to be faster than 0.05s
You have islands going behind the places where the elements in the input array are 2047. The job here is to fill the shadows with the element right before the start of each shadow. So, one way would be to find the start and stop places of those islands and put x and -x at those places respectively, where x is the element right before the start of each island. Then, do cumsum along the rows, which effectively fills the shadow-islands with x. That's all there is to the vectorized solution! Here's the implementation -
# Get mask of places to be updated
mask = np.zeros(np.array(depth.shape) + [0,1],dtype=bool)
mask[:,1:-1] = depth[:,1:] == 2047
# Get differentiation along the second axis and thus island start and stops
diffs = np.diff(mask.astype(int),axis=1)
start_mask = diffs == 1
stop_mask = diffs == -1
# Get a mapping array that has island places filled with the start-1 element
map_arr = np.zeros_like(diffs)
map_arr[start_mask] = depth[start_mask]
map_arr[stop_mask] = -depth[start_mask]
map_filled_arr = map_arr.cumsum(1)[:,:-1]
# Use mask created earlier to selectively set elements from map array
valid_mask = mask[:,1:-1]
depth[:,1:][valid_mask] = map_filled_arr[valid_mask]
Benchmarking
Define functions :
def fill_depth_original(depth):
    for x in xrange(depth.shape[1]):
        for y in xrange(depth.shape[0]):
            if depth[y,x] == 2047 and x > 0:
                depth[y,x] = depth[y,x-1]
def fill_depth_original_v2(depth):
    w = np.where(depth == 2047)
    w = zip(w[0], w[1])
    for index in w:
        if depth[index] == 2047 and index[1] > 0:
            depth[index] = depth[index[0],index[1] - 1]
def fill_depth_vectorized(depth):
    mask = np.zeros(np.array(depth.shape) + [0,1],dtype=bool)
    mask[:,1:-1] = depth[:,1:] == 2047
    diffs = np.diff(mask.astype(int),axis=1)
    start_mask = diffs == 1
    stop_mask = diffs == -1
    map_arr = np.zeros_like(diffs)
    map_arr[start_mask] = depth[start_mask]
    map_arr[stop_mask] = -depth[start_mask]
    map_filled_arr = map_arr.cumsum(1)[:,:-1]
    valid_mask = mask[:,1:-1]
    depth[:,1:][valid_mask] = map_filled_arr[valid_mask]
Runtime tests and verify outputs :
In [303]: # Create a random array and get a copy for profiling vectorized method
...: depth = np.random.randint(2047-150,2047+150,(500,500))
...: depthc1 = depth.copy()
...: depthc2 = depth.copy()
...:
In [304]: fill_depth_original(depth)
...: fill_depth_original_v2(depthc1)
...: fill_depth_vectorized(depthc2)
...:
In [305]: np.allclose(depth,depthc1)
Out[305]: True
In [306]: np.allclose(depth,depthc2)
Out[306]: True
In [307]: # Create a random array and get a copy for profiling vectorized method
...: depth = np.random.randint(2047-150,2047+150,(500,500))
...: depthc1 = depth.copy()
...: depthc2 = depth.copy()
...:
In [308]: %timeit fill_depth_original(depth)
...: %timeit fill_depth_original_v2(depthc1)
...: %timeit fill_depth_vectorized(depthc2)
...:
10 loops, best of 3: 89.6 ms per loop
1000 loops, best of 3: 1.47 ms per loop
100 loops, best of 3: 10.3 ms per loop
So, the second approach listed in the question still looks like winning!
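For completeness, another vectorized way to do this forward fill that often comes up is to propagate the index of the last valid column with np.maximum.accumulate. This is my own addition and not one of the timed approaches above:
import numpy as np
def fill_depth_ffill(depth, invalid=2047):
    # index of the most recent valid (non-shadow) column for every pixel
    mask = depth == invalid
    idx = np.where(~mask, np.arange(depth.shape[1]), 0)
    np.maximum.accumulate(idx, axis=1, out=idx)
    # gather: shadow pixels take the value of the last valid pixel to their left;
    # a shadow starting in column 0 keeps its 2047, matching the x > 0 guard above
    return depth[np.arange(depth.shape[0])[:, None], idx]
Unlike the loops above, it returns a new array instead of filling in place; I have not timed it against fill_depth_vectorized.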
