Is there a faster way to mask an array? - python

I have a numpy array and I need to mask it.
My function looks like this:
def mask_arr(arr, min, max):
    for i in range(arr.size):
        if arr[i] < min:
            arr[i] = 0
        elif arr[i] > max:
            arr[i] = 1
        else:
            arr[i] = 10
The problem is that the array is huge, so masking it takes a long time.
How can I achieve the same result, but faster?

You can use nested np.where, like the following:
import numpy as np
q = np.random.rand(4,4)
# array([[0.86305369, 0.88477713, 0.58776518, 0.69122533],
# [0.52591559, 0.33155238, 0.50139987, 0.66812239],
# [0.83240284, 0.70147098, 0.17118681, 0.59652636],
# [0.82031661, 0.32032657, 0.55088698, 0.28931661]])
np.where(q > 0.8, 1, np.where(q < 0.3, 0, 10))
# array([[ 1, 1, 10, 10],
# [10, 10, 10, 10],
# [ 1, 10, 0, 10],
# [ 1, 10, 10, 0]])
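As a side note, the same three-way mask can be written with np.select, which some find easier to read than nested np.where calls. A minimal sketch, equivalent to the line above:
import numpy as np

q = np.random.rand(4, 4)
# conditions are checked in order; elements matching neither get the default
masked = np.select([q > 0.8, q < 0.3], [1, 0], default=10)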
Edit:
Based on your question: for elements that are neither greater than maxVal nor smaller than minVal, you can apply any other logic you want, for example doubling them:
import numpy as np
q = np.random.rand(4,4)
minVal = 0.3
maxVal = 0.8
qq = np.where(q > maxVal, 1, np.where(q < minVal, 0, 2 * q))
Where q is:
[[0.63604995 0.18637738 0.90680287 0.64617278]
[0.97435344 0.04670638 0.3510053 0.71613776]
[0.17973416 0.50296747 0.35085383 0.853201 ]
[0.27820978 0.69438172 0.96186074 0.96625938]]
And qq is:
[[1.27209991 0. 1. 1.29234556]
[1. 0. 0.7020106 1.43227553]
[0. 1.00593493 0.70170767 1. ]
[0. 1.38876345 1. 1. ]]

Solution
You could use three simple assignments based on your rules. This uses the native vectorization available in numpy and hence will be much faster than the explicit loop you tried.
# minval, maxval = 0.3, 0.8
condition = np.logical_and(a >= minval, a <= maxval)
a[a < minval] = 0
a[a > maxval] = 1
# pick exactly one of the following two updates:
a[condition] = 10   # if a constant value of 10
a[condition] *= 2   # if each element gets multiplied by 2
Output:
[[10. 0. 10. 1. 0.]
[10. 10. 10. 0. 10.]
[ 1. 10. 10. 1. 1.]
[ 0. 1. 10. 10. 0.]
[ 0. 0. 10. 10. 10.]]
Dummy Data
a = np.random.rand(5,5)
Output:
array([[0.68554168, 0.27430639, 0.4382025 , 0.97162651, 0.16740865],
[0.32530579, 0.3415287 , 0.45920916, 0.09422211, 0.75247522],
[0.91621921, 0.65845783, 0.38678723, 0.83644281, 0.95865701],
[0.26290637, 0.83810284, 0.55327399, 0.3406887 , 0.26173914],
[0.24974815, 0.08543414, 0.78509214, 0.64663201, 0.61502744]])
Convenience Function
Since you mentioned that you could also self-multiply the target elements by a factor of two, I extended that functionality to either absolute assignment (setting a value of 10) or relative update (add, subtract, multiply, divide) w.r.t. the current values of the array.
def mask_arr(arr,
             minval: float = 0.3,
             maxval: float = 0.8,
             update_type: str = 'abs',
             update_value: float = 10,
             rel_update_method: str = '*',
             mask_floor: float = 0.0,
             mask_ceiling: float = 1.0):
    """Returns the array arr after setting lower-bound (mask_floor),
    upper-bound (mask_ceiling), and logic-for-in-between-values.
    """
    condition = np.logical_and(arr >= minval, arr <= maxval)
    arr[arr < minval] = mask_floor
    arr[arr > maxval] = mask_ceiling
    if update_type == 'abs':
        # absolute update
        arr[condition] = update_value
    if update_type == 'rel':
        # relative update
        if rel_update_method == '+':
            arr[condition] += update_value
        if rel_update_method == '-':
            arr[condition] -= update_value
        if rel_update_method == '*':
            arr[condition] *= update_value
        if rel_update_method == '/':
            arr[condition] /= update_value
    return arr
Example
# declare all inputs
arr = mask_arr(arr,
               minval=0.3,
               maxval=0.8,
               update_type='rel',
               update_value=2.0,
               rel_update_method='*',
               mask_floor=0.0,
               mask_ceiling=1.0)
# using defaults for
#   mask_floor = 0.0,
#   mask_ceiling = 1.0
arr = mask_arr(arr,
               minval=0.3,
               maxval=0.8,
               update_type='rel',
               update_value=2.0,
               rel_update_method='*')
# using defaults as before and
# setting a fixed value of 10
arr = mask_arr(arr,
               minval=0.3,
               maxval=0.8,
               update_type='abs',
               update_value=10.0)

With numpy you don't need to do loops for such operations.
Also, I would recommend not using min and max as variable names, since they shadow Python's built-in functions.
Try the following:
mask = (arr >= min_val) & (arr <= max_val)
arr[arr < min_val] = 0
arr[arr > max_val] = 1
arr[mask] = 10
Note that the mask for the in-between values is computed before the first two assignments, so the freshly assigned 0s and 1s cannot be caught by the last one.
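For a rough sense of the speedup, here is an illustrative timing sketch; the array size of 10**6 is an assumption, and the absolute numbers will vary by machine:
import numpy as np
import timeit

arr = np.random.rand(10**6)
min_val, max_val = 0.3, 0.8

def loop_mask(a):
    # element-by-element Python loop, as in the question
    out = a.copy()
    for i in range(out.size):
        if out[i] < min_val:
            out[i] = 0
        elif out[i] > max_val:
            out[i] = 1
        else:
            out[i] = 10
    return out

def vectorized_mask(a):
    # boolean-indexing version from the answers above
    out = a.copy()
    between = (out >= min_val) & (out <= max_val)
    out[out < min_val] = 0
    out[out > max_val] = 1
    out[between] = 10
    return out

print(timeit.timeit(lambda: loop_mask(arr), number=1))        # seconds for one pass
print(timeit.timeit(lambda: vectorized_mask(arr), number=1))  # typically orders of magnitude smaller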

Related

Find all the intervals in numpy

I have an xy numpy array with shape (600, 2):
array([[0. , 0.01 ],
[0.02 , 0.03 ],
[0.04 , 0.05 ],
...,
[1.21943121, 1.14205236],
[1.07493206, 1.01916783],
[0.97570154, 0.94530397]])
I need to find all the intervals in which the values of the second dimension are less than zero, mark them as +, and print them with the index from the first dimension.
Output example:
[0.00 0.04] +
[0.04 0.08] -
[0.08 0.10] +
I would be very grateful if you could help me!
This would select the rows whose second value is less than 0:
filter = [i for i in arr if i[1] < 0]
If you need the indices then,
ind = []
for i in range(len(arr)):
    if arr[i][1] < 0:
        ind.append(i)
In Numpy, it would be simply:
np.where(arr[:,1] < 0)
This will get you the indices.
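If you also need the runs grouped into intervals with the +/- marks from the question, a minimal sketch could look like the following; it assumes the array is shaped (600, 2) with the x values in the first column, and the random data is just a stand-in:
import numpy as np

arr = np.random.randn(600, 2)  # stand-in for the real data
mask = arr[:, 1] < 0
# run boundaries: positions where the mask flips between True and False
edges = np.flatnonzero(np.diff(mask.astype(int))) + 1
for run in np.split(np.arange(len(mask)), edges):
    sign = '+' if mask[run[0]] else '-'
    print("[{:.2f} {:.2f}] {}".format(arr[run[0], 0], arr[run[-1], 0], sign))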

Speed up numpy conditional replacement

I'm trying to speed up the inner most loop:
import cv2
import datetime

gt = cv2.imread("image.png", 0)
start = datetime.datetime.now()
for i in range(gt.shape[0]):
    for j in range(gt.shape[1]):
        if gt[i, j] == 0:
            continue
        limit = min(60, gt.shape[1] - j)
        for d in range(limit):  # <------- this one
            if gt[i, j] < gt[i, j + d] - d:
                gt[i, j] = 0
                break
print(datetime.datetime.now() - start)
Is there any way to rewrite it using numpy built-in operations? At the moment it is very slow, around 46 seconds per image. I already tried something like:
gt[ gt[i,j+range(d)] - range(d)]=0
but of course it doesn't work, since you can't add an int to a list.
The ds can be represented with a constant np.arange(60) array, and the gt[i,j+d] with gt[i, j:j+d].
import numpy as np

ds = np.arange(60)
for i in range(gt.shape[0]):
    for j in range(gt.shape[1]):
        if gt[i, j] == 0:
            continue
        limit = min(60, gt.shape[1] - j)
        if np.any(gt[i, j] < (gt[i, j:j+limit] - ds[:limit])):
            gt[i, j] = 0
If you don't have memory concerns, this can probably be vectorized further. For instance:
mask = np.zeros(gt.shape, dtype="bool")
for dist in range(1, 60):
    diffs = gt[:, :-dist] < (gt[:, dist:] - dist)
    mask[:, :-dist] |= diffs
gt[mask] = 0
When working with unsigned ints, diffs should be calculated with:
diffs = (gt2[:, :-dist] < (gt2[:, dist:] - dist)) & (gt2[:, dist:]>dist)
to prevent overflow issues.
This gives approx. a 200x speedup on a 200x200 px image.
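As a quick illustration of the overflow point: with unsigned dtypes, subtraction wraps around instead of going negative, so gt2[:, dist:] - dist can silently produce huge values.
import numpy as np

a = np.array([5], dtype=np.uint8)
print(a - 10)  # [251] -- wraps around instead of giving -5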
gt = np.array([[1, 1, 1], [1.5, 2.5, 2.5], [2, 3.5, 4]])
limit = 2
m = np.zeros_like(gt, dtype=bool)
for d in range(1, limit + 1):
    if d >= gt.shape[0]:
        break
    shifted = np.roll(gt, shift=-d, axis=0)
    shifted[-d:] = np.nan
    m = m | (gt < shifted - d)
gt[m] = 0
print(gt)
outputs
array([[1. , 0. , 0. ],
[1.5, 2.5, 0. ],
[2. , 3.5, 4. ]])

numpy interpolation to increase a vector size

Hi, I need to increase the number of points inside a vector to enlarge it to a fixed size. For example:
for this simple vector
>>> a = np.array([0, 1, 2, 3, 4, 5])
>>> len(a)
# 6
Now I want to get a vector of size 11, taking the vector a as a base; the result will be
# array([ 0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])
EDIT 1
What I need is a function that takes the base vector and the number of values the resulting vector must have, and returns a new vector with size equal to that parameter. Something like:
def enlargeVector(vector, size):
    .....
    return newVector
to use like:
>>> a = np.array([0, 1, 2, 3, 4, 5])
>>> b = enlargeVector(a, 200)
>>> len(b)
# 200
and b contains the data resulting from linear, cubic, or whatever interpolation method.
There are many methods to do this within scipy.interpolate. My favourite is UnivariateSpline, which produces an order k spline guaranteed to be differentiable k times.
To use it:
import numpy as np
from scipy.interpolate import UnivariateSpline

old_indices = np.arange(0, len(a))
new_length = 11
new_indices = np.linspace(0, len(a) - 1, new_length)
spl = UnivariateSpline(old_indices, a, k=3, s=0)
new_array = spl(new_indices)
The s is a smoothing factor that you should set to 0 in this case (since the data are exact).
Note that for the problem you have specified (since a just increases monotonically by 1), this is overkill, since the second np.linspace already gives the desired output.
EDIT: clarified that the length is arbitrary
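Wrapping the above into the function signature the question asked for, a minimal sketch (the extra k parameter is my own addition; k=1 would give linear interpolation):
import numpy as np
from scipy.interpolate import UnivariateSpline

def enlargeVector(vector, size, k=3):
    """Resample `vector` to `size` points with an order-k interpolating spline."""
    old_indices = np.arange(len(vector))
    new_indices = np.linspace(0, len(vector) - 1, size)
    spl = UnivariateSpline(old_indices, vector, k=k, s=0)
    return spl(new_indices)

a = np.array([0, 1, 2, 3, 4, 5])
b = enlargeVector(a, 11)
# array([0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])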
As AGML pointed out there are tools to do this, but how about a pure numpy solution:
In [20]: a = np.arange(6)
In [21]: temp = np.dstack((a[:-1], a[:-1] + np.diff(a) / 2.0)).ravel()
In [22]: temp
Out[22]: array([ 0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5])
In [23]: np.hstack((temp, [a[-1]]))
Out[23]: array([ 0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

Assign numpy array of points to a 2D square grid

I'm going beyond my previous question because of speed problems. I have an array of Lat/Lon coordinates of points, and I would like to assign them to an index code derived from a 2D square grid of equal-size cells. This is an example of how it would work. Let's call points my first array, containing the coordinates (call them [x y] pairs) of six points:
points = [[ 1.5 1.5]
[ 1.1 1.1]
[ 2.2 2.2]
[ 1.3 1.3]
[ 3.4 1.4]
[ 2. 1.5]]
Then I have another array containing the coordinates of the vertices of a grid of two cells in the form [minx,miny,maxx,maxy]; let's call it bounds:
bounds = [[ 0. 0. 2. 2.]
[ 2. 2. 3. 3.]]
I would like to find which points are in which boundary, and then assign a code derived from the bounds array index (in this case the first cell has code 0, the second 1 and so on...). Since the cells are squares, the easiest way to compute if each point is in each cell is to evaluate:
x > minx & x < maxx & y > miny & y < maxy
So that the resulting array would appear as:
results = [0 0 1 0 NaN NaN]
where NaN means that the point is outside the cells. My real case is of the order of finding 10^6 points in 10^4 cells. Is there a way to do this kind of thing fast using numpy arrays?
EDIT: to clarify, the expected results array means that the first point is inside the first cell (index 0 of the bounds array), so is the second, the third is inside the second cell of the bounds array, and so on...
Here is a vectorized approach to your problem. It should speed things up significantly.
import numpy as np

def findCells(points, bounds):
    # make sure points is n by 2 (pool.map might send us 1D arrays)
    points = points.reshape((-1, 2))
    # check for each point if all coordinates are in bounds
    # dimension 0 is bound
    # dimension 1 is point
    allInBounds = (points[:, 0] > bounds[:, None, 0])
    allInBounds &= (points[:, 1] > bounds[:, None, 1])
    allInBounds &= (points[:, 0] < bounds[:, None, 2])
    allInBounds &= (points[:, 1] < bounds[:, None, 3])
    # now find out the positions of all nonzero (i.e. true) values
    # nz[0] contains the indices along dim 0 (bound)
    # nz[1] contains the indices along dim 1 (point)
    nz = np.nonzero(allInBounds)
    # initialize the result with all nan
    r = np.full(points.shape[0], np.nan)
    # now use nz[1] to index point position and nz[0] to tell which cell the
    # point belongs to
    r[nz[1]] = nz[0]
    return r

def findCellsParallel(points, bounds, chunksize=100):
    import multiprocessing as mp
    from functools import partial
    func = partial(findCells, bounds=bounds)
    # using python3 you could also do 'with mp.Pool() as p:'
    p = mp.Pool()
    try:
        return np.hstack(p.map(func, points, chunksize))
    finally:
        p.close()

def main():
    nPoints = 10**6   # ints, since array shapes must be integers
    nBounds = 10**4
    # points = np.array([[1.5, 1.5],
    #                    [1.1, 1.1],
    #                    [2.2, 2.2],
    #                    [1.3, 1.3],
    #                    [3.4, 1.4],
    #                    [2. , 1.5]])
    points = np.random.random([nPoints, 2])
    # bounds = np.array([[0, 0, 2, 2],
    #                    [2, 2, 3, 3]])
    # bounds = np.array([[0, 0, 1.4, 1.4],
    #                    [1.4, 1.4, 2, 2],
    #                    [2, 2, 3, 3]])
    bounds = np.sort(np.random.random([nBounds, 2, 2]), 1).reshape(nBounds, 4)
    r = findCellsParallel(points, bounds)
    print(points[:10])
    for bIdx in np.unique(r[:10]):
        if np.isnan(bIdx):
            continue
        print("{}: {}".format(bIdx, bounds[int(bIdx)]))  # cast: float indices are invalid
    print(r[:10])

if __name__ == "__main__":
    main()
Edit:
Trying it with your amount of data gave me a MemoryError. You can avoid that and even speed things up a little more if you use multiprocessing.Pool with its map function, see updated code.
Result:
>time python test.py
[[ 0.69083585 0.19840985]
[ 0.31732711 0.80462512]
[ 0.30542996 0.08569184]
[ 0.72582609 0.46687164]
[ 0.50534322 0.35530554]
[ 0.93581095 0.36375539]
[ 0.66226118 0.62573407]
[ 0.08941219 0.05944215]
[ 0.43015872 0.95306899]
[ 0.43171644 0.74393729]]
9935.0: [ 0.31584562 0.18404152 0.98215445 0.83625487]
9963.0: [ 0.00526106 0.017255 0.33177741 0.9894455 ]
9989.0: [ 0.17328876 0.08181912 0.33170444 0.23493507]
9992.0: [ 0.34548987 0.15906761 0.92277442 0.9972481 ]
9993.0: [ 0.12448765 0.5404578 0.33981119 0.906822 ]
9996.0: [ 0.41198261 0.50958195 0.62843379 0.82677092]
9999.0: [ 0.437169 0.17833114 0.91096133 0.70713434]
[ 9999. 9993. 9989. 9999. 9999. 9935. 9999. 9963. 9992. 9996.]
real 0m 24.352s
user 3m 4.919s
sys 0m 1.464s
You can use a nested loop to check the condition and yield the results from a generator:
points = [[1.5, 1.5],
          [1.1, 1.1],
          [2.2, 2.2],
          [1.3, 1.3],
          [3.4, 1.4],
          [2. , 1.5]]
bounds = [[0., 0., 2., 2.],
          [2., 2., 3., 3.]]
def pos(p, b):
    for x, y in p:
        flag = False
        for index, dis in enumerate(b):
            minx, miny, maxx, maxy = dis
            if x > minx and x < maxx and y > miny and y < maxy:
                flag = True
                yield index
        if not flag:
            yield 'NaN'

print(list(pos(points, bounds)))
result:
[0, 0, 1, 0, 'NaN', 'NaN']
I would do it like this:
import numpy as np

points = np.random.rand(10, 2)
xmin = [0.25, 0.5]
ymin = [0.25, 0.5]
results = np.zeros(len(points))
for i in range(len(xmin)):
    bool_index_array = np.greater(points, [xmin[i], ymin[i]])
    print("boolean index of (x,y) greater (xmin, ymin): ", bool_index_array)
    indices_of_true_true = np.where(bool_index_array[:, 0] * bool_index_array[:, 1] == 1)[0]
    print("indices of [True,True]: ", indices_of_true_true)
    results[indices_of_true_true] += 1
print("results: ", results)
[out]: [ 1. 1. 1. 2. 0. 0. 1. 1. 1. 1.]
This uses the lower boundaries to categorize your points into the groups:
1 (if xmin[0] < x <= xmin[1] & ymin[0] < y <= ymin[1])
2 (if x > xmin[1] & y > ymin[1])
0 if none of the conditions above are fulfilled

Efficiently select subsection of numpy array

I want to split a numpy array into three different arrays based on a logical comparison. The numpy array I want to split is called x. Its shape looks as follows, but its entries vary: (In response to Saullo Castro's comment I included a slightly different array x.)
array([[ 0.46006547, 0.5580928 , 0.70164242, 0.84519205, 1.4 ],
[ 0.00912908, 0.00912908, 0.05 , 0.05 , 0.05 ]])
The values in each row of this array increase monotonically. I also have two other arrays called lowest_gridpoints and highest_gridpoints. The entries of these arrays also vary, but the shape is always identical to the following:
array([ 0.633, 0.01 ]), array([ 1.325, 0.99 ])
The selection procedure I want to apply is as follows:
All columns containing values lower than any value in lowest_gridpoints should be removed from x and constitute the array temp1.
All columns containing values higher than any value in highest_gridpoints should be removed from x and constitute the array temp2.
All columns of x that are included in neither temp1 nor temp2 constitute the array x_new.
The following code I wrote achieves the task.
if np.any(x[:, -1] > highest_gridpoints) or np.any(x[:, 0] < lowest_gridpoints):
    for idx, sample in enumerate(x.T):
        if np.any(sample > highest_gridpoints):
            max_idx = idx
            break
        elif np.any(sample < lowest_gridpoints):
            min_idx = idx

temp1, temp2 = np.array([[], []]), np.array([[], []])
if 'min_idx' in locals():
    temp1 = x[:, 0:min_idx+1]
if 'max_idx' in locals():
    temp2 = x[:, max_idx:]
if 'min_idx' in locals() or 'max_idx' in locals():
    if 'min_idx' not in locals():
        min_idx = -1
    if 'max_idx' not in locals():
        max_idx = x.shape[1]
    x_new = x[:, min_idx+1:max_idx]
However, I suspect that this code is very inefficient because of the heavy use of loops, and I think the syntax is bloated.
Does someone have an idea for code that achieves the task outlined above more efficiently, or more concisely?
This addresses only the first part of your question:
from numpy import *

x = array([[ 0.46006547, 0.5580928 , 0.70164242, 0.84519205, 1.4 ],
           [ 0.00912908, 0.00912908, 0.05      , 0.05      , 0.05]])
low, high = array([0.633, 0.01]), array([1.325, 0.99])

# construct an array of two rows of bools expressing your conditions
indices1 = array((x[0,:] < low[0], x[1,:] < low[1]))
print(indices1)
# do an or of the values along the first axis
indices1 = any(indices1, axis=0)
# now it's a single row array
print(indices1)
# use indices1 to extract what you want;
# the double transposition is because the elements
# of a 2d array are the rows
tmp1 = x.T[indices1].T
print(tmp1)
# [[ True True False False False]
# [ True True False False False]]
# [ True True False False False]
# [[ 0.46006547 0.5580928 ]
# [ 0.00912908 0.00912908]]
Next, construct indices2 and tmp2 similarly; the indices of the remnant are the negation of the OR of the first two index arrays (i.e., np.logical_not(np.logical_or(indices1, indices2))).
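A sketch of that remaining step, continuing the snippet above (so the star import and the variables x, high, and indices1 are already in scope):
# bools marking columns with any value above the high thresholds
indices2 = any(array((x[0,:] > high[0], x[1,:] > high[1])), axis=0)
tmp2 = x.T[indices2].T
# columns matched by neither condition make up x_new
x_new = x.T[logical_not(logical_or(indices1, indices2))].T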
Addendum
Another approach, possibly faster if you have thousands of entries, uses numpy.searchsorted:
from numpy import *

x = array([[ 0.46006547, 0.5580928 , 0.70164242, 0.84519205, 1.4 ],
           [ 0.00912908, 0.00912908, 0.05      , 0.05      , 0.05]])
low, high = array([0.633, 0.01]), array([1.325, 0.99])

l0r = searchsorted(x[0,:], low[0], side='right')
l1r = searchsorted(x[1,:], low[1], side='right')
h0l = searchsorted(x[0,:], high[0], side='left')
h1l = searchsorted(x[1,:], high[1], side='left')
lr = max(l0r, l1r)
hl = min(h0l, h1l)
print(lr, hl)
print(x[:, :lr])
print(x[:, lr:hl])
print(x[:, hl])
# 2 4
# [[ 0.46006547 0.5580928 ]
# [ 0.00912908 0.00912908]]
# [[ 0.70164242 0.84519205]
# [ 0.05 0.05 ]]
# [ 1.4 0.05]
Excluding overlaps can be obtained with hl = max(lr, hl). NB: in the previous approach the array slices are copied to new objects; here you get views on x, and you have to be explicit if you want new objects.
Edit: An unnecessary optimization
If we use only the upper part of x in the second pair of searchsorted calls (if you look at the code you'll see what I mean...) we get two benefits: 1) a very small speedup of the searches (searchsorted is always fast enough) and 2) the overlap case is handled automatically.
As a bonus, code for copying the segments of x into new arrays. NB: x was changed to force overlap.
from numpy import *

# I changed x to force overlap
x = array([[ 0.46006547, 1.4       , 1.4 , 1.4 , 1.4 ],
           [ 0.00912908, 0.00912908, 0.05, 0.05, 0.05]])
low, high = array([0.633, 0.01]), array([1.325, 0.99])

l0r = searchsorted(x[0,:], low[0], side='right')
l1r = searchsorted(x[1,:], low[1], side='right')
lr = max(l0r, l1r)
h0l = searchsorted(x[0,lr:], high[0], side='left')
h1l = searchsorted(x[1,lr:], high[1], side='left')
hl = min(h0l, h1l) + lr

t1 = x[:, range(lr)]
xn = x[:, range(lr, hl)]
ncol = shape(x)[1]
t2 = x[:, range(hl, ncol)]
print(x)
del x
print()
print(t1)
print()
# note that xn is an empty array
print(xn)
print()
print(t2)
# [[ 0.46006547 1.4 1.4 1.4 1.4 ]
# [ 0.00912908 0.00912908 0.05 0.05 0.05 ]]
#
# [[ 0.46006547 1.4 ]
# [ 0.00912908 0.00912908]]
#
# []
#
# [[ 1.4 1.4 1.4 ]
# [ 0.05 0.05 0.05]]
