efficient way to get all numpy slices for different ranges - python

I want to slice the same numpy array (data_arra) multiple times to find each time the values in a different range
data_ar shpe: (203,)
range_ar shape: (1000,)
I implemented it with a for loop, but it takes way to long since I have a lot of data_arrays:
#create results array
results_ar = np.zeros(shape=(1000),dtype=object)
i=0
for range in range_ar:
results_ar[i] = data_ar[( (data_ar>=(range-delta)) & (data_ar<(range+delta)) )].values
i+=1
so for example:
data_ar = [1,3,4,6,10,12]
range_ar = [7,4,2]
delta= 3
expected output:
(note results_ar shpae=(3,) dtype=object, each element is an array)
results_ar[[6,10];
[1,3,4,6];
[1,3,4]]
some idea on how to tackle this?

You can use numba to speed up the computations.
import numpy as np
import numba
from numba.typed import List
import timeit
data_ar = np.array([1,3,4,6,10,12])
range_ar = np.array([7,4,2])
delta = 3
def foo(data_ar, range_ar):
results_ar = list()
for i in range_ar:
results_ar.append(data_ar[( (data_ar>=(i-delta)) & (data_ar<(i+delta)) )])
print(timeit.timeit(lambda :foo(data_ar, range_ar)))
#numba.njit(parallel=True, fastmath=True)
def foo(data_ar, range_ar):
results_ar = List()
for i in range_ar:
results_ar.append(data_ar[( (data_ar>=(i-delta)) & (data_ar<(i+delta)) )])
print(timeit.timeit(lambda :foo(data_ar, range_ar)))
15.53519330600102
1.6557575029946747
An almost 9.8 times speedup.

You could use np.searchsorted like this:
data_ar = np.array([1, 3, 4, 6, 10, 12])
range_ar = np.array([7, 4, 2])
delta = 3
bounds = range_ar[:, None] + delta * np.array([-1, 1])
result = [data_ar[slice(*row)] for row in np.searchsorted(data_ar, bounds)]

Related

Negative Sampling in JAX

I'm implementing a negative sampling algorithm in JAX. The idea is to sample negatives from a range excluding from this range a number of non-acceptable outputs. My current solution is close to the following:
import jax.numpy as jnp
import jax
max_range = 5
n_samples = 2
true_cases = jnp.array(
[
[1,2],
[1,4],
[0,5]
]
)
# i combine the true cases in a dictionary of the following form:
non_acceptable_as_negatives = {
0: jnp.array([5]),
1: jnp.array([2,4]),
2: jnp.array([]),
3: jnp.array([]),
4: jnp.array([]),
5: jnp.array([])
}
negatives = []
key = jax.random.PRNGKey(42)
for i in true_cases[:,0]:
key,use_key = jax.random.split(key,2)
p = jnp.ones((max_range+1,))
p = p.at[non_acceptable_as_negatives[int(i)]].set(0)
p = p / p.sum()
negatives.append(
jax.random.choice(use_key,
jnp.arange(max_range+1),
(1, n_samples),
replace=False,
p=p,
)
)
However this seems
a) rather complicated and
b) is not very performant as the true cases in the original contain ~200_000 entries and max range is ~ 50_000. How can i improve this solution? And is there a more JAX way to store arrays of varying size which i currently store in the non_acceptable_as_negatives dict?
Thanks in a advance
You'll generally achieve better performance in JAX (as in NumPy) if you can avoid loops and use vectorized operations instead. If I'm understanding your function correctly, I think the following does roughly the same thing, but using vmap.
Since JAX does not support dictionary lookups based on traced values, I replaced your dict with a padded array
import jax.numpy as jnp
import jax
max_range = 5
n_samples = 2
fill_value = max_range + 1
true_cases = jnp.array([
[1,2],
[1,4],
[0,5]
])
non_acceptable_as_negatives = jnp.array([
[5, fill_value],
[2, 4],
])
#jax.vmap
def func(key, true_case):
p = jnp.ones(max_range + 1)
idx = true_cases[0]
replace = non_acceptable_as_negatives.at[idx].get(fill_value=fill_value)
p = p.at[replace].set(0, mode='drop')
return jax.random.choice(key, max_range + 1, (n_samples,), replace=False, p=p)
key = jax.random.PRNGKey(42)
keys = jax.random.split(key, len(true_cases))
result = func(keys, true_cases)
print(result)
[[3 1]
[5 1]
[1 5]]
Jax array are immutable. It means that you can't edit it without copying the entire array. Here the main problem is that you create the vector p two times at each iteration. I advice you to compute the probabilities only once via numpy:
import numpy as np
non_acceptable_as_negatives = {
0: np.array([5]),
1: np.array([2,4]),
2: np.array([]),
3: np.array([]),
4: np.array([]),
5: np.array([])
}
probas = np.ones((max_range+1, max_range+1))
for k, idx in non_acceptable_as_negatives.items():
for i in idx:
probas[k, i] = 0
probas = probas / probas.sum(axis=1, keepdims=True)
probas = jnp.array(probas)
Then, to further speed-up the algorithm, you can compile the choice function. You can try:
from functools import partial
#partial(jax.jit, static_argnums=1)
def sample(key, max_range, probas):
key, use_key = jax.random.split(key, 2)
return jax.random.choice(use_key,
jnp.arange(max_range+1),
(1, n_samples),
replace=False,
p=probas[i],
), key
And finally:
for i in true_cases[:,0]:
neg, key = aux(key, max_range, probas)
negatives.append(neg)

Replace outlier values with NaN in numpy? (preserve length of array)

I have an array of magnetometer data with artifacts every two hours due to power cycling.
I'd like to replace those indices with NaN so that the length of the array is preserved.
Here's a code example, adapted from https://www.kdnuggets.com/2017/02/removing-outliers-standard-deviation-python.html.
import numpy as np
import plotly.express as px
# For pulling data from CDAweb:
from ai import cdas
import datetime
# Import data:
start = datetime.datetime(2016, 1, 24, 0, 0, 0)
end = datetime.datetime(2016, 1, 25, 0, 0, 0)
data = cdas.get_data(
'sp_phys',
'THG_L2_MAG_'+ 'PG2',
start,
end,
['thg_mag_'+ 'pg2']
)
x =data['UT']
y =data['VERTICAL_DOWN_-_Z']
def reject_outliers(y): # y is the data in a 1D numpy array
n = 5 # 5 std deviations
mean = np.mean(y)
sd = np.std(y)
final_list = [x for x in y if (x > mean - 2 * sd)]
final_list = [x for x in final_list if (x < mean + 2 * sd)]
return final_list
px.scatter(reject_outliers(y))
print('Length of y: ')
print(len(y))
print('Length of y with outliers removed (should be the same): ')
print(len(reject_outliers(y)))
px.line(y=y, x=x)
# px.scatter(y) # It looks like the outliers are successfully dropped.
# px.line(y=reject_outliers(y), x=x) # This is the line I'd like to see work.
When I run 'px.scatter(reject_outliers(y))', it looks like the outliers are successfully getting dropped:
...but that's looking at the culled y vector relative to the index, rather than the datetime vector x as in the above plot. As the debugging text indicates, the vector is shortened because the outlier values are dropped rather than replaced.
How can I edit my 'reject_outliers()` function to assign those values to NaN, or to adjacent values, in order to keep the length of the array the same so that I can plot my data?
Use else in the list comprehension along the lines of:
[x if x_condition else other_value for x in y]
Got a less compact version to work. Full code:
import numpy as np
import plotly.express as px
# For pulling data from CDAweb:
from ai import cdas
import datetime
# Import data:
start = datetime.datetime(2016, 1, 24, 0, 0, 0)
end = datetime.datetime(2016, 1, 25, 0, 0, 0)
data = cdas.get_data(
'sp_phys',
'THG_L2_MAG_'+ 'PG2',
start,
end,
['thg_mag_'+ 'pg2']
)
x =data['UT']
y =data['VERTICAL_DOWN_-_Z']
def reject_outliers(y): # y is the data in a 1D numpy array
mean = np.mean(y)
sd = np.std(y)
final_list = np.copy(y)
for n in range(len(y)):
final_list[n] = y[n] if y[n] > mean - 5 * sd else np.nan
final_list[n] = final_list[n] if final_list[n] < mean + 5 * sd else np.nan
return final_list
px.scatter(reject_outliers(y))
print('Length of y: ')
print(len(y))
print('Length of y with outliers removed (should be the same): ')
print(len(reject_outliers(y)))
# px.line(y=y, x=x)
px.line(y=reject_outliers(y), x=x) # This is the line I wanted to get working - check!
More compact answer, sent via email by a friend:
In numpy you can select/index based on a Boolean array, and then make assignment with it:
def reject_outliers(y): # y is the data in a 1D numpy array
n = 5 # 5 std deviations
mean = np.mean(y)
sd = np.std(y)
final_list = y.copy()
final_list[np.abs(y - mean) > n * sd] = np.nan
return final_list
I also noticed that you didn’t use the value of n in your example code.
Alternatively, you can use the where method (https://numpy.org/doc/stable/reference/generated/numpy.where.html)
np.where(np.abs(y - mean) > n * sd, np.nan, y)
You don’t need the .copy() if you don’t mind modifying the input array.
Replace np.mean and np.std with np.nanmean and np.nanstd if you want the function to work on arrays that already contain nans, i.e. if you want to use this function recursively.
The answer about using if else in a list comprehension would work, but avoiding the list comprehension makes the function much faster if the arrays are large.

How to do element wise matrix multiply using numpy

I have two numpy arrays, a in size (20*3*3) and b in size (3*3). Let a=(a1, a2, ..., a20). I want to calculate the matrix product element wise like this:
c=(c1, c2, ..., c20), ci=b.Taib, i=1~20.
How can I do it efficiently using numpy?
A slow version using for loop is like this:
a = np.random.sample((20, 3, 3))
b = np.random.sample((3, 3))
c = np.zeros_like(a)
for i0, ai in enumerate(a):
c[i0] = np.dot(b.T, np.dot(ai, b))
You can try np.matmul(b.T, np.dot(a,b)):
import numpy as np
import pandas as pd
a = np.random.sample((4, 3, 3))
b = np.random.sample((3, 3))
c = np.zeros_like(a)
# using for loop
for i0, ai in enumerate(a):
c[i0] = np.dot(b.T, np.dot(ai, b))
# alternative method
e = np.zeros_like(a)
e = np.matmul(b.T, np.dot(a,b))
# checking for equal
print(np.array_equal(c, e))
You can just put your operation in a vectorized form because your inputs are NumPy arrays. No need of explicit for loop and indexing.
P.S: Thanks to #yatu who found that the answer was not the same shape. Now I added the swapaxes to get the consistent answer as OP's approach
np.random.seed(1)
a = np.random.sample((4, 3, 3))
b = np.random.sample((3, 3))
c = np.dot(b.T, np.dot(a, b)).swapaxes(0,1)
print (c)
[[[0.96496962 1.30807122 0.55382266]
[1.42300972 1.98975139 0.81871374]
[0.32358338 0.45493059 0.1346777 ]]
[[1.46772447 2.15650254 0.87555186]
[2.26335921 3.33689922 1.28679305]
[0.71561413 0.96507585 0.54309736]]
[[1.50660527 2.36946435 0.59771395]
[2.49705244 3.76328176 1.06274954]
[0.96090846 1.43636151 0.31807679]]
[[1.03706878 1.94107476 0.61884642]
[1.74739926 3.07419808 1.03537019]
[0.59565039 1.09721382 0.37283626]]]

Scatter plot with logical indexing

I have a 100x2 array D and a 100x1 array c (with entries +/- 1) I'm trying to make a scatter plot of the columns in D corresponding to c = 1.
I tried something like this: plt.scatter(D[0][c==1],D[1][c==1]) but it throws up IndexError: too many indices for array
I'm aware that I've use list comprehension or something of that sort. I'm fairly new to Python and hence struggling with the format.
Thanks a lot.
Concept
You can use np.where to select only rows from D that are 1 in your array C:
D = np.array([[0.25, 0.25], [0.75, 0.75]])
C = np.array([1, 0])
Using np.where, we can select only rows that are 1 in C:
>>> D[np.where(C==1)]
array([[0.25, 0.25]])
Example On your actual data:
D = np.random.randn(100, 2)
C = np.random.randint(0, 2, (100, 1))
valid = D[np.where(C.ravel()==1)]
import matplotlib.pyplot as plt
plt.scatter(valid[:, 0], valid[:, 1])
Output:
You can use numpy for this (assuming you have two numpy arrays, otherwise you can convert them into numpy arrays):
import numpy as np
c_ones = np.where(c == 1) # Finds all indices where c == 1
d_0 = D[0][c_ones]
d_1 = D[1][c_ones]
Then you can plot d_0, d_1 as normal.
For converting your lists if needed,
C_np = np.asarray(c)
D_np = np.asarray(D)
And then perform np.where on C_np as shown above.
Would this solve your issue?

How to get the index of a list items in another list?

Consider I have these lists:
l = [5,6,7,8,9,10,5,15,20]
m = [10,5]
I want to get the index of m in l. I used list comprehension to do that:
[(i,i+1) for i,j in enumerate(l) if m[0] == l[i] and m[1] == l[i+1]]
Output : [(5,6)]
But if I have more numbers in m, I feel its not the right way. So is there any easy approach in Python or with NumPy?
Another example:
l = [5,6,7,8,9,10,5,15,20,50,16,18]
m = [10,5,15,20]
The output should be:
[(5,6,7,8)]
The easiest way (using pure Python) would be to iterate over the items and first only check if the first item matches. This avoids doing sublist comparisons when not needed. Depending on the contents of your l this could outperform even NumPy broadcasting solutions:
def func(haystack, needle): # obviously needs a better name ...
if not needle:
return
# just optimization
lengthneedle = len(needle)
firstneedle = needle[0]
for idx, item in enumerate(haystack):
if item == firstneedle:
if haystack[idx:idx+lengthneedle] == needle:
yield tuple(range(idx, idx+lengthneedle))
>>> list(func(l, m))
[(5, 6, 7, 8)]
In case your interested in speed I checked the performance of the approaches (borrowing from my setup here):
import random
import numpy as np
# strided_app is from https://stackoverflow.com/a/40085052/
def strided_app(a, L, S ): # Window len = L, Stride len/stepsize = S
nrows = ((a.size-L)//S)+1
n = a.strides[0]
return np.lib.stride_tricks.as_strided(a, shape=(nrows,L), strides=(S*n,n))
def pattern_index_broadcasting(all_data, search_data):
n = len(search_data)
all_data = np.asarray(all_data)
all_data_2D = strided_app(np.asarray(all_data), n, S=1)
return np.flatnonzero((all_data_2D == search_data).all(1))
# view1D is from https://stackoverflow.com/a/45313353/
def view1D(a, b): # a, b are arrays
a = np.ascontiguousarray(a)
void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
return a.view(void_dt).ravel(), b.view(void_dt).ravel()
def pattern_index_view1D(all_data, search_data):
a = strided_app(np.asarray(all_data), L=len(search_data), S=1)
a0v, b0v = view1D(np.asarray(a), np.asarray(search_data))
return np.flatnonzero(np.in1d(a0v, b0v))
def find_sublist_indices(haystack, needle):
if not needle:
return
# just optimization
lengthneedle = len(needle)
firstneedle = needle[0]
restneedle = needle[1:]
for idx, item in enumerate(haystack):
if item == firstneedle:
if haystack[idx+1:idx+lengthneedle] == restneedle:
yield tuple(range(idx, idx+lengthneedle))
def Divakar1(l, m):
return np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
def Divakar2(l, m):
return np.squeeze(pattern_index_view1D(l, m)[:,None] + np.arange(len(m)))
def MSeifert(l, m):
return list(find_sublist_indices(l, m))
# Timing setup
timings = {Divakar1: [], Divakar2: [], MSeifert: []}
sizes = [2**i for i in range(5, 20, 2)]
# Timing
for size in sizes:
l = [random.randint(0, 50) for _ in range(size)]
m = [random.randint(0, 50) for _ in range(10)]
larr = np.asarray(l)
marr = np.asarray(m)
for func in timings:
# first timings:
# res = %timeit -o func(l, m)
# second timings:
if func is MSeifert:
res = %timeit -o func(l, m)
else:
res = %timeit -o func(larr, marr)
timings[func].append(res)
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure(1)
ax = plt.subplot(111)
for func in timings:
ax.plot(sizes,
[time.best for time in timings[func]],
label=str(func.__name__))
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('size')
ax.set_ylabel('time [seconds]')
ax.grid(which='both')
ax.legend()
plt.tight_layout()
In case your l and m are lists my function outperforms the NumPy solutions for all sizes:
But in case you have these as numpy arrays you'll get faster results for large arrays (size > 1000 elements) when using Divakars NumPy solutions:
You are basically looking for the starting indices of a list in another list.
Approach #1 : One approach to solve it would be to create sliding windows of the elements in list in which we are searching, giving us a 2D array and then simply use NumPy broadcasting to perform broadcasted comparison against the search list against each row of the 2D sliding window version obtained earlier. Thus, one method would be -
# strided_app is from https://stackoverflow.com/a/40085052/
def strided_app(a, L, S ): # Window len = L, Stride len/stepsize = S
nrows = ((a.size-L)//S)+1
n = a.strides[0]
return np.lib.stride_tricks.as_strided(a, shape=(nrows,L), strides=(S*n,n))
def pattern_index_broadcasting(all_data, search_data):
n = len(search_data)
all_data = np.asarray(all_data)
all_data_2D = strided_app(np.asarray(all_data), n, S=1)
return np.flatnonzero((all_data_2D == search_data).all(1))
out = np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
Sample runs -
In [340]: l = [5,6,7,8,9,10,5,15,20,50,16,18]
...: m = [10,5,15,20]
...:
In [341]: np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
Out[341]: array([5, 6, 7, 8])
In [342]: l = [5,6,7,8,9,10,5,15,20,50,16,18,10,5,15,20]
...: m = [10,5,15,20]
...:
In [343]: np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
Out[343]:
array([[ 5, 6, 7, 8],
[12, 13, 14, 15]])
Approach #2 : Another method would be to get the sliding window and then get the row-wise scalar view into the data to be search data and the data to be search for, giving us 1D data to work with, like so -
# view1D is from https://stackoverflow.com/a/45313353/
def view1D(a, b): # a, b are arrays
a = np.ascontiguousarray(a)
void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
return a.view(void_dt).ravel(), b.view(void_dt).ravel()
def pattern_index_view1D(all_data, search_data):
a = strided_app(np.asarray(all_data), L=len(search_data), S=1)
a0v, b0v = view1D(np.asarray(a), np.asarray(search_data))
return np.flatnonzero(np.in1d(a0v, b0v))
out = np.squeeze(pattern_index_view1D(l, m)[:,None] + np.arange(len(m)))
2020 Versions
In search of more easy/compact approaches, we could look into scikit-image's view_as_windows for getting sliding windows with a built-in. I am assuming arrays as inputs for less messy code. For lists as input, we have to use np.asarray() as shown earlier.
Approach #3 : Basically a derivative of pattern_index_broadcasting with view_as_windows for a one-liner with a as the larger data and b is the array to be searched -
from skimage.util import view_as_windows
np.flatnonzero((view_as_windows(a,len(b))==b).all(1))[:,None]+np.arange(len(b))
Approach #4 : For a small number of matches from b in a, we could optimize, by looking for first element match from b to reduce the dataset size for searches -
mask = a[:-len(b)+1]==b[0]
mask[mask] = (view_as_windows(a,len(b))[mask]).all(1)
out = np.flatnonzero(mask)[:,None]+np.arange(len(b))
Approach #5 : For a small sized b, we could simply run a loop for each of the elements in b and perform bitwise and-reduction -
mask = np.bitwise_and.reduce([a[i:len(a)-len(b)+1+i]==b[i] for i in range(len(b))])
out = np.flatnonzero(mask)[:,None]+np.arange(len(b))
Just making the point that #MSeifert's approach can, of course, also be implemented in numpy:
def pp(h,n):
nn = len(n)
NN = len(h)
c = (h[:NN-nn+1]==n[0]).nonzero()[0]
if c.size==0: return
for i,l in enumerate(n[1:].tolist(),1):
c = c[h[i:][c]==l]
if c.size==0: return
return np.arange(c[0],c[0]+nn)
def get_data(l1,l2):
d=defaultdict(list)
[d[item].append(index) for index,item in enumerate(l1)]
print(d)
Using defaultdict to store indices of elements from other list.

Categories