Index array using array of unique values - python

I have three arrays, such that:
Data_Arr = np.array([1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 5, 5, 5])
ID_Arr = np.array([1, 2, 3, 4, 5])
Value_Arr = np.array([0.1, 0.6, 0.3, 0.8, 0.2])
I want to create a new array which has the dimensions of Data_Arr, but where each element is taken from Value_Arr, using the index position in ID_Arr. So far I have this in a loop, but it's very slow as my Data array is very large:
out = np.zeros_like(Data_Arr, dtype=float)
for i in range(len(Data_Arr)):
    out[i] = Value_Arr[ID_Arr == Data_Arr[i]]
Is there a more Pythonic way of doing this that avoids the loop (it doesn't have to use numpy)?
Actual data looks like:
Data_Arr = [ 852116 852116 852116 ... 1001816 1001816 1001816]
ID_Arr = [ 852116 852117 852118 ... 1001814 1001815 1001816]
Value_Arr = [1.5547194 1.5547196 1.5547197 ... 1.5536859 1.5536858 1.5536857]
shapes are:
Data_Arr = (4021165,)
ID_Arr = (149701,)
Value_Arr = (149701,)

Since ID_Arr is sorted, we can directly use np.searchsorted and index Value_Arr with the result:
Value_Arr[np.searchsorted(ID_Arr, Data_Arr)]
array([0.1, 0.1, 0.1, 0.6, 0.6, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.8, 0.8,
0.2, 0.2, 0.2])
If ID_Arr isn't sorted (note: in case there may be out-of-bounds indices, we should remove them; see Divakar's answer):
s_ind = ID_Arr.argsort()
ss = np.searchsorted(ID_Arr, Data_Arr, sorter=s_ind)
out = Value_Arr[s_ind[ss]]
Checking with the arrays suggested by alaniwi:
Data_Arr = np.array([1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 5, 5, 5])
ID_Arr = np.array([2, 1, 3, 4, 5])
Value_Arr = np.array([0.6, 0.1, 0.3, 0.8, 0.2])
out_op = np.zeros_like(Data_Arr, dtype=float)
for i in range(len(Data_Arr)):
    out_op[i] = Value_Arr[ID_Arr == Data_Arr[i]]
s_ind = ID_Arr.argsort()
ss = np.searchsorted(ID_Arr, Data_Arr, sorter=s_ind)
out_answer = Value_Arr[s_ind[ss]]
np.array_equal(out_op, out_answer)
#True

Based on approaches from this post, here are the adaptations.
Approach #1
# https://stackoverflow.com/a/62658135/ #Divakar
a,b,invalid_specifier = ID_Arr, Data_Arr, 0
sidx = a.argsort()
idx = np.searchsorted(a,b,sorter=sidx)
# Remove out-of-bounds indices as they won't be matches
idx[idx==len(a)] = 0
# Get traced back indices corresponding to original version of a
idx0 = sidx[idx]
# Mask out invalid ones with invalid_specifier and return
out = np.where(a[idx0]==b, Value_Arr[idx0], invalid_specifier)
Approach #2
Lookup based -
# https://stackoverflow.com/a/62658135/ #Divakar
def find_indices_lookup(a, b, invalid_specifier=-1):
    # Setup array where we will assign ranged numbers
    N = max(a.max(), b.max()) + 1
    lookup = np.full(N, invalid_specifier)
    # We index into lookup with b to trace back the positions. Non-matching ones
    # keep the invalid_specifier value, since they were never assigned a ranged number
    lookup[a] = np.arange(len(a))
    indices = lookup[b]
    return indices
idx = find_indices_lookup(ID_Arr, Data_Arr)
out = np.where(idx!=-1, Value_Arr[idx], 0)
Faster/simpler variant
And a simplified and hopefully faster version would be a direct lookup of values -
a,b,invalid_specifier = ID_Arr, Data_Arr, 0
N = max(a.max(), b.max())+1
lookup = np.zeros(N, dtype=Value_Arr.dtype)
lookup[ID_Arr] = Value_Arr
out = lookup[Data_Arr]
If all values in Data_Arr are guaranteed to appear in ID_Arr, we can use np.empty in place of np.zeros for the array assignment and thus gain a further performance boost.
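For illustration, that variant would look like this (a sketch; it is only safe under the stated assumption, because unassigned slots of an np.empty array hold arbitrary values):
N = max(ID_Arr.max(), Data_Arr.max()) + 1
lookup = np.empty(N, dtype=Value_Arr.dtype)  # uninitialized; every slot we later read is assigned below
lookup[ID_Arr] = Value_Arr
out = lookup[Data_Arr]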

Looks like you want:
out = Value_Arr[ID_Arr[Data_Arr - 1] - 1]
Note that the - 1 terms are due to Python/NumPy's 0-based indexing; this works here because the IDs are the consecutive integers 1 through 5.

Related

Share and manipulate multiple numpy arrays through multiprocessing

I'm trying to make use of multiprocessing to speed up my array-based calculations. General workflow is as follows:
I have three arrays:
id_array holds IDs marking which array elements belong together (the segments)
class_array is a classified array (just integer representing class values from an image classification)
prob_array has the probability for these classes
Based on the segments, I want to:
find the class majority within each segment
average the probabilities within the segment, but only for the "pixels" that have the class majority
Here is my non-parallel example, which works fine:
import numpy as np
id_array = np.array([[1, 1, 2, 2, 2],
[1, 1, 2, 2, 4],
[3, 3, 4, 4, 4],
[3, 3, 4, 4, 4]])
class_array = np.array([[7, 7, 6, 8, 8],
[5, 7, 7, 8, 8],
[8, 8, 5, 5, 8],
[9, 9, 8, 7, 7]])
prob_array = np.array([[0.7, 0.3, 0.9, 0.5, 0.1],
[0.4, 0.6, 0.3, 0.5, 0.9],
[0.8, 0.6, 0.2, 0.2, 0.3],
[0.4, 0.4, 0.6, 0.3, 0.7]])
all_ids = np.unique(id_array)
dst_classes = np.zeros_like(class_array)
dst_probs = np.zeros_like(prob_array)
for my_id in all_ids:
    segment = np.where(id_array == my_id)
    class_data = class_array[segment]
    # get majority of classes within segment
    majority = np.bincount(class_data.flatten()).argmax()
    # get probabilities within segment
    prob_data = prob_array[segment]
    # get probabilities within segment where class equals majority
    majority_probs = prob_data[np.where(class_data == majority)]
    # get median of these probabilities
    median_prob = np.nanmedian(majority_probs)
    # write values
    dst_classes[segment] = majority
    dst_probs[segment] = median_prob
print(dst_classes)
print(dst_probs)
The problem is that my real data have something like 4 million segments and this then takes a week to compute. So I followed this tutorial and came up with this:
import numpy as np
import multiprocessing as mp
WORKER_DICT = dict()
NODATA = 0
def shared_array_from_np_array(data_array, init_value=None):
    raw_array = mp.RawArray(np.ctypeslib.as_ctypes_type(data_array.dtype), data_array.size)
    shared_array = np.frombuffer(raw_array, dtype=data_array.dtype).reshape(data_array.shape)
    if init_value:
        np.copyto(shared_array, np.full_like(data_array, init_value))
        return raw_array, shared_array
    else:
        np.copyto(shared_array, data_array)
        return raw_array, shared_array
def init_worker(id_array, class_array, prob_array, class_results, prob_results):
    WORKER_DICT['id_array'] = id_array
    WORKER_DICT['class_array'] = class_array
    WORKER_DICT['prob_array'] = prob_array
    WORKER_DICT['class_results'] = class_results
    WORKER_DICT['prob_results'] = prob_results
    WORKER_DICT['shape'] = id_array.shape
    mp.freeze_support()
def worker(id):
    id_array = WORKER_DICT['id_array']
    class_array = WORKER_DICT['class_array']
    prob_array = WORKER_DICT['prob_array']
    class_result = WORKER_DICT['class_results']
    prob_result = WORKER_DICT['prob_results']
    # array indices for "id"
    segment = np.where(id_array == id)
    # get data at these indices, mask nodata values
    class_data = np.ma.masked_equal(class_array[segment], NODATA)
    # get majority value
    majority_class = np.bincount(class_data.flatten()).argmax()
    # get probabilities
    probs = prob_array[segment]
    majority_probs = probs[np.where(class_array[segment] == majority_class)]
    med_majority_probs = np.nanmedian(majority_probs)
    class_result[segment] = majority_class
    prob_result[segment] = med_majority_probs
    return
if __name__ == '__main__':
    # segment IDs
    id_ra, id_array = shared_array_from_np_array(np.array(
        [[1, 1, 2, 2, 2],
         [1, 1, 2, 2, 4],
         [3, 3, 4, 4, 4],
         [3, 3, 4, 4, 4]]))
    # classification
    cl_ra, class_array = shared_array_from_np_array(np.array(
        [[7, 7, 6, 8, 8],
         [5, 7, 7, 8, 8],
         [8, 8, 5, 5, 8],
         [9, 9, 8, 7, 7]]))
    # probabilities
    pr_ra, prob_array = shared_array_from_np_array(np.array(
        [[0.7, 0.3, 0.9, 0.5, 0.1],
         [0.4, 0.6, 0.3, 0.5, 0.9],
         [0.8, 0.6, 0.2, 0.2, 0.3],
         [0.4, 0.4, 0.6, 0.3, 0.7]]))
    cl_res, class_results = shared_array_from_np_array(class_array, 0)
    pr_res, prob_results = shared_array_from_np_array(prob_array, 0.)
    unique_ids = np.unique(id_array)
    init_args = (id_ra, cl_ra, pr_ra, cl_res, pr_res, id_array.shape)
    with mp.Pool(processes=2, initializer=init_worker, initargs=init_args) as pool:
        pool.map_async(worker, unique_ids)
    print('Majorities:', cl_res)
    print('Probabilities:', pr_res)
But I do not see how I can now get my results and whether they are correct. I tried
np.frombuffer(cl_res)
np.frombuffer(pr_res)
but that gives me only 10 values for cl_res (there should be 20) and they seem completely random, while pr_res has the exact same values as prob_array.
I have tried making use of other examples around here, like this one, but can't get them to work either. That looks like a similar problem, but it already requires a lot of knowledge about how multiprocessing really works, and I don't have that (total beginner with multiprocessing).
Several things to fix:
You need to create the numpy arrays in init_worker(), which should also take a shape argument:
def init_worker(id_ra, cl_ra, pr_ra, cl_res, pr_res, shape):
    WORKER_DICT['id_array'] = np.ctypeslib.as_array(id_ra, shape)
    WORKER_DICT['class_array'] = np.ctypeslib.as_array(cl_ra, shape)
    WORKER_DICT['prob_array'] = np.ctypeslib.as_array(pr_ra, shape)
    WORKER_DICT['class_results'] = np.ctypeslib.as_array(cl_res, shape)
    WORKER_DICT['prob_results'] = np.ctypeslib.as_array(pr_res, shape)
You should check if init_value is not None instead of just init_value in shared_array_from_np_array(), as 0 evaluates to False.
mp.freeze_support() should only be called immediately after if __name__ == '__main__', as per its docs.
pool.map_async() returns an AsyncResult object that needs to be waited on; you probably want pool.map(), which blocks until the processing is done.
You can access the results directly in the main section with the class_results and prob_results arrays (see the sketch below).
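Putting those fixes together, a minimal sketch of the corrected helper and pool call could look like this (excerpted from the script above, not a complete program):
def shared_array_from_np_array(data_array, init_value=None):
    raw_array = mp.RawArray(np.ctypeslib.as_ctypes_type(data_array.dtype), data_array.size)
    shared_array = np.frombuffer(raw_array, dtype=data_array.dtype).reshape(data_array.shape)
    if init_value is not None:  # explicit None check: 0 and 0.0 are valid init values
        np.copyto(shared_array, np.full_like(data_array, init_value))
    else:
        np.copyto(shared_array, data_array)
    return raw_array, shared_array

# inside the __main__ block, after building the shared arrays and unique_ids:
with mp.Pool(processes=2, initializer=init_worker, initargs=init_args) as pool:
    pool.map(worker, unique_ids)  # map() blocks until every segment has been processed
print('Majorities:', class_results)  # the numpy views expose the shared result buffers
print('Probabilities:', prob_results)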

get a list of unique items from Random.choices function

I have a method that uses the random package to generate a list with certain probabilities, for example:
import random
seed = 30
rand = random.Random(seed)
options_list = [1, 2, 3, 4, 5, 6]
prob_weights = [0.1, 0.2, 0.1, 0.05, 0.02, 0.06]
result = rand.choices(options_list, prob_weights, k=4)  # k will be <= len(options_list)
My problem is that result can hold two of the same item, and I want the items to be unique.
I could make the k param much larger and then filter out the unique items, but that seems like the wrong way to do it. I looked in the docs and I don't see that the choices function takes this kind of parameter.
Any ideas how to configure random to return a list of unique items?
You can use np.random.choice, which lets you assign a probability to each entry and also generate random samples without replacement. The probabilities, however, must add up to one, so you'll have to divide the vector by its L1 norm. Here's how you could do it:
import numpy as np
options_list = np.array([1, 2, 3, 4, 5, 6])
prob_weights = np.array([0.1, 0.2, 0.1, 0.05, 0.02, 0.06])
prob_weights_scaled = prob_weights / sum(prob_weights)
some_length = 4
np.random.choice(a=options_list, size=some_length, replace=False, p=prob_weights_scaled)
Output
array([2, 1, 6, 3])
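If reproducibility with a seed matters, as in the question, the same draw can be made with a seeded NumPy Generator; a minimal sketch, reusing the question's seed of 30:
import numpy as np

rng = np.random.default_rng(30)  # seeded generator for reproducible draws
options_list = np.array([1, 2, 3, 4, 5, 6])
prob_weights = np.array([0.1, 0.2, 0.1, 0.05, 0.02, 0.06])
p = prob_weights / prob_weights.sum()  # probabilities must sum to 1
unique_sample = rng.choice(options_list, size=4, replace=False, p=p)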

2D array to represent a huge python dict, COOrdinate like solution to save memory

I'm trying to update a dict_with_tuples_key with the data from an array:
myarray = np.array([[0, 0], # 0, 1
[0, 1],
[1, 1], # 1, 2
[1, 2], # 1, 3
[2, 2],
[1, 3]]
) # a lot of this with shape~(10e6, 2)
dict_with_tuples_key = {(0, 1): 1,
(3, 7): 1} # ~10e6 keys
Using an array to store the dict values (thanks to @MSeifert), we get this:
def convert_dict_to_darray(dict_with_tuples_key, myarray):
    idx_max_array = np.max(myarray, axis=0)
    idx_max_dict = np.max(list(dict_with_tuples_key.keys()), axis=0)
    lens = np.max([list(idx_max_array), list(idx_max_dict)], axis=0)
    xlen, ylen = lens[0] + 1, lens[1] + 1
    darray = np.zeros((xlen, ylen))  # empty array to hold all indexes in myarray
    for key, value in dict_with_tuples_key.items():
        darray[key] = value
    return darray
from numba import njit

@njit
def update_darray(darray, myarray):
    elements = myarray.shape[0]
    for i in range(elements):
        darray[myarray[i][0]][myarray[i][1]] += 1
    return darray
def darray_to_dict(darray):
    updated_dict = {}
    keys = zip(*map(list, np.nonzero(darray)))
    for x, y in keys:
        updated_dict[(x, y)] = darray[x, y]
    return updated_dict
darray = convert_dict_to_darray(dict_with_tuples_key, myarray)
darray = update_darray(darray, myarray)
I get the exact result needed:
# print darray_to_dict(darray)
# {(0, 1): 2.0,
# (0, 0): 1.0,
# (1, 1): 1.0,
# (2, 2): 1.0,
# (1, 2): 1.0,
# (1, 3): 1.0,
# (3, 7): 1.0, }
For small matrices it works quite well, and @njit makes it very fast,
but...
the creation of the huge empty darray = np.zeros((xlen, ylen)) does not fit in memory. How can I avoid allocating a very sparse array and only store the non-null values, like a sparse matrix in COOrdinate format?
Use dok_matrix from scipy; a dok_matrix is a Dictionary Of Keys based sparse matrix. It lets you build sparse matrices incrementally, and it won't allocate the huge empty darray = np.zeros((xlen, ylen)) that does not fit into your computer's memory.
The only changes needed are to import the right class from scipy and to change the definition of darray in your function convert_dict_to_darray.
It will look like this:
from scipy.sparse import dok_matrix
def convert_dict_to_darray(dict_with_tuples_key, myarray):
    idx_max_array = np.max(myarray, axis=0)
    idx_max_dict = np.max(list(dict_with_tuples_key.keys()), axis=0)
    lens = np.max([list(idx_max_array), list(idx_max_dict)], axis=0)
    xlen, ylen = lens[0] + 1, lens[1] + 1
    darray = dok_matrix((xlen, ylen))
    for key, value in dict_with_tuples_key.items():
        darray[key[0], key[1]] = value
    return darray
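One follow-up note: since darray is now a dok_matrix rather than a dense ndarray, the @njit-compiled update loop would have to run as plain Python (numba does not support scipy sparse matrices), and reading the results back can go through the COO format. A minimal sketch of that conversion, using a hypothetical helper name:
def dok_to_dict(darray):
    # COO format exposes parallel row/col/data arrays of the stored entries
    coo = darray.tocoo()
    return {(x, y): v for x, y, v in zip(coo.row, coo.col, coo.data)}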

How do I "randomly" select numbers with a specified bias toward a particular number

How do I generate random numbers with a specified bias toward one number? For example, how would I pick between two numbers, 1 and 2, with a 90% bias toward 1? The best I can come up with is...
import random
print random.choice([1, 1, 1, 1, 1, 1, 1, 1, 1, 2])
Is there a better way to do this? The method I showed works in simple examples, but eventually I'll have to do more complicated selections with biases that are very specific (such as a 37.65% bias), which would require a very long list.
EDIT:
I should have added that I'm stuck on numpy 1.6 so I can't use numpy.random.choice.
np.random.choice has a p parameter which you can use to specify the probability of the choices:
np.random.choice([1,2], p=[0.9, 0.1])
The algorithm used by np.random.choice() is relatively simple to replicate if you only need to draw one item at a time.
import numpy as np
def simple_weighted_choice(choices, weights, prng=np.random):
    running_sum = np.cumsum(weights)
    u = prng.uniform(0.0, running_sum[-1])
    i = np.searchsorted(running_sum, u, side='left')
    return choices[i]
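For example, a single biased draw with this helper (output will vary from run to run):
value = simple_weighted_choice([1, 2], [0.9, 0.1])  # returns 1 about 90% of the time
print(value)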
For random sampling with replacement, the essential code in np.random.choice is
cdf = p.cumsum()
cdf /= cdf[-1]
uniform_samples = self.random_sample(shape)
idx = cdf.searchsorted(uniform_samples, side='right')
So we can use that in a new function that does the same thing (but without error checking and other niceties):
import numpy as np
def weighted_choice(values, p, size=1):
    values = np.asarray(values)
    cdf = np.asarray(p).cumsum()
    cdf /= cdf[-1]
    uniform_samples = np.random.random_sample(size)
    idx = cdf.searchsorted(uniform_samples, side='right')
    sample = values[idx]
    return sample
Examples:
In [113]: weighted_choice([1, 2], [0.9, 0.1], 20)
Out[113]: array([1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1])
In [114]: weighted_choice(['cat', 'dog', 'goldfish'], [0.3, 0.6, 0.1], 15)
Out[114]:
array(['cat', 'dog', 'cat', 'dog', 'dog', 'dog', 'dog', 'dog', 'dog',
'dog', 'dog', 'dog', 'goldfish', 'dog', 'dog'],
dtype='|S8')
Something like this should do the trick, working with floating-point probabilities without creating an intermediate array.
import random
from itertools import accumulate # for python 3.x
def accumulate(l): # for python 2.x
    tmp = 0
    for n in l:
        tmp += n
        yield tmp
def random_choice(a, p):
    sums = sum(p)
    accum = accumulate(p)  # make a cumulative list of probabilities
    accum = [n / sums for n in accum]  # normalize
    rnd = random.random()
    for i, item in enumerate(accum):
        if rnd < item:
            return a[i]
It's easy to get an index into a probability table.
Make a table of cumulative weights, as many as you need, for example like this:
prb = [0.5, 0.65, 0.8, 1]
Get index with something like this:
def get_in_range(prb, pointer):
    """Returns index of matching range in table prb"""
    found = 0
    for p in prb:
        if pointer > p:
            found += 1
    return found
The index returned by get_in_range can then be used to pick from a corresponding table of values.
Example usage:
import random
values = [1, 2, 3]
weights = [0.9, 0.99, 1]
result = values[get_in_range(weights, random.random())]
This should choose 1 with 90% probability, 2 with 9%, and 3 with 1%.
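The same lookup can also be done with the standard library's bisect module instead of an explicit loop; a minimal sketch under the same cumulative-weights convention:
import bisect
import random

values = [1, 2, 3]
cumulative = [0.9, 0.99, 1.0]  # cumulative probabilities: 90%, 9%, 1%
result = values[bisect.bisect(cumulative, random.random())]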

Binning a numpy array

I have a numpy array which contains time series data. I want to bin that array into equal partitions of a given length (it is fine to drop the last partition if it is not the same size) and then calculate the mean of each of those bins.
I suspect there is numpy, scipy, or pandas functionality to do this.
example:
data = [4,2,5,6,7,5,4,3,5,7]
for a bin size of 2:
bin_data = [(4,2),(5,6),(7,5),(4,3),(5,7)]
bin_data_mean = [3,5.5,6,3.5,6]
for a bin size of 3:
bin_data = [(4,2,5),(6,7,5),(4,3,5)]
bin_data_mean = [3.67, 6, 4]
Just use reshape and then mean(axis=1).
As the simplest possible example:
import numpy as np
data = np.array([4,2,5,6,7,5,4,3,5,7])
print data.reshape(-1, 2).mean(axis=1)
More generally, we'd need to do something like this to drop the last bin when it's not an even multiple:
import numpy as np
width=3
data = np.array([4,2,5,6,7,5,4,3,5,7])
result = data[:(data.size // width) * width].reshape(-1, width).mean(axis=1)
print result
Since you already have a numpy array, to avoid for loops, you can use reshape and consider the new dimension to be the bin:
In [33]: data.reshape(2, -1)
Out[33]:
array([[4, 2, 5, 6, 7],
[5, 4, 3, 5, 7]])
In [34]: data.reshape(2, -1).mean(0)
Out[34]: array([ 4.5, 3. , 4. , 5.5, 7. ])
Actually, this will only work if the size of data is divisible by n. I'll edit in a fix.
Looks like Joe Kington has an answer that handles that.
Try this, using standard Python (NumPy isn't necessary for this). Assuming Python 2.x is in use:
data = [ 4, 2, 5, 6, 7, 5, 4, 3, 5, 7 ]
# example: for n == 2
n=2
partitions = [data[i:i+n] for i in xrange(0, len(data), n)]
partitions = partitions if len(partitions[-1]) == n else partitions[:-1]
# the above produces a list of lists
partitions
=> [[4, 2], [5, 6], [7, 5], [4, 3], [5, 7]]
# now the mean
[sum(x)/float(n) for x in partitions]
=> [3.0, 5.5, 6.0, 3.5, 6.0]
I just wrote a function that can be applied to arrays of any size or dimension:
data is your array
axis is the axis you want to bin
binstep is the number of points between each bin (allows overlapping bins)
binsize is the size of each bin
func is the function you want to apply to the bin (np.max for max pooling, np.mean for an average, ...)
def binArray(data, axis, binstep, binsize, func=np.nanmean):
    data = np.array(data)
    dims = np.array(data.shape)
    argdims = np.arange(data.ndim)
    argdims[0], argdims[axis] = argdims[axis], argdims[0]
    data = data.transpose(argdims)
    data = [func(np.take(data, np.arange(int(i*binstep), int(i*binstep+binsize)), 0), 0)
            for i in np.arange(dims[axis] // binstep)]
    data = np.array(data).transpose(argdims)
    return data
In your case it will be:
data = [4,2,5,6,7,5,4,3,5,7]
bin_data_mean = binArray(data, 0, 2, 2, np.mean)
or for the bin size of 3:
bin_data_mean = binArray(data, 0, 3, 3, np.mean)
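Since the question also mentions pandas, a minimal sketch of the same binned mean using groupby, dropping the incomplete last bin first as in the NumPy answers (assumes pandas is available):
import numpy as np
import pandas as pd

data = np.array([4, 2, 5, 6, 7, 5, 4, 3, 5, 7])
width = 3
trimmed = data[:(data.size // width) * width]  # drop the incomplete last bin
bin_data_mean = pd.Series(trimmed).groupby(np.arange(trimmed.size) // width).mean()
# 0    3.666667
# 1    6.000000
# 2    4.000000
# dtype: float64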
