Share and manipulate multiple numpy arrays through multiprocessing - python

I'm trying to make use of multiprocessing to speed up my array-based calculations. General workflow is as follows:
I have three arrays:
id_array holds segment IDs, i.e. it marks which array elements belong together
class_array is a classified array (just integers representing class values from an image classification)
prob_array has the probability for these classes
Based on the segments I want to:
find the class majority within each segment
take the median of the probabilities within the segment, but only for the "pixels" that have the majority class
Here is my non-parallel example, which works fine:
import numpy as np
id_array = np.array([[1, 1, 2, 2, 2],
                     [1, 1, 2, 2, 4],
                     [3, 3, 4, 4, 4],
                     [3, 3, 4, 4, 4]])
class_array = np.array([[7, 7, 6, 8, 8],
                        [5, 7, 7, 8, 8],
                        [8, 8, 5, 5, 8],
                        [9, 9, 8, 7, 7]])
prob_array = np.array([[0.7, 0.3, 0.9, 0.5, 0.1],
                       [0.4, 0.6, 0.3, 0.5, 0.9],
                       [0.8, 0.6, 0.2, 0.2, 0.3],
                       [0.4, 0.4, 0.6, 0.3, 0.7]])
all_ids = np.unique(id_array)
dst_classes = np.zeros_like(class_array)
dst_probs = np.zeros_like(prob_array)
for my_id in all_ids:
    segment = np.where(id_array == my_id)
    class_data = class_array[segment]
    # get majority of classes within segment
    majority = np.bincount(class_data.flatten()).argmax()
    # get probabilities within segment
    prob_data = prob_array[segment]
    # get probabilities within segment where class equals majority
    majority_probs = prob_data[np.where(class_data == majority)]
    # get median of these probabilities
    median_prob = np.nanmedian(majority_probs)
    # write values
    dst_classes[segment] = majority
    dst_probs[segment] = median_prob
print(dst_classes)
print(dst_probs)
The problem is that my real data have something like 4 million segments and this then takes a week to compute. So I followed this tutorial and came up with this:
import numpy as np
import multiprocessing as mp
WORKER_DICT = dict()
NODATA = 0
def shared_array_from_np_array(data_array, init_value=None):
    raw_array = mp.RawArray(np.ctypeslib.as_ctypes_type(data_array.dtype), data_array.size)
    shared_array = np.frombuffer(raw_array, dtype=data_array.dtype).reshape(data_array.shape)
    if init_value:
        np.copyto(shared_array, np.full_like(data_array, init_value))
        return raw_array, shared_array
    else:
        np.copyto(shared_array, data_array)
        return raw_array, shared_array

def init_worker(id_array, class_array, prob_array, class_results, prob_results):
    WORKER_DICT['id_array'] = id_array
    WORKER_DICT['class_array'] = class_array
    WORKER_DICT['prob_array'] = prob_array
    WORKER_DICT['class_results'] = class_results
    WORKER_DICT['prob_results'] = prob_results
    WORKER_DICT['shape'] = id_array.shape
    mp.freeze_support()

def worker(id):
    id_array = WORKER_DICT['id_array']
    class_array = WORKER_DICT['class_array']
    prob_array = WORKER_DICT['prob_array']
    class_result = WORKER_DICT['class_results']
    prob_result = WORKER_DICT['prob_results']
    # array indices for "id"
    segment = np.where(id_array == id)
    # get data at these indices, mask nodata values
    class_data = np.ma.masked_equal(class_array[segment], NODATA)
    # get majority value
    majority_class = np.bincount(class_data.flatten()).argmax()
    # get probabilities
    probs = prob_array[segment]
    majority_probs = probs[np.where(class_array[segment] == majority_class)]
    med_majority_probs = np.nanmedian(majority_probs)
    class_result[segment] = majority_class
    prob_result[segment] = med_majority_probs
    return
if __name__ == '__main__':
    # segment IDs
    id_ra, id_array = shared_array_from_np_array(np.array(
        [[1, 1, 2, 2, 2],
         [1, 1, 2, 2, 4],
         [3, 3, 4, 4, 4],
         [3, 3, 4, 4, 4]]))
    # classification
    cl_ra, class_array = shared_array_from_np_array(np.array(
        [[7, 7, 6, 8, 8],
         [5, 7, 7, 8, 8],
         [8, 8, 5, 5, 8],
         [9, 9, 8, 7, 7]]))
    # probabilities
    pr_ra, prob_array = shared_array_from_np_array(np.array(
        [[0.7, 0.3, 0.9, 0.5, 0.1],
         [0.4, 0.6, 0.3, 0.5, 0.9],
         [0.8, 0.6, 0.2, 0.2, 0.3],
         [0.4, 0.4, 0.6, 0.3, 0.7]]))
    cl_res, class_results = shared_array_from_np_array(class_array, 0)
    pr_res, prob_results = shared_array_from_np_array(prob_array, 0.)
    unique_ids = np.unique(id_array)
    init_args = (id_ra, cl_ra, pr_ra, cl_res, pr_res, id_array.shape)
    with mp.Pool(processes=2, initializer=init_worker, initargs=init_args) as pool:
        pool.map_async(worker, unique_ids)
    print('Majorities:', cl_res)
    print('Probabilities:', pr_res)
But I do not see how I can now get my results and whether they are correct. I tried
np.frombuffer(cl_res)
np.frombuffer(pr_res)
but that gives me only 10 values for cl_res (there should be 20) and they seem completely random, while pr_res has the exact same values as prob_array.
I have tried making use of other examples around here, like this one, but can't get them to work either. That looks like a similar problem, but it already requires a lot of knowledge of how multiprocessing really works, and I don't have that (I'm a total beginner with multiprocessing).

Several things to fix:
You need to create the numpy arrays in init_worker(), which should also take a shape argument:
def init_worker(id_ra, cl_ra, pr_ra, cl_res, pr_res, shape):
    WORKER_DICT['id_array'] = np.ctypeslib.as_array(id_ra, shape)
    WORKER_DICT['class_array'] = np.ctypeslib.as_array(cl_ra, shape)
    WORKER_DICT['prob_array'] = np.ctypeslib.as_array(pr_ra, shape)
    WORKER_DICT['class_results'] = np.ctypeslib.as_array(cl_res, shape)
    WORKER_DICT['prob_results'] = np.ctypeslib.as_array(pr_res, shape)
You should check if init_value is not None instead of just init_value in shared_array_from_np_array(), as 0 evaluates to False.
mp.freeze_support() should only be called immediately after if __name__ == '__main__', as per its docs.
pool.map_async() returns an AsyncResult object that needs to be waited on; you probably want pool.map(), which blocks until the processing is done.
You can access the results directly in the main section with the class_results and prob_results arrays.
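Putting these fixes together, a minimal sketch of the corrected main section could look like this (untested; it keeps the question's worker() and shared_array_from_np_array() with the init_value is not None check, and uses the init_worker() shown above):
import numpy as np
import multiprocessing as mp

if __name__ == '__main__':
    mp.freeze_support()  # directly under the __main__ guard, per the docs
    id_ra, id_array = shared_array_from_np_array(np.array(
        [[1, 1, 2, 2, 2],
         [1, 1, 2, 2, 4],
         [3, 3, 4, 4, 4],
         [3, 3, 4, 4, 4]]))
    cl_ra, class_array = shared_array_from_np_array(np.array(
        [[7, 7, 6, 8, 8],
         [5, 7, 7, 8, 8],
         [8, 8, 5, 5, 8],
         [9, 9, 8, 7, 7]]))
    pr_ra, prob_array = shared_array_from_np_array(np.array(
        [[0.7, 0.3, 0.9, 0.5, 0.1],
         [0.4, 0.6, 0.3, 0.5, 0.9],
         [0.8, 0.6, 0.2, 0.2, 0.3],
         [0.4, 0.4, 0.6, 0.3, 0.7]]))
    cl_res, class_results = shared_array_from_np_array(class_array, 0)
    pr_res, prob_results = shared_array_from_np_array(prob_array, 0.)
    unique_ids = np.unique(id_array)
    # pass the raw buffers plus the shape; init_worker() wraps them as numpy arrays
    init_args = (id_ra, cl_ra, pr_ra, cl_res, pr_res, id_array.shape)
    with mp.Pool(processes=2, initializer=init_worker, initargs=init_args) as pool:
        pool.map(worker, unique_ids)  # map() blocks until every segment is processed
    # class_results and prob_results are numpy views on the shared buffers,
    # so the workers' output can be read from them directly
    print('Majorities:\n', class_results)
    print('Probabilities:\n', prob_results)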

Related

Index array using array of unique values

I have three arrays, such that:
Data_Arr = np.array([1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 5, 5, 5])
ID_Arr = np.array([1, 2, 3, 4, 5])
Value_Arr = np.array([0.1, 0.6, 0.3, 0.8, 0.2])
I want to create a new array which has the dimensions of Data, but where each element is taken from Values, using the index position in ID. So far I have this in a loop, but it's very slow as my Data array is very large:
out = np.zeros_like(Data_Arr, dtype=float)
for i in range(len(Data_Arr)):
    out[i] = Value_Arr[ID_Arr == Data_Arr[i]]
Is there a more pythonic way of doing this and avoiding this loop (doesn't have to use numpy)?
Actual data looks like:
Data_Arr = [ 852116 852116 852116 ... 1001816 1001816 1001816]
ID_Arr = [ 852116 852117 852118 ... 1001814 1001815 1001816]
Value_Arr = [1.5547194 1.5547196 1.5547197 ... 1.5536859 1.5536858 1.5536857]
shapes are:
Data_Arr = (4021165,)
ID_Arr = (149701,)
Value_Arr = (149701,)
Since ID_Arr is sorted, we can directly use np.searchsorted and index Value_Arr with the result:
Value_Arr[np.searchsorted(ID_Arr, Data_Arr)]
array([0.1, 0.1, 0.1, 0.6, 0.6, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.8, 0.8,
0.2, 0.2, 0.2])
If ID_Arr isn't sorted (note: in case there may be out-of-bounds indices, we should remove them, see Divakar's answer):
s_ind = ID_Arr.argsort()
ss = np.searchsorted(ID_Arr, Data_Arr, sorter=s_ind)
out = Value_Arr[s_ind[ss]]
Checking with the arrays suggested by alaniwi:
Data_Arr = np.array([1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 5, 5, 5])
ID_Arr = np.array([2, 1, 3, 4, 5])
Value_Arr = np.array([0.6, 0.1, 0.3, 0.8, 0.2])
out_op = np.zeros_like(Data_Arr, dtype=float)
for i in range(len(Data_Arr)):
    out_op[i] = Value_Arr[ID_Arr == Data_Arr[i]]
s_ind = ID_Arr.argsort()
ss = np.searchsorted(ID_Arr, Data_Arr, sorter=s_ind)
out_answer = Value_Arr[s_ind[ss]]
np.array_equal(out_op, out_answer)
#True
Based on the approaches from this post, here are the adaptations.
Approach #1
# https://stackoverflow.com/a/62658135/ #Divakar
a,b,invalid_specifier = ID_Arr, Data_Arr, 0
sidx = a.argsort()
idx = np.searchsorted(a,b,sorter=sidx)
# Remove out-of-bounds indices as they won't be matches
idx[idx==len(a)] = 0
# Get traced back indices corresponding to original version of a
idx0 = sidx[idx]
# Mask out invalid ones with invalid_specifier and return
out = np.where(a[idx0]==b, Value_Arr[idx0], invalid_specifier)
Approach #2
Lookup based -
# https://stackoverflow.com/a/62658135/ #Divakar
def find_indices_lookup(a, b, invalid_specifier=-1):
    # Setup array where we will assign ranged numbers
    N = max(a.max(), b.max()) + 1
    lookup = np.full(N, invalid_specifier)
    # We index into lookup with b to trace back the positions. Non-matching ones
    # keep the invalid_specifier value, as they were never indexed by the ranged numbers
    lookup[a] = np.arange(len(a))
    indices = lookup[b]
    return indices
idx = find_indices_lookup(ID_Arr, Data_Arr)
out = np.where(idx != -1, Value_Arr[idx], 0)
Faster/simpler variant
And a simplified and hopefully faster version would be a direct lookup of values -
a,b,invalid_specifier = ID_Arr, Data_Arr, 0
N = max(a.max(), b.max())+1
lookup = np.zeros(N, dtype=Value_Arr.dtype)
lookup[ID_Arr] = Value_Arr
out = lookup[Data_Arr]
If all values in Data_Arr are guaranteed to appear in ID_Arr, we can use np.empty in place of np.zeros for the array assignment and thus gain a further performance boost.
Looks like you want:
out = Value_Arr[ID_Arr[Data_Arr - 1] - 1]
Note that the - 1 terms are needed because Python/NumPy uses 0-based indexing.

get a list of unique items from Random.choices function

I have a method that uses the random package to generate a list with certain probabilities, for example:
import random
seed = 30
rand = random.Random(seed)
options_list = [1, 2, 3, 4, 5, 6]
prob_weights = [0.1, 0.2, 0.1, 0.05, 0.02, 0.06]
result = rand.choices(options_list, prob_weights, k=4)  # k will be <= len(options_list)
My problem is that result can hold two of the same item, and I want it to be unique.
I could make the k param much larger and then filter out the unique items, but that seems like the wrong way to do it. I looked in the docs and I don't see that the choices function takes this kind of parameter.
Any ideas how to configure random to return a list of unique items?
You can use np.random.choice, which allows you to assign probabilities associated with each entry and also to generate random samples without replacement. The probabilities, however, must add up to one, so you'll have to divide the vector by its L1 norm. So here's how you could do it:
import numpy as np
options_list = np.array([1, 2, 3, 4, 5, 6])
prob_weights = np.array([0.1, 0.2, 0.1, 0.05, 0.02, 0.06])
prob_weights_scaled = prob_weights / sum(prob_weights)
some_length = 4
np.random.choice(a=options_list, size=some_length, replace=False, p=prob_weights_scaled)
Output
array([2, 1, 6, 3])
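If you want to stay with the standard-library random module instead of pulling in numpy, another option is Efraimidis-Spirakis weighted sampling without replacement: draw a key u**(1/w) per item and keep the k items with the largest keys. A minimal sketch, reusing the question's names:
import random

seed = 30
rand = random.Random(seed)
options_list = [1, 2, 3, 4, 5, 6]
prob_weights = [0.1, 0.2, 0.1, 0.05, 0.02, 0.06]
k = 4

# one random key per item; heavier weights tend to produce larger keys
keyed = [(rand.random() ** (1.0 / w), opt) for opt, w in zip(options_list, prob_weights)]
result = [opt for _, opt in sorted(keyed, reverse=True)[:k]]
print(result)  # k distinct items, drawn without replacement according to the weights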

Creating cdata of type `REAL (* vertices)[DIM]` in CFFI

I'm trying to build a Python interface around some existing C code with CFFI. As is usual with C code tuned for performance, it is full of extensive macros and typedefs.
At the moment I am working on replicating the following struct:
#define DIM 3
typedef double REAL;
struct Object_structure {
    int numpoints;
    REAL (* vertices)[DIM];
    int * rings;
};
typedef struct Object_structure * Object;
The function I'm trying to call expects an argument of type Object.
REAL gjk_distance(
    Object obj1, REAL (* tr1)[DIM+1],
    Object obj2, REAL (* tr2)[DIM+1],
    REAL wpt1[DIM], REAL wpt2[DIM],
    struct simplex_point * simplex, int use_seed
);
I have written the following Python class to represent such an object/struct, but I'm having trouble converting it into the expected cdata object. (For now I am just considering a unit cube, but ultimately I'll want to generalize that.)
class Box:
    def __init__(self, pos):
        self._weakkeydict = weakref.WeakKeyDictionary()
        self.numpoints = 8
        self.rings = [
            8, 12, 16, 20, 24, 28, 32, 36,
            3, 1, 4, -1,
            0, 2, 5, -1,
            1, 3, 6, -1,
            2, 0, 7, -1,
            7, 5, 0, -1,
            4, 6, 1, -1,
            5, 7, 2, -1,
            6, 4, 3, -1]
        x, y, z = pos
        self.vertices = [
            [x+0, y+0, z+0],
            [x+1, y+0, z+0],
            [x+1, y+1, z+0],
            [x+0, y+1, z+0],
            [x+0, y+0, z+1],
            [x+1, y+0, z+1],
            [x+1, y+1, z+1],
            [x+0, y+1, z+1],
        ]

    @property
    def cdata(self):
        self._weakkeydict.clear()
        #ptr_numpoints = ffi.new("int", self.numpoints)
        ptr_rings = ffi.new("int[]", self.rings)
        vertices = [ffi.new("REAL[3]", v) for v in self.vertices]
        ptr_vertices = ffi.new("REAL *[3]", vertices)
        ptr_obj = ffi.new("Object", {
            'numpoints': self.numpoints,
            'rings': ptr_rings,
            'vertices': ptr_vertices})
        self._weakkeydict[ptr_obj] = (ptr_rings, ptr_vertices, vertices)
        return ptr_obj
With the above I'm getting IndexError: too many initializers for 'double *[3]' (got 8) in line ptr_vertices = ffi.new("REAL *[3]", vertices ) when calling:
box1 = Box((0,0,0))
box2 = Box((10,0,0))
d = lib.gjk_distance(
    [box1.cdata], ffi.NULL,
    [box2.cdata], ffi.NULL,
    ffi.NULL, ffi.NULL,
    ffi.NULL, 0)
To me it seems as if the dimensions got switched somehow, since it should be an 8-element array with 3-element items.
I'm hoping someone can point me in the right direction here.
If you are creating a single item, use ffi.new('REAL(*)[3]', [1, 2, 3]). The parentheses around the * are important.
In C, the type REAL(*)[3] means a pointer to an array (size=3) of REAL, while REAL*[3] means an array (size=3) of pointers to REAL. See C pointer to array/array of pointers disambiguation for details.
Since you are creating an array of items here, CFFI expects an array type instead, as you have already discovered. Compare:
ffi.new('int*', 1) # ok
ffi.new('int[]', 1) # wrong
ffi.new('int*', [1, 2, 3]) # wrong
ffi.new('int[]', [1, 2, 3]) # ok
ffi.new('REAL(*)[3]', [0.1, 0.2, 0.3]) # ok
ffi.new('REAL[][3]', [0.1, 0.2, 0.3]) # wrong
ffi.new('REAL(*)[3]', [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) # wrong
ffi.new('REAL[][3]', [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) # ok
Apparently ptr_vertices = ffi.new("REAL[][3]", self.vertices) is the way to go. For now it appears to work.
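Following that, a minimal sketch of how the cdata property could look with the REAL[][3] allocation (untested; it reuses the question's ffi object, the Object typedef and the weak-key bookkeeping, and keeps the rest of the Box class as above):
    @property
    def cdata(self):
        self._weakkeydict.clear()
        ptr_rings = ffi.new("int[]", self.rings)
        # one contiguous 8-by-3 allocation; it decays to REAL(*)[3], matching
        # the struct field REAL (* vertices)[DIM]
        ptr_vertices = ffi.new("REAL[][3]", self.vertices)
        ptr_obj = ffi.new("Object", {
            'numpoints': self.numpoints,
            'rings': ptr_rings,
            'vertices': ptr_vertices})
        # keep the inner allocations alive for as long as ptr_obj is alive
        self._weakkeydict[ptr_obj] = (ptr_rings, ptr_vertices)
        return ptr_obj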

Interpolating a 2D data grid in python

I have a 2D grid with radioactive beta-decay rates. Each value corresponds to a rate at a specific pair of temperature and density (both on a logarithmic scale). What I would like to do is, given a temperature and density pair (after taking their logarithms), find the matching value in the table. I tried using the scipy interpolate interpn function, but I got a little confused; I would be grateful for help.
What I have so far:
import numpy as np

pointsx = np.array([7+0.2*i for i in range(0,16)])  # temperature range
pointsy = np.array([i for i in range(0,11)])  # rho_el range
data = np.loadtxt(filename)  # getting data from file
logT = np.log10(T)  # wanted temperature, logarithmic
logrho = np.log10(rho)  # wanted rho, logarithmic
The interpn function has the following arguments: points, values, xi, method='linear', bounds_error=True, fill_value=nan. I figure that points will be the pointsx and pointsy I have, values is quite obviously the data, and xi will be the (T, rho) pair I'm looking for. But I'm not sure what dimensions they should have. Is points the same size as the data? Do I have to make an array of the corresponding pairs of T and rho, which will be the points part, and then have a (T, rho) pair as xi?
When you aren't certain about how a function works, it's always a good idea to open up a REPL and test it yourself. In this case, the function works exactly as expected, given your understanding of the documentation.
>>> points = [[1, 2, 3, 4], [1, 2, 3, 4]] # Input values for each grid dimension
>>> values = [[1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6], [4, 5, 6, 7]] # The grid itself
>>> xi = (1, 1.5)
>>> scipy.interpolate.interpn(points, values, xi)
array([ 1.5])
>>> xi = [[1, 1.5], [2, 1.5], [2, 2.5], [3, 2.5], [3, 3.5], [4, 3.5]]
>>> scipy.interpolate.interpn(points, values, xi)
array([ 1.5, 2.5, 3.5, 4.5, 5.5, 6.5])
The only thing you missed was that points is supposed to be a tuple. But as you can see from the above, it works even if points isn't a tuple.
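Applied to the question's variables, a minimal sketch might look as follows; it assumes the table in filename loads with shape (len(pointsx), len(pointsy)), i.e. temperature along axis 0 and rho_el along axis 1:
import numpy as np
from scipy.interpolate import interpn

pointsx = np.array([7 + 0.2*i for i in range(0, 16)])  # log10 temperature grid
pointsy = np.array([float(i) for i in range(0, 11)])   # log10 rho_el grid
data = np.loadtxt(filename)  # rate table from the question, shape (16, 11) assumed

logT = np.log10(T)      # wanted temperature (log10), from the question
logrho = np.log10(rho)  # wanted density (log10), from the question

# points is a tuple of the 1D grid coordinates, xi is the query point
rate = interpn((pointsx, pointsy), data, (logT, logrho))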

Triangular mesh queries in Python

I am looking for a Python library which would support mesh queries. For now, I have looked at openmesh, but I am a bit afraid that it would be overkill for my small master's thesis project. The features which I need are:
iterate over the vertices around a given vertex
iterate over all edges, faces, vertices
easily associate function values with each vertex, face, edge (I picture that these geometric entities are indexed)
And if I am really successful, I might also need to:
change the topology of the mesh, like adding or removing a vertex
Is it possible to do this with numpy so I could keep my dependency list small? For now I plan that the initial mesh will be generated with distmesh (pydistmesh). Does it have parts which could be useful for my mesh queries?
These kinds of queries become quite easy and efficient with the improved face-based data structure which is used by CGAL. Here I have implemented code to walk around one specific vertex:
# The demonstration of improved face based data structure
from numpy import array
triangles = array([[ 5,  7, 10],
                   [ 7,  5,  6],
                   [ 4,  0,  3],
                   [ 0,  4,  6],
                   [ 4,  7,  6],
                   [ 4,  9, 10],
                   [ 7,  4, 10],
                   [ 0,  2,  1],
                   [ 2,  0,  6],
                   [ 2,  5,  1],
                   [ 5,  2,  6],
                   [ 8,  4,  3],
                   [ 4, 11,  9],
                   [ 8, 11,  4],
                   [ 9, 11,  3],
                   [11,  8,  3]], dtype=int)
points = array([[ 0.95448092, 0.45655774],
                [ 0.86370317, 0.02141752],
                [ 0.53821089, 0.16915935],
                [ 0.97218064, 0.72769053],
                [ 0.55030382, 0.70878147],
                [ 0.34692982, 0.08765148],
                [ 0.46289581, 0.29827649],
                [ 0.21159925, 0.39472549],
                [ 0.61679844, 0.79488884],
                [ 0.4272861 , 0.93375762],
                [ 0.12451604, 0.54267654],
                [ 0.45974728, 0.91139648]])
import pylab as plt
fig = plt.figure()
plt.triplot(points[:,0], points[:,1], triangles)
for i, tri in enumerate(triangles):
    v1, v2, v3 = points[tri]
    vavg = (v1 + v2 + v3) / 3
    plt.text(vavg[0], vavg[1], i)
#plt.show()
## constructing improved face based data structure
def edge_search(v1, v2, skip):
    """
    Which triangle has the edge with vertices v1 and v2 and isn't triangle <skip>?
    """
    neigh = -1
    for i, tri in enumerate(triangles):
        if (v1 in tri) and (v2 in tri):
            if i == skip:
                continue
            else:
                neigh = i
                break
    return neigh

def triangle_search(vertex):
    """
    For a given vertex index, return any triangle from its neighborhood
    """
    for i, tri in enumerate(triangles):
        if vertex in tri:
            return i
neighberhood = []
for i, tri in enumerate(triangles):
    v1, v2, v3 = tri
    t3 = edge_search(v1, v2, i)
    t1 = edge_search(v2, v3, i)
    t2 = edge_search(v3, v1, i)
    neighberhood.append([t1, t2, t3])
neighberhood = array(neighberhood, dtype=int)

faces = []
for vi, _ in enumerate(points):
    faces.append(triangle_search(vi))
## Now walking over the first ring can be implemented
def triangle_ring(vertex):
    tri_start = faces[vertex]
    tri = tri_start
    ## with the assumption that vertex is not on the boundary
    for i in range(10):
        yield tri
        boolindx = triangles[tri] == vertex
        # permutating to next and previous vertex
        w = boolindx[[0, 1, 2]]
        cw = boolindx[[2, 0, 1]]
        ccw = boolindx[[1, 2, 0]]
        ct = neighberhood[tri][cw][0]
        if ct == tri_start:
            break
        else:
            tri = ct

for i in triangle_ring(6):
    print(i)
## Using it for drawing lines on plot
vertex = 6
ring_points = []
for i in triangle_ring(vertex):
    vi = triangles[i]
    cw = (vi == vertex)[[2, 0, 1]]
    print("v={}".format(vi[cw][0]))
    ring_points.append(vi[cw][0])
data = array([points[i] for i in ring_points])
plt.plot(data[:,0], data[:,1], "ro")
#plt.savefig("topology.png")
plt.show()
input("Press Enter to continue...")
plt.close("all")
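With this plain-array representation, the question's other bullet, associating function values with vertices, faces and edges, amounts to keeping parallel numpy arrays (or a dict keyed by vertex pairs for edges) indexed by the same integer IDs. A minimal sketch of that idea, reusing points, triangles and faces from above:
import numpy as np

# one value per vertex and one per triangle, indexed by the same IDs
# used in points and triangles above
vertex_values = np.zeros(len(points))
face_values = np.zeros(len(triangles))

vertex_values[6] = 1.0        # value attached to vertex 6
face_values[faces[6]] = 2.5   # value attached to one triangle incident to vertex 6

# an edge can be keyed by its sorted vertex pair
edge_values = {}
for tri in triangles:
    for a, b in ((tri[0], tri[1]), (tri[1], tri[2]), (tri[2], tri[0])):
        edge_values[tuple(sorted((a, b)))] = 0.0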
