Compute Non Zero Entries of a Matrix Faster in Python

I would like to make three vectors from a matrix that summarize its non-zero values: a vector of values, a vector of row indexes, and a vector of column indexes.
For example if W = [[ 0. 2. 0.], [ 0. 10. 0.], [ 0. 0. 5.]].
I would like the function to return ([2.0, 10.0, 5.0], [0, 1, 2], [1, 1, 2]).
The code below does the job but is too slow for large matrices; I am working with n on the order of 100,000, and I do not know in advance which indexes are non-zero. Is there a way to speed this up?
import numpy as np

def nonZeroIndexes(W):
    nRows, nColumns = W.shape
    values = []
    row_indexes = []
    column_indexes = []
    for r in xrange(nRows):
        for c in xrange(nColumns):
            if W[r, c] != 0:
                values.append(W[r, c])
                row_indexes.append(r)
                column_indexes.append(c)
    return values, row_indexes, column_indexes
n = 3
W = np.zeros((n,n))
W[0,1] = 2
W[1,1] = 10
W[2,2] = 5
vecs = nonZeroIndexes(W)

Use np.nonzero
>>> import numpy as np
>>> W = np.array([[0, 2, 0], [0, 10, 0], [0, 0, 5]])
>>>
>>> def nonZeroIndexes(W):
...     zero_pos = np.nonzero(W)
...     return (W[zero_pos],) + zero_pos
...
>>>
>>> nonZeroIndexes(W)
(array([ 2, 10, 5]), array([0, 1, 2]), array([1, 1, 2]))
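Alternatively, if W is large and mostly zero, scipy's sparse COO format stores exactly these three vectors; a minimal sketch, assuming scipy is installed:
from scipy.sparse import coo_matrix

Wc = coo_matrix(W)                       # builds the (data, row, col) triplet directly
values, rows, cols = Wc.data, Wc.row, Wc.col
# values -> [ 2 10  5], rows -> [0 1 2], cols -> [1 1 2]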

Related

Map function along arbitrary dimension with pytorch? numpy?

I wonder why I can find no utility to map custom pytorch or numpy transformations along arbitrary dimensions of complicated tensors/arrays/matrices.
I think I remember that such a thing was available in R. With this fantasy tch.map utility you could do:
>>> import torch as tch # or numpy
>>> # one torch tensor
>>> a = tch.tensor([0, 1, 2, 3, 4])
>>> # one torch function (dummy) returning 2 values
>>> f = lambda x: tch.tensor((x + 1, x * 2))
>>> # map f along dimension 0 of a, expecting 2 outputs
>>> res = tch.map(f, a, 0, 2) # fantasy, optimized on CPU/GPU..
>>> res
tensor([[1, 0],
        [2, 2],
        [3, 4],
        [4, 6],
        [5, 8]])
>>> res.shape
torch.Size([5, 2])
>>> # another tensor
>>> a = tch.tensor(list(range(24))).reshape(2, 3, 4).type(tch.double)
>>> # another function (dummy) returning 2 values
>>> f = lambda x: tch.tensor((tch.mean(x), tch.std(x)))
>>> # map f along dimension 2 of a, expecting 2 outputs
>>> res = tch.map(f, a, 2, 2) # fantasy, optimized on CPU/GPU..
>>> res
tensor([[[ 1.5000,  1.2910],
         [ 5.5000,  1.2910],
         [ 9.5000,  1.2910]],

        [[13.5000,  1.2910],
         [17.5000,  1.2910],
         [21.5000,  1.2910]]])
>>> res.shape
torch.Size([2, 3, 2])
>>> # yet another tensor
>>> a = tch.tensor(list(range(12))).reshape(3, 4)
>>> # another function (dummy) returning 2x2 values
>>> f = lambda x: x + tch.rand(2, 2)
>>> # map f along all values of a, expecting 2x2 outputs
>>> res = tch.map(f, a, -1, (2, 2)) # fantasy, optimized on CPU/GPU..
>>> print(res)
tensor([[[[ 0.4827,  0.3043],
          [ 0.8619,  0.0505]],
         [[ 1.4670,  1.5715],
          [ 1.1270,  1.7752]],
         [[ 2.9364,  2.0268],
          [ 2.2420,  2.1239]],
         [[ 3.9343,  3.6059],
          [ 3.3736,  3.5178]]],

        [[[ 4.2063,  4.9981],
          [ 4.3817,  4.4109]],
         [[ 5.3864,  5.3826],
          [ 5.3614,  5.1666]],
         [[ 6.6926,  6.2469],
          [ 6.7888,  6.6803]],
         [[ 7.2493,  7.5727],
          [ 7.6129,  7.1039]]],

        [[[ 8.3171,  8.9037],
          [ 8.0520,  8.9587]],
         [[ 9.5006,  9.1297],
          [ 9.2620,  9.8371]],
         [[10.4955, 10.5853],
          [10.9939, 10.0271]],
         [[11.3905, 11.9326],
          [11.9376, 11.6408]]]])
>>> res.shape
torch.Size([3, 4, 2, 2])
Instead, I keep finding myself messing around with complicated tch.stack, tch.squeeze, tch.reshape, tch.permute, etc., counting dimensions on my fingers not to get lost.
Does such a utility exist and I have missed it for some reason?
Is such a utility impossible to implement for some reason?
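For what it's worth, numpy's np.apply_along_axis comes close to this fantasy map for functions of 1-D slices, although it is implemented as a plain Python loop rather than an optimized CPU/GPU kernel. A sketch of the second example above with it (my own; ddof=1 matches torch's default unbiased std):
import numpy as np

a = np.arange(24, dtype=float).reshape(2, 3, 4)
# map a 2-output function over the 1-D slices along dimension 2
f = lambda x: np.array([x.mean(), x.std(ddof=1)])
res = np.apply_along_axis(f, 2, a)
print(res.shape)  # (2, 3, 2), matching the fantasy tch.map(f, a, 2, 2)
# res[..., 0] holds the means (1.5, 5.5, ...), res[..., 1] the stds (1.2910)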

Numpy Dot product with nested array

I'm trying to come up with a method to perform load combinations and transient load patterning for structural/civil engineering applications.
Without patterning it's fairly simple:
list of load results = [[d],[t1],...,[ti]], where [ti] = transient load result as a numpy array = A
list of combos = [[1,0,...,0],[0,1,...,1],[dfi, tf1,...,tfi]], where tfi = code load factor for transient load = B
In python this works as numpy.dot(A,B).
So my issue arises where:
`list of load results = [[d],[t1],.....[ti]]`, where [t1] = [[t11]......[t1i]] for i pattern possibilities and [t1i] = numpy array
So I have a nested array within another array and want to multiply by a matrix of load combinations. Is there a way to implement this in one matrix operation? I can come up with a method by looping the pattern possibilities and then taking a dot product with the load combos, but this is computationally expensive. Any thoughts?
Thanks
for an example not considering patterning see: https://github.com/buddyd16/Structural-Engineering/blob/master/Analysis/load_combo_test.py
Essentially I need a method that gives similar results assuming that, for loads = np.array([[D],[Ex],[Ey],[F],[H],[L],[Lr],[R],[S],[Wx],[Wy]]), [L], [Lr], [R] and [S] are actually nested arrays, i.e. if D is a 1x500 array/vector, then L, Lr, R, or S could be a 100x500 array.
My simple solution is:
combined_pattern = []
for pattern in load_patterns:
    loads = np.array([[D],[Ex],[Ey],[F],[H],[L[pattern]],[Lr[pattern]],[R[pattern]],[S[pattern]],[Wx],[Wy]])
    combined_pattern.append(np.dot(basic_factors, loads))
Simpler Example:
import numpy as np
#Simple
A = np.array([1,0,0])
B = np.array([0,1,0])
C = np.array([0,0,1])
Loads = np.array([A,B,C])
Factors = np.array([[1,1,1],[0.5,0.5,0.5],[0.25,0.25,0.25]])
result = np.dot(Factors, Loads)
# Looking for a faster way to accomplish the below operation
# this works but will be slow for large data sets
# bi can be up to 1x5000 in size and i can be up to 500
A = np.array([1,0,0])
b1 = np.array([1,0,0])
b2 = np.array([0,1,0])
b3 = np.array([0,0,1])
B = np.array([b1,b2,b3])
C = np.array([0,0,1])
result_list = []
for load in B:
    Loads = np.array([A,load,C])
    Factors = np.array([[1,1,1],[0.5,0.5,0.5],[0.25,0.25,0.25]])
    result = np.dot(Factors, Loads)
    result_list.append(result)
edit: Had Factors and Loads reversed in the np.dot().
In your simple example, the array shapes are:
In [2]: A.shape
Out[2]: (3,)
In [3]: Loads.shape
Out[3]: (3, 3)
In [4]: Factors.shape
Out[4]: (3, 3)
In [5]: result.shape
Out[5]: (3, 3)
The rule in dot is that the last dimension of Loads pairs with the second-to-last dimension of Factors:
result = np.dot(Loads,Factors)
(3,3) dot (3,3) => (3,3) # 3's in common
(m,n) dot (n,l) => (m,l) # n's in common
In the iteration, A,load and C are all (3,) and Loads is (3,3).
result_list is a list of 3 (3,3) arrays, and np.array(result_list) would be (3,3,3).
Let's make a 3d array of all the Loads:
In [16]: Bloads = np.array([np.array([A,load,C]) for load in B])
In [17]: Bloads.shape
Out[17]: (3, 3, 3)
In [18]: Bloads
Out[18]:
array([[[1, 0, 0],
        [1, 0, 0],
        [0, 0, 1]],

       [[1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]],

       [[1, 0, 0],
        [0, 0, 1],
        [0, 0, 1]]])
I can easily do a dot of this Bloads and Factors with einsum:
In [19]: np.einsum('lkm,mn->lkn', Bloads, Factors)
Out[19]:
array([[[1.  , 1.  , 1.  ],
        [1.  , 1.  , 1.  ],
        [0.25, 0.25, 0.25]],

       [[1.  , 1.  , 1.  ],
        [0.5 , 0.5 , 0.5 ],
        [0.25, 0.25, 0.25]],

       [[1.  , 1.  , 1.  ],
        [0.25, 0.25, 0.25],
        [0.25, 0.25, 0.25]]])
einsum isn't the only way, but it's the easiest way (for me) to keep track of dimensions.
It's even easier to keep dimensions straight if they differ. Here they are all 3, so it's hard to keep them separate. But if B was (5,4) and Factors (4,2), then Bloads would be (5,3,4), and the einsum result (5,3,2) (the size 4 dropping out in the dot).
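A quick shape check of that claim, with random data (shapes only; my own illustration):
import numpy as np

A = np.random.random(4)
C = np.random.random(4)
B = np.random.random((5, 4))                        # 5 patterns, each of length 4
Factors = np.random.random((4, 2))
Bloads = np.array([[A, load, C] for load in B])     # -> (5, 3, 4)
res = np.einsum('lkm,mn->lkn', Bloads, Factors)     # -> (5, 3, 2)
print(Bloads.shape, res.shape)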
Constructing Bloads without a loop is a bit trickier, since the rows of B are interleaved with A and C.
In [38]: np.stack((A[None,:].repeat(3,0),B,C[None,:].repeat(3,0)),1)
Out[38]:
array([[[1, 0, 0],
        [1, 0, 0],
        [0, 0, 1]],

       [[1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]],

       [[1, 0, 0],
        [0, 0, 1],
        [0, 0, 1]]])
To understand this, test the subexpressions, e.g. A[None,:], the repeat, etc.
Equivalently:
np.array((A[None,:].repeat(3,0),B,C[None,:].repeat(3,0))).transpose(1,0,2)
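Worth noting: np.matmul (the @ operator) broadcasts over leading dimensions, so the einsum above can also be written as a batched matrix product:
res = Bloads @ Factors    # (3, 3, 3) @ (3, 3) -> (3, 3, 3)
# identical to np.einsum('lkm,mn->lkn', Bloads, Factors)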

Replace values in subarray based upon dynamic condition in Numpy

I have a Python Numpy array that is a 2D array where the second dimension is a subarray of 3 elements of integers. For example:
[ [2, 3, 4], [9, 8, 7], ... [15, 14, 16] ]
For each subarray I want to replace the lowest number with a 1 and all other numbers with a 0. So the desired output from the above example would be:
[ [1, 0, 0], [0, 0, 1], ... [0, 1, 0] ]
This is a large array, so I want to exploit Numpy performance. I know about using conditions to operate on array elements, but how do I do this when the condition is dynamic? In this instance the condition needs to be something like:
newarray = (a == min(a)).astype(int)
But how do I do this across each subarray?
You can specify the axis parameter to calculate a 2D array of row minima (keeping the dimensions of the result with keepdims); then when you do a == a.min(1, keepdims=True), you get True at the minimum position of each subarray:
(a == a.min(1, keepdims=True)).astype(int)
#array([[1, 0, 0],
# [0, 0, 1],
# [0, 1, 0]])
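Note that keepdims=True is doing real work here: a.min(1) has shape (n,), so a == a.min(1) broadcasts along the wrong axis, comparing a[i, j] against the minimum of row j (or raising an error when the array is not square). A small check, my own:
a = np.array([[2, 3, 4], [9, 8, 7], [15, 14, 16]])
print((a == a.min(1, keepdims=True)).astype(int))
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]]
print((a == a.min(1)).astype(int))   # silently wrong for square input
# [[1 0 0]
#  [0 0 0]
#  [0 1 0]]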
How about this?
import numpy as np
a = np.random.random((4,3))
i = np.argmin(a, axis=-1)
out = np.zeros(a.shape, int)
out[np.arange(out.shape[0]), i] = 1
print(a)
print(out)
Sample output:
# [[ 0.58321885  0.18757452  0.92700724]
#  [ 0.58082897  0.12929637  0.96686648]
#  [ 0.26037634  0.55997658  0.29486454]
#  [ 0.60398426  0.72253012  0.22812904]]
# [[0 1 0]
#  [0 1 0]
#  [1 0 0]
#  [0 0 1]]
It appears to be marginally faster than the direct approach:
from timeit import timeit
def dense():
    return (a == a.min(1, keepdims=True)).astype(int)

def sparse():
    i = np.argmin(a, axis=-1)
    out = np.zeros(a.shape, int)
    out[np.arange(out.shape[0]), i] = 1
    return out

for shp in ((4,3), (10000,3), (100,10), (100000,1000)):
    a = np.random.random(shp)
    d = timeit(dense, number=40)/40
    s = timeit(sparse, number=40)/40
    print('shape, dense, sparse, ratio',
          '({:6d},{:6d}) {:9.6g} {:9.6g} {:9.6g}'.format(*shp, d, s, d/s))
Sample run:
# shape, dense, sparse, ratio ( 4, 3) 4.22172e-06 3.1274e-06 1.34992
# shape, dense, sparse, ratio ( 10000, 3) 0.000332396 0.000245348 1.35479
# shape, dense, sparse, ratio ( 100, 10) 9.8944e-06 5.63165e-06 1.75693
# shape, dense, sparse, ratio (100000, 1000) 0.344177 0.189913 1.81229
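One subtlety the timings don't show: the two approaches differ whenever a row ties for its minimum. The dense comparison marks every occurrence of the minimum, while argmin marks only the first:
a = np.array([[1, 1, 2]])
print((a == a.min(1, keepdims=True)).astype(int))   # [[1 1 0]] -- all tied minima
i = np.argmin(a, axis=-1)
out = np.zeros(a.shape, int)
out[np.arange(out.shape[0]), i] = 1
print(out)                                          # [[1 0 0]] -- first minimum only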

Calculate percentage of count for a list of arrays

Simple problem, but I cannot seem to get it to work. I want to calculate the percentage a number occurs in a list of arrays and output this percentage accordingly.
I have a list of arrays which looks like this:
import numpy as np
# Create some data
listvalues = []
arr1 = np.array([0, 0, 2])
arr2 = np.array([1, 1, 2, 2])
arr3 = np.array([0, 2, 2])
listvalues.append(arr1)
listvalues.append(arr2)
listvalues.append(arr3)
listvalues
>[array([0, 0, 2]), array([1, 1, 2, 2]), array([0, 2, 2])]
Now I count the occurrences using collections, which returns a list of collections.Counter:
import collections
counter = []
for i in xrange(len(listvalues)):
    counter.append(collections.Counter(listvalues[i]))
counter
>[Counter({0: 2, 2: 1}), Counter({1: 2, 2: 2}), Counter({0: 1, 2: 2})]
The result I am looking for is an array with 3 columns, representing the values 0 to 2, and len(listvalues) rows. Each cell should be filled with the percentage of that value occurring in the array:
# Result
66.66 0 33.33
0 50 50
33.33 0 66.66
So 0 occurs 66.66% in array 1, 0% in array 2 and 33.33% in array 3, and so on..
What would be the best way to achieve this?
Many thanks!
Here's an approach -
# Get lengths of each element in input list
lens = np.array([len(item) for item in listvalues])
# Form group ID array to ID elements in flattened listvalues
ID_arr = np.repeat(np.arange(len(lens)),lens)
# Extract all values and count them, treating each (group ID, value) pair as one flat index
vals = np.concatenate(listvalues)
out_shp = [ID_arr.max()+1,vals.max()+1]
counts = np.bincount(ID_arr*out_shp[1] + vals)
# Finally get the percentages by dividing by the group lengths
out = 100*np.true_divide(counts.reshape(out_shp),lens[:,None])
Sample run with an additional fourth array in input list -
In [316]: listvalues
Out[316]: [array([0, 0, 2]), array([1, 1, 2, 2]), array([0, 2, 2]), array([4, 0, 1])]

In [317]: print out
[[ 66.66666667   0.          33.33333333   0.           0.        ]
 [  0.          50.          50.           0.           0.        ]
 [ 33.33333333   0.          66.66666667   0.           0.        ]
 [ 33.33333333  33.33333333   0.           0.          33.33333333]]
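If the list of arrays is short, a simpler loop-based sketch gives the same table, assuming (as in the question) that the values are small non-negative integers:
import numpy as np

n_vals = max(a.max() for a in listvalues) + 1   # one column per distinct value
out = np.vstack([100.0 * np.bincount(a, minlength=n_vals) / len(a)
                 for a in listvalues])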
The numpy_indexed package has a utility function for this, called count_table, which can be used to solve your problem efficiently as such:
import numpy_indexed as npi
arrs = [arr1, arr2, arr3]
idx = [np.ones(len(a))*i for i, a in enumerate(arrs)]
(rows, cols), table = npi.count_table(np.concatenate(idx), np.concatenate(arrs))
table = table / table.sum(axis=1, keepdims=True)
print(table * 100)
You can get a list of all values and then simply iterate over the individual arrays to get the percentages:
values = set([y for row in listvalues for y in row])
print [[(a==x).sum()*100.0/len(a) for x in values] for a in listvalues]
You can create a list with the percentages with the following code:
percentage_list = [((counter[i].get(j) if counter[i].get(j) else 0) * 10000)
                   // len(listvalues[i]) / 100.0
                   for i in range(len(listvalues)) for j in range(3)]
After that, create a np array from that list:
results = np.array(percentage_list)
Reshape it so we have the desired result:
results = results.reshape(3,3)
This should allow you to get what you wanted.
This is most likely not efficient, and not the best way to do this, but it has the merit of working.
Do not hesitate if you have any question.
I would like to use a functional paradigm to solve this problem. For example:
>>> import numpy as np
>>> import pprint
>>>
>>> arr1 = np.array([0, 0, 2])
>>> arr2 = np.array([1, 1, 2, 2])
>>> arr3 = np.array([0, 2, 2])
>>>
>>> arrays = (arr1, arr2, arr3)
>>>
>>> u = np.unique(np.hstack(arrays))
>>>
>>> result = [[1.0 * c.get(uk, 0) / l
...            for l, c in ((len(arr), dict(zip(*np.unique(arr, return_counts=True))))
...                         for arr in arrays)] for uk in u]
>>>
>>> pprint.pprint(result)
[[0.6666666666666666, 0.0, 0.3333333333333333],
[0.0, 0.5, 0.0],
[0.3333333333333333, 0.5, 0.6666666666666666]]

How can I obtain the same 'special' solutions to underdetermined linear systems that Matlab's `A \ b` (mldivide) operator returns using numpy/scipy?

I found a link showing by example that the Matlab mldivide operator (\) gives 'special' solutions when the system of linear equations has infinitely many solutions.
For example:
A = [1 2 0; 0 4 3];
b = [8; 18];
c_mldivide = A \ b
c_pinv = pinv(A) * b
gives the output:
c_mldivide =
   0
   4
   0.66666666666667

c_pinv =
   0.918032786885245
   3.54098360655738
   1.27868852459016
The solution is 'special' in the sense that the number of non-zero entries in the solution c_mldivide is equal to rank(A) (in this case 2). I tried the same thing in numpy using numpy.linalg.lstsq, which gives an identical result to c_pinv.
Is there a way to achieve the c_mldivide solution in Python?
There was another very similar question here, but I suppose the explanation of the word 'special' was not clear enough.
Another question asked about the internal workings of the mldivide operator, but the accepted answer doesn't seem to address this behavior.
Edit 1 : numpy code
In [149]: test_A = np.array([[1,2,0],[0,4,3]])
     ...: test_b = np.array([[8],[18]])
     ...: np.linalg.lstsq(test_A,test_b)
Out[149]:
(array([[ 0.918 ],
        [ 3.541 ],
        [ 1.2787]]), array([], dtype=float64), 2, array([ 5.2732,  1.4811]))
Edit 2 : Using scipy.optimize.nnls
In [189]: from scipy.optimize import nnls
     ...: nnls(test_A,test_b)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-165-19ed603bd86c> in <module>()
      1 from scipy.optimize import nnls
      2
----> 3 nnls(test_A,test_b)

C:\Users\abhishek\Anaconda\lib\site-packages\scipy\optimize\nnls.py in nnls(A, b)
     43         raise ValueError("expected matrix")
     44     if len(b.shape) != 1:
---> 45         raise ValueError("expected vector")
     46
     47     m, n = A.shape

ValueError: expected vector
Non-negative least squares (scipy.optimize.nnls) is not a general solution to this problem. A trivial case where it will fail is if all of the possible solutions contain negative coefficients:
import numpy as np
from scipy.optimize import nnls
A = np.array([[1, 2, 0],
              [0, 4, 3]])
b = np.array([-1, -2])
print(nnls(A, b))
# (array([ 0., 0., 0.]), 2.23606797749979)
In the case where A·x = b is underdetermined,
x1, res, rnk, s = np.linalg.lstsq(A, b)
will pick a solution x' that minimizes ‖x‖₂ subject to ‖A·x − b‖₂ = 0. This happens not to be the particular solution we are looking for, but we can linearly transform it to get what we want. In order to do that, we'll first compute the right null space of A, which characterizes the space of all possible solutions to A·x = b. We can get this using a rank-revealing QR decomposition:
from scipy.linalg import qr
def qr_null(A, tol=None):
    Q, R, P = qr(A.T, mode='full', pivoting=True)
    tol = np.finfo(R.dtype).eps if tol is None else tol
    rnk = min(A.shape) - np.abs(np.diag(R))[::-1].searchsorted(tol)
    return Q[:, rnk:].conj()
Z = qr_null(A)
Z is a vector (or, in the case where n − rnk(A) > 1, a set of basis vectors spanning the null space of A) such that A·Z = 0:
print(A.dot(Z))
# [[ 0.00000000e+00]
# [ 8.88178420e-16]]
In other words, the column(s) of Z are vectors that are orthogonal to all of the rows in A. This means that for any solution x to A·x = b, x' = x + Z·c must also be a solution for any arbitrary scaling factor c. Thus, by picking an appropriate value of c, we can set any n − rnk(A) of the coefficients in the solution to zero.
For example, let's say we wanted to set the value of the last coefficient to zero:
c = -x1[-1] / Z[-1, 0]
x2 = x1 + Z * c
print(x2)
# [ -8.32667268e-17 -5.00000000e-01 0.00000000e+00]
print(A.dot(x2))
# [-1. -2.]
The more general case where n − rnk(A) > 1 is a little bit more complicated:
A = np.array([[1, 4, 9, 6, 9, 2, 7],
              [6, 3, 8, 5, 2, 7, 6],
              [7, 4, 5, 7, 6, 3, 2],
              [5, 2, 7, 4, 7, 5, 4],
              [9, 3, 8, 6, 7, 3, 1]])
x_exact = np.array([ 1, 2, -1, -2, 5, 0, 0])
b = A.dot(x_exact)
print(b)
# [33, 4, 26, 29, 30]
We get x' and Z as before:
x1, res, rnk, s = np.linalg.lstsq(A, b)
Z = qr_null(A)
Now in order to maximise the number of zero-valued coefficients in the solution vector, we want to find a vector C such that
x' = x1 + Z·C = [x'_0, x'_1, ..., x'_{rnk(A)−1}, 0, ..., 0]^T
If the last n − rnk(A) coefficients in x' are to be zeros, this imposes that
Z[rnk(A):]·C = −x1[rnk(A):]
We can thus solve for C (exactly, since we know that Z[rnk:] must be full-rank):
C = np.linalg.solve(Z[rnk:], -x1[rnk:])
and compute x' :
x2 = x1 + Z.dot(C)
print(x2)
# [  1.00000000e+00   2.00000000e+00  -1.00000000e+00  -2.00000000e+00
#    5.00000000e+00   5.55111512e-17   0.00000000e+00]
print(A.dot(x2))
# [ 33. 4. 26. 29. 30.]
To put it all together into a single function:
import numpy as np
from scipy.linalg import qr
def solve_minnonzero(A, b):
    x1, res, rnk, s = np.linalg.lstsq(A, b)
    if rnk == A.shape[1]:
        return x1  # nothing more to do if A is full-rank
    Q, R, P = qr(A.T, mode='full', pivoting=True)
    Z = Q[:, rnk:].conj()
    C = np.linalg.solve(Z[rnk:], -x1[rnk:])
    return x1 + Z.dot(C)
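Applied to the arrays from the question, this zeroes the trailing coefficient, so it returns a solution with rank(A) non-zero entries, though not necessarily the same one that Matlab's column pivoting picks (the values below are my own check):
test_A = np.array([[1., 2., 0.],
                   [0., 4., 3.]])
test_b = np.array([8., 18.])
print(solve_minnonzero(test_A, test_b))
# approximately [-1.   4.5  0. ], versus Matlab's [0; 4; 0.6667]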
np.array([[8],[18]]).shape is (2, 1), but you want (2,):
#!/usr/bin/env python3
import numpy as np
from scipy.optimize import nnls
test_A = np.array([[1,2,0],[0,4,3]])
try:
    test_b = np.array([[8],[18]]) # wrong
    print(nnls(test_A,test_b))
except Exception as e:
    print(str(e))

test_b = np.array([8,18]) # sic!
print(nnls(test_A,test_b))
output:
expected vector
(array([ 0. , 4. , 0.66666667]), 0.0)
