Fastest way of comparing two numpy arrays - python

I have two arrays:
>>> import numpy as np
>>> a=np.array([2, 1, 3, 3, 3])
>>> b=np.array([1, 2, 3, 3, 3])
What is the fastest way of comparing these two arrays for equality of elements, regardless of the order?
EDIT
I measured the execution times of the following functions:
def compare1():  # works only for arrays without duplicate elements
    a = np.array([1, 2, 3, 5, 4])
    b = np.array([2, 1, 3, 4, 5])
    temp = 0
    for i in a:
        temp += len(np.where(b == i)[0])
    if temp == 5:
        val = True
    else:
        val = False
    return 0
def compare2():
    a = np.array([1, 2, 3, 3, 3])
    b = np.array([2, 1, 3, 3, 3])
    val = np.all(np.sort(a) == np.sort(b))
    return 0
def compare3():  # thx to ODiogoSilva
    a = np.array([1, 2, 3, 3, 3])
    b = np.array([2, 1, 3, 3, 3])
    val = set(a) == set(b)
    return 0
import numpy.lib.arraysetops as aso

def compare4():  # thx to tom10
    a = np.array([1, 2, 3, 3, 3])
    b = np.array([2, 1, 3, 3, 3])
    val = len(aso.setdiff1d(a, b)) == 0
    return 0
The results are:
>>> import timeit
>>> timeit.timeit(compare1,number=1000)
0.0166780948638916
>>> timeit.timeit(compare2,number=1000)
0.016178131103515625
>>> timeit.timeit(compare3,number=1000)
0.008063077926635742
>>> timeit.timeit(compare4,number=1000)
0.03257489204406738
It seems like the "set" method by ODiogoSilva is the fastest.
Do you know other methods that I can test as well?
EDIT2
As user2357112 explained in a comment, the runtime above was not the right measure for comparing the arrays. The revised benchmark below uses larger arrays and actually returns the comparison result.
# test.py
import numpy as np
import numpy.lib.arraysetops as aso

# without duplicates
N = 10000
a = np.arange(N, 0, step=-2)
b = np.arange(N, 0, step=-2)

def compare1():
    temp = 0
    for i in a:
        temp += len(np.where(b == i)[0])
    if temp == len(a):
        val = True
    else:
        val = False
    return val

def compare2():
    val = np.all(np.sort(a) == np.sort(b))
    return val

def compare3():
    val = set(a) == set(b)
    return val

def compare4():
    val = len(aso.setdiff1d(a, b)) == 0
    return val
The output is:
>>> from test import *
>>> import timeit
>>> timeit.timeit(compare1,number=1000)
101.16708397865295
>>> timeit.timeit(compare2,number=1000)
0.09285593032836914
>>> timeit.timeit(compare3,number=1000)
1.425955057144165
>>> timeit.timeit(compare4,number=1000)
0.44780397415161133
Now compare2 is the fastest. Is there still a method that could outgun this?
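One further candidate worth timing, as a sketch: when the arrays hold non-negative integers (as they do in test.py above), counting occurrences with np.bincount makes the multiset comparison O(n) instead of the O(n log n) sort. The name compare5 is mine, not from the thread:
def compare5():
    # Multiset equality via counting: valid only for non-negative
    # integers, and memory grows with the largest value in a.
    if len(a) != len(b):
        return False
    # bincount returns an array of length max()+1; unequal maxima give
    # unequal shapes, and np.array_equal then correctly returns False.
    return np.array_equal(np.bincount(a), np.bincount(b))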

Numpy has a collection of set operations.
import numpy as np
import numpy.lib.arraysetops as aso

a = np.array([2, 1, 3, 3, 3])
b = np.array([1, 2, 3, 3, 3])
print(aso.setdiff1d(a, b))
# [] -- empty, so every value in a also occurs in b

To see if both arrays contain the same kind of elements, in this case [1,2,3], you could do:
import numpy as np
a=np.array([2, 1, 3, 3, 3])
b=np.array([1, 2, 3, 3, 3])
set(a) == set(b)
# True
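One caveat: set() compares only the distinct values and ignores how often each occurs, so arrays with different element counts can still compare equal. If multiplicity matters, collections.Counter performs a multiset comparison; a minimal sketch:
from collections import Counter
import numpy as np

a = np.array([2, 1, 3, 3, 3])
b = np.array([1, 2, 3, 3])  # one 3 fewer than a

print(set(a) == set(b))                            # True  -- multiplicity ignored
print(Counter(a.tolist()) == Counter(b.tolist()))  # False -- counts compared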

Related

Indexing array from second element for all elements

I think it must be easy, but I cannot google it. Suppose I have an array of numbers 1, 2, 3, 4.
import numpy as np
a = np.array([1,2,3,4])
How do I index the array if I want the sequence 2, 3, 4, 1?
I know that for the sequence 2, 3, 4 I can use, e.g.:
print(a[1::1])
If you want to rotate the list, you can use a deque instead of a numpy array. This data structure is designed for this kind of operation and directly provides a rotate function.
>>> from collections import deque
>>> a = deque([1, 2, 3, 4])
>>> a.rotate(-1)
>>> a
deque([2, 3, 4, 1])
If you want to use Numpy, you can check out the roll function.
>>> import numpy as np
>>> a = np.array([1,2,3,4])
>>> np.roll(a, -1)
array([2, 3, 4, 1])
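Note that np.roll returns a new, rotated copy rather than modifying a in place (unlike deque.rotate, which rotates in place), so rebind the name if you want to keep the result:
>>> a = np.roll(a, -1)
>>> a
array([2, 3, 4, 1])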
One possible way is to define an index set (a list) and use it to index the array:
index_set = [1, 2, 3, 0]
print(a[index_set])

Get list of indices matching condition with NumPy [duplicate]

Is there any way to get the indices of several elements in a NumPy array at once?
E.g.
import numpy as np
a = np.array([1, 2, 4])
b = np.array([1, 2, 3, 10, 4])
I would like to find the index of each element of a in b, namely: [0,1,4].
I find the solution I am using a bit verbose:
import numpy as np
a = np.array([1, 2, 4])
b = np.array([1, 2, 3, 10, 4])
c = np.zeros_like(a)
for i, aa in np.ndenumerate(a):
    c[i] = np.where(b == aa)[0]
print('c: {0}'.format(c))
Output:
c: [0 1 4]
You could use in1d and nonzero (or where for that matter):
>>> np.in1d(b, a).nonzero()[0]
array([0, 1, 4])
This works fine for your example arrays, but in general the array of returned indices does not honour the order of the values in a. This may be a problem depending on what you want to do next.
In that case, a much better answer is the one @Jaime gives here, using searchsorted:
>>> sorter = np.argsort(b)
>>> sorter[np.searchsorted(b, a, sorter=sorter)]
array([0, 1, 4])
This returns the indices for values as they appear in a. For instance:
a = np.array([1, 2, 4])
b = np.array([4, 2, 3, 1])
>>> sorter = np.argsort(b)
>>> sorter[np.searchsorted(b, a, sorter=sorter)]
array([3, 1, 0]) # the other method would return [0, 1, 3]
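Note that neither variant checks whether every value of a actually occurs in b; searchsorted happily returns an insertion point for missing values. A hedged sketch of one way to guard against that (the -1 sentinel is my convention, not part of any API):
import numpy as np

a = np.array([1, 2, 7])  # 7 does not occur in b
b = np.array([4, 2, 3, 1])

sorter = np.argsort(b)
pos = np.searchsorted(b, a, sorter=sorter)
pos = np.clip(pos, 0, len(b) - 1)  # insertion points past the end would be out of range
idx = sorter[pos]
found = b[idx] == a                # mask of values actually present in b
idx = np.where(found, idx, -1)     # mark missing values with -1
print(idx)                         # [ 3  1 -1]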
This is a simple one-liner using the numpy-indexed package (disclaimer: I am its author):
import numpy_indexed as npi
idx = npi.indices(b, a)
The implementation is fully vectorized, and it gives you control over the handling of missing values. Moreover, it works for nd-arrays as well (for instance, finding the indices of rows of a in b).
All of the solutions here recommend using a linear search. You can use np.argsort and np.searchsorted to speed things up dramatically for large arrays:
sorter = b.argsort()
i = sorter[np.searchsorted(b, a, sorter=sorter)]
For an order-agnostic solution, you can use np.flatnonzero with np.isin (v 1.13+). Note that this returns the positions in a whose values appear in b, not positions in b:
import numpy as np
a = np.array([1, 2, 4])
b = np.array([1, 2, 3, 10, 4])
res = np.flatnonzero(np.isin(a, b)) # NumPy v1.13+
res = np.flatnonzero(np.in1d(a, b)) # earlier versions
# array([0, 1, 2], dtype=int64)
There are a bunch of approaches for getting the index of multiple items at once mentioned in passing in answers to this related question: Is there a NumPy function to return the first index of something in an array?. The wide variety and creativity of the answers suggests there is no single best practice, so if your code above works and is easy to understand, I'd say keep it.
I personally found this approach to be both performant and easy to read: https://stackoverflow.com/a/23994923/3823857
Adapting it for your example:
import numpy as np
a = np.array([1, 2, 4])
b_list = [1, 2, 3, 10, 4]
b_array = np.array(b_list)
indices = [b_list.index(x) for x in a]
vals_at_indices = b_array[indices]
I personally like adding a little bit of error handling in case a value in a does not exist in b.
import numpy as np
a = np.array([1, 2, 4])
b_list = [1, 2, 3, 10, 4]
b_array = np.array(b_list)
b_set = set(b_list)
# None marks values missing from b; note that a np.nan placeholder could
# not be used directly, since arrays cannot be indexed with floats.
indices = [b_list.index(x) if x in b_set else None for x in a]
vals_at_indices = b_array[[i for i in indices if i is not None]]
For my use case, it's pretty fast, since it relies on parts of Python that are fast (list comprehensions, .index(), sets, numpy indexing). Would still love to see something that's a NumPy equivalent to VLOOKUP, or even a Pandas merge. But this seems to work for now.
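For what it's worth, pandas does offer something close to the VLOOKUP mentioned above: pd.Index.get_indexer is a vectorized lookup that returns -1 for values missing from b. A small sketch (the extra 99 is mine, to show the missing-value behaviour):
import numpy as np
import pandas as pd

a = np.array([1, 2, 4, 99])  # 99 is not in b
b = np.array([1, 2, 3, 10, 4])

indices = pd.Index(b).get_indexer(a)
print(indices)  # [ 0  1  4 -1]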

Python : Choosing elements from a list using a list [duplicate]

This question already has answers here:
Explicitly select items from a list or tuple
(9 answers)
Closed 7 months ago.
I need to choose some elements from the given list, knowing their indices. Let's say I would like to create a new list which contains the elements with indices 1, 2, 5 from the given list [-2, 1, 5, 3, 8, 5, 6]. What I did is:
a = [-2,1,5,3,8,5,6]
b = [1,2,5]
c = [ a[i] for i in b]
Is there any better way to do it? something like c = a[b] ?
You can use operator.itemgetter:
from operator import itemgetter
a = [-2, 1, 5, 3, 8, 5, 6]
b = [1, 2, 5]
print(itemgetter(*b)(a))
# Result:
(1, 5, 5)
Or you can use numpy:
import numpy as np
a = np.array([-2, 1, 5, 3, 8, 5, 6])
b = [1, 2, 5]
print(list(a[b]))
# Result:
[1, 5, 5]
But really, your current solution is fine. It's probably the neatest out of all of them.
Alternatives:
>>> list(map(a.__getitem__, b))
[1, 5, 5]
>>> import operator
>>> operator.itemgetter(*b)(a)
(1, 5, 5)
Another solution could be via pandas Series:
import pandas as pd
a = pd.Series([-2, 1, 5, 3, 8, 5, 6])
b = [1, 2, 5]
c = a[b]
You can then convert c back to a list if you want:
c = list(c)
Basic and not very extensive testing comparing the execution time of the five supplied answers:
def numpyIndexValues(a, b):
    na = np.array(a)
    nb = np.array(b)
    out = list(na[nb])
    return out

def mapIndexValues(a, b):
    out = map(a.__getitem__, b)
    return list(out)

def getIndexValues(a, b):
    out = operator.itemgetter(*b)(a)
    return out

def pythonLoopOverlap(a, b):
    c = [a[i] for i in b]
    return c

multipleListItemValues = lambda searchList, ind: [searchList[i] for i in ind]
using the following input:
a = range(0, 10000000)
b = range(500, 500000)
The simple Python loop was the quickest, with the lambda operation a close second; mapIndexValues and getIndexValues were consistently fairly similar, and the numpy method was significantly slower because of the cost of converting the lists to numpy arrays. If the data is already in numpy arrays, the numpyIndexValues method with the np.array conversion removed is the quickest.
numpyIndexValues -> time:1.38940598 (when converted the lists to numpy arrays)
numpyIndexValues -> time:0.0193445 (using numpy array instead of python list as input, and conversion code removed)
mapIndexValues -> time:0.06477512099999999
getIndexValues -> time:0.06391049500000001
multipleListItemValues -> time:0.043773591
pythonLoopOverlap -> time:0.043021754999999995
Here's a simpler way:
a = [-2,1,5,3,8,5,6]
b = [1,2,5]
c = [e for i, e in enumerate(a) if i in b]
I'm sure this has already been considered: if the number of indices in b is small and constant, one could just write the result as:
c = [a[b[0]]] + [a[b[1]]] + [a[b[2]]]
Or even simpler, if the indices themselves are constants:
c = [a[1]] + [a[2]] + [a[5]]
Or, if there is a consecutive range of indices:
c = a[1:3] + [a[5]]
List comprehension is clearly the most immediate and easiest to remember - in addition to being quite pythonic!
In any case, among the proposed solutions, it is not the fastest (I have run my test on Windows using Python 3.8.3):
import timeit
from itertools import compress
import random
from operator import itemgetter
import numpy as np
import pandas as pd

__N_TESTS__ = 10_000

vector = [str(x) for x in range(100)]
filter_indeces = sorted(random.sample(range(100), 10))
filter_boolean = random.choices([True, False], k=100)

# Different ways of selecting elements given the indices

# list comprehension
def f1(v, f):
    return [v[i] for i in f]

# itemgetter
def f2(v, f):
    return itemgetter(*f)(v)

# using pandas.Series
# this is immensely slow
def f3(v, f):
    return list(pd.Series(v)[f])

# using map and __getitem__
def f4(v, f):
    return list(map(v.__getitem__, f))

# using enumerate!
def f5(v, f):
    return [x for i, x in enumerate(v) if i in f]

# using numpy array
def f6(v, f):
    return list(np.array(v)[f])

print("{:30s}:{:f} secs".format("List comprehension", timeit.timeit(lambda: f1(vector, filter_indeces), number=__N_TESTS__)))
print("{:30s}:{:f} secs".format("Operator.itemgetter", timeit.timeit(lambda: f2(vector, filter_indeces), number=__N_TESTS__)))
print("{:30s}:{:f} secs".format("Using Pandas series", timeit.timeit(lambda: f3(vector, filter_indeces), number=__N_TESTS__)))
print("{:30s}:{:f} secs".format("Using map and __getitem__", timeit.timeit(lambda: f4(vector, filter_indeces), number=__N_TESTS__)))
print("{:30s}:{:f} secs".format("Enumeration (Why anyway?)", timeit.timeit(lambda: f5(vector, filter_indeces), number=__N_TESTS__)))
print("{:30s}:{:f} secs".format("Numpy", timeit.timeit(lambda: f6(vector, filter_indeces), number=__N_TESTS__)))
My results are:
List comprehension :0.007113 secs
Operator.itemgetter :0.003247 secs
Using Pandas series :2.977286 secs
Using map and __getitem__ :0.005029 secs
Enumeration (Why anyway?) :0.135156 secs
Numpy :0.157018 secs
Static indexes and small list?
Don't forget that if the list is small and the indexes don't change, as in your example, sometimes the best thing is to use sequence unpacking:
_,a1,a2,_,_,a3,_ = a
The performance is much better and you can also save one line of code:
%timeit _,a1,b1,_,_,c1,_ = a
10000000 loops, best of 3: 154 ns per loop
%timeit itemgetter(*b)(a)
1000000 loops, best of 3: 753 ns per loop
%timeit [ a[i] for i in b]
1000000 loops, best of 3: 777 ns per loop
%timeit map(a.__getitem__, b)
1000000 loops, best of 3: 1.42 µs per loop
My answer does not use numpy or python collections.
One trivial way to find elements would be as follows:
a = [-2, 1, 5, 3, 8, 5, 6]
b = [1, 2, 5]
c = [i for i in a if i in b]
Note that this filters by value rather than by index; it gives the expected [1, 5, 5] here only because the values at indices 1, 2 and 5 happen to also be values listed in b.
Drawback: this method may not work for larger lists. Using numpy is recommended for larger lists.
Kind of pythonic way:
c = [x for x in a if a.index(x) in b]
(Beware that a.index(x) returns the first occurrence of x, so duplicates in a can produce wrong results.)
The results for the latest pandas==1.4.2 as of June 2022 are as follows.
Note that indexing a Series directly with a tuple of positions is no longer possible (use .iloc instead), and the benchmark results are faster.
import timeit
import pandas as pd
print(pd.__version__)
# 1.4.2
pd.Series([-2, 1, 5, 3, 8, 5, 6])[1, 2, 5]
# KeyError: 'key of type tuple not found and not a MultiIndex'
pd.Series([-2, 1, 5, 3, 8, 5, 6]).iloc[[1, 2, 5]].tolist()
# [1, 5, 5]
def extract_multiple_elements():
return pd.Series([-2, 1, 5, 3, 8, 5, 6]).iloc[[1, 2, 5]].tolist()
__N_TESTS__ = 10_000
t1 = timeit.timeit(extract_multiple_elements, number=__N_TESTS__)
print(round(t1, 3), 'seconds')
# 1.035 seconds

How to generate a random arange in python?

I want to generate a random arange of 10 values in Python. Here's my code. Is there a faster or more elegant way to generate it? Because the x in the lambda function is actually never used.
>>> import numpy as np
>>> import random as rd
>>> i = np.arange(0, 10)
>>> noice = map(lambda x: rd.random(), i)
>>> i
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
>>> noice
[0.040625208925370804, 0.3979391267477529, 0.36389993607597815, 0.5222540865995068, 0.4568532903714069, 0.9028000777629279, 0.6011546913245037, 0.4779160505869603, 0.9929389966510183, 0.7688424310182902]
You can simply use np.random.rand(10).
See the NumPy docs.
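As a side note, in NumPy 1.17+ the Generator interface is the recommended way to draw random numbers; a minimal equivalent of the above:
import numpy as np

rng = np.random.default_rng()  # pass a seed, e.g. default_rng(42), for reproducibility
noise = rng.random(10)         # 10 floats drawn uniformly from [0, 1)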

Python: create sublist without copying

I have a question about how to create a sublist (I hope this is the right term to use) from a given list without copying.
It seems that slicing can create sublists, but it does so with copying. Here is an example.
In [1]: a = [1,2,3]
In [2]: id(a)
Out[2]: 4354651128
In [3]: b = a[0:2]
In [4]: b
Out[4]: [1, 2]
In [5]: id(b)
Out[5]: 4354621312
In [6]: id(a[0:2])
Out[6]: 4354620880
See here that the ids of b and a[0:2] are different, although their values are the same. To double-check, change a value in a; the value in b does not change.
In [7]: a[1] = 4
In [8]: a
Out[8]: [1, 4, 3]
In [9]: b
Out[9]: [1, 2]
So, to get back to my question: how can I create sublists without copying? I mean, when a[1] is set to 4, b should become [1, 4].
I searched around and did not find much help (maybe I am not using the right keywords). Thank you!
Edits:
Thank you all for your comments and answers! Here is what I have learned.
There is no built-in way in Python to create a view of a list (or to create a sublist without copying).
The easiest way to do this is to use the numpy array.
Although numpy arrays have limitations on data types compared with lists, they serve my purpose (implementing quicksort with no extra memory).
Here is the same process with numpy array.
In [1]: import numpy as np
In [2]: a = np.arange(1,4)
In [3]: a
Out[3]: array([1, 2, 3])
In [4]: b = a[0:2]
In [5]: b
Out[5]: array([1, 2])
In [6]: id(b)
Out[6]: 4361253952
In [7]: id(a[0:2])
Out[7]: 4361254032
In [8]: a[1] = 4
In [9]: a
Out[9]: array([1, 4, 3])
In [10]: b
Out[10]: array([1, 4])
numpy's array objects support this notion of creating interdependent sub-lists, by having slicing return views rather than copies of the data.
Altering the original numpy array will alter the views created from the array, and changes to any of the views will also be reflected in the original array. Especially for large data sets, views are a great way of cutting data in different ways, while saving on memory.
>>> import numpy as np
>>> array1 = np.array([1, 2, 3, 4])
>>> view1 = array1[1:]
>>> view1
array([2, 3, 4])
>>> view1[1] = 5
>>> view1
array([2, 5, 4])
>>> array1
array([1, 2, 5, 4]) # Notice that the change to view1 has been reflected in array1
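If you are ever unsure whether a slice gave you a view or a copy, np.shares_memory can check it (reusing the names from the example above):
>>> np.shares_memory(array1, view1)
True
>>> np.shares_memory(array1, array1.copy())
False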
For further reference, see the numpy documentation on views as well as this SO post.
There is no way to do this with built in Python data structures. However, I created a class that does what you need. I don't guarantee it to be bug-free, but it should get you started.
from itertools import islice

class SubLister(object):
    def __init__(self, base=None, start=0, end=None):
        # default of None avoids the mutable-default-argument pitfall
        self._base = base if base is not None else []
        self._start = start
        self._end = end

    def __len__(self):
        if self._end is None:
            return len(self._base) - self._start
        return self._end - self._start

    def __getitem__(self, index):
        self._check_end_range(index)
        return self._base[index + self._start]

    def __setitem__(self, index, value):
        self._check_end_range(index, "list assignment index out of range")
        self._base[index + self._start] = value

    def __delitem__(self, index):
        self._check_end_range(index, "list assignment index out of range")
        del self._base[index + self._start]

    def __iter__(self):
        return islice(self._base, self._start, self._end)

    def __str__(self):
        return str(self._base[self._start:self._end])

    def __repr__(self):
        return repr(self._base[self._start:self._end])

    # ...etc...

    def get_sublist(self, start=0, end=None):
        return SubLister(base=self._base, start=start, end=end)

    def _check_end_range(self, index, msg="list index out of range"):
        if self._end is not None and index >= self._end - self._start:
            raise IndexError(msg)
Example:
>>> from sublister import SubLister
>>> base = SubLister([1, 2, 3, 4, 5])
>>> a = base.get_sublist(0, 2)
>>> b = base.get_sublist(1)
>>> base
[1, 2, 3, 4, 5]
>>> a
[1, 2]
>>> b
[2, 3, 4, 5]
>>> len(base)
5
>>> len(a)
2
>>> len(b)
4
>>> base[1] = 'ref'
>>> base
[1, 'ref', 3, 4, 5]
>>> a
[1, 'ref']
>>> b
['ref', 3, 4, 5]
You can't if you slice a to get b:
All slice operations return a new list containing the requested
elements. This means that the following slice returns a new (shallow)
copy of the list [1]
[1] https://docs.python.org/2/tutorial/introduction.html
There is no built-in way to do this. You could create your own list-like class that takes a reference to a list and reimplements all of the list accessor methods to operate on it.
