Checking if all elements in a list are unique - python

What is the best way (best as in the conventional way) of checking whether all elements in a list are unique?
My current approach using a Counter is:
>>> x = [1, 1, 1, 2, 3, 4, 5, 6, 2]
>>> counter = Counter(x)
>>> for values in counter.itervalues():
...     if values > 1:
...         # do something
Can I do better?

Not the most efficient, but straightforward and concise:
# A set drops repeated values, so it is shorter than x exactly when x
# contains at least one duplicate.
if len(x) > len(set(x)):
    pass  # do something
Probably won't make much of a difference for short lists.

Here is a two-liner that will also do early exit:
>>> def allUnique(x):
...     seen = set()
...     return not any(i in seen or seen.add(i) for i in x)
...
>>> allUnique("ABCDEF")
True
>>> allUnique("ABACDEF")
False
If the elements of x aren't hashable, then you'll have to resort to using a list for seen:
>>> def allUnique(x):
...     seen = list()
...     return not any(i in seen or seen.append(i) for i in x)
...
>>> allUnique([list("ABC"), list("DEF")])
True
>>> allUnique([list("ABC"), list("DEF"), list("ABC")])
False

An early-exit solution could be
def unique_values(g):
    """Return True if no value in the iterable *g* occurs more than once.

    Iteration stops at the first repeated value, so the rest of *g* is not
    consumed once a duplicate is found. Values must be hashable because
    they are stored in a set.
    """
    seen = set()
    for value in g:
        if value in seen:
            return False
        seen.add(value)
    return True
However, for small cases, or if early exit is not the common case, I would expect len(x) != len(set(x)) to be the fastest method.

for speed:
import numpy as np
# Sample data: the values 1 and 2 each occur more than once.
x = [1, 1, 1, 2, 3, 4, 5, 6, 2]
# Compare the number of entries np.unique returns with the list length;
# the two are equal only when x has no repeated values.
np.unique(x).size == len(x)

How about adding all the entries to a set and checking its length?
len(set(x)) == len(x)

Alternative to a set, you can use a dict.
len({}.fromkeys(x)) == len(x)

Another approach entirely, using sorted and groupby:
from itertools import groupby
def is_unique(seq):
    """Return True if every value in *seq* occurs exactly once.

    The values are sorted so that equal ones become adjacent, then each run
    of equal values produced by ``groupby`` must have length one. ``all``
    stops at the first run that is longer.
    """
    return all(len(list(run)) == 1 for _, run in groupby(sorted(seq)))
It requires a sort, but exits on the first repeated value.

Here is a recursive O(N^2) version for fun:
def is_unique(lst):
    """Return True if no element of the sequence *lst* is repeated.

    Recursive and quadratic: the head is searched for in the tail, then the
    tail is checked the same way. Only ``==`` is used, so elements need not
    be hashable. One stack frame is used per element, so long inputs hit
    Python's recursion limit.
    """
    if len(lst) > 1:
        # Test the head first so a duplicate found here skips the recursion.
        return (lst[0] not in lst[1:]) and is_unique(lst[1:])
    return True

Here is a recursive early-exit function:
def distinct(L):
    """Return True if the sequence *L* contains no repeated element.

    Recursive with early exit: returns False as soon as the head of the
    remaining sequence is found in its tail. Sequences of length 0 or 1 are
    trivially distinct.
    """
    # Stopping below two elements covers [] and [x]; stopping only at
    # length 2 would index past the end for those inputs.
    if len(L) < 2:
        return True
    head, tail = L[0], L[1:]
    if head in tail:
        return False
    return distinct(tail)
It's fast enough for me without using weird(slow) conversions while
having a functional-style approach.

All the answers above are good, but I prefer to use the all_unique example from 30 seconds of python.
You need to use set() on the given list to remove duplicates, compare its length with the length of the list.
def all_unique(lst):
    """Return True if all values in the flat list *lst* are unique.

    A set drops repeated values, so its length equals the list's length
    only when nothing is repeated. Values must be hashable.
    """
    return len(lst) == len(set(lst))
It returns True if all the values in a flat list are unique, False otherwise.
x = [1, 2, 3, 4, 5, 6]
y = [1, 2, 2, 3, 4, 5]
all_unique(x) # True
all_unique(y) # False

I've compared the suggested solutions with perfplot and found that
len(lst) == len(set(lst))
is indeed the fastest solution. If there are early duplicates in the list, there are some constant-time solutions which are to be preferred.
Code to reproduce the plot:
import perfplot
import numpy as np
import pandas as pd
def len_set(lst):
    """Return True if *lst* has no duplicates, by comparing it with its set."""
    return len(lst) == len(set(lst))
def set_add(lst):
    """Return True if *lst* has no duplicates, stopping at the first repeat.

    ``seen.add`` returns None, so ``i in seen or seen.add(i)`` is truthy
    only when ``i`` was already recorded.
    """
    seen = set()
    return not any(i in seen or seen.add(i) for i in lst)
def list_append(lst):
    """Return True if *lst* has no duplicates, using a list to track items.

    Works for unhashable items because membership is tested with ``==``,
    at the cost of a linear scan per item.
    """
    seen = list()
    return not any(i in seen or seen.append(i) for i in lst)
def numpy_unique(lst):
    """Return True if ``np.unique(lst)`` has as many entries as *lst*."""
    return np.unique(lst).size == len(lst)
def set_add_early_exit(lst):
    """Return True if *lst* has no duplicates, returning at the first repeat."""
    s = set()
    for item in lst:
        if item in s:
            return False
        s.add(item)
    return True
def pandas_is_unique(lst):
    """Return the ``is_unique`` flag of a pandas Series built from *lst*."""
    return pd.Series(lst).is_unique
def sort_diff(lst):
    """Return True if no two neighbours in the sorted *lst* are equal.

    After sorting, equal values are adjacent, so a zero difference between
    neighbours marks a duplicate. The subtraction in ``np.diff`` means this
    is for numeric data.
    """
    return not np.any(np.diff(np.sort(lst)) == 0)
# Benchmark every kernel on lists of n distinct integers, for
# n = 1, 2, 4, ..., 2**17. The commented-out setup/title pair below switches
# the input to n copies of 0, i.e. the "duplicate found immediately" case.
b = perfplot.bench(
setup=lambda n: list(np.arange(n)),
title="All items unique",
# setup=lambda n: [0] * n,
# title="All items equal",
kernels=[
len_set,
set_add,
list_append,
numpy_unique,
set_add_early_exit,
pandas_is_unique,
sort_diff,
],
n_range=[2**k for k in range(18)],
xlabel="len(lst)",
)
# Presumably writes the plot to out.png and then displays it; confirm
# against the installed perfplot version.
b.save("out.png")
b.show()

How about this
def is_unique(lst):
    """Return True if no value in *lst* occurs more than once.

    The most frequent value is looked up with ``Counter.most_common(1)``;
    the list is duplicate-free exactly when that value occurs once. An
    empty list is treated as unique, since ``most_common(1)`` would return
    nothing to index into.
    """
    if not lst:
        return True
    return Counter(lst).most_common(1)[0][1] == 1

If and only if you have the data processing library pandas in your dependencies, there's an already implemented solution which gives the boolean you want :
import pandas as pd
pd.Series(lst).is_unique

You can use Yan's syntax (len(x) > len(set(x))), but instead of set(x), define a function:
def f5(seq, idfun=None):
    """Return the items of *seq* with duplicates removed, keeping order.

    Args:
        seq: Iterable of items.
        idfun: Optional function mapping an item to the hashable marker
            used to decide whether two items are "the same". Defaults to
            the identity function.

    Returns:
        A list holding the first item seen for each distinct marker, in
        the order those items appeared in *seq*.
    """
    if idfun is None:
        def idfun(x):
            return x
    seen = set()
    result = []
    for item in seq:
        marker = idfun(item)
        if marker in seen:
            continue
        seen.add(marker)
        result.append(item)
    return result
and do len(x) > len(f5(x)). This will be fast and is also order preserving.
Code there is taken from: http://www.peterbe.com/plog/uniqifiers-benchmark

Using a similar approach in a Pandas dataframe to test if the contents of a column contains unique values:
# The column holds only distinct values when dropping duplicates with
# unique() leaves its size unchanged.
if tempDF['var1'].size == tempDF['var1'].unique().size:
    print("Unique")
else:
    print("Not unique")
For me, this is instantaneous on an int variable in a DataFrame containing over a million rows.

This does not fully fit the question, but if you google the task I had, this question is ranked first, so it might be of interest to other users as an extension of the question. If you want to determine, for each list element, whether it is unique or not, you can do the following:
import timeit
import numpy as np
def get_unique(mylist):
    """Return, for each element of *mylist*, whether it occurs exactly once.

    The indices are sorted by element value so that equal elements become
    neighbours; any element equal to a neighbour is flagged as not unique.
    This costs one sort instead of the quadratic work of calling
    ``mylist.count`` once per element.

    Returns:
        A list of booleans aligned with *mylist*. Empty and single-element
        lists are handled: the result is ``[]`` and ``[True]``.
    """
    # Indices of mylist in ascending order of the elements they point at.
    order = sorted(range(len(mylist)), key=mylist.__getitem__)
    flags = [True] * len(mylist)
    # Compare each element with its successor in sorted order; a match
    # means both positions hold a repeated value.
    for left, right in zip(order, order[1:]):
        if mylist[left] == mylist[right]:
            flags[left] = False
            flags[right] = False
    return flags
def get_unique_using_count(mylist):
    """Return, for each element of *mylist*, whether it occurs exactly once.

    Calls ``mylist.count`` once per element, so the cost grows
    quadratically with the length of the list.
    """
    return [mylist.count(item) == 1 for item in mylist]
mylist = list(np.random.randint(0,10,10))
%timeit for x in range(10): get_unique(mylist)
%timeit for x in range(10): get_unique_using_count(mylist)
mylist = list(np.random.randint(0,1000,1000))
%timeit for x in range(10): get_unique(mylist)
%timeit for x in range(10): get_unique_using_count(mylist)
For short lists, get_unique_using_count (as suggested in some answers) is fast. But once the list is longer than about 100 elements, the count function takes quite long, so the approach shown in get_unique is much faster even though it looks more complicated.

If the list is sorted anyway, you can use:
not any(sorted_list[i] == sorted_list[i + 1] for i in range(len(sorted_list) - 1))
Pretty efficient, but not worth sorting for this purpose though.

For beginners:
def AllDifferent(s):
    """Return True if no two elements of the sequence *s* are equal.

    Plain nested loops using only indexing and ``==``, so elements need
    not be hashable or sortable.
    """
    for i in range(len(s)):
        # Start after i: earlier pairs were already compared, and an
        # element is never compared with itself.
        for i2 in range(i + 1, len(s)):
            if s[i] == s[i2]:
                return False
    return True

Related

check identical for list in list(python)

I want to check that there are no identical entries in a list of lists. If there are no identical matches, then return True, otherwise False.
For example:
[[1],[1,2],[1,2,3]] # False
[[1,2,3],[10,20,30]] # True
I am thinking of combine all of the entries into one list,
for example: change [[1,2,3],[4,5,6]] into [1,2,3,4,5,6] and then check
Thanks for editing the question and helping me!
>>> def flat_unique(list_of_lists):
...     flat = [element for sublist in list_of_lists for element in sublist]
...     return len(flat) == len(set(flat))
...
>>> flat_unique([[1],[1,2],[1,2,3]])
False
>>> flat_unique([[1,2,3],[10,20,30]])
True
We can use itertools.chain.from_iterable and set built-in function.
import itertools
def check_iden(data):
    """Return True if no element is repeated anywhere in the sub-iterables.

    The sub-iterables of *data* are flattened once into a list and its
    length is compared with the length of its set. Flattening only once
    keeps the result correct when *data* is a one-shot iterator, which a
    second pass would find already exhausted.
    """
    flat = list(itertools.chain.from_iterable(data))
    return len(flat) == len(set(flat))


data1 = [[1], [1, 2], [1, 2, 3]]
data2 = [[1, 2, 3], [10, 20, 30]]
print(check_iden(data1))
print(check_iden(data2))
Returns
False
True
You could use sets which have intersection methods to find which elements are common
Place all elements of each sublist into a separate list. If that separate list has any duplicates (call set() to find out), then return False. Otherwise return True.
def identical(x):
    """Return True if no element is repeated across the sublists of *x*.

    Despite the name, True means "no identical entries were found": the
    sublists are flattened and the flat list is compared with its set.
    """
    newX = [j for i in x for j in i]
    # If newX has any duplicates, len(set(newX)) is less than len(newX).
    return len(newX) == len(set(newX))
I think you can flatten the list and count the elements in it, then compare that with the size of its set():
import itertools
a = [[1], [1, 2], [1, 2, 3]]
b = [[1, 2, 3], [10, 20, 30]]


def check(l):
    """Return True if no element is repeated across the sublists of *l*."""
    merged = list(itertools.chain.from_iterable(l))
    # A set is never longer than the list it was built from, so equal
    # lengths mean nothing was dropped as a duplicate.
    return len(set(merged)) == len(merged)


print(check(a))  # False
print(check(b))  # True
Depending on your data you might not want to look at all the elements, here is a solution that returns False as soon as you hit a first duplicate.
def all_unique(my_lists):
    """Return True if no element is repeated across the lists in *my_lists*.

    Returns False as soon as the first duplicate is met, without looking
    at the remaining elements. Elements must be hashable.
    """
    my_set = set()
    for sub_list in my_lists:
        for ele in sub_list:
            if ele in my_set:
                return False
            my_set.add(ele)
    # Both loops finished without meeting a repeated element.
    return True
Result:
In [12]: all_unique([[1,2,3],[10,20,30]])
Out[12]: True
In [13]: all_unique([[1],[1,2],[1,2,3]])
Out[13]: False
Using this method will make the boolean variable "same" turn to True if there is a number in your list that occurs more than once as the .count() function returns you how many time a said number was found in the list.
li = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# Becomes True if some number occurs more than once in li.
same = False
for nbr in li:
    if li.count(nbr) > 1:
        same = True
        break  # one duplicate settles the answer

Most common value in flat list

What are some ways to get the most common value in a list?
l = [1,2,2]
So far I'm doing:
Counter(l).most_common()[0][0]
But I was wondering if there was a list method or something 'simpler' to do this?
That's pretty much as good as it gets - although I'd suggest using .most_common(1) which will be more efficient* than .most_common() and use it like so:
(value, count), = Counter(sequence).most_common(1)
*Source from collections.Counter:
if n is None:
return sorted(self.items(), key=_itemgetter(1), reverse=True)
return _heapq.nlargest(n, self.items(), key=_itemgetter(1))
You can use max with list.count, but it's not as efficient as your current solution:
>>> l = [1, 2, 2]
>>> max(set(l), key=l.count)
2
This is almost equivalent to what #JonClement 's solution does
>>> from collections import Counter
>>> l = [1,2,2]
>>> c = Counter(l)
>>> max(c, key=c.get)
2
As heapq.nlargest will run
if n == 1:
it = iter(iterable)
head = list(islice(it, 1))
if not head:
return []
if key is None:
return [max(chain(head, it))]
return [max(chain(head, it), key=key)]
in this specific case where n=1 which performs the same as above just without the list of a single tuple.

How to find a duplicate in a list without using set in python?

I know that we can use the set in python to find if there is any duplicate in a list. I was just wondering, if we can find a duplicate in a list without using set.
Say, my list is
a=['1545','1254','1545']
then how to find a duplicate?
from collections import Counter

a = ['1545', '1254', '1545']
# Keep every item whose count is not 1, i.e. every item that repeats.
duplicates = [item for item, count in Counter(a).items() if count != 1]
print(duplicates)
Output
['1545']
This solution runs in O(N). This will be a huge advantage if the list used has a lot of elements.
If you just want to find if the list has duplicates, you can simply do
from collections import Counter

a = ['1545', '1254', '1545']
# True as soon as any item's count differs from 1.
has_duplicates = any(count != 1 for count in Counter(a).values())
print(has_duplicates)
As #gnibbler suggested, this would be the practically fastest solution
from collections import defaultdict
def has_dup(a):
    """Return True if any item of the iterable *a* occurs more than once.

    Occurrences are counted in a ``defaultdict(int)``; the function returns
    as soon as a count exceeds one, so the rest of *a* is not examined.
    """
    result = defaultdict(int)
    for item in a:
        result[item] += 1
        if result[item] > 1:
            return True
    return False


a = ['1545', '1254', '1545']
print(has_dup(a))
>>> lis = []
>>> a=['1545','1254','1545']
>>> for i in a:
...     if i not in lis:
...         lis.append(i)
...
>>> lis
['1545', '1254']
>>> set(a)
set(['1254', '1545'])
use list.count:
In [309]: a=['1545','1254','1545']
...: a.count('1545')>1
Out[309]: True
Using list.count:
>>> a = ['1545','1254','1545']
>>> any(a.count(x) > 1 for x in a) # To check whether there's any duplicate
True
>>> # To retrieve any single element that is duplicated
>>> next((x for x in a if a.count(x) > 1), None)
'1545'
# To get duplicate elements (used set literal!)
>>> {x for x in a if a.count(x) > 1}
set(['1545'])
sort the list and check that the next value is not equal to the last one..
# Sorting puts equal values next to each other, so one pass comparing each
# value with the previous one finds a duplicate.
a.sort()
last_x = None
for x in a:
    if x == last_x:
        print("duplicate: %s" % x)
        break  # existence of duplicates is enough
    last_x = x
This should be O(n log n) which is slower for big n than the Counter solution (but counter uses a dict under the hood.. which is not too dissimilar from a set really).
An alternative is to insert the elements and keep the list sorted.. see the bisect module. It makes your inserts slower but your check for duplicates fast.
If this is homework, your teacher is probably asking for the hideously inefficient .count() style answer.
In practice using a dict is your next best bet if set is disallowed.
>>> a = ['1545','1254','1545']
>>> D = {}
>>> for i in a:
...     if i in D:
...         print "duplicate", i
...         break
...     D[i] = i
... else:
...     print "no duplicate"
...
duplicate 1545
Here is a version using groupby, which is still much better than the .count() method:
>>> from itertools import groupby
>>> a = ['1545','1254','1545']
>>> next(k for k, g in groupby(sorted(a)) if sum(1 for i in g) > 1)
'1545'
thanks all for working on this problem. I also got to learn a lot from different answers. This is how I have answered:
a = ['1545', '1254', '1545']
# d collects each distinct value of a once, in first-seen order.
d = []
for i in a:
    if i not in d:
        d.append(i)
# d is shorter than a exactly when some value was skipped as a repeat.
duplicates = len(d) < len(a)
print(duplicates)

Comparing two lists and only printing the differences? (XORing two lists)

I'm trying to create a function that takes in 2 lists and returns the list that only has the differences of the two lists.
Example:
a = [1,2,5,7,9]
b = [1,2,4,8,9]
The result should print [4,5,7,8]
The function so far:
def xor(list1, list2):
    """Merge two lists, sort the result and drop repeated values.

    NOTE: this is the question's original attempt with its indentation
    restored. It returns the sorted union of the inputs, not their
    symmetric difference: the second loop keeps one copy of every repeated
    value instead of removing all copies. Raises IndexError when both
    inputs are empty, because ``list3[-1]`` is read unconditionally.
    """
    list3 = list1 + list2
    # Insertion sort: shift larger items right until x's slot is found.
    for i in range(0, len(list3)):
        x = list3[i]
        y = i
        while y > 0 and x < list3[y - 1]:
            list3[y] = list3[y - 1]
            y = y - 1
        list3[y] = x
    # Walk backwards, deleting each item equal to the one kept after it.
    last = list3[-1]
    for i in range(len(list3) - 2, -1, -1):
        if last == list3[i]:
            del list3[i]
        else:
            last = list3[i]
    return list3


print(xor([1, 2, 5, 7, 8], [1, 2, 4, 8, 9]))
The first for loop sorts it, second one removes the duplicates. Problem is the result is
[1,2,4,5,7,8,9] not [4,5,7,8], so it doesn't completely remove the duplicates? What can I add to do this.
I can't use any special modules, .sort, set or anything, just loops basically.
You basically want to add an element to your new list if it is present in one and not present in another. Here is a compact loop which can do it. For each element in the two lists (concatenate them with list1+list2), we add element if it is not present in one of them:
[a for a in list1+list2 if (a not in list1) or (a not in list2)]
You can easily transform it into a more unPythonic code with explicit looping through elements as you have now, but honestly I don't see a point (not that it matters):
def xor(list1, list2):
    """Return the values that appear in exactly one of the two lists.

    Uses only loops and ``in`` tests (no sets or sorting). Each value is
    reported once, in order of first appearance in ``list1 + list2``.
    """
    outputlist = []
    for item in list1 + list2:
        in_only_one = (item not in list1) or (item not in list2)
        if in_only_one and item not in outputlist:
            outputlist.append(item)
    return outputlist
Use set is better
>>> a = [1,2,5,7,9]
>>> b = [1,2,4,8,9]
>>> set(a).symmetric_difference(b)
{4, 5, 7, 8}
Thanks to #DSM, a better sentence is:
>>> set(a)^set(b)
These two statements are the same. But the latter is clearer.
Update: sorry, I did not see the last requirement: cannot use set. As far as I see, the solution provided by #sashkello is the best.
Note: This is really unpythonic and should only be used as a homework answer :)
After you have sorted both lists, you can find duplicates by doing the following:
1) Place iterators at the start of A and B
2) If Aitr is greater than Bitr, advance Bitr after placing Bitr's value in the return list
3) Else if Bitr is greater than Aitr, advance Aitr after placing Aitr's value in the return list
4) Else you have found a duplicate, advance Aitr and Bitr
This code works assuming you've got sorted lists. It works in linear time, rather than quadratic like many of the other solutions given.
def diff(sl0, sl1):
    """Yield the values found in only one of two sorted lists.

    Both inputs must already be sorted in ascending order. The lists are
    walked in step like a merge: equal heads are skipped together, and
    otherwise the smaller head is yielded and that list advances. Runs in
    time linear in the combined length.
    """
    i0, i1 = 0, 0
    while i0 < len(sl0) and i1 < len(sl1):
        if sl0[i0] == sl1[i1]:
            # Present in both lists: skip it on both sides.
            i0 += 1
            i1 += 1
        elif sl0[i0] < sl1[i1]:
            yield sl0[i0]
            i0 += 1
        else:
            yield sl1[i1]
            i1 += 1
    # At most one list has items left; none of them can have a partner.
    for item in sl0[i0:]:
        yield item
    for item in sl1[i1:]:
        yield item


print(list(diff([1, 2, 5, 7, 9], [1, 2, 4, 8, 9])))
Try this,
a = [1, 2, 5, 7, 9]
b = [1, 2, 4, 8, 9]
# Values that are in exactly one of the two lists.
difference = set(a).symmetric_difference(set(b))
print(difference)
Simple, but not particularly efficient :)
>>> a = [1,2,5,7,9]
>>> b = [1,2,4,8,9]
>>> [i for i in a+b if (a+b).count(i)==1]
[5, 7, 4, 8]
Or with "just loops"
>>> res = []
>>> for i in a+b:
...     c = 0
...     for j in a+b:
...         if i==j:
...             c += 1
...     if c == 1:
...         res.append(i)
...
>>> res
[5, 7, 4, 8]

Returning a copy of the list in which one instance of every value is removed

What I have only removes duplicated items and sorts them. I need to remove one instance of every item and return a new list with the remaining items in it. This is what I have:
def rem(nlst):
    """Return the distinct values of *nlst* in sorted order.

    NOTE: this is the question's original attempt with its indentation
    restored. It removes duplicates and sorts, which is not yet the
    requested "remove one instance of every value" behaviour.
    """
    n = []
    for x in nlst:
        if x not in n:
            n.append(x)
    n.sort()
    return n
This is what it should do:
>>> rem([4])
[]
>>> rem([4,4])
[4]
>>> rem([4, 1, 3, 2])
[]
>>> rem([2, 4, 2, 4, 4])
[2, 4, 4]
An easy implementation is to use collections.Counter:
def rem(iterable):
    """Return a sorted list of *iterable* with one instance of each value removed.

    Every value's count is lowered by one; ``Counter.elements`` then
    repeats each value by its remaining count and leaves out values whose
    count has dropped to zero.
    """
    c = collections.Counter(iterable)
    for k in c:
        # Only values change here, never the key set, so mutating during
        # iteration is safe.
        c[k] -= 1
    return sorted(c.elements())
In Python versions before 2.7, collections.Counter is not available. You can use a set to record the items you already saw instead:
def rem(iterable):
    """Return a sorted list of *iterable* with one instance of each value removed.

    The first occurrence of each value is only recorded in ``seen``; every
    later occurrence is kept. Values must be hashable.
    """
    result = []
    seen = set()
    for x in iterable:
        if x in seen:
            result.append(x)
        else:
            seen.add(x)
    result.sort()
    return result
A slight tweak to your code seems to work ok. Just added a variable to track the current value and only append a new one if you have already seen that value:
def rem(nlist):
    """Return a sorted list of *nlist* with one instance of each value removed.

    Works on a sorted copy, so the caller's list is left untouched. In
    sorted order the first item of each run of equal values is the one
    dropped; every item equal to its predecessor is kept.
    """
    ordered = sorted(nlist)
    # Pairing each item with its predecessor needs no "previous value"
    # sentinel that could collide with real data.
    return [cur for prev, cur in zip(ordered, ordered[1:]) if cur == prev]
~

Categories