Python Check For Duplicate Values In A Object Array

Python Check For Duplicate Values In A Object Array - python

I have an array that stores objects. I am trying to see whether there are duplicate values in this object array, but only for one of the object's attributes (hexdigest).
How can I check for duplicates and record the entire object of duplicates I find?
# class to store hashes
class customclass:
def __init__(self, value, hexdigest):
self.value = value
self.hexdigest = hexdigest
# array to store class data
hash_array = []
hash_array.append(customclass(value=299, hexdigest='927'))
hash_array.append(customclass(value=207, hexdigest='92b'))
hash_array.append(customclass(value=113, hexdigest='951'))
hash_array.append(customclass(value=187, hexdigest='951'))
hash_array.append(customclass(value=205, hexdigest='998'))
# sort array
sorted_array = sorted(hash_array, key=attrgetter('hexdigest'))
# check for duplicate hexdigest's
newlist = []
duplist = []
for obj in sorted_array:
for jbo in newlist:
if obj.hexdigest not in jbo:
newlist.append(obj)
else:
duplist.append(obj)

# Collect every object whose hexdigest was already seen earlier in the scan.
# The first object carrying a given hexdigest is not counted as a duplicate.
seen_hexdigests = set()  # set membership is O(1); rescanning a list is O(n)
duplist = []
for obj in sorted_array:
    if obj.hexdigest in seen_hexdigests:
        duplist.append(obj)
    else:
        seen_hexdigests.add(obj.hexdigest)
Use the block of code above, instead of the one below that you implemented, to find the list of duplicate objects:
newlist = []
duplist = []
for obj in sorted_array:
for jbo in newlist:
if obj.hexdigest not in jbo:
newlist.append(obj)
else:
duplist.append(obj)

Well, newlist is empty, so the inner for loop never runs, so nothing gets appended to newlist or duplist.
You may wish to group by the hexdigest attribute using itertools.groupby and a dictionary comprehension.
from operator import attrgetter
from itertools import groupby
class customclass:
def __init__(self, value, hexdigest):
self.value = value
self.hexdigest = hexdigest
hash_array = []
hash_array.append(customclass(value=299, hexdigest='927'))
hash_array.append(customclass(value=207, hexdigest='92b'))
hash_array.append(customclass(value=113, hexdigest='951'))
hash_array.append(customclass(value=187, hexdigest='951'))
hash_array.append(customclass(value=205, hexdigest='998'))
sorted_array = sorted(hash_array, key=attrgetter('hexdigest'))
# [<__main__.customclass object at 0x7f488d1a2a20>,
# <__main__.customclass object at 0x7f488d1a29b0>,
# <__main__.customclass object at 0x7f488d1a2b00>,
# <__main__.customclass object at 0x7f488d1a2b70>,
# <__main__.customclass object at 0x7f488d1a2c18>]
groups = groupby(sorted_array, key=attrgetter('hexdigest'))
{k: list(v) for k, v in groups}
# {'927': [<__main__.customclass object at 0x7f488d1a2a20>],
# '92b': [<__main__.customclass object at 0x7f488d1a29b0>],
# '951': [<__main__.customclass object at 0x7f488d1a2b00>,
# <__main__.customclass object at 0x7f488d1a2b70>],
# '998': [<__main__.customclass object at 0x7f488d1a2c18>]}
From there it's relatively easy to retrieve the unique and duplicate values.
It may be easier to visualize what's going on if you provide a more useful definition for __repr__.
class customclass:
def __init__(self, value, hexdigest):
self.value = value
self.hexdigest = hexdigest
def __repr__(self):
return f"<customclass value: {self.value}, hexdigest: {self.hexdigest}>"
Doing so, hash_array prints in the interactive interpreter as follows, with the exception of the newlines I added for sanity's sake.
[<customclass value: 299, hexdigest: 927>,
<customclass value: 207, hexdigest: 92b>,
<customclass value: 113, hexdigest: 951>,
<customclass value: 187, hexdigest: 951>,
<customclass value: 205, hexdigest: 998>]

Related

Python dynamic enum

I would like to create dynamic enums in Python loaded from a SQL table. The output of the SQL query will be a list of tuples, with which I want to fill the attributes of the enum.
Lets say I receive this list:
lst = [('PROCESS_0', 0, "value", 123, False), ('PROCESS_1',1,"anothervalue", 456, True)]
I now want to fill the values in the enum below:
class Jobs(IntEnum):
def __new__(cls, value: int, label: str, heartbeat: int = 60, heartbeat_required: bool = False):
obj = int.__new__(cls, value)
obj._value_ = value
obj.label = label
obj.heartbeat = heartbeat
obj.heartbeat_required = heartbeat_required
return obj
The first variable in the tuple should be the variable name of the enum, I have solved this with:
locals()['Test'] = (0, '', 789, False)
But this only works for single values, it seems that I can not run a for loop within enum. When using a for loop like this:
for i in lst:
locals()[i[0]] = (i[1], i[2], i[3])
Python raises the error TypeError: Attempted to reuse key: 'i', which probably comes from enums only having constants.
Is there any (possibly elegant) solution for this?
Many thanks in advance!

You need to use _ignore_ = "i". Something like:
class Jobs(IntEnum):
_ignore_ = "i"
def __new__(cls, value, label, heartbeat=60, heartbeat_required=False):
obj = int.__new__(cls, value)
obj._value_ = value
obj.label = label
obj.heartbeat = heartbeat
obj.heartbeat_required = heartbeat_required
return obj
for i in lst:
locals()[i[0]] = i[1:]

Check the example at https://docs.python.org/3/howto/enum.html#timeperiod
Note that the _ignore_ can be avoided in favor of dict comprehension
from datetime import timedelta
class Period(timedelta, Enum):
"different lengths of time"
vars().update({ f"day_{i}": i for i in range(367) })
Then you can access all possible enum values via Period.__members__

Python Pickle not saving entire object

I'm trying to pickle out a list of objects where the objects contain a list. When I open the pickled file I can see all the data in my objects except for the list. I'm putting code below so this makes more sense.
Object that contains a list.
class TestPickle:
testNumber = None
testList = []
def addNumber(self, value):
self.testNumber = value
def getNumber(self):
return self.testNumber
def addTestList(self, value):
self.testList.append(value)
def getTestList(self):
return self.testList
This example I create a list of the above object (I'm adding one object to keep it brief)
testPKL = TestPickle()
testList = []
testPKL.addNumber(12)
testPKL.addTestList(1)
testPKL.addTestList(2)
testList.append(testPKL)
with open(os.path.join(os.path.curdir, 'test.pkl'), 'wb') as f:
pickle.dump(testList, f)
Here is an example of me opening the pickled file and trying to access the data. I can only retrieve the testNumber from above; the testList returns an empty list.
pklResult = None
with open(os.path.join(os.path.curdir, 'test.pkl'), 'rb') as f:
pklResult = pickle.load(f)
for result in pklResult:
print result.getNumber() # returns 12
print result.testNumber # returns 12
print result.getTestList() # returns []
print result.testList # returns []
I think i'm missing something obvious here but I'm not having any luck spotting it. Thanks for any guidance.

testNumber and testList are both class attributes initially. testNumber holds an immutable value, so changing it means assigning a new value (self.testNumber = value), which creates a new instance attribute. testList, however, is mutable and is modified in place (append), so no instance attribute is created and it remains a class attribute.
You can verify it -
print testPKL.__dict__
{'testNumber': 12}
print result.__dict__
{'testNumber': 12}
So when you access result.testList, it looks for class attribute TestPickle.testList, which is [] in your case.
Solution
You are storing instance in pickle so use instance attribute. Modify TestPickle class as below -
class TestPickle:
def __init__(self):
self.testNumber = None
self.testList = []
def addNumber(self, value):
self.testNumber = value
def getNumber(self):
return self.testNumber
def addTestList(self, value):
self.testList.append(value)
def getTestList(self):
return self.testList

How do I use dictionary key,value pair to set class instance attributes "pythonic"ly?

I have created some Python classes to use as multivariate data structures, which are then used for various tasks. In some instances, I like to populate the classes with various value sets. The default parameter filename "ho2.defaults" would look something like this:
name = 'ho2'
mass_option = 'h1o16'
permutation = 'odd'
parity = 'odd'
j_total = 10
lr = 40
br = 60
jmax = 60
mass_lr = 14578.471659
mass_br = 1781.041591
length_lr = ( 1.0, 11.0, 2.65 )
length_br = ( 0.0, 11.0, 2.46 )
use_spline = True
energy_units = 'au'
pes_zpe = -7.407998138300982E-2
pes_cutoff = 0.293994
Currently, I create a dictionary from reading the desired key,value pairs from file, and now I'd like a "pythonic" way of making those dictionary keys be class instance variable names, i.e.
# Instantiate Molecule Class
molecule = Molecule()
# Create Dictionary of default values
default_dict = read_dict_from_file(filename)
# Set populate class instance variables with dictionary values
for key,value in default_dict:
molecule.key = value
So the Class's instance variable "molecule.name" could be set with the dictionary key,value pair. I could do this by hand, but I'm sure there is a better way to loop through it. In actuality, the dictionary could be large, and I'd rather allow the user to choose which values they want to populate, so the dictionary could change. What am I missing here?

You would use setattr: setattr(molecule, key, value)

The simple way is:
vars(molecule).update(default_dict)
This will clobber any pre-existing attributes though. For a more delicate approach try:
# Only fill in attributes the object does not already have, so existing
# values are never overwritten.
for name, value in default_dict.items():
    if not hasattr(molecule, name):
        setattr(molecule, name, value)

I'd invert the logic so that the object dynamically answers questions:
class Settings(object):
    """Expose a whitelisted subset of a defaults mapping as attributes.

    A key can be read as an attribute only when it is listed in ``ATTRS``
    *and* present in the stored mapping; anything else raises
    ``AttributeError``. Attribute assignment writes into the stored mapping.
    """

    # Names that may be read as attributes; subclasses override this set.
    ATTRS = {'foo', 'bar'}

    def __init__(self, defaults):
        # Write through __dict__ directly: a plain ``self.data = ...`` would
        # be routed to __setattr__ below, which needs self.data to exist.
        self.__dict__['data'] = defaults.copy()

    def __getattr__(self, key):
        # Called only when normal lookup fails, so once __init__ has run,
        # 'data' and 'ATTRS' themselves never end up here.
        if key not in self.ATTRS or key not in self.data:
            raise AttributeError("'{}' object has no attribute '{}'".format(
                self.__class__.__name__, key))
        return self.data[key]

    def __setattr__(self, key, value):
        # Every attribute assignment is stored in the backing mapping.
        self.data[key] = value


# Self-check demonstrating the lookup rules.
s = Settings({'a': 'b', 'foo': 'foo!', 'spam': 'eggs'})
print(s.foo)
try:
    print(s.spam)
except AttributeError:
    pass
else:
    raise AssertionError("That should have failed because 'spam' isn't in Settings.ATTRS")
try:
    print(s.bar)
except AttributeError:
    pass
else:
    raise AssertionError("That should have failed because 'bar' wasn't passed in")
class Molecule(Settings):
    # Attribute names a Molecule exposes; the trailing ``...`` is a
    # placeholder for the remaining parameter names from the defaults file.
    ATTRS = {'name', 'mass_option', ...}


molecule = Molecule(default_dict)

Find object from list that has attribute equal to some value and also get the next object after

I am able to find an object whose attribute equals some value. But I would also like to get the object after it in the list (and if the found object is the last in the list, the next object should be the first object). Something like:
from pprint import pprint
class User(object):
def __init__(self, name):
self.name = name
users = []
users.append(User("Peter"))
users.append(User("James"))
users.append(User("John"))
# find object that has attribute name equal to James
pprint(vars([user for user in users if user.name == "James"][0]))
And the output from pprint line prints:
{'name': 'James'}
That is correct.
I would like to ask you how to get the next object after "James" and also if I would search for "John" the next object after "John" should be returned "Peter". Suggestions?
I also tried with itertools, but I cannot get the next element if the found element is last:
from itertools import enumerate
_i = next(i for i, user in enumerate(users) if (user.name == "John"))
print users[_i + 1] #this is not working
I could add if condition to change the counter before operation [_i+1] but I would like to know if there is more smoother solution to this?

To handle the last element, you can use modulo: index % len(users).
Here is one way:
def find_after_name(users, name):
for i, user in enumerate(users):
if user.name == name:
return users[(i+1) % len(users)]
Another option would be to zip the list with a shifted copy of the list. deque.rotate() is useful for such shifting:
from collections import deque
def find_after_name(users, name):
    """Return the user that follows the first user called *name*.

    The search wraps around: the user after the last one is the first one.
    Returns None when no user has that name (including for an empty list).
    """
    # A copy rotated left by one lines each user up with its successor.
    shifted = deque(users)
    shifted.rotate(-1)
    for current, following in zip(users, shifted):
        if current.name == name:
            return following

Since you've chosen to use OOP, why not implement a UserList class inherited from built-in list class?
class User(object):
def __init__(self, name):
self.name = name
def __str__(self):
return repr(self)
def __repr__(self):
return "User('{0}')".format(self.name)
class UserList(list):
    """A list of users that supports lookup by name."""

    def find(self, name):
        """Return ``(index, user)`` for the first user called *name*.

        Returns None when no user has that name.
        """
        for k, user in enumerate(self):
            if user.name == name:
                return k, user
        return None

    def next_to(self, name):
        """Get the user next to the one called *name* (e.g. 'James').

        The user after the last one is the first one.

        Raises:
            ValueError: if no user is called *name*.
        """
        found = self.find(name)
        if found is None:
            # Fail with a clear message instead of an unpacking TypeError.
            raise ValueError("no user named {!r}".format(name))
        index, _ = found
        return self[self.get_next_to_index(index)]

    def get_next_to_index(self, index):
        """Return the index following *index*, wrapping to 0 at the end."""
        next_to_index = index + 1
        if next_to_index == len(self):
            # index is already the last element, so wrap to the start
            next_to_index = 0
        return next_to_index
users = UserList()
users.append(User("Peter"))
users.append(User("James"))
users.append(User("John"))
print users.find('James')
print users.next_to('James')
print users.next_to('John')
Output:
(1, User('James'))
User('John')
User('Peter')

For kicks, I wanted to see if there was an itertools solution:
from itertools import dropwhile
def find_after_name(users, name):
for i, _ in dropwhile(lambda eu: eu[1].name != name, enumerate(users)):
return users[(i+1) % len(users)]
Note: There really should be a list.index(value, key=None) method where key is like the key argument to list.sort(). Then you could do something like this:
index = users.index("John", key=lambda u: u.name)

Solution extending list class:
class extended_list(list):
    """A list whose ``index`` accepts an optional ``key`` function."""

    def index(self, value, key=None):
        """Return the position of the first item matching *value*.

        Without *key* this is plain ``list.index(value)``. With *key*, an
        item matches when ``value == key(item)``.

        Raises:
            ValueError: if no item matches.
        """
        if key is None:
            # super() already binds self; passing it again would search
            # for the list itself.
            return super().index(value)
        for position, item in enumerate(self):
            if value == key(item):
                return position
        raise ValueError("{} is not in list".format(repr(value)))
def find_after_name(users, name):
i = extended_list(users).index(name, key=lambda u: u.name)
return users[(i+1) % len(users)]

merge join two generators in python

I want to merge two Kyoto Cabinet B-tree databases by key
(using the Kyoto Cabinet Python API).
The resulting list should contain each unique key (and its value) from either of the two input databases.
The following code works, but I think it's ugly.
left_generator and right_generator are two cursor objects.
It's especially odd that get() returns None if the generator is exhausted.
def merge_join_kv(left_generator, right_generator):
stop = False
while left_generator.get() or right_generator.get():
try:
comparison = cmp(right_generator.get_key(), left_generator.get_key())
if comparison == 0:
yield left_generator.get_key(), left_generator.get_value()
left_generator.next()
right_generator.next()
elif (comparison < 0) or (not left_generator.get() or not right_generator.get()):
yield right_generator.get_key(), right_generator.get_value()
right_generator.next()
else:
yield left_generator.get_key(), left_generator.get_value()
left_generator.next()
except StopIteration:
if stop:
raise
stop = True
generally: is there a function/lib which merge joins generators with cmp() ?

I think this is what you need; orderedMerge is based on Gnibbler's code but adds a custom key function and a unique argument:
import kyotocabinet
import collections
import heapq
class IterableCursor(kyotocabinet.Cursor, collections.Iterator):
    """Wrap a kyotocabinet cursor so it can be consumed as an iterator.

    NOTE(review): ``collections.Iterator`` was removed in Python 3.10; on
    Python 3 the base class has to be ``collections.abc.Iterator``.
    """

    def __init__(self, *args, **kwargs):
        kyotocabinet.Cursor.__init__(self, *args, **kwargs)
        collections.Iterator.__init__(self)

    def next(self):
        "Return (key,value) pair"
        # Presumably get(True) reads the current record and then steps the
        # cursor forward - confirm against the Kyoto Cabinet API.
        res = self.get(True)
        if res is None:
            raise StopIteration
        return res

    # Python 3 spells the iterator method __next__.
    __next__ = next
def orderedMerge(*iterables, **kwargs):
    """Take a list of ordered iterables; return as a single ordered generator.

    Each input must already be sorted ascending by its key value.

    :param key: function, for each item return key value
        (Hint: to sort descending, return negated key value)
    :param unique: boolean, return only first occurrence for each key value?
    """
    key = kwargs.get('key', (lambda x: x))
    unique = kwargs.get('unique', False)

    # Marks "no key yielded yet"; a plain None would wrongly suppress a
    # first item whose key really is None.
    no_key = object()

    # preprocess iterators as heapqueue
    # Entries are [keyval, itnum, data, iterator]; itnum is unique, so heap
    # comparisons never go past it to the data or the iterator.
    h = []
    for itnum, it in enumerate(map(iter, iterables)):
        try:
            data = next(it)  # built-in next() works on Python 2.6+ and 3
        except StopIteration:
            continue  # empty input contributes nothing
        h.append([key(data), itnum, data, it])
    heapq.heapify(h)

    # process iterators in ascending key order
    oldkeyval = no_key
    while True:
        try:
            while True:
                # get smallest-key value; raises IndexError when h is empty
                keyval, itnum, data, it = s = h[0]
                # if unique, skip duplicate keys
                if not (unique and oldkeyval is not no_key
                        and keyval == oldkeyval):
                    yield data
                    oldkeyval = keyval
                # load replacement value from same iterator
                s[2] = data = next(it)  # raises StopIteration when exhausted
                s[0] = key(data)
                heapq.heapreplace(h, s)  # restore heap condition
        except StopIteration:
            heapq.heappop(h)  # remove empty iterator
        except IndexError:
            return
then your function can be done as
from operator import itemgetter
def merge_join_kv(leftGen, rightGen):
# assuming that kyotocabinet.Cursor has a copy initializer
leftIter = IterableCursor(leftGen)
rightIter = IterableCursor(rightGen)
return orderedMerge(leftIter, rightIter, key=itemgetter(0), unique=True)

Python 2.6 has a merge in heapq, but it does not support a user defined cmp/key func
def merge(*iterables):
    '''Merge multiple sorted inputs into a single sorted output.

    Similar to sorted(itertools.chain(*iterables)) but returns a generator,
    does not pull the data into memory all at once, and assumes that each of
    the input streams is already sorted (smallest to largest).

    >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
    [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]
    '''
    # Imported here so the function is self-contained outside heapq itself.
    from heapq import heapify, heappop, heapreplace

    # Entries are [value, itnum, iterator]; itnum is unique, so ties on
    # value never fall through to comparing the iterators.
    h = []
    for itnum, it in enumerate(map(iter, iterables)):
        try:
            h.append([next(it), itnum, it])  # next() works on 2.6+ and 3
        except StopIteration:
            pass  # empty input contributes nothing
    heapify(h)

    while True:
        try:
            while True:
                v, itnum, it = s = h[0]  # raises IndexError when h is empty
                yield v
                s[0] = next(it)  # raises StopIteration when exhausted
                heapreplace(h, s)  # restore heap condition
        except StopIteration:
            heappop(h)  # remove empty iterator
        except IndexError:
            return

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python Check For Duplicate Values In A Object Array - python

Related

Python dynamic enum

Python Pickle not saving entire object

How do I use dictionary key,value pair to set class instance attributes "pythonic"ly?

Find object from list that has attribute equal to some value and also get the next object after

merge join two generators in python

Categories

Resources