Issue with Python class instances having a shallow connection - python

I'm attempting to write a genetic algorithm framework in Python, and am running into issues with shallow/deep copying. My background is mainly C/C++, and I'm struggling to understand how these connections are persisting.
What I am seeing is an explosion in the length of an attribute list within a subclass. My code is below...I'll point out the problems.
This is the class for a single gene. Essentially, it should have a name, value, and boolean flag. Instances of Gene populate a list within my Individual class.
# gene class
class Gene():
# constructor
def __init__(self, name, is_float):
self.name_ = name
self.is_float_ = is_float
self.value_ = self.randomize_gene()
# create a random gene
def randomize_gene(self):
return random.random()
This is my Individual class. Each generation, a population of these are created (I'll show the creation code after the class declaration) and have the typical genetic algorithm operations applied. Of note is the print len(self.Genes_) call, which grows each time this class is instantiated.
# individual class
class Individual():
# genome definition
Genes_ = [] # genes list
evaluated_ = False # prevent re-evaluation
fitness_ = 0.0 # fitness value (from evaluation)
trace_ = "" # path to trace file
generation_ = 0 # generation to which this individual belonged
indiv_ = 0 # identify this individual by number
# constructor
def __init__(self, gen, indv):
# assign indices
self.generation_ = gen
self.indiv_ = indv
self.fitness_ = random.random()
# populate genome
for lp in cfg.params_:
g = Gene(lp[0], lp[1])
self.Genes_.append(g)
print len(self.Genes_)
> python ga.py
> 24
> 48
> 72
> 96
> 120
> 144
......
As you can see, each Individual should have 24 genes, however this population explodes quite rapidly.
I create an initial population of new Individuals like this:
# create a randomized initial population
def createPopulation(self, gen):
loc_population = []
for i in range(0, cfg.population_size_):
indv = Individual(gen, i)
loc_population.append(indv)
return loc_population
and later on my main loop (apologies for the whole dump, but I felt it was necessary — if my secondary calls (mutation/crossover) are needed, please let me know)
for i in range(0, cfg.generations_):
# evaluate current population
self.evaluate(i)
# sort population on fitness
loc_pop = sorted(self.population_, key=operator.attrgetter('fitness_'), reverse=True)
# create next population & preserve elite individual
next_population = []
elitist = copy.deepcopy(loc_pop[0])
elitist.generation_ = i
next_population.append(elitist)
# perform selection
selection_pool = []
selection_pool = self.selection(elitist)
# perform crossover on selection
new_children = []
new_children = self.crossover(selection_pool, i)
# perform mutation on selection
muties = []
muties = self.mutation(selection_pool, i)
# add members to next population
next_population = next_population + new_children + muties
# fill out the rest with random
for j in xrange(len(next_population)-1, cfg.population_size_ - 1):
next_population.append(Individual(i, j))
# copy next population over old population
self.population_ = copy.deepcopy(next_population)
# clear old lists
selection_pool[:] = []
new_children[:] = []
muties[:] = []
next_population[:] = []

I'm not completely sure that I understand your question, but I suspect that your problem is that the Genes_ variable in your Individual() class is declared in the class namespace. This namespace is shared by all instances of the class — in other words, every instance of Individual() will share the same variable Genes_.
Consider the following two snippets:
class Individual():
# genome definition
genes = []
def __init__(self):
for i in xrange(10):
self.genes.append(i)
ind_1 = Individual()
print ind_1.genes
ind_2 = Individual()
print ind_1.genes
print ind_2.genes
and
class Individual():
# genome definition
def __init__(self):
self.genes = []
for i in xrange(10):
self.genes.append(i)
ind_1 = Individual()
print ind_1.genes
ind_2 = Individual()
print ind_1.genes
print ind_2.genes
The first snippet outputs
>>> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
while the second snippet outputs
>>> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
In the first scenario, when the second Individual() is instantiated the genes list variable already exists, and the genes from the second individual are added to this existing list.
Rather than creating the Individual() class like this,
# individual class
class Individual():
# genome definition
Genes_ = [] # genes list
# constructor
def __init__(self, gen, indv):
# assign indices
self.generation_ = gen
self.indiv_ = indv
self.fitness_ = random.random()
you should consider declaring the Genes_ variable in init so that each Individual() instance gets its own gene set
# individual class
class Individual():
# constructor
def __init__(self, gen, indv):
# genome definition
self.Genes_ = [] # genes list
# assign indices
self.generation_ = gen
self.indiv_ = indv
self.fitness_ = random.random()

When you create a class, you are really creating exactly one 'class object'. These are objects just like any other object in Python; everything in Python is an object, and what those objects do is defined by their methods, not their class! That is the magic of duck typing. In Python you can even create new classes dynamically on the fly.
Anyway, you are adding exactly one list object to the "Genes_" attribute of the one and only "Individuals" class object. The upshot is that every instance object of the "Individual" class object is accessing the same "Genes_" list object.
Consider this
# In 2.2 <= Python < 3.0 you should ALWAYS inherit from 'object'.
class Foobar(object):
doodah = []
a = Foobar()
b = Foobar()
assert id(a.doodah) == id(b.doodah) # True
In this case, as you can see, "a.doodah" and "b.doodah" are the same object!
class Foobar(object):
def __init__(self):
self.doodah = []
a = Foobar()
b = Foobar()
assert id(a.doodah) != id(b.doodah) # True
In this case, they are different objects.
It's possible to have your cake and eat it too. Consider this
class Foobar(object):
doodah = []
a = Foobar()
b = Foobar()
a.doodah = 'hlaghalgh'
assert id(a.doodah) != id(b.doodah) # True
In this case a "doodah" attribute is added to the "a" object, which overrides the class attribute.
Hope this helps!

Related

Serialize both inner and outer class arguments

I'm not quite sure I'm using the right wording in my researches -- if that's the case, please let me know, I may have missed obvious answers just because of that -- but I'd like to serialize (i.e. convert to a dictionary or JSON structure) both the main (outer) and inner class arguments of a class.
Here's an example:
class Outer(object):
def __init__(self, idx, array1, array2):
self.idx = idx
# flatten individual values:
## unpack first array
self.prop_a = array1[0]
self.prop_b = array1[1]
self.prop_c = array1[2]
## unpack second array
self.prop_d = array2[0]
self.prop_e = array2[1]
self.prop_f = array2[2]
# Nest elements to fit a wanted JSON schema
class inner1(object):
def __init__(self, outer):
self.prop_a = outer.prop_a
self.prop_b = outer.prop_b
self.prop_c = outer.prop_c
class inner2(object):
def __init__(self, outer):
self.prop_d = outer.prop_d
self.prop_e = outer.prop_e
self.prop_f = outer.prop_f
self.inner_first = inner1(self)
self.inner_second = inner2(self)
def serialize(self):
return vars(self)
Now I can call both:
import numpy as np
obj = Outer(10, np.array([1,2,3]), np.array([4,5,6]))
obj.prop_a # returns 1, or
obj.inner_first.prop_a # also returns 1
But when I try to serialize it, it prints:
vars(obj) # prints:
{'idx': 10,
'prop_a': 1,
'prop_b': 2,
'prop_c': 3,
'prop_d': 4,
'prop_e': 5,
'prop_f': 6,
'inner_first': <__main__.Outer.__init__.<locals>.inner1 at 0x7f231a4fe3b0>,
'inner_second': <__main__.Outer.__init__.<locals>.inner2 at 0x7f231a4febc0>}
where I want it to print:
vars(obj) # prints:
{'idx': 10,
'prop_a': 1,
'prop_b': 2,
'prop_c': 3,
'prop_d': 4,
'prop_e': 5,
'prop_f': 6,
'inner_first': {'prop_a': 1, 'prop_b': 2, 'prop_c': 3},
'inner_second': {'prop_d': 4, 'prop_e': 5, 'prop_f': 6}}
with the 'inner_first' key being the actual results of vars(obj.inner_first), and same thing for the 'inner_second' key.
Ideally I'd like to call the serialize() method to convert my object to the desired output: obj.serialize()
I feel I'm close to the results but I can simply not see where I must go to solve this task.
At the really end, I wish I could simply:
obj = Outer(10, np.array([1,2,3]), np.array([4,5,6]))
obj.serialize()
{
'inner_first': {
'prop_a': 1,
'prop_b': 2,
'prop_c': 3
},
'inner_second': {
'prop_d': 4,
'prop_e': 5,
'prop_f': 6
}
}
in order to basically fit a given JSON structure that I have.
Info: this thread helped me to build the inner classes.
Also note that this question only embeds two "layers" or "levels" of the final structure, but I may have more than 2.

TypeError: INT object is not iterable while defining a custom iterator

I am tying to make my custom class iterable by defining an iterator based on Vijay Shankar's answer here:
import numpy as np
import itertools
class MyClass():
id = itertools.count()
def __init__(self, location = None):
self.id = next(MyClass.id)
self.location = np.random.uniform(0, 1, size=(1, 2)).tolist()
def __iter__(self):
for _ in self.__dict__.values():
yield _
def create():
objects = []
objects.append(MyClass())
counter = 1
while counter != 20:
new_object = MyClass()
objects.append(new_object)
counter = counter + 1
return objects
objects = create()
objects = [[item for subsublist in sublist for item in subsublist] for sublist in objects]
However, I still get this error
objects = [[item for subsublist in sublist for item in subsublist] for sublist in objects]
TypeError: 'MyClass' object is not iterable
How can I fix this problem?
Edit:
Currently, this is what the iterator returns:
>>> print([x for x in create()[0]])
[20, [[0.2552026126490259, 0.48489389169530417]]]
How should one revise it so that it returns like below?
>>> print([x for x in create()[0]])
[20, [0.2552026126490259, 0.48489389169530417]]
Your code has one too many iterations:
[[item for subsublist in sublist for item in subsublist] for sublist in objects]
I count 3 `for`s, meaning 3 iterations. The first iteration is over the list from create(), yielding the MyClass() objects. The second iteration is over the attributes of each MyClass(). The third attempts to iterate over location/id/whatever other attributes the class has. This isn't safe because the attribute id (an int) is not iterable.
List[MyClass] -> MyClass -> Properties of MyClass (id/location) -> ERROR
Your iterator is working. Here's an iteration over just a single MyClass():
print([x for x in create()[0]])
>>> [20, [[0.2552026126490259, 0.48489389169530417]]]
If you want to expand a list of your class (instead of just one as I did above)
my_classes = create()
objects = [[attribute for attribute in my_class] for my_class in my_classes]
print(objects)
>>>[[0, [[0.7226935825759357, 0.18522688980137658]]], [1, [[0.1660964810272717, 0.016810136422152677]]], [2, [[0.1611089351209548, 0.3935547119768953]]], [3, [[0.4589556901947873, 0.18405198063215056]]], [4, [[0.811343515881961, 0.6123114388786854]]], [5, [[0.38830918188777996, 0.23119360704055836]]], [6, [[0.3269834811013743, 0.3608326475799025]]], [7, [[0.9971686351479419, 0.7054058805215702]]], [8, [[0.11316919241038192, 0.07453424664431929]]], [9, [[0.5548059787590179, 0.062422711183232615]]], [10, [[0.38567389514423267, 0.659106105987059]]], [11, [[0.973277039327461, 0.2821071201116454]]], [12, [[0.16566758369419543, 0.3010363002131601]]], [13, [[0.923317671409532, 0.30016022638587536]]], [14, [[0.9757923181511164, 0.5888806462517852]]], [15, [[0.5582498753119571, 0.27190786180188264]]], [16, [[0.28120075553258217, 0.6873211952682786]]], [17, [[0.7016575026994472, 0.5820325771264436]]], [18, [[0.5815482608888624, 0.22729004063915448]]], [19, [[0.2009082164070768, 0.11317171355184519]]]]
Additionally. You may as well use yield from here. As you're yielding another iterable.
class MyClass():
    """Iterable class: iterating an instance yields each of its attribute values."""

    # Class-level counter shared by all instances; next() hands out unique ids.
    id = itertools.count()

    def __init__(self, location=None):
        # Take the next unique id from the shared class-level counter.
        self.id = next(MyClass.id)
        # Random 1x2 point in [0, 1); .tolist() keeps the nested-list shape.
        self.location = np.random.uniform(0, 1, size=(1, 2)).tolist()

    def __iter__(self):
        # Delegate to the instance attribute values (fixes the '.valies()' typo,
        # which raised AttributeError on every iteration attempt).
        yield from self.__dict__.values()
EDIT:
Per your question on location being a nested list instead of a list just throw away that extra dimension when you assign to self.location.
print(np.random.uniform(0, 1, size=(1, 2)).tolist())
>>> [[0.3649653171602294, 0.8447097505387996]]
print(np.random.uniform(0, 1, size=(1, 2)).tolist()[0])
[0.247024738276844, 0.9303441776787809]
>>>

Specify a Default Offset for Python List

Is there a way in python that you can specify a default offset for list?
Like:
a = [0, 1, 2, 3, 4, 5, 6]
a.offset = 2
So that whenever use index for access/modify, the index will be added by the offset first:
a[0] == 2
a[4] == 6
There's no built-in way to achieve this. However you can create your custom class by extending list to get this behaviour. When you do my_list[n], internally __getitem__() function is triggered. You can override this function to return the value by adding offset to the index.
Similarly, list contains other magic functions which you can override to further modify the behaviour of your custom class. For example, __setitem__() is triggered when you assign any value to list, __delitem__() is trigger while deleting the item.
Here's a sample code to create OffsetList class which takes additional argument as offset while creating the list, and performs index based operations on index+offset value.
class OffsetList(list):
    """A list whose index-based operations are shifted by a fixed offset.

    Reading, writing, deleting, and slicing all translate the caller's
    index by ``offset`` before touching the underlying list.
    """

    def __init__(self, offset, *args, **kwargs):
        super(OffsetList, self).__init__(*args, **kwargs)
        # Amount added to every integer index and slice bound before lookup.
        self.offset = offset

    def _get_offset_index(self, key):
        """Return *key* (int or slice) translated by the configured offset."""
        if isinstance(key, slice):
            start = key.start if key.start is None else key.start + self.offset
            stop = key.stop if key.stop is None else key.stop + self.offset
            return slice(start, stop, key.step)
        if isinstance(key, int):
            return key + self.offset
        return key

    def __getitem__(self, key):
        return super(OffsetList, self).__getitem__(self._get_offset_index(key))

    def __setitem__(self, key, value):
        return super(OffsetList, self).__setitem__(self._get_offset_index(key), value)

    def __delitem__(self, key):
        return super(OffsetList, self).__delitem__(self._get_offset_index(key))
Sample Run:
# With offset as `0`, behaves as normal list
>>> offset_list = OffsetList(0, [10,20,30,40,50,60])
>>> offset_list[0]
10
# With offset as `1`, returns index+1
>>> offset_list = OffsetList(1, [10,20,30,40,50,60])
>>> offset_list[0]
20
# With offset as `2`, returns index+2
>>> offset_list = OffsetList(2, [10,20,30,40,50,60])
>>> offset_list[0]
30
# Slicing support, with `start` as start+offset and `end` as end+offset
>>> offset_list[1:]
[40, 50, 60]
# Assigning new value, based on index+offset
>>> offset_list[0] = 123
>>> offset_list
[10, 20, 123, 40, 50, 60]
# Deleting value based on index+offset
>>> del offset_list[0]
>>> offset_list
[10, 20, 40, 50, 60]
Similarly you can modify the behaviour of other magic functions like __len__(), __iter__(), __repr__(), __str__(), etc as per your need.
There is no such feature in Python -- or in any other language that I know of. Your suggested syntax is reasonable, assuming that you could get the feature approved. However, it has several drawbacks.
Until and unless this feature became common usage, you would confuse anyone trying to read such code. Zero-based and one-based indexing are the "rule"; arbitrary indexing is a violation of long-learned assumptions.
You would seriously crimp Python's right-end indexing: the semantics aren't clear. If someone writes a[-1] to access the last element, should they get that element (this is a language-defined idiom), the original a[1] element (per your definition), a "reflective" a[-3], or index out of bounds trying to move two elements to the right?
Note that Python does give you the capability to define your own functionality:
class
Any time you don't like the given data types, you get to make your own. You're not allowed to alter the built-in types, but you can do what you like by inheriting from list and writing your own get and other methods.
If you're just reading data from the list, you could probably work with a subscript copy of the original:
a = [0, 1, 2, 3, 4, 5, 6]
a = a[2:]
a[0] == 2 # True
a[4] == 6 # True
Keep in mind that this makes a copy of the list using the same variable name so you are losing the original content (indexes 0 and 1). You could keep it in a separate variable if you do need it though:
a = [0, 1, 2, 3, 4, 5, 6]
a0,a = a,a[2:]
a[0] == 2 # True
a[4] == 6 # True
a0[0] == 0 # True
a0[4] == 4 # True
If you really need a view on the original array with read and write capabilities, then I would suggest using a numpy array:
import numpy as np
a = np.array([0, 1, 2, 3, 4, 5, 6])
b = a[2:].view()
b[0] == 2 # True
b[4] == 6 # True
b[1] = 99
print(a) # [ 0 1 2 99 4 5 6]
a[3] == 99 # True
If you want to implement something similar to numpy yourself, you could create a class that represents a "view" on a list with an internal slice property (start, stop, step):
class ListView:
    """A read/write view onto a slice of an existing list.

    Index reads, writes, and deletes on the view are translated to indices
    of the underlying list, so mutations show through both ways (similar
    in spirit to a numpy view).
    """

    def __init__(self, aList, start=None, stop=None, step=1):
        self.data = aList                      # the underlying (shared) list
        self.slice = slice(start, stop, step)  # the window onto self.data

    # NOTE: the original paste had '#property' — a garbled '@property'
    # decorator. Without it, 'self.indices[index]' below would fail
    # (indexing a bound method instead of a range).
    @property
    def indices(self):
        """Range of underlying-list indices currently selected by the view."""
        return range(len(self.data))[self.slice]

    def offset(self, index=None):
        """Translate a view index (int or slice) to an underlying index/slice."""
        if not isinstance(index, slice):
            return self.indices[index]
        first = self.indices[index][0]
        last = self.indices[index][-1]
        # Combine the caller's step with the view's own step.
        step = (index.step or 1) * (self.slice.step or 1)
        # '+1' past 'last' for forward steps, '-1' before it for backward ones.
        return slice(first, last + 1 - 2 * (step < 0), step)

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, index):
        return self.data[self.offset(index)]

    def __repr__(self):
        return self[:].__repr__()

    def __iter__(self):
        return self[:].__iter__()

    def __setitem__(self, index, value):
        self.data[self.offset(index)] = value

    def __delitem__(self, index):
        del self.data[self.offset(index)]
usage:
a = list(range(1,21))
v = ListView(a,3,-2,2)
len(v) # 8
print(a)
# [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
print(v)
# [4, 6, 8, 10, 12, 14, 16, 18]
v[2] += 80
print(a)
# [1, 2, 3, 4, 5, 6, 7, 88, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
v.slice = slice(-4,None,-3)
print(v)
# [17, 14, 11, 88, 5, 2]

How to write a stack in python by giving a range and using for- loop?

I was trying to write a stack operation using python 3 and classes where i don't add the elements into the list manually but instead use a range of 0-10.
But i stumble into the problems either l1 is not defined or missing parameters
Please help, below is my code for pushing into a empty list.
class Stack():
def __init__(self):
self.l1 = []
def push(self,l1):
for i in range(0,10):
self.ll.append(i)
print(l1)
s = Stack(l1)
print("After stack push operation values are :", s.push())
This is a textbook stack implementation to give you a reference:
class ArrayStack:
    """LIFO Stack implementation using a Python list as underlying storage."""

    def __init__(self):
        """Create an empty stack."""
        self._data = []  # nonpublic list instance

    def __len__(self):
        """Return the number of elements in the stack."""
        return len(self._data)

    def is_empty(self):
        """Return True if the stack is empty."""
        return len(self._data) == 0

    def push(self, e):
        """Add element e to the top of the stack."""
        self._data.append(e)  # new item stored at end of list

    def top(self):
        """Return (but do not remove) the element at the top of the stack.

        Raise Empty exception if the stack is empty.
        """
        if self.is_empty():
            raise Empty('Stack is empty')
        return self._data[-1]  # the last item in the list

    def pop(self):
        """Remove and return the element from the top of the stack (i.e., LIFO).

        Raise Empty exception if the stack is empty.
        """
        if self.is_empty():
            raise Empty('Stack is empty')
        return self._data.pop()  # remove last item from list
Source: Data Structures and Algorithms in Python by Goodrich et al
I think that your example also showed a few errors about how to use classes. If you want to pass an argument to the class when it is being initialized, then that argument needs to be declared like so:
class Stack:
def __init__(self, arg1, arg2): # Declare args
pass
And then called like so:
s = Stack(arg1, arg2)
Next, you mix references to l1 and self.l1 in your method definition. I would also add that a variable named l1 is error-prone because it looks very similar to ll (and I think I found just such a typo in your post). I tried to correct your class definition and this is what I came up with:
class Stack():
    """Toy stack that pushes the values 0..9 onto an internal list."""

    def __init__(self):
        # Underlying storage for the stack contents.
        self.l1 = []

    def push(self):
        """Append 0..9 to the stack, echoing the list after every append."""
        for value in range(10):
            self.l1.append(value)
            print(self.l1)
s = Stack()
print("After stack push operation values are :", s.push())
Note that this results in:
[0]
[0, 1]
[0, 1, 2]
[0, 1, 2, 3]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4, 5]
[0, 1, 2, 3, 4, 5, 6]
[0, 1, 2, 3, 4, 5, 6, 7]
[0, 1, 2, 3, 4, 5, 6, 7, 8]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
After stack push operation values are : None

Need to create a list of sets, from a list of sets whose members may be connected

I'm dealing with polygonal data in realtime here, but the problems quite simple.
I have a huge list containing thousands of sets of polygon Indecies (Integers) and I need to simplify the list as "fast" as possible into a list of sets of "connected" Indecies.
i.e. Any sets containing integers that are also in another set become one set in the result. I've read several possible solutions involving sets & graphs etc. All i'm after are a final list of sets which had any degree of commonality.
I'm dealing with lots of data here, but for simplicities sake here's some sample data:
setA = set([0,1,2])
setB = set([6,7,8,9])
setC = set([4,5,6])
setD = set([3,4,5,0])
setE = set([10,11,12])
setF = set([11,13,14,15])
setG = set([16,17,18,19])
listOfSets = [setA,setB,setC,setD,setE,setF,setG]
In this case I'm after a list with a result like this, although ordering is irrelevant:
connectedFacesListOfSets = [ set([0,1,2,3,4,5,6,7,8,9]), set([10,11,12,13,14,15]), set([16,17,18,19])]
I've looked for similar solutions, but the one with the highest votes gave incorrect results on my large test data.
Merge lists that share common elements
It's hard to tell the performance without a sufficiently large set, but here is some basic code to start from:
# Repeatedly sweep the list, merging any set that overlaps an existing
# candidate superset, until a full pass makes no merges (fixed point).
# (The original paste used the Python-2-only 'print supersets' statement
# and had its indentation stripped; restored here, algorithm unchanged.)
while True:
    merged_one = False
    supersets = [listOfSets[0]]
    for s in listOfSets[1:]:
        in_super_set = False
        for ss in supersets:
            if s & ss:           # non-empty intersection -> same group
                ss |= s          # grow the superset in place
                merged_one = True
                in_super_set = True
                break
        if not in_super_set:
            supersets.append(s)  # starts a new candidate superset
    print(supersets)             # progress: the groups after this pass
    if not merged_one:
        break
    listOfSets = supersets
This works in 3 iterations on the provided data. And the output is as follows:
[set([0, 1, 2, 3, 4, 5]), set([4, 5, 6, 7, 8, 9]), set([10, 11, 12, 13, 14, 15]), set([16, 17, 18, 19])]
[set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), set([10, 11, 12, 13, 14, 15]), set([16, 17, 18, 19])]
[set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), set([10, 11, 12, 13, 14, 15]), set([16, 17, 18, 19])]
This is a union find problem.
Though I haven't used it, this Python code looks good to me.
http://code.activestate.com/recipes/577225-union-find/
Forgive the messed up caps (autocorrect...):
# the results container
connected = set()
sets = ...  # some list of sets
# convert the sets to frozensets (which are hashable and can be added to sets themselves)
sets = map(frozenset, sets)
for s1 in sets:
    res = copy.copy(s1)
    for s2 in sets:
        if s1 & s2:
            res = res | s2
    connected.add(res)
So.. I think I got it. It's a mess but I got it. Here's what I did:
def connected_valid(li):
    """Return True when no two distinct sets in *li* share an element."""
    for i, l in enumerate(li):
        for j, k in enumerate(li):
            if i != j and contains(l, k):
                return False
    return True

def contains(set1, set2):
    """Return True if *set1* and *set2* have at least one common member."""
    for s in set1:
        if s in set2:
            return True
    return False

def combine(set1, set2):
    """Merge *set1* into *set2* in place and return *set2*."""
    set2 |= set1
    return set2

def connect_sets(li):
    """Repeatedly merge overlapping sets until every set in *li* is disjoint.

    Rotates the list: the head set is merged into the next set when they
    overlap, otherwise it is moved to the back; stops once connected_valid
    reports no remaining overlaps. (Indentation restored from the original
    paste; logic unchanged.)
    """
    while not connected_valid(li):
        s1 = li.pop(0)
        s2 = li[0]
        if contains(s1, s2):
            li[0] = combine(s1, s2)
        else:
            li.append(s1)
    return li
Then in the main function you'd do something like this:
setA = set([0,1,2])
setB = set([6,7,8,9])
setC = set([4,5,6])
setD = set([3,4,5,0])
setE = set([10,11,12])
setF = set([11,13,14,15])
setG = set([16,17,18,19])
connected_sets = connect_sets([setA,setB,setC,setD,setE,setF,setG,])
After running it, I got the following output
print connected_sets
[set([0,1,2,3,4,5,6,7,8,9]), set([10,11,12,13,14,15]), set([16,17,18,19])]
Hope that's what you're looking for.
EDIT: Added code to randomly generate sets:
# Creates a list of 4000 sets with a random number of values ranging from 0 to 20000
sets = []
ma = 0
mi = 21000
for x in range(4000):
rand_num = sample(range(20),1)[0]
tmp_set_li = sample(range(20000), rand_num)
sets.append(set(tmp_set_li))
The last 3 lines can be condensed into one if you really wanted to.
I tried to do something different: this algorithm loops once for each set and once for each element:
# Our test sets
setA = set([0,1,2])
setB = set([6,7,8,9])
setC = set([4,5,6])
setD = set([3,4,5,0])
setE = set([10,11,12])
setF = set([11,13,14,15])
setG = set([16,17,18,19])
list_of_sets = [setA,setB,setC,setD,setE,setF,setG]
# We will use a map to store our new merged sets.
# This map will work as an reference abstraction, so it will
# map set ids to the set or to other set id.
# This map may have an indirection level greater than 1
merged_sets = {}
# We will also use a map between indexes and set ids.
index_to_id = {}
# Given a set id, returns an equivalent set id that refers directly
# to a set in the merged_sets map
def resolve_id(id):
if not isinstance(id, (int, long)):
return None
while isinstance(merged_sets[id], (int, long)):
id = merged_sets[id]
return id
# Points the informed set to the destination id
def link_id(id_source, id_destination):
point_to = merged_sets[id_source]
merged_sets[id_source] = id_destination
if isinstance(point_to, (int, long)):
link_id(point_to, id_destination)
empty_set_found = False
# For each set
for current_set_id, current_set in enumerate(list_of_sets):
if len(current_set) == 0 and empty_set_found:
continue
if len(current_set) == 0:
empty_set_found = True
# Create a set id for the set and place it on the merged sets map
merged_sets[current_set_id] = current_set
# For each index in the current set
possibly_merged_current_set = current_set
for index in current_set:
# See if the index is free, i.e., has not been assigned to any set id
if index not in index_to_id:
# If it is free, then assign the set id to the index
index_to_id[index] = current_set_id
# ... and then go to the next index
else:
# If it is not free, then we may need to merge the sets
# Find out to which set we need to merge the current one,
# ... dereferencing if necessary
id_to_merge = resolve_id(index_to_id[index])
# First we check to see if the assignment is to the current set or not
if id_to_merge == resolve_id(merged_sets[current_set_id]):
continue
# Merge the current set to the one found
print 'Merging %d with %d' % (current_set_id, id_to_merge)
merged_sets[id_to_merge] |= possibly_merged_current_set
possibly_merged_current_set = merged_sets[id_to_merge]
# Map the current set id to the set id of the merged set
link_id(current_set_id, id_to_merge)
# Return all the sets in the merged sets map (ignore the references)
print [x for x in merged_sets.itervalues() if not isinstance(x, (int, long))]
It prints:
Merging 2 with 1
Merging 3 with 0
Merging 3 with 1
Merging 5 with 4
[set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), set([10, 11, 12, 13, 14, 15]), set([16, 17, 18, 19])]

Categories