Why does data = copy.deepcopy(G) matter in Karger's min cut algorithm? - python

Here is the code to compute the min cut, posted here: karger min cut algorithm in python 2.7. Without data = copy.deepcopy(G), the algorithm finds the correct min cut far less often. Can anybody explain why? Thanks.
import random, copy

data = open("***.txt","r")
G = {}
for line in data:
    lst = [int(s) for s in line.split()]
    G[lst[0]] = lst[1:]

def choose_random_key(G):
    v1 = random.choice(list(G.keys()))
    v2 = random.choice(list(G[v1]))
    return v1, v2

def karger(G):
    length = []
    while len(G) > 2:
        v1, v2 = choose_random_key(G)
        G[v1].extend(G[v2])
        for x in G[v2]:
            G[x].remove(v2)
            G[x].append(v1)
        while v1 in G[v1]:
            G[v1].remove(v1)
        del G[v2]
    for key in G.keys():
        length.append(len(G[key]))
    return length[0]

def operation(n):
    i = 0
    count = 10000
    while i < n:
        data = copy.deepcopy(G)
        min_cut = karger(data)
        if min_cut < count:
            count = min_cut
        i = i + 1
    return count

print(operation(100))

With data = G, karger(data) does not duplicate the dictionary, so karger(data) operates on the original G. Because karger mutates the values in data, it automatically mutates the original dictionary as well, and every later call to karger(data) starts from an already-contracted graph instead of a fresh copy of G.
Remove the deepcopy and add print(data) before karger(data), and you will see different values in data on each iteration.
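The effect is easy to see on a toy graph (the dictionary below is made up for illustration). Plain assignment only aliases the same dict, and even copy.copy() still shares the inner adjacency lists; only deepcopy leaves the original G untouched between runs of karger:
import copy

G = {1: [2, 3], 2: [1, 3], 3: [1, 2]}   # toy graph, not the original data

alias = G                  # no copy: both names point to the same dict
shallow = copy.copy(G)     # new dict, but the value lists are still shared
deep = copy.deepcopy(G)    # fully independent copy

alias[1].append(99)
print(G[1])        # [2, 3, 99] -> mutated through the alias
print(shallow[1])  # [2, 3, 99] -> the shallow copy shares the inner lists
print(deep[1])     # [2, 3]     -> the deep copy is unaffected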


How to check if 2 different values are from the same list and obtain the list name

** I modified the entire question **
I have the example lists specified below, and I want to find out whether two values come from the same list and, if so, which list they both come from.
list1 = ['a','b','c','d','e']
list2 = ['f','g','h','i','j']
c = 'b'
d = 'e'
I used a for loop to check whether the values exist in the lists, but I'm not sure how to find out which list a value actually comes from.
for x,y in zip(list1,list2):
    if c and d in x or y:
        print(True)
Please advise if there is any work around.
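One possible approach (a sketch, not from the original thread; the name named_lists is made up for illustration) is to keep the candidate lists in a dict and test membership list by list, instead of zipping element pairs:
list1 = ['a', 'b', 'c', 'd', 'e']
list2 = ['f', 'g', 'h', 'i', 'j']
c = 'b'
d = 'e'

named_lists = {'list1': list1, 'list2': list2}   # illustrative grouping of the lists

for name, lst in named_lists.items():
    if c in lst and d in lst:
        print(True, name)   # prints: True list1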
First you might want to inspect the distribution of values and sizes, to see where you can improve the result with the least effort, like this:
df_inspect = df.copy()
df_inspect["size.value"] = df_inspect["size.value"].map(lambda x: ''.join(y.upper() for y in x if y.isalpha()))
df_inspect = df_inspect.groupby(["size.value"]).size().sort_values(ascending=False)
Then create a solution for the most frequently occurring size category, here "Wide":
long = "adasda, 9.5 W US"
short = "9.5 Wide"

def get_intersection(s1, s2):
    res = ''
    l_s1 = len(s1)
    for i in range(l_s1):
        for j in range(i + 1, l_s1):
            t = s1[i:j]
            if t in s2 and len(t) > len(res):
                res = t
    return res

print(len(get_intersection(long, short)) / len(short) >= 0.6)
Then apply the solution to the dataframe
df["defective_attributes"] = df.apply(lambda x: len(get_intersection(x["item_name.value"], x["size.value"])) / len(x["size.value"]) >= 0.6)
Basically, get_intersection searches for the longest common substring of the item name and the size. The rule then takes the length of that intersection and marks the row as not defective if at least 60% of the size.value also appears in the item_name.value.

Extracting elements from frozenset

I've been trying to develop an apriori algorithm using this data. I was able to get the associations and the confidence for both the pairs and the triples but am having trouble formatting the output and extracting the correct elements.
I ran the algorithm on this test data. It's just a subset of the original dataset. Currently the output looks like this:
[[frozenset({'GRO73461'}), frozenset({'ELE17451'}), 1.0],
[frozenset({'GRO99222'}), frozenset({'ELE17451'}), 0.8125], [frozenset({'ELE17451'}), frozenset({'GRO99222'}), 0.5], [frozenset({'ELE17451'}), frozenset({'GRO73461'}), 0.38461538461538464]]
frozenset({'GRO73461', 'ELE17451'}), 0.8], [frozenset({'GRO73461'}), frozenset({'DAI22896', 'ELE17451'}), 0.8]
As you can see, it's kind of a mess. The list is ordered by confidence in descending order. I want to separate the frequent pairs from the frequent triples and arrange the output so that it looks like this:
OUTPUT A
FRO11987 FRO12685 0.4325
FRO11987 ELE11375 0.4225
FRO11987 GRO94758 0.4125
FRO11987 SNA80192 0.4025
FRO11987 FRO18919 0.4015
OUTPUT B
FRO11987 FRO12685 DAI95741 0.4325
FRO11987 ELE11375 GRO73461 0.4225
FRO11987 GRO94758 ELE26917 0.4125
FRO11987 SNA80192 ELE28189 0.4025
FRO11987 FRO18919 GRO68850 0.4015
Where the above is the top 5 frequent pairs, and top 5 frequent triples based on the confidence.
The main area I'm having trouble with is discerning between the frequent pairs and triples and then extracting the items from the frozensets such that they are in the above format.
from numpy import *
import pandas as pd
from operator import itemgetter

def loadDataSet(data=None):
    return pd.read_csv(data, sep = ' ', error_bad_lines=False)

def createCandidateSet(data):
    C1 = []
    for transaction in data:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()
    return list(map(frozenset, C1))

def scanData(dataset, Ck, support):
    ssCount = {}
    for tID in dataset:
        for candidate in Ck:
            if candidate.issubset(tID):
                if not candidate in ssCount:
                    ssCount[candidate] = 1
                else:
                    ssCount[candidate] += 1
    # numItems = float(len(dataset))
    res = []
    supportData = {}
    for key in ssCount:
        # Support is a proportion or an integer; the occurrence of the item in relation to the data set
        # currSupport = ssCount[key]/numItems
        currSupport = ssCount[key]
        if currSupport >= support:
            res.insert(0, key)
        supportData[key] = currSupport
    return res, supportData

def aprioriHelper(Lk, k): #creates candidate itemsets
    res = []
    freqItemLen = len(Lk)
    for i in range(freqItemLen):
        for j in range(i+1, freqItemLen):
            L1 = list(Lk[i])[:k-2]
            L2 = list(Lk[j])[:k-2]
            L1.sort()
            L2.sort()
            if L1 == L2:
                res.append(Lk[i] | Lk[j])
    return res

def apriori(dataset, minSupport=100):
    C1 = createCandidateSet(dataset)
    D = list(map(set, dataset))
    L1, supportData = scanData(D, C1, minSupport)
    L = [L1]
    k = 2
    while (len(L[k-2]) > 0):
        Ck = aprioriHelper(L[k-2], k)
        Lk, supportK = scanData(D, Ck, minSupport) #scan dataset to get frequent itemsets, now the itemsets are bigger
        supportData.update(supportK)
        L.append(Lk)
        k += 1
    return L, supportData

def generateRules(L, supportData, conf = 0.7): #support data is data on each itemset's support, comes from scanData
    rules = [] #takes tuples of associations, consequences, and confidence
    for i in range(1, len(L)): #get itemsets with number of items >= 2
        for freq in L[i]:
            association = [frozenset([item]) for item in freq]
            if i > 1:
                rulesFromConsequences(freq, association, supportData, rules, conf)
            else:
                calculateConfidence(freq, association, supportData, rules, conf)
    return rules

def calculateConfidence(freq, association, supportData, rules, conf=0.7):
    filteredAssociations = []
    for consequence in association:
        #confidence(I -> J) = Support(I U J)/Support(I)
        confidence = supportData[freq]/supportData[freq - consequence] #calculate confidence
        if confidence >= conf:
            # print(freq-consequence, ' ', consequence, ' ', confidence) #print out association rule and confidence
            rules.append((freq-consequence, consequence, confidence))
            filteredAssociations.append(consequence)
    return filteredAssociations

def rulesFromConsequences(freq, association, supportData, rules, conf=0.7):
    #generate more rules when frequent itemsets become larger
    a_len = len(association[0])
    if (len(freq) > (a_len+1)): #try to merge into a bigger itemset that is frequent
        association_p1 = aprioriHelper(association, a_len+1) #create consequents one item larger and get more candidates for association rules
        association_p1 = calculateConfidence(freq, association_p1, supportData, rules, conf)
        if len(association_p1) > 1: #need to have at least two sets in order to merge
            rulesFromConsequences(freq, association_p1, supportData, rules, conf) #recursively call to build bigger itemsets and get more rules

def main():
    dataset = [line.split() for line in open('datatest.txt')]
    L, supportData = apriori(dataset, minSupport=8)
    rules = generateRules(L, supportData, conf=0)
    rules = sorted(rules, key = itemgetter(2), reverse=True)
    triples = []
    doubles = []
    i = 0
    while len(triples) < 5:
        if i == len(rules):
            break
        if len(rules[i][1]) == 2:
            triples.append(rules[i])
        i += 1
    j = 0
    while len(doubles) < 5:
        if j == len(rules):
            break
        if len(rules[j][1]) == 1:
            doubles.append(rules[j])
        j += 1

if __name__ == '__main__':
    main()
Any advice on the issue is appreciated. If you have any questions on the code or thought process please let me know. Apologies in advance if there are any careless mistakes.
Thank you for reading
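One way to finish the formatting step (a sketch, not from the thread; format_rules is a made-up helper) is to split the sorted rules by the total number of items in antecedent plus consequent - 2 for pairs, 3 for triples - and join the frozenset members into plain strings:
def format_rules(rules, total_items, top=5):
    # rules is assumed to be the list of (antecedent, consequent, confidence)
    # tuples produced by generateRules, already sorted by confidence descending
    picked = [r for r in rules if len(r[0]) + len(r[1]) == total_items]
    lines = []
    for antecedent, consequent, confidence in picked[:top]:
        items = sorted(antecedent) + sorted(consequent)
        lines.append(' '.join(items) + ' ' + str(round(confidence, 4)))
    return lines

# print('\n'.join(format_rules(rules, 2)))  # OUTPUT A: top 5 frequent pairs
# print('\n'.join(format_rules(rules, 3)))  # OUTPUT B: top 5 frequent triples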

Faster implementation for shuffling buckets in a dataset? Python

Is there a faster way to implement this? Each row has about 1024 buckets, and it's not as fast as I wish it was.
I'd like to generate quite a lot of data, but it takes a few hours to complete as it is, and it's quite the bottleneck at this point. Any suggestions or ideas for how to optimize it would be greatly appreciated!
Edit*
Apologies for not having a minimal working example before; one is now posted. If the optimization can be made for Python 2.7 it would be very much appreciated.
import math
import numpy as np
import copy
import random

def number_to_move(n):
    l = math.exp(-n)
    k = 0
    p = 1.0
    while p > l:
        k += 1
        p *= random.random()
    return k - n

def createShuffledDataset(input_data, shuffle_indexes_dict, shuffle_quantity):
    shuffled = []
    for key in shuffle_indexes_dict:
        for values in shuffle_indexes_dict[key]:
            temp_holder = copy.copy(input_data[values[0] - 40: values[1]]) #may need to increase 100 padding
            for line in temp_holder:
                buckets = range(1, 1022)
                for bucket in buckets:
                    bucket_value = line[bucket]
                    proposed_number = number_to_move(bucket_value)
                    moving_amount = abs(proposed_number) if bucket_value - abs(proposed_number) >= 0 else bucket_value
                    line[bucket] -= moving_amount
                    if proposed_number > 0:
                        line[bucket + 1] += moving_amount
                    else:
                        line[bucket - 1] += moving_amount
            shuffled.extend(temp_holder)
    return np.array(shuffled)

example_data = np.ones((100,1024))
shuffle_indexes = {"Ranges to Shuffle 1" : [[10,50], [53, 72]]}
shuffle_quantity = 1150
shuffled_data = createShuffledDataset(example_data, shuffle_indexes, shuffle_quantity)
Some minor things you might try:
Merge key and values into a single call:
for key, values in shuffle_indexes_dict.iteritems():
Use xrange rather than range.
The bucket values seem to be integers - try caching them:
_cache = {}

def number_to_move(n):
    v = _cache.get(n, None)
    if v is not None: return v
    l = math.exp(-n)
    k = 0
    p = 1.0
    while p > l:
        k += 1
        p *= random.random()
    v = k - n
    _cache[n] = v
    return v
If the shuffle_indexes_dict ranges are exclusive, then you can probably avoid copying values from input_data.
Otherwise I'd say you're out of luck.
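One further speculative idea (not from the thread): if exact bucket-by-bucket sequencing is not essential, the inner loops could be vectorized with NumPy. The sketch below assumes number_to_move is meant as a roughly Poisson-style draw offset from n; it is not bit-identical to the sequential loop, where a move into bucket + 1 can influence the very next iteration.
import numpy as np

def shuffle_line_vectorized(line):
    # line is one row of the dataset (length 1024); indices 1..1021 are shuffled
    buckets = line[1:1022]
    proposed = np.random.poisson(buckets) - buckets    # signed proposal per bucket
    moving = np.minimum(np.abs(proposed), buckets)     # never move more than is present
    line[1:1022] = buckets - moving
    idx = np.arange(1, 1022)
    right = proposed > 0
    np.add.at(line, idx[right] + 1, moving[right])     # unbuffered adds: targets can repeat
    np.add.at(line, idx[~right] - 1, moving[~right])
    return line
Looping over the rows of temp_holder with this function would replace the two inner loops; the results differ in detail from the original because each bucket's draw is made independently.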

Python dictionary manipulating items (getting min and max)

Edit: I removed my explanation part since it was wrong, but I still could not manage to convert it.
I was studying lists and dictionaries in Python and I came across this code.
x = min(minValue, key=lambda b: min([a( \
myFunction(5,b),c) for c in something]))
What is the logical equivalent of this? It seems simple, but I do not get the same thing when I try to write it with different code. How can I write this differently, without the whole key and lambda thing?
It seems my explanation was wrong. Here is the updated code I tried.
for b in minValue:
    for c in something:
        minimum = min(myFunction(5,b),c)
result = min(minimum)
return result
Note: by "logical equivalent" I do not mean that the code has to calculate it exactly the way the code I gave does, but it should produce the same output.
I'm not sure, but it can be something like this:
def example(minValue):
    data = []
    for b in minValue:
        values = []
        for c in something:
            values.append( a(myFunction(5,b),c) )
        result = min(values)
        # keep `result` and `b` which gives this `result`
        data.append( [result, b] )
    # find minimal `result` and `b` which gives this `result`
    x = min(data) # x = [result, b]
    # return `b`
    return x[1]

#---

x = example(minValue)
EDIT: there can be a problem with min(data), because min will compare both result and b, while the original version compares only result. It may need a version without min(), but with:
if result < min_result:
    min_result = result
    min_b = b
EDIT:
def key(val):
    min_c = something[0]
    min_result = a(myFunction(5,val),min_c)
    for c in something[1:]:
        result = a(myFunction(5,val),c)
        if result < min_result:
            min_result = result
            #min_c = c
    return min_result

def example(minValue):
    min_b = minValue[0]
    min_result = key(min_b)
    for b in minValue[1:]:
        result = key(b)
        if result < min_result:
            min_result = result
            min_b = b
    return min_b

#---

x = example(minValue)
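Here is a runnable comparison of the one-liner and an expanded loop version. The definitions of a, myFunction, something and minValue are placeholders invented purely for this check; they are not from the question:
def myFunction(n, b):
    return n * b          # placeholder

def a(u, v):
    return u + v          # placeholder

something = [3, 7, 1]     # placeholder data
minValue = [4, 2, 9]      # placeholder data

# original one-liner
x1 = min(minValue, key=lambda b: min([a(myFunction(5,b),c) for c in something]))

# expanded loop with the same logic
def key_loop(b):
    return min(a(myFunction(5,b),c) for c in something)

best = minValue[0]
for b in minValue[1:]:
    if key_loop(b) < key_loop(best):
        best = b

print(x1, best)   # both print 2: the same element of minValue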

How to reduce a collection of ranges to a minimal set of ranges [duplicate]

This question already has answers here:
Union of multiple ranges
(5 answers)
Closed 7 years ago.
I'm trying to remove overlapping values from a collection of ranges.
The ranges are represented by a string like this:
499-505 100-115 80-119 113-140 500-550
I want the above to be reduced to two ranges: 80-140 499-550. That covers all the values without overlap.
Currently I have the following code.
cr = "100-115 115-119 113-125 80-114 180-185 500-550 109-120 95-114 200-250".split(" ")
ar = []
br = []
for i in cr:
(left,right) = i.split("-")
ar.append(left);
br.append(right);
inc = 0
for f in br:
i = int(f)
vac = []
jnc = 0
for g in ar:
j = int(g)
if(i >= j):
vac.append(j)
del br[jnc]
jnc += jnc
print vac
inc += inc
I split each range on - and store the lower limits in ar and the upper limits in br. I iterate over these limits pairwise, and if i is at least as great as j, I want to delete the element. But the program doesn't work. I expect it to produce this result: 80-125 500-550 200-250 180-185
For a quick and short solution,
from operator import itemgetter
from itertools import groupby

cr = "499-505 100-115 80-119 113-140 500-550".split(" ")
fullNumbers = []
for i in cr:
    a = int(i.split("-")[0])
    b = int(i.split("-")[1])
    fullNumbers += range(a,b+1)

# Remove duplicates and sort it
fullNumbers = sorted(list(set(fullNumbers)))

# Taken from http://stackoverflow.com/questions/2154249
def convertToRanges(data):
    result = []
    for k, g in groupby(enumerate(data), lambda (i,x): i-x):
        group = map(itemgetter(1), g)
        result.append(str(group[0])+"-"+str(group[-1]))
    return result

print convertToRanges(fullNumbers)
#Output: ['80-140', '499-550']
For the set given in your program, the output is ['80-125', '180-185', '200-250', '500-550'].
The main possible drawback of this solution: it may not scale well, since it expands every range into the full list of numbers.
Let me offer another solution that doesn't take time proportional to the sum of the range sizes. Its running time grows with the number of ranges (dominated by the sort) rather than with how wide they are.
def reduce(range_text):
    parts = range_text.split()
    if parts == []:
        return ''
    ranges = [ tuple(map(int, part.split('-'))) for part in parts ]
    ranges.sort()
    new_ranges = []
    left, right = ranges[0]
    for range in ranges[1:]:
        next_left, next_right = range
        if right + 1 < next_left:                # Is the next range to the right?
            new_ranges.append((left, right))     # Close the current range.
            left, right = range                  # Start a new range.
        else:
            right = max(right, next_right)       # Extend the current range.
    new_ranges.append((left, right))             # Close the last range.
    return ' '.join([ '-'.join(map(str, range)) for range in new_ranges ])
This function works by sorting the ranges, then looking at them in order and merging consecutive ranges that intersect.
Examples:
print(reduce('499-505 100-115 80-119 113-140 500-550'))
# => 80-140 499-550
print(reduce('100-115 115-119 113-125 80-114 180-185 500-550 109-120 95-114 200-250'))
# => 80-125 180-185 200-250 500-550
