Extracting elements from frozenset - python

I've been trying to develop an apriori algorithm using this data. I was able to get the associations and the confidence for both the pairs and the triples, but am having trouble formatting the output and extracting the correct elements.
I ran the algorithm on this test data. It's just a subset of the original dataset. Currently the output looks like this:
[[frozenset({'GRO73461'}), frozenset({'ELE17451'}), 1.0],
 [frozenset({'GRO99222'}), frozenset({'ELE17451'}), 0.8125],
 [frozenset({'ELE17451'}), frozenset({'GRO99222'}), 0.5],
 [frozenset({'ELE17451'}), frozenset({'GRO73461'}), 0.38461538461538464]]
and, elsewhere in the output, rules with two-item consequents such as (first entry truncated as pasted):
[..., frozenset({'GRO73461', 'ELE17451'}), 0.8], [frozenset({'GRO73461'}), frozenset({'DAI22896', 'ELE17451'}), 0.8]
As you can see, it's kind of a mess. The list is ordered by confidence in descending order. I want to separate the frequent pairs from the frequent triples and arrange the output so that it looks like this:
OUTPUT A
FRO11987 FRO12685 0.4325
FRO11987 ELE11375 0.4225
FRO11987 GRO94758 0.4125
FRO11987 SNA80192 0.4025
FRO11987 FRO18919 0.4015
OUTPUT B
FRO11987 FRO12685 DAI95741 0.4325
FRO11987 ELE11375 GRO73461 0.4225
FRO11987 GRO94758 ELE26917 0.4125
FRO11987 SNA80192 ELE28189 0.4025
FRO11987 FRO18919 GRO68850 0.4015
Where the above shows the top 5 frequent pairs and the top 5 frequent triples, ranked by confidence.
The main things I'm having trouble with are distinguishing the frequent pairs from the frequent triples, and then extracting the items from the frozensets so they come out in the format above (a sketch of what I'm after follows the code below).
import pandas as pd
from operator import itemgetter

def loadDataSet(data=None):
    return pd.read_csv(data, sep=' ', error_bad_lines=False)

def createCandidateSet(data):
    C1 = []
    for transaction in data:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()
    return list(map(frozenset, C1))

def scanData(dataset, Ck, support):
    ssCount = {}
    for tID in dataset:
        for candidate in Ck:
            if candidate.issubset(tID):
                if not candidate in ssCount:
                    ssCount[candidate] = 1
                else:
                    ssCount[candidate] += 1
    # numItems = float(len(dataset))
    res = []
    supportData = {}
    for key in ssCount:
        # Support is a proportion or an integer: the occurrence of the itemset relative to the data set
        # currSupport = ssCount[key]/numItems
        currSupport = ssCount[key]
        if currSupport >= support:
            res.insert(0, key)
        supportData[key] = currSupport
    return res, supportData

def aprioriHelper(Lk, k):  # creates candidate itemsets
    res = []
    freqItemLen = len(Lk)
    for i in range(freqItemLen):
        for j in range(i + 1, freqItemLen):
            L1 = list(Lk[i])[:k-2]
            L2 = list(Lk[j])[:k-2]
            L1.sort()
            L2.sort()
            if L1 == L2:
                res.append(Lk[i] | Lk[j])
    return res

def apriori(dataset, minSupport=100):
    C1 = createCandidateSet(dataset)
    D = list(map(set, dataset))
    L1, supportData = scanData(D, C1, minSupport)
    L = [L1]
    k = 2
    while len(L[k-2]) > 0:
        Ck = aprioriHelper(L[k-2], k)
        Lk, supportK = scanData(D, Ck, minSupport)  # scan dataset for frequent itemsets; the itemsets grow each pass
        supportData.update(supportK)
        L.append(Lk)
        k += 1
    return L, supportData

def generateRules(L, supportData, conf=0.7):  # supportData (from scanData) holds each itemset's support
    rules = []  # tuples of (association, consequence, confidence)
    for i in range(1, len(L)):  # itemsets with >= 2 items
        for freq in L[i]:
            association = [frozenset([item]) for item in freq]
            if i > 1:
                rulesFromConsequences(freq, association, supportData, rules, conf)
            else:
                calculateConfidence(freq, association, supportData, rules, conf)
    return rules

def calculateConfidence(freq, association, supportData, rules, conf=0.7):
    filteredAssociations = []
    for consequence in association:
        # confidence(I -> J) = support(I U J) / support(I)
        confidence = supportData[freq] / supportData[freq - consequence]
        if confidence >= conf:
            # print(freq - consequence, ' ', consequence, ' ', confidence)  # print association rule and confidence
            rules.append((freq - consequence, consequence, confidence))
            filteredAssociations.append(consequence)
    return filteredAssociations

def rulesFromConsequences(freq, association, supportData, rules, conf=0.7):
    # generate more rules as frequent itemsets grow
    a_len = len(association[0])
    if len(freq) > (a_len + 1):  # try to merge into a bigger itemset that is frequent
        association_p1 = aprioriHelper(association, a_len + 1)  # bigger consequents give more candidate rules
        association_p1 = calculateConfidence(freq, association_p1, supportData, rules, conf)
        if len(association_p1) > 1:  # need at least two sets in order to merge
            rulesFromConsequences(freq, association_p1, supportData, rules, conf)  # recurse to build bigger itemsets and get more rules

def main():
    dataset = [line.split() for line in open('datatest.txt')]
    L, supportData = apriori(dataset, minSupport=8)
    rules = generateRules(L, supportData, conf=0)
    rules = sorted(rules, key=itemgetter(2), reverse=True)
    triples = []
    doubles = []
    i = 0
    while len(triples) < 5:
        if i == len(rules):
            break
        if len(rules[i][1]) == 2:
            triples.append(rules[i])
        i += 1
    j = 0
    while len(doubles) < 5:
        if j == len(rules):
            break
        if len(rules[j][1]) == 1:
            doubles.append(rules[j])
        j += 1

if __name__ == '__main__':
    main()
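For context, here is a rough sketch of the formatting step I'm trying to get to. format_rules is a hypothetical helper; it assumes the (antecedent, consequent, confidence) tuples produced by generateRules above, already sorted by confidence:

def format_rules(rules, n_items, top=5):
    # keep rules whose total item count (antecedent + consequent) equals n_items
    selected = [r for r in rules if len(r[0]) + len(r[1]) == n_items][:top]
    for antecedent, consequent, confidence in selected:
        # frozensets are unordered, so sort each side for stable output
        items = sorted(antecedent) + sorted(consequent)
        print(' '.join(items), confidence)

# called at the end of main(): pairs have 2 items total, triples have 3
# format_rules(rules, 2)
# format_rules(rules, 3)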
Any advice on the issue is appreciated. If you have any questions on the code or thought process please let me know. Apologies in advance if there are any careless mistakes.
Thank you for reading


How to perform Partial-mapped crossover in python3?

I am new to genetic algorithms and made one the other day that recreated a target string. So I tried to make one that could build a magic square. It was OK until I got to the crossover part, where I realised I couldn't just do a single-point crossover. So I attempted a Partially Mapped Crossover (PMX), and I could not and still can't get it to work. I understand how Partially Mapped Crossover works; I just can't implement it in Python. Since my code isn't complete yet, I isolated the crossover function in a separate program and changed it so the parents are fixed lists.
Can someone please correct my code, or, if it is completely wrong, show me how to perform a Partially Mapped Crossover on two lists of the integers 1 to 9?
Also, I'm sorry that my variable naming isn't great; I was just trying to get the program to work while making constant edits.
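To make the mapping concrete, here is a tiny worked example with the same parents as below (the cut points 3 and 6 are hypothetical, chosen just for illustration):

parent1 = [1, 2, 3, 4, 5, 6, 7, 8, 9]
parent2 = [5, 4, 6, 7, 2, 1, 3, 9, 8]
# middle sections: parent1[3:6] = [4, 5, 6], parent2[3:6] = [7, 2, 1]
# mapping relations: 7 <-> 4, 2 <-> 5, 1 <-> 6
# child1 starts as parent1 with parent2's middle: [1, 2, 3, 7, 2, 1, 7, 8, 9]
# the duplicated values outside the middle are repaired via the mapping:
#   1 -> 6, 2 -> 5, 7 -> 4
# giving child1 = [6, 5, 3, 7, 2, 1, 4, 8, 9], a valid permutation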
import random

parent1 = [1, 2, 3, 4, 5, 6, 7, 8, 9]
parent2 = [5, 4, 6, 7, 2, 1, 3, 9, 8]

firstCrossPoint = random.randint(0, len(parent1) - 1)  # creating parameters for a random sublist
secondCrossPoint = random.randint(firstCrossPoint + 1, len(parent1))

parent1MiddleCross = parent1[firstCrossPoint:secondCrossPoint]
parent2MiddleCross = parent2[firstCrossPoint:secondCrossPoint]

child1 = parent1[:firstCrossPoint] + parent2MiddleCross + parent1[secondCrossPoint:]
child2 = parent2[:firstCrossPoint] + parent1MiddleCross + parent2[secondCrossPoint:]

relationsWithDupes = []
for i in range(len(parent1MiddleCross)):
    relationsWithDupes.append([parent2MiddleCross[i], parent1MiddleCross[i]])

relations = []
for pair in relationsWithDupes:
    for i in range(len(relationsWithDupes)):
        if pair[0] in relationsWithDupes[i] or pair[1] in relationsWithDupes[i]:
            if pair != relationsWithDupes[i]:
                if pair[0] == relationsWithDupes[i][1]:
                    pair[0] = relationsWithDupes[i][0]
                else:
                    pair[1] = relationsWithDupes[i][1]
    if pair not in relations and pair[::-1] not in relations:
        relations.append(pair)

for i in child1[:firstCrossPoint]:
    for x in relations:
        if i == x[0]:
            i = x[1]
for i in child1[secondCrossPoint:]:
    for x in relations:
        if i == x[0]:
            i = x[1]
for i in child2[:firstCrossPoint]:
    for x in relations:
        if i == x[1]:
            i = x[0]
for i in child2[secondCrossPoint:]:
    for x in relations:
        if i == x[1]:
            i = x[0]

print(child1)
print(child2)
import numpy as np

parent1 = [1, 2, 3, 4, 5, 6, 7, 8, 9]
parent2 = [5, 4, 6, 7, 2, 1, 3, 9, 8]

firstCrossPoint = np.random.randint(0, len(parent1) - 2)
secondCrossPoint = np.random.randint(firstCrossPoint + 1, len(parent1) - 1)
print(firstCrossPoint, secondCrossPoint)

parent1MiddleCross = parent1[firstCrossPoint:secondCrossPoint]
parent2MiddleCross = parent2[firstCrossPoint:secondCrossPoint]

temp_child1 = parent1[:firstCrossPoint] + parent2MiddleCross + parent1[secondCrossPoint:]
temp_child2 = parent2[:firstCrossPoint] + parent1MiddleCross + parent2[secondCrossPoint:]

relations = []
for i in range(len(parent1MiddleCross)):
    relations.append([parent2MiddleCross[i], parent1MiddleCross[i]])
print(relations)

def recursion1(temp_child, firstCrossPoint, secondCrossPoint, parent1MiddleCross, parent2MiddleCross):
    child = np.array([0 for i in range(len(parent1))])
    for i, j in enumerate(temp_child[:firstCrossPoint]):
        c = 0
        for x in relations:
            if j == x[0]:
                child[i] = x[1]
                c = 1
                break
        if c == 0:
            child[i] = j
    j = 0
    for i in range(firstCrossPoint, secondCrossPoint):
        child[i] = parent2MiddleCross[j]
        j += 1
    for i, j in enumerate(temp_child[secondCrossPoint:]):
        c = 0
        for x in relations:
            if j == x[0]:
                child[i + secondCrossPoint] = x[1]
                c = 1
                break
        if c == 0:
            child[i + secondCrossPoint] = j
    child_unique = np.unique(child)
    if len(child) > len(child_unique):
        child = recursion1(child, firstCrossPoint, secondCrossPoint, parent1MiddleCross, parent2MiddleCross)
    return child

def recursion2(temp_child, firstCrossPoint, secondCrossPoint, parent1MiddleCross, parent2MiddleCross):
    child = np.array([0 for i in range(len(parent1))])
    for i, j in enumerate(temp_child[:firstCrossPoint]):
        c = 0
        for x in relations:
            if j == x[1]:
                child[i] = x[0]
                c = 1
                break
        if c == 0:
            child[i] = j
    j = 0
    for i in range(firstCrossPoint, secondCrossPoint):
        child[i] = parent1MiddleCross[j]
        j += 1
    for i, j in enumerate(temp_child[secondCrossPoint:]):
        c = 0
        for x in relations:
            if j == x[1]:
                child[i + secondCrossPoint] = x[0]
                c = 1
                break
        if c == 0:
            child[i + secondCrossPoint] = j
    child_unique = np.unique(child)
    if len(child) > len(child_unique):
        child = recursion2(child, firstCrossPoint, secondCrossPoint, parent1MiddleCross, parent2MiddleCross)
    return child

child1 = recursion1(temp_child1, firstCrossPoint, secondCrossPoint, parent1MiddleCross, parent2MiddleCross)
child2 = recursion2(temp_child2, firstCrossPoint, secondCrossPoint, parent1MiddleCross, parent2MiddleCross)
print(child1)
print(child2)
I just stumbled across this when looking for an implementation of PMX, and the above seems unnecessarily complicated. I've included an alternative below in case anyone comes across the same issue.
import numpy as np

def PMX_crossover(parent1, parent2, seed):
    '''
    parent1 and parent2 are 1D np.array
    '''
    rng = np.random.default_rng(seed=seed)
    cutoff_1, cutoff_2 = np.sort(rng.choice(np.arange(len(parent1) + 1), size=2, replace=False))

    def PMX_one_offspring(p1, p2):
        offspring = np.zeros(len(p1), dtype=p1.dtype)
        # Copy the mapping section (middle) from parent1
        offspring[cutoff_1:cutoff_2] = p1[cutoff_1:cutoff_2]
        # Copy the rest from parent2 (provided it's not already there)
        for i in np.concatenate([np.arange(0, cutoff_1), np.arange(cutoff_2, len(p1))]):
            candidate = p2[i]
            while candidate in p1[cutoff_1:cutoff_2]:  # allows for several successive mappings
                print(f"Candidate {candidate} not valid in position {i}")  # DEBUGONLY
                candidate = p2[np.where(p1 == candidate)[0][0]]
            offspring[i] = candidate
        return offspring

    offspring1 = PMX_one_offspring(parent1, parent2)
    offspring2 = PMX_one_offspring(parent2, parent1)
    return offspring1, offspring2
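A quick sanity check of the above, using the example parents from the question (the seed value is arbitrary):

import numpy as np

p1 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
p2 = np.array([5, 4, 6, 7, 2, 1, 3, 9, 8])

c1, c2 = PMX_crossover(p1, p2, seed=42)

# every PMX offspring must be a permutation of 1..9
assert np.array_equal(np.sort(c1), np.arange(1, 10))
assert np.array_equal(np.sort(c2), np.arange(1, 10))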

Most frequently overlapping range - Python3.x

I'm a beginner, trying to write code listing the most frequently overlapping ranges in a list of ranges.
So, the input is various ranges (#1 through #7 in the example figure; https://prntscr.com/kj80xl) and I would like to find the most common range (in the example, 3,000-4,000, which appears in 6 out of 7 ranges, i.e. 86%). Actually, I would like to find the top 5 most frequent.
Not all ranges overlap. Ranges are always positive and given as integers with step 1 (a standard range).
What I have now is only code comparing one sequence to another and returning the overlap, but after that I'm stuck.
def range_overlap(range_x, range_y):
    x = (range_x[0], range_x[-1] + 1)
    y = (range_y[0], range_y[-1] + 1)
    overlap = (max(x[0], y[0]), min(x[-1], y[-1]))
    if overlap[0] <= overlap[1]:
        return range(overlap[0], overlap[1])
    else:
        return "Out of range"
I would be very grateful for any help.
Better solution
I came up with a simpler solution (at least IMHO) so here it is:
def get_abs_min(ranges):
    return min([min(r) for r in ranges])

def get_abs_max(ranges):
    return max([max(r) for r in ranges])

def count_appearances(i, ranges):
    return sum([1 for r in ranges if i in r])

def create_histogram(ranges):
    keys = [str(i) for i in range(len(ranges) + 1)]
    histogram = dict.fromkeys(keys)
    results = []
    abs_min = get_abs_min(ranges)
    abs_max = get_abs_max(ranges)
    for i in range(abs_min, abs_max + 1):  # +1 so the largest value is counted too
        count = str(count_appearances(i, ranges))
        if histogram[count] is None:
            histogram[count] = dict(start=i, end=None)
        elif histogram[count]['end'] is None:
            histogram[count]['end'] = i
        elif histogram[count]['end'] == i - 1:
            histogram[count]['end'] = i
        else:
            start = histogram[count]['start']
            end = histogram[count]['end']
            results.append((range(start, end + 1), count))
            histogram[count]['start'] = i
            histogram[count]['end'] = None
    for count, d in histogram.items():
        if d is not None and d['start'] is not None and d['end'] is not None:
            results.append((range(d['start'], d['end'] + 1), count))
    return results

def main(ranges, top):
    appearances = create_histogram(ranges)
    # counts are stored as strings, so compare them numerically
    return sorted(appearances, key=lambda t: int(t[1]), reverse=True)[:top]
The idea here is as simple as iterating through a superposition of all the ranges and building a histogram of appearances (i.e., for each value i, the number of original ranges i appears in).
After that, just sort and slice according to the chosen size of the results.
Just call main with the ranges and the top number you want (or None if you want to see all results).
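For example (the range_list here is made-up sample data, not the figure's exact ranges):

range_list = [range(1, 4001), range(1500, 6001), range(2500, 4001),
              range(3000, 4001), range(3000, 5001), range(500, 3501),
              range(3000, 7001)]
print(main(range_list, top=5))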
OLDER EDITS BELOW
I (almost) agree with @Kasramvd's answer.
Here is my take on it:
from collections import Counter
from itertools import combinations

def range_overlap(x, y):
    common_part = sorted(set(x) & set(y))  # sort: set iteration order is not guaranteed
    if common_part:
        return range(common_part[0], common_part[-1] + 1)
    else:
        return False

def get_most_common(range_list, top_frequent):
    overlaps = Counter(range_overlap(i, j) for i, j in
                       combinations(range_list, 2))
    return [(r, i) for (r, i) in overlaps.most_common(top_frequent) if r]
You need to pass in the range_list and the number of top_frequent results you want.
EDIT
The previous answer only solved this for pairwise (2-element) combinations over the range list.
This edit is tested against your input and produces the correct answer:
from collections import Counter
from itertools import combinations

def range_overlap(*args):
    sets = [set(r) for r in args]
    common_part = sorted(set(args[0]).intersection(*sets))  # sort: set order is not guaranteed
    if common_part:
        return range(common_part[0], common_part[-1] + 1)
    else:
        return False

def get_all_possible_combinations(range_list):
    all_combos = []
    for i in range(2, len(range_list)):
        all_combos.append(combinations(range_list, i))
    all_combos = [list(combo) for combo in all_combos]
    return all_combos

def get_most_common_for_combo(combo):
    return list(filter(None, [range_overlap(*option) for option in combo]))

def get_most_common(range_list, top_frequent):
    all_overlaps = []
    combos = get_all_possible_combinations(range_list)
    for combo in combos:
        all_overlaps.extend(get_most_common_for_combo(combo))
    return [r for (r, i) in Counter(all_overlaps).most_common(top_frequent) if r]
And to get the results just run get_most_common(range_list, top_frequent)
Tested on my machine (Ubuntu 16.04 with Python 3.5.2) with your input range_list and top_frequent = 5, with the results:
[range(3000, 4000), range(2500, 4000), range(1500, 4000), range(3000, 6000), range(1, 4000)]
You can first change your function to return a valid result in both cases so that you can use it in a set of comparisons. Also, since Python's range objects are not pre-built iterables but smart objects that only hold the start, stop and step attributes of a range and produce the values on demand, you can simplify your function as well.
def range_overlap(range_x, range_y):
    rng = range(max(range_x.start, range_y.start),
                min(range_x.stop, range_y.stop) + 1)
    if rng.start < rng.stop:
        return rng.start, rng.stop
Now, if you have a set of ranges and you want to compare all the pairs you can use itertools.combinations to get all the pairs and then using range_overlap and collections.Counter you can find the number of overlapped ranges.
from collections import Counter
from itertools import combinations

overlaps = Counter(range_overlap(i, j) for i, j in
                   combinations(list_of_ranges, 2))
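From there, the most frequent overlaps can be read off the Counter. Note that range_overlap returns None when a pair has no overlap, so those entries are filtered out (top 5 shown as an example):

top5 = [(bounds, count) for bounds, count in overlaps.most_common()
        if bounds is not None][:5]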

Why data = copy.deepcopy(G) matters in Karger min cut algorithm?

Here is the code to compute the min cut, posted at karger min cut algorithm in python 2.7. Without data = copy.deepcopy(G), the algorithm's success rate at finding the min cut is much worse. Can anybody explain why? Thanks.
import random, copy

data = open("***.txt", "r")
G = {}
for line in data:
    lst = [int(s) for s in line.split()]
    G[lst[0]] = lst[1:]

def choose_random_key(G):
    v1 = random.choice(list(G.keys()))
    v2 = random.choice(list(G[v1]))
    return v1, v2

def karger(G):
    length = []
    while len(G) > 2:
        v1, v2 = choose_random_key(G)
        G[v1].extend(G[v2])
        for x in G[v2]:
            G[x].remove(v2)
            G[x].append(v1)
        while v1 in G[v1]:
            G[v1].remove(v1)
        del G[v2]
    for key in G.keys():
        length.append(len(G[key]))
    return length[0]

def operation(n):
    i = 0
    count = 10000
    while i < n:
        data = copy.deepcopy(G)
        min_cut = karger(data)
        if min_cut < count:
            count = min_cut
        i = i + 1
    return count

print(operation(100))
With data = G, karger(data) doesn't duplicate the dictionary; it operates on the original G. Because karger mutates the values in data, it automatically mutates the original dictionary too, so every later call to karger(data) starts from an already-contracted graph instead of a fresh one.
copy.deepcopy(G) gives each run its own independent copy. Remove the deepcopy and add print(data) before karger(data) and you will see different values in data on every iteration.
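A minimal sketch of the aliasing effect, with a made-up two-node graph:

import copy

G = {1: [2, 2], 2: [1, 1]}

alias = G                  # no copy: both names refer to the same dict
alias[1].append(99)
print(G[1])                # [2, 2, 99] -- G was mutated through the alias

fresh = copy.deepcopy(G)   # independent copy, nested lists included
fresh[1].remove(99)
print(G[1])                # still [2, 2, 99] -- G is untouched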

How to reduce a collection of ranges to a minimal set of ranges [duplicate]

I'm trying to remove overlapping values from a collection of ranges.
The ranges are represented by a string like this:
499-505 100-115 80-119 113-140 500-550
I want the above to be reduced to two ranges: 80-140 499-550. That covers all the values without overlap.
Currently I have the following code.
cr = "100-115 115-119 113-125 80-114 180-185 500-550 109-120 95-114 200-250".split(" ")
ar = []
br = []
for i in cr:
(left,right) = i.split("-")
ar.append(left);
br.append(right);
inc = 0
for f in br:
i = int(f)
vac = []
jnc = 0
for g in ar:
j = int(g)
if(i >= j):
vac.append(j)
del br[jnc]
jnc += jnc
print vac
inc += inc
I split each entry on - and store the range limits in ar and br. I iterate over these limits pairwise, and if i is at least as great as j, I want to delete the element. But the program doesn't work. I expect it to produce this result: 80-125 500-550 200-250 180-185
For a quick and short solution,
from operator import itemgetter
from itertools import groupby

cr = "499-505 100-115 80-119 113-140 500-550".split(" ")
fullNumbers = []
for i in cr:
    a = int(i.split("-")[0])
    b = int(i.split("-")[1])
    fullNumbers += range(a, b + 1)

# Remove duplicates and sort
fullNumbers = sorted(list(set(fullNumbers)))

# Taken from http://stackoverflow.com/questions/2154249
def convertToRanges(data):
    result = []
    for k, g in groupby(enumerate(data), lambda (i, x): i - x):  # Python 2 tuple-unpacking lambda
        group = map(itemgetter(1), g)
        result.append(str(group[0]) + "-" + str(group[-1]))
    return result

print convertToRanges(fullNumbers)
# Output: ['80-140', '499-550']
For the given set in your program, output is ['80-125', '180-185', '200-250', '500-550']
The main possible drawback of this solution: it may not scale, since it materializes every value in every range.
Let me offer another solution whose running time doesn't grow with the sum of the range sizes, only with the number of ranges (O(n log n), dominated by the sort).
def reduce(range_text):
    parts = range_text.split()
    if parts == []:
        return ''
    ranges = [tuple(map(int, part.split('-'))) for part in parts]
    ranges.sort()
    new_ranges = []
    left, right = ranges[0]
    for range_pair in ranges[1:]:
        next_left, next_right = range_pair
        if right + 1 < next_left:             # Is the next range to the right?
            new_ranges.append((left, right))  # Close the current range.
            left, right = range_pair          # Start a new range.
        else:
            right = max(right, next_right)    # Extend the current range.
    new_ranges.append((left, right))          # Close the last range.
    return ' '.join(['-'.join(map(str, pair)) for pair in new_ranges])
This function works by sorting the ranges, then looking at them in order and merging consecutive ranges that intersect.
Examples:
print(reduce('499-505 100-115 80-119 113-140 500-550'))
# => 80-140 499-550
print(reduce('100-115 115-119 113-125 80-114 180-185 500-550 109-120 95-114 200-250'))
# => 80-125 180-185 200-250 500-550

Numpy Vs nested dictionaries, which one is more efficient in terms of runtime and memory?

I am new to numpy. I have referred to the following SO question:
Why NumPy instead of Python lists?
The final comment in the above question seems to indicate that numpy is probably slower on a particular dataset.
I am working on a 1650*1650*1650 data set. These are essentially similarity values for each movie in the MovieLens data set along with the movie id.
My options are to either use a 3D numpy array or a nested dictionary. On a reduced data set of 100*100*100, the run times were not too different.
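For reference, so far I've been timing runs with a rough sketch like the one below (run_similarity_pass is a hypothetical stand-in for one full pass over the data with whichever structure is being tested):

import time

start = time.perf_counter()
run_similarity_pass()   # hypothetical: one full similarity computation
elapsed = time.perf_counter() - start
print('elapsed: {:.2f} s'.format(elapsed))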
Please find the Ipython code snippet below:
for id1 in range(1,count+1):
data1 = df[df.movie_id == id1].set_index('user_id')[cols]
sim_score = {}
for id2 in range (1, count+1):
if id1 != id2:
data2 = df[df.movie_id == id2].set_index('user_id')[cols]
sim = calculatePearsonCorrUnified(data1, data2)
else:
sim = 1
sim_matrix_panel[id1]['Sim'][id2] = sim
import pdb
from math import sqrt
import numpy as np

def calculatePearsonCorrUnified(df1, df2):
    sim_score = 0
    common_movies_or_users = []
    for temp_id in df1.index:
        if temp_id in df2.index:
            common_movies_or_users.append(temp_id)
    # pdb.set_trace()
    n = len(common_movies_or_users)
    # print('No. of common movies: ' + str(n))
    if n == 0:
        return sim_score
    # Ratings corresponding to user_1 / movie_1, present in the common list
    rating1 = df1.loc[df1.index.isin(common_movies_or_users)]['rating'].values
    # Ratings corresponding to user_2 / movie_2, present in the common list
    rating2 = df2.loc[df2.index.isin(common_movies_or_users)]['rating'].values
    sum1 = sum(rating1)
    sum2 = sum(rating2)
    # Sum up the squares
    sum1Sq = sum(np.square(rating1))
    sum2Sq = sum(np.square(rating2))
    # Sum up the products
    pSum = sum(np.multiply(rating1, rating2))
    # Calculate Pearson score
    num = pSum - (sum1 * sum2 / n)
    den = sqrt(float(sum1Sq - pow(sum1, 2) / n) * float(sum2Sq - pow(sum2, 2) / n))
    if den == 0:
        return 0
    sim_score = num / den
    return sim_score
What would be the best way to most precisely time the runtime with either of these options?
Any pointers would be greatly appreciated.
