Computing Adjusted Rand Index

Computing Adjusted Rand Index - python

I am trying to compute the ARI between two sets of clusters, using this code:
#computes ARI for this type of clustering
def ARI(table,n):
index = 0
sum_a = 0
sum_b = 0
for i in range(len(table)-1):
for j in range(len(table)-1):
sum_a += choose(table[i][len(table)-1],2)
sum_b += choose(table[len(table)-1][j],2)
index += choose(table[i][j],2)
expected_index = (sum_a*sum_b)
expected_index = expected_index/choose(n,2)
max_index = (sum_a+sum_b)
max_index = max_index/2
return (index - expected_index)/(max_index-expected_index)
#choose to compute rand
def choose(n,r):
f = math.factorial
if (n-r)>=0:
return f(n) // f(r) // f(n-r)
else:
return 0
assuming I have created the contingency table correctly, I still get values outside the range of (-1,1).
For instance:
Contingency table:
[1, 0, 0, 0, 0, 0, 0, 1]
[1, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 1, 0, 0, 0, 1]
[0, 1, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 1, 1, 2]
[1, 0, 1, 0, 1, 0, 0, 3]
[0, 0, 0, 0, 0, 0, 1, 1]
[3, 1, 1, 1, 1, 1, 2, 0]
yields an ARI of -1.6470588235294115 when I run my code.
Is there a bug in this code?
Also Here is how I am computing the contingency matrix:
table = [[0 for _ in range(len(subjects)+1)]for _ in range(len(subjects)+1)]
#comparing all clusters
for i in range(len(clusters)):
index_count = 0
for subject, orgininsts in orig_clusters.items():
madeinsts = clusters[i].instances
intersect_count = 0
#comparing all instances between the 2 clusters
for orginst in orgininsts:
for madeinst in makeinsts:
if orginst == madeinst:
intersect_count += 1
table[index_count][i] = intersect_count
index_count += 1
for i in range(len(table)-1):
a = 0
b = 0
for j in range(len(table)-1):
a += table[i][j]
b += table[j][i]
table[i][len(table)-1] = a
table[len(table)-1][i] = b
clusters is a list of cluster objects that have attribute instances, which is a list of instances contained in that cluster. orig_clusters is a dictonary with keys representing cluster labels, and values are a list of instances contained in that cluster. Is there a bug in this code?

You make some mistakes calculating the ARI in your code -- you calculate a and b too often because you loop over your table twice instead of just once.
Also, you pass n as a parameter, but apparently it is set to 10 (that is how I get your result). It would be easier to just pass the table and then calculate n from there. I fixed your code a bit:
def ARI(table):
index = 0
sum_a = 0
sum_b = 0
n = sum([sum(subrow) for subrow in table]) #all items summed
for i in range(len(table)):
b_row = 0#this is to hold the col sums
for j in range(len(table)):
index += choose(table[i][j], 2)
b_row += table[j][i]
#outside of j-loop b.c. we want to use a=rowsums, b=colsums
sum_a += choose(sum(table[i]), 2)
sum_b += choose(b_row, 2)
expected_index = (sum_a*sum_b)
expected_index = expected_index/choose(n,2)
max_index = (sum_a+sum_b)
max_index = max_index/2
return (index - expected_index)/(max_index-expected_index)
or if you pass on the table with row- and column sums:
def ARI(table):
index = 0
sum_a = 0
sum_b = 0
n = sum(table[len(table)-1]) + sum([table[i][len(table)-1] for i in range(len(table)-1)])
for i in range(len(table)-1):
sum_a += choose(table[i][len(table)-1],2)
sum_b += choose(table[len(table)-1][i],2)
for j in range(len(table)-1):
index += choose(table[i][j],2)
expected_index = (sum_a*sum_b)
expected_index = expected_index/choose(n,2)
max_index = (sum_a+sum_b)
max_index = max_index/2
return (index - expected_index)/(max_index-expected_index)
then
def choose(n,r):
f = math.factorial
if (n-r)>=0:
return f(n) // f(r) // f(n-r)
else:
return 0
table = [[1, 0, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 1, 0, 0, 0, 1],
[0, 1, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 1, 1, 2],
[1, 0, 1, 0, 1, 0, 0, 3],
[0, 0, 0, 0, 0, 0, 1, 1],
[3, 1, 1, 1, 1, 1, 2, 0]]
ARI(table)
ARI(table)
Out[56]: -0.0604008667388949
The correct result!

Related

Python variable and data structure scope

if matrix[row][col] == 0:
continue # continue if it is a water cell
matrix[row][col] = 0 # mark the cell visited by making it a water cell
This is a snippet from the program below. My question is how you are able to update countIslandsBFS()'s matrix from a function call to visitIslandBFS()
In general, I had thought only global variables could be changed with a function call. The visitIslandBFS() function they wrote returns nothing - so how is it possible to update the matrix passed in by countIslandBFS()?
from collections import deque
def countIslandsBFS(matrix):
rows = len(matrix)
cols = len(matrix[0])
totalIslands = 0
for i in range(rows):
for j in range(cols):
if (matrix[i][j] == 1): # only if the cell is a land
# we have found an island
totalIslands += 1
visitIslandBFS(matrix, i, j)
return totalIslands
def visitIslandBFS(matrix, x, y):
neighbors = deque([(x, y)])
while neighbors:
row, col = neighbors.popleft()
if row < 0 or row >= len(matrix) or col < 0 or col >= len(matrix[0]):
continue # continue, if it is not a valid cell
if matrix[row][col] == 0:
continue # continue if it is a water cell
matrix[row][col] = 0 # mark the cell visited by making it a water cell
# insert all neighboring cells to the queue for BFS
neighbors.extend([(row + 1, col)]) # lower cell
neighbors.extend([(row - 1, col)]) # upper cell
neighbors.extend([(row, col + 1)]) # right cell
neighbors.extend([(row, col - 1)]) # left cell
def main():
print(countIslandsBFS([[0, 1, 1, 1, 0], [0, 0, 0, 1, 1], [
0, 1, 1, 1, 0], [0, 1, 1, 0, 0], [0, 0, 0, 0, 0]]))
print(countIslandsBFS([[1, 1, 1, 0, 0], [0, 1, 0, 0, 1], [
0, 0, 1, 1, 0], [0, 0, 1, 0, 0], [0, 0, 1, 0, 0]]))
main()

how to count negative and positive numbers in each set from a list?

I have list which contains 30 numbers
list = [-21,-22,-33,-55,-454,65,48,-516,614,6,2,-64,-64,-87,6,45,87,15,11,03,-34,-6,-68,-959,-653,24,658,68,9,-2181]
Now first I want to count the number of continuous 3 positive or negative numbers. For that I am using this program:
list = [-21,-22,-33,-55,-454,65,48,-516,614,6,2,-64,-64,-87,6,45,87,15,11,03,-34,-6,-68,-959,-653,24,658,68,9,-2181]
counts = []
count = 0
daysCounter = 1
plus_counter = 0
minus_counter = 0
row_counter = 0
answer_counter = 1
for each in list: # for the "dev column"
if each > 0:
minus_counter = 0
plus_counter += 1
if plus_counter == 3:
count = answer_counter
row_counter = answer_counter
counts.append(count)
plus_counter = 0
answer_counter += 1
else:
counts.append(0)
elif each < 0:
plus_counter = 0
minus_counter += 1
if minus_counter == 3:
count = answer_counter
row_counter = answer_counter
counts.append(count)
minus_counter = 0
answer_counter += 1
else:
counts.append(0)
row_counter += 1
print counts
output:
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 5, 0, 0, 6, 0, 0, 0, 0, 7, 0, 0]
This is correct but I want to reset the counter at %10 == 0position. Basically, if the list contains 30 elements then I want to count between 0 to 10th element then 11th to 20th then 21th to 30th element.
desired output:
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0]

Basically, you will have to reset to default values for each period:
# other code
for i, each in enumerate(list): # for the "dev column"
if i % 10 == 0:
count = 0
daysCounter = 1
plus_counter = 0
minus_counter = 0
row_counter = 0
answer_counter = 1
# remaining code
Note: You should not name your list as list, because you override the built-in.

Your question didnt make sense to me, you say you want to reset on every module 10 index, yet you then go on to say in a list of 30 elements you want
0 to 10 (this is 11 elements)
11 to 20 (this is 10 elements)
21 to 30 (this is 10 elements)
The total of this would be 31 elements but you said your list has 30 elements. which would be indexed from 0 to 29. So i have made an assumption here that you do mean every 10 elelemts I.E 0 to 9, 10 to 19, 20 to 29. This makes my output out of line with yours but again i can only make an assumption here that you miscounted with your indexes.
nums = [
-21, -22, -33, -55, -454, 65, 48, -516, 614, 6,
2, -64, -64, -87, 6, 45, 87, 15, 11, 3,
-34, -6, -68, -959, -653, 24, 658, 68, 9, -2181
]
nths = 10
sequential_limit = 3
sequential_count = sequential_finds = 0
indexer = sequential_limit - 1
sequential_list = [0 for _ in range(indexer)]
skip = 0
for index, num in enumerate(nums[indexer:], indexer):
result = 0
if index % nths == 0:
sequential_count = sequential_finds = 0
skip = indexer
if skip:
skip -= 1
else:
negative = sum(1 for next_num in nums[index - indexer:index + 1] if next_num < 0)
positive = sum(1 for next_num in nums[index - indexer:index + 1] if next_num >= 0)
if sequential_limit in (positive, negative):
sequential_finds += 1
sequential_count = 0
skip = indexer
result = sequential_finds
sequential_list.append(result)
print(sequential_list)
OUTPUT
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0]

I think the above desired output you post is not correct
Add this code in the end, This code will reset between 0th to 9th, 10th to 19,20th to 29 elements of a list.
list_len = len(counts)
total_multiple = int(list_len/10)
for i in range(1, total_multiple):
count = 0
for j in range(10*i, 10*i+10):
if(counts[j] > 0):
counts[j] = count
count += 1
print(counts)
It will modify your list and prints
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]

Fibonacci Adder Machine - Calculating Digits in Fibonacci Sequence

I'm working on a program that will calculate fibonacci numbers with certain digit limitations (i.e. first fibonacci number with 100 digits). The code I have below overall is running, but I am hitting a logical error that has me stumped.
The goal of the code is to calculate Fibonacci numbers in a fashion similar to binary addition. Using an array, each element is to hold a digit from 0 - 9, so each array index represents a 10's place.
It starts working and looping through fine, but it gets off between 13 and 21 because of the way the loop is handled. It adds the number sin the 10's place together and then saves a 31 number.
Is there a way to break out or stop it from adding those together that I'm not seeing?
num1 = [0]*100
num2 = [0]*100
num2[len(num2)-1] = 1
carry = 0
flag = True
while (flag):
#Main for loop to work through the array
for count in range (1, len(num2)):
carry = num2[len(num2) - count] + num1[len(num1) - count]
if carry > 9:
num2[len(num2)- (count + 1)] = num2[len(num2)- (count + 1)] + 1
carry = carry % 10
num1[len(num1) - count] = num2[len(num2) - count]
num2[len(num2) - count] = carry
else:
num1[len(num1) - count] = num2[len(num2) - count]
num2[len(num2) - count] = carry
print(num2)
if num2[0] != 0:
flag = False
Each time it passes the main while loop I'm hoping to see
[0,0,...,0,1]
[0,0,...,0,2]
[0,0,...,0,3]
[0,0,...,0,5]
[0,0,...,0,8]
[0,0,...,1,3]
[0,0,...,2,1]
...
but after it hits the [...,2,1] loop it moves on to [...,3,1]

Here's a bit of a cleaner version of what I believe you're trying to get at.
#Init of Fib variables
a = 0
b = 1
num = 10 #Change this to the number of fib calculation loops.
x = 0
output_arr_len = 100 #Change this to 10 for testing, as it's easier to see.
while x < num:
#Calculate Fib Sequence
c = a + b
a = b
b = c
x += 1
#Output Array
print([0] * (output_arr_len - len(str(c))) + [int(i) for i in str(c)])
Below is the output of the first 20 loops with output_arr_len set to 10.
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 2]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 3]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 5]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 8]
[0, 0, 0, 0, 0, 0, 0, 0, 1, 3]
[0, 0, 0, 0, 0, 0, 0, 0, 2, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 3, 4]
[0, 0, 0, 0, 0, 0, 0, 0, 5, 5]
[0, 0, 0, 0, 0, 0, 0, 0, 8, 9]
[0, 0, 0, 0, 0, 0, 0, 1, 4, 4]
[0, 0, 0, 0, 0, 0, 0, 2, 3, 3]
[0, 0, 0, 0, 0, 0, 0, 3, 7, 7]
[0, 0, 0, 0, 0, 0, 0, 6, 1, 0]
[0, 0, 0, 0, 0, 0, 0, 9, 8, 7]
[0, 0, 0, 0, 0, 0, 1, 5, 9, 7]
[0, 0, 0, 0, 0, 0, 2, 5, 8, 4]
[0, 0, 0, 0, 0, 0, 4, 1, 8, 1]
[0, 0, 0, 0, 0, 0, 6, 7, 6, 5]
[0, 0, 0, 0, 0, 1, 0, 9, 4, 6]

It was even more tricky than in my comments, but this version works correctly:
num1 = [0]*10
num2 = [0]*10
num2[len(num2)-1] = 1
sum = 0
carry = 0
flag = True
while (flag):
#Main for loop to work through the array
for count in range (1, len(num2)):
sum = num2[len(num2) - count] + num1[len(num1) - count] + carry
num1[len(num1) - count] = num2[len(num2) - count]
if sum > 9:
sum = sum % 10
carry = 1
else:
carry = 0
num2[len(num2) - count] = sum
if carry == 1:
num2[0] = num2[0] + 1
print(num2)
if num2[0] != 0:
flag = False
You have also to copy to new1 before applying the carry, even when doing it at the next higher bit...

Here's the corrections to your code. Note that Python has infinite precision integers, so I've added a Fibonacci generator to check the answer.
num1 = [0]*100
num2 = [0]*100
num2[len(num2)-1] = 1
flag = True
# Fibonacci generator for verification of answer
def fib():
a,b = 0,1
while True:
a,b = b,a+b
yield a
# Instance of the generator
f = fib()
# convert a list of single-digit integers to a real integer for display
def value(L):
assert all(n < 10 for n in L) # bug checking for invalid list values.
return int(''.join([str(i) for i in L]))
while (flag):
#Main for loop to work through the array
# Start with zero carry for first digit
carry = 0
for count in range (1,len(num2)+1): # originally off-by-1.
# compute the sum plus the carry of previous sum
temp = num2[len(num2) - count] + num1[len(num1) - count] + carry
# shift num2 digit to num1 digit
num1[len(num1) - count] = num2[len(num2) - count]
# new num2 digit is the one's place of temp sum.
num2[len(num2) - count] = temp % 10
# remember the carry (10's place) for next sum.
carry = temp // 10
# Check for bugs...compare the answer with the next Fibonacci number
assert value(num1) == next(f)
if num1[0] != 0:
flag = False
print(value(num1))
Note you can make the for loop a little simpler by remembering that negative offsets access an array from the end (num2[-1] is the last item in the array) and range can count backwards:
for count in range(-1,-len(num2)-1,-1):
temp = num2[count] + num1[count] + carry
num1[count] = num2[count]
num2[count] = temp % 10
carry = temp // 10 # remember carry for next digit

Data detector for number 1 and 0

I have a data set containing with only 0 and 1. I want to have a detector to find where 1 starts and where 1 ends, and then return something related to their index to a different list each. So I've written some codes as below:
n= [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
def detector (data):
x = 0
start = []
end = []
for index, i in enumerate(data):
if x == 0 and i == 1:
start.append((index+1))
x == 1
elif x == 1 and i==0:
end.append((index))
x == 0
return start, end
print (detector(n))
However when I run the code above, it returned like below, which is not my desired output.
([1, 2, 3, 4, 22, 23, 24, 25, 26, 27, 28, 34, 35, 36, 37, 38], [])
My desired output is as below:
([1, 22, 34], [4,28,38])
As you can see above, the start_time should be[1,22,34] and end_time should be [4,28,38].
If anyone knows how to solve the issue, pls let me know. Appreciated!!

One issue is certainly, that you dont change flag.
== is a comparison operator and does not assign a new value to flag

using enumerate to get positions of 1s and zip to find when sequence of consecutive 1s starts/ends
ones_positions = [position
for position, value in enumerate(n)
if value == 1]
ones_starts = [ones_positions[0]] + [
next_position
for position, next_position in zip(ones_positions,
ones_positions[1:])
if next_position - position > 1]
ones_ends = [position
for position, next_position in zip(ones_positions,
ones_positions[1:])
if next_position - position > 1] + [ones_positions[-1]]
gives us
>>>ones_starts
[0, 21, 33]
>>>ones_ends
[3, 27, 37]
we can specify enumerate's start parameter if you want your indices to start from 1 (when they are naturally start from 0)
ones_positions = [position
for position, value in enumerate(n, start=1)
if value == 1]
after that
>>>ones_starts
[1, 22, 34]
>>>ones_ends
[4, 28, 38]
Finally we can write it as function:
def detector(data, target_value=1):
positions = [position
for position, value in enumerate(data, start=1)
if value == target_value]
start_times = [positions[0]] + [
next_position
for position, next_position in zip(positions,
positions[1:])
if next_position - position > 1]
end_times = [position
for position, next_position in zip(positions,
positions[1:])
if next_position - position > 1] + [positions[-1]]
return start_times, end_times
and test
n = [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
print(detector(n))
gives us
([1, 22, 34], [4, 28, 38])

n = [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
prev_num = 0
starts = []
ends = []
result = (starts, ends)
for idx, num in enumerate(n):
if prev_num == 0 and num == 1:
starts.append(idx + 1)
elif prev_num == 1 and num == 0:
ends.append(idx + 1)
elif num == 1 and idx == (len(n) - 1):
ends.append(idx + 1)
prev_num = num
print(result)
Which prints:
[[1, 22, 34], [5, 29, 38]]

Since #DanielChristiany pointed you where your mistake was. I will present you my solution which is faster than any of presented(at least that works correctly):
edges = (index for index, i in enumerate(n[1:], 1) if i != n[index-1])
if n[0] == 1:
edges = (1, *edges)
if n[-1] == 1:
some = (*edges, len(n))
print(edges[::2], edges[1::2])
Basically it firstly searches edges where element changes from 0 to 1 or from 1 to 0. Then checks if first and last elements are 1 and then print result.
This solution also uses less memory since it uses generators.

You could also try using groupby:
import itertools
L = [[y[0] for y in it]
for x,it in
itertools.groupby(enumerate(n),lambda x: x[1])
][::2]
res = [x[0] for x in L],[x[-1] for x in L]
You could probably arrive at an even more correct solution without using indexes.
Thanks to vishes_shell for the correction

How to get individual numbers inside list which is inside another list in Python?

I'm completely at a loss on trying to figure out how to get the individual numbers inside in a list
Here is my code:
infinity = 1000000
invalid_node = -1
class Node:
previous = invalid_node
distFromSource = infinity
visited = False
def populateNetwork(fileName):
network = []
networkFile = open(fileName, "r")
for line in networkFile:
network.append(map(int, line.strip().split(',')))
return network
def populateNodeTable(network, StartNode):
nodeTable = []
for node in network:
nodeTable.append(Node())
nodeTable[StartNode].distFromSource = 0
nodeTable[StartNode].visited = True
return nodeTable
network = populateNetwork('network.txt')
nodeTable = populateNodeTable(network, 1)
nodeTable2 = populateNodeTable(network, 2)
print "Visited Nodes"
for node in nodeTable:
print node.previous, node.distFromSource, node.visited
print
print "This is what is inside network"
for line in network:
print line
print
print "what is inside index 6"
print network[6]
Here is the output:
Visited Nodes
-1 1000000 False
-1 0 True
-1 1000000 False
-1 1000000 False
-1 1000000 False
-1 1000000 False
-1 1000000 False
This is what is inside network
[0, 2, 4, 1, 6, 0, 0]
[2, 0, 0, 0, 5, 0, 0]
[4, 0, 0, 0, 0, 5, 0]
[1, 0, 0, 0, 1, 1, 0]
[6, 5, 0, 1, 0, 5, 5]
[0, 0, 5, 1, 5, 0, 0]
[0, 0, 0, 0, 5, 0, 0]
what is inside index 6
[0, 0, 0, 0, 5, 0, 0]
My question is, how do I get the individual numbers inside an index to be used to calculate? So for example index[1] contains "0, 2, 4, 1, 6, 0, 0" and I am going to use these numbers to do addition so 0+2+4+1+6+0+0 = 13. I'm really confused.

print network[0] # 0, 2, 4, 1, 6, 0, 0
print network[0][0] # 0
print network[0][1] # 2
print network[0][2] # 4
for x in network[0]:
print x
# 0
# 2
# 4
# 1
# 6
# 0
# 0
print sum(network[0]) # 13

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Computing Adjusted Rand Index - python

Related

Python variable and data structure scope

how to count negative and positive numbers in each set from a list?

Fibonacci Adder Machine - Calculating Digits in Fibonacci Sequence

Data detector for number 1 and 0

How to get individual numbers inside list which is inside another list in Python?

Categories

Resources