finding connected components in an image using python - python

Actually an image is descretized into 3 bins (0,1,2) .So any color that falls into particular bin is replaced with the bin no.Therefore discretized image can be viewed as this matrix:
a=[[2,1,2,2,1,1],
[2,2,1,2,1,1],
[2,1,3,2,1,1],
[2,2,2,1,1,2],
[2,2,1,1,2,2],
[2,2,1,1,2,2]]
The next step is to compute the connected components. Individual components will be labeled with letters (A;B;C;D;E;F etc) and we will need to keep a table which maintains the discretized color associated with each label, along with the number of pixels with that label. Of course, the same discretized color can be associated with different labels if multiple contiguous regions of the same color exist. The image may then become
b=[[B,C,B,B,A,A],
[B,B,C,B,A,A],
[B,C,D,B,A,A],
[B,B,B,A,A,E],
[B,B,A,A,E,E],
[B,B,A,A,E,E]]
and the connected components table will be:
Label A B C D E
Color 1 2 1 3 1
Size 12 15 3 1 5
Let q=4.The components A, B, and E have more than q pixels, and the components C and D less than q pixels. Therefore the pixels in A;B and E are classied as coherent, while the pixels in C and D are classied as incoherent. The CCV for this image will be
Color : 1 2 3
coherent: 17 15 0
incoherent: 3 0 1
A given color bucket may thus contain only coherent pixels (as does 2), only incoherent pixels
(as does 3), or a mixture of coherent and incoherent pixels (as does 1). If we assume there are only 3 possible discretized colors, the CCV can also be written as
<(17; 3) ; (15; 0) ; (0; 1)>
for three colors
Please anybody help me with the algorithm for finding connected components
I have implemented iterative dfs and recursive dfs ,but both seem to be inefficient ,they take nearly 30 minutes to compute connected components of an image.Anybody help me how to find it ?I'm running out of time I have to submit my project. I'm pasting both my codes:
Image size:384*256
code using recursive dfs:
import cv2
import sys
from PIL import Image
import ImageFilter
import numpy
import PIL.Image
from numpy import array
stack=[]
z=0
sys.setrecursionlimit(9000000)
def main():
imageFile='C:\Users\Abhi\Desktop\cbir-p\New folder\gray_image.jpg'
size = Image.open(imageFile).size
print size
im=Image.open(imageFile)
inimgli=[]
for x in range(size[0]):
inimgli.append([])
for y in range(size[1]):
inten=im.getpixel((x,y))
inimgli[x].append(inten)
for item in inimgli:
item.insert(0,0)
item.append(0)
inimgli.insert(0,[0]*len(inimgli[0]))
inimgli.append([0]*len(inimgli[0]))
blurimg=[]
for i in range(1,len(inimgli)-1):
blurimg.append([])
for j in range(1,len(inimgli[0])-1):
blurimg[i-1].append((inimgli[i-1][j-1]+inimgli[i-1][j]+inimgli[i-1][j+1]+inimgli[i][j-1]+inimgli[i][j]+inimgli[i][j+1]+inimgli[i+1][j-1]+inimgli[i+1][j]+inimgli[i+1][j+1])/9)
#print blurimg
displi=numpy.array(blurimg).T
im1 = Image.fromarray(displi)
im1.show()
#i1.save('gray.png')
descretize(blurimg)
def descretize(rblurimg):
count=-1
desc={}
for i in range(64):
descli=[]
for t in range(4):
count=count+1
descli.append(count)
desc[i]=descli
del descli
#print len(rblurimg),len(rblurimg[0])
#print desc
drblur=[]
for x in range(len(rblurimg)):
drblur.append([])
for y in range(len(rblurimg[0])):
for item in desc:
if rblurimg[x][y] in desc[item]:
drblur[x].append(item)
#displi1=numpy.array(drblur).T
#im1 = Image.fromarray(displi1)
#im1.show()
#im1.save('xyz.tif')
#print drblur
connected(drblur)
def connected(rdrblur):
table={}
#print len(rdrblur),len(rdrblur[0])
for item in rdrblur:
item.insert(0,0)
item.append(0)
#print len(rdrblur),len(rdrblur[0])
rdrblur.insert(0,[0]*len(rdrblur[0]))
rdrblur.append([0]*len(rdrblur[0]))
copy=[]
for item in rdrblur:
copy.append(item[:])
global z
count=0
for i in range(1,len(rdrblur)-1):
for j in range(1,len(rdrblur[0])-1):
if (i,j) not in stack:
if rdrblur[i][j]==copy[i][j]:
z=0
times=dfs(i,j,str(count),rdrblur,copy)
table[count]=(rdrblur[i][j],times+1)
count=count+1
#z=0
#times=dfs(1,255,str(count),rdrblur,copy)
#print times
#print stack
stack1=[]
#copy.pop()
#copy.pop(0)
#print c
#print table
for item in table.values():
stack1.append(item)
#print stack1
table2={}
for v in range(64):
table2[v]={'coherent':0,'incoherent':0}
#for item in stack1:
# if item[0] not in table2.keys():
# table2[item[0]]={'coherent':0,'incoherent':0}
for item in stack1:
if item[1]>300:
table2[item[0]]['coherent']=table2[item[0]]['coherent']+item[1]
else:
table2[item[0]]['incoherent']=table2[item[0]]['incoherent']+item[1]
print table2
#return table2
def dfs(x,y,co,b,c):
dx = [-1,-1,-1,0,0,1,1,1]
dy = [-1,0,1,-1,1,-1,0,1]
global z
#print x,y,co
c[x][y]=co
stack.append((x,y))
#print dx ,dy
for i in range(8):
nx = x+(dx[i])
ny = y+(dy[i])
#print nx,ny
if b[x][y] == c[nx][ny]:
dfs(nx,ny,co,b,c)
z=z+1
return z
if __name__ == '__main__':
main()
iterative dfs:
def main():
imageFile='C:\Users\Abhi\Desktop\cbir-p\New folder\gray_image.jpg'
size = Image.open(imageFile).size
print size
im=Image.open(imageFile)
inimgli=[]
for x in range(size[0]):
inimgli.append([])
for y in range(size[1]):
inten=im.getpixel((x,y))
inimgli[x].append(inten)
for item in inimgli:
item.insert(0,0)
item.append(0)
inimgli.insert(0,[0]*len(inimgli[0]))
inimgli.append([0]*len(inimgli[0]))
blurimg=[]
for i in range(1,len(inimgli)-1):
blurimg.append([])
for j in range(1,len(inimgli[0])-1):
blurimg[i-1].append((inimgli[i-1][j-1]+inimgli[i-1][j]+inimgli[i-1][j+1]+inimgli[i][j-1]+inimgli[i][j]+inimgli[i][j+1]+inimgli[i+1][j-1]+inimgli[i+1][j]+inimgli[i+1][j+1])/9)
#print blurimg
#displi=numpy.array(blurimg).T
#im1 = Image.fromarray(displi)
#im1.show()
#i1.save('gray.png')
descretize(blurimg)
def descretize(rblurimg):
count=-1
desc={}
for i in range(64):
descli=[]
for t in range(4):
count=count+1
descli.append(count)
desc[i]=descli
del descli
#print len(rblurimg),len(rblurimg[0])
#print desc
drblur=[]
for x in range(len(rblurimg)):
drblur.append([])
for y in range(len(rblurimg[0])):
for item in desc:
if rblurimg[x][y] in desc[item]:
drblur[x].append(item)
#displi1=numpy.array(drblur).T
#im1 = Image.fromarray(displi1)
#im1.show()
#im1.save('xyz.tif')
#print drblur
connected(drblur)
def connected(rdrblur):
for item in rdrblur:
item.insert(0,0)
item.append(0)
#print len(rdrblur),len(rdrblur[0])
rdrblur.insert(0,[0]*len(rdrblur[0]))
rdrblur.append([0]*len(rdrblur[0]))
#print len(rdrblur),len(rdrblur[0])
copy=[]
for item in rdrblur:
copy.append(item[:])
count=0
#temp=0
#print len(alpha)
for i in range(1,len(rdrblur)-1):
for j in range(1,len(rdrblur[0])-1):
if (i,j) not in visited:
dfs(i,j,count,rdrblur,copy)
count=count+1
print "success"
def dfs(x,y,co,b,c):
global z
#print x,y,co
stack=[]
c[x][y]=str(co)
visited.append((x,y))
stack.append((x,y))
while len(stack) != 0:
exstack=find_neighbors(stack.pop(),co,b,c)
stack.extend(exstack)
#print visited
#print stack
#print len(visited)
#print c
'''while (len(stack)!=0):
(x1,y1)=stack.pop()
exstack=find_neighbors(x1,y1)
stack.extend(exstack)'''
def find_neighbors((x2,y2),cin,b,c):
#print x2,y2
neighborli=[]
for i in range(8):
x=x2+(dx[i])
y=y2+(dy[i])
if (x,y) not in visited:
if b[x2][y2]==b[x][y]:
visited.append((x,y))
c[x][y]=str(cin)
neighborli.append((x,y))
return neighborli
if __name__ == '__main__':
main()

Here's another post I have answered which doing exactly the same thing
which include a sample code using simply DFS.
How do I find the connected components in a binary image?
Modify the DFS function: add one parameter current_color = {0,1,2}, so that you can decide if you can go to another node from this node or not. (If the nabouring node has same color with current_color and not yet visit, recurssively visit that node)

The DFS is good algorithm but the recursive algorithm is space inefficient and non recursive one is very complex so I would advice connected component labelling algorithm which uses disjoint-set datastructure in two pass to get solution in non recursive way in linear time.
Note: Use image processing libraries for the same as they do have parallel fast implementation.

I had a similar issue, but in 3D, and asked a question about that here:
Increasing efficiency of union-find
I found the union-find algorithm to be much faster than anything else for my case (which makes sense given the complexity)

Related

I need to be able to 'rotate' a list thats in a 'square' looking formation into a diamond shape python

I am creating a word search solver and need a way to rotate the word search, which is in a list, so the left corner is the 'top' and the bottom right is at the 'bottom'
I have this:
Puzzle = ["FUNCTIONRRIRAI",
"RAIOONFRCCPWON",
"PTCSNOBEUITOLO",
"BNCACIANTOSLIH",
"RBYOLILYNREFBT",
"HYYNOGESTIBRIY",
"AATTSIONCMCENP",
"UORTENRRCBFVAU",
"CEBEECVWIERORI",
"PROCESSORTOPYF",
"OHCOMPUTERHSOS",
"YCYPRESREOSMRW",
"OATHBRMVTHHCTR",
"PGORWOOUIPSCHP"]
I need it in the formation of:
Puzzle = ["F","RU","PAN","BTIC",...]
so it appears that the word search has been rotated 45 degrees
any suggestions/help would be appreciated
Code for find_horizontal and words to find:
def load_words_to_find(file_name):
word_list = []
file = open(file_name, "r")
for line in file.readlines():
word_list.append(line)
word_list = list(map(lambda s: s.strip(), word_list))
return word_list
def find_horizontal(Puzzle, Words, ReplaceWith, Found):
# Parameters :- List:Puzzle, List:Words, Character:ReplaceWith, List:Found
# Return :- List:Outpuz, List:Found
# Find all words which are horizontally in place (left to right and right to left), return the puzzle and list of found words
rev = ''
Outpuz = Puzzle
for line in Puzzle:
rev = line[::-1]
for word in Words:
if word in line:
Found.append(word)
Puzzle[Puzzle.index(line)] = line.replace(word, ReplaceWith * len(word))
if word in rev:
Found.append(word)
Puzzle[Puzzle.index(line)] = line.replace(word[::-1], ReplaceWith * len(word))
else:
pass
print("Found: ", Found)
print(Outpuz)
return Outpuz, Found
find_horizontal(Puzzle, load_words_to_find("words.txt"), ".", [])
Kind of silly, but you could insert string iterators to the front of a list, and then join and yield the next character from each iterator.
rows = [
"FUNCTIONRRIRAI",
"RAIOONFRCCPWON",
"PTCSNOBEUITOLO",
"BNCACIANTOSLIH",
"RBYOLILYNREFBT",
"HYYNOGESTIBRIY",
"AATTSIONCMCENP",
"UORTENRRCBFVAU",
"CEBEECVWIERORI",
"PROCESSORTOPYF",
"OHCOMPUTERHSOS",
"YCYPRESREOSMRW",
"OATHBRMVTHHCTR",
"PGORWOOUIPSCHP"
]
def get_next_diagonal(rows):
iters = []
for row in rows:
iters.insert(0, iter(row))
yield "".join(next(it, "") for it in iters)
while iters[0].__length_hint__():
yield "".join(next(it, "") for it in iters)
for diagonal in get_next_diagonal(rows):
print(diagonal)
Output:
F
RU
PAN
BTIC
RNCOT
HBCSOI
AYYANNO
UAYOCOFN
COTNLIBRR
PERTOIAECR
ORBTSGLNUCI
YHOEEIEYTIPR
OCCCENOSNOTWA
PAYOECRNTRSOOI
GTPMSVRCIELLN
OHRPSWCMBFIO
RBEUOIBCRBH
WRSTREFEIT
OMRETRVNY
OVEROOAP
UTOHPRU
IHSSYI
PHMOF
SCRS
CTW
HR
P
Apologies for the lack of efficiency, but here is my quick solution.
We view the grid as a set of coordinates. This solution is based off the fact that every item in the output you want will have X and Y coordinates than add up to a certain number. For example, the first item "F" has a location of (0,0), which sum to 0. The second two items "RU" are located ad (1,0) and (0,1), which both add up to 1. The third line "PAN" has letters located at (0,2), (1,1) and (2,0), and so on. Tracking this number with "i" in my solution, it needs to be big enough to scan down and across the lists, so this is twice the size of the length of the array. So we scan the list for items that are located at x and y coordinates where x+y==i.
Code:
Puzzle = ["FUNCTIONRRIRAI",
"RAIOONFRCCPWON",
"PTCSNOBEUITOLO",
"BNCACIANTOSLIH",
"RBYOLILYNREFBT",
"HYYNOGESTIBRIY",
"AATTSIONCMCENP",
"UORTENRRCBFVAU",
"CEBEECVWIERORI",
"PROCESSORTOPYF",
"OHCOMPUTERHSOS",
"YCYPRESREOSMRW",
"OATHBRMVTHHCTR",
"PGORWOOUIPSCHP"]
output = []
i = 0
while i < len(Puzzle)*2:
single_string = ""
for y in range(0,len(Puzzle)):
for x in range(0,len(Puzzle[0])):
if (x + y) == i:
single_string += Puzzle[x][y]
if single_string != "":
output.append(single_string)
i += 1
print(output)
Outputs
['F',
'RU',
'PAN',
'BTIC',
'RNCOT',
'HBCSOI',
'AYYANNO',
'UAYOCOFN',
'COTNLIBRR',
'PERTOIAECR',
'ORBTSGLNUCI',
'YHOEEIEYTIPR',
'OCCCENOSNOTWA',
'PAYOECRNTRSOOI',
'GTPMSVRCIELLN',
'OHRPSWCMBFIO',
'RBEUOIBCRBH',
'WRSTREFEIT',
'OMRETRVNY',
'OVEROOAP',
'UTOHPRU',
'IHSSYI',
'PHMOF',
'SCRS',
'CTW',
'HR',
'P']
Hope this helps. Happy to clarify anything if needed
I find it easier and clearer to just generate the indexes in two loops, down the rows starting at the first column and then along the columns starting at the last row:
puzzle = ['FUNCTIONRRIRAI',
'RAIOONFRCCPWON',
'PTCSNOBEUITOLO',
'BNCACIANTOSLIH',
'RBYOLILYNREFBT',
'HYYNOGESTIBRIY',
'AATTSIONCMCENP',
'UORTENRRCBFVAU',
'CEBEECVWIERORI',
'PROCESSORTOPYF',
'OHCOMPUTERHSOS',
'YCYPRESREOSMRW',
'OATHBRMVTHHCTR',
'PGORWOOUIPSCHP']
nrows = len(puzzle)
ncols = len(puzzle[0])
output = []
for ir in range(nrows):
row = []
ic = 0
jr = ir
while jr >= 0:
row.append(puzzle[jr][ic])
ic += 1
jr -= 1
output.append(''.join(row))
for ic in range(1, ncols):
row = []
ir = nrows - 1
jc = ic
while jc < ncols:
row.append(puzzle[ir][jc])
ir -= 1
jc += 1
output.append(''.join(row))
for row in output:
print(row)
Output:
F
RU
PAN
BTIC
RNCOT
HBCSOI
AYYANNO
UAYOCOFN
COTNLIBRR
PERTOIAECR
ORBTSGLNUCI
YHOEEIEYTIPR
OCCCENOSNOTWA
PAYOECRNTRSOOI
GTPMSVRCIELLN
OHRPSWCMBFIO
RBEUOIBCRBH
WRSTREFEIT
OMRETRVNY
OVEROOAP
UTOHPRU
IHSSYI
PHMOF
SCRS
CTW
HR
P
Variation on a theme - generate diagonal indices.
If you trace your finger down the left edge, those are the starting point for the top-half diagonals and tracing your finger across the bottom edge are starting points for the bottom-half diagonals.
Top Half:
From each diagonal's starting point the first dimension's indices range from the starting point to zero (negative steps) and the second dimension's indices range from zero to a maximum of the length of the second dimension (positive steps).
Bottom Half:
From each diagonal's starting point the first dimension's indices range from the length of the first dimension minus one to a minimum of zero (negative steps) and the second dimension's indices range from the starting point to a the length of the second dimension (positive steps).
For both sets of diagonals you can take advantage of the fact that zip will stop when the shortest iterable is exhausted.
def cw(dims,ragged=False):
'''Generate indices for diagonals based on dims.
dims --> tuple: (nrows,ncolumns)
Currently only implemented for all rows have same number of columns
Diagonals as if the rectangle was rotated CW 45 degrees
bottom-left to upper-right
'''
if ragged: raise NotImplementedError
nrows,ncolumns = dims
# top half
index1 = range(0,ncolumns)
for i in range(nrows):
yield zip(range(i,-1,-1),index1)
# bottom half
index0 = range(nrows-1,-1,-1)
for i in range(1,nrows):
yield zip(index0,range(i,ncolumns))
new = []
for diagonal in cw((len(Puzzle),len(Puzzle[0]))):
new.append(''.join(Puzzle[j][k] for (j,k) in diagonal))
And a CCW rotation solution
def ccw(dims,ragged=False):
'''Generate indices for diagonals based on dims.
dims --> tuple: (nrows,ncolumns)
Currently only implemented for all rows have same number of columns
Diagonals as if the rectangle was rotated CCW 45 degrees
top-left to bottom-right
'''
if ragged: raise NotImplementedError
nrows,ncolumns = dims
# top half
index0 = range(0,nrows)
for i in range(ncolumns,-1,-1):
yield zip(index0,range(i,ncolumns))
# bottom half
index1 = range(0,ncolumns)
for i in range(1,nrows):
yield zip(range(i,nrows),index1,)

Appending list sometimes give 'IndexError: list index out of range' error and results in not as expected

So i'm still new to programming and trying to implement an initialization method for a clustering problem using python-2.7.
The steps are:
Pick a random data from dataset as first centroid
While number of data in centroid < n_klas : Calculate the data distance to the data in centroids
Calculate the probability of all datas to their closest centroid using formula
P(x) = D(x)**2 / sum(D(x)**2), in which D(x) is euclidean distance from data[x] to the closest centroid
Pick Data with highest P(x), then loop back to no.2.
But when i try to appending data sometimes i got this error 'IndexError: list index out of range' and sometimes the code works but only give 2 different centroid and the 3rd to n centroid give the same values as the 2nd centroid.
Where did i do wrong?
(Edit: i edited the steps to doi it because i was wrong)
def pickcentroid(df):
x = df.values.tolist()
n_klas = 3
# random.seed(2)
idx_pusat_pertama = random.randint(0, len(df))
centroid = []
centroid_idx = []
centroid.append(x[idx_pusat_pertama])
centroid_idx.append(idx_pusat_pertama)
prob_data = []
while len(centroid) < n_klas:
ac_mindist = 0
for i in x:
dist_ke_c = []
for c in centroid:
dist_ke_c.append(dist(i,c))
ac_mindist += min(dist_ke_c)**2
for idx in range(len(df)) :
if idx not in centroid_idx:
dist_ke_c2 = []
mindist_per_data = 0
for c in centroid:
dist_ke_c2.append(dist(x[idx],c))
mindist_per_data = min(dist_ke_c2)**2
prob_data.append(mindist_per_data/ac_mindist)
else:
prob_data.append(0)
new_cen_idx = prob_data.index(max(prob_data))
centroid_idx.append(new_cen_idx)
centroid.append(x[new_cen_idx])
print(centroid)
return centroid
def dist(x,y):
r = np.array(x) - np.array(y)
distance = np.linalg.norm(r)
# print(distance)
return distance
c = pickcentroid(df)
And the data looks like this:
-0.19864726098025476,-0.2174575876560727
-0.19427576174137176,-0.2658220115362011
0.24385376109048476,0.1555938625346895
-0.23636704446757748,0.14005058641250595
0.37563103051045826,0.33204816285389527
-0.13210748354848134,-0.0019122205360639893
-0.17120654390561796,0.04231258139538708
0.2865229979171536,0.34175192153482764
-0.328896319205639,-0.22737124434792602
0.03115098005450885,0.17089336362457433
Thankyou very much for your kind help
The randint(a, b) returns random integers from a to b, including b. So, when you use randint(0, len(x)), you might get the value len(x) as output, which is out of range when used as index.
For your use case, you could probably use random_value = random.choice(x) instead.

Spawning objects in groups when the first object of the group was spawned randomly Python

I'm currently doing a project, and in the code I have, I'm trying to get trees .*. and mountains .^. to spawn in groups around the first tree or mountain which is spawned randomly, however, I can't figure out how to get the trees and mountains to spawn in groups around a single randomly generated point. Any help?
grid = []
def draw_board():
row = 0
for i in range(0,625):
if grid[i] == 1:
print("..."),
elif grid[i] == 2:
print("..."),
elif grid[i] == 3:
print(".*."),
elif grid[i] == 4:
print(".^."),
elif grid[i] == 5:
print("[T]"),
else:
print("ERR"),
row = row + 1
if row == 25:
print ("\n")
row = 0
return
There's a number of ways you can do it.
Firstly, you can just simulate the groups directly, i.e. pick a range on the grid and fill it with a specific figure.
def generate_grid(size):
grid = [0] * size
right = 0
while right < size:
left = right
repeat = min(random.randint(1, 5), size - right) # *
right = left + repeat
grid[left:right] = [random.choice(figures)] * repeat
return grid
Note that the group size need not to be uniformly distributed, you can use any convenient distribution, e.g. Poisson.
Secondly, you can use a Markov Chain. In this case group lengths will implicitly follow a Geometric distribution. Here's the code:
def transition_matrix(A):
"""Ensures that each row of transition matrix sums to 1."""
copy = []
for i, row in enumerate(A):
total = sum(row)
copy.append([item / total for item in row])
return copy
def generate_grid(size):
# Transition matrix ``A`` defines the probability of
# changing from figure i to figure j for each pair
# of figures i and j. The grouping effect can be
# obtained by setting diagonal entries A[i][i] to
# larger values.
#
# You need to specify this manually.
A = transition_matrix([[5, 1],
[1, 5]]) # Assuming 2 figures.
grid = [random.choice(figures)]
for i in range(1, size):
current = grid[-1]
next = choice(figures, A[current])
grid.append(next)
return grid
Where the choice function is explained in this StackOverflow answer.

Algorithm for matching objects

I have 1,000 objects, each object has 4 attribute lists: a list of words, images, audio files and video files.
I want to compare each object against:
a single object, Ox, from the 1,000.
every other object.
A comparison will be something like:
sum(words in common+ images in common+...).
I want an algorithm that will help me find the closest 5, say, objects to Ox and (a different?) algorithm to find the closest 5 pairs of objects
I've looked into cluster analysis and maximal matching and they don't seem to exactly fit this scenario. I don't want to use these method if something more apt exists, so does this look like a particular type of algorithm to anyone, or can anyone point me in the right direction to applying the algorithms I mentioned to this?
I made an example program for how to solve your first question. But you have to implement ho you want to compare images, audio and videos. And I assume every object has the same length for all lists. To answer your question number two it would be something similar, but with a double loop.
import numpy as np
from random import randint
class Thing:
def __init__(self, words, images, audios, videos):
self.words = words
self.images = images
self.audios = audios
self.videos = videos
def compare(self, other):
score = 0
# Assuming the attribute lists have the same length for both objects
# and that they are sorted in the same manner:
for i in range(len(self.words)):
if self.words[i] == other.words[i]:
score += 1
for i in range(len(self.images)):
if self.images[i] == other.images[i]:
score += 1
# And so one for audio and video. You have to make sure you know
# what method to use for determining when an image/audio/video are
# equal.
return score
N = 1000
things = []
words = np.random.randint(5, size=(N,5))
images = np.random.randint(5, size=(N,5))
audios = np.random.randint(5, size=(N,5))
videos = np.random.randint(5, size=(N,5))
# For testing purposes I assign each attribute to a list (array) containing
# five random integers. I don't know how you actually intend to do it.
for i in xrange(N):
things.append(Thing(words[i], images[i], audios[i], videos[i]))
# I will assume that object number 999 (i=999) is the Ox:
ox = 999
scores = np.zeros(N - 1)
for i in xrange(N - 1):
scores[i] = (things[ox].compare(things[i]))
best = np.argmax(scores)
print "The most similar thing is thing number %d." % best
print
print "Ox attributes:"
print things[ox].words
print things[ox].images
print things[ox].audios
print things[ox].videos
print
print "Best match attributes:"
print things[ox].words
print things[ox].images
print things[ox].audios
print things[ox].videos
EDIT:
Now here is the same program modified sligthly to answer your second question. It turned out to be very simple. I basically just needed to add 4 lines:
Changing scores into a (N,N) array instead of just (N).
Adding for j in xrange(N): and thus creating a double loop.
if i == j:
break
where 3. and 4. is just to make sure that I only compare each pair of things once and not twice and don't compary any things with themselves.
Then there is a few more lines of code that is needed to extract the indices of the 5 largest values in scores. I also reformated the printing so it will be easy to confirm by eye that the printed pairs are actually very similar.
Here comes the new code:
import numpy as np
class Thing:
def __init__(self, words, images, audios, videos):
self.words = words
self.images = images
self.audios = audios
self.videos = videos
def compare(self, other):
score = 0
# Assuming the attribute lists have the same length for both objects
# and that they are sorted in the same manner:
for i in range(len(self.words)):
if self.words[i] == other.words[i]:
score += 1
for i in range(len(self.images)):
if self.images[i] == other.images[i]:
score += 1
for i in range(len(self.audios)):
if self.audios[i] == other.audios[i]:
score += 1
for i in range(len(self.videos)):
if self.videos[i] == other.videos[i]:
score += 1
# You have to make sure you know what method to use for determining
# when an image/audio/video are equal.
return score
N = 1000
things = []
words = np.random.randint(5, size=(N,5))
images = np.random.randint(5, size=(N,5))
audios = np.random.randint(5, size=(N,5))
videos = np.random.randint(5, size=(N,5))
# For testing purposes I assign each attribute to a list (array) containing
# five random integers. I don't know how you actually intend to do it.
for i in xrange(N):
things.append(Thing(words[i], images[i], audios[i], videos[i]))
################################################################################
############################# This is the new part: ############################
################################################################################
scores = np.zeros((N, N))
# Scores will become a triangular matrix where scores[i, j]=value means that
# value is the number of attrributes thing[i] and thing[j] have in common.
for i in xrange(N):
for j in xrange(N):
if i == j:
break
# Break the loop here because:
# * When i==j we would compare thing[i] with itself, and we don't
# want that.
# * For every combination where j>i we would repeat all the
# comparisons for j<i and create duplicates. We don't want that.
scores[i, j] = (things[i].compare(things[j]))
# I want the 5 most similar pairs:
n = 5
# This list will contain a tuple for each of the n most similar pairs:
best_list = []
for k in xrange(n):
ij = np.argmax(scores) # Returns a single integer: ij = i*n + j
i = ij / N
j = ij % N
best_list.append((i, j))
# Erease this score so that on next iteration the second largest score
# is found:
scores[i, j] = 0
for k, (i, j) in enumerate(best_list):
# The number 1 most similar pair is the BEST match of all.
# The number N most similar pair is the WORST match of all.
print "The number %d most similar pair is thing number %d and %d." \
% (k+1, i, j)
print "Thing%4d:" % i, \
things[i].words, things[i].images, things[i].audios, things[i].videos
print "Thing%4d:" % j, \
things[j].words, things[j].images, things[j].audios, things[j].videos
print
If your comparison works with "create a sum of all features and find those which the closest sum", there is a simple trick to get close objects:
Put all objects into an array
Calculate all the sums
Sort the array by sum.
If you take any index, the objects close to it will now have a close index as well. So to find the 5 closest objects, you just need to look at index+5 to index-5 in the sorted array.

Quickly counting particles in grid

I've written some python code to calculate a certain quantity from a cosmological simulation. It does this by checking whether a particle in contained within a box of size 8,000^3, starting at the origin and advancing the box when all particles contained within it are found. As I am counting ~2 million particles altogether, and the total size of the simulation volume is 150,000^3, this is taking a long time.
I'll post my code below, does anybody have any suggestions on how to improve it?
Thanks in advance.
from __future__ import division
import numpy as np
def check_range(pos, i, j, k):
a = 0
if i <= pos[2] < i+8000:
if j <= pos[3] < j+8000:
if k <= pos[4] < k+8000:
a = 1
return a
def sigma8(data):
N = []
to_do = data
print 'Counting number of particles per cell...'
for k in range(0,150001,8000):
for j in range(0,150001,8000):
for i in range(0,150001,8000):
temp = []
n = []
for count in range(len(to_do)):
n.append(check_range(to_do[count],i,j,k))
to_do[count][1] = n[count]
if to_do[count][1] == 0:
temp.append(to_do[count])
#Only particles that have not been found are
# searched for again
to_do = temp
N.append(sum(n))
print 'Next row'
print 'Next slice, %i still to find' % len(to_do)
print 'Calculating sigma8...'
if not sum(N) == len(data):
return 'Error!\nN measured = {0}, total N = {1}'.format(sum(N), len(data))
else:
return 'sigma8 = %.4f, variance = %.4f, mean = %.4f' % (np.sqrt(sum((N-np.mean(N))**2)/len(N))/np.mean(N), np.var(N),np.mean(N))
I'll try to post some code, but my general idea is the following: create a Particle class that knows about the box that it lives in, which is calculated in the __init__. Each box should have a unique name, which might be the coordinate of the bottom left corner (or whatever you use to locate your boxes).
Get a new instance of the Particle class for each particle, then use a Counter (from the collections module).
Particle class looks something like:
# static consts - outside so that every instance of Particle doesn't take them along
# for the ride...
MAX_X = 150,000
X_STEP = 8000
# etc.
class Particle(object):
def __init__(self, data):
self.x = data[xvalue]
self.y = data[yvalue]
self.z = data[zvalue]
self.compute_box_label()
def compute_box_label(self):
import math
x_label = math.floor(self.x / X_STEP)
y_label = math.floor(self.y / Y_STEP)
z_label = math.floor(self.z / Z_STEP)
self.box_label = str(x_label) + '-' + str(y_label) + '-' + str(z_label)
Anyway, I imagine your sigma8 function might look like:
def sigma8(data):
import collections as col
particles = [Particle(x) for x in data]
boxes = col.Counter([x.box_label for x in particles])
counts = boxes.most_common()
#some other stuff
counts will be a list of tuples which map a box label to the number of particles in that box. (Here we're treating particles as indistinguishable.)
Using list comprehensions is much faster than using loops---I think the reason is that you're basically relying more on the underlying C, but I'm not the person to ask. Counter is (supposedly) highly-optimized as well.
Note: None of this code has been tested, so you shouldn't try the cut-and-paste-and-hope-it-works method here.

Categories