Related
I have set up a very simple SVC to classify the MNIST digits. For some reason, the classifier is pretty consistently incorrectly predicting the digit 5, but when trying all other numbers it doesn't miss a single one. Does anyone have any idea if I might be setting this up wrong, or if it's just really bad at predicting the number 5?
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
data = datasets.load_digits()
images = data.images
targets = data.target
# Split into train and test sets
images_train, images_test, imlabels_train, imlabels_test = train_test_split(images, targets, test_size=.2, shuffle=False)
# Re-shape data so that it's 2D
images_train = np.reshape(images_train, (np.shape(images_train)[0], 64))
images_test = np.reshape(images_test, (np.shape(images_test)[0], 64))
svm_classifier = SVC(gamma='auto').fit(images_train, imlabels_train)
number_correct_svc = 0
preds = []
for label_index in range(len(imlabels_test)):
pred = svm_classifier.predict(images_test[label_index].reshape(1,-1))
if pred[0] == imlabels_test[label_index]:
number_correct_svc += 1
preds.append(pred[0])
print("Support Vector Classifier...")
print(f"\tPercent correct for all test data: {100*number_correct_svc/len(imlabels_test)}%")
confusion_matrix(preds,imlabels_test)
Here is the resulting confusion matrix:
array([[22, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 0, 15, 0, 0, 0, 0, 0, 0, 0, 0],
[ 0, 0, 15, 0, 0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 21, 0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 21, 0, 0, 0, 0, 0],
[13, 21, 20, 16, 16, 37, 23, 20, 31, 16],
[ 0, 0, 0, 0, 0, 0, 14, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 16, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 21]], dtype=int64)
I've been reading the sklearn page for SVC but can't tell what I'm doing wrong
Update:
I tried using SCV(gamma='scale') and it seems much more reasonable. It would still be nice to know why 'auto' doesn't work?
with scale:
array([[34, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 0, 36, 0, 0, 0, 0, 0, 0, 1, 0],
[ 0, 0, 35, 0, 0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 27, 0, 0, 0, 0, 0, 1],
[ 1, 0, 0, 0, 34, 0, 0, 0, 0, 0],
[ 0, 0, 0, 2, 0, 37, 0, 0, 0, 1],
[ 0, 0, 0, 0, 0, 0, 37, 0, 0, 0],
[ 0, 0, 0, 2, 0, 0, 0, 35, 0, 1],
[ 0, 0, 0, 6, 1, 0, 0, 1, 31, 1],
[ 0, 0, 0, 0, 2, 0, 0, 0, 1, 33]], dtype=int64)
The second question is much easier to deal with. The thing is in RBF kernel the gamma denotes how wiggly the decision boundary would be. What do we mean by "wiggly"? The higher the value of gamma more precise the decision boundary would be. Decision boundary of the SVM.
if gamma='scale' (default) is passed then it uses 1 / (n_features *X.var()) as value of gamma,
if ‘auto’, uses 1 / n_features.
In the second case the gamma is higher. For MNIST standard deviation is less than 1. As a result the second decision boundary is much more precise giving a better result than the previous case.
Here's the problem: I have two vectors A (1Xn) and B (1Xm) where n>m. I'm looking for a matrix T (nXm), such that AT=B. T has the following properties: All elements of T are either 1's or 0's. The elements in each row in T sum to 1. Ideally, I would like the program to return the best solution where as many elements of AT-B=0 if there is not a perfect solution.
Here's an example:
import numpy as np
A = np.array([-1.051, 1.069, 0.132, -0.003, -0.001, 0.066, -0.28,
-0.121, 0.075, 0.006, 0.229, -0.018, -0.213, -0.11])
B = np.array([-1.051, 1.201, -0.003, -0.001, 0.066, -0.121, 0.075,
-0.045,-0.231, -0.11])
T = np.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])
# This should equal a vector of 0's
print A.dot(T)-B
I've come up with something, but I don't think it's totally satisfactory. I'd prefer not to have to go this route as it's clunky. I first try 1 to 1 mapping because that's a common solution to many of the mappings. Anything left over I move to iterating over all possibilities. This gets messy a bit quickly. You'll also notice I'm fairly new to numpy. I'd appreciate any feedback on how to improve this. Thanks.
def solver(B,A):
m=B.size
n=A.size
start=np.ones((1,n))
start=np.concatenate((start,np.zeros((m-1,n))),axis=0).astype(np.int)
for i in xrange(0,m):
T=np.roll(start,i,axis=0)
test=B.dot(T)-A
if i==0:
matches=np.absolute(test)<.0001
else:
matches=np.vstack((matches,np.absolute(test)<.0001))
rA=(A-B.dot(matches))[np.absolute(A-B.dot(matches))>.0001]
Amissing=A-B.dot(matches)
rB=(B-B*np.sum(matches,axis=1))[np.absolute(B-B*np.sum(matches,axis=1))>.0001]
Bmissing=B-B*np.sum(matches,axis=1)
rm=rB.size
rn=rA.size
rmxrn = np.arange(rm*rn).reshape(rm,rn)
dif=np.absolute(rA)
best=np.zeros(shape=(rm,rn))
for i in xrange(0, 2**(rm*rn)):
arr = (i >> rmxrn) % 2
if np.amax(np.sum(arr,axis=1))>1 or np.sum(arr)>rm:
continue
else:
diftemp=rB.dot(arr)-rA
besttemp=arr
if np.sum(np.absolute(diftemp))<np.sum(np.absolute(dif)):
dif=diftemp
best=besttemp
if np.sum(np.absolute(dif)<.0001)==rn:
break
best=best.astype(np.bool)
matchesT=matches.T
bestT=best.T
newbestT=np.zeros(shape=(m,rn)).astype(np.bool).T
for x in xrange(0,rn):
counter=0
for i, value in enumerate(Bmissing):
if abs(Bmissing[i])>.0001:
newbestT[x,i]=bestT[x,counter]
counter=counter+1
for x in xrange(0,rn):
counter=0
for i, value in enumerate(Amissing):
if abs(Amissing[i])>.0001:
matchesT[i]=newbestT[counter]
counter=counter+1
return(matchesT.T)
A=np.array([-1.051,1.201,-0.003,-0.001,0.066,-0.121,0.075,-0.045,-0.231,-0.11])
B=np.array([-1.051,1.069,0.132,-0.003,-0.001,0.066,-0.28,-0.121,0.075,0.006,0.229,-0.018,-0.213,-0.11])
print solver(B,A)
I'm still solving this problem, taken from the current "Google Foobar" challenge. It's a variation of the "Lights Out" game, in which pressing a light will flip the state of every light on the same row and the same column.
I previously tried using a BFS, which turned out to be too slow for n > 6, while I need to handle 2 < n < 16. I currently have a program that can handle all even n and all odd numbers except 13 and 15. Here's what it does:
I use the strategy outlined by #Aryabhata to find a special solution x' of some system Ax = b that can be associated with an instance of this problem (see here for details).
Having found a base of the null space of A, I compute all sums of x' plus a linear combination of the vectors of the base.
The set of those sums is the set of all solutions of the original problem, therefore I can find by brute-force the solution that achieves the minimum.
It should be noted that, for n even, the null space is empty (A is invertible), therefore x' achieves the minimum because it's the only solution. If n is odd the number of vectors in a base of the null space is 2n - 2, therefore the search space has size 2^(2n - 2), which is 2^28 in the worst case (n = 15).
Here's my program:
from itertools import product
MEMO = {}
def bits(iterable):
bit = 1
res = 0
for elem in iterable:
if elem:
res |= bit
bit <<= 1
return res
def mask(current, n):
if (current, n) in MEMO:
return MEMO[(current, n)]
result = 0
if current < n:
for j in xrange(n):
result += (2 ** ((current - 1)*n + j) + 2 ** (current*n + j))
else:
for i in xrange(n):
result += (2 ** (i*n + current - n) + 2 ** (i*n + current - n + 1))
MEMO[(current, n)] = result
return result
# See: https://math.stackexchange.com/a/441697/4471
def check(matrix, n):
parities = [sum(row) % 2 for row in matrix]
for i in xrange(n):
parities.append(sum([row[i] for row in matrix]) % 2)
return len(set(parities)) == 1
def minimize(matrix, current, n):
if current == 0:
# See: https://stackoverflow.com/a/9831671/374865
return bin(matrix).count("1")
else:
return min(minimize(matrix ^ mask(current, n), current - 1, n),
minimize(matrix, current - 1, n))
def solve(matrix, n):
result = [0 for i in xrange(n) for j in xrange(n)]
for i, j in product(xrange(n), repeat=2):
if matrix[i][j]:
for k in xrange(n):
result[i*n + k] ^= 1
result[k*n + j] ^= 1
result[i*n + j] ^= 1
if n % 2 == 0:
return sum(result)
else:
return minimize(bits(result), 2*n - 2, n)
def answer(matrix):
n = len(matrix)
if n % 2 == 0:
return solve(matrix, n)
else:
if check(matrix, n):
return solve(matrix, n)
else:
return -1
I've already tried optimizing it: for instance, matrices are encoded as binary numbers by the function bits, while the function mask creates binary masks that are used to add a single element of the base to x'. Those masks are also memoized because they are frequently used, so that they are calculated only once.
The number of ones is then counted using the idiom bin(n).count('1'), which should be the fastest implementation (I checked it against the classical one by Kernighan).
So, what else can I do to squeeze more performance out of my program? Here are a few test cases:
print answer([
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
]), 1
print answer([
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
]), 14
print answer([
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
]), 15
print answer([
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
]), 14
print answer([
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
]), 15
EDIT: I passed this round. This implementation solves correctly 4 out of 5 test cases, then I brute-forced the fifth. I'm still interested in further optimizations or different algorithms!
EDIT 2: This answer, and in particular this paper give a proof that this particular problem is NP-hard (Section 3), which hints that we shouldn't be looking for a polynomial algorithm. So the question becomes: "What is the best exponent we can get?".
I tried everything about linear algebra, and since it is GF2, I do not think I could find the polynomial solution. Since the maximum number is 15, I further optimised it to approximately 2^15.
For even number
So, for n is even, there is a quicker way than standard linear algebra. If you have for example something like this,
0000
0100
0000
0000
The one solution should be (flip the row and column of the point exactly n times)
0100
1111
0100
0100
If you think about it, if you have a point which you want to flip, you can flip every point of the row and column once. (If that make sense), so it is easy to find one particular solution.
If I have something like this
0100
0010
0010
0000
one solution could be
1131
1221
1221
0120
and since flipping twice makes no difference, the solution can be reduced to
1111
1001
1001
0100
Then odd number
If n is odd, I can think of nothing but search. However, we can expand the n -> n+1 such that the solution to the problem should not contain flipping points of last row and last column.
If you have something 3x3 like:
010
001
001
you can always try expand solution to something like:
010x
001x
001x
xxxx
First, you will determine all the points in 3 by 3,
1111
1001 + ?
1001
0100
where ? should be solution to
000x
000x
000x
xxxx
As you can see, no matter how to flip, there is no way you can satisfy unless the xxx are the same bits. And you can try all the combination of the bottom to flip, and then you can determine the right hand side flipping or not by determine whether flipping results minimum number of 1 of the row.
I am really bad at explaining things, hope it will be clear enough.
I want to echo that darwinsenior's answer is very helpful! However, it took me a very long time to figure it out, even after reading that answer several times.
So, if you're late to foobar, like me, but want to get through this one without resorting to Java, here's a hint that might help.
The following light pattern isn't solvable, which I think is what confused me.
010
001
001
Here's a non-trivial example to demonstrate darwinsenior's idea:
Say you want to solve this (N=5)
11010
01000
11100
10011
00010
We know this is solvable because the parity of all sums and columns is odd.
And if N were even, it would be easier to find the answer.
So, expand to N=6 as follows:
110100
010000
111000
100110
000100
000000
Like darwinsenior said, we want a solution to this that doesn't touch any lights in the bottom row or right-most column. Then we could take that solution, ignore the bottom row and right column and we'd have a solution to the original N=5 problem. So, we want to plug in values (not just zeros) but not have any button pushes in those columns in your answer.
This means you can't put a 1 in the bottom right. A light in the bottom right would mean at least 1 button pushed in the bottom row or right-most column. So that's one "degree of freedom" gone. Next, for the bottom row to have no button pushes, all those lights must have an even parity. Look to the answer to even N case to see why. So in the case above the parity is odd. We can fill the bottom row, but we must use an odd number of 1's. This removes another "degree of freedom". If we plug in 4 values (either 1s or 0s) then the 5th value is determined by this parity requirement. So, N-1 degrees of freedom here.
This is where the brute force part comes in. I had to try all possible values here (in this case all sets of 5 bits with odd parity)
One example is to plug in 10101
110100
010000
111000
100110
000100
10101_
Now we can use the rule for even N and get a solution.
I'll write down the actual sum of row and column for each point, even though just the parity is needed in order to make it clearer what I did.
65555o 01111o
53343o 11101o
65465o -> 01001o
66554o 00110o
54333o 10111o
66464_ 00000_
I put little o's on the far right to say that the parity is odd, and because we haven't done anything with those yet. Because the parity is odd, this is no good, we would have a solution with all these being touched. But they all have odd parity, so we just need to plug in values such that the parity of the right-most column is odd, so the parity at each point is even (if that makes sense)
This is what darwinsenior said in this comment above (but I had a tough time following) The only requirement is that the parity of the column is odd and therefore no buttons on far right need to be pushed in the solution.
We don't need to brute force this, we can use some logic to figure out which buttons to push while maintaining the parity requirement. By the way, we have N-1 free variables here, so 2*(N-1) free variables, just as in other solutions mentioned. Just that here we can see the effect of our choices on the button push count. I'll choose these values for the column: 11001
Now the example is:
110101 X00000
010001 000X00
111000 -- again use even N solution -> 0X00X0
100110 00XX00
000101 0X0000
10101_ 000000
So, I think that gives us an answer to the original N=5 case (just remove the zeros at bottom and at right). It has 7 button pushes, which I think is the best we can do with this one, but I'm not sure.
One more thing- even with this big reduction in the number of cases that need to be brute forced, I still had to do what Eugene said and use a list of ints, not a list of list of ints. Look to Jacopo's code and the "bits" function for that. Fun stuff.
So I think you shouldn't need to brute force the odd case at all. My linear isn't too strong, but in R^n, if you want to find the shortest x satisfying Ax=b (which is essentially what we're doing), after finding some special solution x' you can project onto the nullspace of A and subtract the projection from x'. I believe this method should work even in F_2, though I'm not sure; please correct me if I'm wrong.
I like how #rustonian clarified the top answer on here, but there is one assumption that he took that I believe to be wrong, and that assumption is that the bottom right most bit of the added column and row can not be 1. It in fact can be 1 so that it may change all of the other added bits to 0. Here is an example of what I mean:
011 0110 4434 0010
110 -> 1100 -> 3444 -> 1000
101 1011 4644 0000
0101 4443 0001
So it seems that the bottom right bit can be 1 iff it used to turn off all other added bits. This will not take away from the 3x3 solution since the toggling the bottom right added bit does not effect the original 3x3 space.
I'm in the process of novelty detection using machine-learning. I have tried using one-class svm in scikit learn.
from sklearn import svm
train_data = [[0, 0, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1], [0, 3, 0, 0, 0, 1, 0, 0], [0, 11, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 4]]
test_data = [[0, 0, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 1, 0, 0]]
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(train_data)
pred_test = clf.predict(test_data)
I'm new to this area and I want to know how can I say there is novelty in my test data?
The inliers are labeled 1, and the outliers (i.e., the novelties in your case) are labeled -1 (as the result of the predict function).
Please notice that the current documentation incorrectly states that the outliers are labeled 1 & inliers are labeled 0. Please check out the latest updates on github repo for the correct information.
check = clf.predict(test_data)
if check = 1 then not anomaly and
if check = -1 then it an anomaly i.e. data is outlier
I am still coding a fingerprint image preprocessor on Python. I see in MATLAB there is a special function to remove H breaks and spurs:
bwmorph(a , 'hbreak')
bwmorph(a , 'spur')
I have searched scikit, OpenCV and others but couldn't find an equivalent for these two use of bwmorph. Can anybody point me to right direction or do i have to implement my own?
Edit October 2017
the skimage module now has at least 2 options:
skeletonize and thin
Example with comparison
from skimage.morphology import thin, skeletonize
import numpy as np
import matplotlib.pyplot as plt
square = np.zeros((7, 7), dtype=np.uint8)
square[1:-1, 2:-2] = 1
square[0, 1] = 1
thinned = thin(square)
skel = skeletonize(square)
f, ax = plt.subplots(2, 2)
ax[0,0].imshow(square)
ax[0,0].set_title('original')
ax[0,0].get_xaxis().set_visible(False)
ax[0,1].axis('off')
ax[1,0].imshow(thinned)
ax[1,0].set_title('morphology.thin')
ax[1,1].imshow(skel)
ax[1,1].set_title('morphology.skeletonize')
plt.show()
Original post
I have found this solution by joefutrelle on github.
It seems (visually) to give similar results as the Matlab version.
Hope that helps!
Edit:
As it was pointed out in the comments, I'll extend my initial post as the mentioned link might change:
Looking for a substitute in Python for bwmorph from Matlab I stumbled upon the following code from joefutrelle on Github (at the end of this post as it's very long).
I have figured out two ways to implement this into my script (I'm a beginner and I'm sure there are better ways!):
1) copy the whole code into your script and then call the function (but this makes the script harder to read)
2) copy the code it in a new python file 'foo' and save it. Now copy it in the Python\Lib (eg. C:\Program Files\Python35\Lib) folder. In your original script you can call the function by writing:
from foo import bwmorph_thin
Then you'll feed the function with your binary image:
skeleton = bwmorph_thin(foo_image, n_iter = math.inf)
import numpy as np
from scipy import ndimage as ndi
# lookup tables for bwmorph_thin
G123_LUT = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
0, 0, 0], dtype=np.bool)
G123P_LUT = np.array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0], dtype=np.bool)
def bwmorph_thin(image, n_iter=None):
"""
Perform morphological thinning of a binary image
Parameters
----------
image : binary (M, N) ndarray
The image to be thinned.
n_iter : int, number of iterations, optional
Regardless of the value of this parameter, the thinned image
is returned immediately if an iteration produces no change.
If this parameter is specified it thus sets an upper bound on
the number of iterations performed.
Returns
-------
out : ndarray of bools
Thinned image.
See also
--------
skeletonize
Notes
-----
This algorithm [1]_ works by making multiple passes over the image,
removing pixels matching a set of criteria designed to thin
connected regions while preserving eight-connected components and
2 x 2 squares [2]_. In each of the two sub-iterations the algorithm
correlates the intermediate skeleton image with a neighborhood mask,
then looks up each neighborhood in a lookup table indicating whether
the central pixel should be deleted in that sub-iteration.
References
----------
.. [1] Z. Guo and R. W. Hall, "Parallel thinning with
two-subiteration algorithms," Comm. ACM, vol. 32, no. 3,
pp. 359-373, 1989.
.. [2] Lam, L., Seong-Whan Lee, and Ching Y. Suen, "Thinning
Methodologies-A Comprehensive Survey," IEEE Transactions on
Pattern Analysis and Machine Intelligence, Vol 14, No. 9,
September 1992, p. 879
Examples
--------
>>> square = np.zeros((7, 7), dtype=np.uint8)
>>> square[1:-1, 2:-2] = 1
>>> square[0,1] = 1
>>> square
array([[0, 1, 0, 0, 0, 0, 0],
[0, 0, 1, 1, 1, 0, 0],
[0, 0, 1, 1, 1, 0, 0],
[0, 0, 1, 1, 1, 0, 0],
[0, 0, 1, 1, 1, 0, 0],
[0, 0, 1, 1, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 0]], dtype=uint8)
>>> skel = bwmorph_thin(square)
>>> skel.astype(np.uint8)
array([[0, 1, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0]], dtype=uint8)
"""
# check parameters
if n_iter is None:
n = -1
elif n_iter <= 0:
raise ValueError('n_iter must be > 0')
else:
n = n_iter
# check that we have a 2d binary image, and convert it
# to uint8
skel = np.array(image).astype(np.uint8)
if skel.ndim != 2:
raise ValueError('2D array required')
if not np.all(np.in1d(image.flat,(0,1))):
raise ValueError('Image contains values other than 0 and 1')
# neighborhood mask
mask = np.array([[ 8, 4, 2],
[16, 0, 1],
[32, 64,128]],dtype=np.uint8)
# iterate either 1) indefinitely or 2) up to iteration limit
while n != 0:
before = np.sum(skel) # count points before thinning
# for each subiteration
for lut in [G123_LUT, G123P_LUT]:
# correlate image with neighborhood mask
N = ndi.correlate(skel, mask, mode='constant')
# take deletion decision from this subiteration's LUT
D = np.take(lut, N)
# perform deletion
skel[D] = 0
after = np.sum(skel) # coint points after thinning
if before == after:
# iteration had no effect: finish
break
# count down to iteration limit (or endlessly negative)
n -= 1
return skel.astype(np.bool)
"""
# here's how to make the LUTs
def nabe(n):
return np.array([n>>i&1 for i in range(0,9)]).astype(np.bool)
def hood(n):
return np.take(nabe(n), np.array([[3, 2, 1],
[4, 8, 0],
[5, 6, 7]]))
def G1(n):
s = 0
bits = nabe(n)
for i in (0,2,4,6):
if not(bits[i]) and (bits[i+1] or bits[(i+2) % 8]):
s += 1
return s==1
g1_lut = np.array([G1(n) for n in range(256)])
def G2(n):
n1, n2 = 0, 0
bits = nabe(n)
for k in (1,3,5,7):
if bits[k] or bits[k-1]:
n1 += 1
if bits[k] or bits[(k+1) % 8]:
n2 += 1
return min(n1,n2) in [2,3]
g2_lut = np.array([G2(n) for n in range(256)])
g12_lut = g1_lut & g2_lut
def G3(n):
bits = nabe(n)
return not((bits[1] or bits[2] or not(bits[7])) and bits[0])
def G3p(n):
bits = nabe(n)
return not((bits[5] or bits[6] or not(bits[3])) and bits[4])
g3_lut = np.array([G3(n) for n in range(256)])
g3p_lut = np.array([G3p(n) for n in range(256)])
g123_lut = g12_lut & g3_lut
g123p_lut = g12_lut & g3p_lut
"""`
You will have to implement those on your own since they aren't present in OpenCV or skimage as far as I know.
However, it should be straightforward to check MATLAB's code on how it works and write your own version in Python/NumPy.
Here is a guide describing in detail NumPy functions exclusively for MATLAB users, with hints on equivalent functions in MATLAB and NumPy:
Link