I am trying to reproduce the algorithm described in the Isolation Forest paper in Python: http://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf?q=isolation
This is my current code:
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
def _h(i):
return np.log(i) + 0.5772156649
def _c(n):
if n > 2:
h = _h(n-1)
return 2*h - 2*(n - 1)/n
if n == 2:
return 1
else:
return 0
def _anomaly_score(dict_scores, n_samples):
score = np.array([np.mean(dict_scores[k]) for k in dict_scores.keys()])
score = -score/_c(n_samples)
return 2**score
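# For reference, _h, _c and _anomaly_score implement the paper's formulas:
# c(n) = 2*H(n-1) - 2*(n-1)/n is the average path length of an unsuccessful
# BST search, with H(i) ~ ln(i) + 0.5772156649 (the Euler-Mascheroni constant),
# and the anomaly score is s(x, n) = 2**(-E[h(x)]/c(n)), where E[h(x)] is the
# mean path length of sample x over the trees.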
def _split_data(X):
    ''' split the data into the left and right nodes '''
n_samples, n_columns = X.shape
n_features = n_columns - 1
feature_id = np.random.randint(low=0, high=n_features-1)
feature = X[:, feature_id]
split_value = np.random.choice(feature)
left_X = X[feature <= split_value]
right_X = X[feature > split_value]
return left_X, right_X, feature_id, split_value
def iTree(X, add_index=False, max_depth = np.inf):
    ''' Construct an isolation tree and return the number of steps required
    to isolate each sample. A column of indices is added to the input matrix X
    if add_index=True. This column is required by the algorithm. '''
n_split = {}
def iterate(X, count = 0):
n_samples, n_columns = X.shape
n_features = n_columns - 1
if count > max_depth:
for index in X[:,-1]:
n_split[index] = count
return
if n_samples == 1:
index = X[0, n_columns-1]
n_split[index] = count
return
else:
lX, rX, feature_id, split_value = _split_data(X)
# Uncomment the print to visualize a draft of
# the construction of the tree
#print(lX[:,-1], rX[:,-1], feature_id, split_value, n_split)
n_samples_lX, _ = lX.shape
n_samples_rX, _ = rX.shape
if n_samples_lX > 0:
iterate(lX, count+1)
if n_samples_rX >0:
iterate(rX, count+1)
if add_index:
n_samples, _ = X.shape
X = np.c_[X, range(n_samples)]
iterate(X)
return n_split
class iForest():
    ''' Class to construct the isolation forest.
    - n_estimators: number of trees in the forest
    - sample_size: size of the subsample used to build each tree
    - add_index: adds a column of indices to the matrix X. This is required;
      add_index can be set to False only if the last column of X already
      contains indices.
    - max_depth: maximum depth of each tree
    '''
def __init__(self, n_estimators=20, sample_size=None, add_index = True,
max_depth = 100):
self.n_estimators = n_estimators
self.sample_size = sample_size
self.add_index = add_index
self.max_depth = max_depth
return
def fit(self, X):
n_samples, n_features = X.shape
        if self.sample_size is None:
self.sample_size = int(n_samples/2)
if self.add_index:
X = np.c_[X, range(n_samples)]
trees = [iTree(X[np.random.choice(n_samples,
self.sample_size,
replace=False)],
max_depth=self.max_depth)
for i in range(self.n_estimators)]
self.all_anomaly_score_ = {k:None for k in range(n_samples)}
for k in self.all_anomaly_score_.keys():
self.all_anomaly_score_[k] = np.array([tree[k]
for tree in trees
if k in tree])
self.anomaly_score_ = _anomaly_score(self.all_anomaly_score_, n_samples)
return self
The main part of the code is the iTree function, which returns a dictionary with the number of steps required to isolate each sample.
A column of indices is attached to the input matrix X to make it easier to track which samples end up in each node.
When I compare the results obtained with my code against those obtained with the isolation forest implementation available for R, I get different results.
Consider for example the stackloss dataset:
data = pd.read_csv("stackloss.csv")
X = data.to_numpy()[:, 1:]  # .as_matrix() has been removed from recent pandas
max_depth = 100
itree = iTree(X, add_index=True, max_depth=max_depth) #example of isolation tree
iforest = iForest(n_estimators=1, max_depth=max_depth, sample_size=21) # isolation forest
iforest.fit(X)
sol = np.argsort(iforest.anomaly_score_)
#correct sol = [10 5 4 8 12 9 11 17 6 19 7 14 13 15 18 3 20 16 2 1 0]
sol often differs from the correct solution obtained with the R package:
https://r-forge.r-project.org/projects/iforest/
The correct solution in R was obtained with:
> tr = IsolationTrees(stackloss,ntree = 100000,hlim = 100, rFactor = 1)
> as = AnomalyScore(stackloss, tr)
> order(as$outF)
[1] 11 6 5 9 13 10 12 18 7 20 8 15 14 16 19 4 21 17 3 2 1
> order(as$outF)-1
[1] 10 5 4 8 12 9 11 17 6 19 7 14 13 15 18 3 20 16 2 1 0
Where is the mistake?
I have finally been able to solve the problem.
The code is still slow because of the copy operations performed on the data at each split.
This is the working version of the algorithm.
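For reference, the decisive fixes relative to the first version are in _split_data: np.random.randint(low=0, high=n_features) now allows every feature to be picked (numpy's upper bound is exclusive, so high=n_features-1 excluded the last feature), the loop retries until it finds a feature that actually varies, and the split value is drawn uniformly between the feature's min and max as the paper prescribes, instead of from the observed values, where drawing the maximum leaves one side of the split empty.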
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import pandas as pd
def _h(i):
return np.log(i) + 0.5772156649
def _c(n):
if n > 2:
h = _h(n-1)
return 2*h - 2*(n - 1)/n
if n == 2:
return 1
else:
return 0
def _anomaly_score(score, n_samples):
score = -score/_c(n_samples)
return 2**score
def _split_data(X):
    ''' split the data into the left and right nodes '''
n_samples, n_columns = X.shape
n_features = n_columns - 1
m = M = 0
while m == M:
feature_id = np.random.randint(low=0, high=n_features)
feature = X[:, feature_id]
m = feature.min()
M = feature.max()
#print(m, M, feature_id, X.shape)
split_value = np.random.uniform(m, M, 1)
left_X = X[feature <= split_value]
right_X = X[feature > split_value]
return left_X, right_X, feature_id, split_value
def iTree(X, add_index=False, max_depth = np.inf):
    ''' Construct an isolation tree and return the number of steps required
    to isolate each sample. A column of indices is added to the input matrix X
    if add_index=True. This column is required by the algorithm. '''
n_split = {}
def iterate(X, count = 0):
n_samples, n_columns = X.shape
n_features = n_columns - 1
if count > max_depth:
for index in X[:,-1]:
n_split[index] = count
return
if n_samples == 1:
index = X[0, n_columns-1]
n_split[index] = count
return
else:
lX, rX, feature_id, split_value = _split_data(X)
# Uncomment the print to visualize a draft of
# the construction of the tree
#print(lX[:,-1], rX[:,-1], feature_id, split_value, n_split)
n_samples_lX, _ = lX.shape
n_samples_rX, _ = rX.shape
if n_samples_lX > 0:
iterate(lX, count+1)
if n_samples_rX >0:
iterate(rX, count+1)
if add_index:
n_samples, _ = X.shape
X = np.c_[X, range(n_samples)]
iterate(X)
return n_split
class iForest():
    ''' Class to construct the isolation forest.
    - n_estimators: number of trees in the forest
    - sample_size: size of the subsample used to build each tree
    - add_index: adds a column of indices to the matrix X. This is required;
      add_index can be set to False only if the last column of X already
      contains indices.
    - max_depth: maximum depth of each tree
    '''
def __init__(self, n_estimators=20, sample_size=None, add_index = True,
max_depth = 100):
self.n_estimators = n_estimators
self.sample_size = sample_size
self.add_index = add_index
self.max_depth = max_depth
return
def fit(self, X):
n_samples, n_features = X.shape
        if self.sample_size is None:
self.sample_size = int(n_samples/2)
if self.add_index:
X = np.c_[X, range(n_samples)]
trees = [iTree(X[np.random.choice(n_samples,
self.sample_size,
replace=False)],
max_depth=self.max_depth)
for i in range(self.n_estimators)]
self.path_length_ = {k:None for k in range(n_samples)}
for k in self.path_length_.keys():
self.path_length_[k] = np.array([tree[k]
for tree in trees
if k in tree])
self.path_length_ = np.array([self.path_length_[k].mean() for k in
self.path_length_.keys()])
self.anomaly_score_ = _anomaly_score(self.path_length_, self.sample_size)
return self
self.anomaly_score_ = _anomaly_score(self.all_anomaly_score_, n_samples)
You are calculating _anomaly_score with n_samples, which is the total number of samples. However, you are building the trees on subsamples, so when calculating the average search length _c(n) you should use sample_size instead of n_samples. I believe your code should be:
self.anomaly_score_ = _anomaly_score(self.all_anomaly_score_, self.sample_size)
There is a pull-request in scikit-learn: https://github.com/scikit-learn/scikit-learn/pull/4163
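To see why this matters, here is a minimal sketch (reusing _c from the question) showing that the normalizing constant grows with n, so dividing the mean path lengths by _c(n_samples) instead of _c(sample_size) rescales every score:
import numpy as np

def _c(n):
    # average path length of an unsuccessful BST search on n points
    if n > 2:
        return 2 * (np.log(n - 1) + 0.5772156649) - 2 * (n - 1) / n
    return 1.0 if n == 2 else 0.0

# with the default sample_size = n_samples / 2, e.g. 21 vs 42:
print(_c(21))   # ~5.24, the constant the trees were actually built with
print(_c(42))   # ~6.63, dividing by this larger value rescales every score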
Donbeo, your code works pretty well with just a few minor adjustments. The main problem is that you missed one of the base cases (end conditions) of the recursive algorithm, so it hangs in a loop when that condition comes up. You need something to this effect in the _split_data function (shown in the code below), and you also need to handle this case in the iterate function (not shown here; see the sketch after the code).
minv = maxv = 0
inspected = set()  # tracks the candidate features we have already inspected
while minv == maxv:
    # Check whether we have run out of features to try and none of them has
    # distinct values; if so we must break the loop, otherwise it loops forever.
    if len(inspected) == n_features:
        # No feature can split X any further:
        # return -1 to signal the caller that we can't split X anymore.
        return X, X, -1, None
    feature_id = np.random.randint(low=0, high=n_features)
    if feature_id not in inspected:
        inspected.add(feature_id)
        split_feature = X[:, feature_id]
        minv = split_feature.min()
        maxv = split_feature.max()
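For completeness, a minimal sketch of the corresponding change in the iterate function (my own wording, since the original was not shown): when _split_data returns feature_id == -1 to signal that X cannot be split any further, record the current depth for all remaining rows and stop recursing:
lX, rX, feature_id, split_value = _split_data(X)
if feature_id == -1:
    # all remaining rows are identical on every feature: treat this as a leaf
    for index in X[:, -1]:
        n_split[index] = count
    return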
Related
I am customizing a DQN agent to solve a circuit problem. The state is a 1-D input of shape (5,) representing the values of five nodes (node_0 to node_4). An action picks one of six components (with values [0, 1, 2, 3, 4, 5]) to place in the circuit, which yields a new state state_, so the action space has size 6. My goal is to push all five node values in a state above 0.95. For example, if the initial state is [0.8, 0.7, 0.9, 0.98, 0.9] and I place a component of value 3 at node_0, that node goes from 0.8 to 0.95 and meets the requirement; node_3 needs no component because it is already at 0.98. I also impose the constraint that the sum of the placed component values cannot exceed 10.
Here are the hyperparameters:
gamma = 0.9
TARGET_REPLACE_ITER = 500
nodes = 5
memory_capability = 1000
batch_size = 30
epsilon_start =1
epsilon_end = 0.0001
epsilon_decay = 0.06
learning_rate = 0.001
epsilon = 1
n_state = 5
n_action = 6
I use two neural networks, an eval_net and a target_net; the code is below:
class NN(nn.Module):
def __init__(self, ):
super(NN,self).__init__()
self.fc1 = nn.Linear(n_state, 16)
self.fc1.weight.data.normal_(0, 0.1)
self.fc2 = nn.Linear(16,32)
self.out = nn.Linear(32, n_action)
self.out.weight.data.normal_(0, 0.1)
def forward(self, x):
x = self.fc1(x)
x = F.relu(x)
x = self.fc2(x)
x = F.relu(x)
action_value = self.out(x)
return action_value
Here is the agent:
class Ckt_Opt(object):
def __init__(self,):
self.learn_step_counter = 0
self.memory = np.zeros((memory_capability, n_state * 2 + 2))
self.memory_cntr = 0
self.eval_net, self.target_net = NN(), NN()
self.loss_func = nn.MSELoss()
self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)
def choose_action(self, state):
state = torch.unsqueeze(torch.FloatTensor(state),0)
if random.random() < epsilon:
            action = random.randint(0, len(com_data) - 1)  # choose a random component value; com_data is the list of component values, from 0 to 5
else:
action_value = self.eval_net.forward(state)
            action = torch.max(action_value, 1)[1].numpy()[0]  # pick the action with the maximum Q value
return action
def step(self,action):
        self.ran_node = random.choice([a for a, x in enumerate(vio_node) if x == 1])  # x == 1 means that node's value is below 0.95; pick a random violating node to place a component at
        str1 = '.param cap_0_%d_val=%e\n' % (self.ran_node, com_data[action])
        self.decap_param[self.ran_node] = com_data[action]  # these two lines place the chosen component (the details don't matter here)
def learn(self):
# target net update
if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
self.target_net.load_state_dict(self.eval_net.state_dict())
self.learn_step_counter += 1
# sample from memory
sample_index = np.random.choice(memory_capability, batch_size)
b_memory = self.memory[sample_index, :]
b_s = torch.FloatTensor(b_memory[:, :n_state])
b_a = torch.LongTensor(b_memory[:, n_state:n_state + 1].astype(int))
b_r = torch.FloatTensor(b_memory[:, n_state + 1:n_state + 2])
b_s_ = torch.FloatTensor(b_memory[:, -n_state:])
q_eval = self.eval_net(b_s).gather(1, b_a) # shape (batch, 1)
q_next = self.target_net(b_s_).detach()
q_target = b_r + gamma * q_next.max(1)[0] # shape (batch, 1)
loss = self.loss_func(q_eval, q_target)
        # backpropagate the loss and update eval_net's parameters
        self.optimizer.zero_grad()
        loss.backward()
self.optimizer.step()
The main loop is below:
ckt = Ckt_Opt()
for i in range(0,50):
ckt.reset() # no component is placed,get initial state
state = read_result() # a function to read state after taking action,here is reading initial state
for j in range(500):
        action = ckt.choose_action(state)
        state_, Cof, vio_node = ckt.step(action)
        # Cof flags whether the sum of placed component values exceeds the limit (1 = over the limit, 0 = not); vio_node marks the nodes still below the threshold
        reward = sum(-((state_ - 0.95) * vio_node)**2 * 500) + (nodes - sum(vio_node)) if Cof == 0 else \
                 sum(-((state_ - 0.95) * vio_node)**2 * 5000)
        # The quadratic penalties prioritize placing components at nodes with low values: if node_0 is 0.6 its penalty (negative reward) is -(0.6 - 0.95)^2 * 500 = -61.25, while node_1 at 0.9 incurs only -(0.9 - 0.95)^2 * 500 = -1.25.
        ckt.store_transition(state, action, reward, state_)  # store the transition in the experience replay memory
state = state_
epsilon = epsilon_end +(epsilon_start - epsilon_end) * math.exp(-1. *epsilon_decay * i)
My goal is to find the solution with the best reward. For example, with initial state [0.6, 0.7, 0.8, 0.9, 0.97] and placed component values [5, 4, 0, 1, 0], the best reachable state might be [0.85, 0.9, 0.91, 0.93, 0.97]; for some reason it can't push every value above 0.95.
But I ran it many times and always get a weird solution like [1, 1, 1, 1, 0] or [2, 2, 2, 2, 0], which makes no sense. I think something must be wrong in the choose_action or learn function, but I can't find it because I am new to DQN.
Could anyone help me? Thanks a lot.
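One thing worth double-checking (an assumption on my part, not a confirmed diagnosis): in learn(), q_next.max(1)[0] has shape (batch,), while b_r has shape (batch, 1), so broadcasting makes q_target shape (batch, batch) rather than (batch, 1). Reshaping the max restores the intended target:
q_target = b_r + gamma * q_next.max(1)[0].view(batch_size, 1)  # shape (batch, 1)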
I'm using the kmodes library's k-prototypes for mixed numerical and nominal data. Following https://github.com/nicodv/kmodes/issues/46,
to calculate the silhouette score for k-prototypes I compute the silhouette of the categorical data (based on Hamming distance) and the silhouette of the numerical data (based on Euclidean distance). But the code I developed is pretty slow: it takes 10 hours to compute the silhouette for 60,000 records. My laptop has 12 GB of RAM and a Core i7.
Any help to improve the speed of the code, please?
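For reference, the silhouette of a single sample is s(i) = (b(i) - a(i)) / max(a(i), b(i)), where a(i) is its mean distance to the points of its own cluster and b(i) is the smallest mean distance to the points of any other cluster; the piecewise if/elif in the code below is an equivalent formulation. Here the distance is the k-prototypes one: Euclidean on the numerical columns plus gamma times simple matching on the categorical column.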
import numpy as np
import pandas as pd
from kmodes.kprototypes import KPrototypes
# -------- import data
df = pd.read_csv(r'C:\Users\data.csv')
# ------------- Normalize the data ---------------
# print(df.columns) # To get columns name
x_df = df[['R', 'F']]
x_df_norm = x_df.apply(lambda x: (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0)))
x_df_norm['COType'] = df[['COType']]
def calc_euclian_dis(_s1, _s2):
    # Euclidean distance between two numerical attribute vectors, e.g. np.array((3, 5))
    return np.linalg.norm(_s2 - _s1)
def calc_simpleMatching_dis(_s1, _s2):
    # simple matching distance: 0 if the categorical values match, 1 otherwise
    return 0 if _s1 == _s2 else 1
k = 3
# calculate silhouette for one cluster count
kproto = KPrototypes(n_clusters=k, init='Cao', verbose=2)
clusters_label = kproto.fit_predict(x_df_norm, categorical=[2])
_identical_cluster_labels = list(dict.fromkeys(clusters_label))
# Assign cluster labels to the dataset
x_df_norm['Cluster_label'] = clusters_label
# ------------- calculate _silhouette_Index -------------
# 1. Calculate ai
_silhouette_Index_arr = []
for i in x_df_norm.itertuples():
_ai_cluster_label = i[-1]
# return samples of the same cluster
_samples_cluster = x_df_norm[x_df_norm['Cluster_label'] == _ai_cluster_label]
_dist_array_ai = []
_s1_nume_att = np.array((i[1], i[2]))
_s1_cat_att = i[3]
for j in _samples_cluster.itertuples():
_s2_nume_att = np.array((j[1], j[2]))
_s2_cat_att = j[3]
_euclian_dis = calc_euclian_dis(_s1_nume_att, _s2_nume_att)
_cat_dis = calc_simpleMatching_dis(_s1_cat_att, _s2_cat_att)
_dist_array_ai.append(_euclian_dis + (kproto.gamma * _cat_dis))
ai = np.average(_dist_array_ai)
# 2. Calculate bi
# 2.1. determine the samples of other clusters
_identical_cluster_labels.remove(_ai_cluster_label)
_dic_cluseter = {}
_bi_arr = []
for ii in _identical_cluster_labels:
_samples = x_df_norm[x_df_norm['Cluster_label'] == ii]
# 2.2. calculate bi
_dist_array_bi = []
for j in _samples.itertuples():
_s2_nume_att = np.array((j[1], j[2]))
_s2_cat_att = j[3]
_euclian_dis = calc_euclian_dis(_s1_nume_att, _s2_nume_att)
_cat_dis = calc_simpleMatching_dis(_s1_cat_att, _s2_cat_att)
_dist_array_bi.append(_euclian_dis + (kproto.gamma * _cat_dis))
_bi_arr.append(np.average(_dist_array_bi))
_identical_cluster_labels.append(_ai_cluster_label)
# min bi is determined as final bi variable
bi = min(_bi_arr)
# 3. calculate silhouette Index
if ai == bi:
_silhouette_i = 0
elif ai < bi:
_silhouette_i = 1 - (ai / bi)
    elif ai > bi:
        _silhouette_i = (bi / ai) - 1  # (b - a)/max(a, b) is negative when a > b
_silhouette_Index_arr.append(_silhouette_i)
silhouette_score = np.average(_silhouette_Index_arr)
print('_silhouette_Index = ' + str(silhouette_score))
Hi! I reimplemented your function using linear algebra operations to compute the dissimilarities instead of nested for loops.
It is way faster :-)
def euclidean_dissim(a, b, **_):
"""Euclidean distance dissimilarity function
b is the single point, a is the matrix of vectors"""
if np.isnan(a).any() or np.isnan(b).any():
raise ValueError("Missing values detected in numerical columns.")
return np.linalg.norm(a - b, axis=1)
def matching_dissim(a, b, **_):
    """Simple matching dissimilarity function.
    b is the single point, a is the matrix of all other vectors;
    count how many values match (difference == 0)."""
    # subtract from the dimension because this is a dissimilarity, not a similarity
    dimension = len(b)
    return dimension - np.sum((b - a) == 0, axis=1)
def calc_silhouette_proto(dataset,numerical_pos, cat_pos,kproto_model):
'''------------- calculate _silhouette_Index -------------'''
# 1. Compute a(i)
silhouette_Index_arr = []
for i in dataset.itertuples():
# convert tuple to np array
i = np.array(i)
unique_cluster_labels = list(np.unique(dataset['cluster_labels']))
        # Each time, remove the considered tuple from the dataset, since we don't compute a point's distance to itself
data = dataset.copy()
ai_cluster = i[-1] # The cluster is in the last position of the tuple
# Removing the tuple from the dataset
tuple_index = dataset.index.isin([i[0]])
data = data[~tuple_index]
# Get samples of the same cluster
samples_of_cluster = data[data['cluster_labels'] == ai_cluster].loc[:,data.columns!='cluster_labels'].to_numpy()
        # Compute the two distances between this point and all the others
euclidian_distances = euclidean_dissim(samples_of_cluster[:,numerical_pos],i[np.array(numerical_pos)+1])
categ_distances = matching_dissim(samples_of_cluster[:,cat_pos],i[np.array(cat_pos)+1])
# Weighted average of the 2 distances
ai = np.average(euclidian_distances) + (kproto_model.gamma * np.average(categ_distances))
# 2. Calculate bi
unique_cluster_labels.remove(ai_cluster)
bi_arr = []
for ii in unique_cluster_labels:
# Get all the samples of cluster ii
samples = data[data['cluster_labels'] == ii].loc[:,data.columns!='cluster_labels'].to_numpy()
            # Compute the two distances between this point and all the others
euclidian_distances = np.linalg.norm(samples[:,numerical_pos] - i[np.array(numerical_pos)+1], axis=1)
categ_distances = matching_dissim(samples[:,cat_pos],i[np.array(cat_pos)+1])
distance_bi = np.average(euclidian_distances) + (kproto_model.gamma * np.average(categ_distances))
bi_arr.append(np.average(distance_bi))
# min bi is determined as final bi variable
if(len(bi_arr)==0):
bi = 0
else:
bi = min(bi_arr)
# 3. calculate silhouette Index
if ai == bi:
silhouette_i = 0
elif ai < bi:
silhouette_i = 1 - (ai / bi)
        elif ai > bi:
            silhouette_i = (bi / ai) - 1  # (b - a)/max(a, b) is negative when a > b
silhouette_Index_arr.append(silhouette_i)
silhouette_score = np.average(silhouette_Index_arr)
return silhouette_score
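A hypothetical usage example, assuming the column layout from the question (numerical columns 0 and 1, categorical column 2) and that the cluster labels sit in a column named 'cluster_labels' as the function expects:
x_df_norm = x_df_norm.rename(columns={'Cluster_label': 'cluster_labels'})
score = calc_silhouette_proto(x_df_norm, numerical_pos=[0, 1], cat_pos=[2],
                              kproto_model=kproto)
print(score)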
I am following the book "Learning From Data", which has the following exercise:
Create a linearly separable input data set.
Implement a single-layer perceptron on the generated data set.
To create a data set, it says to choose a two-dimensional plane and then a random line in that plane. Points on one side of the line are classified positive and points on the other side negative. I was able to follow this with the help of https://datasciencelab.wordpress.com/2014/01/10/machine-learning-classics-the-perceptron/
import numpy as np
import random
import os, subprocess
import matplotlib.pyplot as plt  # needed by the plot methods below
class Perceptron:
def __init__(self, N):
# Random linearly separated data
xA,yA,xB,yB = [random.uniform(-1, 1) for i in range(4)]
self.V = np.array([xB*yA-xA*yB, yB-yA, xA-xB])
self.X = self.generate_points(N)
def generate_points(self, N):
X = []
for i in range(N):
x1,x2 = [random.uniform(-1, 1) for i in range(2)]
x = np.array([1,x1,x2])
s = int(np.sign(self.V.T.dot(x)))
X.append((x, s))
return X
def plot(self, mispts=None, vec=None, save=False):
fig = plt.figure(figsize=(5,5))
plt.xlim(-1,1)
plt.ylim(-1,1)
V = self.V
a, b = -V[1]/V[2], -V[0]/V[2]
l = np.linspace(-1,1)
plt.plot(l, a*l+b, 'k-')
cols = {1: 'r', -1: 'b'}
for x,s in self.X:
plt.plot(x[1], x[2], cols[s]+'o')
if mispts:
for x,s in mispts:
plt.plot(x[1], x[2], cols[s]+'.')
        if vec is not None:
aa, bb = -vec[1]/vec[2], -vec[0]/vec[2]
plt.plot(l, aa*l+bb, 'g-', lw=2)
if save:
if not mispts:
plt.title('N = %s' % (str(len(self.X))))
else:
plt.title('N = %s with %s test points' \
% (str(len(self.X)),str(len(mispts))))
plt.savefig('p_N%s' % (str(len(self.X))), \
dpi=200, bbox_inches='tight')
def classification_error(self, vec, pts=None):
# Error defined as fraction of misclassified points
if not pts:
pts = self.X
M = len(pts)
n_mispts = 0
for x,s in pts:
if int(np.sign(vec.T.dot(x))) != s:
n_mispts += 1
error = n_mispts / float(M)
return error
def choose_miscl_point(self, vec):
# Choose a random point among the misclassified
pts = self.X
mispts = []
for x,s in pts:
if int(np.sign(vec.T.dot(x))) != s:
mispts.append((x, s))
return mispts[random.randrange(0,len(mispts))]
def pla(self, save=False):
        # Initialize the weights to zeros
w = np.zeros(3)
X, N = self.X, len(self.X)
it = 0
# Iterate until all points are correctly classified
while self.classification_error(w) != 0:
it += 1
# Pick random misclassified point
x, s = self.choose_miscl_point(w)
# Update weights
w += s*x
if save:
self.plot(vec=w)
plt.title('N = %s, Iteration %s\n' \
% (str(N),str(it)))
plt.savefig('p_N%s_it%s' % (str(N),str(it)), \
dpi=200, bbox_inches='tight')
self.w = w
def check_error(self, M, vec):
check_pts = self.generate_points(M)
return self.classification_error(vec, pts=check_pts)
[Figure: 2-D linearly separable data generated by the code above]
I want to create an N-dimensional linearly separable data set in the same way,
for example a 10-dimensional set of points, where points on one side of a 9-dimensional hyperplane are classified positive and those on the other side negative.
I have no clue how to proceed. Any help is appreciated.
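Here is a minimal sketch of how the 2-D construction could generalize (my own extrapolation, not from the book): draw a random weight vector w and bias b defining the hyperplane w.x + b = 0 in d dimensions, then label each point by the side of the hyperplane it falls on:
import numpy as np

def generate_nd_points(N, d=10, seed=0):
    rng = np.random.default_rng(seed)
    w = rng.uniform(-1, 1, size=d)       # normal vector of the hyperplane
    b = rng.uniform(-1, 1)               # offset of the hyperplane
    X = rng.uniform(-1, 1, size=(N, d))  # points in the [-1, 1]^d hypercube
    y = np.sign(X @ w + b).astype(int)   # +1 on one side, -1 on the other
    return X, y

X, y = generate_nd_points(100, d=10)
The perceptron update w += s*x then carries over unchanged once each point is prepended with a constant 1, exactly as in the 2-D code.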
The code below is supposed to run a Bayes classifier for a full-covariance Gaussian (http://courses.ee.sun.ac.za/Pattern_Recognition_813/lectures/lecture03/node2.html), but I get two warnings when I run it:
RuntimeWarning: Mean of empty slice.
warnings.warn("Mean of empty slice.", RuntimeWarning)
and
RuntimeWarning: Degrees of freedom <= 0 for slice
warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
This is my code:
import numpy as np

def modelFull(train, test):
err_train = 0
err_test = 0
x_train = []
x_test = []
labels = []
train_labels = []
test_labels = []
for i in train:
x_train.append(i[:-1]/255)
labels.append(i[-1])
train_labels.append(i[-1])
for i in test:
x_test.append(i[:-1]/255)
labels.append(i[-1])
test_labels.append(i[-1])
x_train = np.array(x_train)
x_0 = []
x_1 = []
for i in train:
if i[-1] == 0:
x_0.append(i[:-1]/255)
if i[-1] == 1:
x_1.append(i[:-1]/255)
x_0 = np.array(x_0)
x_1 = np.array(x_1)
p_0 = float(x_0.shape[0])/float((x_0.shape[0]+x_1.shape[0]))
p_1 = float(x_1.shape[0])/float((x_0.shape[0]+x_1.shape[0]))
train_x0_mean = x_0.mean(axis=0)
train_x1_mean = x_1.mean(axis=0)
cov_x0 = np.cov(np.transpose(x_0))
cov_x1 = np.cov(np.transpose(x_1))
cov_x0 = cov_x0 + np.eye(256) * .01
cov_x1 = cov_x1 + np.eye(256) * .01
det_x1_cov = -float(np.linalg.slogdet(cov_x1)[1])
det_x0_cov = -float(np.linalg.slogdet(cov_x0)[1])
train_results = []
test_results = []
for x in x_train:
x0_minus_mu_T = np.transpose((x-train_x0_mean))
x0_inverse = np.linalg.inv(cov_x0)
x0_minus_mu = x-train_x0_mean
x1_minus_mu_T = np.transpose((x-train_x1_mean))
x1_inverse = np.linalg.inv(cov_x1)
x1_minus_mu = x-train_x1_mean
x_0_probability = det_x0_cov - (x0_minus_mu_T.dot(x0_inverse)).dot(x0_minus_mu)
x_1_probability = det_x1_cov - (x1_minus_mu_T.dot(x1_inverse)).dot(x1_minus_mu)
if (x_0_probability+np.log(p_0))/(x_1_probability+np.log(p_1)) < 1:
train_results.append(1)
else:
train_results.append(0)
for x in x_test:
x0_minus_mu_T = np.transpose((x-train_x0_mean))
x0_inverse = np.linalg.inv(cov_x0)
x0_minus_mu = x-train_x0_mean
x1_minus_mu_T = np.transpose((x-train_x1_mean))
x1_inverse = np.linalg.inv(cov_x1)
x1_minus_mu = x-train_x1_mean
x_0_probability = det_x0_cov - (x0_minus_mu_T.dot(x0_inverse)).dot(x0_minus_mu)
x_1_probability = det_x1_cov - (x1_minus_mu_T.dot(x1_inverse)).dot(x1_minus_mu)
if (x_0_probability+np.log(p_0))/(x_1_probability+np.log(p_1)) < 1:
test_results.append(1)
else:
test_results.append(0)
train_correct = 0
test_correct = 0
for i in range(len(train_results)):
if int(train_results[i]) == int(train_labels[i]):
train_correct +=1
for i in range(len(test_results)):
if int(test_results[i]) == int(test_labels[i]):
test_correct +=1
    err_test = 1 - (float(test_correct) / len(test_results))
    err_train = 1 - (float(train_correct) / len(train_results))
return err_train, err_test
RuntimeWarning: Degrees of freedom <= 0 for slice
occurs when you use the wrong shape, e.g.:
import numpy as np
x = np.random.random([1000,1])
y = np.random.random([1000,1])
print(x.shape, y.shape)
# (1000, 1) (1000, 1)
t = np.cov(x, y) #RuntimeWarning
t = np.cov(x.T, y.T) #This works
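(The reason, as I understand it: np.cov treats each row as a variable and each column as an observation by default, so a (1000, 1) input looks like 1000 variables with a single observation each, leaving zero degrees of freedom for the variance estimate; the transposed call gives one variable with 1000 observations.)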
Another edge case: the array you calculate the covariance of contains only one element.
np.cov([0.5])
In addition to the wrong-shape case above, calling np.nanstd on an all-NaN array also produces "RuntimeWarning: Degrees of freedom <= 0 for slice", and calling np.nanmean on an all-NaN array produces "RuntimeWarning: Mean of empty slice.".
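A quick demonstration of both warnings on an all-NaN array:
import numpy as np

a = np.full(5, np.nan)
np.nanmean(a)  # RuntimeWarning: Mean of empty slice. (returns nan)
np.nanstd(a)   # RuntimeWarning: Degrees of freedom <= 0 for slice (returns nan)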
Is there an elegant way to find the number of connected components using a pre-calculated KDTree? Right now I find the connected components using a breadth-first search with the adjacency given by the KDTree's k nearest neighbours, but is there a better possibility?
import collections
import numpy as np
from sklearn import neighbors
N = 100
N_k = 8
ra = np.random.random
X0,X1 = ra(N),ra(N)
X0[0:N//2]+= 2
X1[0:N//2]+= 2
X = np.array([X0,X1]).T
tree = neighbors.KDTree(X)
dist, adj = tree.query(X, k = N_k+1)
dist = dist[:,1::]
adj = adj[:,1::]
print("Inside of find_components_lifo")
print("N = %d/ N_k = %d"%(N,N_k))
labels = np.zeros(N, dtype=int) - 1  # np.int has been removed from recent numpy
n = 0
steps = 0
remains = (labels == -1)
while n < N:
i = np.arange(0,N,1)[remains][np.random.randint(0,N - n)]
# This is important for directed graphs
labels[i] = i
lifo = collections.deque([i])
while lifo:
ele = lifo.pop()
for k in adj[ele,:]:
if labels[k] == -1:
labels[k] = labels[i]
lifo.append(k)
elif labels[k] != labels[i]:
labels[labels == labels[i]] = labels[k]
remains = (labels == -1)
n = N - len(np.nonzero(remains)[0])
unique = np.unique(labels)
labels_ = np.zeros(N, dtype=int) - 1
for i, label in enumerate(unique):
choice = (labels == label)
N_cl = len(np.nonzero(choice)[0])
print("We found a cluster with N = %d"%N_cl)
labels_[choice] = i
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
# one colour per component label
colors_ = plt.cm.tab10(np.linspace(0, 1, len(unique)))
for i in range(N):
    for j in range(N_k):
        ax.plot([X0[i], X0[adj[i, j]]], [X1[i], X1[adj[i, j]]],
                color=colors_[labels_[i]], alpha=0.3)
ax.grid(True)
ax.scatter(X0, X1, s=60, color="black")
plt.show()
I think you can use scipy's connected_components and scikit-learn's kneighbors_graph together. Does this produce what you're looking for?
from sklearn import neighbors
from scipy.sparse import csgraph
adj = neighbors.kneighbors_graph(X, N_k)
n_components, labels = csgraph.connected_components(adj)
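One caveat (my reading of the scipy and scikit-learn docs, not part of the original answer): kneighbors_graph returns a directed graph, since k-nearest-neighbour relations are not symmetric, so the default connection='weak' in connected_components is what reproduces the undirected merging that the BFS above performs.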