Splitting coef into arrays applicable for multi class - python

I use this function to plot the best and worst features (coef) for each label.
# Plot the `top_features` smallest and largest entries of classifier.coef_ as a bar chart.
def plot_coefficients(classifier, feature_names, top_features=20):
# Flatten coef_ into a single 1-D vector.
coef = classifier.coef_.ravel()
# Cut the flat vector into 6 equal chunks (the 6 is hard-coded).
for i in np.split(coef,6):
# argsort is ascending: the tail indexes the largest values, the head the smallest.
top_positive_coefficients = np.argsort(i)[-top_features:]
top_negative_coefficients = np.argsort(i)[:top_features]
top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
# create plot
plt.figure(figsize=(15, 5))
# Red bars for negative coefficients, blue for everything else.
colors = ["red" if c < 0 else "blue" for c in i[top_coefficients]]
plt.bar(np.arange(2 * top_features), i[top_coefficients], color=colors)
# Convert to an array so it can be indexed with the integer array top_coefficients.
feature_names = np.array(feature_names)
# NOTE(review): bars are placed at 0..2*top_features-1 but ticks at 1..2*top_features,
# so the labels appear shifted by one bar - confirm whether that is intended.
plt.xticks(np.arange(1, 1 + 2 * top_features), feature_names[top_coefficients], rotation=60, ha="right")
plt.show()
Applying it to sklearn.LinearSVC:
if (name == "LinearSVC"):
print(clf.coef_)
print(clf.intercept_)
plot_coefficients(clf, cv.get_feature_names())
The CountVectorizer used has a dimension of (15258, 26728).
It's a multi-class decision problem with 6 labels. Using .ravel() returns a flat array with a length of 6*26728=160368, meaning that all indices higher than 26728 are out of bounds for axis 1. Here are the top and bottom indices for one label:
i[ 0. 0. 0.07465654 ... -0.02112607 0. -0.13656274]
Top [39336 35593 29445 29715 36418 28631 28332 40843 34760 35887 48455 27753
33291 54136 36067 33961 34644 38816 36407 35781]
i[ 0. 0. 0.07465654 ... -0.02112607 0. -0.13656274]
Bot [39397 40215 34521 39392 34586 32206 36526 42766 48373 31783 35404 30296
33165 29964 50325 53620 34805 32596 34807 40895]
The first entry in the "top" list has the index 39336. This is equal to the entry 39336-26728=12608 in the vocabulary. What would I need to change in the code to make this applicable?
EDIT:
X_train = sparse.hstack([training_sentences,entities1train,predictionstraining_entity1,entities2train,predictionstraining_entity2,graphpath_training,graphpathlength_training])
y_train = DFTrain["R"]
X_test = sparse.hstack([testing_sentences,entities1test,predictionstest_entity1,entities2test,predictionstest_entity2,graphpath_testing,graphpathlength_testing])
y_test = DFTest["R"]
Dimensions:
(15258, 26728)
(15258, 26728)
(0, 0) 1
...
(15257, 0) 1
(15258, 26728)
(0, 0) 1
...
(15257, 0) 1
(15258, 26728)
(15258L, 1L)
File "TwoFeat.py", line 708, in plot_coefficients
colors = ["red" if c < 0 else "blue" for c in i[top_coefficients]]
MemoryError

First, is it necessary you have to use ravel()?
LinearSVC (or in fact any other classifier which has coef_) gives out coef_ in a shape:
coef_ : array, shape = [n_features] if n_classes == 2 else [n_classes, n_features]
Weights assigned to the features (coefficients in the primal problem).
So this has number of rows equal to the classes, and number of columns equal to features. For each class, you just need to access right row. The order of classes will be available from classifier.classes_ attribute.
Secondly, the indentation of your code is wrong. The plotting code should be inside the for loop so that a plot is drawn for each class. Currently it is outside the scope of the for loop, so it will only plot the last class.
Correcting these two things, here's a sample reproducible code to plot the top and bottom features for each class.
def plot_coefficients(classifier, feature_names, top_features=20):
    """Plot the most negative and most positive coefficients of every class.

    One bar chart is shown per row of ``classifier.coef_``. The class label
    belonging to the row is printed before its chart is displayed.

    Args:
        classifier: Fitted estimator exposing ``coef_`` and ``classes_``.
        feature_names: Sequence of names, one per column of ``coef_``.
        top_features: Number of lowest and number of highest coefficients
            to include in each chart.
    """
    # atleast_2d lets a 1-D coef_ be treated as a single row.
    coef = np.atleast_2d(classifier.coef_)
    classes = classifier.classes_
    # Converted once, outside the loop, so it can be fancy-indexed below.
    feature_names = np.asarray(feature_names)

    for row, class_coef in enumerate(coef):
        # With one row per class the labels line up directly. A single row
        # for two classes is labelled with the last class, which is the one
        # positive coefficients vote for in scikit-learn's binary models.
        label = classes[row] if len(coef) == len(classes) else classes[-1]
        print(label)

        # Sort once (ascending) and take both ends. Clamping k keeps the
        # slices valid for top_features == 0 or more than n_features.
        order = np.argsort(class_coef)
        k = max(0, min(top_features, len(order)))
        top_negative_coefficients = order[:k]
        top_positive_coefficients = order[len(order) - k:]
        top_coefficients = np.hstack([top_negative_coefficients,
                                      top_positive_coefficients])
        values = class_coef[top_coefficients]

        # One position per bar; the same positions are reused for the ticks
        # so the number of ticks always equals the number of labels.
        positions = np.arange(len(top_coefficients))

        plt.figure(figsize=(10, 3))
        colors = ["red" if c < 0 else "blue" for c in values]
        plt.bar(positions, values, color=colors)
        plt.xticks(positions, feature_names[top_coefficients],
                   rotation=60, ha="right")
        plt.show()
Now just use this method as you like:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC

# Four newsgroups are used to demonstrate the per-class plots.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)
vectorizer = CountVectorizer()

# Replace the integer targets with their newsgroup names so that
# clf.classes_ holds readable labels; any label type works for y.
y = [dataset.target_names[i] for i in dataset.target]

# Learn the words from data
X = vectorizer.fit_transform(dataset.data)
clf = LinearSVC(random_state=42)
clf.fit(X, y)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
plot_coefficients(clf, feature_names)
Output from above code:
'alt.atheism'
'comp.graphics'
'sci.space'
'talk.religion.misc'

Related

selecting data points neighbourhood to support vectors

I have been thinking of this but not sure how to do it. I have a binary imbalanced data, and would like to use svm to select just subset of the majority data points nearest to support vector. Thereafter, I can fit a binary classifier on this "balanced" data.
To illustrate what I mean, a MWE:
# packages import
from collections import Counter
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
import seaborn as sns
# sample data: 100 two-feature points, generated with weights=[0.9] so class 0 dominates
X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
n_clusters_per_class=1, weights=[0.9], flip_y=0, random_state=1)
# class distribution summary
print(Counter(y))
# (the next line is the pasted output of the print above)
Counter({0: 91, 1: 9})
# fit svm model
svc_model = SVC(kernel='linear', random_state=32)
svc_model.fit(X, y)
plt.figure(figsize=(10, 8))
# Plotting our two-features-space
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, s=50)
# Constructing a hyperplane using a formula.
w = svc_model.coef_[0] # w consists of 2 elements
b = svc_model.intercept_[0] # b consists of 1 element
x_points = np.linspace(-1, 1) # generating x-points from -1 to 1
# Solve w[0]*x + w[1]*y + b = 0 for y.
y_points = -(w[0] / w[1]) * x_points - b / w[1] # getting corresponding y-points
# Plotting a red hyperplane
plt.plot(x_points, y_points, c='r')
The two classes are well separated by the hyperplane. We can see the support vectors for both classes (even better for class 1).
Since the minority class 1 has 9 data points, I want to down-sample class 0 by selecting its support vector and the 8 other data points nearest to it, so that the class distribution becomes {0: 9, 1: 9}, ignoring all other data points of class 0. I will then use this to fit a binary classifier like LR (or even SVC).
My question is how to select those data points of class 0 nearest to the class support vector, taking into account, a way to reach a balance with data points of minority class 1.
This can be achieved as follows: Get the support vector for class 0, (sv0), iterate over all data points in class 0 (X[y == 0]), compute the distances (d) to the point represented by the support vector, sort them, take the 9 with the smallest values, and concatenate them with the points of class 1 to create the downsampled data (X_ds, y_ds).
# Reference point: the first row of support_vectors_.
# NOTE(review): this is assumed to be a class-0 support vector - confirm
# with svc_model.n_support_ if the data changes.
sv0 = svc_model.support_vectors_[0]
majority = X[y == 0]
# Keep as many class-0 points as there are class-1 points (9 for this data)
# so the down-sampled set is balanced without a hard-coded count.
n_keep = int(np.sum(y == 1))
# Euclidean distance of every class-0 point to the reference point.
distances = np.linalg.norm(majority - sv0, axis=1)
# A stable sort keeps the original order among equally distant points.
index = np.argsort(distances, kind="stable")[:n_keep]
# Nearest class-0 points first, followed by all class-1 points.
X_ds = np.concatenate((majority[index], X[y == 1]))
y_ds = np.concatenate((y[y == 0][index], y[y == 1]))
plt.plot(x_points[19:-29], y_points[19:-29], c='r')
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, s=50)
# Highlight the class-0 points that were kept.
plt.scatter(X_ds[y_ds == 0][:,0], X_ds[y_ds == 0][:,1], color='yellow', alpha=0.4)

Why does the predict_proba function return 2 columns?

Why does the predict_proba function give 2 columns?
I looked to this website:
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.predict_proba
However, it just says returns: T: array-like of shape (n_samples, n_classes)
Returns the probability of the sample for each class in the model, where classes are ordered as they are in self.classes_.
I still don't understand why the output always returns 2 columns.
import numpy as np
import pandas as pd
from pylab import rcParams
import seaborn as sb
from sklearn.preprocessing import scale
from collections import Counter
from IPython.core.interactiveshell import InteractiveShell
# Notebook setting: display the value of every expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# IPython magic; only valid inside a notebook/IPython session.
%matplotlib inline
rcParams['figure.figsize'] = 5,4
sb.set_style('whitegrid')
from sklearn.linear_model import LogisticRegression
import os
cwd = os.getcwd()
# NOTE(review): the path is built with a backslash separator, and the workbook is
# read twice - the second read_excel overwrites the result of the first.
file_path = cwd + '\\Default.xlsx'
default_data = pd.read_excel(file_path)
default_data = pd.read_excel('Default.xlsx')
default_data = default_data.drop(['Unnamed: 0'], axis=1)
# factorize()[0] holds the integer codes assigned to the distinct values of the column.
default_data['default_factor'] = default_data.default.factorize()[0]
default_data['student_factor'] = default_data.student.factorize()[0]
# Single-feature model: predict default_factor from balance only.
X = default_data[['balance']]
y = default_data['default_factor']
lr = LogisticRegression()
lr.fit(X, y)
# num = 2 yields two samples (0 and 3000), reshaped into a column of shape (2, 1).
X_pred = np.linspace(start = 0, stop = 3000, num = 2).reshape(-1,1)
# One row per sample in X_pred, one column per class in lr.classes_.
y_pred = lr.predict_proba(X_pred)
X_pred
X_pred.shape
y_pred.shape
Short answer
In every column it gives you the probability that the sample belongs to that class (column zero shows the probability of belonging to class 0, column one shows the probability of belonging to class 1, and so on).
Detailed answer
Let's say that y_pred.shape gives you the shape (2, 2); this means that you have 2 samples and 2 classes.
let's say that your X_pred looks like this:
In: print(X_pred)
Out: [[ 0.],
[3000.]]
that means that you have two samples:
sample one, with only feature x = [0] and
sample two, with only feature x = [3000]
let's say that output of your prediction looks like this:
In: print(y_pred)
Out: [[0.28, 0.72]
[0.65, 0.35]]
so it means, that sample one most probably belongs to class = 1 (first row tells you that it could be class 0 with probability 28% and class 1 with probability 72%)
and sample two most probably belongs to class = 0 (second row tells you that it could be class 0 with probability 65% and class 1 with probability 35%)

How to plot feature importance for random forest in python

I have created a random forest model, and would like to plot the feature importances
model_RF_tune = RandomForestClassifier(random_state=0, n_estimators = 80,
min_samples_split =10, max_depth= None, max_features = "auto",)
I have tried defining a function:
# Draw a horizontal bar chart of model.feature_importances_, one bar per feature.
def plot_feature_importances_health(model):
# NOTE(review): this line raises the reported AttributeError - the model passed in
# has no `data` attribute. Also, .shape is presumably a tuple, while range() on the
# next line needs an int - confirm.
n_features = model.data.shape
plt.barh(range(n_features), model.feature_importances_, align = "center")
# NOTE(review): np.arrange looks like a typo for np.arange - confirm.
plt.yticks(np.arrange(n_features), df_health_reconstructed.feature_names)
plt.xlabel("Feature importance")
plt.ylabel("Feature")
plt.ylim(-1, n_features)
but this
plot_feature_importances_health(model_RF_tune)
Gives this result:
AttributeError: 'RandomForestClassifier' object has no attribute 'data'
How do I plot it correctly?
Not all models have a data attribute, so model.data fails here. Would you like to try my code instead? Note, however, that it plots only the top 10 features.
# Fit a shallow random forest and chart its n most important features.
n = 10  # how many features to show
rfc = RandomForestClassifier(random_state=SEED, n_estimators=200, max_depth=3)
rfc_model = rfc.fit(X, y)

# Importances labelled by column name, reduced to the n largest (descending).
top_importances = pd.Series(rfc_model.feature_importances_,
                            index=X.columns).nlargest(n)

# Horizontal bars; the y axis is inverted so the most important feature is on top.
axes = top_importances.plot(kind='barh', figsize=[8, n/2.5], color='navy')
axes.invert_yaxis()

ticks_x = np.linspace(0, 0.5, 6)  # six ticks from 0 to 0.5
plt.xticks(ticks_x, fontsize=15, color='black')
plt.yticks(size=15, color='navy')
plt.title('Top Features derived by RandomForestClassifier', family='fantasy', size=15)

# Names of the charted features, most important first.
print(list(top_importances.index))
This one seems to work for me
%matplotlib inline
# Fit the model first (not shown).
# `dataframe_name` is the X dataframe and `model_name` is the fitted sklearn object.
# Map each feature name to its importance.
feats = dict(zip(dataframe_name.columns, model_name.feature_importances_))

# from_dict(orient='index') produces a single column named 0; give it a readable name.
# The column name must stay on one line - a line break inside the literal is a syntax error.
importances = pd.DataFrame.from_dict(feats, orient='index').rename(
    columns={0: 'Gini-importance'})
# Sorted ascending so the largest importance is drawn last in the horizontal bar chart.
importances.sort_values(by='Gini-importance').plot(
    kind='barh', color="SeaGreen", figsize=(10, 8))

Sklearn logistic regression shape error, but x, y shapes are consistent

I get a ValueError: Found input variables with inconsistent numbers of samples: [20000, 1] when I run the following even though the row values of x and y are correct. I load in the RCV1 dataset, get indices of the categories with the top x documents, create list of tuples with equal number of randomly-selected positives and negatives for each category, and then finally attempt to run a logistic regression on one of the categories.
import sklearn.datasets
from sklearn import model_selection, preprocessing
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from scipy import sparse
rcv1 = sklearn.datasets.fetch_rcv1()
def get_top_cat_indices(target_matrix, num_cats):
    """Return the column indices of the ``num_cats`` most frequent categories.

    Args:
        target_matrix: 2-D indicator matrix (dense or scipy sparse) with one
            row per document and one column per category.
        num_cats: Number of category indices to return.

    Returns:
        A list of column indices ordered from the largest column sum to the
        smallest. Ties keep the lower index first. At most as many indices
        as there are columns are returned.
    """
    # Summing a scipy sparse matrix yields a 2-D np.matrix, which stays 2-D
    # under reshape; asarray + ravel gives a true 1-D count vector for any
    # number of categories.
    cat_counts = np.asarray(target_matrix.sum(axis=0)).ravel()
    # Sort on the negated counts for a descending order; a stable sort keeps
    # tied categories in index order.
    order = np.argsort(-cat_counts, kind="stable")
    return order[:num_cats].tolist()
# For each category index, build one (sampled_x, sampled_y) pair made of
# sample_size/2 randomly drawn documents with the category and sample_size/2 without.
def prepare_data(x, y, top_cat_indices, sample_size):
res_lst = []
for i in top_cat_indices:
# get column of indices with relevant cat
temp = y.tocsc()[:, i]
# all docs with labeled category
cat_present = x.tocsr()[np.where(temp.sum(axis=1)>0)[0],:]
# all docs other than labelled category
cat_notpresent = x.tocsr()[np.where(temp.sum(axis=1)==0)[0],:]
# get indices equal to 1/2 of sample size
# (np.random.randint draws with replacement, so rows may repeat)
idx_cat = np.random.randint(cat_present.shape[0], size=int(sample_size/2))
idx_nocat = np.random.randint(cat_notpresent.shape[0], size=int(sample_size/2))
# concatenate the ids
sampled_x_pos = cat_present.tocsr()[idx_cat,:]
sampled_x_neg = cat_notpresent.tocsr()[idx_nocat,:]
sampled_x = sparse.vstack((sampled_x_pos, sampled_x_neg))
# NOTE(review): idx_cat / idx_nocat are positions within cat_present / cat_notpresent,
# but here they index temp, which still holds every document - the labels therefore
# do not correspond to the sampled rows of x.
sampled_y_pos = temp.tocsr()[idx_cat,:]
sampled_y_neg = temp.tocsr()[idx_nocat,:]
sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))
res_lst.append((sampled_x, sampled_y))
return res_lst
ind = get_top_cat_indices(rcv1.target, 5)
test_res = prepare_data(train_x, train_y, ind, 20000)
x, y = test_res[0]
print(x.shape)
print(y.shape)
LogisticRegression().fit(x, y)
Could it be an issue with the sparse matrices, or problem with dimensionality (there are 20K samples and 47K features)
When I run your code, I get following error:
AttributeError: 'bool' object has no attribute 'any'
That's because y for LogisticRegression needs to be a NumPy array. So, I changed the last line to:
LogisticRegression().fit(x, y.A.flatten())
Then I get following error:
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0
This is because your sampling code has a bug. You need to subset the y array to the rows having that category before applying the sampling indices. See the code below:
def prepare_data(x, y, top_cat_indices, sample_size):
    """Build one balanced (features, labels) sample per requested category.

    For every category index, ``int(sample_size / 2)`` rows are drawn at
    random (with replacement) from the documents carrying the category and
    the same number from the documents without it. Labels are taken from the
    same document subsets, so they stay aligned with the sampled rows.

    Args:
        x: Sparse document-feature matrix.
        y: Sparse document-category indicator matrix.
        top_cat_indices: Iterable of category column indices.
        sample_size: Total number of rows per category sample.

    Returns:
        A list of ``(sampled_x, sampled_y)`` tuples, one per category, with
        the positive rows stacked on top of the negative rows.
    """
    samples = []
    for cat in top_cat_indices:
        # Label column of this category, kept in row-sliceable form as well.
        temp = y.tocsc()[:, cat]
        labels_by_row = temp.tocsr()

        # Row positions of documents with / without the category.
        per_doc = np.asarray(temp.sum(axis=1)).ravel()
        c1 = np.flatnonzero(per_doc > 0)
        c2 = np.flatnonzero(per_doc == 0)

        features_by_row = x.tocsr()
        cat_present = features_by_row[c1, :]
        cat_notpresent = features_by_row[c2, :]

        # Positions inside each subset; positives are drawn before negatives.
        half = int(sample_size / 2)
        idx_cat = np.random.randint(cat_present.shape[0], size=half)
        idx_nocat = np.random.randint(cat_notpresent.shape[0], size=half)

        sampled_x = sparse.vstack((cat_present[idx_cat, :],
                                   cat_notpresent[idx_nocat, :]))

        # Restrict the labels to the same subsets before applying the draws.
        sampled_y_pos = labels_by_row[c1][idx_cat, :]
        print(sampled_y_pos.nnz)
        sampled_y_neg = labels_by_row[c2][idx_nocat, :]
        print(sampled_y_neg.nnz)
        sampled_y = sparse.vstack((sampled_y_pos, sampled_y_neg))

        samples.append((sampled_x, sampled_y))
    return samples
Now, Everything works like a charm

python scikit-learn clustering with missing data

I want to cluster data with missing columns. Doing it manually I would calculate the distance in case of a missing column simply without this column.
With scikit-learn, missing data is not possible. There is also no chance to specify a user distance function.
Is there any chance to cluster with missing data?
Example data:
n_samples = 1500
noise = 0.05
X, _ = make_swiss_roll(n_samples, noise)
rnd = np.random.rand(X.shape[0],X.shape[1])
X[rnd<0.1] = np.nan
I think you can use an iterative EM-type algorithm:
Initialize missing values to their column means
Repeat until convergence:
Perform K-means clustering on the filled-in data
Set the missing values to the centroid coordinates of the clusters to which they were assigned
Implementation
import numpy as np
from sklearn.cluster import KMeans
def kmeans_missing(X, n_clusters, max_iter=10):
    """Perform K-Means clustering on data with missing values.

    Missing entries are first filled with their column means. Each round then
    clusters the filled-in data and overwrites the missing entries with the
    coordinates of the centroid their row was assigned to, until the labels
    stop changing or ``max_iter`` rounds have run.

    Args:
        X: An [n_samples, n_features] array of data to cluster. Non-finite
            entries (NaN, inf) are treated as missing.
        n_clusters: Number of clusters to form.
        max_iter: Maximum number of EM iterations to perform (at least 1).

    Returns:
        labels: An [n_samples] vector of integer labels.
        centroids: An [n_clusters, n_features] array of cluster centroids.
        X_hat: Copy of X with the missing values filled in.

    Raises:
        ValueError: If ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # Initialize missing values to their column means
    missing = ~np.isfinite(X)
    mu = np.nanmean(X, axis=0, keepdims=True)
    X_hat = np.where(missing, mu, X)

    prev_labels = None
    prev_centroids = None
    for _ in range(max_iter):
        if prev_centroids is not None:
            # Warm-start from the previous centroids. This is much faster and
            # makes it easier to check convergence (labels are not permuted
            # between rounds), but may be more prone to local minima. With
            # explicit initial centres only a single run is meaningful.
            cls = KMeans(n_clusters, init=prev_centroids, n_init=1)
        else:
            # First round: several independent initialisations. n_init is
            # pinned because its default differs between scikit-learn
            # releases; n_jobs is omitted as it was removed from KMeans.
            cls = KMeans(n_clusters, n_init=10)

        # perform clustering on the filled-in data
        labels = cls.fit_predict(X_hat)
        centroids = cls.cluster_centers_

        # fill in the missing values based on their cluster centroids
        X_hat[missing] = centroids[labels][missing]

        # when the labels have stopped changing then we have converged
        if prev_labels is not None and np.all(labels == prev_labels):
            break

        prev_labels = labels
        prev_centroids = cls.cluster_centers_

    return labels, centroids, X_hat
Example with fake data
from sklearn.datasets import make_blobs
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
def make_fake_data(fraction_missing, n_clusters=5, n_samples=1500,
                   n_features=3, seed=None):
    """Generate blob data together with a copy that has entries knocked out.

    Args:
        fraction_missing: Probability with which each entry is set to NaN.
        n_clusters: Number of blob centres.
        n_samples: Number of rows to generate.
        n_features: Number of columns to generate.
        seed: Seed for the random generator used for both the blobs and the
            missing-value mask.

    Returns:
        X: The complete data.
        true_labels: The blob membership of every row.
        Xm: Copy of X with the selected entries replaced by NaN.
    """
    # complete data
    gen = np.random.RandomState(seed)
    # Everything after n_features is keyword-only in current scikit-learn,
    # so the number of centres has to be passed as centres=..., by name.
    X, true_labels = make_blobs(n_samples=n_samples, n_features=n_features,
                                centers=n_clusters, random_state=gen)
    # with missing values
    missing = gen.rand(*X.shape) < fraction_missing
    Xm = np.where(missing, np.nan, X)
    return X, true_labels, Xm
# Generate 5 clusters with 30% of the entries missing, then cluster and impute them.
X, true_labels, Xm = make_fake_data(fraction_missing=0.3, n_clusters=5, seed=0)
labels, centroids, X_hat = kmeans_missing(Xm, n_clusters=5)
# plot the inferred points, color-coded according to the true cluster labels
# Two side-by-side 3-D axes: complete data on the left, imputed data on the right.
fig, ax = plt.subplots(1, 2, subplot_kw={'projection':'3d', 'aspect':'equal'})
ax[0].scatter3D(X[:, 0], X[:, 1], X[:, 2], c=true_labels, cmap='gist_rainbow')
ax[1].scatter3D(X_hat[:, 0], X_hat[:, 1], X_hat[:, 2], c=true_labels,
cmap='gist_rainbow')
ax[0].set_title('Original data')
ax[1].set_title('Imputed (30% missing values)')
fig.tight_layout()
Benchmark
To assess the algorithm's performance, we can use the adjusted mutual information between the true and inferred cluster labels. A score of 1 is perfect performance and 0 represents chance:
from sklearn.metrics import adjusted_mutual_info_score
# Missing fractions 0%, 5%, ..., 95%, each evaluated n_repeat times.
fraction = np.arange(0.0, 1.0, 0.05)
n_repeat = 10
# scores[0]: AMI over all rows; scores[1]: AMI over rows with at least one missing entry.
scores = np.empty((2, fraction.shape[0], n_repeat))
for i, frac in enumerate(fraction):
for j in range(n_repeat):
X, true_labels, Xm = make_fake_data(fraction_missing=frac, n_clusters=5)
labels, centroids, X_hat = kmeans_missing(Xm, n_clusters=5)
# Rows that contain at least one non-finite (missing) entry.
any_missing = np.any(~np.isfinite(Xm), 1)
scores[0, i, j] = adjusted_mutual_info_score(labels, true_labels)
scores[1, i, j] = adjusted_mutual_info_score(labels[any_missing],
true_labels[any_missing])
fig, ax = plt.subplots(1, 1)
scores_all, scores_missing = scores
# Mean over the repeats, with the standard deviation as error bar.
ax.errorbar(fraction * 100, scores_all.mean(-1),
yerr=scores_all.std(-1), label='All labels')
ax.errorbar(fraction * 100, scores_missing.mean(-1),
yerr=scores_missing.std(-1),
label='Labels with missing values')
ax.set_xlabel('% missing values')
ax.set_ylabel('Adjusted mutual information')
ax.legend(loc='best', frameon=False)
ax.set_ylim(0, 1)
ax.set_xlim(-5, 100)
Update:
In fact, after a quick Google search it seems that what I've come up with above is pretty much the same as the k-POD algorithm for K-means clustering of missing data (Chi, Chi & Baraniuk, 2016).
Here is a different algorithm that I use. Instead of replacing the missing values, the values are ignored, and in order to capture the differences between missing and non-missing I implement missing dummies.
Compared to Ali's algorithm, it seems it is easier for observations with missing values to jump from class to class, since I do not fill in the missing values.
Unfortunately, I did not have the time to compare it against Ali's beautiful code, but feel free to do it (I might do it when I get the time) and contribute to the discussion about the best method.
import numpy as np
class kmeans_missing(object):
    """K-means that ignores missing (NaN) entries instead of imputing them.

    Distances are sums of squared differences over the non-NaN coordinates
    (``np.nansum``) and centroids are NaN-aware means (``np.nanmean``).

    After ``fit`` the instance exposes:
        costs: Mean within-cluster cost of every run.
        cost: The smallest entry of ``costs``.
        best_model: Index of the run with the smallest cost.
        centroids: Centroids of the best run, shape (n_clusters, n_features).
        all_centroids: Centroids of every run, shape
            (n_clusters, n_features, number_of_runs).
    """

    def __init__(self, potential_centroids, n_clusters):
        """Store the candidate start points and the number of clusters.

        Args:
            potential_centroids: Array whose rows are candidate initial
                centroids; ``n_clusters`` distinct rows are drawn per run.
            n_clusters: Number of clusters to form.
        """
        self.n_clusters = n_clusters
        self.potential_centroids = potential_centroids

    @staticmethod
    def _squared_distances(data, centroids):
        """Return the (n_samples, n_centroids) NaN-ignoring squared distances."""
        dist_mat = np.zeros((data.shape[0], len(centroids)))
        for j, centroid in enumerate(centroids):
            dist_mat[:, j] = np.nansum((data - centroid) ** 2, axis=1)
        return dist_mat

    def fit(self, data, max_iter=10, number_of_runs=1):
        """Run k-means ``number_of_runs`` times and keep the cheapest result.

        Args:
            data: Array of shape (n_samples, n_features); may contain NaN.
            max_iter: Maximum number of assignment/update rounds per run.
            number_of_runs: Number of independent random initialisations.
        """
        n_clusters = self.n_clusters
        potential_centroids = self.potential_centroids
        all_centroids = np.zeros((n_clusters, data.shape[1], number_of_runs))
        costs = np.zeros((number_of_runs,))
        for k in range(number_of_runs):
            idx = np.random.choice(potential_centroids.shape[0],
                                   size=n_clusters, replace=False)
            # Float copy: the candidates stay untouched and the means written
            # below are not truncated when the candidates are integers.
            centroids = np.array(potential_centroids[idx], dtype=float)
            dist_mat = np.zeros((data.shape[0], n_clusters))
            # None (rather than zeros) so that an all-zero first assignment
            # is not mistaken for convergence.
            old_clusters = None
            for i in range(max_iter):
                # Calc dist to centroids
                dist_mat = self._squared_distances(data, centroids)
                # Assign to clusters
                clusters = np.argmin(dist_mat, axis=1)
                # Update clusters
                for j in range(n_clusters):
                    members = data[clusters == j]
                    # An empty cluster would get an all-NaN centroid, which
                    # nansum then treats as distance 0 to every point. Keep
                    # the previous centroid instead.
                    if members.shape[0]:
                        centroids[j] = np.nanmean(members, axis=0)
                if old_clusters is not None and np.array_equal(clusters, old_clusters):
                    break  # Break when there is no change in clusters
                if i == max_iter - 1:
                    print('no convergence before maximal iterations are reached')
                old_clusters = clusters
            all_centroids[:, :, k] = centroids
            costs[k] = np.mean(np.min(dist_mat, axis=1))
        self.costs = costs
        self.cost = np.min(costs)
        self.best_model = np.argmin(costs)
        self.centroids = all_centroids[:, :, self.best_model]
        self.all_centroids = all_centroids

    def predict(self, data):
        """Assign every row of ``data`` to its nearest fitted centroid.

        Args:
            data: Array of shape (n_samples, n_features); may contain NaN.

        Returns:
            prediction: Index of the nearest centroid for every row.
            cost: Squared distance of every row to that centroid.
        """
        dist_mat = self._squared_distances(data, self.centroids)
        prediction = np.argmin(dist_mat, axis=1)
        cost = np.min(dist_mat, axis=1)
        return prediction, cost
Here is an example of how it might be useful.
from sklearn.datasets import make_blobs
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from kmeans_missing import *
def make_fake_data(fraction_missing, n_clusters=5, n_samples=1500,
                   n_features=2, seed=None):
    """Generate blob data together with a copy that has entries knocked out.

    Args:
        fraction_missing: Probability with which each entry is set to NaN.
        n_clusters: Number of blob centres.
        n_samples: Number of rows to generate.
        n_features: Number of columns to generate.
        seed: Seed for the random generator used for both the blobs and the
            missing-value mask.

    Returns:
        X: The complete data.
        true_labels: The blob membership of every row.
        Xm: Copy of X with the selected entries replaced by NaN.
    """
    # complete data
    gen = np.random.RandomState(seed)
    # Everything after n_features is keyword-only in current scikit-learn,
    # so the number of centres has to be passed as centres=..., by name.
    X, true_labels = make_blobs(n_samples=n_samples, n_features=n_features,
                                centers=n_clusters, random_state=gen)
    # with missing values
    missing = gen.rand(*X.shape) < fraction_missing
    Xm = np.where(missing, np.nan, X)
    return X, true_labels, Xm
# X_hat receives the third return value of make_fake_data, i.e. the data with NaNs.
X, true_labels, X_hat = make_fake_data(fraction_missing=0.3, n_clusters=3, seed=0)
# Boolean indicator ("dummy") columns: True where the corresponding entry is missing.
X_missing_dummies=np.isnan(X_hat)
n_clusters=3
# Append the indicator columns to the data columns.
X_hat = np.concatenate((X_hat,X_missing_dummies),axis=1)
# The data rows themselves serve as the pool of potential initial centroids.
kmeans_m=kmeans_missing(X_hat,n_clusters)
kmeans_m.fit(X_hat,max_iter=100,number_of_runs=10)
print(kmeans_m.costs)
prediction,cost=kmeans_m.predict(X_hat)
# Per predicted cluster i: the share of all samples that fall in cluster i and have
# true label j, followed by the overall share of samples assigned to cluster i.
# NOTE(review): range(3) is hard-coded; it matches n_clusters=3 used above.
for i in range(n_clusters):
print([np.mean((prediction==i)*(true_labels==j)) for j in range(3)],np.mean((prediction==i)))
--EDIT--
In this example the occurrences of missing values are completely random, and when that is the case, not adding the missing-value dummies performs better, since the missing-value dummies are then just noise. Not including them would also be the correct thing to do in order to compare with Ali's algorithm.

Categories