How to find an "x" amount of closest elements to a centroid - python

I am working on a dataset that is very high dimensional and have performed k-means clustering on it. I am trying to find the 20 closest points to each centroid. The dimensions of the dataset (X_emb) is 10 x 2816. Provided is code that I used to find the single-most closest point to each centroid. The commented out code is a potential solution that I found, but I was not able to make it accurately work.
import numpy as np
import pickle as pkl
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.neighbors import NearestNeighbors
from visualization.make_video_v2 import make_video_from_numpy
from scipy.spatial import cKDTree
n_s_train = 10000
df = pkl.load(open('cluster_data/mixed_finetuning_data.pkl', 'rb'))
N = len(df)
X = []
X_emb = []
for i in range(N):
play = df.iloc[i]
if df.iloc[i].label == 1:
X_emb.append(play['embedding'])
X.append(play['input'])
X_emb = np.array(X_emb)
kmeans = KMeans(n_clusters=10)
kmeans.fit(X_emb)
results = kmeans.cluster_centers_
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
# def find_k_closest(centroids, data, k=1, distance_norm=2):
# kdtree = cKDTree(data, leafsize=30)
# distances, indices = kdtree.query(centroids, k, p=distance_norm)
# if k > 1:
# indices = indices[:,-1]
# values = data[indices]
# return indices, values
# indices, values = find_k_closest(results, X_emb)

You can use the pairwise distances to calculate the distances for every point with the centroids with every point in X_emb, then using numpy finding the index of the min 20 elements and finally geting them from X_emb
from sklearn.metrics import pairwise_distances
distances = pairwise_distances(centroids, X_emb, metric='euclidean')
ind = [np.argpartition(i, 20)[:20] for i in distances]
closest = [X_emb[indexes] for indexes in ind]
The shape of closest will be (num of centroids x 20)

You can the NearestNeighbors class from sklearn this way:
from sklearn.neighbors import NearestNeighbors
def find_k_closest(centroids, data):
nns = {}
neighbors = NearesNieghbors(n_neighbors=20).fit(data)
for center in centroids:
nns[center] = neighbors.kneighbors(center, return_distance=false)
return nns
the nns dictionary should contain the centers as key and the list of neighbors as value

Related

Run Different Scikit-learn Clustering Algorithms on Dataset

I have a dataframe like below. The shape is (24,7)
Name x1 x2 x3 x4 x5 x6
Harry 102 204 0.43 0.21 1.02 0.39
James 242 500 0.31 0.11 0.03 0.73
.
.
.
Mike 3555 4002 0.12 0.03 0.52. 0.11
Henry 532 643 0.01 0.02 0.33 0.10
I want to run Scikit-learn's Different Clustering Algorithms Script on the above dataframe. However, the input data looks quite confusing, not too sure how to input my dataframe
https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html#sphx-glr-auto-examples-cluster-plot-cluster-comparison-py
There are two main differences between your scenario and the scikit-learn example you link to:
You only have one dataset, not several different ones to compare.
You have six features, not just two.
Point one allows you to simplify the example code by deleting the loops over the different datasets and related calculations. Point two implies that you cannot easily plot your results. Instead, you could just add the predicted class labels found by each algorithm to your dataset.
So you could modify the example code like this:
import time
import warnings
import numpy as np
import pandas as pd
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
np.random.seed(0)
# ============
# Introduce your dataset
# ============
my_df = # Insert your data here, as a pandas dataframe.
features = [f'x{i}' for i in range(1, 7)]
X = my_df[features].values
# ============
# Set up cluster parameters
# ============
params = {
"quantile": 0.3,
"eps": 0.3,
"damping": 0.9,
"preference": -200,
"n_neighbors": 3,
"n_clusters": 3,
"min_samples": 7,
"xi": 0.05,
"min_cluster_size": 0.1,
}
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)
# estimate bandwidth for mean shift
bandwidth = max(cluster.estimate_bandwidth(X, quantile=params["quantile"]),
0.001) # arbitrary correction to avoid 0
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(
X, n_neighbors=params["n_neighbors"], include_self=False
)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
# ============
# Create cluster objects
# ============
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params["n_clusters"])
ward = cluster.AgglomerativeClustering(
n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity
)
spectral = cluster.SpectralClustering(
n_clusters=params["n_clusters"],
eigen_solver="arpack",
affinity="nearest_neighbors",
)
dbscan = cluster.DBSCAN(eps=params["eps"])
optics = cluster.OPTICS(
min_samples=params["min_samples"],
xi=params["xi"],
min_cluster_size=params["min_cluster_size"],
)
affinity_propagation = cluster.AffinityPropagation(
damping=params["damping"], preference=params["preference"], random_state=0
)
average_linkage = cluster.AgglomerativeClustering(
linkage="average",
affinity="cityblock",
n_clusters=params["n_clusters"],
connectivity=connectivity,
)
birch = cluster.Birch(n_clusters=params["n_clusters"])
gmm = mixture.GaussianMixture(
n_components=params["n_clusters"], covariance_type="full"
)
clustering_algorithms = (
("MiniBatch\nKMeans", two_means),
("Affinity\nPropagation", affinity_propagation),
("MeanShift", ms),
("Spectral\nClustering", spectral),
("Ward", ward),
("Agglomerative\nClustering", average_linkage),
("DBSCAN", dbscan),
("OPTICS", optics),
("BIRCH", birch),
("Gaussian\nMixture", gmm),
)
for name, algorithm in clustering_algorithms:
t0 = time.time()
# catch warnings related to kneighbors_graph
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="the number of connected components of the "
+ "connectivity matrix is [0-9]{1,2}"
+ " > 1. Completing it to avoid stopping the tree early.",
category=UserWarning,
)
warnings.filterwarnings(
"ignore",
message="Graph is not fully connected, spectral embedding"
+ " may not work as expected.",
category=UserWarning,
)
algorithm.fit(X)
t1 = time.time()
if hasattr(algorithm, "labels_"):
y_pred = algorithm.labels_.astype(int)
else:
y_pred = algorithm.predict(X)
# Add cluster labels to the dataset
my_df[name] = y_pred
PS : please replace : data = X_data.iloc[:20000] by your X
import numpy as np
import matplotlib as plt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import cluster, metrics
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn import preprocessing
from collections import Counter
from sklearn.cluster import DBSCAN
from sklearn import mixture
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
comp_model = pd.DataFrame(columns=['Model', 'Score_Silhouette',
'num_clusters', 'size_clusters',
'parameters'])
K-Means :
def k_means(X_data, nb_clusters, model_comp):
ks = nb_clusters
inertias = []
data = X_data.iloc[:20000]
X = data.values
X_scaled = preprocessing.StandardScaler().fit_transform(X)
for num_clusters in ks:
# Create a KMeans instance with k clusters: model
model = KMeans(n_clusters=num_clusters, n_init=1)
# Fit model to samples
model.fit(X_scaled)
# Append the inertia to the list of inertias
inertias.append(model.inertia_)
silh = metrics.silhouette_score(X_scaled, model.labels_)
# Counting the amount of data in each cluster
taille_clusters = Counter(model.labels_)
data = [{'Model': 'kMeans',
'Score_Silhouette': silh,
'num_clusters': num_clusters,
'size_clusters': taille_clusters,
'parameters': 'nb_clusters :'+str(num_clusters)}]
model_comp = model_comp.append(data, ignore_index=True, sort=False)
# Plot ks vs inertias
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()
return model_comp
comp_model = k_means(X_data=df,
nb_clusters=pd.np.arange(2, 11, 1),
model_comp=comp_model)
DBscan :
def dbscan_grid_search(X_data, model_comp, eps_space=0.5,
min_samples_space=5, min_clust=0, max_clust=10):
data = X_data.iloc[:20000]
X = data.values
X_scaled = preprocessing.StandardScaler().fit_transform(X)
# Starting a tally of total iterations
n_iterations = 0
# Looping over each combination of hyperparameters
for eps_val in eps_space:
for samples_val in min_samples_space:
dbscan_grid = DBSCAN(eps=eps_val,
min_samples=samples_val)
# fit_transform
clusters = dbscan_grid.fit_predict(X=X_scaled)
# Counting the amount of data in each cluster
cluster_count = Counter(clusters)
#n_clusters = sum(abs(pd.np.unique(clusters))) - 1
n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
# Increasing the iteration tally with each run of the loop
n_iterations += 1
# Appending the lst each time n_clusters criteria is reached
if n_clusters >= min_clust and n_clusters <= max_clust:
silh = metrics.silhouette_score(X_scaled, clusters)
data = [{'Model': 'Dbscan',
'Score_Silhouette': silh,
'num_clusters': n_clusters,
'size_clusters': cluster_count,
'parameters': 'eps :'+str(eps_val)+'+ samples_val :'+str(samples_val)}]
model_comp = model_comp.append(
data, ignore_index=True, sort=False)
return model_comp
comp_model = dbscan_grid_search(X_data=df,
model_comp=comp_model,
eps_space=pd.np.arange(0.1, 5, 0.6),
min_samples_space=pd.np.arange(1, 30, 3),
min_clust=2,
max_clust=10)
GMM :
def gmm(X_data, nb_clusters, model_comp):
ks = nb_clusters
data = X_data.iloc[:20000]
X = data.values
X_scaled = preprocessing.StandardScaler().fit_transform(X)
for num_clusters in ks:
# Create a KMeans instance with k clusters: model
gmm = mixture.GaussianMixture(n_components=num_clusters).fit(X_scaled)
# Fit model to samples
gmm.fit(X_scaled)
pred = gmm.predict(X_scaled)
cluster_count = Counter(pred)
silh = metrics.silhouette_score(X_scaled, pred)
data = [{'Model': 'GMM',
'Score_Silhouette': silh,
'num_clusters': num_clusters,
'size_clusters': cluster_count,
'parameters': 'nb_clusters :'+str(num_clusters)}]
model_comp = model_comp.append(data, ignore_index=True, sort=False)
return model_comp
comp_model = gmm(X_data=df,
nb_clusters=pd.np.arange(2, 11, 1),
model_comp=comp_model
)
At the end you will have comp_model which will contain all the results of your algo. Here I am using three algorithms, after you selected the best fit for you (with score silhouette and number of cluster).
You should check the repartitions of each cluster :
https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py

How to optimize Shapely and Sklearn code?

I am working with a dataset of 4.2 millions points and my codes is already taking a while to process, however below code is taking several hours to process (the code was provided in other public question and basically it takes the nearest linestring to a point, finds the nearest point from that line string and calculus the distance)
The codes actually does an awesome job, but takes too long for its purposes, How I can optimize or do the same thing in a shortest time?
import geopandas as gpd
import numpy as np
from shapely.geometry import Point, LineString
from shapely.ops import nearest_points
from sklearn.neighbors import DistanceMetric
EARTH_RADIUS_IN_MILES = 3440.1 #NAUTICAL MILES
panama = gpd.read_file("/Users/Danilo/Documents/Python/panama_coastline/panama_coastline.shp")
for c in range(b):
#p = Point(-77.65325423107359,9.222038196656131)
p=Point(data['longitude'][c],data['latitude'][c])
def closest_line(point, linestrings):
return np.argmin( [p.distance(linestring) for linestring in panama.geometry] )
closest_linestring = panama.geometry[ closest_line(p, panama.geometry) ]
closest_linestring
closest_point = nearest_points(p, closest_linestring)
dist = DistanceMetric.get_metric('haversine')
points_as_floats = [ np.array([p.x, p.y]) for p in closest_point ]
haversine_distances = dist.pairwise(np.radians(points_as_floats), np.radians(points_as_floats) )
haversine_distances *= EARTH_RADIUS_IN_MILES
dtc1=haversine_distances[0][1]
dtc.append(dtc1)
Edit: Simplify to single calculation with BallTree
Imports
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point, LineString
from shapely.ops import nearest_points
Read Panama
panama = gpd.read_file("panama_coastline/panama_coastline.shp")
Get all points, long,lat format:
def get_points_as_numpy(geom):
work_list = []
for g in geom:
work_list.append( np.array(g.coords) )
return np.concatenate(work_list)
all_coastline_points = get_points_as_numpy(panama.geometry)
Create Balltree
from sklearn.neighbors import BallTree
import numpy as np
panama_radians = np.radians(np.flip(all_coastline_points,axis=1))
tree = BallTree(panama_radians, leaf_size=12, metric='haversine')
Create 1M random points:
mean = [8.5,-80]
cov = [[1,0],[0,5]] # diagonal covariance, points lie on x or y-axis
random_gps = np.random.multivariate_normal(mean,cov,(10**6))
random_points = pd.DataFrame( {'lat' : random_gps[:,0], 'long' : random_gps[:,1]})
random_points.head()
Calculate closest coast point (<30 Seconds on my machine)
distances, index = tree.query( np.radians(random_gps), k=1)
Put results in DataFrame
EARTH_RADIUS_IN_MILES = 3440.1
random_points['distance_to_coast'] = distances * EARTH_RADIUS_IN_MILES
random_points['closest_lat'] = all_coastline_points[index][:,0,1]
random_points['closest_long'] = all_coastline_points[index][:,0,0]

How to visualize the cluster result as a graph with different node color based on its cluster?

i have a geodesic distance of graph data in .csv format
i want to reduce it into 2D using Multidimensional Scaling (MDS) and cluster it using Kmedoids
This is my code:
# coding: utf-8
import numpy as np
import csv
from sklearn import manifold
from sklearn.metrics.pairwise import pairwise_distances
import kmedoidss
rawdata = csv.reader(open('data.csv', 'r').readlines()[1:])
# Process the data into a 2D array, omitting the header row
data, labels = [], []
for row in rawdata:
labels.append(row[1])
data.append([int(i) for i in row[1:]])
#print data
# Now run very basic MDS
# Documentation here: http://scikit-learn.org/dev/modules/generated/sklearn.manifold.MDS.html#sklearn.manifold.MDS
mds = manifold.MDS(n_components=2, dissimilarity="precomputed")
pos = mds.fit_transform(data)
# distance matrix
D = pairwise_distances(pos, metric='euclidean')
# split into c clusters
M, C = kmedoidss.kMedoids(D, 3)
print ('Data awal : ')
for index, point_idx in enumerate(pos, 1):
print(index, point_idx)
print ('\n medoids:' )
for point_idx in M:
print('{} index ke - {} '.format (pos[point_idx], point_idx+1))
print('')
print('clustering result:')
for label in C:
for point_idx in C[label]:
print('cluster- {}:{} index- {}'.format(label, pos[point_idx], point_idx+1))
kmedoidss.py
import numpy as np
import random
def kMedoids(D, k, tmax=100):
# determine dimensions of distance matrix D
m, n = D.shape
# randomly initialize an array of k medoid indices
M = np.sort(np.random.choice(n, k))
# create a copy of the array of medoid indices
Mnew = np.copy(M)
# initialize a dictionary to represent clusters
C = {}
for t in xrange(tmax):
# determine clusters, i. e. arrays of data indices
J = np.argmin(D[:,M], axis=1)
for kappa in range(k):
C[kappa] = np.where(J==kappa)[0]
# update cluster medoids
for kappa in range(k):
J = np.mean(D[np.ix_(C[kappa],C[kappa])],axis=1)
j = np.argmin(J)
Mnew[kappa] = C[kappa][j]
np.sort(Mnew)
# check for convergence
if np.array_equal(M, Mnew):
break
M = np.copy(Mnew)
else:
# final update of cluster memberships
J = np.argmin(D[:,M], axis=1)
for kappa in range(k):
C[kappa] = np.where(J==kappa)[0]
# return results
return M, C
how to visualize the cluster result as a graph with different node color based on its cluster?
You don't need MDS to run kMedoids - just run it on the original distance matrix (kMedoids can also be made to work on a similarity matrix by switching min for max).
Use MDS only for plotting.
The usual approach for visualization is to use a loop over clusters, and plot each cluster in a different color; or to use a color predicate. There are many examples in the scipy documentation.
http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html
colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)
y_pred = labels.astype(np.int)
plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
where X is your pos variable (2d mds result) and labels are an integer cluster number for every point. Since you don't have your data in thid "labels" layout, consider using a loop instead:
for label, pts in C.items():
plt.scatter(pos[pts, 0], pos[pts, 1], color=colors[label])
plt.show()

python scikit-learn clustering with missing data

I want to cluster data with missing columns. Doing it manually I would calculate the distance in case of a missing column simply without this column.
With scikit-learn, missing data is not possible. There is also no chance to specify a user distance function.
Is there any chance to cluster with missing data?
Example data:
n_samples = 1500
noise = 0.05
X, _ = make_swiss_roll(n_samples, noise)
rnd = np.random.rand(X.shape[0],X.shape[1])
X[rnd<0.1] = np.nan
I think you can use an iterative EM-type algorithm:
Initialize missing values to their column means
Repeat until convergence:
Perform K-means clustering on the filled-in data
Set the missing values to the centroid coordinates of the clusters to which they were assigned
Implementation
import numpy as np
from sklearn.cluster import KMeans
def kmeans_missing(X, n_clusters, max_iter=10):
"""Perform K-Means clustering on data with missing values.
Args:
X: An [n_samples, n_features] array of data to cluster.
n_clusters: Number of clusters to form.
max_iter: Maximum number of EM iterations to perform.
Returns:
labels: An [n_samples] vector of integer labels.
centroids: An [n_clusters, n_features] array of cluster centroids.
X_hat: Copy of X with the missing values filled in.
"""
# Initialize missing values to their column means
missing = ~np.isfinite(X)
mu = np.nanmean(X, 0, keepdims=1)
X_hat = np.where(missing, mu, X)
for i in xrange(max_iter):
if i > 0:
# initialize KMeans with the previous set of centroids. this is much
# faster and makes it easier to check convergence (since labels
# won't be permuted on every iteration), but might be more prone to
# getting stuck in local minima.
cls = KMeans(n_clusters, init=prev_centroids)
else:
# do multiple random initializations in parallel
cls = KMeans(n_clusters, n_jobs=-1)
# perform clustering on the filled-in data
labels = cls.fit_predict(X_hat)
centroids = cls.cluster_centers_
# fill in the missing values based on their cluster centroids
X_hat[missing] = centroids[labels][missing]
# when the labels have stopped changing then we have converged
if i > 0 and np.all(labels == prev_labels):
break
prev_labels = labels
prev_centroids = cls.cluster_centers_
return labels, centroids, X_hat
Example with fake data
from sklearn.datasets import make_blobs
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
def make_fake_data(fraction_missing, n_clusters=5, n_samples=1500,
n_features=3, seed=None):
# complete data
gen = np.random.RandomState(seed)
X, true_labels = make_blobs(n_samples, n_features, n_clusters,
random_state=gen)
# with missing values
missing = gen.rand(*X.shape) < fraction_missing
Xm = np.where(missing, np.nan, X)
return X, true_labels, Xm
X, true_labels, Xm = make_fake_data(fraction_missing=0.3, n_clusters=5, seed=0)
labels, centroids, X_hat = kmeans_missing(Xm, n_clusters=5)
# plot the inferred points, color-coded according to the true cluster labels
fig, ax = plt.subplots(1, 2, subplot_kw={'projection':'3d', 'aspect':'equal'})
ax[0].scatter3D(X[:, 0], X[:, 1], X[:, 2], c=true_labels, cmap='gist_rainbow')
ax[1].scatter3D(X_hat[:, 0], X_hat[:, 1], X_hat[:, 2], c=true_labels,
cmap='gist_rainbow')
ax[0].set_title('Original data')
ax[1].set_title('Imputed (30% missing values)')
fig.tight_layout()
Benchmark
To assess the algorithm's performance, we can use the adjusted mutual information between the true and inferred cluster labels. A score of 1 is perfect performance and 0 represents chance:
from sklearn.metrics import adjusted_mutual_info_score
fraction = np.arange(0.0, 1.0, 0.05)
n_repeat = 10
scores = np.empty((2, fraction.shape[0], n_repeat))
for i, frac in enumerate(fraction):
for j in range(n_repeat):
X, true_labels, Xm = make_fake_data(fraction_missing=frac, n_clusters=5)
labels, centroids, X_hat = kmeans_missing(Xm, n_clusters=5)
any_missing = np.any(~np.isfinite(Xm), 1)
scores[0, i, j] = adjusted_mutual_info_score(labels, true_labels)
scores[1, i, j] = adjusted_mutual_info_score(labels[any_missing],
true_labels[any_missing])
fig, ax = plt.subplots(1, 1)
scores_all, scores_missing = scores
ax.errorbar(fraction * 100, scores_all.mean(-1),
yerr=scores_all.std(-1), label='All labels')
ax.errorbar(fraction * 100, scores_missing.mean(-1),
yerr=scores_missing.std(-1),
label='Labels with missing values')
ax.set_xlabel('% missing values')
ax.set_ylabel('Adjusted mutual information')
ax.legend(loc='best', frameon=False)
ax.set_ylim(0, 1)
ax.set_xlim(-5, 100)
Update:
In fact, after a quick Google search it seems that what I've come up with above is pretty much the same as the k-POD algorithm for K-means clustering of missing data (Chi, Chi & Baraniuk, 2016).
Here is a different algorithm that I use. Instead of replacing the missing values the values are ignored and in order to capture the differences between missing and non-missing i impliment missing dummies.
Compared to Alis algorithm it seems is easier for observations with missing observatons to jump from class to class. Since I do not fill the missing values.
I fortunely did not have the time to compare using Ali's beautiful code, but feel free to do it (I might do it when I get the time) and contribute to the discussion about the best method.
import numpy as np
class kmeans_missing(object):
def __init__(self,potential_centroids,n_clusters):
#initialize with potential centroids
self.n_clusters=n_clusters
self.potential_centroids=potential_centroids
def fit(self,data,max_iter=10,number_of_runs=1):
n_clusters=self.n_clusters
potential_centroids=self.potential_centroids
dist_mat=np.zeros((data.shape[0],n_clusters))
all_centroids=np.zeros((n_clusters,data.shape[1],number_of_runs))
costs=np.zeros((number_of_runs,))
for k in range(number_of_runs):
idx=np.random.choice(range(potential_centroids.shape[0]), size=(n_clusters), replace=False)
centroids=potential_centroids[idx]
clusters=np.zeros(data.shape[0])
old_clusters=np.zeros(data.shape[0])
for i in range(max_iter):
#Calc dist to centroids
for j in range(n_clusters):
dist_mat[:,j]=np.nansum((data-centroids[j])**2,axis=1)
#Assign to clusters
clusters=np.argmin(dist_mat,axis=1)
#Update clusters
for j in range(n_clusters):
centroids[j]=np.nanmean(data[clusters==j],axis=0)
if all(np.equal(clusters,old_clusters)):
break # Break when to change in clusters
if i==max_iter-1:
print('no convergence before maximal iterations are reached')
else:
clusters,old_clusters=old_clusters,clusters
all_centroids[:,:,k]=centroids
costs[k]=np.mean(np.min(dist_mat,axis=1))
self.costs=costs
self.cost=np.min(costs)
self.best_model=np.argmin(costs)
self.centroids=all_centroids[:,:,self.best_model]
self.all_centroids=all_centroids
def predict(self,data):
dist_mat=np.zeros((data.shape[0],self.n_clusters))
for j in range(self.n_clusters):
dist_mat[:,j]=np.nansum((data-self.centroids[j])**2,axis=1)
prediction=np.argmin(dist_mat,axis=1)
cost=np.min(dist_mat,axis=1)
return prediction,cost
Here is an example on how though it might be usefull.
from sklearn.datasets import make_blobs
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from kmeans_missing import *
def make_fake_data(fraction_missing, n_clusters=5, n_samples=1500,
n_features=2, seed=None):
# complete data
gen = np.random.RandomState(seed)
X, true_labels = make_blobs(n_samples, n_features, n_clusters,
random_state=gen)
# with missing values
missing = gen.rand(*X.shape) < fraction_missing
Xm = np.where(missing, np.nan, X)
return X, true_labels, Xm
X, true_labels, X_hat = make_fake_data(fraction_missing=0.3, n_clusters=3, seed=0)
X_missing_dummies=np.isnan(X_hat)
n_clusters=3
X_hat = np.concatenate((X_hat,X_missing_dummies),axis=1)
kmeans_m=kmeans_missing(X_hat,n_clusters)
kmeans_m.fit(X_hat,max_iter=100,number_of_runs=10)
print(kmeans_m.costs)
prediction,cost=kmeans_m.predict(X_hat)
for i in range(n_clusters):
print([np.mean((prediction==i)*(true_labels==j)) for j in range(3)],np.mean((prediction==i)))
--EDIT--
In this example the occurrences of missing values are completly random and when that is the case. Not adding the missing value dummies preforms better, since missing value dummies in that case is noise. Not including them would also be the correct thing to do in order to compare with Ali's algorithm.

Principal Component Analysis (PCA) in Python

I have a (26424 x 144) array and I want to perform PCA over it using Python. However, there is no particular place on the web that explains about how to achieve this task (There are some sites which just do PCA according to their own - there is no generalized way of doing so that I can find). Anybody with any sort of help will do great.
I posted my answer even though another answer has already been accepted; the accepted answer relies on a deprecated function; additionally, this deprecated function is based on Singular Value Decomposition (SVD), which (although perfectly valid) is the much more memory- and processor-intensive of the two general techniques for calculating PCA. This is particularly relevant here because of the size of the data array in the OP. Using covariance-based PCA, the array used in the computation flow is just 144 x 144, rather than 26424 x 144 (the dimensions of the original data array).
Here's a simple working implementation of PCA using the linalg module from SciPy. Because this implementation first calculates the covariance matrix, and then performs all subsequent calculations on this array, it uses far less memory than SVD-based PCA.
(the linalg module in NumPy can also be used with no change in the code below aside from the import statement, which would be from numpy import linalg as LA.)
The two key steps in this PCA implementation are:
calculating the covariance matrix; and
taking the eivenvectors & eigenvalues of this cov matrix
In the function below, the parameter dims_rescaled_data refers to the desired number of dimensions in the rescaled data matrix; this parameter has a default value of just two dimensions, but the code below isn't limited to two but it could be any value less than the column number of the original data array.
def PCA(data, dims_rescaled_data=2):
"""
returns: data transformed in 2 dims/columns + regenerated original data
pass in: data as 2D NumPy array
"""
import numpy as NP
from scipy import linalg as LA
m, n = data.shape
# mean center the data
data -= data.mean(axis=0)
# calculate the covariance matrix
R = NP.cov(data, rowvar=False)
# calculate eigenvectors & eigenvalues of the covariance matrix
# use 'eigh' rather than 'eig' since R is symmetric,
# the performance gain is substantial
evals, evecs = LA.eigh(R)
# sort eigenvalue in decreasing order
idx = NP.argsort(evals)[::-1]
evecs = evecs[:,idx]
# sort eigenvectors according to same index
evals = evals[idx]
# select the first n eigenvectors (n is desired dimension
# of rescaled data array, or dims_rescaled_data)
evecs = evecs[:, :dims_rescaled_data]
# carry out the transformation on the data using eigenvectors
# and return the re-scaled data, eigenvalues, and eigenvectors
return NP.dot(evecs.T, data.T).T, evals, evecs
def test_PCA(data, dims_rescaled_data=2):
'''
test by attempting to recover original data array from
the eigenvectors of its covariance matrix & comparing that
'recovered' array with the original data
'''
_ , _ , eigenvectors = PCA(data, dim_rescaled_data=2)
data_recovered = NP.dot(eigenvectors, m).T
data_recovered += data_recovered.mean(axis=0)
assert NP.allclose(data, data_recovered)
def plot_pca(data):
from matplotlib import pyplot as MPL
clr1 = '#2026B2'
fig = MPL.figure()
ax1 = fig.add_subplot(111)
data_resc, data_orig = PCA(data)
ax1.plot(data_resc[:, 0], data_resc[:, 1], '.', mfc=clr1, mec=clr1)
MPL.show()
>>> # iris, probably the most widely used reference data set in ML
>>> df = "~/iris.csv"
>>> data = NP.loadtxt(df, delimiter=',')
>>> # remove class labels
>>> data = data[:,:-1]
>>> plot_pca(data)
The plot below is a visual representation of this PCA function on the iris data. As you can see, a 2D transformation cleanly separates class I from class II and class III (but not class II from class III, which in fact requires another dimension).
You can find a PCA function in the matplotlib module:
import numpy as np
from matplotlib.mlab import PCA
data = np.array(np.random.randint(10,size=(10,3)))
results = PCA(data)
results will store the various parameters of the PCA.
It is from the mlab part of matplotlib, which is the compatibility layer with the MATLAB syntax
EDIT:
on the blog nextgenetics I found a wonderful demonstration of how to perform and display a PCA with the matplotlib mlab module, have fun and check that blog!
Another Python PCA using numpy. The same idea as #doug but that one didn't run.
from numpy import array, dot, mean, std, empty, argsort
from numpy.linalg import eigh, solve
from numpy.random import randn
from matplotlib.pyplot import subplots, show
def cov(X):
"""
Covariance matrix
note: specifically for mean-centered data
note: numpy's `cov` uses N-1 as normalization
"""
return dot(X.T, X) / X.shape[0]
# N = data.shape[1]
# C = empty((N, N))
# for j in range(N):
# C[j, j] = mean(data[:, j] * data[:, j])
# for k in range(j + 1, N):
# C[j, k] = C[k, j] = mean(data[:, j] * data[:, k])
# return C
def pca(data, pc_count = None):
"""
Principal component analysis using eigenvalues
note: this mean-centers and auto-scales the data (in-place)
"""
data -= mean(data, 0)
data /= std(data, 0)
C = cov(data)
E, V = eigh(C)
key = argsort(E)[::-1][:pc_count]
E, V = E[key], V[:, key]
U = dot(data, V) # used to be dot(V.T, data.T).T
return U, E, V
""" test data """
data = array([randn(8) for k in range(150)])
data[:50, 2:4] += 5
data[50:, 2:5] += 5
""" visualize """
trans = pca(data, 3)[0]
fig, (ax1, ax2) = subplots(1, 2)
ax1.scatter(data[:50, 0], data[:50, 1], c = 'r')
ax1.scatter(data[50:, 0], data[50:, 1], c = 'b')
ax2.scatter(trans[:50, 0], trans[:50, 1], c = 'r')
ax2.scatter(trans[50:, 0], trans[50:, 1], c = 'b')
show()
Which yields the same thing as the much shorter
from sklearn.decomposition import PCA
def pca2(data, pc_count = None):
return PCA(n_components = 4).fit_transform(data)
As I understand it, using eigenvalues (first way) is better for high-dimensional data and fewer samples, whereas using Singular value decomposition is better if you have more samples than dimensions.
This is a job for numpy.
And here's a tutorial demonstrating how pincipal component analysis can be done using numpy's built-in modules like mean,cov,double,cumsum,dot,linalg,array,rank.
http://glowingpython.blogspot.sg/2011/07/principal-component-analysis-with-numpy.html
Notice that scipy also has a long explanation here
- https://github.com/scikit-learn/scikit-learn/blob/babe4a5d0637ca172d47e1dfdd2f6f3c3ecb28db/scikits/learn/utils/extmath.py#L105
with the scikit-learn library having more code examples -
https://github.com/scikit-learn/scikit-learn/blob/babe4a5d0637ca172d47e1dfdd2f6f3c3ecb28db/scikits/learn/utils/extmath.py#L105
Here are scikit-learn options. With both methods, StandardScaler was used because PCA is effected by scale
Method 1: Have scikit-learn choose the minimum number of principal components such that at least x% (90% in example below) of the variance is retained.
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
iris = load_iris()
# mean-centers and auto-scales the data
standardizedData = StandardScaler().fit_transform(iris.data)
pca = PCA(.90)
principalComponents = pca.fit_transform(X = standardizedData)
# To get how many principal components was chosen
print(pca.n_components_)
Method 2: Choose the number of principal components (in this case, 2 was chosen)
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
iris = load_iris()
standardizedData = StandardScaler().fit_transform(iris.data)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X = standardizedData)
# to get how much variance was retained
print(pca.explained_variance_ratio_.sum())
Source: https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60
UPDATE: matplotlib.mlab.PCA is since release 2.2 (2018-03-06) indeed deprecated.
The library matplotlib.mlab.PCA (used in this answer) is not deprecated. So for all the folks arriving here via Google, I'll post a complete working example tested with Python 2.7.
Use the following code with care as it uses a now deprecated library!
from matplotlib.mlab import PCA
import numpy
data = numpy.array( [[3,2,5], [-2,1,6], [-1,0,4], [4,3,4], [10,-5,-6]] )
pca = PCA(data)
Now in `pca.Y' is the original data matrix in terms of the principal components basis vectors. More details about the PCA object can be found here.
>>> pca.Y
array([[ 0.67629162, -0.49384752, 0.14489202],
[ 1.26314784, 0.60164795, 0.02858026],
[ 0.64937611, 0.69057287, -0.06833576],
[ 0.60697227, -0.90088738, -0.11194732],
[-3.19578784, 0.10251408, 0.00681079]])
You can use matplotlib.pyplot to draw this data, just to convince yourself that the PCA yields "good" results. The names list is just used to annotate our five vectors.
import matplotlib.pyplot
names = [ "A", "B", "C", "D", "E" ]
matplotlib.pyplot.scatter(pca.Y[:,0], pca.Y[:,1])
for label, x, y in zip(names, pca.Y[:,0], pca.Y[:,1]):
matplotlib.pyplot.annotate( label, xy=(x, y), xytext=(-2, 2), textcoords='offset points', ha='right', va='bottom' )
matplotlib.pyplot.show()
Looking at our original vectors we'll see that data[0] ("A") and data[3] ("D") are rather similar as are data[1] ("B") and data[2] ("C"). This is reflected in the 2D plot of our PCA transformed data.
In addition to all the other answers, here is some code to plot the biplot using sklearn and matplotlib.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import StandardScaler
iris = datasets.load_iris()
X = iris.data
y = iris.target
#In general a good idea is to scale the data
scaler = StandardScaler()
scaler.fit(X)
X=scaler.transform(X)
pca = PCA()
x_new = pca.fit_transform(X)
def myplot(score,coeff,labels=None):
xs = score[:,0]
ys = score[:,1]
n = coeff.shape[0]
scalex = 1.0/(xs.max() - xs.min())
scaley = 1.0/(ys.max() - ys.min())
plt.scatter(xs * scalex,ys * scaley, c = y)
for i in range(n):
plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = 'r',alpha = 0.5)
if labels is None:
plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, "Var"+str(i+1), color = 'g', ha = 'center', va = 'center')
else:
plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, labels[i], color = 'g', ha = 'center', va = 'center')
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel("PC{}".format(1))
plt.ylabel("PC{}".format(2))
plt.grid()
#Call the function. Use only the 2 PCs.
myplot(x_new[:,0:2],np.transpose(pca.components_[0:2, :]))
plt.show()
I've made a little script for comparing the different PCAs appeared as an answer here:
import numpy as np
from scipy.linalg import svd
shape = (26424, 144)
repeat = 20
pca_components = 2
data = np.array(np.random.randint(255, size=shape)).astype('float64')
# data normalization
# data.dot(data.T)
# (U, s, Va) = svd(data, full_matrices=False)
# data = data / s[0]
from fbpca import diffsnorm
from timeit import default_timer as timer
from scipy.linalg import svd
start = timer()
for i in range(repeat):
(U, s, Va) = svd(data, full_matrices=False)
time = timer() - start
err = diffsnorm(data, U, s, Va)
print('svd time: %.3fms, error: %E' % (time*1000/repeat, err))
from matplotlib.mlab import PCA
start = timer()
_pca = PCA(data)
for i in range(repeat):
U = _pca.project(data)
time = timer() - start
err = diffsnorm(data, U, _pca.fracs, _pca.Wt)
print('matplotlib PCA time: %.3fms, error: %E' % (time*1000/repeat, err))
from fbpca import pca
start = timer()
for i in range(repeat):
(U, s, Va) = pca(data, pca_components, True)
time = timer() - start
err = diffsnorm(data, U, s, Va)
print('facebook pca time: %.3fms, error: %E' % (time*1000/repeat, err))
from sklearn.decomposition import PCA
start = timer()
_pca = PCA(n_components = pca_components)
_pca.fit(data)
for i in range(repeat):
U = _pca.transform(data)
time = timer() - start
err = diffsnorm(data, U, _pca.explained_variance_, _pca.components_)
print('sklearn PCA time: %.3fms, error: %E' % (time*1000/repeat, err))
start = timer()
for i in range(repeat):
(U, s, Va) = pca_mark(data, pca_components)
time = timer() - start
err = diffsnorm(data, U, s, Va.T)
print('pca by Mark time: %.3fms, error: %E' % (time*1000/repeat, err))
start = timer()
for i in range(repeat):
(U, s, Va) = pca_doug(data, pca_components)
time = timer() - start
err = diffsnorm(data, U, s[:pca_components], Va.T)
print('pca by doug time: %.3fms, error: %E' % (time*1000/repeat, err))
pca_mark is the pca in Mark's answer.
pca_doug is the pca in doug's answer.
Here is an example output (but the result depends very much on the data size and pca_components, so I'd recommend to run your own test with your own data. Also, facebook's pca is optimized for normalized data, so it will be faster and more accurate in that case):
svd time: 3212.228ms, error: 1.907320E-10
matplotlib PCA time: 879.210ms, error: 2.478853E+05
facebook pca time: 485.483ms, error: 1.260335E+04
sklearn PCA time: 169.832ms, error: 7.469847E+07
pca by Mark time: 293.758ms, error: 1.713129E+02
pca by doug time: 300.326ms, error: 1.707492E+02
EDIT:
The diffsnorm function from fbpca calculates the spectral-norm error of a Schur decomposition.
This will may be the simplest answer one can find for the PCA including easily understandable steps. Let say we want to retain 2 principal dimensions from the 144 which provides maximum information.
Firstly, convert your 2-D array to a dataframe:
import pandas as pd
# Here X is your array of size (26424 x 144)
data = pd.DataFrame(X)
Then, there are two methods one can go with:
Method 1: Manual calculation
Step 1: Apply column standardization on X
from sklearn import preprocessing
scalar = preprocessing.StandardScaler()
standardized_data = scalar.fit_transform(data)
Step 2: Find Co-variance matrix S of original matrix X
sample_data = standardized_data
covar_matrix = np.cov(sample_data)
Step 3: Find eigen values and eigen vectors of S (here 2D, so 2 of each)
from scipy.linalg import eigh
# eigh() function will provide eigen-values and eigen-vectors for a given matrix.
# eigvals=(low value, high value) takes eigen value numbers in ascending order
values, vectors = eigh(covar_matrix, eigvals=(142,143))
# Converting the eigen vectors into (2,d) shape for easyness of further computations
vectors = vectors.T
Step 4: Transform the data
# Projecting the original data sample on the plane formed by two principal eigen vectors by vector-vector multiplication.
new_coordinates = np.matmul(vectors, sample_data.T)
print(new_coordinates.T)
This new_coordinates.T will be of size (26424 x 2) with 2 principal components.
Method 2: Using Scikit-Learn
Step 1: Apply column standardization on X
from sklearn import preprocessing
scalar = preprocessing.StandardScaler()
standardized_data = scalar.fit_transform(data)
Step 2: Initializing the pca
from sklearn import decomposition
# n_components = numbers of dimenstions you want to retain
pca = decomposition.PCA(n_components=2)
Step 3: Using pca to fit the data
# This line takes care of calculating co-variance matrix, eigen values, eigen vectors and multiplying top 2 eigen vectors with data-matrix X.
pca_data = pca.fit_transform(sample_data)
This pca_data will be of size (26424 x 2) with 2 principal components.
For the sake def plot_pca(data): will work, it is necessary to replace the lines
data_resc, data_orig = PCA(data)
ax1.plot(data_resc[:, 0], data_resc[:, 1], '.', mfc=clr1, mec=clr1)
with lines
newData, data_resc, data_orig = PCA(data)
ax1.plot(newData[:, 0], newData[:, 1], '.', mfc=clr1, mec=clr1)
this sample code loads the Japanese yield curve, and creates PCA components.
It then estimates a given date's move using the PCA and compares it against the actual move.
%matplotlib inline
import numpy as np
import scipy as sc
from scipy import stats
from IPython.display import display, HTML
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import datetime
from datetime import timedelta
import quandl as ql
start = "2016-10-04"
end = "2019-10-04"
ql_data = ql.get("MOFJ/INTEREST_RATE_JAPAN", start_date = start, end_date = end).sort_index(ascending= False)
eigVal_, eigVec_ = np.linalg.eig(((ql_data[:300]).diff(-1)*100).cov()) # take latest 300 data-rows and normalize to bp
print('number of PCA are', len(eigVal_))
loc_ = 10
plt.plot(eigVec_[:,0], label = 'PCA1')
plt.plot(eigVec_[:,1], label = 'PCA2')
plt.plot(eigVec_[:,2], label = 'PCA3')
plt.xticks(range(len(eigVec_[:,0])), ql_data.columns)
plt.legend()
plt.show()
x = ql_data.diff(-1).iloc[loc_].values * 100 # set the differences
x_ = x[:,np.newaxis]
a1, _, _, _ = np.linalg.lstsq(eigVec_[:,0][:, np.newaxis], x_) # linear regression without intercept
a2, _, _, _ = np.linalg.lstsq(eigVec_[:,1][:, np.newaxis], x_)
a3, _, _, _ = np.linalg.lstsq(eigVec_[:,2][:, np.newaxis], x_)
pca_mv = m1 * eigVec_[:,0] + m2 * eigVec_[:,1] + m3 * eigVec_[:,2] + c1 + c2 + c3
pca_MV = a1[0][0] * eigVec_[:,0] + a2[0][0] * eigVec_[:,1] + a3[0][0] * eigVec_[:,2]
pca_mV = b1 * eigVec_[:,0] + b2 * eigVec_[:,1] + b3 * eigVec_[:,2]
display(pd.DataFrame([eigVec_[:,0], eigVec_[:,1], eigVec_[:,2], x, pca_MV]))
print('PCA1 regression is', a1, a2, a3)
plt.plot(pca_MV)
plt.title('this is with regression and no intercept')
plt.plot(ql_data.diff(-1).iloc[loc_].values * 100, )
plt.title('this is with actual moves')
plt.show()

Categories