Related
I'm currently using K-means clustering on text data (marketing activity descriptions) vectorized by tf-idf, and have an elbow-informed optimal k, have made a scatterplot using PCA, and have added a column with cluster labels to my data frame (all in Python). So in one sense I can interpret my clustering model by reviewing the labeled text data.
However, I would like to also be able to extract N most frequent words from each of the clusters.
First I'm reading in the data and getting an optimal k via elbow:
# import pandas to use dataframes and handle tabular data, e.g. the labeled text dataset for clustering
import pandas as pd
# read in only the columns we need using pandas' "read_csv" function
col_list = ["DOC_ID", "TEXT", "CODE"]
data = pd.read_csv('/Users/williammarcellino/Downloads/AEMO_Sample.csv', usecols=col_list, encoding='latin-1')
# replace embedded newline characters with a space so every document is a single line of text
data = data.replace(r'\n',' ', regex=True)
# import sklearn for TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# vectorize text in the df and fit the TEXT data. Builds a vocabulary (a python dict) to map words
# to feature indices and computes word occurrence frequency (sparse matrix). Word frequencies are then
# reweighted using the Inverse Document Frequency (IDF) vector collected feature-wise over the corpus.
# NOTE: stop_words must be the *string* 'english' to select the built-in English stop list;
# the set {'english'} would be treated as a custom list holding only the single word "english".
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data.TEXT)
# use elbow method to determine optimal "K"
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
Sum_of_squared_distances = []
# fit one model per candidate K and record its inertia (the value stored in km.inertia_);
# the "elbow" is the K after which the curve stops dropping sharply
K = range(6,16)
for k in K:
    km = KMeans(n_clusters=k, max_iter=200, n_init=10)
    km = km.fit(X)
    Sum_of_squared_distances.append(km.inertia_)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
Based on that, I build a model at k=9:
# optimal "K" value from elbow plot above
true_k = 9
# define an unsupervised clustering "model" using KMeans
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=10)
# fit model to data
model.fit(X)
# cluster labels are integers, one per row of X -- a human needs to make them interpretable
labels = model.labels_
title = [data.DOC_ID]
# make a "clustered" version of the dataframe; copy so the original `data` is not mutated
# (a plain assignment would only create a second name for the same object)
data_cl = data.copy()
# add label values as a new column, "Cluster"
data_cl['Cluster'] = labels
# I used this to look at my output on a small sample; remove for large datasets in actual analyses
print(data_cl)
# output our new, clustered dataframe to a csv file
data_cl.to_csv('/Users/me/Downloads/AEMO_Sample_clustered.csv')
Finally I plot the principal components:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
# Reuse the assignments of the model that is already fitted. Calling fit_predict() here would
# re-run KMeans from a new initialisation, so the colours could disagree with the labels
# that were stored earlier.
model_indices = model.labels_
# PCA needs a dense array, so the sparse tf-idf matrix is densified before projecting to 2-D
pca = PCA(n_components=2)
scatter_plot_points = pca.fit_transform(X.toarray())
# one colour per cluster; the list must hold at least as many entries as there are clusters
colors = ["r", "b", "c", "y", "m", "paleturquoise", "g", 'aquamarine', 'tab:orange']
x_axis = scatter_plot_points[:, 0]
y_axis = scatter_plot_points[:, 1]
fig, ax = plt.subplots(figsize=(20,10))
ax.scatter(x_axis, y_axis, c=[colors[d] for d in model_indices])
# write the cluster number next to every point, nudged slightly right so it does not cover the marker
for i, txt in enumerate(model_indices):
    ax.annotate(str(txt), (x_axis[i]+.005, y_axis[i]), size=10)
Any help extracting and plotting top terms from each cluster would be a great help. Thanks.
I was able to answer my question by using code found here.
def get_top_features_cluster(tf_idf_array, prediction, n_feats, feature_names=None):
    """Return the highest-scoring tf-idf terms for every cluster.

    Parameters
    ----------
    tf_idf_array : 2-D array (or sparse matrix) of tf-idf weights, one row per document.
    prediction : sequence of cluster labels, one per row of ``tf_idf_array``.
    n_feats : number of top terms to keep for each cluster.
    feature_names : optional sequence mapping column index -> term. When omitted,
        the names are taken from the module-level ``tf_idf_vectorizor`` object,
        as the original code did.

    Returns
    -------
    list of pandas.DataFrame, one per distinct label (in sorted label order), each
    with columns ``features`` and ``score`` sorted by descending mean weight.
    """
    if feature_names is None:
        # Newer scikit-learn exposes get_feature_names_out(); older releases only
        # have get_feature_names(). Use whichever the vectorizer provides.
        getter = getattr(tf_idf_vectorizor, "get_feature_names_out", None)
        if getter is None:
            getter = tf_idf_vectorizor.get_feature_names
        feature_names = getter()

    # Use the labels that were passed in; the previous version overwrote them with
    # km.predict(scatter_plot_points), i.e. predictions made on unrelated globals.
    prediction = np.asarray(prediction)

    dfs = []
    for label in np.unique(prediction):
        members = tf_idf_array[prediction == label]  # rows belonging to this cluster
        # asarray + ravel turns the result into a flat 1-D vector even when the
        # input is a sparse matrix or np.matrix (whose mean() stays 2-D)
        x_means = np.asarray(members.mean(axis=0)).ravel()
        top_idx = np.argsort(x_means)[::-1][:n_feats]  # indices of the n_feats largest means
        best_features = [(feature_names[i], x_means[i]) for i in top_idx]
        dfs.append(pd.DataFrame(best_features, columns=['features', 'score']))
    return dfs
# Example call: ask for the 15 highest-scoring terms of each cluster.
# NOTE(review): tf_idf_array and prediction are not defined in the visible snippet --
# they have to be created before this line runs.
dfs = get_top_features_cluster(tf_idf_array, prediction, 15)
this code is not working for me, so I did something like:
# keep only rows that actually contain text; the same selection is reused for the
# vectorizer input and for the title/cluster table so the two stay aligned
texts = dfi['text'][~dfi['text'].isna()]
vectorizer = TfidfVectorizer(stop_words=stopwords)
X = vectorizer.fit_transform(texts)
print('How many clusters do you want to use?')
true_k = int(input())
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=10)
model.fit(X)
labels = model.labels_
clusters = pd.DataFrame(list(zip(texts, labels)), columns=['title', 'cluster'])
# get_feature_names() is gone from current scikit-learn; prefer the replacement and
# fall back to the old name on releases that predate it
try:
    features = vectorizer.get_feature_names_out()
except AttributeError:
    features = vectorizer.get_feature_names()
features = np.array(features)
n_feats = 15
for i in range(true_k):
    # boolean numpy mask selects the rows of the sparse matrix assigned to cluster i
    cclust = X[labels == i]
    # .toarray() replaces the deprecated sparse ".A" shortcut
    meanWts = cclust.toarray().mean(axis=0)
    sorted_mean_ix = np.argsort(meanWts)[::-1][:n_feats]  # indices with top 15 scores
    # get most important feature names:
    print(features[sorted_mean_ix])
I have not clustered data in a while, and at the moment I have a massive list of accounts with their respective areas (or OUs in the table below).
I have used kmeans and kmodes to try to cluster based on OU - meaning that I want the output to group the 17 OUs I have and cluster them based on the provided information. Thus far the output has provided me with clustering based on each record individually and not based on each OU. Can someone help me figure out how to group the output and then cluster it? Below is a sample of the code used.
# Build a KModes model with 3 clusters (random initialisation, 5 restarts, progress output on)
kmode = KModes(
    n_clusters=3,
    init="random",
    n_init=5,
    verbose=1,
)
# fit the model and get one cluster id per row of df
clusters = kmode.fit_predict(df)
clusters
# put the predicted cluster ids into the original dataset as the first column
df.insert(loc=0, column="Cluster", value=clusters, allow_duplicates=True)
df.head(10)
I don't have access to your data set, but below is a generic example of how to do clustering.
# Cluster analysis, or clustering, is an unsupervised machine learning task.
# It involves automatically discovering natural grouping in data. Unlike supervised learning (like predictive modeling),
# clustering algorithms only interpret the input data and find natural groups or clusters in feature space.
import statsmodels.api as sm
import numpy as np
import pandas as pd
mtcars = sm.datasets.get_rdataset("mtcars", "datasets", cache=True).data
df_cars = pd.DataFrame(mtcars)
df_cars.head()
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import KMeans
from matplotlib import pyplot
# define dataset; copy the two columns so that adding the label column below writes to
# an independent frame instead of a slice of df_cars (avoids the chained-assignment warning)
X = df_cars[['mpg','hp']].copy()
# define the model
model = KMeans(n_clusters=8)
# fit the model
model.fit(X)
# assign a cluster to each example
yhat = model.predict(X)
# store the cluster id next to the features so it can drive the point colours
X['kmeans'] = yhat
pyplot.scatter(X['mpg'], X['hp'], c=X['kmeans'], cmap='rainbow', s=50, alpha=0.8)
See the link below for more details.
https://github.com/ASH-WICUS/Notebooks/blob/master/Clustering%20Algorithms%20Compared.ipynb
I'm performing PCA preprocessing on a dataset of 78 variables. How would I calculate the optimal value of PCA variables?
My first thought was to start at, for example, 5 and working my way up and calculating accuracy . However, for obvious reasons this wasn't a time effective means of calculating.
Does anyone have any suggestions/experience? Or even a methodology for calculating the optimal value?
First look at the dataset distribution and then use explained_variance_ to find the number of components.
Start with projecting your samples on a 2-D graph.
Assume I have a face dataset (Olivetti-faces) 40 people and each person has 10 samples. Overall 400 images. We will split 280 trains and 120 test samples.
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split
olivetti = fetch_olivetti_faces()
x = olivetti.images  # all images, still 3-D at this point
y = olivetti.target  # one label per image
# hold out 30% of the samples for testing; fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)
# flatten every image into a single row: second axis times third axis values per sample
n_pixels = x.shape[1] * x.shape[2]
x_train = x_train.reshape((x_train.shape[0], n_pixels))
x_test = x_test.reshape((x_test.shape[0], n_pixels))
x = x.reshape((x.shape[0], n_pixels))
Now we want to see how pixels are distributed. To understand clearly, we will display the pixels in a 2-D graph.
from sklearn.decomposition import PCA
from matplotlib.pyplot import figure, get_cmap, colorbar, show
class_num = 40
sample_num = 10
# number of leading samples to draw
idx_range = class_num * sample_num
# project the full dataset onto its first two principal components
pca = PCA(n_components=2).fit_transform(x)
first_pc = pca[:idx_range, 0]
second_pc = pca[:idx_range, 1]
# colour map resampled to one entry per class
c_map = get_cmap(name='jet', lut=class_num)
fig = figure(figsize=(6, 3), dpi=300)
ax = fig.add_subplot(1, 1, 1)
scatter = ax.scatter(first_pc, second_pc, c=y[:idx_range], s=10, cmap=c_map)
ax.set_title("PCA projection of {} people".format(class_num))
ax.set_xlabel("First Principal Component")
ax.set_ylabel("Second Principal Component")
colorbar(mappable=scatter)
show()
We can say 40 people, each with 10 samples are not distinguishable with only 2 principal components.
Please remember we created this graph from the main dataset, neither train nor test.
How many principal components do we need to clearly distinguish the data?
To answer the above question we will be using explained_variance_.
From the documentation:
The amount of variance explained by each of the selected components. Equal to n_components largest eigenvalues of the covariance matrix of X.
from matplotlib.pyplot import plot, xlabel, ylabel
# Fit PCA without limiting n_components, then plot the variance explained by each
# component in order, so the point where the curve flattens can be read off the graph.
pca2 = PCA().fit(x)
plot(pca2.explained_variance_, linewidth=2)
xlabel('Components')
# axis label spelling corrected ("Variaces" -> "Variances")
ylabel('Explained Variances')
show()
From the above graph, we can see after 100 components PCA distinguishes the people.
Simplified-code:
from sklearn.datasets import fetch_olivetti_faces
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# return_X_y=True yields (data, target); only the data is needed here
x, _ = fetch_olivetti_faces(return_X_y=True)
# fit PCA with the default number of components
pca2 = PCA()
pca2.fit(x)
# per-component explained variance, in component order
variances = pca2.explained_variance_
plt.plot(variances, linewidth=2)
plt.ylabel('Explained Variances')
plt.xlabel('Components')
plt.show()
Today I'm working on a dataset from Kaggle https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data. I would like to segment my dataset by beds, baths, neighborhood and use a DBSCAN to get a clustering by price in each segment. The problem is because each segment is different, I don't want to use the same epsilon for all my dataset but for each segment the best epsilon, do you know an efficient way to do it ?
from sklearn.cluster import DBSCAN
import sklearn.utils
from sklearn.preprocessing import StandardScaler
sklearn.utils.check_random_state(1000)
# Select the clustering columns, replace NaNs via nan_to_num, then standardise
feature_columns = ['beds','baths','neighborhood','price']
Clus_dataSet = StandardScaler().fit_transform(np.nan_to_num(pdf[feature_columns]))
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=6).fit(Clus_dataSet)
labels = db.labels_
# boolean mask that is True at the positions listed in core_sample_indices_
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
# store the cluster id of every row back on the source frame
pdf["Clus_Db"] = labels
# clusterNum counts every distinct label; realClusterNum leaves out the -1 label when present
clusterNum = len(set(labels))
realClusterNum = clusterNum - (1 if -1 in labels else 0)
Thank you.
A heuristic for the setting of Epsilon and MinPts parameters has been proposed in the original DBSCAN paper
Once the MinPts value is set (e.g. 2 ∗ Number of features) the partitioning result strongly depends on Epsilon. The heuristic suggests to infer epsilon through a visual analysis of the k-dist plot.
A toy example of the procedure with two gaussian distributions is reported in the following.
from sklearn.neighbors import NearestNeighbors
from matplotlib import pyplot as plt
from sklearn.datasets import make_biclusters
# toy dataset: 200 x 2 matrix generated with 2 biclusters, values between 0 and 1
data,lab,_ = make_biclusters((200,2), 2, noise=0.1, minval=0, maxval=1)
minpts = 4
# query the minpts nearest neighbours of every point within the dataset itself
nbrs = NearestNeighbors(n_neighbors=minpts, algorithm='ball_tree')
nbrs = nbrs.fit(data)
distances, indices = nbrs.kneighbors(data)
# last column = distance to the farthest of the minpts neighbours returned for each point
k_dist = list(distances[:, -1])
f,ax = plt.subplots(1,2,figsize = (10,5))
kdist_ax, data_ax = ax
# left panel: sorted k-distances, used to look for the "elbow"
kdist_ax.plot(sorted(k_dist))
kdist_ax.set_title('k-dist plot for k = minpts = 4')
kdist_ax.set_xlabel('object index after sorting by k-distance')
kdist_ax.set_ylabel('k-distance')
# right panel: the raw points coloured by the first row indicator returned above
data_ax.scatter(data[:,0],data[:,1],c = lab[0])
data_ax.set_title('original data')
In the resulting k-dist plot, the "elbow" theoretically divides noise objects from cluster objects and indeed gives an indication on a plausible range of values for Epsilon (tailored on the dataset in combination with the selected value of MinPts). In this toy example, I would say between 0.05 and 0.075.
After doing PCA on my data and plotting the kmeans clusters, my plot looks really weird. The centers of the clusters and scatter plot of the points do not make sense to me. Here is my code:
#clicks, conversion, bounce and search are lists of values.
# NOTE(review): the trailing dots in the three lists below look like placeholders for
# elided values rather than literal code -- confirm before running.
clicks=[2,0,0,8,7,...]
conversion = [1,0,0,6,0...]
bounce = [2,4,5,0,1....]
# stack the three lists and transpose: one row per position in the lists, three columns
X = np.array([clicks,conversion, bounce]).T
y = np.array(search)
num_clusters = 5
# fit a 2-component whitened PCA on X and keep the projected points for plotting
pca=PCA(n_components=2, whiten=True)
data2D = pca.fit_transform(X)
# Python 2 print statement
print data2D
>>> [[-0.07187948 -0.17784291]
[-0.07173769 -0.26868727]
[-0.07173789 -0.26867958]
...,
[-0.06942414 -0.25040886]
[-0.06950897 -0.19591147]
[-0.07172973 -0.2687937 ]]
# KMeans is fitted on the original X here, not on the PCA-reduced data2D
km = KMeans(n_clusters=num_clusters, init='k-means++',n_init=10, verbose=1)
km.fit_transform(X)
labels=km.labels_
# fit_transform re-fits the PCA on the centroids alone, so this projection is
# computed independently of the one that produced data2D
centers2D = pca.fit_transform(km.cluster_centers_)
# map each distinct label to one colour, then build the per-point colour list
colors=['#000000','#FFFFFF','#FF0000','#00FF00','#0000FF']
col_map=dict(zip(set(labels),colors))
label_color = [col_map[l] for l in labels]
plt.scatter( data2D[:,0], data2D[:,1], c=label_color)
# NOTE(review): plt.hold is not available in current Matplotlib releases -- confirm the version in use
plt.hold(True)
# draw the projected centroids as red crosses on top of the points
plt.scatter(centers2D[:,0], centers2D[:,1], marker='x', c='r')
plt.show()
The red crosses are the center of the clusters. Any help would be great.
Your ordering of PCA and KMeans is screwing things up...
Here is what you need to do:
Normalize your data.
Perform PCA on X to reduce the dimensions from 5 to 2 and produce Data2D
Normalize again
Cluster Data2D with KMeans
Plot the Centroids on top of Data2D.
Whereas, here is what you have done above:
Perform PCA on X to reduce the dimensions from 5 to 2 to produce Data2D
Cluster the original data, X, in 5 dimensions.
Perform a separate PCA on your cluster centroids, which produces a completely different 2D subspace for the centroids.
Plot the PCA reduced Data2D with the PCA reduced centroids on top even though these no longer are coupled properly.
Normalization:
Take a look at the code below and you'll see that it puts the centroids right where they need to be. The normalization is key and is completely reversible. ALWAYS normalize your data when you cluster as the distance metrics need to move through all of the spaces equally. Clustering is one of the most important times to normalize your data, but in general... ALWAYS NORMALIZE :-)
A heuristic discussion that goes beyond your original question:
The entire point of dimensionality reduction is to make the KMeans clustering easier and to project out dimensions which don't add to the variance of the data. So you should pass the reduced data to your clustering algorithm. I'll add that there are very few 5D datasets which can be projected down to 2D without throwing out a lot of variance i.e. look at the PCA diagnostics to see whether 90% of the original variance has been preserved. If not, then you might not want to be so aggressive in your PCA.
New Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns
%matplotlib inline
# read your data, replace 'stackoverflow.csv' with your file path
# (usecols must be passed as a keyword argument: usecols=[...])
df = pd.read_csv('/Users/angus/Desktop/Downloads/stackoverflow.csv', usecols=[0, 2, 4], names=['freq', 'visit_length', 'conversion_cnt'], header=0).dropna()
df.describe()
# Normalize the data: centre each column on its mean and scale by its range
df_norm = (df - df.mean()) / (df.max() - df.min())
num_clusters = 5
pca = PCA(n_components=2)
UnNormdata2D = pca.fit_transform(df_norm)
# Check the resulting variance: fraction of total variance kept by each component
var = pca.explained_variance_ratio_
print("Varience after PCA: ", var)
# Normalize again following PCA: data2D
# (mean/max/min are taken over the whole 2-D array here, not per column)
data2D = (UnNormdata2D - UnNormdata2D.mean()) / (UnNormdata2D.max() - UnNormdata2D.min())
print("Data2D: ")
print(data2D)
# cluster the reduced data so that the centroids live in the same 2-D space as the points
km = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10, verbose=1)
km.fit(data2D)
labels = km.labels_
centers2D = km.cluster_centers_
# map each distinct label to one colour, then build the per-point colour list
colors = ['#000000','#FFFFFF','#FF0000','#00FF00','#0000FF']
col_map = dict(zip(set(labels), colors))
label_color = [col_map[l] for l in labels]
plt.scatter(data2D[:,0], data2D[:,1], c=label_color)
# no plt.hold(True) needed: successive plotting calls draw on the same axes by default
plt.scatter(centers2D[:,0], centers2D[:,1], marker='x', s=150.0, color='purple')
plt.show()
Plot:
Output:
Varience after PCA: [ 0.65725709 0.29875307]
Data2D:
[[-0.00338421 -0.0009403 ]
[-0.00512081 -0.00095038]
[-0.00512081 -0.00095038]
...,
[-0.00477349 -0.00094836]
[-0.00373153 -0.00094232]
[-0.00512081 -0.00095038]]
Initialization complete
Iteration 0, inertia 51.225
Iteration 1, inertia 38.597
Iteration 2, inertia 36.837
...
...
Converged at iteration 31
Hope this helps!
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# load columns 0, 2 and 4 of the CSV under new names (header=0 makes the file's own
# header row get replaced by `names`); replace 'stackoverflow.csv' with your file path
df = pd.read_csv(
    'stackoverflow.csv',
    usecols=[0, 2, 4],
    names=['freq', 'visit_length', 'conversion_cnt'],
    header=0,
)
# discard rows with missing values
df = df.dropna()
df.describe()
Out[3]:
freq visit_length conversion_cnt
count 289705.0000 289705.0000 289705.0000
mean 0.2624 20.7598 0.0748
std 0.4399 55.0571 0.2631
min 0.0000 1.0000 0.0000
25% 0.0000 6.0000 0.0000
50% 0.0000 10.0000 0.0000
75% 1.0000 21.0000 0.0000
max 1.0000 2500.0000 1.0000
# turn freq and conversion_cnt into 0/1 indicator columns
df['freq'] = np.where(df['freq'] > 1.0, 1, 0)
df['conversion_cnt'] = np.where(df['conversion_cnt'] > 0.0, 1, 0)
feature_names = df.columns
X_raw = df.values
# reduce the feature matrix to two principal components
transformer = PCA(n_components=2)
X_2d = transformer.fit_transform(X_raw)
# share of the variance captured by each of the two components
transformer.explained_variance_ratio_
Out[4]: array([ 9.9991e-01, 6.6411e-05])
# cluster the 2-D projection into 5 groups
estimator = KMeans(n_clusters=5, init='k-means++', n_init=10, verbose=1)
estimator.fit(X_2d)
labels = estimator.labels_
# pair every distinct label with a colour, then look up the colour of each point
colors = ['#000000','#FFFFFF','#FF0000','#00FF00','#0000FF']
col_map = {lab: col for lab, col in zip(set(labels), colors)}
label_color = [col_map[lab] for lab in labels]
fig, ax = plt.subplots()
# points first, then the cluster centres as red crosses on top
ax.scatter(X_2d[:,0], X_2d[:,1], c=label_color)
centers = estimator.cluster_centers_
ax.scatter(centers[:,0], centers[:,1], marker='x', s=50, c='r')
KMeans tries to minimize within-group Euclidean distance, and this may or may not be appropriate for your data. Just based on the graph, I would consider a Gaussian Mixture Model to do the unsupervised clustering.
Also, if you have superior knowledge on which observations might be classified into which category/label, you can do a semi-supervised learning.