get the centroid row index from k-means clustering using sklearn - python

Hi all,
I have a pandas DataFrame whose rows I would like to cluster, and I want to get the row index of each cluster centroid. I am using sklearn and this is what I have:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
X = pd.DataFrame(np.random.rand(10,5))
kmeans = KMeans(n_clusters=3)
Y = pd.DataFrame(kmeans.fit_predict(X.to_numpy()), columns=['cluster ID'])  # as_matrix() was removed from pandas; use to_numpy()
Z = pd.DataFrame(kmeans.cluster_centers_[Y['cluster ID']])
result = pd.concat([X, Y, Z], axis=1)
print(result)
Is there any way to get the index of the row closest to each centroid?
Thanks

Thanks. This code works:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
X = pd.DataFrame(np.random.rand(10,5))
model= KMeans(n_clusters=3)
clusassign = model.fit_predict(X.to_numpy())  # as_matrix() was removed from pandas; use to_numpy()
min_dist = np.min(cdist(X.to_numpy(), model.cluster_centers_, 'euclidean'), axis=1)
Y = pd.DataFrame(min_dist, index=X.index, columns=['Center_euclidean_dist'])
Z = pd.DataFrame(clusassign, index=X.index, columns=['cluster_ID'])
PAP = pd.concat([Y,Z], axis=1)
grouped = PAP.groupby(['cluster_ID'])
grouped.idxmin()
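For reference, a minimal follow-up sketch (using the frames defined above) that pulls the matching rows back out of X:
# idxmin() returns, per cluster, the index label of the row with the
# smallest distance to its centroid
closest = grouped['Center_euclidean_dist'].idxmin()
print(X.loc[closest])  # the rows of X closest to each centroid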

Related

DBSCAN Clustering loop

Using DBSCAN, I am iterating through a csv with x, y, z, and id data. I would like to generate a new csv for every combination of eps and min_samples within a set range.
The output csv should also include the number of points in the cluster, the eps value, and the min_samples used as columns, along with the original x, y, z, and id data. The code below attempts this, but it only creates a csv for the last parameter combination calculated.
%matplotlib notebook
from interp import *
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
import itertools
points_file = "examples/example-1/fcr.csv"
eps_values = np.arange(0.1, 0.5, 0.1)
min_samples = np.arange(5, 20)
dbscan_params = list(itertools.product(eps_values, min_samples))
cluster_n = []
epsvalues = []
min_samp = []
for p in dbscan_params:
    # `points` is assumed to be provided by the star import above
    dbscan_cluster = DBSCAN(eps=p[0], min_samples=p[1]).fit(points)
    epsvalues.append(p[0])
    min_samp.append(p[1])
    cluster_n.append(len(np.unique(dbscan_cluster.labels_)))
# everything below runs once, after the loop, so only the last
# parameter combination ever produces a csv
df = pd.read_csv(points_file, header=None, names=["x", "y", "z", "id"])
df["cluster"] = dbscan_cluster.labels_
df["eps"] = p[0]
df["min_sample"] = p[1]
csv_name = f'fcr_{p[0]}e{p[1]}m.csv'
df.to_csv(csv_name, index=False)
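One way to get a csv for every combination is to read the data once and do both the clustering and the writing inside the loop. A minimal sketch, assuming the csv's x, y, z columns are the same points being clustered:
df_base = pd.read_csv(points_file, header=None, names=["x", "y", "z", "id"])
for eps, min_sam in dbscan_params:
    labels = DBSCAN(eps=eps, min_samples=min_sam).fit(df_base[["x", "y", "z"]]).labels_
    df = df_base.copy()
    df["cluster"] = labels
    # per-row count of points in that row's cluster
    df["n_points_in_cluster"] = df["cluster"].map(df["cluster"].value_counts())
    df["eps"] = eps
    df["min_sample"] = min_sam
    df.to_csv(f'fcr_{eps}e{min_sam}m.csv', index=False)  # one file per combination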

How to find an "x" amount of closest elements to a centroid

I am working on a dataset that is very high dimensional and have performed k-means clustering on it. I am trying to find the 20 closest points to each centroid. The dataset (X_emb) has dimensions 10 x 2816. Below is the code I used to find the single closest point to each centroid. The commented-out code is a potential solution that I found, but I was not able to make it work correctly.
import numpy as np
import pickle as pkl
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.neighbors import NearestNeighbors
from visualization.make_video_v2 import make_video_from_numpy
from scipy.spatial import cKDTree
n_s_train = 10000
df = pkl.load(open('cluster_data/mixed_finetuning_data.pkl', 'rb'))
N = len(df)
X = []
X_emb = []
for i in range(N):
    play = df.iloc[i]
    if df.iloc[i].label == 1:
        X_emb.append(play['embedding'])
        X.append(play['input'])
X_emb = np.array(X_emb)
kmeans = KMeans(n_clusters=10)
kmeans.fit(X_emb)
results = kmeans.cluster_centers_
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
# def find_k_closest(centroids, data, k=1, distance_norm=2):
#     kdtree = cKDTree(data, leafsize=30)
#     distances, indices = kdtree.query(centroids, k, p=distance_norm)
#     if k > 1:
#         indices = indices[:, -1]
#     values = data[indices]
#     return indices, values
# indices, values = find_k_closest(results, X_emb)
You can use pairwise_distances to compute the distance between every centroid and every point in X_emb, then use numpy to find the indices of the 20 smallest distances, and finally get those points from X_emb:
from sklearn.metrics import pairwise_distances
distances = pairwise_distances(kmeans.cluster_centers_, X_emb, metric='euclidean')
ind = [np.argpartition(d, 20)[:20] for d in distances]  # indices of the 20 smallest distances per centroid
closest = [X_emb[indexes] for indexes in ind]
closest will hold the 20 nearest points per centroid, i.e. its shape is (num of centroids, 20, num of features).
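Note that argpartition only guarantees that the 20 smallest distances come first, not that they are sorted among themselves. If you want them ordered nearest-first, a small variant of the line above:
ind = [np.argsort(d)[:20] for d in distances]  # fully sorted, slightly slower for large arrays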
You can also use the NearestNeighbors class from sklearn this way:
from sklearn.neighbors import NearestNeighbors
def find_k_closest(centroids, data):
    nns = {}
    neighbors = NearestNeighbors(n_neighbors=20).fit(data)
    for i, center in enumerate(centroids):
        # kneighbors expects a 2D array, hence the reshape
        nns[i] = neighbors.kneighbors(center.reshape(1, -1), return_distance=False)[0]
    return nns
The nns dictionary maps each centroid's index to the indices of its 20 nearest neighbors in data (numpy arrays are not hashable, so the centroids themselves cannot be used as dict keys).
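A quick usage sketch with the arrays fitted above (names taken from the question):
nns = find_k_closest(kmeans.cluster_centers_, X_emb)
print(nns[0])  # indices in X_emb of the 20 points nearest centroid 0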

Printing column/variable names after feature selection

I am trying feature selection on the Iris dataset.
I'm referencing from Feature Selection with Univariate Statistical Tests
I am using the lines below and I want to find out the significant features:
import pandas
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
dataframe = pandas.read_csv("C:\\dateset\\iris.csv"]))
array = dataframe.values
X = array[:,0:4]
Y = array[:,4]
test = SelectKBest(score_func=f_classif, k=2)
fit = test.fit(X, Y)
set_printoptions(precision=2)
arr = fit.scores_
print (arr)
# [ 119.26 47.36 1179.03 959.32]
To show the indexes of the top 2 by its score, I added:
idx = (-arr).argsort()[:2]
print (idx)
# [2 3]
Further, how can I have the column/variable names (instead of their indexes)?
Use indexing; here you can use the column names directly, because the first 4 columns were selected:
#first 4 columns
X = array[:,0:4]
cols = dataframe.columns[idx]
If the selection for X is different, you also need to filter the DataFrame by position:
#e.g. selected 3rd to 6th columns (positions 2 to 5)
X = array[:,2:6]
cols = dataframe.iloc[:, 2:6].columns[idx]
import pandas
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
dataframe = pandas.read_csv("iris.csv")
array = dataframe.values
X = array[:,0:4]
Y = array[:,4]
test = SelectKBest(score_func=f_classif, k=2)
fit = test.fit(X, Y)
set_printoptions(precision=2)
arr = fit.scores_
idx = (-arr).argsort()[:2]
print (idx)
print (arr)
#names = [dataframe.columns[j] for j in idx]  # equivalent list comprehension
names = dataframe.columns[idx]
print(names)
Output
[2 3]
[ 119.26 47.36 1179.03 959.32]
Index(['petal_length', 'petal_width'], dtype='object')
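As a side note, SelectKBest can report the selected features directly via get_support(), which avoids the manual argsort; a minimal sketch with the fitted test from above:
selected = fit.get_support(indices=True)  # positions of the k best-scoring features
print(dataframe.columns[0:4][selected])   # their column names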

loop through dataframe columns to do simple linear regression?

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
df = pd.read_excel("Book1.xlsx")
for column in df:
    X = df["Row Labels"]
    Y = df[column]
    y1 = Y.values.reshape(-1,1)
    x1 = X.values.reshape(-1,1)
    regressor = LinearRegression()
    regressor.fit(x1, y1)
    y_new = []
    y_i = []
    for i in range(12,24):
        y_new.append(regressor.predict([[i]]))
        y_i.append(i)
    df2 = pd.DataFrame({'column':y_new})
I wrote this code to loop through the DataFrame columns, run a simple linear regression on each, and put all the predicted values into a DataFrame, but it only keeps the predictions for the last column.
df2 = pd.DataFrame({'column':y_new}) creates a column literally named 'column' (not the name stored in the variable column). Moreover, df2 is recreated in every iteration, so it only ever holds the last y_new.
I think what you want is to create a new column in df2 in each iteration:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
df = pd.read_excel("Book1.xlsx")
df2 = pd.DataFrame()
for column in df:
    X = df["Row Labels"]
    Y = df[column]
    y1 = Y.values.reshape(-1,1)
    x1 = X.values.reshape(-1,1)
    regressor = LinearRegression()
    regressor.fit(x1, y1)
    y_new = []
    y_i = []
    for i in range(12,24):
        # predict() returns a 2D array; take the scalar so the cells
        # of df2 hold numbers rather than one-element arrays
        y_new.append(regressor.predict([[i]])[0, 0])
        y_i.append(i)
    df2[column] = y_new
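As a side note, the inner loop can be replaced by a single vectorized call; a minimal sketch using the same fitted regressor:
x_future = np.arange(12, 24).reshape(-1, 1)  # the 12 x-values to predict
df2[column] = regressor.predict(x_future).ravel()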

NameError: name 'X' is not defined sklearn

I am working through this multiple regression problem with this walk-through; however, the code that starts at
the section #Treating categorical variables with One-hot-encoding at https://towardsdatascience.com/what-makes-a-movie-hit-a-jackpot-learning-from-data-with-multiple-linear-regression-339f6c1a7022 fails.
I ran the code up to this point, but it fails on (X).
Actual code:
from sklearn.preprocessing import LabelEncoder  # imported directly, since the class below calls LabelEncoder()
le = LabelEncoder()
# LabelEncoder for a number of columns
class MultiColumnLabelEncoder:
    def __init__(self, columns=None):
        self.columns = columns  # list of columns to encode
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname, col in output.items():  # iteritems() was removed in pandas 2.0
                output[colname] = LabelEncoder().fit_transform(col)
        return output
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
le = MultiColumnLabelEncoder()
X_train_le = le.fit_transform(X)
Here is the error that I get:
Traceback (most recent call last):
File "<ipython-input-63-581cea150670>", line 34, in <module>
X_train_le = le.fit_transform(X)
NameError: name 'X' is not defined
Your code can't work because you left out about 40 lines of code that she wrote before that snippet; she defines X earlier. The code can be obtained from GitHub.
#importing the libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
import pyreadr
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import explained_variance_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
result = pyreadr.read_r('Movies.RData')# also works for Rds
print(result.keys())
df = pd.DataFrame(result['movies'], columns=result['movies'].keys() )
df.shape
df.shape[0]
df.set_index("title", inplace=True) #setting the index name
df_1 = df.loc[:, ['imdb_rating', 'genre', 'runtime', 'best_pic_nom',
                  'top200_box', 'director', 'actor1']]
#Let's also check the column-wise distribution of null values
print(df_1.isnull().values.sum())
print(df_1.isnull().sum())
#Dropping missing values from my dataset
df_1.dropna(how='any', inplace=True)
print(df_1.isnull().values.sum()) #checking for missing values after the dropna()
#Splitting for 2 matrices: independent variables used for prediction and dependent variables (that is predicted)
X = df_1.drop(["imdb_rating", 'runtime'], axis = 1) #Feature Matrix
y = df_1["imdb_rating"] #Dependent Variables
