I have a CSV file that looks like the one below:
page Page_Value
page1 12
page2 1
page3 2
page4 3
page5 10
page6 11
page7 13
page8 67
page9 70
# convert page to numeric
labelEncoder = LabelEncoder()
labelEncoder.fit(dataset_PV['page'])
dataset_PV['page'] = labelEncoder.transform(dataset_PV['page'])

# find the number of clusters using the elbow method
from sklearn.cluster import KMeans
from sklearn import preprocessing
wcss = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
    kmeans.fit(dataset_PV)
    wcss.append(kmeans.inertia_)
plt.figure(figsize=(15,8))
plt.plot(range(1,10), wcss,marker='o')
plt.title('Elbow graph')
plt.xlabel('Number of clusters')
plt.ylabel('within-cluster sums of squares (WCSS)')
plt.show()
#fit model
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
clusters = kmeans.fit_predict(dataset_PV)
dataset_PV['clusters'] = clusters
I want to create combinations of pages based on Page_Value so that pages with similar values end up in the same group. I used K-means for this, which is why I converted the page variable to numeric. I am not sure whether I should use K-means or instead sort the Page_Value column and then group it (I am not sure about the code for that either).
Output something like this:
Cluster1 = page2,page3,page4
Cluster2 = page5,page6,page7,page12
Cluster3 = page7,page8,page9
Thanks
You do not need to sort the pages first. Have you tried OpenCV's k-means? I hope it helps.
https://docs.opencv.org/master/d1/d5c/tutorial_py_kmeans_opencv.html
import numpy as np
import cv2 as cv
pages = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9']
x = np.array((12,1,2,3,10,11,13,67,70))
x = np.float32(x)
x = np.reshape(x, (-1,1))
# Define criteria = ( type, max_iter = 10 , epsilon = 1.0 )
criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 10, 1.0)
# Set flags (Just to avoid line break in the code)
flags = cv.KMEANS_RANDOM_CENTERS
K = 3
# Apply KMeans
compactness, labels, centers = cv.kmeans(x, K, None, criteria, 10, flags)
labels = labels.flatten()
# result
res = dict()
for i in range(K):
    res[i] = []
for idx, lab in enumerate(labels):
    res[lab].append(pages[idx])
print(res)
Another solution using Sklearn:
from sklearn.cluster import KMeans
import numpy as np
pages = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9']
x = np.array((12,1,2,3,10,11,13,67,70))
x = np.float32(x)
x = np.reshape(x, (-1,1))
K=3
km = KMeans(n_clusters=K)
km.fit(x)
labels = km.predict(x)
labels = labels.flatten()
# result
res = dict()
for i in range(K):
    res[i] = []
for idx, lab in enumerate(labels):
    res[lab].append(pages[idx])
print(res)
You have done most of the work, but the page name should not be included in the K-means calculation; it carries no numeric meaning, so including it makes no sense.
In other words, the LabelEncoder step is not necessary for the clustering itself.
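For instance, here is a minimal sketch of that idea (assuming your dataframe is named dataset_PV and has the page and Page_Value columns from the question): cluster on Page_Value alone and use the page names only as labels in the output.
from sklearn.cluster import KMeans

# Cluster on the numeric column only; the page name is just a label.
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42)
dataset_PV['clusters'] = kmeans.fit_predict(dataset_PV[['Page_Value']])

# Group the original page names by cluster label.
print(dataset_PV.groupby('clusters')['page'].apply(list))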
tl;dr
For the short answer, you can refer to Sơn Ninh's answer.
If you want a visualization, my answer may help you.
I wrote a function (label_encoding) for you; you can use it to get the id mapping, which helps when drawing the plots.
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import textwrap
from io import StringIO
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from typing import Union, NamedTuple
from collections import defaultdict
def main():
    df = import_file()
    suitable_n: int
    x = df.Page_Value.values.reshape(-1, 1)

    if not 'Use the Elbow method to get a suitable N.':
        # You can omit this if you don't want to see it at all.
        elbow_find_n(x)
    suitable_n = 3

    # fit model
    kmeans = KMeans(n_clusters=suitable_n, init='k-means++', random_state=42)
    clusters = kmeans.fit_predict(x)

    # labelEncoder = LabelEncoder()
    # labelEncoder.fit(df['page'])
    # df['page'] = labelEncoder.transform(df['page'])
    df.loc[:, ['page']], mapping_table = label_encoding(df.loc[:, ['page']])

    df = rebuild_df(df, clusters, mapping_table)  # 'page-id', 'page', 'Page_Value', 'clusters'
    print(df)

    dict_by_cluster_value = defaultdict(list)
    for cur_set in set(df['clusters']):  # Output the format that you define.
        print(f'Cluster{cur_set} = {",".join(df.page[df.clusters == cur_set])}')
        dict_by_cluster_value[cur_set].extend(df.page[df.clusters == cur_set].to_list())
    print(dict(dict_by_cluster_value))  # defaultdict works fine too; converting the type just makes the print prettier.
    visualizing_the_clusters(kmeans, df)
class RGBColor(NamedTuple):
    BLACK = '#000000'
    # AZURE = '#F0FFFF'
    OLIVE = '#808000'
    PINK = '#FFC0CB'
    # WHITE = '#FFFFFF'  <-- not suitable: it would be invisible on the white background.
    GOLD = '#FFD700'
    BLUE = '#0000FF'
    GREEN = '#00FF00'
    RED = '#FF0000'
    YELLOW = '#FFFF00'
    ORANGE = '#FFA500'
    PURPLE = '#FF00FF'

    def get_tuple(self):
        return (attr_name for attr_name in dir(self) if not attr_name.startswith('_') and attr_name.isupper())
def label_encoding(label_col: Union[pd.DataFrame, np.ndarray], is_need_mapping_table=True) -> tuple:
    """
    USAGE:
        df.loc[:, ['col_xxx', ]], mapping_table = label_encoding(df.loc[:, ['col_xxx']])
    """
    nda_rtn_value = LabelEncoder().fit_transform(label_col.values.ravel()) if isinstance(label_col, pd.DataFrame) else LabelEncoder().fit_transform(label_col)
    rtn_dict = dict()
    if is_need_mapping_table:
        list_value = [e[0] for e in label_col.values] if isinstance(label_col, pd.DataFrame) else [e for e in label_col]
        rtn_dict = dict(zip(nda_rtn_value, list_value))
    if isinstance(label_col, pd.DataFrame):
        nda_rtn_value = nda_rtn_value.reshape(-1, 1)
    return nda_rtn_value, rtn_dict
def import_file() -> pd.DataFrame:
    page_content = textwrap.dedent(  # Remove any common leading whitespace from every line in text.
        """\
        page,Page_Value
        page1,12
        page2,1
        page3,2
        page4,3
        page5,10
        page6,11
        page7,13
        page8,67
        page9,70
        """
    )
    df = pd.read_csv(StringIO(page_content), header=0)
    return df
def elbow_find_n(x):
    wcss = []
    for i in range(1, 10):
        kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
        kmeans.fit(x)
        wcss.append(kmeans.inertia_)
    plt.figure(figsize=(15, 8))
    plt.plot(range(1, 10), wcss, marker='o')
    plt.title('Elbow graph')
    plt.xlabel('Number of Clusters')
    plt.ylabel('within-cluster sums of squares WCSS')
    plt.show()
def rebuild_df(df, clusters, mapping_table):
    df['clusters'] = clusters
    df.rename(columns={'page': 'page-id'}, inplace=True)
    df['page'] = df.apply(lambda df_: mapping_table[df_['page-id']], axis=1)
    df = df.reindex(['page-id', 'page', 'clusters', 'Page_Value', ], axis=1)
    return df
def visualizing_the_clusters(kmeans: KMeans, df: pd.DataFrame):
    standard_rgb = RGBColor()
    # plt.scatter(df[df.clusters == 0]['page-id'], df[df.clusters == 0]['Page_Value'], s=2, c='red', label='Careful')
    # plt.scatter(df[df.clusters == 2]['page-id'], df[df.clusters == 2]['Page_Value'], s=2, c='cyan', label='Careless')
    # ...
    for color_idx, (cur_set, color) in enumerate(
        zip(set(df.clusters), standard_rgb.get_tuple())
    ):
        contain_cluster_index = df.clusters == cur_set
        plt.scatter(df[contain_cluster_index]['page-id'], df[contain_cluster_index]['Page_Value'],
                    s=2, c=color, label=f'Cluster{cur_set}: {kmeans.cluster_centers_[cur_set][0]}')

    n_cluster = len(kmeans.cluster_centers_)
    plt.scatter(np.ones(n_cluster) * (-1), kmeans.cluster_centers_[:, 0], s=(50 / n_cluster), c='purple', label='Centroids')
    plt.title('Page and Page_Value')
    plt.xlabel('Page_ID')
    plt.ylabel('Page_Value')
    plt.legend(loc=(1.05, 0.5))
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    main()
Output:
page-id page clusters Page_Value
0 0 page1 0 12
1 1 page2 2 1
2 2 page3 2 2
3 3 page4 2 3
4 4 page5 0 10
5 5 page6 0 11
6 6 page7 0 13
7 7 page8 1 67
8 8 page9 1 70
Cluster0 = page1,page5,page6,page7
Cluster1 = page8,page9
Cluster2 = page2,page3,page4
{0: ['page1', 'page5', 'page6', 'page7'], 1: ['page8', 'page9'], 2: ['page2', 'page3', 'page4']}
Related
I have a dataframe like the one below; its shape is (24, 7).
Name x1 x2 x3 x4 x5 x6
Harry 102 204 0.43 0.21 1.02 0.39
James 242 500 0.31 0.11 0.03 0.73
.
.
.
Mike 3555 4002 0.12 0.03 0.52 0.11
Henry 532 643 0.01 0.02 0.33 0.10
I want to run scikit-learn's different clustering algorithms comparison script on the above dataframe. However, the input data in that example looks quite confusing, and I am not sure how to feed my dataframe into it:
https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html#sphx-glr-auto-examples-cluster-plot-cluster-comparison-py
There are two main differences between your scenario and the scikit-learn example you link to:
You only have one dataset, not several different ones to compare.
You have six features, not just two.
Point one allows you to simplify the example code by deleting the loops over the different datasets and related calculations. Point two implies that you cannot easily plot your results. Instead, you could just add the predicted class labels found by each algorithm to your dataset.
So you could modify the example code like this:
import time
import warnings
import numpy as np
import pandas as pd
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
np.random.seed(0)
# ============
# Introduce your dataset
# ============
my_df = ...  # Insert your data here, as a pandas DataFrame.
features = [f'x{i}' for i in range(1, 7)]
X = my_df[features].values
# ============
# Set up cluster parameters
# ============
params = {
"quantile": 0.3,
"eps": 0.3,
"damping": 0.9,
"preference": -200,
"n_neighbors": 3,
"n_clusters": 3,
"min_samples": 7,
"xi": 0.05,
"min_cluster_size": 0.1,
}
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)
# estimate bandwidth for mean shift
bandwidth = max(cluster.estimate_bandwidth(X, quantile=params["quantile"]),
0.001) # arbitrary correction to avoid 0
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(
X, n_neighbors=params["n_neighbors"], include_self=False
)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
# ============
# Create cluster objects
# ============
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params["n_clusters"])
ward = cluster.AgglomerativeClustering(
n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity
)
spectral = cluster.SpectralClustering(
n_clusters=params["n_clusters"],
eigen_solver="arpack",
affinity="nearest_neighbors",
)
dbscan = cluster.DBSCAN(eps=params["eps"])
optics = cluster.OPTICS(
min_samples=params["min_samples"],
xi=params["xi"],
min_cluster_size=params["min_cluster_size"],
)
affinity_propagation = cluster.AffinityPropagation(
damping=params["damping"], preference=params["preference"], random_state=0
)
average_linkage = cluster.AgglomerativeClustering(
linkage="average",
affinity="cityblock",
n_clusters=params["n_clusters"],
connectivity=connectivity,
)
birch = cluster.Birch(n_clusters=params["n_clusters"])
gmm = mixture.GaussianMixture(
n_components=params["n_clusters"], covariance_type="full"
)
clustering_algorithms = (
("MiniBatch\nKMeans", two_means),
("Affinity\nPropagation", affinity_propagation),
("MeanShift", ms),
("Spectral\nClustering", spectral),
("Ward", ward),
("Agglomerative\nClustering", average_linkage),
("DBSCAN", dbscan),
("OPTICS", optics),
("BIRCH", birch),
("Gaussian\nMixture", gmm),
)
for name, algorithm in clustering_algorithms:
    t0 = time.time()

    # catch warnings related to kneighbors_graph
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="the number of connected components of the "
            + "connectivity matrix is [0-9]{1,2}"
            + " > 1. Completing it to avoid stopping the tree early.",
            category=UserWarning,
        )
        warnings.filterwarnings(
            "ignore",
            message="Graph is not fully connected, spectral embedding"
            + " may not work as expected.",
            category=UserWarning,
        )
        algorithm.fit(X)

    t1 = time.time()
    if hasattr(algorithm, "labels_"):
        y_pred = algorithm.labels_.astype(int)
    else:
        y_pred = algorithm.predict(X)

    # Add cluster labels to the dataset
    my_df[name] = y_pred
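If it helps, here is a small follow-up sketch for inspecting the result (assuming my_df now holds one extra label column per algorithm, as produced by the loop above):
# Show how many samples each algorithm assigned to each cluster label.
for name, _ in clustering_algorithms:
    print(name.replace("\n", " "), my_df[name].value_counts().to_dict())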
PS: in the functions below, please replace data = X_data.iloc[:20000] with your own X.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn import cluster, decomposition, metrics, mixture, preprocessing
from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples, silhouette_score
comp_model = pd.DataFrame(columns=['Model', 'Score_Silhouette',
'num_clusters', 'size_clusters',
'parameters'])
K-Means:
def k_means(X_data, nb_clusters, model_comp):
    ks = nb_clusters
    inertias = []
    data = X_data.iloc[:20000]
    X = data.values
    X_scaled = preprocessing.StandardScaler().fit_transform(X)
    for num_clusters in ks:
        # Create a KMeans instance with k clusters: model
        model = KMeans(n_clusters=num_clusters, n_init=1)
        # Fit model to samples
        model.fit(X_scaled)
        # Append the inertia to the list of inertias
        inertias.append(model.inertia_)
        silh = metrics.silhouette_score(X_scaled, model.labels_)
        # Counting the amount of data in each cluster
        taille_clusters = Counter(model.labels_)
        data = [{'Model': 'kMeans',
                 'Score_Silhouette': silh,
                 'num_clusters': num_clusters,
                 'size_clusters': taille_clusters,
                 'parameters': 'nb_clusters :' + str(num_clusters)}]
        model_comp = model_comp.append(data, ignore_index=True, sort=False)
    # Plot ks vs inertias
    plt.plot(ks, inertias, '-o')
    plt.xlabel('number of clusters, k')
    plt.ylabel('inertia')
    plt.xticks(ks)
    plt.show()
    return model_comp
comp_model = k_means(X_data=df,
nb_clusters=pd.np.arange(2, 11, 1),
model_comp=comp_model)
DBSCAN:
def dbscan_grid_search(X_data, model_comp, eps_space=0.5,
                       min_samples_space=5, min_clust=0, max_clust=10):
    data = X_data.iloc[:20000]
    X = data.values
    X_scaled = preprocessing.StandardScaler().fit_transform(X)
    # Starting a tally of total iterations
    n_iterations = 0
    # Looping over each combination of hyperparameters
    for eps_val in eps_space:
        for samples_val in min_samples_space:
            dbscan_grid = DBSCAN(eps=eps_val,
                                 min_samples=samples_val)
            # fit and predict the cluster labels
            clusters = dbscan_grid.fit_predict(X=X_scaled)
            # Counting the amount of data in each cluster
            cluster_count = Counter(clusters)
            # n_clusters = sum(abs(pd.np.unique(clusters))) - 1
            n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
            # Increasing the iteration tally with each run of the loop
            n_iterations += 1
            # Appending to the list each time the n_clusters criteria is met
            if n_clusters >= min_clust and n_clusters <= max_clust:
                silh = metrics.silhouette_score(X_scaled, clusters)
                data = [{'Model': 'Dbscan',
                         'Score_Silhouette': silh,
                         'num_clusters': n_clusters,
                         'size_clusters': cluster_count,
                         'parameters': 'eps :' + str(eps_val) + '+ samples_val :' + str(samples_val)}]
                model_comp = model_comp.append(
                    data, ignore_index=True, sort=False)
    return model_comp
comp_model = dbscan_grid_search(X_data=df,
model_comp=comp_model,
eps_space=pd.np.arange(0.1, 5, 0.6),
min_samples_space=pd.np.arange(1, 30, 3),
min_clust=2,
max_clust=10)
GMM:
def gmm(X_data, nb_clusters, model_comp):
    ks = nb_clusters
    data = X_data.iloc[:20000]
    X = data.values
    X_scaled = preprocessing.StandardScaler().fit_transform(X)
    for num_clusters in ks:
        # Create a GaussianMixture instance with k components and fit it to the samples
        gmm = mixture.GaussianMixture(n_components=num_clusters).fit(X_scaled)
        pred = gmm.predict(X_scaled)
        cluster_count = Counter(pred)
        silh = metrics.silhouette_score(X_scaled, pred)
        data = [{'Model': 'GMM',
                 'Score_Silhouette': silh,
                 'num_clusters': num_clusters,
                 'size_clusters': cluster_count,
                 'parameters': 'nb_clusters :' + str(num_clusters)}]
        model_comp = model_comp.append(data, ignore_index=True, sort=False)
    return model_comp
comp_model = gmm(X_data=df,
nb_clusters=pd.np.arange(2, 11, 1),
model_comp=comp_model
)
At the end you will have comp_model, which contains the results of all the algorithms (I am using three here), so you can select the best fit based on the silhouette score and the number of clusters.
You should also check the size distribution of each cluster:
https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py
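For instance, a minimal sketch for ranking the collected results by silhouette score (assuming comp_model has been filled by the three functions above):
# Higher silhouette scores generally indicate better-separated clusters.
ranked = comp_model.sort_values('Score_Silhouette', ascending=False)
print(ranked[['Model', 'num_clusters', 'Score_Silhouette', 'parameters']].head(10))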
I am trying to get random forest (RF) feature importances. I fit the random forest on the data like this:
model = RandomForestRegressor()
n = model.fit(self.X_train,self.y_train)
if n is not None:
    df = pd.DataFrame(data=n, columns=["Feature", "Importance_Score"])
    df["Feature_Name"] = np.array(self.X_Headers)
    df = df.drop(["Feature"], axis=1)
    df[["Feature_Name", "Importance_Score"]].to_csv("RF_Importances.csv", index=False)
    del df
However, the n variable returns None. Why is this happening?
I am not very sure how model.fit(self.X_train, self.y_train) is supposed to work here; we need more information about how you set up the model.
If we set this up using simulated data, it works:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

np.random.seed(111)
X = pd.DataFrame(np.random.normal(0, 1, (100, 5)), columns=['A', 'B', 'C', 'D', 'E'])
y = np.random.normal(0, 1, 100)
model = RandomForestRegressor()
n = model.fit(X, y)
if n is not None:
    df = pd.DataFrame({'features': X.columns, 'importance': n.feature_importances_})

df
features importance
0 A 0.176091
1 B 0.183817
2 C 0.169927
3 D 0.267574
4 E 0.202591
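Note that fit returns the fitted estimator itself, so n is simply the same object as model and is never None; the importances live in feature_importances_ after fitting. If you then want the CSV file the original snippet was aiming for, a minimal follow-up sketch (reusing the question's RF_Importances.csv name):
# Sort by importance and persist, as the original snippet intended.
df.sort_values('importance', ascending=False).to_csv("RF_Importances.csv", index=False)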
I am working with the Flask Python framework on a data science project, and I need to add selected columns from a CSV file to this clustering code. Can anyone help? The clustering code can already read the columns; I can save the CSV file and select the columns on the server side, so I just need to pass them to this clustering code.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
class Clustering():
    def __init__(self, filename, start_column, end_column):
        self.n = start_column
        self.m = end_column
        self.filename = filename
        self.dataset = pd.read_csv(self.filename)
        self.X = self.dataset.iloc[:, [self.n, self.m]].values

    def show_test(a):
        return "just a test object" + a[0] + "," + a[1]

    # def return_x(self):
    #     ######## return concerned columns of the dataset ########
    #     return self.X

    def print_elbow(self, number_of_k):
        # Plot the graph to visualize the Elbow Method to find the optimal number of clusters
        self.k = number_of_k
        wcss = []
        silhouette_values = {}
        for i in range(2, self.k):
            self.kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=None)
            self.kmeans.fit(self.X)
            wcss.append(self.kmeans.inertia_)  # Sum of squared distances of samples to their closest cluster center.
            # Compute the silhouette scores for each sample
            cluster_labels = self.kmeans.fit_predict(self.X)
            silhouette_avg = silhouette_score(self.X, cluster_labels)
            silhouette_values[i] = silhouette_avg
            print("For n_clusters =", i, "The average silhouette_score is :", silhouette_avg)
        print("Best silhouette score:", max(silhouette_values, key=silhouette_values.get))
        plt.plot(range(2, self.k), wcss)
        plt.title('The Elbow Method')
        plt.xlabel('Number of clusters')
        plt.ylabel('WCSS')
        plt.show()
        return

    def print_kmeans(self, Optimal_k):
        plt.style.use('seaborn-deep')
        # Applying KMeans to the dataset with the optimal number of clusters
        self.opt_k = Optimal_k
        self.kmeans = KMeans(n_clusters=self.opt_k, init='k-means++', max_iter=300, n_init=10, random_state=0)
        y_kmeans = self.kmeans.fit_predict(self.X)
        for i in range(self.opt_k):
            plt.scatter(self.X[y_kmeans == i, 0], self.X[y_kmeans == i, 1], s=80, marker='o', alpha=0.7, label='Cluster {}'.format(i + 1))
        plt.scatter(self.kmeans.cluster_centers_[:, 0], self.kmeans.cluster_centers_[:, 1], s=100, c='black', edgecolors='none', label='Centroids')
        plt.title('Clusters')
        plt.xlabel('first column')
        plt.ylabel('second column')
        plt.legend()
        plt.show()
        return
Please try this:
Clustering('file.csv', start, end)
'file.csv' is the file path.
start and end are the zero-based column positions (ints) of the two columns you want to cluster on.
Note that self.X = self.dataset.iloc[:, [self.n, self.m]] selects exactly those two columns, not a contiguous range, so no +1 is needed.
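A minimal usage sketch under those assumptions (the file name 'data.csv' and the column positions are hypothetical placeholders):
# Hypothetical usage; adjust the path and the two zero-based column positions.
clus = Clustering('data.csv', 1, 2)   # selects the columns at iloc positions 1 and 2
clus.print_elbow(number_of_k=11)      # elbow plot plus silhouette scores for k = 2..10
clus.print_kmeans(Optimal_k=3)        # scatter plot of the clusters for the chosen k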
I am new to Python and machine learning. I got an error when trying to create a decision-regions plot.
I am not sure I understand the problem, so I really need help solving it.
I think the problem is that the target is a string, but I am not sure. I do not know how to fix this, so please help me fix it.
# import arff data using pandas
data = arff.loadarff('Run1/Tr.arff')
df = pd.DataFrame(data[0])
data =pd.DataFrame(df)
data = data.loc[:,'ATT1':'ATT576']
target = df['Class']
target=target.astype(str)
#split the data into training and testing
data_train, data_test, target_train, target_test = train_test_split(data, target,test_size=0.30, random_state=0)
model1 = DecisionTreeClassifier(criterion='entropy', max_depth=1)
num_est = [1, 2, 3, 10]
label = ['AdaBoost (n_est=1)', 'AdaBoost (n_est=2)', 'AdaBoost (n_est=3)', 'AdaBoost (n_est=20)']
fig = plt.figure(figsize=(10,8))
gs = gridspec.GridSpec(2,2)
grid = itertools.product([0,1],repeat=2)
for n_est, label, grd in zip(num_est, label, grid):
    boosting = AdaBoostClassifier(base_estimator=model1, n_estimators=n_est)
    boosting.fit(data_train, target_train)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(data_train, target_train, clf=boosting, legend=2)
    plt.title(label)
plt.show();
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-18-646828965d5c> in <module>
      7     boosting.fit(data_train,target_train)
      8     ax = plt.subplot(gs[grd[0], grd[1]])
----> 9     fig = plot_decision_regions(data_train , target_train, clf=boosting, legend=2)  # clf cannot be changed because it's a parameter
     10     plt.title(label)
     11

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/mlxtend/plotting/decision_regions.py in plot_decision_regions(X, y, clf, feature_index, filler_feature_values, filler_feature_ranges, ax, X_highlight, res, legend, hide_spines, markers, colors, scatter_kwargs, contourf_kwargs, scatter_highlight_kwargs)
    127     """
    128
--> 129     check_Xy(X, y, y_int=True)  # Validate X and y arrays
    130     dim = X.shape[1]
    131

/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/mlxtend/utils/checking.py in check_Xy(X, y, y_int)
     14     # check types
     15     if not isinstance(X, np.ndarray):
---> 16         raise ValueError('X must be a NumPy array. Found %s' % type(X))
     17     if not isinstance(y, np.ndarray):
     18         raise ValueError('y must be a NumPy array. Found %s' % type(y))

ValueError: X must be a NumPy array. Found <class 'pandas.core.frame.DataFrame'>
I have used another, similar dataset. In your code you are trying to plot with more than 2 features, which is not possible with plot_decision_regions; you have to use the different methods discussed in the linked question, Plotting decision boundary for High Dimension Data. But if you want to use only two features, then you can use the code below.
from scipy.io import arff
import pandas as pd
import itertools
from matplotlib import gridspec
from mlxtend.plotting import plot_decision_regions
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from matplotlib import pyplot as plt
data = arff.loadarff('TR.arff')
data = pd.DataFrame(data[0])
df = data.loc[:,['att1','att2','class']]
for col_name in df.columns:
    if df[col_name].dtype == 'object':
        df[col_name] = df[col_name].astype('category')
        df[col_name] = df[col_name].cat.codes
target = df['class']
df=df.drop(['class'],axis=1)
data_train, data_test, target_train, target_test = train_test_split(df, target,test_size=0.30, random_state=0)
model1 = DecisionTreeClassifier(criterion='entropy', max_depth=1)
num_est = [1, 2, 3, 10]
label = ['AdaBoost (n_est=1)', 'AdaBoost (n_est=2)', 'AdaBoost (n_est=3)', 'AdaBoost (n_est=20)']
fig = plt.figure(figsize=(10,8))
gs = gridspec.GridSpec(2,2)
grid = itertools.product([0,1],repeat=2)
for n_est, label, grd in zip(num_est, label, grid):
    boosting = AdaBoostClassifier(base_estimator=model1, n_estimators=n_est)
    boosting.fit(data_train, target_train)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(data_train.values, target_train.values, clf=boosting, legend=2)
    plt.title(label)
plt.show();
Convert your data into a NumPy array and then pass it to the function:
numpy_matrix = data.to_numpy()  # in older pandas this was data.as_matrix(), which has since been removed
I have the following code to perform hierarchical clustering on data:
Z = linkage(data,method='weighted')
plt.subplot(2,1,1)
dendro = dendrogram(Z)
leaves = dendro['leaves']
print leaves
plt.show()
However, in the dendrogram all the clusters have the same color (blue). Is there a way to use different colors according to the similarity between clusters?
Look at the documentation; it looks like you could pass the link_color_func keyword or the color_threshold keyword to get different colors.
Edit:
The default behavior of the dendrogram coloring scheme is, given a color_threshold = 0.7*max(Z[:,2]), to color all the descendant links below a cluster node k the same color if k is the first node below the cut threshold; otherwise, all links connecting nodes with distances greater than or equal to the threshold are colored blue [from the docs].
What the hell does this mean? Well, if you look at a dendrogram, different clusters linked together. The "distance" between two clusters is the height of the link between them. The color_threshold is the height below which new clusters will be different colors. If all your clusters are blue, then you need to raise your color_threshold. For example,
In [48]: mat = np.random.rand(10, 10)
In [49]: z = linkage(mat, method="weighted")
In [52]: d = dendrogram(z)
In [53]: d['color_list']
Out[53]: ['g', 'g', 'b', 'r', 'c', 'c', 'c', 'b', 'b']
In [54]: plt.show()
I can check what the default color_threshold is by
In [56]: 0.7*np.max(z[:,2])
Out[56]: 1.0278719020096947
If I lower the color_threshold, I get more blue because more links have distances greater than the new color_threshold. You can see this visually because all the links above 0.9 are now blue:
In [64]: d = dendrogram(z, color_threshold=.9)
In [65]: d['color_list']
Out[65]: ['g', 'b', 'b', 'r', 'b', 'b', 'b', 'b', 'b']
In [66]: plt.show()
If I increase the color_threshold to 1.2, the links below 1.2 will no longer be blue. Additionally, the cyan and red links will merge into a single color because their parent link is below 1.2:
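Continuing the same session, that call would look like the following sketch (the resulting color list depends on the random data, so it is not reproduced here):
d = dendrogram(z, color_threshold=1.2)
plt.show()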
The following code will produce a dendrogram with a different color for each leaf. If in the process of merging clusters it encounters two clusters with different colors, then it selects the default one dflt_col = tab:blue.
Note: the link_matrix function is a plain-copy of the one from the AgglomerativeClustering example in scikit-learn.
Explaining everything it does would take quite some time, so just print any unclear step directly to see what it produces.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform, pdist
from matplotlib.pyplot import cm
from sklearn.cluster import AgglomerativeClustering
import matplotlib.colors as clrs
def link_matrix(model, **kwargs):
    # Create the linkage matrix and then plot the dendrogram, as in the standard scikit-learn documentation
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    Z = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)
    return Z
def assign_link_colors(model):
    n_clusters = len(model.Z)
    scl_map_to_hex = mpl.cm.ScalarMappable(cmap="jet").to_rgba(np.unique(model.labels_), norm=True)  # colors.to_hex()
    col = [clrs.to_hex(rgb) for rgb in scl_map_to_hex]
    dic_labels = {s: [c, idx] for s, c, idx in zip(np.arange(len(model.feature_names_in_), dtype=int), model.feature_names_in_, model.labels_, )}
    model.dict_idx_name_cl = {k: v for k, v in sorted(dic_labels.items(), key=lambda item: item[1][1])}

    dflt_col = "tab:blue"  # Unclustered blue
    model.dict_colors = {x: col[model.dict_idx_name_cl[x][1]] for x in model.dict_idx_name_cl}

    link_cols = {}
    for i, i_cl in enumerate(model.Z[:, :2].astype(int)):  # select only the first two columns
        c1, c2 = (link_cols[x] if x > n_clusters else model.dict_colors[x] for x in i_cl)

        # Choice of coloring assignment: if same color --> ok; if no leaf, dflt ("undefined") color
        if c1 == c2:
            tmp_cl = c1
        elif min(i_cl) <= n_clusters:  # select the leaf color
            tmp_cl = model.dict_colors[min(i_cl)]
        else:
            tmp_cl = dflt_col
        link_cols[i + 1 + n_clusters] = tmp_cl
        # print(f'-link_cols: {link_cols}',)
    return link_cols
def mod_2_dendrogram(model, **kwargs):
    plt.style.use('seaborn-whitegrid')
    plt.figure(figsize=(int(.5 * len(model.feature_names_in_)), 7))
    print(f'-0.7*max(Z[:,2]): {0.7 * max(model.Z[:, 2])}',)

    # Plot the corresponding dendrogram
    ddata = dendrogram(model.Z,  # count_sort = "descending",
                       **kwargs)

    # Plot distances on the dendrogram
    # plot cluster points & distance labels
    y_lim = dist_thr
    for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
        x = sum(i[1:3]) / 2
        y = d[1]
        if y > y_lim:
            plt.plot(x, y, 'o', c=c, markeredgewidth=0)
            plt.annotate(np.round(y, 2), (x, y), xytext=(0, -5),
                         textcoords='offset points',
                         va='top', ha='center', fontsize=9)

    plt.axhline(y=dist_thr, color='orange', alpha=0.7, linestyle='--', label=f"threshold: {int(model.dist_thr)}")
    plt.title(f'Agglomerative Dendrogram with n_clust: {model.n_clusters_}')
    plt.xlabel('Clusters')
    plt.ylabel('Distance')
    plt.legend()
    return ddata
Now, the running example:
import string
import pandas as pd
np.random.seed(0)
dist = np.random.randint(1e4, size = (10,10))
np.fill_diagonal(dist, 0)
dist = pd.DataFrame(dist, columns = list(string.ascii_lowercase)[:dist.shape[0]])
dist_thr = 1.5e3
model = AgglomerativeClustering(distance_threshold = dist_thr, n_clusters=None, linkage = "single", metric = "precomputed",)
model.dist_thr = dist_thr
model = model.fit(dist)
model.Z = link_matrix(model)
link_cols = assign_link_colors(model)
_ = mod_2_dendrogram(model, labels = dist.columns,
link_color_func = lambda x: link_cols[x])