I have a csv file which looks like below
page Page_Value
page1 12
page2 1
page3 2
page4 3
page5 10
page6 11
page7 13
page8 67
page9 70
#convert page to numeric
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
labelEncoder.fit(dataset_PV['page'])
dataset_PV['page'] = labelEncoder.transform(dataset_PV['page'])
#find out no. of clusters using the elbow method
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
wcss = []
for i in range(1,10):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
    kmeans.fit(dataset_PV)
    wcss.append(kmeans.inertia_)
plt.figure(figsize=(15,8))
plt.plot(range(1,10), wcss, marker='o')
plt.title('Elbow graph')
plt.xlabel('Number of clusters')
plt.ylabel('within-cluster sums of squares (WCSS)')
plt.show()
#fit model
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
clusters = kmeans.fit_predict(dataset_PV)
dataset_PV['clusters'] = clusters
I want to create combinations of pages based on Page_Value, so as to get the maximum out of the Page Values. Here I have used K-means, for which I converted the page variable to numeric. I am not sure whether I should use K-means, or instead sort the Page_Value variable and then group the pages (I am not sure about the code for that).
The output should be something like this:
Cluster1 = page2,page3,page4
Cluster2 = page5,page6,page7,page12
Cluster3 = page7,page8,page9
Thanks
You do not need to sort the pages first. Have you tried OpenCV's K-means? I hope it helps.
https://docs.opencv.org/master/d1/d5c/tutorial_py_kmeans_opencv.html
import numpy as np
import cv2 as cv
pages = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9']
x = np.array((12,1,2,3,10,11,13,67,70))
x = np.float32(x)
x = np.reshape(x, (-1,1))
# Define criteria = ( type, max_iter = 10 , epsilon = 1.0 )
criteria = (cv.TERM_CRITERIA_EPS + cv.TERM_CRITERIA_MAX_ITER, 10, 1.0)
# Set flags (Just to avoid line break in the code)
flags = cv.KMEANS_RANDOM_CENTERS
K = 3
# Apply KMeans
compactness, labels, centers = cv.kmeans(x, K, None, criteria, 10, flags)
labels = labels.flatten()
# result
res = dict()
for i in range(K):
    res[i] = []
for idx, lab in enumerate(labels):
    res[lab].append(pages[idx])
print(res)
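With the sample values, this should print something like {0: ['p2', 'p3', 'p4'], 1: ['p1', 'p5', 'p6', 'p7'], 2: ['p8', 'p9']}, though the cluster numbering can change between runs.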
Another solution using Sklearn:
from sklearn.cluster import KMeans
import numpy as np
pages = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8', 'p9']
x = np.array((12,1,2,3,10,11,13,67,70))
x = np.float32(x)
x = np.reshape(x, (-1,1))
K=3
km = KMeans(n_clusters=K)
km.fit(x)
labels = km.predict(x)
labels = labels.flatten()
# result
res = dict()
for i in range(K):
    res[i] = []
for idx, lab in enumerate(labels):
    res[lab].append(pages[idx])
print(res)
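Note that in the sklearn version, km.predict already returns a 1-D array, so the labels.flatten() call is a no-op; it is presumably kept just to mirror the OpenCV variant, where cv.kmeans returns an N×1 label array.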
You have done most of the work, but the name of the page should not be included in the K-means calculation; that makes no sense.
In other words, the LabelEncoder is not necessary.
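A minimal sketch of that idea, using the sample data from the question (the cluster numbering may vary between runs):
from sklearn.cluster import KMeans
import pandas as pd

# Sample data from the question.
dataset_PV = pd.DataFrame({
    'page': [f'page{i}' for i in range(1, 10)],
    'Page_Value': [12, 1, 2, 3, 10, 11, 13, 67, 70],
})

# Cluster on Page_Value only; 'page' stays a plain label.
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42)
dataset_PV['clusters'] = kmeans.fit_predict(dataset_PV[['Page_Value']])

# Print the pages grouped by cluster, in the requested format.
for cluster_id, group in dataset_PV.groupby('clusters'):
    print(f'Cluster{cluster_id} = {",".join(group["page"])}')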
TL;DR
For the short answer, you can refer to Sơn Ninh's answer.
If you want a visualization, my answer may help you.
I wrote a function (label_encoding) for you; you can use it to get the id mapping, which helps with drawing the pictures.
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import textwrap
from io import StringIO
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from typing import Union, NamedTuple
from collections import defaultdict
def main():
    df = import_file()
    suitable_n: int
    x = df.Page_Value.values.reshape(-1, 1)
    if not 'Use the Elbow method to get a suitable N.':
        # The non-empty string is truthy, so this block is skipped;
        # change the condition if you want to see the elbow plot.
        elbow_find_n(x)
    suitable_n = 3

    # fit model
    kmeans = KMeans(n_clusters=suitable_n, init='k-means++', random_state=42)
    clusters = kmeans.fit_predict(x)

    # labelEncoder = LabelEncoder()
    # labelEncoder.fit(df['page'])
    # df['page'] = labelEncoder.transform(df['page'])
    df.loc[:, ['page']], mapping_table = label_encoding(df.loc[:, ['page']])
    df = rebuild_df(df, clusters, mapping_table)  # 'page-id', 'page', 'Page_Value', 'clusters'
    print(df)

    dict_by_cluster_value = defaultdict(list)
    for cur_set in set(df['clusters']):  # Output in the format that you defined.
        print(f'Cluster{cur_set} = {",".join(df.page[df.clusters == cur_set])}')
        dict_by_cluster_value[cur_set].extend(df.page[df.clusters == cur_set].to_list())
    print(dict(dict_by_cluster_value))  # defaultdict would work as-is; converting to dict just prints more cleanly.
    visualizing_the_clusters(kmeans, df)


class RGBColor(NamedTuple):
    BLACK = '#000000'
    # AZURE = '#F0FFFF'
    OLIVE = '#808000'
    PINK = '#FFC0CB'
    # WHITE = '#FFFFFF' <-- not suitable: the background is white.
    GOLD = '#FFD700'
    BLUE = '#0000FF'
    GREEN = '#00FF00'
    RED = '#FF0000'
    YELLOW = '#FFFF00'
    ORANGE = '#FFA500'
    PURPLE = '#FF00FF'

    def get_tuple(self):
        # Yields the attribute names (e.g. 'BLACK'), which matplotlib accepts as named colors.
        return (attr_name for attr_name in dir(self) if not attr_name.startswith('_') and attr_name.isupper())


def label_encoding(label_col: Union[pd.DataFrame, np.ndarray], is_need_mapping_table=True) -> tuple:
    """
    USAGE:
        df.loc[:, ['col_xxx', ]], mapping_table = label_encoding(df.loc[:, ['col_xxx']])
    """
    nda_rtn_value = (LabelEncoder().fit_transform(label_col.values.ravel())
                     if isinstance(label_col, pd.DataFrame)
                     else LabelEncoder().fit_transform(label_col))
    rtn_dict = dict()
    if is_need_mapping_table:
        list_value = [e[0] for e in label_col.values] if isinstance(label_col, pd.DataFrame) else [e for e in label_col]
        rtn_dict = dict(zip(nda_rtn_value, list_value))
    if isinstance(label_col, pd.DataFrame):
        nda_rtn_value = nda_rtn_value.reshape(-1, 1)
    return nda_rtn_value, rtn_dict


def import_file() -> pd.DataFrame:
    page_content = textwrap.dedent(  # Remove any common leading whitespace from every line in text.
        """\
        page,Page_Value
        page1,12
        page2,1
        page3,2
        page4,3
        page5,10
        page6,11
        page7,13
        page8,67
        page9,70
        """
    )
    df = pd.read_csv(StringIO(page_content), header=0)
    return df


def elbow_find_n(x):
    wcss = []
    for i in range(1, 10):
        kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
        kmeans.fit(x)
        wcss.append(kmeans.inertia_)
    plt.figure(figsize=(15, 8))
    plt.plot(range(1, 10), wcss, marker='o')
    plt.title('Elbow graph')
    plt.xlabel('Number of Clusters')
    plt.ylabel('within-cluster sums of squares (WCSS)')
    plt.show()


def rebuild_df(df, clusters, mapping_table):
    df['clusters'] = clusters
    df.rename(columns={'page': 'page-id'}, inplace=True)
    df['page'] = df.apply(lambda df_: mapping_table[df_['page-id']], axis=1)
    df = df.reindex(['page-id', 'page', 'clusters', 'Page_Value', ], axis=1)
    return df


def visualizing_the_clusters(kmeans: KMeans, df: pd.DataFrame):
    standard_rgb = RGBColor()
    # plt.scatter(df[df.clusters == 0]['page-id'], df[df.clusters == 0]['Page_Value'], s=2, c='red', label='Careful')
    # plt.scatter(df[df.clusters == 2]['page-id'], df[df.clusters == 2]['Page_Value'], s=2, c='cyan', label='Careless')
    # ...
    for color_idx, (cur_set, color) in enumerate(
            zip(set(df.clusters), standard_rgb.get_tuple())):
        contain_cluster_index = df.clusters == cur_set
        plt.scatter(df[contain_cluster_index]['page-id'], df[contain_cluster_index]['Page_Value'],
                    s=2, c=color, label=f'Cluster{cur_set}: {kmeans.cluster_centers_[cur_set][0]}')
    n_cluster = len(kmeans.cluster_centers_)
    plt.scatter(np.ones(n_cluster) * (-1), kmeans.cluster_centers_[:, 0], s=(50 / n_cluster), c='purple', label='Centroids')
    plt.title('Page and Page_Value')
    plt.xlabel('Page_ID')
    plt.ylabel('Page_Value')
    plt.legend(loc=(1.05, 0.5))
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    main()
Output:
page-id page clusters Page_Value
0 0 page1 0 12
1 1 page2 2 1
2 2 page3 2 2
3 3 page4 2 3
4 4 page5 0 10
5 5 page6 0 11
6 6 page7 0 13
7 7 page8 1 67
8 8 page9 1 70
Cluster0 = page1,page5,page6,page7
Cluster1 = page8,page9
Cluster2 = page2,page3,page4
{0: ['page1', 'page5', 'page6', 'page7'], 1: ['page8', 'page9'], 2: ['page2', 'page3', 'page4']}
I'm currently working on a little project with sklearn and Python, and I'm wondering why my decision boundary line is broken. My dataset consists mostly of measurements of a strain gauge, the associated temperature, and whether the measurement came from a "faulty" sensor.
# coding=utf-8
# Libraries
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier as KN
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sn
from Model import LogisticRegressionUsingGD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
url = "measurementsStrainGauge3.csv"
columnsHead = ['µm', 'tmp','fault']
dataset = pd.read_csv(url, names=columnsHead)
dataset.head()
# X = feature values, all the columns except the last column
X = dataset.iloc[:, :-1]
# y = target values, last column of the data frame
y = dataset.iloc[:, -1]
#Filtering data
faultyData = dataset.loc[y == 1]
notFaultyData = dataset.loc[y == 0]
#Model building
X = np.c_[np.ones((X.shape[0], 1)), X]
y = y[:, np.newaxis]
theta = np.zeros((X.shape[1], 1))
model = LogisticRegression()
model.fit(X, y.ravel())
predicted_classes = model.predict(X)
accuracy = accuracy_score(y.flatten(),predicted_classes)
parameters = model.coef_
params = parameters.flatten()
print(params)
print("Precision : %", accuracy)
x_values = [np.min(X[:, 1] - 5), np.max(X[:, 2] + 5)]
y_values = - (params[0] + np.dot(params[1], x_values)) / params[2]
# Plots
plt.plot(x_values, y_values, label=u'Decision line')
plt.scatter(faultyData.iloc[:, 0], faultyData.iloc[:, 1],
    s=10, label='Faulty')
plt.scatter(notFaultyData.iloc[:, 0], notFaultyData.iloc[:, 1],
    s=10, label='Not faulty')
plt.xlabel(u'Deformation (µ/m)')
plt.ylabel(u'Temperature (°C)')
plt.legend()
plt.show()
Edit: here is the data I use (1 is "faulty" and 0 is "non-faulty"):
6973,15.02,0
3017,41.75,0
5900,61.35,1
8610,63.57,1
5405,44.42,0
3965,-5.13,0
3079,12.64,0
4562,13.09,0
4185,46.78,0
6734,34.73,0
5711,-7.34,0
5006,25.04,0
7614,51.4,1
3265,27.81,0
7218,60.65,1
5852,35.75,0
7880,46.89,0
7819,11.53,0
4775,2.16,0
5128,-14.42,0
6385,-7.32,0
3511,17.18,0
6303,28.88,0
3476,29.81,0
6285,61.21,1
3437,-2.2,0
8914,66.67,1
6306,67.7,1
3327,36.57,0
7842,-16.59,0
7336,67.02,1
4949,57,1
4036,66.4,1
3644,-0.57,0
6082,13.8,0
8044,65.51,1
7659,52.96,1
3319,40.44,0
7928,8.28,0
6812,35.83,0
7080,70.66,1
6876,79.59,1
7826,27.75,0
4514,69,1
5885,-18.39,0
4063,77.65,1
6827,-7.36,0
5085,50.1,1
7353,71.37,1
8878,11.08,0
4385,48.06,0
4204,27.01,0
6614,15.66,0
3379,-12.1,0
8312,-13.57,0
5565,21.29,0
3670,-18.79,0
4152,31.22,0
5448,-17.83,0
3081,32.11,0
8674,32.2,0
4224,21.73,0
7701,63.21,1
8984,18.09,0
6266,5.5,0
8223,32.91,0
3709,76.47,0
4888,-5.16,0
4824,-1.02,0
8579,4.81,0
8588,48.98,0
7805,73.59,1
3859,-1.31,0
4666,43.92,0
3473,-7.51,0
4301,-12.26,0
6421,65.2,1
8345,35.49,0
5840,45.75,0
4702,-1.85,0
6538,7.98,0
3217,44.56,0
6450,70.51,1
3444,12.54,0
5220,-13.33,0
8724,-16.96,0
6043,73.71,1
3187,23.54,0
6696,6.83,0
7928,34.15,0
3013,36.46,0
7376,76.77,1
7752,22.78,0
7328,-14.24,0
6690,71.65,1
6253,-1.57,0
4238,60.1,1
6569,33.7,0
6213,13.37,0
4075,48.68,0
7964,16.1,0
7810,65.45,1
6350,25.03,0
6275,61.15,1
6883,56.02,1
3622,2.82,0
4570,0.04,0
6514,37.81,0
3999,-19.13,0
5082,-6.88,0
6987,25.56,0
5706,42.42,0
5474,28.61,0
5932,4.84,0
4110,-2.27,0
7662,0.89,0
8851,-5.14,0
4370,58.47,1
5541,40.52,0
5408,11.39,0
7986,76.91,1
7124,79.9,1
3654,22.37,0
8165,2.77,0
8452,32.72,0
8849,49.46,0
8517,3.56,0
6027,2.8,0
8405,26.44,0
8313,76.85,1
3545,59.98,0
4033,77.04,1
3083,61.34,0
3041,47.35,0
4901,5.1,0
8225,0.49,0
8525,36.75,0
8402,-4.46,0
6794,36.73,0
6317,79.12,1
4961,18.47,0
5790,11.45,0
6661,-16.26,0
6211,45.59,0
4277,43.98,0
3116,-19.83,0
3971,34.46,0
5417,39.99,0
8881,73.96,1
7119,-12.92,0
7011,48.87,0
6932,31.42,0
4118,32.2,0
4412,70.49,1
5908,20.69,0
5367,3.74,0
7461,24.85,0
5154,26.32,0
6019,46.53,0
4566,-19.92,0
5633,48.09,0
6558,50.27,1
7257,-10.97,0
3896,74.1,0
8084,-5.84,0
3163,40.61,0
3983,45.91,0
4684,23.51,0
5147,75.9,1
6120,72.83,1
8039,63.16,1
6498,-1.05,0
3332,54.26,0
7504,52.7,1
3477,79.28,0
5549,13.41,0
6377,75.99,1
5114,19.59,0
8631,-3.75,0
4806,12.49,0
4923,6.8,0
8470,14.24,0
8032,-12.38,0
5387,-11.47,0
3330,21.95,0
3716,16.77,0
8085,39.17,0
3869,5.53,0
6466,71.76,1
6988,31.83,0
4922,10.24,0
8340,-9.13,0
4136,62.2,1
3747,45.66,0
5042,32.84,0
8492,14.71,0
6282,37.44,0
8732,36.03,0
7694,62.94,1
6814,67.12,1
6757,-2.81,0
5299,8.04,0
5733,71.57,1
3282,61.78,0
7036,53.86,1
3740,47.41,0
4021,53.49,1
5853,-2.98,0
7212,50.47,1
7237,21.88,0
5048,76.42,1
5289,-18.42,0
6370,40.66,0
5922,-0.84,0
4287,40.22,0
3039,50.98,0
7127,68.39,1
7718,45.12,0
5731,75.06,1
7578,76.26,1
7934,18.88,0
3404,72.66,0
8704,-3.06,0
8933,77.09,1
3789,6.55,0
4859,12.35,0
5283,32.99,0
4998,-4.25,0
6613,-1.29,0
5432,23.25,0
7086,17.65,0
4057,-2.48,0
4436,-4.3,0
8527,31.34,0
6375,63.06,1
7101,-13.35,0
5043,30.15,0
7747,29.09,0
4056,30.35,0
8823,21.67,0
4860,48.11,0
3699,69.05,0
4808,69.35,1
6619,25.9,0
4098,3.9,0
8463,73.25,1
5328,41.71,0
5073,68.73,1
4063,49.4,0
3353,29.46,0
6205,21.64,0
7663,5.2,0
6336,28.68,0
6559,64.37,1
5606,29.07,0
4768,5.83,0
5040,8.76,0
7409,36.27,0
7438,56.12,1
8719,42.81,0
3859,5.62,0
5280,-10.07,0
7795,-7.19,0
3874,-17.21,0
3356,6.77,0
3642,19.1,0
3619,65.96,0
5938,5.05,0
7545,65.69,1
5440,36.21,0
7870,30.08,0
3159,20.17,0
8689,44.11,0
5367,76.86,1
8470,-5.38,0
3394,76.58,0
8644,58.69,1
6883,0.8,0
8900,34.32,0
6060,-11.32,0
6081,45.06,0
5936,-8.27,0
3523,47.16,0
6247,77.33,1
4984,31.52,0
4176,21.07,0
3317,36.41,0
8621,10.17,0
6562,1.93,0
5837,8.01,0
5336,64.17,1
6620,44.64,0
5312,59.82,1
6323,11.16,0
7213,55.46,1
6894,30.54,0
7062,40.89,0
6575,36.44,0
3679,77.68,0
6566,29.49,0
7351,-6.37,0
5227,14.63,0
5461,0.9,0
7577,-18.63,0
4630,18.04,0
5132,37.62,0
8925,-17.93,0
8626,62.48,1
6980,21.47,0
8169,72.86,1
5566,63.81,1
7655,37.05,0
7134,-18.12,0
5795,26.67,0
6392,64.86,1
3324,-0.46,0
4810,22.8,0
8712,67.22,1
3803,62.02,0
4065,23.9,0
4695,59.94,1
7620,57.72,1
6799,67.89,1
5147,30.54,0
4629,-14.92,0
3560,-17.5,0
8586,54.64,1
3822,45.33,0
5930,-14.71,0
7754,41.33,0
3547,23.34,0
4163,32.52,0
8550,63.04,1
7552,-1.77,0
7803,-0.39,0
3628,45.4,0
6413,-17.97,0
6258,-14.1,0
7000,-16.14,0
8570,-2.87,0
3395,16.93,0
4259,41.77,0
8980,63.7,1
7635,58.79,1
3271,-5.45,0
3743,-4.47,0
3847,20.11,0
8649,26.46,0
4804,22.25,0
8054,68.84,1
5955,50.28,1
4421,13.44,0
8391,22.63,0
6611,27.72,0
4832,37.76,0
4960,9.2,0
6035,-8.52,0
6136,75.5,1
8702,52.76,1
4351,49.14,0
4085,5.4,0
7357,-11.35,0
5080,25.12,0
5243,79.92,1
6144,36.6,0
4686,27.78,0
4740,77.34,1
8634,22.09,0
3611,38.18,0
5529,13.2,0
3044,2.07,0
5618,1.39,0
3534,5.96,0
3281,21.92,0
6296,-4.04,0
6422,53.66,1
4770,36.74,0
5285,38.3,0
3466,-0.31,0
8347,78.31,1
4789,44.55,0
8260,-4.02,0
8314,8.51,0
4146,2.78,0
8530,-14.13,0
4529,71.55,1
7826,21.49,0
5980,72.18,1
7218,-1.31,0
5861,19.5,0
5662,50.07,1
6087,56.6,1
8219,66.81,1
7180,1.24,0
6594,54.13,1
8408,70.9,1
3766,-0.97,0
3113,35.67,0
7871,71.23,1
4898,-8.25,0
I found a couple of issues in your code.
I couldn't understand why you are # Filtering the data, or what exactly you are trying to do in # Model building; you have ended up changing the data.
When it comes to the plot, you are drawing a line between two x coordinates while generating the y coordinates from the model. This cannot work, because the model predicts the class a point belongs to, and the two features (temperature and deformation) already occupy the two axes.
The right way to visualize a classification task like this is to plot the data together with the decision surface of the model.
I have fixed your code to generate the graph; it's a basic implementation of the complete method.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

url = "measurementsStrainGauge3.csv"
columnsHead = ['µm', 'tmp', 'fault']
dataset = pd.read_csv(url, names=columnsHead)
print(dataset.head())

# X = feature values, all the columns except the last column
X = dataset.iloc[:, :-1].values
# y = target values, last column of the data frame
y = dataset.iloc[:, -1]

model = LogisticRegression()
model.fit(X, y)

# Creating the mesh
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 100),
                     np.arange(y_min, y_max, 1))

# Plotting the decision boundary
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure()
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
plt.title("Decision surface of LogisticRegression")
plt.axis('tight')

colors = "br"
for i, color in zip(model.classes_, colors):
    idx = np.where(y == i)
    plt.scatter(X[idx, 0], X[idx, 1], c=color, cmap=plt.cm.Paired,
                edgecolor='black', s=20)
plt.show()
Resulting plot
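One more thing worth flagging: the two features live on very different scales (deformation in the thousands, temperature in the tens), and the StandardScaler you import is never used. A minimal sketch, reusing the X, y, xx, yy variables from the snippet above, of scaling inside a pipeline so the plotting code keeps working unchanged:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Scale both features to zero mean / unit variance before fitting, so the
# deformation column (thousands) does not dominate the temperature column (tens).
clf = make_pipeline(StandardScaler(), LogisticRegression())
clf.fit(X, y)

# The pipeline applies the same scaling at predict time, so the mesh-grid
# code above works unchanged, just with clf instead of model:
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)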