How to adjust this DBSCAN clustering algorithm in Python

I am using this clustering algorithm to cluster lat and lon points. I am using pre-written code which is given at http://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html.
The code is as follows and takes in my file with over 4000 lat and lon points. However, I want to adjust this code so that it only defines a cluster as points within, say, 0.000020 of each other, as I want my clusters to be almost at street level.
At the moment I am getting 11 clusters, whereas in theory I want at least 100 clusters. I have tried adjusting and changing different values, but to no avail.
print(__doc__)
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
##############################################################################
# Generate sample data
input = np.genfromtxt(open("dataset_import_noaddress.csv","rb"),delimiter=",", skip_header=1)
coordinates = np.delete(input, [0,1], 1)
X, labels_true = make_blobs(n_samples=4000, centers=coordinates, cluster_std=0.0000005,
                            random_state=0)
X = StandardScaler().fit_transform(X)
##############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
% metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
% metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
% metrics.silhouette_score(X, labels))
##############################################################################
# Plot result
import matplotlib.pyplot as plt
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'
    class_member_mask = (labels == k)
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

You appear to be changing the data generation only:
X, labels_true = make_blobs(n_samples=4000, centers=coordinates, cluster_std=0.0000005,
                            random_state=0)
instead of the clustering algorithm:
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
With the data run through StandardScaler, an eps of 0.3 is a large radius that reaches across almost your complete data set, which is why you only get a handful of clusters. It is DBSCAN's eps (and min_samples) that defines what counts as a cluster, not the cluster_std of make_blobs.
For geographic data, make sure to use haversine distance instead of Euclidean distance. Earth is more like a sphere than a flat Euclidean world.
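A minimal sketch of that advice applied to the question's setup: cluster the raw coordinates directly (no make_blobs, no StandardScaler) with haversine distance and an eps expressed in kilometres. The file name comes from the question; the column indices and the 100 m radius are assumptions to adjust:
import numpy as np
from sklearn.cluster import DBSCAN

data = np.genfromtxt("dataset_import_noaddress.csv", delimiter=",", skip_header=1)
coords = data[:, 2:4]                      # assumed [lat, lon] columns; match your file layout
kms_per_radian = 6371.0088
eps_km = 0.1                               # ~100 m neighbourhood, i.e. roughly street level
db = DBSCAN(eps=eps_km / kms_per_radian, min_samples=10,
            algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
labels = db.labels_
print('Estimated number of clusters: %d' % (len(set(labels)) - (1 if -1 in labels else 0)))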

Related

Selecting data points in the neighbourhood of support vectors

I have been thinking of this but am not sure how to do it. I have binary imbalanced data and would like to use an SVM to select just a subset of the majority-class data points nearest to the support vector. After that, I can fit a binary classifier on this "balanced" data.
To illustrate what I mean, a MWE:
# packages import
from collections import Counter
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
import seaborn as sns
# sample data
X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.9], flip_y=0, random_state=1)
# class distribution summary
print(Counter(y))
Counter({0: 91, 1: 9})
# fit svm model
svc_model = SVC(kernel='linear', random_state=32)
svc_model.fit(X, y)
plt.figure(figsize=(10, 8))
# Plotting our two-features-space
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, s=50)
# Constructing a hyperplane using a formula.
w = svc_model.coef_[0] # w consists of 2 elements
b = svc_model.intercept_[0] # b consists of 1 element
x_points = np.linspace(-1, 1) # generating x-points from -1 to 1
y_points = -(w[0] / w[1]) * x_points - b / w[1] # getting corresponding y-points
# Plotting a red hyperplane
plt.plot(x_points, y_points, c='r')
The two classes are well separated by the hyperplane. We can see the support vectors for both classes (even better for class 1).
Since the minority class 1 has 9 data points, I want to down-sample class 0 by selecting its support vector and the 8 other data points nearest to it, so that the class distribution becomes {0: 9, 1: 9}, ignoring all other data points of class 0. I will then use this to fit a binary classifier like LR (or even SVC).
My question is how to select those data points of class 0 nearest to the class support vector, in a way that balances them against the data points of the minority class 1.
This can be achieved as follows: get the support vector for class 0 (sv0), iterate over all data points in class 0 (X[y == 0]), compute the distances (d) to the point represented by the support vector, sort them, take the 9 with the smallest values, and concatenate them with the points of class 1 to create the downsampled data (X_ds, y_ds).
sv0 = svc_model.support_vectors_[0]
distances = []
for i, x in enumerate(X[y == 0]):
    d = np.linalg.norm(sv0 - x)
    distances.append((i, d))
distances.sort(key=lambda tup: tup[1])
index = [i for i, d in distances][:9]
X_ds = np.concatenate((X[y == 0][index], X[y == 1]))
y_ds = np.concatenate((y[y == 0][index], y[y == 1]))
plt.plot(x_points[19:-29], y_points[19:-29], c='r')
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, s=50)
plt.scatter(X_ds[y_ds == 0][:,0], X_ds[y_ds == 0][:,1], color='yellow', alpha=0.4)
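For reference, a more compact, vectorized variant of the same selection (a sketch reusing svc_model, X, and y from the snippets above; it should pick the same 9 class-0 points as the loop-based version):
sv0 = svc_model.support_vectors_[0]                          # support vector used as the reference point
X0 = X[y == 0]
nearest = np.argsort(np.linalg.norm(X0 - sv0, axis=1))[:9]   # indices of the 9 closest class-0 points
X_ds = np.concatenate((X0[nearest], X[y == 1]))
y_ds = np.concatenate((y[y == 0][nearest], y[y == 1]))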

How to implement kmeans clustering as a feature for classification techniques in SVM?

I've already created a clustering and saved the model, but I'm confused about what I should do with this model and how to use it as a feature for classification.
The clustering is based on the coordinates of crime locations. After the data has been clustered, I want to use the cluster output as features in an SVM.
import pandas as pd
import matplotlib.pyplot as plt
import random
import numpy as np
import xlrd
import pickle
import tkinter as tk
from tkinter import *
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')
#kmeans section
#Creating and labelling latitudes of X and Y and plotting it
data=pd.read_excel("sanfrancisco.xlsx")
x1=data['X']
y1=data['Y']
X = np.array(list(zip(x1,y1)))
# Elbow method
from sklearn.cluster import KMeans
wcss = []  # empty list to hold the within-cluster sum of squares
# try cluster counts from 1 to 10
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++')  # will generate centroids
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)  # inertia_ = within-cluster sum of squared distances
plot1 = plt.figure(1)
plt.xlabel("Number of Clusters")
plt.ylabel("Euclidean Distance")
plt.plot(range(1,11), wcss)
k = 3
# data visual section.. Eg: how many crimes in diff month, most number of crime in a day in a week
# most number crime in what address, most number of crimes in what city, how many crime occur
# in how much time. , etc..
# X coordinates of random centroids
C_x = np.random.randint(0, np.max(X)-20, size=k)
# Y coordinates of random centroids
C_y = np.random.randint(0, np.max(X)-20, size=k)
C = np.array(list(zip(C_x,C_y)), dtype=np.float32)
print("Initial Centroids")
print(C)
# n_clusters sets the number of clusters; init='random' picks random data points as the initial centroids
# by default scikit-learn runs k-means 10 times and keeps the best run; to avoid that here, n_init is set to 1
model = KMeans(n_clusters=k, init='random', n_init=1)
model.fit_transform(X)
centroids = model.cluster_centers_ # final centroids
rgb_colors = {0.: 'y',
              1.: 'c',
              2.: 'fuchsia',
              }
if k == 4:
    rgb_colors[3.] = 'lime'
if k == 6:
    rgb_colors[3.] = 'lime'
    rgb_colors[4.] = 'orange'
    rgb_colors[5.] = 'tomato'
new_labels = pd.Series(model.labels_.astype(float)) # label that predicted by kmeans
plot2 = plt.figure(2)
plt.scatter(x1, y1, c=new_labels.map(rgb_colors), s=20)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', c='black', s=200 )
plt.xlabel('Final Cluster Centers\n Iteration Count=' + str(model.n_iter_) +
           '\n Objective Function Value: ' + str(model.inertia_))
plt.ylabel('y')
plt.title("k-Means")
plt.show()
# save the model to disk
filename = 'clusteredmatrix.sav'
pickle.dump(model, open(filename,'wb'))
Your problem is not very clear, but if you want to explore the behaviour of the clusters, I recommend using a tool like Weka, so that you can cluster the data interactively and draw meaningful inferences before going into more complex coding.
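If the goal is to actually feed the clusters into an SVM in Python, one common pattern (a sketch, not something the answer above spells out) is to append each point's k-means cluster label as an extra feature column. The arrays below are random stand-ins: X plays the role of the coordinate array from the question and y is a hypothetical crime-category target, one value per row.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.svm import SVC

X = np.random.rand(200, 2)                              # stand-in for the crime coordinates
y = np.random.randint(0, 2, size=len(X))                # placeholder crime-category labels, replace with your own
kmeans = KMeans(n_clusters=3, n_init=10).fit(X)
X_with_cluster = np.column_stack([X, kmeans.labels_])   # coordinates plus the cluster id as a third feature
svc = SVC(kernel='rbf').fit(X_with_cluster, y)
print(svc.score(X_with_cluster, y))
Depending on the classifier, one-hot encoding the cluster id (for example with sklearn.preprocessing.OneHotEncoder) often works better than feeding it in as a raw integer.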

How to cluster data with user_id - k-means algorithm

I want to cluster data of users by user_id, because I need to analyze each cluster after clustering.
My clustering algorithm is k-means with k=3. I'm using Python.
my data:
V1,V2
100,10
150,20
200,10
120,15
300,10
400,10
300,10
400,10
I removed the user_id column from this data; as far as I know, user_id should be removed before k-means clustering.
my python code:
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
from copy import deepcopy
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')
# Importing the dataset
data = pd.read_csv('C:/Users/S.M_Emamian/Desktop/xclara.csv')
print("Input Data and Shape")
print(data.shape)
data.head()
# Getting the values and plotting it
f1 = data['V1'].values
f2 = data['V2'].values
X = np.array(list(zip(f1, f2)))
plt.scatter(f1, f2, c='black', s=7)
# Euclidean Distance Caculator
def dist(a, b, ax=1):
    return np.linalg.norm(a - b, axis=ax)
# Number of clusters
k = 3
# X coordinates of random centroids
C_x = np.random.randint(0, np.max(X)-20, size=k)
# Y coordinates of random centroids
C_y = np.random.randint(0, np.max(X)-20, size=k)
C = np.array(list(zip(C_x, C_y)), dtype=np.float32)
print("Initial Centroids")
print(C)
# Plotting along with the Centroids
plt.scatter(f1, f2, c='#050505', s=7)
plt.scatter(C_x, C_y, marker='*', s=200, c='g')
# To store the value of centroids when it updates
C_old = np.zeros(C.shape)
# Cluster labels (0, 1, 2)
clusters = np.zeros(len(X))
# Error func. - Distance between new centroids and old centroids
error = dist(C, C_old, None)
# Loop will run till the error becomes zero
while error != 0:
    # Assigning each value to its closest cluster
    for i in range(len(X)):
        distances = dist(X[i], C)
        cluster = np.argmin(distances)
        clusters[i] = cluster
    # Storing the old centroid values
    C_old = deepcopy(C)
    # Finding the new centroids by taking the average value
    for i in range(k):
        points = [X[j] for j in range(len(X)) if clusters[j] == i]
        C[i] = np.mean(points, axis=0)
    error = dist(C, C_old, None)
colors = ['r', 'g', 'b', 'y', 'c', 'm']
fig, ax = plt.subplots()
for i in range(k):
    points = np.array([X[j] for j in range(len(X)) if clusters[j] == i])
    ax.scatter(points[:, 0], points[:, 1], s=7, c=colors[i])
ax.scatter(C[:, 0], C[:, 1], marker='*', s=200, c='#050505')
'''
==========================================================
scikit-learn
==========================================================
'''
from sklearn.cluster import KMeans
# Number of clusters
kmeans = KMeans(n_clusters=3)
# Fitting the input data
kmeans = kmeans.fit(X)
# Getting the cluster labels
labels = kmeans.predict(X)
# Centroid values
centroids = kmeans.cluster_centers_
# Comparing with scikit-learn centroids
print("Centroid values")
print("Scratch")
print(C) # From Scratch
print("sklearn")
print(centroids) # From sci-kit learn
My code works fine and it visualizes my data as well, but I need to keep user_id.
For example, I would like to know which of the clusters user_id=5 belongs to.
Just add user_id after clustering.
Actually, what you probably want to do is the opposite: just add the cluster labels to your original data, which still has the user ids.
As long as you don't change the data order this is a trivial stacking operation.
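A minimal sketch of that stacking step, using the small V1/V2 sample from the question and made-up user ids (the column names are assumptions):
import pandas as pd
from sklearn.cluster import KMeans

df = pd.DataFrame({'user_id': [1, 2, 3, 4, 5, 6, 7, 8],
                   'V1': [100, 150, 200, 120, 300, 400, 300, 400],
                   'V2': [10, 20, 10, 15, 10, 10, 10, 10]})
kmeans = KMeans(n_clusters=3).fit(df[['V1', 'V2']])   # cluster only the feature columns
df['cluster'] = kmeans.labels_                        # row order is unchanged, so this lines up
print(df.loc[df['user_id'] == 5, 'cluster'])          # which cluster does user_id=5 fall in?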

sklearn DBSCAN to cluster GPS positions with big epsilon

I want to use DBSCAN from sklearn to find clusters in my GPS positions. I don't understand why the coordinate [18.28, 57.63] (lower right corner in the figure) is clustered together with the other coordinates to the left. Could it be a problem with the large epsilon? I am using sklearn version 0.19.0.
To reproduce this:
I copied demo code from here: http://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html but I replaced the sample data with a few coordinates (see variable X in the code below). I got the inspiration from here: http://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
# #############################################################################
# Generate sample data
X = np.array([[ 11.95, 57.70],
              [ 16.28, 57.63],
              [ 16.27, 57.63],
              [ 16.28, 57.66],
              [ 11.95, 57.63],
              [ 12.95, 57.63],
              [ 18.28, 57.63],
              [ 11.97, 57.70]])
# #############################################################################
# Compute DBSCAN
kms_per_radian = 6371.0088
epsilon = 400 / kms_per_radian
db = DBSCAN(eps=epsilon, min_samples=2, algorithm='ball_tree', metric='haversine').fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
# #############################################################################
# Plot result
import matplotlib.pyplot as plt
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]
    class_member_mask = (labels == k)
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
I recently made the same mistake (using hdbscan), and it was the cause of some 'strange' results. For example, the same point would sometimes be included in a cluster, and sometimes be flagged as a noise point. "How can this be?", I kept wondering. It turned out to be because I was passing lat/lon directly and not converting to radians first.
The OP's self-supplied answer is correct, but short on details. One could, of course, just multiply the lat/lon values by π/180, but if you are already using numpy anyway, the simplest fix is to change this line in the original code:
db = DBSCAN(eps=epsilon, ... metric='haversine').fit(X)
to:
db = DBSCAN(eps=epsilon, ... metric='haversine').fit(np.radians(X))
The haversine metric requires the data to be in radians.
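Putting the fix together, a small end-to-end sketch with illustrative coordinates (note that scikit-learn's haversine metric expects [latitude, longitude] pairs, so make sure the column order matches your data):
import numpy as np
from sklearn.cluster import DBSCAN

latlon = np.array([[57.70, 11.95],   # degrees, [lat, lon]
                   [57.70, 11.97],
                   [57.63, 16.28],
                   [57.63, 16.27]])
kms_per_radian = 6371.0088
db = DBSCAN(eps=50 / kms_per_radian, min_samples=2,   # 50 km neighbourhood, expressed in radians
            algorithm='ball_tree', metric='haversine').fit(np.radians(latlon))
print(db.labels_)   # two clusters: [0 0 1 1]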

How to get the result of DBSCAN, referring to the example from http://scikit-learn.org/

Referring to this example of using DBSCAN, the real data input for the clustering process is 'X', but following the example, I used 'X1' to build the clustering model.
# -*- coding: utf-8 -*-
"""
===================================
Demo of DBSCAN clustering algorithm
===================================
Finds core samples of high density and expands clusters from them.
"""
#print(__doc__)
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X=[(9,0),(7,8),(8,6),(1,2),(1,3),(7,6),(10,14)]
X1 = StandardScaler().fit_transform(X)
##############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X1)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)  # boolean array of False, same shape as db.labels_
core_samples_mask[db.core_sample_indices_] = True  # set True at the indices of the core samples
labels = db.labels_
print "cluster: ", set(labels)
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
In this case I want to get the members of the noise cluster, so I print xy when k == -1. Unfortunately, xy refers to X1, not the real data X.
# Plot result
import matplotlib.pyplot as plt
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    class_member_mask = (labels == k)
    if k == -1:
        # Black used for noise.
        xy = X1[class_member_mask]
        print "Noise :", xy
    else:
        xy = X1[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
        xy = X1[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
When I try to replace X1 with 'X', I get an error.
xy = X[class_member_mask]
error:
xy=X[class_member_mask&~core_samples_mask]
TypeError: only integer arrays with one element can be converted to an index
Maybe it's because the formats of X1 and X are different. I think it would be solved if I knew how to convert X to the same format as X1.
X=[(9,0),(7,8),(8,6),(1,2),(1,3),(7,6),(10,14)]
X1=[[ 0.8406627 -1.30435512]
[ 0.25219881 0.56856505]
[ 0.54643076 0.10033501]
[-1.51319287 -0.83612508]
[-1.51319287 -0.60201006]
[ 0.25219881 0.10033501]
[ 1.13489465 1.97325518]]
Please help me, any suggestions are appreciated.
Convert the data to a NumPy array. Boolean masks like class_member_mask can only index arrays, not plain Python lists: X1 (the output of StandardScaler) is already an array, but X is a list of tuples, which is why X[class_member_mask & ~core_samples_mask] raises the TypeError. Converting X fixes it:
X = np.asarray(X)
xy = X[class_member_mask]  # the noise points in the original, unscaled coordinates
