DBSCAN Clustering loop - python

Using DB scan I am iterating through a csv with x, y, z, and id data. I would like to generate a new csv for a every combination of eps and minimum samples that occur within a set range.
The output csv should also include the number of points in the cluster, eps value, and minimum samples used as columns along with the original x, y, z, and id data. The code below will do this, but will only create a csv for the last cluster calculated.
%matplotlib notebook
from interp import *
import pandas as pd
import pandas as pdfrom sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
import itertools
points_file = "examples/example-1/fcr.csv"
eps_values = np.arange(0.1,0.5,0.1)
min_samples = np.arange(5,20)
dbscan_params = list(itertools.product(eps_values, min_samples))
cluster_n = []
epsvalues = []
min_samp = []
for p in dbscan_params:
dbscan_cluster = DBSCAN(eps=p[0],
min_samples=p[1]).fit(points)
epsvalues.append(p[0])
min_samp.append(p[1]), cluster_n.append(
len(np.unique(dbscan_cluster.labels_)))
df = pd.read_csv(points_file, header=None, names=["x", "y", "z", "id"])
df["cluster"] = clustering.labels_
df["eps"] = eps
df["min_sample"] = min_sam
csv_name = f'fcr_{eps}e{min_sam}m.csv'
df.to_csv(csv_name, index=False)

Related

Want to use the output created from first def as input variables for another second def within class in python

Here I have created the output x from first def(null_checking) function and want to use the same output (x) as input for second def(variance) within class. I tried a lot but couldnot.
#import dependencies
#import dependencies
from optparse import Values
from re import X
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import pandas as pd
from .prac import checking_null
#import datasets
datasets = pd.read_csv('Car_sales.csv')
datasets
features = ['Fuel_efficiency', 'Power_perf_factor', 'Engine_size', 'Horsepower', 'Fuel_capacity', 'Curb_weight']
features
class extraction():
def __init__(self, datasets, features):
self.features = features
self.datasets = datasets
#checking and removing null rows present in the dataframe
def null_checking(self):
datasets1 = self.datasets[self.features]
print(datasets1)
for items, x in enumerate(datasets1):
if items =='True':
continue
x = datasets1.dropna(axis=0)
print(x)
#calculating variance inflation factorss
def variance(self):
x.inner_display()
#we need intercept for calculating variance inflation factor
x['intercept'] = 1
print(x)
#Making the new dataframe
df = pd.DataFrame()
df['variables'] = x.columns
df['VIF'] = [variance_inflation_factor(x, i) for i in range(x.shape[1])]
print(df)

Run Different Scikit-learn Clustering Algorithms on Dataset

I have a dataframe like below. The shape is (24,7)
Name x1 x2 x3 x4 x5 x6
Harry 102 204 0.43 0.21 1.02 0.39
James 242 500 0.31 0.11 0.03 0.73
.
.
.
Mike 3555 4002 0.12 0.03 0.52. 0.11
Henry 532 643 0.01 0.02 0.33 0.10
I want to run Scikit-learn's Different Clustering Algorithms Script on the above dataframe. However, the input data looks quite confusing, not too sure how to input my dataframe
https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html#sphx-glr-auto-examples-cluster-plot-cluster-comparison-py
There are two main differences between your scenario and the scikit-learn example you link to:
You only have one dataset, not several different ones to compare.
You have six features, not just two.
Point one allows you to simplify the example code by deleting the loops over the different datasets and related calculations. Point two implies that you cannot easily plot your results. Instead, you could just add the predicted class labels found by each algorithm to your dataset.
So you could modify the example code like this:
import time
import warnings
import numpy as np
import pandas as pd
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
np.random.seed(0)
# ============
# Introduce your dataset
# ============
my_df = # Insert your data here, as a pandas dataframe.
features = [f'x{i}' for i in range(1, 7)]
X = my_df[features].values
# ============
# Set up cluster parameters
# ============
params = {
"quantile": 0.3,
"eps": 0.3,
"damping": 0.9,
"preference": -200,
"n_neighbors": 3,
"n_clusters": 3,
"min_samples": 7,
"xi": 0.05,
"min_cluster_size": 0.1,
}
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)
# estimate bandwidth for mean shift
bandwidth = max(cluster.estimate_bandwidth(X, quantile=params["quantile"]),
0.001) # arbitrary correction to avoid 0
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(
X, n_neighbors=params["n_neighbors"], include_self=False
)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
# ============
# Create cluster objects
# ============
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params["n_clusters"])
ward = cluster.AgglomerativeClustering(
n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity
)
spectral = cluster.SpectralClustering(
n_clusters=params["n_clusters"],
eigen_solver="arpack",
affinity="nearest_neighbors",
)
dbscan = cluster.DBSCAN(eps=params["eps"])
optics = cluster.OPTICS(
min_samples=params["min_samples"],
xi=params["xi"],
min_cluster_size=params["min_cluster_size"],
)
affinity_propagation = cluster.AffinityPropagation(
damping=params["damping"], preference=params["preference"], random_state=0
)
average_linkage = cluster.AgglomerativeClustering(
linkage="average",
affinity="cityblock",
n_clusters=params["n_clusters"],
connectivity=connectivity,
)
birch = cluster.Birch(n_clusters=params["n_clusters"])
gmm = mixture.GaussianMixture(
n_components=params["n_clusters"], covariance_type="full"
)
clustering_algorithms = (
("MiniBatch\nKMeans", two_means),
("Affinity\nPropagation", affinity_propagation),
("MeanShift", ms),
("Spectral\nClustering", spectral),
("Ward", ward),
("Agglomerative\nClustering", average_linkage),
("DBSCAN", dbscan),
("OPTICS", optics),
("BIRCH", birch),
("Gaussian\nMixture", gmm),
)
for name, algorithm in clustering_algorithms:
t0 = time.time()
# catch warnings related to kneighbors_graph
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="the number of connected components of the "
+ "connectivity matrix is [0-9]{1,2}"
+ " > 1. Completing it to avoid stopping the tree early.",
category=UserWarning,
)
warnings.filterwarnings(
"ignore",
message="Graph is not fully connected, spectral embedding"
+ " may not work as expected.",
category=UserWarning,
)
algorithm.fit(X)
t1 = time.time()
if hasattr(algorithm, "labels_"):
y_pred = algorithm.labels_.astype(int)
else:
y_pred = algorithm.predict(X)
# Add cluster labels to the dataset
my_df[name] = y_pred
PS : please replace : data = X_data.iloc[:20000] by your X
import numpy as np
import matplotlib as plt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import cluster, metrics
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn import preprocessing
from collections import Counter
from sklearn.cluster import DBSCAN
from sklearn import mixture
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
comp_model = pd.DataFrame(columns=['Model', 'Score_Silhouette',
'num_clusters', 'size_clusters',
'parameters'])
K-Means :
def k_means(X_data, nb_clusters, model_comp):
ks = nb_clusters
inertias = []
data = X_data.iloc[:20000]
X = data.values
X_scaled = preprocessing.StandardScaler().fit_transform(X)
for num_clusters in ks:
# Create a KMeans instance with k clusters: model
model = KMeans(n_clusters=num_clusters, n_init=1)
# Fit model to samples
model.fit(X_scaled)
# Append the inertia to the list of inertias
inertias.append(model.inertia_)
silh = metrics.silhouette_score(X_scaled, model.labels_)
# Counting the amount of data in each cluster
taille_clusters = Counter(model.labels_)
data = [{'Model': 'kMeans',
'Score_Silhouette': silh,
'num_clusters': num_clusters,
'size_clusters': taille_clusters,
'parameters': 'nb_clusters :'+str(num_clusters)}]
model_comp = model_comp.append(data, ignore_index=True, sort=False)
# Plot ks vs inertias
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()
return model_comp
comp_model = k_means(X_data=df,
nb_clusters=pd.np.arange(2, 11, 1),
model_comp=comp_model)
DBscan :
def dbscan_grid_search(X_data, model_comp, eps_space=0.5,
min_samples_space=5, min_clust=0, max_clust=10):
data = X_data.iloc[:20000]
X = data.values
X_scaled = preprocessing.StandardScaler().fit_transform(X)
# Starting a tally of total iterations
n_iterations = 0
# Looping over each combination of hyperparameters
for eps_val in eps_space:
for samples_val in min_samples_space:
dbscan_grid = DBSCAN(eps=eps_val,
min_samples=samples_val)
# fit_transform
clusters = dbscan_grid.fit_predict(X=X_scaled)
# Counting the amount of data in each cluster
cluster_count = Counter(clusters)
#n_clusters = sum(abs(pd.np.unique(clusters))) - 1
n_clusters = len(set(clusters)) - (1 if -1 in clusters else 0)
# Increasing the iteration tally with each run of the loop
n_iterations += 1
# Appending the lst each time n_clusters criteria is reached
if n_clusters >= min_clust and n_clusters <= max_clust:
silh = metrics.silhouette_score(X_scaled, clusters)
data = [{'Model': 'Dbscan',
'Score_Silhouette': silh,
'num_clusters': n_clusters,
'size_clusters': cluster_count,
'parameters': 'eps :'+str(eps_val)+'+ samples_val :'+str(samples_val)}]
model_comp = model_comp.append(
data, ignore_index=True, sort=False)
return model_comp
comp_model = dbscan_grid_search(X_data=df,
model_comp=comp_model,
eps_space=pd.np.arange(0.1, 5, 0.6),
min_samples_space=pd.np.arange(1, 30, 3),
min_clust=2,
max_clust=10)
GMM :
def gmm(X_data, nb_clusters, model_comp):
ks = nb_clusters
data = X_data.iloc[:20000]
X = data.values
X_scaled = preprocessing.StandardScaler().fit_transform(X)
for num_clusters in ks:
# Create a KMeans instance with k clusters: model
gmm = mixture.GaussianMixture(n_components=num_clusters).fit(X_scaled)
# Fit model to samples
gmm.fit(X_scaled)
pred = gmm.predict(X_scaled)
cluster_count = Counter(pred)
silh = metrics.silhouette_score(X_scaled, pred)
data = [{'Model': 'GMM',
'Score_Silhouette': silh,
'num_clusters': num_clusters,
'size_clusters': cluster_count,
'parameters': 'nb_clusters :'+str(num_clusters)}]
model_comp = model_comp.append(data, ignore_index=True, sort=False)
return model_comp
comp_model = gmm(X_data=df,
nb_clusters=pd.np.arange(2, 11, 1),
model_comp=comp_model
)
At the end you will have comp_model which will contain all the results of your algo. Here I am using three algorithms, after you selected the best fit for you (with score silhouette and number of cluster).
You should check the repartitions of each cluster :
https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py

How to use numpy return_inverse table on a different array?

I have a lookup table created using -
lookupTable, data_training_panda_y_indexed = np.unique(data_training_panda_y, return_inverse=True)
However, I want to apply the lookupTable on a different array data_cross_validation_panda_y
data_training_panda_y is a list of strings which can be these values - Incoming, Outgoing, Neutral.
So, lookUpTable is ndArray ('Incoming' 'Outgoing 'Neutral')
Code so far -
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from numpy import dtype
from _codecs import lookup
#Load data
data = np.genfromtxt('../Data/bezdekIris.csv',delimiter=',',usecols=[0,1,2,3,4],dtype=None)
labels = np.genfromtxt('../Data/bezdekIris.csv',delimiter=',',usecols=[4],dtype=None)
#Shuffle the rows
np.random.shuffle(data)
#Cut the data into 3 parts
data_rows = np.size(data, 0)
training_rows = int(round(0.6*data_rows))
cross_validation_rows = int(round(0.2*data_rows))
testing_rows = data_rows - training_rows - cross_validation_rows
data_training_panda = pd.DataFrame(data[:training_rows])
data_training_panda_X = data_training_panda.iloc[:,0:4]
data_training_panda_y = data_training_panda.iloc[:,4]
data_cross_validation_panda = pd.DataFrame(data[training_rows:training_rows+cross_validation_rows])
data_cross_validation_panda_X = data_cross_validation_panda.iloc[:,0:4]
data_cross_validation_panda_y = data_cross_validation_panda.iloc[:,4]
data_testing_panda = pd.DataFrame(data[training_rows+cross_validation_rows:])
data_testing_panda_X = data_testing_panda.iloc[:,0:4]
data_testing_panda_y = data_testing_panda.iloc[:,4]
#Take out the labels from the 3 parts
lookupTable, data_training_panda_y_indexed = np.unique(data_training_panda_y, return_inverse=True)
#Label the CV and Testing
data_cross_validation_panda_y_indexed = np.array([])
data_testing_panda_y_indexed = np.array([])
bezdekIris.csv Sample Data -
5.1,3.5,1.4,0.2,Incoming
4.9,3.0,1.4,0.2,Outgoing
4.7,3.2,1.3,0.2,Netural
Using searchsorted could be a solution.
data_cross_validation_panda_y_indexed = np.searchsorted(lookupTable, data_cross_validation_panda_y)

get the centroid row index from k-means clustering using sklearn

Hy all,
I have a panda DataFrame from which, i would like to cluster all rows and get the row index of each cluster centroid . I am using sklearn and this is what i have:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
X = pd.DataFrame(np.random.rand(10,5))
kmeans = KMeans(n_clusters=3)
Y = pd.DataFrame(kmeans.fit_predict(X.as_matrix()),columns=['cluster ID'] )
Z =pd.DataFrame(kmeans.cluster_centers_[Y['cluster ID']])
result = pd.concat([X , Y, Z], axis=1)
pd.DataFrame(result)
is there any way to get the index of the closest row to centroid
thx
Thx. This code work:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
X = pd.DataFrame(np.random.rand(10,5))
model= KMeans(n_clusters=3)
clusassign = model.fit_predict(X.as_matrix())
min_dist = np.min(cdist(X.as_matrix(), model.cluster_centers_, 'euclidean'), axis=1)
Y = pd.DataFrame(min_dist, index=X.index, columns=['Center_euclidean_dist'])
Z = pd.DataFrame(clusassign, index=X.index, columns=['cluster_ID'])
PAP = pd.concat([Y,Z], axis=1)
grouped = PAP.groupby(['cluster_ID'])
grouped.idxmin()

Plotting Historical Cointegration Values between two pairs

Here is the sample ADF test in python to check for Cointegration between two pairs. However the final result gives only the numeric value for co-integration. How to get the historical results of Co-integration.
Taken from http://www.leinenbock.com/adf-test-in-python/
import numpy as np
import statsmodels.api as stat
import statsmodels.tsa.stattools as ts
x = np.random.normal(0,1, 1000)
y = np.random.normal(0,1, 1000)
def cointegration_test(y, x):
result = stat.OLS(y, x).fit()
return ts.adfuller(result.resid)
I assume you want to test for expanding cointegration? Note that you should use sm.tsa.coint to test for cointegration. You could test for historical cointegrating relationship between realgdp and realdpi using pandas like so
import pandas as pd
import statsmodels.api as sm
data = sm.datasets.macrodata.load_pandas().data
def rolling_coint(x, y):
yy = y[:len(x)]
# returns only the p-value
return sm.tsa.coint(x, yy)[1]
historical_coint = pd.expanding_apply(data.realgdp, rolling_coint,
min_periods=36,
args=(data.realdpi,))

Categories