I am doing DBSCAN clustering in Python. I want an adaptive way to return the number of clusters by self-calculating the eps and MinPts parameters. Below is my code.
import math
import copy
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
def loadDataSet(fileName, splitChar='\t'):
    dataSet = []
    with open(fileName) as fr:
        for line in fr.readlines():
            curline = line.strip().split(splitChar)
            fltline = list(map(float, curline))
            dataSet.append(fltline)
    return dataSet

def dist(a, b):
    return math.sqrt(math.pow(a[0]-b[0], 2) + math.pow(a[1]-b[1], 2))

def returnDk(matrix, k):
    Dk = []
    for i in range(len(matrix)):
        Dk.append(matrix[i][k])
    return Dk

def returnDkAverage(Dk):
    sum = 0
    for i in range(len(Dk)):
        sum = sum + Dk[i]
    return sum/len(Dk)

def CalculateDistMatrix(dataset):
    DistMatrix = [[0 for j in range(len(dataset))] for i in range(len(dataset))]
    for i in range(len(dataset)):
        for j in range(len(dataset)):
            DistMatrix[i][j] = dist(dataset[i], dataset[j])
    return DistMatrix

def returnEpsCandidate(dataSet):
    DistMatrix = CalculateDistMatrix(dataSet)
    tmp_matrix = copy.deepcopy(DistMatrix)
    for i in range(len(tmp_matrix)):
        tmp_matrix[i].sort()
    EpsCandidate = []
    for k in range(1, len(dataSet)):
        Dk = returnDk(tmp_matrix, k)
        DkAverage = returnDkAverage(Dk)
        EpsCandidate.append(DkAverage)
    return EpsCandidate

def returnMinptsCandidate(DistMatrix, EpsCandidate):
    MinptsCandidate = []
    for k in range(len(EpsCandidate)):
        tmp_eps = EpsCandidate[k]
        tmp_count = 0
        for i in range(len(DistMatrix)):
            for j in range(len(DistMatrix[i])):
                if DistMatrix[i][j] <= tmp_eps:
                    tmp_count = tmp_count + 1
        MinptsCandidate.append(tmp_count/len(dataSet))
    return MinptsCandidate

def returnClusterNumberList(dataset, EpsCandidate, MinptsCandidate):
    np_dataset = np.array(dataset)
    ClusterNumberList = []
    for i in range(len(EpsCandidate)):
        clustering = DBSCAN(eps=EpsCandidate[i], min_samples=MinptsCandidate[i]).fit(np_dataset)
        num_clustering = max(clustering.labels_)
        ClusterNumberList.append(num_clustering)
    return ClusterNumberList

if __name__ == '__main__':
    data = pd.read_csv('/Users/Desktop/Mic/recorder_test1/New folder/MFCCresultsforclustering/MFCCresultsforclustering.csv')
    dataSet = data.iloc[:, 0:13].values
    EpsCandidate = returnEpsCandidate(dataSet)
    DistMatrix = CalculateDistMatrix(dataSet)
    MinptsCandidate = returnMinptsCandidate(DistMatrix, EpsCandidate)
    ClusterNumberList = returnClusterNumberList(dataSet, EpsCandidate, MinptsCandidate)
    print(EpsCandidate)
    print(MinptsCandidate)
    print('cluster number list is')
    print(ClusterNumberList)
However, with the loaded data set the output is all -1s (every point is labelled as noise). I am wondering where the mistake is. Is this general direction right? If not, how can I achieve adaptive DBSCAN clustering?
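Two things worth checking, sketched below under stated assumptions (scikit-learn's DBSCAN and NearestNeighbors; the helper names are mine, not from the original code): min_samples is documented as an int, while the candidates above are floats from tmp_count/len(dataSet), and the cluster count is usually taken as the number of distinct labels excluding the noise label -1, not max(labels_). A common alternative to averaging the k-th distances is the k-distance heuristic:

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

def cluster_count(labels):
    # number of clusters found, ignoring the noise label -1
    return len(set(labels)) - (1 if -1 in labels else 0)

def adaptive_dbscan(X, k=4):
    # k-distance heuristic sketch: eps taken from the k-th nearest-neighbour
    # distances (here simply their mean; the elbow of the sorted curve is the usual choice)
    dists, _ = NearestNeighbors(n_neighbors=k + 1).fit(X).kneighbors(X)
    eps = float(np.mean(dists[:, k]))                 # distance to the k-th neighbour
    labels = DBSCAN(eps=eps, min_samples=int(k)).fit(X).labels_   # min_samples as an int
    return eps, cluster_count(labels)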
Friedman's H-statistic: the Interpretable Machine Learning book by Christoph Molnar gives a workable approach, using Friedman's H-statistic, which decomposes the partial dependence values to quantify feature interactions.
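For reference, the second-order statistic for a pair of features j and k can be written (following Friedman and Popescu, with PD denoting the centered partial dependence functions; the centering is what the center() helper below implements):

H^2_{jk} = \frac{\sum_{i=1}^{n} \left[ PD_{jk}(x_i) - PD_j(x_i) - PD_k(x_i) \right]^2}{\sum_{i=1}^{n} PD_{jk}^2(x_i)}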
In Python, sklearn_gbmi accepts feature sets of length two and higher but does not support the first-order measure, much like interact.gbm in R, and it only works on gradient-boosting-based models.
I found a manual Python implementation (posted below for reference) that calculates the feature interactions.
import itertools
import math
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from pdpbox.pdp_calc_utils import _calc_ice_lines_inter
from pdpbox.pdp import pdp_isolate, PDPInteract
from pdpbox.utils import (_check_model, _check_dataset, _check_percentile_range, _check_feature,
                          _check_grid_type, _check_memory_limit, _make_list,
                          _calc_memory_usage, _get_grids, _get_grid_combos, _check_classes)
from joblib import Parallel, delayed
def pdp_multi_interact(model, dataset, model_features, features,
                       num_grid_points=None, grid_types=None, percentile_ranges=None, grid_ranges=None,
                       cust_grid_points=None,
                       cust_grid_combos=None, use_custom_grid_combos=False,
                       memory_limit=0.5, n_jobs=1, predict_kwds=None, data_transformer=None):

    def _expand_default(x, default, length):
        if x is None:
            return [default] * length
        return x

    def _get_grid_combos(feature_grids, feature_types):
        grids = [list(feature_grid) for feature_grid in feature_grids]
        for i in range(len(feature_types)):
            if feature_types[i] == 'onehot':
                grids[i] = np.eye(len(grids[i])).astype(int).tolist()
        return np.stack(np.meshgrid(*grids), -1).reshape(-1, len(grids))

    if predict_kwds is None:
        predict_kwds = dict()

    nr_feats = len(features)

    # check function inputs
    n_classes, predict = _check_model(model=model)
    _check_dataset(df=dataset)
    _dataset = dataset.copy()

    # prepare the grid
    pdp_isolate_outs = []
    if use_custom_grid_combos:
        grid_combos = cust_grid_combos
        feature_grids = []
        feature_types = []
    else:
        num_grid_points = _expand_default(x=num_grid_points, default=10, length=nr_feats)
        grid_types = _expand_default(x=grid_types, default='percentile', length=nr_feats)
        for i in range(nr_feats):
            _check_grid_type(grid_type=grid_types[i])
        percentile_ranges = _expand_default(x=percentile_ranges, default=None, length=nr_feats)
        for i in range(nr_feats):
            _check_percentile_range(percentile_range=percentile_ranges[i])
        grid_ranges = _expand_default(x=grid_ranges, default=None, length=nr_feats)
        cust_grid_points = _expand_default(x=cust_grid_points, default=None, length=nr_feats)
        _check_memory_limit(memory_limit=memory_limit)

        pdp_isolate_outs = []
        for idx in range(nr_feats):
            pdp_isolate_out = pdp_isolate(
                model=model, dataset=_dataset, model_features=model_features, feature=features[idx],
                num_grid_points=num_grid_points[idx], grid_type=grid_types[idx], percentile_range=percentile_ranges[idx],
                grid_range=grid_ranges[idx], cust_grid_points=cust_grid_points[idx], memory_limit=memory_limit,
                n_jobs=n_jobs, predict_kwds=predict_kwds, data_transformer=data_transformer)
            pdp_isolate_outs.append(pdp_isolate_out)

        if n_classes > 2:
            feature_grids = [pdp_isolate_outs[i][0].feature_grids for i in range(nr_feats)]
            feature_types = [pdp_isolate_outs[i][0].feature_type for i in range(nr_feats)]
        else:
            feature_grids = [pdp_isolate_outs[i].feature_grids for i in range(nr_feats)]
            feature_types = [pdp_isolate_outs[i].feature_type for i in range(nr_feats)]

        grid_combos = _get_grid_combos(feature_grids, feature_types)

    feature_list = []
    for i in range(nr_feats):
        feature_list.extend(_make_list(features[i]))

    # Parallel calculate ICE lines
    true_n_jobs = _calc_memory_usage(
        df=_dataset, total_units=len(grid_combos), n_jobs=n_jobs, memory_limit=memory_limit)
    grid_results = Parallel(n_jobs=true_n_jobs)(delayed(_calc_ice_lines_inter)(
        grid_combo, data=_dataset, model=model, model_features=model_features, n_classes=n_classes,
        feature_list=feature_list, predict_kwds=predict_kwds, data_transformer=data_transformer)
        for grid_combo in grid_combos)

    ice_lines = pd.concat(grid_results, axis=0).reset_index(drop=True)
    pdp = ice_lines.groupby(feature_list, as_index=False).mean()

    # combine the final results
    pdp_interact_params = {'n_classes': n_classes,
                           'features': features,
                           'feature_types': feature_types,
                           'feature_grids': feature_grids}
    if n_classes > 2:
        pdp_interact_out = []
        for n_class in range(n_classes):
            _pdp = pdp[feature_list + ['class_%d_preds' % n_class]].rename(
                columns={'class_%d_preds' % n_class: 'preds'})
            pdp_interact_out.append(
                PDPInteract(which_class=n_class,
                            pdp_isolate_outs=[pdp_isolate_outs[i][n_class] for i in range(nr_feats)],
                            pdp=_pdp, **pdp_interact_params))
    else:
        pdp_interact_out = PDPInteract(
            which_class=None, pdp_isolate_outs=pdp_isolate_outs, pdp=pdp, **pdp_interact_params)
    return pdp_interact_out
def center(arr): return arr - np.mean(arr)

def compute_f_vals(mdl, X, features, selectedfeatures, num_grid_points=10, use_data_grid=False):
    f_vals = {}
    data_grid = None
    if use_data_grid:
        data_grid = X[selectedfeatures].values
    # Calculate partial dependencies for full feature set
    p_full = pdp_multi_interact(mdl, X, features, selectedfeatures,
                                num_grid_points=[num_grid_points] * len(selectedfeatures),
                                cust_grid_combos=data_grid,
                                use_custom_grid_combos=use_data_grid)
    f_vals[tuple(selectedfeatures)] = center(p_full.pdp.preds.values)
    grid = p_full.pdp.drop('preds', axis=1)
    # Calculate partial dependencies for [1..SFL-1]
    for n in range(1, len(selectedfeatures)):
        for subsetfeatures in itertools.combinations(selectedfeatures, n):
            if use_data_grid:
                data_grid = X[list(subsetfeatures)].values
            p_partial = pdp_multi_interact(mdl, X, features, subsetfeatures,
                                           num_grid_points=[num_grid_points] * len(selectedfeatures),
                                           cust_grid_combos=data_grid,
                                           use_custom_grid_combos=use_data_grid)
            p_joined = pd.merge(grid, p_partial.pdp, how='left')
            f_vals[tuple(subsetfeatures)] = center(p_joined.preds.values)
    return f_vals
# the second-order H-measure:
def compute_h_val(f_vals, selectedfeatures):
    denom_els = f_vals[tuple(selectedfeatures)].copy()
    numer_els = f_vals[tuple(selectedfeatures)].copy()
    sign = -1.0
    for n in range(len(selectedfeatures)-1, 0, -1):
        for subfeatures in itertools.combinations(selectedfeatures, n):
            numer_els += sign * f_vals[tuple(subfeatures)]
        sign *= -1.0
    numer = np.sum(numer_els**2)
    denom = np.sum(denom_els**2)
    return math.sqrt(numer/denom) if numer < denom else np.nan

# first-order H-measure as well:
def compute_h_val_any(f_vals, allfeatures, selectedfeature):
    otherfeatures = list(allfeatures)
    otherfeatures.remove(selectedfeature)
    denom_els = f_vals[tuple(allfeatures)].copy()
    numer_els = denom_els.copy()
    numer_els -= f_vals[(selectedfeature,)]
    numer_els -= f_vals[tuple(otherfeatures)]
    numer = np.sum(numer_els**2)
    denom = np.sum(denom_els**2)
    return math.sqrt(numer/denom) if numer < denom else np.nan

df = sns.load_dataset("diamonds")
data = pd.get_dummies(df, ["cut", "color", "clarity"])
X = data.drop("cut_Ideal", axis=1)
y = data["cut_Ideal"]
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X_train, y_train)

f_val = compute_f_vals(gbc, X, X.columns, ['carat', 'depth'], num_grid_points=10, use_data_grid=False)

# second-order H-measure:
compute_h_val(f_val, ['carat', 'depth'])
I want to calculate feature interactions for all the columns in a dataframe. How could I do that?
I am not looking for a free code-writing service; I just want to pick up a bit of knowledge from experienced programmers by discussing things. I was only expecting a suggestion or reference to an appropriate library or method for finding the feature interactions.
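Not a definitive answer, but a minimal sketch of one way to reuse the compute_f_vals / compute_h_val helpers above for every pair of columns (the helper name h_for_all_pairs is mine; this is brute force and gets expensive on wide frames):

import itertools
import pandas as pd

def h_for_all_pairs(model, X, num_grid_points=10):
    # loop over every pair of columns and score it with the helpers defined above
    results = {}
    for pair in itertools.combinations(X.columns, 2):
        f_vals = compute_f_vals(model, X, X.columns, list(pair),
                                num_grid_points=num_grid_points)
        results[pair] = compute_h_val(f_vals, list(pair))
    return pd.Series(results).sort_values(ascending=False)

# e.g. h_for_all_pairs(gbc, X) -- O(n_features^2) calls to compute_f_vals, so it is slow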
I use the nearest neighbors method to predict the price of a stock. I have raw data in an example.txt file and use the close column (the price at the end of each one-minute period). Linear regression predicts well (shown in green), but the nearest neighbors method works only at the beginning and then turns into a straight line. Please tell me how to fix this. Here is the code I wrote:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
class Reader:
    def __init__(self, filename='example.txt'):
        self.filename = filename

    def read(self):
        try:
            file = open(self.filename)
            return file.read()
        except IOError:
            return "File not found"

def main():
    x = Reader('example.txt')
    print(x.read())

class Regression:
    def __init__(self, window, P0, Ptest, i):
        self.window = window
        self.P0 = P0
        self.Ptest = Ptest
        self.i = i
        self.data_train = self.get_data_train()
        self.x_train = self.get_x_train()
        self.y_train = self.get_y_train()
        self.data_test = self.get_data_test()
        self.x_test = self.get_x_test()
        self.y_test = self.get_y_test()

    def get_data_train(self):
        """ Method of obtaining data train on prices for the entire period."""
        x = Reader('example.txt')
        data = x.read().splitlines()
        close_column = [x.split(',')[7] for x in data][1:]
        result = [float(item) for item in close_column]
        relative_price = result[:int(len(result)*P0)]
        return relative_price

    def get_data_test(self):
        """ Method of obtaining data test on prices for the entire period."""
        x = Reader('example.txt')
        data = x.read().splitlines()
        close_column = [x.split(',')[7] for x in data][1:]
        result = [float(item) for item in close_column]
        len_x_test = int(len(result) * Ptest)
        len_x_train = int(len(result) * P0)
        relative_price = result[(len_x_train + (len_x_test * self.i)): len_x_train + len_x_test
                                * (self.i + 1)]
        return relative_price

    def get_x_train(self):
        x = []
        for i in range(len(self.data_train)):
            if i + self.window < len(self.data_train):
                x.append(self.data_train[i: i + self.window])
        return x

    def get_y_train(self):
        y = []
        for i in self.data_train[self.window:]:
            y += [i]
        return y

    def get_x_test(self):
        x = []
        for i in range(len(self.data_test)):
            if i + self.window < len(self.data_test):
                x.append(self.data_test[i: i + self.window])
        return x

    def get_y_test(self):
        y = []
        for i in self.data_test[self.window:]:
            y += [i]
        return y

class Linear_regression(Regression):
    def callculate(self):
        reg_linear = LinearRegression().fit(self.x_train, self.y_train)
        y_pred = reg_linear.predict(self.x_test)
        return y_pred

class Nearest_neighbor(Regression):
    def callculate(self):
        reg_neighbor = KNeighborsRegressor(n_neighbors=window, weights='distance')
        reg_neighbor.fit(self.x_train, self.y_train)
        y_pred = reg_neighbor.predict(self.x_test)
        return y_pred

window = 10
Pk = 1
P0 = 0.1
Ptest = 0.01
k = (Pk - P0)/Ptest
i = 0
y_real = []
y_neigh = []
y_lin = []
while i < k:
    lin_price = list(Linear_regression(window, P0, Ptest, i).callculate())
    neighbor = list(Nearest_neighbor(window, P0, Ptest, i).callculate())
    y_neigh.extend(neighbor)
    y_lin.extend(lin_price)
    y_real.extend(list(Linear_regression(window, P0, Ptest, i).y_test))
    i += 1

""" Output to graphs of the received data """
fig, ax = plt.subplots()
ax.plot(y_real, label='Initial data')
ax.plot(y_neigh, label='Nearest Neighbor Data')
ax.plot(y_lin, label='Linear Regression Data')
ax.set_xlabel('Time (min)')
ax.set_ylabel('Price, ($)')
ax.legend()
plt.show()
"Linear regression predicts well"
No, it never predicted well. You just looked at the graph and thought it looked kind of similar. But if you look more closely, your 'model' simply takes the price from a moment ago as the prediction of the price now. That means it's not predicting anything! It's a history device, not a prediction device.
That's why, if you feed this sort of 'model' back into itself, you get a straight line: it always predicts that the next price will be equal to the last one.
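A small sketch of that point on synthetic data (assumption: a random-walk series generated here, not the asker's example.txt). Fitting the same kind of sliding-window linear regression on a random walk puts most of the weight on the most recent price, i.e. the good-looking forecast is essentially yesterday's value echoed forward:

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
price = np.cumsum(rng.normal(size=2000)) + 100   # synthetic random-walk "price"
window = 10

X = np.array([price[i:i + window] for i in range(len(price) - window)])
y = price[window:]

lin = LinearRegression().fit(X, y)
# On a random walk the fitted weights concentrate on the most recent price:
# the last coefficient comes out close to 1 and the others close to 0.
print(np.round(lin.coef_, 2))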
Cost function implemented with Python:
Thanks for the help in achieving this.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
load_data = pd.read_csv(r'C:\python_program\ex1data1.txt', sep=",", header=None)
feature_vale = load_data[0]
y = np.matrix(load_data[1])
m = len(feature_vale)
plt.scatter(load_data[0],load_data[1],marker='+',c = 'r')
plt.title("Cost_Function")
plt.xlabel("Population of City in 10,000s")
plt.ylabel("Profit in $10,000s")
df = pd.DataFrame(pd.Series(1,index= range(0,m)))
df[1] = load_data[0]
X = np.matrix(df)
row_theta = np.zeros(2,dtype = int)
theta = np.array([row_theta]) # Transpose the array
prediction = np.dot(X,theta.T)
error = (prediction-y.T)
error_df = pd.DataFrame(error)
#square the error
squared_error = np.square(error_df)
sum = np.sum(squared_error)
print(sum)
J = np.sum(squared_error) / (2 * m)
print(J)
Data reference link: searchcode.com/codesearch/view/5404318
Repeat the following steps and let me know:
load_data = pd.read_csv('data.txt',sep = ",",header = None)
feature_vale = load_data[0]
y = np.matrix(load_data[1])
m = len(feature_vale)
#print(m)
#plt.scatter(load_data[0],load_data[1])
df = pd.DataFrame(pd.Series(1,index= range(0,m)))
df[1] = load_data[0]
X = np.matrix(df)
row_theta = np.zeros(2,dtype = int)
theta = np.array([row_theta]) # Transpose the array
print(theta.T)
prediction = np.matmul(X,theta.T)
error = (prediction-y)
error_df = pd.DataFrame(error)
squared_error = np.square(error_df)
print(squared_error)
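For comparison, a compact vectorized sketch of the same cost J(theta) = (1/(2m)) * sum((X·theta - y)^2) using plain NumPy arrays instead of np.matrix, which keeps the shapes one-dimensional and avoids accidental broadcasting (the data file is the one from the question; adjust the path as needed):

import numpy as np
import pandas as pd

def compute_cost(X, y, theta):
    # J(theta) = 1/(2m) * sum of squared residuals
    m = len(y)
    residual = X @ theta - y          # shape (m,)
    return float(residual @ residual) / (2 * m)

data = pd.read_csv('ex1data1.txt', sep=',', header=None)
X = np.column_stack([np.ones(len(data)), data[0].to_numpy()])   # add the intercept column
y = data[1].to_numpy()
theta = np.zeros(2)
print(compute_cost(X, y, theta))      # should match the J printed above for theta = [0, 0]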
Hello, I need help creating a genetic algorithm that converges to a maximum or minimum value.
I have developed code to find the sentence with the maximum ASCII sum, but my code does not converge to the maximum; the value keeps bouncing up and down ("yo-yo"), as in this picture:
(matplotlib output)
Here is my code:
import random
import statistics

EVOLUTION = []

words = [
    ["Un", "Des", "Une", "On", "Elle"],
    ["a", "eu", "avait", "est", "était", "fut"],
    ["soif", "rouge"]
]

def individual(data):
    #return tuple(random.choice(range(len(feature))) for feature in data)
    return tuple(random.choice(range(len(feature))) for feature in data)

def population(data, initial=100):
    return [individual(data) for i in range(initial)]

def fitness(individual, data):
    chaine = sentence(individual, words)
    somme = 0
    for caractere in chaine:
        somme = somme + ord(caractere)
    print(chaine)
    print(somme)
    EVOLUTION.append(somme)
    return somme
    #return sum(data[i][individual[i]] for i in range(len(individual)))

def grade(population, data):
    fit = [fitness(ind, data) for ind in population]
    return statistics.mean(fit)

def mutate(ind, data):
    gene = random.randrange(0, len(ind))
    clone = list(ind)
    clone[gene] = random.randrange(0, len(data[gene]))
    #print(sentence(tuple(clone),words))
    return tuple(clone)

def cross(mother, father):
    return tuple(round(statistics.mean(genes)) for genes in zip(mother, father))

def sentence(individual, words):
    return ' '.join([words[i][individual[i]] for i in range(len(words))])

def evolve(population, data, retain=0.0, random_select=0.00, mutation_rate=0.00):
    def cmp_ind(ind):
        return fitness(ind, data)
    sorted_population = sorted(population, key=cmp_ind, reverse=True)
    len_retained = round(len(population) * retain)
    retained = sorted_population[:len_retained]
    random_selected = [
        ind
        for ind in sorted_population[len_retained:]
        if random.random() <= random_select
    ]
    mutated = [
        mutate(ind, data)
        for ind in sorted_population[len_retained:]
        if random.random() <= mutation_rate
    ]
    children = [
        cross(random.choice(sorted_population),
              random.choice(sorted_population))
        for i in range(len(population) - len(random_selected) - len(mutated))
    ]
    return random_selected + mutated + children

if __name__ == '__main__':
    data = [[len(w) for w in ws] for ws in words]
    initial_population = population(data, 30)
    next_population = initial_population
    max_iter = 3
    for i in range(max_iter):
        next_population = evolve(next_population, data)
    sorted_population = sorted(next_population, key=lambda x: fitness(x, data))
    best_individual = sorted_population[0]
    print("best solution :")
    chaine = sentence(best_individual, words)
    somme = 0
    for caractere in chaine:
        somme = somme + ord(caractere)
    print(chaine)
    print(somme)
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    plt.plot(EVOLUTION)
    plt.savefig('myfig')
I want to find a higher-scoring solution with my fitness function.
Thanks in advance for your help.
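One likely reason for the yo-yo: evolve() is called with its defaults retain=0.0, random_select=0.0, mutation_rate=0.0, so nothing is retained and every generation is rebuilt purely from crossovers of randomly chosen, unselected parents; in addition, EVOLUTION records every single fitness evaluation rather than the best score per generation, which makes the plot jump around. A hedged sketch of a selection-plus-elitism variant built from the functions above (the retain and mutation values are illustrative, not tuned):

def evolve_elitist(population, data, retain=0.2, mutation_rate=0.05):
    # rank the population, keep the best individuals (elitism),
    # and fill the rest of the generation with mutated crossovers
    ranked = sorted(population, key=lambda ind: fitness(ind, data), reverse=True)
    n_keep = max(2, round(len(population) * retain))
    parents = ranked[:n_keep]
    children = []
    while len(parents) + len(children) < len(population):
        child = cross(random.choice(parents), random.choice(parents))
        if random.random() < mutation_rate:
            child = mutate(child, data)
        children.append(child)
    return parents + children

# usage sketch: replace the evolve() call in the main loop with
#   next_population = evolve_elitist(next_population, data)
# and pick the winner with max(next_population, key=lambda ind: fitness(ind, data))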
I am trying to plot a decision tree using ID3 in Python. I am really new to Python and couldn't understand the implementation of the following code. I need to know how I can apply this code to my data.
from math import log
import operator

def entropy(data):
    entries = len(data)
    labels = {}
    for feat in data:
        label = feat[-1]
        if label not in labels.keys():
            labels[label] = 0
        labels[label] += 1
    entropy = 0.0
    for key in labels:
        probability = float(labels[key])/entries
        entropy -= probability * log(probability, 2)
    return entropy

def split(data, axis, val):
    newData = []
    for feat in data:
        if feat[axis] == val:
            reducedFeat = feat[:axis]
            reducedFeat.extend(feat[axis+1:])
            newData.append(reducedFeat)
    return newData

def choose(data):
    features = len(data[0]) - 1
    baseEntropy = entropy(data)
    bestInfoGain = 0.0
    bestFeat = -1
    for i in range(features):
        featList = [ex[i] for ex in data]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            newData = split(data, i, value)
            probability = len(newData)/float(len(data))
            newEntropy += probability * entropy(newData)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeat = i
    return bestFeat

def majority(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)  # items() for Python 3
    return sortedClassCount[0][0]

def tree(data, labels):
    classList = [ex[-1] for ex in data]
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    if len(data[0]) == 1:
        return majority(classList)
    bestFeat = choose(data)
    bestFeatLabel = labels[bestFeat]
    theTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    featValues = [ex[bestFeat] for ex in data]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        theTree[bestFeatLabel][value] = tree(split(data, bestFeat, value), subLabels)  # fixed stray '/' after split
    return theTree
So what I did after this is the following:
infile=open("SData.csv","r")
data=infile.read()
tree(data)
The error I got is "1 argument is missing", which refers to the labels argument I have to define, and this is where I don't know what to put. I tried the variable for which I want to build the decision tree, but it doesn't work:
tree(data,MinTemp)
Here I get an error "MinTemp is not defined".
Please help me out and let me know what I should do to have a look at the tree.
The following is part of the data; I want to generate a tree for MinTemp (a loading sketch follows the data below).
MinTemp,Rainfall,Tempat9,RHat9,CAat9,WSat9
high,no,mild,normal,overcast,weak
high,no,mild,normal,cloudy,weak
high,no,mild,normal,cloudy,mild
high,yes,mild,high,cloudy,weak
high,yes,mild,high,cloudy,mild
medium,yes,mild,high,cloudy,mild
high,no,mild,high,overcast,weak
high,no,mild,normal,sunny,weak
high,no,hot,normal,sunny,weak
high,no,hot,normal,overcast,weak
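Not the definitive fix, but a sketch of how tree(data, labels) above expects its inputs: data as a list of rows with the class value in the last position, and labels as the list of attribute names for the remaining columns. Assuming MinTemp (the first column) is the target, one could reorder the columns like this (the file name SData.csv is taken from the question; the reshuffling itself is my assumption):

import csv

with open("SData.csv") as f:
    rows = list(csv.reader(f))

header, records = rows[0], rows[1:]

# move the MinTemp column to the end so it becomes the class label
target = header.index("MinTemp")
data = [r[:target] + r[target + 1:] + [r[target]] for r in records]
labels = header[:target] + header[target + 1:]

decision_tree = tree(data, labels)    # tree() mutates labels, hence a fresh list each call
print(decision_tree)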