How to find feature Interactions between all columns in a dataframe, Python? - python

Friedman’s H-statistic: the Interpretable Machine Learning book by Christoph Molnar gives us a workable approach. It uses Friedman’s H-statistic, which is based on a decomposition of the partial dependence values, to calculate feature interactions.
In Python, sklearn_gbmi accepts feature sets of length two and higher but does not provide support for the first-order measure, very similar to interact.gbm in R. It only works on gradient-boosting-based models.
I found a manual Python implementation, posted below for reference, in which the feature interactions are calculated.
import itertools
import math

import numpy as np
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed
from pdpbox.pdp import pdp_isolate, PDPInteract
from pdpbox.pdp_calc_utils import _calc_ice_lines_inter
from pdpbox.utils import (_check_model, _check_dataset, _check_percentile_range, _check_feature,
                          _check_grid_type, _check_memory_limit, _make_list,
                          _calc_memory_usage, _get_grids, _get_grid_combos, _check_classes)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
def pdp_multi_interact(model, dataset, model_features, features,
                       num_grid_points=None, grid_types=None, percentile_ranges=None, grid_ranges=None,
                       cust_grid_points=None,
                       cust_grid_combos=None, use_custom_grid_combos=False,
                       memory_limit=0.5, n_jobs=1, predict_kwds=None, data_transformer=None):
    """Compute joint partial-dependence values over an arbitrary number of features.

    The grid is either supplied directly (``use_custom_grid_combos=True`` with
    ``cust_grid_combos``) or built as the cartesian product of the per-feature
    grids returned by ``pdp_isolate``.  ICE lines are evaluated for every grid
    combination and averaged per combination.

    Args:
        model: fitted model accepted by ``_check_model``.
        dataset: data frame the model is evaluated on (a copy is used).
        model_features: feature names the model was trained on.
        features: features (or one-hot feature lists) to interact.
        num_grid_points, grid_types, percentile_ranges, grid_ranges,
            cust_grid_points: optional per-feature lists forwarded to
            ``pdp_isolate``; ``None`` expands to a per-feature default.
        cust_grid_combos: explicit grid combinations, used only when
            ``use_custom_grid_combos`` is true.
        use_custom_grid_combos: skip the per-feature grid construction.
        memory_limit, n_jobs: forwarded to ``_calc_memory_usage`` / joblib.
        predict_kwds: extra keyword arguments for prediction (default ``{}``).
        data_transformer: forwarded unchanged to the pdpbox helpers.

    Returns:
        A ``PDPInteract`` object, or a list with one ``PDPInteract`` per class
        when the model has more than two classes.
    """
    def _expand_default(x, default, length):
        # Turn a missing per-feature setting into one default per feature.
        if x is None:
            return [default] * length
        return x

    def _get_grid_combos(feature_grids, feature_types):
        # Cartesian product of all per-feature grids; one-hot features are
        # replaced by the rows of an identity matrix.
        grids = [list(feature_grid) for feature_grid in feature_grids]
        for i in range(len(feature_types)):
            if feature_types[i] == 'onehot':
                grids[i] = np.eye(len(grids[i])).astype(int).tolist()
        return np.stack(np.meshgrid(*grids), -1).reshape(-1, len(grids))

    if predict_kwds is None:
        predict_kwds = dict()
    nr_feats = len(features)

    # check function inputs
    n_classes, predict = _check_model(model=model)
    _check_dataset(df=dataset)
    _dataset = dataset.copy()

    # prepare the grid
    pdp_isolate_outs = []
    if use_custom_grid_combos:
        grid_combos = cust_grid_combos
        feature_grids = []
        feature_types = []
    else:
        num_grid_points = _expand_default(x=num_grid_points, default=10, length=nr_feats)
        grid_types = _expand_default(x=grid_types, default='percentile', length=nr_feats)
        for i in range(nr_feats):
            _check_grid_type(grid_type=grid_types[i])
        percentile_ranges = _expand_default(x=percentile_ranges, default=None, length=nr_feats)
        for i in range(nr_feats):
            _check_percentile_range(percentile_range=percentile_ranges[i])
        grid_ranges = _expand_default(x=grid_ranges, default=None, length=nr_feats)
        cust_grid_points = _expand_default(x=cust_grid_points, default=None, length=nr_feats)
        _check_memory_limit(memory_limit=memory_limit)

        for idx in range(nr_feats):
            pdp_isolate_out = pdp_isolate(
                model=model, dataset=_dataset, model_features=model_features, feature=features[idx],
                num_grid_points=num_grid_points[idx], grid_type=grid_types[idx],
                percentile_range=percentile_ranges[idx],
                grid_range=grid_ranges[idx], cust_grid_points=cust_grid_points[idx],
                memory_limit=memory_limit,
                n_jobs=n_jobs, predict_kwds=predict_kwds, data_transformer=data_transformer)
            pdp_isolate_outs.append(pdp_isolate_out)

        if n_classes > 2:
            feature_grids = [pdp_isolate_outs[i][0].feature_grids for i in range(nr_feats)]
            feature_types = [pdp_isolate_outs[i][0].feature_type for i in range(nr_feats)]
        else:
            feature_grids = [pdp_isolate_outs[i].feature_grids for i in range(nr_feats)]
            feature_types = [pdp_isolate_outs[i].feature_type for i in range(nr_feats)]
        grid_combos = _get_grid_combos(feature_grids, feature_types)

    feature_list = []
    for i in range(nr_feats):
        feature_list.extend(_make_list(features[i]))

    # Parallel calculate ICE lines
    true_n_jobs = _calc_memory_usage(
        df=_dataset, total_units=len(grid_combos), n_jobs=n_jobs, memory_limit=memory_limit)
    grid_results = Parallel(n_jobs=true_n_jobs)(delayed(_calc_ice_lines_inter)(
        grid_combo, data=_dataset, model=model, model_features=model_features, n_classes=n_classes,
        feature_list=feature_list, predict_kwds=predict_kwds, data_transformer=data_transformer)
        for grid_combo in grid_combos)
    ice_lines = pd.concat(grid_results, axis=0).reset_index(drop=True)
    pdp = ice_lines.groupby(feature_list, as_index=False).mean()

    # combine the final results
    pdp_interact_params = {'n_classes': n_classes,
                           'features': features,
                           'feature_types': feature_types,
                           'feature_grids': feature_grids}
    if n_classes > 2:
        pdp_interact_out = []
        for n_class in range(n_classes):
            _pdp = pdp[feature_list + ['class_%d_preds' % n_class]].rename(
                columns={'class_%d_preds' % n_class: 'preds'})
            pdp_interact_out.append(
                PDPInteract(which_class=n_class,
                            # Iterate the collected outputs rather than range(nr_feats):
                            # in custom-grid mode the list is empty and indexing it
                            # per feature would raise IndexError.
                            pdp_isolate_outs=[outs[n_class] for outs in pdp_isolate_outs],
                            pdp=_pdp, **pdp_interact_params))
    else:
        pdp_interact_out = PDPInteract(
            which_class=None, pdp_isolate_outs=pdp_isolate_outs, pdp=pdp, **pdp_interact_params)
    return pdp_interact_out
def center(arr):
    """Return ``arr`` shifted so that its mean becomes zero."""
    mean_value = np.mean(arr)
    return arr - mean_value
def compute_f_vals(mdl, X, features, selectedfeatures, num_grid_points=10, use_data_grid=False):
    """Compute centered partial-dependence values for a feature set and all its subsets.

    Args:
        mdl: fitted model passed through to ``pdp_multi_interact``.
        X: data frame the partial dependence is evaluated on.
        features: all model feature names.
        selectedfeatures: features whose interaction is being measured.
        num_grid_points: grid points per feature when a generated grid is used.
        use_data_grid: evaluate on the observed rows of ``X`` instead of a
            generated grid.

    Returns:
        dict mapping each feature tuple (the full set and every proper,
        non-empty subset) to its centered partial-dependence values, aligned
        with the rows of the full-set grid.
    """
    f_vals = {}
    data_grid = None
    if use_data_grid:
        data_grid = X[selectedfeatures].values

    # Calculate partial dependencies for full feature set
    p_full = pdp_multi_interact(mdl, X, features, selectedfeatures,
                                num_grid_points=[num_grid_points] * len(selectedfeatures),
                                cust_grid_combos=data_grid,
                                use_custom_grid_combos=use_data_grid)
    f_vals[tuple(selectedfeatures)] = center(p_full.pdp.preds.values)
    grid = p_full.pdp.drop('preds', axis=1)

    # Calculate partial dependencies for [1..SFL-1]
    for n in range(1, len(selectedfeatures)):
        for subsetfeatures in itertools.combinations(selectedfeatures, n):
            if use_data_grid:
                data_grid = X[list(subsetfeatures)].values
            p_partial = pdp_multi_interact(mdl, X, features, subsetfeatures,
                                           # one entry per feature of *this* subset
                                           num_grid_points=[num_grid_points] * len(subsetfeatures),
                                           cust_grid_combos=data_grid,
                                           use_custom_grid_combos=use_data_grid)
            # Left-join onto the full grid so every subset vector has one
            # value per row of the full-set grid.
            p_joined = pd.merge(grid, p_partial.pdp, how='left')
            f_vals[tuple(subsetfeatures)] = center(p_joined.preds.values)
    return f_vals
# the second-order H-measure:
def compute_h_val(f_vals, selectedfeatures):
    """Return the H-measure for ``selectedfeatures`` from precomputed ``f_vals``.

    The numerator is the full-set vector with the subset vectors added or
    subtracted with alternating sign by subset size; the denominator is the
    full-set vector.  ``nan`` is returned unless the numerator's sum of squares
    is strictly smaller than the denominator's.
    """
    full_key = tuple(selectedfeatures)
    denominator_terms = f_vals[full_key].copy()
    numerator_terms = f_vals[full_key].copy()
    order = len(selectedfeatures)
    for subset_size in range(order - 1, 0, -1):
        # Largest subsets are subtracted, the next size is added, and so on.
        sign = -1.0 if (order - subset_size) % 2 == 1 else 1.0
        for subset in itertools.combinations(selectedfeatures, subset_size):
            numerator_terms += sign * f_vals[tuple(subset)]
    numer = np.sum(numerator_terms ** 2)
    denom = np.sum(denominator_terms ** 2)
    if numer < denom:
        return math.sqrt(numer / denom)
    return np.nan
# first-order H-measure as well:
def compute_h_val_any(f_vals, allfeatures, selectedfeature):
    """Return the H-measure of ``selectedfeature`` against all other features.

    ``f_vals`` must hold entries for the full feature tuple, for
    ``(selectedfeature,)`` and for the tuple of the remaining features.
    ``nan`` is returned unless the numerator's sum of squares is strictly
    smaller than the denominator's.
    """
    remaining = list(allfeatures)
    remaining.remove(selectedfeature)
    joint = f_vals[tuple(allfeatures)].copy()
    residual = joint.copy()
    # Remove the selected feature's own effect and the effect of everything else.
    for key in ((selectedfeature,), tuple(remaining)):
        residual -= f_vals[key]
    numer = np.sum(residual ** 2)
    denom = np.sum(joint ** 2)
    if numer < denom:
        return math.sqrt(numer / denom)
    return np.nan
# Example: second-order H-measure between 'carat' and 'depth' on the diamonds data.
df = sns.load_dataset("diamonds")
# Name the columns to encode explicitly; a positional list binds to `prefix`,
# not `columns`.
data = pd.get_dummies(df, columns=["cut", "color", "clarity"])
# Predict whether a diamond has the "Ideal" cut from every other column.
X = data.drop("cut_Ideal", axis=1)
y = data["cut_Ideal"]
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X_train, y_train)
f_val = compute_f_vals(gbc, X, X.columns, ['carat', 'depth'], num_grid_points=10, use_data_grid=False)
# second-order H-measure:
compute_h_val(f_val, ['carat', 'depth'])
I want to calculate feature interactions for all the columns in a dataframe. How could I do that?
I am not asking for a free code-writing service; I just want to learn from experienced programmers by discussing the problem. I am looking for a suggestion or a reference to an appropriate library or method for finding feature interactions.

Related

Several errors in the selection of important features with genetic algorithms:

code:
import numpy as np
import pandas as pd
import math
import target as target
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Load the survey data, drop unused columns and one-hot encode the categoricals.
dataset = pd.read_csv('Tehran_hies98.csv')
to_drop = ['Address', 'weight', 'WH', 'inc3', 'Income', 'exp1', 'exp4', 'exp5', 'exp6', 'exp7',
           'exp8', 'exp11',
           'exp12', 'exp13', 'Income_Mis', 'exp2', 'exp3', 'exp9', 'inc2', 'inc1', 'exp14']
dataset.drop(to_drop, inplace=True, axis=1)
dataset = pd.get_dummies(dataset,
                         columns=['HSize', 'SSex', 'SAge', 'SMadrak', 'SActivity', 'SMarital',
                                  'Tasarrof', 'Otagh',
                                  'ZirBana'], drop_first=True)
# Bind the target name *before* building the feature list: in a tuple
# assignment the right-hand side is evaluated first, so `target` would still
# refer to whatever it was bound to earlier.
target = 'DV'
if target not in dataset.columns:
    # Fail early with the available names instead of a bare KeyError later on.
    raise KeyError(
        f"target column {target!r} not found in dataset; "
        f"available columns: {list(dataset.columns)}")
# Compare by equality: `not in` on a string would be a substring test.
feature_list = [i for i in dataset.columns if i != target]
def init_population(n, c):
    """Create a random binary population and an empty memory.

    Args:
        n: number of chromosomes (rows).
        c: number of genes per chromosome (columns).

    Returns:
        tuple ``(population, memory)``: ``population`` is an ``n x c`` integer
        array of 0/1 values, ``memory`` is a ``2 x c`` array filled with -1.
    """
    # rand() - 0.5 lies in [-0.5, 0.5); ceil maps it to 0 or 1.
    population = np.ceil(np.random.rand(n, c) - 0.5).astype(int)
    memory = np.zeros((2, c)) - 1
    # Return both arrays as one tuple so callers can unpack two values.
    return population, memory
def single_poin_crossover(population):
    """Swap the tails of consecutive row pairs at one random cut point.

    The cut point is drawn once and shared by all pairs.  ``population`` is
    modified in place and also returned.  With an odd number of rows the last
    row has no partner and is left untouched; a population with fewer than two
    columns has no valid cut point and is returned unchanged.
    """
    r, c = population.shape[0], population.shape[1]
    if c < 2:
        return population
    n = np.random.randint(1, c)
    # Stop at r - 1 so that row i + 1 always exists.
    for i in range(0, r - 1, 2):
        first_tail = population[i, n:].copy()
        population[i, n:] = population[i + 1, n:]
        population[i + 1, n:] = first_tail
    return population
def flip_mutation(population):
    """Return a new array with every entry replaced by ``max - entry``."""
    largest = population.max()
    return largest - population
def random_selection(population):
    """Return a same-sized population whose rows are drawn with replacement.

    The input array is not modified.
    """
    row_count = population.shape[0]
    # One draw per output row, in row order.
    picks = [np.random.randint(0, row_count) for _ in range(row_count)]
    selected = population.copy()
    for dest, src in enumerate(picks):
        selected[dest] = population[src]
    return selected
def get_fitness(data, feature_list, target, population):
    """Score every chromosome by the accuracy of a model on its selected features.

    Args:
        data: data frame holding the feature and target columns.
        feature_list: feature names, one per chromosome gene.
        target: name of the target column.
        population: 2-D array; a gene equal to 1 selects the matching feature.

    Returns:
        list of fitness values, one per chromosome.  A chromosome that selects
        no feature cannot be fitted and scores 0.0.
    """
    fitness = []
    for chromosome in population:
        columns = [feature_list[j] for j, gene in enumerate(chromosome) if gene == 1]
        if not columns:
            # No selected feature: give the worst score instead of fitting a
            # model on zero columns.
            fitness.append(0.0)
            continue
        fitness.append(predictive_model(data[columns], data[target]))
    return fitness
def predictive_model(X, y):
    """Return the hold-out accuracy of a logistic regression trained on ``X``/``y``.

    20% of the rows are held out with a fixed random state.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=7)
    X_train, X_test, y_train, y_test = split
    classifier = LogisticRegression(solver='liblinear', max_iter=100, random_state=7)
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)
    return accuracy_score(y_test, predicted)
def genetic_algorithm(data, feature_list, target, n, max_iter):
    """Search for the feature subset with the best model accuracy.

    Args:
        data: data frame with feature and target columns.
        feature_list: candidate feature names.
        target: name of the target column.
        n: population size.
        max_iter: number of generations.

    Returns:
        tuple ``(optimal_solution, optimal_value)``: the best 0/1 chromosome
        found and its fitness.
    """
    c = len(feature_list)
    # Only the population is evolved; the memory array returned alongside it
    # is a placeholder and must not be evaluated as a population.
    population = init_population(n, c)[0]
    fitness = get_fitness(data, feature_list, target, population)
    # argmax works on the plain list returned by get_fitness; comparing the
    # list to a scalar with `==` is not an element-wise test.
    best = int(np.argmax(fitness))
    optimal_value = fitness[best]
    # Copy, because later generations modify population rows in place.
    optimal_solution = population[best].copy()
    for _ in range(max_iter):
        population = random_selection(population)
        population = single_poin_crossover(population)
        if np.random.rand() < 0.3:
            population = flip_mutation(population)
        fitness = get_fitness(data, feature_list, target, population)
        best = int(np.argmax(fitness))
        if fitness[best] > optimal_value:
            optimal_value = fitness[best]
            optimal_solution = population[best].copy()
    return optimal_solution, optimal_value
# Run the search, then translate the winning 0/1 mask into feature names.
feature_set, acc_score = genetic_algorithm(dataset, feature_list, target, 10, 1000)
feature_set = [name for position, name in enumerate(feature_list) if feature_set[position] == 1]
print('Optimal Feature Set\n', feature_set, '\nOptimal Accuracy = ', round(acc_score * 100), '%')
First error:
target, feature_list = 'DV', [i for i in dataset.columns if i not in target]
TypeError: argument of type 'module' is not iterable
for code:
target, feature_list = 'DV', [i for i in dataset.columns if i not in target]
Second error:
If I delete this part of the above code:
if i not in target
This error appears:
raise KeyError(key) from err
KeyError: 'DV'
I wrote this code, and these are the problems I have run into so far.
If you see any other bugs, please help me fix them.
Please help me if possible, or post the corrected code.
Thanks.

Python KNN Regression

I am trying to predict a car's MPG using a KNN algorithm. I first cleaned my data, made test and training datasets, and then wrote a normalized and a non-normalized KNN function. Now I am trying to pass my testing data through the KNN algorithm and create a list of all the predictions. I then want to use mean squared error to analyze my predictions. So far, I have not been able to set up the function that passes my testing data through. Any guidance would be greatly appreciated!
import pandas as pd
import numpy as np
import math
from google.colab import drive
# Mount Google Drive so the CSV below can be read from it.
drive.mount('/content/drive')
pd.set_option('display.max_columns', 100)
vehicles = pd.read_csv('/content/drive/MyDrive/CS_167/vehicles (2).csv')
# Keep only rows whose fuelType is 'Regular', and only the target (comb08)
# plus the three predictor columns.
subset_cars = vehicles[vehicles["fuelType"] == 'Regular']
final_sub = subset_cars[["comb08", "year", "cylinders", "displ"]]
# Per-column flag: does the column contain any missing value?
column_nulls = final_sub.isna().any()
Cylinder_no_null = final_sub.cylinders.dropna()
displ_no_null = final_sub.displ.dropna()
# Rows with a missing value in any of the four columns are dropped.
pure_data = final_sub.dropna()
# pure_data.head()
# Shuffle with a fixed seed, then use the first 500 rows for testing and the
# remainder for training.
shuffled_data = pure_data.sample(frac=1, random_state=41)
test_data = shuffled_data.iloc[0:500]
train_data = shuffled_data.iloc[500:]
# Independent copies, so columns added later do not touch the slices above.
train_data_euc = train_data.copy()
test_data_euc = test_data.copy()
def Regression_KNN(MPG,train_data_euc,k):
    """Predict ``comb08`` for one query as the mean over its ``k`` nearest rows.

    ``MPG`` supplies the query's 'year', 'cylinders' and 'displ' values.  The
    Euclidean distance of every training row to the query is stored in a new
    'euc_dis' column of ``train_data_euc`` (the frame is modified in place).
    """
    squared_gap = (
        (MPG['year'] - train_data_euc['year']) ** 2
        + (MPG['cylinders'] - train_data_euc['cylinders']) ** 2
        + (MPG['displ'] - train_data_euc['displ']) ** 2
    )
    train_data_euc['euc_dis'] = np.sqrt(squared_gap)
    nearest = train_data_euc.sort_values(['euc_dis']).iloc[0:k]
    return nearest['comb08'].mean()
# Query car for a single prediction with the unnormalised KNN.
MPG ={}
MPG['year'] = 2020
MPG['cylinders'] = 4
MPG['displ'] = 5.2
print(f"The average MPG for this car is: %d" %Regression_KNN(MPG, train_data_euc, 5))
# Copy taken after the call above, so it already carries the 'euc_dis' column
# that Regression_KNN added to train_data_euc.
z_train_copy = train_data_euc.copy()
# Record each column's mean and standard deviation before the columns are
# overwritten with their z-scores below.
z_train_year_std = z_train_copy['year'].std()
z_train_year_mean = z_train_copy['year'].mean()
z_train_cylinders_std = z_train_copy['cylinders'].std()
z_train_cylinders_mean = z_train_copy['cylinders'].mean()
z_train_displ_std = z_train_copy['displ'].std()
z_train_displ_mean = z_train_copy['displ'].mean()
z_train_euc_std = z_train_copy['euc_dis'].std()
z_train_euc_mean = z_train_copy['euc_dis'].mean()
# Replace each column by (value - mean) / std.
z_train_copy['year'] = (z_train_copy['year'] - z_train_year_mean)/z_train_year_std
z_train_copy['cylinders'] = (z_train_copy['cylinders'] - z_train_cylinders_mean)/z_train_cylinders_std
z_train_copy['displ'] = (z_train_copy['displ'] - z_train_displ_mean)/z_train_displ_std
# NOTE(review): Z_TRAIN_KNN assigns 'euc_dis' afresh on every call, so this
# normalised distance column appears to have no effect — confirm before relying on it.
z_train_copy['euc_dis'] = (z_train_copy['euc_dis'] - z_train_euc_mean)/z_train_euc_std
def Z_TRAIN_KNN(MPG, z_train_copy, k):
    """Predict ``comb08`` for one query from the ``k`` nearest rows of ``z_train_copy``.

    Distances are Euclidean over 'year', 'cylinders' and 'displ' and are
    written to the frame's 'euc_dis' column (the frame is modified in place).
    """
    squared_total = 0
    for column in ('year', 'cylinders', 'displ'):
        squared_total = squared_total + (MPG[column] - z_train_copy[column]) ** 2
    z_train_copy['euc_dis'] = np.sqrt(squared_total)
    ranked = z_train_copy.sort_values(['euc_dis'])
    closest_targets = ranked.iloc[0:k]['comb08']
    return closest_targets.mean()
# z_train_copy holds z-scored columns, so the query car has to be put on the
# same scale (using the training means and standard deviations) before the
# distances are computed; a raw query would be dominated by 'year'.
raw_query = {'year': 2020, 'cylinders': 4, 'displ': 5.2}
MPG ={}
MPG['year'] = (raw_query['year'] - z_train_year_mean) / z_train_year_std
MPG['cylinders'] = (raw_query['cylinders'] - z_train_cylinders_mean) / z_train_cylinders_std
MPG['displ'] = (raw_query['displ'] - z_train_displ_mean) / z_train_displ_std
print(f"The average MPG for this car is: %d" %Z_TRAIN_KNN(MPG, z_train_copy, 5))
def regression_all_kNN(test_data_euc,z_train_data,k):
    """Predict ``comb08`` for every test row with k-nearest-neighbour regression.

    For each row of ``test_data_euc`` the Euclidean distance over 'year',
    'cylinders' and 'displ' to every row of ``z_train_data`` is computed, and
    the prediction is the mean 'comb08' of the ``k`` closest training rows.

    Args:
        test_data_euc: data frame with the three feature columns.
        z_train_data: training data frame with the feature columns and 'comb08'.
            It is not modified.
        k: number of neighbours to average.

    Returns:
        pandas Series of predictions, indexed like ``test_data_euc``.
    """
    feature_cols = ['year', 'cylinders', 'displ']
    train_features = z_train_data[feature_cols].to_numpy(dtype=float)
    train_targets = z_train_data['comb08'].to_numpy(dtype=float)
    test_features = test_data_euc[feature_cols].to_numpy(dtype=float)

    predictions = []
    for query in test_features:
        distances = np.sqrt(((train_features - query) ** 2).sum(axis=1))
        # Stable sort keeps the original row order among equally distant rows.
        nearest = np.argsort(distances, kind='stable')[:k]
        predictions.append(train_targets[nearest].mean())
    return pd.Series(predictions, index=test_data_euc.index, dtype=float)
# Predict every row of test_data from train_data using k = 5.
# NOTE(review): this passes the unnormalised train_data, not z_train_copy — confirm which is intended.
predictions5NN = regression_all_kNN(test_data, train_data, 5)

Why is the node gain output from xgboost different from that calculated manually?

We can get xgboost tree structure from trees_to_dataframe():
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_boston
# Fit a single depth-2 XGBoost regression tree and dump its structure.
data = load_boston()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
model = xgb.XGBRegressor(random_state=1,
n_estimators=1, # only one tree
max_depth=2,
learning_rate=0.1
)
model.fit(X, y)
# One row per tree node; the 'Feature' and 'Split' columns are read further below.
tree_frame = model._Booster.trees_to_dataframe()
tree_frame
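According to the SO thread "How is xgboost quality calculated?", the gain of a split should be calculated from the gradient sums $G_L, G_R$ and Hessian sums $H_L, H_R$ of the left and right children as:
$$\text{gain} = \frac{G_L^2}{H_L+\lambda} + \frac{G_R^2}{H_R+\lambda} - \frac{(G_L+G_R)^2}{H_L+H_R+\lambda}$$
However, the gain reported in tree_frame is different from what this code computes: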
def mse_obj(preds, labels):
    """Return the residuals ``labels - preds`` and an all-ones array shaped like ``labels``."""
    residual = labels - preds
    unit_hessian = np.ones_like(labels)
    return residual, unit_hessian
# Per-sample gradient and Hessian, taking the mean of y as the prediction
# before any tree is built (the accompanying text later replaces this with
# the model's base_score).
Gain,Hessian = mse_obj(y.mean(),y)
# --- Root split (row 0 of tree_frame): left = value < split, right = value >= split.
L = X[tree_frame['Feature'][0]] < tree_frame['Split'][0]
R = X[tree_frame['Feature'][0]] >= tree_frame['Split'][0]
GL = Gain[L].sum()
GR = Gain[R].sum()
HL = Hessian[L].sum()
HR = Hessian[R].sum()
reg_lambda = 1.0
# Gain = left score + right score - score of the unsplit node.
gain = (GL**2/(HL+reg_lambda)+GR**2/(HR+reg_lambda)-(GL+GR)**2/(HL+HR+reg_lambda))
gain # 18817.811191871013
# --- Split stored in row 1, restricted to samples on the left of the root split.
L = (X[tree_frame['Feature'][0]] < tree_frame['Split'][0])&((X[tree_frame['Feature'][1]] < tree_frame['Split'][1]))
R = (X[tree_frame['Feature'][0]] < tree_frame['Split'][0])&((X[tree_frame['Feature'][1]] >= tree_frame['Split'][1]))
GL = Gain[L].sum()
GR = Gain[R].sum()
HL = Hessian[L].sum()
HR = Hessian[R].sum()
reg_lambda = 1.0
gain = (GL**2/(HL+reg_lambda)+GR**2/(HR+reg_lambda)-(GL+GR)**2/(HL+HR+reg_lambda))
gain # 7841.627971119211
# --- Split stored in row 2, restricted to samples on the right of the root split.
# NOTE(review): the root-side test here is `>` while the root split above uses
# `>=`; samples exactly equal to the root split value are excluded — confirm intent.
L = (X[tree_frame['Feature'][0]] > tree_frame['Split'][0])&((X[tree_frame['Feature'][2]] < tree_frame['Split'][2]))
R = (X[tree_frame['Feature'][0]] > tree_frame['Split'][0])&((X[tree_frame['Feature'][2]] >= tree_frame['Split'][2]))
GL = Gain[L].sum()
GR = Gain[R].sum()
HL = Hessian[L].sum()
HR = Hessian[R].sum()
reg_lambda = 1.0
gain = (GL**2/(HL+reg_lambda)+GR**2/(HR+reg_lambda)-(GL+GR)**2/(HL+HR+reg_lambda))
gain # 2634.409414953051
Did I miss something?
Eventually I found out where I was wrong. The default prediction value defined by base_score is 0.5, and base_score should be used as the model's prediction before any tree is built when calculating the gradient for each sample.
# Recompute the gradients with the model's base_score parameter as the initial prediction.
Gain,Hessian = mse_obj(model.get_params()['base_score'], y)
After this, everything seems ok.

cosine similarity and euclidean distance between features of same sample and features of different sample are almost same in tf.keras

I have extracted features from the face and the left and right iris using MobileNetV2 with transfer learning in Keras/TensorFlow. The features are then concatenated. I have tried to compute the cosine similarity, Euclidean distance and squared Euclidean distance between the concatenated features. The distance scores I get for all the metrics do not show much difference between feature pairs belonging to the same sample and feature pairs belonging to different samples. I would like pairs of features from the same sample to have smaller distances than pairs from different samples. I think I am doing something fundamentally wrong somewhere. I want to be able to plot probability distributions of the scores such that the distributions for same-sample and different-sample pairs do not overlap.
My code is attached. Any help would be much appreciated. Thank you.
import pandas as pd
import datetime
from scipy.spatial import distance
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import backend as K
import h5py
import argparse
import random
import math
import tensorflow as tf
# Default HDF5 paths; used below as the defaults of the --db-face, --db-liris
# and --db-riris command-line options.
modelPath_face = 'HACK-FACE-SMALL-5-02.hdf5'
modelPath_liris = 'HACK-LIRIS-SMALL-5-02.hdf5'
modelPath_riris = 'HACK-RIRIS-SMALL-5-02.hdf5'
def cosine_distance(vests):
    """Return the negated mean of the element-wise product of two L2-normalised tensors.

    ``vests`` is a pair ``(x, y)``; normalisation and the mean are taken over
    the last axis, which is kept with size 1.
    """
    left, right = vests
    left_unit = K.l2_normalize(left, axis=-1)
    right_unit = K.l2_normalize(right, axis=-1)
    similarity = K.mean(left_unit * right_unit, axis=-1, keepdims=True)
    return -similarity
def compute_cosine_distances(a, b):
    """Return ``1 - cosine similarity`` for every row of ``a`` against every row of ``b``.

    Per the original notes: ``a`` has shape n_a * dim, ``b`` has shape
    n_b * dim, and the result has shape n_a * n_b.
    """
    unit_a = tf.nn.l2_normalize(a, 1)
    unit_b = tf.nn.l2_normalize(b, 1)
    similarity = tf.matmul(unit_a, unit_b, transpose_b=True)
    return 1 - similarity
def cos_dist_output_shape(shapes):
    """Return ``(first_dim_of_first_shape, 1)`` for a pair of input shapes."""
    first_shape, _second_shape = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)
def eudis(v1, v2):
    """Return the Euclidean distance between ``v1`` and ``v2``.

    Elements are paired with ``zip``, so extra elements of the longer input
    are ignored.
    """
    squared_sum = sum((a - b) ** 2 for a, b in zip(v1, v2))
    return math.sqrt(squared_sum)
def Extract(lst):
    """Return a list holding the first element of every item in ``lst``."""
    heads = []
    for entry in lst:
        heads.append(entry[0])
    return heads
# Script entry point: load the face / left-iris / right-iris feature files,
# concatenate them per sample, score sample pairs with squared Euclidean
# distance, and write same-label and different-label scores to two CSV files.
# NOTE(review): indentation was lost in this listing, so the exact nesting of
# the pair loop below cannot be confirmed from the text alone.
if __name__ == '__main__':
import cv2
# --- Command-line options.
ap = argparse.ArgumentParser()
ap.add_argument("-df", "--db-face", default=modelPath_face, help="path to face HDF5 file")
ap.add_argument("-dli", "--db-liris", default=modelPath_liris, help="path to left iris HDF5 file")
ap.add_argument("-dri", "--db-riris", default=modelPath_riris, help="path to right iris HDF5 file")
ap.add_argument("-b", "--batch-size", type=int, default=8,
help="batch size of images to be passed through network")
ap.add_argument("-s", "--buffer-size", type=int, default=1000, help="size of feature extraction buffer")
# NOTE(review): the help text says the default is 10000 but the default is 53;
# the value is overwritten further below from the labels anyway.
ap.add_argument('--num-classes', default=53, type=int, metavar='NC',
help='number of classes (default: 10000)')
args = vars(ap.parse_args())
le = LabelEncoder()
# --- Read features and labels from the three HDF5 files.
db_face = h5py.File(args["db_face"], "r")
db_liris = h5py.File(args["db_liris"], "r")
db_riris = h5py.File(args["db_riris"], "r")
face_features = db_face["face_features"][:]
liris_features = db_liris["liris_features"][:]
riris_features = db_riris["riris_features"][:]
face_labels = db_face["labels"][:]
liris_labels = db_liris["labels"][:]
riris_labels = db_riris["labels"][:]
# The number of classes is taken from the distinct face labels.
n_classes_to_find = le.fit_transform(face_labels)
n_classes = len(le.classes_)
args['num_classes'] = n_classes
face_labels = tf.keras.utils.to_categorical(face_labels, num_classes=args['num_classes'])
liris_labels = tf.keras.utils.to_categorical(liris_labels, num_classes=args['num_classes'])
riris_labels = tf.keras.utils.to_categorical(riris_labels, num_classes=args['num_classes'])
# --- Concatenate the three feature blocks (and label blocks) column-wise.
all_features = np.concatenate([face_features, liris_features, riris_features], axis=1)
all_labels = np.concatenate([face_labels, liris_labels, riris_labels], axis=1)
# argmax over the concatenated one-hot blocks returns the first maximal
# position, i.e. an index inside the face-label block.
all_labels = np.argmax(all_labels, axis=-1)
# Append the label as the last column and sort all rows by it.
stacked_all_ft = np.column_stack((all_features, all_labels))
columnIndex = stacked_all_ft.shape[1] - 1
sorted_stacked_all_ft = stacked_all_ft[stacked_all_ft[:, columnIndex].argsort()]
mylabels = sorted_stacked_all_ft[:, -1]
myfeatures = sorted_stacked_all_ft[:, :-1]
unique_labels = np.unique(mylabels)
myfeatures_shape = myfeatures.shape
# pca = PCA(0.9)
#
# pca.fit(myfeatures)
#
# PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
# svd_solver='auto', tol=0.0, whiten=False)
#
# print(pca.n_components_)
#
# myfeatures_pca = pca.transform(myfeatures)
# --- Score accumulators and counters.
df_pos_pair = pd.DataFrame()
df_neg_pair = pd.DataFrame()
positive_counter = 0
general_counter = 0
start_time = datetime.datetime.now().replace(microsecond=0)
print("COMPARISON OF ALL FILES: Started")
# PCA is disabled above, so the raw concatenated features are used.
myfeatures_pca = myfeatures
# --- Visit every unordered pair (i, j) with j > i.
for i in range(myfeatures_pca.shape[0]):
for j in range(myfeatures_pca.shape[0]):
if j > i:
general_counter += 1
pair_pos = mylabels[j] == mylabels[i]
if pair_pos:
# Same label: record the squared Euclidean distance as a positive score.
positive_counter += 1
try:
f1 = myfeatures_pca[i]
f2 = myfeatures_pca[j]
# score = tf.keras.losses.cosine_similarity(f1, f2, axis=-1)
# score = score.numpy()
# aa = f1.reshape(1, -1)
# ba = f2.reshape(1, -1)
# score = cosine_similarity(aa, ba)
# score = score[0][0]
# score = eudis(f1, f2)
score = distance.sqeuclidean(f1, f2)
# if i in data1.index and j in data1.columns:
df_pos_pair = df_pos_pair.append({'score': score}, ignore_index=True)
general_counter += 1
if general_counter % 25 == 0:
end_time = datetime.datetime.now().replace(microsecond=0)
print("COMPLETED OF ALL FILES: {}, TOTAL TIME: {}".format(general_counter,
(end_time - start_time)))
except Exception as e:
print(e)
else:
# Different label: draw up to positive_counter random later rows and record a
# negative score for each one whose label differs from row i.
for k2 in range(positive_counter):
r = random.randint(j + 1, myfeatures_pca.shape[0] - 2)
if mylabels[r] != mylabels[i]:
try:
f3 = myfeatures_pca[i]
f4 = myfeatures_pca[r]
# scoren = tf.keras.losses.cosine_similarity(f3, f4, axis=-1)
# scoren = score.numpy()
scoren = distance.sqeuclidean(f3, f4)
# aa2 = f3.reshape(1, -1)
# ba2 = f4.reshape(1, -1)
# scoren = cosine_similarity(aa2, ba2)
# scoren = scoren[0][0]
# scoren = eudis(f3, f4)
df_neg_pair = df_neg_pair.append({'score': scoren}, ignore_index=True)
general_counter += 1
end_time = datetime.datetime.now().replace(microsecond=0)
print("COMPLETED OF ALL FILES: {}, TOTAL TIME: {}".format(general_counter,
(end_time - start_time)))
except Exception as e:
print(e)
positive_counter = 0
break
# --- Report the elapsed time and write both score tables.
end_time = datetime.datetime.now().replace(microsecond=0)
print("COMPARISON OF ALL FILES: Ended in {} ".format((end_time - start_time)))
df_neg_pair.to_csv("sqed_face_iris_neg_scores_pca_feb_01.csv", index=False)
df_pos_pair.to_csv("sqed_face_iris_pos_scores_pca_feb_01.csv", index=False)

Wrong fit when using k nearest neighbors regression

I use the nearest neighbors method to predict the price of a stock. I have raw data in the file example.txt, and I use the close column (the price at the end of each one-minute period). Linear regression predicts well (shown in green), but the nearest neighbors method works only at the beginning and then turns into a straight line. Please tell me how to fix this. Here is the code I wrote:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
class Reader:
    """Read the whole content of a text file."""

    def __init__(self, filename='example.txt'):
        self.filename = filename

    def read(self):
        """Return the file's text, or the string "File not found" on an I/O error."""
        try:
            # The context manager closes the handle even if read() fails.
            with open(self.filename) as file:
                return file.read()
        except IOError:
            return "File not found"
def main():
    """Print the content of example.txt."""
    reader = Reader('example.txt')
    contents = reader.read()
    print(contents)
class Regression:
    """Sliding-window train/test sets built from the close column of example.txt.

    Args:
        window: number of consecutive prices used as one input sample.
        P0: fraction of the price series used for training.
        Ptest: fraction of the price series used for one test slice.
        i: index of the test slice that follows the training part.
    """

    def __init__(self, window, P0, Ptest, i):
        self.window = window
        self.P0 = P0
        self.Ptest = Ptest
        self.i = i
        self.data_train = self.get_data_train()
        self.x_train = self.get_x_train()
        self.y_train = self.get_y_train()
        self.data_test = self.get_data_test()
        self.x_test = self.get_x_test()
        self.y_test = self.get_y_test()

    def _load_close_prices(self):
        """Return the close column (8th comma-separated field) as floats, header skipped."""
        rows = Reader('example.txt').read().splitlines()
        return [float(row.split(',')[7]) for row in rows[1:]]

    def _windows(self, series):
        """Return every run of ``window`` consecutive values that has a successor."""
        return [series[start: start + self.window]
                for start in range(len(series) - self.window)]

    def get_data_train(self):
        """ Method of obtaining data train on prices for the entire period."""
        prices = self._load_close_prices()
        # Use the instance's own fraction, not a module-level global.
        return prices[:int(len(prices) * self.P0)]

    def get_data_test(self):
        """ Method of obtaining data test on prices for the entire period."""
        prices = self._load_close_prices()
        len_x_test = int(len(prices) * self.Ptest)
        len_x_train = int(len(prices) * self.P0)
        start = len_x_train + len_x_test * self.i
        return prices[start: start + len_x_test]

    def get_x_train(self):
        """Return the training input windows."""
        return self._windows(self.data_train)

    def get_y_train(self):
        """Return the training targets: the price following each window."""
        return list(self.data_train[self.window:])

    def get_x_test(self):
        """Return the test input windows."""
        return self._windows(self.data_test)

    def get_y_test(self):
        """Return the test targets: the price following each window."""
        return list(self.data_test[self.window:])
class Linear_regression(Regression):
    """Linear-regression forecaster over the sliding-window data."""

    def callculate(self):
        """Fit on the training windows and return predictions for the test windows."""
        model = LinearRegression()
        model.fit(self.x_train, self.y_train)
        return model.predict(self.x_test)
class Nearest_neighbor(Regression):
    """K-nearest-neighbours forecaster over the sliding-window data."""

    def callculate(self):
        """Fit on the training windows and return predictions for the test windows.

        The number of neighbours equals this instance's window size.
        """
        # Use the instance's window rather than a module-level global, so the
        # constructor argument is honoured.
        reg_neighbor = KNeighborsRegressor(n_neighbors=self.window, weights='distance')
        reg_neighbor.fit(self.x_train, self.y_train)
        y_pred = reg_neighbor.predict(self.x_test)
        return y_pred
# Walk forward over the price series: train on the first P0 fraction, then
# predict consecutive test slices of size Ptest until fraction Pk is reached.
window = 10
Pk = 1
P0 = 0.1
Ptest = 0.01
k = (Pk - P0)/Ptest
y_real = []
y_neigh = []
y_lin = []
# k is a float quotient; round it so a tiny floating-point error cannot add or
# drop a test slice.
for i in range(int(round(k))):
    linear_model = Linear_regression(window, P0, Ptest, i)
    neighbor_model = Nearest_neighbor(window, P0, Ptest, i)
    y_lin.extend(linear_model.callculate())
    y_neigh.extend(neighbor_model.callculate())
    # Reuse the model built above instead of constructing a third one
    # (each construction re-reads the data file).
    y_real.extend(linear_model.y_test)
# Output to graphs of the received data
fig, ax = plt.subplots()
ax.plot(y_real, label='Initial data')
ax.plot(y_neigh, label='Nearest Neighbor Data')
ax.plot(y_lin, label='Linear Regression Data')
ax.set_xlabel('Time (min)')
ax.set_ylabel('Price, ($)')
ax.legend()
plt.show()
"Linear regression predicts well"
No, it never predicted well. You just looked at the graph and thought it looked kind of similar. But if you look more closely, your 'model' simply takes the price of a bit ago as the prediction of the price now. That means, it's not predicting anything! It's a history device, not a prediction device.
That's why if you feed back this sort of 'model' into itself you get a straight line: it always predicts the next price is going to be equal to the last one.

Categories