I want to perform feature selection and nested cross-validation on a data set. I wrote this script:
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, train_test_split
from sklearn.feature_selection import SelectKBest, RFECV, mutual_info_classif
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.utils import shuffle
from numpy import mean, std
import numpy as np
import matplotlib.pyplot as plt
import pickle
import pandas as pd
import shap
full_X_train, full_y_train = make_classification(n_samples=500, n_features=20, random_state=1, n_informative=10, n_redundant=10)
def run_model_with_grid_search(param_grid={}, output_plt_file='plt.png', model_name=RandomForestClassifier(), X_train=full_X_train, y_train=full_y_train, model_id='random_forest'):
    cv_outer = KFold(n_splits=5, shuffle=True, random_state=1)
    for train_ix, test_ix in cv_outer.split(X_train):
        split_x_train, split_x_test = X_train[train_ix, :], X_train[test_ix, :]  # use .iloc here if X_train is a DataFrame
        split_y_train, split_y_test = y_train[train_ix], y_train[test_ix]  # use .iloc here if y_train is a Series
        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        model = model_name
        #model.set_params(**best_params)
        rfecv = {'RFECV Features': {'cv': 5,
                                    'estimator': model,
                                    'step': 1,
                                    'scoring': 'accuracy',
                                    'verbose': 50}}
        rfecv.fit(split_x_train, split_y_train)
        print(rfecv.n_features_)
        X_selected_train = rfecv.transform(split_x_train)
        X_selected_test = rfecv.transform(split_x_test)
        search = GridSearchCV(model, param_grid=param_grid, scoring='roc_auc', cv=cv_inner, refit=True)
        result = search.fit(X_selected_train, split_y_train)
        best_model = result.best_estimator_
        y_pred_train = best_model.predict(X_selected_train)
        y_pred_test = best_model.predict(X_selected_test)
        accuracy_train = metrics.accuracy_score(split_y_train, y_pred_train)
        accuracy_test = metrics.accuracy_score(split_y_test, y_pred_test)
    return
param_grid = [{
    # 'random_forest_with_hpo_no_fs_geno_class__bootstrap': [True, False],
    # 'random_forest_with_hpo_no_fs_geno_class__max_depth': [10, 20, 30, 40, 50, 60, 70, 80],
    # 'random_forest_with_hpo_no_fs_geno_class__max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 3, 5],
    # 'random_forest_with_hpo_no_fs_geno_class__n_estimators': [200, 500, 700, 1000, 1500, 2000]
}]
run_model_with_grid_search(param_grid=param_grid)
And I receive the error:
File "test3.py", line 83, in <module>
run_model_with_grid_search(param_grid=param_grid)
File "test3.py", line 57, in run_model_with_grid_search
rfecv.fit(split_x_train,split_y_train)
AttributeError: 'dict' object has no attribute 'fit'
Could someone please tell me how to fix this? Thank you.
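For reference, rfecv here is created as a plain dictionary, so it has no fit method. A minimal sketch of the fix, reusing the settings stored inside the dict, is to construct an actual RFECV estimator inside the loop:

    rfecv = RFECV(estimator=model, step=1, cv=5, scoring='accuracy', verbose=50)  # an estimator, not a dict
    rfecv.fit(split_x_train, split_y_train)
    print(rfecv.n_features_)
    X_selected_train = rfecv.transform(split_x_train)
    X_selected_test = rfecv.transform(split_x_test)

RFECV is already imported at the top of the script, and RandomForestClassifier exposes feature_importances_, so RFECV can rank and eliminate features with it.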
This is the SVM code:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import joblib
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
data = np.load('data.npy')
target = np.load('target.npy')
train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=0.2)
model = SVC(kernel='poly')
model.fit(train_data, train_target)
print(model.kernel)
predicted_target = model.predict(test_data)
acc = accuracy_score(test_target, predicted_target)
print('Accuracy:', acc)
This is the KNN code:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import joblib
data = np.load('data.npy')
target = np.load('target.npy')
print(data.shape)
print(target.shape, target)
train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=0.2)
print(train_data.shape, train_target.shape)
print(test_data.shape, test_target.shape)
model = KNeighborsClassifier(n_neighbors=30)
model.fit(train_data, train_target)
predicted_target = model.predict(test_data)
print(predicted_target)
print(test_target)
acc = accuracy_score(test_target, predicted_target)
print('Accuracy:', acc)
I need to combine SVM and KNN. How can I combine SVM with KNN? The approach is:
a) Train a support vector machine on the collection of nearest neighbors.
b) The kernel function is defined as K(x, y) = <phi(x), phi(y)>, and the distance function is converted to a kernel function:
K(x, y) = <x, y>
        = 1/2 (<x, x> + <y, y> - <x - y, x - y>)
        = 1/2 (d(x, 0) + d(y, 0) - d(x, y))
where d(x, y) = ||x - y||^2 is the squared Euclidean distance.
How can I apply this algorithm to this code?
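One common reading of step (a) is the SVM-KNN local-learning idea: for each test point, find its k nearest training points, fit an SVM on just those neighbors, and use that local model for the prediction. A minimal sketch of that interpretation, built on the same data.npy/target.npy files as the code above (this is one possible combination, not the only one):

    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import NearestNeighbors
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score

    data = np.load('data.npy')
    target = np.load('target.npy')
    train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=0.2)

    k = 30  # neighborhood size, same as the KNN code above
    nn = NearestNeighbors(n_neighbors=k).fit(train_data)

    predicted_target = []
    for x in test_data:
        # indices of the k training samples closest to this test sample
        _, idx = nn.kneighbors(x.reshape(1, -1))
        neighbor_X, neighbor_y = train_data[idx[0]], train_target[idx[0]]
        if len(np.unique(neighbor_y)) == 1:
            # all neighbors share one class: no SVM needed
            predicted_target.append(neighbor_y[0])
        else:
            # train a local SVM on the collection of nearest neighbors
            local_svm = SVC(kernel='poly').fit(neighbor_X, neighbor_y)
            predicted_target.append(local_svm.predict(x.reshape(1, -1))[0])

    print('Accuracy:', accuracy_score(test_target, predicted_target))

The distance-to-kernel conversion in step (b) could then be plugged in by passing a callable as SVC's kernel argument, if a custom distance is needed.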
I wanted to use sklearn.pipeline instead of imblearn.pipeline to incorporate RandomUnderSampler(). My original data requires missing-value imputation and scaling; here I use the breast cancer data as a toy example. However, it gave me the following error message. I appreciate your suggestions. Thanks for your time!
from numpy.random import seed
seed(12)
from sklearn.datasets import load_breast_cancer
import time
from sklearn.metrics import make_scorer
from imblearn.metrics import geometric_mean_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler
from imblearn.under_sampling import RandomUnderSampler
gmean = make_scorer(geometric_mean_score, greater_is_better=True)
X, y = load_breast_cancer(return_X_y=True)
start_time1 = time.time()
scoring = {'G-mean': gmean}
LR_pipe = Pipeline([("impute", SimpleImputer(strategy='constant', fill_value=0)),
                    ("scale", MaxAbsScaler()),
                    ("rus", RandomUnderSampler()),
                    ("LR", LogisticRegression(solver='lbfgs', random_state=0, class_weight='balanced', max_iter=100000))])
LRscores = cross_validate(LR_pipe,X, y, cv=5,scoring=scoring)
end_time1 = time.time()
print ("Computational time in seconds = " +str(end_time1 - start_time1) )
sorted(LRscores.keys())
LR_Gmean = LRscores['test_G-mean'].mean()
print("G-mean: %f" % (LR_Gmean))
Error message:
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RandomUnderSampler()' (type <class 'imblearn.under_sampling._prototype_selection._random_under_sampler.RandomUnderSampler'>) doesn't
We should import Pipeline from imblearn.pipeline and not from sklearn.pipeline: sklearn's Pipeline requires every intermediate step to implement fit and transform, whereas imblearn's Pipeline also accepts samplers such as RandomUnderSampler. The sklearn.pipeline Pipeline import was shadowing the imblearn.pipeline Pipeline!
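For completeness, a minimal sketch of the corrected version (only the Pipeline import changes; everything else stays as in the question):

    from imblearn.pipeline import Pipeline  # instead of sklearn.pipeline
    from imblearn.under_sampling import RandomUnderSampler
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import MaxAbsScaler
    from sklearn.linear_model import LogisticRegression

    LR_pipe = Pipeline([("impute", SimpleImputer(strategy='constant', fill_value=0)),
                        ("scale", MaxAbsScaler()),
                        ("rus", RandomUnderSampler()),  # samplers are allowed in imblearn's Pipeline
                        ("LR", LogisticRegression(solver='lbfgs', random_state=0,
                                                  class_weight='balanced', max_iter=100000))])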
My input file has the form:
gold,Program,MethodType,RequirementID,MethodID,CallersT,CallersN,CallersU,CallersCallersT,CallersCallersN,CallersCallersU,CalleesT,CalleesN,CalleesU,CalleesCalleesT,CalleesCalleesN,CalleesCalleesU,classGold,VariableTraceValue
T,chess,Inner,1,1,1,0,0,1,0,0,1,0,0,0,0,1,Trace,N
N,chess,Inner,2,1,0,1,0,0,1,0,0,1,0,0,0,1,NoTrace,N
N,chess,Inner,3,1,0,1,0,0,1,0,0,1,0,0,0,1,NoTrace,N
N,chess,Inner,4,1,0,1,0,0,1,0,0,1,0,0,0,1,Trace,N
N,chess,Inner,5,1,0,1,0,0,1,0,0,1,0,0,0,1,NoTrace,N
N,chess,Inner,6,1,0,1,0,0,1,0,0,1,0,0,0,1,Trace,U
N,chess,Inner,7,1,0,1,0,0,1,0,0,1,0,0,0,1,Trace,T
N,chess,Inner,8,1,0,1,0,0,1,0,0,1,0,0,0,1,NoTrace,N
N,chess,Inner,1,3,0,1,0,0,1,0,0,1,1,0,1,1,Trace,x
N,chess,Inner,2,3,0,1,0,0,1,0,0,1,1,0,1,1,NoTrace,x
I would like to perform hyperparameter tuning for the random forest technique, predicting the value of the first column (gold) of my input file from the values of the remaining columns. The problem is that I receive the error message "features and labels not defined" for the following line of code: RFR_random.fit(features, labels). I don't know how to define features and labels, as I have just followed the code of a tutorial. Here is my source code:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
from sklearn.metrics import mean_squared_error
import sys
from sklearn.model_selection import validation_curve
import matplotlib.pyplot as plt
def main(seed):
    dataset = pd.read_csv('inputFieldsMajority+MethodCalls.txt', sep=',', index_col=False)
    dataset['Program'] = dataset['Program'].astype('category').cat.codes
    dataset['MethodType'] = dataset['MethodType'].astype('category').cat.codes
    dataset['classGold'] = dataset['classGold'].astype('category').cat.codes
    dataset['VariableTraceValue'] = dataset['VariableTraceValue'].astype('category').cat.codes
    dataset = dataset.drop(columns=['RequirementID'], axis=1)
    dataset = dataset.drop(columns=['MethodID'], axis=1)
    dataset = dataset.drop(columns=['Program'], axis=1)
    pd.set_option('display.max_columns', None)
    row_count, column_count = dataset.shape
    n_estimators = [500, 800, 1500, 2500, 5000]
    max_features = ['auto', 'sqrt', 'log2']
    max_depth = [10, 20, 30, 40, 50]
    max_depth.append(None)
    min_samples_split = [2, 5, 10, 15, 20]
    min_samples_leaf = [1, 2, 5, 10, 15]
    grid_param = {'n_estimators': n_estimators,
                  'max_features': max_features,
                  'max_depth': max_depth,
                  'min_samples_split': min_samples_split,
                  'min_samples_leaf': min_samples_leaf}
    from sklearn.model_selection import RandomizedSearchCV
    RFR = RandomForestClassifier(random_state=1)
    RFR_random = RandomizedSearchCV(estimator=RFR, param_distributions=grid_param,
                                    n_iter=500, cv=5, verbose=2, random_state=42,
                                    n_jobs=-1)
    RFR_random.fit(features, labels)
    print(RFR_random.best_params_)
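A minimal sketch of the missing definitions, assuming the gold column is the target and every column left in dataset after the drops is a feature (place these lines inside main, before RFR_random.fit):

    labels = dataset['gold']                   # first column = target
    features = dataset.drop(columns=['gold'])  # everything else = features

scikit-learn classifiers accept string class labels such as T/N directly, so no extra encoding of gold is strictly required.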