This is SVM code
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import joblib
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
data=np.load('data.npy')
target=np.load('target.npy')
train_data,test_data,train_target,test_target=train_test_split(data,target,test_size=0.2)
model=SVC()
model=SVC(kernel='poly')
model.fit(train_data,train_target)
model.kernel
predicted_target=model.predict(test_data)
acc=accuracy_score(test_target,predicted_target)
print('Accuracy:',acc)
This is KNN code
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import joblib
data=np.load('data.npy')
target=np.load('target.npy')
print(data.shape)
print(target.shape,target)
train_data,test_data,train_target,test_target=train_test_split(data,target,test_size=0.2)
print(train_data.shape,train_target.shape)
print(test_data.shape,test_target.shape)
model=KNeighborsClassifier(n_neighbors=30)
model.fit(train_data,train_target)
predicted_target=model.predict(test_data)
print(predicted_target)
print(test_target)
acc=accuracy_score(test_target,predicted_target)
print('Accuracy:',acc)
I need to combine SVM and knn
How to combine SVM with KNN?
How to combine SVM with KNN
a) Train a support vector machine on the collection of nearest neighbors
b) Kernel function is defined as:
K(x,y) = <(x),(y)>
Distance function is converted to kernel function
K(x,y) = <x,y>
=1/2(<x,x> +<y,y> -<x-y, x-y >)
= ½(d(x,0) + d(y,0) â d(x,y))
How can I apply this algorithm to this code?
Related
I want to perform feature selection and nested cross validation on a data set. I wrote this script:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, RFECV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pickle
#import neptune.new as neptune
import pandas as pd
import shap
#full_X_train = df
full_X_train,full_y_train = make_classification(n_samples =500,n_features = 20, random_state=1, n_informative=10,n_redundant=10)
def run_model_with_grid_search(param_grid={},output_plt_file = 'plt.png',model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='random_forest')
cv_outer = KFold(n_splits=5,shuffle=True,random_state=1)
for train_ix,test_ix in cv_outer.split(X_train):
split_x_train, split_x_test = X_train[train_ix,:],X_train[test_ix,:] #add in .iloc
split_y_train, split_y_test = y_train[train_ix],y_train[test_ix] #add in .iloc
cv_inner = KFold(n_splits=3,shuffle=True,random_state=1)
model = model_name
#model.set_params(**best_params)
rfecv = {'RFECV Features': {'cv': 5,
'estimator': model,
'step': 1,
'scoring': 'accuracy',
'verbose': 50}}
rfecv.fit(split_x_train,split_y_train)
print(rfecv.n_features_)
X_selected_train = rfecv.transform(split_x_train)
X_selected_test = rfecv.transform(split_x_test)
search = GridSearchCV(model,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True)
result = search.fit(X_selected_train,split_y_train)
best_model = result.best_estimator_
y_pred_train = best_model.predict(X_selected_train)
y_pred_test = best_model.predict(X_selected_test)
accuracy_train = metrics.accuracy_score(split_y_train, y_pred_train)
accuracy_test = metrics.accuracy_score(split_y_test, y_pred_test)
return
param_grid = [{
# 'random_forest_with_hpo_no_fs_geno_class__bootstrap':[True,False],
# 'random_forest_with_hpo_no_fs_geno_class__max_depth':[10,20,30,40,50,60,70,80],
# 'random_forest_with_hpo_no_fs_geno_class__max_features':['auto','sqrt'],
'min_samples_leaf':[1,3,5],
# 'random_forest_with_hpo_no_fs_geno_class__n_estimators':[200,500,700,1000,1500,2000]
}]
run_model_with_grid_search(param_grid=param_grid)
And I receive the error:
File "test3.py", line 83, in <module>
run_model_with_grid_search(param_grid=param_grid)
File "test3.py", line 57, in run_model_with_grid_search
rfecv.fit(split_x_train,split_y_train)
AttributeError: 'dict' object has no attribute 'fit'
Could someone please tell me how to fix this? Thank you.
How to amend this code (which is a reproducible example):
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, RFECV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pandas as pd
full_X_train,full_y_train = make_classification(n_samples =500,n_features = 20, random_state=1, n_informative=10,n_redundant=10)
def run_model_with_grid_search(param_grid={},output_plt_file = 'plt.png',model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='random_forest_with_hpo_no_fs_geno_class', n_splits=5, output_file='random_forest_with_hpo_no_fs_geno_class.txt'):
cv_outer = KFold(n_splits=5,shuffle=True,random_state=1)
for train_ix,test_ix in cv_outer.split(X_train):
split_x_train, split_x_test = X_train[train_ix,:],X_train[test_ix,:] #add in .iloc
split_y_train, split_y_test = y_train[train_ix],y_train[test_ix] #add in .iloc
cv_inner = KFold(n_splits=3,shuffle=True,random_state=1)
model = model_name
rfecv = RFECV(estimator=model, step=1,cv=5, scoring='roc_auc')
search = GridSearchCV(model,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True)
pipeline = Pipeline([('feature_sele',rfecv),('clf_cv',search)])
result = pipeline.fit(split_x_train,split_y_train)
#result = search.fit(split_x_train,split_y_train)
best_model = result.best_estimator_
yhat = best_model.predict(split_x_test)
print('>acc=%.3f,est=%.3f,cfg=%s'%(accuracy,result.best_score_,result.best_params_))
return
param_grid = [{
'min_samples_leaf':[1,3,5],
}]
run_model_with_grid_search(param_grid=param_grid)
Generates:
Attribute Error: Pipeline object has no attribute 'best_estimator_'
The ultimate aim is to perform nested cross validation, hyper parameter optimization and feature selection in this function, and I was trying to follow this example
How to edit this function to perform that correctly?
Normally, you'd run grid search on the pipeline, not the pipeline on grid search. Is there a certain reason you'd want it the other way round?
pipeline = Pipeline([('feature_sele',rfecv), ('clf',model)])
search = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=cv_inner, refit=True)
result = search.fit(split_x_train, split_y_train)
best_model = result.best_estimator_
(param_grid will require clf_ prefix to hyperparameter names ofc.)
On an unlrelated note, accuracy seems to be undefined.
I wanted to use sklearn.pipeline instead of using imblearn.pipeline to incorporate `RandomUnderSampler()'. My original data requires missing value imputation and scaling. Here I have breast cancer data as a toy example. However, it gave me the following error message. I appreciate your suggestions. Thanks for your time!
from numpy.random import seed
seed(12)
from sklearn.datasets import load_breast_cancer
import time
from sklearn.metrics import make_scorer
from imblearn.metrics import geometric_mean_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler
from imblearn.under_sampling import RandomUnderSampler
gmean = make_scorer(geometric_mean_score, greater_is_better=True)
X, y = load_breast_cancer(return_X_y=True)
start_time1 = time.time()
scoring = {'G-mean': gmean}
LR_pipe = Pipeline([("impute", SimpleImputer(strategy='constant',fill_value= 0)),("scale", MaxAbsScaler()),("rus", RandomUnderSampler()),("LR", LogisticRegression(solver='lbfgs', random_state=0, class_weight='balanced', max_iter=100000))])
LRscores = cross_validate(LR_pipe,X, y, cv=5,scoring=scoring)
end_time1 = time.time()
print ("Computational time in seconds = " +str(end_time1 - start_time1) )
sorted(LRscores.keys())
LR_Gmean = LRscores['test_G-mean'].mean()
print("G-mean: %f" % (LR_Gmean))
Error message:
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RandomUnderSampler()' (type <class 'imblearn.under_sampling._prototype_selection._random_under_sampler.RandomUnderSampler'>) doesn't
We should import make_pipeline from imblearn.pipeline and not from sklearn.pipeline: make_pipeline from sklearn needs the transformers to implement fit and transform methods. sklearn.pipeline import Pipeline was conflicting with imblearn.pipeline import Pipeline!
I am new in Data Science and have struggled in the problem for the Kaggle's problem. When I use random forest regression for predicting the rating, it is found high Score using Train Test Split but Low Score while using CV Score.
with train test split_randomforest 0.8746277302652172
with no train test split_randomforest 0.8750717943467078
with CV randomforest 10.713885026374156 %
https://www.kaggle.com/data13/machine-learning-model-to-predict-app-rating-94
import time
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib as mpl
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.metrics import r2_score
import statsmodels.api as sm
import sklearn.model_selection as ms
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.neighbors import KDTree
from sklearn import svm
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from lightgbm import LGBMClassifier
database = pd.read_csv(r"C:\Users\Anson\Downloads\49864_274957_bundle_archive\googleplaystore.csv")# store wine type as an attribute
## Size - Strip the M and k value
database['Size'] = database['Size'].apply(lambda x : x.strip('M'))
database['Size'] = database['Size'].apply(lambda x : x.strip('k'))
##
## Rating - Fill the Blank Value with median
database['Rating'].fillna(database['Rating'].median(),inplace=True)
database['Rating'].replace(19,database['Rating'].median(),inplace=True)
###
## Reviews - replace the blank cell
database['Reviews'].replace('3.0M',3000000,inplace=True)
database['Reviews'].replace('0',float("NaN"),inplace=True)
database.dropna(subset=['Reviews'],inplace=True)
##
## Strip the + value
database['Installs'] = database['Installs'].apply(lambda x : x.strip('+'))
database['Installs'] = database['Installs'].apply(lambda x : x.replace(',',''))
database['Price'] = database['Price'].apply(lambda x : x.strip('$'))
###
## Drop Blank
database['Content Rating'].fillna("NaN",inplace=True)
database.dropna(subset=['Content Rating'],inplace=True)
##
## Drop Wrong Number
database['Last Updated'].replace('1.0.19',float("NaN"),inplace=True)
database.dropna(subset=['Last Updated'],inplace=True)
database['Last Updated'] = database['Last Updated'].apply(lambda x : time.mktime(datetime.datetime.strptime(x, '%B %d, %Y').timetuple()))
##
le = preprocessing.LabelEncoder()
database['App'] = le.fit_transform(database['App'])
database['Category'] = le.fit_transform(database['Category'])
database['Content Rating'] = le.fit_transform(database['Content Rating'])
database['Type'] = le.fit_transform(database['Type'])
database['Genres'] = le.fit_transform(database['Genres'])
###############################
##feature engineering
features = ['App', 'Reviews', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated']
X=database[features]
y=database['Rating']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=None)
rfc= RandomForestRegressor()
rfc.fit(X_train,y_train)
rfc.fit(X,y)
rfc_score=rfc.score(X_test,y_test)
rfc_score1=rfc.score(X,y)
score_CV_randomforest = cross_val_score(rfc,X,y,cv=KFold(n_splits=5, shuffle=True),scoring='r2')
score_CV_randomforest = score_CV_randomforest.mean()*100
print("with train test split_randomforest", rfc_score)
print("with no train test split_randomforest", rfc_score1)
print("with CV randomforest", score_CV_randomforest, "%")
Train/Test Split:
You are using 80:20 ratio fro training and testing.
Cross-validation
when the data set is randomly split up into âkâ groups. One of the groups is used as the test set and the rest are used as the training set. The model is trained on the training set and scored on the test set. Then the process is repeated until each unique group as been used as the test set.
You are using 5-fold cross validation, the data set would be split into 5 groups, and the model would be trained and tested 5 separate times so each group would get a chance to be the test set.
So the reason for different result is, that model is trained on different random samples.
I am using the iris flower dataset to do the sorting. I need to make a confusion matrix through cross validation (fold = 10) but I don't know how to do it. I generated the confusion matrix of only one round.
# I am using TPOT autoML library for python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import LabelEncoder
tpot_data = pd.read_csv('iris.csv')
tpot_data = tpot_data.apply(LabelEncoder().fit_transform)
features = tpot_data.drop('species', axis=1).values
training_features, testing_features, training_target, testing_target = \
train_test_split(features, tpot_data['species'].values, random_state=10)
exported_pipeline = make_pipeline(StackingEstimator(estimator=GaussianNB()),
MultinomialNB(alpha=0.01, fit_prior=False)
)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(testing_target, results))
pd.crosstab(testing_target, results, rownames=['Actual Class'], colnames=['Predicted Class'])
from sklearn.model_selection import cross_val_score
array_cross_val_score = cross_val_score(estimator=exported_pipeline, X=training_features,
y=training_target, cv=10, scoring='accuracy')
# I would like the confusion matrix to be based on the average cross-validation
np.mean(array_cross_val_score)