When running grid search on Inverse of regularization strength parameter and number of nearest neighbors parameter for logistic regression , linear SVM and K nearest neighbors classifier , The best parameters obtained from gridsearch are not really the best when verifying manually by training on same training data set. Code below
# Convert to a DataFrame.
import pandas as pd
from sklearn.datasets import fetch_openml
df = fetch_openml('credit-g', as_frame=True).frame
df.head(5)
df.dtypes
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12, 12))
st = fig.suptitle("univariate distributions and target distribution", fontsize=20)
# Using columns that we need for this plot
nfeatures = df[['duration', 'credit_amount' , 'age']]
target = df['class']
# creating 4x4 grid
grid = plt.GridSpec(4, 4, hspace=0.4, wspace=0.4)
# creating the normal plots in grid 1 , 2 ,3 and 4
p1 = fig.add_subplot(grid[:2,:2])
p2 = fig.add_subplot(grid[:2,2:])
p3 = fig.add_subplot(grid[2:,:2])
p4 = fig.add_subplot(grid[2:,2:])
p1.hist(nfeatures['duration'])
p2.hist(nfeatures['credit_amount'])
p3.hist(nfeatures['age'])
p4.hist(target)
p1.set_xlabel('duration')
p2.set_xlabel('credit_amount')
p3.set_xlabel('age')
p4.set_xlabel('class')
# customizing to look neat
st.set_y(0.95)
fig.subplots_adjust(top=0.92)
from sklearn.model_selection import train_test_split
columns = [column for column in df.columns if column != 'class']
X = df[columns]
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3 ,random_state=11)
#X_train , y_train , X_valid , y_valid = train_test_split(X,)
# basic preprocessing on train sets
# numeric_columns = ['duration','credit_amount' , 'installment_commitment' , 'residence_since' , 'age' ,'existing_credits' , 'num_dependents' ]
numeric_columns = df.select_dtypes(include=['float64']).columns
categorical_columns = [column for column in columns if column not in numeric_columns]
temp = X_train[categorical_columns]
X_train_ohe = pd.concat([pd.get_dummies(temp),X_train[numeric_columns]],axis=1)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
lr = LogisticRegression(max_iter=1000)
cr = cross_val_score(lr,X_train_ohe,y_train)
print(cr)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
# define the data preparation for the categorical columns
t1 = [('cat', OneHotEncoder(), categorical_columns)]
col_transform = ColumnTransformer(transformers=t1)
# define the models
models = {'lr_model':LogisticRegression(max_iter=1000), 'lsvm_model':LinearSVC(max_iter=2500) , 'knn_model':KNeighborsClassifier()}
for name,model in models.items():
# define the data preparation and modeling pipeline
pipeline = Pipeline(steps=[('prep',col_transform), ('m', model)])
# define the model cross-validation configuration
#cv = KFold(n_splits=10, shuffle=True, random_state=1)
# evaluate the pipeline using cross validation and calculate MAE
score = cross_val_score(pipeline, X_train, y_train)
print(name ,score.mean())
# define the data preparation for the categorical columns and numeric columns
t2 = [('cat', OneHotEncoder(), categorical_columns), ('num', StandardScaler(), numeric_columns)]
col_transform = ColumnTransformer(transformers=t2)
# try with new column transformer
for name,model in models.items():
# define the data preparation and modeling pipeline
pipeline = Pipeline(steps=[('prep',col_transform), ('m', model)])
# define the model cross-validation configuration
#cv = KFold(n_splits=10, shuffle=True, random_state=1)
# evaluate the pipeline using cross validation and calculate MAE
score = cross_val_score(pipeline, X_train, y_train)
print(name ,score.mean())
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
f1_scorer = make_scorer(f1_score, pos_label="bad")
# 'prep__num__with_mean': [True, False],
# 'prep__num__with_std': [True, False],
param_grid = {
'm__C': [0.1, 1.0 , 0.01],
}
param_grid_knn = {
'm__n_neighbors': [5, 10 , 15],
}
for name,model in models.items():
# define the data preparation and modeling pipeline
pipeline = Pipeline(steps=[('prep',col_transform), ('m', model)])
# define the model cross-validation configuration
#cv = KFold(n_splits=10, shuffle=True, random_state=1)
# evaluate the pipeline using cross validation and calculate MAE
if name == 'knn_model':
grid_clf = GridSearchCV(pipeline, param_grid_knn, cv=5, scoring=f1_scorer )
else:
grid_clf = GridSearchCV(pipeline, param_grid, cv=5, scoring=f1_scorer)
grid_clf.fit(X_train, y_train)
print(name,grid_clf.best_params_)
print(name, grid_clf.best_estimator_.score(X_test, y_test))
lr_array = []
lr_c = [0.01,0.1,1]
for c in lr_c:
pipeline = Pipeline(steps=[('prep',col_transform), ('m', LogisticRegression(max_iter=1000, C=c))])
pipeline.fit(X_train,y_train)
y_hat = pipeline.predict(X_train)
lr_array.append(f1_score(y_train,y_hat,pos_label="bad"))
lsvm_array = []
lsvm_c = [0.01,0.1,1]
for c in lsvm_c:
pipeline = Pipeline(steps=[('prep',col_transform), ('m', LinearSVC(dual=True,max_iter=2500,C=c))])
pipeline.fit(X_train,y_train)
y_hat = pipeline.predict(X_train)
lsvm_array.append(f1_score(y_train,y_hat,pos_label="bad"))
knn_array = []
knn_n = [5,10,15]
for n in knn_n:
pipeline = Pipeline(steps=[('prep',col_transform), ('m', KNeighborsClassifier(n_neighbors=n))])
pipeline.fit(X_train,y_train)
y_hat = pipeline.predict(X_train)
knn_array.append(f1_score(y_train,y_hat,pos_label="bad"))
fig = plt.figure(figsize=(12, 12))
# creating 3x1 grid
grid = plt.GridSpec(3, 1, hspace=0.4, wspace=0.4)
# creating the normal plots in grid 1 , 2 ,3
p1 = fig.add_subplot(grid[0,:])
p2 = fig.add_subplot(grid[1,:])
p3 = fig.add_subplot(grid[2,:])
p1.scatter(lr_c,lr_array)
p2.scatter(lsvm_c,lsvm_array)
p3.scatter(knn_n,knn_array)
The trend changes when using different scores and evaluating on test set instead train set but the best parameters never seem to be same for grid search and manual verification . What could be the reason for this? For example if you run the above code grid search tells you 10 is the best value for n_neighbors but the graph at the end shows 5 does better .Is the comparison not being implemented correctly ? You can check the runs with output at this link https://github.com/binodmathews93/AppliedMachineLearningCourse/blob/master/Applied_Machine_Learning_Homework_2.ipynb
Hyperparameter tuning is performed on the validation (development) set, not on the training set.
Grid Search Cross-Validation is using the K-Fold strategy to build a validation set that is used only for validation, not for training.
You are manually performing training and validation on the same set which is an incorrect approach.
pipeline = Pipeline(steps=[('prep',col_transform), ('m', LogisticRegression(max_iter=1000, C=c))])
pipeline.fit(X_train,y_train) # <- here is the problem
y_hat = pipeline.predict(X_train)
lr_array.append(f1_score(y_train,y_hat,pos_label="bad"))
This will only lead to hyperparameter choices that will boost performance on the training set which is not what you want (you what a set of hyperparameters that lead to good performance on the test set - that generalizes well).
This is the reason why the K (in KNN) is lower when you are doing the manual testing - lower K leads to less "regularization" and therefore is optimal, although incorrect, choice from the perspective of the training set.
If you want to manually verify the results, you will need to build the validation set by yourself (and don't use it during the training), or you will need to manually call K-fold cross-validation procedure.
Related
After making a desicion tree function, I have decided to check how accurate is the tree, and to confirm that atleast the first split is the same if I'll make another trees with the same data
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os
from sklearn import tree
from sklearn import preprocessing
import sys
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
.....
def desicion_tree(data_set:pd.DataFrame,val_1 : str, val_2 : str):
#Encoder -- > fit doesn't accept strings
feature_cols = data_set.columns[0:-1]
X = data_set[feature_cols] # Independent variables
y = data_set.Mut #class
y = y.to_list()
le = preprocessing.LabelBinarizer()
y = le.fit_transform(y)
# Split data set into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1) # 75%
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(max_depth= 4, criterion= 'entropy')
# Train Decision Tree Classifer
clf.fit(X_train, y_train)
# Predict the response for test dataset
y_pred = clf.predict(X_test)
#Perform cross validation
for i in range(2, 8):
plt.figure(figsize=(14, 7))
# Perform Kfold cross validation
#cv = ShuffleSplit(test_size=0.25, random_state=0)
kf = KFold(n_splits=5,shuffle= True)
scores = cross_val_score(estimator=clf, X=X, y=y, n_jobs=4, cv=kf)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
tree.plot_tree(clf,filled = True,feature_names=feature_cols,class_names=[val_1,val_2])
plt.show()
desicion_tree(car_rep_sep_20, 'Categorial', 'Non categorial')
Down , I wrote a loop in order to rectreate the tree with splitted values using Kfold. The accuracy is changing (around 90%) but the tree is the same, where did I mistaken?
cross_val_score clones the estimator in order to fit-and-score on the various folds, so the clf object remains the same as when you fit it to the entire dataset before the loop, and so the plotted tree is that one rather than any of the cross-validated ones.
To get what you're after, I think you can use cross_validate with option return_estimator=True. You also shouldn't need the loop, if your cv object has the number of splits desired:
kf = KFold(n_splits=5, shuffle=True)
cv_results = cross_validate(
estimator=clf,
X=X,
y=y,
n_jobs=4,
cv=kf,
return_estimator=True,
)
print("%0.2f accuracy with a standard deviation of %0.2f" % (
cv_results['test_score'].mean(),
cv_results['test_score'].std(),
))
for est in cv_results['estimator']:
tree.plot_tree(est, filled=True, feature_names=feature_cols, class_names=[val_1, val_2])
plt.show();
Alternatively, loop manually over the folds (or other cv iteration), fitting the model and plotting its tree in the loop.
I am writing a python script that deal with sentiment analysis and I did the pre-process for the text and vectorize the categorical features and split the dataset, then I use the LogisticRegression model and I got accuracy 84%
When I upload a new dataset and try to deploy the created model I got accuracy 51,84%
code:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
stop_words = set(stopwords.words('english'))
import joblib
def load_dataset(filename, cols):
dataset = pd.read_csv(filename, encoding='latin-1')
dataset.columns = cols
return dataset
dataset = load_dataset("F:\AIenv\sentiment_analysis\input_2_balanced.csv", ["id","label","date","text"])
dataset.head()
dataset['clean_text'] = dataset['text'].apply(processTweet)
# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(dataset["clean_text"].apply(lambda x: x.split(" ")))]
# train a Doc2Vec model with our text data
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
# transform each document into a vector data
doc2vec_df = dataset["clean_text"].apply(lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)
doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]
dataset = pd.concat([dataset, doc2vec_df], axis=1)
# add tf-idfs columns
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df = 10)
tfidf_result = tfidf.fit_transform(dataset["clean_text"]).toarray()
tfidf_df = pd.DataFrame(tfidf_result, columns = tfidf.get_feature_names())
tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
tfidf_df.index = dataset.index
dataset = pd.concat([dataset, tfidf_df], axis=1)
x = dataset.iloc[:,3]
y = dataset.iloc[:,1]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)
from sklearn.pipeline import Pipeline
# create pipeline
pipeline = Pipeline([
('bow', CountVectorizer(strip_accents='ascii',
stop_words=['english'],
lowercase=True)),
('tfidf', TfidfTransformer()),
('classifier', LogisticRegression(C=15.075475376884423,penalty="l2")),
])
# Parameter grid settings for LogisticRegression
parameters = {'bow__ngram_range': [(1, 1), (1, 2)],
'tfidf__use_idf': (True, False),
}
grid = GridSearchCV(pipeline, cv=10, param_grid=parameters, verbose=1,n_jobs=-1)
grid.fit(X_train,y_train)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
#get predictions from best model above
y_preds = grid.predict(X_test)
cm = confusion_matrix(y_test, y_preds)
print("accuracy score: ",accuracy_score(y_test,y_preds))
print("\n")
print("confusion matrix: \n",cm)
print("\n")
print(classification_report(y_test,y_preds))
joblib.dump(grid,"F:\\AIenv\\sentiment_analysis\\RF_jupyter.pkl")
RF_Model = joblib.load("F:\\AIenv\\sentiment_analysis\\RF_jupyter.pkl")
test_twtr_preds = RF_Model.predict(test_twtr["clean_text"])
I have conducted survey research on different classifications performance in Sentiment analysis.
For a specific twitter dataset, I used to perform models like Logistic Regression, Naïve Bayes, Support vector machine, k-nearest neighbors (KNN), and Decision tree as well.
Observations of the selected dataset show that Logistic Regression and Naïve Bayes perform well in all types of testing with accuracy. SVM at the next. Then Decision tree classification with accuracy. As of the result, KNN scores lowest with accuracy level. Logistic regression and Naïve Bayes models are performing respectively better in sentiment analysis and predictions.
Sentiment Classifier (Accuracy Score RMSE)
LR (78.3541 1.053619)
NB (76.764706 1.064738)
SVM (73.5835 1.074752)
DT (69.2941 1.145234)
KNN (62.9476 1.376589)
Feature extraction is very critical in these cases.
#This may help you.
Importing Essentials
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import time
df = pd.read_csv('FilePath', header=0)
X = df['content']
y = df['sentiment']
def lrSentimentAnalysis(n):
# Using CountVectorizer to convert text into tokens/features
vect = CountVectorizer(ngram_range=(1, 1))
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=n)
# Using training data to transform text into counts of features for each message
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_test_dtm = vect.transform(X_test)
# dual = [True, False]
max_iter = [100, 110, 120, 130, 140, 150]
C = [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5]
solvers = ['newton-cg', 'lbfgs', 'liblinear']
param_grid = dict(max_iter=max_iter, C=C, solver=solvers)
LR1 = LogisticRegression(penalty='l2', multi_class='auto')
grid = GridSearchCV(estimator=LR1, param_grid=param_grid, cv=10, n_jobs=-1)
grid_result = grid.fit(X_train_dtm, y_train)
# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
y_pred = grid_result.predict(X_test_dtm)
print ('Accuracy Score: ', metrics.accuracy_score(y_test, y_pred) * 100, '%')
# print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred))
# print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
# print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print ('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
return [n, metrics.accuracy_score(y_test, y_pred) * 100, grid_result.best_estimator_.get_params()['max_iter'],
grid_result.best_estimator_.get_params()['C'], grid_result.best_estimator_.get_params()['solver']]
def darwConfusionMetrix(accList):
# Using CountVectorizer to convert text into tokens/features
vect = CountVectorizer(ngram_range=(1, 1))
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=accList[0])
# Using training data to transform text into counts of features for each message
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_test_dtm = vect.transform(X_test)
# Accuracy using Logistic Regression Model
LR = LogisticRegression(penalty='l2', max_iter=accList[2], C=accList[3], solver=accList[4])
LR.fit(X_train_dtm, y_train)
y_pred = LR.predict(X_test_dtm)
# creating a heatmap for confusion matrix
data = metrics.confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(data, columns=np.unique(y_test), index=np.unique(y_test))
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
plt.figure(figsize=(10, 7))
sns.set(font_scale=1.4) # for label size
sns.heatmap(df_cm, cmap="Blues", annot=True, annot_kws={"size": 16}) # font size
fig0 = plt.gcf()
fig0.show()
fig0.savefig('FilePath', dpi=100)
def findModelWithBestAccuracy(accList):
accuracyList = []
for item in accList:
accuracyList.append(item[1])
N = accuracyList.index(max(accuracyList))
print('Best Model:', accList[N])
return accList[N]
accList = []
print('Logistic Regression')
print('grid search method for hyperparameter tuning (accurcy by cross validation) ')
for i in range(2, 7):
n = i / 10.0
print ("\nsplit ", i - 1, ": n=", n)
accList.append(lrSentimentAnalysis(n))
darwConfusionMetrix(findModelWithBestAccuracy(accList))
Preprocessing is a vital part of building a well-performing classifier. When you have such a large discrepancy between training and test set performance, it is likely that some error has occurred in your preprocessing (of your test set).
A classifier is also available without any programming. The second video here (below) shows how sentiments can be classified from keywords in mails.
You can visit the web service insight classifiers and try a free build first.
Your new data can be very different from the first dataset you used to train and test your model. Preprocessing techniques and statistical analysis will help you characterise your data and compare different datasets. Poor performance on new data can be observed for various reasons including:
your initial dataset is not statistically representative of a bigger data dataset (example: your dataset is a corner case)
Overfitting: you over-train your model which incorporates specificities (noise) of the training data
Different preprocessing methods
Unbalanced training data set. ML techniques work best with balanced dataset (equal occurrence of different classes in the training set)
I am using the following Python code to make output predictions depending on some values using decision trees based on entropy/gini index. My input data is contained in the file: https://drive.google.com/file/d/1C8GZ2wiqFUW3WuYxyc0G3axgkM1Uwsb6/view?usp=sharing
The first column "gold" in the file contains the output that I am trying to predict (either T or N). The remaining columns represents some 0 or 1 data that I can use to predict the first column. I am using a test set of 30% and a training set of 70%. I am getting the same precision/recall using either entropy or gini index. I am getting a precision of 0.80 for T and a recall of 0.54 for T. I would like to increase the precision of T and I am okay if the recall for T goes down as well, I am willing to accept this tradeoff. I do not care about the precision/recall of N predictions, I am just trying to improve the precision of T, that's all I care about. I guess increasing the precision means that we should abstain from making predictions in some situations that we are not certain about. How to do that?
# Run this program on your local python
# interpreter, provided you have installed
# the required libraries.
# Importing the required packages
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
from sklearn import tree
import collections
import pydotplus
# Function importing Dataset
column_count =0
def importdata():
balance_data = pd.read_csv( 'data1extended.txt', sep= ',')
row_count, column_count = balance_data.shape
# Printing the dataswet shape
print ("Dataset Length: ", len(balance_data))
print ("Dataset Shape: ", balance_data.shape)
print("Number of columns ", column_count)
# Printing the dataset obseravtions
print ("Dataset: ",balance_data.head())
return balance_data, column_count
def columns(balance_data):
row_count, column_count = balance_data.shape
return column_count
# Function to split the dataset
def splitdataset(balance_data, column_count):
# Separating the target variable
X = balance_data.values[:, 1:column_count]
Y = balance_data.values[:, 0]
# Splitting the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size = 0.3, random_state = 100)
return X, Y, X_train, X_test, y_train, y_test
# Function to perform training with giniIndex.
def train_using_gini(X_train, X_test, y_train):
# Creating the classifier object
clf_gini = DecisionTreeClassifier(criterion = "gini",
random_state = 100,max_depth=3, min_samples_leaf=5)
# Performing training
clf_gini.fit(X_train, y_train)
return clf_gini
# Function to perform training with entropy.
def tarin_using_entropy(X_train, X_test, y_train):
# Decision tree with entropy
clf_entropy = DecisionTreeClassifier(
criterion = "entropy", random_state = 100,
max_depth = 3, min_samples_leaf = 5)
# Performing training
clf_entropy.fit(X_train, y_train)
return clf_entropy
# Function to make predictions
def prediction(X_test, clf_object):
# Predicton on test with giniIndex
y_pred = clf_object.predict(X_test)
print("Predicted values:")
print(y_pred)
return y_pred
# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
print("Confusion Matrix: ",
confusion_matrix(y_test, y_pred))
print ("Accuracy : ",
accuracy_score(y_test,y_pred)*100)
print("Report : ",
classification_report(y_test, y_pred))
#Univariate selection
def selection(column_count, data):
# data = pd.read_csv("data1extended.txt")
X = data.iloc[:,1:column_count] #independent columns
y = data.iloc[:,0] #target column i.e price range
#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=5)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
df=pd.DataFrame(data, columns=X)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score'] #naming the dataframe columns
print(featureScores.nlargest(5,'Score')) #print 10 best features
return X,y,data,df
#Feature importance
def feature(X,y):
model = ExtraTreesClassifier()
model.fit(X,y)
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(5).plot(kind='barh')
plt.show()
#Correlation Matrix
def correlation(data, column_count):
corrmat = data.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(column_count,column_count))
#plot heat map
g=sns.heatmap(data[top_corr_features].corr(),annot=True,cmap="RdYlGn")
def generate_decision_tree(X,y):
clf = DecisionTreeClassifier(random_state=0)
data_feature_names = ['callersAtLeast1T','CalleesAtLeast1T','callersAllT','calleesAllT','CallersAtLeast1N','CalleesAtLeast1N','CallersAllN','CalleesAllN','childrenAtLeast1T','parentsAtLeast1T','childrenAtLeast1N','parentsAtLeast1N','childrenAllT','parentsAllT','childrenAllN','ParentsAllN','ParametersatLeast1T','FieldMethodsAtLeast1T','ReturnTypeAtLeast1T','ParametersAtLeast1N','FieldMethodsAtLeast1N','ReturnTypeN','ParametersAllT','FieldMethodsAllT','ParametersAllN','FieldMethodsAllN']
#generate model
model = clf.fit(X, y)
# Create DOT data
dot_data = tree.export_graphviz(clf, out_file=None,
feature_names=data_feature_names,
class_names=y)
# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)
# Show graph
Image(graph.create_png())
# Create PDF
graph.write_pdf("tree.pdf")
# Create PNG
graph.write_png("tree.png")
# Driver code
def main():
# Building Phase
data,column_count = importdata()
X, Y, X_train, X_test, y_train, y_test = splitdataset(data, column_count)
clf_gini = train_using_gini(X_train, X_test, y_train)
clf_entropy = tarin_using_entropy(X_train, X_test, y_train)
# Operational Phase
print("Results Using Gini Index:")
# Prediction using gini
y_pred_gini = prediction(X_test, clf_gini)
cal_accuracy(y_test, y_pred_gini)
print("Results Using Entropy:")
# Prediction using entropy
y_pred_entropy = prediction(X_test, clf_entropy)
cal_accuracy(y_test, y_pred_entropy)
#COMMENTED OUT THE 4 FOLLOWING LINES DUE TO MEMORY ERROR
#X,y,dataheaders,df=selection(column_count,data)
#generate_decision_tree(X,y)
#feature(X,y)
#correlation(dataheaders,column_count)
# Calling main function
if __name__=="__main__":
main()
I would suggest using Pipelines, to build data pipelines and GridSearchCV to find the best possible hyper-parameters and classifiers for the pipe.
A basic example;
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_class
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
pipe = Pipeline[('kbest', SelectKBest(chi2, k=3000)),
('clf', DecisionTreeClassifier())
])
pipe_params = {'kbest__k': range(1, 10, 1),
'kbest__score_func': [f_classif, chi2],
'clf__max_depth': np.arange(1,30),
'clf__min_samples_leaf': [1,2,4,5,10,20,30,40,80,100]}
grid_search = GridSearchCV(pipe, pipe_params, n_jobs=-1
scoring=accuracy_score, cv=10)
grid_search.fit(X_train, Y_train)
This will iterate over every hyper-parameters in pipe_params and choose the best classifier based on accuracy_score.
I am trying to optimize hyperparameters for ridge regression. But also add polynomial features. So, pipeline looks okay but getting error when try to gridsearchcv. Here:
# Importing the Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from collections import Counter
from IPython.core.display import display, HTML
sns.set_style('darkgrid')
# Data Preprocessing
from sklearn.datasets import load_boston
boston_dataset = load_boston()
dataset = pd.DataFrame(boston_dataset.data, columns = boston_dataset.feature_names)
dataset['MEDV'] = boston_dataset.target
# X and y Variables
X = dataset.iloc[:, 0:13].values
y = dataset.iloc[:, 13].values.reshape(-1,1)
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 25)
# Building the Model ------------------------------------------------------------------------
# Fitting regressior to the Training set
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
steps = [
('scalar', StandardScaler()),
('poly', PolynomialFeatures(degree=2)),
('model', Ridge())
]
ridge_pipe = Pipeline(steps)
ridge_pipe.fit(X_train, y_train)
# Predicting the Test set results
y_pred = ridge_pipe.predict(X_test)
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = ridge_pipe, X = X_train, y = y_train, cv = 10)
accuracies.mean()
#accuracies.std()
# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
parameters = [ {'alpha': np.arange(0, 0.2, 0.01) } ]
grid_search = GridSearchCV(estimator = ridge_pipe,
param_grid = parameters,
scoring = 'accuracy',
cv = 10,
n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train) # <-- GETTING ERROR IN HERE
Error:
ValueError: Invalid parameter ridge for estimator
What to do or, is there a better way to use ridge regression with pipeline? I would be pleased if put some sources about gridsearch because I am a newbie on this. The error:
There are two problems in your code. First since you are using a pipeline, you need to specify in the params list which part of the pipeline does the params belongs to. See the official documentation for more information :
The purpose of the pipeline is to assemble several steps that can be
cross-validated together while setting different parameters. For this,
it enables setting parameters of the various steps using their names
and the parameter name separated by a ‘__’, as in the example below
In this case, since alpha is going to be used with ridge-regression and you have used the string model in the Pipeline defintion, you need to rename the key alpha to model_alpha:
steps = [
('scalar', StandardScaler()),
('poly', PolynomialFeatures(degree=2)),
('model', Ridge()) # <------ Whatever string you assign here will be used later
]
# Since you have named it as 'model', you need change it to 'model_alpha'
parameters = [ {'model__alpha': np.arange(0, 0.2, 0.01) } ]
Next, you need to understand this dataset is for Regression. You should not use accuracy here, instead use a regression based scoring function like, mean_squared_error. Here are some other metrics for regression that you can use. Something like this
from sklearn.metrics import mean_squared_error, make_scorer
scoring_func = make_scorer(mean_squared_error)
grid_search = GridSearchCV(estimator = ridge_pipe,
param_grid = parameters,
scoring = scoring_func, #<--- Use the scoring func defined above
cv = 10,
n_jobs = -1)
Here is a link to a Google colab notebook with working code.
For the GridSearchCV parameters, the parameter name for ridge should be 'ridge__alpha' (note 2 underscores) instead of just 'alpha'.
I have a dataset, which has previously been split into 3 sets: train, validation and test. These sets have to be used as given in order to compare the performance across different algorithms.
I would now like to optimize the parameters of my SVM using the validation set. However, I cannot find how to input the validation set explicitly into sklearn.grid_search.GridSearchCV(). Below is some code I've previously used for doing K-fold cross-validation on the training set. However, for this problem I need to use the validation set as given. How can I do that?
from sklearn import svm, cross_validation
from sklearn.grid_search import GridSearchCV
# (some code left out to simplify things)
skf = cross_validation.StratifiedKFold(y_train, n_folds=5, shuffle = True)
clf = GridSearchCV(svm.SVC(tol=0.005, cache_size=6000,
class_weight=penalty_weights),
param_grid=tuned_parameters,
n_jobs=2,
pre_dispatch="n_jobs",
cv=skf,
scoring=scorer)
clf.fit(X_train, y_train)
Use PredefinedSplit
ps = PredefinedSplit(test_fold=your_test_fold)
then set cv=ps in GridSearchCV
test_fold : “array-like, shape (n_samples,)
test_fold[i] gives the test set fold of sample i. A value of -1 indicates that the corresponding sample is not part of any test set folds, but will instead always be put into the training fold.
Also see here
when using a validation set, set the test_fold to 0 for all samples that are part of the validation set, and to -1 for all other samples.
Consider using the hypopt Python package (pip install hypopt) for which I am an author. It's a professional package created specifically for parameter optimization with a validation set. It works with any scikit-learn model out-of-the-box and can be used with Tensorflow, PyTorch, Caffe2, etc. as well.
# Code from https://github.com/cgnorthcutt/hypopt
# Assuming you already have train, test, val sets and a model.
from hypopt import GridSearch
param_grid = [
{'C': [1, 10, 100], 'kernel': ['linear']},
{'C': [1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
# Grid-search all parameter combinations using a validation set.
opt = GridSearch(model = SVR(), param_grid = param_grid)
opt.fit(X_train, y_train, X_val, y_val)
print('Test Score for Optimized Parameters:', opt.score(X_test, y_test))
EDIT: I (think I) received -1's on this response because I'm suggesting a package that I authored. This is unfortunate, given that the package was created specifically to solve this type of problem.
# Import Libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import PredefinedSplit
# Split Data to Train and Validation
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.8, stratify = y,random_state = 2020)
# Create a list where train data indices are -1 and validation data indices are 0
split_index = [-1 if x in X_train.index else 0 for x in X.index]
# Use the list to create PredefinedSplit
pds = PredefinedSplit(test_fold = split_index)
# Use PredefinedSplit in GridSearchCV
clf = GridSearchCV(estimator = estimator,
cv=pds,
param_grid=param_grid)
# Fit with all data
clf.fit(X, y)
To add to the #Vinubalan's answer, when the train-valid-test split is not done with Scikit-learn's train_test_split() function, i.e., the dataframes are already split manually beforehand and scaled/normalized so as to prevent leakage from training data, the numpy arrays can be concatenated.
import numpy as np
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
from sklearn.model_selection import PredefinedSplit, GridSearchCV
split_index = [-1]*len(X_train) + [0]*len(X_val)
X = np.concatenate((X_train, X_val), axis=0)
y = np.concatenate((y_train, y_val), axis=0)
pds = PredefinedSplit(test_fold = split_index)
clf = GridSearchCV(estimator = estimator,
cv=pds,
param_grid=param_grid)
# Fit with all data
clf.fit(X, y)
I wanted to provide some reproducible code that creates a validation split using the last 20% of observations.
from sklearn import datasets
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
# load data
df_train = datasets.fetch_california_housing(as_frame=True).data
y = datasets.fetch_california_housing().target
param_grid = {"max_depth": [5, 6],
'learning_rate': [0.03, 0.06],
'subsample': [.5, .75]
}
model = GradientBoostingRegressor()
# Create a single validation split
val_prop = .2
n_val_rows = round(len(df_train) * val_prop)
val_starting_index = len(df_train) - n_val_rows
cv = PredefinedSplit([-1 if i < val_starting_index else 0 for i in df_train.index])
# Use PredefinedSplit in GridSearchCV
results = GridSearchCV(estimator = model,
cv=cv,
param_grid=param_grid,
verbose=True,
n_jobs=-1)
# Fit with all data
results.fit(df_train, y)
results.best_params_
The cv argument of the SearchCV i.e. Grid or Random can just be an iterable of indices too for train and validation split i.e. cv=((train_idcs, val_idcs),).
Note that the data on which the search classifier will be fit should be the train+val set and the indices specified will be used by the sklearn to separate them internally. Additionally, when working with dataframes, the indices specified should be accessible as ilocs, so reset indices (don't drop them if they will be required later).
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
train_test_split,
RandomizedSearchCV,
)
data = load_iris(as_frame=True)["frame"]
# These indices will serves as explicit and predefined split
train_idcs, val_idcs = train_test_split(
data.index,
random_state=42,
stratify=data.target,
)
param_grid = dict(
n_estimators=[50,100,150,200],
max_samples=[0.85,0.9,0.95,1],
max_depth=[3,5,7,10],
max_features=["sqrt", "log2", 0.85, 0.9, 0.95, 1],
)
search_clf = RandomizedSearchCV(
estimator=RandomForestClassifier(),
param_distributions=param_grid,
n_iter=50,
cv=((train_idcs, val_idcs),), # explicit predefined split in terms of indices
random_state=42,
)
# X is the first 4 columns i.e. the sepal and petal widths and lengths
# and y is the 5th column i.e. target column
search_clf.fit(X=data.iloc[:,:4], y=data.target)
Also, be mindful if you want to refit on the whole data or only on the train data and thus retrain the classifier using the best fit parameters accordingly.