I am trying to use grid-search cross-validation to find the best value of the hyperparameter C. I split the MNIST 784 dataset into two subsets of 50% each, and used only one of the two subsets, with 60% for training and 40% for testing.
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
import numpy as np
mnist = fetch_openml('mnist_784')
X, y = mnist['data'], mnist['target']
X_1, X_2, y_1, y_2 = train_test_split(X, y, test_size=0.5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.4)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
svm = LinearSVC(dual=False, max_iter=10000)
param_grid = {'C': [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]}
grid = GridSearchCV(svm, param_grid, scoring='accuracy')
grid.fit(X_train_scaled, y_train)
print("Best value of C:", grid.best_params_['C'])
accuracy = grid.score(X_test_scaled, y_test)
print("Test accuracy:", accuracy)
I have tried everything without any progress.
I tried min-max normalizing the data as well as standardizing it, and also tried increasing max_iter to 10000 and then 15000.
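For reference, a minimal sketch of that rescaling attempt, continuing from the split above (assuming MinMaxScaler is what is meant by min-max normalizing):
from sklearn.preprocessing import MinMaxScaler

# MNIST pixels are in [0, 255], so min-max scaling maps them to [0, 1]
mm = MinMaxScaler()
X_train_mm = mm.fit_transform(X_train)  # fit only on the training data
X_test_mm = mm.transform(X_test)        # reuse the training statistics

svm = LinearSVC(dual=False, max_iter=15000)
grid = GridSearchCV(svm, param_grid, scoring='accuracy')
grid.fit(X_train_mm, y_train)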
I have a binary classification problem. I've been using cross-validation to optimize the ElasticNet parameters. However, ElasticNet only seems to work when I supply roc_auc as the scoring method during CV. I also want to test a wider range of scoring methods, in particular accuracy. Specifically, when using accuracy, ElasticNet returns this error:
ValueError: Classification metrics can't handle a mix of binary and continuous targets
However, my y targets are indeed binary. Below is a replication of my problem using the dataset from here:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
data = pd.read_csv('data 2.csv')
# by default majority class (benign) will be negative
lb = LabelBinarizer()
data['diagnosis'] = lb.fit_transform(data['diagnosis'].values).ravel()  # ravel to a 1-D column
targets = data['diagnosis']
data.drop(['id', 'diagnosis', 'Unnamed: 32'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(data, targets, stratify=targets)
#elastic net logistic regression
lr = ElasticNet(max_iter=2000)
scorer = 'accuracy'
param_grid = {
    'alpha': [1e-4, 1e-3, 1e-2, 0.1, 1, 5, 10],  # 0.01 duplicates 1e-2, so it is listed once
    'l1_ratio': np.arange(0.2, 0.9, 0.1)
}
skf = StratifiedKFold(n_splits=10)
clf = GridSearchCV(lr, param_grid, scoring=scorer, cv=skf,
                   return_train_score=True, n_jobs=-1)
clf.fit(X_train.values, y_train.values)
I figured that ElasticNet might be trying to solve a linear regression problem, so I tried lr = LogisticRegression(penalty='elasticnet', l1_ratios=[0.1, 0.5, 0.9], solver='saga') as the classifier, but the same problem persists.
If I use scorer = 'roc_auc' as the scoring metric, then the model is built as expected.
Also, as a sanity check to see if there is something wrong with the data, I tried the same with a random forest classifier, and there the problem disappears:
# random forest
clf = RandomForestClassifier(n_jobs=-1)
param_grid = {
    'min_samples_split': [3, 5, 10],
    'n_estimators': [100, 300],
    'max_depth': [3, 5, 15, 25],
    'max_features': [3, 5, 10, 20]
}
skf = StratifiedKFold(n_splits=10)
scorer = 'accuracy'
grid_search = GridSearchCV(clf, param_grid, scoring=scorer,
                           cv=skf, return_train_score=True, n_jobs=-1)
grid_search.fit(X_train.values, y_train.values)
Has anyone got any ideas on what's happening here?
ElasticNet is a regression model: its predict method returns continuous values, so classification metrics such as accuracy (which need discrete labels) fail, while roc_auc happens to work because it accepts continuous scores.
If you want an ElasticNet penalty in classification, use LogisticRegression (note that saga is the only solver that supports the elasticnet penalty):
lr = LogisticRegression(solver="saga", penalty="elasticnet")
Minimal Reproducible Example:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
lr = LogisticRegression(solver="saga", penalty="elasticnet", max_iter=2000)
param_grid = {
    'l1_ratio': np.arange(0.2, 0.9, 0.1)
}
clf = GridSearchCV(lr, param_grid, scoring='accuracy',
                   cv=StratifiedKFold(n_splits=10),
                   return_train_score=True, n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
I am currently attempting to train a neural network that predicts a 1 kHz sine wave.
While the model scores 0.89 on the training data (the R² returned by MLPRegressor.score), it does not accurately predict my test data.
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
#Generate 1kHz sine wave
pi = np.pi
X = np.arange(0,2*pi,0.05)
y = np.sin(1000*X)
tscv = TimeSeriesSplit(n_splits=3, test_size=30)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
plt.plot(X_train, y_train)
plt.plot(X_test, y_test)
#Training using train samples
sc = StandardScaler()
X_train = sc.fit_transform(X_train.reshape(-1,1))
# use transform (not fit_transform) so the test data reuses the training statistics
X_test = sc.transform(X_test.reshape(-1,1))
regr = MLPRegressor(random_state=1, max_iter=1000, hidden_layer_sizes=(32, 32))
regr.fit(X_train, y_train)
plt.plot(X_train, regr.predict(X_train), color='red')
plt.scatter(X_train, y_train)
regr.score(X_train, y_train)
[Plot: result of training]
plt.scatter(X_test, y_test, color = 'red')
plt.plot(X_test, regr.predict(X_test))
[Plot: result of test]
As you can see, the plotted test data is far less periodic than the model's fit on the training data. Why is this the case?
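For scale: each 0.05 step advances the argument of np.sin(1000*X) by 1000 × 0.05 = 50 radians, roughly 8 full periods between consecutive samples. A quick sketch to visualize that sampling:
import numpy as np
import matplotlib.pyplot as plt

X_dense = np.arange(0, 0.05, 1e-5)       # resolves the oscillation (period 2*pi/1000 ~ 0.0063)
X_coarse = np.arange(0, 2*np.pi, 0.05)   # the grid used above

plt.plot(X_dense, np.sin(1000*X_dense), label='densely sampled sine')
plt.scatter(X_coarse[:2], np.sin(1000*X_coarse[:2]), color='red',
            label='first two grid points')
plt.legend()
plt.show()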
I have the following unbalanced dataset with two features (keon, i.e. gender, and alder, i.e. age), which I balanced using an under-sampling method and then trained on different classifiers to predict call_ending_reason, where 0 is No and 1 is Yes:
In the balanced dataset, the 1 and 0 classes have the same kind of distribution, which can be visualized like this:
However, after performing under-sampling on the dataset shown above and training various sklearn classifiers on both versions of the data, the balanced dataset detects 1s with high precision but 0s with very low precision. The opposite happens when I use the original dataset.
Here is the code:
y = filtered_data_limited_features_with_yes_no['call_ending_reason']
x = filtered_data_limited_features_with_yes_no.drop(columns=['call_ending_reason'])
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size = 0.80)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
# rfc = MLPClassifier(verbose=True,hidden_layer_sizes=(100,50,10),learning_rate='constant',learning_rate_init=0.0001, n_iter_no_change=50, max_iter=100)
# rfc = GaussianNB()
rfc=RandomForestClassifier()
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_features': ['sqrt', 'log2'],  # 'auto' was removed in recent scikit-learn versions
    'criterion': ['gini', 'entropy']
}
CV_rfc_all_data = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 10)
# rfc = LinearSVC()
CV_rfc_all_data.fit(X_train, y_train)
from sklearn.metrics import classification_report
print(classification_report(y_test, CV_rfc_all_data.predict(X_test)))
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=1)
df_balanced, balanced_labels = rus.fit_resample(x, y)
####TRAINING AND PREDICTING CLASSIFIER BASED ON BALANCED DATASET
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(df_balanced, balanced_labels, train_size = 0.70)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
# rfc=RandomForestClassifier()
# param_grid = {
# 'n_estimators': [50,100,200,500],
# 'max_features': ['auto', 'sqrt', 'log2'],
# 'criterion' :['gini', 'entropy']
# }
# CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 10)
# CV_rfc = MLPClassifier(verbose=True,hidden_layer_sizes=(100,50,10),learning_rate='invscaling',learning_rate_init=0.0003, n_iter_no_change=50, max_iter=100)
CV_rfc = DecisionTreeClassifier()
CV_rfc.fit(X_train, y_train)
# CV_rfc.best_params_
Questions:
Given the visualization:
Which classifier should be used to achieve more than 65% precision for predicting both 1s and 0s?
Do I need to scale the data, given there are only 2 features? If so, how do I properly scale both the training and the testing data?
You can try setting the class_weight="balanced" argument in the models; it is supported by most of the models in scikit-learn. It won't be magic, but in my experience it usually works better than under- or over-sampling.
For the metric used in your grid search, I would use the f1_score, as suggested by @Erwan: it heavily penalizes poor precision and poor recall, and rewards hyperparameters that yield a more balanced model.
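A minimal sketch of both suggestions combined, reusing the random forest grid from the question (the parameter values are illustrative):
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# class_weight="balanced" reweights samples inversely to class frequency,
# so the minority class contributes as much to the loss as the majority
rfc = RandomForestClassifier(class_weight="balanced")

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2'],
}

# f1 (the harmonic mean of precision and recall) rewards models that are
# balanced between the two, unlike plain accuracy
grid = GridSearchCV(rfc, param_grid, scoring='f1', cv=10)
grid.fit(X_train, y_train)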
I've trained a simple random forest and a bagging classifier (n_estimators = 100). Is it possible to plot the history of accuracy in the BaggingClassifier? And how can I calculate the variance across the 100 samples?
I've just printed the accuracy value for both algorithms:
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split

# DecisionTree
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.90)
clf2 = tree.DecisionTreeClassifier()
clf2.fit(X_train, y_train)
pred2 = clf2.predict(X_test)
acc2 = clf2.score(X_test, y_test)
acc2 # 0.6983930778739185

# Bagging
clf3 = BaggingClassifier(tree.DecisionTreeClassifier(), max_samples=0.5,
                         max_features=0.5, n_estimators=100, verbose=2)
clf3.fit(X_train, y_train)
pred3 = clf3.predict(X_test)
acc3 = clf3.score(X_test, y_test)
acc3 # 0.911619283065513
I don't think that you can get this information from the fitted BaggingClassifier. But you can create such a plot by fitting for different n_estimators:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
X, X_test, y, y_test = train_test_split(iris.data, iris.target, test_size=0.20)
estimators = list(range(1, 20))
accuracy = []
for n_estimators in estimators:
    clf = BaggingClassifier(DecisionTreeClassifier(max_depth=1),
                            max_samples=0.2,
                            n_estimators=n_estimators)
    clf.fit(X, y)
    acc = clf.score(X_test, y_test)
    accuracy.append(acc)
plt.plot(estimators, accuracy)
plt.xlabel("Number of estimators")
plt.ylabel("Accuracy")
plt.show()
(Of course, the iris dataset is easily fit with just a single DecisionTreeClassifier, so I set max_depth=1 in this example.)
For a statistically meaningful result, you should fit a BaggingClassifier multiple times for each n_estimators and take the average of the obtained accuracies.
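A sketch of that averaging, continuing from the example above (n_repeats = 10 is an arbitrary choice):
import numpy as np

n_repeats = 10  # arbitrary; more repeats give a smoother curve
mean_accuracy = []
for n_estimators in estimators:
    scores = []
    for _ in range(n_repeats):
        clf = BaggingClassifier(DecisionTreeClassifier(max_depth=1),
                                max_samples=0.2,
                                n_estimators=n_estimators)
        clf.fit(X, y)
        scores.append(clf.score(X_test, y_test))
    mean_accuracy.append(np.mean(scores))

plt.plot(estimators, mean_accuracy)
plt.xlabel("Number of estimators")
plt.ylabel("Mean accuracy over %d fits" % n_repeats)
plt.show()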
I want to have metrics per class label and an aggregate confusion matrix from a cross-validation in scikit-learn.
I wrote a method that performs a cross-validation, sums the confusion matrices across folds, and also stores all the predicted labels. It then calls scikit-learn methods to print out the metrics.
The code below should run with any recent scikit-learn installation; you can test it out with any dataset.
Is the code below the correct way to gather an aggregate confusion matrix and a classification_report when doing StratifiedKFold cross-validation?
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
import numpy as np

def customCrossValidation(X, y, classifier, n_folds=10, shuffle=True, random_state=0):
    '''Perform a cross-validation and print out the metrics.'''
    # StratifiedKFold moved from sklearn.cross_validation to
    # sklearn.model_selection; the fold indices now come from skf.split(X, y)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=shuffle, random_state=random_state)
    cm = None
    y_predicted_overall = None
    y_test_overall = None
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        classifier.fit(X_train, y_train)
        y_predicted = classifier.predict(X_test)
        # collect the y_predicted per fold
        if y_predicted_overall is None:
            y_predicted_overall = y_predicted
            y_test_overall = y_test
        else:
            y_predicted_overall = np.concatenate([y_predicted_overall, y_predicted])
            y_test_overall = np.concatenate([y_test_overall, y_test])
        cv_cm = metrics.confusion_matrix(y_test, y_predicted)
        # sum the confusion matrices per fold
        if cm is None:
            cm = cv_cm
        else:
            cm += cv_cm
    print(metrics.classification_report(y_test_overall, y_predicted_overall, digits=3))
    print(cm)
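A minimal usage sketch, with iris and LogisticRegression as illustrative stand-ins for your own data and classifier:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
# X and y must be numpy arrays, since the function indexes them with
# the integer arrays returned by skf.split
customCrossValidation(iris.data, iris.target,
                      LogisticRegression(max_iter=1000),
                      n_folds=5)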