I am trying to run my lightgbm for feature selection as below;
initialization
# Zero-filled accumulator that the feature importances are added to.
# NOTE(review): its length is taken from features_sample, while the model is
# fitted on train_X further down; if the two do not have the same number of
# columns, adding model.feature_importances_ to it cannot broadcast -- confirm
# that both names refer to the same feature matrix.
feature_importances = np.zeros(features_sample.shape[1])
# Binary classifier using GOSS boosting and balanced class weights; the large
# n_estimators is only an upper bound because fitting uses early stopping.
model = lgb.LGBMClassifier(objective='binary',
boosting_type = 'goss',
n_estimators = 10000, class_weight ='balanced')
then i fit the model as below
# Fit the model twice, each time on a different random train/validation
# split, and sum the feature importances of both fits.
for i in range(2):
    # Split into training and validation set; the loop index seeds the split
    # so the two rounds see different partitions.
    train_features, valid_features, train_y, valid_y = train_test_split(
        train_X, train_Y, test_size=0.25, random_state=i)
    # Train using early stopping on the validation AUC
    model.fit(train_features, train_y, early_stopping_rounds=100,
              eval_set=[(valid_features, valid_y)],
              eval_metric='auc', verbose=200)
    # Record the feature importances
    feature_importances += model.feature_importances_
but i get the below error
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is: [6] valid_0's auc: 0.88648
ValueError: operands could not be broadcast together with shapes (87,) (83,) (87,)
An example for getting feature importance in lightgbm when using train model.
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def plotImp(model, X, num=20, fig_size=(40, 20)):
    """Plot the `num` most important LightGBM features as a horizontal bar chart.

    The figure is written to 'lgbm_importances-01.png' and then shown.

    Args:
        model: Trained LightGBM model. A natively trained Booster is read
            through ``feature_importance()``; a scikit-learn style estimator
            is read through ``feature_importances_``.
        X: DataFrame whose columns name the features, in training order.
        num: Number of top features to draw.
        fig_size: Figure size passed to matplotlib.
    """
    # The two LightGBM APIs expose importances under different names.
    if callable(getattr(model, "feature_importance", None)):
        importances = model.feature_importance()
    else:
        importances = model.feature_importances_
    feature_imp = pd.DataFrame({'Value': importances, 'Feature': X.columns})
    top_features = feature_imp.sort_values(by="Value", ascending=False)[0:num]
    plt.figure(figsize=fig_size)
    sns.set(font_scale=5)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances-01.png')
    plt.show()
Depending on whether we trained the model using scikit-learn or lightgbm methods, to get importance we should choose respectively feature_importances_ property or feature_importance() function, like in this example (where model is a result of lgbm.fit() / lgbm.train(), and train_columns = x_train_df.columns):
import pandas as pd
def get_lgbm_varimp(model, train_columns, max_vars=50):
    """Return the most important features of a trained LightGBM model.

    Works with both LightGBM APIs: a natively trained Booster exposes the
    ``feature_importance()`` method, whereas the scikit-learn estimators
    (LGBMClassifier / LGBMRegressor) expose ``feature_importances_``.

    Args:
        model: Trained model offering one of the two accessors above.
        train_columns: Feature names, in the column order used for training.
        max_vars: Maximum number of rows to return.

    Returns:
        DataFrame with the columns ``feature_name`` and ``varimp``, sorted by
        ``varimp`` in descending order and limited to `max_vars` rows.

    Raises:
        ValueError: If the number of names and importances differ.
    """
    # Duck-type on the accessor instead of matching the class name as text,
    # so subclasses and wrapped boosters are recognised as well.
    if callable(getattr(model, "feature_importance", None)):
        importances = model.feature_importance()
    else:
        importances = model.feature_importances_
    # Building the frame column-wise keeps the importances numeric.
    cv_varimp_df = pd.DataFrame({
        'feature_name': list(train_columns),
        'varimp': list(importances),
    })
    cv_varimp_df = cv_varimp_df.sort_values(by='varimp', ascending=False)
    return cv_varimp_df.iloc[0:max_vars]
Note that we rely on the assumption that feature importance values are ordered just like the model matrix columns were ordered during training (incl. one-hot dummy cols), see LightGBM #209.
For the LightGBM's 3.1.1 version, extending the comment of #user3067175 :
pd.DataFrame({'Value':model.feature_importance(),'Feature':features}).sort_values(by="Value",ascending=False)
Here features is a list of feature names in the same order as the columns of your dataset; it can be obtained with features = df_train.columns.tolist().
This should return the feature importance with the same order of plot.
Note: If you use LGBMRegressor or LGBMClassifier, you should use
pd.DataFrame({'Value':model.feature_importances_,'Feature':features}).sort_values(by="Value",ascending=False)
If you want to examine a loaded model that you don't have the training data, you can get feature importance and the feature name by
# Pair every feature name stored in the loaded model with its importance and
# order the table from the most to the least important feature.
df_feature_importance = pd.DataFrame(
    {'feature': model.feature_name(), 'importance': model.feature_importance()}
).sort_values('importance', ascending=False)
Related
Good morning, I'm new in machine learning and neural networks. I am trying to build a fully connected neural network to solve a regression problem. The dataset is composed by 18 features and 1 label, and all of these are physical quantities.
You can find the code below. I have also uploaded a figure showing the evolution of the loss function over the epochs (you can find it below). I am not sure whether there is overfitting. Can someone explain to me why there is or is not overfitting?
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from keras import optimizers
from sklearn.metrics import r2_score
from keras import regularizers
from keras import backend
from tensorflow.keras import regularizers
from keras.regularizers import l2
# =============================================================================
# Choose the test size
# =============================================================================
test_size = 0.2
# The file is read as semicolon-separated with a comma as decimal mark.
dataset = pd.read_csv('DataSet.csv', decimal=',', delimiter = ";")
# The target is taken by position (last column), the features by dropping the
# column named 'Label' -- presumably the same column; verify against the file.
label = dataset.iloc[:,-1]
features = dataset.drop(columns = ['Label'])
# Range of the raw target over the whole dataset; read by denormalize().
y_max_pre_normalize = max(label)
y_min_pre_normalize = min(label)
def denormalize(y):
    """Map a min-max scaled target value back to the original target range.

    Uses the module-level y_min_pre_normalize / y_max_pre_normalize, i.e. the
    minimum and maximum of the raw label column.

    Args:
        y: Scaled value(s); scalars and arrays both work because only
            arithmetic operators are applied.

    Returns:
        The value(s) rescaled to [y_min_pre_normalize, y_max_pre_normalize].
    """
    value_range = y_max_pre_normalize - y_min_pre_normalize
    return y * value_range + y_min_pre_normalize
# =============================================================================
# Split
# =============================================================================
X_train1, X_test1, y_train1, y_test1 = train_test_split(features, label, test_size = test_size, shuffle = True)
# MinMaxScaler works on 2-D input, so turn the target series into frames.
y_test2 = y_test1.to_frame()
y_train2 = y_train1.to_frame()
# =============================================================================
# Normalize
# =============================================================================
# Fit each scaler on the training split only and reuse it for the test split.
# Fitting a second scaler on the test data would map train and test onto
# different scales, so the network would be evaluated on inputs that are not
# comparable to what it was trained on.
scaler1 = preprocessing.MinMaxScaler()
X_train = scaler1.fit_transform(X_train1)
X_test = scaler1.transform(X_test1)
scaler3 = preprocessing.MinMaxScaler()
y_train = scaler3.fit_transform(y_train2)
y_test = scaler3.transform(y_test2)
# =============================================================================
# Build the network
# =============================================================================
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
# Three hidden ReLU layers of 60 units (dropout after the first two) and a
# single linear output unit for the regression target.
model = Sequential()
model.add(Dense(60, input_shape = (X_train.shape[1],), activation = 'relu',kernel_initializer='glorot_uniform'))
model.add(Dropout(0.2))
model.add(Dense(60, activation = 'relu',kernel_initializer='glorot_uniform'))
model.add(Dropout(0.2))
model.add(Dense(60, activation = 'relu',kernel_initializer='glorot_uniform'))
model.add(Dense(1,activation = 'linear',kernel_initializer='glorot_uniform'))
model.compile(loss = 'mse', optimizer = optimizer, metrics = ['mse'])
# Keras holds out 10% of the training data as the validation set.
history = model.fit(X_train, y_train, epochs = 100,
                    validation_split = 0.1, shuffle=True, batch_size=250
                    )
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
# The network predicts in the scaled target space defined by scaler3 (fitted
# on the training targets), so the predictions are mapped back with that same
# scaler. Rescaling with the min/max of the whole label column is only correct
# when the training split happens to contain both extremes.
y_train_pred = scaler3.inverse_transform(model.predict(X_train))
y_test_pred = scaler3.inverse_transform(model.predict(X_test))
def _plot_true_vs_predicted(y_true, y_pred, title):
    """Scatter predicted against true values together with the y = x line."""
    plt.figure()
    # The marker is given once, by keyword; combining a '.' format string with
    # marker='o' defines it twice and makes matplotlib warn.
    plt.plot(y_true, y_pred, linestyle='None', color='darkviolet', alpha=1, marker='o', markersize = 2, markeredgecolor = 'black', markeredgewidth = 0.1)
    # Reference diagonal: a perfect prediction lies on this line.
    plt.plot(np.array((-0.1,7)), np.array((-0.1,7)), '-', color='magenta')
    plt.xlabel('True')
    plt.ylabel('Predicted')
    plt.title(title)


_plot_true_vs_predicted(y_test1, y_test_pred, 'Test')
_plot_true_vs_predicted(y_train1, y_train_pred, 'Train')

# Training and validation loss per epoch.
plt.figure()
plt.plot(loss_values,'b',label = 'training loss')
plt.plot(val_loss_values,'r',label = 'val training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss Function')
plt.legend()
from sklearn import metrics

# r2_score expects (y_true, y_pred). R2 is not symmetric in its arguments, so
# passing the predictions first reports a different (wrong) score.
print("\n\nThe R2 score on the test set is:\t{:0.3f}".format(r2_score(y_test1, y_test_pred)))
print("The R2 score on the train set is:\t{:0.3f}".format(r2_score(y_train1, y_train_pred)))
# Measure MSE error.
score = metrics.mean_squared_error(y_test1, y_test_pred)
print("\n\nFinal score test (MSE): %0.4f" %(score))
score1 = metrics.mean_squared_error(y_train1, y_train_pred)
print("Final score train (MSE): %0.4f" %(score1))
# RMSE is the square root of the MSE values computed above.
score2 = np.sqrt(score)
print("Final score test (RMSE): %0.4f" %(score2))
score3 = np.sqrt(score1)
print("Final score train (RMSE): %0.4f" %(score3))
EDIT:
I also tried using feature importances and raising n_epochs; these are the results:
Feature Importance:
No Feature Importance:
Looks like you don't have overfitting! Your training and validation curves are descending together and converging. The clearest sign you could get of overfitting would be a deviation between these two curves, something like this:
Since your two curves are descending and are not diverging, it indicates your NN training is healthy.
HOWEVER! Your validation curve is suspiciously below the training curve. This hints at possible data leakage (train and test data have been mixed somehow). More info in a nice and short blog post. In general, you should split the data before any other preprocessing (normalizing, augmentation, shuffling, etc...).
Other causes for this could be some type of regularization (dropout, BN, etc..) that is active while computing the training accuracy and it's deactivated when computing the Validation/Test accuracy.
Overfitting is when the model does not generalize to data other than the training data. When this happens you will have a very (!) low training loss but a high validation loss. You can think of it this way: if you have N points you can fit a polynomial of degree N-1 such that you have zero training loss (your model hits all your training points perfectly). But if you apply that model to some other data, it will most likely produce a very high error (see the image below). Here the red line is our model and the green is the true data (+ noise), and you can see that in the last picture we get zero training error. In the first picture, our model is too simple (high train/high validation error), the second is good (low train/low validation error), and the third and last is too complex, i.e. overfitting (very low train/high validation error).
Neural network can work in the same way, so by looking at your training vs validation error, you can conclude if it overfits or not
No, this is not overfitting as your validation loss isn´t increasing.
Nevertheless, if I were you I would be a little bit skeptical. Try to train your model for even more epochs and watch out for the validation loss.
What you definitely should do, is to observe the following:
- are there duplicates or near-duplicates in the data (creates information leakage from train to test validation split)
- are there features that have a causal connection to the target variable
Edit:
Usually, you have some random component in a real-world dataset, so that rules that are observed in train data aren´t 100% true for validation data.
Your plot shows that the validation loss is even more decreasing as train loss decreases. Usually, you get to some point in training, where the rules you observe in train data are too specific to describe the whole data. That´s when overfitting begins. Hence, it is weird, that your validation loss doesn´t increase again.
Please check whether your validation loss approaches zero when you´re training for more epochs. If it´s the case I would check your database very carefully.
Let´s assume, that there is a kind of information leakage from the train set to the validation set (through duplicate records for example). Your model would change the weights to describe very specific rules. When applying your model to new data it would fail miserably since the observed connections are not really general.
Another common data problem is, that features may have an inversed causality.
The thing that validation loss is generally lower than train error is probably depending on dropout and regularization, since it´s applied while training but not for predicting/testing.
I put some emphasis on this because a tiny bug or an error in the data can "fuck up" your whole model.
I am new to statistical modelling, so please forgive me if I am mistaken about this.
I am currently working on a function in Python which will predict the accuracy score of a logistic regression model on the test data set. The user will have the flexibility to supply model parameters/coefficients (other than the ones generated by training the model; this is part of the requirement). I have functional code which updates the coefficients, but the accuracy and the predictions on the test data set stay the same no matter which model parameters I supply. My understanding is that the score on the test set should change if I change the model coefficients?
I am using statsmodel library to make things easier for me and following this link. Can someone please help me understand what am I missing ? Below is the code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.formula.api as sm
from sklearn.model_selection import train_test_split
# Load the raw data; the file has no header row, so column names are supplied.
data = pd.read_csv("E:\\Dev\\testing\\rawdata.txt", header=None,
names=['Exam1', 'Exam2', 'Admitted'])
X = data.copy() # our training data
y = X.Admitted.copy() # copy the "y" column values out
X.drop(['Admitted'], axis=1, inplace=True) # then, drop y column
# manually add the intercept
X['intercept'] = 1.0 # so we don't need to use sm.add_constant every time
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
model = sm.Logit(y_train, X_train)
result = model.fit()
print("old parameters :\n" + str(list(result.params)))
# New parameters supplied by the user, keyed by column name
mdict = { 'Exam1':10000000.2234, 'Exam2':1.1233423, 'intercept':2313.423 }
# NOTE(review): the script reports that predictions do not change after this
# assignment, so result.predict() presumably keeps using the fitted
# parameters. Passing the new parameters explicitly, e.g. through
# model.predict(params, exog), may be needed -- confirm against the
# statsmodels documentation.
result.params = mdict
print("New parameters: \n"+str(result.params))
def logitPredict(modelParams, X, threshold=0.5):
    """Turn predicted probabilities into 0/1 class labels.

    Args:
        modelParams: Object with a ``predict(X)`` method that returns an
            iterable of probabilities (the fitted results object here).
        X: Data passed unchanged to ``modelParams.predict``.
        threshold: Probabilities greater than or equal to this value are
            labelled 1, all others 0. Defaults to 0.5.

    Returns:
        List of ints (0 or 1), one per predicted probability.
    """
    probabilities = modelParams.predict(X)
    return [1 if p >= threshold else 0 for p in probabilities]
# Classify the test rows using a probability threshold of 0.5
predictions = logitPredict(result, X_test, .5)
# Fraction of test rows whose predicted class equals the true class
accuracy = np.mean(predictions == y_test)
# Observed: the accuracy always stays the same as for the fitted model
print ('accuracy = {0}%'.format(accuracy*100) )
# Single hand-made sample to score
myExams = pd.DataFrame({'Exam1': [40.], 'Exam2': [78.], 'intercept': [1.]})
# Bare expression; presumably there to display the frame in a notebook cell
myExams
print ('Your probability = {0}%'.format(result.predict(myExams)[0]*100))
I'm new to machine learning and trying Sklearn for the first time. I have two dataframes, one with data to train a logistic regression model (with 10-fold cross-validation) and another one to predict classes ('0,1') using that model.
Here's my code so far using bits of tutorials I found on Sklearn docs and on the Web:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import normalize
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
# Import dataframe with training data
df = pd.read_csv('summary_44.csv')
cols = df.columns.drop('num_class') # Data to use (num_class is the column with the classes)
# Import dataframe with data to predict
df_pred = pd.read_csv('new_predictions.csv')
# Positional selection uses .iloc; the old .ix indexer has been removed from
# pandas.
# Scores: every column except the last one
df_data = df.iloc[:, :-1].values
# Target: the last column
df_target = df.iloc[:, -1].values
# Values to predict
df_test = df_pred.iloc[:, :-1].values
# Scores' names
df_data_names = cols.values
# Scaling
# NOTE(review): the two datasets are standardised independently of each
# other, so the data to predict is not put on the scale the model is trained
# on -- consider fitting one scaler on the training data and reusing it.
X, X_pred, y = scale(df_data), scale(df_test), df_target
# Define number of folds
kf = KFold(n_splits=10)
kf.get_n_splits(X) # returns the number of splitting iterations in the cross-validator
# Logistic regression normalizing variables
LogReg = LogisticRegression()
# 10-fold cross-validation: the same estimator object is refitted on every
# fold, so afterwards it holds the fit of the last fold only.
scores = [LogReg.fit(X[train], y[train]).score(X[test], y[test]) for train, test in kf.split(X)]
# print() with a single argument works on both Python 2 and Python 3.
print(scores)
# Predict new
novel = LogReg.predict(X_pred)
Is this the correct way to implement a Logistic Regression?
I know that the fit() method should be used after cross-validation in order to train the model and use it for predictions. However, since I called fit() inside a list comprehension I really don't know if my model was "fitted" and can be used to make predictions.
In general things are okay, but there are some problems.
Scaling
X, X_pred, y = scale(df_data), scale(df_test), df_target
You scale training and test data independently, which isn't correct. Both datasets must be scaled with the same scaler. "Scale" is a simple function, but it is better to use something else, for example StandardScaler.
# Learn the scaling statistics from the training data only, then apply that
# one fitted scaler to both the training data and the data to predict.
scaler = StandardScaler().fit(df_data)
X, X_pred = scaler.transform(df_data), scaler.transform(df_test)
Cross-validation and predicting.
How does your code work? You split the data 10 times into a train set and a hold-out set; each time you fit the model on the train set and calculate the score on the hold-out set. This way you get cross-validation scores, but the model is fitted only on a part of the data. So it would be better to fit the model on the whole dataset and then make a prediction:
# Refit on the complete training set, then classify the new samples.
LogReg.fit(X, y)
novel = LogReg.predict(X_pred)
I want to note that there are advanced techniques like stacking and boosting, but if you are just learning sklearn, it is better to stick to the basics.
The problem is that my train data could not be placed into RAM due to train data size. So I need a method which first builds one tree on whole train data set, calculate residuals build another tree and so on (like gradient boosted tree do). Obviously if I call model = xgb.train(param, batch_dtrain, 2) in some loop - it will not help, because in such case it just rebuilds whole model for each batch.
Try saving your model after you train on the first batch. Then, on successive runs, provide the xgb.train method with the filepath of the saved model.
Here's a small experiment that I ran to convince myself that it works:
First, split the boston dataset into training and testing sets.
Then split the training set into halves.
Fit a model with the first half and get a score that will serve as a benchmark.
Then fit two models with the second half; one model will have the additional parameter xgb_model. If passing in the extra parameter didn't make a difference, then we would expect their scores to be similar..
But, fortunately, the new model seems to perform much better than the first.
import xgboost as xgb
from sklearn.cross_validation import train_test_split as ttsplit
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error as mse
# Features and target of the Boston housing data.
X = load_boston().data
y = load_boston().target

# Hold out 10% of the rows for testing, then cut the remaining training rows
# into two halves.
X_train, X_test, y_train, y_test = ttsplit(X, y, test_size=0.1, random_state=0)
X_train_1, X_train_2, y_train_1, y_train_2 = ttsplit(
    X_train, y_train, test_size=0.5, random_state=0)

xg_train_1 = xgb.DMatrix(X_train_1, label=y_train_1)
xg_train_2 = xgb.DMatrix(X_train_2, label=y_train_2)
xg_test = xgb.DMatrix(X_test, label=y_test)

params = {'objective': 'reg:linear', 'verbose': False}

# Benchmark: 30 rounds on the first half, stored on disk for later reuse.
model_1 = xgb.train(params, xg_train_1, 30)
model_1.save_model('model_1.model')

# ================= train two versions of the model =====================#
# v1 starts from scratch on the second half; v2 continues from the saved model.
model_2_v1 = xgb.train(params, xg_train_2, 30)
model_2_v2 = xgb.train(params, xg_train_2, 30, xgb_model='model_1.model')

# Test MSE in the order: benchmark, "before", "after".
for booster in (model_1, model_2_v1, model_2_v2):
    print(mse(booster.predict(xg_test), y_test))
# 23.0475232194
# 39.6776876084
# 27.2053239482
reference: https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/training.py
There is now (version 0.6?) a process_update parameter that might help. Here's an experiment with it:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import ShuffleSplit
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error as mse
boston = load_boston()
features = boston.feature_names
X = boston.data
y = boston.target
X=pd.DataFrame(X,columns=features)
y = pd.Series(y,index=X.index)

# split data into training and testing sets
rs = ShuffleSplit(test_size=0.3, n_splits=1, random_state=0)
# n_splits=1, so the splitter yields exactly one (train, test) index pair;
# take it directly instead of running a loop whose body does nothing.
train_idx, test_idx = next(rs.split(X))

# Cut the training indices into two halves. int() keeps the slice bound an
# integer even where round() returns a float.
train_split = int(round(len(train_idx) / 2))
train1_idx = train_idx[:train_split]
train2_idx = train_idx[train_split:]
X_train = X.loc[train_idx]
X_train_1 = X.loc[train1_idx]
X_train_2 = X.loc[train2_idx]
X_test = X.loc[test_idx]
y_train = y.loc[train_idx]
y_train_1 = y.loc[train1_idx]
y_train_2 = y.loc[train2_idx]
y_test = y.loc[test_idx]

xg_train_0 = xgb.DMatrix(X_train, label=y_train)
xg_train_1 = xgb.DMatrix(X_train_1, label=y_train_1)
xg_train_2 = xgb.DMatrix(X_train_2, label=y_train_2)
xg_test = xgb.DMatrix(X_test, label=y_test)

params = {'objective': 'reg:linear', 'verbose': False}
# model_0: all training data at once; model_1: first half only.
model_0 = xgb.train(params, xg_train_0, 30)
model_1 = xgb.train(params, xg_train_1, 30)
model_1.save_model('model_1.model')
# Second half only, trained from scratch (v1) or continued from model_1 (v2).
model_2_v1 = xgb.train(params, xg_train_2, 30)
model_2_v2 = xgb.train(params, xg_train_2, 30, xgb_model=model_1)

# Same continuation, but with the update process type and the refresh updater
# instead of the default process type.
params.update({'process_type': 'update',
               'updater' : 'refresh',
               'refresh_leaf': True})
model_2_v2_update = xgb.train(params, xg_train_2, 30, xgb_model=model_1)

print('full train\t',mse(model_0.predict(xg_test), y_test)) # benchmark
print('model 1 \t',mse(model_1.predict(xg_test), y_test))
print('model 2 \t',mse(model_2_v1.predict(xg_test), y_test)) # "before"
print('model 1+2\t',mse(model_2_v2.predict(xg_test), y_test)) # "after"
print('model 1+update2\t',mse(model_2_v2_update.predict(xg_test), y_test)) # "after"
Output:
full train 17.8364309709
model 1 24.2542132108
model 2 25.6967017352
model 1+2 22.8846455135
model 1+update2 14.2816257268
I created a gist of jupyter notebook to demonstrate that xgboost model can be trained incrementally. I used boston dataset to train the model. I did 3 experiments - one shot learning, iterative one shot learning, iterative incremental learning. In incremental training, I passed the boston data to the model in batches of size 50.
The gist of the gist is that you'll have to iterate over the data multiple times for the model to converge to the accuracy attained by one shot (all data) learning.
Here is the corresponding code for doing iterative incremental learning with xgboost.
batch_size = 50
iterations = 25
model = None
# Pass over the whole training set `iterations` times, feeding it to xgboost
# in consecutive batches of `batch_size` rows and continuing from the model
# returned for the previous batch.
# NOTE(review): process_type 'update' works on the trees of an existing
# model, but `model` is None on the very first call -- confirm how the first
# batch is meant to be trained.
for i in range(iterations):
    for start in range(0, len(x_tr), batch_size):
        model = xgb.train({
            'learning_rate': 0.007,
            # The parameter is named 'updater'; a key called 'update' is not
            # an xgboost parameter, so the refresh updater was never selected.
            'updater': 'refresh',
            'process_type': 'update',
            'refresh_leaf': True,
            #'reg_lambda': 3,  # L2
            'reg_alpha': 3,  # L1
            'silent': False,
        }, dtrain=xgb.DMatrix(x_tr[start:start+batch_size], y_tr[start:start+batch_size]), xgb_model=model)
    # Evaluate once per pass, after the last batch has been consumed.
    y_pr = model.predict(xgb.DMatrix(x_te))
    print('MSE itr#{}: {}'.format(i, sklearn.metrics.mean_squared_error(y_te, y_pr)))
y_pr = model.predict(xgb.DMatrix(x_te))
print('MSE at the end: {}'.format(sklearn.metrics.mean_squared_error(y_te, y_pr)))
XGBoost version: 0.6
looks like you don't need anything other than call your xgb.train(....) again but provide the model result from the previous batch:
# python
params = {}  # your params here
ith_batch = 0
n_batches = 100
model = None
while ith_batch < n_batches:
    d_train = getBatchData(ith_batch)
    # Hand the model from the previous batch back to xgb.train so training
    # continues from it; on the first batch `model` is still None.
    model = xgb.train(params, d_train, xgb_model=model)
    ith_batch += 1
this is based on https://xgboost.readthedocs.io/en/latest/python/python_api.html
If your problem is regarding the dataset size and you do not really need Incremental Learning (you are not dealing with an Streaming app, for instance), then you should check out Spark or Flink.
This two frameworks can train on very large datasets with a small RAM, leveraging disk memory. Both framework deal with memory issues internally. While Flink had it solved first, Spark has caught up in recent releases.
Take a look at:
"XGBoost4J: Portable Distributed XGBoost in Spark, Flink and Dataflow": http://dmlc.ml/2016/03/14/xgboost4j-portable-distributed-xgboost-in-spark-flink-and-dataflow.html
Spark Integration: http://dmlc.ml/2016/10/26/a-full-integration-of-xgboost-and-spark.html
Regarding paulperry's code: if you change one line from "train_split = round(len(train_idx) / 2)" to "train_split = len(train_idx) - 50", the score of model 1+update2 changes from 14.2816257268 to 45.60806270012028, and a lot of "leaf=0" entries appear in the dump file.
The updated model is not good when the update sample set is relatively small.
For binary:logistic, the updated model is unusable when the update sample set contains only one class.
One possible solution that I have not tested is to use a dask dataframe, which should act the same as a pandas dataframe but (I assume) utilizes disk and reads data in and out of RAM. Here are some helpful links.
this link mentions how to use it with xgboost also see
also see.
further there is an experimental options from XGBoost as well here but it is "not ready for production"
It's not based on xgboost, but there is a C++ incremental decision tree.
see gaenari.
Continuous chunking data can be inserted and updated, and rebuilds can be run if concept drift reduces accuracy.
I agree with #desertnaut in his solution.
I have a dataset where I split it into 4 batches. I have to do an initial fit without the xgb_model parameter first, then the next fits will have the xgb_model parameter, like in this (I'm using the Sklearn API):
for i, (X_batch, y_batch) in enumerate(zip(self.X_train_batched, self.y_train_batched)):
    print(f'Step: {i}',end = ' ')
    # The first batch trains from scratch; every later batch continues from
    # the model fitted so far. One fit call covers both cases because
    # xgb_model=None means "no previous model".
    previous_model = None if i == 0 else model_xgbc
    model_xgbc.fit(X_batch, y_batch, eval_set=[(self.X_valid, self.y_valid)],
                   verbose=False, eval_metric = ['logloss'],
                   early_stopping_rounds = 400, xgb_model=previous_model)
    # NOTE(review): the original indentation is lost; the evaluation below is
    # assumed to run after every batch -- confirm.
    preds = model_xgbc.predict(self.X_valid)
    rmse = metrics.mean_squared_error(self.y_valid, preds,squared=False)
Hey guys you can use my simple code for incremental model training with xgb base class :
batch_size = 10000000
X_train="your pandas training DataFrame"
y_train="Your lables"
#Store eval results
evals_result={}
Deval = xgb.DMatrix(X_valid, y_valid)
# There is no booster before the first batch; xgb_model=None lets xgb.train
# start from scratch.
model = None
n = len(X_train)
for start in range(0, n, batch_size):
    # Train on the current slice only; building the matrix from the complete
    # training set on every iteration would defeat the batching.
    stop = start + batch_size
    Dtrain = xgb.DMatrix(X_train[start:stop], y_train[start:stop])
    eval_sets = [(Dtrain, 'train'), (Deval, 'eval')]
    model = xgb.train({'refresh_leaf': True,
                       'process_type': 'default',
                       'max_depth': 5,
                       'objective': 'reg:squarederror',
                       'num_parallel_tree': 2,
                       'learning_rate':0.05,
                       'n_jobs':-1},
                      dtrain=Dtrain, evals=eval_sets, early_stopping_rounds=5,num_boost_round=100,evals_result=evals_result,xgb_model=model)
I'm trying to perform recursive feature elimination using scikit-learn and a random forest classifier, with OOB ROC as the method of scoring each subset created during the recursive process.
However, when I try to use the RFECV method, I get an error saying AttributeError: 'RandomForestClassifier' object has no attribute 'coef_'
Random Forests don't have coefficients per se, but they do have rankings by Gini score. So, I'm wondering how to get around this problem.
Please note that I want to use a method that will explicitly tell me what features from my pandas DataFrame were selected in the optimal grouping as I am using recursive feature selection to try to minimize the amount of data I will input into the final classifier.
Here's some example code:
from sklearn import datasets
import pandas as pd
from pandas import Series
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
# Iris measurements as a DataFrame with generic column names, plus the target.
iris = datasets.load_iris()
x=pd.DataFrame(iris.data, columns=['var1','var2','var3', 'var4'])
y=pd.Series(iris.target, name='target')
rf = RandomForestClassifier(n_estimators=500, min_samples_leaf=5, n_jobs=-1)
# Recursive feature elimination, one feature per step, 10-fold cross-validation.
# NOTE(review): 'ROC' does not look like one of scikit-learn's scorer names;
# a later snippet in this file uses 'roc_auc' instead -- confirm.
rfecv = RFECV(estimator=rf, step=1, cv=10, scoring='ROC', verbose=2)
selector=rfecv.fit(x, y)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Users/bbalin/anaconda/lib/python2.7/site-packages/sklearn/feature_selection/rfe.py", line 336, in fit
ranking_ = rfe.fit(X_train, y_train).ranking_
File "/Users/bbalin/anaconda/lib/python2.7/site-packages/sklearn/feature_selection/rfe.py", line 148, in fit
if estimator.coef_.ndim > 1:
AttributeError: 'RandomForestClassifier' object has no attribute 'coef_'
Here's what I've done to adapt RandomForestClassifier to work with RFECV:
class RandomForestClassifierWithCoef(RandomForestClassifier):
    """Random forest that also publishes its feature importances as ``coef_``.

    Feature-elimination code that ranks features through ``estimator.coef_``
    can then be used with a random forest.
    """

    def fit(self, *args, **kwargs):
        """Fit the forest, set ``coef_`` and return the fitted estimator.

        All arguments are forwarded unchanged to ``RandomForestClassifier.fit``.
        Returning ``self`` follows the scikit-learn convention, so calls such
        as ``clf.fit(X, y).predict(X)`` keep working.
        """
        super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
        self.coef_ = self.feature_importances_
        return self
Just using this class does the trick if you use 'accuracy' or 'f1' score. For 'roc_auc', RFECV complains that multiclass format is not supported. Changing it to two-class classification with the code below, the 'roc_auc' scoring works. (Using Python 3.4.1 and scikit-learn 0.15.1)
y=(pd.Series(iris.target, name='target')==2).astype(int)
Plugging into your code:
from sklearn import datasets
import pandas as pd
from pandas import Series
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
class RandomForestClassifierWithCoef(RandomForestClassifier):
    """Random forest that also publishes its feature importances as ``coef_``.

    Feature-elimination code that ranks features through ``estimator.coef_``
    can then be used with a random forest.
    """

    def fit(self, *args, **kwargs):
        """Fit the forest, set ``coef_`` and return the fitted estimator.

        All arguments are forwarded unchanged to ``RandomForestClassifier.fit``.
        Returning ``self`` follows the scikit-learn convention, so calls such
        as ``clf.fit(X, y).predict(X)`` keep working.
        """
        super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
        self.coef_ = self.feature_importances_
        return self
# Iris measurements as a DataFrame with generic column names.
iris = datasets.load_iris()
x=pd.DataFrame(iris.data, columns=['var1','var2','var3', 'var4'])
# Two-class target: 1 where the iris target equals 2, otherwise 0.
y=(pd.Series(iris.target, name='target')==2).astype(int)
rf = RandomForestClassifierWithCoef(n_estimators=500, min_samples_leaf=5, n_jobs=-1)
# Recursive feature elimination, one feature per step, 2-fold CV, ROC AUC scoring.
rfecv = RFECV(estimator=rf, step=1, cv=2, scoring='roc_auc', verbose=2)
selector=rfecv.fit(x, y)
This is my code, I've tidied it up a bit to make it relevant to your task:
features_to_use = fea_cols  # this is a list of features
# Column layout of the results table: the initial feature names (left empty,
# as before) followed by the fields recorded for every run.
result_columns = list(features_to_use) + ['num_features', 'features_used', 'logloss']
# One dict per run. The table is built once at the end, which avoids
# DataFrame.append (removed from pandas) and repeated copying of the frame.
rows = []
run = 1
# this will remove the 5 worst features determined by their feature importance computed by the RF classifier
while len(features_to_use) > 6:
    print('number of features:%d' % (len(features_to_use)))
    # build the classifier
    clf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
    # train the classifier
    clf.fit(train[features_to_use], train['OpenStatusMod'].values)
    # Score against the labels of the same frame the features come from
    # (the original took the labels from `df`).
    print('classifier score: %f\n' % clf.score(train[features_to_use], train['OpenStatusMod'].values))
    # predict the class and print the classification report, f1 micro, f1 macro score
    pred = clf.predict(test[features_to_use])
    print(classification_report(test['OpenStatusMod'].values, pred, target_names=status_labels))
    print('micro score: ')
    print(metrics.precision_recall_fscore_support(test['OpenStatusMod'].values, pred, average='micro'))
    print('macro score:\n')
    print(metrics.precision_recall_fscore_support(test['OpenStatusMod'].values, pred, average='macro'))
    # predict the class probabilities
    probs = clf.predict_proba(test[features_to_use])
    # rescale the priors
    new_probs = kf.cap_and_update_priors(priors, probs, private_priors, 0.001)
    # calculate logloss with the rescaled probabilities (once; reused below)
    logloss = log_loss(test['OpenStatusMod'].values, new_probs)
    print('log loss: %f\n' % logloss)
    if not hasattr(clf, "feature_importances_"):
        # Without importances nothing can be trimmed; stop instead of looping
        # forever on an unchanged feature list.
        break
    # feature positions ordered from most to least important
    sorted_idx = np.argsort(clf.feature_importances_)[::-1]
    # record this run before the feature list is trimmed
    rows.append({
        'num_features': len(features_to_use),
        'features_used': ','.join(features_to_use),
        'logloss': logloss,
    })
    # trim the worst 5; a new list is built, so fea_cols is left untouched
    features_to_use = [features_to_use[feat] for feat in sorted_idx[:-5]]
    print('')
    run += 1
# one row per run, in run order
trim_5_df = DataFrame(rows, columns=result_columns)
So what I'm doing here is I have a list of features I want to train and then predict against, using the feature importances I then trim the worst 5 and repeat. During each run I add a row to record the prediction performance so that I can do some analysis later.
The original code was much bigger I had different classifiers and datasets I was analysing but I hope you get the picture from the above. The thing I noticed was that for random forest the number of features I removed on each run affected the performance so trimming by 1, 3 and 5 features at a time resulted in a different set of best features.
I found that using a GradientBoostingClassifier was more predictable and repeatable in the sense that the final set of best features agreed whether I trimmed 1 feature at a time or 3 or 5.
I hope I'm not teaching you to suck eggs here, you probably know more than me, but my approach to ablative analysis was to use a fast classifier to get a rough idea of the best sets of features, then use a better performing classifier, then start hyper parameter tuning, again doing coarse grain comparisons and then fine grain once I get a feel of what the best params were.
I submitted a request to add coef_ so RandomForestClassifier may be used with RFECV. However, the change had already been made. This change will be in version 0.17.
https://github.com/scikit-learn/scikit-learn/issues/4945
You can pull the latest dev build if you want to use it now.
Here's what I ginned up. It's a pretty simple solution, and relies on a custom accuracy metric (called weightedAccuracy) since I'm classifying a highly unbalanced dataset. But, it should be easily made more extensible if desired.
from sklearn import datasets
import pandas
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
def get_enhanced_confusion_matrix(actuals, predictions, labels,
                                  sensitivity_weight=0.9,
                                  specificity_weight=0.1):
    """Return a confusion matrix plus sensitivity, specificity and a weighted mix.

    The matrix is built by ``confusion_matrix(actuals, predictions,
    labels=labels)``. Row/column 0 is treated as the negative class and
    row/column 1 as the positive class, so ``labels`` must list at least
    two labels, negative first.

    Args:
        actuals: True class labels.
        predictions: Predicted class labels, aligned with ``actuals``.
        labels: Labels passed through to ``confusion_matrix``.
        sensitivity_weight: Factor applied to sensitivity in the weighted
            accuracy (default 0.9, the value previously hard-coded).
        specificity_weight: Factor applied to specificity in the weighted
            accuracy (default 0.1, the value previously hard-coded).

    Returns:
        Tuple ``(cm, sensitivity, specificity, weightedAccuracy)``.
        A rate is ``nan`` when its class has no samples in ``actuals``
        (the denominator would be zero), and so is the weighted accuracy.
    """
    cm = confusion_matrix(actuals, predictions, labels=labels)

    def _rate(correct, wrong):
        # Guard against a class that is absent from `actuals`: dividing by a
        # zero row total would otherwise raise ZeroDivisionError.
        total = float(correct) + float(wrong)
        return float(correct) / total if total else float('nan')

    sensitivity = _rate(cm[1][1], cm[1][0])
    specificity = _rate(cm[0][0], cm[0][1])
    weightedAccuracy = (sensitivity * sensitivity_weight) + (specificity * specificity_weight)
    return cm, sensitivity, specificity, weightedAccuracy
# Importance-ranked feature elimination demo on the iris data.
#
# 1. Fit a forest on all features and rank them by importance.
# 2. Refit on the top-1, top-2, ... top-(n-1) features, scoring each fit.
# 3. Keep the smallest feature count that reaches the best weighted accuracy.

# One seed for every train/test split, so each feature subset is scored on
# the same held-out rows and the scores are directly comparable.
SPLIT_SEED = 36583
# Only these two labels enter the confusion matrix (negative first, positive second).
CM_LABELS = [0, 1]
RFE_COLUMNS = ['numFeatures', 'weightedAccuracy', 'sensitivity', 'specificity']

iris = datasets.load_iris()
x = pandas.DataFrame(iris.data, columns=['var1', 'var2', 'var3', 'var4'])
y = pandas.Series(iris.target, name='target')
response, _ = pandas.factorize(y)

xTrain, xTest, yTrain, yTest = cross_validation.train_test_split(
    x, response, test_size=.25, random_state=SPLIT_SEED)

print("building the first forest")
rf = RandomForestClassifier(n_estimators=500, min_samples_split=2, n_jobs=-1, verbose=1)
rf.fit(xTrain, yTrain)

# Rank features from most to least important according to the full model.
importances = pandas.DataFrame({'name': x.columns, 'imp': rf.feature_importances_})
importances = importances.sort_values('imp', ascending=False).reset_index(drop=True)

cm, sensitivity, specificity, weightedAccuracy = get_enhanced_confusion_matrix(
    yTest, rf.predict(xTest), CM_LABELS)
numFeatures = len(x.columns)

# Collect one row per fit and build the frame once at the end; this avoids
# the repeated copying of DataFrame.append inside the loop.
rfeRows = [{'numFeatures': numFeatures,
            'weightedAccuracy': weightedAccuracy,
            'sensitivity': sensitivity,
            'specificity': specificity}]

print("running RFE on %d features" % numFeatures)
for i in range(1, numFeatures):
    # Top-i features by importance; the full set was already scored above.
    varsUsed = importances['name'][0:i]
    print("now using %d of %s features" % (len(varsUsed), numFeatures))
    xTrain, xTest, yTrain, yTest = cross_validation.train_test_split(
        x[varsUsed], response, test_size=.25, random_state=SPLIT_SEED)
    rf = RandomForestClassifier(n_estimators=500, min_samples_split=2,
                                n_jobs=-1, verbose=1)
    rf.fit(xTrain, yTrain)
    cm, sensitivity, specificity, weightedAccuracy = get_enhanced_confusion_matrix(
        yTest, rf.predict(xTest), CM_LABELS)
    print("\n" + str(cm))
    print('the sensitivity is %d percent' % (sensitivity * 100))
    print('the specificity is %d percent' % (specificity * 100))
    print('the weighted accuracy is %d percent' % (weightedAccuracy * 100))
    rfeRows.append({'numFeatures': len(varsUsed),
                    'weightedAccuracy': weightedAccuracy,
                    'sensitivity': sensitivity,
                    'specificity': specificity})

rfeMatrix = pandas.DataFrame(rfeRows, columns=RFE_COLUMNS)
print("\n" + str(rfeMatrix))

# Among the fits that tie for the best weighted accuracy, prefer the one
# that uses the fewest features.
maxAccuracy = rfeMatrix.weightedAccuracy.max()
maxAccuracyFeatures = min(rfeMatrix.numFeatures[rfeMatrix.weightedAccuracy == maxAccuracy])
featuresUsed = importances['name'][0:maxAccuracyFeatures].tolist()

print("the final features used are %s" % featuresUsed)