I am using this dataset: https://www.kaggle.com/datasets/harlfoxem/housesalesprediction?datasetId=128&searchQuery=hyperparameter+tuning&language=Python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
#Random seed for reproducibility
np.random.seed(5)
#### DATA PRE-PROCESSING ####
#read the data
data = pd.read_csv('kc_house_data.csv')
#check for missing/null values
print(data.isnull().any())
#create a 'house_age' column from the sale date and the year built (the raw date itself doesn't help in predicting price)
data['date'] = pd.to_datetime(data['date']) #first convert date format
data["house_age"] = data["date"].dt.year - data['yr_built']
#if 'yr_renovated' is 0, the house was never renovated - so to avoid confusion, create a binary 'renovated' column: 1 if yr_renovated is non-zero, else 0
data['renovated'] = data['yr_renovated'].apply(lambda yr: 0 if yr == 0 else 1)
#Now drop the columns we do not need
data = data.drop(['date', 'yr_renovated', 'yr_built'], axis=1)
print(data.head(5))
#Split into test and train
Y = data['price'].values
X = data.drop('price', axis=1)
X_train, X_test, Y_train, Y_test = train_test_split (X, Y, test_size = 0.20, random_state=42)
X_val, X_test, Y_val, Y_test = train_test_split(X_test, Y_test, test_size=0.5) #carve a validation set out of the test split - not sure if it's needed, though
#Scale
standard_scaler = StandardScaler() #Unsure whether a MinMaxScaler would be more appropriate here?
X_train= pd.DataFrame(standard_scaler.fit_transform(X_train),columns=X_train.columns)
X_test = pd.DataFrame(standard_scaler.transform(X_test),columns = X_test.columns)
X_val = pd.DataFrame(standard_scaler.transform(X_val), columns=X_val.columns)
#print(X_train.shape, X_val.shape, X_test.shape, Y_train.shape, Y_val.shape, Y_test.shape)
# Use scikit-learn to grid search the batch size and epochs
def create_model():
    # create model
    model = Sequential()
    model.add(Dense(32, input_dim=19, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])
    return model
# create model
model = KerasRegressor(build_fn=create_model, verbose=0)
# define the grid search parameters
batch_size = [5, 10, 20, 40, 60, 80]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring = 'neg_mean_squared_error')
grid_result = grid.fit(X_train, Y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
param_grid = dict(epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring = 'neg_mean_squared_error')
grid_result = grid.fit(X_train, Y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
It's just not working for me. What I want to do is create a simple neural network and then fine-tune the epochs and batch size, but this just gives me 'accuracy' as NaN. I'm so confused. Any help appreciated.
Current output:
Best: -419841571707.859070 using {'batch_size': 5}
Best: -419841571132.739441 using {'epochs': 100}
However, if I add batch sizes of 1, 2, 3, etc., it will pick those - it always says the best result is the smallest batch size. Is this right?
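One likely culprit, for reference: the sigmoid output layer squashes every prediction into (0, 1), while house prices are in the hundreds of thousands, so the MSE is enormous no matter which batch size is used; and with the epochs fixed, smaller batches simply mean more weight updates, which is why the smallest batch size keeps winning. A minimal sketch of a corrected setup, assuming the same data and KerasRegressor wrapper as above, with both hyperparameters combined into one grid:
def create_model():
    model = Sequential()
    model.add(Dense(32, input_dim=19, activation='relu'))
    model.add(Dense(1))  # linear output: prices are unbounded, sigmoid caps them at 1
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])
    return model

model = KerasRegressor(build_fn=create_model, verbose=0)
param_grid = {'batch_size': [5, 10, 20, 40, 60, 80], 'epochs': [10, 50, 100]}
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3,
                    scoring='neg_mean_squared_error')
grid_result = grid.fit(X_train, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))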
Related
I have a dataset and have one-hot encoded the target column (5 different strings throughout the entire column) using pd.get_dummies. I then used sklearn's train_test_split function to create the training, testing and validation sets. The training features were then normalized with StandardScaler. I then fit a logistic regression model to the scaled training features and the training labels.
I am now trying to calculate the accuracy score for the training, validation and test sets but am having no luck. My code up to this part is below:
dataset = pd.read_csv('tabular_data/clean_tabular_data.csv')
features, label = load_airbnb(dataset, 'Category')
label_series = dataset['Category']
label_encoded = pd.get_dummies(label_series)
X_train, X_test, y_train, y_test = train_test_split(features, label_encoded, test_size=0.3)
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test, test_size=0.5)
# normalize the features
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_validation_scaled = scaler.transform(X_validation)
X_test_scaled = scaler.transform(X_test)
# get baseline classification model
model = LogisticRegression()
y_train = y_train.iloc[:, 0]
model.fit(X_train_scaled, y_train)
y_train_pred = model.predict(X_train_scaled)
y_train_pred = np.argmax(y_train_pred, axis=0)
y_validation_pred = model.predict(X_validation_scaled)
y_validation_pred = np.argmax(y_validation_pred, axis =0)
y_test_pred = model.predict(X_test_scaled)
y_test_pred = np.argmax(y_test_pred, axis = 0)
# evaluate model using accuracy
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
validation_acc = accuracy_score(y_validation, y_validation_pred)
The error I am getting is here:
File "C:\Users\lcox1\Documents\VSCode\AiCore\Data science\classification_prac.py", line 56, in <module>
    train_acc = accuracy_score(y_train, y_train_pred)
TypeError: Singleton array 16 cannot be considered a valid collection.
I am fairly new to Python so have no idea what the issue is. Any help appreciated.
You are getting that error because of these lines:
y_train_pred = model.predict(X_train_scaled)
y_train_pred = np.argmax(y_train_pred, axis=0)
When you call model.predict(), it actually returns an array of predicted labels, not probabilities. If you take np.argmax of that 1-D array, you get a single value (the index of the maximum), so accuracy_score is then handed a scalar instead of a collection, hence the error.
Most likely you meant to do:
y_train_pred = model.predict_proba(X_train_scaled)
y_train_pred = np.argmax(y_train_pred, axis=1)
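To see why the original version produces a singleton, a tiny illustration with made-up labels:
import numpy as np
preds = np.array([2, 0, 1, 2])  # what model.predict() returns: one label per sample
np.argmax(preds, axis=0)        # -> 0, a single scalar index, not an array of labels
# accuracy_score(y_true, 0) then fails: a scalar is not a valid collection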
As @BenReiniger pointed out in the comments, if you are training a model on multi-class labels, you should not one-hot encode them. Try something like the below, where I use an example dataset and keep the labels as a category:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
data = load_iris()
features = data.data
label_series = pd.Series(data.target).map({0:"setosa",1:"virginica",2:"versicolor"})
label_series = pd.Categorical(label_series)
le = LabelEncoder()
label_encoded = le.fit_transform(label_series)
Running your code with some changes:
X_train, X_test, y_train, y_test = train_test_split(features, label_encoded, test_size=0.3)
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test, test_size=0.5)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_validation_scaled = scaler.transform(X_validation)
X_test_scaled = scaler.transform(X_test)
# get baseline classification model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_train_pred = model.predict_proba(X_train_scaled)
y_train_pred = np.argmax(y_train_pred, axis=1)
y_validation_pred = model.predict_proba(X_validation_scaled)
y_validation_pred = np.argmax(y_validation_pred, axis =1)
y_test_pred = model.predict_proba(X_test_scaled)
y_test_pred = np.argmax(y_test_pred, axis = 1)
# evaluate model using accuracy
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
validation_acc = accuracy_score(y_validation, y_validation_pred)
The results:
print(train_acc,test_acc,validation_acc)
0.9809523809523809 0.9090909090909091 1.0
I built a KNN model for classification. Unfortunately, my model's accuracy is only just above 80%, and I would like to get a better result. Can I ask for some tips? Maybe I used too many predictors?
My data: https://www.openml.org/search?type=data&sort=runs&id=53&status=active
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
heart_disease = pd.read_csv('heart_disease.csv', sep=';', decimal=',')
y = heart_disease['heart_disease']
X = heart_disease.drop(["heart_disease"], axis=1)
correlation_matrix = heart_disease.corr()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
scaler = MinMaxScaler(feature_range=(-1,1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
knn_3 = KNeighborsClassifier(3, n_jobs = -1)
knn_3.fit(X_train, y_train)
y_train_pred = knn_3.predict(X_train)
labels = ['0', '1']
print('Training set')
print(pd.DataFrame(confusion_matrix(y_train, y_train_pred), index = labels, columns = labels))
print(accuracy_score(y_train, y_train_pred))
print(f1_score(y_train, y_train_pred))
y_test_pred = knn_3.predict(X_test)
print('Test set')
print(pd.DataFrame(confusion_matrix(y_test, y_test_pred), index = labels, columns = labels))
print(accuracy_score(y_test, y_test_pred))
print(f1_score(y_test, y_test_pred))
hyperparameters = {'n_neighbors' : range(1, 15), 'weights': ['uniform','distance']}
knn_best = GridSearchCV(KNeighborsClassifier(), hyperparameters, n_jobs = -1, error_score = 'raise')
knn_best.fit(X_train,y_train)
knn_best.best_params_
y_train_pred_best = knn_best.predict(X_train)
y_test_pred_best = knn_best.predict(X_test)
print('Training set')
print(pd.DataFrame(confusion_matrix(y_train, y_train_pred_best), index = labels, columns = labels))
print(accuracy_score(y_train, y_train_pred_best))
print(f1_score(y_train, y_train_pred_best))
print('Test set')
print(pd.DataFrame(confusion_matrix(y_test, y_test_pred_best), index = labels, columns = labels))
print(accuracy_score(y_test, y_test_pred_best))
print(f1_score(y_test, y_test_pred_best))
Just a small part of an answer: how to find the best number for n_neighbors.
import numpy as np
import matplotlib.pyplot as plt

errlist = []  # an error list to append to
for i in range(1, 40):  # candidate values for n_neighbors
    knn_i = KNeighborsClassifier(n_neighbors=i)
    knn_i.fit(X_train, y_train)
    errlist.append(np.mean(knn_i.predict(X_test) != y_test))  # fraction of misclassified test samples
Plot a line to see the best n_neighbors:
plt.plot(range(1, 40), errlist)
Feel free to change the numbers in the range.
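One caveat: picking n_neighbors by looping over the test set leaks information from it into the model choice. A minimal cross-validated variant of the same search (a sketch, reusing the variable names above):
from sklearn.model_selection import cross_val_score

cv_err = []
for i in range(1, 40):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=i), X_train, y_train, cv=5)
    cv_err.append(1 - scores.mean())  # mean cross-validated error rate
plt.plot(range(1, 40), cv_err)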
This is the first time I am asking a question on this platform. I'm using scikit-learn's MLPRegressor model to predict bike rentals. I need to compute and verify the RMSE on the test set, but when I run the prediction this error is returned: X has 19 features, but MLPRegressor is expecting 100 features as input.
The two dataframes (training and testing) were treated in the same way, and I've already verified that their features have the same names. I also tried converting the variables from pd.DataFrame to numpy.ndarray and normalizing the data with StandardScaler, but all my attempts lead to the same error. Can anybody help me?
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
#------------------------------------------------
# Import of training and test data, already preprocessed in the same way
caminho = '/content/treino_tratado.csv'
train = pd.read_csv(caminho)
train
caminho2 = '/content/teste_tratado.csv'
test = pd.read_csv(caminho2)
test.head(3)
#-----------------------------------------------------
#definition of my predictor variables and target variable
X = train.drop('aluguéis', axis=1)
y = train['aluguéis']
# I chose to convert the X and y variables to numpy arrays; the sklearn documentation shows the variables in this format
X = X.to_numpy()
y = y.to_numpy()
#test data
test = test.to_numpy()
#----------------------------------------------------------
# model
X, y = make_regression(n_samples=200, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3 , random_state=1)
regr = MLPRegressor(hidden_layer_sizes=(1000,), activation='relu', solver='adam',
                    alpha=0.0001, batch_size='auto', learning_rate='constant',
                    learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True,
                    random_state=None, tol=0.0001, verbose=False, warm_start=False,
                    momentum=0.9, nesterovs_momentum=True, early_stopping=False,
                    validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08,
                    n_iter_no_change=10, max_fun=15000).fit(X_train, y_train)
predicoes = regr.predict(X_test)
from sklearn.metrics import mean_squared_error
import math
mse = mean_squared_error(y_test, predicoes)
rmse = math.sqrt(mse)
print(rmse)
# output = 109.25977971042573
#----------------------------------------------------------------
# prediction in test database
predicoes2 = regr.predict(test)
# error -> X has 19 features, but MLPRegressor is expecting 100 features as input.
Do you know that this line is creating a brand-new random dataset and overwriting your X and y variables?
X, y = make_regression(n_samples=200, random_state=1)
Why are you using it?
It creates a new dataset with 100 features, and that is probably the reason for your error.
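A minimal sketch of the fix, reusing the question's own variables: drop the make_regression call and split the real 19-feature data instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
regr = MLPRegressor(hidden_layer_sizes=(1000,), max_iter=200).fit(X_train, y_train)
predicoes = regr.predict(X_test)
print(math.sqrt(mean_squared_error(y_test, predicoes)))  # RMSE on the held-out split
predicoes2 = regr.predict(test)  # now both sides have 19 features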
This is the way I get values from a single-fold trained model:
clf.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric='auc', verbose=100, early_stopping_rounds=200)
import shap # package used to calculate Shap values
# Create object that can calculate shap values
explainer = shap.TreeExplainer(clf)
# Calculate Shap values
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)
As you know, the results from different folds can differ, so how do I average these shap_values? Because we have this rule:
It is fine to average the SHAP values from models with the same output
trained on the same input features, just make sure to also average the
expected_value from each explainer. However, if you have
non-overlapping test sets then you can't average the SHAP values from
the test sets since they are for different samples. You could just
explain the SHAP values for the whole dataset using each of your
models and then average that into a single matrix. (It's fine to
explain examples in your training set, just remember you may be
overfit to them)
So we need some holdout dataset here to follow that rule. I did something like this to get everything to work as expected:
import numpy as np
import lightgbm as lgb
import shap
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold

shap_values = None
X_train, X_test, y_train, y_test = train_test_split(
    df[feat], df['target'].values, test_size=0.2, shuffle=True,
    stratify=df['target'].values, random_state=42)
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
folds_idx = [(train_idx, val_idx)
             for train_idx, val_idx in folds.split(X_train, y=y_train)]
auc_scores = []
oof_preds = np.zeros(df[feat].shape[0])
test_preds = []
for n_fold, (train_idx, valid_idx) in enumerate(folds_idx):
    train_x, train_y = df[feat].iloc[train_idx], df['target'].iloc[train_idx]
    valid_x, valid_y = df[feat].iloc[valid_idx], df['target'].iloc[valid_idx]
    clf = lgb.LGBMClassifier(nthread=4, boosting_type='gbdt', is_unbalance=True,
                             random_state=42, learning_rate=0.05, max_depth=3,
                             reg_lambda=0.1, reg_alpha=0.01, min_child_samples=21,
                             subsample_for_bin=5000, metric='auc', n_estimators=5000)
    clf.fit(train_x, train_y,
            eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric='auc', verbose=False, early_stopping_rounds=100)
    explainer = shap.TreeExplainer(clf)
    # accumulate SHAP values for the shared holdout set across folds
    if shap_values is None:
        shap_values = explainer.shap_values(X_test)
    else:
        shap_values += explainer.shap_values(X_test)
    oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
    auc_scores.append(roc_auc_score(valid_y, oof_preds[valid_idx]))
print('AUC: ', np.mean(auc_scores))
shap_values /= 10  # number of folds
shap.summary_plot(shap_values, X_test)
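Per the rule quoted above, the expected_value should be averaged in the same way; a minimal sketch of that bookkeeping, with names assumed from the code above:
expected_values = []
# inside the fold loop, right after building the explainer:
#     expected_values.append(explainer.expected_value)
# after the loop:
base_value = np.mean(expected_values, axis=0)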
I am using LightGBM to find feature importances, but I am getting the error LightGBMError: b'len of label is not same with #data'.
X.shape
(73147, 12)
y.shape
(73147,)
Code:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import numpy as np
# Initialize an empty array to hold feature importances
feature_importances = np.zeros(X.shape[1])
# Create the model with several hyperparameters
model = lgb.LGBMClassifier(objective='binary', boosting_type = 'goss', n_estimators = 10000, class_weight = 'balanced')
# Fit the model twice to avoid overfitting
for i in range(2):
    # Split into training and validation set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=i)
    # Train using early stopping
    model.fit(X, y_train, early_stopping_rounds=100, eval_set=[(X_test, y_test)],
              eval_metric='auc', verbose=200)
    # Record the feature importances
    feature_importances += model.feature_importances_
You seem to have a typo in your code; instead of
model.fit(X, y_train, [...])
it should be
model.fit(X_train, y_train, [...])
As it is now, it is understandable that the lengths of X and y_train are not the same, hence your error.
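The corrected loop body, as a sketch with the question's own arguments:
model.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_test, y_test)],
          eval_metric='auc', verbose=200)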