I am trying to predict which of two values will appear in the column 'exit'. I have clean data (about 20 columns and 4k rows with typical customer information such as 'sex', 'age', ...). In the training dataset about 20% of customers were labelled '1'. I built two models, SVM and random forest, but both predict mostly '0' on the test dataset (almost every time), and the recall of both models is 0.
I attached the code where I think I may have made some stupid mistake. Any ideas why recall is so low despite ~80% accuracy?
import pandas as pd
import sklearn
from scipy import stats
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, make_scorer, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

def ml_model():
    print('sklearn: %s' % sklearn.__version__)
    df = pd.read_csv('clean_data.csv')
    df.head()
    feat = df.drop(columns=['target'], axis=1)
    label = df["target"]
    x_train, x_test, y_train, y_test = train_test_split(feat, label, test_size=0.3)
    sc_x = StandardScaler()
    x_train = sc_x.fit_transform(x_train)

    # SVC method
    support_vector_classifier = SVC(probability=True)
    # Randomized search over C and gamma, scored by ROC AUC
    rand_list = {"C": stats.uniform(0.1, 10),
                 "gamma": stats.uniform(0.1, 1)}
    auc = make_scorer(roc_auc_score)
    rand_search_svc = RandomizedSearchCV(support_vector_classifier, param_distributions=rand_list,
                                         n_iter=100, n_jobs=4, cv=3, random_state=42, scoring=auc)
    rand_search_svc.fit(x_train, y_train)
    support_vector_classifier = rand_search_svc.best_estimator_
    cross_val_svc = cross_val_score(estimator=support_vector_classifier, X=x_train, y=y_train, cv=10, n_jobs=-1)
    print("Cross Validation Accuracy for SVM: ", round(cross_val_svc.mean() * 100, 2), "%")
    predicted_y = support_vector_classifier.predict(x_test)
    tn, fp, fn, tp = confusion_matrix(y_test, predicted_y).ravel()
    precision_score = tp / (tp + fp)
    recall_score = tp / (tp + fn)
    print("Recall score SVC: ", recall_score)

    # Random forest
    random_forest_classifier = RandomForestClassifier()
    # Randomized search over tree-structure hyperparameters
    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 11),
                  "min_samples_split": sp_randint(2, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}
    rand_search_rf = RandomizedSearchCV(random_forest_classifier, param_distributions=param_dist,
                                        n_iter=100, cv=5, iid=False)
    rand_search_rf.fit(x_train, y_train)
    random_forest_classifier = rand_search_rf.best_estimator_
    cross_val_rfc = cross_val_score(estimator=random_forest_classifier, X=x_train, y=y_train, cv=10, n_jobs=-1)
    print("Cross Validation Accuracy for RF: ", round(cross_val_rfc.mean() * 100, 2), "%")
    predicted_y = random_forest_classifier.predict(x_test)
    tn, fp, fn, tp = confusion_matrix(y_test, predicted_y).ravel()
    precision_score = tp / (tp + fp)
    recall_score = tp / (tp + fn)
    print("Recall score RF: ", recall_score)

    # Predict on fresh data with whichever model cross-validated better
    new_data = pd.read_csv('new_data.csv')
    new_data = cleaning_data_to_predict(new_data)
    if round(cross_val_svc.mean() * 100, 2) > round(cross_val_rfc.mean() * 100, 2):
        predictions = support_vector_classifier.predict(new_data)
        predictions_proba = support_vector_classifier.predict_proba(new_data)
    else:
        predictions = random_forest_classifier.predict(new_data)
        predictions_proba = random_forest_classifier.predict_proba(new_data)
    with open("output.txt", "w") as f:
        for i in range(len(predictions)):
            print("id: ", i, "probability: ", predictions_proba[i][1], "exit: ", predictions[i], file=f)
If I have not missed it, you forgot to scale your test set.
So you need to scale it as well. Note that you should only transform it, not fit it again. See below.
x_test = sc_x.transform(x_test)
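For context, a minimal sketch of the intended pattern (fit the scaler on the training split only, then reuse the same statistics on the test split; variable names follow the code above):

sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)  # learn mean/std from the training data only
x_test = sc_x.transform(x_test)        # reuse those statistics; no re-fitting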
I agree with #e_kapti; also check the formulas for recall and accuracy, and consider using the F1 score as well (https://en.wikipedia.org/wiki/F1_score).
Recall = TP / (TP + FN)
Accuracy = (TP + TN) / (TP + TN + FP + FN)
with TP, FP, TN and FN being the numbers of true positives, false positives, true negatives and false negatives, respectively. Note that with only 20% positives in your data, a model that always predicts '0' already reaches 80% accuracy while its recall is 0, which is exactly the pattern you describe.
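These metrics also don't have to be derived by hand from the confusion matrix; a minimal sketch using scikit-learn's built-in scorers (assuming y_test and predicted_y from your code above):

from sklearn.metrics import precision_score, recall_score, f1_score

# binary case: the positive class is '1' by default
print("Precision:", precision_score(y_test, predicted_y))
print("Recall:", recall_score(y_test, predicted_y))
print("F1 score:", f1_score(y_test, predicted_y))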
I am trying to compute precision, recall, and F1 score on my test dataset. However, I am using the ImageDataGenerator format, not train_test_split arrays (x_train, y_train, x_test and y_test), which is why I couldn't find any references online.
IMAGE_SIZE = 224
BATCH_SIZE = 64
EPOCH = 30
CHANNEL = 3
CLASSES = 10
train_path = "/Users/ba/Documents/mycodes/datasets/DS/train"
valid_path = "/Users/ba/Documents/mycodes/datasets/DS/val"
test_path = "/Users/ba/Documents/mycodes/datasets/DS/test"
train_batches = ImageDataGenerator(preprocessing_function=tf.keras.applications.mobilenet_v3.preprocess_input) \
    .flow_from_directory(directory=train_path, target_size=(IMAGE_SIZE, IMAGE_SIZE), batch_size=BATCH_SIZE)
valid_batches = ImageDataGenerator(preprocessing_function=tf.keras.applications.mobilenet_v3.preprocess_input) \
    .flow_from_directory(directory=valid_path, target_size=(IMAGE_SIZE, IMAGE_SIZE), batch_size=BATCH_SIZE)
# shuffle=False keeps the prediction order aligned with test_batches.classes
test_batches = ImageDataGenerator(preprocessing_function=tf.keras.applications.mobilenet_v3.preprocess_input) \
    .flow_from_directory(directory=test_path, target_size=(IMAGE_SIZE, IMAGE_SIZE), batch_size=BATCH_SIZE, shuffle=False)
Then I tried to calculate precision, recall, and F1 in the following way:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
y_pred_logits = model.predict(test_batches)
y_pred = tf.math.argmax(y_pred_logits)
test_classes = test_batches.classes
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(test_classes, y_pred)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(test_classes, y_pred)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(test_classes, y_pred)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(test_classes, y_pred)
print('F1 score: %f' % f1)
Unfortunately it throws this error message:
ValueError: Found input variables with inconsistent numbers of samples: [1887, 10]
Can you help me rewrite the code, or point me to other references that use the ImageDataGenerator format?
So the problem was with y_true (test_classes) and y_pred: tf.math.argmax was applied without axis=1, so it reduced over the sample axis and returned one value per class (shape [10]) instead of one label per sample (shape [1887]), and for multi-class labels the metrics also need an average argument. With this, one can also calculate the confusion matrix.
# Making predictions
y_true = test_batches.classes  # correct order because the test generator uses shuffle=False
predictions = model.predict(x=test_batches, steps=len(test_batches), verbose=0)
y_pred = predictions.argmax(axis=1)  # one predicted class index per sample
print("Precision Score: ", precision_score(y_true, y_pred, average='micro'))
print("Recall Score: ", recall_score(y_true, y_pred, average='micro'))
print("F1 Score: ", f1_score(y_true, y_pred, average='micro'))
print("Accuracy Score: ", accuracy_score(y_true, y_pred))
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(img_array, img_labels,
                                                    shuffle=True, stratify=img_labels,
                                                    test_size=0.1, random_state=42)
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, verbose=1, min_delta=0.0001)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
hist = model.fit(x_train, y_train, epochs =50, batch_size=64,callbacks = [checkpoint,reduce_lr], validation_data=(x_test, y_test))
plt.subplot(1, 2, 1)
plt.ylabel('Loss', fontsize=16)
plt.plot(hist.history['loss'], color='b', label='Training Loss')
plt.plot(hist.history['val_loss'], color='r', label='Validation Loss')
plt.legend(loc='upper right')
plt.subplot(1, 2, 2)
plt.ylabel('Accuracy', fontsize=16)
plt.plot(hist.history['accuracy'], color='b', label='Training Accuracy')
plt.plot(hist.history['val_accuracy'], color='r', label='Validation Accuracy')
plt.legend(loc='lower right')
plt.show()
Output: (plots of training/validation loss and accuracy)
y_pred=model.predict(x_test)
y_pred=np.argmax(y_pred, axis=1)
y_test=np.argmax(y_test, axis=1)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)  # avoid shadowing the imported function
cm
# Finding the FP, FN, TP, TN values per class
FP = cm.sum(axis=0) - np.diag(cm)
FN = cm.sum(axis=1) - np.diag(cm)
TP = np.diag(cm)
TN = cm.sum() - (FP + FN + TP)
FP = FP.astype(float)
FN = FN.astype(float)
TP = TP.astype(float)
TN = TN.astype(float)
# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP)
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)
# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)
ACC*100
Output: array([87.21092226, 98.74616885, 85.81777654, 89.69072165, 80.46809696, 93.56366676, 83.75592087])
While the deep learning run reports an accuracy of around 60%, the accuracy calculated from the confusion matrix averages around 80-85%. What is the difference between these accuracies? Do the two numbers mean different things?
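For reference, the two computations do measure different things: the ACC above is a per-class, one-vs-rest binary accuracy that also counts true negatives, so it is usually much higher than the overall multi-class accuracy. A minimal sketch contrasting the two, assuming numpy as np and cm, TP, TN, FP, FN from the code above:

overall_acc = np.diag(cm).sum() / cm.sum()       # fraction of all samples classified correctly
per_class_acc = (TP + TN) / (TP + TN + FP + FN)  # one-vs-rest accuracy, one value per class
print(overall_acc, per_class_acc.mean())         # the per-class mean is typically the larger number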
I am trying to train XGBoost for multi-output regression in Python, and I am getting a ValueError. Thanks for helping.
Here is a sample of my data:
Layers   Model   Technique   Accuracy-1   Accuracy-2   Latency    time
18-27    Net     1           0.96         0.99         334368.0   0.99
38-37    MNet    1           0.76         0.99         313348.0   0.99
Below is my code using XGBoost
import numpy as np
import optuna
import xgboost as xgb
from functools import partial
from sklearn import metrics, model_selection

def optimize(trial, x, y, regressor):
    max_depth = trial.suggest_int("max_depth", 3, 30)
    n_estimators = trial.suggest_int("n_estimators", 100, 3000)
    max_leaves = trial.suggest_int("max_leaves", 1, 10)
    learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.3)  # example range; learning_rate must be suggested before use
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.0, 1.0)
    gamma = trial.suggest_uniform('gamma', 0.0, 0.05)
    min_child_weight = trial.suggest_uniform('min_child_weight', 1, 3)
    reg_lambda = trial.suggest_uniform('reg_lambda', 0.5, 1)
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        min_child_weight=min_child_weight,
        reg_lambda=reg_lambda,
        max_leaves=max_leaves)
    kf = model_selection.KFold(n_splits=5)
    error = []
    for train_idx, test_idx in kf.split(X=x, y=y):
        xtrain, ytrain = x[train_idx], y[train_idx]
        xtest, ytest = x[test_idx], y[test_idx]
        model.fit(xtrain, ytrain)  # fit on the training fold only, not on all of x, y
        y_pred = model.predict(xtest)
        fold_err = metrics.mean_squared_error(ytest, y_pred)
        error.append(fold_err)
    return np.mean(error)
def optimize_xgb(X, y):
    list_of_y = ["Target 1", "Target 2", "Target 3", "Target 4"]
    for i, m in zip(range(y.shape[1]), list_of_y):
        print("{} optimized Parameters on MSE Error".format(m))
        optimization_function = partial(optimize, x=X, y=y[:, i], regressor="random_forest")
        study = optuna.create_study(direction="minimize")
        study.optimize(optimization_function, n_trials=1)
data["Latency"] = minmax_scale(data["Latency"])
X = data[["Layers ","Model"]]
Y = data[['Accuracy-1', 'Accuracy-2','Latency', 'time ']]
encoder = OneHotEncoder(sparse=False)
onehot = encoder.fit_transform(X)
X_encoded = encoder.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
np.array(X_encoded), np.array(Y), test_size=0.3, random_state=42)
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.multioutput import MultiOutputRegressor as MOR

def modeling(X, y, max_depth=10, n_estimators=300, max_leaves=10,
             learning_rate=0.01, colsample_bytree=0.001, gamma=0.0001, min_child_weight=2,
             reg_lambda=0.3, optimize="no"):  # accept the optimize flag passed by the callers below
    model = xgb.XGBRegressor(objective='reg:squarederror',
                             n_estimators=n_estimators,
                             max_depth=max_depth,
                             max_leaves=max_leaves,
                             learning_rate=learning_rate,
                             gamma=gamma,
                             min_child_weight=min_child_weight,
                             colsample_bytree=colsample_bytree)
    if y.shape[1] == 1:
        print(" Apply Xgboost for one single Target....\n")
        model_xgb = model.fit(X, y)
    else:
        print(" Apply Xgboost for {} Targets....".format(y.shape[1]))
        model_xgb = MOR(model).fit(X, y)
    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
    scores = []
    for i in range(y.shape[1]):
        scores.append(np.abs(cross_val_score(model, X, y[:, i], scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)))
        print('Mean MSE of the {} target : {} ({})'.format(i, scores[i].mean(), scores[i].std()))
    return model_xgb
model_xgb = modeling(X_train,y_train,optimize="no")
y_estimated = model_xgb.predict(X_test)
mse(y_estimated,y_test)
################
y = np.random.random((1000,1))
model_xgb = modeling(X,y,optimize="no")
This is the error I get:
ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields Layers, Model
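For what it's worth, that ValueError is XGBoost complaining that raw string columns reached it: the last call passes the unencoded X (with the text columns 'Layers ' and 'Model') instead of the one-hot matrix. A minimal sketch of the idea, reusing the encoder fitted above and assuming X has 1000 rows to match y:

y = np.random.random((1000, 1))
X_num = encoder.transform(X)  # numeric one-hot matrix instead of raw strings
model_xgb = modeling(X_num, y, optimize="no")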
I've done DecisionTreeRegression as well as RandomForestRegression on the same dataset.
For RandomForest I've used the 5 best random combinations, and the results were all similar, as you'd expect. I've calculated the averages of R^2, RMSE and MAE and got
R^2: 0.7, MAE: 145716, RMSE: 251828.
For DecisionTree I've used repeated k-fold, calculated the averages and got:
R^2: 0.29, MAE: 121791, RMSE: 198280.
No transformations or scaling have been done on the response variable, which is home prices.
I'm new to statistics, but I'm pretty sure R^2 should be higher when MAE and RMSE are lower on the same unscaled dataset. That said, this dataset is pretty low in quality compared to the other datasets I'm using, which yield the expected proportions between error scores.
My question is: since this dataset is poor in quality, and I'm sure there will be negative R^2 values for the DecisionTree model on it, is it possible that taking the mean of the scores after cross-validation gives misleading results for R^2 when some of the fold values are not in the 0-1 interval, or is it more likely that there's an issue with the logic of my code (or something else)?
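One thing worth checking before blaming the code: fold-wise R^2 is computed as 1 - SS_res/SS_tot against each fold's own mean, so it is unbounded below, and a handful of strongly negative folds can drag the average down while MAE and RMSE stay moderate. A minimal sketch of the definition, plus inspecting the fold scores (r2_scores as in the code below):

import numpy as np

def r2_manual(y_true, y_pred):
    # R^2 = 1 - SS_res / SS_tot; can be arbitrarily negative for a bad fold
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - ss_res / ss_tot

print(np.sort(r2_scores)[:10])  # look for extreme negative folds before averaging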
import numpy as np
from numpy import absolute, mean
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

def decisionTreeRegression(df, features):
    df = df.sample(frac=1, random_state=0)
    scaler = StandardScaler()  # created but never applied; the data stays unscaled
    X = df[features]
    y = df[['Price']]
    param_grid = {'max_depth': np.arange(1, 40, 3)}
    tree = GridSearchCV(DecisionTreeRegressor(), param_grid, return_train_score=False)
    tree.fit(X, y)
    tree_final = DecisionTreeRegressor(max_depth=tree.best_params_['max_depth'])
    cv = RepeatedKFold(n_splits=5, n_repeats=100)
    mae_scores = cross_val_score(tree_final, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    mse_scores = cross_val_score(tree_final, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
    r2_scores = cross_val_score(tree_final, X, y, scoring='r2', cv=cv, n_jobs=-1)
    return makeScoresCV(mae_scores, mse_scores, r2_scores)

def makeScoresCV(mae_scores, mse_scores, r2_scores):
    # convert scores to positive
    mae_scores = absolute(mae_scores)
    mse_scores = absolute(mse_scores)
    # summarize the result
    s_mean = mean(mae_scores)
    s_mean2 = mean(mse_scores)
    s_mean3 = mean(r2_scores)
    return s_mean, np.sqrt(s_mean2), s_mean3

mae, rmse, r2 = decisionTreeRegression(df_de, fe_de)
print("mae : " + str(mae))
print("rmse : " + str(rmse))
print("r2 : " + str(r2))
Console:
mae : 153189.34673362423
rmse : 253284.5137707182
r2 : 0.30183525616923246
Random Forest (separate notebook):
scaler = StandardScaler()
X = df.drop('Price', axis = 1)
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=123, shuffle=True)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    rmse = np.sqrt(mean_squared_error(test_labels, predictions))
    r2 = r2_score(test_labels, predictions)  # from sklearn.metrics
    mae = np.sum(np.absolute(test_labels - predictions)) / len(predictions)
    return mae, r2, rmse

maes = []
rmses = []
r2s = []
for i in range(10):
    rf_random.fit(X_train, y_train)  # rf_random: a randomized hyperparameter search defined earlier in the notebook
    best_random = rf_random.best_estimator_
    mae, r2, rmse = evaluate(best_random, X_test, y_test)
    maes.append(mae)
    rmses.append(rmse)
    r2s.append(r2)
print("MAE")
print(math.fsum(maes) / len(maes))
print("RMSE")
print(math.fsum(rmses) / len(rmses))
print("R2")
print(math.fsum(r2s) / len(r2s))
Console:
MAE
145716.7264983288
RMSE
251828.40328030512
R2
0.7082730127977784
I'm inputting a large array of data, with a classification of 0 or 1, into scikit-learn. However, when I change the order of the columns in my array, the cross-validation accuracy of the test changes from 68% to 99%, with no change in the code. Can anyone think of why this would happen? The code is as follows:
full_data = np.array(Datacsv)
print full_data
# Want to predict Y
# the first column of the array is the classification status (0 or 1)
Y = full_data[:,0]
# Image data
#
X = full_data[:, 1:]
# http://scikit-learn.org/stable/modules/generate/sklearn.model_selection.StratifiedKFold.html
# provides train/test indices to split data into train/test sets (partitions into 10 random folds)
classification = SVC(kernel='linear',C=1)
d = classification.fit(X,Y)
scores = cross_validation.cross_val_score(classification,X,Y,cv=10, scoring='accuracy')
print scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(),scores.std()*2))
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,Y, test_size=0.4,random_state=0)
kfolds = StratifiedKFold(Y,n_folds=10, shuffle=True, random_state=None)
scores = []
sensitivity = []
specificity= []
PPV = []
NPV = []
ROC_AUC = []
for train_index, test_index in kfolds:
    Y_pred = classification.fit(X[train_index], Y[train_index]).predict(X[test_index])
    # Get accuracy score of each fold and append to scores list
    scores.append(accuracy_score(Y[test_index], Y_pred))
    # y_prob = svm.fit(X[train_index], y[train_index]).predict_proba(X[test_index])
    # Generate confusion matrix from which we can get TP, FP, TN and FN
    confusion = confusion_matrix(Y[test_index], Y_pred)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    cv_sens = TP / float(TP + FN)  # Sensitivity
    sensitivity.append(cv_sens)
    cv_spec = TN / float(TN + FP)  # Specificity
    specificity.append(cv_spec)
    cv_ppv = TP / float(TP + FP)  # PPV
    PPV.append(cv_ppv)
    cv_npv = TN / float(TN + FN)  # NPV
    NPV.append(cv_npv)
    cv_roc_auc = roc_auc_score(Y[test_index], Y_pred)
    ROC_AUC.append(cv_roc_auc)
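As a side note on the loop above: newer scikit-learn can collect several of these per-fold metrics in one call via model_selection (the old cross_validation module was removed). A minimal sketch under that assumption, reusing the same X and Y; recall corresponds to sensitivity and precision to PPV (specificity and NPV have no built-in scorer):

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.svm import SVC

scoring = {"accuracy": "accuracy", "sensitivity": "recall", "ppv": "precision", "auc": "roc_auc"}
cv = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_validate(SVC(kernel='linear', C=1), X, Y, cv=cv, scoring=scoring)
print(results["test_accuracy"].mean(), results["test_sensitivity"].mean())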