CatBoost border - python

I can't start training with CatBoost because of a "border" error.
import pandas
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

X = pandas.read_csv("../input/x_y_test/X.csv")
X_test = pandas.read_csv("../input/x_y_test/X_test.csv")
y = pandas.read_csv("../input/y-data/y.csv")

X = X.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y = y.reset_index(drop=True)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.3, random_state=1337)

X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

model_cb = CatBoostClassifier(eval_metric="Accuracy", n_estimators=1200, random_seed=70)
model_cb.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)
This is the error I got:
CatboostError: catboost/libs/metrics/metric.cpp:3929: All train targets are greater than border 0.5
Data:
https://drive.google.com/drive/folders/1m7bNIs0mZQQkAsvkETB3n6j62p9QJX39?usp=sharing

Your main error is that you're feeding y_train to the algorithm as a two-column frame:
       id  skilled
0  138177        0
1   36214        0
2  103206        1
3   22699        1
4   96145        1
I believe what you really intended was just y_train.skilled
Run the reassignments below before fitting and you're good to go:
y_train = y_train.skilled  # just the skilled column is enough
y_val = y_val.skilled  # just the skilled column is enough
model_cb = CatBoostClassifier(eval_metric = "Accuracy", n_estimators = 1200, random_seed = 70)
model_cb.fit(X_train, y_train, eval_set = (X_val, y_val), use_best_model = True)
On a side note, do you really believe id in X_train possesses any predictive ability? Why not drop it from the features as well?
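A minimal sketch of that (assuming the column is literally named id in these frames):

X_train = X_train.drop(columns=["id"])
X_val = X_val.drop(columns=["id"])
X_test = X_test.drop(columns=["id"])   # keep train/val/test feature sets consistent

model_cb = CatBoostClassifier(eval_metric="Accuracy", n_estimators=1200, random_seed=70)
model_cb.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)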

Related

Decreasing error in test data with small dataset

I have a problem: the training error is very good, but the test error is very bad. I've already used PCA to reduce the feature dimensionality, and these are the best results I can get so far, but they are still not good enough on the test data:
XGBoost:
R2 Score: 0.559832465443366
MSE: 0.021168084677487115
RMSE: 0.1454925588388874
MAE: 0.12313938140869134
dataset: https://docs.google.com/spreadsheets/d/1xLTv4jLh7j3sTh0UKMHnSUvMXx1qNiXZ/edit?usp=share_link&ouid=116330084208220275542&rtpof=true&sd=true
This is my code:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

dataset = pd.read_excel('Data.xlsx')
x = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 4)

sc = StandardScaler()
x_train[:, :] = sc.fit_transform(x_train[:, :])
x_test[:, :] = sc.transform(x_test[:, :])

pca = PCA(n_components = 4)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

rf = RandomForestRegressor()
adb = AdaBoostRegressor()
xgb_reg = xgb.XGBRegressor()  # renamed so the xgboost module is not shadowed
gbrt = GradientBoostingRegressor()

rf_parameters = {'n_estimators':[200,500],'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'], 'max_features': ['sqrt', 'log2', None]}
adb_parameters = {'n_estimators':[200,500],'loss':['linear', 'square', 'exponential']}
xgb_parameters = {'booster':['gbtree', 'dart'],
                  'sampling_method':['uniform', 'gradient_based'],
                  'tree_method':['auto','exact','approx','hist','gpu_hist'],
                  'n_estimators':[200,500]}
gbrt_parameters = {'loss':['squared_error', 'absolute_error', 'huber', 'quantile'],'n_estimators':[200,500],'criterion':['friedman_mse', 'squared_error'], 'max_features':['auto', 'sqrt', 'log2']}

rf_grid = GridSearchCV(rf, rf_parameters, cv = 8, n_jobs = -1)
adb_grid = GridSearchCV(adb, adb_parameters, cv = 8, n_jobs = -1)
xgb_grid = GridSearchCV(xgb_reg, xgb_parameters, cv = 8, n_jobs = -1)
gbrt_grid = GridSearchCV(gbrt, gbrt_parameters, cv = 8, n_jobs = -1)

rf_grid.fit(x_train, y_train)
adb_grid.fit(x_train, y_train)
xgb_grid.fit(x_train, y_train)
gbrt_grid.fit(x_train, y_train)

y_pred_rf = rf_grid.predict(x_test)
y_pred_adb = adb_grid.predict(x_test)
y_pred_xgb = xgb_grid.predict(x_test)
y_pred_gbrt = gbrt_grid.predict(x_test)
What should I do to reduce the test error, given that the dataset only consists of 60 samples and I'm using an 80-20 split? Thank you.
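As a hedged aside (not part of the original post): with only 60 samples, a single 80-20 split gives a very noisy error estimate, so repeated k-fold cross-validation over the whole preprocessing-plus-model pipeline is one common way to measure the test error more stably. A minimal sketch, assuming the same Data.xlsx layout as above and using a random forest as the example model:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold, cross_val_score

dataset = pd.read_excel('Data.xlsx')          # same file as above
x = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

# Scaling and PCA live inside the pipeline so each fold is fit only on its
# own training portion, avoiding leakage into the held-out fold.
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=4)),
    ('model', RandomForestRegressor(n_estimators=200, random_state=4)),
])

cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=4)
scores = cross_val_score(pipe, x, y, cv=cv, scoring='r2')
print(scores.mean(), scores.std())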

Python: Doing MNLogit, outputs only show 6 of the 7 classes of the target variable

There are 7 classes (3, 4, 5, 6, 7, 8, 9) of the target variable ('quality'). The output only shows classes 1-6 after encoding, or 4-9 without encoding. Am I missing something?
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

reduced_data = pd.read_csv('winequality-white.csv', sep=';')
reduced_data[['quality']] = OrdinalEncoder().fit_transform(reduced_data[['quality']].to_numpy().reshape(-1, 1))
reduced_data = reduced_data.dropna()

model_1 = reduced_data.drop(columns=['fixed acidity','volatile acidity','citric acid',
                                     'residual sugar','pH','free sulfur dioxide','total sulfur dioxide','sulphates'])

X_train, X_test, Y_train, Y_test = train_test_split(model_1.drop(columns=['quality']), model_1['quality'],
                                                    test_size=0.25, random_state=2)
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

model_reduced = sm.MNLogit(Y_train, X_train)
results_reduced = model_reduced.fit()
print(results_reduced.summary())
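For context (an aside, not from the original post): statsmodels' MNLogit estimates coefficients for K-1 outcome categories relative to a reference category, which is the lowest class label. With 7 quality levels the summary therefore lists only 6 equations: classes 1-6 when the target is encoded to 0-6, or 4-9 when the raw labels 3-9 are used, with the lowest class as the baseline. A minimal check:

import numpy as np

print(np.unique(Y_train))              # all 7 classes are present in the training target
print(results_reduced.params.shape)    # but only 6 coefficient columns: one per non-reference class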

Neural network does not predict properly with Count Vectorizer

I'm trying to do Sentiment Analysis prediction using the text and scores of random IMDB reviews. I turned all the words into a bag of words and fed everything into a neural network. The prediction, however, does not seem to be correct: it always shows a 50% positive and 50% negative prediction for anything I type as a review.
import numpy as np
import pandas as pd
import tensorflow
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

reviews = pd.read_csv('reviews.txt', header=None)
labels = pd.read_csv('labels.txt', header=None)
Y = (labels == 'positive').astype(np.int_)

print(type(reviews))
print(reviews.head())
print(labels.head())

# Split into train/validation/test
x_train, x_test, y_train, y_test = train_test_split(reviews, Y)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train)

# min_df = 19 seems to be the first value that caps the vocabulary at 10 000 entries,
# i.e. the 10 000 most commonly used words
vect = CountVectorizer(min_df=19, max_features=10000)
fitter = vect.fit(x_train[0])

X_train = fitter.transform(x_train[0])
X_test = fitter.transform(x_test[0])
X_val = fitter.transform(x_val[0])

print("Vocabulary size: {}".format(len(vect.vocabulary_)))
feature_names = vect.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
print("Vocabulary content:\n {}".format(fitter.vocabulary_))

X_train = pad_sequences(X_train.toarray(), maxlen=100, value=0.)
X_test = pad_sequences(X_test.toarray(), maxlen=100, value=0.)
X_val = pad_sequences(X_val.toarray(), maxlen=100, value=0.)

Y_train = to_categorical(y_train, 2)
Y_test = to_categorical(y_test, 2)
Y_val = to_categorical(y_val, 2)

tensorflow.reset_default_graph()
input_layer = tflearn.input_data(shape=[None, 100])
net = tflearn.embedding(input_layer, input_dim=10000, output_dim=128)
hid = tflearn.fully_connected(input_layer, 10, activation='tanh')  # a hidden layer with 10 neurons
output_layer = tflearn.fully_connected(hid, 2, activation='softmax')

sgd = tflearn.SGD(learning_rate=0.04, lr_decay=0.96, decay_step=1000)
net = tflearn.regression(output_layer, optimizer=sgd, loss='categorical_crossentropy')
model = tflearn.DNN(net, tensorboard_verbose=3, tensorboard_dir='tfdir')

try:
    model.fit(X_train, Y_train, n_epoch=5, validation_set=(X_val, Y_val), batch_size=100, show_metric=True, run_id="Imdb")
except KeyboardInterrupt as e:
    print("Stopped by user")
The training, validation and test accuracy never exceeds ~0.65 no matter how much I tune the hyperparameters.
my_review = "This movie sucks"
my_review_enc = fitter.transform([my_review])
my_review_enc_pad = pad_sequences(my_review_enc.toarray(), maxlen=100, value=0.)
prediction = model.predict(my_review_enc_pad)
prediction
As you can see, the positive and negative predictions are always at 50%.
What am I doing wrong?
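As a hedged aside (not from the original post): one quick way to check whether the bag-of-words features themselves carry signal, independent of the pad_sequences step and the network, is a plain linear baseline on the full sparse count matrix:

from sklearn.linear_model import LogisticRegression

bow_train = fitter.transform(x_train[0])     # full sparse matrix, no truncation to 100 columns
bow_test = fitter.transform(x_test[0])

clf = LogisticRegression(max_iter=1000)
clf.fit(bow_train, y_train.values.ravel())
print("BoW baseline accuracy:", clf.score(bow_test, y_test.values.ravel()))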

train_test_split not splitting data

There is a dataframe consisting of 14 columns in total; the last column is the target label with integer values 0 or 1.
I have defined:
X = df.iloc[:,1:13] ---- this consists of the feature values
y = df.iloc[:,-1] ------ this consists of the corresponding labels
Both have the same length as desired: X is a dataframe with 13 columns, shape (159880, 13), and y is 1-D with shape (159880,).
But when I perform train_test_split() on X and y, the function does not seem to work properly.
Below is the straightforward code:
X_train, y_train, X_test, y_test = train_test_split(X, y, random_state = 0)
After this split, both X_train and X_test have shape (119910, 13), y_train has shape (39970, 13), and y_test has shape (39970,).
This is weird; even after setting the test_size parameter, the results stay the same.
Please advise what could be going wrong.
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_feature_importances
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

def model():
    df = pd.read_csv('train.csv', encoding = 'ISO-8859-1')
    df = df[np.isfinite(df['compliance'])]
    df = df.fillna(0)
    df['compliance'] = df['compliance'].astype('int')
    df = df.drop(['grafitti_status', 'violation_street_number','violation_street_name','violator_name',
                  'inspector_name','mailing_address_str_name','mailing_address_str_number','payment_status',
                  'compliance_detail', 'collection_status','payment_date','disposition','violation_description',
                  'hearing_date','ticket_issued_date','mailing_address_str_name','city','state','country',
                  'violation_street_name','agency_name','violation_code'], axis=1)
    df['violation_zip_code'] = df['violation_zip_code'].replace(['ONTARIO, Canada',', Australia','M3C1L-7000'], 0)
    df['zip_code'] = df['zip_code'].replace(['ONTARIO, Canada',', Australia','M3C1L-7000'], 0)
    df['non_us_str_code'] = df['non_us_str_code'].replace(['ONTARIO, Canada',', Australia','M3C1L-7000'], 0)
    df['violation_zip_code'] = pd.to_numeric(df['violation_zip_code'], errors='coerce')
    df['zip_code'] = pd.to_numeric(df['zip_code'], errors='coerce')
    df['non_us_str_code'] = pd.to_numeric(df['non_us_str_code'], errors='coerce')
    #df.violation_zip_code = df.violation_zip_code.replace('-','', inplace=True)
    df['violation_zip_code'] = np.nan_to_num(df['violation_zip_code'])
    df['zip_code'] = np.nan_to_num(df['zip_code'])
    df['non_us_str_code'] = np.nan_to_num(df['non_us_str_code'])

    X = df.iloc[:,0:13]
    y = df.iloc[:,-1]
    X_train, y_train, X_test, y_test = train_test_split(X, y, random_state = 0)
    print(y_train.shape)
You have mixed up the order of the outputs of train_test_split; it should be:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=0)
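For illustration, a quick sanity check after the corrected call (the shapes shown are what an 80-20 split of 159,880 rows would give):

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Rows of X and y now line up on both sides of the split.
print(X_train.shape, y_train.shape)   # e.g. (127904, 13) (127904,)
print(X_test.shape, y_test.shape)     # e.g. (31976, 13) (31976,)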

Doing cross validation from scratch

I found this function definition on Stack Overflow:
from numpy import array
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def fold_i_of_k(dataset, i, k):
    n = len(dataset)
    return len(dataset[n*(i-1)//k:n*i//k])

# this is my code below
# Constants
FOLD_I = 1
FOLD_K = 10

# Creating 10 folds
counter = 1
s = 0
total_ac = 0
while counter != FOLD_K + 1:
    print("Fold ", counter)
    fold = fold_i_of_k(dataset, counter, 10)
    d_fold = dataset[s:s + fold]
    #print(d_fold.index.values)
    #print(d_fold.iloc[1:3,0:2])
    d_test = d_fold
    X_test = d_test.iloc[:, 0:11]
    y_test = d_test.iloc[:, 11:12]
    d_train = dataset.drop(dataset.index[s:s + fold])
    X_train = d_train.iloc[:, 0:11]
    y_train = d_train.iloc[:, 11:12]
    ##print(dataset)
    ##print(d_fold)
    ##print(d_train)
    ##print(d_test)
    ##print(len(X_train))
    ##print(len(y_train))
    ##print(X_test)
    ##print(y_test)
    #print(fold)
    X_train = X_train.as_matrix()
    X_train = preprocessing.scale(X_train)
    y_train = y_train.as_matrix()
    X_test = X_test.as_matrix()
    X_test = preprocessing.scale(X_test)
    y_test = y_test.as_matrix()
    #l1 = len(y_train)
    #np.reshape(y_train, l1)
    #print(y_train)
    #l = len(y_test)
    #np.reshape(y_test, l)
    #print(y_test)
    #data.reshape((data.shape[0], 1))   # note: 'data' is not defined in this snippet
    y_train = array(y_train)
    print(y_train.shape)
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    lr_pred = lr.predict(X_test)
    ac = accuracy_score(y_test, lr_pred)
    print(ac)
    ##print(classification_report(y_test, lr_pred))
    total_ac = total_ac + ac
    s = s + fold
    counter = counter + 1

total_ac = total_ac / FOLD_K
print("Cross validation accuracy is: ", total_ac)
I am getting following error:
/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:578:
DataConversionWarning: A column-vector y was passed when a 1d array
was expected. Please change the shape of y to (n_samples, ), for
example using ravel().
y = column_or_1d(y, warn=True)
How can I fix it?
y_train.ravel() solved the problem.
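For illustration, this is where the fix goes in the loop above; ravel() flattens the (n, 1) column vector produced by .iloc[:, 11:12] into the 1-D shape scikit-learn expects:

lr = LogisticRegression()
lr.fit(X_train, y_train.ravel())      # 1-D target: no DataConversionWarning
lr_pred = lr.predict(X_test)
ac = accuracy_score(y_test, lr_pred)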
