Iterative split of multilabel classification dataset in pandas dataframe - python

I have a dataset that contains a text column with string values and multiple columns with value 1 or 0 (classified or not). I want to use skmultilearn to split this data with an even label distribution, but I get this error:
KeyError: 'key of type tuple not found and not a MultiIndex'
And this is my code:
import pandas as pd
from skmultilearn.model_selection import iterative_train_test_split
# Load the whole CSV; at this point y still contains the "text" column as well.
y = pd.read_csv("dataset.csv")
# pop() removes "text" from y and returns it, so y keeps only the remaining columns.
x = y.pop("text")
# NOTE(review): x and y are still pandas objects here. The KeyError quoted above is
# presumably raised inside this call because the splitter indexes its inputs like
# NumPy arrays -- TODO confirm against the skmultilearn version in use.
x_train, x_test, y_train, y_test = iterative_train_test_split(x, y, test_size=0.1)

Here is what worked for me (this is 98/1/1 split):
import os
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
def _stratified_pair(features, labels, test_size, random_state=42):
    """Return the two index arrays of one stratified multilabel shuffle split."""
    splitter = MultilabelStratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state
    )
    # Only one split is needed, so request exactly one and take it directly
    # instead of looping and keeping whatever the last iteration produced.
    return next(splitter.split(features, labels))


def _to_frame(texts, labels, tag_names):
    """Rebuild a DataFrame with the text in a leading "snippet" column."""
    frame = pd.DataFrame(data=labels, columns=tag_names)
    frame.insert(0, "snippet", texts)
    return frame


def main():
    """Split ``dataset.csv`` into stratified train / validation / test frames.

    The CSV must have a "text" column; every other column is treated as a
    label column. 2 % of the rows are held out first, and that held-out part
    is then halved into test and validation (a 98/1/1 split).

    Returns:
        tuple: ``(df_train, df_val, df_test)``; each frame has the text in a
        "snippet" column followed by the original label columns.
    """
    # load dataset
    y = pd.read_csv("dataset.csv")
    x = y.pop("text")
    # save tag names to reuse them later for creating pandas DataFrames
    tag_names = y.columns
    # Data has to be in ndarray format
    y = y.to_numpy()
    x = x.to_numpy()
    # split to train / held-out
    train_index, held_out_index = _stratified_pair(x, y, test_size=0.02)
    x_train, x_test_temp = x[train_index], x[held_out_index]
    y_train, y_test_temp = y[train_index], y[held_out_index]
    # make some memory space
    del x
    del y
    # split the held-out part to test / validation
    test_index, val_index = _stratified_pair(x_test_temp, y_test_temp, test_size=0.5)
    x_test, x_val = x_test_temp[test_index], x_test_temp[val_index]
    y_test, y_val = y_test_temp[test_index], y_test_temp[val_index]
    # train / validation / test datasets
    df_train = _to_frame(x_train, y_train, tag_names)
    df_val = _to_frame(x_val, y_val, tag_names)
    df_test = _to_frame(x_test, y_test, tag_names)
    return df_train, df_val, df_test


if __name__ == "__main__":
    main()

Related

how to change my code to use k fold cross validation with k = 5

I want to change my code so that instead of this part:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100, test_size=0.2)
# Build train/test frames that carry the features plus a 'target' label column.
train_data = X_train.copy()
train_data.loc[:, 'target'] = y_train
test_data = X_test.copy()
test_data.loc[:, 'target'] = y_test
data_config = DataConfig(
    # target should always be a list. Multi-targets are only supported for
    # regression. Multi-Task Classification is not implemented
    target=['target'],
    # Take the feature names from X_train: train_data also holds 'target' by
    # now, and listing the label as an input feature would leak it to the model.
    continuous_cols=X_train.columns.tolist(),
    categorical_cols=[],
    normalize_continuous_features=True,
)
trainer_config = TrainerConfig(
    auto_lr_find=True,
    batch_size=64,
    max_epochs=10,
)
optimizer_config = {
    'optimizer': 'Adam',
    'optimizer_params': {'weight_decay': 0, 'amsgrad': False},
    'lr_scheduler': None,
    'lr_scheduler_params': {},
    'lr_scheduler_monitor_metric': 'valid_loss',
}
model_config = NodeConfig(
    task="classification",
    num_layers=2,
    num_trees=512,
    learning_rate=1,
    embed_categorical=True,
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
tabular_model.fit(train=train_data, test=test_data)
pred = tabular_model.predict(test_data)
pred['prediction'] = pred['prediction'].astype(int)
# Collapse every positive class id to 1, touching only the 'prediction' column
# (a row-only mask would overwrite every column of the matching rows).
pred.loc[pred['prediction'] >= 1, 'prediction'] = 1
print_metrics(test_data['target'], pred["prediction"].astype('int'), tag="Holdout")
I want to Use the K fold method with k = 5 or 10.
Thank you for your advice.
The complete code example that I have used method train_test_split is above.
Here is an example of the k-fold method:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
X, y = datasets.load_iris(return_X_y=True)
X.shape, y.shape
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.4, random_state=0)
X_train.shape, y_train.shape
X_test.shape, y_test.shape
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)
result (in this example):
0.9666666666666667
The example is from here: https://scikit-learn.org/stable/modules/cross_validation.html

How to get the train and test data for each fold in kfold cross validation?

How can I access the train and test data for each fold in cross validation? I would like to save these in .csv files. I tried using the split function which generates the indices but it returns a generator object, not the indices.
from sklearn.model_selection import StratifiedKFold, KFold
import numpy as np
X, y = np.ones((50, 1)), np.hstack(([0] * 45, [1] * 5))
skf = StratifiedKFold(n_splits=3)
# split() is lazy: it returns a generator of (train_indices, test_indices) pairs.
# No `groups` argument is passed: the name was never defined in this snippet,
# and StratifiedKFold ignores that parameter anyway.
x = skf.split(X, y)
x
Output:
<generator object _BaseKFold.split at 0x7ff195979580>
StratifiedKFold.split returns a generator, therefore you need to iterate over it as follows:
skf = StratifiedKFold(n_splits=3)
# Every iteration yields the row indices of one fold's train and test parts.
for train_index, test_index in skf.split(X, y):
    # Select this fold's feature rows ...
    X_train = X[train_index]
    X_test = X[test_index]
    # ... and the matching labels.
    y_train = y[train_index]
    y_test = y[test_index]

Shuffle and split 2 numpy arrays so as to maintain their ordering with respect to each other

I have 2 numpy arrays X and Y, with shape X: [4750, 224, 224, 3] and Y: [4750,1].
X is the training dataset and Y is the correct output label for each entry.
I want to split the data into train and test sets so that I can validate my machine learning model. Therefore, I want to split them randomly in a way that keeps X and Y aligned, i.e. every row of X still has its corresponding label after the split.
How can I achieve the above objective ?
This is how I would do it
def split(x, y, train_ratio=0.7):
    """Randomly split two paired arrays into a train part and a test part.

    The same boolean mask is applied to both arrays, so every row of ``x``
    keeps its matching entry of ``y``. Within each part the rows keep their
    original relative order.

    Args:
        x: array indexed along axis 0.
        y: array with the same length along axis 0 as ``x``.
        train_ratio: fraction of rows, between 0 and 1, assigned to the train
            part. The row count is truncated with ``int``.

    Returns:
        ``((x_train, y_train), (x_test, y_test))``.

    Raises:
        ValueError: if ``train_ratio`` is not within ``[0, 1]``.
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError("train_ratio must be between 0 and 1")
    x_size = x.shape[0]
    train_size = int(x_size * train_ratio)
    # Draw distinct row indices for the train part; every other row is test.
    train_indices = np.random.choice(x_size, size=train_size, replace=False)
    mask = np.zeros(x_size, dtype=bool)
    mask[train_indices] = True
    return (x[mask], y[mask]), (x[~mask], y[~mask])
I simply choose the required number of indices I need (randomly) for my train set, remaining will be for the test set.
Then use a mask to select the train and test samples.
You can also use the scikit-learn train_test_split to split your data using just 2 lines of code :
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)
sklearn.model_selection.train_test_split is a good choice!
But to craft one of your own
import numpy as np
def my_train_test_split(X, Y, train_ratio=0.8):
    """Shuffle X and Y together and cut them into a train and a test part.

    Returns X_train, Y_train, X_test, Y_test (in that order).
    """
    n_rows = X.shape[0]
    cut = int(n_rows * train_ratio)
    # One shared random ordering keeps each row of X paired with its entry in Y.
    order = np.random.permutation(n_rows)
    train_rows = order[:cut]
    test_rows = order[cut:]
    return X[train_rows], Y[train_rows], X[test_rows], Y[test_rows]

train_test_split not splitting data

There is a dataframe that consists of 14 columns in total, the last column is the target label with integer values = 0 or 1.
I have defined:
X = df.iloc[:,1:13] ---- this consists of the feature values
y = df.iloc[:,-1] ------ this consists of the corresponding labels
Both have the same length as desired, X is the dataframe that consists of 13 columns, shape (159880, 13), y is an array type with shape(159880,)
But when I perform train_test_split() on X,y- the function is not working properly.
Below is the straightforward code:
X_train, y_train, X_test, y_test = train_test_split(X, y, random_state = 0)
After this split, both X_train and X_test have shape (119910,13). y_train is having shape (39970,13) and y_test is having shape (39970,)
This is weird, even after defining test_size parameter, the results stay the same.
Please advise, what could have been going wrong.
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_feature_importances
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
def model():
# Load the training CSV, clean it, and attempt a train/test split of features and label.
df = pd.read_csv('train.csv', encoding = 'ISO-8859-1')
# Keep only rows whose 'compliance' value is finite, then fill remaining gaps with 0.
df = df[np.isfinite(df['compliance'])]
df = df.fillna(0)
df['compliance'] = df['compliance'].astype('int')
# Drop columns that are not used further.
# NOTE(review): 'mailing_address_str_name' and 'violation_street_name' each appear twice in this list.
df = df.drop(['grafitti_status', 'violation_street_number','violation_street_name','violator_name',
'inspector_name','mailing_address_str_name','mailing_address_str_number','payment_status',
'compliance_detail', 'collection_status','payment_date','disposition','violation_description',
'hearing_date','ticket_issued_date','mailing_address_str_name','city','state','country',
'violation_street_name','agency_name','violation_code'], axis=1)
# Map three known non-numeric strings to 0 in each of the code columns.
df['violation_zip_code'] = df['violation_zip_code'].replace(['ONTARIO, Canada',', Australia','M3C1L-7000'], 0)
df['zip_code'] = df['zip_code'].replace(['ONTARIO, Canada',', Australia','M3C1L-7000'], 0)
df['non_us_str_code'] = df['non_us_str_code'].replace(['ONTARIO, Canada',', Australia','M3C1L-7000'], 0)
# Convert to numbers; anything that still cannot be parsed becomes NaN (errors='coerce').
df['violation_zip_code'] = pd.to_numeric(df['violation_zip_code'], errors='coerce')
df['zip_code'] = pd.to_numeric(df['zip_code'], errors='coerce')
df['non_us_str_code'] = pd.to_numeric(df['non_us_str_code'], errors='coerce')
#df.violation_zip_code = df.violation_zip_code.replace('-','', inplace=True)
# Replace the NaN values produced by the coercion above.
df['violation_zip_code'] = np.nan_to_num(df['violation_zip_code'])
df['zip_code'] = np.nan_to_num(df['zip_code'])
df['non_us_str_code'] = np.nan_to_num(df['non_us_str_code'])
# Features: the first 13 columns. Label: the last column.
X = df.iloc[:,0:13]
y = df.iloc[:,-1]
# NOTE(review): the names on the left do not match the order in which train_test_split
# returns its results (X_train, X_test, y_train, y_test -- see the corrected call later in
# this document), so `y_train` here receives the feature test split.
X_train, y_train, X_test, y_test = train_test_split(X, y, random_state = 0)
print(y_train.shape)
You have mixed up the results of train_test_split, it should be
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=0)
if args.mode == "train":
# Load Data
data, labels = load_dataset('C:/Users/PC/Desktop/train/k')
# Train ML models
knn(data, labels,'C:/Users/PC/Desktop/train/knn.pkl' )

writing a train_test_split function with numpy

I am trying to write my own train test split function using numpy instead of using sklearn's train_test_split function. I am splitting the data into 70% training and 30% test. I am using the boston housing data set from sklearn.
This is the shape of the data:
housing_features.shape #(506,13) where 506 is sample size and it has 13 features.
This is my code:
city_data = datasets.load_boston()
housing_prices = city_data.target
housing_features = city_data.data
def shuffle_split_data(X, y):
# Split X and y row-wise into a train and a test part using one random boolean mask.
# One uniform random number is drawn per row; the row goes to the train part when
# its number is below 0.7. Rows are decided independently, so the train size
# varies from run to run instead of being exactly 70 % of the rows.
split = np.random.rand(X.shape[0]) < 0.7
X_Train = X[split]
y_Train = y[split]
# The complement of the mask forms the test part.
X_Test = X[~split]
y_Test = y[~split]
print len(X_Train), len(y_Train), len(X_Test), len(y_Test)
return X_Train, y_Train, X_Test, y_Test
try:
X_train, y_train, X_test, y_test = shuffle_split_data(housing_features, housing_prices)
print "Successful"
except:
print "Fail"
The print output i got is:
362 362 144 144
"Successful"
But i know it was not successful because i get a different numbers for the length when i run it again Versus just using SKlearn's train test function and always get 354 for the length of X_train.
#correct output
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(housing_features, housing_prices, test_size=0.3, random_state=42)
print len(X_train)
#354
What am I missing in my function?
Because you're using np.random.rand, each row is assigned to the training set independently with probability 0.7, so the training share is only approximately 70% (it gets closer to 70% as the number of rows grows). You could use np.percentile to get the 70th-percentile value of the random numbers and then compare against that value as you did:
def shuffle_split_data(X, y):
    """Randomly split X and y row-wise into a 70 % train and 30 % test part.

    One random number is drawn per row, and the rows whose number lies below
    the 70th percentile of those draws form the train part. Because the
    threshold is taken from the draws themselves, the train size is the same
    on every run for a given number of rows.

    Args:
        X: array of samples, indexed along axis 0.
        y: array of targets with the same length as ``X``.

    Returns:
        ``X_train, y_train, X_test, y_test``.
    """
    arr_rand = np.random.rand(X.shape[0])
    split = arr_rand < np.percentile(arr_rand, 70)
    X_train = X[split]
    y_train = y[split]
    X_test = X[~split]
    y_test = y[~split]
    # Report the four lengths, space-separated; this form prints identically
    # on Python 2 and Python 3.
    print("%d %d %d %d" % (len(X_train), len(y_train), len(X_test), len(y_test)))
    return X_train, y_train, X_test, y_test
EDIT
Alternatively you could use np.random.choice to select indices with your desired amount. For your case:
# replace=False so that no row index is drawn twice (the default samples with replacement)
np.random.choice(X.shape[0], int(0.7*X.shape[0]), replace=False)

Categories