train_test_split not splitting data - python

There is a dataframe that consists of 14 columns in total; the last column is the target label, with integer values 0 or 1.
I have defined:
X = df.iloc[:,1:13]  # the feature values
y = df.iloc[:,-1]    # the corresponding labels
Both have the desired length: X is a dataframe of 13 columns with shape (159880, 13), and y is an array with shape (159880,).
But when I perform train_test_split() on X and y, the function does not split the data as expected.
Below is the straightforward code:
X_train, y_train, X_test, y_test = train_test_split(X, y, random_state = 0)
After this split, both X_train and X_test have shape (119910, 13), y_train has shape (39970, 13), and y_test has shape (39970,).
This is weird; even after setting the test_size parameter, the results stay the same.
Please advise what could be going wrong.
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_feature_importances
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
def model():
    df = pd.read_csv('train.csv', encoding='ISO-8859-1')
    df = df[np.isfinite(df['compliance'])]
    df = df.fillna(0)
    df['compliance'] = df['compliance'].astype('int')
    # Drop identifier and leakage columns that won't be used as features
    df = df.drop(['grafitti_status', 'violation_street_number', 'violation_street_name',
                  'violator_name', 'inspector_name', 'mailing_address_str_name',
                  'mailing_address_str_number', 'payment_status', 'compliance_detail',
                  'collection_status', 'payment_date', 'disposition', 'violation_description',
                  'hearing_date', 'ticket_issued_date', 'city', 'state', 'country',
                  'agency_name', 'violation_code'], axis=1)
    # Replace known non-numeric entries, then coerce the columns to numbers
    df['violation_zip_code'] = df['violation_zip_code'].replace(['ONTARIO, Canada', ', Australia', 'M3C1L-7000'], 0)
    df['zip_code'] = df['zip_code'].replace(['ONTARIO, Canada', ', Australia', 'M3C1L-7000'], 0)
    df['non_us_str_code'] = df['non_us_str_code'].replace(['ONTARIO, Canada', ', Australia', 'M3C1L-7000'], 0)
    df['violation_zip_code'] = pd.to_numeric(df['violation_zip_code'], errors='coerce')
    df['zip_code'] = pd.to_numeric(df['zip_code'], errors='coerce')
    df['non_us_str_code'] = pd.to_numeric(df['non_us_str_code'], errors='coerce')
    #df.violation_zip_code = df.violation_zip_code.replace('-','', inplace=True)
    df['violation_zip_code'] = np.nan_to_num(df['violation_zip_code'])
    df['zip_code'] = np.nan_to_num(df['zip_code'])
    df['non_us_str_code'] = np.nan_to_num(df['non_us_str_code'])
    X = df.iloc[:, 0:13]
    y = df.iloc[:, -1]
    X_train, y_train, X_test, y_test = train_test_split(X, y, random_state=0)
    print(y_train.shape)

You have mixed up the order of the values returned by train_test_split; it should be:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=0)
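A quick sanity check after unpacking in the correct order (a minimal sketch, assuming the X and y from the question) shows consistent shapes:
from sklearn.model_selection import train_test_split

# correct return order: X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# row counts now match within each train/test pair
print(X_train.shape, y_train.shape)  # (127904, 13) (127904,)
print(X_test.shape, y_test.shape)    # (31976, 13) (31976,)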


Related

How can I get the final tree model?

Given this model:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import xgboost
import graphviz
X, y = make_classification(n_samples=1000, n_features=10,n_informative=3, n_redundant=5, random_state=42)
df = pd.DataFrame(data=X)
df.columns = 'X' + (df.columns+1).astype(str)
df[df.columns[-3:]] = df[df.columns[-3:]].astype(int)
df['Y'] = y
X_train, X_test, y_train, y_test = train_test_split(df.drop('Y', axis=1), df['Y'], test_size=0.3, random_state=42)
n_negative_class = y_train.value_counts().sort_index()[0]
n_positive_class = y_train.value_counts().sort_index()[1]
xgb = XGBClassifier(random_state=42, n_estimators=50,
                    scale_pos_weight=n_negative_class/n_positive_class,
                    use_label_encoder=False)
xgb.fit(X_train, y_train, eval_metric="auc")
y_train_scores = xgb.predict_proba(X_test)[:,1]
xgboost.to_graphviz(xgb, num_trees=49)
How can I plot the final tree used in xgb.predict_proba(X_test)[:,1]? Is it necessarily the last one (since each XGBoost tree learns from the trees before it)? Or does XGBoost choose some tree among those 50 estimators based on the loss or the given eval_metric?
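For what it's worth, XGBoost's predicted probability is not the output of any single "final" tree: it is the sum of the leaf outputs of all 50 trees, passed through the logistic link, with each boosting round fitting the residual errors of the rounds before it. A minimal sketch for rendering every round's tree individually, assuming the xgb model fitted above:
import xgboost

# every boosting round contributes to predict_proba; render each round's tree
for i in range(50):
    graph = xgboost.to_graphviz(xgb, num_trees=i)
    graph.render(filename=f'tree_{i}')  # writes tree_0.pdf ... tree_49.pdf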

Iterative split of multilabel classification dataset in pandas dataframe

I have a dataset that contains a text column with string values and multiple columns with a value of 1 or 0 (classified or not). I want to use skmultilearn to split this data with an even distribution, but I get this error:
KeyError: 'key of type tuple not found and not a MultiIndex'
And this is my code:
import pandas as pd
from skmultilearn.model_selection import iterative_train_test_split
y = pd.read_csv("dataset.csv")
x = y.pop("text")
x_train, x_test, y_train, y_test = iterative_train_test_split(x, y, test_size=0.1)
Here is what worked for me (this is a 98/1/1 split):
import os
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
def main():
    # load dataset
    y = pd.read_csv("dataset.csv")
    x = y.pop("text")
    # save tag names to reuse them later for creating pandas DataFrames
    tag_names = y.columns
    # Data has to be in ndarray format
    y = y.to_numpy()
    x = x.to_numpy()
    # split to train / test
    msss = MultilabelStratifiedShuffleSplit(n_splits=2, test_size=0.02, random_state=42)
    for train_index, test_index in msss.split(x, y):
        x_train, x_test_temp = x[train_index], x[test_index]
        y_train, y_test_temp = y[train_index], y[test_index]
    # make some memory space
    del x
    del y
    # split to test / validation
    msss = MultilabelStratifiedShuffleSplit(n_splits=2, test_size=0.5, random_state=42)
    for test_index, val_index in msss.split(x_test_temp, y_test_temp):
        x_test, x_val = x_test_temp[test_index], x_test_temp[val_index]
        y_test, y_val = y_test_temp[test_index], y_test_temp[val_index]
    # train dataset
    df_train = pd.DataFrame(data=y_train, columns=tag_names)
    df_train.insert(0, "snippet", x_train)
    # validation dataset
    df_val = pd.DataFrame(data=y_val, columns=tag_names)
    df_val.insert(0, "snippet", x_val)
    # test dataset
    df_test = pd.DataFrame(data=y_test, columns=tag_names)
    df_test.insert(0, "snippet", x_test)

if __name__ == "__main__":
    main()
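As a side note on the original error: skmultilearn's iterative_train_test_split expects matrices rather than pandas objects, and the tuple-key KeyError comes from pandas indexing inside the splitter. A minimal sketch of the fix, assuming the same dataset.csv (note that this splitter's own return order is X_train, y_train, X_test, y_test):
import pandas as pd
from skmultilearn.model_selection import iterative_train_test_split

y = pd.read_csv("dataset.csv")
x = y.pop("text")

# convert to numpy; x must be 2-D for the splitter
x_train, y_train, x_test, y_test = iterative_train_test_split(
    x.to_numpy().reshape(-1, 1), y.to_numpy(), test_size=0.1)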

r2 score turns out to be negative

I am studying support vector regression, but I have run into a problem: my r2 score turns out to be negative. Is that normal, or is there a part of my code I can change to fix it?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
df = pd.read_csv('Position_Salaries.csv')
df.head()
X = df.iloc[:, 1:2].values
y = df.iloc[:, -1].values
from sklearn.preprocessing import StandardScaler
y = y.reshape(len(y),1)
x_scaler = StandardScaler()
y_scaler = StandardScaler()
X = x_scaler.fit_transform(X)
y = y_scaler.fit_transform(y)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)
regressor = SVR(kernel="rbf")
regressor.fit(x_train,y_train.ravel())
y_pred = y_scaler.inverse_transform(regressor.predict(x_scaler.transform(x_test)))
from sklearn.metrics import r2_score
r2_score(y_scaler.inverse_transform(y_test), y_pred)
My output is -0.5313206322807349
In this part, your X is already in scaled form:
X = x_scaler.fit_transform(X)
After this split, your x_test is therefore also in scaled form:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)
When making predictions, you shouldn't transform your input again, since x_test is already scaled:
y_pred = y_scaler.inverse_transform(regressor.predict(x_scaler.transform(x_test)))
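A corrected prediction step might look like this (a sketch, assuming the scalers and regressor defined above; the reshape is needed because SVR.predict returns a 1-D array while the scaler expects a 2-D column):
# x_test is already scaled, so predict on it directly
y_pred = y_scaler.inverse_transform(regressor.predict(x_test).reshape(-1, 1))

from sklearn.metrics import r2_score
print(r2_score(y_scaler.inverse_transform(y_test), y_pred))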
From the documentation of sklearn.metrics.r2_score:
Best possible score is 1.0 and it can be negative (because the model
can be arbitrarily worse). A constant model that always predicts the
expected value of y, disregarding the input features, would get a R^2
score of 0.0.
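Concretely, R^2 = 1 - SS_res / SS_tot, where SS_res is the sum of squared residuals and SS_tot is the total sum of squares around the mean of y; whenever the model's squared error exceeds that of simply predicting the mean, the score dips below zero.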

ValueError: Shape of passed values is (39, 1), indices imply (39, 7)

Hey, I am trying to do a simple logistic regression on my data, which is returns (y) versus market indices (x).
import pandas as pd
import numpy as np
from sklearn import metrics
data = pd.read_excel('Datafile.xlsx', index_col=0)
#split dataset into features and target variable
col_features = ['Market Beta','Value','Size','High-Yield Spread','Term Spread','Momentum','Bid-Ask Spread']
target=['Return']
x = data[col_features] #features
y = data[target] #target
#split x and y into training and testing datasets
from sklearn.model_selection import train_test_split
x_train, y_train, x_test, y_test = train_test_split (x, y, test_size = 0.25, random_state = 0)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
y_train = np.argmax(y_train, axis=1)
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
The error I get is
ValueError: Shape of passed values is (39, 1), indices imply (39, 7)
Thank you.
You just confused the order of the train_test_split results, so x_test and y_train got switched. The proper order is:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
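A cheap guard against this class of bug (a sketch, assuming the x and y defined above) is to assert the shapes immediately after splitting:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
# features keep their column count; row counts line up within each pair
assert x_train.shape[1] == x_test.shape[1] == x.shape[1]
assert len(x_train) == len(y_train) and len(x_test) == len(y_test)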

How to split datasets - Number of labels=150 does not match number of samples=600

I have a data sample of 750x256.
Rows = 750
Columns = 256
If I split off 20% of my data as the test set, I end up with 600 samples in X_train but only 150 samples in y_train.
The problem then occurs when fitting the DecisionTreeRegressor:
it says Number of labels=150 does not match number of samples=600.
But if I set test_size to 50%, it works.
Is there a way around this? I don't want to use 50% of my data for testing.
Any help would be great!
Here is my code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import graphviz
#Load the data
dataset = pd.read_csv('new_york.csv')
dataset['Higher'] = dataset['2016-12'].gt(dataset['2016-11']).astype(int)
X = dataset.iloc[:, 6:254].values
y = dataset.iloc[:, 255].values
#Taking care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer = imputer.fit(X[:, :248])
X[:, :248] = imputer.transform(X[:, :248])
#Split the data into train and test sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_test, y_train = train_test_split(X, y, test_size = .2, random_state = 0)
#let's build our first model
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
clf = DecisionTreeClassifier(max_depth=6)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)
train_test_split() returns X_train, X_test, y_train, y_test; you have y_train and y_test in the wrong order.
A split of 50% doesn't raise an error because y_train and y_test then have the same size (but obviously the wrong values).
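With the corrected unpacking the counts line up (a sketch, assuming the 750-row dataset from the question):
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in later versions

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(len(X_train), len(y_train))  # 600 600
print(len(X_test), len(y_test))    # 150 150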
