SimpleImputer: model n_features and input n_features not matching - Python

I am trying to learn SimpleImputer on the dataset provided in the missing-values lesson of the Kaggle course - https://www.kaggle.com/alexisbcook/missing-values
The CSV file is available at the above link.
While trying out the code I get the following error:
ValueError: Number of features of the model must match the input. Model n_features is 6 and input n_features is 9
Any help sorting out the issue would be appreciated.
My code:
import pandas as pd
df0 = pd.read_csv('/Users/ratnam03chanakya/Desktop/Projects/Kaggle/02.melb_data/melb_data.csv')
df0.head()
y = df0.Price
features = ['Rooms', 'Distance', 'Bathroom', 'Car', 'Landsize', 'BuildingArea','YearBuilt', 'Lattitude', 'Longtitude']
X = df0[features]
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X,y,random_state=0)
# Get names of columns with missing values
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]
# Drop columns in training and validation data model_selection
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
def score_dataset(X_train, X_valid, y_train, y_valid):
    model0 = RandomForestRegressor()
    model0.fit(reduced_X_train, y_train)
    model0_predict = model0.predict(X_valid)
    mae = mean_absolute_error(y_valid, model0_predict)
    return mae
print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))
# IMPUTATION
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))
# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train,imputed_X_valid,y_train,y_valid))
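A likely cause, for reference: inside score_dataset the model is always fit on the outer reduced_X_train (6 columns after dropping the columns with missing values) rather than on the X_train argument, so predicting on the 9-column imputed validation set gives the 6-vs-9 mismatch. A minimal corrected sketch of the helper, keeping everything else as in the question:
def score_dataset(X_train, X_valid, y_train, y_valid):
    # Fit and predict on the arguments passed in, not on the outer reduced_X_train.
    model0 = RandomForestRegressor()
    model0.fit(X_train, y_train)
    model0_predict = model0.predict(X_valid)
    return mean_absolute_error(y_valid, model0_predict)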

Related

Scikit-Learn Numpy - Use One Hot Encoder on only string or categorical columns in dataset

I have a simple logistic regression model below that uses one-hot encoding to transform every X value. My question is: how can I modify the code below to use one-hot encoding for every column except one (e.g. the integer one highlighted below)?
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
# define the location of the dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv"
# load the dataset
dataset = read_csv(url, header=None)
# retrieve the array of data
data = dataset.values
# separate into input and output columns
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)
# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1)
# one-hot encode input variables that are objects
onehot_encoder = OneHotEncoder()
onehot_encoder.fit(X_train)
X_train = onehot_encoder.transform(X_train)
X_test = onehot_encoder.transform(X_test)
# ordinal encode target variable
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)
# define the model
model = LogisticRegression()
# fit on the training set
model.fit(X_train, y_train)
# predict on test set
yhat = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))
I tried feeding only 8 columns instead of 9 into the OneHotEncoder, but got the error:
ValueError: The number of features in X is different to the number of features of the fitted data. The fitted data had 9 features and the X has 8 features.
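One way to do this, sketched below with ColumnTransformer: the encoder is applied to every column except one, and remainder='passthrough' keeps the excluded column as-is. The column index 3 is only an assumption for illustration; adjust it to your data, and leave that column numeric (i.e. don't cast it to str) so it can pass through untouched.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Hypothetical: assume the integer column sits at index 3; change as needed.
int_col = 3
cat_cols = [i for i in range(X.shape[1]) if i != int_col]

# One-hot encode only the categorical columns; pass the integer column through unchanged.
preprocess = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
    remainder='passthrough')

X_train = preprocess.fit_transform(X_train)
X_test = preprocess.transform(X_test)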

Error: y could not convert string to float python random forests

I am using Python and random forests to predict the first column of my input file; my input file is of the form:
T,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
N,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
N,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
N,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
N,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
N,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
N,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Here is the link to my full data: https://drive.google.com/file/d/1gjKoSi4rmMYZVm31LZ2Li92HM9USlu6A/view?usp=sharing
I am trying to predict the first column (either T or N) from the values of the remaining columns, using random forests. I am getting the error in the title; how can I fix it? Here is the code:
import pandas as pd
import numpy as np
dataset = pd.read_csv( 'data1extended.txt', sep= ',')
dataset.head()
row_count, column_count = dataset.shape
X = dataset.iloc[:, 1:column_count].values
y = dataset.iloc[:, 0].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
Try changing your target variable to numeric first. Assuming the 'gold' column is your target, run this immediately after loading the data into a dataframe:
dataset['gold'] = dataset['gold'].astype('category').cat.codes
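Alternatively, since the task is classification (T vs. N), a sketch that swaps RandomForestRegressor for RandomForestClassifier, which accepts string labels directly, so no conversion of y is needed:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# A classifier handles the string class labels 'T'/'N' directly,
# and its predictions are valid input for the classification metrics below.
classifier = RandomForestClassifier(n_estimators=20, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))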

ValueError: could not convert string to float: 'Pregnant'

I am solving a decision tree classification problem. The code is below:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
# load dataset
pima = pd.read_csv("diabetes.csv", header=None, names=col_names)
#split dataset in features and target variable
feature_cols = ['pregnant', 'insulin', 'bmi', 'age','glucose','bp','pedigree']
X = pima[feature_cols] # Features
y = pima.label # Target variable
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
# Create Decision Tree classifier object
clf = DecisionTreeClassifier()
# Train Decision Tree classifier
clf = clf.fit(X_train,y_train)
#Predict the response for test dataset
y_pred = clf.predict(X_test)
A preview of the dataset was attached as a screenshot (not shown here).
I am getting an error
ValueError: could not convert string to float: 'Pregnant'
Please help me solve this error.
Change this line to read the data with headers from the CSV file:
From:
pima = pd.read_csv("diabetes.csv", header=None, names=col_names)
to
pima = pd.read_csv("diabetes.csv") # This will import the data file with the header names from the csv, which you can change later if required.
Or manually remove the top row using this code:
pima = pima.iloc[1:]
The first non-header line of your dataset contains what looks to be a duplicate header line. Thus the first value of X is "Pregnant" and not a float as you require.
You could either filter out non-float values or fix your dataset.
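Note that if the duplicate header row is read as data, the feature columns end up with object dtype even after dropping that row; a minimal sketch (assuming the col_names and feature_cols defined above) that drops the row and converts the columns back to numeric:
pima = pd.read_csv("diabetes.csv", header=None, names=col_names)

# Drop the duplicated header row, then convert the columns back to numeric,
# since the stray text row forces every column to object dtype.
pima = pima.iloc[1:]
pima[feature_cols] = pima[feature_cols].apply(pd.to_numeric)
pima['label'] = pd.to_numeric(pima['label'])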

How to convert pd.DataFrame to tf.data.Dataset (or use pd.DataFrame instead) for DNNClassifier

I am getting this warning:
Instructions for updating:
To construct input pipelines, use the tf.data module.
I have searched, but I couldn't figure out the logic behind tf.data.Dataset, so I couldn't manage to convert a pd.DataFrame into a tf.data.Dataset.
I also need help with the predictions at the end of the code; I couldn't figure out the right way to compare the predictions (highest-probability output) with the labels.
Note: the data has no column names, so I have added names a1 to a784 to the columns so I can assign them to feature_columns.
Thanks in advance.
Here is the code:
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn import metrics
from tensorflow.python.data import Dataset
mnist_df = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/mnist_train_small.csv",header=None)
mnist_df.describe()
mnist_df.columns
hand_df = mnist_df[0]
matrix_df = mnist_df.drop([0],axis=1)
matrix_df.head()
hand_df.head()
#creating cols array and append a1 to a784 in order to name columns
cols=[]
for i in range(785):
    if i != 0:
        a = '{}{}'.format('a', i)
        cols.append(a)
matrix_df.columns = cols
mnist_df = mnist_df.head(10000)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(matrix_df, hand_df, test_size=0.3, random_state=101)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
matrix_df = pd.DataFrame(data=scaler.fit_transform(matrix_df),
                         columns=matrix_df.columns,
                         index=matrix_df.index)
#naming columns so I will not get error while assigning feature_columns
for i in range(len(cols)):
    a = i + 1
    b = '{}{}'.format('a', a)
    cols[i] = tf.feature_column.numeric_column(str(b))
matrix_df.head()
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train,
                                                 batch_size=10, num_epochs=1000,
                                                 shuffle=True)
my_optimizer = tf.train.AdagradOptimizer(learning_rate=0.03)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
model = tf.estimator.DNNClassifier(feature_columns=cols,
                                   hidden_units=[32, 64],
                                   n_classes=10,
                                   optimizer=my_optimizer,
                                   config=tf.estimator.RunConfig(keep_checkpoint_max=1))
model.train(input_fn=input_func,steps=1000)
predict_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test,
                                                         batch_size=50,
                                                         num_epochs=1,
                                                         shuffle=False)
pred_gen = model.predict(predict_input_func)
predictions = list(pred_gen)
predictions[0]
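To address the warning directly, a minimal sketch of feeding the DataFrame through tf.data instead of pandas_input_fn (the helper name make_input_fn is just illustrative); each prediction dict returned by DNNClassifier has a 'class_ids' entry that can be compared against y_test:
def make_input_fn(features_df, labels=None, batch_size=10, shuffle=True, num_epochs=None):
    # Build an input_fn that wraps a pandas DataFrame in a tf.data.Dataset.
    def input_fn():
        if labels is None:
            ds = tf.data.Dataset.from_tensor_slices(dict(features_df))
        else:
            ds = tf.data.Dataset.from_tensor_slices((dict(features_df), labels.values))
        if shuffle:
            ds = ds.shuffle(buffer_size=1000)
        return ds.batch(batch_size).repeat(num_epochs)
    return input_fn

model.train(input_fn=make_input_fn(X_train, y_train), steps=1000)
pred_gen = model.predict(input_fn=make_input_fn(X_test, shuffle=False, num_epochs=1))
# Each prediction is a dict; 'class_ids' holds the most likely digit.
predicted_classes = [int(p['class_ids'][0]) for p in pred_gen]
accuracy = np.mean(np.array(predicted_classes) == y_test.values)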

Error while fitting model after One Hot Encoding

I am using one-hot encoding (I am aware ordinal encoding is better in this case) for the categorical variables of the Titanic dataset. The one-hot encoding succeeds; however, fitting the model throws the following error:
ValueError: setting an array element with a sequence.
Here is the code I am running:
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode_features(df_train, df_test):
    features = ['Fare', 'Cabin', 'Age', 'Sex']
    #features = ['Cabin', 'Sex']
    df_combined = pd.concat([df_train[features], df_test[features]])
    for feature in features:
        le = preprocessing.LabelEncoder()
        onehot_encoder = OneHotEncoder()
        le = le.fit(df_combined[feature])
        integer_encoding_train = le.transform(df_train[feature])
        integer_encoding_test = le.transform(df_test[feature])
        integer_encoding_train = integer_encoding_train.reshape(len(integer_encoding_train), 1)
        integer_encoding_test = integer_encoding_test.reshape(len(integer_encoding_test), 1)
        df_train[feature] = onehot_encoder.fit_transform(integer_encoding_train)
        df_test[feature] = onehot_encoder.fit_transform(integer_encoding_test)
    return df_train, df_test
data_train, data_test = one_hot_encode_features(data_train, data_test)
from sklearn.model_selection import train_test_split
X = data_train.drop(['Survived', 'PassengerId'], axis=1)
Y = data_train['Survived']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=23)
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
clf = GaussianNB()
acc_scorer = make_scorer(accuracy_score)
clf.fit(X_train, Y_train)
The error goes away if I use ordinal encoding instead of one-hot encoding. I am new to handling categorical variables, so I cannot figure out the error.
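The "setting an array element with a sequence" error typically comes from assigning a multi-column one-hot matrix into a single DataFrame column. A sketch of an alternative, under the same feature list as above, that expands each feature into separate indicator columns with pd.get_dummies (listing the features explicitly also dummies the numeric ones, mirroring the original loop):
import pandas as pd

FEATURES = ['Fare', 'Cabin', 'Age', 'Sex']

def one_hot_encode_features(df_train, df_test):
    # Encode train and test together so both end up with the same dummy columns.
    combined = pd.concat([df_train[FEATURES], df_test[FEATURES]], keys=['train', 'test'])
    # dummy_na=True gives missing values their own indicator column.
    dummies = pd.get_dummies(combined, columns=FEATURES, dummy_na=True)
    # Split back and replace the original columns with the indicator columns.
    df_train = pd.concat([df_train.drop(columns=FEATURES), dummies.xs('train')], axis=1)
    df_test = pd.concat([df_test.drop(columns=FEATURES), dummies.xs('test')], axis=1)
    return df_train, df_test

data_train, data_test = one_hot_encode_features(data_train, data_test)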
