ValueError: could not convert string to float: sklearn - python

Recently I was working with a dataset in Python and got an unexpected error: ValueError: could not convert string to float. The dataset also contains text columns, which I converted to integers with LabelEncoder. But when I get to the training part and fit the model, I still get this error, which makes no sense to me.
Code:
import sklearn
from sklearn import model_selection
from sklearn import linear_model
from sklearn import preprocessing
import pandas as pd
import pickle
import numpy as np
data = pd.read_csv("house_train.csv")
data = data.fillna(value=0)
dataX_train = data.drop(["SalePrice"], axis = 1)
dataX_test = data.SalePrice
le = preprocessing.LabelEncoder()
dataX_train.MSZoning = le.fit_transform(list(data["MSZoning"]))
dataX_train.Street = le.fit_transform(list(data["Street"]))
dataX_train.Alley = le.fit_transform(list(data["Alley"]))
dataX_train.LotShape = le.fit_transform(list(data["LotShape"]))
dataX_train.LandContour = le.fit_transform(list(data["LandContour"]))
dataX_train.Utilities = le.fit_transform(list(data["Utilities"]))
dataX_train.LotConfig = le.fit_transform(list(data["LotConfig"]))
dataX_train.LandSlope = le.fit_transform(list(data["LandSlope"]))
dataX_train.Neighborhood = le.fit_transform(list(data["Neighborhood"]))
dataX_train.Condition1 = le.fit_transform(list(data["Condition1"]))
dataX_train.Condition2 = le.fit_transform(list(data["Condition2"]))
dataX_train.BldgType = le.fit_transform(list(data["BldgType"]))
dataX_train.HouseStyle = le.fit_transform(list(data["HouseStyle"]))
dataX_train.RoofStyle = le.fit_transform(list(data["RoofStyle"]))
dataX_train.RoofMatl = le.fit_transform(list(data["RoofMatl"]))
dataX_train.Exterior1st = le.fit_transform(list(data["Exterior1st"]))
dataX_train.Exterior2nd = le.fit_transform(list(data["Exterior2nd"]))
dataX_train.MasVnrType = le.fit_transform(list(data["MasVnrType"]))
dataX_train.ExterQual = le.fit_transform(list(data["ExterQual"]))
dataX_train.ExterCond = le.fit_transform(list(data["ExterCond"]))
dataX_train.Foundation = le.fit_transform(list(data["Foundation"]))
dataX_train.BsmtQual = le.fit_transform(list(data["BsmtQual"]))
dataX_train.BsmtExposure = le.fit_transform(list(data["BsmtExposure"]))
dataX_train.BsmtFinType1 = le.fit_transform(list(data["BsmtFinType1"]))
dataX_train.BsmtFinType2 = le.fit_transform(list(data["BsmtFinType2"]))
dataX_train.Heating = le.fit_transform(list(data["Heating"]))
dataX_train.HeatingQC = le.fit_transform(list(data["HeatingQC"]))
dataX_train.CentralAir = le.fit_transform(list(data["CentralAir"]))
dataX_train.Electrical = le.fit_transform(list(data["Electrical"]))
dataX_train.KitchenQual = le.fit_transform(list(data["KitchenQual"]))
dataX_train.Functional = le.fit_transform(list(data["Functional"]))
dataX_train.FireplaceQu = le.fit_transform(list(data["FireplaceQu"]))
dataX_train.GarageType = le.fit_transform(list(data["GarageType"]))
dataX_train.GarageFinish = le.fit_transform(list(data["GarageFinish"]))
dataX_train.GarageQual = le.fit_transform(list(data["GarageQual"]))
dataX_train.GarageCond = le.fit_transform(list(data["GarageCond"]))
dataX_train.PavedDrive = le.fit_transform(list(data["PavedDrive"]))
dataX_train.PoolQC = le.fit_transform(list(data["PoolQC"]))
dataX_train.Fence = le.fit_transform(list(data["Fence"]))
dataX_train.MiscFeature = le.fit_transform(list(data["MiscFeature"]))
dataX_train.SaleType = le.fit_transform(list(data["SaleType"]))
dataX_train.SaleCondition = le.fit_transform(list(data["SaleCondition"]))
best = 0
x_train, x_test, y_train, y_test = model_selection.train_test_split(dataX_train, dataX_test, test_size=0.2)
clf = linear_model.LinearRegression()
clf.fit(x_train, y_train)
acc = clf.score(x_test, y_test)
if acc > best:
    best = acc
    with open("housingmodel.pickle", "wb") as f:
        pickle.dump(clf, f)
print(acc)

First of all, check whether you encoded all of the features in dataX_train; I think you missed some there.
Try dataX_train.dtypes and check whether any columns are still non-numeric, then convert those columns, for example with pd.to_numeric:
dataX_train['NonNumericCol'] = dataX_train['NonNumericCol'].apply(pd.to_numeric)
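Beyond a single column, here is a small sketch (not from the original answer, staying with the LabelEncoder approach used in the question) to encode every remaining text column and confirm that only numeric dtypes are left before fitting:
# Sketch: encode all remaining object-typed columns and verify dtypes before fitting.
for col in dataX_train.select_dtypes(include='object').columns:
    dataX_train[col] = preprocessing.LabelEncoder().fit_transform(dataX_train[col].astype(str))
print(dataX_train.dtypes[dataX_train.dtypes == 'object'])  # should print an empty Series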


AttributeError: module 'tensorflow_estimator.python.estimator.api._v1.estimator' has no attribute 'inpus'

I am trying to use a linear classifier for prediction. The construction and training of the estimator are listed here:
model = tf.estimator.LinearClassifier(
    n_classes = 2,
    model_dir = "ongoing",
    feature_columns = categorical_features + continuous_features
)
FEATURES = ['Age', 'Gender', 'ICD9Code']
LABEL = 'Condition'
def get_input_fn(data_set, num_epochs, n_batch, shuffle):
    input = tf.compat.v1.estimator.inputs.pandas_input_fn(
        x = pd.DataFrame({k: data_set[k].values for k in FEATURES}),
        y = pd.Series(data_set[LABEL].values),
        batch_size = n_batch,
        num_epochs = num_epochs,
        shuffle = shuffle
    )
    return input
model.train(
    input_fn = get_input_fn(csv_data, num_epochs = None, n_batch = 10461, shuffle = False),
    steps = 1000
)
predict_data = pd.read_csv('feature_condition.csv', usecols = ['PatientGuid', 'Age', 'Gender', 'ICD9Code'], nrows = 5)
predict_input_fn = tf.estimator.inpus.numpy_input_fn(
    x = {"x": predict_data},
    y = None,
    batch_size = 5,
    shuffle = False,
    num_threads = 5
)
predict_results = model.predict(predict_input_fn)
print(predict_results)
I got this error:
AttributeError: module 'tensorflow_estimator.python.estimator.api._v1.estimator' has no attribute 'inpus'
My tensorflow version is 2.4.1.
Can you please help me resolve this problem? Thanks!
Update: I have corrected the typo and that error is gone, but now I get the following warning:
The name tf.estimator.inputs.numpy_input_fn is deprecated. Please use tf.compat.v1.estimator.inputs.numpy_input_fn instead.
After switching to the suggested function, I still get the same warning:
The name tf.estimator.inputs.numpy_input_fn is deprecated. Please use tf.compat.v1.estimator.inputs.numpy_input_fn instead
This really confuses me; can you please help me fix it? Thanks!
I uploaded my complete code to Google Drive; here is the link:
https://drive.google.com/file/d/1R6bRcv8Afjx4cPLBZaBpuCcDg71fNN3Y/view?usp=sharing
Your issue can be resolved by changing tf.estimator.inpus.numpy_input_fn to tf.estimator.inputs.numpy_input_fn. It's a typo.
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import json
import os
import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame
from tensorflow.train import SequenceExample, FeatureLists
from tensorflow import feature_column
from tensorflow.keras import layers
csv_file = 'feature_condition.csv'
csv_data = pd.read_csv(csv_file, low_memory = False)
csv_df = pd.DataFrame(csv_data)
test_file = 'test.csv'
test_data = pd.read_csv(test_file, low_memory = False)
test_df = pd.DataFrame(test_data)
CONTI_FEATURES = ['Age']
CATE_FEATURES = ['Gender', 'ICD9Code']
# create the feature column:
continuous_features = [tf.feature_column.numeric_column(k) for k in CONTI_FEATURES]
categorical_features = [tf.feature_column.categorical_column_with_hash_bucket(k, hash_bucket_size = 1000) for k in CATE_FEATURES]
model = tf.estimator.LinearClassifier(
    n_classes = 2,
    model_dir = "ongoing",
    feature_columns = categorical_features + continuous_features
)
FEATURES = ['Age', 'Gender', 'ICD9Code']
LABEL = 'Condition'
# input function:
def get_input_fn(data_set, num_epochs, n_batch, shuffle):
    input = tf.compat.v1.estimator.inputs.pandas_input_fn(
        x = pd.DataFrame({k: data_set[k].values for k in FEATURES}),
        y = pd.Series(data_set[LABEL].values),
        batch_size = n_batch,
        num_epochs = num_epochs,
        shuffle = shuffle
    )
    return input
# train the model
model.train(
    input_fn = get_input_fn(csv_data, num_epochs = None, n_batch = 10461, shuffle = False),
    steps = 1000
)
# iterate every data in test dataset and make a prediction:
row_pre = 0
for i in test_data.loc[:,'PatientGuid']:
    dict = {'Age': test_data.loc[row_pre]['Age'],
            'Gender': test_data.loc[row_pre]['Gender'],
            'ICD9Code': test_data.loc[row_pre]['ICD9Code'],
            }
    df = pd.DataFrame(dict, index = [1,2,3])
    predict_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
        #predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x = {k: df[k].values for k in FEATURES},
        y = None,
        batch_size = 1,
        num_epochs = 1,
        shuffle = False,
        num_threads = 1
    )
    predict_results = model.predict(predict_input_fn)
    row_pre += 1
You can safely ignore the deprecation warnings.
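If they clutter the output, one option (a sketch, assuming TF 2.x) is to lower the TensorFlow logger verbosity so WARNING-level messages, which include these deprecation notices, are hidden:
import tensorflow as tf

# Hide WARNING-level messages from the TF loggers (this includes the deprecation notices).
tf.get_logger().setLevel('ERROR')
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)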

Python KNN Regression

I am trying to predict a car's MPG using a KNN algorithm. I first cleaned my data and made test and training datasets, and then I wrote a normalized and a non-normalized KNN function. Now I am trying to pass my testing data through the KNN algorithm and create a list of all the predictions, and I then want to use mean squared error to analyze those predictions. So far I have not been able to set up the function that passes my testing data through. Any guidance would be greatly appreciated!
import pandas as pd
import numpy as np
import math
from google.colab import drive
drive.mount('/content/drive')
pd.set_option('display.max_columns', 100)
vehicles = pd.read_csv('/content/drive/MyDrive/CS_167/vehicles (2).csv')
subset_cars = vehicles[vehicles["fuelType"] == 'Regular']
final_sub = subset_cars[["comb08", "year", "cylinders", "displ"]]
column_nulls = final_sub.isna().any()
Cylinder_no_null = final_sub.cylinders.dropna()
displ_no_null = final_sub.displ.dropna()
pure_data = final_sub.dropna()
# pure_data.head()
shuffled_data = pure_data.sample(frac=1, random_state=41)
test_data = shuffled_data.iloc[0:500]
train_data = shuffled_data.iloc[500:]
train_data_euc = train_data.copy()
test_data_euc = test_data.copy()
def Regression_KNN(MPG, train_data_euc, k):
    train_data_euc['euc_dis'] = np.sqrt(
        (MPG['year']-train_data_euc['year'])**2+
        (MPG['cylinders']-train_data_euc['cylinders'])**2+
        (MPG['displ']-train_data_euc['displ'])**2)
    sorted_train_data = train_data_euc.sort_values(['euc_dis'])
    prediction = sorted_train_data.iloc[0:k]['comb08'].mean()
    return prediction
MPG ={}
MPG['year'] = 2020
MPG['cylinders'] = 4
MPG['displ'] = 5.2
print(f"The average MPG for this car is: %d" %Regression_KNN(MPG, train_data_euc, 5))
z_train_copy = train_data_euc.copy()
z_train_year_std = z_train_copy['year'].std()
z_train_year_mean = z_train_copy['year'].mean()
z_train_cylinders_std = z_train_copy['cylinders'].std()
z_train_cylinders_mean = z_train_copy['cylinders'].mean()
z_train_displ_std = z_train_copy['displ'].std()
z_train_displ_mean = z_train_copy['displ'].mean()
z_train_euc_std = z_train_copy['euc_dis'].std()
z_train_euc_mean = z_train_copy['euc_dis'].mean()
z_train_copy['year'] = (z_train_copy['year'] - z_train_year_mean)/z_train_year_std
z_train_copy['cylinders'] = (z_train_copy['cylinders'] - z_train_cylinders_mean)/z_train_cylinders_std
z_train_copy['displ'] = (z_train_copy['displ'] - z_train_displ_mean)/z_train_displ_std
z_train_copy['euc_dis'] = (z_train_copy['euc_dis'] - z_train_euc_mean)/z_train_euc_std
def Z_TRAIN_KNN(MPG, z_train_copy, k):
    z_train_copy['euc_dis'] = np.sqrt(
        (MPG['year']-z_train_copy['year'])**2+
        (MPG['cylinders']-z_train_copy['cylinders'])**2+
        (MPG['displ']-z_train_copy['displ'])**2)
    z_train_sorted_data = z_train_copy.sort_values(['euc_dis'])
    z_train_prediction = z_train_sorted_data.iloc[0:k]['comb08'].mean()
    return z_train_prediction
MPG ={}
MPG['year'] = 2020
MPG['cylinders'] = 4
MPG['displ'] = 5.2
print(f"The average MPG for this car is: %d" %Z_TRAIN_KNN(MPG, z_train_copy, 5))
def regression_all_kNN(test_data_euc, z_train_data, k):
    #apply the classify_kNN function to each item in the test data with the train
    #data and k passed as the other two arguments. The result will be a series of
    #the individual results.
    for i in test_data:
        z_train_data['euc_dis'] = np.sqrt(
            (test_data['year']- z_train_data['year'])**2+
            (test_data['cylinders']- z_train_data['cylinders'])**2+
            (test_data['displ']- z_train_data['displ'])**2)
        sorted_train_data = z_train_data.sort_values(['euc_dis'])
        prediction = test_data.apply(regression_all_kNN,args=(z_train_data,k))
    return prediction
predictions5NN = regression_all_kNN(test_data, train_data, 5)
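For the last step, a minimal sketch (not from the original post) of how the single-query Regression_KNN above could be applied to every test row and then scored with mean squared error; predict_all_kNN is just a hypothetical helper name:
# Sketch: apply Regression_KNN to each test row, then compute the MSE of the predictions.
def predict_all_kNN(test_rows, train_rows, k):
    preds = []
    for _, row in test_rows.iterrows():
        query = {'year': row['year'], 'cylinders': row['cylinders'], 'displ': row['displ']}
        preds.append(Regression_KNN(query, train_rows.copy(), k))
    return pd.Series(preds, index=test_rows.index)

preds_5nn = predict_all_kNN(test_data, train_data, 5)
mse = ((preds_5nn - test_data['comb08'])**2).mean()
print(mse)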

Why is the node gain output from xgboost different from that calculated manually?

We can get xgboost tree structure from trees_to_dataframe():
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_boston
data = load_boston()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
model = xgb.XGBRegressor(random_state=1,
                         n_estimators=1,  # only one tree
                         max_depth=2,
                         learning_rate=0.1
                         )
model.fit(X, y)
tree_frame = model._Booster.trees_to_dataframe()
tree_frame
According to the SO thread How is xgboost quality calculated?, the gain of a split should be calculated (in the form used by the code below) as

gain = GL^2/(HL + lambda) + GR^2/(HR + lambda) - (GL + GR)^2/(HL + HR + lambda)

where GL/GR and HL/HR are the sums of gradients and hessians in the left and right child. However, the values I compute manually with the following code are different from the Gain column that trees_to_dataframe() reports:
def mse_obj(preds, labels):
    grad = labels - preds
    hess = np.ones_like(labels)
    return grad, hess
Gain,Hessian = mse_obj(y.mean(),y)
L = X[tree_frame['Feature'][0]] < tree_frame['Split'][0]
R = X[tree_frame['Feature'][0]] >= tree_frame['Split'][0]
GL = Gain[L].sum()
GR = Gain[R].sum()
HL = Hessian[L].sum()
HR = Hessian[R].sum()
reg_lambda = 1.0
gain = (GL**2/(HL+reg_lambda)+GR**2/(HR+reg_lambda)-(GL+GR)**2/(HL+HR+reg_lambda))
gain # 18817.811191871013
L = (X[tree_frame['Feature'][0]] < tree_frame['Split'][0])&((X[tree_frame['Feature'][1]] < tree_frame['Split'][1]))
R = (X[tree_frame['Feature'][0]] < tree_frame['Split'][0])&((X[tree_frame['Feature'][1]] >= tree_frame['Split'][1]))
GL = Gain[L].sum()
GR = Gain[R].sum()
HL = Hessian[L].sum()
HR = Hessian[R].sum()
reg_lambda = 1.0
gain = (GL**2/(HL+reg_lambda)+GR**2/(HR+reg_lambda)-(GL+GR)**2/(HL+HR+reg_lambda))
gain # 7841.627971119211
L = (X[tree_frame['Feature'][0]] > tree_frame['Split'][0])&((X[tree_frame['Feature'][2]] < tree_frame['Split'][2]))
R = (X[tree_frame['Feature'][0]] > tree_frame['Split'][0])&((X[tree_frame['Feature'][2]] >= tree_frame['Split'][2]))
GL = Gain[L].sum()
GR = Gain[R].sum()
HL = Hessian[L].sum()
HR = Hessian[R].sum()
reg_lambda = 1.0
gain = (GL**2/(HL+reg_lambda)+GR**2/(HR+reg_lambda)-(GL+GR)**2/(HL+HR+reg_lambda))
gain # 2634.409414953051
Did I miss something?
Eventually I found out where I went wrong. The default prediction value defined by base_score is 0.5, and we should use base_score as the model's predicted value before any tree is built when calculating the gradient for each sample.
Gain,Hessian = mse_obj(model.get_params()['base_score'], y)
After this, everything seems ok.
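As a quick sanity check (a sketch reusing the objects defined above), the root-node gain recomputed with the corrected gradients can be compared against the Gain column that trees_to_dataframe() reports:
# Recompute the root split gain using base_score as the pre-tree prediction.
Gain, Hessian = mse_obj(model.get_params()['base_score'], y)
L = X[tree_frame['Feature'][0]] < tree_frame['Split'][0]
R = X[tree_frame['Feature'][0]] >= tree_frame['Split'][0]
GL, GR = Gain[L].sum(), Gain[R].sum()
HL, HR = Hessian[L].sum(), Hessian[R].sum()
reg_lambda = 1.0
gain = GL**2/(HL+reg_lambda) + GR**2/(HR+reg_lambda) - (GL+GR)**2/(HL+HR+reg_lambda)
print(gain, tree_frame['Gain'][0])  # with the corrected gradients the two values should now match (per the fix above)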

How to find feature Interactions between all columns in a dataframe, Python?

Friedman's H-statistic: the Interpretable ML book by Christoph Molnar gives us a workable approach, using Friedman's H-statistic, based on a decomposition of the partial dependence values, to calculate feature interactions.
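For two features j and k, the second-order statistic implemented by compute_h_val below is, with all partial-dependence values mean-centered:

H_jk^2 = sum_i [ PD_jk(x_i) - PD_j(x_i) - PD_k(x_i) ]^2  /  sum_i PD_jk(x_i)^2

and the code reports H_jk, i.e. the square root of this ratio.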
In Python, sklearn_gbmi accepts feature sets of length two and higher but does not provide the first-order measure, much like interact.gbm in R; it only works on gradient-boosting-based models.
I found a manual Python implementation from here, posted below for reference, where the feature interactions are calculated.
import itertools
import math
import numpy as np  # used by the helpers below but missing from the original imports
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split  # used below but missing from the original imports
from pdpbox.pdp_calc_utils import _calc_ice_lines_inter
from pdpbox.pdp import pdp_isolate, PDPInteract
from pdpbox.utils import (_check_model, _check_dataset, _check_percentile_range, _check_feature,
                          _check_grid_type, _check_memory_limit, _make_list,
                          _calc_memory_usage, _get_grids, _get_grid_combos, _check_classes)
from joblib import Parallel, delayed
def pdp_multi_interact(model, dataset, model_features, features,
                       num_grid_points=None, grid_types=None, percentile_ranges=None, grid_ranges=None,
                       cust_grid_points=None,
                       cust_grid_combos=None, use_custom_grid_combos=False,
                       memory_limit=0.5, n_jobs=1, predict_kwds=None, data_transformer=None):
    def _expand_default(x, default, length):
        if x is None:
            return [default] * length
        return x

    def _get_grid_combos(feature_grids, feature_types):
        grids = [list(feature_grid) for feature_grid in feature_grids]
        for i in range(len(feature_types)):
            if feature_types[i] == 'onehot':
                grids[i] = np.eye(len(grids[i])).astype(int).tolist()
        return np.stack(np.meshgrid(*grids), -1).reshape(-1, len(grids))

    if predict_kwds is None:
        predict_kwds = dict()
    nr_feats = len(features)

    # check function inputs
    n_classes, predict = _check_model(model=model)
    _check_dataset(df=dataset)
    _dataset = dataset.copy()

    # prepare the grid
    pdp_isolate_outs = []
    if use_custom_grid_combos:
        grid_combos = cust_grid_combos
        feature_grids = []
        feature_types = []
    else:
        num_grid_points = _expand_default(x=num_grid_points, default=10, length=nr_feats)
        grid_types = _expand_default(x=grid_types, default='percentile', length=nr_feats)
        for i in range(nr_feats):
            _check_grid_type(grid_type=grid_types[i])
        percentile_ranges = _expand_default(x=percentile_ranges, default=None, length=nr_feats)
        for i in range(nr_feats):
            _check_percentile_range(percentile_range=percentile_ranges[i])
        grid_ranges = _expand_default(x=grid_ranges, default=None, length=nr_feats)
        cust_grid_points = _expand_default(x=cust_grid_points, default=None, length=nr_feats)
        _check_memory_limit(memory_limit=memory_limit)

        pdp_isolate_outs = []
        for idx in range(nr_feats):
            pdp_isolate_out = pdp_isolate(
                model=model, dataset=_dataset, model_features=model_features, feature=features[idx],
                num_grid_points=num_grid_points[idx], grid_type=grid_types[idx], percentile_range=percentile_ranges[idx],
                grid_range=grid_ranges[idx], cust_grid_points=cust_grid_points[idx], memory_limit=memory_limit,
                n_jobs=n_jobs, predict_kwds=predict_kwds, data_transformer=data_transformer)
            pdp_isolate_outs.append(pdp_isolate_out)

        if n_classes > 2:
            feature_grids = [pdp_isolate_outs[i][0].feature_grids for i in range(nr_feats)]
            feature_types = [pdp_isolate_outs[i][0].feature_type for i in range(nr_feats)]
        else:
            feature_grids = [pdp_isolate_outs[i].feature_grids for i in range(nr_feats)]
            feature_types = [pdp_isolate_outs[i].feature_type for i in range(nr_feats)]
        grid_combos = _get_grid_combos(feature_grids, feature_types)

    feature_list = []
    for i in range(nr_feats):
        feature_list.extend(_make_list(features[i]))

    # Parallel calculate ICE lines
    true_n_jobs = _calc_memory_usage(
        df=_dataset, total_units=len(grid_combos), n_jobs=n_jobs, memory_limit=memory_limit)
    grid_results = Parallel(n_jobs=true_n_jobs)(delayed(_calc_ice_lines_inter)(
        grid_combo, data=_dataset, model=model, model_features=model_features, n_classes=n_classes,
        feature_list=feature_list, predict_kwds=predict_kwds, data_transformer=data_transformer)
        for grid_combo in grid_combos)

    ice_lines = pd.concat(grid_results, axis=0).reset_index(drop=True)
    pdp = ice_lines.groupby(feature_list, as_index=False).mean()

    # combine the final results
    pdp_interact_params = {'n_classes': n_classes,
                           'features': features,
                           'feature_types': feature_types,
                           'feature_grids': feature_grids}
    if n_classes > 2:
        pdp_interact_out = []
        for n_class in range(n_classes):
            _pdp = pdp[feature_list + ['class_%d_preds' % n_class]].rename(
                columns={'class_%d_preds' % n_class: 'preds'})
            pdp_interact_out.append(
                PDPInteract(which_class=n_class,
                            pdp_isolate_outs=[pdp_isolate_outs[i][n_class] for i in range(nr_feats)],
                            pdp=_pdp, **pdp_interact_params))
    else:
        pdp_interact_out = PDPInteract(
            which_class=None, pdp_isolate_outs=pdp_isolate_outs, pdp=pdp, **pdp_interact_params)
    return pdp_interact_out
def center(arr): return arr - np.mean(arr)
def compute_f_vals(mdl, X, features, selectedfeatures, num_grid_points=10, use_data_grid=False):
    f_vals = {}
    data_grid = None
    if use_data_grid:
        data_grid = X[selectedfeatures].values
    # Calculate partial dependencies for full feature set
    p_full = pdp_multi_interact(mdl, X, features, selectedfeatures,
                                num_grid_points=[num_grid_points] * len(selectedfeatures),
                                cust_grid_combos=data_grid,
                                use_custom_grid_combos=use_data_grid)
    f_vals[tuple(selectedfeatures)] = center(p_full.pdp.preds.values)
    grid = p_full.pdp.drop('preds', axis=1)
    # Calculate partial dependencies for [1..SFL-1]
    for n in range(1, len(selectedfeatures)):
        for subsetfeatures in itertools.combinations(selectedfeatures, n):
            if use_data_grid:
                data_grid = X[list(subsetfeatures)].values
            p_partial = pdp_multi_interact(mdl, X, features, subsetfeatures,
                                           num_grid_points=[num_grid_points] * len(selectedfeatures),
                                           cust_grid_combos=data_grid,
                                           use_custom_grid_combos=use_data_grid)
            p_joined = pd.merge(grid, p_partial.pdp, how='left')
            f_vals[tuple(subsetfeatures)] = center(p_joined.preds.values)
    return f_vals
# the second-order H-measure:
def compute_h_val(f_vals, selectedfeatures):
    denom_els = f_vals[tuple(selectedfeatures)].copy()
    numer_els = f_vals[tuple(selectedfeatures)].copy()
    sign = -1.0
    for n in range(len(selectedfeatures)-1, 0, -1):
        for subfeatures in itertools.combinations(selectedfeatures, n):
            numer_els += sign * f_vals[tuple(subfeatures)]
        sign *= -1.0
    numer = np.sum(numer_els**2)
    denom = np.sum(denom_els**2)
    return math.sqrt(numer/denom) if numer < denom else np.nan
# first-order H-measure as well:
def compute_h_val_any(f_vals, allfeatures, selectedfeature):
    otherfeatures = list(allfeatures)
    otherfeatures.remove(selectedfeature)
    denom_els = f_vals[tuple(allfeatures)].copy()
    numer_els = denom_els.copy()
    numer_els -= f_vals[(selectedfeature,)]
    numer_els -= f_vals[tuple(otherfeatures)]
    numer = np.sum(numer_els**2)
    denom = np.sum(denom_els**2)
    return math.sqrt(numer/denom) if numer < denom else np.nan
df = sns.load_dataset("diamonds")
data = pd.get_dummies(df, ["cut", "color", "clarity"])
X = data.drop("cut_Ideal", axis=1)
y = data["cut_Ideal"]
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X_train, y_train)
f_val = compute_f_vals(gbc, X, X.columns, ['carat', 'depth'], num_grid_points=10, use_data_grid=False)
# second-order H-measure:
compute_h_val(f_val, ['carat', 'depth'])
I want to calculate the feature interactions for all the columns in a dataframe. How could I do that?
I'm not looking for a free code-writing service; I just want to pick up a bit of knowledge from more experienced programmers by discussing the problem. I was simply hoping for a suggestion or reference to an appropriate library or method for finding the feature interactions.
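(Not from the original post.) One straightforward, if potentially slow, way to extend the code above to every pair of columns is to loop over itertools.combinations of the feature names and collect compute_h_val for each pair; this sketch assumes the helpers and the fitted gbc defined above:
# Sketch: second-order H-statistic for every pair of columns (can be very slow on wide frames).
pairwise_h = {}
for feat_a, feat_b in itertools.combinations(X.columns, 2):
    f_val = compute_f_vals(gbc, X, X.columns, [feat_a, feat_b],
                           num_grid_points=10, use_data_grid=False)
    pairwise_h[(feat_a, feat_b)] = compute_h_val(f_val, [feat_a, feat_b])

h_series = pd.Series(pairwise_h, name='h_statistic').sort_values(ascending=False)
print(h_series.head())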

Sklearn - Found input variables with inconsistent numbers of samples: [16512, 4128]

Going through the second chapter of Hands-On Machine Learning with Scikit-Learn & TensorFlow, I'm running into the error stated above. It happens when I try to run the following line:
linReg.fit(housingPrepared, housing_labels)
Researching online, it looks like it has something to do with the dimensions of my features and my labels not matching up. Printing the shapes of housingPrepared (X) and housing_labels (y) yields the following result:
(16512, 16) (4128,)
I've spent the last hour going through the chapter line by line to see if I missed anything, but I can't find the problem. I'm wondering if someone here has an intuition about where a potential solution might be.
Thank you so much in advance. All my code up to the problem line is posted below:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from zlib import crc32
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import Imputer, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from CategoricalEncoder import CategoricalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import check_array
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets","housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetchHousingData(housingUrl=HOUSING_URL, housingPath=HOUSING_PATH):
    if not os.path.isdir(housingPath):
        os.makedirs(housingPath)
    tgzPath = os.path.join(housingPath, "housing.tgz")
    urllib.request.urlretrieve(housingUrl, tgzPath)
    housingTgz = tarfile.open(tgzPath)
    housingTgz.extractall(path=housingPath)
    housingTgz.close()

def loadHousingData(housingPath=HOUSING_PATH):
    return pd.read_csv("https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv")
housing = loadHousingData()
#plt.hist(housing['longitude'],bins=50)
#plt.show()
def splitTrainTesT(data, testRatio):
    shuffled_indices = np.random.permutation(len(data))
    testSetSize = int(len(data) * testRatio)
    testIndices = shuffled_indices[:testSetSize]
    trainIndices = shuffled_indices[testSetSize:]
    return data.iloc[trainIndices], data.iloc[testIndices]

def testSetCheck(identifier, testRatio):
    return crc32(np.int64(identifier)) & 0xffffffff < testRatio * 2 ** 32

def splitTrainTestByID(data, testRatio, idColumn):
    ids = data[idColumn]
    inTestSet = ids.apply(lambda id_: testSetCheck(id_, testRatio))
    return data.loc[~inTestSet], data.loc[inTestSet]
#housingWithID = housing.reset_index()
#trainSet, testSet = splitTrainTestByID(housingWithID,0.2,"index")
trainSet, testSet = train_test_split(housing,test_size=0.2,random_state=42)
housing["income_cat"] = np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
#plt.hist(housing["income_cat"])
#plt.show()
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for trainIndex, testIndex in split.split(housing, housing["income_cat"]):
    stratTrainSet = housing.loc[trainIndex]
    stratTestSet = housing.loc[testIndex]
for set in (stratTrainSet, stratTestSet):
    set.drop("income_cat", axis=1, inplace=True)
housing = stratTrainSet.copy()
#print(housing)
#plt.scatter(x=housing["latitude"],y=housing["longitude"], alpha=0.4)
#plt.show()
corr_matrix = housing.corr()
#print(corr_matrix["median_house_value"].sort_values(ascending=False))
#attribues = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
#scatter_matrix(housing[attribues], figsize=(12,8))
#plt.show()
""" PREPARING DATA FOR MACHINE LEARNING ALGORITHMS"""
housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTestSet["median_house_value"].copy()
housing.dropna(subset=["total_bedrooms"])
imputer = Imputer(strategy="median")
housingNum = housing.drop("ocean_proximity", axis=1)
imputer.fit(housingNum)
X = imputer.transform(housingNum)
housingTr = pd.DataFrame(X, columns=housingNum.columns)
housingCat = housing["ocean_proximity"]
housingCatEncoded, housingCategories = housingCat.factorize()
encoder = OneHotEncoder()
housingCat1Hot = encoder.fit_transform(housingCatEncoded.reshape(-1,1))
"""Custom Transformers For Rooms Per Household, etc"""
roomsIX, bedroomsIX, populationIX, householdsIX = 3,4,5,6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, addBedroomsPerRoom = True):
        self.addBedroomsPerRoom = addBedroomsPerRoom
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        roomsPerHousehold = X[:,roomsIX]/X[:,householdsIX]
        populationPerHousehold = X[:,populationIX]/X[:,householdsIX]
        if self.addBedroomsPerRoom:
            bedroomsPerRoom = X[:,bedroomsIX]/X[:,roomsIX]
            return np.c_[X, roomsPerHousehold, populationPerHousehold, bedroomsPerRoom]
        else:
            return np.c_[X, roomsPerHousehold, populationPerHousehold]
attrAdder = CombinedAttributesAdder(addBedroomsPerRoom=False)
housingExtraAttribs = attrAdder.transform(housing.values)
numPipeline = Pipeline([('imputer', Imputer(strategy='median')),
                        ('attribs_adder', CombinedAttributesAdder()),
                        ('std_scaler', StandardScaler()),
                        ])
housingNumTr = numPipeline.fit_transform(housingNum)
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attributeNames):
        self.attributeNames = attributeNames
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attributeNames].values
numAttribs = list(housingNum)
catAttribs = ["ocean_proximity"]
numPipeline = Pipeline([('selector', DataFrameSelector(numAttribs)),
                        ('imputer', Imputer(strategy='median')),
                        ('attribs_adder', CombinedAttributesAdder()),
                        ('std_scaler', StandardScaler()),])
"""UPDATE SKLEARN TO INCLUDE CATEGORICAL ENCODER LIBRARY"""
catPipeline = Pipeline([('selector', DataFrameSelector(catAttribs)),
                        ('cat_encoder', CategoricalEncoder(encoding='onehot-dense')),
                        ])
fullPipeline = FeatureUnion(transformer_list=[("num_pipeline", numPipeline), ("cat_pipeline", catPipeline),])
housingPrepared = fullPipeline.fit_transform(housing)
linReg = LinearRegression()
print(housingPrepared.shape, housing_labels.shape)
linReg.fit(housingPrepared, housing_labels)
I believe the problem is in these two lines:
housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTestSet["median_house_value"].copy()
The features come from the stratified training set (16,512 rows, 80% of the data) while the labels are copied from the test set (4,128 rows, 20%), which is exactly the sample-count mismatch in the error message. Change it to:
housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTrainSet["median_house_value"].copy()
and you're good to go.
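As a quick check (reusing the print statement from the question), after this change both objects should report 16,512 samples and the fit should run:
print(housingPrepared.shape, housing_labels.shape)  # expected: (16512, 16) (16512,)
linReg.fit(housingPrepared, housing_labels)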
