Tensorflow: how to create feature_columns for numpy matrix input - python

I'm using tensorflow 1.8.0, python 3.6.5.
The data is iris data set. Here is the code:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Load the data set first: `iris` was indexed below without ever being defined.
iris = load_iris()
X = iris['data']
y = iris['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Input function that feeds the raw numpy arrays to the estimator.
input_train = tf.estimator.inputs.numpy_input_fn(
    x=X_train,
    y=y_train,
    num_epochs=100,
    shuffle=False)

# `...` is a placeholder (originally written as `??`): the value to pass as
# feature_columns for a plain numpy matrix is exactly what is being asked.
classifier_model = tf.estimator.DNNClassifier(
    hidden_units=[10, 20, 10],
    n_classes=3,
    feature_columns=...)
Here is my problem: how do I set up the feature_columns for a numpy matrix?
If I convert X and y to a pandas.DataFrame, I can use the following code for the feature_columns, and it works in the DNNClassifier model.
# Build one numeric feature column per DataFrame column, keyed by column name.
features = X.columns
feature_columns = [
    tf.feature_column.numeric_column(key=column_name)
    for column_name in features
]

You can wrap your numpy ndarray in a dictionary and pass it to the numpy_input_fn method as input x, and then use the key in that dictionary to define your feature_column. Also note that because each sample in your X_train has 4 features, you need to specify the shape parameter when defining tf.feature_column.numeric_column. Here is the complete code:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Load iris and hold out 20% of the rows for testing.
iris = load_iris()
X = iris['data']
y = iris['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Wrap the feature matrix in a dict so it can be addressed by the key 'x'.
input_train = tf.estimator.inputs.numpy_input_fn(
    x={'x': X_train}, y=y_train, num_epochs=100, shuffle=False)

# A single numeric column under the same key, with one slot per column of
# the feature matrix.
x_column = tf.feature_column.numeric_column(
    key='x', shape=(X_train.shape[1],))
feature_columns = [x_column]

classifier_model = tf.estimator.DNNClassifier(
    hidden_units=[10, 20, 10], n_classes=3, feature_columns=feature_columns)

Related

How can I get the final tree model?

Given this model:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
# The three imports marked "was missing" are required by names used below.
from sklearn.model_selection import train_test_split  # was missing
import graphviz
import xgboost  # was missing: provides xgboost.to_graphviz
from xgboost import XGBClassifier  # was missing

# Synthetic binary classification data with columns named X1..X10 plus Y.
X, y = make_classification(n_samples=1000, n_features=10, n_informative=3, n_redundant=5, random_state=42)
df = pd.DataFrame(data=X)
df.columns = 'X' + (df.columns + 1).astype(str)
# Truncate the last three feature columns to integers.
df[df.columns[-3:]] = df[df.columns[-3:]].astype(int)
df['Y'] = y

X_train, X_test, y_train, y_test = train_test_split(df.drop('Y', axis=1), df['Y'], test_size=0.3, random_state=42)

# Count each class once, then weight the positive class by the
# negative/positive ratio of the training labels.
class_counts = y_train.value_counts().sort_index()
n_negative_class = class_counts[0]
n_positive_class = class_counts[1]

xgb = XGBClassifier(
    random_state=42,
    n_estimators=50,
    scale_pos_weight=n_negative_class / n_positive_class,
    use_label_encoder=False)
xgb.fit(X_train, y_train, eval_metric="auc")

# Despite the name, these are positive-class probabilities for the test set.
y_train_scores = xgb.predict_proba(X_test)[:, 1]

# Render the tree with index 49, i.e. the last of the 50 estimators.
xgboost.to_graphviz(xgb, num_trees=49)
How can I plot the final tree used in xgb.predict_proba(X_test)[:,1]? Is it necessarily the last one (since each XGBoost tree learns from the previous one)? Or does XGBoost choose one tree among those 50 estimators based on the loss or the eval_metric given?

Error when running linear kernel ridge regression fitting on dataset

Trying to fit a linear kernel ridge regression model on a dataset with 8 features.
import urllib.request

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Download the concrete compressive-strength data set and load it.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls'
urllib.request.urlretrieve(url, './Concrete_Data.xls')
data = pd.read_excel('./Concrete_Data.xls')

# Replace the original column headers with short names, matched by position.
new_col_names = ["Cement", "BlastFurnaceSlag", "FlyAsh", "Water", "Superplasticizer", "CoarseAggregate", "FineAggregate", "Age", "CC_Strength"]
curr_col_names = list(data.columns)
mapper = dict(zip(curr_col_names, new_col_names))
data = data.rename(columns=mapper)

# The last column is the target; every other column is a feature.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

kr = KernelRidge(alpha=1.0)
kr.fit(x_train, y_train)

# predict() takes the 2-D feature matrix. The original call passed y_test,
# the 1-D target, which is what raised the "expected 2D array" error.
y_pred_kr = kr.predict(x_test)
When I try to run the code, there is an error that says the expected array is meant to be 2D but is a 1D array. Could someone let me know what I am possibly doing wrong?

What does the error mean and how to fix it - "ValueError: query data dimension must match training data dimension"

I am trying to write the code for K-NN
Below is my code. I know that the issue is in `predict()`, but I am not able to figure out how to fix it.
# Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Load the dataset; column 9 is the target, the listed columns are features
feature_cols = [1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13]
target_col = 9
dataset = pd.read_csv('UniversalBank.csv')
X = dataset.iloc[:, feature_cols].values
y = dataset.iloc[:, target_col].values

# Hold out 25% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

# Scale features: fit on the training split, reuse the fit for the test split
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# 5-nearest-neighbours classifier using the Minkowski metric with p=2
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)

# Predict labels for the test split
y_pred = classifier.predict(X_test)

How can I predict the outcome in python?

I have the following code, where I predict a value from 4 input values:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

data = np.loadtxt('C:/Users/hedeg/Desktop/RulaSoftEdgePrediction.txt')

# The first 3500 rows are used for training and the rest for testing.
# Columns 0-3 are the inputs; column 4 is the value to predict.
X_train = np.array(data[0:3500, 0:4])
y_train = np.array(data[0:3500, 4])
X_test = np.array(data[3500:, 0:4])
y_test = np.array(data[3500:, 4])

# The target holds continuous values (1.0, 1.1, 1.2, ...), so a regressor is
# used. Fitting MLPClassifier on such a target is what raised
# "ValueError: Unknown label type".
clf = MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_train)
I get this error msg:
raise ValueError("Unknown label type: %s" % repr(ys))
ValueError: Unknown label type: (array([1. , 1.1, 1.2, ..., 3. , 3. , 3. ]),)
How can i solve this problem?
Try to use this one:
# example of training a final classification model
# (the snippet was pasted twice verbatim; a single copy is kept)
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_blobs

# generate 2d classification dataset
X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1)

# fit final model
model = LogisticRegression()
model.fit(X, y)

Why does the output of cross_validate differ from the hard-coded loop when using XGBClassifier?

Code #1: Pass a pipeline with PCA and XGBClassifier steps to the scikit-learn cross_validate function
import random

import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import LeaveOneOut, cross_validate
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# Seed both the stdlib and numpy global random generators.
random.seed(42)
np.random.seed(42)

# Arguments for cross_validate: leave-one-out splits, all cores.
# X and y are expected to be defined before this snippet runs.
kwargs = dict(n_jobs=-1, cv=LeaveOneOut(), X=X, y=y)

# One-component PCA followed by an XGBoost classifier.
steps = [
    ('pca', PCA(1, random_state=42)),
    ('xgbc', XGBClassifier(random_state=42)),
]
pipe = Pipeline(steps)

results = cross_validate(pipe, **kwargs)
print(results['test_score'].mean())
Code #2: Write the cross-validation loop by hand and calculate the mean accuracy for exactly the same input X as Code #1
import random

import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import LeaveOneOut
from xgboost import XGBClassifier

# Seed both the stdlib and numpy global random generators.
random.seed(42)
np.random.seed(42)

# Per-split scores; X and y are expected to be defined before this snippet.
acc = []
splitter = LeaveOneOut()
for train_idx, test_idx in splitter.split(X, y):
    x_train = X[train_idx]
    x_test = X[test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]

    # Fit the one-component PCA on the training rows only, then project
    # both the training rows and the held-out row with it.
    pca = PCA(1, random_state=42)
    pca.fit(x_train)
    x_train = pca.transform(x_train)
    x_test = pca.transform(x_test)

    model = XGBClassifier(random_state=42, n_jobs=-1)
    model.fit(x_train, y_train)

    score = model.score(x_test, y_test)
    acc.append(score)

print(np.mean(acc))

Categories