I have this Decision Tree algorithm:
import sys
import numpy as np
import pylab as pl
from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

features_train, labels_train, features_test, labels_test = makeTerrainData()

clf = DecisionTreeClassifier()
clf = clf.fit(features_train, labels_train)

# predict() takes only the features; accuracy_score compares the result to the labels
prediction = clf.predict(features_test)
acc = accuracy_score(labels_test, prediction)

def submitAccuracies():
    return {"acc": round(acc, 3)}
When I run it, I get the following error:
Traceback (most recent call last):
  File "vm_main.py", line 33, in <module>
    import main
  File "/tmp/vmuser_yafetikvhw/main.py", line 2, in <module>
    import studentMain
  File "/tmp/vmuser_yafetikvhw/studentMain.py", line 10, in <module>
    student_output = student_code.submitAccuracies()
  File "/tmp/vmuser_yafetikvhw/decisionTreeAccuracyQuiz.py", line 34, in submitAccuracies
    return float({"acc":round(acc,3)})
TypeError: a float is required
I'm stuck here. I've tried turning my input into a float, for example acc = acc/1.0 or acc = float(acc), but I still get the same error.
Thanks.
accuracy_score returns the number of correctly classified samples (an int) when normalize=False, and the fraction of correctly classified samples (a float) by default. From here:
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
Try this one:
return {"acc":round(float(acc), 3)}
I'm trying to get randomized hyperparameter search to work with the voting classifier from sklearn by adapting the example given in the sklearn documentation.
I've seen this minimal working example, but it breaks in many ways with my version of sklearn.
Here is a stripped-down example:
import numpy as np
from sklearn import __version__ as skv
from sklearn.ensemble import RandomForestClassifier as RFClassi
from sklearn.ensemble import HistGradientBoostingClassifier as HGBClassi
from sklearn.tree import DecisionTreeClassifier as DTClassi
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.datasets import load_iris

print(f"sklearn version: {skv}")

df_X, target = load_iris(return_X_y=True, as_frame=True)

ensemble = ['rf', 'dtree', 'hgb']
hy_pa_grid = {
    'hgb': dict(learning_rate=list(np.linspace(0.01, 0.5, 10).round(3))),
    'rf': dict(criterion=['gini', 'entropy']),
    'dtree': dict(criterion=['gini', 'entropy']),
}

clfs = {'hgb': HGBClassi(), 'rf': RFClassi(), 'dtree': DTClassi()}
vc = VotingClassifier(estimators=clfs.items(), voting='soft')

params = {
    f"{c}__{p}": hy_pa_grid[c][p]
    for c in ensemble
    for p in hy_pa_grid[c].keys()
}
print("\n".join(map(str, params.items())))

clf = RandomizedSearchCV(estimator=vc, param_distributions=params)
clf.fit(df_X, target)
The output I get is this:
sklearn version: 1.1.3
{'rf__criterion': ['gini', 'entropy'], 'dtree__criterion': ['gini', 'entropy'], 'hgb__learning_rate': [0.01, 0.064, 0.119, 0.173, 0.228, 0.282, 0.337, 0.391, 0.446, 0.5]}
Traceback (most recent call last):
  File "vc.py", line 34, in <module>
    clf.fit(df_X,target)
  File "/home/USER/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py", line 789, in fit
    base_estimator = clone(self.estimator)
  File "/home/USER/.local/lib/python3.8/site-packages/sklearn/base.py", line 87, in clone
    new_object_params[name] = clone(param, safe=False)
  File "/home/USER/.local/lib/python3.8/site-packages/sklearn/base.py", line 68, in clone
    return copy.deepcopy(estimator)
  File "/usr/lib/python3.8/copy.py", line 161, in deepcopy
    rv = reductor(4)
TypeError: cannot pickle 'dict_items' object
Any ideas for getting round this? I also tried doing it with GridSearchCV, as in the example, but I get the same error.
Oops, it turns out the problem was in
estimators = clfs.items()
All was well once I wrapped it in tuple() so that it was an actual tuple of (name, estimator) pairs rather than a dict_items view, which clone() cannot deepcopy (hence the pickling error).
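For reference, a self-contained sketch of the fix, trimmed down to two estimators but the same idea as the snippet above:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
clfs = {'rf': RandomForestClassifier(), 'dtree': DecisionTreeClassifier()}

# Materialize the dict view: clone() can deepcopy a list (or tuple) of
# (name, estimator) pairs, but not a dict_items object.
vc = VotingClassifier(estimators=list(clfs.items()), voting='soft')

params = {'rf__criterion': ['gini', 'entropy'],
          'dtree__criterion': ['gini', 'entropy']}
search = RandomizedSearchCV(estimator=vc, param_distributions=params, n_iter=4)
search.fit(X, y)
print(search.best_params_)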
Hi, I'm having some trouble with my code. I'm kinda new to Python, but I do have some understanding of classes and objects.
import random

class ScrappyKNN():
    def fit(self, X_train, Y_train):
        self.X_train = X_train
        self.Y_train = Y_train

    def predict(self, X_test, a):
        predictions = []
        for row in X_test:
            label = random.choice(self.Y_train)
            predictions.append(label)
        return predictions

import numpy as np
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data
Y = iris.target

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5)

#from sklearn.neighbours import KNeighboursClassifier
my_classifier = ScrappyKNN()
my_classifier = my_classifier.fit(X_train, Y_train)
predictions = my_classifier.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test, predictions))
but I'm getting an error when I run it:
Traceback (most recent call last):
  File "pipelineKNeighbours.py", line 30, in <module>
    predictions = my_classifier.predict(X_test)
AttributeError: 'NoneType' object has no attribute 'predict'
What am I doing wrong?
The line
my_classifier = my_classifier.fit(X_train, Y_train)
rebinds my_classifier to the return value of fit(), which is None because the method has no return statement. After that, my_classifier no longer points to an instance of your ScrappyKNN class but to a NoneType object; thus the error.
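A minimal sketch of the fix, assuming the class above: call fit() for its side effect (or have it return self, as sklearn estimators do) instead of reassigning, and drop the unused a parameter from predict(), which would otherwise raise a TypeError on the next run:

import random
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

class ScrappyKNN:
    def fit(self, X_train, Y_train):
        self.X_train = X_train
        self.Y_train = Y_train
        return self  # returning self makes `clf = clf.fit(...)` harmless, sklearn-style

    def predict(self, X_test):  # no stray extra parameter
        return [random.choice(self.Y_train) for _ in X_test]

iris = datasets.load_iris()
X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target, test_size=.5)

my_classifier = ScrappyKNN()
my_classifier.fit(X_train, Y_train)  # no reassignment needed
print(accuracy_score(Y_test, my_classifier.predict(X_test)))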
I am trying to do regression for my own data following the example of Deep Neural Network Regression with Boston Data.
Here is my code:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from sklearn import cross_validation
from sklearn import metrics
from sklearn import preprocessing
import tensorflow as tf
from tensorflow.contrib import learn
from numpy import genfromtxt

def main():
    x_test = genfromtxt('ARM4mDec2002Jul2015OklahomaV2_mar_apr_may_date_time_normalized_16000_test_data_x.csv', delimiter=',')
    y_test = genfromtxt('ARM4mDec2002Jul2015OklahomaV2_mar_apr_may_date_time_normalized_16000_test_data_y.csv', delimiter=',')
    x_train = genfromtxt('ARM4mDec2002Jul2015OklahomaV2_mar_apr_may_date_time_normalized_16000_training_data_x.csv', delimiter=',')
    y_train = genfromtxt('ARM4mDec2002Jul2015OklahomaV2_mar_apr_may_date_time_normalized_16000_training_data_y.csv', delimiter=',')

    # Build 2 layer fully connected DNN with 10, 10 units respectively.
    feature_columns = learn.infer_real_valued_columns_from_input(x_train)
    regressor = learn.DNNRegressor(
        feature_columns=feature_columns, hidden_units=[10, 10])

    # Fit
    regressor.fit(x_train, y_train, steps=10, batch_size=1)

    # Predict and score
    y_predicted = list(
        regressor.predict(scaler.transform(x_test), as_iterable=True))
    score = metrics.mean_squared_error(y_predicted, y_test)
    print('MSE: {0:f}'.format(score))

if __name__ == '__main__':
    tf.app.run()
I am getting lots of warnings and an invalid argument error. The complete console output is available here. The highlights of the error messages are below.
W tensorflow/core/framework/op_kernel.cc:975] Invalid argument: Nan in summary histogram for: dnn/hiddenlayer_0_activation
     [[Node: dnn/hiddenlayer_0_activation = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"](dnn/hiddenlayer_0_activation/tag, dnn/hiddenlayer_0/hiddenlayer_0/Relu)]]
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "arm_data_regression.py", line 39, in main
    regressor.fit(x_train, y_train, steps=10, batch_size=1)
  File "/home/shehab/.local/lib/python2.7/site-packages/tensorflow/python/util/deprecation.py", line 191, in new_func
    return func(*args, **kwargs)
Is this because I am using some deprecated TensorFlow API?
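Before blaming a deprecated API, it may be worth ruling out NaNs in the inputs: genfromtxt fills missing or unparseable CSV fields with nan, which is a common cause of the "Nan in summary histogram" error. (The snippet also calls scaler.transform even though scaler is never defined, presumably left over from the Boston example.) A minimal check, using hypothetical shortened file names in place of the long ones above:

from __future__ import print_function
import numpy as np
from numpy import genfromtxt

# genfromtxt fills missing or unparseable CSV fields with nan; any nan
# reaching the network can surface as "Nan in summary histogram".
for name in ['x_train.csv', 'y_train.csv', 'x_test.csv', 'y_test.csv']:
    arr = genfromtxt(name, delimiter=',')
    print(name, 'shape:', arr.shape, 'NaNs:', int(np.isnan(arr).sum()))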
The problem is really strange, because this piece of code worked fine with another dataset.
The full code:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split

# Split the learning set
X_fit, X_eval, y_fit, y_eval = train_test_split(
    train, target, test_size=0.2, random_state=1
)

clf = xgb.XGBClassifier(missing=np.nan, max_depth=6,
                        n_estimators=5, learning_rate=0.15,
                        subsample=1, colsample_bytree=0.9, seed=1400)

# fitting
clf.fit(X_fit, y_fit, early_stopping_rounds=50, eval_metric="logloss", eval_set=[(X_eval, y_eval)])

#print y_pred
y_pred = clf.predict_proba(test)[:, 1]
The last line causes the error below (full output provided):
Will train until validation_0 error hasn't decreased in 50 rounds.
[0] validation_0-logloss:0.554366
[1] validation_0-logloss:0.451454
[2] validation_0-logloss:0.372142
[3] validation_0-logloss:0.309450
[4] validation_0-logloss:0.259002
Traceback (most recent call last):
  File "../src/script.py", line 57, in <module>
    y_pred = clf.predict_proba(test)[:,1]
  File "/opt/conda/lib/python3.4/site-packages/xgboost-0.4-py3.4.egg/xgboost/sklearn.py", line 435, in predict_proba
    test_dmatrix = DMatrix(data, missing=self.missing)
  File "/opt/conda/lib/python3.4/site-packages/xgboost-0.4-py3.4.egg/xgboost/core.py", line 220, in __init__
    feature_types)
  File "/opt/conda/lib/python3.4/site-packages/xgboost-0.4-py3.4.egg/xgboost/core.py", line 147, in _maybe_pandas_data
    raise ValueError('DataFrame.dtypes for data must be int, float or bool')
ValueError: DataFrame.dtypes for data must be int, float or bool
Exception ignored in: >
Traceback (most recent call last):
  File "/opt/conda/lib/python3.4/site-packages/xgboost-0.4-py3.4.egg/xgboost/core.py", line 289, in __del__
    _check_call(_LIB.XGDMatrixFree(self.handle))
AttributeError: 'DMatrix' object has no attribute 'handle'
What is wrong here? I have no idea how to fix it.
UPD1: Actually this is a Kaggle problem: https://www.kaggle.com/insaff/bnp-paribas-cardif-claims-management/xgboost
The problem here is related to the initial data: some of the values are float or integer, and some are object (strings). This is why we need to encode them:
from sklearn import preprocessing

for f in train.columns:
    if train[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[f].values))
        train[f] = lbl.transform(list(train[f].values))

for f in test.columns:
    if test[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(test[f].values))
        test[f] = lbl.transform(list(test[f].values))

train.fillna(-999, inplace=True)
test.fillna(-999, inplace=True)

train = np.array(train)
test = np.array(test)
train = train.astype(float)
test = test.astype(float)
You might also want to take a look at a categorical-variable solution, as shown below:
for col in train.select_dtypes(include=['object']).columns:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

# Encoding categorical features
for col in train.select_dtypes(include=['category']).columns:
    train[col] = train[col].cat.codes
    test[col] = test[col].cat.codes

train.fillna(-999, inplace=True)
test.fillna(-999, inplace=True)

train = np.array(train)
test = np.array(test)
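A toy, self-contained illustration of the second approach (hypothetical column names standing in for the Kaggle data), showing that .cat.codes turns strings into integers and maps missing values to -1, so the fillna step matters mainly for the numeric columns:

import numpy as np
import pandas as pd

# Hypothetical toy frame standing in for `train`
train = pd.DataFrame({'color': ['red', 'blue', None, 'red'],
                      'size': [1.0, np.nan, 3.0, 4.0]})

for col in train.select_dtypes(include=['object']).columns:
    train[col] = train[col].astype('category').cat.codes  # NaN becomes -1

train.fillna(-999, inplace=True)  # only numeric NaNs are left at this point
print(train.dtypes)
print(train)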
I am building a Bayesian Ridge Regression model using sklearn on the Parkinson's Telemonitoring Data Set. This is the code:
import math
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn import linear_model

data1 = pd.read_csv("data.csv")

msk = np.random.rand(len(data1)) < 0.66
train = data1[msk]
test = data1[~msk]

y = train[['motor_UPDRS', 'total_UPDRS']]
X = train.drop('motor_UPDRS', axis=1)
X = X.drop('total_UPDRS', axis=1)

labels = test[['motor_UPDRS', 'total_UPDRS']]
test = test.drop('motor_UPDRS', axis=1)
test = test.drop('total_UPDRS', axis=1)

clf = linear_model.BayesianRidge()
clf.fit(X, y)
The data set is split into a 66% training set and a 33% test set. When I run it, I get the following error:
Traceback (most recent call last):
  File "<ipython-input-8-c4e92f3e0bf9>", line 1, in <module>
    runfile('C:/Users/Keshav/Desktop/Spring/ML/Project/parkinsons/main6.py', wdir='C:/Users/Keshav/Desktop/Spring/ML/Project/parkinsons')
  File "C:\Users\Keshav\Anaconda\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 580, in runfile
    execfile(filename, namespace)
  File "C:/Users/Keshav/Desktop/Spring/ML/Project/parkinsons/main6.py", line 27, in <module>
    clf.fit(X,y)
  File "C:\Users\Keshav\Anaconda\lib\site-packages\sklearn\linear_model\bayes.py", line 212, in fit
    self._set_intercept(X_mean, y_mean, X_std)
  File "C:\Users\Keshav\Anaconda\lib\site-packages\sklearn\linear_model\base.py", line 159, in _set_intercept
    self.coef_ = self.coef_ / X_std
ValueError: operands could not be broadcast together with shapes (20,2) (20,)
Any idea how to resolve it?
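One likely culprit, offered as a hedged guess since it is not confirmed in this thread: BayesianRidge supports only a single 1-D target, and y here has two columns (motor_UPDRS and total_UPDRS), which matches the (20,2) vs (20,) shapes in the broadcast error. A minimal sketch, reusing X, y, and test from the code above, is to fit one model per target column:

from sklearn import linear_model

targets = ['motor_UPDRS', 'total_UPDRS']
models = {}
for t in targets:
    clf = linear_model.BayesianRidge()
    clf.fit(X, y[t])  # 1-D target of shape (n_samples,)
    models[t] = clf

# Predict each target separately on the held-out rows
preds = {t: models[t].predict(test) for t in targets}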