I am trying to follow the machine-learning tutorial listed here: https://machinelearningmastery.com/machine-learning-in-python-step-by-step/, but I am encountering an issue. I was able to run the following code on my MacBook Air; however, it did not work on my Windows machine. I checked other questions with similar titles, but none of them seem to fit my problem.
Why is this happening? How can it be fixed?
My entire code:
# Python version
import sys
print('Python: {}'.format(sys.version))
# scipy
import scipy
print('scipy: {}'.format(scipy.__version__))
# numpy
import numpy
print('numpy: {}'.format(numpy.__version__))
# matplotlib
import matplotlib
print('matplotlib: {}'.format(matplotlib.__version__))
# pandas
import pandas
print('pandas: {}'.format(pandas.__version__))
# scikit-learn
import sklearn
print('sklearn: {}'.format(sklearn.__version__))
# compare algorithms
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Load dataset
url = "energyFormatted.csv"
names = ['TOTAL', 'PURCHASED', 'NUCLEAR', 'SOLAR', 'WIND', 'NATURAL_GAS', 'COAL', 'OIL']
dataset = read_csv(url, names=names)
print(dataset.shape)
# Split-out validation dataset
array = dataset.values
X = array[:, 0:4]
y = array[:, 4]
X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, test_size=0.20, random_state=1, shuffle=True)
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
The line that's giving me an error:
cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
The error itself:
Traceback (most recent call last):
File "D:\Applications\pythonProject\venv\lib\site-packages\joblib\parallel.py", line 862, in dispatch_one_batch
tasks = self._ready_batches.get(block=False)
File "C:\Users\danie\AppData\Local\Programs\Python\Python39\lib\queue.py", line 168, in get
raise Empty
_queue.Empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\danie\AppData\Roaming\JetBrains\PyCharmCE2022.2\scratches\FY23 SCI FAIR\main.py", line 63, in <module>
cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=None)
File "D:\Applications\pythonProject\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 515, in cross_val_score
cv_results = cross_validate(
File "D:\Applications\pythonProject\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 266, in cross_validate
results = parallel(
File "D:\Applications\pythonProject\venv\lib\site-packages\joblib\parallel.py", line 1085, in __call__
if self.dispatch_one_batch(iterator):
File "D:\Applications\pythonProject\venv\lib\site-packages\joblib\parallel.py", line 873, in dispatch_one_batch
islice = list(itertools.islice(iterator, big_batch_size))
File "D:\Applications\pythonProject\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 266, in <genexpr>
results = parallel(
File "D:\Applications\pythonProject\venv\lib\site-packages\sklearn\model_selection\_split.py", line 340, in split
for train, test in super().split(X, y, groups):
File "D:\Applications\pythonProject\venv\lib\site-packages\sklearn\model_selection\_split.py", line 86, in split
for test_index in self._iter_test_masks(X, y, groups):
File "D:\Applications\pythonProject\venv\lib\site-packages\sklearn\model_selection\_split.py", line 717, in _iter_test_masks
test_folds = self._make_test_folds(X, y)
File "D:\Applications\pythonProject\venv\lib\site-packages\sklearn\model_selection\_split.py", line 660, in _make_test_folds
raise ValueError(
ValueError: Supported target types are: ('binary', 'multiclass'). Got 'continuous' instead.
CSV:
28564,0,6284.08,1713.84,19.9948,19994.8,19.9948,19.9948
28411,0,6250.42,852.33,0,20740.03,568.22,0
27515,0,6053.3,550.3,0,20361.1,550.3,0
24586,491.72,5408.92,245.86,0,17947.78,491.72,0
26653,533.06,6130.19,0,0,18923.63,1066.12,0
26836,805.08,6172.28,0,0,18785.2,1073.44,0
26073,1303.65,5736.06,0,0,17990.37,1042.92,0
27055,1352.75,6222.65,0,0,18397.4,1082.2,0
26236,1311.8,6034.28,0,0,17578.12,1311.8,0
26020,1821.4,3903,0,0,18994.6,1040.8,260.2
26538,0,4246.08,265.38,13799.76,6369.12,0,1326.9
25800,3354,5160,0,0,14964,1290,1032
26682,3468.66,5603.22,0,0,14941.92,1600.92,1067.28
24997,3499.58,5499.34,0,0,13248.41,1499.82,1249.85
25100,3765,4769,0,0,13052,1506,2008
24651,4190.67,4930.2,0,0,12325.5,1232.55,1972.08
12053,0,1084.77,0,3133.78,6508.62,0,723.18
11500,2070,2415,0,0,4255,690,2070
Accuracy doesn't make sense for scoring a continuous target. The error indicates that your y values are floats (or otherwise continuous), so classification models, StratifiedKFold, and the 'accuracy' metric don't apply. You could treat this as a regression problem and use something like sklearn.metrics.mean_squared_error instead of accuracy.
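For example, here is a minimal sketch of what a regression-style version of your evaluation loop could look like. The regressor choices and the 'neg_mean_squared_error' scorer string (the cross-validation counterpart of mean_squared_error) are assumptions, not part of the original tutorial, and plain KFold replaces StratifiedKFold because stratification also requires class labels:
# Sketch only: evaluate regressors with a regression metric instead of accuracy.
# Model choices and scoring below are assumptions; adapt them to your data.
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
reg_models = [('LinReg', LinearRegression()), ('CART', DecisionTreeRegressor())]
for name, model in reg_models:
    kfold = KFold(n_splits=10, random_state=1, shuffle=True)
    # neg_mean_squared_error: values closer to 0 are better
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='neg_mean_squared_error')
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))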
I am following Müller & Guido's Machine Learning with Python book, and I am trying to run classifications on this dataset.
So far my code looks like this:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
# Read the Churn data into a dataset (pandas) from the cvs file
dataset = pd.read_csv(r'C:\Users\Amalie\IdeaProjects\INFO284\src\Lab2.csv')
# Make the data into a 2D NumPy array (as scikit-learn expects for the data)
dataframe = dataset[['SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines',
                     'InternetService', 'OnlineSecurity', 'Churn']]
y = dataframe['Churn'] # Target
X = dataframe.drop('Churn', 1) # Features ( all other than target column 'Churn' )
# Logistic Regression
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20) # Split into test/training sets
logReg = LogisticRegression(max_iter=100000).fit(X_train, y_train)
print("Training set score: {:.3f}".format(logReg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logReg.score(X_test, y_test)))
When I run it, I get this error:
Traceback (most recent call last):
File "C:/Users/Amalie/IdeaProjects/INFO284/src/Lab5.py", line 19, in <module>
logReg = LogisticRegression(max_iter=100000).fit(X_train, y_train)
File "C:\Users\Amalie\IdeaProjects\INFO284\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1514, in fit
accept_large_sparse=solver not in ["liblinear", "sag", "saga"],
File "C:\Users\Amalie\IdeaProjects\INFO284\venv\lib\site-packages\sklearn\base.py", line 581, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "C:\Users\Amalie\IdeaProjects\INFO284\venv\lib\site-packages\sklearn\utils\validation.py", line 976, in check_X_y
estimator=estimator,
File "C:\Users\Amalie\IdeaProjects\INFO284\venv\lib\site-packages\sklearn\utils\validation.py", line 746, in check_array
array = np.asarray(array, order=order, dtype=dtype)
File "C:\Users\Amalie\IdeaProjects\INFO284\venv\lib\site-packages\pandas\core\generic.py", line 1993, in __array__
return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'No'
Process finished with exit code 1
It says that the problem is with this line
logReg = LogisticRegression(max_iter=100000).fit(X_train, y_train)
I have used the fit() method before when running other classification problems, but I've never come across this issue. What am I doing wrong?
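The "could not convert string to float: 'No'" error means scikit-learn is trying to cast the selected columns to numbers while several of them (and the 'Churn' target) contain strings such as 'Yes'/'No'. Below is a minimal sketch of one way to encode them before fitting; it assumes the question's imports and the dataframe built above, and the map/get_dummies choices are assumptions about this particular churn CSV rather than the book's approach:
# Sketch only: encode string-valued columns before fitting.
# Assumes the question's imports and `dataframe` from above.
y = dataframe['Churn'].map({'No': 0, 'Yes': 1})        # binary target as 0/1
X = pd.get_dummies(dataframe.drop(columns=['Churn']))  # one-hot encode string features
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)
logReg = LogisticRegression(max_iter=100000).fit(X_train, y_train)
print("Training set score: {:.3f}".format(logReg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logReg.score(X_test, y_test)))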
I'm trying to calculate the accuracy score of an SVM that uses a Laplacian kernel (as a precomputed kernel). However, I get the error below when I try to compute the score.
My code :
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics.pairwise import laplacian_kernel
#Load the iris data
iris_data = load_iris()
#Split the data and target
X = iris_data.data
y = iris_data.target
#Convert X and y to a numpy array
X = np.array(X)
y = np.array(y)
#Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, shuffle=True)
#Using Laplacian kernel - https://scikit-learn.org/stable/modules/metrics.html#laplacian-kernel
K = np.array(laplacian_kernel(X_train, gamma=.5))
svm = SVC(kernel='precomputed').fit(K, np.ravel(y_train))
pred_y = svm.predict(K)
#Print accuracy score - here is where the error is happening.
print(accuracy_score(y_test, pred_y))
When I run this code, I get the error shown below:
Traceback (most recent call last):
File "/Users/user/Desktop/Research/Src/Laplace.py", line 36, in <module>
print(accuracy_score(y_test, pred_y))
File "/Users/user/miniforge3/envs/user_venv/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "/Users/user/miniforge3/envs/user/lib/python3.8/site-packages/sklearn/metrics/_classification.py", line 202, in accuracy_score
y_type, y_true, y_pred = _check_targets(y_true, y_pred)
File "/Users/user/miniforge3/envs/user/lib/python3.8/site-packages/sklearn/metrics/_classification.py", line 83, in _check_targets
check_consistent_length(y_true, y_pred)
File "/Users/user/miniforge3/envs/user/lib/python3.8/site-packages/sklearn/utils/validation.py", line 262, in check_consistent_length
raise ValueError("Found input variables with inconsistent numbers of"
ValueError: Found input variables with inconsistent numbers of samples: [45, 105]
So how can I resolve this error?
You calculated pred_y from your training inputs, which have 105 samples, while y_test has 45 samples, so the two arrays cannot be compared. You need to add a step:
#user3046211's code
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics.pairwise import laplacian_kernel
#Load the iris data
iris_data = load_iris()
#Split the data and target
X = iris_data.data
y = iris_data.target
#Convert X and y to a numpy array
X = np.array(X)
y = np.array(y)
#Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, shuffle=True)
#Using Laplacian kernel - https://scikit-learn.org/stable/modules/metrics.html#laplacian-kernel
K = np.array(laplacian_kernel(X_train, gamma=.5))
svm = SVC(kernel='precomputed').fit(K, np.ravel(y_train))
pred_y = svm.predict(K)
#The next line is what raised the error: pred_y holds 105 training predictions
#while y_test has 45 labels, so it is commented out here.
#print(accuracy_score(y_test, pred_y))
# NEW CODE STARTS HERE
K_test = np.array(laplacian_kernel(X=X_test,Y=X_train, gamma=.5))
pred_y_test = svm.predict(K_test)
print(accuracy_score(y_test, pred_y_test))
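As a side note, with kernel='precomputed' the matrix passed to predict() must hold kernel values between the test samples and the training samples, i.e. it must have shape (n_test_samples, n_train_samples); that is why K_test is computed from both X_test and X_train.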
Trying to get a result out, but getting this error instead:
C:\Users\my_is\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:548: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
Traceback (most recent call last):
File "C:\Users\my_is\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\my_is\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 890, in fit
super().fit(
File "C:\Users\my_is\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 181, in fit
check_classification_targets(y)
File "C:\Users\my_is\anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 172, in check_classification_targets
raise ValueError("Unknown label type: %r" % y_type)
ValueError: Unknown label type: 'continuous'
Here is my code:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.tree import DecisionTreeClassifier
data = load_boston()
c = np.array([1 if y > np.median(data['target']) else 0 for y in data['target']])
X_train, X_test, c_train, c_test = train_test_split(data['data'], c, random_state=0)
tree = DecisionTreeClassifier()
tree.fit(X_train, c_train)
#print(data.target)
#logReg = LogisticRegression()
#logReg.fit(X_train, c_train)
#result = cross_validate(logReg, data.data, data.target, cv=5, return_train_score=True)
result = cross_validate(tree, data.data, data.target, cv=5, return_train_score=True)
display(result)
I am completely new to Python and ML; any help is appreciated.
You have a mistake here:
result = cross_validate(tree, data.data, data.target, cv=5, return_train_score=True)
data.target contains the continuous Boston house prices, but DecisionTreeClassifier needs the binarized labels c you created above. It should be:
result = cross_validate(tree, data.data, c, cv=5, return_train_score=True)
I have a problem when trying to use logistic regression to compute the predicted values t_pred in the penultimate line. The error is:
"Traceback (most recent call last):
line 15, in
t_pred = logreg(X_test)
TypeError: 'LogisticRegression' object is not callable"
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
df=pd.read_csv('datos.csv')
X=df1 = df.iloc[:,1:5]
t=df.iloc[:,0]
X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=0.2, random_state=0)
logreg=LogisticRegression(solver='lbfgs')
predicted = cross_val_predict(logreg, X_train, t_train, cv=10)
print(accuracy_score(t_train, predicted))
logreg.fit(X_train, t_train)
t_pred = logreg(X_test)
print(accuracy_score(t_test, t_pred))
This is due to:
t_pred = logreg(X_test)
You need to call a method of the logreg object rather than calling the object itself as if it were a function.
Notice how you used logreg.fit(); fit() is the method that trains the model on the training data. Similarly, you need to call predict() to get predictions on new data.
Try this:
t_pred = logreg.predict(X_test)