Logistic Regression gets ValueError: could not convert string to float: '?' - python

I am very new at this; the exercise is from a course I am taking.
I need to fit the Logistic Regression classifier.
I enter:
from sklearn.linear_model import LogisticRegression
C=1.0
classifier = LogisticRegression(C=C, penalty='l1')
classifier.fit(x, y)
and get a ValueError:
ValueError                                Traceback (most recent call last)
<ipython-input-33-9d4de811daf9> in <module>()
----> 1 classifier.fit(x, y)
~\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py in fit(self, X, y, sample_weight)
   1214
   1215         X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype,
-> 1216                          order="C")
   1217         check_classification_targets(y)
   1218         self.classes_ = np.unique(y)
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
571 X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
572 ensure_2d, allow_nd, ensure_min_samples,
--> 573 ensure_min_features, warn_on_dtype, estimator)
574 if multi_output:
575 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
431 force_all_finite)
432 else:
--> 433 array = np.array(array, dtype=dtype, order=order, copy=copy)
434
435 if ensure_2d:
ValueError: could not convert string to float: '?'
Please help

The training input x and output y must be numeric (np.float64). The '?' entries in your data are string placeholders for missing values, so you need to replace or encode them before fitting.

Related

ValueError: Input contains NaN, infinity or a value too large for dtype('float64') while fitting the model after imputation

I pass the predictors through an imputation pipeline,
then I check the columns for NaN and inf values with
col_name = X.columns.to_series()[np.isinf(X).any()]
There are no columns with missing values or inf values.
classifier = MLPClassifier()
When I do clf.fit(X, y) I get the error ValueError: Input contains NaN, infinity or a value too large for dtype('float64')
What am I missing here? Can anybody help or guide me on what to do?
What might be the other possible explanations for this error, and what else should I be checking for?
Full traceback:
Input In [137], in <cell line: 4>()
1 #l = list(X.isin([np.inf, -np.inf]))
2 #col_name = X.columns.to_series()[np.isnan(X).any()]
3 #col_name
----> 4 clf.fit(X, y)
File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:394, in Pipeline.fit(self, X, y, **fit_params)
392 if self._final_estimator != "passthrough":
393 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 394 self._final_estimator.fit(Xt, y, **fit_params_last_step)
396 return self
File ~\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:752, in BaseMultilayerPerceptron.fit(self, X, y)
735 def fit(self, X, y):
736 """Fit the model to data matrix X and target(s) y.
737
738 Parameters
(...)
750 Returns a trained MLP model.
751 """
--> 752 return self._fit(X, y, incremental=False)
File ~\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:393, in BaseMultilayerPerceptron._fit(self, X, y, incremental)
386 raise ValueError(
387 "hidden_layer_sizes must be > 0, got %s." % hidden_layer_sizes
388 )
389 first_pass = not hasattr(self, "coefs_") or (
390 not self.warm_start and not incremental
391 )
--> 393 X, y = self._validate_input(X, y, incremental, reset=first_pass)
395 n_samples, n_features = X.shape
397 # Ensure y is 2D
File ~\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:1100, in MLPClassifier._validate_input(self, X, y, incremental, reset)
1099 def _validate_input(self, X, y, incremental, reset):
-> 1100 X, y = self._validate_data(
1101 X,
1102 y,
1103 accept_sparse=["csr", "csc"],
1104 multi_output=True,
1105 dtype=(np.float64, np.float32),
1106 reset=reset,
1107 )
1108 if y.ndim == 2 and y.shape[1] == 1:
1109 y = column_or_1d(y, warn=True)
File ~\anaconda3\lib\site-packages\sklearn\base.py:581, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
579 y = check_array(y, **check_y_params)
580 else:
--> 581 X, y = check_X_y(X, y, **check_params)
582 out = X, y
584 if not no_val_X and check_params.get("ensure_2d", True):
File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:979, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
962 raise ValueError("y cannot be None")
964 X = check_array(
965 X,
966 accept_sparse=accept_sparse,
(...)
976 estimator=estimator,
977 )
--> 979 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric)
981 check_consistent_length(X, y)
983 return X, y
File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:989, in _check_y(y, multi_output, y_numeric)
987 """Isolated part of check_X_y dedicated to y validation"""
988 if multi_output:
--> 989 y = check_array(
990 y, accept_sparse="csr", force_all_finite=True, ensure_2d=False, dtype=None
991 )
992 else:
993 y = column_or_1d(y, warn=True)
File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:800, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
794 raise ValueError(
795 "Found array with dim %d. %s expected <= 2."
796 % (array.ndim, estimator_name)
797 )
799 if force_all_finite:
--> 800 _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan")
802 if ensure_min_samples > 0:
803 n_samples = _num_samples(array)
File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:114, in _assert_all_finite(X, allow_nan, msg_dtype)
107 if (
108 allow_nan
109 and np.isinf(X).any()
110 or not allow_nan
111 and not np.isfinite(X).all()
112 ):
113 type_err = "infinity" if allow_nan else "NaN, infinity"
--> 114 raise ValueError(
115 msg_err.format(
116 type_err, msg_dtype if msg_dtype is not None else X.dtype
117 )
118 )
119 # for object dtype data, we only check for NaNs (GH-13254)
120 elif X.dtype == np.dtype("object") and not allow_nan:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
You may want to check for NaN as well:
col_name = X.columns.to_series()[np.isinf(X).any() | np.isnan(X).any()]
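Also note that in your traceback the check fails inside _check_y, i.e. while validating y, so the offending NaN may be in the target rather than in X. A quick check of both (a sketch, assuming X is a DataFrame and y a Series):
import numpy as np

# Feature columns containing inf or NaN:
print(X.columns[np.isinf(X).any() | np.isnan(X).any()])

# NaNs in the target, which is what the traceback actually trips over:
print(y.isna().sum())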

ValueError on KNN in Python

Everything in my code works well until I try to use the KNN algorithm to predict the quality of wine from its attributes. This is my first time trying this code for KNN.
This part is giving me errors:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=0)
classifier.fit(wine_train[X], y_train)
Error location:
1 from sklearn.neighbors import KNeighborsClassifier
2 classifier = KNeighborsClassifier(n_neighbors=0)
----> 3 classifier.fit(wine_train[X], y_train)
File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/utils/validation.py:979, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
962 raise ValueError("y cannot be None")
964 X = check_array(
965 X,
966 accept_sparse=accept_sparse,
(...)
976 estimator=estimator,
977 )
--> 979 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric)
981 check_consistent_length(X, y)
983 return X, y
File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/utils/validation.py:989, in _check_y(y, multi_output, y_numeric)
987 """Isolated part of check_X_y dedicated to y validation"""
988 if multi_output:
--> 989 y = check_array(
990 y, accept_sparse="csr", force_all_finite=True, ensure_2d=False, dtype=None
991 )
992 else:
993 y = column_or_1d(y, warn=True)
File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/utils/validation.py:800, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
794 raise ValueError(
795 "Found array with dim %d. %s expected <= 2."
796 % (array.ndim, estimator_name)
797 )
799 if force_all_finite:
--> 800 _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan")
802 if ensure_min_samples > 0:
803 n_samples = _num_samples(array)
File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/utils/validation.py:122, in _assert_all_finite(X, allow_nan, msg_dtype)
120 elif X.dtype == np.dtype("object") and not allow_nan:
121 if _object_dtype_isnan(X).any():
--> 122 raise ValueError("Input contains NaN")
ValueError: Input contains NaN
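The traceback shows the failure inside _check_y on an object-dtype array, i.e. while validating y_train, so the likely culprit is missing values in the target column. A diagnostic sketch, reusing the names from the question:
import numpy as np

# How many targets are missing? (Assumes y_train is a pandas Series.)
print(y_train.isna().sum())

# Drop rows whose target is NaN from both the features and the target.
mask = y_train.notna()
classifier.fit(wine_train[X][mask], y_train[mask])
Separately, n_neighbors must be at least 1; n_neighbors=0 will be rejected once the NaN issue is resolved.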

Passing a 2-d array as an element to sklearn.SVM

I have a single independent variable X where each element is a 2-d array with shape (20, 431). The variable X itself is a 1-d numpy array of object dtype with shape (200,). How do I pass this to an sklearn SVM object?
Edit: actual dataframe:
Category Features
0 1 [[-177.08171, -219.89174, -253.55954, -218.560...
1 0 [[-291.89288, -316.40735, -389.8398, -413.6302...
2 1 [[-355.88293, -351.0909, -364.43524, -400.7097...
Each element of Features is a 20*431 numpy array. I need to use these features to classify the category.
x = data.iloc[:, 1].values
y = data.iloc[:, 0].values
x.shape
(200, )
x[0].shape
(20, 431)
y.shape
(200, )
Fitting the model after splitting into train and test data:
classifier = SVC(kernel = 'rbf', random_state=0)
classifier.fit(x_train, y_train)
Error:
TypeError Traceback (most recent call last)
TypeError: only size-1 arrays can be converted to Python scalars
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-203-fdf0fc8db087> in <module>
----> 1 classifier.fit(x_train, y_train)
~\Anaconda3\envs\LangDetEnv1.0\lib\site-packages\sklearn\svm\_base.py in fit(self, X, y, sample_weight)
160 X, y = self._validate_data(X, y, dtype=np.float64,
161 order='C', accept_sparse='csr',
--> 162 accept_large_sparse=False)
163
164 y = self._validate_targets(y)
~\Anaconda3\envs\LangDetEnv1.0\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
430 y = check_array(y, **check_y_params)
431 else:
--> 432 X, y = check_X_y(X, y, **check_params)
433 out = X, y
434
~\Anaconda3\envs\LangDetEnv1.0\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\Anaconda3\envs\LangDetEnv1.0\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
801 ensure_min_samples=ensure_min_samples,
802 ensure_min_features=ensure_min_features,
--> 803 estimator=estimator)
804 if multi_output:
805 y = check_array(y, accept_sparse='csr', force_all_finite=True,
~\Anaconda3\envs\LangDetEnv1.0\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\Anaconda3\envs\LangDetEnv1.0\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
597 array = array.astype(dtype, casting="unsafe", copy=False)
598 else:
--> 599 array = np.asarray(array, order=order, dtype=dtype)
600 except ComplexWarning:
601 raise ValueError("Complex data not supported\n"
~\Anaconda3\envs\LangDetEnv1.0\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
ValueError: setting an array element with a sequence.
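The error means numpy cannot pack the nested (20, 431) arrays into a single float matrix: sklearn estimators expect X of shape (n_samples, n_features). One common workaround (a sketch, not the only option; names follow the question) is to stack the per-sample arrays and flatten each one into a feature vector:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Stack the 200 object-dtype elements into shape (200, 20, 431), then
# flatten each sample into a single (20 * 431,)-long feature vector.
X_flat = np.stack(x).reshape(len(x), -1)   # shape (200, 8620)

x_train, x_test, y_train, y_test = train_test_split(X_flat, y, random_state=0)
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(x_train, y_train)
Flattening discards the 2-d structure; if the (20, 431) layout matters (e.g. spectrograms), a model that consumes 2-d inputs directly may be a better fit.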

cosine_similarity between 2 pandas df columns to get cosine distance

I have a dataframe as shown below:
vector_a vector_b
[1,2,3] [2,5,6]
[0,2,1] [2,9,1]
[4,7,1] [1,7,4]
I would like to use sklearn's cosine_similarity between the columns vector_a and vector_b to get a new column called 'cosine_distance' in the same dataframe. Do note that vector_a and vector_b are pandas df columns of lists.
This is what I have attempted:
df['vector_a'] = df['vector_a'].apply(lambda x: np.asarray(x))
df['vector_b'] = df['vector_b'].apply(lambda x: np.asarray(x))
df['cosine_distance'] = cosine_similarity(df['vector_a'].apply(lambda x: np.transpose(x)),
df['vector_b'].apply(lambda x: np.transpose(x)))
And I got this error:
---> 58 df['cosine_distance'] = cosine_similarity(df['vector_a'].apply(lambda x: np.transpose(x)), df['vector_b'].apply(lambda x: np.transpose(x)))
~\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in cosine_similarity(X, Y, dense_output)
1025 # to avoid recursive import
1026
-> 1027 X, Y = check_pairwise_arrays(X, Y)
1028
1029 X_normalized = normalize(X, copy=True)
~\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in check_pairwise_arrays(X, Y, precomputed, dtype)
110 else:
111 X = check_array(X, accept_sparse='csr', dtype=dtype,
--> 112 estimator=estimator)
113 Y = check_array(Y, accept_sparse='csr', dtype=dtype,
114 estimator=estimator)
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
494 try:
495 warnings.simplefilter('error', ComplexWarning)
--> 496 array = np.asarray(array, dtype=dtype, order=order)
497 except ComplexWarning:
498 raise ValueError("Complex data not supported\n"
~\Anaconda3\lib\site-packages\numpy\core\numeric.py in asarray(a, dtype, order)
536
537 """
--> 538 return array(a, dtype, copy=False, order=order)
539
540
ValueError: setting an array element with a sequence.
Thank you in advance!
TLDR:
df['cosine_similarity'] = df.apply(
lambda row: cosine_similarity([row['vector_a']], [row['vector_b']])[0][0],
axis=1)
Explanation:
cosine_similarity expects a 2-D np.array or a list of lists; it doesn't know how to interpret a pd.Series of lists. However, even if we did convert it to a list of lists, the next problem arises:
cosine_similarity returns all-vs-all similarities. So we limit it to a pairwise comparison by artificially creating a second dimension (note the extra square brackets in [row['vector_a']], [row['vector_b']]) and then take the only element of the resulting 1x1 array (the [0][0] at the end of cosine_similarity(...)[0][0]).
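If the per-row apply is too slow on a large frame, a vectorized alternative (a sketch, assuming every list has the same length) computes the same row-wise cosine similarity with plain numpy:
import numpy as np

a = np.stack(df['vector_a'].to_numpy())
b = np.stack(df['vector_b'].to_numpy())
df['cosine_similarity'] = (a * b).sum(axis=1) / (
    np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))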

ValueError: could not convert string to float SMOTE fit_sample Python Oversampling

I have a credit risk analysis dataset which goes like this:
Loan_ID Age Income(LPA) Employed_yr Education Loan_status
1 18 2.4 1 12th 1
2 46 43 26 Post Grad 0
3 22 12 4 Grad 0
4 25 17 1 Grad 1
1 means default and 0 means non-default in loan_status.
The number of defaults is quite small, around 1,000, while there are about 25,000 non-defaults, so I want to do oversampling or synthetic sampling.
Up to here the code runs fine:
cred_loan = pd.read_csv("Credit_Risk_Analysis.csv")
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
y= cred_loan.loan_status
X = cred_loan.drop('loan_status', axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
random_state=27)
sm = SMOTE(random_state=27, ratio=1.0)
from sklearn.linear_model import LogisticRegression
After this I do the following and get an error:
[IN] X_train, y_train = sm.fit_sample(X_train, y_train)
[OUT] ValueError                              Traceback (most recent call last)
<ipython-input-39-0995f82b5705> in <module>
----> 1 X_train, y_train = sm.fit_sample(X_train, y_train)
      2
~\Anaconda3\lib\site-packages\imblearn\base.py in fit_resample(self, X, y)
     77
     78         check_classification_targets(y)
---> 79         X, y, binarize_y = self._check_X_y(X, y)
     80
     81         self.sampling_strategy_ = check_sampling_strategy(
~\Anaconda3\lib\site-packages\imblearn\base.py in _check_X_y(X, y)
    135     def _check_X_y(X, y):
    136         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
--> 137         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
    138         return X, y, binarize_y
    139
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    754             warnings.warn("A column-vector y was passed when a 1d array was"
    755                           " expected. Please change the shape of y to "
--> 756                           "(n_samples, ), for example using ravel().",
    757                           DataConversionWarning, stacklevel=2)
    758         return np.ravel(y)
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    565     if copy and np.may_share_memory(array, array_orig):
    566         array = np.array(array, dtype=dtype, order=order)
--> 567
    568     if (warn_on_dtype and dtypes_orig is not None and
    569             {array.dtype} != set(dtypes_orig)):
ValueError: could not convert string to float: 'MORTGAGE'
Can anyone help please?
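The traceback is telling you that SMOTE (via check_X_y) needs an all-numeric X, and your predictors still contain a string category such as 'MORTGAGE' (presumably from a column not shown in the sample; Education is a string column too). One way out (a sketch; the column handling is illustrative) is to one-hot encode the string columns before resampling; alternatively, imblearn's SMOTENC is designed for mixed numeric/categorical data:
import pandas as pd
from sklearn.model_selection import train_test_split

# One-hot encode every remaining string column before resampling.
X_encoded = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.25, random_state=27)

# fit_sample is an older alias of fit_resample in imblearn.
X_train, y_train = sm.fit_resample(X_train, y_train)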
