Problem using make_column_transformer in Sklearn - python

This is my code/model that I'm trying to implement:
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import make_column_transformer
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

kf = KFold(n_splits=10, shuffle=True, random_state=2652124)
transf = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=1500, min_df=5, max_df=0.7, stop_words=stop)
scaler = MinMaxScaler(feature_range=(0, 1))
metadados = ['F13', 'F14', 'F19', 'F21', 'F22']
cls = RandomForestClassifier(n_estimators=1000, random_state=0)
features = make_column_transformer(
    (transf, 'textimage'), (transf, 'subtitle'),
    (scaler, metadados), (scaler, 'F3'), remainder='drop')
X = features.fit_transform(data)
y = data['classification']
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    cls.fit(X_train, y_train)
    y_score = cls.predict_proba(X_test)
    roc = roc_auc_score(y_test, y_score[:, 1])
    pred = cls.predict(X_test)
    acs = accuracy_score(y_test, pred)
    clr = classification_report(y_test, pred)
The error:
TypeError Traceback (most recent call last)
TypeError: only size-1 arrays can be converted to Python scalars
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-13-6bdcb91ff478> in <module>
14
15 kfnum = 1
---> 16 X = features.fit_transform(data)
17 y = data['classe']
18 catr = 'timagem + metadados + legenda'
~/.local/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
529 self._validate_remainder(X)
530
--> 531 result = self._fit_transform(X, y, _fit_transform_one)
532
533 if not result:
~/.local/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
456 self._iter(fitted=fitted, replace_strings=True))
457 try:
--> 458 return Parallel(n_jobs=self.n_jobs)(
459 delayed(func)(
460 transformer=clone(trans) if not fitted else trans,
~/.local/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
1049 self._iterating = self._original_iterator is not None
1050
-> 1051 while self.dispatch_one_batch(iterator):
1052 pass
1053
~/.local/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
864 return False
865 else:
--> 866 self._dispatch(tasks)
867 return True
868
~/.local/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
782 with self._lock:
783 job_idx = len(self._jobs)
--> 784 job = self._backend.apply_async(batch, callback=cb)
785 # A job can complete so quickly than its callback is
786 # called before we get here, causing self._jobs to
~/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~/.local/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/.local/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/.local/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
~/.local/lib/python3.8/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
688 if y is None:
689 # fit method of arity 1 (unsupervised transformation)
--> 690 return self.fit(X, **fit_params).transform(X)
691 else:
692 # fit method of arity 2 (supervised transformation)
~/.local/lib/python3.8/site-packages/sklearn/preprocessing/_data.py in fit(self, X, y)
334 # Reset internal state before fitting
335 self._reset()
--> 336 return self.partial_fit(X, y)
337
338 def partial_fit(self, X, y=None):
~/.local/lib/python3.8/site-packages/sklearn/preprocessing/_data.py in partial_fit(self, X, y)
367
368 first_pass = not hasattr(self, 'n_samples_seen_')
--> 369 X = self._validate_data(X, reset=first_pass,
370 estimator=self, dtype=FLOAT_DTYPES,
371 force_all_finite="allow-nan")
~/.local/lib/python3.8/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
418 f"requires y to be passed, but the target y is None."
419 )
--> 420 X = check_array(X, **check_params)
421 out = X
422 else:
~/.local/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~/.local/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
596 array = array.astype(dtype, casting="unsafe", copy=False)
597 else:
--> 598 array = np.asarray(array, order=order, dtype=dtype)
599 except ComplexWarning:
600 raise ValueError("Complex data not supported\n"
~/.local/lib/python3.8/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
~/.local/lib/python3.8/site-packages/pandas/core/series.py in __array__(self, dtype)
795 dtype='datetime64[ns]')
796 """
--> 797 return np.asarray(self.array, dtype)
798
799 # ----------------------------------------------------------------------
~/.local/lib/python3.8/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
~/.local/lib/python3.8/site-packages/pandas/core/arrays/numpy_.py in __array__(self, dtype)
209
210 def __array__(self, dtype=None) -> np.ndarray:
--> 211 return np.asarray(self._ndarray, dtype=dtype)
212
213 _HANDLED_TYPES = (np.ndarray, numbers.Number)
~/.local/lib/python3.8/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
ValueError: setting an array element with a sequence.
I have no problems using only:
features = make_column_transformer(
    (transf, 'textimage'), (transf, 'subtitle'),
    (scaler, metadados), remainder='drop')
So the problem is the column 'F3' in my dataframe, which contains an array in each row:
0 [0.0026778684, 0.003117677, 0.00040434036, 0.0...
1 [0.061992627, 0.047432333, 0.012270351, 0.0102...
2 [0.0, 0.0, 0.0, 4.3830705e-06, 1.3149212e-05, ...
3 [0.30314153, 0.04477268, 0.01840577, 0.0319251...
4 [0.2563626, 0.03259786, 0.018686974, 0.0198365...
...
1287 [0.11471527, 0.032394826, 0.012400794, 0.01131...
1288 [0.002138354, 0.001044489, 0.0007786191, 0.001...
1289 [0.056204572, 0.026556363, 0.02082041, 0.01966...
1290 [0.051759016, 0.0058623934, 0.0054726205, 0.00...
1291 [0.0, 5.4140626e-05, 4.4114586e-05, 4.8125003e...
Name: F3, Length: 1292, dtype: object
Can anyone help me with this? How can I feed a column of arrays through a pipeline, or how can I concatenate the transformed features with these arrays? Any suggestions?
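One possible workaround (a sketch, not from the original post, reusing the transf, scaler, and metadados objects defined above): MinMaxScaler cannot consume a column whose cells are arrays (dtype=object), but the per-row arrays can be stacked into an ordinary 2-D float matrix with a FunctionTransformer before scaling, assuming every row of 'F3' has the same length:

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

# Stack the per-row arrays of 'F3' into one (n_samples, n_values) float matrix
# so that MinMaxScaler receives plain numbers instead of dtype=object cells.
stack_arrays = FunctionTransformer(lambda col: np.vstack(col.to_numpy()))

features = make_column_transformer(
    (transf, 'textimage'), (transf, 'subtitle'),
    (scaler, metadados),
    (make_pipeline(stack_arrays, scaler), 'F3'),
    remainder='drop')

ColumnTransformer clones each transformer before fitting, so reusing the same scaler object in two slots is safe.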

Related

ValueError: Input contains NaN, infinity or a value too large for dtype('float64') While fitting the model After Imputation

I pass the predictors through an imputation pipeline and check the columns for NaN and inf values with
col_name = X.columns.to_series()[np.isinf(X).any()]
There are no columns with missing or inf values. The classifier is an MLPClassifier, and when I call clf.fit(X, y) I get the error ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
What am I missing here? What might be the other possible explanations for this error? What else should I be checking for?
The full traceback:
Input In [137], in <cell line: 4>()
1 #l = list(X.isin([np.inf, -np.inf]))
2 #col_name = X.columns.to_series()[np.isnan(X).any()]
3 #col_name
----> 4 clf.fit(X, y)
File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:394, in Pipeline.fit(self, X, y, **fit_params)
392 if self._final_estimator != "passthrough":
393 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 394 self._final_estimator.fit(Xt, y, **fit_params_last_step)
396 return self
File ~\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:752, in BaseMultilayerPerceptron.fit(self, X, y)
735 def fit(self, X, y):
736 """Fit the model to data matrix X and target(s) y.
737
738 Parameters
(...)
750 Returns a trained MLP model.
751 """
--> 752 return self._fit(X, y, incremental=False)
File ~\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:393, in BaseMultilayerPerceptron._fit(self, X, y, incremental)
386 raise ValueError(
387 "hidden_layer_sizes must be > 0, got %s." % hidden_layer_sizes
388 )
389 first_pass = not hasattr(self, "coefs_") or (
390 not self.warm_start and not incremental
391 )
--> 393 X, y = self._validate_input(X, y, incremental, reset=first_pass)
395 n_samples, n_features = X.shape
397 # Ensure y is 2D
File ~\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:1100, in MLPClassifier._validate_input(self, X, y, incremental, reset)
1099 def _validate_input(self, X, y, incremental, reset):
-> 1100 X, y = self._validate_data(
1101 X,
1102 y,
1103 accept_sparse=["csr", "csc"],
1104 multi_output=True,
1105 dtype=(np.float64, np.float32),
1106 reset=reset,
1107 )
1108 if y.ndim == 2 and y.shape[1] == 1:
1109 y = column_or_1d(y, warn=True)
File ~\anaconda3\lib\site-packages\sklearn\base.py:581, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
579 y = check_array(y, **check_y_params)
580 else:
--> 581 X, y = check_X_y(X, y, **check_params)
582 out = X, y
584 if not no_val_X and check_params.get("ensure_2d", True):
File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:979, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
962 raise ValueError("y cannot be None")
964 X = check_array(
965 X,
966 accept_sparse=accept_sparse,
(...)
976 estimator=estimator,
977 )
--> 979 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric)
981 check_consistent_length(X, y)
983 return X, y
File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:989, in _check_y(y, multi_output, y_numeric)
987 """Isolated part of check_X_y dedicated to y validation"""
988 if multi_output:
--> 989 y = check_array(
990 y, accept_sparse="csr", force_all_finite=True, ensure_2d=False, dtype=None
991 )
992 else:
993 y = column_or_1d(y, warn=True)
File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:800, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
794 raise ValueError(
795 "Found array with dim %d. %s expected <= 2."
796 % (array.ndim, estimator_name)
797 )
799 if force_all_finite:
--> 800 _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan")
802 if ensure_min_samples > 0:
803 n_samples = _num_samples(array)
File ~\anaconda3\lib\site-packages\sklearn\utils\validation.py:114, in _assert_all_finite(X, allow_nan, msg_dtype)
107 if (
108 allow_nan
109 and np.isinf(X).any()
110 or not allow_nan
111 and not np.isfinite(X).all()
112 ):
113 type_err = "infinity" if allow_nan else "NaN, infinity"
--> 114 raise ValueError(
115 msg_err.format(
116 type_err, msg_dtype if msg_dtype is not None else X.dtype
117 )
118 )
119 # for object dtype data, we only check for NaNs (GH-13254)
120 elif X.dtype == np.dtype("object") and not allow_nan:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
You may want to check for NaN as well:
col_name = X.columns.to_series()[np.isinf(X).any() | np.isnan(X).any()]
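Also, note that the traceback fails inside _check_y, i.e. during validation of the target, so the non-finite values may be hiding in y rather than in X. A quick check (a sketch, assuming y is numeric):

import numpy as np

# Cast to float so the checks also work on a pandas Series or object array.
y_arr = np.asarray(y, dtype=float)
print("NaN in y:", np.isnan(y_arr).any(), "inf in y:", np.isinf(y_arr).any())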

AxisError: axis 1 is out of bounds for array of dimension 1 using sklearn

I have trouble using OneVsRestClassifier and cross-validation from sklearn.
train, test = train_test_split(tickets, random_state=42, test_size=0.30, shuffle=True)
X_train = train[['TK_Poids_brut', 'TK_Poids_tare']]
y_train = train['TK_Qualite']
X_test = test[['TK_Poids_brut', 'TK_Poids_tare']]
y_test = test['TK_Qualite']
le = preprocessing.LabelEncoder()
y_train_tra = le.fit_transform(y_train)
printDataInfo(X_train,y_train_tra)
#The printDataInfo function is there just to display information about X and y
clf_OvR_SVC = OneVsRestClassifier(LinearSVC(random_state=0))
cross_v = cross_validate(clf_OvR_SVC, X_train, y_train_tra, error_score="raise",
                         scoring=dict(ac=make_scorer(accuracy_score),
                                      roc=make_scorer(roc_auc_score, multi_class="ovr")),
                         cv=5)
cross_v
When I do this I get the following error:
---------------------------------------------------------------------------
AxisError Traceback (most recent call last)
C:\TEMP/ipykernel_20332/2926737612.py in <module>
23
24 clf_OvR_SVC = OneVsRestClassifier(LinearSVC(random_state=0))
---> 25 cross_v = cross_validate(clf_OvR_SVC, X_train, y_train_tra ,error_score="raise",scoring=dict(ac=make_scorer(accuracy_score), roc=make_scorer(roc_auc_score, multi_class="ovr")), cv=5)
26 cross_v
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
248 parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
249 pre_dispatch=pre_dispatch)
--> 250 results = parallel(
251 delayed(_fit_and_score)(
252 clone(estimator), X, y, scorers, train, test, verbose, None,
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1041 # remaining jobs.
1042 self._iterating = False
-> 1043 if self.dispatch_one_batch(iterator):
1044 self._iterating = self._original_iterator is not None
1045
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
859 return False
860 else:
--> 861 self._dispatch(tasks)
862 return True
863
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
777 with self._lock:
778 job_idx = len(self._jobs)
--> 779 job = self._backend.apply_async(batch, callback=cb)
780 # A job can complete so quickly than its callback is
781 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\Anaconda3\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
623
624 fit_time = time.time() - start_time
--> 625 test_scores = _score(estimator, X_test, y_test, scorer, error_score)
626 score_time = time.time() - start_time - fit_time
627 if return_train_score:
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, error_score)
685 scores = scorer(estimator, X_test)
686 else:
--> 687 scores = scorer(estimator, X_test, y_test)
688 except Exception:
689 if error_score == 'raise':
~\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
85 for name, scorer in self._scorers.items():
86 if isinstance(scorer, _BaseScorer):
---> 87 score = scorer._score(cached_call, estimator,
88 *args, **kwargs)
89 else:
~\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _score(self, method_caller, estimator, X, y_true, sample_weight)
240 **self._kwargs)
241 else:
--> 242 return self._sign * self._score_func(y_true, y_pred,
243 **self._kwargs)
244
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py in roc_auc_score(y_true, y_score, average, sample_weight, max_fpr, multi_class, labels)
535 if multi_class == 'raise':
536 raise ValueError("multi_class must be in ('ovo', 'ovr')")
--> 537 return _multiclass_roc_auc_score(y_true, y_score, labels,
538 multi_class, average, sample_weight)
539 elif y_type == "binary":
~\Anaconda3\lib\site-packages\sklearn\metrics\_ranking.py in _multiclass_roc_auc_score(y_true, y_score, labels, multi_class, average, sample_weight)
593 """
594 # validation of the input y_score
--> 595 if not np.allclose(1, y_score.sum(axis=1)):
596 raise ValueError(
597 "Target scores need to be probabilities for multiclass "
~\Anaconda3\lib\site-packages\numpy\core\_methods.py in _sum(a, axis, dtype, out, keepdims, initial, where)
45 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
46 initial=_NoValue, where=True):
---> 47 return umr_sum(a, axis, dtype, out, keepdims, initial, where)
48
49 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,
AxisError: axis 1 is out of bounds for array of dimension 1
Here is the input data format. I already tried putting both into numpy arrays, and I tried reshaping y to (6108, 1), but I always get the same error.
type :
x: <class 'pandas.core.frame.DataFrame'>
y: <class 'numpy.ndarray'>
shape :
X: (6108, 2)
y: (6108,)
data :
x: TK_Poids_brut TK_Poids_tare
8436 14420 14160
7014 17160 12320
3931 28060 15040
6749 16680 14360
2984 10060 9100
... ... ...
5734 19700 15420
5191 25380 14620
5390 19460 14760
860 16160 14100
7270 15520 14500
[6108 rows x 2 columns]
y: [132 85 160 118 118 40 88 126 12 40 41 138 5 125 125 147 111 118
153 40 118 126 118 125 123 62 177 45 118 105 3 1 105 142 116 100
118 125 118 78 124 3 126 53 138 118 40 118 53 124 126 98 118 155
118 131 5 135 130 3 118 105 118 126 105 87 118 118 24 124 130 130
...
118 124 118 180 118 58 124 126 153 126 124 118 125 153 86 94 126 118
130 105 42 62 124 78]
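The AxisError comes from the roc scorer: make_scorer by default passes the output of predict() to roc_auc_score, which here is a 1-D array of labels, while multi_class="ovr" expects a 2-D array of class probabilities (hence the failing y_score.sum(axis=1) in the traceback). One possible fix, sketched here and untested against this data: request probabilities in the scorer and wrap LinearSVC (which has no predict_proba) in CalibratedClassifierCV:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# Calibration gives LinearSVC a predict_proba method.
clf_OvR_SVC = OneVsRestClassifier(CalibratedClassifierCV(LinearSVC(random_state=0)))
scoring = dict(
    ac=make_scorer(accuracy_score),
    # needs_proba=True makes the scorer pass predict_proba output (2-D)
    # to roc_auc_score instead of predicted labels (1-D).
    roc=make_scorer(roc_auc_score, needs_proba=True, multi_class="ovr"),
)
cross_v = cross_validate(clf_OvR_SVC, X_train, y_train_tra,
                         error_score="raise", scoring=scoring, cv=5)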

Seaborn the hue attribute causing error in plots

The hue feature is not working when I am using pairplot.
Here is my data frame:
Here is the code that doesn't work:
sns.pairplot(activities, hue="Day")
If I remove the hue option it works. It also works if I change hue to a numerical column (such as Distance), but it fails with the Day column for some reason. Here's the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_121/1783531066.py in <module>
----> 1 sns.pairplot(activities, hue="Day")
/opt/conda/lib/python3.7/site-packages/seaborn/_decorators.py in inner_f(*args, **kwargs)
44 )
45 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 46 return f(**kwargs)
47 return inner_f
48
/opt/conda/lib/python3.7/site-packages/seaborn/axisgrid.py in pairplot(data, hue, hue_order, palette, vars, x_vars, y_vars, kind, diag_kind, markers, height, aspect, corner, dropna, plot_kws, diag_kws, grid_kws, size)
2020 elif diag_kind == "kde":
2021 diag_kws.setdefault("fill", True)
-> 2022 grid.map_diag(kdeplot, **diag_kws)
2023
2024 # Maybe plot on the off-diagonals
/opt/conda/lib/python3.7/site-packages/seaborn/axisgrid.py in map_diag(self, func, **kwargs)
1400 plot_kwargs.setdefault("hue_order", self._hue_order)
1401 plot_kwargs.setdefault("palette", self._orig_palette)
-> 1402 func(x=vector, **plot_kwargs)
1403 self._clean_axis(ax)
1404
/opt/conda/lib/python3.7/site-packages/seaborn/_decorators.py in inner_f(*args, **kwargs)
44 )
45 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 46 return f(**kwargs)
47 return inner_f
48
/opt/conda/lib/python3.7/site-packages/seaborn/distributions.py in kdeplot(x, y, shade, vertical, kernel, bw, gridsize, cut, clip, legend, cumulative, shade_lowest, cbar, cbar_ax, cbar_kws, ax, weights, hue, palette, hue_order, hue_norm, multiple, common_norm, common_grid, levels, thresh, bw_method, bw_adjust, log_scale, color, fill, data, data2, **kwargs)
1733 legend=legend,
1734 estimate_kws=estimate_kws,
-> 1735 **plot_kws,
1736 )
1737
/opt/conda/lib/python3.7/site-packages/seaborn/distributions.py in plot_univariate_density(self, multiple, common_norm, common_grid, fill, legend, estimate_kws, **plot_kws)
914 common_grid,
915 estimate_kws,
--> 916 log_scale,
917 )
918
/opt/conda/lib/python3.7/site-packages/seaborn/distributions.py in _compute_univariate_density(self, data_variable, common_norm, common_grid, estimate_kws, log_scale)
314
315 # Estimate the density of observations at this level
--> 316 density, support = estimator(observations, weights=weights)
317
318 if log_scale:
/opt/conda/lib/python3.7/site-packages/seaborn/_statistics.py in __call__(self, x1, x2, weights)
185 """Fit and evaluate on univariate or bivariate data."""
186 if x2 is None:
--> 187 return self._eval_univariate(x1, weights)
188 else:
189 return self._eval_bivariate(x1, x2, weights)
/opt/conda/lib/python3.7/site-packages/seaborn/_statistics.py in _eval_univariate(self, x, weights)
144 support = self.support
145 if support is None:
--> 146 support = self.define_support(x, cache=False)
147
148 kde = self._fit(x, weights)
/opt/conda/lib/python3.7/site-packages/seaborn/_statistics.py in define_support(self, x1, x2, weights, cache)
117 """Create the evaluation grid for a given data set."""
118 if x2 is None:
--> 119 support = self._define_support_univariate(x1, weights)
120 else:
121 support = self._define_support_bivariate(x1, x2, weights)
/opt/conda/lib/python3.7/site-packages/seaborn/_statistics.py in _define_support_univariate(self, x, weights)
89 def _define_support_univariate(self, x, weights):
90 """Create a 1D grid of evaluation points."""
---> 91 kde = self._fit(x, weights)
92 bw = np.sqrt(kde.covariance.squeeze())
93 grid = self._define_support_grid(
/opt/conda/lib/python3.7/site-packages/seaborn/_statistics.py in _fit(self, fit_data, weights)
135 fit_kws["weights"] = weights
136
--> 137 kde = stats.gaussian_kde(fit_data, **fit_kws)
138 kde.set_bandwidth(kde.factor * self.bw_adjust)
139
/opt/conda/lib/python3.7/site-packages/scipy/stats/kde.py in __init__(self, dataset, bw_method, weights)
204 self._neff = 1/sum(self._weights**2)
205
--> 206 self.set_bandwidth(bw_method=bw_method)
207
208 def evaluate(self, points):
/opt/conda/lib/python3.7/site-packages/scipy/stats/kde.py in set_bandwidth(self, bw_method)
552 raise ValueError(msg)
553
--> 554 self._compute_covariance()
555
556 def _compute_covariance(self):
/opt/conda/lib/python3.7/site-packages/scipy/stats/kde.py in _compute_covariance(self)
564 bias=False,
565 aweights=self.weights))
--> 566 self._data_inv_cov = linalg.inv(self._data_covariance)
567
568 self.covariance = self._data_covariance * self.factor**2
/opt/conda/lib/python3.7/site-packages/scipy/linalg/basic.py in inv(a, overwrite_a, check_finite)
937
938 """
--> 939 a1 = _asarray_validated(a, check_finite=check_finite)
940 if len(a1.shape) != 2 or a1.shape[0] != a1.shape[1]:
941 raise ValueError('expected square matrix')
/opt/conda/lib/python3.7/site-packages/scipy/_lib/_util.py in _asarray_validated(a, check_finite, sparse_ok, objects_ok, mask_ok, as_inexact)
294 if not objects_ok:
295 if a.dtype is np.dtype('O'):
--> 296 raise ValueError('object arrays are not supported')
297 if as_inexact:
298 if not np.issubdtype(a.dtype, np.inexact):
ValueError: object arrays are not supported
Any ideas why hue isn't working?
You can see the error:
ValueError: object arrays are not supported
This means the data reaching the diagonal KDE plots must be numerical: with hue set, seaborn computes a density estimate per hue level, and that fails on object-dtype columns.
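A quick way to diagnose this (a sketch, assuming the dataframe is named activities as above): inspect the dtypes and coerce any numeric-looking object columns before plotting; the hue column itself can stay categorical:

import pandas as pd
import seaborn as sns

print(activities.dtypes)  # look for unexpected 'object' columns

# Coerce every column except the hue column to numeric dtype.
for col in activities.columns.drop("Day"):
    activities[col] = pd.to_numeric(activities[col], errors="coerce")

sns.pairplot(activities, hue="Day")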

Python "sklearn" ValueError

Hope you are having a great day! I know the problem is probably silly and most of you can figure it out, but I do need help. Here's the problem:
The code goes something like this:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
data = pd.read_csv("Data.csv")
X = data.drop(columns = "Answers")
Y = data["Answers"]
algorithm = DecisionTreeClassifier()
algorithm.fit(X, Y)
And I can't figure out why it's outputting this:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-46-0deefc30b34f> in <module>
9 algorithm = DecisionTreeClassifier()
10
---> 11 algorithm.fit(X, Y)
~\anaconda3\lib\site-packages\sklearn\tree\_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
888 """
889
--> 890 super().fit(
891 X, y,
892 sample_weight=sample_weight,
~\anaconda3\lib\site-packages\sklearn\tree\_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
154 check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
155 check_y_params = dict(ensure_2d=False, dtype=None)
--> 156 X, y = self._validate_data(X, y,
157 validate_separately=(check_X_params,
158 check_y_params))
~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
427 # :(
428 check_X_params, check_y_params = validate_separately
--> 429 X = check_array(X, **check_X_params)
430 y = check_array(y, **check_y_params)
431 else:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
596 array = array.astype(dtype, casting="unsafe", copy=False)
597 else:
--> 598 array = np.asarray(array, order=order, dtype=dtype)
599 except ComplexWarning:
600 raise ValueError("Complex data not supported\n"
~\anaconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
~\anaconda3\lib\site-packages\pandas\core\generic.py in __array__(self, dtype)
1779
1780 def __array__(self, dtype=None) -> np.ndarray:
-> 1781 return np.asarray(self._values, dtype=dtype)
1782
1783 def __array_wrap__(self, result, context=None):
~\anaconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
ValueError: could not convert string to float: "Wat's your name?"
Thank you for viewing this problem and perhaps solving it. Have a great day!
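The traceback shows that a raw string ("Wat's your name?") reached the tree, and scikit-learn estimators only accept numeric features. One possible fix (a sketch; the column name "Questions" is a guess at what the text column is called) is to vectorize the text before fitting:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

data = pd.read_csv("Data.csv")

# Turn the text column into a numeric bag-of-words matrix.
vec = CountVectorizer()
X = vec.fit_transform(data["Questions"])  # hypothetical text column name
Y = data["Answers"]

algorithm = DecisionTreeClassifier()
algorithm.fit(X, Y)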

Passing a 2-d array as an element to sklearn.SVM

I have a single independent variable X where each element is a 2-d array with shape (20, 431). The variable X itself is a 1-d numpy object array of shape (200,). How do I pass this to sklearn's SVC?
Edit: actual dataframe:
Category Features
0 1 [[-177.08171, -219.89174, -253.55954, -218.560...
1 0 [[-291.89288, -316.40735, -389.8398, -413.6302...
2 1 [[-355.88293, -351.0909, -364.43524, -400.7097.
Each element of Features is a 20×431 numpy array. I need to use these features to classify the category.
x = data.iloc[:, 1].values
y = data.iloc[:, 0].values
x.shape
(200, )
x[0].shape
(20, 431)
y.shape
(200, )
Fitting the model after splitting into train and test data:
classifier = SVC(kernel = 'rbf', random_state=0)
classifier.fit(x_train, y_train)
Error:
TypeError Traceback (most recent call last)
TypeError: only size-1 arrays can be converted to Python scalars
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-203-fdf0fc8db087> in <module>
----> 1 classifier.fit(x_train, y_train)
~\Anaconda3\envs\LangDetEnv1.0\lib\site-packages\sklearn\svm\_base.py in fit(self, X, y, sample_weight)
160 X, y = self._validate_data(X, y, dtype=np.float64,
161 order='C', accept_sparse='csr',
--> 162 accept_large_sparse=False)
163
164 y = self._validate_targets(y)
~\Anaconda3\envs\LangDetEnv1.0\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
430 y = check_array(y, **check_y_params)
431 else:
--> 432 X, y = check_X_y(X, y, **check_params)
433 out = X, y
434
~\Anaconda3\envs\LangDetEnv1.0\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\Anaconda3\envs\LangDetEnv1.0\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
801 ensure_min_samples=ensure_min_samples,
802 ensure_min_features=ensure_min_features,
--> 803 estimator=estimator)
804 if multi_output:
805 y = check_array(y, accept_sparse='csr', force_all_finite=True,
~\Anaconda3\envs\LangDetEnv1.0\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\Anaconda3\envs\LangDetEnv1.0\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
597 array = array.astype(dtype, casting="unsafe", copy=False)
598 else:
--> 599 array = np.asarray(array, order=order, dtype=dtype)
600 except ComplexWarning:
601 raise ValueError("Complex data not supported\n"
~\Anaconda3\envs\LangDetEnv1.0\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
ValueError: setting an array element with a sequence.
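One possible fix (a sketch, not from the original post): SVC expects a 2-D numeric matrix of shape (n_samples, n_features), so the object array of 2-D elements can be stacked and each sample flattened, assuming every element really has shape (20, 431):

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Stack the 200 arrays of shape (20, 431) into (200, 20, 431),
# then flatten each sample into a 20*431 = 8620-dimensional vector.
X = np.stack(x).reshape(len(x), -1)

x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(x_train, y_train)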
