ValueError: could not convert string to float -- problem with labels - python

I'm getting the following error when trying to extract features; I have to do the split before the feature extraction. y contains the labels, so I'm not sure why I'm getting the error. The data is WAV files, and the labels are originally text.
for cls in os.listdir(path):
    for sound in tqdm(os.listdir(os.path.join(path, cls))):
        wav = librosa.load(os.path.join(path, cls, sound), sr=16000)[0].astype(np.float32)
        tmp_samples.append(wav[0])
        tmp_labels.append(cls)

tmp_labels = np.array(tmp_labels)

X_train, y_train, X_test, y_test = train_test_split(tmp_samples, tmp_labels, test_size=0.60, shuffle=True)

encoder = LabelBinarizer()
y_test = encoder.fit_transform(y_test)

minmax_scaler = MinMaxScaler()
X_train = np.asarray(X_train).reshape(-1, 1)
X_train = minmax_scaler.fit_transform(X_train)
X_test = np.asarray(X_test).reshape(-1, 1)
X_test = minmax_scaler.fit_transform(X_test)
y_test = encoder.fit_transform(y_test)

for x, y in zip(X_test, y_test):
    extract_features(x[0], y, model, plain_samples, plain_labels)

def extract_features(wav, cls, model, plain_samples, plain_labels):
    for feature in model(wav)[1]:
        plain_samples.append(feature)
        plain_labels.append(cls)
Error:
Traceback (most recent call last):
File "optunaCopy.py", line 523, in <module>
main(sys.argv[1:])
File "optunaCopy.py", line 439, in main
X_train, y_train , X_test , y_test,X_valid,y_valid = create_dataset(path)
File "optunaCopy.py", line 129, in create_dataset
X_test = minmax_scaler.fit_transform( X_test )
File "C:\Users\x\anaconda3\envs\yamnet\lib\site-packages\sklearn\base.py", line 844, in fit_transform
return self.fit(X, **fit_params).transform(X)
File "C:\Users\x\anaconda3\envs\yamnet\lib\site-packages\sklearn\preprocessing\_data.py", line 416, in fit
return self.partial_fit(X, y)
File "C:\Users\x\anaconda3\envs\yamnet\lib\site-packages\sklearn\preprocessing\_data.py", line 458, in partial_fit
force_all_finite="allow-nan",
File "C:\Users\x\anaconda3\envs\yamnet\lib\site-packages\sklearn\base.py", line 557, in _validate_data
X = check_array(X, **check_params)
File "C:\Users\x\anaconda3\envs\yamnet\lib\site-packages\sklearn\utils\validation.py", line 738, in check_array
array = np.asarray(array, order=order, dtype=dtype)
File "C:\Users\x\anaconda3\envs\yamnet\lib\site-packages\numpy\core\_asarray.py", line 83, in asarray
return array(a, dtype, copy=False, order=order)
ValueError: could not convert string to float: 'hh'
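For what it's worth, the unpacking order looks like the likely cause: train_test_split returns the splits in the order (X_train, X_test, y_train, y_test), so the code above puts the string labels into X_test, which is why the label 'hh' reaches MinMaxScaler. A minimal sketch of the corrected unpacking, keeping the question's variable names:

# train_test_split returns (X_train, X_test, y_train, y_test) in that order
X_train, X_test, y_train, y_test = train_test_split(
    tmp_samples, tmp_labels, test_size=0.60, shuffle=True)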

Related

ValueError: Found array with dim 3. Estimator expected <= 2 (explored the existing solutions)

I was running my Python program and received this error:
3 fits failed with the following error:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/usr/local/lib/python3.8/dist-packages/sklearn/neighbors/_classification.py", line 198, in fit
return self._fit(X, y)
File "/usr/local/lib/python3.8/dist-packages/sklearn/neighbors/_base.py", line 400, in _fit
X, y = self._validate_data(X, y, accept_sparse="csr", multi_output=True)
File "/usr/local/lib/python3.8/dist-packages/sklearn/base.py", line 581, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "/usr/local/lib/python3.8/dist-packages/sklearn/utils/validation.py", line 964, in check_X_y
X = check_array(
File "/usr/local/lib/python3.8/dist-packages/sklearn/utils/validation.py", line 794, in check_array
raise ValueError(
ValueError: Found array with dim 3. Estimator expected <= 2.
My code looks like this:
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=3, random_state=1, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
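The traceback shows the estimator (a KNN classifier, per the _classification.py frame) receiving a 3-D X; scikit-learn estimators expect a 2-D array of shape (n_samples, n_features). A minimal sketch, assuming the extra dimensions carry per-sample features that can simply be flattened:

import numpy as np

X_train = np.asarray(X_train)
# collapse everything after the sample axis into one feature axis,
# e.g. (n_samples, a, b) -> (n_samples, a*b)
X_train = X_train.reshape(len(X_train), -1)
cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')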

Pandas Array Exception: Data must be 1-Dimensional

This is my Python script for using the Markov Blanket algorithm on my dataset:
df1 = read_csv("input-binary-120-training.csv")
Y1 = df1[df1.CategoryL == 1].CategoryL
X1 = minmax_scale(df1[df1.CategoryL == 1].ix[:, 1:24], axis=0)
y_train = Y1.values

df2 = read_csv("input-binary-120-test.csv")
Y2 = df2[df2.CategoryL == 1].CategoryL
X2 = minmax_scale(df2[df2.CategoryL == 1].ix[:, 1:24], axis=0)
y_test = Y2.values
x_test = X2.reshape(X2.shape[0], X2.shape[1], 1)

seed(2017)
kfold = KFold(n_splits=5, random_state=27, shuffle=True)
scores = list()

# Create a PyImpetus classification object and initialize with required parameters
model = PPIMBC(LogisticRegression(random_state=27, max_iter=1000, class_weight="balanced"),
               cv=0, num_simul=20, simul_type=0, simul_size=0.2, random_state=27,
               sig_test_type="non-parametric", verbose=2, p_val_thresh=0.05)
x_train = model.fit_transform(X1, Y1)
x_test = model.transform(x_test)
print("Markov Blanket: ", model.MB)
But for the line x_train = model.fit_transform(X1, Y1) I got the exception:
Data must be 1-Dimensional.
I tried X1.flatten(), but it doesn't work. Could you please advise me on this issue?
Full error:
x_train = model.fit_transform(X1, Y1)
File "/home/osboxes/Downloads/Thesis/PyImpetus.py", line 326, in fit_transform
self.fit(data, Y)
File "/home/osboxes/Downloads/Thesis/PyImpetus.py", line 299, in fit
final_MB, final_feat_imp = self._find_MB(data.copy(), Y)
File "/home/osboxes/Downloads/Thesis/PyImpetus.py", line 221, in _find_MB
Y = np.reshape(Y, (-1, 1))
File "<__array_function__ internals>", line 6, in reshape
File "/home/osboxes/venv/lib/python3.6/site-packages/numpy/core/fromnumeric.py", line 299, in reshape
return _wrapfunc(a, 'reshape', newshape, order=order)
File "/home/osboxes/venv/lib/python3.6/site-packages/numpy/core/fromnumeric.py", line 55, in _wrapfunc
return _wrapit(obj, method, *args, **kwds)
File "/home/osboxes/venv/lib/python3.6/site-packages/numpy/core/fromnumeric.py", line 48, in _wrapit
result = wrap(result)
File "/home/osboxes/venv/lib/python3.6/site-packages/pandas/core/generic.py", line 1999, in __array_wrap__
return self._constructor(result, **d).__finalize__(self)
File "/home/osboxes/venv/lib/python3.6/site-packages/pandas/core/series.py", line 311, in __init__
data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True)
File "/home/osboxes/venv/lib/python3.6/site-packages/pandas/core/internals/construction.py", line 729, in sanitize_array
raise Exception("Data must be 1-dimensional")
Exception: Data must be 1-dimensional
Try reshaping Y1 to one dimension, with either Y1 = Y1[:, 0] or Y1 = Y1.ravel().
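Since Y1 is a pandas Series in the code above, a variant that converts it to a flat NumPy array first may be safer (the y1 name is just for illustration):

# Y1 is a pandas Series; hand PyImpetus a plain 1-D NumPy array instead,
# so pandas does not try to wrap the internal np.reshape(Y, (-1, 1)) result
y1 = Y1.values.ravel()   # shape (n_samples,)
x_train = model.fit_transform(X1, y1)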

Performing K-fold cross-validation with scoring = 'f1', 'recall', or 'precision' for a multi-class problem

I know this can easily be implemented for a binary classification problem, but it seems to be a bit tougher for a multi-class problem.
I have an unbalanced dataset that is an example of a 4-class classification problem. I have applied RandomForestClassifier() to it to test various measures of the algorithm, such as accuracy, precision, recall, and f1_score. Now I want to perform K-fold cross-validation on the training set with 10 splits, with the 'scoring' parameter of the cross_val_score() function set to 'f1' instead of 'accuracy'.
My code:
# Random Forest
np.random.seed(123)
from sklearn.ensemble import RandomForestClassifier
classifier_RF = RandomForestClassifier(random_state = 0)
classifier_RF.fit(X_train, Y_train)
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier_RF, X = X_train, y = Y_train, cv = 10, scoring = 'f1')
print("F1_Score: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
However, when I try to run this code, I am getting an error as follows:
ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].
I have tried setting the average parameter to 'weighted' in cross_val_score() as follows:
accuracies = cross_val_score(estimator = classifier_RF, X = X_train, y = Y_train, cv = 10, scoring = 'f1', average = 'weighted')
but that's giving an error as follows:
TypeError: cross_val_score() got an unexpected keyword argument 'average'
The entire traceback is as follows:
Traceback (most recent call last):
File "<ipython-input-1-ba4a5e1de09a>", line 97, in <module>
accuracies = cross_val_score(estimator = classifier_RF, X = X_train, y = Y_train, cv = 10, scoring = 'f1')
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 406, in cross_val_score
error_score=error_score)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 248, in cross_validate
for train, test in cv.split(X, y, groups))
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 1048, in __call__
if self.dispatch_one_batch(iterator):
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 866, in dispatch_one_batch
self._dispatch(tasks)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 784, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 572, in __init__
self.results = batch()
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 263, in __call__
for func, args, kwargs in self.items]
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 263, in <listcomp>
for func, args, kwargs in self.items]
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 560, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 607, in _score
scores = scorer(estimator, X_test, y_test)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 88, in __call__
*args, **kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 213, in _score
**self._kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1047, in f1_score
zero_division=zero_division)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1175, in fbeta_score
zero_division=zero_division)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1434, in precision_recall_fscore_support
pos_label)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1265, in _check_set_wise_labels
% (y_type, average_options))
ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].
You need to use make_scorer to define your metric and its parameters:
from sklearn.metrics import make_scorer, f1_score

f1_weighted = make_scorer(f1_score, average='weighted')

and then use this in your cross_val_score (note that cross_val_score takes a single scorer, not a dict; dicts of scorers are for cross_validate):

results = cross_val_score(estimator=classifier_RF,
                          X=X_train,
                          y=Y_train,
                          cv=10,
                          scoring=f1_weighted)
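Alternatively, scikit-learn ships predefined scorer strings for the averaged F1 variants, so the same call can be written without make_scorer:

results = cross_val_score(estimator=classifier_RF, X=X_train, y=Y_train,
                          cv=10, scoring='f1_weighted')  # or 'f1_macro', 'f1_micro'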

Cannot fit training data features to match label data after vectorizing

I have a school project that requires me to use machine learning. After several rounds of troubleshooting I've hit a dead end and don't know how to solve it.
I have this code:
db_connection = 'mysql+pymysql://root:#localhost/databases'
conn = create_engine(db_connection)
df = pd.read_sql("SELECT * from barang", conn)
cth_data = pd.DataFrame(df)
#print(cth_data.head())
cth_data = cth_data.dropna()

y = cth_data['kode_aset']
x = cth_data[['merk','ukuran','bahan','harga']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

clf = RandomForestClassifier(n_estimators=100)
vectorizer = CountVectorizer(max_features=50000, ngram_range=(1, 50))
d_feture = vectorizer.fit_transform(x_train)
#d_label = vectorizer.transform(y_train)

clf.fit(d_feture, y_train)
t_data = vectorizer.transform(x_test)
y_pred = clf.predict(t_data)
print("Model_Accuracy: " + str(np.mean(y_pred == y_test)))
I fetched the data from a MySQL database. [Screenshot of the database table omitted.] I ended up with this kind of error:
File "Machine_learn_V_0.0.1.py", line 41, in <module>
clf.fit(d_feture, y_train)
File "C:\Python35\lib\site-packages\sklearn\ensemble\forest.py", line 333, in fit
for i, t in enumerate(trees))
File "C:\Python35\lib\site-packages\sklearn\externals\joblib\parallel.py", line 917, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Python35\lib\site-packages\sklearn\externals\joblib\parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Python35\lib\site-packages\sklearn\externals\joblib\parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Python35\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 182, in apply_async
result = ImmediateResult(func)
File "C:\Python35\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 549, in __init__
self.results = batch()
File "C:\Python35\lib\site-packages\sklearn\externals\joblib\parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "C:\Python35\lib\site-packages\sklearn\externals\joblib\parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Python35\lib\site-packages\sklearn\ensemble\forest.py", line 119, in _parallel_build_trees
tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
File "C:\Python35\lib\site-packages\sklearn\tree\tree.py", line 801, in fit
X_idx_sorted=X_idx_sorted)
File "C:\Python35\lib\site-packages\sklearn\tree\tree.py", line 236, in fit
"number of samples=%d" % (len(y), n_samples))
ValueError: Number of labels=223 does not match number of samples=4
CountVectorizer takes strings and cannot process multiple columns the way you hoped it would, which means you should concatenate the strings from cth_data[['merk','ukuran','bahan','harga']] into a single column, e.g.:
cols = ['merk','ukuran','bahan','harga']
cth_data['combined'] = cth_data[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
x = cth_data["combined"]
From there on, your code should work.
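A minimal sketch of how the combined column then flows through the rest of the question's code (variable names reused from the question):

x = cth_data['combined']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# each row of x_train is now a single string, which CountVectorizer accepts
d_feture = vectorizer.fit_transform(x_train)
clf.fit(d_feture, y_train)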

How to use SMOTENC inside pipeline (Error: Some of the categorical indices are out of range)?

I would greatly appreciate it if you could let me know how to use SMOTENC. I wrote:
# Data
XX = pd.read_csv('Financial Distress.csv')
y = np.array(XX['Financial Distress'].values.tolist())
y = np.array([0 if i > -0.50 else 1 for i in y])
Na = np.array(pd.read_csv('Na.csv', header=None).values)
XX = XX.iloc[:, 3:127]

# Use get_dummies to convert categorical features into dummy ones
dis_features = ['x121']
X = pd.get_dummies(XX, columns=dis_features)

# Divide data into train and test
indices = np.arange(y.shape[0])
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, indices, stratify=y, test_size=0.3, random_state=42)

num_indices = list(X)[:X.shape[1]-37]
cat_indices = list(X)[X.shape[1]-37:]
num_indices1 = list(X.iloc[:, np.r_[0:94, 95, 97, 100:123]].columns.values)
cat_indices1 = list(X.iloc[:, np.r_[94, 96, 98, 99, 123:160]].columns.values)
print(len(num_indices1))
print(len(cat_indices1))

pipeline = Pipeline(steps=[
    # Categorical features
    ('feature_processing', FeatureUnion(transformer_list=[
        ('categorical', MultiColumn(cat_indices)),
        # Numeric features
        ('numeric', Pipeline(steps=[
            ('select', MultiColumn(num_indices)),
            ('scale', StandardScaler())
        ]))
    ])),
    ('clf', rg)
])
pipeline_with_resampling = make_pipeline(SMOTENC(categorical_features=cat_indices1), pipeline)

# Grid search to determine best params
cv = StratifiedKFold(n_splits=5, random_state=42)
rg_cv = GridSearchCV(pipeline_with_resampling, param_grid, cv=cv, scoring='f1')
rg_cv.fit(X_train, y_train)
As indicated above, I have 5 categorical features. In fact, indices 123 to 160 all belong to a single categorical feature with 37 possible values, which get_dummies converts into 37 columns. Unfortunately, it throws the following error:
Traceback (most recent call last):
File "D:/mifs-master_2/MU/learning-from-imbalanced-classes-master/learning-from-imbalanced-classes-master/continuous/Final Logit/SMOTENC/logit-final - Copy.py", line 424, in <module>
rg_cv.fit(X_train, y_train)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 722, in fit
self._run_search(evaluate_candidates)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 1191, in _run_search
evaluate_candidates(ParameterGrid(self.param_grid))
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 711, in evaluate_candidates
cv.split(X, y, groups)))
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 917, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 182, in apply_async
result = ImmediateResult(func)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 549, in __init__
self.results = batch()
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 528, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\pipeline.py", line 237, in fit
Xt, yt, fit_params = self._fit(X, y, **fit_params)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\pipeline.py", line 200, in _fit
cloned_transformer, Xt, yt, **fit_params_steps[name])
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\memory.py", line 342, in __call__
return self.func(*args, **kwargs)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\pipeline.py", line 576, in _fit_resample_one
X_res, y_res = sampler.fit_resample(X, y, **fit_params)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\base.py", line 85, in fit_resample
output = self._fit_resample(X, y)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\over_sampling\_smote.py", line 940, in _fit_resample
self._validate_estimator()
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\over_sampling\_smote.py", line 933, in _validate_estimator
' should be between 0 and {}'.format(self.n_features_))
ValueError: Some of the categorical indices are out of range. Indices should be between 0 and 160
Thanks in advance.
As shown below, two pipelines should be used:
num_indices1 = list(X.iloc[:, np.r_[0:94, 95, 97, 100:120, 121:123]].columns.values)
cat_indices1 = list(X.iloc[:, np.r_[94, 96, 98, 99, 120]].columns.values)
print(len(num_indices1))
print(len(cat_indices1))
cat_indices = [94, 96, 98, 99, 120]

from imblearn.pipeline import make_pipeline

pipeline = Pipeline(steps=[
    # Categorical features
    ('feature_processing', FeatureUnion(transformer_list=[
        ('categorical', MultiColumn(cat_indices1)),
        # Numeric features
        ('numeric', Pipeline(steps=[
            ('select', MultiColumn(num_indices1)),
            ('scale', StandardScaler())
        ]))
    ])),
    ('clf', rg)
])
pipeline_with_resampling = make_pipeline(SMOTENC(categorical_features=cat_indices), pipeline)
You cannot convert your categorical variables to dummies with get_dummies and then use SMOTENC, because SMOTENC already applies the equivalent of get_dummies internally as part of its algorithm; doing both will bias your model.
However, I recommend using SMOTE() instead of SMOTENC(); in that case you must first apply get_dummies.
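A minimal sketch of that SMOTE-after-get_dummies route, reusing XX, dis_features, and y from the question (the random_state is an assumption):

from imblearn.over_sampling import SMOTE

# one-hot encode first, then oversample the fully numeric encoded matrix
X_enc = pd.get_dummies(XX, columns=dis_features)
X_res, y_res = SMOTE(random_state=42).fit_resample(X_enc, y)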
You cannot use a scikit-learn pipeline together with an imblearn pipeline. The imblearn pipeline implements fit_sample as well as fit_predict; the sklearn pipeline only implements fit_predict. You cannot combine them.
First, don't do the get_dummies. Then change how you build categorical_features: pass a list of booleans indicating, for each column, whether it is categorical or not.
Try this:
cat_cols = []
for col in x.columns:
    if x[col].dtype == 'object':  # or 'category' if that's the case
        cat_cols.append(True)
    else:
        cat_cols.append(False)
Then pass cat_cols to your SMOTENC:
smote_nc = SMOTENC(categorical_features=cat_cols, random_state=0)
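Then resampling is a one-liner, assuming x and y here are the raw (un-dummied) features and labels:

# oversample; the boolean mask tells SMOTENC which columns are categorical
X_res, y_res = smote_nc.fit_resample(x, y)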
