Python Doc2Vec sklearn ValueError Unknown - python

I'm using doc2vec to train a model on tagged sentences that can then be used for multiclass classification on other sentences in the future.
I was able to vectorize the sentences but am now getting an error when I try to train the model.
ValueError: Unknown label type: 'unknown'
I'm very new at this, but after searching other posts, it looks like it has to do with my y value not being an array. I'm not sure how to solve this, though. Can someone please suggest a resolution?
Here are the relevant parts of my code:
import pandas as pd
import numpy as np
np.random.seed(0)
def read_text_file(f):
    df_complete = pd.read_csv(f)
    df = df_complete.loc[: , ["Text", "Score"]]
    df.dropna(how = "any", inplace = True)
    return df
df = read_text_file("input/Reviews.csv")
print(df.head())
def sampling_dataset(df):
    count = 5000
    class_df_sampled = pd.DataFrame(columns = ["Score", "Text"])
    temp = []
    for c in df.Score.unique():
        class_indexes = df[df.Score == c].index
        random_indexes = np.random.choice(class_indexes, count, replace = False)
        temp.append(df.loc[random_indexes])
    for each_df in temp:
        class_df_sampled = pd.concat([class_df_sampled, each_df], axis = 0)
    return class_df_sampled
df = sampling_dataset(df)
df.reset_index(drop = True, inplace = True)
print(df.head())
print(df.shape)
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re
lmtzr = WordNetLemmatizer()
w = re.compile("\w+", re.I)
def label_sentences(df):
    labeled_sentences = []
    for index, datapoint in df.iterrows():
        tokenized_words = re.findall(w, datapoint["Text"].lower())
        labeled_sentences.append(LabeledSentence(words = tokenized_words, tags = ['SENT_%s' % index]))
    return labeled_sentences
def train_doc2vec_model(labeled_sentences):
    model = Doc2Vec(alpha = 0.025, min_alpha = 0.025)
    model.build_vocab(labeled_sentences)
    for epoch in range(10):
        model.train(labeled_sentences, total_examples = 25000, epochs = 10)
        model.alpha -= 0.002
        model.min_alpha = model.alpha
    return model
sen = label_sentences(df)
model = train_doc2vec_model(sen)
def vectorize_comments(df, d2v_model):
    y = []
    comments = []
    for i in range(0, df.shape[0]):
        label = 'SENT_%s' % i
        comments.append(d2v_model.docvecs[label])
    df['vectorized_comments'] = comments
    return df
df = vectorize_comments(df, model)
print(df.head(2))
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier as RFC
import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
import pickle
def train_classifier(X, y):
    n_estimators = [200, 400]
    min_samples_split = [2]
    min_samples_leaf = [1]
    bootstrap = [True]
    parameters = {
        'n_estimators': n_estimators,
        'min_samples_leaf': min_samples_leaf,
        'min_samples_split': min_samples_split
    }
    clf = GridSearchCV(RFC(verbose = 1, n_jobs = 4), cv = 4, param_grid = parameters)
    clf.fit(X, y)
    return clf
X_train, X_test, y_train, y_test = cross_validation.train_test_split(df['vectorized_comments'].T.tolist(), df['Score'], test_size = 0.02, random_state = 17)
classifier = train_classifier(X_train, y_train)
print(classifier.best_score_, "----------------Best Accuracy score on Cross Validation Sets")
print(classifier.score(X_test, y_test))
f = open("Output.txt", "w")
f.write("Best Accuracy score on Cross Validation Sets %f" % classifier.best_score_, )
f.write("Score on Test Set %f" % classifier.score(X_test, y_test))
f.close()
Here is the full stack trace:
Traceback (most recent call last):
File "<ipython-input-4-a9ad2a977535>", line 1, in <module>
runfile('C:/Users/user/.spyder-py3/multiclass doc2vec.py', wdir='C:/Users/user/.spyder-py3')
File "C:\Users\user\Anaconda31\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "C:\Users\user\Anaconda31\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/user/.spyder-py3/multiclass doc2vec.py", line 105, in <module>
classifier = train_classifier(X_train,y_train)
File "C:/Users/user/.spyder-py3/multiclass doc2vec.py", line 101, in train_classifier
clf.fit(X, y)
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\grid_search.py", line 838, in fit
return self._fit(X, y, ParameterGrid(self.param_grid))
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\grid_search.py", line 574, in _fit
for parameters in parameter_iterable
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
result = ImmediateResult(func)
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
self.results = batch()
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\cross_validation.py", line 1675, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\ensemble\forest.py", line 273, in fit
y, expanded_class_weight = self._validate_y_class_weight(y)
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\ensemble\forest.py", line 471, in _validate_y_class_weight
check_classification_targets(y)
File "C:\Users\user\Anaconda31\lib\site-packages\sklearn\utils\multiclass.py", line 172, in check_classification_targets
raise ValueError("Unknown label type: %r" % y_type)
ValueError: Unknown label type: 'unknown'
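One likely cause, given the suspicion above about the y value: after sampling_dataset rebuilds the DataFrame with pd.concat, the Score column typically ends up with object dtype, which scikit-learn's check_classification_targets reports as an 'unknown' label type. A minimal, hedged sketch of casting the labels to plain integers before the split (assuming Score holds whole-number review scores):
# Hedged sketch: make sure y is a plain integer array before fitting.
# Assumes df['Score'] holds whole-number scores that became object dtype
# after the pd.concat inside sampling_dataset.
X = df['vectorized_comments'].tolist()
y = df['Score'].astype(int)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.02, random_state=17)
classifier = train_classifier(X_train, y_train)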

Related

Sklearn: not receiving transformer parameters while trying to crossvalidate pipeline

I am trying to use the cross_validate function on a pipeline. The pipeline works correctly if I train it normally with fit, but I get an error when I use cross_validate: the parameters I pass to the transformer in the pipeline are NoneType when I use the cross_validate function. Why is this the case, and how could I fix it? I tried to make a minimal example here:
from sklearn.model_selection import cross_validate
from simpletransformers.classification import ClassificationModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
class Transformer(BaseEstimator, TransformerMixin):
    def __init__(self, model, mtype, mname, num_labels: int):
        self._model = model
        self._inst_model = None
        self._num_labels = num_labels
        self._type = mtype
        self._name = mname

    def fit(self, train_input, y=None):
        self._create_model()
        self._train_model(train_input)
        return self

    def transform(self, eval_df, y=None):
        result, model_outputs, wrong_predictions = self._inst_model.eval_model(eval_df=eval_df)
        return model_outputs

    def _create_model(self):
        self._inst_model = self._model(self._type, self._name, args={"output_dir": 'min_ex'},
                                       num_labels=self._num_labels)

    def _train_model(self, train_input):
        train_df, eval_df = train_test_split(train_input, test_size=0.20)
        return self._inst_model.train_model(train_df, eval_df=eval_df)

if __name__ == '__main__':
    categories = ['sci.med', 'sci.space']
    X_t, y_t = fetch_20newsgroups(random_state=1,
                                  subset='train',
                                  categories=categories,
                                  remove=('footers', 'quotes'),
                                  return_X_y=True)
    X = pd.DataFrame({
        'text': X_t,
        'labels': y_t,
    })
    y = y_t
    transformer_grid = {
        "model": ClassificationModel,
        "num_labels": 14,
        "mtype": "electra",
        "mname": "german-nlp-group/electra-base-german-uncased"
    }
    classifier_grid = {
        'n_estimators': 100,
        'random_state': 42
    }
    pipe = Pipeline([
        ('feats', FeatureUnion([
            ('transformer', Pipeline([
                ('transformer', Transformer(**transformer_grid)),
            ])),
        ])),
        ('classifier', RandomForestClassifier(**classifier_grid))
    ])
    # pipe.fit(X, y)
    cv_results = cross_validate(pipe, X, y, cv=5, scoring='accuracy', n_jobs=1)
And the error I am getting is this
2021-03-04 16:05:54.861544: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
/root/complex_semantics/lib/python3.8/site-packages/sklearn/base.py:209: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.
warnings.warn('From version 0.24, get_params will raise an '
/root/complex_semantics/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:548: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
Traceback (most recent call last):
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 330, in fit
Xt = self._fit(X, y, **fit_params_steps)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 292, in _fit
X, fitted_transformer = fit_transform_one_cached(
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/memory.py", line 352, in __call__
return self.func(*args, **kwargs)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 953, in fit_transform
results = self._parallel_func(X, y, fit_params, _fit_transform_one)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 978, in _parallel_func
return Parallel(n_jobs=self.n_jobs)(delayed(func)(
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 1029, in __call__
if self.dispatch_one_batch(iterator):
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 847, in dispatch_one_batch
self._dispatch(tasks)
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 765, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 572, in __init__
self.results = batch()
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 252, in __call__
return [func(*args, **kwargs)
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 252, in <listcomp>
return [func(*args, **kwargs)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 376, in fit_transform
return last_step.fit_transform(Xt, y, **fit_params_last_step)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/base.py", line 693, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "min_ex.py", line 23, in fit
self._create_model()
File "min_ex.py", line 32, in _create_model
self._inst_model = self._model(self._type, self._name, args={"output_dir": 'min_ex'},
TypeError: 'NoneType' object is not callable
warnings.warn("Estimator fit failed. The score on this train-test"
Edit:
I changed the code to use another variable for the instantiated model, but I still get the same issue. I took out the print statements for easier reading. The only thing missing for the code to run is the loading of the data. It still gives me the same error, and only with cross_validate.
Edit 2:
Created a minimal reproducible example by adding a synthetic dataset
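For context on why this only fails under cross_validate: cross_validate clones the pipeline for every fold, and scikit-learn's clone() rebuilds each estimator from get_params(), which by convention only finds constructor arguments stored under attributes with the same names. Because this Transformer stores them as self._model, self._type, and so on, the cloned copy gets None back for every parameter (exactly what the FutureWarning about get_params hints at), and self._model(...) then raises 'NoneType' object is not callable. A hedged sketch of the usual fix, keeping the constructor arguments under matching attribute names:
class Transformer(BaseEstimator, TransformerMixin):
    def __init__(self, model=None, mtype=None, mname=None, num_labels=2):
        # Store every constructor argument under the same attribute name,
        # so get_params()/clone() can reconstruct the transformer per fold.
        self.model = model
        self.mtype = mtype
        self.mname = mname
        self.num_labels = num_labels

    def fit(self, train_input, y=None):
        # Instantiate and train the wrapped model lazily, inside fit.
        self._inst_model = self.model(self.mtype, self.mname,
                                      args={"output_dir": 'min_ex'},
                                      num_labels=self.num_labels)
        train_df, eval_df = train_test_split(train_input, test_size=0.20)
        self._inst_model.train_model(train_df, eval_df=eval_df)
        return self

    def transform(self, eval_df, y=None):
        result, model_outputs, wrong_predictions = self._inst_model.eval_model(eval_df=eval_df)
        return model_outputs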

performing K-fold Cross Validation with scoring = 'f1 or Recall or Precision' for multi-class problem

I know this can easily be implemented for a binary classification problem. But it seems to be a bit tough in the case of a multi-class problem.
I have an unbalanced dataset that is an example of a 4-class classification problem. I have applied RandomForestClassifier() to it to test various measures of the algorithm such as accuracy, precision, recall, f1_score, etc. Now I want to perform K-fold Cross Validation on the training set with 10 splits, and I want the 'scoring' parameter of the cross_val_score() function to be 'f1' instead of 'accuracy'.
My code:
# Random Forest
np.random.seed(123)
from sklearn.ensemble import RandomForestClassifier
classifier_RF = RandomForestClassifier(random_state = 0)
classifier_RF.fit(X_train, Y_train)
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier_RF, X = X_train, y = Y_train, cv = 10, scoring = 'f1')
print("F1_Score: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
However, when I try to run this code, I am getting an error as follows:
ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].
I have tried setting the average parameter to 'weighted' in the cross_val_score() call as follows:
accuracies = cross_val_score(estimator = classifier_RF, X = X_train, y = Y_train, cv = 10, scoring = 'f1', average = 'weighted')
but that's giving an error as follows:
TypeError: cross_val_score() got an unexpected keyword argument 'average'
The entire traceback is as follows:
Traceback (most recent call last):
File "<ipython-input-1-ba4a5e1de09a>", line 97, in <module>
accuracies = cross_val_score(estimator = classifier_RF, X = X_train, y = Y_train, cv = 10, scoring = 'f1')
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 406, in cross_val_score
error_score=error_score)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 248, in cross_validate
for train, test in cv.split(X, y, groups))
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 1048, in __call__
if self.dispatch_one_batch(iterator):
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 866, in dispatch_one_batch
self._dispatch(tasks)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 784, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 572, in __init__
self.results = batch()
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 263, in __call__
for func, args, kwargs in self.items]
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 263, in <listcomp>
for func, args, kwargs in self.items]
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 560, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 607, in _score
scores = scorer(estimator, X_test, y_test)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 88, in __call__
*args, **kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 213, in _score
**self._kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1047, in f1_score
zero_division=zero_division)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1175, in fbeta_score
zero_division=zero_division)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1434, in precision_recall_fscore_support
pos_label)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1265, in _check_set_wise_labels
% (y_type, average_options))
ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].
You need to use make_scorer to define your metric and its parameters:
from sklearn.metrics import make_scorer, f1_score
scoring = {'f1_score' : make_scorer(f1_score, average='weighted')}
and then use this in your cross_val_score:
results = cross_val_score(estimator = classifier_RF,
                          X = X_train,
                          y = Y_train,
                          cv = 10,
                          scoring = scoring['f1_score'])
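Alternatively, sklearn ships a built-in scoring string that does the same thing, so you can skip make_scorer:
accuracies = cross_val_score(estimator = classifier_RF, X = X_train, y = Y_train,
                             cv = 10, scoring = 'f1_weighted')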

Can not fit training data feature to match label data after Vectorizing

I have a school project that requires me to use machine learning. After several rounds of troubleshooting I've hit a dead end and don't know how to solve it.
I have this code:
db_connection = 'mysql+pymysql://root:#localhost/databases'
conn = create_engine(db_connection)
df = pd.read_sql("SELECT * from barang", conn)
cth_data = pd.DataFrame(df)
#print(cth_data.head())
cth_data = cth_data.dropna()
y = cth_data['kode_aset']
x = cth_data[['merk','ukuran','bahan','harga']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
clf=RandomForestClassifier(n_estimators=100)
vectorizer = CountVectorizer( max_features = 50000, ngram_range = ( 1,50 ) )
d_feture = vectorizer.fit_transform(x_train)
#d_label = vectorizer.transform(y_train)
clf.fit(d_feture, y_train)
t_data = vectorizer.transform(x_test)
y_pred=clf.predict(t_data)
print ("Model_Accuracy: " + str(np.mean(y_pred == y_test)))
I fetched the data from a MySQL database (screenshot of the database omitted here) and ended up with this error:
File "Machine_learn_V_0.0.1.py", line 41, in <module>
clf.fit(d_feture, y_train)
File "C:\Python35\lib\site-packages\sklearn\ensemble\forest.py", line 333, in fit
for i, t in enumerate(trees))
File "C:\Python35\lib\site-packages\sklearn\externals\joblib\parallel.py", line 917, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Python35\lib\site-packages\sklearn\externals\joblib\parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Python35\lib\site-packages\sklearn\externals\joblib\parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Python35\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 182, in apply_async
result = ImmediateResult(func)
File "C:\Python35\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 549, in __init__
self.results = batch()
File "C:\Python35\lib\site-packages\sklearn\externals\joblib\parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "C:\Python35\lib\site-packages\sklearn\externals\joblib\parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Python35\lib\site-packages\sklearn\ensemble\forest.py", line 119, in _parallel_build_trees
tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
File "C:\Python35\lib\site-packages\sklearn\tree\tree.py", line 801, in fit
X_idx_sorted=X_idx_sorted)
File "C:\Python35\lib\site-packages\sklearn\tree\tree.py", line 236, in fit
"number of samples=%d" % (len(y), n_samples))
ValueError: Number of labels=223 does not match number of samples=4
CountVectorizer takes strings and cannot process multiple columns the way you hoped, which means you should concatenate the strings from cth_data[['merk','ukuran','bahan','harga']] into a single column, e.g.:
cols = ['merk','ukuran','bahan','harga']
cth_data['combined'] = cth_data[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
x = cth_data["combined"]
From there on, your code should work.
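For completeness, a hedged sketch of the rest of the flow using the combined column (keeping the question's parameters and assuming the same imports as in the question):
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
vectorizer = CountVectorizer(max_features=50000, ngram_range=(1, 50))
d_feature = vectorizer.fit_transform(x_train)  # one string per row, so samples now match labels
clf = RandomForestClassifier(n_estimators=100)
clf.fit(d_feature, y_train)
t_data = vectorizer.transform(x_test)
y_pred = clf.predict(t_data)
print("Model_Accuracy: " + str(np.mean(y_pred == y_test)))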

How to use SMOTENC inside pipeline (Error: Some of the categorical indices are out of range)?

I would greatly appreciate if you could let me know how to use SMOTENC. I wrote:
# Data
XX = pd.read_csv('Financial Distress.csv')
y = np.array(XX['Financial Distress'].values.tolist())
y = np.array([0 if i > -0.50 else 1 for i in y])
Na = np.array(pd.read_csv('Na.csv', header=None).values)
XX = XX.iloc[:, 3:127]
# Use get-dummies to convert categorical features into dummy ones
dis_features = ['x121']
X = pd.get_dummies(XX, columns=dis_features)
# # Divide Data into Train and Test
indices = np.arange(y.shape[0])
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, indices, stratify=y, test_size=0.3, random_state=42)
num_indices=list(X)[:X.shape[1]-37]
cat_indices=list(X)[X.shape[1]-37:]
num_indices1 = list(X.iloc[:,np.r_[0:94,95,97,100:123]].columns.values)
cat_indices1 = list(X.iloc[:,np.r_[94,96,98,99,123:160]].columns.values)
print(len(num_indices1))
print(len(cat_indices1))
pipeline = Pipeline(steps=[
    # Categorical features
    ('feature_processing', FeatureUnion(transformer_list=[
        ('categorical', MultiColumn(cat_indices)),
        # numeric
        ('numeric', Pipeline(steps=[
            ('select', MultiColumn(num_indices)),
            ('scale', StandardScaler())
        ]))
    ])),
    ('clf', rg)
])
pipeline_with_resampling = make_pipeline(SMOTENC(categorical_features=cat_indices1), pipeline)
# # Grid Search to determine best params
cv=StratifiedKFold(n_splits=5,random_state=42)
rg_cv = GridSearchCV(pipeline_with_resampling, param_grid, cv=cv, scoring = 'f1')
rg_cv.fit(X_train, y_train)
So, as indicated, I have 5 categorical features. In fact, indices 123 to 160 all belong to one categorical feature with 37 possible values, which was converted into 37 columns using get_dummies. Unfortunately, it throws the following error:
Traceback (most recent call last):
File "D:/mifs-master_2/MU/learning-from-imbalanced-classes-master/learning-from-imbalanced-classes-master/continuous/Final Logit/SMOTENC/logit-final - Copy.py", line 424, in <module>
rg_cv.fit(X_train, y_train)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 722, in fit
self._run_search(evaluate_candidates)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 1191, in _run_search
evaluate_candidates(ParameterGrid(self.param_grid))
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 711, in evaluate_candidates
cv.split(X, y, groups)))
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 917, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 182, in apply_async
result = ImmediateResult(func)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 549, in __init__
self.results = batch()
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 528, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\pipeline.py", line 237, in fit
Xt, yt, fit_params = self._fit(X, y, **fit_params)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\pipeline.py", line 200, in _fit
cloned_transformer, Xt, yt, **fit_params_steps[name])
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\memory.py", line 342, in __call__
return self.func(*args, **kwargs)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\pipeline.py", line 576, in _fit_resample_one
X_res, y_res = sampler.fit_resample(X, y, **fit_params)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\base.py", line 85, in fit_resample
output = self._fit_resample(X, y)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\over_sampling\_smote.py", line 940, in _fit_resample
self._validate_estimator()
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\over_sampling\_smote.py", line 933, in _validate_estimator
' should be between 0 and {}'.format(self.n_features_))
ValueError: Some of the categorical indices are out of range. Indices should be between 0 and 160
Thanks in advance.
As shown below, two pipelines should be used:
num_indices1 = list(X.iloc[:,np.r_[0:94,95,97,100:120,121:123]].columns.values)
cat_indices1 = list(X.iloc[:,np.r_[94,96,98,99,120]].columns.values)
print(len(num_indices1))
print(len(cat_indices1))
cat_indices = [94, 96, 98, 99, 120]
from imblearn.pipeline import make_pipeline
pipeline = Pipeline(steps=[
    # Categorical features
    ('feature_processing', FeatureUnion(transformer_list=[
        ('categorical', MultiColumn(cat_indices1)),
        # numeric
        ('numeric', Pipeline(steps=[
            ('select', MultiColumn(num_indices1)),
            ('scale', StandardScaler())
        ]))
    ])),
    ('clf', rg)
])
pipeline_with_resampling = make_pipeline(SMOTENC(categorical_features=cat_indices), pipeline)
You cannot apply get_dummies to your categorical variables and then use SMOTENC, because SMOTENC already handles the categorical encoding internally in its algorithm, and doing both will bias your model.
However, I recommend using SMOTE() instead of SMOTENC(); in that case you must first apply get_dummies.
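A minimal sketch of that alternative, assuming X_train already contains the get_dummies output as in the question and an imblearn version with fit_resample (older releases call it fit_sample):
from imblearn.over_sampling import SMOTE
# Oversample the training split only, after get_dummies has been applied.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)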
You cannot use a scikit-learn pipeline with an imblearn pipeline. The imblearn pipeline implements fit_sample as well as fit_predict; the sklearn pipeline only implements fit_predict. You cannot combine them.
First, don't do the get_dummies. Then change the way you build categorical_features, and pass a list of booleans indicating whether each column is categorical or not.
Try this:
cat_cols = []
for col in x.columns:
    if x[col].dtype == 'object':  # or 'category' if that's the case
        cat_cols.append(True)
    else:
        cat_cols.append(False)
Then pass cat_cols to your SMOTENC:
smote_nc = SMOTENC(categorical_features=cat_cols, random_state=0)

Problems with combining Keras 2.0 and pymc3

I am trying to combine Keras 2.0 with pymc3 to build a neural network. It is a modification of the code from Thomas Wiecki's Bayesian Deep Learning II.
This is the code I have:
import numpy as np
import pymc3 as pm
import theano
import theano.tensor as T
from keras.layers import Input, Dense
from keras import backend as K
from sklearn import datasets
from sklearn.preprocessing import scale
from sklearn.cross_validation import train_test_split
from sklearn.datasets import make_moons
from scipy.stats import mode
X, Y = make_moons(noise=0.2, random_state=0, n_samples=1000)
X = scale(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.6)
ann_input = theano.shared(X_train.astype(np.float32))
ann_output = theano.shared(Y_train.astype(np.float32))
print (X_train.shape)
print (Y_train.shape)
class GaussWeights(object):
    def __init__(self):
        self.count = 0

    def __call__(self, shape, name='w', dtype=None):
        return pm.Normal(
            name, mu=0, sd=.1,
            testval=K.random_normal(shape, dtype=dtype),
            shape=shape)
n_hidden = 16
def build_ann(x, y, init):
    b = (T.ones_like(x[:]))
    rows = b.shape.eval()[0]
    cols = b.shape.eval()[1]
    with pm.Model() as m:
        i = Input(tensor=x, shape=(rows, cols))
        layer1 = Dense(16, kernel_initializer=init, activation='tanh')(i)
        layer2 = Dense(1, kernel_initializer=init, activation='sigmoid')(layer1)
        layer2 = layer2.reshape((rows,))
        out = pm.Bernoulli('out', layer2, observed=y)
    return m, out
#m,out = build_ann(ann_input, ann_output)
m,out = build_ann(ann_input, ann_output, GaussWeights())
with m:
    # Run ADVI, which returns posterior means, standard deviations, and the evidence lower bound (ELBO)
    ann_input.set_value(X_train.astype(np.float32))
    ann_output.set_value(Y_train.astype(np.float32))
    v_params = pm.variational.advi(n=50000)
    trace = pm.variational.sample_vp(v_params, draws=5000)
# Replace shared variables with testing set
ann_input.set_value(X_test.astype(np.float32))
ann_output.set_value(Y_test.astype(np.float32))
with m:
    ppc = pm.sample_ppc(trace, samples=500)
# Use probability of > 0.5 to assume prediction of class 1
pred = ppc['out'].mean(axis=0) > 0.5
pred_mode = mode(ppc['out'], axis=0).mode[0, :]
print (pred.shape)
print('Accuracy = {}%'.format((Y_test == pred).mean() * 100))
But I get the following error which I don't know how to fix:
Traceback (most recent call last):
File "keras_deep_learning.py", line 50, in <module>
m,out = build_ann(ann_input, ann_output, GaussWeights())
File "keras_deep_learning.py", line 43, in build_ann
layer1 = Dense(16,kernel_initializer=init, activation='tanh')(i)
File "/home/gbenga/.local/lib/python3.5/site-packages/keras/engine/topology.py", line 558, in __call__
self.build(input_shapes[0])
File "/home/gbenga/.local/lib/python3.5/site-packages/keras/layers/core.py", line 827, in build
constraint=self.kernel_constraint)
File "/home/gbenga/.local/lib/python3.5/site-packages/keras/legacy/interfaces.py", line 88, in wrapper
return func(*args, **kwargs)
File "/home/gbenga/.local/lib/python3.5/site-packages/keras/engine/topology.py", line 391, in add_weight
weight = K.variable(initializer(shape), dtype=dtype, name=name)
File "/home/gbenga/.local/lib/python3.5/site-packages/keras/backend/theano_backend.py", line 143, in variable
value = value.eval()
File "/home/gbenga/.local/lib/python3.5/site-packages/theano/gof/graph.py", line 516, in eval
self._fn_cache[inputs] = theano.function(inputs, self)
File "/home/gbenga/.local/lib/python3.5/site-packages/theano/compile/function.py", line 326, in function
output_keys=output_keys)
File "/home/gbenga/.local/lib/python3.5/site-packages/theano/compile/pfunc.py", line 486, in pfunc
output_keys=output_keys)
File "/home/gbenga/.local/lib/python3.5/site-packages/theano/compile/function_module.py", line 1794, in orig_function
output_keys=output_keys).create(
File "/home/gbenga/.local/lib/python3.5/site-packages/theano/compile/function_module.py", line 1446, in __init__
accept_inplace)
File "/home/gbenga/.local/lib/python3.5/site-packages/theano/compile/function_module.py", line 177, in std_fgraph
update_mapping=update_mapping)
File "/home/gbenga/.local/lib/python3.5/site-packages/theano/gof/fg.py", line 180, in __init__
self.__import_r__(output, reason="init")
File "/home/gbenga/.local/lib/python3.5/site-packages/theano/gof/fg.py", line 361, in __import_r__
raise MissingInputError("Undeclared input", variable=variable)
theano.gof.fg.MissingInputError: Undeclared input
Unfortunately, with Keras 2.0 you can no longer use a symbolic initializer for the weights. Try downgrading to Keras 1.2; it will work then.
See the following issues for reference:
https://github.com/fchollet/keras/issues/6546
https://github.com/fchollet/keras/issues/6551
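For example, assuming pip (1.2.2 was the last release in the 1.2 series):
pip install keras==1.2.2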
