AttributeError: 'PipelinedRDD' object has no attribute '_jdf' - python

I am fairly new to PySpark. I am getting an attribute error while trying to run a logistic regression. I am trying to run a logistic regression on minmaxscaler vectors to get the probability values of a likely match up between the data points.
number_games = df2.filter(df2.GAME_ID > 22000000).filter(
df2.GAME_ID < 40000000).groupby("TEAM_ABBREVIATION").agg(
(F.sum("FGM") / F.countDistinct("GAME_ID")).alias('Points_Per_Game'))
vectorassembler = VectorAssembler(inputCols=["Points_Per_Game"],
outputCol="Performance")
scaler = MinMaxScaler(inputCol="Performance", outputCol="Output")
vectors = vectorassembler.transform(number_games)
scaler_model = scaler.fit(vectors)
scaler_data = scaler_model.transform(vectors)
statistics_teams = scaler_data.select('TEAM_ABBREVIATION',
'Output') # teams match up against one another
statistics_teams
RDD2 = sc.parallelize(statistics_teams.collect())
# RDD4 = RDD2.map( lambda x: x.split()) even as a pipelineRDD I get the same attribute error
lr = LogisticRegression(maxIter=20, regParam=0.001)
logistic_model = lr.fit(RDD2)
logistic_model.show()
The error returned is:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-46-3c0eb05824a8> in <module>
1 lr = LogisticRegression(maxIter=20, regParam=0.001)
----> 2 logistic_model = lr.fit(RDD4)
3
4 logistic_model.show()
c:\users\user\appdata\local\programs\python\python39\lib\site-packages\pyspark\ml\base.py in fit(self, dataset, params)
159 return self.copy(params)._fit(dataset)
160 else:
--> 161 return self._fit(dataset)
162 else:
163 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
c:\users\user\appdata\local\programs\python\python39\lib\site-packages\pyspark\ml\wrapper.py in _fit(self, dataset)
333
334 def _fit(self, dataset):
--> 335 java_model = self._fit_java(dataset)
336 model = self._create_model(java_model)
337 return self._copyValues(model)
c:\users\user\appdata\local\programs\python\python39\lib\site-packages\pyspark\ml\wrapper.py in _fit_java(self, dataset)
330 """
331 self._transfer_params_to_java()
--> 332 return self._java_obj.fit(dataset._jdf)
333
334 def _fit(self, dataset):
AttributeError: 'PipelinedRDD' object has no attribute '_jdf'

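Try calling .fit() on the actual DataFrame, in this case statistics_teams, instead of on the RDD. The pyspark.ml LogisticRegression estimator works with DataFrames, not RDDs: as the traceback shows, fit() accesses dataset._jdf, an attribute that an RDD does not have.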

Related

Error using sci-kit learn MLPClassifier: 'str' object has no attribute 'decode'

I am creating, tuning, and fitting various sci-kit learn models for a classification problem. The structure of the code below works fine for all other methods (SVM, Logistic Regression, Random Forest, Decision Tree, etc.)
When running the MLPClassifier() and its hyperparameter tuning, I get the error message originating from the last line where I fit:
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_8160/4102235488.py in <module>
19
20 reverse_mlp_en_clf = GridSearchCV(reverse_mlp_en, reverse_mlp_en_hyperparameters, cv = 5, verbose = 1, n_jobs = 5, scoring = 'f1')
---> 21 reverse_mlp_en_best_model = reverse_mlp_en_clf.fit(X_en, y_en)
~\AppData\Roaming\Python\Python38\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
737 refit_start_time = time.time()
738 if y is not None:
--> 739 self.best_estimator_.fit(X, y, **fit_params)
740 else:
741 self.best_estimator_.fit(X, **fit_params)
~\AppData\Roaming\Python\Python38\site-packages\sklearn\neural_network\_multilayer_perceptron.py in fit(self, X, y)
992 self : returns a trained MLP model.
993 """
--> 994 return self._fit(X, y, incremental=(self.warm_start and
995 hasattr(self, "classes_")))
996
~\AppData\Roaming\Python\Python38\site-packages\sklearn\neural_network\_multilayer_perceptron.py in _fit(self, X, y, incremental)
372 # Run the LBFGS solver
373 elif self.solver == 'lbfgs':
--> 374 self._fit_lbfgs(X, y, activations, deltas, coef_grads,
375 intercept_grads, layer_units)
376 return self
~\AppData\Roaming\Python\Python38\site-packages\sklearn\neural_network\_multilayer_perceptron.py in _fit_lbfgs(self, X, y, activations, deltas, coef_grads, intercept_grads, layer_units)
468 },
469 args=(X, y, activations, deltas, coef_grads, intercept_grads))
--> 470 self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
471 self.loss_ = opt_res.fun
472 self._unpack(opt_res.x)
~\AppData\Roaming\Python\Python38\site-packages\sklearn\utils\optimize.py in _check_optimize_result(solver, result, max_iter, extra_warning_msg)
241 " https://scikit-learn.org/stable/modules/"
242 "preprocessing.html"
--> 243 ).format(solver, result.status, result.message.decode("latin1"))
244 if extra_warning_msg is not None:
245 warning_msg += "\n" + extra_warning_msg
AttributeError: 'str' object has no attribute 'decode'
Code is as follows:
# Reverse hyperparameter tuning - MLP English
# Create model
reverse_mlp_en = MLPClassifier()
# Define parameters, store in dictionary
reverse_mlp_en_hidden_layer_sizes = [(50,),(100,),(150,), (200,)]
reverse_mlp_en_activation = ['logistic', 'tanh', 'relu']
reverse_mlp_en_alpha = [0.0001, 0.05]
reverse_mlp_en_learning_rate = ['constant','adaptive']
reverse_mlp_en_hyperparameters = dict(hidden_layer_sizes = reverse_mlp_en_hidden_layer_sizes, activation = reverse_mlp_en_activation
, solver = reverse_mlp_en_solver, alpha = reverse_mlp_en_alpha, learning_rate =reverse_mlp_en_learning_rate )
# Get model with the best parameters
reverse_mlp_en_clf = GridSearchCV(reverse_mlp_en, reverse_mlp_en_hyperparameters, cv = 5, verbose = 1, n_jobs = 5, scoring = 'f1')
reverse_mlp_en_best_model = reverse_mlp_en_clf.fit(X_en, y_en)
Libraries are up to date if I am not mistaken.
If anyone has an idea that would be greatly appreciated - I just find it weird how:
The same code, but using different models, works fine and
how the error is only shown after fitting everything (which is taking 40 minutes or so)

huggingface transformer models: KeyError: 'input_ids' message at beginning of BERT model training

Using the Hugging Face transformers library, I am encountering a bug in the final step when I go to fine-tune the BERT language model for a masked language modelling task. I am looking to fine-tune it on an in-domain finance corpus that the model has not been trained on yet. However, I get the following error message when I call the model to train: KeyError: 'input_ids'. Provided below are the steps I took and the code I used. Any insights are appreciated!
First, I created a dataset object from a pandas dataframe that was in turn created from a csv file with 1 column of many rows of text:
unlabelled_dataset = Dataset.from_pandas(unlabelled)
Second, I then tokenized the dataset with the following code:
tokenizerBERT = BertTokenizerFast.from_pretrained('bert-base-uncased') #BERT model tokenization & check
tokenizerBERT(unlabelled_dataset['paragraphs'], padding=True, truncation=True)
tokenizerBERT.save_pretrained('tokenizers/pytorch/labelled/BERT/')
Third, I created a data collator as instructed:
data_collator_BERT = DataCollatorForLanguageModeling(tokenizer=tokenizerBERT, mlm=True, mlm_probability=0.15)
Next, I select my model from_pretrained to get the benefits of transfer learning:
model_BERT = BertForMaskedLM.from_pretrained("bert-base-uncased")
Next, I passed my training args to the transformer trainer and initialize:
training_args_BERT = TrainingArguments(
output_dir="./BERT",
num_train_epochs=10,
evaluation_strategy='steps',
warmup_steps=10000,
weight_decay=0.01,
per_gpu_train_batch_size=64,
)
trainer_BERT = Trainer(
model=model_BERT,
args=training_args_BERT,
data_collator=data_collator_BERT,
train_dataset=unlabelled_dataset,
)
Last, I call the model to train and get the error KeyError: 'input_ids'
trainer_BERT.train()
Any insights on how to debug this approach to training the model?
Provided below is the exact error message received:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-9-83b7063dea0b> in <module>
----> 1 trainer_BERT.train()
2 trainer.save_model("./models/royalBERT")
~/anaconda3/lib/python3.7/site-packages/transformers/trainer.py in train(self, model_path, trial)
755 self.control = self.callback_handler.on_epoch_begin(self.args, self.state, self.control)
756
--> 757 for step, inputs in enumerate(epoch_iterator):
758
759 # Skip past any already trained steps if resuming training
~/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __next__(self)
361
362 def __next__(self):
--> 363 data = self._next_data()
364 self._num_yielded += 1
365 if self._dataset_kind == _DatasetKind.Iterable and \
~/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _next_data(self)
401 def _next_data(self):
402 index = self._next_index() # may raise StopIteration
--> 403 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
404 if self._pin_memory:
405 data = _utils.pin_memory.pin_memory(data)
~/anaconda3/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
45 else:
46 data = self.dataset[possibly_batched_index]
---> 47 return self.collate_fn(data)
~/anaconda3/lib/python3.7/site-packages/transformers/data/data_collator.py in __call__(self, examples)
193 ) -> Dict[str, torch.Tensor]:
194 if isinstance(examples[0], (dict, BatchEncoding)):
--> 195 examples = [e["input_ids"] for e in examples]
196 batch = self._tensorize_batch(examples)
197 if self.mlm:
~/anaconda3/lib/python3.7/site-packages/transformers/data/data_collator.py in <listcomp>(.0)
193 ) -> Dict[str, torch.Tensor]:
194 if isinstance(examples[0], (dict, BatchEncoding)):
--> 195 examples = [e["input_ids"] for e in examples]
196 batch = self._tensorize_batch(examples)
197 if self.mlm:
KeyError: 'input_ids'
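Although the tokenizer is passed to the DataCollator, the collator does not tokenize raw text; as the traceback shows, it expects each example to already contain an input_ids entry.
Hence, we need to tokenize the data before passing it to the Trainer, for example: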
train_dataset = tokenizer.encode(unlabeled_data, add_special_tokens=True, return_tensors="pt")
trainer_BERT = Trainer(
model=model_BERT,
args=training_args_BERT,
data_collator=data_collator_BERT,
train_dataset=train_dataset,
)

DNNClassifier: 'DataFrame' object has no attribute 'dtype'

I am trying to run a TensorFlow DNNClassifier model with some data that I read from a CSV file. Even though I converted the datatype of each column to float32, I keep getting the error 'DataFrame' object has no attribute 'dtype'. I would really appreciate it if you could help me.
Dataformat:
27 columns, 23 input, 4 classes
Thank you
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
factors = pd.read_csv('xxx.csv')
#Formatting data to float32
factors['1'] = factors['1'].astype('float32')
factors['2'] = factors['2'].astype('float32')
...
factors['27'] = factors['27'].astype('float32')
#Definition of in- and output
feat_data = factors[['1', '2', ... '23']]
labels = factors[['24', '25','26', '27']]
#Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feat_data,labels, test_size=0.3, random_state=101)
from sklearn.preprocessing import MinMaxScalerscaler = MinMaxScaler()
scaled_x_train = scaler.fit_transform(X_train) scaled_x_test = scaler.transform(X_test)
#Model
from tensorflow import estimator
feat_cols = [tf.feature_column.numeric_column('x', shape [23],dtype=tf.float32)]
deep_model = estimator.DNNClassifier(hidden_units=[23,23,23],
feature_columns=feat_cols,
n_classes=4, optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.01) )
input_fn = estimator.inputs.numpy_input_fn(x {'x':scaled_x_train},y=y_train,shuffle=True,batch_size=10,num_epochs=5)
deep_model.train(input_fn=input_fn,steps=50)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-169-9b2e050e4e40> in <module>()
----> 1 deep_model.train(input_fn=input_fn,steps=50)
~\Anaconda\envs\tfdeeplearning\lib\site- packages\tensorflow\python\estimator\estimator.py in train(self, input_fn, hooks, steps, max_steps)
239 hooks.append(training.StopAtStepHook(steps, max_steps))
240
--> 241 loss = self._train_model(input_fn=input_fn, hooks=hooks)
242 logging.info('Loss for final step: %s.', loss)
243 return self
~\Anaconda\envs\tfdeeplearning\lib\site-packages\tensorflow\python\estimator\estimator.py in _train_model(self, input_fn, hooks)
626 global_step_tensor = self._create_and_assert_global_step(g)
627 features, labels = self._get_features_and_labels_from_input_fn(
--> 628 input_fn, model_fn_lib.ModeKeys.TRAIN)
629 estimator_spec = self._call_model_fn(features, labels,
630 model_fn_lib.ModeKeys.TRAIN)
~\Anaconda\envs\tfdeeplearning\lib\site-packages\tensorflow\python\estimator\estimator.py in _get_features_and_labels_from_input_fn(self, input_fn, mode)
497
498 def _get_features_and_labels_from_input_fn(self, input_fn, mode):
--> 499 result = self._call_input_fn(input_fn, mode)
500 if isinstance(result, (list, tuple)):
501 if len(result) != 2:
~\Anaconda\envs\tfdeeplearning\lib\site-packages\tensorflow\python\estimator\estimator.py in _call_input_fn(***failed resolving arguments***)
583 kwargs['config'] = self.config
584 with ops.device('/cpu:0'):
--> 585 return input_fn(**kwargs)
586
587 def _call_model_fn(self, features, labels, mode):
~\Anaconda\envs\tfdeeplearning\lib\site-packages\tensorflow\python\estimator\inputs\numpy_io.py in input_fn()
122 num_threads=num_threads,
123 enqueue_size=batch_size,
--> 124 num_epochs=num_epochs)
125
126 features = (queue.dequeue_many(batch_size) if num_epochs is None
~\Anaconda\envs\tfdeeplearning\lib\site-packages\tensorflow\python\estimator\inputs\queues\feeding_functions.py in _enqueue_data(data, capacity, shuffle, min_after_dequeue, num_threads, seed, name, enqueue_size, num_epochs)
315 elif isinstance(data, collections.OrderedDict):
316 types = [dtypes.int64] + [
--> 317 dtypes.as_dtype(col.dtype) for col in data.values()
318 ]
319 queue_shapes = [()] + [col.shape[1:] for col in data.values()]
~\Anaconda\envs\tfdeeplearning\lib\site-packages\tensorflow\python\estimator\inputs\queues\feeding_functions.py in <listcomp>(.0)
315 elif isinstance(data, collections.OrderedDict):
316 types = [dtypes.int64] + [
--> 317 dtypes.as_dtype(col.dtype) for col in data.values()
318 ]
319 queue_shapes = [()] + [col.shape[1:] for col in data.values()]
~\Anaconda\envs\tfdeeplearning\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
3079 if name in self._info_axis:
3080 return self[name]
-> 3081 return object.__getattribute__(self, name)
3082
3083 def __setattr__(self, name, value):
AttributeError: 'DataFrame' object has no attribute 'dtype'
TensorFlow assumes that you pass NumPy arrays (which have a dtype attribute), not pandas DataFrames (which do not). So you should pass df.values instead of df to TensorFlow functions; in the code above, that means passing y=y_train.values to numpy_input_fn.

Cannot cast array data from dtype('float64') to dtype('S32') according to the rule 'safe' ON Model_result=model.predict(classify)

I'm trying to run a python program but get an error. This code is to predict the values and classify it. It works on a GradientBoostingClassifier model but not with a LinearSVC() class model.
This code is used to create the model for the output class:
data_file=pd.read_csv('C:\\Class.csv')
testx = data_file.iloc[0:, :5]
testy= data_file.iloc[0:, 5]
model = make_pipeline(LinearSVC())
model.fit(testx,testy)
import pickle
filename = 'C:\\Model\\svm.sav'
pickle.dump(model, open(filename, 'wb'))
The code below raises an error when using the LinearSVC output class model:
import pickle
import numpy as np
time=0.0
print "\n"
time = input(" \t Please Enter Time in Mili Seconds (Ms): ")
print "\n"
#Data-Read
filename1 = 'c:\\Model\\HR_model.sav'
filename2 = 'c:\\Model\\SpO2_model.sav'
filename3 = 'c:\\Model\\SYS_model.sav'
filename4 = 'c:\\Model\\Dia_model.sav'
filename5 = 'c:\\Model\\Mean_model.sav'
filename6 = 'c:\\Model\\svm.sav'
classify=list()
HR = pickle.load(open(filename1, 'rb'))
hr_result=HR.predict(time)
classify.append(str(float(hr_result)))
SPO2 = pickle.load(open(filename2, 'rb'))
SPO2_result=SPO2.predict(time)
classify.append(str(float(SPO2_result)))
Sys = pickle.load(open(filename3, 'rb'))
Sys_result=Sys.predict(time)
classify.append(str(float(Sys_result)))
DIA = pickle.load(open(filename4, 'rb'))
DIA_result=DIA.predict(time)
classify.append(str(float(DIA_result)))
MEan = pickle.load(open(filename5, 'rb'))
MEan_result=MEan.predict(time)
classify.append(str(float(MEan_result)))
#Classifier train Model
model = pickle.load(open(filename6, 'rb'))
Model_result=model.predict(classify)
print "\n"
print "Input Features:"
print classify
print "\n"
print "Output Class: "
print Model_result
Upon running the program, I get the following error:
TypeError Traceback (most recent call
last) C:\ForcastPredictFromModels.py in () 50 #CLassifier
train Model 51 model = pickle.load(open(filename6, 'rb')) ---> 52
Model_result=model.predict(classify) 53 print "\n" 54
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-
packages\sklearn\utils\metaestimators.pyc in (*args, **kwargs)
52 53 # lambda, but not partial, allows help() to work with
update_wrapper ---> 54 out = lambda *args, **kwargs:
self.fn(obj, *args, **kwargs) 55 # update the docstring of
the returned function 56 update_wrapper(out, self.fn)
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-
packages\sklearn\pipeline.pyc in predict(self, X) 325 if
transform is not None: 326 Xt =
transform.transform(Xt)
--> 327 return self.steps[-1][-1].predict(Xt) 328 329 #if_delegate_has_method(delegate='_final_estimator')
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\linear_model\base.pyc
in predict(self, X) 334 Predicted class label per sample.
335 """
--> 336 scores = self.decision_function(X) 337 if len(scores.shape) == 1: 338 indices = (scores >
0).astype(np.int)
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\linear_model\base.pyc
in decision_function(self, X) 318 319 scores =
safe_sparse_dot(X, self.coef_.T,
--> 320 dense_output=True) + self.intercept_ 321 return scores.ravel() if scores.shape[1]
== 1 else scores 322
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\utils\extmath.pyc
in safe_sparse_dot(a, b, dense_output) 187 return ret 188
else:
--> 189 return fast_dot(a, b) 190 191
TypeError: Cannot cast array data from dtype('float64') to
dtype('S32') according to the rule 'safe'

H2O Python API: retrieve best models from GridSearch

I am performing a GridSearch with H2O using the Python API using the following code,
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid import H2OGridSearch
hyper_parameters = {'ntrees':[10, 50, 100, 200], 'max_depth':[5, 10, 15, 20, 25], 'balance_classes':[True, False]}
search_criteria = {
"strategy": "RandomDiscrete",
"max_runtime_secs": 600,
"max_models": 30,
"stopping_metric": 'AUTO',
"stopping_tolerance": 0.0001,
'seed': 42
}
grid_search = H2OGridSearch(H2ORandomForestEstimator, hyper_parameters, search_criteria=search_criteria)
grid_search.train(x=events_names_x,
y="total_rsvps",
training_frame=train,
validation_frame=test)
Once run I want to print the models and predict in order of AUC,
grid_search.sort_by('auc', False)
I get the following error,
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-272-b250bf2b838e> in <module>()
----> 1 grid_search.sort_by('auc', False)
/Users/stereo/.pyenv/versions/3.5.2/lib/python3.5/site-packages/h2o/grid/grid_search.py in sort_by(self, metric, increasing)
663
664 if metric[-1] != ')': metric += '()'
--> 665 c_values = [list(x) for x in zip(*sorted(eval('self.' + metric + '.items()'), key=lambda k_v: k_v[1]))]
666 c_values.insert(1, [self.get_hyperparams(model_id, display=False) for model_id in c_values[0]])
667 if not increasing:
/Users/stereo/.pyenv/versions/3.5.2/lib/python3.5/site-packages/h2o/grid/grid_search.py in <module>()
/Users/stereo/.pyenv/versions/3.5.2/lib/python3.5/site-packages/h2o/grid/grid_search.py in auc(self, train, valid, xval)
606 :return: The AUC.
607 """
--> 608 return {model.model_id: model.auc(train, valid, xval) for model in self.models}
609
610 def aic(self, train=False, valid=False, xval=False):
/Users/stereo/.pyenv/versions/3.5.2/lib/python3.5/site-packages/h2o/grid/grid_search.py in <dictcomp>(.0)
606 :return: The AUC.
607 """
--> 608 return {model.model_id: model.auc(train, valid, xval) for model in self.models}
609
610 def aic(self, train=False, valid=False, xval=False):
/Users/stereo/.pyenv/versions/3.5.2/lib/python3.5/site-packages/h2o/model/model_base.py in auc(self, train, valid, xval)
669 tm = ModelBase._get_metrics(self, train, valid, xval)
670 m = {}
--> 671 for k, v in viewitems(tm): m[k] = None if v is None else v.auc()
672 return list(m.values())[0] if len(m) == 1 else m
673
/Users/stereo/.pyenv/versions/3.5.2/lib/python3.5/site-packages/h2o/model/metrics_base.py in auc(self)
158 :return: Retrieve the AUC for this set of metrics.
159 """
--> 160 return self._metric_json['AUC']
161
162 def aic(self):
KeyError: 'AUC'
Any advice on how I can:
print the models in order of performance
forecast with the model that has the highest AUC
what you need is
sorted_grid = grid_search.get_grid(sort_by='auc',decreasing=True)
print(sorted_grid)
you can change decreasing to False if you would prefer

Categories