What should I do to fix my scikit-learn program? - python

A snippet of code involving RandomForestClassifier from the Python machine learning library scikit-learn.
I am trying to give weight to different classes using the class_weight option in scikit-learn's RandomForestClassifier. Below is my code snippet, followed by the error that I am getting:
print 'Training...'
forest = RandomForestClassifier(n_estimators=500,class_weight= {0:1,1:1,2:1,3:1,4:1,5:1,6:1,7:4})
forest = forest.fit( train_data[0::,1::], train_data[0::,0] )
print 'Predicting...'
output = forest.predict(test_data).astype(int)
predictions_file = open("myfirstforest.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(ids, output))
predictions_file.close()
print 'Done.'
And I am getting the following error:
Training...
IndexError Traceback (most recent call last)
<ipython-input-20-122f2e5a0d3b> in <module>()
84 print 'Training...'
85 forest = RandomForestClassifier(n_estimators=500,class_weight={0:1,1:1,2:1,3:1,4:1,5:1,6:1,7:4})
---> 86 forest = forest.fit( train_data[0::,1::], train_data[0::,0] )
87
88 print 'Predicting...'
/home/rpota/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in fit(self, X, y, sample_weight)
216 self.n_outputs_ = y.shape[1]
217
--> 218 y, expanded_class_weight = self._validate_y_class_weight(y)
219
220 if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
/home/rpota/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in _validate_y_class_weight(self, y)
433 class_weight = self.class_weight
434 expanded_class_weight = compute_sample_weight(class_weight,
--> 435 y_original)
436
437 return y, expanded_class_weight
/home/rpota/anaconda/lib/python2.7/site-packages/sklearn/utils/class_weight.pyc in compute_sample_weight(class_weight, y, indices)
150 weight_k = compute_class_weight(class_weight_k,
151 classes_full,
--> 152 y_full)
153
154 weight_k = weight_k[np.searchsorted(classes_full, y_full)]
/home/rpota/anaconda/lib/python2.7/site-packages/sklearn/utils/class_weight.pyc in compute_class_weight(class_weight, classes, y)
58 for c in class_weight:
59 i = np.searchsorted(classes, c)
---> 60 if classes[i] != c:
61 raise ValueError("Class label %d not present." % c)
62 else:
IndexError: index 2 is out of bounds for axis 0 with size 2
Please help!
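It looks like the traceback itself points at the cause: compute_class_weight looks up every key of class_weight in the sorted array of classes actually present in y, and that array has size 2, so the first column of train_data appears to contain only two distinct labels while class_weight names eight (0-7). A minimal sketch of how one might confirm and fix this; the 0/1 labels used below are only an assumption for illustration:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Check which labels actually occur in the target column;
# every key passed to class_weight must appear in this array.
y = train_data[0::, 0]
print(np.unique(y))

# If only labels 0 and 1 are present (an assumption here), restrict the
# dict to those classes instead of listing classes 0-7.
forest = RandomForestClassifier(n_estimators=500, class_weight={0: 1, 1: 4})
forest = forest.fit(train_data[0::, 1::], y)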

Related

AttributeError: 'PipelinedRDD' object has no attribute '_jdf'

I am fairly new to PySpark. I am getting an attribute error while trying to run a logistic regression on MinMaxScaler vectors to get the probability of a likely match-up between the data points.
number_games = df2.filter(df2.GAME_ID > 22000000).filter(
    df2.GAME_ID < 40000000).groupby("TEAM_ABBREVIATION").agg(
    (F.sum("FGM") / F.countDistinct("GAME_ID")).alias('Points_Per_Game'))
vectorassembler = VectorAssembler(inputCols=["Points_Per_Game"],
                                  outputCol="Performance")
scaler = MinMaxScaler(inputCol="Performance", outputCol="Output")
vectors = vectorassembler.transform(number_games)
scaler_model = scaler.fit(vectors)
scaler_data = scaler_model.transform(vectors)
statistics_teams = scaler_data.select('TEAM_ABBREVIATION',
                                      'Output')  # teams match up against one another
statistics_teams
RDD2 = sc.parallelize(statistics_teams.collect())
# RDD4 = RDD2.map(lambda x: x.split())  # even as a PipelinedRDD I get the same attribute error
lr = LogisticRegression(maxIter=20, regParam=0.001)
logistic_model = lr.fit(RDD2)
logistic_model.show()
The error returned is:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-46-3c0eb05824a8> in <module>
1 lr = LogisticRegression(maxIter=20, regParam=0.001)
----> 2 logistic_model = lr.fit(RDD4)
3
4 logistic_model.show()
c:\users\user\appdata\local\programs\python\python39\lib\site-packages\pyspark\ml\base.py in fit(self, dataset, params)
159 return self.copy(params)._fit(dataset)
160 else:
--> 161 return self._fit(dataset)
162 else:
163 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
c:\users\user\appdata\local\programs\python\python39\lib\site-packages\pyspark\ml\wrapper.py in _fit(self, dataset)
333
334 def _fit(self, dataset):
--> 335 java_model = self._fit_java(dataset)
336 model = self._create_model(java_model)
337 return self._copyValues(model)
c:\users\user\appdata\local\programs\python\python39\lib\site-packages\pyspark\ml\wrapper.py in _fit_java(self, dataset)
330 """
331 self._transfer_params_to_java()
--> 332 return self._java_obj.fit(dataset._jdf)
333
334 def _fit(self, dataset):
AttributeError: 'PipelinedRDD' object has no attribute '_jdf'
Can you try to call .fit() on the actual data frame, in this case statistics_teams? I think that LogisticRegression works with data frames and not RDDs.
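For example, a minimal sketch of fitting on the DataFrame directly; this assumes statistics_teams also carries a numeric label column (a hypothetical "label" here), since pyspark.ml estimators need both a features column and a label column:
from pyspark.ml.classification import LogisticRegression

# pyspark.ml estimators take a DataFrame, not an RDD, so fit on
# statistics_teams itself; "label" is a hypothetical numeric label column.
lr = LogisticRegression(featuresCol="Output", labelCol="label",
                        maxIter=20, regParam=0.001)
logistic_model = lr.fit(statistics_teams)

# Inspect the fitted model (a LogisticRegressionModel has no .show()):
print(logistic_model.coefficients, logistic_model.intercept)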

Fine-tuning BERT on the next sentence prediction task

I am trying to fine-tune BERT using the Hugging Face library on the next sentence prediction task. I looked at the tutorial and I am trying to use DataCollatorForNextSentencePrediction and TextDatasetForNextSentencePrediction. When I use them I get the following error. I have provided my code below.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-18-7678758b2c9c> in <module>()
56 train(bert_model,bert_tokenizer,train_data_set_path)
57 #prepare_data_set(bert_tokenizer)
---> 58 main()
9 frames
<ipython-input-18-7678758b2c9c> in main()
54 bert_model = BertForNextSentencePrediction.from_pretrained("bert-base-cased")
55 train_data_set_path = "/content/drive/My Drive/next_sentence/line_data_set_file.txt"
---> 56 train(bert_model,bert_tokenizer,train_data_set_path)
57 #prepare_data_set(bert_tokenizer)
58 main()
<ipython-input-18-7678758b2c9c> in train(bert_model, bert_tokenizer, path, eval_path)
47
48 )
---> 49 trainer.train()
50 trainer.save_model(out_dir)
51 def main():
/usr/local/lib/python3.6/dist-packages/transformers/trainer.py in train(self, model_path, trial)
697
698 epoch_pbar = tqdm(epoch_iterator, desc="Iteration", disable=disable_tqdm)
--> 699 for step, inputs in enumerate(epoch_iterator):
700
701 # Skip past any already trained steps if resuming training
/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in __next__(self)
361
362 def __next__(self):
--> 363 data = self._next_data()
364 self._num_yielded += 1
365 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
401 def _next_data(self):
402 index = self._next_index() # may raise StopIteration
--> 403 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
404 if self._pin_memory:
405 data = _utils.pin_memory.pin_memory(data)
/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
45 else:
46 data = self.dataset[possibly_batched_index]
---> 47 return self.collate_fn(data)
/usr/local/lib/python3.6/dist-packages/transformers/data/data_collator.py in __call__(self, examples)
356 for i, doc in enumerate(examples):
357 input_id, segment_id, attention_mask, label = self.create_examples_from_document(doc, i, examples)
--> 358 input_ids.extend(input_id)
359 segment_ids.extend(segment_id)
360 attention_masks.extend(attention_mask)
/usr/local/lib/python3.6/dist-packages/transformers/data/data_collator.py in create_examples_from_document(self, document, doc_index, examples)
444 random_document = examples[random_document_index]
445 random_start = random.randint(0, len(random_document) - 1)
--> 446 for j in range(random_start, len(random_document)):
447 tokens_b.extend(random_document[j])
448 if len(tokens_b) >= target_b_length:
/usr/lib/python3.6/random.py in randint(self, a, b)
219 """
220
--> 221 return self.randrange(a, b+1)
222
223 def _randbelow(self, n, int=int, maxsize=1<<BPF, type=type,
/usr/lib/python3.6/random.py in randrange(self, start, stop, step, _int)
197 return istart + self._randbelow(width)
198 if step == 1:
--> 199 raise ValueError("empty range for randrange() (%d,%d, %d)" % (istart, istop, width))
200
201 # Non-unit step argument supplied.
ValueError: empty range for randrange() (0,0, 0)
# imports for the classes used below (transformers 3.x, as in the traceback)
from transformers import (BertTokenizer, BertForNextSentencePrediction,
                          DataCollatorForNextSentencePrediction,
                          TextDatasetForNextSentencePrediction,
                          Trainer, TrainingArguments)

def train(bert_model, bert_tokenizer, path, eval_path=None):
    out_dir = "/content/drive/My Drive/next_sentence/"
    training_args = TrainingArguments(
        output_dir=out_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=30,
        save_steps=10000,
        save_total_limit=2,
    )
    data_collator = DataCollatorForNextSentencePrediction(
        tokenizer=bert_tokenizer, mlm=False, block_size=512, nsp_probability=0.5
    )
    dataset = TextDatasetForNextSentencePrediction(
        tokenizer=bert_tokenizer,
        file_path=path,
        block_size=512,
    )
    trainer = Trainer(
        model=bert_model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )
    trainer.train()
    trainer.save_model(out_dir)

def main():
    print("Running main")
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    bert_model = BertForNextSentencePrediction.from_pretrained("bert-base-cased")
    train_data_set_path = "/content/drive/My Drive/next_sentence/line_data_set_file.txt"
    train(bert_model, bert_tokenizer, train_data_set_path)
    # prepare_data_set(bert_tokenizer)

main()
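The traceback ends in random.randint(0, len(random_document) - 1) with an empty range of (0, 0, 0), i.e. a randomly picked document of length zero. Assuming the usual next-sentence-prediction file format of one sentence per line with blank lines separating documents, this usually points to an empty document in the input file (for example two consecutive blank lines, or a trailing block of blank lines). A small sketch to check for that, under that assumption about the file format:
# Sketch only: counts empty documents, assuming documents in
# line_data_set_file.txt are separated by blank lines.
with open(train_data_set_path, encoding="utf-8") as f:
    documents = f.read().split("\n\n")

empty = [i for i, doc in enumerate(documents) if not doc.strip()]
print(len(documents), "documents,", len(empty), "empty:", empty[:10])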

Python XGB Labels not reading the correct label size

I'm getting an error when trying to run a simple XGBoost model. The error is mentioned in the question below, but the solution there does not help.
XGBoost Error info.labels.size() != 0U (0 vs. 0)
The error is as follows:
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-748-7f7917b57181> in <module>
11 param = {'max_depth':3, 'eta':1, 'objective':'binary:logistic'}
12
---> 13 bst = xgb.train(param, dtrain, num_round)
14 preds = bst.predict(dtest)
~\AppData\Local\Continuum\anaconda3\envs\torch_env\lib\site-packages\xgboost\training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks)
207 evals=evals,
208 obj=obj, feval=feval,
--> 209 xgb_model=xgb_model, callbacks=callbacks)
210
211
~\AppData\Local\Continuum\anaconda3\envs\torch_env\lib\site-packages\xgboost\training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
72 # Skip the first update if it is a recovery step.
73 if version % 2 == 0:
---> 74 bst.update(dtrain, i, obj)
75 bst.save_rabit_checkpoint()
76 version += 1
~\AppData\Local\Continuum\anaconda3\envs\torch_env\lib\site-packages\xgboost\core.py in update(self, dtrain, iteration, fobj)
1247 _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
1248 ctypes.c_int(iteration),
-> 1249 dtrain.handle))
1250 else:
1251 pred = self.predict(dtrain, training=True)
~\AppData\Local\Continuum\anaconda3\envs\torch_env\lib\site-packages\xgboost\core.py in _check_call(ret)
187 """
188 if ret != 0:
--> 189 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
190
191
XGBoostError: [15:11:21] C:/Users/Administrator/workspace/xgboost-win64_release_1.0.0/src/objective/regression_obj.cu:60: Check failed: preds.Size() == info.labels_.Size() (3843 vs. 0) : labels are not correctly providedpreds.size=3843, label.size=0
So the above points to a label size issue, yet when I run the following code, the sizes of my training data (train_xgb) and my label data (test_xgb) do match:
train_xgb = train_dummies.iloc[:, 2:]
test_xgb = train_dummies.iloc[:, 2]
print(train_xgb.shape, test_xgb.shape, type(train_xgb), type(test_xgb))
dtrain = xgb.DMatrix(train_xgb)
dtest = xgb.DMatrix(test_xgb)
Here is the result of the print statement above:
(3843, 3143) (3843,) <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
I've tried changing the labels to their own dataframe and numpy array with no change in the error.
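One thing stands out in the snippet above: the labels are never attached to the DMatrix, which would explain label.size=0 in the error. A minimal sketch of attaching them, assuming test_xgb holds the training labels as in the snippet:
import xgboost as xgb

# Pass the labels to the training DMatrix via the label argument;
# otherwise the booster sees preds.size=3843 but label.size=0.
dtrain = xgb.DMatrix(train_xgb, label=test_xgb)
param = {'max_depth': 3, 'eta': 1, 'objective': 'binary:logistic'}
bst = xgb.train(param, dtrain, num_boost_round=10)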

Cannot cast array data from dtype('float64') to dtype('S32') according to the rule 'safe' ON Model_result=model.predict(classify)

I'm trying to run a Python program but get an error. The code predicts values and classifies them. It works with a GradientBoostingClassifier model but not with a LinearSVC() model.
This code is used to create the model for the output class:
data_file=pd.read_csv('C:\\Class.csv')
testx = data_file.iloc[0:, :5]
testy= data_file.iloc[0:, 5]
model = make_pipeline(LinearSVC())
model.fit(testx,testy)
import pickle
filename = 'C:\\Model\\svm.sav'
pickle.dump(model, open(filename, 'wb'))
See the code below, which raises an error when using the LinearSVC output-class model:
import pickle
import numpy as np
time=0.0
print "\n"
time = input(" \t Please Enter Time in Mili Seconds (Ms): ")
print "\n"
#Data-Read
filename1 = 'c:\\Model\\HR_model.sav'
filename2 = 'c:\\Model\\SpO2_model.sav'
filename3 = 'c:\\Model\\SYS_model.sav'
filename4 = 'c:\\Model\\Dia_model.sav'
filename5 = 'c:\\Model\\Mean_model.sav'
filename6 = 'c:\\Model\\svm.sav'
classify=list()
HR = pickle.load(open(filename1, 'rb'))
hr_result=HR.predict(time)
classify.append(str(float(hr_result)))
SPO2 = pickle.load(open(filename2, 'rb'))
SPO2_result=SPO2.predict(time)
classify.append(str(float(SPO2_result)))
Sys = pickle.load(open(filename3, 'rb'))
Sys_result=Sys.predict(time)
classify.append(str(float(Sys_result)))
DIA = pickle.load(open(filename4, 'rb'))
DIA_result=DIA.predict(time)
classify.append(str(float(DIA_result)))
MEan = pickle.load(open(filename5, 'rb'))
MEan_result=MEan.predict(time)
classify.append(str(float(MEan_result)))
#Classifier train Model
model = pickle.load(open(filename6, 'rb'))
Model_result=model.predict(classify)
print "\n"
print "Input Features:"
print classify
print "\n"
print "Output Class: "
print Model_result
Upon running the program, I get the following error:
TypeError                                 Traceback (most recent call last)
C:\ForcastPredictFromModels.py in <module>()
     50 #CLassifier train Model
     51 model = pickle.load(open(filename6, 'rb'))
---> 52 Model_result=model.predict(classify)
     53 print "\n"
     54
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\utils\metaestimators.pyc in <lambda>(*args, **kwargs)
     52
     53     # lambda, but not partial, allows help() to work with update_wrapper
---> 54     out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
     55     # update the docstring of the returned function
     56     update_wrapper(out, self.fn)
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\pipeline.pyc in predict(self, X)
    325         if transform is not None:
    326             Xt = transform.transform(Xt)
--> 327         return self.steps[-1][-1].predict(Xt)
    328
    329     @if_delegate_has_method(delegate='_final_estimator')
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\linear_model\base.pyc in predict(self, X)
    334             Predicted class label per sample.
    335         """
--> 336         scores = self.decision_function(X)
    337         if len(scores.shape) == 1:
    338             indices = (scores > 0).astype(np.int)
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\linear_model\base.pyc in decision_function(self, X)
    318
    319         scores = safe_sparse_dot(X, self.coef_.T,
--> 320                                  dense_output=True) + self.intercept_
    321         return scores.ravel() if scores.shape[1] == 1 else scores
    322
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\utils\extmath.pyc in safe_sparse_dot(a, b, dense_output)
    187         return ret
    188     else:
--> 189         return fast_dot(a, b)
    190
    191
TypeError: Cannot cast array data from dtype('float64') to dtype('S32') according to the rule 'safe'
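The dtype('S32') in the error is the giveaway: classify is a list of strings (each value is appended as str(float(...))), so NumPy builds a byte-string array that cannot be multiplied with LinearSVC's float coefficients inside safe_sparse_dot. A minimal sketch of the idea, assuming the model expects the five numeric predictions as a single sample:
import numpy as np

# Keep the features numeric and shape them as one 2-D sample (1 x 5)
# instead of a flat list of strings.
classify = np.array([[float(hr_result), float(SPO2_result), float(Sys_result),
                      float(DIA_result), float(MEan_result)]])
Model_result = model.predict(classify)
print(Model_result)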

XGBoost Python: XGBoostError: we need weight to evaluate ams

I am trying to use the XGBoost package in Python.
I get this error when running this code:
import xgboost as xgb
data=np.array(traindata.drop('Category',axis=1))
labels=np.array(traindata['Category'].cat.codes)
dtrain = xgb.DMatrix( data, label=labels)
param = {'bst:max_depth':6, 'bst:eta':0.5, 'silent':1, 'objective':'multi:softprob' }
param['nthread'] = 4
param['eval_metric'] = 'mlogloss'
param['lambda'] = 1
param['num_class']=39
evallist = [(dtrain,'train')]
plst = param.items()
plst += [('eval_metric', 'ams#0')]
num_round = 10
bst = xgb.train( plst, dtrain, num_round, evallist )
bst.save_model('0001.model')
---------------------------------------------------------------------------
XGBoostError                              Traceback (most recent call last)
<ipython-input> in <module>()
     17
     18 num_round = 10
---> 19 bst = xgb.train( plst, dtrain, num_round, evallist )
     20
     21 bst.save_model('0001.model')
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/xgboost/training.pyc in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, learning_rates, xgb_model)
    122             nboost += 1
    123         if len(evals) != 0:
--> 124             bst_eval_set = bst.eval_set(evals, i, feval)
    125             if isinstance(bst_eval_set, STRING_TYPES):
    126                 msg = bst_eval_set
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/xgboost/core.pyc in eval_set(self, evals, iteration, feval)
    753             _check_call(_LIB.XGBoosterEvalOneIter(self.handle, iteration,
    754                                                   dmats, evnames, len(evals),
--> 755                                                   ctypes.byref(msg)))
    756             return msg.value
    757         else:
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/xgboost/core.pyc in _check_call(ret)
     95     """
     96     if ret != 0:
---> 97         raise XGBoostError(_LIB.XGBGetLastError())
     98
     99
XGBoostError: we need weight to evaluate ams
I don't see anything about it in the docs:
https://xgboost.readthedocs.io/en/latest/python/python_intro.html
http://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
When computing the ams metric, you need a weight for each labeled training point. You set the weights with the keyword argument weight when creating your DMatrix. A simple example:
weights = np.ones(len(labels))
dtrain = xgb.DMatrix(data, label = labels, weight = weights)
And an in-depth example from a recent Kaggle competition: https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-numpy.py.
