A snippet of code involving RandomForestClassifier from the Python machine learning library scikit-learn.
I am trying to give weights to different classes using the class_weight option of scikit-learn's RandomForestClassifier. Below is my code snippet, followed by the error that I am getting:
print 'Training...'
forest = RandomForestClassifier(n_estimators=500,class_weight= {0:1,1:1,2:1,3:1,4:1,5:1,6:1,7:4})
forest = forest.fit( train_data[0::,1::], train_data[0::,0] )
print 'Predicting...'
output = forest.predict(test_data).astype(int)
predictions_file = open("myfirstforest.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(ids, output))
predictions_file.close()
print 'Done.'
And I am getting the following error:
Training...
IndexError Traceback (most recent call last)
<ipython-input-20-122f2e5a0d3b> in <module>()
84 print 'Training...'
85 forest = RandomForestClassifier(n_estimators=500,class_weight={0:1,1:1,2:1,3:1,4:1,5:1,6:1,7:4})
---> 86 forest = forest.fit( train_data[0::,1::], train_data[0::,0] )
87
88 print 'Predicting...'
/home/rpota/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in fit(self, X, y, sample_weight)
216 self.n_outputs_ = y.shape[1]
217
--> 218 y, expanded_class_weight = self._validate_y_class_weight(y)
219
220 if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
/home/rpota/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in _validate_y_class_weight(self, y)
433 class_weight = self.class_weight
434 expanded_class_weight = compute_sample_weight(class_weight,
--> 435 y_original)
436
437 return y, expanded_class_weight
/home/rpota/anaconda/lib/python2.7/site-packages/sklearn/utils/class_weight.pyc in compute_sample_weight(class_weight, y, indices)
150 weight_k = compute_class_weight(class_weight_k,
151 classes_full,
--> 152 y_full)
153
154 weight_k = weight_k[np.searchsorted(classes_full, y_full)]
/home/rpota/anaconda/lib/python2.7/site-packages/sklearn/utils/class_weight.pyc in compute_class_weight(class_weight, classes, y)
58 for c in class_weight:
59 i = np.searchsorted(classes, c)
---> 60 if classes[i] != c:
61 raise ValueError("Class label %d not present." % c)
62 else:
IndexError: index 2 is out of bounds for axis 0 with size 2
Please help!
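The traceback itself points at the cause: compute_class_weight sees only two distinct classes in y (the IndexError says the class array has size 2), while the class_weight dict lists the eight keys 0-7, so the lookup for key 2 falls off the end of that array. Either train_data[0::,0] is not the label column you intended (with the "PassengerId","Survived" header it looks like a binary target), or class_weight should only contain labels that actually occur in y. A minimal sketch of the second option, assuming train_data is a NumPy array with the label in column 0:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Inspect which class labels are actually present in the target column.
y = train_data[:, 0]
print(np.unique(y))   # e.g. [0. 1.] -> only these keys are valid in class_weight

# Build the weight dict only from classes that really occur,
# up-weighting class 7 only if it exists in y.
class_weight = {int(c): (4 if int(c) == 7 else 1) for c in np.unique(y)}
forest = RandomForestClassifier(n_estimators=500, class_weight=class_weight)
forest = forest.fit(train_data[:, 1:], y)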
I am fairly new to PySpark. I am getting an AttributeError while trying to run a logistic regression on MinMaxScaler vectors to get the probability of a likely match-up between the data points.
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.classification import LogisticRegression

number_games = df2.filter(df2.GAME_ID > 22000000).filter(
    df2.GAME_ID < 40000000).groupby("TEAM_ABBREVIATION").agg(
    (F.sum("FGM") / F.countDistinct("GAME_ID")).alias('Points_Per_Game'))
vectorassembler = VectorAssembler(inputCols=["Points_Per_Game"],
                                  outputCol="Performance")
scaler = MinMaxScaler(inputCol="Performance", outputCol="Output")
vectors = vectorassembler.transform(number_games)
scaler_model = scaler.fit(vectors)
scaler_data = scaler_model.transform(vectors)
statistics_teams = scaler_data.select('TEAM_ABBREVIATION',
                                      'Output')  # teams match up against one another
statistics_teams

RDD2 = sc.parallelize(statistics_teams.collect())
# RDD4 = RDD2.map(lambda x: x.split())  # even as a PipelinedRDD I get the same attribute error
lr = LogisticRegression(maxIter=20, regParam=0.001)
logistic_model = lr.fit(RDD2)
logistic_model.show()
The error returned is:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-46-3c0eb05824a8> in <module>
1 lr = LogisticRegression(maxIter=20, regParam=0.001)
----> 2 logistic_model = lr.fit(RDD4)
3
4 logistic_model.show()
c:\users\user\appdata\local\programs\python\python39\lib\site-packages\pyspark\ml\base.py in fit(self, dataset, params)
159 return self.copy(params)._fit(dataset)
160 else:
--> 161 return self._fit(dataset)
162 else:
163 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
c:\users\user\appdata\local\programs\python\python39\lib\site-packages\pyspark\ml\wrapper.py in _fit(self, dataset)
333
334 def _fit(self, dataset):
--> 335 java_model = self._fit_java(dataset)
336 model = self._create_model(java_model)
337 return self._copyValues(model)
c:\users\user\appdata\local\programs\python\python39\lib\site-packages\pyspark\ml\wrapper.py in _fit_java(self, dataset)
330 """
331 self._transfer_params_to_java()
--> 332 return self._java_obj.fit(dataset._jdf)
333
334 def _fit(self, dataset):
AttributeError: 'PipelinedRDD' object has no attribute '_jdf'
Can you try calling .fit() on the actual DataFrame, in this case statistics_teams? I think LogisticRegression works with DataFrames, not RDDs.
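To make that concrete, a minimal sketch under the same assumption: the estimator is fit on the DataFrame itself, and LogisticRegression additionally needs a label column, which statistics_teams does not have yet (the "match_label" column below is hypothetical):
from pyspark.ml.classification import LogisticRegression

# Sketch only: pyspark.ml estimators are fit on DataFrames, not RDDs.
# "match_label" is a hypothetical numeric label column that would have to be added first.
lr = LogisticRegression(featuresCol="Output", labelCol="match_label",
                        maxIter=20, regParam=0.001)
logistic_model = lr.fit(statistics_teams)           # pass the DataFrame directly
logistic_model.transform(statistics_teams).show()   # models have no .show(); use transform()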
I am trying to fine-tune BERT using the Huggingface library on the next sentence prediction task. I looked at the tutorial and I am trying to use DataCollatorForNextSentencePrediction and TextDatasetForNextSentencePrediction. When I use them I get the following error. I have provided my code below.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-18-7678758b2c9c> in <module>()
56 train(bert_model,bert_tokenizer,train_data_set_path)
57 #prepare_data_set(bert_tokenizer)
---> 58 main()
9 frames
<ipython-input-18-7678758b2c9c> in main()
54 bert_model = BertForNextSentencePrediction.from_pretrained("bert-base-cased")
55 train_data_set_path = "/content/drive/My Drive/next_sentence/line_data_set_file.txt"
---> 56 train(bert_model,bert_tokenizer,train_data_set_path)
57 #prepare_data_set(bert_tokenizer)
58 main()
<ipython-input-18-7678758b2c9c> in train(bert_model, bert_tokenizer, path, eval_path)
47
48 )
---> 49 trainer.train()
50 trainer.save_model(out_dir)
51 def main():
/usr/local/lib/python3.6/dist-packages/transformers/trainer.py in train(self, model_path, trial)
697
698 epoch_pbar = tqdm(epoch_iterator, desc="Iteration", disable=disable_tqdm)
--> 699 for step, inputs in enumerate(epoch_iterator):
700
701 # Skip past any already trained steps if resuming training
/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in __next__(self)
361
362 def __next__(self):
--> 363 data = self._next_data()
364 self._num_yielded += 1
365 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
401 def _next_data(self):
402 index = self._next_index() # may raise StopIteration
--> 403 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
404 if self._pin_memory:
405 data = _utils.pin_memory.pin_memory(data)
/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
45 else:
46 data = self.dataset[possibly_batched_index]
---> 47 return self.collate_fn(data)
/usr/local/lib/python3.6/dist-packages/transformers/data/data_collator.py in __call__(self, examples)
356 for i, doc in enumerate(examples):
357 input_id, segment_id, attention_mask, label = self.create_examples_from_document(doc, i, examples)
--> 358 input_ids.extend(input_id)
359 segment_ids.extend(segment_id)
360 attention_masks.extend(attention_mask)
/usr/local/lib/python3.6/dist-packages/transformers/data/data_collator.py in create_examples_from_document(self, document, doc_index, examples)
444 random_document = examples[random_document_index]
445 random_start = random.randint(0, len(random_document) - 1)
--> 446 for j in range(random_start, len(random_document)):
447 tokens_b.extend(random_document[j])
448 if len(tokens_b) >= target_b_length:
/usr/lib/python3.6/random.py in randint(self, a, b)
219 """
220
--> 221 return self.randrange(a, b+1)
222
223 def _randbelow(self, n, int=int, maxsize=1<<BPF, type=type,
/usr/lib/python3.6/random.py in randrange(self, start, stop, step, _int)
197 return istart + self._randbelow(width)
198 if step == 1:
--> 199 raise ValueError("empty range for randrange() (%d,%d, %d)" % (istart, istop, width))
200
201 # Non-unit step argument supplied.
ValueError: empty range for randrange() (0,0, 0)
from transformers import (BertTokenizer, BertForNextSentencePrediction,
                          DataCollatorForNextSentencePrediction,
                          TextDatasetForNextSentencePrediction,
                          Trainer, TrainingArguments)

def train(bert_model, bert_tokenizer, path, eval_path=None):
    out_dir = "/content/drive/My Drive/next_sentence/"
    training_args = TrainingArguments(
        output_dir=out_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=30,
        save_steps=10000,
        save_total_limit=2,
    )
    data_collator = DataCollatorForNextSentencePrediction(
        tokenizer=bert_tokenizer, mlm=False, block_size=512, nsp_probability=0.5
    )
    dataset = TextDatasetForNextSentencePrediction(
        tokenizer=bert_tokenizer,
        file_path=path,
        block_size=512,
    )
    trainer = Trainer(
        model=bert_model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )
    trainer.train()
    trainer.save_model(out_dir)

def main():
    print("Running main")
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    bert_model = BertForNextSentencePrediction.from_pretrained("bert-base-cased")
    train_data_set_path = "/content/drive/My Drive/next_sentence/line_data_set_file.txt"
    train(bert_model, bert_tokenizer, train_data_set_path)
    #prepare_data_set(bert_tokenizer)

main()
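One possible cause (an assumption based on the traceback, not something I can verify from the code alone): the collator calls random.randint(0, len(random_document) - 1) while picking a random document for the "not next sentence" case, and that fails exactly when the chosen document is empty. TextDatasetForNextSentencePrediction expects one sentence per line with blank lines separating documents, so stray blank lines (for example two in a row) in line_data_set_file.txt would produce empty documents. A small diagnostic sketch to inspect the file before training; single-sentence documents are flagged too, since they may also cause trouble for the NSP pairing:
def check_nsp_file(path):
    """Report blank-line-separated documents that are empty or have a single sentence."""
    documents, current = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:                 # a blank line closes the current document
                documents.append(current)
                current = []
            else:
                current.append(line)
    documents.append(current)
    for i, doc in enumerate(documents):
        if len(doc) < 2:
            print(f"document {i} has only {len(doc)} sentence(s)")

check_nsp_file("/content/drive/My Drive/next_sentence/line_data_set_file.txt")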
I'm getting an error when trying to run a simple XGBoost model. The same error is mentioned in the question below, but its solution is not helping:
XGBoost Error info.labels.size() != 0U (0 vs. 0)
The error is as follows:
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-748-7f7917b57181> in <module>
11 param = {'max_depth':3, 'eta':1, 'objective':'binary:logistic'}
12
---> 13 bst = xgb.train(param, dtrain, num_round)
14 preds = bst.predict(dtest)
~\AppData\Local\Continuum\anaconda3\envs\torch_env\lib\site-packages\xgboost\training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks)
207 evals=evals,
208 obj=obj, feval=feval,
--> 209 xgb_model=xgb_model, callbacks=callbacks)
210
211
~\AppData\Local\Continuum\anaconda3\envs\torch_env\lib\site-packages\xgboost\training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
72 # Skip the first update if it is a recovery step.
73 if version % 2 == 0:
---> 74 bst.update(dtrain, i, obj)
75 bst.save_rabit_checkpoint()
76 version += 1
~\AppData\Local\Continuum\anaconda3\envs\torch_env\lib\site-packages\xgboost\core.py in update(self, dtrain, iteration, fobj)
1247 _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
1248 ctypes.c_int(iteration),
-> 1249 dtrain.handle))
1250 else:
1251 pred = self.predict(dtrain, training=True)
~\AppData\Local\Continuum\anaconda3\envs\torch_env\lib\site-packages\xgboost\core.py in _check_call(ret)
187 """
188 if ret != 0:
--> 189 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
190
191
XGBoostError: [15:11:21] C:/Users/Administrator/workspace/xgboost-win64_release_1.0.0/src/objective/regression_obj.cu:60: Check failed: preds.Size() == info.labels_.Size() (3843 vs. 0) : labels are not correctly providedpreds.size=3843, label.size=0
So the above points to a label size issue. But when I run the following code, my training data (train_xgb) and my label data (test_xgb) match in row count.
train_xgb = train_dummies.iloc[:, 2:]
test_xgb = train_dummies.iloc[:, 2]
print(train_xgb.shape, test_xgb.shape, type(train_xgb), type(test_xgb))
dtrain = xgb.DMatrix(train_xgb)
dtest = xgb.DMatrix(test_xgb)
Here is the result of the print statement above:
(3843, 3143) (3843,) <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
I've tried changing the labels to their own dataframe and numpy array with no change in the error.
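The row counts match, but nothing in the snippet ever attaches the labels to the DMatrix, which is exactly what the check preds.Size() == info.labels_.Size() complains about. A sketch of the likely fix, keeping the variable names from the question (test_xgb being the label Series):
import xgboost as xgb

# Attach the labels to the training matrix via the label= argument;
# without it label.size stays 0 and the objective check fails.
dtrain = xgb.DMatrix(train_xgb, label=test_xgb)

param = {'max_depth': 3, 'eta': 1, 'objective': 'binary:logistic'}
bst = xgb.train(param, dtrain, num_boost_round=10)
Note also that dtest as built above wraps the label Series alone; for bst.predict you would want a DMatrix built from the same feature columns as dtrain.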
I'm trying to run a Python program but get an error. The code predicts values and classifies them. It works with a GradientBoostingClassifier model but not with a LinearSVC() model.
This code is used to create the model for the output class:
import pandas as pd
import pickle
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

data_file = pd.read_csv('C:\\Class.csv')
testx = data_file.iloc[0:, :5]
testy = data_file.iloc[0:, 5]
model = make_pipeline(LinearSVC())
model.fit(testx, testy)

filename = 'C:\\Model\\svm.sav'
pickle.dump(model, open(filename, 'wb'))
See the code below, which raises the error when using the LinearSVC output-class model:
import pickle
import numpy as np
time=0.0
print "\n"
time = input(" \t Please Enter Time in Mili Seconds (Ms): ")
print "\n"
#Data-Read
filename1 = 'c:\\Model\\HR_model.sav'
filename2 = 'c:\\Model\\SpO2_model.sav'
filename3 = 'c:\\Model\\SYS_model.sav'
filename4 = 'c:\\Model\\Dia_model.sav'
filename5 = 'c:\\Model\\Mean_model.sav'
filename6 = 'c:\\Model\\svm.sav'
classify=list()
HR = pickle.load(open(filename1, 'rb'))
hr_result=HR.predict(time)
classify.append(str(float(hr_result)))
SPO2 = pickle.load(open(filename2, 'rb'))
SPO2_result=SPO2.predict(time)
classify.append(str(float(SPO2_result)))
Sys = pickle.load(open(filename3, 'rb'))
Sys_result=Sys.predict(time)
classify.append(str(float(Sys_result)))
DIA = pickle.load(open(filename4, 'rb'))
DIA_result=DIA.predict(time)
classify.append(str(float(DIA_result)))
MEan = pickle.load(open(filename5, 'rb'))
MEan_result=MEan.predict(time)
classify.append(str(float(MEan_result)))
#Classifier train Model
model = pickle.load(open(filename6, 'rb'))
Model_result=model.predict(classify)
print "\n"
print "Input Features:"
print classify
print "\n"
print "Output Class: "
print Model_result
Upon running the program, I get the following error:
TypeError                                 Traceback (most recent call last)
C:\ForcastPredictFromModels.py in <module>()
50 #Classifier train Model
51 model = pickle.load(open(filename6, 'rb'))
---> 52 Model_result=model.predict(classify)
53 print "\n"
54
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\utils\metaestimators.pyc in <lambda>(*args, **kwargs)
52
53 # lambda, but not partial, allows help() to work with update_wrapper
---> 54 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
55 # update the docstring of the returned function
56 update_wrapper(out, self.fn)
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\pipeline.pyc in predict(self, X)
325 if transform is not None:
326 Xt = transform.transform(Xt)
--> 327 return self.steps[-1][-1].predict(Xt)
328
329 #if_delegate_has_method(delegate='_final_estimator')
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\linear_model\base.pyc in predict(self, X)
334 Predicted class label per sample.
335 """
--> 336 scores = self.decision_function(X)
337 if len(scores.shape) == 1:
338 indices = (scores > 0).astype(np.int)
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\linear_model\base.pyc in decision_function(self, X)
318
319 scores = safe_sparse_dot(X, self.coef_.T,
--> 320 dense_output=True) + self.intercept_
321 return scores.ravel() if scores.shape[1] == 1 else scores
322
C:\Users\xx\AppData\Local\Enthought\Canopy\User\lib\site-packages\sklearn\utils\extmath.pyc in safe_sparse_dot(a, b, dense_output)
187 return ret
188 else:
--> 189 return fast_dot(a, b)
190
191
TypeError: Cannot cast array data from dtype('float64') to dtype('S32') according to the rule 'safe'
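The final TypeError comes from the dot product inside the pipeline's LinearSVC: classify is a list of strings (each prediction was wrapped in str()), so X reaches decision_function with dtype('S32') while coef_ is float64, and fast_dot refuses the cast. A sketch of the likely fix, reusing the prediction variables from the script above and keeping the features numeric and 2-D:
import numpy as np

# Keep the upstream predictions numeric and shape them into one 2-D sample;
# LinearSVC's decision_function multiplies X by float64 coefficients, so string
# features cause the dtype('S32') cast error.
features = np.array([float(hr_result), float(SPO2_result), float(Sys_result),
                     float(DIA_result), float(MEan_result)]).reshape(1, -1)
Model_result = model.predict(features)
(Tree-based models convert their input to float internally, which may be why the GradientBoostingClassifier version did not complain.)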
I am trying to use the XGBoost package in Python.
I get this error when running this code:
import numpy as np
import xgboost as xgb
data=np.array(traindata.drop('Category',axis=1))
labels=np.array(traindata['Category'].cat.codes)
dtrain = xgb.DMatrix( data, label=labels)
param = {'bst:max_depth':6, 'bst:eta':0.5, 'silent':1, 'objective':'multi:softprob' }
param['nthread'] = 4
param['eval_metric'] = 'mlogloss'
param['lambda'] = 1
param['num_class']=39
evallist = [(dtrain,'train')]
plst = param.items()
plst += [('eval_metric', 'ams#0')]
num_round = 10
bst = xgb.train( plst, dtrain, num_round, evallist )
bst.save_model('0001.model')
---------------------------------------------------------------------------
XGBoostError                              Traceback (most recent call last)
in <module>()
17
18 num_round = 10
---> 19 bst = xgb.train( plst, dtrain, num_round, evallist )
20
21 bst.save_model('0001.model')
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/xgboost/training.pyc in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, learning_rates, xgb_model)
122 nboost += 1
123 if len(evals) != 0:
--> 124 bst_eval_set = bst.eval_set(evals, i, feval)
125 if isinstance(bst_eval_set, STRING_TYPES):
126 msg = bst_eval_set
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/xgboost/core.pyc in eval_set(self, evals, iteration, feval)
753 _check_call(_LIB.XGBoosterEvalOneIter(self.handle, iteration,
754 dmats, evnames, len(evals),
--> 755 ctypes.byref(msg)))
756 return msg.value
757 else:
/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/xgboost/core.pyc in _check_call(ret)
95 """
96 if ret != 0:
---> 97 raise XGBoostError(_LIB.XGBGetLastError())
98
99
XGBoostError: we need weight to evaluate ams
I don't see anything about it in the docs:
https://xgboost.readthedocs.io/en/latest/python/python_intro.html
http://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
When computing the ams metric, you need a weight for each labeled training point. You set the weights with the keyword argument weight when creating your DMatrix. A simple example:
weights = np.ones(len(labels))
dtrain = xgb.DMatrix(data, label=labels, weight=weights)
And an in-depth example from a recent Kaggle competition: https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/higgs-numpy.py.
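Putting that together with the variables from the question (a sketch only; the uniform weights are just a placeholder, since for a metric like ams the per-event weights normally come from the dataset itself):
import numpy as np
import xgboost as xgb

# Rebuild dtrain with per-row weights so the 'ams' eval metric can be computed,
# then rebuild evallist so it points at the weighted DMatrix.
weights = np.ones(len(labels))                   # placeholder weights
dtrain = xgb.DMatrix(data, label=labels, weight=weights)
evallist = [(dtrain, 'train')]
bst = xgb.train(plst, dtrain, num_round, evallist)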