Saving Random Forest - python

I want to save and load a fitted Random Forest Classifier, but I get an error.
# Fit a 100-tree random forest on the first 100 entries of L1 / L2
# (presumably the feature rows and their labels -- not shown here).
forest = RandomForestClassifier(n_estimators = 100, max_features = mf_val)
forest = forest.fit(L1[0:100], L2[0:100])
# Persist the fitted forest with joblib, then immediately read it back.
joblib.dump(forest, 'screening_forest/screening_forest.pkl')
forest2 = joblib.load('screening_forest/screening_forest.pkl')
The error is:
File "C:\Users\mkolarek\Documents\other\TrackerResultAnalysis\ScreeningClassif
ier\ScreeningClassifier.py", line 67, in <module>
forest2 = joblib.load('screening_forest/screening_forest.pkl')
File "C:\Python27\lib\site-packages\sklearn\externals\joblib\numpy_pickle.py",
line 425, in load
obj = unpickler.load()
File "C:\Python27\lib\pickle.py", line 858, in load
dispatch[key](self)
File "C:\Python27\lib\site-packages\sklearn\externals\joblib\numpy_pickle.py",
line 285, in load_build
Unpickler.load_build(self)
File "C:\Python27\lib\pickle.py", line 1217, in load_build
setstate(state)
File "_tree.pyx", line 2280, in sklearn.tree._tree.Tree.__setstate__ (sklearn\
tree\_tree.c:18350)
ValueError: Did not recognise loaded array layout
Press any key to continue . . .
Do I have to initialize forest2 or something?

I solved it with cPickle instead:
# Serialize the fitted forest with cPickle (Python 2) instead of joblib.
with open('screening_forest/screening_forest.pickle', 'wb') as f:
    cPickle.dump(forest, f)

# Read the forest back from the same file.
with open('screening_forest/screening_forest.pickle', 'rb') as f:
    forest2 = cPickle.load(f)
but a joblib solution could be useful as well.

Here is the method that you can try
import pickle

model = RandomForestClassifier()
model.fit(data,lables)

Model_file = 'model.pkl'

# Save the fitted model; the with-block closes the file even if dump() raises.
with open(Model_file, 'wb') as f:
    pickle.dump(model, f)

# Reload the model from the saved file.
with open(Model_file, 'rb') as f:
    loaded_model = pickle.load(f)

Related

Generating data via SDV GaussianCopula throws "numpy.linalg.LinAlgError: SVD did not converge" in Python

I am currently using SDV and GaussianCopula (https://sdv.dev/SDV/user_guides/single_table/gaussian_copula.html) to train my models. I have a given data set which is loaded for training.
However, I get the following error message when creating the datasets:
Saving Model to path D:/.../GaussianCopula/model_MLB_1.pkl
Generating 22479 rows of synthetic data
Traceback (most recent call last):
File ".\generate_gaussian_model.py", line 47, in <module>
samples = gaussianCopula.sample(len(data.index))
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\sdv\tabular\base.py", line 442, in sample
return self._sample_batch(num_rows, max_retries, max_rows_multiplier)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\sdv\tabular\base.py", line 300, in _sample_batch
num_rows, conditions, transformed_conditions, float_rtol)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\sdv\tabular\base.py", line 228, in _sample_rows
sampled = self._sample(num_rows)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\sdv\tabular\copulas.py", line 319, in _sample
return self._model.sample(num_rows, conditions=conditions)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\copulas\__init__.py", line 36, in wrapper
return function(self, *args, **kwargs)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\copulas\multivariate\gaussian.py", line 249, in sample
samples = self._get_normal_samples(num_rows, conditions)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\copulas\multivariate\gaussian.py", line 223, in _get_normal_samples
samples = np.random.multivariate_normal(means, covariance, size=num_rows)
File "mtrand.pyx", line 4120, in numpy.random.mtrand.RandomState.multivariate_normal
File "<__array_function__ internals>", line 6, in svd
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\numpy\linalg\linalg.py", line 1660, in svd
u, s, vh = gufunc(a, signature=signature, extobj=extobj)
File "C:\Users\...\AppData\Local\Programs\Python\Python37\lib\site-packages\numpy\linalg\linalg.py", line 97, in _raise_linalgerror_svd_nonconvergence
raise LinAlgError("SVD did not converge")
numpy.linalg.LinAlgError: SVD did not converge
I also checked out the following thread and tried to apply its solution (which you can see below), but it didn't work.
And this is my class (generate_gaussian_model.py) and what I've tried so far:
from sdv.tabular import GaussianCopula
import pickle
import pandas as pd
from pandas.core.indexes.base import Index
header_import_path = "C:/Users/.../headers/all_headers.txt"
all_mlb_names = ['MLB_1', 'MLB_7', 'MLB_19', 'MLB_31', 'MLB_41', 'MLB_45', 'MLB_49', 'MLB_53', 'MLB_58']

# The pickled object is indexed by table name below to get that table's
# column headers.
with open(header_import_path, 'rb') as fp:
    all_headers = pickle.load(fp)

for mlb_file_name in all_mlb_names:
    # Create separate model for each MLB Table
    model_export_path = "D:/.../GaussianCopula/model_{0}.pkl".format(mlb_file_name)
    synth_data_export_path = "C:/Users/.../models/generated/{0}_samples.csv".format(mlb_file_name)
    data_import_path = "C:/Users/.../models/original/{0}.csv".format(mlb_file_name)
    headers = all_headers[mlb_file_name]

    print("Read data for table {0}".format(mlb_file_name))
    data = pd.read_csv(data_import_path, sep='|', names=headers)

    # This is necessary to remove invalid columns from my original dataset.
    # All matching columns are dropped in one call instead of one per column.
    invalid_columns = [colname for colname in data.columns
                       if colname.startswith("Calculation")]
    data = data.drop(axis=1, labels=invalid_columns)

    # Thought this would fix my issue but it didn't
    # https://stackoverflow.com/questions/21827594/raise-linalgerrorsvd-did-not-converge-linalgerror-svd-did-not-converge-in-m
    data.dropna(inplace=True)

    # Train on a random 30% sample of the remaining rows.
    data = data.sample(frac=0.3)
    print(data)

    gaussianCopula = GaussianCopula()
    print("Start training of GaussianCopula Model")
    gaussianCopula.fit(data)

    print("Saving Model to path {0}".format(model_export_path))
    gaussianCopula.save(model_export_path)

    print("Generating {0} rows of synthetic data".format(len(data.index)))
    # Here it begins to crash
    samples = gaussianCopula.sample(len(data.index))
    samples.to_csv(synth_data_export_path, header=True, sep='|', index=False)
The following command would work, but that is not enough data for me: data = data.sample(n=1000)
Hope you guys can help me out and explain this error message to me.

Problem with Dataloader object not subscriptable

I am running a Python program that uses PyTorch. I use my own dataset class, not torch.data.dataset. I load the data from a pickle file produced by feature extraction, but the following error appears:
Traceback (most recent call last):
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demo-emotion.py", line 326, in <module>
fire.Fire(demo)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\fire\core.py", line 138, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\fire\core.py", line 468, in _Fire
target=component.__name__)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\fire\core.py", line 672, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demo-emotion.py", line 304, in demo
train(model,train_set1, valid_set=valid_set, test_set=test1, save=save, n_epochs=n_epochs,batch_size=batch_size,seed=seed)
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demo-emotion.py", line 172, in train
n_epochs=n_epochs,
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demo-emotion.py", line 37, in train_epoch
loader=np.asarray(list(loader))
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\utils\data\dataloader.py", line 345, in __next__
data = self._next_data()
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\utils\data\dataloader.py", line 385, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\utils\data\_utils\fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\utils\data\_utils\fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\utils\data\dataset.py", line 257, in __getitem__
return self.dataset[self.indices[idx]]
TypeError: 'DataLoader' object is not subscriptable
The code is:
train_set1 = Owndata()
train1, test1 = train_set1 .get_splits()

# prepare data loaders
train_dl = torch.utils.data.DataLoader(train1, batch_size=32, shuffle=True)
test_dl =torch.utils.data.DataLoader(test1, batch_size=1024, shuffle=False)
test_set1 = Owndata()
# print('test_set# ',test_set)

if valid_size:
    # Carve the last `valid_size` entries of a random permutation off as the
    # validation split; the rest is used for training.
    valid_set = Owndata()
    indices = torch.randperm(len(train_set1))
    train_indices = indices[:len(indices) - valid_size]
    valid_indices = indices[len(indices) - valid_size:]
    # NOTE(review): this wraps the DataLoader `train_dl`, not a dataset.
    # Subset indexes its argument, which is what raises
    # "'DataLoader' object is not subscriptable" in the traceback.
    train_set1 = torch.utils.data.Subset(train_dl, train_indices)
    valid_set = torch.utils.data.Subset(valid_set, valid_indices)
else:
    valid_set = None

model = DenseNet(
    growth_rate=growth_rate,
    block_config=block_config,
    num_classes=10,
    small_inputs=True,
    efficient=efficient,
)
train(model,train_set1, valid_set=valid_set, test_set=test1, save=save, n_epochs=n_epochs, batch_size=batch_size, seed=seed)
Any help is appreciated! Thanks a lot in advance!!
The line you suspect is not the one raising the error; the error comes from inside the train function called on the very last line, which you are not showing.
You are confusing two things:
torch.utils.data.Dataset object is indexable (dataset[5] works fine for example). It is a simple object which defines how to get a single (usually single) sample of data.
torch.utils.data.DataLoader - non-indexable, only iterable, usually returns batches of data from above Dataset. Can work in parallel using num_workers. It's what you are trying to index while you should use dataset for that.
Please see PyTorch documentation about data to get a better grasp on how those work.

How to use the HuggingFace transformers pipelines?

I'm trying to do a simple text classification project with Transformers. I want to use the pipeline feature added in v2.3, but there is little to no documentation.
# Load the dataset; the code below reads its "category" and "text" columns.
data = pd.read_csv("data.csv")
FLAUBERT_NAME = "flaubert-base-cased"
# Encode the "category" column with LabelEncoder to obtain the targets.
encoder = LabelEncoder()
target = encoder.fit_transform(data["category"])
y = target
X = data["text"]
# Load the pretrained model and tokenizer (the traceback shows the
# out-of-memory error being raised on the model line).
model = FlaubertForSequenceClassification.from_pretrained(FLAUBERT_NAME)
tokenizer = FlaubertTokenizer.from_pretrained(FLAUBERT_NAME)
pipe = TextClassificationPipeline(model, tokenizer, device=-1) # device=-1 -> Use only CPU
# Smoke test: run the pipeline on a single sentence.
print("Test #1: pipe('Bonjour le monde')=", pipe(['Bonjour le monde']))
Traceback (most recent call last):
File "C:/Users/PLHT09191/Documents/work/dev/Classif_Annonces/src/classif_annonce.py", line 33, in <module>
model = FlaubertForSequenceClassification.from_pretrained(FLAUBERT_NAME)
File "C:\Users\Myself\Documents\work\dev\Classif_Annonces\venv\lib\site-packages\transformers-2.4.1-py3.5.egg\transformers\modeling_utils.py", line 463, in from_pretrained
model = cls(config, *model_args, **model_kwargs)
File "C:\Users\Myself\Documents\work\dev\Classif_Annonces\venv\lib\site-packages\transformers-2.4.1-py3.5.egg\transformers\modeling_flaubert.py", line 343, in __init__
super(FlaubertForSequenceClassification, self).__init__(config)
File "C:\Users\Myself\Documents\work\dev\Classif_Annonces\venv\lib\site-packages\transformers-2.4.1-py3.5.egg\transformers\modeling_xlm.py", line 733, in __init__
self.transformer = XLMModel(config)
File "C:\Users\Myself\Documents\work\dev\Classif_Annonces\venv\lib\site-packages\transformers-2.4.1-py3.5.egg\transformers\modeling_xlm.py", line 382, in __init__
self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
File "C:\Users\Myself\Documents\work\dev\Classif_Annonces\venv\lib\site-packages\transformers-2.4.1-py3.5.egg\transformers\modeling_xlm.py", line 203, in __init__
self.lin2 = nn.Linear(dim_hidden, out_dim)
File "C:\Users\Myself\Documents\work\dev\Classif_Annonces\venv\lib\site-packages\torch\nn\modules\linear.py", line 72, in __init__
self.weight = Parameter(torch.Tensor(out_features, in_features))
RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 9437184 bytes. Buy new RAM!
Process finished with exit code 1
How can I use my pipeline with my X and y data?

Python: pickle.load __init__ error

I have trained an HMM model to add punctuation to Arabic text, and I want to save it so that I do not have to repeat the training phase every time I give the model a text to tag. I use pickle for this task, as I have seen in tutorials. I do exactly what they do, but it fails and gives me this error:
Traceback (most recent call last):
File "C:\Python27\file_pun_tag.py", line 205, in <module>
hmm_tagger("test_file.txt")
File "C:\Python27\file_pun_tag.py", line 179, in hmm_tagger
hmm = pickle.load(saved_model)
File "C:\Python27\lib\pickle.py", line 1378, in load
return Unpickler(file).load()
File "C:\Python27\lib\pickle.py", line 858, in load
dispatch[key](self)
File "C:\Python27\lib\pickle.py", line 1133, in load_reduce
value = func(*args)
TypeError: __init__() takes at least 3 arguments (2 given)
I tried several solutions, but none of them worked for me.
Here is the code where I save my model. It is working correctly for saving the model and creating the "hmm.pickle":
# Read the training sentences; the with-block closes the file afterwards.
# The handle is no longer called `file`, which shadows a Python 2 builtin.
with codecs.open("train_sents_hmm.txt", "r", "utf_8") as train_file:
    train_sents = train_file.readlines()

labelled_sequences, tag_set, symbols = load_pun(train_sents)
trainer = nltk.HiddenMarkovModelTrainer (tag_set, symbols)
hmm = trainer.train_supervised (labelled_sequences, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

# save object (protocol -1 selects the highest pickle protocol available)
with open("hmm.pickle", "wb") as save_model:
    pickle.dump(hmm, save_model, -1)
And here is the code where I try to load the model after saving it; this is where it gives me the error:
# Load the saved tagger; the with-block closes the file even when
# pickle.load() raises, which is the case reported in the traceback.
with open("hmm.pickle", "rb") as saved_model:
    hmm = pickle.load(saved_model)

How to convert protobuf graph to binary wire format?

I have a method to convert binary wire format to human readable format but I cannot do the inverse of this
import tensorflow as tf
from tensorflow.python.platform import gfile
def converter(filename):
    """Convert a binary (wire-format) GraphDef file to a text-format file.

    Parses `filename` as a serialized `tf.GraphDef`, imports it with
    `tf.import_graph_def`, and writes it as text to 'pbtxt/protobuf.pb'.

    Args:
        filename: Path of the binary GraphDef file to convert.
    """
    with gfile.FastGFile(filename,'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    tf.import_graph_def(graph_def, name='')
    # as_text=True makes write_graph emit the human-readable form.
    tf.train.write_graph(graph_def, 'pbtxt/', 'protobuf.pb', as_text=True)
    return
I just have to type the file name for this and it works. But when I try to do the opposite, I get:
File "pb_to_pbtxt.py", line 16, in <module>
converter('protobuf.pb') # here you can write the name of the file to be converted
File "pb_to_pbtxt.py", line 11, in converter
graph_def.ParseFromString(f.read())
File "/usr/local/lib/python2.7/dist-packages/google/protobuf/message.py", line 185, in ParseFromString
self.MergeFromString(serialized)
File "/usr/local/lib/python2.7/dist-packages/google/protobuf/internal/python_message.py", line 1008, in MergeFromString
if self._InternalParse(serialized, 0, length) != length:
File "/usr/local/lib/python2.7/dist-packages/google/protobuf/internal/python_message.py", line 1034, in InternalParse
new_pos = local_SkipField(buffer, new_pos, end, tag_bytes)
File "/usr/local/lib/python2.7/dist-packages/google/protobuf/internal/decoder.py", line 868, in SkipField
return WIRETYPE_TO_SKIPPER[wire_type](buffer, pos, end)
File "/usr/local/lib/python2.7/dist-packages/google/protobuf/internal/decoder.py", line 838, in _RaiseInvalidWireType
raise _DecodeError('Tag had invalid wire type.')
You can perform the reverse translation using the google.protobuf.text_format module:
import tensorflow as tf
from google.protobuf import text_format
def convert_pbtxt_to_graphdef(filename):
    """Returns a `tf.GraphDef` proto representing the data in the given pbtxt file.

    Args:
      filename: The name of a file containing a GraphDef pbtxt (text-formatted
        `tf.GraphDef` protocol buffer data).

    Returns:
      A `tf.GraphDef` protocol buffer.
    """
    with tf.gfile.FastGFile(filename, 'r') as f:
        graph_def = tf.GraphDef()
        file_content = f.read()

    # Merges the human-readable string in `file_content` into `graph_def`.
    text_format.Merge(file_content, graph_def)
    return graph_def
You can use tf.Graph.as_graph_def() and then Protobuf's SerializeToString() like so:
# Obtain the GraphDef by calling tf.Graph.as_graph_def(); the default graph
# is used here as an example -- substitute your own tf.Graph if needed.
proto_graph = tf.get_default_graph().as_graph_def()

# SerializeToString() produces the binary wire format, so open in 'wb' mode.
with open("my_graph.bin", "wb") as f:
    f.write(proto_graph.SerializeToString())
If you just want to write the file and do not care about the encoding you can also use tf.train.write_graph()
# Create a variable and a session.
v = tf.Variable(0, name='my_variable')
sess = tf.Session()
# Write the session's GraphDef to 'train.pbtxt' under '/tmp/my-model'.
tf.train.write_graph(sess.graph_def, '/tmp/my-model', 'train.pbtxt')
Note: Tested on TF 0.10, not sure about earlier versions.

Categories