How to use the HuggingFace transformers pipelines? - python

I'm trying to do a simple text classification project with Transformers. I want to use the pipeline feature added in v2.3, but there is little to no documentation.
data = pd.read_csv("data.csv")
FLAUBERT_NAME = "flaubert-base-cased"
encoder = LabelEncoder()
target = encoder.fit_transform(data["category"])
y = target
X = data["text"]
model = FlaubertForSequenceClassification.from_pretrained(FLAUBERT_NAME)
tokenizer = FlaubertTokenizer.from_pretrained(FLAUBERT_NAME)
pipe = TextClassificationPipeline(model, tokenizer, device=-1) # device=-1 -> Use only CPU
print("Test #1: pipe('Bonjour le monde')=", pipe(['Bonjour le monde']))
Traceback (most recent call last):
File "C:/Users/PLHT09191/Documents/work/dev/Classif_Annonces/src/classif_annonce.py", line 33, in <module>
model = FlaubertForSequenceClassification.from_pretrained(FLAUBERT_NAME)
File "C:\Users\Myself\Documents\work\dev\Classif_Annonces\venv\lib\site-packages\transformers-2.4.1-py3.5.egg\transformers\modeling_utils.py", line 463, in from_pretrained
model = cls(config, *model_args, **model_kwargs)
File "C:\Users\Myself\Documents\work\dev\Classif_Annonces\venv\lib\site-packages\transformers-2.4.1-py3.5.egg\transformers\modeling_flaubert.py", line 343, in __init__
super(FlaubertForSequenceClassification, self).__init__(config)
File "C:\Users\Myself\Documents\work\dev\Classif_Annonces\venv\lib\site-packages\transformers-2.4.1-py3.5.egg\transformers\modeling_xlm.py", line 733, in __init__
self.transformer = XLMModel(config)
File "C:\Users\Myself\Documents\work\dev\Classif_Annonces\venv\lib\site-packages\transformers-2.4.1-py3.5.egg\transformers\modeling_xlm.py", line 382, in __init__
self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
File "C:\Users\Myself\Documents\work\dev\Classif_Annonces\venv\lib\site-packages\transformers-2.4.1-py3.5.egg\transformers\modeling_xlm.py", line 203, in __init__
self.lin2 = nn.Linear(dim_hidden, out_dim)
File "C:\Users\Myself\Documents\work\dev\Classif_Annonces\venv\lib\site-packages\torch\nn\modules\linear.py", line 72, in __init__
self.weight = Parameter(torch.Tensor(out_features, in_features))
RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 9437184 bytes. Buy new RAM!
Process finished with exit code 1
How can I use my pipeline with my X and y data?

Related

RuntimeError: Unknown qengine? The code is written correctly, but it still fails with this error. Why does this happen, and how can I solve it?

import torch
import sounddevice as sd
import time
language = "ru"
model_id = "ru_v3"
sample_rate = 48000
speaker = "baya"
put_accent = True
put_yo = True
device = torch.device('cpu')
text = "Здраствуй Олег, рад знакомству"
model, _ = torch.hub.load(repo_or_dir='snakers4/silero-models',
model='silero_tts',
language=language,
speaker=model_id
)
model.to(device)
audio = model.apply_tts(text=text,
speaker=speaker,
sample_rate=sample_rate,
put_accent=put_accent,
put_yo=put_yo)
print(text)
sd.play(audio, sample_rate)
time.sleep(len(audio) / sample_rate)
sd.stop()
Error message:
Using cache found in C:\Users\User/.cache\torch\hub\snakers4_silero-models_master
Traceback (most recent call last):
File "D:/Voice_Assistent2/main.py", line 14, in <module>
model, _ = torch.hub.load(repo_or_dir='snakers4/silero-models',
File "D:\Voice_Assistent2\venv\lib\site-packages\torch\hub.py", line 399, in load
model = _load_local(repo_or_dir, model, *args, **kwargs)
File "D:\Voice_Assistent2\venv\lib\site-packages\torch\hub.py", line 428, in _load_local
model = entry(*args, **kwargs)
File "C:\Users\User/.cache\torch\hub\snakers4_silero-models_master\src\silero\silero.py", line 88, in silero_tts
model = imp.load_pickle("tts_models", "model")
File "D:\Voice_Assistent2\venv\lib\site-packages\torch\package\package_importer.py", line 249, in load_pickle
result = unpickler.load()
File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\pickle.py", line 1210, in load
dispatch[key[0]](self)
File "C:\Users\User\AppData\Local\Programs\Python\Python38\lib\pickle.py", line 1251, in load_binpersid
self.append(self.persistent_load(pid))
File "D:\Voice_Assistent2\venv\lib\site-packages\torch\package\package_importer.py", line 227, in persistent_load
loaded_reduces[reduce_id] = func(self, *args)
File "D:\Voice_Assistent2\venv\lib\site-packages\torch\jit\_script.py", line 344, in unpackage_script_module
cpp_module = torch._C._import_ir_module_from_package(
RuntimeError: Unknown qengine

GPT 2 - TypeError: Cannot cast array data from dtype('O') to dtype('int64') according to the rule 'safe'

I am working with GPT-2, Python 3.9 and TensorFlow 2.5, and when connecting to Flask (flask run in the terminal) I get the following message:
TypeError: Cannot cast array data from dtype('O') to dtype('int64') according to the rule 'safe'
Here is the code in generator.py
#!/usr/bin/env python3
import fire
import json
import os
import numpy as np
import tensorflow.compat.v1 as tf
# import model, sample, encoder
from text_generator import model
from text_generator import sample
from text_generator import encoder
class AI:
def generate_text(self, input_text):
model_name = '117M_Trained'
seed = None,
nsamples = 1
batch_size = 1
length = 150
temperature = 1
top_k = 40
top_p = 1
models_dir = 'models'
self.response = ''
models_dir = os.path.expanduser(os.path.expandvars(models_dir))
if batch_size is None:
batch_size = 1
assert nsamples % batch_size == 0
enc = encoder.get_encoder(model_name, models_dir)
hparams = model.default_hparams()
cur_path = os.path.dirname(__file__) + '/models' + '/' + model_name
with open(cur_path + '/hparams.json') as f:
hparams.override_from_dict(json.load(f))
if length is None:
length = hparams.n_ctx // 2
elif length > hparams.n_ctx:
raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)
with tf.Session(graph=tf.Graph()) as sess:
context = tf.placeholder(tf.int32, [batch_size, None])
np.random.seed(seed)
tf.set_random_seed(seed)
output = sample.sample_sequence(
hparams=hparams, length=length,
context=context,
batch_size=batch_size,
temperature=temperature, top_k=top_k, top_p=top_p
)
saver = tf.train.Saver()
ckpt = tf.train.latest_checkpoint(cur_path)
saver.restore(sess, ckpt)
context_tokens = enc.encode(input_text)
generated = 0
for _ in range(nsamples // batch_size):
out = sess.run(output, feed_dict={
context: [context_tokens for _ in range(batch_size)]
})[:, len(context_tokens):]
for i in range(batch_size):
generated += 1
text = enc.decode(out[i])
self.response = text
return self.response
ai = AI()
text = ai.generate_text('How are you?')
print(text)
Any help is appreciated 🙏 P.S. I have also added the entire traceback below.
* Serving Flask app 'text_generator' (lazy loading)
* Environment: development
* Debug mode: on
2021-09-14 19:58:08.687907: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Traceback (most recent call last):
File "_mt19937.pyx", line 178, in numpy.random._mt19937.MT19937._legacy_seeding
TypeError: 'tuple' object cannot be interpreted as an integer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/dusandev/miniconda3/bin/flask", line 8, in <module>
sys.exit(main())
File "/Users/dusandev/miniconda3/lib/python3.9/site-packages/flask/cli.py", line 990, in main
cli.main(args=sys.argv[1:])
File "/Users/dusandev/miniconda3/lib/python3.9/site-packages/flask/cli.py", line 596, in main
return super().main(*args, **kwargs)
File "/Users/dusandev/miniconda3/lib/python3.9/site-packages/click/core.py", line 1062, in main
rv = self.invoke(ctx)
File "/Users/dusandev/miniconda3/lib/python3.9/site-packages/click/core.py", line 1668, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/Users/dusandev/miniconda3/lib/python3.9/site-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/Users/dusandev/miniconda3/lib/python3.9/site-packages/click/core.py", line 763, in invoke
return __callback(*args, **kwargs)
File "/Users/dusandev/miniconda3/lib/python3.9/site-packages/click/decorators.py", line 84, in new_func
return ctx.invoke(f, obj, *args, **kwargs)
File "/Users/dusandev/miniconda3/lib/python3.9/site-packages/click/core.py", line 763, in invoke
return __callback(*args, **kwargs)
File "/Users/dusandev/miniconda3/lib/python3.9/site-packages/flask/cli.py", line 845, in run_command
app = DispatchingApp(info.load_app, use_eager_loading=eager_loading)
File "/Users/dusandev/miniconda3/lib/python3.9/site-packages/flask/cli.py", line 321, in __init__
self._load_unlocked()
File "/Users/dusandev/miniconda3/lib/python3.9/site-packages/flask/cli.py", line 346, in _load_unlocked
self._app = rv = self.loader()
File "/Users/dusandev/miniconda3/lib/python3.9/site-packages/flask/cli.py", line 402, in load_app
app = locate_app(self, import_name, name)
File "/Users/dusandev/miniconda3/lib/python3.9/site-packages/flask/cli.py", line 256, in locate_app
__import__(module_name)
File "/Users/dusandev/Desktop/AI/text_generator/__init__.py", line 2, in <module>
from .routes import generator
File "/Users/dusandev/Desktop/AI/text_generator/routes.py", line 2, in <module>
from .generator import ai
File "/Users/dusandev/Desktop/AI/text_generator/generator.py", line 74, in <module>
text = ai.generate_text('How are you?')
File "/Users/dusandev/Desktop/AI/text_generator/generator.py", line 46, in generate_text
np.random.seed(seed)
File "mtrand.pyx", line 244, in numpy.random.mtrand.RandomState.seed
File "_mt19937.pyx", line 166, in numpy.random._mt19937.MT19937._legacy_seeding
File "_mt19937.pyx", line 186, in numpy.random._mt19937.MT19937._legacy_seeding
TypeError: Cannot cast array data from dtype('O') to dtype('int64') according to the rule 'safe'
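The problem is the line seed = None, in your code: the trailing comma makes seed the tuple (None,), and that tuple is what gets passed to np.random.seed(seed). The function expects an integer (or None), but you are passing a tuple. Remove the trailing comma so that the line reads seed = None.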

Problem with Dataloader object not subscriptable

I am running a Python program using PyTorch. I use my own dataset class, not torch.utils.data.Dataset. I load the data from a pickle file produced by feature extraction, but the following error appears:
Traceback (most recent call last):
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demo-emotion.py", line 326, in <module>
fire.Fire(demo)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\fire\core.py", line 138, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\fire\core.py", line 468, in _Fire
target=component.__name__)
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\fire\core.py", line 672, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demo-emotion.py", line 304, in demo
train(model,train_set1, valid_set=valid_set, test_set=test1, save=save, n_epochs=n_epochs,batch_size=batch_size,seed=seed)
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demo-emotion.py", line 172, in train
n_epochs=n_epochs,
File "C:\Users\hp\Downloads\efficient_densenet_pytorch-master\demo-emotion.py", line 37, in train_epoch
loader=np.asarray(list(loader))
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\utils\data\dataloader.py", line 345, in __next__
data = self._next_data()
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\utils\data\dataloader.py", line 385, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\utils\data\_utils\fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\utils\data\_utils\fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "C:\Users\hp\Anaconda3\envs\tf-gpu\lib\site-packages\torch\utils\data\dataset.py", line 257, in __getitem__
return self.dataset[self.indices[idx]]
TypeError: 'DataLoader' object is not subscriptable
The code is:
train_set1 = Owndata()
train1, test1 = train_set1 .get_splits()
# prepare data loaders
train_dl = torch.utils.data.DataLoader(train1, batch_size=32, shuffle=True)
test_dl =torch.utils.data.DataLoader(test1, batch_size=1024, shuffle=False)
test_set1 = Owndata()
'''print('test_set# ',test_set)'''
if valid_size:
valid_set = Owndata()
indices = torch.randperm(len(train_set1))
train_indices = indices[:len(indices) - valid_size]
valid_indices = indices[len(indices) - valid_size:]
train_set1 = torch.utils.data.Subset(train_dl, train_indices)
valid_set = torch.utils.data.Subset(valid_set, valid_indices)
else:
valid_set = None
model = DenseNet(
growth_rate=growth_rate,
block_config=block_config,
num_classes=10,
small_inputs=True,
efficient=efficient,
)
train(model,train_set1, valid_set=valid_set, test_set=test1, save=save, n_epochs=n_epochs, batch_size=batch_size, seed=seed)
Any help is appreciated! Thanks a lot in advance!!
The error is not raised by the lines you posted; it is raised inside the train function called on the very last line, whose code you are not showing.
You are confusing two things:
torch.utils.data.Dataset - indexable (dataset[5] works fine, for example). It is a simple object which defines how to get one (usually a single) sample of data.
torch.utils.data.DataLoader - non-indexable, only iterable; it usually returns batches of data from the above Dataset and can work in parallel using num_workers. It's what you are trying to index, while you should use the dataset for that.
Please see PyTorch documentation about data to get a better grasp on how those work.

TypeError: can't pickle cv2.xfeatures2d_SIFT objects (occurred while using joblib.Parallel)

I am trying to parallelize my code and used joblib.Parallel for the purpose. I got the above error during the feature calculation stage. Here is my code.
from utils import FeatureGetter
#some code
Class model:
#initialize
def getDescriptors(image):
descriptors = self.feature_getter.get_features(image)
return descriptors
def train():
#some code
self.feature_getter = FeatureGetter()
descriptors_list = Parallel(n_jobs=-1)(delayed(self.getDescriptors)(image) for image in self.X_train)
Complete Error log
Traceback (most recent call last):
File "test_parallel.py", line 375, in <module>
bow.trainModel()
File "test_parallel.py", line 144, in trainModel
self.desc_list = Parallel(n_jobs=-1)(delayed(self.getDescriptors)(image) for image in self.X_train)
File "/home/vamsi.muthireddy/miniconda3/lib/python3.6/site-packages/joblib/parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "/home/vamsi.muthireddy/miniconda3/lib/python3.6/site-packages/joblib/parallel.py", line 620, in dispatch_one_batch
tasks = BatchedCalls(itertools.islice(iterator, batch_size))
File "/home/vamsi.muthireddy/miniconda3/lib/python3.6/site-packages/joblib/parallel.py", line 127, in __init__
self.items = list(iterator_slice)
File "test_parallel.py", line 144, in <genexpr>
self.desc_list = Parallel(n_jobs=-1)(delayed(self.getDescriptors)(image) for image in self.X_train)
File "/home/vamsi.muthireddy/miniconda3/lib/python3.6/site-packages/joblib/parallel.py", line 183, in delayed
pickle.dumps(function)
TypeError: can't pickle cv2.xfeatures2d_SIFT objects

Python: pickle.load __init__ error

I have trained an HMM model to add punctuation to Arabic text, and I want to save it so that I do not have to repeat the training phase every time I give the model a text to tag. I use pickle for this task, as I have seen in tutorials. I do exactly what they do, but it fails and gives me this error:
Traceback (most recent call last):
File "C:\Python27\file_pun_tag.py", line 205, in <module>
hmm_tagger("test_file.txt")
File "C:\Python27\file_pun_tag.py", line 179, in hmm_tagger
hmm = pickle.load(saved_model)
File "C:\Python27\lib\pickle.py", line 1378, in load
return Unpickler(file).load()
File "C:\Python27\lib\pickle.py", line 858, in load
dispatch[key](self)
File "C:\Python27\lib\pickle.py", line 1133, in load_reduce
value = func(*args)
TypeError: __init__() takes at least 3 arguments (2 given)
I tried several solutions, but none of them worked for me.
Here is the code where I save my model. It is working correctly for saving the model and creating the "hmm.pickle":
file = codecs.open("train_sents_hmm.txt", "r", "utf_8")
train_sents = file.readlines()
labelled_sequences, tag_set, symbols = load_pun(train_sents)
trainer = nltk.HiddenMarkovModelTrainer (tag_set, symbols)
hmm = trainer.train_supervised (labelled_sequences, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
# save object
save_model = open("hmm.pickle", "wb")
pickle.dump(hmm, save_model, -1)
save_model.close()
And here is the code where I try to load the model after saving it; this is where it gives me the error:
saved_model = open("hmm.pickle", "rb")
hmm = pickle.load(saved_model)
saved_model.close()

Categories