import numpy as np
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
import coremltools as ct
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
sentence_fragment = "The Oceans are"
class NEO(torch.nn.Module):
def __init__(self, model):
super(NEO, self).__init__()
self.next_token_predictor = model
def forward(self, x):
sentence = x
predictions, _ = self.next_token_predictor(sentence)
token = torch.argmax(predictions[-1, :], dim=0, keepdim=True)
sentence = torch.cat((sentence, token), 0)
return sentence
token_predictor = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M", torchscript=True).eval()
context = torch.tensor(tokenizer.encode(sentence_fragment))
random_tokens = torch.randint(10000, (5,))
traced_token_predictor = torch.jit.trace(token_predictor, random_tokens)
model = NEO(model=traced_token_predictor)
scripted_model = torch.jit.script(model)
# Custom model
sentence_fragment = "The Oceans are"
for i in range(10):
context = torch.tensor(tokenizer.encode(sentence_fragment))
torch_out = scripted_model(context)
sentence_fragment = tokenizer.decode(torch_out)
print("Custom model: {}".format(sentence_fragment))
# Stock model
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M", torchscript=True).eval()
sentence_fragment = "The Oceans are"
input_ids = tokenizer(sentence_fragment, return_tensors="pt").input_ids
gen_tokens = model.generate(input_ids, do_sample=True, max_length=20)
gen_text = tokenizer.batch_decode(gen_tokens)[0]
print("Stock model: "+gen_text)
RUN 1
Output:
Custom model: The Oceans are the most important source of water for the entire world
Stock model: The Oceans are on the rise. The American Southwest is thriving, but the southern United States still
RUN 2
Output:
Custom model: The Oceans are the most important source of water for the entire world.
Stock model: The Oceans are the land of man
This is a short video of the Australian government
The custom model always returns the same output. However with the do_sampling = True stock model.generate return different results on each call. I spent a lot of time figuring out how do_sampling works for transformers so I require help from you guys, appreciate it.
How to code a custom model to have different results on each call?
Thanks!
So, the answer would be to implement sampling :D
class NEO(torch.nn.Module):
def __init__(self, model):
super(NEO, self).__init__()
self.next_token_predictor = model
def forward(self, x):
sentence = x
predictions, _ = self.next_token_predictor(sentence)
# get top K (k=2) indicies of highest probs of tokens
# 2 indicies would be enough, anyway you will got 2 in a power of N variations
_, topK = torch.topk(predictions[-1, :], 2, dim=0)
# get one of two of those indicies randomly, and concat sentence
perm = torch.randperm(topK.size(0))
idx = perm[:1]
token = topK[idx.long()]
sentence = torch.cat((sentence, token), 0)
return sentence
Related
For data preparation I have this code
# Data preparartion
default_fin['ID'] = default_fin['ID'].astype(str)
credit_risk['ID'] = credit_risk['ID'].astype(str)
credit_risk['person_income'] = credit_risk['person_income'].astype(float)
credit_risk['loan_amnt'] = credit_risk['loan_amnt'].astype(float)
credit_risk['credit_type'] = credit_risk['credit_type'].astype(str)
credit_risk['cb_person_default_on_file'] = credit_risk['cb_person_default_on_file'].astype(str)
# It is important to sort order and products chronologically
credit_risk.sort_values(by=['ID', 'person_income', 'loan_amnt', 'credit_type', 'cb_person_default_on_file'], inplace=True
)
combined_defaulted_by_credit_type = credit_risk.groupby("credit_type").apply(lambda loans: ' '.join(loans['ID'].tolist()))
combined_defaulted_by_credit_type = pd.DataFrame(combined_defaulted_by_credit_type,columns=['all_credit_Ids'])
print(f'Number of credit types: {combined_defaulted_by_credit_type.shape[0]}')
combined_defaulted_by_credit_type.reset_index(inplace=True)
combined_defaulted_by_credit_type.credit_type = combined_defaulted_by_credit_type.credit_type.astype(str)
combined_defaulted_by_credit_type.head()
Then here I build the vocabulary and train the model
TRAIN_USER_MODEL = True # True - create a new model, False - load a previosuly created model
MODEL_DIR = 'models'
if not os.path.exists(MODEL_DIR):
os.makedirs(MODEL_DIR)
embeddings_dim = 200 # dimensionality of user representation
filename = f'models/customer2vec.{embeddings_dim}d.model'
if TRAIN_USER_MODEL:
class TaggedDocumentIterator(object):
def __init__(self, df):
self.df = df
def __iter__(self):
for row in self.df.itertuples():
yield TaggedDocument(words=dict(row._asdict())['all_credit_Ids'].split(), tags=[dict(row._asdict())['credit_type']])
it = TaggedDocumentIterator(combined_defaulted_by_credit_type)
doc_model = gensim.models.doc2vec.Doc2Vec(vector_size=embeddings_dim,
window=5,
min_count=10,
workers=mp.cpu_count(),
alpha=0.055,
min_alpha=0.055,
epochs=120) # use fixed learning rate
train_corpus = list(it)
doc_model.build_vocab(train_corpus)
print(f'Model saved to [{filename}]')
else:
doc_model = Doc2Vec.load(filename)
for epoch in tqdm(range(10)):
doc_model.alpha -= 0.005 # decrease the learning rate
doc_model.min_alpha = doc_model.alpha # fix the learning rate, no decay
doc_model.train(train_corpus, total_examples=doc_model.corpus_count, epochs=doc_model.epochs)
print('Iteration:', epoch)
doc_model.save(filename)
print(f'Model loaded to [{filename}]')
Then I created the doc_vectors here
doc_vectors = doc_model.dv
cust_doc = list(doc_model.dv.key_to_index.keys())
doc_vector_dict = {arg:doc_model.dv[arg] for arg in cust_doc}
X_doc = pd.DataFrame(doc_vector_dict).T.values
X_doc.shape, len(cust_doc), credit_risk["credit_type"].nunique()
Visualization
# Visualize the customer semantic space using TSNE
ids_sample_str = set([str(id) for id in ids_sample])
idx = []
for i, credit_type in enumerate(doc_vector_dict):
if credit_type in ids_sample_str:
idx.append(i)
X_doc_subset = X_doc[idx] # only sampled user IDs
X_doc_subset.shape
distance_matrix_doc = pairwise_distances(X_doc_subset, X_doc_subset, metric='cosine', n_jobs=-1)
tsne_doc = TSNE(metric="precomputed", n_components=2, verbose=1, perplexity=30, n_iter=500)
And here comes the error
ValueError: Found array with 0 sample(s) (shape=(0, 0)) while a minimum of 2 is required.
tsne_results_doc = tsne_doc.fit_transform(distance_matrix_doc)
I am trying to implement the NER example using BERT and pytorch from the huggingface guide (https://huggingface.co/transformers/custom_datasets.html#ft-trainer). Reading in the data works fine, but when I want to start the training, I get the error
Expected input batch_size (16) to match target batch_size (4000)
When I try to change the input batch_size, the target batch_size is always the input batch_size*250. I would greatly appreciate, if someone could look over the following code and sees my mistakes.
from pathlib import Path
import re
def read_data(file_path):
file_path = Path(file_path)
raw_text = file_path.read_text().strip()
raw_docs = re.split(r'\n\t?\n', raw_text)
token_docs = []
tag_docs = []
for doc in raw_docs:
tokens = []
tags = []
for line in doc.split('\n'):
token, tag = line.split('\t')
tokens.append(token)
tags.append(tag)
token_docs.append(tokens)
tag_docs.append(tags)
return token_docs, tag_docs
train_texts, train_tags = read_data('data/train.tsv')
val_texts, val_tags = read_data('data/test.tsv')
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
import numpy as np
def encode_tags(tags, encodings):
labels = [[tag2id[tag] for tag in doc] for doc in tags]
encoded_labels = []
for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
# create an empty array of -100
doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
arr_offset = np.array(doc_offset)
# set labels whose first offset position is 0 and the second is not 0
doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
encoded_labels.append(doc_enc_labels.tolist())
return encoded_labels
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)
import torch
class TestDataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = labels
def __getitem__(self, idx):
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
item['labels'] = torch.tensor(self.labels[idx])
return item
def __len__(self):
return len(self.labels)
train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = TestDataset(train_encodings, train_labels)
val_dataset = TestDataset(val_encodings, val_labels)
from transformers import DistilBertForTokenClassification
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
training_args = TrainingArguments(
output_dir='./results', # output directory
num_train_epochs=3, # total number of training epochs
per_device_train_batch_size=16, # batch size per device during training
per_device_eval_batch_size=64, # batch size for evaluation
warmup_steps=500, # number of warmup steps for learning rate scheduler
weight_decay=0.01, # strength of weight decay
logging_dir='./logs', # directory for storing logs
logging_steps=10,
)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-cased")
trainer = Trainer(
model=model, # the instantiated 🤗 Transformers model to be trained
args=training_args, # training arguments, defined above
train_dataset=train_dataset, # training dataset
eval_dataset=val_dataset # evaluation dataset
)
trainer.train()
Instead of DistilBertForSequenceClassification Model, you want to use DistilBertForTokenClassification in the last cell.
Hello and greetings from Greece
class Model(nn.Module):
def __init__(self, embedding_size, num_numerical_cols, output_size, layers, p=0.4):
super().__init__()
self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
self.embedding_dropout = nn.Dropout(p)
self.batch_norm_num = nn.BatchNorm1d(num_numerical_cols)
all_layers = []
num_categorical_cols = sum((nf for ni, nf in embedding_size))
input_size = num_categorical_cols + num_numerical_cols
for i in layers:
all_layers.append(nn.Linear(input_size, i))
all_layers.append(nn.ReLU(inplace=True))
all_layers.append(nn.BatchNorm1d(i))
all_layers.append(nn.Dropout(p))
input_size = i
all_layers.append(nn.Linear(layers[-1], output_size))
self.layers = nn.Sequential(*all_layers)
def forward(self, x_categorical, x_numerical):
embeddings = []
for i,e in enumerate(self.all_embeddings):
embeddings.append(e(x_categorical[:,i]))
x = torch.cat(embeddings, 1)
x = self.embedding_dropout(x)
x_numerical = self.batch_norm_num(x_numerical)
x = torch.cat([x, x_numerical], 1)
x = self.layers(x)
return x
Suppose I have this nn for classification and I create two instances
model_1=Model(categorical_embedding_sizes, numerical_data.shape[1], 2, [200,100,50], p=0.4)
model_2=Model(categorical_embedding_sizes, numerical_data.shape[1], 2, [200,100,50], p=0.4)
Αnd after I trained these two models i saved them with torch.save as model_1.pt and model_2.pt
Is there a way to create a new model with the mean parameters of the two models ?
something like
model_new.weight=(model_1.weight+model_2.weight)/2
model_new.bias=(model_1.bias+model_2.bias)/2
Thank you in advance
You can easily do this by generating a state dictionary from your two models' state dictionaries:
state_1 = model_1.state_dict()
state_2 = model_2.state_dict()
for layer in state_1:
state_1[layer] = (state_1[layer] + state_2[layer])/2
The above will loop through parameters (weights and biases) of all layers.
Then overwrite this new state on either model_1 or a newly instanced model, like so:
model_new = Model(categorical_embedding_sizes, numerical_data.shape[1], 2, [200,100,50], p=0.4)
model_new.load_state_dict(state1)
I'm trying to train the model for intent recognition. I tried removing all special characters and stop words but unable to resolve this error. I tried removing integers also but it's throwing an error. My data has two columns with one text and one intent column
The code I've written is
class IntentDetectionData:
DATA_COLUMN = "text"
LABEL_COLUMN = "intent"
def __init__(self, train, test, tokenizer: FullTokenizer, classes, max_seq_len=192):
self.tokenizer = tokenizer
self.max_seq_len = 0
self.classes = classes
train, test = map(lambda df: df.reindex(df[IntentDetectionData.DATA_COLUMN].str.len().sort_values().index), [train, test])
((self.train_x, self.train_y), (self.test_x, self.test_y)) = map(self._prepare, [train, test])
print("max seq_len", self.max_seq_len)
self.max_seq_len = min(self.max_seq_len, max_seq_len)
self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])
def _prepare(self, df):
x, y = [], []
for _, row in tqdm(df.iterrows()):
text, label = row[IntentDetectionData.DATA_COLUMN], row[IntentDetectionData.LABEL_COLUMN]
tokens = self.tokenizer.tokenize(text)
tokens = ["[CLS]"] + tokens + ["[SEP]"]
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
self.max_seq_len = max(self.max_seq_len, len(token_ids))
x.append(token_ids)
y.append(self.classes.index(label))
return np.array(x), np.array(y)
def _pad(self, ids):
x = []
for input_ids in ids:
input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
x.append(np.array(input_ids))
return np.array(x)
The next function is
def create_model(max_seq_len, bert_ckpt_file):
with tf.io.gfile.GFile(bert_config_file, "r") as reader:
bc = StockBertConfig.from_json_string(reader.read())
bert_params = map_stock_config_to_params(bc)
bert_params.adapter_size = None
bert = BertModelLayer.from_params(bert_params, name="bert")
input_ids = keras.layers.Input(shape=(max_seq_len, ), dtype='int32', name="input_ids")
bert_output = bert(input_ids)
print("bert shape", bert_output.shape)
cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
cls_out = keras.layers.Dropout(0.5)(cls_out)
logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
logits = keras.layers.Dropout(0.5)(logits)
logits = keras.layers.Dense(units=len(classes), activation="softmax")(logits)
model = keras.Model(inputs=input_ids, outputs=logits)
model.build(input_shape=(None, max_seq_len))
load_stock_weights(bert, bert_ckpt_file)
return model
The next code is:
classes = train.intent.unique().tolist()
data = IntentDetectionData(train, test, tokenizer, classes, max_seq_len=10000)
After running the above code I'm getting error like
ValueError: Unsupported string type: <class 'float'>
I ran into the same issue and I bumped into this GitHub issue with quite a lot of ideas: https://github.com/google-research/bert/issues/559
In my case, I had some NaN values in my dataframes (train, test). I had to replace them with something like:
train.fillna('unknown',inplace=True)
Similarly with test. This meant my "float" values are now strings.
I'm trying to implement a neural network model with TensorFlow for text classification. I created a custom layer in order to calculate a weighted average and to learn weights but I keep getting this weird error that tells "Cannot get value inside Tensorflow graph function". Here's what I've coded so far (For a small sample texts just to test).
This is my custom layer:
from tensorflow.keras import layers
import tensorflow as tf
import numpy as np
class WeightedAverageLayer(layers.Layer):
def __init__(self, vocab_size, embedding_dimension, doc_length):
super(WeightedAverageLayer, self).__init__()
self.embedding_dimension = embedding_dimension
self.vocab_size = vocab_size
self.doc_length = doc_length
#Initialize the embedding layer
self.embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dimension, input_length=doc_length)
#Initialize the words importance
b_init = tf.zeros_initializer()
self.b = self.add_weight(shape=(vocab_size,), initializer='random_normal', trainable=True)
def calculate_average(self, doc):
doc_words = self.embedding_layer(doc)
word_embeddings = self.embedding_layer.get_weights()[0]
s = 0
avg = 0
z = np.zeros((self.embedding_dimension))
for i in range(self.embedding_dimension):
for j in range(len(doc)):
val = doc[j]
vw = word_embeddings[val]
a = self.b[val]
s = s + vw[i] * a
avg = s / len(doc)
z[i] = avg
s = 0
avg = 0
return z
def call(self, docs):
result = []
for doc in docs:
vect = self.calculate_average(doc)
result.append(vect)
result_np = np.array(result)
return tf.stack(result_np)
#return list(map(self.calculate_average, docs))
and this is the main file where i test my layer in a model:
vocab_size = 20
embedding_dim = 10
max_length = 4
# define documents
docs = ['Well done!', 'Good work', 'Great effort', 'nice work', 'Excellent!',
'Weak', 'Poor effort!', 'not good', 'poor work', 'Could have done better.', 'You are amazing']
# define class labels
labels = np.array([[1,0],[1,0],[1,0],[1,0],[1,0],[0,2],[0,2],[0,2],[0,2],[0,2],[1,0]])
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(docs)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(docs)
pad = pad_sequences(sequences, maxlen=max_length, padding="post")
average_layer = WeightedAverageLayer(vocab_size, embedding_dim, max_length )
output_layer = tf.keras.layers.Dense(2, activation='softmax')
input_docs = Input(shape=(max_length))
weighted_average = average_layer(input_docs)
output = output_layer(weighted_average)
model = Model(input_docs, output)
Please note that I tested the layer alone and it worked fine.