The solution I have now works, but the downside is that it takes over an hour to fit a simple model. Most of the training time is lost in the Python generator, in the code called WindowGenerator.gen.
If we postfix (post-pad) all of the data at once, I would think it would hurt the model's performance, since most of the values would then be zeros.
I have an event log with data of roughly the shape (600 000, 500).
The first feature is the group/caseID and the last three are labels. Each case has 6 events on average, but the count ranges from 4 all the way up to 100. In TensorFlow v1 there was a generator that seemed like what I wanted: tf.contrib.training.batch_sequences_with_states.
I want to load batches that get postfixed; my solution was to create an empty 3D array in the desired shape, defined by (batchsize, max case length, num_features).
The idea is to create a sliding window, where we predict the target columns at an offset/shift.
Example:
batch_1 = pd.DataFrame([[1,1,0.1,11],
[1,2,0.2,12],
[1,3,0.3,13],
[1,4,0.4,14],
[1,4,0.4,14],
[2,5,0.5,15],
[2,6,0.6,16],
[2,7,0.7,17],
[3,8,0.8,18],
[3,9,0.9,19],
[3,10,0.7,20]], columns=["id", "x1","x2", "target"])
# we want the data on this form before sliding
# (each case post-padded to the longest case in the batch, here 5 events):
[ [[1,1,0.1,11],
   [1,2,0.2,12],
   [1,3,0.3,13],
   [1,4,0.4,14],
   [1,4,0.4,14]],
  [[2,5,0.5,15],
   [2,6,0.6,16],
   [2,7,0.7,17],
   [2,0,0,0],
   [2,0,0,0]],
  [[3,8,0.8,18],
   [3,9,0.9,19],
   [3,10,0.7,20],
   [3,0,0,0],
   [3,0,0,0]] ]
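For illustration, here is a minimal sketch (not part of my current solution) of how such a post-padded block can be built from the batch_1 frame above with a pandas groupby and NumPy:
import numpy as np

# group the rows by case id, find the longest case, and post-pad with zeros
groups = [g.to_numpy("float64") for _, g in batch_1.groupby("id", sort=False)]
max_len = max(len(g) for g in groups)
padded = np.zeros((len(groups), max_len, batch_1.shape[1]))
for i, g in enumerate(groups):
    padded[i, :len(g)] = g  # shorter cases keep trailing zero rows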
The splitting of data:
unique_caseids = eventvise_training_data.caseid.unique()
np.random.shuffle(unique_caseids)
n = len(unique_caseids)
train_ids = unique_caseids[0:int(n*0.7)]
val_ids = unique_caseids[int(n*0.7):int(n*0.9)]
test_ids = unique_caseids[int(n*0.9):]
train_df = eventvise_training_data[eventvise_training_data.caseid.isin(train_ids)]
val_df = eventvise_training_data[eventvise_training_data.caseid.isin(val_ids)]
test_df = eventvise_training_data[eventvise_training_data.caseid.isin(test_ids)]
The WindowGenerator object
class WindowGenerator(object):
def __init__(self, input_width, label_width, shift,
             train_df=train_df, val_df=val_df, test_df=test_df, id_column=0,
             label_columns=None, batchsize=32, target_neg_ind=-3, unique_ids=None):
# Store the raw data.
self.train_df = train_df
self.val_df = val_df
self.test_df = test_df
self.batchsize = batchsize
self.id_column = id_column
self.id_name = id_column
self.unique_ids = unique_ids
self.target_neg_ind = target_neg_ind
self.generated_train_counter = 0
self.generated_val_counter = 0
self.generated_test_counter = 0
if self.unique_ids is None:
if type(id_column) == int:
self.unique_train_ids = train_df[train_df.columns[id_column]].unique()
self.unique_val_ids = val_df[val_df.columns[id_column]].unique()
self.unique_test_ids = test_df[test_df.columns[id_column]].unique()
self.id_name = train_df.columns[id_column]
elif type(id_column) == str:
self.unique_train_ids = train_df[id_column].unique()
self.unique_val_ids = val_df[id_column].unique()
self.unique_test_ids = test_df[id_column].unique()
# need the length of unique ids
self.num_unique_train = len(self.unique_train_ids)
self.num_unique_val = len(self.unique_val_ids)
self.num_unique_test = len(self.unique_test_ids)
# Work out the label column indices.
self.label_columns = label_columns
if label_columns is not None:
self.label_columns_indices = {name: i for i, name in
enumerate(label_columns)}
self.column_indices = {name: i for i, name in
enumerate(train_df.columns)}
# Work out the window parameters.
self.input_width = input_width
self.label_width = label_width
self.shift = shift
self.total_window_size = input_width + shift
self.input_slice = slice(0, input_width)
self.input_indices = np.arange(self.total_window_size)[self.input_slice]
self.label_start = self.total_window_size - self.label_width
self.labels_slice = slice(self.label_start, None)
self.label_indices = np.arange(self.total_window_size)[self.labels_slice]
self.label_name = list(self.label_columns_indices.keys())
def split_window(self, data, index_start_of_target=-3, type_of_data="train", seq_sliding=0, features=None):
if features is None:
features = self.generate_split_by_id(data, type_of_data=type_of_data)
if features is None:
return None, None, None, True
max_sliding = features.shape[1] - self.total_window_size
if seq_sliding > max_sliding:
    # this batch has been fully slid over; signal gen() to fetch the next one
    return None, None, features, False
if seq_sliding < 1:
input_slice = self.input_slice
output_slice = self.labels_slice
elif seq_sliding >= 1:
input_slice = slice(0+seq_sliding, self.input_width + seq_sliding)
output_slice = slice(0+seq_sliding, None)
inputs = features[:, input_slice, :index_start_of_target]
labels = features[:, output_slice, :]
if self.label_columns is not None:
labels = tf.stack(  # -1 because the id column has been dropped from features
    [features[:, seq_sliding + self.label_start:seq_sliding + self.total_window_size, self.column_indices[name] - 1]
     for name in self.label_columns],
    axis=-1)
# Slicing doesn't preserve static shape information, so set the shapes
# manually.
inputs.set_shape([None, self.input_width, None])
labels.set_shape([None, self.label_width, len(self.label_columns)])
return inputs, labels, features, False
def generate_split_by_id(self, data, type_of_data):
# to get the appropriate data for continuing on generating new batches
counter, num_unique, unique_ids = self.get_data_info(type_of_data=type_of_data)
start = counter
end = counter+self.batchsize
id_num = []
if end > num_unique:  # stop before we run out of caseIDs; the last partial batch is dropped
    print("§§ Finished a pass over all the data -- resetting counter §§")
    self.set_data_info(type_of_data=type_of_data, counter=0)
    return None
for num in range(start, end):
id_num.append(unique_ids[num])
counter += 1
self.set_data_info(type_of_data=type_of_data, counter=counter)
stacking_blocks = []
max_timesteps = 0
for ids in id_num:
    temp = data[data[self.id_name] == ids].drop(columns=[self.id_name]).to_numpy("float64")
    if temp.shape[0] > max_timesteps:
        max_timesteps = temp.shape[0]
    stacking_blocks.append(temp)
# build the post-padded 3d block of shape (batch, max_timesteps, num_features)
fill_array = np.zeros((len(id_num), max_timesteps, temp.shape[1]))
for sample_idx, sample in enumerate(stacking_blocks):
    # copy each case with a single slice assignment instead of time step by time step
    fill_array[sample_idx, :sample.shape[0]] = sample
return tf.convert_to_tensor(fill_array)
def gen(self, data, type_of_data):
while 1:
# reset the sliding
sliding = 0
features = None
while 1:
input_data, output_data, features, stop_flag = self.split_window(data,
index_start_of_target=self.target_neg_ind,
type_of_data=type_of_data, seq_sliding=sliding, features=features)
sliding += 1
# break when we run out of windows for the current batch
if input_data is None:
break
yield input_data, output_data
if stop_flag:
break
def get_data_info(self, type_of_data=None):
if type_of_data == "train":
counter = self.generated_train_counter
num_unique = self.num_unique_train
unique_ids = self.unique_train_ids
elif type_of_data == "val":
counter = self.generated_val_counter
num_unique = self.num_unique_val
unique_ids = self.unique_val_ids
elif type_of_data == "test":
counter = self.generated_test_counter
num_unique = self.num_unique_test
unique_ids = self.unique_test_ids
return counter, num_unique, unique_ids
def set_data_info(self, type_of_data=None, counter=0):
if type_of_data == "train":
self.generated_train_counter = counter
elif type_of_data == "val":
self.generated_val_counter = counter
elif type_of_data == "test":
self.generated_test_counter = counter
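A hypothetical sketch of how such a generator could be hooked into tf.data for Keras (the window sizes and the "target" label column are placeholders, not my actual configuration):
# placeholder window sizes and label column; adjust to the real event log
window = WindowGenerator(input_width=5, label_width=1, shift=1,
                         label_columns=["target"], batchsize=32)
train_ds = tf.data.Dataset.from_generator(
    lambda: window.gen(train_df, "train"),
    output_types=(tf.float64, tf.float64))
# model.fit(train_ds, steps_per_epoch=..., epochs=...)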
I am extracting entries from the log file and printing them using the code below:
for line in data:
g = re.findall(r'([\d.]+).*?(GET|POST|PUT|DELETE)', line)
print (g)
[('1.1.1.1', 'PUT')]
[('2.2.2.2', 'GET')]
[('1.1.1.1', 'PUT')]
[('2.2.2.2', 'POST')]
How do I add counts to the output so that it looks like this?
Desired output:
1.1.1.1: PUT = 2
2.2.2.2: GET = 1, POST = 1
You could use a dictionary to count:
# initialize the count dict
count_dict= dict()
for line in data:
g = re.findall(r'([\d.]+).*?(GET|POST|PUT|DELETE)', line)
for tup in g:
# get the counts for tuple tup if we don't have it yet
# use 0 (second argument to .get)
num= count_dict.get(tup, 0)
# increase the count and write it back
count_dict[tup]= num+1
# now iterate over the key (tuple) - value (counts)-pairs
# and print the result
for tup, count in count_dict.items():
print(tup, count)
OK, I have to admit this doesn't give the exact output you want, but from this you can proceed in a similar manner:
out_dict= dict()
for (comma_string, request_type), count in count_dict.items():
out_str= out_dict.get(comma_string, '')
sep='' if out_str == '' else ', '
out_str= f'{out_str}{sep}{request_type} = {count}'
out_dict[comma_string]= out_str
for tup, out_str in out_dict.items():
print(tup, out_str)
From your data that outputs:
1.1.1.1 PUT = 2
2.2.2.2 GET = 1, POST = 1
I would look towards Counter.
from collections import Counter
results = []
for line in data:
    g = re.findall(r'([\d.]+).*?(GET|POST|PUT|DELETE)', line)
    if g:  # guard against lines with no match
        results.append(g[0])
ip_list = set(result[0] for result in results)
for ip in ip_list:
print(ip, Counter(result[1] for result in results if result[0] == ip ))
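You could also let Counter count the (ip, method) tuples directly, which avoids filtering results once per IP (a small addition, not part of the answer above):
pair_counts = Counter(results)
# e.g. Counter({('1.1.1.1', 'PUT'): 2, ('2.2.2.2', 'GET'): 1, ('2.2.2.2', 'POST'): 1})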
You can use collections.defaultdict.
Ex:
from collections import defaultdict
result = defaultdict(list)
for line in data:
for ip, method in re.findall(r'([\d.]+).*?(GET|POST|PUT|DELETE)', line):
result[ip].append(method)
for k, v in result.items():
temp = ""
for i in set(v):
temp += " {} = {}".format(i, v.count(i))
print("{}{}".format(k, temp))
from collections import Counter
x = [[('1.1.1.1', 'PUT')],[('2.2.2.2', 'GET')],[('1.1.1.1', 'PUT')],[('2.2.2.2', 'POST')]]
# step 1: convert x into a dict.
m = {}
for i in x:
a, b = i[0]
if a not in m:
    m[a] = [b]
else:
    m[a].append(b)
print('new dict is {}'.format(m))
# step 2 count frequency
m_values = list(m.values())
yy = []
for value_list in m_values:
    x = []
    counts = Counter(value_list)
    for key, val in counts.items():
        x.append(key + '=' + str(val))
    yy.append(x)
# step 3, update the value of the dict
m_keys = list(m.keys())
n = len(m_keys)
for i in range(n):
m[m_keys[i]] = yy[i]
print("final dict is{}".format(m))
Output is
new dict is {'1.1.1.1': ['PUT', 'PUT'], '2.2.2.2': ['GET', 'POST']}
final dict is{'1.1.1.1': ['PUT=2'], '2.2.2.2': ['GET=1', 'POST=1']}
Without dependencies and using a dict for counting, in a very basic way. Given the data_set:
data_set = [[('1.1.1.1', 'PUT')],
[('2.2.2.2', 'GET')],
[('2.2.2.2', 'POST')],
[('1.1.1.1', 'PUT')]]
Initialize the variables (manually, just a few verbs), then iterate over the data:
counter = {'PUT': 0, 'GET': 0, 'POST': 0, 'DELETE': 0}
res = {}
for data in data_set:
    ip, verb = data[0]
    if ip not in res:
        res[ip] = dict(counter)  # copy, so each IP gets its own counts
    res[ip][verb] += 1
print(res)
#=> {'1.1.1.1': {'PUT': 2, 'GET': 0, 'POST': 0, 'DELETE': 0}, '2.2.2.2': {'PUT': 0, 'GET': 1, 'POST': 1, 'DELETE': 0}}
You'll need to format the output to better fit your needs.
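For example, one basic way (adjust as needed) to format res into the requested shape, skipping the zero counts:
for ip, verb_counts in res.items():
    parts = ["{} = {}".format(verb, count) for verb, count in verb_counts.items() if count > 0]
    print("{}: {}".format(ip, ", ".join(parts)))
# 1.1.1.1: PUT = 2
# 2.2.2.2: GET = 1, POST = 1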
I'm writing an input pipeline using tf.data.Dataset. I'd like to use Python code to load and transform my samples; the code returns a dictionary of tensors. Unfortunately, I don't see how I can define that dictionary as the output type that is passed to tf.py_func.
I have a workaround where my function returns a list of tensors instead of a dictionary, but it makes my code less readable, as I have 4 keys in that dict.
The code looks roughly as follows:
file_list = ....
def load(file_name):
    return {"image": np.zeros(..., dtype=np.float32),
            "label": 1.0}  # there are more labels in the original code
ds = tf.data.Dataset.from_tensor_slices(file_list)
ds = ds.shuffle(...)
out_type = [{'image': tf.float32, "label": tf.float32}]  # ????
ds = ds.map(lambda x: tf.py_func(load, [x], out_type))
ds = ds.batch(...)
ds = ds.prefetch(1)
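For context, the list-based workaround mentioned above could look roughly like this, replacing the map step in the snippet (a sketch only; load_as_list is an illustrative name, not my exact code): tf.py_func gets a flat list of output types, and the dict is rebuilt in a plain map afterwards, where dict-structured elements are allowed.
def load_as_list(file_name):
    sample = load(file_name)
    return sample["image"], np.float32(sample["label"])

ds = ds.map(lambda x: tf.py_func(load_as_list, [x], [tf.float32, tf.float32]))
ds = ds.map(lambda image, label: {"image": image, "label": label})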
This answer is in response to Celso Franca's comment.
I did find a way, though not by returning a dict, but rather by using tf_example.SerializeToString().
The two functions were used for processing BERT input on the fly. It worked great and saved me many hours of pre-processing upfront, while not losing any performance in the training process.
def _convert(label, text):
"""Decodes a csv-line to a TensorFlow Example, serialized as a string."""
np_label = label.numpy()
np_text = text.numpy()
tokens_a = tokenizer.tokenize(np_text)
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > seq_length - 2:
tokens_a = tokens_a[0: (seq_length - 2)]
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == seq_length
assert len(input_mask) == seq_length
assert len(segment_ids) == seq_length
label_id = label_map[np_label]
features = collections.OrderedDict()
features["input_ids"] = create_int_feature(input_ids)
features["input_mask"] = create_int_feature(input_mask)
features["segment_ids"] = create_int_feature(segment_ids)
features["label_ids"] = create_int_feature([label_id])
features["is_real_example"] = create_int_feature([int(True)])
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
# tf.py_function only accepts true tf datatypes like string
return tf_example.SerializeToString()
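# Note: _decode_record below parses with a name_to_features spec, and _convert uses
# a create_int_feature helper; neither is shown in this excerpt. In the standard BERT
# input pipeline they look roughly like this (reproduced here as an assumption):
def create_int_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

name_to_features = {
    "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
    "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
    "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
    "label_ids": tf.FixedLenFeature([], tf.int64),
    "is_real_example": tf.FixedLenFeature([], tf.int64),
}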
def _decode_record(record):
"""Decodes a record to a TensorFlow example."""
example = tf.parse_single_example(record, name_to_features)
# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
# So cast all int64 to int32.
for name in list(example.keys()):
t = example[name]
if t.dtype == tf.int64:
t = tf.to_int32(t)
example[name] = t
return example
def input_fn(params):
"""The actual input function."""
filenames = tf.data.Dataset.list_files(file_pattern)
label_col = processor.get_label_col()
text_col = processor.get_text_col()
d = filenames.apply(
tf.contrib.data.parallel_interleave(
lambda filename: tf.data.experimental.CsvDataset(filename,
[tf.float32, tf.string],
select_cols=[label_col, text_col],
field_delim=delimiter,
header=True),
cycle_length=2))
if is_training:
d = d.repeat()
d = d.shuffle(buffer_size=100)
d = d.map(lambda label, text: tf.py_function(_convert, [label, text], tf.string))
d = d.map(_decode_record)
d = d.batch(batch_size=params["batch_size"], drop_remainder=drop_remainder)
return d
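For completeness, a hypothetical usage sketch (model_fn and the batch size here are placeholders, not part of the original answer):
estimator = tf.estimator.Estimator(model_fn=model_fn, params={"batch_size": 32})
estimator.train(input_fn=input_fn, max_steps=1000)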