How to create an efficient sliding window generator for timeseries event log with caseIDs - python

The solution I have now works, but the downside is that takes over 1 hour to fit a simple model. Most of the training time is lost in the python generator, in the code called WindowGenerator.gen
If we postfix all the data, I would think it would affect the models performance since we will then have zeros for most of the data.
I have an eventlog with data on the format (approximates): (600 000, 500)
The first feature is the group/caseID and the three last are labels. Each case has on average 6 events, but it deviates from 4 all the way to 100. In tensorflow v1 there was a generator that seemed like what I wanted: tf.contrib.training.batch_sequences_with_states
I want to load in batches that get postfixed, my solution was to create an empty 3d array in the desired shape defined by (batchsize, the max length case, num_features)
The idea is to create a sliding window, where we are guessing the target columns by an offset/shift.
Example:
batch_1 = pd.DataFrame([[1,1,0.1,11],
[1,2,0.2,12],
[1,3,0.3,13],
[1,4,0.4,14],
[1,4,0.4,14],
[2,5,0.5,15],
[2,6,0.6,16],
[2,7,0.7,17],
[3,8,0.8,18],
[3,9,0.9,19],
[3,10,0.7,20]], columns=["id", "x1","x2", "target"])
# we want to be on the form before sliding:
[ [[1,1,0.1,11],
[1,2,0.2,12],
[1,3,0.3,13],
[1,4,0.4,14],
[1,4,0.4,14] ],
[[2,5,0.5,15],
[2,6,0.6,16],
[2,7,0.7,17],
[2,7,0.7,17],
[2,0,0,0],
[2,0,0,0] ],
[3,8,0.8,18],
[3,9,0.9,19],
[3,10,0.7,20]
[3,0,0,0],
[3,0,0,0], ]
]
The splitting of data:
unique_caseids = eventvise_training_data.caseid.unique()
np.random.shuffle(unique_caseids)
n = len(unique_caseids)
train_ids = unique_caseids[0:int(n*0.7)]
val_ids = unique_caseids[int(n*0.7):int(n*0.9)]
test_ids = unique_caseids[int(n*0.9):]
train_df = eventvise_training_data[eventvise_training_data.caseid.isin(train_ids)]
val_df = eventvise_training_data[eventvise_training_data.caseid.isin(val_ids)]
test_df = eventvise_training_data[eventvise_training_data.caseid.isin(test_ids)]
The WindowGenerator object
class WindowGenerator(object):
def __init__(self, input_width, label_width, shift,
train_df=train_df, val_df=val_df, test_df=test_df,id_column=0,
label_columns=None, batchsize=32, target_neg_ind=-3):
# Store the raw data.
self.train_df = train_df
self.val_df = val_df
self.test_df = test_df
self.batchsize = batchsize
self.id_column = id_column
self.id_name = id_column
self.unique_ids = unique_ids
self.target_neg_ind = target_neg_ind
self.generated_train_counter = 0
self.generated_val_counter = 0
self.generated_test_counter = 0
if self.unique_ids is None:
if type(id_column) == int:
self.unique_train_ids = train_df[train_df.columns[id_column]].unique()
self.unique_val_ids = val_df[val_df.columns[id_column]].unique()
self.unique_test_ids = test_df[test_df.columns[id_column]].unique()
self.id_name = train_df.columns[id_column]
elif type(id_column) == str:
self.unique_train_ids = train_df[id_column].unique()
self.unique_val_ids = val_df[id_column].unique()
self.unique_test_ids = test_df[id_column].unique()
# need the length of unique ids
self.num_unique_train = len(self.unique_train_ids)
self.num_unique_val = len(self.unique_val_ids)
self.num_unique_test = len(self.unique_test_ids)
# Work out the label column indices.
self.label_columns = label_columns
if label_columns is not None:
self.label_columns_indices = {name: i for i, name in
enumerate(label_columns)}
self.column_indices = {name: i for i, name in
enumerate(train_df.columns)}
# Work out the window parameters.
self.input_width = input_width
self.label_width = label_width
self.shift = shift
self.total_window_size = input_width + shift
self.input_slice = slice(0, input_width)
self.input_indices = np.arange(self.total_window_size)[self.input_slice]
self.label_start = self.total_window_size - self.label_width
self.labels_slice = slice(self.label_start, None)
self.label_indices = np.arange(self.total_window_size)[self.labels_slice]
self.label_name = list(self.label_columns_indices.keys())
def split_window(self, data, index_start_of_target=-3, type_of_data="train", seq_sliding=0, features=None):
seq_sliding = 0
if features is None:
features = self.generate_split_by_id(data, type_of_data=type_of_data)
if features is None:
return None, None, None, True
max_sliding = features.shape[1] - self.total_window_size
if seq_sliding > max_sliding:
return inputs, labels, features, False
if seq_sliding < 1:
input_slice = self.input_slice
output_slice = self.labels_slice
elif seq_sliding >= 1:
input_slice = slice(0+seq_sliding, self.input_width + seq_sliding)
output_slice = slice(0+seq_sliding, None)
inputs = features[:, input_slice, :index_start_of_target]
labels = features[:, output_slice, :]
if self.label_columns is not None:
labels = tf.stack( #-1 since we have removed the id columns
[labels[:, seq_sliding + self.label_start:seq_sliding+self.total_window_size, self.column_indices[name] - 1] for name in self.label_columns],
axis=-1)
# Slicing doesn't preserve static shape information, so set the shapes
# manually.
inputs.set_shape([None, self.input_width, None])
labels.set_shape([None, self.label_width, len(self.label_columns)])
return inputs, labels, features, False
def generate_split_by_id(self, data, type_of_data):
# to get the appropriate data for continuing on generating new batches
counter, num_unique, unique_ids = self.get_data_info(type_of_data=type_of_data)
start = counter
end = counter+self.batchsize
id_num = []
if end > num_unique: # to stop the batch collection before we run out caseIDs
end = num_unique
print("§§Finished training on all the data -- reseting counter§§")
flag = 1
counter = 0
self.set_data_info(type_of_data=type_of_data, counter=counter)
return
for num in range(start, end):
id_num.append(unique_ids[num])
counter += 1
self.set_data_info(type_of_data=type_of_data, counter=counter)
stacking_blocks = []
max_timesteps = 0
for ids in id_num[:]:
temp = data[data[self.id_name] == ids].drop(columns=[self.id_name]).to_numpy("float64")
if temp.shape[0] > max_timesteps:
max_timesteps = temp.shape[0]
stacking_blocks.append(temp)
# will create a postfix 3d-tensor
fill_array = np.zeros((len(id_num),max_timesteps,temp.shape[1]))
for sample_idx, sample in enumerate(stacking_blocks):
for time_step_idx, time_step in enumerate(sample):
fill_array[sample_idx, time_step_idx] = time_step
return tf.stack(fill_array)
def gen(self, data, type_of_data):
while 1:
# reset the sliding
sliding = 0
features = None
while 1:
input_data, output_data, features, stop_flag = self.split_window(data,
index_start_of_target=self.target_neg_ind,
type_of_data=type_of_data, seq_sliding=sliding)
sliding += 1
# break whens we run out of batches
if input_data is None:
break
yield input_data, output_data
if stop_flag:
break
def get_data_info(self, type_of_data=None):
if type_of_data == "train":
counter = self.generated_train_counter
num_unique = self.num_unique_train
unique_ids = self.unique_train_ids
elif type_of_data == "val":
counter = self.generated_val_counter
num_unique = self.num_unique_val
unique_ids = self.unique_val_ids
elif type_of_data == "test":
counter = self.generated_test_counter
num_unique = self.num_unique_test
unique_ids = self.unique_test_ids
return counter, num_unique, unique_ids
def set_data_info(self, type_of_data=None, counter=0):
if type_of_data == "train":
self.generated_train_counter = counter
elif type_of_data == "val":
self.generated_val_counter = counter
elif type_of_data == "test":
self.generated_test_counter = counter

Related

Coco_eval - Average Precision and Recall

I trained my model with maskrcnn and now I need to test it. How can I extract AP and AR and plot the graph, ok I know how to plot with matplotlib, but I need to plot Precision-recall curve but for that don't know how to access AP and AR values. Where are they saved?
I'm using this coco_eval script, and from here I see in function summarize there are print("IoU metric: {}".format(iou_type)) and this I got in output and under that AP and AR results, but I can't find it here in code. Where is this calculation?
coco_eval.py
import json
import tempfile
import numpy as np
import copy
import time
import torch
import torch._six
from pycocotools.cocoeval import COCOeval
from pycocotools.coco import COCO
import pycocotools.mask as mask_util
from collections import defaultdict
import utils
class CocoEvaluator(object):
def __init__(self, coco_gt, iou_types):
assert isinstance(iou_types, (list, tuple))
coco_gt = copy.deepcopy(coco_gt)
self.coco_gt = coco_gt
self.iou_types = iou_types
self.coco_eval = {}
for iou_type in iou_types:
self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
self.img_ids = []
self.eval_imgs = {k: [] for k in iou_types}
def update(self, predictions):
img_ids = list(np.unique(list(predictions.keys())))
self.img_ids.extend(img_ids)
for iou_type in self.iou_types:
results = self.prepare(predictions, iou_type)
coco_dt = loadRes(self.coco_gt, results) if results else COCO()
coco_eval = self.coco_eval[iou_type]
coco_eval.cocoDt = coco_dt
coco_eval.params.imgIds = list(img_ids)
img_ids, eval_imgs = evaluate(coco_eval)
self.eval_imgs[iou_type].append(eval_imgs)
def synchronize_between_processes(self):
for iou_type in self.iou_types:
self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
def accumulate(self):
for coco_eval in self.coco_eval.values():
coco_eval.accumulate()
def summarize(self):
for iou_type, coco_eval in self.coco_eval.items():
print("IoU metric: {}".format(iou_type))
coco_eval.summarize()
def prepare(self, predictions, iou_type):
if iou_type == "bbox":
return self.prepare_for_coco_detection(predictions)
elif iou_type == "segm":
return self.prepare_for_coco_segmentation(predictions)
elif iou_type == "keypoints":
return self.prepare_for_coco_keypoint(predictions)
else:
raise ValueError("Unknown iou type {}".format(iou_type))
def prepare_for_coco_detection(self, predictions):
coco_results = []
for original_id, prediction in predictions.items():
if len(prediction) == 0:
continue
boxes = prediction["boxes"]
boxes = convert_to_xywh(boxes).tolist()
scores = prediction["scores"].tolist()
labels = prediction["labels"].tolist()
coco_results.extend(
[
{
"image_id": original_id,
"category_id": labels[k],
"bbox": box,
"score": scores[k],
}
for k, box in enumerate(boxes)
]
)
return coco_results
def prepare_for_coco_segmentation(self, predictions):
coco_results = []
for original_id, prediction in predictions.items():
if len(prediction) == 0:
continue
scores = prediction["scores"]
labels = prediction["labels"]
masks = prediction["masks"]
masks = masks > 0.5
scores = prediction["scores"].tolist()
labels = prediction["labels"].tolist()
rles = [
mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
for mask in masks
]
for rle in rles:
rle["counts"] = rle["counts"].decode("utf-8")
coco_results.extend(
[
{
"image_id": original_id,
"category_id": labels[k],
"segmentation": rle,
"score": scores[k],
}
for k, rle in enumerate(rles)
]
)
return coco_results
def prepare_for_coco_keypoint(self, predictions):
coco_results = []
for original_id, prediction in predictions.items():
if len(prediction) == 0:
continue
boxes = prediction["boxes"]
boxes = convert_to_xywh(boxes).tolist()
scores = prediction["scores"].tolist()
labels = prediction["labels"].tolist()
keypoints = prediction["keypoints"]
keypoints = keypoints.flatten(start_dim=1).tolist()
coco_results.extend(
[
{
"image_id": original_id,
"category_id": labels[k],
'keypoints': keypoint,
"score": scores[k],
}
for k, keypoint in enumerate(keypoints)
]
)
return coco_results
def convert_to_xywh(boxes):
xmin, ymin, xmax, ymax = boxes.unbind(1)
return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
def merge(img_ids, eval_imgs):
all_img_ids = utils.all_gather(img_ids)
all_eval_imgs = utils.all_gather(eval_imgs)
merged_img_ids = []
for p in all_img_ids:
merged_img_ids.extend(p)
merged_eval_imgs = []
for p in all_eval_imgs:
merged_eval_imgs.append(p)
merged_img_ids = np.array(merged_img_ids)
merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
# keep only unique (and in sorted order) images
merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
merged_eval_imgs = merged_eval_imgs[..., idx]
return merged_img_ids, merged_eval_imgs
def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
img_ids, eval_imgs = merge(img_ids, eval_imgs)
img_ids = list(img_ids)
eval_imgs = list(eval_imgs.flatten())
coco_eval.evalImgs = eval_imgs
coco_eval.params.imgIds = img_ids
coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
#################################################################
# From pycocotools, just removed the prints and fixed
# a Python3 bug about unicode not defined
#################################################################
# Ideally, pycocotools wouldn't have hard-coded prints
# so that we could avoid copy-pasting those two functions
def createIndex(self):
# create index
# print('creating index...')
anns, cats, imgs = {}, {}, {}
imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
if 'annotations' in self.dataset:
for ann in self.dataset['annotations']:
imgToAnns[ann['image_id']].append(ann)
anns[ann['id']] = ann
if 'images' in self.dataset:
for img in self.dataset['images']:
imgs[img['id']] = img
if 'categories' in self.dataset:
for cat in self.dataset['categories']:
cats[cat['id']] = cat
if 'annotations' in self.dataset and 'categories' in self.dataset:
for ann in self.dataset['annotations']:
catToImgs[ann['category_id']].append(ann['image_id'])
# print('index created!')
# create class members
self.anns = anns
self.imgToAnns = imgToAnns
self.catToImgs = catToImgs
self.imgs = imgs
self.cats = cats
maskUtils = mask_util
def loadRes(self, resFile):
"""
Load result file and return a result api object.
Args:
self (obj): coco object with ground truth annotations
resFile (str): file name of result file
Returns:
res (obj): result api object
"""
res = COCO()
res.dataset['images'] = [img for img in self.dataset['images']]
# print('Loading and preparing results...')
# tic = time.time()
if isinstance(resFile, torch._six.string_classes):
anns = json.load(open(resFile))
elif type(resFile) == np.ndarray:
anns = self.loadNumpyAnnotations(resFile)
else:
anns = resFile
assert type(anns) == list, 'results in not an array of objects'
annsImgIds = [ann['image_id'] for ann in anns]
assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
'Results do not correspond to current coco set'
if 'caption' in anns[0]:
imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
for id, ann in enumerate(anns):
ann['id'] = id + 1
elif 'bbox' in anns[0] and not anns[0]['bbox'] == []:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
bb = ann['bbox']
x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
if 'segmentation' not in ann:
ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
ann['area'] = bb[2] * bb[3]
ann['id'] = id + 1
ann['iscrowd'] = 0
elif 'segmentation' in anns[0]:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
# now only support compressed RLE format as segmentation results
ann['area'] = maskUtils.area(ann['segmentation'])
if 'bbox' not in ann:
ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
ann['id'] = id + 1
ann['iscrowd'] = 0
elif 'keypoints' in anns[0]:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for id, ann in enumerate(anns):
s = ann['keypoints']
x = s[0::3]
y = s[1::3]
x1, x2, y1, y2 = np.min(x), np.max(x), np.min(y), np.max(y)
ann['area'] = (x2 - x1) * (y2 - y1)
ann['id'] = id + 1
ann['bbox'] = [x1, y1, x2 - x1, y2 - y1]
# print('DONE (t={:0.2f}s)'.format(time.time()- tic))
res.dataset['annotations'] = anns
createIndex(res)
return res
def evaluate(self):
'''
Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
:return: None
'''
# tic = time.time()
# print('Running per image evaluation...')
p = self.params
# add backward compatibility if useSegm is specified in params
if p.useSegm is not None:
p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
# print('Evaluate annotation type *{}*'.format(p.iouType))
p.imgIds = list(np.unique(p.imgIds))
if p.useCats:
p.catIds = list(np.unique(p.catIds))
p.maxDets = sorted(p.maxDets)
self.params = p
self._prepare()
# loop through images, area range, max detection number
catIds = p.catIds if p.useCats else [-1]
if p.iouType == 'segm' or p.iouType == 'bbox':
computeIoU = self.computeIoU
elif p.iouType == 'keypoints':
computeIoU = self.computeOks
self.ious = {
(imgId, catId): computeIoU(imgId, catId)
for imgId in p.imgIds
for catId in catIds}
evaluateImg = self.evaluateImg
maxDet = p.maxDets[-1]
evalImgs = [
evaluateImg(imgId, catId, areaRng, maxDet)
for catId in catIds
for areaRng in p.areaRng
for imgId in p.imgIds
]
# this is NOT in the pycocotools code, but could be done outside
evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
self._paramsEval = copy.deepcopy(self.params)
# toc = time.time()
# print('DONE (t={:0.2f}s).'.format(toc-tic))
return p.imgIds, evalImgs
#################################################################
# end of straight copy from pycocotools, just removing the prints
#################################################################
And this is my code for evaluation:
def evaluate(model, data_loader, device):
n_threads = torch.get_num_threads()
# FIXME remove this and make paste_masks_in_image run on the GPU
torch.set_num_threads(1)
cpu_device = torch.device("cpu")
model.eval()
metric_logger = utils.MetricLogger(delimiter=" ")
header = 'Test:'
coco = get_coco_api_from_dataset(data_loader.dataset)
iou_types = _get_iou_types(model)
coco_evaluator = CocoEvaluator(coco, iou_types)
for images, targets in metric_logger.log_every(data_loader, 100, header):
images = list(img.to(device) for img in images)
if torch.cuda.is_available():
torch.cuda.synchronize()
model_time = time.time()
outputs = model(images)
outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs]
model_time = time.time() - model_time
res = {target["image_id"].item(): output for target, output in zip(targets, outputs)}
evaluator_time = time.time()
coco_evaluator.update(res)
evaluator_time = time.time() - evaluator_time
metric_logger.update(model_time=model_time, evaluator_time=evaluator_time)
# gather the stats from all processes
metric_logger.synchronize_between_processes()
print("Averaged stats:", metric_logger)
coco_evaluator.synchronize_between_processes()
# accumulate predictions from all images
coco_evaluator.accumulate()
coco_evaluator.summarize()
torch.set_num_threads(n_threads)
return coco_evaluator
This is my results what I got:

Python negative Value Error dimensions are not allowed

I am implementing genetic algorithm but I am facing an error after the first generation with the message: ValueError: negative dimensions are not allowed
I actually change the nfilters parameter from nfilters=[74,27,23] to nfilters=[64,128,256], I don't know if it is due to this parameters.
I declared my class sequential as follow:
class CNN(Sequential):
def __init__(self,nfilters,sfilters):
super().__init__()
tf.random.set_seed(0)
self.add(Conv2D(nfilters[0],kernel_size=(sfilters[0],sfilters[0]),padding='same',activation='relu',input_shape=(50,50,3)))
self.add(MaxPooling2D(pool_size=(2,2),strides=(2,2)))
self.add(Conv2D(nfilters[1],kernel_size=(sfilters[1],sfilters[1]),padding='same',activation='relu'))
self.add(MaxPooling2D(pool_size=(2,2),strides=(2,2)))
self.add(Conv2D(nfilters[2],kernel_size=(sfilters[2],sfilters[2]),padding='same',activation='relu'))
self.add(Conv2D(nfilters[2], kernel_size=(sfilters[2], sfilters[2]), padding='same', activation='relu'))
self.add(Flatten())
self.add(Dropout(0.5))
self.add(Dense(128,activation='relu'))
self.add(Dropout(0.5))
self.add(Dense(128, activation='relu'))
self.add(Dense(num_classes, activation='sigmoid'))
self.compile(loss=keras.losses.binary_crossentropy,
optimizer=tf.optimizers.Adam(learning_rate=0.001),
metrics=['accuracy'])
nfilters = [64,128,256] #nfilters = [74,27,23]
sfilters = [9,3,2] #sfilters = [9,3,2]
Then my class genetic is declared as the following:
class Genetic:
def __init__(self,pop_size,nlayers,max_nfilters,max_sfilters):
self.pop_size = pop_size
self.nlayers = nlayers
self.max_nfilters = max_nfilters
self.max_sfilters = max_sfilters
self.max_acc = 0
self.best_arch = np.zeros((1,6))
self.gen_acc = []
def generate_population(self):
np.random.seed(0)
pop_nlayers = np.random.randint(1,self.max_nfilters,(self.pop_size,self.nlayers))
pop_sfilters = np.random.randint(1,self.max_sfilters,(self.pop_size,self.nlayers))
pop_total = np.concatenate((pop_nlayers,pop_sfilters),axis=1)
return pop_total
def select_parents(self,pop,nparents,fitness):
parents = np.zeros((nparents,pop.shape[1]))
for i in range(nparents):
best = np.argmax(fitness)
parents[i] = pop[best]
fitness[best] = -99999
return parents
def crossover(self,parents):
nchild = self.pop_size - parents.shape[0]
nparents = parents.shape[0]
child = np.zeros((nchild,parents.shape[1]))
for i in range(nchild):
first = i % nparents
second = (i+1) % nparents
child[i,:2] = parents[first][:2]
child[i,2] = parents[second][2]
child[i,3:5] = parents[first][3:5]
child[i,5] = parents[second][5]
return child
def mutation(self,child):
for i in range(child.shape[0]):
val = np.random.randint(1,6)
ind = np.random.randint(1,4) - 1
if child[i][ind] + val > 100:
child[i][ind] -= val
else:
child[i][ind] += val
val = np.random.randint(1,4)
ind = np.random.randint(4,7) - 1
if child[i][ind] + val > 20:
child[i][ind] -= val
else:
child[i][ind] += val
return child
def fitness(self,pop,X,Y,epochs):
pop_acc = []
for i in range(pop.shape[0]):
nfilters = pop[i][0:3]
sfilters = pop[i][3:]
model = CNN(nfilters,sfilters)
#H = model.fit_generator(datagen.flow(X,Y,batch_size=256),epochs=epochs,callbacks=[early_stopping_monitor])
H = model.fit_generator(datagen.flow(X,Y,batch_size=256),steps_per_epoch=len(X_trainRusReshaped) / batch_size,epochs=epochs,validation_data=(X_testRusReshaped, Y_testRusHot),callbacks=[early_stopping_monitor])
acc = H.history['accuracy']
pop_acc.append(max(acc)*100)
if max(pop_acc) > self.max_acc:
self.max_acc = max(pop_acc)
self.best_arch = pop[np.argmax(pop_acc)]
self.gen_acc.append(max(pop_acc))
return pop_acc
def smooth_curve(self,factor,gen):
smoothed_points = []
for point in self.gen_acc:
if smoothed_points:
prev = smoothed_points[-1]
smoothed_points.append(prev*factor + point * (1-factor))
else:
smoothed_points.append(point)
plt.plot(range(gen+1),smoothed_points,'g',label='Smoothed training acc')
plt.xticks(np.arange(gen+1))
plt.legend()
plt.title('Fitness Accuracy vs Generations')
plt.xlabel('Generations')
plt.ylabel('Fitness (%)')
plt.show()
plt.savefig('smoothCurve.png')
When I launch these lines of codes, I have the error after 20 epochs on the first generation:
#Starting Genetic Algoritm
pop_size = 2 #10
nlayers = 3 #3
max_nfilters = 500 #100
max_sfilters = 20
epochs = 20
num_generations = 2 #10
genCNN = Genetic(pop_size,nlayers,max_nfilters,max_sfilters)
pop = genCNN.generate_population()
for i in range(num_generations+1):
pop_acc = genCNN.fitness(pop,X_trainRusReshaped,Y_trainRusHot,epochs)
print('Best Accuracy at the generation {}: {}'.format(i,genCNN.max_acc))
parents = genCNN.select_parents(pop,5,pop_acc.copy())
child = genCNN.crossover(parents)
child = genCNN.mutation(child)
pop = np.concatenate((parents,child),axis=0).astype('int')
Any idea where this error is coming from? I tried to increase max_filters from 100 to 500 but it does not solved anything.

ValueError: Found array with 0 sample(s) (shape=(0, 0)) while a minimum of 2 is required

For data preparation I have this code
# Data preparartion
default_fin['ID'] = default_fin['ID'].astype(str)
credit_risk['ID'] = credit_risk['ID'].astype(str)
credit_risk['person_income'] = credit_risk['person_income'].astype(float)
credit_risk['loan_amnt'] = credit_risk['loan_amnt'].astype(float)
credit_risk['credit_type'] = credit_risk['credit_type'].astype(str)
credit_risk['cb_person_default_on_file'] = credit_risk['cb_person_default_on_file'].astype(str)
# It is important to sort order and products chronologically
credit_risk.sort_values(by=['ID', 'person_income', 'loan_amnt', 'credit_type', 'cb_person_default_on_file'], inplace=True
)
combined_defaulted_by_credit_type = credit_risk.groupby("credit_type").apply(lambda loans: ' '.join(loans['ID'].tolist()))
combined_defaulted_by_credit_type = pd.DataFrame(combined_defaulted_by_credit_type,columns=['all_credit_Ids'])
print(f'Number of credit types: {combined_defaulted_by_credit_type.shape[0]}')
combined_defaulted_by_credit_type.reset_index(inplace=True)
combined_defaulted_by_credit_type.credit_type = combined_defaulted_by_credit_type.credit_type.astype(str)
combined_defaulted_by_credit_type.head()
Then here I build the vocabulary and train the model
TRAIN_USER_MODEL = True # True - create a new model, False - load a previosuly created model
MODEL_DIR = 'models'
if not os.path.exists(MODEL_DIR):
os.makedirs(MODEL_DIR)
embeddings_dim = 200 # dimensionality of user representation
filename = f'models/customer2vec.{embeddings_dim}d.model'
if TRAIN_USER_MODEL:
class TaggedDocumentIterator(object):
def __init__(self, df):
self.df = df
def __iter__(self):
for row in self.df.itertuples():
yield TaggedDocument(words=dict(row._asdict())['all_credit_Ids'].split(), tags=[dict(row._asdict())['credit_type']])
it = TaggedDocumentIterator(combined_defaulted_by_credit_type)
doc_model = gensim.models.doc2vec.Doc2Vec(vector_size=embeddings_dim,
window=5,
min_count=10,
workers=mp.cpu_count(),
alpha=0.055,
min_alpha=0.055,
epochs=120) # use fixed learning rate
train_corpus = list(it)
doc_model.build_vocab(train_corpus)
print(f'Model saved to [{filename}]')
else:
doc_model = Doc2Vec.load(filename)
for epoch in tqdm(range(10)):
doc_model.alpha -= 0.005 # decrease the learning rate
doc_model.min_alpha = doc_model.alpha # fix the learning rate, no decay
doc_model.train(train_corpus, total_examples=doc_model.corpus_count, epochs=doc_model.epochs)
print('Iteration:', epoch)
doc_model.save(filename)
print(f'Model loaded to [{filename}]')
Then I created the doc_vectors here
doc_vectors = doc_model.dv
cust_doc = list(doc_model.dv.key_to_index.keys())
doc_vector_dict = {arg:doc_model.dv[arg] for arg in cust_doc}
X_doc = pd.DataFrame(doc_vector_dict).T.values
X_doc.shape, len(cust_doc), credit_risk["credit_type"].nunique()
Visualization
# Visualize the customer semantic space using TSNE
ids_sample_str = set([str(id) for id in ids_sample])
idx = []
for i, credit_type in enumerate(doc_vector_dict):
if credit_type in ids_sample_str:
idx.append(i)
X_doc_subset = X_doc[idx] # only sampled user IDs
X_doc_subset.shape
distance_matrix_doc = pairwise_distances(X_doc_subset, X_doc_subset, metric='cosine', n_jobs=-1)
tsne_doc = TSNE(metric="precomputed", n_components=2, verbose=1, perplexity=30, n_iter=500)
And here comes the error
ValueError: Found array with 0 sample(s) (shape=(0, 0)) while a minimum of 2 is required.
tsne_results_doc = tsne_doc.fit_transform(distance_matrix_doc)

ID3 Algorithm in Python

I am trying to plot a decision tree using ID3 in Python. I am really new to Python and couldn't understand the implementation of the following code. I need to know how I can apply this code to my data.
from math import log
import operator
def entropy(data):
entries = len(data)
labels = {}
for feat in data:
label = feat[-1]
if label not in labels.keys():
labels[label] = 0
labels[label] += 1
entropy = 0.0
for key in labels:
probability = float(labels[key])/entries
entropy -= probability * log(probability,2)
return entropy
def split(data, axis, val):
newData = []
for feat in data:
if feat[axis] == val:
reducedFeat = feat[:axis]
reducedFeat.extend(feat[axis+1:])
newData.append(reducedFeat)
return newData
def choose(data):
features = len(data[0]) - 1
baseEntropy = entropy(data)
bestInfoGain = 0.0;
bestFeat = -1
for i in range(features):
featList = [ex[i] for ex in data]
uniqueVals = set(featList)
newEntropy = 0.0
for value in uniqueVals:
newData = split(data, i, value)
probability = len(newData)/float(len(data))
newEntropy += probability * entropy(newData)
infoGain = baseEntropy - newEntropy
if (infoGain > bestInfoGain):
bestInfoGain = infoGain
bestFeat = i
return bestFeat
def majority(classList):
classCount={}
for vote in classList:
if vote not in classCount.keys(): classCount[vote] = 0
classCount[vote] += 1
sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
return sortedClassCount[0][0]
def tree(data,labels):
classList = [ex[-1] for ex in data]
if classList.count(classList[0]) == len(classList):
return classList[0]
if len(data[0]) == 1:
return majority(classList)
bestFeat = choose(data)
bestFeatLabel = labels[bestFeat]
theTree = {bestFeatLabel:{}}
del(labels[bestFeat])
featValues = [ex[bestFeat] for ex in data]
uniqueVals = set(featValues)
for value in uniqueVals:
subLabels = labels[:]
theTree[bestFeatLabel][value] = tree(split/(data, bestFeat, value),subLabels)
return theTree
So what I did after this is the following:
infile=open("SData.csv","r")
data=infile.read()
tree(data)
The error which I got is "1 argument is missing" which is the label which I have to define and this is where I don't know what I have to put. I tried the variable for which I have to make the decision tree but it doesn't work:
tree(data,MinTemp)
Here I get an error "MinTemp is not defined".
Please help me out and let me know what I should do to have a look at the tree.
Following is the part of data and I want to generate a tree for MinTemp
MinTemp,Rainfall,Tempat9,RHat9,CAat9,WSat9
high,no,mild,normal,overcast,weak
high,no,mild,normal,cloudy,weak
high,no,mild,normal,cloudy,mild
high,yes,mild,high,cloudy,weak
high,yes,mild,high,cloudy,mild
medium,yes,mild,high,cloudy,mild
high,no,mild,high,overcast,weak
high,no,mild,normal,sunny,weak
high,no,hot,normal,sunny,weak
high,no,hot,normal,overcast,weak

How to add a member function to an existing Python object?

Previously I created a lot of Python objects of class A, and I would like to add a new function plotting_in_PC_space_with_coloring_option() (the purpose of this function is to plot some data in this object) to class A and use those old objects to call plotting_in_PC_space_with_coloring_option().
An example is:
import copy
import numpy as np
from math import *
from pybrain.structure import *
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.datasets.supervised import SupervisedDataSet
import pickle
import neural_network_related
class A(object):
"""the neural network for simulation"""
'''
todo:
- find boundary
- get_angles_from_coefficients
'''
def __init__(self,
index, # the index of the current network
list_of_coor_data_files, # accept multiple files of training data
energy_expression_file, # input, output files
preprocessing_settings = None,
connection_between_layers = None, connection_with_bias_layers = None,
PCs = None, # principal components
):
self._index = index
self._list_of_coor_data_files = list_of_coor_data_files
self._energy_expression_file = energy_expression_file
self._data_set = []
for item in list_of_coor_data_files:
self._data_set += self.get_many_cossin_from_coordiantes_in_file(item)
self._preprocessing_settings = preprocessing_settings
self._connection_between_layers = connection_between_layers
self._connection_with_bias_layers = connection_with_bias_layers
self._node_num = [8, 15, 2, 15, 8]
self._PCs = PCs
def save_into_file(self, filename = None):
if filename is None:
filename = "network_%s.pkl" % str(self._index) # by default naming with its index
with open(filename, 'wb') as my_file:
pickle.dump(self, my_file, pickle.HIGHEST_PROTOCOL)
return
def get_cossin_from_a_coordinate(self, a_coordinate):
num_of_coordinates = len(a_coordinate) / 3
a_coordinate = np.array(a_coordinate).reshape(num_of_coordinates, 3)
diff_coordinates = a_coordinate[1:num_of_coordinates, :] - a_coordinate[0:num_of_coordinates - 1,:] # bond vectors
diff_coordinates_1=diff_coordinates[0:num_of_coordinates-2,:];diff_coordinates_2=diff_coordinates[1:num_of_coordinates-1,:]
normal_vectors = np.cross(diff_coordinates_1, diff_coordinates_2);
normal_vectors_normalized = np.array(map(lambda x: x / sqrt(np.dot(x,x)), normal_vectors))
normal_vectors_normalized_1 = normal_vectors_normalized[0:num_of_coordinates-3, :];normal_vectors_normalized_2 = normal_vectors_normalized[1:num_of_coordinates-2,:];
diff_coordinates_mid = diff_coordinates[1:num_of_coordinates-2]; # these are bond vectors in the middle (remove the first and last one), they should be perpendicular to adjacent normal vectors
cos_of_angles = range(len(normal_vectors_normalized_1))
sin_of_angles_vec = range(len(normal_vectors_normalized_1))
sin_of_angles = range(len(normal_vectors_normalized_1)) # initialization
for index in range(len(normal_vectors_normalized_1)):
cos_of_angles[index] = np.dot(normal_vectors_normalized_1[index], normal_vectors_normalized_2[index])
sin_of_angles_vec[index] = np.cross(normal_vectors_normalized_1[index], normal_vectors_normalized_2[index])
sin_of_angles[index] = sqrt(np.dot(sin_of_angles_vec[index], sin_of_angles_vec[index])) * np.sign(sum(sin_of_angles_vec[index]) * sum(diff_coordinates_mid[index]));
return cos_of_angles + sin_of_angles
def get_many_cossin_from_coordinates(self, coordinates):
return map(self.get_cossin_from_a_coordinate, coordinates)
def get_many_cossin_from_coordiantes_in_file (self, filename):
coordinates = np.loadtxt(filename)
return self.get_many_cossin_from_coordinates(coordinates)
def mapminmax(self, my_list): # for preprocessing in network
my_min = min(my_list)
my_max = max(my_list)
mul_factor = 2.0 / (my_max - my_min)
offset = (my_min + my_max) / 2.0
result_list = np.array(map(lambda x : (x - offset) * mul_factor, my_list))
return (result_list, (mul_factor, offset)) # also return the parameters for processing
def get_mapminmax_preprocess_result_and_coeff(self,data=None):
if data is None:
data = self._data_set
data = np.array(data)
data = np.transpose(data)
result = []; params = []
for item in data:
temp_result, preprocess_params = self.mapminmax(item)
result.append(temp_result)
params.append(preprocess_params)
return (np.transpose(np.array(result)), params)
def mapminmax_preprocess_using_coeff(self, input_data=None, preprocessing_settings=None):
# try begin
if preprocessing_settings is None:
preprocessing_settings = self._preprocessing_settings
temp_setttings = np.transpose(np.array(preprocessing_settings))
result = []
for item in input_data:
item = np.multiply(item - temp_setttings[1], temp_setttings[0])
result.append(item)
return result
# try end
def get_expression_of_network(self, connection_between_layers=None, connection_with_bias_layers=None):
if connection_between_layers is None:
connection_between_layers = self._connection_between_layers
if connection_with_bias_layers is None:
connection_with_bias_layers = self._connection_with_bias_layers
node_num = self._node_num
expression = ""
# first part: network
for i in range(2):
expression = '\n' + expression
mul_coef = connection_between_layers[i].params.reshape(node_num[i + 1], node_num[i])
bias_coef = connection_with_bias_layers[i].params
for j in range(np.size(mul_coef, 0)):
temp_expression = 'layer_%d_unit_%d = tanh( ' % (i + 1, j)
for k in range(np.size(mul_coef, 1)):
temp_expression += ' %f * layer_%d_unit_%d +' % (mul_coef[j, k], i, k)
temp_expression += ' %f);\n' % (bias_coef[j])
expression = temp_expression + expression # order of expressions matter in OpenMM
# second part: definition of inputs
index_of_backbone_atoms = [2, 5, 7, 9, 15, 17, 19];
for i in range(len(index_of_backbone_atoms) - 3):
index_of_coss = i
index_of_sins = i + 4
expression += 'layer_0_unit_%d = (raw_layer_0_unit_%d - %f) * %f;\n' % \
(index_of_coss, index_of_coss, self._preprocessing_settings[index_of_coss][1], self._preprocessing_settings[index_of_coss][0])
expression += 'layer_0_unit_%d = (raw_layer_0_unit_%d - %f) * %f;\n' % \
(index_of_sins, index_of_sins, self._preprocessing_settings[index_of_sins][1], self._preprocessing_settings[index_of_sins][0])
expression += 'raw_layer_0_unit_%d = cos(dihedral_angle_%d);\n' % (index_of_coss, i)
expression += 'raw_layer_0_unit_%d = sin(dihedral_angle_%d);\n' % (index_of_sins, i)
expression += 'dihedral_angle_%d = dihedral(p%d, p%d, p%d, p%d);\n' % \
(i, index_of_backbone_atoms[i], index_of_backbone_atoms[i+1],index_of_backbone_atoms[i+2],index_of_backbone_atoms[i+3])
return expression
def write_expression_into_file(self, out_file = None):
if out_file is None: out_file = self._energy_expression_file
expression = self.get_expression_of_network()
with open(out_file, 'w') as f_out:
f_out.write(expression)
return
def get_mid_result(self, input_data=None, connection_between_layers=None, connection_with_bias_layers=None):
if input_data is None: input_data = self._data_set
if connection_between_layers is None: connection_between_layers = self._connection_between_layers
if connection_with_bias_layers is None: connection_with_bias_layers = self._connection_with_bias_layers
node_num = self._node_num
temp_mid_result = range(4)
mid_result = []
# first need to do preprocessing
for item in self.mapminmax_preprocess_using_coeff(input_data, self._preprocessing_settings):
for i in range(4):
mul_coef = connection_between_layers[i].params.reshape(node_num[i + 1], node_num[i]) # fix node_num
bias_coef = connection_with_bias_layers[i].params
previous_result = item if i == 0 else temp_mid_result[i - 1]
temp_mid_result[i] = np.dot(mul_coef, previous_result) + bias_coef
if i != 3: # the last output layer is a linear layer, while others are tanh layers
temp_mid_result[i] = map(tanh, temp_mid_result[i])
mid_result.append(copy.deepcopy(temp_mid_result)) # note that should use deepcopy
return mid_result
def get_PC_and_save_it_to_network(self):
'''get PCs and save the result into _PCs
'''
mid_result = self.get_mid_result()
self._PCs = [item[1] for item in mid_result]
return
def train(self):
####################### set up autoencoder begin #######################
node_num = self._node_num
in_layer = LinearLayer(node_num[0], "IL")
hidden_layers = [TanhLayer(node_num[1], "HL1"), TanhLayer(node_num[2], "HL2"), TanhLayer(node_num[3], "HL3")]
bias_layers = [BiasUnit("B1"),BiasUnit("B2"),BiasUnit("B3"),BiasUnit("B4")]
out_layer = LinearLayer(node_num[4], "OL")
layer_list = [in_layer] + hidden_layers + [out_layer]
molecule_net = FeedForwardNetwork()
molecule_net.addInputModule(in_layer)
for item in (hidden_layers + bias_layers):
molecule_net.addModule(item)
molecule_net.addOutputModule(out_layer)
connection_between_layers = range(4); connection_with_bias_layers = range(4)
for i in range(4):
connection_between_layers[i] = FullConnection(layer_list[i], layer_list[i+1])
connection_with_bias_layers[i] = FullConnection(bias_layers[i], layer_list[i+1])
molecule_net.addConnection(connection_between_layers[i]) # connect two neighbor layers
molecule_net.addConnection(connection_with_bias_layers[i])
molecule_net.sortModules() # this is some internal initialization process to make this module usable
####################### set up autoencoder end #######################
trainer = BackpropTrainer(molecule_net, learningrate=0.002,momentum=0.4,verbose=False, weightdecay=0.1, lrdecay=1)
data_set = SupervisedDataSet(node_num[0], node_num[4])
sincos = self._data_set
(sincos_after_process, self._preprocessing_settings) = self.get_mapminmax_preprocess_result_and_coeff(data = sincos)
for item in sincos_after_process: # is it needed?
data_set.addSample(item, item)
trainer.trainUntilConvergence(data_set, maxEpochs=50)
self._connection_between_layers = connection_between_layers
self._connection_with_bias_layers = connection_with_bias_layers
print("Done!\n")
return
def create_sge_files_for_simulation(self,potential_centers = None):
if potential_centers is None:
potential_centers = self.get_boundary_points()
neural_network_related.create_sge_files(potential_centers)
return
def get_boundary_points(self, list_of_points = None, num_of_bins = 5):
if list_of_points is None: list_of_points = self._PCs
x = [item[0] for item in list_of_points]
y = [item[1] for item in list_of_points]
temp = np.histogram2d(x,y, bins=[num_of_bins, num_of_bins])
hist_matrix = temp[0]
# add a set of zeros around this region
hist_matrix = np.insert(hist_matrix, num_of_bins, np.zeros(num_of_bins), 0)
hist_matrix = np.insert(hist_matrix, 0, np.zeros(num_of_bins), 0)
hist_matrix = np.insert(hist_matrix, num_of_bins, np.zeros(num_of_bins + 2), 1)
hist_matrix = np.insert(hist_matrix, 0, np.zeros(num_of_bins +2), 1)
hist_matrix = (hist_matrix != 0).astype(int)
sum_of_neighbors = np.zeros(np.shape(hist_matrix)) # number of neighbors occupied with some points
for i in range(np.shape(hist_matrix)[0]):
for j in range(np.shape(hist_matrix)[1]):
if i != 0: sum_of_neighbors[i,j] += hist_matrix[i - 1][j]
if j != 0: sum_of_neighbors[i,j] += hist_matrix[i][j - 1]
if i != np.shape(hist_matrix)[0] - 1: sum_of_neighbors[i,j] += hist_matrix[i + 1][j]
if j != np.shape(hist_matrix)[1] - 1: sum_of_neighbors[i,j] += hist_matrix[i][j + 1]
bin_width_0 = temp[1][1]-temp[1][0]
bin_width_1 = temp[2][1]-temp[2][0]
min_coor_in_PC_space_0 = temp[1][0] - 0.5 * bin_width_0 # multiply by 0.5 since we want the center of the grid
min_coor_in_PC_space_1 = temp[2][0] - 0.5 * bin_width_1
potential_centers = []
for i in range(np.shape(hist_matrix)[0]):
for j in range(np.shape(hist_matrix)[1]):
if hist_matrix[i,j] == 0 and sum_of_neighbors[i,j] != 0: # no points in this block but there are points in neighboring blocks
temp_potential_center = [round(min_coor_in_PC_space_0 + i * bin_width_0, 2), round(min_coor_in_PC_space_1 + j * bin_width_1, 2)]
potential_centers.append(temp_potential_center)
return potential_centers
# this function is added after those old objects of A were created
def plotting_in_PC_space_with_coloring_option(self,
list_of_coordinate_files_for_plotting=None, # accept multiple files
color_option='pure'):
'''
by default, we are using training data, and we also allow external data input
'''
if list_of_coordinate_files_for_plotting is None:
PCs_to_plot = self._PCs
else:
temp_sincos = []
for item in list_of_coordinate_files_for_plotting:
temp_sincos += self.get_many_cossin_from_coordiantes_in_file(item)
temp_mid_result = self.get_mid_result(input_data = temp_sincos)
PCs_to_plot = [item[1] for item in temp_mid_result]
(x, y) = ([item[0] for item in PCs_to_plot], [item[1] for item in PCs_to_plot])
# coloring
if color_option == 'pure':
coloring = 'red'
elif color_option == 'step':
coloring = range(len(x))
fig, ax = plt.subplots()
ax.scatter(x,y, c=coloring)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
plt.show()
return
But it seems that plotting_in_PC_space_with_coloring_option() was not binded to those old objects, is here any way to fix it (I do not want to recreate these objects since creation involves CPU-intensive calculation and would take very long time to do it)?
Thanks!
Something like this:
class A:
def q(self): print 1
a = A()
def f(self): print 2
setattr(A, 'f', f)
a.f()
This is called a monkey patch.

Categories