Memory leak in MiDaS depth map computation? - python

I'm using MiDaS much like in Hugging Face's demo.
My issue is that the RAM usage increases with each depth map computation.
Here is the full code.
#!venv/bin/python3
from pathlib import Path
import psutil
import numpy as np
import torch
import cv2

def make_model():
    model_type = "DPT_BEiT_L_512"  # MiDaS v3.1 - Large
    midas = torch.hub.load("intel-isl/MiDaS", model_type)
    device = torch.device("cuda")
    midas.to(device)
    midas.eval()
    midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
    transform = midas_transforms.dpt_transform
    return {"transform": transform,
            "device": device,
            "midas": midas}
def inference(cv_image, model):
    """Make the inference."""
    transform = model['transform']
    device = model["device"]
    midas = model["midas"]

    input_batch = transform(cv_image).to(device)
    with torch.no_grad():
        prediction = midas(input_batch)
        prediction = torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size=cv_image.shape[:2],
            mode="bilinear",
            align_corners=False,
        ).squeeze()
    output = prediction.cpu().numpy()
    formatted = (output * 255 / np.max(output)).astype('uint8')
    return formatted

# Create Midas "DPT_BEiT_L_512" - MiDaS v3.1 - Large
model = make_model()

image_dir = Path('.') / "all_images"
for image_file in image_dir.iterdir():
    ram_usage = psutil.virtual_memory()[2]
    print("image", ram_usage)
    cv_image = cv2.imread(str(image_file))
    _ = inference(cv_image, model)
In short:
Create the model "DPT_BEiT_L_512"
Define the function inference
Loop over the images in the directory all_images
For each image: cv2.imread
Compute the depth map (do not keep the result in memory)
I see that the RAM usage keeps rising.
Variation:
I've tried reading only one image (so only one cv2.imread) and, in the loop, only adding random noise to that image. Up to the random noise, the inference function always receives the same image.
In that case, the RAM usage is stable.
QUESTIONS:
Where does the memory leak come from?
Do I have to "reset" something between two inferences?
EDIT: some variations
Variation 1: always the same image
Replace the iterdir loop with this:
cv_image = cv2.imread("image.jpg")
for i in range(1, 100):
    ram_usage = psutil.virtual_memory()[2]
    print(i, ram_usage)
    _ = inference(cv_image, model)
Here you get no memory leak.
Variation 2: do not compute the depth map
for image_file in image_dir.iterdir():
    ram_usage = psutil.virtual_memory()[2]
    print("image", ram_usage)
    cv_image = cv2.imread(str(image_file))
    # _ = inference(cv_image, model)
The memory leak does not occur.
I deduce that cv2.imread itself does not cause the leak.
Variation 3: same image, random noise:
cv_image = cv2.imread("image.jpg")
for i in range(1, 100):
    ram_usage = psutil.virtual_memory()[2]
    print(i, ram_usage)
    noise = np.random.randn(
        cv_image.shape[0], cv_image.shape[1], cv_image.shape[2]) * 20
    noisy_img = cv_image + noise
    noisy_img = np.clip(noisy_img, 0, 255)
    _ = inference(noisy_img, model)
No leak in this version.


Why does converting npy files (containing video frames) to tfrecords consume so much disk space?

I am working on a violence detection service. I am trying to develop software based on the code in this repo. My dataset consists of videos residing in two directories, "Violence" and "Non-Violence".
I used this code to generate npy files from the RGB channels and optical flow features. The output of this part is two folders containing npy arrays with shape 244x244x5 (np.float32 dtype), so I have the video frames in RGB in the first 3 channels (npy[...,:3]) and the optical flow features in the next two channels (npy[..., 3:]).
Now I am trying to convert them to tfrecords and use tf.data.TFRecordDataset to speed up the training process. Since my model input has to be a cube tensor, each training element has to be 64 frames of a video, which means each data point's shape has to be 64x244x244x5.
So I used this code to convert the npy files to tfrecords.
from pathlib import Path
from os.path import join
import tensorflow as tf
import numpy as np
import cv2
from tqdm import tqdm

def normalize(data):
    mean = np.mean(data)
    std = np.std(data)
    return (data - mean) / std

def random_flip(video, prob):
    s = np.random.rand()
    if s < prob:
        video = np.flip(m=video, axis=2)
    return video

def color_jitter(video):
    # range of s-component: 0-1
    # range of v component: 0-255
    s_jitter = np.random.uniform(-0.2, 0.2)
    v_jitter = np.random.uniform(-30, 30)
    for i in range(len(video)):
        hsv = cv2.cvtColor(video[i], cv2.COLOR_RGB2HSV)
        s = hsv[..., 1] + s_jitter
        v = hsv[..., 2] + v_jitter
        s[s < 0] = 0
        s[s > 1] = 1
        v[v < 0] = 0
        v[v > 255] = 255
        hsv[..., 1] = s
        hsv[..., 2] = v
        video[i] = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB)
    return video

def uniform_sample(video: np.ndarray, target_frames: int = 64) -> np.ndarray:
    """Sample a video down to target_frames frames, padding with the last frames if needed."""
    len_frames = int(len(video))
    interval = int(np.ceil(len_frames / target_frames))
    # init empty list for sampled video
    sampled_video = []
    for i in range(0, len_frames, interval):
        sampled_video.append(video[i])
    # calculate number of padded frames and fix it
    num_pad = target_frames - len(sampled_video)
    if num_pad > 0:
        padding = [video[i] for i in range(-num_pad, 0)]
        sampled_video += padding
    return np.array(sampled_video, dtype=np.float32)

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

if __name__ == '__main__':
    path = Path('transformed/')
    npy_files = list(path.rglob('*.npy'))[:100]
    aug = True
    # one_hots = to_categorical(range(2), dtype=np.int8)
    path_to_save = 'data_tfrecords'
    tfrecord_path = join(path_to_save, 'all_data.tfrecord')
    with tf.io.TFRecordWriter(tfrecord_path) as writer:
        for file in tqdm(npy_files, desc='files converted'):
            # load npy files
            npy = np.load(file.as_posix(), mmap_mode='r')
            data = np.float32(npy)
            del npy
            # Uniform sampling
            data = uniform_sample(data, target_frames=64)
            # Add augmentation
            if aug:
                data[..., :3] = color_jitter(data[..., :3])
                data = random_flip(data, prob=0.5)
            # Normalization
            data[..., :3] = normalize(data[..., :3])
            data[..., 3:] = normalize(data[..., 3:])
            # Label one hot encoding
            label = 1 if file.parent.stem.startswith('F') else 0
            # label = one_hots[label]
            feature = {'image': _bytes_feature(tf.compat.as_bytes(data.tobytes())),
                       'label': _int64_feature(int(label))}
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
The code works fine, but the real problem is that it consumes too much disk space. My whole dataset of 2000 videos takes 12 GB, but after converting it to npy files it became around 80 GB, and with tfrecords it is now over 120 GB. How can I convert the data in a more efficient way that reduces the space required to store it?
The answer might be too late, but I see you are still saving the raw video frames in your tfrecords file, which is why the file takes so much space.
Try removing the "image" feature from your feature list and instead saving each frame along with its height, width, channels, and so forth.
feature = {'label': _int64_feature(int(label))}
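For a rough sense of where the space goes, here is a back-of-the-envelope sketch using the shapes from the question (64 x 244 x 244 x 5 float32 per example); the raw bytes alone account for the sizes you are seeing:

import numpy as np

# one example, assuming the 64x244x244x5 float32 layout from the question
example = np.zeros((64, 244, 244, 5), dtype=np.float32)
print(example.nbytes / 1e6)         # ~76 MB of raw bytes per video
print(2000 * example.nbytes / 1e9)  # ~152 GB for 2000 videos

Each float32 value costs 4 bytes, which is why the tfrecords end up far larger than the original compressed videos.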

How could I increase the speed?

I am using the code below for an image-processing-related study. The code works fine functionally, but it is too slow: one step takes up to 10 seconds.
I need a faster processing speed to reach the aim.
import numpy
import glob, os
import cv2

input = cv2.imread(path)

def nothing(x):  # for trackbar
    pass

windowName = "Image"
cv2.namedWindow(windowName)
cv2.createTrackbar("coef", windowName, 0, 25000, nothing)
condition = True

while (condition):
    coef = cv2.getTrackbarPos("coef", windowName)
    temp_img = input
    row = temp_img.shape[0]
    col = temp_img.shape[1]
    print(coef)
    red = []
    green = []
    for i in range(row):
        for y in range(col):
            # temp_img[i][y][0] = 0
            temp_img[i][y][1] = temp_img[i][y][1] * (coef / 100)
            temp_img[i][y][1] = temp_img[i][y][2] * (1 - (coef / 100))
            # relative_diff = value_g - value_r
    # temp = cv2.resize(temp, (1000, 800))
    cv2.imshow(windowName, temp_img)
    # cv2.imwrite("output2.jpg", temp)
    print("fin")
    # cv2.waitKey(0)
    if cv2.waitKey(30) >= 0:
        condition = False

cv2.destroyAllWindows()
Does anybody have an idea for achieving a faster result?
It's not entirely clear to me what object temp_img is exactly, but if it behaves like a numpy array, you could replace your loop by
temp_img[:, :, 0] = temp_img[:, :, 1] * (coef / 100)
temp_img[:, :, 1] = temp_img[:, :, 2] * (1 - coef / 100)
which should result in a significant speed-up if your array is large. The implementations of such operations on arrays are very well optimised, whereas Python loops are generally quite slow.
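If you want to convince yourself, here is a rough, machine-dependent comparison on a made-up image-sized array (the array size and coefficient are placeholders, not taken from the question):

import numpy as np
import timeit

img = np.random.randint(0, 256, (1080, 1920, 3), dtype=np.uint8)
coef = 50

def with_loop(a):
    out = a.copy()
    for i in range(out.shape[0]):
        for y in range(out.shape[1]):
            out[i, y, 1] = out[i, y, 1] * (coef / 100)
    return out

def vectorised(a):
    out = a.copy()
    out[:, :, 1] = out[:, :, 1] * (coef / 100)
    return out

print(timeit.timeit(lambda: with_loop(img), number=1))   # seconds for the Python loop
print(timeit.timeit(lambda: vectorised(img), number=1))  # typically hundreds of times faster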
Edit based on comments:
Since you're working with large images and have some expensive operations that need an unscaled version but only need to be executed once, your code could take the following kind of structure:
import ...  # do all your imports

def nothing(x):  # for the trackbar, as in your code
    pass

def expensive_operations(image, *args, **kwargs):
    # do all your expensive operations, like object detection
    ...

def scale_image(image, scale):
    # create a scaled version of image
    ...

def cheap_operations(scaled_image, windowName):
    # perform cheap operations, e.g.
    coef = cv2.getTrackbarPos("coef", windowName)
    temp_img = np.copy(scaled_image)
    temp_img[:, :, 1] = temp_img[:, :, 1] * (coef / 100)
    temp_img[:, :, 2] = temp_img[:, :, 2] * (1 - (coef / 100))
    cv2.imshow(windowName, temp_img)

input = cv2.imread(path)
windowName = "Image"
cv2.namedWindow(windowName)
cv2.createTrackbar("coef", windowName, 0, 25000, nothing)
condition = True

expensive_results = expensive_operations(input)  # possibly with some more args and keyword args
scaled_image = scale_image(input, 0.5)  # pick whatever scale factor suits your display
while condition:
    cheap_operations(scaled_image, windowName)
    if cv2.waitKey(30) >= 0:
        condition = False
cv2.destroyAllWindows()
I do this kind of thing in nip2. It's an image processing spreadsheet that can manipulate huge images quickly. It has no problems doing this kind of operation on any size image at 60fps.
I made you an example workspace: http://www.rollthepotato.net/~john/coeff.ws
Here's what it looks like working on a 1 GB starfield image: you can drag the slider to change coeff, and the processed image updates instantly as you drag. You can zoom and pan around the processed image to check details and adjust coeff.
The underlying image processing library is libvips, which has a Python binding, pyvips. In pyvips, your program would be:
import pyvips
def adjust(image, coeff):
    return image * [1, coeff / 100, 1 - coeff / 100]
Though that's without the GUI elements, of course.
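If it helps, here is a minimal sketch of how that adjust() function might be driven end to end (the file names are placeholders, not from the question):

import pyvips

image = pyvips.Image.new_from_file("input.jpg", access="sequential")
out = adjust(image, 50)           # adjust() as defined above; 50 is an arbitrary coeff
out.write_to_file("output.jpg")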

Python: How to reshape an array based on image input?

I have a function which applies a masking operation to the input images as follows:
file_names = glob(os.path.join(IMAGE_DIR, "*.jpg"))
masks_prediction = np.zeros((2000, 2000, len(file_names)))
for i in range(len(file_names)):
    print(i)
    image = skimage.io.imread(file_names[i])
    predictions = model.detect([image], verbose=1)
    p = predictions[0]
    masks = p['masks']
    merged_mask = np.zeros((masks.shape[0], masks.shape[1]))
    for j in range(masks.shape[2]):
        merged_mask[masks[:,:,j]==True] = True
    masks_prediction[:,:,i] = merged_mask
print(masks_prediction.shape)
So basically it reads all the images from the directory, creates a mask for each and runs the detection.
However, since the images are of different sizes, it does not work:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-10-764e6229811a> in <module>()
10 for j in range(masks.shape[2]):
11 merged_mask[masks[:,:,j]==True] = True
---> 12 masks_prediction[:,:,i] = merged_mask
13 print(masks_prediction.shape)
ValueError: could not broadcast input array from shape (1518,1077) into shape (2000,2000)
I was thinking of a way to know the size of each image before the mask operation is applied (before line 12 in the error message), so that the exact image shape is passed correctly to the masking operation.
Is this somehow possible in Python?
EDIT: So apparently people somehow didn't get what I wanted to achieve, although I genuinely believe it was written in a very simple way. Nevertheless, here is the entire code (copied from an IPython notebook) where the function is located:
import os
import sys
import random
import math
import re
import time
import numpy as np
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import skimage.draw

# Root directory of the project
ROOT_DIR = os.path.abspath("../../")

# Import Mask RCNN
sys.path.append(ROOT_DIR)  # To find local version of the library
from mrcnn import utils
from mrcnn import visualize
from mrcnn.visualize import display_images
import mrcnn.model as modellib
from mrcnn.model import log
from glob import glob
import components
%matplotlib inline

# Directories to be referred
MODEL_DIR = os.path.join(ROOT_DIR, "logs")
IMAGE_DIR = os.path.join(ROOT_DIR, "datasets/components/back/predict")
ANNOTATION_DIR = os.path.join(ROOT_DIR, "datasets/components/front/")
WEIGHTS_PATH = os.path.join(ROOT_DIR, "logs/back/mask_rcnn_components_0100.h5")

config = components.ComponentsConfig()

# Override the training configurations with a few
# changes for inferencing.
class InferenceConfig(config.__class__):
    # Run detection on one image at a time
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1

config = InferenceConfig()
config.display()

# Create model in inference mode
with tf.device(DEVICE):
    model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR,
                              config=config)

# Load weights
print("Loading weights ", WEIGHTS_PATH)
model.load_weights(WEIGHTS_PATH, by_name=True)

file_names = glob(os.path.join(IMAGE_DIR, "*.jpg"))
masks_prediction = np.zeros((2000, 2000, len(file_names)))
for i in range(len(file_names)):
    print(i)
    image = skimage.io.imread(file_names[i])
    predictions = model.detect([image], verbose=1)
    p = predictions[0]
    masks = p['masks']
    merged_mask = np.zeros((masks.shape[0], masks.shape[1]))
    for j in range(masks.shape[2]):
        merged_mask[masks[:,:,j]==True] = True
    masks_prediction[:,:,i] = merged_mask
print(masks_prediction.shape)

dataset = components.ComponentsDataset()
dataset.load_components(ANNOTATION_DIR, "predict")

accuracy = 0
precision = 0
for image_id in range(len(dataset.image_info)):
    name = dataset.image_info[image_id]['id']
    file_name = os.path.join(IMAGE_DIR, name)
    image_id_pred = file_names.index(file_name)
    merged_mask = masks_prediction[:, :, image_id_pred]
    annotated_mask = dataset.load_mask(image_id)[0]
    merged_annotated_mask = np.zeros((510, 510))
    for i in range(annotated_mask.shape[2]):
        merged_annotated_mask[annotated_mask[:,:,i]==True] = True
    accuracy += np.sum(merged_mask==merged_annotated_mask) / (1200 * 1600)
    all_correct = np.sum(merged_annotated_mask[merged_mask == 1])
    precision += all_correct / (np.sum(merged_mask))
print('accuracy:{}'.format(accuracy / len(file_names)))
print('precision:{}'.format(precision / len(file_names)))

file_names = glob(os.path.join(IMAGE_DIR, "*.jpg"))
class_names = ['BG', 'screw', 'lid']
test_image = skimage.io.imread(file_names[random.randint(0, len(file_names)-1)])
predictions = model.detect([test_image], verbose=1)  # We are replicating the same image to fill up the batch_size
p = predictions[0]
visualize.display_instances(test_image, p['rois'], p['masks'], p['class_ids'],
                            class_names, p['scores'])
The image is just a numpy array. So to answer your question "is it possible to know the size of each image": Yes, simply use the shape of the image.
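For the loop in the question, that means sizing each merged mask from the image it came from rather than from a fixed (2000, 2000) array; since the masks then differ in size, a list is a natural container. A minimal sketch reusing the names from the question:

merged_masks = []
for file_name in file_names:
    image = skimage.io.imread(file_name)
    h, w = image.shape[:2]                    # the image's own height and width
    predictions = model.detect([image], verbose=1)
    masks = predictions[0]['masks']
    merged_mask = np.zeros((h, w))
    for j in range(masks.shape[2]):
        merged_mask[masks[:, :, j] == True] = True
    merged_masks.append(merged_mask)          # one mask per image, each with its own shape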
If you are working on many images of different sizes, it might make sense to resize them to a uniform resolution.
skimage has a built-in functionality for that, the skimage.transform.resize method.
Look at the docs here.
If you use resize, you should make sure that no artifacts are introduced to your images. Check the result of the resizing operation before you use it.
The resize of skimage is fairly slow. If you need more performance, you could use opencv. They have a great python API and since there is a conda package, installation has become really easy.
from skimage.transform import resize

resized_images = []
file_names = glob(os.path.join(IMAGE_DIR, "*.jpg"))
for i in range(len(file_names)):
    print("Resizing: " + str(i))
    image = skimage.io.imread(file_names[i])
    image_resized = resize(image, (1200, 800), anti_aliasing=True)
    resized_images.append(image_resized)

Why does Tensorflow-GPU run out of memory mid-epoch? [duplicate]

I'm iteratively deepdreaming images in a directory using the Google's TensorFlow DeepDream implementation (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/deepdream/deepdream.ipynb).
My code is as follows:
model_fn = 'tensorflow_inception_graph.pb'

# creating TensorFlow session and loading the model
graph = tf.Graph()
sess = tf.InteractiveSession(graph=graph)
with tf.gfile.FastGFile(model_fn, 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())
t_input = tf.placeholder(np.float32, name='input')  # define the input tensor
imagenet_mean = 117.0
t_preprocessed = tf.expand_dims(t_input - imagenet_mean, 0)
tf.import_graph_def(graph_def, {'input': t_preprocessed})
def render_deepdream(t_obj, img0=img_noise,
                     iter_n=10, step=1.5, octave_n=4, octave_scale=1.4):
    t_score = tf.reduce_mean(t_obj)  # defining the optimization objective
    t_grad = tf.gradients(t_score, t_input)[0]  # behold the power of automatic differentiation!

    # split the image into a number of octaves
    img = img0
    octaves = []
    for i in range(octave_n-1):
        hw = img.shape[:2]
        lo = resize(img, np.int32(np.float32(hw)/octave_scale))
        hi = img - resize(lo, hw)
        img = lo
        octaves.append(hi)

    # generate details octave by octave
    for octave in range(octave_n):
        if octave > 0:
            hi = octaves[-octave]
            img = resize(img, hi.shape[:2]) + hi
        for i in range(iter_n):
            g = calc_grad_tiled(img, t_grad)
            img += g * (step / (np.abs(g).mean() + 1e-7))
            # print('.', end=' ')
        # clear_output()
        # showarray(img/255.0)
    return img / 255.0
def morphPicture(filename1, filename2, blend, width):
    img1 = PIL.Image.open(filename1)
    img2 = PIL.Image.open(filename2)
    if width != 0:
        img2 = resizePicture(filename2, width)
    finalImage = PIL.Image.blend(img1, img2, blend)
    del img1
    del img2
    return finalImage

def save_array(arr, name, direc, ext="png"):
    img = np.uint8(np.clip(arr, 0, 1) * 255)
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    cv2.imwrite("{d}/{n}.{e}".format(d=direc, n=name, e=ext), img)
    del img
framesDir = "my directory"
os.chdir(framesDir)
outputDir = "my directory"

for file in os.listdir(framesDir):
    img0 = PIL.Image.open(file)
    img0 = np.float32(img0)
    dreamedImage = render_deepdream(tf.square(T('mixed4c')), img0, iter_n=3, octave_n=6)
    save_array(dreamedImage, 1, outputDir, 'jpg')
    break

i = 1
j = 0

with tf.device('/gpu:0'):
    for file in os.listdir(framesDir):
        if j <= 1:  # already processed first image so we skip it here
            j += 1
            continue
        else:
            dreamedImage = "my directory" + str(i) + '.jpg'  # get the previous deep dreamed frame
            img1 = file  # get the next undreamed frame
            morphedImage = morphPicture(dreamedImage, img1, 0.5, 0)  # blend the images
            morphedImage = np.float32(morphedImage)
            dreamedImage = render_deepdream(tf.square(T('mixed4c')), morphedImage, iter_n=3, octave_n=6)  # deep dream a blend of the two frames
            i += 1
            save_array(dreamedImage, i, outputDir, 'jpg')  # save the dreamed image
            del dreamedImage
            del img1
            del morphedImage
            time.sleep(0.5)
Whenever I run the code for more than an hour, the script stops with a MemoryError. I'm assuming there must be a memory leak somewhere, but I'm unable to find it. I thought that by including multiple del statements, I would get rid of the objects that were clogging up the RAM/CPU, but it doesn't seem to be working.
Is there an obvious build-up of objects that I am missing within my code? Or is the build-up somewhere beneath my code, i.e. within tensorflow?
Any help/suggestions would be much appreciated. Thanks.
FYI there are 901 images in the directory. I am using Windows 7 with NVIDIA GeForce GTX 980 Ti.
99% of the time, when using tensorflow, "memory leaks" are actually due to operations that are continuously added to the graph while iterating, instead of building the graph first and then using it in a loop.
The fact that you specify a device (with tf.device('/gpu:0')) for your loop is a hint that this is the case: you typically specify a device for new nodes, as this does not affect nodes that are already defined.
Fortunately, tensorflow has a convenient tool to spot those errors: tf.Graph.finalize. When called, this function prevents further nodes from being added to your graph. It is good practice to call this function before iterating.
So in your case I would call tf.get_default_graph().finalize() before your loop and look for any error it may throw.
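A sketch of what that could look like with the names from the question: build the dream tensors once, freeze the graph, and have the loop (and render_deepdream) reuse the pre-built tensors instead of calling tf.reduce_mean / tf.gradients on every frame:

# build the objective and its gradient once, before iterating
t_obj = tf.square(T('mixed4c'))
t_score = tf.reduce_mean(t_obj)
t_grad = tf.gradients(t_score, t_input)[0]

tf.get_default_graph().finalize()  # any later attempt to add a node now raises an error

for file in os.listdir(framesDir):
    # load / blend the frame as before, then run the octave loop
    # using the pre-built t_score and t_grad (e.g. calc_grad_tiled(img, t_grad))
    ...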

Out of memory converting image files to numpy array

I'm trying to run a loop that iterates through an image folder and returns two numpy arrays: x, which stores the images as numpy arrays, and y, which stores the labels.
A folder can easily have over 40,000 RGB images with dimensions (224, 224).
I have around 12 GB of memory, but after some iterations the used memory just spikes up and everything stops.
What can I do to fix this issue?
def create_set(path, quality):
    x_file = glob.glob(path + '*')
    x = []
    for i, img in enumerate(x_file):
        image = cv2.imread(img, cv2.IMREAD_COLOR)
        x.append(np.asarray(image))
        if i % 50 == 0:
            print('{} - {} images processed'.format(path, i))
    x = np.asarray(x)
    x = x / 255
    y = np.zeros((x.shape[0], 2))
    if quality == 0:
        y[:, 0] = 1
    else:
        y[:, 1] = 1
    return x, y
You just can't load that many images into memory. You're trying to load every file in a given path to memory, by appending them to x.
Try processing them in batches, or if you're doing this for a tensorflow application try writing them to .tfrecords first.
If you want to save some memory, leave the images as np.uint8 rather than casting them to float (which happens automatically when you normalise them with x = x/255).
You also don't need np.asarray in your x.append(np.asarray(image)) line; image is already an array. np.asarray is for converting lists, tuples, etc. to arrays.
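To get a feel for how much the dtype matters, here is a rough estimate using the numbers from the question (40,000 RGB images at 224x224; the counts come from the question, the arithmetic is just a sketch):

n_images, h, w, c = 40_000, 224, 224, 3
uint8_bytes = n_images * h * w * c       # 1 byte per value
float64_bytes = uint8_bytes * 8          # x = x / 255 promotes uint8 to float64
print(uint8_bytes / 1e9)                 # ~6 GB as uint8
print(float64_bytes / 1e9)               # ~48 GB as float64, far beyond 12 GB of RAM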
Edit: a very rough batching example:
def batching_function(imlist, batchsize):
    ims = []
    batch = imlist[:batchsize]
    for image in batch:
        ims.append(image)
        other_processing()
    new_imlist = imlist[batchsize:]
    return ims, new_imlist

def main():
    imlist = all_the_globbing_here()
    for i in range(total_files // batch_size):
        ims, imlist = batching_function(imlist, batch_size)
        process_images(ims)
