I have a script using ray like this:
import ray
from PIL import Image
ray.init(
object_store_memory=1000 * 1024 * 1024 * 100,
ignore_reinit_error=True,
num_cpus=128,
num_gpus=1,
)
img_paths = np.array([200k image paths])
#ray.remote
def read_img(path):
img = np.asarray(Image.open(path))
return img
images = ray.get([read_img.remote(path) for img_path in img_paths[:10000]])
When I process ~5000 images via img_paths[:5000], this program executes in about 5 seconds. When I bump this up to ~10000, the program takes 4 minutes to execute and gives me messages like:
(raylet) Spilled 132187 MiB, 12533 objects, write throughput 1052 MiB/s.
This is my first time using ray, so I'm not sure how to prevent this from happening.
Ended up solving this by processing my images in batches. The system wasn't allocating the requested 100 GB of ram, so objects were still spilling. Looks something like this:
refs = [read_img.remote(path) for path in paths]
images = np.empty((1920, 1920))
for i in range(len(refs) // 5000 + 1):
images = np.vstack(
(images, np.array(ray.get(refs[i * 5000 : (i + 1) * 5000])))
)
Related
I'm using Midas very like in Huggingface's demo.
My issue is that the RAM usage increase at each depth map computation.
Here is the full code.
#!venv/bin/python3
from pathlib import Path
import psutil
import numpy as np
import torch
import cv2
def make_model():
model_type = "DPT_BEiT_L_512" # MiDaS v3.1 - Large
midas = torch.hub.load("intel-isl/MiDaS", model_type)
device = torch.device("cuda")
midas.to(device)
midas.eval()
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
transform = midas_transforms.dpt_transform
return {"transform": transform,
"device": device,
"midas": midas
}
def inference(cv_image, model):
"""Make the inference."""
transform = model['transform']
device = model["device"]
midas = model["midas"]
input_batch = transform(cv_image).to(device)
with torch.no_grad():
prediction = midas(input_batch)
prediction = torch.nn.functional.interpolate(
prediction.unsqueeze(1),
size=cv_image.shape[:2],
mode="bilinear",
align_corners=False,
).squeeze()
output = prediction.cpu().numpy()
formatted = (output * 255 / np.max(output)).astype('uint8')
return formatted
# Create Midas "DPT_BEiT_L_512" - MiDaS v3.1 - Large
model = make_model()
image_dir = Path('.') / "all_images"
for image_file in image_dir.iterdir():
ram_usage = psutil.virtual_memory()[2]
print("image", ram_usage)
cv_image = cv2.imread(str(image_file))
_ = inference(cv_image, model)
In short:
Create the model "DPT_BEiT_L_512"
Define the function inference
loop over the images in the directory all_images
for each: cv2.imread
compute the depthmap (do not keep the result in memory)
I see that the RAM usage keeps raising over and over.
Variation:
I've tried to read only one image (so only one cv2.imread) and, in the loop, only add a random noise on that image. Up to random noise, the inference function always receive the same image.
In that case, the RAM usage is stable.
QUESTIONS:
Where does the memory leak come from ?
Do I have to "reset" something between two inferences ?
EDIT some variations
variation 1: always the same image
replace the iterdir loop by this:
cv_image = cv2.imread("image.jpg")
for i in range(1, 100):
ram_usage = psutil.virtual_memory()[2]
print(i, ram_usage)
_ = get_depthmap(cv_image, model)
Here you get no memory leak.
Variation 2: do not compute the depth map
for image_file in image_dir.iterdir():
ram_usage = psutil.virtual_memory()[2]
print("image", ram_usage)
cv_image = cv2.imread(str(image_file))
# _ = get_depthmap(cv_image, model)
The memory leak does not occurs.
I deduce that cv2.imread itself does not makes the leak.
Variation 3: same image, random noise:
cv_image = cv2.imread("image.jpg")
for i in range(1, 100):
ram_usage = psutil.virtual_memory()[2]
print(i, ram_usage)
noise = np.random.randn(
cv_image.shape[0], cv_image.shape[1], cv_image.shape[2]) * 20
noisy_img = cv_image + noise
noisy_img = np.clip(noisy_img, 0, 255)
_ = get_depthmap(noisy_img, model)
No leak in this version.
python yolo v4 algo for object detection
assume I have 50 cameras, I need to do object detection in all 50 cameras. It shouldn't run in series. all 50 should run in parallel. how to do this. for Realtime purpose. I tried multithreading and processing but I am very very bad at it. I am complete beginner to python and this seems very hard for me.
I know none of us have 50 camera to check so I created a path variable where location of 50 images are specified. just need to run 50 images in parallel for object detection
import os, time
import cv2
coco_classes = ["car", "plate", "motorcycle"]
net = cv2.dnn.readNet("custom.weights", "custom.cfg")
model = cv2.dnn_DetectionModel(net)
model.setInputParams(size=(416, 416), scale=1 / 255, swapRB=True)
path = './img/'
for fn in os.listdir(path):
image = cv2.imread(path + fn)
t = time.time()
c, v, b = model.detect(image, 0.2, 0.4)
t = time.time() - t
c = [coco_classes[x] for x in c]
print('{}ms : '.format(int(t * 1000)), list(zip(c, v)))```
I am using below code for an image processing related study. The code works fine as functionality but it is too slow that one step takes up to 10 seconds.
I need faster process speed to reach at the aim.
import numpy
import glob, os
import cv2
import os
input = cv2.imread(path)
def nothing(x): # for trackbar
pass
windowName = "Image"
cv2.namedWindow(windowName)
cv2.createTrackbar("coef", windowName, 0, 25000, nothing)
condition = True
while (condition):
coef = cv2.getTrackbarPos("coef", windowName)
temp_img = input
row = temp_img.shape[0]
col = temp_img.shape[1]
print(coef)
red = []
green = []
for i in range(row):
for y in range(col):
# temp_img[i][y][0] = 0
temp_img[i][y][1] = temp_img[i][y][1]* (coef / 100)
temp_img[i][y][1] = temp_img[i][y][2] * (1 - (coef / 100))
# relative_diff = value_g - value_r
# temp =cv2.resize(temp,(1000,800))
cv2.imshow(windowName, temp_img)
# cv2.imwrite("output2.jpg", temp)
print("fin")
# cv2.waitKey(0)
if cv2.waitKey(30) >= 0:
condition = False
cv2.destroyAllWindows()
Is there anybody have an idea having faster result on the aim?
It's not entirely clear to me what object temp_img is exactly, but if it behaves like a numpy array, you could replace your loop by
temp_img[:,:,0] = temp_img[:,:,1]*(coef/100)
temp_img[:,:,1] = temp_img[:,:,2]*(1-coef/1000)
which should result in a significant speed up if your array is large. The implementation of such operations on arrays are optimised very well, whereas python loops are generally quite slow.
Edit based on comments:
Since you're working with large images and have some expensive operations that need an unscaled version but only need to be executed once, your code could get the following kind of structure
import... #do all your imports
def expensive_operations(image, *args, **kwargs):
#do all your expensive operations like object detection
def scale_image(image, scale):
#create a scaled version of image
def cheap_operations(scaled_image, windowName):
#perform cheap operations, e.g.
coef = cv2.getTrackbarPos("coef", windowName)
temp_img = np.copy(scaled_image)
temp_img[:,:,1] = temp_img[:,:,1]* (coef / 100)
temp_img[:,:,2] = temp_img[:,:,2] * (1 - (coef / 100))
cv2.imshow(windowName, temp_img)
input = cv2.imread(path)
windowName = "Image"
cv2.namedWindow(windowName)
cv2.createTrackbar("coef", windowName, 0, 25000, nothing)
condition = True
expensive_results = expensive_operations(input) #possibly with some more args and keyword args
scaled_image = scale_image(input)
while condition:
cheap_operations(scaled_image, windowName)
if cv2.waitKey(30) >= 0:
condition = False
cv2.destroyAllWindows()
I do this kind of thing in nip2. It's an image processing spreadsheet that can manipulate huge images quickly. It has no problems doing this kind of operation on any size image at 60fps.
I made you an example workspace: http://www.rollthepotato.net/~john/coeff.ws
Here's what it looks like working on a 1gb starfield image:
You can drag the slider to change coeff. The processed image updates instantly as you drag. You can zoom and pan around the processed image to check details and adjust coeff.
The underlying image processing library is libvips, which has a Python binding, pyvips. In pyvips, your program would be:
import pyvips
def adjust(image, coeff):
return image * [1, coeff / 100, 1 - coeff / 100]
Though that's without the GUI elements, of course.
I am working on video editor for raspberry pi, and I have a problem with speed of placing image over image. Currently, using imagemagick it takes up to 10 seconds just to place one image over another, using 1080x1920 png images, on raspberry pi, and that's way too much. With the number of images time goes up as well. Any ideas on how to speed it up?
Imagemagick code:
composite -blend 90 img1.png img2.png new.png
Video editor with yet slow opacity support here
--------EDIT--------
slightly faster way:
import numpy as np
from PIL import Image
size_X, size_Y = 1920, 1080# put images resolution, else output may look wierd
image1 = np.resize(np.asarray(Image.open('img1.png').convert('RGB')), (size_X, size_Y, 3))
image2 = np.resize(np.asarray(Image.open('img2.png').convert('RGB')), (size_X, size_Y, 3))
output = image1*transparency+image2*(1-transparency)
Image.fromarray(np.uint8(output)).save('output.png')
My Raspberry Pi is unavailable at the moment - all I am saying is that there was some smoke involved and I do software, not hardware! As a result, I have only tested this on a Mac. It uses Numba.
First I used your Numpy code on these 2 images:
and
Then I implemented the same thing using Numba. The Numba version runs 5.5x faster on my iMac. As the Raspberry Pi has 4 cores, you could try experimenting with:
#jit(nopython=True,parallel=True)
def method2(image1,image2,transparency):
...
Here is the code:
#!/usr/bin/env python3
import numpy as np
from PIL import Image
import numba
from numba import jit
def method1(image1,image2,transparency):
result = image1*transparency+image2*(1-transparency)
return result
#jit(nopython=True)
def method2(image1,image2,transparency):
h, w, c = image1.shape
for y in range(h):
for x in range(w):
for z in range(c):
image1[y][x][z] = image1[y][x][z] * transparency + (image2[y][x][z]*(1-transparency))
return image1
i1 = np.array(Image.open('image1.jpg').convert('RGB'))
i2 = np.array(Image.open('image2.jpg').convert('RGB'))
res = method1(i1,i2,0.4)
res = method2(i1,i2,0.4)
Image.fromarray(np.uint8(res)).save('result.png')
The result is:
Other thoughts... I did the composite in-place, overwriting the input image1 to try and save cache space. That may help or hinder - please experiment. I may not have processed the pixels in the optimal order - please experiment.
Just as another option, I tried in pyvips (full disclosure: I'm the pyvips maintainer, so I'm not very neutral):
#!/usr/bin/python3
import sys
import time
import pyvips
start = time.time()
a = pyvips.Image.new_from_file(sys.argv[1], access="sequential")
b = pyvips.Image.new_from_file(sys.argv[2], access="sequential")
out = a * 0.2 + b * 0.8
out.write_to_file(sys.argv[3])
print("pyvips took {} milliseconds".format(1000 * (time.time() - start)))
pyvips is a "pipeline" image processing library, so that code will execute the load, processing and save all in parallel.
On this two core, four thread i5 laptop using Mark's two test images I see:
$ ./overlay-vips.py blobs.jpg ships.jpg x.jpg
took 39.156198501586914 milliseconds
So 39ms for two jpg loads, processing and one jpg save.
You can time just the blend part by copying the source images and the result to memory, like this:
a = pyvips.Image.new_from_file(sys.argv[1]).copy_memory()
b = pyvips.Image.new_from_file(sys.argv[2]).copy_memory()
start = time.time()
out = (a * 0.2 + b * 0.8).copy_memory()
print("pyvips between memory buffers took {} milliseconds"
.format(1000 * (time.time() - start)))
I see:
$ ./overlay-vips.py blobs.jpg ships.jpg x.jpg
pyvips between memory buffers took 15.432596206665039 milliseconds
numpy is about 60ms on this same test.
I tried a slight variant of Mark's nice numba example:
#!/usr/bin/python3
import sys
import time
import numpy as np
from PIL import Image
import numba
from numba import jit, prange
#jit(nopython=True, parallel=True)
def method2(image1, image2, transparency):
h, w, c = image1.shape
for y in prange(h):
for x in range(w):
for z in range(c):
image1[y][x][z] = image1[y][x][z] * transparency \
+ (image2[y][x][z] * (1 - transparency))
return image1
# run once to force a compile
i1 = np.array(Image.open(sys.argv[1]).convert('RGB'))
i2 = np.array(Image.open(sys.argv[2]).convert('RGB'))
res = method2(i1, i2, 0.2)
# run again and time it
i1 = np.array(Image.open(sys.argv[1]).convert('RGB'))
i2 = np.array(Image.open(sys.argv[2]).convert('RGB'))
start = time.time()
res = method2(i1, i2, 0.2)
print("numba took {} milliseconds".format(1000 * (time.time() - start)))
Image.fromarray(np.uint8(res)).save(sys.argv[3])
And I see:
$ ./overlay-numba.py blobs.jpg ships.jpg x.jpg
numba took 8.110523223876953 milliseconds
So on this laptop, numba is about 2x faster than pyvips.
If you time load and save as well, it's quite a bit slower:
$ ./overlay-numba.py blobs.jpg ships.jpg x.jpg
numba plus load and save took 272.8157043457031 milliseconds
But that seems unfair, since almost all that time is in PIL load and save.
I want to represent an audio file in an image with a maximum size of 180×180 pixels.
I want to generate this image so that it somehow gives a representation of the audio file, think of it like SoundCloud's waveform (amplitude graph)?.
I wonder if any of you have something for this. I have been searching around for a bit, mainly "audio visualization" and "audio thumbnailing", but I have not found anything useful.
I first posted this to ux.stackexchange.com, this is my attempt to reach any programmers working on this.
You could also break up the audio into a chunks and measure the RMS (a measure of loudness). let's say you want an image that is 180 pixels wide.
I'll use pydub, a light-weight wrapper I wrote around the std lib wave modeule:
from pydub import AudioSegment
# first I'll open the audio file
sound = AudioSegment.from_mp3("some_song.mp3")
# break the sound 180 even chunks (or however
# many pixels wide the image should be)
chunk_length = len(sound) / 180
loudness_of_chunks = []
for i in range(180):
start = i * chunk_length
end = chunk_start + chunk_length
chunk = sound[start:end]
loudness_of_chunks.append(chunk.rms)
the for loop can be represented as the following list comprehension, I just wanted it to be clear:
loudness_of_chunks = [
sound[ i*chunk_length : (i+1)*chunk_length ].rms
for i in range(180)]
Now the only think left to do is scale the RMS down to a 0 - 180 scale (since you want the image to be 180px tall)
max_rms = max(loudness_of_chunks)
scaled_loudness = [ (loudness / max_rms) * 180 for loudness in loudness_of_chunks]
I'll leave the drawing of the actual pixels to you, I'm not very experienced with PIL or ImageMagik :/
Based on Jiaaro's answer (thanks for writing pydub!), and built for web2py here's my two cents:
def generate_waveform():
img_width = 1170
img_height = 140
line_color = 180
filename = os.path.join(request.folder,'static','sounds','adg3.mp3')
# first I'll open the audio file
sound = pydub.AudioSegment.from_mp3(filename)
# break the sound 180 even chunks (or however
# many pixels wide the image should be)
chunk_length = len(sound) / img_width
loudness_of_chunks = [
sound[ i*chunk_length : (i+1)*chunk_length ].rms
for i in range(img_width)
]
max_rms = float(max(loudness_of_chunks))
scaled_loudness = [ round(loudness * img_height/ max_rms) for loudness in loudness_of_chunks]
# now convert the scaled_loudness to an image
im = Image.new('L',(img_width, img_height),color=255)
draw = ImageDraw.Draw(im)
for x,rms in enumerate(scaled_loudness):
y0 = img_height - rms
y1 = img_height
draw.line((x,y0,x,y1), fill=line_color, width=1)
buffer = cStringIO.StringIO()
del draw
im = im.filter(ImageFilter.SMOOTH).filter(ImageFilter.DETAIL)
im.save(buffer,'PNG')
buffer.seek(0)
return response.stream(buffer, filename=filename+'.png')