Speed up Image Feature Extraction - python

It's the program of import multiple images and extract feature.
The problem is that it's too slow
I think it's because there's so many for loop.
For example
for q in range(0, height-32 , 32):
for w in range(0 , width-32 ,32):
for j in range(0,64,8):
for n in range(0,64,8):
How can I change my code to speed up?
import numpy as np
from scipy.fftpack import dct
from PIL import Image
import glob
import matplotlib.pyplot as plt
def image_open(path):
image_list = []
#for filename in glob.glob('path/*.jpg'):
for filename in glob.glob(path+'/*.jpg'):
im=Image.open(filename)
image_list.append(im)
return image_list
path = 'C:\\Users\\LG\\PycharmProjects\\photo'
images = image_open(path)
for i in range(0, len(images)):
box3 = (0,0,256,256)
a = images[i].crop(box3)
(y,cb,cr) = a.split()
width , height = y.size
y.show()
for q in range(0, height-32 , 32):
for w in range(0 , width-32 ,32):
for j in range(0,64,8):
for n in range(0,64,8):
print(w/32)

Not sure why you are using so many loops so I’d suggest you try to improve that as best as you can first.
After that, look into threads. Threading
If the number of images being consumed is low, you could run the operation on each image on a separate thread.

Related

Python 3.7 : multiprocessing a for loop with shared variables

first a bit of context :
I'm trying to write down a python script to convert Image in greyscale (.tif) to a .jpeg with the so called ''jet'' colormap. I managed to do it with a for loop but it's a bit long for one image (millions of pixels to treat !), so I would like to use multiprocessing.
My problem here is that to convert each grey pixel into a coloured one I have to use two variables (the minimum value of light intensity ''min_img'' and an vector ''dx_cm'' to go from the initial grey scale to a 256 scale, corresponding to the jet colormap).
So to pass the information of ''min_img'' and ''dx_cm'' to the processes I try to use multiprocessing.Value() but in return I get the error :
RuntimeError: Synchronized objects should only be shared between processes through inheritance
I tried many different things from different sources and no matter the version of my code I'm struggling with that error. So I'm sorry if my code isn't clean, I would be very grateful if someone could help me with that.
My non-working code :
import multiprocessing
from PIL import Image
from matplotlib import cm
def fun(gr_list,dx,minp):
dx_cmp = dx.value
min_imgp = minp.value
rgb_res=list()
for i in range(len(gr_list)):
rgb_res.extend(cm.jet(round(((gr_list[i]-min_imgp)/dx_cmp)-1))[0:-1])
return rgb_res
if __name__ == '__main__':
RGB_list=list()
n = multiprocessing.cpu_count()
img = Image.open(r'some_path_to_a.tif')
Img_grey=list(img.getdata())
dx_cm = multiprocessing.Value('d',(max(Img_grey)-min(Img_grey))/256)
min_img = multiprocessing.Value('d',min(Img_grey))
with multiprocessing.Pool(n) as p:
RGB_list = list(p.map(fun, (Img_grey,dx_cm,min_img)))
res = Image.frombytes("RGB", (img.size[0], img.size[1]), bytes([int(0.5 + 255*i) for i in RGB_list]))
res.save('rgb_file.jpg')
PS : Here is an example of the the initial for loop that I would like to parallelize :
from PIL import Image
from matplotlib import cm
if __name__ == '__main__':
img = Image.open(r'some_path_to_a.tif')
Img_grey = list(img.getdata())
dx_cm = (max(Img_grey)-min(Img_grey))/256
min_img = min(Img_grey)
Img_rgb = list()
for i in range(len(Img_grey)):
Img_rgb.extend(cm.jet(round(((Img_grey[i]-min_img)/dx_cm)-1))[0:-1])
res = Image.frombytes("RGB", (img.size[0], img.size[1]), bytes([int(0.5 + 255*i) for i in Img_rgb]))
res.save('rgb_file.jpg')
Your fun method is looping over some list, but in this case it will receive a "part", an item from your list, so it should return only the result of its processing.
I have changed the working code to run with multiprocessing.
As the fun method returns a list, the p.map will return a list of lists (a list of results) and that need to be flatten, were done with list extends method before.
Tried with process pool and thread pool multiprocessing, in my scenario there wasn't any performance gains.
Process multiprocessing:
from PIL import Image
from matplotlib import cm
import multiprocessing
def fun(d):
part, dx_cm, min_img = d
return cm.jet(round(((part-min_img)/dx_cm)-1))[0:-1]
if __name__ == '__main__':
img = Image.open(r'a.tif')
Img_grey = list(img.getdata())
def Gen(img_data):
dx_cm = (max(img_data)-min(img_data))/256
min_img = min(img_data)
for part in img_data:
yield part, dx_cm, min_img
n = multiprocessing.cpu_count()
with multiprocessing.Pool(n) as p:
Img_rgb = [item for sublist in p.map(fun, Gen(Img_grey)) for item in sublist]
res = Image.frombytes("RGB", (img.size[0], img.size[1]), bytes([int(0.5 + 255*i) for i in Img_rgb]))
res.save('b.jpg')
Thread multiprocessing:
from PIL import Image
from matplotlib import cm
import multiprocessing
from multiprocessing.pool import ThreadPool
if __name__ == '__main__':
img = Image.open(r'a.tif')
Img_grey = list(img.getdata())
dx_cm = (max(Img_grey)-min(Img_grey))/256
min_img = min(Img_grey)
def fun(part):
return cm.jet(round(((part-min_img)/dx_cm)-1))[0:-1]
n = multiprocessing.cpu_count()
with ThreadPool(n) as p:
Img_rgb = [item for sublist in p.map(fun, Img_grey) for item in sublist]
res = Image.frombytes("RGB", (img.size[0], img.size[1]), bytes([int(0.5 + 255*i) for i in Img_rgb]))
res.save('b.jpg')
So it seems that the computational burden isn't big enough for multiprocessing to be helpful.
Nevertheless, for those coming across this topic interested in the image processing part of my question, I found another much quicker way (15 to 20 x than previous method) to do the same thing without a for loop :
from matplotlib import cm
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import numpy as np
from PIL import Image
cm_jet = cm.get_cmap('jet')
img_src = Image.open(r'path to your grey image')
img_src.mode='I'
Img_grey = list(img_src.getdata())
max_img = max(Img_grey)
min_img = min(Img_grey)
rgb_array=np.uint8(cm_jet(((np.array(img_src)-min_img)/(max_img-min_img)))*255)
ax = plt.subplot(111)
im = ax.imshow(rgb_array, cmap='jet')
divider = make_axes_locatable(ax)
cax_plot = divider.append_axes("right", size="5%", pad=0.05)
cbar=plt.colorbar(im, cax=cax_plot, ticks=[0,63.75,127.5,191.25,255])
dx_plot=(max_img-min_img)/255
cbar.ax.set_yticklabels([str(min_img),str(round(min_img+63.75*dx_plot)),str(round(min_img+127.5*dx_plot)),str(round(min_img+191.25*dx_plot)), str(max_img)])
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)
plt.savefig('test_jet.jpg', quality=95, dpi=1000)

Reading the huge image data for training classifiers

I am new to python and Machine Learning. I have a huge image dataset of cars having more than 27000 images and labels. I am trying to create a dataset so I can use it in my training classifier, but ofcourse handling this amount of data will be a real pain for the Memory, and that's where I am stuck. At first I was trying to do something like this.
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpg
import cv2
import gc
import numpy as np
from sklearn.preprocessing import normalize
import gc
import resource
import h5py
bbox = "/run/media/fdai5182/LAMAMADAN/Morethan4000samples/data/labels"
imagepath = "/run/media/fdai5182/LAMAMADAN/Morethan4000samples/data/image"
training_data = []
training_labels = []
count = 0
for root, _, files in os.walk(bbox):
cdp = os.path.abspath(root)
for rootImage , _ , fileImage in os.walk(imagepath):
cdpimg = os.path.abspath(r)
for f in files:
ct = 0
name,ext = os.path.splitext(f)
for fI in fileImage:
n , e = os.path.splitext(fI)
if name == n and ext == ".txt" and e == ".jpg":
cip = os.path.join(cdp,f)
cipimg = os.path.join(cdpimg,fI)
txt = open(cip,"r")
for q in txt:
ct = ct + 1
if ct == 3:
x1 = int(q.rsplit(' ')[0])
y1 = int(q.rsplit(' ')[1])
x2 = int(q.rsplit(' ')[2])
y2 = int(q.rsplit(' ')[3])
try:
read_img = mpg.imread(cipimg)
read_img = read_img.astype('float32')
read_img_bbox = read_img[y1:y2, x1:x2,:]
resize_img = cv2.cv2.resize(read_img_bbox,(300,300))
resize_img /= 255.0
training_labels.append(int(cipimg.split('\\')[4]))
training_data.append(resize_img)
print("len Of Training_data",len(training_data))
training_labels.append(int(cipimg.split('/')[8]))
del resize_img
print("len Of Training Labels", len(training_labels))
gc.collect()
except Exception as e:
print("Error",str(e), cip)
count = count + 1
print(count)
txt.flush()
txt.close()
np.save('/run/media/fdai5182/LAMA MADAN/Training_Data_4000Samples',training_data)
np.save('/run/media/fdai5182/LAMA MADAN/Training_Labels_4000Samples',training_labels)
print("DONE")
But it always gives me a huge Memory error after reading images even on 32gb RAM.
So, for that I want to do some other steps which may be useful taking less memory and get this working.
The Steps I want to do are as follows.
allocate np array X of shape N,150,150,3/300,300,3 of type
float32 (not astype)
iterate through images and fill each row of array X with 150,150,3 image pixels
normalize in-place: X /= 255
Write in file (.npy format)
What I did till now is
import cv2
import matplotlib.pyplot as plt
import matplotlib.iamge as mpg
import numpy as np
bbox = "/run/media/fdai5182/LAMAMADAN/Morethan4000samples/data/labels"
imagepath = "/run/media/fdai5182/LAMAMADAN/Morethan4000samples/data/image"
for root, _, files in os.walk(bbox):
cdp = os.path.abspath(root)
for rootImage, _, fileImage in os.walk(imagepath):
cdpimg = os.path.abspath(rootImage)
for f in files:
ct = 0
name,ext = os.path.splitext(f)
for fI in fileImage:
n , e = os.path.splitext(fI)
if name == n and ext == ".txt" and e == ".jpg":
nparrayX = np.zeros((150,150,3)).view('float32')
cip = os.path.join(cdp,f)
cipImg = os.path.join(cdpimg,fI)
read_image = mpg.imread(cip)
resize_image = cv2.cv2.resize(read_image,(150,150))
Am I on the right path?
Also, How can I fill each row of imageformat with 150,150,3 image pixels. I don't want to use list anymore as they take more Memory and time consuming.
Please help me through this.
Also, as a new member if the question is not obeying the rules and regulations of StackOverflow please tell me and I will edit it more.
Thank you,
Both tensorflow/keras and pytorch provide data set / generator classes, which you can use to construct memory efficient data loaders.
For tensorflow/keras there is an excellent tutorial created by Stanford's Shervine Amidi.
For pytorch you find a good tutorial on the project's man page.
I would strongly suggest to make use of these frameworks for your implementation since they allow you to avoid writing boiler-plate code and make your training scalable.
Thank you for your help . But I wanted to do it manually to check How can we do it without using other generators. Below is my Code.
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as mpg
import numpy as np
import os
N = 0
training_labels = []
bbox = "D:/Morethan4000samples/data/labels"
imagepath = "D:/Morethan4000samples/data/image/"
for root, _, files in os.walk(imagepath):
cdp = os.path.abspath(root)
for f in files:
name, ext = os.path.splitext(f)
if ext == ".jpg":
cip = os.path.join(cdp,f)
N += 1
print(N)
imageX = np.zeros((N,227,227,3), dtype='float32')
i = 0
for root, _ , files in os.walk(imagepath):
cdp = os.path.abspath(root)
print(cdp)
for f in files:
ct = 0
name, ext = os.path.splitext(f)
if ext == ".jpg":
cip = os.path.join(cdp,f)
read = mpg.imread(cip)
cipLabel = cip.replace('image','labels')
cipLabel = cipLabel.replace('.jpg','.txt')
nameL , extL = os.path.splitext(cipLabel)
if extL == '.txt':
boxes = open(cipLabel, 'r')
for q in boxes:
ct = ct + 1
if ct == 3:
x1 = int(q.rsplit(' ')[0])
y1 = int(q.rsplit(' ')[1])
x2 = int(q.rsplit(' ')[2])
y2 = int(q.rsplit(' ')[3])
readimage = read[y1:y2, x1:x2]
resize = cv2.cv2.resize(readimage,(227,227))
resize = cv2.cv2.GaussianBlur(resize, (5,5),0)
imageX[i] = resize
#training_labels.append(int(cip.split('\\')[4]))
training_labels.append(int(cip.split('/')[8]))
print(len(training_labels), len(imageX))
i += 1
print(i)
imageX /= 255.0
plt.imshow(imageX[10])
plt.show()
print(imageX.shape)
print(len(training_labels))
np.save("/run/media/fdai5182/LAMA MADAN/Morethan4000samples/227227/training_images", imageX)
np.save("/run/media/fdai5182/LAMA MADAN/Morethan4000samples/227227/trainin_labels",training_labels)
To save each of your image in a row of matrix of same dimensions is the most efficient way to do that.

How to deal with this when I use numpy.array() to deal with 'list'?

import cv2
import numpy
list_pixel=[]
list_label=[]
for i in range(0,10):
for j in range(0,10):
list_pixel.append(cv2.imread("C:\\Users\\kimcho\\Desktop\\testdata\\testdata_"+str(i)+"_0"+str(j)+".png",0))
list_label.append(i)
j=0
list_pixel.pop(0)
list_label.pop(0)
list_pixel=numpy.array(list_pixel)
print(list_pixel)
print(list_pixel.shape)
print(list_pixel[0].shape)
How to deal with this when I use numpy.array() to deal with 'list'?I wanna make datasets by imitating keras.But,the datasets I made didn't satisfy me.I want it to act like keras,to return a value like this:
It can return a value of(60000,28,28)
But as for my datasets,it can only return like this:
Only return a value of(99,)—— I got 99 pictures and I want to load their pixel into list_pixel
Here is my code:
Hoping anyone can help me solve this problem.Deeply thank you!!!
When using cv2.imread you already load the image as a numpy array.
A simple approach to it is the following:
import cv2
import numpy as np
list_label = np.arange(0, 10)
path = "C:\\Users\\kimcho\\Desktop\\testdata\\testdata_{0}_0{1}.png"
list_pixel = np.array([
cv2.imread(path.format(i, j), 0) for i in range(0, 10)
for j in range(0, 10)
])
Let's try it on a simple scenario: we suppose there is just an image 28x28 in size, such as this one:
Let's say it is in the path my/path/image.png.
import cv2
import numpy as np
list_label = np.arange(0, 10)
path = "my/path/image.png"
list_pixel = np.array([
cv2.imread(path,0) for i in range(0, 10)
for j in range(0, 10)
])
When running list_pixel.shape you get (100, 28, 28).

How to display a set of files with MatPlotLib in a loop

I have a set of .txt named "occupancyGrid_i", i being a number from 0-100.
What I'd like to do is to open every one of them and show them for 3 seconds. The data of the .txt is a [N x M] matrix.
import numpy
import matplotlib.pyplot as plt
import time
while True:
matrix = numpy.loadtxt('res/matrix_' + str(i) + '.txt')
plt.clf()
plt.imshow(matrix)
plt.show()
time.sleep(3)
i=i+1
What I have done so far doesn't seem to be enough. What am I doing wrong?
You can try something like this, adapting the code suggested in this answer:
import os
import numpy as np
import pylab as plt
N_IMAGES = 100
VMIN, VMAX = 0, 1 # range of values in matrices
i = 0
while True:
if i < N_IMAGES:
path = 'res/matrix_' + str(i) + '.txt'
if os.path.exists(path): # check if file exists
matrix = np.loadtxt('matrices/matrix_' + str(i) + '.txt')
plt.imshow(matrix, vmin=VMIN, vmax=VMAX)
plt.title("Matrix {}".format(i))
plt.pause(3)
i += 1
else:
# terminate you program or start from the beginning
break
# i = 0
# continue
I dont know what exactly your goal is. But to display text in matplotlib you can use text from pyplot.
`
import numpy
import matplotlib.pyplot as plt
import time
for i in range(1,5):
s = ''
with open(str(i)+'.txt','r') as f:
s=f.read()
plt.text(0.5, 0.67,s,transform=plt.gca().transAxes)
plt.show()
time.sleep(3)
First 2 argument (0.5 ,0.67) are cordinate of displayed text.
I think you should find some other way of displaying text. Just print them on your console, plotting them is not the best way to represent text data.

Writting a file with multiple images in a grid

I'm trying to write a file with multiple images (100) in a 10x10 grid. I use 3 for iterations to:
-open the file
-set coordinates (i,j)
The problem is when I look my file, all I can see is the last image multiple times. Maybe the files is overwrite every time that the program enters the for loop. Until now I can't find a solution.
The code is:
import Image
from os import listdir
from os.path import isfile, join
files = [ f for f in listdir("/mnt/hgfs/Documents/Notebooks/test1/") if isfile(join("/mnt/hgfs/Documents/Notebooks/test1/", f)) ]
new_im = Image.new('RGB', (3000,3000))
for i in xrange(0,3000,300):
for j in xrange(0,3000,300):
for ima in files:
#paste the image at location i,j:
im = Image.open(ima)
im.thumbnail((300,300))
new_im.paste(im, (i,j))
new_im.save("hola.png")
Thanks!
Here's a simple bug fix. You only need two for loops, not three.
import Image
from os import listdir
from os.path import isfile, join
files = [ f for f in listdir("/mnt/hgfs/Documents/Notebooks/test1/") if isfile(join("/mnt/hgfs/Documents/Notebooks/test1/", f)) ]
new_im = Image.new('RGB', (3000,3000))
index = 0
for i in xrange(0,3000,300):
for j in xrange(0,3000,300):
im = Image.open(files[index])
im.thumbnail((300,300))
new_im.paste(im, (i,j))
index += 1
new_im.save("hola.png")
This is the Python 3 code to make a squre grid of images file from any directory with images using matplotlib.
Square size calculates dynamicly by count of existing images.
import math
import os
import matplotlib.pyplot as plt
# Config:
images_dir = './your_dir_with_images'
result_grid_filename = './grid.jpg'
result_figsize_resolution = 40 # 1 = 100px
images_list = os.listdir(images_dir)
images_count = len(images_list)
print('Images: ', images_list)
print('Images count: ', images_count)
# Calculate the grid size:
grid_size = math.ceil(math.sqrt(images_count))
# Create plt plot:
fig, axes = plt.subplots(grid_size, grid_size, figsize=(result_figsize_resolution, result_figsize_resolution))
current_file_number = 0
for image_filename in images_list:
x_position = current_file_number % grid_size
y_position = current_file_number // grid_size
plt_image = plt.imread(images_dir + '/' + images_list[current_file_number])
axes[x_position, y_position].imshow(plt_image)
print((current_file_number + 1), '/', images_count, ': ', image_filename)
current_file_number += 1
plt.subplots_adjust(left=0.0, right=1.0, bottom=0.0, top=1.0)
plt.savefig(result_grid_filename)
Images in the directory screenshot
result grid image file

Categories