I'm trying to read images from folders into a dataframe, where each row in the dataframe holds all the images for one folder:
import cv2
import os,glob
import matplotlib.pylab as plt
from os import listdir,makedirs
from os.path import isfile,join
import pandas as pd
import PIL
import numpy as np
from scipy.ndimage import imread
pth = 'C:/Users/Documents/myfolder/'
folders = os.listdir(pth)
videos = pd.DataFrame()
for folder in folders:
    pth_upd = pth + folder + '/'
    allfiles = os.listdir(pth_upd)
    files = []
    columns = ['data']
    index = [folders]
    for file in allfiles:
        files.append(file) if ('.bmp' in file) else None
    samples = np.empty((0, 64, 64))
    for file in files:
        img = cv2.imread(os.path.join(pth_upd, file), cv2.IMREAD_GRAYSCALE)
        img = img.reshape(1, 64, 64)
        samples = np.append(samples, img, axis=0)
    result = pd.DataFrame([samples], index=[folder], columns=['videos'])
    videos = videos.append(result)
After reading all the images in each folder into the samples array, how can I insert the images for each folder as one row of a dataframe? I get this error:
ValueError                                Traceback (most recent call last)
<ipython-input> in <module>
     17         samples = np.append(samples, img, axis=0)
     18
---> 19     result = pd.DataFrame([samples], index=[folder], columns=['videos'])
     20     videos = videos.append(result)

ValueError: Must pass 2-d input
It's certainly possible to put strings of the resized images into pandas, but there are much better ways to accomplish CNN training. I adapted your image processing code to show how you could do what you asked:
import io
import pandas as pd
import numpy as np
import sklearn
import requests
import tempfile
import os
import cv2

# Image processing for the df
def process_imgfile(x):
    img = cv2.imread(os.path.join(
        x.Folder, x.image), cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (64, 64))
    img = str(img)
    return img

# Simulate folders with images in them
with tempfile.TemporaryDirectory() as f:
    f1 = os.path.join(f, "Folder1")
    f2 = os.path.join(f, "Folder2")
    os.mkdir(f1)
    os.mkdir(f2)
    for x in range(20):
        with open(os.path.join(f1, "f1-{}.jpg".format(x)), "wb") as file1, open(
                os.path.join(f2, "f2-{}.jpg".format(x)), "wb") as file2:
            r = requests.get(
                'https://upload.wikimedia.org/wikipedia/en/a/a9/Example.jpg',
                stream=True)
            print(r.status_code)
            for chunk in r.iter_content(16):  # File writing...
                file1.write(chunk)
                file2.write(chunk)
    result = [x for x in os.walk(f)]
    folder1 = result[1][2]
    folder2 = result[2][2]

    # Generate dataframe data
    j = {"Folder": [], "image": []}
    for x in folder1:
        j["Folder"].append(result[1][0])
        j["image"].append(x)
    for x in folder2:
        j["Folder"].append(result[2][0])
        j["image"].append(x)

    # Use the process_imgfile function to append image data
    df = pd.DataFrame(j)
    df["imgdata"] = df.apply(process_imgfile, axis=1)
But on a large set of images this is not going to work. Instead, check out Keras' ImageDataGenerator, which lets you load images lazily at train and test time. It can also help you apply augmentation or synthesize data.
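For illustration, here is a minimal sketch of that approach, assuming the folders under 'C:/Users/Documents/myfolder/' each hold the images of one class; the target size, color mode and batch size are placeholders you would adapt:

from tensorflow.keras.preprocessing.image import ImageDataGenerator

# rescale normalizes pixels to [0, 1]; validation_split reserves part of the data
datagen = ImageDataGenerator(rescale=1.0 / 255, validation_split=0.2)

train_gen = datagen.flow_from_directory(
    'C:/Users/Documents/myfolder/',   # one sub-folder per class, as in the question
    target_size=(64, 64),
    color_mode='grayscale',
    class_mode='categorical',
    batch_size=32,
    subset='training')

# model.fit(train_gen, ...) then pulls batches lazily instead of one big DataFrame

Because batches are read from disk on demand, the full image set never has to live in a dataframe or in memory at once.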
I have the following code in which I am loading a single DICOM file and checking if there are sagittal and coronal view present or not.
I want to modify this to read all DICOM files from the folder.
Print "there is no sagittal and coronal view" if the sag_aspect or cor_aspect value is zero.
How do I do this?
import pydicom
import numpy as np
import matplotlib.pyplot as plt
import sys
import glob
# load the DICOM files
files = []
print('glob: {}'.format(sys.argv[1]))
for fname in glob.glob('dicom/3.dcm', recursive=False):
    print("loading: {}".format(fname))
    files.append(pydicom.dcmread(fname))

print("file count: {}".format(len(files)))

# skip files with no SliceLocation (e.g. scout views)
slices = []
skipcount = 0
for f in files:
    if hasattr(f, 'SliceLocation'):
        slices.append(f)
    else:
        skipcount = skipcount + 1

print("skipped, no SliceLocation: {}".format(skipcount))

# ensure they are in the correct order
slices = sorted(slices, key=lambda s: s.SliceLocation)

# pixel aspects, assuming all slices are the same
ps = slices[0].PixelSpacing
ss = slices[0].SliceThickness
ax_aspect = ps[1] / ps[0]
sag_aspect = ps[1] / ss
cor_aspect = ss / ps[0]

# create 3D array
img_shape = list(slices[0].pixel_array.shape)
img_shape.append(len(slices))
img3d = np.zeros(img_shape)

# fill 3D array with the images from the files
for i, s in enumerate(slices):
    img2d = s.pixel_array
    img3d[:, :, i] = img2d

# plot 3 orthogonal slices
print(img3d.shape)
print(img_shape)

a1 = plt.subplot(2, 2, 1)
plt.imshow(img3d[:, :, img_shape[2] // 2])
a1.set_title("transverse view")
a1.set_aspect(ax_aspect)

a2 = plt.subplot(2, 2, 2)
# print(img3d[:, img_shape[1]//2, :].shape)
plt.imshow(img3d[:, img_shape[1] // 2, :])
a2.set_title("sagittal view")
a2.set_aspect(sag_aspect)

a3 = plt.subplot(2, 2, 3)
plt.imshow(img3d[img_shape[0] // 2, :, :].T)
a3.set_title("coronal view")
a3.set_aspect(cor_aspect)

plt.show()
For reading multiple DICOM files from a folder, you can use the code below.
import os
from pathlib import Path
import pydicom
dir_path = r"path\to\dicom\files"
dicom_set = []
for root, _, filenames in os.walk(dir_path):
    for filename in filenames:
        dcm_path = Path(root, filename)
        if dcm_path.suffix == ".dcm":
            try:
                dicom = pydicom.dcmread(dcm_path, force=True)
            except IOError as e:
                print(f"Can't import {dcm_path.stem}")
            else:
                dicom_set.append(dicom)
I have leveraged the pathlib library, which I strongly suggest using whenever dealing with folder/file paths. I have also added exception handling, but you can modify it to meet your needs.
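If you also want the sagittal/coronal check from your snippet applied to everything that was read, a hedged sketch reusing dicom_set from above and the aspect formulas from your code could look like this (it assumes SliceLocation, PixelSpacing and SliceThickness are present in your files):

# keep only slices that have a SliceLocation and sort them, as in your code
slices = sorted(
    (d for d in dicom_set if hasattr(d, 'SliceLocation')),
    key=lambda s: s.SliceLocation)

if slices:
    ps = slices[0].PixelSpacing
    ss = slices[0].SliceThickness
    sag_aspect = ps[1] / ss
    cor_aspect = ss / ps[0]
    if sag_aspect == 0 or cor_aspect == 0:
        print("there is no sagittal and coronal view")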
I am new to Python and machine learning. I have a huge image dataset of cars with more than 27000 images and labels. I am trying to create a dataset so I can use it in my training classifier, but of course handling this amount of data is a real pain for memory, and that's where I am stuck. At first I was trying to do something like this.
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpg
import cv2
import gc
import numpy as np
from sklearn.preprocessing import normalize
import resource
import h5py

bbox = "/run/media/fdai5182/LAMAMADAN/Morethan4000samples/data/labels"
imagepath = "/run/media/fdai5182/LAMAMADAN/Morethan4000samples/data/image"

training_data = []
training_labels = []
count = 0

for root, _, files in os.walk(bbox):
    cdp = os.path.abspath(root)
    for rootImage, _, fileImage in os.walk(imagepath):
        cdpimg = os.path.abspath(rootImage)
        for f in files:
            ct = 0
            name, ext = os.path.splitext(f)
            for fI in fileImage:
                n, e = os.path.splitext(fI)
                if name == n and ext == ".txt" and e == ".jpg":
                    cip = os.path.join(cdp, f)
                    cipimg = os.path.join(cdpimg, fI)
                    txt = open(cip, "r")
                    for q in txt:
                        ct = ct + 1
                        if ct == 3:
                            x1 = int(q.rsplit(' ')[0])
                            y1 = int(q.rsplit(' ')[1])
                            x2 = int(q.rsplit(' ')[2])
                            y2 = int(q.rsplit(' ')[3])
                            try:
                                read_img = mpg.imread(cipimg)
                                read_img = read_img.astype('float32')
                                read_img_bbox = read_img[y1:y2, x1:x2, :]
                                resize_img = cv2.resize(read_img_bbox, (300, 300))
                                resize_img /= 255.0
                                # training_labels.append(int(cipimg.split('\\')[4]))
                                training_data.append(resize_img)
                                print("len Of Training_data", len(training_data))
                                training_labels.append(int(cipimg.split('/')[8]))
                                del resize_img
                                print("len Of Training Labels", len(training_labels))
                                gc.collect()
                            except Exception as e:
                                print("Error", str(e), cip)
                    count = count + 1
                    print(count)
                    txt.flush()
                    txt.close()

np.save('/run/media/fdai5182/LAMA MADAN/Training_Data_4000Samples', training_data)
np.save('/run/media/fdai5182/LAMA MADAN/Training_Labels_4000Samples', training_labels)
print("DONE")
But it always gives me a huge MemoryError after reading the images, even on 32 GB of RAM.
So instead I want to take some other steps that may use less memory and get this working.
The steps I want to do are as follows:
- allocate a np array X of shape (N, 150, 150, 3) or (N, 300, 300, 3) of type float32 (not astype)
- iterate through the images and fill each row of array X with 150x150x3 image pixels
- normalize in-place: X /= 255
- write to file (.npy format)
What I have done till now is:
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as mpg
import numpy as np
import os

bbox = "/run/media/fdai5182/LAMAMADAN/Morethan4000samples/data/labels"
imagepath = "/run/media/fdai5182/LAMAMADAN/Morethan4000samples/data/image"

for root, _, files in os.walk(bbox):
    cdp = os.path.abspath(root)
    for rootImage, _, fileImage in os.walk(imagepath):
        cdpimg = os.path.abspath(rootImage)
        for f in files:
            ct = 0
            name, ext = os.path.splitext(f)
            for fI in fileImage:
                n, e = os.path.splitext(fI)
                if name == n and ext == ".txt" and e == ".jpg":
                    nparrayX = np.zeros((150, 150, 3), dtype='float32')
                    cip = os.path.join(cdp, f)
                    cipImg = os.path.join(cdpimg, fI)
                    read_image = mpg.imread(cipImg)
                    resize_image = cv2.resize(read_image, (150, 150))
Am I on the right path?
Also, how can I fill each row of the array with the 150x150x3 image pixels? I don't want to use lists anymore, as they take more memory and are time consuming.
Please help me through this.
Also, as a new member, if the question does not follow the rules and regulations of StackOverflow, please tell me and I will edit it further.
Thank you,
Both tensorflow/keras and pytorch provide data set / generator classes, which you can use to construct memory efficient data loaders.
For tensorflow/keras there is an excellent tutorial created by Stanford's Shervine Amidi.
For pytorch you can find a good tutorial on the project's main page.
I would strongly suggest making use of these frameworks for your implementation, since they allow you to avoid writing boilerplate code and make your training scalable.
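As a rough illustration only, here is a minimal sketch of such a generator using tf.keras.utils.Sequence; the samples list of (image_path, label) pairs, the 227 crop size and the batch size are placeholder assumptions, not your actual data layout:

import cv2
import numpy as np
from tensorflow.keras.utils import Sequence

class CarDataset(Sequence):
    def __init__(self, samples, batch_size=32, size=227):
        self.samples = samples          # list of (image_path, label) tuples
        self.batch_size = batch_size
        self.size = size

    def __len__(self):
        # number of batches per epoch
        return int(np.ceil(len(self.samples) / self.batch_size))

    def __getitem__(self, idx):
        batch = self.samples[idx * self.batch_size:(idx + 1) * self.batch_size]
        x = np.zeros((len(batch), self.size, self.size, 3), dtype='float32')
        y = np.zeros((len(batch),), dtype='int64')
        for i, (path, label) in enumerate(batch):
            img = cv2.imread(path)                        # BGR uint8
            img = cv2.resize(img, (self.size, self.size))
            x[i] = img.astype('float32') / 255.0          # normalize per image
            y[i] = label
        return x, y

# model.fit(CarDataset(samples), epochs=10) keeps only the current batch in memory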
Thank you for your help. But I wanted to do it manually to see how it can be done without using other generators. Below is my code.
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as mpg
import numpy as np
import os

N = 0
training_labels = []
bbox = "D:/Morethan4000samples/data/labels"
imagepath = "D:/Morethan4000samples/data/image/"

# first pass: count the images so the array can be preallocated
for root, _, files in os.walk(imagepath):
    cdp = os.path.abspath(root)
    for f in files:
        name, ext = os.path.splitext(f)
        if ext == ".jpg":
            cip = os.path.join(cdp, f)
            N += 1
print(N)

imageX = np.zeros((N, 227, 227, 3), dtype='float32')
i = 0

# second pass: crop, resize and fill each row of imageX
for root, _, files in os.walk(imagepath):
    cdp = os.path.abspath(root)
    print(cdp)
    for f in files:
        ct = 0
        name, ext = os.path.splitext(f)
        if ext == ".jpg":
            cip = os.path.join(cdp, f)
            read = mpg.imread(cip)
            cipLabel = cip.replace('image', 'labels')
            cipLabel = cipLabel.replace('.jpg', '.txt')
            nameL, extL = os.path.splitext(cipLabel)
            if extL == '.txt':
                boxes = open(cipLabel, 'r')
                for q in boxes:
                    ct = ct + 1
                    if ct == 3:
                        x1 = int(q.rsplit(' ')[0])
                        y1 = int(q.rsplit(' ')[1])
                        x2 = int(q.rsplit(' ')[2])
                        y2 = int(q.rsplit(' ')[3])
                        readimage = read[y1:y2, x1:x2]
                        resize = cv2.resize(readimage, (227, 227))
                        resize = cv2.GaussianBlur(resize, (5, 5), 0)
                        imageX[i] = resize
                        # training_labels.append(int(cip.split('\\')[4]))
                        training_labels.append(int(cip.split('/')[8]))
                        print(len(training_labels), len(imageX))
                        i += 1
                        print(i)

imageX /= 255.0
plt.imshow(imageX[10])
plt.show()
print(imageX.shape)
print(len(training_labels))

np.save("/run/media/fdai5182/LAMA MADAN/Morethan4000samples/227227/training_images", imageX)
np.save("/run/media/fdai5182/LAMA MADAN/Morethan4000samples/227227/trainin_labels", training_labels)
Saving each of your images as a row of a matrix with the same dimensions is the most efficient way to do that.
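If even the single pre-allocated array ever outgrows RAM, one hedged variant of the same row-filling idea is to back it with a memory-mapped .npy file via numpy's open_memmap; the file name and the N value below are placeholders, and i/resize refer to the variables in the code above:

import numpy as np

N = 4000  # hypothetical image count; in your code this comes from the first counting pass
imageX = np.lib.format.open_memmap(
    "training_images.npy", mode="w+", dtype="float32", shape=(N, 227, 227, 3))

# inside the walk loop, write each processed crop straight to disk:
#     imageX[i] = resize.astype('float32') / 255.0
#     i += 1

imageX.flush()  # the data already lives on disk in .npy format, so no np.save is needed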
I have two programs: the first converts .h5 files to .tiff files, and the second predicts temperature using an ARIMA model.
Here is the first program, which converts the .h5 files to .tiff files, builds min and max .tiff files, and writes them as GeoTIFFs.
from osgeo import gdal
import numpy as np
import os
import h5py
from collections import defaultdict
from osgeo import osr
import datetime
in_dir = r'/Users/sunnybhargav/Desktop/jan'
out_dir = r'/Users/sunnybhargav/Desktop/new_output'
#in_dir = input('Enter input directory path where hdf files are stored: ')
#out_dir = input('Enter output directory path where geotiff files are to be stored: ')
def arrayToTif(array, tifFilePath, proj, transform, nodatavalue):
    # touch the output file so the path exists
    with open(tifFilePath, 'a') as file:
        pass
    # write raster
    out_ds = gdal.GetDriverByName('GTiff').Create(tifFilePath,
                                                  array.shape[1],
                                                  array.shape[0],
                                                  1,  # number of bands
                                                  gdal.GDT_Float32)
    out_ds.GetRasterBand(1).WriteArray(array)
    out_ds.GetRasterBand(1).SetNoDataValue(nodatavalue)
    # close tif to write it into disk (free tif file)
    out_ds = None

dates_dict = defaultdict(list)
for root, directories, filenames in os.walk(in_dir):
    for filename in filenames:
        if filename.endswith('.h5'):
            hdffileDate = filename[6:15]
            hdfdate = int(hdffileDate[0:2])
            dates_dict[hdfdate].append(filename)
            print(filename)

for key in dates_dict.keys():
    file_list = dates_dict[key]
    min_lst = 1000 * np.ones((2816, 2805))
    max_lst = -1000 * np.ones((2816, 2805))
    for v in file_list:
        hdf_ds = h5py.File(os.path.join(in_dir, v))
        lst = np.array(hdf_ds['LST'])[0, :, :]
        hdf_ds = gdal.Open(os.path.join(in_dir, v))
        metadata = hdf_ds.GetMetadata_Dict()
        lst = lst.astype('Float32')
        max_lst = np.maximum(max_lst, lst)
        lst[lst == -999] = 999
        min_lst = np.minimum(min_lst, lst)
    min_lst[min_lst == 999] = -999

    transform = (0, 1, 0, 0, 0, -1)
    proj = None
    nodatavalue = -999
    tiffileDate = v[6:15]
    MinName = 'MIN' + v[0:2] + str.lower(tiffileDate) + '.tif'
    MaxName = 'MAX' + v[0:2] + str.lower(tiffileDate) + '.tif'
    arrayToTif(max_lst, os.path.join(out_dir, MaxName), proj, transform, nodatavalue)
    arrayToTif(min_lst, os.path.join(out_dir, MinName), proj, transform, nodatavalue)

    del lst
    del min_lst
    del max_lst
Second program:
The second program reads the TIFFs into ndarrays, extracts the maximum temperature at a particular point from each file, and predicts the next five days with ARIMA.
import pandas as pd
import seaborn as sns
import matplotlib
import numpy as np
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from numpy import genfromtxt
import csv
import datetime
from datetime import datetime
import time
from matplotlib import pyplot
from pandas import Series
from statsmodels.tsa.arima_model import ARIMA
import numpy
from statsmodels.tsa.stattools import adfuller
import matplotlib.pylab as plt
from statsmodels.tsa.stattools import acf, pacf
from sklearn.metrics import mean_squared_error
import numpy as np
import subprocess
import gdal,osr
from gdalconst import *
import os
import numpy as np
from PIL import Image
import scipy.misc
from datetime import datetime
# import timeseries as ts
count = 1
max_temp = []
min_temp = []
filename = []
filenamer = []
max_temp_points = []
min_temp_points = []
source = r'/Volumes/bhargav 1/data/NEW_MAX'
for root, dirs, filenames in os.walk(source):
    print(filenames)
    for f in filenames:
        print(f)
        dataset = gdal.Open(source + '//' + f, gdal.GA_ReadOnly)
        #print(dataset)
        geotransform = dataset.GetGeoTransform()
        band = dataset.GetRasterBand(1)
        data = band.ReadAsArray(0, 0, dataset.RasterXSize, dataset.RasterYSize).astype(np.float64)
        #print(np.histogram(data,bins=500))
        print(np.shape(data))
        max_temp_point = data[793][1160]
        max_temp_point = max_temp_point - 273
        print(max_temp_point)
        print("Count:", count)
        max_temp_points.append(max_temp_point)
        count = count + 1
print(np.shape(max_temp_points))
print(np.mean(max_temp_points))
count = 1
np.save("Max_temp_points_1",max_temp_points)
X = max_temp_points
model = ARIMA(X, order=(5,0,4))
model_fit = model.fit(disp=-1)
# print summary of fit model
print(model_fit.summary())
forecast = model_fit.predict()
print (forecast)
# plot
start_index = len(X)
end_index = start_index + 6
predict_val = model_fit.predict(start=start_index, end=end_index)
print('Prediction:',predict_val)
pyplot.plot(X)
pyplot.plot(forecast, color='red')
pyplot.show()
I have a code which uses a list of image URLs from a CSV file and then performs face detection on those images after which it loads some models and does predictions on those images.
I did some load tests and found that the get_face function in the code takes a major chunk of the time required to produce the results and the extra time is taken by the pickle file created for predictions.
Question: Is it possible to reduce the time by running these processes in threads, and if so, how can this be done in a multithreading way?
Here is the code example:
from __future__ import division
import numpy as np
from multiprocessing import Process, Queue, Pool
import os
import pickle
import pandas as pd
import dlib
from skimage import io
from skimage.transform import resize
df = pd.read_csv('/home/instaurls.csv')
detector = dlib.get_frontal_face_detector()
img_width, img_height = 139, 139
confidence = 0.8
def get_face():
    output = None
    data1 = []
    for row in df.itertuples():
        img = io.imread(row[1])
        dets = detector(img, 1)
        for i, d in enumerate(dets):
            img = img[d.top():d.bottom(), d.left():d.right()]
            img = resize(img, (img_width, img_height))
            output = np.expand_dims(img, axis=0)
            break
        data1.append(output)
    data1 = np.concatenate(data1)
    return data1

get_face()
csv sample
data
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/23101834_1502115223199537_1230866541029883904_n.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/17883193_940000882769400_8455736118338387968_a.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/22427207_1737576603205281_7879421442167668736_n.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/12976287_1720757518213286_1180118177_a.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/23101834_1502115223199537_1230866541029883904_n.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/16788491_748497378632253_566270225134125056_a.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/21819738_128551217878233_9151523109507956736_n.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/14295447_318848895135407_524281974_a.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/18160229_445050155844926_2783054824017494016_a.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/23101834_1502115223199537_1230866541029883904_n.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/17883193_940000882769400_8455736118338387968_a.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/22427207_1737576603205281_7879421442167668736_n.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/12976287_1720757518213286_1180118177_a.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/23101834_1502115223199537_1230866541029883904_n.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/16788491_748497378632253_566270225134125056_a.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/21819738_128551217878233_9151523109507956736_n.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/14295447_318848895135407_524281974_a.jpg
https://scontent-frx5-1.cdninstagram.com/t51.2885-19/s320x320/18160229_445050155844926_2783054824017494016_a.jpg
https://scontent-frt3-2.cdninstagram.com/t51.2885-19/s320x320/23101834_1502115223199537_1230866541029883904_n.jpg
Here is how you could try to do it in parallel:
from __future__ import division
import numpy as np
from multiprocessing import Process, Queue, Pool
import os
import pickle
import pandas as pd
import dlib
from skimage import io
from skimage.transform import resize
from csv import DictReader
df = DictReader(open('/home/instaurls.csv')) # DictReader is iterable
detector = dlib.get_frontal_face_detector()
img_width, img_height = 139, 139
confidence = 0.8
def get_face(row):
    """
    Here row is a dictionary where keys are CSV header names
    and values are the values from the current CSV row.
    """
    output = None
    img = io.imread(row[1])  # row[1] has to be changed to row['data']?
    dets = detector(img, 1)
    for i, d in enumerate(dets):
        img = img[d.top():d.bottom(), d.left():d.right()]
        img = resize(img, (img_width, img_height))
        output = np.expand_dims(img, axis=0)
        break
    return output

if __name__ == '__main__':
    pool = Pool()  # defaults to the number of CPU cores
    data = list(pool.imap(get_face, df))
    print(np.concatenate(data))
Pay attention to get_face, the argument it takes, and what it returns. This is what I meant by smaller chunks of work: get_face now processes a single row from the CSV.
When you run this script, pool is a reference to an instance of Pool, and pool.imap then calls get_face for each row read from the CSV.
After everything is done, data holds the processed results, and np.concatenate combines them.
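One caveat worth adding as an assumption check: get_face returns None for rows where dlib finds no face, and np.concatenate cannot handle None entries, so it may help to filter those out first, e.g.:

import numpy as np

data = [d for d in data if d is not None]   # drop rows where no face was detected
faces = np.concatenate(data)
print(faces.shape)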
I have downloaded Caltech101. Its structure is:
#Caltech101 dir
#class1 dir
#images of class1 jpgs
#class2 dir
#images of class2 jpgs
...
#class100 dir
#images of class100 jpgs
My problem is that I can't keep in memory two np arrays x and y of shape (9144, 240, 180, 3) and (9144,). So my solution is to pre-allocate an h5py dataset, load the data in 2 chunks, and write them to file one after the other. Precisely:
from __future__ import print_function
import os
import glob
from scipy.misc import imread, imresize
from sklearn.utils import shuffle
import numpy as np
import h5py
from time import time
def load_chunk(images_dset, labels_dset, chunk_of_classes, counter, type_key, prev_chunk_length):
    # getting images and processing
    xtmp = []
    ytmp = []
    for label in chunk_of_classes:
        img_list = sorted(glob.glob(os.path.join(dir_name, label, "*.jpg")))
        for img in img_list:
            img = imread(img, mode='RGB')
            img = imresize(img, (240, 180))
            xtmp.append(img)
            ytmp.append(label)
        print(label, 'done')

    x = np.concatenate([arr[np.newaxis] for arr in xtmp])
    y = np.array(ytmp, dtype=type_key)
    print('x: ', type(x), np.shape(x), 'y: ', type(y), np.shape(y))

    # writing to dataset
    a = time()
    images_dset[prev_chunk_length:prev_chunk_length + x.shape[0], :, :, :] = x
    print(labels_dset.shape)
    print(y.shape, y.shape[0])
    print(type(y), y.dtype)
    print(prev_chunk_length)
    labels_dset[prev_chunk_length:prev_chunk_length + y.shape[0]] = y
    b = time()
    print('Chunk', counter, 'written in', b - a, 'seconds')

    return prev_chunk_length + x.shape[0]

def write_to_file(remove_DS_Store):
    if os.path.isfile('caltech101.h5'):
        print('File exists already')
        return
    else:
        # the name of each dir is the name of a class
        classes = os.listdir(dir_name)
        if remove_DS_Store:
            classes.pop(0)  # removes .DS_Store - may not be needed on other terminals

        # need the dtype of y in order to initialize the h5 dataset
        s = ''
        key_type_y = s.join(['S', str(len(max(classes, key=len)))])
        classes = np.array(classes, dtype=key_type_y)

        # number of chunks in which the dataset must be divided
        nb_chunks = 2
        nb_chunks_loaded = 0
        prev_chunk_length = 0

        # open the file and allocate the datasets
        f = h5py.File('caltech101.h5', 'a')
        imgs = f.create_dataset('images', shape=(9144, 240, 180, 3), dtype='uint8')
        labels = f.create_dataset('labels', shape=(9144,), dtype=key_type_y)

        for class_sublist in np.array_split(classes, nb_chunks):
            # loading chunk by chunk in a function to avoid memory overhead
            prev_chunk_length = load_chunk(imgs, labels, class_sublist, nb_chunks_loaded, key_type_y, prev_chunk_length)
            nb_chunks_loaded += 1

        f.close()
        print('Images and labels saved to \'caltech101.h5\'')
        return

dir_name = '../Datasets/Caltech101'
write_to_file(remove_DS_Store=True)
This works quite well, and also reading is actually fast enough. The problem is that I need to shuffle the dataset.
Observations:
- Shuffling the dataset objects directly: obviously very slow because they're on disk.
- Creating an array of shuffled indices and using advanced NumPy indexing: this means slower reading from file.
- Shuffling before writing to file would be nice; the problem is that I only have about half of the dataset in memory at any time, so I would get an improper shuffle.
Can you think of a way to shuffle before writing? I'm also open to solutions which rethink the writing process, as long as they don't use a lot of memory.
You could shuffle the file paths before reading the image data.
Instead of shuffling the image data in memory, create a list of all file paths that belong to the dataset. Then shuffle the list of file paths. Now you can create your HDF5 database as before.
You could for example use glob to create the list of files for shuffling:
import glob
import random

files = glob.glob('../Datasets/Caltech101/*/*.jpg')
random.shuffle(files)  # shuffles the list in place; random.shuffle returns None
You could then retrieve the class label and image name from the path:
import os

for file_path in files:
    label = os.path.basename(os.path.dirname(file_path))
    image_id = os.path.splitext(os.path.basename(file_path))[0]
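To close the loop, here is a hedged sketch of writing the shuffled list straight into an HDF5 file with the same (240, 180) image size and dataset names as in your code, one image at a time so memory stays low; the output file name is a placeholder and scipy.misc's imread/imresize are reused from your script:

import os
import glob
import random
import h5py
import numpy as np
from scipy.misc import imread, imresize

files = glob.glob('../Datasets/Caltech101/*/*.jpg')
random.shuffle(files)  # shuffle paths, not pixels

# fixed-length byte dtype for the labels, as in write_to_file
class_names = sorted({os.path.basename(os.path.dirname(p)) for p in files})
label_dtype = 'S{}'.format(len(max(class_names, key=len)))

with h5py.File('caltech101_shuffled.h5', 'w') as f:
    imgs = f.create_dataset('images', shape=(len(files), 240, 180, 3), dtype='uint8')
    labels = f.create_dataset('labels', shape=(len(files),), dtype=label_dtype)
    for i, file_path in enumerate(files):
        label = os.path.basename(os.path.dirname(file_path))
        imgs[i] = imresize(imread(file_path, mode='RGB'), (240, 180))
        labels[i] = np.string_(label)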