How to compress an HDF5 file when resizing? - python

Here is my code:
n = 100000  # This is what makes it tricky - lots of files going into this hdf5 file
with h5py.File('image1.h5', 'w') as f:
    dset_X = f.create_dataset('X', (1, 960, 224, 224), maxshape=(None, 960, 224, 224), chunks=True, compression='gzip')
    dset_y = f.create_dataset('y', (1, 112, 224*224), maxshape=(None, 112, 224*224), chunks=True, compression='gzip')
    n_images = 0
    for fl in files[:n]:
        X_chunk, y_chunk = get_arrays(fl)
        dset_X.resize(n_images + 1, axis=0)
        dset_y.resize(n_images + 1, axis=0)
        print dset_X.shape, dset_y.shape
        dset_X[n_images:n_images+1, :, :, :] = X_chunk
        dset_y[n_images:n_images+1, :, :] = y_chunk
        n_images += 1
This works fine and dandy. However, with 1 file the HDF5 file is 6.7 MB. With 2 files it's 37 MB (shouldn't it be about 12 MB?). With 10 it's all the way up to 388 MB (shouldn't it be about 67 MB?).
So clearly adding the compression flag to the second and third lines isn't working as intended. How can I achieve something like this?
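One thing worth checking before giving up on h5py (an assumption on my part, not something verified against this data): with chunks=True, h5py picks an automatic chunk shape based on the initial (1, 960, 224, 224) shape, and gzip compresses per chunk, so an unlucky chunk layout can inflate the file. Below is a minimal sketch that sets an explicit per-sample chunk shape and gzip level, reusing the question's files, n and get_arrays; float32 here is an assumption, so match it to whatever get_arrays actually returns.

import h5py
import numpy as np

with h5py.File('image1.h5', 'w') as f:
    # Explicit chunk shape (one sample per chunk) and an explicit gzip level, instead of chunks=True.
    dset_X = f.create_dataset('X', shape=(0, 960, 224, 224), maxshape=(None, 960, 224, 224),
                              dtype='float32', chunks=(1, 960, 224, 224),
                              compression='gzip', compression_opts=4)
    dset_y = f.create_dataset('y', shape=(0, 112, 224 * 224), maxshape=(None, 112, 224 * 224),
                              dtype='float32', chunks=(1, 112, 224 * 224),
                              compression='gzip', compression_opts=4)
    for n_images, fl in enumerate(files[:n]):
        X_chunk, y_chunk = get_arrays(fl)
        dset_X.resize(n_images + 1, axis=0)
        dset_y.resize(n_images + 1, axis=0)
        dset_X[n_images:n_images + 1] = X_chunk
        dset_y[n_images:n_images + 1] = y_chunk

The chunk shape is a tuning knob: one sample per chunk keeps writes simple, but smaller chunks trade memory use against compression ratio.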

I ended up doing this successfully using PyTables.
def get_arrays(each_file):
    lab = color.rgb2lab(io.imread(each_file))
    X = lab[:,:,:1]
    y = lab[:,:,1:]
    X_rows, X_columns, X_channels = X.shape
    y_rows, y_columns, y_channels = y.shape
    X_channels_first = np.transpose(X, (2,0,1))
    X_sample = np.expand_dims(X_channels_first, axis=0)
    X_3d = np.tile(X_sample, (1,3,1,1))
    X_3d_scaled = X_3d * 255.0/X_3d.max()
    hc = extract_hypercolumn(model, [3,8,15,22], X_3d_scaled)
    hc_scaled = (hc - hc.min())/(hc.max() - hc.min())
    print hc_scaled.max(), hc_scaled.min()
    hc_expand_dims = np.expand_dims(hc_scaled, axis=0)
    y_reshaped = np.reshape(y, (y_rows*y_columns, y_channels))
    classed_pixels_first = KNN.predict_proba(y_reshaped)
    classed_classes_first = np.transpose(classed_pixels_first, (1,0))
    classed_expand_dims = np.expand_dims(classed_classes_first, axis=0)
    print "hypercolumn shape: ", hc_expand_dims.shape, "classified output color shape: ", classed_expand_dims.shape
    return hc_expand_dims, classed_expand_dims
filters = tables.Filters(complevel=5, complib='zlib')
with tables.open_file('raw.h5', 'w') as f:
    # filters = tables.Filters(complib='blosc', complevel=5)
    dset_X = f.create_earray(f.root, 'X', tables.Atom.from_dtype(np.dtype('float64')), (0,960,224,224), filters=filters)
    dset_y = f.create_earray(f.root, 'y', tables.Atom.from_dtype(np.dtype('float64')), (0,112,224*224), filters=filters)
    for fl in files[0:12000]:
        X_chunk, y_chunk = get_arrays(fl)
        dset_X.append(X_chunk)
        dset_y.append(y_chunk)
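For completeness, a quick way to sanity-check the result (a sketch, assuming the file above was written successfully) is to reopen it and inspect the shapes and the filters actually attached to each array:

import tables

# Sketch: reopen the compressed file and confirm the EArrays grew and carry the zlib filters.
with tables.open_file('raw.h5', 'r') as f:
    print(f.root.X.shape, f.root.y.shape)  # e.g. (n_images, 960, 224, 224) and (n_images, 112, 50176)
    print(f.root.X.filters)                # should report complib='zlib', complevel=5
    first_sample = f.root.X[0]             # reads one compressed sample back into memory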

Related

Pattern Matching Support

I'm trying to create a system for pattern matching in images and this is what I have currently:
def ratio(x, y):
    return 1/(x*y)

def memory(image):
    i__ia = 0
    l__img = Image.open(image)
    l__pix = l__img.load()
    l__width = l__img.width
    l__height = l__img.height
    s__img = None
    l__r = 0
    l__b = 0
    l__g = 0
    s__str = None
    s__per = 0
    for filename in os.listdir("Memory/"):
        i__ia += 1
        m__percent = 0
        p__r = 0
        p__g = 0
        p__b = 0
        f = os.path.join("Memory/", filename)
        m__img = Image.open(f)
        m__pix = m__img.load()
        for l__w in range(0, l__width, 1):
            for l__h in range(0, l__height, 1):
                try:
                    if m__pix[l__w, l__h] == l__pix[l__w, l__h]:
                        temp = m__pix[l__w, l__h]
                        a__percent = ratio(l__width, l__height)
                        p__r += temp[0]
                        p__g += temp[1]
                        p__b += temp[2]
                        m__percent += a__percent
                    else:
                        p__r += temp[0]
                        p__g += temp[1]
                        p__b += temp[2]
                        a__percent = ratio(l__width, l__height)
                        m__percent -= a__percent
                except:
                    a__percent = ratio(l__width, l__height)
                    m__percent -= a__percent
                    pass
        if m__percent > s__per and p__r >= l__r and p__b >= l__b and p__g >= l__g:
            s__per = m__percent
            s__img = m__img
            s__str = f
            l__r = (1/(p__r+p__b+p__g))*p__r
            l__b = (1/(p__r+p__b+p__g))*p__b
            l__g = (1/(p__r+p__b+p__g))*p__g
        else:
            m__percent = 0
    if s__per > 0.8:
        return True, s__per, s__str, l__r, l__g, l__b
    else:
        try:
            tes = Image.open(image)
            tes.save(f'Memory/unknown{i__ia}.png')
            return False, s__per, s__str, l__r, l__g, l__b
        except:
            pass
My end goal is to find differences between pictures, find similarities, and be able to classify an image based on how similar it is to other images.
Currently, all I've made is something that can tell how similar two images are based on their color differences and size differences.
I'm having trouble thinking of a way to detect whether there is a similar shape or pattern in the image.
Any help is greatly appreciated!
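For reference, the per-pixel comparison the loop above performs can be expressed compactly with numpy. This is only a sketch of the same idea, assuming both images are RGB and the same size; the function name and file paths are placeholders, and it skips the +/- bookkeeping and the running RGB sums.

import numpy as np
from PIL import Image

def match_fraction(path_a, path_b):
    """Fraction of pixels that are identical in two same-sized RGB images (a sketch)."""
    a = np.asarray(Image.open(path_a).convert("RGB"))
    b = np.asarray(Image.open(path_b).convert("RGB"))
    if a.shape != b.shape:
        return 0.0  # the loop above effectively penalizes size mismatches via its except branch
    equal = np.all(a == b, axis=-1)  # True where all three channels agree
    return float(equal.mean())       # plays the role of m__percent

# Hypothetical usage:
# print(match_fraction("query.png", "Memory/unknown1.png"))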

Using a for loop to plot arrays from dictionaries

I have a dictionary with multiple keys, defined as (arbitrary inputs):
colors = {}
colors['red'] = {}
colors['blue'] = {}
colors['red'][clustname] = np.array([])
colors['blue'][clustname] = np.array([])
Basically I want to plot a red v blue graph for each 'cluster'. I have 13 'clusters' in total with differing color values for each. The names in my code are different from the arbitrary ones above, but I figured it would be easier to understand with basic values than to look at the overall code:
colpath = '/home/jacob/PHOTOMETRY/RESTFRAME_COLOURS/'  # This is the path to the restframe colors
goodcolindx = {}
colfiledat = {}
colors = {}
colors['UMINV'] = {}
colors['VMINJ'] = {}
colors['NUVMINV'] = {}
colors['id'] = {}
for iclust in range(len(clustname)):
    colors['UMINV'][clustname[iclust]] = np.array([])
    colors['VMINJ'][clustname[iclust]] = np.array([])
    colors['id'][clustname[iclust]] = np.array([])
    colors['NUVMINV'][clustname[iclust]] = np.array([])
    filepath = catpath + clustname[iclust] + "_totalall_" + extname[iclust] + ".cat"
    photdat[clustname[iclust]] = ascii.read(filepath)
    filepath = zpath + "compilation_" + clustname[iclust] + ".dat"
    zdat[clustname[iclust]] = ascii.read(filepath)
    colfilepath = colpath + 'RESTFRAME_MASTER_' + clustname[iclust] + '_indivredshifts.cat'
    colfiledat[clustname[iclust]] = ascii.read(colfilepath)
    goodcolindx[clustname[iclust]] = np.where((colfiledat[clustname[iclust]]['REDSHIFTUSED'] > 0.9) & \
                                              (colfiledat[clustname[iclust]]['REDSHIFTUSED'] < 1.5) & \
                                              (photdat[clustname[iclust]]['totmask'] == 0) & \
                                              (photdat[clustname[iclust]]['K_flag'] == 0) & \
                                              ((zdat[clustname[iclust]]['quality'] == 3) | (zdat[clustname[iclust]]['quality'] == 4)))
    goodcolindx[clustname[iclust]] = goodcolindx[clustname[iclust]][0]
    for igood in range(len(goodcolindx[clustname[iclust]])):
        idstring = str(photdat[clustname[iclust]]['id'][goodcolindx[clustname[iclust]][igood]])
        colors['NUVMINV'][clustname[iclust]] = np.append(colors['NUVMINV'][clustname[iclust]], -2.5 *
                                                         np.log10(colfiledat[clustname[iclust]]['NUV'][goodcolindx[clustname[iclust]][igood]]
                                                                  / colfiledat[clustname[iclust]]['V'][goodcolindx[clustname[iclust]][igood]]))
        colors['UMINV'][clustname[iclust]] = np.append(colors['UMINV'][clustname[iclust]], colfiledat[clustname[iclust]]['UMINV'][goodcolindx[clustname[iclust]][igood]])
        colors['id'][clustname[iclust]] = np.append(colors['id'][clustname[iclust]], photdat[clustname[iclust]]['id'][goodcolindx[clustname[iclust]][igood]])
        colors['VMINJ'][clustname[iclust]] = np.append(colors['VMINJ'][clustname[iclust]], colfiledat[clustname[iclust]]['VMINJ'][goodcolindx[clustname[iclust]][igood]])

for iclustc in colors:
    plt.plot(colors['VMINJ'][clustname[iclustc]], colors['UMINV'][clustname[iclustc]], 'ko')
plt.show()
So in this case, my 'red' is the VMINJ and my 'blue' is the UMINV. I am trying to use a for loop to cycle through all the cluster names that I have, but I keep getting back the error 'string indices must be integers'. I understand the basics of that, but don't know how to fix my code to make a plot of 'red' v 'blue' for each cluster. Any help would be awesome; let me know if you have questions.
I figured it out. I changed the for loop to:
for iclust in range(len(clustname)):
    plt.plot(colors['UMINV'][clustname[iclust]]....
and that worked.
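For anyone hitting the same error: iterating with `for iclustc in colors:` yields the string keys 'UMINV', 'VMINJ', 'NUVMINV', 'id', which then get used as indices into `clustname`, hence "string indices must be integers". A sketch of an equivalent plotting loop (assuming `clustname` is a list of cluster-name strings and the `colors` dict is filled as above) that loops over the names directly:

import matplotlib.pyplot as plt

# Sketch: loop over the cluster names themselves rather than over the keys of `colors`.
for name in clustname:
    plt.plot(colors['VMINJ'][name], colors['UMINV'][name], 'ko')
    plt.xlabel('VMINJ')
    plt.ylabel('UMINV')
    plt.title(name)
    plt.show()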

Python program doesn't write to output csv, everything else seems to work correctly

from subprocess import check_output
import csv, operator

extinction_pct = operator.itemgetter('AOT 500','AOT 675','AOT 870','AOT 936','AOT 1020')
with open('csv_export.csv') as f_csv:
    reader = csv.DictReader(f_csv)
    for row in reader:
        with open("INPUT", 'w') as f_in:
            f_in.write("&INPUT\n")
            f_in.write("WLINF = 0.250\n")   # lower frequency value
            f_in.write("WLSUP = 4.0\n")     # highest frequency value
            f_in.write("WLINC = 0.5\n")     # wavelength increment
            f_in.write("IDAY = 289\n")      # computing for a specific day
            #f_in.write("ALAT = {Lat}\n".format(**row))   # for Python versions less than 3.6
            f_in.write(f"ALAT = {row['Lat']}\n")          # latitude of the location
            #f_in.write("ALON = {Long}\n".format(**row))  # for Python versions less than 3.6
            f_in.write(f"ALON = {row['Long']}\n")         # longitude of the location
            f_in.write("IDATM = 3\n")       # atmospheric model 2 - mid latitude summer
            f_in.write("ISALB = 5\n")       # surface albedo feature
            f_in.write("IAER = 5\n")        # boundary layer aerosol type selection - 5 - user defined spectral dependence of BLA
            f_in.write("WLBAER = .500,.675,.870,.936,1.02\n")  # wavelength points for IAER
            f_in.write("WBAER = 5*0.9\n")   # single scattering albedo
            f_in.write("GBAER = 5*0.8\n")   # asymmetry factor used with IAER
            #f_in.write("TIME = {sama}\n".format(**row))  # for Python versions less than 3.6
            f_in.write(f"TIME = {row['sama']}\n")         # Time in IST format (-5.30hr)
            #f_in.write("QBAER = {}\n".format(','.join(extinction_pct(row))))  # for Python versions less than 3.6
            f_in.write(f"QBAER = {','.join(extinction_pct(row))}\n")  # extinction efficiency percentage
            f_in.write("ZOUT = 0.0,15.0\n") # TOA defining
            f_in.write("/\n")
        check_output('sbdart >> output1.csv', shell=True)  # sbdart is the program, and output1.csv is the output file
This is my code, with help from #wwii.
My last line, the check_output call, doesn't write anything to my output file at all. What could be the issue?
Thanks.
sbdart is a program that takes the INPUT file and prints its results on the command line.
Using the method provided here, you can try this:
import subprocess
proc = subprocess.Popen('cmd.exe', stdin=subprocess.PIPE, stdout=subprocess.PIPE)
stdout, stderr = proc.communicate('sbdart >> output.csv')
Make sure you give the full path of sbdart, or navigate to the folder containing sbdart, or add the location of sbdart to the system path.
There are a bunch of other methods in the link provided.
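Another option worth trying (a sketch, not verified against sbdart itself) is to skip the shell redirection entirely and let Python handle the output file, which also surfaces an error if the program can't be found or exits non-zero:

import subprocess

# Sketch: run sbdart in the current directory and append its stdout to output1.csv ourselves.
with open('output1.csv', 'a') as out:
    subprocess.run(['sbdart'], stdout=out, check=True)  # raises CalledProcessError if sbdart exits non-zero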
Working on Linux with Python 3.5.
Assume sbdart is executable and we have a file called output1.csv.
sbdart looks like this for our test case:
echo $1
echo "$(cat $1)"
output1.csv is as follows:
&INPUT
WLINF = 0.250
WLSUP = 4.0
WLINC = 0.5
IDAY = 289
ALAT = {row['Lat']}
ALON = {row['Long']}
IDATM = 3
ISALB = 5
IAER = 5
WLBAER = .500,.675,.870,.936,1.02
WBAER = 5*0.9
GBAER = 5*0.8
TIME = {row['sama']}
QBAER = {','.join(extinction_pct(row))}
ZOUT = 0.0,15.0
/
>>> import subprocess
>>> subprocess.check_output(['./sbdart output1.csv'],shell=True)
b"output1.csv\n&INPUT\nWLINF = 0.250\nWLSUP = 4.0\nWLINC = 0.5\nIDAY = 289\nALAT = {row['Lat']}\nALON = {row['Long']}\nIDATM = 3\nISALB = 5\nIAER = 5\nWLBAER = .500,.675,.870,.936,1.02\nWBAER = 5*0.9\nGBAER = 5*0.8\nTIME = {row['sama']}\nQBAER = {','.join(extinction_pct(row))}\nZOUT = 0.0,15.0\n/\n"
>>>

While loop incrementer not functioning properly

Right now, my code is correctly spitting out the first game (identified by start_id) in games. I am trying to increment in the bottom two lines, but the while loop doesn't seem to register the fact that I'm incrementing. So the output of this with start_id 800 and end_id 802 is just the information from 800, for some reason.
Am I using the incrementers correctly? Should I be initializing one of i or start_id elsewhere?
games = console(start_id, end_id)
final_output = []
while start_id < (end_id + 1):
    single_game = []
    i = 0
    game_id = games[i][0]
    time_entries = games[i][1][2][0]
    play_entries = games[i][1][2][1]
    score_entries = games[i][1][2][2]
    team_entries = games[i][1][2][3]
    bovada = games[i][1][0][0][0]
    at_capacity = games[i][1][0][1]
    idisagree_yetrespect_thatcall = games[i][1][0][2][0]
    imsailingaway = games[i][1][1][0][0]
    homeiswheretheheartis = games[i][1][1][1][0]
    zipper = zip(time_entries, play_entries, score_entries, team_entries)
    for play_by_play in zipper:
        single_game.append(game_id)
        single_game.append(play_by_play)
        single_game.append(bovada)
        single_game.append(at_capacity)
        single_game.append(idisagree_yetrespect_thatcall)
        single_game.append(imsailingaway)
        single_game.append(homeiswheretheheartis)
    start_id += 1
    i += 1
    final_output.append(single_game)
return final_output
Your problem is that you initialize the incrementer i inside the while loop, so every time your loop iterates i is reset to zero.
Try changing it to:
i = 0
while start_id < (end_id + 1):
    ...
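As a further cleanup, here is a sketch of an equivalent structure that drops the manual counters entirely, under the assumption that `games` holds one entry per game id, in order (trimmed to the first few fields for brevity):

final_output = []
for game in games:                       # one pass per game; no manual start_id / i counters
    single_game = []
    game_id = game[0]
    time_entries, play_entries, score_entries, team_entries = game[1][2][:4]
    for play_by_play in zip(time_entries, play_entries, score_entries, team_entries):
        single_game.append(game_id)
        single_game.append(play_by_play)
        # ... append the remaining per-game fields here, as in the original loop
    final_output.append(single_game)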

Python and matplotlib - Plotting and drawing images inside class methods result in figures that are "not responding"

I have created a class, ParRec, that reads data from Par/Rec files (binary images -> numpy arrays), and now I want to add additional methods to the class to increase functionality, such as a method createFigure(self, ind) that would draw an image of the 2D image at ind in the 3D stack of images.
However, this doesn't work no matter what I try; I only get empty figures which are "not responding". There is nothing wrong with the data per se, because when I saved the array outside the python script and did the exact same thing as in the createFigure method, it did work. These are the relevant parts of the code:
import numpy as np
import matplotlib.pyplot as plt

class ParRec():
    def __init__(self, filename, equalResolution = True):
        """
        This class is used to read and create a ParRec-object, which contains
        arrays of parameters and the corresponding image matrix, in a similar way
        to the IDL-struct system previously implemented.
        Additionally, the class has several methods that can be called later to
        simplify calculations, e.g. removePhaseImages and selectSlice.

        filename - full pathname to the Par/Rec-files, without .extension
        equalResolution - if set to false, ParRec-object collects parameters like
                          xres and yres in arrays, otherwise they are constants
                          and all images in the stack are assumed to be of equal
                          dimensions.
        """
        self.slice_number = 0
        self.echo_number = 0
        self.dynamic_scan_number = 0
        self.cardiac_phase_number = 0
        self.image_type_mr = 0
        self.scanning_sequence = 0
        self.index_in_REC_file = 0            # (in images)
        self.image_pixel_size = 0             # (in bits)
        self.scan_percentage = 0
        self.recon_resolution = 0             # (x y)
        self.rescale_intercept = 0
        self.rescale_slope = 0
        self.scale_slope = 0
        self.window_center = 0
        self.window_width = 0
        self.image_angulation = 0             # (ap,fh,rl in degrees)
        self.image_offcentre = 0              # (3*float)
        self.slice_thickness = 0              # (in mm)
        self.slice_gap = 0                    # (in mm)
        self.image_display_orientation = 0
        self.slice_orientation = 0            # (TRA/SAG/COR) = (1/2/3)
        self.fmri_status_indication = 0
        self.image_type_ed_es = 0             # (end diast/end syst)
        self.pixel_spacing = 0                # (x y) (in mm)
        self.echo_time = 0
        self.dyn_scan_begin_time = 0
        self.trigger_time = 0
        self.diffusion_b_factor = 0
        self.number_of_averages = 0
        self.image_flip_angle = 0             # (in degrees)
        self.cardiac_frequency = 0            # (bpm)
        self.minimum_RR_interval = 0          # (in ms)
        self.maximum_RR_interval = 0          # (in ms)
        self.turbo_factor = 0                 # (0=no turbo)
        self.inversion_delay = 0              # (in ms)
        self.diffusion_b_value_number = 0     # (imagekey!)
        self.gradient_orientation_number = 0  # (imagekey!)
        self.contrast_type = 0
        self.diffusion_anisotropy_type = 0
        self.diffusion = 0                    # (ap, fh, rl)
        self.label_type = None                # (ASL) (imagekey!)
        self.xres = 0
        self.yres = 0
        self.size = 0
        self.equalResolution = equalResolution

        # Read ParRec-files
        self.readPar(filename)
        self.readRec(filename)
        # Get number of slices, dynamics, information about phase images etc.
        self.getAdditionalData()

    def readPar(self, filename):
        ...

    def readRec(self, filename):
        ...
        self.matrix = ...

    def createFigure(self, ind):
        img = self.matrix[:,:,ind].astype(np.float32)
        fig = plt.figure()
        plt.imshow(img, cmap = plt.cm.gray)
        return fig

if __name__ == '__main__':
    filename = '...\filename'
    obj = ParRec(filename)
    fig = obj.createFigure(0)
    plt.show()
Can anybody explain what's going on? That is, why the image-drawing doesn't work when used as a class-method like this, and how to make it work?
EDIT: Inserted the init method, and managed to screw up the indentation a bit, but that is not a problem in the original code.
Best regards,
Mikael
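For comparison, here is a stripped-down sketch of the same pattern with a hypothetical stand-in class (random data instead of Par/Rec files). With an interactive backend this displays normally, which suggests the method-returns-a-figure structure itself is not the problem:

import numpy as np
import matplotlib.pyplot as plt

class Dummy:
    """Hypothetical stand-in for ParRec: holds a small 3D stack of random images."""
    def __init__(self):
        self.matrix = np.random.rand(64, 64, 3)

    def createFigure(self, ind):
        img = self.matrix[:, :, ind].astype(np.float32)
        fig = plt.figure()
        plt.imshow(img, cmap=plt.cm.gray)
        return fig

if __name__ == '__main__':
    obj = Dummy()
    fig = obj.createFigure(0)
    plt.show()   # blocks until the window is closed; the figure should render normally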
