Python process being killed after running out of 256 GB of memory - python

The training dataset is a 42 GB JSON file. mesh is Medical Subject Headings; consider it an ID or a label. neighbors_mesh is a 28,000-dimensional list that holds information about MeSH terms that are close to each other; we obtained it by running KNN over the training MeSH terms of 1.07 M records. The MLB fit_transform returns a 28,000-dimensional vector of 0s and 1s, but each element is int64 by default. I have tried to reduce it with mask.astype(np.int_), and it's still 32-bit.
The loop consumes all 256 GB of memory after running for about 1 M iterations and the process still gets killed.
My Python version is 3.9.
The machine has 256 GB of RAM, 20 GB of swap, 48 CPU cores, and a GPU.
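
One knob worth trying first: MultiLabelBinarizer supports sparse_output=True, and its dense output can be cast down to uint8, so each 28,000-slot mask costs 28 KB (or far less when sparse) instead of 224 KB of int64. A minimal sketch, with classes standing in for meshIDs_str:

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

classes = [str(i) for i in range(28000)]  # stand-in for meshIDs_str
mlb = MultiLabelBinarizer(classes=classes)
mlb.fit([classes])
dense = mlb.transform([['3', '17', '42']]).astype(np.uint8)  # 1 byte per label instead of 8
print(dense.dtype, dense.nbytes)  # uint8 28000

mlb_sparse = MultiLabelBinarizer(classes=classes, sparse_output=True)
mlb_sparse.fit([classes])
sparse = mlb_sparse.transform([['3', '17', '42']])  # CSR matrix: stores only the ones
print(sparse.data.nbytes + sparse.indices.nbytes + sparse.indptr.nbytes)

The build_dataset function in question is below.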
import sys
import ijson
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm

def build_dataset(train_path, neighbors, journal_mesh, MeSH_id_pair_file, index_dic):
    mapping_id = {}
    with open(MeSH_id_pair_file, 'r') as f:
        for line in f:
            (key, value) = line.split('=')
            mapping_id[key] = value.strip()
    meshIDs = list(mapping_id.values())
    meshIDs = label2index(meshIDs, index_dic)
    meshIDs_str = [str(x) for x in meshIDs]
    print('Total number of labels %d' % len(meshIDs_str))
    mlb = MultiLabelBinarizer(classes=meshIDs_str)
    mlb.fit(meshIDs_str)
    pmid_neighbors, neighbors_mesh = read_neighbors(neighbors, index_dic)

    f = open(train_path, encoding="utf8")
    objects = ijson.items(f, 'articles.item')
    dataset = []
    print("Objects: ", type(objects))
    print("pmid neighbors: ", type(pmid_neighbors))

    for i, obj in enumerate(tqdm(objects)):
        data_point = {}
        try:
            ids = obj["pmid"]
            heading = obj['title'].strip()
            heading = heading.translate(str.maketrans('', '', '[]'))
            abstract = obj["abstractText"].strip()
            clean_abstract = abstract.translate(str.maketrans('', '', '[]'))
            if len(heading) == 0 or heading == 'In process':
                print('paper ', ids, ' does not have title!')
                continue
            elif len(clean_abstract) == 0:
                print('paper ', ids, ' does not have abstract!')
                continue
            else:
                mesh_id = obj['mesh']
                journal = obj['journal']
                year = obj['year']
                mesh_from_journal = journal_mesh[journal]
                mesh_from_neighbors = []
                if i < len(pmid_neighbors) and ids == pmid_neighbors[i]:
                    mesh_from_neighbors = neighbors_mesh[i]
                mesh_from_journal_str = [str(x) for x in mesh_from_journal]
                mesh_from_neighbors_str = [str(x) for x in mesh_from_neighbors]
                mesh = list(set(mesh_from_journal_str + mesh_from_neighbors_str))
                mask = mlb.fit_transform([mesh])
                mask = mask.astype(np.int_)
                mask = mask.tolist()
                print("Mesh size: ", sys.getsizeof(mask))
                print("Mesh content size: ", sys.getsizeof(mask[0][0]))
                print("Mesh content type: ", type(mask[0][0]))
                data_point['pmid'] = ids
                data_point['title'] = heading
                data_point['abstractText'] = clean_abstract
                data_point['meshID'] = mesh_id
                data_point['meshMask'] = mask
                data_point['year'] = year
                dataset.append(data_point)
                print("dataset size: ", sys.getsizeof(dataset))
        except AttributeError as e:
            print(f'An exception occurred for pmid: {obj["pmid"].strip()}', e.args)
    pubmed = {'articles': dataset}
    return pubmed

I successfully finished running the code by adding f.close() after the iteration completes. The result is an 88 GB dataset, but I am still curious why it takes up so much space.
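
A rough accounting of where the space goes, assuming ~1 M articles each carrying a 28,000-element mask: as a Python list the mask is 28,000 pointers (~224 KB per article) before counting the int objects, and once serialised to JSON it still costs a few bytes per element. A small sketch to see the numbers:

import sys
import numpy as np

mask_list = [0] * 28000                      # what data_point['meshMask'][0] ends up as
mask_array = np.zeros(28000, dtype=np.uint8)

print(sys.getsizeof(mask_list))              # ~224,056 bytes: 28,000 pointers plus list overhead
print(mask_array.nbytes)                     # 28,000 bytes as a uint8 array
print(len(str(mask_list)))                   # ~84,000 characters once written out as JSON-like text

Storing only the indices of the 1s (or a sparse matrix) instead of the full 0/1 vector shrinks both the in-memory dataset and the resulting file dramatically.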

Related

OSError: The topology is loaded by filename extension, and the detected ".pd" format is not supported

I'm writing code to generate features of ligands. The code is:
def parsePDB(self):
    top = self.pdb.topology
    residues = [str(residue)[:3] for residue in top.residues]
    residues_cp = residues.copy()
    # number of all groups in the complex except the ligand
    LIG_number = residues.index('LIG')
    self.residues_indices = [i for i in range(top.n_residues) if i != LIG_number]
    # Get the types of atoms in the protein and ligand
    self.rec_indices = top.select('protein')
    self.lig_indices = top.select('resname LIG')
    table, bond = top.to_dataframe()
    self.all_ele = table['element']
    self.lig_ele = table['element'][self.lig_indices]
    H_num = []
    for num, i in enumerate(self.all_ele):
        if i == 'H':
            H_num.append(num)
    # Get the serial number of the atom in each residue or group
    removes = []
    for i in self.residues_indices:
        atoms = top.residue(i).atoms
        each_atoms = [j.index for j in atoms]
        heavy_atoms = [x for x in each_atoms if not x in H_num]
        if len(heavy_atoms) == 0:
            removes.append(i)
        else:
            self.atoms_indices.append(heavy_atoms)
    if len(removes) != 0:
        for i in removes:
            self.residues_indices.remove(i)
    self.residues = [residues_cp[x] for x in self.residues_indices]
    # Get the 3D coordinates for all atoms
    self.xyz = self.pdb.xyz[0]
    return self
I'm getting this error:
OSError: The topology is loaded by filename extension, and the detected ".pd" format is not supported. Supported topology formats include ".pdb", ".pdb.gz", ".h5", ".lh5", ".prmtop", ".parm7", ".prm7", ".psf", ".mol2", ".hoomdxml", ".gro", ".arc", ".hdf5" and ".gsd".
Can anyone suggest how to resolve this issue? Thanks
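
The error itself says what is going on: mdtraj picks the topology parser from the filename extension, and ".pd" is not one it knows. A minimal sketch, assuming the file really is a PDB that was just saved with a .pd extension (the file names here are hypothetical):

import os
import mdtraj as md

src = 'complex.pd'      # hypothetical input name with the unsupported extension
fixed = 'complex.pdb'   # same file, with an extension mdtraj recognises

os.rename(src, fixed)   # or save it with the right extension in the first place
pdb = md.load(fixed)    # now the ".pdb" parser is selected

If the file is not actually a PDB, it needs to be converted to (or re-exported in) one of the supported formats listed in the error message.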

How to get a big amount of data as fast as possible

I am trying to return an array of constructed objects that are built on top of objects I retrieve from one URL, plus other fields that I get from another URL.
I have an array that consists of two arrays, each with about 8,000 objects...
I have tried to run each object construction as a thread, but it still takes a lot of time...
Any solution? Here is my code:
import concurrent.futures
import json
import requests

# FUTHEAD_PLAYER and Player are defined elsewhere in the project.

def get_all_players_full_data(ea_players_json):
    all = []
    ea_players_json = list(ea_players_json.values())
    for i in range(len(ea_players_json)):
        for player_obj in ea_players_json[i]:
            all.append(player_obj)
    for player_obj in range(len(all)):
        all_data = []
        with concurrent.futures.ThreadPoolExecutor(len(all)) as executor:
            for player_data in all:
                future = executor.submit(build_full_player_data_obj, player_data)
                print(future.result())
                all_data.append(future.result())

def build_full_player_data_obj(ea_player_data):
    if ea_player_data.get("c") is not None:
        player_full_name = ea_player_data.get("c")
    else:
        player_full_name = ea_player_data.get("f") + " " + ea_player_data.get("l")
    player_id = ea_player_data.get("id")
    # go to futhead to find all cards of that player
    futhead_url_player_data = f'{FUTHEAD_PLAYER}{player_full_name}'
    details_of_specific_player = json.loads(requests.get(futhead_url_player_data).content)
    cards_from_the_same_id = []
    for player_in_json_futhead in details_of_specific_player:
        if player_in_json_futhead["player_id"] == player_id:
            rating = player_in_json_futhead["rating"]
            specific_card_id = player_in_json_futhead["def_id"]
            revision = player_in_json_futhead["revision_type"]
            name = player_in_json_futhead["full_name"]
            nation = player_in_json_futhead["nation_name"]
            position = player_in_json_futhead["position"]
            club = player_in_json_futhead["club_name"]
            cards_from_the_same_id.append(Player(specific_card_id, name, rating, revision, nation,
                                                 position, club))
    return cards_from_the_same_id
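
One note on why the threaded version above stays slow: future.result() blocks, so calling it right after executor.submit() waits for each request before submitting the next, which makes the loop effectively sequential. A minimal sketch of submitting everything first and collecting results as they finish (reusing build_full_player_data_obj from above):

import concurrent.futures

def get_all_players_full_data_concurrent(players):
    all_data = []
    # a bounded pool: thousands of threads mostly add overhead
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        futures = [executor.submit(build_full_player_data_obj, p) for p in players]
        for future in concurrent.futures.as_completed(futures):
            all_data.append(future.result())
    return all_data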

error when comparing files : IndexError: list index out of range

I have a program that compares files line by line and calculates the precision by reading two folders, a "gold" folder and a "prediction" folder.
The extracted files are like this:
T1 Task 5 19 nonlinear wave
T2 Task 5 29 nonlinear wave equations
T3 Task 15 29 wave equations
T4 Task 86 111 general analytical method
T5 Task 94 111 analytical method
T6 Task 199 213 minimum stages
T7 Task 268 287 efficient technique
T8 Task 268 298 efficient technique relatingto
and the gold files:
T1 Process 5 14 oxidation
T2 Material 69 84 Ti-based alloys
T3 Material 186 192 alloys
T4 Task 264 349 understand the role that composition has on the oxidation behavior of Ti-based alloys
T5 Process 312 321 oxidation
T6 Material 334 349 Ti-based alloys
T7 Material 400 415 Ti-based alloys
T8 Material 445 451 alloys
T9 Process 480 489 oxidation
The problem is that this code generates this error:
Traceback (most recent call last):
File "C:\Users\chedi\Downloads\Semeval\eval.py", line 214, in <module>
calculateMeasures(folder_gold, folder_pred, remove_anno)
File "C:\Users\chedi\Downloads\Semeval\eval.py", line 31, in calculateMeasures
res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
File "C:\Users\chedi\Downloads\Semeval\eval.py", line 130, in normaliseAnnotations
r_g_offs = r_g[1].split(" ")
IndexError: list index out of range
The error is at line 130 and relates to the format of the extracted files, but they seem to be in the same format: first and second columns separated by a tab, the offsets by spaces.
#!/usr/bin/python
# by Mattew Peters, who spotted that sklearn does macro averaging not micro averaging correctly and changed it
import os
from sklearn.metrics import precision_recall_fscore_support
import sys

def calculateMeasures(folder_gold="data/dev/", folder_pred="data_pred/dev/", remove_anno=""):
    '''
    Calculate P, R, F1, Macro F
    :param folder_gold: folder containing gold standard .ann files
    :param folder_pred: folder containing prediction .ann files
    :param remove_anno: if set to "rel", relations will be ignored. Use this setting to only evaluate
    keyphrase boundary recognition and keyphrase classification. If set to "types", only keyphrase boundary recognition is evaluated.
    Note that for the latter, false positive
    :return:
    '''
    flist_gold = os.listdir(folder_gold)
    res_all_gold = []
    res_all_pred = []
    targets = []
    for f in flist_gold:
        # ignoring non-.ann files, should there be any
        if not str(f).endswith(".ann"):
            continue
        f_gold = open(os.path.join(folder_gold, f), "r")
        try:
            f_pred = open(os.path.join(folder_pred, f), "r")
            res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
        except IOError:
            print(f + " file missing in " + folder_pred + ". Assuming no predictions are available for this file.")
            res_full_pred, res_pred, spans_pred, rels_pred = [], [], [], []
        res_full_gold, res_gold, spans_gold, rels_gold = normaliseAnnotations(f_gold, remove_anno)
        spans_all = set(spans_gold + spans_pred)
        for i, r in enumerate(spans_all):
            if r in spans_gold:
                target = res_gold[spans_gold.index(r)].split(" ")[0]
                res_all_gold.append(target)
                if not target in targets:
                    targets.append(target)
            else:
                # those are the false positives, contained in pred but not gold
                res_all_gold.append("NONE")
            if r in spans_pred:
                target_pred = res_pred[spans_pred.index(r)].split(" ")[0]
                res_all_pred.append(target_pred)
            else:
                # those are the false negatives, contained in gold but not pred
                res_all_pred.append("NONE")
    #y_true, y_pred, labels, targets
    prec, recall, f1, support = precision_recall_fscore_support(
        res_all_gold, res_all_pred, labels=targets, average=None)
    # unpack the precision, recall, f1 and support
    metrics = {}
    for k, target in enumerate(targets):
        metrics[target] = {
            'precision': prec[k],
            'recall': recall[k],
            'f1-score': f1[k],
            'support': support[k]
        }
    # now micro-averaged
    if remove_anno != 'types':
        prec, recall, f1, s = precision_recall_fscore_support(
            res_all_gold, res_all_pred, labels=targets, average='micro')
        metrics['overall'] = {
            'precision': prec,
            'recall': recall,
            'f1-score': f1,
            'support': sum(support)
        }
    else:
        # just binary classification, nothing to average
        metrics['overall'] = metrics['KEYPHRASE-NOTYPES']
    print_report(metrics, targets)
    return metrics

def print_report(metrics, targets, digits=2):
    def _get_line(results, target, columns):
        line = [target]
        for column in columns[:-1]:
            line.append("{0:0.{1}f}".format(results[column], digits))
        line.append("%s" % results[columns[-1]])
        return line
    columns = ['precision', 'recall', 'f1-score', 'support']
    fmt = '%11s' + '%9s' * 4 + '\n'
    report = [fmt % tuple([''] + columns)]
    report.append('\n')
    for target in targets:
        results = metrics[target]
        line = _get_line(results, target, columns)
        report.append(fmt % tuple(line))
    report.append('\n')
    # overall
    line = _get_line(metrics['overall'], 'avg / total', columns)
    report.append(fmt % tuple(line))
    report.append('\n')
    print(''.join(report))

def normaliseAnnotations(file_anno, remove_anno):
    '''
    Parse annotations from the annotation files: remove relations (if requested), convert rel IDs to entity spans
    :param file_anno:
    :param remove_anno:
    :return:
    '''
    res_full_anno = []
    res_anno = []
    spans_anno = []
    rels_anno = []
    for l in file_anno:
        r_g = l.strip().split("\t")
        r_g_offs = r_g[1].split(" ")
        # remove relation instances if specified
        if remove_anno != "" and r_g_offs[0].endswith("-of"):
            continue
        res_full_anno.append(l.strip())
        # normalise relation instances by looking up entity spans for relation IDs
        if r_g_offs[0].endswith("-of"):
            arg1 = r_g_offs[1].replace("Arg1:", "")
            arg2 = r_g_offs[2].replace("Arg2:", "")
            for l in res_full_anno:
                r_g_tmp = l.strip().split("\t")
                if r_g_tmp[0] == arg1:
                    ent1 = r_g_tmp[1].replace(" ", "_")
                if r_g_tmp[0] == arg2:
                    ent2 = r_g_tmp[1].replace(" ", "_")
            spans_anno.append(" ".join([ent1, ent2]))
            res_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
            rels_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
        else:
            spans_anno.append(" ".join([r_g_offs[1], r_g_offs[2]]))
            keytype = r_g[1]
            if remove_anno == "types":
                keytype = "KEYPHRASE-NOTYPES"
            res_anno.append(keytype)
    for r in rels_anno:
        r_offs = r.split(" ")
        # reorder hyponyms to start with smallest index
        if r_offs[0] == "Synonym-of" and r_offs[2].split("_")[1] < r_offs[1].split("_")[1]:  # 1, 2
            r = " ".join([r_offs[0], r_offs[2], r_offs[1]])
        # Check, in all other hyponym relations, if the synonymous entity with smallest index is used for them.
        # If not, change it so it is.
        if r_offs[0] == "Synonym-of":
            for r2 in rels_anno:
                r2_offs = r2.split(" ")
                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[1]:
                    r_new = " ".join([r2_offs[0], r_offs[2], r2_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new
                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[2]:
                    r_new = " ".join([r2_offs[0], r2_offs[1], r_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new
    rels_anno = list(set(rels_anno))
    res_full_anno_new = []
    res_anno_new = []
    spans_anno_new = []
    for r in res_full_anno:
        r_g = r.strip().split("\t")
        if r_g[0].startswith("R") or r_g[0] == "*":
            continue
        ind = res_full_anno.index(r)
        res_full_anno_new.append(r)
        res_anno_new.append(res_anno[ind])
        spans_anno_new.append(spans_anno[ind])
    for r in rels_anno:
        res_full_anno_new.append("R\t" + r)
        res_anno_new.append(r)
        spans_anno_new.append(" ".join([r.split(" ")[1], r.split(" ")[2]]))
    return res_full_anno_new, res_anno_new, spans_anno_new, rels_anno

if __name__ == '__main__':
    folder_gold = "data/dev/"
    folder_pred = "data_pred/dev/"
    remove_anno = ""  # "", "rel" or "types"
    if len(sys.argv) >= 2:
        folder_gold = sys.argv[1]
    if len(sys.argv) >= 3:
        folder_pred = sys.argv[2]
    if len(sys.argv) == 4:
        remove_anno = sys.argv[3]
    calculateMeasures(folder_gold, folder_pred, remove_anno)
Without having the files myself, I tried with the "gold" file you provided, namely:
T1 Process 5 14 oxidation
T2 Material 69 84 Ti-based alloys
T3 Material 186 192 alloys
T4 Task 264 349 understand the role that composition has on the oxidation behavior of Ti-based alloys
T5 Process 312 321 oxidation
T6 Material 334 349 Ti-based alloys
T7 Material 400 415 Ti-based alloys
T8 Material 445 451 alloys
T9 Process 480 489 oxidation
For the program to run correctly and not raise the 'list index out of range' error in the line of code you mention, it is essential that the first column (the 'T's) and the second column are separated by a tab, and the remaining columns by spaces. A file that is not formatted this way (for example, one with a space instead of a tab between the first two columns) will give that error. Indeed, what really happens in the line
r_g = l.strip('\n').split("\t")
is that the newline first gets removed from the end of the line, and then the line is split by tab. This means the line gets split into two elements, which make up the list r_g. In this case r_g_offs can be computed correctly and will contain a list of all the columns except the first. This is then used later, for example in
spans_anno.append(" ".join([r_g_offs[1], r_g_offs[2]]))
to mention just one place.
Let's look at the case that doesn't work and try to understand why.
If the file .ann (gold) is not formatted in this way:
T1\tProcess (tab between)
but instead is
T1 Process (space)
the code
r_g = l.strip('\n').split("\t")
will produce a list of just one element and not of two, e.g.
r_g = ['T1 Process ...']
In this case, r_g has only one element, r_g[0], so when one tries to access an element that doesn't exist (r_g[1]) via
r_g_offs = r_g[1].split()
one will get an
IndexError: list index out of range
There exists another case in which you could get the aforementioned error.
In the case of an empty line at the end of the file, r_g = [''], which means r_g is a list with only one element. Similar to the previous case, when the script executes the line r_g_offs = r_g[1].split(), it will try to access r_g[1], which doesn't exist since the only element in the list is r_g[0], and you will get the 'list index out of range' error.
The code I can run:
#!/usr/bin/python
# by Mattew Peters, who spotted that sklearn does macro averaging not micro averaging correctly and changed it
import os
from sklearn.metrics import precision_recall_fscore_support
import sys

def calculateMeasures(folder_gold="data/dev/", folder_pred="data_pred/dev/", remove_anno=""):
    '''
    Calculate P, R, F1, Macro F
    :param folder_gold: folder containing gold standard .ann files
    :param folder_pred: folder containing prediction .ann files
    :param remove_anno: if set to "rel", relations will be ignored. Use this setting to only evaluate
    keyphrase boundary recognition and keyphrase classification. If set to "types", only keyphrase boundary recognition is evaluated.
    Note that for the latter, false positive
    :return:
    '''
    flist_gold = os.listdir(folder_gold)
    res_all_gold = []
    res_all_pred = []
    targets = []
    for f in flist_gold:
        # ignoring non-.ann files, should there be any
        if not str(f).endswith(".ann"):
            continue
        f_gold = open(os.path.join(folder_gold, f), "r")
        try:
            f_pred = open(os.path.join(folder_pred, f), "r")
            res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
        except IOError:
            print(f + " file missing in " + folder_pred + ". Assuming no predictions are available for this file.")
            res_full_pred, res_pred, spans_pred, rels_pred = [], [], [], []
        res_full_gold, res_gold, spans_gold, rels_gold = normaliseAnnotations(f_gold, remove_anno)
        spans_all = set(spans_gold + spans_pred)
        for i, r in enumerate(spans_all):
            if r in spans_gold:
                target = res_gold[spans_gold.index(r)].split(" ")[0]
                res_all_gold.append(target)
                if not target in targets:
                    targets.append(target)
            else:
                res_all_gold.append("NONE")
            if r in spans_pred:
                target_pred = res_pred[spans_pred.index(r)].split(" ")[0]
                res_all_pred.append(target_pred)
            else:
                res_all_pred.append("NONE")
    #y_true, y_pred, labels, targets
    prec, recall, f1, support = precision_recall_fscore_support(res_all_gold, res_all_pred, labels=targets, average=None)
    metrics = {}
    for k, target in enumerate(targets):
        metrics[target] = {
            'precision': prec[k],
            'recall': recall[k],
            'f1-score': f1[k],
            'support': support[k]
        }
    # now micro-averaged
    if remove_anno != 'types':
        prec, recall, f1, s = precision_recall_fscore_support(res_all_gold, res_all_pred, labels=targets, average='micro')
        metrics['overall'] = {
            'precision': prec,
            'recall': recall,
            'f1-score': f1,
            'support': sum(support)
        }
    else:
        # just binary classification, nothing to average
        metrics['overall'] = metrics['KEYPHRASE-NOTYPES']
    print_report(metrics, targets)
    return metrics

def print_report(metrics, targets, digits=2):
    def _get_line(results, target, columns):
        line = [target]
        for column in columns[:-1]:
            line.append("{0:0.{1}f}".format(results[column], digits))
        line.append("%s" % results[columns[-1]])
        return line
    columns = ['precision', 'recall', 'f1-score', 'support']
    fmt = '%11s' + '%9s' * 4 + '\n'
    report = [fmt % tuple([''] + columns)]
    report.append('\n')
    for target in targets:
        results = metrics[target]
        line = _get_line(results, target, columns)
        report.append(fmt % tuple(line))
    report.append('\n')
    # overall
    line = _get_line(metrics['overall'], 'avg / total', columns)
    report.append(fmt % tuple(line))
    report.append('\n')
    print(''.join(report))

def normaliseAnnotations(file_anno, remove_anno):
    '''
    Parse annotations from the annotation files: remove relations (if requested), convert rel IDs to entity spans
    :param file_anno:
    :param remove_anno:
    :return:
    '''
    res_full_anno = []
    res_anno = []
    spans_anno = []
    rels_anno = []
    for l in file_anno:
        print(l)
        print(l.strip('\n'))
        r_g = l.strip('\n').split("\t")
        print(r_g)
        print(len(r_g))
        r_g_offs = r_g[1].split()
        print(r_g_offs)
        if remove_anno != "" and r_g_offs[0].endswith("-of"):
            continue
        res_full_anno.append(l.strip())
        if r_g_offs[0].endswith("-of"):
            arg1 = r_g_offs[1].replace("Arg1:", "")
            arg2 = r_g_offs[2].replace("Arg2:", "")
            for l in res_full_anno:
                r_g_tmp = l.strip().split("\t")
                if r_g_tmp[0] == arg1:
                    ent1 = r_g_tmp[1].replace(" ", "_")
                if r_g_tmp[0] == arg2:
                    ent2 = r_g_tmp[1].replace(" ", "_")
            spans_anno.append(" ".join([ent1, ent2]))
            res_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
            rels_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
        else:
            spans_anno.append(" ".join([r_g_offs[1], r_g_offs[2]]))
            keytype = r_g[1]
            if remove_anno == "types":
                keytype = "KEYPHRASE-NOTYPES"
            res_anno.append(keytype)
    for r in rels_anno:
        r_offs = r.split(" ")
        # reorder hyponyms to start with smallest index  # 1, 2
        if r_offs[0] == "Synonym-of" and r_offs[2].split("_")[1] < r_offs[1].split("_")[1]:
            r = " ".join([r_offs[0], r_offs[2], r_offs[1]])
        if r_offs[0] == "Synonym-of":
            for r2 in rels_anno:
                r2_offs = r2.split(" ")
                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[1]:
                    r_new = " ".join([r2_offs[0], r_offs[2], r2_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new
                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[2]:
                    r_new = " ".join([r2_offs[0], r2_offs[1], r_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new
    rels_anno = list(set(rels_anno))
    res_full_anno_new = []
    res_anno_new = []
    spans_anno_new = []
    for r in res_full_anno:
        r_g = r.strip().split("\t")
        if r_g[0].startswith("R") or r_g[0] == "*":
            continue
        ind = res_full_anno.index(r)
        res_full_anno_new.append(r)
        res_anno_new.append(res_anno[ind])
        spans_anno_new.append(spans_anno[ind])
    for r in rels_anno:
        res_full_anno_new.append("R\t" + r)
        res_anno_new.append(r)
        spans_anno_new.append(" ".join([r.split(" ")[1], r.split(" ")[2]]))
    return res_full_anno_new, res_anno_new, spans_anno_new, rels_anno

if __name__ == '__main__':
    folder_gold = "data/dev/"
    folder_pred = "data_pred/dev/"
    remove_anno = ""  # "", "rel" or "types"
    if len(sys.argv) >= 2:
        folder_gold = sys.argv[1]
    if len(sys.argv) >= 3:
        folder_pred = sys.argv[2]
    if len(sys.argv) == 4:
        remove_anno = sys.argv[3]
    calculateMeasures(folder_gold, folder_pred, remove_anno)
From the two cases shown above, we can conclude that the script is very sensitive to how the files are formatted (tab, spaces, and no empty line at the end), so care is needed when producing those files and feeding them to the main script.
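If the input files cannot be guaranteed to be clean, a small guard at the top of the parsing loop avoids the crash; a minimal sketch (a hypothetical helper, not part of the original script):

def iter_clean_annotations(file_anno):
    for l in file_anno:
        r_g = l.strip('\n').split('\t')
        # skip blank lines and lines without a tab between the ID and the annotation
        if len(r_g) < 2 or not r_g[0].strip():
            print('Skipping malformed line: %r' % l)
            continue
        yield r_g, r_g[1].split()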

How to compress hdf5 file when resizing?

Here is my code:
import h5py

n = 100000  # This is what makes it tricky - lots of files going into this hdf5 file
with h5py.File('image1.h5', 'w') as f:
    dset_X = f.create_dataset('X', (1,960,224,224), maxshape=(None,960,224,224), chunks=True, compression='gzip')
    dset_y = f.create_dataset('y', (1,112,224*224), maxshape=(None,112,224*224), chunks=True, compression='gzip')
    n_images = 0
    for fl in files[:n]:
        X_chunk, y_chunk = get_arrays(fl)
        dset_X.resize(n_images+1, axis=0)
        dset_y.resize(n_images+1, axis=0)
        print dset_X.shape, dset_y.shape
        dset_X[n_images:n_images+1,:,:,:] = X_chunk
        dset_y[n_images:n_images+1,:,:] = y_chunk
        n_images += 1
This works fine and dandy. However, with 1 file the size of the hdf5 is 6.7 MB. With 2 files it's 37 MB (shouldn't it be about 12 MB?), and with 10 it's all the way up to 388 MB (shouldn't it be about 67 MB?).
So clearly adding the compression flag to the end of the 2nd and 3rd lines isn't working as intended. How can I achieve something like this?
I ended up doing this successfully using pytables.
def get_arrays(each_file):
    lab = color.rgb2lab(io.imread(each_file))
    X = lab[:,:,:1]
    y = lab[:,:,1:]
    X_rows, X_columns, X_channels = X.shape
    y_rows, y_columns, y_channels = y.shape
    X_channels_first = np.transpose(X, (2,0,1))
    X_sample = np.expand_dims(X_channels_first, axis=0)
    X_3d = np.tile(X_sample, (1,3,1,1))
    X_3d_scaled = X_3d * 255.0/X_3d.max()
    hc = extract_hypercolumn(model, [3,8,15,22], X_3d_scaled)
    hc_scaled = (hc - hc.min())/(hc.max() - hc.min())
    print hc_scaled.max(), hc_scaled.min()
    hc_expand_dims = np.expand_dims(hc_scaled, axis=0)
    y_reshaped = np.reshape(y, (y_rows*y_columns, y_channels))
    classed_pixels_first = KNN.predict_proba(y_reshaped)
    classed_classes_first = np.transpose(classed_pixels_first, (1,0))
    classed_expand_dims = np.expand_dims(classed_classes_first, axis=0)
    print "hypercolumn shape: ", hc_expand_dims.shape, "classified output color shape: ", classed_expand_dims.shape
    return hc_expand_dims, classed_expand_dims

filters = tables.Filters(complevel=5, complib='zlib')
with tables.openFile('raw.h5', 'w') as f:
    # filters = tables.Filters(complib='blosc', complevel=5)
    dset_X = f.create_earray(f.root, 'X', tables.Atom.from_dtype(np.dtype('Float64')), (0,960,224,224), filters=filters)
    dset_y = f.create_earray(f.root, 'y', tables.Atom.from_dtype(np.dtype('Float64')), (0,112,224*224), filters=filters)
    for fl in files[0:12000]:
        X_chunk, y_chunk = get_arrays(fl)
        dset_X.append(X_chunk)
        dset_y.append(y_chunk)
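For comparison, a minimal h5py sketch with the same shapes that sets an explicit one-sample chunk shape alongside gzip, so every appended image maps onto whole chunks (the dtype and file name here are assumptions; files, n and get_arrays are as defined above):

import h5py

with h5py.File('image1_h5py.h5', 'w') as f:
    dset_X = f.create_dataset('X', shape=(0,960,224,224), maxshape=(None,960,224,224),
                              chunks=(1,960,224,224), compression='gzip', dtype='float32')
    n_images = 0
    for fl in files[:n]:
        X_chunk, y_chunk = get_arrays(fl)      # dset_y would be handled the same way
        dset_X.resize(n_images+1, axis=0)
        dset_X[n_images:n_images+1,:,:,:] = X_chunk
        n_images += 1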

How do I raise the memory limit in python?

I have written a Python (2.7) script, but it uses a lot of memory, so I get an out-of-memory error. Is it possible to use more memory?
My code (also on GitHub):
from itertools import combinations
import numpy

# Find the unused members and put them in another group
def findMembers(listIn, listMembers):
    lengthlist2 = (len(listMembers) - len(listIn[0]))
    group2 = [0] * lengthlist2  # making the other groups based on the length of the first group
    for i in listIn:
        wichRow = 0
        for x in listMembers:
            if not (x in i):
                group2[wichRow] = x
                wichRow += 1
    listIn.append(group2)
    return listIn

# you give a list of members and the number of groups
# you get back all the possible combinations
def findCombinations(listMembers, numbersOfGroups):
    groupTemp = []  # list needed to save all the combinations correctly
    group = []      # list needed to keep it simple
    newGroup = []   # list that will be returned
    for listPossibilities in combinations(listMembers, (len(listMembers)/numbersOfGroups)):
        groupTemp.append(list(listPossibilities))
        group.append(groupTemp)  # saving all the possibilities
        groupTemp = []
    for k in group:
        # place the unused members in group2
        k = (findMembers(k, listMembers))
        if numbersOfGroups > 2:
            groupTemp = []
            groupTemp = findCombinations(k[1], numbersOfGroups-1)
            for i in groupTemp:
                listTemp = []
                listTemp.append(k[0])
                listTemp.extend(i)
                newGroup.append(listTemp)
        else:
            newGroup = group
    return newGroup

# Calculate the happiness of the group
def findHappiness(tabel, listIn):
    happiness = 0
    for i in listIn:
        for j in i:
            for k in i:
                happiness += tabel[j][k]
    return happiness

def buildTabel(members):  # build a random survey
    tabel = numpy.random.random((members, members))
    return tabel

def calculateHappiness(group):
    print "Finding all the happiness: "
    maxhappiness = 0
    i = 0
    for x in group:
        happiness = findHappiness(tabel, x)
        if happiness > maxhappiness:
            maxhappiness = happiness
            y = x
        progress = int(round((((i)*1.0/(len(group)))*100.0)))
        update_progress(progress)
        i += 1
    print "\n Best solution: ", y, " with: ", maxhappiness, " happiness"

def update_progress(progress):
    print '\r[{0}] {1}%'.format('#'*(progress/5), progress),

if __name__ == "__main__":
    members = 24  # members of the group
    numbersOfGroups = 3
    tabel = buildTabel(members)  # preferences will be stored here
    listMembers = (range(members))  # members of the group that need to be divided
    print "Searching all the combinations..."
    group = findCombinations(listMembers, numbersOfGroups)  # find all the combinations (recursive)
    print len(group), " combinations"
    calculateHappiness(group)  # calculate the happiest group and print it
the error:
Searching all the combinations...
Traceback (most recent call last):
File "main.py", line 75, in <module>
calculateHappiness(group) #calculate the most happiest group and print
File "main.py", line 38, in findCombinations
newGroup = group
MemoryError
I'm using Windows 10 64-bit with 6 GB of RAM. Is it possible to use virtual RAM or disk space from my hard drive?
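
Not an answer to the virtual-memory part, but the usual way around this MemoryError is to avoid building the full list of groupings at all: a generator yields one grouping at a time, so memory stays flat no matter how many there are. A sketch reusing findHappiness and tabel from the script above (iter_groupings is a new helper):

from itertools import combinations

def iter_groupings(listMembers, numbersOfGroups):
    # yield each split into equally sized, unordered groups exactly once
    size = len(listMembers) // numbersOfGroups
    if numbersOfGroups == 1:
        yield [list(listMembers)]
        return
    first = listMembers[0]  # anchor one member so permuted duplicates are not produced
    rest = [m for m in listMembers if m != first]
    for tail in combinations(rest, size - 1):
        group1 = [first] + list(tail)
        remaining = [m for m in listMembers if m not in group1]
        for sub in iter_groupings(remaining, numbersOfGroups - 1):
            yield [group1] + sub

best = max(iter_groupings(list(range(24)), 3), key=lambda g: findHappiness(tabel, g))
# memory stays flat; for 24 members in 3 groups this still enumerates ~1.6 billion candidates,
# so it is slow, but it no longer dies with a MemoryError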
