I'm new to this package. I've been studying the code from https://github.com/diningphil/graph-mixture-density-networks (with some minor modifications). In the notebook SIR Simulation with DGL_ERDOS-RENYI.ipynb I ran into something odd during the simulation process:
If I set debug = True, meaning I don't use pool = concurrent.futures.ProcessPoolExecutor(max_workers=processes) but just run the simulations one by one, both the .json files and the .bin files are created according to the json_filepath variable.
However, when I delete the output and rerun with debug = False, so that (if my understanding is correct) the simulations run concurrently, the .json files are not created and the program seems to stop at the step graph.to(torch.device(device)): none of my print statements after that point are executed. Only the .bin files are created.
Could anyone tell me the possible reason, or what I should do about it? Thanks a lot!
'''
run simulation and store
1) state of all nodes at each time step
into a single pandas dataframe for all beta, gamma and repetitions
2) R_0
3) number of total people infected (total - susceptible at the end of the iteration)
'''
seed = 38
torch.manual_seed(seed)
device = 'cuda'
beta_range = [0, 1]
gamma_range = [0.1, 1]
iterations = 5
no_graph_samples = 20
no_realizations = 100
family_name = 'erdos_renyi'
folder = Path(f'{family_name}')
if not os.path.exists(folder):
    os.makedirs(folder)

def simulate(p, graph_size, graph_sample, graphs_folder):
    json_filepath = str(Path(graphs_folder, f'data_{graph_sample}.json'))
    graph_filename = graphs_folder / Path(f'sample{graph_sample}.bin')

    json_data = {'family': family_name,
                 'p': p,
                 'graph_size': graph_size,
                 'no_graph_samples': no_graph_samples,
                 'graph_samples': []
                 }

    sample = {'graph_filename': str(graph_filename),
              'simulations': []}

    if not os.path.exists(graph_filename):
        graph = create_erdos_renyi_graph(graph_size, p)
        save_graphs(str(graph_filename), graph)
    else:
        graph = load_graphs(str(graph_filename))[0][0]

    #print('test')
    graph.to(torch.device(device))
    ## every code above this line will run, at least print() will work

    if not os.path.exists(json_filepath):
        print('test: json_does not exit')
        for realizations in range(no_realizations):
            beta = float(torch.FloatTensor(1).uniform_(beta_range[0], beta_range[1]))
            gamma = float(torch.FloatTensor(1).uniform_(gamma_range[0], gamma_range[1]))
            R0 = beta/gamma

            graph.register_message_func(lambda x: SIR_message_func(beta, x))
            graph.register_reduce_func(lambda x: SIR_reduce_func(gamma, x))

            for initial_probability_of_infection in [0.01, 0.05, 0.1]:
                simulation = {'beta': beta, 'gamma': gamma, 'R0': R0, 'init_infection_prob': initial_probability_of_infection}
                S, I, R, first_infected = simulate_SIR(graph, initial_probability_of_infection, iterations)
                simulation['S'] = S
                simulation['I'] = I
                simulation['R'] = R
                simulation['first_infected'] = first_infected
                simulation['total_infected'] = graph_size - S[-1]
                sample['simulations'].append(deepcopy(simulation))
                #print("Realization ", realizations, "produced ", graph_size - S[-1], "infected")

        json_data['graph_samples'].append(sample)

        with open(json_filepath, 'w') as f:
            line = json.dumps(json_data)
            f.write(line + '\n')
            #json.dump(json_data, f)
            print('dumped')
    else:
        print('test: there is json')
        print(sample)
        # with open(json_filepath, 'r') as f:
        #     json.load(f)
        #     print('loaded but why')

debug = False
processes = 100
import concurrent.futures
pool = concurrent.futures.ProcessPoolExecutor(max_workers=processes)

#for graph_size in [10, 50, 100, 200, 500, 1000]:
for graph_size in [10]:
    for p in [0.01, 0.05]:
    #for p in [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]:
        graphs_folder = folder / Path(f'graphs_size{graph_size}_p{float(p)}')

        #store each graph in a different folder (create path based on graph size, prob of edge and graph sample)
        if not os.path.exists(graphs_folder):
            os.makedirs(graphs_folder)

        for graph_sample in range(no_graph_samples):
            if not debug:
                pool.submit(simulate, p, graph_size, graph_sample, graphs_folder)
            else:  # DEBUG
                simulate(p, graph_size, graph_sample, graphs_folder)

pool.shutdown()  # wait the batch of configs to terminate
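One thing worth checking in a setup like this: ProcessPoolExecutor swallows exceptions raised inside worker processes unless the returned futures are inspected, so a crash at graph.to(torch.device(device)) (for example a CUDA initialisation failure in a forked subprocess) would fail silently. Below is a minimal sketch of the driver loop that keeps the futures and calls result() so any worker error is re-raised in the main process; it mirrors the loop above, and the use of as_completed is my addition, not part of the notebook:

import concurrent.futures

futures = []
with concurrent.futures.ProcessPoolExecutor(max_workers=processes) as pool:
    for graph_size in [10]:
        for p in [0.01, 0.05]:
            graphs_folder = folder / Path(f'graphs_size{graph_size}_p{float(p)}')
            os.makedirs(graphs_folder, exist_ok=True)
            for graph_sample in range(no_graph_samples):
                futures.append(pool.submit(simulate, p, graph_size, graph_sample, graphs_folder))

    for future in concurrent.futures.as_completed(futures):
        future.result()  # re-raises any exception that happened inside the worker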
Related
I want to do a grid search on time series data. Is there a function that can search through the values I have listed in "lgb_params", for example?
lgb_params = {
    "learning_rate": [0.001, 0.01, 0.1, 0.2],
    "max_depth": [3, 5, 7, 9],
    "num_leaves": [5, 10, 15],
    "num_boost_round": 10000,
    "early_stopping_rounds": 300,
    "feature_fraction": [0.2, 0.3, 0.5, 0.7, 0.8],
    "verbose": 0
}

lgbtrain = lgb.Dataset(data=X_train, label=y_train, feature_name=cols)
lgbval = lgb.Dataset(data=X_val, label=y_val, reference=lgbtrain, feature_name=cols)

model = lgb.train(lgb_params, lgbtrain,
                  valid_sets=[lgbtrain, lgbval],
                  num_boost_round=lgb_params['num_boost_round'],
                  early_stopping_rounds=lgb_params['early_stopping_rounds'],
                  feval=lgbm_smape,
                  verbose_eval=100)
The code above of course does not work, since lgb_params contains keys with more than one value (e.g., learning_rate, max_depth, etc.). Those are exactly the parameters I want to search over, and that's where the problem is.
I think I came up with a solution. It is currently running and hasn't finished yet, since it searches through a lot of values, but here's the function I wrote in case anyone needs it:
def param_search(lgb_param_dict):
    min_error = float("inf")
    best_params = dict()
    best_iter = float("inf")

    for i in range(len(lgb_param_dict["learning_rate"])):
        lgb_params = dict()
        lgb_params["learning_rate"] = lgb_param_dict["learning_rate"][i]
        for j in range(len(lgb_param_dict["max_depth"])):
            lgb_params["max_depth"] = lgb_param_dict["max_depth"][j]
            for k in range(len(lgb_param_dict["num_leaves"])):
                lgb_params["num_leaves"] = lgb_param_dict["num_leaves"][k]
                for s in range(len(lgb_param_dict["feature_fraction"])):
                    lgb_params["feature_fraction"] = lgb_param_dict["feature_fraction"][s]
                    print(" ")
                    print("##########")
                    print("Learning_rate = " + str(lgb_params["learning_rate"]))
                    print("max_depth = " + str(lgb_params["max_depth"]))
                    print("num_leaves = " + str(lgb_params["num_leaves"]))
                    print("feature_fraction = " + str(lgb_params["feature_fraction"]))

                    model = lgb.train(lgb_params, lgbtrain,
                                      valid_sets=[lgbtrain, lgbval],
                                      num_boost_round=lgb_full_params["num_boost_round"],
                                      early_stopping_rounds=lgb_full_params["early_stopping_rounds"],
                                      feval=lgbm_smape,
                                      verbose_eval=500)

                    print("Learning_rate = " + str(lgb_params["learning_rate"]))
                    print("max_depth = " + str(lgb_params["max_depth"]))
                    print("num_leaves = " + str(lgb_params["num_leaves"]))
                    print("feature_fraction = " + str(lgb_params["feature_fraction"]))

                    if min_error > dict(model.best_score["valid_1"])["SMAPE"]:
                        min_error = dict(model.best_score["valid_1"])["SMAPE"]
                        best_params = model.params
                        best_iter = model.best_iteration
                    else:
                        continue

    return min_error, best_params, best_iter
The print statements are just for readability. There is probably a better way to write this function, but I'll accept it as an answer if it finishes without any problems.
Edit: It worked!
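For reference, the same search can be written more compactly with itertools.product instead of four nested loops. This is only a sketch that assumes lgbtrain, lgbval and lgbm_smape are defined as above and that the installed LightGBM version accepts early_stopping_rounds in lgb.train, as the original code does:

import itertools
import lightgbm as lgb

def param_search_product(lgb_param_dict):
    """Grid search over the list-valued keys using itertools.product (sketch)."""
    search_keys = ["learning_rate", "max_depth", "num_leaves", "feature_fraction"]
    best_error, best_params, best_iter = float("inf"), None, None

    for values in itertools.product(*(lgb_param_dict[k] for k in search_keys)):
        params = dict(zip(search_keys, values))
        model = lgb.train(params, lgbtrain,
                          valid_sets=[lgbtrain, lgbval],
                          num_boost_round=lgb_param_dict["num_boost_round"],
                          early_stopping_rounds=lgb_param_dict["early_stopping_rounds"],
                          feval=lgbm_smape,
                          verbose_eval=500)
        error = dict(model.best_score["valid_1"])["SMAPE"]
        if error < best_error:
            best_error, best_params, best_iter = error, model.params, model.best_iteration

    return best_error, best_params, best_iter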
I have a program that compares files line by line and calculates precision by reading two folders, a "gold" folder and a "prediction" folder.
The extracted files look like this:
T1 Task 5 19 nonlinear wave
T2 Task 5 29 nonlinear wave equations
T3 Task 15 29 wave equations
T4 Task 86 111 general analytical method
T5 Task 94 111 analytical method
T6 Task 199 213 minimum stages
T7 Task 268 287 efficient technique
T8 Task 268 298 efficient technique relatingto
and the gold files like this:
T1 Process 5 14 oxidation
T2 Material 69 84 Ti-based alloys
T3 Material 186 192 alloys
T4 Task 264 349 understand the role that composition has on the oxidation behavior of Ti-based alloys
T5 Process 312 321 oxidation
T6 Material 334 349 Ti-based alloys
T7 Material 400 415 Ti-based alloys
T8 Material 445 451 alloys
T9 Process 480 489 oxidation
The problem is that this code generates this error:
Traceback (most recent call last):
File "C:\Users\chedi\Downloads\Semeval\eval.py", line 214, in <module>
calculateMeasures(folder_gold, folder_pred, remove_anno)
File "C:\Users\chedi\Downloads\Semeval\eval.py", line 31, in calculateMeasures
res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
File "C:\Users\chedi\Downloads\Semeval\eval.py", line 130, in normaliseAnnotations
r_g_offs = r_g[1].split(" ")
IndexError: list index out of range
The error is at line 130 and seems related to the format of the extracted files, but they appear to be in the same format: first and second columns separated by a tab, the offsets separated by spaces.
#!/usr/bin/python
# by Mattew Peters, who spotted that sklearn does macro averaging not micro averaging correctly and changed it

import os
from sklearn.metrics import precision_recall_fscore_support
import sys


def calculateMeasures(folder_gold="data/dev/", folder_pred="data_pred/dev/", remove_anno = ""):
    '''
    Calculate P, R, F1, Macro F
    :param folder_gold: folder containing gold standard .ann files
    :param folder_pred: folder containing prediction .ann files
    :param remove_anno: if set if "rel", relations will be ignored. Use this setting to only evaluate
    keyphrase boundary recognition and keyphrase classification. If set to "types", only keyphrase boundary recognition is evaluated.
    Note that for the later, false positive
    :return:
    '''

    flist_gold = os.listdir(folder_gold)
    res_all_gold = []
    res_all_pred = []
    targets = []

    for f in flist_gold:
        # ignoring non-.ann files, should there be any
        if not str(f).endswith(".ann"):
            continue
        f_gold = open(os.path.join(folder_gold, f), "r")
        try:
            f_pred = open(os.path.join(folder_pred, f), "r")
            res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
        except IOError:
            print(f + " file missing in " + folder_pred + ". Assuming no predictions are available for this file.")
            res_full_pred, res_pred, spans_pred, rels_pred = [], [], [], []

        res_full_gold, res_gold, spans_gold, rels_gold = normaliseAnnotations(f_gold, remove_anno)

        spans_all = set(spans_gold + spans_pred)

        for i, r in enumerate(spans_all):
            if r in spans_gold:
                target = res_gold[spans_gold.index(r)].split(" ")[0]
                res_all_gold.append(target)
                if not target in targets:
                    targets.append(target)
            else:
                # those are the false positives, contained in pred but not gold
                res_all_gold.append("NONE")

            if r in spans_pred:
                target_pred = res_pred[spans_pred.index(r)].split(" ")[0]
                res_all_pred.append(target_pred)
            else:
                # those are the false negatives, contained in gold but not pred
                res_all_pred.append("NONE")

    #y_true, y_pred, labels, targets
    prec, recall, f1, support = precision_recall_fscore_support(
        res_all_gold, res_all_pred, labels=targets, average=None)

    # unpack the precision, recall, f1 and support
    metrics = {}
    for k, target in enumerate(targets):
        metrics[target] = {
            'precision': prec[k],
            'recall': recall[k],
            'f1-score': f1[k],
            'support': support[k]
        }

    # now micro-averaged
    if remove_anno != 'types':
        prec, recall, f1, s = precision_recall_fscore_support(
            res_all_gold, res_all_pred, labels=targets, average='micro')
        metrics['overall'] = {
            'precision': prec,
            'recall': recall,
            'f1-score': f1,
            'support': sum(support)
        }
    else:
        # just binary classification, nothing to average
        metrics['overall'] = metrics['KEYPHRASE-NOTYPES']

    print_report(metrics, targets)
    return metrics


def print_report(metrics, targets, digits=2):
    def _get_line(results, target, columns):
        line = [target]
        for column in columns[:-1]:
            line.append("{0:0.{1}f}".format(results[column], digits))
        line.append("%s" % results[columns[-1]])
        return line

    columns = ['precision', 'recall', 'f1-score', 'support']

    fmt = '%11s' + '%9s' * 4 + '\n'
    report = [fmt % tuple([''] + columns)]
    report.append('\n')
    for target in targets:
        results = metrics[target]
        line = _get_line(results, target, columns)
        report.append(fmt % tuple(line))
    report.append('\n')

    # overall
    line = _get_line(metrics['overall'], 'avg / total', columns)
    report.append(fmt % tuple(line))
    report.append('\n')

    print(''.join(report))


def normaliseAnnotations(file_anno, remove_anno):
    '''
    Parse annotations from the annotation files: remove relations (if requested), convert rel IDs to entity spans
    :param file_anno:
    :param remove_anno:
    :return:
    '''
    res_full_anno = []
    res_anno = []
    spans_anno = []
    rels_anno = []

    for l in file_anno:
        r_g = l.strip().split("\t")
        r_g_offs = r_g[1].split(" ")

        # remove relation instances if specified
        if remove_anno != "" and r_g_offs[0].endswith("-of"):
            continue

        res_full_anno.append(l.strip())
        # normalise relation instances by looking up entity spans for relation IDs
        if r_g_offs[0].endswith("-of"):
            arg1 = r_g_offs[1].replace("Arg1:", "")
            arg2 = r_g_offs[2].replace("Arg2:", "")
            for l in res_full_anno:
                r_g_tmp = l.strip().split("\t")
                if r_g_tmp[0] == arg1:
                    ent1 = r_g_tmp[1].replace(" ", "_")
                if r_g_tmp[0] == arg2:
                    ent2 = r_g_tmp[1].replace(" ", "_")

            spans_anno.append(" ".join([ent1, ent2]))
            res_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
            rels_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
        else:
            spans_anno.append(" ".join([r_g_offs[1], r_g_offs[2]]))
            keytype = r_g[1]
            if remove_anno == "types":
                keytype = "KEYPHRASE-NOTYPES"
            res_anno.append(keytype)

    for r in rels_anno:
        r_offs = r.split(" ")
        # reorder hyponyms to start with smallest index
        if r_offs[0] == "Synonym-of" and r_offs[2].split("_")[1] < r_offs[1].split("_")[1]:  # 1, 2
            r = " ".join([r_offs[0], r_offs[2], r_offs[1]])

        # Check, in all other hyponym relations, if the synonymous entity with smallest index is used for them.
        # If not, change it so it is.
        if r_offs[0] == "Synonym-of":
            for r2 in rels_anno:
                r2_offs = r2.split(" ")
                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[1]:
                    r_new = " ".join([r2_offs[0], r_offs[2], r2_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new

                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[2]:
                    r_new = " ".join([r2_offs[0], r2_offs[1], r_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new

    rels_anno = list(set(rels_anno))

    res_full_anno_new = []
    res_anno_new = []
    spans_anno_new = []
    for r in res_full_anno:
        r_g = r.strip().split("\t")
        if r_g[0].startswith("R") or r_g[0] == "*":
            continue
        ind = res_full_anno.index(r)
        res_full_anno_new.append(r)
        res_anno_new.append(res_anno[ind])
        spans_anno_new.append(spans_anno[ind])

    for r in rels_anno:
        res_full_anno_new.append("R\t" + r)
        res_anno_new.append(r)
        spans_anno_new.append(" ".join([r.split(" ")[1], r.split(" ")[2]]))

    return res_full_anno_new, res_anno_new, spans_anno_new, rels_anno


if __name__ == '__main__':
    folder_gold = "data/dev/"
    folder_pred = "data_pred/dev/"
    remove_anno = ""  # "", "rel" or "types"

    if len(sys.argv) >= 2:
        folder_gold = sys.argv[1]
    if len(sys.argv) >= 3:
        folder_pred = sys.argv[2]
    if len(sys.argv) == 4:
        remove_anno = sys.argv[3]

    calculateMeasures(folder_gold, folder_pred, remove_anno)
Without having files of my own, I tried with the "gold" file you provided, namely:
T1 Process 5 14 oxidation
T2 Material 69 84 Ti-based alloys
T3 Material 186 192 alloys
T4 Task 264 349 understand the role that composition has on the oxidation behavior of Ti-based alloys
T5 Process 312 321 oxidation
T6 Material 334 349 Ti-based alloys
T7 Material 400 415 Ti-based alloys
T8 Material 445 451 alloys
T9 Process 480 489 oxidation
For the program to run correctly and not raise the 'list index out of range' error in the line of code you mention, it is essential that the first column (the 'T's) and the second column are separated by a tab, and that the remaining columns are separated by spaces. A file that is not formatted this way (for example, one with a space instead of a tab between the first two columns) will produce that error. What actually happens in the line
r_g = l.strip('\n').split("\t")
is that the newline is first removed from the end of the line, and then the line is split on tabs. The line is thus split into two elements, which make up the list r_g. In that case r_g_offs can be computed correctly and will contain all the columns except the first, which is then used later, for example, in
spans_anno.append(" ".join([r_g_offs[1], r_g_offs[2]]))
to mention just one place.
Now let's look at the case that doesn't work and try to understand why.
If the .ann (gold) file is not formatted like this:
T1\tProcess (tab between the columns)
but instead like this:
T1 Process (space)
then the line
r_g = l.strip('\n').split("\t")
will produce a list with just one element rather than two, e.g.
r_g = ['T1 Process ...']
In this case r_g has only one element, r_g[0], so when the code tries to access an element that doesn't exist (r_g[1]) via
r_g_offs = r_g[1].split()
it raises
IndexError: list index out of range
There is another case in which you can get the same error: an empty line at the end of the file. Then r_g = [''], a list with only one element. As in the previous case, when the script executes r_g_offs = r_g[1].split(), it tries to access r_g[1], which doesn't exist (the only element is r_g[0]), and you get the 'list index out of range' error.
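A quick interactive check makes the difference visible (a minimal illustration, not part of the evaluation script):

good = "T1\tProcess 5 14 oxidation"   # tab between the ID and the rest
bad = "T1 Process 5 14 oxidation"     # space everywhere

print(good.strip('\n').split("\t"))   # ['T1', 'Process 5 14 oxidation'] -> r_g[1] exists
print(bad.strip('\n').split("\t"))    # ['T1 Process 5 14 oxidation']    -> r_g[1] raises IndexError
print("".strip('\n').split("\t"))     # ['']                             -> same problem for an empty line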
The code I can run:
#!/usr/bin/python
# by Mattew Peters, who spotted that sklearn does macro averaging not
# micro averaging correctly and changed it

import os
from sklearn.metrics import precision_recall_fscore_support
import sys


def calculateMeasures(folder_gold="data/dev/", folder_pred="data_pred/dev/", remove_anno=""):
    '''
    Calculate P, R, F1, Macro F
    :param folder_gold: folder containing gold standard .ann files
    :param folder_pred: folder containing prediction .ann files
    :param remove_anno: if set if "rel", relations will be ignored. Use this setting to only evaluate
    keyphrase boundary recognition and keyphrase classification. If set to "types", only keyphrase boundary recognition is evaluated.
    Note that for the later, false positive
    :return:
    '''

    flist_gold = os.listdir(folder_gold)
    res_all_gold = []
    res_all_pred = []
    targets = []

    for f in flist_gold:
        # ignoring non-.ann files, should there be any
        if not str(f).endswith(".ann"):
            continue
        f_gold = open(os.path.join(folder_gold, f), "r")
        try:
            f_pred = open(os.path.join(folder_pred, f), "r")
            res_full_pred, res_pred, spans_pred, rels_pred = normaliseAnnotations(f_pred, remove_anno)
        except IOError:
            print(f + " file missing in " + folder_pred + ". Assuming no predictions are available for this file.")
            res_full_pred, res_pred, spans_pred, rels_pred = [], [], [], []

        res_full_gold, res_gold, spans_gold, rels_gold = normaliseAnnotations(f_gold, remove_anno)

        spans_all = set(spans_gold + spans_pred)

        for i, r in enumerate(spans_all):
            if r in spans_gold:
                target = res_gold[spans_gold.index(r)].split(" ")[0]
                res_all_gold.append(target)
                if not target in targets:
                    targets.append(target)
            else:
                res_all_gold.append("NONE")

            if r in spans_pred:
                target_pred = res_pred[spans_pred.index(r)].split(" ")[0]
                res_all_pred.append(target_pred)
            else:
                res_all_pred.append("NONE")

    #y_true, y_pred, labels, targets
    prec, recall, f1, support = precision_recall_fscore_support(res_all_gold, res_all_pred, labels=targets, average=None)

    metrics = {}
    for k, target in enumerate(targets):
        metrics[target] = {
            'precision': prec[k],
            'recall': recall[k],
            'f1-score': f1[k],
            'support': support[k]
        }

    # now micro-averaged
    if remove_anno != 'types':
        prec, recall, f1, s = precision_recall_fscore_support(res_all_gold, res_all_pred, labels=targets, average='micro')
        metrics['overall'] = {
            'precision': prec,
            'recall': recall,
            'f1-score': f1,
            'support': sum(support)
        }
    else:
        # just binary classification, nothing to average
        metrics['overall'] = metrics['KEYPHRASE-NOTYPES']

    print_report(metrics, targets)
    return metrics


def print_report(metrics, targets, digits=2):
    def _get_line(results, target, columns):
        line = [target]
        for column in columns[:-1]:
            line.append("{0:0.{1}f}".format(results[column], digits))
        line.append("%s" % results[columns[-1]])
        return line

    columns = ['precision', 'recall', 'f1-score', 'support']

    fmt = '%11s' + '%9s' * 4 + '\n'
    report = [fmt % tuple([''] + columns)]
    report.append('\n')
    for target in targets:
        results = metrics[target]
        line = _get_line(results, target, columns)
        report.append(fmt % tuple(line))
    report.append('\n')

    # overall
    line = _get_line(metrics['overall'], 'avg / total', columns)
    report.append(fmt % tuple(line))
    report.append('\n')

    print(''.join(report))


def normaliseAnnotations(file_anno, remove_anno):
    '''
    Parse annotations from the annotation files: remove relations (if requested), convert rel IDs to entity spans
    :param file_anno:
    :param remove_anno:
    :return:
    '''
    res_full_anno = []
    res_anno = []
    spans_anno = []
    rels_anno = []

    for l in file_anno:
        print(l)
        print(l.strip('\n'))
        r_g = l.strip('\n').split("\t")
        print(r_g)
        print(len(r_g))
        r_g_offs = r_g[1].split()
        print(r_g_offs)

        if remove_anno != "" and r_g_offs[0].endswith("-of"):
            continue

        res_full_anno.append(l.strip())
        if r_g_offs[0].endswith("-of"):
            arg1 = r_g_offs[1].replace("Arg1:", "")
            arg2 = r_g_offs[2].replace("Arg2:", "")
            for l in res_full_anno:
                r_g_tmp = l.strip().split("\t")
                if r_g_tmp[0] == arg1:
                    ent1 = r_g_tmp[1].replace(" ", "_")
                if r_g_tmp[0] == arg2:
                    ent2 = r_g_tmp[1].replace(" ", "_")

            spans_anno.append(" ".join([ent1, ent2]))
            res_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
            rels_anno.append(" ".join([r_g_offs[0], ent1, ent2]))
        else:
            spans_anno.append(" ".join([r_g_offs[1], r_g_offs[2]]))
            keytype = r_g[1]
            if remove_anno == "types":
                keytype = "KEYPHRASE-NOTYPES"
            res_anno.append(keytype)

    for r in rels_anno:
        r_offs = r.split(" ")
        # reorder hyponyms to start with smallest index
        if r_offs[0] == "Synonym-of" and r_offs[2].split("_")[1] < r_offs[1].split("_")[1]:  # 1, 2
            r = " ".join([r_offs[0], r_offs[2], r_offs[1]])

        if r_offs[0] == "Synonym-of":
            for r2 in rels_anno:
                r2_offs = r2.split(" ")
                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[1]:
                    r_new = " ".join([r2_offs[0], r_offs[2], r2_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new

                if r2_offs[0] == "Hyponym-of" and r_offs[1] == r2_offs[2]:
                    r_new = " ".join([r2_offs[0], r2_offs[1], r_offs[2]])
                    rels_anno[rels_anno.index(r2)] = r_new

    rels_anno = list(set(rels_anno))

    res_full_anno_new = []
    res_anno_new = []
    spans_anno_new = []
    for r in res_full_anno:
        r_g = r.strip().split("\t")
        if r_g[0].startswith("R") or r_g[0] == "*":
            continue
        ind = res_full_anno.index(r)
        res_full_anno_new.append(r)
        res_anno_new.append(res_anno[ind])
        spans_anno_new.append(spans_anno[ind])

    for r in rels_anno:
        res_full_anno_new.append("R\t" + r)
        res_anno_new.append(r)
        spans_anno_new.append(" ".join([r.split(" ")[1], r.split(" ")[2]]))

    return res_full_anno_new, res_anno_new, spans_anno_new, rels_anno


if __name__ == '__main__':
    folder_gold = "data/dev/"
    folder_pred = "data_pred/dev/"
    remove_anno = ""  # "", "rel" or "types"

    if len(sys.argv) >= 2:
        folder_gold = sys.argv[1]
    if len(sys.argv) >= 3:
        folder_pred = sys.argv[2]
    if len(sys.argv) == 4:
        remove_anno = sys.argv[3]

    calculateMeasures(folder_gold, folder_pred, remove_anno)
From the two cases shown above, we can conclude that the script is very sensitive to how the files are formatted (tabs, spaces, and no empty line at the end of the file), so care is needed when producing those files and feeding them to the main script.
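If you would rather have the script tolerate such lines instead of crashing, one option is to skip anything that does not split into at least two tab-separated fields. This is only a sketch of the idea, not part of the original evaluation script:

def safe_split(line):
    """Return (annotation_id, offset_fields) or None for lines the script cannot parse."""
    r_g = line.strip('\n').split("\t")
    if len(r_g) < 2 or not r_g[0]:
        return None                # empty line or missing tab between the first two columns
    return r_g[0], r_g[1].split()

# inside normaliseAnnotations, the first lines of the loop would become:
# parsed = safe_split(l)
# if parsed is None:
#     continue
# anno_id, r_g_offs = parsed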
Here is my code:
n = 100000 #This is what makes it tricky - lots of files going into this hdf5 file
with h5py.File('image1.h5','w') as f:
    dset_X = f.create_dataset('X',(1,960,224,224),maxshape=(None,960,224,224),chunks=True,compression='gzip')
    dset_y = f.create_dataset('y',(1,112,224*224),maxshape=(None,112,224*224),chunks=True,compression='gzip')
    n_images = 0
    for fl in files[:n]:
        X_chunk,y_chunk = get_arrays(fl)
        dset_X.resize(n_images+1,axis=0)
        dset_y.resize(n_images+1,axis=0)
        print dset_X.shape,dset_y.shape
        dset_X[n_images:n_images+1,:,:,:]=X_chunk
        dset_y[n_images:n_images+1,:,:]=y_chunk
        n_images+=1
This works fine and dandy. However, with 1 file the HDF5 file is 6.7 MB. With 2 files it is 37 MB (shouldn't it be about 12 MB?). With 10 it is all the way up to 388 MB (shouldn't it be about 67 MB?).
So clearly adding the compression flag to the end of the 2nd and 3rd lines isn't working as intended. How can I achieve something like this?
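Before switching libraries it may be worth checking which chunk shape h5py chose, since gzip is applied per chunk and chunks=True leaves the choice to h5py. The following is only a diagnostic sketch, and the explicit chunk shapes shown are illustrative assumptions rather than a verified fix for the size blow-up:

import h5py

# inspect what was actually written
with h5py.File('image1.h5', 'r') as f:
    for name in ('X', 'y'):
        dset = f[name]
        print(name, 'chunks:', dset.chunks, 'compression:', dset.compression,
              'on-disk bytes:', dset.id.get_storage_size())

# variant with explicit, image-sized chunks (illustrative values)
with h5py.File('image1_chunked.h5', 'w') as f:
    dset_X = f.create_dataset('X', shape=(1, 960, 224, 224), maxshape=(None, 960, 224, 224),
                              chunks=(1, 1, 224, 224), compression='gzip')
    dset_y = f.create_dataset('y', shape=(1, 112, 224 * 224), maxshape=(None, 112, 224 * 224),
                              chunks=(1, 1, 224 * 224), compression='gzip')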
I ended up doing this successfully using pytables.
def get_arrays(each_file):
    lab = color.rgb2lab(io.imread(each_file))
    X = lab[:,:,:1]
    y = lab[:,:,1:]
    X_rows,X_columns,X_channels=X.shape
    y_rows,y_columns,y_channels=y.shape
    X_channels_first = np.transpose(X,(2,0,1))
    X_sample = np.expand_dims(X_channels_first,axis=0)
    X_3d = np.tile(X_sample,(1,3,1,1))
    X_3d_scaled = X_3d * 255.0/X_3d.max()
    hc = extract_hypercolumn(model,[3,8,15,22],X_3d_scaled)
    hc_scaled = (hc -hc.min())/(hc.max()-hc.min())
    print hc_scaled.max(),hc_scaled.min()
    hc_expand_dims = np.expand_dims(hc_scaled,axis=0)
    y_reshaped = np.reshape(y,(y_rows*y_columns,y_channels))
    classed_pixels_first = KNN.predict_proba(y_reshaped)
    classed_classes_first = np.transpose(classed_pixels_first,(1,0))
    classed_expand_dims = np.expand_dims(classed_classes_first,axis=0)
    print "hypercolumn shape: ",hc_expand_dims.shape,"classified output color shape: ",classed_expand_dims.shape
    return hc_expand_dims,classed_expand_dims

filters = tables.Filters(complevel=5, complib='zlib')

with tables.openFile('raw.h5','w') as f:
    # filters = tables.Filters(complib='blosc', complevel=5)
    dset_X = f.create_earray(f.root, 'X', tables.Atom.from_dtype(np.dtype('Float64')), (0,960,224,224), filters=filters)
    dset_y = f.create_earray(f.root, 'y', tables.Atom.from_dtype(np.dtype('Float64')), (0,112,224*224), filters=filters)
    for fl in files[0:12000]:
        X_chunk,y_chunk=get_arrays(fl)
        dset_X.append(X_chunk)
        dset_y.append(y_chunk)
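As a side note, the listing above mixes the old camelCase PyTables name (tables.openFile) with the newer snake_case one (create_earray); on PyTables 3.x the same setup would be written entirely with the snake_case API. A sketch, assuming the same shapes and Float64 atoms:

import tables

filters = tables.Filters(complevel=5, complib='zlib')

with tables.open_file('raw.h5', 'w') as f:
    dset_X = f.create_earray(f.root, 'X', tables.Float64Atom(), (0, 960, 224, 224), filters=filters)
    dset_y = f.create_earray(f.root, 'y', tables.Float64Atom(), (0, 112, 224 * 224), filters=filters)
    # then append X_chunk / y_chunk in the loop exactly as above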
I am using the Parallel Python module (pp). I have a function that returns an array, but when I print the variable that holds the result of the parallelized function, it prints "pp._Task object at 0x04696510" instead of the values of the matrix.
Here is the code:
from __future__ import print_function
import scipy, pylab
from scipy.io.wavfile import read
import sys
import peakpicker as pea
import pp
import fingerprint as fhash
import matplotlib
import numpy as np
import tdft
import subprocess
import time

if __name__ == '__main__':
    start=time.time()

    #Peak picking dimensions
    f_dim1 = 30
    t_dim1 = 80
    f_dim2 = 10
    t_dim2 = 20
    percentile = 80
    base = 100 # lowest frequency bin used (peaks below are too common/not as useful for identification)
    high_peak_threshold = 75
    low_peak_threshold = 60

    #TDFT parameters
    windowsize = 0.008 #set the window size (0.008s = 64 samples)
    windowshift = 0.004 #set the window shift (0.004s = 32 samples)
    fftsize = 1024 #set the fft size (if srate = 8000, 1024 --> 513 freq. bins separated by 7.797 Hz from 0 to 4000Hz)

    #Hash parameters
    delay_time = 250 # 250*0.004 = 1 second#200
    delta_time = 250*3 # 750*0.004 = 3 seconds#300
    delta_freq = 128 # 128*7.797Hz = approx 1000Hz#80

    #Time pair parameters
    TPdelta_freq = 4
    TPdelta_time = 2

    #Loading the stored data
    database=np.loadtxt('database.dat')
    songnames=np.loadtxt('songnames.dat', dtype=str, delimiter='\t')
    separator = '.'

    print('Please enter an audio sample file to identify: ')
    userinput = raw_input('---> ')
    subprocess.call(['ffmpeg','-y','-i',userinput, '-ac', '1','-ar', '8k', 'filesample.wav'])
    sample = read('filesample.wav')
    userinput = userinput.split(separator,1)[0]

    print('Analyzing the audio sample: '+str(userinput))
    srate = sample[0] #sample rate in samples/second
    audio = sample[1] #audio data
    spectrogram = tdft.tdft(audio, srate, windowsize, windowshift, fftsize)
    mytime = spectrogram.shape[0]
    freq = spectrogram.shape[1]
    print('The size of the spectrogram is time: '+str(mytime)+' and freq: '+str(freq))

    threshold = pea.find_thres(spectrogram, percentile, base)
    peaks = pea.peak_pick(spectrogram,f_dim1,t_dim1,f_dim2,t_dim2,threshold,base)
    print('The initial number of peaks is:'+str(len(peaks)))
    peaks = pea.reduce_peaks(peaks, fftsize, high_peak_threshold, low_peak_threshold)
    print('The reduced number of peaks is:'+str(len(peaks)))

    #Store information for the spectrogram graph
    samplePeaks = peaks
    sampleSpectro = spectrogram

    hashSample = fhash.hashSamplePeaks(peaks,delay_time,delta_time,delta_freq)
    print('The dimensions of the hash matrix of the sample: '+str(hashSample.shape))

    # tuple of all parallel python servers to connect with
    ppservers = ()
    #ppservers = ("10.0.0.1",)

    if len(sys.argv) > 1:
        ncpus = int(sys.argv[1])
        # Creates jobserver with ncpus workers
        job_server = pp.Server(ncpus, ppservers=ppservers)
    else:
        # Creates jobserver with automatically detected number of workers
        job_server = pp.Server(ppservers=ppservers)

    print("Starting pp with", job_server.get_ncpus(), "workers")
    print('Attempting to identify the sample audio clip.')
Here is where I call the function from fingerprint. The commented-out line worked, but when I try to parallelize it, it doesn't work:
    timepairs = job_server.submit(fhash.findTimePairs, (database, hashSample, TPdelta_freq, TPdelta_time, ))
    # timepairs = fhash.findTimePairs(database, hashSample, TPdelta_freq, TPdelta_time)
    print(timepairs)

    #Compute number of matches by song id to determine a match
    numSongs = len(songnames)
    songbins= np.zeros(numSongs)
    numOffsets = len(timepairs)
    offsets = np.zeros(numOffsets)
    index = 0
    for i in timepairs:
        offsets[index]=i[0]-i[1]
        index = index+1
        songbins[i[2]] += 1

    # Identify the song
    #orderarray=np.column_stack((songbins,songnames))
    #orderarray=orderarray[np.lexsort((songnames,songbins))]
    q3=np.percentile(songbins, 75)
    q1=np.percentile(songbins, 25)
    j=0
    for i in songbins:
        if i>(q3+(3*(q3-q1))):
            print("Result-> "+str(i)+":"+songnames[j])
        j+=1

    end=time.time()
    print('Tiempo: '+str(end-start)+' s')
    print("Time elapsed: ", +time.time() - start, "s")

    fig3 = pylab.figure(1003)
    ax = fig3.add_subplot(111)
    ind = np.arange(numSongs)
    width = 0.35
    rects1 = ax.bar(ind,songbins,width,color='blue',align='center')
    ax.set_ylabel('Number of Matches')
    ax.set_xticks(ind)
    xtickNames = ax.set_xticklabels(songnames)
    matplotlib.pyplot.setp(xtickNames)
    pylab.title('Song Identification')
    fig3.show()
    pylab.show()

    print('The sample song is: '+str(songnames[np.argmax(songbins)]))
The function in fingerprint that I am trying to parallelize is:
def findTimePairs(hash_database,sample_hash,deltaTime,deltaFreq):
    "Find the matching pairs between sample audio file and the songs in the database"
    timePairs = []

    for i in sample_hash:
        for j in hash_database:
            if(i[0] > (j[0]-deltaFreq) and i[0] < (j[0] + deltaFreq)):
                if(i[1] > (j[1]-deltaFreq) and i[1] < (j[1] + deltaFreq)):
                    if(i[2] > (j[2]-deltaTime) and i[2] < (j[2] + deltaTime)):
                        timePairs.append((j[3],i[3],j[4]))
                    else:
                        continue
                else:
                    continue
            else:
                continue

    return timePairs
The complete error is:
Traceback (most recent call last):
File "analisisPrueba.py", line 93, in <module>
numOffsets = len(timepairs)
TypeError: object of type '_Task' has no len()
The submit() method submits a task to the server. What you get back is a reference to the task, not its result. (How could it return its result? submit() returns before any of that work has been done!) You should instead provide a callback function to receive the results. For example, timepairs.append is a function that will take the result and append it to the list timepairs.
timepairs = []
job_server.submit(fhash.findTimePairs, (database, hashSample, TPdelta_freq, TPdelta_time, ), callback=timepairs.append)
(Each findTimePairs call should calculate one result, in case that isn't obvious, and you should submit multiple tasks. Otherwise you're invoking all the machinery of Parallel Python for no reason. And make sure you call job_server.wait() to wait for all the tasks to finish before trying to do anything with your results. In short, read the documentation and some example scripts and make sure you understand how it works.)
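To make that concrete, here is a sketch in which the database is split into chunks and each chunk is submitted as its own task, with a callback collecting the partial results. It reuses the variables from the question's script; the chunking scheme and the chunk count are my assumptions, not part of the original code:

import numpy as np

n_chunks = job_server.get_ncpus()            # one task per worker; an arbitrary choice
chunks = np.array_split(database, n_chunks)  # split the database rows into roughly equal parts

timepairs = []
for chunk in chunks:
    # each task runs findTimePairs on its own slice of the database
    job_server.submit(fhash.findTimePairs,
                      (chunk, hashSample, TPdelta_freq, TPdelta_time),
                      callback=timepairs.extend)

job_server.wait()  # block until every submitted task has finished
print(len(timepairs), 'matching time pairs found')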
I created a Python program which uses the ArcGIS function "CostPath" to automatically build least-cost paths (LCPs) between several polygons contained in the shapefile "selected_patches.shp". My program seems to work, but it is much too slow: I must build 275,493 LCPs. Unfortunately, I don't know how to speed it up (I am a beginner in Python and ArcGIS). Is there another way to rapidly calculate least-cost paths between several polygons with ArcGIS (I use ArcGIS 10.1)? Here is my code:
# Import system modules
import arcpy
from arcpy import env
from arcpy.sa import *
arcpy.CheckOutExtension("Spatial")

# Overwrite outputs
arcpy.env.overwriteOutput = True

# Set the workspace
arcpy.env.workspace = "C:\Users\LCP"

# Set the extent environment
arcpy.env.extent = "costs.tif"

rowsInPatches_start = arcpy.SearchCursor("selected_patches.shp")
for rowStart in rowsInPatches_start:
    ID_patch_start = rowStart.getValue("GRIDCODE")
    expressionForSelectInPatches_start = "GRIDCODE=%s" % (ID_patch_start)  ## Define SQL expression for the function Select Layer By Attribute

    # Process: Select Layer By Attribute in Patches_start
    arcpy.MakeFeatureLayer_management("selected_patches.shp", "Selected_patch_start", expressionForSelectInPatches_start)

    # Process: Cost Distance
    outCostDist = CostDistance("Selected_patch_start", "costs.tif", "", "outCostLink.tif")

    # Save the output
    outCostDist.save("outCostDist.tif")

    rowsInSelectedPatches_end = arcpy.SearchCursor("selected_patches.shp")
    for rowEnd in rowsInSelectedPatches_end:
        ID_patch_end = rowEnd.getValue("GRIDCODE")
        expressionForSelectInPatches_end = "GRIDCODE=%s" % (ID_patch_end)  ## Define SQL expression for the function Select Layer By Attribute

        # Process: Select Layer By Attribute in Patches_end
        arcpy.MakeFeatureLayer_management("selected_patches.shp", "Selected_patch_end", expressionForSelectInPatches_end)

        # Process: Cost Path
        outCostPath = CostPath("Selected_patch_end", "outCostDist.tif", "outCostLink.tif", "EACH_ZONE", "FID")

        # Save the output
        outCostPath.save('P_' + str(int(ID_patch_start)) + '_' + str(int(ID_patch_end)) + ".tif")

        # Writing in file .txt
        outfile = open('P_' + str(int(ID_patch_start)) + '_' + str(int(ID_patch_end)) + ".txt", "w")
        rowsTxt = arcpy.SearchCursor('P_' + str(int(ID_patch_start)) + '_' + str(int(ID_patch_end)) + ".tif")
        for rowTxt in rowsTxt:
            value = rowTxt.getValue("Value")
            count = rowTxt.getValue("Count")
            pathcost = rowTxt.getValue("PATHCOST")
            startrow = rowTxt.getValue("STARTROW")
            startcol = rowTxt.getValue("STARTCOL")
            print value, count, pathcost, startrow, startcol
            outfile.write(str(value) + " " + str(count) + " " + str(pathcost) + " " + str(startrow) + " " + str(startcol) + "\n")
        outfile.close()
Thanks very much for your help.
The speed of writing to disk versus calculating your cost can be a bottleneck; consider adding a thread to handle all of your writes.
This:
for rowTxt in rowsTxt:
    value = rowTxt.getValue("Value")
    count = rowTxt.getValue("Count")
    pathcost = rowTxt.getValue("PATHCOST")
    startrow = rowTxt.getValue("STARTROW")
    startcol = rowTxt.getValue("STARTCOL")
    print value, count, pathcost, startrow, startcol
    outfile.write(str(value) + " " + str(count) + " " + str(pathcost) + " " + str(startrow) + " " + str(startcol) + "\n")
Can be converted into a thread function by making rowsTxt a global variable and having your thread write to disk from rowsTxt.
After you complete all of your processing, you can set an additional global boolean so that the thread function ends when you are done writing everything, and then you can close the thread.
Example thread function I currently use:
import threading

class ThreadExample:
    def __init__(self):
        self.receiveThread = None

    def startRXThread(self):
        self.receiveThread = threading.Thread(target=self.receive)
        self.receiveThread.start()

    def stopRXThread(self):
        if self.receiveThread is not None:
            self.receiveThread._Thread__stop()  # private CPython 2 API; there is no public Thread.stop()
            self.receiveThread.join()
            self.receiveThread = None

    def receive(self):
        while True:
            # do stuff for the life of the thread
            # in my case, I listen on a socket for data
            # and write it out
            pass
So for your case, you could add a class variable to the thread class:
self.rowsTxt
and then update receive() to check self.rowsTxt: if it is not empty, handle it as you do in the code snippet I took from you above, and afterwards set self.rowsTxt back to None. Your main function would update the thread's self.rowsTxt as it gets each rowsTxt. Consider using a buffer such as a list for self.rowsTxt so you don't miss writing anything.
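Here is a sketch of the same idea using a queue and a dedicated writer thread rather than a shared class attribute; the single output file and the sentinel-based shutdown are my assumptions, not the exact design described above:

import threading
import Queue  # named 'queue' in Python 3

write_queue = Queue.Queue()

def writer(path):
    # consume rows from the queue and write them until the sentinel None arrives
    with open(path, "w") as outfile:
        while True:
            row = write_queue.get()
            if row is None:       # sentinel: nothing more to write
                break
            outfile.write(" ".join(str(v) for v in row) + "\n")

writer_thread = threading.Thread(target=writer, args=("all_paths.txt",))
writer_thread.start()

# in the main processing loop, replace the outfile.write(...) call with:
# write_queue.put((value, count, pathcost, startrow, startcol))

# when all processing is finished:
write_queue.put(None)   # tell the writer to stop
writer_thread.join()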
The most immediate change you can make to significantly improve speed is to switch to the data access cursors (e.g. arcpy.da.SearchCursor()). To illustrate, I ran a benchmark test a while back to see how the data access cursors perform compared to the old cursors.
The attached figure shows the results of a benchmark test of the new da UpdateCursor method versus the old UpdateCursor method. Essentially, the benchmark test performs the following workflow:
1) Create random points (10, 100, 1000, 10000, 100000)
2) Randomly sample from a normal distribution and add the value to a new column in the random points attribute table with a cursor
3) Run 5 iterations of each random point scenario for both the new and old UpdateCursor methods and write the mean values to lists
4) Plot the results
import arcpy, os, numpy, time
arcpy.env.overwriteOutput = True

outws = r'C:\temp'
fc = os.path.join(outws, 'randomPoints.shp')

iterations = [10, 100, 1000, 10000, 100000]

old = []
new = []
meanOld = []
meanNew = []

for x in iterations:
    arcpy.CreateRandomPoints_management(outws, 'randomPoints', '', '', x)
    arcpy.AddField_management(fc, 'randFloat', 'FLOAT')

    for y in range(5):
        # Old method ArcGIS 10.0 and earlier
        start = time.clock()
        rows = arcpy.UpdateCursor(fc)
        for row in rows:
            # generate random float from normal distribution
            s = float(numpy.random.normal(100, 10, 1))
            row.randFloat = s
            rows.updateRow(row)
        del row, rows
        end = time.clock()
        total = end - start
        old.append(total)
        del start, end, total

        # New method 10.1 and later
        start = time.clock()
        with arcpy.da.UpdateCursor(fc, ['randFloat']) as cursor:
            for row in cursor:
                # generate random float from normal distribution
                s = float(numpy.random.normal(100, 10, 1))
                row[0] = s
                cursor.updateRow(row)
        end = time.clock()
        total = end - start
        new.append(total)
        del start, end, total

    meanOld.append(round(numpy.mean(old), 4))
    meanNew.append(round(numpy.mean(new), 4))

#######################
# plot the results
import matplotlib.pyplot as plt
plt.plot(iterations, meanNew, label = 'New (da)')
plt.plot(iterations, meanOld, label = 'Old')
plt.title('arcpy.da.UpdateCursor -vs- arcpy.UpdateCursor')
plt.xlabel('Random Points')
plt.ylabel('Time (minutes)')
plt.legend(loc = 2)
plt.show()
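Applied to the question's script, the patch IDs could be read once with arcpy.da.SearchCursor and the two loops then iterate over plain Python lists. This is a sketch only, reusing the names from the question; the surrounding CostDistance/CostPath calls are unchanged and untested here:

import arcpy
from arcpy.sa import *

arcpy.CheckOutExtension("Spatial")
arcpy.env.overwriteOutput = True
arcpy.env.workspace = r"C:\Users\LCP"
arcpy.env.extent = "costs.tif"

# read all patch IDs once with the faster data access cursor
with arcpy.da.SearchCursor("selected_patches.shp", ["GRIDCODE"]) as cursor:
    patch_ids = [row[0] for row in cursor]

for ID_patch_start in patch_ids:
    arcpy.MakeFeatureLayer_management("selected_patches.shp", "Selected_patch_start",
                                      "GRIDCODE=%s" % ID_patch_start)
    outCostDist = CostDistance("Selected_patch_start", "costs.tif", "", "outCostLink.tif")
    outCostDist.save("outCostDist.tif")

    for ID_patch_end in patch_ids:
        arcpy.MakeFeatureLayer_management("selected_patches.shp", "Selected_patch_end",
                                          "GRIDCODE=%s" % ID_patch_end)
        outCostPath = CostPath("Selected_patch_end", "outCostDist.tif", "outCostLink.tif",
                               "EACH_ZONE", "FID")
        outCostPath.save('P_%d_%d.tif' % (int(ID_patch_start), int(ID_patch_end)))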