Related
I want to parallelize the spec which is generated by _spectrum_generator. I am using futures.ThreadPoolExecutor which is called in _gather_lcms_data. The spec is passed through function. The file is in .mzML format. Below is the output that i get which is empty.
(base) ashish#14-ce3xxx:~/GNPS_LCMSDashboard$ python3 lcms_map.py
Empty DataFrame
Columns: [mz, rt, i, scan, index, polarity]
Index: []
The output should be look like this:
(base) ashish#14-ce3xxx:/media/ashish/ubuntu7/GNPS_LCMSDashboard$ python3 lcms_map.py
mz rt i scan index polarity
0 169.038696 0.003722 1652.959961 1 1 1
1 177.969086 0.003722 1786.755127 1 1 1
2 194.156967 0.003722 1802.361450 1 1 1
3 154.059418 0.003722 1840.889160 1 1 1
4 164.080978 0.003722 1973.758423 1 1 1
5 150.079514 0.003722 1976.528687 1 1 1
6 160.096634 0.003722 2057.728516 1 1 1
7 201.182205 0.003722 2077.768311 1 1 1
8 162.078735 0.003722 2101.843018 1 1 1
9 171.044205 0.003722 2223.230713 1 1 1
Below is the code of _spectrum_generator:
def _spectrum_generator(filename, min_rt, max_rt):
run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
# Don't do this if the min_rt and max_rt are not reasonable values
if min_rt <= 0 and max_rt > 1000:
for spec in run:
yield spec
else:
try:
min_rt_index = _find_lcms_rt(run, min_rt) # These are inclusive on left
max_rt_index = _find_lcms_rt(run, max_rt) + 1 # Exclusive on the right
for spec_index in tqdm(range(min_rt_index, max_rt_index)):
spec = run[spec_index]
yield spec
print("USED INDEX")
except:
run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
for spec in run:
yield spec
print("USED BRUTEFORCE")
Below is code of lcms_map.py:
import os
import pymzml
import numpy as np
import datashader as ds
from tqdm import tqdm
import json
import pandas as pd
import xarray
import time
import utils
import plotly.express as px
import plotly.graph_objects as go
from utils import _spectrum_generator
from utils import _get_scan_polarity
from multiprocessing import Pool
import concurrent.futures
from multiprocessing import Process
# Enum for polarity
POLARITY_POS = 1
POLARITY_NEG = 2
def _gather_lcms_data(filename, min_rt, max_rt, min_mz, max_mz, polarity_filter="None", top_spectrum_peaks=100, include_polarity=False):
all_mz = []
all_rt = []
all_polarity = []
all_i = []
all_scan = []
all_index = []
spectrum_index = 0
number_spectra = 0
all_msn_mz = []
all_msn_rt = []
all_msn_polarity = []
all_msn_scan = []
all_msn_level = []
#fun(filename, min_rt, max_rt)
for spec in _spectrum_generator(filename, min_rt, max_rt):
rt = spec.scan_time_in_minutes()
try:
# Still waiting for the window
if rt < min_rt:
continue
# pass
# We've passed the window
if rt > max_rt:
break
except:
pass
if polarity_filter == "None":
pass
else:
scan_polarity = _get_scan_polarity(spec)
if polarity_filter != scan_polarity:
continue
if spec.ms_level == 1:
spectrum_index += 1
number_spectra += 1
try:
# Filtering peaks by mz
if min_mz <= 0 and max_mz >= 2000:
peaks = spec.peaks("raw")
else:
peaks = spec.reduce(mz_range=(min_mz, max_mz))
# Filtering out zero rows
peaks = peaks[~np.any(peaks < 1.0, axis=1)]
# Sorting by intensity
peaks = peaks[peaks[:,1].argsort()]
peaks = peaks[-1 * top_spectrum_peaks:]
mz, intensity = zip(*peaks)
all_mz += list(mz)
all_i += list(intensity)
all_rt += len(mz) * [rt]
all_scan += len(mz) * [spec.ID]
all_index += len(mz) * [number_spectra]
# Adding polarity
if include_polarity is True:
scan_polarity = _get_scan_polarity(spec)
if scan_polarity == "Positive":
all_polarity += len(mz) * [POLARITY_POS]
else:
all_polarity += len(mz) * [POLARITY_NEG]
except:
pass
elif spec.ms_level > 1:
try:
msn_mz = spec.selected_precursors[0]["mz"]
if msn_mz < min_mz or msn_mz > max_mz:
continue
all_msn_mz.append(msn_mz)
all_msn_rt.append(rt)
all_msn_scan.append(spec.ID)
all_msn_level.append(spec.ms_level)
# Adding polarity
if include_polarity is True:
scan_polarity = _get_scan_polarity(spec)
if scan_polarity == "Positive":
all_msn_polarity.append(POLARITY_POS)
else:
all_msn_polarity.append(POLARITY_NEG)
except:
pass
ms1_results = {}
ms1_results["mz"] = all_mz
ms1_results["rt"] = all_rt
ms1_results["i"] = all_i
ms1_results["scan"] = all_scan
ms1_results["index"] = all_index
msn_results = {}
msn_results["precursor_mz"] = all_msn_mz
msn_results["rt"] = all_msn_rt
msn_results["scan"] = all_msn_scan
msn_results["level"] = all_msn_level
# Adding polarity
if include_polarity is True:
ms1_results["polarity"] = all_polarity
msn_results["polarity"] = all_msn_polarity
ms1_results = pd.DataFrame(ms1_results)
msn_results = pd.DataFrame(msn_results)
return ms1_results, number_spectra, msn_results
def _get_feather_filenames(filename):
output_ms1_filename = filename + ".ms1.feather"
output_msn_filename = filename + ".msn.feather"
return output_ms1_filename, output_msn_filename
# These are caching layers for fast loading
def _save_lcms_data_feather(filename):
output_ms1_filename, output_msn_filename = _get_feather_filenames(filename)
start=time.time()
# with Pool(5) as p:
# #ms1_results, number_spectra, msn_results = p.starmap(_gather_lcms_data, (filename, 0, 1000000, 0, 10000, "None", 100000, True))
# ms1_results, number_spectra, msn_results = _gather_lcms_data(filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
# with concurrent.futures.ProcessPoolExecutor(max_workers=100) as executor:
# f=executor.submit(_gather_lcms_data, filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
# ms1_results, number_spectra, msn_results = f.result()
ms1_results, number_spectra, msn_results = _gather_lcms_data(filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
print(ms1_results.head(10))
print("Gathered data in", time.time() - start)
ms1_results = ms1_results.sort_values(by='i', ascending=False).reset_index()
ms1_results.to_feather(output_ms1_filename)
msn_results.to_feather(output_msn_filename)
_save_lcms_data_feather("/media/ashish/ubuntu7/GNPS_LCMSDashboard/QC_0.mzML")
How do i get the desired output by parallelizing. Suggest the changes that i need make in order to make it work.
As Simon Lundberg pointed out, you posted very complicated code, which makes it difficult to parallelize and even more difficult to explain how it is to be done. But if you were to present a simplified version of your code that was readily parallelizable, any answer would not be dealing with the complexities of your actual current code and would therefore be of little help. So I will try to create code that is an abstraction of your code's structure and then show how I would parallelize that. I am afraid that since you are not that familiar with multiprocessing, this may be rather difficult for you to follow.
First, a few observations about your code:
_gather_lcms_data currently is passed a filename and then using a generator function, _spectrum_generator, loops on all of its elements, called variable spec. In each loop iteration variable results are appended to various lists and variable number_spectra is optionally incremented. You have another variable spectrum_index that is also optionally incremented but its value is not otherwise used and could be eliminated. Finally, these lists are added to various dictionaries.
To parallelize the _gather_lcms_data function, it needs to process a single element, spec from the _spectrum_generator function so that we can run multiple invocations of this function in parallel. Consequently it needs to return a tuple of elements back to the main process which will do the necessary appending to lists and then create the dictionaries.
In your current code for each iteration of spec you optionally increment number_spectra and optionally append elements to various lists. Since we are now going to be parallelizing this function by returning individual elements, the main process must (1) accumulate the returned number_spectra value and optionally append the returned elements to result lists. Where in the original code you were not appending an element to a list for a given iteration, in the parallelized code you must return a None value so that the main process knows that for that iteration nothing needs to be appended.
In this abstraction, I have also reduced the number of lists down to two and I am generating dummy results.
First an abstraction of your current code.
def _spectrum_generator(filename, min_rt, max_rt):
#run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
run = [1, 2, 3, 4, 5, 6]
for spec in run:
yield spec
def _gather_lcms_data(filename, min_rt, max_rt, min_mz, max_mz, polarity_filter="None", top_spectrum_peaks=100, include_polarity=False):
# Remainder of the list declarations omitted for simplicity
all_mz = []
all_msn_mz= []
number_spectra = 0
for spec in _spectrum_generator(filename, min_rt, max_rt):
... # Code omitted for simplicity
number_spectra += 1 # Conditionally done
msn_mz = spec # Conditionally done
all_msn_mz.append(msn_mz)
mz = (spec * spec,) # Conditionally done
all_mz += list(mz)
...
ms1_results = {}
msn_results = {}
...
ms1_results["mz"] = all_mz
msn_results["precursor_mz"] = all_msn_mz
...
# Return
return ms1_results, number_spectra, msn_results
def _save_lcms_data_feather(filename):
ms1_results, number_spectra, msn_results = _gather_lcms_data(filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
print(ms1_results)
print(number_spectra)
print(msn_results)
if __name__ == '__main__':
_save_lcms_data_feather("/media/ashish/ubuntu7/GNPS_LCMSDashboard/QC_0.mzML")
Prints:
{'mz': [1, 4, 9, 16, 25, 36]}
6
{'precursor_mz': [1, 2, 3, 4, 5, 6]}
This is the parallelized version of the above code:
def _spectrum_generator(filename, min_rt, max_rt):
#run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
run = [1, 2, 3, 4, 5, 6]
for spec in run:
yield spec
def _gather_lcms_data(spec, min_rt, max_rt, min_mz, max_mz, polarity_filter="None", top_spectrum_peaks=100, include_polarity=False):
# Remainder of the list declarations omitted for simplicity
number_spectra = 0
... # Code omitted for simplicity
number_spectra += 1
msn_mz = spec # Conditionally done. If not done then set msn_mz to None
mz = list((spec * spec,)) # Conditionally done. If not done then set mz to None
...
return mz, number_spectra, msn_mz
def _save_lcms_data_feather(filename):
from multiprocessing import Pool
from functools import partial
min_rt = 0
max_rt = 1000000
worker_function = partial(_gather_lcms_data, min_rt=min_rt, max_rt=max_rt, min_mz=0, max_mz=10000, polarity_filter="None", top_spectrum_peaks=1000000, include_polarity=True)
with Pool() as pool:
all_mz = []
all_msn_mz = []
number_spectra = 0
for mz, _number_spectra, msn_mz in pool.map(worker_function, _spectrum_generator(filename, min_rt, max_rt)):
if mz is not None:
all_mz += mz
number_spectra += _number_spectra
if msn_mz is not None:
all_msn_mz.append(msn_mz)
ms1_results = {}
msn_results = {}
ms1_results["mz"] = all_mz
msn_results["precursor_mz"] = all_msn_mz
print(ms1_results)
print(number_spectra)
print(msn_results)
if __name__ == '__main__':
_save_lcms_data_feather("/media/ashish/ubuntu7/GNPS_LCMSDashboard/QC_0.mzML")
I need to get the same results in any iteration. I tried to use random.seed() in my script, but it doesn't work. How can I fix it?
My script:
import statistics
Oddr=[]
Oddr_fem=[]
Oddr_mal=[]
chi = []
chi_fem=[]
chi_mal=[]
for k in range(100):
random.seed(10)
result = []
exclude_hlthy = []
for i in set(sick['predicted_age']):
sick_ppl = sick.index[sick['predicted_age'] == i].tolist()
L_sick = len(sick_ppl)
if L_sick == 0:
continue
hlth_peers = healthy[healthy.predicted_age == i]
L_healthy = hlth_peers.shape[0]
if L_healthy < len(sick_ppl):
pass
else:
hlthy_subsample = list(np.random.choice([x for x in hlth_peers.index if not x in exclude_hlthy],
L_sick, replace = False))
exclude_hlthy += hlthy_subsample
result += hlthy_subsample
table_ready = healthy.loc[result]
whole_table = table_ready.append(sick, ignore_index=False)
cross_tab = pd.crosstab(index=whole_table['dc013'], columns=whole_table['rate_aging'])
oddsratio=(cross_tab[1][1]*cross_tab[0][0])/(cross_tab[1][0]*cross_tab[0][1])
#Oddr += oddsratio
Oddr.append(oddsratio)
In this script I got several random tables whole_table from one subsample.
For every iteration you're creating a new random.seed(10) so try to put this line of code out of the for cycle and then should work
I want to get some basic statistics from some csv files without loading the whole file in memory. I do it in two ways, one seemingly "smart" way using pandas and another casual way using csv I expect the pandas way to be faster but the csv way is actually faster by a very large margin. I was wondering why.
Here is my code:
import pandas as pd
import csv
movies = pd.read_csv('movies.csv') # movieId,title,genres
movie_count = movies.shape[0] # 9742
movieId_min = ratings.movieId.min()
movieId_max = ratings.movieId.max()
movieId_disperse = movies.movieId.sort_values().to_dict()
movieId_squeeze = {v: k for k, v in movieId_disperse.items()}
def get_ratings_stats():
gp_by_user = []
gp_by_movie = [0] * movie_count
top_rator = (0, 0) # (idx, value)
top_rated = (0, 0) # (idx, value)
rating_count = 0
user_count = 0
last_user = -1
for row in csv.DictReader(open('ratings.csv')):
user = int(row['userId'])-1
movie = movieId_squeeze[int(row['movieId'])]
if last_user != user:
last_user = user
user_count += 1
gp_by_user += [0]
rating_count += 1
gp_by_user[user] += 1
gp_by_movie[movie] += 1
top_rator = (user, gp_by_user[user]) if gp_by_user[user] > top_rator[1] else top_rator
top_rated = (movie, gp_by_movie[movie]) if gp_by_movie[movie] > top_rated[1] else top_rated
top_rator = (top_rator[0]+1, top_rator[1])
top_rated = (movieId_disperse[top_rated[0]], top_rated[1])
return rating_count, top_rator, top_rated
Now if I replace the line:
for row in csv.DictReader(open('ratings.csv')):
With:
for chunk in pd.read_csv('ratings.csv', chunksize=1000):
for _,row in chunk.iterrows():
The code actually becomes 10 times slower.
Here are the timing results:
> %timeit get_ratings_stats() # with csv
325 ms ± 9.98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
> %timeit get_ratings_stats() # with pandas
3.45 s ± 67.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Any comments as to how I can make this code better/faster/more readable would be much appreciated
I think the point is that you shouldn't use pandas if you're going to then treat the big, expensive data structure like a dict. The question shouldn't be how to get pandas to be better at that, it should be how to write your code with pandas to do what you want.
import pandas as pd
def get_ratings_stats():
movie_rating_data = pd.read_csv('ratings.csv')
# Get the movie with the best rating
top_movie = movie_rating_data.loc[:, ['movieId', 'rating']].groupby('movieId').agg('max').sort_values(by='rating', ascending=False).iloc[:, 0]
# Get the user with the best rating
top_user = movie_rating_data.loc[:, ['userId', 'rating']].groupby('userId').agg('max').sort_values(by='rating', ascending=False).iloc[:, 0]
return movie_rating_data.shape[0], top_movie, top_user
def get_ratings_stats_slowly():
movies = pd.DataFrame(columns = ["movieId", "ratings"])
users = pd.DataFrame(users = ["userId", "ratings"])
data_size = 0
for chunk in pd.read_csv('ratings.csv', chunksize=1000):
movies = movies.append(chunk.loc[:, ['movieId', 'rating']].groupby('movieId').agg('max'))
users = users.append(chunk.loc[:, ['userId', 'rating']].groupby('userId').agg('max'))
data_size += chunk.shape[0]
top_movie = movies.loc[:, ['movieId', 'rating']].groupby('movieId').agg('max').sort_values(by='rating', ascending=False).iloc[:, 0]
top_user = users.loc[:, ['userId', 'rating']].groupby('userId').agg('max').sort_values(by='rating', ascending=False).iloc[:, 0]
return data_size, top_movie, top_user
I'm not really sure that this is what you want to do overall, but your code is incomprehensible - this should be a good place to start (you could replace .agg('max') with .count() if you're interested in the number of ratings, etc).
I think parallel processing is the answer for your question. I've tried doing some parallel processing on your problem but I had to split the ratings file into multiple files for processing.
What I did initially was to duplicate the ratings data from the CSV files by a factor of 10, and then I executed your script to have an initial execution time, which for me was about 3.6 seconds. Now, by splitting the files into multiple ones, that can be addressed by multiple child processes, and for example by using my script with -k 2 (basically 2 workers), the total execution time reduced to 1.87 seconds. If I use -k 4 (4 workers) the execution time will be 1.13 seconds.
I am not sure if it is possible to read the CSV in chunks and basically do a parallel seek reading from the CSV, from a single big file, but that would make it a lot faster, the only drawback being the need to do an initial count of the rows in the big CSV file, to know how many rows will go per worker.
The splitting script:
import csv
file_path = "data/ratings.csv"
out_path = "data/big_ratings_{}.csv"
out_csv = None
for i in range(10):
print("Iteration #{}".format(i+1))
pin = open(file_path, "r")
pout = open(out_path.format(i), "w")
in_csv = csv.DictReader(pin)
out_csv = csv.DictWriter(pout, fieldnames=in_csv.fieldnames)
out_csv.writeheader()
for row in in_csv:
out_csv.writerow(row)
pin.close()
pout.close()
The actual rating processing script
import time
import csv
import argparse
import os
import sys
from multiprocessing import Process, Queue, Value
import pandas as pd
top_rator_queue = Queue()
top_rated_queue = Queue()
DEFAULT_NO_OF_WORKERS = 1
RATINGS_FILE_PATH = "data/big_ratings_{}.csv"
NUMBER_OF_FILES = 10
class ProcessRatings(Process):
def __init__(self, file_index_range, top_rator_queue, top_rated_queue, movie_id_squeeze):
super(ProcessRatings, self).__init__()
self.file_index_range = file_index_range
self.top_rator_queue = top_rator_queue
self.top_rated_queue = top_rated_queue
self.movie_id_squeeze = movie_id_squeeze
def run(self):
for file_index in self.file_index_range:
print("[PID: {}] Processing file index {} .".format(os.getpid(), file_index))
start = time.time()
gp_by_user = []
gp_by_movie = [0] * movie_count
top_rator = (0, 0) # (idx, value)
top_rated = (0, 0) # (idx, value)
rating_count = 0
user_count = 0
last_user = -1
for row in csv.DictReader(open(RATINGS_FILE_PATH.format(file_index))):
user = int(row['userId'])-1
movie = self.movie_id_squeeze[int(row['movieId'])]
if last_user != user:
last_user = user
user_count += 1
gp_by_user += [0]
gp_by_user[user] += 1
gp_by_movie[movie] += 1
top_rator = (user, gp_by_user[user]) if gp_by_user[user] > top_rator[1] else top_rator
top_rated = (movie, gp_by_movie[movie]) if gp_by_movie[movie] > top_rated[1] else top_rated
end = time.time()
print("[PID: {}] Processing time for file index {} : {}s!".format(os.getpid(), file_index, end-start))
print("[PID: {}] WORKER DONE!".format(os.getpid()))
if __name__ == "__main__":
print("Processing ratings in multiple worker processes.")
start = time.time()
# script arguments handling
parser = argparse.ArgumentParser()
parser.add_argument("-k", dest="workers", action="store")
args_space = parser.parse_args()
# determine the number of workers
number_of_workers = DEFAULT_NO_OF_WORKERS
if args_space.workers:
number_of_workers = int(args_space.workers)
else:
print("Number of workers not specified. Assuming: {}".format(number_of_workers))
# rating data
rating_count = 0
movies = pd.read_csv('data/movies.csv') # movieId,title,genres
movie_count = movies.shape[0] # 9742
movieId_min = movies.movieId.min()
movieId_max = movies.movieId.max()
movieId_disperse = movies.movieId.sort_values().to_dict()
movieId_squeeze = {v: k for k, v in movieId_disperse.items()}
# process data
processes = []
# initialize the worker processes
number_of_files_per_worker = NUMBER_OF_FILES // number_of_workers
for i in range(number_of_workers):
p = ProcessRatings(
range(i, i+number_of_files_per_worker), # file index
top_rator_queue,
top_rated_queue,
movieId_squeeze
)
p.start()
processes.append(p)
print("MAIN: Wait for processes to finish ...")
# wait until all processes are done
while True:
# determine if the processes are still running
if not any(p.is_alive() for p in processes):
break
# gather the data and do a final processing
end = time.time()
print("Processing time: {}s".format(end - start))
print("Rating count: {}".format(rating_count))
Right now, my code is correctly spitting out the first game (identified by start_id) in games. I am trying to increment in the bottom two lines, but the while loop doesn't seem to read the fact that I'm incrementing. So the input of this with start_id 800 and end_id 802 is just the information from 800, for some reason.
Am I using the incrementers correctly? Should I be initializing one of i or start_id elsewhere?
games = console(start_id, end_id)
final_output = []
while start_id < (end_id + 1):
single_game = []
i = 0
game_id = games[i][0]
time_entries = games[i][1][2][0]
play_entries = games[i][1][2][1]
score_entries = games[i][1][2][2]
team_entries = games[i][1][2][3]
bovada = games[i][1][0][0][0]
at_capacity = games[i][1][0][1]
idisagree_yetrespect_thatcall = games[i][1][0][2][0]
imsailingaway = games[i][1][1][0][0]
homeiswheretheheartis = games[i][1][1][1][0]
zipper = zip(time_entries, play_entries, score_entries, team_entries)
for play_by_play in zipper:
single_game.append(game_id)
single_game.append(play_by_play)
single_game.append(bovada)
single_game.append(at_capacity)
single_game.append(idisagree_yetrespect_thatcall)
single_game.append(imsailingaway)
single_game.append(homeiswheretheheartis)
start_id += 1
i += 1
final_output.append(single_game)
return final_output
Your problem is that you initialize the increment-er i inside the while loop so every time your loop iterates i is reset to zero.
Try changing it to:
i = 0
while start_id < (end_id + 1):
...
Hi a sample output from running this code is shown as well, The script runs as well, you could run it to get a sense of what it does. My problem, is given the three different calls to the route() function what codes can I add to get a list of all previous objective function values ie obj variable on line 79.
desired output = obj_rslt = [20.989285714285714, 21.166176470588233, 25.8656 ]
I have tried to use the copy.copy() but it does not work, I need a list of all values as shown above for further work in another function. Thank you.
#import statements
import copy
import networkx as nx
import random as rand
#Define nodes and Edges
pos = {1001:(-42503,-3748871),1002:(-42267,-3749806),1003:(-40938,-3750235),1004: (-39452,-3750624),1005:(-39985,-3749564),1006:(-38473,-3749615),1007:(-41714,-3747171),1008:(-42279,-3745275),1009:(-41853,-3744185),1010:(-42000,-3746561),1011:(-42651,-3746188),1012:(-42195,-3747788),1013:(-41498,-3748890),1014:(-40366,-3748684),1015:(-43036,-3750284)}
edge = [(1001, 1003,{'length':0.35}),(1001, 1004,{'length':0.46}),(1001, 1009,{'length':0.49}),(1002, 1007,{'length':0.22}),(1002, 9972,{'length':0.54}),(1002, 1013,{'length':0.59}),(1003, 1014,{'length':0.25}),(1004, 1010,{'length':0.29}),(1004, 1013,{'length':0.57}),(1004, 1003,{'length':0.43}),(1004, 1006,{'length':0.37}),(1005, 1002,{'length':0.23}),(1005, 14566,{'length':0.72}),(1006, 1005,{'length':0.6}),(1007, 1003,{'length':0.39}),(1007, 1010,{'length':0.11}),(1009, 1001,{'length':0.51}),(1010, 1005,{'length':0.2}),(1011, 1004,{'length':0.37}),(1012, 1006,{'length':0.17}),(1013, 1005,{'length':0.19}),(1013, 1007,{'length':0.21}),(1014, 1005,{'length':0.35}),(1014, 1009,{'length':0.51})]
#Create the graph and add the nodes and edges
X = nx.MultiDiGraph()
X.add_nodes_from(pos.keys())
X.add_edges_from(edge)
def routes():
""" This function cretaes busroutes """
individual = []
shortest_path_length = []
num_routes = 3
#Generate the bus routes
for i in xrange(num_routes):
while True:
try:
A = int(rand.choice(pos.keys()))
B = int(rand.choice(pos.keys()))
path = nx.dijkstra_path(X,A,B,weight='length')
individual.append(path)
pathlength = round(nx.dijkstra_path_length(X,A,B),2)
if pathlength > 1:
shortest_path_length.append(pathlength)
break
else: pathlength
except:pass
# Loop through the list of shortest path nodes to get the nodes for the bus route
#bus_route_nodes = [map(lambda x: str(x) + '.0', l) for l in individual]
veh_spid = []
veh_hdw = []
obj_rslt = []
for ind in individual:
try:
headway = rand.randint(2, 30)
veh_hdw.append(headway)
speed = rand.choice([2,15,66])
veh_spid.append(speed)
except: pass
# Print bus route features
print 'OUTPUTS:'
print 'Route Name:', str(ind[0]) + ' ' + '-' + ' ' + str(ind[-1])
print 'Route Number:', individual.index(ind) + 1
print 'Route headway = ',headway
print 'Route speed = ',speed
print 'shortest path length', shortest_path_length
# Average network characteristics gotten by taken the mean of individual characteristics that make up each network
ntwk_len = sum(shortest_path_length)
ntwk_spid = sum(veh_spid)/len(veh_spid)
ntwk_hdwy = sum(veh_hdw )/len(veh_hdw)
#Calculate objective function values
obj = [0]
obj = copy.copy(obj)
obj = ( (ntwk_len/ntwk_spid) + 5 * (60/ntwk_hdwy) + (ntwk_len) )
obj_rslt.append(obj)
print 'obj_rslt', obj_rslt
print
return individual
#Three distinct method calls
routes()
routes()
routes()
OUTPUTS:
Route Name: 1014 - 1001
Route Number: 6
Route headway = 29
Route speed = 2
shortest path length [1.8, 2.77, 1.02]
obj_rslt [20.989285714285714]
OUTPUTS:
Route Name: 1003 - 1007
Route Number: 9
Route headway = 5
Route speed = 66
shortest path length [2.37, 2.57, 1.05]
obj_rslt [21.166176470588233]
OUTPUTS:
Route Name: 1012 - 1013
Route Number: 6
Route headway = 6
Route speed = 66
shortest path length [2.2, 1.85, 1.59]
obj_rslt [25.8656]
desired output = obj_rslt = [20.989285714285714, 21.166176470588233, 25.8656 ]
The problem is that you always set obj_rslt to []
Move the assignment obj_rslt = [] out of the method after the definition of pos and edge then you should be fine
#...your code
obj_results = []
def routes():
#do some computation and assume you store it in
#a variable called res
obj_results.append(res)
return res
routes()
routes()
routes()
print obj_results
The ouput will be a list with all three results