I want to parallelize the spec which is generated by _spectrum_generator. I am using futures.ThreadPoolExecutor which is called in _gather_lcms_data. The spec is passed through function. The file is in .mzML format. Below is the output that i get which is empty.
(base) ashish#14-ce3xxx:~/GNPS_LCMSDashboard$ python3 lcms_map.py
Empty DataFrame
Columns: [mz, rt, i, scan, index, polarity]
Index: []
The output should be look like this:
(base) ashish#14-ce3xxx:/media/ashish/ubuntu7/GNPS_LCMSDashboard$ python3 lcms_map.py
mz rt i scan index polarity
0 169.038696 0.003722 1652.959961 1 1 1
1 177.969086 0.003722 1786.755127 1 1 1
2 194.156967 0.003722 1802.361450 1 1 1
3 154.059418 0.003722 1840.889160 1 1 1
4 164.080978 0.003722 1973.758423 1 1 1
5 150.079514 0.003722 1976.528687 1 1 1
6 160.096634 0.003722 2057.728516 1 1 1
7 201.182205 0.003722 2077.768311 1 1 1
8 162.078735 0.003722 2101.843018 1 1 1
9 171.044205 0.003722 2223.230713 1 1 1
Below is the code of _spectrum_generator:
def _spectrum_generator(filename, min_rt, max_rt):
run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
# Don't do this if the min_rt and max_rt are not reasonable values
if min_rt <= 0 and max_rt > 1000:
for spec in run:
yield spec
else:
try:
min_rt_index = _find_lcms_rt(run, min_rt) # These are inclusive on left
max_rt_index = _find_lcms_rt(run, max_rt) + 1 # Exclusive on the right
for spec_index in tqdm(range(min_rt_index, max_rt_index)):
spec = run[spec_index]
yield spec
print("USED INDEX")
except:
run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
for spec in run:
yield spec
print("USED BRUTEFORCE")
Below is code of lcms_map.py:
import os
import pymzml
import numpy as np
import datashader as ds
from tqdm import tqdm
import json
import pandas as pd
import xarray
import time
import utils
import plotly.express as px
import plotly.graph_objects as go
from utils import _spectrum_generator
from utils import _get_scan_polarity
from multiprocessing import Pool
import concurrent.futures
from multiprocessing import Process
# Enum for polarity
POLARITY_POS = 1
POLARITY_NEG = 2
def _gather_lcms_data(filename, min_rt, max_rt, min_mz, max_mz, polarity_filter="None", top_spectrum_peaks=100, include_polarity=False):
all_mz = []
all_rt = []
all_polarity = []
all_i = []
all_scan = []
all_index = []
spectrum_index = 0
number_spectra = 0
all_msn_mz = []
all_msn_rt = []
all_msn_polarity = []
all_msn_scan = []
all_msn_level = []
#fun(filename, min_rt, max_rt)
for spec in _spectrum_generator(filename, min_rt, max_rt):
rt = spec.scan_time_in_minutes()
try:
# Still waiting for the window
if rt < min_rt:
continue
# pass
# We've passed the window
if rt > max_rt:
break
except:
pass
if polarity_filter == "None":
pass
else:
scan_polarity = _get_scan_polarity(spec)
if polarity_filter != scan_polarity:
continue
if spec.ms_level == 1:
spectrum_index += 1
number_spectra += 1
try:
# Filtering peaks by mz
if min_mz <= 0 and max_mz >= 2000:
peaks = spec.peaks("raw")
else:
peaks = spec.reduce(mz_range=(min_mz, max_mz))
# Filtering out zero rows
peaks = peaks[~np.any(peaks < 1.0, axis=1)]
# Sorting by intensity
peaks = peaks[peaks[:,1].argsort()]
peaks = peaks[-1 * top_spectrum_peaks:]
mz, intensity = zip(*peaks)
all_mz += list(mz)
all_i += list(intensity)
all_rt += len(mz) * [rt]
all_scan += len(mz) * [spec.ID]
all_index += len(mz) * [number_spectra]
# Adding polarity
if include_polarity is True:
scan_polarity = _get_scan_polarity(spec)
if scan_polarity == "Positive":
all_polarity += len(mz) * [POLARITY_POS]
else:
all_polarity += len(mz) * [POLARITY_NEG]
except:
pass
elif spec.ms_level > 1:
try:
msn_mz = spec.selected_precursors[0]["mz"]
if msn_mz < min_mz or msn_mz > max_mz:
continue
all_msn_mz.append(msn_mz)
all_msn_rt.append(rt)
all_msn_scan.append(spec.ID)
all_msn_level.append(spec.ms_level)
# Adding polarity
if include_polarity is True:
scan_polarity = _get_scan_polarity(spec)
if scan_polarity == "Positive":
all_msn_polarity.append(POLARITY_POS)
else:
all_msn_polarity.append(POLARITY_NEG)
except:
pass
ms1_results = {}
ms1_results["mz"] = all_mz
ms1_results["rt"] = all_rt
ms1_results["i"] = all_i
ms1_results["scan"] = all_scan
ms1_results["index"] = all_index
msn_results = {}
msn_results["precursor_mz"] = all_msn_mz
msn_results["rt"] = all_msn_rt
msn_results["scan"] = all_msn_scan
msn_results["level"] = all_msn_level
# Adding polarity
if include_polarity is True:
ms1_results["polarity"] = all_polarity
msn_results["polarity"] = all_msn_polarity
ms1_results = pd.DataFrame(ms1_results)
msn_results = pd.DataFrame(msn_results)
return ms1_results, number_spectra, msn_results
def _get_feather_filenames(filename):
output_ms1_filename = filename + ".ms1.feather"
output_msn_filename = filename + ".msn.feather"
return output_ms1_filename, output_msn_filename
# These are caching layers for fast loading
def _save_lcms_data_feather(filename):
output_ms1_filename, output_msn_filename = _get_feather_filenames(filename)
start=time.time()
# with Pool(5) as p:
# #ms1_results, number_spectra, msn_results = p.starmap(_gather_lcms_data, (filename, 0, 1000000, 0, 10000, "None", 100000, True))
# ms1_results, number_spectra, msn_results = _gather_lcms_data(filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
# with concurrent.futures.ProcessPoolExecutor(max_workers=100) as executor:
# f=executor.submit(_gather_lcms_data, filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
# ms1_results, number_spectra, msn_results = f.result()
ms1_results, number_spectra, msn_results = _gather_lcms_data(filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
print(ms1_results.head(10))
print("Gathered data in", time.time() - start)
ms1_results = ms1_results.sort_values(by='i', ascending=False).reset_index()
ms1_results.to_feather(output_ms1_filename)
msn_results.to_feather(output_msn_filename)
_save_lcms_data_feather("/media/ashish/ubuntu7/GNPS_LCMSDashboard/QC_0.mzML")
How do i get the desired output by parallelizing. Suggest the changes that i need make in order to make it work.
As Simon Lundberg pointed out, you posted very complicated code, which makes it difficult to parallelize and even more difficult to explain how it is to be done. But if you were to present a simplified version of your code that was readily parallelizable, any answer would not be dealing with the complexities of your actual current code and would therefore be of little help. So I will try to create code that is an abstraction of your code's structure and then show how I would parallelize that. I am afraid that since you are not that familiar with multiprocessing, this may be rather difficult for you to follow.
First, a few observations about your code:
_gather_lcms_data currently is passed a filename and then using a generator function, _spectrum_generator, loops on all of its elements, called variable spec. In each loop iteration variable results are appended to various lists and variable number_spectra is optionally incremented. You have another variable spectrum_index that is also optionally incremented but its value is not otherwise used and could be eliminated. Finally, these lists are added to various dictionaries.
To parallelize the _gather_lcms_data function, it needs to process a single element, spec from the _spectrum_generator function so that we can run multiple invocations of this function in parallel. Consequently it needs to return a tuple of elements back to the main process which will do the necessary appending to lists and then create the dictionaries.
In your current code for each iteration of spec you optionally increment number_spectra and optionally append elements to various lists. Since we are now going to be parallelizing this function by returning individual elements, the main process must (1) accumulate the returned number_spectra value and optionally append the returned elements to result lists. Where in the original code you were not appending an element to a list for a given iteration, in the parallelized code you must return a None value so that the main process knows that for that iteration nothing needs to be appended.
In this abstraction, I have also reduced the number of lists down to two and I am generating dummy results.
First an abstraction of your current code.
def _spectrum_generator(filename, min_rt, max_rt):
#run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
run = [1, 2, 3, 4, 5, 6]
for spec in run:
yield spec
def _gather_lcms_data(filename, min_rt, max_rt, min_mz, max_mz, polarity_filter="None", top_spectrum_peaks=100, include_polarity=False):
# Remainder of the list declarations omitted for simplicity
all_mz = []
all_msn_mz= []
number_spectra = 0
for spec in _spectrum_generator(filename, min_rt, max_rt):
... # Code omitted for simplicity
number_spectra += 1 # Conditionally done
msn_mz = spec # Conditionally done
all_msn_mz.append(msn_mz)
mz = (spec * spec,) # Conditionally done
all_mz += list(mz)
...
ms1_results = {}
msn_results = {}
...
ms1_results["mz"] = all_mz
msn_results["precursor_mz"] = all_msn_mz
...
# Return
return ms1_results, number_spectra, msn_results
def _save_lcms_data_feather(filename):
ms1_results, number_spectra, msn_results = _gather_lcms_data(filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
print(ms1_results)
print(number_spectra)
print(msn_results)
if __name__ == '__main__':
_save_lcms_data_feather("/media/ashish/ubuntu7/GNPS_LCMSDashboard/QC_0.mzML")
Prints:
{'mz': [1, 4, 9, 16, 25, 36]}
6
{'precursor_mz': [1, 2, 3, 4, 5, 6]}
This is the parallelized version of the above code:
def _spectrum_generator(filename, min_rt, max_rt):
#run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
run = [1, 2, 3, 4, 5, 6]
for spec in run:
yield spec
def _gather_lcms_data(spec, min_rt, max_rt, min_mz, max_mz, polarity_filter="None", top_spectrum_peaks=100, include_polarity=False):
# Remainder of the list declarations omitted for simplicity
number_spectra = 0
... # Code omitted for simplicity
number_spectra += 1
msn_mz = spec # Conditionally done. If not done then set msn_mz to None
mz = list((spec * spec,)) # Conditionally done. If not done then set mz to None
...
return mz, number_spectra, msn_mz
def _save_lcms_data_feather(filename):
from multiprocessing import Pool
from functools import partial
min_rt = 0
max_rt = 1000000
worker_function = partial(_gather_lcms_data, min_rt=min_rt, max_rt=max_rt, min_mz=0, max_mz=10000, polarity_filter="None", top_spectrum_peaks=1000000, include_polarity=True)
with Pool() as pool:
all_mz = []
all_msn_mz = []
number_spectra = 0
for mz, _number_spectra, msn_mz in pool.map(worker_function, _spectrum_generator(filename, min_rt, max_rt)):
if mz is not None:
all_mz += mz
number_spectra += _number_spectra
if msn_mz is not None:
all_msn_mz.append(msn_mz)
ms1_results = {}
msn_results = {}
ms1_results["mz"] = all_mz
msn_results["precursor_mz"] = all_msn_mz
print(ms1_results)
print(number_spectra)
print(msn_results)
if __name__ == '__main__':
_save_lcms_data_feather("/media/ashish/ubuntu7/GNPS_LCMSDashboard/QC_0.mzML")
I am trying to use multiprocessing to speed up my data processing. I am working on a machine with 6 Cores, so I want to iterate through a table of 12 million rows, and for each of these rows I iterate through several time steps doing a calculation (executing a function).
This line I would like to split up that it runs in parallel on different cores:
test = [rowiteration(i, output, ini_cols, cols) for i in a] # this should run in parallel
I tried something with
from multiprocessing import Pool
but I did not manage to pass the arguments of the function and the iterator.
I would appreciate any idea. I am new to Python.
This is what i have:
import pyreadr
import pandas as pd
import numpy as np
import time
from datetime import timedelta
import functools
from pathlib import Path
def read_data():
current_path = os.getcwd()
myfile = os.path.join(str(Path(current_path).parents[0]), 'dummy.RData')
result = pyreadr.read_r(myfile)
pc = result["pc"]
u = result["u"]
return pc, u
# add one column per time
def prepare_output_structure(pc):
ini_cols = pc.columns
pc = pc.reindex(columns=[*pc.columns, *np.arange(0, 11), 'cat'], fill_value=0)
pc.reset_index(level=0, inplace=True)
# print(pc.columns, pc.shape, pc.dtypes)
return pc, ini_cols
def conjunction(*conditions):
return functools.reduce(np.logical_and, conditions)
def timeloop(t_final: int, count_final: int, tipo):
if tipo == 'A':
count_ini = 35
else: # B:
count_ini = 30
yy_list = []
for t in np.arange(0, 11):
yy = ((count_final - count_ini) / t_final) * t + count_ini
yy_list.append(int(yy))
return yy_list
def rowiteration(i, output, ini_cols, cols):
c_2: bool = pc.loc[i, 'tipo'] == u.iloc[:, 0].str[:1] # first character of category e.g. 'A1'
c_5: bool = pc.loc[i, 't_final'] >= u.iloc[:, 1] # t_min (u)
c_6: bool = pc.loc[i, 't_final'] <= (u.iloc[:, 2]) # t_max (u)
pc.loc[i, 'cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
pc.iloc[i, (0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(pc.loc[i, 't_final']), int(pc.loc[i, 'count_final']), pc.loc[i, 'tipo'])
out = pd.DataFrame(pc.iloc[i, :])
out = pd.DataFrame(out.transpose(), columns=cols)
output = output.append(out.iloc[0, :])
return output
if __name__ == '__main__':
start_time = time.time()
pc, u = read_data()
nrowpc = len(pc.index)
a = np.arange(0, nrowpc) # filas tabla pc
# print(a, nrowpc, len(pc.index))
pc, ini_cols = prepare_output_structure(pc)
cols = pc.columns
output = pd.DataFrame()
test = [rowiteration(i, output, ini_cols, cols) for i in a] # this should run in parallel
pc2 = pd.concat(test, ignore_index=True)
pc2 = pc2.iloc[:, np.r_[5, (len(ini_cols)+1):(len(pc2.columns))]]
print(pc2.head)
elapsed_time_secs = time.time() - start_time
msg = "Execution took: %s secs (Wall clock time)" % timedelta(milliseconds=elapsed_time_secs)
print(msg)```
Replace your [rowiteration(i, output, ini_cols, cols) for i in a] with:
from multiprocessing import Pool
n_cpu = 10 # put in the number of threads of cpu
with Pool(processes=n_cpu) as pool:
ret = pool.starmap(rowiteration,
[(i, output, ini_cols, cols) for i in a])
Here is an approach that I think solves the problem and that only sends what is necessary to the worker processes. I haven't tested this as is (which would be difficult without the data your code reads in) but this is basic idea:
import multiprocessing as mp
p = mp.Pool(processes=mp.cpu_count())
# Note that you already define the static cols and ini_cols
# in global scope so you don't need to pass them to the Pool.
# ... Other functions you've defined ...
def rowiteration(row):
c_2: bool = row['tipo'] == u.iloc[:, 0].str[:1]
c_5: bool = row['t_final'] >= u.iloc[:, 1]
c_6: bool = row['t_final'] <= (u.iloc[:, 2])
row['cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
row[(0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(row['t_final']), int(row['count_final']), row['tipo'])
return row
out = []
for row in p.imap_unordered(rowiteration, [r for _, r in pc.iterrows()]):
row.index = cols
out.append(cols)
pc2 = pd.DataFrame(out, ignore_index=True)
I'm unable to generate all entries in Kaltura. An ApiException with the message "Unable to generate list. max matches value was reached" (Error: QUERY_EXCEEDED_MAX_MATCHES_ALLOWED) gets triggered.
I tried to work around such issue by setting my sessionPrivileges to disableentitlement
class class_chk_integrity():
client = None
pagesize = 0
def __init__(self,worker_num, progress):
self.pagesize = 30
self.worker_num = worker_num
self.progress = progress
config = KalturaConfiguration(2723521)
config.serviceUrl = "https://www.kaltura.com/"
self.client = KalturaClient(config)
ks = self.client.session.start("KALTURA_ADMIN_SECRET",
"email#email.com",
KalturaPluginsCore.KalturaSessionType.ADMIN,
"KALTURA_PARTNER_ID",
432000,
"disableentitlement")
self.client.setKs(ks)
I also tried to filter based on the id's. However, I can't manage to put the filter.idNotIn to work properly.
def get_total_reg(self, cont, lastEntryIds, lastEntryCreatedAt):
filter = KalturaPluginsCore.KalturaBaseEntryFilter()
if lastEntryIds != "":
filter.idNotIn = lastEntryIds
filter.orderBy = KalturaBaseEntryOrderBy.CREATED_AT_DESC
pager = KalturaPluginsCore.KalturaFilterPager()
pageIndex = 1
entriesGot = 0
pager.pageSize = self.pagesize
pager.setPageIndex = pageIndex
result = self.client.baseEntry.list(filter, pager)
totalCount = result.totalCount
if totalCount > 10000:
totalCount = 9970
if totalCount <= 0:
cont = False
while entriesGot < totalCount:
pager.pageSize = self.pagesize
pageIndex += 1
pager.pageIndex = pageIndex
result = self.client.baseEntry.list(filter, pager)
entriesGot += len(result.objects)
for e in result.objects:
if lastEntryIds == "":
lastEntryIds.append(e.id)
else:
lastEntryIds.append(e.id)
lastEntryCreatedAt = e.createdAt
return result.totalCount, self.pagesize, cont, lastEntryIds, lastEntryCreatedAt
This is my how I'm calling the functions
if __name__ == '__main__':
try:
log = _ServiceUtils.log()
log.setup('all', 'integrity')
cont = True
lastEntryIds = []
lastEntryCreatedAt = 0
while cont is True:
kmc = class_chk_integrity(0,0)
kmc_total_reg, kmc_page_size, cont, lastEntryIds, lastEntryCreatedAt = kmc.get_total_reg(cont, lastEntryIds, lastEntryCreatedAt)
interval = 10
max_threads = math.ceil(kmc_total_reg / (interval * kmc_page_size))
# max_threads = 1
threads_list = []
print('TOTAL REG : %s | PAGE_SIZE : %s | INTERVAL : %s | THREADS : %s' % (kmc_total_reg,kmc_page_size,interval,max_threads))
progress = class_progress_thread(max_threads)
for index in range(0,max_threads):
page_ini = index * interval
page_end = index * interval + interval
progress.add_worker_progress(index,datetime.now())
threads_list.append(threading.Thread(target=thread_chk_integrity, args=(index, log, index * interval + 1,index * interval + interval,progress)))
threads_list.append(threading.Thread(target=thread_output_progress, args=(progress,max_threads)))
for thread in threads_list:
thread.start()
for thread in threads_list:
thread.join()
while not progress.stop(): time.sleep(30)
except KeyboardInterrupt:
try:
sys.exit(0)
except SystemExit:
os._exit(0)
I'd appreciate any help with this.
Thank you for your attention.
if totalCount > 10000:
totalCount = 9970
I'm curious to know why you are changing the totalCount this way.
Short answer - paging works as long as the result set is up to 10K.
To work around that, sort the result by creation date (as you did), and when you get to 10K, start with a new search where the created_at date in the filter is the last value you got in the previous search. Reset your paging of course.
I have the following code
global total_pds
total_pds = []
ksplit = wr.s3.list_objects(pred_path)
ksplit = list(ksplit)
def process(x):
dk = wr.s3.read_parquet(path = pred_path+x,dataset=False)
return dk
def log_result(result):
print(len(total_pds), end = ' ')
total_pds.append(result)
def error_back(error):
print('error', error)
pool = mp.Pool(processes=4,maxtasksperchild=10)
dcms_info = [pool.apply_async(process, args=(spl,), callback = log_result, error_callback = error_back) for spl in ksplit]
for x in dcms_info:
x.wait()
pool.close()
pool.join()
dataset = pd.concat(total_pds, ignore_index=True)
the last element throw me this error:
error("'i' format requires -2147483648 <= number <= 2147483647"
Thank you
I tried to use multiprocessing on this for loop:
def segment_features(segment_pixels):
features = []
npixels, nbands = segment_pixels.shape
for b in range(nbands):
stats = scipy.stats.describe(segment_pixels[:, b])
band_stats = list(stats.minmax) + list(stats)[2:]
if npixels == 1:
# in this case the variance = nan, change it 0.0
band_stats[3] = 0.0
features += band_stats
return features
segment_ids = np.unique(segments)
objects = []
object_ids = []
for id in segment_ids:
segment_pixels = img[segments == id]
object_features = segment_features(segment_pixels)
objects.append(object_features)
object_ids.append(id)
By replacing the for loop section with this:
def segment_features_calc(segment_pixels):
segment_pixels = img[segments == id]
object_features = segment_features(segment_pixels)
objects.append(object_features)
object_ids.append(id)
print("segment "+str(id)+" features calculated")
n=mp.cpu_count()
if__name__ == '__main__':
p = mp.Pool(processes = n)
start = time.time()
async_result = p.map_async(segment_features_calc,list(segment_ids))
p.close()
p.join()
print("Complete")
end = time.time()
print('total time (s)= ' + str(end-start))
However, the multiprocessing does not get executed properly (multiprocessing stops after 0.17 seconds whereas the items to loop are close to 270,000+ segment IDs). Any insights on how to solve the issue?