I want to parallelize the spec which is generated by _spectrum_generator. I am using futures.ThreadPoolExecutor which is called in _gather_lcms_data. The spec is passed through function. The file is in .mzML format. Below is the output that i get which is empty.
(base) ashish#14-ce3xxx:~/GNPS_LCMSDashboard$ python3 lcms_map.py
Empty DataFrame
Columns: [mz, rt, i, scan, index, polarity]
Index: []
The output should be look like this:
(base) ashish#14-ce3xxx:/media/ashish/ubuntu7/GNPS_LCMSDashboard$ python3 lcms_map.py
mz rt i scan index polarity
0 169.038696 0.003722 1652.959961 1 1 1
1 177.969086 0.003722 1786.755127 1 1 1
2 194.156967 0.003722 1802.361450 1 1 1
3 154.059418 0.003722 1840.889160 1 1 1
4 164.080978 0.003722 1973.758423 1 1 1
5 150.079514 0.003722 1976.528687 1 1 1
6 160.096634 0.003722 2057.728516 1 1 1
7 201.182205 0.003722 2077.768311 1 1 1
8 162.078735 0.003722 2101.843018 1 1 1
9 171.044205 0.003722 2223.230713 1 1 1
Below is the code of _spectrum_generator:
def _spectrum_generator(filename, min_rt, max_rt):
run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
# Don't do this if the min_rt and max_rt are not reasonable values
if min_rt <= 0 and max_rt > 1000:
for spec in run:
yield spec
else:
try:
min_rt_index = _find_lcms_rt(run, min_rt) # These are inclusive on left
max_rt_index = _find_lcms_rt(run, max_rt) + 1 # Exclusive on the right
for spec_index in tqdm(range(min_rt_index, max_rt_index)):
spec = run[spec_index]
yield spec
print("USED INDEX")
except:
run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
for spec in run:
yield spec
print("USED BRUTEFORCE")
Below is code of lcms_map.py:
import os
import pymzml
import numpy as np
import datashader as ds
from tqdm import tqdm
import json
import pandas as pd
import xarray
import time
import utils
import plotly.express as px
import plotly.graph_objects as go
from utils import _spectrum_generator
from utils import _get_scan_polarity
from multiprocessing import Pool
import concurrent.futures
from multiprocessing import Process
# Enum for polarity
POLARITY_POS = 1
POLARITY_NEG = 2
def _gather_lcms_data(filename, min_rt, max_rt, min_mz, max_mz, polarity_filter="None", top_spectrum_peaks=100, include_polarity=False):
all_mz = []
all_rt = []
all_polarity = []
all_i = []
all_scan = []
all_index = []
spectrum_index = 0
number_spectra = 0
all_msn_mz = []
all_msn_rt = []
all_msn_polarity = []
all_msn_scan = []
all_msn_level = []
#fun(filename, min_rt, max_rt)
for spec in _spectrum_generator(filename, min_rt, max_rt):
rt = spec.scan_time_in_minutes()
try:
# Still waiting for the window
if rt < min_rt:
continue
# pass
# We've passed the window
if rt > max_rt:
break
except:
pass
if polarity_filter == "None":
pass
else:
scan_polarity = _get_scan_polarity(spec)
if polarity_filter != scan_polarity:
continue
if spec.ms_level == 1:
spectrum_index += 1
number_spectra += 1
try:
# Filtering peaks by mz
if min_mz <= 0 and max_mz >= 2000:
peaks = spec.peaks("raw")
else:
peaks = spec.reduce(mz_range=(min_mz, max_mz))
# Filtering out zero rows
peaks = peaks[~np.any(peaks < 1.0, axis=1)]
# Sorting by intensity
peaks = peaks[peaks[:,1].argsort()]
peaks = peaks[-1 * top_spectrum_peaks:]
mz, intensity = zip(*peaks)
all_mz += list(mz)
all_i += list(intensity)
all_rt += len(mz) * [rt]
all_scan += len(mz) * [spec.ID]
all_index += len(mz) * [number_spectra]
# Adding polarity
if include_polarity is True:
scan_polarity = _get_scan_polarity(spec)
if scan_polarity == "Positive":
all_polarity += len(mz) * [POLARITY_POS]
else:
all_polarity += len(mz) * [POLARITY_NEG]
except:
pass
elif spec.ms_level > 1:
try:
msn_mz = spec.selected_precursors[0]["mz"]
if msn_mz < min_mz or msn_mz > max_mz:
continue
all_msn_mz.append(msn_mz)
all_msn_rt.append(rt)
all_msn_scan.append(spec.ID)
all_msn_level.append(spec.ms_level)
# Adding polarity
if include_polarity is True:
scan_polarity = _get_scan_polarity(spec)
if scan_polarity == "Positive":
all_msn_polarity.append(POLARITY_POS)
else:
all_msn_polarity.append(POLARITY_NEG)
except:
pass
ms1_results = {}
ms1_results["mz"] = all_mz
ms1_results["rt"] = all_rt
ms1_results["i"] = all_i
ms1_results["scan"] = all_scan
ms1_results["index"] = all_index
msn_results = {}
msn_results["precursor_mz"] = all_msn_mz
msn_results["rt"] = all_msn_rt
msn_results["scan"] = all_msn_scan
msn_results["level"] = all_msn_level
# Adding polarity
if include_polarity is True:
ms1_results["polarity"] = all_polarity
msn_results["polarity"] = all_msn_polarity
ms1_results = pd.DataFrame(ms1_results)
msn_results = pd.DataFrame(msn_results)
return ms1_results, number_spectra, msn_results
def _get_feather_filenames(filename):
output_ms1_filename = filename + ".ms1.feather"
output_msn_filename = filename + ".msn.feather"
return output_ms1_filename, output_msn_filename
# These are caching layers for fast loading
def _save_lcms_data_feather(filename):
output_ms1_filename, output_msn_filename = _get_feather_filenames(filename)
start=time.time()
# with Pool(5) as p:
# #ms1_results, number_spectra, msn_results = p.starmap(_gather_lcms_data, (filename, 0, 1000000, 0, 10000, "None", 100000, True))
# ms1_results, number_spectra, msn_results = _gather_lcms_data(filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
# with concurrent.futures.ProcessPoolExecutor(max_workers=100) as executor:
# f=executor.submit(_gather_lcms_data, filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
# ms1_results, number_spectra, msn_results = f.result()
ms1_results, number_spectra, msn_results = _gather_lcms_data(filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
print(ms1_results.head(10))
print("Gathered data in", time.time() - start)
ms1_results = ms1_results.sort_values(by='i', ascending=False).reset_index()
ms1_results.to_feather(output_ms1_filename)
msn_results.to_feather(output_msn_filename)
_save_lcms_data_feather("/media/ashish/ubuntu7/GNPS_LCMSDashboard/QC_0.mzML")
How do i get the desired output by parallelizing. Suggest the changes that i need make in order to make it work.
As Simon Lundberg pointed out, you posted very complicated code, which makes it difficult to parallelize and even more difficult to explain how it is to be done. But if you were to present a simplified version of your code that was readily parallelizable, any answer would not be dealing with the complexities of your actual current code and would therefore be of little help. So I will try to create code that is an abstraction of your code's structure and then show how I would parallelize that. I am afraid that since you are not that familiar with multiprocessing, this may be rather difficult for you to follow.
First, a few observations about your code:
_gather_lcms_data currently is passed a filename and then using a generator function, _spectrum_generator, loops on all of its elements, called variable spec. In each loop iteration variable results are appended to various lists and variable number_spectra is optionally incremented. You have another variable spectrum_index that is also optionally incremented but its value is not otherwise used and could be eliminated. Finally, these lists are added to various dictionaries.
To parallelize the _gather_lcms_data function, it needs to process a single element, spec from the _spectrum_generator function so that we can run multiple invocations of this function in parallel. Consequently it needs to return a tuple of elements back to the main process which will do the necessary appending to lists and then create the dictionaries.
In your current code for each iteration of spec you optionally increment number_spectra and optionally append elements to various lists. Since we are now going to be parallelizing this function by returning individual elements, the main process must (1) accumulate the returned number_spectra value and optionally append the returned elements to result lists. Where in the original code you were not appending an element to a list for a given iteration, in the parallelized code you must return a None value so that the main process knows that for that iteration nothing needs to be appended.
In this abstraction, I have also reduced the number of lists down to two and I am generating dummy results.
First an abstraction of your current code.
def _spectrum_generator(filename, min_rt, max_rt):
#run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
run = [1, 2, 3, 4, 5, 6]
for spec in run:
yield spec
def _gather_lcms_data(filename, min_rt, max_rt, min_mz, max_mz, polarity_filter="None", top_spectrum_peaks=100, include_polarity=False):
# Remainder of the list declarations omitted for simplicity
all_mz = []
all_msn_mz= []
number_spectra = 0
for spec in _spectrum_generator(filename, min_rt, max_rt):
... # Code omitted for simplicity
number_spectra += 1 # Conditionally done
msn_mz = spec # Conditionally done
all_msn_mz.append(msn_mz)
mz = (spec * spec,) # Conditionally done
all_mz += list(mz)
...
ms1_results = {}
msn_results = {}
...
ms1_results["mz"] = all_mz
msn_results["precursor_mz"] = all_msn_mz
...
# Return
return ms1_results, number_spectra, msn_results
def _save_lcms_data_feather(filename):
ms1_results, number_spectra, msn_results = _gather_lcms_data(filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
print(ms1_results)
print(number_spectra)
print(msn_results)
if __name__ == '__main__':
_save_lcms_data_feather("/media/ashish/ubuntu7/GNPS_LCMSDashboard/QC_0.mzML")
Prints:
{'mz': [1, 4, 9, 16, 25, 36]}
6
{'precursor_mz': [1, 2, 3, 4, 5, 6]}
This is the parallelized version of the above code:
def _spectrum_generator(filename, min_rt, max_rt):
#run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
run = [1, 2, 3, 4, 5, 6]
for spec in run:
yield spec
def _gather_lcms_data(spec, min_rt, max_rt, min_mz, max_mz, polarity_filter="None", top_spectrum_peaks=100, include_polarity=False):
# Remainder of the list declarations omitted for simplicity
number_spectra = 0
... # Code omitted for simplicity
number_spectra += 1
msn_mz = spec # Conditionally done. If not done then set msn_mz to None
mz = list((spec * spec,)) # Conditionally done. If not done then set mz to None
...
return mz, number_spectra, msn_mz
def _save_lcms_data_feather(filename):
from multiprocessing import Pool
from functools import partial
min_rt = 0
max_rt = 1000000
worker_function = partial(_gather_lcms_data, min_rt=min_rt, max_rt=max_rt, min_mz=0, max_mz=10000, polarity_filter="None", top_spectrum_peaks=1000000, include_polarity=True)
with Pool() as pool:
all_mz = []
all_msn_mz = []
number_spectra = 0
for mz, _number_spectra, msn_mz in pool.map(worker_function, _spectrum_generator(filename, min_rt, max_rt)):
if mz is not None:
all_mz += mz
number_spectra += _number_spectra
if msn_mz is not None:
all_msn_mz.append(msn_mz)
ms1_results = {}
msn_results = {}
ms1_results["mz"] = all_mz
msn_results["precursor_mz"] = all_msn_mz
print(ms1_results)
print(number_spectra)
print(msn_results)
if __name__ == '__main__':
_save_lcms_data_feather("/media/ashish/ubuntu7/GNPS_LCMSDashboard/QC_0.mzML")
I want to run the runBacktest() function in async is this possible?
import pandas as pd
from pathlib import Path
from datetime import datetime
from indicators import *
#Loading the file.
dfCryptoCap = pd.read_csv(f"{Path(__file__).parent.resolve()}\CRYPTOCAP_TOTAL, 720_b2571.csv")
dfBtcUsd = pd.read_csv(f"{Path(__file__).parent.resolve()}\INDEX_BTCUSD, 720_17561.csv")
# Add Column for converted unix timestamp to datetime
dfCryptoCap['timeiso'] = pd.to_datetime(dfCryptoCap['time'],unit='s')
dfBtcUsd['timeiso'] = pd.to_datetime(dfBtcUsd['time'],unit='s')
dfCryptoCapHA = generateHeikinAshi(dfCryptoCap)
dfBtcUsdHA = generateHeikinAshi(dfBtcUsd)
results = []
def runBacktest(lenSmooth1, winningLenSmooth1, winningPNL):
dfCryptoCapEMA = dfCryptoCapHA.copy()
dfCryptoCapEMA['open'] = calculateEMA(dfCryptoCapHA['open'], lenSmooth1)
dfCryptoCapEMA['high'] = calculateEMA(dfCryptoCapHA['high'], lenSmooth1)
dfCryptoCapEMA['low'] = calculateEMA(dfCryptoCapHA['low'], lenSmooth1)
dfCryptoCapEMA['close'] = calculateEMA(dfCryptoCapHA['close'], lenSmooth1)
# print(dfCryptoCapSMA1)
portfoliosize = 1000
entryPrice = 0.0
traderesult = 0.0
for i in range(1, len(dfCryptoCapEMA)):
if dfCryptoCapEMA.iloc[i]['close'] > dfCryptoCapEMA.iloc[i]['open'] and dfCryptoCapEMA.iloc[i -1]['close'] <= dfCryptoCapEMA.iloc[i -1]['open']:
btcOHLC = dfBtcUsd.loc[dfBtcUsd['time'] == dfCryptoCapEMA.iloc[i]['time']]
entryPrice = btcOHLC.iloc[0]['close'].tolist()
elif dfCryptoCapEMA.iloc[i]['close'] < dfCryptoCapEMA.iloc[i]['open'] and dfCryptoCapEMA.iloc[i -1]['close'] >= dfCryptoCapEMA.iloc[i -1]['open']:
btcOHLC = dfBtcUsd.loc[dfBtcUsd['time'] == dfCryptoCapEMA.iloc[i]['time']]
try:
traderesult = (btcOHLC.iloc[0]['close'].tolist() - entryPrice) / entryPrice * 100
except:
traderesult = 0
if traderesult > 0:
portfoliosize = portfoliosize * (1 + (traderesult / 100))
elif traderesult < 0:
portfoliosize = portfoliosize * (1 - (abs(traderesult) / 100))
result = f"Round - lenSmooth1 = {lenSmooth1} | PNL = {round(portfoliosize,2)} || currentWinner = {winningLenSmooth1} | currentWinnerPNL = {round(winningPNL,2)}"
#print(result)
if portfoliosize > winningPNL:
results.append(result)
winningPNL = portfoliosize
winningLenSmooth1 = lenSmooth1
return [winningLenSmooth1, winningPNL]
result = []
for x in range(1, 151, 1):
if x == 1:
result = runBacktest(x, 0, 0)
else:
result = runBacktest(x, result[0], result[1])
print(results[len(results) - 1])
Currently, the backtest runs synchronously and with larger datasets each iteration take up to a minute at the moment. I want to speed up the process by run runBacktest() asynchronous with different lenSmooth1 value and review the results at the end.
i tried to add the following to my script but i don't see any improvements in duration
import asyncio
async def run_tasks():
tasks = [runBacktest(x, 0, 0) for x in range(1, 151, 1)]
#await asyncio.wait(tasks)
await asyncio.gather(*tasks)
def main():
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(run_tasks())
loop.close()
main()
print(results)
Async will improve performance when you are working with IO operations (e.g. waiting for a response over a network, loading a file, etc.) but it won't do much to help with CPU-bound processes.
This article does a great job of breaking down different means of achieving concurrency in Python. What you're looking for is likely an implementation of the multiprocessing library.
I have a bunch of matrix multiplication operations that are performed only row-wise. I was wondering how to speed-up the computation by parallelization:
data = np.random.randint(1, 100, (100000, 800))
indices_1 = np.equal(data, 1)
A = np.zeros((100000, 100))
B = np.random.randn(800, 100)
for i in range(100000):
ones = indices_1[i]
not_ones = ~indices_1[i]
B_ones = B[ones]
B_not_ones = B[not_ones]
A[i] = (data[i][not_ones] # B_not_ones) # np.linalg.inv(B_not_ones.T # B_not_ones)
data[i][ones] = A[i] # B_ones.T
I tried multiprocessor but for some reason, but it did not perform better than sequential. Here is my multiprocessor implementation:
from multiprocessing.pool import ThreadPool, Pool
pool = ThreadPool() # can also use Pool
def f(i):
ones = indices_1[i]
not_ones = ~indices_1[i]
B_ones = B[ones]
B_not_ones = B[not_ones]
A[i] = (data[i][not_ones] # B_not_ones) # np.linalg.inv(B_not_ones.T # B_not_ones)
data[i][ones] = A[i] # B_ones.T
pool.map(f, range(100000))
Both yielded the same amount of running time (around 32 seconds). Other parallelization method like concurrent.futures did not improve the runtime (used like below):
with concurrent.futures.ThreadPoolExecutor() as executor:
result = executor.map(f, range(100000))
I also tried to apply dask but could not make their framework work in my case. Any help will be much appreciated! Thanks!
import numpy as np
import multiprocessing as mp
data = list(np.random.randint(1, 100, (100000, 800)))
indices_1 = np.equal(data, 1)
A = list(np.zeros((100000, 100)))
B = np.random.randn(800, 100)
def f(data, A, i):
ones = indices_1[i]
not_ones = ~indices_1[i]
B_ones = B[ones]
B_not_ones = B[not_ones]
A[i] = (data[i][not_ones] # B_not_ones) # np.linalg.inv(B_not_ones.T # B_not_ones)
data[i][ones] = A[i] # B_ones.T
with mp.Manager() as manager:
data_global = manager.list(data)
A_global = manager.list(A)
with mp.Pool() as p:
results = [ p.apply_async(f, (data_global, A_global, i,)) for i in range(100000) ]
for i in results:
i.wait()
data_global = list(data_global)
A_global = list(A_global)
I tried to use multiprocessing on this for loop:
def segment_features(segment_pixels):
features = []
npixels, nbands = segment_pixels.shape
for b in range(nbands):
stats = scipy.stats.describe(segment_pixels[:, b])
band_stats = list(stats.minmax) + list(stats)[2:]
if npixels == 1:
# in this case the variance = nan, change it 0.0
band_stats[3] = 0.0
features += band_stats
return features
segment_ids = np.unique(segments)
objects = []
object_ids = []
for id in segment_ids:
segment_pixels = img[segments == id]
object_features = segment_features(segment_pixels)
objects.append(object_features)
object_ids.append(id)
By replacing the for loop section with this:
def segment_features_calc(segment_pixels):
segment_pixels = img[segments == id]
object_features = segment_features(segment_pixels)
objects.append(object_features)
object_ids.append(id)
print("segment "+str(id)+" features calculated")
n=mp.cpu_count()
if__name__ == '__main__':
p = mp.Pool(processes = n)
start = time.time()
async_result = p.map_async(segment_features_calc,list(segment_ids))
p.close()
p.join()
print("Complete")
end = time.time()
print('total time (s)= ' + str(end-start))
However, the multiprocessing does not get executed properly (multiprocessing stops after 0.17 seconds whereas the items to loop are close to 270,000+ segment IDs). Any insights on how to solve the issue?
I am attempting to speed up calculations on a pandas DataFrame using multiprocessing which goes really well minus the fact that assigning the result of the calculation to the df.ix does not work here like it does in my code without trying multiprocessing here
I've added a #sanity check to the code which outputs valid values and would make me think this would work just fine, but the DataFrame doesn't get populated (stays as NaN). Does anyone know why that may be, and more importantly, what changes may be needed to plug the values into the DataFrame in the context of multiprocessing?
Output of sanity check:
should be setting df.ix[4][1] to: 23.2506112824
should be setting df.ix[0][0] to: 0.0
should be setting df.ix[7][0] to: 15.9574526264
code:
import mysql.connector
import numpy as np
from colormath.color_objects import LabColor
from colormath.color_diff import delta_e_cie2000
import pandas as pd
from mysql.connector.pooling import MySQLConnectionPool
from multiprocessing import Pool
pool = Pool()
cnx = mysql.connector.connect(user='user', password='pass',host='localhost', database='database')
cursor = cnx.cursor()
selectstmt = 'SELECT CIE_Lab, ID FROM `database`.`table`'
cursor.execute(selectstmt)
color = cursor.fetchall()
df = pd.DataFrame(columns = color, index = color)
sides = df.index
headers = df.dtypes.index
shape = df.shape[0]
def delta(cie_Lab1, cie_Lab2):
cie_Lab1 = cie_Lab1[1:]
cie_Lab1 = cie_Lab1[:-1]
cie_Lab2 = cie_Lab2[1:]
cie_Lab2 = cie_Lab2[:-1]
CIE_list1 = cie_Lab1.split(",")
CIE_list2 = cie_Lab2.split(",")
#print CIE_list1
CIE_L1 = CIE_list1[0]
CIE_a1 = CIE_list1[1]
CIE_b1 = CIE_list1[2]
CIE_L2 = CIE_list2[0]
CIE_a2 = CIE_list2[1]
CIE_b2 = CIE_list2[2]
color1 = LabColor(lab_l=CIE_L1, lab_a=CIE_a1, lab_b=CIE_b1)
color2 = LabColor(lab_l=CIE_L2, lab_a=CIE_a2, lab_b=CIE_b2)
deltae = delta_e_cie2000(color1, color2, Kl=1, Kc=1, Kh=1)
return deltae
def deltas(nums):
listoflists = []
for num in range(nums):
for mun in range(nums):
listoflists.append([num,mun])
return listoflists
def update(inp):
sides = df.index
headers = df.dtypes.index
num = inp[0]
mun = inp[1]
res = delta(headers[num][0], sides[mun][0])
#sanity check
print "should be setting df.ix["+str(mun)+"]["+str(num)+"] to: "+str(res)
df.ix[mun][num] = res
if __name__ == '__main__':
pool = Pool(4)
pool.map(update, deltas(shape))
pool.close()
pool.join()
print df
Dataframe example:
([69.62248143012944, -54.15108764844451, 67.92070706614288], 1) \
([69.62248143012944, -54.15108764844451, 67.920... NaN
([58.17848217611454, -52.251714243997995, 56.77... NaN
([87.02539335188214, -32.15758725885986, 66.450... NaN
([86.86259502866965, -31.483524711078015, 75.14... NaN
([85.39154525710671, -31.683349117376856, 71.35... NaN