"aborted (disconnected)" on multithreading in python 3.5.2 - python

I wrote this python code that runs a simple markov chain to generate output, and I want to launch several threads of the same code and have them all report back to one central part of the program. I have assigned a queue to send messages on and one to receive them on, one for each thread I'm launching. In the main loop of the thread, I put the first item from the moves deque into qIn and I wait for a prompt from qOut to produce another result. In the main loop of the central part of the program, one of the threads is randomly selected every second and the data it sent is consumed and it is signalled to send more. This all works as intended when running one thread, but as soon as I increase the number of threads, such as 3, after a few seconds I get a message saying "aborted (disconnected)" and execution stops. What have I done wrong here?
import time
from numpy.random import choice
from numpy import array
from collections import OrderedDict, deque
from threading import *
from queue import Queue
import random
class processMarkovChain():
def __init__(self, qIn, qOut, IOtoCPU=65, IOtoIO=35, CPUtoCPU=75, CPUTtoIO=25, ID=0):
IOTransitionProbabilities = OrderedDict( sorted({"CPU":IOtoCPU, "IO ":IOtoIO }.items(), key=lambda x: x[0]))
CPUTransitionProbabilities = OrderedDict( sorted({"CPU":CPUtoCPU, "IO ":CPUTtoIO}.items(), key=lambda x: x[0]))
self.TP = OrderedDict( sorted({"CPU":CPUTransitionProbabilities,
"IO ":IOTransitionProbabilities}.items(), key=lambda x: x[0]))
self.state = "CPU"
self.moves = deque(self.generate_move() for _ in range(50))
self.ID = ID
self.qIn = qIn
self.qOut = qOut
def generate_move(self):
draw = choice(list(self.TP.keys()), 1, p=array(list(self.TP[self.state].values()))/100)
self.state = draw[0]
return draw[0]
def count(self, state):
counter = 0
for s in self.moves:
if s != state:
break
counter += 1
return counter
def run_loop(self):
while(1):
time.sleep(1)
retMove = self.moves.popleft()
print(self.ID, retMove, self.count(retMove))
self.qIn.put([retMove, self.count(retMove)], timeout=1000)
self.qOut.get(timeout=1000)
self.moves.append(self.generate_move())
numThreads = 1
inQueueList = [Queue() for ID in range(numThreads)]
outQueueList = [Queue() for ID in range(numThreads)]
threadList = [Thread(target=processMarkovChain(ID=ID,
qIn=inQueueList[ID],
qOut=outQueueList[ID]).run_loop).start() for ID in range(numThreads)]
while 1:
time.sleep(1)
luckyThread = random.randint(0,numThreads-1)
print(inQueueList[luckyThread].get(timeout=1000))
outQueueList[luckyThread].put("hello", timeout=1000)
Sample output from one thread:
0 CPU 1
['CPU', 1]
0 CPU 0
['CPU', 0]
0 IO 1
['IO ', 1]
0 IO 0
['IO ', 0]
0 CPU 2
['CPU', 2]
0 CPU 1
['CPU', 1]
0 CPU 0
['CPU', 0]
0 IO 0
['IO ', 0]
0 CPU 0
['CPU', 0]
0 IO 1
...
Sample output from three threads:
0 CPU 6
1 IO 0
2 IO 4
['IO ', 4]
['CPU', 6]aborted (disconnected)

Related

Parallelizing the process

I want to parallelize the spec which is generated by _spectrum_generator. I am using futures.ThreadPoolExecutor which is called in _gather_lcms_data. The spec is passed through function. The file is in .mzML format. Below is the output that i get which is empty.
(base) ashish#14-ce3xxx:~/GNPS_LCMSDashboard$ python3 lcms_map.py
Empty DataFrame
Columns: [mz, rt, i, scan, index, polarity]
Index: []
The output should be look like this:
(base) ashish#14-ce3xxx:/media/ashish/ubuntu7/GNPS_LCMSDashboard$ python3 lcms_map.py
mz rt i scan index polarity
0 169.038696 0.003722 1652.959961 1 1 1
1 177.969086 0.003722 1786.755127 1 1 1
2 194.156967 0.003722 1802.361450 1 1 1
3 154.059418 0.003722 1840.889160 1 1 1
4 164.080978 0.003722 1973.758423 1 1 1
5 150.079514 0.003722 1976.528687 1 1 1
6 160.096634 0.003722 2057.728516 1 1 1
7 201.182205 0.003722 2077.768311 1 1 1
8 162.078735 0.003722 2101.843018 1 1 1
9 171.044205 0.003722 2223.230713 1 1 1
Below is the code of _spectrum_generator:
def _spectrum_generator(filename, min_rt, max_rt):
run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
# Don't do this if the min_rt and max_rt are not reasonable values
if min_rt <= 0 and max_rt > 1000:
for spec in run:
yield spec
else:
try:
min_rt_index = _find_lcms_rt(run, min_rt) # These are inclusive on left
max_rt_index = _find_lcms_rt(run, max_rt) + 1 # Exclusive on the right
for spec_index in tqdm(range(min_rt_index, max_rt_index)):
spec = run[spec_index]
yield spec
print("USED INDEX")
except:
run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
for spec in run:
yield spec
print("USED BRUTEFORCE")
Below is code of lcms_map.py:
import os
import pymzml
import numpy as np
import datashader as ds
from tqdm import tqdm
import json
import pandas as pd
import xarray
import time
import utils
import plotly.express as px
import plotly.graph_objects as go
from utils import _spectrum_generator
from utils import _get_scan_polarity
from multiprocessing import Pool
import concurrent.futures
from multiprocessing import Process
# Enum for polarity
POLARITY_POS = 1
POLARITY_NEG = 2
def _gather_lcms_data(filename, min_rt, max_rt, min_mz, max_mz, polarity_filter="None", top_spectrum_peaks=100, include_polarity=False):
all_mz = []
all_rt = []
all_polarity = []
all_i = []
all_scan = []
all_index = []
spectrum_index = 0
number_spectra = 0
all_msn_mz = []
all_msn_rt = []
all_msn_polarity = []
all_msn_scan = []
all_msn_level = []
#fun(filename, min_rt, max_rt)
for spec in _spectrum_generator(filename, min_rt, max_rt):
rt = spec.scan_time_in_minutes()
try:
# Still waiting for the window
if rt < min_rt:
continue
# pass
# We've passed the window
if rt > max_rt:
break
except:
pass
if polarity_filter == "None":
pass
else:
scan_polarity = _get_scan_polarity(spec)
if polarity_filter != scan_polarity:
continue
if spec.ms_level == 1:
spectrum_index += 1
number_spectra += 1
try:
# Filtering peaks by mz
if min_mz <= 0 and max_mz >= 2000:
peaks = spec.peaks("raw")
else:
peaks = spec.reduce(mz_range=(min_mz, max_mz))
# Filtering out zero rows
peaks = peaks[~np.any(peaks < 1.0, axis=1)]
# Sorting by intensity
peaks = peaks[peaks[:,1].argsort()]
peaks = peaks[-1 * top_spectrum_peaks:]
mz, intensity = zip(*peaks)
all_mz += list(mz)
all_i += list(intensity)
all_rt += len(mz) * [rt]
all_scan += len(mz) * [spec.ID]
all_index += len(mz) * [number_spectra]
# Adding polarity
if include_polarity is True:
scan_polarity = _get_scan_polarity(spec)
if scan_polarity == "Positive":
all_polarity += len(mz) * [POLARITY_POS]
else:
all_polarity += len(mz) * [POLARITY_NEG]
except:
pass
elif spec.ms_level > 1:
try:
msn_mz = spec.selected_precursors[0]["mz"]
if msn_mz < min_mz or msn_mz > max_mz:
continue
all_msn_mz.append(msn_mz)
all_msn_rt.append(rt)
all_msn_scan.append(spec.ID)
all_msn_level.append(spec.ms_level)
# Adding polarity
if include_polarity is True:
scan_polarity = _get_scan_polarity(spec)
if scan_polarity == "Positive":
all_msn_polarity.append(POLARITY_POS)
else:
all_msn_polarity.append(POLARITY_NEG)
except:
pass
ms1_results = {}
ms1_results["mz"] = all_mz
ms1_results["rt"] = all_rt
ms1_results["i"] = all_i
ms1_results["scan"] = all_scan
ms1_results["index"] = all_index
msn_results = {}
msn_results["precursor_mz"] = all_msn_mz
msn_results["rt"] = all_msn_rt
msn_results["scan"] = all_msn_scan
msn_results["level"] = all_msn_level
# Adding polarity
if include_polarity is True:
ms1_results["polarity"] = all_polarity
msn_results["polarity"] = all_msn_polarity
ms1_results = pd.DataFrame(ms1_results)
msn_results = pd.DataFrame(msn_results)
return ms1_results, number_spectra, msn_results
def _get_feather_filenames(filename):
output_ms1_filename = filename + ".ms1.feather"
output_msn_filename = filename + ".msn.feather"
return output_ms1_filename, output_msn_filename
# These are caching layers for fast loading
def _save_lcms_data_feather(filename):
output_ms1_filename, output_msn_filename = _get_feather_filenames(filename)
start=time.time()
# with Pool(5) as p:
# #ms1_results, number_spectra, msn_results = p.starmap(_gather_lcms_data, (filename, 0, 1000000, 0, 10000, "None", 100000, True))
# ms1_results, number_spectra, msn_results = _gather_lcms_data(filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
# with concurrent.futures.ProcessPoolExecutor(max_workers=100) as executor:
# f=executor.submit(_gather_lcms_data, filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
# ms1_results, number_spectra, msn_results = f.result()
ms1_results, number_spectra, msn_results = _gather_lcms_data(filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
print(ms1_results.head(10))
print("Gathered data in", time.time() - start)
ms1_results = ms1_results.sort_values(by='i', ascending=False).reset_index()
ms1_results.to_feather(output_ms1_filename)
msn_results.to_feather(output_msn_filename)
_save_lcms_data_feather("/media/ashish/ubuntu7/GNPS_LCMSDashboard/QC_0.mzML")
How do i get the desired output by parallelizing. Suggest the changes that i need make in order to make it work.
As Simon Lundberg pointed out, you posted very complicated code, which makes it difficult to parallelize and even more difficult to explain how it is to be done. But if you were to present a simplified version of your code that was readily parallelizable, any answer would not be dealing with the complexities of your actual current code and would therefore be of little help. So I will try to create code that is an abstraction of your code's structure and then show how I would parallelize that. I am afraid that since you are not that familiar with multiprocessing, this may be rather difficult for you to follow.
First, a few observations about your code:
_gather_lcms_data currently is passed a filename and then using a generator function, _spectrum_generator, loops on all of its elements, called variable spec. In each loop iteration variable results are appended to various lists and variable number_spectra is optionally incremented. You have another variable spectrum_index that is also optionally incremented but its value is not otherwise used and could be eliminated. Finally, these lists are added to various dictionaries.
To parallelize the _gather_lcms_data function, it needs to process a single element, spec from the _spectrum_generator function so that we can run multiple invocations of this function in parallel. Consequently it needs to return a tuple of elements back to the main process which will do the necessary appending to lists and then create the dictionaries.
In your current code for each iteration of spec you optionally increment number_spectra and optionally append elements to various lists. Since we are now going to be parallelizing this function by returning individual elements, the main process must (1) accumulate the returned number_spectra value and optionally append the returned elements to result lists. Where in the original code you were not appending an element to a list for a given iteration, in the parallelized code you must return a None value so that the main process knows that for that iteration nothing needs to be appended.
In this abstraction, I have also reduced the number of lists down to two and I am generating dummy results.
First an abstraction of your current code.
def _spectrum_generator(filename, min_rt, max_rt):
#run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
run = [1, 2, 3, 4, 5, 6]
for spec in run:
yield spec
def _gather_lcms_data(filename, min_rt, max_rt, min_mz, max_mz, polarity_filter="None", top_spectrum_peaks=100, include_polarity=False):
# Remainder of the list declarations omitted for simplicity
all_mz = []
all_msn_mz= []
number_spectra = 0
for spec in _spectrum_generator(filename, min_rt, max_rt):
... # Code omitted for simplicity
number_spectra += 1 # Conditionally done
msn_mz = spec # Conditionally done
all_msn_mz.append(msn_mz)
mz = (spec * spec,) # Conditionally done
all_mz += list(mz)
...
ms1_results = {}
msn_results = {}
...
ms1_results["mz"] = all_mz
msn_results["precursor_mz"] = all_msn_mz
...
# Return
return ms1_results, number_spectra, msn_results
def _save_lcms_data_feather(filename):
ms1_results, number_spectra, msn_results = _gather_lcms_data(filename, 0, 1000000, 0, 10000, polarity_filter="None", top_spectrum_peaks=100000, include_polarity=True)
print(ms1_results)
print(number_spectra)
print(msn_results)
if __name__ == '__main__':
_save_lcms_data_feather("/media/ashish/ubuntu7/GNPS_LCMSDashboard/QC_0.mzML")
Prints:
{'mz': [1, 4, 9, 16, 25, 36]}
6
{'precursor_mz': [1, 2, 3, 4, 5, 6]}
This is the parallelized version of the above code:
def _spectrum_generator(filename, min_rt, max_rt):
#run = pymzml.run.Reader(filename, MS_precisions=MS_precisions)
run = [1, 2, 3, 4, 5, 6]
for spec in run:
yield spec
def _gather_lcms_data(spec, min_rt, max_rt, min_mz, max_mz, polarity_filter="None", top_spectrum_peaks=100, include_polarity=False):
# Remainder of the list declarations omitted for simplicity
number_spectra = 0
... # Code omitted for simplicity
number_spectra += 1
msn_mz = spec # Conditionally done. If not done then set msn_mz to None
mz = list((spec * spec,)) # Conditionally done. If not done then set mz to None
...
return mz, number_spectra, msn_mz
def _save_lcms_data_feather(filename):
from multiprocessing import Pool
from functools import partial
min_rt = 0
max_rt = 1000000
worker_function = partial(_gather_lcms_data, min_rt=min_rt, max_rt=max_rt, min_mz=0, max_mz=10000, polarity_filter="None", top_spectrum_peaks=1000000, include_polarity=True)
with Pool() as pool:
all_mz = []
all_msn_mz = []
number_spectra = 0
for mz, _number_spectra, msn_mz in pool.map(worker_function, _spectrum_generator(filename, min_rt, max_rt)):
if mz is not None:
all_mz += mz
number_spectra += _number_spectra
if msn_mz is not None:
all_msn_mz.append(msn_mz)
ms1_results = {}
msn_results = {}
ms1_results["mz"] = all_mz
msn_results["precursor_mz"] = all_msn_mz
print(ms1_results)
print(number_spectra)
print(msn_results)
if __name__ == '__main__':
_save_lcms_data_feather("/media/ashish/ubuntu7/GNPS_LCMSDashboard/QC_0.mzML")

How to make this task improve cpu usage?

I try to hash many file, but it not use full of cpu power. it only consume 25%. i test to move the heavy process into thread. but still no different. im from nodejs use sharp library. with same task. it consume all cpu usage. How python to make it full power?
import cv2
import math
import datetime
import hashlib
import threading
def thread_function(image, yPos, xPos, wSizeBlock, hSizeBlock):
block = image[yPos:yPos+wSizeBlock, xPos:xPos+hSizeBlock]
hash = hashlib.sha256()
hash.update(block.tobytes())
print(hash.hexdigest())
image = cv2.imread('frame323.jpg', cv2.IMREAD_COLOR)
dimension = {
'width': image.shape[1],
'height': image.shape[0]
}
wSizeBlock = int(16)
hSizeBlock = int(16)
wBlockLength = math.floor(dimension['width'] / wSizeBlock)
hBlockLength = math.floor(dimension['height'] / hSizeBlock)
count = 0
start_time = datetime.datetime.now()
print(start_time)
for k in range(0, 500):
for i in range(0, wBlockLength):
for j in range(0, hBlockLength):
xPos = int(i*wSizeBlock)
yPos = int(j*hSizeBlock)
x = threading.Thread(target=thread_function, args=(image, xPos, yPos, wSizeBlock, hSizeBlock))
x.start()
count += 1
count = 0
end_time = datetime.datetime.now()
print(end_time)
For CPU intensive operations that can be split up into smaller tasks, you would want to use the multiprocessing module. It is similar to the threading module in that it allows multiple functions to be ran at once. Syntax looks something like this:
import multiprocessing as mp
def add(a, b):
return a + b
p = mp.Process(target=add, args=(1, 2))
p.start()

Missing result when using threadid parallel computing python

I'm running code on a server and splitting across the 4 cores I have. The results are as expected when I have 3 parameter values but when I have 4, it only gives 3 results, code is like:
parameter_values = [0.2,0.3,0.4,0.5]
def runsoncores(threadid):
np.random.seed(threadid)
tf.random.set_seed(threadid)
parameter_for_sim = parameter_values[threadid]
# **runs simulation here with the parameter value**
filename = 'results_' + str(parameter_for_sim) + '.npy'
np.save(filename,samples)
def parallel_run(threadid, gpu):
with tf.name_scope(gpu):
with tf.device(gpu):
runoncores(threadid)
return
gpu_list = tf.config.experimental.list_logical_devices('GPU')
num_threads = len(gpu_list)
print(num_threads)
threads = list()
start = time.time()
for index in range(num_threads):
x = threading.Thread(target=parallel_run, args=(index,gpu_list[index].name))
threads.append(x)
x.start()
for index, thread in enumerate(threads):
thread.join()
end = time.time()
print('Threaded time taken: ', end-start)
What is happening and how can I get the same number of results as input values? Thanks!

How to efficiently perform row-wise operations using pandas?

I want to get some basic statistics from some csv files without loading the whole file in memory. I do it in two ways, one seemingly "smart" way using pandas and another casual way using csv I expect the pandas way to be faster but the csv way is actually faster by a very large margin. I was wondering why.
Here is my code:
import pandas as pd
import csv
movies = pd.read_csv('movies.csv') # movieId,title,genres
movie_count = movies.shape[0] # 9742
movieId_min = ratings.movieId.min()
movieId_max = ratings.movieId.max()
movieId_disperse = movies.movieId.sort_values().to_dict()
movieId_squeeze = {v: k for k, v in movieId_disperse.items()}
def get_ratings_stats():
gp_by_user = []
gp_by_movie = [0] * movie_count
top_rator = (0, 0) # (idx, value)
top_rated = (0, 0) # (idx, value)
rating_count = 0
user_count = 0
last_user = -1
for row in csv.DictReader(open('ratings.csv')):
user = int(row['userId'])-1
movie = movieId_squeeze[int(row['movieId'])]
if last_user != user:
last_user = user
user_count += 1
gp_by_user += [0]
rating_count += 1
gp_by_user[user] += 1
gp_by_movie[movie] += 1
top_rator = (user, gp_by_user[user]) if gp_by_user[user] > top_rator[1] else top_rator
top_rated = (movie, gp_by_movie[movie]) if gp_by_movie[movie] > top_rated[1] else top_rated
top_rator = (top_rator[0]+1, top_rator[1])
top_rated = (movieId_disperse[top_rated[0]], top_rated[1])
return rating_count, top_rator, top_rated
Now if I replace the line:
for row in csv.DictReader(open('ratings.csv')):
With:
for chunk in pd.read_csv('ratings.csv', chunksize=1000):
for _,row in chunk.iterrows():
The code actually becomes 10 times slower.
Here are the timing results:
> %timeit get_ratings_stats() # with csv
325 ms ± 9.98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
> %timeit get_ratings_stats() # with pandas
3.45 s ± 67.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Any comments as to how I can make this code better/faster/more readable would be much appreciated
I think the point is that you shouldn't use pandas if you're going to then treat the big, expensive data structure like a dict. The question shouldn't be how to get pandas to be better at that, it should be how to write your code with pandas to do what you want.
import pandas as pd
def get_ratings_stats():
movie_rating_data = pd.read_csv('ratings.csv')
# Get the movie with the best rating
top_movie = movie_rating_data.loc[:, ['movieId', 'rating']].groupby('movieId').agg('max').sort_values(by='rating', ascending=False).iloc[:, 0]
# Get the user with the best rating
top_user = movie_rating_data.loc[:, ['userId', 'rating']].groupby('userId').agg('max').sort_values(by='rating', ascending=False).iloc[:, 0]
return movie_rating_data.shape[0], top_movie, top_user
def get_ratings_stats_slowly():
movies = pd.DataFrame(columns = ["movieId", "ratings"])
users = pd.DataFrame(users = ["userId", "ratings"])
data_size = 0
for chunk in pd.read_csv('ratings.csv', chunksize=1000):
movies = movies.append(chunk.loc[:, ['movieId', 'rating']].groupby('movieId').agg('max'))
users = users.append(chunk.loc[:, ['userId', 'rating']].groupby('userId').agg('max'))
data_size += chunk.shape[0]
top_movie = movies.loc[:, ['movieId', 'rating']].groupby('movieId').agg('max').sort_values(by='rating', ascending=False).iloc[:, 0]
top_user = users.loc[:, ['userId', 'rating']].groupby('userId').agg('max').sort_values(by='rating', ascending=False).iloc[:, 0]
return data_size, top_movie, top_user
I'm not really sure that this is what you want to do overall, but your code is incomprehensible - this should be a good place to start (you could replace .agg('max') with .count() if you're interested in the number of ratings, etc).
I think parallel processing is the answer for your question. I've tried doing some parallel processing on your problem but I had to split the ratings file into multiple files for processing.
What I did initially was to duplicate the ratings data from the CSV files by a factor of 10, and then I executed your script to have an initial execution time, which for me was about 3.6 seconds. Now, by splitting the files into multiple ones, that can be addressed by multiple child processes, and for example by using my script with -k 2 (basically 2 workers), the total execution time reduced to 1.87 seconds. If I use -k 4 (4 workers) the execution time will be 1.13 seconds.
I am not sure if it is possible to read the CSV in chunks and basically do a parallel seek reading from the CSV, from a single big file, but that would make it a lot faster, the only drawback being the need to do an initial count of the rows in the big CSV file, to know how many rows will go per worker.
The splitting script:
import csv
file_path = "data/ratings.csv"
out_path = "data/big_ratings_{}.csv"
out_csv = None
for i in range(10):
print("Iteration #{}".format(i+1))
pin = open(file_path, "r")
pout = open(out_path.format(i), "w")
in_csv = csv.DictReader(pin)
out_csv = csv.DictWriter(pout, fieldnames=in_csv.fieldnames)
out_csv.writeheader()
for row in in_csv:
out_csv.writerow(row)
pin.close()
pout.close()
The actual rating processing script
import time
import csv
import argparse
import os
import sys
from multiprocessing import Process, Queue, Value
import pandas as pd
top_rator_queue = Queue()
top_rated_queue = Queue()
DEFAULT_NO_OF_WORKERS = 1
RATINGS_FILE_PATH = "data/big_ratings_{}.csv"
NUMBER_OF_FILES = 10
class ProcessRatings(Process):
def __init__(self, file_index_range, top_rator_queue, top_rated_queue, movie_id_squeeze):
super(ProcessRatings, self).__init__()
self.file_index_range = file_index_range
self.top_rator_queue = top_rator_queue
self.top_rated_queue = top_rated_queue
self.movie_id_squeeze = movie_id_squeeze
def run(self):
for file_index in self.file_index_range:
print("[PID: {}] Processing file index {} .".format(os.getpid(), file_index))
start = time.time()
gp_by_user = []
gp_by_movie = [0] * movie_count
top_rator = (0, 0) # (idx, value)
top_rated = (0, 0) # (idx, value)
rating_count = 0
user_count = 0
last_user = -1
for row in csv.DictReader(open(RATINGS_FILE_PATH.format(file_index))):
user = int(row['userId'])-1
movie = self.movie_id_squeeze[int(row['movieId'])]
if last_user != user:
last_user = user
user_count += 1
gp_by_user += [0]
gp_by_user[user] += 1
gp_by_movie[movie] += 1
top_rator = (user, gp_by_user[user]) if gp_by_user[user] > top_rator[1] else top_rator
top_rated = (movie, gp_by_movie[movie]) if gp_by_movie[movie] > top_rated[1] else top_rated
end = time.time()
print("[PID: {}] Processing time for file index {} : {}s!".format(os.getpid(), file_index, end-start))
print("[PID: {}] WORKER DONE!".format(os.getpid()))
if __name__ == "__main__":
print("Processing ratings in multiple worker processes.")
start = time.time()
# script arguments handling
parser = argparse.ArgumentParser()
parser.add_argument("-k", dest="workers", action="store")
args_space = parser.parse_args()
# determine the number of workers
number_of_workers = DEFAULT_NO_OF_WORKERS
if args_space.workers:
number_of_workers = int(args_space.workers)
else:
print("Number of workers not specified. Assuming: {}".format(number_of_workers))
# rating data
rating_count = 0
movies = pd.read_csv('data/movies.csv') # movieId,title,genres
movie_count = movies.shape[0] # 9742
movieId_min = movies.movieId.min()
movieId_max = movies.movieId.max()
movieId_disperse = movies.movieId.sort_values().to_dict()
movieId_squeeze = {v: k for k, v in movieId_disperse.items()}
# process data
processes = []
# initialize the worker processes
number_of_files_per_worker = NUMBER_OF_FILES // number_of_workers
for i in range(number_of_workers):
p = ProcessRatings(
range(i, i+number_of_files_per_worker), # file index
top_rator_queue,
top_rated_queue,
movieId_squeeze
)
p.start()
processes.append(p)
print("MAIN: Wait for processes to finish ...")
# wait until all processes are done
while True:
# determine if the processes are still running
if not any(p.is_alive() for p in processes):
break
# gather the data and do a final processing
end = time.time()
print("Processing time: {}s".format(end - start))
print("Rating count: {}".format(rating_count))

Python sklearn and multiprocessing

I'm trying to parallelise training of classifiers from sklearn (gaussian mixture modell in this case) using multiprocessing and I get a lot worse classifiers in comparison with running them sequentially. Additionally each time after training the results are different as if the code was not thread safe. Can anyone explain me what is going on? Here is the code and at the end the thread function:
nrProc = 8
semaphore = Semaphore(nrProc)
m = Manager()
models = m.list()
modelsOut = m.list()
processes = []
cnt = 0
for event_label in data_positive:
models.append(mixture.GMM(**classifier_params))
models.append(mixture.GMM(**classifier_params))
for event_label in data_positive:
if classifier_method == 'gmm':
processes.append(Process(target=trainProcess, args=(models[cnt], data_positive[event_label], semaphore, modelsOut)))
cnt = cnt + 1
processes.append(Process(target=trainProcess, args=(models[cnt], data_negative[event_label], semaphore, modelsOut)))
cnt = cnt + 1
else:
raise ValueError("Unknown classifier method ["+classifier_method+"]")
for proc in processes:
proc.start()
for proc in processes:
proc.join()
cnt = 0
for event_label in data_positive:
model_container['models'][event_label] = {}
model_container['models'][event_label]['positive'] = modelsOut[cnt]
cnt = cnt + 1
model_container['models'][event_label]['negative'] = modelsOut[cnt]
cnt = cnt + 1
def trainProcess(model, data, semaphore, modelsOut):
semaphore.acquire()
modelsOut.append(model.fit(data))
semaphore.release()
return 0
So the solution is to use clone function from sklearn which does a deep copy of the estimator.

Categories