I'm working "for fun" on a dash app that compares execution time of different algorithms.
Here, I want to compare list sorting algorithms, and in this particular case, I'll be talking about the recursive merging algo.
The app works as follows: generate a set of random lists with chosen lengths, chose an algo, run the tests. Using concurrent.futures.ProcessPoolExecutor, all lists are sorted at the same time in a different process. I've tried with up to 150 lists and it works fine.
Then, I wanted the merging algo to use 2 processes if the list is bigger than 1024 element, which is also working fine... until I have more than 4 lists. And if I run the test with like 100 lists of length under 1024 and add only one list of a length that will make it use 2 processes to be sorted, it won't work and will return a memory error.
Also, I know about the multiprocessing.cpu_count() function but that doesn't seem to be of any use here? It says I have 12 cpu, but will run 150 processes at the same time and crash for 5 * 2 sub-processes.
So, could anyone explain this to me? I have 32Go of ram and we're talking about 5 lists of 2000 int so...
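For clarity, here is how I assume I'd use cpu_count() to cap the pool (untested sketch; sort_one_list and lists are placeholders for my real code):

import multiprocessing
import concurrent.futures

# Cap the number of worker processes at the machine's core count (12 here).
with concurrent.futures.ProcessPoolExecutor(
        max_workers=multiprocessing.cpu_count()) as executor:
    futures = [executor.submit(sort_one_list, lst) for lst in lists]
    results = [f.result() for f in futures]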
Edit: Adding the code I used
Each generated list is stored as a Data object:
import time

class Data:
    def __init__(self, data):
        self.datas = data
        self.sorted_datas = None
        self.insert_sort_time = None
        self.merge_sort_time = None
        self.mt_merge_sort_time = None
        self.heapify_sort_time = None

    def __lt__(self, other):
        # Data objects are ordered by list length (used by the heap in DataSet)
        return len(self.datas) < len(other.datas)

    def __repr__(self):
        return str(self.datas)

    def _sort_by_merging(self):
        # returns the elapsed time in seconds
        a = time.time()
        self.sorted_datas = [item for item in self.datas]
        self.sorted_datas = mergeSort(self.sorted_datas)
        b = time.time() - a
        return b

    def _sort_by_multiprocMerging(self):
        a = time.time()
        self.sorted_datas = [item for item in self.datas]
        self.sorted_datas = multiprocMerging(self.sorted_datas)
        b = time.time() - a
        return b

    ...
And all the Data objects are stored in a DataSet object:
import heapq
import concurrent.futures

class DataSet:
    def __init__(self):
        self.raw_datas = []
        self._datas = []

    def add(self, new_data):
        heapq.heappush(self.raw_datas, new_data)

    def sort(self):
        self._datas = [heapq.heappop(self.raw_datas) for _ in range(len(self.raw_datas))]
        self.raw_datas = self._datas

    def run_tests(self, *algos):
        if 'merge' in algos:
            self.merge_sort_time = 0
            self.merge_datas = []
            # submit one sorting task per list
            with concurrent.futures.ProcessPoolExecutor() as executor:
                results = [executor.submit(datas._sort_by_merging) for datas in self.raw_datas]
                i = 0
                for result in concurrent.futures.as_completed(results):
                    self.merge_datas.append((len(self.raw_datas[i].datas), result.result()))
                    self.raw_datas[i].merge_sort_time = self.merge_datas[i][1] * 1000
                    self.merge_sort_time += self.raw_datas[i].merge_sort_time
                    i += 1
        if 'mp_merge' in algos:
            self.mt_merge_sort_time = 0
            self.mt_merge_datas = []
            with concurrent.futures.ProcessPoolExecutor() as executor:
                results = [executor.submit(datas._sort_by_multiprocMerging) for datas in self.raw_datas]
                i = 0
                for result in concurrent.futures.as_completed(results):
                    self.mt_merge_datas.append((len(self.raw_datas[i].datas), result.result()))
                    self.raw_datas[i].mt_merge_sort_time = self.mt_merge_datas[i][1] * 1000
                    self.mt_merge_sort_time += self.raw_datas[i].mt_merge_sort_time
                    i += 1
        ...
And here are the sorting algorithms:
import concurrent.futures

def mergeSort(my_list):
    if len(my_list) > 1:
        mid = len(my_list) // 2
        left = my_list[:mid]
        right = my_list[mid:]
        mergeSort(left)   # sorts the halves in place
        mergeSort(right)
        i = 0
        j = 0
        k = 0
        # merge the two sorted halves back into my_list
        while i < len(left) and j < len(right):
            if left[i] < right[j]:
                my_list[k] = left[i]
                i += 1
            else:
                my_list[k] = right[j]
                j += 1
            k += 1
        while i < len(left):
            my_list[k] = left[i]
            i += 1
            k += 1
        while j < len(right):
            my_list[k] = right[j]
            j += 1
            k += 1
    return my_list
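# Quick sanity check for mergeSort (expected REPL behavior):
#   >>> mergeSort([5, 2, 8, 1])
#   [1, 2, 5, 8]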
def multiprocMerging(my_list):
    if len(my_list) > 1024:
        mid = len(my_list) // 2
        left = my_list[:mid]
        right = my_list[mid:]
        # sort the two halves in two separate processes
        with concurrent.futures.ProcessPoolExecutor() as executor:
            results = executor.map(mergeSort, [left, right])
            sides = [result for result in results]
        i = 0
        j = 0
        k = 0
        # merge the two sorted halves back into my_list
        while i < len(left) and j < len(right):
            if sides[0][i] < sides[1][j]:
                my_list[k] = sides[0][i]
                i += 1
            else:
                my_list[k] = sides[1][j]
                j += 1
            k += 1
        while i < len(sides[0]):
            my_list[k] = sides[0][i]
            i += 1
            k += 1
        while j < len(sides[1]):
            my_list[k] = sides[1][j]
            j += 1
            k += 1
    else:
        mergeSort(my_list)
    return my_list
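For what it's worth, I assume I could also cap the inner pool to the two halves, something like this (untested sketch, not what the app currently runs):

        # only two tasks, so spawn at most two extra worker processes
        with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
            results = executor.map(mergeSort, [left, right])

but I'd still like to understand why the uncapped version blows up.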
Edit 2: here is the console log when the app crashes. Many spawned processes die at the same time, so their tracebacks come out interleaved; untangled, they boil down to a few distinct failures, all raised while a freshly spawned process is still starting up.

Some workers hit a MemoryError while importing the app's modules:

Process SpawnProcess-186:5:
Traceback (most recent call last):
  File "D:\Python\lib\multiprocessing\process.py", line 315, in _bootstrap
    self.run()
  File "D:\Python\lib\multiprocessing\process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "D:\Python\lib\concurrent\futures\process.py", line 233, in _process_worker
    call_item = call_queue.get(block=True)
  File "D:\Python\lib\multiprocessing\queues.py", line 116, in get
    return _ForkingPickler.loads(res)
  File "D:\Projets\Python\Algorithmic\AlgoWebSite\DashApps\algos\Dunod\list_sorting.py", line 2, in <module>
    import numpy as np
  File "D:\Projets\Python\Algorithmic\venv\lib\site-packages\numpy\__init__.py", line 140, in <module>
    from . import core
  File "D:\Projets\Python\Algorithmic\venv\lib\site-packages\numpy\core\__init__.py", line 72, in <module>
    from . import numeric
  File "<frozen importlib._bootstrap>", line 991, in _find_and_load
  File "<frozen importlib._bootstrap>", line 975, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 671, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 779, in exec_module
  File "<frozen importlib._bootstrap_external>", line 911, in get_code
  File "<frozen importlib._bootstrap_external>", line 580, in _compile_bytecode
MemoryError

Others hit it while the spawn machinery re-compiles the main module:

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "D:\Python\lib\multiprocessing\spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "D:\Python\lib\multiprocessing\spawn.py", line 125, in _main
    prepare(preparation_data)
  File "D:\Python\lib\multiprocessing\spawn.py", line 236, in prepare
    _fixup_main_from_path(data['init_main_from_path'])
  File "D:\Python\lib\multiprocessing\spawn.py", line 287, in _fixup_main_from_path
    main_content = runpy.run_path(main_path,
  File "D:\Python\lib\runpy.py", line 264, in run_path
    code, fname = _get_code_from_file(run_name, path_name)
  File "D:\Python\lib\runpy.py", line 239, in _get_code_from_file
    code = compile(f.read(), fname, 'exec')
MemoryError

Others fail to load the _multiprocessing extension module:

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "D:\Python\lib\multiprocessing\spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "D:\Python\lib\multiprocessing\spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
  File "D:\Python\lib\concurrent\futures\process.py", line 54, in <module>
    import multiprocessing.connection
  File "D:\Python\lib\multiprocessing\connection.py", line 21, in <module>
    import _multiprocessing
ImportError: DLL load failed while importing _multiprocessing: Le fichier de pagination est insuffisant pour terminer cette opération.

(The French message means "The paging file is too small for this operation to complete.")

A few more die mid-import with:

  File "D:\Python\lib\sre_compile.py", line 291, in _optimize_charset
    charmap[k] = 1
IndexError: bytearray index out of range

And finally the Django/Dash side reports:
Internal Server Error: /django_plotly_dash/app/lists/_dash-update-component
concurrent.futures.process._RemoteTraceback:
"""
Traceback (most recent call last):
File "D:\Python\lib\concurrent\futures\process.py", line 239, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "D:\Projets\Python\Algorithmic\AlgoWebSite\DashApps\toolbox\list_datas.py", line 47, in _sort_by_threadmerging
self.sorted_datas = multiThreadMerging(self.sorted_datas)
File "D:\Projets\Python\Algorithmic\AlgoWebSite\DashApps\algos\Dunod\list_sorting.py", line 103, in multiThreadMerging
sides = [result for result in results]
File "D:\Projets\Python\Algorithmic\AlgoWebSite\DashApps\algos\Dunod\list_sorting.py", line 103, in <listcomp>
sides = [result for result in results]
File "D:\Python\lib\concurrent\futures\process.py", line 484, in _chain_from_iterable_of_lists
for element in iterable:
File "D:\Python\lib\concurrent\futures\_base.py", line 611, in result_iterator
yield fs.pop().result()
File "D:\Python\lib\concurrent\futures\_base.py", line 432, in result
return self.__get_result()
File "D:\Python\lib\concurrent\futures\_base.py", line 388, in __get_result
raise self._exception
concurrent.futures.process.BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "D:\Projets\Python\Algorithmic\venv\lib\site-packages\django\core\handlers\exception.py", line 47, in inner
response = get_response(request)
File "D:\Projets\Python\Algorithmic\venv\lib\site-packages\django\core\handlers\base.py", line 179, in _get_response
response = wrapped_callback(request, *callback_args, **callback_kwargs)
File "D:\Projets\Python\Algorithmic\venv\lib\site-packages\django\views\decorators\csrf.py", line 54, in wrapped_view
return view_func(*args, **kwargs)
File "D:\Projets\Python\Algorithmic\venv\lib\site-packages\django_plotly_dash\views.py", line 74, in update
return _update(request, ident, stateless, **kwargs)
File "D:\Projets\Python\Algorithmic\venv\lib\site-packages\django_plotly_dash\views.py", line 93, in _update
resp = view_func()
File "D:\Projets\Python\Algorithmic\venv\lib\site-packages\django_plotly_dash\dash_wrapper.py", line 560, in dispatch
return self.dispatch_with_args(body, argMap=dict())
File "D:\Projets\Python\Algorithmic\venv\lib\site-packages\django_plotly_dash\dash_wrapper.py", line 647, in dispatch_with_args
res = self.callback_map[target_id]['callback'](*args, **argMap)
File "D:\Projets\Python\Algorithmic\venv\lib\site-packages\dash\dash.py", line 985, in add_context
output_value = func(*args, **kwargs) # %% callback invoked %%
File "D:\Projets\Python\Algorithmic\AlgoWebSite\DashApps\Apps\lists.py", line 382, in mergeTest
data_set.run_tests('mt_merge')
File "D:\Projets\Python\Algorithmic\AlgoWebSite\DashApps\toolbox\list_datas.py", line 104, in run_tests
self.mt_merge_datas.append((len(self.raw_datas[i].datas), result.result()))
File "D:\Python\lib\concurrent\futures\_base.py", line 432, in result
return self.__get_result()
File "D:\Python\lib\concurrent\futures\_base.py", line 388, in __get_result
raise self._exception
concurrent.futures.process.BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.
HTTP POST /django_plotly_dash/app/lists/_dash-update-component 500 [1.74, 127.0.0.1:63195]
I am very new to Python and have been playing around with pandas DataFrames, but when I use a groupby I am no longer able to iterate over the resulting DataFrame using the column labels.
Can someone help me?
newDF = df[df['Currency'].str.contains(currency) & df['Description'].str.contains('fx')]
newDF = newDF.rename(index=str, columns={"Paid": "Withdrawn"})
moneyWithdrawnByUserDF = pd.DataFrame(newDF.groupby(['FirstName'])[['Withdrawn']].sum())
for index, row in moneyWithdrawnByUserDF.iterrows():
    print row['FirstName']
The output/error I got is below:

Index([u'Email', u'FirstName', u'LastName', u'Owed', u'Withdrawn', u'UserId',
       u'Category', u'Description', u'Id', u'Currency', u'Cost', u'Details',
       u'GroupId'],
      dtype='object')
Traceback (most recent call last):
  File "main.py", line 416, in <module>
    sys.exit(main(sys.argv[1:]))
  File "main.py", line 412, in main
    parseGroups()
  File "main.py", line 45, in parseGroups
    parseGroup(group)
  File "main.py", line 81, in parseGroup
    processCurrencies(df)
  File "main.py", line 95, in processCurrencies
    processCurrency(df, currency)
  File "main.py", line 105, in processCurrency
    moneyWithdrawnByUserDF = calculateMoneyWithdrawnByUser(df, currency)
  File "main.py", line 319, in calculateMoneyWithdrawnByUser
    print row['FirstName']
  File "/usr/local/lib/python2.7/site-packages/pandas/core/series.py", line 601, in __getitem__
    result = self.index.get_value(self, key)
  File "/usr/local/lib/python2.7/site-packages/pandas/core/indexes/base.py", line 2491, in get_value
    raise e1
KeyError: 'FirstName'
Thank you
I think you need to change:

moneyWithdrawnByUserDF = pd.DataFrame(newDF.groupby(['FirstName'])[['Withdrawn']].sum())

because after the groupby, FirstName becomes the index of the result rather than a column, which is why row['FirstName'] raises a KeyError. Either add reset_index to turn it back into a column:

moneyWithdrawnByUserDF = newDF.groupby(['FirstName'])['Withdrawn'].sum().reset_index()

Or pass as_index=False to groupby:

moneyWithdrawnByUserDF = newDF.groupby(['FirstName'], as_index=False)['Withdrawn'].sum()
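A minimal demo (made-up data, Python 3 print syntax) of why the KeyError happens and how as_index=False or reset_index fixes it:

import pandas as pd

# Made-up sample data, only for illustration
df = pd.DataFrame({'FirstName': ['Ana', 'Bob', 'Ana'],
                   'Withdrawn': [10.0, 20.0, 5.0]})

grouped = df.groupby(['FirstName'])[['Withdrawn']].sum()
print(list(grouped.columns))   # ['Withdrawn'] -- 'FirstName' is no longer a column
print(grouped.index.name)      # 'FirstName'   -- it moved into the index

flat = df.groupby(['FirstName'], as_index=False)['Withdrawn'].sum()
for _, row in flat.iterrows():
    print(row['FirstName'], row['Withdrawn'])   # works: 'FirstName' is a column again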