Update shared dictionary using mpire package - python

I am trying to update a shared dictionary synchronously using the mpire package in Python on a multi-core machine (i.e., parallel processing to update a dict). The environment is a Linux machine with 8 vCPUs and 16 GB of memory in Amazon SageMaker. Below is a sample/dummy code snippet I am using for this, but I am unable to make it work. I know I could probably use the Process or map methods from the multiprocessing package to accomplish this task; I am just checking whether there is any way to do it using the mpire package. Any help would be greatly appreciated. Thanks much!
def myFunc(shared_objects, id_val):
    indata, output = shared_objects
    # Temporary store for model output for an input ID
    temp: Dict[str, int] = dict()
    # Filter data for input ID and store output in temp variable
    indata2 = indata.loc[indata['ID']==id_val]
    temp = indata2.groupby(['M_CODE'])['VALUE'].sum().to_dict()
    # store the result .. I want this to happen synchronously
    output[id_val] = temp
#*******************************************************************
if __name__ == '__main__':
    import pandas as pd
    from typing import Dict
    from datetime import datetime
    from mpire import WorkerPool
    from multiprocessing import Manager

    # This is just sample data
    inputData = pd.DataFrame(dict({'ID':['A', 'B', 'A', 'C', 'A'],
                                   'M_CODE':['AKQ1', 'ALM3', 'BLC4', 'ALM4', 'BLC4'],
                                   'VALUE':[0.75, 1, 1.75, 0.67, 3], }))

    start_time = datetime.now()
    print(start_time, '>> Process started.')

    # Use a shared dict to store results from various workers
    manager = Manager()
    # dict on Manager has no lock at all!
    # https://stackoverflow.com/questions/2936626/how-to-share-a-dictionary-between-multiple-processes-in-python-without-locking
    output: Dict[str, Dict[str, int]] = manager.dict()
    shared_objects = inputData, output

    with WorkerPool(n_jobs=7, shared_objects=shared_objects) as pool:
        results = pool.map_unordered(myFunc, inputData['ID'].unique(), progress_bar=True)

    print(datetime.now(), '>> Process completed -> total time taken:', datetime.now()-start_time)
Below is the error I'm stuck with:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-10-df7d847398a1> in <module>
37
38 with WorkerPool(n_jobs=7, shared_objects=shared_objects) as pool:
---> 39 results = pool.map_unordered(myFunc, inputData['ID'].unique(), progress_bar=True)
40
41 print(datetime.now(), '>> Process completed -> total time taken:', datetime.now()-start_time)
/opt/conda/lib/python3.7/site-packages/mpire/pool.py in map_unordered(self, func, iterable_of_args, iterable_len, max_tasks_active, chunk_size, n_splits, worker_lifespan, progress_bar, progress_bar_position, enable_insights, worker_init, worker_exit, task_timeout, worker_init_timeout, worker_exit_timeout)
418 n_splits, worker_lifespan, progress_bar, progress_bar_position,
419 enable_insights, worker_init, worker_exit, task_timeout, worker_init_timeout,
--> 420 worker_exit_timeout))
421
422 def imap(self, func: Callable, iterable_of_args: Union[Sized, Iterable], iterable_len: Optional[int] = None,
/opt/conda/lib/python3.7/site-packages/mpire/pool.py in imap_unordered(self, func, iterable_of_args, iterable_len, max_tasks_active, chunk_size, n_splits, worker_lifespan, progress_bar, progress_bar_position, enable_insights, worker_init, worker_exit, task_timeout, worker_init_timeout, worker_exit_timeout)
664 # Terminate if exception has been thrown at this point
665 if self._worker_comms.exception_thrown():
--> 666 self._handle_exception(progress_bar_handler)
667
668 # All results are in: it's clean up time
/opt/conda/lib/python3.7/site-packages/mpire/pool.py in _handle_exception(self, progress_bar_handler)
729 # Raise
730 logger.debug("Re-raising obtained exception")
--> 731 raise err(traceback_str)
732
733 def stop_and_join(self, progress_bar_handler: Optional[ProgressBarHandler] = None,
ValueError:
Exception occurred in Worker-0 with the following arguments:
Arg 0: 'A'
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/mpire/worker.py", line 352, in _run_safely
results = func()
File "/opt/conda/lib/python3.7/site-packages/mpire/worker.py", line 288, in _func
_results = func(args)
File "/opt/conda/lib/python3.7/site-packages/mpire/worker.py", line 455, in _helper_func
return self._call_func(func, args)
File "/opt/conda/lib/python3.7/site-packages/mpire/worker.py", line 472, in _call_func
return func(args)
File "<ipython-input-10-df7d847398a1>", line 9, in myFunc
indata2 = indata.loc[indata['ID']==id_val]
File "/opt/conda/lib/python3.7/site-packages/pandas/core/ops/common.py", line 69, in new_method
return method(self, other)
File "/opt/conda/lib/python3.7/site-packages/pandas/core/arraylike.py", line 32, in __eq__
return self._cmp_method(other, operator.eq)
File "/opt/conda/lib/python3.7/site-packages/pandas/core/series.py", line 5502, in _cmp_method
res_values = ops.comparison_op(lvalues, rvalues, op)
File "/opt/conda/lib/python3.7/site-packages/pandas/core/ops/array_ops.py", line 262, in comparison_op
"Lengths must match to compare", lvalues.shape, rvalues.shape
ValueError: ('Lengths must match to compare', (5,), (1,))
[Update]: Here is the code I found to work fine using only the multiprocessing package.
def myFunc(id_val, output, indata):
    # Temporary store for model output for an input ID
    temp: Dict[str, int] = dict()
    # Filter data for input ID and store output in temp variable
    indata2 = indata.loc[indata['ID']==id_val]
    temp = indata2.groupby(['M_CODE'])['VALUE'].sum().to_dict()
    # store the result .. I want this to happen synchronously
    output[id_val] = temp
#*******************************************************************
if __name__ == '__main__':
    import pandas as pd
    from typing import Dict
    from itertools import repeat
    from multiprocessing import Manager
    from datetime import datetime

    # This is just sample data
    inputData = pd.DataFrame(dict({'ID':['A', 'B', 'A', 'C', 'A'],
                                   'M_CODE':['AKQ1', 'ALM3', 'BLC4', 'ALM4', 'BLC4'],
                                   'VALUE':[0.75, 1, 1.75, 0.67, 3], }))

    start_time = datetime.now()
    print(start_time, '>> Process started.')

    # Use a shared dict to store results from various workers
    with Manager() as manager:
        # dict on Manager has no lock at all!
        # https://stackoverflow.com/questions/2936626/how-to-share-a-dictionary-between-multiple-processes-in-python-without-locking
        output: Dict[str, Dict[str, int]] = manager.dict()
        # Start processes involving n workers
        # Set chunksize so tasks are distributed efficiently across workers and none stays idle for long
        with manager.Pool(processes=7, ) as pool:
            pool.starmap(myFunc,
                         zip(inputData['ID'].unique(), repeat(output), repeat(inputData)),
                         chunksize = max(inputData['ID'].nunique() // (7*4), 1))
        output = dict(output)

    print(datetime.now(), '>> Process completed -> total time taken:', datetime.now()-start_time)

UPDATE:
Now that I better understand the specific issue, I can say it lies in how mpire.WorkerPool.map_unordered's chunking procedure interacts with what the pandas .loc comparison expects. Specifically, myFunc receives id_val as a one-element numpy array such as array(['A'], dtype=object), as detailed in the chunking explanation and the source code. On the other side, indata['ID'] in the .loc expression is a pandas Series. One of these has to be changed for the comparison to work; based on what your code is trying to do, id_val can be reduced to just its scalar value, like:
id_val = id_val.item()
indata2 = indata.loc[indata['ID']==id_val]
Making the new myFunc (which on my machine gets your script to run):
def myFunc(shared_objects, id_val):
    indata, output = shared_objects
    # Keep just the scalar value of id_val
    id_val = id_val.item()
    # Temporary store for model output for an input ID
    temp: Dict[str, int] = dict()
    # Filter data for input ID and store output in temp variable
    indata2 = indata.loc[indata['ID']==id_val]
    temp = indata2.groupby(['M_CODE'])['VALUE'].sum().to_dict()
    # store the result .. I want this to happen synchronously
    output[id_val] = temp
The reason this isn't an issue in your multiprocessing-only solution is that zip iterates over inputData['ID'].unique() the way you expect: it yields each value on its own, not wrapped in an array object. Nice job finding an alternative solution, though!
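If you prefer to leave myFunc untouched, another option is to hand mpire a plain Python list instead of a numpy array, so each worker receives a scalar ID. This is a sketch based on my reading of mpire's special chunking of numpy arrays, so treat the behaviour as an assumption:

# Hypothetical variant: convert the numpy array of unique IDs to a plain list,
# so mpire passes each ID as a scalar rather than as a one-element array chunk.
with WorkerPool(n_jobs=7, shared_objects=shared_objects) as pool:
    results = pool.map_unordered(myFunc,
                                 inputData['ID'].unique().tolist(),
                                 progress_bar=True)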
The error is occurring in this line of the function:
indata2 = indata.loc[indata['ID']==id_val]
Per the main error:
File "/opt/conda/lib/python3.7/site-packages/pandas/core/ops/array_ops.py", line 262, in comparison_op
    "Lengths must match to compare", lvalues.shape, rvalues.shape
ValueError: ('Lengths must match to compare', (5,), (1,))
This is an element-wise equality comparison between Series(['A', 'B', 'A', 'C', 'A']).unique() and Series(['A', 'B', 'A', 'C', 'A']), which will never work unless there are no repeated values in 'ID'. I'm not sure what exactly you are trying to do with this statement, but it is certainly the cause of your error.
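For illustration, here is my own minimal reproduction based on the shapes (5,) and (1,) reported in the traceback: comparing a Series against anything other than a scalar or an equal-length sequence raises exactly this error.

import numpy as np
import pandas as pd

s = pd.Series(['A', 'B', 'A', 'C', 'A'])
print(s == 'A')                     # a scalar broadcasts: element-wise boolean Series
s == np.array(['A'], dtype=object)  # raises ValueError: ('Lengths must match to compare', (5,), (1,))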

Related

Issues publishing to device shadow using the aws-iot-device-sdk-python-v2

In a Python application that uses the AWS IoT Device SDK for Python v2 (v1.7.1) I am running into an issue where I cannot update the device shadow.
After starting the program, the DeviceShadowManager will attempt to get the latest shadow state and set it locally.
If a delta state is present, the DeviceShadowManager will merge the last reported state and the delta state and publish it.
That works. However, when the manager subscribes for updates after the initial setup, I run into an error:
when the desired state changes, the manager cannot update the reported state. Here is the error:
Exception ignored in: <class 'TypeError'>
Traceback (most recent call last):
File "/Users/tom/.../lib/python3.9/site-packages/awscrt/mqtt.py", line 506, in callback_wrapper
callback(topic=topic, payload=payload)
TypeError: callback_wrapper() missing 3 required positional arguments: 'dup', 'qos', and 'retain'
I looked at the source, but I just do not understand why a TypeError is raised,
especially because this exact scenario seems to be handled by the try/except block. Or am I getting it all wrong?
The source of the error:
if callback:
    def callback_wrapper(topic, payload, dup, qos, retain):
        try:
            callback(topic=topic, payload=payload, dup=dup, qos=QoS(qos), retain=retain)
        except TypeError:
            # This callback used to have fewer args.
            # Try again, passing only those those args, to cover case where
            # user function failed to take forward-compatibility **kwargs.
            callback(topic=topic, payload=payload)  # this is line 506
Below you can find my code and the log of the program.
This dataclass represents the shadow:
from dataclasses import dataclass

@dataclass
class DeviceShadow:
    score_threshold: float = 0.6
    minimum_distance: int = 150
The shadow is managed by the DeviceShadowManager. Most of this is based on the shadow sample from the aforementioned repository.
from dataclasses import asdict
from queue import Queue
from threading import Lock

from awscrt import mqtt
from awsiot import iotshadow
from awsiot.iotshadow import IotShadowClient

from app.device_shadow.device_shadow import DeviceShadow, from_json as device_shadow_from_json
from app.models import log

SHADOW_VALUE_DEFAULT = DeviceShadow()

class DeviceShadowManager:
    _shadow_client: IotShadowClient
    shadow_value: DeviceShadow = DeviceShadow()
    _lock = Lock()
    _thing_name: str

    def __init__(self, thing_name: str, mqtt_connection: mqtt.Connection):
        self._thing_name = thing_name
        self._shadow_client = iotshadow.IotShadowClient(mqtt_connection)

        update_accepted_subscribed_future, _ = self._shadow_client.subscribe_to_update_shadow_accepted(
            request=iotshadow.UpdateShadowSubscriptionRequest(thing_name=self._thing_name),
            qos=mqtt.QoS.AT_LEAST_ONCE,
            callback=self.on_update_shadow_accepted  # omitted
        )
        update_rejected_subscribed_future, _ = self._shadow_client.subscribe_to_update_shadow_rejected(
            request=iotshadow.UpdateShadowSubscriptionRequest(thing_name=self._thing_name),
            qos=mqtt.QoS.AT_LEAST_ONCE,
            callback=self.on_update_shadow_rejected  # omitted
        )

        # Wait for subscriptions to succeed
        update_accepted_subscribed_future.result(60)
        update_rejected_subscribed_future.result(60)

        log.info("Subscribing to Get responses...")
        get_accepted_subscribed_future, _ = self._shadow_client.subscribe_to_get_shadow_accepted(
            request=iotshadow.GetShadowSubscriptionRequest(thing_name=self._thing_name),
            qos=mqtt.QoS.AT_LEAST_ONCE,
            callback=self.on_get_shadow_accepted)
        get_rejected_subscribed_future, _ = self._shadow_client.subscribe_to_get_shadow_rejected(
            request=iotshadow.GetShadowSubscriptionRequest(thing_name=self._thing_name),
            qos=mqtt.QoS.AT_LEAST_ONCE,
            callback=self.on_get_shadow_rejected)  # omitted

        # Wait for subscriptions to succeed
        get_accepted_subscribed_future.result()
        get_rejected_subscribed_future.result()

        log.info("Subscribing to Delta events...")
        delta_subscribed_future, _ = self._shadow_client.subscribe_to_shadow_delta_updated_events(
            request=iotshadow.ShadowDeltaUpdatedSubscriptionRequest(
                thing_name=self._thing_name
            ),
            qos=mqtt.QoS.AT_LEAST_ONCE,
            callback=self.on_shadow_delta_updated)

        # Wait for subscription to succeed
        delta_subscribed_future.result()

        # From here on out the rest runs asynchronously.
        # Issue request for shadow's current value.
        # The response will be received by the on_get_accepted() callback
        with self._lock:
            publish_get_future = self._shadow_client.publish_get_shadow(
                request=iotshadow.GetShadowRequest(
                    thing_name=self._thing_name,
                ),
                qos=mqtt.QoS.AT_LEAST_ONCE
            )
            # Ensure that publish succeeds
            publish_get_future.result()

    def on_get_shadow_accepted(self, response: iotshadow.GetShadowResponse) -> None:
        log.info("Finished getting initial shadow value.")
        if response.state and response.state.delta:
            if not response.state.reported:
                response.state.reported = {}
            merged_state = self.merge_states(response.state.delta, response.state.desired)
            return self.set_desired(device_shadow_from_json(merged_state))
        if response.state and response.state.reported:
            return self.set_local(device_shadow_from_json(response.state.reported))
        self.set_desired(SHADOW_VALUE_DEFAULT)
        return

    def on_shadow_delta_updated(self, delta: iotshadow.ShadowDeltaUpdatedEvent) -> None:
        if delta.state:
            if delta.state is None:
                log.info("Delta reports that nothing is set. Setting defaults...")
                self.set_desired(SHADOW_VALUE_DEFAULT)
                return
            log.info("Delta reports that desired shadow is '{}'. Changing local shadow...".format(delta.state))
            self.set_desired(self.merge_states(delta.state, self.shadow_value))
        else:
            log.info("Delta did not report a change")

    @staticmethod
    def merge_states(delta: dict, reported: DeviceShadow):
        for key, value in delta.items():
            reported[key] = value
        return reported

    def set_local(self, value: DeviceShadow) -> None:
        with self._lock:
            self.shadow_value = value

    def set_desired(self, new_value: DeviceShadow) -> None:
        with self._lock:
            if self.shadow_value == new_value:
                log.debug("Local shadow is already '{}'.".format(new_value))
                return
            log.debug("Changing local shadow to '{}'.".format(new_value))
            self.shadow_value = new_value
            log.debug("Updating reported shadow to '{}'...".format(new_value))
            request = iotshadow.UpdateShadowRequest(
                thing_name=self._thing_name,
                state=iotshadow.ShadowState(
                    desired=asdict(new_value),
                    reported=asdict(new_value),
                ),
            )
            self._shadow_client.publish_update_shadow(request, mqtt.QoS.AT_LEAST_ONCE)
Below you will find the log:
DEBUG:app.mqtt:Connecting to xxxxxxxxxxxxxx-ats.iot.eu-central-1.amazonaws.com with client ID '80d8bc54-971e-0e65-a537-37d14a3cb630'...
INFO:app.models:Subscribing to Get responses...
INFO:app.models:Subscribing to Delta events...
INFO:app.models:Finished getting initial shadow value.
DEBUG:app.models:Changed local shadow to 'DeviceShadow(score_threshold=0.7, minimum_distance=1503)'.
DEBUG:app.models:Updating reported shadow to 'DeviceShadow(score_threshold=0.7, minimum_distance=1503)'...
INFO:app.models:Update request published.
DEBUG:app.models:Finished updating reported shadow to '{'score_threshold': 0.7, 'minimum_distance': 1503}'.
INFO:app.models:Delta reports that desired shadow is '{'minimum_distance': 15035}'. Changing local shadow...
Exception ignored in: <class 'TypeError'>
Traceback (most recent call last):
File "/Users/tom/.../lib/python3.9/site-packages/awscrt/mqtt.py", line 506, in callback_wrapper
callback(topic=topic, payload=payload)
TypeError: callback_wrapper() missing 3 required positional arguments: 'dup', 'qos', and 'retain'
DEBUG:app.models:Finished updating reported shadow to '{'score_threshold': 0.7, 'minimum_distance': 1503}'.
As you can see, the stack trace is pretty short. Is there a way to debug this better?
Any ideas as to why it is giving me this particular error, and maybe how to solve it?
All help is appreciated!
I am pretty sure the problem lies within
@staticmethod
def merge_states(delta: dict, reported: DeviceShadow):
    for key, value in delta.items():
        reported[key] = value
    return reported
where the __setitem__ call on the reported argument raises a TypeError, because reported is a DeviceShadow dataclass instance that doesn't support item assignment. That TypeError most likely propagates up into awscrt's callback_wrapper, whose except TypeError branch then retries the callback with fewer arguments and fails, which is why the traceback points at mqtt.py line 506 rather than at your own code.
If you want to set fields of a dataclass where you have the field name as a string, you can use setattr(reported, key, value).
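A minimal sketch of that suggestion, reusing the DeviceShadow dataclass from the question (the rest of the manager stays unchanged):

from dataclasses import dataclass

@dataclass
class DeviceShadow:
    score_threshold: float = 0.6
    minimum_distance: int = 150

# merge_states rewritten to assign dataclass fields by name with setattr
def merge_states(delta: dict, reported: DeviceShadow) -> DeviceShadow:
    for key, value in delta.items():
        setattr(reported, key, value)
    return reported

print(merge_states({'minimum_distance': 1503}, DeviceShadow()))
# DeviceShadow(score_threshold=0.6, minimum_distance=1503)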

Why python uses 'CallByValue' function call when a dict (mutable dtype) is an argument to a function that is implemented via multiprocessing Pool?

Note: this is a contrived example of a bigger problem
from multiprocessing import Pool

dict1 = {'key1': 1}

def alterDict(dict_num):
    for key in dict_num:
        dict_num[key] = 20000

alterDict(dict1)
print(dict1)  # output is {'key1': 20000}

dict1 = {'key1': 1}
with Pool(2) as p:
    p.map(alterDict, [dict1])
print(dict1)  # output is {'key1': 1}
Why are the outputs different?
Is there a way to prevent Pool from using a 'call by value' style of function call?
I want to make the pool use a call-by-reference style of function call.
When you are using multiprocessing and you want to change an object like a dict or list (shared data), you need to use sharing state between processes. Pool pickles each argument and sends a copy to the worker process, so the worker only mutates its own copy; a Manager dict is a proxy that forwards mutations back to a single shared object, which is why the change survives:
import multiprocessing as mp

def alterDict(dict_num):
    for key, _ in dict_num.items():
        dict_num[key] = 20000

with mp.Manager() as manager:
    d = manager.dict()
    d['key'] = 1
    with manager.Pool() as pool:
        pool.map(alterDict, [d])
    print(dict(d))
    # {'key': 20000}  # output
BTW, you should iterate with dict_num.items(); otherwise you will get this error:
/usr/local/lib/python3.8/multiprocessing/managers.py in _callmethod(self, methodname, args, kwds)
848 dispatch(conn, None, 'decref', (token.id,))
849 return proxy
--> 850 raise convert_to_error(kind, result)
851
852 def _getvalue(self):
AttributeError: 'NoneType' object has no attribute '_registry'
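As an alternative to shared state (my own sketch, not part of the answer above, with a hypothetical helper name): since each worker gets a pickled copy anyway, you can simply return the modified dict from the worker and take it from Pool.map's return value.

from multiprocessing import Pool

def alter_and_return(dict_num):
    # work on the worker's local copy and hand it back as the result
    return {key: 20000 for key in dict_num}

if __name__ == '__main__':
    dict1 = {'key1': 1}
    with Pool(2) as p:
        dict1, = p.map(alter_and_return, [dict1])
    print(dict1)  # {'key1': 20000}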

Python 2.7 ProcessPoolExecutor throwing IOError: [Errno 32] Broken pipe

I am streaming data into a class in chunks. For each chunk of data, two different types of np.convolve() are executed on the same ProcessPoolExecutor. The type of convolve that was called is determined by a return variable.
The order of the data must be maintained, so each future has an associated sequence number. The output function enforces that only data from contiguous futures is returned (not shown below). From what I understand I am properly calling the ProcessPoolExecutor.shutdown() function, but I am still getting an IOError.
The error is:
$ python processpoolerror.py
ran 5000000 samples in 3.70395112038 sec: 1.34990982265 Msps
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/queues.py", line 268, in _feed
send(obj)
IOError: [Errno 32] Broken pipe
Sorry it's a bit long, but I have pruned this class down as much as possible while keeping the error. On my machine (Ubuntu 16.04.2 with an Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz) the pared-down code always gives this error. In the non-pruned version of this code, the broken pipe occurs 25% of the time.
If you edit line 78 to True and print during the execution, the error is not thrown. If you reduce the amount of data on line 100, the error is not thrown. What am I doing wrong here? Thanks.
import numpy as np
from concurrent.futures import ProcessPoolExecutor
import time

def _do_xcorr3(rev_header, packet_chunk, seq):
    r1 = np.convolve(rev_header, packet_chunk, 'full')
    return 0, seq, r1

def _do_power3(power_kernel, packet_chunk, seq):
    cp = np.convolve(power_kernel, np.abs(packet_chunk) ** 2, 'full')
    return 1, seq, cp

class ProcessPoolIssues():
    ## Constructor
    # @param chunk_size how many samples to feed in during input() stage
    def __init__(self, header, chunk_size=500, poolsize=5):
        self.chunk_size = chunk_size  ##! How many samples to feed
        # ProcessPool stuff
        self.poolsize = poolsize
        self.pool = ProcessPoolExecutor(poolsize)
        self.futures = []
        # xcr stage stuff
        self.results0 = []
        self.results0.append((0, -1, np.zeros(chunk_size)))
        # power stage stuff
        self.results1 = []
        self.results1.append((1, -1, np.zeros(chunk_size)))
        self.countin = 0
        self.countout = -1

    def shutdown(self):
        self.pool.shutdown(wait=True)

    ## Returns True if all data has been extracted for given inputs
    def all_done(self):
        return self.countin == self.countout + 1

    ## main function
    # @param packet_chunk an array of chunk_size samples to be computed
    def input(self, packet_chunk):
        assert len(packet_chunk) == self.chunk_size
        fut0 = self.pool.submit(_do_xcorr3, packet_chunk, packet_chunk, self.countin)
        self.futures.append(fut0)
        fut1 = self.pool.submit(_do_power3, packet_chunk, packet_chunk, self.countin)
        self.futures.append(fut1)
        self.countin += 1

    # loops through thread pool, copying any results from done threads into results0/1 (and then terminating them)
    def cultivate_pool(self):
        todel = []
        for i, f in enumerate(self.futures):
            # print "checking", f
            if f.done():
                a, b, c = f.result()
                if a == 0:
                    self.results0.append((a, b, c))  # results from one type of future
                elif a == 1:
                    self.results1.append((a, b, c))  # results from another type of future
                todel.append(i)
        # now we need to remove items from futures that are done
        # we need do it in reverse order so we remove items from the end first (thereby not affecting indices as we go)
        for i in sorted(todel, reverse=True):
            del self.futures[i]
            if False:  # change this to True and the error goes away
                print "deleting future #", i

    # may return None
    def output(self):
        self.cultivate_pool()  # modifies self.results list
        # wait for both results to be done before clearing
        if len(self.results0) and len(self.results1):
            del self.results0[0]
            del self.results1[0]
            self.countout += 1
        return None

def testRate():
    chunk = 500
    # a value of 10000 will throw: IOError: [Errno 32] Broken pipe
    # smaller values like 1000 do not
    din = chunk * 10000
    np.random.seed(666)
    search = np.random.random(233) + np.random.random(233) * 1j
    input = np.random.random(din) + np.random.random(din) * 1j
    pct = ProcessPoolIssues(search, chunk, poolsize=8)
    st = time.time()
    for x in range(0, len(input), chunk):
        slice = input[x:x + chunk]
        if len(slice) != chunk:
            break
        pct.input(slice)
        pct.output()
    while not pct.all_done():
        pct.output()
    ed = time.time()
    dt = ed - st
    print "ran", din, "samples in", dt, "sec:", din / dt / 1E6, "Msps"
    pct.shutdown()

if __name__ == '__main__':
    testRate()
This is probably happening because you're exceeding the buffer size of the pipe when you try sending in larger chunks at once.
def _do_xcorr3(rev_header, packet_chunk, seq):
    r1 = np.convolve(rev_header, packet_chunk, 'full')
    return 0, seq, r1

def _do_power3(power_kernel, packet_chunk, seq):
    cp = np.convolve(power_kernel, np.abs(packet_chunk) ** 2, 'full')
    return 1, seq, cp
The values r1 and cp are very large because you are convolving with the square of the chunks.
Hence, when you try to run this with larger chunk sizes, the buffer of the IO pipe can't handle it. Refer to this for a clearer understanding.
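For a rough sense of scale (my own arithmetic, not part of the answer): np.convolve(a, b, 'full') returns len(a) + len(b) - 1 samples, and the pruned script submits two such results per 500-sample chunk for 10,000 chunks, so a lot of result data has to travel back through the pipe over the run.

import numpy as np

a = np.random.random(500) + np.random.random(500) * 1j  # one packet_chunk
r = np.convolve(a, a, 'full')                           # shape of what _do_xcorr3 returns

print(len(r))                       # 999 == 500 + 500 - 1
print(r.nbytes)                     # 15984 bytes (complex128) per result
print(r.nbytes * 2 * 10000 / 1e6)   # ~320 MB of results across the whole run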
As for the second part of the question,
if False:  # change this to True and the error goes away
    print "deleting future #", i
Found this in the py3 docs:
16.2.4.4. Reentrancy
Binary buffered objects (instances of BufferedReader, BufferedWriter, BufferedRandom and BufferedRWPair) are not reentrant. While reentrant calls will not happen in normal situations, they can arise from doing I/O in a signal handler. If a thread tries to re-enter a buffered object which it is already accessing, a RuntimeError is raised. Note this doesn’t prohibit a different thread from entering the buffered object.
The above implicitly extends to text files, since the open() function will wrap a buffered object inside a TextIOWrapper. This includes standard streams and therefore affects the built-in function print() as well.

Cannot grok python multiprocessing

I need to run a function for each of the elements of my database.
When I try the following:
from multiprocessing import Pool
from pymongo import Connection

def foo():
    ...

connection1 = Connection('127.0.0.1', 27017)
db1 = connection1.data

my_pool = Pool(6)
my_pool.map(foo, db1.index.find())
I'm getting the following error:
Job 1, 'python myscript.py ' terminated by signal SIGKILL (Forced quit)
Which is, I think, caused by db1.index.find() eating all the available ram while trying to return millions of database elements...
How should I modify my code for it to work?
Some logs are here:
dmesg | tail -500 | grep memory
[177886.768927] Out of memory: Kill process 3063 (python) score 683 or sacrifice child
[177891.001379] [<ffffffff8110e51a>] out_of_memory+0xfa/0x250
[177891.021362] Out of memory: Kill process 3063 (python) score 684 or sacrifice child
[177891.025399] [<ffffffff8110e51a>] out_of_memory+0xfa/0x250
The actual function is below:
def create_barrel(item):
    connection = Connection('127.0.0.1', 27017)
    db = connection.data
    print db.index.count()
    barrel = []
    fls = []
    if 'name' in item.keys():
        barrel.append(WhitespaceTokenizer().tokenize(item['name']))
        name = item['name']
    elif 'name.utf-8' in item.keys():
        barrel.append(WhitespaceTokenizer().tokenize(item['name.utf-8']))
        name = item['name.utf-8']
    else:
        print item.keys()
    if 'files' in item.keys():
        for file in item['files']:
            if 'path' in file.keys():
                barrel.append(WhitespaceTokenizer().tokenize(" ".join(file['path'])))
                fls.append(("\\".join(file['path']), file['length']))
            elif 'path.utf-8' in file.keys():
                barrel.append(WhitespaceTokenizer().tokenize(" ".join(file['path.utf-8'])))
                fls.append(("\\".join(file['path.utf-8']), file['length']))
            else:
                print file
                barrel.append(WhitespaceTokenizer().tokenize(file))
    if len(fls) < 1:
        fls.append((name, item['length']))
    barrel = sum(barrel, [])
    for s in barrel:
        vs = re.findall("\d[\d|\.]*\d", s)  # versions i.e. numbers such as 4.2.7500
    b0 = []
    for s in barrel:
        b0.append(re.split("[" + string.punctuation + "]", s))
    b1 = filter(lambda x: x not in string.punctuation, sum(b0, []))
    flag = True
    while flag:
        bb = []
        flag = False
        for bt in b1:
            if bt[0] in string.punctuation:
                bb.append(bt[1:])
                flag = True
            elif bt[-1] in string.punctuation:
                bb.append(bt[:-1])
                flag = True
            else:
                bb.append(bt)
        b1 = bb
    b2 = b1 + barrel + vs
    b3 = list(set(b2))
    b4 = map(lambda x: x.lower(), b3)
    b_final = {}
    b_final['_id'] = item['_id']
    b_final['tags'] = b4
    b_final['name'] = name
    b_final['files'] = fls
    print db.barrels.insert(b_final)
I've noticed an interesting thing. When I press Ctrl+C to stop the process, I get the following:
python index2barrel.py
Traceback (most recent call last):
File "index2barrel.py", line 83, in <module>
my_pool.map(create_barrel, db1.index.find, 6)
File "/usr/lib/python2.7/multiprocessing/pool.py", line 227, in map
return self.map_async(func, iterable, chunksize).get()
File "/usr/lib/python2.7/multiprocessing/pool.py", line 280, in map_async
iterable = list(iterable)
TypeError: 'instancemethod' object is not iterable
I mean, why is multiprocessing trying to convert something to a list? Isn't that the source of the problem?
From the strace output:
brk(0x231ccf000) = 0x231ccf000
futex(0x1abb150, FUTEX_WAKE_PRIVATE, 1) = 1
sendto(3, "+\0\0\0\260\263\355\356\0\0\0\0\325\7\0\0\0\0\0\0data.index\0\0"..., 43, 0, NULL, 0) = 43
recvfrom(3, "Some text from my database."..., 491663, 0, NULL, NULL) = 491663
... [manymany times]
brk(0x2320d5000) = 0x2320d5000
.... manymany times
The above sample goes on and on in the strace output, and for some reason strace -o logfile python myscript.py
does not halt. It just eats all the available RAM and writes to the log file.
UPDATE. Using imap instead of map solved my problem.
Since the find() operation returns a cursor to the map function, and since you say that this runs without a problem when you do
for item in db1.index.find(): create_barrel(item)
it looks like the create_barrel function is OK.
Can you try to limit the number of results returned in the cursor and see if this helps? I think the syntax would be:
db1.index.find().limit(100)
If you could try this and see if it helps it might help to get the cause of the problem.
EDIT1: I think you are going about this the wrong way by using the map function - I think you should be using map_reduce in the mongo python driver - that way the map function will be executed by the mongod process.
The map() function gives the items in chunks to the given function. By default this chunksize is calculated like this (link to source):
chunksize, extra = divmod(len(iterable), len(self._pool) * 4)
This probably results in too big chunk size in your case and lets the process run out of memory. Try setting the chunk size manually like this:
my_pool.map(foo, db1.index.find(), 100)
EDIT: You should also consider reusing the db connection and closing it after usage. Right now you create a new db connection for each item, and you never call close() on it.
EDIT2: Also check if the while loop gets into an infinite loop (would explain the symptoms).
EDIT3: Based on the traceback you added, the map function tries to convert the cursor to a list, causing all the items to be fetched at once. This happens because it wants to find out how many items there are in the set. This is part of the map() code from pool.py:
if not hasattr(iterable, '__len__'):
    iterable = list(iterable)
You could try this to avoid conversion to list:
cursor = db1.index.find()
cursor.__len__ = cursor.count()
my_pool.map(foo, cursor)
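A minimal sketch of the fix the asker reported in the update (imap instead of map), with an explicit chunksize; this is my own reconstruction since the final code isn't shown. imap never calls list() on the iterable, so the cursor is consumed lazily as workers become free:

from multiprocessing import Pool
from pymongo import Connection  # the old pymongo API used in the question

if __name__ == '__main__':
    connection1 = Connection('127.0.0.1', 27017)
    db1 = connection1.data

    my_pool = Pool(6)
    # imap pulls items from the cursor on demand instead of
    # materialising the whole result set up front
    for _ in my_pool.imap(create_barrel, db1.index.find(), chunksize=100):
        pass
    my_pool.close()
    my_pool.join()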

memory overflow when using numpy load in a loop

Looping over npz file loads causes a memory overflow (depending on the file list length).
None of the following seems to help
Deleting the variable which stores the data in the file.
Using mmap.
Calling gc.collect() (garbage collection).
The following code should reproduce the phenomenon:
import numpy as np

# generate a file for the demo
X = np.random.randn(1000, 1000)
np.savez('tmp.npz', X=X)

# here comes the overflow:
for i in xrange(1000000):
    data = np.load('tmp.npz')
    data.close()  # avoid the "too many files are open" error
In my real application the loop is over a list of files and the overflow exceeds 24 GB of RAM!
Please note that this was tried on Ubuntu 11.10, with both numpy v1.5.1 and v1.6.0.
I have filed a report in numpy ticket 2048, but this may be of wider interest and so I am posting it here as well (moreover, I am not sure that this is a bug; it may be the result of my bad programming).
SOLUTION (by HYRY):
the command
del data.f
should precede the command
data.close()
For more information and a method to find the solution, please read HYRY's kind answer below.
I think this is a bug, and maybe I found the solution: call "del data.f".
for i in xrange(10000000):
    data = np.load('tmp.npz')
    del data.f
    data.close()  # avoid the "too many files are open" error
To find this kind of memory leak, you can use the following code:
import numpy as np
import gc

# here comes the overflow:
for i in xrange(10000):
    data = np.load('tmp.npz')
    data.close()  # avoid the "too many files are open" error

d = dict()
for o in gc.get_objects():
    name = type(o).__name__
    if name not in d:
        d[name] = 1
    else:
        d[name] += 1

items = d.items()
items.sort(key=lambda x: x[1])
for key, value in items:
    print key, value
After the test program, I created a dict and counted the objects in gc.get_objects(). Here is the output:
...
wrapper_descriptor 1382
function 2330
tuple 9117
BagObj 10000
NpzFile 10000
list 20288
dict 21001
From the result we know that there is something wrong with BagObj and NpzFile. Find the code:
class NpzFile(object):
    def __init__(self, fid, own_fid=False):
        ...
        self.zip = _zip
        self.f = BagObj(self)
        if own_fid:
            self.fid = fid
        else:
            self.fid = None

    def close(self):
        """
        Close the file.
        """
        if self.zip is not None:
            self.zip.close()
            self.zip = None
        if self.fid is not None:
            self.fid.close()
            self.fid = None

    def __del__(self):
        self.close()

class BagObj(object):
    def __init__(self, obj):
        self._obj = obj

    def __getattribute__(self, key):
        try:
            return object.__getattribute__(self, '_obj')[key]
        except KeyError:
            raise AttributeError, key
NpzFile has __del__(), NpzFile.f is a BagObj, and BagObj._obj is the NpzFile; this is a reference cycle and it will cause both NpzFile and BagObj to be uncollectable. Here is some explanation in the Python documentation: http://docs.python.org/library/gc.html#gc.garbage
So, to break the reference cycle, you will need to call "del data.f".
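As a side note (my own sketch, assuming a reasonably recent numpy where NpzFile works as a context manager and BagObj keeps only a weak reference): the loop can be written so the file is closed at the end of every iteration without touching data.f at all.

import numpy as np

X = np.random.randn(1000, 1000)
np.savez('tmp.npz', X=X)

for i in range(1000):
    # the with-block closes the underlying zip file on exit
    with np.load('tmp.npz') as data:
        X_loaded = data['X']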
What I found as the solution (with python==3.8 and numpy==1.18.5):
import gc  # import garbage collector interface

for i in range(1000):
    data = np.load('tmp.npy')
    # process data
    del data
    gc.collect()
