I am downloading zip files and inspecting their contents for a few million items, but memory usage keeps growing until I will eventually go OOM, even with small semaphore values.
Consider the block:
async def zip_reader(self, blobFileName, blobEndPoint, semaphore):
    try:
        # access blob
        async with ClientSecretCredential(TENANT, CLIENTID, CLIENTSECRET) as credential:
            async with BlobServiceClient(
                account_url="https://blob1.blob.core.windows.net/",
                credential=credential,
                max_single_get_size=64 * 1024 * 1024,
                max_chunk_get_size=32 * 1024 * 1024,
            ) as blob_service_client:
                async with blob_service_client.get_blob_client(container=blobEndPoint, blob=blobFileName) as blob_client:
                    async with semaphore:
                        logger.info(f"Starting: {blobFileName}, {blobEndPoint}")

                        # open bytes
                        writtenbytes = io.BytesIO()

                        # write file to it
                        stream = await blob_client.download_blob(max_concurrency=25)
                        stream = await stream.readinto(writtenbytes)

                        # zipfile
                        f = ZipFile(writtenbytes)

                        # file list
                        file_list = [s for s in f.namelist()]

                        # send to df
                        t_df = pd.DataFrame({'fileList': file_list})

                        # add fileName
                        t_df['blobFileName'] = blobFileName
                        t_df['blobEndPoint'] = blobEndPoint

                        if semaphore.locked():
                            await asyncio.sleep(1)

                        logger.info(f"Completed: {blobFileName}")

                        # clean up here; also tried del on objs here as well
                        self.cleanup()

                        return t_df

async def cleanup(self):
    gc.collect()
    await asyncio.sleep(1)
async def async_file_as_bytes_generator(self, blobFileName, blobEndPoint, semaphore):
    """
    main caller
    """
    semaphore = asyncio.Semaphore(value=semaphore)
    return await asyncio.gather(
        *[self.zip_reader(fn, ep, semaphore) for fn, ep in zip(blobFileName, blobEndPoint)]
    )  # also tried attaching here
asyncio.gather has no strategy to limit the number of tasks running simultaneously at all. Your semaphore may limit how many are being fetched and processed at once - but gather will wait for all dataframes to be available, and return all of them at once.
Instead of using a single await asyncio.gather, use something like asyncio.wait with a timeout, keep control of how many tasks are running, and yield the completed dataframes as they become ready.
And then, you didn't show the rest of your program leading up to the call to async_file_as_bytes_generator, but it will have to consume the dataframes as they are yielded and dispose of them, of course.
Also: there is no need for explicit calls to gc.collect, ever - here it is effectively a no-op. Python frees your memory on its own if your program is correct and keeps no references to the objects consuming it; otherwise there is nothing gc.collect could do anyway.
Your "main caller" can be something along these lines - but as I noted, you have to check the code that calls it so that it consumes each dataframe as it is produced, rather than expecting a list with all dataframes as your current code does.
async def async_file_as_bytes_generator(self, blobFileName, blobEndPoint, task_limit):
    """
    main caller
    """
    semaphore = asyncio.Semaphore(value=task_limit)
    all_tasks = {self.zip_reader(fn, ep, semaphore) for fn, ep in zip(blobFileName, blobEndPoint)}
    current_tasks = set()
    while all_tasks or current_tasks:
        while all_tasks and len(current_tasks) < task_limit:
            # wrap the coroutine in a Task so asyncio.wait accepts it
            current_tasks.add(asyncio.create_task(all_tasks.pop()))
        done, incomplete = await asyncio.wait(current_tasks, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            # optionally check for task exception
            yield task.result()
        current_tasks = incomplete
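If it helps, here is a rough sketch of how the calling code could consume that async generator - process_dataframe is just a placeholder for whatever you do with each dataframe, it is not part of your code:

async def consume_all(reader, blob_file_names, blob_end_points, task_limit=8):
    # handle each dataframe as soon as it is ready, then drop the reference,
    # instead of accumulating all of them in one big list
    async for t_df in reader.async_file_as_bytes_generator(blob_file_names, blob_end_points, task_limit):
        process_dataframe(t_df)  # placeholder: write to disk, aggregate, etc.
        del t_df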
I am at the end of a very long journey...
Here is the long story if you are interested.
https://github.com/hardbyte/python-can/issues/1336
Sorry for the incredibly long code snippet, but I am not sure where I am going wrong, so I thought more is more.
The code works as follows: request_inst() requests instrumentation data from an MCU using the dict request_info; the MCU responds, and the response is picked up by the listener. message_obtain() creates a future with which to store all_data, which is yielded from the listener with msg = await reader.get_message(). I attempt to structure this process with a lock. store_data() is where I store the response data from the MCU, in a dict called all_data. When printed outside of the listener, all_data appears with zero values, as shown below. The purpose of the code is to make all_data available outside of the event loop, but currently, even with this implementation, I cannot get all_data to appear without zero values showing up in the dict.
import asyncio
import can
from can.notifier import MessageRecipient
from typing import List

freq = 0.0003

# these are the response ids and the parameter ids of the data;
# stored data is supposed to fill up the None with a value
all_data = {268439810: {16512: [None], 16513: [None], 16514: [None], 16515: [None]},
            268444162: {16512: [None], 16513: [None], 16514: [None], 16515: [None]}}

request_info = {286326784: {16512, 16513, 16514, 16515},
                287440896: {16512, 16513, 16514, 16515}}

# all the request ids that have been configured
cm4_read_ids = [286326784, 287440896]
# all the response ids that have been configured
mcu_respond_ids = [268439810, 268444162]


# finds arb id and pid and logs the data from the message
# async def store_data(arb_id, msg_data, msg_dlc):
async def store_data(msg: can.Message, lock):
    pid = int.from_bytes(msg.data[0:2], 'little')
    arb_id = msg.arbitration_id
    if arb_id in mcu_respond_ids:
        async with lock:
            if msg.dlc == 5:
                all_data[arb_id][pid][0] = int.from_bytes(msg.data[2:4], 'little', signed=False)
            elif msg.dlc == 7:
                all_data[arb_id][pid][0] = int.from_bytes(msg.data[2:6], 'little', signed=False)
    return all_data


async def request_inst(bus: can.Bus):
    print('Request inst active')
    while True:
        for key in request_info:
            for val in request_info[key]:
                pid = int(val)
                pidbytes = pid.to_bytes(2, 'little')
                msg = can.Message(arbitration_id=key, data=pidbytes)
                bus.send(msg)
                await asyncio.sleep(freq)
                # await store_data(reader)


async def message_obtain(reader: can.AsyncBufferedReader, lock):
    print('Started the get message process')
    while True:
        await asyncio.sleep(0.01)
        msg = await reader.get_message()
        future = await store_data(msg, lock)
        async with lock:
            print('This is the future')
            print(future)


async def main() -> None:
    with can.Bus(
        interface="socketcan", channel="can0", receive_own_messages=True
    ) as bus:
        # Create Notifier with an explicit loop to use for scheduling of callbacks
        loop = asyncio.get_running_loop()
        reader = can.AsyncBufferedReader()
        lock = asyncio.Lock()
        listeners: List[MessageRecipient] = [reader]
        notifier = can.Notifier(bus, listeners, loop=loop)
        try:
            task1 = asyncio.create_task(request_inst(bus))
            task2 = asyncio.create_task(message_obtain(reader, lock))
            await asyncio.gather(task1, task2)
        except KeyboardInterrupt:
            notifier.stop()
            bus.shutdown()


if __name__ == "__main__":
    asyncio.run(main())
The issue is that even in this cut-down, one-page example I am running into what I believe to be concurrency issues.
As you can see, I have tried locking, but I am still seeing these zero values appear in all_data. See below, where pid 16514's value becomes 0 even though the messages being returned by the MCU are not zero.
N.B. The output below where the incorrect data is shown is the output from print(future).
The real value should never be 0 as it is a measured value.
>>> import struct
>>> b6 = (1016872961).to_bytes(4, 'little')
>>> struct.unpack('<f', b6)
(0.01907348819077015,)
Am I doing anything very stupid? It feels like I am not accessing the data in the listener correctly, despite using the lock whenever all_data is modified.
If I print from the listener, the data is always correct, even when all_data is returning 0 values.
If anyone is able to help me it would be much appreciated.
It appears the problem was not software related and the zeros were real. Every day is a learning day!
This is a PCAN image of the highlighted send and a response shown at the top.
EDIT: Confirmed MCU response issue - looks like the firmware on the MCU. My Rigol wouldn't trigger on the ID so I had to 1/8 video and then screen shot that to catch it in the act. You can see the response is all 7 bytes of nothing.
Ok ok ok, it turns out I was doing something really silly. I was using nested dictionaries to store data about different ids, but the keys were the same. After some investigation using id(id1[some_pid1]) and id(id2[some_pid1]) I discovered that the nested entries had the same memory address, i.e. they were the same object.
data_dict = {id1: {some_pid1: value, some_pid2: value},
             id2: {some_pid1: value, some_pid2: value}}
To everyone this appeared to be a race condition, but actually I was just writing zeros (which turned out to be forced by the MCU) to the wrong id, because its entry shared an object with the other id.
Whoops
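For anyone else hitting this, here is a minimal sketch (not the original code) of how nested dicts can end up sharing one inner object, and how id() exposes it:

# Building the outer dict from ONE shared inner dict: both ids point at the
# same object, so writing through one id silently updates the other as well.
inner = {16512: [None], 16513: [None]}
shared = {268439810: inner, 268444162: inner}

shared[268439810][16512][0] = 0
print(shared[268444162][16512])                         # [0] -- the "other" id changed too
print(id(shared[268439810]) == id(shared[268444162]))   # True

# Building each inner dict independently keeps the entries separate.
separate = {arb_id: {16512: [None], 16513: [None]} for arb_id in (268439810, 268444162)}
separate[268439810][16512][0] = 0
print(separate[268444162][16512])                       # [None] -- unaffected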
I am using Motor, but PyMongo was my initial choice; I switched to Motor because it is an async MongoDB driver for Python.
My aim here is to query MongoDB with a large number of calls at the same time, with minimal waiting time.
There are about 1000 symbols, and for each symbol I have to query its latest candlestick data from MongoDB from time to time in order to perform certain calculations. I need to query the latest 5K documents for each symbol, so the collection contains roughly 1000 * 5000 = 5,000,000 documents.
With Motor and asyncio, I use the following method to fetch documents asynchronously, but it takes a really long time to run and I can't figure out why. I am using an 8-core CPU on a virtual machine.
Any help with this problem?
import asyncio

import motor.motor_asyncio


async def getCandleList(symbol):  # each symbol contains about 5K latest candles in the collection
    final_str = "{'symbol': '%s'}" % (symbol)
    resultType = 'candlestick_archive'
    dbName = 'candle_db'
    cursor = eval("db.{}.find({}).sort('timeStamp',-1)".format(dbName, final_str))
    finalList = await cursor.to_list(length=None)
    return finalList


async def taskForEachSymbol(symbol):
    while True:
        candleList = await getCandleList(symbol)
        await generateSignal(candleList)  # a function that generates certain signals in real time


def getAllTasks():
    awaitableTasks = []
    for symbol in symbolList:  # symbolList contains around 1k symbols
        awaitableTasks.append(asyncio.create_task(taskForEachSymbol(symbol)))
    return awaitableTasks


async def mainTask():
    awaitableTasks = getAllTasks()
    await asyncio.gather(*awaitableTasks, return_exceptions=False)


def main():
    mainLoop.run_until_complete(mainTask())
    print('completed! ... ')


if __name__ == '__main__':
    mainLoop = asyncio.new_event_loop()
    asyncio.set_event_loop(mainLoop)
    client = motor.motor_asyncio.AsyncIOMotorClient(io_loop=mainLoop)
    db = client.candles
    main()
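As a side note on the snippet above, the eval/string-formatting step is not needed to build the query; the same find can be written directly with a filter dict. A sketch using the same db handle and collection name as above (the explicit .limit() is optional and just makes the "latest 5K" intent explicit):

async def getCandleList(symbol, limit=5000):
    # same query as above, built from plain Python objects instead of eval
    cursor = db['candle_db'].find({'symbol': symbol}).sort('timeStamp', -1).limit(limit)
    return await cursor.to_list(length=limit)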
async def simultaneous_chunked_download(urls_paths, label):
    timeout = ClientTimeout(total=60000)
    sem = asyncio.Semaphore(5)
    async with aiohttp.ClientSession(timeout=timeout, connector=aiohttp.TCPConnector(verify_ssl=False)) as cs:

        async def _fetch(r, path):
            async with sem:
                async with aiofiles.open(path, "wb") as f:
                    async for chunk in r.content.iter_any():
                        if not chunk:
                            break
                        size = await f.write(chunk)
                        if not indeterminate:
                            bar._done += size
                            bar.show(bar._done)
                if indeterminate:
                    bar._done += 1
                    bar.show(bar._done)

        indeterminate = False
        total_length = 0
        tasks = []
        for url, path in urls_paths.items():
            r = await cs.get(url)
            if not indeterminate:
                try:
                    total_length += r.content_length
                except Exception:
                    indeterminate = True
            tasks.append(_fetch(r, path))
            verbose_print(f"url: {url},\npath: {path}\n\n")

        if not indeterminate:
            bar = progress.Bar(
                expected_size=total_length, label=label, width=28, hide=False
            )
        else:
            bar = progress.Bar(
                expected_size=len(tasks), label=label, width=28, hide=False
            )

        logger._pause_file_output = True
        bar.show(0)
        bar._done = 0

        await asyncio.gather(*tasks)

        logger._pause_file_output = False
        bar.done()
The function I have above is for downloading a dictionary of URLs asynchronously and then printing out a progress bar. An example of its usage:
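Something like this, with made-up URLs and output paths purely to illustrate the shape of the arguments (the real call site is not shown here):

import asyncio

urls_paths = {
    "https://example.com/archive-1.zip": "downloads/archive-1.zip",
    "https://example.com/archive-2.zip": "downloads/archive-2.zip",
}

# driven from synchronous code here for illustration; in the app it runs
# inside the existing event loop
asyncio.run(simultaneous_chunked_download(urls_paths, "Downloading archives"))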
The code itself runs perfectly fine, however I keep getting these errors:
Whilst benign, they are an eyesore and could point towards gaps in my knowledge of both HTTP and asynchronous code, so I would rather try and get it fixed. However, I'm at a loss as to where or what is causing it, especially as, like I said, the code runs perfectly fine regardless.
If you would like a more practical, hands-on attempt at recreating this, the full code is in my GitHub repo on the dev branch: https://github.com/ohitstom/spicetify-easyinstall/tree/dev
Most of the program can be disregarded if you are testing this out; just press the install button and the problematic code will show itself towards the end.
Bear in mind this is a Spotify themer, so if you have Spotify/Spicetify installed you will want to use a VM.
FIXED!:
# Create App
globals.app = QtWidgets.QApplication(sys.argv)
globals.app.setStyleSheet(gui.QSS)

# Configure asyncio loop to work with PyQt5
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
loop = QEventLoop(globals.app)
asyncio.set_event_loop(loop)

# Setup GUI
globals.gui = gui.MainWindow()
globals.gui.show()

# Set off loop
with loop:
    sys.exit(loop.run_until_complete(globals.gui.exit_request.wait()))


class MainWindow(QuickWidget):
    def __init__(self):
        super().__init__(
            name="main_window",
            ...etc
        )
        self.exit_request = asyncio.Event()
        ......etc

    def closeEvent(self, *args):
        self.exit_request.set()
Asyncio and aiohttp have some problems when running a lot of tasks concurrently on Windows; I've been running into them a lot lately.
There are some workarounds available; the ones I use most are:
# set this before your event loop initialization or main function
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
Or:
loop = asyncio.ProactorEventLoop()
asyncio.set_event_loop(loop)
loop.run_until_complete(your_main())
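If the same code also has to run on Linux/macOS, the policy switch can be guarded by platform, since WindowsSelectorEventLoopPolicy only exists on Windows builds of Python (your_main is the same placeholder as above):

import asyncio
import sys

if sys.platform == "win32":
    # the selector loop avoids some of the proactor-loop teardown noise seen with aiohttp
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

asyncio.run(your_main())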
I have been working on a small PoC of an I/O-bound application that needs to execute functions without being blocked. Currently I have created something like this:
import time
import concurrent.futures

found_products = []
site_catalog = [
    "https://www.graffitishop.net/Sneakers",
    "https://www.graffitishop.net/T-shirts",
    "https://www.graffitishop.net/Sweatshirts",
    "https://www.graffitishop.net/Shirts"
]


def threading_feeds():
    # Create own thread for each URL as we want to run concurrent
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(monitor_feed, site_catalog)


def monitor_feed(link: str) -> None:
    old_payload = product_data(...)
    while True:
        new_payload = product_data(...)
        if old_payload != new_payload:
            for links in new_payload:
                if links not in found_products:
                    logger.info(f'Detected new link -> {found_link} | From -> {link}')
                    # Execute filtering function without blocking, how?
                    filtering(link=found_link)
        else:
            logger.info("Nothing new")
        time.sleep(60)
        continue


def filtering(found_link):
    # More code will be added in the future to handle logical code parts
    ...
    # Test
    time.sleep(60)
Problem: Currently the issue is that whenever we reach the line filtering(link=found_link), the call to filtering(...) sleeps for 60 seconds (this is only mock behaviour; in the future I will have a real logical code part instead). What happens then is that monitor_feed stops executing and waits until filtering() is finished.
My question: How can I execute filtering(...) and still continue looping through monitor_feed without being blocked when we call filtering(...)?
This is your code with small modifications - the main problem was with wrong variable names (because they are very similar).
To make this clear I use the names executor1 and executor2, and executor2 has to be created before while True, because it has to exist the whole time the threads are in use.
If you have def filtering(filtered_link), then you have to use the same name filtered_link in submit(..., filtered_link=...).
import concurrent.futures
import time

found_products = []
site_catalog = [
    "https://www.graffitishop.net/Sneakers",
    "https://www.graffitishop.net/T-shirts",
    "https://www.graffitishop.net/Sweatshirts",
    "https://www.graffitishop.net/Shirts"
]


def threading_feeds():
    print('[threading_feeds] running')
    # Create own thread for each URL as we want to run concurrent
    with concurrent.futures.ThreadPoolExecutor() as executor1:
        executor1.map(monitor_feed, site_catalog)


def monitor_feed(link: str) -> None:
    print('[monitor_feed] start')
    old_payload = ['old']  # product_data(...)

    # executor has to exist all time
    with concurrent.futures.ThreadPoolExecutor() as executor2:
        while True:
            print('[monitor_feed] run loop')
            new_payload = ['new1', 'new2', 'new3']  # product_data(...)
            if old_payload != new_payload:
                for product_link in new_payload:
                    if product_link not in found_products:
                        print(f'Detected new link -> {product_link} | From -> {link}')
                        executor2.submit(filtering, filtered_link=product_link)
                        #executor2.submit(filtering, product_link)
            print("Continue")
            time.sleep(2)


def filtering(filtered_link):
    # More code will be added in the future to handle logical code parts
    #...
    # Test
    print(f'[filtering]: start: {filtered_link}')
    time.sleep(60)
    print(f'[filtering]: end: {filtered_link}')


# --- start --
threading_feeds()
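One extra point that is not in the code above: executor2.submit() returns a concurrent.futures.Future, so if you ever need to know whether filtering raised an exception you can keep that handle and check it later. A tiny self-contained example:

import concurrent.futures
import time

def filtering(filtered_link):
    time.sleep(1)
    return f"done: {filtered_link}"

with concurrent.futures.ThreadPoolExecutor() as executor:
    future = executor.submit(filtering, filtered_link="https://example.com/item")
    # result() blocks until the task finishes and re-raises any exception
    # raised inside filtering()
    print(future.result())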
Just poking my toe into asynchronous programming in Python, I ran into an interesting application for it: I need to gather file sizes for about 10 files each on ~100 machines to see which machines aren't purging their log files appropriately.
My synchronous approach was:
from collections import namedtuple
import glob
import os

import utils  # local helper module that lists the machines to query

File_info = namedtuple("File_info", "machineinfo size")

machines = utils.list_machines()  # the computers being queried
# each machine object has attributes like "name", "IP", and "site_id" among others

file_sizes = {}
# {filename: [File_info, ...], ...}

for m in machines:
    print(f"Processing {m}...")  # this is "Processing {m}...".format(m=m)
                                 # isn't Python 3.6 awesome?!
    for path in glob.glob(f"//{m.IP}/somedir/*.dbf"):
        fname = os.path.split(path)[-1].lower()
        machineinfo = (m.site_id, m.name)
        size = os.stat(path).st_size
        file_sizes.setdefault(fname, []).append(File_info(machineinfo, size))
This works great, but takes a long time with the network operations pulling all those globs and stats. I wanted to use Python 3.5's async/await syntax with asyncio to asynchronize those calls. Here's what I came up with:
import asyncio

File_info = namedtuple("File_info", "machineinfo size")

machines = utils.list_machines()

file_sizes = {}
# {filename: [File_info, ...], ...}

async def getfilesizes(machine, loop):
    machineinfo = machine.site_id, machine.name
    paths = glob.glob(f"//{machine.IP}/somedir/*.dbf")
    coros = [getsize(path) for path in paths]
    results = loop.run_until_complete(asyncio.gather(*coros))
    sizes = {fname: File_info(machineinfo, size) for (fname, size) in results}
    return sizes

async def getsize(path):
    return os.path.split(path)[-1], os.stat(path).st_size

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*(getfilesizes(m, loop) for m in machines)))

for result in results:
    file_sizes.update(result)
    # I have a problem here since my dict values are lists that need to extend
    # not overwrite, but that's not relevant for the error I'm getting
However the script hangs inside the outer loop.run_until_complete section. What am I doing wrong?
A coroutine that wants to run another coroutine (as getfilesizes does with getsize) should await it rather than calling loop.run_until_complete on it; you cannot re-enter an event loop that is already running.
...
async def getfilesizes(machine):  # changed func sig
    machineinfo = machine.site_id, machine.name
    paths = glob.glob(f"//{machine.IP}/somedir/*.dbf")
    coros = [getsize(path) for path in paths]
    results = await asyncio.gather(*coros)  # await the results instead!
    sizes = {fname: File_info(machineinfo, size) for (fname, size) in results}
    return sizes
...
Since asyncio.gather creates one Future from any number of coroutines, this await functionally acts on the whole group of coroutines and grabs all their results at once.
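With that change, the top-level code no longer needs to pass the loop around. A sketch of the driver under the same assumptions as the question (machines, File_info and file_sizes defined as before), including the list-extending behaviour the question's comment mentions:

loop = asyncio.get_event_loop()
# getfilesizes no longer takes the loop, since it awaits gather() itself
results = loop.run_until_complete(asyncio.gather(*(getfilesizes(m) for m in machines)))

for result in results:
    for fname, info in result.items():
        # append to the per-filename list instead of overwriting it
        file_sizes.setdefault(fname, []).append(info)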