Boto3 - possible to upload a generator object to S3? - python

I would like to upload the results of a generator to S3 while fully taking advantage of the generator.
A minimal example of what I'm trying to do:
def gen():
    for i in ['a', 'b', 'c', 'd']:
        yield i

s3_object.put?(data=gen())
I've seen examples of putting local file paths into .put, but I haven't seen anything regarding generators.
Ideally this would fully take advantage of the generator so I don't have to write all the data to disk or memory.
Is something like this possible?

No, this is currently not possible. According to the docs, the Body parameter only accepts bytes or a seekable file-like object.
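For illustration, a hedged sketch of what Body will accept (the bucket and key names are placeholders): you would have to materialize the generator into bytes (or a real file) first, which defeats the point of streaming.
import boto3

s3 = boto3.client("s3")

# Works, but buffers the whole payload in memory before uploading,
# which is exactly what the question wants to avoid.
s3.put_object(
    Bucket="your-bucket",
    Key="file.txt",
    Body="".join(gen()).encode("utf8"),
)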

Yes, you can do this using a multipart upload. The minimum size for a "part" (except the last one) is 5 MiB, so you need to build a buffer that holds the data until 5 MiB is reached.
Here is a class that gets the job done:
import boto3


class MultipartUpload:
    def __init__(self, bucket, key, client=None, encoding="utf8"):
        self.bucket = bucket
        self.key = key
        self.client = client or boto3.client("s3")
        self.encoding = encoding
        upload = self.client.create_multipart_upload(
            Bucket=bucket, Key=key
        )
        self.upload_id = upload["UploadId"]
        self.part = 1
        self.parts_container = []
        self.buffer = b''

    def write(self, content):
        if isinstance(content, str):
            self.buffer += content.encode(self.encoding)
        elif isinstance(content, bytes):
            self.buffer += content
        else:
            raise TypeError(f"Received bad data of type {type(content)}. Must be bytes or string")
        if len(self.buffer) > 5 * 1024 * 1024:  # 5 MiB minimum part size
            self.commit()

    def commit(self):
        print("Sending multipart")
        resp = self.client.upload_part(
            Body=self.buffer, Bucket=self.bucket, Key=self.key,
            PartNumber=self.part, UploadId=self.upload_id
        )
        self.parts_container.append(
            {"ETag": resp["ETag"], "PartNumber": self.part}
        )
        self.buffer = b''
        self.part += 1

    def save(self):
        if self.buffer:  # If we have leftover data, commit it first
            self.commit()
        return self.client.complete_multipart_upload(
            Bucket=self.bucket, Key=self.key, UploadId=self.upload_id,
            MultipartUpload={"Parts": self.parts_container}
        )

    def abort(self):
        self.client.abort_multipart_upload(
            Bucket=self.bucket, Key=self.key, UploadId=self.upload_id
        )

    def __enter__(self):
        return self

    def __exit__(self, type_, value, tb):
        if type_:
            self.abort()
        else:
            self.save()
You could, of course, improve this with things like threading and further optimisation. You would use it like a file handle:
with MultipartUpload("your-bucket", "file.txt") as upload:
    for data in gen():
        upload.write(data)
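If you would rather not manage the buffer and part bookkeeping yourself, a hedged alternative (a sketch, assuming your boto3/s3transfer version accepts non-seekable binary streams) is to wrap the generator in a read-only file-like object and let client.upload_fileobj do the multipart chunking. GeneratorStream is a hypothetical helper, not part of boto3:
import io
import boto3


class GeneratorStream(io.RawIOBase):
    """Read-only file-like view over an iterable of bytes chunks (hypothetical helper)."""

    def __init__(self, iterable):
        self._iter = iter(iterable)
        self._leftover = b""

    def readable(self):
        return True

    def readinto(self, buf):
        try:
            chunk = self._leftover or next(self._iter)
        except StopIteration:
            return 0  # EOF
        out, self._leftover = chunk[:len(buf)], chunk[len(buf):]
        buf[:len(out)] = out
        return len(out)


s3 = boto3.client("s3")
stream = io.BufferedReader(GeneratorStream(s.encode("utf8") for s in gen()))
s3.upload_fileobj(stream, "your-bucket", "file.txt")  # boto3 decides the part size and count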

Related

Getting UpdateNotify error when implementing a python RTD client

I'm trying to implement an RTD client using this project as an example, but without success.
As the RTD server I'm using the example contained in the win32com package; in Excel it works perfectly, but the RTD client I'm using as a template generates this error.
RTD client code
import functools
import pythoncom
import win32com.client
from win32com import universal
from win32com.client import gencache
from win32com.server.util import wrap

EXCEL_TLB_GUID = '{00020813-0000-0000-C000-000000000046}'
EXCEL_TLB_LCID = 0
EXCEL_TLB_MAJOR = 1
EXCEL_TLB_MINOR = 4

gencache.EnsureModule(EXCEL_TLB_GUID, EXCEL_TLB_LCID, EXCEL_TLB_MAJOR, EXCEL_TLB_MINOR)

universal.RegisterInterfaces(EXCEL_TLB_GUID,
                             EXCEL_TLB_LCID, EXCEL_TLB_MAJOR, EXCEL_TLB_MINOR,
                             ['IRtdServer', 'IRTDUpdateEvent'])


# noinspection PyProtectedMember
class ObjectWrapperCOM:
    LCID = 0x0

    def __init__(self, obj):
        self._impl = obj  # type: win32com.client.CDispatch

    def __getattr__(self, item):
        flags, dispid = self._impl._find_dispatch_type_(item)
        if dispid is None:
            raise AttributeError("{} is not a valid property or method for this object.".format(item))
        return functools.partial(self._impl._oleobj_.Invoke, dispid, self.LCID, flags, True)


# noinspection PyPep8Naming
class RTDUpdateEvent:
    _com_interfaces_ = ['IRTDUpdateEvent']
    _public_methods_ = ['Disconnect', 'UpdateNotify']
    _public_attrs_ = ['HeartbeatInterval']

    # Implementation of IRTDUpdateEvent.
    HeartbeatInterval = -1

    def __init__(self, event_driven=True):
        self.ready = False
        self._event_driven = event_driven

    def UpdateNotify(self):
        if self._event_driven:
            self.ready = True

    def Disconnect(self):
        pass


class RTDClient:
    MAX_REGISTERED_TOPICS = 1024

    def __init__(self, class_id):
        """
        :param class_id: can either be a class ID or a program ID
        """
        self._class_id = class_id
        self._rtd = None
        self._update_event = None

        self._topic_to_id = {}
        self._id_to_topic = {}
        self._topic_values = {}
        self._last_topic_id = 0

    def connect(self, event_driven=True):
        """
        Connects to the RTD server.
        Set event_driven to False if you want to disable update notifications.
        In this case you'll need to call refresh_data manually.
        """
        dispatch = win32com.client.Dispatch(self._class_id)
        self._update_event = RTDUpdateEvent(event_driven)
        try:
            self._rtd = win32com.client.CastTo(dispatch, 'IRtdServer')
        except TypeError:
            # Automated makepy failed... no detailed construction available for the class
            self._rtd = ObjectWrapperCOM(dispatch)

        self._rtd.ServerStart(wrap(self._update_event))

    def update(self):
        """
        Check if there is data waiting and call RefreshData if necessary. Returns True if new data has been received.
        Note that you should call this following a call to pythoncom.PumpWaitingMessages(). If you neglect to
        pump the message loop you'll never receive UpdateNotify callbacks.
        """
        # noinspection PyUnresolvedReferences
        pythoncom.PumpWaitingMessages()

        if self._update_event.ready:
            self._update_event.ready = False
            self.refresh_data()
            return True
        else:
            return False

    def refresh_data(self):
        """
        Grabs new data from the RTD server.
        """
        (ids, values) = self._rtd.RefreshData(self.MAX_REGISTERED_TOPICS)
        for id_, value in zip(ids, values):
            if id_ is None and value is None:
                # This is probably the end of the message
                continue
            assert id_ in self._id_to_topic, "Topic ID {} is not registered.".format(id_)
            topic = self._id_to_topic[id_]
            self._topic_values[topic] = value

    def get(self, topic: tuple):
        """
        Gets the value of a registered topic. Returns None if no value is available. Throws an exception if
        the topic isn't registered.
        """
        assert topic in self._topic_to_id, 'Topic %s not registered.' % (topic,)
        return self._topic_values.get(topic)

    def register_topic(self, topic: tuple):
        """
        Registers a topic with the RTD server. The topic's value will be updated in subsequent data refreshes.
        """
        if topic not in self._topic_to_id:
            id_ = self._last_topic_id
            self._last_topic_id += 1

            self._topic_to_id[topic] = id_
            self._id_to_topic[id_] = topic
            self._rtd.ConnectData(id_, topic, True)

    def unregister_topic(self, topic: tuple):
        """
        Un-registers a topic so that it will no longer get updated.
        """
        assert topic in self._topic_to_id, 'Topic %s not registered.' % (topic,)
        self._rtd.DisconnectData(self._topic_to_id[topic])

    def disconnect(self):
        """
        Closes the RTD server connection.
        """
        self._rtd.ServerTerminate()
The example RTD Server is Python.RTD.TimeServer and it works great in Excel, but the RTD client in the above example throws this error:
File "C:\Users\XXXXXX\AppData\Local\Temp\gen_py\3.9\00020813-0000-0000-C000-000000000046x0x1x9.py", line 20963, in UpdateNotify
return self.oleobj.InvokeTypes(10, LCID, 1, (24, 0), (),)
pywintypes.com_error: (-2147352573, 'Member not found.', None, None)
I have no knowledge of COM, but I'm struggling to learn.
Any suggestions?
You need to implement all the methods defined by the IRtdServer interface:
https://learn.microsoft.com/en-us/dotnet/api/microsoft.office.interop.excel.irtdserver?view=excel-pia
Once you do that, Excel should be able to find all the methods it needs to work with your server.
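For reference, a hedged skeleton of the six methods that interface defines (method names follow the Microsoft documentation linked above; the bodies are placeholders, not a working server):
class MyRtdServer:
    _com_interfaces_ = ['IRtdServer']
    _public_methods_ = ['ServerStart', 'ServerTerminate', 'ConnectData',
                        'DisconnectData', 'RefreshData', 'Heartbeat']

    def ServerStart(self, callback_object):
        # Keep the IRTDUpdateEvent callback so UpdateNotify can be raised later.
        self._callback = callback_object
        return 1  # non-zero signals a successful start

    def ConnectData(self, topic_id, strings, get_new_values):
        return "initial value"  # what Excel displays until the first refresh

    def RefreshData(self, topic_count):
        return [[], []]  # two parallel arrays: topic IDs and their new values

    def DisconnectData(self, topic_id):
        pass

    def Heartbeat(self):
        return 1  # non-zero tells Excel the server is still alive

    def ServerTerminate(self):
        pass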

Streaming upload of a slice of a (large) file using python-requests

I need to do a streaming upload (i.e., not load the full file part in memory) of a slice of a large (multi-GB) file, using python-requests.
I've looked around in the doc and on Stack Overflow, and haven't found a working way to do it (again, without loading the full slice in memory).
Here's the code I have:
import io
import os
from contextlib import AbstractContextManager


class FileSlice(AbstractContextManager):
    """
    File-like object that only reads a slice of a file
    Inspired by stackoverflow.com/a/29838711/593036, but actually works.
    """

    def __init__(self, filepath: str, seek_from: int, read_limit: int):
        self.filepath = filepath
        self.seek_from = seek_from
        self.read_limit = read_limit
        self.n_seen = 0

    def __enter__(self):
        self.f = open(self.filepath, "rb")
        self.f.seek(self.seek_from)
        return self

    def __len__(self):
        total_length = os.fstat(self.f.fileno()).st_size
        return min(self.read_limit, total_length - self.seek_from)

    def read(self, n=-1):
        if self.n_seen >= self.read_limit:
            return b""
        remaining_amount = self.read_limit - self.n_seen
        n_to_read = remaining_amount if n < 0 else min(n, remaining_amount)
        self.n_seen += n_to_read
        return self.f.read(n_to_read)

    def __iter__(self):
        # Yield buffered chunks until the slice is exhausted.
        while True:
            chunk = self.read(n=io.DEFAULT_BUFFER_SIZE)
            if not chunk:
                break
            yield chunk

    def __exit__(self, *args):
        self.f.close()
Then the actual request:
with FileSlice(filepath, seek_from=i * chunk_size, read_limit=chunk_size) as data:
    r = requests.put(presigned_url, data=data)
    r.raise_for_status()
This seems pretty complex, so I'm wondering:
if I'm missing a simpler way
if my approach is correct.
Thank you!

python - remain in base class when calling parent method from child

I have the following base class:
class ClientRepo(Repository):
    def __init__(self) -> None:
        self.__clientList = []

    def hasClientWithId(self, clientId):
        for client in self.__clientList:
            if client.getId() == clientId:
                return True
        return False

    def addClient(self, client):
        if type(client).__name__ == 'ClientDAO':
            if not self.hasClientWithId(client.getId()):
                client.setClientId(self.__maximumIndexInClientList() + 1)
                self.__clientList.append(client)
            else:
                raise ObjectAlreadyInCollectionException
        else:
            raise TypeError
which basically only holds a list and can add a ClientDAO to it.
And the following, which derives from it:
class ClientFileRepository(ClientRepo):
    def __init__(self, fileName) -> None:
        super().__init__()
        self.__fileName = fileName
        self.__file = None

    def hasClientWithId(self, clientId):
        self.__loadRepo()
        hasClientWithId = super().hasClientWithId(clientId)
        super().clean()
        return hasClientWithId

    def addClient(self, client):
        self.__loadRepo()
        super().addClient(client)
        self.__storeRepo()
        super().clean()

    def __loadFileReadMode(self):
        self.__file = open(self.__fileName, "r")

    def __loadFileWriteMode(self):
        self.__file = open(self.__fileName, "w")

    def __closeFile(self):
        self.__file.close()

    def __loadRepo(self):
        self.__loadFileReadMode()
        for line in self.__file:
            splitLine = line.split()
            clientToAdd = ClientDAO(splitLine[1])
            clientToAdd.setClientId(int(splitLine[0]))
            super().addClientWithId(clientToAdd)
        self.__closeFile()

    def __storeRepo(self):
        self.__loadFileWriteMode()
        self.__file.write("")
        for client in super().getList():
            self.__file.write(self.clientToString(client))
        self.__closeFile()

    def clientToString(self, clientDAO):
        return str(clientDAO.getId()) + " " + clientDAO.getName() + "\n"
a class which should load the list from a file, call addClient from the parent, and store the updated list back in the file. The problem is that after the child class loads the file in addClient, it calls the method in the parent, which in turn calls hasClientWithId again, on the child. But I want it to call hasClientWithId from the parent, that is, from the context it is in. Can I achieve that?
I can think of several ways to achieve your goal. I ranked them from worst to best.
1. Exactly what you asked for
You wanted ClientRepo.addClient to call ClientRepo.hasClientWithId instead of ClientFileRepository.hasClientWithId. It is possible to enforce that:
class ClientRepo(Repository):
    def addClient(self, client):
        if type(client).__name__ == 'ClientDAO':
            if not ClientRepo.hasClientWithId(self, client.getId()):
                client.setClientId(self.__maximumIndexInClientList() + 1)
                self.__clientList.append(client)
            else:
                raise ObjectAlreadyInCollectionException
        else:
            raise TypeError
This is not a good approach, because it's unintuitive and breaks the principles of OOP. Any other programmer writing a subclass of ClientRepo that overrides hasClientWithId would expect it to take effect for every call to hasClientWithId, even inside addClient.
2. Let ClientFileRepository decide which function to use
Add a variable
self.__isFileOpen = False
in ClientFileRepository.__init__, set it to True when you open the file and to False when you close the file. Then change the hasClientWithId within ClientFileRepository to
def hasClientWithId(self, clientId):
    if not self.__isFileOpen:
        self.__loadRepo()
        result = super().hasClientWithId(clientId)
        super().clean()
        return result
    else:
        return super().hasClientWithId(clientId)
to avoid opening the same file again. This works, but it is pretty difficult to write new functions for this class, because you always need to be aware if the function call is a call from within your class or from somewhere else. Also this seems pretty inefficient, because you read and write the entire file, even when you only add one client.
3. Read the file only once and modify the underlying ClientRepo
class ClientFileRepository(ClientRepo):
    def __init__(self, fileName) -> None:
        super().__init__()
        self.__fileName = fileName
        self.__loadRepo()

    # No hasClientWithId needed

    def addClient(self, client):
        super().addClient(client)
        self.__storeRepo()

    def __loadRepo(self):
        with open(self.__fileName) as file:
            for line in file:
                splitLine = line.split()
                clientToAdd = ClientDAO(splitLine[1])
                clientToAdd.setClientId(int(splitLine[0]))
                super().addClientWithId(clientToAdd)

    def __storeRepo(self):
        with open(self.__fileName, "w") as file:
            file.write("")
            for client in super().getList():
                file.write(self.clientToString(client))
This obviously assumes that the file is not changed by someone else between calls to addClient and the program still overwrites the entire file for every addClient. If this is a problem for you it is best to be explicit and make loadRepo and storeRepo public. Then the programmer using this class can decide when loading and saving are necessary and useful. You can use context managers for this.
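A minimal sketch of that last suggestion, assuming loadRepo and storeRepo have been made public and addClient no longer stores on every call (bulkEdit is a hypothetical name):
from contextlib import contextmanager


class ClientFileRepository(ClientRepo):
    # ... __init__, loadRepo, storeRepo as above, made public ...

    @contextmanager
    def bulkEdit(self):
        # One read up front and one write at the end, no matter how many changes.
        self.loadRepo()
        try:
            yield self
        finally:
            self.storeRepo()


# Usage:
# with repo.bulkEdit() as r:
#     r.addClient(clientA)
#     r.addClient(clientB)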
Extra: Read and save the file for every method
You can use function decorators to use solution 2 without writing the same code for every function:
import functools


def loadAndStore(function):
    @functools.wraps(function)
    def wrappedFunction(self, *args, **kwargs):
        # Caveat: name mangling only happens inside a class body, so from this
        # module-level decorator the double-underscore attributes below will not
        # resolve to the class's mangled names; use single underscores
        # (see the hints at the end) for this to actually work.
        if self.__isFileOpen:
            return function(self, *args, **kwargs)
        else:
            self.__isFileOpen = True
            self.__loadRepo()
            try:
                return function(self, *args, **kwargs)
            except Exception as e:  # Only catch expected exceptions
                raise
            finally:
                self.__storeRepo()
                self.clear()  # some cleanup
                self.__isFileOpen = False
    return wrappedFunction
class ClientFileRepository(ClientRepo):
    def __init__(self, fileName) -> None:
        super().__init__()
        self.__fileName = fileName
        self.__isFileOpen = False

    @loadAndStore
    def hasClientWithId(self, clientId):
        return super().hasClientWithId(clientId)

    @loadAndStore
    def addClient(self, client):
        super().addClient(client)

    def __loadRepo(self):
        with open(self.__fileName) as file:
            for line in file:
                splitLine = line.split()
                clientToAdd = ClientDAO(splitLine[1])
                clientToAdd.setClientId(int(splitLine[0]))
                super().addClientWithId(clientToAdd)

    def __storeRepo(self):
        with open(self.__fileName, "w") as file:
            file.write("")
            for client in super().getList():
                file.write(self.clientToString(client))
Be careful here, using this is not very intuitive. For example self.__isFileOpen is defined in __init__, but none of the methods below directly use it. Instead its use is hidden in the loadAndStore decorator.
Some quick hints at the end:
type(client).__name__ == 'ClientDAO' is bad practice. Use isinstance(client, ClientDAO) to fully adopt OOP (see the short sketch after this list).
If this is not part of a bigger project with given naming conventions, follow the Python style guide (PEP 8).
Using private variables like __fileName is generally considered unnecessary; just prefix the variable with one underscore to indicate "internal use". The same is true for functions.
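A tiny sketch illustrating the first and last hints (the attribute name is illustrative, not from the original code):
class ClientRepo(Repository):
    def __init__(self) -> None:
        self._client_list = []  # single leading underscore: "internal use", no name mangling

    def addClient(self, client):
        if not isinstance(client, ClientDAO):  # isinstance instead of comparing type names
            raise TypeError(f"expected ClientDAO, got {type(client).__name__}")
        ...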

creating a temporary file in python with FUSE

I am trying to write a program using python-fuse, but I can't get file writing down.
My file_class looks like this:
import os
import tempfile

# debug() and apiCall() are helpers defined elsewhere in my program.


class FuseFile(object):
    def __init__(self, path, flags, *mode):
        debug(path)
        # debug(mode)
        self.file = tempfile.TemporaryFile(*mode)
        self.fd = self.file.fileno()
        self.path = path

    def write(self, buf, offset):
        head, tail = os.path.split(self.path)
        self.file.seek(offset)
        self.file.write(buf)
        return len(buf)

    def read(self, length, offset):
        file = apiCall("readfile", {"file": self.path}).read()
        slen = len(file)
        if length < slen:
            if offset + length > slen:
                length = slen - offset
            buf = file[offset:offset + length]
        else:
            buf = ''
        return file  # I don't know if this buf stuff is necessary...

    def ftruncate(self, len):
        self.file.truncate(len)

    def release(self, flags):
        self.file.close()

    def flush(self):
        self._fflush()

    def fsync(self, isfsyncfile):
        self._fflush()
        if isfsyncfile and hasattr(os, 'fdatasync'):
            os.fdatasync(self.fd)
        else:
            os.fsync(self.fd)

    def _fflush(self):
        if 'w' in self.file.mode or 'a' in self.file.mode:
            self.file.flush()
but when I try to edit the file in an editor like Vim, I get this:
"mnt/stuff.txt" E514: write error (file system full?)
WARNING: Original file may be lost or damaged
don't quit the editor until the file is successfully written!
[EDIT] I found the problem: I didn't have an open method. Even so, I eventually took out the file_class and implemented the methods in the main FUSE class, because that seems to work better.
I eventually found out that the problem was that I hadn't created an open() or create() method in my file class, but I settled on implementing all the methods in the main FUSE class because the file_class didn't seem to be working for me.

Python: Creating a streaming gzip'd file-like?

I'm trying to figure out the best way to compress a stream with Python's zlib.
I've got a file-like input stream (input, below) and an output function which accepts a file-like (output_function, below):
with open("file") as input:
output_function(input)
And I'd like to gzip-compress input chunks before sending them to output_function:
with open("file") as input:
output_function(gzip_stream(input))
It looks like the gzip module assumes that either the input or the output will be a gzip'd file-on-disk… So I assume that the zlib module is what I want.
However, it doesn't natively offer a simple way to create a stream file-like… And the stream-compression it does support comes by way of manually adding data to a compression buffer, then flushing that buffer.
Of course, I could write a wrapper around zlib.Compress.compress and zlib.Compress.flush (Compress is returned by zlib.compressobj()), but I'd be worried about getting buffer sizes wrong, or something similar.
So, what's the simplest way to create a streaming, gzip-compressing file-like with Python?
Edit: To clarify, the input stream and the compressed output stream are both too large to fit in memory, so something like output_function(StringIO(zlib.compress(input.read()))) doesn't really solve the problem.
It's quite kludgy (self-referencing, etc.; I just put a few minutes into writing it, nothing really elegant), but it does what you want if you're still interested in using gzip instead of zlib directly.
Basically, GzipWrap is a (very limited) file-like object that produces a gzipped file out of a given iterable (e.g., a file-like object, a list of strings, any generator...).
Of course, it produces binary, so there was no sense in implementing "readline".
You should be able to expand it to cover other cases or to use it as an iterable object itself.
from gzip import GzipFile


class GzipWrap(object):
    # input is a file-like object that feeds the input
    def __init__(self, input, filename=None):
        self.input = input
        self.buffer = ''
        self.zipper = GzipFile(filename, mode='wb', fileobj=self)

    def read(self, size=-1):
        if (size < 0) or len(self.buffer) < size:
            for s in self.input:
                self.zipper.write(s)
                if size > 0 and len(self.buffer) >= size:
                    self.zipper.flush()
                    break
            else:
                self.zipper.close()
        if size < 0:
            ret = self.buffer
            self.buffer = ''
        else:
            ret, self.buffer = self.buffer[:size], self.buffer[size:]
        return ret

    def flush(self):
        pass

    def write(self, data):
        self.buffer += data

    def close(self):
        self.input.close()
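Hooked into the question's setup, usage would look roughly like this (output_function is the caller's function from the question):
with open("file") as input:
    output_function(GzipWrap(input))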
Here is a cleaner, non-self-referencing version based on Ricardo Cárdenes' very helpful answer.
from gzip import GzipFile
from collections import deque

CHUNK = 16 * 1024


class Buffer(object):
    def __init__(self):
        self.__buf = deque()
        self.__size = 0

    def __len__(self):
        return self.__size

    def write(self, data):
        self.__buf.append(data)
        self.__size += len(data)

    def read(self, size=-1):
        if size < 0:
            size = self.__size
        ret_list = []
        while size > 0 and len(self.__buf):
            s = self.__buf.popleft()
            size -= len(s)
            ret_list.append(s)
        if size < 0:
            ret_list[-1], remainder = ret_list[-1][:size], ret_list[-1][size:]
            self.__buf.appendleft(remainder)
        ret = ''.join(ret_list)
        self.__size -= len(ret)
        return ret

    def flush(self):
        pass

    def close(self):
        pass


class GzipCompressReadStream(object):
    def __init__(self, fileobj):
        self.__input = fileobj
        self.__buf = Buffer()
        self.__gzip = GzipFile(None, mode='wb', fileobj=self.__buf)

    def read(self, size=-1):
        while size < 0 or len(self.__buf) < size:
            s = self.__input.read(CHUNK)
            if not s:
                self.__gzip.close()
                break
            self.__gzip.write(s)
        return self.__buf.read(size)
Advantages:
Avoids repeated string concatenation, which would cause the entire string to be copied repeatedly.
Reads a fixed CHUNK size from the input stream, instead of reading whole lines at a time (which can be arbitrarily long).
Avoids circular references.
Avoids the misleading public "write" method of GzipWrap, which is really only used internally.
Takes advantage of name mangling for internal member variables.
The gzip module supports compressing to a file-like object: pass a fileobj parameter to GzipFile, as well as a filename. The filename you pass in doesn't need to exist, but the gzip header has a filename field which needs to be filled out.
Update
This answer does not work. Example:
# tmp/try-gzip.py
import sys
import gzip

fd = gzip.GzipFile(fileobj=sys.stdin)

sys.stdout.write(fd.read())
output:
===> cat .bash_history | python tmp/try-gzip.py > tmp/history.gzip
Traceback (most recent call last):
  File "tmp/try-gzip.py", line 7, in <module>
    sys.stdout.write(fd.read())
  File "/usr/lib/python2.7/gzip.py", line 254, in read
    self._read(readsize)
  File "/usr/lib/python2.7/gzip.py", line 288, in _read
    pos = self.fileobj.tell() # Save current position
IOError: [Errno 29] Illegal seek
Use the cStringIO (or StringIO) module in conjunction with zlib:
>>> import zlib
>>> from cStringIO import StringIO
>>> s = StringIO()
>>> s.write(zlib.compress("I'm a lumberjack"))
>>> s.seek(0)
>>> zlib.decompress(s.read())
"I'm a lumberjack"
This works (at least in Python 3):
with s3.open(path, 'wb') as f:
    gz = gzip.GzipFile(filename, 'wb', 9, f)
    gz.write(b'hello')
    gz.flush()
    gz.close()
Here it writes to s3fs's file object with gzip compression on it.
The magic is the f parameter, which is GzipFile's fileobj. You have to provide a file name for gzip's header.
An even cleaner & more generalized version made of reusable components:
gzipped_iter = igzip(io_iter(input_file_obj))
gzipped_file_obj = iter_io(prefetch(gzipped_iter))
The functions above are from my gist:
iter_io and io_iter provide transparent conversion to/from Iterable[AnyStr] <-> SupportsRead[AnyStr]
igzip does streaming gzip compression
(optional) prefetch concurrently pulls from an underlying iterable via a thread, yielding to consumer as normal, for concurrent read/write
import io
import zlib
from queue import Queue
from threading import Thread
from typing import Any, AnyStr, Iterable, Iterator

# SupportsRead lives in _typeshed and only exists for type checkers;
# at runtime any object with a .read() method works.


def as_bytes(s: str | bytes):
    if type(s) not in [str, bytes]:
        raise TypeError
    return s.encode() if isinstance(s, str) else s


def iter_io(iterable: Iterable[AnyStr], buffer_size: int = io.DEFAULT_BUFFER_SIZE):
    """
    Returns a buffered file obj that reads bytes from an iterable of str/bytes.

    Example:
    iter_io(['abc', 'def', 'g']).read() == b'abcdefg'
    iter_io([b'abcd', b'efg']).read(5) == b'abcde'
    """
    class IterIO(io.RawIOBase):
        def __init__(self, iterable: Iterable[AnyStr]):
            self._leftover = b''
            self._iterable = (as_bytes(s) for s in iterable if s)

        def readable(self):
            return True

        def readinto(self, buf):
            try:
                chunk = self._leftover or next(self._iterable)
            except StopIteration:
                return 0  # indicate EOF
            output, self._leftover = chunk[:len(buf)], chunk[len(buf):]
            buf[:len(output)] = output
            return len(output)

    return io.BufferedReader(IterIO(iterable), buffer_size=buffer_size)


def io_iter(fo: 'SupportsRead[AnyStr]', size: int = io.DEFAULT_BUFFER_SIZE):
    """
    Returns an iterator that reads from a file obj in sized chunks.

    Example:
    list(io_iter(io.StringIO('abcdefg'), 3)) == ['abc', 'def', 'g']
    list(io_iter(io.BytesIO(b'abcdefg'), 4)) == [b'abcd', b'efg']

    Usage notes/TODO:
      * file obj isn't closed, fix /w keep_open=False and an internal contextmanager
    """
    return iter(lambda: fo.read(size), fo.read(0))


def igzip(chunks: Iterable[AnyStr], level=zlib.Z_DEFAULT_COMPRESSION):
    """
    Streaming gzip: lazily compresses an iterable of bytes or str (utf8)

    Example:
    gzipped_bytes_iter = igzip(['hello ', 'world!'])
    gzip.decompress(b''.join(gzipped_bytes_iter)) == b'hello world!'
    """
    def gen():
        gzip_format = 0b10000
        c = zlib.compressobj(level=level, wbits=zlib.MAX_WBITS + gzip_format)
        yield from (c.compress(as_bytes(chunk)) for chunk in chunks)
        yield c.flush()
    return filter(None, gen())


def prefetch(iterable: Iterable[Any], n: int = 1) -> Iterator[Any]:
    """
    Prefetch an iterable via thread, yielding original contents as normal.

    Example:
    def slow_produce(*args):
        for x in args:
            time.sleep(1)
            yield x

    def slow_consume(iterable):
        for _ in iterable:
            time.sleep(1)

    slow_consume(prefetch(slow_produce('a', 'b')))  # takes 3 sec, not 4

    # Prefetch
    # produce: | 'a' | 'b' |
    # consume:       | 'a' | 'b' |
    # seconds: 0 --- 1 --- 2 --- 3

    # No prefetch
    # produce: | 'a' |       | 'b' |
    # consume:       | 'a' |       | 'b' |
    # seconds: 0 --- 1 --- 2 --- 3 --- 4

    Usage notes/TODO:
      * mem leak: Thread is GC'd only after iterable is fully consumed, fix /w __del__
    """
    queue = Queue(n)
    finished = object()

    def produce():
        for x in iterable:
            queue.put(x)
        queue.put(finished)

    t = Thread(target=produce, daemon=True)
    t.start()

    while True:
        item = queue.get()
        if item is finished:
            break
        else:
            yield item
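Tying it back to the original question, a hedged usage sketch (output_function is the caller's function from the question):
with open("file", "rb") as input:
    # io_iter reads fixed-size chunks, igzip compresses them lazily,
    # iter_io exposes the compressed stream as a readable file-like object.
    output_function(iter_io(igzip(io_iter(input))))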
