Streaming upload of a slice of a (large) file using python-requests

I need to do a streaming upload (i.e., not load the full file part in memory) of a slice of a large (multi-GB) file, using python-requests.
I've looked around in the docs and on Stack Overflow, and haven't found a working way to do it (again, without loading the full slice into memory).
Here's the code I have:
import io
import os
from contextlib import AbstractContextManager


class FileSlice(AbstractContextManager):
    """
    File-like object that only reads a slice of a file.
    Inspired by stackoverflow.com/a/29838711/593036, but actually works.
    """

    def __init__(self, filepath: str, seek_from: int, read_limit: int):
        self.filepath = filepath
        self.seek_from = seek_from
        self.read_limit = read_limit
        self.n_seen = 0

    def __enter__(self):
        self.f = open(self.filepath, "rb")
        self.f.seek(self.seek_from)
        return self

    def __len__(self):
        total_length = os.fstat(self.f.fileno()).st_size
        return min(self.read_limit, total_length - self.seek_from)

    def read(self, n=-1):
        if self.n_seen >= self.read_limit:
            return b""
        remaining_amount = self.read_limit - self.n_seen
        n_to_read = remaining_amount if n < 0 else min(n, remaining_amount)
        self.n_seen += n_to_read
        return self.f.read(n_to_read)

    def __iter__(self):
        yield self.read(n=io.DEFAULT_BUFFER_SIZE)

    def __exit__(self, *args):
        self.f.close()
Then the actual request:
with FileSlice(filepath, seek_from=i * chunk_size, read_limit=chunk_size) as data:
    r = requests.put(presigned_url, data=data)
    r.raise_for_status()
This seems pretty complex, so I'm wondering whether I'm missing a simpler way, and whether my approach is correct.
Thank you!
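For context, a simpler-looking alternative (a hedged sketch, not an answer from this thread) is to hand requests a plain generator: requests will stream it, but then uses chunked transfer encoding with no Content-Length header, which some presigned-URL backends may reject, whereas the file-like approach above can report a length via __len__.
import io
import requests

def file_slice_chunks(filepath, seek_from, read_limit, chunk=io.DEFAULT_BUFFER_SIZE):
    # Yield the requested slice of the file in small pieces.
    with open(filepath, "rb") as f:
        f.seek(seek_from)
        remaining = read_limit
        while remaining > 0:
            data = f.read(min(chunk, remaining))
            if not data:
                break
            remaining -= len(data)
            yield data

r = requests.put(presigned_url, data=file_slice_chunks(filepath, i * chunk_size, chunk_size))
r.raise_for_status()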

Related

Boto3 - possible to upload a generator object to S3?

I would like to upload the results of a generator to s3 while fully taking advantage of the generator.
A minimal example of what I'm trying to do is:
def gen():
    for i in ['a', 'b', 'c', 'd']:
        yield i

s3_object.put?(data=gen())
I've seen examples of putting local filepaths into .put, but haven't seen anything regarding generators.
Ideally this would fully take advantage of the generator so I don't have to write all the data to disk or memory.
Is something like this possible?
No, this is currently not possible. According to the docs, the Body parameter only accepts bytes or a seekable file-like object.
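For reference, a minimal sketch of what the documented API does accept (the bucket and key names are placeholders); the generator has to be materialized or spooled to a file first, which defeats the streaming goal:
import io
import boto3

s3 = boto3.client("s3")
# Body must be bytes or a seekable file-like object, so the generator's
# output is collected up front here.
body = io.BytesIO("".join(gen()).encode("utf8"))
s3.put_object(Bucket="my-bucket", Key="my-key", Body=body)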
Yes, you can do this using a multipart upload. The minimum size for a "part" is 5 MiB (except for the last one), so you need to build a buffer that holds the data until 5 MiB is reached.
Here is a class that gets the job done:
import boto3


class MultipartUpload:
    def __init__(self, bucket, key, client=None, encoding="utf8"):
        self.bucket = bucket
        self.key = key
        self.client = client or boto3.client("s3")
        self.encoding = encoding
        upload = self.client.create_multipart_upload(Bucket=bucket, Key=key)
        self.upload_id = upload["UploadId"]
        self.part = 1
        self.parts_container = []
        self.buffer = b''

    def write(self, content):
        if isinstance(content, str):
            self.buffer += content.encode(self.encoding)
        elif isinstance(content, bytes):
            self.buffer += content
        else:
            raise TypeError(f"Received bad data of type {type(content)}. Must be bytes or string")
        if len(self.buffer) > 5 * 1024 * 1024:  # 5 MiB minimum part size
            self.commit()

    def commit(self):
        print("Sending multipart")
        resp = self.client.upload_part(
            Body=self.buffer, Bucket=self.bucket, Key=self.key,
            PartNumber=self.part, UploadId=self.upload_id
        )
        self.parts_container.append(
            {"ETag": resp["ETag"], "PartNumber": self.part}
        )
        self.buffer = b''
        self.part += 1

    def save(self):
        if self.buffer:  # If we have leftover data, commit it first
            self.commit()
        return self.client.complete_multipart_upload(
            Bucket=self.bucket, Key=self.key, UploadId=self.upload_id,
            MultipartUpload={"Parts": self.parts_container}
        )

    def abort(self):
        self.client.abort_multipart_upload(
            Bucket=self.bucket, Key=self.key, UploadId=self.upload_id
        )

    def __enter__(self):
        return self

    def __exit__(self, type_, value, tb):
        if type_:
            self.abort()
        else:
            self.save()
You could of course improve this for things like threading and further optimisation. You would use it like a file handle:
with MultipartUpload("your-bucket", "file.txt") as upload:
    for data in gen():
        upload.write(data)

Using a custom object in pandas.read_csv()

I am interested in streaming a custom object into a pandas dataframe. According to the documentation, any object with a read() method can be used. However, even after implementing this function I am still getting this error:
ValueError: Invalid file path or buffer object type: <class '__main__.DataFile'>
Here is a simple version of the object, and how I am calling it:
class DataFile(object):
    def __init__(self, files):
        self.files = files

    def read(self):
        for file_name in self.files:
            with open(file_name, 'r') as file:
                for line in file:
                    yield line

import pandas as pd

hours = ['file1.csv', 'file2.csv', 'file3.csv']
data = DataFile(hours)
df = pd.read_csv(data)
Am I missing something, or is it just not possible to use a custom generator in Pandas? When I call the read() method it works just fine.
EDIT:
The reason I want to use a custom object rather than concatenating the dataframes together is to see if it is possible to reduce memory usage. I have used the gensim library in the past, and it makes it really easy to use custom data objects, so I was hoping to find some similar approach.
One way to make a file-like object in Python 3 is by subclassing io.RawIOBase.
Then, using Mechanical snail's iterstream,
you can convert any iterable of bytes into a file-like object:
import tempfile
import io
import pandas as pd


def iterstream(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
    """
    http://stackoverflow.com/a/20260030/190597 (Mechanical snail)
    Lets you use an iterable (e.g. a generator) that yields bytestrings as a
    read-only input stream.

    The stream implements Python 3's newer I/O API (available in Python 2's io
    module).

    For efficiency, the stream is buffered.
    """
    class IterStream(io.RawIOBase):
        def __init__(self):
            self.leftover = None

        def readable(self):
            return True

        def readinto(self, b):
            try:
                l = len(b)  # We're supposed to return at most this much
                chunk = self.leftover or next(iterable)
                output, self.leftover = chunk[:l], chunk[l:]
                b[:len(output)] = output
                return len(output)
            except StopIteration:
                return 0  # indicate EOF

    return io.BufferedReader(IterStream(), buffer_size=buffer_size)
class DataFile(object):
    def __init__(self, files):
        self.files = files

    def read(self):
        for file_name in self.files:
            with open(file_name, 'rb') as f:
                for line in f:
                    yield line


def make_files(num):
    filenames = []
    for i in range(num):
        with tempfile.NamedTemporaryFile(mode='wb', delete=False) as f:
            f.write(b'''1,2,3\n4,5,6\n''')
            filenames.append(f.name)
    return filenames


# hours = ['file1.csv', 'file2.csv', 'file3.csv']
hours = make_files(3)
print(hours)
data = DataFile(hours)
df = pd.read_csv(iterstream(data.read()), header=None)
print(df)
prints
   0  1  2
0  1  2  3
1  4  5  6
2  1  2  3
3  4  5  6
4  1  2  3
5  4  5  6
The documentation mentions the read method, but what pandas actually checks is whether the argument is_file_like (that's where the exception is thrown). That function is actually very simple:
def is_file_like(obj):
    if not (hasattr(obj, 'read') or hasattr(obj, 'write')):
        return False
    if not hasattr(obj, "__iter__"):
        return False
    return True
So it also needs an __iter__ method.
But that's not the only problem. Pandas requires that the object actually behave like a file, so the read method should accept an additional argument for the number of bytes to read (which means you can't make read a generator: it has to be callable with two arguments and should return a string).
So for example:
class DataFile(object):
    def __init__(self, files):
        self.data = """a b
1 2
2 3
"""
        self.pos = 0

    def read(self, x):
        nxt = self.pos + x
        ret = self.data[self.pos:nxt]
        self.pos = nxt
        return ret

    def __iter__(self):
        yield from self.data.split('\n')
will be recognized as valid input.
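For instance, a quick check with the class above (the files argument is ignored by this toy version, and sep=' ' matches its space-separated sample data); at least with the pandas versions this answer targets, it should parse into two columns named a and b:
import pandas as pd

df = pd.read_csv(DataFile(None), sep=' ')
print(df.columns.tolist())  # expected: ['a', 'b']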
However, it's harder for multiple files. I hoped that fileinput would have some appropriate routines, but it doesn't seem to:
import fileinput

pd.read_csv(fileinput.input([...]))
# ValueError: Invalid file path or buffer object type: <class 'fileinput.FileInput'>
How about this alternative approach:
def get_merged_csv(flist, **kwargs):
    return pd.concat([pd.read_csv(f, **kwargs) for f in flist], ignore_index=True)

df = get_merged_csv(hours)

How can I apply changes to source files with Python?

I am refactoring C++ code using the Python bindings of the Clang compiler (cindex). Using that, I analyze the AST and prepare changes. I end up with a list of operations similar to the following:
DELETE line 10
INSERT line 5 column 32: << "tofu"
REPLACE from line 31 column 6 to line 33 column 82 with: std::cout << "Thanks SO"
...
My question is how to turn these into actual file changes.
Doing it directly in Python seems tedious: patches need to be applied in the right order and checked for consistency. It looks quite hard and error-prone.
I also can't find a good library to help (Clang does have something called a Rewriter, but it isn't wrapped in Python, and I'd really like to avoid C++ for refactoring if possible).
One idea could be to generate patches and apply them with git, but even that seems a bit tedious.
Any ideas?
So I rolled my own. The code is almost certainly buggy and not very pretty, but I'm posting it in the hope it might help someone until a better solution is found.
class PatchRecord(object):
    """Record patches, validate them, order them, and apply them."""
    # Note: format_place() and format_extent() are small formatting helpers
    # that are not shown in this snippet.

    def __init__(self):
        # output of readlines() for each patched file
        self.lines = {}
        # list of patches for each patched file
        self.patches = {}

    class Patch(object):
        """Abstract base class for editing operations."""

        def __init__(self, filename, start, end):
            self.filename = filename
            self.start = start
            self.end = end

        def __repr__(self):
            return "{op}: {filename} {start}/{end} {what}".format(
                op=self.__class__.__name__.upper(),
                filename=self.filename,
                start=format_place(self.start),
                end=format_place(self.end),
                what=getattr(self, "what", ""))

        def apply(self, lines):
            print("Warning: applying no-op patch")

    class Delete(Patch):
        def __init__(self, filename, extent):
            super(PatchRecord.Delete, self).__init__(
                filename, extent.start, extent.end)
            print("DELETE: {file} {extent}".format(file=self.filename,
                                                   extent=format_extent(extent)))

        def apply(self, lines):
            lines[self.start.line - 1:self.end.line] = [
                lines[self.start.line - 1][:self.start.column - 1] +
                lines[self.end.line - 1][self.end.column:]]

    class Insert(Patch):
        def __init__(self, filename, start, what):
            super(PatchRecord.Insert, self).__init__(filename, start, start)
            self.what = what
            print("INSERT {where} {what}".format(what=what, where=format_place(self.start)))

        def apply(self, lines):
            line = lines[self.start.line - 1]
            lines[self.start.line - 1] = "%s%s%s" % (
                line[:self.start.column],
                self.what,
                line[self.start.column:])

    class Replace(Patch):
        def __init__(self, filename, extent, what):
            super(PatchRecord.Replace, self).__init__(
                filename, extent.start, extent.end)
            self.what = what
            print("REPLACE: {where} {what}".format(what=what,
                                                   where=format_extent(extent)))

        def apply(self, lines):
            lines[self.start.line - 1:self.end.line] = [
                lines[self.start.line - 1][:self.start.column - 1] +
                self.what +
                lines[self.end.line - 1][self.end.column - 1:]]

    # Convenience functions for creating patches
    def delete(self, filename, extent):
        self.patches[filename] = self.patches.get(
            filename, []) + [self.Delete(filename, extent)]

    def insert(self, filename, where, what):
        self.patches[filename] = self.patches.get(
            filename, []) + [self.Insert(filename, where, what)]

    def replace(self, filename, extent, what):
        self.patches[filename] = self.patches.get(
            filename, []) + [self.Replace(filename, extent, what)]

    def _pos_to_tuple(self, position):
        """Convert a source location to a tuple for use as a sorting key."""
        return (position.line, position.column)

    def sort(self, filename):
        """Sort patches by extent start."""
        self.patches[filename].sort(key=lambda p: self._pos_to_tuple(p.start))

    def validate(self, filename):
        """Try to ensure patches are consistent."""
        print("Checking patches for %s" % filename)
        self.sort(filename)
        previous = self.patches[filename][0]
        for p in self.patches[filename][1:]:
            assert(self._pos_to_tuple(p.start) >
                   self._pos_to_tuple(previous.start))
            previous = p

    def _apply(self, filename):
        self.sort(filename)
        lines = self._getlines(filename)
        for p in reversed(self.patches[filename]):
            print(p)
            p.apply(lines)

    def _getlines(self, filename):
        """Get source file lines for editing."""
        if filename not in self.lines:
            with open(filename) as f:
                self.lines[filename] = f.readlines()
        return self.lines[filename]

    def apply(self):
        for filename in self.patches:
            self.validate(filename)
            self._apply(filename)
            # with open(filename + ".patched", "w") as output:
            with open(filename, "w") as output:
                output.write("".join(self._getlines(filename)))
Just create a PatchRecord object, add changes using the delete, insert and replace methods, and apply them with apply() when you're ready.
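For illustration, a hypothetical end-to-end sketch using clang.cindex (the file name, compiler flags and renaming rule are made up, and the format_place/format_extent helpers used by the class are assumed to be defined); the extents passed to replace() come straight from the cursors:
import clang.cindex as ci

index = ci.Index.create()
tu = index.parse("example.cpp", args=["-std=c++17"])

patches = PatchRecord()
for node in tu.cursor.walk_preorder():
    # Made-up refactoring rule: rewrite every call to a function named "foo".
    if node.kind == ci.CursorKind.CALL_EXPR and node.spelling == "foo":
        patches.replace("example.cpp", node.extent, 'std::cout << "Thanks SO"')
patches.apply()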

Stacking filters around a file object

I would like to stack filters around an open() call. These filters are supposed, for example, to change every a character encountered into b in the stream read from the file.
For example, here is a code sample:
def filter(stream):
    for line in stream:
        yield line.replace('a', 'b')

def add_filter(filter, file):
    return io.TextIOWrapper(filter(file))

def processing_file(f):
    import sys
    for line in f:
        sys.stdout.write("aa: " + line)

f = open('./example.txt', 'r')
f = add_filter(filter, f)
processing_file(f)
I guess that the add_filter() function should return a TextIOWrapper to mimic the result of an open() call. But I keep getting the following error message:
AttributeError: 'generator' object has no attribute 'readable'
In fact, I understand the error, but I do not know how to work around it and make things work properly.
You can iterate directly over the filter generator:
with open('./example.txt', 'r') as f:
    for line in filter(f):
        sys.stdout.write("aa: " + line)
I came up with a solution to my own question... First, I have to admit that my question was not totally well formed and may have lacked precision, so I do not blame anybody for having passed over it.
My original intention was to come up with a stackable framework of filters over a stream (open()), while also trying to make it easy to use.
I mainly found inspiration in this answer on StackOverflow which was solving about 90% of my problem.
So, imagine we have two filters (which are coded as generators):
def tab_filter(stream):
    for line in stream:
        yield line.replace('\t', ' ' * 8)

def a_filter(stream):
    for line in stream:
        yield line.replace('a', 'z')
Then, we have this class allowing to wrap a generator inside a stream:
class IterStream(object):
    "File-like streaming iterator."

    def __init__(self, generator):
        self.generator = generator
        self.iterator = iter(generator)
        self.leftover = ''

    def __len__(self):
        return self.generator.__len__()

    def __iter__(self):
        return self.iterator

    def __next__(self):
        return next(self.iterator)

    def read(self, size):
        data = self.leftover
        count = len(self.leftover)
        try:
            while count < size:
                chunk = next(self.iterator)
                data += chunk
                count += len(chunk)
        except StopIteration:
            self.leftover = ''
            return data
        self.leftover = data[size:]
        return data[:size]
Using it in the code will be as follows:
import sys

f = IterStream(a_filter(IterStream(tab_filter(open('Example.txt', 'r')))))
for line in f:
    sys.stdout.write("aa: " + line)
But this is not yet totally satisfactory, because it requires a lot of clumsy function stacking. So I decided to wrap it inside a decorator:
def streamfilter(filter):
    def stream(iostream):
        return IterStream(filter(iostream))
    return stream

@streamfilter
def tab_filter(stream):
    for line in stream:
        yield line.replace('\t', ' ' * 8)

@streamfilter
def a_filter(stream):
    for line in stream:
        yield line.replace('a', 'z')
Then, using the code is much easier now:
import sys

f = a_filter(tab_filter(open('Example.txt', 'r')))
for line in f:
    sys.stdout.write("aa: " + line)
I hope that some of you will find these few lines useful.

Python: Creating a streaming gzip'd file-like?

I'm trying to figure out the best way to compress a stream with Python's zlib.
I've got a file-like input stream (input, below) and an output function which accepts a file-like (output_function, below):
with open("file") as input:
output_function(input)
And I'd like to gzip-compress input chunks before sending them to output_function:
with open("file") as input:
output_function(gzip_stream(input))
It looks like the gzip module assumes that either the input or the output will be a gzip'd file-on-disk… So I assume that the zlib module is what I want.
However, it doesn't natively offer a simple way to create a file-like stream… and the stream compression it does support comes by way of manually adding data to a compression buffer, then flushing that buffer.
Of course, I could write a wrapper around zlib.Compress.compress and zlib.Compress.flush (Compress is returned by zlib.compressobj()), but I'd be worried about getting buffer sizes wrong, or something similar.
So, what's the simplest way to create a streaming, gzip-compressing file-like with Python?
Edit: To clarify, the input stream and the compressed output stream are both too large to fit in memory, so something like output_function(StringIO(zlib.compress(input.read()))) doesn't really solve the problem.
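For reference, the compressobj wrapper idea mentioned above can be sketched as a generator of compressed chunks (a hedged outline only, using the Python 3 zlib API; the answers below turn the same idea into a proper file-like object):
import zlib

def gzip_chunks(fileobj, chunk_size=16 * 1024):
    # wbits = MAX_WBITS + 16 asks zlib to emit a gzip header and trailer.
    compressor = zlib.compressobj(wbits=zlib.MAX_WBITS + 16)
    while True:
        chunk = fileobj.read(chunk_size)
        if not chunk:
            break
        data = compressor.compress(chunk)
        if data:
            yield data
    yield compressor.flush()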
It's quite kludgy (self-referencing, etc.; I just spent a few minutes writing it, nothing really elegant), but it does what you want if you're still interested in using gzip instead of zlib directly.
Basically, GzipWrap is a (very limited) file-like object that produces a gzipped file out of a given iterable (e.g., a file-like object, a list of strings, any generator...)
Of course, it produces binary so there was no sense in implementing "readline".
You should be able to expand it to cover other cases or to be used as an iterable object itself.
from gzip import GzipFile

class GzipWrap(object):
    # input is a file-like object that feeds the input
    def __init__(self, input, filename=None):
        self.input = input
        self.buffer = b''  # bytes: GzipFile writes compressed bytes back into this object
        self.zipper = GzipFile(filename, mode='wb', fileobj=self)

    def read(self, size=-1):
        if (size < 0) or len(self.buffer) < size:
            for s in self.input:
                self.zipper.write(s)
                if size > 0 and len(self.buffer) >= size:
                    self.zipper.flush()
                    break
            else:
                self.zipper.close()
        if size < 0:
            ret = self.buffer
            self.buffer = b''
        else:
            ret, self.buffer = self.buffer[:size], self.buffer[size:]
        return ret

    def flush(self):
        pass

    def write(self, data):
        self.buffer += data

    def close(self):
        self.input.close()
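Hypothetical usage against the question's output_function, assuming a binary input stream:
with open("file", "rb") as input_stream:
    output_function(GzipWrap(input_stream, filename="file"))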
Here is a cleaner, non-self-referencing version based on Ricardo Cárdenes' very helpful answer.
from gzip import GzipFile
from collections import deque

CHUNK = 16 * 1024

class Buffer(object):
    def __init__(self):
        self.__buf = deque()
        self.__size = 0

    def __len__(self):
        return self.__size

    def write(self, data):
        self.__buf.append(data)
        self.__size += len(data)

    def read(self, size=-1):
        if size < 0:
            size = self.__size
        ret_list = []
        while size > 0 and len(self.__buf):
            s = self.__buf.popleft()
            size -= len(s)
            ret_list.append(s)
        if size < 0:
            ret_list[-1], remainder = ret_list[-1][:size], ret_list[-1][size:]
            self.__buf.appendleft(remainder)
        ret = b''.join(ret_list)
        self.__size -= len(ret)
        return ret

    def flush(self):
        pass

    def close(self):
        pass

class GzipCompressReadStream(object):
    def __init__(self, fileobj):
        self.__input = fileobj
        self.__buf = Buffer()
        self.__gzip = GzipFile(None, mode='wb', fileobj=self.__buf)

    def read(self, size=-1):
        while size < 0 or len(self.__buf) < size:
            s = self.__input.read(CHUNK)
            if not s:
                self.__gzip.close()
                break
            self.__gzip.write(s)
        return self.__buf.read(size)
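Hypothetical usage, again against the question's output_function:
with open("file", "rb") as input_stream:
    output_function(GzipCompressReadStream(input_stream))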
Advantages:
Avoids repeated string concatenation, which would cause the entire string to be copied repeatedly.
Reads a fixed CHUNK size from the input stream, instead of reading whole lines at a time (which can be arbitrarily long).
Avoids circular references.
Avoids a misleading public write() method on GzipCompressReadStream(), which would really only be used internally.
Takes advantage of name mangling for internal member variables.
The gzip module supports compressing to a file-like object: pass a fileobj parameter to GzipFile, as well as a filename. The filename you pass in doesn't need to exist; the gzip header simply has a filename field which needs to be filled out.
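A minimal sketch of that compressing direction (io.BytesIO here is just a stand-in for any writable file-like object):
import gzip
import io

out = io.BytesIO()
with gzip.GzipFile(filename="file.txt", mode="wb", fileobj=out) as gz:
    gz.write(b"hello world")
compressed_bytes = out.getvalue()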
Update
This answer does not work. Example:
# tmp/try-gzip.py
import sys
import gzip
fd=gzip.GzipFile(fileobj=sys.stdin)
sys.stdout.write(fd.read())
output:
===> cat .bash_history | python tmp/try-gzip.py > tmp/history.gzip
Traceback (most recent call last):
  File "tmp/try-gzip.py", line 7, in <module>
    sys.stdout.write(fd.read())
  File "/usr/lib/python2.7/gzip.py", line 254, in read
    self._read(readsize)
  File "/usr/lib/python2.7/gzip.py", line 288, in _read
    pos = self.fileobj.tell() # Save current position
IOError: [Errno 29] Illegal seek
Use the cStringIO (or StringIO) module in conjunction with zlib:
>>> import zlib
>>> from cStringIO import StringIO
>>> s = StringIO()
>>> s.write(zlib.compress("I'm a lumberjack"))
>>> s.seek(0)
>>> zlib.decompress(s.read())
"I'm a lumberjack"
This works (at least in Python 3):
with s3.open(path, 'wb') as f:
    gz = gzip.GzipFile(filename, 'wb', 9, f)
    gz.write(b'hello')
    gz.flush()
    gz.close()
Here it writes to s3fs's file object with gzip compression applied.
The magic is the f parameter, which is GzipFile's fileobj. You have to provide a file name for gzip's header.
An even cleaner & more generalized version made of reusable components:
gzipped_iter = igzip(io_iter(input_file_obj))
gzipped_file_obj = iter_io(prefetch(gzipped_iter))
The functions used above are from my gist:
iter_io and io_iter provide transparent conversion to/from Iterable[AnyStr] <-> SupportsRead[AnyStr]
igzip does streaming gzip compression
(optional) prefetch concurrently pulls from an underlying iterable via a thread, yielding to consumer as normal, for concurrent read/write
import io
import zlib
from queue import Queue
from threading import Thread
from typing import Any, AnyStr, Iterable, Iterator

# SupportsRead is a typing protocol used in the gist (any object with a
# read() method); it is not importable from the standard typing module.


def as_bytes(s: str | bytes):
    if type(s) not in [str, bytes]:
        raise TypeError
    return s.encode() if isinstance(s, str) else s


def iter_io(iterable: Iterable[AnyStr], buffer_size: int = io.DEFAULT_BUFFER_SIZE):
    """
    Returns a buffered file obj that reads bytes from an iterable of str/bytes.

    Example:
    iter_io(['abc', 'def', 'g']).read() == b'abcdefg'
    iter_io([b'abcd', b'efg']).read(5) == b'abcde'
    """
    class IterIO(io.RawIOBase):
        def __init__(self, iterable: Iterable[AnyStr]):
            self._leftover = b''
            self._iterable = (as_bytes(s) for s in iterable if s)

        def readable(self):
            return True

        def readinto(self, buf):
            try:
                chunk = self._leftover or next(self._iterable)
            except StopIteration:
                return 0  # indicate EOF
            output, self._leftover = chunk[:len(buf)], chunk[len(buf):]
            buf[:len(output)] = output
            return len(output)

    return io.BufferedReader(IterIO(iterable), buffer_size=buffer_size)


def io_iter(fo: SupportsRead[AnyStr], size: int = io.DEFAULT_BUFFER_SIZE):
    """
    Returns an iterator that reads from a file obj in sized chunks.

    Example:
    list(io_iter(io.StringIO('abcdefg'), 3)) == ['abc', 'def', 'g']
    list(io_iter(io.BytesIO(b'abcdefg'), 4)) == [b'abcd', b'efg']

    Usage notes/TODO:
      * file obj isn't closed, fix /w keep_open=False and an internal contextmanager
    """
    return iter(lambda: fo.read(size), fo.read(0))


def igzip(chunks: Iterable[AnyStr], level=zlib.Z_DEFAULT_COMPRESSION):
    """
    Streaming gzip: lazily compresses an iterable of bytes or str (utf8)

    Example:
    gzipped_bytes_iter = igzip(['hello ', 'world!'])
    gzip.decompress(b''.join(gzipped_bytes_iter)).decode() == 'hello world!'
    """
    def gen():
        gzip_format = 0b10000
        c = zlib.compressobj(level=level, wbits=zlib.MAX_WBITS + gzip_format)
        yield from (c.compress(as_bytes(chunk)) for chunk in chunks)
        yield c.flush()
    return filter(None, gen())


def prefetch(iterable: Iterable[Any], n: int = 1) -> Iterator[Any]:
    """
    Prefetch an iterable via thread, yielding original contents as normal.

    Example:
    def slow_produce(*args):
        for x in args:
            time.sleep(1)
            yield x

    def slow_consume(iterable):
        for _ in iterable:
            time.sleep(1)

    slow_consume(prefetch(slow_produce('a', 'b')))  # takes 3 sec, not 4

    # Prefetch
    # produce: | 'a' | 'b' |
    # consume:       | 'a' | 'b' |
    # seconds: 0 --- 1 --- 2 --- 3
    #
    # No prefetch
    # produce: | 'a' |       | 'b' |
    # consume:       | 'a' |       | 'b' |
    # seconds: 0 --- 1 --- 2 --- 3 --- 4

    Usage notes/TODO:
      * mem leak: Thread is GC'd only after iterable is fully consumed, fix /w __del__
    """
    queue = Queue(n)
    finished = object()

    def produce():
        for x in iterable:
            queue.put(x)
        queue.put(finished)

    t = Thread(target=produce, daemon=True)
    t.start()

    while True:
        item = queue.get()
        if item is finished:
            break
        else:
            yield item
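A hypothetical end-to-end use of the helpers above: stream-compress a large file without holding it in memory (file names are placeholders).
with open("big_input.bin", "rb") as src, open("big_input.bin.gz", "wb") as dst:
    for compressed_chunk in igzip(io_iter(src)):
        dst.write(compressed_chunk)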
