Encoding And Decoding File Field into bytes - python

My goal is to convert a file object (a video uploaded by the user) to bytes, split those bytes into chunks, and then turn the chunks back into images or frames. The following code is a snippet from a Django app.
def handle_uploaded_file(f):
    with open('./chunks_made.txt', 'wb+') as destination:
        for chunk in f.chunks():
            print(type(chunk))
            print(len(chunk))
            destination.write(chunk)
    chunk_length = len(chunk)
    read_batches(len(chunk))

def read_batches(chunk_size):
    with open('./chunks_made.txt', 'rb') as file:
        content = file.read(chunk_size)
        frame = cv2.imdecode(content, cv2.IMREAD_COLOR)
        plt.imshow(frame)
        plt.show()
The process view which calls these functions:
def process(request):
    video_file = request.FILES['video']
    handle_uploaded_file(video_file)
    data = 'some_data'
    return render(request, 'video/result.html', {'data': video_file})
I don't know how to decode the bytes into the frames as a real image.
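For reference (this is an addition, not part of the original question): cv2.imdecode expects a NumPy byte buffer that contains one complete encoded image, so an arbitrary byte slice of a video file will not decode into a frame. A common alternative is to save the upload to disk, as handle_uploaded_file already does, and then pull decoded frames out with cv2.VideoCapture. A minimal sketch, with the function name and frame limit as illustrative choices:

import cv2
import matplotlib.pyplot as plt

def show_frames(video_path, max_frames=5):
    # Sketch: read already-decoded frames straight from the saved video file.
    cap = cv2.VideoCapture(video_path)
    shown = 0
    while shown < max_frames:
        ok, frame = cap.read()  # frame is a BGR NumPy array
        if not ok:
            break
        plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        plt.show()
        shown += 1
    cap.release()

show_frames('./chunks_made.txt')  # e.g. the file written by handle_uploaded_file above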

Related

image corrupted after reading image file in chunk

I am trying to read an image file from a Django request in chunks. Django's file handler chunks() method did not work well for me, so I created a custom one. It works, but the end product isn't what I was expecting: after reading the file in chunks and putting them back together, the image somehow gets corrupted, and I don't have a solution for it.
def process_download_with_progress(self, image_file, length):
    process_recoder = ProgressRecorder(self)
    print('Upload: Task Started')
    fs = FileSystemStorage()
    buffer = io.BytesIO()
    chunk_size = 0
    for chunk in read_chunk(image_file.file, length):
        chunk_size += 1
        buffer.write(chunk)
        process_recoder.set_progress(chunk_size, length, description=f'uploaded {chunk_size*length} bytes of the file')
    buffer.seek(0)
    image = ImageFile(buffer, name=image_file.name)
    fs.save(image_file.name, content=image)
    return 'Done'

def read_chunk(file_object, chunk_size=125):
    while True:
        file = file_object.read(chunk_size)
        if not file:
            break
        yield file
So this is my code; any help will be appreciated, thanks.
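One thing worth double-checking (an assumption on my part, not a diagnosis from the post): the upload's file pointer should be rewound before reading, and the reassembled buffer rewound before saving, otherwise the stored file can come out truncated or corrupted. A minimal sketch of that pattern, with illustrative names:

import io

from django.core.files.storage import FileSystemStorage

def reassemble_and_save(django_file, name, chunk_size=64 * 1024):
    # Sketch: rewind, copy the upload into memory in chunks, rewind again, save.
    django_file.seek(0)
    buffer = io.BytesIO()
    for chunk in iter(lambda: django_file.read(chunk_size), b''):
        buffer.write(chunk)
    buffer.seek(0)
    return FileSystemStorage().save(name, buffer)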

Stream Bytes chunks to csv rows in python

I need to process a large remote CSV line by line without downloading it entirely.
Below is the closest I got.
I iterate over byte chunks from Azure and have some code to handle truncated lines.
But this cannot work if CSV values contain a newline, as I am not able to distinguish between newlines inside values and the newlines that terminate CSV rows.
# this does not work
def azure_iter_lines(logger_scope, client, file_path):
    # get a StorageStreamDownloader
    # https://learn.microsoft.com/en-us/python/api/azure-storage-file-datalake/azure.storage.filedatalake.storagestreamdownloader?view=azure-python
    file_client = client.get_file_client(file_path)
    file_handle = file_client.download_file()
    truncated_line = ''
    for chunk in file_handle.chunks():
        # have the previous truncated line appended to the next block
        chunk_txt = truncated_line + chunk.decode("utf-8")
        lines = chunk_txt.split('\n')  # THIS CANNOT WORK AS VALUES CONTAIN NEWLINES
        for line in lines[0:len(lines)-2]:
            yield line
        truncated_line = lines[len(lines)-1]
    # process the last chunk (same code)
    chunk_txt = truncated_line
    lines = chunk_txt.split('\n')  # THIS CANNOT WORK AS VALUES CONTAIN NEWLINES
    for line in lines[0:len(lines)-2]:
        yield line
    truncated_line = lines[len(lines)-1]
Ideally I would use csv.DictReader(), but I was not able to do so as it downloads the file entirely.
# this does not work
def azure_iter_lines(logger_scope, client, file_path):
    file_client = client.get_file_client(file_path)
    file_handle = file_client.download_file()
    buffer = io.BytesIO()
    file_handle.readinto(buffer)  # THIS DOWNLOADS THE FILE ENTIRELY
    csvreader = csv.DictReader(buffer, delimiter=";")
    return csvreader
Here is an update using some hints from @H.Leger.
Please note that this still does not work.
file_client = client.get_file_client(file_path)
file_handle = file_client.download_file()
stream = codecs.iterdecode(file_handle.chunks(), 'utf-8')
csvreader = csv.DictReader(stream, delimiter=";")
for row in csvreader:
    print(row)
# => _csv.Error: new-line character seen in unquoted field - do you need to open the file in universal-newline mode?
EDIT: Final solution based on @paiv's answer
EDIT: Updated solution to use io instead of codecs for faster parsing
import io
import csv
import ctypes as ct

# bytes chunk iterator to python stream adapter
# https://stackoverflow.com/a/67547597/2523414
class ChunksAdapter:
    def __init__(self, chunks):
        self.chunks = chunks
        self.buf = b''
        self.closed = False

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return False

    def close(self):
        self.closed = True

    def read(self, size):
        if not self.buf:
            self.buf = next(self.chunks, b'')
        res, self.buf = self.buf[:size], self.buf[size:]
        return res

# get the downloader object
file_client = client.get_file_client(file_path)
downloader = file_client.download_file()

# adapt the downloader iterator to a byte stream
file_object = ChunksAdapter(downloader.chunks())

# decode bytes stream to utf-8
text_stream = io.TextIOWrapper(file_object, encoding='utf-8', newline='')

# update csv field limit to handle large fields
# https://stackoverflow.com/a/54517228/2523414
csv.field_size_limit(int(ct.c_ulong(-1).value // 2))

csvreader = csv.DictReader(text_stream, delimiter=";", quotechar='"', quoting=csv.QUOTE_MINIMAL)
for row in csvreader:
    print(row)
Disclaimer: I know little Azure specifics. Ultimately, you would want to stream separate chunks too.
In Python, given a file object, you can set up CSV streaming this way:
import codecs
import csv
codec = codecs.getreader('utf-8')
text_stream = codec(file_object)
csvreader = csv.DictReader(text_stream)
Now you can iterate over csvreader, and it will read from file_object in a streaming fashion.
Edit: as @Martijn Pieters suggested, we can gain performance with TextIOWrapper instead of codecs:
text_stream = io.TextIOWrapper(file_object, encoding='utf-8', newline='')
Check the note in the csv module documentation on the newline parameter.
But Azure's StorageStreamDownloader does not provide Python's file object interface. It has a .chunks() generator (which I assume will invoke a separate HTTP request to retrieve each next chunk).
You can adapt .chunks() into a file object with a simple adapter:
class ChunksAdapter:
    def __init__(self, chunks):
        self.chunks = chunks
        self.buf = b''

    def read(self, size):
        if not self.buf:
            self.buf = next(self.chunks, b'')
        res, self.buf = self.buf[:size], self.buf[size:]
        return res
And use it like:
downloader = file_client.download_file()
file_object = ChunksAdapter(downloader.chunks())
Be sure to configure DictReader for the appropriate CSV dialect.
And set appropriate values for max_single_get_size, max_chunk_get_size on the blob client.
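For what it's worth, a sketch of where those settings could be passed; I'm assuming here that DataLakeServiceClient forwards the max_single_get_size and max_chunk_get_size keyword arguments to the underlying blob client, so treat the exact placement (and the example values) as an assumption to verify against your SDK version:

from azure.storage.filedatalake import DataLakeServiceClient

# Assumption: these kwargs control the size of the initial and subsequent
# range requests made by download_file(); values are illustrative only.
service_client = DataLakeServiceClient(
    account_url="https://<account>.dfs.core.windows.net",
    credential="<credential>",
    max_single_get_size=4 * 1024 * 1024,
    max_chunk_get_size=4 * 1024 * 1024,
)
client = service_client.get_file_system_client("<filesystem>")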
I believe the requests package can be useful for you. Using the stream option while getting your file and the Response.iter_lines() function should do what you need:
import codecs
import csv
import requests

url = "https://navitia.opendatasoft.com//explore/dataset/all-datasets/download?format=csv"
r = requests.get(url, stream=True)  # using the stream option to avoid loading everything
try:
    buffer = r.iter_lines()  # iter_lines() will feed you the distant file line by line
    reader = csv.DictReader(codecs.iterdecode(buffer, 'utf-8'), delimiter=';')
    for row in reader:
        print(row)  # Do stuff here
finally:
    r.close()

Compress already-read file data directly in memory (for e-mail)

I am writing code to send an e-mail through SMTP in Python. In one part of the code I attach a file to the e-mail body; sometimes this attachment is bigger than 25 MB (or close to it) and I would like to compress it.
Due to server restrictions, I would prefer not to create a zip/gz/tar/... file on disk, and instead compress the already-read data to get a compressed_data byte stream that looks identical to a compressed file read from disk. Is it possible?
def add_attachment(file_path: str, id_value: Optional[str] = None, compress_file: Optional[bool] = False):
    '''Add an attachment or image to be used in the message.

    Parameters:
        File path
        Image id for the HTML use case.
        Compress the file.
    '''
    filename = os.path.split(file_path)[1]
    # Guess the content type based on the file's extension.
    ctype, encoding = mimetypes.guess_type(file_path)
    if ctype is None or encoding is not None:
        ctype = 'application/octet-stream'
    maintype, subtype = ctype.split('/', 1)
    with open(file_path, 'rb') as fp:
        file_size = os.fstat(fp.fileno()).st_size
        if file_size >= EMAIL_MAX_ATTACH_SIZE:
            compress_file = True  # Force compression.
        file_data = fp.read()
    # Compress the file before sending (when asked to, or when the uncompressed
    # file size surpasses the e-mail attachment limit).
    if compress_file:
        pass  # TODO
    # Get the file type.
    if maintype == 'application':
        mime_file = MIMEApplication(file_data, subtype, name=filename)
    elif maintype == 'image':
        mime_file = MIMEImage(file_data, subtype, name=filename)
    else:
        return
    # Add the ID to the attachment content.
    if id_value:
        mime_file.add_header('Content-ID', id_value)
    else:
        mime_file.add_header('Content-ID', '<' + os.path.splitext(filename)[0] + '>')
    # Add the attachment to the e-mail
    msg_root.attach(mime_file)
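As a sketch for the pass # TODO branch (not from the original post): the standard library's gzip module can compress the already-read bytes entirely in memory, so nothing extra is written to disk:

import gzip

def compress_in_memory(file_data: bytes, filename: str):
    # Sketch only: gzip the in-memory bytes and adjust the metadata so the
    # attachment reflects the compressed payload.
    compressed_data = gzip.compress(file_data)
    return compressed_data, filename + '.gz', 'application', 'gzip'

Inside add_attachment, the TODO branch could call this helper and replace file_data, filename, maintype and subtype with the returned values.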

upload image django rest framework corrupted

I have the following view to upload an image, but the saved image is corrupted.
class FileUploadView(views.APIView):
    parser_classes = (parsers.FileUploadParser,)

    def uploadFile(self, up_file):
        if not os.path.exists(BUILDING_PHOTOS_FOLDER):
            os.mkdir(BUILDING_PHOTOS_FOLDER)
        file_name = '{}.jpeg'.format(uuid.uuid4())
        destination = open(
            '{}/{}'.format(BUILDING_PHOTOS_FOLDER, file_name), 'wb+')
        for chunk in up_file.chunks():
            destination.write(chunk)
        destination.close()

    def put(self, request, filename, format=None):
        file_obj = request.data['file']
        self.uploadFile(file_obj)
        return HttpResponse(status=204)
Hi, you can try like this:
def uploadFile(self, f):
    filename = f.name
    redington_path = settings.MEDIA_ROOT + '/yourpath/'
    if not os.path.exists(redington_path):
        os.makedirs(redington_path)
    BASE_PATH = settings.MEDIA_ROOT + '/yourpath/'
    os.mkdir(os.path.join(BASE_PATH, vendorid))
    with open(BASE_PATH + filename, 'wb+') as destination:
        for chunk in f.chunks():
            destination.write(chunk)
You need to get the file from request.FILES['file']:
def put(self, request, filename, format=None):
    file_obj = request.FILES['file']
    self.uploadFile(file_obj)
    return HttpResponse(status=204)
You need to set format='jpeg', and file_name should be up_file.name:
class FileUploadView(views.APIView):
    parser_classes = (parsers.FileUploadParser,)

    def uploadFile(self, up_file):
        if not os.path.exists(BUILDING_PHOTOS_FOLDER):
            os.mkdir(BUILDING_PHOTOS_FOLDER)
        file_name = up_file.name
        destination = open('{}/{}'.format(BUILDING_PHOTOS_FOLDER, file_name), 'wb+')
        for chunk in up_file.chunks():
            destination.write(chunk)
        destination.close()

    def put(self, request, filename, format='jpeg'):
        file_obj = request.data['file']
        self.uploadFile(file_obj)
        return HttpResponse(status=204)
Finally, I solved it by sending the image as raw binary from the frontend.
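For completeness, a hedged sketch of that raw-binary approach (class and variable names here are illustrative, not from the post): with FileUploadParser the request body arrives as a single file object under request.data['file'], which can be handed to Django's storage API without touching the bytes:

from django.core.files.storage import default_storage
from rest_framework import parsers, views
from rest_framework.response import Response

class RawImageUploadView(views.APIView):
    parser_classes = (parsers.FileUploadParser,)

    def put(self, request, filename, format=None):
        up_file = request.data['file']  # raw request body exposed as a file
        default_storage.save(filename, up_file)
        return Response(status=204)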

Create and stream a large archive without storing it in memory or on disk

I want to allow users to download an archive of multiple large files at once. However, the files and the archive may be too large to store in memory or on disk on my server (they are streamed in from other servers on the fly). I'd like to generate the archive as I stream it to the user.
I can use Tar or Zip or whatever is simplest. I am using Django, which allows me to return a generator or file-like object in my response. This object could be used to pump the process along. However, I am having trouble figuring out how to build this sort of thing around the zipfile or tarfile libraries, and I'm afraid they may not support reading files as they go, or reading the archive as it is built.
This answer on converting an iterator to a file-like object might help. tarfile#addfile takes an iterable, but it appears to immediately pass that to shutil.copyfileobj, so this may not be as generator-friendly as I had hoped.
I ended up using SpiderOak ZipStream.
You can do it by generating and streaming a zip file with no compression, which is basically to just add the headers before each file's content. You're right, the libraries don't support this, but you can hack around them to get it working.
This code wraps zipfile.ZipFile with a class that manages the stream and creates instances of zipfile.ZipInfo for the files as they come. CRC and size can be set at the end. You can push data from the input stream into it with put_file(), write() and flush(), and read data out of it to the output stream with read().
import struct
import zipfile
import time
from StringIO import StringIO

class ZipStreamer(object):
    def __init__(self):
        self.out_stream = StringIO()
        # write to the stringIO with no compression
        self.zipfile = zipfile.ZipFile(self.out_stream, 'w', zipfile.ZIP_STORED)
        self.current_file = None
        self._last_streamed = 0

    def put_file(self, name, date_time=None):
        if date_time is None:
            date_time = time.localtime(time.time())[:6]
        zinfo = zipfile.ZipInfo(name, date_time)
        zinfo.compress_type = zipfile.ZIP_STORED
        zinfo.flag_bits = 0x08
        zinfo.external_attr = 0600 << 16
        zinfo.header_offset = self.out_stream.pos
        # write right values later
        zinfo.CRC = 0
        zinfo.file_size = 0
        zinfo.compress_size = 0
        self.zipfile._writecheck(zinfo)
        # write header to stream
        self.out_stream.write(zinfo.FileHeader())
        self.current_file = zinfo

    def flush(self):
        zinfo = self.current_file
        self.out_stream.write(struct.pack("<LLL", zinfo.CRC, zinfo.compress_size, zinfo.file_size))
        self.zipfile.filelist.append(zinfo)
        self.zipfile.NameToInfo[zinfo.filename] = zinfo
        self.current_file = None

    def write(self, bytes):
        self.out_stream.write(bytes)
        self.out_stream.flush()
        zinfo = self.current_file
        # update these...
        zinfo.CRC = zipfile.crc32(bytes, zinfo.CRC) & 0xffffffff
        zinfo.file_size += len(bytes)
        zinfo.compress_size += len(bytes)

    def read(self):
        i = self.out_stream.pos
        self.out_stream.seek(self._last_streamed)
        bytes = self.out_stream.read()
        self.out_stream.seek(i)
        self._last_streamed = i
        return bytes

    def close(self):
        self.zipfile.close()
Keep in mind that this code was just a quick proof of concept, and I did no further development or testing once I decided to let the HTTP server itself deal with this problem. A few things you should look into if you decide to use it are whether nested folders are archived correctly, and filename encoding (which is always a pain with zip files anyway).
You can stream a ZipFile to a Pylons or Django response fileobj by wrapping the fileobj in something file-like that implements tell(). This will buffer each individual file in the zip in memory, but stream the zip itself. We use it to stream download a zip file full of images, so we never buffer more than a single image in memory.
This example streams to sys.stdout. For Pylons use response.body_file, for Django you can use the HttpResponse itself as a file.
import zipfile
import sys

class StreamFile(object):
    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.pos = 0

    def write(self, str):
        self.fileobj.write(str)
        self.pos += len(str)

    def tell(self):
        return self.pos

    def flush(self):
        self.fileobj.flush()

# Wrap a stream so ZipFile can use it
out = StreamFile(sys.stdout)
z = zipfile.ZipFile(out, 'w', zipfile.ZIP_DEFLATED)
for i in range(5):
    z.writestr("hello{0}.txt".format(i), "this is hello{0} contents\n".format(i) * 3)
z.close()
Here is the solution from Pedro Werneck (above), but with a fix to avoid collecting all the data in memory (the read method is changed slightly):
class ZipStreamer(object):
    def __init__(self):
        self.out_stream = StringIO.StringIO()
        # write to the stringIO with no compression
        self.zipfile = zipfile.ZipFile(self.out_stream, 'w', zipfile.ZIP_STORED)
        self.current_file = None
        self._last_streamed = 0

    def put_file(self, name, date_time=None):
        if date_time is None:
            date_time = time.localtime(time.time())[:6]
        zinfo = zipfile.ZipInfo(name, date_time)
        zinfo.compress_type = zipfile.ZIP_STORED
        zinfo.flag_bits = 0x08
        zinfo.external_attr = 0600 << 16
        zinfo.header_offset = self.out_stream.pos
        # write right values later
        zinfo.CRC = 0
        zinfo.file_size = 0
        zinfo.compress_size = 0
        self.zipfile._writecheck(zinfo)
        # write header to mega_streamer
        self.out_stream.write(zinfo.FileHeader())
        self.current_file = zinfo

    def flush(self):
        zinfo = self.current_file
        self.out_stream.write(
            struct.pack("<LLL", zinfo.CRC, zinfo.compress_size,
                        zinfo.file_size))
        self.zipfile.filelist.append(zinfo)
        self.zipfile.NameToInfo[zinfo.filename] = zinfo
        self.current_file = None

    def write(self, bytes):
        self.out_stream.write(bytes)
        self.out_stream.flush()
        zinfo = self.current_file
        # update these...
        zinfo.CRC = zipfile.crc32(bytes, zinfo.CRC) & 0xffffffff
        zinfo.file_size += len(bytes)
        zinfo.compress_size += len(bytes)

    def read(self):
        self.out_stream.seek(self._last_streamed)
        bytes = self.out_stream.read()
        self._last_streamed = 0
        # cleaning up memory in each iteration
        self.out_stream.seek(0)
        self.out_stream.truncate()
        self.out_stream.flush()
        return bytes

    def close(self):
        self.zipfile.close()
Then you can use the stream_generator function as a stream for a zip file:
def stream_generator(files_paths):
    s = ZipStreamer()
    for f in files_paths:
        s.put_file(f)
        with open(f) as _f:
            s.write(_f.read())
        s.flush()
        yield s.read()
    s.close()
example for Falcon:
class StreamZipEndpoint(object):
    def on_get(self, req, resp):
        files_pathes = [
            '/path/to/file/1',
            '/path/to/file/2',
        ]
        zip_filename = 'output_filename.zip'
        resp.content_type = 'application/zip'
        resp.set_headers([
            ('Content-Disposition', 'attachment; filename="%s"' % (
                zip_filename,))
        ])
        resp.stream = stream_generator(files_pathes)
An option is to use stream-zip (full disclosure: written by me)
Amending its example slightly:
from datetime import datetime
from stream_zip import stream_zip, ZIP_64

def non_zipped_files():
    modified_at = datetime.now()
    perms = 0o600

    # Hard coded in this example, but in real cases could
    # for example yield data from a remote source
    def file_1_data():
        for i in range(0, 1000):
            yield b'Some bytes'

    def file_2_data():
        for i in range(0, 1000):
            yield b'Some bytes'

    yield 'my-file-1.txt', modified_at, perms, ZIP_64, file_1_data()
    yield 'my-file-2.txt', modified_at, perms, ZIP_64, file_2_data()

zipped_chunks = stream_zip(non_zipped_files())

# Can print each chunk, or return them to a client,
# say using Django's StreamingHttpResponse
for zipped_chunk in zipped_chunks:
    print(zipped_chunk)
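As the comment above suggests, the chunks can be handed straight to Django's StreamingHttpResponse; a minimal sketch (the view name and download filename are illustrative):

from django.http import StreamingHttpResponse

def download_archive(request):
    # Sketch: stream the zip chunks to the client without buffering the archive.
    response = StreamingHttpResponse(
        stream_zip(non_zipped_files()),
        content_type='application/zip',
    )
    response['Content-Disposition'] = 'attachment; filename="archive.zip"'
    return response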
