Gzip to base64 encoding adds characters to JSON string - python

I have a nested python dictionary that is serialized into a json string, that I am further converting to a compressed Gzip file and base64 encoding it. However, once I convert it back to the JSON string, it adds \\ to the string, which isn't in the original JSON string before conversion. This happens at each of the nested dictionary levels. These are the functions:
import json
import io
import gzip
import base64
import zlib
class numpy_encoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, np.integer):
return int(obj)
elif isinstance(obj, np.floating):
return float(obj)
elif isinstance(obj, np.ndarray):
return obj.tolist()
else:
return super(numpy_encoder, self).default(obj)
def dict_json_dump(dictionary):
dumped = json.dumps(dictionary, cls = numpy_encoder, separators=(",", ":"))
return dumped
def gzip_json_encoder(json_string):
stream = io.BytesIO()
with gzip.open(filename=stream, mode='wt') as zipfile:
json.dump(json_string, zipfile)
return stream
def base64_encoder(gzip_string):
return base64.b64encode(gzip_string.getvalue())
We can use the functions as follows:
json_dict = pe.dict_json_dump(test_dictionary)
gzip_json = pe.gzip_json_encoder(json_dict)
base64_gzip = pe.base64_encoder(gzip_json)
When I check the base64_gzip with the following function:
json_str = zlib.decompress(base64.b64decode(base64_gzip), 16 + zlib.MAX_WBITS)
I get the JSON string back in a format like this(truncated):
b'"{\\"trainingResults\\":{\\"confusionMatrix\\":{\\"tn\\":2,\\"fn\\":1,\\"tp\\":1,\\"fp\\":1},\\"auc\\":{\\"score\\":0.5,\\"tpr\\":[0.0,0.5,0.5,1.0],\\"fpr\\":[0.0,0.333,0.667,1.0]},\\"f1\\"
This isn't the full string, but the contents of the string itself is accurate. What I'm not sure about is why the back slashes are showing up when I convert it back. Anyone have any suggestions? I tried utf-8 encoding on my JSON as well, with no luck. Any help is appreciated!

You're doing JSON encoding twice: Once in dict_json_dump() and again in gzip_json_encoder(). Since json_string is already encoded, you don't need to call json.dump() in gzip_json_encoder().
def gzip_json_encoder(json_string):
stream = io.BytesIO()
with gzip.open(filename=stream, mode='wt') as zipfile:
zipfile.write(json_string)
return stream

Related

string formatted as base64 to base64 object

From a post request, I receive a base64 in JSON. Problem is that I can't just change the file type from string to base64 since it's already formatted as base64. need the base64 to convert it back to an image.
json_data = request.get_json(force=True)
img = json_data['img']
print(img)
with open("imageToSave.png", "wb") as fh:
fh.write(base64.decodebytes(img))
In order to decode it, add the third line which decodes the string to base64
json_data = request.get_json(force=True)
img = json_data['img']
imgdata = base64.b64decode(img)
filename = 'upload/newimg.jpg' # I assume you have a way of picking unique filenames
with open(filename, 'wb') as f:
f.write(imgdata)

Generate temporary URLs in Django 2 Python 3

There is one same question in StackOverflow with this link:
How to generate temporary URLs in Django
But the accepted answer code is for Python 2 and I converted it to Python 3:
import hashlib, zlib
import pickle as pickle
import urllib.request, urllib.parse, urllib.error
my_secret = "michnorts"
def encode_data(data):
"""Turn `data` into a hash and an encoded string, suitable for use with `decode_data`."""
text = zlib.compress(pickle.dumps(data, 0)).encode('base64').replace('\n', '')
m = hashlib.md5(my_secret + text).hexdigest()[:12]
return m, text
def decode_data(hash, enc):
"""The inverse of `encode_data`."""
text = urllib.parse.unquote(enc)
m = hashlib.md5(my_secret + text).hexdigest()[:12]
if m != hash:
raise Exception("Bad hash!")
data = pickle.loads(zlib.decompress(text.decode('base64')))
return data
hash, enc = encode_data(['Hello', 'Goodbye'])
print(hash, enc)
print(decode_data(hash, enc))
But it have error :
text = zlib.compress(pickle.dumps(data, 0)).encode('base64').replace('\n', '')
AttributeError: 'bytes' object has no attribute 'encode'
How should I fix this?
Trying to adapt your code to Python 3, I came up with this:
import hashlib, zlib
import pickle as pickle
import urllib.request, urllib.parse, urllib.error
import base64
my_secret = "michnorts"
def encode_data(data):
"""Turn `data` into a hash and an encoded string, suitable for use with `decode_data`."""
compressed_text = zlib.compress(pickle.dumps(data, 0))
text = base64.b64encode(compressed_text).decode().replace('\n', '')
m = hashlib.md5(str.encode('{}{}'.format(my_secret, text))).hexdigest()[:12]
return m, text
def decode_data(hash, enc):
"""The inverse of `encode_data`."""
text = urllib.parse.unquote(enc)
m = hashlib.md5(str.encode('{}{}'.format(my_secret, text))).hexdigest()[:12]
if m != hash:
raise Exception("Bad hash!")
data = pickle.loads(zlib.decompress(base64.b64decode(text)))
return data
hash, enc = encode_data(['Hello', 'Goodbye'])
print(hash, enc)
print(decode_data(hash, enc))
There are some things that I needed to take into account:
in Python 3, the way to encode/decode into base64 is by using the base64 module
to cast a bytes object into a string, I used the bytes.decode method
to cast a string object into a bytes object, I used the str.encode function
the hashlib.md5 function accepts a bytesobject (strings need to be previously encoded)
I changed the way you concatenate strings (i.e. str1 + str2) with a more pythonic construction: '{}{}'.format(str1, str2)
I hope this will be helpful ;)
I recommend to use the built-in secrets modules, especially secrets.token_urlsafe.

How can I retain ASCII hex code points when writing an ElementTree in Python?

I've loaded an xml file (Rhythmbox's database file) into Python 3 via the ElementTree parser. After modifying the tree and writing it to disk (ElementTree.write()) using the ascii encoding all of the ASCII hex characters that are in hex code point are converted to ASCII decimal code point. For example here is a diff containing the copyright symbol:
< <copyright>© WNYC</copyright>
---
> <copyright>© WNYC</copyright>
Is there any way to tell Python/ElementTree not to do this? I'd like all the hex codes to stay in hex code point.
I found a solution. First I created a new codec error handler and then monkey patched ElementTree._get_writer() to use the new error handler. Looks like:
from xml.etree import ElementTree
import io
import contextlib
import codecs
def lower_first(s):
return s[:1].lower() + s[1:] if s else ''
def html_replace(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
s = []
for c in exc.object[exc.start:exc.end]:
s.append('&#%s;' % lower_first(hex(ord(c))[1:].upper()))
return ''.join(s), exc.end
else:
raise TypeError("can't handle %s" % exc.__name__)
codecs.register_error('html_replace', html_replace)
# monkey patch this python function to prevent it from using xmlcharrefreplace
#contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
# returns text write method and release all resources after using
try:
write = file_or_filename.write
except AttributeError:
# file_or_filename is a file name
if encoding == "unicode":
file = open(file_or_filename, "w")
else:
file = open(file_or_filename, "w", encoding=encoding,
errors="html_replace")
with file:
yield file.write
else:
# file_or_filename is a file-like object
# encoding determines if it is a text or binary writer
if encoding == "unicode":
# use a text writer as is
yield write
else:
# wrap a binary writer with TextIOWrapper
with contextlib.ExitStack() as stack:
if isinstance(file_or_filename, io.BufferedIOBase):
file = file_or_filename
elif isinstance(file_or_filename, io.RawIOBase):
file = io.BufferedWriter(file_or_filename)
# Keep the original file open when the BufferedWriter is
# destroyed
stack.callback(file.detach)
else:
# This is to handle passed objects that aren't in the
# IOBase hierarchy, but just have a write method
file = io.BufferedIOBase()
file.writable = lambda: True
file.write = write
try:
# TextIOWrapper uses this methods to determine
# if BOM (for UTF-16, etc) should be added
file.seekable = file_or_filename.seekable
file.tell = file_or_filename.tell
except AttributeError:
pass
file = io.TextIOWrapper(file,
encoding=encoding,
errors='html_replace',
newline="\n")
# Keep the original file open when the TextIOWrapper is
# destroyed
stack.callback(file.detach)
yield file.write
ElementTree._get_writer = _get_writer

Prepend information in base64 encoding

Here is an answer which gives some information on how to base64 encode a file. However, I also want to pass in the filetype and mimetype. for the information in the base64 encoded string.
So far I have for my base64 string:
x=base64.b64encode(open('/Users/user/Desktop/img.PNG').read())
What is the correct information to prepend, and how would I do this?
It seems like the following is how I would get the base64 file information to pass to the server:
file = '/Users/user/Desktop/img.PNG'
prepend_info = 'data:%s;base64' % mimetypes.guess_type(file)[0]
base_64_data = open(file).read().encode('base64')
image_data_base64 = '%s,%s' % (prepend_info, base_64_data)
This then gives me:
data:image/png;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wB...
Perhaps something along these lines:
from __future__ import print_function
import base64
import binascii
import os
def base64_encode_file(filename):
filetype = os.path.splitext(filename)[1][1:] # remove leading '.' from ext
with open(filename) as file:
data = file.read()
return base64.b64encode(','.join((filename, filetype, data))), data
filename = 'C:/Users/martin/Desktop/img.PNG'
#filename = '/Users/user/Desktop/img.PNG'
encoded, data = base64_encode_file(filename)
print('encoded: {} (hex file data: {})'.format(encoded, binascii.hexlify(data)))
decoded = base64.b64decode(encoded).split(',', 2)
print('decoded:', decoded[0], decoded[1], binascii.hexlify(decoded[2]))
Output:
encoded: QzovVXNlcnMvbWFydGluL0Rlc2t0b3AvaW1nLlBORyxQTkcsiVBORwo=
(hex file data: 89504e470a)
decoded: C:/Users/martin/Desktop/img.PNG PNG 89504e470a

Python standard library to POST multipart/form-data encoded data

I would like to POST multipart/form-data encoded data.
I have found an external module that does it: http://atlee.ca/software/poster/index.html
however I would rather avoid this dependency. Is there a way to do this using the standard libraries?
thanks
The standard library does not currently support that. There is cookbook recipe that includes a fairly short piece of code that you just may want to copy, though, along with long discussions of alternatives.
It's an old thread but still a popular one, so here is my contribution using only standard modules.
The idea is the same than here but support Python 2.x and Python 3.x.
It also has a body generator to avoid unnecessarily memory usage.
import codecs
import mimetypes
import sys
import uuid
try:
import io
except ImportError:
pass # io is requiered in python3 but not available in python2
class MultipartFormdataEncoder(object):
def __init__(self):
self.boundary = uuid.uuid4().hex
self.content_type = 'multipart/form-data; boundary={}'.format(self.boundary)
#classmethod
def u(cls, s):
if sys.hexversion < 0x03000000 and isinstance(s, str):
s = s.decode('utf-8')
if sys.hexversion >= 0x03000000 and isinstance(s, bytes):
s = s.decode('utf-8')
return s
def iter(self, fields, files):
"""
fields is a sequence of (name, value) elements for regular form fields.
files is a sequence of (name, filename, file-type) elements for data to be uploaded as files
Yield body's chunk as bytes
"""
encoder = codecs.getencoder('utf-8')
for (key, value) in fields:
key = self.u(key)
yield encoder('--{}\r\n'.format(self.boundary))
yield encoder(self.u('Content-Disposition: form-data; name="{}"\r\n').format(key))
yield encoder('\r\n')
if isinstance(value, int) or isinstance(value, float):
value = str(value)
yield encoder(self.u(value))
yield encoder('\r\n')
for (key, filename, fd) in files:
key = self.u(key)
filename = self.u(filename)
yield encoder('--{}\r\n'.format(self.boundary))
yield encoder(self.u('Content-Disposition: form-data; name="{}"; filename="{}"\r\n').format(key, filename))
yield encoder('Content-Type: {}\r\n'.format(mimetypes.guess_type(filename)[0] or 'application/octet-stream'))
yield encoder('\r\n')
with fd:
buff = fd.read()
yield (buff, len(buff))
yield encoder('\r\n')
yield encoder('--{}--\r\n'.format(self.boundary))
def encode(self, fields, files):
body = io.BytesIO()
for chunk, chunk_len in self.iter(fields, files):
body.write(chunk)
return self.content_type, body.getvalue()
Demo
# some utf8 key/value pairs
fields = [('প্রায়', 42), ('bar', b'23'), ('foo', 'ން:')]
files = [('myfile', 'image.jpg', open('image.jpg', 'rb'))]
# iterate and write chunk in a socket
content_type, body = MultipartFormdataEncoder().encode(fields, files)
You can't do this with the stdlib quickly. Howevewr, see the MultiPartForm class in this PyMOTW. You can probably use or modify that to accomplish whatever you need:
PyMOTW: urllib2 - Library for opening URLs.

Categories