Porting pickle py2 to py3: strings become bytes - python

I have a pickle file that was created with Python 2.7 that I'm trying to port to Python 3.6. The file was saved in Python 2.7 via pickle.dumps(self.saved_objects, -1)
and is loaded in Python 3.6 via loads(data, encoding="bytes") (from a file opened in "rb" mode). If I try opening in "r" mode and pass encoding="latin1" to loads I get UnicodeDecodeError. When I open it as a byte stream it loads, but literally every string is now a byte string. Every object's __dict__ keys are all b"a_variable_name", which then generates AttributeError when calling an_object.a_variable_name, because __getattr__ passes a string and __dict__ only contains bytes. I feel like I've tried every combination of arguments and pickle protocols already. Apart from forcibly converting all objects' __dict__ keys to strings, I'm at a loss. Any ideas?
** Skip to 4/28/17 update for better example
-------------------------------------------------------------------------------------------------------------
** Update 4/27/17
This minimal example illustrates my problem:
From Python 2.7.13:
import pickle

class test(object):
    def __init__(self):
        self.x = u"test ¢"  # including a unicode str breaks things

t = test()
dumpstr = pickle.dumps(t)
>>> dumpstr
"ccopy_reg\n_reconstructor\np0\n(c__main__\ntest\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nS'x'\np6\nVtest \xa2\np7\nsb."
From Python 3.6.1:
import pickle

class test(object):
    def __init__(self):
        self.x = "xyz"

dumpstr = b"ccopy_reg\n_reconstructor\np0\n(c__main__\ntest\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nS'x'\np6\nVtest \xa2\np7\nsb."
t = pickle.loads(dumpstr, encoding="bytes")
>>> t
<__main__.test object at 0x040E3DF0>
>>> t.x
Traceback (most recent call last):
File "<pyshell#15>", line 1, in <module>
t.x
AttributeError: 'test' object has no attribute 'x'
>>> t.__dict__
{b'x': 'test ¢'}
>>>
-------------------------------------------------------------------------------------------------------------
Update 4/28/17
To re-create my issue I'm posting my actual raw pickle data here
The pickle file was created in Python 2.7.13, on Windows 10, using
with open("raw_data.pkl", "wb") as fileobj:
    pickle.dump(library, fileobj, protocol=0)
(protocol 0 so it's human readable)
To run it you'll need classes.py
# classes.py
class Library(object): pass
class Book(object): pass
class Student(object): pass
class RentalDetails(object): pass
And the test script here:
# load_pickle.py
import pickle, sys, itertools, os

raw_pkl = "raw_data.pkl"
is_py3 = sys.version_info.major == 3
read_modes = ["rb"]
encodings = ["bytes", "utf-8", "latin-1"]
fix_imports_choices = [True, False]
files = ["raw_data_%s.pkl" % x for x in range(3)]

def py2_test():
    with open(raw_pkl, "rb") as fileobj:
        loaded_object = pickle.load(fileobj)
    print("library dict: %s" % (loaded_object.__dict__.keys()))
    return loaded_object

def py2_dumps():
    library = py2_test()
    for protocol, path in enumerate(files):
        print("dumping library to %s, protocol=%s" % (path, protocol))
        with open(path, "wb") as writeobj:
            pickle.dump(library, writeobj, protocol=protocol)

def py3_test():
    # this test iterates over the different options trying to load
    # the data pickled with py2 into a py3 environment
    print("starting py3 test")
    for (read_mode, encoding, fix_import, path) in itertools.product(read_modes, encodings, fix_imports_choices, files):
        py3_load(path, read_mode=read_mode, fix_imports=fix_import, encoding=encoding)

def py3_load(path, read_mode, fix_imports, encoding):
    from traceback import print_exc
    print("-" * 50)
    print("path=%s, read_mode = %s fix_imports = %s, encoding = %s" % (path, read_mode, fix_imports, encoding))
    if not os.path.exists(path):
        print("start this file with py2 first")
        return
    try:
        with open(path, read_mode) as fileobj:
            loaded_object = pickle.load(fileobj, fix_imports=fix_imports, encoding=encoding)
        # print the object's __dict__
        print("library dict: %s" % (loaded_object.__dict__.keys()))
        # consider the test a failure if any member attributes are saved as bytes
        test_passed = not any((isinstance(k, bytes) for k in loaded_object.__dict__.keys()))
        print("Test %s" % ("Passed!" if test_passed else "Failed"))
    except Exception:
        print_exc()
        print("Test Failed")
    input("Press Enter to continue...")
    print("-" * 50)

if is_py3:
    py3_test()
else:
    # py2_test()
    py2_dumps()
Put all 3 in the same directory and run c:\python27\python load_pickle.py first, which will create one pickle file for each of the 3 protocols. Then run the same command with Python 3 and notice that this version converts the __dict__ keys to bytes. I had it working for about 6 hours, but for the life of me I can't figure out how I broke it again.

In short, you're hitting bug 22005 with datetime.date objects in the RentalDetails objects.
That can be worked around with the encoding='bytes' parameter, but that leaves your classes with __dict__ containing bytes:
>>> library = pickle.loads(pickle_data, encoding='bytes')
>>> dir(library)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: '<' not supported between instances of 'str' and 'bytes'
It's possible to manually fix that based on your specific data:
def fix_object(obj):
    """Decode obj.__dict__ containing bytes keys"""
    obj.__dict__ = dict((k.decode("ascii"), v) for k, v in obj.__dict__.items())

def fix_library(library):
    """Walk all library objects and decode __dict__ keys"""
    fix_object(library)
    for student in library.students:
        fix_object(student)
    for book in library.books:
        fix_object(book)
        for rental in book.rentals:
            fix_object(rental)
But that's fragile and enough of a pain you should be looking for a better option.
1) Implement __getstate__/__setstate__ that maps datetime objects to a non-broken representation, for instance:
import datetime

class Event(object):
    """Example class working around the datetime pickling bug"""
    def __init__(self):
        self.date = datetime.date.today()

    def __getstate__(self):
        state = self.__dict__.copy()
        state["date"] = state["date"].toordinal()
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self.date = datetime.date.fromordinal(self.date)
2) Don't use pickle at all. Along the lines of __getstate__/__setstate__, you can just implement to_dict/from_dict methods or similar in your classes for saving their content as JSON or some other plain format, for instance:
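As an illustration, here is a minimal to_dict/from_dict sketch. The Book name matches the classes in this question, but the title and purchase_date fields are invented for the example:
import datetime
import json

class Book(object):
    def __init__(self, title, purchase_date):
        self.title = title              # hypothetical fields; adapt to your classes
        self.purchase_date = purchase_date

    def to_dict(self):
        # dates go out as ISO strings, so the pickle/datetime issue can't occur
        return {"title": self.title,
                "purchase_date": self.purchase_date.isoformat()}

    @classmethod
    def from_dict(cls, d):
        y, m, day = map(int, d["purchase_date"].split("-"))
        return cls(d["title"], datetime.date(y, m, day))

book = Book("Dune", datetime.date(2017, 4, 28))
restored = Book.from_dict(json.loads(json.dumps(book.to_dict())))
assert restored.purchase_date == book.purchase_date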
A final note: having a backreference to the library in each object shouldn't be required.

You should treat pickle data as specific to the (major) version of Python that created it.
(See Gregory Smith's message w.r.t. issue 22005.)
The best way to get around this is to write a Python 2.7 program to read the pickled data, and write it out in a neutral format.
Taking a quick look at your actual data, it seems to me that an SQLite database is appropriate as an interchange format, since the Books contain references to a Library and RentalDetails. You could create separate tables for each.
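Such an exporter, run once under Python 2.7, could be sketched like this; the books table layout and the book.title attribute are assumptions about your data, not something taken from your pickle:
# export_library.py -- run under Python 2.7
import pickle
import sqlite3

with open("raw_data.pkl", "rb") as fileobj:
    library = pickle.load(fileobj)

conn = sqlite3.connect("library.db")
conn.execute("CREATE TABLE IF NOT EXISTS books (id INTEGER PRIMARY KEY, title TEXT)")
for i, book in enumerate(library.books):
    # book.title is a guess at your Book attributes; add columns as needed
    conn.execute("INSERT INTO books (id, title) VALUES (?, ?)", (i, book.title))
conn.commit()
conn.close()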

Question: Porting pickle py2 to py3 strings become bytes
Using encoding='latin-1', as shown below, is OK.
Your problem with b'' keys is the result of using encoding='bytes', which unpickles dict keys as bytes instead of as str.
The problem data are the datetime.date values '\x07á\x02\x10', starting at line 56 in raw_data.pkl.
It's a known issue, as already pointed out:
Unpickling python2 datetime under python3
http://bugs.python.org/issue22005
As a workaround, I patched pickle.py and got the object unpickled, e.g.
book.library.books[0].rentals[0].rental_date=2017-02-16
This works for me:
t = pickle.loads(dumpstr, encoding="latin-1")
Output:
<__main__.test object at 0xf7095fec>
t.__dict__={'x': 'test ¢'}
test ¢
Tested with Python 3.4.2

Related

File I/O error using nglview.show_biopython(structure)

So I have been trying to get into visualizing proteins in Python. I went on YouTube and found some tutorials, and ended up on one teaching how to visualize a protein from the COVID-19 virus. I set up Anaconda, got Jupyter Notebook working in VS Code, downloaded the necessary files from the PDB database, and made sure they were in the same directory as my notebook. But when I run the nglview.show_biopython(structure) function I get a ValueError: I/O operation on a closed file. I'm stumped; this is my first time using Jupyter Notebook, so maybe there is something I'm missing, I don't know.
This is what the code looks like:
from Bio.PDB import *
import nglview as nv
parser = PDBParser()
structure = parser.get_structure("6YYT", "6YYT.pdb")
view = nv.show_biopython(structure)
This is the error
Output exceeds the size limit. Open the full output data in a text editor
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_1728\2743687014.py in <module>
----> 1 view = nv.show_biopython(structure)
c:\Users\jerem\anaconda3\lib\site-packages\nglview\show.py in show_biopython(entity, **kwargs)
450 '''
451 entity = BiopythonStructure(entity)
--> 452 return NGLWidget(entity, **kwargs)
453
454
c:\Users\jerem\anaconda3\lib\site-packages\nglview\widget.py in __init__(self, structure, representations, parameters, **kwargs)
243 else:
244 if structure is not None:
--> 245 self.add_structure(structure, **kwargs)
246
247 if representations:
c:\Users\jerem\anaconda3\lib\site-packages\nglview\widget.py in add_structure(self, structure, **kwargs)
1111 if not isinstance(structure, Structure):
1112 raise ValueError(f'{structure} is not an instance of Structure')
-> 1113 self._load_data(structure, **kwargs)
1114 self._ngl_component_ids.append(structure.id)
1115 if self.n_components > 1:
...
--> 200 return io_str.getvalue()
201
202
ValueError: I/O operation on closed file
I only get this error when using nglview.show_biopython; when I run the get_structure() function it can read the file just fine. I can visualize other molecules just fine, or maybe that's because I was using the ASE library instead of a file. I don't know; that's why I'm here.
Update: Recently I found out that I can visualize the protein using nglview.show_file() instead of nglview.show_biopython(). Even though I can visualize proteins now, and technically my problem has been solved, I would still like to know why the show_biopython() function isn't working properly.
I also figured out another way to fix this problem. Going back to the tutorial I mentioned, I saw that it was made in 2021, which made me wonder whether we were using the same versions of each package; it turns out we were not. I'm not sure what version of nglview they were using, but they were using Biopython 1.79, the latest version back in 2021, while I was using Biopython 1.80. With Biopython 1.80 I was getting the error seen above, but now that I'm using Biopython 1.79 I get this:
file = "6YYT.pdb"
parser = PDBParser()
structure = parser.get_structure("6YYT", file)
structure
view = nv.show_biopython(structure)
view
Output:
c:\Users\jerem\anaconda3\lib\site-packages\Bio\PDB\StructureBuilder.py:89:
PDBConstructionWarning: WARNING: Chain A is discontinuous at line 12059.
warnings.warn(
So I guess there is something going on with Biopython 1.80, so I'm going to stick with 1.79.
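If it helps anyone compare setups, a quick way to confirm which versions the notebook kernel actually sees (both packages expose __version__):
import Bio
import nglview

print(Bio.__version__)       # e.g. 1.79 after the downgrade
print(nglview.__version__)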
I had a similar problem with:
from Bio.PDB import *
import nglview as nv
parser = PDBParser(QUIET = True)
structure = parser.get_structure("2ms2", "2ms2.pdb")
save_pdb = PDBIO()
save_pdb.set_structure(structure)
save_pdb.save('pdb_out.pdb')
view = nv.show_biopython(structure)
view
error was like in question:
.................site-packages/nglview/adaptor.py:201, in BiopythonStructure.get_structure_string(self)
199 io_str = StringIO()
200 io_pdb.save(io_str)
--> 201 return io_str.getvalue()
ValueError: I/O operation on closed file
I modified site-packages/nglview/adaptor.py:201, in BiopythonStructure.get_structure_string(self):
def get_structure_string(self):
    from Bio.PDB import PDBIO
    from io import StringIO
    io_pdb = PDBIO()
    io_pdb.set_structure(self._entity)
    io_str = StringIO()
    io_pdb.save(io_str)
    return io_str.getvalue()
with:
def get_structure_string(self):
    from Bio.PDB import PDBIO
    import mmap
    io_pdb = PDBIO()
    io_pdb.set_structure(self._entity)
    mo = mmap_str()
    io_pdb.save(mo)
    return mo.read()
and added this new class, mmap_str(), in the same file:
import mmap
import copy

class mmap_str():
    instance = None

    def __init__(self):
        self.mm = mmap.mmap(-1, 2)
        self.a = ''
        b = '\n'
        self.mm.write(b.encode(encoding='utf-8'))
        self.mm.seek(0)
        #print('self.mm.read().decode() ', self.mm.read().decode(encoding='utf-8'))
        self.mm.seek(0)

    def __new__(cls, *args, **kwargs):
        # behaves as a singleton
        if not isinstance(cls.instance, cls):
            cls.instance = object.__new__(cls)
        return cls.instance

    def write(self, string):
        # re-read what was written so far, then grow the mmap to fit the new chunk
        self.a = str(copy.deepcopy(self.mm.read().decode(encoding='utf-8'))).lstrip('\n')
        self.mm.seek(0)
        #print('a -> ', self.a)
        len_a = len(self.a)
        self.mm = mmap.mmap(-1, len(self.a) + len(string))
        #print('a :', self.a)
        #print('len self.mm ', len(self.mm))
        #print('length string : ', len(string))
        #print(bytes((self.a+string).encode()))
        self.mm.write(bytes((self.a + string).encode()))
        self.mm.seek(0)
        #print('written once ')
        #self.mm.seek(0)

    def read(self):
        self.mm.seek(0)
        a = self.mm.read().decode().lstrip('\n')
        self.mm.seek(0)
        return a

    def __enter__(self):
        return self

    def __exit__(self, *args):
        pass
If I uncomment the print statements I'll get the
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
error, but commenting them out I get my view. While using nglview.show_file(filename) I get a different one. That's because, as can be seen by looking at the pdb_out.pdb file output by my code, Bio.PDB.PDBParser.get_structure(name, filename) doesn't retrieve the pdb header responsible for generating the full CRYSTALLOGRAPHIC SYMMETRY (or biopython can't handle it; not sure about this, help if you know better), but just the coordinates.
I still don't understand what is going on with the:
--> 201 return io_str.getvalue()
ValueError: I/O operation on closed file
Could it be something related to the Jupyter ipykernel? I hope somebody can shed more light on this; I don't know how the framework runs, but it is definitely different from a normal Python interpreter. As an example, the same code in one of my Python virtualenvs will run forever, so maybe ipykernel doesn't like StringIO()s, or does something strange to them?
OK, thanks to the hint in the answer below, I went inspecting PDBIO.py in the GitHub repo for Biopython 1.80 and compared the save method of PDBIO, def save(self, file, select=_select, write_end=True, preserve_atom_numbering=False):, with the one in Biopython 1.79 (the first and last bits of both versions were shown as screenshots).
So apparently the big difference is the with fhandle: block in version 1.80.
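That with block is enough to explain the error: entering a with statement on a StringIO closes the buffer on exit, so a later getvalue() fails. A minimal reproduction, using nothing beyond the standard library:
from io import StringIO

io_str = StringIO()
with io_str:                  # what 1.80's "with fhandle:" effectively does
    io_str.write(u"ATOM ...")
io_str.getvalue()             # ValueError: I/O operation on closed file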
So I realized that changing adaptor.py with adding a subclass of StringIO that looks like:
from io import StringIO

class StringIO(StringIO):
    def __exit__(self, *args, **kwargs):
        print('exiting from subclassed StringIO !!!!!')
        pass
and modifying def get_structure_string(self): like this:
def get_structure_string(self):
    from Bio.PDB import PDBIO
    #from io import StringIO
    io_pdb = PDBIO()
    io_pdb.set_structure(self._entity)
    io_str = StringIO()
    io_pdb.save(io_str)
    return io_str.getvalue()
was enough to get my Biopython 1.80 working in Jupyter with nglview.
That said, I am not sure what the pitfalls are of not closing the StringIO object we use for the visualization, but apparently that is what Biopython 1.79 was doing, and what my first answer using an mmap object was doing too (not closing the mmap_str).
Another way to solve the problem:
Trying to understand git, I ended up with this. It seems more coherent with the previous habits in the biopython project, but I can't push it.
It makes use of as_handle from Bio.File: https://github.com/biopython/biopython/blob/e1902d1cdd3aa9325b4622b25d82fbf54633e251/Bio/File.py#L28
import contextlib

@contextlib.contextmanager
def as_handle(handleish, mode="r", **kwargs):
    r"""Context manager to ensure we are using a handle.

    Context manager for arguments that can be passed to SeqIO and AlignIO read, write,
    and parse methods: either file objects or path-like objects (strings, pathlib.Path
    instances, or more generally, anything that can be handled by the builtin 'open'
    function).

    When given a path-like object, returns an open file handle to that path, with provided
    mode, which will be closed when the manager exits.

    All other inputs are returned, and are *not* closed.

    Arguments:
     - handleish - Either a file handle or path-like object (anything which can be
       passed to the builtin 'open' function, such as str, bytes,
       pathlib.Path, and os.DirEntry objects)
     - mode - Mode to open handleish (used only if handleish is a string)
     - kwargs - Further arguments to pass to open(...)

    Examples
    --------
    >>> from Bio import File
    >>> import os
    >>> with File.as_handle('seqs.fasta', 'w') as fp:
    ...     fp.write('>test\nACGT')
    ...
    10
    >>> fp.closed
    True
    >>> handle = open('seqs.fasta', 'w')
    >>> with File.as_handle(handle) as fp:
    ...     fp.write('>test\nACGT')
    ...
    10
    >>> fp.closed
    False
    >>> fp.close()
    >>> os.remove("seqs.fasta")  # tidy up
    """
    try:
        with open(handleish, mode, **kwargs) as fp:
            yield fp
    except TypeError:
        yield handleish
Could anyone pass it along? (Of course it needs to be checked out; my tests are OK, but I am a novice.)

How to serialize a scandir.DirEntry in Python for sending through a network socket?

I have server and client programs that communicate with each other through a network socket.
What I want is to send a directory entry (scandir.DirEntry) obtained from scandir.scandir() through the socket.
For now I am using pickle and cPickle modules and have come up with the following (excerpt only):
import scandir, pickle
s = scandir.scandir("D:\\PYTHON")
entry = s.next()
data = pickle.dumps(entry)
However, I am getting the following error stack:
File "untitled.py", line 5, in <module>
data = pickle.dumps(item)
File "C:\Python27\Lib\pickle.py", line 1374, in dumps
Pickler(file, protocol).dump(obj)
File "C:\Python27\Lib\pickle.py", line 224, in dump
self.save(obj)
File "C:\Python27\Lib\pickle.py", line 306, in save
rv = reduce(self.proto)
File "C:\Python27\Lib\copy_reg.py", line 70, in _reduce_ex
raise TypeError, "can't pickle %s objects" % base.__name__
TypeError: can't pickle DirEntry objects
How can I get rid of this error?
I have heard of using marshal or JSON.
UPDATE: JSON is not dumping all the data within the object.
Is there any completely different way to do so to send the object through the socket?
Thanks in advance for any help.
Yes, os.DirEntry objects are intended to be short-lived, not really kept around or serialized. If you need the data in them to be serialized, looks like you've figured that out in your own answer -- serialize (pickle) a dict version of the attributes you need.
To deserialize into an object that walks and quacks like an os.DirEntry instance, create a PseudoDirEntry class that mimics the things you need.
Note that you can directly serialize the stat object already, which saves you picking the fields out of that.
Combined, that would look like this:
class PseudoDirEntry:
    def __init__(self, name, path, is_dir, stat):
        self.name = name
        self.path = path
        self._is_dir = is_dir
        self._stat = stat

    def is_dir(self):
        return self._is_dir

    def stat(self):
        return self._stat
And then:
>>> import os, pickle
>>> entry = list(os.scandir())[0]
>>> pickled = pickle.dumps({'name': entry.name, 'path': entry.path, 'is_dir': entry.is_dir(), 'stat': entry.stat()})
>>> loaded = pickle.loads(pickled)
>>> pseudo = PseudoDirEntry(loaded['name'], loaded['path'], loaded['is_dir'], loaded['stat'])
>>> pseudo.name
'.DS_Store'
>>> pseudo.is_dir()
False
>>> pseudo.stat()
os.stat_result(st_mode=33188, st_ino=8370294, st_dev=16777220, st_nlink=1, st_uid=502, st_gid=20, st_size=8196, st_atime=1478356967, st_mtime=1477601172, st_ctime=1477601172)
Well, I have figured out myself that for instances of non-standard classes like this scandir.DirEntry, the best way is to convert the class member data into a (possibly nested) combination of standard objects like list, dict, etc.
For example, in the particular case of scandir.DirEntry, it can be done as follows.
import scandir, pickle

s = scandir.scandir("D:\\PYTHON")
entry = s.next()

# first convert the stat object to st_
st = entry.stat()
st_ = {'st_mode': st.st_mode, 'st_size': st.st_size,
       'st_atime': st.st_atime, 'st_mtime': st.st_mtime,
       'st_ctime': st.st_ctime}

# now convert the entry object to entry_
entry_ = {'name': entry.name, 'is_dir': entry.is_dir(),
          'path': entry.path, 'stat': st_}
# one may need some other class member data also as necessary

# now pickle the converted entry_
data = pickle.dumps(entry_)
Although for my purpose I only require the data, after unpickling at the other end one may need to reconstruct the unpickled entry_ into an unpickled scandir.DirEntry object entry. However, I am yet to figure out how to reconstruct the class instance and set the data so that methods like is_dir() and stat() behave correctly; see the sketch below.
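Building on the PseudoDirEntry idea from the other answer, the reconstruction at the receiving end could be sketched like this (note that stat() here returns the plain st_ dict built above rather than a real os.stat_result):
import pickle

class PseudoDirEntry(object):
    def __init__(self, d):
        self.name = d['name']
        self.path = d['path']
        self._is_dir = d['is_dir']
        self._stat = d['stat']

    def is_dir(self):
        return self._is_dir

    def stat(self):
        return self._stat

entry = PseudoDirEntry(pickle.loads(data))  # 'data' as pickled above
print("%s %s %s" % (entry.name, entry.is_dir(), entry.stat()['st_size']))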

using generators and cStringIO in python to stream strings

I'm trying to read a very large string stream using cStringIO in a python dictionary:
def stream_read(self, path):
    try:
        # create a string stream from the contents at 'path'
        # note: the string at self._my_dict[path] is 7MB in size
        stream = StringIO.StringIO(self._my_dict[path])
        while True:
            # buffer size is 128kB, or 128 * 1024
            buf = stream.read(self.buffer_size)
            if buf != '':
                yield buf
            else:
                raise StopIteration
    except KeyError:
        raise IOError("Could not get content")
And in my test suite, I'm testing this function by first testing stream_write, asserting that the data exists at that path, and then calling stream_read:
def test_stream(self):
    filename = self.gen_random_string()
    # test 7MB
    content = self.gen_random_string(7 * 1024 * 1024)
    # test stream write
    io = StringIO.StringIO(content)
    self._storage.stream_write(filename, io)
    io.close()
    self.assertTrue(self._storage.exists(filename))
    # test read / write
    data = ''
    for buf in self._storage.stream_read(filename):
        data += buf
    self.assertEqual(content, data)
Yet in my test suite, I'm catching an AssertionError:
======================================================================
FAIL: test_stream (test_swift_storage.TestSwiftStorage)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/home/bacongobbler/.../test/test_local_storage.py", line 44, in test_stream
self.assertEqual(content, data)
AssertionError: '[squelched]' != '<cStringIO.StringI object at 0x3148e70>'
----------------------------------------------------------------------
Ran 28 tests in 20.495s
FAILED (failures=1)
It looks related to an issue I posted last week, but I'm still not quite sure I understand why stream is getting set to the Generator as a string in this case.
If anyone wants to take a closer look at the source code, it's all up at https://github.com/bacongobbler/docker-registry/blob/106-swift-storage/test/utils/mock_swift_storage.py
You store just the StringIO object when calling self._storage.stream_write(filename, io):
def put_content(self, path, content, chunk=None):
    path = self._init_path(path)
    try:
        self._swift_container[path] = content
    except Exception:
        raise IOError("Could not put content")
where content is the io object you passed in.
Later on, you pass that file object to StringIO again:
stream = StringIO.StringIO(self.get_content(path))
This calls str() on self.get_content(path), storing the string representation of a cStringIO.StringI() instance:
>>> from cStringIO import StringIO
>>> str(StringIO('test data'))
'<cStringIO.StringI object at 0x1074ea470>'
Your reading code works fine, it is your writing mock that needs to actually take the data out of the StringIO object.
A .read() call will do here:
def put_content(self, path, content, chunk=None):
    path = self._init_path(path)
    try:
        self._swift_container[path] = content.read()
    except Exception:
        raise IOError("Could not put content")

pickle load error "__init__() takes exactly 2 arguments (1 given)"

My issue is that a custom class has been saved with pickle.dump. Since those files were saved, the custom class has been changed, and now when I use pickle.load I get this error. Is it a problem with the saved file?
The error:
File "/cprprod/extern/lib/python2.7/pickle.py", line 1378, in load
return Unpickler(file).load()
File "/cprprod/extern/lib/python2.7/pickle.py", line 858, in load
dispatch[key](self)
file "/cprprod/extern/lib/python2.7/pickle.py", line 1070, in load_inst
self._instantiate(klass, self.marker())
File "/cprprod/extern/lib/python2.7/pickle.py", line 1060, in _instantiate
value = klass(*args)
Is there anything I can do to load the file?
The code
file = open(filename,'rb')
obj = pickle.load(file)
will give me the error.
Here is some minimal code which can reproduce the error:
import pickle

class foo:
    def __init__(self, a):
        self.a = a

    def __str__(self):
        return str(self.a)

obj = foo(1)
with open('junk', 'wb') as f:
    pickle.dump(obj, f)

class foo:
    def __init__(self, a, b):
        self.a = a
        self.b = b

    def __str__(self):
        return '%s %s' % (self.a, self.b)

    def __getinitargs__(self):
        return (self.a, self.b)

with open('junk', 'rb') as f:
    obj = pickle.load(f)

print str(obj)
Given the contrived code that I posted on your behalf in the question, we can "fix" this error as:
with open('junk', 'rb') as f:
    try:
        obj = pickle.load(f)
    except Exception as e:
        print e
        position = f.tell()
        a = foo.__getinitargs__
        del foo.__getinitargs__
        f.seek(position)
        obj = pickle.load(f)
        foo.__getinitargs__ = a
print str(obj)
Now we see that the instance has been unpickled and no longer has attribute b.
If you added __getinitargs__() then it is up to you to make sure your new class can handle the arguments passed to __init__(). Old data that doesn't have the __getinitargs__ data will still lead to __init__ to be called but with no arguments.
Make the arguments to __init__ optional via keyword arguments:
def __init__(self, otherarg=None):
    if otherarg is None:
        # created from an old-revision pickle. Handle separately.
        # The pickle will be loaded *normally* and data will still be set normally
        return
    self.otherarg = otherarg
When loading the old-style pickle, the data for these classes will still be restored. You can use __setstate__() to transform the internal state as needed.
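For instance, a minimal __setstate__() sketch, assuming the new b attribute should simply default to 0 when an old pickle doesn't carry it:
class foo:
    def __init__(self, a=None, b=None):
        self.a = a
        self.b = b

    def __setstate__(self, state):
        # old pickles carry no 'b'; fill in a default instead of failing later
        state.setdefault('b', 0)
        self.__dict__.update(state)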
Alternatively, temporarily remove the __getinitargs__ method from the class:
initargs = foo.__getinitargs__.__func__
del foo.__getinitargs__
obj = pickle.load(f)
foo.__getinitargs__ = initargs
and re-dump your pickles from the now-loaded objects with __getinitargs__ reinstated.
I've tested both methods and in both cases the old data is loaded correctly and you can then dump your objects again to a new pickle file with __getinitargs__ just fine.
You might want to modify the custom class so that the second parameter is optional. This would keep backward compatibility with your pickled objects.

ConfigParser with Unicode items

My troubles with ConfigParser continue. It seems it doesn't support Unicode very well. The config file is indeed saved as UTF-8, but when ConfigParser reads it, it seems to be encoded into something else. I assumed it was latin-1 and I thought overriding optionxform could help:
-- configfile.cfg --
[rules]
Häjsan = 3
☃ = my snowman
-- myapp.py --
# -*- coding: utf-8 -*-
import ConfigParser

def _optionxform(s):
    try:
        newstr = s.decode('latin-1')
        newstr = newstr.encode('utf-8')
        return newstr
    except Exception, e:
        print e

cfg = ConfigParser.ConfigParser()
cfg.optionxform = _optionxform
cfg.read("myconfig")
Of course, when I read the config I get:
'ascii' codec can't decode byte 0xc3 in position 0: ordinal not in range(128)
I've tried a couple of different variations of decoding 's', but the point seems moot, since it really should be a unicode object from the beginning. After all, the config file is UTF-8. I have confirmed that something is wrong in the way ConfigParser reads the file by stubbing it out with this DummyConfig class. If I use that, then everything is nice unicode, fine and dandy.
-- config.py --
# -*- coding: utf-8 -*-
apa = {'rules': [(u'Häjsan', 3), (u'☃', u'my snowman')]}

class DummyConfig(object):
    def sections(self):
        return apa.keys()

    def items(self, section):
        return apa[section]

    def add_section(self, apa):
        pass

    def set(self, *args):
        pass
Any ideas what could be causing this or suggestions of other config modules that supports Unicode better are most welcome. I don't want to use sys.setdefaultencoding()!
The ConfigParser.readfp() method can take a file object; have you tried opening the file object with the correct encoding using the codecs module before passing it to ConfigParser, like below?
cfg.readfp(codecs.open("myconfig", "r", "utf8"))
For Python 3.2 or above, readfp() is deprecated. Use read_file() instead.
In Python 3.2 an encoding parameter was introduced to read(), so it can now be used as:
cfg.read("myconfig", encoding='utf-8')
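For example, a quick Python 3 round trip over the rules section from the question (a sketch; note that the default optionxform lowercases option names):
import configparser

cfg = configparser.ConfigParser()
cfg.read("myconfig", encoding="utf-8")
print(list(cfg["rules"]))    # ['häjsan', '☃']
print(cfg["rules"]["☃"])     # 'my snowman'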
Try to override the write function in RawConfigParser() like this:
from ConfigParser import RawConfigParser

class ConfigWithCoder(RawConfigParser):
    def write(self, fp):
        """Write an .ini-format representation of the configuration state."""
        if self._defaults:
            fp.write("[%s]\n" % "DEFAULT")
            for (key, value) in self._defaults.items():
                fp.write("%s = %s\n" % (key, str(value).replace('\n', '\n\t')))
            fp.write("\n")
        for section in self._sections:
            fp.write("[%s]\n" % section)
            for (key, value) in self._sections[section].items():
                if key == "__name__":
                    continue
                if (value is not None) or (self._optcre == self.OPTCRE):
                    if type(value) == unicode:
                        value = ''.join(value).encode('utf-8')
                    else:
                        value = str(value)
                    value = value.replace('\n', '\n\t')
                    key = " = ".join((key, value))
                fp.write("%s\n" % (key))
            fp.write("\n")
This seems to be a problem with the ConfigParser version for Python 2.x; the version for 3.x is free of this problem. In this issue of the Python Bug Tracker, the status is Closed + WONTFIX.
I've fixed it by editing the ConfigParser.py file. In the write method (around line 412), change:
key = " = ".join((key, str(value).replace('\n', '\n\t')))
to
key = " = ".join((key, str(value).decode('utf-8').replace('\n', '\n\t')))
I don't know if it's a real solution, but tested on Windows 7 and Ubuntu 15.04 it works like a charm, and I can share and work with the same .ini file on both systems.
What I did is just:
file_name = file_name.decode("utf-8")
cfg.read(file_name)
