Proper multiprocess-safe file locking in Python

I'm trying to implement a thread- and multiprocess-safe disk cache for a small application. It only needs to run on modern Linux distributions; compatibility with Windows or OS X is not required.
Here is the code:
import fcntl
import os


class LockedOpen(object):
    """Context manager that opens a file and holds an flock() lock on it."""
    READ = 0
    WRITE = 1

    def __init__(self, filename, mode=READ, block=True):
        self._filename = filename
        if mode == LockedOpen.READ:
            self._open_mode = os.O_RDONLY
            self._lock_op = fcntl.LOCK_SH
            self._open_mode_str = 'rb'
        else:
            self._open_mode = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
            self._lock_op = fcntl.LOCK_EX
            self._open_mode_str = 'wb'
        if not block:
            self._lock_op = self._lock_op | fcntl.LOCK_NB

    def __enter__(self):
        self._fd = os.open(self._filename, self._open_mode)
        fcntl.flock(self._fd, self._lock_op)
        self._file_obj = os.fdopen(self._fd, self._open_mode_str)
        return self._file_obj

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._file_obj.flush()
        os.fdatasync(self._fd)
        fcntl.flock(self._fd, fcntl.LOCK_UN)
        self._file_obj.close()


class DiskCache(object):
    """Maps each key to one file inside a directory."""

    def __init__(self, dirname):
        self._dir = dirname

    def _filename_from_key(self, key):
        return os.path.join(self._dir, key)

    def put(self, key, value):
        with LockedOpen(self._filename_from_key(key), LockedOpen.WRITE) as f:
            f.write(value)

    def get(self, key):
        with LockedOpen(self._filename_from_key(key)) as f:
            return f.read()

    def delete(self, key):
        os.unlink(self._filename_from_key(key))
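For illustration, a minimal concurrent usage sketch of the cache above (hypothetical directory and keys; assumes the directory already exists and the classes and imports shown):

import multiprocessing

def worker(n):
    cache = DiskCache('/tmp/demo-cache')        # hypothetical, pre-created directory
    cache.put('item-%d' % n, b'some bytes')     # exclusive lock while writing
    return cache.get('item-%d' % n)             # shared lock while reading

if __name__ == '__main__':
    pool = multiprocessing.Pool(4)
    print(pool.map(worker, range(8)))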
The question is, am I doing this right? Are there any errors/caveats/implications with this code that I should know about? Any advice regarding performance is welcome as well.
Thanks~

Related

Class inheritance type checking after pickling in Python

Is there a sure-fire way to check that the class of an object is a subclass of the desired superclass?
For example, in a migration script that I'm writing, I have to convert objects of a given type to dictionaries in a given manner to ensure two-way compatibility of the data.
This is best summed up like so:
Serializable
    User
    Status
    Issue
        Test
        Set
    Step
    Cycle
However, when I'm recursively checking objects after depickling, I receive a Test object that yields the following results:
Testing data object type:
type(data)
{type} <class '__main__.Test'>
Testing Class type:
type(Test())
{type} <class '__main__.Test'>
Testing object type against class type:
type(Test()) == type(data)
{bool} False
Testing if object isinstance() of Class:
isinstance(data, Test)
{bool} False
Testing if Class isinstance() of Super Class:
isinstance(Test(), Serializable)
{bool} True
Testing isinstance() of Super Class:
isinstance(data, Serializable)
{bool} False
Interestingly, there is no such problem prior to pickling: the creation of the dictionary and the integrity hash works just fine.
This only crops up with depickled objects, in both Pickle and Dill.
For context, here's the code in its native environment, the DataCache object that is pickled:
class DataCache(object):
_hash=""
_data = None
@staticmethod
def genHash(data):
dataDict = DataCache.dictify(data)
datahash = json.dumps(dataDict, sort_keys=True)
return hashlib.sha256(datahash).digest()
@staticmethod
def dictify(data):
if isinstance(data,list):
datahash = []
for item in data:
datahash.append(DataCache.dictify(item))
elif isinstance(data,(dict, collections.OrderedDict)):
datahash = collections.OrderedDict()
for key, value in data.iteritems():  # iterate over the input, not the new (empty) OrderedDict
datahash[key]= DataCache.dictify(value)
elif isinstance(data, Serializable):
datahash = data.toDict()
else:
datahash = data
return datahash
def __init__(self, restoreDict = {}):
if restoreDict:
self.__dict__.update(restoreDict)
def __getinitargs__(self):
return (self.__dict__)
def set(self, data):
self._hash = DataCache.genHash(data)
self._data = data
def verify(self):
dataHash = DataCache.genHash(self._data)
return (self._hash == dataHash)
def get(self):
return self._data
Finally, I know there are arguments for using JSON for readability in storage, but I needed Pickle's ability to convert straight to and from objects without specifying the object type myself (thanks to the nesting, that's not really feasible).
Am I going mad here or does pickling do something to the class definitions?
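For what it's worth, here is one general way such checks can fail (an illustration only, not necessarily what is happening in this script): isinstance() compares against the class object itself, so if a class is re-declared, or its module is imported again under a different name, instances of the old class no longer match the new class even though the class name is identical.

class Serializable(object):
    pass

class Test(Serializable):
    pass

old = Test()

# Re-declare the classes, as a module reload or a second definition pass would.
class Serializable(object):
    pass

class Test(Serializable):
    pass

print(isinstance(old, Test))                 # False: different class objects
print(isinstance(old, Serializable))         # False, for the same reason
print(type(old).__name__ == Test.__name__)   # True: the names still match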
EDIT:
Minimal Implementation:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
from aenum import Enum
import json # _tricks
import base64
import argparse
import os
import sys
import datetime
import dill
import hashlib
import collections
class Serializable(object):
def __init__(self, initDict={}):
if initDict:
self.__dict__.update(initDict)
def __str__(self):
return str(self.sortSelf())
def sortSelf(self):
return collections.OrderedDict(sorted(self.__dict__.items()))
def toDict(self):
return self.__dict__
def fromDict(self, dict):
# Not using __dict__.update(...) to avoid polluting objects with the excess data
varMap = self.__dict__
if dict and varMap:
for key in varMap:
if (key in dict):
varMap[key] = dict[key]
self.__dict__.update(varMap)
return self
return None
class Issue(Serializable):
def __init__(self, initDict={}):
self.id = 0
self.key = ""
self.fields = {}
if initDict:
self.__dict__.update(initDict)
Serializable.__init__(self)
def fieldToDict(self, obj, key, type):
if key in obj:
result = obj[key]
else:
return None
if result is None:
return None
if isinstance(result, type):
return result.toDict()
return result
def fromDict(self, jsonDict):
super(Issue, self).fromDict(jsonDict)
self.fields["issuetype"] = IssueType().fromDict(self.fields["issuetype"])
self.fields["assignee"] = User().fromDict(self.fields["assignee"])
self.fields["creator"] = User().fromDict(self.fields["creator"])
self.fields["reporter"] = User().fromDict(self.fields["reporter"])
return self
def toDict(self):
result = super(Issue, self).toDict()
blankKeys = []
for fieldName, fieldValue in self.fields.iteritems():
if fieldValue is None:
blankKeys.append(fieldName)
if blankKeys:
for key in blankKeys:
self.fields.pop(key, None)
result["fields"]["issuetype"] = self.fieldToDict(result["fields"], "issuetype", IssueType)
result["fields"]["creator"] = self.fieldToDict(result["fields"], "creator", User)
result["fields"]["reporter"] = self.fieldToDict(result["fields"], "reporter", User)
result["fields"]["assignee"] = self.fieldToDict(result["fields"], "assignee", User)
return result
class IssueType(Serializable):
def __init__(self):
self.id = 0
self.name = ""
def toDict(self):
return {"id": str(self.id)}
class Project(Serializable):
def __init__(self):
Serializable.__init__(self)
self.id = 0
self.name = ""
self.key = ""
class Cycle(Serializable):
def __init__(self):
self.id = 0
self.name = ""
self.totalExecutions = 0
self.endDate = ""
self.description = ""
self.totalExecuted = 0
self.started = ""
self.versionName = ""
self.projectKey = ""
self.versionId = 0
self.environment = ""
self.totalCycleExecutions = 0
self.build = ""
self.ended = ""
self.name = ""
self.modifiedBy = ""
self.projectId = 0
self.startDate = ""
self.executionSummaries = {'executionSummary': []}
class Step(Serializable):
def __init__(self):
self.id = ""
self.orderId = 0
self.step = ""
self.data = ""
self.result = ""
self.attachmentsMap = {}
def toDict(self):
dict = {}
dict["step"] = self.step
dict["data"] = self.data
dict["result"] = self.result
dict["attachments"] = []
return dict
class Status(Serializable):
def __init__(self):
self.id = 0
self.name = ""
self.description = ""
self.isFinal = True
self.color = ""
self.isNative = True
self.statusCount = 0
self.statusPercent = 0.0
class User(Serializable):
def __init__(self):
self.displayName = ""
self.name = ""
self.emailAddress = ""
self.key = ""
self.active = False
self.timeZone = ""
class Execution(Serializable):
def __init__(self):
self.id = 0
self.orderId = 0
self.cycleId = -1
self.cycleName = ""
self.issueId = 0
self.issueKey = 0
self.projectKey = ""
self.comment = ""
self.versionId = 0
self.versionName = ""
self.executedOn = ""
self.creationDate = ""
self.executedByUserName = ""
self.assigneeUserName = ""
self.status = {}
self.executionStatus = ""
def fromDict(self, jsonDict):
super(Execution, self).fromDict(jsonDict)
self.status = Status().fromDict(self.status)
# This is already listed as Execution Status, need to associate and convert!
return self
def toDict(self):
result = super(Execution, self).toDict()
result['status'] = result['status'].toDict()
return result
class ExecutionContainer(Serializable):
def __init__(self):
self.executions = []
def fromDict(self, jsonDict):
super(ExecutionContainer, self).fromDict(jsonDict)
self.executions = []
for executionDict in jsonDict["executions"]:
self.executions.append(Execution().fromDict(executionDict))
return self
class Test(Issue):
def __init__(self, initDict={}):
if initDict:
self.__dict__.update(initDict)
Issue.__init__(self)
def toDict(self):
result = super(Test, self).toDict()
stepField = "CustomField_0001"
if result["fields"][stepField]:
steps = []
for step in result["fields"][stepField]["steps"]:
steps.append(step.toDict())
result["fields"][stepField] = steps
return result
def fromDict(self, jsonDict):
super(Test, self).fromDict(jsonDict)
stepField = "CustomField_0001"
steps = []
if stepField in self.fields:
for step in self.fields[stepField]["steps"]:
steps.append(Step().fromDict(step))
self.fields[stepField] = {"steps": steps}
return self
class Set(Issue):
def __init__(self, initDict={}):
self.__dict__.update(initDict)
Issue.__init__(self)
class DataCache(object):
_hash = ""
_data = None
@staticmethod
def genHash(data):
dataDict = DataCache.dictify(data)
datahash = json.dumps(dataDict, sort_keys=True)
return hashlib.sha256(datahash).digest()
@staticmethod
def dictify(data):
if isinstance(data, list):
datahash = []
for item in data:
datahash.append(DataCache.dictify(item))
elif isinstance(data, (dict, collections.OrderedDict)):
datahash = collections.OrderedDict()
for key, value in data.iteritems():  # iterate over the input, not the new (empty) OrderedDict
datahash[key] = DataCache.dictify(value)
elif isinstance(data, Serializable):
datahash = data.toDict()
else:
datahash = data
return datahash
def __init__(self, restoreDict={}):
if restoreDict:
self.__dict__.update(restoreDict)
def __getinitargs__(self):
return (self.__dict__)
def set(self, data):
self._hash = DataCache.genHash(data)
self._data = data
def verify(self):
dataHash = DataCache.genHash(self._data)
return (self._hash == dataHash)
def get(self):
return self._data
def saveCache(name, projectKey, object):
filePath = "migration_caches/{projectKey}".format(projectKey=projectKey)
if not os.path.exists(path=filePath):
os.makedirs(filePath)
cache = DataCache()
cache.set(object)
targetFile = open("{path}/{name}".format(name=name, path=filePath), 'wb')
dill.dump(obj=cache, file=targetFile)
targetFile.close()
def loadCache(name, projectKey):
filePath = "migration_caches/{projectKey}/{name}".format(name=name, projectKey=projectKey)
result = False
try:
targetFile = open(filePath, 'rb')
try:
cache = dill.load(targetFile)
if isinstance(cache, DataCache):
if cache.verify():
result = cache.get()
except EOFError:
# except BaseException:
print ("Failed to load cache from file: {filePath}\n".format(filePath=filePath))
except IOError:
print ("Failed to load cache file at: {filePath}\n".format(filePath=filePath))
targetFile.close()
return result
testIssue = Test().fromDict({"id": 1000,
"key": "TEST",
"fields": {
"issuetype": {
"id": 1,
"name": "TestIssue"
},
"assignee": "Minothor",
"reporter": "Minothor",
"creator": "Minothor",
}
})
saveCache("Test", "TestProj", testIssue)
result = loadCache("Test", "TestProj")
EDIT 2
The script, in its current form, now seems to work correctly with vanilla Pickle (I initially switched to Dill due to a similar issue, which was solved by the switch).
However, if you are here with this issue and require Dill's features, then as Mike noted in the comments, it is possible to change the settings in dill.settings so that Dill pickles referenced items only (joblib mode), effectively mirroring pickle's standard pickling behaviour.
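If the setting meant here is dill's byref flag (an assumption on my part; the post does not name the exact key), the change might look like this:

import dill

# Pickle classes by reference, the way the stdlib pickle module does,
# instead of serialising the class definition itself.
dill.settings['byref'] = True

payload = {"key": "value"}          # stand-in object
with open('cache.pkl', 'wb') as f:  # hypothetical file name
    dill.dump(payload, f)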

How to efficiently rebuild a pandas HDFStore table when append fails

I am working on using the HDFStore in pandas to store data frames from an ongoing iterative process. At each iteration, I append to a table in the HDFStore. Here is a toy example:
import pandas as pd
from pandas import HDFStore
import numpy as np
from random import choice
from string import ascii_letters
alphanum=np.array(list(ascii_letters)+range(0,9))
def hdfstore_append(storefile,key,df,format="t",columns=None,data_columns=None):
if df is None:
return
if key[0]!='/':
key='/'+key
with HDFStore(storefile) as store:
if key not in store.keys():
store.put(key,df,format=format,columns=columns,data_columns=data_columns)
else:
try:
store.append(key,df)
except Exception as inst:
df = pd.concat([store.get(key),df])
store.put(key,df,format=format,columns=columns,
data_columns=data_columns)
storefile="db.h5"
for i in range(0,100):
df=pd.DataFrame([dict(n=np.random.randn(),
s=''.join(alphanum[np.random.randint(1,len(alphanum),np.random.randint(1,2*(i+1)))]))],index=[i])
hdfstore_append(storefile,'/SO/df',df,columns=df.columns,data_columns=True)
The hdfstore_append function guards against the various exceptions hdfstore.append throws and rebuilds the table when necessary. The issue with this approach is that it gets very slow when the table in the store becomes very large.
Is there a more efficient way to do this?
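As an aside, and only a guess at the failure mode: if append raises because a string column later exceeds the width chosen when the table was first created, reserving a maximum width up front with min_itemsize can avoid the rebuild entirely (the width of 200 below is an arbitrary guess):

with HDFStore(storefile) as store:
    store.append('/SO/df', df, format='t', data_columns=True,
                 min_itemsize={'s': 200})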
Below is an example of an efficient method for building large pandas HDFStores. The key is to cache the frame numbers when the table becomes large. Also, instead of appending, removing any pre-existing data first essentially turns each write into a put.
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import six
import logging
import os
from abc import ABCMeta, abstractmethod, abstractproperty
import warnings
import pandas as pd
logger = logging.getLogger(__name__)
class FramewiseData(object):
"Abstract base class defining a data container with framewise access."
__metaclass__ = ABCMeta
@abstractmethod
def put(self, df):
pass
@abstractmethod
def get(self, frame_no):
pass
@abstractproperty
def frames(self):
pass
@abstractmethod
def close(self):
pass
@abstractproperty
def t_column(self):
pass
def __getitem__(self, frame_no):
return self.get(frame_no)
def __len__(self):
return len(self.frames)
def dump(self, N=None):
"""Return data from all, or the first N, frames in a single DataFrame
Parameters
----------
N : integer
optional; if None, return all frames
Returns
-------
DataFrame
"""
if N is None:
return pd.concat(iter(self))
else:
i = iter(self)
return pd.concat((next(i) for _ in range(N)))
@property
def max_frame(self):
return max(self.frames)
def _validate(self, df):
if self.t_column not in df.columns:
raise ValueError("Cannot write frame without a column "
"called {0}".format(self.t_column))
if df[self.t_column].nunique() != 1:
raise ValueError("Found multiple values for 'frame'. "
"Write one frame at a time.")
def __iter__(self):
return self._build_generator()
def _build_generator(self):
for frame_no in self.frames:
yield self.get(frame_no)
def __enter__(self):
return self
def __exit__(self, type, value, traceback):
self.close()
KEY_PREFIX = 'Frame_'
len_key_prefix = len(KEY_PREFIX)
def code_key(frame_no):
"Turn the frame_no into a 'natural name' string idiomatic of HDFStore"
key = '{0}{1}'.format(KEY_PREFIX, frame_no)
return key
def decode_key(key):
frame_no = int(key[len_key_prefix:])
return frame_no
class PandasHDFStore(FramewiseData):
"""An interface to an HDF5 file with framewise access, using pandas.
Save each frame's data to a node in a pandas HDFStore.
Any additional keyword arguments to the constructor are passed to
pandas.HDFStore().
"""
def __init__(self, filename, mode='a', t_column='frame', **kwargs):
self.filename = os.path.abspath(filename)
self._t_column = t_column
self.store = pd.HDFStore(self.filename, mode, **kwargs)
@property
def t_column(self):
return self._t_column
@property
def max_frame(self):
return max(self.frames)
def put(self, df):
if len(df) == 0:
warnings.warn('An empty DataFrame was passed to put(). Continuing.')
return
frame_no = df[self.t_column].values[0] # validated to be all the same
key = code_key(frame_no)
# Store data as tabular instead of fixed-format.
# Make sure to remove any pre-existing data, so we don't really 'append'.
try:
self.store.remove(key)
except KeyError:
pass
self.store.put(key, df, format='table')
def get(self, frame_no):
key = code_key(frame_no)
frame = self.store.get(key)
return frame
@property
def frames(self):
"""Returns sorted list of integer frame numbers in file"""
return self._get_frame_nos()
def _get_frame_nos(self):
"""Returns sorted list of integer frame numbers in file"""
# Pandas' store.keys() scans the entire file looking for stored Pandas
# structures. This is very slow for large numbers of frames.
# Instead, scan the root level of the file for nodes with names
# matching our scheme; we know they are DataFrames.
r = [decode_key(key) for key in self.store.root._v_children.keys() if
key.startswith(KEY_PREFIX)]
r.sort()
return r
def close(self):
self.store.close()
class PandasHDFStoreBig(PandasHDFStore):
"""Like PandasHDFStore, but keeps a cache of frame numbers.
This can give a large performance boost when a file contains thousands
of frames.
If a file was made in PandasHDFStore, opening it with this class
and then closing it will add a cache (if mode != 'r').
Any additional keyword arguments to the constructor are passed to
pandas.HDFStore().
"""
def __init__(self, filename, mode='a', t_column='frame', **kwargs):
self._CACHE_NAME = '_Frames_Cache'
self._frames_cache = None
self._cache_dirty = False # Whether _frames_cache needs to be written out
super(PandasHDFStoreBig, self).__init__(filename, mode, t_column,
**kwargs)
@property
def frames(self):
# Hit memory cache, then disk cache
if self._frames_cache is not None:
return self._frames_cache
else:
try:
self._frames_cache = list(self.store[self._CACHE_NAME].index.values)
self._cache_dirty = False
except KeyError:
self._frames_cache = self._get_frame_nos()
self._cache_dirty = True # In memory, but not in file
return self._frames_cache
def put(self, df):
self._invalidate_cache()
super(PandasHDFStoreBig, self).put(df)
def rebuild_cache(self):
"""Delete cache on disk and rebuild it."""
self._invalidate_cache()
_ = self.frames # Compute cache
self._flush_cache()
def _invalidate_cache(self):
self._frames_cache = None
try:
del self.store[self._CACHE_NAME]
except KeyError: pass
def _flush_cache(self):
"""Writes frame cache if dirty and file is writable."""
if (self._frames_cache is not None and self._cache_dirty
and self.store.root._v_file._iswritable()):
self.store[self._CACHE_NAME] = pd.DataFrame({'dummy': 1},
index=self._frames_cache)
self._cache_dirty = False
def close(self):
"""Updates cache, writes if necessary, then closes file."""
if self.store.root._v_file._iswritable():
_ = self.frames # Compute cache
self._flush_cache()
super(PandasHDFStoreBig, self).close()
class PandasHDFStoreSingleNode(FramewiseData):
"""An interface to an HDF5 file with framewise access,
using pandas, that is faster for cross-frame queries.
This implementation is more complex than PandasHDFStore,
but it simplifies (speeds up?) cross-frame queries,
like queries for a single probe's entire trajectory.
Any additional keyword arguments to the constructor are passed to
pandas.HDFStore().
"""
def __init__(self, filename, key='FrameData', mode='a', t_column='frame',
use_tabular_copy=False, **kwargs):
self.filename = os.path.abspath(filename)
self.key = key
self._t_column = t_column
self.store = pd.HDFStore(self.filename, mode, **kwargs)
with pd.get_store(self.filename) as store:
try:
store[self.key]
except KeyError:
pass
else:
self._validate_node(use_tabular_copy)
@property
def t_column(self):
return self._t_column
def put(self, df):
if len(df) == 0:
warnings.warn('An empty DataFrame was passed to put(). Continuing.')
return
self._validate(df)
self.store.append(self.key, df, data_columns=True)
def get(self, frame_no):
frame = self.store.select(self.key, '{0} == {1}'.format(
self._t_column, frame_no))
return frame
def dump(self, N=None):
"""Return data from all, or the first N, frames in a single DataFrame
Parameters
----------
N : integer
optional; if None, return all frames
Returns
-------
DataFrame
"""
if N is None:
return self.store.select(self.key)
else:
Nth_frame = self.frames[N - 1]
return self.store.select(self.key, '{0} <= {1}'.format(
self._t_column, Nth_frame))
def close(self):
self.store.close()
def __del__(self):
if hasattr(self, 'store'):
self.close()
@property
def frames(self):
"""Returns sorted list of integer frame numbers in file"""
# I assume one column can fit in memory, which is not ideal.
# Chunking does not seem to be implemented for select_column.
frame_nos = self.store.select_column(self.key, self.t_column).unique()
frame_nos.sort()
return frame_nos
def _validate_node(self, use_tabular_copy):
# The HDFStore might be non-tabular, which means we cannot select a
# subset, and this whole structure will not work.
# For convenience, this can rewrite the table into a tabular node.
if use_tabular_copy:
self.key = _make_tabular_copy(self.filename, self.key)
pandas_type = getattr(getattr(getattr(
self.store._handle.root, self.key, None), '_v_attrs', None),
'pandas_type', None)
if not pandas_type == 'frame_table':
raise ValueError("This node is not tabular. Call with "
"use_tabular_copy=True to proceed.")
def _make_tabular_copy(store, key):
"""Copy the contents of a nontabular node in a pandas HDFStore
into a tabular node"""
tabular_key = key + '/tabular'
logger.info("Making a tabular copy of %s at %s", key, tabular_key)
store.append(tabular_key, store.get(key), data_columns=True)
return tabular_key
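A brief usage sketch of the PandasHDFStoreBig class above (hypothetical file name and toy data; each put() takes a DataFrame covering exactly one frame):

import pandas as pd

with PandasHDFStoreBig('trajectories.h5') as s:
    for frame_no in range(3):
        frame_df = pd.DataFrame({'x': [1.0, 2.0], 'y': [3.0, 4.0], 'frame': frame_no})
        s.put(frame_df)          # one tabular node per frame
    print(s.frames)              # [0, 1, 2], served from the frame-number cache
    everything = s.dump()        # concatenate every frame into one DataFrame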

Get all subclasses defined in several python modules

I have two Python files that implement a test repository factory pattern:
testrepository.py
class TestRepository(object):
def __init__(self):
pass
def generate(self, username, password):
pass
def update(self):
pass
class RepositoryFactory(object):
factories = {}
def add_repo(identification, repo_factory):
RepositoryFactory.factories[identification] = repo_factory
add_repo = staticmethod(add_repo)
def create_repo(identification, server, dirpath, reponame):
if identification not in RepositoryFactory.factories:
RepositoryFactory.factories[identification] = \
eval(identification + '.Factory()')
return RepositoryFactory.factories[identification].create(server, dirpath, reponame)
create_repo = staticmethod(create_repo)
gitrepository.py
from os import path
from testrepository import TestRepository, RepositoryFactory
class GitRepository(TestRepository):
def __init__(self, server, dirpath, reponame):
super(GitRepository, self).__init__()
self.server = server
self.dirpath = dirpath
self.reponame = reponame
self.username = None
self.password = None
self.fullPath = path.join(self.dirpath, self.reponame)
def generate(self, username, password):
pass
def update(self):
pass
class Factory(object):
@staticmethod
def create(server, dirpath, reponame):
return GitRepository(server=server, dirpath=dirpath, reponame=reponame)
if __name__ == '__main__':
repo_type = TestRepository.__subclasses__().pop().__name__
repo = RepositoryFactory.create_repo(repo_type,
server='localhost',
dirpath='/home/user/git',
reponame='repo')
repo.generate(username='test', password='test')
repo.update()
The problem is that TestRepository.__subclasses__() in the example above is empty; if I put all the classes in one file, it works:
>>> print TestRepository.__subclasses__()
[<class '__main__.GitRepository'>]
So the question is: how do I fetch all subclasses from several Python modules?
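For reference: __subclasses__() only sees subclasses whose class statements have actually run, which means their defining modules must have been imported first. A minimal sketch (hypothetical driver module, assuming the two files above):

# main.py
from testrepository import TestRepository, RepositoryFactory
import gitrepository  # merely importing the module registers GitRepository

print(TestRepository.__subclasses__())
# [<class 'gitrepository.GitRepository'>]

To discover every module in a package automatically, the modules still need to be imported, for example by walking the package with pkgutil.iter_modules and calling importlib.import_module on each name.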

python unpickling EOFError when Reading file

I've written a basic storage class that holds data in a list of lists, writes it to a file, and then allows it to be read back from the file for persistence. It was working on the system it was originally written on (Windows 7); I moved it to a second system (Windows 8) and now it seems to be malfunctioning.
The error I'm receiving is:
Traceback (most recent call last):
File "D:\Code Vault\pydb.py", line 69, in <module>
print Users.ReadData()
File "D:\Code Vault\pydb.py", line 23, in ReadData
self.data = cPickle.load(self.Inputstream)
EOFError
The full code for the class is as follows
import cPickle
class table():
#TODO Insert Key Field to facilitate ID Search
def __init__(self,*args):
self.tablename = args[0]
self.data = []
self.colnames = {}
self.filename = str(self.tablename)+ ".p"
for e in args[1:]:
self.colnames.update({e:len(self.colnames)})
self.Filestorage = open(self.filename, "wb" )
self.Filestorage.close()
def ReadData(self):
self.Inputstream = open(self.filename, "rb")
self.Inputstream.seek(0)
self.data = cPickle.load(self.Inputstream)
self.Inputstream.close()
return self.data
def AddData (self, *args):
self.tempdata = []
for e in args:
self.tempdata.append(e)
if len(args) == len(self.colnames):
self.data.append(self.tempdata)
else:
return "Incorrect argument quantities"
return self.tempdata
def Commit(self):
self.Outputstream = open(self.filename, "ab" )
cPickle.dump(self.data, self.Outputstream)
self.Outputstream.close()
return self.ReadData()
def Search(self,query):
for e in self.ReadData():
if query in e:
return self.ReadData().index(e)
else:
return "Value " + str(query) + " does not exist"
def Update(self,query, edit):
for e in self.data:
if query in e:
e[e.index(query)] = edit
return self.data
def DumpTable(self):
self.data = []
self.Outputstream = open(self.filename+".p", "wb" )
cPickle.dump(self.data, self.Outputstream)
self.Outputstream.close()
self.ReadData()
Users = table("Users","Username","Password")
Users.AddData("bob","123456")
# Users.Commit()
# print Users.data
# print Users.filename
# print Users.data
print Users.ReadData()
It's probably a case of code blindness, as it was working previously. Any ideas would be appreciated.
The amended code that answers the question:
import pickle
class table():
#TODO Insert Key Field to facilitate ID Search
def __init__(self,*args):
self.tablename = args[0]
self.data = []
self.colnames = {}
self.filename = str(self.tablename)+ ".p"
for e in args[1:]:
self.colnames.update({e:len(self.colnames)})
try: #creating this conditional prevented the contents being erased upon instantiating the class
f = open(self.filename,"rb")
except IOError:
self.createfile = open(self.filename, "wb" )
self.createfile.close()
def ReadData(self):
self.Inputstream = open(self.filename, "rb")
self.data = pickle.load(self.Inputstream)
self.Inputstream.close()
return self.data
def AddData (self, *args):
self.tempdata = []
for e in args:
self.tempdata.append(e)
if len(args) == len(self.colnames):
self.data.append(self.tempdata)
else:
return "Incorrect argument quantities"
return self.tempdata
def Commit(self):
self.Outputstream = open(self.filename, "wb" )
pickle.dump(self.data, self.Outputstream)
self.Outputstream.close()
def createfile(self):
self.createfile = open(self.filename, "wb" )
self.createfile.close()
def Search(self,query):
for e in self.ReadData():
if query in e:
return self.ReadData().index(e)
else:
return "Value " + str(query) + " does not exist"
def Update(self,query, edit):
for e in self.data:
if query in e:
e[e.index(query)] = edit
return self.data
def DumpTable(self):
self.data = []
self.Outputstream = open(self.filename+".p", "wb" )
pickle.dump(self.data, self.Outputstream)
self.Outputstream.close()
self.ReadData()
The solution seems to be here: pickle.load() raising EOFError in Windows, i.e. the file needs to be opened for reading in binary mode with the "rb" flag.
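For reference, a minimal pickle round trip independent of the class above, using context managers and binary modes on both sides (hypothetical file name):

import pickle  # cPickle on Python 2

data = [["bob", "123456"]]

with open("users.p", "wb") as out_stream:   # write bytes
    pickle.dump(data, out_stream)

with open("users.p", "rb") as in_stream:    # read the bytes back
    restored = pickle.load(in_stream)

assert restored == data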

Python generate sorted list

I want to compress my movies automatically, so I've written a MediaInfo wrapper class in Python to generate XML output, which I then parse into a MovieInfo class with a list of audio and subtitle tracks.
__author__ = 'dominik'
class Error(Exception):
""" Error class
"""
class ValidationError(Error):
""" Invalid or missing xml items
"""
class MovieInfo(object):
""" Description of movie file
"""
def __init__(self, media_info):
self._video_track = None
self._audio_tracks = []
self._subtitle_tracks = []
self.valid_movie = True
for track in media_info.tracks:
if track.track_type == "Audio":
self._audio_tracks.append(AudioTrack(track))
elif track.track_type == "Text":
self._subtitle_tracks.append(SubtitleTrack(track))
elif track.track_type == "Video":
self._video_track = VideoTrack(track)
@property
def audio_tracks(self):
if not hasattr(self, "_audio_tracks"):
self._audio_tracks = []
if len(self._audio_tracks) != 0:
return self._audio_tracks
@property
def subtitle_tracks(self):
if not hasattr(self, "_subtitle_tracks"):
self._subtitle_tracks = []
if len(self._subtitle_tracks) != 0:
return self._subtitle_tracks
class Track(object):
""" Abstract track class for audio and subtitle tracks
"""
__KNOWN_LANGUAGE_CODES = {"en": "ENG", "de": "DE"}
def __init__(self, track, valid_codecs):
self._valid = True
track_id = int(track.id)
codec_id = self._determine_codec(track.codec_id, valid_codecs)
language = self._determine_language(track.language)
self._id = track_id
self._codec_id = codec_id
self._language = language
def _determine_codec(self, track_codec, types):
result = types.get(track_codec, None)
if result is None:
self._valid = False
return result
def _determine_language(self, track_language, types=__KNOWN_LANGUAGE_CODES):
result = types.get(track_language, None)
if result is None:
self._valid = False
return result
class AudioTrack(Track):
""" Audio track class
"""
__KNOWN_AUDIO_CODECS = {"A_DTS": "DTS", "A_AC3": "AC3"}
def __init__(self, track):
self._type = 1
Track.__init__(self, track, self.__KNOWN_AUDIO_CODECS)
class SubtitleTrack(Track):
""" Subtitle track class
"""
__KNOWN_SUBTITLE_CODECS = {"S_VOBSUB": "VOBSUB"}
def __init__(self, track):
self._type = 2
if track.forced == "Yes":
self._forced = True
else:
self._forced = False
Track.__init__(self, track, self.__KNOWN_SUBTITLE_CODECS)
class VideoTrack(object):
""" Video track class (only one video track in movie info!)
"""
def __init__(self, track):
self._type = 0
self._framerate = float(track.frame_rate)
self._width = track.width
self._height = track.height
Here is the mediainfo class (it's the pymediainfo class):
from subprocess import Popen
import os
from tempfile import mkstemp
from bs4 import BeautifulSoup, NavigableString
from setuptools.compat import unicode
class Track(object):
""" Hold the track information
"""
def __getattr__(self, item):
try:
return object.__getattribute__(self, item)
except:
pass
return None
def __init__(self, xml_track):
self.xml_track = xml_track
self.track_type = xml_track.attrs["type"]
for child in self.xml_track.children:
if not isinstance(child, NavigableString):
node_name = child.name.lower().strip()
node_value = unicode(child.string)
node_other_name = "other_%s" % node_name
if getattr(self, node_name) is None:
setattr(self, node_name, node_value)
else:
if getattr(self, node_other_name) is None:
setattr(self, node_other_name, [node_value, ])
else:
getattr(self, node_other_name).append(node_value)
for key in [c for c in self.__dict__.keys() if c.startswith("other_")]:
try:
primary = key.replace("other_", "")
setattr(self, primary, int(getattr(self, primary)))
except:
for value in getattr(self, key):
try:
actual = getattr(self, primary)
setattr(self, primary, int(value))
getattr(self, key).append(actual)
break
except:
pass
def __repr__(self):
return("<Track id='{0}', type='{1}'>".format(self.id, self.track_type))
def to_data(self):
data = {}
for k, v in self.__dict__.items():
if k != 'xml_track':
data[k] = v
return data
class Mediainfo(object):
""" MediaInfo wrapper
"""
def __init__(self, xml):
self.xml_dom = xml
if isinstance(xml, str):
self.xml_dom = BeautifulSoup(xml, "xml")
def _populate_tracks(self):
if self.xml_dom is None:
return
for xml_track in self.xml_dom.Mediainfo.File.find_all("track"):
self._tracks.append(Track(xml_track))
@property
def tracks(self):
if not hasattr(self, "_tracks"):
self._tracks = []
if len(self._tracks) == 0:
self._populate_tracks()
return self._tracks
@staticmethod
def parse(filename):
filehandler_out, filename_out = mkstemp(".xml", "mediainfo-")
filehandler_err, filename_err = mkstemp(".error", "mediainfo-")
filepointer_out = os.fdopen(filehandler_out, "r+b")
filepointer_err = os.fdopen(filehandler_err, "r+b")
mediainfo_command = ["mediainfo", "-f", "--Output=XML", filename]
p = Popen(mediainfo_command, stdout=filepointer_out, stderr=filepointer_err)
p.wait()
filepointer_out.seek(0)
xml_dom = BeautifulSoup(filepointer_out.read(), "xml")
filepointer_out.close()
filepointer_err.close()
print(xml_dom)
return Mediainfo(xml_dom)
def to_data(self):
data = {'tracks': []}
for track in self.tracks:
data['tracks'].append(track.to_data())
return data
This class gives me every track in the XML, and then I parse the relevant info in MovieInfo.
OK, now I have a list of audio tracks, e.g. three tracks: one in German with DTS, one in German with AC3 and one in English with AC3. Now I want to get the IDs of the tracks in the format "1,2,3" to pass to the HandBrake CLI.
My problem is the order of the tracks. If there is a German DTS track, it should be the first track; the second track should also be the German one, but compressed to AAC; and the third track should be an English track in AAC. If there is only a German AC3 track, then the first track should be that track compressed to AAC, and the second track should be English and AAC.
I don't know exactly how I can achieve that, can you help me? I'm new to Python and come from C, C++ and C#. In C# this would be very easy with a lambda.
Assuming you know how to define a comparator that, given two items, decides which one is bigger, Python works just as well as C or C++.
Start here:
1. https://wiki.python.org/moin/HowTo/Sorting/
2. https://developers.google.com/edu/python/sorting
3. http://docs.python.org/2/library/functions.html#sorted
Use the sorted() built-in and define the key you want.
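A minimal sketch of that idea, using hypothetical (id, language, codec) tuples in place of the AudioTrack objects above:

# Rank German before English and DTS before AC3; sorted() is stable,
# so equal keys keep their original order.
tracks = [(1, "DE", "DTS"), (2, "DE", "AC3"), (3, "ENG", "AC3")]
language_rank = {"DE": 0, "ENG": 1}
codec_rank = {"DTS": 0, "AC3": 1}

ordered = sorted(tracks, key=lambda t: (language_rank[t[1]], codec_rank[t[2]]))
print(",".join(str(t[0]) for t in ordered))  # -> "1,2,3"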
