I am working on using the HDFStore in pandas to store data frames from an ongoing iterative process. At each iteration, I append to a table in the HDFStore. Here is a toy example:
import pandas as pd
from pandas import HDFStore
import numpy as np
from random import choice
from string import ascii_letters
alphanum = np.array(list(ascii_letters) + [str(d) for d in range(0, 9)])
def hdfstore_append(storefile, key, df, format="t", columns=None, data_columns=None):
    if df is None:
        return
    if key[0] != '/':
        key = '/' + key
    with HDFStore(storefile) as store:
        if key not in store.keys():
            store.put(key, df, format=format, columns=columns, data_columns=data_columns)
        else:
            try:
                store.append(key, df)
            except Exception:
                # append can fail, e.g. when a string column outgrows its itemsize;
                # in that case rebuild the whole table from scratch
                df = pd.concat([store.get(key), df])
                store.put(key, df, format=format, columns=columns,
                          data_columns=data_columns)
storefile="db.h5"
for i in range(0,100):
df=pd.DataFrame([dict(n=np.random.randn(),
s=''.join(alphanum[np.random.randint(1,len(alphanum),np.random.randint(1,2*(i+1))]))],index=[i])
hdfstore_append(storefile,'/SO/df',df,columns=df.columns,data_columns=True)
The hdfstore_append function guards against the various exceptions HDFStore.append throws and rebuilds the table when necessary. The issue with this approach is that it gets very slow once the table in the store becomes very large.
Is there a more efficient way to do this?
Below is an example of an efficient method for building large pandas HDFStores. The key is to cache the frame numbers when the number of tables becomes large. Also, instead of appending, any pre-existing data for a frame is removed first, so each write is essentially a put.
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import six
import logging
import os
from abc import ABCMeta, abstractmethod, abstractproperty
import warnings
import pandas as pd
logger = logging.getLogger(__name__)
class FramewiseData(object):
"Abstract base class defining a data container with framewise access."
__metaclass__ = ABCMeta
    @abstractmethod
def put(self, df):
pass
    @abstractmethod
def get(self, frame_no):
pass
    @abstractproperty
def frames(self):
pass
    @abstractmethod
def close(self):
pass
    @abstractproperty
def t_column(self):
pass
def __getitem__(self, frame_no):
return self.get(frame_no)
def __len__(self):
return len(self.frames)
def dump(self, N=None):
"""Return data from all, or the first N, frames in a single DataFrame
Parameters
----------
N : integer
optional; if None, return all frames
Returns
-------
DataFrame
"""
if N is None:
return pd.concat(iter(self))
else:
i = iter(self)
return pd.concat((next(i) for _ in range(N)))
    @property
def max_frame(self):
return max(self.frames)
def _validate(self, df):
if self.t_column not in df.columns:
raise ValueError("Cannot write frame without a column "
"called {0}".format(self.t_column))
if df[self.t_column].nunique() != 1:
raise ValueError("Found multiple values for 'frame'. "
"Write one frame at a time.")
def __iter__(self):
return self._build_generator()
def _build_generator(self):
for frame_no in self.frames:
yield self.get(frame_no)
def __enter__(self):
return self
def __exit__(self, type, value, traceback):
self.close()
KEY_PREFIX = 'Frame_'
len_key_prefix = len(KEY_PREFIX)
def code_key(frame_no):
"Turn the frame_no into a 'natural name' string idiomatic of HDFStore"
key = '{0}{1}'.format(KEY_PREFIX, frame_no)
return key
def decode_key(key):
frame_no = int(key[len_key_prefix:])
return frame_no
class PandasHDFStore(FramewiseData):
"""An interface to an HDF5 file with framewise access, using pandas.
Save each frame's data to a node in a pandas HDFStore.
Any additional keyword arguments to the constructor are passed to
pandas.HDFStore().
"""
def __init__(self, filename, mode='a', t_column='frame', **kwargs):
self.filename = os.path.abspath(filename)
self._t_column = t_column
self.store = pd.HDFStore(self.filename, mode, **kwargs)
    @property
def t_column(self):
return self._t_column
    @property
def max_frame(self):
return max(self.frames)
def put(self, df):
if len(df) == 0:
warnings.warn('An empty DataFrame was passed to put(). Continuing.')
return
frame_no = df[self.t_column].values[0] # validated to be all the same
key = code_key(frame_no)
        # Store data as tabular instead of fixed-format.
        # Make sure to remove any preexisting data, so this isn't really an 'append'.
try:
self.store.remove(key)
except KeyError:
pass
self.store.put(key, df, format='table')
def get(self, frame_no):
key = code_key(frame_no)
frame = self.store.get(key)
return frame
    @property
def frames(self):
"""Returns sorted list of integer frame numbers in file"""
return self._get_frame_nos()
def _get_frame_nos(self):
"""Returns sorted list of integer frame numbers in file"""
# Pandas' store.keys() scans the entire file looking for stored Pandas
# structures. This is very slow for large numbers of frames.
# Instead, scan the root level of the file for nodes with names
# matching our scheme; we know they are DataFrames.
r = [decode_key(key) for key in self.store.root._v_children.keys() if
key.startswith(KEY_PREFIX)]
r.sort()
return r
def close(self):
self.store.close()
class PandasHDFStoreBig(PandasHDFStore):
"""Like PandasHDFStore, but keeps a cache of frame numbers.
This can give a large performance boost when a file contains thousands
of frames.
If a file was made in PandasHDFStore, opening it with this class
and then closing it will add a cache (if mode != 'r').
Any additional keyword arguments to the constructor are passed to
pandas.HDFStore().
"""
def __init__(self, filename, mode='a', t_column='frame', **kwargs):
self._CACHE_NAME = '_Frames_Cache'
self._frames_cache = None
self._cache_dirty = False # Whether _frames_cache needs to be written out
super(PandasHDFStoreBig, self).__init__(filename, mode, t_column,
**kwargs)
    @property
def frames(self):
# Hit memory cache, then disk cache
if self._frames_cache is not None:
return self._frames_cache
else:
try:
self._frames_cache = list(self.store[self._CACHE_NAME].index.values)
self._cache_dirty = False
except KeyError:
self._frames_cache = self._get_frame_nos()
self._cache_dirty = True # In memory, but not in file
return self._frames_cache
def put(self, df):
self._invalidate_cache()
super(PandasHDFStoreBig, self).put(df)
def rebuild_cache(self):
"""Delete cache on disk and rebuild it."""
self._invalidate_cache()
_ = self.frames # Compute cache
self._flush_cache()
def _invalidate_cache(self):
self._frames_cache = None
try:
del self.store[self._CACHE_NAME]
except KeyError: pass
def _flush_cache(self):
"""Writes frame cache if dirty and file is writable."""
if (self._frames_cache is not None and self._cache_dirty
and self.store.root._v_file._iswritable()):
self.store[self._CACHE_NAME] = pd.DataFrame({'dummy': 1},
index=self._frames_cache)
self._cache_dirty = False
def close(self):
"""Updates cache, writes if necessary, then closes file."""
if self.store.root._v_file._iswritable():
_ = self.frames # Compute cache
self._flush_cache()
super(PandasHDFStoreBig, self).close()
class PandasHDFStoreSingleNode(FramewiseData):
"""An interface to an HDF5 file with framewise access,
using pandas, that is faster for cross-frame queries.
This implementation is more complex than PandasHDFStore,
but it simplifies (speeds up?) cross-frame queries,
like queries for a single probe's entire trajectory.
Any additional keyword arguments to the constructor are passed to
pandas.HDFStore().
"""
def __init__(self, filename, key='FrameData', mode='a', t_column='frame',
use_tabular_copy=False, **kwargs):
self.filename = os.path.abspath(filename)
self.key = key
self._t_column = t_column
self.store = pd.HDFStore(self.filename, mode, **kwargs)
with pd.get_store(self.filename) as store:
try:
store[self.key]
except KeyError:
pass
else:
self._validate_node(use_tabular_copy)
    @property
def t_column(self):
return self._t_column
def put(self, df):
if len(df) == 0:
warnings.warn('An empty DataFrame was passed to put(). Continuing.')
return
self._validate(df)
self.store.append(self.key, df, data_columns=True)
def get(self, frame_no):
frame = self.store.select(self.key, '{0} == {1}'.format(
self._t_column, frame_no))
return frame
def dump(self, N=None):
"""Return data from all, or the first N, frames in a single DataFrame
Parameters
----------
N : integer
optional; if None, return all frames
Returns
-------
DataFrame
"""
if N is None:
return self.store.select(self.key)
else:
Nth_frame = self.frames[N - 1]
return self.store.select(self.key, '{0} <= {1}'.format(
self._t_column, Nth_frame))
def close(self):
self.store.close()
def __del__(self):
if hasattr(self, 'store'):
self.close()
    @property
def frames(self):
"""Returns sorted list of integer frame numbers in file"""
# I assume one column can fit in memory, which is not ideal.
# Chunking does not seem to be implemented for select_column.
frame_nos = self.store.select_column(self.key, self.t_column).unique()
frame_nos.sort()
return frame_nos
def _validate_node(self, use_tabular_copy):
# The HDFStore might be non-tabular, which means we cannot select a
# subset, and this whole structure will not work.
# For convenience, this can rewrite the table into a tabular node.
if use_tabular_copy:
self.key = _make_tabular_copy(self.filename, self.key)
pandas_type = getattr(getattr(getattr(
self.store._handle.root, self.key, None), '_v_attrs', None),
'pandas_type', None)
if not pandas_type == 'frame_table':
raise ValueError("This node is not tabular. Call with "
"use_tabular_copy=True to proceed.")
def _make_tabular_copy(store, key):
    """Copy the contents of a nontabular node in a pandas HDFStore
    into a tabular node."""
tabular_key = key + '/tabular'
logger.info("Making a tabular copy of %s at %s", (key, tabular_key))
store.append(tabular_key, store.get(key), data_columns=True)
return tabular_key
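To connect this back to the original question, here is a usage sketch (my own, assuming the classes above are available): each iteration writes one frame to its own node, so the cost per write no longer grows with the size of the store.
import numpy as np
import pandas as pd

with PandasHDFStoreBig("db_frames.h5") as store:
    for i in range(100):
        # one row per iteration; put() requires the 'frame' column
        df = pd.DataFrame({"n": [np.random.randn()], "frame": [i]})
        store.put(df)
    print(store.max_frame)      # frame numbers are served from the cache
    everything = store.dump()   # one DataFrame concatenated across frames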
Recently I've been working on a project and found behaviour that I don't understand. We have an endpoint that fetches documents from MongoDB and then applies a transformation to each document, replacing some symbols in the data using regex. What's bothering me is that applying the transformation function to 7400 documents in a regular loop takes 6 seconds to finish, while using ThreadPoolExecutor.map finishes in 3.4 seconds. As far as I know, the GIL prevents the Python interpreter from running more than one thread simultaneously, so CPU-bound tasks should run slower in a ThreadPoolExecutor than in a regular loop. But that is not the case here. How so? I'm assuming that re operations somehow release the GIL, but I'm not sure. Here is the code:
# parts that are responsible for substituting symbols in mongo documents
# (imports reconstructed; `rec` is assumed to be an alias for re.compile)
import re
from collections.abc import Iterable, Mapping
from concurrent.futures import ThreadPoolExecutor

rec = re.compile
class Converter:
def __init__(self, reverse=False):
self.reverse = reverse
def key_convert(self, key, reverse, path):
return key
def value_convert(self, value, reverse, path):
return value
def recurse(self, data, path=tuple()):
if isinstance(data, Mapping):
_data = {}
for k, v in data.items():
next_path = path + (k,)
key = self.key_convert(k, self.reverse, path)
value = self.value_convert(v, self.reverse, next_path)
_data[key] = self.recurse(value, next_path)
return _data
elif isinstance(data, Iterable) and not isinstance(data, (str, bytes)):
return [
self.recurse(it, path + (idx,))
for idx, it in enumerate(data)]
return self.value_convert(data, self.reverse, path)
def convert(self, data):
self.reverse = False
return self.recurse(data)
def unconvert(self, data):
self.reverse = True
return self.recurse(data)
class MongoConverter(Converter):
_to_mongo = ((rec(r"\."), "\u00B7"), (rec(r"^\$"), "#"),)
_from_mongo = ((rec("\u00B7"), "."), (rec("^#"), "$"),)
def key_convert_to_mongo(self, key):
return pattern_substitute(key, self._to_mongo)
def key_convert_from_mongo(self, key):
return pattern_substitute(key, self._from_mongo)
def key_convert(self, key, reverse, _):
if reverse:
return self.key_convert_from_mongo(key)
return self.key_convert_to_mongo(key)
def value_convert(self, value, reverse, path):
if reverse:
return value
return as_datetime(value)
def pattern_substitute(value, pattern_substitutes):
for pattern, substitute in pattern_substitutes:
value = pattern.sub(substitute, value)
return value
# storage adapter
class MongoStorage:
def __init__(self, collection, converter=None):
self.collection = collection
self.converter = converter if converter else MongoConverter()
self._context = None
self.executor = ThreadPoolExecutor()
def after_find(self, data):
if data is not None:
return self.converter.unconvert(data)
def find(self, filtr=None, limit=-1, **kwargs):
filtr = filter_converter.convert(filtr)
if limit == 0:
return []
if limit == -1:
limit = 0
# this part is what I'm asking about. Regular loop here is slower
return self.executor.map(
self.after_find,
self.collection.find(filtr, limit=limit, **kwargs)
)
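A self-contained benchmark along the following lines (a sketch; the sample data and transform are invented for illustration) would separate the regex cost from the MongoDB cursor I/O:
import re
import time
from concurrent.futures import ThreadPoolExecutor

PATTERN = re.compile(r"[.$]")
DOCS = ["some.key$with symbols " * 200] * 7400

def transform(doc):
    # stand-in for Converter.recurse: pure CPU-bound regex work
    return PATTERN.sub("#", doc)

start = time.perf_counter()
serial = [transform(d) for d in DOCS]
print("loop:", time.perf_counter() - start)

start = time.perf_counter()
with ThreadPoolExecutor() as ex:
    threaded = list(ex.map(transform, DOCS))
print("threads:", time.perf_counter() - start)
If the thread pool still wins on this pure-CPU version, the time is being spent somewhere that releases the GIL; if not, the speedup in the endpoint most likely comes from overlapping the cursor's network I/O with the transformation work.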
Thank you for your answers.
I am getting the exception ValueError: too many values to unpack from this code:
from collections import Counter, ChainMap
from itertools import chain
import re
import pickle
class EmoLex(object):
def __init__(self, emolex_filepath=None):
if emolex_filepath:
with open(emolex_filepath) as emolex_file:
self.parser = self._load_and_parse(emolex_file)
def __len__(self):
return len(self.keys())
def keys(self):
return self._parser_keys()
def _parser_keys(self):
return self.parser.keys
def categorize_token(self, token):
return self.parser[token.lower()]
def annotate_doc(self, doc):
return [ self.categorize_token(word.lower()) for word in doc ]
def summarize_doc(self, doc):
annotation = self.annotate_doc(doc)
# return just the summarization
return self.summarize_annotation(annotation, doc)
def summarize_annotation(self, annotation, doc):
        wc = len([w for w in doc if re.match(r'\w+', w)])
ctr = Counter(list(self._flatten_list_of_sets(annotation)))
        # Convert counts to fractions of the total word count
summary = {k: float(v)/float(wc) for (k,v) in dict(ctr).items()}
# Set keys that did not occur to 0
not_counted = { k: 0.0 for k in
self._parser_keys() - set(summary.keys()) }
# Merge the two dictionaries
return dict(ChainMap(summary, not_counted))
def load(self, pickle_filepath):
with open(pickle_filepath, 'rb') as pickle_file:
self.parser = pickle.load(pickle_file)
def dump(self, pickle_filepath):
with open(pickle_filepath, 'wb') as pickle_file:
pickle.dump(self.parser, pickle_file)
    # l_of_s: List[Set[str]] -> generator of str
def _flatten_list_of_sets(self, l_of_s):
return chain.from_iterable([ list(categories)
for categories in l_of_s ])
def _load_and_parse(self, emolex_file):
return NrcDiscreteParser(emolex_file.read().splitlines())
#EmoLex(emolex_filepath="/Users/sakshigupta/Dropbox/Sakshi_July_2021/reports_Old_and_New/NRC-VAD-Lexicon.txt")
lexicon = EmoLex("/Users/sakshigupta/Dropbox/Sakshi_July_2021/reports_Old_and_New/NRC-VAD-Lexicon.txt")
Which function are you calling?
This exception is seen when the number of returned values is greater than the number of expected values.
Check the example given below
def dummy_function():
return 1,2,3
Calling the above function with only two variables to unpack into:
a,b = dummy_function()
I get this ->
ValueError: too many values to unpack (expected 2)
This means that on the caller's end I am expecting only two values, but the function is returning more than two.
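If the extra values are actually expected, one option (a small sketch) is star-unpacking, which collects the surplus values instead of raising the error:
def dummy_function():
    return 1, 2, 3

a, b, c = dummy_function()    # unpack all three values
a, *rest = dummy_function()   # a == 1, rest == [2, 3]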
I have a custom Pytorch dataset that returns a dictionary containing a class object "queries".
class QueryDataset(torch.utils.data.Dataset):
def __init__(self, queries, values, targets):
        super().__init__()
self.queries = queries
self.values = values
self.targets = targets
def __len__(self):
return self.values.shape[0]
def __getitem__(self, idx):
sample = DeviceDict({'query': self.queries[idx],
"values": self.values[idx],
"targets": self.targets[idx]})
return sample
The problem is that when I put the queries in a data loader I get default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'query.Query'>. Is there a way to have a class object in my data loader? It blows up at next(iterator) in the code below.
train_queries = QueryDataset(train_queries)
train_loader = torch.utils.data.DataLoader(train_queries,
                                           batch_size=10,
                                           shuffle=True,
                                           drop_last=False)
for i in range(epochs):
iterator = iter(train_loader)
for i in range(len(train_loader)):
batch = next(iterator)
out = model(batch)
loss = criterion(out["pred"], batch["targets"])
self.optimizer.zero_grad()
loss.sum().backward()
self.optimizer.step()
You need to define your own collate_fn in order to do this.
A sloppy approach, just to show you how stuff works here, would be something like this:
import torch
class DeviceDict:
def __init__(self, data):
self.data = data
def print_data(self):
print(self.data)
class QueryDataset(torch.utils.data.Dataset):
def __init__(self, queries, values, targets):
        super().__init__()
self.queries = queries
self.values = values
self.targets = targets
def __len__(self):
return 5
def __getitem__(self, idx):
sample = {'query': self.queries[idx],
"values": self.values[idx],
"targets": self.targets[idx]}
return sample
def custom_collate(batch):
    return DeviceDict(batch)
dt = QueryDataset("q","v","t")
dl = torch.utils.data.DataLoader(dt, batch_size=1, collate_fn=custom_collate)
t = next(iter(dl))
t.print_data()
Basically, collate_fn allows you to achieve custom batching or to add support for custom data types, as explained in the link I previously provided.
As you see it just shows the concept, you need to change it based on your own needs.
For those curious, this is the DeviceDict and custom collate function that I used to get things to work.
class DeviceDict(dict):
def __init__(self, *args):
super(DeviceDict, self).__init__(*args)
def to(self, device):
dd = DeviceDict()
for k, v in self.items():
if torch.is_tensor(v):
dd[k] = v.to(device)
else:
dd[k] = v
return dd
def collate_helper(elems, key):
if key == "query":
return elems
else:
return torch.utils.data.dataloader.default_collate(elems)
def custom_collate(batch):
elem = batch[0]
return DeviceDict({key: collate_helper([d[key] for d in batch], key) for key in elem})
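For reference, a hypothetical usage sketch (assuming train_queries is the QueryDataset from the question and a CUDA device is available):
train_loader = torch.utils.data.DataLoader(train_queries,
                                           batch_size=10,
                                           shuffle=True,
                                           drop_last=False,
                                           collate_fn=custom_collate)  # returns a DeviceDict
batch = next(iter(train_loader))
batch = batch.to("cuda")  # tensors move to the GPU; the Query objects pass through unchanged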
I want to compress my movies automatically, so I've written a mediainfo wrapper class in Python to generate XML output, which I then parse into a MovieInfo class with a list of audio and subtitle tracks.
__author__ = 'dominik'
class Error(Exception):
""" Error class
"""
class ValidationError(Error):
""" Invalid or missing xml items
"""
class MovieInfo(object):
""" Description of movie file
"""
def __init__(self, media_info):
self._video_track = None
self._audio_tracks = []
self._subtitle_tracks = []
self.valid_movie = True
for track in media_info.tracks:
if track.track_type == "Audio":
self._audio_tracks.append(AudioTrack(track))
elif track.track_type == "Text":
self._subtitle_tracks.append(SubtitleTrack(track))
elif track.track_type == "Video":
self._video_track = VideoTrack(track)
    @property
def audio_tracks(self):
if not hasattr(self, "_audio_tracks"):
self._audio_tracks = []
if len(self._audio_tracks) != 0:
return self._audio_tracks
    @property
def subtitle_tracks(self):
if not hasattr(self, "_subtitle_tracks"):
self._subtitle_tracks = []
if len(self._subtitle_tracks) != 0:
return self._subtitle_tracks
class Track(object):
""" Abstract track class for audio and subtitle tracks
"""
__KNOWN_LANGUAGE_CODES = {"en": "ENG", "de": "DE"}
def __init__(self, track, valid_codecs):
self._valid = True
track_id = int(track.id)
codec_id = self._determine_codec(track.codec_id, valid_codecs)
language = self._determine_language(track.language)
self._id = track_id
self._codec_id = codec_id
self._language = language
def _determine_codec(self, track_codec, types):
result = types.get(track_codec, None)
if result is None:
self._valid = False
return result
def _determine_language(self, track_language, types=__KNOWN_LANGUAGE_CODES):
result = types.get(track_language, None)
if result is None:
self._valid = False
return result
class AudioTrack(Track):
""" Audio track class
"""
__KNOWN_AUDIO_CODECS = {"A_DTS": "DTS", "A_AC3": "AC3"}
def __init__(self, track):
self._type = 1
Track.__init__(self, track, self.__KNOWN_AUDIO_CODECS)
class SubtitleTrack(Track):
""" Subtitle track class
"""
__KNOWN_SUBTITLE_CODECS = {"S_VOBSUB": "VOBSUB"}
def __init__(self, track):
self._type = 2
if track.forced == "Yes":
self._forced = True
else:
self._forced = False
Track.__init__(self, track, self.__KNOWN_SUBTITLE_CODECS)
class VideoTrack(object):
""" Video track class (only one video track in movie info!)
"""
def __init__(self, track):
self._type = 0
self._framerate = float(track.frame_rate)
self._width = track.width
self._height = track.height
Here is the mediainfo class (it's the pymediainfo class):
from subprocess import Popen
import os
from tempfile import mkstemp
from bs4 import BeautifulSoup, NavigableString
from setuptools.compat import unicode
class Track(object):
""" Hold the track information
"""
def __getattr__(self, item):
try:
return object.__getattribute__(self, item)
except:
pass
return None
def __init__(self, xml_track):
self.xml_track = xml_track
self.track_type = xml_track.attrs["type"]
for child in self.xml_track.children:
if not isinstance(child, NavigableString):
node_name = child.name.lower().strip()
node_value = unicode(child.string)
node_other_name = "other_%s" % node_name
if getattr(self, node_name) is None:
setattr(self, node_name, node_value)
else:
if getattr(self, node_other_name) is None:
setattr(self, node_other_name, [node_value, ])
else:
getattr(self, node_other_name).append(node_value)
for key in [c for c in self.__dict__.keys() if c.startswith("other_")]:
try:
primary = key.replace("other_", "")
setattr(self, primary, int(getattr(self, primary)))
except:
for value in getattr(self, key):
try:
actual = getattr(self, primary)
setattr(self, primary, int(value))
getattr(self, key).append(actual)
break
except:
pass
def __repr__(self):
return("<Track id='{0}', type='{1}'>".format(self.id, self.track_type))
def to_data(self):
data = {}
for k, v in self.__dict__.items():
if k != 'xml_track':
data[k] = v
return data
class Mediainfo(object):
""" MediaInfo wrapper
"""
def __init__(self, xml):
self.xml_dom = xml
if isinstance(xml, str):
self.xml_dom = BeautifulSoup(xml, "xml")
def _populate_tracks(self):
if self.xml_dom is None:
return
for xml_track in self.xml_dom.Mediainfo.File.find_all("track"):
self._tracks.append(Track(xml_track))
    @property
def tracks(self):
if not hasattr(self, "_tracks"):
self._tracks = []
if len(self._tracks) == 0:
self._populate_tracks()
return self._tracks
    @staticmethod
def parse(filename):
filehandler_out, filename_out = mkstemp(".xml", "mediainfo-")
filehandler_err, filename_err = mkstemp(".error", "mediainfo-")
filepointer_out = os.fdopen(filehandler_out, "r+b")
filepointer_err = os.fdopen(filehandler_err, "r+b")
mediainfo_command = ["mediainfo", "-f", "--Output=XML", filename]
p = Popen(mediainfo_command, stdout=filepointer_out, stderr=filepointer_err)
p.wait()
filepointer_out.seek(0)
xml_dom = BeautifulSoup(filepointer_out.read(), "xml")
filepointer_out.close()
filepointer_err.close()
print(xml_dom)
return Mediainfo(xml_dom)
def to_data(self):
data = {'tracks': []}
for track in self.tracks:
data['tracks'].append(track.to_data())
return data
This class gives me every track in the XML, and then I parse the relevant info into MovieInfo.
OK, now I have a list of audio tracks, e.g. three tracks: one in German and DTS, one in German and AC3, and one in English and AC3. Now I want to get the ids of the tracks in the format "1,2,3" to pass to the HandBrake CLI.
My problem is the order of the tracks. If there is a German DTS track, it should be the first track; the second track should be the same one, but compressed to AAC; and the third track should be an English track in AAC. If there is only a German AC3 track, then the first track should be that track compressed to AAC, and the second track should be English and AAC.
I don't know exactly how I can achieve that; can you help me? I'm new to Python and come from C, C++ and C#. In C# this would be very easy with a lambda.
Assuming you know how to define a comparator that, given two items, decides which is bigger, Python works much the same as C or C++.
Start here:
1. https://wiki.python.org/moin/HowTo/Sorting/
2. https://developers.google.com/edu/python/sorting
3. http://docs.python.org/2/library/functions.html#sorted
Use the sorted built-in and define the key you want, as in the sketch below.
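A sketch of the idea (the priority table is an assumption based on your desired order, and movie_info stands for a MovieInfo instance from your code): rank each track with a key function so German DTS sorts first, then German AC3, then English AC3, and join the sorted ids for the HandBrake CLI.
def track_rank(track):
    # lower rank sorts first; unknown combinations go to the end
    order = {("DE", "DTS"): 0, ("DE", "AC3"): 1, ("ENG", "AC3"): 2}
    return order.get((track._language, track._codec_id), 99)

audio = sorted(movie_info.audio_tracks, key=track_rank)
track_ids = ",".join(str(t._id) for t in audio)  # e.g. "1,2,3"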
EDIT (complete rephrase of the problem, as the original version (see "original version" below) is misleading):
Here is the setting: I have an object which has a list of objects of type
<class 'One'>. I would like to access this list but rather work with objects
of type <class 'Two'>, which is an enriched version of <class 'One'>.
Background (1):
One could be an object that can be stored easily via an ORM. The ORM would handle the list depending on the data model.
Two would be an object like One, but enriched by many features or by the way it can be accessed.
Background (2):
I am trying to solve a SQLAlchemy-related question that I asked here. So the answer to the present question could also be a solution to that question, changing the return/input type of SQLAlchemy lists.
Here is some code for illustration:
import numpy as np
class One(object):
"""
    Data Transfer Object (DTO)
"""
def __init__(self, name, data):
assert type(name) == str
assert type(data) == str
self.name = name
self.data = data
def __repr__(self):
return "%s(%r, %r)" %(self.__class__.__name__, self.name, self.data)
class Two(np.ndarray):
_DTO = One
def __new__(cls, name, data):
dto = cls._DTO(name, data)
return cls.newByDTO(dto)
    @classmethod
def newByDTO(cls, dto):
obj = np.fromstring(dto.data, dtype="float", sep=',').view(cls)
obj.setflags(write=False) # Immutable
obj._dto = dto
return obj
    @property
def name(self):
return self._dto.name
class DataUI(object):
def __init__(self, list_of_ones):
for one in list_of_ones:
assert type(one) == One
self.list_of_ones = list_of_ones
if __name__ == '__main__':
o1 = One('first object', "1, 3.0, 7, 8,1")
o2 = One('second object', "3.7, 8, 10")
my_data = DataUI ([o1, o2])
How to implement a list_of_twos which operates on list_of_ones but provides the user a list with elements of type Two:
type (my_data.list_of_twos[1]) == Two
>>> True
my_data.list_of_twos.append(Two("test", "1, 7, 4.5"))
print my_data.list_of_ones[-1]
>>> One('test', '1, 7, 4.5')
Original version of the question:
Here is an illustration of the problem:
class Data(object):
def __init__(self, name, data_list):
self.name = name
self.data_list = data_list
if __name__ == '__main__':
my_data = Data ("first data set", [0, 1, 1.4, 5])
I would like to access my_data.data_list via another list (e.g. my_data.data_np_list) that handles list-elements as a different type (e.g. as numpy.ndarray):
>>> my_data.data_np_list[1]
array(1)
>>> my_data.data_np_list.append(np.array(7))
>>> print my_data.data_list
[0, 1, 1.4, 5, 7]
You should use a property:
import numpy

class Data(object):
def __init__(self, name, data_list):
self.name = name
self.data_list = data_list
    @property
def data_np_list(self):
return numpy.array(self.data_list)
if __name__ == '__main__':
my_data = Data ("first data set", [0, 1, 1.4, 5])
print my_data.data_np_list
edit: numpy uses a contiguous memory area; a Python list is an array of pointers to separately allocated objects. You can't have both at the same time without paying a performance cost that would make the whole thing useless. They are different data structures.
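To make the trade-off concrete, here is a quick sketch (reusing the Data class from this answer): the property builds a fresh numpy array on every access, so writes to it never reach the underlying list.
d = Data("first data set", [0, 1, 1.4, 5])
arr = d.data_np_list     # a brand-new numpy array built from the list
arr[0] = 99              # mutates only the copy
print(d.data_list[0])    # prints 0: the original list is untouched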
No, you can't do it easily (or at all without losing any performance gain you might get from using numpy.array). You want two fundamentally different structures mirroring one another; this means storing both and transferring any modifications between the two. Subclassing both list and numpy.array to observe modifications would be the only way to do that.
I am not sure whether your approach is correct.
A property getter would help achieve what you're doing. Here's something similar using arrays instead of numpy.
I've made the array (or, in your case, the numpy data type) the internal representation, with the conversion to a list only done on demand and a temporary object returned.
import unittest
import array
class GotAGetter(object):
"""Gets something.
"""
def __init__(self, name, data_list):
super(GotAGetter, self).__init__()
self.name = name
self.data_array = array.array('i', data_list)
    @property
def data_list(self):
return list(self.data_array)
class TestProperties(unittest.TestCase):
def testProperties(self):
data = [1,3,5]
test = GotAGetter('fred', data)
aString = str(test.data_array)
lString = str(test.data_list) #Here you go.
try:
test.data_list = 'oops'
self.fail('Should have had an attribute error by now')
except AttributeError as exAttr:
self.assertEqual(exAttr.message, "can't set attribute")
self.assertEqual(aString, "array('i', [1, 3, 5])",
"The array doesn't look right")
self.assertEqual(lString, '[1, 3, 5]',
"The list property doesn't look right")
if __name__ == "__main__":
unittest.main()
One solution I just came up with is to implement a view of the list via a class ListView, which takes the following arguments:
raw_list: a list of One-objects
raw2new: a function that converts One-objects to Two-objects
new2raw: a function that converts Two-objects to One-objects
Here is the code:
class ListView(list):
def __init__(self, raw_list, raw2new, new2raw):
self._data = raw_list
self.converters = {'raw2new': raw2new,
'new2raw': new2raw}
def __repr__(self):
repr_list = [self.converters['raw2new'](item) for item in self._data]
repr_str = "["
for element in repr_list:
repr_str += element.__repr__() + ",\n "
repr_str = repr_str[:-3] + "]"
return repr_str
def append(self, item):
self._data.append(self.converters['new2raw'](item))
    def pop(self, index):
        return self.converters['raw2new'](self._data.pop(index))
def __getitem__(self, index):
return self.converters['raw2new'](self._data[index])
def __setitem__(self, key, value):
self._data.__setitem__(key, self.converters['new2raw'](value))
def __delitem__(self, key):
return self._data.__delitem__(key)
def __getslice__(self, i, j):
return ListView(self._data.__getslice__(i,j), **self.converters)
def __contains__(self, item):
return self._data.__contains__(self.converters['new2raw'](item))
def __add__(self, other_list_view):
assert self.converters == other_list_view.converters
return ListView(
self._data + other_list_view._data,
**self.converters
)
def __len__(self):
return len(self._data)
def __eq__(self, other):
return self._data == other._data
def __iter__(self):
return iter([self.converters['raw2new'](item) for item in self._data])
Now, DataUI has to look something like this:
class DataUI(object):
def __init__(self, list_of_ones):
for one in list_of_ones:
assert type(one) == One
self.list_of_ones = list_of_ones
self.list_of_twos = ListView(
self.list_of_ones,
Two.newByDTO,
Two.getDTO
)
With that, Two needs the following method:
def getDTO(self):
return self._dto
The entire example would now look like the following:
import unittest
import numpy as np
class ListView(list):
def __init__(self, raw_list, raw2new, new2raw):
self._data = raw_list
self.converters = {'raw2new': raw2new,
'new2raw': new2raw}
def __repr__(self):
repr_list = [self.converters['raw2new'](item) for item in self._data]
repr_str = "["
for element in repr_list:
repr_str += element.__repr__() + ",\n "
repr_str = repr_str[:-3] + "]"
return repr_str
def append(self, item):
self._data.append(self.converters['new2raw'](item))
    def pop(self, index):
        return self.converters['raw2new'](self._data.pop(index))
def __getitem__(self, index):
return self.converters['raw2new'](self._data[index])
def __setitem__(self, key, value):
self._data.__setitem__(key, self.converters['new2raw'](value))
def __delitem__(self, key):
return self._data.__delitem__(key)
def __getslice__(self, i, j):
return ListView(self._data.__getslice__(i,j), **self.converters)
def __contains__(self, item):
return self._data.__contains__(self.converters['new2raw'](item))
def __add__(self, other_list_view):
assert self.converters == other_list_view.converters
return ListView(
self._data + other_list_view._data,
**self.converters
)
def __len__(self):
return len(self._data)
def __iter__(self):
return iter([self.converters['raw2new'](item) for item in self._data])
def __eq__(self, other):
return self._data == other._data
class One(object):
"""
    Data Transfer Object (DTO)
"""
def __init__(self, name, data):
assert type(name) == str
assert type(data) == str
self.name = name
self.data = data
def __repr__(self):
return "%s(%r, %r)" %(self.__class__.__name__, self.name, self.data)
class Two(np.ndarray):
_DTO = One
def __new__(cls, name, data):
dto = cls._DTO(name, data)
return cls.newByDTO(dto)
    @classmethod
def newByDTO(cls, dto):
obj = np.fromstring(dto.data, dtype="float", sep=',').view(cls)
obj.setflags(write=False) # Immutable
obj._dto = dto
return obj
    @property
def name(self):
return self._dto.name
def getDTO(self):
return self._dto
class DataUI(object):
def __init__(self, list_of_ones):
for one in list_of_ones:
assert type(one) == One
self.list_of_ones = list_of_ones
self.list_of_twos = ListView(
self.list_of_ones,
Two.newByDTO,
Two.getDTO
)
class TestListView(unittest.TestCase):
def testProperties(self):
o1 = One('first object', "1, 3.0, 7, 8,1")
o2 = One('second object', "3.7, 8, 10")
my_data = DataUI ([o1, o2])
t1 = Two('third object', "4.8, 8.2, 10.3")
        t2 = Two('fourth object', "33, 1.8, 1.0")
# append:
my_data.list_of_twos.append(t1)
# __getitem__:
np.testing.assert_array_equal(my_data.list_of_twos[2], t1)
# __add__:
np.testing.assert_array_equal(
(my_data.list_of_twos + my_data.list_of_twos)[5], t1)
# __getslice__:
np.testing.assert_array_equal(
my_data.list_of_twos[1:],
my_data.list_of_twos[1:2] + my_data.list_of_twos[2:]
)
# __contains__:
self.assertEqual(my_data.list_of_twos.__contains__(t1), True)
# __setitem__:
my_data.list_of_twos.__setitem__(1, t1),
np.testing.assert_array_equal(my_data.list_of_twos[1], t1)
# __delitem__:
l1 = len(my_data.list_of_twos)
my_data.list_of_twos.__delitem__(1)
l2 = len(my_data.list_of_twos)
self.assertEqual(l1 - 1, l2)
# __iter__:
my_data_2 = DataUI ([])
for two in my_data.list_of_twos:
my_data_2.list_of_twos.append(two)
if __name__ == '__main__':
unittest.main()