Adding class objects to Pytorch Dataloader: batch must contain tensors - python

I have a custom PyTorch dataset that returns a dictionary containing a class object, "queries".
class QueryDataset(torch.utils.data.Dataset):
    def __init__(self, queries, values, targets):
        super(QueryDataset).__init__()
        self.queries = queries
        self.values = values
        self.targets = targets

    def __len__(self):
        return self.values.shape[0]

    def __getitem__(self, idx):
        sample = DeviceDict({'query': self.queries[idx],
                             "values": self.values[idx],
                             "targets": self.targets[idx]})
        return sample
The problem is that when I put the queries in a DataLoader, I get: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'query.Query'>. Is there a way to have a class object in my DataLoader? It blows up at next(iterator) in the code below.
train_queries = QueryDataset(train_queries)
train_loader = torch.utils.data.DataLoader(train_queries,
                                           batch_size=10,
                                           shuffle=True,
                                           drop_last=False)
for i in range(epochs):
    iterator = iter(train_loader)
    for i in range(len(train_loader)):
        batch = next(iterator)
        out = model(batch)
        loss = criterion(out["pred"], batch["targets"])
        self.optimizer.zero_grad()
        loss.sum().backward()
        self.optimizer.step()

You need to define your own collate_fn in order to do this.
A sloppy approach, just to show how things work here, would be something like this:
import torch

class DeviceDict:
    def __init__(self, data):
        self.data = data

    def print_data(self):
        print(self.data)

class QueryDataset(torch.utils.data.Dataset):
    def __init__(self, queries, values, targets):
        super(QueryDataset).__init__()
        self.queries = queries
        self.values = values
        self.targets = targets

    def __len__(self):
        return 5

    def __getitem__(self, idx):
        sample = {'query': self.queries[idx],
                  "values": self.values[idx],
                  "targets": self.targets[idx]}
        return sample

def custom_collate(batch):
    # Wrap the whole batch instead of letting default_collate handle it.
    return DeviceDict(batch)

dt = QueryDataset("q", "v", "t")
dl = torch.utils.data.DataLoader(dt, batch_size=1, collate_fn=custom_collate)
t = next(iter(dl))
t.print_data()
Basically, collate_fn lets you achieve custom batching or add support for custom data types, as explained in the PyTorch documentation for DataLoader.
As you can see, this just shows the concept; you need to adapt it to your own needs.

For those curious, this is the DeviceDict and custom collate function that I used to get things to work.
class DeviceDict(dict):
    def __init__(self, *args):
        super(DeviceDict, self).__init__(*args)

    def to(self, device):
        dd = DeviceDict()
        for k, v in self.items():
            if torch.is_tensor(v):
                dd[k] = v.to(device)
            else:
                dd[k] = v
        return dd

def collate_helper(elems, key):
    if key == "query":
        # Leave the Query objects as a plain list instead of collating them.
        return elems
    else:
        return torch.utils.data.dataloader.default_collate(elems)

def custom_collate(batch):
    elem = batch[0]
    return DeviceDict({key: collate_helper([d[key] for d in batch], key) for key in elem})
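For reference, here is a minimal usage sketch of the above. It is hedged: model, device and the earlier train_queries dataset are assumptions carried over from the question, not part of the original answer.
# Hypothetical usage of the custom collate function; 'model' and 'train_queries'
# are assumed to exist as in the question.
train_loader = torch.utils.data.DataLoader(train_queries,
                                           batch_size=10,
                                           shuffle=True,
                                           drop_last=False,
                                           collate_fn=custom_collate)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for batch in train_loader:
    batch = batch.to(device)  # tensors move to the device, Query objects pass through unchanged
    out = model(batch)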

Related

Python ValueError: too many values to unpack (expected 3)

I am getting that exception from this code:
from collections import Counter, ChainMap
from itertools import chain
import re
import pickle

class EmoLex(object):
    def __init__(self, emolex_filepath=None):
        if emolex_filepath:
            with open(emolex_filepath) as emolex_file:
                self.parser = self._load_and_parse(emolex_file)

    def __len__(self):
        return len(self.keys())

    def keys(self):
        return self._parser_keys()

    def _parser_keys(self):
        return self.parser.keys

    def categorize_token(self, token):
        return self.parser[token.lower()]

    def annotate_doc(self, doc):
        return [self.categorize_token(word.lower()) for word in doc]

    def summarize_doc(self, doc):
        annotation = self.annotate_doc(doc)
        # return just the summarization
        return self.summarize_annotation(annotation, doc)

    def summarize_annotation(self, annotation, doc):
        wc = len([w for w in doc if re.match(r'\w+', w)])
        ctr = Counter(list(self._flatten_list_of_sets(annotation)))
        # Convert to percentiles
        summary = {k: float(v)/float(wc) for (k, v) in dict(ctr).items()}
        # Set keys that did not occur to 0
        not_counted = {k: 0.0 for k in
                       self._parser_keys() - set(summary.keys())}
        # Merge the two dictionaries
        return dict(ChainMap(summary, not_counted))

    def load(self, pickle_filepath):
        with open(pickle_filepath, 'rb') as pickle_file:
            self.parser = pickle.load(pickle_file)

    def dump(self, pickle_filepath):
        with open(pickle_filepath, 'wb') as pickle_file:
            pickle.dump(self.parser, pickle_file)

    # l_of_s: List[Set[str]] -> generator List[str]
    def _flatten_list_of_sets(self, l_of_s):
        return chain.from_iterable([list(categories)
                                    for categories in l_of_s])

    def _load_and_parse(self, emolex_file):
        return NrcDiscreteParser(emolex_file.read().splitlines())

# EmoLex(emolex_filepath="/Users/sakshigupta/Dropbox/Sakshi_July_2021/reports_Old_and_New/NRC-VAD-Lexicon.txt")
lexicon = EmoLex("/Users/sakshigupta/Dropbox/Sakshi_July_2021/reports_Old_and_New/NRC-VAD-Lexicon.txt")
Which function are you calling?
This exception is raised when the number of returned values is greater than the number of variables you unpack into.
Check the example below:
def dummy_function():
    return 1, 2, 3
On calling the above function with only two variables to unpack into:
a, b = dummy_function()
I get:
ValueError: too many values to unpack (expected 2)
This means that on the caller's end I am expecting only two values, but the function is returning more than two.
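For illustration only (not part of the original answer), the fix is either to unpack into as many variables as the function returns or to use starred unpacking to collect the extras:
# Either unpack all returned values...
a, b, c = dummy_function()
print(a, b, c)        # 1 2 3

# ...or collect the remaining values in a list with starred unpacking.
first, *rest = dummy_function()
print(first, rest)    # 1 [2, 3]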

set variable network layers based on parameters in pytorch

I want to turn the following network definition into a parametric one. The number of continuous and discrete columns varies for different data. I first pass the whole input, which in this case is 110-dimensional, through a linear layer with a ReLU activation. The output size for each categorical field of my data varies based on a previous one-hot encoding transformation, so I need to define an nn.Linear(110, number of encodings) for each of them.
class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.lin1 = nn.Linear(110, 110)
        self.lin_numerical = nn.Linear(110, 6)
        self.lin_cat_job = nn.Linear(110, 9)
        self.lin_cat_sex = nn.Linear(110, 2)
        self.lin_cat_incomeclass = nn.Linear(110, 7)

    def forward(self, x):
        x = torch.relu(self.lin1(x))
        x_numerical = f.leaky_relu(self.lin_numerical(x))
        x_cat1 = f.gumbel_softmax(self.lin_cat_job(x), tau=0.2)
        x_cat2 = f.gumbel_softmax(self.lin_cat_sex(x), tau=0.2)
        x_cat3 = f.gumbel_softmax(self.lin_cat_incomeclass(x), tau=0.2)
        x_final = torch.cat((x_numerical, x_cat1, x_cat2, x_cat3), 1)
        return x_final
I have managed to change the __init__ part, using a discrete_columns input, which is an OrderedDict whose keys are the names of the categorical fields of my data and whose values are the number of one-hot encodings for each field, and continuous_columns, which is just a list of the names of the continuous columns. But I have no idea how to edit the forward part:
class Generator(nn.Module):
    def __init__(self, input_dim, continuous_columns, discrete_columns):
        super(Generator, self).__init__()
        self._input_dim = input_dim
        self._discrete_columns = discrete_columns
        self._num_continuous_columns = len(continuous_columns)
        self.lin1 = nn.Linear(self._input_dim, self._input_dim)
        self.lin_numerical = nn.Linear(self._input_dim, self._num_continuous_columns)
        for key, value in self._discrete_columns.items():
            setattr(self, "lin_cat_{}".format(key), nn.Linear(self._input_dim, value))

    def forward(self, x):
        x = torch.relu(self.lin1(x))
        x_numerical = f.leaky_relu(self.lin_numerical(x))
        ####
        # This is the problematic part
        ####
        return x
You don't need to use setattr, and honestly you shouldn't, since you would also need getattr; it brings more trouble than it solves when there are other ways to do the job.
This is what I'd do for this task:
# inside __init__:
self.lin_cat = nn.ModuleDict()
for key, value in self._discrete_columns.items():
    self.lin_cat[key] = nn.Linear(self._input_dim, value)
    # setattr(self, "lin_cat_{}".format(key), nn.Linear(self._input_dim, value))

def forward(self, x):
    x = torch.relu(self.lin1(x))
    x_numerical = f.leaky_relu(self.lin_numerical(x))
    x_cat = []
    for key in self.lin_cat:
        x_cat.append(f.gumbel_softmax(self.lin_cat[key](x), tau=0.2))
    x_final = torch.cat((x_numerical, *x_cat), 1)
    return x_final
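A short usage sketch of the ModuleDict version, assuming the usual imports from the question (torch, torch.nn as nn, torch.nn.functional as f) and that the rest of __init__ stays as shown there; the column names and sizes below are made up for illustration:
from collections import OrderedDict

# Hypothetical column specification matching the dimensions in the question.
discrete_columns = OrderedDict([("job", 9), ("sex", 2), ("incomeclass", 7)])
continuous_columns = ["c{}".format(i) for i in range(6)]

gen = Generator(input_dim=110,
                continuous_columns=continuous_columns,
                discrete_columns=discrete_columns)
out = gen(torch.randn(32, 110))
print(out.shape)  # torch.Size([32, 24]) -> 6 continuous + 9 + 2 + 7 one-hot outputs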

Dataset.from_generator: TypeError: `generator` must be callable

I am currently using a generator to produce my training and validation datasets using tf.data.Dataset.from_generator. I have a class method that takes care of this for me:
def build_dataset(self, batch_size=16, shuffle=16, validation=None):
    train_dataset = tf.data.Dataset.from_generator(import_images(validation=validation),
                                                   (tf.float32, tf.float32))
    self.train_dataset = train_dataset.shuffle(shuffle).repeat(-1).batch(batch_size).prefetch(1)
    if validation is not None:
        val_dataset = tf.data.Dataset.from_generator(import_images(validation=validation),
                                                     (tf.float32, tf.float32))
        self.val_dataset = val_dataset.repeat(1).batch(batch_size).prefetch(1)
The problem is that passing (validation=validation) to my import_images generator creates a generator object, which TensorFlow doesn't want, and it gives me the error:
TypeError: `generator` must be callable.
Because I have to pass in validation to tell my generator whether to produce the training or the validation version, I am required to create two versions of the same generator. It also doesn't let me pass in other arguments, for example to control the percentage of training and validation examples, meaning the generator has to be static. Any suggestions?
I recently encountered a similar problem, but I'm a beginner, so I'm not sure if this will help.
Try adding a __call__ method to your class.
Below is the original class, which raises TypeError: `generator` must be callable.
class DataGen:
    def __init__(self, files, data_path):
        self.i = 0
        self.files = files
        self.data_path = data_path

    def __load__(self, files_name):
        data_path = os.path.join(self.data_path, files_name)
        arr_img, arr_mask = load_patch(data_path)
        return arr_img, arr_mask

    def getitem(self, index):
        _img, _mask = self.__load__(self.files[index])
        return _img, _mask

    def __iter__(self):
        return self

    def __next__(self):
        if self.i < len(self.files):
            img_arr, mask_arr = self.getitem(self.i)
            self.i += 1
        else:
            raise StopIteration()
        return img_arr, mask_arr
Then I revised the code as below and it worked for me.
class DataGen:
    def __init__(self, files, data_path):
        self.i = 0
        self.files = files
        self.data_path = data_path

    def __load__(self, files_name):
        data_path = os.path.join(self.data_path, files_name)
        arr_img, arr_mask = load_patch(data_path)
        return arr_img, arr_mask

    def getitem(self, index):
        _img, _mask = self.__load__(self.files[index])
        return _img, _mask

    def __iter__(self):
        return self

    def __next__(self):
        if self.i < len(self.files):
            img_arr, mask_arr = self.getitem(self.i)
            self.i += 1
        else:
            raise StopIteration()
        return img_arr, mask_arr

    def __call__(self):
        self.i = 0
        return self
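For illustration, this is roughly how the revised class could be handed to tf.data.Dataset.from_generator; the file list, data path and output types below are placeholders, not from the original post:
# Hypothetical usage: the instance itself is callable, so TensorFlow accepts it.
gen = DataGen(files=["patch_001.npy", "patch_002.npy"], data_path="/path/to/patches")
dataset = tf.data.Dataset.from_generator(gen, (tf.float32, tf.float32))
dataset = dataset.shuffle(16).batch(8).prefetch(1)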

Class method return iterator

I implemented an iterator class as follows:
import numpy as np
import time

class Data:
    def __init__(self, filepath):
        # Computationally expensive
        print("Computationally expensive")
        time.sleep(10)
        print("Done!")

    def __iter__(self):
        return self

    def __next__(self):
        return np.zeros((2, 2)), np.zeros((2, 2))

count = 0
for batch_x, batch_y in Data("hello.csv"):
    print(batch_x, batch_y)
    count = count + 1
    if count > 5:
        break

count = 0
for batch_x, batch_y in Data("hello.csv"):
    print(batch_x, batch_y)
    count = count + 1
    if count > 5:
        break
However, the constructor is computationally expensive, and the for loop might be run multiple times. For example, in the above code the constructor is called twice (each for loop creates a new Data object).
How do I separate the constructor from the iterator? I am hoping for something like the following, where the constructor is called only once:
data = Data(filepath)
for batch_x, batch_y in data.get_iterator():
    print(batch_x, batch_y)
for batch_x, batch_y in data.get_iterator():
    print(batch_x, batch_y)
You can just iterate over an iterable object directly; for..in doesn't require anything else:
data = Data(filepath)
for batch_x, batch_y in data:
    print(batch_x, batch_y)
for batch_x, batch_y in data:
    print(batch_x, batch_y)
That said, depending on how you implement __iter__(), this could be buggy.
E.g.:
Bad
class Data:
    def __init__(self, filepath):
        self._items = load_items(filepath)
        self._i = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self._i >= len(self._items):  # Or however you check if data is available
            raise StopIteration
        result = self._items[self._i]
        self._i += 1
        return result
Because then you couldn't iterate over the same object twice; self._i would still point past the end of the data after the first loop.
Good-ish
class Data:
    def __init__(self, filepath):
        self._items = load_items(filepath)

    def __iter__(self):
        self._i = 0
        return self

    def __next__(self):
        if self._i >= len(self._items):
            raise StopIteration
        result = self._items[self._i]
        self._i += 1
        return result
This resets the index every time you're about to iterate, fixing the above. However, it won't work if you nest iteration over the same object.
Better
To fix that, keep the iteration state in a separate iterator object:
class Data:
    class Iter:
        def __init__(self, data):
            self._data = data
            self._i = 0

        def __next__(self):
            if self._i >= len(self._data._items):  # check for available data
                raise StopIteration
            result = self._data._items[self._i]
            self._i = self._i + 1
            return result

    def __init__(self, filepath):
        self._items = load_items(filepath)

    def __iter__(self):
        return self.Iter(self)
This is the most flexible approach, but it's unnecessarily verbose if you can use either of the below ones.
Simple, using yield
If you use Python's generators, the language will take care of keeping track of iteration state for you, and it should do so correctly even when nesting loops:
class Data:
    def __init__(self, filepath):
        self._items = load_items(filepath)

    def __iter__(self):
        for it in self._items:  # Or whatever is appropriate
            yield it
Simple, pass-through to underlying iterable
If the "computationally expensive" part is loading all the data into memory, you can just use the cached data directly.
class Data:
    def __init__(self, filepath):
        self._items = load_items(filepath)

    def __iter__(self):
        return iter(self._items)
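To make the nesting claim concrete, here is a small self-contained sketch of the generator-based variant; load_items is stubbed out with a fake loader purely for illustration:
def load_items(filepath):
    # Stand-in for the real, expensive loading step.
    return [1, 2, 3]

class Data:
    def __init__(self, filepath):
        self._items = load_items(filepath)

    def __iter__(self):
        for it in self._items:
            yield it

data = Data("hello.csv")
for x in data:       # each loop gets a fresh generator from __iter__
    for y in data:   # nesting works because iteration state lives in each generator
        print(x, y)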
Instead of creating a new instance of Data each time, create a second class, IterData, whose __init__ runs a process that is not as computationally expensive as instantiating Data. Then create a classmethod in Data as an alternative constructor that returns an IterData:
class IterData:
    def __init__(self, filepath):
        # only pass the necessary data
        pass

    def __iter__(self):
        # implement iter here
        pass

class Data:
    def __init__(self, filepath):
        # Computationally expensive
        pass

    @classmethod
    def new_iter(cls, filepath):
        return IterData(filepath)

results = Data.new_iter('path')
for batch_x, batch_y in results:
    pass

How to efficiently rebuild pandas hdfstore table when append fails

I am using the HDFStore in pandas to store data frames from an ongoing iterative process. At each iteration, I append to a table in the HDFStore. Here is a toy example:
import pandas as pd
from pandas import HDFStore
import numpy as np
from random import choice
from string import ascii_letters

alphanum = np.array(list(ascii_letters) + list(range(0, 9)))

def hdfstore_append(storefile, key, df, format="t", columns=None, data_columns=None):
    if df is None:
        return
    if key[0] != '/':
        key = '/' + key
    with HDFStore(storefile) as store:
        if key not in store.keys():
            store.put(key, df, format=format, columns=columns, data_columns=data_columns)
        else:
            try:
                store.append(key, df)
            except Exception as inst:
                df = pd.concat([store.get(key), df])
                store.put(key, df, format=format, columns=columns,
                          data_columns=data_columns)

storefile = "db.h5"
for i in range(0, 100):
    df = pd.DataFrame([dict(n=np.random.randn(),
                            s=''.join(alphanum[np.random.randint(1, len(alphanum),
                                                                 np.random.randint(1, 2*(i+1)))]))],
                      index=[i])
    hdfstore_append(storefile, '/SO/df', df, columns=df.columns, data_columns=True)
The hdfstore_append function guards against the various exceptions hdfstore.append throws, and rebuilds the table when necessary. The issue with this approach is that it gets very slow when the table in the store becomes very large.
Is there a more efficient way to do this?
Below is an example of an efficient method for building large pandas HDFStores. The key is to cache the frame numbers once the table becomes large. Also, instead of appending, removing any pre-existing data for a frame before writing it essentially turns each write into a put.
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import six
import logging
import os
from abc import ABCMeta, abstractmethod, abstractproperty
import warnings
import pandas as pd

logger = logging.getLogger(__name__)
class FramewiseData(object):
    "Abstract base class defining a data container with framewise access."

    __metaclass__ = ABCMeta

    @abstractmethod
    def put(self, df):
        pass

    @abstractmethod
    def get(self, frame_no):
        pass

    @abstractproperty
    def frames(self):
        pass

    @abstractmethod
    def close(self):
        pass

    @abstractproperty
    def t_column(self):
        pass

    def __getitem__(self, frame_no):
        return self.get(frame_no)

    def __len__(self):
        return len(self.frames)

    def dump(self, N=None):
        """Return data from all, or the first N, frames in a single DataFrame

        Parameters
        ----------
        N : integer
            optional; if None, return all frames

        Returns
        -------
        DataFrame
        """
        if N is None:
            return pd.concat(iter(self))
        else:
            i = iter(self)
            return pd.concat((next(i) for _ in range(N)))

    @property
    def max_frame(self):
        return max(self.frames)

    def _validate(self, df):
        if self.t_column not in df.columns:
            raise ValueError("Cannot write frame without a column "
                             "called {0}".format(self.t_column))
        if df[self.t_column].nunique() != 1:
            raise ValueError("Found multiple values for 'frame'. "
                             "Write one frame at a time.")

    def __iter__(self):
        return self._build_generator()

    def _build_generator(self):
        for frame_no in self.frames:
            yield self.get(frame_no)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()
KEY_PREFIX = 'Frame_'
len_key_prefix = len(KEY_PREFIX)

def code_key(frame_no):
    "Turn the frame_no into a 'natural name' string idiomatic of HDFStore"
    key = '{0}{1}'.format(KEY_PREFIX, frame_no)
    return key

def decode_key(key):
    frame_no = int(key[len_key_prefix:])
    return frame_no
class PandasHDFStore(FramewiseData):
    """An interface to an HDF5 file with framewise access, using pandas.

    Save each frame's data to a node in a pandas HDFStore.

    Any additional keyword arguments to the constructor are passed to
    pandas.HDFStore().
    """

    def __init__(self, filename, mode='a', t_column='frame', **kwargs):
        self.filename = os.path.abspath(filename)
        self._t_column = t_column
        self.store = pd.HDFStore(self.filename, mode, **kwargs)

    @property
    def t_column(self):
        return self._t_column

    @property
    def max_frame(self):
        return max(self.frames)

    def put(self, df):
        if len(df) == 0:
            warnings.warn('An empty DataFrame was passed to put(). Continuing.')
            return
        frame_no = df[self.t_column].values[0]  # validated to be all the same
        key = code_key(frame_no)
        # Store data as tabular instead of fixed-format.
        # Make sure to remove any preexisting data, so we don't really 'append'.
        try:
            self.store.remove(key)
        except KeyError:
            pass
        self.store.put(key, df, format='table')

    def get(self, frame_no):
        key = code_key(frame_no)
        frame = self.store.get(key)
        return frame

    @property
    def frames(self):
        """Returns sorted list of integer frame numbers in file"""
        return self._get_frame_nos()

    def _get_frame_nos(self):
        """Returns sorted list of integer frame numbers in file"""
        # Pandas' store.keys() scans the entire file looking for stored Pandas
        # structures. This is very slow for large numbers of frames.
        # Instead, scan the root level of the file for nodes with names
        # matching our scheme; we know they are DataFrames.
        r = [decode_key(key) for key in self.store.root._v_children.keys() if
             key.startswith(KEY_PREFIX)]
        r.sort()
        return r

    def close(self):
        self.store.close()
class PandasHDFStoreBig(PandasHDFStore):
    """Like PandasHDFStore, but keeps a cache of frame numbers.

    This can give a large performance boost when a file contains thousands
    of frames.

    If a file was made in PandasHDFStore, opening it with this class
    and then closing it will add a cache (if mode != 'r').

    Any additional keyword arguments to the constructor are passed to
    pandas.HDFStore().
    """

    def __init__(self, filename, mode='a', t_column='frame', **kwargs):
        self._CACHE_NAME = '_Frames_Cache'
        self._frames_cache = None
        self._cache_dirty = False  # Whether _frames_cache needs to be written out
        super(PandasHDFStoreBig, self).__init__(filename, mode, t_column,
                                                **kwargs)

    @property
    def frames(self):
        # Hit memory cache, then disk cache
        if self._frames_cache is not None:
            return self._frames_cache
        else:
            try:
                self._frames_cache = list(self.store[self._CACHE_NAME].index.values)
                self._cache_dirty = False
            except KeyError:
                self._frames_cache = self._get_frame_nos()
                self._cache_dirty = True  # In memory, but not in file
            return self._frames_cache

    def put(self, df):
        self._invalidate_cache()
        super(PandasHDFStoreBig, self).put(df)

    def rebuild_cache(self):
        """Delete cache on disk and rebuild it."""
        self._invalidate_cache()
        _ = self.frames  # Compute cache
        self._flush_cache()

    def _invalidate_cache(self):
        self._frames_cache = None
        try:
            del self.store[self._CACHE_NAME]
        except KeyError:
            pass

    def _flush_cache(self):
        """Writes frame cache if dirty and file is writable."""
        if (self._frames_cache is not None and self._cache_dirty
                and self.store.root._v_file._iswritable()):
            self.store[self._CACHE_NAME] = pd.DataFrame({'dummy': 1},
                                                        index=self._frames_cache)
            self._cache_dirty = False

    def close(self):
        """Updates cache, writes if necessary, then closes file."""
        if self.store.root._v_file._iswritable():
            _ = self.frames  # Compute cache
            self._flush_cache()
        super(PandasHDFStoreBig, self).close()
class PandasHDFStoreSingleNode(FramewiseData):
    """An interface to an HDF5 file with framewise access,
    using pandas, that is faster for cross-frame queries.

    This implementation is more complex than PandasHDFStore,
    but it simplifies (speeds up?) cross-frame queries,
    like queries for a single probe's entire trajectory.

    Any additional keyword arguments to the constructor are passed to
    pandas.HDFStore().
    """

    def __init__(self, filename, key='FrameData', mode='a', t_column='frame',
                 use_tabular_copy=False, **kwargs):
        self.filename = os.path.abspath(filename)
        self.key = key
        self._t_column = t_column
        self.store = pd.HDFStore(self.filename, mode, **kwargs)

        with pd.get_store(self.filename) as store:
            try:
                store[self.key]
            except KeyError:
                pass
            else:
                self._validate_node(use_tabular_copy)

    @property
    def t_column(self):
        return self._t_column

    def put(self, df):
        if len(df) == 0:
            warnings.warn('An empty DataFrame was passed to put(). Continuing.')
            return
        self._validate(df)
        self.store.append(self.key, df, data_columns=True)

    def get(self, frame_no):
        frame = self.store.select(self.key, '{0} == {1}'.format(
            self._t_column, frame_no))
        return frame

    def dump(self, N=None):
        """Return data from all, or the first N, frames in a single DataFrame

        Parameters
        ----------
        N : integer
            optional; if None, return all frames

        Returns
        -------
        DataFrame
        """
        if N is None:
            return self.store.select(self.key)
        else:
            Nth_frame = self.frames[N - 1]
            return self.store.select(self.key, '{0} <= {1}'.format(
                self._t_column, Nth_frame))

    def close(self):
        self.store.close()

    def __del__(self):
        if hasattr(self, 'store'):
            self.close()

    @property
    def frames(self):
        """Returns sorted list of integer frame numbers in file"""
        # I assume one column can fit in memory, which is not ideal.
        # Chunking does not seem to be implemented for select_column.
        frame_nos = self.store.select_column(self.key, self.t_column).unique()
        frame_nos.sort()
        return frame_nos

    def _validate_node(self, use_tabular_copy):
        # The HDFStore might be non-tabular, which means we cannot select a
        # subset, and this whole structure will not work.
        # For convenience, this can rewrite the table into a tabular node.
        if use_tabular_copy:
            self.key = _make_tabular_copy(self.filename, self.key)

        pandas_type = getattr(getattr(getattr(
            self.store._handle.root, self.key, None), '_v_attrs', None),
            'pandas_type', None)
        if not pandas_type == 'frame_table':
            raise ValueError("This node is not tabular. Call with "
                             "use_tabular_copy=True to proceed.")
def _make_tabular_copy(store, key):
    """Copy the contents of a nontabular node in a pandas HDFStore
    into a tabular node"""
    tabular_key = key + '/tabular'
    logger.info("Making a tabular copy of %s at %s", key, tabular_key)
    store.append(tabular_key, store.get(key), data_columns=True)
    return tabular_key
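Finally, a brief usage sketch of the framewise approach above, assuming each appended DataFrame carries a 'frame' column identifying the iteration (the data below is made up):
# Hypothetical usage of PandasHDFStoreBig for the iterative-append workflow.
import numpy as np

with PandasHDFStoreBig("db_framewise.h5") as store:
    for i in range(100):
        df = pd.DataFrame({"n": np.random.randn(5), "frame": i})
        store.put(df)        # one node per frame; no costly table rebuilds
    all_data = store.dump()  # concatenate every frame into a single DataFrame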
