Here is the code where the error occurs:
train_data = LngDataset(zipfile.Path(archive, "train/train/"))
test_data = LngDataset(zipfile.Path(archive, "test/test/"))
train_loader = data.DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = data.DataLoader(test_data, batch_size=32, shuffle=True)
archive is a ZipFile object containing two folders, train and test.
The LngDataset class:
class LngDataset(data.Dataset):
def __init__(self, root):
super().__init__()
self.items = list(root.iterdir())
def __getitem__(self, index):
item = self.items[index]
if item.name.startswith("de_"):
y = 0
elif item.name.startswith("en_"):
y = 1
elif item.name.startswith("es_"):
y = 2
else:
raise ValueError(f"Unable to determine label for: {item.name}")
signal, sample_rate = soundfile.read(item.open("r"))
X = fbanks(signal, sample_rate).astype(np.float32)
X = X[np.newaxis, ...]
return X, y
def __len__(self):
return len(self.items)
I checked the documentation of zipfile. The Path attribute exists, so I don't understand how to solve this issue.
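Edit: one thing worth checking is the interpreter version. zipfile.Path was only added in Python 3.8, so on an older Python the attribute lookup fails even though the current documentation lists it. A quick diagnostic (just a sanity-check sketch):
import sys
import zipfile

# zipfile.Path exists only on Python >= 3.8; older versions raise AttributeError
print(sys.version_info)
print(hasattr(zipfile, "Path"))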
Related
I am attempting to create a model. I have a sample of images and labels, and in my attempt at loading the two as a custom dataset for the DataLoader, I keep encountering a TypeError:
'list' object cannot be interpreted as an integer.
Here's the code:
train_data = vids_and_label(training_data_set, vid_cvs_labels, sequence_length=10, transform=train_transform)
train_loader = DataLoader(train_data, batch_size=4, shuffle=True, num_workers=4)
class vids_and_label(Dataset):
def __init__(self, vid_title, labels, sequence_length=60, transform=None):
self.label = labels
self.video_title = vid_title
self.sequence_length = sequence_length
self.transform = transform
def __len__(self):
return self.video_title
def __getitem__(self, item):
path = self.video_title[item]
_vid = f'{path}'.split('\\')[-1]
_lbls = self.label
frames = []
_label = _lbls.iloc[(_lbls.loc[_lbls['file'] == _vid].index.values[0]), 1]
if _label == 'fake' or _label == 'FAKE':
_label = 0
if _label == 'real' or _label == 'REAL':
_label = 1
for i, _frame in enumerate(frame_slicer(path)):
frames.append(self.transform(_frame))
if len(frames) == self.sequence_length:
break
frames = torch.stack(frames)
frames = frames[:self.sequence_length]
return frames, _label
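The TypeError most likely comes from __len__: the DataLoader's sampler calls len(dataset) and expects an integer, but this __len__ returns the list of titles itself. A minimal fix:
    def __len__(self):
        # the DataLoader needs an integer count, not the list itself
        return len(self.video_title)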
I am training a keras model with a data generator that reads the data in batches from a directory. This works great with model.fit(). But when using model.predict(), I would like to have both ypred and ytrue values returned.
Can I enable/modify model.predict() to do this (maybe with a custom callback)?
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, ids, batch_size=256):
self.batch_size=batch_size
self.ids = ids
    def __len__(self):
        return len(self.ids)
    def __getitem__(self, index):
        # index through self.ids so the shuffle in on_epoch_end takes effect
        X, y = np.load(f'data/{self.ids[index]}.npy', allow_pickle=True)
return X, y
def on_epoch_end(self):
'''Shuffle ids in each epoch'''
self.ids = np.random.choice(self.ids, len(self.ids), replace=False)
model = buildModel() #builds a multilayer perceptron
train_ids = np.arange(10000) #training data are in data/0.npy, data/1.npy, ... data/9999.npy
val_ids = np.arange(10000, 12000)
train_generator = DataGenerator(train_ids)
val_generator = DataGenerator(val_ids)
# Train model
history = model.fit(x=train_generator, epochs=100)
# Validate model (but I don't have ytrue)
ypred = model.predict(x=val_generator).reshape(-1)
# What I would like to achieve
(ypred, ytrue) = model.predict(x=val_generator, callbacks=[some_custom_callback])
# Or
ypred = model.predict(x=val_generator)
ytrue = some_fancy_method(val_generator)
This can be done by adding a method to your DataGenerator class that takes the fitted model as input, applies it to the generated data batches, and returns ytrue and ypred.
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, ids, batch_size=256):
self.batch_size=batch_size
self.ids = ids
    def __len__(self):
        return len(self.ids)
    def __getitem__(self, index):
        X, y = self.load_data(self.ids[index])
return X, y
def load_data(self, index):
X, y = np.load(f'data/{index}.npy', allow_pickle=True)
return X, y
def predict(self, model):
ytrue, ypred = [], []
for index in self.ids:
X, y = self.load_data(index)
pred = model.predict(X).reshape(-1)
ytrue.extend(y)
ypred.extend(pred)
return ytrue, ypred
def on_epoch_end(self):
'''Shuffle ids in each epoch'''
self.ids = np.random.choice(self.ids, len(self.ids), replace=False)
train_generator = DataGenerator(train_ids)
val_generator = DataGenerator(val_ids)
# Train model
history = model.fit(x=train_generator, epochs=100)
# Validate model
ypred, ytrue = val_generator.predict(model)
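Alternatively, without modifying the class, you can walk the Sequence yourself, reading each (X, y) batch exactly once so that predictions and labels stay aligned; a small sketch:
import numpy as np

ytrue, ypred = [], []
for i in range(len(val_generator)):
    X, y = val_generator[i]   # Sequence indexing yields (X, y) batches
    ytrue.append(y)
    ypred.append(model.predict(X).reshape(-1))
ytrue = np.concatenate(ytrue)
ypred = np.concatenate(ypred)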
I'm trying to preprocess some text data, but after creating a PyTorch DataLoader and looping through it to check that it works, I get a BrokenPipeError. However, when I try the same code in Google Colab it works, so I think it may be a problem with my setup.
(The Collate class is useless; I just haven't removed it yet.)
import numpy as np
import pandas as pd
data = pd.read_csv("imdb.csv")
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import spacy
spacy_eng = spacy.load("en")
class Vocabulary():
def __init__(self, freq_threshold=4):
self.word_to_index = {"<PAD>":0, "<SOS>":1, "<EOS>":2, "<UNK>":3}
self.freq_threshold = freq_threshold
self.max_length = 0
def __len__(self):
return len(self.word_to_index)
    @staticmethod
def tokenizer_eng(text):
return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
def build_vocabulary(self, sentence_list):
frequencies = {}
idx = 4
longest_length = 0
for sentence in sentence_list:
if len(sentence) > longest_length:
self.max_length = len(sentence)
longest_length = self.max_length
for word in self.tokenizer_eng(sentence):
if word not in frequencies:
frequencies[word] = 1
else:
frequencies[word] += 1
if frequencies[word] == self.freq_threshold:
self.word_to_index[word] = idx
idx += 1
self.max_length += 25
def numericalize(self, text):
tokenized_text = self.tokenizer_eng(text)
vector_text = []
for token in tokenized_text:
if token in self.word_to_index:
vector_text.append(self.word_to_index[token])
else:
vector_text.append(self.word_to_index["<UNK>"])
vector_text.append(self.word_to_index["<EOS>"])
pad_length = self.max_length - len(vector_text)
for i in range(0, pad_length):
vector_text.append(self.word_to_index["<PAD>"])
return vector_text
class IMDBDataset(Dataset):
def __init__(self):
data = pd.read_csv("imdb.csv").to_numpy()
self.target = []
for data_point in data[:, 2]:
if data_point == "neg":
self.target.append(0)
else:
self.target.append(1)
self.text = data[:, 4]
self.vocab = Vocabulary()
self.vocab.build_vocabulary(self.text)
def __len__(self):
return self.text.shape[0]
def __getitem__(self, idx):
review = self.text[idx]
vector_text = [self.vocab.word_to_index["<SOS>"]]
vector_text += self.vocab.numericalize(review)
target = self.target[idx]
return torch.tensor(vector_text), torch.tensor(target)
class Collate:
def __init__(self, pad_idx):
self.pad_idx = pad_idx
def __call__(self, batch):
text = [item[0] for item in batch]
text = nn.utils.rnn.pad_sequence(text, batch_first=False, padding_value=self.pad_idx)
        targets = torch.stack([item[1] for item in batch])
        return text, targets
def get_loader(batch_size=32, num_workers=4, shuffle=True, pin_memory=True):
dataset = IMDBDataset()
pad_idx = dataset.vocab.word_to_index["<PAD>"]
loader = DataLoader(
dataset=dataset,
batch_size=batch_size,
num_workers=num_workers,
shuffle=shuffle,
pin_memory=pin_memory,
collate_fn=Collate(pad_idx=pad_idx) # Redundant now
)
return loader, dataset
train_dl, train_ds = get_loader()
for idx, (data, target) in enumerate(train_dl):
print(data.shape)
I don't know why it worked, but removing the get_loader() function and constructing the DataLoader directly fixed this:
train_dl = DataLoader(dataset, batch_size=32, shuffle=True)
Note that this falls back to the default num_workers=0, which is presumably why the error goes away.
You can try using
if __name__ == '__main__' and '__file__' in globals():
I guess you are using Windows. PyTorch's DataLoader gives this error when num_workers > 0. To fix it, set num_workers = 0 or create the DataLoader under if __name__ == "__main__":. (The guard likely helps because, on Windows, worker processes are started with spawn, which re-imports your main script; without the guard, the DataLoader construction runs again inside every worker.)
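A minimal sketch of the guarded entry point, reusing the code from the question:
if __name__ == "__main__":
    train_dl, train_ds = get_loader()
    for idx, (data, target) in enumerate(train_dl):
        print(data.shape)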
class CIFAR10Sequence(Sequence):
def __init__(self, x_set, y_set, batch_size):
self.x, self.y = x_set, y_set
self.epoch = 0
self.batch = 0
self.batch_size = batch_size
self.per = 4
def __len__(self):
return int(np.ceil(len(self.x) / float(self.batch_size)))
def __getitem__(self, idx):
        # presumably: use the first self.per percent of the data (the fraction grows in on_batch_end)
        batch_x = self.x[:int(np.ceil(self.x.shape[0] * (self.per / 100)))]
        batch_y = self.y[:int(np.ceil(self.x.shape[0] * (self.per / 100)))]
        return np.array(batch_x), np.array(batch_y)
def on_batch_end(self):
if self.epoch % 100 == 0:
self.per = self.per*1.9
self.epoch += 1
train_datagen = CIFAR10Sequence(new_x_sort, new_x_sort, 100)
test_datagen = CIFAR10Sequence(cifar100_dataset.x_test,
cifar100_dataset.x_test, 100)
model.fit_generator(generator=train_datagen, steps_per_epoch=len(new_x_sort)//100, epochs=20)
but I get:
TypeError: 'CIFAR10Sequence' object is not an iterator
You need an __iter__() method on CIFAR10Sequence.
Something like:
def __iter__(self):
    for idx in range(len(self)):
        yield self[idx]
It looks like the Sequence object must have been referenced in another context before you ever reached the fit_generator call: that call does not take a keyword argument named generator, so you would have gotten a keyword-argument error if you had reached it. The Sequence object does have __iter__(), so it is iterable, but it does not have __next__(), so it is not an iterator; if something used it as one, it would throw exactly this error. __iter__() is all that Keras fit needs.
I am implementing a neural network for a prediction task, currently working only with the numpy library, and I am adapting the code to the data that I have.
Below is the current state of the network; there is an error at the end of the run that I do not understand well.
Can anyone help me, please?
import numpy as np
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn
class LinearLayer:
def __init__(self, n_input, n_output):
self.n = n_input
self.m = n_output
self.W = (1/np.sqrt(n_input))*np.random.rand(n_input+1, n_output)
    def forward(self, X):
        # a 1-D input is treated as a batch of one sample
        if len(X.shape) == 1:
            X = X.reshape(-1, self.n)
        self.input = np.zeros((X.shape[0], self.n + 1))
        self.input[:, :-1] = X
        self.input[:, -1] = 1
        self.output = self.input.dot(self.W)  # xW + b
        return self.output
def backward(self, d_out):
self.gradients = self.W.dot(d_out)[:-1]
self.dW = np.einsum("ij,ki", self.input, d_out)
return self.gradients
def updateWeights(self, lr=0.1):
self.W = self.W - lr*self.dW
class Sigmoid:
def __init__(self, n_input):
self.output = np.zeros(n_input)
self.gradients = np.zeros(n_input)
def forward(self, X):
self.output = 1/(np.exp(-X)+1)
return self.output
def backward(self, d_out):
ds = self.output.T*(1 - self.output).T
self.gradients = ds*d_out
return self.gradients
print("Training a multilayer perceptron\n")
import pandas as pd
data = pd.read_csv('Data_Balanceada.csv') #Data (74,11)
X = data.iloc[:,0:11]
y = data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=1)
h1 = LinearLayer(11,1) #stack some layers
s1 = Sigmoid(7)
h2 = LinearLayer(7,1)
s2 = Sigmoid(1)
def loss(pred, target):
return np.mean(np.power(pred-target,2))
predict = lambda x: s2.forward(h2.forward(s1.forward(h1.forward(x))))
backpropagate = lambda d: h1.backward(s1.backward(h2.backward(s2.backward(d))))
lr = 0.005
n = 0 # patience
max_epochs = 1500
valid = loss(predict(X_test), y_test)
for i in range(max_epochs):
l = 0
p = predict(X_train)
backpropagate(p.T-y_train.T)
h1.updateWeights(lr)
h2.updateWeights(lr)
l = loss(p,y_train)
new_valid = loss(predict(X_test), y_test)
if new_valid < valid:
valid = new_valid
n = 0
else:
n += 1
if n > 50: break
if i%50 == 0:
print("Loss: {0}\t\tValidation: {1}".format(l/100, valid))
lr = lr*0.97
# Validation
print("\nFinal validation loss: {0}. {1} epochs\n".format(loss(predict(X_test), y_test),i+1))
#print(np.argmax(predict(X_test), axis=1))
#print(np.argmax(y_test, axis=1))
Dataset link:
https://mega.nz/#!jM8AQAbB!61NOeJadGXtiKJQsn_tdJ955p5lRD6kQjBlCQTHtt6I
I have this error:
Data must be 1-dimensional
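For reference: assuming the traceback points at one of the pred-minus-target expressions, the usual trigger for 'Data must be 1-dimensional' is mixing the pandas objects returned by train_test_split with 2-D numpy arrays. The layer sizes also do not line up: h1 outputs 1 unit, but s1 and h2 expect 7. A hedged sketch of both adjustments, using the names from the code above:
# make the stack consistent: 11 inputs -> 7 hidden units -> 1 output
h1 = LinearLayer(11, 7)   # was LinearLayer(11, 1)
s1 = Sigmoid(7)
h2 = LinearLayer(7, 1)
s2 = Sigmoid(1)

# convert pandas objects to plain numpy arrays with explicit shapes
X_train, X_test = X_train.values, X_test.values
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)
(If Data_Balanceada.csv really has 11 columns, note that X = data.iloc[:,0:11] would also include the label column; data.iloc[:,:-1] avoids that, at the cost of changing the first layer to 10 inputs.)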