I'm trying to preprocess some text data, but after creating a PyTorch DataLoader and looping through it to check that it works, I get a BrokenPipeError. The same code works in Google Colab, so I think the problem may be my local setup.
(The Collate class is unused; I just haven't removed it yet.)
import numpy as np
import pandas as pd

data = pd.read_csv("imdb.csv")

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import spacy
spacy_eng = spacy.load("en")

class Vocabulary():
    def __init__(self, freq_threshold=4):
        self.word_to_index = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold
        self.max_length = 0

    def __len__(self):
        return len(self.word_to_index)

    @staticmethod
    def tokenizer_eng(text):
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 4
        longest_length = 0
        for sentence in sentence_list:
            if len(sentence) > longest_length:
                self.max_length = len(sentence)
                longest_length = self.max_length
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1
                if frequencies[word] == self.freq_threshold:
                    self.word_to_index[word] = idx
                    idx += 1
        self.max_length += 25

    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)
        vector_text = []
        for token in tokenized_text:
            if token in self.word_to_index:
                vector_text.append(self.word_to_index[token])
            else:
                vector_text.append(self.word_to_index["<UNK>"])
        vector_text.append(self.word_to_index["<EOS>"])
        pad_length = self.max_length - len(vector_text)
        for i in range(0, pad_length):
            vector_text.append(self.word_to_index["<PAD>"])
        return vector_text
class IMDBDataset(Dataset):
    def __init__(self):
        data = pd.read_csv("imdb.csv").to_numpy()
        self.target = []
        for data_point in data[:, 2]:
            if data_point == "neg":
                self.target.append(0)
            else:
                self.target.append(1)
        self.text = data[:, 4]
        self.vocab = Vocabulary()
        self.vocab.build_vocabulary(self.text)

    def __len__(self):
        return self.text.shape[0]

    def __getitem__(self, idx):
        review = self.text[idx]
        vector_text = [self.vocab.word_to_index["<SOS>"]]
        vector_text += self.vocab.numericalize(review)
        target = self.target[idx]
        return torch.tensor(vector_text), torch.tensor(target)
class Collate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        text = [item[0] for item in batch]
        text = nn.utils.rnn.pad_sequence(text, batch_first=False, padding_value=self.pad_idx)
        return text, batch[1]
def get_loader(batch_size=32, num_workers=4, shuffle=True, pin_memory=True):
    dataset = IMDBDataset()
    pad_idx = dataset.vocab.word_to_index["<PAD>"]
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=Collate(pad_idx=pad_idx)  # Redundant now
    )
    return loader, dataset

train_dl, train_ds = get_loader()

for idx, (data, target) in enumerate(train_dl):
    print(data.shape)
I'm not sure why this works, but removing the get_loader() function and constructing the DataLoader directly fixed it (note that this also leaves num_workers at its default of 0):
train_dl = DataLoader(dataset, batch_size=32, shuffle=True)
You can try using
if __name__ == '__main__' and '__file__' in globals():
I guess you are using Windows. PyTorch's DataLoader gives this error on Windows when you set num_workers > 0. To fix it, either set num_workers = 0 or create and iterate the DataLoader under an if __name__ == "__main__": guard.
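For illustration, a minimal sketch of the guard applied to the code from the question (assuming the get_loader() defined above): on Windows, DataLoader workers are started with the spawn method, which re-imports the main script, so any loader creation or iteration at module level runs again inside every worker and triggers the BrokenPipeError.

def main():
    train_dl, train_ds = get_loader(num_workers=4)
    for idx, (data, target) in enumerate(train_dl):
        print(data.shape)

if __name__ == "__main__":
    # only the parent process executes this; the re-import in worker processes does not
    main()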
Related
I am attempting to create a model. I have a sample of images and labels, and in my attempt at loading the two as a custom dataset through the DataLoader, I keep encountering a TypeError:
'list' object cannot be interpreted as an integer.
Here's the code:
train_data = vids_and_label(training_data_set, vid_cvs_labels, sequence_length=10, transform=train_transfrom)
train_loader = DataLoader(train_data, batch_size=4, shuffle=True, num_workers=4)

class vids_and_label(Dataset):
    def __init__(self, vid_title, labels, sequence_length=60, transform=None):
        self.label = labels
        self.video_title = vid_title
        self.sequence_length = sequence_length
        self.transform = transform

    def __len__(self):
        return self.video_title

    def __getitem__(self, item):
        path = self.video_title[item]
        _vid = f'{path}'.split('\\')[-1]
        _lbls = self.label
        frames = []
        _label = _lbls.iloc[(_lbls.loc[_lbls['file'] == _vid].index.values[0]), 1]
        if _label == 'fake' or _label == 'FAKE':
            _label = 0
        if _label == 'real' or _label == 'REAL':
            _label = 1
        for i, _frame in enumerate(frame_slicer(path)):
            frames.append(self.transform(_frame))
            if len(frames) == self.sequence_length:
                break
        frames = torch.stack(frames)
        frames = frames[:self.sequence_length]
        return frames, _label
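A side observation on the code above (my reading, not part of the original question): __len__ returns the list self.video_title itself, while the DataLoader's sampler expects an integer length, which is exactly what the "'list' object cannot be interpreted as an integer" message complains about. A minimal sketch of the corrected method:

    def __len__(self):
        # the DataLoader's sampler calls len(dataset) and needs an int, not the list itself
        return len(self.video_title)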
Here is the code where the error occurs:
train_data = LngDataset(zipfile.Path(archive, "train/train/"))
test_data = LngDataset(zipfile.Path(archive, "test/test/"))
train_loader = data.DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = data.DataLoader(test_data, batch_size=32, shuffle=True)
archive is a ZIP file that contains two folders, train and test.
The LngDataset class:
class LngDataset(data.Dataset):
    def __init__(self, root):
        super().__init__()
        self.items = list(root.iterdir())

    def __getitem__(self, index):
        item = self.items[index]
        if item.name.startswith("de_"):
            y = 0
        elif item.name.startswith("en_"):
            y = 1
        elif item.name.startswith("es_"):
            y = 2
        else:
            raise ValueError(f"Unable to determine label for: {item.name}")
        signal, sample_rate = soundfile.read(item.open("r"))
        X = fbanks(signal, sample_rate).astype(np.float32)
        X = X[np.newaxis, ...]
        return X, y

    def __len__(self):
        return len(self.items)
I checked the zipfile documentation and the Path attribute does exist, so I don't understand how to solve this issue.
I am using a data generator in Keras to train a model on a large dataset, but I get the error Error when checking input: expected input_8 to have 4 dimensions, but got array with shape () every time on the last batch of the first epoch. I checked my dataset file and it doesn't contain any empty arrays, so where does the empty array come from? I even tried printing the arrays as they were generated, and a few of them were indeed empty. Here is my code for the data generator:
class data_generator(Sequence):
    def __init__(self, data_file, type_data, batch_size, shuffle=True):
        self.data_file = data_file
        self.type_data = type_data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def on_epoch_end(self):
        if self.type_data == "train":
            self.indices = np.arange(3450000)
        else:
            self.indices = np.arange(345000)
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __data__generation(self, indices):
        return X, Y

    def __len__(self):
        if self.type_data == "train":
            return int(np.ceil(10000 / float(self.batch_size)))
        else:
            return int(np.ceil(1000 / float(self.batch_size)))

    def __getitem__(self, index):
        #print(self.indices[(index)*self.batch_size], self.indices[(index+1)*self.batch_size])
        X = np.array(HDF5Matrix(self.data_file, self.type_data + "_X", start=self.indices[index*self.batch_size], end=self.indices[(index+1)*self.batch_size]))
        Y = np.array(HDF5Matrix(self.data_file, self.type_data + "_Y", start=self.indices[index*self.batch_size], end=self.indices[(index+1)*self.batch_size]))
        #print(X.shape, Y.shape)
        return X, Y
And here is my code for starting the fit generator:
train_generator = data_generator("drive/My Drive/Dataset/dataset.h5", "train", 20)
eval_generator = data_generator("drive/My Drive/Dataset/dataset.h5", "eval", 20)
model = create_model()
history = model.fit_generator(generator = train_generator,epochs = 100,validation_data=eval_generator,use_multiprocessing=False)
How do I solve this issue? Also, is there any alternative to a data generator for training on large datasets? The data generator has been very buggy for me and gives lots of errors.
The code had a few mistakes. I changed it and it is working now, though I still don't know exactly why the error occurred. Here is the new code:
class data_generator(Sequence):
    def __init__(self, data_file, type_data, batch_size, shuffle=True):
        self.data_file = data_file
        self.type_data = type_data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def on_epoch_end(self):
        if self.type_data == "train":
            self.indices = np.arange(3450000)
        else:
            self.indices = np.arange(345000)
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __data__generation(self, indices):
        X = []
        Y = []
        for index in indices:
            X.append(np.array(HDF5Matrix(self.data_file, self.type_data + "_X", start=index, end=index + 1)[0]))
            Y.append(np.array(HDF5Matrix(self.data_file, self.type_data + "_Y", start=index, end=index + 1)[0]))
        X = np.array(X)
        Y = np.array(Y)
        return X, Y

    def __len__(self):
        if self.type_data == "train":
            return int(np.ceil(3450000 / float(self.batch_size)))
        else:
            return int(np.ceil(345000 / float(self.batch_size)))

    def __getitem__(self, index):
        indices = self.indices[index*self.batch_size:(index+1)*self.batch_size]
        X, Y = self.__data__generation(indices)
        #print(X.shape, Y.shape, index)
        return X, Y
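A side note on why the original __getitem__ could produce empty arrays (my reading, not part of the original answer): it used shuffled index values, rather than positions, as the HDF5 start and end, so roughly half the time start > end and the resulting slice is empty. A small illustration with made-up numbers:

import numpy as np

indices = np.random.permutation(3450000)    # shuffled *values*, not positions
start = indices[0]                          # used as the HDF5 slice start in the old code
end = indices[20]                           # used as the slice end (batch_size = 20)
print(start, end)                           # roughly half the time start > end
print(np.arange(3450000)[start:end].shape)  # (0,) whenever start > end, i.e. an empty batch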
If you use a plain Python generator instead of a Sequence, Keras needs it wrapped in a while True (infinite) loop to avoid StopIteration; otherwise, once the correct number of steps_per_epoch (sample_size // batch_size) has been consumed, the generator runs out and you end up with zero-shaped batches.
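For illustration, a minimal sketch of a plain generator wrapped in while True (the names batch_generator, X, and Y are hypothetical, not from the question):

import numpy as np

def batch_generator(X, Y, batch_size):
    # yield batches forever so Keras never hits StopIteration
    n = X.shape[0]
    while True:
        indices = np.random.permutation(n)
        for start in range(0, n, batch_size):
            batch_idx = indices[start:start + batch_size]
            yield X[batch_idx], Y[batch_idx]

# usage: steps_per_epoch tells Keras where one epoch ends
# model.fit_generator(batch_generator(X_train, Y_train, 32),
#                     steps_per_epoch=len(X_train) // 32, epochs=10)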
I am implementing a neural network for prediction using only the numpy library, and I am adapting the code to the data that I have. Below is my current progress on the network; I get an error at the end of the code and I don't understand it well. Can anyone help me, please?
import numpy as np
from sklearn.cross_validation import train_test_split

class LinearLayer:
    def __init__(self, n_input, n_output):
        self.n = n_input
        self.m = n_output
        self.W = (1/np.sqrt(n_input))*np.random.rand(n_input+1, n_output)

    def forward(self, X):
        self.input = np.zeros((X.shape[0], self.n+1))
        # if only one feature, the input should always be a batch, at least
        if len(X.shape) == 1:  # of one element
            self.input[:-1, :] = X.reshape(-1, self.n)
        else:
            self.input[:, :-1] = X
        self.input[:, -1] = 1
        self.output = self.input.dot(self.W)  # xW + b
        return self.output

    def backward(self, d_out):
        self.gradients = self.W.dot(d_out)[:-1]
        self.dW = np.einsum("ij,ki", self.input, d_out)
        return self.gradients

    def updateWeights(self, lr=0.1):
        self.W = self.W - lr*self.dW

class Sigmoid:
    def __init__(self, n_input):
        self.output = np.zeros(n_input)
        self.gradients = np.zeros(n_input)

    def forward(self, X):
        self.output = 1/(np.exp(-X)+1)
        return self.output

    def backward(self, d_out):
        ds = self.output.T*(1 - self.output).T
        self.gradients = ds*d_out
        return self.gradients

print("Training a multilayer perceptron\n")

import pandas as pd
data = pd.read_csv('Data_Balanceada.csv')  # Data (74,11)
X = data.iloc[:, 0:11]
y = data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

h1 = LinearLayer(11, 1)  # stack some layers
s1 = Sigmoid(7)
h2 = LinearLayer(7, 1)
s2 = Sigmoid(1)

def loss(pred, target):
    return np.mean(np.power(pred-target, 2))

predict = lambda x: s2.forward(h2.forward(s1.forward(h1.forward(x))))
backpropagate = lambda d: h1.backward(s1.backward(h2.backward(s2.backward(d))))

lr = 0.005
n = 0  # patience
max_epochs = 1500
valid = loss(predict(X_test), y_test)
for i in range(max_epochs):
    l = 0
    p = predict(X_train)
    backpropagate(p.T-y_train.T)
    h1.updateWeights(lr)
    h2.updateWeights(lr)
    l = loss(p, y_train)
    new_valid = loss(predict(X_test), y_test)
    if new_valid < valid:
        valid = new_valid
        n = 0
    else:
        n += 1
    if n > 50: break
    if i % 50 == 0:
        print("Loss: {0}\t\tValidation: {1}".format(l/100, valid))
    lr = lr*0.97

# Validation
print("\nFinal validation loss: {0}. {1} epochs\n".format(loss(predict(X_test), y_test), i+1))
#print(np.argmax(predict(X_test), axis=1))
#print(np.argmax(y_test, axis=1))
Dataset link:
https://mega.nz/#!jM8AQAbB!61NOeJadGXtiKJQsn_tdJ955p5lRD6kQjBlCQTHtt6I
I get this error:
Data must be 1-dimensional
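A side observation on the code above (my reading, not part of the original question): the layer sizes are inconsistent, since h1 = LinearLayer(11, 1) outputs a single column but is followed by Sigmoid(7) and LinearLayer(7, 1), which expect 7 hidden units. A dimensionally consistent stack would look like the sketch below; whether this mismatch is what ultimately triggers the pandas "Data must be 1-dimensional" error would need to be confirmed against the full traceback.

# hypothetical, dimensionally consistent layer stack: 11 inputs -> 7 hidden -> 1 output
h1 = LinearLayer(11, 7)   # was LinearLayer(11, 1) in the question
s1 = Sigmoid(7)
h2 = LinearLayer(7, 1)
s2 = Sigmoid(1)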
The first module is minibatch:
import numpy as np
import tensorflow as tf
import Utils.neighbor_samplers as samplers

class Minibatch:
    def __init__(self, embedding, batch_size):
        self.embedding = embedding
        self.batch_idx = [i for i in range(self.embedding.shape[0])]
        self.batch_size = batch_size
        self.iters = 0
        self.batch_permutation = np.random.permutation(self.batch_idx)  # List
        self.next_batch = []
        self.max_iters = embedding.shape[0] // self.batch_size

    def next_batch(self):
        self.start_idx = self.iters * self.batch_size
        self.iters += 1
        self.end_idx = self.start_idx + self.batch_size
        #next_batch = tf.nn.embedding_lookup(embedding, [i for i in range(start_idx, end_idx)])
        self.next_idx = self.batch_permutation[self.start_idx : self.end_idx]
        #next_neighbors = tf.nn.embedding_lookup(self.embedding, next_idx)

    def shuffle(self):
        self.batch_permutation = np.random.permutation(self.batch_idx)
        self.batch_num = 0
And the second module, model:
def train(self):
    batch = minibatch.Minibatch(self.normal_embedding, self.batch_size)
    for epoch in range(self.epoch):
        batch.shuffle()
        print('Epoch : %04d' % (epoch + 1))
        for iter in range(batch.max_iters):
            if iter % 100 == 0 and iter != 0:
                print('%d iters done' % (iter))
            next_idx = batch.next_batch
With the last line, next_idx = batch.next_batch, I want to get the indices of the next batch to look up in the embeddings, but it keeps returning an empty list.
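An observation on the code above (my reading, not part of the original post): in __init__, the assignment self.next_batch = [] creates an instance attribute that shadows the next_batch method, so batch.next_batch is simply that empty list; the method also stores its result in self.next_idx instead of returning it. A minimal sketch of one way to restructure the class, dropping the shadowing attribute and returning the indices directly:

import numpy as np

class Minibatch:
    def __init__(self, embedding, batch_size):
        self.embedding = embedding
        self.batch_size = batch_size
        self.iters = 0
        self.max_iters = embedding.shape[0] // batch_size
        # no "self.next_batch = []" attribute here, so the method is not shadowed
        self.batch_permutation = np.random.permutation(embedding.shape[0])

    def next_batch(self):
        # return the indices of the next batch and advance the counter
        start = self.iters * self.batch_size
        self.iters += 1
        return self.batch_permutation[start:start + self.batch_size]

    def shuffle(self):
        self.batch_permutation = np.random.permutation(self.embedding.shape[0])
        self.iters = 0

# usage: call the method (with parentheses) instead of reading an attribute
# batch = Minibatch(normal_embedding, batch_size)
# next_idx = batch.next_batch()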