DataLoader fails with T5 tokenizer - nlp

I am trying to train a T5 model for an NER problem, but whenever I try to load the data it fails.
Here is how I load the tokenizer:
tokenizer = AutoTokenizer.from_pretrained("t5-base")
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)
Then I created the DataLoaders as follows:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0}

test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': True,
               'num_workers': 0}
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
I was facing issues training the model, so I tried to debug the data loading:
for idx, batch in enumerate(training_loader):
    print(idx)
    break
But I am getting this error:
<ipython-input-8-fa9e34b10c06> in __getitem__(self, index)
34 if mapping[0] == 0 and mapping[1] != 0:
35 # overwrite label
---> 36 encoded_labels[idx] = labels[i]
37 i += 1
38
IndexError: list index out of range
I checked the dataset; it is not empty, so I am not sure what is wrong.
I tried the same code with BERT by changing this line:
tokenizer = AutoTokenizer.from_pretrained("t5-base")
To:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
and it worked perfectly, so the issue seems specific to T5.
Here is the Dataset class:
import numpy as np
import torch
from torch.utils.data import Dataset

class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: get the sentence and word labels
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  truncation=True,
                                  padding='max_length',
                                  max_length=self.max_len)

        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels]
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                # overwrite label
                encoded_labels[idx] = labels[i]
                i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        return item

    def __len__(self):
        return self.len
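A likely cause: the label-alignment loop assumes BERT-style offset mappings, where only the first piece of each word starts at character offset 0. T5 uses a SentencePiece tokenizer whose offsets do not follow that convention, so the condition can match more tokens than there are word labels and labels[i] runs past the end of the list, which matches the IndexError above. Below is a minimal sketch of an alternative alignment that relies on word_ids() from the fast tokenizer instead of offsets; the helper name align_labels_with_word_ids is mine, and labels_to_ids is assumed to exist as in the question's code.

import numpy as np
import torch

def align_labels_with_word_ids(encoding, word_labels, labels_to_ids):
    """Label only the first sub-token of each word; everything else gets -100."""
    labels = [labels_to_ids[label] for label in word_labels]
    encoded_labels = np.full(len(encoding["input_ids"]), -100, dtype=int)
    previous_word_id = None
    for idx, word_id in enumerate(encoding.word_ids()):
        # word_id is None for special tokens and padding
        if word_id is not None and word_id != previous_word_id:
            encoded_labels[idx] = labels[word_id]
        previous_word_id = word_id
    return torch.as_tensor(encoded_labels)

Inside __getitem__, this would replace step 3 and the offset-mapping loop: item['labels'] = align_labels_with_word_ids(encoding, word_labels, labels_to_ids). Note that word_ids() is only available on fast (Rust-backed) tokenizers, which is what AutoTokenizer.from_pretrained("t5-base") returns by default.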

Related

PyTorch Dataloader bucket by tensor length

I've been trying to create a custom DataLoader that serves batches of data that are all the same size, to feed into a Conv2d layer for classification purposes.
Here's some test data:
X is NUMBER OF POINTS x CHOICES x NUM_FEATURES, while y is the label (any integer from 0 to CHOICES-1).
I'm having trouble writing the Sampler and Dataloader.
import random
import torch
from collections import defaultdict
from sklearn.utils import shuffle
from torch.utils.data import Dataset, DataLoader
from typing import Sequence, Iterator
import numpy as np

sample_probs = np.array([2.04302017e-03, 6.84249612e-03, 3.18776004e-02, 6.69332322e-01,
                         1.79056125, 1.63388916, 1.31819391, 1.43798623,
                         2.44057406, 5.51664089e-01, 9.66624185e-02, 1.67495225e-02,
                         3.59960696e-03, 2.43216687e-05])

X = []
y = []
train_datasets = []
i_dict = {0: 19, 1: 63, 2: 30, 3: 6192, 4: 16564, 5: 15115, 6: 12195,
          7: 13303, 8: 22578, 9: 5103, 10: 894, 11: 155, 12: 33, 13: 2}

for i in range(2, 16):
    temp_x = []
    temp_y = []
    for j in range(i_dict[i-2]):
        temp_x.append(torch.rand(i, 4, 1))
        temp_y.append(torch.tensor(random.randint(0, i-1)))
    X = torch.stack(temp_x)
    y = torch.stack(temp_y)
    train_datasets.append((X.clone(), y.clone()))
class WeightedBucketSampler(torch.utils.data.Sampler):
    def __init__(self, data, weights: Sequence[float], num_samples: int,
                 replacement: bool = True, generator=None, shuffle=True, drop_last=False):
        super().__init__(data)
        self.shuffle = shuffle
        self.drop_last = drop_last
        self.weights = torch.as_tensor(weights, dtype=torch.double)
        self.num_samples = num_samples
        self.replacement = replacement
        self.generator = generator
        self.buckets = defaultdict(list)
        '''data is a CustomDataset containing a tensor of COUNT x NUM_ROUTES x FEATURES x 1 and a tensor with the corresponding labels'''
        counter = 0
        for i in range(len(data)):
            self.buckets[i+2] += [data[i][0], data[i][1]]
            counter += len(data[i][0])
        self.length = counter

    def __iter__(self) -> Iterator[int]:
        # Choose a bucket depending on the weighted sample
        rand_bucket = torch.multinomial(self.weights, self.num_samples, self.replacement, generator=self.generator).tolist()[0]
        shifter = sum([len(self.buckets[i+2][0]) for i in range(rand_bucket)])
        # Generate random indices from the bucket
        rand_tensor = torch.randperm(len(self.buckets[rand_bucket+2][0]), generator=self.generator)
        yield from torch.add(rand_tensor, shifter).tolist()

    def __len__(self):
        return self.length
class CustomDataset(Dataset):
    def __init__(self, data):
        self.routes = dict()
        self.choice = dict()
        counter = 0
        for i in range(len(data)):
            for j in range(len(data[i][0])):
                self.routes[counter] = data[i][0][j]
                self.choice[counter] = data[i][1][j]
                counter += 1

    def __len__(self):
        return len(self.choice)

    def __getitem__(self, idx):
        choice = self.choice[idx]
        routes = self.routes[idx]
        return routes, choice

train_datasets_ds = CustomDataset(train_datasets)
bucket_sampler = WeightedBucketSampler(train_datasets, sample_probs, len(sample_probs), shuffle=True, drop_last=False)
loader = DataLoader(train_datasets_ds, sampler=bucket_sampler, batch_size=32, pin_memory=True)

for X, y in loader:
    print(X.size(), y.size())
This code combines WeightedRandomSampler with bucket-sampling code.
I'm essentially using the sample weights of each classification to choose a bucket, and then sampling randomly within that bucket to form batches of up to batch_size.
However, when going through loader, I get the output:
...
torch.Size([32, 10, 4, 1]) torch.Size([32])
torch.Size([32, 10, 4, 1]) torch.Size([32])
torch.Size([32, 10, 4, 1]) torch.Size([32])
torch.Size([18, 10, 4, 1]) torch.Size([18])
The sum of all these batches adds up to the number of elements in bucket 10. So it's correct, but it never moves on to another bucket. Rerunning the code
for X,y in loader:
print(X.size(),y.size())
will produce another bucket's batches.
I'm still learning PyTorch, so some of the code might be inefficient. Would love some advice as well!
Thanks to some help on the unofficial PyTorch Discord channel (sudomaze), I've fixed my problem. The sampler needs to iterate through all the buckets, not just one, and the __len__ function of the sampler also needed fixing.
from typing import List
from torch.utils.data import Sampler

class WeightedBucketSampler(Sampler[List[int]]):
    def __init__(self, data, weights: Sequence[float], num_samples: int,
                 replacement: bool = True, generator=None, shuffle=True, batch_size=32, drop_last=False):
        super().__init__(data)
        self.shuffle = shuffle
        self.drop_last = drop_last
        self.weights = torch.as_tensor(weights, dtype=torch.double)
        self.num_samples = num_samples
        self.replacement = replacement
        self.generator = generator
        self.batch_size = batch_size
        self.buckets = defaultdict(list)
        '''data is a CustomDataset containing a tensor of COUNT x NUM_ROUTES x FEATURES x 1 and a tensor with the corresponding labels'''
        counter = 0
        for i in range(len(data)):
            self.buckets[i+2] += [data[i][0], data[i][1]]
            counter += len(data[i][0])
        self.length = counter

    def __iter__(self) -> Iterator[int]:
        # Choose a bucket depending on the weighted sample
        rand_bucket = torch.multinomial(self.weights, self.num_samples, self.replacement, generator=self.generator)
        batch = [0] * self.batch_size
        idx_in_batch = 0
        for bucket_idx in rand_bucket.tolist():
            bucketsample_count = 0
            shifter = sum([len(self.buckets[i+2][0]) for i in range(bucket_idx)])
            # Generate random indices from the bucket and shift them
            rand_tensor = torch.randperm(len(self.buckets[bucket_idx+2][0]), generator=self.generator)
            for idx in rand_tensor.tolist():
                batch[idx_in_batch] = idx + shifter
                idx_in_batch += 1
                if idx_in_batch == self.batch_size:
                    bucketsample_count += self.batch_size
                    yield batch
                    idx_in_batch = 0
                    batch = [0] * self.batch_size
            if idx_in_batch > 0:
                bucketsample_count += idx_in_batch
                yield batch[:idx_in_batch]
                # The last remaining tensors are added into one batch. Terminate batch and move to next bucket
                idx_in_batch = 0
                batch = [0] * self.batch_size
                continue

    def __len__(self):
        return (self.length + (self.batch_size - 1)) // self.batch_size


class CustomDataset(Dataset):
    def __init__(self, data):
        self.routes = dict()
        self.choice = dict()
        counter = 0
        for i in range(len(data)):
            for j in range(len(data[i][0])):
                self.routes[counter] = data[i][0][j]
                self.choice[counter] = data[i][1][j]
                counter += 1

    def __len__(self):
        return len(self.choice)

    def __getitem__(self, idx):
        choice = self.choice[idx]
        routes = self.routes[idx]
        return routes, choice


w = np.array([len(i[0]) for i in train_datasets])
sample_probs = 1 / sample_probs * w
train_datasets_ds = CustomDataset(train_datasets)
bucket_sampler = WeightedBucketSampler(train_datasets, sample_probs, len(sample_probs), shuffle=True, batch_size=batch_size, drop_last=False)
train_loader = DataLoader(train_datasets_ds, batch_sampler=bucket_sampler)
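To check that the fixed sampler now cycles through more than one bucket, one can count how many batches of each sequence length come out of the loader in a single pass. A small sketch reusing train_loader from above; the exact counts will vary with the random weights:

from collections import Counter

shape_counts = Counter()
for X, y in train_loader:
    # X has shape (batch, CHOICES, 4, 1); group batches by their CHOICES dimension
    shape_counts[X.size(1)] += 1

# With the fix, several different CHOICES values should appear here,
# rather than the single bucket seen before.
print(shape_counts)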

How to input Concatenation

I want to combine two embedding models for my training data, but I do not know why it always fails. The idea is to get one embedding, adjust the length, and then combine the two embeddings in another function.
# Please comment your code
import numpy as np

def Concatenation(posts, model):
    emb_table = []
    for i in posts:
        for word in i:
            if word in model:
                emb_table.append(model.wv[word])
            else:
                emb_table.append([0] * model.vector_size)
    return emb_table

len_list = [len(s) for s in traning_post_new]
seq_length = max(len_list)
seq_length

# # Padding and encoding
# def encode_and_add_padding(sentences, seq_length, word_index):
#     sent_encoded = []
#     for sent in sentences:
#         temp_encoded = [word_index[word] if word in word_index else word_index['[UNKNOWN]'] for word in sent]
#         if len(temp_encoded) < seq_length:
#             temp_encoded += [word_index['[PAD]']] * (seq_length - len(temp_encoded))
#         sent_encoded.append(temp_encoded)
#     return sent_encoded
# sent_encoded = encode_and_add_padding(sentences, seq_length, word_index)
# print(sent_encoded)

def encode_and_add_padding(posts, model1, model2):
    for i in range(len(posts)):  # i is index
        if len(posts[i]) < seq_length:
            posts[i].extend(["[PAD]"] * (seq_length - len(posts[i])))
        else:
            posts[i] = posts[i][:seq_length]
    embedding1 = Concatenation(posts, model1)
    embedding2 = Concatenation(posts, model2)
    print(embedding1.shape)
    final_embedding = np.concatenate((embedding1, embedding2), 0)
    return final_embedding
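As written, Concatenation returns a plain Python list, so embedding1.shape raises an AttributeError, and np.concatenate(..., 0) stacks the two tables end-to-end instead of joining each word's two vectors. A minimal sketch of one way this is often done: convert each table to a NumPy array of shape (num_tokens, vector_size) and concatenate along the feature axis. The helper names are mine; posts, model1, and model2 follow the question, and a gensim-style model.wv lookup is assumed.

import numpy as np

def build_embedding_table(posts, model):
    """Return a (num_tokens, vector_size) array, using a zero vector for unknown words."""
    vectors = []
    for post in posts:
        for word in post:
            if word in model.wv:
                vectors.append(model.wv[word])
            else:
                vectors.append(np.zeros(model.vector_size))
    return np.asarray(vectors, dtype=np.float32)

def combine_embeddings(posts, model1, model2):
    embedding1 = build_embedding_table(posts, model1)   # (num_tokens, dim1)
    embedding2 = build_embedding_table(posts, model2)   # (num_tokens, dim2)
    # Join the two vectors per token along the feature axis.
    return np.concatenate((embedding1, embedding2), axis=1)  # (num_tokens, dim1 + dim2)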

Pytorch sequential data loader

I have looked through the documentation, for example the IterableDataset class and its start/end example, but I'm not able to solve this one at the moment.
Training my model with random batches is fine, but when using it for predictions I need it to go from min(index) up to max(index). So I wanted to re-use the code below and change it to fit that.
Right now it takes random items from the range, so I can get duplicate predictions for the same index number. For example, range(5) over indices 1,2,3,4,5 might give 4,2,2,3,4 instead of the desired 1,2,3,4,5.
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)
With DataLoader shuffle=False, it just takes the len / max of the index.
I probably need to change the sampler.
class CompanyDataset(Dataset):
    def __init__(self, csv_name, root_dir, training_length, forecast_window):
        """
        Args:
            csv_file (string): Path to the csv file.
            root_dir (string): Directory
        """
        # load raw data file
        csv_file = os.path.join(root_dir, csv_name)
        self.df = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = MinMaxScaler()
        self.T = training_length
        self.S = forecast_window

    def __len__(self):
        # return number of sensors
        return len(self.df.groupby(by=["index"]))

    # Will pull an index between 0 and __len__.
    def __getitem__(self, idx):
        # Sensors are indexed from 1
        idx = idx + 1
        # np.random.seed(0)
        start = np.random.randint(0, len(self.df[self.df["index"] == idx]) - self.T - self.S)
        Company = str(self.df[self.df["index"] == idx][["station"]][start:start + 1].values.item())
        index_in = torch.tensor([i for i in range(start, start + self.T)])
        index_tar = torch.tensor([i for i in range(start + self.T, start + self.T + self.S)])
        _input = torch.tensor(self.df[self.df["index"] == idx][
            ["A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11"]][
            start: start + self.T].values)
        target = torch.tensor(self.df[self.df["index"] == idx][
            ["A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11"]][
            start + self.T: start + self.T + self.S].values)
        scaler = self.transform
        scaler.fit(_input[:, 0].unsqueeze(-1))
        _input[:, 0] = torch.tensor(scaler.transform(_input[:, 0].unsqueeze(-1)).squeeze(-1))
        target[:, 0] = torch.tensor(scaler.transform(target[:, 0].unsqueeze(-1)).squeeze(-1))
        dump(scaler, 'scalar_item.joblib')
        return index_in, index_tar, _input, target, Company
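For ordered predictions there are two sources of randomness to remove: the order in which the DataLoader visits indices, and the random start drawn inside __getitem__. A minimal sketch, assuming the test_dataset from above; the fixed-start suggestion in the comment is an assumption, not part of the original code:

from torch.utils.data import DataLoader, SequentialSampler

# Visit indices 0, 1, 2, ... in order, so every sensor is predicted exactly once.
test_dataloader = DataLoader(test_dataset,
                             batch_size=1,
                             sampler=SequentialSampler(test_dataset))

# The remaining randomness sits inside __getitem__, where `start` is drawn with
# np.random.randint. For repeatable predictions the window start also has to be
# fixed, e.g. always take the last possible window for each sensor:
#     start = len(self.df[self.df["index"] == idx]) - self.T - self.S
# instead of the random draw.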

Building a dataset with dataloader pytorch getting error cannot import name 'read_data_sets'

I am loading data into a dataset using the PyTorch DataLoader and getting the error: cannot import name 'read_data_sets'.
I tried searching for results from similar issues.
If the problem is that Python is picking up a file instead of the intended module, so it can't find read_data_sets, how do I change things to fix it?
class MRDataset(data.Dataset):
    def __init__(self, root_dir, task, plane, train=True, transform=None, weights=None):
        super().__init__()
        self.task = task
        self.plane = plane
        self.root_dir = root_dir
        self.train = train
        if self.train:
            self.folder_path = self.root_dir + 'train/{0}/'.format(plane)
            self.records = pd.read_csv(
                self.root_dir + 'train-{0}.csv'.format(task), header=None, names=['id', 'label'])
        else:
            transform = None
            self.folder_path = self.root_dir + 'valid/{0}/'.format(plane)
            self.records = pd.read_csv(
                self.root_dir + 'valid-{0}.csv'.format(task), header=None, names=['id', 'label'])
        self.records['id'] = self.records['id'].map(
            lambda i: '0' * (4 - len(str(i))) + str(i))
        self.paths = [self.folder_path + filename +
                      '.npy' for filename in self.records['id'].tolist()]
        self.labels = self.records['label'].tolist()
        self.transform = transform
        if weights is None:
            pos = np.sum(self.labels)
            neg = len(self.labels) - pos
            self.weights = torch.FloatTensor([1, neg / pos])
        else:
            self.weights = torch.FloatTensor(weights)

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index):
        array = np.load(self.paths[index])
        label = self.labels[index]
        if label == 1:
            label = torch.FloatTensor([[0, 1]])
        elif label == 0:
            label = torch.FloatTensor([[1, 0]])
        if self.transform:
            array = self.transform(array)
        else:
            array = np.stack((array,) * 3, axis=1)
            array = torch.FloatTensor(array)
        # if label.item() == 1:
        #     weight = np.array([self.weights[1]])
        #     weight = torch.FloatTensor(weight)
        # else:
        #     weight = np.array([self.weights[0]])
        #     weight = torch.FloatTensor(weight)
        return array, label, self.weights
There is a model and a train class to run this, with the arguments specified in train.
Running the training should load the data and run it through the model.

Keras: merge two models with different inputs and use fit_generator to train merged model

I want to merge two models with different inputs, and use fit_generator to train the merged model. The generators are written by myself.
This is one of the generators.
def image_generator(self, batch_size, train_test, data_type, concat=False):
    train, test = self.split_train_test()
    data = train if train_test == 'train' else test
    print("Creating %s generator with %d samples." % (train_test, len(data)))
    print("image_generator")
    while 1:
        X, y = [], []
        # Generate batch_size samples.
        for _ in range(batch_size):
            # Reset to be safe.
            sequence = None
            # Get a random sample.
            sample = random.choice(data)
            # Check to see if we've already saved this sequence.
            if data_type == "images":
                # Get and resample frames.
                frames = self.get_frames_for_sample(sample)
                frames = self.rescale_list(frames, self.seq_length)
                # Build the image sequence
                sequence = self.build_image_sequence(frames)
            else:
                # Get the sequence from disk.
                sequence = self.get_image_sequence(data_type, sample, train_test)
            if sequence is None:
                print("Can't find sequence. Did you generate them?")
                sys.exit()  # TODO this should raise
            if concat:
                # We want to pass the sequence back as a single array. This
                # is used to pass into an MLP rather than an RNN.
                sequence = np.concatenate(sequence).ravel()
            X.append(sequence)
            y.append(self.get_class_one_hot(sample[1]))
        yield np.array(X), np.array(y)
This is get_image_sequences:
def get_image_sequence(self, data_type, sample, train_test):
    """get the images shaped with array."""
    # train,ApplyEyeMakeup,v_ApplyEyeMakeup_g10_c02,99
    num = random.randint(1, int(sample[3]))
    path = glob.glob('./data/' + train_test + '/' + sample[1] + '/' + sample[2] + '-' + '*' + num + '.jpg')
    if os.path.isfile(path):
        img = Image.open(path)
        if img.size != target_size:
            img = img.resize(target_size)
        img = img_to_array(img)
        img = np.expand_dims(img, axis=0)
        img /= 255
        return img
    else:
        print("path is error" + path)
        return None
Now, merge and fit it:
modeltmp = merge([model1.output, model2.output], mode='concat', concat_axis=1)
modeltmp = BatchNormalization()(modeltmp)
modeltmp = Dense(1024, activation='relu')(modeltmp)
modeltmp = Dense(len(classes), activation='softmax')(modeltmp)
model = Model(input=[model1.input, model2.input], outputs=modeltmp)

# model1 --- generator
train_gen_1 = data.image_generator(batch_size, 'train', cnn_lstm_datatype, concat)
test_gen_1 = data.image_generator(batch_size, 'test', cnn_lstm_datatype, concat)

# model2 --- generator
train_gen_2 = data.frame_generator(batch_size=batch_size, train_test='train', data_type=cnn_lstm_datatype, concat=concat)
test_gen_2 = data.frame_generator(batch_size=batch_size, train_test='test', data_type=cnn_lstm_datatype, concat=concat)

model.fit_generator([train_gen_1, train_gen_2],
                    verbose=1,
                    steps_per_epoch=batch_size,
                    validation_steps=10,
                    epochs=10000,
                    callbacks=[checkpointer, tb, early_stopper, csv_logger],
                    validation_data=[test_gen_1, test_gen_2])
However, I get the error:
TypeError: Error when checking model input: data should be a Numpy array, or list/dict of Numpy arrays. Found: generator object image_generator at 0x12205df00 ...
How can I solve it? Thanks!
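fit_generator expects a single generator that yields ([input_1, input_2], labels) for a multi-input model, not a Python list of generators, which is what triggers the TypeError above. A minimal sketch of one common workaround; the wrapper combine_generators is an assumption, and it presumes the two generators produce corresponding samples and identical labels per batch:

def combine_generators(gen_1, gen_2):
    """Yield ([X1, X2], y) batches for the two-input model, one batch from each generator."""
    while True:
        X1, y1 = next(gen_1)
        X2, _ = next(gen_2)  # assumes gen_2 yields matching samples/labels in the same order
        yield [X1, X2], y1

train_gen = combine_generators(train_gen_1, train_gen_2)
test_gen = combine_generators(test_gen_1, test_gen_2)

model.fit_generator(train_gen,
                    verbose=1,
                    steps_per_epoch=batch_size,
                    validation_steps=10,
                    epochs=10000,
                    callbacks=[checkpointer, tb, early_stopper, csv_logger],
                    validation_data=test_gen)

Because both generators pick samples with random.choice, they would also need to be driven from a shared sample list (or seeded identically) for the pairing of the two inputs to line up.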
