How to feed different pad IDs to a collate function? - pytorch

I usually write a custom collate_fn and pass it as an argument when defining my DataLoader. It typically looks something like this:
def collate_fn(batch):
    # Pad every sequence in the batch to the length of the longest one
    max_len = max([len(b['input_ids']) for b in batch])
    input_ids = [b['input_ids'] + [0] * (max_len - len(b['input_ids'])) for b in batch]
    labels = [b['label'] for b in batch]
    return input_ids, labels
As you can see, I'm using 0 as my padding value. What I'm wondering is: since different language models and their tokenizers use different IDs for the padding token, is there a way to make collate_fn flexible enough to take that into account?

I found a workaround by creating a Trainer class and turning collate_fn into a method. That let me set self.pad_token_id = tokenizer.pad_token_id and modify the original collate_fn to use self.pad_token_id rather than a hardcoded value.
I'm still curious whether there's a way to do this while keeping collate_fn a top-level function, though, for example by passing an extra argument to it somehow (see the sketch after the workaround code below).
<Original>
def collate_fn(batch):
    max_len = max([len(b['input_ids']) for b in batch])
    input_ids = [b['input_ids'] + [0] * (max_len - len(b['input_ids'])) for b in batch]
    return input_ids

class Trainer():
    def __init__(self, tokenizer, ...):
        ...

    def train(self):
        train_dataloader = DataLoader(features, collate_fn=collate_fn, ...)
        ...
<Workaround>
class Trainer():
    def __init__(self, tokenizer, ...):
        self.pad_token_id = tokenizer.pad_token_id
        ...

    def collate_fn(self, batch):
        max_len = max([len(b['input_ids']) for b in batch])
        input_ids = [b['input_ids'] + [self.pad_token_id] * (max_len - len(b['input_ids'])) for b in batch]
        return input_ids

    def train(self):
        train_dataloader = DataLoader(features, collate_fn=self.collate_fn, ...)
        ...
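One way to keep collate_fn a top-level function and still hand it the pad ID (a sketch, not part of the original post) is to bind the extra argument with functools.partial:

from functools import partial
from torch.utils.data import DataLoader

def collate_fn(batch, pad_token_id):
    # Same padding logic as before, but the pad ID now comes in as an argument
    max_len = max([len(b['input_ids']) for b in batch])
    input_ids = [b['input_ids'] + [pad_token_id] * (max_len - len(b['input_ids'])) for b in batch]
    return input_ids

# partial() fixes pad_token_id, leaving a one-argument callable for the DataLoader
train_dataloader = DataLoader(features, collate_fn=partial(collate_fn, pad_token_id=tokenizer.pad_token_id))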

Related

Does the official Keras sample code for a Transformer applied to time series contain a Position Embedding part?

The sample code referred to is at this URL: https://keras.io/examples/timeseries/timeseries_transformer_classification/
I could not find any mention of "Position Embedding" anywhere on that page. When I look at Transformers applied to NLP, I can clearly see a class named "TokenAndPositionEmbedding".
If the sample code does not contain "Position Embedding", how can I apply Position Embedding to the time-series sample code?
From what I can tell, it does not contain a positional embedding. Something like this should work:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding

class PositionEmbeddingFixedWeights(Layer):
    def __init__(self, sequence_length, vocab_size, output_dim, **kwargs):
        super(PositionEmbeddingFixedWeights, self).__init__(**kwargs)
        # Fixed (non-trainable) sinusoidal tables for both words and positions
        word_embedding_matrix = self.get_position_encoding(vocab_size, output_dim)
        position_embedding_matrix = self.get_position_encoding(sequence_length, output_dim)
        self.word_embedding_layer = Embedding(
            input_dim=vocab_size, output_dim=output_dim,
            weights=[word_embedding_matrix],
            trainable=False
        )
        self.position_embedding_layer = Embedding(
            input_dim=sequence_length, output_dim=output_dim,
            weights=[position_embedding_matrix],
            trainable=False
        )

    def get_position_encoding(self, seq_len, d, n=10000):
        # Standard sinusoidal encoding from "Attention Is All You Need"
        P = np.zeros((seq_len, d))
        for k in range(seq_len):
            for i in np.arange(int(d / 2)):
                denominator = np.power(n, 2 * i / d)
                P[k, 2 * i] = np.sin(k / denominator)
                P[k, 2 * i + 1] = np.cos(k / denominator)
        return P

    def call(self, inputs):
        position_indices = tf.range(tf.shape(inputs)[-1])
        embedded_words = self.word_embedding_layer(inputs)
        embedded_indices = self.position_embedding_layer(position_indices)
        return embedded_words + embedded_indices
This class originated from https://machinelearningmastery.com/the-transformer-positional-encoding-layer-in-keras-part-2/
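A minimal usage sketch follows (the sizes below are made-up assumptions, not from the answer). For a continuous time-series input, where there are no token IDs, you would typically keep only the position_embedding_layer branch and add its output to your projected input features:

sequence_length, vocab_size, output_dim = 128, 20000, 64
embed = PositionEmbeddingFixedWeights(sequence_length, vocab_size, output_dim)
# A batch of 2 sequences of integer token IDs (hypothetical data)
token_ids = tf.random.uniform((2, sequence_length), maxval=vocab_size, dtype=tf.int32)
out = embed(token_ids)  # shape: (2, sequence_length, output_dim)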

PyTorch dynamic number of layers?

I am trying to specify a dynamic number of layers, which I seem to be doing wrong.
My issue is that when I define the 100 layers here, I get an error in the forward step.
But when I define a layer directly (as with TO_ILLUSTRATE below), it works.
Below is a simplified example:
class PredictFromEmbeddParaSmall(LightningModule):
    def __init__(self, hyperparams={'lr': 0.0001}):
        super(PredictFromEmbeddParaSmall, self).__init__()
        # Input is something like tensor.size=[768*100]
        self.TO_ILLUSTRATE = nn.Linear(768, 5)
        self.enc_red = []
        for i in range(100):
            self.enc_red.append(nn.Linear(768, 5))
        # gather the layers' outputs
        self.dense_simple1 = nn.Linear(5*100, 2)
        self.output = nn.Sigmoid()

    def forward(self, x):
        # first input to enc_red
        x_vecs = []
        for i in range(self.para_count):
            layer = self.enc_red[i]
            # The first dim is the batch size here, output is correct
            processed_slice = x[:, i * 768:(i + 1) * 768]
            # This works and gives an output of size 5
            rand = self.TO_ILLUSTRATE(processed_slice)
            # This will fail? Error below
            ret = layer(processed_slice)
            # more things happen here that we can ignore, since we fail earlier
I get this error when executing "ret = layer.forward(processed_slice)"
RuntimeError: Expected object of device type cuda but got device type
cpu for argument #1 'self' in call to _th_addmm
Is there a smarter way to program this, or a way to resolve the error?
You should use a ModuleList from PyTorch instead of a plain list: https://pytorch.org/docs/master/generated/torch.nn.ModuleList.html. PyTorch has to keep track of all the modules in your model; if you just add them to a plain Python list, they are not registered as sub-modules, so their parameters are never moved to the GPU by .cuda()/.to(device), resulting in the error you faced.
Your code should look something like this:
class PredictFromEmbeddParaSmall(LightningModule):
    def __init__(self, hyperparams={'lr': 0.0001}):
        super(PredictFromEmbeddParaSmall, self).__init__()
        # Input is something like tensor.size=[768*100]
        self.TO_ILLUSTRATE = nn.Linear(768, 5)
        self.enc_red = nn.ModuleList()  # << MODIFIED LINE <<
        for i in range(100):
            self.enc_red.append(nn.Linear(768, 5))
        # gather the layers' outputs
        self.dense_simple1 = nn.Linear(5*100, 2)
        self.output = nn.Sigmoid()

    def forward(self, x):
        # first input to enc_red
        x_vecs = []
        for i in range(self.para_count):
            layer = self.enc_red[i]
            # The first dim is the batch size here, output is correct
            processed_slice = x[:, i * 768:(i + 1) * 768]
            # This works and gives an output of size 5
            rand = self.TO_ILLUSTRATE(processed_slice)
            # This now works as well
            ret = layer(processed_slice)
            # more things happen here that we can ignore
Then it should work all right!
Edit: alternative way.
Instead of using ModuleList you can also just use nn.Sequential; this lets you avoid the for loop in the forward pass. It also means you will not have access to intermediate activations, so this is not the solution if you need those.
class PredictFromEmbeddParaSmall(LightningModule):
    def __init__(self, hyperparams={'lr': 0.0001}):
        super(PredictFromEmbeddParaSmall, self).__init__()
        # Input is something like tensor.size=[768*100]
        self.TO_ILLUSTRATE = nn.Linear(768, 5)
        self.enc_ref = []
        for i in range(100):
            self.enc_ref.append(nn.Linear(768, 5))
        self.enc_red = nn.Sequential(*self.enc_ref)  # << MODIFIED LINE <<
        # gather the layers' outputs
        self.dense_simple1 = nn.Linear(5*100, 2)
        self.output = nn.Sigmoid()

    def forward(self, x):
        # first input to enc_red
        x_vecs = []
        out = self.enc_red(x)  # << MODIFIED LINE <<
A slightly more adjustable solution, which comes down to a matter of taste or to the complexity of your exact situation, was posted here.
For reference, I post an adjusted version of the code here:
import torch
from torch import nn, optim
from torch.nn.modules import Module

class Model(nn.Module):
    def __init__(self, input_size, layers_data: list, learning_rate=0.01, optimizer=optim.Adam):
        super().__init__()
        self.layers = nn.ModuleList()
        self.input_size = input_size  # Can be useful later ...
        for size, activation in layers_data:
            self.layers.append(nn.Linear(input_size, size))
            input_size = size  # For the next layer
            if activation is not None:
                assert isinstance(activation, Module), \
                    "Each tuple should contain a size (int) and a torch.nn.modules.Module."
                self.layers.append(activation)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.learning_rate = learning_rate
        self.optimizer = optimizer(params=self.parameters(), lr=learning_rate)

    def forward(self, input_data):
        for layer in self.layers:
            input_data = layer(input_data)
        return input_data


# test that the net is working properly
if __name__ == "__main__":
    data_size = 5
    layer1, layer2 = 10, 10
    output_size = 2
    data = torch.randn(data_size)
    mlp = Model(data_size, [(layer1, nn.ReLU()), (layer2, nn.ReLU()), (output_size, nn.Sigmoid())])
    output = mlp(data)
    print("done")

ValueError: optimizer got an empty parameter list

I create the following simple linear class:
class Decoder(nn.Module):
    def __init__(self, K, h=()):
        super().__init__()
        h = (K,) + h + (K,)
        self.layers = [nn.Linear(h1, h2) for h1, h2 in zip(h, h[1:])]

    def forward(self, x):
        for layer in self.layers[:-1]:
            x = F.relu(layer(x))
        return self.layers[-1](x)
However, when I try to pass the parameters to an optimizer, I get the error ValueError: optimizer got an empty parameter list.
decoder = Decoder(4)
LR = 1e-3
opt = optim.Adam(decoder.parameters(), lr=LR)
Is there something I'm doing obviously wrong with the class definition?
Since you store your layers in a regular Python list inside your Decoder, PyTorch has no way of telling that the members of this list are actually sub-modules. Convert the list into PyTorch's nn.ModuleList and your problem will be solved:
class Decoder(nn.Module):
    def __init__(self, K, h=()):
        super().__init__()
        h = (K,) + h + (K,)
        self.layers = nn.ModuleList(nn.Linear(h1, h2) for h1, h2 in zip(h, h[1:]))
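With that change, the snippet from the question should work as-is, because nn.ModuleList registers each nn.Linear as a sub-module and decoder.parameters() is no longer empty (a quick sanity check reusing the names from the question):

decoder = Decoder(4)
print(len(list(decoder.parameters())))  # non-zero now: the weights and biases of the registered layers
opt = optim.Adam(decoder.parameters(), lr=1e-3)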

How to use the output of sklearn pipeline elements

I have three features:
feature_one -> number of tokens in the given sentence.
feature_two -> number of verbs in the given sentence.
feature_three -> number of tokens minus number of verbs in the given sentence (feature_one - feature_two).
I have written custom transformers for feature_one and feature_two and want to write a custom transformer for feature_three such that I can use the results of feature_one and feature_two, running the pipeline as:
Pipeline([
    # input to feature_one and feature_two is a list of sentences.
    ("feature", FeatureUnion([
        ("feature_one", feature_one_transformer()),
        ("feature_two", feature_two_transformer())
    ])),
    ("feature_three", feature_three_transformer())
])
feature_one_transformer:
class feature_one_transformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, x, y):
        return self

    def transform(self, sentence_list):
        number_of_tokens_in_sentence_list = list()
        for sentence in sentence_list:
            number_of_tokens = compute_number_of_tokens(sentence)
            number_of_tokens_in_sentence_list.append(number_of_tokens)
        return pandas.DataFrame(number_of_tokens_in_sentence_list)
feature_two_transformer:
class feature_two_transformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, x, y):
        return self

    def transform(self, sentence_list):
        number_of_verbs_in_sentence_list = list()
        for sentence in sentence_list:
            number_of_verbs = compute_number_of_verbs_in_sentence(sentence)
            number_of_verbs_in_sentence_list.append(number_of_verbs)
        return pandas.DataFrame(number_of_verbs_in_sentence_list)
Can somebody tell me how I should write the custom transformer for feature_three and how to use it in the pipeline so that I can use the results of the feature_one and feature_two transformers? Thank you.
It's not clear to me why you would want to make this so complicated. I would just use one transformer that does everything you want. Something like this:
class features_transformer(BaseEstimator, TransformerMixin):
    def __init__(self, variable):
        self.variable = variable

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['number_of_tokens'] = X[self.variable].apply(lambda cell: compute_number_of_tokens(cell))
        X['number_of_verbs'] = X[self.variable].apply(lambda cell: compute_number_of_verbs(cell))
        X['tokens_minus_verbs'] = X['number_of_tokens'] - X['number_of_verbs']
        return X

new_X = features_transformer('sentences').fit_transform(X)
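Since the original goal was to run this inside a Pipeline, the combined transformer can simply be the first (or only) step. A sketch, assuming X is a pandas DataFrame with a 'sentences' column as above and that the helper functions exist:

from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("features", features_transformer('sentences')),
    # a downstream estimator (e.g. a classifier over the numeric columns) could be added here
])
new_X = pipe.fit_transform(X)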

Transformer operating on multiple features in pyspark.ml

I want to make my own transformer of features in a DataFrame, so that I add a column which is, for example, the difference between two other columns. I followed this question, but the transformer there operates on one column only. pyspark.ml.Transformer takes a string as the argument for inputCol, so of course I cannot specify multiple columns.
So basically, what I want to achieve is a _transform() method that resembles this one:
def _transform(self, dataset):
    out_col = self.getOutputCol()
    in_col = dataset.select([self.getInputCol()])

    # Define transformer logic
    def f(col1, col2):
        return col1 - col2
    t = IntegerType()

    return dataset.withColumn(out_col, udf(f, t)(in_col))
How can I do this?
I managed to solve the problem by first creating a vector out of the set of features I want to operate on, and then applying the transform to the newly generated vector feature. Below is example code for making a new feature which is the difference of two other features:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

class MeasurementDifferenceTransformer(Transformer, HasInputCol, HasOutputCol):

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(MeasurementDifferenceTransformer, self).__init__()
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]

        # Define transformer logic: difference of the first two vector elements
        def f(vector):
            return float(vector[0] - vector[1])
        t = FloatType()

        return dataset.withColumn(out_col, udf(lambda x: f(x), t)(in_col))
To use it, we first instantiate a VectorAssembler to create the vector feature:
pair_assembler = VectorAssembler(inputCols=["col1", "col2"], outputCol="cols_vector")
Then we instantiate the transformer:
pair_transformer = MeasurementDifferenceTransformer(inputCol="cols_vector", outputCol="col1_minus_col2")
Finally, we transform the data:
pairfeats = pair_assembler.transform(df)
difffeats = pair_transformer.transform(pairfeats)
You don't need to go through all this trouble to operate on multiple columns. Here's a better approach using HasInputCols (instead of HasInputCol):
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCols, HasOutputCol
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

class MeasurementDifferenceTransformer(Transformer, HasInputCols, HasOutputCol):

    @keyword_only
    def __init__(self, inputCols=None, outputCol=None):
        super(MeasurementDifferenceTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCols=None, outputCol=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        out_col = self.getOutputCol()
        in_cols = self.getInputCols()

        # Define transformer logic: difference of the two input columns
        def f(col1, col2):
            return float(col1 - col2)
        t = FloatType()

        return dataset.withColumn(out_col, udf(f, t)(*in_cols))
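Usage then skips the VectorAssembler step entirely. A sketch, assuming a DataFrame df with numeric columns col1 and col2 as in the first answer:

diff_transformer = MeasurementDifferenceTransformer(inputCols=["col1", "col2"], outputCol="col1_minus_col2")
difffeats = diff_transformer.transform(df)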
