How to use output of sklearn pipeline elements

I have three features:
feature_one -> number of tokens in the given sentence.
feature_two -> number of verbs in the given sentence.
feature_three -> number of tokens minus number of verbs in the given sentence (feature_one - feature_two).
I have written custom transformers for feature_one and feature_two, and I want to write a custom transformer for feature_three so that it can use the results of feature_one and feature_two when running the pipeline:
Pipeline([
    # input to feature_one and feature_two is a list of sentences
    ("feature", FeatureUnion([
        ("feature_one", feature_one_transformer()),
        ("feature_two", feature_two_transformer())
    ])),
    ("feature_three", feature_three_transformer())
])
feature_one_transformer:
class feature_one_transformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, x, y=None):
        return self

    def transform(self, sentence_list):
        number_of_tokens_in_sentence_list = list()
        for sentence in sentence_list:
            number_of_tokens = compute_number_of_tokens(sentence)
            number_of_tokens_in_sentence_list.append(number_of_tokens)
        return pandas.DataFrame(number_of_tokens_in_sentence_list)
feature_two_transformer:
class feature_two_transformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, x, y=None):
        return self

    def transform(self, sentence_list):
        number_of_verbs_in_sentence_list = list()
        for sentence in sentence_list:
            number_of_verbs = compute_number_of_verbs_in_sentence(sentence)
            number_of_verbs_in_sentence_list.append(number_of_verbs)
        return pandas.DataFrame(number_of_verbs_in_sentence_list)
Can somebody tell me how I should write the custom transformer for feature_three, and how to use it in the pipeline so that it can consume the results of the feature_one and feature_two transformers? Thank you.

It's not clear to me why you would want to make this so complicated. I would just use one transformer that does everything you want. Something like this:
class features_transformer(BaseEstimator, TransformerMixin):
    def __init__(self, variable):
        self.variable = variable

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # compute both base features, then derive the third from them
        X['number_of_tokens'] = X[self.variable].apply(lambda cell: compute_number_of_tokens(cell))
        X['number_of_verbs'] = X[self.variable].apply(lambda cell: compute_number_of_verbs(cell))
        X['tokens_minus_verbs'] = X['number_of_tokens'] - X['number_of_verbs']
        return X

new_X = features_transformer('sentences').fit_transform(X)
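If you still want this wrapped in a Pipeline, a minimal sketch (assuming X is a pandas DataFrame with a 'sentences' column):
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("features", features_transformer("sentences")),  # adds all three columns at once
])
new_X = pipe.fit_transform(X)  # equivalent to calling the transformer directly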


How to feed different pad IDs to a collate function?

I usually use a custom collate_fn and pass it as an argument when defining my DataLoader. It usually looks something like:
def collate_fn(batch):
    max_len = max([len(b['input_ids']) for b in batch])
    input_ids = [b['input_ids'] + ([0] * (max_len - len(b['input_ids']))) for b in batch]
    labels = [b['label'] for b in batch]
    return input_ids, labels
As you can see, I'm using 0 as my padding value. What I'm wondering is: since language models and their tokenizers use different IDs for their padding tokens, is there a way to make the collate_fn flexible enough to take that into account?
I was able to make a workaround by creating a Trainer class and making the collate_fn a method. After that I was able to do something like self.pad_token_id = tokenizer.pad_token_id and modify the original collate_fn to use self.pad_token_id rather than a hardcoded value.
I'm still curious whether there's any way to do this while keeping collate_fn a top-level function, though. For example, is there any way to pass an argument to it?
<Original>
def collate_fn(batch):
    max_len = max([len(b['input_ids']) for b in batch])
    input_ids = [b['input_ids'] + ([0] * (max_len - len(b['input_ids']))) for b in batch]
    return input_ids

class Trainer():
    def __init__(self, tokenizer, ...):
        ...

    def train(self):
        train_dataloader = DataLoader(features, collate_fn=collate_fn, ...)
        ...
<Workaround>
class Trainer():
    def __init__(self, tokenizer, ...):
        self.pad_token_id = tokenizer.pad_token_id
        ...

    def collate_fn(self, batch):
        max_len = max([len(b['input_ids']) for b in batch])
        input_ids = [b['input_ids'] + ([self.pad_token_id] * (max_len - len(b['input_ids']))) for b in batch]
        return input_ids

    def train(self):
        train_dataloader = DataLoader(features, collate_fn=self.collate_fn, ...)
        ...
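One way to keep collate_fn a top-level function (my own suggestion, not from the original post) is to bind the pad ID with functools.partial, which is a standard way to pass extra arguments to a DataLoader's collate_fn:
from functools import partial

def collate_fn(batch, pad_token_id):
    max_len = max([len(b['input_ids']) for b in batch])
    # pad each sequence with the tokenizer-specific pad ID instead of a hardcoded 0
    input_ids = [b['input_ids'] + ([pad_token_id] * (max_len - len(b['input_ids']))) for b in batch]
    return input_ids

# bind the pad ID at DataLoader construction time
train_dataloader = DataLoader(features, collate_fn=partial(collate_fn, pad_token_id=tokenizer.pad_token_id))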

Does Keras official sample code about Transformer applied in time-series contain Position Embedding part?

The sample code I am referring to: https://keras.io/examples/timeseries/timeseries_transformer_classification/
I could not find any mention of "position embedding" anywhere on that page. When I looked at Transformers applied to NLP, I could clearly see a class named "TokenAndPositionEmbedding".
If the example does not contain a position embedding, how can I apply one to time series in the sample code?
From what I can tell, it does not contain the positional embedding. Something like this should work:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding

class PositionEmbeddingFixedWeights(Layer):
    def __init__(self, sequence_length, vocab_size, output_dim, **kwargs):
        super(PositionEmbeddingFixedWeights, self).__init__(**kwargs)
        word_embedding_matrix = self.get_position_encoding(vocab_size, output_dim)
        position_embedding_matrix = self.get_position_encoding(sequence_length, output_dim)
        self.word_embedding_layer = Embedding(
            input_dim=vocab_size, output_dim=output_dim,
            weights=[word_embedding_matrix],
            trainable=False
        )
        self.position_embedding_layer = Embedding(
            input_dim=sequence_length, output_dim=output_dim,
            weights=[position_embedding_matrix],
            trainable=False
        )

    def get_position_encoding(self, seq_len, d, n=10000):
        # classic sinusoidal table: sin on even indices, cos on odd indices
        P = np.zeros((seq_len, d))
        for k in range(seq_len):
            for i in np.arange(int(d / 2)):
                denominator = np.power(n, 2 * i / d)
                P[k, 2 * i] = np.sin(k / denominator)
                P[k, 2 * i + 1] = np.cos(k / denominator)
        return P

    def call(self, inputs):
        position_indices = tf.range(tf.shape(inputs)[-1])
        embedded_words = self.word_embedding_layer(inputs)
        embedded_indices = self.position_embedding_layer(position_indices)
        return embedded_words + embedded_indices
This class originated from https://machinelearningmastery.com/the-transformer-positional-encoding-layer-in-keras-part-2/
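For continuous time-series inputs there is no vocabulary to embed, so a minimal sketch (my own adaptation, not from either source, assuming the inputs already have a static shape (batch, seq_len, d) with d even) would add only the sinusoidal table:
def add_position_encoding(x, n=10000):
    # x: tensor of shape (batch, seq_len, d); build the table and add it
    seq_len, d = int(x.shape[1]), int(x.shape[2])
    P = np.zeros((seq_len, d))
    for k in range(seq_len):
        for i in range(d // 2):
            denominator = np.power(n, 2 * i / d)
            P[k, 2 * i] = np.sin(k / denominator)
            P[k, 2 * i + 1] = np.cos(k / denominator)
    # broadcast the (1, seq_len, d) table over the batch dimension
    return x + tf.constant(P[np.newaxis, ...], dtype=x.dtype)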

Class raises "name 'smth' is not defined" error, though I've already defined it by 'self.smth = ~~~imputer()'?

I am currently confronting a "name 'imputer' is not defined" error.
The thing is, in the __init__(self) part, I have already defined 'imputer' by declaring self.imputer = IterativeImputer(max_iter=10).
Can anyone explain why this happens?
The whole code is as follows:
away_defencePressure_idx = 15

class IterImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.imputer = IterativeImputer(max_iter=10)

    def fit(self, X, y=None):
        imputer.fit(X)
        return self

    def transform(self, X, y=None):
        imputed = imputer.transform(X)
        X.T[away_defencePressure_idx] = imputed.T[away_defencePressure_idx]
        return X

p = Pipeline([
    ('imputerA', IterImputer())
])
p.fit(X)
You defined self.imputer but not imputer. Just add self. in front of it:
def fit(self, X, y=None):
    self.imputer.fit(X)
    return self

def transform(self, X, y=None):
    imputed = self.imputer.transform(X)
    X.T[away_defencePressure_idx] = imputed.T[away_defencePressure_idx]
    return X
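As a side note (not part of the original answer): IterativeImputer is still experimental in scikit-learn, so the code above also needs an explicit opt-in import before it will run at all:
from sklearn.experimental import enable_iterative_imputer  # noqa: F401, must come before the next line
from sklearn.impute import IterativeImputer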

ValueError: optimizer got an empty parameter list

I create the following simple linear class:
class Decoder(nn.Module):
    def __init__(self, K, h=()):
        super().__init__()
        h = (K,) + h + (K,)
        self.layers = [nn.Linear(h1, h2) for h1, h2 in zip(h, h[1:])]

    def forward(self, x):
        for layer in self.layers[:-1]:
            x = F.relu(layer(x))
        return self.layers[-1](x)
However, when I try to pass the parameters to an optimizer, I get the error ValueError: optimizer got an empty parameter list.
decoder = Decoder(4)
LR = 1e-3
opt = optim.Adam(decoder.parameters(), lr=LR)
Is there something I'm doing obviously wrong with the class definition?
Since you store your layers in a regular Python list inside your Decoder, PyTorch has no way of telling that the members of this list are actually submodules. Convert the list into PyTorch's nn.ModuleList and your problem will be solved:
class Decoder(nn.Module):
    def __init__(self, K, h=()):
        super().__init__()
        h = (K,) + h + (K,)
        self.layers = nn.ModuleList(nn.Linear(h1, h2) for h1, h2 in zip(h, h[1:]))
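A quick check (a sketch assuming the imports from the question, i.e. torch.nn as nn and torch.optim as optim) that the parameters are now registered:
decoder = Decoder(4)
print(len(list(decoder.parameters())))  # 2: weight and bias of the single Linear(4, 4)
opt = optim.Adam(decoder.parameters(), lr=1e-3)  # no longer raises ValueError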

Transformer operating on multiple features in pyspark.ml

I want to make my own transformer of features in a DataFrame, so that I add a column which is, for example, the difference between two other columns. I followed this question, but the transformer there operates on one column only. pyspark.ml.Transformer takes a string as the argument for inputCol, so of course I cannot specify multiple columns.
So basically, what I want to achieve is a _transform() method that resembles this one:
def _transform(self, dataset):
    out_col = self.getOutputCol()
    in_col = dataset.select([self.getInputCol()])

    # Define transformer logic
    def f(col1, col2):
        return col1 - col2

    t = IntegerType()
    return dataset.withColumn(out_col, udf(f, t)(in_col))
How is this possible to do?
I managed to solve the problem by first creating a Vector out of the set of features I want to operate on, and then applying the transform on the newly generated vector feature. Below is example code for making a new feature which is the difference of two other features:
class MeasurementDifferenceTransformer(Transformer, HasInputCol, HasOutputCol):

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(MeasurementDifferenceTransformer, self).__init__()
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]

        # Define transformer logic: subtract the second vector element from the first
        def f(vector):
            return float(vector[0] - vector[1])

        t = FloatType()
        return dataset.withColumn(out_col, udf(lambda x: f(x), t)(in_col))
To use it, we first instantiate a VectorAssembler to create the vector feature:
pair_assembler = VectorAssembler(inputCols=["col1", "col2"], outputCol="cols_vector")
Then we instantiate the transformer:
pair_transformer = MeasurementDifferenceTransformer(inputCol="cols_vector", outputCol="col1_minus_col2")
Finally we transform the data:
pairfeats = pair_assembler.transform(df)
difffeats = pair_transformer.transform(pairfeats)
You don't need to go through all this trouble to operate on multiple columns. Here's a better approach using HasInputCols (instead of HasInputCol):
class MeasurementDifferenceTransformer(Transformer, HasInputCols, HasOutputCol):

    @keyword_only
    def __init__(self, inputCols=None, outputCol=None):
        super(MeasurementDifferenceTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCols=None, outputCol=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        out_col = self.getOutputCol()
        in_cols = self.getInputCols()

        # Define transformer logic: the udf receives one argument per input column
        def f(col1, col2):
            return float(col1 - col2)

        t = FloatType()
        return dataset.withColumn(out_col, udf(f, t)(*in_cols))
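Usage then skips the VectorAssembler entirely (a sketch assuming a DataFrame df with numeric columns col1 and col2):
diff_transformer = MeasurementDifferenceTransformer(inputCols=["col1", "col2"], outputCol="col1_minus_col2")
difffeats = diff_transformer.transform(df)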
