I'm implementing an attention mechanism, and when returning the tensor I get the following error:
ValueError: Shape mismatch: The shape of labels (received (64, 53)) should equal the shape of logits except for the last dimension (received (64, 1, 500)).
Please find my code below. Here is the attention layer; please correct me if it is wrong:
class Attention(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()

    def call(self, enc_op, hidden_state):
        # enc_op: (batch, enc_len, enc_units), hidden_state: (batch, enc_units)
        query_with_time_axis = tf.expand_dims(hidden_state, 1)  # (batch, 1, enc_units)
        # dot-product scores: (batch, enc_len, 1)
        context_vector = tf.matmul(enc_op, tf.transpose(query_with_time_axis, perm=[0, 2, 1]))
        context_vector = tf.nn.softmax(context_vector, axis=1)
        context_vector = context_vector * enc_op                 # weight the encoder outputs
        context_vector = tf.reduce_sum(context_vector, axis=1)  # (batch, enc_units)
        return context_vector
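For reference, a quick shape check of this layer (a toy example I put together; it assumes enc_units = 64 and a source length of 55, matching the shapes printed further down):

attention = Attention()
enc_op = tf.random.normal((64, 55, 64))    # (batch, enc_len, enc_units)
hidden_state = tf.random.normal((64, 64))  # (batch, enc_units)
context = attention(enc_op, hidden_state)
print(context.shape)                       # (64, 64): one context vector per example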
Here is the decoder part; I'm calling the attention layer from here:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, input_length, dec_units):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dec_units = dec_units
        self.input_length = input_length
        self.attention = Attention()

    def build(self, input_shape):
        self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim,
                                   input_shape=input_shape, mask_zero=True,
                                   name="embedding_layer_decoder")
        self.lstm = LSTM(self.dec_units, return_sequences=True, return_state=True,
                         name="Decoder_LSTM")

    def call(self, target_sentances, enc_op, hidden_state, cell_state):
        target_embed = self.embedding(target_sentances)
        # one decoding step per target position
        for i in range(target_embed.shape[1]):
            context_vector = self.attention(enc_op, hidden_state)
            y = tf.concat([context_vector, target_embed[:, i, :]], axis=-1)
            y = tf.expand_dims(y, 1)  # (batch, 1, enc_units + embedding_dim)
            lstm_output, hidden_state, _ = self.lstm(y, initial_state=[hidden_state, cell_state])
        # note: only the last step's output leaves the loop
        return lstm_output
class Mymodel(Model):
    def __init__(self, encoder_inputs_length, decoder_inputs_length, output_vocab_size):
        super().__init__()
        self.encoder = Encoder(vocab_size=500, embedding_dim=50,
                               input_length=encoder_inputs_length, enc_units=64)
        self.decoder = Decoder(vocab_size=500, embedding_dim=50,
                               input_length=decoder_inputs_length, dec_units=64)
        self.dense = Dense(output_vocab_size, activation="softmax")

    def call(self, data):
        input, output = data[0], data[1]
        print(input.shape, output.shape)
        encoder_output, encoder_h, encoder_c = self.encoder(input)
        print("=" * 20, "ENCODER", "=" * 20)
        print("-" * 35)
        print(encoder_output)
        print("ENCODER ==> OUTPUT SHAPE", encoder_output.shape)
        print("ENCODER ==> HIDDEN STATE SHAPE", encoder_h.shape)
        print("ENCODER ==> CELL STATE SHAPE", encoder_c.shape)
        print("=" * 20, "Decoder", "=" * 20)
        decoder_output = self.decoder(output, encoder_output, encoder_h, encoder_c)
        output1 = self.dense(decoder_output)
        print("-" * 35)
        print("Final output shape", output1.shape)
        print("=" * 50)
        return output1
model = Mymodel(encoder_inputs_length=30,decoder_inputs_length=20,output_vocab_size=500)
ENCODER_SEQ_LEN = 30
DECODER_SEQ_LEN = 20
optimizer = tf.keras.optimizers.Adam()
model.compile(optimizer=optimizer,loss=tf.keras.losses.SparseCategoricalCrossentropy())
for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    model.fit([inp, targ], targ, steps_per_epoch=1)
The shapes of my input and target are (64, 55) and (64, 53) respectively, where 64 is the batch size.
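Looking at the error, the logits come out as (64, 1, 500) because Decoder.call returns only the last timestep's lstm_output, whose shape is (batch, 1, dec_units), while the labels cover all 53 target positions. A minimal sketch of one way to collect an output per decoding step (my variant of the Decoder.call above; it also threads cell_state through the loop):

def call(self, target_sentances, enc_op, hidden_state, cell_state):
    target_embed = self.embedding(target_sentances)
    outputs = []
    for i in range(target_embed.shape[1]):
        context_vector = self.attention(enc_op, hidden_state)
        y = tf.concat([context_vector, target_embed[:, i, :]], axis=-1)
        y = tf.expand_dims(y, 1)
        lstm_output, hidden_state, cell_state = self.lstm(
            y, initial_state=[hidden_state, cell_state])
        outputs.append(lstm_output)       # each is (batch, 1, dec_units)
    return tf.concat(outputs, axis=1)     # (batch, target_len, dec_units)

With that, the Dense layer produces (64, 53, 500), which matches the (64, 53) labels under SparseCategoricalCrossentropy.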
I am trying to get predictions from an RNN model. However, while generating the predictions I'm getting this error:
TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not int
import torch
import torch.nn as nn

class RNNClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, layer_dim, vocab_size, output_dim, batch_size):
        super(RNNClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.output_dim = output_dim
        self.batch_size = batch_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.flatten = nn.Flatten(start_dim=1)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, layer_dim)
        self.dropout = nn.Dropout(0.3)
        self.dense = nn.Linear(in_features=hidden_dim, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        hidden = torch.zeros(self.layer_dim, self.batch_size, self.hidden_dim).requires_grad_()
        print(hidden.shape)
        embeds = self.embedding(x)
        lstm_out, hidden = self.rnn(embeds, hidden.detach())
        print(lstm_out.shape)
        lstm_out = lstm_out[:, -1, :]  # getting the last timestep
        print(lstm_out.shape)
        out = self.dropout(lstm_out)
        out = self.dense(lstm_out)
        sig_out = self.sigmoid(out)
        return sig_out, x
Predictions code and error snippet
def get_predictions(model, loader):
    all_preds = torch.tensor([])
    for batch_x, batch_y in enumerate(loader):
        # batch_x, batch_y = sample[batch_x], sample[batch_y]
        preds = model(batch_x)
        preds = torch.round(preds)
        all_preds = torch.cat((all_preds, preds), dim=0)
    return all_preds
with torch.no_grad():
    network.eval()
    prediction_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)
    prediction_loader = next(iter(prediction_loader))
    train_preds = get_predictions(network, prediction_loader)
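For what it's worth, enumerate(loader) yields (index, batch) pairs, so batch_x in the loop above is an int, which matches the error about 'indices' needing to be a Tensor. A minimal sketch of the loop without enumerate (assuming each batch is an (inputs, labels) pair and that the model returns (sig_out, x) as in the forward above):

def get_predictions(model, loader):
    all_preds = torch.tensor([])
    for batch_x, batch_y in loader:   # unpack the batch directly
        preds, _ = model(batch_x)     # the forward above returns (sig_out, x)
        preds = torch.round(preds)
        all_preds = torch.cat((all_preds, preds), dim=0)
    return all_preds

Note also that next(iter(prediction_loader)) replaces the loader with a single batch, so get_predictions would then iterate over tensors rather than batches; passing the DataLoader itself is presumably what was intended.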
I am using PyTorch Lightning version 1.4.0 and have defined the following class for the dataset:
import numpy as np
import torch
from torch.utils.data import Dataset

class CustomTrainDataset(Dataset):
    '''
    Custom PyTorch Dataset for training.

    Args:
        data (pd.DataFrame): DF containing product info (and maybe also ratings)
        all_orderIds (list): Python3 list containing all item IDs
    '''
    def __init__(self, data, all_orderIds):
        self.users, self.items, self.labels = self.get_dataset(data, all_orderIds)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, data, all_orderIds):
        users, items, labels = [], [], []
        # positive pairs from the supplied dataframe
        user_item_set = set(zip(data['CustomerID'], data['ItemCode']))
        num_negatives = 7
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            # sample negatives that the user has not interacted with
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_orderIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_orderIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)
        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)
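A minimal usage sketch for this dataset (my own example; it assumes train_ratings has CustomerID and ItemCode columns as above):

from torch.utils.data import DataLoader

dataset = CustomTrainDataset(train_ratings, all_orderIds)
loader = DataLoader(dataset, batch_size=32, num_workers=2)
users, items, labels = next(iter(loader))  # three tensors, each of shape (32,)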
followed by the PL class:
import pytorch_lightning as pl
import torch.nn as nn
from torch.utils.data import DataLoader

class NCF(pl.LightningModule):
    '''
    Neural Collaborative Filtering (NCF).

    Args:
        num_users (int): Number of unique users
        num_items (int): Number of unique items
        data (pd.DataFrame): Dataframe containing the food ratings for training
        all_itemIds (list): List containing all item IDs (train + test)
    '''
    def __init__(self, num_users, num_items, data, all_itemIds):
        # def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        # self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=10)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        # self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=10)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        # self.fc1 = nn.Linear(in_features=20, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.data = data
        # self.ratings = ratings
        # self.all_movieIds = all_movieIds
        self.all_orderIds = all_itemIds

    def forward(self, user_input, item_input):
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)
        # Concatenate the two embeddings
        vector = torch.cat([user_embedded, item_embedded], dim=-1)
        # Pass through dense layers
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))
        vector = nn.ReLU()(self.fc3(vector))
        # Output layer
        pred = nn.Sigmoid()(self.output(vector))
        return pred

    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        return DataLoader(
            CustomTrainDataset(self.data, self.all_orderIds),
            batch_size=32, num_workers=2
            # Google Colab's suggested max number of workers in the current
            # system is 2, not 4.
        )
print(f"num_users = {num_users}, num_items = {num_items} & all_itemIds = {len(all_itemIds)}")
# num_users = 12958, num_items = 511238 & all_itemIds = 9114
# Initialize NCF model-
model = NCF(num_users, num_items, train_ratings, all_itemIds)
trainer = pl.Trainer(
    max_epochs=75, gpus=1,
    # max_epochs = 5,
    reload_dataloaders_every_n_epochs=True,
    # reload_dataloaders_every_epoch = True,  # deprecated!
    progress_bar_refresh_rate=50,
    logger=False, checkpoint_callback=False)
trainer.fit(model)
# Save trained model as a checkpoint-
trainer.save_checkpoint("NCF_Trained.ckpt")
To load the saved checkpoint, I have tried:
trained_model = NCF.load_from_checkpoint(
    "NCF_Trained.ckpt", num_users=num_users,
    num_items=train_ratings, data=train_ratings,
    all_itemIds=all_itemIds)

trained_model = NCF(num_users, num_items, train_ratings, all_orderIds).load_from_checkpoint(checkpoint_path="NCF_Trained.ckpt")
But these don't seem to work. How do I load this saved checkpoint?
Thanks!
Add a line in your __init__ method:

self.save_hyperparameters(logger=False)

Then call:

trained_model = NCF.load_from_checkpoint("NCF_Trained.ckpt")

As shown here, load_from_checkpoint is the primary way to load weights in pytorch-lightning, and it automatically loads the hyperparameters used in training. So you do not need to pass params except to overwrite existing ones. My suggestion is to try trained_model = NCF.load_from_checkpoint("NCF_Trained.ckpt").
In my case it was also crucial to put the model into evaluation mode via model.eval(); otherwise it produced wrong results.
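Putting the two notes together, a minimal sketch of where the line goes (the ... stands for the rest of the constructor shown in the question):

class NCF(pl.LightningModule):
    def __init__(self, num_users, num_items, data, all_itemIds):
        super().__init__()
        # record the constructor arguments in the checkpoint so that
        # load_from_checkpoint can rebuild the model without them
        self.save_hyperparameters(logger=False)
        ...

trained_model = NCF.load_from_checkpoint("NCF_Trained.ckpt")
trained_model.eval()  # evaluation mode, per the note above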
I am trying to use and learn the PyTorch Transformer with the DeepMind math dataset. I have a tokenized (char, not word) sequence that is fed into the model. The model's forward function does one forward pass for the encoder and multiple forward passes for the decoder (until all outputs in the batch reach the <eos> token; this is still a TODO). I am struggling with the Transformer masks and the decoder forward pass, as it throws the error:
k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
RuntimeError: shape '[-1, 24, 64]' is invalid for input of size 819200.
The source is N = 32, S = 50, E = 512; the target is N = 32, S = 3, E = 512. It is possible that my implementation of the masks is wrong, or that it is because the source and target lengths differ; I am not really sure.
import math
import numpy as np
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    # positionally encode src and target sequences
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)  # (max_len, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
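For reference, this module expects sequence-first (S, N, E) input; a tiny check (my own example, with d_model = 512):

pos_enc = PositionalEncoding(d_model=512)
x = torch.zeros(50, 32, 512)  # (S, N, E)
print(pos_enc(x).shape)       # torch.Size([50, 32, 512])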
class MyTransformerModel(nn.Module):
    # should implement init and forward functions
    # define separate functions for masks
    # implement:
    #   embedding layer
    #   positional encoding
    #   encoder layer
    #   decoder layer
    #   final classification layer
    # encoder -> forward once
    # decoder -> forward multiple times (for one encoder forward)
    # decoder output => concatenate to input, e.g. decoder_input = torch.cat([decoder_input, decoder_output])
    # early stopping => all in batch reach <eos> token
    def __init__(self, vocab_length=30, sequence_length=512, num_encoder_layers=3,
                 num_decoder_layers=2, num_hidden_dimension=256, feed_forward_dimensions=1024,
                 attention_heads=8, dropout=0.1, pad_idx=3, device="CPU", batch_size=32):
        super(MyTransformerModel, self).__init__()
        self.src_embedding = nn.Embedding(vocab_length, sequence_length)
        self.pos_encoder = PositionalEncoding(sequence_length, dropout)
        self.src_mask = None     # attention mask
        self.memory_mask = None  # attention mask
        self.pad_idx = pad_idx
        self.device = device
        self.batch_size = batch_size
        self.transformer = nn.Transformer(
            sequence_length,
            attention_heads,
            num_encoder_layers,
            num_decoder_layers,
            feed_forward_dimensions,
            dropout,
        )

    def src_att_mask(self, src_len):
        mask = (torch.triu(torch.ones(src_len, src_len)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def no_peak_att_mask(self, batch_size, src_len, time_step):
        mask = np.zeros((batch_size, src_len), dtype=bool)
        mask[:, time_step:] = 1  # np.NINF
        mask = torch.from_numpy(mask)
        return mask

    def make_src_key_padding_mask(self, src):
        # mask "<pad>"
        src_mask = src.transpose(0, 1) == self.pad_idx
        return src_mask.to(self.device)

    def make_trg_key_padding_mask(self, trg):
        tgt_mask = trg.transpose(0, 1) == self.pad_idx
        return tgt_mask.to(self.device)

    def forward(self, src, trg):
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape
        embed_src = self.src_embedding(src)
        position_embed_src = self.pos_encoder(embed_src)
        embed_trg = self.src_embedding(trg)
        position_embed_trg = self.pos_encoder(embed_trg)
        src_padding_mask = self.make_src_key_padding_mask(src)
        trg_padding_mask = self.make_trg_key_padding_mask(trg)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)
        time_step = 1
        att_mask = self.no_peak_att_mask(self.batch_size, src_seq_length, time_step).to(self.device)
        encoder_output = self.transformer.encoder.forward(position_embed_src, src_key_padding_mask=src_padding_mask)
        # TODO: implement loop for transformer decoder forward fn, implement early stopping
        # where to feed decoder_output?
        decoder_output = self.transformer.decoder.forward(position_embed_trg, encoder_output, trg_mask, att_mask, trg_padding_mask, src_padding_mask)
        return decoder_output
Can anyone pinpoint where I have made a mistake?
It turns out I had messed up the dimension order: this version of nn.Transformer has no batch-first option, so it expects (S, N, E) inputs, while I was feeding (N, S, E). That also explains the error above: the model read the target's length 3 as the batch size, and the 24 in the failing reshape is presumably that misread batch size times the 8 attention heads. Corrected code is below:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange

class MyTransformerModel(nn.Module):
    def __init__(self, d_model=512, vocab_length=30, sequence_length=512, num_encoder_layers=3,
                 num_decoder_layers=2, num_hidden_dimension=256, feed_forward_dimensions=1024,
                 attention_heads=8, dropout=0.1, pad_idx=3, device="cpu", batch_size=32):
        super(MyTransformerModel, self).__init__()
        self.src_embedding = nn.Embedding(vocab_length, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.vocab_length = vocab_length
        self.d_model = d_model
        self.src_mask = None     # attention mask
        self.memory_mask = None  # attention mask
        self.pad_idx = pad_idx
        self.device = device
        self.batch_size = batch_size
        self.transformer = nn.Transformer(
            d_model,
            attention_heads,
            num_encoder_layers,
            num_decoder_layers,
            feed_forward_dimensions,
            dropout,
        )
        self.fc = nn.Linear(d_model, vocab_length)
        # self.init_weights() <= used in tutorial

    def src_att_mask(self, src_len):
        mask = (torch.triu(torch.ones(src_len, src_len)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def no_peak_att_mask(self, batch_size, src_len, time_step):
        mask = np.zeros((batch_size, src_len), dtype=bool)
        mask[:, time_step:] = 1  # np.NINF
        mask = torch.from_numpy(mask)
        return mask

    def make_src_key_padding_mask(self, src):
        # mask "<pad>" -> expected src_key_padding_mask: (N, S)
        src_mask = src.transpose(0, 1) == self.pad_idx
        return src_mask.to(self.device)

    def make_trg_key_padding_mask(self, trg):
        # same as above -> expected tgt_key_padding_mask: (N, T)
        tgt_mask = trg.transpose(0, 1) == self.pad_idx
        return tgt_mask.to(self.device)

    def init_weights(self):
        # unused here; kept from the tutorial, which had encoder/decoder attributes
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.weight)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, trg):
        N, src_seq_length = src.shape
        N, trg_seq_length = trg.shape
        # Expected shapes (S = source len, T = target len, N = batch, E = features):
        #   src: (S, N, E), tgt: (T, N, E)
        #   src_mask: (S, S), tgt_mask: (T, T), memory_mask: (T, S)
        #   src_key_padding_mask: (N, S), tgt_key_padding_mask: (N, T)
        #   memory_key_padding_mask: (N, S)
        src = rearrange(src, 'n s -> s n')
        trg = rearrange(trg, 'n t -> t n')
        print("src shape {}".format(src.shape))
        print("trg shape {}".format(trg.shape))
        embed_src = self.src_embedding(src)
        print("embed_src shape {}".format(embed_src.shape))
        position_embed_src = self.pos_encoder(embed_src)
        print("position_embed_src shape {}".format(position_embed_src.shape))
        embed_trg = self.src_embedding(trg)
        print("embed_trg shape {}".format(embed_trg.shape))
        position_embed_trg = self.pos_encoder(embed_trg)
        print("position_embed_trg shape {}".format(position_embed_trg.shape))
        src_padding_mask = self.make_src_key_padding_mask(src)
        print("KEY - src_padding_mask shape {} (should be (N, S))".format(src_padding_mask.shape))
        trg_padding_mask = self.make_trg_key_padding_mask(trg)
        print("KEY - trg_padding_mask shape {} (should be (N, T))".format(trg_padding_mask.shape))
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)
        print("trg_mask shape {} (should be (T, T))".format(trg_mask.shape))
        # a memory_mask would have to be (T, S); it is not a key_padding_mask,
        # so the no-peak mask above cannot be used here
        att_mask = None
        # forward the encoder just once per batch
        encoder_output = self.transformer.encoder.forward(position_embed_src, src_key_padding_mask=src_padding_mask)
        print("encoder_output shape {}".format(encoder_output.shape))
        # first decoder forward
        # TODO: target in => target out shifted by one; loop until the whole batch
        # meets the stopping criterion or max length is reached
        decoder_output = self.transformer.decoder.forward(position_embed_trg, encoder_output, trg_mask, att_mask, trg_padding_mask, src_padding_mask)
        print("decoder_output shape {}".format(decoder_output.shape))
        output = rearrange(decoder_output, 't n e -> n t e')
        output = self.fc(output)
        print("output shape {}".format(output.shape))
        predicted = F.log_softmax(output, dim=-1)
        print("predicted shape {}".format(predicted.shape))
        # top k
        top_value, top_index = torch.topk(predicted, k=1)
        top_index = torch.squeeze(top_index)
        print("top_index shape {}".format(top_index.shape))
        print("top_value shape {}".format(top_value.shape))
        return top_index
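A quick smoke test of the corrected model (my own example, not from the original run; it assumes character-level inputs with vocab_length = 30):

model = MyTransformerModel(d_model=512, vocab_length=30, device="cpu")
src = torch.randint(0, 30, (32, 50))  # (N, S) source token ids
trg = torch.randint(0, 30, (32, 3))   # (N, T) target token ids
top_index = model(src, trg)           # greedy indices, shape (32, 3)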
Here is my attention layer
class Attention(Layer):
    def __init__(self, **kwargs):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = 50
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        # W maps the input features to attention_dim so the bias_add
        # with b (attention_dim,) below lines up
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init((self.attention_dim,)))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weights = [self.W, self.b, self.u]
        super(Attention, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))  # (batch, T, attention_dim)
        ait = K.dot(uit, self.u)                            # (batch, T, 1)
        ait = K.squeeze(ait, -1)
        ait = K.exp(ait)
        if mask is not None:
            ait *= K.cast(mask, K.floatx())
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        output = K.sum(weighted_input, axis=1)  # (batch, features)
        return output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])
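One caveat from me: newer Keras versions no longer allow assigning to self.trainable_weights directly. If you hit that, an equivalent build using add_weight might look like this (a sketch under that assumption, keeping the same shapes as above):

def build(self, input_shape):
    assert len(input_shape) == 3
    self.W = self.add_weight(name='W', shape=(input_shape[-1], self.attention_dim),
                             initializer='normal', trainable=True)
    self.b = self.add_weight(name='b', shape=(self.attention_dim,),
                             initializer='normal', trainable=True)
    self.u = self.add_weight(name='u', shape=(self.attention_dim, 1),
                             initializer='normal', trainable=True)
    super(Attention, self).build(input_shape)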
I am trying to combine a CNN with an attention network for text classification. Following is my code in Keras:
def inputs_and_embeddings(features, config):
    inputs, embeddings = [], []
    for f in features:
        E = Embedding if not config.fixed_embedding else FixedEmbedding
        i = Input(shape=(config.doc_size,), dtype='int32', name=f.name)
        e = E(f.input_dim, f.output_dim, weights=[f.weights],
              input_length=config.doc_size)(i)
        inputs.append(i)
        embeddings.append(e)
    return inputs, embeddings
inputs, embeddings = inputs_and_embeddings(features, config)

# calculate the size of documents and all features
seq = concat(embeddings)
cshape = (config.doc_size, sum(f.output_dim for f in features))
seq = Reshape((1,) + cshape)(seq)
# seq = Reshape((1, config.doc_size, w2v.output_dim))(embeddings)  # old way of doing the above

# Convolution(s)
convLayers = []
for filter_size, filter_num in zip(config.filter_sizes, config.filter_nums):
    seq2 = Convolution2D(
        filter_num,
        filter_size,
        cshape[1],
        border_mode='valid',
        activation='relu',
        dim_ordering='th'
    )(seq)
    seq2 = MaxPooling2D(
        pool_size=(config.doc_size - filter_size + 1, 1),
        dim_ordering='th'
    )(seq2)
    # seq2 = Flatten()(seq2)
    convLayers.append(seq2)

seq = Concatenate(axis=1)(convLayers)
if config.drop_prob:
    seq = Dropout(config.drop_prob)(seq)
for s in config.hidden_sizes:
    seq = Dense(s, activation='relu')(seq)

# reshape for the word-level RNN
seq = Reshape((200, 3))(seq)
word_encoder = Bidirectional(GRU(50, return_sequences=True))(seq)
rnn_type = 'GRU'
dense_transform_word = Dense(
    100,
    activation='relu', kernel_regularizer=l2_reg,
    name='dense_transform_word')(word_encoder)

# word attention
attention_weighted_sentence = Model(
    inputs, Attention(name="word_attention")(dense_transform_word))
word_attention_model = attention_weighted_sentence
attention_weighted_sentence.summary()

# sentence-attention-weighted document scores
texts_in = Input(shape=(MAX_SEQ_LEN, config.doc_size), dtype='int32', name="input_2")
attention_weighted_sentences = TimeDistributed(attention_weighted_sentence)(texts_in)
if rnn_type == 'GRU':
    # sentence_encoder = Bidirectional(GRU(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.2))(attention_weighted_sentences)
    dropout = Dropout(0.1)(attention_weighted_sentences)
    sentence_encoder = Bidirectional(GRU(50, return_sequences=True))(dropout)
else:
    sentence_encoder = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.2))(attention_weighted_sentences)

dense_transform_sentence = Dense(
    100,
    activation='relu',
    name='dense_transform_sentence',
    kernel_regularizer=l2_reg)(sentence_encoder)

# sentence attention
attention_weighted_text = Attention(name="sentence_attention")(dense_transform_sentence)
prediction = Dense(19, activation='sigmoid')(attention_weighted_text)
model = Model(inputs, prediction)
model.summary()
I am getting a "Graph disconnected" error when I initialize the model with inputs and prediction as shown in the code. From researching, I found that this error occurs when there is no connection between the inputs and the outputs; however, I can't figure out what the input of my model should be. Can anyone please help me with this?
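For context, here is a minimal illustration (a toy example of my own) of what triggers "Graph disconnected": Keras walks backwards from the output tensor, and every Input it reaches must be listed in Model(...). In the code above, prediction traces back to texts_in via TimeDistributed, not to the word-level inputs, so Model(inputs, prediction) has no path from its declared inputs to its output.

from keras.layers import Input, Dense
from keras.models import Model

a = Input(shape=(4,))
b = Input(shape=(4,))
out = Dense(1)(b)       # `out` depends only on `b`
inner = Model(b, out)   # fine: there is a path from b to out
broken = Model(a, out)  # raises "Graph disconnected": out never touches a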
Here is a revised version of the same code, with the attention output pulled out explicitly and both sets of inputs fed to the final model:

def inputs_and_embeddings(features, config):
    inputs, embeddings = [], []
    for f in features:
        E = Embedding if not config.fixed_embedding else FixedEmbedding
        i = Input(shape=(config.doc_size,), dtype='int32', name=f.name)
        e = E(f.input_dim,
              f.output_dim,
              weights=[f.weights],
              input_length=config.doc_size)(i)
        inputs.append(i)
        embeddings.append(e)
    return inputs, embeddings

inputs, embeddings = inputs_and_embeddings(features, config)

# calculate the size of documents and all features
seq = concat(embeddings)
cshape = (config.doc_size, sum(f.output_dim for f in features))
seq = Reshape((1,) + cshape)(seq)

# Convolution(s)
convLayers = []
for filter_size, filter_num in zip(config.filter_sizes, config.filter_nums):
    seq2 = Convolution2D(
        filter_num,
        filter_size,
        cshape[1],
        border_mode='valid',
        activation='relu',
        dim_ordering='th'
    )(seq)
    seq2 = MaxPooling2D(
        pool_size=(config.doc_size - filter_size + 1, 1),
        dim_ordering='th'
    )(seq2)
    convLayers.append(seq2)

seq = Concatenate(axis=1)(convLayers)
if config.drop_prob:
    seq = Dropout(config.drop_prob)(seq)
for s in config.hidden_sizes:
    seq = Dense(s, activation='relu')(seq)

seq = Reshape((200, 3))(seq)
word_encoder = Bidirectional(GRU(50, return_sequences=True))(seq)
rnn_type = 'GRU'
dense_transform_word = Dense(
    100,
    activation='relu', kernel_regularizer=l2_reg,
    name='dense_transform_word')(word_encoder)
outputs = Attention(name="word_attention")(dense_transform_word)

# word attention
attention_weighted_sentence = Model(inputs, outputs)
word_attention_model = attention_weighted_sentence
attention_weighted_sentence.summary()

# sentence-attention-weighted document scores
texts_in = Input(shape=(MAX_SEQ_LEN, config.doc_size), dtype='int32', name="input_2")
# apply the word-level sub-model to every sentence in the document;
# TimeDistributed wraps a layer or model, not a tensor
attention_weighted_sentences = TimeDistributed(attention_weighted_sentence)(texts_in)
if rnn_type == 'GRU':
    dropout = Dropout(0.1)(attention_weighted_sentences)
    sentence_encoder = Bidirectional(GRU(50, return_sequences=True))(dropout)
else:
    sentence_encoder = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.2))(attention_weighted_sentences)

dense_transform_sentence = Dense(
    100,
    activation='relu',
    name='dense_transform_sentence',
    kernel_regularizer=l2_reg)(sentence_encoder)

# sentence attention
attention_weighted_text = Attention(name="sentence_attention")(dense_transform_sentence)
prediction = Dense(19, activation='sigmoid')(attention_weighted_text)
# `inputs` is already a list, so extend it with the document input
model = Model(inputs + [texts_in], prediction)
model.summary()
I'm trying to use a CRF rather than a softmax after the BiLSTM, and I'm using keras_contrib to get the CRF. I think I've made some mistake with an array dimension, but I can't fix it. Here is the code:
# preds = Dense(num_label, activation='softmax')(out)
# preds_binary = Dense(2, activation='softmax')(out)
'''
test 1
'''
preds = kcl.CRF(num_label, sparse_target=True)(out)
preds_binary = kcl.CRF(2, sparse_target=True)(out)
Here is the error message:
ValueError: Index out of range using input dim 2; input has only 2 dims for 'crf_1/strided_slice' (op: 'StridedSlice') with input shapes: [?,5], [3], [3], [3] and with computed input tensors: input[3] = <1 1 1>.
Can anybody here help me?
@giser_yugang Here's my code:
num_labels = 5
train_array = [X_train, POS1_train, POS2_train]
test_array = [X_test, POS1_test, POS2_test]
train_label = [Y_train, binary_label_train]
test_label = [Y_test, binary_label_test]
x_test_drug, x_test_med, y_test_drug, y_test_med = pd.splitDrug_Med(id_test, X_test, Y_test, POS1_test, POS2_test,
                                                                    binary_label_test)
print("\nthe shape of x_test_drug[0]: ", x_test_drug[0].shape, '\n')
print("\nthe shape of x_test_med[0] : ", x_test_med[0].shape, '\n')

print("load word2vec...")
len_dic, embedding_matrix = ld.load_word_matrix(GLOVE_DIR,
                                                MAX_NB_WORDS,
                                                word_index,
                                                EMBEDDING_DIM)

print("create word embedding layer...")
embedding_layer = Embedding(len_dic,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

print("create position embedding layer...")
position_em_dim = 10
pos_embedding_matrix = np.random.uniform(-0.1, 0.1, size=(400, position_em_dim))
print("the shape of pos_embedding_matrix", pos_embedding_matrix.shape)
pos_embedding_layer = Embedding(400,
                                position_em_dim,
                                weights=[pos_embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)

print('create model...')
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
word_embedded_sequences = embedding_layer(sequence_input)
pos1_sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
pos1_embedded_sequences = pos_embedding_layer(pos1_sequence_input)
pos2_sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
pos2_embedded_sequences = pos_embedding_layer(pos2_sequence_input)
embedded_sequences = concatenate([word_embedded_sequences, pos1_embedded_sequences, pos2_embedded_sequences], axis=-1)

# lstm_attention_add_pos_add_last_two_out
embedded_sequences = Dropout(0.3)(embedded_sequences)
lstm_forward, lstm_backword_reverse = Bidirectional(LSTM(hidden_dim, dropout_W=0.3,
                                                         dropout_U=0.3,
                                                         return_sequences=True,
                                                         U_regularizer=regularizers.l2(0.0001)),
                                                    merge_mode=None)(embedded_sequences)
flip_layer = Lambda(lambda x: K.reverse(x, 1), output_shape=lambda x: (x[0], x[1], x[2]))
flip_layer.supports_masking = True
lstm_backword = flip_layer(lstm_backword_reverse)
lstm_sequence = concatenate([lstm_forward, lstm_backword_reverse], axis=-1)
pos_featrue = concatenate([pos1_embedded_sequences, pos2_embedded_sequences], axis=-1)
pos_featrue = TimeDistributed(Dense(20, init='he_normal'))(pos_featrue)
h_feature = TimeDistributed(Dense(hidden_dim * 2))(lstm_sequence)
att_feature = concatenate([h_feature, pos_featrue], axis=-1)
weights = AttentionWeight2(name='attention')(att_feature)
weights_repeat = RepeatVector(hidden_dim * 2)(weights)
weights_repeat_per = Permute((2, 1))(weights_repeat)
mul = multiply([lstm_sequence, weights_repeat_per])
sumpool = Lambda(lambda x: K.sum(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]))
sumpool.supports_masking = True
att_out = sumpool(mul)
# `slice` here is presumably a user-defined helper that picks one timestep
# (the Python builtin would not accept an `index` argument)
lastout = Lambda(slice, output_shape=lambda x: (x[0], x[2]), arguments={'index': -1})
lstm_last_forward = lastout(lstm_forward)
lstm_last_backward = lastout(lstm_backword)
lstm_last = concatenate([lstm_last_forward, lstm_last_backward], axis=-1)
att_out = Dense(hidden_dim * 2)(att_out)
lstm_last = Dense(hidden_dim * 2)(lstm_last)
out = add([att_out, lstm_last])
out = Dropout(0.5)(out)
out = Activation(activation='tanh')(out)

preds = Dense(num_label, activation='softmax')(out)
preds_binary = Dense(2, activation='softmax')(out)
'''
test 1
'''
preds = kcl.CRF(num_label, sparse_target=True)(out)
preds_binary = kcl.CRF(2, sparse_target=True)(out)
If it's not enough, I'll give you more.
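A note on the error, in case it helps (this is my reading, not a verified fix): keras_contrib's CRF expects a 3-D sequence input (batch, timesteps, features), but `out` above is 2-D (batch, hidden_dim * 2) after the attention pooling, which is what the strided_slice complaint about "only 2 dims" points at. A CRF only makes sense for per-timestep labeling, so a hedged sketch would apply it over the per-token features instead (this assumes 3-D, per-token labels):

from keras_contrib.layers import CRF  # kcl.CRF above

# project each timestep to the label space, then let the CRF decode the sequence
seq_feats = TimeDistributed(Dense(num_label))(lstm_sequence)  # (batch, T, num_label)
crf = CRF(num_label, sparse_target=True)
preds = crf(seq_feats)  # (batch, T, num_label)

model = Model([sequence_input, pos1_sequence_input, pos2_sequence_input], preds)
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])

If the goal really is one label per sentence, a plain Dense softmax on the 2-D `out`, as in the commented lines above, is the usual choice.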