PyTorch Transformer: mask implementation for the decoder forward function - pytorch

I am trying to use and learn the PyTorch Transformer with the DeepMind mathematics dataset. I have a character-level (not word-level) tokenized sequence that is fed into the model. The model's forward function runs the encoder once and the decoder multiple times (until all outputs in the batch reach the <eos> token; this part is still a TODO).
I am struggling with the Transformer masks and the decoder forward pass, as it throws this error:
k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
RuntimeError: shape '[-1, 24, 64]' is invalid for input of size 819200.
The source is N = 32, S = 50, E = 512. The target is N = 32, S = 3, E = 512.
It is possible that my mask implementation is wrong, or that the problem is the differing source and target lengths; I am not really sure.
class PositionalEncoding(nn.Module):
    # function to positionally encode src and target sequences
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
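As a quick aside (not part of the original code): this module, like nn.Transformer itself, assumes sequence-first input of shape (S, N, E). A minimal shape check, assuming the same imports (math, torch, torch.nn as nn) used elsewhere in this question:

# sanity check of the positional encoding input layout (a sketch, not from the original post)
pos_enc = PositionalEncoding(d_model=512)
x = torch.zeros(50, 32, 512)        # (S=50, N=32, E=512), i.e. sequence-first
print(pos_enc(x).shape)             # torch.Size([50, 32, 512])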
class MyTransformerModel(nn.Module):
    # should implement init and forward function
    # define separate functions for masks
    # define forward function with
    # implement:
    #   embedding layer
    #   positional encoding
    #   encoder layer
    #   decoder layer
    #   final classification layer
    # encoder -> forward once
    # decoder -> forward multiple times (for one encoder forward)
    # decoder output => concatenate to input, e.g. decoder_input = torch.cat([decoder_input, decoder_output])
    # early stopping => all in batch reach <eos> token
    def __init__(self, vocab_length = 30, sequence_length = 512, num_encoder_layers = 3, num_decoder_layers = 2, num_hidden_dimension = 256, feed_forward_dimensions = 1024, attention_heads = 8, dropout = 0.1, pad_idx = 3, device = "CPU", batch_size = 32):
        super(MyTransformerModel, self).__init__()
        self.src_embedding = nn.Embedding(vocab_length, sequence_length)
        self.pos_encoder = PositionalEncoding(sequence_length, dropout)
        self.src_mask = None  # attention mask
        self.memory_mask = None  # attention mask
        self.pad_idx = pad_idx
        self.device = device
        self.batch_size = batch_size
        self.transformer = nn.Transformer(
            sequence_length,
            attention_heads,
            num_encoder_layers,
            num_decoder_layers,
            feed_forward_dimensions,
            dropout,
        )

    def src_att_mask(self, src_len):
        mask = (torch.triu(torch.ones(src_len, src_len)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def no_peak_att_mask(self, batch_size, src_len, time_step):
        mask = np.zeros((batch_size, src_len), dtype=bool)
        mask[:, time_step:] = 1  # np.NINF
        mask = torch.from_numpy(mask)
        return mask

    def make_src_key_padding_mask(self, src):
        # mask "<pad>"
        src_mask = src.transpose(0, 1) == self.pad_idx
        return src_mask.to(self.device)

    def make_trg_key_padding_mask(self, trg):
        tgt_mask = trg.transpose(0, 1) == self.pad_idx
        return tgt_mask.to(self.device)

    def forward(self, src, trg):
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape
        embed_src = self.src_embedding(src)
        position_embed_src = self.pos_encoder(embed_src)
        embed_trg = self.src_embedding(trg)
        position_embed_trg = self.pos_encoder(embed_trg)
        src_padding_mask = self.make_src_key_padding_mask(src)
        trg_padding_mask = self.make_trg_key_padding_mask(trg)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)
        time_step = 1
        att_mask = self.no_peak_att_mask(self.batch_size, src_seq_length, time_step).to(self.device)
        encoder_output = self.transformer.encoder.forward(position_embed_src, src_key_padding_mask = src_padding_mask)
        # TODO: implement loop for transformer decoder forward fn, implement early stopping
        # where to feed decoder_output?
        decoder_output = self.transformer.decoder.forward(position_embed_trg, encoder_output, trg_mask, att_mask, trg_padding_mask, src_padding_mask)
        return decoder_output
Can anyone pinpoint where I have made a mistake?

It looks like I had the dimension order mixed up: nn.Transformer has no batch-first option, so it expects (S, N, E) and (T, N, E) inputs rather than (N, S, E). With batch-first tensors, the (32, 3) target is read as sequence length 32 and batch size 3, so bsz * num_heads = 3 * 8 = 24, and the 50 * 32 * 512 = 819200-element encoder memory cannot be viewed as [-1, 24, 64]. The corrected code is below:
class MyTransformerModel(nn.Module):
    def __init__(self, d_model = 512, vocab_length = 30, sequence_length = 512, num_encoder_layers = 3, num_decoder_layers = 2, num_hidden_dimension = 256, feed_forward_dimensions = 1024, attention_heads = 8, dropout = 0.1, pad_idx = 3, device = "CPU", batch_size = 32):
        # , ninp, device, nhead=8, nhid=2048, nlayers=2, dropout=0.1, src_pad_idx = 1, max_len=5000, forward_expansion= 4):
        super(MyTransformerModel, self).__init__()
        self.src_embedding = nn.Embedding(vocab_length, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.vocab_length = vocab_length
        self.d_model = d_model
        self.src_mask = None  # attention mask
        self.memory_mask = None  # attention mask
        self.pad_idx = pad_idx
        self.device = device
        self.batch_size = batch_size
        self.transformer = nn.Transformer(
            d_model,
            attention_heads,
            num_encoder_layers,
            num_decoder_layers,
            feed_forward_dimensions,
            dropout,
        )
        self.fc = nn.Linear(d_model, vocab_length)
        # self.init_weights() <= used in tutorial

    def src_att_mask(self, src_len):
        mask = (torch.triu(torch.ones(src_len, src_len)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def no_peak_att_mask(self, batch_size, src_len, time_step):
        mask = np.zeros((batch_size, src_len), dtype=bool)
        mask[:, time_step:] = 1  # np.NINF
        mask = torch.from_numpy(mask)
        # mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def make_src_key_padding_mask(self, src):
        # mask "<pad>"
        src_mask = src.transpose(0, 1) == self.pad_idx
        # src_mask = src == self.pad_idx
        # (N, src_len)
        return src_mask.to(self.device)

    def make_trg_key_padding_mask(self, trg):
        # same as above -> expected tgt_key_padding_mask: (N, T)
        tgt_mask = trg.transpose(0, 1) == self.pad_idx
        # tgt_mask = trg == self.pad_idx
        # (N, src_len)
        return tgt_mask.to(self.device)

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.weight)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, trg):
        N, src_seq_length = src.shape
        N, trg_seq_length = trg.shape
        # S - source sequence length
        # T - target sequence length
        # N - batch size
        # E - feature number
        # src: (S, N, E) (sourceLen, batch, features)
        # tgt: (T, N, E)
        # src_mask: (S, S)
        # tgt_mask: (T, T)
        # memory_mask: (T, S)
        # src_key_padding_mask: (N, S)
        # tgt_key_padding_mask: (N, T)
        # memory_key_padding_mask: (N, S)
        src = rearrange(src, 'n s -> s n')
        trg = rearrange(trg, 'n t -> t n')
        print("src shape {}".format(src.shape))
        print(src)
        print("trg shape {}".format(trg.shape))
        print(trg)
        embed_src = self.src_embedding(src)
        print("embed_src shape {}".format(embed_src.shape))
        print(embed_src)
        position_embed_src = self.pos_encoder(embed_src)
        print("position_embed_src shape {}".format(position_embed_src.shape))
        print(position_embed_src)
        embed_trg = self.src_embedding(trg)
        print("embed_trg shape {}".format(embed_trg.shape))
        print(embed_trg)
        position_embed_trg = self.pos_encoder(embed_trg)
        # position_embed_trg = position_embed_trg.transpose(0, 1)
        print("position_embed_trg shape {}".format(position_embed_trg.shape))
        print(position_embed_trg)
        src_padding_mask = self.make_src_key_padding_mask(src)
        print("KEY - src_padding_mask shape {}".format(src_padding_mask.shape))
        print("should be of shape: src_key_padding_mask: (N, S)")
        print(src_padding_mask)
        trg_padding_mask = self.make_trg_key_padding_mask(trg)
        print("KEY - trg_padding_mask shape {}".format(trg_padding_mask.shape))
        print("should be of shape: trg_key_padding_mask: (N, T)")
        print(trg_padding_mask)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)
        print("trg_mask shape {}".format(trg_mask.shape))
        print("trg_mask should be of shape tgt_mask: (T, T)")
        print(trg_mask)
        # att_mask = self.src_att_mask(trg_seq_length).to(self.device)
        time_step = 1
        # error => memory_mask: expected shape (T, S)! this is not a key_padding_mask!
        # att_mask = self.no_peak_att_mask(self.batch_size, src_seq_length, time_step).to(self.device)
        # print("att_mask shape {}".format(att_mask.shape))
        # print("att_mask should be of shape memory_mask: (T, S)")
        # print(att_mask)
        att_mask = None
        # get encoder output
        # forward(self, src: Tensor, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None)
        # forward encoder just once for a batch
        # attention forward of encoder expects => src, src_mask, src_key_padding_mask +++ possible positional encoding error !!!
        encoder_output = self.transformer.encoder.forward(position_embed_src, src_key_padding_mask = src_padding_mask)
        print("encoder_output")
        print("encoder_output shape {}".format(encoder_output.shape))
        print(encoder_output)
        # forward decoder till all in batch reach <eos>?
        # def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None,
        #             memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None,
        #             memory_key_padding_mask: Optional[Tensor] = None)
        # first forward
        decoder_output = self.transformer.decoder.forward(position_embed_trg, encoder_output, trg_mask, att_mask, trg_padding_mask, src_padding_mask)
        # TODO: target in => target out shifted by one, loop till all in batch meet stopping criteria || max len is reached
        print("decoder_output")
        print("decoder_output shape {}".format(decoder_output.shape))
        print(decoder_output)
        output = rearrange(decoder_output, 't n e -> n t e')
        output = self.fc(output)
        print("output")
        print("output shape {}".format(output.shape))
        print(output)
        predicted = F.log_softmax(output, dim=-1)
        print("predicted")
        print("predicted shape {}".format(predicted.shape))
        print(predicted)
        # top k
        top_value, top_index = torch.topk(predicted, k=1)
        top_index = torch.squeeze(top_index)
        print("top_index")
        print("top_index shape {}".format(top_index.shape))
        print(top_index)
        print("top_value")
        print("top_value shape {}".format(top_value.shape))
        print(top_value)
        return top_index
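For the remaining TODO (loop the decoder until every sequence in the batch emits <eos>), a greedy decoding loop could look roughly like the sketch below. It is untested and assumes hypothetical sos_idx / eos_idx token indices, plus the forward method above returning token indices of shape (N, T):

def greedy_decode(model, src, sos_idx, eos_idx, max_len=50):
    # sketch only: grow the target one token per step, stop once the whole batch has emitted <eos>
    model.eval()
    with torch.no_grad():
        N = src.shape[0]
        trg = torch.full((N, 1), sos_idx, dtype=torch.long, device=src.device)
        finished = torch.zeros(N, dtype=torch.bool, device=src.device)
        for _ in range(max_len):
            top_index = model(src, trg)                    # token indices predicted so far
            next_token = top_index.reshape(N, -1)[:, -1:]  # keep only the last position
            trg = torch.cat([trg, next_token], dim=1)
            finished |= next_token.squeeze(1) == eos_idx
            if finished.all():                             # early stopping: whole batch done
                break
    return trg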

Related

ViViT PyTorch: RuntimeError: multi-target not supported at /pytorch/aten/src/THCUNN/generic/ClassNLLCriterion.cu:15

I am trying to run Video Vision Transformer (ViViT) code with my dataset, but I am getting an error when using CrossEntropyLoss from PyTorch as the loss function.
I have 6 classes:
['Run', 'Sit', 'Walk', 'Wave', 'Sit', 'Stand']
Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, weight_decay=1e-9, momentum=0.9)
Class Weights
tensor([0.0045, 0.0042, 0.0048, 0.0038, 0.0070, 0.0065])
Loss Function
loss_func = nn.CrossEntropyLoss(weight=class_weights.to(device))
Code Throwing the Error
train_epoch(model, optimizer, train_loader, train_loss_history, loss_func)
Error
RuntimeError: multi-target not supported at /pytorch/aten/src/THCUNN/generic/ClassNLLCriterion.cu:15
Code Calling the transformer
model = ViViT(224, 16, 100, 16).cuda()
Getting Video Frames
def get_frames(filename, n_frames=1):
    frames = []
    v_cap = cv2.VideoCapture(filename)
    v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_list = np.linspace(0, v_len - 1, n_frames + 1, dtype=np.int16)
    frame_dims = np.array([224, 224, 3])
    for fn in range(v_len):
        success, frame = v_cap.read()
        if success is False:
            continue
        if (fn in frame_list):
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, (frame_dims[0], frame_dims[1]))
            frames.append(frame)
    v_cap.release()
    return frames, v_len
Dataset Preprocessing
class DatasetProcessing(data.Dataset):
    def __init__(self, df, root_dir):
        super(DatasetProcessing, self).__init__()
        # List of all video paths
        video_list = df["Video"].apply(lambda x: root_dir + '/' + x)
        self.video_list = np.asarray(video_list)
        self.df = df

    def __getitem__(self, index):
        # Ensure that the raw videos are in respective folders and folder name matches the output class label
        video_label = self.video_list[index].split('/')[-2]
        video_name = self.video_list[index].split('/')[-1]
        video_frames, len_ = get_frames(self.video_list[index], n_frames = 15)
        video_frames = np.asarray(video_frames)
        video_frames = video_frames/255
        class_list = ['Run', 'Walk', 'Wave', 'Sit', 'Turn', 'Stand']
        class_id_loc = np.where(class_list == video_label)
        label = class_id_loc
        d = torch.as_tensor(np.array(video_frames).astype('float'))
        l = torch.as_tensor(np.array(label).astype('float'))
        return (d, l)

    def __len__(self):
        return self.video_list.shape[0]
Training Epochs
def train_epoch(model, optimizer, data_loader, loss_history, loss_func):
    total_samples = len(data_loader.dataset)
    model.train()
    for i, (data, target) in enumerate(data_loader):
        optimizer.zero_grad()
        x = data.cuda()
        data = rearrange(x, 'b p h w c -> b p c h w').cuda()
        target = target.type(torch.LongTensor).cuda()
        pred = model(data.float())
        output = F.log_softmax(pred, dim=1)
        loss = loss_func(output, target.squeeze(1))
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print('[' + '{:5}'.format(i * len(data)) + '/' + '{:5}'.format(total_samples) +
                  ' (' + '{:3.0f}'.format(100 * i / len(data_loader)) + '%)] Loss: ' +
                  '{:6.4f}'.format(loss.item()))
            loss_history.append(loss.item())
Evaluate Model
def evaluate(model, data_loader, loss_history, loss_func):
    model.eval()
    total_samples = len(data_loader.dataset)
    correct_samples = 0
    total_loss = 0
    with torch.no_grad():
        for data, target in data_loader:
            x = data.cuda()
            data = rearrange(x, 'b p h w c -> b p c h w').cuda()
            target = target.type(torch.LongTensor).cuda()
            output = F.log_softmax(model(data.float()), dim=1)
            loss = loss_func(output, target)
            _, pred = torch.max(output, dim=1)
            total_loss += loss.item()
            correct_samples += pred.eq(target).sum()
    avg_loss = total_loss / total_samples
    loss_history.append(avg_loss)
    print('\nAverage test loss: ' + '{:.4f}'.format(avg_loss) +
          '  Accuracy:' + '{:5}'.format(correct_samples) + '/' +
          '{:5}'.format(total_samples) + ' (' +
          '{:4.2f}'.format(100.0 * correct_samples / total_samples) + '%)\n')
Transformer
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        self.norm = nn.LayerNorm(dim)
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
            ]))

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return self.norm(x)
ViViT Code
class ViViT(nn.Module):
    def __init__(self, image_size, patch_size, num_classes, num_frames, dim = 192, depth = 4, heads = 3,
                 pool = 'cls', in_channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0., scale_dim = 4):
        super().__init__()
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = in_channels * patch_size ** 2
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b t c (h p1) (w p2) -> b t (h w) (p1 p2 c)', p1 = patch_size, p2 = patch_size),
            nn.Linear(patch_dim, dim),
        )
        self.pos_embedding = nn.Parameter(torch.randn(1, num_frames, num_patches + 1, dim))
        self.space_token = nn.Parameter(torch.randn(1, 1, dim))
        self.space_transformer = Transformer(dim, depth, heads, dim_head, dim*scale_dim, dropout)
        self.temporal_token = nn.Parameter(torch.randn(1, 1, dim))
        self.temporal_transformer = Transformer(dim, depth, heads, dim_head, dim*scale_dim, dropout)
        self.dropout = nn.Dropout(emb_dropout)
        self.pool = pool
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, x):
        x = self.to_patch_embedding(x)
        b, t, n, _ = x.shape
        cls_space_tokens = repeat(self.space_token, '() n d -> b t n d', b = b, t = t)
        x = torch.cat((cls_space_tokens, x), dim=2)
        x += self.pos_embedding[:, :, :(n + 1)]
        x = self.dropout(x)
        x = rearrange(x, 'b t n d -> (b t) n d')
        x = self.space_transformer(x)
        x = rearrange(x[:, 0], '(b t) ... -> b t ...', b=b)
        cls_temporal_tokens = repeat(self.temporal_token, '() n d -> b n d', b=b)
        x = torch.cat((cls_temporal_tokens, x), dim=1)
        x = self.temporal_transformer(x)
        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]
        return self.mlp_head(x)
Multi-target appears to be a feature supported since version 1.10.0.
https://discuss.pytorch.org/t/crossentropyloss-vs-per-class-probabilities-target/138331
Please check your PyTorch version.
Please refer to the example using the UCF101 top5 dataset, which is available on my Colab. The PyTorch version there is 1.12.0+cu113, and the code you listed was able to run the training almost exactly as it was written.
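A quick way to verify the version point above (a minimal check, nothing more):

import torch
print(torch.__version__)   # should report 1.10.0 or newer for the CrossEntropyLoss target format discussed above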

PyTorch: GRU, one-to-many / many-to-one

I would like to implement a GRU able to encode a sequence of vectors into one vector (many-to-one), and then another GRU able to decode a vector into a sequence of vectors (one-to-many). The size of the vectors would not be changed. I would like to get some opinions on what I implemented.
Here is the code:
class AEGRU(nn.Module):
    def __init__(self, opt):
        super(AEGRU, self).__init__()
        self.length = 256
        self.latent_space = 256
        self.num_layers = 1
        self.GRU_enc = nn.GRU(input_size=3, hidden_size=self.latent_space, num_layers=self.num_layers, batch_first=True)
        self.fc_enc = nn.Linear(self.latent_space, self.latent_space)
        self.GRU_dec = nn.GRU(input_size=self.latent_space, hidden_size=3, num_layers=self.num_layers, batch_first=True)
        self.fc_dec = nn.Linear(3, 3)

    def enc(self, x):
        # x has shape: Batch_size x self.length x 3
        h0 = torch.zeros(self.num_layers, x.shape[0], self.latent_space).cuda()
        out, _ = self.GRU_enc(x, h0)
        out = out[:, -1, :]
        out = self.fc_enc(out)
        return out

    def dec(self, x):
        # x has shape: Batch_size x self.latent_space
        x = x[:, None, :]
        h = torch.zeros(self.num_layers, x.shape[0], 3).cuda()
        # method 1 ??
        '''outputs = torch.zeros(x.shape[0], self.length, 3).cuda()
        for i in range(self.length):
            out, h = self.GRU_dec(x, h)
            outputs[:, i, :] = out[:, 0, :]'''
        # method 2 ??
        x = x.repeat(1, self.length, 1)
        outputs, _ = self.GRU_dec(x, h)
        # linear layer
        outputs = self.fc_dec(outputs)
        return outputs

    def forward(self, x):
        self.indices = []
        latent = self.enc(x)
        output = self.dec(latent)
        return output
I am not sure whether this is the right way to do a one-to-many GRU. Could I have some opinions on this?
Thanks for reading!
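In case it helps reviewers, here is a minimal shape check of the class above; it is a sketch that assumes opt is unused (as in the code) and that a CUDA device is available:

model = AEGRU(opt=None).cuda()
x = torch.randn(8, 256, 3).cuda()   # (batch, length, 3)
latent = model.enc(x)               # expected: torch.Size([8, 256])
recon = model(x)                    # expected: torch.Size([8, 256, 3])
print(latent.shape, recon.shape)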

Getting a shape mismatch error between the shapes of labels and logits

I'm trying to implement an attention mechanism, and while returning the tensor I'm getting the following error:
ValueError: Shape mismatch: The shape of labels (received (64, 53)) should equal the shape of logits except for the last dimension (received (64, 1, 500)).
Please find the code below.
Here is the code for the attention; please correct me if it is wrong:
class Attention(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()

    def call(self,enc_op,hidden_state):
        # print(enc_op.shape,hidden_state.shape)
        query_with_time_axis = tf.expand_dims(hidden_state, 1)
        context_vector = tf.matmul(enc_op,tf.transpose(query_with_time_axis,perm=[0,2,1]))
        context_vector = tf.nn.softmax(context_vector,axis=1)
        context_vector = context_vector * enc_op
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector
Here is the decoder part; I'm calling the attention from here:
class Decoder(tf.keras.layers.Layer):
    def __init__(self,vocab_size,embedding_dim,input_length,dec_units):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dec_units = dec_units
        self.input_length = input_length
        self.attention = Attention()

    def build(self,input_shape):
        self.embedding = Embedding(input_dim=self.vocab_size,output_dim = self.embedding_dim,input_shape = input_shape,
                                   mask_zero = True, name = "embedding_layer_decoder")
        self.lstm = LSTM(self.dec_units,return_sequences=True,return_state=True,name = "Decoder_LSTM")

    def call(self,target_sentances,enc_op,hidden_state,cell_state):
        target_embed = self.embedding(target_sentances)
        for i in range(target_embed.shape[1]):
            context_vector = self.attention(enc_op,hidden_state)
            y = tf.concat([context_vector, target_embed[:,i,:]], axis=-1)
            y = tf.expand_dims(y, 1)
            lstm_output,hidden_state,_ = self.lstm(y,initial_state = [hidden_state,cell_state])
        return lstm_output
class Mymodel(Model):
    def __init__(self,encoder_inputs_length,decoder_inputs_length,output_vocab_size):
        super().__init__()
        self.encoder = Encoder(vocab_size = 500, embedding_dim = 50, input_length = encoder_inputs_length, enc_units=64)
        self.decoder = Decoder(vocab_size = 500, embedding_dim = 50, input_length = decoder_inputs_length, dec_units=64)
        self.dense = Dense(output_vocab_size,activation = "softmax")

    def call(self,data):
        input,output = data[0],data[1]
        print(input.shape,output.shape)
        encoder_output,encoder_h,encoder_c = self.encoder(input)
        print("="*20, "ENCODER", "="*20)
        print("-"*35)
        print(encoder_output)
        print("ENCODER ==> OUTPUT SHAPE",encoder_output.shape)
        print("ENCODER ==> HIDDEN STATE SHAPE",encoder_h.shape)
        print("ENCODER ==> CELL STATE SHAPE", encoder_c.shape)
        print("="*20,"Decoder","="*20)
        decoder_output = self.decoder(output,encoder_output,encoder_h,encoder_c)
        output1 = self.dense(decoder_output)
        print("-"*35)
        print("Final output shape",output.shape)
        print("="*50)
        return output1

model = Mymodel(encoder_inputs_length=30,decoder_inputs_length=20,output_vocab_size=500)

ENCODER_SEQ_LEN = 30
DECODER_SEQ_LEN = 20

optimizer = tf.keras.optimizers.Adam()
model.compile(optimizer=optimizer,loss=tf.keras.losses.SparseCategoricalCrossentropy())

for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    model.fit([inp, targ], targ, steps_per_epoch=1)
The shapes of my input and target are (64, 55) and (64, 53), where 64 is the batch size.
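For reference, the decoder loop above keeps only the lstm_output of the final step, so the logits end up with a time dimension of 1. One way the loop could accumulate an output per target step is sketched below; this is an illustration only and has not been verified against the full model:

# a possible shape of Decoder.call that keeps one output per target step (sketch only)
def call(self, target_sentances, enc_op, hidden_state, cell_state):
    target_embed = self.embedding(target_sentances)
    outputs = []
    for i in range(target_embed.shape[1]):
        context_vector = self.attention(enc_op, hidden_state)
        y = tf.expand_dims(tf.concat([context_vector, target_embed[:, i, :]], axis=-1), 1)
        lstm_output, hidden_state, _ = self.lstm(y, initial_state=[hidden_state, cell_state])
        outputs.append(lstm_output)          # each step: (batch, 1, dec_units)
    return tf.concat(outputs, axis=1)        # (batch, T, dec_units), matching the (64, 53) labels in time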

PyTorch to Keras model

I'm trying to replicate a model, but I'm having difficulties doing so with Keras. Here is my current implementation:
filters = 256
kernel_size = 3
strides = 1

# Head module
input = Input(shape=(img_height//scale_fact, img_width//scale_fact, img_depth))
conv0 = Conv2D(filters, kernel_size, strides=strides, padding='same',
               kernel_regularizer=regularizers.l2(0.01))(input)

# Body module
res = Conv2D(filters, kernel_size, strides=strides, padding='same')(conv0)
act = ReLU()(res)
res = Conv2D(filters, kernel_size, strides=strides, padding='same')(act)
res_rec = Add()([conv0, res])

for i in range(res_blocks):
    res1 = Conv2D(filters, kernel_size, strides=strides, padding='same')(res_rec)
    act = ReLU()(res1)
    res2 = Conv2D(filters, kernel_size, strides=strides, padding='same')(act)
    res_rec = Add()([res_rec, res2])

conv = Conv2D(filters, kernel_size, strides=strides, padding='same',
              kernel_regularizer=regularizers.l2(0.01))(res_rec)
add = Add()([conv0, conv])

# Tail module
conv = Conv2D(filters, kernel_size, strides=strides, padding='same',
              kernel_regularizer=regularizers.l2(0.01))(add)
act = ReLU()(conv)
up = UpSampling2D(size=scale_fact if scale_fact != 4 else 2)(act)  # TODO: try "Conv2DTranspose"
# mul = Multiply([np.zeros((img_width,img_height,img_depth)).fill(0.1), up])(up)

# When it's a 4X factor, we want the upscale split in two procedures
if(scale_fact == 4):
    conv = Conv2D(filters, kernel_size, strides=strides, padding='same',
                  kernel_regularizer=regularizers.l2(0.01))(up)
    act = ReLU()(conv)
    up = UpSampling2D(size=2)(act)  # TODO: try "Conv2DTranspose"

output = Conv2D(filters=3,
                kernel_size=1,
                strides=1,
                padding='same',
                kernel_regularizer=regularizers.l2(0.01))(up)

model = Model(inputs=input, outputs=output)
Here is a link to the file I'm trying to replicate. How am I supposed to replicate this custom PyTorch UpSampler that implements a customized PixelShuffling method?
Here is the relevant part of the UpSampler that I'm having trouble with, for the most part:
import tensorflow as tf
import tensorflow.contrib.slim as slim
"""
Method to upscale an image using
conv2d transpose. Based on upscaling
method defined in the paper
x: input to be upscaled
scale: scale increase of upsample
features: number of features to compute
activation: activation function
"""
def upsample(x,scale=2,features=64,activation=tf.nn.relu):
    assert scale in [2,3,4]
    x = slim.conv2d(x,features,[3,3],activation_fn=activation)
    if scale == 2:
        ps_features = 3*(scale**2)
        x = slim.conv2d(x,ps_features,[3,3],activation_fn=activation)
        #x = slim.conv2d_transpose(x,ps_features,6,stride=1,activation_fn=activation)
        x = PS(x,2,color=True)
    elif scale == 3:
        ps_features = 3*(scale**2)
        x = slim.conv2d(x,ps_features,[3,3],activation_fn=activation)
        #x = slim.conv2d_transpose(x,ps_features,9,stride=1,activation_fn=activation)
        x = PS(x,3,color=True)
    elif scale == 4:
        ps_features = 3*(2**2)
        for i in range(2):
            x = slim.conv2d(x,ps_features,[3,3],activation_fn=activation)
            #x = slim.conv2d_transpose(x,ps_features,6,stride=1,activation_fn=activation)
            x = PS(x,2,color=True)
    return x
"""
Borrowed from https://github.com/tetrachrome/subpixel
Used for subpixel phase shifting after deconv operations
"""
def _phase_shift(I, r):
    bsize, a, b, c = I.get_shape().as_list()
    bsize = tf.shape(I)[0]  # Handling Dimension(None) type for undefined batch dim
    X = tf.reshape(I, (bsize, a, b, r, r))
    X = tf.transpose(X, (0, 1, 2, 4, 3))  # bsize, a, b, 1, 1
    X = tf.split(X, a, 1)  # a, [bsize, b, r, r]
    X = tf.concat([tf.squeeze(x, axis=1) for x in X], 2)  # bsize, b, a*r, r
    X = tf.split(X, b, 1)  # b, [bsize, a*r, r]
    X = tf.concat([tf.squeeze(x, axis=1) for x in X], 2)  # bsize, a*r, b*r
    return tf.reshape(X, (bsize, a*r, b*r, 1))
"""
Borrowed from https://github.com/tetrachrome/subpixel
Used for subpixel phase shifting after deconv operations
"""
def PS(X, r, color=False):
    if color:
        Xc = tf.split(X, 3, 3)
        X = tf.concat([_phase_shift(x, r) for x in Xc], 3)
    else:
        X = _phase_shift(X, r)
    return X
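On the Keras side, it may be worth noting that the sub-pixel rearrangement performed by PS/_phase_shift above (and by PyTorch's PixelShuffle) is also available as tf.nn.depth_to_space. A hedged sketch of how it could slot into the Keras model above, assuming the preceding convolution produces 3 * scale**2 channels and that `act` is the tensor from that model:

import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Lambda

scale = 2
# produce 3 * scale**2 feature maps, then rearrange them into a 3-channel image upscaled by `scale`
x = Conv2D(3 * scale ** 2, 3, padding='same')(act)
up = Lambda(lambda t: tf.nn.depth_to_space(t, scale))(x)   # sub-pixel / pixel-shuffle upscaling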

TensorFlow: ValueError: Can't load save_path when it is None

import os
import tarfile
from six.moves import urllib

URL = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
PATH = 'aclImdb'

def fetch_data(url = URL, path = PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    file_path = os.path.join(path, "aclImdb_v1.tar.gz")
    urllib.request.urlretrieve(url, file_path)
    file_gz = tarfile.open(file_path)
    file_gz.extractall(path = path)
    file_gz.close()
import pyprind  # for progress visualisation
import pandas as pd

PATH = 'aclImdb'
labels = {'pos': 1, 'neg': 0}  # int class labels for 'positive' and 'negative'
pbar = pyprind.ProgBar(50000)  # initialise a progress bar with 50k iterations = no. of docs
df = pd.DataFrame()
# use nested for loops to iterate over 'train' & 'test' subdirs
for s in ('test', 'train'):
    for l in ('pos', 'neg'):  # and read text files from 'pos' and 'neg' subdirs
        path = os.path.join(PATH, s, l)
        for file in os.listdir(path):
            # append to the df pandas DataFrame with an int class (pos = 1, neg = 0)
            with open(os.path.join(path, file), 'r', encoding = 'utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index = True)
            pbar.update()
df.columns = ['review', 'sentiment']

import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index = False, encoding = 'utf-8')
n_words = max(list(word_to_int.values())) + 1
df = pd.read_csv('movie_data.csv', encoding = 'utf-8')
df.head(3)
# Separate words and count each word's occurrence
import pyprind  # for progress visualisation
from collections import Counter
from string import punctuation
import re

counts = Counter()  # collects the counts of occurrence of each unique word
pbar = pyprind.ProgBar(len(df['review']),
                       title = 'Counting word occurences...')  # progress bar
for i, review in enumerate(df['review']):
    text = ''.join([c if c not in punctuation else ' '+c+' '
                    for c in review]).lower()
    df.loc[i, 'review'] = text
    pbar.update()
    counts.update(text.split())

# Mapping each unique word to an int
word_counts = sorted(counts, key = counts.get, reverse = True)
print(word_counts[:5])
word_to_int = {word: ii for ii, word in enumerate(word_counts, 1)}

mapped_reviews = []
pbar = pyprind.ProgBar(len(df['review']),
                       title = 'Map movie reviews to integers...')

# Left-pad with zeros if the sequence length < 200
# Use 200 elements if the length > 200
sequence_length = 200
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype = int)
for i, row in enumerate(mapped_reviews):
    review_arr = np.array(row)
    sequences[i, -len(row):] = review_arr[-sequence_length:]
# Split the dataset into training and test sets
X_train = sequences[:25000, :]
y_train = df.loc[:25000, 'sentiment'].values
X_test = sequences[25000:, :]
y_test = df.loc[25000:, 'sentiment'].values

# Define the mini-batches generator
np.random.seed(123)

def batch_gen(x, y = None, batch_size = 64):
    n_batches = len(x) // batch_size
    x = x[:n_batches * batch_size]
    if y is not None:
        y = y[:n_batches * batch_size]
    for ii in range(0, len(x), batch_size):
        if y is not None:
            yield x[ii : ii + batch_size], y[ii : ii + batch_size]
        else:
            yield x[ii : ii + batch_size]
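A quick usage sketch of the generator above (assuming X_train and y_train have already been built as shown):

gen = batch_gen(X_train, y_train, batch_size=64)
xb, yb = next(gen)
print(xb.shape, yb.shape)   # expected: (64, 200) (64,)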
import tensorflow as tf
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' ## suppress the 3.5 warning if using TF 1.4
class SentimentRNN(object):
    # Define __init__
    def __init__(self,
                 n_words,
                 seq_len = 200,
                 lstm_size = 256,
                 num_layers = 1,
                 batch_size = 64,
                 learning_rate = 0.0001,
                 embed_size = 200):
        self.n_words = n_words
        self.seq_len = seq_len
        self.lstm_size = lstm_size  # no. of hidden units
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size
        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()

    # Define the build method
    def build(self):
        # Define the placeholders
        tf_x = tf.placeholder(tf.int32,
                              shape = (self.batch_size, self.seq_len),
                              name = 'tf_x')
        tf_y = tf.placeholder(tf.float32,
                              shape = (self.batch_size),
                              name = 'tf_y')
        tf_keepprob = tf.placeholder(tf.float32,
                                     name = 'tf_keepprob')
        # Create the embedding layer
        embedding = tf.Variable(
            tf.random_uniform(
                shape = (self.n_words, self.embed_size),
                minval = -1,
                maxval = 1),
            name = 'embedding')
        embed_x = tf.nn.embedding_lookup(embedding,
                                         tf_x,
                                         name = 'embed_x')
        # Define LSTM cells and stack them
        cells = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(num_units = self.lstm_size),
                output_keep_prob = tf_keepprob)
             for i in range(self.num_layers)])
        # Define the initial state:
        self.initial_state = cells.zero_state(
            self.batch_size, tf.float32)
        print(' << initial state >> ', self.initial_state)
        # Put together components with tf.nn.dynamic_rnn
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
            cell = cells,
            inputs = embed_x,
            initial_state = self.initial_state)
        ## lstm_outputs shape: [batch_size, max_time, cells.output_size]
        print('\n << lstm_output >> ', lstm_outputs)
        print('\n << final state >> ', self.final_state)
        # Apply a fully-connected layer on the RNN output
        logits = tf.layers.dense(
            inputs = lstm_outputs[:, -1],
            units = 1,  # dimensionality of the output space
            activation = None,
            name = 'logits')
        # Remove dimensions of size 1 from the tensor shape
        logits = tf.squeeze(input = logits,
                            name = 'logits_squeezed')
        print('\n << logits >> ', logits)
        # If you want prob's
        y_proba = tf.nn.sigmoid(logits, name = 'probabilities')
        predictions = {'probabilities' : y_proba,
                       'labels' : tf.cast(tf.round(y_proba),
                                          tf.int32,
                                          name = 'labels')}
        print('\n << predictions >> ', predictions)
        # Define the cost function
        cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels = tf_y,
                logits = logits),
            name = 'cost')
        # Define the optimiser
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name = 'train_op')

    # Define the train method
    def train(self, X_train, y_train, num_epochs):
        with tf.Session(graph = self.g) as sess:
            sess.run(self.init_op)
            iteration = 1
            for epoch in range(num_epochs):
                state = sess.run(self.initial_state)
                for batch_x, batch_y in batch_gen(
                        X_train,
                        y_train,
                        batch_size = self.batch_size):
                    feed = {'tf_x:0' : batch_x,
                            'tf_y:0' : batch_y,
                            'tf_keepprob:0' : 0.5,
                            self.initial_state : state}
                    loss, _, state = sess.run(
                        ['cost:0',
                         'train_op',
                         self.final_state],
                        feed_dict = feed)
                    if iteration % 20 == 0:
                        print("Epoch: %d/%d Iteration: %d "
                              "| Train loss: %.5f" % (
                                  epoch + 1,
                                  num_epochs,
                                  iteration,
                                  loss))
                    iteration += 1
                if (epoch + 1) % 10 == 0:
                    self.saver.save(
                        sess,
                        "model/sentiment-%d.ckpt" % epoch)

    # Define the predict method
    def predict(self, X_data, return_proba=False):
        preds = []
        with tf.Session(graph = self.g) as sess:
            self.saver.restore(
                sess,
                tf.train.latest_checkpoint('model/'))
            test_state = sess.run(self.initial_state)
            for ii, batch_x in enumerate(batch_gen(
                    x = X_data,
                    y = None,
                    batch_size = self.batch_size), 1):
                feed = {'tf_x:0' : batch_x,
                        'tf_keepprob:0' : 1.0,
                        self.initial_state : test_state}
                if return_proba:
                    pred, test_state = sess.run(
                        ['probabilities:0', self.final_state],
                        feed_dict = feed)
                else:
                    pred, test_state = sess.run(
                        ['labels:0', self.final_state],
                        feed_dict = feed)
                preds.append(pred)
        return np.concatenate(preds)
for review in df['review']:
    mapped_reviews.append([word_to_int[word] for word in review.split()])
    pbar.update()

rnn = SentimentRNN(n_words = n_words,
                   seq_len = sequence_length,
                   embed_size = 256,
                   lstm_size = 128,
                   num_layers = 1,
                   batch_size = 100,
                   learning_rate = 0.001)

preds = rnn.predict(X_test)
y_true = y_test[:len(preds)]
print('Test accuracy... %.3f' % (np.sum(preds == y_true) / len(y_true)))
Create an object of the SentimentRNN class with the following parameters:
n_words = n_words, seq_len = sequence_length, embed_size = 256, lstm_size = 128, num_layers = 1, batch_size = 100, learning_rate = 0.001.
Since we have a relatively small dataset, a single layer (num_layers = 1) may generalise better.
ValueError Traceback (most recent call last)
<ipython-input-23-a3cfe03a9a49> in <module>()
----> 1 preds = rnn.predict(X_test)
2 y_true = y_test[:len(preds)]
3 print('Test accuracy... %.3f' % (np.sum(preds == y_true) / len(y_true)))
<ipython-input-12-d83ee67c43b6> in predict(self, X_data, return_proba)
173 self.saver.restore(
174 sess,
--> 175 tf.train.latest_checkpoint('model/'))
176 test_state = sess.run(self.initial_state)
177
/usr/local/anaconda/lib/python3.6/site-packages/tensorflow/python/training/saver.py in restore(self, sess, save_path)
1680 return
1681 if save_path is None:
-> 1682 raise ValueError("Can't load save_path when it is None.")
1683 logging.info("Restoring parameters from %s", save_path)
1684 if context.in_graph_mode():
ValueError: Can't load save_path when it is None.
The error just means tf.train.latest_checkpoint didn't find anything: it returns None, and the Saver then complains because it was passed None. So there is no checkpoint in that directory.
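A small guard before restoring makes that failure mode explicit (a sketch for the predict method, assuming checkpoints are written to model/ by the train method as shown above):

ckpt = tf.train.latest_checkpoint('model/')   # returns None when no checkpoint file exists yet
if ckpt is None:
    raise FileNotFoundError("No checkpoint found in 'model/'; run rnn.train(...) first so a checkpoint gets saved.")
self.saver.restore(sess, ckpt)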

Resources