I have the following network where I am trying to do triplet loss:
First, I have a custom Convolution class ConvBlock(nn.Module):
def __init__(self, ngpu, input_c, output_c, mode=0):
super(ConvBlock, self).__init__()
self.ngpu = ngpu
self.input_c = input_c
self.output_c = output_c
self.mode = mode
self.b1 = nn.Sequential(
nn.Conv2d(input_c, output_c, 3, stride=1, padding=1),
#nn.BatchNorm2d(output_c),
nn.PReLU(),
)
self.b2 = nn.Sequential(
nn.Conv2d(output_c, output_c, 3, stride=1, padding=1),
#nn.BatchNorm2d(output_c),
nn.PReLU(),
)
self.pool = nn.Sequential(
nn.MaxPool2d(2, 2),
)
def forward(self, input):
batch_size = input.size(0)
if self.mode == 0:
b1 = self.b1(input)
hidden = self.pool(b1)
return hidden, b1
elif self.mode == 1:
b1 = self.b1(input)
b2 = self.b2(b1)
hidden = self.pool(b2)
return hidden, b2
elif self.mode == 2:
b1 = self.b1(input)
hidden = self.b2(b1)
return hidden
I now have an encoder module:
class _Encoder(nn.Module):
def __init__(self, ngpu,nc,nef,out_size,nz):
super(_Encoder, self).__init__()
self.ngpu = ngpu
self.nc = nc
self.nef = nef
self.out_size = out_size
self.nz = nz
self.c1 = ConvBlock(self.ngpu, nc, nef, 0) # 3 - 64
self.c2 = ConvBlock(self.ngpu, nef, nef*2, 0) # 64-128
self.c3 = ConvBlock(self.ngpu, nef*2, nef*4, 1) # 128-256
self.c4 = ConvBlock(self.ngpu, nef*4, nef*8, 1) # 256 -512
self.c5 = ConvBlock(self.ngpu, nef*8, nef*8, 2) # 512-512
# 8 because..the depth went from 32 to 32*8
self.mean = nn.Linear(nef * 8 * out_size * (out_size/2), nz)
self.logvar = nn.Linear(nef * 8 * out_size * (out_size/2), nz)
#for reparametrization trick
def sampler(self, mean, logvar):
std = logvar.mul(0.5).exp_()
if args.cuda:
eps = torch.cuda.FloatTensor(std.size()).normal_()
else:
eps = torch.FloatTensor(std.size()).normal_()
eps = Variable(eps)
return eps.mul(std).add_(mean)
def forward(self, input):
batch_size = input.size(0)
if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1:
c1_out, c1_x = nn.parallel.data_parallel(self.c1, input, range(self.ngpu))
c2_out, c2_x = nn.parallel.data_parallel(self.c2, c1_out, range(self.ngpu))
c3_out, c3_x = nn.parallel.data_parallel(self.c3, c2_out, range(self.ngpu))
c4_out, c4_x = nn.parallel.data_parallel(self.c4, c3_out, range(self.ngpu))
hidden = nn.parallel.data_parallel(self.c5, c4_out, range(self.ngpu))
# hidden = nn.parallel.data_parallel(self.encoder, input, range(self.ngpu))
hidden = hidden.view(batch_size, -1)
mean = nn.parallel.data_parallel(self.mean, hidden, range(self.ngpu))
logvar = nn.parallel.data_parallel(self.logvar, hidden, range(self.ngpu))
else:
c1_out, c1_x = self.c1(input)
c2_out, c2_x = self.c2(c1_out)
c3_out, c3_x = self.c3(c2_out)
c4_out, c4_x = self.c4(c3_out)
hidden = self.c5(c4_out)
# hidden = self.encoder(input)
hidden = hidden.view(batch_size, -1)
mean, logvar = self.mean(hidden), self.logvar(hidden)
latent_z = self.sampler(mean, logvar)
if ADD_SKIP_CONNECTION:
return latent_z,mean,logvar,{"c1_x":c1_x, "c2_x":c2_x, "c3_x":c3_x, "c4_x":c4_x}
else:
return latent_z,mean,logvar,{"c1_x":None, "c2_x":None, "c3_x":None, "c4_x":None}
I initialize my encoder as a single object:
encoder = _Encoder(ngpu,nc,nef,out_size,nz)
encoder = encoder.cuda()
And then I am applying some functions:
latent_x,mean_x,logvar_x,skip_x = self.encoder(x)
latent_y,mean_y,logvar_y,skip_y = self.encoder(y)
latent_z,mean_z,logvar_z,skip_z = self.encoder(z)
dist_a = F.pairwise_distance(mean_x, mean_y, 2)
dist_b = F.pairwise_distance(mean_x, mean_z, 2)
loss_triplet = triplet_loss(dist_a, dist_b, target)
optimizer.zero_grad()
loss_triplet.backward()
optimizer.step()
I am starting to doubt if the weights are actually being shared across the 3 encoder blocks. Please help me check an tell me if it does
Related
I used the code implemented by bigmb, but the output of R2AttUnet in .train() mode are 10 times bigger than in .eval() mode. The U_net and AttU_net are good. But R2AttU_net and R2U_net always have this problem. I think it's because the same BatchNorm is used in Recurrent_block. But I have no idea how to fix it. Can anyone help me?
from __future__ import print_function, division
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
import torch
from torch.nn import init
class conv_block(nn.Module):
"""
Convolution Block
"""
def __init__(self, in_ch, out_ch):
super(conv_block, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=True),
nn.BatchNorm2d(out_ch),
nn.ReLU(inplace=True),
nn.Conv2d(out_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=True),
nn.BatchNorm2d(out_ch),
nn.ReLU(inplace=True))
def forward(self, x):
x = self.conv(x)
return x
class up_conv(nn.Module):
"""
Up Convolution Block
"""
def __init__(self, in_ch, out_ch):
super(up_conv, self).__init__()
self.up = nn.Sequential(
nn.Upsample(scale_factor=2),
nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=True),
nn.BatchNorm2d(out_ch),
nn.ReLU(inplace=True)
)
def forward(self, x):
x = self.up(x)
return x
### initalize the module
def init_weights(net, init_type='normal'):
#print('initialization method [%s]' % init_type)
if init_type == 'kaiming':
net.apply(weights_init_kaiming)
else:
raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
def weights_init_kaiming(m):
classname = m.__class__.__name__
#print(classname)
if classname.find('Conv') != -1:
init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
elif classname.find('Linear') != -1:
init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
elif classname.find('BatchNorm') != -1:
init.normal_(m.weight.data, 1.0, 0.02)
init.constant_(m.bias.data, 0.0)
class U_Net(nn.Module):
"""
UNet - Basic Implementation
Paper : https://arxiv.org/abs/1505.04597
"""
def __init__(self, in_ch=3, out_ch=1):
super(U_Net, self).__init__()
n1 = 64
filters = [n1, n1 * 2, n1 * 4, n1 * 8, n1 * 16]
self.Maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)
self.Maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)
self.Maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2)
self.Maxpool4 = nn.MaxPool2d(kernel_size=2, stride=2)
self.Conv1 = conv_block(in_ch, filters[0])
self.Conv2 = conv_block(filters[0], filters[1])
self.Conv3 = conv_block(filters[1], filters[2])
self.Conv4 = conv_block(filters[2], filters[3])
self.Conv5 = conv_block(filters[3], filters[4])
self.Up5 = up_conv(filters[4], filters[3])
self.Up_conv5 = conv_block(filters[4], filters[3])
self.Up4 = up_conv(filters[3], filters[2])
self.Up_conv4 = conv_block(filters[3], filters[2])
self.Up3 = up_conv(filters[2], filters[1])
self.Up_conv3 = conv_block(filters[2], filters[1])
self.Up2 = up_conv(filters[1], filters[0])
# self.Up_conv2 = conv_block(filters[1], filters[0])
self.Up_conv2 = conv_block(filters[1], filters[1])
self.Conv = nn.Conv2d(filters[1], out_ch, kernel_size=1, stride=1, padding=0)
self.active = torch.nn.Sigmoid()
# self.active = torch.nn.Tanh()
# initialise weights
for m in self.modules():
if isinstance(m, nn.Conv2d):
init_weights(m, init_type='kaiming')
elif isinstance(m, nn.BatchNorm2d):
init_weights(m, init_type='kaiming')
def forward(self, x):
# print('in:', x.max(), x.min())
e1 = self.Conv1(x)
e2 = self.Maxpool1(e1)
e2 = self.Conv2(e2)
e3 = self.Maxpool2(e2)
e3 = self.Conv3(e3)
e4 = self.Maxpool3(e3)
e4 = self.Conv4(e4)
e5 = self.Maxpool4(e4)
e5 = self.Conv5(e5)
# print('e5:', e5.max(), e5.min())
d5 = self.Up5(e5)
d5 = torch.cat((e4, d5), dim=1)
d5 = self.Up_conv5(d5)
d4 = self.Up4(d5)
d4 = torch.cat((e3, d4), dim=1)
d4 = self.Up_conv4(d4)
d3 = self.Up3(d4)
d3 = torch.cat((e2, d3), dim=1)
d3 = self.Up_conv3(d3)
d2 = self.Up2(d3)
d2 = torch.cat((e1, d2), dim=1)
# d2 = self.Up_conv2(d2)
# print('d2:', d2.max(), d2.min())
out = self.Conv(d2)
# print('out:', out.max(), out.min())
out = self.active(out)
# print('out:', out.max(), out.min())
# out = self.f_conv(d2)
# print('out:', out.max(), out.min())
return out
class Recurrent_block(nn.Module):
"""
Recurrent Block for R2Unet_CNN
"""
def __init__(self, out_ch, t=2):
super(Recurrent_block, self).__init__()
self.t = t
self.out_ch = out_ch
self.conv = nn.Sequential(
nn.Conv2d(out_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=True),
nn.BatchNorm2d(out_ch),
nn.ReLU(inplace=True)
)
def forward(self, x):
for i in range(self.t):
if i == 0:
x = self.conv(x)
out = self.conv(x + x)
return out
class RRCNN_block(nn.Module):
"""
Recurrent Residual Convolutional Neural Network Block
"""
def __init__(self, in_ch, out_ch, t=2):
super(RRCNN_block, self).__init__()
self.RCNN = nn.Sequential(
Recurrent_block(out_ch, t=t),
Recurrent_block(out_ch, t=t)
)
self.Conv = nn.Conv2d(in_ch, out_ch, kernel_size=1, stride=1, padding=0)
def forward(self, x):
x1 = self.Conv(x)
x2 = self.RCNN(x1)
out = x1 + x2
return out
class R2U_Net(nn.Module):
"""
R2U-Unet implementation
Paper: https://arxiv.org/abs/1802.06955
"""
def __init__(self, img_ch=3, output_ch=1, t=2):
super(R2U_Net, self).__init__()
n1 = 64
filters = [n1, n1 * 2, n1 * 4, n1 * 8, n1 * 16]
self.Maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
self.Maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)
self.Maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)
self.Maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2)
self.Upsample = nn.Upsample(scale_factor=2)
self.RRCNN1 = RRCNN_block(img_ch, filters[0], t=t)
self.RRCNN2 = RRCNN_block(filters[0], filters[1], t=t)
self.RRCNN3 = RRCNN_block(filters[1], filters[2], t=t)
self.RRCNN4 = RRCNN_block(filters[2], filters[3], t=t)
self.RRCNN5 = RRCNN_block(filters[3], filters[4], t=t)
self.Up5 = up_conv(filters[4], filters[3])
self.Up_RRCNN5 = RRCNN_block(filters[4], filters[3], t=t)
self.Up4 = up_conv(filters[3], filters[2])
self.Up_RRCNN4 = RRCNN_block(filters[3], filters[2], t=t)
self.Up3 = up_conv(filters[2], filters[1])
self.Up_RRCNN3 = RRCNN_block(filters[2], filters[1], t=t)
self.Up2 = up_conv(filters[1], filters[0])
self.Up_RRCNN2 = RRCNN_block(filters[1], filters[0], t=t)
self.Conv = nn.Conv2d(filters[1], output_ch, kernel_size=1, stride=1, padding=0)
self.active = torch.nn.Sigmoid()
def forward(self, x):
e1 = self.RRCNN1(x)
e2 = self.Maxpool(e1)
e2 = self.RRCNN2(e2)
e3 = self.Maxpool1(e2)
e3 = self.RRCNN3(e3)
e4 = self.Maxpool2(e3)
e4 = self.RRCNN4(e4)
e5 = self.Maxpool3(e4)
e5 = self.RRCNN5(e5)
d5 = self.Up5(e5)
d5 = torch.cat((e4, d5), dim=1)
d5 = self.Up_RRCNN5(d5)
d4 = self.Up4(d5)
d4 = torch.cat((e3, d4), dim=1)
d4 = self.Up_RRCNN4(d4)
d3 = self.Up3(d4)
d3 = torch.cat((e2, d3), dim=1)
d3 = self.Up_RRCNN3(d3)
d2 = self.Up2(d3)
d2 = torch.cat((e1, d2), dim=1)
# d2 = self.Up_RRCNN2(d2)
out = self.Conv(d2)
out = self.active(out)
return out
class Attention_block(nn.Module):
"""
Attention Block
"""
def __init__(self, F_g, F_l, F_int):
super(Attention_block, self).__init__()
self.W_g = nn.Sequential(
nn.Conv2d(F_l, F_int, kernel_size=1, stride=1, padding=0, bias=True),
nn.BatchNorm2d(F_int)
)
self.W_x = nn.Sequential(
nn.Conv2d(F_g, F_int, kernel_size=1, stride=1, padding=0, bias=True),
nn.BatchNorm2d(F_int)
)
self.psi = nn.Sequential(
nn.Conv2d(F_int, 1, kernel_size=1, stride=1, padding=0, bias=True),
nn.BatchNorm2d(1),
nn.Sigmoid()
)
self.relu = nn.ReLU(inplace=True)
def forward(self, g, x):
g1 = self.W_g(g)
x1 = self.W_x(x)
psi = self.relu(g1 + x1)
psi = self.psi(psi)
out = x * psi
return out
class AttU_Net(nn.Module):
"""
Attention Unet implementation
Paper: https://arxiv.org/abs/1804.03999
"""
def __init__(self, img_ch=3, output_ch=1):
super(AttU_Net, self).__init__()
n1 = 64
filters = [n1, n1 * 2, n1 * 4, n1 * 8, n1 * 16]
self.Maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)
self.Maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)
self.Maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2)
self.Maxpool4 = nn.MaxPool2d(kernel_size=2, stride=2)
self.Conv1 = conv_block(img_ch, filters[0])
self.Conv2 = conv_block(filters[0], filters[1])
self.Conv3 = conv_block(filters[1], filters[2])
self.Conv4 = conv_block(filters[2], filters[3])
self.Conv5 = conv_block(filters[3], filters[4])
self.Up5 = up_conv(filters[4], filters[3])
self.Att5 = Attention_block(F_g=filters[3], F_l=filters[3], F_int=filters[2])
self.Up_conv5 = conv_block(filters[4], filters[3])
self.Up4 = up_conv(filters[3], filters[2])
self.Att4 = Attention_block(F_g=filters[2], F_l=filters[2], F_int=filters[1])
self.Up_conv4 = conv_block(filters[3], filters[2])
self.Up3 = up_conv(filters[2], filters[1])
self.Att3 = Attention_block(F_g=filters[1], F_l=filters[1], F_int=filters[0])
self.Up_conv3 = conv_block(filters[2], filters[1])
self.Up2 = up_conv(filters[1], filters[0])
self.Att2 = Attention_block(F_g=filters[0], F_l=filters[0], F_int=32)
self.Up_conv2 = conv_block(filters[1], filters[0])
self.Conv = nn.Conv2d(filters[1], output_ch, kernel_size=1, stride=1, padding=0)
self.active = torch.nn.Sigmoid()
def forward(self, x):
e1 = self.Conv1(x)
e2 = self.Maxpool1(e1)
e2 = self.Conv2(e2)
e3 = self.Maxpool2(e2)
e3 = self.Conv3(e3)
e4 = self.Maxpool3(e3)
e4 = self.Conv4(e4)
e5 = self.Maxpool4(e4)
e5 = self.Conv5(e5)
#print(x5.shape)
d5 = self.Up5(e5)
#print(d5.shape)
x4 = self.Att5(g=d5, x=e4)
d5 = torch.cat((x4, d5), dim=1)
d5 = self.Up_conv5(d5)
d4 = self.Up4(d5)
x3 = self.Att4(g=d4, x=e3)
d4 = torch.cat((x3, d4), dim=1)
d4 = self.Up_conv4(d4)
d3 = self.Up3(d4)
x2 = self.Att3(g=d3, x=e2)
d3 = torch.cat((x2, d3), dim=1)
d3 = self.Up_conv3(d3)
d2 = self.Up2(d3)
x1 = self.Att2(g=d2, x=e1)
d2 = torch.cat((x1, d2), dim=1)
# d2 = self.Up_conv2(d2)
out = self.Conv(d2)
out = self.active(out)
return out
class R2AttU_Net(nn.Module):
"""
Residual Recuurent Block with attention Unet
Implementation : https://github.com/LeeJunHyun/Image_Segmentation
"""
def __init__(self, in_ch=3, out_ch=1, t=2):
super(R2AttU_Net, self).__init__()
n1 = 64
filters = [n1, n1 * 2, n1 * 4, n1 * 8, n1 * 16]
self.Maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)
self.Maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)
self.Maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2)
self.Maxpool4 = nn.MaxPool2d(kernel_size=2, stride=2)
self.RRCNN1 = RRCNN_block(in_ch, filters[0], t=t)
self.RRCNN2 = RRCNN_block(filters[0], filters[1], t=t)
self.RRCNN3 = RRCNN_block(filters[1], filters[2], t=t)
self.RRCNN4 = RRCNN_block(filters[2], filters[3], t=t)
self.RRCNN5 = RRCNN_block(filters[3], filters[4], t=t)
self.Up5 = up_conv(filters[4], filters[3])
self.Att5 = Attention_block(F_g=filters[3], F_l=filters[3], F_int=filters[2])
self.Up_RRCNN5 = RRCNN_block(filters[4], filters[3], t=t)
self.Up4 = up_conv(filters[3], filters[2])
self.Att4 = Attention_block(F_g=filters[2], F_l=filters[2], F_int=filters[1])
self.Up_RRCNN4 = RRCNN_block(filters[3], filters[2], t=t)
self.Up3 = up_conv(filters[2], filters[1])
self.Att3 = Attention_block(F_g=filters[1], F_l=filters[1], F_int=filters[0])
self.Up_RRCNN3 = RRCNN_block(filters[2], filters[1], t=t)
self.Up2 = up_conv(filters[1], filters[0])
self.Att2 = Attention_block(F_g=filters[0], F_l=filters[0], F_int=32)
self.Up_RRCNN2 = RRCNN_block(filters[1], filters[1], t=t)
self.Conv = nn.Conv2d(filters[1], out_ch, kernel_size=1, stride=1, padding=0)
self.active = torch.nn.Sigmoid()
def forward(self, x):
e1 = self.RRCNN1(x)
e2 = self.Maxpool1(e1)
e2 = self.RRCNN2(e2)
e3 = self.Maxpool2(e2)
e3 = self.RRCNN3(e3)
e4 = self.Maxpool3(e3)
e4 = self.RRCNN4(e4)
e5 = self.Maxpool4(e4)
e5 = self.RRCNN5(e5)
d5 = self.Up5(e5)
e4 = self.Att5(g=d5, x=e4)
d5 = torch.cat((e4, d5), dim=1)
d5 = self.Up_RRCNN5(d5)
d4 = self.Up4(d5)
e3 = self.Att4(g=d4, x=e3)
d4 = torch.cat((e3, d4), dim=1)
d4 = self.Up_RRCNN4(d4)
d3 = self.Up3(d4)
e2 = self.Att3(g=d3, x=e2)
d3 = torch.cat((e2, d3), dim=1)
d3 = self.Up_RRCNN3(d3)
d2 = self.Up2(d3)
e1 = self.Att2(g=d2, x=e1)
d2 = torch.cat((e1, d2), dim=1)
d2 = self.Up_RRCNN2(d2)
out = self.Conv(d2)
out = self.active(out)
return out
Model build:
self.Generator = R2AttU_Net(3, self.n_bs*3).cuda()
Train:
self.Generator.train()
batch_size = target_n.shape[0]
out_img_delta_bs = self.Generator(target_n)
print(out_img_delta_bs.max(), out_img_delta_bs.min(), gt_target_delta_bs.max(), gt_target_delta_bs.min())
loss1 = self.L1_loss(out_img_delta_bs, gt_target_delta_bs)
loss = loss1
self.optim.zero_grad()
loss.backward()
self.optim.step()
Out Train after 200/1188 Iters:
tensor(0.2333, device='cuda:0', grad_fn=) tensor(-0.2569, device='cuda:0', grad_fn=) tensor(0.2999, device='cuda:0') tensor(-0.3227, device='cuda:0')
Eval:
self.Generator.eval()
with torch.no_grad():
out_img_delta_bs_ = self.Generator(target_n)
loss1_ = self.L1_loss(out_img_delta_bs_, gt_target_delta_bs)
loss_ = loss1_
print(out_img_delta_bs_.max(), out_img_delta_bs_.min(), gt_target_delta_bs.max(), gt_target_delta_bs.min())
Out Eval:
tensor(0.0087, device='cuda:0') tensor(-0.0210, device='cuda:0') tensor(0.4791, device='cuda:0') tensor(-0.4507, device='cuda:0')
How to fix the BatchNorm layers in Recurrent Block and Recurrent Residual Convolutional Neural Network Block?
Why this happened?
I am given this set of parameters where I need to code into a forward pass.
model_list = [dict(type='Conv2D', in_channels=3, out_channels=32, kernel_size=5, stride=1, padding=2),
dict(type='ReLU'),
dict(type='MaxPooling', kernel_size=2, stride=2),
dict(type='Linear', in_dim=8192, out_dim=10)]
criterion = dict(type='SoftmaxCrossEntropy')
def __init__(self, modules, criterion):
self.modules = []
for m in modules:
if m['type'] == 'Conv2D':
self.modules.append(
Conv2D(m['in_channels'],
m['out_channels'],
m['kernel_size'],
m['stride'],
m['padding'])
)
elif m['type'] == 'ReLU':
self.modules.append(
ReLU()
)
elif m['type'] == 'MaxPooling':
self.modules.append(
MaxPooling(m['kernel_size'],
m['stride'])
)
elif m['type'] == 'Linear':
self.modules.append(
Linear(m['in_dim'],
m['out_dim'])
)
if criterion['type'] == 'SoftmaxCrossEntropy':
self.criterion = SoftmaxCrossEntropy()
And basically what I am trying to do is this
def forward(self, x, y):
probs = None
loss = None
Conv2D_output = self.modules['Conv2D'](x)
Relu_output = self.modules['ReLU'](Conv2D_output)
MaxPooling_output = self.modules['MaxPooling'](Relu_output)
Linear_output = self.modules['Linear'](MaxPooling_output)
scores = Linear_output
probs, loss = self.criterion.forward(scores, y)
self.cache = (probs, x, y)
return probs, loss
Is there anyway I could do it? Since the dictionary is appended as an object into a list?
I am using the following pre-trained resnet18 code to make a classification based on some input images.
The code is working properly with RGB images, but I want to make the needed changes to let it accept grey images (1 channel images).
I modified part of the code as following:
self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=7, stride=2, padding=3,
bias=False)
but I got the following error:
RuntimeError: Error(s) in loading state_dict for ResNet:
size mismatch for conv1.weight: copying a param with shape torch.Size([64, 3, 7, 7]) from
checkpoint, the shape in current model is torch.Size([64, 1, 7, 7]).
Can you tell me please how I can solve this problem?
Thanks in advance.
Here is the code:
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
from torch.utils.model_zoo import load_url as load_state_dict_from_url
# from .utils import load_state_dict_from_url
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',
'wide_resnet50_2', 'wide_resnet101_2']
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
}
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=dilation, groups=groups, bias=False, dilation=dilation)
def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution"""
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
base_width=64, dilation=1, norm_layer=None):
super(BasicBlock, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
if groups != 1 or base_width != 64:
raise ValueError('BasicBlock only supports groups=1 and base_width=64')
if dilation > 1:
raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
base_width=64, dilation=1, norm_layer=None):
super(Bottleneck, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
groups=1, width_per_group=64, replace_stride_with_dilation=None,
norm_layer=None):
super(ResNet, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer
self.output_dim = num_classes
print('Output layer dim = ' + str(self.output_dim))
self.inplanes = 64
self.dilation = 1
if replace_stride_with_dilation is None:
# each element in the tuple indicates if we should replace
# the 2x2 stride with a dilated convolution instead
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError("replace_stride_with_dilation should be None "
"or a 3-element tuple, got {}".format(replace_stride_with_dilation))
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
dilate=replace_stride_with_dilation[0])
self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
dilate=replace_stride_with_dilation[1])
self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
dilate=replace_stride_with_dilation[2])
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear(512 * block.expansion, self.output_dim)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
# Zero-initialize the last BN in each residual branch,
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
if zero_init_residual:
for m in self.modules():
if isinstance(m, Bottleneck):
nn.init.constant_(m.bn3.weight, 0)
elif isinstance(m, BasicBlock):
nn.init.constant_(m.bn2.weight, 0)
def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride),
norm_layer(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
self.base_width, previous_dilation, norm_layer))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes, groups=self.groups,
base_width=self.base_width, dilation=self.dilation,
norm_layer=norm_layer))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.reshape(x.size(0), -1)
x = self.fc(x)
return x
def _resnet(arch, block, layers, pretrained, progress, **kwargs):
model = ResNet(block, layers, **kwargs)
if pretrained:
state_dict = load_state_dict_from_url(model_urls[arch],
progress=progress)
# print(state_dict.keys())
# print(model.fc.weight)
for key in kwargs:
if key == 'num_classes':
num_classes = kwargs[key]
if num_classes != 1000:
state_dict['fc.weight'] = nn.init.xavier_normal_(model.fc.weight, gain=1)
state_dict['fc.bias'] = nn.init.constant_(model.fc.bias, val=0)
model.load_state_dict(state_dict)
return model
I'm trying to do attention mechanism while returning the tensor I'm getting the following error
ValueError: Shape mismatch: The shape of labels (received (64, 53)) should equal the shape of logits except for the last dimension (received (64, 1, 500)).
Please find the below code
Here is code for attention please correct me if it is wrong
class Attention(tf.keras.layers.Layer):
def __init__(self):
super().__init__()
def call(self,enc_op,hidden_state):
# print(enc_op.shape,hidden_state.shape)
query_with_time_axis = tf.expand_dims(hidden_state, 1)
context_vector = tf.matmul(enc_op,tf.transpose(query_with_time_axis,perm=[0,2,1]))
context_vector = tf.nn.softmax(context_vector,axis=1)
context_vector = context_vector * enc_op
context_vector = tf.reduce_sum(context_vector, axis=1)
return context_vector
Here is decoder part I'm calling the attention from here
class Decoder(tf.keras.layers.Layer):
def init(self,vocab_size,embedding_dim,input_length,dec_units):
super().init()
self.vocab_size = vocab_size
self.embedding_dim = embedding_dim
self.dec_units = dec_units
self.input_length = input_length
self.attention = Attention()
def build(self,input_shape):
self.embedding = Embedding(input_dim=self.vocab_size,output_dim = self.embedding_dim,input_shape = input_shape,
mask_zero = True, name = "embedding_layer_decoder")
self.lstm = LSTM(self.dec_units,return_sequences=True,return_state=True,name = "Decoder_LSTM")
def call(self,target_sentances,enc_op,hidden_state,cell_state):
target_embed = self.embedding(target_sentances)
for i in range(target_embed.shape[1]):
context_vector = self.attention(enc_op,hidden_state)
y = tf.concat([context_vector, target_embed[:,i,:]], axis=-1)
y = tf.expand_dims(y, 1)
lstm_output,hidden_state,_ = self.lstm(y,initial_state = [hidden_state,cell_state])
return lstm_output
class Mymodel(Model):
def __init__(self,encoder_inputs_length,decoder_inputs_length,output_vocab_size):
super().__init__()
self.encoder = Encoder(vocab_size = 500, embedding_dim = 50, input_length = encoder_inputs_length, enc_units=64)
self.decoder = Decoder(vocab_size = 500, embedding_dim = 50, input_length = decoder_inputs_length, dec_units=64)
self.dense = Dense(output_vocab_size,activation = "softmax")
def call(self,data):
input,output = data[0],data[1]
print(input.shape,output.shape)
encoder_output,encoder_h,encoder_c = self.encoder(input)
print("="*20, "ENCODER", "="*20)
print("-"*35)
print(encoder_output)
print("ENCODER ==> OUTPUT SHAPE",encoder_output.shape)
print("ENCODER ==> HIDDEN STATE SHAPE",encoder_h.shape)
print("ENCODER ==> CELL STATE SHAPE", encoder_c.shape)
print("="*20,"Decoder","="*20)
decoder_output = self.decoder(output,encoder_output,encoder_h,encoder_c)
output1 = self.dense(decoder_output)
print("-"*35)
print("Final output shape",output.shape)
print("="*50)
return output1
model = Mymodel(encoder_inputs_length=30,decoder_inputs_length=20,output_vocab_size=500)
ENCODER_SEQ_LEN = 30
DECODER_SEQ_LEN = 20
optimizer = tf.keras.optimizers.Adam()
model.compile(optimizer=optimizer,loss=tf.keras.losses.SparseCategoricalCrossentropy())
for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
model.fit([inp, targ], targ, steps_per_epoch=1)
The shape of my input and target is
(64, 55) (64, 53)
64 is batch size
I am trying to implement a custom GRU layer in keras 2.1.2-py36_0 where i want to use the following gate equations:
zt = act ( Wz.ht-1 + xt )
rt = act ( Wr.ht-1 + xt )
ht = act ( Wh.(r * ht-1) + xt )
instead of keras current implementation of gates as:
zt = act ( Wz.ht-1 + Uzxt )
rt = act ( Wr.ht-1 + Urxt )
ht = act ( Wh.(r * ht-1) + Uhxt )
Customizing GRU cell for the data
class CGRUCell(Layer):
def __init__(self, units,
activation='tanh',
recurrent_activation='hard_sigmoid',
use_bias=True,
kernel_initializer='glorot_uniform',
recurrent_initializer='orthogonal',
bias_initializer='zeros',
kernel_regularizer=None,
recurrent_regularizer=None,
bias_regularizer=None,
kernel_constraint=None,
recurrent_constraint=None,
bias_constraint=None,
dropout=0.,
recurrent_dropout=0.,
implementation=1,
**kwargs):
super(CGRUCell, self).__init__(**kwargs)
self.units = units
self.activation = activations.get(activation)
self.recurrent_activation = activations.get(recurrent_activation)
self.use_bias = use_bias
self.kernel_initializer = initializers.get(kernel_initializer)
self.recurrent_initializer = initializers.get(recurrent_initializer)
self.bias_initializer = initializers.get(bias_initializer)
self.kernel_regularizer = regularizers.get(kernel_regularizer)
self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
self.bias_regularizer = regularizers.get(bias_regularizer)
self.kernel_constraint = constraints.get(kernel_constraint)
self.recurrent_constraint = constraints.get(recurrent_constraint)
self.bias_constraint = constraints.get(bias_constraint)
self.dropout = min(1., max(0., dropout))
self.recurrent_dropout = min(1., max(0., recurrent_dropout))
self.implementation = implementation
self.state_size = self.units
self._dropout_mask = None
self._recurrent_dropout_mask = None
def build(self, input_shape):
input_dim = input_shape[-1]
#self.kernel = self.add_weight(shape=(input_dim, self.units * 3),
# name='kernel',
# initializer=self.kernel_initializer,
# regularizer=self.kernel_regularizer,
# constraint=self.kernel_constraint)
self.recurrent_kernel = self.add_weight(
shape=(self.units, self.units * 3),
name='recurrent_kernel',
initializer=self.recurrent_initializer,
regularizer=self.recurrent_regularizer,
constraint=self.recurrent_constraint)
if self.use_bias:
self.bias = self.add_weight(shape=(self.units * 3,),
name='bias',
initializer=self.bias_initializer,
regularizer=self.bias_regularizer,
constraint=self.bias_constraint)
else:
self.bias = None
#self.kernel_z = self.kernel[:, :self.units]
self.recurrent_kernel_z = self.recurrent_kernel[:, :self.units]
#self.kernel_r = self.kernel[:, self.units: self.units * 2]
self.recurrent_kernel_r = self.recurrent_kernel[:,
self.units:
self.units * 2]
#self.kernel_h = self.kernel[:, self.units * 2:]
self.recurrent_kernel_h = self.recurrent_kernel[:, self.units * 2:]
if self.use_bias:
self.bias_z = self.bias[:self.units]
self.bias_r = self.bias[self.units: self.units * 2]
self.bias_h = self.bias[self.units * 2:]
else:
self.bias_z = None
self.bias_r = None
self.bias_h = None
self.built = True
def call(self, inputs, states, training=None):
h_tm1 = states[0] # previous memory
if 0 < self.dropout < 1 and self._dropout_mask is None:
self._dropout_mask = _generate_dropout_mask(
_generate_dropout_ones(inputs, K.shape(inputs)[-1]),
self.dropout,
training=training,
count=3)
if (0 < self.recurrent_dropout < 1 and
self._recurrent_dropout_mask is None):
self._recurrent_dropout_mask = _generate_dropout_mask(
_generate_dropout_ones(inputs, self.units),
self.recurrent_dropout,
training=training,
count=3)
# dropout matrices for input units
dp_mask = self._dropout_mask
# dropout matrices for recurrent units
rec_dp_mask = self._recurrent_dropout_mask
if self.implementation == 1:
if 0. < self.dropout < 1.:
inputs_z = inputs * dp_mask[0]
inputs_r = inputs * dp_mask[1]
inputs_h = inputs * dp_mask[2]
else:
inputs_z = inputs
inputs_r = inputs
inputs_h = inputs
print(inputs)
# Custom implementation of inputs which are already embedding parameters
#x_z = K.dot(inputs_z, self.kernel_z)
#x_r = K.dot(inputs_r, self.kernel_r)
#x_h = K.dot(inputs_h, self.kernel_h)
#if self.use_bias:
# x_z = K.bias_add(x_z, self.bias_z)
# x_r = K.bias_add(x_r, self.bias_r)
# x_h = K.bias_add(x_h, self.bias_h)
x_z = inputs_z
x_r = inputs_r
x_h = inputs_h
if 0. < self.recurrent_dropout < 1.:
h_tm1_z = h_tm1 * rec_dp_mask[0]
h_tm1_r = h_tm1 * rec_dp_mask[1]
h_tm1_h = h_tm1 * rec_dp_mask[2]
else:
h_tm1_z = h_tm1
h_tm1_r = h_tm1
h_tm1_h = h_tm1
z = self.recurrent_activation(x_z + K.dot(h_tm1_z,
self.recurrent_kernel_z))
r = self.recurrent_activation(x_r + K.dot(h_tm1_r,
self.recurrent_kernel_r))
hh = self.activation(x_h + K.dot(r * h_tm1_h,
self.recurrent_kernel_h))
else:
if 0. < self.dropout < 1.:
inputs *= dp_mask[0]
# Custom implementation of inputs which are already embedding parameters
#matrix_x = K.dot(inputs, self.kernel)
#if self.use_bias:
# matrix_x = K.bias_add(matrix_x, self.bias)
matrix_x = inputs
if 0. < self.recurrent_dropout < 1.:
h_tm1 *= rec_dp_mask[0]
matrix_inner = K.dot(h_tm1,
self.recurrent_kernel[:, :2 * self.units])
x_z = matrix_x[:, :self.units]
x_r = matrix_x[:, self.units: 2 * self.units]
recurrent_z = matrix_inner[:, :self.units]
recurrent_r = matrix_inner[:, self.units: 2 * self.units]
z = self.recurrent_activation(x_z + recurrent_z)
r = self.recurrent_activation(x_r + recurrent_r)
x_h = matrix_x[:, 2 * self.units:]
recurrent_h = K.dot(r * h_tm1,
self.recurrent_kernel[:, 2 * self.units:])
hh = self.activation(x_h + recurrent_h)
h = z * h_tm1 + (1 - z) * hh
if 0 < self.dropout + self.recurrent_dropout:
if training is None:
h._uses_learning_phase = True
return h, [h]
def get_config(self):
config = {'units': self.units,
'activation': activations.serialize(self.activation),
'recurrent_activation': activations.serialize(self.recurrent_activation),
'use_bias': self.use_bias,
'kernel_initializer': initializers.serialize(self.kernel_initializer),
'recurrent_initializer': initializers.serialize(self.recurrent_initializer),
'bias_initializer': initializers.serialize(self.bias_initializer),
'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer),
'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'kernel_constraint': constraints.serialize(self.kernel_constraint),
'recurrent_constraint': constraints.serialize(self.recurrent_constraint),
'bias_constraint': constraints.serialize(self.bias_constraint),
'dropout': self.dropout,
'recurrent_dropout': self.recurrent_dropout,
'implementation': self.implementation}
base_config = super(CGRUCell, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
class CGRU(RNN):
#interfaces.legacy_recurrent_support
def __init__(self, units,
activation='tanh',
recurrent_activation='hard_sigmoid',
use_bias=True,
kernel_initializer='glorot_uniform',
recurrent_initializer='orthogonal',
bias_initializer='zeros',
kernel_regularizer=None,
recurrent_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
recurrent_constraint=None,
bias_constraint=None,
dropout=0.,
recurrent_dropout=0.,
implementation=1,
return_sequences=False,
return_state=False,
go_backwards=False,
stateful=False,
unroll=False,
**kwargs):
if implementation == 0:
warnings.warn('`implementation=0` has been deprecated, '
'and now defaults to `implementation=1`.'
'Please update your layer call.')
cell = CGRUCell(units,
activation=activation,
recurrent_activation=recurrent_activation,
use_bias=use_bias,
kernel_initializer=kernel_initializer,
recurrent_initializer=recurrent_initializer,
bias_initializer=bias_initializer,
kernel_regularizer=kernel_regularizer,
recurrent_regularizer=recurrent_regularizer,
bias_regularizer=bias_regularizer,
kernel_constraint=kernel_constraint,
recurrent_constraint=recurrent_constraint,
bias_constraint=bias_constraint,
dropout=dropout,
recurrent_dropout=recurrent_dropout,
implementation=implementation)
super(CGRU, self).__init__(cell,
return_sequences=return_sequences,
return_state=return_state,
go_backwards=go_backwards,
stateful=stateful,
unroll=unroll,
**kwargs)
self.activity_regularizer = regularizers.get(activity_regularizer)
def call(self, inputs, mask=None, training=None, initial_state=None):
self.cell._dropout_mask = None
self.cell._recurrent_dropout_mask = None
return super(CGRU, self).call(inputs,
mask=mask,
training=training,
initial_state=initial_state)
#property
def units(self):
return self.cell.units
#property
def activation(self):
return self.cell.activation
#property
def recurrent_activation(self):
return self.cell.recurrent_activation
#property
def use_bias(self):
return self.cell.use_bias
#property
def kernel_initializer(self):
return self.cell.kernel_initializer
#property
def recurrent_initializer(self):
return self.cell.recurrent_initializer
#property
def bias_initializer(self):
return self.cell.bias_initializer
#property
def kernel_regularizer(self):
return self.cell.kernel_regularizer
#property
def recurrent_regularizer(self):
return self.cell.recurrent_regularizer
#property
def bias_regularizer(self):
return self.cell.bias_regularizer
#property
def kernel_constraint(self):
return self.cell.kernel_constraint
#property
def recurrent_constraint(self):
return self.cell.recurrent_constraint
#property
def bias_constraint(self):
return self.cell.bias_constraint
#property
def dropout(self):
return self.cell.dropout
#property
def recurrent_dropout(self):
return self.cell.recurrent_dropout
#property
def implementation(self):
return self.cell.implementation
def get_config(self):
config = {'units': self.units,
'activation': activations.serialize(self.activation),
'recurrent_activation': activations.serialize(self.recurrent_activation),
'use_bias': self.use_bias,
'kernel_initializer': initializers.serialize(self.kernel_initializer),
'recurrent_initializer': initializers.serialize(self.recurrent_initializer),
'bias_initializer': initializers.serialize(self.bias_initializer),
'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer),
'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'activity_regularizer': regularizers.serialize(self.activity_regularizer),
'kernel_constraint': constraints.serialize(self.kernel_constraint),
'recurrent_constraint': constraints.serialize(self.recurrent_constraint),
'bias_constraint': constraints.serialize(self.bias_constraint),
'dropout': self.dropout,
'recurrent_dropout': self.recurrent_dropout,
'implementation': self.implementation}
base_config = super(CGRU, self).get_config()
del base_config['cell']
return dict(list(base_config.items()) + list(config.items()))
#classmethod
def from_config(cls, config):
if 'implementation' in config and config['implementation'] == 0:
config['implementation'] = 1
return cls(**config)
Model Implementation is as follows:
user_input = Input(batch_shape=(batch_size,chunk_size,), dtype='int32', name='user_inputs')
user_emb = Embedding(input_dim=num_users+1, output_dim=out_dim, input_length=chunk_size)(user_input)
item_input = Input(batch_shape=(batch_size,chunk_size,), dtype='int32', name='item_inputs')
item_emb = Embedding(input_dim=num_items+1, output_dim=out_dim, input_length=chunk_size)(item_input)
inputs = keras.layers.add([user_emb, item_emb])
gru_args = {
"units":hidden_size,
"return_sequences":True,
#"return_state":True,
"stateful":True,
"unroll":False
}
gru = CGRU(**gru_args)(inputs)
outputs = Dense(num_items+1, activation='softmax')(gru)
[recc_model = Model(inputs=\[user_input,item_input\], outputs=outputs)
recc_model.compile(optimizer='rmsprop',
loss='categorical_crossentropy',
metrics=\[metrics.cate][1]gorical_accuracy])
#metrics=[metrics.sparse_categorical_accuracy])
But on running the code I am getting the following error which seems is due to gradients are getting computed to None:
ValueError: Tried to convert 'x' to a tensor and failed. Error: None values not supported.
Find the complete error here: https://pastebin.com/n9UzCRiP
The error occurs because the bias weights are added to the model but not used anywhere.
When you call self.add_weight(...), you have to make sure these weights are used somewhere in your model. Otherwise, since these weights are not connected to the loss tensor, TF cannot compute the gradient and an error will be raised.
If you don't need the bias weights, you can either remove the add_weight lines, or set use_bias=False in your cell.
Also, I think you don't need to re-implement a CGRU layer to use a custom cell. Just wrap your custom cell with the built-in RNN layer should work.
gru = RNN(CGRUCell(hidden_size, use_bias=False),
return_sequences=True,
stateful=True,
unroll=False)(inputs)