ViViT PyTorch: RuntimeError: multi-target not supported at /pytorch/aten/src/THCUNN/generic/ClassNLLCriterion.cu:15

I am trying to run Video Vision Transformer (ViViT) code with my dataset, but I am getting an error when using CrossEntropyLoss from PyTorch as the loss function.
I have 6 classes:
['Run', 'Walk', 'Wave', 'Sit', 'Turn', 'Stand']
Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, weight_decay=1e-9, momentum=0.9)
Class Weights
tensor([0.0045, 0.0042, 0.0048, 0.0038, 0.0070, 0.0065])
Loss Function
loss_func = nn.CrossEntropyLoss(weight=class_weights.to(device))
Code Throwing the Error
train_epoch(model, optimizer, train_loader, train_loss_history, loss_func)
Error
RuntimeError: multi-target not supported at /pytorch/aten/src/THCUNN/generic/ClassNLLCriterion.cu:15
Code Calling the transformer
model = ViViT(224, 16, 100, 16).cuda()
Getting Video Frames
def get_frames(filename, n_frames=1):
    frames = []
    v_cap = cv2.VideoCapture(filename)
    v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_list = np.linspace(0, v_len - 1, n_frames + 1, dtype=np.int16)
    frame_dims = np.array([224, 224, 3])
    for fn in range(v_len):
        success, frame = v_cap.read()
        if success is False:
            continue
        if fn in frame_list:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, (frame_dims[0], frame_dims[1]))
            frames.append(frame)
    v_cap.release()
    return frames, v_len
Dataset Preprocessing
class DatasetProcessing(data.Dataset):
    def __init__(self, df, root_dir):
        super(DatasetProcessing, self).__init__()
        # List of all videos path
        video_list = df["Video"].apply(lambda x: root_dir + '/' + x)
        self.video_list = np.asarray(video_list)
        self.df = df

    def __getitem__(self, index):
        # Ensure that the raw videos are in respective folders and folder name matches the output class label
        video_label = self.video_list[index].split('/')[-2]
        video_name = self.video_list[index].split('/')[-1]
        video_frames, len_ = get_frames(self.video_list[index], n_frames=15)
        video_frames = np.asarray(video_frames)
        video_frames = video_frames / 255
        class_list = ['Run', 'Walk', 'Wave', 'Sit', 'Turn', 'Stand']
        class_id_loc = np.where(class_list == video_label)
        label = class_id_loc
        d = torch.as_tensor(np.array(video_frames).astype('float'))
        l = torch.as_tensor(np.array(label).astype('float'))
        return (d, l)

    def __len__(self):
        return self.video_list.shape[0]
Training Epochs
def train_epoch(model, optimizer, data_loader, loss_history, loss_func):
    total_samples = len(data_loader.dataset)
    model.train()
    for i, (data, target) in enumerate(data_loader):
        optimizer.zero_grad()
        x = data.cuda()
        data = rearrange(x, 'b p h w c -> b p c h w').cuda()
        target = target.type(torch.LongTensor).cuda()
        pred = model(data.float())
        output = F.log_softmax(pred, dim=1)
        loss = loss_func(output, target.squeeze(1))
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print('[' + '{:5}'.format(i * len(data)) + '/' + '{:5}'.format(total_samples) +
                  ' (' + '{:3.0f}'.format(100 * i / len(data_loader)) + '%)] Loss: ' +
                  '{:6.4f}'.format(loss.item()))
            loss_history.append(loss.item())
Evaluate Model
def evaluate(model, data_loader, loss_history, loss_func):
    model.eval()
    total_samples = len(data_loader.dataset)
    correct_samples = 0
    total_loss = 0
    with torch.no_grad():
        for data, target in data_loader:
            x = data.cuda()
            data = rearrange(x, 'b p h w c -> b p c h w').cuda()
            target = target.type(torch.LongTensor).cuda()
            output = F.log_softmax(model(data.float()), dim=1)
            loss = loss_func(output, target)
            _, pred = torch.max(output, dim=1)
            total_loss += loss.item()
            correct_samples += pred.eq(target).sum()
    avg_loss = total_loss / total_samples
    loss_history.append(avg_loss)
    print('\nAverage test loss: ' + '{:.4f}'.format(avg_loss) +
          ' Accuracy:' + '{:5}'.format(correct_samples) + '/' +
          '{:5}'.format(total_samples) + ' (' +
          '{:4.2f}'.format(100.0 * correct_samples / total_samples) + '%)\n')
Transformer
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        self.norm = nn.LayerNorm(dim)
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout=dropout))
            ]))

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return self.norm(x)
ViViT Code
class ViViT(nn.Module):
    def __init__(self, image_size, patch_size, num_classes, num_frames, dim=192, depth=4, heads=3,
                 pool='cls', in_channels=3, dim_head=64, dropout=0., emb_dropout=0., scale_dim=4):
        super().__init__()
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = in_channels * patch_size ** 2
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b t c (h p1) (w p2) -> b t (h w) (p1 p2 c)', p1=patch_size, p2=patch_size),
            nn.Linear(patch_dim, dim),
        )
        self.pos_embedding = nn.Parameter(torch.randn(1, num_frames, num_patches + 1, dim))
        self.space_token = nn.Parameter(torch.randn(1, 1, dim))
        self.space_transformer = Transformer(dim, depth, heads, dim_head, dim * scale_dim, dropout)
        self.temporal_token = nn.Parameter(torch.randn(1, 1, dim))
        self.temporal_transformer = Transformer(dim, depth, heads, dim_head, dim * scale_dim, dropout)
        self.dropout = nn.Dropout(emb_dropout)
        self.pool = pool
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, x):
        x = self.to_patch_embedding(x)
        b, t, n, _ = x.shape
        cls_space_tokens = repeat(self.space_token, '() n d -> b t n d', b=b, t=t)
        x = torch.cat((cls_space_tokens, x), dim=2)
        x += self.pos_embedding[:, :, :(n + 1)]
        x = self.dropout(x)
        x = rearrange(x, 'b t n d -> (b t) n d')
        x = self.space_transformer(x)
        x = rearrange(x[:, 0], '(b t) ... -> b t ...', b=b)
        cls_temporal_tokens = repeat(self.temporal_token, '() n d -> b n d', b=b)
        x = torch.cat((cls_temporal_tokens, x), dim=1)
        x = self.temporal_transformer(x)
        x = x.mean(dim=1) if self.pool == 'mean' else x[:, 0]
        return self.mlp_head(x)

Multi-target (per-class probability) targets appear to be supported since version 1.10.0:
https://discuss.pytorch.org/t/crossentropyloss-vs-per-class-probabilities-target/138331
Please check your PyTorch version.
Please refer to the example using the UCF101 top-5 dataset, which is available on my Colab. The PyTorch version there is 1.12.0+cu113, and the code you listed was able to run the training almost exactly as written.
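If you are on an older version, note that nn.CrossEntropyLoss there only accepts a 1-D target of class indices with shape (batch,); an extra trailing dimension on the labels (for example, the tuple returned by np.where) raises exactly this "multi-target not supported" error. A minimal sketch of the expected shapes (the variable names are illustrative, not taken from your code):
import torch
import torch.nn as nn

loss_func = nn.CrossEntropyLoss()
logits = torch.randn(8, 6)                 # (batch, num_classes); CrossEntropyLoss applies log_softmax internally, so it expects raw logits

bad_target = torch.randint(0, 6, (8, 1))   # shape (batch, 1) -> "multi-target" error on old PyTorch
good_target = bad_target.view(-1).long()   # shape (batch,) of class indices
loss = loss_func(logits, good_target)
print(loss.item())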

Related

RuntimeError: mat1 and mat2 shapes cannot be multiplied (256x726 and 1000x1000)

I'm trying to measure latent-space clustering, but this error is raised.
class AutoEncoder(nn.Module):
    def __init__(self, input_dim1, input_dim2, hidden_dims, agg, sep_decode):
        super(AutoEncoder, self).__init__()
        self.agg = agg
        self.sep_decode = sep_decode
        print("hidden_dims:", hidden_dims)
        self.encoder_layers = []
        self.encoder2_layers = []
        dims = [[input_dim1, input_dim2]] + hidden_dims
        for i in range(len(dims) - 1):
            if i == 0:
                layer = nn.Sequential(nn.Linear(dims[i][0], dims[i+1]), nn.ReLU())
                layer2 = nn.Sequential(nn.Linear(dims[i][1], dims[i+1]), nn.ReLU())
            elif i != 0 and i < len(dims) - 2:
                layer = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
                layer2 = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
            else:
                layer = nn.Linear(dims[i], dims[i+1])
                layer2 = nn.Linear(dims[i], dims[i+1])
            self.encoder_layers.append(layer)
            self.encoder2_layers.append(layer2)
        self.encoder = nn.Sequential(*self.encoder_layers)
        self.encoder2 = nn.Sequential(*self.encoder2_layers)
        self.decoder_layers = []
        self.decoder2_layers = []
        hidden_dims.reverse()
        dims = hidden_dims + [[input_dim1, input_dim2]]
        if self.agg == "concat" and not self.sep_decode:
            dims[0] = 2 * dims[0]
        for i in range(len(dims) - 1):
            if i < len(dims) - 2:
                layer = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
                layer2 = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
            else:
                layer = nn.Linear(dims[i], dims[i+1][0])
                layer2 = nn.Linear(dims[i], dims[i+1][1])
            self.decoder_layers.append(layer)
            self.decoder2_layers.append(layer2)
        self.decoder = nn.Sequential(*self.decoder_layers)
        self.decoder2 = nn.Sequential(*self.decoder2_layers)

    def forward(self, x1, x2):
        z1 = self.encoder(x1)
        z2 = self.encoder2(x2)
        if self.agg == "max":
            z = torch.max(z1, z2)
        elif self.agg == "multi":
            z = z1 * z2
        elif self.agg == "sum":
            z = z1 + z2
        elif self.agg == "concat":
            z = torch.cat([z1, z2], dim=1)
        if self.sep_decode:
            x_bar1 = self.decoder(z1)
            x_bar1 = F.normalize(x_bar1, dim=-1)
            x_bar2 = self.decoder2(z2)
            x_bar2 = F.normalize(x_bar2, dim=-1)
        else:
            x_bar1 = self.decoder(z)
            x_bar1 = F.normalize(x_bar1, dim=-1)
            x_bar2 = self.decoder2(z)
            x_bar2 = F.normalize(x_bar2, dim=-1)
        return x_bar1, x_bar2, z
class TopicCluster(nn.Module):
    def __init__(self, args):
        super(TopicCluster, self).__init__()
        self.alpha = 1.0
        self.dataset_path = args.dataset_path
        self.args = args
        self.device = args.device
        self.temperature = args.temperature
        self.distribution = args.distribution
        self.agg_method = args.agg_method
        self.sep_decode = (args.sep_decode == 1)
        input_dim1 = args.input_dim1
        input_dim2 = args.input_dim2
        hidden_dims = eval(args.hidden_dims)
        self.model = AutoEncoder(input_dim1, input_dim2, hidden_dims, self.agg_method, self.sep_decode)
        if self.agg_method == "concat":
            self.topic_emb = Parameter(torch.Tensor(args.n_clusters, 2 * hidden_dims[-1]))
        else:
            self.topic_emb = Parameter(torch.Tensor(args.n_clusters, hidden_dims[-1]))
        torch.nn.init.xavier_normal_(self.topic_emb.data)

    def pretrain(self, input_data, pretrain_epoch=200):
        pretrained_path = os.path.join(self.dataset_path, f"pretrained_{args.suffix}.pt")
        if os.path.exists(pretrained_path) and self.args.load_pretrain:
            # load pretrain weights
            print(f"loading pretrained model from {pretrained_path}")
            self.model.load_state_dict(torch.load(pretrained_path))
        else:
            train_loader = DataLoader(input_data, batch_size=self.args.batch_size, shuffle=True)
            optimizer = Adam(self.model.parameters(), lr=self.args.lr)
            for epoch in range(pretrain_epoch):
                total_loss = 0
                for batch_idx, (x1, x2, _, weight) in enumerate(train_loader):
                    x1 = x1.to(self.device)
                    x2 = x2.to(self.device)
                    weight = weight.to(self.device)
                    optimizer.zero_grad()
                    x_bar1, x_bar2, z = self.model(x1, x2)
                    loss = cosine_dist(x_bar1, x1) + cosine_dist(x_bar2, x2)  # , weight)
                    total_loss += loss.item()
                    loss.backward()
                    optimizer.step()
                print(f"epoch {epoch}: loss = {total_loss / (batch_idx+1):.4f}")
            torch.save(self.model.state_dict(), pretrained_path)
            print(f"model saved to {pretrained_path}")

    def cluster_assign(self, z):
        if self.distribution == 'student':
            p = 1.0 / (1.0 + torch.sum(
                torch.pow(z.unsqueeze(1) - self.topic_emb, 2), 2) / self.alpha)
            p = p.pow((self.alpha + 1.0) / 2.0)
            p = (p.t() / torch.sum(p, 1)).t()
        else:
            self.topic_emb.data = F.normalize(self.topic_emb.data, dim=-1)
            z = F.normalize(z, dim=-1)
            sim = torch.matmul(z, self.topic_emb.t()) / self.temperature
            p = F.softmax(sim, dim=-1)
        return p

    def forward(self, x1, x2):
        x_bar1, x_bar2, z = self.model(x1, x2)
        p = self.cluster_assign(z)
        return x_bar1, x_bar2, z, p

    def target_distribution(self, x1, x2, freq, method='all', top_num=0):
        _, _, z = self.model(x1, x2)
        p = self.cluster_assign(z).detach()
        if method == 'all':
            q = p**2 / (p * freq.unsqueeze(-1)).sum(dim=0)
            q = (q.t() / q.sum(dim=1)).t()
        elif method == 'top':
            assert top_num > 0
            q = p.clone()
            sim = torch.matmul(self.topic_emb, z.t())
            _, selected_idx = sim.topk(k=top_num, dim=-1)
            for i, topic_idx in enumerate(selected_idx):
                q[topic_idx] = 0
                q[topic_idx, i] = 1
        return p, q

def cosine_dist(x_bar, x, weight=None):
    if weight is None:
        weight = torch.ones(x.size(0), device=x.device)
    cos_sim = (x_bar * x).sum(-1)
    cos_dist = 1 - cos_sim
    cos_dist = (cos_dist * weight).sum() / weight.sum()
    return cos_dist
def train(args, emb_dict):
    # ipdb.set_trace()
    inv_vocab = {k: " ".join(v) for k, v in emb_dict["inv_vocab"].items()}
    vocab = {" ".join(k): v for k, v in emb_dict["vocab"].items()}
    print(f"Vocab size: {len(vocab)}")
    embs = F.normalize(torch.tensor(emb_dict["vs_emb"]), dim=-1)
    embs2 = F.normalize(torch.tensor(emb_dict["oh_emb"]), dim=-1)
    freq = np.array(emb_dict["tuple_freq"])
    if not args.use_freq:
        freq = np.ones_like(freq)
    input_data = TensorDataset(embs, embs2, torch.arange(embs.size(0)), torch.tensor(freq))
    topic_cluster = TopicCluster(args).to(args.device)
    topic_cluster.pretrain(input_data, args.pretrain_epoch)
    train_loader = DataLoader(input_data, batch_size=args.batch_size, shuffle=False)
    optimizer = Adam(topic_cluster.parameters(), lr=args.lr)
    # topic embedding initialization
    embs = embs.to(args.device)
    embs2 = embs2.to(args.device)
    x_bar1, x_bar2, z = topic_cluster.model(embs, embs2)
    z = F.normalize(z, dim=-1)
    print(f"Running K-Means for initialization")
    kmeans = KMeans(n_clusters=args.n_clusters, n_init=5)
    if args.use_freq:
        y_pred = kmeans.fit_predict(z.data.cpu().numpy(), sample_weight=freq)
    else:
        y_pred = kmeans.fit_predict(z.data.cpu().numpy())
    print(f"Finish K-Means")
    freq = torch.tensor(freq).to(args.device)
    y_pred_last = y_pred
    topic_cluster.topic_emb.data = torch.tensor(kmeans.cluster_centers_).to(args.device)
    topic_cluster.train()
    i = 0
    for epoch in range(50):
        if epoch % 5 == 0:
            _, _, z, p = topic_cluster(embs, embs2)
            z = F.normalize(z, dim=-1)
            topic_cluster.topic_emb.data = F.normalize(topic_cluster.topic_emb.data, dim=-1)
            if not os.path.exists(os.path.join(args.dataset_path, f"clusters_{args.suffix}")):
                os.makedirs(os.path.join(args.dataset_path, f"clusters_{args.suffix}"))
            embed_save_path = os.path.join(args.dataset_path, f"clusters_{args.suffix}/embed_{epoch}.pt")
            torch.save({
                "inv_vocab": emb_dict['inv_vocab'],
                "embed": z.detach().cpu().numpy(),
                "topic_embed": topic_cluster.topic_emb.detach().cpu().numpy(),
            }, embed_save_path)
            f = open(os.path.join(args.dataset_path, f"clusters_{args.suffix}/{epoch}.txt"), 'w')
            pred_cluster = p.argmax(-1)
            result_strings = []
            for j in range(args.n_clusters):
                if args.sort_method == 'discriminative':
                    word_idx = torch.arange(embs.size(0))[pred_cluster == j]
                    sorted_idx = torch.argsort(p[pred_cluster == j][:, j], descending=True)
                    word_idx = word_idx[sorted_idx]
                else:
                    sim = torch.matmul(topic_cluster.topic_emb[j], z.t())
                    _, word_idx = sim.topk(k=30, dim=-1)
                word_cluster = []
                freq_sum = 0
                for idx in word_idx:
                    freq_sum += freq[idx].item()
                    if inv_vocab[idx.item()] not in word_cluster:
                        word_cluster.append(inv_vocab[idx.item()])
                        if len(word_cluster) >= 10:
                            break
                result_strings.append((freq_sum, f"Topic {j} ({freq_sum}): " + ', '.join(word_cluster) + '\n'))
            result_strings = sorted(result_strings, key=lambda x: x[0], reverse=True)
            for result_string in result_strings:
                f.write(result_string[1])
        for x1, x2, idx, weight in train_loader:
            if i % args.update_interval == 0:
                p, q = topic_cluster.target_distribution(embs, embs2, freq.clone().fill_(1), method='all', top_num=epoch+1)
                y_pred = p.cpu().numpy().argmax(1)
                delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
                y_pred_last = y_pred
                if i > 0 and delta_label < args.tol:
                    print(f'delta_label {delta_label:.4f} < tol ({args.tol})')
                    print('Reached tolerance threshold. Stopping training.')
                    return None
            i += 1
            x1 = x1.to(args.device)
            x2 = x2.to(args.device)
            idx = idx.to(args.device)
            weight = weight.to(args.device)
            x_bar1, x_bar2, _, p = topic_cluster(x1, x2)
            reconstr_loss = cosine_dist(x_bar1, x1) + cosine_dist(x_bar2, x2)  # , weight)
            kl_loss = F.kl_div(p.log(), q[idx], reduction='none').sum(-1)
            kl_loss = (kl_loss * weight).sum() / weight.sum()
            loss = args.gamma * kl_loss + reconstr_loss
            if i % args.update_interval == 0:
                print(f"KL loss: {kl_loss}; Reconstruction loss: {reconstr_loss}")
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    return None
if __name__ == "__main__":
    # CUDA_VISIBLE_DEVICES=0 python3 latent_space_clustering.py --dataset_path ./pandemic --input_emb_name po_tuple_features_all_svos.pk
    parser = argparse.ArgumentParser(
        description='train',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset_path', type=str)
    parser.add_argument('--input_emb_name', type=str)
    parser.add_argument('--lr', type=float, default=5e-4)
    parser.add_argument('--n_clusters', default=30, type=int)
    parser.add_argument('--input_dim1', default=1000, type=int)
    parser.add_argument('--input_dim2', default=1000, type=int)
    parser.add_argument('--agg_method', default="multi", choices=["sum", "multi", "concat", "attend"], type=str)
    parser.add_argument('--sep_decode', default=0, choices=[0, 1], type=int)
    parser.add_argument('--pretrain_epoch', default=100, type=int)
    parser.add_argument('--load_pretrain', default=False, action='store_true')
    parser.add_argument('--temperature', default=0.1, type=float)
    parser.add_argument('--sort_method', default='generative', choices=['generative', 'discriminative'])
    parser.add_argument('--distribution', default='softmax', choices=['softmax', 'student'])
    parser.add_argument('--batch_size', default=256, type=int)
    parser.add_argument('--use_freq', default=False, action='store_true')
    parser.add_argument('--hidden_dims', default='[1000, 2000, 1000, 100]', type=str)
    parser.add_argument('--suffix', type=str, default='')
    parser.add_argument('--gamma', default=5, type=float, help='weight of clustering loss')
    parser.add_argument('--update_interval', default=100, type=int)
    parser.add_argument('--tol', default=0.001, type=float)
    args = parser.parse_args()
    args.cuda = torch.cuda.is_available()
    print("use cuda: {}".format(args.cuda))
    args.device = torch.device("cuda" if args.cuda else "cpu")
    print(args)
    with open(os.path.join(args.dataset_path, args.input_emb_name), "rb") as fin:
        emb_dict = pk.load(fin)
    candidate_idx = train(args, emb_dict)
    print(candidate_idx)
The error I'm getting is: RuntimeError: mat1 and mat2 shapes cannot be multiplied (256x726 and 1000x1000). I cannot figure out which part is the problem. Please help me. Thank you so much.
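For reference, the same message can be reproduced with a single linear layer whose in_features does not match the width of the incoming batch; the 726 below stands in for the actual width of my embeddings (a minimal illustration only):
import torch
import torch.nn as nn

layer = nn.Linear(1000, 1000)   # built for 1000-dim inputs (the --input_dim1 default)
x = torch.randn(256, 726)       # a batch whose feature dimension is actually 726
y = layer(x)                    # RuntimeError: mat1 and mat2 shapes cannot be multiplied (256x726 and 1000x1000)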

PyTorch: GRU, one-to-many / many-to-one

I would like to implement a GRU able to encode a sequence of vectors into one vector (many-to-one), and then another GRU able to decode a vector into a sequence of vectors (one-to-many). The size of the vectors would not change. I would like an opinion about what I implemented.
Here is the code:
class AEGRU(nn.Module):
    def __init__(self, opt):
        super(AEGRU, self).__init__()
        self.length = 256
        self.latent_space = 256
        self.num_layers = 1
        self.GRU_enc = nn.GRU(input_size=3, hidden_size=self.latent_space, num_layers=self.num_layers, batch_first=True)
        self.fc_enc = nn.Linear(self.latent_space, self.latent_space)
        self.GRU_dec = nn.GRU(input_size=self.latent_space, hidden_size=3, num_layers=self.num_layers, batch_first=True)
        self.fc_dec = nn.Linear(3, 3)

    def enc(self, x):
        # x has shape: Batch_size x self.length x 3
        h0 = torch.zeros(self.num_layers, x.shape[0], self.latent_space).cuda()
        out, _ = self.GRU_enc(x, h0)
        out = out[:, -1, :]
        out = self.fc_enc(out)
        return out

    def dec(self, x):
        # x has shape: Batch_size x self.latent_space
        x = x[:, None, :]
        h = torch.zeros(self.num_layers, x.shape[0], 3).cuda()
        # method 1 ??
        '''outputs = torch.zeros(x.shape[0], self.length, 3).cuda()
        for i in range(self.length):
            out, h = self.GRU_dec(x, h)
            outputs[:, i, :] = out[:, 0, :]'''
        # method 2 ??
        x = x.repeat(1, self.length, 1)
        outputs, _ = self.GRU_dec(x, h)
        # linear layer
        outputs = self.fc_dec(outputs)
        return outputs

    def forward(self, x):
        self.indices = []
        latent = self.enc(x)
        output = self.dec(latent)
        return output
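For completeness, here is the kind of quick shape check I run on it (a minimal sketch; the batch size and the dummy opt argument are just placeholders, and a CUDA device is assumed because of the .cuda() calls above):
import torch

model = AEGRU(opt=None).cuda()        # opt is not used in __init__ above
x = torch.randn(8, 256, 3).cuda()     # batch of 8 sequences, length 256, 3 features each
latent = model.enc(x)                 # -> (8, 256): one latent vector per sequence
recon = model(x)                      # -> (8, 256, 3): same shape as the input
print(latent.shape, recon.shape)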
I am not sure whether this is the right way to do a one-to-many GRU. Could I have some opinions about this?
Thanks for reading!

Numpy implementation for regression using NN

I am implementing my own neural-network model for regression using only NumPy, and I am getting really weird results when I test the model on m > 1 samples (for m = 1 it works fine). It seems like the model collapses and predicts only specific values for the whole batch:
Input:
X [[ 7.62316802 -6.12433912]
[ 1.11048966 4.97509421]]
Expected Output:
Y [[16.47952332 12.50288412]]
Model Output
y_hat [[10.42446234 10.42446234]]
Any idea what might cause this issue?
My code:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# np.seterr(all=None, divide=None, over=None, under=None, invalid=None)

data_x = np.random.uniform(0, 10, size=(2, 1))
data_y = (2 * data_x).sum(axis=0, keepdims=True)
# data_y = data_x[0, :] ** 2 + data_x[1, :] ** 2
# data_y = data_y.reshape((1, -1))
# # fig = plt.figure()
# # ax = fig.add_subplot(111, projection='3d')
# # ax.scatter(data_x[0, :], data_x[1, :], data_y)
# # plt.show()

memory = dict()
nn_architecture = [
    {"input_dim": 2, "output_dim": 6, "activation": "sigmoid", "bias": True},
    {"input_dim": 6, "output_dim": 4, "activation": "sigmoid", "bias": True},
    {"input_dim": 4, "output_dim": 1, "activation": "relu", "bias": True}
]

def init_network_parameters(nn_architecture):
    parameters = []
    for idx, layer in enumerate(nn_architecture):
        layer_params = {}
        input_dim, output_dim, activation, bias = layer.values()
        W = np.random.uniform(0, 1, (output_dim, input_dim))
        B = np.zeros((output_dim, 1))
        if bias:
            B = np.ones((output_dim, 1))
        activation_func = identity
        backward_activation_func = identity_backward
        if activation is 'sigmoid':
            activation_func = sigmoid
            backward_activation_func = sigmoid_backward
        elif activation is 'relu':
            activation_func = relu
            backward_activation_func = relu_backward
        else:
            print(f"Activation function set to identity for layer {idx}")
        layer_params[f"W"] = W
        layer_params[f"B"] = B
        layer_params[f"activation"] = activation_func
        layer_params[f"backward_activation"] = backward_activation_func
        layer_params[f"bias"] = bias
        parameters.append(layer_params)
    return parameters

def identity(z):
    return z

def sigmoid(z):
    return np.clip(1 / (1 + np.exp(-z)), -100, 100)

def relu(z):
    output = np.array(z, copy=True)
    output[z <= 0] = 0
    return output

def identity_backward(z, dA):
    return dA

def sigmoid_backward(z, dA):
    return np.clip(z * (1-z) * dA, -100, 100)

def relu_backward(z, dA):
    output = np.ones(z.shape)
    output[z <= 0] = 0
    return output * dA

def forward_single_layer(prev_A, parameters, idx):
    W = parameters[f"W"]
    B = parameters[f"B"]
    activation = parameters[f"activation"]
    if parameters["bias"]:
        curr_Z = W.dot(prev_A) + B
    else:
        curr_Z = W.dot(prev_A)
    curr_A = activation(curr_Z)
    memory[f"Z{idx+1}"] = curr_Z
    memory[f"A{idx+1}"] = curr_A
    return curr_Z, curr_A

def forward(X, parameters):
    prev_A = X
    memory["A0"] = prev_A
    for idx, layer_params in enumerate(parameters):
        curr_Z, prev_A = forward_single_layer(prev_A=prev_A, parameters=layer_params, idx=idx)
    return prev_A

def criteria(y_hat, y):
    assert y_hat.shape == y.shape
    n = y_hat.shape[0]
    m = y_hat.shape[1]
    loss = np.sum(y_hat - y, axis=1) / m
    dA = (y_hat - y) / m
    return loss, dA

def backward_single_layer(prev_A, dA, curr_W, curr_Z, backward_activation, idx):
    m = prev_A.shape[1]
    dZ = backward_activation(z=curr_Z, dA=dA)
    dW = np.dot(dZ, prev_A.T) / m
    dB = np.sum(dZ, axis=1, keepdims=True) / m
    dA = np.dot(curr_W.T, dZ)
    return dA, dW, dB

def backpropagation(parameters, dA):
    grads = {}
    for idx in reversed(range(len(parameters))):
        layer = parameters[idx]
        prev_A = memory[f"A{idx}"]
        curr_Z = memory[f"Z{idx+1}"]
        curr_W = layer["W"]
        backward_activation = layer["backward_activation"]
        dA, dW, dB = backward_single_layer(prev_A, dA, curr_W, curr_Z, backward_activation, idx)
        grads[f"W{idx}"] = dW
        grads[f"B{idx}"] = dB
    return grads

def update_params(parameters, grads, lr=0.001):
    new_params = []
    for idx, layer in enumerate(parameters):
        layer["W"] -= lr*grads[f"W{idx}"]
        layer["B"] -= lr*grads[f"B{idx}"]
        new_params.append(layer)
    return new_params

X = np.random.uniform(-10, 10, (2, 2))
Y = 2*X[0, :] + X[1, :] ** 2
Y = Y.reshape((1, X.shape[1]))

parameters = init_network_parameters(nn_architecture)
n_epochs = 1000
lr = 0.01
loss_history = []
for i in range(n_epochs):
    y_hat = forward(X, parameters)
    loss, dA = criteria(y_hat, Y)
    loss_history.append(loss)
    grads = backpropagation(parameters, dA)
    parameters = update_params(parameters, grads, lr)
    if not i % 10:
        print(f"Epoch {i}/{n_epochs} loss={loss}")

print("X", X)
print("Y", Y)
print("y_hat", y_hat)
There wasn't a problem with my implementation, just overfitting.
More information can be found here.
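For comparison, a conventional mean-squared-error criterion whose reported loss matches the gradient it returns would look like the sketch below (purely illustrative; as noted above, the odd batch behaviour came down to overfitting):
import numpy as np

def mse_criteria(y_hat, y):
    # y_hat, y: arrays of shape (n_outputs, m_samples)
    assert y_hat.shape == y.shape
    m = y_hat.shape[1]
    loss = np.sum((y_hat - y) ** 2) / (2 * m)   # scalar loss
    dA = (y_hat - y) / m                        # gradient of the loss w.r.t. y_hat
    return loss, dA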

Backpropogation with self defined loss function and network

I want to create a CNN network for object localization (it is given that there is only one object). For this I am using some general layers, and at the end I want to get the corner nearest to the origin and the corner farthest from it. I am also using a self-defined loss function, which is (100 - intersection over union in %). The loss is not converging. What may be the problem? Will backpropagation work with this network, or is there some other issue? Below is the code:
Network:
class convnet(nn.Module):
    def __init__(self):
        super(convnet, self).__init__()
        self.conv1 = nn.Conv2d(1, 4, kernel_size=5)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=3)
        self.conv2 = nn.Conv2d(4, 8, kernel_size=5)
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=3)
        self.conv3 = nn.Conv2d(8, 16, kernel_size=5)
        self.pool3 = nn.MaxPool2d(kernel_size=3, stride=3)
        self.fc1 = nn.Linear(5040, 1000)
        self.fc2 = nn.Linear(1000, 84)
        self.fc3 = nn.Linear(84, 4)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.pool2(x)
        x = F.relu(self.conv3(x))
        x = self.pool3(x)
        x = x.view(-1, 5040)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        a = self.sigmoid(self.fc3(x))
        c = torch.zeros(a.shape[0], 2)
        for idx, x in enumerate(a):
            d1 = x[0] ** 2 + x[1] ** 2
            d2 = x[2] ** 2 + x[3] ** 2
            d3 = x[0] ** 2 + x[3] ** 2
            d4 = x[2] ** 2 + x[1] ** 2
            dmin = min(d1, d2, d3, d4)
            if d1 == dmin:
                c[idx] = torch.tensor([x[0], x[1]])
            elif d2 == dmin:
                c[idx] = torch.tensor([x[2], x[3]])
            elif d3 == dmin:
                c[idx] = torch.tensor([x[0], x[3]])
            elif d4 == dmin:
                c[idx] = torch.tensor([x[2], x[1]])
        m = torch.tensor([[640, 480, 640, 480]]).type(torch.DoubleTensor).cuda()
        return c * m

    def sigmoid(self, z):
        return 1 / (1 + torch.exp(-z))
Loss function:
def iou(box_a, box_b):
    A = box_a.size(0)
    B = box_b.size(0)
    max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
                       box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
    min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
                       box_b[:, :2].unsqueeze(0).expand(A, B, 2))
    inter = torch.clamp((max_xy - min_xy), min=0)
    inter = inter[:, :, 0] * inter[:, :, 1]
    area_a = ((box_a[:, 2] - box_a[:, 0]) *
              (box_a[:, 3] - box_a[:, 1])).unsqueeze(1).expand_as(inter)
    area_b = ((box_b[:, 2] - box_b[:, 0]) *
              (box_b[:, 3] - box_b[:, 1])).unsqueeze(0).expand_as(inter)
    union = area_a + area_b - inter
    return ((inter / union) * 100 / float(A * A)).sum()

def criterion(output, labels):
    return 100 - iou(output, labels)
You can check full code here: link
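As a quick check of whether the loss itself lets gradients through, this is the kind of minimal test I mean (the box values are made up; boxes are (x1, y1, x2, y2)):
import torch

pred = torch.tensor([[10., 20., 200., 300.]], requires_grad=True)
target = torch.tensor([[15., 25., 210., 310.]])

loss = criterion(pred, target)   # 100 - IoU in %
loss.backward()
print(loss.item(), pred.grad)    # a non-zero grad means the loss is differentiable w.r.t. the boxes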

PyTorch to Keras model

I'm trying to replicate a model, but I'm having difficulties doing so with Keras. Here is my current implementation:
filters = 256
kernel_size = 3
strides = 1

# Head module
input = Input(shape=(img_height//scale_fact, img_width//scale_fact, img_depth))
conv0 = Conv2D(filters, kernel_size, strides=strides, padding='same',
               kernel_regularizer=regularizers.l2(0.01))(input)

# Body module
res = Conv2D(filters, kernel_size, strides=strides, padding='same')(conv0)
act = ReLU()(res)
res = Conv2D(filters, kernel_size, strides=strides, padding='same')(act)
res_rec = Add()([conv0, res])

for i in range(res_blocks):
    res1 = Conv2D(filters, kernel_size, strides=strides, padding='same')(res_rec)
    act = ReLU()(res1)
    res2 = Conv2D(filters, kernel_size, strides=strides, padding='same')(act)
    res_rec = Add()([res_rec, res2])

conv = Conv2D(filters, kernel_size, strides=strides, padding='same',
              kernel_regularizer=regularizers.l2(0.01))(res_rec)
add = Add()([conv0, conv])

# Tail module
conv = Conv2D(filters, kernel_size, strides=strides, padding='same',
              kernel_regularizer=regularizers.l2(0.01))(add)
act = ReLU()(conv)
up = UpSampling2D(size=scale_fact if scale_fact != 4 else 2)(act)  # TODO: try "Conv2DTranspose"
# mul = Multiply([np.zeros((img_width,img_height,img_depth)).fill(0.1), up])(up)

# When it's a 4x factor, we want the upscale split into two steps
if scale_fact == 4:
    conv = Conv2D(filters, kernel_size, strides=strides, padding='same',
                  kernel_regularizer=regularizers.l2(0.01))(up)
    act = ReLU()(conv)
    up = UpSampling2D(size=2)(act)  # TODO: try "Conv2DTranspose"

output = Conv2D(filters=3,
                kernel_size=1,
                strides=1,
                padding='same',
                kernel_regularizer=regularizers.l2(0.01))(up)

model = Model(inputs=input, outputs=output)
Here is a link to the file I'm trying to replicate. How am I supposed to replicate this custom PyTorch UpSampler, which implements a customized PixelShuffle method?
Here is the relevant part of the UpSampler that I'm having the most trouble with:
import tensorflow as tf
import tensorflow.contrib.slim as slim

"""
Method to upscale an image using
conv2d transpose. Based on upscaling
method defined in the paper

x: input to be upscaled
scale: scale increase of upsample
features: number of features to compute
activation: activation function
"""
def upsample(x, scale=2, features=64, activation=tf.nn.relu):
    assert scale in [2, 3, 4]
    x = slim.conv2d(x, features, [3, 3], activation_fn=activation)
    if scale == 2:
        ps_features = 3 * (scale ** 2)
        x = slim.conv2d(x, ps_features, [3, 3], activation_fn=activation)
        # x = slim.conv2d_transpose(x, ps_features, 6, stride=1, activation_fn=activation)
        x = PS(x, 2, color=True)
    elif scale == 3:
        ps_features = 3 * (scale ** 2)
        x = slim.conv2d(x, ps_features, [3, 3], activation_fn=activation)
        # x = slim.conv2d_transpose(x, ps_features, 9, stride=1, activation_fn=activation)
        x = PS(x, 3, color=True)
    elif scale == 4:
        ps_features = 3 * (2 ** 2)
        for i in range(2):
            x = slim.conv2d(x, ps_features, [3, 3], activation_fn=activation)
            # x = slim.conv2d_transpose(x, ps_features, 6, stride=1, activation_fn=activation)
            x = PS(x, 2, color=True)
    return x

"""
Borrowed from https://github.com/tetrachrome/subpixel
Used for subpixel phase shifting after deconv operations
"""
def _phase_shift(I, r):
    bsize, a, b, c = I.get_shape().as_list()
    bsize = tf.shape(I)[0]  # Handling Dimension(None) type for undefined batch dim
    X = tf.reshape(I, (bsize, a, b, r, r))
    X = tf.transpose(X, (0, 1, 2, 4, 3))  # bsize, a, b, 1, 1
    X = tf.split(X, a, 1)  # a, [bsize, b, r, r]
    X = tf.concat([tf.squeeze(x, axis=1) for x in X], 2)  # bsize, b, a*r, r
    X = tf.split(X, b, 1)  # b, [bsize, a*r, r]
    X = tf.concat([tf.squeeze(x, axis=1) for x in X], 2)  # bsize, a*r, b*r
    return tf.reshape(X, (bsize, a*r, b*r, 1))

"""
Borrowed from https://github.com/tetrachrome/subpixel
Used for subpixel phase shifting after deconv operations
"""
def PS(X, r, color=False):
    if color:
        Xc = tf.split(X, 3, 3)
        X = tf.concat([_phase_shift(x, r) for x in Xc], 3)
    else:
        X = _phase_shift(X, r)
    return X
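For what it's worth, my understanding is that the rearrangement done by PS/_phase_shift above is essentially what tf.nn.depth_to_space performs, so a Keras Lambda layer may work as a drop-in replacement; a minimal sketch (the scale and channel counts are illustrative):
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Lambda

def keras_upsample(x, scale=2, out_channels=3):
    # Produce out_channels * scale^2 feature maps, then rearrange them into an
    # image that is `scale` times larger (the PixelShuffle / sub-pixel idea).
    x = Conv2D(out_channels * scale ** 2, 3, padding='same')(x)
    x = Lambda(lambda t: tf.nn.depth_to_space(t, scale))(x)
    return x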
