RuntimeError: mat1 and mat2 shapes cannot be multiplied (256x726 and 1000x1000) - pytorch

I'm trying to measure latent space clustering, but the following error is raised.
import argparse
import os
import pickle as pk

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from sklearn.cluster import KMeans

class AutoEncoder(nn.Module):
    def __init__(self, input_dim1, input_dim2, hidden_dims, agg, sep_decode):
        super(AutoEncoder, self).__init__()
        self.agg = agg
        self.sep_decode = sep_decode
        print("hidden_dims:", hidden_dims)
        self.encoder_layers = []
        self.encoder2_layers = []
        dims = [[input_dim1, input_dim2]] + hidden_dims
        for i in range(len(dims) - 1):
            if i == 0:
                layer = nn.Sequential(nn.Linear(dims[i][0], dims[i+1]), nn.ReLU())
                layer2 = nn.Sequential(nn.Linear(dims[i][1], dims[i+1]), nn.ReLU())
            elif i != 0 and i < len(dims) - 2:
                layer = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
                layer2 = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
            else:
                layer = nn.Linear(dims[i], dims[i+1])
                layer2 = nn.Linear(dims[i], dims[i+1])
            self.encoder_layers.append(layer)
            self.encoder2_layers.append(layer2)
        self.encoder = nn.Sequential(*self.encoder_layers)
        self.encoder2 = nn.Sequential(*self.encoder2_layers)
        self.decoder_layers = []
        self.decoder2_layers = []
        hidden_dims.reverse()
        dims = hidden_dims + [[input_dim1, input_dim2]]
        if self.agg == "concat" and not self.sep_decode:
            dims[0] = 2 * dims[0]
        for i in range(len(dims) - 1):
            if i < len(dims) - 2:
                layer = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
                layer2 = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
            else:
                layer = nn.Linear(dims[i], dims[i+1][0])
                layer2 = nn.Linear(dims[i], dims[i+1][1])
            self.decoder_layers.append(layer)
            self.decoder2_layers.append(layer2)
        self.decoder = nn.Sequential(*self.decoder_layers)
        self.decoder2 = nn.Sequential(*self.decoder2_layers)

    def forward(self, x1, x2):
        z1 = self.encoder(x1)
        z2 = self.encoder2(x2)
        if self.agg == "max":
            z = torch.max(z1, z2)
        elif self.agg == "multi":
            z = z1 * z2
        elif self.agg == "sum":
            z = z1 + z2
        elif self.agg == "concat":
            z = torch.cat([z1, z2], dim=1)
        if self.sep_decode:
            x_bar1 = self.decoder(z1)
            x_bar1 = F.normalize(x_bar1, dim=-1)
            x_bar2 = self.decoder2(z2)
            x_bar2 = F.normalize(x_bar2, dim=-1)
        else:
            x_bar1 = self.decoder(z)
            x_bar1 = F.normalize(x_bar1, dim=-1)
            x_bar2 = self.decoder2(z)
            x_bar2 = F.normalize(x_bar2, dim=-1)
        return x_bar1, x_bar2, z
class TopicCluster(nn.Module):
    def __init__(self, args):
        super(TopicCluster, self).__init__()
        self.alpha = 1.0
        self.dataset_path = args.dataset_path
        self.args = args
        self.device = args.device
        self.temperature = args.temperature
        self.distribution = args.distribution
        self.agg_method = args.agg_method
        self.sep_decode = (args.sep_decode == 1)
        input_dim1 = args.input_dim1
        input_dim2 = args.input_dim2
        hidden_dims = eval(args.hidden_dims)
        self.model = AutoEncoder(input_dim1, input_dim2, hidden_dims, self.agg_method, self.sep_decode)
        if self.agg_method == "concat":
            self.topic_emb = Parameter(torch.Tensor(args.n_clusters, 2*hidden_dims[-1]))
        else:
            self.topic_emb = Parameter(torch.Tensor(args.n_clusters, hidden_dims[-1]))
        torch.nn.init.xavier_normal_(self.topic_emb.data)

    def pretrain(self, input_data, pretrain_epoch=200):
        pretrained_path = os.path.join(self.dataset_path, f"pretrained_{args.suffix}.pt")
        if os.path.exists(pretrained_path) and self.args.load_pretrain:
            # load pretrain weights
            print(f"loading pretrained model from {pretrained_path}")
            self.model.load_state_dict(torch.load(pretrained_path))
        else:
            train_loader = DataLoader(input_data, batch_size=self.args.batch_size, shuffle=True)
            optimizer = Adam(self.model.parameters(), lr=self.args.lr)
            for epoch in range(pretrain_epoch):
                total_loss = 0
                for batch_idx, (x1, x2, _, weight) in enumerate(train_loader):
                    x1 = x1.to(self.device)
                    x2 = x2.to(self.device)
                    weight = weight.to(self.device)
                    optimizer.zero_grad()
                    x_bar1, x_bar2, z = self.model(x1, x2)
                    loss = cosine_dist(x_bar1, x1) + cosine_dist(x_bar2, x2)  # , weight)
                    total_loss += loss.item()
                    loss.backward()
                    optimizer.step()
                print(f"epoch {epoch}: loss = {total_loss / (batch_idx+1):.4f}")
            torch.save(self.model.state_dict(), pretrained_path)
            print(f"model saved to {pretrained_path}")

    def cluster_assign(self, z):
        if self.distribution == 'student':
            p = 1.0 / (1.0 + torch.sum(
                torch.pow(z.unsqueeze(1) - self.topic_emb, 2), 2) / self.alpha)
            p = p.pow((self.alpha + 1.0) / 2.0)
            p = (p.t() / torch.sum(p, 1)).t()
        else:
            self.topic_emb.data = F.normalize(self.topic_emb.data, dim=-1)
            z = F.normalize(z, dim=-1)
            sim = torch.matmul(z, self.topic_emb.t()) / self.temperature
            p = F.softmax(sim, dim=-1)
        return p

    def forward(self, x1, x2):
        x_bar1, x_bar2, z = self.model(x1, x2)
        p = self.cluster_assign(z)
        return x_bar1, x_bar2, z, p

    def target_distribution(self, x1, x2, freq, method='all', top_num=0):
        _, _, z = self.model(x1, x2)
        p = self.cluster_assign(z).detach()
        if method == 'all':
            q = p**2 / (p * freq.unsqueeze(-1)).sum(dim=0)
            q = (q.t() / q.sum(dim=1)).t()
        elif method == 'top':
            assert top_num > 0
            q = p.clone()
            sim = torch.matmul(self.topic_emb, z.t())
            _, selected_idx = sim.topk(k=top_num, dim=-1)
            for i, topic_idx in enumerate(selected_idx):
                q[topic_idx] = 0
                q[topic_idx, i] = 1
        return p, q

def cosine_dist(x_bar, x, weight=None):
    if weight is None:
        weight = torch.ones(x.size(0), device=x.device)
    cos_sim = (x_bar * x).sum(-1)
    cos_dist = 1 - cos_sim
    cos_dist = (cos_dist * weight).sum() / weight.sum()
    return cos_dist
def train(args, emb_dict):
    # ipdb.set_trace()
    inv_vocab = {k: " ".join(v) for k, v in emb_dict["inv_vocab"].items()}
    vocab = {" ".join(k): v for k, v in emb_dict["vocab"].items()}
    print(f"Vocab size: {len(vocab)}")
    embs = F.normalize(torch.tensor(emb_dict["vs_emb"]), dim=-1)
    embs2 = F.normalize(torch.tensor(emb_dict["oh_emb"]), dim=-1)
    freq = np.array(emb_dict["tuple_freq"])
    if not args.use_freq:
        freq = np.ones_like(freq)
    input_data = TensorDataset(embs, embs2, torch.arange(embs.size(0)), torch.tensor(freq))
    topic_cluster = TopicCluster(args).to(args.device)
    topic_cluster.pretrain(input_data, args.pretrain_epoch)
    train_loader = DataLoader(input_data, batch_size=args.batch_size, shuffle=False)
    optimizer = Adam(topic_cluster.parameters(), lr=args.lr)
    # topic embedding initialization
    embs = embs.to(args.device)
    embs2 = embs2.to(args.device)
    x_bar1, x_bar2, z = topic_cluster.model(embs, embs2)
    z = F.normalize(z, dim=-1)
    print(f"Running K-Means for initialization")
    kmeans = KMeans(n_clusters=args.n_clusters, n_init=5)
    if args.use_freq:
        y_pred = kmeans.fit_predict(z.data.cpu().numpy(), sample_weight=freq)
    else:
        y_pred = kmeans.fit_predict(z.data.cpu().numpy())
    print(f"Finish K-Means")
    freq = torch.tensor(freq).to(args.device)
    y_pred_last = y_pred
    topic_cluster.topic_emb.data = torch.tensor(kmeans.cluster_centers_).to(args.device)
    topic_cluster.train()
    i = 0
    for epoch in range(50):
        if epoch % 5 == 0:
            _, _, z, p = topic_cluster(embs, embs2)
            z = F.normalize(z, dim=-1)
            topic_cluster.topic_emb.data = F.normalize(topic_cluster.topic_emb.data, dim=-1)
            if not os.path.exists(os.path.join(args.dataset_path, f"clusters_{args.suffix}")):
                os.makedirs(os.path.join(args.dataset_path, f"clusters_{args.suffix}"))
            embed_save_path = os.path.join(args.dataset_path, f"clusters_{args.suffix}/embed_{epoch}.pt")
            torch.save({
                "inv_vocab": emb_dict['inv_vocab'],
                "embed": z.detach().cpu().numpy(),
                "topic_embed": topic_cluster.topic_emb.detach().cpu().numpy(),
            }, embed_save_path)
            f = open(os.path.join(args.dataset_path, f"clusters_{args.suffix}/{epoch}.txt"), 'w')
            pred_cluster = p.argmax(-1)
            result_strings = []
            for j in range(args.n_clusters):
                if args.sort_method == 'discriminative':
                    word_idx = torch.arange(embs.size(0))[pred_cluster == j]
                    sorted_idx = torch.argsort(p[pred_cluster == j][:, j], descending=True)
                    word_idx = word_idx[sorted_idx]
                else:
                    sim = torch.matmul(topic_cluster.topic_emb[j], z.t())
                    _, word_idx = sim.topk(k=30, dim=-1)
                word_cluster = []
                freq_sum = 0
                for idx in word_idx:
                    freq_sum += freq[idx].item()
                    if inv_vocab[idx.item()] not in word_cluster:
                        word_cluster.append(inv_vocab[idx.item()])
                        if len(word_cluster) >= 10:
                            break
                result_strings.append((freq_sum, f"Topic {j} ({freq_sum}): " + ', '.join(word_cluster) + '\n'))
            result_strings = sorted(result_strings, key=lambda x: x[0], reverse=True)
            for result_string in result_strings:
                f.write(result_string[1])
        for x1, x2, idx, weight in train_loader:
            if i % args.update_interval == 0:
                p, q = topic_cluster.target_distribution(embs, embs2, freq.clone().fill_(1), method='all', top_num=epoch+1)
                y_pred = p.cpu().numpy().argmax(1)
                delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
                y_pred_last = y_pred
                if i > 0 and delta_label < args.tol:
                    print(f'delta_label {delta_label:.4f} < tol ({args.tol})')
                    print('Reached tolerance threshold. Stopping training.')
                    return None
            i += 1
            x1 = x1.to(args.device)
            x2 = x2.to(args.device)
            idx = idx.to(args.device)
            weight = weight.to(args.device)
            x_bar1, x_bar2, _, p = topic_cluster(x1, x2)
            reconstr_loss = cosine_dist(x_bar1, x1) + cosine_dist(x_bar2, x2)  # , weight)
            kl_loss = F.kl_div(p.log(), q[idx], reduction='none').sum(-1)
            kl_loss = (kl_loss * weight).sum() / weight.sum()
            loss = args.gamma * kl_loss + reconstr_loss
            if i % args.update_interval == 0:
                print(f"KL loss: {kl_loss}; Reconstruction loss: {reconstr_loss}")
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    return None
if __name__ == "__main__":
    # CUDA_VISIBLE_DEVICES=0 python3 latent_space_clustering.py --dataset_path ./pandemic --input_emb_name po_tuple_features_all_svos.pk
    parser = argparse.ArgumentParser(
        description='train',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset_path', type=str)
    parser.add_argument('--input_emb_name', type=str)
    parser.add_argument('--lr', type=float, default=5e-4)
    parser.add_argument('--n_clusters', default=30, type=int)
    parser.add_argument('--input_dim1', default=1000, type=int)
    parser.add_argument('--input_dim2', default=1000, type=int)
    parser.add_argument('--agg_method', default="multi", choices=["sum", "multi", "concat", "attend"], type=str)
    parser.add_argument('--sep_decode', default=0, choices=[0, 1], type=int)
    parser.add_argument('--pretrain_epoch', default=100, type=int)
    parser.add_argument('--load_pretrain', default=False, action='store_true')
    parser.add_argument('--temperature', default=0.1, type=float)
    parser.add_argument('--sort_method', default='generative', choices=['generative', 'discriminative'])
    parser.add_argument('--distribution', default='softmax', choices=['softmax', 'student'])
    parser.add_argument('--batch_size', default=256, type=int)
    parser.add_argument('--use_freq', default=False, action='store_true')
    parser.add_argument('--hidden_dims', default='[1000, 2000, 1000, 100]', type=str)
    parser.add_argument('--suffix', type=str, default='')
    parser.add_argument('--gamma', default=5, type=float, help='weight of clustering loss')
    parser.add_argument('--update_interval', default=100, type=int)
    parser.add_argument('--tol', default=0.001, type=float)
    args = parser.parse_args()
    args.cuda = torch.cuda.is_available()
    print("use cuda: {}".format(args.cuda))
    args.device = torch.device("cuda" if args.cuda else "cpu")
    print(args)
    with open(os.path.join(args.dataset_path, args.input_emb_name), "rb") as fin:
        emb_dict = pk.load(fin)
    candidate_idx = train(args, emb_dict)
    print(candidate_idx)
The error I'm getting is: RuntimeError: mat1 and mat2 shapes cannot be multiplied (256x726 and 1000x1000). I cannot figure out which part is the problem. Please help me, and thank you so much.
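In this error, mat1 (256 × 726) is the batch being fed to the first nn.Linear (256 = batch_size, 726 input features) and mat2 (1000 × 1000) is that layer's weight, sized from --input_dim1 and hidden_dims[0] (both 1000 by default), so the loaded embeddings appear to have 726 features while the model expects 1000. A minimal check, as a sketch (variable names follow train() above; the last two lines are an assumed fix that sizes the model from the data):

print(embs.shape, embs2.shape)    # e.g. torch.Size([N, 726]) would explain the mismatch
args.input_dim1 = embs.size(-1)   # assumed fix: make --input_dim1 match the data
args.input_dim2 = embs2.size(-1)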

Related

How can I simultaneously predict with 10 model files using a for loop?

I want to use a for loop to call model_1 through model_10 and save their predictions in y_pred[]; I have to use a for loop unconditionally. I also need code to find and print the minimum, maximum, median, average, and standard deviation of y_pred (see the sketch after the test function below).
y_gt=0
y_pred=[]
y_pred_lstm=[]
path = '/content/drive/MyDrive/INU_DeepLearning/term_project_part2/model/model_10.pt'
net = torch.load(path)
loss_fn = nn.MSELoss(reduction="sum")
test_loss = test(test_dataloader, net, loss_fn, sd_train.max_price, mode='test')
X, y_gt_ = sd_test[0:]
X_ = X / sd_train.max_price
X_ = torch.Tensor(X_).to(device)
y = net(X_)
y_gt = y_gt_[0]
y_pred.append((y*sd_train.max_price).detach().cpu().numpy()[0])
print(y_pred)
print(y_gt)
This is the prediction section:
def test(dataloader, model, loss_fn, max_price, mode='val'):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss = 0
    count_test = 0
    with torch.no_grad():
        for X, y in dataloader:
            X_ = X / max_price
            y_ = y / max_price
            X_, y_ = X_.to(device), y_.to(device)
            pred = model(X_)
            test_loss += loss_fn(pred, y_).item()  # .reshape(-1,1)).item()
            count_test += len(X)
    test_loss /= count_test
    test_loss = max_price * np.sqrt(test_loss)
    if mode == 'val':
        print(f"Validation RMSE: {test_loss:>7f} \n")
    else:
        print(f"Test RMSE: {test_loss:>7f} \n")
    return test_loss
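A minimal sketch of the requested loop, assuming the checkpoints are all saved in the same folder as model_1.pt through model_10.pt (the path pattern and the reuse of X_ and sd_train from the snippet above are assumptions):

import numpy as np
import torch

y_pred = []
for i in range(1, 11):  # model_1 ... model_10
    path = f'/content/drive/MyDrive/INU_DeepLearning/term_project_part2/model/model_{i}.pt'
    net = torch.load(path)
    net.eval()
    with torch.no_grad():
        y = net(X_)  # X_ normalized and moved to device as in the snippet above
    y_pred.append((y * sd_train.max_price).detach().cpu().numpy()[0])

y_pred = np.array(y_pred)
print("min:", y_pred.min(), "max:", y_pred.max(), "median:", np.median(y_pred),
      "mean:", y_pred.mean(), "std:", y_pred.std())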

ViVIT PyTorch: RuntimeError: multi-target not supported at /pytorch/aten/src/THCUNN/generic/ClassNLLCriterion.cu:15

I am trying to run Video Vision Transformer (ViViT) code with my dataset, but I am getting an error when using CrossEntropyLoss from PyTorch as the loss function.
I have 6 classes:
['Run', 'Sit', 'Walk', 'Wave', 'Sit', 'Stand']
Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, weight_decay=1e-9, momentum=0.9)
Class Weights
tensor([0.0045, 0.0042, 0.0048, 0.0038, 0.0070, 0.0065])
Loss Function
loss_func = nn.CrossEntropyLoss(weight=class_weights.to(device))
Code Throwing the Error
train_epoch(model, optimizer, train_loader, train_loss_history, loss_func)
Error
RuntimeError: multi-target not supported at /pytorch/aten/src/THCUNN/generic/ClassNLLCriterion.cu:15
Code Calling the transformer
model = ViViT(224, 16, 100, 16).cuda()
Getting Video Frames
def get_frames(filename, n_frames=1):
    frames = []
    v_cap = cv2.VideoCapture(filename)
    v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_list = np.linspace(0, v_len - 1, n_frames + 1, dtype=np.int16)
    frame_dims = np.array([224, 224, 3])
    for fn in range(v_len):
        success, frame = v_cap.read()
        if success is False:
            continue
        if (fn in frame_list):
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, (frame_dims[0], frame_dims[1]))
            frames.append(frame)
    v_cap.release()
    return frames, v_len
Dataset Preprocessing
class DatasetProcessing(data.Dataset):
    def __init__(self, df, root_dir):
        super(DatasetProcessing, self).__init__()
        # List of all videos path
        video_list = df["Video"].apply(lambda x: root_dir + '/' + x)
        self.video_list = np.asarray(video_list)
        self.df = df

    def __getitem__(self, index):
        # Ensure that the raw videos are in respective folders and folder name matches the output class label
        video_label = self.video_list[index].split('/')[-2]
        video_name = self.video_list[index].split('/')[-1]
        video_frames, len_ = get_frames(self.video_list[index], n_frames=15)
        video_frames = np.asarray(video_frames)
        video_frames = video_frames / 255
        class_list = ['Run', 'Walk', 'Wave', 'Sit', 'Turn', 'Stand']
        class_id_loc = np.where(class_list == video_label)
        label = class_id_loc
        d = torch.as_tensor(np.array(video_frames).astype('float'))
        l = torch.as_tensor(np.array(label).astype('float'))
        return (d, l)

    def __len__(self):
        return self.video_list.shape[0]
Training Epochs
def train_epoch(model, optimizer, data_loader, loss_history, loss_func):
    total_samples = len(data_loader.dataset)
    model.train()
    for i, (data, target) in enumerate(data_loader):
        optimizer.zero_grad()
        x = data.cuda()
        data = rearrange(x, 'b p h w c -> b p c h w').cuda()
        target = target.type(torch.LongTensor).cuda()
        pred = model(data.float())
        output = F.log_softmax(pred, dim=1)
        loss = loss_func(output, target.squeeze(1))
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print('[' + '{:5}'.format(i * len(data)) + '/' + '{:5}'.format(total_samples) +
                  ' (' + '{:3.0f}'.format(100 * i / len(data_loader)) + '%)] Loss: ' +
                  '{:6.4f}'.format(loss.item()))
            loss_history.append(loss.item())
Evaluate Model
def evaluate(model, data_loader, loss_history, loss_func):
    model.eval()
    total_samples = len(data_loader.dataset)
    correct_samples = 0
    total_loss = 0
    with torch.no_grad():
        for data, target in data_loader:
            x = data.cuda()
            data = rearrange(x, 'b p h w c -> b p c h w').cuda()
            target = target.type(torch.LongTensor).cuda()
            output = F.log_softmax(model(data.float()), dim=1)
            loss = loss_func(output, target)
            _, pred = torch.max(output, dim=1)
            total_loss += loss.item()
            correct_samples += pred.eq(target).sum()
    avg_loss = total_loss / total_samples
    loss_history.append(avg_loss)
    print('\nAverage test loss: ' + '{:.4f}'.format(avg_loss) +
          ' Accuracy:' + '{:5}'.format(correct_samples) + '/' +
          '{:5}'.format(total_samples) + ' (' +
          '{:4.2f}'.format(100.0 * correct_samples / total_samples) + '%)\n')
Transformer
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        self.norm = nn.LayerNorm(dim)
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout=dropout))
            ]))

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return self.norm(x)
ViViT Code
class ViViT(nn.Module):
    def __init__(self, image_size, patch_size, num_classes, num_frames, dim=192, depth=4, heads=3,
                 pool='cls', in_channels=3, dim_head=64, dropout=0., emb_dropout=0., scale_dim=4):
        super().__init__()
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = in_channels * patch_size ** 2
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b t c (h p1) (w p2) -> b t (h w) (p1 p2 c)', p1=patch_size, p2=patch_size),
            nn.Linear(patch_dim, dim),
        )
        self.pos_embedding = nn.Parameter(torch.randn(1, num_frames, num_patches + 1, dim))
        self.space_token = nn.Parameter(torch.randn(1, 1, dim))
        self.space_transformer = Transformer(dim, depth, heads, dim_head, dim * scale_dim, dropout)
        self.temporal_token = nn.Parameter(torch.randn(1, 1, dim))
        self.temporal_transformer = Transformer(dim, depth, heads, dim_head, dim * scale_dim, dropout)
        self.dropout = nn.Dropout(emb_dropout)
        self.pool = pool
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, x):
        x = self.to_patch_embedding(x)
        b, t, n, _ = x.shape
        cls_space_tokens = repeat(self.space_token, '() n d -> b t n d', b=b, t=t)
        x = torch.cat((cls_space_tokens, x), dim=2)
        x += self.pos_embedding[:, :, :(n + 1)]
        x = self.dropout(x)
        x = rearrange(x, 'b t n d -> (b t) n d')
        x = self.space_transformer(x)
        x = rearrange(x[:, 0], '(b t) ... -> b t ...', b=b)
        cls_temporal_tokens = repeat(self.temporal_token, '() n d -> b n d', b=b)
        x = torch.cat((cls_temporal_tokens, x), dim=1)
        x = self.temporal_transformer(x)
        x = x.mean(dim=1) if self.pool == 'mean' else x[:, 0]
        return self.mlp_head(x)
Multi-target support appears to be a feature added in version 1.10.0:
https://discuss.pytorch.org/t/crossentropyloss-vs-per-class-probabilities-target/138331
Please check your PyTorch version.
Please refer to the example using the UCF101 top-5 dataset, which is available on my Colab. The PyTorch version there is 1.12.0+cu113, and the code you listed was able to run training almost exactly as it was written.
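For context: before 1.10, nn.CrossEntropyLoss only accepts class-index targets of shape (N,); a target of shape (N, 1), like the one produced by the dataset above, triggers exactly this multi-target error unless it is squeezed. A small sketch of the shape contract (note also that nn.CrossEntropyLoss applies log-softmax internally, so feeding it F.log_softmax output applies the log-softmax twice):

import torch
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss()
logits = torch.randn(4, 6)                   # (batch, num_classes), raw scores
target = torch.tensor([[2], [0], [5], [1]])  # (batch, 1) fails on older PyTorch
loss = loss_fn(logits, target.squeeze(1))    # (batch,) class indices work everywhere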

Numpy implementation for regression using NN

I am implementing my own neural network model for regression using only NumPy, and I'm getting really weird results when I test the model on m > 1 samples (for m = 1 it works fine). It seems like the model collapses and predicts only specific values for the whole batch:
Input:
X [[ 7.62316802 -6.12433912]
[ 1.11048966 4.97509421]]
Expected Output:
Y [[16.47952332 12.50288412]]
Model Output
y_hat [[10.42446234 10.42446234]]
Any idea what might cause this issue?
My code:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# np.seterr(all=None, divide=None, over=None, under=None, invalid=None)
data_x = np.random.uniform(0, 10, size=(2, 1))
data_y = (2 * data_x).sum(axis=0, keepdims=True)
# data_y = data_x[0, :] ** 2 + data_x[1, :] ** 2
# data_y = data_y.reshape((1, -1))
# fig = plt.figure()
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(data_x[0, :], data_x[1, :], data_y)
# plt.show()

memory = dict()
nn_architecture = [
    {"input_dim": 2, "output_dim": 6, "activation": "sigmoid", "bias": True},
    {"input_dim": 6, "output_dim": 4, "activation": "sigmoid", "bias": True},
    {"input_dim": 4, "output_dim": 1, "activation": "relu", "bias": True}
]
def init_network_parameters(nn_architecture):
    parameters = []
    for idx, layer in enumerate(nn_architecture):
        layer_params = {}
        input_dim, output_dim, activation, bias = layer.values()
        W = np.random.uniform(0, 1, (output_dim, input_dim))
        B = np.zeros((output_dim, 1))
        if bias:
            B = np.ones((output_dim, 1))
        activation_func = identity
        backward_activation_func = identity_backward
        if activation == 'sigmoid':  # '==' rather than 'is': identity comparison of strings is unreliable
            activation_func = sigmoid
            backward_activation_func = sigmoid_backward
        elif activation == 'relu':
            activation_func = relu
            backward_activation_func = relu_backward
        else:
            print(f"Activation function set to identity for layer {idx}")
        layer_params["W"] = W
        layer_params["B"] = B
        layer_params["activation"] = activation_func
        layer_params["backward_activation"] = backward_activation_func
        layer_params["bias"] = bias
        parameters.append(layer_params)
    return parameters
def identity(z):
    return z

def sigmoid(z):
    return np.clip(1 / (1 + np.exp(-z)), -100, 100)

def relu(z):
    output = np.array(z, copy=True)
    output[z <= 0] = 0
    return output

def identity_backward(z, dA):
    return dA

def sigmoid_backward(z, dA):
    return np.clip(z * (1 - z) * dA, -100, 100)

def relu_backward(z, dA):
    output = np.ones(z.shape)
    output[z <= 0] = 0
    return output * dA

def forward_single_layer(prev_A, parameters, idx):
    W = parameters["W"]
    B = parameters["B"]
    activation = parameters["activation"]
    if parameters["bias"]:
        curr_Z = W.dot(prev_A) + B
    else:
        curr_Z = W.dot(prev_A)
    curr_A = activation(curr_Z)
    memory[f"Z{idx+1}"] = curr_Z
    memory[f"A{idx+1}"] = curr_A
    return curr_Z, curr_A

def forward(X, parameters):
    prev_A = X
    memory["A0"] = prev_A
    for idx, layer_params in enumerate(parameters):
        curr_Z, prev_A = forward_single_layer(prev_A=prev_A, parameters=layer_params, idx=idx)
    return prev_A

def criteria(y_hat, y):
    assert y_hat.shape == y.shape
    n = y_hat.shape[0]
    m = y_hat.shape[1]
    loss = np.sum(y_hat - y, axis=1) / m
    dA = (y_hat - y) / m
    return loss, dA

def backward_single_layer(prev_A, dA, curr_W, curr_Z, backward_activation, idx):
    m = prev_A.shape[1]
    dZ = backward_activation(z=curr_Z, dA=dA)
    dW = np.dot(dZ, prev_A.T) / m
    dB = np.sum(dZ, axis=1, keepdims=True) / m
    dA = np.dot(curr_W.T, dZ)
    return dA, dW, dB

def backpropagation(parameters, dA):
    grads = {}
    for idx in reversed(range(len(parameters))):
        layer = parameters[idx]
        prev_A = memory[f"A{idx}"]
        curr_Z = memory[f"Z{idx+1}"]
        curr_W = layer["W"]
        backward_activation = layer["backward_activation"]
        dA, dW, dB = backward_single_layer(prev_A, dA, curr_W, curr_Z, backward_activation, idx)
        grads[f"W{idx}"] = dW
        grads[f"B{idx}"] = dB
    return grads

def update_params(parameters, grads, lr=0.001):
    new_params = []
    for idx, layer in enumerate(parameters):
        layer["W"] -= lr * grads[f"W{idx}"]
        layer["B"] -= lr * grads[f"B{idx}"]
        new_params.append(layer)
    return new_params
X = np.random.uniform(-10, 10, (2, 2))
Y = 2 * X[0, :] + X[1, :] ** 2
Y = Y.reshape((1, X.shape[1]))

parameters = init_network_parameters(nn_architecture)
n_epochs = 1000
lr = 0.01
loss_history = []
for i in range(n_epochs):
    y_hat = forward(X, parameters)
    loss, dA = criteria(y_hat, Y)
    loss_history.append(loss)
    grads = backpropagation(parameters, dA)
    parameters = update_params(parameters, grads, lr)
    if not i % 10:
        print(f"Epoch {i}/{n_epochs} loss={loss}")

print("X", X)
print("Y", Y)
print("y_hat", y_hat)
There wasn't a problem with my implementation, just overfitting.
More information can be found here.
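One side note on the code above: criteria returns the mean of the raw residuals as the printed loss, while the gradient dA it returns is the one a mean-squared-error objective would give. A conventional MSE criterion, as a sketch, keeps the two consistent:

import numpy as np

def criteria(y_hat, y):
    assert y_hat.shape == y.shape
    m = y_hat.shape[1]
    loss = np.sum((y_hat - y) ** 2) / (2 * m)  # scalar halved MSE
    dA = (y_hat - y) / m                       # unchanged: d(loss)/d(y_hat)
    return loss, dA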

Loss and accuracy of LSTM model not changing

[Picture of the train and test loss]
Here's a look at the LSTM model:
# Create LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        # Number of hidden dimensions
        self.hidden_dim = hidden_dim
        # Number of hidden layers
        self.layer_dim = layer_dim
        # LSTM
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True, dropout=0.1)
        # Readout layer
        self.f1 = nn.Linear(hidden_dim, output_dim)
        self.dropout_layer = nn.Dropout(p=0.2)
        self.softmax = nn.Softmax()

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).type(torch.FloatTensor))
        # Initialize cell state
        c0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).type(torch.FloatTensor))
        # One time step
        out, (hn, cn) = self.lstm(x, (h0, c0))
        out = self.dropout_layer(hn[-1])
        out = self.f1(out)
        out = self.softmax(out)
        return out

# LSTM Configuration
batch_size = 3000
num_epochs = 20
learning_rate = 0.001  # Check this learning rate

# Create LSTM
input_dim = 1    # input dimension
hidden_dim = 30  # hidden layer dimension
layer_dim = 15   # number of hidden layers
output_dim = 1   # output dimension
num_layers = 10  # num_layers
print("input_dim = ", input_dim, "\nhidden_dim = ", hidden_dim, "\nlayer_dim = ", layer_dim, "\noutput_dim = ", output_dim)

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
model.cuda()
error = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

graph_index = 0
test_loss = []
train_loss = []
plt_test_index = []
plt_train_index = []
tmp_index = []
tmp_train = []
tmp_test = []

# model.init_hidden()
for epoch in range(num_epochs):
    # Train
    model.train()
    loss_list_train = []
    loss_list_test = []
    total_train = 0
    equals_train = 0
    total_test = 0
    num0_train = 0
    num1_train = 0
    num0_test = 0
    num1_test = 0
    equals_test = 0
    TP_train = 0
    FP_train = 0
    TN_train = 0
    FN_train = 0
    TP_test = 0
    FP_test = 0
    TN_test = 0
    FN_test = 0
    for i, (inputs, targets) in enumerate(train_loader):
        train = Variable(inputs.type(torch.FloatTensor).cuda())
        targets = Variable(targets.type(torch.FloatTensor).cuda())
        optimizer.zero_grad()
        outputs = model(train)
        loss = error(outputs, targets)
        loss_list_train.append(loss.item())
        loss.backward()
        # loss.backward(retain_graph=True)
        optimizer.step()
        t = np.where(targets.cpu().detach().numpy() > 0.5, 1, 0)
        o = np.where(outputs.cpu().detach().numpy() > 0.5, 1, 0)
        total_train += t.shape[0]
        equals_train += np.sum(t == o)
        num0_train += np.sum(t == 0)
        num1_train += np.sum(t == 1)
        TP_train += np.sum(np.logical_and(t == 1, o == 1))
        FP_train += np.sum(np.logical_and(t == 1, o == 0))
        TN_train += np.sum(np.logical_and(t == 0, o == 0))
        FN_train += np.sum(np.logical_and(t == 0, o == 1))
        tb.save_value('Train Loss', 'train_loss', globaliter, loss.item())
        globaliter += 1
        tb.flush_line('train_loss')
        print(i)

    # Test
    model.eval()
    targets_plot = np.array([])
    outputs_plot = np.array([])
    inputs_plot = np.array([])
    for inputs, targets in test_loader:
        inputs = Variable(inputs.type(torch.FloatTensor).cuda())
        targets = Variable(targets.type(torch.FloatTensor).cuda())
        outputs = model(inputs)
        loss = error(outputs, targets)
        loss_list_test.append(loss.item())
        # print(outputs.cpu().detach().numpy())
        t = np.where(targets.cpu().detach().numpy() > 0.5, 1, 0)
        o = np.where(outputs.cpu().detach().numpy() > 0.5, 1, 0)
        total_test += t.shape[0]
        equals_test += np.sum(t == o)
        num0_test += np.sum(t == 0)
        num1_test += np.sum(t == 1)
        TP_test += np.sum(np.logical_and(t == 1, o == 1))
        FP_test += np.sum(np.logical_and(t == 0, o == 1))
        TN_test += np.sum(np.logical_and(t == 0, o == 0))
        FN_test += np.sum(np.logical_and(t == 1, o == 0))
        tb.save_value('Test Loss', 'test_loss', globaliter2, loss.item())
        globaliter2 += 1
        tb.flush_line('test_loss')

    # Save value in array
    graph_index += 1
    plt_train_index.append(graph_index)
    plt_test_index.append(graph_index)
    train_loss.append(np.mean(np.array(loss_list_train)))
    test_loss.append(np.mean(np.array(loss_list_test)))
    print("------------------------------")
    print("Epoch : ", epoch)
    print("----- Train -----")
    print("Total =", total_train, " | Num 0 =", num0_train, " | Num 1 =", num1_train)
    print("Equals =", equals_train)
    print("Accuracy =", (equals_train / total_train) * 100, "%")
    # print("TP =", TP_train / total_train, "% | TN =", TN_train / total_train, "% | FP =", FP_train / total_train, "% | FN =", FN_train / total_train, "%")
    print("----- Test -----")
    print("Total =", total_test, " | Num 0 =", num0_test, " | Num 1 =", num1_test)
    print("Equals =", equals_test)
    print("Accuracy =", (equals_test / total_test) * 100, "%")
I am using the model to do binary classification on sequences of length 300. The accuracy and the loss are not changing over several epochs. I tried changing the number of layers, the number of hidden states, and the activation function, but to no avail. I don't know what I am doing wrong; I am probably missing something fundamental. Any help is appreciated.

How can I implement a custom GRU in Keras?

I am trying to implement a custom GRU layer in Keras 2.1.2-py36_0, where I want to use the following gate equations:

z_t = act(W_z · h_{t-1} + x_t)
r_t = act(W_r · h_{t-1} + x_t)
h_t = act(W_h · (r_t * h_{t-1}) + x_t)

instead of Keras's current implementation of the gates as:

z_t = act(W_z · h_{t-1} + U_z · x_t)
r_t = act(W_r · h_{t-1} + U_r · x_t)
h_t = act(W_h · (r_t * h_{t-1}) + U_h · x_t)
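In terms of the Keras GRU source, the change amounts to dropping the input projections U_z, U_r, U_h and feeding the (already embedded) inputs straight into each gate. Schematically (a sketch; x_z and kernel_z are the names used inside the GRU cell):

# Keras default gate input:
x_z = K.dot(inputs_z, self.kernel_z)
# desired variant (inputs are already embedding parameters):
x_z = inputs_z

and likewise for x_r and x_h, which is exactly what the commented-out lines in the cell below do.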
Customizing GRU cell for the data
class CGRUCell(Layer):
    def __init__(self, units,
                 activation='tanh',
                 recurrent_activation='hard_sigmoid',
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 recurrent_initializer='orthogonal',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 recurrent_regularizer=None,
                 bias_regularizer=None,
                 kernel_constraint=None,
                 recurrent_constraint=None,
                 bias_constraint=None,
                 dropout=0.,
                 recurrent_dropout=0.,
                 implementation=1,
                 **kwargs):
        super(CGRUCell, self).__init__(**kwargs)
        self.units = units
        self.activation = activations.get(activation)
        self.recurrent_activation = activations.get(recurrent_activation)
        self.use_bias = use_bias
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.recurrent_initializer = initializers.get(recurrent_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.recurrent_constraint = constraints.get(recurrent_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        self.dropout = min(1., max(0., dropout))
        self.recurrent_dropout = min(1., max(0., recurrent_dropout))
        self.implementation = implementation
        self.state_size = self.units
        self._dropout_mask = None
        self._recurrent_dropout_mask = None

    def build(self, input_shape):
        input_dim = input_shape[-1]
        # self.kernel = self.add_weight(shape=(input_dim, self.units * 3),
        #                               name='kernel',
        #                               initializer=self.kernel_initializer,
        #                               regularizer=self.kernel_regularizer,
        #                               constraint=self.kernel_constraint)
        self.recurrent_kernel = self.add_weight(
            shape=(self.units, self.units * 3),
            name='recurrent_kernel',
            initializer=self.recurrent_initializer,
            regularizer=self.recurrent_regularizer,
            constraint=self.recurrent_constraint)
        if self.use_bias:
            self.bias = self.add_weight(shape=(self.units * 3,),
                                        name='bias',
                                        initializer=self.bias_initializer,
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
        else:
            self.bias = None
        # self.kernel_z = self.kernel[:, :self.units]
        self.recurrent_kernel_z = self.recurrent_kernel[:, :self.units]
        # self.kernel_r = self.kernel[:, self.units: self.units * 2]
        self.recurrent_kernel_r = self.recurrent_kernel[:, self.units: self.units * 2]
        # self.kernel_h = self.kernel[:, self.units * 2:]
        self.recurrent_kernel_h = self.recurrent_kernel[:, self.units * 2:]
        if self.use_bias:
            self.bias_z = self.bias[:self.units]
            self.bias_r = self.bias[self.units: self.units * 2]
            self.bias_h = self.bias[self.units * 2:]
        else:
            self.bias_z = None
            self.bias_r = None
            self.bias_h = None
        self.built = True

    def call(self, inputs, states, training=None):
        h_tm1 = states[0]  # previous memory
        if 0 < self.dropout < 1 and self._dropout_mask is None:
            self._dropout_mask = _generate_dropout_mask(
                _generate_dropout_ones(inputs, K.shape(inputs)[-1]),
                self.dropout,
                training=training,
                count=3)
        if (0 < self.recurrent_dropout < 1 and
                self._recurrent_dropout_mask is None):
            self._recurrent_dropout_mask = _generate_dropout_mask(
                _generate_dropout_ones(inputs, self.units),
                self.recurrent_dropout,
                training=training,
                count=3)
        # dropout matrices for input units
        dp_mask = self._dropout_mask
        # dropout matrices for recurrent units
        rec_dp_mask = self._recurrent_dropout_mask
        if self.implementation == 1:
            if 0. < self.dropout < 1.:
                inputs_z = inputs * dp_mask[0]
                inputs_r = inputs * dp_mask[1]
                inputs_h = inputs * dp_mask[2]
            else:
                inputs_z = inputs
                inputs_r = inputs
                inputs_h = inputs
            print(inputs)
            # Custom implementation of inputs which are already embedding parameters
            # x_z = K.dot(inputs_z, self.kernel_z)
            # x_r = K.dot(inputs_r, self.kernel_r)
            # x_h = K.dot(inputs_h, self.kernel_h)
            # if self.use_bias:
            #     x_z = K.bias_add(x_z, self.bias_z)
            #     x_r = K.bias_add(x_r, self.bias_r)
            #     x_h = K.bias_add(x_h, self.bias_h)
            x_z = inputs_z
            x_r = inputs_r
            x_h = inputs_h
            if 0. < self.recurrent_dropout < 1.:
                h_tm1_z = h_tm1 * rec_dp_mask[0]
                h_tm1_r = h_tm1 * rec_dp_mask[1]
                h_tm1_h = h_tm1 * rec_dp_mask[2]
            else:
                h_tm1_z = h_tm1
                h_tm1_r = h_tm1
                h_tm1_h = h_tm1
            z = self.recurrent_activation(x_z + K.dot(h_tm1_z, self.recurrent_kernel_z))
            r = self.recurrent_activation(x_r + K.dot(h_tm1_r, self.recurrent_kernel_r))
            hh = self.activation(x_h + K.dot(r * h_tm1_h, self.recurrent_kernel_h))
        else:
            if 0. < self.dropout < 1.:
                inputs *= dp_mask[0]
            # Custom implementation of inputs which are already embedding parameters
            # matrix_x = K.dot(inputs, self.kernel)
            # if self.use_bias:
            #     matrix_x = K.bias_add(matrix_x, self.bias)
            matrix_x = inputs
            if 0. < self.recurrent_dropout < 1.:
                h_tm1 *= rec_dp_mask[0]
            matrix_inner = K.dot(h_tm1, self.recurrent_kernel[:, :2 * self.units])
            x_z = matrix_x[:, :self.units]
            x_r = matrix_x[:, self.units: 2 * self.units]
            recurrent_z = matrix_inner[:, :self.units]
            recurrent_r = matrix_inner[:, self.units: 2 * self.units]
            z = self.recurrent_activation(x_z + recurrent_z)
            r = self.recurrent_activation(x_r + recurrent_r)
            x_h = matrix_x[:, 2 * self.units:]
            recurrent_h = K.dot(r * h_tm1, self.recurrent_kernel[:, 2 * self.units:])
            hh = self.activation(x_h + recurrent_h)
        h = z * h_tm1 + (1 - z) * hh
        if 0 < self.dropout + self.recurrent_dropout:
            if training is None:
                h._uses_learning_phase = True
        return h, [h]

    def get_config(self):
        config = {'units': self.units,
                  'activation': activations.serialize(self.activation),
                  'recurrent_activation': activations.serialize(self.recurrent_activation),
                  'use_bias': self.use_bias,
                  'kernel_initializer': initializers.serialize(self.kernel_initializer),
                  'recurrent_initializer': initializers.serialize(self.recurrent_initializer),
                  'bias_initializer': initializers.serialize(self.bias_initializer),
                  'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
                  'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer),
                  'bias_regularizer': regularizers.serialize(self.bias_regularizer),
                  'kernel_constraint': constraints.serialize(self.kernel_constraint),
                  'recurrent_constraint': constraints.serialize(self.recurrent_constraint),
                  'bias_constraint': constraints.serialize(self.bias_constraint),
                  'dropout': self.dropout,
                  'recurrent_dropout': self.recurrent_dropout,
                  'implementation': self.implementation}
        base_config = super(CGRUCell, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
class CGRU(RNN):
    # @interfaces.legacy_recurrent_support
    def __init__(self, units,
                 activation='tanh',
                 recurrent_activation='hard_sigmoid',
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 recurrent_initializer='orthogonal',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 recurrent_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 recurrent_constraint=None,
                 bias_constraint=None,
                 dropout=0.,
                 recurrent_dropout=0.,
                 implementation=1,
                 return_sequences=False,
                 return_state=False,
                 go_backwards=False,
                 stateful=False,
                 unroll=False,
                 **kwargs):
        if implementation == 0:
            warnings.warn('`implementation=0` has been deprecated, '
                          'and now defaults to `implementation=1`. '
                          'Please update your layer call.')
        cell = CGRUCell(units,
                        activation=activation,
                        recurrent_activation=recurrent_activation,
                        use_bias=use_bias,
                        kernel_initializer=kernel_initializer,
                        recurrent_initializer=recurrent_initializer,
                        bias_initializer=bias_initializer,
                        kernel_regularizer=kernel_regularizer,
                        recurrent_regularizer=recurrent_regularizer,
                        bias_regularizer=bias_regularizer,
                        kernel_constraint=kernel_constraint,
                        recurrent_constraint=recurrent_constraint,
                        bias_constraint=bias_constraint,
                        dropout=dropout,
                        recurrent_dropout=recurrent_dropout,
                        implementation=implementation)
        super(CGRU, self).__init__(cell,
                                   return_sequences=return_sequences,
                                   return_state=return_state,
                                   go_backwards=go_backwards,
                                   stateful=stateful,
                                   unroll=unroll,
                                   **kwargs)
        self.activity_regularizer = regularizers.get(activity_regularizer)

    def call(self, inputs, mask=None, training=None, initial_state=None):
        self.cell._dropout_mask = None
        self.cell._recurrent_dropout_mask = None
        return super(CGRU, self).call(inputs,
                                      mask=mask,
                                      training=training,
                                      initial_state=initial_state)

    @property
    def units(self):
        return self.cell.units

    @property
    def activation(self):
        return self.cell.activation

    @property
    def recurrent_activation(self):
        return self.cell.recurrent_activation

    @property
    def use_bias(self):
        return self.cell.use_bias

    @property
    def kernel_initializer(self):
        return self.cell.kernel_initializer

    @property
    def recurrent_initializer(self):
        return self.cell.recurrent_initializer

    @property
    def bias_initializer(self):
        return self.cell.bias_initializer

    @property
    def kernel_regularizer(self):
        return self.cell.kernel_regularizer

    @property
    def recurrent_regularizer(self):
        return self.cell.recurrent_regularizer

    @property
    def bias_regularizer(self):
        return self.cell.bias_regularizer

    @property
    def kernel_constraint(self):
        return self.cell.kernel_constraint

    @property
    def recurrent_constraint(self):
        return self.cell.recurrent_constraint

    @property
    def bias_constraint(self):
        return self.cell.bias_constraint

    @property
    def dropout(self):
        return self.cell.dropout

    @property
    def recurrent_dropout(self):
        return self.cell.recurrent_dropout

    @property
    def implementation(self):
        return self.cell.implementation

    def get_config(self):
        config = {'units': self.units,
                  'activation': activations.serialize(self.activation),
                  'recurrent_activation': activations.serialize(self.recurrent_activation),
                  'use_bias': self.use_bias,
                  'kernel_initializer': initializers.serialize(self.kernel_initializer),
                  'recurrent_initializer': initializers.serialize(self.recurrent_initializer),
                  'bias_initializer': initializers.serialize(self.bias_initializer),
                  'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
                  'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer),
                  'bias_regularizer': regularizers.serialize(self.bias_regularizer),
                  'activity_regularizer': regularizers.serialize(self.activity_regularizer),
                  'kernel_constraint': constraints.serialize(self.kernel_constraint),
                  'recurrent_constraint': constraints.serialize(self.recurrent_constraint),
                  'bias_constraint': constraints.serialize(self.bias_constraint),
                  'dropout': self.dropout,
                  'recurrent_dropout': self.recurrent_dropout,
                  'implementation': self.implementation}
        base_config = super(CGRU, self).get_config()
        del base_config['cell']
        return dict(list(base_config.items()) + list(config.items()))

    @classmethod
    def from_config(cls, config):
        if 'implementation' in config and config['implementation'] == 0:
            config['implementation'] = 1
        return cls(**config)
Model Implementation is as follows:
user_input = Input(batch_shape=(batch_size, chunk_size,), dtype='int32', name='user_inputs')
user_emb = Embedding(input_dim=num_users+1, output_dim=out_dim, input_length=chunk_size)(user_input)
item_input = Input(batch_shape=(batch_size, chunk_size,), dtype='int32', name='item_inputs')
item_emb = Embedding(input_dim=num_items+1, output_dim=out_dim, input_length=chunk_size)(item_input)
inputs = keras.layers.add([user_emb, item_emb])

gru_args = {
    "units": hidden_size,
    "return_sequences": True,
    # "return_state": True,
    "stateful": True,
    "unroll": False
}
gru = CGRU(**gru_args)(inputs)
outputs = Dense(num_items+1, activation='softmax')(gru)

recc_model = Model(inputs=[user_input, item_input], outputs=outputs)
recc_model.compile(optimizer='rmsprop',
                   loss='categorical_crossentropy',
                   metrics=[metrics.categorical_accuracy])
                   # metrics=[metrics.sparse_categorical_accuracy])
But on running the code I am getting the following error, which seems to be due to gradients being computed as None:
ValueError: Tried to convert 'x' to a tensor and failed. Error: None values not supported.
Find the complete error here: https://pastebin.com/n9UzCRiP
The error occurs because the bias weights are added to the model but not used anywhere.
When you call self.add_weight(...), you have to make sure these weights are used somewhere in your model. Otherwise, since these weights are not connected to the loss tensor, TF cannot compute their gradient and an error is raised.
If you don't need the bias weights, you can either remove the add_weight lines or set use_bias=False in your cell.
Also, I think you don't need to re-implement a whole CGRU layer to use a custom cell; just wrapping your custom cell with the built-in RNN layer should work:
gru = RNN(CGRUCell(hidden_size, use_bias=False),
          return_sequences=True,
          stateful=True,
          unroll=False)(inputs)
