Python: Cost keeps increasing in a neural network that uses TensorFlow

I am trying to create a neural network using TensorFlow but my cost keeps increasing.
This is my code so far:
class AI_core:
    def __init__(self, nodes_in_each_layer):
        self.data_in_placeholder = tf.placeholder("float", [None, nodes_in_each_layer[0]])
        self.data_out_placeholder = tf.placeholder("float")
        self.init_neural_network(nodes_in_each_layer)

    def init_neural_network(self, n_nodes_h):
        # n_nodes_h contains the number of nodes for each layer
        # n_nodes_h[0] = number of inputs
        # n_nodes_h[-1] = number of outputs
        self.layers = [None for i in range(len(n_nodes_h)-1)]
        for i in range(1, len(n_nodes_h)):
            self.layers[i-1] = {"weights": tf.Variable(tf.random_normal([n_nodes_h[i-1], n_nodes_h[i]])),
                                "biases": tf.Variable(tf.random_normal([n_nodes_h[i]]))}

    def neural_network_model(self, data):
        for i in range(len(self.layers)):
            data = tf.matmul(data, self.layers[i]["weights"]) + self.layers[i]["biases"]
            if i != len(self.layers):
                data = tf.nn.relu(data)
        return data

    def train_neural_network(self, data):
        prediction = self.neural_network_model(self.data_in_placeholder)
        cost = tf.reduce_mean(tf.square(self.data_out_placeholder - prediction))
        optimiser = tf.train.GradientDescentOptimizer(learning_rate=0.0001).minimize(cost)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            epoch_loss = 0
            for _ in range(int(data.length)):
                epoch_x, epoch_y = data.next_batch()
                c = sess.run(cost, feed_dict={self.data_in_placeholder: epoch_x, self.data_out_placeholder: epoch_y})
                _ = sess.run(optimiser, feed_dict={self.data_in_placeholder: epoch_x, self.data_out_placeholder: epoch_y})
                epoch_loss += np.sum(c)
                print("loss =", epoch_loss)
For now I am trying to get the network to approximate the math.sin function.
I have set the nodes_in_each_layer = [1, 5, 5, 5, 1] and batch_size = 3. This is the output:
loss = 0.8417138457298279
loss = 1.190976768732071
loss = 1.8150676786899567
loss = 2.433938592672348
loss = 3.092040628194809
loss = 3.478498786687851
loss = 3.7894928753376007
loss = 4.598285228013992
loss = 5.418278068304062
loss = 5.555390268564224
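(For context, the data object is not shown in the question; a hypothetical batch provider matching the data.length and data.next_batch() calls used above, for the sine task, might look like this.)

class SinData:
    def __init__(self, n_batches=10, batch_size=3):
        self.length = n_batches        # number of batches drawn per epoch
        self.batch_size = batch_size

    def next_batch(self):
        # random inputs in [0, 2*pi); targets are sin(x); both shaped [batch_size, 1]
        x = np.random.uniform(0, 2 * np.pi, size=(self.batch_size, 1))
        return x, np.sin(x)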

It looks like you keep adding each batch's loss onto the total from previous iterations, so the printed value grows regardless of whether the cost per batch is actually decreasing:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    epoch_loss = 0
    for _ in range(int(data.length)):
        epoch_x, epoch_y = data.next_batch()
        c = sess.run(cost, feed_dict={self.data_in_placeholder: epoch_x, self.data_out_placeholder: epoch_y})
        _ = sess.run(optimiser, feed_dict={self.data_in_placeholder: epoch_x, self.data_out_placeholder: epoch_y})
        epoch_loss += np.sum(c)
        print("loss =", epoch_loss)

Related

"IndexError: tensors used as indices must be long, byte or bool tensors" Pytorch

The dataset is a custom torch_geometric dataset. The traceback ends with:
inv_mask = ~mask
--> 224 loop_attr[edge_index[0][inv_mask]] = edge_attr[inv_mask]
225
226 edge_attr = torch.cat([edge_attr[mask], loop_attr], dim=0)
IndexError: tensors used as indices must be long, byte or bool tensors
Code:
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(13213)
        self.conv1 = GCNConv(dataset.num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x

model = GCN(hidden_channels=16)
dataset.train_mask = torch.tensor([range(0, 14000)]).type(torch.bool)
dataset.test_mask = torch.tensor([range(14000, 22470)]).type(torch.bool)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
dataset.y = dataset.y.float()
dataset.x = dataset.x.float()
dataset.edge_index = dataset.edge_index.float()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(dataset.x, dataset.edge_index)
    loss = criterion(out[dataset.train_mask], dataset.y[dataset.train_mask])
    loss.backward()
    optimizer.step()
    return loss

def test():
    model.eval()
    out = model(dataset.x, dataset.edge_index)
    # pred = out.argmax(dim=1)
    test_correct = out[dataset.test_mask] == dataset.y[dataset.test_mask]
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())
    return test_acc

for e in range(1, 101):
    loss = train()
    print(f'Epoch: {epoch:02d}, Loss: {loss:.3f}')
The error points to optimizer.zero_grad()
Could anyone please explain how to debug code in PyTorch? I have used TensorFlow for almost every deep learning task I have done, but when it came to GNNs I felt torch_geometric would be a viable option.
Please help me get past this error, and also suggest ways for me to improve the code.

Loss Not Decreasing for a Bert from Scratch PyTorch Model

I followed Aladdin Persson's YouTube video to code up just the encoder portion of the transformer model in PyTorch, except that I used PyTorch's built-in multi-head attention layer. The model seems to produce data of the correct shape. However, during training, the training loss does not drop and the resulting model always predicts the same output of 0.4761. The training data comes from the Sarcasm Detection Dataset on Kaggle. Would appreciate any help you can give on errors that I have made.
import pandas as pd
from transformers import BertTokenizer
import torch.nn as nn
import torch
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
import math

df = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_input = tokenizer(df['headline'].tolist(), return_tensors='pt', padding=True)
X = encoded_input['input_ids']
y = torch.tensor(df['is_sarcastic'].values).float()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
torch.cuda.empty_cache()

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout, expansion_ratio):
        super(TransformerBlock, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, expansion_ratio*embed_dim),
            nn.ReLU(),
            nn.Linear(expansion_ratio*embed_dim, embed_dim)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query):
        attention, _ = self.attention(value, key, query)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

class Encoder(nn.Module):
    # the vocab size is one more than the max value in the X matrix.
    def __init__(self, vocab_size=30109, embed_dim=128, num_layers=1, num_heads=4, device="cpu", expansion_ratio=4, dropout=0.1, max_length=193):
        super(Encoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_length, embed_dim)
        self.layers = nn.ModuleList(
            [
                TransformerBlock(embed_dim, num_heads, dropout, expansion_ratio) for _ in range(num_layers)
            ]
        )
        self.dropout = nn.Dropout(dropout)
        self.classifier1 = nn.Linear(embed_dim, embed_dim)
        self.classifier2 = nn.Linear(embed_dim, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        out = self.dropout(self.word_embedding(x) + self.position_embedding(positions))
        for layer in self.layers:
            #print(out.shape)
            out = layer(out, out, out)
        # Get the first output for classification
        # Pooled output from hugging face is: Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function.
        # Pooled output from hugging face will be different from out[:,0,:], which is the output from the CLS token.
        out = self.relu(self.classifier1(out[:, 0, :]))
        out = self.classifier2(out)
        return out

torch.cuda.empty_cache()
net = Encoder(device=device)
net.to(device)

batch_size = 32
num_train_samples = X_train.shape[0]
num_val_samples = X_test.shape[0]
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-5)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5)
val_loss_hist = []
loss_hist = []
epoch = 0
min_val_loss = math.inf
print("Training Started")
patience = 0

for _ in range(100):
    epoch += 1
    net.train()
    epoch_loss = 0
    permutation = torch.randperm(X_train.size()[0])
    for i in range(0, X_train.size()[0], batch_size):
        indices = permutation[i:i+batch_size]
        features = X_train[indices].to(device)
        labels = y_train[indices].reshape(-1, 1).to(device)
        output = net.forward(features)
        loss = criterion(output, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    epoch_loss = epoch_loss / num_train_samples * num_val_samples
    loss_hist.append(epoch_loss)

    #print("Eval")
    net.eval()
    epoch_val_loss = 0
    permutation = torch.randperm(X_test.size()[0])
    for i in range(0, X_test.size()[0], batch_size):
        indices = permutation[i:i+batch_size]
        features = X_test[indices].to(device)
        labels = y_test[indices].reshape(-1, 1).to(device)
        output = net.forward(features)
        loss = criterion(output, labels)
        epoch_val_loss += loss.item()
    val_loss_hist.append(epoch_val_loss)
    scheduler.step(epoch_val_loss)

    #if epoch % 5 == 0:
    print("Epoch: " + str(epoch) + " Train Loss: " + format(epoch_loss, ".4f") + ". Val Loss: " + format(epoch_val_loss, ".4f") + " LR: " + str(optimizer.param_groups[0]['lr']))

    if epoch_val_loss < min_val_loss:
        min_val_loss = epoch_val_loss
        torch.save(net.state_dict(), "torchmodel/weights_best.pth")
        print('\033[93m' + "Model Saved" + '\033[0m')
        patience = 0
    else:
        patience += 1
    if (patience == 10):
        break

print("Training Ended")

ValueError: Cannot feed value of shape (784,) for Tensor 'x:0', which has shape '(?, 784)'

This is my first experience with Tensorflow. There appear to be many questions about this ValueError, but none of them has brought me relief. I am using the notMNIST dataset, which is split 70/30 train/test.
The error message appears to suggest there is a problem with my mini-batch. I have printed the shapes of the placeholders and reshaped the input and label data, to no success.
import tensorflow as tf

tf.reset_default_graph()

num_inputs = 28*28  # Size of images in pixels
num_hidden1 = 500
num_hidden2 = 500
num_outputs = len(np.unique(y))  # Number of classes (labels)
learning_rate = 0.0011

inputs = tf.placeholder(tf.float32, shape=[None, num_inputs], name="x")
labels = tf.placeholder(tf.int32, shape=[None], name="y")
print(np.expand_dims(inputs, axis=0))
print(np.expand_dims(labels, axis=0))

def neuron_layer(x, num_neurons, name, activation=None):
    with tf.name_scope(name):
        num_inputs = int(x.get_shape()[1])
        stddev = 2 / np.sqrt(num_inputs)
        init = tf.truncated_normal([num_inputs, num_neurons], stddev=stddev)
        W = tf.Variable(init, name="weights")
        b = tf.Variable(tf.zeros([num_neurons]), name="biases")
        z = tf.matmul(x, W) + b
        if activation == "sigmoid":
            return tf.sigmoid(z)
        elif activation == "relu":
            return tf.nn.relu(z)
        else:
            return z

with tf.name_scope("dnn"):
    hidden1 = neuron_layer(inputs, num_hidden1, "hidden1", activation="relu")
    hidden2 = neuron_layer(hidden1, num_hidden2, "hidden2", activation="relu")
    logits = neuron_layer(hidden2, num_outputs, "output")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("evaluation"):
    correct = tf.nn.in_top_k(logits, labels, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    grads = optimizer.compute_gradients(loss)
    training_op = optimizer.apply_gradients(grads)

for var in tf.trainable_variables():
    tf.summary.histogram(var.op.name + "/values", var)
for grad, var in grads:
    if grad is not None:
        tf.summary.histogram(var.op.name + "/gradients", grad)

# summary
accuracy_summary = tf.summary.scalar('accuracy', accuracy)
# merge all summary
tf.summary.histogram('hidden1/activations', hidden1)
tf.summary.histogram('hidden2/activations', hidden2)
merged = tf.summary.merge_all()

init = tf.global_variables_initializer()
saver = tf.train.Saver()

from datetime import datetime
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs/example03/dnn_final"
logdir = "{}/run-{}/".format(root_logdir, now)
train_writer = tf.summary.FileWriter("models/dnn0/train", tf.get_default_graph())
test_writer = tf.summary.FileWriter("models/dnn0/test", tf.get_default_graph())

num_epochs = 50
batch_size = 128

with tf.Session() as sess:
    init.run()
    print("Epoch\tTrain accuracy\tTest accuracy")
    for epoch in range(num_epochs):
        for idx_start in range(0, x_train.shape[0], batch_size):
            idx_end = num_epochs
            x_batch, y_batch = x_train[batch_size], y_train[batch_size]
            sess.run(training_op, feed_dict={inputs: x_batch, labels: y_batch})
        summary_train, acc_train = sess.run([merged, accuracy],
                                            feed_dict={x: x_batch, y: y_batch})
        summary_test, acc_test = sess.run([accuracy_summary, accuracy],
                                          feed_dict={x: x_test, y: y_test})
        train_writer.add_summary(summary_train, epoch)
        test_writer.add_summary(summary_test, epoch)
        print("{}\t{}\t{}".format(epoch, acc_train, acc_test))
    save_path = saver.save(sess, "models/dnn0.ckpt")
The following error
ValueError: Cannot feed value of shape (784,) for Tensor 'x:0', which has shape '(?, 784)'
occurs on line 96:
sess.run(training_op, feed_dict={inputs: x_batch, labels: y_batch})
Your tensors do have mixed-up shapes. You are feeding a tensor whose batch index is at the end into a placeholder whose batch index is at the front.
Do x_batch = numpy.swapaxes(x_batch, 1, 0) before feeding the tensor.
On this line you're referring to inputs and labels:
sess.run(training_op, feed_dict={inputs: x_batch, labels: y_batch})
whereas on the lines below
summary_train, acc_train = sess.run([merged, accuracy],
                                    feed_dict={x: x_batch, y: y_batch})
summary_test, acc_test = sess.run([accuracy_summary, accuracy],
                                  feed_dict={x: x_test, y: y_test})
you're referring to x and y. Change these to be the same, i.e. use the same placeholder variables (inputs and labels) everywhere.
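With that change, the two evaluation calls would look like this:

summary_train, acc_train = sess.run([merged, accuracy],
                                    feed_dict={inputs: x_batch, labels: y_batch})
summary_test, acc_test = sess.run([accuracy_summary, accuracy],
                                  feed_dict={inputs: x_test, labels: y_test})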

ValueError in TensorFlow

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("/temp/data", one_hot=True)

n_nodes_hl1 = 500
n_nodes_hl2 = 500
n_nodes_hl3 = 500
n_classes = 10
batch_size = 100

# matrix = height * width
x = tf.placeholder('float', [None, 784])
y = tf.placeholder('float')

# defining the neural network
def neural_network_model(data):
    hiddenLayer1 = {'weights': tf.Variable(tf.random_normal([784, n_nodes_hl1])),
                    'biases': tf.Variable(tf.random_normal([n_nodes_hl1]))}
    hiddenLayer2 = {'weights': tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
                    'biases': tf.Variable(tf.random_normal([n_nodes_hl2]))}
    hiddenLayer3 = {'weights': tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
                    'biases': tf.Variable(tf.random_normal([n_nodes_hl3]))}
    outputLayer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])),
                   'biases': tf.Variable(tf.random_normal([n_classes]))}
    l1 = tf.add(tf.matmul(data, hiddenLayer1['weights']), hiddenLayer1['biases'])
    l1 = tf.nn.relu(l1)
    l2 = tf.add(tf.matmul(l1, hiddenLayer2['weights']), hiddenLayer2['biases'])
    l2 = tf.nn.relu(l2)
    l3 = tf.add(tf.matmul(l2, hiddenLayer3['weights']), hiddenLayer3['biases'])
    l3 = tf.nn.relu(l3)
    output = tf.matmul(l3, outputLayer['weights']), outputLayer['biases']
    return output

# training the network
def train_neural_network(x):
    prediction = neural_network_model(x)
    cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(prediction, tf.squeeze(y)))
    #cost = tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y)
    #cost = tf.reduce_mean(cost) * 100
    optimizer = tf.train.AdamOptimizer(0.003).minimize(cost)
    # cycles feed forward + backprop
    numberOfEpochs = 10
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # dealing with training data
        for epoch in range(numberOfEpochs):
            epoch_loss = 0
            for _ in range(int(mnist.train.num_examples / batch_size)):
                epoch_x, epoch_y = mnist.train.next_batch(batch_size)
                _, c = sess.run([optimizer, cost], feed_dict={x: epoch_x, y: epoch_y})
                epoch_loss += c
            print('Epoch', epoch, ' completed out of ', numberOfEpochs, ' loss: ', epoch_loss)
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
        print('Accuracy: ', accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))

train_neural_network(x)
I am new to TensorFlow and I am trying to train my model on this dataset. But every time I run the code, I get this error:
Traceback (most recent call last):
File "firstAI.py", line 87, in
train_neural_network(x)
File "firstAI.py", line 62, in train_neural_network
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(prediction,tf.squeeze(y)));
File "/home/phillipus/.local/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 1935, in sparse_softmax_cross_entropy_with_logits
labels, logits)
File "/home/phillipus/.local/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 1713, in _ensure_xent_args
"named arguments (labels=..., logits=..., ...)" % name)
ValueError: Only call sparse_softmax_cross_entropy_with_logits with named arguments (labels=..., logits=..., ...)
It looks like the problem is at cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(prediction, tf.squeeze(y))) and the call to train_neural_network(x). I am new to TensorFlow, so my troubleshooting isn't at its best; can anyone help me?
Maybe you could try using tf.nn.softmax_cross_entropy_with_logits rather than tf.nn.sparse_softmax_cross_entropy_with_logits inside the cost calculation.
However, if you want to keep using tf.nn.sparse_softmax_cross_entropy_with_logits, then this link might help: Tensorflow ValueError: Only call `sparse_softmax_cross_entropy_with_logits` with named arguments.
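For reference, the named-argument form would look roughly like this (just a sketch: since the labels here are one-hot encoded, the sparse variant also needs integer class indices, e.g. via tf.argmax):

cost = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.argmax(y, 1),
                                                   logits=prediction))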
By the way, what are the versions of TensorFlow and Python you're using?
Try running this:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("/temp/data", one_hot=True)

n_nodes_hl1 = 500
n_nodes_hl2 = 500
n_nodes_hl3 = 500
n_classes = 10
batch_size = 100

# matrix = height * width
x = tf.placeholder('float', [None, 784])
y = tf.placeholder('float')

# defining the neural network
def neural_network_model(data):
    hiddenLayer1 = {'weights': tf.Variable(tf.random_normal([784, n_nodes_hl1])),
                    'biases': tf.Variable(tf.random_normal([n_nodes_hl1]))}
    hiddenLayer2 = {'weights': tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
                    'biases': tf.Variable(tf.random_normal([n_nodes_hl2]))}
    hiddenLayer3 = {'weights': tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
                    'biases': tf.Variable(tf.random_normal([n_nodes_hl3]))}
    outputLayer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])),
                   'biases': tf.Variable(tf.random_normal([n_classes]))}
    l1 = tf.add(tf.matmul(data, hiddenLayer1['weights']), hiddenLayer1['biases'])
    l1 = tf.nn.relu(l1)
    l2 = tf.add(tf.matmul(l1, hiddenLayer2['weights']), hiddenLayer2['biases'])
    l2 = tf.nn.relu(l2)
    l3 = tf.add(tf.matmul(l2, hiddenLayer3['weights']), hiddenLayer3['biases'])
    l3 = tf.nn.relu(l3)
    output = tf.add(tf.matmul(l3, outputLayer['weights']), outputLayer['biases'])
    return output

prediction = neural_network_model(x)
cost = tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y)
optimizer = tf.train.AdamOptimizer(0.003).minimize(cost)

# cycles feed forward + backprop
numberOfEpochs = 10
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # dealing with training data
    for epoch in range(numberOfEpochs):
        epoch_loss = 0
        for _ in range(int(mnist.train.num_examples / batch_size)):
            epoch_x, epoch_y = mnist.train.next_batch(batch_size)
            _, c = sess.run([optimizer, cost], feed_dict={x: epoch_x, y: epoch_y})
            epoch_loss += c
        print('Epoch', epoch, ' completed out of ', numberOfEpochs, ' loss: ', epoch_loss)
    correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    print('Accuracy: ', accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))
Try this code:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("/temp/data/", one_hot=True)

n_nodes_hl1 = 500
n_nodes_hl2 = 500
n_nodes_hl3 = 500
n_classes = 10
batch_size = 100

# height x width
x = tf.placeholder('float', [None, 784])
y = tf.placeholder('float')

def neural_network_model(data):
    hidden_1_layer = {'weights': tf.Variable(tf.random_normal([784, n_nodes_hl1])),
                      'biases': tf.Variable(tf.random_normal([n_nodes_hl1]))}
    hidden_2_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
                      'biases': tf.Variable(tf.random_normal([n_nodes_hl2]))}
    hidden_3_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
                      'biases': tf.Variable(tf.random_normal([n_nodes_hl3]))}
    output_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])),
                    'biases': tf.Variable(tf.random_normal([n_classes]))}
    l1 = tf.add(tf.matmul(data, hidden_1_layer['weights']), hidden_1_layer['biases'])
    l1 = tf.nn.relu(l1)
    l2 = tf.add(tf.matmul(l1, hidden_2_layer['weights']), hidden_2_layer['biases'])
    l2 = tf.nn.relu(l2)
    l3 = tf.add(tf.matmul(l2, hidden_3_layer['weights']), hidden_3_layer['biases'])
    l3 = tf.nn.relu(l3)
    output = tf.matmul(l3, output_layer['weights']) + output_layer['biases']
    return output

def train_neural_network(x):
    prediction = neural_network_model(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=prediction, labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    hm_epochs = 10
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        for epoch in range(hm_epochs):
            epoch_loss = 0
            for _ in range(int(mnist.train.num_examples/batch_size)):
                epoch_x, epoch_y = mnist.train.next_batch(batch_size)
                _, epoch_c = sess.run([optimizer, cost], feed_dict={x: epoch_x, y: epoch_y})
                epoch_loss += epoch_c
            print('Epoch', epoch, 'completed out of ', hm_epochs, 'loss: ', epoch_loss)
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy:', accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))

train_neural_network(x)

PyTorch network produces constant output

I am trying to train a simple MLP to approximate y=f(a,b,c).
My code is as below.
import torch
import torch.nn as nn
from torch.autograd import Variable

# hyper parameters
input_size = 3
output_size = 1
num_epochs = 50
learning_rate = 0.001

# Network definition
class FeedForwardNet(nn.Module):
    def __init__(self, l1_size, l2_size):
        super(FeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(input_size, l1_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(l1_size, l2_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(l2_size, output_size)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        return out

model = FeedForwardNet(5, 3)
# sgd optimizer
optimizer = torch.optim.SGD(model.parameters(), learning_rate, momentum=0.9)

for epoch in range(11):
    print('Epoch ', epoch)
    for i in range(trainX_light.shape[0]):
        X = Variable(torch.from_numpy(trainX_light[i]).view(-1, 3))
        Y = Variable(torch.from_numpy(trainY_light[i]).view(-1, 1))
        # forward
        optimizer.zero_grad()
        output = model(X)
        loss = (Y - output).pow(2).sum()
        print(output.data[0, 0])
        loss.backward()
        optimizer.step()
        totalnorm = 0
        for p in model.parameters():
            modulenorm = p.grad.data.norm()
            totalnorm += modulenorm ** 2
        totalnorm = math.sqrt(totalnorm)
        print(totalnorm)

    # validation code
    if (epoch + 1) % 5 == 0:
        print(' test points', testX_light.shape[0])
        total_loss = 0
        for t in range(testX_light.shape[0]):
            X = Variable(torch.from_numpy(testX_light[t]).view(-1, 3))
            Y = Variable(torch.from_numpy(testY_light[t]).view(-1, 1))
            output = model(X)
            loss = (Y - output).pow(2).sum()
            print(output.data[0, 0])
            total_loss += loss
        print('epoch ', epoch, 'avg_loss ', total_loss.data[0] / testX_light.shape[0])

print('Done')
The problem I have now is that the validation code
output = model(X)
always produces exactly the same output value (I guess this value is some sort of garbage). I am not sure what mistake I am making in this part. Could someone help me figure out the mistake in my code?
The reason the network produced random values (and inf later) was the exploding gradient problem. Clipping the gradient (torch.nn.utils.clip_grad_norm(model.parameters(), 0.1)) helped.
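For anyone hitting the same thing, a minimal sketch of where that call sits in the training loop above (using the current PyTorch name clip_grad_norm_, which replaced the deprecated clip_grad_norm):

optimizer.zero_grad()
output = model(X)
loss = (Y - output).pow(2).sum()
loss.backward()
# rescale gradients so their combined norm is at most 0.1 before the update step
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
optimizer.step()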
