How to use SimpleRNN to model a modulus function & predict many to many sequence - keras

I have designed this toy problem to understand the working of SimpleRNN in Keras.
My input sequence is:
[x1,x2,x3,x4,x5]
and the corresponding output is:
[(0+x1)%2,(x1+x2)%2,(x2+x3)%2,(x3+x4)%2,(x4+x5)%2)]
My code is:
import numpy as np
import random
from scipy.ndimage.interpolation import shift
def generate_sequence():
max_len = 5
x = np.random.randint(1,100,max_len)
shifted_x = shift(x, 1, cval=0)
y = (x + shifted_x) % 2
return x.reshape(max_len,1),y.reshape(max_len,1),shifted_x.reshape(max_len,1)
X_train = np.zeros((100,5,1))
y_train = np.zeros((100,5,1))
for i in range(100):
x,y,z = generate_sequence()
X_train[i] = x
y_train[i] = y
X_test = np.zeros((100,5,1))
y_test = np.zeros((100,5,1))
for i in range(100):
x,y,z = generate_sequence()
X_test[i] = x
y_test[i] = y
from keras.layers import SimpleRNN
model = Sequential()
model.add(SimpleRNN(3,input_shape=(5,1),return_sequences=True,name='rnn'))
model.add(Dense(1,activation='sigmoid'))
# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
optimizer='sgd',
metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
batch_size=70,
epochs=200,verbose=0,validation_split=0.3)
score, acc = model.evaluate(X_test, y_test,
batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
When I train this SimpleRNN I only get an accuracy of 50%, each item in the sequence only depends on the previous item. Why is the RNN struggling to learn this?
100/100 [==============================] - 0s 37us/step
Test score: 0.6975522041320801
Test accuracy: 0.5120000243186951
UPDATE:
It turns out mod function is very hard to model, I switched to simple data generation strategy like y[t] = x[t] < x[t-1], then I could see the model performing with 80% binary accuracy.
def generate_rnn_sequence():
max_len = 5
x = np.random.randint(1,100,max_len)
shifted_x = shift(x, 1, cval=0)
y = (x < shifted_x).astype(float)
return x.reshape(5,1),y.reshape(5,1)
So How do i model a mod function using a RNN?

Related

Calculate gradient of validation error w.r.t inputs using Keras/Tensorflow or autograd

I need to calculate the gradient of the validation error w.r.t inputs x. I'm trying to see how much the validation error changes when I perturb one of the training samples.
The validation error (E) explicitly depends on the model weights (W).
The model weights explicitly depend on the inputs (x and y).
Therefore, the validation error implicitly depends on the inputs.
I'm trying to calculate the gradient of E w.r.t x directly.
An alternative approach would be to calculate the gradient of E w.r.t W (can easily be calculated) and the gradient of W w.r.t x (cannot do at the moment), which would allow the gradient of E w.r.t x to be calculated.
I have attached a toy example. Thanks in advance!
import numpy as np
import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from autograd import grad
train_images = mnist.train_images()
train_labels = mnist.train_labels()
test_images = mnist.test_images()
test_labels = mnist.test_labels()
# Normalize the images.
train_images = (train_images / 255) - 0.5
test_images = (test_images / 255) - 0.5
# Flatten the images.
train_images = train_images.reshape((-1, 784))
test_images = test_images.reshape((-1, 784))
# Build the model.
model = Sequential([
Dense(64, activation='relu', input_shape=(784,)),
Dense(64, activation='relu'),
Dense(10, activation='softmax'),
])
# Compile the model.
model.compile(
optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'],
)
# Train the model.
model.fit(
train_images,
to_categorical(train_labels),
epochs=5,
batch_size=32,
)
model.save_weights('model.h5')
# Load the model's saved weights.
# model.load_weights('model.h5')
calculate_mse = tf.keras.losses.MeanSquaredError()
test_x = test_images[:5]
test_y = to_categorical(test_labels)[:5]
train_x = train_images[:1]
train_y = to_categorical(train_labels)[:1]
train_y = tf.convert_to_tensor(train_y, np.float32)
train_x = tf.convert_to_tensor(train_x, np.float64)
with tf.GradientTape() as tape:
tape.watch(train_x)
model.fit(train_x, train_y, epochs=1, verbose=0)
valid_y_hat = model(test_x, training=False)
mse = calculate_mse(test_y, valid_y_hat)
de_dx = tape.gradient(mse, train_x)
print(de_dx)
# approach 2 - does not run
def calculate_validation_mse(x):
model.fit(x, train_y, epochs=1, verbose=0)
valid_y_hat = model(test_x, training=False)
mse = calculate_mse(test_y, valid_y_hat)
return mse
train_x = train_images[:1]
train_y = to_categorical(train_labels)[:1]
validation_gradient = grad(calculate_validation_mse)
de_dx = validation_gradient(train_x)
print(de_dx)
Here's how you can do this. Derivation is as below.
Few things to note,
I have reduced the feature size from 784 to 256 as I was running out of memory in colab (line marked in the code) . Might have to do some mem profiling to find out why
Only computed grads for the first layer. Easily extendable to other layers
Disclaimer: this derivation is correct to best of my knowledge. Please do some research and verify that it is the case. You will run into memory issues for larger inputs and layer sizes.
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
f = 256
model = Sequential([
Dense(64, activation='relu', input_shape=(f,)),
Dense(64, activation='relu'),
Dense(10, activation='softmax'),
])
# Compile the model.
model.compile(
optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'],
)
w = model.weights[0]
# Inputs and labels
x_tr = tf.Variable(np.random.normal(size=(1,f)), shape=(1, f), dtype='float32')
y_tr = np.random.choice([0,1,2,3,4,5,6,7,8,9], size=(1,1))
y_tr_onehot = tf.keras.utils.to_categorical(y_tr, num_classes=10).astype('float32')
x_v = tf.Variable(np.random.normal(size=(1,f)), shape=(1, f), dtype='float32')
y_v = np.random.choice([0,1,2,3,4,5,6,7,8,9], size=(1,1))
y_v_onehot = tf.keras.utils.to_categorical(y_v, num_classes=10).astype('float32')
# In the context of GradientTape
with tf.GradientTape() as tape1:
with tf.GradientTape() as tape2:
y_tr_pred = model(x_tr)
tr_loss = tf.keras.losses.MeanSquaredError()(y_tr_onehot, y_tr_pred)
tmp_g = tape2.gradient(tr_loss, w)
print(tmp_g.shape)
# d(dE_tr/d(theta))/dx
# Warning this step consumes lot of memory for large layers
lr = 0.001
grads_1 = -lr * tape1.jacobian(tmp_g, x_tr)
with tf.GradientTape() as tape3:
y_v_pred = model(x_v)
v_loss = tf.keras.losses.MeanSquaredError()(y_v_onehot, y_v_pred)
# dE_val/d(theta)
grads_2 = tape3.gradient(v_loss, w)[tf.newaxis, :]
# Just crunching the dimension to get the final desired shape of (1,256)
grad = tf.matmul(tf.reshape(grads_2,[1, -1]), tf.reshape(tf.transpose(grads_1,[2,1,0,3]),[1, -1, 256]))

with tensorflow and keras how to find the "category" of a given string

Hello ML/AI newbie here,
I'm asking this question because I've no idea about machine learning, ai, e.t.c and I've no idea how to continue, what questions to ask. Even if I accidentally find the solution i wouldn't know.
Ok, I followed this tutorial about "Text Classification" and it went pretty well, no problems up to here.
https://www.youtube.com/watch?v=6g4O5UOH304&list=WL&index=8&t=0s
It classifies IMDB comments and checks if a review is "Positive" or "Negative", "0" or "1"
My question is
Let say I've my own dataset, similar to IMDB but instead of "0" and "1" I have several categories as numbers like "1,2,3,4,5,6,7,8,9,10,11,12,...." for each string. So I need it to return one of these numbers (since it's learning let say two of them if it can't decide)
What should I do?
A link to a tutorial related to what I need would be great too.
import tensorflow as tf
from tensorflow import keras
import numpy as np
data = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = data.load_data(num_words=3000)
word_index = data.get_word_index()
word_index = {k:(v+3) for k, v in word_index.items()}
word_index["<PAD>"] = 0;
word_index["<START>"] = 1;
word_index["<UNK>"] = 2;
word_index["<UNUSED>"] = 3;
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index["<PAD>"], padding="post", maxlen=250)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index["<PAD>"], padding="post", maxlen=250)
def decode_review(text):
return " ".join([reverse_word_index.get(i, "?") for i in text])
model = keras.Sequential()
model.add(keras.layers.Embedding(10000, 6))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation="relu"))
model.add(keras.layers.Dense(1, activation="sigmoid"))
#model.summary()
model.compile(optimizer="adam", loss="binary_crossentropy", metrics="accuracy")
x_val = train_data[:10000]
x_train = train_data[10000:]
y_val = train_labels[:10000]
y_train = train_labels[10000:]
fitModel = model.fit(x_train, y_train, epochs=40, batch_size=512, validation_data=(x_val, y_val), verbose=1)
results = model.evaluate(test_data, test_labels)
print(results)
for index in range(20):
test_review = test_data[index]
predict = model.predict([test_review])
if predict[0] > 0.8:
print(decode_review(test_data[index]))
print(str(predict[0]))
print(str(test_labels[index]))
your task is a multiclass classification problem and for this reason, you have to modify your output layer. you have two possibilities.
if you have 1D integer encoded target you can use sparse_categorical_crossentropy as loss function, softmax as the last activation and the dimension of the last dense output equal to the number of class to predict
X = np.random.randint(0,10, (1000,100))
y = np.random.randint(0,3, 1000)
model = Sequential([
Dense(128, input_dim = 100),
Dense(3, activation='softmax'),
])
model.summary()
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
history = model.fit(X, y, epochs=3)
Otherwise, if you have one-hot encoded your target you can use categorical_crossentropy, softmax as the last activation and the dimension of the last dense output equal to the number of class to predict
X = np.random.randint(0,10, (1000,100))
y = pd.get_dummies(np.random.randint(0,3, 1000)).values
model = Sequential([
Dense(128, input_dim = 100),
Dense(3, activation='softmax'),
])
model.summary()
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
history = model.fit(X, y, epochs=3)
the usage of softmax enables to interpret the output as probability scores which sum to 1
when you compute the final prediction, to obtain the predicted class you can simply to in this way np.argmax(model.predict(X), axis=1)
these are some basic tutorials for multiclass text classification:
https://towardsdatascience.com/multi-class-text-classification-with-lstm-using-tensorflow-2-0-d88627c10a35
https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17

Implementing and tuning a simple CNN for 3D data using Keras Conv3D

I'm trying to implement a 3D CNN using Keras. However, I am having some difficulties understanding some details in the results obtained and further enhancing the accuracy.
The data that I am trying to analyzing have the shape {64(d1)x64(d2)x38(d3)}, where d1 and d2 are the length and width of the image (64x64 pixels) and d3 is the time dimension. In other words, I have 38 images. The channel parameter is set to 1 as my data are actually raw data and not really colorful images.
My data consist of 219 samples, hence 219x64x64x38. They are divided into training and validation sets with 20% for validation. In addition, I have a fixed 147 additional data for testing.
Below is my code that works fine. It creates a txt file that saves the results for the different combination of parameters in my network (grid search). Here in this code, I only consider tuning 2 parameters: the number of filters and lambda for L2 regularizer. I fixed the dropout and the kernel size for the filters. However, later I considered their variations.
I also tried to set the seed value so that I have some sort of reproducibility (I don't think that I have achieved this task).
My question is that:
Given the below architecture and code, I always reach for all the given combinations of parameters a convergence for the training accuracy towards 1 (which is good). However, for the validation accuracy it is most of the time around 80% +/- 4% (rarely below 70%) despite the hyper-parameters combination. Similar behavior for the test accuracy. How can I enhance this accuracy to above 90% ?
As far as I know, having a gap between the train and validation/test accuracy is a result from overfitting. However, in my model I am adding dropouts and L2 regularizers and also changing the size of my network which should somehow reduce this gap (but it is not).
Is there anything else I can do besides modifying my input data? Does adding more layers help? Or is there maybe a pre-trained 3D CNN like in the case of 2D CNN (e.g., AlexNet)? Should I try ConvLSTM? Is this the limit of this architecture?
Thank you :)
import numpy as np
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Conv3D, MaxPooling3D, Dense, Flatten, Activation
from keras.utils import to_categorical
from keras.regularizers import l2
from keras.layers import Dropout
from keras.utils import multi_gpu_model
import scipy.io as sio
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from keras.callbacks import ReduceLROnPlateau
tf.set_random_seed(1234)
def normalize_minmax(X_train):
"""
Normalize to [0,1]
"""
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
X_minmax_train = min_max_scaler.fit_transform(X_train)
return X_minmax_train
# generate and prepare the dataset
def get_data():
# Load and prepare the data
X_data = sio.loadmat('./X_train')['X_train']
Y_data = sio.loadmat('./Y_train')['targets_train']
X_test = sio.loadmat('./X_test')['X_test']
Y_test = sio.loadmat('./Y_test')['targets_test']
return X_data, Y_data, X_test, Y_test
def get_model(X_train, Y_train, X_validation, Y_validation, F1_nb, F2_nb, F3_nb, kernel_size_1, kernel_size_2, kernel_size_3, l2_lambda, learning_rate, reduce_lr, dropout_conv1, dropout_conv2, dropout_conv3, dropout_dense, no_epochs):
no_classes = 5
sample_shape = (64, 64, 38, 1)
batch_size = 32
dropout_seed = 30
conv_seed = 20
# Create the model
model = Sequential()
model.add(Conv3D(F1_nb, kernel_size=kernel_size_1, kernel_regularizer=l2(l2_lambda), padding='same', kernel_initializer='glorot_uniform', input_shape=sample_shape))
model.add(Activation('selu'))
model.add(MaxPooling3D(pool_size=(2,2,2)))
model.add(Dropout(dropout_conv1, seed=conv_seed))
model.add(Conv3D(F2_nb, kernel_size=kernel_size_2, kernel_regularizer=l2(l2_lambda), padding='same', kernel_initializer='glorot_uniform'))
model.add(Activation('selu'))
model.add(MaxPooling3D(pool_size=(2,2,2)))
model.add(Dropout(dropout_conv2, seed=conv_seed))
model.add(Conv3D(F3_nb, kernel_size=kernel_size_3, kernel_regularizer=l2(l2_lambda), padding='same', kernel_initializer='glorot_uniform'))
model.add(Activation('selu'))
model.add(MaxPooling3D(pool_size=(2,2,2)))
model.add(Dropout(dropout_conv3, seed=conv_seed))
model.add(Flatten())
model.add(Dense(512, kernel_regularizer=l2(l2_lambda), kernel_initializer='glorot_uniform'))
model.add(Activation('selu'))
model.add(Dropout(dropout_dense, seed=dropout_seed))
model.add(Dense(no_classes, activation='softmax'))
model = multi_gpu_model(model, gpus = 2)
# Compile the model
model.compile(loss=keras.losses.categorical_crossentropy,
optimizer=keras.optimizers.Adam(lr=learning_rate),
metrics=['accuracy'])
# Train the model.
history = model.fit(X_train, Y_train, batch_size=batch_size, epochs=no_epochs, validation_data=(X_validation, Y_validation),callbacks=[reduce_lr])
return model, history
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)
learning_rate = 0.001
no_epochs = 100
X_data, Y_data, X_test, Y_test = get_data()
# Normalize the train/val data
for i in range(X_data.shape[0]):
for j in range(X_data.shape[3]):
X_data[i,:,:,j] = normalize_minmax(X_data[i,:,:,j])
X_data = np.expand_dims(X_data, axis=4)
# Normalize the test data
for i in range(X_test.shape[0]):
for j in range(X_test.shape[3]):
X_test[i,:,:,j] = normalize_minmax(X_test[i,:,:,j])
X_test = np.expand_dims(X_test, axis=4)
# Shuffle the training data
# fix random seed for reproducibility
seedValue = 40
permutation = np.random.RandomState(seed=seedValue).permutation(len(X_data))
X_data = X_data[permutation]
Y_data = Y_data[permutation]
Y_data = np.squeeze(Y_data)
Y_test = np.squeeze(Y_test)
#Split between train and validation (20%). Here I did not use the classical validation_split=0.2 just to make sure that the data is the same for the different architectures I am using.
X_train = X_data[0:175,:,:,:,:]
Y_train = Y_data[0:175]
X_validation = X_data[176:,:,:,:]
Y_validation = Y_data[176:]
Y_train = to_categorical(Y_train,num_classes=5).astype(np.integer)
Y_validation = to_categorical(Y_validation,num_classes=5).astype(np.integer)
Y_test = to_categorical(Y_test,num_classes=5).astype(np.integer)
l2_lambda_list = [(1*pow(10,-4)),(2*pow(10,-4)),
(3*pow(10,-4)),
(4*pow(10,-4)),
(5*pow(10,-4)),(6*pow(10,-4)),
(7*pow(10,-4)),
(8*pow(10,-4)),(9*pow(10,-4)),(10*pow(10,-4))
]
filters_nb = [(128,64,64),(128,64,32),(128,64,16),(128,64,8),(128,32,32),(128,32,16),(128,32,8),(128,16,8),(128,8,8),
(64,64,32),(64,64,16),(64,64,8),(64,32,32),(64,32,16),(64,32,8),(64,16,16),(64,16,8),(64,8,8),
(32,32,16),(32,32,8),(32,16,16),(32,16,8),(32,8,8),
(16,16,16),(16,16,8),(16,8,8)
]
DropOut = [(0.25,0.25,0.25,0.5),
(0,0,0,0.1),(0,0,0,0.2),(0,0,0,0.3),(0,0,0,0.4),(0,0,0,0.5),
(0.1,0.1,0.1,0),(0.2,0.2,0.2,0),(0.3,0.3,0.3,0),(0.4,0.4,0.4,0),(0.5,0.5,0.5,0),
(0.1,0.1,0.1,0.1),(0.1,0.1,0.1,0.2),(0.1,0.1,0.1,0.3),(0.1,0.1,0.1,0.4),(0.1,0.1,0.1,0.5),
(0.15,0.15,0.15,0.1),(0.15,0.15,0.15,0.2),(0.15,0.15,0.15,0.3),(0.15,0.15,0.15,0.4),(0.15,0.15,0.15,0.5),
(0.2,0.2,0.2,0.1),(0.2,0.2,0.2,0.2),(0.2,0.2,0.2,0.3),(0.2,0.2,0.2,0.4),(0.2,0.2,0.2,0.5),
(0.25,0.25,0.25,0.1),(0.25,0.25,0.25,0.2),(0.25,0.25,0.25,0.3),(0.25,0.25,0.25,0.4),(0.25,0.25,0.25,0.5),
(0.3,0.3,0.3,0.1),(0.3,0.3,0.3,0.2),(0.3,0.3,0.3,0.3),(0.3,0.3,0.3,0.4),(0.3,0.3,0.3,0.5),
(0.35,0.35,0.35,0.1),(0.35,0.35,0.35,0.2),(0.35,0.35,0.35,0.3),(0.35,0.35,0.35,0.4),(0.35,0.35,0.35,0.5)
]
kernel_size = [(3,3,3),
(2,3,3),(2,3,4),(2,3,5),(2,3,6),(2,3,7),(2,3,8),(2,3,9),(2,3,10),(2,3,11),(2,3,12),(2,3,13),(2,3,14),(2,3,15),
(3,3,3),(3,3,4),(3,3,5),(3,3,6),(3,3,7),(3,3,8),(3,3,9),(3,3,10),(3,3,11),(3,3,12),(3,3,13),(3,3,14),(3,3,15),
(3,4,3),(3,4,4),(3,4,5),(3,4,6),(3,4,7),(3,4,8),(3,4,9),(3,4,10),(3,4,11),(3,4,12),(3,4,13),(3,4,14),(3,4,15),
]
for l in range(len(l2_lambda_list)):
l2_lambda = l2_lambda_list[l]
f = open("My Results.txt", "a")
lambda_Str = str(l2_lambda)
f.write("---------------------------------------\n")
f.write("lambda = "+f"{lambda_Str}\n")
f.write("---------------------------------------\n")
for i in range(len(filters_nb)):
F1_nb = filters_nb[i][0]
F2_nb = filters_nb[i][1]
F3_nb = filters_nb[i][2]
kernel_size_1 = kernel_size[0]
kernel_size_2 = kernel_size_1
kernel_size_3 = kernel_size_1
dropout_conv1 = DropOut[0][0]
dropout_conv2 = DropOut[0][1]
dropout_conv3 = DropOut[0][2]
dropout_dense = DropOut[0][3]
# fit model
model, history = get_model(X_train, Y_train, X_validation, Y_validation, F1_nb, F2_nb, F3_nb, kernel_size_1, kernel_size_2, kernel_size_3, l2_lambda, learning_rate, reduce_lr, dropout_conv1, dropout_conv2, dropout_conv3, dropout_dense, no_epochs)
# Evaluate metrics
predictions = model.predict(X_test)
out = np.argmax(predictions, axis=1)
Y_test = sio.loadmat('./Y_test')['targets_test']
Y_test = np.squeeze(Y_test)
loss = history.history['loss'][no_epochs-1]
acc = history.history['acc'][no_epochs-1]
val_loss = history.history['val_loss'][no_epochs-1]
val_acc = history.history['val_acc'][no_epochs-1]
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(Y_test, out)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(Y_test, out,average='macro')
a = str(filters_nb[i][0]) + ',' + str(filters_nb[i][1]) + ',' + str(filters_nb[i][2]) + ': ' + str('f1-metric: ') + str('%f' % f1) + str(' | loss: ') + str('%f' % loss) + str(' | acc: ') + str('%f' % acc) + str(' | val_loss: ') + str('%f' % val_loss) + str(' | val_acc: ') + str('%f' % val_acc) + str(' | test_acc: ') + str('%f' % accuracy)
f.write(f"{a}\n")
f.close()

Translating LSTM model from Keras to Pytorch

I am having a hard time translating a quite simple LSTM model from Keras to Pytorch. X (get it here) corresponds to 1152 samples of 90 timesteps, each timestep has only 1 dimension. y (here) is a single prediction at t = 91 for all 1152 samples.
In Keras:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
import numpy as np
import pandas as pd
X = pd.read_csv('X.csv', header = None).values
X.shape
y = pd.read_csv('y.csv', header = None).values
y.shape
# From Keras documentation [https://keras.io/layers/recurrent/]:
# Input shape 3D tensor with shape (batch_size, timesteps, input_dim).
X = np.reshape(X, (1152, 90, 1))
regressor = Sequential()
regressor.add(LSTM(units = 100, return_sequences = True, input_shape = (90, 1)))
regressor.add(Dropout(0.3))
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.3))
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.3))
regressor.add(LSTM(units = 50))
regressor.add(Dropout(0.3))
regressor.add(Dense(units = 1, activation = 'linear'))
regressor.compile(optimizer = 'rmsprop', loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
regressor.fit(X, y, epochs = 10, batch_size = 32)
... leads me to:
# Epoch 10/10
# 1152/1152 [==============================] - 33s 29ms/sample - loss: 0.0068 - mean_absolute_error: 0.0628
Then in Pytorch:
import torch
from torch import nn, optim
from sklearn.metrics import mean_absolute_error
X = pd.read_csv('X.csv', header = None).values
y = pd.read_csv('y.csv', header = None).values
X = torch.tensor(X, dtype = torch.float32)
y = torch.tensor(y, dtype = torch.float32)
dataset = torch.utils.data.TensorDataset(X, y)
loader = torch.utils.data.DataLoader(dataset, batch_size = 32, shuffle = True)
class regressor_LSTM(nn.Module):
def __init__(self):
super().__init__()
self.lstm1 = nn.LSTM(input_size = 1, hidden_size = 100)
self.lstm2 = nn.LSTM(100, 50)
self.lstm3 = nn.LSTM(50, 50, dropout = 0.3, num_layers = 2)
self.dropout = nn.Dropout(p = 0.3)
self.linear = nn.Linear(in_features = 50, out_features = 1)
def forward(self, X):
# From the Pytorch documentation [https://pytorch.org/docs/stable/_modules/torch/nn/modules/rnn.html]:
# **input** of shape `(seq_len, batch, input_size)`
X = X.view(90, 32, 1)
# I am discarding hidden/cell states since in Keras I am using a stateless approach
# [https://keras.io/examples/lstm_stateful/]
X, _ = self.lstm1(X)
X = self.dropout(X)
X, _ = self.lstm2(X)
X = self.dropout(X)
X, _ = self.lstm3(X)
X = self.dropout(X)
X = self.linear(X)
return X
regressor = regressor_LSTM()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(regressor.parameters())
for epoch in range(10):
running_loss = 0.
running_mae = 0.
for i, data in enumerate(loader):
inputs, labels = data
optimizer.zero_grad()
outputs = regressor(inputs)
outputs = outputs[-1].view(*labels.shape)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
mae = mean_absolute_error(labels.detach().cpu().numpy().flatten(), outputs.detach().cpu().numpy().flatten())
running_mae += mae
print('EPOCH %3d: loss %.5f - MAE %.5f' % (epoch+1, running_loss/len(loader), running_mae/len(loader)))
... leads me to:
# EPOCH 10: loss 0.04220 - MAE 0.16762
You can notice that both loss and MAE are quite different (Pytorch's are much higher). If I use Pytorch's model to predict the values, they all return as a constant.
What am I doing wrong?
Oh I believe I made considerable progress. It seems that the way to represent y is different between Keras and Pytorch. In Keras, we should pass it as a single value representing one timestep in the future (or, at least, for the problem I am trying to solve). But in Pytorch, y must be X shifted one timestep to the future. It is like this:
time_series = [0, 1, 2, 3, 4, 5]
X = [0, 1, 2, 3, 4]
# Keras:
y = [5]
# Pytorch:
y = [1, 2, 3, 4, 5]
This way, Pytorch compares all values in the time slice when calculating loss. I believe Keras rearranges the data under the hood to conform to this approach, as the code works when fed the variables just like that. But in Pytorch, I was estimating loss based only on one value (the one I was trying to predict), not the whole series, therefore I believe it could not correctly capture the time dependency.
When taking this in consideration, I got to:
EPOCH 100: loss 0.00551 - MAE 0.058435
And, most importantly, comparing true and predicted values in a separate dataset got me to
The patterns were clearly captured by the model.
Hooray!

Print input while training model on every iteration

I want to check what data is in input, or to check output of some layer. For this i do the following:
import tensorflow.keras.backend as K
import tensorflow as tf
import numpy as np
x = [[i, i * 3 + 1] for i in range(100)]
y = [2 * i + 1 for i in range(100)]
x = np.array(x)
y = np.array(y)
print_weights = tf.keras.callbacks.LambdaCallback(
on_batch_end=lambda batch, logs: print(K.get_value(model.layers[1].input)))
def sobaka():
a = tf.keras.Input(shape=(2,))
b = tf.keras.layers.Dense(1)
c = b(a)
model = tf.keras.models.Model(a, c)
optimizer = tf.keras.optimizers.Adam(lr=0.1)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])
return model
kek = tf.placeholder(tf.float32, shape=(2,))
model = sobaka()
model.fit(x, y, batch_size=1, epochs=2, callbacks=[print_weights])
So every batch (one training sample) it would print input tensor. But, i got an error:
You must feed a value for placeholder tensor 'input_1' with dtype
float and shape [?,2]
Please, help me understand how to fit placeholder in my code. And is there any possible solution to print information every iteration? (when batch is ,for example, 10?)
One option is to use a [custom callback][1]
like so:
class MyCallback(tf.keras.callbacks.Callback):
def __init__(self, patience=0):
super(MyCallback, self).__init__()
def on_epoch_begin(self, epoch, logs=None):
tf.print(self.model.get_weights())
model.fit(
x_train,
y_train,
epochs=epochs,
batch_size=batch_size,
callbacks=[MyCallback()],
validation_data=(x_test, y_test),
)

Resources