Keras model with one connection per input node

I would like to create a sequential model in Keras with one hidden layer that has as many nodes as there are input nodes. Each input node should be connected to exactly one of the hidden nodes, and all nodes in the hidden layer should be connected to a single output node, as in this image.
I would like to be able to specify the activation function of the hidden layer.
Is it possible to achieve this with a Sequential() model in Keras?

Here is a custom layer that lets you do everything you want:
import keras
import tensorflow as tf
from keras.layers import *
from keras import Sequential
import numpy as np

tf.set_random_seed(10)  # TF 1.x API (tf.random.set_seed in TF 2)

class MyDenseLayer(keras.layers.Layer):
    def __init__(self):
        super(MyDenseLayer, self).__init__()

    def parametric_relu(self, _x):
        # some more or less complicated activation
        # with its own weight
        pos = tf.nn.relu(_x)
        neg = self.alphas * (_x - abs(_x)) * 0.5
        return pos + neg

    def build(self, input_shape):
        # main weight: one scalar per input node
        self.kernel = self.add_weight("kernel",
                                      shape=[int(input_shape[-1]), ],
                                      initializer=tf.random_normal_initializer())
        # any additional weights here
        self.alphas = self.add_weight('alpha', shape=[int(input_shape[-1]), ],
                                      initializer=tf.constant_initializer(0.0),
                                      dtype=tf.float32)
        self.size = int(input_shape[-1])

    def call(self, input):
        # kernel * identity is a diagonal matrix, so each input node
        # feeds exactly one hidden node
        linear = tf.matmul(input, self.kernel * tf.eye(self.size))
        nonlinear = self.parametric_relu(linear)
        return nonlinear

model = Sequential()
model.add(MyDenseLayer())
model.build((None, 4))
model.summary()

x = np.ones((5, 4))
print(model.predict(x))
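For completeness: on TensorFlow 2 the same one-to-one connectivity can be written as a plain elementwise multiplication instead of a matmul against a diagonal matrix. A minimal sketch, assuming tf.keras (the layer name DiagonalDense is mine):

import tensorflow as tf

class DiagonalDense(tf.keras.layers.Layer):
    """One hidden node per input node; each hidden node sees exactly one input."""
    def __init__(self, activation=None, **kwargs):
        super().__init__(**kwargs)
        self.activation = tf.keras.activations.get(activation)

    def build(self, input_shape):
        # one scalar weight per input feature (the diagonal of a Dense kernel)
        self.kernel = self.add_weight("kernel", shape=(input_shape[-1],),
                                      initializer="random_normal", trainable=True)

    def call(self, inputs):
        return self.activation(inputs * self.kernel)  # elementwise, no mixing

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(4,)),
    DiagonalDense(activation="tanh"),  # hidden layer with a configurable activation
    tf.keras.layers.Dense(1),          # all hidden nodes -> one output node
])
model.summary()

The Dense(1) layer provides the all-to-one connection to the output node, and the activation argument covers the requirement of choosing the hidden layer's activation function.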

Related

Model not training when using batch normalization with the Keras functional API

I'm going through some tutorials using the Keras functional API in TensorFlow 2, and I'm having some trouble including BatchNormalization layers when using the functional API.
Using roughly the same code:
- This network trains with the sequential API and batch normalization
- This network trains with the functional API, but only with the batch normalization layers commented out
- This network does not train using the functional API and batch normalization layers
Am I missing a step somewhere? Do I set training=True or training=False somewhere in the code?
Working Sequential Code:
#subclassed layers in keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import SeparableConv2D
from tensorflow.keras.layers import BatchNormalization
import numpy as np
import logging
tf.get_logger().setLevel(logging.ERROR)
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import matplotlib.pyplot as plt
%matplotlib inline

cifar_dataset = keras.datasets.cifar10
(train_images, train_labels), (test_images, test_labels) = cifar_dataset.load_data()
EPOCHS = 128
BATCH_SIZE = 128

# standardize dataset
mean = np.mean(train_images)
stdev = np.std(train_images)
train_images = (train_images - mean)/stdev
test_images = (test_images - mean)/stdev

# change labels to one-hot
train_labels = to_categorical(train_labels, num_classes=10)
test_labels = to_categorical(test_labels, num_classes=10)

# Keras model subclassing: build your own layers
# CNN -> batch norm -> ReLU
# create a class for this kind of block
class CNNBlock(layers.Layer):  # inherits from layers.Layer, which keeps track of what we need for backpropagation
    def __init__(self, out_channels, kernel_size=3, strides=(1, 1)):  # needs both __init__ and call methods
        super(CNNBlock, self).__init__()
        # the conv portion of this block (note: strides must be passed on to Conv2D)
        self.conv = layers.Conv2D(out_channels, kernel_size, strides=strides, padding='same',
                                  kernel_initializer='he_normal', bias_initializer='zeros')
        self.bn = layers.BatchNormalization()  # batch normalization in this block

    def call(self, input_tensor, training=False):  # what happens when this block is encountered; the training bool switches training/evaluation behavior
        # call method (forward method in pytorch):
        # take the input tensor and run it through the layers initialized in __init__
        x = self.conv(input_tensor)  # run the convolution
        x = self.bn(x, training=training)  # batch norm
        x = tf.nn.relu(x)  # activation function for this layer
        return x

class CNNBlock_init(layers.Layer):
    def __init__(self, out_channels, input_size, kernel_size=3):
        super(CNNBlock_init, self).__init__()  # make sure the class name in super() matches
        self.input_size = input_size
        self.conv = layers.Conv2D(out_channels, kernel_size,
                                  input_shape=input_size,  # the first layer needs the input shape to build properly
                                  padding='same')
        self.bn = layers.BatchNormalization()

    def call(self, input_tensor, training=False):
        x = self.conv(input_tensor)  # input_shape is a constructor argument, not a call argument
        x = self.bn(x, training=training)
        x = tf.nn.relu(x)
        return x

# build the model with these blocks
model = keras.Sequential(
    [
        CNNBlock(64, kernel_size=4, strides=(2, 2)),
        Dropout(0.2),
        CNNBlock(64, kernel_size=2, strides=(2, 2)),
        Dropout(0.2),
        CNNBlock(32),
        Dropout(0.2),
        CNNBlock(32),
        MaxPooling2D(pool_size=(2, 2), strides=2),
        Dropout(0.2),
        Flatten(),
        Dense(64, activation='relu',  # dense layers to combine features
              kernel_initializer='he_normal',
              bias_initializer='zeros'),
        Dropout(0.2),
        Dense(10, activation='softmax',  # softmax for classification
              kernel_initializer='glorot_uniform',
              bias_initializer='zeros')
    ])

# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#model.build(input_shape=(32,32,3))
#model.summary()

# train model
history = model.fit(
    train_images, train_labels,
    validation_data=(test_images, test_labels),
    epochs=EPOCHS, batch_size=BATCH_SIZE,
    verbose=1, shuffle=True)  # verbose=1 shows the time for each epoch

# evaluate model
import matplotlib.pyplot as plt
%matplotlib inline

def plot_error(history):
    history_dict_vals = history.history
    history_x = history.epoch
    plt.plot(history_x, history_dict_vals['accuracy'], 'r-', label='training accuracy')
    plt.plot(history_x, history_dict_vals['val_accuracy'], 'g-', label='test accuracy')
    plt.axis([0, len(history_x), 0.0, 1])
    plt.xlabel('training epochs')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()
    print(f"Final test accuracy = {history_dict_vals['val_accuracy'][-1]}")

plot_error(history)
Working Functional Code (batch normalization layers commented out):
# same convolutional structure but with the keras functional API
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import SeparableConv2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
import numpy as np
import logging
tf.get_logger().setLevel(logging.ERROR)
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import matplotlib.pyplot as plt
%matplotlib inline

cifar_dataset = keras.datasets.cifar10
(train_images, train_labels), (test_images, test_labels) = cifar_dataset.load_data()
EPOCHS = 128
BATCH_SIZE = 128

# standardize dataset
mean = np.mean(train_images)
stdev = np.std(train_images)
train_images = (train_images - mean)/stdev
test_images = (test_images - mean)/stdev

# change labels to one-hot
train_labels = to_categorical(train_labels, num_classes=10)
test_labels = to_categorical(test_labels, num_classes=10)

# Keras model subclassing: build your own layers
# CNN -> batch norm -> ReLU
# create a class for this kind of block
class CNNBlock(layers.Layer):  # inherits from layers.Layer, which keeps track of what we need for backpropagation
    def __init__(self, out_channels, kernel_size=3, strides=(1, 1)):  # needs both __init__ and call methods
        super(CNNBlock, self).__init__()
        # the conv portion of this block (note: strides must be passed on to Conv2D)
        self.conv = layers.Conv2D(out_channels, kernel_size, strides=strides, padding='same',
                                  kernel_initializer='he_normal', bias_initializer='zeros')
        #self.bn = layers.BatchNormalization()  # batch normalization in this block

    def call(self, input_tensor, training=False):  # the training bool switches training/evaluation behavior
        # call method (forward method in pytorch):
        # take the input tensor and run it through the layers initialized in __init__
        x = self.conv(input_tensor)  # run the convolution
        #x = self.bn(x, training=training)  # batch norm
        x = tf.nn.relu(x)  # activation function for this layer
        return x

class CNNBlock_init(layers.Layer):
    def __init__(self, out_channels, input_size, kernel_size=3):
        super(CNNBlock_init, self).__init__()  # make sure the class name in super() matches
        self.input_size = input_size
        self.conv = layers.Conv2D(out_channels, kernel_size,
                                  input_shape=input_size,  # the first layer needs the input shape to build properly
                                  padding='same')
        #self.bn = layers.BatchNormalization()

    def call(self, input_tensor, training=False):
        x = self.conv(input_tensor)  # input_shape is a constructor argument, not a call argument
        #x = self.bn(x, training=training)
        x = tf.nn.relu(x)
        return x

# build the model with the Keras functional API
input_shape = (32, 32, 3)
chanDim = -1

# define model inputs
inputs = Input(shape=input_shape)

# functional API: pass tensors through the layers
x = CNNBlock(64, kernel_size=4, strides=(2, 2))(inputs)
x = Dropout(0.2)(x)
x = CNNBlock(64, kernel_size=2, strides=(2, 2))(x)
x = Dropout(0.2)(x)
x = CNNBlock(64)(x)
x = MaxPooling2D(pool_size=(2, 2), strides=2)(x)
x = Dropout(0.2)(x)
x = Flatten()(x)
x = Dense(64, activation='relu',  # dense layers to combine features
          kernel_initializer='he_normal',
          bias_initializer='zeros')(x)
x = Dropout(0.2)(x)
y = Dense(10, activation='softmax',  # softmax for classification
          kernel_initializer='glorot_uniform',
          bias_initializer='zeros')(x)

# initialize model with inputs and outputs
model = Model(inputs, y, name='convnet_func')

# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# train model
history = model.fit(
    train_images, train_labels,
    validation_data=(test_images, test_labels),
    epochs=EPOCHS, batch_size=BATCH_SIZE,
    verbose=1, shuffle=True)  # verbose=1 shows the time for each epoch

# evaluate model
import matplotlib.pyplot as plt
%matplotlib inline

def plot_error(history):
    history_dict_vals = history.history
    history_x = history.epoch
    plt.plot(history_x, history_dict_vals['accuracy'], 'r-', label='training accuracy')
    plt.plot(history_x, history_dict_vals['val_accuracy'], 'g-', label='test accuracy')
    plt.axis([0, len(history_x), 0.0, 1])
    plt.xlabel('training epochs')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()
    print(f"Final test accuracy = {history_dict_vals['val_accuracy'][-1]}")

plot_error(history)
Unfortunately, the model does not train when I uncomment the batch normalization layers.
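One thing worth checking (a hedged suggestion, not a confirmed diagnosis): the custom blocks declare call(self, input_tensor, training=False). Depending on the TF version, a hard-coded training=False default can end up baked in when the block is traced by the functional API, leaving BatchNormalization permanently in inference mode during fit(). The usual recommendation is to default the argument to None and let Keras supply the phase; a minimal sketch of the changed signature:

class CNNBlock(layers.Layer):
    def __init__(self, out_channels, kernel_size=3, strides=(1, 1)):
        super(CNNBlock, self).__init__()
        self.conv = layers.Conv2D(out_channels, kernel_size, strides=strides, padding='same')
        self.bn = layers.BatchNormalization()

    def call(self, input_tensor, training=None):  # None lets Keras supply the training phase
        x = self.conv(input_tensor)
        x = self.bn(x, training=training)  # batch norm now sees fit()/predict() correctly
        return tf.nn.relu(x)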

Custom Trainable Layers in Keras

In Keras, we can use a Lambda layer to create a custom layer, like this:
def f(x):
    return x**2

model.add(Lambda(f))
Now my question is: how do I make such a custom function trainable? That is, how do I make this function raise its input to the power w, where w is a trainable parameter? Like this:
def f(x):
    return x**w
The problem can be solved by making a new layer via subclassing:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.optimizers import *
import numpy as np

class ScaleLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(ScaleLayer, self).__init__()
        # a single trainable exponent, initialized to 1.0
        self.scale = tf.Variable(1., trainable=True)

    def call(self, inputs):
        return inputs ** self.scale

x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]).reshape(-1, 1)
y = x**3.25

l = ScaleLayer()
a1 = tf.keras.layers.Input(shape=1)
a2 = l(a1)
model = tf.keras.models.Model(a1, a2)
model.compile(optimizer=Adam(learning_rate=0.01), loss='mse')
model.fit(x, y, epochs=500, verbose=0)

print(l.weights)  # prints a value close to 3.25
More about this can be found here
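As a side note, the same layer can also be written with add_weight in build, which is the more idiomatic pattern when you want the variable named, tracked, and built against the input shape. A minimal sketch, assuming tf.keras (the class name ScaleLayer2 is mine):

class ScaleLayer2(tf.keras.layers.Layer):
    def build(self, input_shape):
        # a scalar trainable exponent, registered with the layer
        self.scale = self.add_weight(name='scale', shape=(),
                                     initializer=tf.keras.initializers.Constant(1.0),
                                     trainable=True)

    def call(self, inputs):
        return inputs ** self.scale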

What can cause the validation loss to increase and the accuracy to remain constant at zero while the training loss decreases?

I am trying to solve a multiclass text classification problem. Due to specific requirements of my project, I am trying to use skorch (https://skorch.readthedocs.io/en/stable/index.html) to wrap PyTorch for the sklearn pipeline. What I am trying to do is fine-tune a pretrained version of BERT from Huggingface (https://huggingface.co) with my dataset. I have tried, to the best of my knowledge, to follow the instructions from skorch on how I should input my data, structure the model, etc. Still, during training the train loss decreases until the 8th epoch, where it starts fluctuating, while the validation loss increases from the beginning and the validation accuracy remains constant at zero. My pipeline setup is
from sklearn.pipeline import Pipeline

pipeline = Pipeline(
    [
        ("tokenizer", Tokenizer()),
        ("classifier", _get_new_transformer())
    ]
)
in which I am using a tokenizer class to preprocess my dataset, tokenizing it for BERT and creating the attention masks. It looks like this
import torch
from transformers import AutoTokenizer, AutoModel
from torch import nn
import torch.nn.functional as F
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
import numpy as np

class Tokenizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super(Tokenizer, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("/path/to/model")

    def _tokenize(self, X, y=None):
        # tokenize one string and build its attention mask
        tokenized = self.tokenizer.encode_plus(X, max_length=20, add_special_tokens=True, pad_to_max_length=True)
        tokenized_text = tokenized['input_ids']
        attention_mask = tokenized['attention_mask']
        return np.array(tokenized_text), np.array(attention_mask)

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        word_tokens, attention_tokens = np.array([self._tokenize(string)[0] for string in tqdm(X)]), \
                                        np.array([self._tokenize(string)[1] for string in tqdm(X)])
        X = word_tokens, attention_tokens
        return X

    def fit_transform(self, X, y=None, **fit_params):
        self = self.fit(X, y)
        return self.transform(X, y)
then I initialize the model I want to fine-tune as
class Transformer(nn.Module):
    def __init__(self, num_labels=213, dropout_proba=.1):
        super(Transformer, self).__init__()
        self.num_labels = num_labels
        self.model = AutoModel.from_pretrained("/path/to/model")
        self.dropout = torch.nn.Dropout(dropout_proba)
        self.classifier = torch.nn.Linear(768, num_labels)

    def forward(self, X, **kwargs):
        X_tokenized, attention_mask = torch.stack([x.unsqueeze(0) for x in X[0]]), \
                                      torch.stack([x.unsqueeze(0) for x in X[1]])
        _, X = self.model(X_tokenized.squeeze(), attention_mask.squeeze())
        X = F.relu(X)
        X = self.dropout(X)
        X = self.classifier(X)
        return X
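Note: in transformers v4 and later, the model call returns a ModelOutput object rather than a tuple, so the tuple unpacking above would fail there. A minimal sketch of the equivalent access (assuming a BERT-style model with a pooler):

# transformers v4+ returns a ModelOutput instead of a tuple
outputs = self.model(X_tokenized.squeeze(), attention_mask=attention_mask.squeeze())
X = outputs.pooler_output  # replaces `_, X = self.model(...)`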
I initialize the model and create the classifier with skorch as follows
from skorch import NeuralNetClassifier
from skorch.dataset import CVSplit
from skorch.callbacks import ProgressBar
import torch
from transformers import AdamW

def _get_new_transformer() -> NeuralNetClassifier:
    transformer = Transformer()
    net = NeuralNetClassifier(
        transformer,
        lr=2e-5,
        max_epochs=10,
        criterion=torch.nn.CrossEntropyLoss,
        optimizer=AdamW,
        callbacks=[ProgressBar(postfix_keys=['train_loss', 'valid_loss'])],
        train_split=CVSplit(cv=2, random_state=0)
    )
    return net
and I call fit like this
pipeline.fit(X=dataset.training_samples, y=dataset.training_labels)
where my training samples are lists of strings and my labels are an array containing the index of each class, as PyTorch requires.
This is a sample of what happens (see the training history plot).
I have tried training only the fully connected layer and not BERT, but I have the same issue again. I also tested the train accuracy after the training process and it was only 0.16%. I would be grateful for any advice or insight on how to solve my problem! I am pretty new to skorch and not so comfortable with PyTorch yet, and I believe that I am missing something really simple. Thank you very much in advance!

Why would you add a variable to the _trainable_weights list of a layer?

In this notebook https://nbviewer.jupyter.org/github/krasserm/bayesian-machine-learning/blob/master/bayesian_neural_networks.ipynb, the author defines the function
def mixture_prior_params(sigma_1, sigma_2, pi):
    params = K.variable([sigma_1, sigma_2, pi], name='mixture_prior_params')
    sigma = np.sqrt(pi * sigma_1 ** 2 + (1 - pi) * sigma_2 ** 2)
    return params, sigma
which creates a variable and returns a tuple. This method is then called
prior_params, prior_sigma = mixture_prior_params(sigma_1=1.0, sigma_2=0.1, pi=0.2)
Then, in the class DenseVariational, which is a custom layer, in the method build, the prior_params global variable is added to the private list _trainable_weights
def build(self, input_shape):
    self._trainable_weights.append(prior_params)
    ...
Why would one need or want to do this? If I attempt to print the trainable parameters of either the custom layer or a model made of this custom layer, for example
# Create the model with DenseVariational layers
model = Model(x_in, x_out)
print("model.trainable_weights =", model.trainable_weights)
I can see that each DenseVariational layer contains a mixture_prior_params trainable parameter. Why should one declare mixture_prior_params, more specifically, sigma_1, sigma_2 and pi, outside of the layer, if they are trainable parameters of the layer?
After looking at the question Can I share weights between keras layers but have other parameters differ? and its answer (https://stackoverflow.com/a/45258859/3924118), and after printing the values of the model's trainable variables after training, it seems this is a way of sharing a variable across different layers: the value of that variable is equal across layers once the model has been trained.
I have created a simple example (with TensorFlow 2.0.0 and Keras 2.3.1) that shows this
import numpy as np
from keras import activations, initializers
from keras import backend as K
from keras import optimizers
from keras.layers import Input
from keras.layers import Layer
from keras.models import Model

shared_variable = K.variable([0.3], name='my_shared_variable')

class MyLayer(Layer):
    def __init__(self, output_dim, activation=None, **kwargs):
        self.output_dim = output_dim
        self.activation = activations.get(activation)
        super().__init__(**kwargs)

    def build(self, input_shape):
        # register the shared global variable as a trainable weight of this layer
        self._trainable_weights.append(shared_variable)
        self.my_weight = self.add_weight(name='my_weight',
                                         shape=(input_shape[1], self.output_dim),
                                         initializer=initializers.normal(),
                                         trainable=True)
        super().build(input_shape)

    def call(self, x):
        return self.activation(K.dot(x, self.my_weight * shared_variable))

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.output_dim

if __name__ == "__main__":
    # Define the architecture of the model.
    x_in = Input(shape=(1,))
    h1 = MyLayer(20, activation='relu')(x_in)
    h2 = MyLayer(20, activation='relu')(h1)
    x_out = MyLayer(1)(h2)
    model = Model(x_in, x_out)

    print("h1.trainable_weights (before training) =", model.layers[1].trainable_weights[0])
    print("h2.trainable_weights (before training) =", model.layers[2].trainable_weights[0])

    # Prepare the model for training.
    model.compile(loss="mse", optimizer=optimizers.Adam(lr=0.03))

    # Generate the dataset.
    X = np.linspace(-0.5, 0.5, 100).reshape(-1, 1)
    y = 10 * np.sin(2 * np.pi * X)

    # Train the model.
    model.fit(X, y, batch_size=1, epochs=100, verbose=0)

    print("h1.trainable_weights (after training) =", model.layers[1].trainable_weights[0])
    print("h2.trainable_weights (after training) =", model.layers[2].trainable_weights[0])
The output is
h1.trainable_weights (before training) = <tf.Variable 'my_shared_variable:0' shape=(1,) dtype=float32, numpy=array([0.3], dtype=float32)>
h2.trainable_weights (before training) = <tf.Variable 'my_shared_variable:0' shape=(1,) dtype=float32, numpy=array([0.3], dtype=float32)>
h1.trainable_weights (after training) = <tf.Variable 'my_shared_variable:0' shape=(1,) dtype=float32, numpy=array([0.7049409], dtype=float32)>
h2.trainable_weights (after training) = <tf.Variable 'my_shared_variable:0' shape=(1,) dtype=float32, numpy=array([0.7049409], dtype=float32)>
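For what it's worth, in TF2-style tf.keras the same sharing effect can be obtained without touching the private _trainable_weights list, because a tf.Variable assigned as a layer attribute is tracked automatically. A minimal sketch (the class name SharedScale is mine):

import tensorflow as tf

shared = tf.Variable([0.3], name='my_shared_variable')

class SharedScale(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.shared = shared  # attribute assignment registers the variable with this layer
        self.my_weight = self.add_weight(name='my_weight',
                                         shape=(int(input_shape[-1]), 1),
                                         initializer='random_normal',
                                         trainable=True)

    def call(self, x):
        return tf.matmul(x, self.my_weight * self.shared)

Every layer instance that assigns the same variable reports it in its trainable_weights, and Keras deduplicates it when collecting the model's weights for training.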

How to implement Grad-CAM on your own network?

I want to implement Grad-CAM on my own network. Should I save my model and load it, then treat my saved model like VGG-16 and perform similar operations?
I tried to search on the internet, and I found that all the methods are based on famous models, not on people's own ones.
So I wonder: maybe I just need to treat my own model as VGG-16 and then do similar things?
Hi, I have a solution in PyTorch:
import torch
import torch.nn as nn
from torch.utils import data
from torchvision import transforms
from torchvision import datasets
import matplotlib.pyplot as plt
import numpy as np

# use the ImageNet transformation
transform = transforms.Compose([transforms.Resize((224, 224)),
                                transforms.ToTensor(),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

# define a 1-image dataset
dataset = datasets.ImageFolder(root='./data/Elephant/', transform=transform)

# define the dataloader to load that single image
dataloader = data.DataLoader(dataset=dataset, shuffle=False, batch_size=1)

vgg19 = Mymodel()  # create an object of your model
vgg19.load_state_dict(torch.load("your_vgg19_weights"))

class VGG(nn.Module):
    def __init__(self):
        super(VGG, self).__init__()
        # get the pretrained VGG19 network
        self.vgg = vgg19
        # dissect the network to access its last convolutional layer
        self.features_conv = self.vgg.features[:36]  # the 36th layer was my last conv layer
        # get the max pool of the features stem
        self.max_pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        # get the classifier of the vgg19
        self.classifier = self.vgg.classifier
        # placeholder for the gradients
        self.gradients = None

    # hook for the gradients of the activations
    def activations_hook(self, grad):
        self.gradients = grad

    def forward(self, x):
        x = self.features_conv(x)
        # register the hook
        h = x.register_hook(self.activations_hook)
        # apply the remaining pooling
        x = self.max_pool(x)
        x = x.view((1, -1))
        x = self.classifier(x)
        return x

    # method for the gradient extraction
    def get_activations_gradient(self):
        return self.gradients

    # method for the activation extraction
    def get_activations(self, x):
        return self.features_conv(x)

vgg = VGG()
# set the evaluation mode
vgg.eval()

# get the image from the dataloader
img, _ = next(iter(dataloader))

# get the most likely prediction of the model
pred_class = vgg(img).argmax(dim=1).numpy()[0]
pred = vgg(img)
pred[:, pred_class].backward()

# pull the gradients out of the model
gradients = vgg.get_activations_gradient()

# pool the gradients across the channels
pooled_gradients = torch.mean(gradients, dim=[0, 2, 3])

# get the activations of the last convolutional layer
activations = vgg.get_activations(img).detach()

# weight the channels by the corresponding gradients
for i in range(512):
    activations[:, i, :, :] *= pooled_gradients[i]

# average the channels of the activations
heatmap = torch.mean(activations, dim=1).squeeze()

# ReLU on top of the heatmap
# expression (2) in https://arxiv.org/pdf/1610.02391.pdf
heatmap = np.maximum(heatmap, 0)

# normalize the heatmap
heatmap /= torch.max(heatmap)
heatmap = heatmap.numpy()
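# optionally, visualize the raw heatmap at this point, before overlaying it on the image
plt.matshow(heatmap)
plt.show()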
import cv2

img = cv2.imread('./data/Elephant/data/05fig34.jpg')
heatmap = cv2.resize(heatmap, (img.shape[1], img.shape[0]))
heatmap = np.uint8(255 * heatmap)
heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
superimposed_img = heatmap * 0.4 + img
cv2.imwrite('./map.jpg', superimposed_img)  # saves the Grad-CAM visualization image
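Note that the only model-specific pieces in the wrapper above are the slice up to the last convolutional layer (self.vgg.features[:36] here), the remaining head (the max pool, the flatten via view, and self.classifier), and the channel count in the weighting loop (512 for VGG19). So yes: you can treat your own model like the VGG examples, as long as you split it at your own last conv layer and adjust those pieces to match.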
