How to implement a hierarchical Transformer for document classification in Keras?

A hierarchical attention mechanism for document classification was presented by Yang et al.:
https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf
An implementation of it is available at https://github.com/ShawnyXiao/TextClassification-Keras
An implementation of document classification with a Transformer is also available at https://keras.io/examples/nlp/text_classification_with_transformer, but it is not hierarchical.
I have googled a lot but haven't found any implementation of a hierarchical Transformer. Does anyone know how to implement a hierarchical Transformer for document classification in Keras?
My implementation is as follows. Note that it extends Nandan's implementation for document classification: https://keras.io/examples/nlp/text_classification_with_transformer.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # inputs.shape = (batch_size, seq_len, embed_dim)
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(query, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(key, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(value, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(concat_attention)  # (batch_size, seq_len, embed_dim)
        return output

    def compute_output_shape(self, input_shape):
        # self-attention does not change the shape of its input
        return input_shape
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate, name=None):
        super(TransformerBlock, self).__init__(name=name)
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def compute_output_shape(self, input_shape):
        # the block does not change the shape of its input
        return input_shape
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim, name=None):
        super(TokenAndPositionEmbedding, self).__init__(name=name)
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def compute_output_shape(self, input_shape):
        # it changes the shape from (batch_size, maxlen) to (batch_size, maxlen, embed_dim)
        return input_shape + (self.pos_emb.output_dim,)
# Lower level (produce a representation of each sentence):
embed_dim = 100 # Embedding size for each token
num_heads = 2 # Number of attention heads
ff_dim = 64 # Hidden layer size in feed forward network inside transformer
L1_dense_units = 100 # Size of the sentence-level representations output by the word-level model
dropout_rate = 0.1
vocab_size = 1000
class_number = 5
max_docs = 10000
max_sentences = 15
max_words = 60
word_input = layers.Input(shape=(max_words,), name='word_input')
word_embedding = TokenAndPositionEmbedding(maxlen=max_words, vocab_size=vocab_size,
embed_dim=embed_dim, name='word_embedding')(word_input)
word_transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='word_transformer')(word_embedding)
word_pool = layers.GlobalAveragePooling1D(name='word_pooling')(word_transformer)
word_drop = layers.Dropout(dropout_rate, name='word_drop')(word_pool)
word_dense = layers.Dense(L1_dense_units, activation="relu", name='word_dense')(word_drop)
word_encoder = keras.Model(word_input, word_dense)
word_encoder.summary()
# =========================================================================
# Upper level (produce a representation of each document):
L2_dense_units = 100
sentence_input = layers.Input(shape=(max_sentences, max_words), name='sentence_input')
sentence_encoder = tf.keras.layers.TimeDistributed(word_encoder, name='sentence_encoder')(sentence_input)
sentence_transformer = TransformerBlock(embed_dim=L1_dense_units, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='sentence_transformer')(sentence_encoder)
sentence_pool = layers.GlobalAveragePooling1D(name='sentence_pooling')(sentence_transformer)
sentence_out = layers.Dropout(dropout_rate)(sentence_pool)
preds = layers.Dense(class_number, activation='softmax', name='sentence_output')(sentence_out)
model = keras.Model(sentence_input, preds)
model.summary()
The summary of the model is as follows:
Model: "model_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
word_input (InputLayer) [(None, 60)] 0
word_embedding (TokenAndPos (None, 60, 100) 106000
itionEmbedding)
word_transformer (Transform (None, 60, 100) 53764
erBlock)
word_pooling (GlobalAverage (None, 100) 0
Pooling1D)
word_drop (Dropout) (None, 100) 0
word_dense (Dense) (None, 100) 10100
=================================================================
Total params: 169,864
Trainable params: 169,864
Non-trainable params: 0
_________________________________________________________________
Model: "model_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
sentence_input (InputLayer) [(None, 15, 60)] 0
sentence_encoder (TimeDistr (None, 15, 100) 169864
ibuted)
sentence_transformer (Trans (None, 15, 100) 53764
formerBlock)
sentence_pooling (GlobalAve (None, 100) 0
ragePooling1D)
dropout_9 (Dropout) (None, 100) 0
sentence_output (Dense) (None, 5) 505
=================================================================
Total params: 224,133
Trainable params: 224,133
Non-trainable params: 0
Everything is OK, and you can copy and paste this code into Colab to see the model summaries.
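As a quick end-to-end check (my addition, not part of the original post; it reuses the hyperparameters defined above with random data), the model also compiles and trains:
# Smoke test on random token ids and labels, just to confirm that shapes
# flow through the hierarchy. Not real training data.
import numpy as np
x_dummy = np.random.randint(0, vocab_size, size=(32, max_sentences, max_words))
y_dummy = to_categorical(np.random.randint(0, class_number, size=(32,)), num_classes=class_number)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit(x_dummy, y_dummy, batch_size=8, epochs=1)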
But my problem is positional encoding at the sentence level.
How can positional encoding be applied at the sentence level?

The implementation is recursive in the sense that you treat the averaged outputs of transformer x as the input to transformer x+1.
So let's say your data is structured as (batch, chapter, paragraph, sentence, token).
After the first transformation you end up with (batch, chapter, paragraph, sentence, embedding), so you average and get (batch, chapter, paragraph, sentence_embedding_in).
Apply another transformation and get (batch, chapter, paragraph, sentence_embedding_out).
Average again and get (batch, chapter, paragraph_embedding). Rinse and repeat.
The implementation of the paper is actually in a different repository:
https://github.com/ematvey/hierarchical-attention-networks
They actually do something different from what I've described and apply transformers at the bottom and an RNN at the top. In theory you could do the opposite, or apply an RNN at each level (that would be really slow). As far as the implementation is concerned you can abstract from that - the principle remains the same: you apply a transformation, average the outputs, and feed the result into the next higher-level "layer" (or "module", in torch lingo).
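To address the sentence-level positional encoding asked about above, one option (a sketch of mine, reusing the layers defined in the question rather than anything from the linked repositories) is to add a learned position embedding over the sentence axis, mirroring what TokenAndPositionEmbedding does over the word axis:
# Hedged sketch: learned positional embeddings over the sentence axis.
class SentencePositionEmbedding(layers.Layer):
    def __init__(self, max_sentences, embed_dim, name=None):
        super(SentencePositionEmbedding, self).__init__(name=name)
        self.pos_emb = layers.Embedding(input_dim=max_sentences, output_dim=embed_dim)

    def call(self, x):
        # x has shape (batch_size, max_sentences, embed_dim)
        positions = tf.range(start=0, limit=tf.shape(x)[1], delta=1)
        return x + self.pos_emb(positions)

# Wire it in between the TimeDistributed word encoder and the sentence Transformer:
sentence_input = layers.Input(shape=(max_sentences, max_words), name='sentence_input')
sentence_encodings = layers.TimeDistributed(word_encoder, name='sentence_encoder')(sentence_input)
sentence_encodings = SentencePositionEmbedding(max_sentences, L1_dense_units,
                                               name='sentence_pos_embedding')(sentence_encodings)
sentence_transformer = TransformerBlock(embed_dim=L1_dense_units, num_heads=num_heads, ff_dim=ff_dim,
                                        dropout_rate=dropout_rate, name='sentence_transformer')(sentence_encodings)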

Related

Discrepancy between tensorflow model accuracy and actual accuracy

I am trying to do categorical classification of data. The data consists of three text variables and one real value. I split the data into three sets (training, validation, and testing), and I am using TensorFlow and Python. On the test data I get a categorical accuracy of 0.9919; however, when I run a prediction on the same test data and evaluate it with scikit-learn's classification_report function, I get an accuracy of 0.60.
df.info() shows 'category' as the Y value. I don't think the scikit-learn accuracy is misrepresenting the prediction, since the confusion matrix agrees with it.
I have rebuilt the model multiple times and tried balancing classes with class weights just in case, but this still wouldn't explain the discrepancy between TensorFlow's accuracy and the scikit-learn one (obtained by prediction).
The code:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
df = pd.read_csv("procData_nosub.csv")
df = df.sample(frac=1).reset_index(drop=True) # Shuffling
df = df[(df.category != 'Pictures')] # Removing small categories
df = df[(df.category != 'Software')]
df = df.drop("fileAmount", axis=1)
df = df.drop("more100Files", axis=1)
train, val, test = np.split(df.sample(frac=1), [int(0.8 * len(df)), int(0.9 * len(df))])
# Function to convert dataframe to dataset (from https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers)
def df_to_dataset(dataframe, shuffle=True, batch_size=64):
    df = dataframe.copy()
    labels = df.pop('category')
    df = {key: value[:, tf.newaxis] for key, value in df.items()}
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds
train_data = df_to_dataset(train)
test_data = df_to_dataset(test)
validation_data = df_to_dataset(val)
# Function to convert text (for the Y) to one-hot-encoding
catVals = np.unique(df['category'])
table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(catVals),
        values=list(range(len(catVals)))
    ),
    default_value=-1,
    name="target_encoding"
)

# tf.function
def target(x):
    return table.lookup(x)

def fetch(features, labels):
    return features, tf.one_hot(target(labels), len(catVals))
# Applying the text (Y) -> one-hot-encoding
train_data_f = train_data.map(fetch)
test_data_f = test_data.map(fetch)
validation_data_f = validation_data.map(fetch)
# Using an encoder
embedding = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
hub_layer = hub.KerasLayer(embedding, output_shape = 512, input_shape = [], dtype=tf.string, trainable=True)
# Normalizing real values (from https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers)
def get_normalization_layer(name, dataset):
    normalizer = tf.keras.layers.Normalization(axis=None)
    feature_ds = dataset.map(lambda x, y: x[name])
    normalizer.adapt(feature_ds)
    return normalizer
all_inputs = []
encoded_features = []
# Adding filesize to features
for header in ['fileSize']:  # fileAmount
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    normalization_layer = get_normalization_layer(header, train_data_f)
    encoded_numeric_col = normalization_layer(numeric_col)
    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)
# Adding title, description, files to features
for header in ['title', 'description', 'files']:
    text_col = tf.keras.Input(shape=(), name=header, dtype='string')
    encoded_text_col = hub_layer(text_col)
    all_inputs.append(text_col)
    encoded_features.append(encoded_text_col)
# Describing the model
all_features = tf.keras.layers.concatenate(encoded_features)
x = all_features
for i in range(3):
    x = tf.keras.layers.Dense(16, activation='relu')(x)
    # x = tf.keras.layers.Dropout(0.2)(x)
output = tf.keras.layers.Dense(len(catVals), activation='softmax')(x)
model = tf.keras.Model(all_inputs, output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              loss=tf.keras.losses.CategoricalCrossentropy(),  # deleted from_logits=True
              metrics=["categorical_accuracy",
                       tf.keras.metrics.Precision(),
                       tf.keras.metrics.Recall(),
                       tfa.metrics.F1Score(num_classes=len(catVals),
                                           average='macro',
                                           threshold=0.5)])
history = model.fit(train_data_f, epochs=5, validation_data=validation_data_f) # removed class weights
model.evaluate(test_data_f)
model.evaluate(validation_data_f)
# Taking x and y from the test data
test_x = test_data_f.unbatch().map(lambda x, y: x)
test_y = test_data_f.unbatch().map(lambda x, y: y)
test_predicted = model.predict(test_x)
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# converting test_y to a numpy array (from a tensorflow Dataset)
test_y = np.array([x for x in test_y])
print(classification_report(test_y.argmax(1), test_predicted.argmax(1)))
print(tf.math.confusion_matrix(test_y.argmax(1), test_predicted.argmax(1)))
As a TL;DR: I am using universal-sentence-encoder-multilingual/3 from TensorFlow Hub; the model build and fit history are shown above. Model summary:
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
fileSize (InputLayer) [(None, 1)] 0 []
title (InputLayer) [(None,)] 0 []
description (InputLayer) [(None,)] 0 []
files (InputLayer) [(None,)] 0 []
normalization (Normalization) (None, 1) 3 ['fileSize[0][0]']
keras_layer (KerasLayer) (None, 512) 68927232 ['title[0][0]',
'description[0][0]',
'files[0][0]']
concatenate (Concatenate) (None, 1537) 0 ['normalization[0][0]',
'keras_layer[0][0]',
'keras_layer[1][0]',
'keras_layer[2][0]']
dense (Dense) (None, 16) 24608 ['concatenate[0][0]']
dense_1 (Dense) (None, 16) 272 ['dense[0][0]']
dense_2 (Dense) (None, 16) 272 ['dense_1[0][0]']
dense_3 (Dense) (None, 4) 68 ['dense_2[0][0]']
==================================================================================================
Total params: 68,952,455
Trainable params: 68,952,452
Non-trainable params: 3
__________________________________________________________________________________________________
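A likely culprit worth checking (an assumption on my part, since the dataset isn't available): df_to_dataset shuffles by default, and a shuffled tf.data.Dataset re-shuffles on every iteration, so unbatching test_x and test_y as two separate pipelines can pair predictions with labels from different orderings. A sketch of a safer evaluation:
# Hedged sketch: rebuild the test pipeline without shuffling so that the
# predictions and the collected labels line up in the same order.
test_data_f = df_to_dataset(test, shuffle=False).map(fetch)
test_predicted = model.predict(test_data_f.map(lambda x, y: x))
test_y = np.concatenate([y.numpy() for x, y in test_data_f], axis=0)
print(classification_report(test_y.argmax(1), test_predicted.argmax(1)))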

Input 0 of layer "dense_14" is incompatible with the layer: expected axis -1 of input shape to have value 148, but received input with shape (32, 21)

I'm trying to run a script on my data, but I get this error:
Input 0 of layer "dense_14" is incompatible with the layer: expected axis -1 of input shape to have value 148, but received input with shape (32, 21).
I understand that the shapes of my data, which are (11598, 1), aren't compatible with the shapes the model expects. Can anyone help?
This is the code:
from __future__ import print_function, division
import tensorflow as tf
from keras.layers import Input, Dense, Activation
from keras.layers import Maximum, Concatenate
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, BatchNormalization
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
class MalGAN():
    def __init__(self):
        self.apifeature_dims = 128
        self.z_dims = 20
        self.hide_layers = 256
        self.generator_layers = [self.apifeature_dims + self.z_dims, self.hide_layers, self.apifeature_dims]
        self.substitute_detector_layers = [self.apifeature_dims, self.hide_layers, 1]
        self.blackbox = 'MLP'
        optimizer = Adam(lr=0.001)
        # Build and train blackbox_detector
        self.blackbox_detector = self.build_blackbox_detector()
        # Build and compile the substitute_detector
        self.substitute_detector = self.build_substitute_detector()
        self.substitute_detector.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        # Build the generator
        self.generator = self.build_generator()
        # The generator takes malware and noise as input and generates adversarial malware examples
        # example = Input(shape=(None,1), name='example', dtype='float32')
        # noise = Input(shape=(None,1), name='noise', dtype='float32')
        example = Input(shape=(self.apifeature_dims,))
        noise = Input(shape=(self.z_dims,))
        input = [example, noise]
        malware_examples = self.generator(input)
        # For the combined model we will only train the generator
        self.substitute_detector.trainable = True
        # The discriminator takes generated images as input and determines validity
        validity = self.substitute_detector(malware_examples)
        # The combined model (stacked generator and substitute_detector)
        # trains the generator to fool the discriminator
        self.combined = Model(input, validity)
        self.combined.compile(loss='binary_crossentropy', optimizer=optimizer)
    def build_blackbox_detector(self):
        if self.blackbox == 'MLP':
            blackbox_detector = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
                                              solver='sgd', verbose=0, tol=1e-4, random_state=1,
                                              learning_rate_init=.1)
        return blackbox_detector
    def build_generator(self):
        # example = Input(shape=(None,1))
        # noise = Input(shape=(None,1))
        example = Input(shape=(self.apifeature_dims,))
        noise = Input(shape=(self.z_dims,))
        # example = Input(shape=(256), name='example', dtype='float32')
        # noise = Input(shape=(256), name='noise', dtype='float32')
        x = Concatenate(axis=1)([example, noise])
        for dim in self.generator_layers[1:]:
            x = Dense(dim)(x)
            x = Activation(activation='sigmoid')(x)
        x = Maximum()([example, x])
        generator = Model([example, noise], x, name='generator')
        generator.summary()
        return generator

    def build_substitute_detector(self):
        input = Input(shape=(self.substitute_detector_layers[0],))
        # input = Input(shape=(5,256), dtype='float32')
        x = input
        for dim in self.substitute_detector_layers[1:]:
            x = Dense(dim)(x)
            x = Activation(activation='sigmoid')(x)
        substitute_detector = Model(input, x, name='substitute_detector')
        substitute_detector.summary()
        return substitute_detector
    def load_data(self, filename):
        # data = pd.read_csv("feature_vectors_system calls.csv")
        data = np.load(filename)
        xmal, ymal, xben, yben = data['arr_0'].reshape(-1, 1), data['arr_1'].reshape(-1, 1), \
                                 data['arr_2'].reshape(-1, 1), data['arr_3'].reshape(-1, 1)
        return (xmal, ymal), (xben, yben)
    def train(self, epochs, batch_size=128):
        # Load the dataset
        (xmal, ymal), (xben, yben) = self.load_data('my_data.npz')
        xtrain_mal, xtest_mal, ytrain_mal, ytest_mal = train_test_split(xmal, ymal, test_size=0.20)
        xtrain_ben, xtest_ben, ytrain_ben, ytest_ben = train_test_split(xben, yben, test_size=0.20)
        print("xmal is : ", xmal.shape)
        print("ymal is : ", ymal.shape)
        print("xben is : ", xben.shape)
        print("yben is ", yben.shape)
        print("here 1")
        # Train blackbox_detector
        self.blackbox_detector.fit(np.concatenate([xmal, xben]),
                                   np.concatenate([ymal, yben]))
        ytrain_ben_blackbox = self.blackbox_detector.predict(xtrain_ben)
        Original_Train_TRR = self.blackbox_detector.score(xtrain_mal, ytrain_mal)
        Original_Test_TRR = self.blackbox_detector.score(xtest_mal, ytest_mal)
        print("here 2")
        Train_TRR, Test_TRR = [], []
        for epoch in range(epochs):
            for step in range(1):  # range(xtrain_mal.shape[0] // batch_size):
                # ---------------------
                #  Train substitute_detector
                # ---------------------
                # Select a random batch of malware examples
                idx = np.random.randint(0, xtrain_mal.shape[0], batch_size)
                xmal_batch = xtrain_mal[idx]
                noise = np.random.uniform(0, 1, (batch_size, self.z_dims))
                idx = np.random.randint(0, xmal_batch.shape[0], batch_size)
                xben_batch = xtrain_ben[idx]
                yben_batch = ytrain_ben_blackbox[idx]
                # Generate a batch of new malware examples
                # noise.shape = (320,2)
                # xmal_batch = np.asarray(xmal_batch).astype(np.float32)
                # noise = np.asarray(noise).astype(np.float32)
                # xmal_batch.shape = (32,1)
                # noise.shape = (32,20)
                print("xmal is: ", xmal_batch.shape)
                print("noise is:", noise.shape)
                # xmal_batch = tf.convert_to_tensor(xmal_batch)
                # noise = tf.convert_to_tensor(noise)
                xmal_batch.shape = (128, 1)
                noise.shape = (128, 20)
                # print("xmal is:", xmal)
                # print("noise is:", noise)
                # xmal_batch = np.asarray(xmal_batch).astype(np.float32)
                noise = np.asarray(noise).astype(np.float32)
                gen_examples = self.generator.predict([xmal_batch, noise])
                # gen_examples = np.vectorize(gen_examples)
                # gen_examples = np.array(gen_examples)
                # gen_examples = gen_examples.reshape(gen_examples.shape[0], -1)
                # print(gen_examples.shape)
                gen_examples = gen_examples.reshape(-1, 1)
                ymal_batch = self.blackbox_detector.predict(np.ones(gen_examples.shape) * (gen_examples > 0.5))
                print("ymal shape is:", ymal_batch.shape)
                print("gen_examples is:", gen_examples.shape)
                # gen_examples.shape = (2688,128)
                # Train the substitute_detector
                ymal_batch = np.array([0 for _ in range(len(gen_examples) // 2)] + [1 for _ in range(len(gen_examples) // 2)])
                d_loss_real = self.substitute_detector.train_on_batch(gen_examples, ymal_batch)
                print("yben shape is:", yben_batch.shape)
                print("xben shape is:", xben_batch.shape)
                # xben_batch.shape = (1,128)
                # yben_batch.shape = (128,1)
                print("yben shape is:", yben_batch.shape)
                print("xben shape is:", xben_batch.shape)
                d_loss_fake = self.substitute_detector.train_on_batch(xben_batch, yben_batch)
                d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
                # ---------------------
                #  Train Generator
                # ---------------------
                idx = np.random.randint(0, xtrain_mal.shape[0], batch_size)
                xmal_batch = xtrain_mal[idx]
                noise = np.random.uniform(0, 1, (batch_size, self.z_dims))
                # Train the generator
                g_loss = self.combined.train_on_batch([xmal_batch, noise], np.zeros((batch_size, 1)))
            # Compute Train TRR
            noise = np.random.uniform(0, 1, (xtrain_mal.shape[0], self.z_dims))
            gen_examples = self.generator.predict([xtrain_mal, noise])
            TRR = self.blackbox_detector.score(np.ones(gen_examples.shape) * (gen_examples > 0.5), ytrain_mal)
            Train_TRR.append(TRR)
            # Compute Test TRR
            noise = np.random.uniform(0, 1, (xtest_mal.shape[0], self.z_dims))
            gen_examples = self.generator.predict([xtest_mal, noise])
            TRR = self.blackbox_detector.score(np.ones(gen_examples.shape) * (gen_examples > 0.5), ytest_mal)
            Test_TRR.append(TRR)
            # Plot the progress
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss))
        print('Original_Train_TRR: {0}, Adver_Train_TRR: {1}'.format(Original_Train_TRR, Train_TRR[-1]))
        print('Original_Test_TRR: {0}, Adver_Test_TRR: {1}'.format(Original_Test_TRR, Test_TRR[-1]))
        # Plot TRR
        plt.figure()
        plt.plot(range(epochs), Train_TRR, c='r', label='Training Set', linewidth=2)
        plt.plot(range(epochs), Test_TRR, c='g', linestyle='--', label='Validation Set', linewidth=2)
        plt.xlabel("Epoch")
        plt.ylabel("TRR")
        plt.legend()
        plt.show()

if __name__ == '__main__':
    malgan = MalGAN()
    malgan.train(epochs=1000, batch_size=128)
The output is:
Model: "substitute_detector"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_16 (InputLayer) [(None, 128)] 0
dense_12 (Dense) (None, 256) 33024
activation_12 (Activation) (None, 256) 0
dense_13 (Dense) (None, 1) 257
activation_13 (Activation) (None, 1) 0
=================================================================
Total params: 33,281
Trainable params: 33,281
Non-trainable params: 0
_________________________________________________________________
Model: "generator"
__________________________________________________________________________________________________
/usr/local/lib/python3.7/dist-packages/keras/optimizer_v2/adam.py:105: UserWarning: The `lr` argument is deprecated, use `learning_rate` instead.
super(Adam, self).__init__(name, **kwargs)
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_17 (InputLayer) [(None, 128)] 0 []
input_18 (InputLayer) [(None, 20)] 0 []
concatenate_3 (Concatenate) (None, 148) 0 ['input_17[0][0]',
'input_18[0][0]']
dense_14 (Dense) (None, 256) 38144 ['concatenate_3[0][0]']
activation_14 (Activation) (None, 256) 0 ['dense_14[0][0]']
dense_15 (Dense) (None, 128) 32896 ['activation_14[0][0]']
activation_15 (Activation) (None, 128) 0 ['dense_15[0][0]']
maximum_3 (Maximum) (None, 128) 0 ['input_17[0][0]',
'activation_15[0][0]']
==================================================================================================
Total params: 71,040
Trainable params: 71,040
Non-trainable params: 0
__________________________________________________________________________________________________
xmal is : (11598, 1)
ymal is : (11598, 1)
xben is : (11598, 1)
yben is (11598, 1)
here 1
after which I get
/usr/local/lib/python3.7/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:1109: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
/usr/local/lib/python3.7/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:696: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (10) reached and the optimization hasn't converged yet.
ConvergenceWarning,
here 2
xmal is: (128, 1)
noise is: (128, 20)
WARNING:tensorflow:Model was constructed with shape (None, 128) for input KerasTensor(type_spec=TensorSpec(shape=(None, 128), dtype=tf.float32, name='input_17'), name='input_17', description="created by layer 'input_17'"), but it was called on an input with incompatible shape (32, 1).
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-4-937d6995f5c4> in <module>()
240 if __name__ == '__main__':
241 malgan = MalGAN()
--> 242 malgan.train(epochs=1000, batch_size=128)
2 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/func_graph.py in autograph_handler(*args, **kwargs)
1145 except Exception as e: # pylint:disable=broad-except
1146 if hasattr(e, "ag_error_metadata"):
-> 1147 raise e.ag_error_metadata.to_exception(e)
1148 else:
1149 raise
ValueError: in user code:
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1801, in predict_function *
return step_function(self, iterator)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1790, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1783, in run_step **
outputs = model.predict_step(data)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1751, in predict_step
return self(x, training=False)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/usr/local/lib/python3.7/dist-packages/keras/engine/input_spec.py", line 249, in assert_input_compatibility
f'Input {input_index} of layer "{layer_name}" is '
ValueError: Exception encountered when calling layer "generator" (type Functional).
Input 0 of layer "dense_14" is incompatible with the layer: expected axis -1 of input shape to have value 148, but received input with shape (32, 21)
Call arguments received:
• inputs=('tf.Tensor(shape=(32, 1), dtype=int64)', 'tf.Tensor(shape=(32, 20), dtype=float32)')
• training=False
• mask=None
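A note on the likely cause (my reading of the code, not a confirmed fix): load_data reshapes every array to (-1, 1), so each sample ends up with a single feature, while the generator was built with apifeature_dims = 128. The concatenated generator input is therefore 1 + 20 = 21 wide instead of 128 + 20 = 148, which is exactly what the error reports. A sketch of a fix, assuming the arrays in my_data.npz really do contain 128 API features per sample:
# Hedged sketch: keep the 128 feature columns instead of flattening to one.
xmal = data['arr_0'].reshape(-1, 128)   # (num_samples, 128), matching apifeature_dims
ymal = data['arr_1'].reshape(-1)        # labels as a 1-D vector, as sklearn expects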

PyTorch multi-class: ValueError: Expected input batch_size (416) to match target batch_size (32)

I have created a multi-class classification neural network. Training and validation iterators were created with the BigBucketIterator method with fields {'text_normalized_tweet': TEXT, 'label': LABEL}, where
TEXT = a tweet
LABEL = a float number (with 3 values: 0, 1, 2)
Below I execute a dummy example of my neural network:
import torch.nn as nn

class MultiClassClassifer(nn.Module):
    # define all the layers used in the model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        # constructor
        super(MultiClassClassifer, self).__init__()
        # embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # dense layer
        self.hiddenLayer = nn.Linear(embedding_dim, hidden_dim)
        # batch normalization layer
        self.batchnorm = nn.BatchNorm1d(hidden_dim)
        # output layer
        self.output = nn.Linear(hidden_dim, output_dim)
        # activation layer
        self.act = nn.Softmax(dim=1)  # 2d-tensor
        # initialize weights of embedding layer
        self.init_weights()

    def init_weights(self):
        initrange = 1.0
        self.embedding.weight.data.uniform_(-initrange, initrange)

    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
        tensor, batch_size = packed_embedded[0], packed_embedded[1]
        hidden_1 = self.batchnorm(self.hiddenLayer(tensor))
        return self.act(self.output(hidden_1))
Instantiate the model
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 64
OUTPUT_DIM = 3
model = MultiClassClassifer(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
When I call
text, text_lengths = batch.text_normalized_tweet
predictions = model(text, text_lengths).squeeze()
loss = criterion(predictions, batch.label)
it returns,
ValueError: Expected input batch_size (416) to match target batch_size (32).
model(text, text_lengths).squeeze() = torch.Size([416, 3])
batch.label = torch.Size([32])
I can see that the two objects have different sizes, but I have no clue how to fix this.
You may find the Google Colab notebook here
Shapes of each in, out tensor of my forward() method:
torch.Size([32, 10, 100]) #self.embedding(text)
torch.Size([320, 100]) #nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
torch.Size([320, 64]) #self.batchnorm(self.hiddenLayer(tensor))
torch.Size([320, 3]) #self.act(self.output(hidden_1))
You shouldn't use the squeeze function on the output of the forward pass; it doesn't make sense there.
After removing the squeeze function, as you can see, the shape of your final output is [320, 3], whereas [32, 3] is expected. One way to fix this is to average the embeddings you obtain for each word after the self.embedding call, like so:
def forward(self, text, text_lengths):
    embedded = self.embedding(text)
    embedded = torch.mean(embedded, dim=1, keepdim=True)
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
    tensor, batch_size = packed_embedded[0], packed_embedded[1]
    hidden_1 = self.batchnorm(self.hiddenLayer(tensor))
    return self.act(self.output(hidden_1))
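A simpler variant (a sketch of mine, not from the original answer) skips the packing step entirely, since after mean-pooling there is no variable-length sequence left to pack; it produces the expected [32, 3] directly:
import torch

def forward(self, text, text_lengths):
    # text: (batch_size, seq_len) integer token ids
    embedded = self.embedding(text)                       # (batch_size, seq_len, embedding_dim)
    pooled = torch.mean(embedded, dim=1)                  # (batch_size, embedding_dim)
    hidden_1 = self.batchnorm(self.hiddenLayer(pooled))   # (batch_size, hidden_dim)
    return self.act(self.output(hidden_1))                # (batch_size, output_dim)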

Expected input batch_size (18) to match target batch_size (6)

Is an RNN for image classification usable only for grayscale images?
The following program works for grayscale image classification.
If RGB images are used, I get this error:
Expected input batch_size (18) to match target batch_size (6)
at the line loss = criterion(outputs, labels).
My data loading for train, valid, and test is as follows.
input_size = 300
inputH = 300
inputW = 300
#Data transform (normalization & data augmentation)
stats = ((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
train_resize_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
tt.ToTensor(),
tt.Normalize(*stats)])
train_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
tt.RandomHorizontalFlip(),
tt.ToTensor(),
tt.Normalize(*stats)])
valid_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
tt.ToTensor(),
tt.Normalize(*stats)])
test_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
tt.ToTensor(),
tt.Normalize(*stats)])
#Create dataset
train_ds = ImageFolder('./data/train', train_tfms)
valid_ds = ImageFolder('./data/valid', valid_tfms)
test_ds = ImageFolder('./data/test', test_tfms)
from torch.utils.data.dataloader import DataLoader
batch_size = 6
#Training data loader
train_dl = DataLoader(train_ds, batch_size, shuffle = True, num_workers = 8, pin_memory=True)
#Validation data loader
valid_dl = DataLoader(valid_ds, batch_size, shuffle = True, num_workers = 8, pin_memory=True)
#Test data loader
test_dl = DataLoader(test_ds, 1, shuffle = False, num_workers = 1, pin_memory=True)
My model is as follows.
num_steps = 300
hidden_size = 256 #size of hidden layers
num_classes = 5
num_epochs = 20
learning_rate = 0.001
# Fully connected neural network with one hidden layer
num_layers = 2 # 2 RNN layers are stacked
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)  # batch must be the first dimension
        # our input needs to have shape
        # x -> (batch_size, seq, input_size)
        self.fc = nn.Linear(hidden_size, num_classes)  # this fc comes after the RNN, so it needs the RNN's last hidden size

    def forward(self, x):
        # according to the documentation of RNN in pytorch,
        # rnn needs input and h_0 (the initial hidden state)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)  # (num_layers, batch_size, hidden_size)
        # the forward pass returns two outputs: the first tensor contains the
        # output features of the last hidden layer for all time steps,
        # the second one is the hidden state
        out, _ = self.rnn(x, h0)
        # out has shape (batch_size, num_steps, hidden_size)
        # we only need to decode the hidden state of the last time step:
        # out (N, 300, 256) -> (N, 256)
        out = out[:, -1, :]  # -1 selects the last time step
        out = self.fc(out)
        return out
stacked_rnn_model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()#cross entropy has softmax at output
#optimizer = torch.optim.Adam(stacked_rnn_model.parameters(), lr=learning_rate) #optimizer used gradient optimization using Adam
optimizer = torch.optim.SGD(stacked_rnn_model.parameters(), lr=learning_rate)
# Train the model
n_total_steps = len(train_dl)
for epoch in range(num_epochs):
    t_losses = []
    for i, (images, labels) in enumerate(train_dl):
        # origin shape: [6, 3, 300, 300]
        # resized: [6, 300, 300]
        images = images.reshape(-1, num_steps, input_size).to(device)
        print('images shape')
        print(images.shape)
        labels = labels.to(device)
        # Forward pass
        outputs = stacked_rnn_model(images)
        print('outputs shape')
        print(outputs.shape)
        loss = criterion(outputs, labels)
        t_losses.append(loss)
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
The printed image and output shapes are:
images shape
torch.Size([18, 300, 300])
outputs shape
torch.Size([18, 5])
Where is the mistake?
TL;DR: you are flattening the first two axes, namely batch and channels.
I am not sure you are taking the right approach, but I will come back to that at the end.
In any case, let's look at the issue you are facing. You have a data loader that produces (6, 3, 300, 300), i.e. batches of 6 three-channel 300x300 images. By the look of it, you are looking to reshape each batch element (3, 300, 300) into (num_steps=300, -1).
However, instead of that you are affecting the first axis - which you shouldn't - with images.reshape(-1, num_steps, input_size). This would have the desired effect when working with single-channel images, since dim=1 wouldn't be the channel axis. In your case you have 3 channels, so the resulting shape is (6*3*300*300//300//300, 300, 300), which is (18, 300, 300) since num_steps=300 and input_size=300. As a result you are left with 18 batch elements instead of 6.
Instead, what you want is to reshape with (batch_size, num_steps, -1), leaving the last axis (the per-step input size) of variable length. This results in a shape of (6, 300, 900).
Here is a corrected and reduced snippet:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

batch_size = 6
channels = 3
inputH, inputW = 300, 300
train_ds = TensorDataset(torch.rand(100, 3, inputH, inputW), torch.rand(100, 5))
train_dl = DataLoader(train_ds, batch_size)

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        # (batch_size, seq, input_size)
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        # (batch_size, hidden_size)
        self.fc = nn.Linear(hidden_size, num_classes)
        # (batch_size, num_classes)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = out[:, -1, :]
        out = self.fc(out)
        return out

num_steps = 300
input_size = inputH*inputW*channels//num_steps
hidden_size = 256
num_classes = 5
num_layers = 2
rnn = RNN(input_size, hidden_size, num_layers, num_classes)

for x, y in train_dl:
    print(x.shape, y.shape)
    images = x.reshape(batch_size, num_steps, -1)
    print(images.shape)
    outputs = rnn(images)
    print(outputs.shape)
    break
As I said at the beginning, I am a bit wary of this approach because you are essentially feeding your RNN an RGB 300x300 image in the form of a sequence of 300 flattened vectors... I can't say whether that makes sense in terms of training and whether the model will be able to learn from it. I could be wrong!

Make a "non-fully connected" (singly connected?) neural network in keras

I don't know the name of what I'm looking for, but I want to make a layer in keras where each input is multiplied by its own, independent weight and bias. E.g. if there were 10 inputs, there would be 10 weights, and 10 biases, and each input would be multiplied by its weight and summed with its bias to get 10 outputs.
For example here is a simple Dense network:
from keras.layers import Input, Dense
from keras.models import Model
N = 10
input = Input((N,))
output = Dense(N)(input)
model = Model(input, output)
model.summary()
As you can see, this model has 110 parameters, because it is fully connected:
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_2 (InputLayer) (None, 10) 0
_________________________________________________________________
dense_2 (Dense) (None, 10) 110
=================================================================
Total params: 110
Trainable params: 110
Non-trainable params: 0
_________________________________________________________________
I want to replace output = Dense(N)(input) with something like output = SinglyConnected()(input), such that the model now has 20 parameters: 10 weights and 10 Biases.
Create a custom layer:
from keras.layers import Layer

class SingleConnected(Layer):
    # creator
    def __init__(self, **kwargs):
        super(SingleConnected, self).__init__(**kwargs)

    # creates weights
    def build(self, input_shape):
        weight_shape = (1,) * (len(input_shape) - 1)
        weight_shape = weight_shape + (input_shape[-1],)  # (....., input)
        self.kernel = self.add_weight(name='kernel',
                                      shape=weight_shape,
                                      initializer='uniform',
                                      trainable=True)
        self.bias = self.add_weight(name='bias',
                                    shape=weight_shape,
                                    initializer='zeros',
                                    trainable=True)
        self.built = True

    # operation:
    def call(self, inputs):
        return (inputs * self.kernel) + self.bias

    # output shape
    def compute_output_shape(self, input_shape):
        return input_shape

    # for saving the model - only necessary if you have parameters in __init__
    def get_config(self):
        config = super(SingleConnected, self).get_config()
        return config
Use the layer:
model.add(SingleConnected())
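Or, with the functional API used in the question (a quick check of my own that the parameter count comes out to 20):
from keras.layers import Input
from keras.models import Model

N = 10
input = Input((N,))
output = SingleConnected()(input)
model = Model(input, output)
model.summary()  # should report 20 trainable parameters: 10 weights + 10 biases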
