I'm trying to run a script using my data, but I get this error:
Input 0 of layer "dense_14" is incompatible with the layer: expected axis -1 of input shape to have value 148, but received input with shape (32, 21).
I understand that my data arrays, whose shapes are (11598, 1), aren't compatible with the shapes the model expects. Can anyone help? (A minimal diagnostic check is sketched after the traceback below.)
This is the code:
from __future__ import print_function, division
import tensorflow as tf
from keras.layers import Input, Dense, Activation
from keras.layers import Maximum, Concatenate
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, BatchNormalization
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
class MalGAN():
    def __init__(self):
        self.apifeature_dims = 128
        self.z_dims = 20
        self.hide_layers = 256
        self.generator_layers = [self.apifeature_dims + self.z_dims, self.hide_layers, self.apifeature_dims]
        self.substitute_detector_layers = [self.apifeature_dims, self.hide_layers, 1]
        self.blackbox = 'MLP'
        optimizer = Adam(lr=0.001)
        # Build and train blackbox_detector
        self.blackbox_detector = self.build_blackbox_detector()
        # Build and compile the substitute_detector
        self.substitute_detector = self.build_substitute_detector()
        self.substitute_detector.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        # Build the generator
        self.generator = self.build_generator()
        # The generator takes malware and noise as input and generates adversarial malware examples
        #example = Input(shape=(None,1),name='exampe',dtype='float32')
        #noise = Input(shape=(None,1),name='noise',dtype='float32')
        example = Input(shape=(self.apifeature_dims,))
        noise = Input(shape=(self.z_dims,))
        input = [example, noise]
        malware_examples = self.generator(input)
        # For the combined model we will only train the generator
        self.substitute_detector.trainable = True
        # The discriminator takes generated images as input and determines validity
        validity = self.substitute_detector(malware_examples)
        # The combined model (stacked generator and substitute_detector)
        # trains the generator to fool the discriminator
        self.combined = Model(input, validity)
        self.combined.compile(loss='binary_crossentropy', optimizer=optimizer)

    def build_blackbox_detector(self):
        if self.blackbox == 'MLP':
            blackbox_detector = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
                                              solver='sgd', verbose=0, tol=1e-4, random_state=1,
                                              learning_rate_init=.1)
        return blackbox_detector

    def build_generator(self):
        #example = Input(shape=(None,1))
        #noise = Input(shape=(None,1))
        example = Input(shape=(self.apifeature_dims,))
        noise = Input(shape=(self.z_dims,))
        #example = Input(shape=(256),name='exampe',dtype='float32')
        #noise = Input(shape=(256),name='noise',dtype='float32')
        x = Concatenate(axis=1)([example, noise])
        for dim in self.generator_layers[1:]:
            x = Dense(dim)(x)
            x = Activation(activation='sigmoid')(x)
        x = Maximum()([example, x])
        generator = Model([example, noise], x, name='generator')
        generator.summary()
        return generator

    def build_substitute_detector(self):
        input = Input(shape=(self.substitute_detector_layers[0],))
        #input = Input(shape=(5,256),dtype='float32')
        x = input
        for dim in self.substitute_detector_layers[1:]:
            x = Dense(dim)(x)
            x = Activation(activation='sigmoid')(x)
        substitute_detector = Model(input, x, name='substitute_detector')
        substitute_detector.summary()
        return substitute_detector

    def load_data(self, filename):
        # data = pd.read_csv("feature_vectors_system calls.csv")
        data = np.load(filename)
        xmal, ymal, xben, yben = data['arr_0'].reshape(-1, 1), data['arr_1'].reshape(-1, 1), data['arr_2'].reshape(-1, 1), data['arr_3'].reshape(-1, 1)
        return (xmal, ymal), (xben, yben)

    def train(self, epochs, batch_size=128):
        # Load the dataset
        (xmal, ymal), (xben, yben) = self.load_data('my_data.npz')
        xtrain_mal, xtest_mal, ytrain_mal, ytest_mal = train_test_split(xmal, ymal, test_size=0.20)
        xtrain_ben, xtest_ben, ytrain_ben, ytest_ben = train_test_split(xben, yben, test_size=0.20)
        print("xmal is : ", xmal.shape)
        print("ymal is : ", ymal.shape)
        print("xben is : ", xben.shape)
        print("yben is ", yben.shape)
        print("here 1")
        # Train blackbox_detector
        self.blackbox_detector.fit(np.concatenate([xmal, xben]),
                                   np.concatenate([ymal, yben]))
        ytrain_ben_blackbox = self.blackbox_detector.predict(xtrain_ben)
        Original_Train_TRR = self.blackbox_detector.score(xtrain_mal, ytrain_mal)
        Original_Test_TRR = self.blackbox_detector.score(xtest_mal, ytest_mal)
        print("here 2")
        Train_TRR, Test_TRR = [], []
        for epoch in range(epochs):
            for step in range(1):  # range(xtrain_mal.shape[0] // batch_size):
                # ---------------------
                #  Train substitute_detector
                # ---------------------
                # Select a random batch of malware examples
                idx = np.random.randint(0, xtrain_mal.shape[0], batch_size)
                xmal_batch = xtrain_mal[idx]
                noise = np.random.uniform(0, 1, (batch_size, self.z_dims))
                idx = np.random.randint(0, xmal_batch.shape[0], batch_size)
                xben_batch = xtrain_ben[idx]
                yben_batch = ytrain_ben_blackbox[idx]
                # Generate a batch of new malware examples
                # noise.shape = (320,2)
                #xmal_batch = np.asarray(xmal_batch).astype(np.float32)
                #noise = np.asarray(noise).astype(np.float32)
                # xmal_batch.shape = (32,1)
                # noise.shape = (32,20)
                print("xmal is: ", xmal_batch.shape)
                print("noise is:", noise.shape)
                #xmal_batch = tf.convert_to_tensor(xmal_batch)
                #noise = tf.convert_to_tensor(noise)
                xmal_batch.shape = (128, 1)
                noise.shape = (128, 20)
                #print("xmal is:", xmal)
                #print("noise is:", noise)
                # xmal_batch = np.asarray(xmal_batch).astype(np.float32)
                noise = np.asarray(noise).astype(np.float32)
                gen_examples = self.generator.predict([xmal_batch, noise])
                # gen_examples = np.vectorize(gen_examples)
                # gen_examples = np.array(gen_examples)
                # gen_examples = gen_examples.reshape(gen_examples.shape[0],-1)
                # print(gen_examples.shape)
                gen_examples = gen_examples.reshape(-1, 1)
                ymal_batch = self.blackbox_detector.predict(np.ones(gen_examples.shape) * (gen_examples > 0.5))
                print("ymal shape is:", ymal_batch.shape)
                print("gen_examples is:", gen_examples.shape)
                # gen_examples.shape = (2688,128)
                # Train the substitute_detector
                ymal_batch = np.array([0 for _ in range(len(gen_examples) // 2)] + [1 for _ in range(len(gen_examples) // 2)])
                d_loss_real = self.substitute_detector.train_on_batch(gen_examples, ymal_batch)
                print("yben shape is:", yben_batch.shape)
                print("xben shape is:", xben_batch.shape)
                # xben_batch.shape = (1,128)
                # yben_batch.shape = (128,1)
                d_loss_fake = self.substitute_detector.train_on_batch(xben_batch, yben_batch)
                d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

                # ---------------------
                #  Train Generator
                # ---------------------
                idx = np.random.randint(0, xtrain_mal.shape[0], batch_size)
                xmal_batch = xtrain_mal[idx]
                noise = np.random.uniform(0, 1, (batch_size, self.z_dims))
                # Train the generator
                g_loss = self.combined.train_on_batch([xmal_batch, noise], np.zeros((batch_size, 1)))

            # Compute Train TRR
            noise = np.random.uniform(0, 1, (xtrain_mal.shape[0], self.z_dims))
            gen_examples = self.generator.predict([xtrain_mal, noise])
            TRR = self.blackbox_detector.score(np.ones(gen_examples.shape) * (gen_examples > 0.5), ytrain_mal)
            Train_TRR.append(TRR)
            # Compute Test TRR
            noise = np.random.uniform(0, 1, (xtest_mal.shape[0], self.z_dims))
            gen_examples = self.generator.predict([xtest_mal, noise])
            TRR = self.blackbox_detector.score(np.ones(gen_examples.shape) * (gen_examples > 0.5), ytest_mal)
            Test_TRR.append(TRR)
            # Plot the progress
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss))

        print('Original_Train_TRR: {0}, Adver_Train_TRR: {1}'.format(Original_Train_TRR, Train_TRR[-1]))
        print('Original_Test_TRR: {0}, Adver_Test_TRR: {1}'.format(Original_Test_TRR, Test_TRR[-1]))
        # Plot TRR
        plt.figure()
        plt.plot(range(epochs), Train_TRR, c='r', label='Training Set', linewidth=2)
        plt.plot(range(epochs), Test_TRR, c='g', linestyle='--', label='Validation Set', linewidth=2)
        plt.xlabel("Epoch")
        plt.ylabel("TRR")
        plt.legend()
        plt.show()

if __name__ == '__main__':
    malgan = MalGAN()
    malgan.train(epochs=1000, batch_size=128)
The output is:
Model: "substitute_detector"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_16 (InputLayer) [(None, 128)] 0
dense_12 (Dense) (None, 256) 33024
activation_12 (Activation) (None, 256) 0
dense_13 (Dense) (None, 1) 257
activation_13 (Activation) (None, 1) 0
=================================================================
Total params: 33,281
Trainable params: 33,281
Non-trainable params: 0
_________________________________________________________________
Model: "generator"
__________________________________________________________________________________________________
/usr/local/lib/python3.7/dist-packages/keras/optimizer_v2/adam.py:105: UserWarning: The `lr` argument is deprecated, use `learning_rate` instead.
super(Adam, self).__init__(name, **kwargs)
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_17 (InputLayer) [(None, 128)] 0 []
input_18 (InputLayer) [(None, 20)] 0 []
concatenate_3 (Concatenate) (None, 148) 0 ['input_17[0][0]',
'input_18[0][0]']
dense_14 (Dense) (None, 256) 38144 ['concatenate_3[0][0]']
activation_14 (Activation) (None, 256) 0 ['dense_14[0][0]']
dense_15 (Dense) (None, 128) 32896 ['activation_14[0][0]']
activation_15 (Activation) (None, 128) 0 ['dense_15[0][0]']
maximum_3 (Maximum) (None, 128) 0 ['input_17[0][0]',
'activation_15[0][0]']
==================================================================================================
Total params: 71,040
Trainable params: 71,040
Non-trainable params: 0
__________________________________________________________________________________________________
xmal is : (11598, 1)
ymal is : (11598, 1)
xben is : (11598, 1)
yben is (11598, 1)
here 1
after which I get:
/usr/local/lib/python3.7/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:1109: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
/usr/local/lib/python3.7/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:696: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (10) reached and the optimization hasn't converged yet.
ConvergenceWarning,
here 2
xmal is: (128, 1)
noise is: (128, 20)
WARNING:tensorflow:Model was constructed with shape (None, 128) for input KerasTensor(type_spec=TensorSpec(shape=(None, 128), dtype=tf.float32, name='input_17'), name='input_17', description="created by layer 'input_17'"), but it was called on an input with incompatible shape (32, 1).
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-4-937d6995f5c4> in <module>()
240 if __name__ == '__main__':
241 malgan = MalGAN()
--> 242 malgan.train(epochs=1000, batch_size=128)
2 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/func_graph.py in autograph_handler(*args, **kwargs)
1145 except Exception as e: # pylint:disable=broad-except
1146 if hasattr(e, "ag_error_metadata"):
-> 1147 raise e.ag_error_metadata.to_exception(e)
1148 else:
1149 raise
ValueError: in user code:
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1801, in predict_function *
return step_function(self, iterator)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1790, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1783, in run_step **
outputs = model.predict_step(data)
File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1751, in predict_step
return self(x, training=False)
File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/usr/local/lib/python3.7/dist-packages/keras/engine/input_spec.py", line 249, in assert_input_compatibility
f'Input {input_index} of layer "{layer_name}" is '
ValueError: Exception encountered when calling layer "generator" (type Functional).
Input 0 of layer "dense_14" is incompatible with the layer: expected axis -1 of input shape to have value 148, but received input with shape (32, 21)
Call arguments received:
• inputs=('tf.Tensor(shape=(32, 1), dtype=int64)', 'tf.Tensor(shape=(32, 20), dtype=float32)')
• training=False
• mask=None
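For reference, a minimal check that reproduces the mismatch (a sketch; it assumes my_data.npz and the arr_0 layout used in load_data above):

import numpy as np

# The generator's `example` input was built with apifeature_dims = 128, so after
# concatenating 20-dim noise the first Dense layer expects axis -1 to be 148.
data = np.load('my_data.npz')
xmal = data['arr_0'].reshape(-1, 1)
print(xmal.shape)           # (11598, 1): one feature per sample
print(xmal.shape[1] + 20)   # 21, which is exactly what dense_14 receives
# Either re-extract the data so each sample is a 128-dim API-feature vector,
# or set self.apifeature_dims = 1 (to match the data) when building the model.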
A hierarchical attention mechanism for document classification was presented by Yang et al.:
https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf
Its implementation is available at https://github.com/ShawnyXiao/TextClassification-Keras
An implementation of document classification with a Transformer is also available at https://keras.io/examples/nlp/text_classification_with_transformer
But it's not hierarchical.
I have googled a lot but couldn't find any implementation of a hierarchical Transformer. Does anyone know how to implement a hierarchical Transformer for document classification in Keras?
My implementation is as follows. Note that it extends Nandan's implementation for document classification: https://keras.io/examples/nlp/text_classification_with_transformer.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.np_utils import to_categorical
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # inputs.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)      # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(query, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(key, batch_size)      # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(value, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(concat_attention)  # (batch_size, seq_len, embed_dim)
        return output

    def compute_output_shape(self, input_shape):
        # it does not change the shape of its input
        return input_shape

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate, name=None):
        super(TransformerBlock, self).__init__(name=name)
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def compute_output_shape(self, input_shape):
        # it does not change the shape of its input
        return input_shape

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim, name=None):
        super(TokenAndPositionEmbedding, self).__init__(name=name)
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def compute_output_shape(self, input_shape):
        # it changes the shape from (batch_size, maxlen) to (batch_size, maxlen, embed_dim)
        return input_shape + (self.pos_emb.output_dim,)

# Lower level (produce a representation of each sentence):
embed_dim = 100        # Embedding size for each token
num_heads = 2          # Number of attention heads
ff_dim = 64            # Hidden layer size in feed forward network inside transformer
L1_dense_units = 100   # Size of the sentence-level representations output by the word-level model
dropout_rate = 0.1
vocab_size = 1000
class_number = 5
max_docs = 10000
max_sentences = 15
max_words = 60

word_input = layers.Input(shape=(max_words,), name='word_input')
word_embedding = TokenAndPositionEmbedding(maxlen=max_words, vocab_size=vocab_size,
                                           embed_dim=embed_dim, name='word_embedding')(word_input)
word_transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim,
                                    dropout_rate=dropout_rate, name='word_transformer')(word_embedding)
word_pool = layers.GlobalAveragePooling1D(name='word_pooling')(word_transformer)
word_drop = layers.Dropout(dropout_rate, name='word_drop')(word_pool)
word_dense = layers.Dense(L1_dense_units, activation="relu", name='word_dense')(word_drop)
word_encoder = keras.Model(word_input, word_dense)
word_encoder.summary()

# =========================================================================
# Upper level (produce a representation of each document):
L2_dense_units = 100

sentence_input = layers.Input(shape=(max_sentences, max_words), name='sentence_input')
sentence_encoder = tf.keras.layers.TimeDistributed(word_encoder, name='sentence_encoder')(sentence_input)
sentence_transformer = TransformerBlock(embed_dim=L1_dense_units, num_heads=num_heads, ff_dim=ff_dim,
                                        dropout_rate=dropout_rate, name='sentence_transformer')(sentence_encoder)
sentence_pool = layers.GlobalAveragePooling1D(name='sentence_pooling')(sentence_transformer)
sentence_out = layers.Dropout(dropout_rate)(sentence_pool)
preds = layers.Dense(class_number, activation='softmax', name='sentence_output')(sentence_out)
model = keras.Model(sentence_input, preds)
model.summary()
The summary of the model is as follows:
Model: "model_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
word_input (InputLayer) [(None, 60)] 0
word_embedding (TokenAndPos (None, 60, 100) 106000
itionEmbedding)
word_transformer (Transform (None, 60, 100) 53764
erBlock)
word_pooling (GlobalAverage (None, 100) 0
Pooling1D)
word_drop (Dropout) (None, 100) 0
word_dense (Dense) (None, 100) 10100
=================================================================
Total params: 169,864
Trainable params: 169,864
Non-trainable params: 0
_________________________________________________________________
Model: "model_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
sentence_input (InputLayer) [(None, 15, 60)] 0
sentence_encoder (TimeDistr (None, 15, 100) 169864
ibuted)
sentence_transformer (Trans (None, 15, 100) 53764
formerBlock)
sentence_pooling (GlobalAve (None, 100) 0
ragePooling1D)
dropout_9 (Dropout) (None, 100) 0
sentence_output (Dense) (None, 5) 505
=================================================================
Total params: 224,133
Trainable params: 224,133
Non-trainable params: 0
Everything works, and you can copy and paste this code into Colab to see the model summaries.
But my problem is positional encoding at the sentence level.
How do I apply positional encoding at the sentence level?
The implementation is recursive in the sense that you treat the average of the outputs of transformer x as the input to transformer x+1.
So let's say your data is structured as (batch, chapter, paragraph, sentence, token).
After the first transformation you end up with (batch, chapter, paragraph, sentence, embedding), so you average and get (batch, chapter, paragraph, sentence_embedding_in).
Apply another transformation and get (batch, chapter, paragraph, sentence_embedding_out).
Average again and get (batch, chapter, paragraph_embedding). Rinse and repeat.
The implementation of the paper is actually in a different repository:
https://github.com/ematvey/hierarchical-attention-networks
They actually do something different from what I've described, applying transformers at the bottom and an RNN at the top. In theory you could do the opposite, or apply an RNN at each level (though that would be really slow). As far as the implementation is concerned, you can abstract from that; the principle remains the same: you apply a transformation, average the outputs, and feed them into the next higher-level "layer" (or "module", in Torch lingo). A sketch of sentence-level positional encoding under this scheme follows.
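One way to realize positional encoding at the sentence level (a sketch; it reuses max_sentences, L1_dense_units, sentence_encoder, and TransformerBlock from the question, and mirrors the learned-embedding style of TokenAndPositionEmbedding):

import tensorflow as tf
from tensorflow.keras import layers

class SentencePositionEmbedding(layers.Layer):
    # Adds a learned position embedding over the sentence axis, so the
    # sentence-level transformer can see sentence order within a document.
    def __init__(self, max_sentences, embed_dim, name=None):
        super(SentencePositionEmbedding, self).__init__(name=name)
        self.pos_emb = layers.Embedding(input_dim=max_sentences, output_dim=embed_dim)

    def call(self, x):
        # x: (batch, sentences, embed_dim) from the TimeDistributed word encoder
        positions = tf.range(start=0, limit=tf.shape(x)[1], delta=1)
        return x + self.pos_emb(positions)

# Slot it in between the word encoder and the sentence transformer:
sentence_encoded = SentencePositionEmbedding(max_sentences, L1_dense_units,
                                             name='sentence_pos_embedding')(sentence_encoder)
sentence_transformer = TransformerBlock(embed_dim=L1_dense_units, num_heads=num_heads, ff_dim=ff_dim,
                                        dropout_rate=dropout_rate,
                                        name='sentence_transformer')(sentence_encoded)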
I trained a binary classification model on about 8000 images, about 4000 per class, doing transfer learning on ResNet50 with all layers frozen, and I got these results: val_loss: 0.0340 - val_acc: 0.9890.
But when I test the model I get almost random results, with very high probability...
I don't understand how that makes sense.
After all, the model never saw the validation pictures, so how can there be such a big gap between the results in the graph and the results in production?
model.summary():
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
resnet50 (Model) (None, 2048) 23587712
_________________________________________________________________
dense (Dense) (None, 2) 4098
=================================================================
Total params: 23,591,810
Trainable params: 4,098
Non-trainable params: 23,587,712
_________________________________________________________________
(Dense - softmax activation)
Here is the full code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import cv2
import os
from tensorflow.python.keras.applications import ResNet50
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras import optimizers
from keras.applications.resnet50 import preprocess_input
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint
NUM_CLASSES = 2
CHANNELS = 3
IMAGE_RESIZE = 224
RESNET50_POOLING_AVERAGE = 'avg'
DENSE_LAYER_ACTIVATION = 'softmax'
OBJECTIVE_FUNCTION = 'categorical_crossentropy'
LOSS_METRICS = ['accuracy']
NUM_EPOCHS = 10
EARLY_STOP_PATIENCE = 3
STEPS_PER_EPOCH_TRAINING = 10
STEPS_PER_EPOCH_VALIDATION = 10
BATCH_SIZE_TRAINING = 100
BATCH_SIZE_VALIDATION = 100
# Using 1 to easily manage mapping between test_generator & prediction for submission preparation
BATCH_SIZE_TESTING = 1
model = Sequential()
model.add(ResNet50(include_top = False, pooling = RESNET50_POOLING_AVERAGE, weights = 'imagenet'))
model.add(Dense(NUM_CLASSES, activation = DENSE_LAYER_ACTIVATION))
model.layers[0].trainable = False
sgd = optimizers.SGD(lr = 0.01, decay = 1e-6, momentum = 0.9, nesterov = True)
model.compile(optimizer = sgd, loss = OBJECTIVE_FUNCTION, metrics = LOSS_METRICS)
image_size = IMAGE_RESIZE
data_generator = ImageDataGenerator(preprocessing_function=preprocess_input)
train_generator = data_generator.flow_from_directory(
    './datasets/Helmet_verification_v2/train',
    target_size=(image_size, image_size),
    batch_size=BATCH_SIZE_TRAINING,
    class_mode='categorical')

validation_generator = data_generator.flow_from_directory(
    './datasets/Helmet_verification_v2/validation',
    target_size=(image_size, image_size),
    batch_size=BATCH_SIZE_VALIDATION,
    class_mode='categorical')

output_dir = "./models/working"
output_file = output_dir + "/best.hdf5"
if not os.path.exists(output_dir):
    print("create folder: {}".format(output_dir))
    os.makedirs(output_dir)

cb_early_stopper = EarlyStopping(monitor='val_loss', patience=EARLY_STOP_PATIENCE)
cb_checkpointer = ModelCheckpoint(filepath=output_file, monitor='val_loss', save_best_only=True, mode='auto')

fit_history = model.fit_generator(
    train_generator,
    steps_per_epoch=STEPS_PER_EPOCH_TRAINING,
    epochs=NUM_EPOCHS,
    validation_data=validation_generator,
    validation_steps=STEPS_PER_EPOCH_VALIDATION,
    callbacks=[cb_checkpointer, cb_early_stopper]
)
model.load_weights(output_file)
Test:
test_generator = data_generator.flow_from_directory(
    directory='./datasets/Helmet_verification_v2/test',
    target_size=(image_size, image_size),
    batch_size=BATCH_SIZE_TESTING,
    class_mode=None,
    shuffle=False,
    seed=123
)
test_generator.reset()
pred = model.predict_generator(test_generator, steps = len(test_generator), verbose = 1)
predicted_class_indices = np.argmax(pred, axis = 1)
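To map those indices back to class names, the training generator's class mapping can be inverted (a sketch; class_indices is populated by flow_from_directory):

# Invert the {class_name: index} mapping built from the train subfolders.
labels = {v: k for k, v in train_generator.class_indices.items()}
predicted_labels = [labels[i] for i in predicted_class_indices]
print(predicted_labels[:10])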
Some data examples (images were attached here).
Okay, I found the problem: I used the wrong optimizer. I replaced SGD with Adam and that fixed it; the results are amazing (only the predicted probabilities remain too high).
model.compile(optimizer = optimizers.Adam(1e-3), loss = OBJECTIVE_FUNCTION, metrics = LOSS_METRICS)
I am trying to replicate the code from here and apply the BERT model to another dataset, but after I create my own test and train sets I stumble upon this problem.
Here's my full file:
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import json
import re
import numpy as np
from bert.tokenization import FullTokenizer
from tqdm import tqdm
#from keras.backend.tensorflow_backend import set_session
import keras.backend as K
#To make tf 2.0 compatible with tf1.0 code, we disable the tf2.0 functionalities
tf.compat.v1.disable_eager_execution()
# Initialize session
sess = tf.compat.v1.Session()
# Params for bert model and tokenization
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
max_seq_length = 1024
#Load all files from a directory in a DataFrame.
def load_dataset(directory):
    data = {}
    data["text"] = []
    data["label"] = []
    with open(directory) as json_file:
        temp = json.load(json_file)
    for p in temp['Outputs']:
        data["text"].append(p["text"])
        data["label"].append(p["class"])
    return pd.DataFrame.from_dict(data)

class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it means
    the entire output data won't be generated.
    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """

class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs an InputExample.

        Args:
          guid: Unique id for the example.
          text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
          text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
          label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

def create_tokenizer_from_hub_module(bert_path):
    """Get the vocab file and casing info from the Hub module."""
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run(
        [
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ]
    )
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

def convert_single_example(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""
    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label = 0
        return input_ids, input_mask, segment_ids, label

    tokens_a = tokenizer.tokenize(example.text_a)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0 : (max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label

def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""
    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in tqdm(examples, desc="Converting examples to features"):
        input_id, input_mask, segment_id, label = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels.append(label)
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        np.array(labels).reshape(-1, 1),
    )

def convert_text_to_examples(texts, labels):
    """Create InputExamples."""
    InputExamples = []
    for text, label in zip(texts, labels):
        InputExamples.append(
            InputExample(guid=None, text_a=" ".join(text), text_b=None, label=label)
        )
    return InputExamples

class BertLayer(tf.keras.layers.Layer):
    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="mean",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path
        if self.pooling not in ["first", "mean"]:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )
        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name=f"{self.name}_module"
        )
        # Remove unused layers
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
            trainable_layers = ["pooler/dense"]
        elif self.pooling == "mean":
            trainable_vars = [
                var
                for var in trainable_vars
                if not "/cls/" in var.name and not "/pooler/" in var.name
            ]
            trainable_layers = []
        else:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )
        # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")
        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]
        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)
        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "pooled_output"
            ]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]
            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
            input_mask = tf.cast(input_mask, tf.float32)
            pooled = masked_reduce_mean(result, input_mask)
        else:
            raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling}")
        return pooled

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

# Build model
def build_model(max_seq_length):
    in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
    in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
    in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]

    bert_output = BertLayer(n_fine_tune_layers=3)(bert_inputs)
    dense = tf.keras.layers.Dense(256, activation="relu")(bert_output)
    pred = tf.keras.layers.Dense(1, activation="sigmoid")(dense)

    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.summary()
    return model

def initialize_vars(sess):
    sess.run(tf.compat.v1.local_variables_initializer())
    sess.run(tf.compat.v1.global_variables_initializer())
    sess.run(tf.compat.v1.tables_initializer())
    K.tensorflow_backend.set_session(sess)

def main():
    # Params for bert model and tokenization
    bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
    max_seq_length = 1024

    train_df = load_dataset('ShuffledDatasetTrain.jsonl')
    test_df = load_dataset('ShuffledDatasetTest.jsonl')

    # Create datasets (Only take up to max_seq_length words for memory)
    train_text = train_df["text"].tolist()
    train_text = [" ".join(t.split()[0:max_seq_length]) for t in train_text]
    train_text = np.array(train_text, dtype=object)[:, np.newaxis]
    train_label = train_df["label"].tolist()
    test_text = test_df["text"].tolist()
    test_text = [" ".join(t.split()[0:max_seq_length]) for t in test_text]
    test_text = np.array(test_text, dtype=object)[:, np.newaxis]
    test_label = test_df["label"].tolist()

    # Instantiate tokenizer
    tokenizer = create_tokenizer_from_hub_module(bert_path)

    # Convert data to InputExample format
    train_examples = convert_text_to_examples(train_text, train_label)
    test_examples = convert_text_to_examples(test_text, test_label)

    # Convert to features
    (
        train_input_ids,
        train_input_masks,
        train_segment_ids,
        train_labels,
    ) = convert_examples_to_features(
        tokenizer, train_examples, max_seq_length=max_seq_length
    )
    (
        test_input_ids,
        test_input_masks,
        test_segment_ids,
        test_labels,
    ) = convert_examples_to_features(
        tokenizer, test_examples, max_seq_length=max_seq_length
    )

    model = build_model(max_seq_length)

    # Instantiate variables
    initialize_vars(sess)

    model.fit(
        [train_input_ids, train_input_masks, train_segment_ids],
        train_labels,
        validation_data=(
            [test_input_ids, test_input_masks, test_segment_ids],
            test_labels,
        ),
        epochs=1,
        batch_size=8,
    )

if __name__ == "__main__":
    main()
And here's the error:
Using TensorFlow backend.
Converting examples to features: 100%|██████████| 13000/13000 [03:32<00:00, 61.29it/s]
Converting examples to features: 100%|██████████| 2000/2000 [00:32<00:00, 61.83it/s]
WARNING:tensorflow:From C:\Users\Nitish_2\Miniconda3\lib\site-packages\tensorflow_core\python\ops\resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_ids (InputLayer) [(None, 1024)] 0
__________________________________________________________________________________________________
input_masks (InputLayer) [(None, 1024)] 0
__________________________________________________________________________________________________
segment_ids (InputLayer) [(None, 1024)] 0
__________________________________________________________________________________________________
bert_layer (BertLayer) (None, 768) 110104890 input_ids[0][0]
input_masks[0][0]
segment_ids[0][0]
__________________________________________________________________________________________________
dense (Dense) (None, 256) 196864 bert_layer[0][0]
__________________________________________________________________________________________________
dense_1 (Dense) (None, 1) 257 dense[0][0]
==================================================================================================
Total params: 110,302,011
Trainable params: 21,460,737
Non-trainable params: 88,841,274
__________________________________________________________________________________________________
Train on 13000 samples, validate on 2000 samples
2019-12-30 00:45:54.780164: W tensorflow/core/framework/op_kernel.cc:1622] OP_REQUIRES failed at resource_variable_ops.cc:660 : Not found: Resource localhost/bert_layer_module/bert/embeddings/word_embeddings/class tensorflow::Var does not exist.
Traceback (most recent call last):
File "C:/Users/Nitish_2/PycharmProjects/GPT-detection/Model.py", line 323, in <module>
main()
File "C:/Users/Nitish_2/PycharmProjects/GPT-detection/Model.py", line 319, in main
batch_size=8,
File "C:\Users\Nitish_2\Miniconda3\lib\site-packages\tensorflow_core\python\keras\engine\training.py", line 728, in fit
use_multiprocessing=use_multiprocessing)
File "C:\Users\Nitish_2\Miniconda3\lib\site-packages\tensorflow_core\python\keras\engine\training_arrays.py", line 674, in fit
steps_name='steps_per_epoch')
File "C:\Users\Nitish_2\Miniconda3\lib\site-packages\tensorflow_core\python\keras\engine\training_arrays.py", line 393, in model_iteration
batch_outs = f(ins_batch)
File "C:\Users\Nitish_2\Miniconda3\lib\site-packages\tensorflow_core\python\keras\backend.py", line 3580, in __call__
run_metadata=self.run_metadata)
File "C:\Users\Nitish_2\Miniconda3\lib\site-packages\tensorflow_core\python\client\session.py", line 1472, in __call__
run_metadata_ptr)
tensorflow.python.framework.errors_impl.FailedPreconditionError: Error while reading resource variable bert_layer_module/bert/encoder/layer_10/attention/self/query/kernel from Container: localhost. This could mean that the variable was uninitialized. Not found: Resource localhost/bert_layer_module/bert/encoder/layer_10/attention/self/query/kernel/class tensorflow::Var does not exist.
[[{{node bert_layer/bert_layer_module_apply_tokens/bert/encoder/layer_10/attention/self/query/MatMul/ReadVariableOp}}]]
Any insight as to why this is happening would be appreciated. Please tell me if I didn't provide the appropriate info; this is my first time asking a question here.
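For what it's worth, this FailedPreconditionError usually means the hub.Module variables were initialized in one session while Keras is running another. Since the model here is built with tf.keras but initialize_vars() registers the session through the standalone Keras backend (K), a hedged guess at a fix is to register it with the tf.compat.v1 Keras backend instead:

import tensorflow as tf

# Make tf.keras reuse the session in which initialize_vars() ran, so fit()
# sees the already-initialized hub.Module variables.
tf.compat.v1.keras.backend.set_session(sess)

Separately, note that this BERT module only has 512 learned positions, so max_seq_length = 1024 is likely to cause a further failure even once the session issue is fixed.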
I was using TensorFlow + Keras while trying to implement a text classification model to classify different types of movie reviews. I am running into an error which says the shapes aren't equal.
Because I am not sure where the error is hidden, I can't produce a minimal reproducible example, as I'm not certain how to isolate the problem. It may be worth looking at the lines that partition x_val, as there may be a problem with the partitioning.
Note this is not the final code; I stopped writing as soon as I hit this error.
from __future__ import absolute_import, division, print_function
import numpy as np
import tensorflow as tf
from tensorflow import keras
imdb = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
#print ("The length if training data: "len(train_data[0]), "And labels: "len(test_data[0]))
word_index = imdb.get_word_index()
word_index = {k: (v+3) for k,v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNKNOWN>"] = 2
word_index["<END>"] = 3
reverse_word_index = dict([(value, key) for (key, value) in word_index.items() ])
def decode_review(text):
    return ' '.join([reverse_word_index.get(i, "?") for i in text])

print(decode_review(train_data[0]))

train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                        value=word_index["<PAD>"],
                                                        padding="post",
                                                        maxlen=256)
test_data = keras.preprocessing.sequence.pad_sequences(test_data,
                                                       value=word_index["<PAD>"],
                                                       padding="post",
                                                       maxlen=256)
#print ('train length :' ,len(train_data[0]), 'test length: ', len(train_data[1]))
vocab_size = 10000
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size , 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(16, activation=tf.nn.sigmoid))
print ("the model summary is :======>>" , model.summary())
model.compile(optimizer="adam" , loss="binary_crossentropy", metrics=["acc"])
x_val = train_data[:10000]
partial_x_train = train_data[10000:]
y_val = train_labels[:10000]
partial_y_train = train_labels[10000:]
history = model.fit(partial_x_train, partial_y_train, epochs=40, batch_size=512,
                    validation_data=(x_val, y_val), verbose=1)
This is the error message I was getting:
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, None, 16) 160000
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16) 0
_________________________________________________________________
dense_2 (Dense) (None, 16) 272
_________________________________________________________________
dense_3 (Dense) (None, 16) 272
=================================================================
Total params: 160,544
Trainable params: 160,544
Non-trainable params: 0
_________________________________________________________________
the model summary is :======>> None
WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-18-02082e1f39d4> in <module>()
57
58 history = model.fit(partial_x_train , partial_y_train , epochs=40 , batch_size=512,
---> 59 validation_data=(x_val, y_val), verbose=1)
ValueError: A target array with shape (15000, 1) was passed for an output of shape (None, 16) while using as loss `binary_crossentropy`. This loss expects targets to have the same shape as the output.
You need to update the final/output layer of your model. Since it's a binary classification problem, the output Dense layer should have one node, like this:
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
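For context, the corrected model as a whole would look like this (a sketch of the same architecture with the one-unit output):

model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))  # one unit for binary labels
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])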
You may want to check out this tutorial on text classification using IMDB dataset.