I want to add three external trainable parameters to the VAE network using the following function:
def gmmpara_init():
theta_init = tf.Variable(K.ones((n_centroid,1))/n_centroid,trainable=True)
u_init=tf.Variable(K.zeros((n_centroid,latent_dim)),trainable=True)
lambda_init=tf.Variable(K.ones((n_centroid,latent_dim)),trainable=True)
return theta_init,u_init,lambda_init
Ideally, the three parameters would then be trained together with the network weights, but the full code always fails with the following error:
Traceback (most recent call last):
File "vade_modified.py", line 214, in <module>
vade.fit(X, X,shuffle=True,epochs=epoch,batch_size=batch_size,callbacks=[epoch_begin])
File "/home/shuiqiao/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 66, in _method_wrapper
return method(self, *args, **kwargs)
File "/home/shuiqiao/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 848, in fit
tmp_logs = train_function(iterator)
File "/home/shuiqiao/anaconda3/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py", line 580, in __call__
result = self._call(*args, **kwds)
File "/home/shuiqiao/anaconda3/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py", line 644, in _call
return self._stateless_fn(*args, **kwds)
File "/home/shuiqiao/anaconda3/lib/python3.7/site-packages/tensorflow/python/eager/function.py", line 2420, in __call__
return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
File "/home/shuiqiao/anaconda3/lib/python3.7/site-packages/tensorflow/python/eager/function.py", line 1665, in _filtered_call
self.captured_inputs)
File "/home/shuiqiao/anaconda3/lib/python3.7/site-packages/tensorflow/python/eager/function.py", line 1746, in _call_flat
ctx, args, cancellation_manager=cancellation_manager))
File "/home/shuiqiao/anaconda3/lib/python3.7/site-packages/tensorflow/python/eager/function.py", line 598, in call
ctx=ctx)
File "/home/shuiqiao/anaconda3/lib/python3.7/site-packages/tensorflow/python/eager/execute.py", line 74, in quick_execute
"tensors, but found {}".format(keras_symbolic_tensors))
tensorflow.python.eager.core._SymbolicException: Inputs to eager execution function cannot be Keras symbolic tensors, but found [<tf.Tensor 'lambda/Identity:0' shape=(100, 20) dtype=float32>, <tf.Tensor 'dense_3/Identity:0' shape=(100, 20) dtype=float32>, <tf.Tensor 'dense_4/Identity:0' shape=(100, 20) dtype=float32>]
Does anyone know how to handle this error? Much appreciated.
The full code is as follows:
# -*- coding: utf-8 -*-
import numpy as np
from tensorflow import keras
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, Dense, Lambda,Layer
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
import tensorflow as tf
# from tensorflow.keras import objectives
import scipy.io as scio
import gzip
from six.moves import cPickle
import sys
# import theano
# import theano.tensor as T
import math
from sklearn import mixture
from sklearn.cluster import KMeans
from keras.models import model_from_json
import warnings
warnings.filterwarnings("ignore")
def floatX(X):
return np.asarray(X)
def sampling(args):
z_mean, z_log_var = args
epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.)
return z_mean + K.exp(z_log_var / 2) * epsilon
#=====================================
def cluster_acc(Y_pred, Y):
from sklearn.utils.linear_assignment_ import linear_assignment
assert Y_pred.size == Y.size
D = max(Y_pred.max(), Y.max())+1
w = np.zeros((D,D), dtype=np.int64)
for i in range(Y_pred.size):
w[Y_pred[i], Y[i]] += 1
ind = linear_assignment(w.max() - w)
return sum([w[i,j] for i,j in ind])*1.0/Y_pred.size, w
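# Note: sklearn.utils.linear_assignment_ has been removed in recent scikit-learn
# releases. If the import above fails, a drop-in variant of cluster_acc using
# scipy.optimize.linear_sum_assignment (a sketch, not from the original post) is:
def cluster_acc_scipy(Y_pred, Y):
    from scipy.optimize import linear_sum_assignment
    assert Y_pred.size == Y.size
    D = max(Y_pred.max(), Y.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(Y_pred.size):
        w[Y_pred[i], Y[i]] += 1
    row_ind, col_ind = linear_sum_assignment(w.max() - w)  # maximize matched counts
    return w[row_ind, col_ind].sum() * 1.0 / Y_pred.size, w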
#==================================================
def load_data(dataset):
path = 'dataset/'+dataset+'/'
if dataset == 'mnist':
(x_train,y_train),(x_test,y_test) = keras.datasets.mnist.load_data()
x_train = x_train / 255
x_test = x_test / 255
x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))
X = np.concatenate((x_train,x_test))
Y = np.concatenate((y_train,y_test))
if dataset == 'reuters10k':
data=scio.loadmat(path+'reuters10k.mat')
X = data['X']
Y = data['Y'].squeeze()
if dataset == 'har':
data=scio.loadmat(path+'HAR.mat')
X=data['X']
# X=X.astype('float32')
Y=data['Y']-1
X=X[:10200]
Y=Y[:10200]
return X,Y
def config_init(dataset):
if dataset == 'mnist':
return 784,3000,10,0.002,0.002,10,0.9,0.9,1,'sigmoid'
if dataset == 'reuters10k':
return 2000,15,4,0.002,0.002,5,0.5,0.5,1,'linear'
if dataset == 'har':
return 561,120,6,0.002,0.00002,10,0.9,0.9,5,'linear'
def gmmpara_init():
theta_init = tf.Variable(K.ones((n_centroid,1))/n_centroid,trainable=True)
u_init=tf.Variable(K.zeros((n_centroid,latent_dim)),trainable=True)
lambda_init=tf.Variable(K.ones((n_centroid,latent_dim)),trainable=True)
return theta_init,u_init,lambda_init
#================================
def get_gamma(tempz):
temp_Z=K.repeat(tempz,n_centroid)
temp_theta_tensor3 = K.repeat_elements(theta_p,latent_dim,axis=1);
temp_p_c_z=K.exp(K.sum((K.log(temp_theta_tensor3)-0.5*K.log(2*math.pi*lambda_p)-\
K.square(temp_Z-u_p)/(2*lambda_p)),axis=-1))+1e-10
return temp_p_c_z/K.sum(temp_p_c_z,axis=-1,keepdims=True)
#=====================================================
def vae_loss(x, x_decoded_mean):
Z=K.repeat(z,n_centroid) #(3,4) --> (3,n_centroid,4), 3 is the batch size
z_mean_t=K.repeat(z_mean,n_centroid)#(3,4) --> (3,n_centroid,4)
z_log_var_t=K.repeat(z_log_var,n_centroid)#(3,4) --> (3,n_centroid,4)
u_tensor3=u_p #(n_centroid,4)
lambda_tensor3=lambda_p #(n_centroid,4)
    theta_tensor3=K.repeat_elements(theta_p,latent_dim,axis=1) #(n_centroid,1)-->(n_centroid,latent_dim); potential problem: theta_p is tied to n_centroid, so how should it be updated once repeated along a new dimension?
p_c_z=K.exp(K.sum((K.log(theta_tensor3)-0.5*K.log(2*math.pi*lambda_tensor3)-\
K.square(Z-u_tensor3)/(2*lambda_tensor3)),axis=-1))+1e-10 # p_c_z should be in shape(3,n_centroid)
gamma=p_c_z/K.sum(p_c_z,axis=-1,keepdims=True) #(3,n_centroid)
gamma_t=K.repeat(gamma,latent_dim) #(3,latent_dim,n_centroid)
if datatype == 'sigmoid':
loss=alpha*original_dim*keras.losses.binary_crossentropy(x, x_decoded_mean)\
+K.sum(0.5*gamma*K.sum(K.log(lambda_tensor3)+K.exp(z_log_var_t)/lambda_tensor3+K.square(z_mean_t-u_tensor3)/lambda_tensor3,axis=2),axis=1)\
-0.5*K.sum(z_log_var+1,axis=-1)\
+K.sum((K.log(gamma/math.pi))*gamma,axis=-1) # corresponding to the second last item in Eq. 12
else:
loss=alpha*original_dim * keras.losses.mean_squared_error(x, x_decoded_mean)\
+K.sum(0.5*gamma_t*(latent_dim*K.log(math.pi*2)+K.log(lambda_tensor3)+K.exp(z_log_var_t)/lambda_tensor3+K.square(z_mean_t-u_tensor3)/lambda_tensor3),axis=(1,2))\
-0.5*K.sum(z_log_var+1,axis=-1)\
-K.sum(K.log(K.repeat_elements(theta_p.dimshuffle('x',0),batch_size,0))*gamma,axis=-1)\
+K.sum(K.log(gamma)*gamma,axis=-1)
return loss
#================================
#===================================
def lr_decay():
if dataset == 'mnist':
# adam_nn.lr.set_value(max(adam_nn.lr.get_value()*decay_nn,0.0002))
# adam_gmm.lr.set_value(max(adam_gmm.lr.get_value()*decay_gmm,0.0002))
pass
else:
adam_nn.lr.set_value(adam_nn.lr.get_value()*decay_nn)
adam_gmm.lr.set_value(adam_gmm.lr.get_value()*decay_gmm)
print ('lr_nn:%f'%adam_nn.lr.get_value())
print ('lr_gmm:%f'%adam_gmm.lr.get_value())
def epochBegin(epoch):
if epoch % decay_n == 0 and epoch!=0:
pass
# lr_decay()
'''
sample = sample_output.predict(X,batch_size=batch_size)
g = mixture.GMM(n_components=n_centroid,covariance_type='diag')
g.fit(sample)
p=g.predict(sample)
acc_g=cluster_acc(p,Y)
if epoch <1 and ispretrain == False:
u_p.set_value(floatX(g.means_.T))
print ('no pretrain,random init!')
'''
gamma = gamma_output.predict(X,batch_size=batch_size)
acc=cluster_acc(np.argmax(gamma,axis=1),Y)
global accuracy
accuracy+=[acc[0]]
if epoch>0 :
#print ('acc_gmm_on_z:%0.8f'%acc_g[0])
print ('acc_p_c_z:%0.8f'%acc[0])
if epoch==1 and dataset == 'har' and acc[0]<0.77:
print ('=========== HAR dataset:bad init!Please run again! ============')
sys.exit(0)
class EpochBegin(Callback):#https://keras.io/guides/writing_your_own_callbacks/ inherit from the Callback class, then implement some functions
def on_epoch_begin(self, epoch, logs={}):# the name is specified, see in the link
epochBegin(epoch)
#==============================================
dataset = 'mnist'
db = sys.argv[1]
if db in ['mnist','reuters10k','har']:
dataset = db
print ('training on: ' + dataset)
ispretrain = False
batch_size = 100
latent_dim = 20
intermediate_dim = [50,50,100]
# theano.config.floatX='float32'
accuracy=[]
X,Y = load_data(dataset)
original_dim,epoch,n_centroid,lr_nn,lr_gmm,decay_n,decay_nn,decay_gmm,alpha,datatype = config_init(dataset)
theta_p,u_p,lambda_p = gmmpara_init()
#===================
x = Input(batch_shape=(batch_size, original_dim))
h = Dense(intermediate_dim[0], activation='relu')(x)
h = Dense(intermediate_dim[1], activation='relu')(h)
h = Dense(intermediate_dim[2], activation='relu')(h)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])
h_decoded = Dense(intermediate_dim[-1], activation='relu')(z)
h_decoded = Dense(intermediate_dim[-2], activation='relu')(h_decoded)
h_decoded = Dense(intermediate_dim[-3], activation='relu')(h_decoded)
x_decoded_mean = Dense(original_dim, activation=datatype)(h_decoded)
#========================
#========================
Gamma = Lambda(get_gamma, output_shape=(n_centroid,))(z)
sample_output = Model(x, z_mean)
gamma_output = Model(x,Gamma)
#===========================================
vade = Model(x, x_decoded_mean)
vade.summary()
adam_nn= Adam(lr=lr_nn,epsilon=1e-4)
vade.compile(optimizer=adam_nn, loss =vae_loss)
epoch_begin=EpochBegin()
#-------------------------------------------------------
vade.fit(X, X,shuffle=True,epochs=epoch,batch_size=batch_size,callbacks=[epoch_begin])
I am working on something similar regarding deep clustering, using TensorFlow 2.0 and Keras. The solution I have found to work is to define your own custom layer whose weights are the new trainable parameters you want. I tried modifying your code to define a new GMM layer:
class GMMLayer(keras.layers.Layer):
def __init__(self, latent_dim, n_clusters):
super(GMMLayer, self).__init__()
self.latent_dim = latent_dim
self.n_clusters = n_clusters
def build(self, input_shape):
self.pi = self.add_weight(name='pi', shape=(self.n_clusters, 1), trainable=True, initializer=tf.constant_initializer(value=1/self.n_clusters))
        self.u = self.add_weight(name='u', shape=(self.latent_dim, self.n_clusters), trainable=True, initializer='zeros')
        self.lam = self.add_weight(name='lam', shape=(self.latent_dim, self.n_clusters), trainable=True, initializer='ones')
def compute_output_shape(self, input_shape):
return (input_shape[0], self.n_clusters)
def call(self, inputs):
z_in, z_mean_in, z_log_var_in = inputs
temp_Z = tf.transpose(K.repeat(z_in, self.n_clusters), perm=[0, 2, 1]) #(3,4) --> (3,n_centroid,4), 3 is the batch size
        theta_tensor3 = tf.transpose(K.repeat_elements(self.pi, self.latent_dim, axis=1))  # (n_centroid,1)-->(n_centroid,latent_dim); same caveat as in the question about repeating theta along a new dimension
p_c_z = K.exp(K.sum((K.log(theta_tensor3) - 0.5 * K.log(2 * math.pi * self.lam) - \
K.square(temp_Z - self.u) / (2 * self.lam)),
axis=-1)) + 1e-10 # p_c_z should be in shape(batch_size, n_centroid)
gamma = p_c_z / K.sum(p_c_z, axis=-1, keepdims=True)
z_mean_t = tf.transpose(K.repeat(z_mean_in, self.n_clusters), perm=[0, 2, 1])
z_log_var_t = tf.transpose(K.repeat(z_log_var_in, self.n_clusters), perm=[0, 2, 1])
gmm_loss = K.sum(0.5 * gamma * K.sum(
K.log(self.lam) + K.exp(z_log_var_t) / self.lam + K.square(z_mean_t - self.u) / self.lam,
axis=2), axis=1) \
                   - 0.5 * K.sum(z_log_var_in + 1, axis=-1) \
+ K.sum((K.log(gamma / math.pi)) * gamma, axis=-1)
return [gamma, gmm_loss]
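To wire the layer in, something along these lines should work. This is a minimal sketch, assuming your encoder and decoder stay exactly as in your question; the key point is to attach the extra loss term with add_loss instead of reading the graph tensors z, z_mean, z_log_var inside a compiled loss function, which is what raises the _SymbolicException in TF2:
gamma, gmm_loss = GMMLayer(latent_dim, n_centroid)([z, z_mean, z_log_var])
vade = Model(x, x_decoded_mean)
# scalar reconstruction term, matching the 'sigmoid' branch of vae_loss
recon = alpha * original_dim * K.mean(keras.losses.binary_crossentropy(x, x_decoded_mean))
vade.add_loss(recon + K.mean(gmm_loss))
vade.compile(optimizer=Adam(lr=lr_nn, epsilon=1e-4))  # no loss= argument needed
gamma_output = Model(x, gamma)  # still usable for the accuracy callback
With add_loss, the fit call no longer needs a target: vade.fit(X, shuffle=True, epochs=epoch, batch_size=batch_size, callbacks=[epoch_begin]).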
Let me know if this helps!
I have 3 files. In the datamodule file, I have created the data and used the basic format of PyTorch Lightning. In the linear_model file, I made a linear regression model based on this page. Finally, I have a train file where I am calling the model and trying to fit the data. But I am getting this error:
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/mostafiz/Dropbox/MSc/Thesis/regreesion_EC/src/test_train.py", line 10, in <module>
train_dataloader=datamodule.DataModuleClass().setup().train_dataloader(),
AttributeError: 'tuple' object has no attribute 'train_dataloader'
Sample datamodule file
class DataModuleClass(pl.LightningDataModule):
def __init__(self):
super().__init__()
self.sigma = 5
self.batch_size = 10
self.prepare_data()
def prepare_data(self):
x = np.random.uniform(0, 10, 10)
e = np.random.normal(0, self.sigma, len(x))
y = x + e
X = np.transpose(np.array([x, e]))
self.x_train_tensor = torch.from_numpy(X).float().to(device)
self.y_train_tensor = torch.from_numpy(y).float().to(device)
training_dataset = TensorDataset(self.x_train_tensor, self.y_train_tensor)
self.training_dataset = training_dataset
def setup(self):
data = self.training_dataset
self.train_data, self.val_data = random_split(data, [8, 2])
return self.train_data, self.val_data
def train_dataloader(self):
return DataLoader(self.train_data)
def val_dataloader(self):
return DataLoader(self.val_data)
Sample training file
from . import datamodule, linear_model
model = linear_model.LinearRegression(input_dim=2, l1_strength=1, l2_strength=1)
trainer = pl.Trainer()
trainer.fit(model,
train_dataloader=datamodule.DataModuleClass().setup().train_dataloader(),
val_dataloaders=datamodule.DataModuleClass().setup().val_dataloaders())
Let me know if you need more code or explanation.
Update (Based on the comment)
Now I am getting the following error after removing self.prepare_data() from the __init__() of DataModuleClass, removing return self.train_data, self.val_data from setup(), and changing the test file to:
data_module = datamodule.DataModuleClass()
trainer = pl.Trainer()
trainer.fit(model,data_module)
Error:
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/mostafiz/Dropbox/MSc/Thesis/regreesion_EC/src/test_train.py", line 10, in <module>
train_dataloader=datamodule.DataModuleClass().train_dataloader(),
File "/home/mostafiz/Dropbox/MSc/Thesis/regreesion_EC/src/datamodule.py", line 54, in train_dataloader
return DataLoader(self.train_data)
AttributeError: 'DataModuleClass' object has no attribute 'train_data'
Most of the things were correct, except for a few things:
def prepare_data(self):
This function was right except that it should not return anything.
Another thing was:
def setup(self, stage=None):
setup() requires a stage argument, which can default to None when we don't need to switch between separate train and test stages.
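For illustration only (a sketch of the idea; the complete code below keeps the default and ignores stage), a stage-aware setup() could branch like this:
def setup(self, stage=None):
    # Lightning calls setup('fit') before training and setup('test') before
    # testing; stage=None covers manual calls that want everything prepared.
    if stage == 'fit' or stage is None:
        self.train_data, self.val_data = random_split(self.training_dataset, [8, 2])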
Putting everything together, here is the code:
from argparse import ArgumentParser
import numpy as np
import pytorch_lightning as pl
import torch
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam
from torch.optim.optimizer import Optimizer
from torch.utils.data import random_split, DataLoader, TensorDataset
from torch.autograd import Variable
from torchvision import transforms
class LinearRegression(pl.LightningModule):
def __init__(
self,
input_dim: int = 2,
output_dim: int = 1,
bias: bool = True,
learning_rate: float = 1e-4,
optimizer: Optimizer = Adam,
l1_strength: float = 0.0,
l2_strength: float = 0.0
):
super().__init__()
self.save_hyperparameters()
self.optimizer = optimizer
self.linear = nn.Linear(in_features=self.hparams.input_dim, out_features=self.hparams.output_dim, bias=bias)
def forward(self, x):
y_hat = self.linear(x)
return y_hat
def training_step(self, batch, batch_idx):
x, y = batch
# flatten any input
x = x.view(x.size(0), -1)
y_hat = self(x)
loss = F.mse_loss(y_hat, y, reduction='sum')
# L1 regularizer
if self.hparams.l1_strength > 0:
l1_reg = sum(param.abs().sum() for param in self.parameters())
loss += self.hparams.l1_strength * l1_reg
# L2 regularizer
if self.hparams.l2_strength > 0:
l2_reg = sum(param.pow(2).sum() for param in self.parameters())
loss += self.hparams.l2_strength * l2_reg
loss /= x.size(0)
tensorboard_logs = {'train_mse_loss': loss}
progress_bar_metrics = tensorboard_logs
return {'loss': loss, 'log': tensorboard_logs, 'progress_bar': progress_bar_metrics}
def validation_step(self, batch, batch_idx):
x, y = batch
x = x.view(x.size(0), -1)
y_hat = self(x)
return {'val_loss': F.mse_loss(y_hat, y)}
def validation_epoch_end(self, outputs):
val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
tensorboard_logs = {'val_mse_loss': val_loss}
progress_bar_metrics = tensorboard_logs
return {'val_loss': val_loss, 'log': tensorboard_logs, 'progress_bar': progress_bar_metrics}
def configure_optimizers(self):
return self.optimizer(self.parameters(), lr=self.hparams.learning_rate)
np.random.seed(42)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
class DataModuleClass(pl.LightningDataModule):
def __init__(self):
super().__init__()
self.sigma = 5
self.batch_size = 10
def prepare_data(self):
x = np.random.uniform(0, 10, 10)
e = np.random.normal(0, self.sigma, len(x))
y = x + e
X = np.transpose(np.array([x, e]))
self.x_train_tensor = torch.from_numpy(X).float().to(device)
self.y_train_tensor = torch.from_numpy(y).float().to(device)
training_dataset = TensorDataset(self.x_train_tensor, self.y_train_tensor)
self.training_dataset = training_dataset
def setup(self,stage=None):
data = self.training_dataset
self.train_data, self.val_data = random_split(data, [8, 2])
def train_dataloader(self):
return DataLoader(self.train_data)
def val_dataloader(self):
return DataLoader(self.val_data)
model = LinearRegression(input_dim=2, l1_strength=1, l2_strength=1)
trainer = pl.Trainer()
dummy = DataModuleClass()
trainer.fit(model,dummy)
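As a quick sanity check (a hypothetical snippet, not required by Lightning), you can drive the hooks by hand in the same order trainer.fit() does:
data_module = DataModuleClass()
data_module.prepare_data()          # builds self.training_dataset
data_module.setup(stage='fit')      # creates train_data / val_data
xb, yb = next(iter(data_module.train_dataloader()))
print(xb.shape, yb.shape)           # torch.Size([1, 2]) torch.Size([1])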
I implemented a custom layer called MultiHeadAttention. When I try to use it, it causes:
tensorflow.python.framework.errors_impl.InvalidArgumentError:
Incompatible shapes: [128] vs. [128,256,256]
...(omit)...(training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape,
training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape_1)]]
MultiHeadAttention code:
class MultiHeadAttention(Layer):
def __init__(self, n_head: int, model_dim: int, **kwargs):
self.n_head = n_head
self.model_dim = model_dim
self.dim_per_head = model_dim // n_head
super(MultiHeadAttention, self).__init__(**kwargs)
def build(self, input_shape):
if isinstance(input_shape, list):
input_shape = input_shape[0]
self.query_kernel = self.add_weight(name='query_kernel',
shape=(input_shape[2], self.dim_per_head * self.n_head),
initializer='uniform', trainable=True)
self.key_kernel = self.add_weight(name='key_kernel',
shape=(input_shape[2], self.dim_per_head * self.n_head),
initializer='uniform', trainable=True)
self.value_kernel = self.add_weight(name='value_kernel',
shape=(input_shape[2], self.dim_per_head * self.n_head),
initializer='uniform', trainable=True)
self.output_kernel = self.add_weight(name='output_kernel',
shape=(self.dim_per_head * self.n_head, self.model_dim),
initializer='uniform', trainable=True)
self.output_bias = self.add_weight(name='output_bias',
shape=(self.model_dim,),
initializer='zeros', trainable=True)
super(MultiHeadAttention, self).build(input_shape)
def call(self, x):
if isinstance(x, list):
attn, attn_mask = x
attn_mask = K.repeat_elements(attn_mask, self.n_head, 0)
else:
attn = x
attn_mask = None
query_big = K.dot(attn, self.query_kernel)
key_big = K.dot(attn, self.key_kernel)
value_big = K.dot(attn, self.value_kernel) # batch ,seq_len, hid*n_head
def reshape1(x):
s = list(x.shape)
x = K.reshape(x, [-1, s[1], self.n_head, s[2] // self.n_head])
x = K.permute_dimensions(x, [2, 0, 1, 3])
x = K.reshape(x, [-1, s[1], s[2] // self.n_head])
return x
query_big = reshape1(query_big)
key_big = reshape1(key_big)
value_big = reshape1(value_big)
# print(value_big.shape)
result = scale_dot_product(query_big, key_big, value_big, attn_mask) # n_head * batch, seq_len, hid
def reshape2(x):
s = list(x.shape) # [n_head * batch_size, len_v, d_v]
x = K.reshape(x, [self.n_head, -1, s[1], s[2]])
x = K.permute_dimensions(x, [1, 2, 0, 3])
x = K.reshape(x, [-1, s[1], self.n_head * s[2]]) # [batch_size, len_v, n_head * d_v]
return x
result = reshape2(result)
result = K.dot(result, self.output_kernel) + self.output_bias
return result
def compute_output_shape(self, input_shape):
if isinstance(input_shape, list):
input_shape = input_shape[0]
return (input_shape[0], input_shape[1], self.model_dim)
def compute_mask(self, inputs, mask=None):
return None
def scale_dot_product(query: tf.Tensor,
key: tf.Tensor,
value: tf.Tensor,
attn_mask=None):
shape_list = list(value.shape)
mul = K.batch_dot(query, K.permute_dimensions(key, (0, 2, 1)))
if attn_mask is not None:
attn_mask = K.cast(attn_mask, dtype=tf.float32)
        mul = attn_mask * mul + (1.0 - attn_mask) * neg_inf  # neg_inf: a large negative constant, e.g. -1e9, assumed defined at module level
scale = mul / K.sqrt(K.cast(shape_list[-1], mul.dtype))
softmax = K.softmax(scale)
result = K.batch_dot(softmax, value)
return result
A simple example:
import numpy as np
import keras.backend as K
from keras.optimizers import SGD
from keras import Input, Model, losses
from keras.layers import Embedding, Lambda, Dense
from MultiHeadAttention import MultiHeadAttention
if __name__ == "__main__":
max_len = 256
word_dim = 200
vacab_size = 10000
input = Input(shape=(max_len,), name="Input-Sentence")
word_embedding = Embedding(vacab_size, word_dim, input_length=max_len,
mask_zero=False, trainable=True)(input)
inp_mask = Lambda(lambda t: K.any(K.not_equal(t, 0), axis=-1), name="Input_mask")(input)
out = word_embedding
# Something is wrong with the custom MultiHeadAttention layer; if the line below is commented out, the model runs fine.
out = MultiHeadAttention(n_head=8, model_dim=word_dim)([out, inp_mask])
out = Dense(2, activation="softmax")(out)
model = Model(inputs=input, outputs=out)
model.summary()
model.compile(optimizer=SGD(), loss=losses.sparse_categorical_crossentropy)
# example data
data_num = 1024
x = np.array(np.random.randint(0, vacab_size, (data_num, max_len)).tolist())
y = np.array(np.random.randint(0, 2, (data_num, max_len, 1)).tolist())
print(x.shape, y.shape)
model.fit(x, y, epochs=24, batch_size=16)
keras==2.2.4
tensorflow==1.13.1
Error information:
Traceback (most recent call last):
  File "D:\PyCharm Community Edition 2018.1.4\helpers\pydev\pydev_run_in_console.py", line 52, in run_file
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "D:\PyCharm Community Edition 2018.1.4\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "C:/Users/Administrator/PyProgram/InfosExtractor/code/BERT/MultiAttentionTest.py", line 30, in <module>
    model.fit(x, y, epochs=24, batch_size=16)
  File "D:\Anaconda3.7\lib\site-packages\keras\engine\training.py", line 1039, in fit
    validation_steps=validation_steps)
  File "D:\Anaconda3.7\lib\site-packages\keras\engine\training_arrays.py", line 199, in fit_loop
    outs = f(ins_batch)
  File "D:\Anaconda3.7\lib\site-packages\keras\backend\tensorflow_backend.py", line 2715, in __call__
    return self._call(inputs)
  File "D:\Anaconda3.7\lib\site-packages\keras\backend\tensorflow_backend.py", line 2675, in _call
    fetched = self._callable_fn(*array_vals)
  File "D:\Anaconda3.7\lib\site-packages\tensorflow\python\client\session.py", line 1454, in __call__
    self._session._session, self._handle, args, status, None)
  File "D:\Anaconda3.7\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 519, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [128] vs. [128,256,256]
  [[Node: training/SGD/gradients/multi_head_attention_1/mul_1_grad/BroadcastGradientArgs = BroadcastGradientArgs[T=DT_INT32, _class=["loc:@training/SGD/gradients/multi_head_attention_1/mul_1_grad/Reshape_1"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape, training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape_1)]]
The error is caused by attn_mask's shape not matching mul in the scale_dot_product method, so I made some changes:
First, I added keepdims=True to the mask: inp_mask = Lambda(lambda t: K.any(K.not_equal(t, 0), axis=-1, keepdims=True), name="Input_mask")(input). But it still didn't work.
Second, I commented out the line attn_mask = K.repeat_elements(attn_mask, self.n_head, 0) and wrote a new method called reshape_mask:
def reshape_mask(mask, head_num):
if mask is None:
return mask
seq_len = K.shape(mask)[1]
mask = K.expand_dims(mask, axis=1)
mask = K.tile(mask, [1, head_num, 1])
return K.reshape(mask, (-1, seq_len))
Third, I rewrote the method scale_dot_product:
def scale_dot_product(query: tf.Tensor,
key: tf.Tensor,
value: tf.Tensor,
attn_mask=None):
feature_dim = K.shape(query)[-1]
e = K.batch_dot(query, key, axes=2) / K.sqrt(K.cast(feature_dim, dtype=K.floatx()))
e = K.exp(e - K.max(e, axis=-1, keepdims=True))
if attn_mask is not None:
e *= K.cast(K.expand_dims(attn_mask, axis=-2), K.floatx())
a = e / (K.sum(e, axis=-1, keepdims=True) + K.epsilon())
v = K.batch_dot(a, value)
return v
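As a quick shape check of the two fixed pieces (a sketch; the sizes mirror the example above: batch 16, 8 heads, sequence length 256, 25 dims per head):
import numpy as np
import keras.backend as K

batch, n_head, seq_len, d_head = 16, 8, 256, 25
q = K.constant(np.random.rand(batch * n_head, seq_len, d_head))
mask = K.constant(np.random.randint(0, 2, (batch, seq_len)))
mask = reshape_mask(mask, n_head)        # -> (batch * n_head, seq_len)
out = scale_dot_product(q, q, q, mask)   # -> (batch * n_head, seq_len, d_head)
print(K.eval(out).shape)                 # (128, 256, 25)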
Cheers! The problem has been solved!
I keep getting the error message below and cannot seem to pinpoint the tensor it mentions. Below you'll find the trainer.py and main.py modules. The model I am developing is a GAN on the CelebA dataset. I am running the code on a remote server, so I have spent a fair amount of time debugging the model.
This is the full error message:
Traceback (most recent call last):
File "main.py", line 52, in <module>
main(opt)
File "main.py", line 47, in main
trainer.train(train_loader)
File "/home/path/trainer.py", line 45, in train
d_loss_cls = F.binary_cross_entropy_with_logits(out_cls, label_org, size_average=False) / out_cls.size(0)
File "/home/path/anaconda3/lib/python3.7/site-packages/torch/nn/functional.py", line 2077, in binary_cross_entropy_with_logits
return torch.binary_cross_entropy_with_logits(input, target, weight, pos_weight, reduction_enum)
RuntimeError: expected type torch.cuda.FloatTensor but got torch.FloatTensor
trainer.py
from tqdm import tqdm
import torch
import torch.nn.functional as F
from model import Discriminator, Generator
from tensorboardX import SummaryWriter
class Trainer():
def __init__(self, opt):
# Generator
self.G = Generator(64, 5, 6)
# Discriminator
self.D = Discriminator(128, 64, 5, 6)
# Generator optimizer
self.g_optimizer = torch.optim.Adam(self.G.parameters(), opt.lr)
self.d_optimizer = torch.optim.Adam(self.D.parameters(), opt.lr)
self.opt = opt
if self.opt.cuda:
self.G = self.G.cuda()
self.D = self.D.cuda()
def train(self, data_loader):
"""Function to train the model
"""
print('Training model')
writer_d = SummaryWriter('runs/disc') # discriminator writer
writer_g = SummaryWriter('runs/gen') # generator writer
print('Start training...')
for epoch in tqdm(range(self.opt.epochs)):
            for i, (x_real, label_org) in enumerate(tqdm(data_loader)):  # enumerate so that i exists for the (i + 1) % 2 check below
# Generate target domain labels randomly.
rand_idx = torch.randperm(label_org.size(0))
label_trg = label_org[rand_idx]
c_org = label_org.clone()
c_trg = label_org.clone()
if self.opt.cuda:
x_real = x_real.cuda() # Input images
c_org = c_org.cuda() # Original domain labels
c_trg = c_trg.cuda() # Target domain labels
label_org = label_org.cuda() # Labels for computing classification loss
label_trg = label_trg.cuda() # Labels for computing classification loss
out_src, out_cls = self.D(x_real)
d_loss_real = - torch.mean(out_src)
d_loss_cls = F.binary_cross_entropy_with_logits(out_cls, label_org, size_average=False) / out_cls.size(0)
# Compute loss with fake images
x_fake = self.G(x_real, c_trg)
out_src, out_cls = self.D(x_fake.detach())
d_loss_fake = torch.mean(out_src)
# Compute loss for gradient penalty
alpha = torch.rand(x_real.size(0), 1, 1, 1).cuda()
x_hat = (alpha * x_real.data + (1 - alpha) * x_fake.data).requires_grad_(True)
out_src, _ = self.D(x_hat)
# Backward and optimize
d_loss = d_loss_real + d_loss_fake + d_loss_cls
self.g_optimizer.zero_grad()
self.d_optimizer.zero_grad()
d_loss.backward()
self.d_optimizer.step()
if (i + 1) % 2 == 0:
# Original-to-target domain
x_fake = self.G(x_real, c_trg)
out_src, out_cls = self.D(x_fake)
g_loss_fake = - torch.mean(out_src)
g_loss_cls = F.binary_cross_entropy_with_logits(out_cls, label_trg, size_average=False) / out_cls.size(0)
# Target-to-original domain
x_reconst = self.G(x_fake, c_org)
g_loss_rec = torch.mean(torch.abs(x_real - x_reconst))
# Backward and optimize
g_loss = g_loss_fake + g_loss_rec
self.g_optimizer.zero_grad()
self.d_optimizer.zero_grad()
g_loss.backward()
self.g_optimizer.step()
# write loss to tensorboard
writer_d.add_scalar('data/loss', d_loss, epoch)
writer_d.add_scalar('data/loss', g_loss, epoch)
print('Finished Training')
def test(self, data_loader):
with torch.no_grad():
for i, (x_real, c_org) in enumerate(data_loader):
# Prepare input images and target domain labels.
if self.opt.cuda:
x_real = x_real.cuda()
# Translate images.
x_fake_list = [x_real]
for c_trg in c_trg_list:
x_fake_list.append(self.G(x_real, c_trg))
main.py
import argparse
import random
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from preprocess import pre_process
from celeb_dataset import CelebDataset
from trainer import Trainer
# Setting up the argument parser
parser = argparse.ArgumentParser()
parser.add_argument('--workers', type=int, help='number of data loading workers', default=4)
parser.add_argument('--batchSize', type=int, default=8, help='input batch size')
parser.add_argument('--epochs', type=int, default=20, help='number of epochs to train')
parser.add_argument('--lr', type=float, default=0.0002, help='learning rate')
parser.add_argument('--cuda', action='store_true', help='enables cuda')
parser.add_argument('--manualSeed', type=int, help='manual seed')
parser.add_argument('--dataset_path', type=str, default='./data/celeba', help='dataset path')
opt = parser.parse_args()
print(opt)
if opt.manualSeed is None:
opt.manualSeed = random.randint(1, 10000)
print("Random Seed: ", opt.manualSeed)
def main(opt):
# Setup the parameters for the training/testing
params = {
'batch_size': opt.batchSize,
'shuffle': True,
'num_workers': opt.workers
}
# preprocess and setup dataset and datalader
processed_data = pre_process(opt.dataset_path)
train_dataset = CelebDataset(processed_data[:-2000])
test_dataset = CelebDataset(processed_data[2000:])
train_loader = DataLoader(train_dataset, **params)
test_loader = DataLoader(test_dataset, **params)
trainer = Trainer(opt)
trainer.train(train_loader)
trainer.test(test_loader)
if __name__ == "__main__":
main(opt)
You are getting that error because one of out_cls and label_org is not on the GPU.
Where does your code act on the parser.add_argument('--cuda', action='store_true', help='enables cuda') option?
Perhaps something like:
trainer = Trainer(opt)
if opt.cuda:
trainer = trainer.cuda()
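Since Trainer here is a plain Python class rather than an nn.Module, it has no .cuda() of its own; an alternative sketch (a hypothetical guard, using the names from trainer.py) is to pin both tensors to one device right before the loss:
# inside Trainer.train(), just before the classification loss
device = torch.device('cuda' if self.opt.cuda else 'cpu')
out_cls = out_cls.to(device)
label_org = label_org.to(device).float()  # BCE-with-logits expects float targets
d_loss_cls = F.binary_cross_entropy_with_logits(out_cls, label_org, size_average=False) / out_cls.size(0)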
When I use fit_generator in Keras, the validation set is split into minibatches, and each minibatch is evaluated as training progresses. I want the validation data to be used exactly once, at the end of each epoch. My code is currently:
def model_fit_generator(self):
#This does the actual training of the model
earlystop = EarlyStopping(monitor='val_acc', patience=5, verbose=2, mode='auto')
self.__model.fit_generator(generator=self.train_generator,
validation_data=self.valid_generator,
steps_per_epoch=self.s_per_e,
epochs=self.epochs,
validation_steps = self.v_per_e,
shuffle=False,
verbose=2,
callbacks=[earlystop])
model_filename = '_'.join([str(x) for x in now_list]) + '_model.h5'
self.__model.save(model_filename)
def model_evaluate(self):
self.model_fit_generator()
evaluation = self.__model.evaluate_generator(self.valid_generator, self.v_per_e, verbose=0)
return evaluation
How do I change this so that I have the validation data used once, at the end of each epoch, to decide whether early stopping is useful?
EDIT: In response to a comment, here is a complete MWE, showing that the validation data are being used at the same time as the training data. Note this code will produce an error, but it also prints out batch numbers to show that validation and training sets are both being used. To run this code, you will need 10 CSV files of data, which I can provide, but I'd rather just give you the output right after this code.
from __future__ import division
from __future__ import print_function
from pandas import concat
from pandas import DataFrame
import sys, keras, GPy, GPyOpt
import numpy as np
import pandas as pd
from keras import backend as K
from keras.models import Model
from keras.metrics import binary_crossentropy
from keras.layers import Dense, Input, LSTM, Lambda
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
class my_model():
def __init__(self, n_lags=2, hid_dim_1=5, epochs=2, batch_size=1):
self.n_lags = n_lags
self.hid_dim_1 = hid_dim_1
self.epochs = epochs
self.batch_size = batch_size
self.train_generator, self.s_per_e, self.valid_generator, self.v_per_e, self.n_vars = self.read_data()
self.__model = self.model()
def read_data(self):
n_vars = 2
num_sample_minibatches = 6
num_valid_minibatches = 4
sample_IDs = range(1, self.batch_size+num_sample_minibatches)
valid_IDs = range(num_sample_minibatches+1, max(sample_IDs)+num_valid_minibatches+1)
params = {'batch_size': self.batch_size, 'n_lags': self.n_lags, 'n_vars': n_vars}
train_generator = DataGenerator(sample_IDs, **params)
valid_generator = DataGenerator(valid_IDs, **params)
s_per_e = int(len(sample_IDs) - self.batch_size + 1) #e.g. if you have 1,2,3,4,5,6 then you can create 4 sequences of length 3 (batch_size)
v_per_e = int(len(valid_IDs) - self.batch_size + 1)
return train_generator, s_per_e, valid_generator, v_per_e, n_vars
def model(self):
#https://github.com/twairball/keras_lstm_vae/blob/master/lstm_vae/vae.py
a_input = Input(shape=(self.n_lags, self.n_vars,), name='a_input')
cond_on_this = Input(shape=(self.n_vars,), name="cond_on_this")
b_lstm = LSTM(self.hid_dim_1)(a_input)
outputs = Dense(self.hid_dim_1, activation='sigmoid')(b_lstm)
my_model1 = Model([a_input, cond_on_this], outputs)
my_model1.compile(optimizer=Adam(lr=0.001), loss=binary_crossentropy)
return my_model1
def my_model_fit_generator(self):
earlystop = EarlyStopping(monitor='val_acc', patience=5, verbose=2, mode='auto')
self.__model.fit_generator(generator=self.train_generator,
validation_data=self.valid_generator,
steps_per_epoch=self.s_per_e,
epochs=self.epochs,
validation_steps = self.v_per_e,
shuffle=False,
verbose=2,
callbacks=[earlystop])
def my_model_evaluate(self):
self.my_model_fit_generator()
evaluation = self.__model.evaluate_generator(self.valid_generator, self.v_per_e, verbose=0)
return evaluation
class DataGenerator(keras.utils.Sequence):
'Generates data for Keras'
def __init__(self, list_IDs, batch_size, n_lags, n_vars, shuffle=False):
'Initialization'
self.list_IDs = list_IDs
self.batch_size = batch_size
self.n_lags = n_lags
self.n_vars = n_vars
self.shuffle = shuffle
self.on_epoch_end()
def __len__(self):
'Denotes the number of batches per epoch'
batches_per_epoch = int(np.floor(len(self.list_IDs) - self.batch_size + 1))
return batches_per_epoch
def __getitem__(self, index):
'Generate one batch of data'
#Here's my evidence that the validation minibatches are being used during training!
print('batch number: ', index+1, 'of: ', int(np.floor(len(self.list_IDs) - self.batch_size + 1)))
indexes = self.indexes[index:(index+self.batch_size)]
# Find list of IDs
list_IDs_temp = [self.list_IDs[k] for k in indexes]
# Generate data
data, cond_on_this = self.__data_generation(list_IDs_temp)
return [np.asarray(data), np.asarray(cond_on_this)], np.asarray(cond_on_this)
def on_epoch_end(self):
'Updates indexes after each epoch'
self.indexes = np.arange(len(self.list_IDs))
if self.shuffle == True:
np.random.shuffle(self.indexes)
#From MachineLearningMastery
def series_to_supervised(self, data, n_out=1, dropnan=True):
n_vars = 1 if type(data) is list else data.shape[1]
df = DataFrame(data)
cols, names = list(), list()
#input sequence t-n, ..., t-1
for i in range(self.n_lags, 0, -1): #for i in 3 to 0 not including 0
cols.append(df.shift(i))
names += [('var%d(t-%d)' % (j+1, i)) for j in range (self.n_vars)]
#forecast sequence t, t+1, ..., t+n
for i in range(0, n_out):
cols.append(df.shift(-i))
if i==0:
names += [('var%d(t)' % (j+1)) for j in range(self.n_vars)]
else:
names += [('var%d(t+%d)' % (j+1, i)) for j in range(self.n_vars)]
agg = concat(cols, axis=1)
agg.columns = names
if dropnan:
agg.dropna(inplace=True)
return agg
def __data_generation(self, list_IDs_temp):
'Generates data containing batch_size samples'
data_np_array = np.empty((self.batch_size, self.n_vars), dtype=float)
for i, ID in enumerate(list_IDs_temp):
#Read in a data file corresponding to this ID; put it into the numpy array.
data_file = './pollution_' + str(i) + '.csv'
df_data = pd.read_csv(data_file, sep=",", header=0)
df_data.columns = ['date','pollution','dew','temp','press','wnd_dir','wnd_spd','snow','rain']
df_data_vals = df_data[['pollution', 'temp']] #this is shape (24, 2)
data_np_array[i,] = np.asarray(df_data_vals)
data_s2s = np.asarray(self.series_to_supervised(data_np_array))
data_data = data_s2s[:, :int(self.n_vars*self.n_lags)]
data_cond = data_s2s[:, int(self.n_vars*self.n_lags):]
data_data = data_data.reshape((data_data.shape[0], self.n_lags, self.n_vars))
return data_data, data_cond
def run_my_model(n_lags=2, hid_dim_1=5, epochs=2, batch_size=1):
_my_model = my_model(n_lags=n_lags, hid_dim_1=hid_dim_1, epochs=epochs, batch_size=batch_size)
mymodel_evaluation = _my_model.my_model_evaluate()
return mymodel_evaluation
#Bounds for hyperparameters
bounds = [{'name': 'hid_dim_1', 'type': 'discrete', 'domain': (5, 10)}]
#Bayesian Optimization
def f(x):
evaluation = run_my_model(hid_dim_1 = int(x[:,0]), epochs = 2, batch_size = 1)
print("binary crossentropy:\t{0}".format(evaluation[0]))
print(evaluation)
return evaluation
#Optimizer instance
opt_mymodel = GPyOpt.methods.BayesianOptimization(f=f, domain=bounds, initial_design_numdata=1)
#Run optimizer
opt_mymodel.run_optimization(max_iter=2)
opt_mymodel.x_opt
Relevant Output:
Using TensorFlow backend.
Epoch 1/2
batch number: 1 of: 4
batch number: 1 of: 6
batch number: 2 of: 4
batch number: 2 of: 6
batch number: 3 of: 4
batch number: 3 of: 6
batch number: 4batch number: 4 of: 4
of: 6
batch number: 5 of: 6
batch number: 6 of: 6
Traceback (most recent call last):
...Error after this...
I'm trying to create a simple RNN using Keras, but I'm getting the error below.
The input is a stream of letters represented by binary classes. The shape is (10, 5, 95): batches of 10, 5 letters at a time, 95 character classes in total.
I'm guessing it has something to do with incorrect input being fed back as input, but I'm not sure how to handle it.
Traceback (most recent call last):
File "07_rnn.py", line 90, in <module>
model.fit(x, y, epochs=3, batch_size=BATCHSIZE)
File "/home/dmabelin/.local/lib/python3.5/site-packages/keras/models.py", line 965, in fit
validation_steps=validation_steps)
File "/home/dmabelin/.local/lib/python3.5/site-packages/keras/engine/training.py", line 1593, in fit
batch_size=batch_size)
File "/home/dmabelin/.local/lib/python3.5/site-packages/keras/engine/training.py", line 1430, in _standardize_user_data
exception_prefix='target')
File "/home/dmabelin/.local/lib/python3.5/site-packages/keras/engine/training.py", line 110, in _standardize_input_data
'with shape ' + str(data_shape))
ValueError: Error when checking target: expected activation_1 to have 2 dimensions, but got array with shape (10, 5, 95)
Code
import numpy as np
import glob
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical
CHARMAP = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890-=!@#$%^&*()_+`~[]\{}|;':\",./<>?"
SEQLEN = 5
BATCHSIZE = 10
ALPHASIZE = len(CHARMAP)
INTERNALSIZE = 128
FILES = "shakespeare/*.txt"
LEARNING_RATE = 0.001
## Data related stuff
def char_to_value(char):
idx = CHARMAP.find(char)
if idx >= 0:
return idx
else:
return 0
def char_to_class_map(char):
value = char_to_value(char)
return to_categorical(value,ALPHASIZE)
def value_to_char(value):
return CHARMAP[value]
# iterate every single file
def get_file_data(pattern, index):
paths = glob.glob(pattern)
length = len(paths)
if index < length:
data = []
with open(paths[index], "r") as file:
for line in file:
line_values = [char_to_class_map(l) for l in line]
data = data + list(line_values)
return data
else:
return None
# get batch data in file
def build_line_data(file_data, seqlen, batch_index, batch_count):
length = len(file_data)
start = batch_index * batch_count
end = start+seqlen
x = []
y = []
while end+1 <= length and len(x) < batch_count:
x_line = file_data[start:end]
y_line = file_data[start+1:end+1]
x.append(x_line)
y.append(y_line)
start = start + 1
end = start + seqlen
x = np.array(x)
y = np.array(y)
return x,y
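# For intuition (illustration only, matching the constants above): y is x
# shifted one character ahead, so both arrays have the same rank, which is
# exactly the shape reported in the error message:
#   x, y = build_line_data(file_data, SEQLEN, 0, BATCHSIZE)
#   print(x.shape, y.shape)   # (10, 5, 95) (10, 5, 95)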
def create_model():
model = Sequential()
model.add(LSTM(INTERNALSIZE,input_shape=(SEQLEN, ALPHASIZE)))
model.add(Dense(ALPHASIZE))
model.add(Activation('softmax'))
#adam optimizer
optimizer = Adam(lr=LEARNING_RATE)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
return model
print('before create_model')
model = create_model()
print('after create_model')
for i in range(1):
print('before get file data')
file_data = get_file_data(FILES, i)
print('after get file data')
idx = 0
while True:
print('before build line data')
x,y = build_line_data(file_data, SEQLEN, idx ,BATCHSIZE)
print('after build line data')
print('before fit')
model.fit(x, y, epochs=3, batch_size=BATCHSIZE)
print('after fit')
idx = idx + 1
if 0 == len(x):
break
if idx > 10:
break
github link: https://github.com/djaney/ml-studies/blob/master/07_rnn.py
Edit:
return_sequences=True in LSTM fixed it.
What are you trying to predict? If it is a sequence-to-sequence model, then return_sequences=True is the right way to go.
The reason for the error is that your target is 3-dimensional (batch_size, sequence_length, features), while the LSTM layer only outputs (batch_size, features) for the last time step of the sequence when return_sequences=False.
So depending on your application, you have to change the shape of your targets or set return_sequences=True, as you already did.
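For concreteness, here is a sketch of the fixed create_model, mirroring the constants from the question:
def create_model():
    model = Sequential()
    # return_sequences=True keeps one output per time step, so the model's
    # output shape matches the (BATCHSIZE, SEQLEN, ALPHASIZE) targets
    model.add(LSTM(INTERNALSIZE, return_sequences=True, input_shape=(SEQLEN, ALPHASIZE)))
    model.add(Dense(ALPHASIZE))  # Dense maps the last axis of the 3D tensor
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=LEARNING_RATE))
    return model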