Before I get to the point, I apologize if my English is awkward; English is not my first language.
I am currently struggling to use the `theano.scan` function properly, because I don't understand how the `sequences` parameter works.
I created a 3-dimensional array of shape (42, 10, 7002) and passed it as `sequences`. I expected each step to receive a 2-dimensional slice of shape (10, 7002), with 42 steps in total.
However, each step seems to receive a slice of shape (1, 7002). How can I make each step receive a slice with 10 rows and 7002 columns?
Thanks for reading this question.
add
# -*- coding: utf-8 -*-
__author__ = "Haizhou Qu"
import readFile
import numpy as np
import theano
import theano.tensor as T
from six.moves import zip
# from theano.compile.debugmode import DebugMode
# Theano runtime configuration, applied at import time.
theano.config.optimizer='fast_compile'
theano.config.exception_verbosity='high'
theano.config.compute_test_value = 'warn'
# Small constant: added inside the square roots in adadelta() and used as the
# clipping margin in categorical_crossentropy().
epsilon = 1e-6
# Float dtype that shared() casts every shared variable to.
dtype = theano.config.floatX
# Module-level sizes read by the LSTM code. They start at 0; the __main__
# section assigns minibatch_size_g, voca_dim_global and n_time_step_input_g
# from the preprocessed data.
minibatch_size_g = 0
# NOTE(review): longest_seq_g and n_timestep_target_g are not referenced
# anywhere else in the visible code.
longest_seq_g = 0
voca_dim_global = 0
n_time_step_input_g = 0
n_timestep_target_g = 0
def printT(x):
    """Wrap ``x`` in a Theano ``Print`` op labelled ``'T'`` and return the wrapped variable."""
    return theano.printing.Print('T')(x)
def shared(value, name=None):
    """Create a Theano shared variable from ``value`` cast to the module ``dtype``."""
    cast_value = value.astype(dtype)
    return theano.shared(cast_value, name=name)
def shared_zeros(shape, name=None):
    """Create a zero-filled shared variable of the given ``shape``."""
    zeros = np.zeros(shape)
    return shared(value=zeros, name=name)
def shared_zeros_like(x, name=None):
    """Create a zero-filled shared variable with the same shape as ``x``."""
    template_shape = x.shape
    return shared_zeros(shape=template_shape, name=name)
def init_weights(shape, name=None):
    """Create a shared weight array drawn uniformly from ``[-b, b)``.

    The bound is ``b = sqrt(1 / shape[1])``, so ``shape`` must have at least
    two entries.
    """
    limit = np.sqrt(1.0 / shape[1])
    samples = np.random.uniform(low=-limit, high=limit, size=shape)
    return shared(value=samples, name=name)
def adadelta(params, cost, lr=1.0, rho=0.95):
    """Build the Adadelta update list for ``params`` that minimises ``cost``.

    Args:
        params: list of Theano shared variables to optimise.
        cost: symbolic scalar cost; it is cast to float32 before differentiation.
        lr: multiplier applied to each parameter step.
        rho: decay rate of the two running averages.

    Returns:
        A list of ``(shared_variable, new_value)`` pairs. For every parameter
        it contains, in order, the update of the squared-gradient average,
        of the parameter itself, and of the squared-step average.
    """
    # from https://github.com/fchollet/keras/blob/master/keras/optimizers.py
    cost32 = cost.astype('float32')
    gradients = T.grad(cost32, params)
    sq_grad_avgs = [shared_zeros_like(param.get_value()) for param in params]
    sq_step_avgs = [shared_zeros_like(param.get_value()) for param in params]

    updates = []
    for param, grad, grad_avg, step_avg in zip(params, gradients, sq_grad_avgs, sq_step_avgs):
        next_grad_avg = rho * grad_avg + (1.0 - rho) * T.square(grad)
        step = grad * T.sqrt(step_avg + epsilon) / T.sqrt(next_grad_avg + epsilon)
        next_step_avg = rho * step_avg + (1.0 - rho) * T.square(step)
        updates.extend([
            (grad_avg, next_grad_avg),
            (param, param - lr * step),
            (step_avg, next_step_avg),
        ])
    return updates
def categorical_crossentropy(y_true, y_pred):
    """Mean categorical cross-entropy between predictions and targets.

    ``theano.scan`` iterates over the leading axis of both arguments and
    applies ``T.nnet.categorical_crossentropy`` to each pair of slices.

    Args:
        y_true: symbolic target tensor; cast to int64 before use.
        y_pred: symbolic prediction tensor; clipped to
            ``[epsilon, 1 - epsilon]``, presumably so the log stays finite.

    Returns:
        Symbolic float32 scalar: the mean of the per-slice cross-entropies.
    """
    # from https://github.com/fchollet/keras/blob/master/keras/objectives.py
    y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon)
    # Cast the targets only. The previous code assigned the cast targets to
    # y_pred, which discarded the predictions and left the loss disconnected
    # from the model parameters.
    y_true = y_true.astype('int64')
    # only matrix can be calculated
    cce, _ = theano.scan(
        fn=T.nnet.categorical_crossentropy,
        sequences=[y_pred, y_true]
    )
    # astype returns a new variable, so the result has to be used.
    return T.mean(cce.astype('float32'))
def mean_square_error(y_true, y_pred):
    """Return the mean of the element-wise squared difference ``y_pred - y_true``."""
    residual = y_pred - y_true
    return T.mean(T.square(residual))
class LSTM(object):
    """LSTM layer whose weights and recurrent state are Theano shared variables.

    Attributes:
        size: number of hidden units.
        dim: input feature dimension.
        h_tm1, c_tm1: shared variables holding the hidden and cell state.
        params: list of the twelve trainable shared variables (U, W, b for the
            input, forget, output and candidate gates).
    """

    def __init__(self, size, dim):
        """Allocate the state, weights and biases.

        State and biases have shape ``(minibatch_size_g, size)``, where
        ``minibatch_size_g`` is the module-level global read at call time.
        """
        self.size = size
        self.dim = dim
        shape_b = (minibatch_size_g, size)
        shape_U = (dim, size)
        shape_W = (size, size)
        self.h_tm1 = shared_zeros(shape_b, "h_tm1")
        self.c_tm1 = shared_zeros(shape_b, "c_tm1")
        self.Ui = init_weights(shape_U, "Ui")
        self.Wi = init_weights(shape_W, "Wi")
        self.bi = shared_zeros(shape_b, "bi")
        self.Uf = init_weights(shape_U, "Uf")
        self.Wf = init_weights(shape_W, "Wf")
        self.bf = shared_zeros(shape_b, "bf")
        self.Uo = init_weights(shape_U, "Uo")
        self.Wo = init_weights(shape_W, "Wo")
        self.bo = shared_zeros(shape_b, "bo")
        self.Ug = init_weights(shape_U, "Ug")
        self.Wg = init_weights(shape_W, "Wg")
        self.bg = shared_zeros(shape_b, "bg")
        self.params = [
            self.Ui, self.Wi, self.bi,
            self.Uf, self.Wf, self.bf,
            self.Uo, self.Wo, self.bo,
            self.Ug, self.Wg, self.bg
        ]

    def set_state(self, h, c):
        """Copy the values of the shared variables ``h`` and ``c`` into this layer's state."""
        self.h_tm1.set_value(h.get_value())
        self.c_tm1.set_value(c.get_value())

    def reset_state(self):
        """Zero the recurrent state in place.

        The existing shared variables are overwritten with ``set_value``
        rather than replaced. Compiled Theano functions keep references to
        the original shared variables, so rebinding the attributes to new
        objects would leave the state those functions use untouched.
        """
        self.h_tm1.set_value(np.zeros_like(self.h_tm1.get_value()))
        self.c_tm1.set_value(np.zeros_like(self.c_tm1.get_value()))

    # step takes no `self`: it is handed to theano.scan, which calls it with
    # the sequence slice, the recurrent outputs and the non-sequences only.
    @staticmethod
    def step(
        x_t, h_tm1, c_tm1,
        Ui, Wi, bi, Uf, Wf, bf,
        Uo, Wo, bo, Ug, Wg, bg
    ):
        """Compute one LSTM time step.

        ``x_t`` is reshaped to ``(minibatch_size_g, -1)`` and both states to
        ``(-1, n_time_step_input_g)``; these module-level globals are read
        when the graph is built.

        Returns:
            ``(h_t, c_t)``: the new hidden and cell state.
        """
        x_t = x_t.reshape((minibatch_size_g, -1))
        h_tm1 = h_tm1.reshape((-1, n_time_step_input_g))
        c_tm1 = c_tm1.reshape((-1, n_time_step_input_g))
        i_t = T.nnet.sigmoid(T.dot(x_t, Ui) + T.dot(h_tm1, Wi) + bi)
        # The forget gate must be squashed to (0, 1) like the other gates.
        f_t = T.nnet.sigmoid(T.dot(x_t, Uf) + T.dot(h_tm1, Wf) + bf)
        o_t = T.nnet.sigmoid(T.dot(x_t, Uo) + T.dot(h_tm1, Wo) + bo)
        g_t = T.tanh(T.dot(x_t, Ug) + T.dot(h_tm1, Wg) + bg)
        c_t = c_tm1 * f_t + g_t * i_t
        h_t = T.tanh(c_t) * o_t
        return h_t, c_t

    def forward(self, X):
        """Run the layer over ``X`` with ``theano.scan``.

        ``X`` is first flattened to
        ``(-1, voca_dim_global * minibatch_size_g)``, so scan sees one row per
        step and ``step`` reshapes each row back to ``(minibatch_size_g, -1)``.

        Returns:
            ``(states, updates)``. ``states`` is the scan output
            ``[h_sequence, c_sequence]``; ``updates`` stores the final hidden
            and cell state back into ``self.h_tm1`` / ``self.c_tm1``.
        """
        X = X.reshape((-1, voca_dim_global * minibatch_size_g))
        states, _ = theano.scan(
            fn=self.step,
            sequences=[X],
            outputs_info=[self.h_tm1, self.c_tm1],
            non_sequences=[
                self.Ui, self.Wi, self.bi,
                self.Uf, self.Wf, self.bf,
                self.Uo, self.Wo, self.bo,
                self.Ug, self.Wg, self.bg
            ]
        )
        updates = [(self.h_tm1, states[0][-1]), (self.c_tm1, states[1][-1])]
        print("### forward completed ###")
        return states, updates
class LSTMEncoder(LSTM):
    """LSTM that reduces an input sequence to its final hidden and cell state."""

    def encode(self, X):
        """Run ``forward`` on ``X`` and return ``(last_h, last_c, updates)``."""
        states, updates = self.forward(X)
        hidden_seq = states[0]
        cell_seq = states[1]
        return hidden_seq[-1], cell_seq[-1], updates
class LSTMDecoder(LSTM):
    """LSTM that feeds each projected output back in as the next input.

    On top of the base LSTM parameters it owns an output projection
    ``Wh`` (size x dim) and bias ``bh`` (1 x dim), both appended to
    ``self.params``.
    """

    def __init__(self, size, dim, h_tm1=None, c_tm1=None):
        """Create the decoder.

        Args:
            size: number of hidden units.
            dim: input/output feature dimension.
            h_tm1: optional shared variable to use as the hidden state.
            c_tm1: optional shared variable to use as the cell state.
                When either is ``None`` a zero ``(1, size)`` state is created.
        """
        super(LSTMDecoder, self).__init__(size=size, dim=dim)
        self.Wh = init_weights((size, dim), "Wh")
        self.bh = shared_zeros((1, dim), "bh")
        # Compare against None explicitly: `h_tm1 or ...` would evaluate the
        # truth value of a Theano variable, which raises TypeError.
        if h_tm1 is None:
            h_tm1 = shared_zeros((1, size), "h_tm1")
        if c_tm1 is None:
            c_tm1 = shared_zeros((1, size), "c_tm1")
        self.h_tm1 = h_tm1
        self.c_tm1 = c_tm1
        # First input fed to the decoder (all zeros).
        self.y_t = shared_zeros((1, dim), "y_t")
        self.params.append(self.Wh)
        self.params.append(self.bh)

    def decode_step(
        self, y_t, h_tm1, c_tm1,
        Ui, Wi, bi, Uf, Wf, bf,
        Uo, Wo, bo, Ug, Wg, bg,
        Wh, bh
    ):
        """Run one LSTM step on the previous output and project the new hidden state.

        Returns:
            ``(y_t, h_t, c_t)`` where ``y_t = dot(h_t, Wh) + bh``.
        """
        h_t, c_t = self.step(
            y_t, h_tm1, c_tm1,
            Ui, Wi, bi, Uf, Wf, bf,
            Uo, Wo, bo, Ug, Wg, bg
        )
        y_t = T.dot(h_t, Wh) + bh
        return y_t, h_t, c_t

    def decode(self, h_tm1, c_tm1, timesteps):
        """Generate ``timesteps`` outputs starting from the given state.

        Args:
            h_tm1, c_tm1: initial hidden and cell state.
            timesteps: number of scan steps (Python int or symbolic scalar).

        Returns:
            ``(outputs, updates)``. ``outputs`` is the output sequence
            flattened to two dimensions; ``updates`` stores the final hidden
            and cell state into ``self.h_tm1`` / ``self.c_tm1``.
        """
        outputs, _ = theano.scan(
            fn=self.decode_step,
            outputs_info=[self.y_t, h_tm1, c_tm1],
            non_sequences=[
                self.Ui, self.Wi, self.bi,
                self.Uf, self.Wf, self.bf,
                self.Uo, self.Wo, self.bo,
                self.Ug, self.Wg, self.bg,
                self.Wh, self.bh
            ],
            n_steps=timesteps
        )
        updates = [
            (self.h_tm1, outputs[1][-1]),
            (self.c_tm1, outputs[2][-1])
        ]
        return T.flatten(outputs[0], 2), updates
class Seq2Seq(object):
    """Encoder/decoder pair with compiled train, test and predict functions.

    ``compile`` must be called before ``train``, ``test`` or ``predict``.
    """

    def __init__(self, size, dim):
        """Create the encoder and decoder and collect their parameters."""
        self.encoder = LSTMEncoder(size, dim)
        self.decoder = LSTMDecoder(size, dim)
        self.params = []
        self.params += self.encoder.params
        self.params += self.decoder.params
        self._predict = None
        self._train = None
        self._test = None

    def compile(self, loss_func, optimizer):
        """Build and compile the Theano functions.

        Args:
            loss_func: callable ``loss_func(y_true, y_pred)`` returning a
                symbolic scalar loss.
            optimizer: callable ``optimizer(params, loss)`` returning a list
                of ``(shared_variable, new_value)`` updates.
        """
        seq_input = T.tensor3()
        seq_target = T.tensor3()
        decode_timesteps = T.iscalar()
        h_tm1, c_tm1, updates_encode = self.encoder.encode(seq_input)
        # Two decoders over the same encoder state: one with a caller-chosen
        # length (prediction), one as long as the target (training/testing).
        seq_predict_flex, updates_decode_flex = self.decoder.decode(h_tm1, c_tm1, decode_timesteps)
        seq_predict, updates_decode = self.decoder.decode(h_tm1, c_tm1, T.shape(seq_target)[0])
        # The loss functions in this file are declared as (y_true, y_pred),
        # so the target goes first.
        loss = loss_func(seq_target, seq_predict)
        self._predict = theano.function([seq_input, decode_timesteps], seq_predict_flex,
                                        updates=updates_encode + updates_decode_flex)
        self._test = theano.function([seq_input, seq_target], loss,
                                     updates=updates_encode + updates_decode)
        updates = []
        updates += updates_encode
        updates += updates_decode
        updates += optimizer(self.params, loss)
        self._train = theano.function([seq_input, seq_target], loss, updates=updates)

    def predict(self, seq_input, decode_timesteps):
        """Reset both LSTM states and decode ``decode_timesteps`` steps for ``seq_input``."""
        self.encoder.reset_state()
        self.decoder.reset_state()
        return self._predict(seq_input, decode_timesteps)

    def train(self, seq_input, seq_target):
        """Reset both LSTM states, run one optimisation step and return the loss."""
        self.encoder.reset_state()
        self.decoder.reset_state()
        return self._train(seq_input, seq_target)

    def test(self, seq_input, seq_target):
        """Reset both LSTM states and return the loss without updating the weights."""
        self.encoder.reset_state()
        self.decoder.reset_state()
        return self._test(seq_input, seq_target)
def train(x, target):
    """Train the module-level ``seq2seq`` on each (mini-batch, target) pair and print the loss."""
    for batch_in, batch_out in zip(x, target):
        print("mini_batch shape :", batch_in.shape)
        # Cast both arrays to the module float dtype before handing them to Theano.
        batch_in = batch_in.astype(dtype)
        batch_out = batch_out.astype(dtype)
        step_loss = seq2seq.train(batch_in, batch_out)
        print(step_loss)
def predict(x, target):
    """Predict each mini-batch with the module-level ``seq2seq`` and print its loss.

    For every (mini-batch, target) pair this prints the decoded sequence of
    ``n_time_step_output_g`` steps, then the loss of the model on that
    mini-batch against its target.
    """
    for mini_batch, batch_target in zip(x, target):
        # Use the current mini-batch, not the whole collection `x`, and cast
        # to the module float dtype exactly as train() does.
        mini_batch = mini_batch.astype(dtype)
        batch_target = batch_target.astype(dtype)
        so = seq2seq.predict(mini_batch, n_time_step_output_g)
        print(so)
        # Score against the real target rather than the model's own output.
        loss = seq2seq.test(mini_batch, batch_target)
        print(loss)
if __name__ == "__main__":
    # Load the preprocessed data and publish the sizes the LSTM code reads
    # from module-level globals.
    si, st, maxlen_input, minibatch_size, voca_dim = readFile.preprocessing()
    voca_dim_global = voca_dim + 2
    minibatch_size_g = si[0].shape[1]
    print("minibatch_size_g : ", minibatch_size_g)
    n_time_step_input_g = si[0].shape[0]
    n_time_step_output_g = st[0].shape[0]
    seq2seq = Seq2Seq(n_time_step_input_g, voca_dim_global)
    seq2seq.compile(loss_func=categorical_crossentropy, optimizer=adadelta)
    print("select a menu")
    print("1. Training")
    print("2. Predict and test translated sentence.")
    # input() returns a string on Python 3 (and an evaluated value on
    # Python 2); normalise to a stripped string so the menu works on both.
    val = str(input("selection : ")).strip()
    if val == "1":
        train(si, st)
    elif val == "2":
        predict(si, st)
    else:
        print("Unknown selection:", val)
Related
I'm trying to run latent space clustering, but an error is raised.
class AutoEncoder(nn.Module):
    """Autoencoder with two encoders and two decoders, one per input view.

    Each view is encoded separately; the two codes are merged according to
    ``agg`` ("max", "multi", "sum" or "concat"). Each decoder reconstructs
    its view either from its own code (``sep_decode=True``) or from the
    merged code.
    """

    def __init__(self, input_dim1, input_dim2, hidden_dims, agg, sep_decode):
        """Build the encoder and decoder stacks.

        Args:
            input_dim1: feature size of the first view.
            input_dim2: feature size of the second view.
            hidden_dims: hidden layer sizes from input side to latent side;
                the last entry is the latent size. The list is not modified.
            agg: aggregation method used in ``forward``.
            sep_decode: if true, each decoder consumes its own encoder output.
        """
        super(AutoEncoder, self).__init__()
        self.agg = agg
        self.sep_decode = sep_decode
        print("hidden_dims:", hidden_dims)
        # Private copy: the caller's list must keep its order, because callers
        # read hidden_dims[-1] as the latent size after constructing us.
        hidden_dims = list(hidden_dims)

        self.encoder_layers = []
        self.encoder2_layers = []
        dims = [[input_dim1, input_dim2]] + hidden_dims
        for i in range(len(dims) - 1):
            if i == 0:
                # First layer: each encoder starts from its own input size.
                layer = nn.Sequential(nn.Linear(dims[i][0], dims[i + 1]), nn.ReLU())
                layer2 = nn.Sequential(nn.Linear(dims[i][1], dims[i + 1]), nn.ReLU())
            elif i < len(dims) - 2:
                layer = nn.Sequential(nn.Linear(dims[i], dims[i + 1]), nn.ReLU())
                layer2 = nn.Sequential(nn.Linear(dims[i], dims[i + 1]), nn.ReLU())
            else:
                # Last encoder layer has no activation.
                layer = nn.Linear(dims[i], dims[i + 1])
                layer2 = nn.Linear(dims[i], dims[i + 1])
            self.encoder_layers.append(layer)
            self.encoder2_layers.append(layer2)
        self.encoder = nn.Sequential(*self.encoder_layers)
        self.encoder2 = nn.Sequential(*self.encoder2_layers)

        self.decoder_layers = []
        self.decoder2_layers = []
        # The decoders mirror the encoders; build the reversed sizes as a new list.
        dims = hidden_dims[::-1] + [[input_dim1, input_dim2]]
        if self.agg == "concat" and not self.sep_decode:
            # The merged code is the concatenation of both latent codes.
            dims[0] = 2 * dims[0]
        for i in range(len(dims) - 1):
            if i < len(dims) - 2:
                layer = nn.Sequential(nn.Linear(dims[i], dims[i + 1]), nn.ReLU())
                layer2 = nn.Sequential(nn.Linear(dims[i], dims[i + 1]), nn.ReLU())
            else:
                # Last decoder layer maps back to each view's own input size.
                layer = nn.Linear(dims[i], dims[i + 1][0])
                layer2 = nn.Linear(dims[i], dims[i + 1][1])
            self.decoder_layers.append(layer)
            self.decoder2_layers.append(layer2)
        self.decoder = nn.Sequential(*self.decoder_layers)
        self.decoder2 = nn.Sequential(*self.decoder2_layers)

    def forward(self, x1, x2):
        """Encode both views, merge the codes and reconstruct both views.

        Returns:
            ``(x_bar1, x_bar2, z)``: the two L2-normalised reconstructions
            and the merged latent code.

        Raises:
            ValueError: if ``self.agg`` is not a supported aggregation method.
        """
        z1 = self.encoder(x1)
        z2 = self.encoder2(x2)
        if self.agg == "max":
            z = torch.max(z1, z2)
        elif self.agg == "multi":
            z = z1 * z2
        elif self.agg == "sum":
            z = z1 + z2
        elif self.agg == "concat":
            z = torch.cat([z1, z2], dim=1)
        else:
            # Without this branch `z` would be unbound at the return below.
            raise ValueError(f"unsupported aggregation method: {self.agg!r}")

        if self.sep_decode:
            x_bar1 = F.normalize(self.decoder(z1), dim=-1)
            x_bar2 = F.normalize(self.decoder2(z2), dim=-1)
        else:
            x_bar1 = F.normalize(self.decoder(z), dim=-1)
            x_bar2 = F.normalize(self.decoder2(z), dim=-1)
        return x_bar1, x_bar2, z
class TopicCluster(nn.Module):
    """AutoEncoder plus learnable topic embeddings for latent-space clustering."""

    def __init__(self, args):
        """Build the autoencoder and the topic embedding matrix from ``args``.

        Reads ``dataset_path``, ``device``, ``temperature``, ``distribution``,
        ``agg_method``, ``sep_decode``, ``input_dim1``, ``input_dim2``,
        ``hidden_dims`` (string form of a list) and ``n_clusters``.
        """
        super(TopicCluster, self).__init__()
        self.alpha = 1.0
        self.dataset_path = args.dataset_path
        self.args = args
        self.device = args.device
        self.temperature = args.temperature
        self.distribution = args.distribution
        self.agg_method = args.agg_method
        self.sep_decode = (args.sep_decode == 1)
        input_dim1 = args.input_dim1
        input_dim2 = args.input_dim2
        # NOTE(security): eval() executes arbitrary code from the command
        # line; only pass trusted values for --hidden_dims.
        hidden_dims = list(eval(args.hidden_dims))
        # Read the latent size before building the autoencoder and give it
        # its own copy, so the size cannot be affected by in-place changes
        # the autoencoder makes to the list.
        latent_dim = hidden_dims[-1]
        self.model = AutoEncoder(input_dim1, input_dim2, list(hidden_dims), self.agg_method, self.sep_decode)
        if self.agg_method == "concat":
            self.topic_emb = Parameter(torch.Tensor(args.n_clusters, 2 * latent_dim))
        else:
            self.topic_emb = Parameter(torch.Tensor(args.n_clusters, latent_dim))
        torch.nn.init.xavier_normal_(self.topic_emb.data)

    def pretrain(self, input_data, pretrain_epoch=200):
        """Load pretrained autoencoder weights, or train with reconstruction loss and save them.

        Weights are loaded from ``<dataset_path>/pretrained_<suffix>.pt`` when
        that file exists and ``args.load_pretrain`` is set. Otherwise the
        autoencoder is trained for ``pretrain_epoch`` epochs and saved there.
        """
        # Use the args stored on the instance; a module-level `args` only
        # exists when the file is run as a script.
        pretrained_path = os.path.join(self.dataset_path, f"pretrained_{self.args.suffix}.pt")
        if os.path.exists(pretrained_path) and self.args.load_pretrain:
            # load pretrain weights
            print(f"loading pretrained model from {pretrained_path}")
            self.model.load_state_dict(torch.load(pretrained_path))
        else:
            train_loader = DataLoader(input_data, batch_size=self.args.batch_size, shuffle=True)
            optimizer = Adam(self.model.parameters(), lr=self.args.lr)
            for epoch in range(pretrain_epoch):
                total_loss = 0
                for batch_idx, (x1, x2, _, weight) in enumerate(train_loader):
                    x1 = x1.to(self.device)
                    x2 = x2.to(self.device)
                    weight = weight.to(self.device)
                    optimizer.zero_grad()
                    x_bar1, x_bar2, z = self.model(x1, x2)
                    loss = cosine_dist(x_bar1, x1) + cosine_dist(x_bar2, x2)  # , weight)
                    total_loss += loss.item()
                    loss.backward()
                    optimizer.step()
                print(f"epoch {epoch}: loss = {total_loss / (batch_idx+1):.4f}")
            torch.save(self.model.state_dict(), pretrained_path)
            print(f"model saved to {pretrained_path}")

    def cluster_assign(self, z):
        """Return soft cluster assignments for the latent codes ``z``.

        With ``distribution == 'student'`` a Student-t kernel on the squared
        distance to each topic embedding is used, normalised per row.
        Otherwise the topic embeddings (in place) and ``z`` are
        L2-normalised and a temperature-scaled softmax over their dot
        products is returned.
        """
        if self.distribution == 'student':
            p = 1.0 / (1.0 + torch.sum(
                torch.pow(z.unsqueeze(1) - self.topic_emb, 2), 2) / self.alpha)
            p = p.pow((self.alpha + 1.0) / 2.0)
            p = (p.t() / torch.sum(p, 1)).t()
        else:
            self.topic_emb.data = F.normalize(self.topic_emb.data, dim=-1)
            z = F.normalize(z, dim=-1)
            sim = torch.matmul(z, self.topic_emb.t()) / self.temperature
            p = F.softmax(sim, dim=-1)
        return p

    def forward(self, x1, x2):
        """Return ``(x_bar1, x_bar2, z, p)``: reconstructions, latent code and cluster assignment."""
        x_bar1, x_bar2, z = self.model(x1, x2)
        p = self.cluster_assign(z)
        return x_bar1, x_bar2, z, p

    def target_distribution(self, x1, x2, freq, method='all', top_num=0):
        """Compute the current assignment ``p`` (detached) and the target distribution ``q``.

        Args:
            x1, x2: the two input views.
            freq: per-sample weights used to normalise each cluster column
                when ``method == 'all'``.
            method: ``'all'`` sharpens ``p`` by squaring and renormalising;
                ``'top'`` makes the ``top_num`` samples most similar to each
                topic one-hot for that topic.
            top_num: number of samples per topic for ``method == 'top'``
                (must be > 0 in that case).

        Returns:
            ``(p, q)``.
        """
        _, _, z = self.model(x1, x2)
        p = self.cluster_assign(z).detach()
        if method == 'all':
            q = p**2 / (p * freq.unsqueeze(-1)).sum(dim=0)
            q = (q.t() / q.sum(dim=1)).t()
        elif method == 'top':
            assert top_num > 0
            q = p.clone()
            sim = torch.matmul(self.topic_emb, z.t())
            _, selected_idx = sim.topk(k=top_num, dim=-1)
            for i, topic_idx in enumerate(selected_idx):
                q[topic_idx] = 0
                q[topic_idx, i] = 1
        return p, q
def cosine_dist(x_bar, x, weight=None):
    """Weighted mean of ``1 - <x_bar, x>`` over the samples.

    The dot product is taken along the last axis. ``weight`` defaults to
    ones, one per row of ``x``.
    """
    if weight is None:
        weight = torch.ones(x.size(0), device=x.device)
    per_sample = 1 - (x_bar * x).sum(-1)
    weighted_total = (per_sample * weight).sum()
    return weighted_total / weight.sum()
def train(args, emb_dict):
    """Pretrain the autoencoder, initialise topics with K-Means and train the clustering model.

    Args:
        args: parsed command-line namespace (reads ``use_freq``, ``device``,
            ``pretrain_epoch``, ``batch_size``, ``lr``, ``n_clusters``,
            ``dataset_path``, ``suffix``, ``sort_method``, ``update_interval``,
            ``tol`` and ``gamma``).
        emb_dict: dict with keys ``"inv_vocab"``, ``"vocab"``, ``"vs_emb"``,
            ``"oh_emb"`` and ``"tuple_freq"``.

    Every 5 epochs the latent embeddings, topic embeddings and the top words
    of each topic are written under ``<dataset_path>/clusters_<suffix>/``.
    Training stops early once the fraction of changed cluster labels falls
    below ``args.tol``.

    Returns:
        None.
    """
    inv_vocab = {k: " ".join(v) for k, v in emb_dict["inv_vocab"].items()}
    vocab = {" ".join(k): v for k, v in emb_dict["vocab"].items()}
    print(f"Vocab size: {len(vocab)}")
    embs = F.normalize(torch.tensor(emb_dict["vs_emb"]), dim=-1)
    embs2 = F.normalize(torch.tensor(emb_dict["oh_emb"]), dim=-1)
    freq = np.array(emb_dict["tuple_freq"])
    if not args.use_freq:
        freq = np.ones_like(freq)

    input_data = TensorDataset(embs, embs2, torch.arange(embs.size(0)), torch.tensor(freq))
    topic_cluster = TopicCluster(args).to(args.device)
    topic_cluster.pretrain(input_data, args.pretrain_epoch)
    train_loader = DataLoader(input_data, batch_size=args.batch_size, shuffle=False)
    optimizer = Adam(topic_cluster.parameters(), lr=args.lr)

    # topic embedding initialization
    embs = embs.to(args.device)
    embs2 = embs2.to(args.device)
    _, _, z = topic_cluster.model(embs, embs2)
    z = F.normalize(z, dim=-1)

    print(f"Running K-Means for initialization")
    kmeans = KMeans(n_clusters=args.n_clusters, n_init=5)
    if args.use_freq:
        y_pred = kmeans.fit_predict(z.data.cpu().numpy(), sample_weight=freq)
    else:
        y_pred = kmeans.fit_predict(z.data.cpu().numpy())
    print(f"Finish K-Means")

    freq = torch.tensor(freq).to(args.device)
    y_pred_last = y_pred
    topic_cluster.topic_emb.data = torch.tensor(kmeans.cluster_centers_).to(args.device)

    topic_cluster.train()
    cluster_dir = os.path.join(args.dataset_path, f"clusters_{args.suffix}")
    i = 0
    for epoch in range(50):
        if epoch % 5 == 0:
            _, _, z, p = topic_cluster(embs, embs2)
            z = F.normalize(z, dim=-1)
            topic_cluster.topic_emb.data = F.normalize(topic_cluster.topic_emb.data, dim=-1)
            os.makedirs(cluster_dir, exist_ok=True)
            embed_save_path = os.path.join(cluster_dir, f"embed_{epoch}.pt")
            torch.save({
                "inv_vocab": emb_dict['inv_vocab'],
                "embed": z.detach().cpu().numpy(),
                "topic_embed": topic_cluster.topic_emb.detach().cpu().numpy(),
            }, embed_save_path)

            pred_cluster = p.argmax(-1)
            result_strings = []
            for j in range(args.n_clusters):
                if args.sort_method == 'discriminative':
                    # Build the index on the same device as the mask so the
                    # boolean indexing also works on CUDA.
                    all_idx = torch.arange(embs.size(0), device=pred_cluster.device)
                    word_idx = all_idx[pred_cluster == j]
                    sorted_idx = torch.argsort(p[pred_cluster == j][:, j], descending=True)
                    word_idx = word_idx[sorted_idx]
                else:
                    sim = torch.matmul(topic_cluster.topic_emb[j], z.t())
                    # topk raises if k exceeds the number of words.
                    _, word_idx = sim.topk(k=min(30, z.size(0)), dim=-1)

                word_cluster = []
                freq_sum = 0
                for w in word_idx:
                    freq_sum += freq[w].item()
                    if inv_vocab[w.item()] not in word_cluster:
                        word_cluster.append(inv_vocab[w.item()])
                        if len(word_cluster) >= 10:
                            break
                result_strings.append((freq_sum, f"Topic {j} ({freq_sum}): " + ', '.join(word_cluster) + '\n'))

            result_strings = sorted(result_strings, key=lambda x: x[0], reverse=True)
            # The context manager closes the file even if a write fails;
            # the handle used to be left open.
            with open(os.path.join(cluster_dir, f"{epoch}.txt"), 'w') as f:
                for result_string in result_strings:
                    f.write(result_string[1])

        for x1, x2, idx, weight in train_loader:
            if i % args.update_interval == 0:
                # Refresh the target distribution and check convergence.
                p, q = topic_cluster.target_distribution(
                    embs, embs2, freq.clone().fill_(1), method='all', top_num=epoch + 1)
                y_pred = p.cpu().numpy().argmax(1)
                delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
                y_pred_last = y_pred
                if i > 0 and delta_label < args.tol:
                    print(f'delta_label {delta_label:.4f} < tol ({args.tol})')
                    print('Reached tolerance threshold. Stopping training.')
                    return None
            i += 1

            x1 = x1.to(args.device)
            x2 = x2.to(args.device)
            idx = idx.to(args.device)
            weight = weight.to(args.device)

            x_bar1, x_bar2, _, p = topic_cluster(x1, x2)
            reconstr_loss = cosine_dist(x_bar1, x1) + cosine_dist(x_bar2, x2)  # , weight)
            kl_loss = F.kl_div(p.log(), q[idx], reduction='none').sum(-1)
            kl_loss = (kl_loss * weight).sum() / weight.sum()
            loss = args.gamma * kl_loss + reconstr_loss
            if i % args.update_interval == 0:
                print(f"KL loss: {kl_loss}; Reconstruction loss: {reconstr_loss}")

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    return None
if __name__ == "__main__":
    # Example invocation:
    # CUDA_VISIBLE_DEVICES=0 python3 latent_space_clustering.py --dataset_path ./pandemic --input_emb_name po_tuple_features_all_svos.pk
    parser = argparse.ArgumentParser(
        description='train',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset_path', type=str)
    parser.add_argument('--input_emb_name', type=str)
    parser.add_argument('--lr', type=float, default=5e-4)
    parser.add_argument('--n_clusters', default=30, type=int)
    # input_dim1 / input_dim2 must equal the feature sizes of the "vs_emb"
    # and "oh_emb" arrays in the embedding file; otherwise the first Linear
    # layer fails with a "mat1 and mat2 shapes cannot be multiplied" error.
    parser.add_argument('--input_dim1', default=1000, type=int)
    parser.add_argument('--input_dim2', default=1000, type=int)
    # "max" is implemented by AutoEncoder.forward, so it is offered here.
    # "attend" is kept for backward compatibility, although the visible
    # AutoEncoder has no branch for it.
    parser.add_argument('--agg_method', default="multi",
                        choices=["sum", "multi", "concat", "max", "attend"], type=str)
    parser.add_argument('--sep_decode', default=0, choices=[0, 1], type=int)
    parser.add_argument('--pretrain_epoch', default=100, type=int)
    parser.add_argument('--load_pretrain', default=False, action='store_true')
    parser.add_argument('--temperature', default=0.1, type=float)
    parser.add_argument('--sort_method', default='generative', choices=['generative', 'discriminative'])
    parser.add_argument('--distribution', default='softmax', choices=['softmax', 'student'])
    parser.add_argument('--batch_size', default=256, type=int)
    parser.add_argument('--use_freq', default=False, action='store_true')
    parser.add_argument('--hidden_dims', default='[1000, 2000, 1000, 100]', type=str)
    parser.add_argument('--suffix', type=str, default='')
    parser.add_argument('--gamma', default=5, type=float, help='weight of clustering loss')
    parser.add_argument('--update_interval', default=100, type=int)
    parser.add_argument('--tol', default=0.001, type=float)
    args = parser.parse_args()

    args.cuda = torch.cuda.is_available()
    print("use cuda: {}".format(args.cuda))
    args.device = torch.device("cuda" if args.cuda else "cpu")
    print(args)

    # NOTE(security): unpickling executes arbitrary code; only load
    # embedding files from a trusted source.
    with open(os.path.join(args.dataset_path, args.input_emb_name), "rb") as fin:
        emb_dict = pk.load(fin)
    candidate_idx = train(args, emb_dict)
    print(candidate_idx)
The error I'm getting is: `RuntimeError: mat1 and mat2 shapes cannot be multiplied (256x726 and 1000x1000)`. I cannot figure out which part is the problem. Please help me. Thank you so much.
The runtime error looks like this (screenshot):
enter image description here
I am using the scikit-optimize package to tune the hyperparameters of an LSTM with its `gp_minimize` function. The task is sales forecasting for a store. When I run the optimization twice, the two runs give different results, even though I set the TensorFlow seed, the NumPy seed and the seed passed to `gp_minimize`. I do not understand what the problem is.
My code is shown below. Any help is much appreciated.
import skopt
from sklearn.ensemble import RandomForestRegressor
from skopt import gp_minimize, dump
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_convergence
from skopt.plots import plot_objective, plot_evaluations
from skopt.utils import use_named_args
import matplotlib as mplt
from xgboost import XGBRegressor
mplt.use('agg') # Must be before importing matplotlib.pyplot or pylab!
import matplotlib.pyplot as plt
import csv
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from math import sqrt
import atexit
from time import time, strftime, localtime
from datetime import timedelta
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# Seed NumPy and TensorFlow with the same fixed value.
randomState = 46
np.random.seed(randomState)
tf.set_random_seed(randomState)
# Width of the model output (setupRNN builds a [hidden2_nodes, input_size] weight).
input_size = 1
num_layers = 1
# Columns fed to the model; segmentation() takes the first one as the target.
columns = ['Sales', 'DayOfWeek', 'SchoolHoliday', 'Promo','lagged_Open','lagged_promo','lagged_SchoolHoliday']
features = len(columns)
# Per-run settings, left as None here and read as module globals by
# pre_process(), scale(), rescle() and plot().
# NOTE(review): no assignment to these is visible in this part of the file.
fileName = None
column_min_max = None
Error_file_name = None
Error_plot_name = None
fileNames = ['store165_1']
# presumably one [min, max] pair per scaled column, per entry of fileNames -- TODO confirm
column_min_max_all = [[[0, 9000], [1, 7]]]
# Hyperparameters of the current trial; fitness() assigns them from the
# values proposed by the optimizer.
num_steps = None
lstm_size = None
batch_size = None
init_learning_rate = None
learning_rate_decay = None
init_epoch = None # 5
max_epoch = None # 100 or 50
hidden1_nodes = None
hidden2_nodes = None
dropout_rate = None
hidden1_activation = None
hidden2_activation = None
lstm_activation = None
# Bookkeeping shared across trials.
lowest_error = 0.0
start = None
iteration = 0
bestTestPrediction = None
bestValiPrediction = None
bestTestTrueVal = None
bestValiTrueVal = None
# scikit-optimize search space: one dimension per hyperparameter.
lstm_num_steps = Integer(low=2, high=14, name='lstm_num_steps')
size = Integer(low=8, high=128, name='size')
lstm_hidden1_nodes = Integer(low=4, high=64, name='lstm_hidden1_nodes')
lstm_hidden2_nodes = Integer(low=2, high=32, name='lstm_hidden2_nodes')
lstm_learning_rate_decay = Real(low=0.7, high=0.99, prior='uniform', name='lstm_learning_rate_decay')
lstm_max_epoch = Integer(low=60, high=200, name='lstm_max_epoch')
lstm_init_epoch = Integer(low=5, high=50, name='lstm_init_epoch')
lstm_batch_size = Integer(low=5, high=64, name='lstm_batch_size')
lstm_dropout_rate = Real(low=0.1, high=0.9, prior='uniform', name='lstm_dropout_rate')
lstm_init_learning_rate = Real(low=1e-4, high=1e-1, prior='log-uniform', name='lstm_init_learning_rate')
lstm_hidden1_activation = Categorical(categories=[tf.nn.relu, tf.nn.tanh], name='lstm_hidden1_activation')
lstm_hidden2_activation = Categorical(categories=[tf.nn.relu, tf.nn.tanh], name='lstm_hidden2_activation')
lstm_lstm_activation = Categorical(categories=[tf.nn.relu, tf.nn.tanh], name='lstm_lstm_activation')
# Order matters: it matches the positional parameters of fitness() and the
# entries of default_parameters.
dimensions = [lstm_num_steps, size, lstm_hidden1_nodes, lstm_hidden2_nodes, lstm_init_epoch, lstm_max_epoch,
lstm_learning_rate_decay, lstm_batch_size, lstm_dropout_rate, lstm_init_learning_rate,
lstm_hidden1_activation, lstm_hidden2_activation, lstm_lstm_activation]
# One default value per dimension, in the same order as `dimensions`.
default_parameters = [5, 35, 30, 15, 5, 60, 0.99, 8, 0.1, 0.01, tf.nn.relu, tf.nn.relu, tf.nn.relu]
def secondsToStr(elapsed=None):
    """Format a duration in seconds, or the current local time when none is given.

    With ``elapsed`` set, returns ``str(timedelta(seconds=elapsed))``.
    Without it, returns the current local time as ``YYYY-MM-DD HH:MM:SS``.
    """
    if elapsed is not None:
        return str(timedelta(seconds=elapsed))
    return strftime("%Y-%m-%d %H:%M:%S", localtime())
def log(s, elapsed=None):
    """Print a timestamped message between two separator lines.

    ``elapsed``, when truthy, is printed on its own "Elapsed time:" line.
    A blank line is printed after the closing separator.
    """
    separator = "=" * 40
    print(separator)
    timestamp = secondsToStr()
    print(timestamp, '-', s)
    if elapsed:
        print("Elapsed time:", elapsed)
    print(separator)
    print()
def endlog():
    """Log "End Program" with the time elapsed since the module-level ``start``."""
    elapsed = time() - start
    log("End Program", secondsToStr(elapsed))
def plot():
    """Plot the per-iteration error values and save the figure as a PNG.

    Reads the first column of the header-less CSV at the module-level
    ``Error_file_name`` and writes the plot to ``Error_plot_name``.
    """
    error_vals = pd.read_csv(Error_file_name, header=None)
    # Series.get_values() was removed in pandas 1.0; .values is equivalent.
    values = error_vals.iloc[:, 0].values
    iterations = range(len(values))

    # Create exactly one figure and close that figure explicitly. An extra
    # plt.figure() call used to leave an unused figure open on every call.
    fig = plt.figure(dpi=100, figsize=(20, 7))
    try:
        plt.plot(iterations, values, label='RMSE')
        plt.legend(loc='upper left', frameon=False)
        plt.xlabel("Iteration")
        plt.ylabel("RMSE")
        plt.grid(ls='--')
        plt.savefig(Error_plot_name, format='png', bbox_inches='tight', transparent=False)
    finally:
        plt.close(fig)
def generate_batches(train_X, train_y, batch_size):
    """Yield consecutive ``(batch_X, batch_y)`` slices of at most ``batch_size`` items.

    The last batch is shorter when ``len(train_X)`` is not a multiple of
    ``batch_size``. Both sequences are sliced with the same offsets.
    """
    n_batches, leftover = divmod(len(train_X), batch_size)
    if leftover:
        # A partial batch remains after the full ones.
        n_batches += 1
    for lo in range(0, n_batches * batch_size, batch_size):
        hi = lo + batch_size
        yield train_X[lo:hi], train_y[lo:hi]
def segmentation(data):
    """Cut ``data`` into sliding windows and next-step targets.

    The ``columns`` of ``data`` are flattened row by row and regrouped into
    items of ``features`` values. ``X`` holds every run of ``num_steps``
    consecutive items; ``y`` holds, for each run, the first value of the
    item that follows it. ``columns``, ``features`` and ``num_steps`` are
    module-level globals.

    Returns:
        ``(X, y)`` as NumPy arrays.
    """
    flat = np.array([value for row in data[columns].values for value in row])
    # split into items of features
    n_items = len(flat) // features
    items = [np.array(flat[k * features: (k + 1) * features]) for k in range(n_items)]
    # split into groups of num_steps
    n_windows = len(items) - num_steps
    X = np.array([items[s: s + num_steps] for s in range(n_windows)])
    followers = np.array([items[s + num_steps] for s in range(n_windows)])
    # get only sales value
    y = np.asarray([[followers[s][0]] for s in range(len(followers))])
    return X, y
def scale(data):
    """Min-max scale the leading columns of ``data`` in place and return it.

    The i-th entry of the module-level ``column_min_max`` gives the
    ``[min, max]`` used for the i-th name in ``columns``.
    """
    for position, bounds in enumerate(column_min_max):
        column = columns[position]
        lower = bounds[0]
        upper = bounds[1]
        data[column] = (data[column] - lower) / (upper - lower)
    return data
def rescle(test_pred):
    """Map scaled predictions back to the original range of the first column.

    Uses the first ``[min, max]`` pair of the module-level ``column_min_max``
    and returns a list with one rescaled value per prediction.
    """
    restored = []
    for pred in test_pred:
        lower = column_min_max[0][0]
        upper = column_min_max[0][1]
        restored.append((pred * (upper - lower)) + lower)
    return restored
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, skipping entries whose true value is 0."""
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    # Positions with a zero true value would divide by zero; drop them from both arrays.
    zero_positions = np.where(actual == 0)
    actual = np.delete(actual, zero_positions)
    forecast = np.delete(forecast, zero_positions)
    relative_error = np.abs((actual - forecast) / actual)
    return np.mean(relative_error) * 100
def RMSPE(y_true, y_pred):
    """Return the root mean squared percentage error, skipping entries whose true value is 0."""
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    # Positions with a zero true value would divide by zero; drop them from both arrays.
    zero_positions = np.where(actual == 0)
    actual = np.delete(actual, zero_positions)
    forecast = np.delete(forecast, zero_positions)
    squared_ratio = np.square((actual - forecast) / actual)
    return np.sqrt(np.mean(squared_ratio, axis=0))
def pre_process():
    """Load the store CSV and build train/validation/test windows.

    Reads the file named by the module-level ``fileName``. Rows with
    ``Month == 6`` and ``Year == 2015`` are counted as validation, rows with
    ``Month == 7`` and ``Year == 2015`` as test, and everything before them
    as training. The validation and test slices each start ``num_steps``
    rows early so their first window has a full history.

    Returns:
        ``(train_X, train_y, test_X, test_y, val_X, val_y,
        nonescaled_test_y, nonescaled_val_y)``; the last two are the
        unscaled targets of the test and validation sets.
    """
    frame = pd.read_csv(fileName)
    for lagged_column in ('lagged_Open', 'lagged_promo', 'lagged_SchoolHoliday'):
        frame[lagged_column] = frame[lagged_column].astype(int)

    n_validation = len(frame[(frame.Month == 6) & (frame.Year == 2015)].index)
    n_test = len(frame[(frame.Month == 7) & (frame.Year == 2015)].index)
    n_train = int(len(frame) - (n_validation + n_test))

    train_part = frame[:n_train]
    validation_part = frame[(n_train - num_steps): n_validation + n_train]
    test_part = frame[((n_validation + n_train) - num_steps):]

    # Keep unscaled copies before scale() touches the slices.
    raw_validation = validation_part.copy()
    raw_test = test_part.copy()

    train_X, train_y = segmentation(scale(train_part))
    val_X, val_y = segmentation(scale(validation_part))
    test_X, test_y = segmentation(scale(test_part))

    # Targets in original units, used to report errors on the real scale.
    _, nonescaled_val_y = segmentation(raw_validation)
    _, nonescaled_test_y = segmentation(raw_test)

    return train_X, train_y, test_X, test_y, val_X, val_y, nonescaled_test_y, nonescaled_val_y
def setupRNN(inputs, model_dropout_rate, training=True):
    """Build the LSTM + two dense layers + linear head and return the prediction tensor.

    Layer sizes and activations come from the module-level globals
    ``lstm_size``, ``lstm_activation``, ``hidden1_nodes``, ``hidden2_nodes``,
    ``hidden1_activation``, ``hidden2_activation`` and ``input_size``.

    Args:
        inputs: input tensor passed to ``tf.nn.dynamic_rnn``.
        model_dropout_rate: dropout rate applied after the second dense layer.
        training: forwarded to ``tf.layers.dropout``. The default ``True``
            keeps the previous behaviour (dropout always active); pass
            ``False`` or a boolean tensor to disable dropout at evaluation
            time, where it otherwise makes predictions noisy.

    Returns:
        The ReLU-activated prediction tensor of width ``input_size``.
    """
    cell = tf.contrib.rnn.LSTMCell(lstm_size, state_is_tuple=True, activation=lstm_activation, use_peepholes=True)
    val1, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    # Move the time axis first and take the last time step's output.
    val = tf.transpose(val1, [1, 0, 2])
    last = tf.gather(val, int(val.get_shape()[0]) - 1, name="last_lstm_output")
    # hidden layer
    # Each dense layer uses its own activation; they were swapped before,
    # so the tuned hyperparameters were applied to the wrong layers.
    hidden1 = tf.layers.dense(last, units=hidden1_nodes, activation=hidden1_activation)
    hidden2 = tf.layers.dense(hidden1, units=hidden2_nodes, activation=hidden2_activation)
    dropout = tf.layers.dropout(hidden2, rate=model_dropout_rate, training=training)
    weight = tf.Variable(tf.truncated_normal([hidden2_nodes, input_size]))
    bias = tf.Variable(tf.constant(0.1, shape=[input_size]))
    prediction = tf.nn.relu(tf.matmul(dropout, weight) + bias)
    return prediction
# saver = tf.train.Saver()
# saver.save(sess, "checkpoints_sales/sales_pred.ckpt")
def _predict_and_score(sess, prediction, inputs, X, nonescaled_y):
    """Predict on ``X`` and score the rescaled, rounded output against ``nonescaled_y``.

    Returns:
        tuple: ``(pred_vals, true_vals, rmse, mae, mape, rmspe)``; the first two
        items are flat Python lists.
    """
    raw_pred = sess.run(prediction, {inputs: X})
    pred_vals = np.round(np.array(rescle(raw_pred)), 0).astype(np.int32).flatten().tolist()
    true_vals = nonescaled_y.flatten().tolist()
    rmse = sqrt(mean_squared_error(true_vals, pred_vals))
    mae = mean_absolute_error(true_vals, pred_vals)
    mape = mean_absolute_percentage_error(true_vals, pred_vals)
    rmspe = RMSPE(true_vals, pred_vals)
    return pred_vals, true_vals, rmse, mae, mape, rmspe


# NOTE(review): restored from the mangled "#use_named_args(...)" line. The
# optimizer calls the objective with a single parameter list, so the decorator
# is needed to unpack it into the keyword arguments below — confirm that the
# dimension names match these parameter names.
@use_named_args(dimensions=dimensions)
def fitness(lstm_num_steps, size, lstm_hidden1_nodes, lstm_hidden2_nodes, lstm_init_epoch, lstm_max_epoch,
            lstm_learning_rate_decay, lstm_batch_size, lstm_dropout_rate, lstm_init_learning_rate,
            lstm_hidden1_activation, lstm_hidden2_activation, lstm_lstm_activation):
    """Train one model with the given hyper-parameters and return its validation RMSE.

    Side effects:
        * Publishes the hyper-parameters through module-level globals (read by
          ``setupRNN`` and ``pre_process``).
        * Appends the validation errors to ``Error_file_name``.
        * When this run is the first one or beats ``lowest_error``: saves a
          checkpoint, evaluates on the test set, stores the best predictions in
          the ``best*`` globals and appends a row to
          ``best_withZero_addi_config.csv``.
        * Increments the global ``iteration`` counter.

    Returns:
        float: validation RMSE on the un-scaled data (lower is better, which is
        what the minimizer expects).
    """
    global bestValiPrediction, bestValiTrueVal, bestTestPrediction, bestTestTrueVal, \
        iteration, lowest_error, num_steps, lstm_size, hidden1_nodes, hidden2_nodes, \
        hidden1_activation, hidden2_activation, lstm_activation, init_epoch, max_epoch, \
        learning_rate_decay, dropout_rate, init_learning_rate

    num_steps = np.int32(lstm_num_steps)
    lstm_size = np.int32(size)
    batch_size = np.int32(lstm_batch_size)
    learning_rate_decay = np.float32(lstm_learning_rate_decay)
    init_epoch = np.int32(lstm_init_epoch)
    max_epoch = np.int32(lstm_max_epoch)
    hidden1_nodes = np.int32(lstm_hidden1_nodes)
    hidden2_nodes = np.int32(lstm_hidden2_nodes)
    dropout_rate = np.float32(lstm_dropout_rate)
    init_learning_rate = np.float32(lstm_init_learning_rate)
    hidden1_activation = lstm_hidden1_activation
    hidden2_activation = lstm_hidden2_activation
    lstm_activation = lstm_lstm_activation

    train_X, train_y, test_X, test_y, val_X, val_y, nonescaled_test_y, nonescaled_val_y = pre_process()

    inputs = tf.placeholder(tf.float32, [None, num_steps, features], name="inputs")
    targets = tf.placeholder(tf.float32, [None, input_size], name="targets")
    base_learning_rate = tf.placeholder(tf.float32, None, name="learning_rate")
    model_dropout_rate = tf.placeholder_with_default(0.0, shape=())
    global_step = tf.Variable(0, trainable=False)
    prediction = setupRNN(inputs, model_dropout_rate)

    # Keep the placeholder and the decayed tensor under separate names.
    # Previously the placeholder name was rebound to the decayed tensor and
    # that tensor was then fed directly, which overrode the decay entirely.
    decayed_learning_rate = tf.train.exponential_decay(
        learning_rate=base_learning_rate, global_step=global_step,
        decay_rate=learning_rate_decay, decay_steps=init_epoch, staircase=False)

    with tf.name_scope('loss'):
        model_loss = tf.losses.mean_squared_error(targets, prediction)
    with tf.name_scope('adam_optimizer'):
        train_step = tf.train.AdamOptimizer(decayed_learning_rate).minimize(
            model_loss, global_step=global_step)

    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer())
        for _ in range(max_epoch):
            for batch_X, batch_y in generate_batches(train_X, train_y, batch_size):
                sess.run(train_step, {
                    inputs: batch_X,
                    targets: batch_y,
                    base_learning_rate: init_learning_rate,
                    model_dropout_rate: dropout_rate,
                })

        # Validation runs with the dropout placeholder at its default (0.0).
        vali_pred_vals, vali_nonescaled_y, val_error, val_mae, val_mape, val_rmspe = \
            _predict_and_score(sess, prediction, inputs, val_X, nonescaled_val_y)
        with open(Error_file_name, "a") as f:
            csv.writer(f).writerow([fileName, val_error, val_mae, val_mape, val_rmspe])

        # The first run is the best so far by definition. Treating it like any
        # other improvement ensures the best* globals and the checkpoint exist
        # even when no later run beats it.
        if iteration == 0 or val_error < lowest_error:
            saver = tf.train.Saver()
            saver.save(sess, "checkpoints_sales/sales_pred.ckpt")

            test_pred_vals, test_nonescaled_y, test_error, test_mae, test_mape, test_rmspe = \
                _predict_and_score(sess, prediction, inputs, test_X, nonescaled_test_y)

            bestValiPrediction = vali_pred_vals
            bestValiTrueVal = vali_nonescaled_y
            bestTestPrediction = test_pred_vals
            bestTestTrueVal = test_nonescaled_y

            with open("best_withZero_addi_config.csv", "a") as f:
                csv.writer(f).writerow([
                    fileName, num_steps, lstm_size, hidden2_nodes, hidden2_activation,
                    hidden1_activation, hidden1_nodes, lstm_activation, init_epoch, max_epoch,
                    learning_rate_decay, dropout_rate, batch_size, init_learning_rate,
                    val_error, val_mae, val_mape, val_rmspe,
                    test_error, test_mae, test_mape, test_rmspe])
            lowest_error = val_error
    finally:
        # Always release the session and start the next run from an empty
        # graph; otherwise every call keeps adding nodes to the default graph.
        sess.close()
        tf.reset_default_graph()

    iteration += 1
    return val_error
# Script entry point: for every entry in fileNames, run a Bayesian
# hyper-parameter search with `fitness` as the objective, then write the best
# validation/test predictions and the optimizer result to disk.
if __name__ == '__main__':
start = time()
for i in range(len(fileNames)):
# Per-file state read and updated by fitness() through globals.
iteration = 0
lowest_error = 0.0
fileName = '{}{}{}'.format('/home/wso2/suleka/salesPred/', fileNames[i],'.csv')
Error_file_name = '{}{}{}'.format('all_validation_errors/errors_', fileNames[i], '.csv')
vali_data = '{}{}{}'.format('validation_data/vali__data_', fileNames[i], '.csv')
predic_data = '{}{}{}'.format('prediction_data/predic__data_', fileNames[i], '.csv')
Skopt_object_name = '{}{}{}'.format('/home/wso2/suleka/salesPred/skopt_objects/object_', fileNames[i], '.gz')
# presumably the per-column min/max used for (re)scaling this file — TODO confirm
column_min_max = column_min_max_all[i]
# Bayesian optimization using Gaussian Processes.
# acq_func -> https://arxiv.org/pdf/1807.02811.pdf
search_result = gp_minimize(func=fitness,
dimensions=dimensions,
acq_func='EI', # Expected Improvement.
n_calls=300,
x0=default_parameters,
random_state=randomState)
# One row per (true value, prediction) pair of the best validation run.
with open(vali_data, "w") as f:
writer = csv.writer(f)
writer.writerows(zip(bestValiTrueVal, bestValiPrediction))
# NOTE(review): zip() stops at its shortest argument, so the number of rows
# written here is capped by len(search_result.x) — confirm this is intended.
with open(predic_data, "w") as f:
writer = csv.writer(f)
writer.writerows(zip(bestTestTrueVal, bestTestPrediction, search_result.x))
# Clear the best-run globals so results cannot leak into the next file.
bestTestPrediction = None
bestValiPrediction = None
bestTestTrueVal = None
bestValiTrueVal = None
dump(search_result, Skopt_object_name, store_objective=True)
# print()
# plot()
# NOTE(review): the original nesting of the next two lines is not recoverable
# from this listing — confirm whether they belong before the loop.
atexit.register(endlog)
log("Start Program")
I have the following network where I am trying to do triplet loss:
# First, I have a custom convolution block:
class ConvBlock(nn.Module):
    """One or two 3x3 conv + PReLU stages, optionally followed by 2x2 max-pooling.

    ``mode`` selects what ``forward`` computes and returns:

    * ``0``: ``b1`` then pool; returns ``(pooled, b1_output)``
    * ``1``: ``b1``, ``b2`` then pool; returns ``(pooled, b2_output)``
    * ``2``: ``b1`` then ``b2``; returns the ``b2`` output only (no pooling)

    Args:
        ngpu: Stored on the instance; not used by this block itself.
        input_c: Number of input channels.
        output_c: Number of output channels.
        mode: One of 0, 1, 2 (see above).

    Raises:
        ValueError: If ``mode`` is not 0, 1 or 2.
    """

    _VALID_MODES = (0, 1, 2)

    def __init__(self, ngpu, input_c, output_c, mode=0):
        super(ConvBlock, self).__init__()
        if mode not in self._VALID_MODES:
            raise ValueError(
                "ConvBlock mode must be one of {}, got {!r}".format(self._VALID_MODES, mode))
        self.ngpu = ngpu
        self.input_c = input_c
        self.output_c = output_c
        self.mode = mode
        # The Sequential wrappers are kept so parameter names (e.g. "b1.0.weight")
        # stay the same as before.
        self.b1 = nn.Sequential(
            nn.Conv2d(input_c, output_c, 3, stride=1, padding=1),
            #nn.BatchNorm2d(output_c),
            nn.PReLU(),
        )
        self.b2 = nn.Sequential(
            nn.Conv2d(output_c, output_c, 3, stride=1, padding=1),
            #nn.BatchNorm2d(output_c),
            nn.PReLU(),
        )
        self.pool = nn.Sequential(
            nn.MaxPool2d(2, 2),
        )

    def forward(self, input):
        """Apply the block; the return shape depends on ``self.mode`` (see class doc)."""
        b1 = self.b1(input)
        if self.mode == 0:
            return self.pool(b1), b1
        if self.mode == 1:
            b2 = self.b2(b1)
            return self.pool(b2), b2
        if self.mode == 2:
            return self.b2(b1)
        # Only reachable if `mode` was changed after construction. Fail loudly
        # instead of silently returning None.
        raise ValueError("Unsupported ConvBlock mode: {!r}".format(self.mode))
I now have an encoder module:
class _Encoder(nn.Module):
    """Convolutional encoder producing a sampled latent vector, its mean and log-variance.

    Five ``ConvBlock`` stages are followed by two linear heads (``mean`` and
    ``logvar``) applied to the flattened feature map.

    Args:
        ngpu: Number of GPUs; values > 1 enable ``data_parallel`` in ``forward``.
        nc: Number of input channels.
        nef: Base number of encoder feature maps.
        out_size: Spatial size factor used to compute the flattened feature
            length ``nef * 8 * out_size * (out_size // 2)``.
        nz: Size of the latent vector.
    """

    def __init__(self, ngpu, nc, nef, out_size, nz):
        super(_Encoder, self).__init__()
        self.ngpu = ngpu
        self.nc = nc
        self.nef = nef
        self.out_size = out_size
        self.nz = nz
        self.c1 = ConvBlock(self.ngpu, nc, nef, 0)  # 3 - 64
        self.c2 = ConvBlock(self.ngpu, nef, nef*2, 0)  # 64-128
        self.c3 = ConvBlock(self.ngpu, nef*2, nef*4, 1)  # 128-256
        self.c4 = ConvBlock(self.ngpu, nef*4, nef*8, 1)  # 256 -512
        self.c5 = ConvBlock(self.ngpu, nef*8, nef*8, 2)  # 512-512
        # 8 because the channel depth grows from nef to nef*8.
        # Floor division keeps in_features an int; `out_size/2` is a float on
        # Python 3, which nn.Linear rejects.
        flat_features = nef * 8 * out_size * (out_size // 2)
        self.mean = nn.Linear(flat_features, nz)
        self.logvar = nn.Linear(flat_features, nz)

    def sampler(self, mean, logvar):
        """Reparametrization trick: return ``mean + eps * exp(0.5 * logvar)``.

        ``eps`` is standard-normal noise created on the GPU when the
        module-level ``args.cuda`` flag is set, otherwise on the CPU.
        """
        std = logvar.mul(0.5).exp_()
        if args.cuda:
            eps = torch.cuda.FloatTensor(std.size()).normal_()
        else:
            eps = torch.FloatTensor(std.size()).normal_()
        eps = Variable(eps)
        return eps.mul(std).add_(mean)

    def forward(self, input):
        """Encode ``input``.

        Returns:
            tuple: ``(latent_z, mean, logvar, skips)`` where ``skips`` is a dict
            with keys ``c1_x`` .. ``c4_x`` holding the pre-pooling features of
            the first four blocks, or ``None`` values when the module-level
            ``ADD_SKIP_CONNECTION`` flag is false.
        """
        batch_size = input.size(0)
        if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1:
            gpu_ids = range(self.ngpu)
            c1_out, c1_x = nn.parallel.data_parallel(self.c1, input, gpu_ids)
            c2_out, c2_x = nn.parallel.data_parallel(self.c2, c1_out, gpu_ids)
            c3_out, c3_x = nn.parallel.data_parallel(self.c3, c2_out, gpu_ids)
            c4_out, c4_x = nn.parallel.data_parallel(self.c4, c3_out, gpu_ids)
            hidden = nn.parallel.data_parallel(self.c5, c4_out, gpu_ids)
            hidden = hidden.view(batch_size, -1)
            mean = nn.parallel.data_parallel(self.mean, hidden, gpu_ids)
            logvar = nn.parallel.data_parallel(self.logvar, hidden, gpu_ids)
        else:
            c1_out, c1_x = self.c1(input)
            c2_out, c2_x = self.c2(c1_out)
            c3_out, c3_x = self.c3(c2_out)
            c4_out, c4_x = self.c4(c3_out)
            hidden = self.c5(c4_out)
            hidden = hidden.view(batch_size, -1)
            mean, logvar = self.mean(hidden), self.logvar(hidden)
        latent_z = self.sampler(mean, logvar)
        if ADD_SKIP_CONNECTION:
            skips = {"c1_x": c1_x, "c2_x": c2_x, "c3_x": c3_x, "c4_x": c4_x}
        else:
            skips = {"c1_x": None, "c2_x": None, "c3_x": None, "c4_x": None}
        return latent_z, mean, logvar, skips
I initialize my encoder as a single object:
# A single _Encoder instance is created and then moved to the GPU.
encoder = _Encoder(ngpu,nc,nef,out_size,nz)
encoder = encoder.cuda()
And then I am applying some functions:
# The same `self.encoder` object is called on x, y and z, so all three passes
# go through one module instance and therefore one set of parameters.
latent_x,mean_x,logvar_x,skip_x = self.encoder(x)
latent_y,mean_y,logvar_y,skip_y = self.encoder(y)
latent_z,mean_z,logvar_z,skip_z = self.encoder(z)
# Only the `mean` outputs enter the loss; the sampled latents, log-variances
# and skip features are not used below.
dist_a = F.pairwise_distance(mean_x, mean_y, 2)
dist_b = F.pairwise_distance(mean_x, mean_z, 2)
loss_triplet = triplet_loss(dist_a, dist_b, target)
# Standard update: clear old gradients, backpropagate, apply the step.
optimizer.zero_grad()
loss_triplet.backward()
optimizer.step()
I am starting to doubt whether the weights are actually being shared across the three encoder calls. Please help me check and tell me whether they are.
I am trying to implement a custom GRU layer in Keras 2.1.2 (py36_0), where I want to use the following gate equations:
zt = act ( Wz.ht-1 + xt )
rt = act ( Wr.ht-1 + xt )
ht = act ( Wh.(r * ht-1) + xt )
instead of Keras's current implementation of the gates:
zt = act ( Wz.ht-1 + Uzxt )
rt = act ( Wr.ht-1 + Urxt )
ht = act ( Wh.(r * ht-1) + Uhxt )
Customizing GRU cell for the data
class CGRUCell(Layer):
    """GRU cell that adds the input directly to each gate (no input kernel).

    Gate equations (the ``b_*`` terms exist only when ``use_bias=True``)::

        z_t  = rec_act(x_t + h_{t-1} . W_z + b_z)
        r_t  = rec_act(x_t + h_{t-1} . W_r + b_r)
        hh_t = act(x_t + (r_t * h_{t-1}) . W_h + b_h)
        h_t  = z_t * h_{t-1} + (1 - z_t) * hh_t

    Because ``x_t`` is added element-wise to a ``units``-wide projection, the
    input feature dimension must equal ``units``.

    The ``kernel_*`` arguments are accepted and serialized so the signature
    matches the usual GRU cell, but no input kernel is created.

    Changes from the earlier version:
        * The bias is now actually added to the recurrent projections. It used
          to be created but never used, which left it disconnected from the
          loss and produced ``None`` gradients.
        * ``implementation != 1`` no longer slices the input as if it were
          ``3 * units`` wide.
    """

    def __init__(self, units,
                 activation='tanh',
                 recurrent_activation='hard_sigmoid',
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 recurrent_initializer='orthogonal',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 recurrent_regularizer=None,
                 bias_regularizer=None,
                 kernel_constraint=None,
                 recurrent_constraint=None,
                 bias_constraint=None,
                 dropout=0.,
                 recurrent_dropout=0.,
                 implementation=1,
                 **kwargs):
        super(CGRUCell, self).__init__(**kwargs)
        self.units = units
        self.activation = activations.get(activation)
        self.recurrent_activation = activations.get(recurrent_activation)
        self.use_bias = use_bias
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.recurrent_initializer = initializers.get(recurrent_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.recurrent_constraint = constraints.get(recurrent_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        # Clamp both rates into [0, 1].
        self.dropout = min(1., max(0., dropout))
        self.recurrent_dropout = min(1., max(0., recurrent_dropout))
        self.implementation = implementation
        self.state_size = self.units
        self._dropout_mask = None
        self._recurrent_dropout_mask = None

    def build(self, input_shape):
        """Create the recurrent kernel and, if ``use_bias`` is set, the bias.

        Raises:
            ValueError: If the input feature dimension is known and differs
                from ``units``.
        """
        input_dim = input_shape[-1]
        if input_dim is not None and input_dim != self.units:
            raise ValueError(
                'CGRUCell adds the input directly to the gates, so the input '
                'dimension ({}) must equal `units` ({}).'.format(input_dim, self.units))

        # One (units, 3*units) matrix holding W_z | W_r | W_h side by side.
        self.recurrent_kernel = self.add_weight(
            shape=(self.units, self.units * 3),
            name='recurrent_kernel',
            initializer=self.recurrent_initializer,
            regularizer=self.recurrent_regularizer,
            constraint=self.recurrent_constraint)
        self.recurrent_kernel_z = self.recurrent_kernel[:, :self.units]
        self.recurrent_kernel_r = self.recurrent_kernel[:, self.units: self.units * 2]
        self.recurrent_kernel_h = self.recurrent_kernel[:, self.units * 2:]

        if self.use_bias:
            self.bias = self.add_weight(shape=(self.units * 3,),
                                        name='bias',
                                        initializer=self.bias_initializer,
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
            self.bias_z = self.bias[:self.units]
            self.bias_r = self.bias[self.units: self.units * 2]
            self.bias_h = self.bias[self.units * 2:]
        else:
            self.bias = None
            self.bias_z = None
            self.bias_r = None
            self.bias_h = None
        self.built = True

    def call(self, inputs, states, training=None):
        """Run one timestep and return ``(h, [h])``."""
        h_tm1 = states[0]  # previous memory

        # Dropout masks are created once and reused until the wrapping layer
        # resets them to None.
        if 0 < self.dropout < 1 and self._dropout_mask is None:
            self._dropout_mask = _generate_dropout_mask(
                _generate_dropout_ones(inputs, K.shape(inputs)[-1]),
                self.dropout,
                training=training,
                count=3)
        if (0 < self.recurrent_dropout < 1 and
                self._recurrent_dropout_mask is None):
            self._recurrent_dropout_mask = _generate_dropout_mask(
                _generate_dropout_ones(inputs, self.units),
                self.recurrent_dropout,
                training=training,
                count=3)

        # dropout matrices for input units
        dp_mask = self._dropout_mask
        # dropout matrices for recurrent units
        rec_dp_mask = self._recurrent_dropout_mask
        use_dropout = 0. < self.dropout < 1.
        use_rec_dropout = 0. < self.recurrent_dropout < 1.

        if self.implementation == 1:
            # The inputs are used as-is (they are already embeddings); only
            # the per-gate dropout masks differ.
            if use_dropout:
                x_z = inputs * dp_mask[0]
                x_r = inputs * dp_mask[1]
                x_h = inputs * dp_mask[2]
            else:
                x_z = x_r = x_h = inputs

            if use_rec_dropout:
                h_tm1_z = h_tm1 * rec_dp_mask[0]
                h_tm1_r = h_tm1 * rec_dp_mask[1]
                h_tm1_h = h_tm1 * rec_dp_mask[2]
            else:
                h_tm1_z = h_tm1_r = h_tm1_h = h_tm1

            rec_z = K.dot(h_tm1_z, self.recurrent_kernel_z)
            rec_r = K.dot(h_tm1_r, self.recurrent_kernel_r)
            if self.use_bias:
                rec_z = K.bias_add(rec_z, self.bias_z)
                rec_r = K.bias_add(rec_r, self.bias_r)
            z = self.recurrent_activation(x_z + rec_z)
            r = self.recurrent_activation(x_r + rec_r)

            rec_h = K.dot(r * h_tm1_h, self.recurrent_kernel_h)
            if self.use_bias:
                rec_h = K.bias_add(rec_h, self.bias_h)
            hh = self.activation(x_h + rec_h)
        else:
            if use_dropout:
                inputs = inputs * dp_mask[0]
            if use_rec_dropout:
                h_tm1 = h_tm1 * rec_dp_mask[0]

            # z and r share one matmul over the first 2*units kernel columns.
            matrix_inner = K.dot(h_tm1,
                                 self.recurrent_kernel[:, :2 * self.units])
            if self.use_bias:
                matrix_inner = K.bias_add(matrix_inner,
                                          self.bias[:2 * self.units])
            # The input is only `units` wide, so the same tensor feeds every
            # gate; it must not be sliced into three chunks.
            z = self.recurrent_activation(inputs + matrix_inner[:, :self.units])
            r = self.recurrent_activation(
                inputs + matrix_inner[:, self.units: 2 * self.units])

            recurrent_h = K.dot(r * h_tm1,
                                self.recurrent_kernel[:, 2 * self.units:])
            if self.use_bias:
                recurrent_h = K.bias_add(recurrent_h, self.bias_h)
            hh = self.activation(inputs + recurrent_h)

        h = z * h_tm1 + (1 - z) * hh
        if 0 < self.dropout + self.recurrent_dropout:
            if training is None:
                h._uses_learning_phase = True
        return h, [h]

    def get_config(self):
        """Return the constructor arguments merged with the base layer config."""
        config = {'units': self.units,
                  'activation': activations.serialize(self.activation),
                  'recurrent_activation': activations.serialize(self.recurrent_activation),
                  'use_bias': self.use_bias,
                  'kernel_initializer': initializers.serialize(self.kernel_initializer),
                  'recurrent_initializer': initializers.serialize(self.recurrent_initializer),
                  'bias_initializer': initializers.serialize(self.bias_initializer),
                  'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
                  'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer),
                  'bias_regularizer': regularizers.serialize(self.bias_regularizer),
                  'kernel_constraint': constraints.serialize(self.kernel_constraint),
                  'recurrent_constraint': constraints.serialize(self.recurrent_constraint),
                  'bias_constraint': constraints.serialize(self.bias_constraint),
                  'dropout': self.dropout,
                  'recurrent_dropout': self.recurrent_dropout,
                  'implementation': self.implementation}
        base_config = super(CGRUCell, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
class CGRU(RNN):
    """Recurrent layer that wraps a ``CGRUCell``.

    The constructor builds the cell from its own arguments and forwards the
    sequence-level options (``return_sequences``, ``stateful``, ...) to ``RNN``.
    The cell's hyper-parameters are exposed as read-only properties so that
    ``get_config`` can serialize them.
    """

    # NOTE(review): the listing shows "#interfaces.legacy_recurrent_support"
    # here, presumably a decorator that lost its "@". It is left disabled
    # because `interfaces` may not be imported — confirm before restoring.
    def __init__(self, units,
                 activation='tanh',
                 recurrent_activation='hard_sigmoid',
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 recurrent_initializer='orthogonal',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 recurrent_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 recurrent_constraint=None,
                 bias_constraint=None,
                 dropout=0.,
                 recurrent_dropout=0.,
                 implementation=1,
                 return_sequences=False,
                 return_state=False,
                 go_backwards=False,
                 stateful=False,
                 unroll=False,
                 **kwargs):
        if implementation == 0:
            warnings.warn('`implementation=0` has been deprecated, '
                          'and now defaults to `implementation=1`.'
                          'Please update your layer call.')
        cell = CGRUCell(units,
                        activation=activation,
                        recurrent_activation=recurrent_activation,
                        use_bias=use_bias,
                        kernel_initializer=kernel_initializer,
                        recurrent_initializer=recurrent_initializer,
                        bias_initializer=bias_initializer,
                        kernel_regularizer=kernel_regularizer,
                        recurrent_regularizer=recurrent_regularizer,
                        bias_regularizer=bias_regularizer,
                        kernel_constraint=kernel_constraint,
                        recurrent_constraint=recurrent_constraint,
                        bias_constraint=bias_constraint,
                        dropout=dropout,
                        recurrent_dropout=recurrent_dropout,
                        implementation=implementation)
        super(CGRU, self).__init__(cell,
                                   return_sequences=return_sequences,
                                   return_state=return_state,
                                   go_backwards=go_backwards,
                                   stateful=stateful,
                                   unroll=unroll,
                                   **kwargs)
        self.activity_regularizer = regularizers.get(activity_regularizer)

    def call(self, inputs, mask=None, training=None, initial_state=None):
        """Reset the cell's cached dropout masks, then defer to ``RNN.call``."""
        self.cell._dropout_mask = None
        self.cell._recurrent_dropout_mask = None
        return super(CGRU, self).call(inputs,
                                      mask=mask,
                                      training=training,
                                      initial_state=initial_state)

    # Read-only views of the wrapped cell's hyper-parameters. They must be
    # properties: get_config() below reads them as plain attributes, and
    # without the decorator it would serialize bound methods instead.
    @property
    def units(self):
        return self.cell.units

    @property
    def activation(self):
        return self.cell.activation

    @property
    def recurrent_activation(self):
        return self.cell.recurrent_activation

    @property
    def use_bias(self):
        return self.cell.use_bias

    @property
    def kernel_initializer(self):
        return self.cell.kernel_initializer

    @property
    def recurrent_initializer(self):
        return self.cell.recurrent_initializer

    @property
    def bias_initializer(self):
        return self.cell.bias_initializer

    @property
    def kernel_regularizer(self):
        return self.cell.kernel_regularizer

    @property
    def recurrent_regularizer(self):
        return self.cell.recurrent_regularizer

    @property
    def bias_regularizer(self):
        return self.cell.bias_regularizer

    @property
    def kernel_constraint(self):
        return self.cell.kernel_constraint

    @property
    def recurrent_constraint(self):
        return self.cell.recurrent_constraint

    @property
    def bias_constraint(self):
        return self.cell.bias_constraint

    @property
    def dropout(self):
        return self.cell.dropout

    @property
    def recurrent_dropout(self):
        return self.cell.recurrent_dropout

    @property
    def implementation(self):
        return self.cell.implementation

    def get_config(self):
        """Return a flat config (cell arguments inlined, no nested 'cell' entry)."""
        config = {'units': self.units,
                  'activation': activations.serialize(self.activation),
                  'recurrent_activation': activations.serialize(self.recurrent_activation),
                  'use_bias': self.use_bias,
                  'kernel_initializer': initializers.serialize(self.kernel_initializer),
                  'recurrent_initializer': initializers.serialize(self.recurrent_initializer),
                  'bias_initializer': initializers.serialize(self.bias_initializer),
                  'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
                  'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer),
                  'bias_regularizer': regularizers.serialize(self.bias_regularizer),
                  'activity_regularizer': regularizers.serialize(self.activity_regularizer),
                  'kernel_constraint': constraints.serialize(self.kernel_constraint),
                  'recurrent_constraint': constraints.serialize(self.recurrent_constraint),
                  'bias_constraint': constraints.serialize(self.bias_constraint),
                  'dropout': self.dropout,
                  'recurrent_dropout': self.recurrent_dropout,
                  'implementation': self.implementation}
        base_config = super(CGRU, self).get_config()
        # The cell is rebuilt from the flat arguments above, so the nested
        # entry produced by RNN.get_config() is dropped.
        del base_config['cell']
        return dict(list(base_config.items()) + list(config.items()))

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from ``get_config`` output, mapping implementation 0 to 1."""
        if 'implementation' in config and config['implementation'] == 0:
            config['implementation'] = 1
        return cls(**config)
Model Implementation is as follows:
# Two integer id sequences (users and items) are embedded to the same width
# and summed element-wise; the sum is the input sequence of the custom GRU.
user_input = Input(batch_shape=(batch_size, chunk_size,), dtype='int32', name='user_inputs')
user_emb = Embedding(input_dim=num_users+1, output_dim=out_dim, input_length=chunk_size)(user_input)
item_input = Input(batch_shape=(batch_size, chunk_size,), dtype='int32', name='item_inputs')
item_emb = Embedding(input_dim=num_items+1, output_dim=out_dim, input_length=chunk_size)(item_input)
inputs = keras.layers.add([user_emb, item_emb])
gru_args = {
    "units": hidden_size,
    "return_sequences": True,
    #"return_state":True,
    "stateful": True,
    "unroll": False
}
gru = CGRU(**gru_args)(inputs)
# Per-timestep softmax over the item vocabulary.
outputs = Dense(num_items+1, activation='softmax')(gru)
# The stray "[", "\[", "\]" and "][1]" fragments were markup residue and have
# been removed so the statements parse again.
recc_model = Model(inputs=[user_input, item_input], outputs=outputs)
recc_model.compile(optimizer='rmsprop',
                   loss='categorical_crossentropy',
                   metrics=[metrics.categorical_accuracy])
                   #metrics=[metrics.sparse_categorical_accuracy])
But when I run the code I get the following error, which seems to be caused by some gradients being computed as None:
ValueError: Tried to convert 'x' to a tensor and failed. Error: None values not supported.
Find the complete error here: https://pastebin.com/n9UzCRiP
The error occurs because the bias weights are added to the model but not used anywhere.
When you call self.add_weight(...), you have to make sure these weights are used somewhere in your model. Otherwise, since these weights are not connected to the loss tensor, TF cannot compute the gradient and an error will be raised.
If you don't need the bias weights, you can either remove the add_weight lines, or set use_bias=False in your cell.
Also, I think you don't need to re-implement a CGRU layer to use a custom cell. Just wrapping your custom cell with the built-in RNN layer should work:
# Wrap the custom cell in the generic RNN layer; use_bias=False means the cell
# creates no bias weight at all.
gru = RNN(CGRUCell(hidden_size, use_bias=False),
return_sequences=True,
stateful=True,
unroll=False)(inputs)
I am a beginner with Theano and am studying it now. I'd like to print the value and shape of a TensorVariable while a theano.function is running. When I used Python's print function, it ran before the Theano function was compiled, so I learned that print is useless here. Therefore I tried another approach: I added the following code to use theano.printing.Print.
(cce is the return value of theano.scan, so maybe it is not a symbolic variable.
Actually, I am confused by the concepts of TensorVariable and shared variable. Is a TensorVariable a kind of shared variable?)
# Attempt to print `cce` by compiling a separate function around a Print op.
x = theano.tensor.tensor3() # define data type
t_print =theano.printing.Print("cce value is : ")(x)
f = theano.function([x], t_print) # define theano.function
# `f` is called with `cce` itself rather than with a numeric array; this is
# the call that raises the TypeError quoted below.
f(cce) # call f (print value of cce)
Then the following error occurred:
TypeError: ('Bad input argument to theano function with name "seq2seq.py : 98" at index 0(0-based)', 'Expected an array-like object, but found a Variable: maybe you are trying to call a function on a (possibly shared) variable instead of a numeric array?')
Could you let me know how to correct this code so that it prints the value of cce (a TensorVariable)? Or is it impossible to print the value of a TensorVariable while a theano.function is running?
Thank you for reading my question.
ADDED -
Here is my source code; this is the big picture. The call to theano.function() is on the last line. loss_func is the categorical_crossentropy function. The last four lines are about the Theano function.
# Mean categorical cross-entropy between y_true and y_pred.
# Both tensors are flattened to rows of width voca_dim_g (a module-level
# global) and the loss is computed row by row with theano.scan.
def categorical_crossentropy(y_true, y_pred):
# Clip predictions into [epsilon, 1 - epsilon].
y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon)
y_pred = y_pred.reshape( (-1, voca_dim_g) )
y_true = y_true.reshape( (-1, voca_dim_g) )
# scan calls T.nnet.categorical_crossentropy(pred_row, true_row) for each row
# pair; the `updates` it returns are not used.
cce, updates = theano.scan(
fn=T.nnet.categorical_crossentropy,
sequences=[y_pred,y_true]
)
##### I want to print cce HERE #######
return T.mean(cce)
@staticmethod
def step(
    x_t, h_tm1, c_tm1,
    Ui, Wi, bi, Uf, Wf, bf,
    Uo, Wo, bo, Ug, Wg, bg
):
    """Compute one LSTM timestep.

    Declared as a static method (the decorator had lost its "@"): it takes no
    ``self``, yet it is invoked as ``self.step`` and passed to ``theano.scan``.

    Args:
        x_t: Input at the current timestep — presumably (n_samples, dim);
            TODO confirm against the caller.
        h_tm1: Hidden state from the previous timestep.
        c_tm1: Cell state from the previous timestep.
        Ui, Wi, bi: Input-gate input weights, recurrent weights and bias.
        Uf, Wf, bf: Forget-gate parameters.
        Uo, Wo, bo: Output-gate parameters.
        Ug, Wg, bg: Candidate (tanh) parameters.

    Returns:
        tuple: ``(h_t, c_t)``, the new hidden and cell states.
    """
    i_t = T.nnet.sigmoid(T.dot(x_t, Ui) + T.dot(h_tm1, Wi) + bi)
    f_t = T.nnet.sigmoid(T.dot(x_t, Uf) + T.dot(h_tm1, Wf) + bf)
    o_t = T.nnet.sigmoid(T.dot(x_t, Uo) + T.dot(h_tm1, Wo) + bo)
    g_t = T.tanh(T.dot(x_t, Ug) + T.dot(h_tm1, Wg) + bg)
    # Keep part of the old cell state and add the gated candidate.
    c_t = c_tm1 * f_t + g_t * i_t
    h_t = T.tanh(c_t) * o_t
    return h_t, c_t
#########################################################################################################################
def forward(self, X):
    """Scan ``self.step`` over ``X`` starting from the stored state.

    Returns:
        tuple: ``(states, updates)`` where ``states`` holds the per-step hidden
        and cell sequences and ``updates`` writes the last hidden/cell values
        back into ``self.h_tm1`` / ``self.c_tm1``.
    """
    gate_params = [
        self.Ui, self.Wi, self.bi,
        self.Uf, self.Wf, self.bf,
        self.Uo, self.Wo, self.bo,
        self.Ug, self.Wg, self.bg,
    ]
    # The update dictionary returned by scan is not used; only the state
    # carry-over updates below are returned.
    states, _ = theano.scan(
        fn=self.step,
        sequences=[X],
        outputs_info=[self.h_tm1, self.c_tm1],
        non_sequences=gate_params,
    )
    hidden_seq, cell_seq = states[0], states[1]
    carry_over = [(self.h_tm1, hidden_seq[-1]), (self.c_tm1, cell_seq[-1])]
    return states, carry_over
#########################################################################################################################
def encode(self, X):
    """Run ``forward`` on ``X`` and return the final hidden state, final cell state and updates."""
    states, updates = self.forward(X)
    hidden_seq, cell_seq = states[0], states[1]
    return hidden_seq[-1], cell_seq[-1], updates
def decode_step(
    self, y_t, h_tm1, c_tm1,
    Ui, Wi, bi, Uf, Wf, bf,
    Uo, Wo, bo, Ug, Wg, bg,
    Wh, bh
):
    """One decoder step: an LSTM step on the previous output, then a linear readout.

    Returns:
        tuple: ``(y_t, h_t, c_t)`` where ``y_t = h_t . Wh + bh``.
    """
    lstm_params = (Ui, Wi, bi, Uf, Wf, bf, Uo, Wo, bo, Ug, Wg, bg)
    h_t, c_t = self.step(y_t, h_tm1, c_tm1, *lstm_params)
    readout = T.dot(h_t, Wh) + bh
    return readout, h_t, c_t
def decode(self, h_tm1, c_tm1, timesteps):
    """Generate ``timesteps`` outputs by scanning ``decode_step`` from the given state.

    Returns:
        tuple: ``(outputs, updates)`` where ``outputs`` is the sequence of
        readouts and ``updates`` stores the last hidden/cell values into
        ``self.h_tm1`` / ``self.c_tm1``.
    """
    decoder_params = [
        self.Ui, self.Wi, self.bi,
        self.Uf, self.Wf, self.bf,
        self.Uo, self.Wo, self.bo,
        self.Ug, self.Wg, self.bg,
        self.Wh, self.bh,
    ]
    # The update dictionary returned by scan is not used.
    outputs, _ = theano.scan(
        fn=self.decode_step,
        outputs_info=[self.y_t, h_tm1, c_tm1],
        non_sequences=decoder_params,
        n_steps=timesteps,
    )
    readouts, hidden_seq, cell_seq = outputs[0], outputs[1], outputs[2]
    carry_over = [
        (self.h_tm1, hidden_seq[-1]),
        (self.c_tm1, cell_seq[-1]),
    ]
    return readouts, carry_over
# Build the training graph: encode the input sequence, decode as many steps as
# the target has, compute the loss and compile the training function.
h_tm1, c_tm1, updates_encode = encode(seq_input)
seq_predict, updates_decode = decode(h_tm1, c_tm1, T.shape(seq_target)[0])
# NOTE(review): categorical_crossentropy is declared as (y_true, y_pred) but is
# called here as (seq_predict, seq_target) — confirm the argument order.
loss = loss_func(seq_predict, seq_target)
# NOTE(review): `updates` is not defined in this excerpt; presumably it is
# built from updates_encode / updates_decode elsewhere — verify.
self._train = theano.function([seq_input, seq_target], loss, updates = updates)
Below is the full source code:
# -*- coding: utf-8 -*-
__modifier__ = "Lee Guk Beom, Lee Jae Sang, Jang Jae Kwang (alphabetical Order)"
import readFile
import numpy as np
import theano
import theano.tensor as T
from six.moves import zip
from theano.compile.debugmode import DebugMode
import nltk
import sys
import os
from nltk.tokenize import sent_tokenize
import codecs
#theano.config.optimizer='fast_compile'
#theano.config.exception_verbosity='high'
#theano.config.compute_test_value = 'warn'
# Small constant used for clipping predictions and for numerical stability in
# adadelta.
epsilon = 1e-6
# Float type that shared() casts every shared variable to.
dtype = theano.config.floatX
# Module-level settings; the zero / empty values below are placeholders,
# presumably overwritten once the data has been loaded — TODO confirm.
# minibatch_size_g is read by LSTM.__init__ and voca_dim_g by
# categorical_crossentropy.
minibatch_size_g = 0
longest_seq_g = 0
voca_dim_g = 0
n_time_step_input_g = 0
n_timestep_target_g = 0
word_to_index_input_g = dict()
word_to_index_targrt_g = dict()
index_to_word_target_g = dict()
#########################################################################################################################
def shared(value, name=None):
    """Return a Theano shared variable holding ``value`` cast to the module ``dtype``."""
    casted = value.astype(dtype)
    return theano.shared(casted, name=name)
#########################################################################################################################
def shared_zeros(shape, name=None):
    """Return a shared variable of the given shape filled with zeros."""
    zeros = np.zeros(shape)
    return shared(value=zeros, name=name)
#########################################################################################################################
def shared_zeros_like(x, name=None):
    """Return a zero-filled shared variable with the same shape as ``x``."""
    target_shape = x.shape
    return shared_zeros(shape=target_shape, name=name)
#########################################################################################################################
def init_weights(shape, name=None):
    """Return a shared weight array drawn uniformly from ``[-b, b)`` with ``b = sqrt(1 / shape[1])``."""
    limit = np.sqrt(1.0 / shape[1])
    initial = np.random.uniform(low=-limit, high=limit, size=shape)
    return shared(value=initial, name=name)
#########################################################################################################################
def adadelta(params, cost, lr=1.0, rho=0.95):
    """Build Adadelta update pairs for ``params`` with respect to ``cost``.

    Adapted from https://github.com/fchollet/keras/blob/master/keras/optimizers.py

    Args:
        params: Shared variables to optimize.
        cost: Symbolic scalar cost (cast to float32 before differentiation).
        lr: Multiplier applied to each computed step.
        rho: Decay rate of the two running averages.

    Returns:
        list: ``(shared_variable, new_value)`` pairs; for every parameter, in
        order: squared-gradient average, parameter, squared-step average.
    """
    cost = cost.astype('float32')
    gradients = T.grad(cost, params)
    sq_grad_avgs = [shared_zeros_like(p.get_value()) for p in params]
    sq_step_avgs = [shared_zeros_like(p.get_value()) for p in params]

    updates = []
    for param, grad, sq_grad_avg, sq_step_avg in zip(params, gradients, sq_grad_avgs, sq_step_avgs):
        next_sq_grad_avg = rho * sq_grad_avg + (1.0 - rho) * T.square(grad)
        # Step scaled by the ratio of the RMS of past steps to the RMS of gradients.
        step = grad * T.sqrt(sq_step_avg + epsilon) / T.sqrt(next_sq_grad_avg + epsilon)
        next_sq_step_avg = rho * sq_step_avg + (1.0 - rho) * T.square(step)
        updates.extend([
            (sq_grad_avg, next_sq_grad_avg),
            (param, param - lr * step),
            (sq_step_avg, next_sq_step_avg),
        ])
    return updates
#########################################################################################################################
def categorical_crossentropy(y_true, y_pred):
    """Return the mean categorical cross-entropy between ``y_true`` and ``y_pred``.

    Adapted from https://github.com/fchollet/keras/blob/master/keras/objectives.py

    Both tensors are flattened to rows of width ``voca_dim_g`` (a module-level
    global). Predictions are clipped into ``[epsilon, 1 - epsilon]`` first.

    Args:
        y_true: Target distributions; the last axis must have ``voca_dim_g`` entries.
        y_pred: Predicted distributions with the same layout as ``y_true``.

    Returns:
        Symbolic scalar: the mean of the per-row cross-entropies.
    """
    y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon)
    y_pred = y_pred.reshape((-1, voca_dim_g))
    y_true = y_true.reshape((-1, voca_dim_g))
    # Once both inputs are 2-D matrices, T.nnet.categorical_crossentropy
    # returns one value per row, so no per-row theano.scan is needed.
    cce = T.nnet.categorical_crossentropy(y_pred, y_true)
    return T.mean(cce)
#########################################################################################################################
def mean_square_error(y_true, y_pred):
    """Return the mean of the element-wise squared difference."""
    residual = y_pred - y_true
    return T.mean(T.square(residual))
#########################################################################################################################
class LSTM(object):
    """LSTM layer whose weights and recurrent state are Theano shared variables.

    ``size`` is the number of hidden units and ``dim`` the input width.

    NOTE(review): bias and state shapes are taken from the module-level
    ``minibatch_size_g`` at construction time, so that global has to be set
    before an instance is created.
    """

    def __init__(self, size, dim):
        self.size = size
        self.dim = dim
        shape_b = (minibatch_size_g, size)
        shape_U = (dim, size)
        shape_W = (size, size)
        # Recurrent state carried between calls of the compiled functions.
        self.h_tm1 = shared_zeros(shape_b, "h_tm1")
        self.c_tm1 = shared_zeros(shape_b, "c_tm1")
        # Input gate.
        self.Ui = init_weights(shape_U, "Ui")
        self.Wi = init_weights(shape_W, "Wi")
        self.bi = shared_zeros(shape_b, "bi")
        # Forget gate.
        self.Uf = init_weights(shape_U, "Uf")
        self.Wf = init_weights(shape_W, "Wf")
        self.bf = shared_zeros(shape_b, "bf")
        # Output gate.
        self.Uo = init_weights(shape_U, "Uo")
        self.Wo = init_weights(shape_W, "Wo")
        self.bo = shared_zeros(shape_b, "bo")
        # Candidate cell value.
        self.Ug = init_weights(shape_U, "Ug")
        self.Wg = init_weights(shape_W, "Wg")
        self.bg = shared_zeros(shape_b, "bg")
        self.params = [
            self.Ui, self.Wi, self.bi,
            self.Uf, self.Wf, self.bf,
            self.Uo, self.Wo, self.bo,
            self.Ug, self.Wg, self.bg
        ]

    def set_state(self, h, c):
        """Copy the values of the shared variables ``h`` and ``c`` into the state."""
        self.h_tm1.set_value(h.get_value())
        self.c_tm1.set_value(c.get_value())

    def reset_state(self):
        """Zero the recurrent state in place.

        The existing shared variables are overwritten rather than replaced:
        already-compiled Theano functions keep referring to the variables
        they were built with, so rebinding the attributes to new variables
        would leave the state used by those functions untouched.
        """
        for state in (self.h_tm1, self.c_tm1):
            state.set_value(np.zeros_like(state.get_value()))

    @staticmethod
    def step(
        x_t, h_tm1, c_tm1,
        Ui, Wi, bi, Uf, Wf, bf,
        Uo, Wo, bo, Ug, Wg, bg
    ):
        """Compute one LSTM time step and return ``(h_t, c_t)``.

        This has to be a static method: it is handed to ``theano.scan`` via
        ``self.step`` and takes no ``self`` parameter.

        ``theano.scan`` slices each sequence along its first axis, so for an
        input of shape (n_steps, n_samples, dim) each ``x_t`` is
        (n_samples, dim).
        """
        i_t = T.nnet.sigmoid(T.dot(x_t, Ui) + T.dot(h_tm1, Wi) + bi)
        f_t = T.nnet.sigmoid(T.dot(x_t, Uf) + T.dot(h_tm1, Wf) + bf)
        o_t = T.nnet.sigmoid(T.dot(x_t, Uo) + T.dot(h_tm1, Wo) + bo)
        g_t = T.tanh(T.dot(x_t, Ug) + T.dot(h_tm1, Wg) + bg)
        c_t = c_tm1 * f_t + g_t * i_t
        h_t = T.tanh(c_t) * o_t
        return h_t, c_t

    def forward(self, X):
        """Run the layer over ``X`` and return ``(states, updates)``.

        ``states`` is the scan output ``[h_sequence, c_sequence]``;
        ``updates`` stores the last hidden and cell state back into
        ``self.h_tm1`` / ``self.c_tm1``.
        """
        states, _ = theano.scan(
            fn=self.step,
            sequences=[X],
            outputs_info=[self.h_tm1, self.c_tm1],
            non_sequences=[
                self.Ui, self.Wi, self.bi,
                self.Uf, self.Wf, self.bf,
                self.Uo, self.Wo, self.bo,
                self.Ug, self.Wg, self.bg
            ]
        )
        updates = [(self.h_tm1, states[0][-1]), (self.c_tm1, states[1][-1])]
        return states, updates
#########################################################################################################################
class LSTMEncoder(LSTM):
    """LSTM used as an encoder: exposes the final hidden and cell state."""

    def encode(self, X):
        """Return ``(last_h, last_c, updates)`` after running over ``X``."""
        (h_seq, c_seq), state_updates = self.forward(X)
        return h_seq[-1], c_seq[-1], state_updates
class LSTMDecoder(LSTM):
    """LSTM decoder that feeds each projected output back in as the next input.

    On top of the inherited LSTM weights it owns an output projection
    (``Wh``, ``bh``) from the hidden size back to ``dim``.
    """

    def __init__(self, size, dim, h_tm1=None, c_tm1=None):
        super(LSTMDecoder, self).__init__(size=size, dim=dim)
        self.Wh = init_weights((size, dim), "Wh")
        self.bh = shared_zeros((minibatch_size_g, dim), "bh")
        # Compare against None explicitly instead of relying on the
        # truthiness of a Theano variable.
        if h_tm1 is None:
            h_tm1 = shared_zeros((minibatch_size_g, size), "h_tm1")
        if c_tm1 is None:
            c_tm1 = shared_zeros((minibatch_size_g, size), "c_tm1")
        self.h_tm1 = h_tm1
        self.c_tm1 = c_tm1
        # Initial "previous output" fed into the first decoding step.
        self.y_t = shared_zeros((minibatch_size_g, dim), "y_t")
        # self.decode_length = theano.shared(decode_length)
        self.params.append(self.Wh)
        self.params.append(self.bh)

    def decode_step(
        self, y_t, h_tm1, c_tm1,
        Ui, Wi, bi, Uf, Wf, bf,
        Uo, Wo, bo, Ug, Wg, bg,
        Wh, bh
    ):
        """Run one LSTM step on the previous output and project the new state."""
        h_t, c_t = self.step(
            y_t, h_tm1, c_tm1,
            Ui, Wi, bi, Uf, Wf, bf,
            Uo, Wo, bo, Ug, Wg, bg
        )
        y_t = T.dot(h_t, Wh) + bh
        return y_t, h_t, c_t

    def decode(self, h_tm1, c_tm1, timesteps):
        """Unroll the decoder for ``timesteps`` steps from the given state.

        Returns:
            ``(outputs, updates)`` where ``outputs`` is the sequence of
            projected outputs and ``updates`` stores the last hidden and
            cell state in ``self.h_tm1`` / ``self.c_tm1``.
        """
        outputs, _ = theano.scan(
            fn=self.decode_step,
            outputs_info=[self.y_t, h_tm1, c_tm1],
            non_sequences=[
                self.Ui, self.Wi, self.bi,
                self.Uf, self.Wf, self.bf,
                self.Uo, self.Wo, self.bo,
                self.Ug, self.Wg, self.bg,
                self.Wh, self.bh
            ],
            n_steps=timesteps
        )
        updates = [
            (self.h_tm1, outputs[1][-1]),
            (self.c_tm1, outputs[2][-1])
        ]
        # return T.flatten(outputs[0], 3), updates
        return outputs[0], updates

    @staticmethod
    def argmax(seq):
        """Return the argmax of ``seq`` along axis 2.

        Declared static because it takes no ``self`` parameter but is called
        through an instance.
        """
        return T.argmax(seq, axis=2)
#########################################################################################################################
class Seq2Seq(object):
    """Encoder/decoder pair with compiled predict, train and test functions."""

    def __init__(self, size, dim):
        self.encoder = LSTMEncoder(size, dim)
        self.decoder = LSTMDecoder(size, dim)
        self.params = []
        self.params += self.encoder.params
        self.params += self.decoder.params
        # Compiled Theano functions; populated by compile().
        self._predict = None
        self._train = None
        self._test = None

    def compile(self, loss_func, optimizer):
        """Build the symbolic graph and compile the three Theano functions.

        Args:
            loss_func: callable ``loss_func(y_true, y_pred)`` returning a
                scalar loss.
            optimizer: callable ``optimizer(params, loss)`` returning a list
                of update pairs.
        """
        seq_input = T.tensor3()
        seq_target = T.tensor3()
        decode_timesteps = T.iscalar()
        h_tm1, c_tm1, updates_encode = self.encoder.encode(seq_input)

        # Prediction path: the caller chooses how many steps to decode.
        seq_predict_flex, updates_decode_flex = self.decoder.decode(
            h_tm1, c_tm1, decode_timesteps)
        seq_argmax = self.decoder.argmax(seq_predict_flex)

        # Training/test path: decode exactly as many steps as the target has.
        seq_predict, updates_decode = self.decoder.decode(
            h_tm1, c_tm1, T.shape(seq_target)[0])
        # The loss functions in this file are declared as (y_true, y_pred),
        # so the target goes first and the prediction second.
        loss = loss_func(seq_target, seq_predict)

        self._predict = theano.function(
            [seq_input, decode_timesteps], seq_argmax,
            updates=updates_encode + updates_decode_flex)
        self._test = theano.function(
            [seq_input, seq_target], loss,
            updates=updates_encode + updates_decode)
        updates = []
        updates += updates_encode
        updates += updates_decode
        updates += optimizer(self.params, loss)
        self._train = theano.function(
            [seq_input, seq_target], loss, updates=updates)

    def predict(self, seq_input, decode_timesteps):
        """Reset both states and return the argmax indices of the decoded output."""
        self.encoder.reset_state()
        self.decoder.reset_state()
        return self._predict(seq_input, decode_timesteps)

    def train(self, seq_input, seq_target):
        """Reset both states, apply one optimizer step and return the loss."""
        self.encoder.reset_state()
        self.decoder.reset_state()
        return self._train(seq_input, seq_target)

    def test(self, seq_input, seq_target):
        """Reset both states and return the loss without updating the weights."""
        self.encoder.reset_state()
        self.decoder.reset_state()
        return self._test(seq_input, seq_target)
#########################################################################################################################
def train(x, target):
    """Run one training call per (input, target) minibatch pair and print the result.

    Uses the module-level ``seq2seq`` model; each batch is cast to ``dtype``
    before it is passed on.
    """
    for batch_in, batch_out in zip(x, target):
        batch_in = batch_in.astype(dtype)
        batch_out = batch_out.astype(dtype)
        print("result of train function(loss or update) :", seq2seq.train(batch_in, batch_out))
#########################################################################################################################
# Save the weight information to a pickle file.
# This covers the weights of the encoder and decoder classes used by the Seq2Seq class.
# The encoder and decoder classes should have a function that returns the values of their weight variables.
# A single list contains the elements that hold the weights' information.
def save_weight():
    """Placeholder: saving the weights is not implemented; returns None."""
    return None
#########################################################################################################################
def gen_processed_seq(input_sentence):
    """Tokenize one sentence and wrap it as a single padded minibatch.

    The sentence is tokenized, mapped to word indices using
    ``word_to_index_input_g``, and placed in row 0 of a batch with
    ``minibatch_size_g`` rows. Every other row is the padding sequence
    ``[-1]``.

    Returns:
        A one-element list holding that batch (a list of index lists).
    """
    # Imported locally so the function does not depend on a module-level
    # nltk import being present in this script.
    import nltk

    tokens = nltk.word_tokenize(input_sentence)
    seq_input = readFile.word_to_idx([tokens], word_to_index_input_g)

    # Padding rows are created unconditionally; they do not depend on the
    # length of the input sentence.
    padding_rows = [[-1] for _ in range(minibatch_size_g - 1)]
    return [[seq_input[0]] + padding_rows]
#########################################################################################################################
def gen_one_hot(input_len, input_seq):
    """One-hot encode ``input_seq`` for prediction and cast entry 0 to ``dtype``.

    ``input_len`` is not used; the module-level ``n_time_step_input_g`` is
    passed to ``readFile.seq_to_1hot`` instead.
    """
    encoded = readFile.seq_to_1hot(n_time_step_input_g, input_seq, "predict", 1, 1)
    first_batch = encoded[0]
    encoded[0] = first_batch.astype(dtype)
    print("one_hot : ", encoded)
    return encoded
def get_idx(argmax, num_of_word):
    """Return the first ``num_of_word`` entries of column 0 of ``argmax``."""
    first_column = argmax[:, 0]
    return first_column[:num_of_word]
#########################################################################################################################
def predict():
    """Read an English sentence from the user and print the model's translation.

    The sentence is converted to a one-hot batch, decoded for
    ``n_time_step_input_g`` steps with the module-level ``seq2seq`` model,
    and the predicted indices of the first batch column are mapped back to
    words via ``index_to_word_target_g``.
    """
    input_sentence = raw_input("Input the English Sentence You Want to Translate into Spanish : ")
    input_seq = gen_processed_seq(input_sentence)
    first_sentence = input_seq[0][0]
    print("input_seq[0][0] : ",first_sentence)
    num_of_word = len(first_sentence)

    one_hot = gen_one_hot(n_time_step_input_g, input_seq)
    argmax = seq2seq.predict(one_hot[0], n_time_step_input_g)
    print("argmax_fin shape : ", argmax.shape)
    print("argmax_fin : ", argmax)

    idx_list_py = get_idx(argmax, num_of_word).tolist()
    print("index_to_word_target_g : ",index_to_word_target_g)
    print("index_to_word_target_g[6] :", index_to_word_target_g[6])

    result = readFile.idx_to_word(idx_list_py, index_to_word_target_g)
    # Every word is followed by a single space, including the last one.
    translated = "".join(word + " " for word in result)
    print("translated : " , translated)
    print("Translation End")
#########################################################################################################################
def gen_global_var(word_to_index_input, word_to_index_targrt, voca_dim, si, st, index_to_word_target):
    """Publish the preprocessing results as module-level globals.

    Sets the vocabulary lookups, the vocabulary width (``voca_dim + 2``),
    the minibatch size (axis 1 of ``si[0]``) and the number of input/target
    time steps (axis 0 of ``si[0]`` / ``st[0]``).
    """
    global word_to_index_input_g, word_to_index_targrt_g, voca_dim_g
    global minibatch_size_g, n_time_step_input_g, n_timestep_target_g
    global index_to_word_target_g

    # Vocabulary lookups and width.
    word_to_index_input_g = word_to_index_input
    word_to_index_targrt_g = word_to_index_targrt
    voca_dim_g = voca_dim + 2

    # Sizes taken from the first input and target batch.
    minibatch_size_g = si[0].shape[1]
    n_time_step_input_g = si[0].shape[0]
    n_timestep_target_g = st[0].shape[0]

    index_to_word_target_g = index_to_word_target
#########################################################################################################################
def menu(si, st):
    """Placeholder: takes the input and target batches, does nothing, returns None."""
    return None
#########################################################################################################################
def gen_object():
    """Placeholder that creates nothing and returns None."""
    result = None
    return result
#########################################################################################################################
if __name__ == "__main__":
    # six.moves.input is raw_input on Python 2 and input on Python 3, so the
    # selection is always read as plain text and never evaluated as code.
    from six.moves import input as read_line

    # Preprocess the corpus, publish the derived sizes as globals, then
    # build and compile the model once before entering the menu loop.
    (si, st, maxlen_input, minibatch_size, voca_dim, word_to_index_input,
     word_to_index_targrt, index_to_word_target) = readFile.preprocessing()
    gen_global_var(word_to_index_input, word_to_index_targrt, voca_dim, si, st, index_to_word_target)
    seq2seq = Seq2Seq(n_time_step_input_g, voca_dim_g)
    seq2seq.compile(loss_func=categorical_crossentropy, optimizer=adadelta)

    while True:
        print("select a menu")
        print("1. Training")
        print("2. Translate specific English sentence into Spanish.")
        val = read_line("selection : ").strip()
        # Any other input simply shows the menu again.
        if val == "1":
            train(si, st)
        elif val == "2":
            predict()
And readFile.py is:
import numpy as np
import itertools
import nltk
import sys
import os
from nltk.tokenize import sent_tokenize
import codecs
# Token appended to the vocabulary; word_to_idx substitutes it for words that
# are not in the vocabulary.
unknown_token = 'UNKNOWN_TOKEN'
# Markers inserted at the start and end of every tokenized sentence.
start_token = '_S'
end_token = '__E'
# Running counter incremented by preprocessing().
num_of_seq = 0
# Corpus files read by preprocessing(): input (English) and target (Spanish).
input_path = "./europarl-v7.es-en.en"
target_path = "./europarl-v7.es-en.es"
# Number of sentences grouped into one minibatch.
minibatch_unit = 100
# Vocabulary size including the unknown token; one-hot vectors are
# voca_dim + 2 wide.
voca_dim = 3000
# Maximum number of lines file_tokenize reads from each file.
SEQ_NUM_LIMIT = 1000
##########################################################################################
def file_tokenize(file):
    """Tokenize a UTF-8 text file line by line.

    Each line is stripped, tokenized with ``nltk.word_tokenize`` and wrapped
    in ``start_token`` / ``end_token``. Reading stops once ``SEQ_NUM_LIMIT``
    lines have been processed.

    Args:
        file: path of the file to read.

    Returns:
        ``(sentences, total_sentence_num)``: the list of token lists and the
        number of lines that were tokenized.
    """
    sentences = []
    total_sentence_num = 0
    # The context manager closes the file even if tokenizing raises.
    with codecs.open(file, "r", "utf-8") as f:
        for i, line in enumerate(f):
            print("tokenized Sentence No." , i)
            # strip() removes the newline character at the end of the line.
            tokenized_seq = nltk.word_tokenize(line.strip())
            tokenized_seq.insert(0, start_token)
            tokenized_seq.append(end_token)
            sentences.append(tokenized_seq)
            total_sentence_num += 1
            if total_sentence_num == SEQ_NUM_LIMIT:
                break
    return sentences, total_sentence_num
##########################################################################################
# Count the word frequencies
def cntWordFreq(sentences):
    """Return an ``nltk.FreqDist`` over all tokens of all sentences."""
    all_tokens = itertools.chain.from_iterable(sentences)
    return nltk.FreqDist(all_tokens)
##########################################################################################
# Get the most common words and build index_to_word and word_to_index vectors
def build_WordToIdx_IdxtoWord(word_freq):
    """Build the vocabulary lookups from a frequency distribution.

    The ``voca_dim - 1`` most common words are kept, in that order, and
    ``unknown_token`` is appended as the last entry.

    Returns:
        ``(index_to_word, word_to_index)``: the word list and the dict that
        maps each word to its position in that list.
    """
    index_to_word = [word for word, _count in word_freq.most_common(voca_dim - 1)]
    index_to_word.append(unknown_token)
    word_to_index = {word: idx for idx, word in enumerate(index_to_word)}
    return index_to_word, word_to_index
##########################################################################################
# change word to index
def word_to_idx(sequences, word_to_index):
    """Replace every sentence in ``sequences`` by its list of word indices, in place.

    Words missing from ``word_to_index`` are first replaced by
    ``unknown_token``; if that token is missing as well the index is ``-1``.
    The mutated ``sequences`` list is also returned.
    """
    for pos, sentence in enumerate(sequences):
        known = [tok if tok in word_to_index else unknown_token for tok in sentence]
        sequences[pos] = [word_to_index.get(tok, -1) for tok in known]
    return sequences
##########################################################################################
def idx_to_word(seq, index_to_word):
    """Replace every index in ``seq`` by ``index_to_word[index]``, in place.

    The same (mutated) list is returned.
    """
    seq[:] = [index_to_word[idx] for idx in seq]
    return seq
##########################################################################################
def sortByLen(seqs_input, seqs_target):
    """Bucket sentence pairs by the length of the input sentence.

    Bucket ``n`` of both returned lists holds the input (respectively
    target) sentences whose *input* sentence has length ``n``, so paired
    sentences stay at the same position in matching buckets.

    Returns:
        ``(seqs_sorted_input, seqs_sorted_target, max_len_input,
        max_len_target)``. Both bucket lists have ``max_len_input + 1``
        entries; the maxima are 0 for empty input.
    """
    # "or [0]" keeps max() defined for an empty sequence list.
    max_len_input = max([len(sentence) for sentence in seqs_input] or [0])
    max_len_target = max([len(sentence) for sentence in seqs_target] or [0])

    # Both bucket lists are indexed by input length, hence the same size.
    seqs_sorted_input = [[] for _ in range(max_len_input + 1)]
    seqs_sorted_target = [[] for _ in range(max_len_input + 1)]
    for sentence_input, sentence_target in zip(seqs_input, seqs_target):
        bucket = len(sentence_input)
        seqs_sorted_input[bucket].append(sentence_input)
        seqs_sorted_target[bucket].append(sentence_target)
    return seqs_sorted_input, seqs_sorted_target, max_len_input, max_len_target
##########################################################################################
def find_maxlen(sentence_group):
    """Return the length of the longest sequence in ``sentence_group`` (0 if empty)."""
    # "or [0]" keeps max() defined for an empty group.
    return max([len(seq) for seq in sentence_group] or [0])
##########################################################################################
def sort_by_timestep(sentence_group):
    """Pad a group of index sequences with -1 and return it time-major.

    The result has one row per time step and one column per sentence.
    """
    # The group is passed on as-is: converting a ragged list with
    # np.asarray first is unnecessary and raises ValueError on recent NumPy
    # releases, while apply_to_m1 only needs len() and iteration.
    padded = apply_to_m1(sentence_group)
    return padded.transpose()
##########################################################################################
def seq_to_1hot(max_len, sorted_sentences, type, minibatch_unit, num_of_seq):
    """One-hot encode every sentence group as a time-major 3-D array.

    For each non-empty group the result entry has shape
    ``(max_seq_len, batch, voca_dim + 2)``, where ``batch`` is
    ``minibatch_unit`` or the number of sentences in the group, whichever is
    larger. Entry ``[t, b, w]`` is 1 when word ``w`` is at time step ``t``
    of sentence ``b``. Padding positions (index -1) and unused batch rows
    stay all-zero.

    Args:
        max_len: unused, kept for interface compatibility.
        sorted_sentences: list of sentence groups (lists of index lists).
        type: unused, kept for interface compatibility.
        minibatch_unit: minimum batch width of every encoded group.
        num_of_seq: unused, kept for interface compatibility.

    Returns:
        A list with one array per group; entries for empty groups, or groups
        whose first sentence is empty, are ``None``.
    """
    width = voca_dim + 2
    one_hot = [None] * len(sorted_sentences)
    for i, sentence_group in enumerate(sorted_sentences):
        if not sentence_group or len(sentence_group[0]) == 0:
            continue
        # Shape (max_seq_len, n_seqs), padded with -1.
        time_step_seq = sort_by_timestep(sentence_group)
        max_seq_len, n_seqs = time_step_seq.shape
        batch = max(minibatch_unit, n_seqs)
        encoded = np.zeros((max_seq_len, batch, width))
        # Index (time step, sentence) explicitly. np.nditer over the
        # transposed view walks the data in memory order, which ignores the
        # transpose and scatters words into the wrong slots.
        steps, rows = np.nonzero(time_step_seq != -1)
        encoded[steps, rows, time_step_seq[steps, rows]] = 1
        one_hot[i] = encoded
    return one_hot
##########################################################################################
def apply_to_m1(lst, dtype=np.int64):
    """Pad a list of sequences with -1 into a rectangular array.

    Args:
        lst: sequence of sequences of numbers (may be ragged).
        dtype: dtype of the returned array.

    Returns:
        Array of shape ``(len(lst), longest_row)`` in which row ``i`` starts
        with the values of ``lst[i]`` and is filled up with -1. An empty
        ``lst`` gives an empty ``(0, 0)`` array.
    """
    if len(lst) == 0:
        return np.full((0, 0), -1, dtype=dtype)
    inner_max_len = max(len(row) for row in lst)
    result = np.full((len(lst), inner_max_len), -1, dtype=dtype)
    for i, row in enumerate(lst):
        if len(row):
            result[i, :len(row)] = row
    return result
##########################################################################################
def seq_group_by_mini_batch_size(minibatch_unit, sorted_seq, num_of_seq):
    """Regroup length-bucketed sequences into minibatches of ``minibatch_unit``.

    Non-empty sequences are taken bucket by bucket, in order, and appended
    to consecutive batches of at most ``minibatch_unit`` sequences.
    ``num_of_seq // minibatch_unit + 1`` batches are allocated up front.

    Returns:
        The list of batches. If an empty batch is found, the list is cut to
        ``minibatch_seq[:i - 1]``, where ``i`` is the index of the first
        empty batch.
    """
    # Floor division keeps the batch count an int on Python 3 as well.
    n_batches = (num_of_seq // minibatch_unit) + 1
    minibatch_seq = [[] for _ in range(n_batches)]

    idx = 0
    cnt = 0
    for seqs in sorted_seq:
        if not seqs:
            continue
        for seq in seqs:
            if not seq:
                continue
            minibatch_seq[idx].append(seq)
            cnt += 1
            if cnt == minibatch_unit:
                cnt = 0
                idx += 1

    # NOTE(review): this drops the batch just before the first empty one as
    # well (and uses [:-1] when the first batch is empty). It looks like a
    # way to discard a trailing partial batch; confirm before changing.
    for i, batch in enumerate(minibatch_seq):
        if batch == []:
            minibatch_seq = minibatch_seq[:i - 1]
            break
    return minibatch_seq
##########################################################################################
def preprocessing():
    """Run the full preprocessing pipeline on the input and target corpora.

    Steps: tokenize both files, count word frequencies, build the
    vocabularies, convert words to indices, bucket the sentence pairs by
    input length, regroup them into minibatches and one-hot encode them.

    Returns:
        ``(_1hot_input, _1hot_target, maxlen_input, minibatch_unit, voca_dim,
        word_to_index_input, word_to_index_targrt, index_to_word_target)``.
    """
    global num_of_seq
    global minibatch_unit

    print("Start Preprocessing")
    sentences_input, total_sentence_num = file_tokenize(input_path)
    sentences_target, total_sentence_num_target = file_tokenize(target_path)
    print("FINISHED : file_tokenize ")

    word_freq_input = cntWordFreq(sentences_input)
    word_freq_target = cntWordFreq(sentences_target)
    print("FINISHED : cntWordFreq ")

    index_to_word_input, word_to_index_input = build_WordToIdx_IdxtoWord(word_freq_input)
    index_to_word_target, word_to_index_targrt = build_WordToIdx_IdxtoWord(word_freq_target)
    print("FINISHED : build_WordToIdx_IdxtoWord ")

    # word_to_idx rewrites the sentence lists in place and returns them.
    seqs_input = word_to_idx(sentences_input, word_to_index_input)
    seqs_target = word_to_idx(sentences_target, word_to_index_targrt)
    print("FINISHED : word_to_idx ")

    seqs_sorted_input, seqs_sorted_target, maxlen_input, maxlen_target = sortByLen(seqs_input, seqs_target)
    print("FINISHED : sortByLen ")

    # NOTE(review): seqs_input holds one index list per sentence here, so the
    # inner loop visits word indices and counts the truthy (non-zero) ones
    # rather than sentences. Confirm whether a sentence count was intended.
    num_of_seq += sum(1 for seqs in seqs_input if seqs for seq in seqs if seq)

    seq_by_mini_batch_size_input = seq_group_by_mini_batch_size(minibatch_unit, seqs_sorted_input, num_of_seq)
    seq_by_mini_batch_size_target = seq_group_by_mini_batch_size(minibatch_unit, seqs_sorted_target, num_of_seq)
    print("FINISHED : seq_group_by_mini_batch_size ")

    _1hot_input = seq_to_1hot(maxlen_input, seq_by_mini_batch_size_input, "input",minibatch_unit, num_of_seq)
    _1hot_target = seq_to_1hot(maxlen_target, seq_by_mini_batch_size_target, "target",minibatch_unit, num_of_seq)
    print("FINISHED : seq_to_1hot ")

    # Only the returned minibatch size is capped; batching above has already
    # used the previous value.
    if total_sentence_num < minibatch_unit:
        minibatch_unit = total_sentence_num
    print("exit preprocessing")
    return (_1hot_input, _1hot_target, maxlen_input, minibatch_unit, voca_dim,
            word_to_index_input, word_to_index_targrt, index_to_word_target)