How can I implement custom GRU in keras - python-3.x

I am trying to implement a custom GRU layer in keras 2.1.2-py36_0 where I want to use the following gate equations:
z_t = act( Wz . h_{t-1} + x_t )
r_t = act( Wr . h_{t-1} + x_t )
h_t = act( Wh . (r_t * h_{t-1}) + x_t )
instead of Keras's current implementation of the gates:
z_t = act( Wz . h_{t-1} + Uz . x_t )
r_t = act( Wr . h_{t-1} + Ur . x_t )
h_t = act( Wh . (r_t * h_{t-1}) + Uh . x_t )
Customizing GRU cell for the data
class CGRUCell(Layer):
def __init__(self, units,
activation='tanh',
recurrent_activation='hard_sigmoid',
use_bias=True,
kernel_initializer='glorot_uniform',
recurrent_initializer='orthogonal',
bias_initializer='zeros',
kernel_regularizer=None,
recurrent_regularizer=None,
bias_regularizer=None,
kernel_constraint=None,
recurrent_constraint=None,
bias_constraint=None,
dropout=0.,
recurrent_dropout=0.,
implementation=1,
**kwargs):
super(CGRUCell, self).__init__(**kwargs)
self.units = units
self.activation = activations.get(activation)
self.recurrent_activation = activations.get(recurrent_activation)
self.use_bias = use_bias
self.kernel_initializer = initializers.get(kernel_initializer)
self.recurrent_initializer = initializers.get(recurrent_initializer)
self.bias_initializer = initializers.get(bias_initializer)
self.kernel_regularizer = regularizers.get(kernel_regularizer)
self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
self.bias_regularizer = regularizers.get(bias_regularizer)
self.kernel_constraint = constraints.get(kernel_constraint)
self.recurrent_constraint = constraints.get(recurrent_constraint)
self.bias_constraint = constraints.get(bias_constraint)
self.dropout = min(1., max(0., dropout))
self.recurrent_dropout = min(1., max(0., recurrent_dropout))
self.implementation = implementation
self.state_size = self.units
self._dropout_mask = None
self._recurrent_dropout_mask = None
def build(self, input_shape):
input_dim = input_shape[-1]
#self.kernel = self.add_weight(shape=(input_dim, self.units * 3),
# name='kernel',
# initializer=self.kernel_initializer,
# regularizer=self.kernel_regularizer,
# constraint=self.kernel_constraint)
self.recurrent_kernel = self.add_weight(
shape=(self.units, self.units * 3),
name='recurrent_kernel',
initializer=self.recurrent_initializer,
regularizer=self.recurrent_regularizer,
constraint=self.recurrent_constraint)
if self.use_bias:
self.bias = self.add_weight(shape=(self.units * 3,),
name='bias',
initializer=self.bias_initializer,
regularizer=self.bias_regularizer,
constraint=self.bias_constraint)
else:
self.bias = None
#self.kernel_z = self.kernel[:, :self.units]
self.recurrent_kernel_z = self.recurrent_kernel[:, :self.units]
#self.kernel_r = self.kernel[:, self.units: self.units * 2]
self.recurrent_kernel_r = self.recurrent_kernel[:,
self.units:
self.units * 2]
#self.kernel_h = self.kernel[:, self.units * 2:]
self.recurrent_kernel_h = self.recurrent_kernel[:, self.units * 2:]
if self.use_bias:
self.bias_z = self.bias[:self.units]
self.bias_r = self.bias[self.units: self.units * 2]
self.bias_h = self.bias[self.units * 2:]
else:
self.bias_z = None
self.bias_r = None
self.bias_h = None
self.built = True
def call(self, inputs, states, training=None):
h_tm1 = states[0] # previous memory
if 0 < self.dropout < 1 and self._dropout_mask is None:
self._dropout_mask = _generate_dropout_mask(
_generate_dropout_ones(inputs, K.shape(inputs)[-1]),
self.dropout,
training=training,
count=3)
if (0 < self.recurrent_dropout < 1 and
self._recurrent_dropout_mask is None):
self._recurrent_dropout_mask = _generate_dropout_mask(
_generate_dropout_ones(inputs, self.units),
self.recurrent_dropout,
training=training,
count=3)
# dropout matrices for input units
dp_mask = self._dropout_mask
# dropout matrices for recurrent units
rec_dp_mask = self._recurrent_dropout_mask
if self.implementation == 1:
if 0. < self.dropout < 1.:
inputs_z = inputs * dp_mask[0]
inputs_r = inputs * dp_mask[1]
inputs_h = inputs * dp_mask[2]
else:
inputs_z = inputs
inputs_r = inputs
inputs_h = inputs
print(inputs)
# Custom implementation of inputs which are already embedding parameters
#x_z = K.dot(inputs_z, self.kernel_z)
#x_r = K.dot(inputs_r, self.kernel_r)
#x_h = K.dot(inputs_h, self.kernel_h)
#if self.use_bias:
# x_z = K.bias_add(x_z, self.bias_z)
# x_r = K.bias_add(x_r, self.bias_r)
# x_h = K.bias_add(x_h, self.bias_h)
x_z = inputs_z
x_r = inputs_r
x_h = inputs_h
if 0. < self.recurrent_dropout < 1.:
h_tm1_z = h_tm1 * rec_dp_mask[0]
h_tm1_r = h_tm1 * rec_dp_mask[1]
h_tm1_h = h_tm1 * rec_dp_mask[2]
else:
h_tm1_z = h_tm1
h_tm1_r = h_tm1
h_tm1_h = h_tm1
z = self.recurrent_activation(x_z + K.dot(h_tm1_z,
self.recurrent_kernel_z))
r = self.recurrent_activation(x_r + K.dot(h_tm1_r,
self.recurrent_kernel_r))
hh = self.activation(x_h + K.dot(r * h_tm1_h,
self.recurrent_kernel_h))
else:
if 0. < self.dropout < 1.:
inputs *= dp_mask[0]
# Custom implementation of inputs which are already embedding parameters
#matrix_x = K.dot(inputs, self.kernel)
#if self.use_bias:
# matrix_x = K.bias_add(matrix_x, self.bias)
matrix_x = inputs
if 0. < self.recurrent_dropout < 1.:
h_tm1 *= rec_dp_mask[0]
matrix_inner = K.dot(h_tm1,
self.recurrent_kernel[:, :2 * self.units])
x_z = matrix_x[:, :self.units]
x_r = matrix_x[:, self.units: 2 * self.units]
recurrent_z = matrix_inner[:, :self.units]
recurrent_r = matrix_inner[:, self.units: 2 * self.units]
z = self.recurrent_activation(x_z + recurrent_z)
r = self.recurrent_activation(x_r + recurrent_r)
x_h = matrix_x[:, 2 * self.units:]
recurrent_h = K.dot(r * h_tm1,
self.recurrent_kernel[:, 2 * self.units:])
hh = self.activation(x_h + recurrent_h)
h = z * h_tm1 + (1 - z) * hh
if 0 < self.dropout + self.recurrent_dropout:
if training is None:
h._uses_learning_phase = True
return h, [h]
def get_config(self):
config = {'units': self.units,
'activation': activations.serialize(self.activation),
'recurrent_activation': activations.serialize(self.recurrent_activation),
'use_bias': self.use_bias,
'kernel_initializer': initializers.serialize(self.kernel_initializer),
'recurrent_initializer': initializers.serialize(self.recurrent_initializer),
'bias_initializer': initializers.serialize(self.bias_initializer),
'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer),
'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'kernel_constraint': constraints.serialize(self.kernel_constraint),
'recurrent_constraint': constraints.serialize(self.recurrent_constraint),
'bias_constraint': constraints.serialize(self.bias_constraint),
'dropout': self.dropout,
'recurrent_dropout': self.recurrent_dropout,
'implementation': self.implementation}
base_config = super(CGRUCell, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
class CGRU(RNN):
#interfaces.legacy_recurrent_support
def __init__(self, units,
activation='tanh',
recurrent_activation='hard_sigmoid',
use_bias=True,
kernel_initializer='glorot_uniform',
recurrent_initializer='orthogonal',
bias_initializer='zeros',
kernel_regularizer=None,
recurrent_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
recurrent_constraint=None,
bias_constraint=None,
dropout=0.,
recurrent_dropout=0.,
implementation=1,
return_sequences=False,
return_state=False,
go_backwards=False,
stateful=False,
unroll=False,
**kwargs):
if implementation == 0:
warnings.warn('`implementation=0` has been deprecated, '
'and now defaults to `implementation=1`.'
'Please update your layer call.')
cell = CGRUCell(units,
activation=activation,
recurrent_activation=recurrent_activation,
use_bias=use_bias,
kernel_initializer=kernel_initializer,
recurrent_initializer=recurrent_initializer,
bias_initializer=bias_initializer,
kernel_regularizer=kernel_regularizer,
recurrent_regularizer=recurrent_regularizer,
bias_regularizer=bias_regularizer,
kernel_constraint=kernel_constraint,
recurrent_constraint=recurrent_constraint,
bias_constraint=bias_constraint,
dropout=dropout,
recurrent_dropout=recurrent_dropout,
implementation=implementation)
super(CGRU, self).__init__(cell,
return_sequences=return_sequences,
return_state=return_state,
go_backwards=go_backwards,
stateful=stateful,
unroll=unroll,
**kwargs)
self.activity_regularizer = regularizers.get(activity_regularizer)
def call(self, inputs, mask=None, training=None, initial_state=None):
self.cell._dropout_mask = None
self.cell._recurrent_dropout_mask = None
return super(CGRU, self).call(inputs,
mask=mask,
training=training,
initial_state=initial_state)
@property
def units(self):
return self.cell.units
@property
def activation(self):
return self.cell.activation
@property
def recurrent_activation(self):
return self.cell.recurrent_activation
@property
def use_bias(self):
return self.cell.use_bias
@property
def kernel_initializer(self):
return self.cell.kernel_initializer
@property
def recurrent_initializer(self):
return self.cell.recurrent_initializer
@property
def bias_initializer(self):
return self.cell.bias_initializer
@property
def kernel_regularizer(self):
return self.cell.kernel_regularizer
@property
def recurrent_regularizer(self):
return self.cell.recurrent_regularizer
@property
def bias_regularizer(self):
return self.cell.bias_regularizer
@property
def kernel_constraint(self):
return self.cell.kernel_constraint
@property
def recurrent_constraint(self):
return self.cell.recurrent_constraint
@property
def bias_constraint(self):
return self.cell.bias_constraint
@property
def dropout(self):
return self.cell.dropout
@property
def recurrent_dropout(self):
return self.cell.recurrent_dropout
@property
def implementation(self):
return self.cell.implementation
def get_config(self):
config = {'units': self.units,
'activation': activations.serialize(self.activation),
'recurrent_activation': activations.serialize(self.recurrent_activation),
'use_bias': self.use_bias,
'kernel_initializer': initializers.serialize(self.kernel_initializer),
'recurrent_initializer': initializers.serialize(self.recurrent_initializer),
'bias_initializer': initializers.serialize(self.bias_initializer),
'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer),
'bias_regularizer': regularizers.serialize(self.bias_regularizer),
'activity_regularizer': regularizers.serialize(self.activity_regularizer),
'kernel_constraint': constraints.serialize(self.kernel_constraint),
'recurrent_constraint': constraints.serialize(self.recurrent_constraint),
'bias_constraint': constraints.serialize(self.bias_constraint),
'dropout': self.dropout,
'recurrent_dropout': self.recurrent_dropout,
'implementation': self.implementation}
base_config = super(CGRU, self).get_config()
del base_config['cell']
return dict(list(base_config.items()) + list(config.items()))
@classmethod
def from_config(cls, config):
if 'implementation' in config and config['implementation'] == 0:
config['implementation'] = 1
return cls(**config)
Model Implementation is as follows:
user_input = Input(batch_shape=(batch_size,chunk_size,), dtype='int32', name='user_inputs')
user_emb = Embedding(input_dim=num_users+1, output_dim=out_dim, input_length=chunk_size)(user_input)
item_input = Input(batch_shape=(batch_size,chunk_size,), dtype='int32', name='item_inputs')
item_emb = Embedding(input_dim=num_items+1, output_dim=out_dim, input_length=chunk_size)(item_input)
inputs = keras.layers.add([user_emb, item_emb])
gru_args = {
"units":hidden_size,
"return_sequences":True,
#"return_state":True,
"stateful":True,
"unroll":False
}
gru = CGRU(**gru_args)(inputs)
outputs = Dense(num_items+1, activation='softmax')(gru)
recc_model = Model(inputs=[user_input, item_input], outputs=outputs)
recc_model.compile(optimizer='rmsprop',
loss='categorical_crossentropy',
metrics=[metrics.categorical_accuracy])
#metrics=[metrics.sparse_categorical_accuracy])
But on running the code I am getting the following error, which seems to be because the gradients are being computed as None:
ValueError: Tried to convert 'x' to a tensor and failed. Error: None values not supported.
Find the complete error here: https://pastebin.com/n9UzCRiP

The error occurs because the bias weights are added to the model but not used anywhere.
When you call self.add_weight(...), you have to make sure these weights are used somewhere in your model. Otherwise, since these weights are not connected to the loss tensor, TF cannot compute the gradient and an error will be raised.
If you don't need the bias weights, you can either remove the add_weight lines, or set use_bias=False in your cell.
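If you do want the per-gate biases with this formulation, they have to actually appear in the computation so they stay connected to the loss. Roughly something like this inside call() (a sketch for the implementation=1 branch only, assuming the embedding dimension equals units, which this no-input-kernel variant already requires):
# sketch: apply the biases to the (already embedded) inputs so that the
# bias weights participate in the graph and receive gradients
x_z = K.bias_add(inputs_z, self.bias_z) if self.use_bias else inputs_z
x_r = K.bias_add(inputs_r, self.bias_r) if self.use_bias else inputs_r
x_h = K.bias_add(inputs_h, self.bias_h) if self.use_bias else inputs_h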
Also, I don't think you need to re-implement a CGRU layer to use a custom cell. Just wrapping your custom cell with the built-in RNN layer should work:
gru = RNN(CGRUCell(hidden_size, use_bias=False),
return_sequences=True,
stateful=True,
unroll=False)(inputs)

Related

Coding a forward pass of the Convnet

I am given this set of parameters, which I need to turn into a forward pass.
model_list = [dict(type='Conv2D', in_channels=3, out_channels=32, kernel_size=5, stride=1, padding=2),
dict(type='ReLU'),
dict(type='MaxPooling', kernel_size=2, stride=2),
dict(type='Linear', in_dim=8192, out_dim=10)]
criterion = dict(type='SoftmaxCrossEntropy')
def __init__(self, modules, criterion):
self.modules = []
for m in modules:
if m['type'] == 'Conv2D':
self.modules.append(
Conv2D(m['in_channels'],
m['out_channels'],
m['kernel_size'],
m['stride'],
m['padding'])
)
elif m['type'] == 'ReLU':
self.modules.append(
ReLU()
)
elif m['type'] == 'MaxPooling':
self.modules.append(
MaxPooling(m['kernel_size'],
m['stride'])
)
elif m['type'] == 'Linear':
self.modules.append(
Linear(m['in_dim'],
m['out_dim'])
)
if criterion['type'] == 'SoftmaxCrossEntropy':
self.criterion = SoftmaxCrossEntropy()
And basically what I am trying to do is this
def forward(self, x, y):
probs = None
loss = None
Conv2D_output = self.modules['Conv2D'](x)
Relu_output = self.modules['ReLU'](Conv2D_output)
MaxPooling_output = self.modules['MaxPooling'](Relu_output)
Linear_output = self.modules['Linear'](MaxPooling_output)
scores = Linear_output
probs, loss = self.criterion.forward(scores, y)
self.cache = (probs, x, y)
return probs, loss
Is there any way I can do this, given that each layer is appended as an object to a list rather than stored by name?
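One straightforward option (a sketch, assuming each layer object in self.modules exposes a forward method like the criterion does) is to iterate over the list in order instead of indexing it by type name:
def forward(self, x, y):
    # sketch: self.modules is a plain Python list built in __init__, so just
    # push the input through each module in sequence
    out = x
    for m in self.modules:
        out = m.forward(out)              # assumes each module implements forward()
    scores = out
    probs, loss = self.criterion.forward(scores, y)
    self.cache = (probs, x, y)
    return probs, loss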

How can I predict only 5 days of prices in this LSTM model (PyTorch)?

class StockDataset(Dataset):
# the dataset's job is to return the i-th record
def __init__(self, symbol, x_frames, y_frames, start, end):
self.symbol = symbol
self.x_frames = x_frames
self.y_frames = y_frames
self.start = datetime.datetime(*start)
self.end = datetime.datetime(*end)
# takes all of the data and date values specified above
self.data = pdr.DataReader(self.symbol, 'yahoo', self.start, self.end)
def __len__(self):
return len(self.data) - (self.x_frames + self.y_frames) + 1
def __getitem__(self, idx):
global data
#global data_set
# given index i, return the i-th item; this turns the data into a 'list'
idx += self.x_frames
data = self.data.iloc[idx-self.x_frames:idx+self.y_frames]
data = data[['High', 'Low', 'Open', 'Close', 'Adj Close', 'Volume']]
data = data.apply(lambda x: np.log(x+1) - np.log(x[self.x_frames-1]+1)) # convert to log returns; 1 is added to guard against possible missing values
global x_ex
global y_ex
x_ex= data[:self.x_frames]
y_ex= data[self.x_frames:]
data = data.values #numpy array로 변환한거
X = data[:self.x_frames]
y = data[self.x_frames:]
return X, y
This is the dataset.
class LSTM(nn.Module):
# model explained from the 50-minute mark of the lecture onward
def __init__(self, input_dim, hidden_dim, output_dim, num_layers, batch_size, dropout, use_bn):
super(LSTM, self).__init__()
self.input_dim = input_dim
self.hidden_dim = hidden_dim
self.output_dim = output_dim
self.num_layers = num_layers
self.batch_size = batch_size
self.dropout = dropout
self.use_bn = use_bn
self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)
self.hidden = self.init_hidden()
self.regressor = self.make_regressor()
def init_hidden(self):
return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))
def make_regressor(self):
layers = []
if self.use_bn:
layers.append(nn.BatchNorm1d(self.hidden_dim))
layers.append(nn.Dropout(self.dropout))
layers.append(nn.Linear(self.hidden_dim, self.hidden_dim // 2))
layers.append(nn.ReLU())
layers.append(nn.Linear(self.hidden_dim // 2, self.output_dim))
regressor = nn.Sequential(*layers)
return regressor
def forward(self, x):
lstm_out, self.hidden = self.lstm(x, self.hidden)
y_pred = self.regressor(lstm_out[-1].view(self.batch_size, -1))
return y_pred
This is the model.
def test(model, partition, args):
global y_true
global y_pred
global X
testloader = DataLoader(partition['test'],
batch_size=args.batch_size,
shuffle=False, drop_last=True)
model.eval()
test_acc = 0.0
with torch.no_grad():
for i, (X, y) in enumerate(testloader):
X = X.transpose(0, 1).float().to(args.device)
y_true = y[:, :, 3].float().to(args.device)
model.hidden = [hidden.to(args.device) for hidden in model.init_hidden()]
y_pred = model(X)
test_acc += metric(y_pred, y_true)[0]
test_acc = test_acc / len(testloader)
return test_acc
This is the test data loader.
# ====== Random Seed Initialization ====== #
seed = 666
np.random.seed(seed)
torch.manual_seed(seed)
parser = argparse.ArgumentParser()
args = parser.parse_args("")
args.exp_name = "exp1_lr"
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
# ====== Data Loading ====== #
args.symbol = '005930.KS' # desired ticker
args.batch_size = 4 # batch size
args.x_frames = 5 # x window: previous n days of data; if this is too long, one-week prediction is impossible
args.y_frames = 5 # y window: following n days of data; if this is too long, one-week prediction is impossible
# ====== Model Capacity ===== #
args.input_dim = 6
args.hid_dim = 50
args.n_layers = 2 # (number of hidden layers) see https://justkode.kr/deep-learning/pytorch-rnn
# ====== Regularization ======= #
args.l2 = 0.0001
args.dropout = 0.3
args.use_bn = True
# ====== Optimizer & Training ====== #
args.optim = 'RMSprop' #'RMSprop' #SGD, RMSprop, ADAM...
args.lr = 0.001
args.epoch = 1
# ====== Experiment Variable ====== #
name_var1 = 'lr' # lr = learning rate
name_var2 = 'n_layers' # how many layers to stack?
list_var1 = [0.001, 0.0001, 0.00001]
list_var2 = [1,2,3]
# actually build the datasets
trainset = StockDataset(args.symbol, args.x_frames, args.y_frames, (2012,1,1), (2021,1,1)) # training period
valset = StockDataset(args.symbol, args.x_frames, args.y_frames, (2021,1,2), (2021,12,30)) # validation period; unless it is at least +6 months, +19 days, a "float division by zero" error occurs. Why?? (as of 2021)
testset = StockDataset(args.symbol, args.x_frames, args.y_frames, (2022,1,10), (2022,1,14)) # test period; unless it is at least +6 months, +25 days, a "float division by zero" error occurs (as of 2022)
# it seems the period must cover a certain number of trading days? << it requires at least 146 trading days of data; giving less causes an error. Why??
partition = {'train': trainset, 'val':valset, 'test':testset}
for var1 in list_var1:
for var2 in list_var2:
setattr(args, name_var1, var1)
setattr(args, name_var2, var2)
print(args)
setting, result = experiment(partition, deepcopy(args))
save_exp_result(setting, result)
# be sure to delete the files in the directory before plotting the graphs; otherwise all the results overlap
This part sets the hyperparameters.
I wonder how I can get results when I set the test set length to 5 days (like (2022,1,10) to (2022,1,14)).
This code didn't work unless I set the test set length to at least 7 months (maybe +146 trading days);
the error is "float division by zero" when I use fewer than 146 days.
If I set the length to +146 days or more, the code works well.
I think this line causes the error:
data = data.apply(lambda x: np.log(x+1) - np.log(x[self.x_frames-1]+1))
The log values were so small that the error occurred (my opinion).
The data is Yahoo Finance data. Thanks for reading.
When I comment out the line below, the data becomes infinite:
data = data.apply(lambda x: np.log(x+1) - np.log(x[self.x_frames-1]+1))
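One plausible source of the error (an assumption based on the code above, not a confirmed diagnosis): with x_frames = y_frames = 5 each sample needs 10 rows, so a 5-trading-day test range leaves the dataset with no usable samples, the test DataLoader (batch_size=4, drop_last=True) yields zero batches, and test_acc / len(testloader) divides by zero. A quick sanity check of the arithmetic:
# sketch with assumed values, not run against Yahoo data
rows = 5                                          # trading days between (2022,1,10) and (2022,1,14)
x_frames, y_frames, batch_size = 5, 5, 4
dataset_len = rows - (x_frames + y_frames) + 1    # = -4 -> no complete (X, y) window
num_batches = max(dataset_len, 0) // batch_size   # = 0 because drop_last=True
# test_acc / num_batches would then be 0.0 / 0 -> ZeroDivisionError: float division by zero
print(dataset_len, num_batches)
If that is the cause, the test range has to be long enough to cover x_frames + y_frames rows plus at least one full batch.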

RuntimeError: mat1 and mat2 shapes cannot be multiplied (256x726 and 1000x1000)

I'm trying to measure latent-space clustering, but the following error is raised.
class AutoEncoder(nn.Module):
def __init__(self, input_dim1, input_dim2, hidden_dims, agg, sep_decode):
super(AutoEncoder, self).__init__()
self.agg = agg
self.sep_decode = sep_decode
print("hidden_dims:", hidden_dims)
self.encoder_layers = []
self.encoder2_layers = []
dims = [[input_dim1, input_dim2]] + hidden_dims
for i in range(len(dims) - 1):
if i == 0:
layer = nn.Sequential(nn.Linear(dims[i][0], dims[i+1]), nn.ReLU())
layer2 = nn.Sequential(nn.Linear(dims[i][1], dims[i+1]), nn.ReLU())
elif i != 0 and i < len(dims) - 2:
layer = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
layer2 = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
else:
layer = nn.Linear(dims[i], dims[i+1])
layer2 = nn.Linear(dims[i], dims[i+1])
self.encoder_layers.append(layer)
self.encoder2_layers.append(layer2)
self.encoder = nn.Sequential(*self.encoder_layers)
self.encoder2 = nn.Sequential(*self.encoder2_layers)
self.decoder_layers = []
self.decoder2_layers = []
hidden_dims.reverse()
dims = hidden_dims + [[input_dim1, input_dim2]]
if self.agg == "concat" and not self.sep_decode:
dims[0] = 2 * dims[0]
for i in range(len(dims) - 1):
if i < len(dims) - 2:
layer = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
layer2 = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
else:
layer = nn.Linear(dims[i], dims[i+1][0])
layer2 = nn.Linear(dims[i], dims[i+1][1])
self.decoder_layers.append(layer)
self.decoder2_layers.append(layer2)
self.decoder = nn.Sequential(*self.decoder_layers)
self.decoder2 = nn.Sequential(*self.decoder2_layers)
def forward(self, x1, x2):
z1 = self.encoder(x1)
z2 = self.encoder2(x2)
if self.agg == "max":
z = torch.max(z1, z2)
elif self.agg == "multi":
z = z1 * z2
elif self.agg == "sum":
z = z1 + z2
elif self.agg == "concat":
z = torch.cat([z1, z2], dim=1)
if self.sep_decode:
x_bar1 = self.decoder(z1)
x_bar1 = F.normalize(x_bar1, dim=-1)
x_bar2 = self.decoder2(z2)
x_bar2 = F.normalize(x_bar2, dim=-1)
else:
x_bar1 = self.decoder(z)
x_bar1 = F.normalize(x_bar1, dim=-1)
x_bar2 = self.decoder2(z)
x_bar2 = F.normalize(x_bar2, dim=-1)
return x_bar1, x_bar2, z
class TopicCluster(nn.Module):
def __init__(self, args):
super(TopicCluster, self).__init__()
self.alpha = 1.0
self.dataset_path = args.dataset_path
self.args = args
self.device = args.device
self.temperature = args.temperature
self.distribution = args.distribution
self.agg_method = args.agg_method
self.sep_decode = (args.sep_decode == 1)
input_dim1 = args.input_dim1
input_dim2 = args.input_dim2
hidden_dims = eval(args.hidden_dims)
self.model = AutoEncoder(input_dim1, input_dim2, hidden_dims, self.agg_method, self.sep_decode)
if self.agg_method == "concat":
self.topic_emb = Parameter(torch.Tensor(args.n_clusters, 2*hidden_dims[-1]))
else:
self.topic_emb = Parameter(torch.Tensor(args.n_clusters, hidden_dims[-1]))
torch.nn.init.xavier_normal_(self.topic_emb.data)
def pretrain(self, input_data, pretrain_epoch=200):
pretrained_path = os.path.join(self.dataset_path, f"pretrained_{args.suffix}.pt")
if os.path.exists(pretrained_path) and self.args.load_pretrain:
# load pretrain weights
print(f"loading pretrained model from {pretrained_path}")
self.model.load_state_dict(torch.load(pretrained_path))
else:
train_loader = DataLoader(input_data, batch_size=self.args.batch_size, shuffle=True)
optimizer = Adam(self.model.parameters(), lr=self.args.lr)
for epoch in range(pretrain_epoch):
total_loss = 0
for batch_idx, (x1, x2, _, weight) in enumerate(train_loader):
x1 = x1.to(self.device)
x2 = x2.to(self.device)
weight = weight.to(self.device)
optimizer.zero_grad()
x_bar1, x_bar2, z = self.model(x1, x2)
loss = cosine_dist(x_bar1, x1) + cosine_dist(x_bar2, x2) #, weight)
total_loss += loss.item()
loss.backward()
optimizer.step()
print(f"epoch {epoch}: loss = {total_loss / (batch_idx+1):.4f}")
torch.save(self.model.state_dict(), pretrained_path)
print(f"model saved to {pretrained_path}")
def cluster_assign(self, z):
if self.distribution == 'student':
p = 1.0 / (1.0 + torch.sum(
torch.pow(z.unsqueeze(1) - self.topic_emb, 2), 2) / self.alpha)
p = p.pow((self.alpha + 1.0) / 2.0)
p = (p.t() / torch.sum(p, 1)).t()
else:
self.topic_emb.data = F.normalize(self.topic_emb.data, dim=-1)
z = F.normalize(z, dim=-1)
sim = torch.matmul(z, self.topic_emb.t()) / self.temperature
p = F.softmax(sim, dim=-1)
return p
def forward(self, x1, x2):
x_bar1, x_bar2, z = self.model(x1, x2)
p = self.cluster_assign(z)
return x_bar1, x_bar2, z, p
def target_distribution(self, x1, x2, freq, method='all', top_num=0):
_, _, z = self.model(x1, x2)
p = self.cluster_assign(z).detach()
if method == 'all':
q = p**2 / (p * freq.unsqueeze(-1)).sum(dim=0)
q = (q.t() / q.sum(dim=1)).t()
elif method == 'top':
assert top_num > 0
q = p.clone()
sim = torch.matmul(self.topic_emb, z.t())
_, selected_idx = sim.topk(k=top_num, dim=-1)
for i, topic_idx in enumerate(selected_idx):
q[topic_idx] = 0
q[topic_idx, i] = 1
return p, q
def cosine_dist(x_bar, x, weight=None):
if weight is None:
weight = torch.ones(x.size(0), device=x.device)
cos_sim = (x_bar * x).sum(-1)
cos_dist = 1 - cos_sim
cos_dist = (cos_dist * weight).sum() / weight.sum()
return cos_dist
def train(args, emb_dict):
# ipdb.set_trace()
inv_vocab = {k: " ".join(v) for k, v in emb_dict["inv_vocab"].items()}
vocab = {" ".join(k):v for k, v in emb_dict["vocab"].items()}
print(f"Vocab size: {len(vocab)}")
embs = F.normalize(torch.tensor(emb_dict["vs_emb"]), dim=-1)
embs2 = F.normalize(torch.tensor(emb_dict["oh_emb"]), dim=-1)
freq = np.array(emb_dict["tuple_freq"])
if not args.use_freq:
freq = np.ones_like(freq)
input_data = TensorDataset(embs, embs2, torch.arange(embs.size(0)), torch.tensor(freq))
topic_cluster = TopicCluster(args).to(args.device)
topic_cluster.pretrain(input_data, args.pretrain_epoch)
train_loader = DataLoader(input_data, batch_size=args.batch_size, shuffle=False)
optimizer = Adam(topic_cluster.parameters(), lr=args.lr)
# topic embedding initialization
embs = embs.to(args.device)
embs2 = embs2.to(args.device)
x_bar1, x_bar2, z = topic_cluster.model(embs, embs2)
z = F.normalize(z, dim=-1)
print(f"Running K-Means for initialization")
kmeans = KMeans(n_clusters=args.n_clusters, n_init=5)
if args.use_freq:
y_pred = kmeans.fit_predict(z.data.cpu().numpy(), sample_weight=freq)
else:
y_pred = kmeans.fit_predict(z.data.cpu().numpy())
print(f"Finish K-Means")
freq = torch.tensor(freq).to(args.device)
y_pred_last = y_pred
topic_cluster.topic_emb.data = torch.tensor(kmeans.cluster_centers_).to(args.device)
topic_cluster.train()
i = 0
for epoch in range(50):
if epoch % 5 == 0:
_, _, z, p = topic_cluster(embs, embs2)
z = F.normalize(z, dim=-1)
topic_cluster.topic_emb.data = F.normalize(topic_cluster.topic_emb.data, dim=-1)
if not os.path.exists(os.path.join(args.dataset_path, f"clusters_{args.suffix}")):
os.makedirs(os.path.join(args.dataset_path, f"clusters_{args.suffix}"))
embed_save_path = os.path.join(args.dataset_path, f"clusters_{args.suffix}/embed_{epoch}.pt")
torch.save({
"inv_vocab": emb_dict['inv_vocab'],
"embed": z.detach().cpu().numpy(),
"topic_embed": topic_cluster.topic_emb.detach().cpu().numpy(),
}, embed_save_path)
f = open(os.path.join(args.dataset_path, f"clusters_{args.suffix}/{epoch}.txt"), 'w')
pred_cluster = p.argmax(-1)
result_strings = []
for j in range(args.n_clusters):
if args.sort_method == 'discriminative':
word_idx = torch.arange(embs.size(0))[pred_cluster == j]
sorted_idx = torch.argsort(p[pred_cluster == j][:, j], descending=True)
word_idx = word_idx[sorted_idx]
else:
sim = torch.matmul(topic_cluster.topic_emb[j], z.t())
_, word_idx = sim.topk(k=30, dim=-1)
word_cluster = []
freq_sum = 0
for idx in word_idx:
freq_sum += freq[idx].item()
if inv_vocab[idx.item()] not in word_cluster:
word_cluster.append(inv_vocab[idx.item()])
if len(word_cluster) >= 10:
break
result_strings.append((freq_sum, f"Topic {j} ({freq_sum}): " + ', '.join(word_cluster)+'\n'))
result_strings = sorted(result_strings, key=lambda x: x[0], reverse=True)
for result_string in result_strings:
f.write(result_string[1])
for x1, x2, idx, weight in train_loader:
if i % args.update_interval == 0:
p, q = topic_cluster.target_distribution(embs, embs2, freq.clone().fill_(1), method='all', top_num=epoch+1)
y_pred = p.cpu().numpy().argmax(1)
delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
y_pred_last = y_pred
if i > 0 and delta_label < args.tol:
print(f'delta_label {delta_label:.4f} < tol ({args.tol})')
print('Reached tolerance threshold. Stopping training.')
return None
i += 1
x1 = x1.to(args.device)
x2 = x2.to(args.device)
idx = idx.to(args.device)
weight = weight.to(args.device)
x_bar1, x_bar2, _, p = topic_cluster(x1, x2)
reconstr_loss = cosine_dist(x_bar1, x1) + cosine_dist(x_bar2, x2) #, weight)
kl_loss = F.kl_div(p.log(), q[idx], reduction='none').sum(-1)
kl_loss = (kl_loss * weight).sum() / weight.sum()
loss = args.gamma * kl_loss + reconstr_loss
if i % args.update_interval == 0:
print(f"KL loss: {kl_loss}; Reconstruction loss: {reconstr_loss}")
optimizer.zero_grad()
loss.backward()
optimizer.step()
return None
if __name__ == "__main__":
# CUDA_VISIBLE_DEVICES=0 python3 latent_space_clustering.py --dataset_path ./pandemic --input_emb_name po_tuple_features_all_svos.pk
parser = argparse.ArgumentParser(
description='train',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--dataset_path', type=str)
parser.add_argument('--input_emb_name', type=str)
parser.add_argument('--lr', type=float, default=5e-4)
parser.add_argument('--n_clusters', default=30, type=int)
parser.add_argument('--input_dim1', default=1000, type=int)
parser.add_argument('--input_dim2', default=1000, type=int)
parser.add_argument('--agg_method', default="multi", choices=["sum", "multi", "concat", "attend"], type=str)
parser.add_argument('--sep_decode', default=0, choices=[0, 1], type=int)
parser.add_argument('--pretrain_epoch', default=100, type=int)
parser.add_argument('--load_pretrain', default=False, action='store_true')
parser.add_argument('--temperature', default=0.1, type=float)
parser.add_argument('--sort_method', default='generative', choices=['generative', 'discriminative'])
parser.add_argument('--distribution', default='softmax', choices=['softmax', 'student'])
parser.add_argument('--batch_size', default=256, type=int)
parser.add_argument('--use_freq', default=False, action='store_true')
parser.add_argument('--hidden_dims', default='[1000, 2000, 1000, 100]', type=str)
parser.add_argument('--suffix', type=str, default='')
parser.add_argument('--gamma', default=5, type=float, help='weight of clustering loss')
parser.add_argument('--update_interval', default=100, type=int)
parser.add_argument('--tol', default=0.001, type=float)
args = parser.parse_args()
args.cuda = torch.cuda.is_available()
print("use cuda: {}".format(args.cuda))
args.device = torch.device("cuda" if args.cuda else "cpu")
print(args)
with open(os.path.join(args.dataset_path, args.input_emb_name), "rb") as fin:
emb_dict = pk.load(fin)
candidate_idx = train(args, emb_dict)
print(candidate_idx)
The error I'm getting is: RuntimeError: mat1 and mat2 shapes cannot be multiplied (256x726 and 1000x1000). I cannot figure out which part is the problem. Please help me. Thank you so much.
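One plausible reading of the shape mismatch (an assumption, since the actual embedding files are not shown): the first encoder layer is nn.Linear(input_dim1, hidden_dims[0]), i.e. 1000 -> 1000 with the defaults, while the batches built from emb_dict["vs_emb"] are 256 x 726, so the very first matmul fails. A quick check inside train() before constructing TopicCluster:
# sketch: verify the real feature dimensions and pass them on the command line
print(embs.shape, embs2.shape)    # e.g. torch.Size([N, 726]), torch.Size([N, ...])
# then run with matching dims, e.g.
#   --input_dim1 726 --input_dim2 <second dim of oh_emb>
# (726 here is inferred from the error message, not verified against the data)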

Pytorch - Are my weights being shared in this network?

I have the following network where I am trying to do triplet loss:
First, I have a custom convolution class:
class ConvBlock(nn.Module):
def __init__(self, ngpu, input_c, output_c, mode=0):
super(ConvBlock, self).__init__()
self.ngpu = ngpu
self.input_c = input_c
self.output_c = output_c
self.mode = mode
self.b1 = nn.Sequential(
nn.Conv2d(input_c, output_c, 3, stride=1, padding=1),
#nn.BatchNorm2d(output_c),
nn.PReLU(),
)
self.b2 = nn.Sequential(
nn.Conv2d(output_c, output_c, 3, stride=1, padding=1),
#nn.BatchNorm2d(output_c),
nn.PReLU(),
)
self.pool = nn.Sequential(
nn.MaxPool2d(2, 2),
)
def forward(self, input):
batch_size = input.size(0)
if self.mode == 0:
b1 = self.b1(input)
hidden = self.pool(b1)
return hidden, b1
elif self.mode == 1:
b1 = self.b1(input)
b2 = self.b2(b1)
hidden = self.pool(b2)
return hidden, b2
elif self.mode == 2:
b1 = self.b1(input)
hidden = self.b2(b1)
return hidden
I now have an encoder module:
class _Encoder(nn.Module):
def __init__(self, ngpu,nc,nef,out_size,nz):
super(_Encoder, self).__init__()
self.ngpu = ngpu
self.nc = nc
self.nef = nef
self.out_size = out_size
self.nz = nz
self.c1 = ConvBlock(self.ngpu, nc, nef, 0) # 3 - 64
self.c2 = ConvBlock(self.ngpu, nef, nef*2, 0) # 64-128
self.c3 = ConvBlock(self.ngpu, nef*2, nef*4, 1) # 128-256
self.c4 = ConvBlock(self.ngpu, nef*4, nef*8, 1) # 256 -512
self.c5 = ConvBlock(self.ngpu, nef*8, nef*8, 2) # 512-512
# 8 because..the depth went from 32 to 32*8
self.mean = nn.Linear(nef * 8 * out_size * (out_size/2), nz)
self.logvar = nn.Linear(nef * 8 * out_size * (out_size/2), nz)
#for reparametrization trick
def sampler(self, mean, logvar):
std = logvar.mul(0.5).exp_()
if args.cuda:
eps = torch.cuda.FloatTensor(std.size()).normal_()
else:
eps = torch.FloatTensor(std.size()).normal_()
eps = Variable(eps)
return eps.mul(std).add_(mean)
def forward(self, input):
batch_size = input.size(0)
if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1:
c1_out, c1_x = nn.parallel.data_parallel(self.c1, input, range(self.ngpu))
c2_out, c2_x = nn.parallel.data_parallel(self.c2, c1_out, range(self.ngpu))
c3_out, c3_x = nn.parallel.data_parallel(self.c3, c2_out, range(self.ngpu))
c4_out, c4_x = nn.parallel.data_parallel(self.c4, c3_out, range(self.ngpu))
hidden = nn.parallel.data_parallel(self.c5, c4_out, range(self.ngpu))
# hidden = nn.parallel.data_parallel(self.encoder, input, range(self.ngpu))
hidden = hidden.view(batch_size, -1)
mean = nn.parallel.data_parallel(self.mean, hidden, range(self.ngpu))
logvar = nn.parallel.data_parallel(self.logvar, hidden, range(self.ngpu))
else:
c1_out, c1_x = self.c1(input)
c2_out, c2_x = self.c2(c1_out)
c3_out, c3_x = self.c3(c2_out)
c4_out, c4_x = self.c4(c3_out)
hidden = self.c5(c4_out)
# hidden = self.encoder(input)
hidden = hidden.view(batch_size, -1)
mean, logvar = self.mean(hidden), self.logvar(hidden)
latent_z = self.sampler(mean, logvar)
if ADD_SKIP_CONNECTION:
return latent_z,mean,logvar,{"c1_x":c1_x, "c2_x":c2_x, "c3_x":c3_x, "c4_x":c4_x}
else:
return latent_z,mean,logvar,{"c1_x":None, "c2_x":None, "c3_x":None, "c4_x":None}
I initialize my encoder as a single object:
encoder = _Encoder(ngpu,nc,nef,out_size,nz)
encoder = encoder.cuda()
And then I am applying some functions:
latent_x,mean_x,logvar_x,skip_x = self.encoder(x)
latent_y,mean_y,logvar_y,skip_y = self.encoder(y)
latent_z,mean_z,logvar_z,skip_z = self.encoder(z)
dist_a = F.pairwise_distance(mean_x, mean_y, 2)
dist_b = F.pairwise_distance(mean_x, mean_z, 2)
loss_triplet = triplet_loss(dist_a, dist_b, target)
optimizer.zero_grad()
loss_triplet.backward()
optimizer.step()
I am starting to doubt whether the weights are actually being shared across the three encoder calls. Please help me check and tell me whether they are.
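For what it's worth, a small check along these lines (a sketch, assuming encoder is the _Encoder instance constructed above) can confirm the sharing, since the same module object is called three times and therefore owns exactly one copy of each weight tensor:
# sketch: self.encoder is a single nn.Module, so calling it on x, y and z
# reuses the same weight tensors; nothing is copied per call
print(sum(p.numel() for p in encoder.parameters()))   # one parameter set, counted once
# after loss_triplet.backward(), each shared weight holds the gradient
# accumulated from all three forward passes:
print(encoder.c1.b1[0].weight.grad.abs().sum())
As written, the three calls do go through the same parameters; separate copies would only appear if three distinct _Encoder objects were constructed.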

how to print TensorVariable

I am a beginner with Theano and am studying it now. I'd like to print the value and shape of a TensorVariable while a theano.function is running. When I used Python's print function, it ran before the theano function was compiled, so I learned that plain print is useless here. Therefore I tried another approach and added the following code to use theano.printing.Print.
(cce is the return value of theano.scan, so maybe it is not a symbolic variable.
Actually, I am confused by the concepts of TensorVariable and shared variable. Is a TensorVariable a sort of shared variable?)
x = theano.tensor.tensor3() # define data type
t_print =theano.printing.Print("cce value is : ")(x)
f = theano.function([x], t_print) # define theano.function
f(cce) # call f (print value of cce)
Then the following error occurred:
TypeError: ('Bad input argument to theano function with name "seq2seq.py : 98" at index 0(0-based)', 'Expected an array-like object, but found a Variable: maybe you are trying to call a function on a (possibly shared) variable instead of a numeric array?')
Could you possibly let me know how to correct this code so that it prints the value of cce (a TensorVariable)? Or is it impossible to print the value of a TensorVariable while theano.function is in progress?
Thank you for reading my question.
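For reference, a theano.printing.Print node has to be spliced into the symbolic graph before compilation; it then prints every time the compiled function is evaluated. A minimal, self-contained sketch (independent of the code below):
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
# Print returns a copy of x that, when evaluated, prints the requested attributes
x_printed = theano.printing.Print('cce value is: ', attrs=('__str__', 'shape'))(x)
f = theano.function([x], T.mean(x_printed))

f(np.ones((2, 3), dtype=theano.config.floatX))   # prints the value and shape, then returns the mean
Applied to the code below, the same idea would be to wrap cce itself (e.g. cce = theano.printing.Print('cce')(cce)) just before return T.mean(cce), rather than passing the symbolic cce into a separately compiled function.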
ADDED:
Here is my source code, for the bigger picture. theano.function() is called in the last line. loss_func is the categorical_crossentropy function; the last four lines set up the theano function.
def categorical_crossentropy(y_true, y_pred):
y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon)
y_pred = y_pred.reshape( (-1, voca_dim_g) )
y_true = y_true.reshape( (-1, voca_dim_g) )
cce, updates = theano.scan(
fn=T.nnet.categorical_crossentropy,
sequences=[y_pred,y_true]
)
##### I want to print cce HERE #######
return T.mean(cce)
@staticmethod
def step(
x_t, h_tm1, c_tm1,
Ui, Wi, bi, Uf, Wf, bf,
Uo, Wo, bo, Ug, Wg, bg
):
"""
x_t.shape = (timestep=1, dim)
x_t.shape = (n_samples, timestep=1, dim)
"""
i_t = T.nnet.sigmoid(T.dot(x_t, Ui) + T.dot(h_tm1, Wi) + bi)
f_t = T.nnet.sigmoid(T.dot(x_t, Uf) + T.dot(h_tm1, Wf) + bf)
o_t = T.nnet.sigmoid(T.dot(x_t, Uo) + T.dot(h_tm1, Wo) + bo)
g_t = T.tanh(T.dot(x_t, Ug) + T.dot(h_tm1, Wg) + bg)
c_t = c_tm1 * f_t + g_t * i_t
h_t = T.tanh(c_t) * o_t
return h_t, c_t
#########################################################################################################################
def forward(self, X):
states, updates = theano.scan(
fn=self.step,
sequences=[ X ],
outputs_info=[self.h_tm1, self.c_tm1],
non_sequences=[
self.Ui, self.Wi, self.bi,
self.Uf, self.Wf, self.bf,
self.Uo, self.Wo, self.bo,
self.Ug, self.Wg, self.bg
]
)
updates = [(self.h_tm1, states[0][-1]), (self.c_tm1, states[1][-1])]
return states, updates
#########################################################################################################################
def encode(self, X):
states, updates = self.forward(X)
h_t = states[0][-1]
c_t = states[1][-1]
return h_t, c_t, updates
def decode_step(
self, y_t, h_tm1, c_tm1,
Ui, Wi, bi, Uf, Wf, bf,
Uo, Wo, bo, Ug, Wg, bg,
Wh, bh
):
h_t, c_t = self.step(
y_t, h_tm1, c_tm1,
Ui, Wi, bi, Uf, Wf, bf,
Uo, Wo, bo, Ug, Wg, bg
)
y_t = T.dot(h_t, Wh) + bh
return y_t, h_t, c_t
def decode(self, h_tm1, c_tm1, timesteps):
outputs, updates = theano.scan(
fn=self.decode_step,
outputs_info=[self.y_t, h_tm1, c_tm1],
non_sequences=[
self.Ui, self.Wi, self.bi,
self.Uf, self.Wf, self.bf,
self.Uo, self.Wo, self.bo,
self.Ug, self.Wg, self.bg,
self.Wh, self.bh
],
n_steps=timesteps
)
updates = [
(self.h_tm1, outputs[1][-1]),
(self.c_tm1, outputs[2][-1])
]
return outputs[0], updates
h_tm1, c_tm1, updates_encode = encode(seq_input)
seq_predict, updates_decode = decode(h_tm1, c_tm1, T.shape(seq_target)[0])
loss = loss_func(seq_predict, seq_target)
self._train = theano.function([seq_input, seq_target], loss, updates = updates)
Below is the full source code:
# -*- coding: utf-8 -*-
__modifier__ = "Lee Guk Beom, Lee Jae Sang, Jang Jae Kwang (alphabetical Order)"
import readFile
import numpy as np
import theano
import theano.tensor as T
from six.moves import zip
from theano.compile.debugmode import DebugMode
import nltk
import sys
import os
from nltk.tokenize import sent_tokenize
import codecs
#theano.config.optimizer='fast_compile'
#theano.config.exception_verbosity='high'
#theano.config.compute_test_value = 'warn'
epsilon = 1e-6
dtype = theano.config.floatX
minibatch_size_g = 0
longest_seq_g = 0
voca_dim_g = 0
n_time_step_input_g = 0
n_timestep_target_g = 0
word_to_index_input_g = dict()
word_to_index_targrt_g = dict()
index_to_word_target_g = dict()
#########################################################################################################################
def shared(value, name=None):
return theano.shared(value.astype(dtype), name=name)
#########################################################################################################################
def shared_zeros(shape, name=None):
return shared(value=np.zeros(shape), name=name)
#########################################################################################################################
def shared_zeros_like(x, name=None):
return shared_zeros(shape=x.shape, name=name)
#########################################################################################################################
def init_weights(shape, name=None):
bound = np.sqrt(1.0/shape[1])
w = np.random.uniform(-bound, bound, shape)
return shared(value=w, name=name)
#########################################################################################################################
def adadelta(params, cost, lr=1.0, rho=0.95):
# from https://github.com/fchollet/keras/blob/master/keras/optimizers.py
cost = cost.astype('float32')
grads = T.grad(cost, params)
accus = [shared_zeros_like(p.get_value()) for p in params]
delta_accus = [shared_zeros_like(p.get_value()) for p in params]
updates = []
for p, g, a, d_a in zip(params, grads, accus, delta_accus):
new_a = rho * a + (1.0 - rho) * T.square(g)
updates.append((a, new_a))
update = g * T.sqrt(d_a + epsilon) / T.sqrt(new_a + epsilon)
new_p = p - lr * update
updates.append((p, new_p))
new_d_a = rho * d_a + (1.0 - rho) * T.square(update)
updates.append((d_a, new_d_a))
return updates
#########################################################################################################################
def categorical_crossentropy(y_true, y_pred):
# from https://github.com/fchollet/keras/blob/master/keras/objectives.py
y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon)
# y_true = y_true.reshape( (-1, minibatch_size_g, voca_dim_g) )
'''
cce = T.nnet.categorical_crossentropy(y_pred,y_true)
# only matrix can be calculated
'''
# Y_PRED SOFTMAX
y_pred = y_pred.reshape( (-1, voca_dim_g) )
# y_pred_flat = T.nnet.softmax(y_pred)
y_true = y_true.reshape( (-1, voca_dim_g) )
cce, updates = theano.scan(
fn=T.nnet.categorical_crossentropy,
sequences=[y_pred,y_true]
)
return T.mean(cce)
#########################################################################################################################
def mean_square_error(y_true, y_pred):
return T.mean(T.square(y_pred - y_true))
#########################################################################################################################
class LSTM(object):
def __init__(self, size, dim):
self.size = size
self.dim = dim
shape_b = (minibatch_size_g, size)
shape_U = (dim, size)
shape_W = (size, size)
self.h_tm1 = shared_zeros(shape_b, "h_tm1")
self.c_tm1 = shared_zeros(shape_b, "c_tm1")
self.Ui = init_weights(shape_U, "Ui")
self.Wi = init_weights(shape_W, "Wi")
self.bi = shared_zeros(shape_b, "bi")
self.Uf = init_weights(shape_U, "Uf")
self.Wf = init_weights(shape_W, "Wf")
self.bf = shared_zeros(shape_b, "bf")
self.Uo = init_weights(shape_U, "Uo")
self.Wo = init_weights(shape_W, "Wo")
self.bo = shared_zeros(shape_b, "bo")
self.Ug = init_weights(shape_U, "Ug")
self.Wg = init_weights(shape_W, "Wg")
self.bg = shared_zeros(shape_b, "bg")
self.params = [
self.Ui, self.Wi, self.bi,
self.Uf, self.Wf, self.bf,
self.Uo, self.Wo, self.bo,
self.Ug, self.Wg, self.bg
]
def set_state(self, h, c):
self.h_tm1.set_value(h.get_value())
self.c_tm1.set_value(c.get_value())
def reset_state(self):
self.h_tm1 = shared_zeros((1, self.size), "h_tm1")
self.c_tm1 = shared_zeros((1, self.size), "c_tm1")
#########################################################################################################################
@staticmethod
def step(
x_t, h_tm1, c_tm1,
Ui, Wi, bi, Uf, Wf, bf,
Uo, Wo, bo, Ug, Wg, bg
):
"""
x_t.shape = (timestep=1, dim)
x_t.shape = (n_samples, timestep=1, dim)
"""
i_t = T.nnet.sigmoid(T.dot(x_t, Ui) + T.dot(h_tm1, Wi) + bi)
f_t = T.nnet.sigmoid(T.dot(x_t, Uf) + T.dot(h_tm1, Wf) + bf)
o_t = T.nnet.sigmoid(T.dot(x_t, Uo) + T.dot(h_tm1, Wo) + bo)
g_t = T.tanh(T.dot(x_t, Ug) + T.dot(h_tm1, Wg) + bg)
c_t = c_tm1 * f_t + g_t * i_t
h_t = T.tanh(c_t) * o_t
return h_t, c_t
#########################################################################################################################
def forward(self, X):
states, updates = theano.scan(
fn=self.step,
sequences=[ X ],
outputs_info=[self.h_tm1, self.c_tm1],
non_sequences=[
self.Ui, self.Wi, self.bi,
self.Uf, self.Wf, self.bf,
self.Uo, self.Wo, self.bo,
self.Ug, self.Wg, self.bg
]
)
updates = [(self.h_tm1, states[0][-1]), (self.c_tm1, states[1][-1])]
return states, updates
#########################################################################################################################
class LSTMEncoder(LSTM):
def encode(self, X):
states, updates = self.forward(X)
h_t = states[0][-1]
c_t = states[1][-1]
return h_t, c_t, updates
class LSTMDecoder(LSTM):
def __init__(self, size, dim, h_tm1=None, c_tm1=None):
super(LSTMDecoder, self).__init__(size=size, dim=dim)
self.Wh = init_weights((size, dim), "Wh")
self.bh = shared_zeros((minibatch_size_g, dim), "bh")
self.h_tm1 = h_tm1 or shared_zeros((minibatch_size_g, size), "h_tm1")
self.c_tm1 = c_tm1 or shared_zeros((minibatch_size_g, size), "c_tm1")
self.y_t = shared_zeros((minibatch_size_g, dim), "y_t")
# self.decode_length = theano.shared(decode_length)
self.params.append(self.Wh)
self.params.append(self.bh)
def decode_step(
self, y_t, h_tm1, c_tm1,
Ui, Wi, bi, Uf, Wf, bf,
Uo, Wo, bo, Ug, Wg, bg,
Wh, bh
):
h_t, c_t = self.step(
y_t, h_tm1, c_tm1,
Ui, Wi, bi, Uf, Wf, bf,
Uo, Wo, bo, Ug, Wg, bg
)
y_t = T.dot(h_t, Wh) + bh
return y_t, h_t, c_t
def decode(self, h_tm1, c_tm1, timesteps):
outputs, updates = theano.scan(
fn=self.decode_step,
outputs_info=[self.y_t, h_tm1, c_tm1],
non_sequences=[
self.Ui, self.Wi, self.bi,
self.Uf, self.Wf, self.bf,
self.Uo, self.Wo, self.bo,
self.Ug, self.Wg, self.bg,
self.Wh, self.bh
],
n_steps=timesteps
)
updates = [
(self.h_tm1, outputs[1][-1]),
(self.c_tm1, outputs[2][-1])
]
# return T.flatten(outputs[0], 3), updates
return outputs[0], updates
@staticmethod
def argmax(seq):
seq = T.argmax(seq, axis=2)
return seq
#########################################################################################################################
class Seq2Seq(object):
def __init__(self, size, dim):
self.encoder = LSTMEncoder(size, dim)
self.decoder = LSTMDecoder(size, dim)
self.params = []
self.params += self.encoder.params
self.params += self.decoder.params
self._predict = None
self._train = None
self._test = None
def compile(self, loss_func, optimizer):
seq_input = T.tensor3()
seq_target = T.tensor3()
decode_timesteps = T.iscalar()
h_tm1, c_tm1, updates_encode = self.encoder.encode(seq_input)
seq_predict_flex, updates_decode_flex = self.decoder.decode(h_tm1, c_tm1, decode_timesteps)
seq_argmax = self.decoder.argmax(seq_predict_flex)
seq_predict, updates_decode = self.decoder.decode(h_tm1, c_tm1, T.shape(seq_target)[0])
loss = loss_func(seq_predict, seq_target)
self._predict = theano.function([seq_input, decode_timesteps], seq_argmax, updates=updates_encode+updates_decode_flex)
self._test = theano.function([seq_input, seq_target], loss, updates=updates_encode+updates_decode)
updates = []
updates += updates_encode
updates += updates_decode
updates += optimizer(self.params, loss)
self._train = theano.function([seq_input, seq_target], loss, updates = updates)
def predict(self, seq_input, decode_timesteps):
self.encoder.reset_state()
self.decoder.reset_state()
return self._predict(seq_input, decode_timesteps)
def train(self, seq_input, seq_target):
self.encoder.reset_state()
self.decoder.reset_state()
return self._train(seq_input, seq_target)
def test(self, seq_input, seq_target):
self.encoder.reset_state()
self.decoder.reset_state()
return self._test(seq_input, seq_target)
#########################################################################################################################
def train(x, target):
for mini_batch, target in zip(x,target):
mini_batch = mini_batch.astype(dtype)
target = target.astype(dtype)
print("result of train function(loss or update) :", seq2seq.train(mini_batch, target))
#########################################################################################################################
# make weight information to pickle file
# information of Encooder class and decoder class of Seq2Seq class
# Encooder and decoder class should have function that returns value of weight variables
# one list contains elements that save weights' information
def save_weight():
None
#########################################################################################################################
def gen_processed_seq(input_sentence):
tokenized_seq = nltk.word_tokenize( input_sentence )
input_sentences = [ None for _ in range(1) ]
input_sentences[0] = tokenized_seq
seq_input = readFile.word_to_idx(input_sentences, word_to_index_input_g )
sorted_seq_input = [ None for _ in range(minibatch_size_g) ]
sorted_seq_input[0] = seq_input[0]
input_len = len(seq_input[0])
for i in range(minibatch_size_g-1):
for j in range(input_len):
sorted_seq_input[i+1] = [-1]
input_finally = []
input_finally.append(sorted_seq_input)
return input_finally
#########################################################################################################################
def gen_one_hot(input_len, input_seq):
one_hot = readFile.seq_to_1hot(n_time_step_input_g, input_seq, "predict", 1, 1)
one_hot[0] = one_hot[0].astype(dtype)
print("one_hot : ", one_hot)
return one_hot
def get_idx(argmax, num_of_word):
idx_list = argmax[ : num_of_word, 0]
return idx_list
#########################################################################################################################
def predict():
input_sentence = raw_input("Input the English Sentence You Want to Translate into Spanish : ")
input_seq = gen_processed_seq(input_sentence)
print("input_seq[0][0] : ",input_seq[0][0])
num_of_word = len(input_seq[0][0])
one_hot = gen_one_hot(n_time_step_input_g, input_seq)
argmax = seq2seq.predict(one_hot[0] , n_time_step_input_g )
print("argmax_fin shape : ", argmax.shape)
print("argmax_fin : ", argmax)
idx_list_np = get_idx(argmax, num_of_word)
idx_list_py = idx_list_np.tolist()
print("index_to_word_target_g : ",index_to_word_target_g)
print("index_to_word_target_g[6] :", index_to_word_target_g[6])
result = readFile.idx_to_word(idx_list_py, index_to_word_target_g)
translated = ""
for elem in result :
translated += elem
translated += " "
print("translated : " , translated)
print("Translation End")
#########################################################################################################################
def gen_global_var(word_to_index_input, word_to_index_targrt, voca_dim, si, st, index_to_word_target):
global word_to_index_input_g
global word_to_index_targrt_g
global voca_dim_g
global minibatch_size_g
global n_time_step_input_g
global n_timestep_target_g
global index_to_word_target_g
word_to_index_input_g = word_to_index_input
word_to_index_targrt_g = word_to_index_targrt
voca_dim_g = voca_dim + 2
minibatch_size_g = si[0].shape[1]
n_time_step_input_g = si[0].shape[0]
n_timestep_target_g = st[0].shape[0]
index_to_word_target_g = index_to_word_target
return
#########################################################################################################################
def menu(si, st):
None
#########################################################################################################################
def gen_object():
return None
#########################################################################################################################
if __name__ == "__main__":
si, st, maxlen_input, minibatch_size, voca_dim, word_to_index_input, word_to_index_targrt, index_to_word_target = readFile.preprocessing()
gen_global_var(word_to_index_input, word_to_index_targrt, voca_dim, si, st, index_to_word_target)
seq2seq = Seq2Seq(n_time_step_input_g, voca_dim_g )
seq2seq.compile(loss_func=categorical_crossentropy, optimizer=adadelta)
while(True):
print("select a menu")
print("1. Training")
print("2. Translate specific English sentence into Spanish.")
val = input("selection : ")
if val == 1:
train(si, st)
elif val == 2:
predict()
and readFile.py is:
import numpy as np
import itertools
import nltk
import sys
import os
from nltk.tokenize import sent_tokenize
import codecs
unknown_token = 'UNKNOWN_TOKEN'
start_token = '_S'
end_token = '__E'
num_of_seq = 0
input_path = "./europarl-v7.es-en.en"
target_path = "./europarl-v7.es-en.es"
minibatch_unit = 100
voca_dim = 3000
SEQ_NUM_LIMIT = 1000
##########################################################################################
def file_tokenize(file):
f = codecs.open( file, "r", "utf-8" )
tokenized_seq = []
sentences = []
total_sentence_num = 0
# sequence tokenize
for i,line in enumerate(f):
print("tokenized Sentence No." , i)
# strip() method to remove the newline character at the end of the input line.
tokenized_seq = nltk.word_tokenize( line.strip() )
tokenized_seq.insert(0, start_token)
tokenized_seq.append(end_token)
sentences.append(tokenized_seq)
total_sentence_num += 1;
if(total_sentence_num == SEQ_NUM_LIMIT):
break
return sentences,total_sentence_num
##########################################################################################
# Count the word frequencies
def cntWordFreq(sentences):
word_freq = nltk.FreqDist(itertools.chain(*sentences))
return word_freq
##########################################################################################
# Get the most common words and build index_to_word and word_to_index vectors
def build_WordToIdx_IdxtoWord(word_freq):
vocab = word_freq.most_common(voca_dim-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])
return index_to_word, word_to_index
##########################################################################################
# change word to index
def word_to_idx(sequences, word_to_index ) :
for i, sent in enumerate(sequences):
sequences[i] = [w if w in word_to_index else unknown_token for w in sent]
sequences[i] = [word_to_index[w] if w in word_to_index else -1 for w in sequences[i]]
return sequences
##########################################################################################
def idx_to_word(seq, index_to_word):
for i, sent in enumerate(seq):
seq[i] = index_to_word[sent]
#seq[i] = [index_to_word[sent] if sent in index_to_word else '?' ]
return seq
##########################################################################################
def sortByLen(seqs_input, seqs_target) :
# check maximum sentence length
max_len_input = 0
max_len_target = 0
for sentence in seqs_input :
tmp = len(sentence)
if max_len_input < tmp:
max_len_input = tmp
for sentence in seqs_target :
tmp = len(sentence)
if max_len_target < tmp:
max_len_target = tmp
seqs_sorted_input = [ [] for _ in range(max_len_input+1) ]
seqs_sorted_target = [ [] for _ in range(max_len_input+1) ]
i = 0
for sentence_input, sentence_target in zip(seqs_input, seqs_target) :
sentence_len = len(sentence_input)
seqs_sorted_input[sentence_len].append(sentence_input)
seqs_sorted_target[sentence_len].append(sentence_target)
i+=1
return seqs_sorted_input, seqs_sorted_target, max_len_input, max_len_target
##########################################################################################
def find_maxlen(sentence_group):
max_seq_len = 0
for seq in sentence_group :
if len(seq) > max_seq_len :
max_seq_len = len(seq)
return max_seq_len
##########################################################################################
def sort_by_timestep(sentence_group):
same_len_seq = np.asarray(sentence_group)
same_len_seq = apply_to_m1(same_len_seq)
sorted_seq = same_len_seq.transpose()
return sorted_seq
##########################################################################################
def seq_to_1hot(max_len, sorted_sentences, type, minibatch_unit, num_of_seq):
one_hot = [None for _ in range( len(sorted_sentences) )]
for i, sentence_group in enumerate(sorted_sentences):
if sentence_group and len(sentence_group[0]) != 0 :
max_seq_len = find_maxlen(sentence_group)
row = max_seq_len * minibatch_unit
one_hot[i] = np.zeros( (row, voca_dim + 2) )
time_step_seq = sort_by_timestep(sentence_group)
j = 0
for word_idx in np.nditer( time_step_seq ) :
if word_idx != -1:
one_hot[i][j][word_idx] = 1
j+=1
one_hot[i] = np.reshape(one_hot[i], ( max_seq_len, -1, voca_dim+2) )
return one_hot
##########################################################################################
def apply_to_m1(lst, dtype=np.int64):
inner_max_len = max(map(len, lst))
result = np.zeros( [len(lst), inner_max_len], dtype )
result[:] = -1
for i, row in enumerate(lst):
for j, val in enumerate(row):
result[i][j] = val
return result
##########################################################################################
def seq_group_by_mini_batch_size(minibatch_unit, sorted_seq, num_of_seq):
idx = 0
cnt = 0
minibatch_seq = [ [] for _ in range( (num_of_seq/minibatch_unit)+1) ]
for seqs in sorted_seq :
if seqs :
for seq in seqs :
if seq:
minibatch_seq[idx].append(seq)
cnt+=1
if minibatch_unit == cnt:
cnt = 0
idx+= 1
for i, seq in enumerate (minibatch_seq):
if seq == []:
minibatch_seq = minibatch_seq[: i- 1]
break
return minibatch_seq
##########################################################################################
def preprocessing():
global num_of_seq
global minibatch_unit
global input_path
global target_path
print("Start Preprocessing")
sentences_input, total_sentence_num = file_tokenize(input_path)
sentences_target, total_sentence_num_target = file_tokenize(target_path)
print("FINISHED : file_tokenize ")
word_freq_input = cntWordFreq(sentences_input)
word_freq_target = cntWordFreq(sentences_target)
print("FINISHED : cntWordFreq ")
index_to_word_input, word_to_index_input = build_WordToIdx_IdxtoWord(word_freq_input)
index_to_word_target, word_to_index_targrt = build_WordToIdx_IdxtoWord(word_freq_target)
print("FINISHED : build_WordToIdx_IdxtoWord ")
seqs_input = word_to_idx(sentences_input, word_to_index_input)
seqs_target = word_to_idx(sentences_target, word_to_index_targrt)
print("FINISHED : word_to_idx ")
seqs_sorted_input, seqs_sorted_target, maxlen_input, maxlen_target = sortByLen(seqs_input, seqs_target)
print("FINISHED : sortByLen ")
for seqs in seqs_input:
if seqs:
for seq in seqs:
if seq:
num_of_seq+=1
seq_by_mini_batch_size_input = seq_group_by_mini_batch_size(minibatch_unit, seqs_sorted_input, num_of_seq)
seq_by_mini_batch_size_target = seq_group_by_mini_batch_size(minibatch_unit, seqs_sorted_target, num_of_seq)
print("FINISHED : seq_group_by_mini_batch_size ")
_1hot_input = seq_to_1hot(maxlen_input, seq_by_mini_batch_size_input, "input",minibatch_unit, num_of_seq)
_1hot_target = seq_to_1hot(maxlen_target, seq_by_mini_batch_size_target, "target",minibatch_unit, num_of_seq)
print("FINISHED : seq_to_1hot ")
if minibatch_unit > total_sentence_num:
minibatch_unit = total_sentence_num
print("exit preprocessing")
return _1hot_input, _1hot_target, maxlen_input, minibatch_unit, voca_dim, word_to_index_input, word_to_index_targrt, index_to_word_target
