[Picture of the train and test loss]

Here's a look at the LSTM model:
`
# Create LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        # Number of hidden dimensions
        self.hidden_dim = hidden_dim
        # Number of hidden layers
        self.layer_dim = layer_dim
        # LSTM
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True, dropout=0.1)
        # Readout layer
        self.f1 = nn.Linear(hidden_dim, output_dim)
        self.dropout_layer = nn.Dropout(p=0.2)
        self.softmax = nn.Softmax()

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).type(torch.FloatTensor))
        # Initialize cell state
        c0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).type(torch.FloatTensor))
        out, (hn, cn) = self.lstm(x, (h0, c0))
        out = self.dropout_layer(hn[-1])  # final hidden state of the last layer
        out = self.f1(out)
        out = self.softmax(out)
        return out

# LSTM configuration
batch_size = 3000
num_epochs = 20
learning_rate = 0.001  # check this learning rate

# Create LSTM
input_dim = 1     # input dimension
hidden_dim = 30   # hidden layer dimension
layer_dim = 15    # number of hidden layers
output_dim = 1    # output dimension
num_layers = 10   # num_layers (currently unused)
print("input_dim = ", input_dim, "\nhidden_dim = ", hidden_dim, "\nlayer_dim = ", layer_dim, "\noutput_dim = ", output_dim)

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
model.cuda()
error = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

graph_index = 0
test_loss = []
train_loss = []
plt_test_index = []
plt_train_index = []
tmp_index = []
tmp_train = []
tmp_test = []

for epoch in range(num_epochs):
    # Train
    model.train()
    loss_list_train = []
    loss_list_test = []
    total_train = 0
    equals_train = 0
    total_test = 0
    num0_train = 0
    num1_train = 0
    num0_test = 0
    num1_test = 0
    equals_test = 0
    TP_train = 0
    FP_train = 0
    TN_train = 0
    FN_train = 0
    TP_test = 0
    FP_test = 0
    TN_test = 0
    FN_test = 0

    for i, (inputs, targets) in enumerate(train_loader):
        train = Variable(inputs.type(torch.FloatTensor).cuda())
        targets = Variable(targets.type(torch.FloatTensor).cuda())
        optimizer.zero_grad()
        outputs = model(train)
        loss = error(outputs, targets)
        loss_list_train.append(loss.item())
        loss.backward()
        optimizer.step()
        t = np.where(targets.cpu().detach().numpy() > 0.5, 1, 0)
        o = np.where(outputs.cpu().detach().numpy() > 0.5, 1, 0)
        total_train += t.shape[0]
        equals_train += np.sum(t == o)
        num0_train += np.sum(t == 0)
        num1_train += np.sum(t == 1)
        TP_train += np.sum(np.logical_and(t == 1, o == 1))
        FP_train += np.sum(np.logical_and(t == 0, o == 1))  # fixed: FP is target 0, output 1 (matches the test loop)
        TN_train += np.sum(np.logical_and(t == 0, o == 0))
        FN_train += np.sum(np.logical_and(t == 1, o == 0))  # fixed: FN is target 1, output 0
        tb.save_value('Train Loss', 'train_loss', globaliter, loss.item())
        globaliter += 1
        tb.flush_line('train_loss')
        print(i)

    # Test
    model.eval()
    targets_plot = np.array([])
    outputs_plot = np.array([])
    inputs_plot = np.array([])
    for inputs, targets in test_loader:
        inputs = Variable(inputs.type(torch.FloatTensor).cuda())
        targets = Variable(targets.type(torch.FloatTensor).cuda())
        outputs = model(inputs)
        loss = error(outputs, targets)
        loss_list_test.append(loss.item())
        t = np.where(targets.cpu().detach().numpy() > 0.5, 1, 0)
        o = np.where(outputs.cpu().detach().numpy() > 0.5, 1, 0)
        total_test += t.shape[0]
        equals_test += np.sum(t == o)
        num0_test += np.sum(t == 0)
        num1_test += np.sum(t == 1)
        TP_test += np.sum(np.logical_and(t == 1, o == 1))
        FP_test += np.sum(np.logical_and(t == 0, o == 1))
        TN_test += np.sum(np.logical_and(t == 0, o == 0))
        FN_test += np.sum(np.logical_and(t == 1, o == 0))
        tb.save_value('Test Loss', 'test_loss', globaliter2, loss.item())
        globaliter2 += 1
        tb.flush_line('test_loss')

    # Save values for plotting
    graph_index += 1
    plt_train_index.append(graph_index)
    plt_test_index.append(graph_index)
    train_loss.append(np.mean(np.array(loss_list_train)))
    test_loss.append(np.mean(np.array(loss_list_test)))

    print("------------------------------")
    print("Epoch : ", epoch)
    print("----- Train -----")
    print("Total =", total_train, " | Num 0 =", num0_train, " | Num 1 =", num1_train)
    print("Equals =", equals_train)
    print("Accuracy =", (equals_train / total_train) * 100, "%")
    print("----- Test -----")
    print("Total =", total_test, " | Num 0 =", num0_test, " | Num 1 =", num1_test)
    print("Equals =", equals_test)
    print("Accuracy =", (equals_test / total_test) * 100, "%")
`
I am using the model to do binary classification on sequences of length 300. The accuracy and the loss are not changing over several epochs. I tried changing the number of layers, the number of hidden states, and the activation function, but all to no avail. I don't know what I am doing wrong; I am probably missing something fundamental. Any help is appreciated.
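One hedged observation (an assumption about the cause, not a confirmed diagnosis): with output_dim = 1, nn.Softmax() normalizes over a single value, so the model's output is the constant 1.0 no matter what the input is, which would pin both the BCE loss and the accuracy. There is also a device mismatch: h0 and c0 are created on the CPU while the model lives on the GPU. A minimal sketch of the same model with a sigmoid head and device-aware states:
`
import torch
import torch.nn as nn

class LSTMModelSigmoid(nn.Module):
    """Sketch: same architecture, but with a sigmoid head for single-logit BCE."""
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True, dropout=0.1)
        self.f1 = nn.Linear(hidden_dim, output_dim)
        self.dropout_layer = nn.Dropout(p=0.2)
        self.activation = nn.Sigmoid()  # sigmoid, not softmax, for a single output unit

    def forward(self, x):
        # Create the initial states on the same device as the input,
        # so the module still works after model.cuda()
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim, device=x.device)
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim, device=x.device)
        out, (hn, cn) = self.lstm(x, (h0, c0))
        out = self.dropout_layer(hn[-1])  # final hidden state of the last layer
        return self.activation(self.f1(out))  # probability in (0, 1) for nn.BCELoss
`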
Related
I want to use a for loop to call model_1 through model_10 and save their predictions in y_pred[]; I have to use a for loop unconditionally. I also need code to find and print the minimum, maximum, median, average, and standard deviation of y_pred (a sketch follows the test function below).
`
y_gt = 0
y_pred = []
y_pred_lstm = []
path = '/content/drive/MyDrive/INU_DeepLearning/term_project_part2/model/model_10.pt'
net = torch.load(path)
loss_fn = nn.MSELoss(reduction="sum")
test_loss = test(test_dataloader, net, loss_fn, sd_train.max_price, mode='test')
X, y_gt_ = sd_test[0:]
X_ = X / sd_train.max_price
X_ = torch.Tensor(X_).to(device)
y = net(X_)
y_gt = y_gt_[0]
y_pred.append((y * sd_train.max_price).detach().cpu().numpy()[0])
print(y_pred)
print(y_gt)
`
This is the section used for prediction:
`
def test(dataloader, model, loss_fn, max_price, mode='val'):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss = 0
    count_test = 0
    with torch.no_grad():
        for X, y in dataloader:
            X_ = X / max_price
            y_ = y / max_price
            X_, y_ = X_.to(device), y_.to(device)
            pred = model(X_)
            test_loss += loss_fn(pred, y_).item()
            count_test += len(X)
    test_loss /= count_test
    test_loss = max_price * np.sqrt(test_loss)
    if mode == 'val':
        print(f"Validation RMSE: {test_loss:>7f} \n")
    else:
        print(f"Test RMSE: {test_loss:>7f} \n")
    return test_loss
`
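For the loop over the ten models, here is a minimal sketch under the assumption that the checkpoints are named model_1.pt through model_10.pt in the same directory, and that X_ and sd_train are prepared exactly as in the snippet above:
`
import numpy as np
import torch

y_pred = []
for k in range(1, 11):  # model_1 ... model_10
    path = f'/content/drive/MyDrive/INU_DeepLearning/term_project_part2/model/model_{k}.pt'
    net = torch.load(path)
    net.eval()
    with torch.no_grad():
        y = net(X_)  # X_ scaled by sd_train.max_price, as above
    y_pred.append((y * sd_train.max_price).cpu().numpy()[0])

y_arr = np.array(y_pred)
print("min =", y_arr.min())
print("max =", y_arr.max())
print("median =", np.median(y_arr))
print("average =", y_arr.mean())
print("std =", y_arr.std())
`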
I'm trying to measure latent space clustering, but an error is raised.
`
class AutoEncoder(nn.Module):
    def __init__(self, input_dim1, input_dim2, hidden_dims, agg, sep_decode):
        super(AutoEncoder, self).__init__()
        self.agg = agg
        self.sep_decode = sep_decode
        print("hidden_dims:", hidden_dims)
        self.encoder_layers = []
        self.encoder2_layers = []
        dims = [[input_dim1, input_dim2]] + hidden_dims
        for i in range(len(dims) - 1):
            if i == 0:
                layer = nn.Sequential(nn.Linear(dims[i][0], dims[i+1]), nn.ReLU())
                layer2 = nn.Sequential(nn.Linear(dims[i][1], dims[i+1]), nn.ReLU())
            elif i != 0 and i < len(dims) - 2:
                layer = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
                layer2 = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
            else:
                layer = nn.Linear(dims[i], dims[i+1])
                layer2 = nn.Linear(dims[i], dims[i+1])
            self.encoder_layers.append(layer)
            self.encoder2_layers.append(layer2)
        self.encoder = nn.Sequential(*self.encoder_layers)
        self.encoder2 = nn.Sequential(*self.encoder2_layers)

        self.decoder_layers = []
        self.decoder2_layers = []
        hidden_dims.reverse()
        dims = hidden_dims + [[input_dim1, input_dim2]]
        if self.agg == "concat" and not self.sep_decode:
            dims[0] = 2 * dims[0]
        for i in range(len(dims) - 1):
            if i < len(dims) - 2:
                layer = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
                layer2 = nn.Sequential(nn.Linear(dims[i], dims[i+1]), nn.ReLU())
            else:
                layer = nn.Linear(dims[i], dims[i+1][0])
                layer2 = nn.Linear(dims[i], dims[i+1][1])
            self.decoder_layers.append(layer)
            self.decoder2_layers.append(layer2)
        self.decoder = nn.Sequential(*self.decoder_layers)
        self.decoder2 = nn.Sequential(*self.decoder2_layers)

    def forward(self, x1, x2):
        z1 = self.encoder(x1)
        z2 = self.encoder2(x2)
        if self.agg == "max":
            z = torch.max(z1, z2)
        elif self.agg == "multi":
            z = z1 * z2
        elif self.agg == "sum":
            z = z1 + z2
        elif self.agg == "concat":
            z = torch.cat([z1, z2], dim=1)
        if self.sep_decode:
            x_bar1 = self.decoder(z1)
            x_bar1 = F.normalize(x_bar1, dim=-1)
            x_bar2 = self.decoder2(z2)
            x_bar2 = F.normalize(x_bar2, dim=-1)
        else:
            x_bar1 = self.decoder(z)
            x_bar1 = F.normalize(x_bar1, dim=-1)
            x_bar2 = self.decoder2(z)
            x_bar2 = F.normalize(x_bar2, dim=-1)
        return x_bar1, x_bar2, z


class TopicCluster(nn.Module):
    def __init__(self, args):
        super(TopicCluster, self).__init__()
        self.alpha = 1.0
        self.dataset_path = args.dataset_path
        self.args = args
        self.device = args.device
        self.temperature = args.temperature
        self.distribution = args.distribution
        self.agg_method = args.agg_method
        self.sep_decode = (args.sep_decode == 1)
        input_dim1 = args.input_dim1
        input_dim2 = args.input_dim2
        hidden_dims = eval(args.hidden_dims)
        self.model = AutoEncoder(input_dim1, input_dim2, hidden_dims, self.agg_method, self.sep_decode)
        if self.agg_method == "concat":
            self.topic_emb = Parameter(torch.Tensor(args.n_clusters, 2 * hidden_dims[-1]))
        else:
            self.topic_emb = Parameter(torch.Tensor(args.n_clusters, hidden_dims[-1]))
        torch.nn.init.xavier_normal_(self.topic_emb.data)

    def pretrain(self, input_data, pretrain_epoch=200):
        pretrained_path = os.path.join(self.dataset_path, f"pretrained_{args.suffix}.pt")
        if os.path.exists(pretrained_path) and self.args.load_pretrain:
            # load pretrained weights
            print(f"loading pretrained model from {pretrained_path}")
            self.model.load_state_dict(torch.load(pretrained_path))
        else:
            train_loader = DataLoader(input_data, batch_size=self.args.batch_size, shuffle=True)
            optimizer = Adam(self.model.parameters(), lr=self.args.lr)
            for epoch in range(pretrain_epoch):
                total_loss = 0
                for batch_idx, (x1, x2, _, weight) in enumerate(train_loader):
                    x1 = x1.to(self.device)
                    x2 = x2.to(self.device)
                    weight = weight.to(self.device)
                    optimizer.zero_grad()
                    x_bar1, x_bar2, z = self.model(x1, x2)
                    loss = cosine_dist(x_bar1, x1) + cosine_dist(x_bar2, x2)
                    total_loss += loss.item()
                    loss.backward()
                    optimizer.step()
                print(f"epoch {epoch}: loss = {total_loss / (batch_idx+1):.4f}")
            torch.save(self.model.state_dict(), pretrained_path)
            print(f"model saved to {pretrained_path}")

    def cluster_assign(self, z):
        if self.distribution == 'student':
            p = 1.0 / (1.0 + torch.sum(
                torch.pow(z.unsqueeze(1) - self.topic_emb, 2), 2) / self.alpha)
            p = p.pow((self.alpha + 1.0) / 2.0)
            p = (p.t() / torch.sum(p, 1)).t()
        else:
            self.topic_emb.data = F.normalize(self.topic_emb.data, dim=-1)
            z = F.normalize(z, dim=-1)
            sim = torch.matmul(z, self.topic_emb.t()) / self.temperature
            p = F.softmax(sim, dim=-1)
        return p

    def forward(self, x1, x2):
        x_bar1, x_bar2, z = self.model(x1, x2)
        p = self.cluster_assign(z)
        return x_bar1, x_bar2, z, p

    def target_distribution(self, x1, x2, freq, method='all', top_num=0):
        _, _, z = self.model(x1, x2)
        p = self.cluster_assign(z).detach()
        if method == 'all':
            q = p**2 / (p * freq.unsqueeze(-1)).sum(dim=0)
            q = (q.t() / q.sum(dim=1)).t()
        elif method == 'top':
            assert top_num > 0
            q = p.clone()
            sim = torch.matmul(self.topic_emb, z.t())
            _, selected_idx = sim.topk(k=top_num, dim=-1)
            for i, topic_idx in enumerate(selected_idx):
                q[topic_idx] = 0
                q[topic_idx, i] = 1
        return p, q


def cosine_dist(x_bar, x, weight=None):
    if weight is None:
        weight = torch.ones(x.size(0), device=x.device)
    cos_sim = (x_bar * x).sum(-1)
    cos_dist = 1 - cos_sim
    cos_dist = (cos_dist * weight).sum() / weight.sum()
    return cos_dist


def train(args, emb_dict):
    inv_vocab = {k: " ".join(v) for k, v in emb_dict["inv_vocab"].items()}
    vocab = {" ".join(k): v for k, v in emb_dict["vocab"].items()}
    print(f"Vocab size: {len(vocab)}")
    embs = F.normalize(torch.tensor(emb_dict["vs_emb"]), dim=-1)
    embs2 = F.normalize(torch.tensor(emb_dict["oh_emb"]), dim=-1)
    freq = np.array(emb_dict["tuple_freq"])
    if not args.use_freq:
        freq = np.ones_like(freq)
    input_data = TensorDataset(embs, embs2, torch.arange(embs.size(0)), torch.tensor(freq))
    topic_cluster = TopicCluster(args).to(args.device)
    topic_cluster.pretrain(input_data, args.pretrain_epoch)
    train_loader = DataLoader(input_data, batch_size=args.batch_size, shuffle=False)
    optimizer = Adam(topic_cluster.parameters(), lr=args.lr)

    # topic embedding initialization
    embs = embs.to(args.device)
    embs2 = embs2.to(args.device)
    x_bar1, x_bar2, z = topic_cluster.model(embs, embs2)
    z = F.normalize(z, dim=-1)
    print(f"Running K-Means for initialization")
    kmeans = KMeans(n_clusters=args.n_clusters, n_init=5)
    if args.use_freq:
        y_pred = kmeans.fit_predict(z.data.cpu().numpy(), sample_weight=freq)
    else:
        y_pred = kmeans.fit_predict(z.data.cpu().numpy())
    print(f"Finish K-Means")
    freq = torch.tensor(freq).to(args.device)
    y_pred_last = y_pred
    topic_cluster.topic_emb.data = torch.tensor(kmeans.cluster_centers_).to(args.device)
    topic_cluster.train()

    i = 0
    for epoch in range(50):
        if epoch % 5 == 0:
            _, _, z, p = topic_cluster(embs, embs2)
            z = F.normalize(z, dim=-1)
            topic_cluster.topic_emb.data = F.normalize(topic_cluster.topic_emb.data, dim=-1)
            if not os.path.exists(os.path.join(args.dataset_path, f"clusters_{args.suffix}")):
                os.makedirs(os.path.join(args.dataset_path, f"clusters_{args.suffix}"))
            embed_save_path = os.path.join(args.dataset_path, f"clusters_{args.suffix}/embed_{epoch}.pt")
            torch.save({
                "inv_vocab": emb_dict['inv_vocab'],
                "embed": z.detach().cpu().numpy(),
                "topic_embed": topic_cluster.topic_emb.detach().cpu().numpy(),
            }, embed_save_path)
            f = open(os.path.join(args.dataset_path, f"clusters_{args.suffix}/{epoch}.txt"), 'w')
            pred_cluster = p.argmax(-1)
            result_strings = []
            for j in range(args.n_clusters):
                if args.sort_method == 'discriminative':
                    word_idx = torch.arange(embs.size(0))[pred_cluster == j]
                    sorted_idx = torch.argsort(p[pred_cluster == j][:, j], descending=True)
                    word_idx = word_idx[sorted_idx]
                else:
                    sim = torch.matmul(topic_cluster.topic_emb[j], z.t())
                    _, word_idx = sim.topk(k=30, dim=-1)
                word_cluster = []
                freq_sum = 0
                for idx in word_idx:
                    freq_sum += freq[idx].item()
                    if inv_vocab[idx.item()] not in word_cluster:
                        word_cluster.append(inv_vocab[idx.item()])
                        if len(word_cluster) >= 10:
                            break
                result_strings.append((freq_sum, f"Topic {j} ({freq_sum}): " + ', '.join(word_cluster) + '\n'))
            result_strings = sorted(result_strings, key=lambda x: x[0], reverse=True)
            for result_string in result_strings:
                f.write(result_string[1])

        for x1, x2, idx, weight in train_loader:
            if i % args.update_interval == 0:
                p, q = topic_cluster.target_distribution(embs, embs2, freq.clone().fill_(1), method='all', top_num=epoch+1)
                y_pred = p.cpu().numpy().argmax(1)
                delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
                y_pred_last = y_pred
                if i > 0 and delta_label < args.tol:
                    print(f'delta_label {delta_label:.4f} < tol ({args.tol})')
                    print('Reached tolerance threshold. Stopping training.')
                    return None
            i += 1
            x1 = x1.to(args.device)
            x2 = x2.to(args.device)
            idx = idx.to(args.device)
            weight = weight.to(args.device)
            x_bar1, x_bar2, _, p = topic_cluster(x1, x2)
            reconstr_loss = cosine_dist(x_bar1, x1) + cosine_dist(x_bar2, x2)
            kl_loss = F.kl_div(p.log(), q[idx], reduction='none').sum(-1)
            kl_loss = (kl_loss * weight).sum() / weight.sum()
            loss = args.gamma * kl_loss + reconstr_loss
            if i % args.update_interval == 0:
                print(f"KL loss: {kl_loss}; Reconstruction loss: {reconstr_loss}")
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    return None


if __name__ == "__main__":
    # CUDA_VISIBLE_DEVICES=0 python3 latent_space_clustering.py --dataset_path ./pandemic --input_emb_name po_tuple_features_all_svos.pk
    parser = argparse.ArgumentParser(
        description='train',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset_path', type=str)
    parser.add_argument('--input_emb_name', type=str)
    parser.add_argument('--lr', type=float, default=5e-4)
    parser.add_argument('--n_clusters', default=30, type=int)
    parser.add_argument('--input_dim1', default=1000, type=int)
    parser.add_argument('--input_dim2', default=1000, type=int)
    parser.add_argument('--agg_method', default="multi", choices=["sum", "multi", "concat", "attend"], type=str)
    parser.add_argument('--sep_decode', default=0, choices=[0, 1], type=int)
    parser.add_argument('--pretrain_epoch', default=100, type=int)
    parser.add_argument('--load_pretrain', default=False, action='store_true')
    parser.add_argument('--temperature', default=0.1, type=float)
    parser.add_argument('--sort_method', default='generative', choices=['generative', 'discriminative'])
    parser.add_argument('--distribution', default='softmax', choices=['softmax', 'student'])
    parser.add_argument('--batch_size', default=256, type=int)
    parser.add_argument('--use_freq', default=False, action='store_true')
    parser.add_argument('--hidden_dims', default='[1000, 2000, 1000, 100]', type=str)
    parser.add_argument('--suffix', type=str, default='')
    parser.add_argument('--gamma', default=5, type=float, help='weight of clustering loss')
    parser.add_argument('--update_interval', default=100, type=int)
    parser.add_argument('--tol', default=0.001, type=float)
    args = parser.parse_args()
    args.cuda = torch.cuda.is_available()
    print("use cuda: {}".format(args.cuda))
    args.device = torch.device("cuda" if args.cuda else "cpu")
    print(args)
    with open(os.path.join(args.dataset_path, args.input_emb_name), "rb") as fin:
        emb_dict = pk.load(fin)
    candidate_idx = train(args, emb_dict)
    print(candidate_idx)
`
The error I'm getting is: RuntimeError: mat1 and mat2 shapes cannot be multiplied (256x726 and 1000x1000). I cannot figure out which part is the problem. Please help me; thank you so much.
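A hedged reading of that message: the first encoder layer is nn.Linear(input_dim1, ...) with a default --input_dim1 of 1000, while the batches coming off the DataLoader are 256 x 726, so the embeddings in emb_dict appear to be 726-dimensional rather than 1000. A small check you could drop into train() before building TopicCluster (variable names follow the code above):
`
# Sketch: confirm the declared input dims match the actual embedding widths.
print("vs_emb width:", embs.shape[-1], "-> should equal --input_dim1")
print("oh_emb width:", embs2.shape[-1], "-> should equal --input_dim2")
assert embs.shape[-1] == args.input_dim1, "first Linear layer will reject this batch"
assert embs2.shape[-1] == args.input_dim2, "second encoder will reject this batch"
`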
I know for a fact that changing the hyperparameters of an LSTM model or selecting different BERT layers causes changes in the classification result. I have tested this using TensorFlow and Keras. I recently switched to PyTorch to build the same design, but no matter what I change, the result remains the same. Below is the code. Am I doing anything wrong?
`
def pad_sents(sents, pad_token):
    """Pad a list of sentences according to the longest sentence in the batch."""
    sents_padded = []
    max_len = max(len(s) for s in sents)
    batch_size = len(sents)
    for s in sents:
        padded = [pad_token] * max_len
        padded[:len(s)] = s
        sents_padded.append(padded)
    return sents_padded


def sents_to_tensor(tokenizer, sents, device):
    tokens_list = [tokenizer.tokenize(str(sent)) for sent in sents]
    sents_lengths = [len(tokens) for tokens in tokens_list]
    tokens_list_padded = pad_sents(tokens_list, '[PAD]')
    sents_lengths = torch.tensor(sents_lengths, device=device)
    masks = []
    for tokens in tokens_list_padded:
        mask = [0 if token == '[PAD]' else 1 for token in tokens]
        masks.append(mask)
    masks_tensor = torch.tensor(masks, dtype=torch.long, device=device)
    tokens_id_list = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list_padded]
    sents_tensor = torch.tensor(tokens_id_list, dtype=torch.long, device=device)
    return sents_tensor, masks_tensor, sents_lengths


class BERT_LSTM_Model(nn.Module):
    def __init__(self, device, dropout_rate, n_class, lstm_hidden_size=None):
        super(BERT_LSTM_Model, self).__init__()
        self.bert_config = BertConfig.from_pretrained('bert-base-uncased', output_hidden_states=True)
        self.bert = BertModel.from_pretrained('bert-base-uncased', config=self.bert_config)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', config=self.bert_config)
        if not lstm_hidden_size:
            self.lstm_hidden_size = self.bert.config.hidden_size
        else:
            self.lstm_hidden_size = lstm_hidden_size
        self.n_class = n_class
        self.dropout_rate = dropout_rate
        self.lstm = nn.LSTM(self.bert.config.hidden_size, self.lstm_hidden_size, bidirectional=True)
        self.hidden_to_softmax = nn.Linear(self.lstm_hidden_size * 2, n_class, bias=True)
        self.dropout = nn.Dropout(p=self.dropout_rate)
        self.device = device

    def forward(self, sents):
        sents_tensor, masks_tensor, sents_lengths = sents_to_tensor(self.tokenizer, sents, self.device)
        encoded_layers = self.bert(input_ids=sents_tensor, attention_mask=masks_tensor)[2]
        bert_hidden_layer = encoded_layers[12]
        # permute rotates the tensor: if tensor.shape = (3, 4, 5), then
        # tensor.permute(1, 0, 2).shape = (4, 3, 5) -> (seq_len, batch_size, hidden_size)
        bert_hidden_layer = bert_hidden_layer.permute(1, 0, 2)
        enc_hiddens, (last_hidden, last_cell) = self.lstm(
            pack_padded_sequence(bert_hidden_layer, sents_lengths, enforce_sorted=False))
        output_hidden = torch.cat((last_hidden[0], last_hidden[1]), dim=1)  # (batch_size, 2*hidden_size)
        output_hidden = self.dropout(output_hidden)
        pre_softmax = self.hidden_to_softmax(output_hidden)
        return pre_softmax


def batch_iter(data, batch_size, shuffle=False, bert=None):
    batch_num = math.ceil(data.shape[0] / batch_size)
    index_array = list(range(data.shape[0]))
    if shuffle:
        data = data.sample(frac=1)
    for i in range(batch_num):
        indices = index_array[i * batch_size: (i + 1) * batch_size]
        examples = data.iloc[indices]
        targets = list(examples.train_label.values)
        yield sents, targets  # list[list[str]] if not bert else list[str], list[int]


def validation(model, df_val, loss_func, device):
    was_training = model.training
    model.eval()
    train_BERT_tweet = list(df_val.train_BERT_tweet)
    train_label = list(df_val.train_label)
    val_batch_size = 16
    n_batch = int(np.ceil(df_val.shape[0] / val_batch_size))
    total_loss = 0.
    with torch.no_grad():
        for i in range(n_batch):
            sents = train_BERT_tweet[i * val_batch_size: (i + 1) * val_batch_size]
            targets = torch.tensor(train_label[i * val_batch_size: (i + 1) * val_batch_size],
                                   dtype=torch.long, device=device)
            batch_size = len(sents)
            pre_softmax = model(sents)
            batch_loss = loss_func(pre_softmax, targets)
            total_loss += batch_loss.item() * batch_size
    if was_training:
        model.train()
    return total_loss / df_val.shape[0]


def train():
    label_name = ['Yes', 'Maybe', 'No']
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    start_time = time.time()
    print('Importing data...', file=sys.stderr)
    df_train = pd.read_csv('trainn.csv')
    df_val = pd.read_csv('valn.csv')
    train_label = dict(df_train.train_label.value_counts())
    label_max = float(max(train_label.values()))
    train_label_weight = torch.tensor([label_max / train_label[i] for i in range(len(train_label))], device=device)
    print('Done! time elapsed %.2f sec' % (time.time() - start_time), file=sys.stderr)
    print('-' * 80, file=sys.stderr)

    start_time = time.time()
    print('Set up model...', file=sys.stderr)
    model = BERT_LSTM_Model(device=device, dropout_rate=0.2, n_class=len(label_name), lstm_hidden_size=768)
    optimizer = AdamW(model.parameters(), lr=1e-3, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps=100, t_total=1000)
    model = model.to(device)
    print('Use device: %s' % device, file=sys.stderr)
    print('Done! time elapsed %.2f sec' % (time.time() - start_time), file=sys.stderr)
    print('-' * 80, file=sys.stderr)

    model.train()
    cn_loss = torch.nn.CrossEntropyLoss(weight=train_label_weight, reduction='mean')
    torch.save(cn_loss, 'loss_func3')  # for later testing
    train_batch_size = 16
    valid_niter = 500
    log_every = 10
    model_save_path = 'NonLinear_bert_uncased_model.bin'
    num_trial = 0
    train_iter = patience = cum_loss = report_loss = 0
    cum_examples = report_examples = epoch = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('Begin Maximum Likelihood training...')

    for epoch in range(20):
        for sents, targets in batch_iter(df_train, batch_size=train_batch_size, shuffle=True):
            train_iter += 1
            optimizer.zero_grad()
            batch_size = len(sents)
            pre_softmax = model(sents)
            loss = cn_loss(pre_softmax, torch.tensor(targets, dtype=torch.long, device=device))
            loss.backward()
            optimizer.step()
            scheduler.step()

            batch_losses_val = loss.item() * batch_size
            report_loss += batch_losses_val
            cum_loss += batch_losses_val
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, '
                      'cum. examples %d, speed %.2f examples/sec, '
                      'time elapsed %.2f sec' % (epoch, train_iter,
                                                 report_loss / report_examples,
                                                 cum_examples,
                                                 report_examples / (time.time() - train_time),
                                                 time.time() - begin_time), file=sys.stderr)
                train_time = time.time()
                report_loss = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. examples %d' % (epoch, train_iter,
                                                                               cum_loss / cum_examples,
                                                                               cum_examples), file=sys.stderr)
                cum_loss = cum_examples = 0.
                print('begin validation ...', file=sys.stderr)
                validation_loss = validation(model, df_val, cn_loss, device=device)
                print('validation: iter %d, loss %f' % (train_iter, validation_loss), file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or validation_loss < min(hist_valid_scores)
                hist_valid_scores.append(validation_loss)
                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                    torch.save(model.state_dict(), 'LSTM_bert_uncased_model.bin')
                    # also save the optimizer's state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < 5:
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)
                    if patience == 20:
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == 3:
                            print('early stop!', file=sys.stderr)
                            exit(0)
                        # decay lr, and restore from the previously best checkpoint
                        print('load previously best model and decay learning rate to %f%%' %
                              (0.1 * 100), file=sys.stderr)
                        model.load_state_dict(torch.load('LSTM_bert_uncased_model.bin'))
                        model = model.to(device)
                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))
                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] *= 0.5
                        # reset patience
                        patience = 0

        if epoch == 100:
            print('reached maximum number of epochs!', file=sys.stderr)
            exit(0)


def test():
    label_name = ['Yes', 'Maybe', 'No']
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model = BERT_LSTM_Model(device=device, dropout_rate=0.3, n_class=len(label_name), lstm_hidden_size=768)
    model.load_state_dict(torch.load('LSTM_bert_uncased_model.bin'))
    model.to(device)
    model.eval()
    df_test = pd.read_csv('testn.csv')
    test_batch_size = 16
    n_batch = int(np.ceil(df_test.shape[0] / test_batch_size))
    cn_loss = torch.load('loss_func3', map_location=lambda storage, loc: storage).to(device)
    train_BERT_tweet = list(df_test.train_BERT_tweet)
    train_label = list(df_test.train_label)
    test_loss = 0.
    prediction = []
    prob = []
    softmax = torch.nn.Softmax(dim=1)
    with torch.no_grad():
        for i in range(n_batch):
            sents = train_BERT_tweet[i * test_batch_size: (i + 1) * test_batch_size]
            targets = torch.tensor(train_label[i * test_batch_size: (i + 1) * test_batch_size],
                                   dtype=torch.long, device=device)
            batch_size = len(sents)
            pre_softmax = model(sents)
            batch_loss = cn_loss(pre_softmax, targets)
            test_loss += batch_loss.item() * batch_size
            prob_batch = softmax(pre_softmax)
            prob.append(prob_batch)
            prediction.extend([t.item() for t in list(torch.argmax(prob_batch, dim=1))])
    accuracy = accuracy_score(df_test.train_label.values, prediction)
    matthews = matthews_corrcoef(df_test.train_label.values, prediction)
    f1_macro = f1_score(df_test.train_label.values, prediction, average='macro')
    print('accuracy: %.2f' % accuracy)
    print('matthews coef: %.2f' % matthews)
    print('f1_macro: %.2f' % f1_macro)


TrainingModel = train()
TestingModel = test()
`
The data can be accessed from https://github.com/Kosisochi/DataSnippet (I didn't know how else to create synthetic data). Also, the training and validation loss remain quite high, with the lowest being around 0.93. I also tried a CNN and the same issue remained. Is there something I'm overlooking? Thanks for your help.
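One hedged observation: batch_iter builds targets from examples but yields sents, which is never assigned inside the function, so every batch may silently reuse whatever sents happens to exist in the enclosing scope, and the model would see the same text regardless of hyperparameters. A sketch of the generator with the sentences drawn from the batch itself (the column name train_BERT_tweet follows the rest of the code):
`
import math

def batch_iter(data, batch_size, shuffle=False, bert=None):
    # Yield (sentences, labels) batches from a DataFrame with
    # train_BERT_tweet and train_label columns.
    batch_num = math.ceil(data.shape[0] / batch_size)
    index_array = list(range(data.shape[0]))
    if shuffle:
        data = data.sample(frac=1)
    for i in range(batch_num):
        indices = index_array[i * batch_size: (i + 1) * batch_size]
        examples = data.iloc[indices]
        sents = list(examples.train_BERT_tweet)  # take the text from this batch
        targets = list(examples.train_label.values)
        yield sents, targets
`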
`
import os
import tarfile
from six.moves import urllib

URL = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
PATH = 'aclImdb'

def fetch_data(url=URL, path=PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
        file_path = os.path.join(path, "aclImdb_v1.tar.gz")  # fixed typo: was 'oath'
        urllib.request.urlretrieve(url, file_path)
        file_gz = tarfile.open(file_path)
        file_gz.extractall(path=path)
        file_gz.close()

import pyprind  # for progress visualisation
import pandas as pd

PATH = 'aclImdb'
labels = {'pos': 1, 'neg': 0}  # int class labels for 'positive' and 'negative'
pbar = pyprind.ProgBar(50000)  # initialise a progress bar with 50k iterations = no. of docs
df = pd.DataFrame()
# use nested for loops to iterate over the 'train' & 'test' subdirs
for s in ('test', 'train'):
    for l in ('pos', 'neg'):  # and read text files from the 'pos' and 'neg' subdirs
        path = os.path.join(PATH, s, l)
        for file in os.listdir(path):
            # append to the df DataFrame with an int class label (pos = 1, neg = 0)
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8')
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

# Separate words and count each word's occurrence
import pyprind  # for progress visualisation
from collections import Counter
from string import punctuation
import re

counts = Counter()  # collects the counts of occurrence of each unique word
pbar = pyprind.ProgBar(len(df['review']),
                       title='Counting word occurrences...')  # progress bar
for i, review in enumerate(df['review']):
    text = ''.join([c if c not in punctuation else ' ' + c + ' '
                    for c in review]).lower()
    df.loc[i, 'review'] = text
    pbar.update()
    counts.update(text.split())

# Map each unique word to an int
word_counts = sorted(counts, key=counts.get, reverse=True)
print(word_counts[:5])
word_to_int = {word: ii for ii, word in enumerate(word_counts, 1)}
n_words = max(list(word_to_int.values())) + 1  # moved here: word_to_int must exist first

mapped_reviews = []
pbar = pyprind.ProgBar(len(df['review']),
                       title='Map movie reviews to integers...')
for review in df['review']:  # moved here from below the class definition
    mapped_reviews.append([word_to_int[word] for word in review.split()])
    pbar.update()

# Left-pad with zeros if the sequence length < 200
# Use the last 200 elements if the length > 200
sequence_length = 200
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype=int)
for i, row in enumerate(mapped_reviews):
    review_arr = np.array(row)
    sequences[i, -len(row):] = review_arr[-sequence_length:]

# Split the dataset into training and test sets
X_train = sequences[:25000, :]
y_train = df.loc[:25000, 'sentiment'].values
X_test = sequences[25000:, :]
y_test = df.loc[25000:, 'sentiment'].values
# Define the mini-batch generator
np.random.seed(123)

def batch_gen(x, y=None, batch_size=64):
    n_batches = len(x) // batch_size
    x = x[:n_batches * batch_size]
    if y is not None:
        y = y[:n_batches * batch_size]
    for ii in range(0, len(x), batch_size):
        if y is not None:
            yield x[ii: ii + batch_size], y[ii: ii + batch_size]
        else:
            yield x[ii: ii + batch_size]
`
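A quick usage sketch for batch_gen (shapes assume the 200-step sequences built above); it drops the remainder that does not fill a whole batch:
`
# Iterate over mini-batches of the training split: each batch is a
# (64, 200) block of inputs with a matching (64,) block of labels.
for batch_x, batch_y in batch_gen(X_train, y_train, batch_size=64):
    print(batch_x.shape, batch_y.shape)
    break
`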
`
import tensorflow as tf
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # suppress the warning if using TF 1.4

class SentimentRNN(object):
    # Define __init__
    def __init__(self,
                 n_words,
                 seq_len=200,
                 lstm_size=256,
                 num_layers=1,
                 batch_size=64,
                 learning_rate=0.0001,
                 embed_size=200):
        self.n_words = n_words
        self.seq_len = seq_len
        self.lstm_size = lstm_size  # no. of hidden units
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size
        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()

    # Define the build method
    def build(self):
        # Define the placeholders
        tf_x = tf.placeholder(tf.int32,
                              shape=(self.batch_size, self.seq_len),
                              name='tf_x')
        tf_y = tf.placeholder(tf.float32,
                              shape=(self.batch_size),
                              name='tf_y')
        tf_keepprob = tf.placeholder(tf.float32,
                                     name='tf_keepprob')
        # Create the embedding layer
        embedding = tf.Variable(
            tf.random_uniform(
                shape=(self.n_words, self.embed_size),
                minval=-1,
                maxval=1),
            name='embedding')
        embed_x = tf.nn.embedding_lookup(embedding,
                                         tf_x,
                                         name='embed_x')
        # Define LSTM cells and stack them
        cells = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(num_units=self.lstm_size),
                output_keep_prob=tf_keepprob)
             for i in range(self.num_layers)])
        # Define the initial state
        self.initial_state = cells.zero_state(
            self.batch_size, tf.float32)
        print(' << initial state >> ', self.initial_state)
        # Put the components together with tf.nn.dynamic_rnn
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
            cell=cells,
            inputs=embed_x,
            initial_state=self.initial_state)
        # lstm_outputs shape: [batch_size, max_time, cells.output_size]
        print('\n << lstm_output >> ', lstm_outputs)
        print('\n << final state >> ', self.final_state)
        # Apply a fully-connected layer on the RNN output
        logits = tf.layers.dense(
            inputs=lstm_outputs[:, -1],
            units=1,  # dimensionality of the output space
            activation=None,
            name='logits')
        # Remove dimensions of size 1 from the tensor shape
        logits = tf.squeeze(input=logits,
                            name='logits_squeezed')
        print('\n << logits >> ', logits)
        # If you want probabilities
        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        predictions = {'probabilities': y_proba,
                       'labels': tf.cast(tf.round(y_proba),
                                         tf.int32,
                                         name='labels')}
        print('\n << predictions >> ', predictions)
        # Define the cost function
        cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf_y,
                logits=logits),
            name='cost')
        # Define the optimiser
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')

    # Define the train method
    def train(self, X_train, y_train, num_epochs):
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            iteration = 1
            for epoch in range(num_epochs):
                state = sess.run(self.initial_state)
                for batch_x, batch_y in batch_gen(
                        X_train,
                        y_train,
                        batch_size=self.batch_size):
                    feed = {'tf_x:0': batch_x,
                            'tf_y:0': batch_y,
                            'tf_keepprob:0': 0.5,
                            self.initial_state: state}
                    loss, _, state = sess.run(
                        ['cost:0',
                         'train_op',
                         self.final_state],
                        feed_dict=feed)
                    if iteration % 20 == 0:
                        print("Epoch: %d/%d Iteration: %d "
                              "| Train loss: %.5f" % (
                                  epoch + 1,
                                  num_epochs,
                                  iteration,
                                  loss))
                    iteration += 1
                if (epoch + 1) % 10 == 0:
                    self.saver.save(
                        sess,
                        "model/sentiment-%d.ckpt" % epoch)

    # Define the predict method
    def predict(self, X_data, return_proba=False):
        preds = []
        with tf.Session(graph=self.g) as sess:
            self.saver.restore(
                sess,
                tf.train.latest_checkpoint('model/'))
            test_state = sess.run(self.initial_state)
            for ii, batch_x in enumerate(batch_gen(
                    x=X_data,
                    y=None,
                    batch_size=self.batch_size), 1):
                feed = {'tf_x:0': batch_x,
                        'tf_keepprob:0': 1.0,
                        self.initial_state: test_state}
                if return_proba:
                    pred, test_state = sess.run(
                        ['probabilities:0', self.final_state],
                        feed_dict=feed)
                else:
                    pred, test_state = sess.run(
                        ['labels:0', self.final_state],
                        feed_dict=feed)
                preds.append(pred)
        return np.concatenate(preds)


rnn = SentimentRNN(n_words=n_words,
                   seq_len=sequence_length,
                   embed_size=256,
                   lstm_size=128,
                   num_layers=1,
                   batch_size=100,
                   learning_rate=0.001)
preds = rnn.predict(X_test)
y_true = y_test[:len(preds)]
print('Test accuracy... %.3f' % (np.sum(preds == y_true) / len(y_true)))
`
Create an object of the SentimentRNN class with the following parameters:
n_words = n_words, seq_len = sequence_length, embed_size = 256, lstm_size = 128, num_layers = 1, batch_size = 100, learning_rate = 0.001.
Since we have a relatively small dataset, using num_layers = 1 may generalise better.
`
ValueError                                Traceback (most recent call last)
<ipython-input-23-a3cfe03a9a49> in <module>()
----> 1 preds = rnn.predict(X_test)
      2 y_true = y_test[:len(preds)]
      3 print('Test accuracy... %.3f' % (np.sum(preds == y_true) / len(y_true)))

<ipython-input-12-d83ee67c43b6> in predict(self, X_data, return_proba)
    173         self.saver.restore(
    174             sess,
--> 175             tf.train.latest_checkpoint('model/'))
    176         test_state = sess.run(self.initial_state)
    177

/usr/local/anaconda/lib/python3.6/site-packages/tensorflow/python/training/saver.py in restore(self, sess, save_path)
   1680       return
   1681     if save_path is None:
-> 1682       raise ValueError("Can't load save_path when it is None.")
   1683     logging.info("Restoring parameters from %s", save_path)
   1684     if context.in_graph_mode():
`
The error just means tf.train.latest_checkpoint didn't find anything. It returns None, then the Saver complains because it was passed None. So there's no checkpoint in that directory.
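A hedged sketch of the missing step: in the code above, rnn.predict is called without ever calling rnn.train, and train only writes a checkpoint every 10th epoch, so model/ stays empty. Something like the following should leave a checkpoint for latest_checkpoint to find (the epoch count is an assumption):
`
import os

os.makedirs('model', exist_ok=True)          # the directory train() saves into
rnn.train(X_train, y_train, num_epochs=10)   # at least 10 epochs, since a
                                             # checkpoint is saved every 10th
print(tf.train.latest_checkpoint('model/'))  # should now print a checkpoint path
preds = rnn.predict(X_test)
`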
`
wordsList = np.load('training_data/wordsList.npy')
wordsList = wordsList.tolist()  # originally loaded as a numpy array
wordsList = [word.decode('UTF-8') for word in wordsList]  # decode words as UTF-8
wordVectors = np.load('training_data/wordVectors.npy')
`
(positiveFiles and negativeFiles were loaded into variables earlier.)
`
with tf.device('/gpu:0'):
    ids = np.zeros((numFiles, maxSeqLength), dtype='int32')
    fileCounter = 0
    for pf in positiveFiles:
        with open(pf, "r") as f:
            indexCounter = 0
            line = f.readline()
            cleanedLine = cleanSentences(line)
            split = cleanedLine.split()
            for word in split:
                try:
                    ids[fileCounter][indexCounter] = wordsList.index(word)
                except ValueError:
                    ids[fileCounter][indexCounter] = 399999  # vector for unknown words
                indexCounter = indexCounter + 1
                if indexCounter >= maxSeqLength:
                    break
            fileCounter = fileCounter + 1
    for nf in negativeFiles:
        with open(nf, "r") as f:
            indexCounter = 0
            line = f.readline()
            cleanedLine = cleanSentences(line)
            split = cleanedLine.split()
            for word in split:
                try:
                    ids[fileCounter][indexCounter] = wordsList.index(word)
                except ValueError:
                    ids[fileCounter][indexCounter] = 399999  # vector for unknown words
                indexCounter = indexCounter + 1
                if indexCounter >= maxSeqLength:
                    break
            fileCounter = fileCounter + 1
    # Pass into the embedding function and see if it evaluates.
    np.save('idsMatrix', ids)

batchSize = 24
`
Training and testing methods
`
def getTrainBatch():
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        if (i % 2 == 0):
            num = randint(1, 11499)
            labels.append([1, 0])
        else:
            num = randint(13499, 24999)
            labels.append([0, 1])
        arr[i] = ids[num-1:num]
    return arr, labels

def getTestBatch():
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(11499, 13499)
        if (num <= 12499):
            labels.append([1, 0])
        else:
            labels.append([0, 1])
        arr[i] = ids[num-1:num]
    return arr, labels

with tf.device('/gpu:0'):
    batchSize = 24
    lstmUnits = 64
    numClasses = 2
    iterations = 100000
    tf.reset_default_graph()
    labels = tf.placeholder(tf.float32, [batchSize, numClasses])
    input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])
    data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]), dtype=tf.float32)
    data = tf.nn.embedding_lookup(wordVectors, input_data)
    lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
    lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
    value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

with tf.device('/gpu:0'):
    weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
    bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
    value = tf.transpose(value, [1, 0, 2])
    last = tf.gather(value, int(value.get_shape()[0]) - 1)
    prediction = (tf.matmul(last, weight) + bias)
    correctPred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
    optimizer = tf.train.AdamOptimizer().minimize(loss)

sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

with tf.device('/gpu:0'):
    for i in range(iterations):
        nextBatch, nextBatchLabels = getTrainBatch()
        sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})

iterations = 10
for i in range(iterations):
    nextBatch, nextBatchLabels = getTestBatch()
    sess.run(accuracy, {input_data: nextBatch, labels: nextBatchLabels})
`
Here I am trying to predict the output in the form of 1 or 0 for a given sentence. After loading the model from the checkpoint like this, how am I supposed to test whether the sentence is positive (1) or negative (0)?
`
new_saver = tf.train.import_meta_graph('models/pretrained....')
new_saver.restore(sess, tf.train.latest_checkpoint('models/./'))
`
Please help.
Use naming for the inputs and the output, then retrieve the tensors from the graph to do prediction. I have suggested the few required changes, plus additional code to get prediction going:
`
...
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength], name='inputs')
...
prediction = (tf.matmul(last, weight) + bias)
# you may use softmax if you want probabilities for prediction, but not for calculating the loss
# prediction = tf.nn.softmax(prediction)
prediction = tf.identity(prediction, name='prediction')
...
with tf.device('/gpu:0'):
    for i in range(iterations):
        nextBatch, nextBatchLabels = getTrainBatch()
        sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
    saver.save(sess, 'model')
`
Code for restoring (use the relative/absolute path to model.meta and model):
`
with tf.Session() as sess:
    # import the graph and restore the weights inside an active session
    new_saver = tf.train.import_meta_graph('/path/to/model.meta')
    new_saver.restore(sess, '/path/to/model')
    g = tf.get_default_graph()
    inputs = g.get_tensor_by_name('inputs:0')
    prediction = g.get_tensor_by_name('prediction:0')
    prediction_ = sess.run(prediction, {inputs: your_inputs})
`