Training step not executing in PyTorch Lightning

I am working to fine-tune a T5 model to summarize Amazon reviews. I am following this tutorial: https://towardsdatascience.com/fine-tuning-a-t5-transformer-for-any-summarization-task-82334c64c81
I noticed that training_step in my code is never executed, as the training loss remains "NaN" throughout the epoch. The validation_step, however, is computed fine.
I already confirmed that there are no empty strings in the data and have tried multiple batch sizes.
This is the error
RuntimeError Traceback (most recent call last)
<ipython-input-53-45d4afebefac> in <module>()
----> 1 trainer.fit(model)
8 frames
<ipython-input-46-00fddffa2209> in training_epoch_end(self, outputs)
134 print("OUTPUTS")
135 print(outputs)
--> 136 avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
137 tensorboard_logs = {"avg_train_loss": avg_train_loss}
138 return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}
RuntimeError: stack expects a non-empty TensorList
I found that the training_step function is never being executed by adding print statements inside the training_step function.
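For context, that RuntimeError is exactly what torch.stack raises when it is handed an empty list, which is what training_epoch_end receives when no training_step outputs were collected during the epoch:
import torch

torch.stack([])   # RuntimeError: stack expects a non-empty TensorList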
Below is my code for the T5FineTuner class (sorry I can't be any more concise):
class T5FineTuner(pl.LightningModule):
def __init__(self, hparams):
super(T5FineTuner, self).__init__()
self.hparams = hparams
self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
self.rouge_metric = load_metric('rouge')
if self.hparams.freeze_embeds:
self.freeze_embeds()
if self.hparams.freeze_encoder:
self.freeze_params(self.model.get_encoder())
assert_all_frozen(self.model.get_encoder())
n_observations_per_split = {
"train": self.hparams.n_train,
"validation": self.hparams.n_val,
"test": self.hparams.n_test,
}
self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}
def freeze_params(self, model):
for par in model.parameters():
par.requires_grad = False
def freeze_embeds(self):
"""Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."""
try:
self.freeze_params(self.model.model.shared)
for d in [self.model.model.encoder, self.model.model.decoder]:
self.freeze_params(d.embed_positions)
self.freeze_params(d.embed_tokens)
except AttributeError:
self.freeze_params(self.model.shared)
for d in [self.model.encoder, self.model.decoder]:
self.freeze_params(d.embed_tokens)
def lmap(self, f, x):
"""list(map(f, x))"""
return list(map(f, x))
def is_logger(self):
return True
def parse_score(self, result):
return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}
def forward(
self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
):
return self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
labels=labels,
)
def _step(self, batch):
labels = batch["target_ids"]
labels[labels[:, :] == self.tokenizer.pad_token_id] = -100
# print(labels)
outputs = self(
input_ids=batch["source_ids"],
attention_mask=batch["source_mask"],
labels=labels,
decoder_attention_mask=batch['target_mask']
)
# print(outputs)
loss = outputs[0]
return loss
def ids_to_clean_text(self, generated_ids):
gen_text = self.tokenizer.batch_decode(
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
return self.lmap(str.strip, gen_text)
def _generative_step(self, batch) :
t0 = time.time()
generated_ids = self.model.generate(
batch["source_ids"],
attention_mask=batch["source_mask"],
use_cache=True,
decoder_attention_mask=batch['target_mask'],
max_length=150,
num_beams=2,
repetition_penalty=2.5,
length_penalty=1.0,
early_stopping=False,
)
preds = self.ids_to_clean_text(generated_ids)
target = self.ids_to_clean_text(batch["target_ids"])
gen_time = (time.time() - t0) / batch["source_ids"].shape[0]
loss = self._step(batch)
# print("LOSS _generative_step")
# print(loss)
base_metrics = {'val_loss': loss}
# rouge: Dict = self.calc_generative_metrics(preds, target)
summ_len = np.mean(self.lmap(len, generated_ids))
base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target)
self.rouge_metric.add_batch(preds, target)
# rouge_results = self.rouge_metric.compute()
# rouge_dict = self.parse_score(rouge_results)
# base_metrics.update(rouge1=rouge_dict['rouge1'], rougeL=rouge_dict['rougeL'])
return base_metrics
def training_step(self, batch, batch_idx):
print("training_step")
print(batch)
loss = self._step(batch)
tensorboard_logs = {"train_loss": loss}
print("LOSS")
print(loss)
return {"loss": loss, "log": tensorboard_logs}
def training_epoch_end(self, outputs):
print("OUTPUTS")
print(outputs)
avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
tensorboard_logs = {"avg_train_loss": avg_train_loss}
return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}
def validation_step(self, batch, batch_idx):
print("validation_step")
return self._generative_step(batch)
def validation_epoch_end(self, outputs):
avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
tensorboard_logs = {"val_loss": avg_loss}
rouge_results = self.rouge_metric.compute()
rouge_dict = self.parse_score(rouge_results)
tensorboard_logs.update(rouge1=rouge_dict['rouge1'], rougeL=rouge_dict['rougeL'])
## Clear out the lists for next epoch
self.target_gen= []
self.prediction_gen=[]
return {"avg_val_loss": avg_loss,
"rouge1" : rouge_results['rouge1'],
"rougeL" : rouge_results['rougeL'],
"log": tensorboard_logs, 'progress_bar': tensorboard_logs}
def configure_optimizers(self):
"Prepare optimizer and schedule (linear warmup and decay)"
model = self.model
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": self.hparams.weight_decay,
},
{
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
self.opt = optimizer
return [optimizer]
def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None, using_native_amp=False, optimizer_closure=None, on_tpu=None, using_lbfgs=None):
# if self.trainer.use_tpu:
# xm.optimizer_step(optimizer)
# else:
optimizer.step()
optimizer.zero_grad()
self.lr_scheduler.step()
def get_tqdm_dict(self):
tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
return tqdm_dict
def train_dataloader(self):
print("train_dataloader")
n_samples = self.n_obs['train']
print(n_samples)
dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, num_workers=4)
print(len(dataloader.dataset))
print(self.hparams.train_batch_size * max(1, self.hparams.n_gpu))
print(self.hparams.gradient_accumulation_steps)
print(float(self.hparams.num_train_epochs))
t_total = (
(len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
# // self.hparams.gradient_accumulation_steps
* float(self.hparams.num_train_epochs)
)
print(t_total)
scheduler = get_linear_schedule_with_warmup(
self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
)
self.lr_scheduler = scheduler
return dataloader
def val_dataloader(self):
n_samples = self.n_obs['validation']
# validation_dataset = get_dataset(tokenizer=self.tokenizer, type_path="validation", num_samples=n_samples, args=self.hparams)
return DataLoader(validation_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)
def test_dataloader(self):
n_samples = self.n_obs['test']
# test_dataset = get_dataset(tokenizer=self.tokenizer, type_path="test", num_samples=n_samples, args=self.hparams)
return DataLoader(test_dataset, batch_size=self.hparams.test_batch_size, num_workers=4)
Below are my parameters:
args_dict = dict(
output_dir="", # path to save the checkpoints
model_name_or_path='t5-small',
tokenizer_name_or_path='t5-small',
max_input_length=512,
max_output_length=150,
freeze_encoder=False,
freeze_embeds=False,
learning_rate=3e-4,
weight_decay=0.0,
adam_epsilon=1e-8,
warmup_steps=0,
train_batch_size=20,
eval_batch_size=20,
num_train_epochs=2,
gradient_accumulation_steps=8,
n_gpu=1,
resume_from_checkpoint=None,
val_check_interval = 0.05,
n_val=1000,
n_train=-1,
n_test=-1,
early_stop_callback=False,
fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
seed=42,
)

It seems that this code is quite outdated, and what causes the conflict is the optimizer_step() method. I just commented out the whole segment below and it worked for me. If you want to do any custom logic in this hook, it is better to consult the latest PyTorch Lightning code on GitHub.
def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None, using_native_amp=False,on_tpu=None,using_lbfgs=None, optimizer_closure=None):
if self.trainer.use_tpu:
xm.optimizer_step(optimizer)
else:
optimizer.step(closure=optimizer_closure)
optimizer.zero_grad()
self.lr_scheduler.step()
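For reference, if custom logic is really needed in this hook, a minimal sketch for recent PyTorch Lightning versions looks roughly like the following (argument names and order differ between Lightning releases, so treat it as a template rather than the exact signature). The key point is that the closure Lightning passes in is what runs training_step and backward, so it has to be forwarded to optimizer.step():
def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx,
                   optimizer_closure, on_tpu=False, using_native_amp=False,
                   using_lbfgs=False):
    # The closure wraps training_step + backward; dropping it skips the
    # training step entirely, which matches the symptom in the question.
    optimizer.step(closure=optimizer_closure)
    self.lr_scheduler.step()   # keep the manual scheduler stepping from the original code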

Related

How can I predict only 5 days of prices with this LSTM model (PyTorch)?

class StockDataset(Dataset):
# the dataset's job is to return the i-th record
def __init__(self, symbol, x_frames, y_frames, start, end):
self.symbol = symbol
self.x_frames = x_frames
self.y_frames = y_frames
self.start = datetime.datetime(*start)
self.end = datetime.datetime(*end)
# receive all the data and date values specified above
self.data = pdr.DataReader(self.symbol, 'yahoo', self.start, self.end)
def __len__(self):
return len(self.data) - (self.x_frames + self.y_frames) + 1
def __getitem__(self, idx):
global data
#global data_set
# when the i-th item is requested, return it; this turns the data into a 'list'-like slice
idx += self.x_frames
data = self.data.iloc[idx-self.x_frames:idx+self.y_frames]
data = data[['High', 'Low', 'Open', 'Close', 'Adj Close', 'Volume']]
data = data.apply(lambda x: np.log(x+1) - np.log(x[self.x_frames-1]+1)) # convert to log returns; add 1 to guard against possible missing values
global x_ex
global y_ex
x_ex= data[:self.x_frames]
y_ex= data[self.x_frames:]
data = data.values # converted to a numpy array
X = data[:self.x_frames]
y = data[self.x_frames:]
return X, y
This is the dataset.
class LSTM(nn.Module):
# model explained from the 50-minute mark onward
def __init__(self, input_dim, hidden_dim, output_dim, num_layers, batch_size, dropout, use_bn):
super(LSTM, self).__init__()
self.input_dim = input_dim
self.hidden_dim = hidden_dim
self.output_dim = output_dim
self.num_layers = num_layers
self.batch_size = batch_size
self.dropout = dropout
self.use_bn = use_bn
self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers)
self.hidden = self.init_hidden()
self.regressor = self.make_regressor()
def init_hidden(self):
return (torch.zeros(self.num_layers, self.batch_size, self.hidden_dim),
torch.zeros(self.num_layers, self.batch_size, self.hidden_dim))
def make_regressor(self):
layers = []
if self.use_bn:
layers.append(nn.BatchNorm1d(self.hidden_dim))
layers.append(nn.Dropout(self.dropout))
layers.append(nn.Linear(self.hidden_dim, self.hidden_dim // 2))
layers.append(nn.ReLU())
layers.append(nn.Linear(self.hidden_dim // 2, self.output_dim))
regressor = nn.Sequential(*layers)
return regressor
def forward(self, x):
lstm_out, self.hidden = self.lstm(x, self.hidden)
y_pred = self.regressor(lstm_out[-1].view(self.batch_size, -1))
return y_pred
This is the model.
def test(model, partition, args):
global y_true
global y_pred
global X
testloader = DataLoader(partition['test'],
batch_size=args.batch_size,
shuffle=False, drop_last=True)
model.eval()
test_acc = 0.0
with torch.no_grad():
for i, (X, y) in enumerate(testloader):
X = X.transpose(0, 1).float().to(args.device)
y_true = y[:, :, 3].float().to(args.device)
model.hidden = [hidden.to(args.device) for hidden in model.init_hidden()]
y_pred = model(X)
test_acc += metric(y_pred, y_true)[0]
test_acc = test_acc / len(testloader)
return test_acc
This is the test data loader.
# ====== Random Seed Initialization ====== #
seed = 666
np.random.seed(seed)
torch.manual_seed(seed)
parser = argparse.ArgumentParser()
args = parser.parse_args("")
args.exp_name = "exp1_lr"
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
# ====== Data Loading ====== #
args.symbol = '005930.KS' # the ticker you want
args.batch_size = 4 # batch size
args.x_frames = 5 # x: the previous n days of data; if this is too long, a one-week prediction becomes impossible
args.y_frames = 5 # y: the following n days of data; if this is too long, a one-week prediction becomes impossible
# ====== Model Capacity ===== #
args.input_dim = 6
args.hid_dim = 50
args.n_layers = 2 # (number of hidden layers) see https://justkode.kr/deep-learning/pytorch-rnn
# ====== Regularization ======= #
args.l2 = 0.0001
args.dropout = 0.3
args.use_bn = True
# ====== Optimizer & Training ====== #
args.optim = 'RMSprop' #'RMSprop' #SGD, RMSprop, ADAM...
args.lr = 0.001
args.epoch = 1
# ====== Experiment Variable ====== #
name_var1 = 'lr' # lr = learning rate
name_var2 = 'n_layers' # how many layers to stack in the network?
list_var1 = [0.001, 0.0001, 0.00001]
list_var2 = [1,2,3]
# actually build the datasets
trainset = StockDataset(args.symbol, args.x_frames, args.y_frames, (2012,1,1), (2021,1,1)) # training period
valset = StockDataset(args.symbol, args.x_frames, args.y_frames, (2021,1,2), (2021,12,30)) # validation period: needs at least about +6 months +19 days, otherwise a "float division by zero" error occurs. Why?? (as of 2021)
testset = StockDataset(args.symbol, args.x_frames, args.y_frames, (2022,1,10), (2022,1,14)) # test period: needs at least about +6 months +25 days, otherwise a "float division by zero" error occurs (as of 2022)
# The period seems to require a certain minimum number of trading days? << it needs at least 146 trading days of data; with fewer it errors out. Why??
partition = {'train': trainset, 'val':valset, 'test':testset}
for var1 in list_var1:
for var2 in list_var2:
setattr(args, name_var1, var1)
setattr(args, name_var2, var2)
print(args)
setting, result = experiment(partition, deepcopy(args))
save_exp_result(setting, result)
# Be sure to delete the files in the directory before plotting, otherwise all the results overlap
This part sets the hyperparameters.
I wonder how I can get results when I set the test-set length to just 5 days (e.g. (2022,1,10) to (2022,1,14)).
This code doesn't work unless I set the test-set length to at least about 7 months (maybe +146 trading days); the error is "float division by zero" whenever I use fewer than ~146 days. If I set the length to 146+ days, the code works fine.
I think this line causes the error:
data = data.apply(lambda x: np.log(x+1) - np.log(x[self.x_frames-1]+1))
The log values were so small that the error occurred (my opinion). The data is Yahoo Finance data. Thanks for reading.
When I comment out the line below, the data becomes infinite:
data = data.apply(lambda x: np.log(x+1) - np.log(x[self.x_frames-1]+1))
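For illustration, a minimal self-contained sketch (the class below is a stand-in, not the real StockDataset) of one way a very short date range can trigger that error: __len__ is len(data) - (x_frames + y_frames) + 1, and test() builds its DataLoader with batch_size=4 and drop_last=True, so with only ~5 trading days the loader contains zero batches and test_acc / len(testloader) divides by zero.
from torch.utils.data import DataLoader, Dataset

class TinyDataset(Dataset):
    # same length formula as StockDataset, clamped at 0 so DataLoader accepts it
    def __init__(self, n_rows, x_frames=5, y_frames=5):
        self.n_rows, self.x_frames, self.y_frames = n_rows, x_frames, y_frames
    def __len__(self):
        return max(self.n_rows - (self.x_frames + self.y_frames) + 1, 0)
    def __getitem__(self, idx):
        return idx

# ~5 trading days in the test range, batch_size=4, drop_last=True as in test()
loader = DataLoader(TinyDataset(n_rows=5), batch_size=4, shuffle=False, drop_last=True)
print(len(loader))   # 0 -> "float division by zero" when the accuracy sum is divided by it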

tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [100,200] vs. [100,10,200]

The shape of the tensor input to my model is (None, 10, 256); after processing by the attention layer, the shape becomes (None, 256). How should I modify the layer's compute_output_shape(self, input_shape) so that the shape of the model does not change?
The attention layer:
class Attention_layer(Layer):
def __init__(self,
W_regularizer=None, b_regularizer=None,
W_constraint=None, b_constraint=None,
bias=True, **kwargs):
self.supports_masking = True
self.init = initializers.get('glorot_uniform')
self.W_regularizer = regularizers.get(W_regularizer)
self.b_regularizer = regularizers.get(b_regularizer)
self.W_constraint = constraints.get(W_constraint)
self.b_constraint = constraints.get(b_constraint)
self.bias = bias
super(Attention_layer, self).__init__(**kwargs)
def build(self, input_shape):
assert len(input_shape) == 3
self.W = self.add_weight(name='att_weight',shape=(input_shape[-1], input_shape[-1],),
initializer=self.init,
regularizer=self.W_regularizer,
constraint=self.W_constraint
)
if self.bias:
self.b = self.add_weight((input_shape[-1],),
initializer='zero',
name='{}_b'.format(self.name),
regularizer=self.b_regularizer,
constraint=self.b_constraint)
super(Attention_layer, self).build(input_shape)
def compute_mask(self, input, input_mask=None):#build(input_shape):
# do not pass the mask to the next layers
return None
def call(self, x, mask=None):#call(x):
uit = K.dot(x, self.W)
if self.bias:
uit += self.b
uit = K.tanh(uit)
a = K.exp(uit)
# apply mask after the exp. will be re-normalized next
if mask is not None:
# Cast the mask to floatX to avoid float64 upcasting in theano
a *= K.cast(mask, K.floatx())
# in some cases especially in the early stages of training the sum may be almost zero
# and this results in NaN's. A workaround is to add a very small positive number to the sum.
# a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
# a = K.expand_dims(a)
weighted_input = x * a
print(weighted_input)
return K.sum(weighted_input, axis=1)#output.shape = (batch_size, embedding_size)
def compute_output_shape(self, input_shape):
return input_shape[0], input_shape[-1]

A trained EfficientNet-V2-B3 with train acc 0.99 and val acc 0.99 is very good at predicting the test dataset but poor on the train & val datasets

I used the pretrained EfficientNet-V2-B3 model from 'https://github.com/rwightman/pytorch-image-models' and the PyTorch framework to train a cigarette-box classifier. The training setup is as follows:
There are 1100 classes, each corresponding to one cigarette specification. All images are stored in a directory named original_dataset_20210805, in which each sub-directory holds one class of images.
Classes with fewer than 50 images are removed, which leaves 959 classes.
For each class, 10 images are randomly selected into the validation dataset 'valData', about 1/10 of the images are randomly selected into the test dataset 'testData', and the remaining images go into the training dataset 'trainData'.
Each image is resized to w×h = 200×300.
To augment the data, each image is rotated, and the rotated copies of a class form a new class. For example, for cigarette specification A, rotating all its images by 90° gives a new class A-rot1; rotating by 180° gives A-rot2 and by 270° gives A-rot3. Applying this to all classes yields 959×4 = 3836 classes.
The 'trainData' has 502172 images, the 'valData' has 38360 images, and the 'testData' has 21463 images.
The pretrained model is used to start training. The best model is saved as follows:
if train_acc > last_train_acc and val_acc > last_val_acc:
save_best_model()
Training exits once train_acc >= 0.99 and val_acc >= 0.99.
At epoch 121, training exits with train_acc 0.9911 and val_acc 0.9902.
Using the best model to infer on testData, the accuracy is 0.981. Using the best model to infer on trainData, I expected the accuracy to be above 0.99, but it is actually 0.84. On valData, the actual accuracy is 0.82. This is very strange. I then used the best model on another dataset, original_dataset_20210709, which is somewhat different from original_dataset_20210805 above and whose images have not been resized to w×h = 200×300. The accuracy there is 0.969.
The inference code is as follows:
def infer(cfg:Config):
transform_test = build_transforms(cfg.img_height, cfg.img_width, 'test')
model = get_model(cfg, 'test')
model = model.to(cfg.get_device())
model.eval()
records = []
sub_classes = os.listdir(cfg.test_data_dirname)
if sub_classes is None or len(sub_classes) < 1:
return
sub_classes= sorted(sub_classes)
classid_dict = {}
with open(cfg.classid_file, 'r', encoding='utf-8') as f:
lines = f.readlines()
for line in lines:
line = line.strip()
tokens = line.split(',')
classid_dict[int(tokens[0])] = tokens[1]
records.append(cfg.test_data_dirname + ',' + str(len(sub_classes)) + ' classes\n')
records.append('image, prediction result\n')
start_time = datetime.now()
elapsed = 0.0
count = 0
with torch.no_grad():
for sub_cls in sub_classes:
print(' process sub-directory' + sub_cls)
files = os.listdir(os.path.join(cfg.test_data_dirname, sub_cls))
count += len(files)
if files is None or len(files) < 1:
print('The sub-directory ' + sub_cls + " has no files")
continue
for file in files:
try:
img_path = os.path.join(cfg.test_data_dirname, sub_cls, file)
if os.path.isfile(img_path):
img_test = Image.open(img_path)
img = img_test
img = transform_test(img).to(cfg.get_device())
img = torch.unsqueeze(img, 0)
output = model(img)
_, preds = torch.max(output.data, 1)
id = preds[0].item()
if classid_dict.get(id) is not None:
#print(img_path + ' is predicted as:' + classid_dict[id])
records.append(sub_cls + '/' + file + ',' + classid_dict[id] + '\n')
log_func(sub_cls + '/' + file + ' is predicted as:' + classid_dict[id])
pass
else:
records.append(sub_cls + '/' + file + ', unknown class\n')
except Exception as e:
print(str(e))
elapsed = (datetime.now() - start_time).total_seconds()
records.append('elapsed {:.4f} sec,average elapsed {:.4f} sec\n'.format(elapsed, elapsed/count))
result_path = os.path.join(cfg.results_dir, 'infer_' + cfg.backbone + '_' + str(cfg.num_classes) + '_' + format_datetime(datetime.now()) + '.csv')
with open(result_path, 'w', encoding='utf-8') as f:
f.writelines(records)
I checked the Python code and think a possible reason may be the transform applied to the image before it is fed into the model. The transform code is as follows:
def build_transforms(img_height, img_width, run_mode="train", mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
if run_mode == 'train':
transform = T.Compose([
# Use OpenCV to open the image
T.Lambda(lambda img: random_rotate_bound(img, 30)),
T.Lambda(lambda img: random_translate(img, 20)),
T.Lambda(lambda img: random_zoom(img)),
T.Lambda(lambda img: sameScaleZoom(img, img_height, img_width)),
T.RandomChoice([T.Lambda(lambda img: random_AffineTransform(img)),
T.Lambda(lambda img: random_warpPerspective(img))]),
T.RandomChoice([T.Lambda(lambda img: random_degarde_img(img)),
T.Lambda(lambda img: random_mosaic(img)),
T.Lambda(lambda img: random_motion_blur(img)),
T.Lambda(lambda img: random_focus_blur(img))]),
# Convert the OpenCV-format image into PIL before continue
T.ToPILImage('RGB'),
T.RandomOrder([T.ColorJitter(brightness=0.5),
T.ColorJitter(contrast=(0.2, 1.8)),
T.ColorJitter(saturation=(0.2, 1.8)),
T.ColorJitter(hue=0.08)]),
T.ToTensor(),
T.Normalize(mean, std)
])
else:
transform = T.Compose([
#T.Lambda(lambda img: sameScaleZoom(img, img_height, img_width)),
# On this case, use PIL rather than OpenCV to open the image
T.Resize(size=(img_height, img_width)),
T.ToTensor(),
T.Normalize(mean, std)
])
return transform
To verify my guess, for the inference dataset 'valData' (not 'trainData', because that takes too much time), I changed the transform from transform_test = build_transforms(cfg.img_height, cfg.img_width, 'test') to transform_test = build_transforms(cfg.img_height, cfg.img_width, 'train'). As expected, the accuracy is then 0.9918.
My question is:
In inference, the trained model has an accuracy of 0.989 on testData, but an accuracy of about 0.84 on trainData and about 0.82 on valData.
What am I doing wrong in the transform?
Or is there another reason that could cause such a strange phenomenon?
Thanks to everyone who is willing to answer.
Appended 1:
12) The validation code is as follows:
def val(cfg:Config, model, criterion, transform=None):
start_time = datetime.now()
val_loss = 0
total = 0
val_correct = 0
model.eval()
if transform is None:
transform = build_transforms(cfg.img_height, cfg.img_width)
dset_loader, dset_size = load_data(cfg, transform, run_mode='val', shuffle=False)
for data in dset_loader:
inputs, labels = data
if cfg.is_use_cuda:
#inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())
inputs = inputs.cuda()
labels = torch.stack([anno.cuda() for anno in labels])
else:
#inputs, labels = Variable(inputs), Variable(labels)
pass
with torch.no_grad():
outputs = model(inputs)
loss = criterion(outputs, labels)
_, preds = torch.max(outputs.data, 1)
val_loss += loss.data.item()*inputs.size(0)
val_correct += torch.sum(preds == labels.data)
val_loss /= dset_size
val_acc = val_correct.item()*1.0/dset_size
elapsed = (datetime.now() - start_time).total_seconds()
log_func('exit val,{} samples,elapsed {:.4f} sec,average elapsed{:.4f} sec'.format(dset_size, elapsed, elapsed/dset_size))
return val_loss, val_acc
The load_data code is:
def load_data(cfg:Config, transform, run_mode='train', shuffle=True):
if run_mode == 'train':
dataset = TheDataset(cfg, transform, run_mode)
data_loader = DataLoader(dataset, batch_size=cfg.train_batch_size, shuffle=shuffle, num_workers=cfg.num_workers)
return data_loader, len(dataset)
else:
dataset = TheDataset(cfg, transform, run_mode)
data_loader = DataLoader(dataset, batch_size=cfg.val_batch_size, shuffle=shuffle, num_workers=cfg.num_workers)
return data_loader, len(dataset)
The class 'TheDataset' is defined as follows:
class TheDataset(Dataset):
def __init__(self, cfg:Config, transforms, run_mode='train') -> None:
super().__init__()
self.img_mode = cfg.img_mode
self.transforms = transforms
self.config = cfg
self.run_mode = run_mode
assert cfg is not None, "The config object cannot be none"
assert cfg.train_data_dirname is not None, "The train data cannot be none"
assert transforms is not None, 'The transforms cannot be none'
self.label_list = list()
self.path_list = list()
self.label_2_path_index_list = {} # Key:the label,value:a list each element of which is the index of the image file path related to the key in path_list
if run_mode == 'train':
self.dirname = cfg.train_data_dirname
self.file_path = cfg.train_data_file_list
elif run_mode == 'val':
self.dirname = cfg.val_data_dirname
self.file_path = cfg.val_data_file_list
elif run_mode == 'test':
self.dirname = cfg.test_data_dirname
self.file_path = cfg.test_data_file_list
else:
self.dirname = cfg.train_data_dirname
self.file_path = cfg.train_data_file_list
index = 0
with open(self.file_path, 'r') as f:
for line in f:
if line is not None and len(line) > 5:
a_path, a_label = line.strip().split(',')
if a_path is not None and a_label is not None:
a_label = int(a_label)
self.path_list.append(os.path.join(self.dirname, a_path.strip()))
self.label_list.append(a_label)
if self.label_2_path_index_list.get(a_label) is None:
self.label_2_path_index_list[a_label] = []
self.label_2_path_index_list[a_label].append(index)
index += 1
def __getitem__(self, index):
img_path = self.path_list[index]
img_label = self.label_list[index]
img = cv2.imread(img_path)
if self.img_mode == 'RGB':
try:
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
except:
msg = 'cannot convert to RGB:' + img_path
log_func(msg)
img = self.transforms(img)
return img, img_label
def __len__(self):
return len(self.label_list)
def __repr__(self):
return self.__str__()
def __str__(self):
return "TheDataset info: datasize={}, num_labels={}".format(len(self.path_list), len(self.label_2_path_index_list))
Appended 2:
15) The whole train.py is:
from pathlib import WindowsPath
import sys
import json
import os
import cv2
import torch
import torch.nn as nn
from PIL import Image
import torch.optim as optim
from torch.autograd import Variable
from datetime import datetime
import pandas as pd
from torch.cuda.amp.grad_scaler import GradScaler
from torch.cuda.amp.autocast_mode import autocast
from torchvision import transforms, datasets
from efficientnet_pytorch import EfficientNet
import torch.nn.functional as F
from part01_data import load_data
from part03_transform import build_transforms
from part02_model import get_model, exp_lr_scheduler
from utils import print, set_logpath, format_datetime, write_one_log_record
from config import Config, ConfigEncoder
log_path = ''
def val(cfg:Config, model, criterion, transform=None):
start_time = datetime.now()
val_loss = 0
total = 0
val_correct = 0
model.eval()
if transform is None:
transform = build_transforms(cfg.img_height, cfg.img_width)
dset_loader, dset_size = load_data(cfg, transform, run_mode='val', shuffle=False)
for data in dset_loader:
inputs, labels = data
if cfg.is_use_cuda:
#inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())
inputs = inputs.cuda()
labels = torch.stack([anno.cuda() for anno in labels])
else:
#inputs, labels = Variable(inputs), Variable(labels)
pass
with torch.no_grad():
outputs = model(inputs)
loss = criterion(outputs, labels)
_, preds = torch.max(outputs.data, 1)
val_loss += loss.data.item()*inputs.size(0)
val_correct += torch.sum(preds == labels.data)
val_loss /= dset_size
val_acc = val_correct.item()*1.0/dset_size
elapsed = (datetime.now() - start_time).total_seconds()
print('val exit,{} samples,elapsed {:.4f} sec,average elapsed {:.4f} sec'.format(dset_size, elapsed, elapsed/dset_size))
return val_loss, val_acc
def train(cfg:Config, shuffle=True):
train_log_path = os.path.join(cfg.results_dir, cfg.backbone + '_' + str(cfg.num_classes) + 'classes_' + format_datetime(datetime.now()) + '.csv')
print('Begin to train,the data directory:' + cfg.train_data_dirname)
if cfg.is_use_apex:
scaler = GradScaler()
# step 1:Preparation
best_acc = 0.0
best_val_acc = 0.0
start_epoch = -1
criterion = nn.CrossEntropyLoss()
model_ft, optimizer_args, start_epoch, best_acc, best_val_acc = get_model(cfg, 'train')
if cfg.is_use_cuda:
model_ft = model_ft.cuda()
criterion = criterion.cuda()
optimizer = optim.SGD(model_ft.parameters(), lr=1e-2, momentum=0.9, weight_decay=0.0004)
if optimizer_args is not None:
optimizer.load_state_dict(optimizer_args)
since = datetime.now()
best_model_wts = model_ft.state_dict()
transform = build_transforms(cfg.img_height, cfg.img_width)
print('the transforms are as follows:')
print(str(transform))
print('preparation is finished')
write_one_log_record('epoch, train loss, train accuracy, validation loss, validation accuracy, elapsed/minute\n', train_log_path, 'w')
start_epoch_dt = datetime.now()
for epoch in range(start_epoch+1,cfg.num_epochs):
# step 2:load data and adjust optimizer
model_ft.train(True)
dset_loader, dset_size = load_data(cfg, transform, run_mode='train', shuffle=shuffle)
print('Epoch: {}/{},totally {} images'.format(epoch+1, cfg.num_epochs, dset_size))
optimizer = exp_lr_scheduler(optimizer, epoch)
running_loss = 0.0
running_corrects = 0
count = 0
batch_count = len(dset_loader)
start_batches_dt = datetime.now()
# step 3:begin batch train
for data in dset_loader:
# step 3.1:detach sample and label and move them to the device
inputs, labels = data
if cfg.is_use_cuda:
#inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())
inputs = inputs.cuda()
labels = torch.stack([anno.cuda() for anno in labels])
else:
#inputs, labels = Variable(inputs), Variable(labels)
pass
# step 3.2:compute and forward
optimizer.zero_grad()
if cfg.is_use_apex:
with autocast():
outputs = model_ft(inputs)
loss = criterion(outputs, labels)
scaler.scale(loss).backward()
scaler.unscale_(optimizer)
scaler.step(optimizer)
scaler.update()
else:
outputs = model_ft(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# step 3.3:detach label and compute loss and correct count
_, preds = torch.max(outputs.data, 1)
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
# step 3.4:print batch info
count += 1
start_batches_dt = output_batch_info(cfg, epoch, count, batch_count, loss.item(), outputs.size()[0], start_batches_dt)
# step 4:exit this epoch and compute the loss
train_loss = running_loss / dset_size
train_acc = running_corrects.double() / dset_size
val_loss, val_acc = val(cfg, model_ft, criterion, transform)
# step 5:judge the best model and save it
best_model_wts, best_acc, best_val_acc = save_best_model(cfg, model_ft, best_model_wts, train_acc, best_acc, val_acc, best_val_acc)
# step 6:save the last checkpoint
save_newest_checkpoint(cfg, model_ft, optimizer, epoch, best_acc, best_val_acc)
# step 7:save the middle checkpoint
save_checkpoint_per_epochs(cfg, model_ft, optimizer, epoch, best_acc, best_val_acc)
# step 8:compute the loss, accuracy and elapsed time in this epoch
start_epoch_dt = summarize_epoch_info(start_epoch_dt, epoch, train_loss, train_acc, val_loss, val_acc, train_log_path)
# step 9:judge it is proper to exit the train process
if have_meet_acc_requirement_or_not(cfg, epoch, train_loss, train_acc, val_loss, val_acc):
break
time_elapsed = (datetime.now() - since).total_seconds()
print('train complete,elapsed {}hours {:.4f} minutes'.format(time_elapsed//3600, (time_elapsed - (time_elapsed//3600)*3600)/60))
return best_model_wts
def output_batch_info(cfg:Config, epoch, count, batch_count, loss_per_sample, size_of_this_batch, start_batches_dt):
flag = ''
elapsed = (datetime.now() - start_batches_dt).total_seconds()
if count % cfg.print_per_batch == 0:
flag = str(cfg.print_per_batch)
more_time = (batch_count - count) * elapsed/cfg.print_per_batch
if size_of_this_batch < cfg.train_batch_size: # the last batch
flag = '本'  # '本' ("this") marks the final, smaller batch
more_time = (batch_count - count) * elapsed
if len(flag) > 0:
print(' Epoch: {}, batch: {}/{}, average train loss of each sample: {:.4f}, batch {} elapsed: {:.4f} sec,this epoch needs more {:.4f} sec'.format(epoch+1, count, batch_count, loss_per_sample, flag, elapsed, more_time))
return datetime.now()
return start_batches_dt
def have_meet_acc_requirement_or_not(cfg: Config, epoch, train_loss, train_acc, val_loss, val_acc):
if train_acc < cfg.acc_valve or (cfg.is_check_best_with_val_loss and val_acc < cfg.acc_valve):
return False
return True
def summarize_epoch_info(start_epoch_dt, epoch, train_loss, train_acc, val_loss, val_acc, output_path):
elapsed = (datetime.now() - start_epoch_dt).total_seconds()/60
remained_minutes = (cfg.num_epochs - epoch - 1)*elapsed
remained_hours = remained_minutes//60
remained_minutes = remained_minutes - remained_hours*60
record = '{},{:.4f},{:.4f},{:.4f},{:.4f},{:.4f}\n'.format(epoch+1, train_loss, train_acc, val_loss, val_acc, elapsed)
write_one_log_record(record, output_path, 'a')
return datetime.now()
def save_one_checkpoint(model, optimizer, epoch, best_acc, best_val_acc, output_path):
checkpoint = {
'net': model.state_dict(),
'optimizer': optimizer.state_dict(),
'epoch': epoch,
'best_acc': best_acc,
'best_val_acc': best_val_acc
}
torch.save(checkpoint, output_path)
def save_checkpoint_per_epochs(cfg:Config, model, optimizer, epoch, best_acc, best_val_acc):
if cfg.save_per_epoch > 0 and (epoch+1)%cfg.save_per_epoch == 0:
checkpoint_path = cfg.resume_ckpt_dir + "/" + cfg.backbone + f'_checkpoint_{epoch+1}_' + str(cfg.num_classes) + 'classes.pth'
save_one_checkpoint(model, optimizer, epoch, best_acc, best_val_acc, checkpoint_path)
def save_newest_checkpoint(cfg:Config, model, optimizer, epoch, best_acc, best_val_acc):
checkpoint_path = cfg.resume_ckpt_dir + "/" + cfg.backbone + '_checkpoint_last_' + str(cfg.num_classes) + 'classes.pth'
save_one_checkpoint(model, optimizer, epoch, best_acc, best_val_acc, checkpoint_path)
def save_best_model(cfg:Config, model, best_model_weights, train_acc, best_acc, val_acc, best_val_acc):
if train_acc <= best_acc or (cfg.is_check_best_with_val_loss and val_acc <= best_val_acc):
return best_model_weights, best_acc, best_val_acc
best_model_weights = model.state_dict()
model_out_path = cfg.models_dir + "/" + cfg.backbone + '_best_' + str(cfg.num_classes) + 'classes.pth'
torch.save(best_model_weights, model_out_path)
best_acc = train_acc
best_val_acc = val_acc if val_acc > best_val_acc else best_val_acc
return best_model_weights, train_acc, best_val_acc
def infer(cfg:Config):
transform_test = build_transforms(cfg.img_height, cfg.img_width, 'test')
#transform_test = build_transforms(cfg.img_height, cfg.img_width, 'train')
model = get_model(cfg, 'test')
model = model.to(cfg.get_device())
model.eval()
records = []
sub_classes = os.listdir(cfg.test_data_dirname)
if sub_classes is None or len(sub_classes) < 1:
return
sub_classes= sorted(sub_classes)
classid_dict = {}
with open(cfg.classid_file, 'r', encoding='utf-8') as f:
lines = f.readlines()
for line in lines:
line = line.strip()
tokens = line.split(',')
classid_dict[int(tokens[0])] = tokens[1]
records.append(cfg.test_data_dirname + ',' + str(len(sub_classes)) + ' classes\n')
records.append('image, predict \n')
start_time = datetime.now()
elapsed = 0.0
count = 0
with torch.no_grad():
for sub_cls in sub_classes:
files = os.listdir(os.path.join(cfg.test_data_dirname, sub_cls))
count += len(files)
if files is None or len(files) < 1:
continue
for file in files:
try:
img_path = os.path.join(cfg.test_data_dirname, sub_cls, file)
if os.path.isfile(img_path):
# When the transform is created in 'test' mode, i.e. transform = build_transforms(cfg.img_height, cfg.img_width, 'test'),
# open the image with img = Image.open(img_path)
img_test = Image.open(img_path)
img = img_test
img = transform_test(img).to(cfg.get_device())
img = torch.unsqueeze(img, 0)
output = model(img)
_, preds = torch.max(output.data, 1)
id = preds[0].item()
if classid_dict.get(id) is not None:
records.append(sub_cls + '/' + file + ',' + classid_dict[id] + '\n')
print(sub_cls + '/' + file + ' is predicted as:' + classid_dict[id])
pass
else:
records.append(sub_cls + '/' + file + ', unknown\n')
except Exception as e:
print(str(e))
elapsed = (datetime.now() - start_time).total_seconds()
records.append('elapsed {:.4f} sec ,average elapsed {:.4f} sec\n'.format(elapsed, elapsed/count))
result_path = os.path.join(cfg.results_dir, 'infer_' + cfg.backbone + '_' + str(cfg.num_classes) + '_' + format_datetime(datetime.now()) + '.csv')
with open(result_path, 'w', encoding='utf-8') as f:
f.writelines(records)
def use_one_model(cfg:Config, model_name):
cfg.backbone = model_name
log_path = os.path.join(cfg.log_dir, cfg.backbone + '_' + str(cfg.num_classes) + 'classes_' + format_datetime(datetime.now()) + '.log')
set_logpath(log_path)
start_time = datetime.now()
torch.cuda.empty_cache()
print('start, the args are:=====')
args = json.dumps(cfg, ensure_ascii=False, cls=ConfigEncoder, indent=2)
print(args)
try:
#train(cfg)
infer(cfg)
except Exception as e:
print(str(e))
elapsed = (datetime.now() - start_time).total_seconds()
hours = elapsed//3600
minutes = (elapsed - hours*3600)/60
def use_many_models(cfg:Config):
#backbones = ['efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2', 'adv-efficientnet-b0', 'adv-efficientnet-b1', 'adv-efficientnet-b2', 'tf_efficientnet_b0_ns', 'tf_efficientnet_b1_ns','tf_efficientnet_b2_ns', 'efficientnet-b3', 'adv-efficientnet-b3', 'tf_efficientnet_b3_ns']
backbones = ['tf_efficientnetv2_b0', 'tf_efficientnetv2_b1', 'tf_efficientnetv2_b2', 'tf_efficientnetv2_b3', 'tf_efficientnetv2_s']
for backbone in backbones:
use_one_model(cfg, backbone)
if __name__ == '__main__':
cfg = Config()
use_one_model(cfg, cfg.backbone)

GPU memory increasing at each batch (PyTorch)

I am trying to build a convolutional network using a ConvLSTM layer (an LSTM cell but with convolutions instead of matrix multiplications), but the problem is that my GPU memory increases at each batch, even though I'm deleting variables and retrieving the true value of the loss (not the graph) at each iteration. I may be doing something wrong, but the exact same script ran without issues with another model (with more parameters, also using a ConvLSTM layer).
Each batch is composed of num_batch x 3 images (grayscale), and I'm trying to predict the difference |Im(t+1)-Im(t)| from the input Im(t).
def main():
config = Config()
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size, num_workers=0, shuffle=True, drop_last=True)
nb_img = len(train_dataset)
util.clear_progress_dir()
step_tensorboard = 0
###################################
# Model Setup #
###################################
model = fully_convLSTM()
if torch.cuda.is_available():
model = model.float().cuda()
lr = 0.001
optimizer = torch.optim.Adam(model.parameters(),lr=lr)
util.enumerate_params([model])
###################################
# Training Loop #
###################################
model.train() #Put model in training mode
train_loss_recon = []
train_loss_recon2 = []
for epoch in tqdm(range(config.num_epochs)):
running_loss1 = 0.0
running_loss2 = 0.0
for i, (inputs, outputs) in enumerate(train_dataloader, 0):
print(i)
torch.cuda.empty_cache()
gc.collect()
# if torch.cuda.is_available():
inputs = autograd.Variable(inputs.float()).cuda()
outputs = autograd.Variable(outputs.float()).cuda()
im1 = inputs[:,0,:,:,:]
im2 = inputs[:,1,:,:,:]
im3 = inputs[:,2,:,:,:]
diff1 = torch.abs(im2 - im1).cuda().float()
diff2 = torch.abs(im3 - im2).cuda().float()
model.initialize_hidden()
optimizer.zero_grad()
pred1 = model.forward(im1)
loss = reconstruction_loss(diff1, pred1)
loss.backward()
# optimizer.step()
model.update_hidden()
optimizer.zero_grad()
pred2 = model.forward(im2)
loss2 = reconstruction_loss(diff2, pred2)
loss2.backward()
optimizer.step()
model.update_hidden()
## print statistics
running_loss1 += loss.detach().data
running_loss2 += loss2.detach().data
if i==0:
with torch.no_grad():
img_grid_diff_true = (diff2).cpu()
img_grid_diff_pred = (pred2).cpu()
f, axes = plt.subplots(2, 4, figsize=(48,48))
for l in range(4):
axes[0, l].imshow(img_grid_diff_true[l].squeeze(0).squeeze(0), cmap='gray')
axes[1, l].imshow(img_grid_diff_pred[l].squeeze(0).squeeze(0), cmap='gray')
plt.show()
plt.close()
writer_recon_loss.add_scalar('Reconstruction loss', running_loss1, step_tensorboard)
writer_recon_loss2.add_scalar('Reconstruction loss2', running_loss2, step_tensorboard)
step_tensorboard += 1
del pred1
del pred2
del im1
del im2
del im3
del diff1
del diff2#, im1_noised, im2_noised
del inputs
del outputs
del loss
del loss2
for obj in gc.get_objects():
if torch.is_tensor(obj) :
del obj
torch.cuda.empty_cache()
gc.collect()
epoch_loss = running_loss1 / len(train_dataloader.dataset)
epoch_loss2 = running_loss2/ len(train_dataloader.dataset)
print(f"Epoch {epoch} loss reconstruction1: {epoch_loss:.6f}")
print(f"Epoch {epoch} loss reconstruction2: {epoch_loss2:.6f}")
train_loss_recon.append(epoch_loss)
train_loss_recon2.append(epoch_loss2)
del running_loss1, running_loss2, epoch_loss, epoch_loss2
Here is the model used:
class ConvLSTMCell(nn.Module):
def __init__(self, input_channels, hidden_channels, kernel_size):
super(ConvLSTMCell, self).__init__()
# assert hidden_channels % 2 == 0
self.input_channels = input_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
# self.num_features = 4
self.padding = 1
self.Wxi = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
self.Whi = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
self.Wxf = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
self.Whf = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
self.Wxc = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
self.Whc = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
self.Wxo = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
self.Who = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
self.Wci = None
self.Wcf = None
self.Wco = None
def forward(self, x, h, c): ## Equation (3) in "Convolutional LSTM Network: A Machine Learning Approach for Precipitation Nowcasting"
ci = torch.sigmoid(self.Wxi(x) + self.Whi(h) + c * self.Wci)
cf = torch.sigmoid(self.Wxf(x) + self.Whf(h) + c * self.Wcf)
cc = cf * c + ci * torch.tanh(self.Wxc(x) + self.Whc(h)) ###gt= tanh(cc)
co = torch.sigmoid(self.Wxo(x) + self.Who(h) + cc * self.Wco) ##channel out = hidden channel
ch = co * torch.tanh(cc)
return ch, cc #short memory, long memory
def init_hidden(self, batch_size, hidden, shape):
if self.Wci is None:
self.Wci = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
self.Wcf = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
self.Wco = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
else:
assert shape[0] == self.Wci.size()[2], 'Input Height Mismatched!'
assert shape[1] == self.Wci.size()[3], 'Input Width Mismatched!'
return (autograd.Variable(torch.zeros(batch_size, hidden, shape[0], shape[1])).cuda(),
autograd.Variable(torch.zeros(batch_size, hidden, shape[0], shape[1])).cuda())
class fully_convLSTM(nn.Module):
def __init__(self):
super(fully_convLSTM, self).__init__()
layers = []
self.hidden_list = [1,32,32,1]#,32,64,32,
for k in range(len(self.hidden_list)-1): # Define blocks of [ConvLSTM,BatchNorm,Relu]
name_conv = "self.convLSTM" +str(k)
cell_conv = ConvLSTMCell(self.hidden_list[k],self.hidden_list[k+1],3)
setattr(self, name_conv, cell_conv)
name_batchnorm = "self.batchnorm"+str(k)
batchnorm=nn.BatchNorm2d(self.hidden_list[k+1])
setattr(self, name_batchnorm, batchnorm)
name_relu =" self.relu"+str(k)
relu=nn.ReLU()
setattr(self, name_relu, relu)
self.sigmoid = nn.Sigmoid()
self.internal_state=[]
def initialize_hidden(self):
for k in range(len(self.hidden_list)-1):
name_conv = "self.convLSTM" +str(k)
(h,c) = getattr(self,name_conv).init_hidden(config.batch_size, self.hidden_list[k+1],(256,256))
self.internal_state.append((h,c))
self.internal_state_new=[]
def update_hidden(self):
for i, hidden in enumerate(self.internal_state_new):
self.internal_state[i] = (hidden[0].detach(), hidden[1].detach())
self.internal_state_new = []
def forward(self, input):
x = input
for k in range(len(self.hidden_list)-1):
name_conv = "self.convLSTM" +str(k)
name_batchnorm = "self.batchnorm"+str(k)
name_relu =" self.relu"+str(k)
x, c = getattr(self,name_conv)(x, self.internal_state[k][1], self.internal_state[k][0])
self.internal_state_new.append((x.detach(),c.detach()))
x = getattr(self,name_batchnorm)(x)
if k!= len(self.hidden_list)-2:
x = getattr(self,name_relu)(x)
else :
x = self.sigmoid(x)
return x
So my question is, what in my code is causing memory to accumulate during the training phase?
A few quick notes about the training code:
torch.autograd.Variable has been deprecated for at least 8 minor versions (see here); don't use it.
gc.collect() is pointless; PyTorch handles garbage collection on its own.
Don't call torch.cuda.empty_cache() for every batch: PyTorch reserves some GPU memory (it doesn't give it back to the OS) precisely so it doesn't have to re-allocate it for each batch. Calling it will make your code slow; honestly, don't use this function at all, PyTorch handles this.
Don't scatter random memory-cleaning calls around; that's most probably not where the problem is.
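For example, the Variable wrappers in the training loop can simply be dropped; a tiny self-contained sketch with a stand-in tensor:
import torch

x = torch.randn(4, 1, 256, 256)   # stand-in for a batch of grayscale inputs
if torch.cuda.is_available():
    x = x.float().cuda()          # replaces autograd.Variable(inputs.float()).cuda()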
Model
Yes, this is probably the case (although it's hard to read this model's code).
Also take note of the self.internal_state and self.internal_state_new lists.
Each time you call model.initialize_hidden(), a new set of tensors is appended to self.internal_state (and never cleared, as far as I can tell).
self.internal_state_new does get cleared in update_hidden; maybe self.internal_state should be as well?
In essence, check the self.internal_state attribute of your model: from what I can see, the list grows indefinitely. Initializing with zeros everywhere is also quite strange; there is probably no need to do that (e.g. PyTorch's RNN hidden state is initialized with zeros by default, and this is probably similar).
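As a minimal sketch (untested, just following the names in the question's code, including the global config) of what clearing that list could look like, initialize_hidden() can reset self.internal_state instead of appending to whatever is already there:
def initialize_hidden(self):
    # reset instead of append: drop the references to the previous batch's
    # hidden states (and their graphs) before building the new ones
    self.internal_state = []
    self.internal_state_new = []
    for k in range(len(self.hidden_list) - 1):
        name_conv = "self.convLSTM" + str(k)
        h, c = getattr(self, name_conv).init_hidden(
            config.batch_size, self.hidden_list[k + 1], (256, 256))
        self.internal_state.append((h, c))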

Finetuning BERT with LSTM via PyTorch and transformers library. Metrics remain the same with hyperparameter changes

I know for a fact that changing the hyperparameters of an LSTM model or selecting different BERT layers causes changes in the classification result. I have tested this out using TensorFlow and Keras. I recently switched to PyTorch to do the same design, but no matter what I change, the result remains the same. Below is the code. Am I doing anything wrong?
def pad_sents(sents, pad_token): #Pad list of sentences according to the longest sentence in the batch.
sents_padded = []
max_len = max(len(s) for s in sents)
batch_size = len(sents)
for s in sents:
padded = [pad_token] * max_len
padded[:len(s)] = s
sents_padded.append(padded)
return sents_padded
def sents_to_tensor(tokenizer, sents, device):
tokens_list = [tokenizer.tokenize(str(sent)) for sent in sents]
sents_lengths = [len(tokens) for tokens in tokens_list]
tokens_list_padded = pad_sents(tokens_list, '[PAD]')
sents_lengths = torch.tensor(sents_lengths, device=device)
masks = []
for tokens in tokens_list_padded:
mask = [0 if token=='[PAD]' else 1 for token in tokens]
masks.append(mask)
masks_tensor = torch.tensor(masks, dtype=torch.long, device=device)
tokens_id_list = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list_padded]
sents_tensor = torch.tensor(tokens_id_list, dtype=torch.long, device=device)
return sents_tensor, masks_tensor, sents_lengths
class BERT_LSTM_Model(nn.Module):
def __init__(self, device, dropout_rate, n_class, lstm_hidden_size=None):
super(BERT_LSTM_Model, self).__init__()
self.bert_config = BertConfig.from_pretrained('bert-base-uncased', output_hidden_states=True)
self.bert = BertModel.from_pretrained('bert-base-uncased',config =self.bert_config)
self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',config =self.bert_config)
if not lstm_hidden_size:
self.lstm_hidden_size = self.bert.config.hidden_size
else:
self.lstm_hidden_size = lstm_hidden_size
self.n_class = n_class
self.dropout_rate = dropout_rate
self.lstm = nn.LSTM(self.bert.config.hidden_size, self.lstm_hidden_size, bidirectional=True)
self.hidden_to_softmax = nn.Linear(self.lstm_hidden_size * 2, n_class, bias=True)
self.dropout = nn.Dropout(p=self.dropout_rate)
self.device = device
def forward(self, sents):
sents_tensor, masks_tensor, sents_lengths = sents_to_tensor(self.tokenizer, sents, self.device)
encoded_layers = self.bert(input_ids=sents_tensor, attention_mask=masks_tensor)[2] #,output_all_encoded_layers=False) #output_hidden_states output_hidden_states=True
bert_hidden_layer = encoded_layers[12]
bert_hidden_layer = bert_hidden_layer.permute(1, 0, 2) # permute swaps dimensions: if tensor.shape = (3,4,5), tensor.permute(1,0,2).shape = (4,3,5); here (batch, seq_len, hidden) becomes (seq_len, batch, hidden), which nn.LSTM expects with batch_first=False
enc_hiddens, (last_hidden, last_cell) = self.lstm(pack_padded_sequence(bert_hidden_layer, sents_lengths, enforce_sorted=False)) #enforce_sorted=False #pack_padded_sequence(data and batch_sizes
output_hidden = torch.cat((last_hidden[0], last_hidden[1]), dim=1) # (batch_size, 2*hidden_size)
output_hidden = self.dropout(output_hidden)
pre_softmax = self.hidden_to_softmax(output_hidden)
return pre_softmax
def batch_iter(data, batch_size, shuffle=False, bert=None):
batch_num = math.ceil(data.shape[0] / batch_size)
index_array = list(range(data.shape[0]))
if shuffle:
data = data.sample(frac=1)
for i in range(batch_num):
indices = index_array[i * batch_size: (i + 1) * batch_size]
examples = data.iloc[indices]
sents = list(examples.train_BERT_tweet)  # `sents` is not defined anywhere in the posted code; the column name follows its usage in validation()/test()
targets = list(examples.train_label.values)
yield sents, targets # list[list[str]] if not bert else list[str], list[int]
def validation(model, df_val, loss_func, device):
was_training = model.training
model.eval()
train_BERT_tweet = list(df_val.train_BERT_tweet)
train_label = list(df_val.train_label)
val_batch_size = 16
n_batch = int(np.ceil(df_val.shape[0]/val_batch_size))
total_loss = 0.
with torch.no_grad():
for i in range(n_batch):
sents = train_BERT_tweet[i*val_batch_size: (i+1)*val_batch_size]
targets = torch.tensor(train_label[i*val_batch_size: (i+1)*val_batch_size],
dtype=torch.long, device=device)
batch_size = len(sents)
pre_softmax = model(sents)
batch_loss = loss_func(pre_softmax, targets)
total_loss += batch_loss.item()*batch_size
if was_training:
model.train()
return total_loss/df_val.shape[0]
def train():
label_name = ['Yes', 'Maybe', 'No']
if torch.cuda.is_available():
device = torch.device("cuda")
else:
device = torch.device("cpu")
start_time = time.time()
print('Importing data...', file=sys.stderr)
df_train = pd.read_csv('trainn.csv') #, index_col=0)
df_val = pd.read_csv('valn.csv') #, index_col=0)
train_label = dict(df_train.train_label.value_counts())
label_max = float(max(train_label.values()))
train_label_weight = torch.tensor([label_max/train_label[i] for i in range(len(train_label))], device=device)
print('Done! time elapsed %.2f sec' % (time.time() - start_time), file=sys.stderr)
print('-' * 80, file=sys.stderr)
start_time = time.time()
print('Set up model...', file=sys.stderr)
model = BERT_LSTM_Model(device=device, dropout_rate=0.2, n_class=len(label_name),lstm_hidden_size=768)
optimizer = AdamW(model.parameters(), lr=1e-3, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps=100, t_total=1000) #changed the last 2 arguments to old ones
model = model.to(device)
print('Use device: %s' % device, file=sys.stderr)
print('Done! time elapsed %.2f sec' % (time.time() - start_time), file=sys.stderr)
print('-' * 80, file=sys.stderr)
model.train()
cn_loss = torch.nn.CrossEntropyLoss(weight=train_label_weight, reduction='mean')
torch.save(cn_loss, 'loss_func3') # for later testing
train_batch_size =16
valid_niter = 500
log_every = 10
model_save_path = 'NonLinear_bert_uncased_model.bin'
num_trial = 0
train_iter = patience = cum_loss = report_loss = 0
cum_examples = report_examples = epoch = 0
hist_valid_scores = []
train_time = begin_time = time.time()
print('Begin Maximum Likelihood training...')
for epoch in range(20):
for sents, targets in batch_iter(df_train, batch_size=train_batch_size, shuffle=True): # for each epoch
train_iter += 1
optimizer.zero_grad()
batch_size = len(sents)
pre_softmax = model(sents)
loss = cn_loss(pre_softmax, torch.tensor(targets, dtype=torch.long, device=device))
loss.backward()
optimizer.step()
scheduler.step()
batch_losses_val = loss.item() * batch_size
report_loss += batch_losses_val
cum_loss += batch_losses_val
report_examples += batch_size
cum_examples += batch_size
if train_iter % log_every == 0:
print('epoch %d, iter %d, avg. loss %.2f, '
'cum. examples %d, speed %.2f examples/sec, '
'time elapsed %.2f sec' % (epoch, train_iter,
report_loss / report_examples,
cum_examples,
report_examples / (time.time() - train_time),
time.time() - begin_time), file=sys.stderr)
train_time = time.time()
report_loss = report_examples = 0.
#torch.save(model.state_dict(), 'LSTM_bert_uncased_model.bin')
# perform validation
if train_iter % valid_niter == 0:
print('epoch %d, iter %d, cum. loss %.2f, cum. examples %d' % (epoch, train_iter,
cum_loss / cum_examples,
cum_examples), file=sys.stderr)
cum_loss = cum_examples = 0.
print('begin validation ...', file=sys.stderr)
validation_loss = validation(model, df_val, cn_loss, device=device) # dev batch size can be a bit larger
print('validation: iter %d, loss %f' % (train_iter, validation_loss), file=sys.stderr)
is_better = len(hist_valid_scores) == 0 or validation_loss < min(hist_valid_scores)
hist_valid_scores.append(validation_loss)
if is_better:
patience = 0
print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
torch.save(model.state_dict(), 'LSTM_bert_uncased_model.bin')
# also save the optimizers' state
torch.save(optimizer.state_dict(), model_save_path + '.optim')
elif patience < 5:
patience += 1
print('hit patience %d' % patience, file=sys.stderr)
if patience == 20:
num_trial += 1
print('hit #%d trial' % num_trial, file=sys.stderr)
if num_trial == 3:
print('early stop!', file=sys.stderr)
exit(0)
# decay lr, and restore from previously best checkpoint
print('load previously best model and decay learning rate to %f%%' %
(0.1*100), file=sys.stderr)
# load model model.load_state_dict(torch.load('LSTM_bert_uncased_model.bin'))
model = model.to(device)
print('restore parameters of the optimizers', file=sys.stderr)
optimizer.load_state_dict(torch.load(model_save_path + '.optim'))
# set new lr
for param_group in optimizer.param_groups:
param_group['lr'] *= 0.5
# reset patience
patience = 0
if epoch == 100:
print('reached maximum number of epochs!', file=sys.stderr)
exit(0)
def test():
label_name = ['Yes', 'Maybe', 'No']
if torch.cuda.is_available():
device = torch.device("cuda")
else:
device = torch.device("cpu")
model = BERT_LSTM_Model(device=device, dropout_rate=0.3, n_class=len(label_name), lstm_hidden_size=768)
model.load_state_dict(torch.load('LSTM_bert_uncased_model.bin'))
model.to(device)
model.eval()
df_test = pd.read_csv('testn.csv')
test_batch_size = 16
n_batch = int(np.ceil(df_test.shape[0]/test_batch_size))
cn_loss = torch.load('loss_func3', map_location=lambda storage, loc: storage).to(device)
train_BERT_tweet = list(df_test.train_BERT_tweet)
train_label = list(df_test.train_label)
test_loss = 0.
prediction = []
prob = []
softmax = torch.nn.Softmax(dim=1)
with torch.no_grad():
for i in range(n_batch):
sents = train_BERT_tweet[i*test_batch_size: (i+1)*test_batch_size]
targets = torch.tensor(train_label[i * test_batch_size: (i + 1) * test_batch_size],
dtype=torch.long, device=device)
batch_size = len(sents)
pre_softmax = model(sents)
batch_loss = cn_loss(pre_softmax, targets)
test_loss += batch_loss.item()*batch_size
prob_batch = softmax(pre_softmax)
prob.append(prob_batch)
prediction.extend([t.item() for t in list(torch.argmax(prob_batch, dim=1))])
accuracy = accuracy_score(df_test.train_label.values, prediction)
matthews = matthews_corrcoef(df_test.train_label.values, prediction)
f1_macro = f1_score(df_test.train_label.values, prediction, average='macro')
print('accuracy: %.2f' % accuracy)
print('matthews coef: %.2f' % matthews)
print('f1_macro: %.2f' % f1_macro)
TrainingModel = train()
TestingModel = test()
The data can be accessed from https://github.com/Kosisochi/DataSnippet
I didn't know how else to create synthetic data.
Also, the training and validation loss remain quite high, with the lowest being around 0.93.
I also tried a CNN and the same issue remained. Is there something I'm overlooking? Thanks for your help.
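For context, a quick way to see whether the metrics are frozen because the model predicts a single class (a self-contained sketch with dummy arrays; in the real code the inputs would be prediction and df_test.train_label.values from test() above):
from collections import Counter
import numpy as np

y_true = np.random.randint(0, 3, size=100)   # stand-in for df_test.train_label.values
y_pred = np.zeros(100, dtype=int)            # a collapsed model predicts one class for everything
print(Counter(y_true.tolist()))
print(Counter(y_pred.tolist()))              # a single key here would explain unchanging accuracy/F1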
