Trying to train a basic model for this competition. This is a multi-class problem where the values are from 1 to 5 (with 0.5 size steps). The problem is that the training error is very bad and so is the inference.
Is my model correct for this problem? I didn't use warmup at all and I ran it only for a few epochs but I don't think it will make much difference.
Also, here's a result of an inference:
(0, tensor([[0.0024, 0.4513, 1.3045, 0.0350, 0.4182, 0.6813]]))
loss is 0?
predictions are too low (we should start from 1)
Clearly something is wrong with the model - What is it?
Here's my code (which is based on this article):
class LangKnolModel(pl.LightningModule):
def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
super().__init__()
self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
self.classifier = nn.Linear(self.bert.config.hidden_size, 6)
self.n_training_steps = n_training_steps
self.max_epochs=8,
self.n_warmup_steps = n_warmup_steps
self.criterion = nn.MSELoss()
def forward(self, input_ids, attention_mask, labels=None):
output = self.bert(input_ids, attention_mask=attention_mask)
output = self.classifier(output.pooler_output)
# output = torch.sigmoid(output)
loss = 0
if labels is not None:
loss = self.criterion(output, labels)
return loss, output
def training_step(self, batch, batch_idx):
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]
loss, outputs = self(input_ids, attention_mask, labels)
self.log("train_loss", loss, prog_bar=True, logger=True)
return {"loss": loss, "predictions": outputs, "labels": labels}
def validation_step(self, batch, batch_idx):
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]
loss, outputs = self(input_ids, attention_mask, labels)
return loss
def test_step(self, batch, batch_idx):
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]
loss, outputs = self(input_ids, attention_mask, labels)
self.log("test_loss", loss, prog_bar=True, logger=True)
return loss
def configure_optimizers(self):
optimizer = AdamW(self.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=self.n_warmup_steps,
num_training_steps=self.n_training_steps
)
return dict(
optimizer=optimizer,
lr_scheduler=dict(
scheduler=scheduler,
interval='step'
)
)
training
model = LangKnolModel(
n_classes=len(LABELS_COLS),
n_warmup_steps=1,
n_training_steps=16,
)
trainer = pl.Trainer(
gpus=1,
max_epochs=5
)
trainer.fit(model, data_module)
Related
I'm trying to use pytorch-lightning for token-classification model. I have already built a model for token classification without lightning. I'm confused on what changes should be done with existing code to integrate pytorch-lightning.
Following is my pytorch code:
model = BertForTokenClassification.from_pretrained(
'bert-large-cased',
num_labels=len(tag2idx),
output_attentions = False,
output_hidden_states = False
)
for _ in trange(epochs, desc="Epoch"):
# ========================================
# Training
# ========================================
model.train()
total_loss = 0
for step, batch in enumerate(train_dataloader):
batch = tuple(t.to(device) for t in batch)
b_input_ids, b_input_mask, b_labels = batch
model.zero_grad()
outputs = model(b_input_ids, token_type_ids=None,
attention_mask=b_input_mask, labels=b_labels)
loss = outputs[0]
loss.backward()
total_loss += loss.item()
torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
optimizer.step()
scheduler.step()
avg_train_loss = total_loss / len(train_dataloader)
loss_values.append(avg_train_loss)
# ========================================
# Validation
# ========================================
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
predictions , true_labels = [], []
for batch in valid_dataloader:
batch = tuple(t.to(device) for t in batch)
b_input_ids, b_input_mask, b_labels = batch
with torch.no_grad():
outputs = model(b_input_ids, token_type_ids=None,
attention_mask=b_input_mask, labels=b_labels)
logits = outputs[1].detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
eval_loss += outputs[0].mean().item()
predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
true_labels.extend(label_ids)
eval_loss = eval_loss / len(valid_dataloader)
validation_loss_values.append(eval_loss)
pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
valid_tags = [tag_values[l_i] for l in true_labels
for l_i in l if tag_values[l_i] != "PAD"]
f1 = f1_score([valid_tags], [pred_tags])
Following is the code which I tried for pytorch lightning.
class LightningModule(pl.LightningModule):
def __init__(self, lr, lr_backbone, weight_decay, batch_size):
super().__init__()
self.model = BertForTokenClassification.from_pretrained("bert-large-cased",
num_labels=len(tag2idx),
output_attentions = False,
output_hidden_states = False)
self.lr = lr
self.lr_backbone = lr_backbone
self.weight_decay = weight_decay
self.batch_size = batch_size
def forward(self, input_ids, attention_mask, labels):
outputs = self.model(
input_ids, token_type_ids=None, attention_mask=attention_mask, labels=labels
)
loss = outputs[0]
logits = outputs[1]
return loss, logits
def training_step(self, batch, batch_idx):
b_input_ids, b_input_mask, b_labels = batch
outputs = self.model(b_input_ids, token_type_ids=None,
attention_mask=b_input_mask, labels=b_labels)
loss = outputs[0]
self.log("train_loss", loss)
return loss
def validation_step(self, batch, batch_idx):
b_input_ids, b_input_mask, b_labels = batch
outputs = self.model(b_input_ids, token_type_ids=None,
attention_mask=b_input_mask, labels=b_labels)
eval_loss = outputs[0]
self.log("val_loss", eval_loss)
return eval_loss
def validation_end(self, outputs):
eval_loss = np.mean([x["val_loss"] for x in outputs])
self.log("val_loss", eval_loss)
pred_tags = [tag_values[p_i] for p, l in zip(self.predictions, self.true_labels)
for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
valid_tags = [tag_values[l_i] for l in self.true_labels
for l_i in l if tag_values[l_i] != "PAD"]
f1 = f1_score([valid_tags], [pred_tags])
self.log("val_f1", f1)
def configure_optimizers(self):
# optimizer = torch.optim.NAdam(optimizer_grouped_parameters,lr=4e-6,eps=1e-8)
# scheduler = scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps )
return torch.optim.NAdam(optimizer_grouped_parameters,lr=4e-6,eps=1e-8)
def train_dataloader(self):
return train_dataloader # return your dataloader
def val_dataloader(self):
return valid_dataloader # return your validation dataloader
model = LightningModule(lr=1e-6, lr_backbone=1e-5, weight_decay=1e-4, batch_size=32)
trainer = pl.Trainer(accelerator='gpu', gradient_clip_val=0.1, max_epochs=epochs, auto_scale_batch_size=None, default_root_dir="lightning_output/", enable_checkpointing=False)
trainer.fit(model)
But, when I run inference, I get the following error.
TypeError: forward() missing 2 required positional arguments: 'attention_mask' and 'labels'
I'm getting CUDA error: device-side assert triggered error when finetuning LayoutLMv3ForSequenceClassification model using images having two class labels.
I have a total 1734 for training image samples. Following is the major part of my code
feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False, ocr_lang="eng")
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
processor = LayoutLMv3Processor(feature_extractor,tokenizer)
class DocumentClassificationDataset(Dataset):
def __init__(self, image_paths, processor):
self.image_paths = image_paths
self.processor = processor
def __len__(self):
return len(self.image_paths)
def __getitem__(self, item):
image_path = self.image_paths[item]
image = Image.open(image_path).convert("RGB")
width, height = image.size
json_path = image_path.with_suffix(".json")
with open(json_path, "r") as f:
ocr_result = json.load(f)
width_scale = 1000/width
height_scale = 1000/height
words = []
boxes = []
for row in ocr_result:
boxes.append(scale_bounding_box(row["bounding_box"], width_scale, height_scale))
words.append(row["word"])
encoding = processor(
image,
words,
boxes=boxes,
max_length=512,
padding="max_length",
truncation=True,
return_tensors="pt"
)
label = DOCUMENT_CLASSES.index(image_path.parent.name)
return dict(
input_ids = encoding["input_ids"].flatten(),
attention_mask = encoding["attention_mask"].flatten(),
bbox = encoding["bbox"].flatten(end_dim=1),
pixel_values = encoding["pixel_values"].flatten(end_dim=1),
labels = torch.tensor(label, dtype=torch.long)
)
train_dataset = DocumentClassificationDataset(train_images, processor)
test_dataset = DocumentClassificationDataset(test_images, processor)
train_data_loader = DataLoader(
train_dataset,
batch_size=4,
shuffle=True,
num_workers=2
)
test_data_loader = DataLoader(
test_dataset,
batch_size=4,
shuffle=False,
num_workers=2
)
class ModelModule(pl.LightningModule):
def __init__(self, n_classes: int):
super().__init__()
self.model =
LayoutLMv3ForSequenceClassification.from_pretrained(
"microsoft/layoutlmv3-base",
num_labels = n_classes
)
self.train_accuracy = Accuracy(task="multiclass",
num_classes=n_classes)
self.val_accuracy = Accuracy(task="multiclass",
num_classes=n_classes)
def forward(self, input_ids, attention_mask, bbox,
pixel_values, labels=None):
return self.model(
input_ids,
attention_mask=attention_mask,
bbox=bbox,
pixel_values=pixel_values,
labels=labels
)
def training_step(self, batch, batch_idx):
labels = batch["labels"]
outputs = self(
batch["input_ids"],
batch["attention_mask"],
batch["bbox"],
batch["pixel_values"],
labels
)
loss = outputs.loss
self.log("train_loss", loss)
self.train_accuracy(outputs.logits, labels)
self.log("train_acc", self.train_accuracy, on_step=True, on_epoch=True)
return loss
def validation_step(self, batch, batch_idx):
labels = batch["labels"]
outputs = self(
batch["input_ids"],
batch["attention_mask"],
batch["bbox"],
batch["pixel_values"],
labels
)
loss = outputs.loss
self.log("val_loss", loss)
self.val_accuracy(outputs.logits, labels)
self.log("val_acc", self.val_accuracy, on_step=False, on_epoch=True)
return loss
def configure_optimizers(self):
return torch.optim.Adam(self.model.parameters(), lr=0.00001)
model_checkpoint = ModelCheckpoint(
filename="{epcoh}-{step}-{val_loss:.4f}",
save_last=True,
save_top_k=3,
monitor="val_loss",
mode="min"
)
trainer = pl.Trainer(
accelerator="gpu",
precision=16,
devices=1,
max_epochs=4,
callbacks=[
model_checkpoint
]
)
trainer.fit(model_module, train_data_loader, test_data_loader)
Model is getting trained when I tried with lesser number of samples (200). I'm getting this error only When the number of samples increases.
When I searched, most of the time this issue occurs because of the label mismatch, but that will not be the issue in this case.
I am training a multi-label text classifier using PyTorch with Roberta. However after 2nd epoch ram fills and kernel crashes I checked and ram is not freed after every epoch. I have 64GB RAM, 8 CPU cores
What can be the problem?
Here is the my PyTorch implementation:
class ReaderTextDataset(Dataset):
def __init__(self,
data: pd.DataFrame,
tokenizer: RobertaTokenizer,
max_token_len: int = 512):
self.tokenizer = tokenizer
self.data = data
self.max_token_len = max_token_len
def __len__(self):
return len(self.data)
def __getitem__(self, index: int):
data_row = self.data.iloc[index]
readerText = data_row.readerText
labels = data_row[LABEL_COLUMNS]
encoding = self.tokenizer.encode_plus(
readerText,
add_special_tokens=True,
max_length=self.max_token_len,
return_token_type_ids=False,
padding="max_length",
truncation=True,
return_attention_mask=True,
return_tensors='pt',
)
return dict(readerText=readerText,
input_ids=encoding["input_ids"].flatten(),
attention_mask=encoding["attention_mask"].flatten(),
labels=torch.FloatTensor(labels))
train_dataset = ReaderTextDataset(train_df, tokenizer, max_token_len=512)
roberta_model = RobertaModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
NUM_WORKERS = 6
class ReaderTextDataModule(pl.LightningDataModule):
def __init__(self,
train_df,
test_df,
tokenizer,
batch_size=8,
max_token_len=512):
super().__init__()
self.batch_size = batch_size
self.train_df = train_df
self.test_df = test_df
self.tokenizer = tokenizer
self.max_token_len = max_token_len
def setup(self, stage=None):
self.train_dataset = ReaderTextDataset(self.train_df, self.tokenizer,
self.max_token_len)
self.test_dataset = ReaderTextDataset(self.test_df, self.tokenizer,
self.max_token_len)
def train_dataloader(self):
return DataLoader(self.train_dataset,
batch_size=self.batch_size,
shuffle=True,
num_workers=NUM_WORKERS)
def val_dataloader(self):
return DataLoader(self.test_dataset,
batch_size=self.batch_size,
num_workers=NUM_WORKERS)
def test_dataloader(self):
return DataLoader(self.test_dataset,
batch_size=self.batch_size,
num_workers=NUM_WORKERS)
class ReaderTextTagger(pl.LightningModule):
def __init__(self,
n_classes: int,
n_training_steps=None,
n_warmup_steps=None):
super().__init__()
self.bert = RobertaModel.from_pretrained(BERT_MODEL_NAME,
return_dict=True)
self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
self.n_training_steps = n_training_steps
self.n_warmup_steps = n_warmup_steps
self.criterion = nn.BCELoss()
def forward(self, input_ids, attention_mask, labels=None):
output = self.bert(input_ids, attention_mask=attention_mask)
output = self.classifier(output.pooler_output)
output = torch.sigmoid(output)
loss = 0
if labels is not None:
loss = self.criterion(output, labels)
return loss, output
def training_step(self, batch, batch_idx):
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]
loss, outputs = self(input_ids, attention_mask, labels)
self.log("train_loss", loss, prog_bar=True, logger=True)
return {"loss": loss, "predictions": outputs, "labels": labels}
def validation_step(self, batch, batch_idx):
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]
loss, outputs = self(input_ids, attention_mask, labels)
self.log("val_loss", loss, prog_bar=True, logger=True)
return loss
def test_step(self, batch, batch_idx):
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]
loss, outputs = self(input_ids, attention_mask, labels)
self.log("test_loss", loss, prog_bar=True, logger=True)
return loss
def training_epoch_end(self, outputs):
labels = []
predictions = []
for output in outputs:
for out_labels in output["labels"].detach().cpu():
labels.append(out_labels)
for out_predictions in output["predictions"].detach().cpu():
predictions.append(out_predictions)
labels = torch.stack(labels).int()
predictions = torch.stack(predictions)
for i, name in enumerate(LABEL_COLUMNS):
class_roc_auc = auroc(predictions[:, i], labels[:, i])
self.logger.experiment.add_scalar(f"{name}_roc_auc/Train",
class_roc_auc,
self.current_epoch)
def configure_optimizers(self):
optimizer = AdamW(self.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=self.n_warmup_steps,
num_training_steps=self.n_training_steps)
return dict(optimizer=optimizer,
lr_scheduler=dict(scheduler=scheduler, interval='step'))
enter code here
You return output, which stores in the GPU, in every train_step. Try to move it to cpu if you really need to store output by
return {"loss": loss, "predictions": outputs.cpu(), "labels": labels}
I am really new to Machine Learning and I am not so well versed in coding in general. However there is need to look through the customers feedback at our store, that average quite a lot each year, yet we cannot tell % of positive, negative and neutral.
Currently I am trying to train a Bert Model to do simple multi labeled sentiment analysis. The input is our store's customers feedback. The customers feedback is not always so clearly defined since customers do tend to tell long and long about their experience and their sentiment is not always so clear. However we managed to get positive, negative and neutral, each set 2247 samples.
But when I try to train it the training accuracy is around 0.4% which is super low. Validation score is around 60%. F1-score is around 60% for each of the label. I wonder what can be done to improve this training accuracy. I have been stuck for a while. Please take a look at my codes and help me out with this.
I have tried changing learning rate (tried all learning rate Bert suggested and 1e-5),changing Max LEN, changing amount of EPOCH, changing drop out rate (0.1, 0.2, 0.3, 0.4, 0.5), but so far nothing yielded results.
#read dataset
df = pd.read_csv("data.csv",header=None, names=['content', 'sentiment'], sep='\;', lineterminator='\r',encoding = "ISO-8859-1",engine="python")
from sklearn.utils import shuffle
df = shuffle(df)
df['sentiment'] = df['sentiment'].replace(to_replace = [-1, 0, 1], value = [0, 1, 2])
df.head()
#Load pretrained FinBert model and get bert tokenizer from it
PRE_TRAINED_MODEL_NAME = 'TurkuNLP/bert-base-finnish-cased-v1'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
#Choose sequence Length
token_lens = []
for txt in df.content:
tokens = tokenizer.encode(txt, max_length=512)
token_lens.append(len(tokens))
sns.distplot(token_lens)
plt.xlim([0, 256]);
plt.xlabel('Token count');
MAX_LEN = 260
#Make a PyTorch dataset
class FIDataset(Dataset):
def __init__(self, texts, targets, tokenizer, max_len):
self.texts = texts
self.targets = targets
self.tokenizer = tokenizer
self.max_len = max_len
def __len__(self):
return len(self.texts)
def __getitem__(self, item):
text = str(self.texts[item])
target = self.targets[item]
encoding = self.tokenizer.encode_plus(
text,
add_special_tokens=True,
max_length=self.max_len,
return_token_type_ids=False,
pad_to_max_length=True,
return_attention_mask=True,
return_tensors='pt',
)
return {
'text': text,
'input_ids': encoding['input_ids'].flatten(),
'attention_mask': encoding['attention_mask'].flatten(),
'targets': torch.tensor(target, dtype=torch.long)
}
#split test and train
df_train, df_test = train_test_split(
df,
test_size=0.1,
random_state=RANDOM_SEED
)
df_val, df_test = train_test_split(
df_test,
test_size=0.5,
random_state=RANDOM_SEED
)
df_train.shape, df_val.shape, df_test.shape
#data loader function
def create_data_loader(df, tokenizer, max_len, batch_size):
ds = FIDataset(
texts=df.content.to_numpy(),
targets=df.sentiment.to_numpy(),
tokenizer=tokenizer,
max_len=max_len
)
return DataLoader(
ds,
batch_size=batch_size,
num_workers=4
)
#Load data into train, test, val
BATCH_SIZE = 16
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):
def __init__(self, n_classes):
super(SentimentClassifier, self).__init__()
self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
self.drop = nn.Dropout(p=0.1)
self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
def forward(self, input_ids, attention_mask):
returned = self.bert(
input_ids=input_ids,
attention_mask=attention_mask
)
pooled_output = returned["pooler_output"]
output = self.drop(pooled_output)
return self.out(output)
#Create a Classifier instance and move to GPU
model = SentimentClassifier(3)
model = model.to(device)
#Optimize with AdamW
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr= 2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0,
num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)
#Train each Epoch function
def train_epoch(
model,
data_loader,
loss_fn,
optimizer,
device,
scheduler,
n_examples
):
model = model.train()
losses = []
correct_predictions = 0
for d in data_loader:
input_ids = d["input_ids"].to(device)
attention_mask = d["attention_mask"].to(device)
targets = d["targets"].to(device)
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask
)
_, preds = torch.max(outputs, dim=1)
loss = loss_fn(outputs, targets)
correct_predictions += torch.sum(preds == targets)
losses.append(loss.item())
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
scheduler.step()
optimizer.zero_grad()
return correct_predictions.double() / n_examples, np.mean(losses)
#Eval model function
def eval_model(model, data_loader, loss_fn, device, n_examples):
model = model.eval()
losses = []
correct_predictions = 0
with torch.no_grad():
torch.cuda.empty_cache()
for d in data_loader:
input_ids = d["input_ids"].to(device)
attention_mask = d["attention_mask"].to(device)
targets = d["targets"].to(device)
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask
)
_, preds = torch.max(outputs, dim=1)
loss = loss_fn(outputs, targets)
correct_predictions += torch.sum(preds == targets)
losses.append(loss.item())
return correct_predictions.double() / n_examples, np.mean(losses)
#training loop through each epochs
import torch
torch.cuda.empty_cache()
history = defaultdict(list)
best_accuracy = 0
if __name__ == '__main__':
for epoch in range(EPOCHS):
print(f'Epoch {epoch + 1}/{EPOCHS}')
print('-' * 10)
train_acc, train_loss = train_epoch(
model,
train_data_loader,
loss_fn,
optimizer,
device,
scheduler,
len(df_train)
)
print(f'Train loss {train_loss} accuracy {train_acc}')
val_acc, val_loss = eval_model(
model,
val_data_loader,
loss_fn,
device,
len(df_val)
)
print(f'Val loss {val_loss} accuracy {val_acc}')
print()
history['train_acc'].append(train_acc)
history['train_loss'].append(train_loss)
history['val_acc'].append(val_acc)
history['val_loss'].append(val_loss)
if val_acc > best_accuracy:
torch.save(model.state_dict(), 'best_model_state.bin')
best_accuracy = val_acc
-- Edit: I have printed out preds and targets as well as train and val accuracy
Here _, preds = torch.max(outputs, dim=1), you probably want argmax, not max?
Print out preds and targets to better see what's going on.
Edit after preds and targets printed out. For epochs 4 and 5, preds matches targets exactly, so train accuracy should be 1. I think the issue is that the accuracy is divided by n_examples, which is a number of examples in the whole train dataset, while it should be divided by the number of examples in the epoch.
I’m trying to solve a VQA classification problem. my training loss is not changing at all while training the model.
I put in comment the CNN model and try to run it with the text only, but still, no change in loss value.
I pass through those models:
class question_lstm(nn.Module):
def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout, output_dim, que_size):
super(question_lstm, self).__init__()
self.hid_dim = hid_dim
self.n_layers = n_layers
self.embedding = nn.Embedding(input_dim, emb_dim)
self.tanh = nn.Tanh()
self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
self.dropout = nn.Dropout(dropout)
#self.fc1=nn.Linear(n_layers*hid_dim,que_size)
self.fc1=nn.Linear(n_layers*output_dim,que_size)
def forward(self, question):
emb_question=self.embedding(question) #(batchsize, input_dim, emb_dim=256)
emb_question=self.dropout(emb_question)
emb_question=self.tanh(emb_question)
emb_question = emb_question.transpose(0, 1) #(input_dim, batchsize, emb_dim)
output, (hidden, cell) = self.lstm(emb_question)
qu_feature = torch.cat((hidden, cell), dim=2)
qu_feature = qu_feature.transpose(0, 1) #(batchsize=100, num_layer=2, hid_dim=2048)
question_output =self.fc1(qu_feature)
return question_output
class vqamodel(nn.Module):
def __init__(self, output_dim,input_dim, emb_dim, hid_dim, n_layers, dropout, answer_len, que_size,):
super(vqamodel,self).__init__()
#self.image=img_CNN(img_size,image_feature)
self.question=question_lstm(input_dim, emb_dim, hid_dim, n_layers, dropout,output_dim,que_size)
self.tanh=nn.Tanh()
self.relu=nn.ReLU()
self.dropout=nn.Dropout(dropout)
self.fc1=nn.Linear(que_size,output_dim)
self.fc2=nn.Linear(output_dim,answer_len)
def forward(self, image, question):
question_emb=self.question(question)
combine =question_emb #*img_emb
out_feature=self.fc1(combine) #(batchsize=100, output_dim=2048)
out_feature=self.relu(out_feature)
out_feature=self.dropout(out_feature)
out_feature=self.fc2(out_feature) #(batchsize=100, answer_len=1000)
return (out_feature)
I’m using cross entropy loss and Adam:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vqa_model.parameters(),lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
any idea what can cause this constant loss value?
the train loop:
def train(model,criterion,optimizer,scheduler):
start_time = time.time() #the time we start the train
for epoch in range(num_epochs):
train_loss = 0
#test_loss = 0
train_correct = 0
#test_correct = 0
vqa_model.train()
for i,sample in enumerate(train_VQAdataset_loader):
#image = sample['image'].to(device=device)
question = sample['question'].to(torch.int64).to(device=device)
label = sample['answer'].to(device=device)
output = vqa_model(image, question) # forward
loss = criterion(output, label)
optimizer.zero_grad() # Zero the gradients
loss.backward() # backprop
optimizer.step() # Update weights
scheduler.step()
# Statitcs
train_loss += loss.item() # save the loss for the entire epoch
_, predictions = torch.max(output, 1)
train_correct += (predictions == label).sum() #number of success - cumulative
train_losses.append(train_loss / len(train_VQAdataset_loader))