How to use pipeline for Custom token-classification model - python-3.x

Model description
I added a simple custom pytorch-crf layer on top of a TokenClassification model to make the model more robust.
I trained the model successfully, but when I test it, the saved folder doesn't contain a config.json file, so the pipeline function fails with:
Error: AttributeError: 'BERT_CRF' object has no attribute 'config'
CODE
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchcrf import CRF

log_soft = F.log_softmax  # alias used inside forward()

class BERT_CRF(nn.Module):
    def __init__(self, bert_model, num_labels):
        super(BERT_CRF, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.25)
        self.classifier = nn.Linear(768, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # average of the last four hidden states
        sequence_output = torch.stack((outputs[1][-1], outputs[1][-2], outputs[1][-3], outputs[1][-4])).mean(dim=0)
        sequence_output = self.dropout(sequence_output)
        emission = self.classifier(sequence_output)  # [32, 256, 17]
        if labels is not None:
            labels = labels.reshape(attention_mask.size()[0], attention_mask.size()[1])
            loss = -self.crf(log_soft(emission, 2), labels, mask=attention_mask.type(torch.uint8), reduction='mean')
            prediction = self.crf.decode(emission, mask=attention_mask.type(torch.uint8))
            return [loss, prediction]
        else:
            prediction = self.crf.decode(emission, mask=attention_mask.type(torch.uint8))
            return prediction
tokenizer = AutoTokenizer.from_pretrained("fine-tuned_model", model_max_length=256)
bert_model = BertForTokenClassification.from_pretrained('spanbert_base', id2label=id2label, label2id=label2id)
bert_model.config.output_hidden_states = True
model = BERT_CRF(bert_model, num_labels=21)
model.load_state_dict(torch.load("fine-tuned_model/pytorch_model.bin"))
model.eval()

token_classifier = pipeline("token-classification", model=model, aggregation_strategy="max", tokenizer=tokenizer, grouped_entities=True)
AttributeError: 'BERT_CRF' object has no attribute 'config'
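A minimal sketch of one workaround (not a verified fix): pipeline() expects a model that behaves like a transformers PreTrainedModel, and in particular it reads model.config when building the pipeline. Since BERT_CRF is a plain nn.Module, it has no config attribute and nothing writes a config.json when the weights are saved. Attaching the inner BERT config to the wrapper is sometimes enough to get past this specific AttributeError; note, however, that the token-classification pipeline's post-processing still assumes logits rather than CRF decode output, so full compatibility may require a custom pipeline class. Everything below reuses the names defined above.

# Attach the inner model's config to the wrapper so pipeline() can find it.
model.config = bert_model.config

# Alternatively, subclass BertPreTrainedModel instead of nn.Module so that
# save_pretrained()/from_pretrained() also write and read config.json.
from transformers import BertPreTrainedModel, BertModel

class BertCrfForTokenClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.25)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.crf = CRF(config.num_labels, batch_first=True)
        self.post_init()  # standard PreTrainedModel weight-init hook (recent transformers versions)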

Related

CUDA error: device-side assert triggered when training sample size is increased

I'm getting a CUDA error: device-side assert triggered error when fine-tuning a LayoutLMv3ForSequenceClassification model on images with two class labels.
I have a total of 1734 training image samples. The following is the major part of my code:
feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False, ocr_lang="eng")
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
processor = LayoutLMv3Processor(feature_extractor, tokenizer)
class DocumentClassificationDataset(Dataset):
    def __init__(self, image_paths, processor):
        self.image_paths = image_paths
        self.processor = processor

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, item):
        image_path = self.image_paths[item]
        image = Image.open(image_path).convert("RGB")
        width, height = image.size
        json_path = image_path.with_suffix(".json")
        with open(json_path, "r") as f:
            ocr_result = json.load(f)
        width_scale = 1000 / width
        height_scale = 1000 / height
        words = []
        boxes = []
        for row in ocr_result:
            boxes.append(scale_bounding_box(row["bounding_box"], width_scale, height_scale))
            words.append(row["word"])
        encoding = processor(
            image,
            words,
            boxes=boxes,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        label = DOCUMENT_CLASSES.index(image_path.parent.name)
        return dict(
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            bbox=encoding["bbox"].flatten(end_dim=1),
            pixel_values=encoding["pixel_values"].flatten(end_dim=1),
            labels=torch.tensor(label, dtype=torch.long)
        )
train_dataset = DocumentClassificationDataset(train_images, processor)
test_dataset = DocumentClassificationDataset(test_images, processor)

train_data_loader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=2
)
test_data_loader = DataLoader(
    test_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=2
)
class ModelModule(pl.LightningModule):
    def __init__(self, n_classes: int):
        super().__init__()
        self.model = LayoutLMv3ForSequenceClassification.from_pretrained(
            "microsoft/layoutlmv3-base",
            num_labels=n_classes
        )
        self.train_accuracy = Accuracy(task="multiclass", num_classes=n_classes)
        self.val_accuracy = Accuracy(task="multiclass", num_classes=n_classes)

    def forward(self, input_ids, attention_mask, bbox, pixel_values, labels=None):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            bbox=bbox,
            pixel_values=pixel_values,
            labels=labels
        )

    def training_step(self, batch, batch_idx):
        labels = batch["labels"]
        outputs = self(
            batch["input_ids"],
            batch["attention_mask"],
            batch["bbox"],
            batch["pixel_values"],
            labels
        )
        loss = outputs.loss
        self.log("train_loss", loss)
        self.train_accuracy(outputs.logits, labels)
        self.log("train_acc", self.train_accuracy, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        labels = batch["labels"]
        outputs = self(
            batch["input_ids"],
            batch["attention_mask"],
            batch["bbox"],
            batch["pixel_values"],
            labels
        )
        loss = outputs.loss
        self.log("val_loss", loss)
        self.val_accuracy(outputs.logits, labels)
        self.log("val_acc", self.val_accuracy, on_step=False, on_epoch=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.model.parameters(), lr=0.00001)
model_checkpoint = ModelCheckpoint(
    filename="{epoch}-{step}-{val_loss:.4f}",
    save_last=True,
    save_top_k=3,
    monitor="val_loss",
    mode="min"
)
trainer = pl.Trainer(
    accelerator="gpu",
    precision=16,
    devices=1,
    max_epochs=4,
    callbacks=[model_checkpoint]
)
trainer.fit(model_module, train_data_loader, test_data_loader)
The model trains fine when I try a smaller number of samples (200); I get this error only when the number of samples increases.
From what I found when searching, this issue usually occurs because of a label mismatch, but that shouldn't be the problem in this case.
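A device-side assert usually hides the real failure point. A quick check worth trying, sketched under the assumptions of the code above (DOCUMENT_CLASSES and train_dataset as defined there): re-run with CUDA_LAUNCH_BLOCKING=1 to surface the failing op, and verify that every label index is within [0, num_labels) and that every scaled bounding box stays inside [0, 1000], since out-of-range boxes index LayoutLMv3's position embeddings past their end and trigger exactly this assert once the offending sample appears in the larger dataset.

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # set before any CUDA work so the assert reports the real op

# Sanity-check everything the Dataset produces.
num_classes = len(DOCUMENT_CLASSES)   # assumed to match num_labels passed to the model
for i in range(len(train_dataset)):
    item = train_dataset[i]
    label = item["labels"].item()
    assert 0 <= label < num_classes, f"sample {i} has out-of-range label {label}"
    # bbox values must stay in [0, 1000] for LayoutLMv3; anything outside that
    # range indexes the position-embedding table out of bounds on the GPU
    assert item["bbox"].min() >= 0 and item["bbox"].max() <= 1000, f"sample {i} has bbox outside [0, 1000]"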

How to add simple custom pytorch-crf layer on top of TokenClassification model using pytorch and Trainer

I followed this link, but it's implemented in Keras:
Cannot add CRF layer on top of BERT in keras for NER
Model description
Is it possible to add a simple custom pytorch-crf layer on top of a TokenClassification model? It would make the model more robust.
from torchcrf import CRF

model_checkpoint = "dslim/bert-base-NER"
tokenizer = BertTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
config = BertConfig.from_pretrained(model_checkpoint, output_hidden_states=True)
bert_model = BertForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

class BERT_CRF(nn.Module):
    def __init__(self, bert_model, num_labels):
        super(BERT_CRF, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.25)
        self.classifier = nn.Linear(4 * 768, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = torch.cat((outputs[1][-1], outputs[1][-2], outputs[1][-3], outputs[1][-4]), -1)  # <-- line that raises the error
        sequence_output = self.dropout(sequence_output)
        emission = self.classifier(sequence_output)  # [32, 256, 17]
        if labels is not None:
            labels = labels.reshape(attention_mask.size()[0], attention_mask.size()[1])
            loss = -self.crf(log_soft(emission, 2), labels, mask=attention_mask.type(torch.uint8), reduction='mean')
            prediction = self.crf.decode(emission, mask=attention_mask.type(torch.uint8))
            return [loss, prediction]
        else:
            prediction = self.crf.decode(emission, mask=attention_mask.type(torch.uint8))
            return prediction
args = TrainingArguments(
    "spanbert_crf_ner-pos2",
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    # per_device_eval_batch_size=32,
    fp16=True,
    # bf16=True  # Ampere GPU
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    # eval_dataset=train_data,
    # data_collator=data_collator,
    # compute_metrics=compute_metrics,
    tokenizer=tokenizer)
I get the error on the line sequence_output = torch.cat((outputs[1][-1], outputs[1][-2], outputs[1][-3], outputs[1][-4]), -1), marked above.
Since outputs = self.bert(input_ids, attention_mask=attention_mask) only gives the logits for token classification, how can I get the hidden states so that I can concatenate the last 4 hidden states and do outputs[1][-1]?
Or is there an easier way to implement a BERT-CRF model?
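For reference, a minimal sketch of the usual way to expose the hidden states (assuming the same model_checkpoint and config as above): either build the model from the config that already sets output_hidden_states=True, or request the hidden states per forward call; they then come back as a tuple of per-layer tensors accessible by name instead of by position.

# Option 1: build the model from the config that already has output_hidden_states=True
bert_model = BertForTokenClassification.from_pretrained(
    model_checkpoint,
    config=config,               # config was created with output_hidden_states=True
    ignore_mismatched_sizes=True,
)

# Option 2: request hidden states per forward call and index them by name
outputs = bert_model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
hidden_states = outputs.hidden_states                      # tuple: (embeddings, layer 1, ..., layer 12)
sequence_output = torch.cat(hidden_states[-4:], dim=-1)    # concatenate the last 4 layers -> [batch, seq, 4*768]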

token_type_ids error in transformers.BertForTokenClassification (HuggingFace)

I'm facing an error with token_type_ids in my training function that uses BertForTokenClassification in HuggingFace.
class BERTClassification(nn.Module):
    def __init__(self):
        super(BERTClassification, self).__init__()
        self.encoder = BertForTokenClassification.from_pretrained(config.pretrainDIR, local_files_only=True, num_labels=2)

    def forward(self, input_ids, labels, token_type_ids=None, attention_mask=None):
        output = self.encoder(input_ids=input_ids, labels=labels, token_type_ids=token_type_ids, attention_mask=attention_mask)
        return output
class Trainer():
    def __init__(self, model, data_loader, optimizer, device, scheduler):
        self.model = model
        self.data_loader = data_loader
        self.optimizer = optimizer
        self.device = device
        self.scheduler = scheduler

    def train_func(self):
        self.model.to(self.device)
        self.model.train()
        for bi, d in tqdm(enumerate(self.data_loader), total=len(self.data_loader)):
            ids = d["input_ids"]
            targets = d["labels"]
            type_ids = d['type_ids']
            attention_mask = d['attention_mask']
            ids = ids.to(self.device)
            targets = targets.to(self.device)
            type_ids = type_ids.to(self.device)
            attention_mask = attention_mask.to(self.device)
            self.optimizer.zero_grad()
            output = self.model(
                input_ids=ids,
                labels=targets,
                token_type_ids=type_ids,
                attention_mask=attention_mask
            )
where self.model is an instance of the pretrained BERTClassification module above.
I get an error only when I pass token_type_ids to self.model; otherwise, the code runs just fine.
The error message I get on CUDA is:
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`
The error message I get on CPU is:
index out of range in self
I'm confused because the shape of token_type_ids is identical to the shapes of ids, targets, and attention_mask.
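The CPU message (index out of range in self) points at an embedding lookup rather than a shape mismatch: BERT's token type embedding table has only config.type_vocab_size entries (2 for standard BERT), so any value in token_type_ids other than 0 or 1 indexes past the end of that table. A quick check, sketched as a hypothetical snippet inside train_func() under the assumption that d['type_ids'] is the tensor being passed in:

# Hypothetical check: every value in token_type_ids must be < type_vocab_size.
type_vocab_size = self.model.encoder.config.type_vocab_size   # 2 for standard BERT
print("max token_type_id:", type_ids.max().item())
assert type_ids.min() >= 0 and type_ids.max() < type_vocab_size, \
    "token_type_ids contain values outside the embedding table; " \
    "they were probably built from something other than segment ids (0/1)"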

Different training result obtained from training simple LSTM in Keras and Pytorch

I'm trying to port my LSTM model from Keras to PyTorch, but the results in PyTorch look really bad at the moment. The network is really simple, as shown below.
model = Sequential()
model.add(LSTM(10, input_length=shape[1], input_dim=shape[2]))
# output shape: (1, 1)
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(1,activation="linear"))
model.compile(loss="mse", optimizer="adam")
model.summary()
And I migrate it to the Pytorch framework,
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, bilstm=False):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.isBi = bilstm
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=bilstm).double()
        # for name, param in self.lstm.named_parameters():
        #     if name.startswith("weight"):
        #         nn.init.orthogonal_(param)
        #     else:
        #         pass
        self.fc1 = nn.Sequential(nn.Linear(hidden_dim, 10).double(), nn.Tanh())
        self.final_layer1 = nn.Sequential(nn.Linear(10, 10).double(), nn.Tanh())
        self.final_layer2 = nn.Sequential(nn.Linear(10, 10).double(), nn.Tanh())
        self.final_layer3 = nn.Sequential(nn.Linear(10, 10).double(), nn.Tanh())
        self.final_layer4 = nn.Sequential(nn.Linear(10, output_dim).double())

    def forward(self, x):
        out, (hn, cn) = self.lstm(x)
        out = out[:, -1, :]   # last time step
        out = self.fc1(out)
        out = self.final_layer1(out)
        out = self.final_layer2(out)
        out = self.final_layer3(out)
        out = self.final_layer4(out)
        return out
The result is really bad. I was wondering whether the initialization methods/activation functions used in Keras are different from the ones I used in PyTorch (Keras seems to use hard_sigmoid for the recurrent activation where PyTorch uses sigmoid?).
Would really appreciate it if somebody could help me with this problem!
UPDATED
My training code in Pytorch.
criterion = nn.MSELoss()
model = LSTM(input_dim, hidden_dim, num_layers, output_dim, bilstm)
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, epoch_number + 1):
    model.train()
    iteration = 0
    for i, data in enumerate(train_loader):
        dat, label = data
        dat = dat.double()
        label = label.double()
        if torch.cuda.is_available():
            dat = dat.cuda()
            label = label.cuda()
        else:
            dat = Variable(dat)
            label = Variable(label)
        out = model(dat)
        optimizer.zero_grad()
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()
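One concrete difference worth ruling out (an illustrative sketch, not a confirmed cause of the gap): Keras initializes the LSTM input kernel with glorot_uniform, the recurrent kernel with orthogonal, and biases to zeros with the forget-gate bias set to 1 (unit_forget_bias), while PyTorch initializes all LSTM weights and biases uniformly in [-1/sqrt(hidden_size), 1/sqrt(hidden_size)]. Re-initializing the PyTorch LSTM to mimic the Keras defaults is one way to test whether initialization explains the difference:

import torch.nn as nn

def init_lstm_like_keras(lstm: nn.LSTM):
    """Rough re-initialization of an nn.LSTM to mimic Keras LSTM defaults."""
    for name, param in lstm.named_parameters():
        if name.startswith("weight_ih"):
            nn.init.xavier_uniform_(param)   # Keras: glorot_uniform on the input kernel
        elif name.startswith("weight_hh"):
            nn.init.orthogonal_(param)       # Keras: orthogonal on the recurrent kernel
        elif name.startswith("bias"):
            nn.init.zeros_(param)
            if name.startswith("bias_ih"):
                # Keras unit_forget_bias=True; PyTorch gate order is (i, f, g, o)
                hidden_size = param.shape[0] // 4
                param.data[hidden_size:2 * hidden_size].fill_(1.0)

# usage with the model defined above
init_lstm_like_keras(model.lstm)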

GRU included CNN for text Generator

I'm trying to integrate a CNN with a GRU. My model passes each image through the CNN, and the CNN features are fed to the GRU frame by frame. The structure is shown in the picture.
This is my example code, which implements the structure above.
encoder :
### input image size: [batch, seq, color_ch, height, width]
### expected output
class CNNencoder(nn.Module):
    def __init__(self, input_size, hidden_size, batch_size=5):
        super(CNNencoder, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.hidden_size = hidden_size
        self.modelVGG = models.vgg11(pretrained=False)
        self.modelVGG = self.modelVGG.to(self.device)
        self.adaptor = nn.Linear(8192, self.hidden_size)
        self.adaptor = self.adaptor.to(self.device)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=False)  # (input_size, hidden_size)
        self.gru = self.gru.to(self.device)
        self.batch_size = batch_size

    def forward(self, input, hidden):
        seqs = input.size()[1]
        for indexseq in range(0, seqs):
            inputImageBatch = input[:, indexseq, :, :, :].view(-1, 3, 128, 128)
            features = self.modelVGG.features(inputImageBatch)
            flat_features = features.view(features.size(0), 1, -1)  # flatten
            if indexseq == 0:
                output = flat_features
            else:
                output = torch.cat((output, flat_features), dim=1)
        # output = flat_features  ## expected [batch, seq, features]
        outputAdaptor = self.adaptor(output)
        outputGru, hidden = self.gru(outputAdaptor, hidden)
        return outputGru, hidden

    def initHidden(self):
        return torch.zeros(1, self.batch_size, self.hidden_size, device=self.device)
I would like to know: how will the CNN parameters get gradients through time if I create a model from this class and call loss.backward()?
The loss value comes from the decoder section.
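For what it's worth, as long as the VGG features are computed inside forward() and concatenated into the tensor that feeds the GRU (as in the code above), autograd keeps the whole graph, so loss.backward() accumulates gradients into the CNN parameters across every time step, just like for the GRU weights. A small sketch to verify this, assuming a placeholder loss in place of the real decoder (the shapes and the sum() loss here are illustrative only):

# Hypothetical smoke test: gradients should reach the VGG weights.
encoder = CNNencoder(input_size=8192, hidden_size=256, batch_size=2)
images = torch.randn(2, 4, 3, 128, 128, device=encoder.device)   # [batch, seq, C, H, W]
hidden = encoder.initHidden()

out, hidden = encoder(images, hidden)
loss = out.sum()    # stand-in for the real decoder loss
loss.backward()

first_conv = next(encoder.modelVGG.features.parameters())
print(first_conv.grad is not None and first_conv.grad.abs().sum().item() > 0)   # True -> CNN received gradients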

Resources