Predicting the output of a sample text using AraBERT model

I'm training an AraBERT model for NER, exactly as they did in this notebook. After training and saving the model, I would like to see the predictions for some samples, so I started by writing the following code:
from transformers import AutoTokenizer, AutoModel
from arabert.preprocess import ArabertPreprocessor
import torch

def predict(sample_text):
    encoded_text = tokenizer.encode_plus(
        sample_text,
        max_length=138,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded_text['input_ids']
    attention_mask = encoded_text['attention_mask']
    output = arabert_model(input_ids, attention_mask)
    _, prediction = torch.max(output, dim=1)
    print(f'Text: {text_preprocessed}')
    print(f'Tags : {prediction}')
    return prediction

arabert_model = AutoModel.from_pretrained('/gdrive/MyDrive/AraBERT Model Config')
text = "محمد ذهب إلى أمريكا للحصول على شهادة الماجستير. "
arabert_prep = ArabertPreprocessor(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
text_preprocessed = arabert_prep.preprocess(text)
predict(text_preprocessed)
I got this error:
TypeError: max() received an invalid combination of arguments - got (BaseModelOutputWithPoolingAndCrossAttentions, dim=int), but expected one of:
* (Tensor input)
* (Tensor input, Tensor other, *, Tensor out)
* (Tensor input, int dim, bool keepdim, *, tuple of Tensors out)
* (Tensor input, name dim, bool keepdim, *, tuple of Tensors out)
I want the output to be in the form of:
[B-PER, O, O, B-LOC, O, O, O, O]
How can I accomplish that?
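The torch.max call fails because a plain AutoModel returns a BaseModelOutputWithPoolingAndCrossAttentions object rather than a tensor, and it carries no NER head at all. Below is a minimal sketch of one way to get per-token tags, assuming the checkpoint was saved from a token-classification model (as in the notebook) and that its config stores an id2label mapping; it reuses the tokenizer loaded above:

from transformers import AutoModelForTokenClassification
import torch

arabert_ner = AutoModelForTokenClassification.from_pretrained('/gdrive/MyDrive/AraBERT Model Config')

def predict_tags(sample_text):
    # no padding, so every position holds a real (sub)word token
    encoded = tokenizer(sample_text, return_tensors='pt', truncation=True, max_length=138)
    with torch.no_grad():
        logits = arabert_ner(**encoded).logits    # (1, seq_len, num_labels)
    pred_ids = logits.argmax(dim=-1).squeeze(0)   # argmax over the label dimension
    # map label ids back to tag strings such as B-PER / B-LOC / O
    return [arabert_ner.config.id2label[i.item()] for i in pred_ids]

Note that the tags come back per word piece (including [CLS] and [SEP]), so subword tokens may need merging before they line up with whole words.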

Related

Converting a Matlab ResNet50 pre-trained CNN from ONNX for use in PyTorch

I first use onnx2pytorch to convert the ONNX model to PyTorch, and then run the test set through it to get the accuracy.
model = torch.load("")
model.eval()
for epoch in range(1):
    testing_loss = 0.0
    testing_correct = 0.0
    zpred, ztrue = [], []
    for test_inputs, test_labels in test_data:
        test_inputs, test_labels = test_inputs.to(device), test_labels.to(device)
        output = model(test_inputs)  # error point
        loss = criterion(output, test_labels)
        _, pred = torch.max(output.data, 1)
        testing_loss += loss.item()
        testing_correct += torch.sum(pred == test_labels.data)
        output = (torch.max(torch.exp(output), 1)[1]).data.cpu().numpy()
        zpred.extend(output)
        testlabel = test_labels.data.cpu().numpy()
        ztrue.extend(testlabel)
    test_loss = testing_loss / len(test_loader)
    test_acc = 100 * testing_correct.cpu().numpy() / len(test_loader)
    print('Epoch is : {}, Test Loss is : {:.4f} Test Accuracy is :{:.4f}%'.format(epoch + 1, test_loss, test_acc))
I got this error; how can I fix it?
sum() received an invalid combination of arguments - got (Tensor, Tensor), but expected one of:
* (Tensor input, *, torch.dtype dtype)
* (Tensor input, tuple of ints dim, bool keepdim, *, torch.dtype dtype, Tensor out)
* (Tensor input, tuple of names dim, bool keepdim, *, torch.dtype dtype, Tensor out)
Is ONNX model inference feasible in PyTorch?
Thank you guys.
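If the conversion keeps failing, note that ONNX inference doesn't require PyTorch at all: onnxruntime can run the exported model directly. A minimal sketch, assuming a ResNet50-style classifier with a single image input (the file name and input shape are placeholders for your model):

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("resnet50.onnx", providers=["CPUExecutionProvider"])  # hypothetical path
input_name = sess.get_inputs()[0].name                   # query the real input name instead of guessing
x = np.random.rand(1, 3, 224, 224).astype(np.float32)    # dummy batch; replace with real test images
logits = sess.run(None, {input_name: x})[0]
pred = logits.argmax(axis=1)                             # predicted class per sample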

SHAP Error - OOM when allocating tensor with shape[23020,128,768]

I am trying to get SHAP values using shap.KernelExplainer for a BERT classifier implemented using Keras layers. The error I get comes from running out of memory, but I am not sure what is causing it, because I have already reduced all the parameters as much as I can.
Error:
ResourceExhaustedError: OOM when allocating tensor with shape[23020,128,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Tile]
Transformer Model:
def process_sentences(sentence: List[str],
                      tokenizer: PreTrainedTokenizer,
                      max_len: int) -> Dict[str, np.ndarray]:
    """
    Tokenize the text sentences.

    Parameters
    ----------
    sentence:
        Sentence to be processed.
    tokenizer:
        Tokenizer to be used.
    max_len:
        Maximum sequence length.

    Returns
    -------
    Tokenized representation containing:
        - input_ids
        - attention_mask
    """
    # since we are using the model for classification, we need to include the
    # special tokens (i.e. '[CLS]', '[SEP]')
    # check the example here: https://huggingface.co/transformers/v4.4.2/quicktour.html
    z = tokenizer(sentence,
                  add_special_tokens=True,
                  padding='max_length',
                  max_length=max_len,
                  truncation=True,
                  return_attention_mask=True,
                  return_tensors='np')
    return z
use_bert = True

if use_bert:
    from transformers import BertTokenizerFast
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
else:
    from transformers import DistilBertTokenizerFast
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

if use_bert:
    from transformers import TFBertModel, BertConfig
    config = BertConfig(output_hidden_states=True)
    transformer = TFBertModel.from_pretrained('bert-base-uncased', config=config)
else:
    from transformers import TFDistilBertModel, DistilBertConfig
    config = DistilBertConfig(output_hidden_states=True)
    transformer = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

X_train = train.clean_review.values.tolist()
X_test = test.clean_review.values.tolist()

# tokenize datasets
X_train = process_sentences(X_train, tokenizer, max_len)
X_test = process_sentences(X_test, tokenizer, max_len)

y_train = y_train.rate
y_test = y_test.rate
class Classifier(tf.keras.Model):
    def __init__(self,
                 transformer,
                 hidden_dims: int = 128,
                 output_dims: int = 2,
                 dropout_rate: float = 0.2):
        """
        Constructor

        Parameters
        ----------
        transformer:
            Transformer model to be leveraged.
        hidden_dims:
            Hidden layer's dimension.
        output_dims:
            Output layer's dimension.
        dropout_rate:
            Dropout layer's dropout rate.
        """
        super().__init__()
        self.hidden_dims = hidden_dims
        self.output_dims = output_dims
        self.dropout_rate = dropout_rate
        self.transformer = transformer
        self.dense_1 = tf.keras.layers.Dense(self.hidden_dims, activation='relu')
        self.dropout_1 = tf.keras.layers.Dropout(self.dropout_rate)
        self.dense_2 = tf.keras.layers.Dense(self.output_dims, activation='softmax')

    def call(self,
             input_ids: Union[np.ndarray, tf.Tensor],
             attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
             training=False):
        """
        Performs a forward pass through the model.

        Parameters
        ----------
        input_ids:
            Indices of input sequence tokens in the vocabulary.
        attention_mask:
            Mask to avoid performing attention on padding token indices.

        Returns
        -------
        Classification probabilities.
        """
        out = self.transformer(input_ids=input_ids, attention_mask=attention_mask, training=training)
        out = out.last_hidden_state[:, 0, :]  # extract the embedding corresponding to the [CLS] token
        out = self.dense_1(out)
        out = self.dropout_1(out, training=training)
        out = self.dense_2(out)
        return out

# define the classification model
model = Classifier(transformer)
SHAP:
## background
X_train = train.clean_review.values.tolist()[:10]
X_train = process_sentences(X_train, tokenizer, max_len)

# tokenize text
tokenized_samples = X_train
X_train = tokenized_samples['input_ids']

# the values of the kwargs have to be `tf.Tensor`.
# see transformers issue #14404: https://github.com/huggingface/transformers/issues/14404
kwargs_train = {k: tf.constant(v) for k, v in tokenized_samples.items() if k == 'attention_mask'}

## sample
X_test = test.clean_review.values.tolist()[:3]
X_test = process_sentences(X_test, tokenizer, max_len)

# tokenize text
tokenized_samples = X_test
X_test = tokenized_samples['input_ids']

# the values of the kwargs have to be `tf.Tensor`.
# see transformers issue #14404: https://github.com/huggingface/transformers/issues/14404
kwargs_test = {k: tf.constant(v) for k, v in tokenized_samples.items() if k == 'attention_mask'}

kernel_explainer = shap.KernelExplainer(model, X_train)
kernel_shap_values = kernel_explainer.shap_values(X_test)
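For context, KernelExplainer evaluates the model on roughly nsamples × background-rows perturbed inputs at once, which is where a tensor like [23020, 128, 768] comes from. A minimal sketch (the numbers are illustrative, not tuned) of the two standard levers for reducing that footprint: shrink the background set with shap.sample or shap.kmeans, and cap nsamples when computing the values:

import shap

background = shap.sample(X_train, 5)                        # summarize the background to a few rows
explainer = shap.KernelExplainer(model, background)
shap_values = explainer.shap_values(X_test, nsamples=100)   # cap the number of coalition samples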

Keras: setting an array element with a sequence error

I am trying to build a very simple LSTM to classify text.
import numpy as np
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense

def encoded(texts):
    res = [one_hot(text, 100000, filters='!"#$%&()*+,-./:;<=>?#[\]^_`{|}~', split=' ') for text in texts]
    return res

def train(X, y, X_t, y_t):
    X = encoded(X)
    X_t = encoded(X_t)
    model = Sequential()
    model.add(Embedding(100000, 100))
    model.add(Bidirectional(LSTM(20, return_sequences=True), merge_mode='ave'))
    model.add(TimeDistributed(Dense(1, activation='sigmoid')))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.fit(np.array(X), np.array(y), batch_size=16, epochs=8)
    score = model.evaluate(np.array(X_t), np.array(y_t), batch_size=16)
    print(score)
However, I get this error:
ValueError: setting an array element with a sequence.
It seems like the embedding layer didn't create a vector of the right dimension, or something is wrong with the format of the input X (X_t).
Any idea?
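A minimal sketch of the usual fix: one_hot() returns lists of different lengths, so np.array(X) produces a ragged object array and NumPy raises "setting an array element with a sequence". Padding every sequence to a fixed length (maxlen=100 here is an arbitrary choice) yields a proper 2-D integer matrix:

from keras.preprocessing.sequence import pad_sequences
import numpy as np

X = pad_sequences(encoded(X), maxlen=100, padding='post')      # shape (num_texts, 100)
X_t = pad_sequences(encoded(X_t), maxlen=100, padding='post')
model.fit(X, np.array(y), batch_size=16, epochs=8)

Note also that TimeDistributed(Dense(1)) emits one prediction per timestep; if you want a single label per text, a plain Dense(1) after an LSTM without return_sequences=True may be what's intended.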

Testing a bAbI dataset trained model using input()

I was learning to use Memory Networks with the Keras chatbot for the bAbI dataset. I tried making a few modifications to the code so that it takes input from the user and predicts the answer, but failed. I want to take input from the user using the input() function but don't know how to do it. I tried reading .txt files, but it didn't work.
Here is the original code by keras-team
from __future__ import print_function

from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout
from keras.layers import add, dot, concatenate
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from functools import reduce
import tarfile
import numpy as np
import re


def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]


def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format

    If only_supporting is true, only the sentences
    that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:
        line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
            else:
                # Provide all the substories
                substory = [x for x in story if x]
            data.append((substory, q, a))
            story.append('')
        else:
            sent = tokenize(line)
            story.append(sent)
    return data


def get_stories(f, only_supporting=False, max_length=None):
    '''Given a file name, read the file,
    retrieve the stories,
    and then convert the sentences into a single story.

    If max_length is supplied,
    any stories longer than max_length tokens will be discarded.
    '''
    data = parse_stories(f.readlines(), only_supporting=only_supporting)
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    data = [(flatten(story), q, answer) for story, q, answer in data
            if not max_length or len(flatten(story)) < max_length]
    return data


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        xq = [word_idx[w] for w in query]
        # let's not forget that index 0 is reserved
        y = np.zeros(len(word_idx) + 1)
        y[word_idx[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(X, maxlen=story_maxlen),
            pad_sequences(Xq, maxlen=query_maxlen), np.array(Y))


try:
    path = get_file('babi-tasks-v1-2.tar.gz',
                    origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')
except:
    print('Error downloading dataset, please download it manually:\n'
          '$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz\n'
          '$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz')
    raise

tar = tarfile.open(path)

challenges = {
    # QA1 with 10,000 samples
    'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
    # QA2 with 10,000 samples
    'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
}
challenge_type = 'single_supporting_fact_10k'
challenge = challenges[challenge_type]

print('Extracting stories for the challenge:', challenge_type)
train_stories = get_stories(tar.extractfile(challenge.format('train')))
test_stories = get_stories(tar.extractfile(challenge.format('test')))

vocab = set()
for story, q, answer in train_stories + test_stories:
    vocab |= set(story + q + [answer])
vocab = sorted(vocab)

# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
story_maxlen = max(map(len, (x for x, _, _ in train_stories + test_stories)))
query_maxlen = max(map(len, (x for _, x, _ in train_stories + test_stories)))

print('-')
print('Vocab size:', vocab_size, 'unique words')
print('Story max length:', story_maxlen, 'words')
print('Query max length:', query_maxlen, 'words')
print('Number of training stories:', len(train_stories))
print('Number of test stories:', len(test_stories))
print('-')
print('Here\'s what a "story" tuple looks like (input, query, answer):')
print(train_stories[1])
print('-')
print('Vectorizing the word sequences...')

word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
inputs_train, queries_train, answers_train = vectorize_stories(train_stories,
                                                               word_idx,
                                                               story_maxlen,
                                                               query_maxlen)
inputs_test, queries_test, answers_test = vectorize_stories(test_stories,
                                                            word_idx,
                                                            story_maxlen,
                                                            query_maxlen)

print('-')
print('inputs: integer tensor of shape (samples, max_length)')
print('inputs_train shape:', inputs_train.shape)
print('inputs_test shape:', inputs_test.shape)
print('-')
print('queries: integer tensor of shape (samples, max_length)')
print('queries_train shape:', queries_train.shape)
print('queries_test shape:', queries_test.shape)
print('-')
print('answers: binary (1 or 0) tensor of shape (samples, vocab_size)')
print('answers_train shape:', answers_train.shape)
print('answers_test shape:', answers_test.shape)
print('-')
print('Compiling...')

# placeholders
input_sequence = Input((story_maxlen,))
question = Input((query_maxlen,))

# encoders
# embed the input sequence into a sequence of vectors
input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_size,
                              output_dim=64))
input_encoder_m.add(Dropout(0.3))
# output: (samples, story_maxlen, embedding_dim)

# embed the input into a sequence of vectors of size query_maxlen
input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_size,
                              output_dim=query_maxlen))
input_encoder_c.add(Dropout(0.3))
# output: (samples, story_maxlen, query_maxlen)

# embed the question into a sequence of vectors
question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_size,
                               output_dim=64,
                               input_length=query_maxlen))
question_encoder.add(Dropout(0.3))
# output: (samples, query_maxlen, embedding_dim)

# encode input sequence and questions (which are indices)
# to sequences of dense vectors
input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

# compute a 'match' between the first input vector sequence
# and the question vector sequence
# shape: `(samples, story_maxlen, query_maxlen)`
match = dot([input_encoded_m, question_encoded], axes=(2, 2))
match = Activation('softmax')(match)

# add the match matrix with the second input vector sequence
response = add([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)

# concatenate the match matrix with the question vector sequence
answer = concatenate([response, question_encoded])

# the original paper uses a matrix multiplication for this reduction step.
# we choose to use a RNN instead.
answer = LSTM(32)(answer)  # (samples, 32)

# one regularization layer -- more would probably be needed.
answer = Dropout(0.3)(answer)
answer = Dense(vocab_size)(answer)  # (samples, vocab_size)
# we output a probability distribution over the vocabulary
answer = Activation('softmax')(answer)

# build the final model
model = Model([input_sequence, question], answer)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])

# train
model.fit([inputs_train, queries_train], answers_train,
          batch_size=32,
          epochs=120,
          validation_data=([inputs_test, queries_test], answers_test))

model_path1 = r'model_chatbot.h5'
model.save(model_path1)

# model save as pickle file
# model load again
# write story answer question in the format in a text file
model.load_weights(model_path1)
pred_results = model.predict([inputs_test, queries_test])

# Display a selected test story
n = np.random.randint(0, 1000)
story_list = test_stories[n][0]
story = ' '.join(word for word in story_list)
print("Story is:", story)

question_list = test_stories[n][1]
ques = ' '.join(word for word in question_list)
print("Question is: ", ques)

ans = test_stories[n][2]
print("Actual answer is: ", ans)

# Generate prediction from model
val_max = np.argmax(pred_results[n])
for key, val in word_idx.items():
    if val == val_max:
        k = key
print("Machine answer is: ", k)
print("I am ", pred_results[n][val_max], "certain of it")
How can I take input using input() and make predictions?
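A minimal sketch of wiring input() into the trained model, reusing tokenize(), word_idx, story_maxlen and query_maxlen from the script above. Words outside the bAbI vocabulary would raise a KeyError in the lookup, so they are skipped here:

user_story = tokenize(input("Story: "))
user_query = tokenize(input("Question: "))

# vectorize exactly as vectorize_stories() does, skipping out-of-vocabulary words
s = pad_sequences([[word_idx[w] for w in user_story if w in word_idx]], maxlen=story_maxlen)
q = pad_sequences([[word_idx[w] for w in user_query if w in word_idx]], maxlen=query_maxlen)

pred = model.predict([s, q])
answer_id = int(np.argmax(pred[0]))
idx_word = {i: w for w, i in word_idx.items()}  # reverse lookup for the answer token
print("Machine answer is:", idx_word.get(answer_id, "<unknown>"))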

Pytorch: How to compute IoU (Jaccard Index) for semantic segmentation

Can someone provide a toy example of how to compute IoU (intersection over union) for semantic segmentation in pytorch?
As of 2021, there's no need to implement your own IoU, as torchmetrics comes equipped with it.
It is named torchmetrics.JaccardIndex (previously torchmetrics.IoU) and calculates what you want.
It works with PyTorch and PyTorch Lightning, also with distributed training.
From the documentation:
torchmetrics.JaccardIndex(num_classes, ignore_index=None, absent_score=0.0, threshold=0.5, multilabel=False, reduction='elementwise_mean', compute_on_step=None, **kwargs)
Computes Intersection over union, or Jaccard index calculation:
J(A,B) = \frac{|A\cap B|}{|A\cup B|}
Where: A and B are both tensors of the same size, containing integer class values. They may be subject to conversion from input data (see description below). Note that it is different from box IoU.
Works with binary, multiclass and multi-label data. Accepts probabilities from a model output or integer class values in prediction. Works with multi-dimensional preds and target.
Forward accepts
preds (float or long tensor): (N, ...) or (N, C, ...) where C is the number of classes
target (long tensor): (N, ...) If preds and target
are the same shape and preds is a float tensor, we use the
self.threshold argument to convert into integer labels. This is the case for binary and multi-label probabilities.
If preds has an extra dimension as in the case of multi-class scores we perform an argmax on dim=1.
Official example:
>>> from torchmetrics import JaccardIndex
>>> target = torch.randint(0, 2, (10, 25, 25))
>>> pred = torch.tensor(target)
>>> pred[2:5, 7:13, 9:15] = 1 - pred[2:5, 7:13, 9:15]
>>> jaccard = JaccardIndex(num_classes=2)
>>> jaccard(pred, target)
tensor(0.9660)
I found this somewhere and adapted it for me. I'll post the link if I can find it again. Sorry in case this was a duplicate.
The key function here is the function called iou. The wrapping function evaluate_performance is not universal, but it shows that one needs to iterate over all results before computing IoU.
import torch
import numpy as np
import pandas as pd  # For filelist reading
import myPytorchDatasetClass  # Custom dataset class, inherited from torch.utils.data.dataset


def iou(pred, target, n_classes=12):
    ious = []
    pred = pred.view(-1)
    target = target.view(-1)

    # Ignore IoU for background class ("0")
    for cls in range(1, n_classes):  # This goes from 1:n_classes-1 -> class "0" is ignored
        pred_inds = pred == cls
        target_inds = target == cls
        intersection = (pred_inds[target_inds]).long().sum().item()  # Cast to long to prevent overflows
        union = pred_inds.long().sum().item() + target_inds.long().sum().item() - intersection
        if union == 0:
            ious.append(float('nan'))  # If there is no ground truth, do not include in evaluation
        else:
            ious.append(float(intersection) / float(max(union, 1)))
    return np.array(ious)


def evaluate_performance(net):
    # Dataloader for test data
    batch_size = 1
    filelist_name_test = '/path/to/my/test/filelist.txt'
    data_root_test = '/path/to/my/data/'
    dset_test = myPytorchDatasetClass.CustomDataset(filelist_name_test, data_root_test)
    test_loader = torch.utils.data.DataLoader(dataset=dset_test,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              pin_memory=True)
    data_info = pd.read_csv(filelist_name_test, header=None)
    num_test_files = data_info.shape[0]
    sample_size = num_test_files

    # Containers for results
    preds = torch.zeros((sample_size, 60, 36, 60))
    gts = torch.zeros((sample_size, 60, 36, 60))

    dataiter = iter(test_loader)
    for i in range(sample_size):
        images, labels, filename = next(dataiter)
        images = images.cuda()
        gts[i:i + batch_size, :, :, :] = labels
        outputs = net(images)
        outputs = outputs.permute(0, 2, 3, 4, 1).contiguous()
        val, pred = torch.max(outputs, 4)
        preds[i:i + batch_size, :, :, :] = pred.cpu()
    acc = iou(preds, gts)
    return acc
Say your outputs are of shape [32, 256, 256], where 32 is the minibatch size and 256×256 is the image's height and width, and the labels have the same shape.
Then you can use sklearn's jaccard_score (named jaccard_similarity_score in older scikit-learn releases) after some reshaping.
If both are torch tensors, then:

lbl = labels.cpu().numpy().reshape(-1)
target = output.cpu().numpy().reshape(-1)

Now:

from sklearn.metrics import jaccard_score as jsc  # jaccard_similarity_score in older scikit-learn
print(jsc(target, lbl, average='macro'))  # choose an averaging mode for multiclass labels
