Select multiple target variables in keras tensorflow - python-3.x

I am trying to follow a Kaggle kernel for a BERT implementation:
https://www.kaggle.com/hiromoon166/save-bert-fine-tuning-model
But I am not able to select the target variables. I have to select multiple target columns as my y variable, since this is a multi-label classification problem.
This is the line of code I am stuck on:
train_lines, train_labels = train_df['comment_text'].values, train_df.target.values
def convert_lines(example, max_seq_length, tokenizer):
    max_seq_length -= 2
    all_tokens = []
    longer = 0
    for i in range(example.shape[0]):
        tokens_a = tokenizer.tokenize(example[i])
        if len(tokens_a) > max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"]) + [0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    print(longer)
    return np.array(all_tokens)
nb_epochs=1
bsz = 32
dict_path = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
tokenizer = tokenization.FullTokenizer(vocab_file=dict_path, do_lower_case=True)
print('build tokenizer done')
train_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
train_df = train_df.sample(frac=0.01,random_state = 42)
#train_df['comment_text'] = train_df['comment_text'].replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\n', ' ', regex=True)
train_lines, train_labels = train_df['comment_text'].values, train_df.target.values
print('sample used',train_lines.shape)
token_input = convert_lines(train_lines,maxlen,tokenizer)
seg_input = np.zeros((token_input.shape[0],maxlen))
mask_input = np.ones((token_input.shape[0],maxlen))
print(token_input.shape)
print(seg_input.shape)
print(mask_input.shape)
print('begin training')
model3.fit([token_input, seg_input, mask_input],train_labels,batch_size=bsz,epochs=nb_epochs)
Please help me understand how to select the target variables here.
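For reference, a minimal sketch of selecting several label columns at once (not from the kernel; the column names below are assumptions and should be replaced with the actual label columns in your train.csv):

label_cols = ['target', 'severe_toxicity', 'obscene', 'insult', 'threat']  # hypothetical names

train_lines = train_df['comment_text'].values
train_labels = train_df[label_cols].values  # 2-D array of shape (n_samples, n_labels)

The model's output layer then needs one unit per label, e.g. Dense(len(label_cols), activation='sigmoid') with a binary_crossentropy loss, so that train_labels can be passed to model.fit as-is.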

Related

Why are albumentations Augmentations (Yolo / YoloV5) altering Bounding Boxes if no augmentations are being placed?

I was using the Albumentations library to perform some data augmentations on an object detection dataset that I intended to train a YoloV5 model on.
I have to perform the augmentations separately and save the images locally to disk, but when I do, I noticed that some of the returned output bounding boxes aren't generated properly.
I have my augmentations set up in a separate aug.py file, shown below (augmentations purposefully removed in debugging attempts, see below):
import albumentations as A
import cv2
PROB = 0.5
bbp = A.BboxParams(format="yolo")
horizontal_flip_transform = A.Compose([
], bbox_params=bbp)
vertical_flip_transform = A.Compose([
], bbox_params=bbp)
pixel_dropout_transform = A.Compose([
], bbox_params=bbp)
random_rotate = A.Compose([
], bbox_params=bbp)
# NOTE: THIS METHOD IMPLIES THAT THE IMAGE WIDTHS MUST BE AT LEAST 50 PIXELS
# Remove this aug to remove this constraint
random_crop = A.Compose([
], bbox_params=bbp)

augs = [horizontal_flip_transform, vertical_flip_transform, pixel_dropout_transform, random_rotate, random_crop]

def get_augmentations():
    return augs
The relevant parts of my implementation for performing the augmentations and saving them to disk are below:
def run_augments_on_image(img_name, bboxes, max_images_to_generate=500):
    ret = []
    img = np.array(Image.open(img_name), dtype=np.uint8)
    transforms = get_augmentations()
    for i in range(min(len(transforms), max_images_to_generate)):
        transformed = transforms[i](image=img, bboxes=bboxes)
        ret.append((transformed["image"], transformed["bboxes"]))
    return ret
def run_and_save_augments_on_image_sets(batch_img_names, bboxes_urls, max_images_to_generate, dataset_dir, trainval):
    num_images = 0
    for i in range(len(batch_img_names)):
        bboxes = []
        with open(os.path.join(dataset_dir, trainval, 'labels', bboxes_urls[i]), 'r') as f:
            for row in f:
                x = row.strip().split(' ')
                x.append(row[0])  # move the class id (first char of the row) to the end, as albumentations' yolo format expects
                x.pop(0)
                x[0] = float(x[0])
                x[1] = float(x[1])
                x[2] = float(x[2])
                x[3] = float(x[3])
                bboxes.append(x)
        trans = run_augments_on_image(os.path.join(dataset_dir, trainval, 'images', batch_img_names[i]), bboxes)
        img_index = len(os.listdir(os.path.join(dataset_dir, 'train', 'images'))) + len(os.listdir(os.path.join(dataset_dir, 'valid', 'images'))) + 1
        for j in range(len(trans)):
            img_trans, bboxes_trans = trans[j]
            p = Image.fromarray(img_trans).save(os.path.join(dataset_dir, trainval, 'images', f'image-{img_index}.{batch_img_names[j].split(".")[-1]}'))
            with open(os.path.join(dataset_dir, trainval, 'labels', f'image-{img_index}.txt'), 'w') as f:
                for boxs in bboxes_trans:
                    print(f'{boxs[-1]} {boxs[0]} {boxs[1]} {boxs[2]} {boxs[3]}', file=f)
            num_images += 1
            img_index += 1
            if num_images >= max_images_to_generate:
                break
        if num_images >= max_images_to_generate:
            break
For testing purposes (some of the bounding boxes were off), I removed all the actual augmentations, expecting the input image's label (one augmented example shown below) to be equal to the augmented label, since no augmentations were applied. But, as you can see, the two labels are different.
img-original.txt
0 0.5662285714285714 0.2740066225165563 0.5297714285714286 0.4837913907284769
img-augmented.txt
0 0.51488 0.47173333333333334 0.6405099999999999 0.6527333333333334
(The labels above are in normalized xywh YOLO format)
Why is albumentations altering the labels? None of the augmentations in aug.py contain anything.
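For debugging, one option is to convert both labels from normalized YOLO xywh to pixel corner coordinates and compare them directly. A minimal sketch (the 640x480 image size is a made-up placeholder; use the real dimensions):

def yolo_to_corners(xc, yc, w, h, img_w, img_h):
    # normalized (x_center, y_center, width, height) -> absolute (x1, y1, x2, y2)
    return ((xc - w / 2) * img_w, (yc - h / 2) * img_h,
            (xc + w / 2) * img_w, (yc + h / 2) * img_h)

print(yolo_to_corners(0.5662285714285714, 0.2740066225165563,
                      0.5297714285714286, 0.4837913907284769, 640, 480))
print(yolo_to_corners(0.51488, 0.47173333333333334,
                      0.6405099999999999, 0.6527333333333334, 640, 480))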

How to implement a custom Pytorch Dataset with map() method

Many of the Pytorch examples use the Dataset map() method. For example:
https://huggingface.co/voidful/wav2vec2-large-xlsr-53-tw-gpt
ds = load_dataset("common_voice", 'zh-TW', split="test")
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
def map_to_array(batch):
    audio = batch["audio"]
    batch["speech"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["sampling_rate"] = audio["sampling_rate"]
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower().replace("’", "'")
    return batch

ds = ds.map(map_to_array)

def map_to_pred(batch):
    features = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0], padding=True, return_tensors="pt")
    input_values = features.input_values.to(device)
    attention_mask = features.attention_mask.to(device)
    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits
    pred_ids = torch.argmax(logits, dim=-1)
    batch["predicted"] = processor.batch_decode(pred_ids)
    batch["target"] = batch["sentence"]
    return batch

result = ds.map(map_to_pred, batched=True, batch_size=3, remove_columns=list(ds.features.keys()))
However, implementing a custom map-style dataset only requires __len__() and __getitem__().
What is the correct way to convert a custom Dataset into one with all the useful methods needed by the examples?
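One possible direction (a sketch, not an authoritative answer): the map() in these examples belongs to the Hugging Face datasets library, not to torch.utils.data.Dataset, so a custom map-style dataset can be materialized into a datasets.Dataset, for instance with Dataset.from_dict:

from datasets import Dataset

class MyTorchDataset:
    # a minimal map-style dataset: only __len__ and __getitem__
    def __init__(self, texts):
        self.texts = texts
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, i):
        return {"text": self.texts[i]}

torch_ds = MyTorchDataset(["hello world", "foo bar"])

# materialize the items into a datasets.Dataset, which provides .map(),
# .cast_column(), batching, column removal, etc.
hf_ds = Dataset.from_dict({"text": [torch_ds[i]["text"] for i in range(len(torch_ds))]})
hf_ds = hf_ds.map(lambda batch: {"n_words": [len(t.split()) for t in batch["text"]]}, batched=True)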

getting TypeError: Expected int32, got None of type 'NoneType' instead

I have implemented a sequence-to-sequence model with an attention layer. If I use 300,000 data points I'm not getting any error, but if I use all of my data points, model.fit gives the following error:
TypeError: Expected int32, got None of type 'NoneType' instead.
What would be the reason for this?
The code before model.fit is:
class encoder_decoder(tf.keras.Model):
    def __init__(self, embedding_size, encoder_inputs_length, output_length, vocab_size, output_vocab_size, score_fun, units):
        super(encoder_decoder, self).__init__()
        self.vocab_size = vocab_size
        self.enc_units = units
        self.embedding_size = embedding_size
        self.encoder_inputs_length = encoder_inputs_length
        self.output_length = output_length
        self.lstm_output = 0
        self.state_h = 0
        self.state_c = 0
        self.output_vocab_size = output_vocab_size
        self.dec_units = units
        self.score_fun = score_fun
        self.att_units = units
        self.encoder = Encoder(self.vocab_size, self.embedding_size, self.enc_units, self.encoder_inputs_length)
        self.decoder = Decoder(self.output_vocab_size, self.embedding_size, self.output_length, self.dec_units, self.score_fun, self.att_units)
        # self.dense = Dense(self.output_vocab_size, activation="softmax")

    def call(self, data):
        input, output = data[0], data[1]
        encoder_hidden = self.encoder.initialize_states(input.shape[0])
        encoder_output, encoder_hidden, encoder_cell = self.encoder(input, encoder_hidden)
        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell
        decoder_output = self.decoder(output, encoder_output, decoder_hidden, decoder_cell)
        return decoder_output
Inside the call function I'm initializing states for the encoder, getting the number of rows from the input with the following line of code:
encoder_hidden = self.encoder.initialize_states(input.shape[0])
If I print input, I get its shape as (None, 55); that's the reason I'm getting this error.
My total number of data points is 330,614. When I use all my data I get this error, but when I use only 330,000 data points I don't.
If I print a batch inside the call method, I get its shape as (64, 55).
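Since input.shape[0] is the static shape, it is None whenever Keras traces call with a dynamic batch dimension. A common workaround (a sketch; it assumes initialize_states builds its state tensors with something like tf.zeros, which accepts a dynamic size) is to read the batch size at runtime with tf.shape:

    def call(self, data):
        input, output = data[0], data[1]
        batch_size = tf.shape(input)[0]  # runtime tensor, never None
        encoder_hidden = self.encoder.initialize_states(batch_size)
        encoder_output, encoder_hidden, encoder_cell = self.encoder(input, encoder_hidden)
        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell
        return self.decoder(output, encoder_output, decoder_hidden, decoder_cell)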
Please find below my code for creating the dataset for my sequence-to-sequence model: the function to preprocess the data, the function to create the dataset, and a function to load the dataset.
def preprocess_sentence(w):
    # w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.strip()
    w = '<start> ' + w + ' <end>'
    return w

def create_dataset(path, num_examples):
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    # lines1 = lines[330000:]
    # lines = lines[0:323386]+lines1
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]]
    word_pairs = [[i[0], i[1]] for i in word_pairs]
    return zip(*word_pairs)

def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer

def load_dataset(path, num_examples=None):
    # creating cleaned input, output pairs
    targ_lang, inp_lang = create_dataset(path, num_examples)
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer, targ_lang, inp_lang

# Try experimenting with the size of the dataset
num_examples = None
input_tensor, target_tensor, inp_lang, targ_lang, targ_lang_text, inp_lang_text = load_dataset(path, num_examples)

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)
The shapes of the datasets are as follows:
shape of input train (269291, 55)
shape of target train (269291, 53)
shape of input test (67323, 55)
shape of target test (67323, 53)
Could you share the code block that comes before model.fit?
The NoneType error indicates that the final array passed to the model is for some reason empty. You can add print statements at previous steps to understand where along the way your array became empty.
Compare with the scenario where you take all your data points, so that you can understand where the array changes and how it is handled prior to being passed to model.fit.
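A sketch of that suggestion, printing shapes at each stage to spot where something becomes None or empty (variable names taken from the question's code):

print('input_tensor:', input_tensor.shape)
print('target_tensor:', target_tensor.shape)
print('train:', input_tensor_train.shape, target_tensor_train.shape)
print('val:', input_tensor_val.shape, target_tensor_val.shape)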

'Word2Vec' object has no attribute 'index2word'

I'm getting the error "AttributeError: 'Word2Vec' object has no attribute 'index2word'" in the following Python code. Does anyone know how I can solve it?
Actually, "tfidf_weighted_averaged_word_vectorizer" is what throws the error. "obli.csv" contains lines of sentences.
Thank you.
from feature_extractors import tfidf_weighted_averaged_word_vectorizer

dataset = get_data2()
corpus, labels = dataset.data, dataset.target
corpus, labels = remove_empty_docs(corpus, labels)
# print('Actual class label:', dataset.target_names[labels[10]])
train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(corpus,
                                                                        labels,
                                                                        test_data_proportion=0.3)
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(train_corpus)
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_train,
                                                                  tfidf_vectors=tfidf_train_features,
                                                                  tfidf_vocabulary=vocab,
                                                                  model=model,
                                                                  num_features=100)

def get_data2():
    obli = pd.read_csv('db/obli.csv').values.ravel().tolist()
    cl0 = [0 for x in range(len(obli))]
    nonObli = pd.read_csv('db/nonObli.csv').values.ravel().tolist()
    cl1 = [1 for x in range(len(nonObli))]
    all = obli + nonObli
    db = Db(all, cl0 + cl1)
    db.data = all
    db.target = cl0 + cl1
    return db
This is code from chapter 4 of Text Analytics with Python by Dipanjan Sarkar.
index2word in gensim has been moved since that text was published.
Instead of model.index2word you should use model.wv.index2word.
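A minimal sketch of the rename (the toy corpus is just for illustration; note that gensim 4.x renamed the attribute again, to index_to_key):

from gensim.models import Word2Vec

model = Word2Vec(sentences=[["hello", "world"]], vector_size=10, min_count=1)

# gensim 1.x-3.x: the vocabulary list lives on the KeyedVectors object
# words = model.wv.index2word
# gensim 4.x: the same list is called index_to_key
words = model.wv.index_to_key
print(words)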

Add new variables to optimize to a tensorflow model

I have a tensorflow version of GloVe:
Xij = tf.placeholder(tf.float32, shape=[None], name="Xij")
wordI = tf.placeholder(tf.int32, shape=[None], name="wordI")
wordIW = tf.Variable(tf.random_uniform([inputSize, embedSize], initRange, -initRange), name="wordIW")
wordIB = tf.Variable(tf.random_uniform([inputSize], initRange, -initRange), name="wordIB")
wi = tf.nn.embedding_lookup([wordIW], wordI, name="wi")
bi = tf.nn.embedding_lookup([wordIB], wordI, name="bi")
wordJ = tf.placeholder(tf.int32, shape=[None], name="wordJ")
wordJW = tf.Variable(tf.random_uniform([outputSize, embedSize], initRange, -initRange), name="wordJW")
wordJB = tf.Variable(tf.random_uniform([outputSize], initRange, -initRange), name="wordJB")
wj = tf.nn.embedding_lookup([wordJW], wordJ, name="wj")
bj = tf.nn.embedding_lookup([wordJB], wordJ, name="bj")
scalingFactor = tf.constant([scalingFactor], name="scalingFactor")
countMax = tf.constant([countMax], name="countMax")
wFactor = tf.minimum(1.0, tf.pow(tf.div(Xij, countMax), scalingFactor))
wiwjProduct = tf.reduce_sum(tf.multiply(wi, wj), 1)
logXij = tf.log(Xij)
dist = tf.square(tf.add_n([wiwjProduct, bi, bj, tf.negative(logXij)]))
loss = tf.reduce_sum(tf.multiply(wFactor, dist), name="loss")
tf.summary.scalar("GloVeLoss", loss)
global_step = tf.Variable(0, trainable=False, name="global_step")
learn_rate = tf.Variable(learn_rate, trainable=False, name="learn_rate")
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learn_rate, name="optimizer").minimize(loss, global_step=global_step)
saver = tf.train.Saver()
I would like to add a new input (increase the input size by 1 and add the corresponding rows to wordIW and wordIB). I can't find a proper way to modify the existing graph.
PS: my goal is to stop the gradient on the existing variables and optimize the loss only on the new input.
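One possible direction (a sketch under assumptions, not a verified answer): instead of resizing the existing variables, create new trainable rows and concatenate them with a frozen copy of the old tables via tf.stop_gradient, so gradients flow only into the new rows:

# hypothetical variables for the single added input (TF1-style, matching the post)
newRowW = tf.Variable(tf.random_uniform([1, embedSize], -initRange, initRange), name="newRowW")
newRowB = tf.Variable(tf.random_uniform([1], -initRange, initRange), name="newRowB")

# frozen old tables + trainable new row -> tables of size [inputSize + 1, ...]
wordIW_ext = tf.concat([tf.stop_gradient(wordIW), newRowW], axis=0)
wordIB_ext = tf.concat([tf.stop_gradient(wordIB), newRowB], axis=0)

# perform the lookups against the extended tables instead of the originals
wi = tf.nn.embedding_lookup(wordIW_ext, wordI, name="wi")
bi = tf.nn.embedding_lookup(wordIB_ext, wordI, name="bi")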
