TensorBoard embedding visualizer does not show labels - python-3.x

I'm trying to use the TensorBoard embedding visualizer to represent a set of 7307 verb embeddings that I've just generated, but the plotted points disappear when I select the Enable 3D labels mode.
Here's my code:
def plot(tsne_matrix, labels_path):
    PATH = os.getcwd()
    LOG_DIR = PATH
    metadata = os.path.join(LOG_DIR, labels_path)
    # Setup a 2D tensor that holds the embeddings
    words = tf.Variable(tsne_matrix, name="words")
    with tf.Session() as session:
        # Periodically save the model variables in a checkpoint in LOG_DIR.
        saver = tf.train.Saver([words])
        session.run(words.initializer)
        saver.save(session, os.path.join(LOG_DIR, "model.ckpt"))
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = words.name
    embedding.metadata_path = metadata
    summary_writer = tf.summary.FileWriter(LOG_DIR)
    projector.visualize_embeddings(summary_writer, config)
The metadata that I want to use consists just of the names of the embeddings (in my case, verbs). The labels are stored as a list inside a dictionary alongside other lists, so I'm using this function to write them to a TSV file (the required format):
# Extract list of labels:
def labels2tsv(name, path):
    output = json2dict("output_parsed.json")
    if name == 'verbs':
        labels_list = list(output["verbs"].keys())
    elif name == 'objects':
        labels_list = list(output["objects"].keys())
    with open(path, 'w') as f:
        wr = csv.writer(f, delimiter='\t')
        wr.writerow(str(labels_list))
The code that I execute then is:
# obtain labels
labels2tsv('verbs', 'verbs_metadata.tsv')
labels2tsv('objects', 'objects_metadata.tsv')
# plotting
tsne_verbs = np.load('verbs_tsne.npy')
plot(tsne_verbs, "verbs_metadata.tsv")
Finally, I access TensorBoard with the command tensorboard --logdir=LOG_DIR.
The generated projector_config.pbtxt file (which is also in LOG_DIR) has the following content:
embeddings {
  tensor_name: "Variable:0"
  metadata_path: "verbs_metadata.tsv"
}
I guess that the points disappear because I'm not doing a correct metadata association, but I can't see the mistake. It also crashes in both Chrome and Firefox.
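For reference, as far as I understand the required format, single-column metadata should be one label per line with no header row, and the number of lines must exactly match the number of embedding rows. A minimal sketch of writing the verbs that way, assuming labels_list holds the 7307 verb strings in the same order as the rows of tsne_matrix:
# Sketch: one verb per line, no header, same order/count as the rows of tsne_matrix.
with open("verbs_metadata.tsv", "w", encoding="utf-8") as f:
    for verb in labels_list:
        f.write(verb + "\n")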

Related

Text Classification on a custom dataset with spacy v3

I am really struggling to make things work with the new spaCy v3 version. The documentation is extensive, but I am trying to run a training loop in a script.
(I am also not able to perform text classification training with the CLI approach.)
The data are publicly available here.
import pandas as pd
import spacy
from spacy.training import Example
import random

TRAIN_DATA = pd.read_json('data.jsonl', lines=True)

nlp = spacy.load('en_core_web_sm')
config = {
    "threshold": 0.5,
}
textcat = nlp.add_pipe("textcat", config=config, last=True)

label = TRAIN_DATA['label'].unique()
for label in label:
    textcat.add_label(str(label))

nlp = spacy.blank("en")
nlp.begin_training()

# Loop for 10 iterations
for itn in range(100):
    # Shuffle the training data
    losses = {}
    TRAIN_DATA = TRAIN_DATA.sample(frac=1)
    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAIN_DATA.values, size=4):
        texts = [nlp.make_doc(text) for text, entities in batch]
        annotations = [{"cats": entities} for text, entities in batch]
        # uses an example object rather than text/annotation tuple
        print(texts)
        print(annotations)
        examples = [Example.from_dict(a)]
        nlp.update(examples, losses=losses)
    if itn % 20 == 0:
        print(losses)
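For comparison, here is a minimal sketch of how the same pieces (Example.from_dict, minibatch, nlp.update) are usually wired together in spaCy v3. The category names and the toy training pairs below are hypothetical, and loading data.jsonl is omitted:
import random
import spacy
from spacy.training import Example
from spacy.util import minibatch

nlp = spacy.blank("en")                      # the pipeline that actually gets trained
textcat = nlp.add_pipe("textcat", last=True)
for label in ("POSITIVE", "NEGATIVE"):       # hypothetical labels
    textcat.add_label(label)

# toy data: (text, {"cats": {label: 0.0 or 1.0}}) pairs
train_data = [
    ("this is great", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("this is awful", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
train_examples = [Example.from_dict(nlp.make_doc(text), annots)
                  for text, annots in train_data]

optimizer = nlp.initialize(lambda: train_examples)   # v3 replacement for begin_training()
for itn in range(20):
    random.shuffle(train_examples)
    losses = {}
    for batch in minibatch(train_examples, size=4):
        nlp.update(batch, sgd=optimizer, losses=losses)
    print(itn, losses)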

Huggingface Transformers NER - Offset Mapping Causing ValueError in NumPy boolean array indexing assignment

I was trying out the NER tutorial Token Classification with W-NUT Emerging Entities (https://huggingface.co/transformers/custom_datasets.html#tok-ner) in google colab using the Annotated Corpus for Named Entity Recognition data on Kaggle (https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus?select=ner_dataset.csv).
I will outline my process in detail so it is clear what I was doing, and to help the community pinpoint the source of the indexing-assignment error.
To load the data from Google Drive, where I have saved it, I used the following code:
# import pandas library
import pandas as pd
# columns to select
cols_to_select = ["Sentence #", "Word", "Tag"]
# google drive data path
data_path = '/content/drive/MyDrive/Colab Notebooks/ner/ner_dataset.csv'
# load the data from google colab
dataset = pd.read_csv(data_path, encoding="latin-1")[cols_to_select].fillna(method = 'ffill')
I ran the following code to parse the sentences and tags:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                     s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def retrieve(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
# get full data
getter = SentenceGetter(dataset)
# get sentences
sentences = [[s[0] for s in sent] for sent in getter.sentences]
# get tags/labels
tags = [[s[1] for s in sent] for sent in getter.sentences]
# take a look at the data
print(sentences[0][0:5], tags[0][0:5], sep='\n')
I then split the data into train, val, and test sets
# import the sklearn module
from sklearn.model_selection import train_test_split
# split data into temp and test sets
temp_texts, test_texts, temp_tags, test_tags = train_test_split(sentences,
                                                                tags,
                                                                test_size=0.20,
                                                                random_state=15)
# split data into train and validation sets
train_texts, val_texts, train_tags, val_tags = train_test_split(temp_texts,
                                                                temp_tags,
                                                                test_size=0.20,
                                                                random_state=15)
After splitting the data, I created encodings for tags and the tokens
unique_tags=dataset.Tag.unique()
# create tags to id
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
# create id to tags
id2tag = {id: tag for tag, id in tag2id.items()}
I then installed the transformer library in colab
# install the transformers library
! pip install transformers
Next I imported the small bert model
# import the transformers module
from transformers import BertTokenizerFast
# import the small bert model
model_name = "google/bert_uncased_L-4_H-512_A-8"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
I then created the encodings for the tokens
# create train set encodings
train_encodings = tokenizer(train_texts,
                            is_split_into_words=True,
                            return_offsets_mapping=True,
                            padding=True,
                            max_length=128,
                            truncation=True)
# create validation set encodings
val_encodings = tokenizer(val_texts,
                          is_split_into_words=True,
                          return_offsets_mapping=True,
                          padding=True,
                          max_length=128,
                          truncation=True)
# create test set encodings
test_encodings = tokenizer(test_texts,
                           is_split_into_words=True,
                           return_offsets_mapping=True,
                           padding=True,
                           max_length=128,
                           truncation=True)
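Before the offset-mapping step, it helps to see what the fast tokenizer's offset_mapping actually contains; a rough illustration with arbitrary toy words (not part of the tutorial):
# With is_split_into_words=True, offsets are character positions *within each word*:
# the first sub-token of every word starts at 0, continuations start later,
# and special tokens like [CLS]/[SEP] get (0, 0).
sample = tokenizer(["Transformers", "are", "great"],
                   is_split_into_words=True,
                   return_offsets_mapping=True)
print(tokenizer.convert_ids_to_tokens(sample["input_ids"]))
print(sample["offset_mapping"])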
In the tutorial, offset mapping is used to handle a problem that arises with word-piece tokenization, specifically the mismatch between tokens and labels. It is when running the offset-mapping code from the tutorial that I get the error. Below is the offset-mapping function used in the tutorial:
# the offset function
import numpy as np

def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
        arr_offset = np.array(doc_offset)
        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())
    return encoded_labels
# return the encoded labels
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)
test_labels = encode_tags(test_tags, test_encodings)
After running the above code, it gives me the following error, and I can't figure out where the source of the error lies. Any help and pointers would be appreciated.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-19-afdff0186eb3> in <module>()
17
18 # return the encoded labels
---> 19 train_labels = encode_tags(train_tags, train_encodings)
20 val_labels = encode_tags(val_tags, val_encodings)
21 test_labels = encode_tags(test_tags, test_encodings)
<ipython-input-19-afdff0186eb3> in encode_tags(tags, encodings)
11
12 # set labels whose first offset position is 0 and the second is not 0
---> 13 doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
14 encoded_labels.append(doc_enc_labels.tolist())
15
ValueError: NumPy boolean array indexing assignment cannot assign 38 input values to the 37 output values where the mask is true
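As a debugging aid, a sketch of my own that mirrors the mask used in encode_tags and lists the documents where the number of word-start tokens disagrees with the number of tags:
import numpy as np

def find_mismatches(tags, encodings):
    # Return (doc index, word-start token count, tag count) wherever they disagree.
    bad = []
    for i, (doc_tags, doc_offset) in enumerate(zip(tags, encodings.offset_mapping)):
        arr_offset = np.array(doc_offset)
        n_word_starts = int(((arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)).sum())
        if n_word_starts != len(doc_tags):
            bad.append((i, n_word_starts, len(doc_tags)))
    return bad

# e.g. inspect the first few offenders:
# print(find_mismatches(train_tags, train_encodings)[:5])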

Retraining pre-trained word embeddings in Python using Gensim

I want to retrain pre-trained word embeddings in Python using Gensim. The pre-trained embeddings I want to use is Google's Word2Vec in the file GoogleNews-vectors-negative300.bin.
According to Gensim's word2vec tutorial, "it’s not possible to resume training with models generated by the C tool, load_word2vec_format(). You can still use them for querying/similarity, but information vital for training (the vocab tree) is missing there."
Therefore I can't use KeyedVectors, and for training a model the tutorial suggests using:
model = gensim.models.Word2Vec.load('/tmp/mymodel')
model.train(more_sentences)
(https://rare-technologies.com/word2vec-tutorial/)
However, when I try this:
from gensim.models import Word2Vec
model = Word2Vec.load('data/GoogleNews-vectors-negative300.bin')
I get an error message:
1330 # Because of loading from S3 load can't be used (missing readline in smart_open)
1331 if sys.version_info > (3, 0):
-> 1332 return _pickle.load(f, encoding='latin1')
1333 else:
1334 return _pickle.loads(f.read())
UnpicklingError: invalid load key, '3'.
I didn't find a way to convert the binary GoogleNews file into a text file properly, and even if I did, I'm not sure whether that would solve my problem.
Does anyone have a solution to this problem or knows about a different way to retrain pre-trained word embeddings?
The Word2Vec.load() method can only load full models in gensim's native format (based on Python object-pickling) – not any other binary/text formats.
And, as per the documentation's note that "it’s not possible to resume training with models generated by the C tool", there's simply not enough information in the GoogleNews raw-vectors files to reconstruct the full working model that was used to train them. (That would require both some internal model-weights, not saved in that file, and word-frequency-information for controlling sampling, also not saved in that file.)
The best you could do is create a new Word2Vec model, then patch some/all of the GoogleNews vectors into it before doing your own training. This is an error-prone process with no real best-practices and many caveats about the interpretation of final results. (For example, if you bring in all the vectors, but then only re-train a subset using only your own corpus & word-frequencies, the more training you do – making the word-vectors better fit your corpus – the less such re-trained words will have any useful comparability to retained untrained words.)
Essentially, if you can look at the gensim Word2Vec source and work out how to patch together such a frankenstein model, it may be appropriate. But there's no built-in support or handy off-the-shelf recipe that makes it easy, because it's an inherently murky process.
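As a rough illustration of the patching idea described above (with all the same caveats): gensim 3.x exposed intersect_word2vec_format for overwriting the vectors of in-vocabulary words with pre-trained ones. The method's location and the size/vector_size parameter name have changed across gensim versions, so treat this as a sketch rather than a recipe:
from gensim.models import Word2Vec

my_sentences = [["a", "toy", "tokenised", "sentence"]]   # your own corpus

model = Word2Vec(size=300, min_count=1)   # gensim 3.x; `size` became `vector_size` in 4.x
model.build_vocab(my_sentences)           # vocab and frequencies come from *your* corpus
# Overwrite vectors for words that also appear in GoogleNews; lockf=1.0 lets them keep training.
model.intersect_word2vec_format("GoogleNews-vectors-negative300.bin",
                                binary=True, lockf=1.0)
model.train(my_sentences, total_examples=model.corpus_count, epochs=model.epochs)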
I have already answered it here.
Save the Google News model as a text file in word2vec format using gensim.
Refer to this answer to save it as a text file.
Then try this code.
import os
import pickle
import numpy as np
import gensim
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec
import operator
os.mkdir("model_dir")
# class EpochSaver(CallbackAny2Vec):
#     '''Callback to save model after each epoch.'''
#     def __init__(self, path_prefix):
#         self.path_prefix = path_prefix
#         self.epoch = 0
#
#     def on_epoch_end(self, model):
#         list_of_existing_files = os.listdir(".")
#         output_path = 'model_dir/{}_epoch{}.model'.format(self.path_prefix, self.epoch)
#         try:
#             model.save(output_path)
#         except:
#             model.wv.save_word2vec_format('model_dir/model_{}.bin'.format(self.epoch), binary=True)
#         print("number of epochs completed = {}".format(self.epoch))
#         self.epoch += 1
#         list_of_total_files = os.listdir(".")

# saver = EpochSaver("my_finetuned")
# function to load vectors from an existing model.
# I am loading glove vectors from a text file; the benefit of doing this is that I get the complete vocab of glove as well.
# If you are using a previous word2vec model, I would recommend saving it in txt format.
# In case you decide not to do that, you can tweak the function to get vectors for words in your vocab only.
def load_vectors(token2id, path, limit=None):
    embed_shape = (len(token2id), 300)
    freqs = np.zeros((len(token2id)), dtype='f')
    vectors = np.zeros(embed_shape, dtype='f')
    i = 0
    with open(path, encoding="utf8", errors='ignore') as f:
        for o in f:
            token, *vector = o.split(' ')
            token = str.lower(token)
            if len(o) <= 100:
                continue
            if limit is not None and i > limit:
                break
            vectors[token2id[token]] = np.array(vector, 'f')
            i += 1
    return vectors
# path of text file of your word vectors.
embedding_name = "word2vec.txt"
data = "<training data(new line separated tect file)>"
# Dictionary to store a unique id for each token in vocab( in my case vocab contains both my vocab and glove vocab)
token2id = {}
# This dictionary will contain all the words and their frequencies.
vocab_freq_dict = {}
# Populating vocab_freq_dict and token2id from my data.
id_ = 0
training_examples = []
file = open("{}".format(data), 'r', encoding="utf-8")
for line in file.readlines():
    words = line.strip().split(" ")
    training_examples.append(words)
    for word in words:
        if word not in vocab_freq_dict:
            vocab_freq_dict.update({word: 0})
        vocab_freq_dict[word] += 1
        if word not in token2id:
            token2id.update({word: id_})
            id_ += 1
# Populating vocab_freq_dict and token2id from glove vocab.
max_id = max(token2id.items(), key=operator.itemgetter(1))[0]
max_token_id = token2id[max_id]
with open(embedding_name, encoding="utf8", errors='ignore') as f:
    for o in f:
        token, *vector = o.split(' ')
        token = str.lower(token)
        if len(o) <= 100:
            continue
        if token not in token2id:
            max_token_id += 1
            token2id.update({token: max_token_id})
            vocab_freq_dict.update({token: 1})

with open("vocab_freq_dict", "wb") as vocab_file:
    pickle.dump(vocab_freq_dict, vocab_file)
with open("token2id", "wb") as token2id_file:
    pickle.dump(token2id, token2id_file)
# converting vectors to keyedvectors format for gensim
vectors = load_vectors(token2id, embedding_name)
vec = KeyedVectors(300)
vec.add(list(token2id.keys()), vectors, replace=True)
# setting vectors(numpy_array) to None to release memory
vectors = None
params = dict(min_count=1,workers=14,iter=6,size=300)
model = Word2Vec(**params)
# using build from vocab to build the vocab
model.build_vocab_from_freq(vocab_freq_dict)
# using token2id to create idxmap
idxmap = np.array([token2id[w] for w in model.wv.index2entity])
# Setting hidden weights (syn0 = between input layer and hidden layer) to your vectors arranged according to ids
model.wv.vectors[:] = vec.vectors[idxmap]
# Setting output weights (syn1neg = between hidden layer and output layer) to your vectors arranged according to ids
model.trainables.syn1neg[:] = vec.vectors[idxmap]
model.train(training_examples, total_examples=len(training_examples), epochs=model.epochs)
output_path = 'model_dir/final_model.model'
model.save(output_path)

Tensorflow : How to get tensor name from Tensorboard?

I downloaded ssd_mobilenet_v2_coco from the TensorFlow detection model zoo and used import_pb_to_tensorboard.py to show the structure in TensorBoard.
I found a node named 'image_tensor'; this is the node shown in TensorBoard.
I want to use the function get_tensor_by_name() to feed in a new image and get the outputs. However, it failed.
I tried get_operation_by_name(); it didn't work either.
Here is the code:
import tensorflow as tf

def one_image(im_path, model_path):
    sess = tf.Session()
    with sess.as_default():
        image_tensor = tf.image.decode_jpeg(tf.read_file(im_path), channels=3)
        saver = tf.train.import_meta_graph(model_path + "/model.ckpt.meta")
        saver.restore(sess, tf.train.latest_checkpoint(model_path))
        graph = tf.get_default_graph()
        # x = graph.get_tensor_by_name("import/image_tensor:0")
        # out_put = graph.get_tensor_by_name("import/detection_classes:0")
        x = graph.get_operation_by_name("import/image_tensor").outputs[0]
        outputs = graph.get_operation_by_name("import/detection_classes").outputs[0]
        out_put = sess.run(outputs, feed_dict={x: image_tensor.eval()})
        print(out_put)
        sess.close()

if __name__ == "__main__":
    one_image("testimg-4-resize.jpg", "ssd_mobilenet_v2_coco_2018_03_29")
And here is the KeyError:
KeyError: "The name 'import/image_tensor' refers to an Operation not in the graph."
I am wondering how to get the tensor name from TensorBoard, and whether there is another way to load a model from 'only-ckpts'.
'only-ckpts' means the files only include 'model.ckpt.data-00000-of-00001', 'model.ckpt.index' and 'model.ckpt.meta'.
Any advice would be appreciated. Thank you in advance.
The tool import_pb_to_tensorboard.py uses tf.import_graph_def to import the graph with the default name argument, which is "import", as documented.
Your code imports the graph through tf.train.import_meta_graph with the default import_scope argument, which does not prefix imported tensor or operation names. You therefore have two options to correct this error:
Do the following in place of your import_meta_graph line:
saver = tf.train.import_meta_graph(model_path + "/model.ckpt.meta",
                                   import_scope='import')
Or remove the import/ prefix when getting a tensor or operation by name, like this:
x = graph.get_tensor_by_name("image_tensor:0")
out_put = graph.get_tensor_by_name("detection_classes:0")
x = graph.get_operation_by_name("image_tensor").outputs[0]
outputs = graph.get_operation_by_name("detection_classes").outputs[0]
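To see exactly which names and prefixes the imported graph contains, a short listing like the following (TF 1.x) can help:
# Print every operation name that looks relevant; the prefix (if any) tells you
# exactly what to pass to get_tensor_by_name / get_operation_by_name.
graph = tf.get_default_graph()
for op in graph.get_operations():
    if 'image_tensor' in op.name or 'detection_classes' in op.name:
        print(op.name)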

Using keras flow_from_directory when running on google cloud machine learning engine

I would like to train my keras model on google cloud machine learning engine. I am currently using image augmentation and grabbing images from a local directory.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True
)

train_generator = train_datagen.flow_from_directory(
    args.train_dir,
    target_size=(IM_WIDTH, IM_HEIGHT),
    batch_size=batch_size,
)
Is it possible to achieve this behavior with a Google Cloud Storage bucket? Could I first download the images to the local machine? I'm seeing a lot of people using pickle on ML Engine, but that doesn't quite make sense, since the images are 'generated' at training time.
I have created a working version of flow_from_directory using the Google Cloud Storage API instead of os. It's not perfect and some functionality is missing.
import multiprocessing.pool
from functools import partial
from keras.preprocessing.image import Iterator
import warnings
import numpy as np
import keras.backend as K
import keras
from google.cloud import storage
import os
# rewrite of flow_from_directory
# https://github.com/keras-team/keras/blob/master/keras/preprocessing/image.py
def flow_from_google_storage(imageDataGen, project, bucket, directory,
target_size=(256, 256), color_mode='rgb',
classes=None, class_mode='categorical',
batch_size=32, shuffle=True, seed=None,
save_to_dir=None,
save_prefix='',
save_format='png',
follow_links=False,
subset=None,
interpolation='nearest'):
"""Takes the path to a directory, and generates batches of augmented/normalized data.
# Arguments
directory: path to the target directory.
It should contain one subdirectory per class.
Any PNG, JPG, BMP, PPM or TIF images inside each of the subdirectories directory tree will be included in the generator.
See [this script](https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d) for more details.
target_size: tuple of integers `(height, width)`, default: `(256, 256)`.
The dimensions to which all images found will be resized.
color_mode: one of "grayscale", "rgb". Default: "rgb".
Whether the images will be converted to have 1 or 3 color channels.
classes: optional list of class subdirectories (e.g. `['dogs', 'cats']`). Default: None.
If not provided, the list of classes will be automatically
inferred from the subdirectory names/structure under `directory`,
where each subdirectory will be treated as a different class
(and the order of the classes, which will map to the label indices, will be alphanumeric).
The dictionary containing the mapping from class names to class
indices can be obtained via the attribute `class_indices`.
class_mode: one of "categorical", "binary", "sparse", "input" or None. Default: "categorical".
Determines the type of label arrays that are returned: "categorical" will be 2D one-hot encoded labels,
"binary" will be 1D binary labels, "sparse" will be 1D integer labels, "input" will be images identical
to input images (mainly used to work with autoencoders).
If None, no labels are returned (the generator will only yield batches of image data, which is useful to use
`model.predict_generator()`, `model.evaluate_generator()`, etc.).
Please note that in case of class_mode None,
the data still needs to reside in a subdirectory of `directory` for it to work correctly.
batch_size: size of the batches of data (default: 32).
shuffle: whether to shuffle the data (default: True)
seed: optional random seed for shuffling and transformations.
save_to_dir: None or str (default: None). This allows you to optionally specify a directory to which to save
the augmented pictures being generated (useful for visualizing what you are doing).
save_prefix: str. Prefix to use for filenames of saved pictures (only relevant if `save_to_dir` is set).
save_format: one of "png", "jpeg" (only relevant if `save_to_dir` is set). Default: "png".
follow_links: whether to follow symlinks inside class subdirectories (default: False).
subset: Subset of data (`"training"` or `"validation"`) if
`validation_split` is set in `ImageDataGenerator`.
interpolation: Interpolation method used to resample the image if the
target size is different from that of the loaded image.
Supported methods are `"nearest"`, `"bilinear"`, and `"bicubic"`.
If PIL version 1.1.3 or newer is installed, `"lanczos"` is also
supported. If PIL version 3.4.0 or newer is installed, `"box"` and
`"hamming"` are also supported. By default, `"nearest"` is used.
# Returns
A DirectoryIterator yielding tuples of `(x, y)` where `x` is a numpy array containing a batch
of images with shape `(batch_size, *target_size, channels)` and `y` is a numpy array of corresponding labels.
"""
return GoogleStorageIterator(project, bucket,
directory, imageDataGen,
target_size=target_size, color_mode=color_mode,
classes=classes, class_mode=class_mode,
data_format=imageDataGen.data_format,
batch_size=batch_size, shuffle=shuffle, seed=seed,
save_to_dir=save_to_dir,
save_prefix=save_prefix,
save_format=save_format,
follow_links=follow_links,
subset=subset,
interpolation=interpolation)
class GoogleStorageIterator(Iterator):
"""Iterator capable of reading images from a directory on disk.
# Arguments
directory: Path to the directory to read images from.
Each subdirectory in this directory will be
considered to contain images from one class,
or alternatively you could specify class subdirectories
via the `classes` argument.
image_data_generator: Instance of `ImageDataGenerator`
to use for random transformations and normalization.
target_size: tuple of integers, dimensions to resize input images to.
color_mode: One of `"rgb"`, `"grayscale"`. Color mode to read images.
classes: Optional list of strings, names of subdirectories
containing images from each class (e.g. `["dogs", "cats"]`).
It will be computed automatically if not set.
class_mode: Mode for yielding the targets:
`"binary"`: binary targets (if there are only two classes),
`"categorical"`: categorical targets,
`"sparse"`: integer targets,
`"input"`: targets are images identical to input images (mainly
used to work with autoencoders),
`None`: no targets get yielded (only input images are yielded).
batch_size: Integer, size of a batch.
shuffle: Boolean, whether to shuffle the data between epochs.
seed: Random seed for data shuffling.
data_format: String, one of `channels_first`, `channels_last`.
save_to_dir: Optional directory where to save the pictures
being yielded, in a viewable format. This is useful
for visualizing the random transformations being
applied, for debugging purposes.
save_prefix: String prefix to use for saving sample
images (if `save_to_dir` is set).
save_format: Format to use for saving sample images
(if `save_to_dir` is set).
subset: Subset of data (`"training"` or `"validation"`) if
validation_split is set in ImageDataGenerator.
interpolation: Interpolation method used to resample the image if the
target size is different from that of the loaded image.
Supported methods are "nearest", "bilinear", and "bicubic".
If PIL version 1.1.3 or newer is installed, "lanczos" is also
supported. If PIL version 3.4.0 or newer is installed, "box" and
"hamming" are also supported. By default, "nearest" is used.
"""
def __init__(self, project, bucket, directory, image_data_generator,
target_size=(256, 256), color_mode='rgb',
classes=None, class_mode='categorical',
batch_size=32, shuffle=True, seed=None,
data_format=None,
save_to_dir=None, save_prefix='', save_format='png',
follow_links=False,
subset=None,
interpolation='nearest'):
if data_format is None:
data_format = K.image_data_format()
self.directory = directory
self.image_data_generator = image_data_generator
self.target_size = tuple(target_size)
if color_mode not in {'rgb', 'grayscale'}:
raise ValueError('Invalid color mode:', color_mode,
'; expected "rgb" or "grayscale".')
self.color_mode = color_mode
self.data_format = data_format
if self.color_mode == 'rgb':
if self.data_format == 'channels_last':
self.image_shape = self.target_size + (3,)
else:
self.image_shape = (3,) + self.target_size
else:
if self.data_format == 'channels_last':
self.image_shape = self.target_size + (1,)
else:
self.image_shape = (1,) + self.target_size
self.classes = classes
if class_mode not in {'categorical', 'binary', 'sparse',
'input', None}:
raise ValueError('Invalid class_mode:', class_mode,
'; expected one of "categorical", '
'"binary", "sparse", "input"'
' or None.')
self.class_mode = class_mode
self.save_to_dir = save_to_dir
self.save_prefix = save_prefix
self.save_format = save_format
self.interpolation = interpolation
if subset is not None:
validation_split = self.image_data_generator._validation_split
if subset == 'validation':
split = (0, validation_split)
elif subset == 'training':
split = (validation_split, 1)
else:
raise ValueError('Invalid subset name: ', subset,
'; expected "training" or "validation"')
else:
split = None
self.subset = subset
white_list_formats = {'png', 'jpg', 'jpeg', 'bmp', 'ppm', 'tif', 'tiff'}
# init gs
self.storage_client = storage.Client(project)
self.bucket = self.storage_client.get_bucket(bucket)
# first, count the number of samples and classes
self.samples = 0
if not classes:
labels_folder_iter = self.bucket.list_blobs(delimiter="/", prefix=self.directory)
list(labels_folder_iter)  # consume the iterator so labels_folder_iter.prefixes is populated
classes = [p[len(self.directory):-1] for p in sorted(labels_folder_iter.prefixes)]
self.num_classes = len(classes)
self.class_indices = dict(zip(classes, range(len(classes))))
pool = multiprocessing.pool.ThreadPool()
function_partial = partial(self._count_valid_files_in_directory,
white_list_formats=white_list_formats,
follow_links=follow_links,
split=split)
self.samples = sum(pool.map(function_partial,
(os.path.join(self.directory, subdir) for subdir in classes)))
print('Found %d images belonging to %d classes.' % (self.samples, self.num_classes))
print(self.class_indices)
# second, build an index of the images in the different class subfolders
results = []
self.filenames = []
self.classes = np.zeros((self.samples,), dtype='int32')
i = 0
for dirpath in (os.path.join(self.directory, subdir) for subdir in classes):
results.append(pool.apply_async(self._list_valid_filenames_in_directory,
(dirpath, white_list_formats, split,
self.class_indices, follow_links)))
for res in results:
classes, filenames = res.get()
self.classes[i:i + len(classes)] = classes
self.filenames += filenames
i += len(classes)
pool.close()
pool.join()
super(GoogleStorageIterator, self).__init__(self.samples, batch_size, shuffle, seed)
def _get_batches_of_transformed_samples(self, index_array):
batch_x = np.zeros((len(index_array),) + self.image_shape, dtype=K.floatx())
grayscale = self.color_mode == 'grayscale'
# build batch of image data
for i, j in enumerate(index_array):
fname = self.filenames[j]
blob = self.bucket.get_blob(os.path.join(self.directory, fname), self.storage_client)
img = self.load_img_from_string(blob.download_as_string(self.storage_client),
grayscale=grayscale,
target_size=self.target_size,
interpolation=self.interpolation)
x = keras.preprocessing.image.img_to_array(img, data_format=self.data_format)
x = self.image_data_generator.random_transform(x)
x = self.image_data_generator.standardize(x)
batch_x[i] = x
# TODO write save to gs
# optionally save augmented images to disk for debugging purposes
# if self.save_to_dir:
# for i, j in enumerate(index_array):
# img = keras.preprocessing.image.array_to_img(batch_x[i], self.data_format, scale=True)
# fname = '{prefix}_{index}_{hash}.{format}'.format(prefix=self.save_prefix,
# index=j,
# hash=np.random.randint(1e7),
# format=self.save_format)
# img.save(os.path.join(self.save_to_dir, fname))
# build batch of labels
if self.class_mode == 'input':
batch_y = batch_x.copy()
elif self.class_mode == 'sparse':
batch_y = self.classes[index_array]
elif self.class_mode == 'binary':
batch_y = self.classes[index_array].astype(K.floatx())
elif self.class_mode == 'categorical':
batch_y = np.zeros((len(batch_x), self.num_classes), dtype=K.floatx())
for i, label in enumerate(self.classes[index_array]):
batch_y[i, label] = 1.
else:
return batch_x
return batch_x, batch_y
def next(self):
"""For python 2.x.
# Returns
The next batch.
"""
with self.lock:
index_array = next(self.index_generator)
# The transformation of images is not under thread lock
# so it can be done in parallel
return self._get_batches_of_transformed_samples(index_array)
def _count_valid_files_in_directory(self, directory, white_list_formats, split, follow_links):
"""Count files with extension in `white_list_formats` contained in directory.
# Arguments
directory: absolute path to the directory
containing files to be counted
white_list_formats: set of strings containing allowed extensions for
the files to be counted.
split: tuple of floats (e.g. `(0.2, 0.6)`) to only take into
account a certain fraction of files in each directory.
E.g.: `segment=(0.6, 1.0)` would only account for last 40 percent
of images in each directory.
follow_links: boolean.
# Returns
the count of files with extension in `white_list_formats` contained in
the directory.
"""
num_files = len(list(self._iter_valid_files(directory, white_list_formats, follow_links)))
if split:
start, stop = int(split[0] * num_files), int(split[1] * num_files)
else:
start, stop = 0, num_files
return stop - start
def _iter_valid_files(self, directory, white_list_formats, follow_links):
"""Count files with extension in `white_list_formats` contained in directory.
# Arguments
directory: absolute path to the directory
containing files to be counted
white_list_formats: set of strings containing allowed extensions for
the files to be counted.
follow_links: boolean.
# Yields
tuple of (root, filename) with extension in `white_list_formats`.
"""
def _recursive_list(subpath):
# TODO: should return all file paths relative to subpath, walking through any directory it finds
if subpath[-1] != '/':
subpath = subpath + '/'
iter_blobs = self.bucket.list_blobs(delimiter="/", prefix=subpath)
blobs = list(iter_blobs)
return sorted(map(lambda blob: (subpath, blob.name[len(subpath):]), blobs), key=lambda x: x[1])
for root, fname in _recursive_list(directory):
for extension in white_list_formats:
if fname.lower().endswith('.tiff'):
warnings.warn('Using \'.tiff\' files with multiple bands will cause distortion. '
'Please verify your output.')
if fname.lower().endswith('.' + extension):
yield root, fname
def _list_valid_filenames_in_directory(self, directory, white_list_formats, split,
class_indices, follow_links):
"""List paths of files in `subdir` with extensions in `white_list_formats`.
# Arguments
directory: absolute path to a directory containing the files to list.
The directory name is used as class label and must be a key of `class_indices`.
white_list_formats: set of strings containing allowed extensions for
the files to be counted.
split: tuple of floats (e.g. `(0.2, 0.6)`) to only take into
account a certain fraction of files in each directory.
E.g.: `segment=(0.6, 1.0)` would only account for last 40 percent
of images in each directory.
class_indices: dictionary mapping a class name to its index.
follow_links: boolean.
# Returns
classes: a list of class indices
filenames: the path of valid files in `directory`, relative from
`directory`'s parent (e.g., if `directory` is "dataset/class1",
the filenames will be ["class1/file1.jpg", "class1/file2.jpg", ...]).
"""
dirname = os.path.basename(directory)
if split:
num_files = len(list(self._iter_valid_files(directory, white_list_formats, follow_links)))
start, stop = int(split[0] * num_files), int(split[1] * num_files)
valid_files = list(self._iter_valid_files(directory, white_list_formats, follow_links))[start: stop]
else:
valid_files = self._iter_valid_files(directory, white_list_formats, follow_links)
classes = []
filenames = []
for root, fname in valid_files:
classes.append(class_indices[dirname])
absolute_path = os.path.join(root, fname)
relative_path = os.path.join(dirname, os.path.relpath(absolute_path, directory))
filenames.append(relative_path)
return classes, filenames
def load_img_from_string(self, img_string, grayscale=False, target_size=None,
interpolation='nearest'):
from PIL import Image as pil_image
import io
_PIL_INTERPOLATION_METHODS = {
'nearest': pil_image.NEAREST,
'bilinear': pil_image.BILINEAR,
'bicubic': pil_image.BICUBIC,
}
"""Loads an image into PIL format.
# Arguments
path: Path to image file
grayscale: Boolean, whether to load the image as grayscale.
target_size: Either `None` (default to original size)
or tuple of ints `(img_height, img_width)`.
interpolation: Interpolation method used to resample the image if the
target size is different from that of the loaded image.
Supported methods are "nearest", "bilinear", and "bicubic".
If PIL version 1.1.3 or newer is installed, "lanczos" is also
supported. If PIL version 3.4.0 or newer is installed, "box" and
"hamming" are also supported. By default, "nearest" is used.
# Returns
A PIL Image instance.
# Raises
ImportError: if PIL is not available.
ValueError: if interpolation method is not supported.
"""
if pil_image is None:
raise ImportError('Could not import PIL.Image. '
'The use of `array_to_img` requires PIL.')
img = pil_image.open(io.BytesIO(img_string))
if grayscale:
if img.mode != 'L':
img = img.convert('L')
else:
if img.mode != 'RGB':
img = img.convert('RGB')
if target_size is not None:
width_height_tuple = (target_size[1], target_size[0])
if img.size != width_height_tuple:
if interpolation not in _PIL_INTERPOLATION_METHODS:
raise ValueError(
'Invalid interpolation method {} specified. Supported '
'methods are {}'.format(
interpolation,
", ".join(_PIL_INTERPOLATION_METHODS.keys())))
resample = _PIL_INTERPOLATION_METHODS[interpolation]
img = img.resize(width_height_tuple, resample)
return img
Yes, you can first download the images from GCS to the VM using os.system('gsutil cp YOUR_IMAGES .').
Transform your images into TFRecords and store them in Google Cloud Storage; TFRecordDataset can read from Google Cloud Storage directly.
Using TFRecords has performance advantages, so if you train on large datasets I recommend using TFRecords.
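A rough sketch of that approach, using TF 2.x-style tf.data APIs (the gs:// path and the feature keys are hypothetical):
import tensorflow as tf

# TFRecordDataset reads gs:// URLs directly when GCS credentials are available.
filenames = tf.io.gfile.glob("gs://my-bucket/train/*.tfrecord")
dataset = tf.data.TFRecordDataset(filenames)

def parse_example(serialized):
    # Feature keys here are illustrative; use whatever you wrote into your TFRecords.
    features = {
        "image": tf.io.FixedLenFeature([], tf.string),   # JPEG bytes
        "label": tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(serialized, features)
    image = tf.image.decode_jpeg(parsed["image"], channels=3)
    return image, parsed["label"]

dataset = dataset.map(parse_example).shuffle(1024).batch(32)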
