From ae7672f912c7b70147665080ad2eb6e7eb331d46 Mon Sep 17 00:00:00 2001 From: Trent Houliston Date: Fri, 4 Feb 2022 14:04:36 +1100 Subject: [PATCH 1/2] Swap from using keras to using a custom loop. This will allow for some more interesting visual mesh designs in the future. --- training/callbacks/image_tensorboard.py | 102 -------- training/dataset/__init__.py | 6 - training/dataset/label/seeker.py | 2 +- training/dataset/visual_mesh_dataset.py | 3 +- training/export.py | 10 +- training/find_lr.py | 122 +++------- training/flavour/__init__.py | 4 +- training/flavour/learning_rate.py | 27 +++ training/flavour/optimiser.py | 35 +++ .../{image_callback.py => progress_images.py} | 4 +- training/flavour/test_metrics.py | 2 +- training/{callbacks => images}/__init__.py | 2 - .../classification_images.py | 7 +- .../{callbacks => images}/seeker_images.py | 8 +- training/layer/__init__.py | 2 +- training/learning_rate/__init__.py | 17 ++ .../{callbacks => learning_rate}/one_cycle.py | 15 +- training/learning_rate/static.py | 24 ++ training/metrics/test/bucket.py | 2 +- training/model/visual_mesh_model.py | 9 +- training/testing.py | 27 ++- training/training.py | 229 ++++++++++++------ 22 files changed, 334 insertions(+), 325 deletions(-) delete mode 100644 training/callbacks/image_tensorboard.py create mode 100644 training/flavour/learning_rate.py create mode 100644 training/flavour/optimiser.py rename training/flavour/{image_callback.py => progress_images.py} (96%) rename training/{callbacks => images}/__init__.py (93%) rename training/{callbacks => images}/classification_images.py (95%) rename training/{callbacks => images}/seeker_images.py (97%) create mode 100644 training/learning_rate/__init__.py rename training/{callbacks => learning_rate}/one_cycle.py (88%) create mode 100644 training/learning_rate/static.py diff --git a/training/callbacks/image_tensorboard.py b/training/callbacks/image_tensorboard.py deleted file mode 100644 index 55ecbac..0000000 --- 
a/training/callbacks/image_tensorboard.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (C) 2017-2020 Trent Houliston -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to -# permit persons to whom the Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the -# Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR -# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- -import os - -import tensorflow as tf - - -class ImageTensorBoard(tf.keras.callbacks.TensorBoard): - def __init__(self, **kwargs): - super(ImageTensorBoard, self).__init__(**kwargs) - - def _image_metrics(self): - return [m for m in self.model.metrics if hasattr(m, "images")] - - def _filter_logs(self, logs): - if logs is None: - return None - - # Find all the metrics that are actually image metrics - remove_list = self._image_metrics() - remove_list = [m.name for m in remove_list] - remove_list.extend(["val_{}".format(n) for n in remove_list]) - - return {k: v for k, v in logs.items() if k not in remove_list} - - def on_batch_begin(self, batch, logs=None): - super(ImageTensorBoard, self).on_batch_begin(batch, self._filter_logs(logs)) - - def on_batch_end(self, batch, logs=None): - super(ImageTensorBoard, self).on_batch_end(batch, self._filter_logs(logs)) - - def on_epoch_begin(self, epoch, logs=None): - super(ImageTensorBoard, self).on_epoch_begin(epoch, self._filter_logs(logs)) - - def on_epoch_end(self, epoch, logs=None): - - if logs is not None: - with self._train_writer.as_default(): - for m in self._image_metrics(): - tf.summary.image(m.name, m.images(logs[m.name]), epoch) - - if any([l.startswith("val_") for l in logs.keys()]): - with self._val_writer.as_default(): - for m in self._image_metrics(): - tf.summary.image(m.name, m.images(logs["val_{}".format(m.name)]), epoch) - - super(ImageTensorBoard, self).on_epoch_end(epoch, self._filter_logs(logs)) - - def on_predict_batch_begin(self, batch, logs=None): - super(ImageTensorBoard, self).on_predict_batch_begin(batch, self._filter_logs(logs)) - - def on_predict_batch_end(self, batch, logs=None): - super(ImageTensorBoard, self).on_predict_batch_end(batch, self._filter_logs(logs)) - - def on_predict_begin(self, logs=None): - super(ImageTensorBoard, self).on_predict_begin(self._filter_logs(logs)) - - def on_predict_end(self, logs=None): - super(ImageTensorBoard, self).on_predict_end(self._filter_logs(logs)) - - 
def on_test_batch_begin(self, batch, logs=None): - super(ImageTensorBoard, self).on_test_batch_begin(batch, self._filter_logs(logs)) - - def on_test_batch_end(self, batch, logs=None): - super(ImageTensorBoard, self).on_test_batch_end(batch, self._filter_logs(logs)) - - def on_test_begin(self, logs=None): - super(ImageTensorBoard, self).on_test_begin(self._filter_logs(logs)) - - def on_test_end(self, logs=None): - super(ImageTensorBoard, self).on_test_end(self._filter_logs(logs)) - - def on_train_batch_begin(self, batch, logs=None): - super(ImageTensorBoard, self).on_train_batch_begin(batch, self._filter_logs(logs)) - - def on_train_batch_end(self, batch, logs=None): - super(ImageTensorBoard, self).on_train_batch_end(batch, self._filter_logs(logs)) - - def on_train_begin(self, logs=None): - super(ImageTensorBoard, self).on_train_begin(self._filter_logs(logs)) - - def on_train_end(self, logs=None): - super(ImageTensorBoard, self).on_train_end(self._filter_logs(logs)) - - def set_model(self, model): - super(ImageTensorBoard, self).set_model(model) - - def set_params(self, params): - super(ImageTensorBoard, self).set_params(params) diff --git a/training/dataset/__init__.py b/training/dataset/__init__.py index 0e9eba1..cf1e25d 100644 --- a/training/dataset/__init__.py +++ b/training/dataset/__init__.py @@ -68,9 +68,3 @@ def Dataset(paths, batch_size, view, example, orientation, label, projection, ke label=label, keys=keys, ).build() - - -# Convert a dataset into a format that will be accepted by keras fit -def keras_dataset(args): - # Return in the format (x, y, weights) - return ((args["X"], args["G"]), args["Y"]) diff --git a/training/dataset/label/seeker.py b/training/dataset/label/seeker.py index 0914719..9ce4426 100644 --- a/training/dataset/label/seeker.py +++ b/training/dataset/label/seeker.py @@ -54,7 +54,7 @@ def __call__(self, image, Hoc, V, valid, **features): # Transform our remaining targets into observation plane space uOCo, l = 
tf.linalg.normalize(tf.einsum("ij,kj->ki", Hoc[:3, :3], targets), axis=-1) - # Check if this signal is close enough in distance to our observation plane to be considered + # Check if this point is close enough in distance to our observation plane to be considered height = Hoc[2, 3] ratio = tf.squeeze(l, axis=-1) / height uOCo = tf.gather( diff --git a/training/dataset/visual_mesh_dataset.py b/training/dataset/visual_mesh_dataset.py index 06f138f..2a71fe2 100644 --- a/training/dataset/visual_mesh_dataset.py +++ b/training/dataset/visual_mesh_dataset.py @@ -13,9 +13,10 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import tensorflow as tf import math +import tensorflow as tf + class VisualMeshDataset: def __init__(self, paths, batch_size, view, example, orientation, projection, label, keys): diff --git a/training/export.py b/training/export.py index d6d0361..7bd07f2 100644 --- a/training/export.py +++ b/training/export.py @@ -20,7 +20,6 @@ import tensorflow as tf -from .dataset import keras_dataset from .flavour import Dataset from .layer.graph_convolution import GraphConvolution from .model import VisualMeshModel @@ -29,12 +28,10 @@ def export(config, output_path): # Get the training dataset so we know the output size - training_dataset = ( - Dataset(config, "training").map(keras_dataset, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE) - ) + training_dataset = Dataset(config, "training") # Get the dimensionality of the Y part of the dataset - output_dims = training_dataset.element_spec[1].shape[-1] + output_dims = training_dataset.element_spec["Y"].shape[-1] # Define the model model = VisualMeshModel(structure=config["network"]["structure"], output_dims=output_dims) @@ -47,7 +44,8 @@ def export(config, output_path): raise RuntimeError("Could not find weights to load 
into the network") # We have to run a predict step so that everything is loaded properly - model.predict(training_dataset.take(1)) + for v in training_dataset.take(1): + model(v["X"], v["G"], training=False) # Print out the model summary model.summary() diff --git a/training/find_lr.py b/training/find_lr.py index e0cdcd4..98efd59 100644 --- a/training/find_lr.py +++ b/training/find_lr.py @@ -23,115 +23,59 @@ import tensorflow as tf -from .dataset import keras_dataset -from .flavour import Dataset, Loss +from .flavour import Dataset, Loss, Optimiser from .model import VisualMeshModel mpl.use("Agg") import matplotlib.pyplot as plt # isort:skip -class LRProgress(tf.keras.callbacks.Callback): - def __init__(self, n_steps, lr_schedule, **kwargs): - super(LRProgress, self).__init__(**kwargs) - - self.lr_schedule = lr_schedule - self.n_steps = n_steps - - # Tracking loss values - self.smooth_loss = None - self.losses = [] - - # Progress bar settings - self.n_history = 20 - self.lr_progress = None - self.loss_title = None - self.loss_progress = None - - def on_epoch_end(self, epoch, logs=None): - # Skip when we explode at the end - if not math.isfinite(logs["loss"]): - return - - # Calculate smoothed loss values - self.smooth_loss = logs["loss"] if self.smooth_loss is None else self.smooth_loss * 0.98 + logs["loss"] * 0.02 - self.losses.append(math.log1p(self.smooth_loss)) - - # Create the lr_progress bar so we can see how far it is - if self.lr_progress is None: - self.lr_progress = tqdm(total=self.n_steps, dynamic_ncols=True) - self.lr_progress.update() - self.lr_progress.set_description("LR: {:.3e}".format(self.model.optimizer.lr.numpy())) - - # Create our series of moving loss graphs - if self.loss_progress is None: - self.loss_title = tqdm(bar_format="{desc}", desc="Loss Graph") - self.loss_progress = [ - tqdm(bar_format="{desc}|{bar}|", total=self.losses[-1], dynamic_ncols=True) - for i in range(self.n_history) - ] - - valid_i = -min(len(self.losses), self.n_history) 
- - # Get the maximum of the losses - loss_max = max(self.losses) - - for bar, loss in zip(self.loss_progress[valid_i:], self.losses[valid_i:]): - bar.total = loss_max - bar.n = loss - bar.set_description("{:.3e}".format(math.expm1(loss))) - - # Find valid learning rates for def find_lr(config, output_path, min_lr, max_lr, n_steps, window_size): # Open the training dataset and put it on repeat - training_dataset = Dataset(config, "training").map(keras_dataset).repeat() + training_dataset = Dataset(config, "training").repeat() # Get the dimensionality of the Y part of the dataset - output_dims = training_dataset.element_spec[1].shape[-1] + output_dims = training_dataset.element_spec["Y"].shape[-1] # Define the model model = VisualMeshModel(structure=config["network"]["structure"], output_dims=output_dims) - def lr_schedule(epoch, lr): + def lr_schedule(epoch): return min_lr * (max_lr / min_lr) ** (epoch / n_steps) - # Setup the optimiser - if config["training"]["optimiser"]["type"] == "Adam": - optimiser = tf.optimizers.Adam(learning_rate=min_lr) - elif config["training"]["optimiser"]["type"] == "SGD": - optimiser = tf.optimizers.SGD(learning_rate=min_lr) - elif config["training"]["optimiser"]["type"] == "Ranger": - import tensorflow_addons as tfa - - optimiser = tfa.optimizers.Lookahead( - tfa.optimizers.RectifiedAdam(learning_rate=min_lr), - sync_period=int(config["training"]["optimiser"]["sync_period"]), - slow_step_size=float(config["training"]["optimiser"]["slow_step_size"]), - ) - else: - raise RuntimeError("Unknown optimiser type" + config["training"]["optimiser"]["type"]) - - # Compile the model grabbing the flavours for the loss and metrics - model.compile(optimizer=optimiser, loss=Loss(config)) - - # Run the fit function on the model to calculate the learning rates - history = model.fit( - training_dataset, - epochs=n_steps, - steps_per_epoch=1, - callbacks=[ - tf.keras.callbacks.TerminateOnNaN(), - tf.keras.callbacks.LearningRateScheduler(lr_schedule), - 
LRProgress(n_steps, lr_schedule), - ], - verbose=False, - ) + optimiser = Optimiser(config, lr_schedule(0)) + loss_fn = Loss(config) + + losses = [] + lrs = [] + with tqdm(desc=f"LR: {optimiser.lr.numpy():.3e} Loss: ---", total=n_steps) as progress: + for step, data in enumerate(training_dataset): + if step >= n_steps: + break + + optimiser.lr = lr_schedule(step) + + with tf.GradientTape() as tape: + logits = model(data["X"], data["G"], training=True) + loss = loss_fn(data["Y"], logits) + + if not tf.math.is_finite(loss): + break + + grads = tape.gradient(loss, model.trainable_weights) + optimiser.apply_gradients(zip(grads, model.trainable_weights)) + + losses.append(loss.numpy()) + lrs.append(optimiser.lr.numpy()) + + progress.set_description(f"LR: {optimiser.lr.numpy():.3e} Loss: {loss.numpy():.3e}") + progress.update() # Extract the loss and LR from before it became nan - loss = np.array(history.history["loss"][:-1]) - lr = np.array(history.history["lr"][:-1]) + loss = np.array(losses[:-1]) + lr = np.array(lrs[:-1]) # Make a smoothed version of the loss smooth_loss = np.convolve(loss, np.ones(window_size), "same") / np.convolve( diff --git a/training/flavour/__init__.py b/training/flavour/__init__.py index 1f17ab9..2506e52 100644 --- a/training/flavour/__init__.py +++ b/training/flavour/__init__.py @@ -14,7 +14,9 @@ # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
from .dataset import Dataset -from .image_callback import ImageCallback +from .learning_rate import LearningRate from .loss import Loss from .metrics import Metrics +from .optimiser import Optimiser +from .progress_images import ProgressImages from .test_metrics import TestMetrics diff --git a/training/flavour/learning_rate.py b/training/flavour/learning_rate.py new file mode 100644 index 0000000..2def07c --- /dev/null +++ b/training/flavour/learning_rate.py @@ -0,0 +1,27 @@ +# Copyright (C) 2017-2020 Trent Houliston +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +import tensorflow as tf + +from ..learning_rate import OneCycle, Static + + +def LearningRate(config): + # Determine the learning rate policy to use + if config["training"]["learning_rate"]["type"] == "static": + return Static(config=config) + elif config["training"]["learning_rate"]["type"] == "one_cycle": + return OneCycle(config=config) + raise RuntimeError("Unknown learning rate type: " + str(config["training"]["learning_rate"]["type"])) diff --git a/training/flavour/optimiser.py b/training/flavour/optimiser.py new file mode 100644 index 0000000..87f58ea --- /dev/null +++ b/training/flavour/optimiser.py @@ -0,0 +1,35 @@ +# Copyright (C) 2017-2020 Trent Houliston +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +import tensorflow as tf + + +def Optimiser(config, initial_lr): + + # Setup the optimiser + if config["training"]["optimiser"]["type"] == "Adam": + return tf.optimizers.Adam(learning_rate=initial_lr) + elif config["training"]["optimiser"]["type"] == "SGD": + return tf.optimizers.SGD(learning_rate=initial_lr) + elif config["training"]["optimiser"]["type"] == "Ranger": + import tensorflow_addons as tfa + + return tfa.optimizers.Lookahead( + tfa.optimizers.RectifiedAdam(learning_rate=initial_lr), + sync_period=int(config["training"]["optimiser"]["sync_period"]), + slow_step_size=float(config["training"]["optimiser"]["slow_step_size"]), + ) + else: + raise RuntimeError("Unknown optimiser type: " + str(config["training"]["optimiser"]["type"])) diff --git a/training/flavour/image_callback.py b/training/flavour/progress_images.py similarity index 96% rename from training/flavour/image_callback.py rename to training/flavour/progress_images.py index cc40724..c7ea7bb 100644 --- a/training/flavour/image_callback.py +++ b/training/flavour/progress_images.py @@ -13,12 +13,12 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-from ..callbacks import ClassificationImages, SeekerImages +from ..images import ClassificationImages, SeekerImages from .dataset import Dataset from .merge_configuration import merge_configuration -def ImageCallback(config, output_path): +def ProgressImages(config, output_path): validation_config = merge_configuration(config, config["dataset"].get("config", {})) diff --git a/training/flavour/test_metrics.py b/training/flavour/test_metrics.py index b501431..a5b63f2 100644 --- a/training/flavour/test_metrics.py +++ b/training/flavour/test_metrics.py @@ -13,9 +13,9 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -from training.metrics.test.seeker_hourglass import SeekerHourglass import training.metrics.test.confusion_curve as confusion from training.metrics.test import Confusion, ConfusionCurve +from training.metrics.test.seeker_hourglass import SeekerHourglass def TestMetrics(config): diff --git a/training/callbacks/__init__.py b/training/images/__init__.py similarity index 93% rename from training/callbacks/__init__.py rename to training/images/__init__.py index 6813666..62d6c32 100644 --- a/training/callbacks/__init__.py +++ b/training/images/__init__.py @@ -14,6 +14,4 @@ # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
from .classification_images import ClassificationImages -from .one_cycle import OneCycle from .seeker_images import SeekerImages -from .image_tensorboard import ImageTensorBoard diff --git a/training/callbacks/classification_images.py b/training/images/classification_images.py similarity index 95% rename from training/callbacks/classification_images.py rename to training/images/classification_images.py index 7ae2c5e..7fcc002 100644 --- a/training/callbacks/classification_images.py +++ b/training/images/classification_images.py @@ -28,9 +28,8 @@ import matplotlib.pyplot as plt # isort:skip -class ClassificationImages(tf.keras.callbacks.Callback): +class ClassificationImages: def __init__(self, output_path, dataset, colours): - super(ClassificationImages, self).__init__() self.colours = colours self.writer = tf.summary.create_file_writer(os.path.join(output_path, "images")) @@ -103,12 +102,12 @@ def image(img, C, X, colours): return (img_hash, data) - def on_epoch_end(self, epoch, logs=None): + def __call__(self, model, epoch): # Make a dataset that we can infer from, we need to make the input a tuple in a tuple. 
# If it is not it considers G to be Y and it fails to execute # Then using this dataset of images, do a prediction using the model - predictions = self.model((self.X, self.G)) + predictions = model(self.X, self.G) # Work out the valid data ranges for each of the objects images = [] diff --git a/training/callbacks/seeker_images.py b/training/images/seeker_images.py similarity index 97% rename from training/callbacks/seeker_images.py rename to training/images/seeker_images.py index c05b413..b8cb608 100644 --- a/training/callbacks/seeker_images.py +++ b/training/images/seeker_images.py @@ -17,7 +17,6 @@ import math import os -import cv2 import numpy as np import tensorflow as tf @@ -25,9 +24,8 @@ from training.projection import project -class SeekerImages(tf.keras.callbacks.Callback): +class SeekerImages: def __init__(self, output_path, dataset, model, max_distance, geometry, radius, scale): - super(SeekerImages, self).__init__() self.max_distance = max_distance self.radius = radius @@ -190,12 +188,12 @@ def image(self, img, X, Y, Hoc, lens, nm): return (img_hash, output) - def on_epoch_end(self, epoch, logs=None): + def __call__(self, model, epoch): # Make a dataset that we can infer from, we need to make the input a tuple in a tuple. # If it is not it considers G to be Y and it fails to execute # Then using this dataset of images, do a prediction using the model - predictions = self.model((self.X, self.G)) + predictions = model(self.X, self.G) # Work out the valid data ranges for each of the objects images = [] diff --git a/training/layer/__init__.py b/training/layer/__init__.py index 77d6238..985e08a 100644 --- a/training/layer/__init__.py +++ b/training/layer/__init__.py @@ -13,5 +13,5 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-from .graph_convolution import GraphConvolution from .depthwise_seperable_graph_convolution import DepthwiseSeparableGraphConvolution +from .graph_convolution import GraphConvolution diff --git a/training/learning_rate/__init__.py b/training/learning_rate/__init__.py new file mode 100644 index 0000000..39e49e7 --- /dev/null +++ b/training/learning_rate/__init__.py @@ -0,0 +1,17 @@ +# Copyright (C) 2017-2020 Trent Houliston +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +from .one_cycle import OneCycle +from .static import Static diff --git a/training/callbacks/one_cycle.py b/training/learning_rate/one_cycle.py similarity index 88% rename from training/callbacks/one_cycle.py rename to training/learning_rate/one_cycle.py index b5253c3..1b75987 100644 --- a/training/callbacks/one_cycle.py +++ b/training/learning_rate/one_cycle.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2020 Alex Biddulph +# Copyright (C) 2017-2020 Trent Houliston # # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated # documentation files (the "Software"), to deal in the Software without restriction, including without limitation the @@ -13,16 +13,9 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import os -import numpy as np - -import tensorflow as tf - - -class OneCycle(tf.keras.callbacks.LearningRateScheduler): - def __init__(self, config, **kwargs): - super(OneCycle, self).__init__(self.calc_lr, **kwargs) +class OneCycle: + def __init__(self, config): # lr ranges self.min_lr = float(config["training"]["learning_rate"]["min_learning_rate"]) @@ -34,7 +27,7 @@ def __init__(self, config, **kwargs): self.decay_epochs = int(config["training"]["epochs"]) - self.cycle_epochs self.start_step = None if config["training"]["learning_rate"].get("hot_start", False) else 0 - def calc_lr(self, epoch, lr): + def __call__(self, epoch): # Update our start step if we haven't run yet self.start_step = epoch if self.start_step is None else self.start_step diff --git a/training/learning_rate/static.py b/training/learning_rate/static.py new file mode 100644 index 0000000..9489b7c --- /dev/null +++ b/training/learning_rate/static.py @@ -0,0 +1,24 @@ +# Copyright (C) 2017-2020 Trent Houliston +# +# Permission is hereby granted, free of charge, to 
any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +class Static: + def __init__(self, config, **kwargs): + + # lr ranges + self.lr = float(config["training"]["learning_rate"]["value"]) + + def __call__(self, epoch): + return self.lr diff --git a/training/metrics/test/bucket.py b/training/metrics/test/bucket.py index 8efdb4c..604f1c2 100644 --- a/training/metrics/test/bucket.py +++ b/training/metrics/test/bucket.py @@ -17,7 +17,7 @@ def curve_bucket(x, y): - return tf.math.sqrt(tf.add(tf.math.squared_difference(x[:-1], x[1:]), tf.math.squared_difference(y[:-1], y[1:]),)) + return tf.math.sqrt(tf.add(tf.math.squared_difference(x[:-1], x[1:]), tf.math.squared_difference(y[:-1], y[1:]))) def x_bucket(x, y): diff --git a/training/model/visual_mesh_model.py b/training/model/visual_mesh_model.py index ef09fb2..2c738f0 100644 --- a/training/model/visual_mesh_model.py +++ b/training/model/visual_mesh_model.py @@ -14,7 +14,7 @@ # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. import tensorflow as tf -from training.layer import GraphConvolution, DepthwiseSeparableGraphConvolution +from training.layer import DepthwiseSeparableGraphConvolution, GraphConvolution class VisualMeshModel(tf.keras.Model): @@ -74,13 +74,10 @@ def __init__(self, structure, output_dims): {**{k: v["inputs"] for k, v in structure.items()}, "X": [], "G": []}, ) - def call(self, X, training=False): - - # Split out the graph and logits - logits, G = X + def call(self, X, G, training=False): # Run through each of the layers which are sorted in topological order - results = {"X": logits, "G": G} + results = {"X": X, "G": G} for s in self.stages: # Get the operation and inputs from the list of ops op, inputs = self.ops[s] diff --git a/training/testing.py b/training/testing.py index a2ae219..7d7a0ec 100644 --- a/training/testing.py +++ b/training/testing.py @@ -13,26 +13,20 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR # 
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -import os - -import numpy as np +from tqdm import tqdm import tensorflow as tf -from .dataset import keras_dataset -from .flavour import Dataset, Loss, Metrics, TestMetrics +from .flavour import Dataset, Loss, TestMetrics from .model import VisualMeshModel def test(config, output_path): # Get the testing dataset - testing_dataset = ( - Dataset(config, "testing").map(keras_dataset, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE) - ) - + testing_dataset = Dataset(config, "testing") # Get the dimensionality of the Y part of the dataset - output_dims = testing_dataset.element_spec[1].shape[-1] + output_dims = testing_dataset.element_spec["Y"].shape[-1] # Define the model model = VisualMeshModel(structure=config["network"]["structure"], output_dims=output_dims) @@ -46,10 +40,17 @@ def test(config, output_path): # Get the metrics and curves we will be building metrics = TestMetrics(config) - model.compile(loss=Loss(config), metrics=metrics) + loss_fn = Loss(config) + + loss_sum = 0 + loss_count = 0 + for data in tqdm(testing_dataset, desc="Testing", unit="batch", dynamic_ncols=True, leave=False): + logits = model(data["X"], data["G"], training=False) + loss_sum += loss_fn(data["Y"], logits) + loss_count += 1 - # Run the evaluation step for each of the batches to build up our metrics - model.evaluate(testing_dataset) + for m in metrics: + m.update_state(data["Y"], logits) # Save all the metric data for m in metrics: diff --git a/training/training.py b/training/training.py index 6774e9a..6999af4 100644 --- a/training/training.py +++ b/training/training.py @@ -15,95 +15,178 @@ import os -import yaml -from tqdm import tqdm +from tqdm import tqdm, trange import tensorflow as tf +from training import dataset -from .callbacks import ImageTensorBoard, OneCycle -from .dataset import keras_dataset -from .flavour import Dataset, ImageCallback, Loss, 
def _write_metrics(writer, step, loss, metrics, prefix="", images=True, reset=False):
    """Write the loss and every metric to TensorBoard and build a status summary.

    Metrics that expose an ``images`` attribute are written as image summaries
    (only when ``images`` is true); every other metric is written as a scalar
    and added to the returned status list.

    Args:
        writer: summary writer whose ``as_default()`` context receives the logs.
        step: step index (batch or epoch number) to log against.
        loss: scalar loss value to log.
        metrics: iterable of metric objects providing ``name`` and ``result()``.
        prefix: string prepended to every summary name, e.g. ``"epoch/"``.
        images: whether image-producing metrics should be rendered.
        reset: when true, clear each metric's accumulated state after logging.

    Returns:
        A list of human readable ``"name: value"`` strings, loss first.
    """
    m_status = [f"Loss: {loss:.02g}"]

    with writer.as_default():
        tf.summary.scalar(f"{prefix}loss", loss, step)
        for m in metrics:
            if hasattr(m, "images"):
                if images:
                    tf.summary.image(f"{prefix}{m.name}", m.images(m.result()), step)
            else:
                r = m.result()
                m_status.append(f"{m.name}: {r:.02g}")
                tf.summary.scalar(f"{prefix}{m.name}", r, step)

            # Clear the accumulated state so the next phase (validation, or the
            # next epoch) does not inherit this one's statistics.
            # NOTE(review): assumes metrics follow the Keras Metric API, which
            # names this reset_state (newer) or reset_states (older) - confirm
            # for the project's custom metrics.
            if reset:
                reset_fn = getattr(m, "reset_state", None) or getattr(m, "reset_states", None)
                if reset_fn is not None:
                    reset_fn()

    return m_status


class DatasetGrouper:
    """Iterate over a dataset in fixed-size groups of batches.

    With ``group_size`` set to ``None`` every iteration walks the whole dataset.
    Otherwise each iteration yields the next ``group_size`` elements, continuing
    from where the previous iteration stopped.
    """

    def __init__(self, dataset, group_size):
        self.dataset = dataset
        self.it = iter(dataset)
        self.group_size = group_size

    def reset(self):
        """Restart grouped iteration from the beginning of the dataset."""
        # Must use our own dataset: a bare `dataset` name would resolve to
        # whatever module-level object happens to carry that name.
        self.it = iter(self.dataset)

    def __len__(self):
        if self.group_size is None:
            return len(self.dataset)
        else:
            return self.group_size

    def __iter__(self):
        # Entire dataset as group
        if self.group_size is None:
            for v in self.dataset:
                yield v
        else:
            for _ in range(self.group_size):
                # A StopIteration escaping a generator becomes a RuntimeError
                # (PEP 479), so finish the group early when the data runs out.
                try:
                    v = next(self.it)
                except StopIteration:
                    return
                yield v


# Train the network
def train(config, output_path):
    """Train the network with a custom loop, logging to TensorBoard.

    Each epoch runs one group of training batches, writes progress images and
    epoch statistics, runs one group of validation batches, and saves the model
    weights whenever the validation loss improves on the best seen so far.

    Args:
        config: parsed configuration; the "training" and "network" sections are read.
        output_path: directory for TensorBoard logs and weight checkpoints.

    Raises:
        SystemExit: with status 1 if the training loss becomes non finite.
    """

    # Get the flavour of network that we are training and prepare them
    training_dataset = Dataset(config, "training")
    validation_dataset = Dataset(config, "validation")
    loss_fn = Loss(config)
    metrics = Metrics(config)
    learning_rate = LearningRate(config)
    optimiser = Optimiser(config, learning_rate(0))
    progress_images = ProgressImages(config, output_path)

    # Get the dimensionality of the Y part of the dataset so we know the size of the last layer of the network
    output_dims = training_dataset.element_spec["Y"].shape[-1]

    # Define the model
    model = VisualMeshModel(structure=config["network"]["structure"], output_dims=output_dims)

    # Create the tensorboard writers
    train_writer = tf.summary.create_file_writer(os.path.join(output_path, "train"))
    val_writer = tf.summary.create_file_writer(os.path.join(output_path, "validation"))

    # Find the latest checkpoint file and load it so we can resume training
    checkpoint_file = tf.train.latest_checkpoint(output_path)
    if checkpoint_file is not None:
        model.load_weights(checkpoint_file)

    # Create the iterator objects to loop through the dataset.
    # A dataset consumed in fixed-size groups must repeat so that its shared
    # iterator never runs dry; a missing/None size means "use the whole dataset".
    batches_per_epoch = config["training"].get("batches_per_epoch")
    validation_samples = config["training"]["validation"].get("samples")
    if batches_per_epoch is not None:
        training_dataset = training_dataset.repeat()
    if validation_samples is not None:
        validation_dataset = validation_dataset.repeat()
    training = DatasetGrouper(training_dataset, batches_per_epoch)
    validation = DatasetGrouper(validation_dataset, validation_samples)

    batch_no = 0
    best_loss = None
    for epoch in trange(config["training"]["epochs"], desc="Epoch", unit="epoch", dynamic_ncols=True):

        # Update the learning rate
        optimiser.lr = learning_rate(epoch)

        # Perform the training loop
        loss_sum = 0
        loss_count = 0
        for data in tqdm(training, desc="Train Batch", unit="batch", dynamic_ncols=True, leave=False):

            # Run the network
            with tf.GradientTape() as tape:
                logits = model(data["X"], data["G"], training=True)
                loss = loss_fn(data["Y"], logits)

            # If our loss ever becomes non finite end training
            if not tf.math.is_finite(loss):
                tqdm.write("Loss is not finite, terminating training")
                # Same effect as exit(1) without relying on the site-provided builtin
                raise SystemExit(1)

            # Apply the gradient update
            grads = tape.gradient(loss, model.trainable_weights)
            optimiser.apply_gradients(zip(grads, model.trainable_weights))

            # Update all the metric states
            for m in metrics:
                m.update_state(data["Y"], logits)
            loss_sum += loss
            loss_count += 1

            # If we are doing batch level logs
            if config["training"]["validation"]["log_frequency"] == "batch":
                _write_metrics(
                    writer=train_writer,
                    loss=loss,
                    step=batch_no,
                    prefix="batch/",
                    metrics=metrics,
                    images=False,
                    reset=False,
                )

            batch_no += 1

        # Output progress images
        progress_images(model, epoch)

        # Log epoch level stats
        train_metric = _write_metrics(
            writer=train_writer,
            loss=(loss_sum / loss_count),
            step=epoch,
            prefix="epoch/",
            metrics=metrics,
            images=True,
            reset=True,
        )

        tqdm.write("Training Epoch {}\n{}".format(epoch, ", ".join(train_metric)))

        # Run validation at the end of each step
        loss_sum = 0
        loss_count = 0
        for data in tqdm(validation, desc="Validation Batch", unit="batch", dynamic_ncols=True, leave=False):
            logits = model(data["X"], data["G"], training=False)
            loss_sum += loss_fn(data["Y"], logits)
            loss_count += 1

            for m in metrics:
                m.update_state(data["Y"], logits)

        # Validation stats
        validation_loss = loss_sum / loss_count
        validation_metric = _write_metrics(
            writer=val_writer,
            loss=validation_loss,
            step=epoch,
            prefix="epoch/",
            metrics=metrics,
            images=True,
            reset=True,
        )

        tqdm.write("Validation Epoch {}\n{}".format(epoch, ", ".join(validation_metric)))

        # If this is the best model we have seen, save it and remember its loss
        # so that later, worse epochs do not overwrite the checkpoint
        if best_loss is None or best_loss > validation_loss:
            best_loss = validation_loss
            model.save_weights(os.path.join(output_path, "checkpoint"))
cpp/visualmesh/visualmesh.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/visualmesh/visualmesh.hpp b/cpp/visualmesh/visualmesh.hpp index 4804765..f8539c8 100644 --- a/cpp/visualmesh/visualmesh.hpp +++ b/cpp/visualmesh/visualmesh.hpp @@ -148,7 +148,7 @@ class VisualMesh { // z height from the transformation matrix const Scalar& h = Hoc[2][3]; auto mesh = height(h); - return mesh->lookup(Hoc, lens); + return std::make_pair(mesh, mesh.lookup(Hoc, lens)); } private: