From ae7672f912c7b70147665080ad2eb6e7eb331d46 Mon Sep 17 00:00:00 2001
From: Trent Houliston <trent@houliston.me>
Date: Fri, 4 Feb 2022 14:04:36 +1100
Subject: [PATCH 1/2] Swap from using keras to using a custom loop.

This will allow for some more interesting visual mesh designs in the future.
---
 training/callbacks/image_tensorboard.py       | 102 --------
 training/dataset/__init__.py                  |   6 -
 training/dataset/label/seeker.py              |   2 +-
 training/dataset/visual_mesh_dataset.py       |   3 +-
 training/export.py                            |  10 +-
 training/find_lr.py                           | 122 +++-------
 training/flavour/__init__.py                  |   4 +-
 training/flavour/learning_rate.py             |  27 +++
 training/flavour/optimiser.py                 |  35 +++
 .../{image_callback.py => progress_images.py} |   4 +-
 training/flavour/test_metrics.py              |   2 +-
 training/{callbacks => images}/__init__.py    |   2 -
 .../classification_images.py                  |   7 +-
 .../{callbacks => images}/seeker_images.py    |   8 +-
 training/layer/__init__.py                    |   2 +-
 training/learning_rate/__init__.py            |  17 ++
 .../{callbacks => learning_rate}/one_cycle.py |  15 +-
 training/learning_rate/static.py              |  24 ++
 training/metrics/test/bucket.py               |   2 +-
 training/model/visual_mesh_model.py           |   9 +-
 training/testing.py                           |  27 ++-
 training/training.py                          | 229 ++++++++++++------
 22 files changed, 334 insertions(+), 325 deletions(-)
 delete mode 100644 training/callbacks/image_tensorboard.py
 create mode 100644 training/flavour/learning_rate.py
 create mode 100644 training/flavour/optimiser.py
 rename training/flavour/{image_callback.py => progress_images.py} (96%)
 rename training/{callbacks => images}/__init__.py (93%)
 rename training/{callbacks => images}/classification_images.py (95%)
 rename training/{callbacks => images}/seeker_images.py (97%)
 create mode 100644 training/learning_rate/__init__.py
 rename training/{callbacks => learning_rate}/one_cycle.py (88%)
 create mode 100644 training/learning_rate/static.py

diff --git a/training/callbacks/image_tensorboard.py b/training/callbacks/image_tensorboard.py
deleted file mode 100644
index 55ecbac..0000000
--- a/training/callbacks/image_tensorboard.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (C) 2017-2020 Trent Houliston <trent@houliston.me>
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
-# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
-# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
-# permit persons to whom the Software is furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
-# Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
-# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
-# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-import os
-
-import tensorflow as tf
-
-
-class ImageTensorBoard(tf.keras.callbacks.TensorBoard):
-    def __init__(self, **kwargs):
-        super(ImageTensorBoard, self).__init__(**kwargs)
-
-    def _image_metrics(self):
-        return [m for m in self.model.metrics if hasattr(m, "images")]
-
-    def _filter_logs(self, logs):
-        if logs is None:
-            return None
-
-        # Find all the metrics that are actually image metrics
-        remove_list = self._image_metrics()
-        remove_list = [m.name for m in remove_list]
-        remove_list.extend(["val_{}".format(n) for n in remove_list])
-
-        return {k: v for k, v in logs.items() if k not in remove_list}
-
-    def on_batch_begin(self, batch, logs=None):
-        super(ImageTensorBoard, self).on_batch_begin(batch, self._filter_logs(logs))
-
-    def on_batch_end(self, batch, logs=None):
-        super(ImageTensorBoard, self).on_batch_end(batch, self._filter_logs(logs))
-
-    def on_epoch_begin(self, epoch, logs=None):
-        super(ImageTensorBoard, self).on_epoch_begin(epoch, self._filter_logs(logs))
-
-    def on_epoch_end(self, epoch, logs=None):
-
-        if logs is not None:
-            with self._train_writer.as_default():
-                for m in self._image_metrics():
-                    tf.summary.image(m.name, m.images(logs[m.name]), epoch)
-
-            if any([l.startswith("val_") for l in logs.keys()]):
-                with self._val_writer.as_default():
-                    for m in self._image_metrics():
-                        tf.summary.image(m.name, m.images(logs["val_{}".format(m.name)]), epoch)
-
-        super(ImageTensorBoard, self).on_epoch_end(epoch, self._filter_logs(logs))
-
-    def on_predict_batch_begin(self, batch, logs=None):
-        super(ImageTensorBoard, self).on_predict_batch_begin(batch, self._filter_logs(logs))
-
-    def on_predict_batch_end(self, batch, logs=None):
-        super(ImageTensorBoard, self).on_predict_batch_end(batch, self._filter_logs(logs))
-
-    def on_predict_begin(self, logs=None):
-        super(ImageTensorBoard, self).on_predict_begin(self._filter_logs(logs))
-
-    def on_predict_end(self, logs=None):
-        super(ImageTensorBoard, self).on_predict_end(self._filter_logs(logs))
-
-    def on_test_batch_begin(self, batch, logs=None):
-        super(ImageTensorBoard, self).on_test_batch_begin(batch, self._filter_logs(logs))
-
-    def on_test_batch_end(self, batch, logs=None):
-        super(ImageTensorBoard, self).on_test_batch_end(batch, self._filter_logs(logs))
-
-    def on_test_begin(self, logs=None):
-        super(ImageTensorBoard, self).on_test_begin(self._filter_logs(logs))
-
-    def on_test_end(self, logs=None):
-        super(ImageTensorBoard, self).on_test_end(self._filter_logs(logs))
-
-    def on_train_batch_begin(self, batch, logs=None):
-        super(ImageTensorBoard, self).on_train_batch_begin(batch, self._filter_logs(logs))
-
-    def on_train_batch_end(self, batch, logs=None):
-        super(ImageTensorBoard, self).on_train_batch_end(batch, self._filter_logs(logs))
-
-    def on_train_begin(self, logs=None):
-        super(ImageTensorBoard, self).on_train_begin(self._filter_logs(logs))
-
-    def on_train_end(self, logs=None):
-        super(ImageTensorBoard, self).on_train_end(self._filter_logs(logs))
-
-    def set_model(self, model):
-        super(ImageTensorBoard, self).set_model(model)
-
-    def set_params(self, params):
-        super(ImageTensorBoard, self).set_params(params)
diff --git a/training/dataset/__init__.py b/training/dataset/__init__.py
index 0e9eba1..cf1e25d 100644
--- a/training/dataset/__init__.py
+++ b/training/dataset/__init__.py
@@ -68,9 +68,3 @@ def Dataset(paths, batch_size, view, example, orientation, label, projection, ke
         label=label,
         keys=keys,
     ).build()
-
-
-# Convert a dataset into a format that will be accepted by keras fit
-def keras_dataset(args):
-    # Return in the format (x, y, weights)
-    return ((args["X"], args["G"]), args["Y"])
diff --git a/training/dataset/label/seeker.py b/training/dataset/label/seeker.py
index 0914719..9ce4426 100644
--- a/training/dataset/label/seeker.py
+++ b/training/dataset/label/seeker.py
@@ -54,7 +54,7 @@ def __call__(self, image, Hoc, V, valid, **features):
         # Transform our remaining targets into observation plane space
         uOCo, l = tf.linalg.normalize(tf.einsum("ij,kj->ki", Hoc[:3, :3], targets), axis=-1)
 
-        # Check if this signal is close enough in distance to our observation plane to be considered
+        # Check if this point is close enough in distance to our observation plane to be considered
         height = Hoc[2, 3]
         ratio = tf.squeeze(l, axis=-1) / height
         uOCo = tf.gather(
diff --git a/training/dataset/visual_mesh_dataset.py b/training/dataset/visual_mesh_dataset.py
index 06f138f..2a71fe2 100644
--- a/training/dataset/visual_mesh_dataset.py
+++ b/training/dataset/visual_mesh_dataset.py
@@ -13,9 +13,10 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-import tensorflow as tf
 import math
 
+import tensorflow as tf
+
 
 class VisualMeshDataset:
     def __init__(self, paths, batch_size, view, example, orientation, projection, label, keys):
diff --git a/training/export.py b/training/export.py
index d6d0361..7bd07f2 100644
--- a/training/export.py
+++ b/training/export.py
@@ -20,7 +20,6 @@
 
 import tensorflow as tf
 
-from .dataset import keras_dataset
 from .flavour import Dataset
 from .layer.graph_convolution import GraphConvolution
 from .model import VisualMeshModel
@@ -29,12 +28,10 @@
 def export(config, output_path):
 
     # Get the training dataset so we know the output size
-    training_dataset = (
-        Dataset(config, "training").map(keras_dataset, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
-    )
+    training_dataset = Dataset(config, "training")
 
     # Get the dimensionality of the Y part of the dataset
-    output_dims = training_dataset.element_spec[1].shape[-1]
+    output_dims = training_dataset.element_spec["Y"].shape[-1]
 
     # Define the model
     model = VisualMeshModel(structure=config["network"]["structure"], output_dims=output_dims)
@@ -47,7 +44,8 @@ def export(config, output_path):
         raise RuntimeError("Could not find weights to load into the network")
 
     # We have to run a predict step so that everything is loaded properly
-    model.predict(training_dataset.take(1))
+    for v in training_dataset.take(1):
+        model(v["X"], v["G"], training=False)
 
     # Print out the model summary
     model.summary()
diff --git a/training/find_lr.py b/training/find_lr.py
index e0cdcd4..98efd59 100644
--- a/training/find_lr.py
+++ b/training/find_lr.py
@@ -23,115 +23,59 @@
 
 import tensorflow as tf
 
-from .dataset import keras_dataset
-from .flavour import Dataset, Loss
+from .flavour import Dataset, Loss, Optimiser
 from .model import VisualMeshModel
 
 mpl.use("Agg")
 import matplotlib.pyplot as plt  # isort:skip
 
 
-class LRProgress(tf.keras.callbacks.Callback):
-    def __init__(self, n_steps, lr_schedule, **kwargs):
-        super(LRProgress, self).__init__(**kwargs)
-
-        self.lr_schedule = lr_schedule
-        self.n_steps = n_steps
-
-        # Tracking loss values
-        self.smooth_loss = None
-        self.losses = []
-
-        # Progress bar settings
-        self.n_history = 20
-        self.lr_progress = None
-        self.loss_title = None
-        self.loss_progress = None
-
-    def on_epoch_end(self, epoch, logs=None):
-        # Skip when we explode at the end
-        if not math.isfinite(logs["loss"]):
-            return
-
-        # Calculate smoothed loss values
-        self.smooth_loss = logs["loss"] if self.smooth_loss is None else self.smooth_loss * 0.98 + logs["loss"] * 0.02
-        self.losses.append(math.log1p(self.smooth_loss))
-
-        # Create the lr_progress bar so we can see how far it is
-        if self.lr_progress is None:
-            self.lr_progress = tqdm(total=self.n_steps, dynamic_ncols=True)
-        self.lr_progress.update()
-        self.lr_progress.set_description("LR:   {:.3e}".format(self.model.optimizer.lr.numpy()))
-
-        # Create our series of moving loss graphs
-        if self.loss_progress is None:
-            self.loss_title = tqdm(bar_format="{desc}", desc="Loss Graph")
-            self.loss_progress = [
-                tqdm(bar_format="{desc}|{bar}|", total=self.losses[-1], dynamic_ncols=True)
-                for i in range(self.n_history)
-            ]
-
-        valid_i = -min(len(self.losses), self.n_history)
-
-        # Get the maximum of the losses
-        loss_max = max(self.losses)
-
-        for bar, loss in zip(self.loss_progress[valid_i:], self.losses[valid_i:]):
-            bar.total = loss_max
-            bar.n = loss
-            bar.set_description("{:.3e}".format(math.expm1(loss)))
-
-
 # Find valid learning rates for
 def find_lr(config, output_path, min_lr, max_lr, n_steps, window_size):
 
     # Open the training dataset and put it on repeat
-    training_dataset = Dataset(config, "training").map(keras_dataset).repeat()
+    training_dataset = Dataset(config, "training").repeat()
 
     # Get the dimensionality of the Y part of the dataset
-    output_dims = training_dataset.element_spec[1].shape[-1]
+    output_dims = training_dataset.element_spec["Y"].shape[-1]
 
     # Define the model
     model = VisualMeshModel(structure=config["network"]["structure"], output_dims=output_dims)
 
-    def lr_schedule(epoch, lr):
+    def lr_schedule(epoch):
         return min_lr * (max_lr / min_lr) ** (epoch / n_steps)
 
-    # Setup the optimiser
-    if config["training"]["optimiser"]["type"] == "Adam":
-        optimiser = tf.optimizers.Adam(learning_rate=min_lr)
-    elif config["training"]["optimiser"]["type"] == "SGD":
-        optimiser = tf.optimizers.SGD(learning_rate=min_lr)
-    elif config["training"]["optimiser"]["type"] == "Ranger":
-        import tensorflow_addons as tfa
-
-        optimiser = tfa.optimizers.Lookahead(
-            tfa.optimizers.RectifiedAdam(learning_rate=min_lr),
-            sync_period=int(config["training"]["optimiser"]["sync_period"]),
-            slow_step_size=float(config["training"]["optimiser"]["slow_step_size"]),
-        )
-    else:
-        raise RuntimeError("Unknown optimiser type" + config["training"]["optimiser"]["type"])
-
-    # Compile the model grabbing the flavours for the loss and metrics
-    model.compile(optimizer=optimiser, loss=Loss(config))
-
-    # Run the fit function on the model to calculate the learning rates
-    history = model.fit(
-        training_dataset,
-        epochs=n_steps,
-        steps_per_epoch=1,
-        callbacks=[
-            tf.keras.callbacks.TerminateOnNaN(),
-            tf.keras.callbacks.LearningRateScheduler(lr_schedule),
-            LRProgress(n_steps, lr_schedule),
-        ],
-        verbose=False,
-    )
+    optimiser = Optimiser(config, lr_schedule(0))
+    loss_fn = Loss(config)
+
+    losses = []
+    lrs = []
+    with tqdm(desc=f"LR: {optimiser.lr.numpy():.3e} Loss: ---", total=n_steps) as progress:
+        for step, data in enumerate(training_dataset):
+            if step >= n_steps:
+                break
+            # The dataset repeats forever, so the check above is what bounds the sweep to exactly n_steps samples
+            optimiser.lr = lr_schedule(step)
+            # Run the forward pass under the tape so the loss can be differentiated
+            with tf.GradientTape() as tape:
+                logits = model(data["X"], data["G"], training=True)
+                loss = loss_fn(data["Y"], logits)
+            # Stop before applying or recording anything once the loss is no longer finite
+            if not tf.math.is_finite(loss):
+                break
+            # Update the weights using the learning rate set for this step
+            grads = tape.gradient(loss, model.trainable_weights)
+            optimiser.apply_gradients(zip(grads, model.trainable_weights))
+            # Record the (loss, learning rate) pair for this step
+            losses.append(loss.numpy())
+            lrs.append(optimiser.lr.numpy())
+
+            progress.set_description(f"LR: {optimiser.lr.numpy():.3e} Loss: {loss.numpy():.3e}")
+            progress.update()
 
     # Extract the loss and LR from before it became nan
-    loss = np.array(history.history["loss"][:-1])
-    lr = np.array(history.history["lr"][:-1])
+    loss = np.array(losses)  # only finite losses are ever appended, so there is no trailing nan to drop
+    lr = np.array(lrs)
 
     # Make a smoothed version of the loss
     smooth_loss = np.convolve(loss, np.ones(window_size), "same") / np.convolve(
diff --git a/training/flavour/__init__.py b/training/flavour/__init__.py
index 1f17ab9..2506e52 100644
--- a/training/flavour/__init__.py
+++ b/training/flavour/__init__.py
@@ -14,7 +14,9 @@
 # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 from .dataset import Dataset
-from .image_callback import ImageCallback
+from .learning_rate import LearningRate
 from .loss import Loss
 from .metrics import Metrics
+from .optimiser import Optimiser
+from .progress_images import ProgressImages
 from .test_metrics import TestMetrics
diff --git a/training/flavour/learning_rate.py b/training/flavour/learning_rate.py
new file mode 100644
index 0000000..2def07c
--- /dev/null
+++ b/training/flavour/learning_rate.py
@@ -0,0 +1,27 @@
+# Copyright (C) 2017-2020 Trent Houliston <trent@houliston.me>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+import tensorflow as tf
+
+from ..learning_rate import OneCycle, Static
+
+
+def LearningRate(config):
+    # Build the configured learning rate policy; unknown types raise instead of silently returning None
+    policies = {"static": Static, "one_cycle": OneCycle}
+    lr_type = config["training"]["learning_rate"]["type"]
+    if lr_type not in policies:
+        raise RuntimeError("Unknown learning rate type " + str(lr_type))
+    return policies[lr_type](config=config)
diff --git a/training/flavour/optimiser.py b/training/flavour/optimiser.py
new file mode 100644
index 0000000..87f58ea
--- /dev/null
+++ b/training/flavour/optimiser.py
@@ -0,0 +1,35 @@
+# Copyright (C) 2017-2020 Trent Houliston <trent@houliston.me>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+import tensorflow as tf
+
+
+def Optimiser(config, initial_lr):
+
+    # Build the optimiser selected by the training configuration, starting at initial_lr
+    settings = config["training"]["optimiser"]
+    if settings["type"] == "Adam":
+        return tf.optimizers.Adam(learning_rate=initial_lr)
+    elif settings["type"] == "SGD":
+        return tf.optimizers.SGD(learning_rate=initial_lr)
+    elif settings["type"] == "Ranger":
+        # Ranger here is RectifiedAdam wrapped in Lookahead; tensorflow_addons is only imported when it is selected
+        import tensorflow_addons as tfa
+        return tfa.optimizers.Lookahead(
+            tfa.optimizers.RectifiedAdam(learning_rate=initial_lr),
+            sync_period=int(settings["sync_period"]),
+            slow_step_size=float(settings["slow_step_size"]),
+        )
+    raise RuntimeError("Unknown optimiser type " + str(settings["type"]))
diff --git a/training/flavour/image_callback.py b/training/flavour/progress_images.py
similarity index 96%
rename from training/flavour/image_callback.py
rename to training/flavour/progress_images.py
index cc40724..c7ea7bb 100644
--- a/training/flavour/image_callback.py
+++ b/training/flavour/progress_images.py
@@ -13,12 +13,12 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-from ..callbacks import ClassificationImages, SeekerImages
+from ..images import ClassificationImages, SeekerImages
 from .dataset import Dataset
 from .merge_configuration import merge_configuration
 
 
-def ImageCallback(config, output_path):
+def ProgressImages(config, output_path):
 
     validation_config = merge_configuration(config, config["dataset"].get("config", {}))
 
diff --git a/training/flavour/test_metrics.py b/training/flavour/test_metrics.py
index b501431..a5b63f2 100644
--- a/training/flavour/test_metrics.py
+++ b/training/flavour/test_metrics.py
@@ -13,9 +13,9 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-from training.metrics.test.seeker_hourglass import SeekerHourglass
 import training.metrics.test.confusion_curve as confusion
 from training.metrics.test import Confusion, ConfusionCurve
+from training.metrics.test.seeker_hourglass import SeekerHourglass
 
 
 def TestMetrics(config):
diff --git a/training/callbacks/__init__.py b/training/images/__init__.py
similarity index 93%
rename from training/callbacks/__init__.py
rename to training/images/__init__.py
index 6813666..62d6c32 100644
--- a/training/callbacks/__init__.py
+++ b/training/images/__init__.py
@@ -14,6 +14,4 @@
 # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 from .classification_images import ClassificationImages
-from .one_cycle import OneCycle
 from .seeker_images import SeekerImages
-from .image_tensorboard import ImageTensorBoard
diff --git a/training/callbacks/classification_images.py b/training/images/classification_images.py
similarity index 95%
rename from training/callbacks/classification_images.py
rename to training/images/classification_images.py
index 7ae2c5e..7fcc002 100644
--- a/training/callbacks/classification_images.py
+++ b/training/images/classification_images.py
@@ -28,9 +28,8 @@
 import matplotlib.pyplot as plt  # isort:skip
 
 
-class ClassificationImages(tf.keras.callbacks.Callback):
+class ClassificationImages:
     def __init__(self, output_path, dataset, colours):
-        super(ClassificationImages, self).__init__()
 
         self.colours = colours
         self.writer = tf.summary.create_file_writer(os.path.join(output_path, "images"))
@@ -103,12 +102,12 @@ def image(img, C, X, colours):
 
         return (img_hash, data)
 
-    def on_epoch_end(self, epoch, logs=None):
+    def __call__(self, model, epoch):
 
         # Make a dataset that we can infer from, we need to make the input a tuple in a tuple.
         # If it is not it considers G to be Y and it fails to execute
         # Then using this dataset of images, do a prediction using the model
-        predictions = self.model((self.X, self.G))
+        predictions = model(self.X, self.G)
 
         # Work out the valid data ranges for each of the objects
         images = []
diff --git a/training/callbacks/seeker_images.py b/training/images/seeker_images.py
similarity index 97%
rename from training/callbacks/seeker_images.py
rename to training/images/seeker_images.py
index c05b413..b8cb608 100644
--- a/training/callbacks/seeker_images.py
+++ b/training/images/seeker_images.py
@@ -17,7 +17,6 @@
 import math
 import os
 
-import cv2
 import numpy as np
 
 import tensorflow as tf
@@ -25,9 +24,8 @@
 from training.projection import project
 
 
-class SeekerImages(tf.keras.callbacks.Callback):
+class SeekerImages:
     def __init__(self, output_path, dataset, model, max_distance, geometry, radius, scale):
-        super(SeekerImages, self).__init__()
 
         self.max_distance = max_distance
         self.radius = radius
@@ -190,12 +188,12 @@ def image(self, img, X, Y, Hoc, lens, nm):
 
         return (img_hash, output)
 
-    def on_epoch_end(self, epoch, logs=None):
+    def __call__(self, model, epoch):
 
         # Make a dataset that we can infer from, we need to make the input a tuple in a tuple.
         # If it is not it considers G to be Y and it fails to execute
         # Then using this dataset of images, do a prediction using the model
-        predictions = self.model((self.X, self.G))
+        predictions = model(self.X, self.G)
 
         # Work out the valid data ranges for each of the objects
         images = []
diff --git a/training/layer/__init__.py b/training/layer/__init__.py
index 77d6238..985e08a 100644
--- a/training/layer/__init__.py
+++ b/training/layer/__init__.py
@@ -13,5 +13,5 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-from .graph_convolution import GraphConvolution
 from .depthwise_seperable_graph_convolution import DepthwiseSeparableGraphConvolution
+from .graph_convolution import GraphConvolution
diff --git a/training/learning_rate/__init__.py b/training/learning_rate/__init__.py
new file mode 100644
index 0000000..39e49e7
--- /dev/null
+++ b/training/learning_rate/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2017-2020 Trent Houliston <trent@houliston.me>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+from .one_cycle import OneCycle
+from .static import Static
diff --git a/training/callbacks/one_cycle.py b/training/learning_rate/one_cycle.py
similarity index 88%
rename from training/callbacks/one_cycle.py
rename to training/learning_rate/one_cycle.py
index b5253c3..1b75987 100644
--- a/training/callbacks/one_cycle.py
+++ b/training/learning_rate/one_cycle.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2020 Alex Biddulph <Alexander.Biddulph@uon.edu.au>
+# Copyright (C) 2017-2020 Trent Houliston <trent@houliston.me>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
 # documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
@@ -13,16 +13,9 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-import os
 
-import numpy as np
-
-import tensorflow as tf
-
-
-class OneCycle(tf.keras.callbacks.LearningRateScheduler):
-    def __init__(self, config, **kwargs):
-        super(OneCycle, self).__init__(self.calc_lr, **kwargs)
+class OneCycle:
+    def __init__(self, config):
 
         # lr ranges
         self.min_lr = float(config["training"]["learning_rate"]["min_learning_rate"])
@@ -34,7 +27,7 @@ def __init__(self, config, **kwargs):
         self.decay_epochs = int(config["training"]["epochs"]) - self.cycle_epochs
         self.start_step = None if config["training"]["learning_rate"].get("hot_start", False) else 0
 
-    def calc_lr(self, epoch, lr):
+    def __call__(self, epoch):
 
         # Update our start step if we haven't run yet
         self.start_step = epoch if self.start_step is None else self.start_step
diff --git a/training/learning_rate/static.py b/training/learning_rate/static.py
new file mode 100644
index 0000000..9489b7c
--- /dev/null
+++ b/training/learning_rate/static.py
@@ -0,0 +1,24 @@
+# Copyright (C) 2017-2020 Trent Houliston <trent@houliston.me>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+class Static:  # Learning rate policy that returns one configured value regardless of the epoch
+    def __init__(self, config, **kwargs):
+
+        # The single fixed learning rate, read from the training configuration (extra kwargs are unused)
+        self.lr = float(config["training"]["learning_rate"]["value"])
+
+    def __call__(self, epoch):
+        return self.lr  # epoch is accepted but ignored
diff --git a/training/metrics/test/bucket.py b/training/metrics/test/bucket.py
index 8efdb4c..604f1c2 100644
--- a/training/metrics/test/bucket.py
+++ b/training/metrics/test/bucket.py
@@ -17,7 +17,7 @@
 
 
 def curve_bucket(x, y):
-    return tf.math.sqrt(tf.add(tf.math.squared_difference(x[:-1], x[1:]), tf.math.squared_difference(y[:-1], y[1:]),))
+    return tf.math.sqrt(tf.add(tf.math.squared_difference(x[:-1], x[1:]), tf.math.squared_difference(y[:-1], y[1:])))
 
 
 def x_bucket(x, y):
diff --git a/training/model/visual_mesh_model.py b/training/model/visual_mesh_model.py
index ef09fb2..2c738f0 100644
--- a/training/model/visual_mesh_model.py
+++ b/training/model/visual_mesh_model.py
@@ -14,7 +14,7 @@
 # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 import tensorflow as tf
-from training.layer import GraphConvolution, DepthwiseSeparableGraphConvolution
+from training.layer import DepthwiseSeparableGraphConvolution, GraphConvolution
 
 
 class VisualMeshModel(tf.keras.Model):
@@ -74,13 +74,10 @@ def __init__(self, structure, output_dims):
             {**{k: v["inputs"] for k, v in structure.items()}, "X": [], "G": []},
         )
 
-    def call(self, X, training=False):
-
-        # Split out the graph and logits
-        logits, G = X
+    def call(self, X, G, training=False):
 
         # Run through each of the layers which are sorted in topological order
-        results = {"X": logits, "G": G}
+        results = {"X": X, "G": G}
         for s in self.stages:
             # Get the operation and inputs from the list of ops
             op, inputs = self.ops[s]
diff --git a/training/testing.py b/training/testing.py
index a2ae219..7d7a0ec 100644
--- a/training/testing.py
+++ b/training/testing.py
@@ -13,26 +13,20 @@
 # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
-import os
-
-import numpy as np
+from tqdm import tqdm
 
 import tensorflow as tf
 
-from .dataset import keras_dataset
-from .flavour import Dataset, Loss, Metrics, TestMetrics
+from .flavour import Dataset, Loss, TestMetrics
 from .model import VisualMeshModel
 
 
 def test(config, output_path):
 
     # Get the testing dataset
-    testing_dataset = (
-        Dataset(config, "testing").map(keras_dataset, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
-    )
-
+    testing_dataset = Dataset(config, "testing")
     # Get the dimensionality of the Y part of the dataset
-    output_dims = testing_dataset.element_spec[1].shape[-1]
+    output_dims = testing_dataset.element_spec["Y"].shape[-1]
 
     # Define the model
     model = VisualMeshModel(structure=config["network"]["structure"], output_dims=output_dims)
@@ -46,10 +40,17 @@ def test(config, output_path):
 
     # Get the metrics and curves we will be building
     metrics = TestMetrics(config)
-    model.compile(loss=Loss(config), metrics=metrics)
+    loss_fn = Loss(config)
+
+    loss_sum = 0
+    loss_count = 0
+    for data in tqdm(testing_dataset, desc="Testing", unit="batch", dynamic_ncols=True, leave=False):
+        logits = model(data["X"], data["G"], training=False)
+        loss_sum += loss_fn(data["Y"], logits)
+        loss_count += 1
 
-    # Run the evaluation step for each of the batches to build up our metrics
-    model.evaluate(testing_dataset)
+        for m in metrics:
+            m.update_state(data["Y"], logits)
 
     # Save all the metric data
     for m in metrics:
diff --git a/training/training.py b/training/training.py
index 6774e9a..6999af4 100644
--- a/training/training.py
+++ b/training/training.py
@@ -15,95 +15,178 @@
 
 import os
 
-import yaml
-from tqdm import tqdm
+from tqdm import tqdm, trange
 
 import tensorflow as tf
+from training import dataset
 
-from .callbacks import ImageTensorBoard, OneCycle
-from .dataset import keras_dataset
-from .flavour import Dataset, ImageCallback, Loss, Metrics
+from .flavour import Dataset, LearningRate, Loss, Metrics, Optimiser, ProgressImages
 from .model import VisualMeshModel
 
 
+# Writes the loss and metrics to writer at step with prefix prepended to each name; image metrics need images=True.
+# Every metric is reset afterwards when reset=True. Returns the "name: value" status strings for the scalar values.
+def _write_metrics(writer, step, loss, metrics, prefix="", images=True, reset=False):
+    m_status = [f"Loss: {loss:.02g}"]
+    with writer.as_default():
+        tf.summary.scalar(f"{prefix}loss", loss, step)
+        for m in metrics:
+            if not hasattr(m, "images"):
+                r = m.result()
+                m_status.append(f"{m.name}: {r:.02g}")
+                tf.summary.scalar(f"{prefix}{m.name}", r, step)
+            elif images:
+                tf.summary.image(f"{prefix}{m.name}", m.images(m.result()), step)
+            if reset:
+                m.reset_state()  # clear accumulated state so the next round of updates starts from zero
+    return m_status
+
+
+class DatasetGrouper:
+    # Presents `dataset` as groups of `group_size` items; a `group_size` of None means one full pass per iteration.
+    def __init__(self, dataset, group_size):
+        self.dataset = dataset
+        self.it = iter(dataset)  # persistent iterator so each group continues where the previous one stopped
+        self.group_size = group_size
+
+    def reset(self):
+        # Restart the persistent iterator from the beginning of this grouper's own dataset
+        self.it = iter(self.dataset)
+
+    def __len__(self):
+        # Number of items produced by one iteration of this object
+        return len(self.dataset) if self.group_size is None else self.group_size
+
+    def __iter__(self):
+        if self.group_size is None:
+            # Entire dataset as group
+            yield from self.dataset
+        else:
+            # Draw the next group_size items from the persistent iterator
+            for _ in range(self.group_size):
+                yield next(self.it)
+
+
 # Train the network
 def train(config, output_path):
 
-    # Open the two datasets for training and validation
-    training_dataset = Dataset(config, "training").map(keras_dataset)
-    validation_dataset = Dataset(config, "validation").map(keras_dataset)
+    # Get the flavour of network that we are training and prepare them
+    training_dataset = Dataset(config, "training")
+    validation_dataset = Dataset(config, "validation")
+    loss_fn = Loss(config)
+    metrics = Metrics(config)
+    learning_rate = LearningRate(config)
+    optimiser = Optimiser(config, learning_rate(0))
+    progress_images = ProgressImages(config, output_path)
 
-    # If we are using batches_per_epoch as a number rather than the whole dataset
-    if "batches_per_epoch" in config["training"]:
-        training_dataset = training_dataset.repeat()
-
-    # Get the dimensionality of the Y part of the dataset
-    output_dims = training_dataset.element_spec[1].shape[-1]
+    # Get the dimensionality of the Y part of the dataset so we know the size of the last layer of the network
+    output_dims = training_dataset.element_spec["Y"].shape[-1]
 
     # Define the model
     model = VisualMeshModel(structure=config["network"]["structure"], output_dims=output_dims)
 
-    # Determine the learning rate policy to use
-    if config["training"]["learning_rate"]["type"] == "static":
-        learning_rate = float(config["training"]["learning_rate"]["value"])
-        lr_callback = []
-    elif config["training"]["learning_rate"]["type"] == "one_cycle":
-        learning_rate = float(config["training"]["learning_rate"]["min_learning_rate"])
-        lr_callback = [OneCycle(config=config, verbose=True)]
-
-    # Setup the optimiser
-    if config["training"]["optimiser"]["type"] == "Adam":
-        optimiser = tf.optimizers.Adam(learning_rate=learning_rate)
-    elif config["training"]["optimiser"]["type"] == "SGD":
-        optimiser = tf.optimizers.SGD(learning_rate=learning_rate)
-    elif config["training"]["optimiser"]["type"] == "Ranger":
-        import tensorflow_addons as tfa
-
-        optimiser = tfa.optimizers.Lookahead(
-            tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
-            sync_period=int(config["training"]["optimiser"]["sync_period"]),
-            slow_step_size=float(config["training"]["optimiser"]["slow_step_size"]),
-        )
-    else:
-        raise RuntimeError("Unknown optimiser type" + config["training"]["optimiser"]["type"])
-
-    # Compile the model grabbing the flavours for the loss and metrics
-    model.compile(optimizer=optimiser, loss=Loss(config), metrics=Metrics(config))
+    # Create the tensorboard writers
+    train_writer = tf.summary.create_file_writer(os.path.join(output_path, "train"))
+    val_writer = tf.summary.create_file_writer(os.path.join(output_path, "validation"))
 
-    # Find the latest checkpoint file and load it
+    # Find the latest checkpoint file and load it so we can resume training
     checkpoint_file = tf.train.latest_checkpoint(output_path)
     if checkpoint_file is not None:
         model.load_weights(checkpoint_file)
 
-    # Fit the model
-    history = model.fit(
-        training_dataset,
-        epochs=config["training"]["epochs"],
-        steps_per_epoch=(
-            None if "batches_per_epoch" not in config["training"] else config["training"]["batches_per_epoch"]
-        ),
-        validation_data=validation_dataset,
-        validation_steps=config["training"]["validation"]["samples"],
-        callbacks=[
-            ImageTensorBoard(
-                log_dir=output_path,
-                update_freq=config["training"]["validation"]["log_frequency"],
-                profile_batch=0,
-                write_graph=True,
-                histogram_freq=1,
-            ),
-            tf.keras.callbacks.ModelCheckpoint(
-                filepath=os.path.join(output_path, "model"),
-                monitor="loss",
-                save_weights_only=True,
-                save_best_only=True,
-            ),
-            tf.keras.callbacks.TerminateOnNaN(),
-            ImageCallback(config, output_path),
-            *lr_callback,
-        ],
-    )
-
-    # Pickle the history object
-    with open(os.path.join(output_path, "history.yaml"), "w") as out:
-        yaml.dump(history.history, out, default_flow_style=None, width=float("inf"))
+    # Group the datasets per epoch; repeat training only when an epoch is a fixed number of batches (key is optional)
+    batches_per_epoch = config["training"].get("batches_per_epoch")
+    if batches_per_epoch is not None:
+        training_dataset = training_dataset.repeat()
+    validation_dataset = validation_dataset.repeat()
+    training = DatasetGrouper(training_dataset, batches_per_epoch)
+    validation = DatasetGrouper(validation_dataset, config["training"]["validation"]["samples"])
+
+    batch_no = 0
+    best_loss = None
+    for epoch in trange(config["training"]["epochs"], desc="Epoch", unit="epoch", dynamic_ncols=True):
+        # Update the learning rate
+        optimiser.lr = learning_rate(epoch)
+
+        # Perform the training loop
+        loss_sum = 0
+        loss_count = 0
+        for data in tqdm(training, desc="Train Batch", unit="batch", dynamic_ncols=True, leave=False):
+            # Run the network
+            with tf.GradientTape() as tape:
+                logits = model(data["X"], data["G"], training=True)
+                loss = loss_fn(data["Y"], logits)
+
+            # If our loss ever becomes non finite end training (SystemExit does not depend on the site module)
+            if not tf.math.is_finite(loss):
+                tqdm.write("Loss is not finite, terminating training")
+                raise SystemExit(1)
+
+            # Apply the gradient update
+            grads = tape.gradient(loss, model.trainable_weights)
+            optimiser.apply_gradients(zip(grads, model.trainable_weights))
+
+            # Update all the metric states
+            for m in metrics:
+                m.update_state(data["Y"], logits)
+            loss_sum += loss
+            loss_count += 1
+
+            # If we are doing batch level logs
+            if config["training"]["validation"]["log_frequency"] == "batch":
+                _write_metrics(
+                    writer=train_writer,
+                    loss=loss,
+                    step=batch_no,
+                    prefix="batch/",
+                    metrics=metrics,
+                    images=False,
+                    reset=False,
+                )
+
+            batch_no += 1
+
+        # Output progress images
+        progress_images(model, epoch)
+
+        # Log epoch level stats
+        train_metric = _write_metrics(
+            writer=train_writer,
+            loss=(loss_sum / loss_count),
+            step=epoch,
+            prefix="epoch/",
+            metrics=metrics,
+            images=True,
+            reset=True,
+        )
+
+        tqdm.write("Training Epoch {}\n{}".format(epoch, ", ".join(train_metric)))
+
+        # Run validation at the end of each step
+        loss_sum = 0
+        loss_count = 0
+        for data in tqdm(validation, desc="Validation Batch", unit="batch", dynamic_ncols=True, leave=False):
+            logits = model(data["X"], data["G"], training=False)
+            loss_sum += loss_fn(data["Y"], logits)
+            loss_count += 1
+
+            for m in metrics:
+                m.update_state(data["Y"], logits)
+
+        # Validation stats
+        validation_metric = _write_metrics(
+            writer=val_writer,
+            loss=loss_sum / loss_count,
+            step=epoch,
+            prefix="epoch/",
+            metrics=metrics,
+            images=True,
+            reset=True,
+        )
+
+        tqdm.write("Validation Epoch {}\n{}".format(epoch, ", ".join(validation_metric)))
+
+        # If this is the best model we have seen, save it and remember its loss as the new value to beat
+        if best_loss is None or best_loss > loss_sum / loss_count:
+            best_loss = loss_sum / loss_count
+            model.save_weights(os.path.join(output_path, "checkpoint"))

From 3684814af840c28fc091d5aab78de0816285556f Mon Sep 17 00:00:00 2001
From: Alex Biddulph <c3124185@uon.edu.au>
Date: Thu, 16 Jun 2022 13:04:48 +1000
Subject: [PATCH 2/2] Fix lookup function

---
 cpp/visualmesh/visualmesh.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/visualmesh/visualmesh.hpp b/cpp/visualmesh/visualmesh.hpp
index 4804765..f8539c8 100644
--- a/cpp/visualmesh/visualmesh.hpp
+++ b/cpp/visualmesh/visualmesh.hpp
@@ -148,7 +148,7 @@ class VisualMesh {
         // z height from the transformation matrix
         const Scalar& h = Hoc[2][3];
         auto mesh       = height(h);
-        return mesh->lookup(Hoc, lens);
+        return std::make_pair(mesh, mesh.lookup(Hoc, lens));
     }
 
 private: