From aa423cc38a95d001e426ff8e427bce7f387877c4 Mon Sep 17 00:00:00 2001
From: Piv <18462828+Piv200@users.noreply.github.com>
Date: Sat, 20 Nov 2021 13:37:26 +1030
Subject: [PATCH] Start adding unsupervised train loop

---
 unsupervised/models.py |  4 +--
 unsupervised/train.py  | 67 ++++++++++++++++++++++++++++++++++++++----
 2 files changed, 63 insertions(+), 8 deletions(-)

diff --git a/unsupervised/models.py b/unsupervised/models.py
index 7e56d09..c79fa51 100644
--- a/unsupervised/models.py
+++ b/unsupervised/models.py
@@ -9,8 +9,8 @@ def wrap_mobilenet_nnconv5_for_utrain(model):
     This just exposes the lower disparity layers as outputs, so they can be used to train at different
     scales/image resolutions.
 
-    :param model:
-    :return:
+    :param model: Fast Depth model to wrap
+    :return: Keras model that takes the same input as model and outputs the model output plus 3 disparity layers
     """
     input = model.input
     disp_1 = model.get_layer('conv_pw_%d_relu' % 15).output
diff --git a/unsupervised/train.py b/unsupervised/train.py
index 01f028a..0e7a743 100644
--- a/unsupervised/train.py
+++ b/unsupervised/train.py
@@ -5,16 +5,71 @@
 Allows pluggable depth networks for differing performance (including fast-depth)
 """
 import tensorflow.keras as keras
+import warp
+import unsupervised.loss as loss
 
 
-class SFMLearner(keras.Model):
+class UnsupervisedPoseDepthLearner(keras.Model):
+    """
+    Keras model to learn depth + pose simultaneously from image/video sequences.
+
+    To train this, the data source should yield 3 consecutive frames and the camera intrinsics.
+    Optionally, velocity + timestamp per frame can be supplied to train to real (metric) scale.
+    """
+
+    def __init__(self, depth_model, pose_model, num_scales=3, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.depth_model = depth_model
+        self.pose_model = pose_model
+        self.num_scales = num_scales
 
-    def __init__(depth_model, pose_model):
-        pass
 
     def train_step(self, data):
-        pass
+        """
+        Run a single training step on a (previous, target, next) frame triplet.
+        :param data: Format: {frames: Mat[3], intrinsics: Tensor}
+        """
+        # Pass the target image through the depth network
+        # TODO: Convert frame to tensor (or do this in the dataloader)
+        # TODO: Ensure the depth output includes enough outputs for each scale
+        depth = self.depth_model(data.frames[1])
+
+        # Pass the target frame with each source frame through the pose network
+        # TODO: Concat these poses using tf.concat
+        pose1 = self.pose_model(data.frames[1], data.frames[0])
+        pose2 = self.pose_model(data.frames[1], data.frames[2])
+
+        shape = depth[0].shape
+
+        # TODO: Pull coords out of train_step into the initialiser, then it only needs to be created once.
+        # Ideally the size/batch size will still be calculated automatically
+        coords = warp.image_coordinate(shape[0], shape[1], shape[2])
+
+        scale_losses = []
+        # For each scale, do the projective inverse warp step and calculate the losses
+        for i in range(self.num_scales):
+            # TODO: Could simplify this by stacking the source images (see sfmlearner)
+            # It isn't too much of an issue right now since we're only using 2 source images, as in monodepth
+            # For each depth output (scale), do the projective inverse warp on each source image and calculate the losses
+            # Only take the min loss between the two warped images (from monodepth2)
+            warp1 = warp.projective_inverse_warp(data.frames[0], depth[i], pose1, data.intrinsics, coords)
+            warp2 = warp.projective_inverse_warp(data.frames[2], depth[i], pose2, data.intrinsics, coords)
+
+            # Per-pixel photometric loss between the target frame and each warped source:
+            # combined SSIM + L1, as in monodepth/monodepth2
+            loss1 = loss.make_combined_ssim_l1_loss(data.frames[1], warp1)
+            loss2 = loss.make_combined_ssim_l1_loss(data.frames[1], warp2)
+
+            # Take the per-pixel min of these? Or the min after auto-masking? I think after auto-masking
 
 
-def make_sfm_learner_pose_net(input_shape=(224, 224, 3)):
-    pass
+            # Also do the auto-masking from monodepth2 (compare the photometric error of each warped source with
+            # that of the unwarped source; if the unwarped source already matches the target better, ignore the pixel).
+            pass
+
+        # Collect the per-scale losses and average them
+
+        # Calculate the smoothness losses
+
+
+        pass
\ No newline at end of file
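
Note on the models.py change: the new docstring describes a wrapper that exposes intermediate disparity layers as extra outputs. Below is a minimal sketch of that general pattern in Keras; the helper name expose_extra_outputs and the decoder layer names are placeholders for illustration, not the repo's wrap_mobilenet_nnconv5_for_utrain implementation (only 'conv_pw_15_relu' appears in the patch).

import tensorflow.keras as keras


def expose_extra_outputs(model, layer_names):
    """Return a new Keras model whose outputs are the original output plus the named intermediate layers."""
    extra_outputs = [model.get_layer(name).output for name in layer_names]
    return keras.Model(inputs=model.input, outputs=[model.output] + extra_outputs)


# Illustrative usage (placeholder layer names):
# multi_scale_model = expose_extra_outputs(fast_depth_model,
#                                          ['conv_pw_15_relu', 'decoder_block_2', 'decoder_block_3'])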
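
The remaining TODOs in train_step (per-pixel minimum over the two warped sources, monodepth2-style auto-masking, a smoothness term, and averaging over scales) could be assembled roughly as sketched below. This is a standalone sketch of the monodepth2 formulation in plain TensorFlow ops; the helper names (per_pixel_ssim, photometric_error, min_reprojection_loss, edge_aware_smoothness) are illustrative and are not the repo's warp/loss API, and the constants are the usual monodepth2 defaults.

import tensorflow as tf


def per_pixel_ssim(x, y):
    """Per-pixel SSIM distance map, computed with 3x3 average pooling (as in monodepth2)."""
    c1, c2 = 0.01 ** 2, 0.03 ** 2

    def pool(t):
        return tf.nn.avg_pool2d(t, ksize=3, strides=1, padding='SAME')

    mu_x, mu_y = pool(x), pool(y)
    sigma_x = pool(x * x) - mu_x ** 2
    sigma_y = pool(y * y) - mu_y ** 2
    sigma_xy = pool(x * y) - mu_x * mu_y
    ssim_n = (2 * mu_x * mu_y + c1) * (2 * sigma_xy + c2)
    ssim_d = (mu_x ** 2 + mu_y ** 2 + c1) * (sigma_x + sigma_y + c2)
    return tf.clip_by_value((1 - ssim_n / ssim_d) / 2, 0, 1)


def photometric_error(target, pred, alpha=0.85):
    """Combined SSIM + L1 error map, averaged over channels -> shape [batch, h, w]."""
    l1 = tf.abs(target - pred)
    err = alpha * per_pixel_ssim(target, pred) + (1 - alpha) * l1
    return tf.reduce_mean(err, axis=-1)


def min_reprojection_loss(target, warped_sources, raw_sources):
    """Per-pixel minimum over the warped sources, with monodepth2 auto-masking.

    Including the errors of the *unwarped* sources in the minimum means pixels that already
    match the target (static camera, objects moving with the camera) are effectively ignored.
    """
    reprojection = tf.stack([photometric_error(target, w) for w in warped_sources], axis=0)
    identity = tf.stack([photometric_error(target, s) for s in raw_sources], axis=0)
    # Tiny noise breaks ties so truly static pixels fall back to the identity error.
    identity += tf.random.uniform(tf.shape(identity)) * 1e-5
    per_pixel_min = tf.reduce_min(tf.concat([reprojection, identity], axis=0), axis=0)
    return tf.reduce_mean(per_pixel_min)


def edge_aware_smoothness(disp, image):
    """First-order smoothness on mean-normalised disparity [batch, h, w, 1], down-weighted at image edges."""
    disp = disp / (tf.reduce_mean(disp, axis=[1, 2], keepdims=True) + 1e-7)
    dx_d = tf.abs(disp[:, :, 1:, :] - disp[:, :, :-1, :])
    dy_d = tf.abs(disp[:, 1:, :, :] - disp[:, :-1, :, :])
    dx_i = tf.reduce_mean(tf.abs(image[:, :, 1:, :] - image[:, :, :-1, :]), axis=-1, keepdims=True)
    dy_i = tf.reduce_mean(tf.abs(image[:, 1:, :, :] - image[:, :-1, :, :]), axis=-1, keepdims=True)
    return tf.reduce_mean(dx_d * tf.exp(-dx_i)) + tf.reduce_mean(dy_d * tf.exp(-dy_i))

Inside the per-scale loop this would reduce to something like scale_losses.append(min_reprojection_loss(target, [warp1, warp2], [data.frames[0], data.frames[2]]) + 1e-3 * edge_aware_smoothness(depth[i], target) / (2 ** i)), with the frames resized to each scale's resolution or the disparities upsampled to full resolution as monodepth2 does, and the final training loss taken as the mean of scale_losses.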