From aa423cc38a95d001e426ff8e427bce7f387877c4 Mon Sep 17 00:00:00 2001
From: Piv <18462828+Piv200@users.noreply.github.com>
Date: Sat, 20 Nov 2021 13:37:26 +1030
Subject: [PATCH] Start adding unsupervised train loop

---
 unsupervised/models.py |  4 +--
 unsupervised/train.py  | 67 ++++++++++++++++++++++++++++++++++++++----
 2 files changed, 63 insertions(+), 8 deletions(-)

diff --git a/unsupervised/models.py b/unsupervised/models.py
index 7e56d09..c79fa51 100644
--- a/unsupervised/models.py
+++ b/unsupervised/models.py
@@ -9,8 +9,8 @@ def wrap_mobilenet_nnconv5_for_utrain(model):
     This just exposes the lower disparity layers as outputs, so they can be used to train at different
     scales/image resolutions.
 
-    :param model:
-    :return:
+    :param model: Fast Depth model to wrap
+    :return: Keras model that takes the same input as model and outputs the model output plus 3 disparity layers
     """
     input = model.input
     disp_1 = model.get_layer('conv_pw_%d_relu' % 15).output
diff --git a/unsupervised/train.py b/unsupervised/train.py
index 01f028a..0e7a743 100644
--- a/unsupervised/train.py
+++ b/unsupervised/train.py
@@ -5,16 +5,71 @@
 Allows pluggable depth networks for differing performance (including fast-depth)
 """
 import tensorflow.keras as keras
+import warp
+import unsupervised.loss as loss
 
 
-class SFMLearner(keras.Model):
+class UnsupervisedPoseDepthLearner(keras.Model):
+    """
+    Keras model to learn depth + pose simultaneously from image/video sequences.
+
+    To train this, the data source should yield 3 consecutive frames and the camera intrinsics.
+    Optionally, velocity + timestamp per frame can be supplied to train to real (metric) scale.
+    """
+
+    def __init__(self, depth_model, pose_model, num_scales=3, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.depth_model = depth_model
+        self.pose_model = pose_model
+        self.num_scales = num_scales
 
-    def __init__(depth_model, pose_model):
-        pass
 
     def train_step(self, data):
-        pass
+        """
+        Run a single training step on a (previous, target, next) frame triplet.
+        :param data: Format: {frames: Mat[3], intrinsics: Tensor}
+        """
+        # Pass the target image through the depth network
+        # TODO: Convert frame to tensor (or do this in the dataloader)
+        # TODO: Ensure the depth output includes enough outputs for each scale
+        depth = self.depth_model(data.frames[1])
+
+        # Pass the target frame with each source frame through the pose network
+        # TODO: Concat these poses using tf.concat
+        pose1 = self.pose_model(data.frames[1], data.frames[0])
+        pose2 = self.pose_model(data.frames[1], data.frames[2])
+
+        shape = depth[0].shape
+
+        # TODO: Pull coords out of train_step into the initialiser, then it only needs to be created once.
+        # Ideally the size/batch size will still be calculated automatically
+        coords = warp.image_coordinate(shape[0], shape[1], shape[2])
+
+        scale_losses = []
+        # For each scale, do the projective inverse warp step and calculate the losses
+        for i in range(self.num_scales):
+            # TODO: Could simplify this by stacking the source images (see sfmlearner)
+            # It isn't too much of an issue right now since we're only using 2 source images, as in monodepth
+            # For each depth output (scale), do the projective inverse warp on each source image and calculate the losses
+            # Only take the min loss between the two warped images (from monodepth2)
+            warp1 = warp.projective_inverse_warp(data.frames[0], depth[i], pose1, data.intrinsics, coords)
+            warp2 = warp.projective_inverse_warp(data.frames[2], depth[i], pose2, data.intrinsics, coords)
+
+            # Per-pixel photometric loss between the target frame and each warped source:
+            # combined SSIM + L1, as in monodepth/monodepth2
+            loss1 = loss.make_combined_ssim_l1_loss(data.frames[1], warp1)
+            loss2 = loss.make_combined_ssim_l1_loss(data.frames[1], warp2)
+
+            # Take the per-pixel min of these? Or the min after auto-masking? I think after auto-masking
 
 
-def make_sfm_learner_pose_net(input_shape=(224, 224, 3)):
-    pass
+            # Also do the auto-masking from monodepth2 (compare the photometric error of each warped source with
+            # that of the unwarped source; if the unwarped source already matches the target better, ignore the pixel).
+            pass
+
+        # Collect the per-scale losses and average them
+
+        # Calculate the smoothness losses
+
+
+        pass
\ No newline at end of file
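
Note on the models.py change: the new docstring describes a wrapper that exposes intermediate disparity layers as extra outputs. Below is a minimal sketch of that general pattern in Keras; the helper name expose_extra_outputs and the decoder layer names are placeholders for illustration, not the repo's wrap_mobilenet_nnconv5_for_utrain implementation (only 'conv_pw_15_relu' appears in the patch).

import tensorflow.keras as keras


def expose_extra_outputs(model, layer_names):
    """Return a new Keras model whose outputs are the original output plus the named intermediate layers."""
    extra_outputs = [model.get_layer(name).output for name in layer_names]
    return keras.Model(inputs=model.input, outputs=[model.output] + extra_outputs)


# Illustrative usage (placeholder layer names):
# multi_scale_model = expose_extra_outputs(fast_depth_model,
#                                          ['conv_pw_15_relu', 'decoder_block_2', 'decoder_block_3'])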
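
The remaining TODOs in train_step (per-pixel minimum over the two warped sources, monodepth2-style auto-masking, a smoothness term, and averaging over scales) could be assembled roughly as sketched below. This is a standalone sketch of the monodepth2 formulation in plain TensorFlow ops; the helper names (per_pixel_ssim, photometric_error, min_reprojection_loss, edge_aware_smoothness) are illustrative and are not the repo's warp/loss API, and the constants are the usual monodepth2 defaults.

import tensorflow as tf


def per_pixel_ssim(x, y):
    """Per-pixel SSIM distance map, computed with 3x3 average pooling (as in monodepth2)."""
    c1, c2 = 0.01 ** 2, 0.03 ** 2

    def pool(t):
        return tf.nn.avg_pool2d(t, ksize=3, strides=1, padding='SAME')

    mu_x, mu_y = pool(x), pool(y)
    sigma_x = pool(x * x) - mu_x ** 2
    sigma_y = pool(y * y) - mu_y ** 2
    sigma_xy = pool(x * y) - mu_x * mu_y
    ssim_n = (2 * mu_x * mu_y + c1) * (2 * sigma_xy + c2)
    ssim_d = (mu_x ** 2 + mu_y ** 2 + c1) * (sigma_x + sigma_y + c2)
    return tf.clip_by_value((1 - ssim_n / ssim_d) / 2, 0, 1)


def photometric_error(target, pred, alpha=0.85):
    """Combined SSIM + L1 error map, averaged over channels -> shape [batch, h, w]."""
    l1 = tf.abs(target - pred)
    err = alpha * per_pixel_ssim(target, pred) + (1 - alpha) * l1
    return tf.reduce_mean(err, axis=-1)


def min_reprojection_loss(target, warped_sources, raw_sources):
    """Per-pixel minimum over the warped sources, with monodepth2 auto-masking.

    Including the errors of the *unwarped* sources in the minimum means pixels that already
    match the target (static camera, objects moving with the camera) are effectively ignored.
    """
    reprojection = tf.stack([photometric_error(target, w) for w in warped_sources], axis=0)
    identity = tf.stack([photometric_error(target, s) for s in raw_sources], axis=0)
    # Tiny noise breaks ties so truly static pixels fall back to the identity error.
    identity += tf.random.uniform(tf.shape(identity)) * 1e-5
    per_pixel_min = tf.reduce_min(tf.concat([reprojection, identity], axis=0), axis=0)
    return tf.reduce_mean(per_pixel_min)


def edge_aware_smoothness(disp, image):
    """First-order smoothness on mean-normalised disparity [batch, h, w, 1], down-weighted at image edges."""
    disp = disp / (tf.reduce_mean(disp, axis=[1, 2], keepdims=True) + 1e-7)
    dx_d = tf.abs(disp[:, :, 1:, :] - disp[:, :, :-1, :])
    dy_d = tf.abs(disp[:, 1:, :, :] - disp[:, :-1, :, :])
    dx_i = tf.reduce_mean(tf.abs(image[:, :, 1:, :] - image[:, :, :-1, :]), axis=-1, keepdims=True)
    dy_i = tf.reduce_mean(tf.abs(image[:, 1:, :, :] - image[:, :-1, :, :]), axis=-1, keepdims=True)
    return tf.reduce_mean(dx_d * tf.exp(-dx_i)) + tf.reduce_mean(dy_d * tf.exp(-dy_i))

Inside the per-scale loop this would reduce to something like scale_losses.append(min_reprojection_loss(target, [warp1, warp2], [data.frames[0], data.frames[2]]) + 1e-3 * edge_aware_smoothness(depth[i], target) / (2 ** i)), with the frames resized to each scale's resolution or the disparities upsampled to full resolution as monodepth2 does, and the final training loss taken as the mean of scale_losses.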