Merge branch 'unsupervised' into 'main'

PackNet

See merge request vato007/fast-depth-tf!4
Michael Pivato
2021-07-31 01:31:25 +00:00
16 changed files with 1232 additions and 152 deletions

View File

@@ -1,4 +1,3 @@
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_datasets as tfds
@@ -25,7 +24,8 @@ def dense_depth(size, weights=None, shape=(224, 224, 3)):
densenet_output_channels = densenet.layers[-1].output.shape[-1]
# Reduce the feature set (pointwise)
-decoder = keras.layers.Conv2D(filters=densenet_output_channels, kernel_size=1, padding='same')(densenet.output)
+decoder = keras.layers.Conv2D(
+    filters=densenet_output_channels, kernel_size=1, padding='same')(densenet.output)
# The actual decoder
decoder = dense_upsample_block(
@@ -66,19 +66,19 @@ def dense_nnconv5(size, weights=None, shape=(224, 224, 3), half_features=True):
# Reduce the feature set (pointwise)
decoder = keras.layers.Conv2D(filters=int(densenet_output_shape[-1]), kernel_size=1, padding='same',
input_shape=densenet_output_shape, name='conv2')(densenet.output)
# TODO: More intermediate layers here?
# Fast Depth Decoder
decoder = fd.nnconv5(decoder, densenet.get_layer('pool3_pool').output_shape[3], 1,
skip_connection=densenet.get_layer('pool3_pool').output)
decoder = fd.nnconv5(decoder, densenet.get_layer('pool2_pool').output_shape[3], 2,
skip_connection=densenet.get_layer('pool2_pool').output)
decoder = fd.nnconv5(decoder, densenet.get_layer('pool1').output_shape[3], 3,
skip_connection=densenet.get_layer('pool1').output)
decoder = fd.nnconv5(decoder, densenet.get_layer('conv1/relu').output_shape[3], 4,
skip_connection=densenet.get_layer('conv1/relu').output)
# Final Pointwise for depth extraction
decoder = keras.layers.Conv2D(1, 1, padding='same')(decoder)
@@ -87,30 +87,6 @@ def dense_nnconv5(size, weights=None, shape=(224, 224, 3), half_features=True):
return keras.Model(inputs=input, outputs=decoder, name="fast_dense_depth")
def load_nyu(download_dir='../nyu'):
"""
Load the nyu_v2 dataset train split. Will be downloaded to ../nyu
:return: nyu_v2 dataset builder
"""
builder = tfds.builder('nyu_depth_v2')
builder.download_and_prepare(download_dir=download_dir)
return builder \
.as_dataset(split='train', shuffle_files=True) \
.shuffle(buffer_size=1024) \
.batch(8) \
.map(lambda x: fd.crop_and_resize(x))
def load_nyu_evaluate(download_dir='../nyu'):
"""
Load the nyu_v2 dataset validation split. Will be downloaded to ../nyu
:return: nyu_v2 dataset builder
"""
builder = tfds.builder('nyu_depth_v2')
builder.download_and_prepare(download_dir=download_dir)
return builder.as_dataset(split='validation').batch(1).map(lambda x: fd.crop_and_resize(x))
if __name__ == '__main__':
model = dense_depth(169, 'imagenet')
model.summary()
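For orientation, a minimal usage sketch (not part of this diff) wiring the pieces above together; the optimiser and loss here are placeholders, not the repo's defaults:
model = dense_depth(169, weights='imagenet')
model.compile(optimizer='adam', loss='mse')
model.fit(load_nyu(), epochs=1)          # downloads nyu_depth_v2 on first run
model.evaluate(load_nyu_evaluate())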

View File

@@ -1,7 +1,8 @@
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_datasets as tfds
# Needed for the kitti dataset, don't delete
from load import load_nyu_evaluate
from metric import *
from util import crop_and_resize
"""
Unofficial tensorflow keras implementation of FastDepth (mobilenet_nnconv5).
@@ -74,59 +75,6 @@ def mobilenet_nnconv5(weights=None, shape=(224, 224, 3)):
return keras.Model(inputs=input, outputs=x, name="fast_depth")
def delta1_metric(y_true, y_pred):
maxRatio = tf.maximum(y_pred / y_true, y_true / y_pred)
return tf.nn.moments(tf.cast(maxRatio < tf.convert_to_tensor(1.25), tf.float32), axes=None)[0]
def delta2_metric(y_true, y_pred):
maxRatio = tf.maximum(y_pred / y_true, y_true / y_pred)
return tf.nn.moments(tf.cast(maxRatio < tf.convert_to_tensor(1.25 ** 2), tf.float32), axes=None)[0]
def delta3_metric(y_true, y_pred):
maxRatio = tf.maximum(y_pred / y_true, y_true / y_pred)
return tf.nn.moments(tf.cast(maxRatio < tf.convert_to_tensor(1.25 ** 3), tf.float32), axes=None)[0]
def compile(model, optimiser=keras.optimizers.SGD(), loss=keras.losses.MeanSquaredError(), custom_metrics=None):
"""
Compile FastDepth model with relevant metrics
:param model: Model to compile
:param optimiser: Custom optimiser to use
:param loss: Loss function to use
:param custom_metrics: Optional metrics to use instead of the defaults (RMSE, MSE, delta1/2/3)
"""
model.compile(optimizer=optimiser,
loss=loss,
metrics=[keras.metrics.RootMeanSquaredError(),
keras.metrics.MeanSquaredError(),
delta1_metric,
delta2_metric,
delta3_metric] if custom_metrics is None else custom_metrics)
def train(existing_model=None, pretrained_weights='imagenet', epochs=4, save_file=None, dataset=None):
"""
Compile, train and save (if a save file is specified) a Fast Depth model.
:param existing_model: Existing FastDepth model to train. None will create
:param pretrained_weights: Weights to use if existing_model is not specified. See keras.applications.MobileNet
weights parameter for options here.
:param epochs: Number of epochs to run for
:param save_file: File/directory to save to after training. By default the model won't be saved
:param dataset: Train dataset to use. By default will DOWNLOAD and use tensorflow nyu_v2 dataset
"""
if not existing_model:
existing_model = mobilenet_nnconv5(pretrained_weights)
compile(existing_model)
if not dataset:
dataset = load_nyu()
existing_model.fit(dataset, epochs=epochs)
if save_file:
existing_model.save(save_file)
return existing_model
def evaluate(compiled_model, dataset=None):
"""
Evaluate the model using rmse, delta1/2/3 metrics
@@ -150,66 +98,6 @@ def forward(model, image):
return model(crop_and_resize(image))
def load_model(file):
"""
Load previously trained FastDepth model from disk. Will include relevant metrics (custom objects)
:param file: File/directory to load the model from
:return:
"""
return keras.models.load_model(file, custom_objects={'delta1_metric': delta1_metric,
'delta2_metric': delta2_metric,
'delta3_metric': delta3_metric})
def crop_and_resize(x):
shape = tf.shape(x['depth'])
img_shape = tf.shape(x['image'])
# Ensure we get a square for when we resize is later.
# For horizontal images this is basically just cropping the sides off
center_shape = min(shape[1], shape[2], img_shape[1], img_shape[2])
def layer():
return keras.Sequential([
keras.layers.experimental.preprocessing.CenterCrop(
center_shape, center_shape),
keras.layers.experimental.preprocessing.Resizing(
224, 224, interpolation='nearest')
])
# Reshape label to 4d, can't use array unwrap as it's unsupported by tensorflow
return layer()(x['image']), layer()(tf.reshape(x['depth'], [shape[0], shape[1], shape[2], 1]))
def load_nyu(download_dir='../nyu'):
"""
Load the nyu_v2 dataset train split. Will be downloaded to ../nyu
:return: nyu_v2 dataset builder
"""
builder = tfds.builder('nyu_depth_v2')
builder.download_and_prepare(download_dir=download_dir)
return builder \
.as_dataset(split='train', shuffle_files=True) \
.shuffle(buffer_size=1024) \
.batch(8) \
.map(lambda x: crop_and_resize(x))
def load_nyu_evaluate(download_dir='../nyu'):
"""
Load the nyu_v2 dataset validation split. Will be downloaded to ../nyu
:return: nyu_v2 dataset builder
"""
builder = tfds.builder('nyu_depth_v2')
builder.download_and_prepare(download_dir=download_dir)
return builder.as_dataset(split='validation').batch(1).map(lambda x: crop_and_resize(x))
def load_kitti(download_dir='../kitti'):
ds = tfds.builder('kitti_depth')
ds.download_and_prepare(download_dir=download_dir)
return ds.as_dataset(tfds.Split.TRAIN).batch(8).map(lambda x: crop_and_resize(x))
if __name__ == '__main__':
model = mobilenet_nnconv5()
model.summary()
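A hedged sketch of the entry points this file exposed before the split into load.py/train.py (names as defined above):
model = train(epochs=1, save_file='fast_depth_saved')  # builds mobilenet_nnconv5, downloads nyu_v2
evaluate(model, load_nyu_evaluate())
restored = load_model('fast_depth_saved')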

group_norm.py Normal file
View File

@@ -0,0 +1,209 @@
# MIT License
#
# Copyright (c) 2019 Somshubra Majumdar
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Taken from: https://github.com/titu1994/Keras-Group-Normalization/blob/master/group_norm.py
from tensorflow.keras import backend as K
from tensorflow.keras import constraints
from tensorflow.keras import initializers
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Layer, InputSpec
class GroupNormalization(Layer):
"""Group normalization layer
Group Normalization divides the channels into groups and computes within each group
the mean and variance for normalization. GN's computation is independent of batch sizes,
and its accuracy is stable in a wide range of batch sizes
# Arguments
groups: Integer, the number of groups for Group Normalization.
axis: Integer, the axis that should be normalized
(typically the features axis).
For instance, after a `Conv2D` layer with
`data_format="channels_first"`,
set `axis=1` in `BatchNormalization`.
epsilon: Small float added to variance to avoid dividing by zero.
center: If True, add offset of `beta` to normalized tensor.
If False, `beta` is ignored.
scale: If True, multiply by `gamma`.
If False, `gamma` is not used.
When the next layer is linear (also e.g. `nn.relu`),
this can be disabled since the scaling
will be done by the next layer.
beta_initializer: Initializer for the beta weight.
gamma_initializer: Initializer for the gamma weight.
beta_regularizer: Optional regularizer for the beta weight.
gamma_regularizer: Optional regularizer for the gamma weight.
beta_constraint: Optional constraint for the beta weight.
gamma_constraint: Optional constraint for the gamma weight.
# Input shape
Arbitrary. Use the keyword argument `input_shape`
(tuple of integers, does not include the samples axis)
when using this layer as the first layer in a model.
# Output shape
Same shape as input.
# References
- [Group Normalization](https://arxiv.org/abs/1803.08494)
"""
def __init__(self,
groups=32,
axis=-1,
epsilon=1e-5,
center=True,
scale=True,
beta_initializer='zeros',
gamma_initializer='ones',
beta_regularizer=None,
gamma_regularizer=None,
beta_constraint=None,
gamma_constraint=None,
**kwargs):
super(GroupNormalization, self).__init__(**kwargs)
self.supports_masking = True
self.groups = groups
self.axis = axis
self.epsilon = epsilon
self.center = center
self.scale = scale
self.beta_initializer = initializers.get(beta_initializer)
self.gamma_initializer = initializers.get(gamma_initializer)
self.beta_regularizer = regularizers.get(beta_regularizer)
self.gamma_regularizer = regularizers.get(gamma_regularizer)
self.beta_constraint = constraints.get(beta_constraint)
self.gamma_constraint = constraints.get(gamma_constraint)
self.gamma = None
self.beta = None
def build(self, input_shape):
dim = input_shape[self.axis]
if dim is None:
raise ValueError('Axis ' + str(self.axis) + ' of '
'input tensor should have a defined dimension '
'but the layer received an input with shape ' +
str(input_shape) + '.')
if dim < self.groups:
raise ValueError('Number of groups (' + str(self.groups) + ') cannot be '
'more than the number of channels (' +
str(dim) + ').')
if dim % self.groups != 0:
raise ValueError('Number of channels (' + str(dim) + ') must be a '
'multiple of the number of groups (' +
str(self.groups) + ').')
self.input_spec = InputSpec(ndim=len(input_shape),
axes={self.axis: dim})
shape = (dim,)
if self.scale:
self.gamma = self.add_weight(shape=shape,
name='gamma',
initializer=self.gamma_initializer,
regularizer=self.gamma_regularizer,
constraint=self.gamma_constraint)
if self.center:
self.beta = self.add_weight(shape=shape,
name='beta',
initializer=self.beta_initializer,
regularizer=self.beta_regularizer,
constraint=self.beta_constraint)
self.built = True
def call(self, inputs, **kwargs):
input_shape = K.int_shape(inputs)
tensor_input_shape = K.shape(inputs)
# Prepare broadcasting shape.
reduction_axes = list(range(len(input_shape)))
del reduction_axes[self.axis]
broadcast_shape = [1] * len(input_shape)
broadcast_shape[self.axis] = input_shape[self.axis] // self.groups
broadcast_shape.insert(1, self.groups)
reshape_group_shape = K.shape(inputs)
group_axes = [reshape_group_shape[i] for i in range(len(input_shape))]
group_axes[self.axis] = input_shape[self.axis] // self.groups
group_axes.insert(1, self.groups)
# reshape inputs to new group shape
group_shape = [group_axes[0], self.groups] + group_axes[2:]
group_shape = K.stack(group_shape)
inputs = K.reshape(inputs, group_shape)
group_reduction_axes = list(range(len(group_axes)))
group_reduction_axes = group_reduction_axes[2:]
mean = K.mean(inputs, axis=group_reduction_axes, keepdims=True)
variance = K.var(inputs, axis=group_reduction_axes, keepdims=True)
inputs = (inputs - mean) / (K.sqrt(variance + self.epsilon))
# prepare broadcast shape
inputs = K.reshape(inputs, group_shape)
outputs = inputs
# In this case we must explicitly broadcast all parameters.
if self.scale:
broadcast_gamma = K.reshape(self.gamma, broadcast_shape)
outputs = outputs * broadcast_gamma
if self.center:
broadcast_beta = K.reshape(self.beta, broadcast_shape)
outputs = outputs + broadcast_beta
outputs = K.reshape(outputs, tensor_input_shape)
return outputs
def get_config(self):
config = {
'groups': self.groups,
'axis': self.axis,
'epsilon': self.epsilon,
'center': self.center,
'scale': self.scale,
'beta_initializer': initializers.serialize(self.beta_initializer),
'gamma_initializer': initializers.serialize(self.gamma_initializer),
'beta_regularizer': regularizers.serialize(self.beta_regularizer),
'gamma_regularizer': regularizers.serialize(self.gamma_regularizer),
'beta_constraint': constraints.serialize(self.beta_constraint),
'gamma_constraint': constraints.serialize(self.gamma_constraint)
}
base_config = super(GroupNormalization, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def compute_output_shape(self, input_shape):
return input_shape
if __name__ == '__main__':
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
ip = Input(shape=(None, None, 4))
# ip = Input(batch_shape=(100, None, None, 2))
x = GroupNormalization(groups=2, axis=-1, epsilon=0.1)(ip)
model = Model(ip, x)
model.summary()

load.py Normal file
View File

@@ -0,0 +1,48 @@
import tensorflow.keras as keras
import tensorflow_datasets as tfds
from losses import dense_depth_loss_function
from metric import *
from util import crop_and_resize
def load_nyu(download_dir='../nyu', out_shape=(224, 224)):
"""
Load the nyu_v2 dataset train split. Will be downloaded to ../nyu
:return: batched nyu_v2 train split as a tf.data.Dataset
"""
builder = tfds.builder('nyu_depth_v2')
builder.download_and_prepare(download_dir=download_dir)
return builder \
.as_dataset(split='train', shuffle_files=True) \
.shuffle(buffer_size=1024) \
.batch(8) \
.map(lambda x: crop_and_resize(x, out_shape))
def load_nyu_evaluate(download_dir='../nyu', out_shape=(224, 224)):
"""
Load the nyu_v2 dataset validation split. Will be downloaded to ../nyu
:return: batched nyu_v2 validation split as a tf.data.Dataset
"""
builder = tfds.builder('nyu_depth_v2')
builder.download_and_prepare(download_dir=download_dir)
return builder.as_dataset(split='validation').batch(1).map(lambda x: crop_and_resize(x, out_shape))
def load_kitti(download_dir='../kitti', out_shape=(224, 224)):
ds = tfds.builder('kitti_depth')
ds.download_and_prepare(download_dir=download_dir)
return ds.as_dataset(tfds.Split.TRAIN).batch(8).map(lambda x: crop_and_resize(x, out_shape))
def load_model(file):
"""
Load previously trained FastDepth model from disk. Will include relevant metrics (custom objects)
:param file: File/directory to load the model from
:return:
"""
return keras.models.load_model(file, custom_objects={'delta1_metric': delta1_metric,
'delta2': delta2,  # keys must match the metric function names recorded at save time
'delta3': delta3,
'dense_depth_loss_function': dense_depth_loss_function})
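A short sketch of the intended round trip, assuming a model previously compiled with the metrics in metric.py and saved via model.save():
model = load_model('packnet_saved')
model.evaluate(load_nyu_evaluate())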

View File

@@ -6,15 +6,15 @@ def dense_depth_loss_function(y, y_pred):
Implementation of the loss from the dense depth paper https://arxiv.org/pdf/1812.11941.pdf
"""
# Point-wise L1 loss
-l_depth = tf.reduce_mean(tf.math.abs(y_pred - y), axis=-1)
+l1_depth = tf.reduce_mean(tf.math.abs(y_pred - y), axis=-1)
# L1 loss over image gradients
dy, dx = tf.image.image_gradients(y)
dy_pred, dx_pred = tf.image.image_gradients(y_pred)
-l_grad = tf.reduce_mean(tf.math.abs(dy_pred - dy) +
-                        tf.math.abs(dx_pred - dx), axis=-1)
+gradient = tf.reduce_mean(tf.math.abs(dy_pred - dy) +
+                          tf.math.abs(dx_pred - dx), axis=-1)
# Structural Similarity (SSIM)
-l_ssim = (1 - tf.image.ssim(y, y_pred, 500)) / 2
+ssim = (1 - tf.image.ssim(y, y_pred, 500)) / 2
-return 0.1 * tf.reduce_mean(l_depth) + tf.reduce_mean(l_grad) + l_ssim
+return 0.1 * tf.reduce_mean(l1_depth) + tf.reduce_mean(gradient) + ssim
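A quick sanity check for this loss, sketched under the assumption that the function is imported from this file; feeding identical inputs should drive every term to zero:
y = tf.random.uniform([2, 224, 224, 1], maxval=10.0)
# l1 and gradient terms vanish, and SSIM(y, y) == 1 so the ssim term is 0
print(float(tf.reduce_mean(dense_depth_loss_function(y, y))))  # ~0.0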

metric.py Normal file
View File

@@ -0,0 +1,16 @@
import tensorflow as tf
def delta1_metric(y_true, y_pred):
max_ratio = tf.maximum(y_pred / y_true, y_true / y_pred)
return tf.reduce_mean(tf.cast(max_ratio < tf.convert_to_tensor(1.25), tf.float32))
def delta2(y_true, y_pred):
max_ratio = tf.maximum(y_pred / y_true, y_true / y_pred)
return tf.reduce_mean(tf.cast(max_ratio < tf.convert_to_tensor(1.25 ** 2), tf.float32))
def delta3(y_true, y_pred):
max_ratio = tf.maximum(y_pred / y_true, y_true / y_pred)
return tf.reduce_mean(tf.cast(max_ratio < tf.convert_to_tensor(1.25 ** 3), tf.float32))
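A worked example of the threshold metrics (delta_n is the fraction of pixels whose max ratio is under 1.25 ** n), a sketch using the functions above:
y_true = tf.constant([1.0, 2.0, 4.0, 8.0])
y_pred = tf.constant([1.1, 2.0, 6.0, 2.0])
# max ratios: 1.1, 1.0, 1.5, 4.0; thresholds: 1.25, 1.5625, 1.953125
print(float(delta1_metric(y_true, y_pred)))  # 0.5
print(float(delta2(y_true, y_pred)))         # 0.75
print(float(delta3(y_true, y_pred)))         # 0.75 (the 4.0 ratio fails all three)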

openvino_inference.py Normal file
View File

@@ -0,0 +1,74 @@
import argparse
import time
import cv2
import matplotlib.pyplot as plt
import numpy as np
from openvino.inference_engine import IECore
def parse_args() -> argparse.Namespace:
"""Parse and return command line arguments"""
parser = argparse.ArgumentParser(add_help=False)
args = parser.add_argument_group('Options')
# fmt: off
args.add_argument('-h', '--help', action='help', help='Show this help message and exit.')
args.add_argument('-m', '--model', required=True, type=str,
help='Required. Path to an .xml or .onnx file with a trained model.')
args.add_argument('-i', '--input', required=True, type=str, help='Required. Path to an image file.')
args.add_argument('-d', '--device', default='CPU', type=str,
help='Optional. Specify the target device to infer on; CPU, GPU, MYRIAD, HDDL or HETERO: '
'is acceptable. The sample will look for a suitable plugin for device specified. '
'Default value is CPU.')
# fmt: on
return parser.parse_args()
def sample(model_location, image_location, device='CPU'):
ie = IECore()
net = ie.read_network(model=model_location)
input_blob = next(iter(net.input_info))
output_blob = next(iter(net.outputs))
b, c, h, w = net.input_info[input_blob].input_data.shape
image = cv2.imread(image_location)
input_ratio = h / w
target_ratio = image.shape[0] / image.shape[1]
crop_axis = 0 if target_ratio > input_ratio else 1
center = [image.shape[0] / 2, image.shape[1] / 2]
# Half-extent of the crop along the cropped axis, so the crop matches input_ratio
half_h = image.shape[1] * input_ratio / 2  # when cropping the height (crop_axis == 0)
half_w = image.shape[0] / input_ratio / 2  # when cropping the width (crop_axis == 1)
x1 = int(center[0] - half_h) if crop_axis == 0 else 0
x2 = int(center[0] + half_h) if crop_axis == 0 else image.shape[0]
y1 = int(center[1] - half_w) if crop_axis == 1 else 0
y2 = int(center[1] + half_w) if crop_axis == 1 else image.shape[1]
# Crop to target aspect ratio
image = image[x1:x2, y1:y2]
if image.shape[:-1] != (h, w):
image = cv2.resize(image, (w, h))
image = image.transpose((2, 0, 1))
# For batching
image = np.expand_dims(image, axis=0)
exec_net = ie.load_network(network=net, device_name=device)
start = time.time()
res = exec_net.infer(inputs={input_blob: image})
print('First Inference Time Seconds: ' + str(time.time() - start))
start = time.time()
res = exec_net.infer(inputs={input_blob: image})
print('Second Inference Time Seconds: ' + str(time.time() - start))
start = time.time()
res = exec_net.infer(inputs={input_blob: image})
print('Third Inference Time Seconds: ' + str(time.time() - start))
res = res[output_blob]
depth = res[0][0]
fig = plt.figure()
ii = plt.imshow(depth, interpolation='nearest')
fig.colorbar(ii)
plt.show()
if __name__ == '__main__':
parsed_args = parse_args()
sample(parsed_args.model, parsed_args.input, parsed_args.device)
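The three timed calls above expose warm-up: the first inference typically includes one-off setup, so the later numbers are the representative ones. Equivalent to the CLI path, a direct call (paths are placeholders):
sample('fast_depth.xml', 'room.jpg', device='CPU')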

packnet_functional.py Normal file
View File

@@ -0,0 +1,150 @@
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
from tensorflow import nn
import group_norm
def pack_layer():
pass
def residual_layer(inputs, out_channels, stride, dropout=None):
"""
Keras implementation of the Residual block (ResNet) as used in PackNet
:param inputs:
:param out_channels:
:param stride:
:param dropout:
:return:
"""
x = layers.Conv2D(out_channels, 3, padding='same', strides=stride)(inputs)
x = layers.Conv2D(out_channels, 3, padding='same')(x)
shortcut = layers.Conv2D(
out_channels, 3, padding='same', strides=stride)(inputs)
if dropout:
shortcut = keras.layers.SpatialDropout2D(dropout)(shortcut)
x = keras.layers.Concatenate()([x, shortcut])
x = group_norm.GroupNormalization(16)(x)
return keras.layers.ELU()(x)
# Packnet usually expects more than one layer per block (2,2,3,3)
def residual_block(inputs, out_channels, residual_layers, stride, dropout=None):
x = inputs
for i in range(0, residual_layers):
x = residual_layer(x, out_channels, stride, dropout)
return x
def packnet_conv2d(inputs, out_channels, kernel_size, stride):
x = keras.layers.Conv2D(out_channels, kernel_size,
stride, padding='same')(inputs)
x = group_norm.GroupNormalization(16)(x)
return keras.layers.ELU()(x)
def packnet_inverse_depth(inputs, out_channels=1, min_depth=0.5):
x = layers.Conv2D(out_channels, 3, padding='same')(inputs)
return keras.activations.sigmoid(x) / min_depth
def pack_3d(inputs, kernel_size, r=2, features_3d=8):
"""
Implementation of the 3d packing block proposed here: https://arxiv.org/abs/1905.02693
:param inputs:
:param kernel_size:
:param r:
:param features_3d:
:return:
"""
# Data format for single image in nyu is HWC (space_to_depth uses NHWC as default)
x = nn.space_to_depth(inputs, r)
x = tf.expand_dims(x, 4)
x = keras.layers.Conv3D(features_3d, kernel_size=3, padding='same')(x)
b, h, w, c, d = x.shape
x = keras.layers.Reshape((h, w, c * d))(x)
return packnet_conv2d(x, inputs.shape[3], kernel_size, 1)
def unpack_3d(inputs, out_channels, kernel_size, r=2, features_3d=8):
x = packnet_conv2d(inputs, out_channels * (r ** 2) //
features_3d, kernel_size, 1)
x = tf.expand_dims(x, 4) # B x H/2 x W/2 x 4(out)/D x D
x = keras.layers.Conv3D(features_3d, kernel_size=3, padding='same')(x)
b, h, w, c, d = x.shape
x = keras.layers.Reshape([h, w, c * d])(x)
return nn.depth_to_space(x, r)
# TODO: Support different size packnet for scaling up/down
# TODO: Support different channel format (right now we're supporting NHWC, we should also support NCHW)
def make_packnet(shape=(224, 224, 3), skip_add=True, features_3d=4, dropout=None):
"""
Make the PackNet depth network.
:param shape: Input shape of the image
:param skip_add: Set to use add rather than concat skip connections, defaults to True
:return:
"""
# ================ ENCODER =================
input = keras.layers.Input(shape=shape)
x = packnet_conv2d(input, 32, 5, 1)
skip_1 = x
x = packnet_conv2d(x, 64, 7, 1)
x = pack_3d(x, 5, features_3d=features_3d)
skip_2 = x
x = residual_block(x, 64, 2, 1, dropout)
x = pack_3d(x, 3, features_3d=features_3d)
skip_3 = x
x = residual_block(x, 128, 2, 1, dropout)
x = pack_3d(x, 3, features_3d=features_3d)
skip_4 = x
x = residual_block(x, 256, 3, 1, dropout)
x = pack_3d(x, 3, features_3d=features_3d)
skip_5 = x
x = residual_block(x, 512, 3, 1, dropout)
x = pack_3d(x, 3, features_3d=features_3d)
# ================ ENCODER =================
# ================ DECODER =================
# layer 7
x = unpack_3d(x, 512, 3, features_3d=features_3d)
x = keras.layers.Add()(
[x, skip_5]) if skip_add else keras.layers.Concatenate()([x, skip_5])
x = packnet_conv2d(x, 512, 3, 1)
# layer 8
x = unpack_3d(x, 256, 3, features_3d=features_3d)
x = keras.layers.Add()(
[x, skip_4]) if skip_add else keras.layers.Concatenate()([x, skip_4])
x = packnet_conv2d(x, 256, 3, 1)
layer_8 = x
# layer 9
x = packnet_inverse_depth(x, 1)
# layer 10
u_layer_8 = unpack_3d(layer_8, 128, 3, features_3d=features_3d)
x = keras.layers.UpSampling2D()(x)
x = keras.layers.Add()([u_layer_8, skip_3, x]) if skip_add else keras.layers.Concatenate()([u_layer_8, skip_3, x])
x = packnet_conv2d(x, 128, 3, 1)
layer_10 = x
# layer 11
x = packnet_inverse_depth(x, 1)
# layer 12
u_layer_10 = unpack_3d(layer_10, 64, 3, features_3d=features_3d)
x = keras.layers.UpSampling2D()(x)
x = keras.layers.Add()([u_layer_10, skip_2, x]) if skip_add else keras.layers.Concatenate()([u_layer_10, skip_2, x])
x = packnet_conv2d(x, 64, 3, 1)
layer_12 = x
# layer 13
x = packnet_inverse_depth(x)
# layer 14
u_layer_12 = unpack_3d(layer_12, 32, 3, features_3d=features_3d)
x = keras.layers.UpSampling2D()(x)
x = keras.layers.Add()([u_layer_12, skip_1, x]) if skip_add else keras.layers.Concatenate()([u_layer_12, skip_1, x])
x = packnet_conv2d(x, 32, 3, 1)
# layer 15
x = packnet_inverse_depth(x)
# ================ DECODER =================
return keras.Model(inputs=input, outputs=x, name="PackNet")
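A shape-level smoke test in the spirit of packnet_tests.py, sketched here rather than part of the diff:
net = make_packnet(shape=(224, 224, 3))
out = net(tf.random.normal([1, 224, 224, 3]))
print(out.shape)  # expect (1, 224, 224, 1): sigmoid/min_depth inverse depth at input resolution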

packnet_tests.py Normal file
View File

@@ -0,0 +1,37 @@
import unittest
import tensorflow as tf
import packnet_functional as p
class PacknetTests(unittest.TestCase):
def test_pack_3d_layer(self):
# 3d packing expects a multiple of 16 for channels due to using 16 groups in group normalisation
test_input = tf.random.normal([4, 224, 224, 32])
y = p.pack_3d(test_input, 3, features_3d=4)
out_shape = [i for i in test_input.shape]
out_shape[1] = out_shape[1] // 2
out_shape[2] = out_shape[2] // 2
# TODO: Anything else we can test here for validity?
self.assertEqual(y.shape, out_shape)
def test_unpack_3d_layer(self):
num_output_channels = 32
test_input = tf.random.normal([4, 112, 112, 64])
y = p.unpack_3d(test_input, num_output_channels, 3, features_3d=4)
out_shape = [i for i in test_input.shape]
out_shape[1] = out_shape[1] * 2
out_shape[2] = out_shape[2] * 2
out_shape[3] = num_output_channels
# TODO: Anything else we can test here for validity?
self.assertEqual(y.shape, out_shape)
def test_packnet(self):
packnet = p.make_packnet()
self.assertIsNotNone(packnet)
if __name__ == '__main__':
unittest.main()

train.py Normal file
View File

@@ -0,0 +1,49 @@
"""
Collection of functions to train the various models, and use different losses
"""
import tensorflow.keras as keras
from load import load_nyu
from metric import *
def compile(model, optimiser=keras.optimizers.SGD(), loss=keras.losses.MeanSquaredError(), custom_metrics=None):
"""
Compile FastDepth model with relevant metrics
:param model: Model to compile
:param optimiser: Custom optimiser to use
:param loss: Loss function to use
:param custom_metrics: Optional metrics to use instead of the defaults (RMSE, MSE, delta1/2/3, MAPE, MAE)
"""
model.compile(optimizer=optimiser,
loss=loss,
metrics=[keras.metrics.RootMeanSquaredError(),
keras.metrics.MeanSquaredError(),
delta1_metric,
delta2,
delta3,
keras.metrics.MeanAbsolutePercentageError(),
keras.metrics.MeanAbsoluteError()] if custom_metrics is None else custom_metrics)
def train(existing_model=None, pretrained_weights='imagenet', epochs=4, save_file=None, dataset=None,
checkpoint='ckpt'):
"""
Compile, train and save (if a save file is specified) a Fast Depth model.
:param existing_model: Compiled model to train (see compile()). Required: unlike the old
fast_depth.train(), this function does not build or compile a model itself
:param pretrained_weights: Currently unused here; kept from the old fast_depth.train() signature
:param epochs: Number of epochs to run for
:param save_file: File/directory to save to after training. By default the model won't be saved
:param dataset: Train dataset to use. By default the TensorFlow nyu_v2 dataset is downloaded and used
:param checkpoint: Checkpoint to save to
"""
callbacks = []
if checkpoint:
callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint, save_weights_only=True))
if not existing_model:
raise ValueError('existing_model is required and must already be compiled (see compile())')
if not dataset:
dataset = load_nyu()
existing_model.fit(dataset, epochs=epochs, callbacks=callbacks)
if save_file:
existing_model.save(save_file)
return existing_model
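One way the new modules compose, sketched under the assumption that packnet_functional and losses are importable as below:
from packnet_functional import make_packnet
from losses import dense_depth_loss_function
model = make_packnet()
compile(model, optimiser=keras.optimizers.Adam(), loss=dense_depth_loss_function)
train(existing_model=model, epochs=4, save_file='packnet_saved')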

unsupervised/loss.py Normal file
View File

@@ -0,0 +1,53 @@
import tensorflow as tf
def l1_loss(target_img, reprojected_img):
"""
Calculates the l1 norm between the target and reprojected image
:param target_img: Tensor (batch, height, width, 3)
:param reprojected_img: Tensor, same shape as target_img
:return: The per-pixel l1 norm -> Tensor (batch, height, width)
"""
return tf.reduce_mean(tf.abs(target_img - reprojected_img), axis=3)
def l2_loss(target_img, reprojected_img):
"""
Calculates the l2 norm between the target and reprojected image
:param target_img: Tensor (batch, height, width, 3)
:param reprojected_img: Tensor, same shape as target_img
:return: The per-pixel l2 norm -> Tensor (batch, height, width)
"""
# Note: ** binds tighter than intended, x ** 2 ** (1 / 2) is x ** sqrt(2)
return tf.sqrt(tf.reduce_mean(tf.square(target_img - reprojected_img), axis=3))
def make_combined_ssim_l1_loss(ssim_weight: float = 0.85, other_loss_fn=l1_loss):
"""
Create a loss function that will calculate ssim for the two images, and use the other_loss_fn to calculate the
per pixel loss
:param ssim_weight: Weighting that should be applied to SSIM weight vs l1 difference between target and
reprojected image
:param other_loss_fn: Function to combine with the ssim
:return: Function to calculate the per-pixel combined ssim with other loss function
"""
def combined_ssim_loss(target_img, reprojected_img):
"""
Calculates the per-pixel photometric reconstruction loss for each source image,
combined this with the SSIM between the reconstructed image and the actual image.
Calculates the following:
ssim_weight * SSIM(target_img, reprojected_img) + (1 - ssim_weight) * other_loss_fn(target_img - reprojected_img)
:param target_img: Tensor with shape (batch, height, width, 3) - current image we're training on
:param reprojected_img: Tensor with same shape as target_img, Reprojected from some source image that
should be as close as possible to the target image
:return: Per-pixel loss -> Tensor (batch, height, width), where height and width match target_img
height and width
"""
# tf.image.ssim has no axis/keepdim arguments; it returns one SSIM value per image.
# Use the DSSIM form (1 - ssim) / 2 and broadcast it over the per-pixel loss
# (max_val=1.0 assumes float images in [0, 1]).
ssim = (1 - tf.image.ssim(target_img, reprojected_img, max_val=1.0)) / 2
ssim = tf.reshape(ssim, [-1, 1, 1])
return ssim_weight * ssim + (1 - ssim_weight) * other_loss_fn(target_img, reprojected_img)
return combined_ssim_loss
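Intended usage, sketched; assumes target_img/reprojected_img are float image batches in [0, 1], as required by max_val=1.0 above:
loss_fn = make_combined_ssim_l1_loss(ssim_weight=0.85)
per_pixel = loss_fn(target_img, reprojected_img)  # (batch, height, width)
loss = tf.reduce_mean(per_pixel)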

unsupervised/third-party/train.py vendored Normal file
View File

@@ -0,0 +1,166 @@
"""
Trainer to learn depth information on unlabeled data (raw images/videos)
Allows pluggable depth networks for differing performance (including fast-depth)
"""
import numpy as np
import tensorflow as tf
def compute_smooth_loss(pred_disp):
def gradient(pred):
D_dy = pred[:, 1:, :, :] - pred[:, :-1, :, :]
D_dx = pred[:, :, 1:, :] - pred[:, :, :-1, :]
return D_dx, D_dy
dx, dy = gradient(pred_disp)
dx2, dxdy = gradient(dx)
dydx, dy2 = gradient(dy)
return tf.reduce_mean(tf.abs(dx2)) + \
tf.reduce_mean(tf.abs(dxdy)) + \
tf.reduce_mean(tf.abs(dydx)) + \
tf.reduce_mean(tf.abs(dy2))
def get_reference_explain_mask(opt, downscaling):
tmp = np.array([0, 1])
ref_exp_mask = np.tile(tmp,
(opt.batch_size,
int(opt.img_height/(2**downscaling)),
int(opt.img_width/(2**downscaling)),
1))
ref_exp_mask = tf.constant(ref_exp_mask, dtype=tf.float32)
return ref_exp_mask
def get_sfm_loss_fn(opt):
def sfm_loss_fn(y, y_pred):
# TODO: Correctly format a batch that is required for this loss function
pixel_loss = 0
exp_loss = 0
smooth_loss = 0
tgt_image_all = []
src_image_stack_all = []
proj_image_stack_all = []
proj_error_stack_all = []
exp_mask_stack_all = []
for s in range(opt.num_scales):
if opt.explain_reg_weight > 0:
# Construct a reference explainability mask (i.e. all
# pixels are explainable)
ref_exp_mask = get_reference_explain_mask(opt, s)
# Scale the source and target images for computing loss at the
# according scale.
curr_tgt_image = tf.image.resize_area(tgt_image,
[int(opt.img_height/(2**s)), int(opt.img_width/(2**s))])
curr_src_image_stack = tf.image.resize_area(src_image_stack,
[int(opt.img_height/(2**s)), int(opt.img_width/(2**s))])
if opt.smooth_weight > 0:
smooth_loss += opt.smooth_weight/(2**s) * \
compute_smooth_loss(pred_disp[s])
for i in range(opt.num_source):
# Inverse warp the source image to the target image frame
curr_proj_image = projective_inverse_warp(
curr_src_image_stack[:, :, :, 3*i:3*(i+1)],
tf.squeeze(pred_depth[s], axis=3),
pred_poses[:, i, :],
intrinsics[:, s, :, :])
curr_proj_error = tf.abs(curr_proj_image - curr_tgt_image)
# Cross-entropy loss as regularization for the
# explainability prediction
if opt.explain_reg_weight > 0:
curr_exp_logits = tf.slice(pred_exp_logits[s],
[0, 0, 0, i*2],
[-1, -1, -1, 2])
exp_loss += opt.explain_reg_weight * \
compute_exp_reg_loss(curr_exp_logits,
ref_exp_mask)
curr_exp = tf.nn.softmax(curr_exp_logits)
# Photo-consistency loss weighted by explainability
if opt.explain_reg_weight > 0:
pixel_loss += tf.reduce_mean(curr_proj_error *
tf.expand_dims(curr_exp[:, :, :, 1], -1))
else:
pixel_loss += tf.reduce_mean(curr_proj_error)
# Prepare images for tensorboard summaries
if i == 0:
proj_image_stack = curr_proj_image
proj_error_stack = curr_proj_error
if opt.explain_reg_weight > 0:
exp_mask_stack = tf.expand_dims(
curr_exp[:, :, :, 1], -1)
else:
proj_image_stack = tf.concat([proj_image_stack,
curr_proj_image], axis=3)
proj_error_stack = tf.concat([proj_error_stack,
curr_proj_error], axis=3)
if opt.explain_reg_weight > 0:
exp_mask_stack = tf.concat([exp_mask_stack,
tf.expand_dims(curr_exp[:, :, :, 1], -1)], axis=3)
tgt_image_all.append(curr_tgt_image)
src_image_stack_all.append(curr_src_image_stack)
proj_image_stack_all.append(proj_image_stack)
proj_error_stack_all.append(proj_error_stack)
if opt.explain_reg_weight > 0:
exp_mask_stack_all.append(exp_mask_stack)
total_loss = pixel_loss + smooth_loss + exp_loss
return total_loss
return sfm_loss_fn
def photometric_reconstruction_loss(tgt_img, ref_imgs, intrinsics,
depth, explainability_mask, pose,
rotation_mode='euler', padding_mode='zeros'):
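# NOTE: copied from the PyTorch SfmLearner reference; the tensor ops below
# (d.size(), F.interpolate, .unsqueeze, .expand_as) are still torch-style
# and need porting before this runs under TensorFlow.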
def one_scale(d, mask):
assert(mask is None or d.size()[2:] == mask.size()[2:])
assert(pose.size(1) == len(ref_imgs))
reconstruction_loss = 0
b, _, h, w = d.size()
downscale = tgt_img.size(2)/h
tgt_img_scaled = F.interpolate(tgt_img, (h, w), mode='area')
ref_imgs_scaled = [F.interpolate(
ref_img, (h, w), mode='area') for ref_img in ref_imgs]
intrinsics_scaled = tf.concat(
(intrinsics[:, 0:2]/downscale, intrinsics[:, 2:]), dim=1)
warped_imgs = []
diff_maps = []
for i, ref_img in enumerate(ref_imgs_scaled):
current_pose = pose[:, i]
ref_img_warped, valid_points = inverse_warp(ref_img, depth[:, 0], current_pose,
intrinsics_scaled,
rotation_mode, padding_mode)
diff = (tgt_img_scaled - ref_img_warped) * \
valid_points.unsqueeze(1).float()
if explainability_mask is not None:
diff = diff * explainability_mask[:, i:i+1].expand_as(diff)
reconstruction_loss += diff.abs().mean()
assert((reconstruction_loss == reconstruction_loss).item() == 1)
warped_imgs.append(ref_img_warped[0])
diff_maps.append(diff[0])
return reconstruction_loss, warped_imgs, diff_maps
warped_results, diff_results = [], []
if type(explainability_mask) not in [tuple, list]:
explainability_mask = [explainability_mask]
if type(depth) not in [list, tuple]:
depth = [depth]
total_loss = 0
for d, mask in zip(depth, explainability_mask):
loss, warped, diff = one_scale(d, mask)
total_loss += loss
warped_results.append(warped)
diff_results.append(diff)
return total_loss, warped_results, diff_results

unsupervised/third-party/utils.py vendored Normal file
View File

@@ -0,0 +1,354 @@
"""
Utils to load and split image/video data.
"""
from __future__ import division
import math
import tensorflow as tf
def euler2mat(z, y, x):
"""Converts euler angles to rotation matrix
TODO: remove the dimension for 'N' (deprecated for converting all source
poses altogether)
Reference: https://github.com/pulkitag/pycaffe-utils/blob/master/rot_utils.py#L174
Args:
z: rotation angle along z axis (in radians) -- size = [B, N]
y: rotation angle along y axis (in radians) -- size = [B, N]
x: rotation angle along x axis (in radians) -- size = [B, N]
Returns:
Rotation matrix corresponding to the euler angles -- size = [B, N, 3, 3]
"""
B = tf.shape(z)[0]
N = 1
z = tf.clip_by_value(z, -math.pi, math.pi)
y = tf.clip_by_value(y, -math.pi, math.pi)
x = tf.clip_by_value(x, -math.pi, math.pi)
# Expand to B x N x 1 x 1
z = tf.expand_dims(tf.expand_dims(z, -1), -1)
y = tf.expand_dims(tf.expand_dims(y, -1), -1)
x = tf.expand_dims(tf.expand_dims(x, -1), -1)
zeros = tf.zeros([B, N, 1, 1])
ones = tf.ones([B, N, 1, 1])
cosz = tf.cos(z)
sinz = tf.sin(z)
rotz_1 = tf.concat([cosz, -sinz, zeros], axis=3)
rotz_2 = tf.concat([sinz, cosz, zeros], axis=3)
rotz_3 = tf.concat([zeros, zeros, ones], axis=3)
zmat = tf.concat([rotz_1, rotz_2, rotz_3], axis=2)
cosy = tf.cos(y)
siny = tf.sin(y)
roty_1 = tf.concat([cosy, zeros, siny], axis=3)
roty_2 = tf.concat([zeros, ones, zeros], axis=3)
roty_3 = tf.concat([-siny, zeros, cosy], axis=3)
ymat = tf.concat([roty_1, roty_2, roty_3], axis=2)
cosx = tf.cos(x)
sinx = tf.sin(x)
rotx_1 = tf.concat([ones, zeros, zeros], axis=3)
rotx_2 = tf.concat([zeros, cosx, -sinx], axis=3)
rotx_3 = tf.concat([zeros, sinx, cosx], axis=3)
xmat = tf.concat([rotx_1, rotx_2, rotx_3], axis=2)
rotMat = tf.matmul(tf.matmul(xmat, ymat), zmat)
return rotMat
def pose_vec2mat(vec):
"""Converts 6DoF parameters to transformation matrix
Args:
vec: 6DoF parameters in the order of tx, ty, tz, rx, ry, rz -- [B, 6]
Returns:
A transformation matrix -- [B, 4, 4]
"""
batch_size, _ = vec.get_shape().as_list()
translation = tf.slice(vec, [0, 0], [-1, 3])
translation = tf.expand_dims(translation, -1)
rx = tf.slice(vec, [0, 3], [-1, 1])
ry = tf.slice(vec, [0, 4], [-1, 1])
rz = tf.slice(vec, [0, 5], [-1, 1])
rot_mat = euler2mat(rz, ry, rx)
rot_mat = tf.squeeze(rot_mat, axis=[1])
filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4])
filler = tf.tile(filler, [batch_size, 1, 1])
transform_mat = tf.concat([rot_mat, translation], axis=2)
transform_mat = tf.concat([transform_mat, filler], axis=1)
return transform_mat
def pixel2cam(depth, pixel_coords, intrinsics, is_homogeneous=True):
"""Transforms coordinates in the pixel frame to the camera frame.
Args:
depth: [batch, height, width]
pixel_coords: homogeneous pixel coordinates [batch, 3, height, width]
intrinsics: camera intrinsics [batch, 3, 3]
is_homogeneous: return in homogeneous coordinates
Returns:
Coords in the camera frame [batch, 3 (4 if homogeneous), height, width]
"""
batch, height, width = depth.get_shape().as_list()
depth = tf.reshape(depth, [batch, 1, -1])
pixel_coords = tf.reshape(pixel_coords, [batch, 3, -1])
cam_coords = tf.matmul(tf.matrix_inverse(intrinsics), pixel_coords) * depth
if is_homogeneous:
ones = tf.ones([batch, 1, height*width])
cam_coords = tf.concat([cam_coords, ones], axis=1)
cam_coords = tf.reshape(cam_coords, [batch, -1, height, width])
return cam_coords
def cam2pixel(cam_coords, proj):
"""Transforms coordinates in a camera frame to the pixel frame.
Args:
cam_coords: [batch, 4, height, width]
proj: [batch, 4, 4]
Returns:
Pixel coordinates projected from the camera frame [batch, height, width, 2]
"""
batch, _, height, width = cam_coords.get_shape().as_list()
cam_coords = tf.reshape(cam_coords, [batch, 4, -1])
unnormalized_pixel_coords = tf.matmul(proj, cam_coords)
x_u = tf.slice(unnormalized_pixel_coords, [0, 0, 0], [-1, 1, -1])
y_u = tf.slice(unnormalized_pixel_coords, [0, 1, 0], [-1, 1, -1])
z_u = tf.slice(unnormalized_pixel_coords, [0, 2, 0], [-1, 1, -1])
x_n = x_u / (z_u + 1e-10)
y_n = y_u / (z_u + 1e-10)
pixel_coords = tf.concat([x_n, y_n], axis=1)
pixel_coords = tf.reshape(pixel_coords, [batch, 2, height, width])
return tf.transpose(pixel_coords, perm=[0, 2, 3, 1])
def meshgrid(batch, height, width, is_homogeneous=True):
"""Construct a 2D meshgrid.
Args:
batch: batch size
height: height of the grid
width: width of the grid
is_homogeneous: whether to return in homogeneous coordinates
Returns:
x,y grid coordinates [batch, 2 (3 if homogeneous), height, width]
"""
x_t = tf.matmul(tf.ones(shape=tf.stack([height, 1])),
tf.transpose(tf.expand_dims(
tf.linspace(-1.0, 1.0, width), 1), [1, 0]))
y_t = tf.matmul(tf.expand_dims(tf.linspace(-1.0, 1.0, height), 1),
tf.ones(shape=tf.stack([1, width])))
x_t = (x_t + 1.0) * 0.5 * tf.cast(width - 1, tf.float32)
y_t = (y_t + 1.0) * 0.5 * tf.cast(height - 1, tf.float32)
if is_homogeneous:
ones = tf.ones_like(x_t)
coords = tf.stack([x_t, y_t, ones], axis=0)
else:
coords = tf.stack([x_t, y_t], axis=0)
coords = tf.tile(tf.expand_dims(coords, 0), [batch, 1, 1, 1])
return coords
def projective_inverse_warp(img, depth, pose, intrinsics):
"""Inverse warp a source image to the target image plane based on projection.
Args:
img: the source image [batch, height_s, width_s, 3]
depth: depth map of the target image [batch, height_t, width_t]
pose: target to source camera transformation matrix [batch, 6], in the
order of tx, ty, tz, rx, ry, rz
intrinsics: camera intrinsics [batch, 3, 3]
Returns:
Source image inverse warped to the target image plane [batch, height_t,
width_t, 3]
"""
batch, height, width, _ = img.get_shape().as_list()
# Convert pose vector to matrix
pose = pose_vec2mat(pose)
# Construct pixel grid coordinates
pixel_coords = meshgrid(batch, height, width)
# Convert pixel coordinates to the camera frame
cam_coords = pixel2cam(depth, pixel_coords, intrinsics)
# Construct a 4x4 intrinsic matrix (TODO: can it be 3x4?)
filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4])
filler = tf.tile(filler, [batch, 1, 1])
intrinsics = tf.concat([intrinsics, tf.zeros([batch, 3, 1])], axis=2)
intrinsics = tf.concat([intrinsics, filler], axis=1)
# Get a 4x4 transformation matrix from 'target' camera frame to 'source'
# pixel frame.
proj_tgt_cam_to_src_pixel = tf.matmul(intrinsics, pose)
src_pixel_coords = cam2pixel(cam_coords, proj_tgt_cam_to_src_pixel)
output_img = bilinear_sampler(img, src_pixel_coords)
return output_img
def bilinear_sampler(imgs, coords):
"""Construct a new image by bilinear sampling from the input image.
Points falling outside the source image boundary have value 0.
Args:
imgs: source image to be sampled from [batch, height_s, width_s, channels]
coords: coordinates of source pixels to sample from [batch, height_t,
width_t, 2]. height_t/width_t correspond to the dimensions of the output
image (don't need to be the same as height_s/width_s). The two channels
correspond to x and y coordinates respectively.
Returns:
A new sampled image [batch, height_t, width_t, channels]
"""
def _repeat(x, n_repeats):
rep = tf.transpose(
tf.expand_dims(tf.ones(shape=tf.stack([
n_repeats,
])), 1), [1, 0])
rep = tf.cast(rep, 'float32')
x = tf.matmul(tf.reshape(x, (-1, 1)), rep)
return tf.reshape(x, [-1])
with tf.name_scope('image_sampling'):
coords_x, coords_y = tf.split(coords, [1, 1], axis=3)
inp_size = imgs.get_shape()
coord_size = coords.get_shape()
out_size = coords.get_shape().as_list()
out_size[3] = imgs.get_shape().as_list()[3]
coords_x = tf.cast(coords_x, 'float32')
coords_y = tf.cast(coords_y, 'float32')
x0 = tf.floor(coords_x)
x1 = x0 + 1
y0 = tf.floor(coords_y)
y1 = y0 + 1
y_max = tf.cast(tf.shape(imgs)[1] - 1, 'float32')
x_max = tf.cast(tf.shape(imgs)[2] - 1, 'float32')
zero = tf.zeros([1], dtype='float32')
x0_safe = tf.clip_by_value(x0, zero, x_max)
y0_safe = tf.clip_by_value(y0, zero, y_max)
x1_safe = tf.clip_by_value(x1, zero, x_max)
y1_safe = tf.clip_by_value(y1, zero, y_max)
# bilinear interp weights, with points outside the grid having weight 0
# wt_x0 = (x1 - coords_x) * tf.cast(tf.equal(x0, x0_safe), 'float32')
# wt_x1 = (coords_x - x0) * tf.cast(tf.equal(x1, x1_safe), 'float32')
# wt_y0 = (y1 - coords_y) * tf.cast(tf.equal(y0, y0_safe), 'float32')
# wt_y1 = (coords_y - y0) * tf.cast(tf.equal(y1, y1_safe), 'float32')
wt_x0 = x1_safe - coords_x
wt_x1 = coords_x - x0_safe
wt_y0 = y1_safe - coords_y
wt_y1 = coords_y - y0_safe
# indices in the flat image to sample from
dim2 = tf.cast(inp_size[2], 'float32')
dim1 = tf.cast(inp_size[2] * inp_size[1], 'float32')
base = tf.reshape(
_repeat(
tf.cast(tf.range(coord_size[0]), 'float32') * dim1,
coord_size[1] * coord_size[2]),
[out_size[0], out_size[1], out_size[2], 1])
base_y0 = base + y0_safe * dim2
base_y1 = base + y1_safe * dim2
idx00 = tf.reshape(x0_safe + base_y0, [-1])
idx01 = x0_safe + base_y1
idx10 = x1_safe + base_y0
idx11 = x1_safe + base_y1
# sample from imgs
imgs_flat = tf.reshape(imgs, tf.stack([-1, inp_size[3]]))
imgs_flat = tf.cast(imgs_flat, 'float32')
im00 = tf.reshape(
tf.gather(imgs_flat, tf.cast(idx00, 'int32')), out_size)
im01 = tf.reshape(
tf.gather(imgs_flat, tf.cast(idx01, 'int32')), out_size)
im10 = tf.reshape(
tf.gather(imgs_flat, tf.cast(idx10, 'int32')), out_size)
im11 = tf.reshape(
tf.gather(imgs_flat, tf.cast(idx11, 'int32')), out_size)
w00 = wt_x0 * wt_y0
w01 = wt_x0 * wt_y1
w10 = wt_x1 * wt_y0
w11 = wt_x1 * wt_y1
output = tf.add_n([
w00 * im00, w01 * im01,
w10 * im10, w11 * im11
])
return output
# Spatial transformer network bilinear sampler, taken from https://github.com/kevinzakka/spatial-transformer-network/blob/master/stn/transformer.py
def stn_bilinear_sampler(img, x, y):
"""
Performs bilinear sampling of the input images according to the
normalized coordinates provided by the sampling grid. Note that
the sampling is done identically for each channel of the input.
To test if the function works properly, output image should be
identical to input image when theta is initialized to identity
transform.
Input
-----
- img: batch of images in (B, H, W, C) layout.
- grid: x, y which is the output of affine_grid_generator.
Returns
-------
- out: interpolated images according to grids. Same size as grid.
"""
H = tf.shape(img)[1]
W = tf.shape(img)[2]
max_y = tf.cast(H - 1, 'int32')
max_x = tf.cast(W - 1, 'int32')
zero = tf.zeros([], dtype='int32')
# rescale x and y to [0, W-1/H-1]
x = tf.cast(x, 'float32')
y = tf.cast(y, 'float32')
x = 0.5 * ((x + 1.0) * tf.cast(max_x-1, 'float32'))
y = 0.5 * ((y + 1.0) * tf.cast(max_y-1, 'float32'))
# grab 4 nearest corner points for each (x_i, y_i)
x0 = tf.cast(tf.floor(x), 'int32')
x1 = x0 + 1
y0 = tf.cast(tf.floor(y), 'int32')
y1 = y0 + 1
# clip to range [0, H-1/W-1] to not violate img boundaries
x0 = tf.clip_by_value(x0, zero, max_x)
x1 = tf.clip_by_value(x1, zero, max_x)
y0 = tf.clip_by_value(y0, zero, max_y)
y1 = tf.clip_by_value(y1, zero, max_y)
# get pixel value at corner coords
Ia = get_pixel_value(img, x0, y0)
Ib = get_pixel_value(img, x0, y1)
Ic = get_pixel_value(img, x1, y0)
Id = get_pixel_value(img, x1, y1)
# recast as float for delta calculation
x0 = tf.cast(x0, 'float32')
x1 = tf.cast(x1, 'float32')
y0 = tf.cast(y0, 'float32')
y1 = tf.cast(y1, 'float32')
# calculate deltas
wa = (x1-x) * (y1-y)
wb = (x1-x) * (y-y0)
wc = (x-x0) * (y1-y)
wd = (x-x0) * (y-y0)
# add dimension for addition
wa = tf.expand_dims(wa, axis=3)
wb = tf.expand_dims(wb, axis=3)
wc = tf.expand_dims(wc, axis=3)
wd = tf.expand_dims(wd, axis=3)
# compute output
out = tf.add_n([wa*Ia, wb*Ib, wc*Ic, wd*Id])
return out
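A shape-level sketch of the warping chain above (note that pixel2cam uses the TF1-era tf.matrix_inverse; under TF2 the equivalent is tf.linalg.inv):
img = tf.random.uniform([4, 224, 224, 3])            # source frames
depth = tf.random.uniform([4, 224, 224], 1.0, 10.0)  # target-frame depth
pose = tf.zeros([4, 6])                              # tx, ty, tz, rx, ry, rz
k = tf.constant([[[112.0, 0.0, 112.0], [0.0, 112.0, 112.0], [0.0, 0.0, 1.0]]])
warped = projective_inverse_warp(img, depth, pose, tf.tile(k, [4, 1, 1]))
print(warped.shape)  # (4, 224, 224, 3)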

unsupervised/train.py Normal file
View File

@@ -0,0 +1,20 @@
"""
Trainer to learn depth information on unlabeled data (raw images/videos)
Allows pluggable depth networks for differing performance (including fast-depth)
"""
import tensorflow.keras as keras
class SFMLearner(keras.Model):
def __init__(self, depth_model, pose_model):
pass
def train_step(self, data):
pass
def make_sfm_learner_pose_net(input_shape=(224, 224, 3)):
pass
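A minimal sketch of the custom training loop this stub points toward: standard keras.Model.train_step plumbing. The warp function and the data layout here are assumptions, not this repo's final design:
import tensorflow as tf

class SFMLearnerSketch(keras.Model):
    def __init__(self, depth_model, pose_model, loss_fn, warp_fn):
        super().__init__()
        self.depth_model = depth_model
        self.pose_model = pose_model
        self.loss_fn = loss_fn      # e.g. make_combined_ssim_l1_loss()
        self.warp_fn = warp_fn      # e.g. projective_inverse_warp from third-party/utils.py

    def train_step(self, data):
        tgt, src, intrinsics = data                            # assumed batch layout
        with tf.GradientTape() as tape:
            depth = self.depth_model(tgt)                      # (B, H, W, 1)
            pose = self.pose_model(tf.concat([tgt, src], -1))  # (B, 6)
            reprojected = self.warp_fn(src, depth[..., 0], pose, intrinsics)
            loss = tf.reduce_mean(self.loss_fn(tgt, reprojected))
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        return {'loss': loss}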

unsupervised/warp.py Normal file
View File

@@ -0,0 +1,19 @@
def projective_inverse_warp(target_img, source_img, depth, pose, intrinsics):
"""
Calculate the reprojected image from the source to the target, based on the given depth, pose and intrinsics
SFM Learner inverse warp step
ps ~ K.T(t->s).Dt(pt).K^-1.pt
Idea is to map the pixel coordinates of the target image to 3d space (Dt(pt).K^-1.pt), then map these onto
the source image in pixel coordinates (K.T(t->s).{3d coord}), then using the projected coordinates we sample
the pixels in the source image (ps) to reconstruct the target image.
:param target_img: Tensor (batch, height, width, 3)
:param source_img: Tensor, same shape as target_img
:param depth: Tensor, (batch, height, width, 1)
:param pose: Tensor (batch, 6) -- tx, ty, tz, rx, ry, rz (matching third-party/utils.py)
:param intrinsics: (batch, 3, 3)
:return: The source image reprojected to the target
"""
pass
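The vendored utils already implement each factor of this equation; how they compose, step by step (names from unsupervised/third-party/utils.py):
# pt (homogeneous pixel grid)          -> meshgrid(batch, height, width)
# Dt(pt) . K^-1 . pt (camera frame)    -> pixel2cam(depth, pixel_coords, intrinsics)
# K . T(t->s) (project into source)    -> cam2pixel(cam_coords, intrinsics_4x4 @ pose_vec2mat(pose))
# sample the source at ps              -> bilinear_sampler(source_img, src_pixel_coords)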

util.py Normal file
View File

@@ -0,0 +1,21 @@
import tensorflow as tf
import tensorflow.keras as keras
def crop_and_resize(x, out_shape=(224, 224)):
shape = tf.shape(x['depth'])
img_shape = tf.shape(x['image'])
# Ensure we get a square for when we resize it later.
# For horizontal images this is basically just cropping the sides off
center_shape = tf.minimum(shape[1], tf.minimum(shape[2], tf.minimum(img_shape[1], img_shape[2])))
def layer():
return keras.Sequential([
keras.layers.experimental.preprocessing.CenterCrop(
center_shape, center_shape),
keras.layers.experimental.preprocessing.Resizing(
out_shape[0], out_shape[1], interpolation='nearest')
])
# Reshape label to 4d, can't use array unwrap as it's unsupported by tensorflow
return layer()(x['image']), layer()(tf.reshape(x['depth'], [shape[0], shape[1], shape[2], 1]))
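A sketch of what the mapper yields for one nyu_v2-style batch (shapes only):
batch = {'image': tf.zeros([8, 480, 640, 3]), 'depth': tf.zeros([8, 480, 640])}
image, depth = crop_and_resize(batch, out_shape=(224, 224))
print(image.shape, depth.shape)  # expect (8, 224, 224, 3) and (8, 224, 224, 1)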