# fast-depth-tf/packnet_functional.py

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
from tensorflow import nn

import group_norm


def pack_layer():
    """Placeholder with no implementation yet; calling it returns None."""
    return None


def residual_layer(inputs, out_channels, stride, dropout=None):
    """
    Build a single residual unit (ResNet style) as used in PackNet.

    The main path is two 3x3 convolutions, the first of which is strided.
    The shortcut is a 1x1 convolution of the same input using the same stride.
    Both paths are summed, then passed through group normalisation and ELU.

    :param inputs: Tensor fed to both the main path and the shortcut
    :param out_channels: Number of filters of every convolution in the unit
    :param stride: Stride of the first main-path convolution and of the shortcut
    :param dropout: If truthy, rate of the SpatialDropout2D applied to the shortcut
    :return: Output tensor of the final ELU activation
    """
    # Main path: strided 3x3 convolution followed by a plain 3x3 convolution.
    main_path = layers.Conv2D(
        out_channels, 3, padding='same', strides=stride)(inputs)
    main_path = layers.Conv2D(out_channels, 3, padding='same')(main_path)

    # Shortcut path: 1x1 convolution with the same stride, optionally regularised.
    skip_path = layers.Conv2D(
        out_channels, 1, padding='same', strides=stride)(inputs)
    if dropout:
        skip_path = layers.SpatialDropout2D(dropout)(skip_path)

    merged = layers.Add()([main_path, skip_path])
    # 16 is passed as the first argument (presumably the number of groups).
    normalised = group_norm.GroupNormalization(16)(merged)
    return layers.ELU()(normalised)


# Packnet usually expects more than one layer per block (2,2,3,3)
def residual_block(inputs, out_channels, residual_layers, stride, dropout=None):
    """
    Chain ``residual_layers`` residual units into one block.

    Only the first unit is built with ``stride``; every following unit uses a
    stride of 1. The block therefore applies the requested stride once instead
    of once per unit (which would compound to ``stride ** residual_layers``).
    For ``stride == 1`` the result is the same either way.

    :param inputs: Tensor fed to the first residual unit
    :param out_channels: Number of filters used by every unit in the block
    :param residual_layers: Number of residual units to chain (0 returns ``inputs``)
    :param stride: Stride of the first unit
    :param dropout: Forwarded unchanged to every residual unit
    :return: Output tensor of the last residual unit
    """
    x = inputs
    for index in range(residual_layers):
        # Stride only on the first unit; later units keep the resolution.
        layer_stride = stride if index == 0 else 1
        x = residual_layer(x, out_channels, layer_stride, dropout)
    return x


def packnet_conv2d(inputs, out_channels, kernel_size, stride):
    """
    Basic PackNet convolution: Conv2D -> group normalisation -> ELU.

    :param inputs: Tensor to convolve
    :param out_channels: Number of convolution filters
    :param kernel_size: Kernel size of the convolution
    :param stride: Stride of the convolution
    :return: Output tensor of the ELU activation
    """
    conv = keras.layers.Conv2D(
        filters=out_channels,
        kernel_size=kernel_size,
        strides=stride,
        padding='same',
    )
    # 16 is passed as the first argument (presumably the number of groups).
    norm = group_norm.GroupNormalization(16)
    activation = keras.layers.ELU()
    return activation(norm(conv(inputs)))


def packnet_inverse_depth(inputs, out_channels=1, min_depth=0.5):
    """
    Inverse depth head: 3x3 convolution, sigmoid, then division by ``min_depth``.

    :param inputs: Feature tensor to predict from
    :param out_channels: Number of filters of the convolution
    :param min_depth: Divisor applied to the sigmoid output, so values lie in
        (0, 1 / min_depth)
    :return: Scaled sigmoid output tensor
    """
    logits = layers.Conv2D(out_channels, 3, padding='same')(inputs)
    squashed = keras.activations.sigmoid(logits)
    return squashed / min_depth


def pack_3d(inputs, kernel_size, r=2, features_3d=8):
    """
    3D packing block as proposed here: https://arxiv.org/abs/1905.02693

    Steps: space-to-depth by ``r``, add a trailing singleton axis, apply a
    Conv3D with kernel size 3, fold the Conv3D features back into the channel
    axis, then apply a packnet_conv2d that restores the input channel count.

    :param inputs: Tensor inputs (the channel count is read from axis 3)
    :param kernel_size: Kernel size of the final 2D convolution (the Conv3D
        kernel size is fixed at 3)
    :param r: Packing factor (block size given to space_to_depth)
    :param features_3d: Number of Conv3D filters
    :return: Output of the final packnet_conv2d with ``inputs.shape[3]`` channels
    """
    # space_to_depth uses NHWC as its default data format.
    packed = nn.space_to_depth(inputs, r)
    packed = tf.expand_dims(packed, 4)
    volume = keras.layers.Conv3D(
        features_3d, kernel_size=3, padding='same')(packed)

    # Merge the two trailing axes into a single channel axis.
    _, height, width, channels, depth = volume.shape
    folded = keras.layers.Reshape((height, width, channels * depth))(volume)

    in_channels = inputs.shape[3]
    return packnet_conv2d(folded, in_channels, kernel_size, 1)


def unpack_3d(inputs, out_channels, kernel_size, r=2, features_3d=8):
    """
    3D unpacking block, the decoder counterpart of pack_3d.

    Steps: packnet_conv2d to ``out_channels * r**2 // features_3d`` channels,
    add a trailing singleton axis, apply a Conv3D with kernel size 3, fold the
    Conv3D features back into the channel axis, then depth-to-space by ``r``.

    :param inputs: Tensor inputs
    :param out_channels: Target channel count; this is the channel count of the
        result when ``out_channels * r**2`` is divisible by ``features_3d``
    :param kernel_size: Kernel size of the initial 2D convolution (the Conv3D
        kernel size is fixed at 3)
    :param r: Unpacking factor (block size given to depth_to_space)
    :param features_3d: Number of Conv3D filters
    :return: Output tensor of depth_to_space
    """
    conv_channels = out_channels * (r ** 2) // features_3d
    features = packnet_conv2d(inputs, conv_channels, kernel_size, 1)

    # Trailing singleton axis so Conv3D can be applied to the feature map.
    features = tf.expand_dims(features, 4)
    volume = keras.layers.Conv3D(
        features_3d, kernel_size=3, padding='same')(features)

    # Merge the two trailing axes into a single channel axis.
    _, height, width, channels, depth = volume.shape
    folded = keras.layers.Reshape([height, width, channels * depth])(volume)
    return nn.depth_to_space(folded, r)


# TODO: Support different channel format (right now we're supporting NHWC, we should also support NCHW)
def make_packnet(shape=(224, 224, 3), skip_add=False, features_3d=8, dropout=None, small=False):
    """
    Make the PackNet depth network.

    :param shape: Input shape of the image (without the batch axis)
    :param skip_add: Set True to use additive rather than concatenating skip
        connections, defaults to False
    :param features_3d: Number of Conv3D filters in the packing/unpacking blocks
    :param dropout: Dropout rate forwarded to the residual blocks for
        regularisation; None (the default) builds the model without dropout.
        Useful in training only
    :param small: Set True to not include the middle-most layer and to start
        with 32 instead of 64 channels. Per the original author's note this
        reduces params from ~128M -> ~34M; further reductions can be achieved
        by using additive skip connections and fewer 3d features (down to a
        minimum of ~10M)
    :return: Packnet Keras Model
    """

    def merge(tensors):
        # Combine decoder features with skip connections: element-wise sum
        # when skip_add is set, channel concatenation otherwise.
        merge_layer = keras.layers.Add() if skip_add else keras.layers.Concatenate()
        return merge_layer(tensors)

    # ================ ENCODER =================
    image_input = keras.layers.Input(shape=shape)
    initial_conv_channels = 32 if small else 64
    x = packnet_conv2d(image_input, initial_conv_channels, 5, 1)
    skip_1 = x
    x = packnet_conv2d(x, 64, 7, 1)
    x = pack_3d(x, 5, features_3d=features_3d)
    skip_2 = x
    x = residual_block(x, 64, 2, 1, dropout)
    x = pack_3d(x, 3, features_3d=features_3d)
    skip_3 = x
    x = residual_block(x, 128, 2, 1, dropout)
    x = pack_3d(x, 3, features_3d=features_3d)
    skip_4 = x
    x = residual_block(x, 256, 3, 1, dropout)
    x = pack_3d(x, 3, features_3d=features_3d)
    skip_5 = x
    if not small:
        x = residual_block(x, 512, 3, 1, dropout)
        x = pack_3d(x, 3, features_3d=features_3d)
    # ================ ENCODER =================

    # ================ DECODER =================
    # Addition requires that we halve the outputs so there is a matching number of channels
    divide_factor = 2 if skip_add else 1
    # layer 7
    if not small:
        x = unpack_3d(x, 512 // divide_factor, 3, features_3d=features_3d)
        x = merge([x, skip_5])
        x = packnet_conv2d(x, 512, 3, 1)
    # layer 8
    x = unpack_3d(x, 256 // divide_factor, 3, features_3d=features_3d)
    x = merge([x, skip_4])
    x = packnet_conv2d(x, 256, 3, 1)
    layer_8 = x
    # layer 9
    x = packnet_inverse_depth(x, 1)
    # layer 10
    # NOTE(review): with skip_add the single-channel inverse depth map is part
    # of the Add below (and in layers 12 and 14); this appears to rely on
    # broadcasting over the channel axis - confirm this is intended.
    u_layer_8 = unpack_3d(layer_8, 128 // divide_factor, 3, features_3d=features_3d)
    x = keras.layers.UpSampling2D()(x)
    x = merge([u_layer_8, skip_3, x])
    x = packnet_conv2d(x, 128, 3, 1)
    layer_10 = x
    # layer 11
    x = packnet_inverse_depth(x, 1)
    # layer 12
    u_layer_10 = unpack_3d(layer_10, 64, 3, features_3d=features_3d)
    x = keras.layers.UpSampling2D()(x)
    x = merge([u_layer_10, skip_2, x])
    x = packnet_conv2d(x, 64, 3, 1)
    layer_12 = x
    # layer 13
    x = packnet_inverse_depth(x)
    # layer 14
    u_layer_12 = unpack_3d(layer_12, initial_conv_channels, 3, features_3d=features_3d)
    x = keras.layers.UpSampling2D()(x)
    x = merge([u_layer_12, skip_1, x])
    x = packnet_conv2d(x, initial_conv_channels, 3, 1)
    # layer 15
    x = packnet_inverse_depth(x)
    # ================ DECODER =================

    return keras.Model(inputs=image_input, outputs=x, name="PackNet")


if __name__ == '__main__':
    # Build each configuration in turn and print its summary:
    #   1. the defaults - per the original note, the implementation used by
    #      the packnet sfm paper
    #   2. the very small version of packnet
    for config in ({}, {'small': True, 'features_3d': 1, 'skip_add': True}):
        make_packnet(**config).summary()