diff --git a/dense_depth_functional.py b/dense_depth_functional.py index 416d372..183de6c 100644 --- a/dense_depth_functional.py +++ b/dense_depth_functional.py @@ -1,4 +1,3 @@ -import tensorflow as tf import tensorflow.keras as keras import tensorflow_datasets as tfds @@ -25,7 +24,8 @@ def dense_depth(size, weights=None, shape=(224, 224, 3)): densenet_output_channels = densenet.layers[-1].output.shape[-1] # Reduce the feature set (pointwise) - decoder = keras.layers.Conv2D(filters=densenet_output_channels, kernel_size=1, padding='same')(densenet.output) + decoder = keras.layers.Conv2D( + filters=densenet_output_channels, kernel_size=1, padding='same')(densenet.output) # The actual decoder decoder = dense_upsample_block( @@ -66,19 +66,19 @@ def dense_nnconv5(size, weights=None, shape=(224, 224, 3), half_features=True): # Reduce the feature set (pointwise) decoder = keras.layers.Conv2D(filters=int(densenet_output_shape[-1]), kernel_size=1, padding='same', - input_shape=densenet_output_shape, name='conv2')(densenet.output) + input_shape=densenet_output_shape, name='conv2')(densenet.output) # TODO: More intermediate layers here? # Fast Depth Decoder decoder = fd.nnconv5(decoder, densenet.get_layer('pool3_pool').output_shape[3], 1, - skip_connection=densenet.get_layer('pool3_pool').output) + skip_connection=densenet.get_layer('pool3_pool').output) decoder = fd.nnconv5(decoder, densenet.get_layer('pool2_pool').output_shape[3], 2, - skip_connection=densenet.get_layer('pool2_pool').output) + skip_connection=densenet.get_layer('pool2_pool').output) decoder = fd.nnconv5(decoder, densenet.get_layer('pool1').output_shape[3], 3, - skip_connection=densenet.get_layer('pool1').output) + skip_connection=densenet.get_layer('pool1').output) decoder = fd.nnconv5(decoder, densenet.get_layer('conv1/relu').output_shape[3], 4, - skip_connection=densenet.get_layer('conv1/relu').output) + skip_connection=densenet.get_layer('conv1/relu').output) # Final Pointwise for depth extraction decoder = keras.layers.Conv2D(1, 1, padding='same')(decoder) @@ -87,30 +87,6 @@ def dense_nnconv5(size, weights=None, shape=(224, 224, 3), half_features=True): return keras.Model(inputs=input, outputs=decoder, name="fast_dense_depth") -def load_nyu(download_dir='../nyu'): - """ - Load the nyu_v2 dataset train split. Will be downloaded to ../nyu - :return: nyu_v2 dataset builder - """ - builder = tfds.builder('nyu_depth_v2') - builder.download_and_prepare(download_dir=download_dir) - return builder \ - .as_dataset(split='train', shuffle_files=True) \ - .shuffle(buffer_size=1024) \ - .batch(8) \ - .map(lambda x: fd.crop_and_resize(x)) - - -def load_nyu_evaluate(download_dir='../nyu'): - """ - Load the nyu_v2 dataset validation split. 
Will be downloaded to ../nyu - :return: nyu_v2 dataset builder - """ - builder = tfds.builder('nyu_depth_v2') - builder.download_and_prepare(download_dir=download_dir) - return builder.as_dataset(split='validation').batch(1).map(lambda x: fd.crop_and_resize(x)) - - if __name__ == '__main__': model = dense_depth(169, 'imagenet') model.summary() diff --git a/fast_depth_functional.py b/fast_depth_functional.py index 376eee8..672f224 100644 --- a/fast_depth_functional.py +++ b/fast_depth_functional.py @@ -1,7 +1,8 @@ -import tensorflow as tf import tensorflow.keras as keras -import tensorflow_datasets as tfds -# Needed for the kitti dataset, don't delete + +from load import load_nyu_evaluate +from metric import * +from util import crop_and_resize """ Unofficial tensorflow keras implementation of FastDepth (mobilenet_nnconv5). @@ -74,59 +75,6 @@ def mobilenet_nnconv5(weights=None, shape=(224, 224, 3)): return keras.Model(inputs=input, outputs=x, name="fast_depth") -def delta1_metric(y_true, y_pred): - maxRatio = tf.maximum(y_pred / y_true, y_true / y_pred) - return tf.nn.moments(tf.cast(maxRatio < tf.convert_to_tensor(1.25), tf.float32), axes=None)[0] - - -def delta2_metric(y_true, y_pred): - maxRatio = tf.maximum(y_pred / y_true, y_true / y_pred) - return tf.nn.moments(tf.cast(maxRatio < tf.convert_to_tensor(1.25 ** 2), tf.float32), axes=None)[0] - - -def delta3_metric(y_true, y_pred): - maxRatio = tf.maximum(y_pred / y_true, y_true / y_pred) - return tf.nn.moments(tf.cast(maxRatio < tf.convert_to_tensor(1.25 ** 3), tf.float32), axes=None)[0] - - -def compile(model, optimiser=keras.optimizers.SGD(), loss=keras.losses.MeanSquaredError(), custom_metrics=None): - """ - Compile FastDepth model with relevant metrics - :param model: Model to compile - :param optimiser: Custom optimiser to use - :param loss: Loss function to use - :param include_metrics: Whether to include metrics (RMSE, MSE, a1,2,3) - """ - model.compile(optimizer=optimiser, - loss=loss, - metrics=[keras.metrics.RootMeanSquaredError(), - keras.metrics.MeanSquaredError(), - delta1_metric, - delta2_metric, - delta3_metric] if custom_metrics is None else custom_metrics) - - -def train(existing_model=None, pretrained_weights='imagenet', epochs=4, save_file=None, dataset=None): - """ - Compile, train and save (if a save file is specified) a Fast Depth model. - :param existing_model: Existing FastDepth model to train. None will create - :param pretrained_weights: Weights to use if existing_model is not specified. See keras.applications.MobileNet - weights parameter for options here. - :param epochs: Number of epochs to run for - :param save_file: File/directory to save to after training. By default the model won't be saved - :param dataset: Train dataset to use. By default will DOWNLOAD and use tensorflow nyu_v2 dataset - """ - if not existing_model: - existing_model = mobilenet_nnconv5(pretrained_weights) - compile(existing_model) - if not dataset: - dataset = load_nyu() - existing_model.fit(dataset, epochs=epochs) - if save_file: - existing_model.save(save_file) - return existing_model - - def evaluate(compiled_model, dataset=None): """ Evaluate the model using rmse, delta1/2/3 metrics @@ -150,66 +98,6 @@ def forward(model, image): return model(crop_and_resize(image)) -def load_model(file): - """ - Load previously trained FastDepth model from disk. 
Will include relevant metrics (custom objects) - :param file: File/directory to load the model from - :return: - """ - return keras.models.load_model(file, custom_objects={'delta1_metric': delta1_metric, - 'delta2_metric': delta2_metric, - 'delta3_metric': delta3_metric}) - - -def crop_and_resize(x): - shape = tf.shape(x['depth']) - img_shape = tf.shape(x['image']) - # Ensure we get a square for when we resize is later. - # For horizontal images this is basically just cropping the sides off - center_shape = min(shape[1], shape[2], img_shape[1], img_shape[2]) - - def layer(): - return keras.Sequential([ - keras.layers.experimental.preprocessing.CenterCrop( - center_shape, center_shape), - keras.layers.experimental.preprocessing.Resizing( - 224, 224, interpolation='nearest') - ]) - - # Reshape label to 4d, can't use array unwrap as it's unsupported by tensorflow - return layer()(x['image']), layer()(tf.reshape(x['depth'], [shape[0], shape[1], shape[2], 1])) - - -def load_nyu(download_dir='../nyu'): - """ - Load the nyu_v2 dataset train split. Will be downloaded to ../nyu - :return: nyu_v2 dataset builder - """ - builder = tfds.builder('nyu_depth_v2') - builder.download_and_prepare(download_dir=download_dir) - return builder \ - .as_dataset(split='train', shuffle_files=True) \ - .shuffle(buffer_size=1024) \ - .batch(8) \ - .map(lambda x: crop_and_resize(x)) - - -def load_nyu_evaluate(download_dir='../nyu'): - """ - Load the nyu_v2 dataset validation split. Will be downloaded to ../nyu - :return: nyu_v2 dataset builder - """ - builder = tfds.builder('nyu_depth_v2') - builder.download_and_prepare(download_dir=download_dir) - return builder.as_dataset(split='validation').batch(1).map(lambda x: crop_and_resize(x)) - - -def load_kitti(download_dir='../kitti'): - ds = tfds.builder('kitti_depth') - ds.download_and_prepare(download_dir=download_dir) - return ds.as_dataset(tfds.Split.TRAIN).batch(8).map(lambda x: crop_and_resize(x)) - - if __name__ == '__main__': model = mobilenet_nnconv5() model.summary() diff --git a/group_norm.py b/group_norm.py new file mode 100644 index 0000000..11ccd0c --- /dev/null +++ b/group_norm.py @@ -0,0 +1,209 @@ +# MIT License +# +# Copyright (c) 2019 Somshubra Majumdar +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# Taken from: https://github.com/titu1994/Keras-Group-Normalization/blob/master/group_norm.py + +from tensorflow.keras import backend as K +from tensorflow.keras import constraints +from tensorflow.keras import initializers +from tensorflow.keras import regularizers +from tensorflow.keras.layers import Layer, InputSpec + + +class GroupNormalization(Layer): + """Group normalization layer + Group Normalization divides the channels into groups and computes within each group + the mean and variance for normalization. GN's computation is independent of batch sizes, + and its accuracy is stable in a wide range of batch sizes + # Arguments + groups: Integer, the number of groups for Group Normalization. + axis: Integer, the axis that should be normalized + (typically the features axis). + For instance, after a `Conv2D` layer with + `data_format="channels_first"`, + set `axis=1` in `BatchNormalization`. + epsilon: Small float added to variance to avoid dividing by zero. + center: If True, add offset of `beta` to normalized tensor. + If False, `beta` is ignored. + scale: If True, multiply by `gamma`. + If False, `gamma` is not used. + When the next layer is linear (also e.g. `nn.relu`), + this can be disabled since the scaling + will be done by the next layer. + beta_initializer: Initializer for the beta weight. + gamma_initializer: Initializer for the gamma weight. + beta_regularizer: Optional regularizer for the beta weight. + gamma_regularizer: Optional regularizer for the gamma weight. + beta_constraint: Optional constraint for the beta weight. + gamma_constraint: Optional constraint for the gamma weight. + # Input shape + Arbitrary. Use the keyword argument `input_shape` + (tuple of integers, does not include the samples axis) + when using this layer as the first layer in a model. + # Output shape + Same shape as input. 
+ # References + - [Group Normalization](https://arxiv.org/abs/1803.08494) + """ + + def __init__(self, + groups=32, + axis=-1, + epsilon=1e-5, + center=True, + scale=True, + beta_initializer='zeros', + gamma_initializer='ones', + beta_regularizer=None, + gamma_regularizer=None, + beta_constraint=None, + gamma_constraint=None, + **kwargs): + super(GroupNormalization, self).__init__(**kwargs) + self.supports_masking = True + self.groups = groups + self.axis = axis + self.epsilon = epsilon + self.center = center + self.scale = scale + self.beta_initializer = initializers.get(beta_initializer) + self.gamma_initializer = initializers.get(gamma_initializer) + self.beta_regularizer = regularizers.get(beta_regularizer) + self.gamma_regularizer = regularizers.get(gamma_regularizer) + self.beta_constraint = constraints.get(beta_constraint) + self.gamma_constraint = constraints.get(gamma_constraint) + self.gamma = None + self.beta = None + + def build(self, input_shape): + dim = input_shape[self.axis] + + if dim is None: + raise ValueError('Axis ' + str(self.axis) + ' of ' + 'input tensor should have a defined dimension ' + 'but the layer received an input with shape ' + + str(input_shape) + '.') + + if dim < self.groups: + raise ValueError('Number of groups (' + str(self.groups) + ') cannot be ' + 'more than the number of channels (' + + str(dim) + ').') + + if dim % self.groups != 0: + raise ValueError('Number of groups (' + str(self.groups) + ') must be a ' + 'multiple of the number of channels (' + + str(dim) + ').') + + self.input_spec = InputSpec(ndim=len(input_shape), + axes={self.axis: dim}) + shape = (dim,) + + if self.scale: + self.gamma = self.add_weight(shape=shape, + name='gamma', + initializer=self.gamma_initializer, + regularizer=self.gamma_regularizer, + constraint=self.gamma_constraint) + if self.center: + self.beta = self.add_weight(shape=shape, + name='beta', + initializer=self.beta_initializer, + regularizer=self.beta_regularizer, + constraint=self.beta_constraint) + self.built = True + + def call(self, inputs, **kwargs): + input_shape = K.int_shape(inputs) + tensor_input_shape = K.shape(inputs) + + # Prepare broadcasting shape. + reduction_axes = list(range(len(input_shape))) + del reduction_axes[self.axis] + broadcast_shape = [1] * len(input_shape) + broadcast_shape[self.axis] = input_shape[self.axis] // self.groups + broadcast_shape.insert(1, self.groups) + + reshape_group_shape = K.shape(inputs) + group_axes = [reshape_group_shape[i] for i in range(len(input_shape))] + group_axes[self.axis] = input_shape[self.axis] // self.groups + group_axes.insert(1, self.groups) + + # reshape inputs to new group shape + group_shape = [group_axes[0], self.groups] + group_axes[2:] + group_shape = K.stack(group_shape) + inputs = K.reshape(inputs, group_shape) + + group_reduction_axes = list(range(len(group_axes))) + group_reduction_axes = group_reduction_axes[2:] + + mean = K.mean(inputs, axis=group_reduction_axes, keepdims=True) + variance = K.var(inputs, axis=group_reduction_axes, keepdims=True) + + inputs = (inputs - mean) / (K.sqrt(variance + self.epsilon)) + + # prepare broadcast shape + inputs = K.reshape(inputs, group_shape) + outputs = inputs + + # In this case we must explicitly broadcast all parameters. 
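+        # gamma/beta have shape (C,) while the tensor is still grouped as
+        # (N, G, H, W, C // G) for channels-last input, so broadcast_shape
+        # (e.g. [1, G, 1, 1, C // G]) lines them up with the grouped layout
+        # before they are applied.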
+ if self.scale: + broadcast_gamma = K.reshape(self.gamma, broadcast_shape) + outputs = outputs * broadcast_gamma + + if self.center: + broadcast_beta = K.reshape(self.beta, broadcast_shape) + outputs = outputs + broadcast_beta + + outputs = K.reshape(outputs, tensor_input_shape) + + return outputs + + def get_config(self): + config = { + 'groups': self.groups, + 'axis': self.axis, + 'epsilon': self.epsilon, + 'center': self.center, + 'scale': self.scale, + 'beta_initializer': initializers.serialize(self.beta_initializer), + 'gamma_initializer': initializers.serialize(self.gamma_initializer), + 'beta_regularizer': regularizers.serialize(self.beta_regularizer), + 'gamma_regularizer': regularizers.serialize(self.gamma_regularizer), + 'beta_constraint': constraints.serialize(self.beta_constraint), + 'gamma_constraint': constraints.serialize(self.gamma_constraint) + } + base_config = super(GroupNormalization, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def compute_output_shape(self, input_shape): + return input_shape + + +if __name__ == '__main__': + from tensorflow.keras.layers import Input + from tensorflow.keras.models import Model + + ip = Input(shape=(None, None, 4)) + # ip = Input(batch_shape=(100, None, None, 2)) + x = GroupNormalization(groups=2, axis=-1, epsilon=0.1)(ip) + model = Model(ip, x) + model.summary() diff --git a/load.py b/load.py new file mode 100644 index 0000000..4860143 --- /dev/null +++ b/load.py @@ -0,0 +1,48 @@ +import tensorflow.keras as keras +import tensorflow_datasets as tfds + +from losses import dense_depth_loss_function +from metric import * +from util import crop_and_resize + + +def load_nyu(download_dir='../nyu', out_shape=(224, 224)): + """ + Load the nyu_v2 dataset train split. Will be downloaded to ../nyu + :return: nyu_v2 dataset builder + """ + builder = tfds.builder('nyu_depth_v2') + builder.download_and_prepare(download_dir=download_dir) + return builder \ + .as_dataset(split='train', shuffle_files=True) \ + .shuffle(buffer_size=1024) \ + .batch(8) \ + .map(lambda x: crop_and_resize(x, out_shape)) + + +def load_nyu_evaluate(download_dir='../nyu', out_shape=(224, 224)): + """ + Load the nyu_v2 dataset validation split. Will be downloaded to ../nyu + :return: nyu_v2 dataset builder + """ + builder = tfds.builder('nyu_depth_v2') + builder.download_and_prepare(download_dir=download_dir) + return builder.as_dataset(split='validation').batch(1).map(lambda x: crop_and_resize(x, out_shape)) + + +def load_kitti(download_dir='../kitti', out_shape=(224, 224)): + ds = tfds.builder('kitti_depth') + ds.download_and_prepare(download_dir=download_dir) + return ds.as_dataset(tfds.Split.TRAIN).batch(8).map(lambda x: crop_and_resize(x, out_shape)) + + +def load_model(file): + """ + Load previously trained FastDepth model from disk. 
Will include relevant metrics (custom objects) + :param file: File/directory to load the model from + :return: + """ + return keras.models.load_model(file, custom_objects={'delta1_metric': delta1_metric, + 'delta2_metric': delta2, + 'delta3_metric': delta3, + 'dense_depth_loss_function': dense_depth_loss_function}) diff --git a/losses.py b/losses.py index 0e47ede..6051246 100644 --- a/losses.py +++ b/losses.py @@ -6,15 +6,15 @@ def dense_depth_loss_function(y, y_pred): Implementation of the loss from the dense depth paper https://arxiv.org/pdf/1812.11941.pdf """ # Point-wise L1 loss - l_depth = tf.reduce_mean(tf.math.abs(y_pred - y), axis=-1) + l1_depth = tf.reduce_mean(tf.math.abs(y_pred - y), axis=-1) # L1 loss over image gradients dy, dx = tf.image.image_gradients(y) dy_pred, dx_pred = tf.image.image_gradients(y_pred) - l_grad = tf.reduce_mean(tf.math.abs(dy_pred - dy) + - tf.math.abs(dx_pred - dx), axis=-1) + gradient = tf.reduce_mean(tf.math.abs(dy_pred - dy) + + tf.math.abs(dx_pred - dx), axis=-1) # Structural Similarity (SSIM) - l_ssim = (1 - tf.image.ssim(y, y_pred, 500)) / 2 + ssim = (1 - tf.image.ssim(y, y_pred, 500)) / 2 - return 0.1 * tf.reduce_mean(l_depth) + tf.reduce_mean(l_grad) + l_ssim + return 0.1 * tf.reduce_mean(l1_depth) + tf.reduce_mean(gradient) + ssim diff --git a/metric.py b/metric.py new file mode 100644 index 0000000..3a80ac1 --- /dev/null +++ b/metric.py @@ -0,0 +1,16 @@ +import tensorflow as tf + + +def delta1_metric(y_true, y_pred): + max_ratio = tf.maximum(y_pred / y_true, y_true / y_pred) + return tf.reduce_mean(tf.cast(max_ratio < tf.convert_to_tensor(1.25), tf.float32)) + + +def delta2(y_true, y_pred): + max_ratio = tf.maximum(y_pred / y_true, y_true / y_pred) + return tf.reduce_mean(tf.cast(max_ratio < tf.convert_to_tensor(1.25 ** 2), tf.float32)) + + +def delta3(y_true, y_pred): + max_ratio = tf.maximum(y_pred / y_true, y_true / y_pred) + return tf.reduce_mean(tf.cast(max_ratio < tf.convert_to_tensor(1.25 ** 3), tf.float32)) diff --git a/openvino_inference.py b/openvino_inference.py new file mode 100644 index 0000000..66efa54 --- /dev/null +++ b/openvino_inference.py @@ -0,0 +1,74 @@ +import argparse +import time + +import cv2 +import matplotlib.pyplot as plt +import numpy as np +from openvino.inference_engine import IECore + + +def parse_args() -> argparse.Namespace: + """Parse and return command line arguments""" + parser = argparse.ArgumentParser(add_help=False) + args = parser.add_argument_group('Options') + # fmt: off + args.add_argument('-h', '--help', action='help', help='Show this help message and exit.') + args.add_argument('-m', '--model', required=True, type=str, + help='Required. Path to an .xml or .onnx file with a trained model.') + args.add_argument('-i', '--input', required=True, type=str, help='Required. Path to an image file.') + args.add_argument('-d', '--device', default='CPU', type=str, + help='Optional. Specify the target device to infer on; CPU, GPU, MYRIAD, HDDL or HETERO: ' + 'is acceptable. The sample will look for a suitable plugin for device specified. 
' + 'Default value is CPU.') + # fmt: on + return parser.parse_args() + + +def sample(model_location, image_location, device='CPU'): + ie = IECore() + + net = ie.read_network(model=model_location) + input_blob = next(iter(net.input_info)) + output_blob = next(iter(net.outputs)) + + b, c, h, w = net.input_info[input_blob].input_data.shape + image = cv2.imread(image_location) + input_ratio = h / w + target_ratio = image.shape[0] / image.shape[1] + crop_axis = 0 if target_ratio > input_ratio else 1 + crop_factor = input_ratio * target_ratio / 2 + center = [image.shape[0] / 2, image.shape[1] / 2] + x1 = int(center[0] - image.shape[0] * crop_factor) if crop_axis == 0 else 0 + x2 = int(center[0] + image.shape[0] * crop_factor) if crop_axis == 0 else image.shape[0] + y1 = int(center[1] - image.shape[1] * crop_factor) if crop_axis == 1 else 0 + y2 = int(center[1] + image.shape[1] * crop_factor) if crop_axis == 1 else image.shape[1] + # Crop to target aspect ratio + image = image[x1:x2, y1:y2] + if image.shape[:-1] != (h, w): + image = cv2.resize(image, (w, h)) + + image = image.transpose((2, 0, 1)) + # For batching + image = np.expand_dims(image, axis=0) + + exec_net = ie.load_network(network=net, device_name=device) + start = time.time() + res = exec_net.infer(inputs={input_blob: image}) + print('First Inference Time Seconds: ' + str(time.time() - start)) + start = time.time() + res = exec_net.infer(inputs={input_blob: image}) + print('Second Inference Time Seconds: ' + str(time.time() - start)) + start = time.time() + res = exec_net.infer(inputs={input_blob: image}) + print('Third Inference Time Seconds: ' + str(time.time() - start)) + res = res[output_blob] + depth = res[0][0] + fig = plt.figure() + ii = plt.imshow(depth, interpolation='nearest') + fig.colorbar(ii) + plt.show() + + +if __name__ == '__main__': + parsed_args = parse_args() + sample(parsed_args.model, parsed_args.input, parsed_args.device) diff --git a/packnet_functional.py b/packnet_functional.py new file mode 100644 index 0000000..c404c07 --- /dev/null +++ b/packnet_functional.py @@ -0,0 +1,150 @@ +import tensorflow as tf +import tensorflow.keras as keras +import tensorflow.keras.layers as layers +from tensorflow import nn + +import group_norm + + +def pack_layer(): + pass + + +def residual_layer(inputs, out_channels, stride, dropout=None): + """ + Keras implementation of the Residual block (ResNet) as used in PackNet + :param inputs: + :param out_channels: + :param stride: + :param dropout: + :return: + """ + x = layers.Conv2D(out_channels, 3, padding='same', strides=stride)(inputs) + x = layers.Conv2D(out_channels, 3, padding='same')(x) + shortcut = layers.Conv2D( + out_channels, 3, padding='same', strides=stride)(inputs) + if dropout: + shortcut = keras.layers.SpatialDropout2D(dropout)(shortcut) + x = keras.layers.Concatenate()([x, shortcut]) + x = group_norm.GroupNormalization(16)(x) + return keras.layers.ELU()(x) + + +# Packnet usually expects more than one layer per block (2,2,3,3) +def residual_block(inputs, out_channels, residual_layers, stride, dropout=None): + x = inputs + for i in range(0, residual_layers): + x = residual_layer(x, out_channels, stride, dropout) + return x + + +def packnet_conv2d(inputs, out_channels, kernel_size, stride): + x = keras.layers.Conv2D(out_channels, kernel_size, + stride, padding='same')(inputs) + x = group_norm.GroupNormalization(16)(x) + return keras.layers.ELU()(x) + + +def packnet_inverse_depth(inputs, out_channels=1, min_depth=0.5): + x = layers.Conv2D(out_channels, 3, 
padding='same')(inputs) + return keras.activations.sigmoid(x) / min_depth + + +def pack_3d(inputs, kernel_size, r=2, features_3d=8): + """ + Implementatino of the 3d packing block proposed here: https://arxiv.org/abs/1905.02693 + :param inputs: + :param kernel_size: + :param r: + :param features_3d: + :return: + """ + # Data format for single image in nyu is HWC (space_to_depth uses NHWC as default) + x = nn.space_to_depth(inputs, r) + x = tf.expand_dims(x, 4) + x = keras.layers.Conv3D(features_3d, kernel_size=3, padding='same')(x) + b, h, w, c, d = x.shape + x = keras.layers.Reshape((h, w, c * d))(x) + return packnet_conv2d(x, inputs.shape[3], kernel_size, 1) + + +def unpack_3d(inputs, out_channels, kernel_size, r=2, features_3d=8): + x = packnet_conv2d(inputs, out_channels * (r ** 2) // + features_3d, kernel_size, 1) + x = tf.expand_dims(x, 4) # B x H/2 x W/2 x 4(out)/D x D + x = keras.layers.Conv3D(features_3d, kernel_size=3, padding='same')(x) + b, h, w, c, d = x.shape + x = keras.layers.Reshape([h, w, c * d])(x) + return nn.depth_to_space(x, r) + + +# TODO: Support different size packnet for scaling up/down +# TODO: Support different channel format (right now we're supporting NHWC, we should also support NCHW) +def make_packnet(shape=(224, 224, 3), skip_add=True, features_3d=4, dropout=None): + """ + Make the PackNet depth network. + :param shape: Input shape of the image + :param skip_add: Set to use add rather than concat skip connections, defaults to True + :return: + """ + + # ================ ENCODER ================= + input = keras.layers.Input(shape=shape) + x = packnet_conv2d(input, 32, 5, 1) + skip_1 = x + x = packnet_conv2d(x, 64, 7, 1) + x = pack_3d(x, 5, features_3d=features_3d) + skip_2 = x + x = residual_block(x, 64, 2, 1, dropout) + x = pack_3d(x, 3, features_3d=features_3d) + skip_3 = x + x = residual_block(x, 128, 2, 1, dropout) + x = pack_3d(x, 3, features_3d=features_3d) + skip_4 = x + x = residual_block(x, 256, 3, 1, dropout) + x = pack_3d(x, 3, features_3d=features_3d) + skip_5 = x + x = residual_block(x, 512, 3, 1, dropout) + x = pack_3d(x, 3, features_3d=features_3d) + # ================ ENCODER ================= + + # ================ DECODER ================= + # layer 7 + x = unpack_3d(x, 512, 3, features_3d=features_3d) + x = keras.layers.Add()( + [x, skip_5]) if skip_add else keras.layers.Concatenate()([x, skip_5]) + x = packnet_conv2d(x, 512, 3, 1) + # layer 8 + x = unpack_3d(x, 256, 3, features_3d=features_3d) + x = keras.layers.Add()( + [x, skip_4]) if skip_add else keras.layers.Concatenate()([x, skip_4]) + x = packnet_conv2d(x, 256, 3, 1) + layer_8 = x + # layer 9 + x = packnet_inverse_depth(x, 1) + # layer 10 + u_layer_8 = unpack_3d(layer_8, 128, 3, features_3d=features_3d) + x = keras.layers.UpSampling2D()(x) + x = keras.layers.Add()([u_layer_8, skip_3, x]) if skip_add else keras.layers.Concatenate()([u_layer_8, skip_3, x]) + x = packnet_conv2d(x, 128, 3, 1) + layer_10 = x + # layer 11 + x = packnet_inverse_depth(x, 1) + # layer 12 + u_layer_10 = unpack_3d(layer_10, 64, 3, features_3d=features_3d) + x = keras.layers.UpSampling2D()(x) + x = keras.layers.Add()([u_layer_10, skip_2, x]) if skip_add else keras.layers.Concatenate()([u_layer_10, skip_2, x]) + x = packnet_conv2d(x, 64, 3, 1) + layer_12 = x + # layer 13 + x = packnet_inverse_depth(x) + # layer 14 + u_layer_12 = unpack_3d(layer_12, 32, 3, features_3d=features_3d) + x = keras.layers.UpSampling2D()(x) + x = keras.layers.Add()([u_layer_12, skip_1, x]) if skip_add else 
keras.layers.Concatenate()([u_layer_12, skip_1, x]) + x = packnet_conv2d(x, 32, 3, 1) + # layer 15 + x = packnet_inverse_depth(x) + # ================ DECODER ================= + + return keras.Model(inputs=input, outputs=x, name="PackNet") diff --git a/packnet_tests.py b/packnet_tests.py new file mode 100644 index 0000000..eb80577 --- /dev/null +++ b/packnet_tests.py @@ -0,0 +1,37 @@ +import unittest + +import tensorflow as tf + +import packnet_functional as p + + +class PacknetTests(unittest.TestCase): + + def test_pack_3d_layer(self): + # 3d packing expects a multiple of 16 for channels due to using 16 groups in group normalisation + test_input = tf.random.normal([4, 224, 224, 32]) + y = p.pack_3d(test_input, 3, features_3d=4) + out_shape = [i for i in test_input.shape] + out_shape[1] = out_shape[1] // 2 + out_shape[2] = out_shape[2] // 2 + # TODO: Anything else we can test here for validity? + self.assertEqual(y.shape, out_shape) + + def test_unpack_3d_layer(self): + num_output_channels = 32 + test_input = tf.random.normal([4, 112, 112, 64]) + y = p.unpack_3d(test_input, num_output_channels, 3, features_3d=4) + out_shape = [i for i in test_input.shape] + out_shape[1] = out_shape[1] * 2 + out_shape[2] = out_shape[2] * 2 + out_shape[3] = num_output_channels + # TODO: Anything else we can test here for validity? + self.assertEqual(y.shape, out_shape) + + def test_packnet(self): + packnet = p.make_packnet() + self.assertIsNotNone(packnet) + + +if __name__ == '__main__': + unittest.main() diff --git a/train.py b/train.py new file mode 100644 index 0000000..474d495 --- /dev/null +++ b/train.py @@ -0,0 +1,49 @@ +""" +Collection of functions to train the various models, and use different losses +""" +import tensorflow.keras as keras + +from load import load_nyu +from metric import * + + +def compile(model, optimiser=keras.optimizers.SGD(), loss=keras.losses.MeanSquaredError(), custom_metrics=None): + """ + Compile FastDepth model with relevant metrics + :param model: Model to compile + :param optimiser: Custom optimiser to use + :param loss: Loss function to use + :param include_metrics: Whether to include metrics (RMSE, MSE, a1,2,3) + """ + model.compile(optimizer=optimiser, + loss=loss, + metrics=[keras.metrics.RootMeanSquaredError(), + keras.metrics.MeanSquaredError(), + delta1_metric, + delta2, + delta3, + keras.metrics.MeanAbsolutePercentageError(), + keras.metrics.MeanAbsoluteError()] if custom_metrics is None else custom_metrics) + + +def train(existing_model=None, pretrained_weights='imagenet', epochs=4, save_file=None, dataset=None, + checkpoint='ckpt'): + """ + Compile, train and save (if a save file is specified) a Fast Depth model. + :param existing_model: Existing FastDepth model to train. None will create + :param pretrained_weights: Weights to use if existing_model is not specified. See keras.applications.MobileNet + weights parameter for options here. + :param epochs: Number of epochs to run for + :param save_file: File/directory to save to after training. By default the model won't be saved + :param dataset: Train dataset to use. 
By default will DOWNLOAD and use tensorflow nyu_v2 dataset
+    :param checkpoint: Checkpoint to save to
+    """
+    callbacks = []
+    if checkpoint:
+        callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint, save_weights_only=True))
+    if not dataset:
+        dataset = load_nyu()
+    existing_model.fit(dataset, epochs=epochs, callbacks=callbacks)
+    if save_file:
+        existing_model.save(save_file)
+    return existing_model
diff --git a/unsupervised/loss.py b/unsupervised/loss.py
new file mode 100644
index 0000000..d62e5b1
--- /dev/null
+++ b/unsupervised/loss.py
@@ -0,0 +1,53 @@
+import tensorflow as tf
+
+
+def l1_loss(target_img, reprojected_img):
+    """
+    Calculates the l1 norm between the target and reprojected image
+
+    :param target_img: Tensor (batch, height, width, 3)
+    :param reprojected_img: Tensor, same shape as target_img
+    :return: The per-pixel l1 norm -> Tensor (batch, height, width, 1)
+    """
+    return tf.reduce_mean(tf.abs(target_img - reprojected_img), axis=3)
+
+
+def l2_loss(target_img, reprojected_img):
+    """
+    Calculates the l2 norm between the target and reprojected image
+
+    :param target_img: Tensor (batch, height, width, 3)
+    :param reprojected_img: Tensor, same shape as target_img
+    :return: The per-pixel l2 norm -> Tensor (batch, height, width, 1)
+    """
+    return tf.sqrt(tf.reduce_sum(tf.square(target_img - reprojected_img), axis=3))
+
+
+def make_combined_ssim_l1_loss(ssim_weight: float = 0.85, other_loss_fn=l1_loss):
+    """
+    Create a loss function that will calculate ssim for the two images, and use the other_loss_fn to calculate the
+    per pixel loss
+    :param ssim_weight: Weighting that should be applied to SSIM weight vs l1 difference between target and
+    reprojected image
+    :param other_loss_fn: Function to combine with the ssim
+    :return: Function to calculate the per-pixel combined ssim with other loss function
+    """
+
+    def combined_ssim_loss(target_img, reprojected_img):
+        """
+        Calculates the per-pixel photometric reconstruction loss for each source image,
+        combining it with the SSIM between the reconstructed image and the actual image.
+ + Calculates the following: + ssim_weight * SSIM(target_img, reprojected_img) + (1 - ssim_weight) * other_loss_fn(target_img - reprojected_img) + + :param target_img: Tensor with shape (batch, height, width, 3) - current image we're training on + :param reprojected_img: Tensor with same shape as target_img, Reprojected from some source image that + should be as close as possible to the target image + :return: Per-pixel loss -> Tensor with shape (batch, height, width, 1), where height and width match target_img + height and width + """ + ssim = tf.image.ssim(target_img, reprojected_img, axis=3, keepdim=True) + return ssim_weight * ssim + (1 - ssim_weight) * other_loss_fn(target_img, reprojected_img) + + return combined_ssim_loss diff --git a/unsupervised/third-party/train.py b/unsupervised/third-party/train.py new file mode 100644 index 0000000..9aed045 --- /dev/null +++ b/unsupervised/third-party/train.py @@ -0,0 +1,166 @@ +""" +Trainer to learn depth information on unlabeled data (raw images/videos) + +Allows pluggable depth networks for differing performance (including fast-depth) +""" + +import tensorflow as tf + + +def compute_smooth_loss(self, pred_disp): + def gradient(pred): + D_dy = pred[:, 1:, :, :] - pred[:, :-1, :, :] + D_dx = pred[:, :, 1:, :] - pred[:, :, :-1, :] + return D_dx, D_dy + dx, dy = gradient(pred_disp) + dx2, dxdy = gradient(dx) + dydx, dy2 = gradient(dy) + return tf.reduce_mean(tf.abs(dx2)) + \ + tf.reduce_mean(tf.abs(dxdy)) + \ + tf.reduce_mean(tf.abs(dydx)) + \ + tf.reduce_mean(tf.abs(dy2)) + + +def get_reference_explain_mask(self, downscaling): + opt = self.opt + tmp = np.array([0, 1]) + ref_exp_mask = np.tile(tmp, + (opt.batch_size, + int(opt.img_height/(2**downscaling)), + int(opt.img_width/(2**downscaling)), + 1)) + ref_exp_mask = tf.constant(ref_exp_mask, dtype=tf.float32) + return ref_exp_mask + + +def get_sfm_loss_fn(opt): + def sfm_loss_fn(y, y_pred): + # TODO: Correctly format a batch that is required for this loss function + pixel_loss = 0 + exp_loss = 0 + smooth_loss = 0 + tgt_image_all = [] + src_image_stack_all = [] + proj_image_stack_all = [] + proj_error_stack_all = [] + exp_mask_stack_all = [] + for s in range(opt.num_scales): + if opt.explain_reg_weight > 0: + # Construct a reference explainability mask (i.e. all + # pixels are explainable) + ref_exp_mask = get_reference_explain_mask(s) + # Scale the source and target images for computing loss at the + # according scale. 
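+            # NOTE: tgt_image, src_image_stack, pred_disp, pred_depth, pred_poses,
+            # pred_exp_logits and intrinsics are assumed to come from the batch
+            # formatting flagged in the TODO above; they are not derived from
+            # (y, y_pred) yet. tf.image.resize_area is TF1.x API
+            # (tf.image.resize(..., method='area') in TF2).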
+ curr_tgt_image = tf.image.resize_area(tgt_image, + [int(opt.img_height/(2**s)), int(opt.img_width/(2**s))]) + curr_src_image_stack = tf.image.resize_area(src_image_stack, + [int(opt.img_height/(2**s)), int(opt.img_width/(2**s))]) + + if opt.smooth_weight > 0: + smooth_loss += opt.smooth_weight/(2**s) * \ + compute_smooth_loss(pred_disp[s]) + + for i in range(opt.num_source): + # Inverse warp the source image to the target image frame + curr_proj_image = projective_inverse_warp( + curr_src_image_stack[:, :, :, 3*i:3*(i+1)], + tf.squeeze(pred_depth[s], axis=3), + pred_poses[:, i, :], + intrinsics[:, s, :, :]) + curr_proj_error = tf.abs(curr_proj_image - curr_tgt_image) + # Cross-entropy loss as regularization for the + # explainability prediction + if opt.explain_reg_weight > 0: + curr_exp_logits = tf.slice(pred_exp_logits[s], + [0, 0, 0, i*2], + [-1, -1, -1, 2]) + exp_loss += opt.explain_reg_weight * \ + self.compute_exp_reg_loss(curr_exp_logits, + ref_exp_mask) + curr_exp = tf.nn.softmax(curr_exp_logits) + # Photo-consistency loss weighted by explainability + if opt.explain_reg_weight > 0: + pixel_loss += tf.reduce_mean(curr_proj_error * + tf.expand_dims(curr_exp[:, :, :, 1], -1)) + else: + pixel_loss += tf.reduce_mean(curr_proj_error) + # Prepare images for tensorboard summaries + if i == 0: + proj_image_stack = curr_proj_image + proj_error_stack = curr_proj_error + if opt.explain_reg_weight > 0: + exp_mask_stack = tf.expand_dims( + curr_exp[:, :, :, 1], -1) + else: + proj_image_stack = tf.concat([proj_image_stack, + curr_proj_image], axis=3) + proj_error_stack = tf.concat([proj_error_stack, + curr_proj_error], axis=3) + if opt.explain_reg_weight > 0: + exp_mask_stack = tf.concat([exp_mask_stack, + tf.expand_dims(curr_exp[:, :, :, 1], -1)], axis=3) + tgt_image_all.append(curr_tgt_image) + src_image_stack_all.append(curr_src_image_stack) + proj_image_stack_all.append(proj_image_stack) + proj_error_stack_all.append(proj_error_stack) + if opt.explain_reg_weight > 0: + exp_mask_stack_all.append(exp_mask_stack) + total_loss = pixel_loss + smooth_loss + exp_loss + return total_loss + return sfm_loss_fn + + +def photometric_reconstruction_loss(tgt_img, ref_imgs, intrinsics, + depth, explainability_mask, pose, + rotation_mode='euler', padding_mode='zeros'): + def one_scale(d, mask): + assert(mask is None or d.size() + [2:] == mask.size()[2:]) + assert(pose.size(1) == len(ref_imgs)) + + reconstruction_loss = 0 + b, _, h, w = d.size() + downscale = tgt_img.size(2)/h + + tgt_img_scaled = F.interpolate(tgt_img, (h, w), mode='area') + ref_imgs_scaled = [F.interpolate( + ref_img, (h, w), mode='area') for ref_img in ref_imgs] + intrinsics_scaled = tf.concat( + (intrinsics[:, 0:2]/downscale, intrinsics[:, 2:]), dim=1) + + warped_imgs = [] + diff_maps = [] + + for i, ref_img in enumerate(ref_imgs_scaled): + current_pose = pose[:, i] + + ref_img_warped, valid_points = inverse_warp(ref_img, depth[:, 0], current_pose, + intrinsics_scaled, + rotation_mode, padding_mode) + diff = (tgt_img_scaled - ref_img_warped) * \ + valid_points.unsqueeze(1).float() + + if explainability_mask is not None: + diff = diff * explainability_mask[:, i:i+1].expand_as(diff) + + reconstruction_loss += diff.abs().mean() + assert((reconstruction_loss == reconstruction_loss).item() == 1) + + warped_imgs.append(ref_img_warped[0]) + diff_maps.append(diff[0]) + + return reconstruction_loss, warped_imgs, diff_maps + + warped_results, diff_results = [], [] + if type(explainability_mask) not in [tuple, list]: + explainability_mask = 
[explainability_mask] + if type(depth) not in [list, tuple]: + depth = [depth] + + total_loss = 0 + for d, mask in zip(depth, explainability_mask): + loss, warped, diff = one_scale(d, mask) + total_loss += loss + warped_results.append(warped) + diff_results.append(diff) + return total_loss, warped_results, diff_results diff --git a/unsupervised/third-party/utils.py b/unsupervised/third-party/utils.py new file mode 100644 index 0000000..22ca1cb --- /dev/null +++ b/unsupervised/third-party/utils.py @@ -0,0 +1,354 @@ +""" +Utils to load and split image/video data. +""" + +from __future__ import division +import math +import tensorflow as tf + + +def euler2mat(z, y, x): + """Converts euler angles to rotation matrix + TODO: remove the dimension for 'N' (deprecated for converting all source + poses altogether) + Reference: https://github.com/pulkitag/pycaffe-utils/blob/master/rot_utils.py#L174 + Args: + z: rotation angle along z axis (in radians) -- size = [B, N] + y: rotation angle along y axis (in radians) -- size = [B, N] + x: rotation angle along x axis (in radians) -- size = [B, N] + Returns: + Rotation matrix corresponding to the euler angles -- size = [B, N, 3, 3] + """ + B = tf.shape(z)[0] + N = 1 + z = tf.clip_by_value(z, -math.pi, math.pi) + y = tf.clip_by_value(y, -math.pi, math.pi) + x = tf.clip_by_value(x, -math.pi, math.pi) + + # Expand to B x N x 1 x 1 + z = tf.expand_dims(tf.expand_dims(z, -1), -1) + y = tf.expand_dims(tf.expand_dims(y, -1), -1) + x = tf.expand_dims(tf.expand_dims(x, -1), -1) + + zeros = tf.zeros([B, N, 1, 1]) + ones = tf.ones([B, N, 1, 1]) + + cosz = tf.cos(z) + sinz = tf.sin(z) + rotz_1 = tf.concat([cosz, -sinz, zeros], axis=3) + rotz_2 = tf.concat([sinz, cosz, zeros], axis=3) + rotz_3 = tf.concat([zeros, zeros, ones], axis=3) + zmat = tf.concat([rotz_1, rotz_2, rotz_3], axis=2) + + cosy = tf.cos(y) + siny = tf.sin(y) + roty_1 = tf.concat([cosy, zeros, siny], axis=3) + roty_2 = tf.concat([zeros, ones, zeros], axis=3) + roty_3 = tf.concat([-siny, zeros, cosy], axis=3) + ymat = tf.concat([roty_1, roty_2, roty_3], axis=2) + + cosx = tf.cos(x) + sinx = tf.sin(x) + rotx_1 = tf.concat([ones, zeros, zeros], axis=3) + rotx_2 = tf.concat([zeros, cosx, -sinx], axis=3) + rotx_3 = tf.concat([zeros, sinx, cosx], axis=3) + xmat = tf.concat([rotx_1, rotx_2, rotx_3], axis=2) + + rotMat = tf.matmul(tf.matmul(xmat, ymat), zmat) + return rotMat + + +def pose_vec2mat(vec): + """Converts 6DoF parameters to transformation matrix + Args: + vec: 6DoF parameters in the order of tx, ty, tz, rx, ry, rz -- [B, 6] + Returns: + A transformation matrix -- [B, 4, 4] + """ + batch_size, _ = vec.get_shape().as_list() + translation = tf.slice(vec, [0, 0], [-1, 3]) + translation = tf.expand_dims(translation, -1) + rx = tf.slice(vec, [0, 3], [-1, 1]) + ry = tf.slice(vec, [0, 4], [-1, 1]) + rz = tf.slice(vec, [0, 5], [-1, 1]) + rot_mat = euler2mat(rz, ry, rx) + rot_mat = tf.squeeze(rot_mat, axis=[1]) + filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4]) + filler = tf.tile(filler, [batch_size, 1, 1]) + transform_mat = tf.concat([rot_mat, translation], axis=2) + transform_mat = tf.concat([transform_mat, filler], axis=1) + return transform_mat + + +def pixel2cam(depth, pixel_coords, intrinsics, is_homogeneous=True): + """Transforms coordinates in the pixel frame to the camera frame. 
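+  (i.e. cam_coords = depth * K^-1 @ pixel_coords, with a row of ones appended
+  when is_homogeneous is set)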
+ + Args: + depth: [batch, height, width] + pixel_coords: homogeneous pixel coordinates [batch, 3, height, width] + intrinsics: camera intrinsics [batch, 3, 3] + is_homogeneous: return in homogeneous coordinates + Returns: + Coords in the camera frame [batch, 3 (4 if homogeneous), height, width] + """ + batch, height, width = depth.get_shape().as_list() + depth = tf.reshape(depth, [batch, 1, -1]) + pixel_coords = tf.reshape(pixel_coords, [batch, 3, -1]) + cam_coords = tf.matmul(tf.matrix_inverse(intrinsics), pixel_coords) * depth + if is_homogeneous: + ones = tf.ones([batch, 1, height*width]) + cam_coords = tf.concat([cam_coords, ones], axis=1) + cam_coords = tf.reshape(cam_coords, [batch, -1, height, width]) + return cam_coords + + +def cam2pixel(cam_coords, proj): + """Transforms coordinates in a camera frame to the pixel frame. + + Args: + cam_coords: [batch, 4, height, width] + proj: [batch, 4, 4] + Returns: + Pixel coordinates projected from the camera frame [batch, height, width, 2] + """ + batch, _, height, width = cam_coords.get_shape().as_list() + cam_coords = tf.reshape(cam_coords, [batch, 4, -1]) + unnormalized_pixel_coords = tf.matmul(proj, cam_coords) + x_u = tf.slice(unnormalized_pixel_coords, [0, 0, 0], [-1, 1, -1]) + y_u = tf.slice(unnormalized_pixel_coords, [0, 1, 0], [-1, 1, -1]) + z_u = tf.slice(unnormalized_pixel_coords, [0, 2, 0], [-1, 1, -1]) + x_n = x_u / (z_u + 1e-10) + y_n = y_u / (z_u + 1e-10) + pixel_coords = tf.concat([x_n, y_n], axis=1) + pixel_coords = tf.reshape(pixel_coords, [batch, 2, height, width]) + return tf.transpose(pixel_coords, perm=[0, 2, 3, 1]) + + +def meshgrid(batch, height, width, is_homogeneous=True): + """Construct a 2D meshgrid. + + Args: + batch: batch size + height: height of the grid + width: width of the grid + is_homogeneous: whether to return in homogeneous coordinates + Returns: + x,y grid coordinates [batch, 2 (3 if homogeneous), height, width] + """ + x_t = tf.matmul(tf.ones(shape=tf.stack([height, 1])), + tf.transpose(tf.expand_dims( + tf.linspace(-1.0, 1.0, width), 1), [1, 0])) + y_t = tf.matmul(tf.expand_dims(tf.linspace(-1.0, 1.0, height), 1), + tf.ones(shape=tf.stack([1, width]))) + x_t = (x_t + 1.0) * 0.5 * tf.cast(width - 1, tf.float32) + y_t = (y_t + 1.0) * 0.5 * tf.cast(height - 1, tf.float32) + if is_homogeneous: + ones = tf.ones_like(x_t) + coords = tf.stack([x_t, y_t, ones], axis=0) + else: + coords = tf.stack([x_t, y_t], axis=0) + coords = tf.tile(tf.expand_dims(coords, 0), [batch, 1, 1, 1]) + return coords + + +def projective_inverse_warp(img, depth, pose, intrinsics): + """Inverse warp a source image to the target image plane based on projection. + + Args: + img: the source image [batch, height_s, width_s, 3] + depth: depth map of the target image [batch, height_t, width_t] + pose: target to source camera transformation matrix [batch, 6], in the + order of tx, ty, tz, rx, ry, rz + intrinsics: camera intrinsics [batch, 3, 3] + Returns: + Source image inverse warped to the target image plane [batch, height_t, + width_t, 3] + """ + batch, height, width, _ = img.get_shape().as_list() + # Convert pose vector to matrix + pose = pose_vec2mat(pose) + # Construct pixel grid coordinates + pixel_coords = meshgrid(batch, height, width) + # Convert pixel coordinates to the camera frame + cam_coords = pixel2cam(depth, pixel_coords, intrinsics) + # Construct a 4x4 intrinsic matrix (TODO: can it be 3x4?) 
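+  # Pad K (3x3) with a zero column and a [0, 0, 0, 1] row so it can be composed
+  # with the 4x4 pose; the product maps homogeneous camera coordinates directly
+  # to source pixel coordinates.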
+ filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4]) + filler = tf.tile(filler, [batch, 1, 1]) + intrinsics = tf.concat([intrinsics, tf.zeros([batch, 3, 1])], axis=2) + intrinsics = tf.concat([intrinsics, filler], axis=1) + # Get a 4x4 transformation matrix from 'target' camera frame to 'source' + # pixel frame. + proj_tgt_cam_to_src_pixel = tf.matmul(intrinsics, pose) + src_pixel_coords = cam2pixel(cam_coords, proj_tgt_cam_to_src_pixel) + output_img = bilinear_sampler(img, src_pixel_coords) + return output_img + + +def bilinear_sampler(imgs, coords): + """Construct a new image by bilinear sampling from the input image. + + Points falling outside the source image boundary have value 0. + + Args: + imgs: source image to be sampled from [batch, height_s, width_s, channels] + coords: coordinates of source pixels to sample from [batch, height_t, + width_t, 2]. height_t/width_t correspond to the dimensions of the output + image (don't need to be the same as height_s/width_s). The two channels + correspond to x and y coordinates respectively. + Returns: + A new sampled image [batch, height_t, width_t, channels] + """ + def _repeat(x, n_repeats): + rep = tf.transpose( + tf.expand_dims(tf.ones(shape=tf.stack([ + n_repeats, + ])), 1), [1, 0]) + rep = tf.cast(rep, 'float32') + x = tf.matmul(tf.reshape(x, (-1, 1)), rep) + return tf.reshape(x, [-1]) + + with tf.name_scope('image_sampling'): + coords_x, coords_y = tf.split(coords, [1, 1], axis=3) + inp_size = imgs.get_shape() + coord_size = coords.get_shape() + out_size = coords.get_shape().as_list() + out_size[3] = imgs.get_shape().as_list()[3] + + coords_x = tf.cast(coords_x, 'float32') + coords_y = tf.cast(coords_y, 'float32') + + x0 = tf.floor(coords_x) + x1 = x0 + 1 + y0 = tf.floor(coords_y) + y1 = y0 + 1 + + y_max = tf.cast(tf.shape(imgs)[1] - 1, 'float32') + x_max = tf.cast(tf.shape(imgs)[2] - 1, 'float32') + zero = tf.zeros([1], dtype='float32') + + x0_safe = tf.clip_by_value(x0, zero, x_max) + y0_safe = tf.clip_by_value(y0, zero, y_max) + x1_safe = tf.clip_by_value(x1, zero, x_max) + y1_safe = tf.clip_by_value(y1, zero, y_max) + + # bilinear interp weights, with points outside the grid having weight 0 + # wt_x0 = (x1 - coords_x) * tf.cast(tf.equal(x0, x0_safe), 'float32') + # wt_x1 = (coords_x - x0) * tf.cast(tf.equal(x1, x1_safe), 'float32') + # wt_y0 = (y1 - coords_y) * tf.cast(tf.equal(y0, y0_safe), 'float32') + # wt_y1 = (coords_y - y0) * tf.cast(tf.equal(y1, y1_safe), 'float32') + + wt_x0 = x1_safe - coords_x + wt_x1 = coords_x - x0_safe + wt_y0 = y1_safe - coords_y + wt_y1 = coords_y - y0_safe + + # indices in the flat image to sample from + dim2 = tf.cast(inp_size[2], 'float32') + dim1 = tf.cast(inp_size[2] * inp_size[1], 'float32') + base = tf.reshape( + _repeat( + tf.cast(tf.range(coord_size[0]), 'float32') * dim1, + coord_size[1] * coord_size[2]), + [out_size[0], out_size[1], out_size[2], 1]) + + base_y0 = base + y0_safe * dim2 + base_y1 = base + y1_safe * dim2 + idx00 = tf.reshape(x0_safe + base_y0, [-1]) + idx01 = x0_safe + base_y1 + idx10 = x1_safe + base_y0 + idx11 = x1_safe + base_y1 + + # sample from imgs + imgs_flat = tf.reshape(imgs, tf.stack([-1, inp_size[3]])) + imgs_flat = tf.cast(imgs_flat, 'float32') + im00 = tf.reshape( + tf.gather(imgs_flat, tf.cast(idx00, 'int32')), out_size) + im01 = tf.reshape( + tf.gather(imgs_flat, tf.cast(idx01, 'int32')), out_size) + im10 = tf.reshape( + tf.gather(imgs_flat, tf.cast(idx10, 'int32')), out_size) + im11 = tf.reshape( + tf.gather(imgs_flat, tf.cast(idx11, 'int32')), 
out_size) + + w00 = wt_x0 * wt_y0 + w01 = wt_x0 * wt_y1 + w10 = wt_x1 * wt_y0 + w11 = wt_x1 * wt_y1 + + output = tf.add_n([ + w00 * im00, w01 * im01, + w10 * im10, w11 * im11 + ]) + return output + +# Spatial transformer network bilinear sampler, taken from https://github.com/kevinzakka/spatial-transformer-network/blob/master/stn/transformer.py + + +def stn_bilinear_sampler(img, x, y): + """ + Performs bilinear sampling of the input images according to the + normalized coordinates provided by the sampling grid. Note that + the sampling is done identically for each channel of the input. + To test if the function works properly, output image should be + identical to input image when theta is initialized to identity + transform. + Input + ----- + - img: batch of images in (B, H, W, C) layout. + - grid: x, y which is the output of affine_grid_generator. + Returns + ------- + - out: interpolated images according to grids. Same size as grid. + """ + H = tf.shape(img)[1] + W = tf.shape(img)[2] + max_y = tf.cast(H - 1, 'int32') + max_x = tf.cast(W - 1, 'int32') + zero = tf.zeros([], dtype='int32') + + # rescale x and y to [0, W-1/H-1] + x = tf.cast(x, 'float32') + y = tf.cast(y, 'float32') + x = 0.5 * ((x + 1.0) * tf.cast(max_x-1, 'float32')) + y = 0.5 * ((y + 1.0) * tf.cast(max_y-1, 'float32')) + + # grab 4 nearest corner points for each (x_i, y_i) + x0 = tf.cast(tf.floor(x), 'int32') + x1 = x0 + 1 + y0 = tf.cast(tf.floor(y), 'int32') + y1 = y0 + 1 + + # clip to range [0, H-1/W-1] to not violate img boundaries + x0 = tf.clip_by_value(x0, zero, max_x) + x1 = tf.clip_by_value(x1, zero, max_x) + y0 = tf.clip_by_value(y0, zero, max_y) + y1 = tf.clip_by_value(y1, zero, max_y) + + # get pixel value at corner coords + Ia = get_pixel_value(img, x0, y0) + Ib = get_pixel_value(img, x0, y1) + Ic = get_pixel_value(img, x1, y0) + Id = get_pixel_value(img, x1, y1) + + # recast as float for delta calculation + x0 = tf.cast(x0, 'float32') + x1 = tf.cast(x1, 'float32') + y0 = tf.cast(y0, 'float32') + y1 = tf.cast(y1, 'float32') + + # calculate deltas + wa = (x1-x) * (y1-y) + wb = (x1-x) * (y-y0) + wc = (x-x0) * (y1-y) + wd = (x-x0) * (y-y0) + + # add dimension for addition + wa = tf.expand_dims(wa, axis=3) + wb = tf.expand_dims(wb, axis=3) + wc = tf.expand_dims(wc, axis=3) + wd = tf.expand_dims(wd, axis=3) + + # compute output + out = tf.add_n([wa*Ia, wb*Ib, wc*Ic, wd*Id]) + + return out diff --git a/unsupervised/train.py b/unsupervised/train.py new file mode 100644 index 0000000..01f028a --- /dev/null +++ b/unsupervised/train.py @@ -0,0 +1,20 @@ +""" +Trainer to learn depth information on unlabeled data (raw images/videos) + +Allows pluggable depth networks for differing performance (including fast-depth) +""" + +import tensorflow.keras as keras + + +class SFMLearner(keras.Model): + + def __init__(depth_model, pose_model): + pass + + def train_step(self, data): + pass + + +def make_sfm_learner_pose_net(input_shape=(224, 224, 3)): + pass diff --git a/unsupervised/warp.py b/unsupervised/warp.py new file mode 100644 index 0000000..7cc19b3 --- /dev/null +++ b/unsupervised/warp.py @@ -0,0 +1,19 @@ +def projective_inverse_warp(target_img, source_img, depth, pose, intrinsics): + """ + Calculate the reprojected image from the source to the target, based on the given depth, pose and intrinsics + + SFM Learner inverse warp step + ps ~ K.T(t->s).Dt(pt).K^-1.pt + + Idea is to map the pixel coordinates of the target image to 3d space (Dt(pt).K^-1.pt), then map these onto + the source image in pixel coordinates (K.T(t->s).{3d 
coord}), then using the projected coordinates we sample + the pixels in the source image (ps) to reconstruct the target image. + + :param target_img: Tensor (batch, height, width, 3) + :param source_img: Tensor, same shape as target_img + :param depth: Tensor, (batch, height, width, 1) + :param pose: (batch, 3, 3) + :param intrinsics: (batch, 3, 3) + :return: The source image reprojected to the target + """ + pass diff --git a/util.py b/util.py new file mode 100644 index 0000000..e4bc29a --- /dev/null +++ b/util.py @@ -0,0 +1,21 @@ +import tensorflow as tf +import tensorflow.keras as keras + + +def crop_and_resize(x, out_shape=(224, 224)): + shape = tf.shape(x['depth']) + img_shape = tf.shape(x['image']) + # Ensure we get a square for when we resize it later. + # For horizontal images this is basically just cropping the sides off + center_shape = tf.minimum(shape[1], tf.minimum(shape[2], tf.minimum(img_shape[1], img_shape[2]))) + + def layer(): + return keras.Sequential([ + keras.layers.experimental.preprocessing.CenterCrop( + center_shape, center_shape), + keras.layers.experimental.preprocessing.Resizing( + out_shape[0], out_shape[1], interpolation='nearest') + ]) + + # Reshape label to 4d, can't use array unwrap as it's unsupported by tensorflow + return layer()(x['image']), layer()(tf.reshape(x['depth'], [shape[0], shape[1], shape[2], 1]))
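For reference, a minimal end-to-end sketch of how the refactored pieces above fit together. The optimiser, learning rate, epoch count, dataset path and save/checkpoint names are illustrative placeholders rather than anything this patch prescribes:

import tensorflow.keras as keras

import train
from dense_depth_functional import dense_depth
from fast_depth_functional import evaluate
from load import load_nyu, load_nyu_evaluate
from losses import dense_depth_loss_function

# Build the DenseNet-169 variant and compile it with the DenseDepth loss.
model = dense_depth(169, 'imagenet')
train.compile(model, optimiser=keras.optimizers.Adam(1e-4),
              loss=dense_depth_loss_function)

# train.train() no longer builds or compiles a model, so a compiled model must
# be passed in explicitly.
model = train.train(existing_model=model, epochs=4, save_file='dense_depth_model',
                    dataset=load_nyu('../nyu'), checkpoint='ckpt')

# Evaluate on the nyu_v2 validation split.
evaluate(model, load_nyu_evaluate('../nyu'))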