Merge branch 'unsupervised' into 'main'

PackNet

See merge request vato007/fast-depth-tf!4
Michael Pivato
2021-07-31 01:31:25 +00:00
16 changed files with 1232 additions and 152 deletions

View File

@@ -1,4 +1,3 @@
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_datasets as tfds
@@ -25,7 +24,8 @@ def dense_depth(size, weights=None, shape=(224, 224, 3)):
densenet_output_channels = densenet.layers[-1].output.shape[-1]
# Reduce the feature set (pointwise)
-decoder = keras.layers.Conv2D(filters=densenet_output_channels, kernel_size=1, padding='same')(densenet.output)
+decoder = keras.layers.Conv2D(
+    filters=densenet_output_channels, kernel_size=1, padding='same')(densenet.output)
# The actual decoder
decoder = dense_upsample_block(
@@ -66,19 +66,19 @@ def dense_nnconv5(size, weights=None, shape=(224, 224, 3), half_features=True):
# Reduce the feature set (pointwise)
decoder = keras.layers.Conv2D(filters=int(densenet_output_shape[-1]), kernel_size=1, padding='same',
input_shape=densenet_output_shape, name='conv2')(densenet.output)
# TODO: More intermediate layers here?
# Fast Depth Decoder
decoder = fd.nnconv5(decoder, densenet.get_layer('pool3_pool').output_shape[3], 1,
skip_connection=densenet.get_layer('pool3_pool').output)
decoder = fd.nnconv5(decoder, densenet.get_layer('pool2_pool').output_shape[3], 2,
skip_connection=densenet.get_layer('pool2_pool').output)
decoder = fd.nnconv5(decoder, densenet.get_layer('pool1').output_shape[3], 3,
skip_connection=densenet.get_layer('pool1').output)
decoder = fd.nnconv5(decoder, densenet.get_layer('conv1/relu').output_shape[3], 4,
skip_connection=densenet.get_layer('conv1/relu').output)
# Final Pointwise for depth extraction
decoder = keras.layers.Conv2D(1, 1, padding='same')(decoder)
@@ -87,30 +87,6 @@ def dense_nnconv5(size, weights=None, shape=(224, 224, 3), half_features=True):
return keras.Model(inputs=input, outputs=decoder, name="fast_dense_depth")
def load_nyu(download_dir='../nyu'):
"""
Load the nyu_v2 dataset train split. Will be downloaded to ../nyu
:return: nyu_v2 dataset builder
"""
builder = tfds.builder('nyu_depth_v2')
builder.download_and_prepare(download_dir=download_dir)
return builder \
.as_dataset(split='train', shuffle_files=True) \
.shuffle(buffer_size=1024) \
.batch(8) \
.map(lambda x: fd.crop_and_resize(x))
def load_nyu_evaluate(download_dir='../nyu'):
"""
Load the nyu_v2 dataset validation split. Will be downloaded to ../nyu
:return: nyu_v2 dataset builder
"""
builder = tfds.builder('nyu_depth_v2')
builder.download_and_prepare(download_dir=download_dir)
return builder.as_dataset(split='validation').batch(1).map(lambda x: fd.crop_and_resize(x))
if __name__ == '__main__':
model = dense_depth(169, 'imagenet')
model.summary()
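For orientation, a minimal usage sketch (not part of this diff) wiring the pieces above together; the optimiser and loss here are placeholders, not the repo's defaults:
model = dense_depth(169, weights='imagenet')
model.compile(optimizer='adam', loss='mse')
model.fit(load_nyu(), epochs=1)          # downloads nyu_depth_v2 on first run
model.evaluate(load_nyu_evaluate())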

View File

@@ -1,7 +1,8 @@
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_datasets as tfds
# Needed for the kitti dataset, don't delete
from load import load_nyu_evaluate
from metric import *
from util import crop_and_resize
"""
Unofficial tensorflow keras implementation of FastDepth (mobilenet_nnconv5).
@@ -74,59 +75,6 @@ def mobilenet_nnconv5(weights=None, shape=(224, 224, 3)):
return keras.Model(inputs=input, outputs=x, name="fast_depth")
def delta1_metric(y_true, y_pred):
maxRatio = tf.maximum(y_pred / y_true, y_true / y_pred)
return tf.nn.moments(tf.cast(maxRatio < tf.convert_to_tensor(1.25), tf.float32), axes=None)[0]
def delta2_metric(y_true, y_pred):
maxRatio = tf.maximum(y_pred / y_true, y_true / y_pred)
return tf.nn.moments(tf.cast(maxRatio < tf.convert_to_tensor(1.25 ** 2), tf.float32), axes=None)[0]
def delta3_metric(y_true, y_pred):
maxRatio = tf.maximum(y_pred / y_true, y_true / y_pred)
return tf.nn.moments(tf.cast(maxRatio < tf.convert_to_tensor(1.25 ** 3), tf.float32), axes=None)[0]
def compile(model, optimiser=keras.optimizers.SGD(), loss=keras.losses.MeanSquaredError(), custom_metrics=None):
"""
Compile FastDepth model with relevant metrics
:param model: Model to compile
:param optimiser: Custom optimiser to use
:param loss: Loss function to use
:param custom_metrics: Optional metrics to use instead of the defaults (RMSE, MSE, delta1/2/3)
"""
model.compile(optimizer=optimiser,
loss=loss,
metrics=[keras.metrics.RootMeanSquaredError(),
keras.metrics.MeanSquaredError(),
delta1_metric,
delta2_metric,
delta3_metric] if custom_metrics is None else custom_metrics)
def train(existing_model=None, pretrained_weights='imagenet', epochs=4, save_file=None, dataset=None):
"""
Compile, train and save (if a save file is specified) a Fast Depth model.
:param existing_model: Existing FastDepth model to train. None will create
:param pretrained_weights: Weights to use if existing_model is not specified. See keras.applications.MobileNet
weights parameter for options here.
:param epochs: Number of epochs to run for
:param save_file: File/directory to save to after training. By default the model won't be saved
:param dataset: Train dataset to use. By default will DOWNLOAD and use tensorflow nyu_v2 dataset
"""
if not existing_model:
existing_model = mobilenet_nnconv5(pretrained_weights)
compile(existing_model)
if not dataset:
dataset = load_nyu()
existing_model.fit(dataset, epochs=epochs)
if save_file:
existing_model.save(save_file)
return existing_model
def evaluate(compiled_model, dataset=None):
"""
Evaluate the model using rmse, delta1/2/3 metrics
@@ -150,66 +98,6 @@ def forward(model, image):
return model(crop_and_resize(image))
def load_model(file):
"""
Load previously trained FastDepth model from disk. Will include relevant metrics (custom objects)
:param file: File/directory to load the model from
:return:
"""
return keras.models.load_model(file, custom_objects={'delta1_metric': delta1_metric,
'delta2_metric': delta2_metric,
'delta3_metric': delta3_metric})
def crop_and_resize(x):
shape = tf.shape(x['depth'])
img_shape = tf.shape(x['image'])
# Ensure we get a square for when we resize is later.
# For horizontal images this is basically just cropping the sides off
center_shape = min(shape[1], shape[2], img_shape[1], img_shape[2])
def layer():
return keras.Sequential([
keras.layers.experimental.preprocessing.CenterCrop(
center_shape, center_shape),
keras.layers.experimental.preprocessing.Resizing(
224, 224, interpolation='nearest')
])
# Reshape label to 4d, can't use array unwrap as it's unsupported by tensorflow
return layer()(x['image']), layer()(tf.reshape(x['depth'], [shape[0], shape[1], shape[2], 1]))
def load_nyu(download_dir='../nyu'):
"""
Load the nyu_v2 dataset train split. Will be downloaded to ../nyu
:return: nyu_v2 dataset builder
"""
builder = tfds.builder('nyu_depth_v2')
builder.download_and_prepare(download_dir=download_dir)
return builder \
.as_dataset(split='train', shuffle_files=True) \
.shuffle(buffer_size=1024) \
.batch(8) \
.map(lambda x: crop_and_resize(x))
def load_nyu_evaluate(download_dir='../nyu'):
"""
Load the nyu_v2 dataset validation split. Will be downloaded to ../nyu
:return: nyu_v2 dataset builder
"""
builder = tfds.builder('nyu_depth_v2')
builder.download_and_prepare(download_dir=download_dir)
return builder.as_dataset(split='validation').batch(1).map(lambda x: crop_and_resize(x))
def load_kitti(download_dir='../kitti'):
ds = tfds.builder('kitti_depth')
ds.download_and_prepare(download_dir=download_dir)
return ds.as_dataset(tfds.Split.TRAIN).batch(8).map(lambda x: crop_and_resize(x))
if __name__ == '__main__':
model = mobilenet_nnconv5()
model.summary()
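A hedged sketch of the entry points this file exposed before the split into load.py/train.py (names as defined above):
model = train(epochs=1, save_file='fast_depth_saved')  # builds mobilenet_nnconv5, downloads nyu_v2
evaluate(model, load_nyu_evaluate())
restored = load_model('fast_depth_saved')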

group_norm.py Normal file
View File

@@ -0,0 +1,209 @@
# MIT License
#
# Copyright (c) 2019 Somshubra Majumdar
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Taken from: https://github.com/titu1994/Keras-Group-Normalization/blob/master/group_norm.py
from tensorflow.keras import backend as K
from tensorflow.keras import constraints
from tensorflow.keras import initializers
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Layer, InputSpec
class GroupNormalization(Layer):
"""Group normalization layer
Group Normalization divides the channels into groups and computes within each group
the mean and variance for normalization. GN's computation is independent of batch sizes,
and its accuracy is stable in a wide range of batch sizes
# Arguments
groups: Integer, the number of groups for Group Normalization.
axis: Integer, the axis that should be normalized
(typically the features axis).
For instance, after a `Conv2D` layer with
`data_format="channels_first"`,
set `axis=1` in `BatchNormalization`.
epsilon: Small float added to variance to avoid dividing by zero.
center: If True, add offset of `beta` to normalized tensor.
If False, `beta` is ignored.
scale: If True, multiply by `gamma`.
If False, `gamma` is not used.
When the next layer is linear (also e.g. `nn.relu`),
this can be disabled since the scaling
will be done by the next layer.
beta_initializer: Initializer for the beta weight.
gamma_initializer: Initializer for the gamma weight.
beta_regularizer: Optional regularizer for the beta weight.
gamma_regularizer: Optional regularizer for the gamma weight.
beta_constraint: Optional constraint for the beta weight.
gamma_constraint: Optional constraint for the gamma weight.
# Input shape
Arbitrary. Use the keyword argument `input_shape`
(tuple of integers, does not include the samples axis)
when using this layer as the first layer in a model.
# Output shape
Same shape as input.
# References
- [Group Normalization](https://arxiv.org/abs/1803.08494)
"""
def __init__(self,
groups=32,
axis=-1,
epsilon=1e-5,
center=True,
scale=True,
beta_initializer='zeros',
gamma_initializer='ones',
beta_regularizer=None,
gamma_regularizer=None,
beta_constraint=None,
gamma_constraint=None,
**kwargs):
super(GroupNormalization, self).__init__(**kwargs)
self.supports_masking = True
self.groups = groups
self.axis = axis
self.epsilon = epsilon
self.center = center
self.scale = scale
self.beta_initializer = initializers.get(beta_initializer)
self.gamma_initializer = initializers.get(gamma_initializer)
self.beta_regularizer = regularizers.get(beta_regularizer)
self.gamma_regularizer = regularizers.get(gamma_regularizer)
self.beta_constraint = constraints.get(beta_constraint)
self.gamma_constraint = constraints.get(gamma_constraint)
self.gamma = None
self.beta = None
def build(self, input_shape):
dim = input_shape[self.axis]
if dim is None:
raise ValueError('Axis ' + str(self.axis) + ' of '
'input tensor should have a defined dimension '
'but the layer received an input with shape ' +
str(input_shape) + '.')
if dim < self.groups:
raise ValueError('Number of groups (' + str(self.groups) + ') cannot be '
'more than the number of channels (' +
str(dim) + ').')
if dim % self.groups != 0:
raise ValueError('Number of channels (' + str(dim) + ') must be a '
'multiple of the number of groups (' +
str(self.groups) + ').')
self.input_spec = InputSpec(ndim=len(input_shape),
axes={self.axis: dim})
shape = (dim,)
if self.scale:
self.gamma = self.add_weight(shape=shape,
name='gamma',
initializer=self.gamma_initializer,
regularizer=self.gamma_regularizer,
constraint=self.gamma_constraint)
if self.center:
self.beta = self.add_weight(shape=shape,
name='beta',
initializer=self.beta_initializer,
regularizer=self.beta_regularizer,
constraint=self.beta_constraint)
self.built = True
def call(self, inputs, **kwargs):
input_shape = K.int_shape(inputs)
tensor_input_shape = K.shape(inputs)
# Prepare broadcasting shape.
reduction_axes = list(range(len(input_shape)))
del reduction_axes[self.axis]
broadcast_shape = [1] * len(input_shape)
broadcast_shape[self.axis] = input_shape[self.axis] // self.groups
broadcast_shape.insert(1, self.groups)
reshape_group_shape = K.shape(inputs)
group_axes = [reshape_group_shape[i] for i in range(len(input_shape))]
group_axes[self.axis] = input_shape[self.axis] // self.groups
group_axes.insert(1, self.groups)
# reshape inputs to new group shape
group_shape = [group_axes[0], self.groups] + group_axes[2:]
group_shape = K.stack(group_shape)
inputs = K.reshape(inputs, group_shape)
group_reduction_axes = list(range(len(group_axes)))
group_reduction_axes = group_reduction_axes[2:]
mean = K.mean(inputs, axis=group_reduction_axes, keepdims=True)
variance = K.var(inputs, axis=group_reduction_axes, keepdims=True)
inputs = (inputs - mean) / (K.sqrt(variance + self.epsilon))
# prepare broadcast shape
inputs = K.reshape(inputs, group_shape)
outputs = inputs
# In this case we must explicitly broadcast all parameters.
if self.scale:
broadcast_gamma = K.reshape(self.gamma, broadcast_shape)
outputs = outputs * broadcast_gamma
if self.center:
broadcast_beta = K.reshape(self.beta, broadcast_shape)
outputs = outputs + broadcast_beta
outputs = K.reshape(outputs, tensor_input_shape)
return outputs
def get_config(self):
config = {
'groups': self.groups,
'axis': self.axis,
'epsilon': self.epsilon,
'center': self.center,
'scale': self.scale,
'beta_initializer': initializers.serialize(self.beta_initializer),
'gamma_initializer': initializers.serialize(self.gamma_initializer),
'beta_regularizer': regularizers.serialize(self.beta_regularizer),
'gamma_regularizer': regularizers.serialize(self.gamma_regularizer),
'beta_constraint': constraints.serialize(self.beta_constraint),
'gamma_constraint': constraints.serialize(self.gamma_constraint)
}
base_config = super(GroupNormalization, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def compute_output_shape(self, input_shape):
return input_shape
if __name__ == '__main__':
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
ip = Input(shape=(None, None, 4))
# ip = Input(batch_shape=(100, None, None, 2))
x = GroupNormalization(groups=2, axis=-1, epsilon=0.1)(ip)
model = Model(ip, x)
model.summary()

load.py Normal file
View File

@@ -0,0 +1,48 @@
import tensorflow.keras as keras
import tensorflow_datasets as tfds
from losses import dense_depth_loss_function
from metric import *
from util import crop_and_resize
def load_nyu(download_dir='../nyu', out_shape=(224, 224)):
"""
Load the nyu_v2 dataset train split. Will be downloaded to ../nyu
:return: batched nyu_v2 train split as a tf.data.Dataset
"""
builder = tfds.builder('nyu_depth_v2')
builder.download_and_prepare(download_dir=download_dir)
return builder \
.as_dataset(split='train', shuffle_files=True) \
.shuffle(buffer_size=1024) \
.batch(8) \
.map(lambda x: crop_and_resize(x, out_shape))
def load_nyu_evaluate(download_dir='../nyu', out_shape=(224, 224)):
"""
Load the nyu_v2 dataset validation split. Will be downloaded to ../nyu
:return: batched nyu_v2 validation split as a tf.data.Dataset
"""
builder = tfds.builder('nyu_depth_v2')
builder.download_and_prepare(download_dir=download_dir)
return builder.as_dataset(split='validation').batch(1).map(lambda x: crop_and_resize(x, out_shape))
def load_kitti(download_dir='../kitti', out_shape=(224, 224)):
ds = tfds.builder('kitti_depth')
ds.download_and_prepare(download_dir=download_dir)
return ds.as_dataset(tfds.Split.TRAIN).batch(8).map(lambda x: crop_and_resize(x, out_shape))
def load_model(file):
"""
Load previously trained FastDepth model from disk. Will include relevant metrics (custom objects)
:param file: File/directory to load the model from
:return:
"""
return keras.models.load_model(file, custom_objects={'delta1_metric': delta1_metric,
'delta2': delta2,  # keys must match the metric function names recorded at save time
'delta3': delta3,
'dense_depth_loss_function': dense_depth_loss_function})
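A short sketch of the intended round trip, assuming a model previously compiled with the metrics in metric.py and saved via model.save():
model = load_model('packnet_saved')
model.evaluate(load_nyu_evaluate())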

View File

@@ -6,15 +6,15 @@ def dense_depth_loss_function(y, y_pred):
Implementation of the loss from the dense depth paper https://arxiv.org/pdf/1812.11941.pdf
"""
# Point-wise L1 loss
-l_depth = tf.reduce_mean(tf.math.abs(y_pred - y), axis=-1)
+l1_depth = tf.reduce_mean(tf.math.abs(y_pred - y), axis=-1)
# L1 loss over image gradients
dy, dx = tf.image.image_gradients(y)
dy_pred, dx_pred = tf.image.image_gradients(y_pred)
-l_grad = tf.reduce_mean(tf.math.abs(dy_pred - dy) +
-                        tf.math.abs(dx_pred - dx), axis=-1)
+gradient = tf.reduce_mean(tf.math.abs(dy_pred - dy) +
+                          tf.math.abs(dx_pred - dx), axis=-1)
# Structural Similarity (SSIM)
-l_ssim = (1 - tf.image.ssim(y, y_pred, 500)) / 2
+ssim = (1 - tf.image.ssim(y, y_pred, 500)) / 2
-return 0.1 * tf.reduce_mean(l_depth) + tf.reduce_mean(l_grad) + l_ssim
+return 0.1 * tf.reduce_mean(l1_depth) + tf.reduce_mean(gradient) + ssim
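A quick sanity check for this loss, sketched under the assumption that the function is imported from this file; feeding identical inputs should drive every term to zero:
y = tf.random.uniform([2, 224, 224, 1], maxval=10.0)
# l1 and gradient terms vanish, and SSIM(y, y) == 1 so the ssim term is 0
print(float(tf.reduce_mean(dense_depth_loss_function(y, y))))  # ~0.0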

metric.py Normal file
View File

@@ -0,0 +1,16 @@
import tensorflow as tf
def delta1_metric(y_true, y_pred):
max_ratio = tf.maximum(y_pred / y_true, y_true / y_pred)
return tf.reduce_mean(tf.cast(max_ratio < tf.convert_to_tensor(1.25), tf.float32))
def delta2(y_true, y_pred):
max_ratio = tf.maximum(y_pred / y_true, y_true / y_pred)
return tf.reduce_mean(tf.cast(max_ratio < tf.convert_to_tensor(1.25 ** 2), tf.float32))
def delta3(y_true, y_pred):
max_ratio = tf.maximum(y_pred / y_true, y_true / y_pred)
return tf.reduce_mean(tf.cast(max_ratio < tf.convert_to_tensor(1.25 ** 3), tf.float32))
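A worked example of the threshold metrics (delta_n is the fraction of pixels whose max ratio is under 1.25 ** n), a sketch using the functions above:
y_true = tf.constant([1.0, 2.0, 4.0, 8.0])
y_pred = tf.constant([1.1, 2.0, 6.0, 2.0])
# max ratios: 1.1, 1.0, 1.5, 4.0; thresholds: 1.25, 1.5625, 1.953125
print(float(delta1_metric(y_true, y_pred)))  # 0.5
print(float(delta2(y_true, y_pred)))         # 0.75
print(float(delta3(y_true, y_pred)))         # 0.75 (the 4.0 ratio fails all three)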

openvino_inference.py Normal file
View File

@@ -0,0 +1,74 @@
import argparse
import time
import cv2
import matplotlib.pyplot as plt
import numpy as np
from openvino.inference_engine import IECore
def parse_args() -> argparse.Namespace:
"""Parse and return command line arguments"""
parser = argparse.ArgumentParser(add_help=False)
args = parser.add_argument_group('Options')
# fmt: off
args.add_argument('-h', '--help', action='help', help='Show this help message and exit.')
args.add_argument('-m', '--model', required=True, type=str,
help='Required. Path to an .xml or .onnx file with a trained model.')
args.add_argument('-i', '--input', required=True, type=str, help='Required. Path to an image file.')
args.add_argument('-d', '--device', default='CPU', type=str,
help='Optional. Specify the target device to infer on; CPU, GPU, MYRIAD, HDDL or HETERO: '
'is acceptable. The sample will look for a suitable plugin for device specified. '
'Default value is CPU.')
# fmt: on
return parser.parse_args()
def sample(model_location, image_location, device='CPU'):
ie = IECore()
net = ie.read_network(model=model_location)
input_blob = next(iter(net.input_info))
output_blob = next(iter(net.outputs))
b, c, h, w = net.input_info[input_blob].input_data.shape
image = cv2.imread(image_location)
input_ratio = h / w
target_ratio = image.shape[0] / image.shape[1]
crop_axis = 0 if target_ratio > input_ratio else 1
center = [image.shape[0] / 2, image.shape[1] / 2]
# Half-extent of the crop along the cropped axis, so the crop matches input_ratio
half_h = image.shape[1] * input_ratio / 2  # when cropping the height (crop_axis == 0)
half_w = image.shape[0] / input_ratio / 2  # when cropping the width (crop_axis == 1)
x1 = int(center[0] - half_h) if crop_axis == 0 else 0
x2 = int(center[0] + half_h) if crop_axis == 0 else image.shape[0]
y1 = int(center[1] - half_w) if crop_axis == 1 else 0
y2 = int(center[1] + half_w) if crop_axis == 1 else image.shape[1]
# Crop to target aspect ratio
image = image[x1:x2, y1:y2]
if image.shape[:-1] != (h, w):
image = cv2.resize(image, (w, h))
image = image.transpose((2, 0, 1))
# For batching
image = np.expand_dims(image, axis=0)
exec_net = ie.load_network(network=net, device_name=device)
start = time.time()
res = exec_net.infer(inputs={input_blob: image})
print('First Inference Time Seconds: ' + str(time.time() - start))
start = time.time()
res = exec_net.infer(inputs={input_blob: image})
print('Second Inference Time Seconds: ' + str(time.time() - start))
start = time.time()
res = exec_net.infer(inputs={input_blob: image})
print('Third Inference Time Seconds: ' + str(time.time() - start))
res = res[output_blob]
depth = res[0][0]
fig = plt.figure()
ii = plt.imshow(depth, interpolation='nearest')
fig.colorbar(ii)
plt.show()
if __name__ == '__main__':
parsed_args = parse_args()
sample(parsed_args.model, parsed_args.input, parsed_args.device)
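The three timed calls above expose warm-up: the first inference typically includes one-off setup, so the later numbers are the representative ones. Equivalent to the CLI path, a direct call (paths are placeholders):
sample('fast_depth.xml', 'room.jpg', device='CPU')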

packnet_functional.py Normal file
View File

@@ -0,0 +1,150 @@
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
from tensorflow import nn
import group_norm
def pack_layer():
pass
def residual_layer(inputs, out_channels, stride, dropout=None):
"""
Keras implementation of the Residual block (ResNet) as used in PackNet
:param inputs:
:param out_channels:
:param stride:
:param dropout:
:return:
"""
x = layers.Conv2D(out_channels, 3, padding='same', strides=stride)(inputs)
x = layers.Conv2D(out_channels, 3, padding='same')(x)
shortcut = layers.Conv2D(
out_channels, 3, padding='same', strides=stride)(inputs)
if dropout:
shortcut = keras.layers.SpatialDropout2D(dropout)(shortcut)
x = keras.layers.Concatenate()([x, shortcut])
x = group_norm.GroupNormalization(16)(x)
return keras.layers.ELU()(x)
# Packnet usually expects more than one layer per block (2,2,3,3)
def residual_block(inputs, out_channels, residual_layers, stride, dropout=None):
x = inputs
for i in range(0, residual_layers):
x = residual_layer(x, out_channels, stride, dropout)
return x
def packnet_conv2d(inputs, out_channels, kernel_size, stride):
x = keras.layers.Conv2D(out_channels, kernel_size,
stride, padding='same')(inputs)
x = group_norm.GroupNormalization(16)(x)
return keras.layers.ELU()(x)
def packnet_inverse_depth(inputs, out_channels=1, min_depth=0.5):
x = layers.Conv2D(out_channels, 3, padding='same')(inputs)
return keras.activations.sigmoid(x) / min_depth
def pack_3d(inputs, kernel_size, r=2, features_3d=8):
"""
Implementation of the 3d packing block proposed here: https://arxiv.org/abs/1905.02693
:param inputs:
:param kernel_size:
:param r:
:param features_3d:
:return:
"""
# Data format for single image in nyu is HWC (space_to_depth uses NHWC as default)
x = nn.space_to_depth(inputs, r)
x = tf.expand_dims(x, 4)
x = keras.layers.Conv3D(features_3d, kernel_size=3, padding='same')(x)
b, h, w, c, d = x.shape
x = keras.layers.Reshape((h, w, c * d))(x)
return packnet_conv2d(x, inputs.shape[3], kernel_size, 1)
def unpack_3d(inputs, out_channels, kernel_size, r=2, features_3d=8):
x = packnet_conv2d(inputs, out_channels * (r ** 2) //
features_3d, kernel_size, 1)
x = tf.expand_dims(x, 4) # B x H/2 x W/2 x 4(out)/D x D
x = keras.layers.Conv3D(features_3d, kernel_size=3, padding='same')(x)
b, h, w, c, d = x.shape
x = keras.layers.Reshape([h, w, c * d])(x)
return nn.depth_to_space(x, r)
# TODO: Support different size packnet for scaling up/down
# TODO: Support different channel format (right now we're supporting NHWC, we should also support NCHW)
def make_packnet(shape=(224, 224, 3), skip_add=True, features_3d=4, dropout=None):
"""
Make the PackNet depth network.
:param shape: Input shape of the image
:param skip_add: Set to use add rather than concat skip connections, defaults to True
:return:
"""
# ================ ENCODER =================
input = keras.layers.Input(shape=shape)
x = packnet_conv2d(input, 32, 5, 1)
skip_1 = x
x = packnet_conv2d(x, 64, 7, 1)
x = pack_3d(x, 5, features_3d=features_3d)
skip_2 = x
x = residual_block(x, 64, 2, 1, dropout)
x = pack_3d(x, 3, features_3d=features_3d)
skip_3 = x
x = residual_block(x, 128, 2, 1, dropout)
x = pack_3d(x, 3, features_3d=features_3d)
skip_4 = x
x = residual_block(x, 256, 3, 1, dropout)
x = pack_3d(x, 3, features_3d=features_3d)
skip_5 = x
x = residual_block(x, 512, 3, 1, dropout)
x = pack_3d(x, 3, features_3d=features_3d)
# ================ ENCODER =================
# ================ DECODER =================
# layer 7
x = unpack_3d(x, 512, 3, features_3d=features_3d)
x = keras.layers.Add()(
[x, skip_5]) if skip_add else keras.layers.Concatenate()([x, skip_5])
x = packnet_conv2d(x, 512, 3, 1)
# layer 8
x = unpack_3d(x, 256, 3, features_3d=features_3d)
x = keras.layers.Add()(
[x, skip_4]) if skip_add else keras.layers.Concatenate()([x, skip_4])
x = packnet_conv2d(x, 256, 3, 1)
layer_8 = x
# layer 9
x = packnet_inverse_depth(x, 1)
# layer 10
u_layer_8 = unpack_3d(layer_8, 128, 3, features_3d=features_3d)
x = keras.layers.UpSampling2D()(x)
x = keras.layers.Add()([u_layer_8, skip_3, x]) if skip_add else keras.layers.Concatenate()([u_layer_8, skip_3, x])
x = packnet_conv2d(x, 128, 3, 1)
layer_10 = x
# layer 11
x = packnet_inverse_depth(x, 1)
# layer 12
u_layer_10 = unpack_3d(layer_10, 64, 3, features_3d=features_3d)
x = keras.layers.UpSampling2D()(x)
x = keras.layers.Add()([u_layer_10, skip_2, x]) if skip_add else keras.layers.Concatenate()([u_layer_10, skip_2, x])
x = packnet_conv2d(x, 64, 3, 1)
layer_12 = x
# layer 13
x = packnet_inverse_depth(x)
# layer 14
u_layer_12 = unpack_3d(layer_12, 32, 3, features_3d=features_3d)
x = keras.layers.UpSampling2D()(x)
x = keras.layers.Add()([u_layer_12, skip_1, x]) if skip_add else keras.layers.Concatenate()([u_layer_12, skip_1, x])
x = packnet_conv2d(x, 32, 3, 1)
# layer 15
x = packnet_inverse_depth(x)
# ================ DECODER =================
return keras.Model(inputs=input, outputs=x, name="PackNet")
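A shape-level smoke test in the spirit of packnet_tests.py, sketched here rather than part of the diff:
net = make_packnet(shape=(224, 224, 3))
out = net(tf.random.normal([1, 224, 224, 3]))
print(out.shape)  # expect (1, 224, 224, 1): sigmoid/min_depth inverse depth at input resolution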

packnet_tests.py Normal file
View File

@@ -0,0 +1,37 @@
import unittest
import tensorflow as tf
import packnet_functional as p
class PacknetTests(unittest.TestCase):
def test_pack_3d_layer(self):
# 3d packing expects a multiple of 16 for channels due to using 16 groups in group normalisation
test_input = tf.random.normal([4, 224, 224, 32])
y = p.pack_3d(test_input, 3, features_3d=4)
out_shape = [i for i in test_input.shape]
out_shape[1] = out_shape[1] // 2
out_shape[2] = out_shape[2] // 2
# TODO: Anything else we can test here for validity?
self.assertEqual(y.shape, out_shape)
def test_unpack_3d_layer(self):
num_output_channels = 32
test_input = tf.random.normal([4, 112, 112, 64])
y = p.unpack_3d(test_input, num_output_channels, 3, features_3d=4)
out_shape = [i for i in test_input.shape]
out_shape[1] = out_shape[1] * 2
out_shape[2] = out_shape[2] * 2
out_shape[3] = num_output_channels
# TODO: Anything else we can test here for validity?
self.assertEqual(y.shape, out_shape)
def test_packnet(self):
packnet = p.make_packnet()
self.assertIsNotNone(packnet)
if __name__ == '__main__':
unittest.main()

train.py Normal file
View File

@@ -0,0 +1,49 @@
"""
Collection of functions to train the various models, and use different losses
"""
import tensorflow.keras as keras
from load import load_nyu
from metric import *
def compile(model, optimiser=keras.optimizers.SGD(), loss=keras.losses.MeanSquaredError(), custom_metrics=None):
"""
Compile FastDepth model with relevant metrics
:param model: Model to compile
:param optimiser: Custom optimiser to use
:param loss: Loss function to use
:param custom_metrics: Optional metrics to use instead of the defaults (RMSE, MSE, delta1/2/3, MAPE, MAE)
"""
model.compile(optimizer=optimiser,
loss=loss,
metrics=[keras.metrics.RootMeanSquaredError(),
keras.metrics.MeanSquaredError(),
delta1_metric,
delta2,
delta3,
keras.metrics.MeanAbsolutePercentageError(),
keras.metrics.MeanAbsoluteError()] if custom_metrics is None else custom_metrics)
def train(existing_model=None, pretrained_weights='imagenet', epochs=4, save_file=None, dataset=None,
checkpoint='ckpt'):
"""
Compile, train and save (if a save file is specified) a Fast Depth model.
:param existing_model: Compiled model to train (see compile()). Required: unlike the old
fast_depth.train(), this function does not build or compile a model itself
:param pretrained_weights: Currently unused here; kept from the old fast_depth.train() signature
:param epochs: Number of epochs to run for
:param save_file: File/directory to save to after training. By default the model won't be saved
:param dataset: Train dataset to use. By default the TensorFlow nyu_v2 dataset is downloaded and used
:param checkpoint: Checkpoint to save to
"""
callbacks = []
if checkpoint:
callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint, save_weights_only=True))
if not existing_model:
raise ValueError('existing_model is required and must already be compiled (see compile())')
if not dataset:
dataset = load_nyu()
existing_model.fit(dataset, epochs=epochs, callbacks=callbacks)
if save_file:
existing_model.save(save_file)
return existing_model
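One way the new modules compose, sketched under the assumption that packnet_functional and losses are importable as below:
from packnet_functional import make_packnet
from losses import dense_depth_loss_function
model = make_packnet()
compile(model, optimiser=keras.optimizers.Adam(), loss=dense_depth_loss_function)
train(existing_model=model, epochs=4, save_file='packnet_saved')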

unsupervised/loss.py Normal file
View File

@@ -0,0 +1,53 @@
import tensorflow as tf
def l1_loss(target_img, reprojected_img):
"""
Calculates the l1 norm between the target and reprojected image
:param target_img: Tensor (batch, height, width, 3)
:param reprojected_img: Tensor, same shape as target_img
:return: The per-pixel l1 norm -> Tensor (batch, height, width)
"""
return tf.reduce_mean(tf.abs(target_img - reprojected_img), axis=3)
def l2_loss(target_img, reprojected_img):
"""
Calculates the l2 norm between the target and reprojected image
:param target_img: Tensor (batch, height, width, 3)
:param reprojected_img: Tensor, same shape as target_img
:return: The per-pixel l2 norm -> Tensor (batch, height, width)
"""
# Note: ** binds tighter than intended, x ** 2 ** (1 / 2) is x ** sqrt(2)
return tf.sqrt(tf.reduce_mean(tf.square(target_img - reprojected_img), axis=3))
def make_combined_ssim_l1_loss(ssim_weight: float = 0.85, other_loss_fn=l1_loss):
"""
Create a loss function that will calculate ssim for the two images, and use the other_loss_fn to calculate the
per pixel loss
:param ssim_weight: Weighting that should be applied to SSIM weight vs l1 difference between target and
reprojected image
:param other_loss_fn: Function to combine with the ssim
:return: Function to calculate the per-pixel combined ssim with other loss function
"""
def combined_ssim_loss(target_img, reprojected_img):
"""
Calculates the per-pixel photometric reconstruction loss for each source image,
combined this with the SSIM between the reconstructed image and the actual image.
Calculates the following:
ssim_weight * SSIM(target_img, reprojected_img) + (1 - ssim_weight) * other_loss_fn(target_img - reprojected_img)
:param target_img: Tensor with shape (batch, height, width, 3) - current image we're training on
:param reprojected_img: Tensor with same shape as target_img, Reprojected from some source image that
should be as close as possible to the target image
:return: Per-pixel loss -> Tensor (batch, height, width), where height and width match target_img
height and width
"""
# tf.image.ssim has no axis/keepdim arguments; it returns one SSIM value per image.
# Use the DSSIM form (1 - ssim) / 2 and broadcast it over the per-pixel loss
# (max_val=1.0 assumes float images in [0, 1]).
ssim = (1 - tf.image.ssim(target_img, reprojected_img, max_val=1.0)) / 2
ssim = tf.reshape(ssim, [-1, 1, 1])
return ssim_weight * ssim + (1 - ssim_weight) * other_loss_fn(target_img, reprojected_img)
return combined_ssim_loss
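Intended usage, sketched; assumes target_img/reprojected_img are float image batches in [0, 1], as required by max_val=1.0 above:
loss_fn = make_combined_ssim_l1_loss(ssim_weight=0.85)
per_pixel = loss_fn(target_img, reprojected_img)  # (batch, height, width)
loss = tf.reduce_mean(per_pixel)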

unsupervised/third-party/train.py vendored Normal file
View File

@@ -0,0 +1,166 @@
"""
Trainer to learn depth information on unlabeled data (raw images/videos)
Allows pluggable depth networks for differing performance (including fast-depth)
"""
import numpy as np
import tensorflow as tf
def compute_smooth_loss(pred_disp):
def gradient(pred):
D_dy = pred[:, 1:, :, :] - pred[:, :-1, :, :]
D_dx = pred[:, :, 1:, :] - pred[:, :, :-1, :]
return D_dx, D_dy
dx, dy = gradient(pred_disp)
dx2, dxdy = gradient(dx)
dydx, dy2 = gradient(dy)
return tf.reduce_mean(tf.abs(dx2)) + \
tf.reduce_mean(tf.abs(dxdy)) + \
tf.reduce_mean(tf.abs(dydx)) + \
tf.reduce_mean(tf.abs(dy2))
def get_reference_explain_mask(opt, downscaling):
tmp = np.array([0, 1])
ref_exp_mask = np.tile(tmp,
(opt.batch_size,
int(opt.img_height/(2**downscaling)),
int(opt.img_width/(2**downscaling)),
1))
ref_exp_mask = tf.constant(ref_exp_mask, dtype=tf.float32)
return ref_exp_mask
def get_sfm_loss_fn(opt):
def sfm_loss_fn(y, y_pred):
# TODO: Correctly format a batch that is required for this loss function
pixel_loss = 0
exp_loss = 0
smooth_loss = 0
tgt_image_all = []
src_image_stack_all = []
proj_image_stack_all = []
proj_error_stack_all = []
exp_mask_stack_all = []
for s in range(opt.num_scales):
if opt.explain_reg_weight > 0:
# Construct a reference explainability mask (i.e. all
# pixels are explainable)
ref_exp_mask = get_reference_explain_mask(opt, s)
# Scale the source and target images for computing loss at the
# according scale.
curr_tgt_image = tf.image.resize_area(tgt_image,
[int(opt.img_height/(2**s)), int(opt.img_width/(2**s))])
curr_src_image_stack = tf.image.resize_area(src_image_stack,
[int(opt.img_height/(2**s)), int(opt.img_width/(2**s))])
if opt.smooth_weight > 0:
smooth_loss += opt.smooth_weight/(2**s) * \
compute_smooth_loss(pred_disp[s])
for i in range(opt.num_source):
# Inverse warp the source image to the target image frame
curr_proj_image = projective_inverse_warp(
curr_src_image_stack[:, :, :, 3*i:3*(i+1)],
tf.squeeze(pred_depth[s], axis=3),
pred_poses[:, i, :],
intrinsics[:, s, :, :])
curr_proj_error = tf.abs(curr_proj_image - curr_tgt_image)
# Cross-entropy loss as regularization for the
# explainability prediction
if opt.explain_reg_weight > 0:
curr_exp_logits = tf.slice(pred_exp_logits[s],
[0, 0, 0, i*2],
[-1, -1, -1, 2])
exp_loss += opt.explain_reg_weight * \
compute_exp_reg_loss(curr_exp_logits,
ref_exp_mask)
curr_exp = tf.nn.softmax(curr_exp_logits)
# Photo-consistency loss weighted by explainability
if opt.explain_reg_weight > 0:
pixel_loss += tf.reduce_mean(curr_proj_error *
tf.expand_dims(curr_exp[:, :, :, 1], -1))
else:
pixel_loss += tf.reduce_mean(curr_proj_error)
# Prepare images for tensorboard summaries
if i == 0:
proj_image_stack = curr_proj_image
proj_error_stack = curr_proj_error
if opt.explain_reg_weight > 0:
exp_mask_stack = tf.expand_dims(
curr_exp[:, :, :, 1], -1)
else:
proj_image_stack = tf.concat([proj_image_stack,
curr_proj_image], axis=3)
proj_error_stack = tf.concat([proj_error_stack,
curr_proj_error], axis=3)
if opt.explain_reg_weight > 0:
exp_mask_stack = tf.concat([exp_mask_stack,
tf.expand_dims(curr_exp[:, :, :, 1], -1)], axis=3)
tgt_image_all.append(curr_tgt_image)
src_image_stack_all.append(curr_src_image_stack)
proj_image_stack_all.append(proj_image_stack)
proj_error_stack_all.append(proj_error_stack)
if opt.explain_reg_weight > 0:
exp_mask_stack_all.append(exp_mask_stack)
total_loss = pixel_loss + smooth_loss + exp_loss
return total_loss
return sfm_loss_fn
def photometric_reconstruction_loss(tgt_img, ref_imgs, intrinsics,
depth, explainability_mask, pose,
rotation_mode='euler', padding_mode='zeros'):
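# NOTE: copied from the PyTorch SfmLearner reference; the tensor ops below
# (d.size(), F.interpolate, .unsqueeze, .expand_as) are still torch-style
# and need porting before this runs under TensorFlow.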
def one_scale(d, mask):
assert(mask is None or d.size()[2:] == mask.size()[2:])
assert(pose.size(1) == len(ref_imgs))
reconstruction_loss = 0
b, _, h, w = d.size()
downscale = tgt_img.size(2)/h
tgt_img_scaled = F.interpolate(tgt_img, (h, w), mode='area')
ref_imgs_scaled = [F.interpolate(
ref_img, (h, w), mode='area') for ref_img in ref_imgs]
intrinsics_scaled = tf.concat(
(intrinsics[:, 0:2]/downscale, intrinsics[:, 2:]), dim=1)
warped_imgs = []
diff_maps = []
for i, ref_img in enumerate(ref_imgs_scaled):
current_pose = pose[:, i]
ref_img_warped, valid_points = inverse_warp(ref_img, depth[:, 0], current_pose,
intrinsics_scaled,
rotation_mode, padding_mode)
diff = (tgt_img_scaled - ref_img_warped) * \
valid_points.unsqueeze(1).float()
if explainability_mask is not None:
diff = diff * explainability_mask[:, i:i+1].expand_as(diff)
reconstruction_loss += diff.abs().mean()
assert((reconstruction_loss == reconstruction_loss).item() == 1)
warped_imgs.append(ref_img_warped[0])
diff_maps.append(diff[0])
return reconstruction_loss, warped_imgs, diff_maps
warped_results, diff_results = [], []
if type(explainability_mask) not in [tuple, list]:
explainability_mask = [explainability_mask]
if type(depth) not in [list, tuple]:
depth = [depth]
total_loss = 0
for d, mask in zip(depth, explainability_mask):
loss, warped, diff = one_scale(d, mask)
total_loss += loss
warped_results.append(warped)
diff_results.append(diff)
return total_loss, warped_results, diff_results

unsupervised/third-party/utils.py vendored Normal file
View File

@@ -0,0 +1,354 @@
"""
Utils to load and split image/video data.
"""
from __future__ import division
import math
import tensorflow as tf
def euler2mat(z, y, x):
"""Converts euler angles to rotation matrix
TODO: remove the dimension for 'N' (deprecated for converting all source
poses altogether)
Reference: https://github.com/pulkitag/pycaffe-utils/blob/master/rot_utils.py#L174
Args:
z: rotation angle along z axis (in radians) -- size = [B, N]
y: rotation angle along y axis (in radians) -- size = [B, N]
x: rotation angle along x axis (in radians) -- size = [B, N]
Returns:
Rotation matrix corresponding to the euler angles -- size = [B, N, 3, 3]
"""
B = tf.shape(z)[0]
N = 1
z = tf.clip_by_value(z, -math.pi, math.pi)
y = tf.clip_by_value(y, -math.pi, math.pi)
x = tf.clip_by_value(x, -math.pi, math.pi)
# Expand to B x N x 1 x 1
z = tf.expand_dims(tf.expand_dims(z, -1), -1)
y = tf.expand_dims(tf.expand_dims(y, -1), -1)
x = tf.expand_dims(tf.expand_dims(x, -1), -1)
zeros = tf.zeros([B, N, 1, 1])
ones = tf.ones([B, N, 1, 1])
cosz = tf.cos(z)
sinz = tf.sin(z)
rotz_1 = tf.concat([cosz, -sinz, zeros], axis=3)
rotz_2 = tf.concat([sinz, cosz, zeros], axis=3)
rotz_3 = tf.concat([zeros, zeros, ones], axis=3)
zmat = tf.concat([rotz_1, rotz_2, rotz_3], axis=2)
cosy = tf.cos(y)
siny = tf.sin(y)
roty_1 = tf.concat([cosy, zeros, siny], axis=3)
roty_2 = tf.concat([zeros, ones, zeros], axis=3)
roty_3 = tf.concat([-siny, zeros, cosy], axis=3)
ymat = tf.concat([roty_1, roty_2, roty_3], axis=2)
cosx = tf.cos(x)
sinx = tf.sin(x)
rotx_1 = tf.concat([ones, zeros, zeros], axis=3)
rotx_2 = tf.concat([zeros, cosx, -sinx], axis=3)
rotx_3 = tf.concat([zeros, sinx, cosx], axis=3)
xmat = tf.concat([rotx_1, rotx_2, rotx_3], axis=2)
rotMat = tf.matmul(tf.matmul(xmat, ymat), zmat)
return rotMat
def pose_vec2mat(vec):
"""Converts 6DoF parameters to transformation matrix
Args:
vec: 6DoF parameters in the order of tx, ty, tz, rx, ry, rz -- [B, 6]
Returns:
A transformation matrix -- [B, 4, 4]
"""
batch_size, _ = vec.get_shape().as_list()
translation = tf.slice(vec, [0, 0], [-1, 3])
translation = tf.expand_dims(translation, -1)
rx = tf.slice(vec, [0, 3], [-1, 1])
ry = tf.slice(vec, [0, 4], [-1, 1])
rz = tf.slice(vec, [0, 5], [-1, 1])
rot_mat = euler2mat(rz, ry, rx)
rot_mat = tf.squeeze(rot_mat, axis=[1])
filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4])
filler = tf.tile(filler, [batch_size, 1, 1])
transform_mat = tf.concat([rot_mat, translation], axis=2)
transform_mat = tf.concat([transform_mat, filler], axis=1)
return transform_mat
def pixel2cam(depth, pixel_coords, intrinsics, is_homogeneous=True):
"""Transforms coordinates in the pixel frame to the camera frame.
Args:
depth: [batch, height, width]
pixel_coords: homogeneous pixel coordinates [batch, 3, height, width]
intrinsics: camera intrinsics [batch, 3, 3]
is_homogeneous: return in homogeneous coordinates
Returns:
Coords in the camera frame [batch, 3 (4 if homogeneous), height, width]
"""
batch, height, width = depth.get_shape().as_list()
depth = tf.reshape(depth, [batch, 1, -1])
pixel_coords = tf.reshape(pixel_coords, [batch, 3, -1])
cam_coords = tf.matmul(tf.matrix_inverse(intrinsics), pixel_coords) * depth
if is_homogeneous:
ones = tf.ones([batch, 1, height*width])
cam_coords = tf.concat([cam_coords, ones], axis=1)
cam_coords = tf.reshape(cam_coords, [batch, -1, height, width])
return cam_coords
def cam2pixel(cam_coords, proj):
"""Transforms coordinates in a camera frame to the pixel frame.
Args:
cam_coords: [batch, 4, height, width]
proj: [batch, 4, 4]
Returns:
Pixel coordinates projected from the camera frame [batch, height, width, 2]
"""
batch, _, height, width = cam_coords.get_shape().as_list()
cam_coords = tf.reshape(cam_coords, [batch, 4, -1])
unnormalized_pixel_coords = tf.matmul(proj, cam_coords)
x_u = tf.slice(unnormalized_pixel_coords, [0, 0, 0], [-1, 1, -1])
y_u = tf.slice(unnormalized_pixel_coords, [0, 1, 0], [-1, 1, -1])
z_u = tf.slice(unnormalized_pixel_coords, [0, 2, 0], [-1, 1, -1])
x_n = x_u / (z_u + 1e-10)
y_n = y_u / (z_u + 1e-10)
pixel_coords = tf.concat([x_n, y_n], axis=1)
pixel_coords = tf.reshape(pixel_coords, [batch, 2, height, width])
return tf.transpose(pixel_coords, perm=[0, 2, 3, 1])
def meshgrid(batch, height, width, is_homogeneous=True):
"""Construct a 2D meshgrid.
Args:
batch: batch size
height: height of the grid
width: width of the grid
is_homogeneous: whether to return in homogeneous coordinates
Returns:
x,y grid coordinates [batch, 2 (3 if homogeneous), height, width]
"""
x_t = tf.matmul(tf.ones(shape=tf.stack([height, 1])),
tf.transpose(tf.expand_dims(
tf.linspace(-1.0, 1.0, width), 1), [1, 0]))
y_t = tf.matmul(tf.expand_dims(tf.linspace(-1.0, 1.0, height), 1),
tf.ones(shape=tf.stack([1, width])))
x_t = (x_t + 1.0) * 0.5 * tf.cast(width - 1, tf.float32)
y_t = (y_t + 1.0) * 0.5 * tf.cast(height - 1, tf.float32)
if is_homogeneous:
ones = tf.ones_like(x_t)
coords = tf.stack([x_t, y_t, ones], axis=0)
else:
coords = tf.stack([x_t, y_t], axis=0)
coords = tf.tile(tf.expand_dims(coords, 0), [batch, 1, 1, 1])
return coords
def projective_inverse_warp(img, depth, pose, intrinsics):
"""Inverse warp a source image to the target image plane based on projection.
Args:
img: the source image [batch, height_s, width_s, 3]
depth: depth map of the target image [batch, height_t, width_t]
pose: target to source camera transformation matrix [batch, 6], in the
order of tx, ty, tz, rx, ry, rz
intrinsics: camera intrinsics [batch, 3, 3]
Returns:
Source image inverse warped to the target image plane [batch, height_t,
width_t, 3]
"""
batch, height, width, _ = img.get_shape().as_list()
# Convert pose vector to matrix
pose = pose_vec2mat(pose)
# Construct pixel grid coordinates
pixel_coords = meshgrid(batch, height, width)
# Convert pixel coordinates to the camera frame
cam_coords = pixel2cam(depth, pixel_coords, intrinsics)
# Construct a 4x4 intrinsic matrix (TODO: can it be 3x4?)
filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4])
filler = tf.tile(filler, [batch, 1, 1])
intrinsics = tf.concat([intrinsics, tf.zeros([batch, 3, 1])], axis=2)
intrinsics = tf.concat([intrinsics, filler], axis=1)
# Get a 4x4 transformation matrix from 'target' camera frame to 'source'
# pixel frame.
proj_tgt_cam_to_src_pixel = tf.matmul(intrinsics, pose)
src_pixel_coords = cam2pixel(cam_coords, proj_tgt_cam_to_src_pixel)
output_img = bilinear_sampler(img, src_pixel_coords)
return output_img
def bilinear_sampler(imgs, coords):
"""Construct a new image by bilinear sampling from the input image.
Points falling outside the source image boundary have value 0.
Args:
imgs: source image to be sampled from [batch, height_s, width_s, channels]
coords: coordinates of source pixels to sample from [batch, height_t,
width_t, 2]. height_t/width_t correspond to the dimensions of the output
image (don't need to be the same as height_s/width_s). The two channels
correspond to x and y coordinates respectively.
Returns:
A new sampled image [batch, height_t, width_t, channels]
"""
def _repeat(x, n_repeats):
rep = tf.transpose(
tf.expand_dims(tf.ones(shape=tf.stack([
n_repeats,
])), 1), [1, 0])
rep = tf.cast(rep, 'float32')
x = tf.matmul(tf.reshape(x, (-1, 1)), rep)
return tf.reshape(x, [-1])
with tf.name_scope('image_sampling'):
coords_x, coords_y = tf.split(coords, [1, 1], axis=3)
inp_size = imgs.get_shape()
coord_size = coords.get_shape()
out_size = coords.get_shape().as_list()
out_size[3] = imgs.get_shape().as_list()[3]
coords_x = tf.cast(coords_x, 'float32')
coords_y = tf.cast(coords_y, 'float32')
x0 = tf.floor(coords_x)
x1 = x0 + 1
y0 = tf.floor(coords_y)
y1 = y0 + 1
y_max = tf.cast(tf.shape(imgs)[1] - 1, 'float32')
x_max = tf.cast(tf.shape(imgs)[2] - 1, 'float32')
zero = tf.zeros([1], dtype='float32')
x0_safe = tf.clip_by_value(x0, zero, x_max)
y0_safe = tf.clip_by_value(y0, zero, y_max)
x1_safe = tf.clip_by_value(x1, zero, x_max)
y1_safe = tf.clip_by_value(y1, zero, y_max)
# bilinear interp weights, with points outside the grid having weight 0
# wt_x0 = (x1 - coords_x) * tf.cast(tf.equal(x0, x0_safe), 'float32')
# wt_x1 = (coords_x - x0) * tf.cast(tf.equal(x1, x1_safe), 'float32')
# wt_y0 = (y1 - coords_y) * tf.cast(tf.equal(y0, y0_safe), 'float32')
# wt_y1 = (coords_y - y0) * tf.cast(tf.equal(y1, y1_safe), 'float32')
wt_x0 = x1_safe - coords_x
wt_x1 = coords_x - x0_safe
wt_y0 = y1_safe - coords_y
wt_y1 = coords_y - y0_safe
# indices in the flat image to sample from
dim2 = tf.cast(inp_size[2], 'float32')
dim1 = tf.cast(inp_size[2] * inp_size[1], 'float32')
base = tf.reshape(
_repeat(
tf.cast(tf.range(coord_size[0]), 'float32') * dim1,
coord_size[1] * coord_size[2]),
[out_size[0], out_size[1], out_size[2], 1])
base_y0 = base + y0_safe * dim2
base_y1 = base + y1_safe * dim2
idx00 = tf.reshape(x0_safe + base_y0, [-1])
idx01 = x0_safe + base_y1
idx10 = x1_safe + base_y0
idx11 = x1_safe + base_y1
# sample from imgs
imgs_flat = tf.reshape(imgs, tf.stack([-1, inp_size[3]]))
imgs_flat = tf.cast(imgs_flat, 'float32')
im00 = tf.reshape(
tf.gather(imgs_flat, tf.cast(idx00, 'int32')), out_size)
im01 = tf.reshape(
tf.gather(imgs_flat, tf.cast(idx01, 'int32')), out_size)
im10 = tf.reshape(
tf.gather(imgs_flat, tf.cast(idx10, 'int32')), out_size)
im11 = tf.reshape(
tf.gather(imgs_flat, tf.cast(idx11, 'int32')), out_size)
w00 = wt_x0 * wt_y0
w01 = wt_x0 * wt_y1
w10 = wt_x1 * wt_y0
w11 = wt_x1 * wt_y1
output = tf.add_n([
w00 * im00, w01 * im01,
w10 * im10, w11 * im11
])
return output
# Spatial transformer network bilinear sampler, taken from https://github.com/kevinzakka/spatial-transformer-network/blob/master/stn/transformer.py
def stn_bilinear_sampler(img, x, y):
"""
Performs bilinear sampling of the input images according to the
normalized coordinates provided by the sampling grid. Note that
the sampling is done identically for each channel of the input.
To test if the function works properly, output image should be
identical to input image when theta is initialized to identity
transform.
Input
-----
- img: batch of images in (B, H, W, C) layout.
- grid: x, y which is the output of affine_grid_generator.
Returns
-------
- out: interpolated images according to grids. Same size as grid.
"""
H = tf.shape(img)[1]
W = tf.shape(img)[2]
max_y = tf.cast(H - 1, 'int32')
max_x = tf.cast(W - 1, 'int32')
zero = tf.zeros([], dtype='int32')
# rescale x and y to [0, W-1/H-1]
x = tf.cast(x, 'float32')
y = tf.cast(y, 'float32')
x = 0.5 * ((x + 1.0) * tf.cast(max_x-1, 'float32'))
y = 0.5 * ((y + 1.0) * tf.cast(max_y-1, 'float32'))
# grab 4 nearest corner points for each (x_i, y_i)
x0 = tf.cast(tf.floor(x), 'int32')
x1 = x0 + 1
y0 = tf.cast(tf.floor(y), 'int32')
y1 = y0 + 1
# clip to range [0, H-1/W-1] to not violate img boundaries
x0 = tf.clip_by_value(x0, zero, max_x)
x1 = tf.clip_by_value(x1, zero, max_x)
y0 = tf.clip_by_value(y0, zero, max_y)
y1 = tf.clip_by_value(y1, zero, max_y)
# get pixel value at corner coords
Ia = get_pixel_value(img, x0, y0)
Ib = get_pixel_value(img, x0, y1)
Ic = get_pixel_value(img, x1, y0)
Id = get_pixel_value(img, x1, y1)
# recast as float for delta calculation
x0 = tf.cast(x0, 'float32')
x1 = tf.cast(x1, 'float32')
y0 = tf.cast(y0, 'float32')
y1 = tf.cast(y1, 'float32')
# calculate deltas
wa = (x1-x) * (y1-y)
wb = (x1-x) * (y-y0)
wc = (x-x0) * (y1-y)
wd = (x-x0) * (y-y0)
# add dimension for addition
wa = tf.expand_dims(wa, axis=3)
wb = tf.expand_dims(wb, axis=3)
wc = tf.expand_dims(wc, axis=3)
wd = tf.expand_dims(wd, axis=3)
# compute output
out = tf.add_n([wa*Ia, wb*Ib, wc*Ic, wd*Id])
return out
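A shape-level sketch of the warping chain above (note that pixel2cam uses the TF1-era tf.matrix_inverse; under TF2 the equivalent is tf.linalg.inv):
img = tf.random.uniform([4, 224, 224, 3])            # source frames
depth = tf.random.uniform([4, 224, 224], 1.0, 10.0)  # target-frame depth
pose = tf.zeros([4, 6])                              # tx, ty, tz, rx, ry, rz
k = tf.constant([[[112.0, 0.0, 112.0], [0.0, 112.0, 112.0], [0.0, 0.0, 1.0]]])
warped = projective_inverse_warp(img, depth, pose, tf.tile(k, [4, 1, 1]))
print(warped.shape)  # (4, 224, 224, 3)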

unsupervised/train.py Normal file
View File

@@ -0,0 +1,20 @@
"""
Trainer to learn depth information on unlabeled data (raw images/videos)
Allows pluggable depth networks for differing performance (including fast-depth)
"""
import tensorflow.keras as keras
class SFMLearner(keras.Model):
def __init__(self, depth_model, pose_model):
pass
def train_step(self, data):
pass
def make_sfm_learner_pose_net(input_shape=(224, 224, 3)):
pass
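A minimal sketch of the custom training loop this stub points toward: standard keras.Model.train_step plumbing. The warp function and the data layout here are assumptions, not this repo's final design:
import tensorflow as tf

class SFMLearnerSketch(keras.Model):
    def __init__(self, depth_model, pose_model, loss_fn, warp_fn):
        super().__init__()
        self.depth_model = depth_model
        self.pose_model = pose_model
        self.loss_fn = loss_fn      # e.g. make_combined_ssim_l1_loss()
        self.warp_fn = warp_fn      # e.g. projective_inverse_warp from third-party/utils.py

    def train_step(self, data):
        tgt, src, intrinsics = data                            # assumed batch layout
        with tf.GradientTape() as tape:
            depth = self.depth_model(tgt)                      # (B, H, W, 1)
            pose = self.pose_model(tf.concat([tgt, src], -1))  # (B, 6)
            reprojected = self.warp_fn(src, depth[..., 0], pose, intrinsics)
            loss = tf.reduce_mean(self.loss_fn(tgt, reprojected))
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        return {'loss': loss}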

unsupervised/warp.py Normal file
View File

@@ -0,0 +1,19 @@
def projective_inverse_warp(target_img, source_img, depth, pose, intrinsics):
"""
Calculate the reprojected image from the source to the target, based on the given depth, pose and intrinsics
SFM Learner inverse warp step
ps ~ K.T(t->s).Dt(pt).K^-1.pt
Idea is to map the pixel coordinates of the target image to 3d space (Dt(pt).K^-1.pt), then map these onto
the source image in pixel coordinates (K.T(t->s).{3d coord}), then using the projected coordinates we sample
the pixels in the source image (ps) to reconstruct the target image.
:param target_img: Tensor (batch, height, width, 3)
:param source_img: Tensor, same shape as target_img
:param depth: Tensor, (batch, height, width, 1)
:param pose: Tensor (batch, 6) -- tx, ty, tz, rx, ry, rz (matching third-party/utils.py)
:param intrinsics: (batch, 3, 3)
:return: The source image reprojected to the target
"""
pass
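The vendored utils already implement each factor of this equation; how they compose, step by step (names from unsupervised/third-party/utils.py):
# pt (homogeneous pixel grid)          -> meshgrid(batch, height, width)
# Dt(pt) . K^-1 . pt (camera frame)    -> pixel2cam(depth, pixel_coords, intrinsics)
# K . T(t->s) (project into source)    -> cam2pixel(cam_coords, intrinsics_4x4 @ pose_vec2mat(pose))
# sample the source at ps              -> bilinear_sampler(source_img, src_pixel_coords)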

util.py Normal file
View File

@@ -0,0 +1,21 @@
import tensorflow as tf
import tensorflow.keras as keras
def crop_and_resize(x, out_shape=(224, 224)):
shape = tf.shape(x['depth'])
img_shape = tf.shape(x['image'])
# Ensure we get a square for when we resize it later.
# For horizontal images this is basically just cropping the sides off
center_shape = tf.minimum(shape[1], tf.minimum(shape[2], tf.minimum(img_shape[1], img_shape[2])))
def layer():
return keras.Sequential([
keras.layers.experimental.preprocessing.CenterCrop(
center_shape, center_shape),
keras.layers.experimental.preprocessing.Resizing(
out_shape[0], out_shape[1], interpolation='nearest')
])
# Reshape label to 4d, can't use array unwrap as it's unsupported by tensorflow
return layer()(x['image']), layer()(tf.reshape(x['depth'], [shape[0], shape[1], shape[2], 1]))
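A sketch of what the mapper yields for one nyu_v2-style batch (shapes only):
batch = {'image': tf.zeros([8, 480, 640, 3]), 'depth': tf.zeros([8, 480, 640])}
image, depth = crop_and_resize(batch, out_shape=(224, 224))
print(image.shape, depth.shape)  # expect (8, 224, 224, 3) and (8, 224, 224, 1)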