Compare commits
3 Commits
unsupervis...main

| Author | SHA1 | Date |
|---|---|---|
|  | 513227a84a |  |
|  | a053238310 |  |
|  | 28b11aaa44 |  |
@@ -1,8 +1,8 @@
 import coremltools as ct


-def convert_coreml(model_path, save_path='../mobilenet_nnconv5.mlmodel'):
-    mlmodel = ct.convert(model_path)
+def convert_coreml(model_path, save_path='mobilenet_nnconv5.mlmodel'):
+    mlmodel = ct.convert(model_path, inputs=[ct.ImageType()])
     mlmodel.save(save_path)

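The new call passes `inputs=[ct.ImageType()]` so Core ML treats the network input as an image rather than a generic multi-array. A minimal, hedged sketch of how this converter might be exercised; the SavedModel name below is borrowed from the commented-out line in main.py further down and is only an assumption about what gets exported:

```python
# Sketch only: exercising the updated convert_coreml. ct.convert accepts a SavedModel
# directory, an .h5 file, or an in-memory Keras model; ct.ImageType() marks the input
# as an image so the generated Core ML model exposes it as such.
import coremltools as ct


def convert_coreml(model_path, save_path='mobilenet_nnconv5.mlmodel'):
    mlmodel = ct.convert(model_path, inputs=[ct.ImageType()])
    mlmodel.save(save_path)


if __name__ == '__main__':
    # Assumed export name, mirroring the SavedModel path mentioned in main.py.
    convert_coreml('fast_depth_nyu_v2_224_224_3_e1_saved_model')
```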
@@ -1,5 +1,4 @@
 import tensorflow.keras as keras
-import tensorflow_datasets as tfds

 import fast_depth_functional as fd

@@ -12,12 +11,23 @@ def dense_upsample_block(input, out_channels, skip_connection):
     x = keras.layers.Concatenate()([x, skip_connection])
     x = keras.layers.Conv2D(filters=out_channels,
                             kernel_size=3, strides=1, padding='same')(x)
+    x = keras.layers.LeakyReLU(alpha=0.2)(x)
     x = keras.layers.Conv2D(filters=out_channels,
                             kernel_size=3, strides=1, padding='same')(x)
     return keras.layers.LeakyReLU(alpha=0.2)(x)


 def dense_depth(size, weights=None, shape=(224, 224, 3)):
+    """
+    Make the dense depth network graph using the keras functional api.
+
+    Note that you should use the dense depth loss function, and use Adam as the optimiser with a learning rate of 0.0001
+    (default learning rate of Adam is 0.001).
+    :param size:
+    :param weights:
+    :param shape:
+    :return:
+    """
     input = keras.layers.Input(shape=shape)
     densenet = dense_net(input, size, weights, shape)

@@ -37,6 +47,8 @@ def dense_depth(size, weights=None, shape=(224, 224, 3)):
     decoder = dense_upsample_block(
         decoder, densenet_output_channels // 16, densenet.get_layer('conv1/relu').output)

+    decoder = dense_upsample_block(decoder, int(densenet_output_channels / 32), input)
+
     conv3 = keras.layers.Conv2D(
         filters=1, kernel_size=3, strides=1, padding='same', name='conv3')(decoder)
     return keras.Model(inputs=input, outputs=conv3, name='dense_depth')
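The new docstring prescribes the dense depth loss together with Adam at a learning rate of 0.0001. A hedged sketch of what that compile step could look like; `dense_depth_loss_function` is the name used later in this diff, and both the `size` value and the import locations are assumptions since the file names are not visible here:

```python
# Sketch only: compiling dense_depth the way its new docstring recommends.
# dense_depth and dense_depth_loss_function are repo functions shown elsewhere in
# this diff; 121 is an assumed DenseNet size argument.
import tensorflow.keras as keras

model = dense_depth(121)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001),
              loss=dense_depth_loss_function)
```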
@@ -72,6 +72,7 @@ def mobilenet_nnconv5(weights=None, shape=(224, 224, 3)):
     x = keras.layers.Conv2D(1, 1, padding='same')(x)
     x = keras.layers.BatchNormalization()(x)
     x = keras.layers.ReLU(6.)(x)
+    x = keras.layers.Reshape([shape[0], shape[1]])(x)
     return keras.Model(inputs=input, outputs=x, name="fast_depth")

@@ -5,6 +5,10 @@ def dense_depth_loss_function(y, y_pred):
     """
     Implementation of the loss from the dense depth paper https://arxiv.org/pdf/1812.11941.pdf
     """
+    if len(y.shape) == 3:
+        y = tf.expand_dims(y, 3)
+    if len(y_pred.shape) == 3:
+        y_pred = tf.expand_dims(y_pred, 3)
     # Point-wise L1 loss
     l1_depth = tf.reduce_mean(tf.math.abs(y_pred - y), axis=-1)

@@ -15,6 +19,6 @@ def dense_depth_loss_function(y, y_pred):
         tf.math.abs(dx_pred - dx), axis=-1)

     # Structural Similarity (SSIM)
-    ssim = (1 - tf.image.ssim(y, y_pred, 500)) / 2
+    ssim = tf.clip_by_value((1 - tf.image.ssim(y, y_pred, 100)) / 2, 0, 1)

     return 0.1 * tf.reduce_mean(l1_depth) + tf.reduce_mean(gradient) + ssim
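The two hunks above add a channel-dimension guard and clamp the SSIM term to [0, 1]. For context, a hedged sketch of the full three-term DenseDepth-style loss being edited; variable names follow the diff, but the gradient computation shown here is an assumption about the lines that are not visible in this compare:

```python
# Sketch only: point-wise L1 + image-gradient L1 + clipped SSIM, mirroring the diff above.
import tensorflow as tf


def dense_depth_loss_sketch(y, y_pred, max_depth=100.0):
    if len(y.shape) == 3:
        y = tf.expand_dims(y, 3)
    if len(y_pred.shape) == 3:
        y_pred = tf.expand_dims(y_pred, 3)

    # Point-wise L1 loss
    l1_depth = tf.reduce_mean(tf.math.abs(y_pred - y), axis=-1)

    # Edge/gradient loss (assumed form of the elided dy/dx terms)
    dy, dx = tf.image.image_gradients(y)
    dy_pred, dx_pred = tf.image.image_gradients(y_pred)
    gradient = tf.reduce_mean(tf.math.abs(dy_pred - dy), axis=-1) + \
               tf.reduce_mean(tf.math.abs(dx_pred - dx), axis=-1)

    # Structural similarity, clipped as in the new version of the file
    ssim = tf.clip_by_value((1 - tf.image.ssim(y, y_pred, max_depth)) / 2, 0, 1)

    return 0.1 * tf.reduce_mean(l1_depth) + tf.reduce_mean(gradient) + ssim
```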
main.py (10 changes)
@@ -1,6 +1,4 @@
 import fast_depth_functional as fd
-from unsupervised.models import pose_net, wrap_mobilenet_nnconv5_for_utrain
-from unsupervised.train import UnsupervisedPoseDepthLearner

 if __name__ == '__main__':
     fd.fix_windows_gpu()
@@ -11,11 +9,3 @@ if __name__ == '__main__':

     # Save in Tensorflow SavedModel format
     # tf.saved_model.save(model, 'fast_depth_nyu_v2_224_224_3_e1_saved_model')
-
-    # Unsupervised
-    depth_model = fd.mobilenet_nnconv5()
-    pose_model = pose_net()
-    model = UnsupervisedPoseDepthLearner(wrap_mobilenet_nnconv5_for_utrain(depth_model), pose_model)
-    model.compile(optimizer='adam')
-    # TODO: Incorporate data generator
-    # model.fit()
@@ -1,7 +1,7 @@
 import tensorflow as tf


-def delta1_metric(y_true, y_pred):
+def delta1(y_true, y_pred):
     max_ratio = tf.maximum(y_pred / y_true, y_true / y_pred)
     return tf.reduce_mean(tf.cast(max_ratio < tf.convert_to_tensor(1.25), tf.float32))

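The renamed `delta1` measures the fraction of pixels whose predicted/true depth ratio stays below 1.25. `delta2` and `delta3`, referenced in train.py below, conventionally use thresholds of 1.25² and 1.25³; a hedged sketch of the whole family, assuming the repo defines them the same way:

```python
# Sketch only: the delta-threshold accuracy metrics used as Keras metrics in train.py.
import tensorflow as tf


def make_delta_metric(threshold):
    def delta(y_true, y_pred):
        max_ratio = tf.maximum(y_pred / y_true, y_true / y_pred)
        return tf.reduce_mean(tf.cast(max_ratio < threshold, tf.float32))
    return delta


delta1 = make_delta_metric(1.25)
delta2 = make_delta_metric(1.25 ** 2)
delta3 = make_delta_metric(1.25 ** 3)
```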
train.py (2 changes)
@@ -19,7 +19,7 @@ def compile(model, optimiser=keras.optimizers.SGD(), loss=keras.losses.MeanSquar
                   loss=loss,
                   metrics=[keras.metrics.RootMeanSquaredError(),
                            keras.metrics.MeanSquaredError(),
-                           delta1_metric,
+                           delta1,
                            delta2,
                            delta3,
                            keras.metrics.MeanAbsolutePercentageError(),
@@ -1,50 +0,0 @@
-import os
-
-import cv2
-
-
-def video_generator(video_path_or_folder, intrinsics, allowed_extensions=('mp4', 'mkv', 'mov')):
-    """
-    Create a generator for unsupervised training on depth sequences from a video file or folder of video files
-    :param video_path_or_folder: Video file or folder with list of video files to iterate through
-    :param intrinsics: Intrinsics for the videos TODO: Intrinsics per video
-    :param allowed_extensions: Allowed video extensions, to not accidentally pick files that aren't videos
-    :return: generator that yields dict of {frames: [frame1, frame2, frame3], intrinsics: [fx, fy, tx, ty]}
-    """
-    if os.path.isfile(video_path_or_folder):
-        # TODO: How to re-yield? Is this enough, since I'm just returning the actual generator?
-        # Or do I need to iterate like below?
-        return _single_video_generator(video_path_or_folder)
-    else:
-        for root, dirs, files in os.walk(video_path_or_folder):
-            for file in files:
-                if os.path.splitext(file)[1] in allowed_extensions:
-                    for frames in _single_video_generator(file):
-                        yield frames
-
-
-def _single_video_generator(video_file, intrinsics):
-    # Single video file
-    video = cv2.VideoCapture(video_file)
-
-    try:
-        # Buffer to store 3 frames, yield when this fills up
-        current_frames = []
-        while video.grab():
-            current_frames.append(video.retrieve())
-            if len(current_frames) == 3:
-                temp_frames = current_frames
-                current_frames = []
-                # TODO: Consider converting frames to tensor
-                yield {'frames': temp_frames, 'intrinsics': intrinsics}
-    finally:
-        video.release()
-
-
-def image_generator(root_folder):
-    """
-    Create an image generator for unsupervised training
-    :param root_folder:
-    :return:
-    """
-    pass
@@ -9,8 +9,8 @@ def wrap_mobilenet_nnconv5_for_utrain(model):

     This just exposes the lower disparity layers as outputs, so they can be used to train at different scales/image
     resolutions.
-    :param model: Fast Depth model to wrap
-    :return: Keras model that takes same input as model and outputs the model output plus 3 disparity layers
+    :param model:
+    :return:
     """
     input = model.input
     disp_1 = model.get_layer('conv_pw_%d_relu' % 15).output
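`wrap_mobilenet_nnconv5_for_utrain` exposes lower-resolution disparity layers as extra model outputs so training can happen at several scales. A hedged sketch of how such a wrapper can be built with the Keras functional API; `'conv_pw_15_relu'` comes from the hunk above, while any other layer names would be repo-specific placeholders:

```python
# Sketch only: exposing intermediate layers of an existing Keras model as extra outputs.
import tensorflow.keras as keras


def expose_disparity_outputs(model, extra_layer_names=('conv_pw_15_relu',)):
    extra_outputs = [model.get_layer(name).output for name in extra_layer_names]
    # Same input as the wrapped model; original output first, then the extra scales.
    return keras.Model(inputs=model.input, outputs=[model.output] + extra_outputs)
```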
@@ -3,9 +3,7 @@ Utils to load and split image/video data.
 """

 from __future__ import division
-
 import math
-
 import tensorflow as tf


@@ -38,7 +36,7 @@ def euler2mat(z, y, x):
     cosz = tf.cos(z)
     sinz = tf.sin(z)
     rotz_1 = tf.concat([cosz, -sinz, zeros], axis=3)
     rotz_2 = tf.concat([sinz, cosz, zeros], axis=3)
     rotz_3 = tf.concat([zeros, zeros, ones], axis=3)
     zmat = tf.concat([rotz_1, rotz_2, rotz_3], axis=2)

@@ -60,49 +58,6 @@ def euler2mat(z, y, x):
     return rotMat


-def euler2mat_noNDim(x, y, z):
-    """
-
-    :param x: Tensor of shape (B, 1) - x axis rotation
-    :param y: Tensor of shape (B, 1) - y axis rotation
-    :param z: Tensor of shape (B, 1) - z axis rotation
-    :return: Rotation matrix for the given euler anglers, in the order rotation(x).rotation(y).rotation(z)
-    """
-    batch_size = tf.shape(z)[0]
-
-    # Euler angles should be between -pi and pi, clip so the pose network is coerced to this range
-    z = tf.clip_by_value(z, -math.pi, math.pi)
-    y = tf.clip_by_value(y, -math.pi, math.pi)
-    x = tf.clip_by_value(x, -math.pi, math.pi)
-
-    zeros = tf.zeros([batch_size, 1])
-    ones = tf.ones([batch_size, 1])
-
-    cosx = tf.cos(x)
-    sinx = tf.sin(x)
-    rotx_1 = tf.concat([ones, zeros, zeros], axis=1)
-    rotx_2 = tf.concat([zeros, cosx, -sinx], axis=1)
-    rotx_3 = tf.concat([zeros, sinx, cosx], axis=1)
-    xmat = tf.reshape(tf.concat([rotx_1, rotx_2, rotx_3], axis=1), [batch_size, 3, 3])
-
-    cosz = tf.cos(z)
-    sinz = tf.sin(z)
-    rotz_1 = tf.concat([cosz, -sinz, zeros], axis=1)
-    rotz_2 = tf.concat([sinz, cosz, zeros], axis=1)
-    rotz_3 = tf.concat([zeros, zeros, ones], axis=1)
-    zmat = tf.reshape(tf.concat([rotz_1, rotz_2, rotz_3], axis=1), [batch_size, 3, 3])
-
-    cosy = tf.cos(y)
-    siny = tf.sin(y)
-    roty_1 = tf.concat([cosy, zeros, siny], axis=1)
-    roty_2 = tf.concat([zeros, ones, zeros], axis=1)
-    roty_3 = tf.concat([-siny, zeros, cosy], axis=1)
-    ymat = tf.reshape(tf.concat([roty_1, roty_2, roty_3], axis=1), [batch_size, 3, 3])
-
-    rotMat = tf.matmul(tf.matmul(zmat, ymat), xmat)
-    return rotMat
-
-
 def pose_vec2mat(vec):
     """Converts 6DoF parameters to transformation matrix
     Args:
@@ -326,7 +281,6 @@ def bilinear_sampler(imgs, coords):
     ])
     return output

-
 # Spatial transformer network bilinear sampler, taken from https://github.com/kevinzakka/spatial-transformer-network/blob/master/stn/transformer.py

@@ -355,8 +309,8 @@ def stn_bilinear_sampler(img, x, y):
     # rescale x and y to [0, W-1/H-1]
     x = tf.cast(x, 'float32')
     y = tf.cast(y, 'float32')
-    x = 0.5 * ((x + 1.0) * tf.cast(max_x - 1, 'float32'))
-    y = 0.5 * ((y + 1.0) * tf.cast(max_y - 1, 'float32'))
+    x = 0.5 * ((x + 1.0) * tf.cast(max_x-1, 'float32'))
+    y = 0.5 * ((y + 1.0) * tf.cast(max_y-1, 'float32'))

     # grab 4 nearest corner points for each (x_i, y_i)
     x0 = tf.cast(tf.floor(x), 'int32')
@@ -383,10 +337,10 @@ def stn_bilinear_sampler(img, x, y):
     y1 = tf.cast(y1, 'float32')

     # calculate deltas
-    wa = (x1 - x) * (y1 - y)
-    wb = (x1 - x) * (y - y0)
-    wc = (x - x0) * (y1 - y)
-    wd = (x - x0) * (y - y0)
+    wa = (x1-x) * (y1-y)
+    wb = (x1-x) * (y-y0)
+    wc = (x-x0) * (y1-y)
+    wd = (x-x0) * (y-y0)

     # add dimension for addition
     wa = tf.expand_dims(wa, axis=3)
@@ -395,6 +349,6 @@ def stn_bilinear_sampler(img, x, y):
     wd = tf.expand_dims(wd, axis=3)

     # compute output
-    out = tf.add_n([wa * Ia, wb * Ib, wc * Ic, wd * Id])
+    out = tf.add_n([wa*Ia, wb*Ib, wc*Ic, wd*Id])

     return out
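The whitespace-only edits above touch the bilinear interpolation weights. Written out, for a sample point (x, y) with surrounding integer corners (x0, y0) through (x1, y1), the sampled value is an area-weighted blend; the assignment of wa..wd to the four corner pixels Ia..Id is assumed to follow the referenced spatial-transformer-network code:

```latex
% Bilinear sampling weights from the hunks above (corner assignment per the
% linked spatial-transformer-network implementation):
I(x, y) = w_a I_a + w_b I_b + w_c I_c + w_d I_d, \qquad
\begin{aligned}
w_a &= (x_1 - x)(y_1 - y), & w_b &= (x_1 - x)(y - y_0),\\
w_c &= (x - x_0)(y_1 - y), & w_d &= (x - x_0)(y - y_0).
\end{aligned}
```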
@@ -4,98 +4,17 @@ Trainer to learn depth information on unlabeled data (raw images/videos)
 Allows pluggable depth networks for differing performance (including fast-depth)
 """

-import tensorflow as tf
-import tensorflow.python.keras as keras
-from unsupervised import warp, loss
+import tensorflow.keras as keras


-class UnsupervisedPoseDepthLearner(keras.Model):
-    """
-    Keras model to learn simultaneous depth + pose from image/video sequences.
-
-    To train this, the datasource should yield 3 frames and camera intrinsics.
-    Optionally velocity + timestamp per frame to train to real scale
-    """
-
-    def __init__(self, depth_model, pose_model, num_scales=3, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.depth_model = depth_model
-        self.pose_model = pose_model
-        # TODO: I think num_scales should be something defined on the depth model itself
-        self.num_scales = num_scales
-        self.smoothness = 1e-3
+class SFMLearner(keras.Model):
+    def __init__(depth_model, pose_model):
+        pass

     def train_step(self, data):
-        """
-
-        :param data: Format: {frames: Mat[3], intrinsics: Tensor}
-        """
-        with tf.GradientTape as tape:
-            # Pass through depth for target image
-            # TODO: Convert frame to tensor (or do this in the dataloader)
-            # TODO: Ensure the depth output includes enough outputs for each scale
-            depth = self.depth_model(data.frames[1])
-
-            # Pass through depth -> pose for both source images
-            # TODO: Concat these poses using tf.concat
-            pose1 = self.pose_model(data.frames[1], data.frames[0])
-            pose2 = self.pose_model(data.frames[1], data.frames[2])
-
-            loss = self.calculate_loss(depth, pose1, pose2, data)
-
-        # Apply optimise step on total loss
-        # TODO: Do these need to be separate for depth/pose model?
-        grads = tape.gradient(loss, zip(self.depth_model.trainable_weights, self.pose_model.trainable_weights))
-        self.optimizer.apply_gradients(
-            zip(grads, self.depth_model.trainable_weights, self.pose_model.trainable_weights))
-
-    def calculate_loss(self, depth, pose1, pose2, data):
-        shape = depth[0].shape
-
-        # TODO: Pull coords out of train step into initialiser, then it only needs to be created once.
-        # Ideally the size/batch size will still be calculated automatically
-        coords = warp.image_coordinate(shape[0], shape[1], shape[2])
-        total_loss = 0
-
-        scale_losses = []
-        # For each scale, do the projective inverse warp step and calculate losses
-        for scale in range(self.num_scales):
-            # TODO: Could simplify this by stacking the source images (see sfmlearner)
-            # It isn't too much of an issue right now since we're only using 2 images (left/right)
-            # For each depth output (scale), do the projective inverse warp on each input image and calculate the losses
-            # Only take the min loss between the two warped images (from monodepth2)
-            # TODO: Need to bilinear resize the depth at each scale up to the size of image
-            warp1 = warp.projective_inverse_warp(data.frames[0], depth[scale], pose1, data.intrinsics, coords)
-            warp2 = warp.projective_inverse_warp(data.frames[2], depth[scale], pose2, data.intrinsics, coords)
-
-            # Per pixel loss is just the difference in pixel intensities?
-            # Something like l1 plus ssim
-            warp_loss1 = loss.make_combined_ssim_l1_loss(data.frames[1], warp1)
-            warp_loss2 = loss.make_combined_ssim_l1_loss(data.frames[1], warp2)
-
-            # Take loss between target (data.frames[1]) and source images (pre-warp)
-            source_loss1 = loss.make_combined_ssim_l1_loss(data.frames[1], data.frames[0])
-            source_loss2 = loss.make_combined_ssim_l1_loss(data.frames[1], data.frames[2])
-
-            # Take the min (per pixel) of the losses of warped/unwarped images (so min across pixels of 4 images)
-            # TODO: Verify the axes are correct
-            reprojection_loss = tf.reduce_mean(
-                tf.reduce_min(tf.concat([warp_loss1, warp_loss2, source_loss1, source_loss2], axis=3), axis=3))
-
-            # Calculate smooth losses
-            # TODO: Since smooth loss is calculated directly on the depth at the scale, we need
-            # to resize the target image to the same dimensions as the depth map at the current scale
-            # Can do this by just inspecting the shape of the depth and resizing to match that (but
-            # with 3 colour channels)
-            smooth_loss = loss.smooth_loss(depth[scale], data.frames[1])
-
-            # SFM Learner downscales smoothing loss depending on the scale
-            smoothed_reprojection_loss = self.smoothness * smooth_loss / (2 ** scale)
-
-            # Add to total loss (with smooth loss + smooth loss weighting applied to pixel losses)
-            total_loss += reprojection_loss + smoothed_reprojection_loss
-            pass
-
-        # Collect losses, average them out (divide by number of scales)
-        total_loss /= self.num_scales
-        return total_loss
+        pass
+
+
+def make_sfm_learner_pose_net(input_shape=(224, 224, 3)):
+    pass
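The removed `calculate_loss` takes a per-pixel minimum over the warped and un-warped photometric losses (the monodepth2-style auto-masking mentioned in its comments) and adds a smoothness term scaled down by 2**scale. A hedged sketch of that per-scale reduction, isolated from the class; the loss tensors are assumed to be 4-D, as in the removed code:

```python
# Sketch only: the per-scale reduction performed by the removed calculate_loss.
import tensorflow as tf


def per_scale_loss(warp_loss1, warp_loss2, source_loss1, source_loss2,
                   smooth_loss, scale, smoothness=1e-3):
    # Per-pixel minimum over the two warped and two un-warped reprojection losses.
    stacked = tf.concat([warp_loss1, warp_loss2, source_loss1, source_loss2], axis=3)
    reprojection_loss = tf.reduce_mean(tf.reduce_min(stacked, axis=3))
    # Smoothness term down-weighted at coarser scales, as in the removed code.
    return reprojection_loss + smoothness * smooth_loss / (2 ** scale)
```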
@@ -1,223 +1,19 @@
-import math
-
-import tensorflow as tf
-
-
-def euler_to_matrix(x, y, z):
-    """
-
-    :param x: Tensor of shape (B, 1) - x axis rotation
-    :param y: Tensor of shape (B, 1) - y axis rotation
-    :param z: Tensor of shape (B, 1) - z axis rotation
-    :return: Rotation matrix for the given euler anglers, in the order rotation(x) -> rotation(y) -> rotation(z)
-    """
-    batch_size = tf.shape(z)[0]
-
-    # Euler angles should be between -pi and pi, clip so the pose network is coerced to this range
-    z = tf.clip_by_value(z, -math.pi, math.pi)
-    y = tf.clip_by_value(y, -math.pi, math.pi)
-    x = tf.clip_by_value(x, -math.pi, math.pi)
-
-    cosx = tf.cos(x)
-    sinx = tf.sin(x)
-
-    cosy = tf.cos(y)
-    siny = tf.sin(y)
-
-    cosz = tf.cos(z)
-    sinz = tf.sin(z)
-
-    # Otherwise this will need to be reversed
-    # Rotate about x, y then z. z goes first here as rotation is always left side of coordinates
-    # R = Rz(φ)Ry(θ)Rx(ψ)
-    # = | cos(θ)cos(φ)   sin(ψ)sin(θ)cos(φ) − cos(ψ)sin(φ)   cos(ψ)sin(θ)cos(φ) + sin(ψ)sin(φ) |
-    #   | cos(θ)sin(φ)   sin(ψ)sin(θ)sin(φ) + cos(ψ)cos(φ)   cos(ψ)sin(θ)sin(φ) − sin(ψ)cos(φ) |
-    #   | −sin(θ)        sin(ψ)cos(θ)                        cos(ψ)cos(θ)                      |
-    row_1 = tf.concat([cosy * cosz, sinx * siny * cosz - cosx * sinz, cosx * siny * cosz + sinx * sinz], 1)
-    row_2 = tf.concat([cosy * sinz, sinx * siny * sinz + cosx * cosz, cosx * siny * sinz - sinx * cosz], 1)
-    row_3 = tf.concat([-siny, sinx * cosy, cosx * cosy], 1)
-    return tf.reshape(tf.concat([row_1, row_2, row_3], axis=1), [batch_size, 3, 3])
-
-
-def pose_vec2mat(vec):
-    """Converts 6DoF parameters to transformation matrix
-    Args:
-        vec: 6DoF parameters in the order of tx, ty, tz, rx, ry, rz -- [B, 6]
-    Returns:
-        A transformation matrix -- [B, 3, 4]
-    """
-    batch_size, _ = vec.get_shape().as_list()
-    translation = tf.slice(vec, [0, 0], [-1, 3])
-    translation = tf.expand_dims(translation, -1)
-    rx = tf.slice(vec, [0, 3], [-1, 1])
-    ry = tf.slice(vec, [0, 4], [-1, 1])
-    rz = tf.slice(vec, [0, 5], [-1, 1])
-    rot_mat = euler_to_matrix(rx, ry, rz)
-    transform_mat = tf.concat([rot_mat, translation], axis=2)
-    return transform_mat
-
-
-def image_coordinate(batch, height, width):
-    """
-    Construct a tensor for the given height/width with elements the homogenous coordinates for the pixel
-    :param batch: Number of images in a batch
-    :param height: Height of image
-    :param width: Width of image
-    :return: Tensor of shape (B, height, width, 3), homogenous coordinates for an image.
-        Coordinates are in order [x, y, 1]
-    """
-    x_coords = tf.range(width)
-    y_coords = tf.range(height)
-
-    x_mesh, y_mesh = tf.meshgrid(x_coords, y_coords)
-
-    ones_mesh = tf.cast(tf.ones([height, width]), tf.int32)
-
-    stacked = tf.stack([x_mesh, y_mesh, ones_mesh], axis=2)
-
-    return tf.cast(tf.repeat(tf.expand_dims(stacked, axis=0), batch, axis=0), dtype=tf.float32)
-
-
-def projective_inverse_warp(source_img, depth, pose, intrinsics, coordinates):
+def projective_inverse_warp(target_img, source_img, depth, pose, intrinsics):
     """
     Calculate the reprojected image from the source to the target, based on the given depth, pose and intrinsics

     SFM Learner inverse warp step
-    ps ~ K.T(t->s).Dt(pt)*K^-1.pt
-
-    Note that the depth pixel Dt(pt) is multiplied by every coordinate value (just element-wise, not matrix multiplication)
+    ps ~ K.T(t->s).Dt(pt).K^-1.pt

     Idea is to map the pixel coordinates of the target image to 3d space (Dt(pt).K^-1.pt), then map these onto
     the source image in pixel coordinates (K.T(t->s).{3d coord}), then using the projected coordinates we sample
     the pixels in the source image (ps) to reconstruct the target image.

-    :param source_img: Tensor (batch, height, width, 3)
-    :param depth: Tensor, (batch, height, width)
-    :param pose: (batch, 6)
-    :param intrinsics: (batch, 3, 3) TODO: Intrinsics per image (per source/target image)?
-    :param coordinates: (batch, 3, height * width) - coordinates for the image. Pass this in so it doesn't need to be
-        calculated on every warp step
+    :param target_img: Tensor (batch, height, width, 3)
+    :param source_img: Tensor, same shape as target_img
+    :param depth: Tensor, (batch, height, width, 1)
+    :param pose: (batch, 3, 3)
+    :param intrinsics: (batch, 3, 3)
     :return: The source image reprojected to the target
     """
-    # Convert pose vector (output of pose net) to pose matrix (4x4)
-    pose_3x4 = pose_vec2mat(pose)
-
-    # Convert intrinsics matrix (3x3) to (4x4) so it can be multiplied by the pose net
-    # intrinsics_4x4 =
-    # Calculate inverse of the 4x4 intrinsics matrix
-    intrinsics_inverse = tf.linalg.inv(intrinsics)
-
-    depth_flat = tf.reshape(depth, [depth.shape[0], depth.shape[1] * depth.shape[2]])
-
-    # Do the function
-    sample_coordinates = tf.matmul(tf.matmul(intrinsics, pose_3x4),
-                                   tf.concat([depth_flat * tf.matmul(intrinsics_inverse, coordinates),
-                                              tf.ones([depth_flat.shape[0], 1, depth_flat.shape[1]])], axis=1))
-
-    # Normalise the x/y axes (divide by z axis)
-    sample_coordinates = sample_coordinates[:, 0:2] / sample_coordinates[:, 2]
-
-    # Reshape back to image coordinates
-    sample_coordinates = tf.reshape(tf.transpose(sample_coordinates, [0, 2, 1]),
-                                    [depth.shape[0], depth.shape[1], depth.shape[2], 2])
-
-    # sample from the source image using the coordinates applied by the function
-    return bilinear_sampler(source_img, sample_coordinates)
-
-
-def bilinear_sampler(imgs, coords):
-    """Construct a new image by bilinear sampling from the input image.
-    Points falling outside the source image boundary have value 0.
-    Args:
-        imgs: source image to be sampled from [batch, height_s, width_s, channels]
-        coords: coordinates of source pixels to sample from [batch, height_t,
-            width_t, 2]. height_t/width_t correspond to the dimensions of the output
-            image (don't need to be the same as height_s/width_s). The two channels
-            correspond to x and y coordinates respectively.
-    Returns:
-        A new sampled image [batch, height_t, width_t, channels]
-    """
-
-    def _repeat(x, n_repeats):
-        rep = tf.transpose(
-            tf.expand_dims(tf.ones(shape=tf.stack([
-                n_repeats,
-            ])), 1), [1, 0])
-        rep = tf.cast(rep, 'float32')
-        x = tf.matmul(tf.reshape(x, (-1, 1)), rep)
-        return tf.reshape(x, [-1])
-
-    coords_x, coords_y = tf.split(coords, [1, 1], axis=3)
-    inp_size = imgs.get_shape()
-    coord_size = coords.get_shape()
-    out_size = coords.get_shape().as_list()
-    out_size[3] = imgs.get_shape().as_list()[3]
-
-    coords_x = tf.cast(coords_x, 'float32')
-    coords_y = tf.cast(coords_y, 'float32')
-
-    x0 = tf.floor(coords_x)
-    x1 = x0 + 1
-    y0 = tf.floor(coords_y)
-    y1 = y0 + 1
-
-    y_max = tf.cast(tf.shape(imgs)[1] - 1, 'float32')
-    x_max = tf.cast(tf.shape(imgs)[2] - 1, 'float32')
-    zero = tf.zeros([1], dtype='float32')
-
-    x0_safe = tf.clip_by_value(x0, zero, x_max)
-    y0_safe = tf.clip_by_value(y0, zero, y_max)
-    x1_safe = tf.clip_by_value(x1, zero, x_max)
-    y1_safe = tf.clip_by_value(y1, zero, y_max)
-
-    # bilinear interp weights, with points outside the grid having weight 0
-    # wt_x0 = (x1 - coords_x) * tf.cast(tf.equal(x0, x0_safe), 'float32')
-    # wt_x1 = (coords_x - x0) * tf.cast(tf.equal(x1, x1_safe), 'float32')
-    # wt_y0 = (y1 - coords_y) * tf.cast(tf.equal(y0, y0_safe), 'float32')
-    # wt_y1 = (coords_y - y0) * tf.cast(tf.equal(y1, y1_safe), 'float32')
-
-    wt_x0 = x1_safe - coords_x
-    wt_x1 = coords_x - x0_safe
-    wt_y0 = y1_safe - coords_y
-    wt_y1 = coords_y - y0_safe
-
-    # indices in the flat image to sample from
-    dim2 = tf.cast(inp_size[2], 'float32')
-    dim1 = tf.cast(inp_size[2] * inp_size[1], 'float32')
-    base = tf.reshape(
-        _repeat(
-            tf.cast(tf.range(coord_size[0]), 'float32') * dim1,
-            coord_size[1] * coord_size[2]),
-        [out_size[0], out_size[1], out_size[2], 1])
-
-    base_y0 = base + y0_safe * dim2
-    base_y1 = base + y1_safe * dim2
-    idx00 = tf.reshape(x0_safe + base_y0, [-1])
-    idx01 = x0_safe + base_y1
-    idx10 = x1_safe + base_y0
-    idx11 = x1_safe + base_y1
-
-    # sample from imgs
-    imgs_flat = tf.reshape(imgs, tf.stack([-1, inp_size[3]]))
-    imgs_flat = tf.cast(imgs_flat, 'float32')
-    im00 = tf.reshape(
-        tf.gather(imgs_flat, tf.cast(idx00, 'int32')), out_size)
-    im01 = tf.reshape(
-        tf.gather(imgs_flat, tf.cast(idx01, 'int32')), out_size)
-    im10 = tf.reshape(
-        tf.gather(imgs_flat, tf.cast(idx10, 'int32')), out_size)
-    im11 = tf.reshape(
-        tf.gather(imgs_flat, tf.cast(idx11, 'int32')), out_size)
-
-    w00 = wt_x0 * wt_y0
-    w01 = wt_x0 * wt_y1
-    w10 = wt_x1 * wt_y0
-    w11 = wt_x1 * wt_y1
-
-    output = tf.add_n([
-        w00 * im00, w01 * im01,
-        w10 * im10, w11 * im11
-    ])
-    return output
+    pass
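The docstring's warp relation reads as a chain of matrix products: project the homogeneous target pixel to 3D with its predicted depth, move it into the source frame, and re-project with the intrinsics.

```latex
% Projective inverse warp from the docstring above, for a homogeneous target pixel p_t
% with predicted depth D_t(p_t), intrinsics K, and target-to-source pose T_{t \to s}:
p_s \sim K \, T_{t \to s} \, \bigl( D_t(p_t) \, K^{-1} p_t \bigr)
```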
@@ -1,63 +0,0 @@
-import unittest
-
-import numpy as np
-import tensorflow as tf
-
-import warp
-
-
-class MyTestCase(unittest.TestCase):
-    def test_euler_to_rotation_matrix(self):
-        # quarter rotation in every
-        x = y = z = tf.expand_dims(tf.expand_dims(tf.constant(np.pi / 2), 0), 0)
-        x2 = y2 = z2 = tf.expand_dims(tf.expand_dims(tf.constant(np.pi / 4), 0), 0)
-
-        x_batch = tf.concat([x, x2], 0)
-        y_batch = tf.concat([y, y2], 0)
-        z_batch = tf.concat([z, z2], 0)
-
-        # TODO: Construct expected final rotation matrix, just 3x3 using numpy, so that we can do an
-        # elementwise comparison later. Probably also want to check the
-
-        rotation_matrices = warp.euler_to_matrix(x_batch, y_batch, z_batch)
-        # old_rot = utils.euler2mat_noNDim(x_batch, y_batch, z_batch)
-
-        self.assertEqual(rotation_matrices.shape, [2, 3, 3])
-
-    def test_coordinates(self):
-        height = 1000
-        width = 2000
-        coords = warp.image_coordinate(8, height, width)
-
-        self.assertEqual(coords.shape, [8, height, width, 3])
-        self.assertEqual(coords[0, 0, 0, 0], 0)
-        self.assertEqual(coords[0, 0, 0, 1], 0)
-        self.assertEqual(coords[0, 0, 0, 2], 1)
-
-        self.assertEqual(coords[0, height - 1, 0, 0], 0)
-        self.assertEqual(coords[0, height - 1, 0, 1], height - 1)
-        self.assertEqual(coords[0, height - 1, 0, 2], 1)
-
-        self.assertEqual(coords[0, height - 1, width - 1, 0], width - 1)
-        self.assertEqual(coords[0, height - 1, width - 1, 1], height - 1)
-        self.assertEqual(coords[0, height - 1, width - 1, 2], 1)
-
-    def test_warp(self):
-        height = 1000
-        width = 2000
-        coords = warp.image_coordinate(1, height, width)
-        coords = tf.reshape(coords, [1, height * width, 3])
-        coords = tf.transpose(coords, [0, 2, 1])
-        # source image to sample from
-        img = tf.random.uniform([1, height, width, 3]) * 255
-
-        intrinsics = tf.constant([[[1, 0, 0], [0, 1, 0], [0, 0, 1]]], dtype=tf.float32)
-
-        disp = tf.random.uniform([1, height, width]) * 255
-        pose = tf.random.uniform([1, 6])
-
-        self.assertEqual(warp.projective_inverse_warp(img, disp, pose, intrinsics, coords).shape, img.shape)
-
-
-if __name__ == '__main__':
-    unittest.main()