Add euler to rotation matrix, grid flattening
This commit is contained in:
354
unsupervised/third-party/utils.py
vendored
354
unsupervised/third-party/utils.py
vendored
@@ -1,354 +0,0 @@
|
||||
"""
|
||||
Utils to load and split image/video data.
|
||||
"""
|
||||
|
||||
from __future__ import division
|
||||
import math
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def euler2mat(z, y, x):
    """Converts euler angles to rotation matrix.

    Every angle is clipped to [-pi, pi] before use. The result is the matrix
    product Rx * Ry * Rz of the three elementary axis rotations.

    The constant 0/1 entries are derived from the (expanded) angle tensor
    itself, so any N and any floating dtype of the inputs is supported;
    pose_vec2mat in this file calls it with N = 1.

    Reference: https://github.com/pulkitag/pycaffe-utils/blob/master/rot_utils.py#L174

    Args:
      z: rotation angle along z axis (in radians) -- size = [B, N]
      y: rotation angle along y axis (in radians) -- size = [B, N]
      x: rotation angle along x axis (in radians) -- size = [B, N]
    Returns:
      Rotation matrix corresponding to the euler angles -- size = [B, N, 3, 3]
    """
    def _prepare(angle):
        # Clip, then expand [B, N] -> [B, N, 1, 1] so that scalars can be
        # concatenated into 3x3 matrices along the two trailing axes.
        clipped = tf.clip_by_value(angle, -math.pi, math.pi)
        return tf.expand_dims(tf.expand_dims(clipped, -1), -1)

    def _mat3(rows):
        # Each row is a list of three [B, N, 1, 1] tensors: join the entries
        # along the column axis (3), then stack the rows along axis 2.
        return tf.concat([tf.concat(row, axis=3) for row in rows], axis=2)

    z = _prepare(z)
    y = _prepare(y)
    x = _prepare(x)

    # Same shape and dtype as the expanded angles, whatever B and N are.
    zeros = tf.zeros_like(z)
    ones = tf.ones_like(z)

    cosz, sinz = tf.cos(z), tf.sin(z)
    zmat = _mat3([
        [cosz, -sinz, zeros],
        [sinz, cosz, zeros],
        [zeros, zeros, ones],
    ])

    cosy, siny = tf.cos(y), tf.sin(y)
    ymat = _mat3([
        [cosy, zeros, siny],
        [zeros, ones, zeros],
        [-siny, zeros, cosy],
    ])

    cosx, sinx = tf.cos(x), tf.sin(x)
    xmat = _mat3([
        [ones, zeros, zeros],
        [zeros, cosx, -sinx],
        [zeros, sinx, cosx],
    ])

    return tf.matmul(tf.matmul(xmat, ymat), zmat)
|
||||
|
||||
|
||||
def pose_vec2mat(vec):
    """Converts 6DoF parameters to transformation matrix.

    The rotation block comes from euler2mat(rz, ry, rx); the translation is
    appended as the fourth column and [0, 0, 0, 1] as the fourth row.

    Args:
      vec: 6DoF parameters in the order of tx, ty, tz, rx, ry, rz -- [B, 6]
        (the batch dimension must be statically known)
    Returns:
      A transformation matrix -- [B, 4, 4]
    """
    num_poses, _ = vec.get_shape().as_list()

    # Columns 0..2 hold the translation; make it a [B, 3, 1] column vector.
    t_col = tf.expand_dims(tf.slice(vec, [0, 0], [-1, 3]), -1)

    # Columns 3, 4, 5 hold rx, ry, rz, each kept as a [B, 1] slice.
    rx, ry, rz = [tf.slice(vec, [0, col], [-1, 1]) for col in (3, 4, 5)]

    # euler2mat returns [B, 1, 3, 3]; drop the singleton axis.
    rotation = tf.squeeze(euler2mat(rz, ry, rx), axis=[1])

    top_rows = tf.concat([rotation, t_col], axis=2)

    # Homogeneous bottom row, replicated once per batch element.
    bottom_row = tf.tile(
        tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4]),
        [num_poses, 1, 1])

    return tf.concat([top_rows, bottom_row], axis=1)
|
||||
|
||||
|
||||
def pixel2cam(depth, pixel_coords, intrinsics, is_homogeneous=True):
    """Transforms coordinates in the pixel frame to the camera frame.

    Each homogeneous pixel is multiplied by the inverse intrinsics and then
    scaled by its depth value.

    Args:
      depth: [batch, height, width] (static shape required)
      pixel_coords: homogeneous pixel coordinates [batch, 3, height, width]
      intrinsics: camera intrinsics [batch, 3, 3]
      is_homogeneous: return in homogeneous coordinates
    Returns:
      Coords in the camera frame [batch, 3 (4 if homogeneous), height, width]
    """
    batch, height, width = depth.get_shape().as_list()

    # Flatten the spatial dimensions so the whole grid is one batched matmul.
    flat_depth = tf.reshape(depth, [batch, 1, -1])
    flat_pixels = tf.reshape(pixel_coords, [batch, 3, -1])

    inv_intrinsics = tf.matrix_inverse(intrinsics)
    points = tf.matmul(inv_intrinsics, flat_pixels) * flat_depth

    if is_homogeneous:
        # Append a row of ones as the fourth (homogeneous) coordinate.
        homogeneous_row = tf.ones([batch, 1, height*width])
        points = tf.concat([points, homogeneous_row], axis=1)

    return tf.reshape(points, [batch, -1, height, width])
|
||||
|
||||
|
||||
def cam2pixel(cam_coords, proj):
    """Transforms coordinates in a camera frame to the pixel frame.

    Points are projected with `proj` and the first two rows of the result
    are divided by the third row (plus 1e-10 to avoid division by zero).

    Args:
      cam_coords: [batch, 4, height, width] (static shape required)
      proj: [batch, 4, 4]
    Returns:
      Pixel coordinates projected from the camera frame [batch, height, width, 2]
    """
    batch, _, height, width = cam_coords.get_shape().as_list()

    flat_points = tf.reshape(cam_coords, [batch, 4, -1])
    projected = tf.matmul(proj, flat_points)

    # Rows 0, 1, 2 of the projection are the unnormalized x, y and z.
    x_u, y_u, z_u = [
        tf.slice(projected, [0, row, 0], [-1, 1, -1]) for row in range(3)
    ]

    # Perspective divide.
    x_n = x_u / (z_u + 1e-10)
    y_n = y_u / (z_u + 1e-10)

    xy = tf.reshape(
        tf.concat([x_n, y_n], axis=1), [batch, 2, height, width])

    # Channels-first -> channels-last.
    return tf.transpose(xy, perm=[0, 2, 3, 1])
|
||||
|
||||
|
||||
def meshgrid(batch, height, width, is_homogeneous=True):
    """Construct a 2D meshgrid.

    Coordinates are in pixel units: x runs from 0 to width - 1 along each
    row and y runs from 0 to height - 1 along each column.

    Args:
      batch: batch size
      height: height of the grid
      width: width of the grid
      is_homogeneous: whether to return in homogeneous coordinates
    Returns:
      x,y grid coordinates [batch, 2 (3 if homogeneous), height, width]
    """
    # Normalized [-1, 1] ramps along each axis.
    x_ramp = tf.linspace(-1.0, 1.0, width)
    y_ramp = tf.linspace(-1.0, 1.0, height)

    # Outer products with vectors of ones broadcast each ramp to a full
    # [height, width] grid.
    x_row = tf.transpose(tf.expand_dims(x_ramp, 1), [1, 0])
    x_t = tf.matmul(tf.ones(shape=tf.stack([height, 1])), x_row)

    y_col = tf.expand_dims(y_ramp, 1)
    y_t = tf.matmul(y_col, tf.ones(shape=tf.stack([1, width])))

    # Map [-1, 1] to [0, size - 1].
    x_t = (x_t + 1.0) * 0.5 * tf.cast(width - 1, tf.float32)
    y_t = (y_t + 1.0) * 0.5 * tf.cast(height - 1, tf.float32)

    channels = [x_t, y_t]
    if is_homogeneous:
        channels.append(tf.ones_like(x_t))
    grid = tf.stack(channels, axis=0)

    # Replicate the same grid for every batch element.
    return tf.tile(tf.expand_dims(grid, 0), [batch, 1, 1, 1])
|
||||
|
||||
|
||||
def projective_inverse_warp(img, depth, pose, intrinsics):
    """Inverse warp a source image to the target image plane based on projection.

    The pixel grid is built over the target image plane, so its size is taken
    from `depth` (the target depth map) rather than from the source image;
    this keeps the grid consistent with `depth` when source and target
    resolutions differ.

    Args:
      img: the source image [batch, height_s, width_s, 3]
      depth: depth map of the target image [batch, height_t, width_t]
      pose: target to source camera transformation matrix [batch, 6], in the
            order of tx, ty, tz, rx, ry, rz
      intrinsics: camera intrinsics [batch, 3, 3]
    Returns:
      Source image inverse warped to the target image plane [batch, height_t,
      width_t, 3]
    """
    # Static shapes are required; unpacking also checks the expected ranks.
    batch, _, _, _ = img.get_shape().as_list()
    _, height, width = depth.get_shape().as_list()

    # Convert pose vector to matrix
    pose = pose_vec2mat(pose)

    # Construct pixel grid coordinates over the target image plane
    pixel_coords = meshgrid(batch, height, width)

    # Convert pixel coordinates to the camera frame
    cam_coords = pixel2cam(depth, pixel_coords, intrinsics)

    # Construct a 4x4 intrinsic matrix (TODO: can it be 3x4?): pad a zero
    # column on the right, then a [0, 0, 0, 1] row at the bottom.
    filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4])
    filler = tf.tile(filler, [batch, 1, 1])
    intrinsics = tf.concat([intrinsics, tf.zeros([batch, 3, 1])], axis=2)
    intrinsics = tf.concat([intrinsics, filler], axis=1)

    # Get a 4x4 transformation matrix from 'target' camera frame to 'source'
    # pixel frame.
    proj_tgt_cam_to_src_pixel = tf.matmul(intrinsics, pose)
    src_pixel_coords = cam2pixel(cam_coords, proj_tgt_cam_to_src_pixel)

    return bilinear_sampler(img, src_pixel_coords)
|
||||
|
||||
|
||||
def bilinear_sampler(imgs, coords):
    """Construct a new image by bilinear sampling from the input image.

    Points falling outside the source image boundary have value 0.

    Args:
      imgs: source image to be sampled from [batch, height_s, width_s, channels]
      coords: coordinates of source pixels to sample from [batch, height_t,
        width_t, 2]. height_t/width_t correspond to the dimensions of the output
        image (don't need to be the same as height_s/width_s). The two channels
        correspond to x and y coordinates respectively.
    Returns:
      A new sampled image [batch, height_t, width_t, channels]
    """
    def _repeat(values, n_repeats):
        # Multiplying a column of values by a [1, n_repeats] row of ones and
        # flattening repeats every value n_repeats times in a row.
        ones_row = tf.transpose(
            tf.expand_dims(tf.ones(shape=tf.stack([n_repeats, ])), 1), [1, 0])
        ones_row = tf.cast(ones_row, 'float32')
        repeated = tf.matmul(tf.reshape(values, (-1, 1)), ones_row)
        return tf.reshape(repeated, [-1])

    with tf.name_scope('image_sampling'):
        coords_x, coords_y = tf.split(coords, [1, 1], axis=3)
        src_shape = imgs.get_shape()
        tgt_shape = coords.get_shape()
        # Output keeps the target spatial size and the source channel count.
        out_size = coords.get_shape().as_list()
        out_size[3] = imgs.get_shape().as_list()[3]

        coords_x = tf.cast(coords_x, 'float32')
        coords_y = tf.cast(coords_y, 'float32')

        # Integer corners surrounding every sampling location.
        x_lo = tf.floor(coords_x)
        x_hi = x_lo + 1
        y_lo = tf.floor(coords_y)
        y_hi = y_lo + 1

        y_max = tf.cast(tf.shape(imgs)[1] - 1, 'float32')
        x_max = tf.cast(tf.shape(imgs)[2] - 1, 'float32')
        zero = tf.zeros([1], dtype='float32')

        # Corners clamped into the valid index range of the source image.
        x_lo_safe = tf.clip_by_value(x_lo, zero, x_max)
        y_lo_safe = tf.clip_by_value(y_lo, zero, y_max)
        x_hi_safe = tf.clip_by_value(x_hi, zero, x_max)
        y_hi_safe = tf.clip_by_value(y_hi, zero, y_max)

        # Bilinear weights are measured against the *clamped* corners. When a
        # location is out of range along an axis, both corners clamp to the
        # same index, the two weights along that axis sum to zero and the
        # identical samples cancel, giving 0 for such points.
        wt_x_lo = x_hi_safe - coords_x
        wt_x_hi = coords_x - x_lo_safe
        wt_y_lo = y_hi_safe - coords_y
        wt_y_hi = coords_y - y_lo_safe

        # Indices in the flattened [batch * height_s * width_s, channels]
        # image to sample from.
        # NOTE(review): indices are computed in float32 and cast to int32
        # right before the gather; float32 is exact only up to 2**24.
        row_stride = tf.cast(src_shape[2], 'float32')
        image_stride = tf.cast(src_shape[2] * src_shape[1], 'float32')
        batch_offsets = _repeat(
            tf.cast(tf.range(tgt_shape[0]), 'float32') * image_stride,
            tgt_shape[1] * tgt_shape[2])
        base = tf.reshape(
            batch_offsets, [out_size[0], out_size[1], out_size[2], 1])

        base_y_lo = base + y_lo_safe * row_stride
        base_y_hi = base + y_hi_safe * row_stride
        idx00 = tf.reshape(x_lo_safe + base_y_lo, [-1])
        idx01 = x_lo_safe + base_y_hi
        idx10 = x_hi_safe + base_y_lo
        idx11 = x_hi_safe + base_y_hi

        # Sample the four corner pixels from the flattened image.
        imgs_flat = tf.reshape(imgs, tf.stack([-1, src_shape[3]]))
        imgs_flat = tf.cast(imgs_flat, 'float32')

        def _lookup(flat_index):
            picked = tf.gather(imgs_flat, tf.cast(flat_index, 'int32'))
            return tf.reshape(picked, out_size)

        im00 = _lookup(idx00)
        im01 = _lookup(idx01)
        im10 = _lookup(idx10)
        im11 = _lookup(idx11)

        w00 = wt_x_lo * wt_y_lo
        w01 = wt_x_lo * wt_y_hi
        w10 = wt_x_hi * wt_y_lo
        w11 = wt_x_hi * wt_y_hi

        return tf.add_n([
            w00 * im00, w01 * im01,
            w10 * im10, w11 * im11
        ])
|
||||
|
||||
# Spatial transformer network bilinear sampler, taken from https://github.com/kevinzakka/spatial-transformer-network/blob/master/stn/transformer.py
|
||||
|
||||
|
||||
def stn_bilinear_sampler(img, x, y):
    """
    Performs bilinear sampling of the input images according to the
    normalized coordinates provided by the sampling grid. Note that
    the sampling is done identically for each channel of the input.

    Input
    -----
    - img: batch of images in (B, H, W, C) layout.
    - x, y: normalized sampling coordinates; -1 maps to index 0 and
      +1 maps to index W-2 / H-2 (see the rescaling below). Presumably
      of shape (B, H_out, W_out) -- the weights get a trailing axis
      added at position 3; TODO confirm against callers.

    Returns
    -------
    - out: interpolated images according to grids.

    NOTE(review): get_pixel_value is not defined in the visible part of
    this file; it has to be provided elsewhere for this function to run.
    """
    img_shape = tf.shape(img)
    max_y = tf.cast(img_shape[1] - 1, 'int32')
    max_x = tf.cast(img_shape[2] - 1, 'int32')
    zero = tf.zeros([], dtype='int32')

    # rescale x and y from [-1, 1] to [0, max_x - 1] / [0, max_y - 1],
    # i.e. [0, W-2] / [0, H-2] (not W-1 / H-1)
    x = tf.cast(x, 'float32')
    y = tf.cast(y, 'float32')
    x = 0.5 * ((x + 1.0) * tf.cast(max_x-1, 'float32'))
    y = 0.5 * ((y + 1.0) * tf.cast(max_y-1, 'float32'))

    # the 4 nearest corner points for each (x_i, y_i); the upper corner is
    # floor + 1 taken *before* clipping
    x_floor = tf.cast(tf.floor(x), 'int32')
    y_floor = tf.cast(tf.floor(y), 'int32')

    # clip to range [0, H-1/W-1] to not violate img boundaries
    x0 = tf.clip_by_value(x_floor, zero, max_x)
    x1 = tf.clip_by_value(x_floor + 1, zero, max_x)
    y0 = tf.clip_by_value(y_floor, zero, max_y)
    y1 = tf.clip_by_value(y_floor + 1, zero, max_y)

    # pixel values at the four corners
    Ia = get_pixel_value(img, x0, y0)
    Ib = get_pixel_value(img, x0, y1)
    Ic = get_pixel_value(img, x1, y0)
    Id = get_pixel_value(img, x1, y1)

    # distances to the (clipped) corners, computed in float
    x0_f = tf.cast(x0, 'float32')
    x1_f = tf.cast(x1, 'float32')
    y0_f = tf.cast(y0, 'float32')
    y1_f = tf.cast(y1, 'float32')

    # interpolation weights, one per corner, with a trailing axis added so
    # they multiply against the per-corner pixel values
    wa = tf.expand_dims((x1_f-x) * (y1_f-y), axis=3)
    wb = tf.expand_dims((x1_f-x) * (y-y0_f), axis=3)
    wc = tf.expand_dims((x-x0_f) * (y1_f-y), axis=3)
    wd = tf.expand_dims((x-x0_f) * (y-y0_f), axis=3)

    # weighted sum of the four corners
    return tf.add_n([wa*Ia, wb*Ib, wc*Ic, wd*Id])
|
||||
Reference in New Issue
Block a user