Files
2021-07-12 18:19:59 +09:30

355 lines
13 KiB
Python

"""
Utils to load and split image/video data.
"""
from __future__ import division
import math
import tensorflow as tf
def euler2mat(z, y, x):
"""Converts euler angles to rotation matrix
TODO: remove the dimension for 'N' (deprecated for converting all source
poses altogether)
Reference: https://github.com/pulkitag/pycaffe-utils/blob/master/rot_utils.py#L174
Args:
z: rotation angle along z axis (in radians) -- size = [B, N]
y: rotation angle along y axis (in radians) -- size = [B, N]
x: rotation angle along x axis (in radians) -- size = [B, N]
Returns:
Rotation matrix corresponding to the euler angles -- size = [B, N, 3, 3]
"""
B = tf.shape(z)[0]
N = 1
z = tf.clip_by_value(z, -math.pi, math.pi)
y = tf.clip_by_value(y, -math.pi, math.pi)
x = tf.clip_by_value(x, -math.pi, math.pi)
# Expand to B x N x 1 x 1
z = tf.expand_dims(tf.expand_dims(z, -1), -1)
y = tf.expand_dims(tf.expand_dims(y, -1), -1)
x = tf.expand_dims(tf.expand_dims(x, -1), -1)
zeros = tf.zeros([B, N, 1, 1])
ones = tf.ones([B, N, 1, 1])
cosz = tf.cos(z)
sinz = tf.sin(z)
rotz_1 = tf.concat([cosz, -sinz, zeros], axis=3)
rotz_2 = tf.concat([sinz, cosz, zeros], axis=3)
rotz_3 = tf.concat([zeros, zeros, ones], axis=3)
zmat = tf.concat([rotz_1, rotz_2, rotz_3], axis=2)
cosy = tf.cos(y)
siny = tf.sin(y)
roty_1 = tf.concat([cosy, zeros, siny], axis=3)
roty_2 = tf.concat([zeros, ones, zeros], axis=3)
roty_3 = tf.concat([-siny, zeros, cosy], axis=3)
ymat = tf.concat([roty_1, roty_2, roty_3], axis=2)
cosx = tf.cos(x)
sinx = tf.sin(x)
rotx_1 = tf.concat([ones, zeros, zeros], axis=3)
rotx_2 = tf.concat([zeros, cosx, -sinx], axis=3)
rotx_3 = tf.concat([zeros, sinx, cosx], axis=3)
xmat = tf.concat([rotx_1, rotx_2, rotx_3], axis=2)
rotMat = tf.matmul(tf.matmul(xmat, ymat), zmat)
return rotMat
def pose_vec2mat(vec):
"""Converts 6DoF parameters to transformation matrix
Args:
vec: 6DoF parameters in the order of tx, ty, tz, rx, ry, rz -- [B, 6]
Returns:
A transformation matrix -- [B, 4, 4]
"""
batch_size, _ = vec.get_shape().as_list()
translation = tf.slice(vec, [0, 0], [-1, 3])
translation = tf.expand_dims(translation, -1)
rx = tf.slice(vec, [0, 3], [-1, 1])
ry = tf.slice(vec, [0, 4], [-1, 1])
rz = tf.slice(vec, [0, 5], [-1, 1])
rot_mat = euler2mat(rz, ry, rx)
rot_mat = tf.squeeze(rot_mat, axis=[1])
filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4])
filler = tf.tile(filler, [batch_size, 1, 1])
transform_mat = tf.concat([rot_mat, translation], axis=2)
transform_mat = tf.concat([transform_mat, filler], axis=1)
return transform_mat
def pixel2cam(depth, pixel_coords, intrinsics, is_homogeneous=True):
"""Transforms coordinates in the pixel frame to the camera frame.
Args:
depth: [batch, height, width]
pixel_coords: homogeneous pixel coordinates [batch, 3, height, width]
intrinsics: camera intrinsics [batch, 3, 3]
is_homogeneous: return in homogeneous coordinates
Returns:
Coords in the camera frame [batch, 3 (4 if homogeneous), height, width]
"""
batch, height, width = depth.get_shape().as_list()
depth = tf.reshape(depth, [batch, 1, -1])
pixel_coords = tf.reshape(pixel_coords, [batch, 3, -1])
cam_coords = tf.matmul(tf.matrix_inverse(intrinsics), pixel_coords) * depth
if is_homogeneous:
ones = tf.ones([batch, 1, height*width])
cam_coords = tf.concat([cam_coords, ones], axis=1)
cam_coords = tf.reshape(cam_coords, [batch, -1, height, width])
return cam_coords
def cam2pixel(cam_coords, proj):
"""Transforms coordinates in a camera frame to the pixel frame.
Args:
cam_coords: [batch, 4, height, width]
proj: [batch, 4, 4]
Returns:
Pixel coordinates projected from the camera frame [batch, height, width, 2]
"""
batch, _, height, width = cam_coords.get_shape().as_list()
cam_coords = tf.reshape(cam_coords, [batch, 4, -1])
unnormalized_pixel_coords = tf.matmul(proj, cam_coords)
x_u = tf.slice(unnormalized_pixel_coords, [0, 0, 0], [-1, 1, -1])
y_u = tf.slice(unnormalized_pixel_coords, [0, 1, 0], [-1, 1, -1])
z_u = tf.slice(unnormalized_pixel_coords, [0, 2, 0], [-1, 1, -1])
x_n = x_u / (z_u + 1e-10)
y_n = y_u / (z_u + 1e-10)
pixel_coords = tf.concat([x_n, y_n], axis=1)
pixel_coords = tf.reshape(pixel_coords, [batch, 2, height, width])
return tf.transpose(pixel_coords, perm=[0, 2, 3, 1])
def meshgrid(batch, height, width, is_homogeneous=True):
"""Construct a 2D meshgrid.
Args:
batch: batch size
height: height of the grid
width: width of the grid
is_homogeneous: whether to return in homogeneous coordinates
Returns:
x,y grid coordinates [batch, 2 (3 if homogeneous), height, width]
"""
x_t = tf.matmul(tf.ones(shape=tf.stack([height, 1])),
tf.transpose(tf.expand_dims(
tf.linspace(-1.0, 1.0, width), 1), [1, 0]))
y_t = tf.matmul(tf.expand_dims(tf.linspace(-1.0, 1.0, height), 1),
tf.ones(shape=tf.stack([1, width])))
x_t = (x_t + 1.0) * 0.5 * tf.cast(width - 1, tf.float32)
y_t = (y_t + 1.0) * 0.5 * tf.cast(height - 1, tf.float32)
if is_homogeneous:
ones = tf.ones_like(x_t)
coords = tf.stack([x_t, y_t, ones], axis=0)
else:
coords = tf.stack([x_t, y_t], axis=0)
coords = tf.tile(tf.expand_dims(coords, 0), [batch, 1, 1, 1])
return coords
def projective_inverse_warp(img, depth, pose, intrinsics):
"""Inverse warp a source image to the target image plane based on projection.
Args:
img: the source image [batch, height_s, width_s, 3]
depth: depth map of the target image [batch, height_t, width_t]
pose: target to source camera transformation matrix [batch, 6], in the
order of tx, ty, tz, rx, ry, rz
intrinsics: camera intrinsics [batch, 3, 3]
Returns:
Source image inverse warped to the target image plane [batch, height_t,
width_t, 3]
"""
batch, height, width, _ = img.get_shape().as_list()
# Convert pose vector to matrix
pose = pose_vec2mat(pose)
# Construct pixel grid coordinates
pixel_coords = meshgrid(batch, height, width)
# Convert pixel coordinates to the camera frame
cam_coords = pixel2cam(depth, pixel_coords, intrinsics)
# Construct a 4x4 intrinsic matrix (TODO: can it be 3x4?)
filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4])
filler = tf.tile(filler, [batch, 1, 1])
intrinsics = tf.concat([intrinsics, tf.zeros([batch, 3, 1])], axis=2)
intrinsics = tf.concat([intrinsics, filler], axis=1)
# Get a 4x4 transformation matrix from 'target' camera frame to 'source'
# pixel frame.
proj_tgt_cam_to_src_pixel = tf.matmul(intrinsics, pose)
src_pixel_coords = cam2pixel(cam_coords, proj_tgt_cam_to_src_pixel)
output_img = bilinear_sampler(img, src_pixel_coords)
return output_img
def bilinear_sampler(imgs, coords):
"""Construct a new image by bilinear sampling from the input image.
Points falling outside the source image boundary have value 0.
Args:
imgs: source image to be sampled from [batch, height_s, width_s, channels]
coords: coordinates of source pixels to sample from [batch, height_t,
width_t, 2]. height_t/width_t correspond to the dimensions of the output
image (don't need to be the same as height_s/width_s). The two channels
correspond to x and y coordinates respectively.
Returns:
A new sampled image [batch, height_t, width_t, channels]
"""
def _repeat(x, n_repeats):
rep = tf.transpose(
tf.expand_dims(tf.ones(shape=tf.stack([
n_repeats,
])), 1), [1, 0])
rep = tf.cast(rep, 'float32')
x = tf.matmul(tf.reshape(x, (-1, 1)), rep)
return tf.reshape(x, [-1])
with tf.name_scope('image_sampling'):
coords_x, coords_y = tf.split(coords, [1, 1], axis=3)
inp_size = imgs.get_shape()
coord_size = coords.get_shape()
out_size = coords.get_shape().as_list()
out_size[3] = imgs.get_shape().as_list()[3]
coords_x = tf.cast(coords_x, 'float32')
coords_y = tf.cast(coords_y, 'float32')
x0 = tf.floor(coords_x)
x1 = x0 + 1
y0 = tf.floor(coords_y)
y1 = y0 + 1
y_max = tf.cast(tf.shape(imgs)[1] - 1, 'float32')
x_max = tf.cast(tf.shape(imgs)[2] - 1, 'float32')
zero = tf.zeros([1], dtype='float32')
x0_safe = tf.clip_by_value(x0, zero, x_max)
y0_safe = tf.clip_by_value(y0, zero, y_max)
x1_safe = tf.clip_by_value(x1, zero, x_max)
y1_safe = tf.clip_by_value(y1, zero, y_max)
# bilinear interp weights, with points outside the grid having weight 0
# wt_x0 = (x1 - coords_x) * tf.cast(tf.equal(x0, x0_safe), 'float32')
# wt_x1 = (coords_x - x0) * tf.cast(tf.equal(x1, x1_safe), 'float32')
# wt_y0 = (y1 - coords_y) * tf.cast(tf.equal(y0, y0_safe), 'float32')
# wt_y1 = (coords_y - y0) * tf.cast(tf.equal(y1, y1_safe), 'float32')
wt_x0 = x1_safe - coords_x
wt_x1 = coords_x - x0_safe
wt_y0 = y1_safe - coords_y
wt_y1 = coords_y - y0_safe
# indices in the flat image to sample from
dim2 = tf.cast(inp_size[2], 'float32')
dim1 = tf.cast(inp_size[2] * inp_size[1], 'float32')
base = tf.reshape(
_repeat(
tf.cast(tf.range(coord_size[0]), 'float32') * dim1,
coord_size[1] * coord_size[2]),
[out_size[0], out_size[1], out_size[2], 1])
base_y0 = base + y0_safe * dim2
base_y1 = base + y1_safe * dim2
idx00 = tf.reshape(x0_safe + base_y0, [-1])
idx01 = x0_safe + base_y1
idx10 = x1_safe + base_y0
idx11 = x1_safe + base_y1
# sample from imgs
imgs_flat = tf.reshape(imgs, tf.stack([-1, inp_size[3]]))
imgs_flat = tf.cast(imgs_flat, 'float32')
im00 = tf.reshape(
tf.gather(imgs_flat, tf.cast(idx00, 'int32')), out_size)
im01 = tf.reshape(
tf.gather(imgs_flat, tf.cast(idx01, 'int32')), out_size)
im10 = tf.reshape(
tf.gather(imgs_flat, tf.cast(idx10, 'int32')), out_size)
im11 = tf.reshape(
tf.gather(imgs_flat, tf.cast(idx11, 'int32')), out_size)
w00 = wt_x0 * wt_y0
w01 = wt_x0 * wt_y1
w10 = wt_x1 * wt_y0
w11 = wt_x1 * wt_y1
output = tf.add_n([
w00 * im00, w01 * im01,
w10 * im10, w11 * im11
])
return output
# Spatial transformer network bilinear sampler, taken from https://github.com/kevinzakka/spatial-transformer-network/blob/master/stn/transformer.py
def stn_bilinear_sampler(img, x, y):
"""
Performs bilinear sampling of the input images according to the
normalized coordinates provided by the sampling grid. Note that
the sampling is done identically for each channel of the input.
To test if the function works properly, output image should be
identical to input image when theta is initialized to identity
transform.
Input
-----
- img: batch of images in (B, H, W, C) layout.
- grid: x, y which is the output of affine_grid_generator.
Returns
-------
- out: interpolated images according to grids. Same size as grid.
"""
H = tf.shape(img)[1]
W = tf.shape(img)[2]
max_y = tf.cast(H - 1, 'int32')
max_x = tf.cast(W - 1, 'int32')
zero = tf.zeros([], dtype='int32')
# rescale x and y to [0, W-1/H-1]
x = tf.cast(x, 'float32')
y = tf.cast(y, 'float32')
x = 0.5 * ((x + 1.0) * tf.cast(max_x-1, 'float32'))
y = 0.5 * ((y + 1.0) * tf.cast(max_y-1, 'float32'))
# grab 4 nearest corner points for each (x_i, y_i)
x0 = tf.cast(tf.floor(x), 'int32')
x1 = x0 + 1
y0 = tf.cast(tf.floor(y), 'int32')
y1 = y0 + 1
# clip to range [0, H-1/W-1] to not violate img boundaries
x0 = tf.clip_by_value(x0, zero, max_x)
x1 = tf.clip_by_value(x1, zero, max_x)
y0 = tf.clip_by_value(y0, zero, max_y)
y1 = tf.clip_by_value(y1, zero, max_y)
# get pixel value at corner coords
Ia = get_pixel_value(img, x0, y0)
Ib = get_pixel_value(img, x0, y1)
Ic = get_pixel_value(img, x1, y0)
Id = get_pixel_value(img, x1, y1)
# recast as float for delta calculation
x0 = tf.cast(x0, 'float32')
x1 = tf.cast(x1, 'float32')
y0 = tf.cast(y0, 'float32')
y1 = tf.cast(y1, 'float32')
# calculate deltas
wa = (x1-x) * (y1-y)
wb = (x1-x) * (y-y0)
wc = (x-x0) * (y1-y)
wd = (x-x0) * (y-y0)
# add dimension for addition
wa = tf.expand_dims(wa, axis=3)
wb = tf.expand_dims(wb, axis=3)
wc = tf.expand_dims(wc, axis=3)
wd = tf.expand_dims(wd, axis=3)
# compute output
out = tf.add_n([wa*Ia, wb*Ib, wc*Ic, wd*Id])
return out