""" Utils to load and split image/video data. """ from __future__ import division import math import tensorflow as tf def euler2mat(z, y, x): """Converts euler angles to rotation matrix TODO: remove the dimension for 'N' (deprecated for converting all source poses altogether) Reference: https://github.com/pulkitag/pycaffe-utils/blob/master/rot_utils.py#L174 Args: z: rotation angle along z axis (in radians) -- size = [B, N] y: rotation angle along y axis (in radians) -- size = [B, N] x: rotation angle along x axis (in radians) -- size = [B, N] Returns: Rotation matrix corresponding to the euler angles -- size = [B, N, 3, 3] """ B = tf.shape(z)[0] N = 1 z = tf.clip_by_value(z, -math.pi, math.pi) y = tf.clip_by_value(y, -math.pi, math.pi) x = tf.clip_by_value(x, -math.pi, math.pi) # Expand to B x N x 1 x 1 z = tf.expand_dims(tf.expand_dims(z, -1), -1) y = tf.expand_dims(tf.expand_dims(y, -1), -1) x = tf.expand_dims(tf.expand_dims(x, -1), -1) zeros = tf.zeros([B, N, 1, 1]) ones = tf.ones([B, N, 1, 1]) cosz = tf.cos(z) sinz = tf.sin(z) rotz_1 = tf.concat([cosz, -sinz, zeros], axis=3) rotz_2 = tf.concat([sinz, cosz, zeros], axis=3) rotz_3 = tf.concat([zeros, zeros, ones], axis=3) zmat = tf.concat([rotz_1, rotz_2, rotz_3], axis=2) cosy = tf.cos(y) siny = tf.sin(y) roty_1 = tf.concat([cosy, zeros, siny], axis=3) roty_2 = tf.concat([zeros, ones, zeros], axis=3) roty_3 = tf.concat([-siny, zeros, cosy], axis=3) ymat = tf.concat([roty_1, roty_2, roty_3], axis=2) cosx = tf.cos(x) sinx = tf.sin(x) rotx_1 = tf.concat([ones, zeros, zeros], axis=3) rotx_2 = tf.concat([zeros, cosx, -sinx], axis=3) rotx_3 = tf.concat([zeros, sinx, cosx], axis=3) xmat = tf.concat([rotx_1, rotx_2, rotx_3], axis=2) rotMat = tf.matmul(tf.matmul(xmat, ymat), zmat) return rotMat def pose_vec2mat(vec): """Converts 6DoF parameters to transformation matrix Args: vec: 6DoF parameters in the order of tx, ty, tz, rx, ry, rz -- [B, 6] Returns: A transformation matrix -- [B, 4, 4] """ batch_size, _ = vec.get_shape().as_list() translation = tf.slice(vec, [0, 0], [-1, 3]) translation = tf.expand_dims(translation, -1) rx = tf.slice(vec, [0, 3], [-1, 1]) ry = tf.slice(vec, [0, 4], [-1, 1]) rz = tf.slice(vec, [0, 5], [-1, 1]) rot_mat = euler2mat(rz, ry, rx) rot_mat = tf.squeeze(rot_mat, axis=[1]) filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4]) filler = tf.tile(filler, [batch_size, 1, 1]) transform_mat = tf.concat([rot_mat, translation], axis=2) transform_mat = tf.concat([transform_mat, filler], axis=1) return transform_mat def pixel2cam(depth, pixel_coords, intrinsics, is_homogeneous=True): """Transforms coordinates in the pixel frame to the camera frame. Args: depth: [batch, height, width] pixel_coords: homogeneous pixel coordinates [batch, 3, height, width] intrinsics: camera intrinsics [batch, 3, 3] is_homogeneous: return in homogeneous coordinates Returns: Coords in the camera frame [batch, 3 (4 if homogeneous), height, width] """ batch, height, width = depth.get_shape().as_list() depth = tf.reshape(depth, [batch, 1, -1]) pixel_coords = tf.reshape(pixel_coords, [batch, 3, -1]) cam_coords = tf.matmul(tf.matrix_inverse(intrinsics), pixel_coords) * depth if is_homogeneous: ones = tf.ones([batch, 1, height*width]) cam_coords = tf.concat([cam_coords, ones], axis=1) cam_coords = tf.reshape(cam_coords, [batch, -1, height, width]) return cam_coords def cam2pixel(cam_coords, proj): """Transforms coordinates in a camera frame to the pixel frame. Args: cam_coords: [batch, 4, height, width] proj: [batch, 4, 4] Returns: Pixel coordinates projected from the camera frame [batch, height, width, 2] """ batch, _, height, width = cam_coords.get_shape().as_list() cam_coords = tf.reshape(cam_coords, [batch, 4, -1]) unnormalized_pixel_coords = tf.matmul(proj, cam_coords) x_u = tf.slice(unnormalized_pixel_coords, [0, 0, 0], [-1, 1, -1]) y_u = tf.slice(unnormalized_pixel_coords, [0, 1, 0], [-1, 1, -1]) z_u = tf.slice(unnormalized_pixel_coords, [0, 2, 0], [-1, 1, -1]) x_n = x_u / (z_u + 1e-10) y_n = y_u / (z_u + 1e-10) pixel_coords = tf.concat([x_n, y_n], axis=1) pixel_coords = tf.reshape(pixel_coords, [batch, 2, height, width]) return tf.transpose(pixel_coords, perm=[0, 2, 3, 1]) def meshgrid(batch, height, width, is_homogeneous=True): """Construct a 2D meshgrid. Args: batch: batch size height: height of the grid width: width of the grid is_homogeneous: whether to return in homogeneous coordinates Returns: x,y grid coordinates [batch, 2 (3 if homogeneous), height, width] """ x_t = tf.matmul(tf.ones(shape=tf.stack([height, 1])), tf.transpose(tf.expand_dims( tf.linspace(-1.0, 1.0, width), 1), [1, 0])) y_t = tf.matmul(tf.expand_dims(tf.linspace(-1.0, 1.0, height), 1), tf.ones(shape=tf.stack([1, width]))) x_t = (x_t + 1.0) * 0.5 * tf.cast(width - 1, tf.float32) y_t = (y_t + 1.0) * 0.5 * tf.cast(height - 1, tf.float32) if is_homogeneous: ones = tf.ones_like(x_t) coords = tf.stack([x_t, y_t, ones], axis=0) else: coords = tf.stack([x_t, y_t], axis=0) coords = tf.tile(tf.expand_dims(coords, 0), [batch, 1, 1, 1]) return coords def projective_inverse_warp(img, depth, pose, intrinsics): """Inverse warp a source image to the target image plane based on projection. Args: img: the source image [batch, height_s, width_s, 3] depth: depth map of the target image [batch, height_t, width_t] pose: target to source camera transformation matrix [batch, 6], in the order of tx, ty, tz, rx, ry, rz intrinsics: camera intrinsics [batch, 3, 3] Returns: Source image inverse warped to the target image plane [batch, height_t, width_t, 3] """ batch, height, width, _ = img.get_shape().as_list() # Convert pose vector to matrix pose = pose_vec2mat(pose) # Construct pixel grid coordinates pixel_coords = meshgrid(batch, height, width) # Convert pixel coordinates to the camera frame cam_coords = pixel2cam(depth, pixel_coords, intrinsics) # Construct a 4x4 intrinsic matrix (TODO: can it be 3x4?) filler = tf.constant([0.0, 0.0, 0.0, 1.0], shape=[1, 1, 4]) filler = tf.tile(filler, [batch, 1, 1]) intrinsics = tf.concat([intrinsics, tf.zeros([batch, 3, 1])], axis=2) intrinsics = tf.concat([intrinsics, filler], axis=1) # Get a 4x4 transformation matrix from 'target' camera frame to 'source' # pixel frame. proj_tgt_cam_to_src_pixel = tf.matmul(intrinsics, pose) src_pixel_coords = cam2pixel(cam_coords, proj_tgt_cam_to_src_pixel) output_img = bilinear_sampler(img, src_pixel_coords) return output_img def bilinear_sampler(imgs, coords): """Construct a new image by bilinear sampling from the input image. Points falling outside the source image boundary have value 0. Args: imgs: source image to be sampled from [batch, height_s, width_s, channels] coords: coordinates of source pixels to sample from [batch, height_t, width_t, 2]. height_t/width_t correspond to the dimensions of the output image (don't need to be the same as height_s/width_s). The two channels correspond to x and y coordinates respectively. Returns: A new sampled image [batch, height_t, width_t, channels] """ def _repeat(x, n_repeats): rep = tf.transpose( tf.expand_dims(tf.ones(shape=tf.stack([ n_repeats, ])), 1), [1, 0]) rep = tf.cast(rep, 'float32') x = tf.matmul(tf.reshape(x, (-1, 1)), rep) return tf.reshape(x, [-1]) with tf.name_scope('image_sampling'): coords_x, coords_y = tf.split(coords, [1, 1], axis=3) inp_size = imgs.get_shape() coord_size = coords.get_shape() out_size = coords.get_shape().as_list() out_size[3] = imgs.get_shape().as_list()[3] coords_x = tf.cast(coords_x, 'float32') coords_y = tf.cast(coords_y, 'float32') x0 = tf.floor(coords_x) x1 = x0 + 1 y0 = tf.floor(coords_y) y1 = y0 + 1 y_max = tf.cast(tf.shape(imgs)[1] - 1, 'float32') x_max = tf.cast(tf.shape(imgs)[2] - 1, 'float32') zero = tf.zeros([1], dtype='float32') x0_safe = tf.clip_by_value(x0, zero, x_max) y0_safe = tf.clip_by_value(y0, zero, y_max) x1_safe = tf.clip_by_value(x1, zero, x_max) y1_safe = tf.clip_by_value(y1, zero, y_max) # bilinear interp weights, with points outside the grid having weight 0 # wt_x0 = (x1 - coords_x) * tf.cast(tf.equal(x0, x0_safe), 'float32') # wt_x1 = (coords_x - x0) * tf.cast(tf.equal(x1, x1_safe), 'float32') # wt_y0 = (y1 - coords_y) * tf.cast(tf.equal(y0, y0_safe), 'float32') # wt_y1 = (coords_y - y0) * tf.cast(tf.equal(y1, y1_safe), 'float32') wt_x0 = x1_safe - coords_x wt_x1 = coords_x - x0_safe wt_y0 = y1_safe - coords_y wt_y1 = coords_y - y0_safe # indices in the flat image to sample from dim2 = tf.cast(inp_size[2], 'float32') dim1 = tf.cast(inp_size[2] * inp_size[1], 'float32') base = tf.reshape( _repeat( tf.cast(tf.range(coord_size[0]), 'float32') * dim1, coord_size[1] * coord_size[2]), [out_size[0], out_size[1], out_size[2], 1]) base_y0 = base + y0_safe * dim2 base_y1 = base + y1_safe * dim2 idx00 = tf.reshape(x0_safe + base_y0, [-1]) idx01 = x0_safe + base_y1 idx10 = x1_safe + base_y0 idx11 = x1_safe + base_y1 # sample from imgs imgs_flat = tf.reshape(imgs, tf.stack([-1, inp_size[3]])) imgs_flat = tf.cast(imgs_flat, 'float32') im00 = tf.reshape( tf.gather(imgs_flat, tf.cast(idx00, 'int32')), out_size) im01 = tf.reshape( tf.gather(imgs_flat, tf.cast(idx01, 'int32')), out_size) im10 = tf.reshape( tf.gather(imgs_flat, tf.cast(idx10, 'int32')), out_size) im11 = tf.reshape( tf.gather(imgs_flat, tf.cast(idx11, 'int32')), out_size) w00 = wt_x0 * wt_y0 w01 = wt_x0 * wt_y1 w10 = wt_x1 * wt_y0 w11 = wt_x1 * wt_y1 output = tf.add_n([ w00 * im00, w01 * im01, w10 * im10, w11 * im11 ]) return output # Spatial transformer network bilinear sampler, taken from https://github.com/kevinzakka/spatial-transformer-network/blob/master/stn/transformer.py def stn_bilinear_sampler(img, x, y): """ Performs bilinear sampling of the input images according to the normalized coordinates provided by the sampling grid. Note that the sampling is done identically for each channel of the input. To test if the function works properly, output image should be identical to input image when theta is initialized to identity transform. Input ----- - img: batch of images in (B, H, W, C) layout. - grid: x, y which is the output of affine_grid_generator. Returns ------- - out: interpolated images according to grids. Same size as grid. """ H = tf.shape(img)[1] W = tf.shape(img)[2] max_y = tf.cast(H - 1, 'int32') max_x = tf.cast(W - 1, 'int32') zero = tf.zeros([], dtype='int32') # rescale x and y to [0, W-1/H-1] x = tf.cast(x, 'float32') y = tf.cast(y, 'float32') x = 0.5 * ((x + 1.0) * tf.cast(max_x-1, 'float32')) y = 0.5 * ((y + 1.0) * tf.cast(max_y-1, 'float32')) # grab 4 nearest corner points for each (x_i, y_i) x0 = tf.cast(tf.floor(x), 'int32') x1 = x0 + 1 y0 = tf.cast(tf.floor(y), 'int32') y1 = y0 + 1 # clip to range [0, H-1/W-1] to not violate img boundaries x0 = tf.clip_by_value(x0, zero, max_x) x1 = tf.clip_by_value(x1, zero, max_x) y0 = tf.clip_by_value(y0, zero, max_y) y1 = tf.clip_by_value(y1, zero, max_y) # get pixel value at corner coords Ia = get_pixel_value(img, x0, y0) Ib = get_pixel_value(img, x0, y1) Ic = get_pixel_value(img, x1, y0) Id = get_pixel_value(img, x1, y1) # recast as float for delta calculation x0 = tf.cast(x0, 'float32') x1 = tf.cast(x1, 'float32') y0 = tf.cast(y0, 'float32') y1 = tf.cast(y1, 'float32') # calculate deltas wa = (x1-x) * (y1-y) wb = (x1-x) * (y-y0) wc = (x-x0) * (y1-y) wd = (x-x0) * (y-y0) # add dimension for addition wa = tf.expand_dims(wa, axis=3) wb = tf.expand_dims(wb, axis=3) wc = tf.expand_dims(wc, axis=3) wd = tf.expand_dims(wd, axis=3) # compute output out = tf.add_n([wa*Ia, wb*Ib, wc*Ic, wd*Id]) return out