diff --git a/c/coreslam_armv7l.c b/c/coreslam_armv7l.c new file mode 100644 index 0000000..ad91d9f --- /dev/null +++ b/c/coreslam_armv7l.c @@ -0,0 +1,117 @@ +/* +coreslam_armv7l.c.c ARM Cortex Neon acceleration for CoreSLAM + +Copyright (C) 2014 by Simon D. Levy + +This code is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This code is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with this code. If not, see . +*/ + + +#ifdef _MSC_VER +typedef __int64 int64_t; /* Define it from MSVC's internal type */ +#else +#include /* Use the C99 official header */ +#endif + +#include +#include + +#include + +#include "coreslam.h" +#include "coreslam_internals.h" + +/* Performs one rotation/translation */ +static void +neon_coord_4( + float32x4_t a_4, + float32x4_t b_4, + float32x4_t x_4, + float32x4_t y_4, + float32x4_t pos_4f, + float32x4_t point5_4, + int * result) +{ + float32x4_t tmp1 = vmulq_f32(a_4, x_4); + float32x4_t tmp2 = vmulq_f32(b_4, y_4); + tmp2 = vaddq_f32(tmp1, tmp2); + tmp2 = vaddq_f32(tmp2, pos_4f); + tmp2 = vaddq_f32(tmp2, point5_4); + int32x4_t c_4 = vcvtq_s32_f32(tmp2); + vst1q_s32(result, c_4); +} + + +int +distance_scan_to_map( + map_t * map, + scan_t * scan, + position_t position) +{ + /* Pre-compute sine and cosine of angle for rotation */ + double position_theta_radians = radians(position.theta_degrees); + double costheta = cos(position_theta_radians) * map->scale_pixels_per_mm; + double sintheta = sin(position_theta_radians) * map->scale_pixels_per_mm; + + /* Pre-compute pixel offset for translation */ + double pos_x_pix = position.x_mm * map->scale_pixels_per_mm; + double pos_y_pix = position.y_mm * map->scale_pixels_per_mm; + + + float32x4_t half_4 = vdupq_n_f32(0.5); + + float32x4_t costheta_4 = vdupq_n_f32(costheta); + float32x4_t sintheta_4 = vdupq_n_f32(sintheta); + float32x4_t nsintheta_4 = vdupq_n_f32(-sintheta); + + float32x4_t pos_x_4 = vdupq_n_f32(pos_x_pix); + float32x4_t pos_y_4 = vdupq_n_f32(pos_y_pix); + + int npoints = 0; /* number of points where scan matches map */ + int64_t sum = 0; + + /* Stride by 4 over obstacle points in scan */ + int i = 0; + for (i=0; iobst_npoints; i+=4) + { + /* Duplicate current obstacle point X and Y in 128-bit registers */ + float32x4_t scan_x_4 = vld1q_f32(&scan->obst_x_mm[i]); + float32x4_t scan_y_4 = vld1q_f32(&scan->obst_y_mm[i]); + + /* Compute X coordinate of 4 rotated / translated points at once */ + int xarr[4]; + neon_coord_4(costheta_4, nsintheta_4, scan_x_4, scan_y_4, pos_x_4, half_4, xarr); + + /* Compute Y coordinate of 4 rotated / translated points at once */ + int yarr[4]; + neon_coord_4(sintheta_4, costheta_4, scan_x_4, scan_y_4, pos_y_4, half_4, yarr); + + /* Handle rotated/translated points serially */ + int j; + for (j=0; j<4 && (i+j)obst_npoints; ++j) + { + int x = xarr[j]; + int y = yarr[j]; + + /* Add point if in map bounds */ + if (x >= 0 && x < map->size_pixels && y >= 0 && y < map->size_pixels) + { + sum += map->pixels[y * map->size_pixels + x]; + npoints++; + } + } + } + + return npoints ? (int)(sum * 1024 / npoints) : -1; +}