diff --git a/c/coreslam_armv7l.c b/c/coreslam_armv7l.c
new file mode 100644
index 0000000..ad91d9f
--- /dev/null
+++ b/c/coreslam_armv7l.c
@@ -0,0 +1,117 @@
+/*
+coreslam_armv7l.c.c ARM Cortex Neon acceleration for CoreSLAM
+
+Copyright (C) 2014 by Simon D. Levy
+
+This code is free software: you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+This code is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this code. If not, see .
+*/
+
+
+#ifdef _MSC_VER
+typedef __int64 int64_t; /* Define it from MSVC's internal type */
+#else
+#include /* Use the C99 official header */
+#endif
+
+#include
+#include
+
+#include
+
+#include "coreslam.h"
+#include "coreslam_internals.h"
+
+/* Performs one rotation/translation */
+static void
+neon_coord_4(
+ float32x4_t a_4,
+ float32x4_t b_4,
+ float32x4_t x_4,
+ float32x4_t y_4,
+ float32x4_t pos_4f,
+ float32x4_t point5_4,
+ int * result)
+{
+ float32x4_t tmp1 = vmulq_f32(a_4, x_4);
+ float32x4_t tmp2 = vmulq_f32(b_4, y_4);
+ tmp2 = vaddq_f32(tmp1, tmp2);
+ tmp2 = vaddq_f32(tmp2, pos_4f);
+ tmp2 = vaddq_f32(tmp2, point5_4);
+ int32x4_t c_4 = vcvtq_s32_f32(tmp2);
+ vst1q_s32(result, c_4);
+}
+
+
+int
+distance_scan_to_map(
+ map_t * map,
+ scan_t * scan,
+ position_t position)
+{
+ /* Pre-compute sine and cosine of angle for rotation */
+ double position_theta_radians = radians(position.theta_degrees);
+ double costheta = cos(position_theta_radians) * map->scale_pixels_per_mm;
+ double sintheta = sin(position_theta_radians) * map->scale_pixels_per_mm;
+
+ /* Pre-compute pixel offset for translation */
+ double pos_x_pix = position.x_mm * map->scale_pixels_per_mm;
+ double pos_y_pix = position.y_mm * map->scale_pixels_per_mm;
+
+
+ float32x4_t half_4 = vdupq_n_f32(0.5);
+
+ float32x4_t costheta_4 = vdupq_n_f32(costheta);
+ float32x4_t sintheta_4 = vdupq_n_f32(sintheta);
+ float32x4_t nsintheta_4 = vdupq_n_f32(-sintheta);
+
+ float32x4_t pos_x_4 = vdupq_n_f32(pos_x_pix);
+ float32x4_t pos_y_4 = vdupq_n_f32(pos_y_pix);
+
+ int npoints = 0; /* number of points where scan matches map */
+ int64_t sum = 0;
+
+ /* Stride by 4 over obstacle points in scan */
+ int i = 0;
+ for (i=0; iobst_npoints; i+=4)
+ {
+ /* Duplicate current obstacle point X and Y in 128-bit registers */
+ float32x4_t scan_x_4 = vld1q_f32(&scan->obst_x_mm[i]);
+ float32x4_t scan_y_4 = vld1q_f32(&scan->obst_y_mm[i]);
+
+ /* Compute X coordinate of 4 rotated / translated points at once */
+ int xarr[4];
+ neon_coord_4(costheta_4, nsintheta_4, scan_x_4, scan_y_4, pos_x_4, half_4, xarr);
+
+ /* Compute Y coordinate of 4 rotated / translated points at once */
+ int yarr[4];
+ neon_coord_4(sintheta_4, costheta_4, scan_x_4, scan_y_4, pos_y_4, half_4, yarr);
+
+ /* Handle rotated/translated points serially */
+ int j;
+ for (j=0; j<4 && (i+j)obst_npoints; ++j)
+ {
+ int x = xarr[j];
+ int y = yarr[j];
+
+ /* Add point if in map bounds */
+ if (x >= 0 && x < map->size_pixels && y >= 0 && y < map->size_pixels)
+ {
+ sum += map->pixels[y * map->size_pixels + x];
+ npoints++;
+ }
+ }
+ }
+
+ return npoints ? (int)(sum * 1024 / npoints) : -1;
+}