Optimize floating-point celt_inner_prod() and dual_inner_prod() for ARM NEON

The floating-point optimizations are not bit exact with C functions,
because of the different orders of floating-point operations.
But they are bit exact with the simulation C functions which simulate
the floating operations in the optimizations.

Change-Id: I149fda5b602fd5712b16fc8983df3c6c0c9e76ad

Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 7c356b9..ca988b6 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -35,8 +35,6 @@
 
 #if defined(OPUS_HAVE_RTCD)
 
-# if defined(FIXED_POINT)
-
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
 opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
   celt_inner_prod_c,   /* ARMv4 */
@@ -54,6 +52,7 @@
 };
 # endif
 
+# if defined(FIXED_POINT)
 #  if ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
     (defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
     (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index ea2f590..4ee13bd 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -30,8 +30,6 @@
 
 # include "armcpu.h"
 
-# if defined(FIXED_POINT)
-
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N);
 void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01,
@@ -68,6 +66,8 @@
 #  endif
 # endif
 
+# if defined(FIXED_POINT)
+
 #  if defined(OPUS_ARM_MAY_HAVE_NEON)
 opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
     opus_val32 *xcorr, int len, int max_pitch, int arch);
diff --git a/celt/arm/pitch_neon_intr.c b/celt/arm/pitch_neon_intr.c
index ad18d2d..1ac38c4 100644
--- a/celt/arm/pitch_neon_intr.c
+++ b/celt/arm/pitch_neon_intr.c
@@ -126,4 +126,165 @@
 #endif
 }
 
+#else /* !FIXED_POINT */
+
+/* ========================================================================== */
+
+#ifdef OPUS_CHECK_ASM
+
+/* This part of code simulates floating-point NEON operations. */
+
+/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point   */
+/* operations of celt_inner_prod_neon(), and both functions should have bit */
+/* exact output.                                                            */
+static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
+{
+   int i;
+   opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
+   for (i = 0; i < N - 3; i += 4) {
+      xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
+      xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
+      xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
+      xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
+   }
+   xy0 += xy2;
+   xy1 += xy3;
+   xy = xy0 + xy1;
+   for (; i < N; i++) {
+      xy = MAC16_16(xy, x[i], y[i]);
+   }
+   return xy;
+}
+
+/* dual_inner_prod_neon_float_c_simulation() simulates the floating-point   */
+/* operations of dual_inner_prod_neon(), and both functions should have bit */
+/* exact output.                                                            */
+static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
+   for (i = 0; i < N - 3; i += 4) {
+      xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
+      xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
+      xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
+      xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
+      xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
+      xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
+      xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
+      xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
+   }
+   xy01_0 += xy01_2;
+   xy02_0 += xy02_2;
+   xy01_1 += xy01_3;
+   xy02_1 += xy02_3;
+   xy01 = xy01_0 + xy01_1;
+   xy02 = xy02_0 + xy02_1;
+   for (; i < N; i++) {
+      xy01 = MAC16_16(xy01, x[i], y01[i]);
+      xy02 = MAC16_16(xy02, x[i], y02[i]);
+   }
+   *xy1 = xy01;
+   *xy2 = xy02;
+}
+
+#endif /* OPUS_CHECK_ASM */
+
+/* ========================================================================== */
+
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
+{
+    int i;
+    opus_val32 xy;
+    float32x4_t xy_f32x4 = vdupq_n_f32(0);
+    float32x2_t xy_f32x2;
+
+    for (i = 0; i < N - 7; i += 8) {
+        float32x4_t x_f32x4, y_f32x4;
+        x_f32x4  = vld1q_f32(&x[i]);
+        y_f32x4  = vld1q_f32(&y[i]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+        x_f32x4  = vld1q_f32(&x[i + 4]);
+        y_f32x4  = vld1q_f32(&y[i + 4]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+    }
+
+    if (N - i >= 4) {
+        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
+        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+        i += 4;
+    }
+
+    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
+    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
+    xy       = vget_lane_f32(xy_f32x2, 0);
+
+    for (; i < N; i++) {
+        xy = MAC16_16(xy, x[i], y[i]);
+    }
+
+#ifdef OPUS_CHECK_ASM
+    celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
 #endif
+
+    return xy;
+}
+
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+        int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+    int i;
+    opus_val32 xy01, xy02;
+    float32x4_t xy01_f32x4 = vdupq_n_f32(0);
+    float32x4_t xy02_f32x4 = vdupq_n_f32(0);
+    float32x2_t xy01_f32x2, xy02_f32x2;
+
+    for (i = 0; i < N - 7; i += 8) {
+        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
+        x_f32x4    = vld1q_f32(&x[i]);
+        y01_f32x4  = vld1q_f32(&y01[i]);
+        y02_f32x4  = vld1q_f32(&y02[i]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+        x_f32x4    = vld1q_f32(&x[i + 4]);
+        y01_f32x4  = vld1q_f32(&y01[i + 4]);
+        y02_f32x4  = vld1q_f32(&y02[i + 4]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+    }
+
+    if (N - i >= 4) {
+        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
+        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
+        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+        i += 4;
+    }
+
+    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
+    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
+    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
+    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
+    xy01       = vget_lane_f32(xy01_f32x2, 0);
+    xy02       = vget_lane_f32(xy02_f32x2, 0);
+
+    for (; i < N; i++) {
+        xy01 = MAC16_16(xy01, x[i], y01[i]);
+        xy02 = MAC16_16(xy02, x[i], y02[i]);
+    }
+    *xy1 = xy01;
+    *xy2 = xy02;
+
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_val32 xy1_c, xy2_c;
+        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
+        celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
+        celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
+    }
+#endif
+}
+
+#endif /* FIXED_POINT */
diff --git a/celt/pitch.h b/celt/pitch.h
index d797844..e425f56 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -46,8 +46,7 @@
 #include "mips/pitch_mipsr1.h"
 #endif
 
-#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
-  || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
 # include "arm/pitch_arm.h"
 #endif