// Source: media/libaudioprocessing/AudioResamplerFirProcessNeon.h - third_party/android/platform/frameworks/av - Git at Google

 /*
  * Copyright (C) 2013 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
 #define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H

 namespace android {

 // depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h

 // Everything from here on is compiled only when USE_NEON is non-zero.
 // (The matching #endif lies beyond this section of the file.)
 #if USE_NEON

 // Select the implementation used by the specializations below:
 // USE_INTRINSIC is defined whenever USE_INLINE_ASSEMBLY is zero/undefined,
 // in which case the specializations forward to ProcessNeonIntrinsic()
 // instead of using their arm32 inline-assembly bodies.
 #if !USE_INLINE_ASSEMBLY
 #define USE_INTRINSIC
 #endif

 // The paired-register loads below are available only with the 64-bit ARM ACLE.
 // On other targets any macro definitions of them are removed, so that the
 // "#ifdef vld1q_*_x2" tests inside the kernels pick the two-load fallback.
 #ifndef __aarch64__
 #undef vld1q_f32_x2
 #undef vld1q_s32_x2
 #endif

 // Two-level stringification: TO_STRING(x) expands x before turning it into
 // a string literal. In this section it is referenced only by the
 // commented-out diagnostic below.
 #define TO_STRING2(x) #x
 #define TO_STRING(x) TO_STRING2(x)
 // uncomment to print GCC version, may be relevant for intrinsic optimizations
 /* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
         "." TO_STRING(__GNUC_MINOR__) \
         "." TO_STRING(__GNUC_PATCHLEVEL__)) */

 //
 // NEON specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
 //
 // Two variants are presented here:
 // ARM NEON inline assembly which appears up to 10-15% faster than intrinsics (gcc 4.9) for arm32.
 // ARM NEON intrinsics which can also be used by arm64 and x86/64 with NEON header.
 //

 // Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
 // These are only used for inline assembly.
 //
 // ASSEMBLY_ACCUMULATE_MONO: epilogue for a mono kernel whose four partial
 // sums are in q0 (d0, d1). It expects the asm operands %[vLR] (register
 // holding the address of the two volumes, 64-bit aligned) and %[out]
 // (memory operand for the output pair). It
 //   - sums the four partial sums and replicates the total into both lanes,
 //   - scales by the volumes with a saturating rounding doubling multiply-high,
 //   - adds the result (saturating) to the two 32-bit values at %[out] and
 //     stores them back.
 // Registers written: d0, d2, d3 (i.e. within q0 and q1).
 // The parenthesised numbers in the per-line comments are kept from the
 // original author; they look like cycle/latency estimates - TODO confirm.
 #define ASSEMBLY_ACCUMULATE_MONO \
         "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
         "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
         "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
         "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
         "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
         "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
         "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

 // ASSEMBLY_ACCUMULATE_STEREO: epilogue for a stereo inline-assembly kernel.
 // Expects the left partial sums in q0 (d0, d1) and the right partial sums in
 // q4 (d8, d9), plus the asm operands %[vLR] (register holding the address of
 // the two volumes, 64-bit aligned) and %[out] (memory operand for the output
 // pair). It reduces each accumulator to a single sum, packs them as {L, R},
 // scales by the volumes with a saturating rounding doubling multiply-high,
 // then adds (saturating) into the two 32-bit values at %[out] and stores
 // them back.
 // Registers written: d0, d2, d3, d8.
 #define ASSEMBLY_ACCUMULATE_STEREO \
         "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes*/\
         "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output*/\
         "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0*/\
         "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4*/\
         "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R*/\
         "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume*/\
         "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating)*/\
         "vst1.s32       {d3}, %[out]             \n"/* (2+2d)store result*/

 template <int CHANNELS, int STRIDE, bool FIXED>
 static inline void ProcessNeonIntrinsic(int32_t* out,
         int count,
         const int16_t* coefsP,
         const int16_t* coefsN,
         const int16_t* sP,
         const int16_t* sN,
         const int32_t* volumeLR,
         uint32_t lerpP,
         const int16_t* coefsP1,
         const int16_t* coefsN1)
 {
     ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
     static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

     sP -= CHANNELS*((STRIDE>>1)-1);
     coefsP = (const int16_t*)__builtin_assume_aligned(coefsP, 16);
     coefsN = (const int16_t*)__builtin_assume_aligned(coefsN, 16);

     int16x4_t interp;
     if (!FIXED) {
         interp = vdup_n_s16(lerpP);
         //interp = (int16x4_t)vset_lane_s32 ((int32x2_t)lerpP, interp, 0);
         coefsP1 = (const int16_t*)__builtin_assume_aligned(coefsP1, 16);
         coefsN1 = (const int16_t*)__builtin_assume_aligned(coefsN1, 16);
     }
     int32x4_t accum, accum2;
     // warning uninitialized if we use veorq_s32
     // (alternative to below) accum = veorq_s32(accum, accum);
     accum = vdupq_n_s32(0);
     if (CHANNELS == 2) {
         // (alternative to below) accum2 = veorq_s32(accum2, accum2);
         accum2 = vdupq_n_s32(0);
     }
     do {
         int16x8_t posCoef = vld1q_s16(coefsP);
         coefsP += 8;
         int16x8_t negCoef = vld1q_s16(coefsN);
         coefsN += 8;
         if (!FIXED) { // interpolate
             int16x8_t posCoef1 = vld1q_s16(coefsP1);
             coefsP1 += 8;
             int16x8_t negCoef1 = vld1q_s16(coefsN1);
             coefsN1 += 8;

             posCoef1 = vsubq_s16(posCoef1, posCoef);
             negCoef = vsubq_s16(negCoef, negCoef1);

             posCoef1 = vqrdmulhq_lane_s16(posCoef1, interp, 0);
             negCoef = vqrdmulhq_lane_s16(negCoef, interp, 0);

             posCoef = vaddq_s16(posCoef, posCoef1);
             negCoef = vaddq_s16(negCoef, negCoef1);
         }
         switch (CHANNELS) {
         case 1: {
             int16x8_t posSamp = vld1q_s16(sP);
             int16x8_t negSamp = vld1q_s16(sN);
             sN += 8;
             posSamp = vrev64q_s16(posSamp);

             // dot product
             accum = vmlal_s16(accum, vget_low_s16(posSamp), vget_high_s16(posCoef)); // reversed
             accum = vmlal_s16(accum, vget_high_s16(posSamp), vget_low_s16(posCoef)); // reversed
             accum = vmlal_s16(accum, vget_low_s16(negSamp), vget_low_s16(negCoef));
             accum = vmlal_s16(accum, vget_high_s16(negSamp), vget_high_s16(negCoef));
             sP -= 8;
         } break;
         case 2: {
             int16x8x2_t posSamp = vld2q_s16(sP);
             int16x8x2_t negSamp = vld2q_s16(sN);
             sN += 16;
             posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
             posSamp.val[1] = vrev64q_s16(posSamp.val[1]);

             // dot product
             accum = vmlal_s16(accum, vget_low_s16(posSamp.val[0]), vget_high_s16(posCoef)); // r
             accum = vmlal_s16(accum, vget_high_s16(posSamp.val[0]), vget_low_s16(posCoef)); // r
             accum2 = vmlal_s16(accum2, vget_low_s16(posSamp.val[1]), vget_high_s16(posCoef)); // r
             accum2 = vmlal_s16(accum2, vget_high_s16(posSamp.val[1]), vget_low_s16(posCoef)); // r
             accum = vmlal_s16(accum, vget_low_s16(negSamp.val[0]), vget_low_s16(negCoef));
             accum = vmlal_s16(accum, vget_high_s16(negSamp.val[0]), vget_high_s16(negCoef));
             accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef));
             accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef));
             sP -= 16;
         } break;
         }
     } while (count -= 8);

     // multiply by volume and save
     volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
     int32x2_t vLR = vld1_s32(volumeLR);
     int32x2_t outSamp = vld1_s32(out);
     // combine and funnel down accumulator
     int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
     if (CHANNELS == 1) {
         // duplicate accum to both L and R
         outAccum = vpadd_s32(outAccum, outAccum);
     } else if (CHANNELS == 2) {
         // accum2 contains R, fold in
         int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
         outAccum = vpadd_s32(outAccum, outAccum2);
     }
     outAccum = vqrdmulh_s32(outAccum, vLR);
     outSamp = vqadd_s32(outSamp, outAccum);
     vst1_s32(out, outSamp);
 }

 template <int CHANNELS, int STRIDE, bool FIXED>
 static inline void ProcessNeonIntrinsic(int32_t* out,
         int count,
         const int32_t* coefsP,
         const int32_t* coefsN,
         const int16_t* sP,
         const int16_t* sN,
         const int32_t* volumeLR,
         uint32_t lerpP,
         const int32_t* coefsP1,
         const int32_t* coefsN1)
 {
     ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
     static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

     sP -= CHANNELS*((STRIDE>>1)-1);
     coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
     coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);

     int32x2_t interp;
     if (!FIXED) {
         interp = vdup_n_s32(lerpP);
         coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
         coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
     }
     int32x4_t accum, accum2;
     // warning uninitialized if we use veorq_s32
     // (alternative to below) accum = veorq_s32(accum, accum);
     accum = vdupq_n_s32(0);
     if (CHANNELS == 2) {
         // (alternative to below) accum2 = veorq_s32(accum2, accum2);
         accum2 = vdupq_n_s32(0);
     }
     do {
 #ifdef vld1q_s32_x2
         int32x4x2_t posCoef = vld1q_s32_x2(coefsP);
         coefsP += 8;
         int32x4x2_t negCoef = vld1q_s32_x2(coefsN);
         coefsN += 8;
 #else
         int32x4x2_t posCoef;
         posCoef.val[0] = vld1q_s32(coefsP);
         coefsP += 4;
         posCoef.val[1] = vld1q_s32(coefsP);
         coefsP += 4;
         int32x4x2_t negCoef;
         negCoef.val[0] = vld1q_s32(coefsN);
         coefsN += 4;
         negCoef.val[1] = vld1q_s32(coefsN);
         coefsN += 4;
 #endif
         if (!FIXED) { // interpolate
 #ifdef vld1q_s32_x2
             int32x4x2_t posCoef1 = vld1q_s32_x2(coefsP1);
             coefsP1 += 8;
             int32x4x2_t negCoef1 = vld1q_s32_x2(coefsN1);
             coefsN1 += 8;
 #else
             int32x4x2_t posCoef1;
             posCoef1.val[0] = vld1q_s32(coefsP1);
             coefsP1 += 4;
             posCoef1.val[1] = vld1q_s32(coefsP1);
             coefsP1 += 4;
             int32x4x2_t negCoef1;
             negCoef1.val[0] = vld1q_s32(coefsN1);
             coefsN1 += 4;
             negCoef1.val[1] = vld1q_s32(coefsN1);
             coefsN1 += 4;
 #endif

             posCoef1.val[0] = vsubq_s32(posCoef1.val[0], posCoef.val[0]);
             posCoef1.val[1] = vsubq_s32(posCoef1.val[1], posCoef.val[1]);
             negCoef.val[0] = vsubq_s32(negCoef.val[0], negCoef1.val[0]);
             negCoef.val[1] = vsubq_s32(negCoef.val[1], negCoef1.val[1]);

             posCoef1.val[0] = vqrdmulhq_lane_s32(posCoef1.val[0], interp, 0);
             posCoef1.val[1] = vqrdmulhq_lane_s32(posCoef1.val[1], interp, 0);
             negCoef.val[0] = vqrdmulhq_lane_s32(negCoef.val[0], interp, 0);
             negCoef.val[1] = vqrdmulhq_lane_s32(negCoef.val[1], interp, 0);

             posCoef.val[0] = vaddq_s32(posCoef.val[0], posCoef1.val[0]);
             posCoef.val[1] = vaddq_s32(posCoef.val[1], posCoef1.val[1]);
             negCoef.val[0] = vaddq_s32(negCoef.val[0], negCoef1.val[0]);
             negCoef.val[1] = vaddq_s32(negCoef.val[1], negCoef1.val[1]);
         }
         switch (CHANNELS) {
         case 1: {
             int16x8_t posSamp = vld1q_s16(sP);
             int16x8_t negSamp = vld1q_s16(sN);
             sN += 8;
             posSamp = vrev64q_s16(posSamp);

             int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp), 15);
             int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp), 15);
             int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp), 15);
             int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp), 15);

             // dot product
             posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
             posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
             negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
             negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

             accum = vaddq_s32(accum, posSamp0);
             negSamp0 = vaddq_s32(negSamp0, negSamp1);
             accum = vaddq_s32(accum, posSamp1);
             accum = vaddq_s32(accum, negSamp0);

             sP -= 8;
         } break;
         case 2: {
             int16x8x2_t posSamp = vld2q_s16(sP);
             int16x8x2_t negSamp = vld2q_s16(sN);
             sN += 16;
             posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
             posSamp.val[1] = vrev64q_s16(posSamp.val[1]);

             // left
             int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[0]), 15);
             int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[0]), 15);
             int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[0]), 15);
             int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[0]), 15);

             // dot product
             posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
             posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
             negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
             negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

             accum = vaddq_s32(accum, posSamp0);
             negSamp0 = vaddq_s32(negSamp0, negSamp1);
             accum = vaddq_s32(accum, posSamp1);
             accum = vaddq_s32(accum, negSamp0);

             // right
             posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[1]), 15);
             posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[1]), 15);
             negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[1]), 15);
             negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[1]), 15);

             // dot product
             posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
             posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
             negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
             negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

             accum2 = vaddq_s32(accum2, posSamp0);
             negSamp0 = vaddq_s32(negSamp0, negSamp1);
             accum2 = vaddq_s32(accum2, posSamp1);
             accum2 = vaddq_s32(accum2, negSamp0);

             sP -= 16;
         } break;
         }
     } while (count -= 8);

     // multiply by volume and save
     volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
     int32x2_t vLR = vld1_s32(volumeLR);
     int32x2_t outSamp = vld1_s32(out);
     // combine and funnel down accumulator
     int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
     if (CHANNELS == 1) {
         // duplicate accum to both L and R
         outAccum = vpadd_s32(outAccum, outAccum);
     } else if (CHANNELS == 2) {
         // accum2 contains R, fold in
         int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
         outAccum = vpadd_s32(outAccum, outAccum2);
     }
     outAccum = vqrdmulh_s32(outAccum, vLR);
     outSamp = vqadd_s32(outSamp, outAccum);
     vst1_s32(out, outSamp);
 }

 // NEON-intrinsic FIR kernel: float samples, float coefficients.
 //
 // Each loop pass consumes 8 coefficients from each of coefsP and coefsN.
 // The "positive" samples are read walking backwards from sP (each group of
 // four is fully reversed so it lines up with ascending coefficients); the
 // "negative" samples are read walking forwards from sN. Products are
 // accumulated with multiply-accumulate, reduced to one sum per channel,
 // multiplied by volumeLR and added into out[0] and out[1] (plain float
 // arithmetic, no saturation).
 //
 // Template parameters:
 //   CHANNELS  1 (mono; the result is written to both outputs) or 2
 //             (interleaved stereo).
 //   STRIDE    only used to compute how far sP is rewound before the loop.
 //   FIXED     when true, lerpP/coefsP1/coefsN1 are ignored; when false each
 //             coefficient is linearly interpolated between the two tables.
 //
 // Parameters:
 //   out       two floats, read-modify-written.
 //   count     number of coefficients per side; must be a positive multiple
 //             of 8 (the loop only terminates when it reaches exactly 0).
 //   coefsP/N, coefsP1/N1  coefficient tables, assumed 16-byte aligned.
 //   volumeLR  two float volumes, assumed 8-byte aligned.
 //   lerpP     interpolation fraction applied to the coefficient difference.
 template <int CHANNELS, int STRIDE, bool FIXED>
 static inline void ProcessNeonIntrinsic(float* out,
         int count,
         const float* coefsP,
         const float* coefsN,
         const float* sP,
         const float* sN,
         const float* volumeLR,
         float lerpP,
         const float* coefsP1,
         const float* coefsN1)
 {
     ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
     static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

     // Rewind so that the first 8-frame read ends at the incoming sP frame.
     sP -= CHANNELS*((STRIDE>>1)-1);
     coefsP = (const float*)__builtin_assume_aligned(coefsP, 16);
     coefsN = (const float*)__builtin_assume_aligned(coefsN, 16);

     // interp is only initialized (and only read) when interpolating.
     float32x2_t interp;
     if (!FIXED) {
         interp = vdup_n_f32(lerpP);
         coefsP1 = (const float*)__builtin_assume_aligned(coefsP1, 16);
         coefsN1 = (const float*)__builtin_assume_aligned(coefsN1, 16);
     }
     // accum holds the mono/left partial sums; accum2 (stereo only) the right.
     float32x4_t accum, accum2;
     // warning uninitialized if we use veorq_s32
     // (alternative to below) accum = veorq_s32(accum, accum);
     accum = vdupq_n_f32(0);
     if (CHANNELS == 2) {
         // (alternative to below) accum2 = veorq_s32(accum2, accum2);
         accum2 = vdupq_n_f32(0);
     }
     do {
         // Load 8 positive-side and 8 negative-side coefficients; both
         // variants advance each pointer by 8 floats.
 #ifdef vld1q_f32_x2
         float32x4x2_t posCoef = vld1q_f32_x2(coefsP);
         coefsP += 8;
         float32x4x2_t negCoef = vld1q_f32_x2(coefsN);
         coefsN += 8;
 #else
         float32x4x2_t posCoef;
         posCoef.val[0] = vld1q_f32(coefsP);
         coefsP += 4;
         posCoef.val[1] = vld1q_f32(coefsP);
         coefsP += 4;
         float32x4x2_t negCoef;
         negCoef.val[0] = vld1q_f32(coefsN);
         coefsN += 4;
         negCoef.val[1] = vld1q_f32(coefsN);
         coefsN += 4;
 #endif
         if (!FIXED) { // interpolate
 #ifdef vld1q_f32_x2
             float32x4x2_t posCoef1 = vld1q_f32_x2(coefsP1);
             coefsP1 += 8;
             float32x4x2_t negCoef1 = vld1q_f32_x2(coefsN1);
             coefsN1 += 8;
 #else
             float32x4x2_t posCoef1;
             posCoef1.val[0] = vld1q_f32(coefsP1);
             coefsP1 += 4;
             posCoef1.val[1] = vld1q_f32(coefsP1);
             coefsP1 += 4;
             float32x4x2_t negCoef1;
             negCoef1.val[0] = vld1q_f32(coefsN1);
             coefsN1 += 4;
             negCoef1.val[1] = vld1q_f32(coefsN1);
             coefsN1 += 4;
 #endif
             // Differences between the two tables. Note the opposite
             // direction on the negative side (negCoef - negCoef1).
             posCoef1.val[0] = vsubq_f32(posCoef1.val[0], posCoef.val[0]);
             posCoef1.val[1] = vsubq_f32(posCoef1.val[1], posCoef.val[1]);
             negCoef.val[0] = vsubq_f32(negCoef.val[0], negCoef1.val[0]);
             negCoef.val[1] = vsubq_f32(negCoef.val[1], negCoef1.val[1]);

             // posCoef = posCoef  + (posCoef1 - posCoef ) * lerpP
             // negCoef = negCoef1 + (negCoef  - negCoef1) * lerpP
             posCoef.val[0] = vmlaq_lane_f32(posCoef.val[0], posCoef1.val[0], interp, 0);
             posCoef.val[1] = vmlaq_lane_f32(posCoef.val[1], posCoef1.val[1], interp, 0);
             negCoef.val[0] = vmlaq_lane_f32(negCoef1.val[0], negCoef.val[0], interp, 0); // rev
             negCoef.val[1] = vmlaq_lane_f32(negCoef1.val[1], negCoef.val[1], interp, 0); // rev
         }
         switch (CHANNELS) {
         case 1: {
             // Load 8 samples from each side. In both variants the net
             // effect is sP -= 8 and sN += 8 per iteration.
 #ifdef vld1q_f32_x2
             float32x4x2_t posSamp = vld1q_f32_x2(sP);
             float32x4x2_t negSamp = vld1q_f32_x2(sN);
             sN += 8;
             sP -= 8;
 #else
             float32x4x2_t posSamp;
             posSamp.val[0] = vld1q_f32(sP);
             sP += 4;
             posSamp.val[1] = vld1q_f32(sP);
             sP -= 12;
             float32x4x2_t negSamp;
             negSamp.val[0] = vld1q_f32(sN);
             sN += 4;
             negSamp.val[1] = vld1q_f32(sN);
             sN += 4;
 #endif
             // effectively we want a vrev128q_f32()
             // (swap within each 64-bit half, then swap the two halves)
             posSamp.val[0] = vrev64q_f32(posSamp.val[0]);
             posSamp.val[1] = vrev64q_f32(posSamp.val[1]);
             posSamp.val[0] = vcombine_f32(
                     vget_high_f32(posSamp.val[0]), vget_low_f32(posSamp.val[0]));
             posSamp.val[1] = vcombine_f32(
                     vget_high_f32(posSamp.val[1]), vget_low_f32(posSamp.val[1]));

             // Positive side: the reversed groups pair with the opposite
             // coefficient group (val[0] <-> val[1]).
             accum = vmlaq_f32(accum, posSamp.val[0], posCoef.val[1]);
             accum = vmlaq_f32(accum, posSamp.val[1], posCoef.val[0]);
             accum = vmlaq_f32(accum, negSamp.val[0], negCoef.val[0]);
             accum = vmlaq_f32(accum, negSamp.val[1], negCoef.val[1]);
         } break;
         case 2: {
             // De-interleaving loads: val[0] = left, val[1] = right, four
             // frames (8 floats) per load.
             float32x4x2_t posSamp0 = vld2q_f32(sP);
             sP += 8;
             float32x4x2_t negSamp0 = vld2q_f32(sN);
             sN += 8;
             // full 4-lane reversal of each channel (see mono case)
             posSamp0.val[0] = vrev64q_f32(posSamp0.val[0]);
             posSamp0.val[1] = vrev64q_f32(posSamp0.val[1]);
             posSamp0.val[0] = vcombine_f32(
                     vget_high_f32(posSamp0.val[0]), vget_low_f32(posSamp0.val[0]));
             posSamp0.val[1] = vcombine_f32(
                     vget_high_f32(posSamp0.val[1]), vget_low_f32(posSamp0.val[1]));

             // Second group of four frames. Net pointer movement per
             // iteration: sP -= 16 floats (+8 - 24), sN += 16 floats.
             float32x4x2_t posSamp1 = vld2q_f32(sP);
             sP -= 24;
             float32x4x2_t negSamp1 = vld2q_f32(sN);
             sN += 8;
             posSamp1.val[0] = vrev64q_f32(posSamp1.val[0]);
             posSamp1.val[1] = vrev64q_f32(posSamp1.val[1]);
             posSamp1.val[0] = vcombine_f32(
                     vget_high_f32(posSamp1.val[0]), vget_low_f32(posSamp1.val[0]));
             posSamp1.val[1] = vcombine_f32(
                     vget_high_f32(posSamp1.val[1]), vget_low_f32(posSamp1.val[1]));

             // Note: speed is affected by accumulation order.
             // Also, speed appears slower using vmul/vadd instead of vmla for
             // stereo case, comparable for mono.

             accum = vmlaq_f32(accum, negSamp0.val[0], negCoef.val[0]);
             accum = vmlaq_f32(accum, negSamp1.val[0], negCoef.val[1]);
             accum2 = vmlaq_f32(accum2, negSamp0.val[1], negCoef.val[0]);
             accum2 = vmlaq_f32(accum2, negSamp1.val[1], negCoef.val[1]);

             accum = vmlaq_f32(accum, posSamp0.val[0], posCoef.val[1]); // reversed
             accum = vmlaq_f32(accum, posSamp1.val[0], posCoef.val[0]); // reversed
             accum2 = vmlaq_f32(accum2, posSamp0.val[1], posCoef.val[1]); // reversed
             accum2 = vmlaq_f32(accum2, posSamp1.val[1], posCoef.val[0]); // reversed
         } break;
         }
     } while (count -= 8); // exits only when count reaches exactly 0

     // multiply by volume and save
     volumeLR = (const float*)__builtin_assume_aligned(volumeLR, 8);
     float32x2_t vLR = vld1_f32(volumeLR);
     float32x2_t outSamp = vld1_f32(out);
     // combine and funnel down accumulator
     float32x2_t outAccum = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
     if (CHANNELS == 1) {
         // duplicate accum to both L and R
         outAccum = vpadd_f32(outAccum, outAccum);
     } else if (CHANNELS == 2) {
         // accum2 contains R, fold in
         float32x2_t outAccum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
         outAccum = vpadd_f32(outAccum, outAccum2);
     }
     // out[0..1] += {L, R} * volumeLR[0..1]
     outSamp = vmla_f32(outSamp, outAccum, vLR);
     vst1_f32(out, outSamp);
 }

 // ProcessL specialization: mono, 16-bit samples and coefficients, STRIDE 16,
 // no coefficient interpolation.
 //
 // Accumulates the FIR dot product of `count` coefficients per side and adds
 // the volume-scaled result (saturating) into out[0] and out[1].
 //
 //   out       two int32 values, read-modify-written.
 //   count     number of coefficients per side; must be a positive multiple
 //             of 8 (both implementations loop until it reaches exactly 0).
 //   coefsP/N  positive/negative side coefficients; the assembly path loads
 //             them with a 128-bit alignment qualifier.
 //   sP, sN    positive-side samples (walked backwards) and negative-side
 //             samples (walked forwards).
 //   volumeLR  two int32 volumes; loaded with a 64-bit alignment qualifier.
 //
 // With USE_INTRINSIC the work is forwarded to ProcessNeonIntrinsic<1, 16, true>;
 // otherwise the hand-scheduled arm32 assembly below is used.
 template <>
 inline void ProcessL<1, 16>(int32_t* const out,
         int count,
         const int16_t* coefsP,
         const int16_t* coefsN,
         const int16_t* sP,
         const int16_t* sN,
         const int32_t* const volumeLR)
 {
 #ifdef USE_INTRINSIC
     // FIXED == true: the interpolation arguments are never dereferenced.
     ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
             0 /*lerpP*/, nullptr /*coefsP1*/, nullptr /*coefsN1*/);
 #else
     const int CHANNELS = 1; // template specialization does not preserve params
     const int STRIDE = 16;
     // Rewind so that the first 8-sample load ends at the incoming sP sample.
     sP -= CHANNELS*((STRIDE>>1)-1);
     // NOTE(review): %[out] is declared write-only ("=Uv") although
     // ASSEMBLY_ACCUMULATE_MONO reads it before storing, and the asm touches
     // out[1] as well; the "memory" clobber presumably keeps this safe.
     // Confirm before changing the clobber list.
     asm (
         "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

         "1:                                      \n"

         "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
         "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
         "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
         "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

         "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

         // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
         "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed)samples by coef
         "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed)samples by coef
         "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
         "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

         // moving these ARM instructions before neon above seems to be slower
         "subs           %[count], %[count], #8   \n"// (1) update loop counter
         "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

         // sP used after branch (warning)
         "bne            1b                       \n"// loop

          ASSEMBLY_ACCUMULATE_MONO

         : [out]     "=Uv" (out[0]),
           [count]   "+r" (count),
           [coefsP0] "+r" (coefsP),
           [coefsN0] "+r" (coefsN),
           [sP]      "+r" (sP),
           [sN]      "+r" (sN)
         : [vLR]     "r" (volumeLR)
         : "cc", "memory",
           "q0", "q1", "q2", "q3",
           "q8", "q10"
     );
 #endif
 }

 // ProcessL specialization: interleaved stereo, 16-bit samples and
 // coefficients, STRIDE 16, no coefficient interpolation.
 //
 // Accumulates one FIR dot product per channel over `count` coefficients per
 // side and adds the volume-scaled results (saturating) into out[0] (left)
 // and out[1] (right).
 //
 //   out       two int32 values, read-modify-written.
 //   count     number of coefficients per side; must be a positive multiple
 //             of 8 (both implementations loop until it reaches exactly 0).
 //   coefsP/N  positive/negative side coefficients; the assembly path loads
 //             them with a 128-bit alignment qualifier.
 //   sP, sN    positive-side frames (walked backwards) and negative-side
 //             frames (walked forwards), left/right interleaved.
 //   volumeLR  two int32 volumes; loaded with a 64-bit alignment qualifier.
 //
 // With USE_INTRINSIC the work is forwarded to ProcessNeonIntrinsic<2, 16, true>;
 // otherwise the hand-scheduled arm32 assembly below is used.
 template <>
 inline void ProcessL<2, 16>(int32_t* const out,
         int count,
         const int16_t* coefsP,
         const int16_t* coefsN,
         const int16_t* sP,
         const int16_t* sN,
         const int32_t* const volumeLR)
 {
 #ifdef USE_INTRINSIC
     // FIXED == true: the interpolation arguments are never dereferenced.
     ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
             0 /*lerpP*/, nullptr /*coefsP1*/, nullptr /*coefsN1*/);
 #else
     const int CHANNELS = 2; // template specialization does not preserve params
     const int STRIDE = 16;
     // Rewind so that the first 8-frame load ends at the incoming sP frame.
     sP -= CHANNELS*((STRIDE>>1)-1);
     // NOTE(review): %[out] is declared write-only ("=Uv") although
     // ASSEMBLY_ACCUMULATE_STEREO reads it before storing, and the asm touches
     // out[1] as well; the "memory" clobber presumably keeps this safe.
     // Confirm before changing the clobber list.
     asm (
         "veor           q0, q0, q0               \n"// (1) acc_L = 0
         "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

         "1:                                      \n"

         "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
         "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
         "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
         "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

         "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
         "vrev64.16      q3, q3                   \n"// (0 combines+) reverse positive right

         "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
         "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
         "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
         "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
         "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
         "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
         "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
         "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

         // moving these ARM before neon seems to be slower
         "subs           %[count], %[count], #8   \n"// (1) update loop counter
         "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

         // sP used after branch (warning)
         "bne            1b                       \n"// loop

         ASSEMBLY_ACCUMULATE_STEREO

         : [out] "=Uv" (out[0]),
           [count] "+r" (count),
           [coefsP0] "+r" (coefsP),
           [coefsN0] "+r" (coefsN),
           [sP] "+r" (sP),
           [sN] "+r" (sN)
         : [vLR] "r" (volumeLR)
         : "cc", "memory",
           "q0", "q1", "q2", "q3",
           "q4", "q5", "q6",
           "q8", "q10"
      );
 #endif
 }

 // Process specialization: mono, 16-bit samples and coefficients, STRIDE 16,
 // with coefficient interpolation between two coefficient tables.
 //
 // Accumulates the FIR dot product of `count` interpolated coefficients per
 // side and adds the volume-scaled result (saturating) into out[0] and out[1].
 //
 //   out         two int32 values, read-modify-written.
 //   count       number of coefficients per side; must be a positive multiple
 //               of 8 (both implementations loop until it reaches exactly 0).
 //   coefsP/N    first coefficient table (positive / negative side).
 //   coefsP1/N1  second coefficient table used for interpolation.
 //   sP, sN      positive-side samples (walked backwards) and negative-side
 //               samples (walked forwards).
 //   lerpP       interpolation fraction; the assembly uses its low 16 bits
 //               as the multiplier lane (d2[0]).
 //   volumeLR    two int32 volumes; loaded with a 64-bit alignment qualifier.
 //
 // With USE_INTRINSIC the work is forwarded to ProcessNeonIntrinsic<1, 16, false>;
 // otherwise the hand-scheduled arm32 assembly below is used.
 template <>
 inline void Process<1, 16>(int32_t* const out,
         int count,
         const int16_t* coefsP,
         const int16_t* coefsN,
         const int16_t* coefsP1,
         const int16_t* coefsN1,
         const int16_t* sP,
         const int16_t* sN,
         uint32_t lerpP,
         const int32_t* const volumeLR)
 {
 #ifdef USE_INTRINSIC
     ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
             lerpP, coefsP1, coefsN1);
 #else

     const int CHANNELS = 1; // template specialization does not preserve params
     const int STRIDE = 16;
     // Rewind so that the first 8-sample load ends at the incoming sP sample.
     sP -= CHANNELS*((STRIDE>>1)-1);
     // NOTE(review): %[out] is declared write-only ("=Uv") although
     // ASSEMBLY_ACCUMULATE_MONO reads it before storing, and the asm touches
     // out[1] as well; the "memory" clobber presumably keeps this safe.
     // Confirm before changing the clobber list.
     asm (
         "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
         "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

         "1:                                      \n"

         "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
         "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
         "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
         "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
         "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
         "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

         // q9 = P1 - P0, q11 = N0 - N1
         "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
         "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

         // scale both differences by the interpolation fraction in d2[0]
         "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
         "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

         "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

         // q8 = P0 + (P1 - P0) * lerp, q10 = N1 + (N0 - N1) * lerp
         "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
         "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

         // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
         "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
         "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
         "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
         "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

         // moving these ARM instructions before neon above seems to be slower
         "subs           %[count], %[count], #8   \n"// (1) update loop counter
         "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

         // sP used after branch (warning)
         "bne            1b                       \n"// loop

         ASSEMBLY_ACCUMULATE_MONO

         : [out]     "=Uv" (out[0]),
           [count]   "+r" (count),
           [coefsP0] "+r" (coefsP),
           [coefsN0] "+r" (coefsN),
           [coefsP1] "+r" (coefsP1),
           [coefsN1] "+r" (coefsN1),
           [sP]      "+r" (sP),
           [sN]      "+r" (sN)
         : [lerpP]   "r" (lerpP),
           [vLR]     "r" (volumeLR)
         : "cc", "memory",
           "q0", "q1", "q2", "q3",
           "q8", "q9", "q10", "q11"
     );
 #endif
 }

 // Process<2, 16> for 16-bit coefficients: stereo (CHANNELS == 2) FIR accumulation with
 // coefficient interpolation, consuming 8 frames from each side per loop pass.
 //
 //   out       int32 output; written through the [out] operand by
 //             ASSEMBLY_ACCUMULATE_STEREO (described at its definition as saving the
 //             q0/q4 accumulators as stereo out).
 //   count     decremented by 8 per pass; the loop is do-while shaped and exits only
 //             when the counter reaches exactly 0.
 //   coefsP, coefsP1 / coefsN, coefsN1
 //             coefficient pairs blended by lerpP; 8 int16 values are loaded from each
 //             per pass using a :128 alignment qualifier.
 //   sP        positive-side samples, walked backwards (32 bytes per pass).
 //   sN        negative-side samples, walked forwards (post-increment writeback).
 //   lerpP     interpolation phase; placed in d2[0], of which the .s16 multiply uses
 //             the low 16-bit lane.
 //   volumeLR  handed to the accumulate macro as [vLR].
 //
 // All pointer/count parameters are by-value copies, so the "+r" updates made by the
 // assembly are not visible to the caller.
 template <>
 inline void Process<2, 16>(int32_t* const out,
         int count,
         const int16_t* coefsP,
         const int16_t* coefsN,
         const int16_t* coefsP1,
         const int16_t* coefsN1,
         const int16_t* sP,
         const int16_t* sN,
         uint32_t lerpP,
         const int32_t* const volumeLR)
 {
 #ifdef USE_INTRINSIC
     // Intrinsic build: forward to the shared intrinsic implementation.
     ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
             lerpP, coefsP1, coefsN1);
 #else
     const int CHANNELS = 2; // template specialization does not preserve params
     const int STRIDE = 16;
     // Back up 7 frames (14 int16 values) so the 8-frame load at sP ends on the frame
     // that sP originally addressed.
     sP -= CHANNELS*((STRIDE>>1)-1);
     asm (
         "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
         "veor           q0, q0, q0               \n"// (1) acc_L = 0
         "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

         "1:                                      \n"

         // vld2 de-interleaves: q2/q5 receive left samples, q3/q6 right samples.
         "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
         "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
         "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
         "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
         "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
         "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

         // Positive side: q8 = P0 + (P1 - P0) * lerpP.
         // Negative side: q10 = N1 + (N0 - N1) * lerpP.
         "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
         "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

         "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
         "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

         // vrev64 reverses only within each 64-bit half; the halves are crossed in the
         // multiplies below (d4*d17, d5*d16) to complete the 8-element reversal.
         "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
         "vrev64.16      q3, q3                   \n"// (1) reverse 8 samples of positive right

         "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
         "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

         "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
         "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
         "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
         "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
         "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
         "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
         "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
         "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

         // moving these ARM before neon seems to be slower
         "subs           %[count], %[count], #8   \n"// (1) update loop counter
         // plain "sub" leaves the flags from "subs" intact for the branch below;
         // 32 bytes == 8 stereo int16 frames
         "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

         // sP used after branch (warning)
         "bne            1b                       \n"// loop

         // Hand the q0 (left) / q4 (right) accumulators to the stereo output macro.
         ASSEMBLY_ACCUMULATE_STEREO

         // NOTE(review): [out] is declared write-only ("=Uv"); if the accumulate macro
         // also reads the previous output value, "+Uv" would be the precise constraint
         // - confirm against the macro definition.
         : [out] "=Uv" (out[0]),
           [count] "+r" (count),
           [coefsP0] "+r" (coefsP),
           [coefsN0] "+r" (coefsN),
           [coefsP1] "+r" (coefsP1),
           [coefsN1] "+r" (coefsN1),
           [sP] "+r" (sP),
           [sN] "+r" (sN)
         : [lerpP]   "r" (lerpP),
           [vLR] "r" (volumeLR)
         // q1 holds lerpP (d2[0]); q4-q6 are the extra registers the stereo path needs.
         : "cc", "memory",
           "q0", "q1", "q2", "q3",
           "q4", "q5", "q6",
           "q8", "q9", "q10", "q11"
     );
 #endif
 }

 // ProcessL<1, 16> for 32-bit coefficients: mono (CHANNELS == 1) FIR accumulation
 // without coefficient interpolation, consuming 8 samples from each side per loop pass.
 //
 //   out       int32 output; written through the [out] operand by
 //             ASSEMBLY_ACCUMULATE_MONO (described at its definition as saving the
 //             q0 accumulator as stereo out).
 //   count     decremented by 8 per pass; the loop is do-while shaped and exits only
 //             when the counter reaches exactly 0.
 //   coefsP    positive-side coefficients, 8 int32 per pass, :128 alignment qualifier.
 //   coefsN    negative-side coefficients, 8 int32 per pass, :128 alignment qualifier.
 //   sP        positive-side samples, walked backwards (16 bytes per pass).
 //   sN        negative-side samples, walked forwards (post-increment writeback).
 //   volumeLR  handed to the accumulate macro as [vLR].
 //
 // All pointer/count parameters are by-value copies, so the "+r" updates made by the
 // assembly are not visible to the caller.
 template <>
 inline void ProcessL<1, 16>(int32_t* const out,
         int count,
         const int32_t* coefsP,
         const int32_t* coefsN,
         const int16_t* sP,
         const int16_t* sN,
         const int32_t* const volumeLR)
 {
 #ifdef USE_INTRINSIC
     // Intrinsic build: forward to the shared intrinsic implementation; the
     // interpolation arguments are unused placeholders here.
     ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
             0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
 #else
     const int CHANNELS = 1; // template specialization does not preserve params
     const int STRIDE = 16;
     // Back up 7 samples so the 8-sample load at sP ends on the sample that sP
     // originally addressed.
     sP -= CHANNELS*((STRIDE>>1)-1);
     asm (
         "veor           q0, q0, q0                    \n"// result, initialize to 0

         "1:                                           \n"

         "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
         "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
         "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
         "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

         // vrev64 reverses only within each 64-bit half; the halves are crossed in the
         // multiplies below (d4 with q9, d5 with q8) to complete the 8-element reversal.
         "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side

         "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
         "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

         "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
         "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits

         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples
         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples
         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples
         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples

         // Fold the four product vectors into the q0 accumulator (non-saturating adds).
         "vadd.s32       q0, q0, q12                   \n"// accumulate result
         "vadd.s32       q13, q13, q14                 \n"// accumulate result
         "vadd.s32       q0, q0, q15                   \n"// accumulate result
         "vadd.s32       q0, q0, q13                   \n"// accumulate result

         // 16 bytes == 8 mono int16 samples; plain "sub" does not touch the flags
         "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
         "subs           %[count], %[count], #8        \n"// update loop counter

         "bne            1b                            \n"// loop

         // Hand the q0 accumulator to the mono output macro.
         ASSEMBLY_ACCUMULATE_MONO

         // NOTE(review): [out] is declared write-only ("=Uv"); if the accumulate macro
         // also reads the previous output value, "+Uv" would be the precise constraint
         // - confirm against the macro definition.
         : [out]     "=Uv" (out[0]),
           [count]   "+r" (count),
           [coefsP0] "+r" (coefsP),
           [coefsN0] "+r" (coefsN),
           [sP]      "+r" (sP),
           [sN]      "+r" (sN)
         : [vLR]     "r" (volumeLR)
         : "cc", "memory",
           "q0", "q1", "q2", "q3",
           "q8", "q9", "q10", "q11",
           "q12", "q13", "q14", "q15"
     );
 #endif
 }

 // ProcessL<2, 16> for 32-bit coefficients: stereo (CHANNELS == 2) FIR accumulation
 // without coefficient interpolation, consuming 8 frames from each side per loop pass.
 //
 //   out       int32 output; written through the [out] operand by
 //             ASSEMBLY_ACCUMULATE_STEREO (described at its definition as saving the
 //             q0/q4 accumulators as stereo out).
 //   count     decremented by 8 per pass; the loop is do-while shaped and exits only
 //             when the counter reaches exactly 0.
 //   coefsP    positive-side coefficients, 8 int32 per pass, :128 alignment qualifier.
 //   coefsN    negative-side coefficients, 8 int32 per pass, :128 alignment qualifier.
 //   sP        positive-side samples, walked backwards (32 bytes per pass).
 //   sN        negative-side samples, walked forwards (post-increment writeback).
 //   volumeLR  handed to the accumulate macro as [vLR].
 //
 // All pointer/count parameters are by-value copies, so the "+r" updates made by the
 // assembly are not visible to the caller.
 template <>
 inline void ProcessL<2, 16>(int32_t* const out,
         int count,
         const int32_t* coefsP,
         const int32_t* coefsN,
         const int16_t* sP,
         const int16_t* sN,
         const int32_t* const volumeLR)
 {
 #ifdef USE_INTRINSIC
     // Intrinsic build: forward to the shared intrinsic implementation; the
     // interpolation arguments are unused placeholders here.
     ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
             0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
 #else
     const int CHANNELS = 2; // template specialization does not preserve params
     const int STRIDE = 16;
     // Back up 7 frames (14 int16 values) so the 8-frame load at sP ends on the frame
     // that sP originally addressed.
     sP -= CHANNELS*((STRIDE>>1)-1);
     asm (
         "veor           q0, q0, q0                    \n"// result, initialize to 0
         "veor           q4, q4, q4                    \n"// result, initialize to 0

         "1:                                           \n"

         // vld2 de-interleaves: q2/q5 receive left samples, q3/q6 right samples.
         "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames
         "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames
         "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
         "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

         // vrev64 reverses only within each 64-bit half; the halves are crossed in the
         // multiplies below (low half with q9, high half with q8) to finish the reversal.
         "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
         "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right

         // ---- left channel: widen, multiply, accumulate into q0 ----
         "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
         "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

         "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
         "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits

         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

         "vadd.s32       q0, q0, q12                   \n"// accumulate result
         "vadd.s32       q13, q13, q14                 \n"// accumulate result
         "vadd.s32       q0, q0, q15                   \n"// accumulate result
         "vadd.s32       q0, q0, q13                   \n"// accumulate result

         // ---- right channel: q12-q15 are reused as scratch, accumulate into q4 ----
         "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
         "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits

         "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
         "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits

         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

         "vadd.s32       q4, q4, q12                   \n"// accumulate result
         "vadd.s32       q13, q13, q14                 \n"// accumulate result
         "vadd.s32       q4, q4, q15                   \n"// accumulate result
         "vadd.s32       q4, q4, q13                   \n"// accumulate result

         "subs           %[count], %[count], #8        \n"// update loop counter
         // plain "sub" leaves the flags from "subs" intact for the branch below;
         // 32 bytes == 8 stereo int16 frames
         "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

         "bne            1b                            \n"// loop

         // Hand the q0 (left) / q4 (right) accumulators to the stereo output macro.
         ASSEMBLY_ACCUMULATE_STEREO

         // NOTE(review): [out] is declared write-only ("=Uv"); if the accumulate macro
         // also reads the previous output value, "+Uv" would be the precise constraint
         // - confirm against the macro definition.
         : [out]     "=Uv" (out[0]),
           [count]   "+r" (count),
           [coefsP0] "+r" (coefsP),
           [coefsN0] "+r" (coefsN),
           [sP]      "+r" (sP),
           [sN]      "+r" (sN)
         : [vLR]     "r" (volumeLR)
         : "cc", "memory",
           "q0", "q1", "q2", "q3",
           "q4", "q5", "q6",
           "q8", "q9", "q10", "q11",
           "q12", "q13", "q14", "q15"
     );
 #endif
 }

 // Process<1, 16> for 32-bit coefficients: mono (CHANNELS == 1) FIR accumulation with
 // coefficient interpolation, consuming 8 samples from each side per loop pass.
 //
 //   out       int32 output; written through the [out] operand by
 //             ASSEMBLY_ACCUMULATE_MONO (described at its definition as saving the
 //             q0 accumulator as stereo out).
 //   count     decremented by 8 per pass; the loop is do-while shaped and exits only
 //             when the counter reaches exactly 0.
 //   coefsP, coefsP1 / coefsN, coefsN1
 //             coefficient pairs blended by lerpP; 8 int32 values are loaded from each
 //             per pass using a :128 alignment qualifier.
 //   sP        positive-side samples, walked backwards (16 bytes per pass).
 //   sN        negative-side samples, walked forwards (post-increment writeback).
 //   lerpP     interpolation phase, held in d2[0] for the whole loop.
 //   volumeLR  handed to the accumulate macro as [vLR].
 //
 // All pointer/count parameters are by-value copies, so the "+r" updates made by the
 // assembly are not visible to the caller.
 template <>
 inline void Process<1, 16>(int32_t* const out,
         int count,
         const int32_t* coefsP,
         const int32_t* coefsN,
         const int32_t* coefsP1,
         const int32_t* coefsN1,
         const int16_t* sP,
         const int16_t* sN,
         uint32_t lerpP,
         const int32_t* const volumeLR)
 {
 #ifdef USE_INTRINSIC
     // Intrinsic build: forward to the shared intrinsic implementation.
     ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
             lerpP, coefsP1, coefsN1);
 #else
     const int CHANNELS = 1; // template specialization does not preserve params
     const int STRIDE = 16;
     // Back up 7 samples so the 8-sample load at sP ends on the sample that sP
     // originally addressed.
     sP -= CHANNELS*((STRIDE>>1)-1);
     asm (
         "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
         "veor           q0, q0, q0                    \n"// result, initialize to 0

         "1:                                           \n"

         "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
         "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
         "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
         "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
         "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
         "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

         // Positive side: q8/q9   = P0 + (P1 - P0) * lerpP.
         // Negative side: q10/q11 = N1 + (N0 - N1) * lerpP.
         "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
         "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
         "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
         "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

         "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
         "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
         "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
         "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

         "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
         "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
         "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
         "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

         // vrev64 reverses only within each 64-bit half; the halves are crossed in the
         // multiplies below (d4 with q9, d5 with q8) to complete the 8-element reversal.
         "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side

         // q12-q15 are free again after step3 and are reused for the widened samples.
         "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
         "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

         "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits
         "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits

         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

         // Fold the four product vectors into the q0 accumulator (non-saturating adds).
         "vadd.s32       q0, q0, q12                   \n"// accumulate result
         "vadd.s32       q13, q13, q14                 \n"// accumulate result
         "vadd.s32       q0, q0, q15                   \n"// accumulate result
         "vadd.s32       q0, q0, q13                   \n"// accumulate result

         // 16 bytes == 8 mono int16 samples; plain "sub" does not touch the flags
         "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
         "subs           %[count], %[count], #8        \n"// update loop counter

         "bne            1b                            \n"// loop

         // Hand the q0 accumulator to the mono output macro.
         ASSEMBLY_ACCUMULATE_MONO

         // NOTE(review): [out] is declared write-only ("=Uv"); if the accumulate macro
         // also reads the previous output value, "+Uv" would be the precise constraint
         // - confirm against the macro definition.
         : [out]     "=Uv" (out[0]),
           [count]   "+r" (count),
           [coefsP0] "+r" (coefsP),
           [coefsN0] "+r" (coefsN),
           [coefsP1] "+r" (coefsP1),
           [coefsN1] "+r" (coefsN1),
           [sP]      "+r" (sP),
           [sN]      "+r" (sN)
         : [lerpP]   "r" (lerpP),
           [vLR]     "r" (volumeLR)
         // q1 holds lerpP (d2[0]).
         : "cc", "memory",
           "q0", "q1", "q2", "q3",
           "q8", "q9", "q10", "q11",
           "q12", "q13", "q14", "q15"
     );
 #endif
 }

 // Process<2, 16> for 32-bit coefficients: stereo (CHANNELS == 2) FIR accumulation with
 // coefficient interpolation, consuming 8 frames from each side per loop pass.
 //
 //   out       int32 output; written through the [out] operand by
 //             ASSEMBLY_ACCUMULATE_STEREO (described at its definition as saving the
 //             q0/q4 accumulators as stereo out).
 //   count     decremented by 8 per pass; the loop is do-while shaped and exits only
 //             when the counter reaches exactly 0.
 //   coefsP, coefsP1 / coefsN, coefsN1
 //             coefficient pairs blended by lerpP; 8 int32 values are loaded from each
 //             per pass using a :128 alignment qualifier.
 //   sP        positive-side samples, walked backwards (32 bytes per pass).
 //   sN        negative-side samples, walked forwards (post-increment writeback).
 //   lerpP     interpolation phase, held in d2[0] for the whole loop.
 //   volumeLR  handed to the accumulate macro as [vLR].
 //
 // All pointer/count parameters are by-value copies, so the "+r" updates made by the
 // assembly are not visible to the caller.
 template <>
 inline void Process<2, 16>(int32_t* const out,
         int count,
         const int32_t* coefsP,
         const int32_t* coefsN,
         const int32_t* coefsP1,
         const int32_t* coefsN1,
         const int16_t* sP,
         const int16_t* sN,
         uint32_t lerpP,
         const int32_t* const volumeLR)
 {
 #ifdef USE_INTRINSIC
     // Intrinsic build: forward to the shared intrinsic implementation.
     ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
             lerpP, coefsP1, coefsN1);
 #else
     const int CHANNELS = 2; // template specialization does not preserve params
     const int STRIDE = 16;
     // Back up 7 frames (14 int16 values) so the 8-frame load at sP ends on the frame
     // that sP originally addressed.
     sP -= CHANNELS*((STRIDE>>1)-1);
     asm (
         "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
         "veor           q0, q0, q0                    \n"// result, initialize to 0
         "veor           q4, q4, q4                    \n"// result, initialize to 0

         "1:                                           \n"

         // vld2 de-interleaves: q2/q5 receive left samples, q3/q6 right samples.
         "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames
         "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames
         "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
         "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
         "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
         "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

         // Positive side: q8/q9   = P0 + (P1 - P0) * lerpP.
         // Negative side: q10/q11 = N1 + (N0 - N1) * lerpP.
         "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
         "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
         "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
         "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

         "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
         "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
         "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
         "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

         "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
         "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
         "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
         "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

         // vrev64 reverses only within each 64-bit half; the halves are crossed in the
         // multiplies below (low half with q9, high half with q8) to finish the reversal.
         "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
         "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right

         // ---- left channel: q12-q15 reused as scratch, accumulate into q0 ----
         "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
         "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

         "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
         "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits

         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

         "vadd.s32       q0, q0, q12                   \n"// accumulate result
         "vadd.s32       q13, q13, q14                 \n"// accumulate result
         "vadd.s32       q0, q0, q15                   \n"// accumulate result
         "vadd.s32       q0, q0, q13                   \n"// accumulate result

         // ---- right channel: same scratch registers, accumulate into q4 ----
         "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
         "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits

         "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
         "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits

         "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
         "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
         "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
         "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

         "vadd.s32       q4, q4, q12                   \n"// accumulate result
         "vadd.s32       q13, q13, q14                 \n"// accumulate result
         "vadd.s32       q4, q4, q15                   \n"// accumulate result
         "vadd.s32       q4, q4, q13                   \n"// accumulate result

         "subs           %[count], %[count], #8        \n"// update loop counter
         // plain "sub" leaves the flags from "subs" intact for the branch below;
         // 32 bytes == 8 stereo int16 frames
         "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

         "bne            1b                            \n"// loop

         // Hand the q0 (left) / q4 (right) accumulators to the stereo output macro.
         ASSEMBLY_ACCUMULATE_STEREO

         // NOTE(review): [out] is declared write-only ("=Uv"); if the accumulate macro
         // also reads the previous output value, "+Uv" would be the precise constraint
         // - confirm against the macro definition.
         : [out]     "=Uv" (out[0]),
           [count]   "+r" (count),
           [coefsP0] "+r" (coefsP),
           [coefsN0] "+r" (coefsN),
           [coefsP1] "+r" (coefsP1),
           [coefsN1] "+r" (coefsN1),
           [sP]      "+r" (sP),
           [sN]      "+r" (sN)
         : [lerpP]   "r" (lerpP),
           [vLR]     "r" (volumeLR)
         // q1 holds lerpP (d2[0]); q4-q6 are the extra registers the stereo path needs.
         : "cc", "memory",
           "q0", "q1", "q2", "q3",
           "q4", "q5", "q6",
           "q8", "q9", "q10", "q11",
           "q12", "q13", "q14", "q15"
     );
 #endif
 }

 // Float ProcessL<1, 16>: always routed to the NEON intrinsic implementation (there is
 // no inline-assembly variant for float). No interpolation inputs exist at this entry
 // point, so a zero phase and null secondary coefficient tables are supplied.
 template<>
 inline void ProcessL<1, 16>(float* const out,
         int count,
         const float* coefsP,
         const float* coefsN,
         const float* sP,
         const float* sN,
         const float* const volumeLR)
 {
     ProcessNeonIntrinsic<1, 16, true>(
             out, count,
             coefsP, coefsN,
             sP, sN,
             volumeLR,
             0 /*lerpP*/,
             nullptr /*coefsP1*/,
             nullptr /*coefsN1*/);
 }

 // Float ProcessL<2, 16>: always routed to the NEON intrinsic implementation (there is
 // no inline-assembly variant for float). No interpolation inputs exist at this entry
 // point, so a zero phase and null secondary coefficient tables are supplied.
 template<>
 inline void ProcessL<2, 16>(float* const out,
         int count,
         const float* coefsP,
         const float* coefsN,
         const float* sP,
         const float* sN,
         const float* const volumeLR)
 {
     ProcessNeonIntrinsic<2, 16, true>(
             out, count,
             coefsP, coefsN,
             sP, sN,
             volumeLR,
             0 /*lerpP*/,
             nullptr /*coefsP1*/,
             nullptr /*coefsN1*/);
 }

 // Float Process<1, 16>: always routed to the NEON intrinsic implementation (there is
 // no inline-assembly variant for float). Note the argument reordering: the intrinsic
 // helper takes volumeLR before lerpP and the secondary coefficient tables last.
 template<>
 inline void Process<1, 16>(float* const out,
         int count,
         const float* coefsP,
         const float* coefsN,
         const float* coefsP1,
         const float* coefsN1,
         const float* sP,
         const float* sN,
         float lerpP,
         const float* const volumeLR)
 {
     ProcessNeonIntrinsic<1, 16, false>(
             out, count,
             coefsP, coefsN,
             sP, sN,
             volumeLR,
             lerpP,
             coefsP1, coefsN1);
 }

 // Float Process<2, 16>: always routed to the NEON intrinsic implementation (there is
 // no inline-assembly variant for float). Note the argument reordering: the intrinsic
 // helper takes volumeLR before lerpP and the secondary coefficient tables last.
 template<>
 inline void Process<2, 16>(float* const out,
         int count,
         const float* coefsP,
         const float* coefsN,
         const float* coefsP1,
         const float* coefsN1,
         const float* sP,
         const float* sN,
         float lerpP,
         const float* const volumeLR)
 {
     ProcessNeonIntrinsic<2, 16, false>(
             out, count,
             coefsP, coefsN,
             sP, sN,
             volumeLR,
             lerpP,
             coefsP1, coefsN1);
 }

 #endif //USE_NEON

 } // namespace android

 #endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/