| /* |
| * Bluetooth low-complexity, subband codec (SBC) |
| * |
| * Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org> |
| * Copyright (C) 2008-2010 Nokia Corporation |
| * Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org> |
| * Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch> |
| * Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com> |
| * |
| * This file is part of FFmpeg. |
| * |
| * FFmpeg is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * FFmpeg is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with FFmpeg; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| */ |
| |
| /** |
| * @file |
| * SBC ARM NEON optimizations |
| */ |
| |
| #include "libavutil/arm/asm.S" |
| #include "neon.S" |
| |
| #define SBC_PROTO_FIXED_SCALE 16 |
| |
/*
 * SBC analysis filter, 4-subband variant.
 *
 * In:  r0 = in      int16 input samples; 40 samples (5 x 16 bytes) are
 *                   read, pointer advances as it is read (64-bit aligned)
 *      r1 = out     int32 output; 4 subband samples / 16 bytes are written
 *                   (128-bit aligned)
 *      r2 = consts  int16 FIR prototype coefficients followed by the
 *                   cosine modulation table (128-bit aligned)
 *
 * Fix: q4/q5 alias d8-d11, which are callee-saved under AAPCS, so they
 * must be preserved across the call.
 */
function ff_sbc_analyze_4_neon, export=1
        /* TODO: merge even and odd cases (or even merge all four calls to this
         * function) in order to have only aligned reads from 'in' array
         * and reduce number of load instructions */
        vpush           {d8-d11}                @ AAPCS: d8-d15 are callee-saved

        @ Polyphase FIR: widening multiply-accumulate of 40 samples against
        @ 40 coefficients, with the loads interleaved into the MACs.
        vld1.16         {d4, d5}, [r0, :64]!
        vld1.16         {d8, d9}, [r2, :128]!

        vmull.s16       q0, d4, d8
        vld1.16         {d6, d7}, [r0, :64]!
        vmull.s16       q1, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!

        vmlal.s16       q0, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q1, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q0, d4, d8
        vld1.16         {d6, d7}, [r0, :64]!
        vmlal.s16       q1, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!

        vmlal.s16       q0, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q1, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q0, d4, d8
        vmlal.s16       q1, d5, d9

        @ reduce the four 32-bit partial sums per accumulator
        vpadd.s32       d0, d0, d1
        vpadd.s32       d1, d2, d3

        @ round-shift-narrow the FIR output back to int16
        vrshrn.s32      d0, q0, SBC_PROTO_FIXED_SCALE

        @ cosine modulation: multiply the narrowed FIR results with the
        @ remaining part of the consts table
        vld1.16         {d2, d3, d4, d5}, [r2, :128]!

        vdup.i32        d1, d0[1]       /* TODO: can be eliminated */
        vdup.i32        d0, d0[0]       /* TODO: can be eliminated */

        vmull.s16       q3, d2, d0
        vmull.s16       q4, d3, d0
        vmlal.s16       q3, d4, d1
        vmlal.s16       q4, d5, d1

        vpadd.s32       d0, d6, d7      /* TODO: can be eliminated */
        vpadd.s32       d1, d8, d9      /* TODO: can be eliminated */

        vst1.32         {d0, d1}, [r1, :128]

        vpop            {d8-d11}
        bx              lr
endfunc
| |
/*
 * SBC analysis filter, 8-subband variant.
 *
 * In:  r0 = in      int16 input samples; 80 samples (10 x 16 bytes) are
 *                   read, pointer advances as it is read (64-bit aligned)
 *      r1 = out     int32 output; 8 subband samples / 32 bytes are written
 *                   (128-bit aligned)
 *      r2 = consts  int16 FIR prototype coefficients followed by the
 *                   cosine modulation table (128-bit aligned)
 *
 * Fix: q4-q7 alias d8-d15, which are callee-saved under AAPCS, so they
 * must be preserved across the call.
 */
function ff_sbc_analyze_8_neon, export=1
        /* TODO: merge even and odd cases (or even merge all four calls to this
         * function) in order to have only aligned reads from 'in' array
         * and reduce number of load instructions */
        vpush           {d8-d15}                @ AAPCS: d8-d15 are callee-saved

        @ Polyphase FIR: widening multiply-accumulate of 80 samples against
        @ 80 coefficients, four q accumulators, loads interleaved with MACs.
        vld1.16         {d4, d5}, [r0, :64]!
        vld1.16         {d8, d9}, [r2, :128]!

        vmull.s16       q6, d4, d8
        vld1.16         {d6, d7}, [r0, :64]!
        vmull.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!
        vmull.s16       q8, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmull.s16       q9, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q6, d4, d8
        vld1.16         {d6, d7}, [r0, :64]!
        vmlal.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!
        vmlal.s16       q8, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q9, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q6, d4, d8
        vld1.16         {d6, d7}, [r0, :64]!
        vmlal.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!
        vmlal.s16       q8, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q9, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q6, d4, d8
        vld1.16         {d6, d7}, [r0, :64]!
        vmlal.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!
        vmlal.s16       q8, d6, d10
        vld1.16         {d4, d5}, [r0, :64]!
        vmlal.s16       q9, d7, d11
        vld1.16         {d8, d9}, [r2, :128]!

        vmlal.s16       q6, d4, d8
        vld1.16         {d6, d7}, [r0, :64]!
        vmlal.s16       q7, d5, d9
        vld1.16         {d10, d11}, [r2, :128]!

        vmlal.s16       q8, d6, d10
        vmlal.s16       q9, d7, d11

        @ reduce the partial sums of each accumulator
        vpadd.s32       d0, d12, d13
        vpadd.s32       d1, d14, d15
        vpadd.s32       d2, d16, d17
        vpadd.s32       d3, d18, d19

        @ round-shift and narrow the FIR output back to int16
        vrshr.s32       q0, q0, SBC_PROTO_FIXED_SCALE
        vrshr.s32       q1, q1, SBC_PROTO_FIXED_SCALE
        vmovn.s32       d0, q0
        vmovn.s32       d1, q1

        vdup.i32        d3, d1[1]       /* TODO: can be eliminated */
        vdup.i32        d2, d1[0]       /* TODO: can be eliminated */
        vdup.i32        d1, d0[1]       /* TODO: can be eliminated */
        vdup.i32        d0, d0[0]       /* TODO: can be eliminated */

        @ cosine modulation: four widening MAC passes over the rest of the
        @ consts table, one per duplicated FIR result pair
        vld1.16         {d4, d5}, [r2, :128]!
        vmull.s16       q6, d4, d0
        vld1.16         {d6, d7}, [r2, :128]!
        vmull.s16       q7, d5, d0
        vmull.s16       q8, d6, d0
        vmull.s16       q9, d7, d0

        vld1.16         {d4, d5}, [r2, :128]!
        vmlal.s16       q6, d4, d1
        vld1.16         {d6, d7}, [r2, :128]!
        vmlal.s16       q7, d5, d1
        vmlal.s16       q8, d6, d1
        vmlal.s16       q9, d7, d1

        vld1.16         {d4, d5}, [r2, :128]!
        vmlal.s16       q6, d4, d2
        vld1.16         {d6, d7}, [r2, :128]!
        vmlal.s16       q7, d5, d2
        vmlal.s16       q8, d6, d2
        vmlal.s16       q9, d7, d2

        vld1.16         {d4, d5}, [r2, :128]!
        vmlal.s16       q6, d4, d3
        vld1.16         {d6, d7}, [r2, :128]!
        vmlal.s16       q7, d5, d3
        vmlal.s16       q8, d6, d3
        vmlal.s16       q9, d7, d3

        vpadd.s32       d0, d12, d13    /* TODO: can be eliminated */
        vpadd.s32       d1, d14, d15    /* TODO: can be eliminated */
        vpadd.s32       d2, d16, d17    /* TODO: can be eliminated */
        vpadd.s32       d3, d18, d19    /* TODO: can be eliminated */

        vst1.32         {d0, d1, d2, d3}, [r1, :128]

        vpop            {d8-d15}
        bx              lr
endfunc
| |
/*
 * Compute scale factors (non-joint-stereo path): for every channel and
 * subband, find the peak |sample| magnitude over all blocks and convert
 * it to a scale factor.  Four subbands and four blocks are handled per
 * NEON iteration.
 */
function ff_sbc_calc_scalefactors_neon, export=1
        @ parameters
        @ r0 = sb_sample_f
        @ r1 = scale_factor
        @ r2 = blocks
        @ r3 = channels
        @ r4 = subbands        (5th argument, passed on the stack)
        @ local variables
        @ r5 = in_loop_1
        @ r6 = in
        @ r7 = out_loop_1
        @ r8 = out
        @ r9 = ch
        @ r10 = sb
        @ r11 = inc
        @ r12 = blk

        push            {r1-r2, r4-r12}
        ldr             r4, [sp, #44]           @ 11 regs pushed = 44 bytes, so
                                                @ sp+44 is the stacked 5th arg
        mov             r11, #64                @ byte stride between blocks

        mov             r9, #0                  @ ch = 0
1:
        add             r5, r0, r9, lsl#5       @ in_loop_1  = sb_sample_f + ch*32
        add             r7, r1, r9, lsl#5       @ out_loop_1 = scale_factor + ch*32

        mov             r10, #0                 @ sb = 0
2:
        add             r6, r5, r10, lsl#2      @ in  += sb * sizeof(int32)
        add             r8, r7, r10, lsl#2      @ out += sb * sizeof(int32)
        mov             r12, r2                 @ blk = blocks

        @ q0/q1 hold running maxima; q1 is seeded with
        @ (1 << SCALE_OUT_BITS) + 1 so the final maximum can never fall
        @ below the smallest representable magnitude
        vmov.s32        q0, #0
        vmov.s32        q1, #0x8000             @ 1 << SCALE_OUT_BITS
        vmov.s32        q14, #1
        vmov.s32        q15, #16                @ 31 - SCALE_OUT_BITS
        vadd.s32        q1, q1, q14
3:
        @ accumulate |sample| maxima, four blocks per iteration
        vld1.32         {d16, d17}, [r6, :128], r11
        vabs.s32        q8, q8
        vld1.32         {d18, d19}, [r6, :128], r11
        vabs.s32        q9, q9
        vld1.32         {d20, d21}, [r6, :128], r11
        vabs.s32        q10, q10
        vld1.32         {d22, d23}, [r6, :128], r11
        vabs.s32        q11, q11
        vmax.s32        q0, q0, q8
        vmax.s32        q1, q1, q9
        vmax.s32        q0, q0, q10
        vmax.s32        q1, q1, q11
        subs            r12, r12, #4
        bgt             3b
        @ scale factor = (31 - SCALE_OUT_BITS) - clz(max - 1)
        vmax.s32        q0, q0, q1
        vsub.s32        q0, q0, q14
        vclz.s32        q0, q0
        vsub.s32        q0, q15, q0
        vst1.32         {d0, d1}, [r8, :128]

        add             r10, r10, #4            @ sb += 4
        cmp             r10, r4
        blt             2b

        add             r9, r9, #1              @ ch++
        cmp             r9, r3
        blt             1b

        pop             {r1-r2, r4-r12}
        bx              lr
endfunc
| |
| /* |
| * constants: q13 = (31 - SCALE_OUT_BITS) |
| * q14 = 1 |
| * input: q0 - ((1 << SCALE_OUT_BITS) + 1) |
| * r5 - samples for channel 0 |
 * r6 - samples for channel 1
| * output: q0, q1 - scale factors without joint stereo |
| * q2, q3 - scale factors with joint stereo |
| * q15 - joint stereo selection mask |
| */ |
.macro calc_scalefactors
        @ seed all four running maxima with the floor value passed in q0
        vmov.s32        q1, q0
        vmov.s32        q2, q0
        vmov.s32        q3, q0
        mov             r3, r2                  @ loop counter = blocks
1:
        vld1.32         {d18, d19}, [r6, :128], r11     @ q9 = ch1 samples
        vbic.s32        q11, q9, q14                    @ clear LSB of ch1 so the
                                                        @ vhadd/vhsub pair below
                                                        @ stays exactly reversible
        vld1.32         {d16, d17}, [r5, :128], r11     @ q8 = ch0 samples
        vhadd.s32       q10, q8, q11                    @ mid  = (ch0 + ch1') >> 1
        vhsub.s32       q11, q8, q11                    @ side = (ch0 - ch1') >> 1
        vabs.s32        q8, q8
        vabs.s32        q9, q9
        vabs.s32        q10, q10
        vabs.s32        q11, q11
        vmax.s32        q0, q0, q8              @ running max |ch0|
        vmax.s32        q1, q1, q9              @ running max |ch1|
        vmax.s32        q2, q2, q10             @ running max |mid|
        vmax.s32        q3, q3, q11             @ running max |side|
        subs            r3, r3, #1
        bgt             1b
        @ scale factor = (31 - SCALE_OUT_BITS) - clz(max - 1)
        vsub.s32        q0, q0, q14
        vsub.s32        q1, q1, q14
        vsub.s32        q2, q2, q14
        vsub.s32        q3, q3, q14
        vclz.s32        q0, q0
        vclz.s32        q1, q1
        vclz.s32        q2, q2
        vclz.s32        q3, q3
        vsub.s32        q0, q13, q0
        vsub.s32        q1, q13, q1
        vsub.s32        q2, q13, q2
        vsub.s32        q3, q13, q3
.endm
| |
| /* |
| * constants: q14 = 1 |
| * input: q15 - joint stereo selection mask |
| * r5 - value set by calc_scalefactors macro |
| * r6 - value set by calc_scalefactors macro |
| */ |
.macro update_joint_stereo_samples
        @ Rewrite the channel samples as mid/side wherever the q15 mask
        @ selects joint stereo.  calc_scalefactors left r5/r6 one past the
        @ last block, so the loop walks the buffers backwards; r7/r8 trail
        @ r5/r6 by one block, giving a 2-block software pipeline.
        sub             r8, r6, r11
        sub             r7, r5, r11
        sub             r6, r6, r11, asl #1
        sub             r5, r5, r11, asl #1
        @ prime the pipeline with the last two blocks
        vld1.32         {d18, d19}, [r6, :128]
        vbic.s32        q11, q9, q14            @ clear LSB of ch1 samples
        vld1.32         {d16, d17}, [r5, :128]
        vld1.32         {d2, d3}, [r8, :128]
        vbic.s32        q3, q1, q14
        vld1.32         {d0, d1}, [r7, :128]
        vhsub.s32       q10, q8, q11            @ side = (ch0 - ch1') >> 1
        vhadd.s32       q11, q8, q11            @ mid  = (ch0 + ch1') >> 1
        vhsub.s32       q2, q0, q3
        vhadd.s32       q3, q0, q3
        vbif.s32        q10, q9, q15            @ keep original where mask clear
        vbif.s32        d22, d16, d30
        sub             r11, r10, r11, asl #1   @ r10 is 0 (set by the caller),
                                                @ so r11 = -2*inc: step backwards
        sub             r3, r2, #2              @ blocks - 2 still to process
2:
        vbif.s32        d23, d17, d31
        vst1.32         {d20, d21}, [r6, :128], r11
        vbif.s32        d4, d2, d30
        vld1.32         {d18, d19}, [r6, :128]
        vbif.s32        d5, d3, d31
        vst1.32         {d22, d23}, [r5, :128], r11
        vbif.s32        d6, d0, d30
        vld1.32         {d16, d17}, [r5, :128]
        vbif.s32        d7, d1, d31
        vst1.32         {d4, d5}, [r8, :128], r11
        vbic.s32        q11, q9, q14
        vld1.32         {d2, d3}, [r8, :128]
        vst1.32         {d6, d7}, [r7, :128], r11
        vbic.s32        q3, q1, q14
        vld1.32         {d0, d1}, [r7, :128]
        vhsub.s32       q10, q8, q11
        vhadd.s32       q11, q8, q11
        vhsub.s32       q2, q0, q3
        vhadd.s32       q3, q0, q3
        vbif.s32        q10, q9, q15
        vbif.s32        d22, d16, d30
        subs            r3, r3, #2
        bgt             2b
        sub             r11, r10, r11, asr #1   @ restore r11 = inc (= -r11/2)
        @ drain the pipeline: store the two remaining blocks
        vbif.s32        d23, d17, d31
        vst1.32         {d20, d21}, [r6, :128]
        vbif.s32        q2, q1, q15
        vst1.32         {d22, d23}, [r5, :128]
        vbif.s32        q3, q0, q15
        vst1.32         {d4, d5}, [r8, :128]
        vst1.32         {d6, d7}, [r7, :128]
.endm
| |
/*
 * Compute scale factors with joint-stereo decision: for each subband pick
 * plain L/R or mid/side (whichever needs fewer bits), update the scale
 * factors and rewrite the samples accordingly, and return the per-subband
 * joint-stereo bitmask in r0.
 */
function ff_sbc_calc_scalefactors_j_neon, export=1
        @ parameters
        @ r0 = in = sb_sample_f
        @ r1 = out = scale_factor
        @ r2 = blocks
        @ r3 = subbands
        @ local variables
        @ r4 = consts = ff_sbcdsp_joint_bits_mask
        @ r5 = in0
        @ r6 = in1
        @ r7 = out0
        @ r8 = out1
        @ r10 = zero
        @ r11 = inc
        @ return r0 = joint

        push            {r3-r11}
        movrelx         r4, X(ff_sbcdsp_joint_bits_mask)
        mov             r10, #0
        mov             r11, #64                @ byte stride between blocks

        vmov.s32        q14, #1
        vmov.s32        q13, #16                @ 31 - SCALE_OUT_BITS

        cmp             r3, #4
        bne             8f

4:      @ 4 subbands: single pass over subbands 0-3
        add             r5, r0, #0              @ in0  (channel 0 samples)
        add             r6, r0, #32             @ in1  (channel 1 samples)
        add             r7, r1, #0              @ out0 (channel 0 scale factors)
        add             r8, r1, #32             @ out1 (channel 1 scale factors)
        vmov.s32        q0, #0x8000             @ 1 << SCALE_OUT_BITS
        vadd.s32        q0, q0, q14             @ floor for the running maxima

        calc_scalefactors

        @ check whether to use joint stereo for subbands 0, 1, 2
        vadd.s32        q15, q0, q1             @ sum of plain L/R scale factors
        vadd.s32        q9, q2, q3              @ sum of mid/side scale factors
        vmov.s32        d31[1], r10             @ last subband -> no joint
        vld1.32         {d16, d17}, [r4, :128]!
        vcgt.s32        q15, q15, q9            @ per-subband joint mask

        @ calculate and save to memory 'joint' variable
        @ update and save scale factors to memory
        vand.s32        q8, q8, q15
        vbit.s32        q0, q2, q15             @ take joint sf where selected
        vpadd.s32       d16, d16, d17
        vbit.s32        q1, q3, q15
        vpadd.s32       d16, d16, d16
        vst1.32         {d0, d1}, [r7, :128]
        vst1.32         {d2, d3}, [r8, :128]
        vmov.32         r0, d16[0]              @ return value: joint bitmask

        update_joint_stereo_samples
        b               9f

8:      @ 8 subbands: handle subbands 4-7 first, then 0-3
        add             r5, r0, #16
        add             r6, r0, #48
        add             r7, r1, #16
        add             r8, r1, #48
        vmov.s32        q0, #0x8000             @ 1 << SCALE_OUT_BITS
        vadd.s32        q0, q0, q14

        calc_scalefactors

        @ check whether to use joint stereo for subbands 4, 5, 6
        vadd.s32        q15, q0, q1
        vadd.s32        q9, q2, q3
        vmov.s32        d31[1], r10             @ last subband -> no joint
        vld1.32         {d16, d17}, [r4, :128]!
        vcgt.s32        q15, q15, q9

        @ calculate part of 'joint' variable and save it to d24
        @ update and save scale factors to memory
        vand.s32        q8, q8, q15
        vbit.s32        q0, q2, q15
        vpadd.s32       d16, d16, d17
        vbit.s32        q1, q3, q15
        vst1.32         {d0, d1}, [r7, :128]
        vst1.32         {d2, d3}, [r8, :128]
        vpadd.s32       d24, d16, d16           @ d24 is untouched by the macro

        update_joint_stereo_samples

        add             r5, r0, #0
        add             r6, r0, #32
        add             r7, r1, #0
        add             r8, r1, #32
        vmov.s32        q0, #0x8000             @ 1 << SCALE_OUT_BITS
        vadd.s32        q0, q0, q14

        calc_scalefactors

        @ check whether to use joint stereo for subbands 0, 1, 2, 3
        vadd.s32        q15, q0, q1
        vadd.s32        q9, q2, q3
        vld1.32         {d16, d17}, [r4, :128]!
        vcgt.s32        q15, q15, q9

        @ combine last part of 'joint' with d24 and save to memory
        @ update and save scale factors to memory
        vand.s32        q8, q8, q15
        vbit.s32        q0, q2, q15
        vpadd.s32       d16, d16, d17
        vbit.s32        q1, q3, q15
        vpadd.s32       d16, d16, d16
        vst1.32         {d0, d1}, [r7, :128]
        vadd.s32        d16, d16, d24
        vst1.32         {d2, d3}, [r8, :128]
        vmov.32         r0, d16[0]              @ return value: joint bitmask

        update_joint_stereo_samples
9:
        pop             {r3-r11}
        bx              lr
endfunc
| |
/*
 * Deinterleave and permute int16 PCM input into the X analysis buffer,
 * 4-subband variant.  The X buffer is filled backwards from 'position';
 * when it would underflow, the most recent 40 samples are first copied to
 * the top of the buffer and 'position' is reset.
 */
function ff_sbc_enc_process_input_4s_neon, export=1
        @ parameters
        @ r0 = position
        @ r1 = pcm
        @ r2 = X
        @ r3 = nsamples
        @ r4 = nchannels       (5th argument, passed on the stack)
        @ local variables
        @ r5 = ff_sbc_input_perm_4
        @ r6 = src / x
        @ r7 = dst / y
        @ the updated position is returned in r0

        push            {r1, r3-r7}
        ldr             r4, [sp, #24]           @ 6 regs pushed = 24 bytes
        movrelx         r5, X(ff_sbc_input_perm_4)

        @ handle X buffer wraparound
        cmp             r0, r3
        bge             1f                      @ if (position < nsamples)
        @ move 40 samples of X[0] from 'position' to the top of the buffer
        add             r7, r2, #576            @ &X[0][SBC_X_BUFFER_SIZE - 40]
        add             r6, r2, r0, lsl#1       @ &X[0][position]
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0}, [r6, :64]!
        vst1.16         {d0}, [r7, :64]!
        cmp             r4, #1
        ble             2f                      @ if (nchannels > 1)
        @ same for X[1]
        add             r7, r2, #1232           @ &X[1][SBC_X_BUFFER_SIZE - 40]
        add             r6, r2, #656
        add             r6, r6, r0, lsl#1       @ &X[1][position]
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0}, [r6, :64]!
        vst1.16         {d0}, [r7, :64]!
2:
        mov             r0, #288                @ SBC_X_BUFFER_SIZE - 40
1:

        add             r6, r2, r0, lsl#1       @ &X[0][position]
        add             r7, r6, #656            @ &X[1][position]

        cmp             r4, #1
        ble             8f                      @ if (nchannels > 1)
        tst             r1, #1
        beq             7f                      @ if (pcm & 1)
        @ poor 'pcm' alignment: byte loads, then vuzp to deinterleave
        vld1.8          {d0, d1}, [r5, :128]    @ input permutation table
1:
        sub             r6, r6, #16
        sub             r7, r7, #16
        sub             r0, r0, #8
        vld1.8          {d4, d5}, [r1]!
        vuzp.16         d4, d5                  @ split even/odd (L/R) samples
        vld1.8          {d20, d21}, [r1]!
        vuzp.16         d20, d21
        vswp            d5, d20
        vtbl.8          d16, {d4, d5}, d0       @ apply input permutation
        vtbl.8          d17, {d4, d5}, d1
        vtbl.8          d18, {d20, d21}, d0
        vtbl.8          d19, {d20, d21}, d1
        vst1.16         {d16, d17}, [r6, :128]
        vst1.16         {d18, d19}, [r7, :128]
        subs            r3, r3, #8
        bgt             1b
        b               9f
7:
        @ proper 'pcm' alignment: vld2 deinterleaves while loading
        vld1.8          {d0, d1}, [r5, :128]    @ input permutation table
1:
        sub             r6, r6, #16
        sub             r7, r7, #16
        sub             r0, r0, #8
        vld2.16         {d4, d5}, [r1]!
        vld2.16         {d20, d21}, [r1]!
        vswp            d5, d20
        vtbl.8          d16, {d4, d5}, d0
        vtbl.8          d17, {d4, d5}, d1
        vtbl.8          d18, {d20, d21}, d0
        vtbl.8          d19, {d20, d21}, d1
        vst1.16         {d16, d17}, [r6, :128]
        vst1.16         {d18, d19}, [r7, :128]
        subs            r3, r3, #8
        bgt             1b
        b               9f
8:
        @ mono
        vld1.8          {d0, d1}, [r5, :128]    @ input permutation table
1:
        sub             r6, r6, #16
        sub             r0, r0, #8
        vld1.8          {d4, d5}, [r1]!
        vtbl.8          d16, {d4, d5}, d0
        vtbl.8          d17, {d4, d5}, d1
        vst1.16         {d16, d17}, [r6, :128]
        subs            r3, r3, #8
        bgt             1b
9:
        pop             {r1, r3-r7}
        bx              lr
endfunc
| |
/*
 * Deinterleave and permute int16 PCM input into the X analysis buffer,
 * 8-subband variant.  The X buffer is filled backwards from 'position';
 * when it would underflow, the most recent 72 samples are first copied to
 * the top of the buffer and 'position' is reset.
 */
function ff_sbc_enc_process_input_8s_neon, export=1
        @ parameters
        @ r0 = position
        @ r1 = pcm
        @ r2 = X
        @ r3 = nsamples
        @ r4 = nchannels       (5th argument, passed on the stack)
        @ local variables
        @ r5 = ff_sbc_input_perm_8
        @ r6 = src
        @ r7 = dst
        @ the updated position is returned in r0

        push            {r1, r3-r7}
        ldr             r4, [sp, #24]           @ 6 regs pushed = 24 bytes
        movrelx         r5, X(ff_sbc_input_perm_8)

        @ handle X buffer wraparound
        cmp             r0, r3
        bge             1f                      @ if (position < nsamples)
        @ move 72 samples of X[0] from 'position' to the top of the buffer
        add             r7, r2, #512            @ &X[0][SBC_X_BUFFER_SIZE - 72]
        add             r6, r2, r0, lsl#1       @ &X[0][position]
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1}, [r6, :128]!
        vst1.16         {d0, d1}, [r7, :128]!
        cmp             r4, #1
        ble             2f                      @ if (nchannels > 1)
        @ same for X[1]
        add             r7, r2, #1168           @ &X[1][SBC_X_BUFFER_SIZE - 72]
        add             r6, r2, #656
        add             r6, r6, r0, lsl#1       @ &X[1][position]
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1, d2, d3}, [r6, :128]!
        vst1.16         {d0, d1, d2, d3}, [r7, :128]!
        vld1.16         {d0, d1}, [r6, :128]!
        vst1.16         {d0, d1}, [r7, :128]!
2:
        mov             r0, #256                @ SBC_X_BUFFER_SIZE - 72
1:

        add             r6, r2, r0, lsl#1       @ &X[0][position]
        add             r7, r6, #656            @ &X[1][position]

        cmp             r4, #1
        ble             8f                      @ if (nchannels > 1)
        tst             r1, #1
        beq             7f                      @ if (pcm & 1)
        @ poor 'pcm' alignment: byte loads, then vuzp to deinterleave
        vld1.8          {d0, d1, d2, d3}, [r5, :128]    @ input permutation table
1:
        sub             r6, r6, #32
        sub             r7, r7, #32
        sub             r0, r0, #16
        vld1.8          {d4, d5, d6, d7}, [r1]!
        vuzp.16         q2, q3                  @ split even/odd (L/R) samples
        vld1.8          {d20, d21, d22, d23}, [r1]!
        vuzp.16         q10, q11
        vswp            q3, q10
        vtbl.8          d16, {d4, d5, d6, d7}, d0       @ apply input permutation
        vtbl.8          d17, {d4, d5, d6, d7}, d1
        vtbl.8          d18, {d4, d5, d6, d7}, d2
        vtbl.8          d19, {d4, d5, d6, d7}, d3
        vst1.16         {d16, d17, d18, d19}, [r6, :128]
        vtbl.8          d16, {d20, d21, d22, d23}, d0
        vtbl.8          d17, {d20, d21, d22, d23}, d1
        vtbl.8          d18, {d20, d21, d22, d23}, d2
        vtbl.8          d19, {d20, d21, d22, d23}, d3
        vst1.16         {d16, d17, d18, d19}, [r7, :128]
        subs            r3, r3, #16
        bgt             1b
        b               9f
7:
        @ proper 'pcm' alignment: vld2 deinterleaves while loading
        vld1.8          {d0, d1, d2, d3}, [r5, :128]    @ input permutation table
1:
        sub             r6, r6, #32
        sub             r7, r7, #32
        sub             r0, r0, #16
        vld2.16         {d4, d5, d6, d7}, [r1]!
        vld2.16         {d20, d21, d22, d23}, [r1]!
        vswp            q3, q10
        vtbl.8          d16, {d4, d5, d6, d7}, d0
        vtbl.8          d17, {d4, d5, d6, d7}, d1
        vtbl.8          d18, {d4, d5, d6, d7}, d2
        vtbl.8          d19, {d4, d5, d6, d7}, d3
        vst1.16         {d16, d17, d18, d19}, [r6, :128]
        vtbl.8          d16, {d20, d21, d22, d23}, d0
        vtbl.8          d17, {d20, d21, d22, d23}, d1
        vtbl.8          d18, {d20, d21, d22, d23}, d2
        vtbl.8          d19, {d20, d21, d22, d23}, d3
        vst1.16         {d16, d17, d18, d19}, [r7, :128]
        subs            r3, r3, #16
        bgt             1b
        b               9f
8:
        @ mono
        vld1.8          {d0, d1, d2, d3}, [r5, :128]    @ input permutation table
1:
        sub             r6, r6, #32
        sub             r0, r0, #16
        vld1.8          {d4, d5, d6, d7}, [r1]!
        vtbl.8          d16, {d4, d5, d6, d7}, d0
        vtbl.8          d17, {d4, d5, d6, d7}, d1
        vtbl.8          d18, {d4, d5, d6, d7}, d2
        vtbl.8          d19, {d4, d5, d6, d7}, d3
        vst1.16         {d16, d17, d18, d19}, [r6, :128]
        subs            r3, r3, #16
        bgt             1b
9:
        pop             {r1, r3-r7}
        bx              lr
endfunc