| ; /* |
| ; * Provide SIMD DMVR SAD functions for VVC decoding |
| ; * |
| ; * Copyright (c) 2024 Stone Chen |
| ; * |
| ; * This file is part of FFmpeg. |
| ; * |
| ; * FFmpeg is free software; you can redistribute it and/or |
| ; * modify it under the terms of the GNU Lesser General Public |
| ; * License as published by the Free Software Foundation; either |
| ; * version 2.1 of the License, or (at your option) any later version. |
| ; * |
| ; * FFmpeg is distributed in the hope that it will be useful, |
| ; * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| ; * Lesser General Public License for more details. |
| ; * |
| ; * You should have received a copy of the GNU Lesser General Public |
| ; * License along with FFmpeg; if not, write to the Free Software |
| ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| ; */ |
| |
| %include "libavutil/x86/x86util.asm" |
; Layout of the sample buffers walked by the SAD kernel below.
%define MAX_PB_SIZE 128         ; distance between consecutive rows, in 16-bit samples
%define ROWS        2           ; vertical step: only every ROWS-th row is summed

SECTION_RODATA

; A pair of 16-bit ones. Broadcast to a full vector and used as the pmaddwd
; multiplier, it adds each pair of adjacent words into one 32-bit lane.
pw_1: dw 1, 1

; DMVR SAD is only calculated on even rows to reduce complexity.
; Additionally, the only valid sizes are 8x16, 16x8, and 16x16.
SECTION .text
| |
; MIN_MAX_SAD a, b, tmp
;   Per-word absolute difference of two vectors of unsigned 16-bit values,
;   computed as max(a, b) - min(a, b).
;
;   %1 (a)   in: first operand    out: |a - b| for every word
;   %2 (b)   in: second operand   out: unchanged
;   %3 (tmp) scratch              out: min(a, b) for every word
;
;   The order matters: the minimum has to be taken before %1 is overwritten
;   with the maximum.
%macro MIN_MAX_SAD 3
    pminuw      %3, %2, %1      ; tmp = min(b, a)
    pmaxuw      %1, %2, %1      ; a   = max(b, a)
    psubusw     %1, %1, %3      ; a   = max - min (never saturates, max >= min)
%endmacro
| |
; HORIZ_ADD dst, acc_lo, acc
;   Reduce the eight 32-bit lanes of a ymm accumulator to a single sum.
;
;   %1 (dst)    xmm register; on exit its low dword holds lane0 + ... + lane7,
;               the upper dwords hold intermediate values
;   %2 (acc_lo) xmm register that must alias the low 128 bits of %3; it is
;               reused as scratch, so the accumulator is destroyed
;   %3 (acc)    ymm accumulator with eight dword partial sums (lanes 7..0)
;
;   Lane diagrams below list dwords from most to least significant;
;   '-' marks a lane whose content is not used afterwards.
%macro HORIZ_ADD 3
    vextracti128    %1, %3, q0001   ; dst =    7         6         5         4
    paddd           %1, %1, %2      ; dst =  (7+3)     (6+2)     (5+1)     (4+0)
    pshufd          %2, %1, q0032   ; tmp =    -         -       (7+3)     (6+2)
    paddd           %1, %1, %2      ; dst =    -         -     (5+1+7+3) (4+0+6+2)
    pshufd          %2, %1, q0001   ; tmp =    -         -         -     (5+1+7+3)
    paddd           %1, %1, %2      ; dst =    -         -         -     (0+...+7)
%endmacro
| |
%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL

INIT_YMM avx2

;------------------------------------------------------------------------------
; vvc_sad(src1, src2, dx, dy, block_w, block_h) -> eax
;
; Sum of absolute differences between two blocks of unsigned 16-bit samples,
; visiting only every ROWS-th row.
;
; In:
;   src1, src2  base pointers of the two sample buffers; consecutive rows are
;               MAX_PB_SIZE samples (MAX_PB_SIZE * 2 bytes) apart
;   dx, dy      32-bit signed displacement, sign-extended on entry. With
;               dx' = dx - 2 and dy' = dy - 2 the blocks start at
;                   src1 + (2 + dy') * MAX_PB_SIZE + 2 + dx'
;                   src2 + (2 - dy') * MAX_PB_SIZE + 2 - dx'
;               (in samples), i.e. the two blocks move in opposite directions.
;   block_w     block width in samples. Widths >= 16 are processed in chunks
;               of 16 samples (block_w >> 4 chunks per row, so the width is
;               expected to be a multiple of 16); smaller widths take the
;               8-sample path, which always reads exactly 8 samples per row.
;   block_h     block height in rows. The 8-sample path consumes 4 rows per
;               iteration, the 16-sample path 2 rows per iteration.
; Out:
;   eax         the accumulated SAD
;
; Vector registers:
;   m0, m1      samples loaded from src1 / src2 (m1 becomes |m0 - m1|)
;   m2          scratch for MIN_MAX_SAD
;   m3          eight 32-bit partial sums
;   m4          all words = 1 (pmaddwd multiplier)
;
; NOTE(review): pmaddwd multiplies signed words, so every per-sample absolute
; difference is assumed to stay below 0x8000 -- confirm against the sample
; range produced by the caller.
;------------------------------------------------------------------------------
cglobal vvc_sad, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
    movsxdifnidn        dxq, dxd
    movsxdifnidn        dyq, dyd

    ; Displacements arrive biased by +2; remove the bias.
    sub                 dxq, 2
    sub                 dyq, 2

    ; Row offsets: src1 moves with dy, src2 against it.
    mov               off1q, 2
    mov               off2q, 2
    add               off1q, dyq            ; off1 = 2 + dy'
    sub               off2q, dyq            ; off2 = 2 - dy'

    ; Rows -> samples: multiply by MAX_PB_SIZE (128 == 1 << 7).
    shl               off1q, 7
    shl               off2q, 7

    ; Column offsets: src1 moves with dx, src2 against it.
    add               off1q, dxq
    sub               off2q, dxq

    ; Samples -> bytes (* 2), plus the constant 2-sample column offset (2 * 2).
    lea               src1q, [src1q + off1q * 2 + 2 * 2]
    lea               src2q, [src2q + off2q * 2 + 2 * 2]

    pxor                 m3, m3             ; clear the partial sums
    vpbroadcastd         m4, [pw_1]

    ; Every width of at least 16 samples goes through the chunked path, which
    ; already iterates over block_w / 16 chunks per row. Sending only the
    ; exact width 16 there would leave wider blocks in the 8-sample path,
    ; where only the first 8 columns of each row are summed.
    cmp            block_wd, 16
    jge                .w16

    ; ---- 8 samples per row ------------------------------------------------
    ; One ymm holds two sampled rows: row r in the low lane and row r + ROWS
    ; in the high lane, so each iteration covers 2 * ROWS rows of the block.
.w8_loop_height:
    movu                xm0, [src1q]
    vinserti128          m0, m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
    movu                xm1, [src2q]
    vinserti128          m1, m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1

    MIN_MAX_SAD          m1, m0, m2         ; m1 = |m0 - m1| per word
    pmaddwd              m1, m4             ; add adjacent words into dwords
    paddd                m3, m1

    add               src1q, 2 * MAX_PB_SIZE * ROWS * 2
    add               src2q, 2 * MAX_PB_SIZE * ROWS * 2

    sub            block_hd, 4
    jg      .w8_loop_height

    HORIZ_ADD           xm0, xm3, m3
    movd                eax, xm0
    RET

    ; ---- 16 samples per chunk, block_w / 16 chunks per row -----------------
.w16:
    sar            block_wd, 4              ; block_w = number of 16-sample chunks
.w16_loop_height:
    mov               off1q, src1q          ; remember the start of this row
    mov               off2q, src2q
    mov            row_idxd, block_wd       ; chunks left in this row

.w16_loop_width:
    movu                 m0, [src1q]
    movu                 m1, [src2q]
    MIN_MAX_SAD          m1, m0, m2         ; m1 = |m0 - m1| per word
    pmaddwd              m1, m4             ; add adjacent words into dwords
    paddd                m3, m1

    add               src1q, 32             ; next chunk: 16 samples * 2 bytes
    add               src2q, 32
    dec            row_idxd
    jg      .w16_loop_width

    ; Step from the saved row start to the next sampled row.
    lea               src1q, [off1q + ROWS * MAX_PB_SIZE * 2]
    lea               src2q, [off2q + ROWS * MAX_PB_SIZE * 2]

    sub            block_hd, 2
    jg     .w16_loop_height

    HORIZ_ADD           xm0, xm3, m3
    movd                eax, xm0
    RET

%endif
%endif