| ;****************************************************************************** |
| ;* Copyright (c) 2025 Niklas Haas |
| ;* |
| ;* This file is part of FFmpeg. |
| ;* |
| ;* FFmpeg is free software; you can redistribute it and/or |
| ;* modify it under the terms of the GNU Lesser General Public |
| ;* License as published by the Free Software Foundation; either |
| ;* version 2.1 of the License, or (at your option) any later version. |
| ;* |
| ;* FFmpeg is distributed in the hope that it will be useful, |
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| ;* Lesser General Public License for more details. |
| ;* |
| ;* You should have received a copy of the GNU Lesser General Public |
| ;* License along with FFmpeg; if not, write to the Free Software |
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| ;****************************************************************************** |
| |
| %include "libavutil/x86/x86util.asm" |
| |
| ; High-level explanation of how the x86 backend works: |
| ; |
| ; sws_processN is the shared entry point for all operation chains. This |
| ; function is responsible for the block loop, as well as initializing the |
| ; plane pointers. It will jump directly into the first operation kernel, |
| ; and each operation kernel will jump directly into the next one, with the |
| ; final kernel jumping back into the sws_process return point. (See label |
| ; `sws_process.return` in ops_int.asm) |
| ; |
| ; To handle the jump back to the return point, we append an extra address |
| ; corresponding to the correct sws_process.return label into the SwsOpChain, |
| ; and have the WRITE kernel jump into it as usual. (See the FINISH macro) |
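;
; Conceptually, a chain of three operations (kernel names invented for this
; illustration) executes as a series of direct jumps:
;
;   sws_process -> read kernel -> convert kernel -> write kernel
;                                                        |
;   sws_process.return <---------------------------------+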
| ; |
| ; Inside an operation chain, we use a custom calling convention to preserve |
| ; registers between kernels. The exact register allocation is found further |
| ; below in this file, but we basically reserve (and share) the following |
| ; registers: |
| ; |
| ; - const execq (read-only, shared execution data, see SwsOpExec); stores the |
| ; static metadata for this call and describes the image layouts |
| ; |
| ; - implq (read-only, operation chain, see SwsOpChain); stores the private data |
| ; for each operation as well as the pointer to the next kernel in the sequence. |
| ; This register is automatically incremented by the CONTINUE macro, and will |
| ; be reset back to the first operation kernel by sws_process. |
| ; |
; - bxd, yd: current block and line number, used as loop counters in sws_process.
| ; Also used by e.g. the dithering code to do position-dependent dithering. |
| ; |
| ; - tmp0, tmp1: two temporary registers which are NOT preserved between kernels |
| ; |
| ; - inNq, outNq: plane pointers. These are incremented automatically after the |
| ; corresponding read/write operation, by the read/write kernels themselves. |
| ; sws_process will take care of resetting these to the next line after the |
| ; block loop is done. |
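;
; As a concrete sketch (illustrative only; not part of any real kernel), a
; read kernel for plane 0 might contain:
;
;   movu m8, [in0q]                       ; load one register's worth of pixels
;   add in0q, mmsize                      ; advance the plane pointer
;   mov tmp0d, [execq + SwsOpExec.width]  ; consult the shared metadata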
| ; |
; Additionally, we pass data between kernels by keeping it directly inside
; vector registers. For this, we reserve the following registers:
| ; |
| ; - mx, my, mz, mw: low half of the X, Y, Z and W components |
| ; - mx2, my2, mz2, mw2: high half of the X, Y, Z and W components |
| ; (As well as sized variants for xmx, ymx, etc.) |
| ; |
| ; The "high half" registers are only sometimes used; in order to enable |
| ; processing more pixels at the same time. See `decl_v2` below, which allows |
| ; assembling the same operation twice, once with only the lower half (V2=0), |
| ; and once with both halves (V2=1). The remaining vectors are free for use |
| ; inside operation kernels, starting from m8. |
| ; |
; The basic rule is that the kernel handling the largest element size within
; a pixel chain always uses the full set of vector registers. For example,
| ; if we load 8-bit values and convert them to 32-bit floats internally, then |
| ; we would have an operation chain which combines an SSE4 V2=0 u8 kernel (128 |
| ; bits = 16 pixels) with an AVX2 V2=1 f32 kernel (512 bits = 16 pixels). This |
| ; keeps the number of pixels being processed (the block size) constant. The |
| ; V2 setting is suffixed to the operation name (_m1 or _m2) during name |
| ; mangling. |
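;
; Spelling out the arithmetic of this example: the V2=0 u8 kernel touches
; 1 x 128 bits = 16 bytes = 16 pixels per component, while the V2=1 f32
; kernel touches 2 x 256 bits = 64 bytes = 16 floats, so both advance by
; the same 16 pixels per iteration.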
| ; |
| ; This design leaves us with the following set of possibilities: |
| ; |
| ; SSE4: |
| ; - max element is 32-bit: currently unsupported |
| ; - max element is 16-bit: currently unsupported |
| ; - max element is 8-bit: block size 32, u8_m2_sse4 |
| ; |
| ; AVX2: |
| ; - max element is 32-bit: block size 16, u32_m2_avx2, u16_m1_avx2, u8_m1_sse4 |
| ; - max element is 16-bit: block size 32, u16_m2_avx2, u8_m1_avx2 |
| ; - max element is 8-bit: block size 64, u8_m2_avx2 |
| ; |
| ; Meaning we need to cover the following code paths for each bit depth: |
| ; |
| ; - 8-bit kernels: m1_sse4, m2_sse4, m1_avx2, m2_avx2 |
| ; - 16-bit kernels: m1_avx2, m2_avx2 |
| ; - 32-bit kernels: m2_avx2 |
| ; |
| ; This is achieved by macro'ing each operation kernel and declaring it once |
| ; per SIMD version, and (if needed) once per V2 setting using decl_v2. (See |
| ; the bottom of ops_int.asm for an example) |
| ; |
; Finally, we overload some operation kernels to different numbers of
; components, using the `decl_pattern` and `decl_common_patterns` macros.
; Inside these kernels, each of the variables X, Y, Z and W will be set to
; 0 or 1, depending on which components are active for this particular kernel
; instance. The active pattern is encoded as a pXYZW_ prefix during name
; mangling.
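;
; For instance, a kernel assembled with X=1 and Y=Z=W=0 (i.e. luma only)
; receives the p1000_ prefix.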
| |
| struc SwsOpExec |
| .in0 resq 1 |
| .in1 resq 1 |
| .in2 resq 1 |
| .in3 resq 1 |
| .out0 resq 1 |
| .out1 resq 1 |
| .out2 resq 1 |
| .out3 resq 1 |
| .in_stride0 resq 1 |
| .in_stride1 resq 1 |
| .in_stride2 resq 1 |
| .in_stride3 resq 1 |
| .out_stride0 resq 1 |
| .out_stride1 resq 1 |
| .out_stride2 resq 1 |
| .out_stride3 resq 1 |
| .in_bump0 resq 1 |
| .in_bump1 resq 1 |
| .in_bump2 resq 1 |
| .in_bump3 resq 1 |
| .out_bump0 resq 1 |
| .out_bump1 resq 1 |
| .out_bump2 resq 1 |
| .out_bump3 resq 1 |
| .width resd 1 |
| .height resd 1 |
| .slice_y resd 1 |
| .slice_h resd 1 |
| .block_size_in resd 1 |
| .block_size_out resd 1 |
| endstruc |
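
; All fields are addressed relative to execq, e.g. (illustrative):
;
;   mov tmp0d, [execq + SwsOpExec.slice_h]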
| |
| struc SwsOpImpl |
| .cont resb 16 |
| .priv resb 16 |
| .next resb 0 |
| endstruc |
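
; implq points at the current operation's SwsOpImpl entry: .cont holds the
; next kernel's address and .priv the operation's private data. An
; illustrative access pattern (this is what LOAD_CONT and CONTINUE below
; expand to, plus a private data load):
;
;   mov tmp0q, [implq + SwsOpImpl.cont]  ; next kernel's address
;   movu xm8, [implq + SwsOpImpl.priv]   ; per-op constants
;   add implq, SwsOpImpl.next            ; step to the following entry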
| |
| ;--------------------------------------------------------- |
| ; Common macros for declaring operations |
| |
| ; Declare an operation kernel with the correct name mangling. |
| %macro op 1 ; name |
| %ifdef X |
| %define ADD_PAT(name) p %+ X %+ Y %+ Z %+ W %+ _ %+ name |
| %else |
| %define ADD_PAT(name) name |
| %endif |
| |
| %ifdef V2 |
| %if V2 |
| %define ADD_MUL(name) name %+ _m2 |
| %else |
| %define ADD_MUL(name) name %+ _m1 |
| %endif |
| %else |
| %define ADD_MUL(name) name |
| %endif |
| |
| cglobal ADD_PAT(ADD_MUL(%1)), 0, 0, 0 ; already allocated by entry point |
| |
| %undef ADD_PAT |
| %undef ADD_MUL |
| %endmacro |
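
; Illustrative use (kernel name invented): with X=1, Y=1, Z=1, W=0 and V2=1
; in scope under INIT_YMM avx2, `op convert` declares p1110_convert_m2, which
; cglobal further mangles into the final p1110_convert_m2_avx2 symbol (plus
; the usual private prefix).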
| |
; Declare an operation kernel with an explicit V2 setting; invoke once per
; V2 value needed
| %macro decl_v2 2+ ; v2, func |
| %xdefine V2 %1 |
| %2 |
| %undef V2 |
| %endmacro |
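
; Illustrative use (kernel macro name invented):
;
;   decl_v2 0, my_kernel ; assembles the V2=0 variant (_m1)
;   decl_v2 1, my_kernel ; assembles the V2=1 variant (_m2)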
| |
| ; Declare an operation kernel specialized to a given subset of active components |
| %macro decl_pattern 5+ ; X, Y, Z, W, func |
| %xdefine X %1 |
| %xdefine Y %2 |
| %xdefine Z %3 |
| %xdefine W %4 |
| %5 |
| %undef X |
| %undef Y |
| %undef Z |
| %undef W |
| %endmacro |
| |
| ; Declare an operation kernel specialized to each common component pattern |
| %macro decl_common_patterns 1+ ; func |
| decl_pattern 1, 0, 0, 0, %1 ; y |
| decl_pattern 1, 0, 0, 1, %1 ; ya |
| decl_pattern 1, 1, 1, 0, %1 ; yuv |
| decl_pattern 1, 1, 1, 1, %1 ; yuva |
| %endmacro |
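
; E.g. `decl_common_patterns my_kernel` (name invented) assembles the p1000_,
; p1001_, p1110_ and p1111_ variants of my_kernel.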
| |
| ;--------------------------------------------------------- |
| ; Common names for the internal calling convention |
| %define mx m0 |
| %define my m1 |
| %define mz m2 |
| %define mw m3 |
| |
| %define xmx xm0 |
| %define xmy xm1 |
| %define xmz xm2 |
| %define xmw xm3 |
| |
| %define ymx ym0 |
| %define ymy ym1 |
| %define ymz ym2 |
| %define ymw ym3 |
| |
| %define mx2 m4 |
| %define my2 m5 |
| %define mz2 m6 |
| %define mw2 m7 |
| |
| %define xmx2 xm4 |
| %define xmy2 xm5 |
| %define xmz2 xm6 |
| %define xmw2 xm7 |
| |
| %define ymx2 ym4 |
| %define ymy2 ym5 |
| %define ymz2 ym6 |
| %define ymw2 ym7 |
| |
| ; Reserved in this order by the signature of SwsOpFunc |
| %define execq r0q |
| %define implq r1q |
| %define bxd r2d |
| %define yd r3d |
| |
| ; Extra registers for free use by kernels, not saved between ops |
| %define tmp0q r4q |
| %define tmp1q r5q |
| |
| %define tmp0d r4d |
| %define tmp1d r5d |
| |
| ; Registers for plane pointers; put at the end (and in ascending plane order) |
| ; so that we can avoid reserving them when not necessary |
| %define out0q r6q |
| %define in0q r7q |
| %define out1q r8q |
| %define in1q r9q |
| %define out2q r10q |
| %define in2q r11q |
| %define out3q r12q |
| %define in3q r13q |
| |
| ;--------------------------------------------------------- |
| ; Common macros for linking together different kernels |
| |
| ; Load the next operation kernel's address to a register |
| %macro LOAD_CONT 1 ; reg |
| mov %1, [implq + SwsOpImpl.cont] |
| %endmacro |
| |
| ; Tail call into the next operation kernel, given that kernel's address |
| %macro CONTINUE 1 ; reg |
| add implq, SwsOpImpl.next |
| jmp %1 |
| annotate_function_size |
| %endmacro |
| |
| ; Convenience macro to load and continue to the next kernel in one step |
| %macro CONTINUE 0 |
| LOAD_CONT tmp0q |
| CONTINUE tmp0q |
| %endmacro |
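
; A typical kernel body (illustrative sketch; the operation itself is made
; up) loads its continuation, does its work, then tail calls onwards:
;
;   op invert
;       LOAD_CONT tmp0q       ; fetch the next kernel's address
;       pxor mx, m8           ; ... operation body ...
;       IF V2, pxor mx2, m8
;       CONTINUE tmp0q        ; bump implq and tail call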
| |
; Final macro to end the operation chain, used by WRITE kernels to jump back
; to the process function's return point. Very similar to CONTINUE, but skips
; incrementing the implq pointer, and also zeroes the upper YMM halves
; (vzeroupper) to avoid phantom dependencies between loop iterations.
| %macro FINISH 1 ; reg |
| %if vzeroupper_required |
| ; we may jump back into an SSE read, so always zero upper regs here |
| vzeroupper |
| %endif |
| jmp %1 |
| annotate_function_size |
| %endmacro |
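
; A WRITE kernel therefore ends with (illustrative):
;
;   LOAD_CONT tmp0q ; the appended final .cont entry: sws_process.return
;   FINISH tmp0q    ; no implq bump; sws_process resets it anyway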
| |
| ; Helper for inline conditionals; used to conditionally include single lines |
| %macro IF 2+ ; cond, body |
| %if %1 |
| %2 |
| %endif |
| %endmacro |
| |
; Alternate name for nested usage (NASM macros cannot invoke themselves)
| %macro IF1 2+ |
| %if %1 |
| %2 |
| %endif |
| %endmacro |
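
; E.g. (illustrative): `IF X, mova [out0q], mx` only emits the store when the
; X component is active. Nesting needs the alternate name, as in:
;
;   IF V2, IF1 X, mova [out0q + mmsize], mx2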