| ;****************************************************************************** |
| ;* Copyright (c) 2025 Niklas Haas |
| ;* |
| ;* This file is part of FFmpeg. |
| ;* |
| ;* FFmpeg is free software; you can redistribute it and/or |
| ;* modify it under the terms of the GNU Lesser General Public |
| ;* License as published by the Free Software Foundation; either |
| ;* version 2.1 of the License, or (at your option) any later version. |
| ;* |
| ;* FFmpeg is distributed in the hope that it will be useful, |
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| ;* Lesser General Public License for more details. |
| ;* |
| ;* You should have received a copy of the GNU Lesser General Public |
| ;* License along with FFmpeg; if not, write to the Free Software |
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| ;****************************************************************************** |
| |
| %include "ops_common.asm" |
| |
| SECTION .text |
| |
| ;--------------------------------------------------------- |
| ; Pixel type conversions |
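| ; |
| ; Each op receives a block of pixels with the four components in mx/my/mz/mw; |
| ; F32 data additionally uses the mx2/my2/mz2/mw2 pair, since a block of floats |
| ; spans two vector registers per component. See ops_common.asm for the register |
| ; naming and the LOAD_CONT / CONTINUE chaining helpers. |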
| |
| %macro conv8to32f 0 |
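| ; Widen packed U8 samples (held in the low xmm half) to F32: split the 16 |
| ; bytes into two halves, zero-extend each half to dwords and convert, giving |
| ; one ymm register pair per component. |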
| op convert_U8_F32 |
| LOAD_CONT tmp0q |
| IF X, vpsrldq xmx2, xmx, 8 |
| IF Y, vpsrldq xmy2, xmy, 8 |
| IF Z, vpsrldq xmz2, xmz, 8 |
| IF W, vpsrldq xmw2, xmw, 8 |
| IF X, pmovzxbd mx, xmx |
| IF Y, pmovzxbd my, xmy |
| IF Z, pmovzxbd mz, xmz |
| IF W, pmovzxbd mw, xmw |
| IF X, pmovzxbd mx2, xmx2 |
| IF Y, pmovzxbd my2, xmy2 |
| IF Z, pmovzxbd mz2, xmz2 |
| IF W, pmovzxbd mw2, xmw2 |
| IF X, vcvtdq2ps mx, mx |
| IF Y, vcvtdq2ps my, my |
| IF Z, vcvtdq2ps mz, mz |
| IF W, vcvtdq2ps mw, mw |
| IF X, vcvtdq2ps mx2, mx2 |
| IF Y, vcvtdq2ps my2, my2 |
| IF Z, vcvtdq2ps mz2, mz2 |
| IF W, vcvtdq2ps mw2, mw2 |
| CONTINUE tmp0q |
| %endmacro |
| |
| %macro conv16to32f 0 |
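| ; Widen packed U16 samples (one ymm per component) to F32: extract the high |
| ; lane, zero-extend both halves to dwords and convert. |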
| op convert_U16_F32 |
| LOAD_CONT tmp0q |
| IF X, vextracti128 xmx2, mx, 1 |
| IF Y, vextracti128 xmy2, my, 1 |
| IF Z, vextracti128 xmz2, mz, 1 |
| IF W, vextracti128 xmw2, mw, 1 |
| IF X, pmovzxwd mx, xmx |
| IF Y, pmovzxwd my, xmy |
| IF Z, pmovzxwd mz, xmz |
| IF W, pmovzxwd mw, xmw |
| IF X, pmovzxwd mx2, xmx2 |
| IF Y, pmovzxwd my2, xmy2 |
| IF Z, pmovzxwd mz2, xmz2 |
| IF W, pmovzxwd mw2, xmw2 |
| IF X, vcvtdq2ps mx, mx |
| IF Y, vcvtdq2ps my, my |
| IF Z, vcvtdq2ps mz, mz |
| IF W, vcvtdq2ps mw, mw |
| IF X, vcvtdq2ps mx2, mx2 |
| IF Y, vcvtdq2ps my2, my2 |
| IF Z, vcvtdq2ps mz2, mz2 |
| IF W, vcvtdq2ps mw2, mw2 |
| CONTINUE tmp0q |
| %endmacro |
| |
| %macro conv32fto8 0 |
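| ; Narrow F32 back to packed U8: truncate to int32, pack with unsigned |
| ; saturation down to words and then bytes, and shuffle dwords to undo the |
| ; in-lane interleaving of the packs; the result fits in one xmm per component. |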
| op convert_F32_U8 |
| LOAD_CONT tmp0q |
| IF X, cvttps2dq mx, mx |
| IF Y, cvttps2dq my, my |
| IF Z, cvttps2dq mz, mz |
| IF W, cvttps2dq mw, mw |
| IF X, cvttps2dq mx2, mx2 |
| IF Y, cvttps2dq my2, my2 |
| IF Z, cvttps2dq mz2, mz2 |
| IF W, cvttps2dq mw2, mw2 |
| IF X, packusdw mx, mx2 |
| IF Y, packusdw my, my2 |
| IF Z, packusdw mz, mz2 |
| IF W, packusdw mw, mw2 |
| IF X, vextracti128 xmx2, mx, 1 |
| IF Y, vextracti128 xmy2, my, 1 |
| IF Z, vextracti128 xmz2, mz, 1 |
| IF W, vextracti128 xmw2, mw, 1 |
| vzeroupper |
| IF X, packuswb xmx, xmx2 |
| IF Y, packuswb xmy, xmy2 |
| IF Z, packuswb xmz, xmz2 |
| IF W, packuswb xmw, xmw2 |
| IF X, vpshufd xmx, xmx, q3120 |
| IF Y, vpshufd xmy, xmy, q3120 |
| IF Z, vpshufd xmz, xmz, q3120 |
| IF W, vpshufd xmw, xmw, q3120 |
| CONTINUE tmp0q |
| %endmacro |
| |
| %macro conv32fto16 0 |
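| ; Narrow F32 to packed U16: truncate to int32, pack with unsigned saturation |
| ; to words, then permute qwords to undo the in-lane interleaving of packusdw. |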
| op convert_F32_U16 |
| LOAD_CONT tmp0q |
| IF X, cvttps2dq mx, mx |
| IF Y, cvttps2dq my, my |
| IF Z, cvttps2dq mz, mz |
| IF W, cvttps2dq mw, mw |
| IF X, cvttps2dq mx2, mx2 |
| IF Y, cvttps2dq my2, my2 |
| IF Z, cvttps2dq mz2, mz2 |
| IF W, cvttps2dq mw2, mw2 |
| IF X, packusdw mx, mx2 |
| IF Y, packusdw my, my2 |
| IF Z, packusdw mz, mz2 |
| IF W, packusdw mw, mw2 |
| IF X, vpermq mx, mx, q3120 |
| IF Y, vpermq my, my, q3120 |
| IF Z, vpermq mz, mz, q3120 |
| IF W, vpermq mw, mw, q3120 |
| CONTINUE tmp0q |
| %endmacro |
| |
| %macro min_max 0 |
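| ; Clamp each component against a per-component constant; the four floats are |
| ; stored consecutively in the op's private data. |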
| op min |
| IF X, vbroadcastss m8, [implq + SwsOpImpl.priv + 0] |
| IF Y, vbroadcastss m9, [implq + SwsOpImpl.priv + 4] |
| IF Z, vbroadcastss m10, [implq + SwsOpImpl.priv + 8] |
| IF W, vbroadcastss m11, [implq + SwsOpImpl.priv + 12] |
| LOAD_CONT tmp0q |
| IF X, minps mx, m8 |
| IF Y, minps my, my, m9 |
| IF Z, minps mz, mz, m10 |
| IF W, minps mw, mw, m11 |
| IF X, minps mx2, m8 |
| IF Y, minps my2, m9 |
| IF Z, minps mz2, m10 |
| IF W, minps mw2, m11 |
| CONTINUE tmp0q |
| |
| op max |
| IF X, vbroadcastss m8, [implq + SwsOpImpl.priv + 0] |
| IF Y, vbroadcastss m9, [implq + SwsOpImpl.priv + 4] |
| IF Z, vbroadcastss m10, [implq + SwsOpImpl.priv + 8] |
| IF W, vbroadcastss m11, [implq + SwsOpImpl.priv + 12] |
| LOAD_CONT tmp0q |
| IF X, maxps mx, m8 |
| IF Y, maxps my, m9 |
| IF Z, maxps mz, m10 |
| IF W, maxps mw, m11 |
| IF X, maxps mx2, m8 |
| IF Y, maxps my2, m9 |
| IF Z, maxps mz2, m10 |
| IF W, maxps mw2, m11 |
| CONTINUE tmp0q |
| %endmacro |
| |
| %macro scale 0 |
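| ; Multiply every component by a single scalar constant from the private data. |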
| op scale |
| vbroadcastss m8, [implq + SwsOpImpl.priv] |
| LOAD_CONT tmp0q |
| IF X, mulps mx, m8 |
| IF Y, mulps my, m8 |
| IF Z, mulps mz, m8 |
| IF W, mulps mw, m8 |
| IF X, mulps mx2, m8 |
| IF Y, mulps my2, m8 |
| IF Z, mulps mz2, m8 |
| IF W, mulps mw2, m8 |
| CONTINUE tmp0q |
| %endmacro |
| |
| %macro load_dither_row 5 ; size_log2, y, addr, out, out2 |
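| ; Wrap the row index (%2) to the matrix height, scale it to a byte offset |
| ; (each row holds (1 << size_log2) floats) and load the row: 16-byte rows are |
| ; broadcast across both lanes, larger rows use full loads (two registers when |
| ; a row exceeds mmsize bytes). |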
| lea tmp0q, %2 |
| and tmp0q, (1 << %1) - 1 |
| shl tmp0q, %1+2 |
| %if %1 == 2 |
| VBROADCASTI128 %4, [%3 + tmp0q] |
| %else |
| mova %4, [%3 + tmp0q] |
| %if (4 << %1) > mmsize |
| mova %5, [%3 + tmp0q + mmsize] |
| %endif |
| %endif |
| %endmacro |
| |
| %macro dither 1 ; size_log2 |
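| ; Add one row of a (1 << size_log2) x (1 << size_log2) float dither matrix to |
| ; every component. Size 0 reduces to a single constant and size 1 to a 2x2 |
| ; matrix stored inline in the private data; larger sizes load rows from the |
| ; matrix pointed to by the private data, using a different row offset per |
| ; component. |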
| op dither%1 |
| %define DX m8 |
| %define DY m9 |
| %define DZ m10 |
| %define DW m11 |
| %define DX2 DX |
| %define DY2 DY |
| %define DZ2 DZ |
| %define DW2 DW |
| %if %1 == 0 |
| ; constant offset for all channels |
| vbroadcastss DX, [implq + SwsOpImpl.priv] |
| %define DY DX |
| %define DZ DX |
| %define DW DX |
| %elif %1 == 1 |
| ; 2x2 matrix, only the parity of y matters |
| mov tmp0d, yd |
| and tmp0d, 1 |
| shl tmp0d, 3 |
| %if X || Z |
| ; dither matrix is stored directly in the private data |
| vbroadcastsd DX, [implq + SwsOpImpl.priv + tmp0q] |
| %endif |
| %if Y || W |
| xor tmp0d, 8 |
| vbroadcastsd DY, [implq + SwsOpImpl.priv + tmp0q] |
| %endif |
| %define DZ DX |
| %define DW DY |
| %else |
| ; matrix is at least 4x4, load all four channels with custom offset |
| %if (4 << %1) > mmsize |
| %define DX2 m12 |
| %define DY2 m13 |
| %define DZ2 m14 |
| %define DW2 m15 |
| %endif |
| ; dither matrix is stored indirectly at the private data address |
| mov tmp1q, [implq + SwsOpImpl.priv] |
| %if (4 << %1) > 2 * mmsize |
| ; need to add in x offset |
| mov tmp0d, bxd |
| shl tmp0d, 6 ; sizeof(float[16]) |
| and tmp0d, (4 << %1) - 1 |
| add tmp1q, tmp0q |
| %endif |
| IF X, load_dither_row %1, [yd + 0], tmp1q, DX, DX2 |
| IF Y, load_dither_row %1, [yd + 3], tmp1q, DY, DY2 |
| IF Z, load_dither_row %1, [yd + 2], tmp1q, DZ, DZ2 |
| IF W, load_dither_row %1, [yd + 5], tmp1q, DW, DW2 |
| %endif |
| LOAD_CONT tmp0q |
| IF X, addps mx, DX |
| IF Y, addps my, DY |
| IF Z, addps mz, DZ |
| IF W, addps mw, DW |
| IF X, addps mx2, DX2 |
| IF Y, addps my2, DY2 |
| IF Z, addps mz2, DZ2 |
| IF W, addps mw2, DW2 |
| CONTINUE tmp0q |
| %endmacro |
| |
| %macro dither_fns 0 |
| dither 0 |
| dither 1 |
| dither 2 |
| dither 3 |
| dither 4 |
| dither 5 |
| dither 6 |
| dither 7 |
| dither 8 |
| %endmacro |
| |
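| ; Linear op masks: the mask is a 4x5 bit matrix in which bit (5 * I + J) means |
| ; that output row I uses matrix column J, with column 4 holding the constant |
| ; offset. For example, MASK_LUMA = MASK(0, 0) | MASK_OFF(0) = 0x01 | 0x10 = 0x11. |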
| %xdefine MASK(I, J) (1 << (5 * (I) + (J))) |
| %xdefine MASK_OFF(I) MASK(I, 4) |
| %xdefine MASK_ROW(I) (0x1F << (5 * (I))) |
| %xdefine MASK_COL(J) (0x8421 << (J)) |
| %xdefine MASK_ALL ((1 << 20) - 1) |
| %xdefine MASK_LUMA (MASK(0, 0) | MASK_OFF(0)) |
| %xdefine MASK_ALPHA (MASK(3, 3) | MASK_OFF(3)) |
| %xdefine MASK_DIAG3 (MASK(0, 0) | MASK(1, 1) | MASK(2, 2)) |
| %xdefine MASK_OFF3 (MASK_OFF(0) | MASK_OFF(1) | MASK_OFF(2)) |
| %xdefine MASK_MAT3 (MASK(0, 0) | MASK(0, 1) | MASK(0, 2) |\ |
|                     MASK(1, 0) | MASK(1, 1) | MASK(1, 2) |\ |
|                     MASK(2, 0) | MASK(2, 1) | MASK(2, 2)) |
| %xdefine MASK_DIAG4 (MASK_DIAG3 | MASK(3, 3)) |
| %xdefine MASK_OFF4 (MASK_OFF3 | MASK_OFF(3)) |
| %xdefine MASK_MAT4 (MASK_ALL & ~MASK_OFF4) |
| |
| %macro linear_row 7 ; res, x, y, z, w, row, mask |
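| ; Compute a single output row of the matrix multiply: |
| ;   %1 = offset + c0 * x + c1 * y + c2 * z + c3 * w |
| ; loading only the weights enabled by the mask. The matrix is stored as rows of |
| ; five floats (20 bytes apart), with the offset in the fifth slot. A missing |
| ; diagonal weight acts as an implicit 1, i.e. the corresponding input is added |
| ; (or reused) directly. |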
| %define COL(J) ((%7) & MASK(%6, J)) ; true if the mask contains column J of this row |
| %define NOP(J) (J == %6 && !COL(J)) ; true if input component J passes through untouched |
| |
| ; load weights |
| IF COL(0), vbroadcastss m12, [tmp0q + %6 * 20 + 0] |
| IF COL(1), vbroadcastss m13, [tmp0q + %6 * 20 + 4] |
| IF COL(2), vbroadcastss m14, [tmp0q + %6 * 20 + 8] |
| IF COL(3), vbroadcastss m15, [tmp0q + %6 * 20 + 12] |
| |
| ; initialize result vector as appropriate |
| %if COL(4) ; offset |
| vbroadcastss %1, [tmp0q + %6 * 20 + 16] |
| %elif NOP(0) |
| ; directly reuse first component vector if possible |
| mova %1, %2 |
| %else |
| xorps %1, %1 |
| %endif |
| |
| IF COL(0), mulps m12, %2 |
| IF COL(1), mulps m13, %3 |
| IF COL(2), mulps m14, %4 |
| IF COL(3), mulps m15, %5 |
| IF COL(0), addps %1, m12 |
| IF NOP(0) && COL(4), addps %1, %2 ; first vector was not reused |
| IF COL(1), addps %1, m13 |
| IF NOP(1), addps %1, %3 |
| IF COL(2), addps %1, m14 |
| IF NOP(2), addps %1, %4 |
| IF COL(3), addps %1, m15 |
| IF NOP(3), addps %1, %5 |
| %endmacro |
| |
| %macro linear_inner 5 ; x, y, z, w, mask |
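| ; Apply linear_row to every row enabled in the mask, accumulating into m8-m11 |
| ; first so that later rows still see the unmodified inputs. |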
| %define ROW(I) ((%5) & MASK_ROW(I)) |
| IF1 ROW(0), linear_row m8, %1, %2, %3, %4, 0, %5 |
| IF1 ROW(1), linear_row m9, %1, %2, %3, %4, 1, %5 |
| IF1 ROW(2), linear_row m10, %1, %2, %3, %4, 2, %5 |
| IF1 ROW(3), linear_row m11, %1, %2, %3, %4, 3, %5 |
| IF ROW(0), mova %1, m8 |
| IF ROW(1), mova %2, m9 |
| IF ROW(2), mova %3, m10 |
| IF ROW(3), mova %4, m11 |
| %endmacro |
| |
| %macro linear_mask 2 ; name, mask |
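| ; Instantiate a linear op specialized for a fixed mask; the matrix address is |
| ; read from the private data and shared by both register halves. |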
| op %1 |
| mov tmp0q, [implq + SwsOpImpl.priv] ; address of matrix |
| linear_inner mx, my, mz, mw, %2 |
| linear_inner mx2, my2, mz2, mw2, %2 |
| CONTINUE |
| %endmacro |
| |
| ; specialized functions for very simple cases |
| %macro linear_dot3 0 |
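| ; x = c0 * x + c1 * y + c2 * z, leaving the remaining components untouched |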
| op dot3 |
| mov tmp0q, [implq + SwsOpImpl.priv] |
| vbroadcastss m12, [tmp0q + 0] |
| vbroadcastss m13, [tmp0q + 4] |
| vbroadcastss m14, [tmp0q + 8] |
| LOAD_CONT tmp0q |
| mulps mx, m12 |
| mulps m8, my, m13 |
| mulps m9, mz, m14 |
| addps mx, m8 |
| addps mx, m9 |
| mulps mx2, m12 |
| mulps m10, my2, m13 |
| mulps m11, mz2, m14 |
| addps mx2, m10 |
| addps mx2, m11 |
| CONTINUE tmp0q |
| %endmacro |
| |
| %macro linear_fns 0 |
| linear_dot3 |
| linear_mask luma, MASK_LUMA |
| linear_mask alpha, MASK_ALPHA |
| linear_mask lumalpha, MASK_LUMA | MASK_ALPHA |
| linear_mask row0, MASK_ROW(0) |
| linear_mask row0a, MASK_ROW(0) | MASK_ALPHA |
| linear_mask diag3, MASK_DIAG3 |
| linear_mask diag4, MASK_DIAG4 |
| linear_mask diagoff3, MASK_DIAG3 | MASK_OFF3 |
| linear_mask matrix3, MASK_MAT3 |
| linear_mask affine3, MASK_MAT3 | MASK_OFF3 |
| linear_mask affine3a, MASK_MAT3 | MASK_OFF3 | MASK_ALPHA |
| linear_mask matrix4, MASK_MAT4 |
| linear_mask affine4, MASK_MAT4 | MASK_OFF4 |
| %endmacro |
| |
| INIT_YMM avx2 |
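| ; decl_common_patterns (see ops_common.asm) is expected to emit one variant of |
| ; each op per commonly used combination of active components |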
| decl_common_patterns conv8to32f |
| decl_common_patterns conv16to32f |
| decl_common_patterns conv32fto8 |
| decl_common_patterns conv32fto16 |
| decl_common_patterns min_max |
| decl_common_patterns scale |
| decl_common_patterns dither_fns |
| linear_fns |