blob: 67c467281298f90182733aaa5c141dd26e7d082e [file]
/*
* Copyright (C) 2026 Ramiro Polla
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_AARCH64_OPS_IMPL_H
#define SWSCALE_AARCH64_OPS_IMPL_H
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
/**
 * Pixel element types used by the AArch64 ops backend.
 * Similar to SwsPixelType.
 *
 * The enumerators take consecutive values starting at 0, so
 * AARCH64_PIXEL_TYPE_NB equals the number of types listed before it.
 *
 * NOTE(review): the names suggest unsigned 8/16/32-bit integers and 32-bit
 * float elements; confirm against SwsPixelType.
 */
typedef enum SwsAArch64PixelType {
AARCH64_PIXEL_U8,
AARCH64_PIXEL_U16,
AARCH64_PIXEL_U32,
AARCH64_PIXEL_F32,
/* Count of the enumerators above. */
AARCH64_PIXEL_TYPE_NB,
} SwsAArch64PixelType;
/**
 * Operation kinds handled by the AArch64 ops backend.
 * Similar to SwsOpType.
 *
 * AARCH64_SWS_OP_NONE is explicitly 0 and every following enumerator takes
 * the next consecutive value, so AARCH64_SWS_OP_TYPE_NB equals the number
 * of enumerators listed before it (AARCH64_SWS_OP_NONE included).
 */
typedef enum SwsAArch64OpType {
AARCH64_SWS_OP_NONE = 0,
/* NOTE(review): PROCESS and PROCESS_RETURN are not described in this
 * header; presumably backend-specific entry/exit operations - confirm. */
AARCH64_SWS_OP_PROCESS,
AARCH64_SWS_OP_PROCESS_RETURN,
/* Read operations, one enumerator per source layout. */
AARCH64_SWS_OP_READ_BIT,
AARCH64_SWS_OP_READ_NIBBLE,
AARCH64_SWS_OP_READ_PACKED,
AARCH64_SWS_OP_READ_PLANAR,
/* Write operations, mirroring the read operations above. */
AARCH64_SWS_OP_WRITE_BIT,
AARCH64_SWS_OP_WRITE_NIBBLE,
AARCH64_SWS_OP_WRITE_PACKED,
AARCH64_SWS_OP_WRITE_PLANAR,
/* Remaining operations. */
AARCH64_SWS_OP_SWAP_BYTES,
AARCH64_SWS_OP_SWIZZLE,
AARCH64_SWS_OP_UNPACK,
AARCH64_SWS_OP_PACK,
AARCH64_SWS_OP_LSHIFT,
AARCH64_SWS_OP_RSHIFT,
AARCH64_SWS_OP_CLEAR,
AARCH64_SWS_OP_CONVERT,
AARCH64_SWS_OP_EXPAND,
AARCH64_SWS_OP_MIN,
AARCH64_SWS_OP_MAX,
AARCH64_SWS_OP_SCALE,
AARCH64_SWS_OP_LINEAR,
AARCH64_SWS_OP_DITHER,
/* Count of the enumerators above. */
AARCH64_SWS_OP_TYPE_NB,
} SwsAArch64OpType;
/**
 * Each nibble in the mask corresponds to one component.
 *
 * The 16 bits hold four nibbles; component idx (0..3) occupies bits
 * [4 * idx, 4 * idx + 3], which is how MASK_GET() and MASK_SET() address it.
 */
typedef uint16_t SwsAArch64OpMask;
/**
 * Affine coefficient mask for linear op. Packs a 4x5 matrix in execution
 * order, where the offset is the first element, with 2 bits per element:
 * 00: m[i][j] == 0
 * 01: m[i][j] == 1
 * 11: m[i][j] is any other coefficient
 *
 * Element (i, j) is stored in the two bits starting at bit 2 * (5 * i + j),
 * so 40 of the 64 bits are used. The codes above are named LINEAR_MASK_0,
 * LINEAR_MASK_1 and LINEAR_MASK_X; the code 10 has no named constant.
 */
typedef uint64_t SwsAArch64LinearOpMask;
/* Parameters specific to a linear operation. */
typedef struct SwsAArch64LinearOp {
/* Per-element classification of the 4x5 matrix; see
 * SwsAArch64LinearOpMask for the encoding. */
SwsAArch64LinearOpMask mask;
/* NOTE(review): presumably a flag selecting fused multiply-add (FMLA)
 * code generation; not established by this header - confirm. */
uint8_t fmla;
} SwsAArch64LinearOp;
/* Parameters specific to a dither operation. */
typedef struct SwsAArch64DitherOp {
/* NOTE(review): presumably a row offset into the dither matrix; units are
 * not established by this header - confirm. */
uint16_t y_offset;
/* NOTE(review): presumably log2 of the dither matrix dimension -
 * confirm. */
uint8_t size_log2;
} SwsAArch64DitherOp;
/**
 * SwsAArch64OpImplParams describes the parameters for an SwsAArch64OpType
 * operation. It consists of simplified parameters from the SwsOp structure,
 * with the purpose of being straight-forward to implement and execute.
 */
typedef struct SwsAArch64OpImplParams {
/* Which operation these parameters describe. */
SwsAArch64OpType op;
/* Per-component nibble mask; LOOP_MASK() visits the components whose
 * nibble is non-zero. */
SwsAArch64OpMask mask;
/* Pixel element type the operation works on. */
SwsAArch64PixelType type;
/* NOTE(review): presumably the number of pixels handled per block; units
 * are not established by this header - confirm. */
uint8_t block_size;
/* Operation-specific parameters. The members share storage, so only one
 * of them is meaningful for a given parameter set.
 * NOTE(review): presumably selected by 'op' (e.g. 'linear' for
 * AARCH64_SWS_OP_LINEAR) - confirm against the users of this struct. */
union {
uint8_t shift;
SwsAArch64OpMask swizzle;
SwsAArch64OpMask pack;
SwsAArch64PixelType to_type;
SwsAArch64LinearOp linear;
SwsAArch64DitherOp dither;
};
} SwsAArch64OpImplParams;
/* SwsAArch64OpMask-related helpers. */

/* Extract the 4-bit value stored for component idx (0..3). */
#define MASK_GET(mask, idx) (((mask) >> ((idx) << 2)) & 0xf)

/*
 * OR the low 4 bits of val into the nibble of component idx. Bits that are
 * already set are not cleared, so start from a zeroed nibble.
 */
#define MASK_SET(mask, idx, val) do { (mask) |= (((val) & 0xf) << ((idx) << 2)); } while (0)

/*
 * Execute the statement that follows once for every component whose nibble
 * in mask is non-zero, in ascending order. idx is declared by the macro as
 * an int that is only visible inside the loop.
 *
 * The "if (!cond) {} else" shape keeps an "else" written by the caller
 * after the loop statement from binding to the macro's hidden "if".
 */
#define LOOP(mask, idx)                                                     \
    for (int idx = 0; idx < 4; idx++)                                       \
        if (!MASK_GET(mask, idx)) {} else

/* Same as LOOP(), but visits the components in descending order. */
#define LOOP_BWD(mask, idx)                                                 \
    for (int idx = 3; idx >= 0; idx--)                                      \
        if (!MASK_GET(mask, idx)) {} else

/*
 * Variants of LOOP()/LOOP_BWD() that iterate over p->mask. p is
 * parenthesized so that any pointer expression (e.g. &params) can be passed.
 */
#define LOOP_MASK(p, idx)     LOOP((p)->mask, idx)
#define LOOP_MASK_BWD(p, idx) LOOP_BWD((p)->mask, idx)

/*
 * SwsAArch64LinearOpMask accessors: element (idx, jdx) of the 4x5 matrix
 * lives in the two bits starting at bit 2 * (5 * idx + jdx).
 * LINEAR_MASK_SET() ORs the value in and does not clear existing bits.
 */
#define LINEAR_MASK_GET(mask, idx, jdx) (((mask) >> (2 * ((5 * (idx) + (jdx))))) & 3)
#define LINEAR_MASK_SET(mask, idx, jdx, val) do {                                       \
    (mask) |= ((((SwsAArch64LinearOpMask) (val)) & 3) << (2 * ((5 * (idx) + (jdx)))));  \
} while (0)

/* 2-bit element codes stored in a SwsAArch64LinearOpMask. */
#define LINEAR_MASK_0 0 /* coefficient is 0 */
#define LINEAR_MASK_1 1 /* coefficient is 1 */
#define LINEAR_MASK_X 3 /* any other coefficient */

/*
 * Execute the statement that follows once for every matrix element
 * (idx, jdx) whose row idx is enabled in p->mask and whose code in
 * p->linear.mask is non-zero. Rows are visited in ascending order and,
 * within a row, columns 0..4 in ascending order.
 */
#define LOOP_LINEAR_MASK(p, idx, jdx)                                       \
    LOOP_MASK(p, idx)                                                       \
        for (int jdx = 0; jdx < 5; jdx++)                                   \
            if (!LINEAR_MASK_GET((p)->linear.mask, idx, jdx)) {} else
/* Compute number of vector registers needed to store all coefficients. */
static inline int linear_num_vregs(const SwsAArch64OpImplParams *params)
{
int count = 0;
LOOP_LINEAR_MASK(params, i, j)
count++;
return (count + 3) / 4;
}
/**
 * Translate a column index in execution order (0..4, offset first) into the
 * column index used on the SwsOp side.
 *
 * Index 0 maps to 4 and indices 1..4 map to 0..3.
 * NOTE(review): presumably the SwsOp matrix stores the offset in its last
 * column - confirm. idx must be within 0..4; other values index out of
 * bounds.
 */
static inline int linear_index_to_sws_op(int idx)
{
    static const int sws_op_col[5] = {
        [0] = 4,
        [1] = 0,
        [2] = 1,
        [3] = 2,
        [4] = 3,
    };

    return sws_op_col[idx];
}
/**
 * Tell whether an execution-order column index refers to the offset, which
 * is stored as the first element of each row.
 *
 * @return 1 when idx is 0, otherwise 0
 */
static inline int linear_index_is_offset(int idx)
{
    return !idx;
}
/**
 * Translate an execution-order column index into a vx index.
 *
 * Columns 1..4 map to 0..3. The offset column has no vx of its own; it is
 * mapped to 0 only so that callers never see an out-of-range value (this
 * keeps UBSan quiet).
 */
static inline int linear_index_to_vx(int idx)
{
    return linear_index_is_offset(idx) ? 0 : idx - 1;
}
/**
 * These values will be used by ops_asmgen to access fields inside of
 * SwsOpExec and SwsOpImpl. The offsets and the structure size are checked
 * below when compiling for AArch64 to make sure there is no mismatch.
 */
#define offsetof_exec_in        0
#define offsetof_exec_out       32
#define offsetof_exec_in_bump   128
#define offsetof_exec_out_bump  160
#define offsetof_impl_cont      0
#define offsetof_impl_priv      16
#define sizeof_impl             32
#if ARCH_AARCH64 && HAVE_NEON
static_assert(offsetof_exec_in       == offsetof(SwsOpExec, in),       "SwsOpExec layout mismatch");
static_assert(offsetof_exec_out      == offsetof(SwsOpExec, out),      "SwsOpExec layout mismatch");
static_assert(offsetof_exec_in_bump  == offsetof(SwsOpExec, in_bump),  "SwsOpExec layout mismatch");
static_assert(offsetof_exec_out_bump == offsetof(SwsOpExec, out_bump), "SwsOpExec layout mismatch");
static_assert(offsetof_impl_cont     == offsetof(SwsOpImpl, cont),     "SwsOpImpl layout mismatch");
static_assert(offsetof_impl_priv     == offsetof(SwsOpImpl, priv),     "SwsOpImpl layout mismatch");
/* sizeof_impl is hard-coded as well, so it needs the same guard as the
 * offsets above. */
static_assert(sizeof_impl            == sizeof(SwsOpImpl),             "SwsOpImpl layout mismatch");
#endif
#endif /* SWSCALE_AARCH64_OPS_IMPL_H */