blob: 753aef51aea7aae41f2d846d3cfe4c65ef4d0a4c [file]
/*
* Copyright (C) 2026 Ramiro Polla
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "../ops_chain.h"
#include "libavutil/avassert.h"
#include "libavutil/avstring.h"
#include "libavutil/tree.h"
#include "ops_lookup.h"
#include "ops_impl_conv.c"
/*********************************************************************/
typedef struct SwsAArch64BackendContext {
SwsContext *sws;
int block_size;
} SwsAArch64BackendContext;
/*********************************************************************/
static int aarch64_setup_linear(const SwsAArch64OpImplParams *p,
const SwsOp *op, SwsImplResult *res)
{
/**
* Compute number of full vector registers needed to pack all non-zero
* coefficients.
*/
const int num_vregs = linear_num_vregs(p);
av_assert0(num_vregs <= 4);
float *coeffs = av_malloc(num_vregs * 4 * sizeof(float));
if (!coeffs)
return AVERROR(ENOMEM);
/**
* Copy non-zero coefficients, reordered to match SwsAArch64LinearOpMask.
* The coefficients are packed in sequential order. The same order must
* be followed in asmgen_op_linear().
*/
int i_coeff = 0;
LOOP_LINEAR_MASK(p, i, j) {
const int jj = linear_index_to_sws_op(j);
coeffs[i_coeff++] = (float) op->lin.m[i][jj].num / op->lin.m[i][jj].den;
}
res->priv.ptr = coeffs;
res->free = ff_op_priv_free;
return 0;
}
/*********************************************************************/
static int aarch64_setup_dither(const SwsAArch64OpImplParams *p,
const SwsOp *op, SwsImplResult *res)
{
/**
* The input dither matrix is (1 << size_log2)² pixels large. It is
* periodic, so the x and y offsets should be masked to fit inside
* (1 << size_log2).
* The width of the matrix is assumed to be at least 8, which matches
* the maximum block_size for aarch64 asmgen when f32 operations
* (i.e., dithering) are used. This guarantees that the x offset is
* aligned and that reading block_size elements does not extend past
* the end of the row. The x offset doesn't change between components,
* so it is only required to be masked once.
* The y offset, on the other hand, may change per component, and
* would therefore need to be masked for every y_offset value. To
* simplify the execution, we over-allocate the number of rows of
* the output dither matrix by the largest y_offset value. This way,
* we only need to mask y offset once, and can safely increment the
* dither matrix pointer by fixed offsets for every y_offset change.
*/
/* Find the largest y_offset value. */
const int size = 1 << op->dither.size_log2;
const int8_t *off = op->dither.y_offset;
int max_offset = 0;
for (int i = 0; i < 4; i++) {
if (off[i] >= 0)
max_offset = FFMAX(max_offset, off[i] & (size - 1));
}
/* Allocate (size + max_offset) rows to allow over-reading the matrix. */
const int stride = size * sizeof(float);
const int num_rows = size + max_offset;
float *matrix = av_malloc(num_rows * stride);
if (!matrix)
return AVERROR(ENOMEM);
for (int i = 0; i < size * size; i++)
matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;
memcpy(&matrix[size * size], matrix, max_offset * stride);
res->priv.ptr = matrix;
res->free = ff_op_priv_free;
return 0;
}
/*********************************************************************/
static int aarch64_setup(SwsOpList *ops, int block_size, int n,
const SwsAArch64OpImplParams *p, SwsImplResult *out)
{
SwsOp *op = &ops->ops[n];
switch (op->op) {
case SWS_OP_READ:
/* Negative shift values to perform right shift using ushl. */
if (op->rw.frac == 3) {
out->priv = (SwsOpPriv) {
.u8 = {
-7, -6, -5, -4, -3, -2, -1, 0,
-7, -6, -5, -4, -3, -2, -1, 0,
}
};
}
break;
case SWS_OP_WRITE:
/* Shift values for ushl. */
if (op->rw.frac == 3) {
out->priv = (SwsOpPriv) {
.u8 = {
7, 6, 5, 4, 3, 2, 1, 0,
7, 6, 5, 4, 3, 2, 1, 0,
}
};
}
break;
case SWS_OP_CLEAR:
ff_sws_setup_clear(&(const SwsImplParams) { .op = op }, out);
break;
case SWS_OP_MIN:
case SWS_OP_MAX:
ff_sws_setup_clamp(&(const SwsImplParams) { .op = op }, out);
break;
case SWS_OP_SCALE:
ff_sws_setup_scale(&(const SwsImplParams) { .op = op }, out);
break;
case SWS_OP_LINEAR:
return aarch64_setup_linear(p, op, out);
case SWS_OP_DITHER:
return aarch64_setup_dither(p, op, out);
}
return 0;
}
/*********************************************************************/
static int aarch64_optimize(SwsAArch64BackendContext *bctx, SwsOpList *ops)
{
/* Currently, no optimization is performed. This is just a placeholder. */
/* Use at most two full vregs during the widest precision section */
bctx->block_size = (ff_sws_op_list_max_size(ops) == 4) ? 8 : 16;
return 0;
}
/*********************************************************************/
static int aarch64_compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
{
SwsAArch64BackendContext bctx;
int ret;
const int cpu_flags = av_get_cpu_flags();
if (!(cpu_flags & AV_CPU_FLAG_NEON))
return AVERROR(ENOTSUP);
/* Make on-stack copy of `ops` to iterate over */
SwsOpList rest = *ops;
bctx.sws = ctx;
ret = aarch64_optimize(&bctx, &rest);
if (ret < 0)
return ret;
SwsOpChain *chain = ff_sws_op_chain_alloc();
if (!chain)
return AVERROR(ENOMEM);
chain->cpu_flags = AV_CPU_FLAG_NEON;
*out = (SwsCompiledOp) {
.priv = chain,
.slice_align = 1,
.free = ff_sws_op_chain_free_cb,
.block_size = bctx.block_size,
};
/* Look up kernel functions. */
for (int i = 0; i < rest.num_ops; i++) {
SwsAArch64OpImplParams params = { 0 };
ret = convert_to_aarch64_impl(ctx, &rest, i, bctx.block_size, &params);
if (ret < 0)
goto error;
SwsFuncPtr func = ff_sws_aarch64_lookup(&params);
if (!func) {
ret = AVERROR(ENOTSUP);
goto error;
}
SwsImplResult res = { 0 };
ret = aarch64_setup(&rest, bctx.block_size, i, &params, &res);
if (ret < 0)
goto error;
ret = ff_sws_op_chain_append(chain, func, res.free, &res.priv);
if (ret < 0)
goto error;
}
/* Look up process/process_return functions. */
const SwsOp *read = ff_sws_op_list_input(&rest);
const SwsOp *write = ff_sws_op_list_output(&rest);
const int read_planes = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
const int write_planes = write->rw.packed ? 1 : write->rw.elems;
SwsAArch64OpMask mask = 0;
for (int i = 0; i < FFMAX(read_planes, write_planes); i++)
MASK_SET(mask, i, 1);
SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS, .mask = mask };
SwsAArch64OpImplParams return_params = { .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = mask };
SwsFuncPtr process_func = ff_sws_aarch64_lookup(&process_params);
SwsFuncPtr return_func = ff_sws_aarch64_lookup(&return_params);
if (!process_func || !return_func) {
ret = AVERROR(ENOTSUP);
goto error;
}
ret = ff_sws_op_chain_append(chain, return_func, NULL, &(SwsOpPriv) { 0 });
if (ret < 0)
goto error;
out->func = (SwsOpFunc) process_func;
out->cpu_flags = chain->cpu_flags;
error:
if (ret < 0)
ff_sws_op_chain_free(chain);
return ret;
}
/*********************************************************************/
const SwsOpBackend backend_aarch64 = {
.name = "aarch64",
.compile = aarch64_compile,
.hw_format = AV_PIX_FMT_NONE,
};