| /* |
| * Copyright (C) 2026 Ramiro Polla |
| * |
| * This file is part of FFmpeg. |
| * |
| * FFmpeg is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * FFmpeg is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public |
| * License along with FFmpeg; if not, write to the Free Software |
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| */ |
| |
| #include "../ops_chain.h" |
| |
| #include "libavutil/avassert.h" |
| #include "libavutil/avstring.h" |
| #include "libavutil/tree.h" |
| |
| #include "ops_lookup.h" |
| |
| #include "ops_impl_conv.c" |
| |
| /*********************************************************************/ |
| typedef struct SwsAArch64BackendContext { |
| SwsContext *sws; |
| int block_size; |
| } SwsAArch64BackendContext; |
| |
| /*********************************************************************/ |
| static int aarch64_setup_linear(const SwsAArch64OpImplParams *p, |
| const SwsOp *op, SwsImplResult *res) |
| { |
| /** |
| * Compute number of full vector registers needed to pack all non-zero |
| * coefficients. |
| */ |
| const int num_vregs = linear_num_vregs(p); |
| av_assert0(num_vregs <= 4); |
| float *coeffs = av_malloc(num_vregs * 4 * sizeof(float)); |
| if (!coeffs) |
| return AVERROR(ENOMEM); |
| |
| /** |
| * Copy non-zero coefficients, reordered to match SwsAArch64LinearOpMask. |
| * The coefficients are packed in sequential order. The same order must |
| * be followed in asmgen_op_linear(). |
| */ |
| int i_coeff = 0; |
| LOOP_LINEAR_MASK(p, i, j) { |
| const int jj = linear_index_to_sws_op(j); |
| coeffs[i_coeff++] = (float) op->lin.m[i][jj].num / op->lin.m[i][jj].den; |
| } |
| |
| res->priv.ptr = coeffs; |
| res->free = ff_op_priv_free; |
| |
| return 0; |
| } |
| |
| /*********************************************************************/ |
| static int aarch64_setup_dither(const SwsAArch64OpImplParams *p, |
| const SwsOp *op, SwsImplResult *res) |
| { |
| /** |
| * The input dither matrix is (1 << size_log2)² pixels large. It is |
| * periodic, so the x and y offsets should be masked to fit inside |
| * (1 << size_log2). |
| * The width of the matrix is assumed to be at least 8, which matches |
| * the maximum block_size for aarch64 asmgen when f32 operations |
| * (i.e., dithering) are used. This guarantees that the x offset is |
| * aligned and that reading block_size elements does not extend past |
| * the end of the row. The x offset doesn't change between components, |
| * so it is only required to be masked once. |
| * The y offset, on the other hand, may change per component, and |
| * would therefore need to be masked for every y_offset value. To |
| * simplify the execution, we over-allocate the number of rows of |
| * the output dither matrix by the largest y_offset value. This way, |
| * we only need to mask y offset once, and can safely increment the |
| * dither matrix pointer by fixed offsets for every y_offset change. |
| */ |
| |
| /* Find the largest y_offset value. */ |
| const int size = 1 << op->dither.size_log2; |
| const int8_t *off = op->dither.y_offset; |
| int max_offset = 0; |
| for (int i = 0; i < 4; i++) { |
| if (off[i] >= 0) |
| max_offset = FFMAX(max_offset, off[i] & (size - 1)); |
| } |
| |
| /* Allocate (size + max_offset) rows to allow over-reading the matrix. */ |
| const int stride = size * sizeof(float); |
| const int num_rows = size + max_offset; |
| float *matrix = av_malloc(num_rows * stride); |
| if (!matrix) |
| return AVERROR(ENOMEM); |
| |
| for (int i = 0; i < size * size; i++) |
| matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den; |
| |
| memcpy(&matrix[size * size], matrix, max_offset * stride); |
| |
| res->priv.ptr = matrix; |
| res->free = ff_op_priv_free; |
| |
| return 0; |
| } |
| |
| /*********************************************************************/ |
| static int aarch64_setup(SwsOpList *ops, int block_size, int n, |
| const SwsAArch64OpImplParams *p, SwsImplResult *out) |
| { |
| SwsOp *op = &ops->ops[n]; |
| switch (op->op) { |
| case SWS_OP_READ: |
| /* Negative shift values to perform right shift using ushl. */ |
| if (op->rw.frac == 3) { |
| out->priv = (SwsOpPriv) { |
| .u8 = { |
| -7, -6, -5, -4, -3, -2, -1, 0, |
| -7, -6, -5, -4, -3, -2, -1, 0, |
| } |
| }; |
| } |
| break; |
| case SWS_OP_WRITE: |
| /* Shift values for ushl. */ |
| if (op->rw.frac == 3) { |
| out->priv = (SwsOpPriv) { |
| .u8 = { |
| 7, 6, 5, 4, 3, 2, 1, 0, |
| 7, 6, 5, 4, 3, 2, 1, 0, |
| } |
| }; |
| } |
| break; |
| case SWS_OP_CLEAR: |
| ff_sws_setup_clear(&(const SwsImplParams) { .op = op }, out); |
| break; |
| case SWS_OP_MIN: |
| case SWS_OP_MAX: |
| ff_sws_setup_clamp(&(const SwsImplParams) { .op = op }, out); |
| break; |
| case SWS_OP_SCALE: |
| ff_sws_setup_scale(&(const SwsImplParams) { .op = op }, out); |
| break; |
| case SWS_OP_LINEAR: |
| return aarch64_setup_linear(p, op, out); |
| case SWS_OP_DITHER: |
| return aarch64_setup_dither(p, op, out); |
| } |
| return 0; |
| } |
| |
| /*********************************************************************/ |
| static int aarch64_optimize(SwsAArch64BackendContext *bctx, SwsOpList *ops) |
| { |
| /* Currently, no optimization is performed. This is just a placeholder. */ |
| |
| /* Use at most two full vregs during the widest precision section */ |
| bctx->block_size = (ff_sws_op_list_max_size(ops) == 4) ? 8 : 16; |
| |
| return 0; |
| } |
| |
| /*********************************************************************/ |
| static int aarch64_compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out) |
| { |
| SwsAArch64BackendContext bctx; |
| int ret; |
| |
| const int cpu_flags = av_get_cpu_flags(); |
| if (!(cpu_flags & AV_CPU_FLAG_NEON)) |
| return AVERROR(ENOTSUP); |
| |
| /* Make on-stack copy of `ops` to iterate over */ |
| SwsOpList rest = *ops; |
| bctx.sws = ctx; |
| ret = aarch64_optimize(&bctx, &rest); |
| if (ret < 0) |
| return ret; |
| |
| SwsOpChain *chain = ff_sws_op_chain_alloc(); |
| if (!chain) |
| return AVERROR(ENOMEM); |
| chain->cpu_flags = AV_CPU_FLAG_NEON; |
| |
| *out = (SwsCompiledOp) { |
| .priv = chain, |
| .slice_align = 1, |
| .free = ff_sws_op_chain_free_cb, |
| .block_size = bctx.block_size, |
| }; |
| |
| /* Look up kernel functions. */ |
| for (int i = 0; i < rest.num_ops; i++) { |
| SwsAArch64OpImplParams params = { 0 }; |
| ret = convert_to_aarch64_impl(ctx, &rest, i, bctx.block_size, ¶ms); |
| if (ret < 0) |
| goto error; |
| SwsFuncPtr func = ff_sws_aarch64_lookup(¶ms); |
| if (!func) { |
| ret = AVERROR(ENOTSUP); |
| goto error; |
| } |
| SwsImplResult res = { 0 }; |
| ret = aarch64_setup(&rest, bctx.block_size, i, ¶ms, &res); |
| if (ret < 0) |
| goto error; |
| ret = ff_sws_op_chain_append(chain, func, res.free, &res.priv); |
| if (ret < 0) |
| goto error; |
| } |
| |
| /* Look up process/process_return functions. */ |
| const SwsOp *read = ff_sws_op_list_input(&rest); |
| const SwsOp *write = ff_sws_op_list_output(&rest); |
| const int read_planes = read ? (read->rw.packed ? 1 : read->rw.elems) : 0; |
| const int write_planes = write->rw.packed ? 1 : write->rw.elems; |
| SwsAArch64OpMask mask = 0; |
| for (int i = 0; i < FFMAX(read_planes, write_planes); i++) |
| MASK_SET(mask, i, 1); |
| |
| SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS, .mask = mask }; |
| SwsAArch64OpImplParams return_params = { .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = mask }; |
| SwsFuncPtr process_func = ff_sws_aarch64_lookup(&process_params); |
| SwsFuncPtr return_func = ff_sws_aarch64_lookup(&return_params); |
| if (!process_func || !return_func) { |
| ret = AVERROR(ENOTSUP); |
| goto error; |
| } |
| |
| ret = ff_sws_op_chain_append(chain, return_func, NULL, &(SwsOpPriv) { 0 }); |
| if (ret < 0) |
| goto error; |
| |
| out->func = (SwsOpFunc) process_func; |
| out->cpu_flags = chain->cpu_flags; |
| |
| error: |
| if (ret < 0) |
| ff_sws_op_chain_free(chain); |
| return ret; |
| } |
| |
| /*********************************************************************/ |
| const SwsOpBackend backend_aarch64 = { |
| .name = "aarch64", |
| .compile = aarch64_compile, |
| .hw_format = AV_PIX_FMT_NONE, |
| }; |