blob: 44e85510834a37278f97f9ecedfc3fa23ed1d310 [file]
/**
* Copyright (C) 2026 Niklas Haas
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <libavutil/bswap.h>
#include "uops_tmpl.h"
/*
 * Per-instantiation type configuration, selected by BIT_DEPTH and IS_FLOAT.
 * BIT_DEPTH falls back to 8 when the including file did not define it.
 *
 *   pixel_t    - element type the kernels below operate on
 *   inter_t    - accumulator type used by the horizontal filter
 *   PX / px    - type tag; `PX` is handed to SWS_FOR(), `px` is used as a
 *                member name (e.g. impl->priv.px, block.px)
 *   PIXEL_MAX  - largest representable value; integer types only
 *   PIXEL_SWAP - byte swap helper; 16 and 32 bit integer types only
 *
 * PIXEL_MAX and PIXEL_SWAP also act as feature gates: kernels that need
 * them are wrapped in #ifdef further down.
 *
 * NOTE(review): PIXEL_TYPE is only defined in the float branch and is not
 * referenced anywhere else in this file - confirm it is still needed.
 */
#ifndef BIT_DEPTH
# define BIT_DEPTH 8
#endif
#if IS_FLOAT && BIT_DEPTH == 32
# define PIXEL_TYPE SWS_PIXEL_F32
# define pixel_t float
# define inter_t float
# define PX F32
# define px f32
#elif BIT_DEPTH == 32
# define PIXEL_MAX 0xFFFFFFFFu
# define PIXEL_SWAP av_bswap32
# define pixel_t uint32_t
# define inter_t int64_t
# define PX U32
# define px u32
#elif BIT_DEPTH == 16
# define PIXEL_MAX 0xFFFFu
# define PIXEL_SWAP av_bswap16
# define pixel_t uint16_t
# define inter_t int64_t
# define PX U16
# define px u16
#elif BIT_DEPTH == 8
# define PIXEL_MAX 0xFFu
# define pixel_t uint8_t
# define inter_t int32_t
# define PX U8
# define px u8
#else
# error Invalid BIT_DEPTH
#endif
/*********************************
* Generic read/write operations *
*********************************/
/**
 * Read one block from up to four separate planes.
 *
 * Every component enabled through `mask` is copied element by element from
 * its input plane (in0..in3) into the matching working buffer (x/y/z/w).
 * The iter->in[] pointer of each consumed plane is then advanced by
 * SIZEOF_BLOCK before the continuation is invoked.
 */
DECL_READ(read_planar, const SwsCompMask mask)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = in0[n];
        if (Y)
            y[n] = in1[n];
        if (Z)
            z[n] = in2[n];
        if (W)
            w[n] = in3[n];
    }

    /* Only planes that were actually read get their pointer moved */
    if (X)
        iter->in[0] += SIZEOF_BLOCK;
    if (Y)
        iter->in[1] += SIZEOF_BLOCK;
    if (Z)
        iter->in[2] += SIZEOF_BLOCK;
    if (W)
        iter->in[3] += SIZEOF_BLOCK;

    CONTINUE(x, y, z, w);
}
/**
 * Read one block of interleaved (packed) pixels from the first input plane.
 *
 * The number of interleaved components is derived from the highest enabled
 * component in `mask`; component k of pixel n lives at in0[comps * n + k].
 * iter->in[0] is advanced by SIZEOF_BLOCK times that component count.
 */
DECL_READ(read_packed, const SwsCompMask mask)
{
    const int num_comps = W ? 4 : Z ? 3 : Y ? 2 : 1;

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        const int base = num_comps * n; /* first component of pixel n */
        if (X)
            x[n] = in0[base + 0];
        if (Y)
            y[n] = in0[base + 1];
        if (Z)
            z[n] = in0[base + 2];
        if (W)
            w[n] = in0[base + 3];
    }

    iter->in[0] += SIZEOF_BLOCK * num_comps;
    CONTINUE(x, y, z, w);
}
/**
 * Write one block to up to four separate planes.
 *
 * Mirror image of read_planar: each component enabled through `mask` is
 * stored element by element into its output plane (out0..out3), and the
 * iter->out[] pointer of every written plane is advanced by SIZEOF_BLOCK.
 * No continuation is invoked from here.
 */
DECL_WRITE(write_planar, const SwsCompMask mask)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            out0[n] = x[n];
        if (Y)
            out1[n] = y[n];
        if (Z)
            out2[n] = z[n];
        if (W)
            out3[n] = w[n];
    }

    if (X)
        iter->out[0] += SIZEOF_BLOCK;
    if (Y)
        iter->out[1] += SIZEOF_BLOCK;
    if (Z)
        iter->out[2] += SIZEOF_BLOCK;
    if (W)
        iter->out[3] += SIZEOF_BLOCK;
}
/**
 * Write one block of interleaved (packed) pixels to the first output plane.
 *
 * Mirror image of read_packed: the component count follows from the highest
 * enabled component in `mask`, component k of pixel n is stored at
 * out0[comps * n + k], and iter->out[0] is advanced by SIZEOF_BLOCK times
 * the component count.
 */
DECL_WRITE(write_packed, const SwsCompMask mask)
{
    const int num_comps = W ? 4 : Z ? 3 : Y ? 2 : 1;

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        const int base = num_comps * n; /* first component of pixel n */
        if (X)
            out0[base + 0] = x[n];
        if (Y)
            out0[base + 1] = y[n];
        if (Z)
            out0[base + 2] = z[n];
        if (W)
            out0[base + 3] = w[n];
    }

    iter->out[0] += SIZEOF_BLOCK * num_comps;
}
#if BIT_DEPTH == 8
/**
 * Read one block of 1-bit samples (8 per input byte, most significant bit
 * first) and expand each bit into its own element of `x`, as 0 or 1.
 *
 * Only valid for a single-component mask. iter->in[0] is advanced by one
 * eighth of SIZEOF_BLOCK.
 */
DECL_READ(read_bit, const SwsCompMask mask)
{
    av_assert2(mask == SWS_COMP_ELEMS(1));

    SWS_LOOP
    for (int base = 0; base < SWS_BLOCK_SIZE; base += 8) {
        const pixel_t packed = ((const pixel_t *) in0)[base >> 3];

        /* Bit 7 of the byte is the first pixel, bit 0 the last */
        for (int bit = 0; bit < 8; bit++)
            x[base + bit] = (packed >> (7 - bit)) & 1;
    }

    iter->in[0] += SIZEOF_BLOCK >> 3;
    CONTINUE(x, y, z, w);
}
/**
 * Read one block of 4-bit samples (two per input byte, high nibble first)
 * and expand each nibble into its own element of `x`.
 *
 * Only valid for a single-component mask. iter->in[0] is advanced by half
 * of SIZEOF_BLOCK.
 */
DECL_READ(read_nibble, const SwsCompMask mask)
{
    av_assert2(mask == SWS_COMP_ELEMS(1));

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n += 2) {
        const pixel_t pair = in0[n >> 1];
        x[n]     = pair >> 4;  /* first pixel: upper four bits */
        x[n + 1] = pair & 0xF; /* second pixel: lower four bits */
    }

    iter->in[0] += SIZEOF_BLOCK >> 1;
    CONTINUE(x, y, z, w);
}
/**
 * Write one block of 1-bit samples, packing eight consecutive elements of
 * `x` into each output byte with the first element in the most significant
 * bit.
 *
 * Only valid for a single-component mask. The elements are shifted and
 * OR-ed together without masking. iter->out[0] is advanced by one eighth of
 * SIZEOF_BLOCK.
 */
DECL_WRITE(write_bit, const SwsCompMask mask)
{
    av_assert2(mask == SWS_COMP_ELEMS(1));

    SWS_LOOP
    for (int base = 0; base < SWS_BLOCK_SIZE; base += 8) {
        int packed = 0;
        for (int bit = 0; bit < 8; bit++)
            packed |= x[base + bit] << (7 - bit);
        out0[base >> 3] = packed;
    }

    iter->out[0] += SIZEOF_BLOCK >> 3;
}
/**
 * Write one block of 4-bit samples, packing two consecutive elements of `x`
 * into each output byte with the first element in the high nibble.
 *
 * Only valid for a single-component mask. The elements are combined without
 * masking. iter->out[0] is advanced by half of SIZEOF_BLOCK.
 */
DECL_WRITE(write_nibble, const SwsCompMask mask)
{
    av_assert2(mask == SWS_COMP_ELEMS(1));

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n += 2) {
        const int hi = x[n] << 4;
        const int lo = x[n + 1];
        out0[n >> 1] = hi | lo;
    }

    iter->out[0] += SIZEOF_BLOCK >> 1;
}
#endif /* BIT_DEPTH == 8 */
/*
 * Instantiate the read/write kernels above for pixel type PX and emit the
 * matching table entries. SWS_FOR, SWS_FOR_STRUCT, DECL_IMPL_READ,
 * DECL_IMPL_WRITE and DECL_ENTRY are not defined in this file (presumably
 * they come from uops_tmpl.h).
 *
 * The NIBBLE and BIT variants are listed unconditionally even though their
 * kernels only exist when BIT_DEPTH == 8, so these lines presumably expand
 * to nothing for the other pixel types - TODO confirm.
 */
SWS_FOR(PX, READ_PLANAR, DECL_IMPL_READ, read_planar)
SWS_FOR(PX, READ_PACKED, DECL_IMPL_READ, read_packed)
SWS_FOR(PX, READ_NIBBLE, DECL_IMPL_READ, read_nibble)
SWS_FOR(PX, READ_BIT, DECL_IMPL_READ, read_bit)
SWS_FOR(PX, WRITE_PLANAR, DECL_IMPL_WRITE, write_planar)
SWS_FOR(PX, WRITE_PACKED, DECL_IMPL_WRITE, write_packed)
SWS_FOR(PX, WRITE_NIBBLE, DECL_IMPL_WRITE, write_nibble)
SWS_FOR(PX, WRITE_BIT, DECL_IMPL_WRITE, write_bit)
SWS_FOR_STRUCT(PX, READ_PLANAR, DECL_ENTRY)
SWS_FOR_STRUCT(PX, READ_PACKED, DECL_ENTRY)
SWS_FOR_STRUCT(PX, READ_NIBBLE, DECL_ENTRY)
SWS_FOR_STRUCT(PX, READ_BIT, DECL_ENTRY)
SWS_FOR_STRUCT(PX, WRITE_PLANAR, DECL_ENTRY)
SWS_FOR_STRUCT(PX, WRITE_PACKED, DECL_ENTRY)
SWS_FOR_STRUCT(PX, WRITE_NIBBLE, DECL_ENTRY)
SWS_FOR_STRUCT(PX, WRITE_BIT, DECL_ENTRY)
/*****************************
* Scaling / filtering reads *
*****************************/
/**
 * Setup hook for the vertical filter read.
 *
 * Converts the integer filter weights to float once, dividing each by
 * SWS_FILTER_SCALE, so the kernel does not have to. The converted array is
 * stored in out->priv.ptr (released through ff_op_priv_free) and the filter
 * size in out->priv.i32[2].
 *
 * @return 0 on success, AVERROR(ENOTSUP) if the filter type is not
 *         SWS_PIXEL_F32, AVERROR(ENOMEM) if the allocation fails
 */
DECL_SETUP(setup_filter_v, params, out)
{
    /* priv.i32[2] is stored next to priv.ptr, so the pointer must not
     * reach past the first two int32 slots */
    static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]),
                  ">8 byte pointers not supported");

    if (params->uop->par.filter.type != SWS_PIXEL_F32)
        return AVERROR(ENOTSUP);

    const SwsFilterWeights *kernel = params->uop->data.kernel;

    /* Pre-convert weights to float */
    float *coeffs = av_calloc(kernel->num_weights, sizeof(float));
    if (!coeffs)
        return AVERROR(ENOMEM);

    for (int n = 0; n < kernel->num_weights; n++)
        coeffs[n] = (float) kernel->weights[n] / SWS_FILTER_SCALE;

    out->priv.ptr    = coeffs;
    out->priv.i32[2] = kernel->filter_size;
    out->free        = ff_op_priv_free;
    return 0;
}
/**
 * Fully general vertical planar filter case.
 *
 * impl->priv.ptr holds float weights and impl->priv.i32[2] the filter size;
 * the weights for the current output line start at filter_size * iter->y.
 * For every enabled component the kernel accumulates, tap by tap, the
 * weighted samples of consecutive input lines (stepping by
 * exec->in_stride[]) into a zero-initialized float block. Afterwards the
 * iter->in[] pointers of the enabled planes are advanced by SIZEOF_BLOCK.
 */
DECL_READ(read_planar_fv, const SwsCompMask mask, const SwsPixelType type)
{
    av_assert2(type == SWS_PIXEL_F32);

    const SwsOpExec *exec = iter->exec;
    const int num_taps = impl->priv.i32[2];
    const float *restrict taps = impl->priv.ptr;
    taps += num_taps * iter->y;

    block_t acc_x, acc_y, acc_z, acc_w;
    if (X)
        memset(&acc_x.f32, 0, sizeof(acc_x.f32));
    if (Y)
        memset(&acc_y.f32, 0, sizeof(acc_y.f32));
    if (Z)
        memset(&acc_z.f32, 0, sizeof(acc_z.f32));
    if (W)
        memset(&acc_w.f32, 0, sizeof(acc_w.f32));

    for (int t = 0; t < num_taps; t++) {
        const float coef = taps[t];

        SWS_LOOP
        for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
            if (X)
                acc_x.f32[n] += coef * in0[n];
            if (Y)
                acc_y.f32[n] += coef * in1[n];
            if (Z)
                acc_z.f32[n] += coef * in2[n];
            if (W)
                acc_w.f32[n] += coef * in3[n];
        }

        /* Move the local plane pointers on to the next input line */
        if (X)
            in0 = bump_ptr(in0, exec->in_stride[0]);
        if (Y)
            in1 = bump_ptr(in1, exec->in_stride[1]);
        if (Z)
            in2 = bump_ptr(in2, exec->in_stride[2]);
        if (W)
            in3 = bump_ptr(in3, exec->in_stride[3]);
    }

    if (X)
        iter->in[0] += SIZEOF_BLOCK;
    if (Y)
        iter->in[1] += SIZEOF_BLOCK;
    if (Z)
        iter->in[2] += SIZEOF_BLOCK;
    if (W)
        iter->in[3] += SIZEOF_BLOCK;

    CONTINUE(&acc_x, &acc_y, &acc_z, &acc_w);
}
DECL_SETUP(setup_filter_h, params, out)
{
if (params->uop->par.filter.type != SWS_PIXEL_F32)
return AVERROR(ENOTSUP);
SwsFilterWeights *filter = params->uop->data.kernel;
out->priv.ptr = av_refstruct_ref(filter->weights);
out->priv.i32[2] = filter->filter_size;
out->free = ff_op_priv_unref;
return 0;
}
/**
 * Fully general horizontal planar filter case.
 *
 * impl->priv.ptr holds the integer filter weights and impl->priv.i32[2] the
 * filter size; the weights of output pixel n start at filter_size * n. For
 * each output pixel of the block, the first input sample is located via
 * exec->in_offset_x[], the weighted sum over filter_size consecutive samples
 * is accumulated in inter_t, and the result is converted to float and
 * multiplied by 1 / SWS_FILTER_SCALE.
 *
 * The iter->in[] pointers are not advanced here; every pixel is addressed
 * through in_offset_x[] relative to in0..in3.
 */
DECL_READ(read_planar_fh, const SwsCompMask mask, const SwsPixelType type)
{
    av_assert2(type == SWS_PIXEL_F32);

    const SwsOpExec *exec = iter->exec;
    const int *restrict weights = impl->priv.ptr;
    const int filter_size = impl->priv.i32[2];
    const float scale = 1.0f / SWS_FILTER_SCALE;
    const int xpos = iter->x;
    weights += filter_size * xpos;

    block_t xs, ys, zs, ws;
    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
        const int offset = exec->in_offset_x[xpos + i];
        const pixel_t *start0 = bump_ptr(in0, offset);
        const pixel_t *start1 = bump_ptr(in1, offset);
        const pixel_t *start2 = bump_ptr(in2, offset);
        const pixel_t *start3 = bump_ptr(in3, offset);

        inter_t sx = 0, sy = 0, sz = 0, sw = 0;
        for (int j = 0; j < filter_size; j++) {
            /* Widen the weight *before* multiplying. An `int` weight times
             * a uint32_t sample would otherwise be evaluated in unsigned
             * 32-bit arithmetic (wrapping, and wrong for negative weights),
             * and int times uint16_t in plain `int`, which can overflow. */
            const inter_t weight = weights[j];
            if (X) sx += weight * start0[j];
            if (Y) sy += weight * start1[j];
            if (Z) sz += weight * start2[j];
            if (W) sw += weight * start3[j];
        }

        if (X) xs.f32[i] = (float) sx * scale;
        if (Y) ys.f32[i] = (float) sy * scale;
        if (Z) zs.f32[i] = (float) sz * scale;
        if (W) ws.f32[i] = (float) sw * scale;

        weights += filter_size; /* weights of the next output pixel */
    }

    CONTINUE(&xs, &ys, &zs, &ws);
}
/*
 * Instantiate the filtering reads for pixel type PX. Their table entries
 * additionally register the setup hook that prepares the weights each
 * kernel expects in impl->priv.
 */
SWS_FOR(PX, READ_PLANAR_FV, DECL_IMPL_READ, read_planar_fv)
SWS_FOR(PX, READ_PLANAR_FH, DECL_IMPL_READ, read_planar_fh)
SWS_FOR_STRUCT(PX, READ_PLANAR_FV, DECL_ENTRY, .setup = fn(setup_filter_v) )
SWS_FOR_STRUCT(PX, READ_PLANAR_FH, DECL_ENTRY, .setup = fn(setup_filter_h) )
/***************************
* Permutation and copying *
***************************/
/*
 * Permute by directly swapping the order of arguments to the continuation.
 *
 * Generates a function NAME##_c that touches no pixel data at all: it just
 * forwards its four block pointers to the continuation in the order given
 * by IDX0..IDX3 (each IDXn is pasted onto `in`, so it must be a literal
 * 0..3). DUMMY, TYPE, UOP and MASK are accepted but unused here.
 */
#define DECL_PERMUTE(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3) \
static void NAME##_c(SwsOpIter *restrict iter, \
const SwsOpImpl *restrict impl, \
void *restrict in0, void *restrict in1, \
void *restrict in2, void *restrict in3) \
{ \
CONTINUE(in##IDX0, in##IDX1, in##IDX2, in##IDX3); \
}
/*
 * Copying counterpart of DECL_PERMUTE.
 *
 * Generates a function NAME##_c in which every component enabled in MASK is
 * memcpy'd (SIZEOF_BLOCK bytes) from the input selected by IDXn into a
 * local block, and that local block is what gets handed to the
 * continuation. Components not enabled in MASK are passed through as the
 * original in0..in3 pointer in their own position.
 *
 * Presumably used where a plain permutation would hand the same block to
 * the continuation more than once - TODO confirm against the op generator.
 */
#define DECL_COPY(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3) \
static void NAME##_c(SwsOpIter *restrict iter, \
const SwsOpImpl *restrict impl, \
void *restrict in0, void *restrict in1, \
void *restrict in2, void *restrict in3) \
{ \
const SwsCompMask mask = (MASK); \
block_t x, y, z, w; \
\
if (X) memcpy(&x.px, in##IDX0, SIZEOF_BLOCK); \
if (Y) memcpy(&y.px, in##IDX1, SIZEOF_BLOCK); \
if (Z) memcpy(&z.px, in##IDX2, SIZEOF_BLOCK); \
if (W) memcpy(&w.px, in##IDX3, SIZEOF_BLOCK); \
\
CONTINUE(X ? &x : in0, Y ? &y : in1, Z ? &z : in2, W ? &w : in3); \
}
/*
 * Generate the permute/copy functions and their table entries for pixel
 * type PX. No kernel name is passed to SWS_FOR here: DECL_PERMUTE and
 * DECL_COPY emit complete functions themselves.
 */
SWS_FOR(PX, PERMUTE, DECL_PERMUTE)
SWS_FOR(PX, COPY, DECL_COPY)
SWS_FOR_STRUCT(PX, PERMUTE, DECL_ENTRY)
SWS_FOR_STRUCT(PX, COPY, DECL_ENTRY)
/*********************
* Format conversion *
*********************/
/*
 * DECL_CAST(DST, dst) defines the kernel to_<dst>, which converts every
 * enabled component from the current pixel type to the `dst` member type of
 * block_t using a plain C assignment (i.e. the usual implicit conversion),
 * writes the result into fresh local blocks and passes those on. It also
 * instantiates the kernel and its table entry for the TO_<DST> op.
 *
 * Blocks of components that are not enabled are passed on uninitialized.
 */
#define DECL_CAST(DST, dst) \
DECL_FUNC(to_##dst, const SwsCompMask mask) \
{ \
block_t xx, yy, zz, ww; \
\
SWS_LOOP \
for (int i = 0; i < SWS_BLOCK_SIZE; i++) { \
if (X) xx.dst[i] = x[i]; \
if (Y) yy.dst[i] = y[i]; \
if (Z) zz.dst[i] = z[i]; \
if (W) ww.dst[i] = w[i]; \
} \
\
CONTINUE(&xx, &yy, &zz, &ww); \
} \
\
SWS_FOR(PX, TO_##DST, DECL_IMPL, to_##dst) \
SWS_FOR_STRUCT(PX, TO_##DST, DECL_ENTRY)
/* One conversion kernel per destination pixel type */
DECL_CAST(U8, u8)
DECL_CAST(U16, u16)
DECL_CAST(U32, u32)
DECL_CAST(F32, f32)
/********************
* Bit manipulation *
********************/
#if !IS_FLOAT
/**
 * Shift every enabled component left by `amount` bits, in place.
 * Integer pixel types only.
 */
DECL_FUNC(lshift, const SwsCompMask mask, const uint8_t amount)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] <<= amount;
        if (Y)
            y[n] <<= amount;
        if (Z)
            z[n] <<= amount;
        if (W)
            w[n] <<= amount;
    }

    CONTINUE(x, y, z, w);
}
/**
 * Shift every enabled component right by `amount` bits, in place.
 * Integer pixel types only.
 */
DECL_FUNC(rshift, const SwsCompMask mask, const uint8_t amount)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] >>= amount;
        if (Y)
            y[n] >>= amount;
        if (Z)
            z[n] >>= amount;
        if (W)
            w[n] >>= amount;
    }

    CONTINUE(x, y, z, w);
}
#endif
/*
 * Instantiate the shift kernels and their table entries. These lines sit
 * outside the #if !IS_FLOAT guard around the kernels, so they presumably
 * expand to nothing for the float pixel type - TODO confirm.
 */
SWS_FOR(PX, LSHIFT, DECL_IMPL, lshift)
SWS_FOR(PX, RSHIFT, DECL_IMPL, rshift)
SWS_FOR_STRUCT(PX, LSHIFT, DECL_ENTRY)
SWS_FOR_STRUCT(PX, RSHIFT, DECL_ENTRY)
#ifdef PIXEL_SWAP
/**
 * Reverse the byte order of every enabled component, in place, using
 * PIXEL_SWAP. Only built for pixel types that define PIXEL_SWAP.
 */
DECL_FUNC(swap_bytes, const SwsCompMask mask)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = PIXEL_SWAP(x[n]);
        if (Y)
            y[n] = PIXEL_SWAP(y[n]);
        if (Z)
            z[n] = PIXEL_SWAP(z[n]);
        if (W)
            w[n] = PIXEL_SWAP(w[n]);
    }

    CONTINUE(x, y, z, w);
}
#endif /* PIXEL_SWAP */
/*
 * Instantiate swap_bytes and its table entry. Listed outside the
 * #ifdef PIXEL_SWAP guard, so presumably a no-op for pixel types without
 * a byte swap - TODO confirm.
 */
SWS_FOR(PX, SWAP_BYTES, DECL_IMPL, swap_bytes)
SWS_FOR_STRUCT(PX, SWAP_BYTES, DECL_ENTRY)
#ifdef PIXEL_MAX
/**
 * Turn every enabled component into a full-range flag, in place: any
 * non-zero value becomes PIXEL_MAX, zero stays zero.
 */
DECL_FUNC(expand_bit, const SwsCompMask mask)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = x[n] ? PIXEL_MAX : 0;
        if (Y)
            y[n] = y[n] ? PIXEL_MAX : 0;
        if (Z)
            z[n] = z[n] ? PIXEL_MAX : 0;
        if (W)
            w[n] = w[n] ? PIXEL_MAX : 0;
    }

    CONTINUE(x, y, z, w);
}
#endif
#if BIT_DEPTH == 8
/**
 * Widen every enabled 8-bit component to 16 bits by replicating the byte
 * into both halves (0xAB -> 0xABAB). Results go into fresh u16 blocks that
 * are passed to the continuation.
 */
DECL_FUNC(expand_pair, const SwsCompMask mask)
{
    block_t wide_x, wide_y, wide_z, wide_w;

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        /* v * 0x0101 == v << 8 | v for any 8-bit v */
        if (X)
            wide_x.u16[n] = x[n] * 0x0101;
        if (Y)
            wide_y.u16[n] = y[n] * 0x0101;
        if (Z)
            wide_z.u16[n] = z[n] * 0x0101;
        if (W)
            wide_w.u16[n] = w[n] * 0x0101;
    }

    CONTINUE(&wide_x, &wide_y, &wide_z, &wide_w);
}
/**
 * Widen every enabled 8-bit component to 32 bits by replicating the byte
 * into all four positions (0xAB -> 0xABABABAB). Results go into fresh u32
 * blocks that are passed to the continuation.
 */
DECL_FUNC(expand_quad, const SwsCompMask mask)
{
    block_t wide_x, wide_y, wide_z, wide_w;

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        /* v * 0x01010101 places one copy of the 8-bit v in every byte */
        if (X)
            wide_x.u32[n] = x[n] * 0x01010101u;
        if (Y)
            wide_y.u32[n] = y[n] * 0x01010101u;
        if (Z)
            wide_z.u32[n] = z[n] * 0x01010101u;
        if (W)
            wide_w.u32[n] = w[n] * 0x01010101u;
    }

    CONTINUE(&wide_x, &wide_y, &wide_z, &wide_w);
}
#endif /* BIT_DEPTH == 8 */
/*
 * Instantiate the expand kernels and their table entries. expand_bit needs
 * PIXEL_MAX, expand_pair/expand_quad only exist for BIT_DEPTH == 8; these
 * lines are unconditional, so they presumably expand to nothing where the
 * kernel is missing - TODO confirm.
 */
SWS_FOR(PX, EXPAND_BIT, DECL_IMPL, expand_bit)
SWS_FOR(PX, EXPAND_PAIR, DECL_IMPL, expand_pair)
SWS_FOR(PX, EXPAND_QUAD, DECL_IMPL, expand_quad)
SWS_FOR_STRUCT(PX, EXPAND_BIT, DECL_ENTRY)
SWS_FOR_STRUCT(PX, EXPAND_PAIR, DECL_ENTRY)
SWS_FOR_STRUCT(PX, EXPAND_QUAD, DECL_ENTRY)
/*************************
* Packing and unpacking *
************************/
#if !IS_FLOAT
/**
 * Split a packed value into up to four bit fields.
 *
 * The packed value is taken from x[i]; the fields are laid out from the
 * most significant end in the order x, y, z, w with widths bx, by, bz, bw
 * bits, so w occupies the lowest bw bits. Each enabled component receives
 * its field, shifted down and masked to its width.
 */
DECL_FUNC(unpack, const SwsCompMask mask,
          const uint8_t bx, const uint8_t by,
          const uint8_t bz, const uint8_t bw)
{
    /* Bit position of each field's least significant bit */
    const uint8_t sx = bw + bz + by;
    const uint8_t sy = bw + bz;
    const uint8_t sz = bw;
    const uint8_t sw = 0;

    /* Build the masks in 64-bit unsigned arithmetic and narrow afterwards:
     * `1 << b` on a plain int is undefined for b == 31 and for a field as
     * wide as a 32-bit pixel. */
    const pixel_t mx = (pixel_t) ((UINT64_C(1) << bx) - 1);
    const pixel_t my = (pixel_t) ((UINT64_C(1) << by) - 1);
    const pixel_t mz = (pixel_t) ((UINT64_C(1) << bz) - 1);
    const pixel_t mw = (pixel_t) ((UINT64_C(1) << bw) - 1);

    SWS_LOOP
    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
        /* Snapshot first: x[i] is both the source and one of the outputs */
        const pixel_t val = x[i];
        if (X) x[i] = (val >> sx) & mx;
        if (Y) y[i] = (val >> sy) & my;
        if (Z) z[i] = (val >> sz) & mz;
        if (W) w[i] = (val >> sw) & mw;
    }

    CONTINUE(x, y, z, w);
}
/**
 * Combine up to four components into one packed value, stored in x[i].
 *
 * Inverse layout of unpack: fields are placed from the most significant end
 * in the order x, y, z, w, so w ends up in the lowest bits. Components are
 * shifted and OR-ed together without masking. `bx` does not influence the
 * result, since x is the topmost field.
 */
DECL_FUNC(pack, const SwsCompMask mask,
          const uint8_t bx, const uint8_t by,
          const uint8_t bz, const uint8_t bw)
{
    /* Bit position of each field's least significant bit */
    const uint8_t sw = 0;
    const uint8_t sz = bw;
    const uint8_t sy = bw + bz;
    const uint8_t sx = bw + bz + by;

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        pixel_t packed = 0;
        if (X)
            packed |= x[n] << sx;
        if (Y)
            packed |= y[n] << sy;
        if (Z)
            packed |= z[n] << sz;
        if (W)
            packed |= w[n] << sw;
        x[n] = packed;
    }

    CONTINUE(x, y, z, w);
}
#endif /* !IS_FLOAT */
/*
 * Instantiate pack/unpack and their table entries. Listed outside the
 * #if !IS_FLOAT guard around the kernels, so presumably a no-op for the
 * float pixel type - TODO confirm.
 */
SWS_FOR(PX, UNPACK, DECL_IMPL, unpack)
SWS_FOR(PX, PACK, DECL_IMPL, pack)
SWS_FOR_STRUCT(PX, UNPACK, DECL_ENTRY)
SWS_FOR_STRUCT(PX, PACK, DECL_ENTRY)
/***********************
* Pixel data clearing *
***********************/
#ifdef PIXEL_MAX
/**
 * Overwrite every enabled component with a constant.
 *
 * Per component the constant is PIXEL_MAX if its bit is set in `one`,
 * otherwise 0 if its bit is set in `zero`, otherwise the value stored for
 * that component in impl->priv.px[].
 */
DECL_FUNC(clear, const SwsCompMask mask, const SwsCompMask one,
          const SwsCompMask zero)
{
#define ONE(N) SWS_COMP_TEST(one, N)
#define ZERO(N) SWS_COMP_TEST(zero, N)
    const pixel_t fill_x = ONE(0) ? PIXEL_MAX : ZERO(0) ? 0 : impl->priv.px[0];
    const pixel_t fill_y = ONE(1) ? PIXEL_MAX : ZERO(1) ? 0 : impl->priv.px[1];
    const pixel_t fill_z = ONE(2) ? PIXEL_MAX : ZERO(2) ? 0 : impl->priv.px[2];
    const pixel_t fill_w = ONE(3) ? PIXEL_MAX : ZERO(3) ? 0 : impl->priv.px[3];

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = fill_x;
        if (Y)
            y[n] = fill_y;
        if (Z)
            z[n] = fill_z;
        if (W)
            w[n] = fill_w;
    }

    CONTINUE(x, y, z, w);
}
#endif
/*
 * Instantiate clear and its table entry; ff_sws_setup_vec4 is registered as
 * the setup hook (presumably it fills the impl->priv.px[0..3] values the
 * kernel reads - defined outside this file).
 */
SWS_FOR(PX, CLEAR, DECL_IMPL, clear)
SWS_FOR_STRUCT(PX, CLEAR, DECL_ENTRY, .setup = ff_sws_setup_vec4)
/*************************
* Arithmetic operations *
*************************/
/**
 * Multiply every enabled component, in place, by the single factor stored
 * in impl->priv.px[0].
 */
DECL_FUNC(scale, const SwsCompMask mask)
{
    const pixel_t factor = impl->priv.px[0];

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] *= factor;
        if (Y)
            y[n] *= factor;
        if (Z)
            z[n] *= factor;
        if (W)
            w[n] *= factor;
    }

    CONTINUE(x, y, z, w);
}
/**
 * Add a per-component constant, in place: component k of every enabled
 * channel is increased by impl->priv.px[k].
 */
DECL_FUNC(add, const SwsCompMask mask)
{
    /* Constants of disabled components are never read */
    const pixel_t add_x = X ? impl->priv.px[0] : 0;
    const pixel_t add_y = Y ? impl->priv.px[1] : 0;
    const pixel_t add_z = Z ? impl->priv.px[2] : 0;
    const pixel_t add_w = W ? impl->priv.px[3] : 0;

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] += add_x;
        if (Y)
            y[n] += add_y;
        if (Z)
            z[n] += add_z;
        if (W)
            w[n] += add_w;
    }

    CONTINUE(x, y, z, w);
}
/**
 * Clamp from above, in place: every enabled component k is replaced by the
 * smaller of its value and impl->priv.px[k].
 */
DECL_FUNC(min, const SwsCompMask mask)
{
    /* Limits of disabled components are never read */
    const pixel_t upper_x = X ? impl->priv.px[0] : 0;
    const pixel_t upper_y = Y ? impl->priv.px[1] : 0;
    const pixel_t upper_z = Z ? impl->priv.px[2] : 0;
    const pixel_t upper_w = W ? impl->priv.px[3] : 0;

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = FFMIN(x[n], upper_x);
        if (Y)
            y[n] = FFMIN(y[n], upper_y);
        if (Z)
            z[n] = FFMIN(z[n], upper_z);
        if (W)
            w[n] = FFMIN(w[n], upper_w);
    }

    CONTINUE(x, y, z, w);
}
/**
 * Clamp from below, in place: every enabled component k is replaced by the
 * larger of its value and impl->priv.px[k].
 */
DECL_FUNC(max, const SwsCompMask mask)
{
    /* Limits of disabled components are never read */
    const pixel_t lower_x = X ? impl->priv.px[0] : 0;
    const pixel_t lower_y = Y ? impl->priv.px[1] : 0;
    const pixel_t lower_z = Z ? impl->priv.px[2] : 0;
    const pixel_t lower_w = W ? impl->priv.px[3] : 0;

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = FFMAX(x[n], lower_x);
        if (Y)
            y[n] = FFMAX(y[n], lower_y);
        if (Z)
            z[n] = FFMAX(z[n], lower_z);
        if (W)
            w[n] = FFMAX(w[n], lower_w);
    }

    CONTINUE(x, y, z, w);
}
/*
 * Instantiate the arithmetic kernels and their table entries. scale reads a
 * single constant (priv.px[0]) and registers ff_sws_setup_scalar; add, min
 * and max read one constant per component (priv.px[0..3]) and register
 * ff_sws_setup_vec4. Both setup hooks are defined outside this file.
 */
SWS_FOR(PX, SCALE, DECL_IMPL, scale)
SWS_FOR(PX, ADD, DECL_IMPL, add)
SWS_FOR(PX, MIN, DECL_IMPL, min)
SWS_FOR(PX, MAX, DECL_IMPL, max)
SWS_FOR_STRUCT(PX, SCALE, DECL_ENTRY, .setup = ff_sws_setup_scalar )
SWS_FOR_STRUCT(PX, ADD, DECL_ENTRY, .setup = ff_sws_setup_vec4 )
SWS_FOR_STRUCT(PX, MIN, DECL_ENTRY, .setup = ff_sws_setup_vec4 )
SWS_FOR_STRUCT(PX, MAX, DECL_ENTRY, .setup = ff_sws_setup_vec4 )
/*************
* Dithering *
*************/
DECL_SETUP(setup_dither, params, out)
{
const SwsUOp *uop = params->uop;
const SwsDitherUOp *dither = &uop->par.dither;
const int size = 1 << dither->size_log2;
if (size >= SWS_BLOCK_SIZE) {
/* No extra padding needed */
out->priv.ptr = av_refstruct_ref(uop->data.ptr);
out->free = ff_op_priv_unref;
return 0;
}
const int stride = FFMAX(size, SWS_BLOCK_SIZE);
const int height = ff_sws_dither_height(dither);
pixel_t *matrix = av_malloc(sizeof(pixel_t) * height * stride);
if (!matrix)
return AVERROR(ENOMEM);
out->priv.ptr = matrix;
out->free = ff_op_priv_free;
/* Pad to multiple of block size. We don't need extra padding for the
* height because ff_sws_dither_height() already includes any padding
* necessary for the y_offset */
for (int y = 0; y < height; y++) {
pixel_t *row = &matrix[y * stride];
for (int x = 0; x < size; x++)
row[x] = uop->data.ptr[y * size + x].px;
for (int x = size; x < stride; x++)
row[x] = row[x % size];
}
return 0;
}
/**
 * Add a dither pattern to every enabled component, in place.
 *
 * The matrix in impl->priv.ptr has rows of max(size, SWS_BLOCK_SIZE)
 * entries, where size = 1 << size_log2. The starting row is iter->y modulo
 * size, the starting column is iter->x modulo size rounded down to a
 * multiple of SWS_BLOCK_SIZE. Component k uses the row that lies offk rows
 * further down.
 */
DECL_FUNC(dither, const SwsCompMask mask,
          const uint8_t off0, const uint8_t off1,
          const uint8_t off2, const uint8_t off3,
          const uint8_t size_log2)
{
    const int size   = 1 << size_log2;
    const int stride = FFMAX(size, SWS_BLOCK_SIZE);
    const int wrap   = size - 1; /* size is a power of two */

    const pixel_t *origin = impl->priv.ptr;
    origin += (iter->y & wrap) * stride;
    origin += (iter->x & wrap) & ~(SWS_BLOCK_SIZE - 1);

    const pixel_t *const dither_x = &origin[off0 * stride];
    const pixel_t *const dither_y = &origin[off1 * stride];
    const pixel_t *const dither_z = &origin[off2 * stride];
    const pixel_t *const dither_w = &origin[off3 * stride];

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] += dither_x[n];
        if (Y)
            y[n] += dither_y[n];
        if (Z)
            z[n] += dither_z[n];
        if (W)
            w[n] += dither_w[n];
    }

    CONTINUE(x, y, z, w);
}
/*
 * Instantiate the dither kernel and its table entry; setup_dither prepares
 * the (possibly padded) matrix the kernel reads from impl->priv.ptr.
 */
SWS_FOR(PX, DITHER, DECL_IMPL, dither)
SWS_FOR_STRUCT(PX, DITHER, DECL_ENTRY, .setup = fn(setup_dither) )
/*********************
* Linear operations *
*********************/
/*
 * Coefficients of one affine transform in the current pixel type:
 * out[i] = k[i] + sum over j of m[i][j] * in[j]. The type name goes through
 * fn() (defined outside this file), presumably so that each pixel type gets
 * its own distinct struct.
 */
typedef struct {
/* Stored in split form for convenience */
pixel_t m[4][4]; /* 4x4 multiplicative part, indexed [output][input] */
pixel_t k[4]; /* constant offset per output component */
} fn(LinCoeffs);
DECL_SETUP(setup_linear, params, out)
{
const SwsUOp *uop = params->uop;
fn(LinCoeffs) c;
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++)
c.m[i][j] = uop->data.mat4[i][j].px;
c.k[i] = uop->data.mat4[i][4].px;
}
out->priv.ptr = av_memdup(&c, sizeof(c));
out->free = ff_op_priv_free;
return out->priv.ptr ? 0 : AVERROR(ENOMEM);
}
/**
* Fully general case for a 5x5 linear affine transformation. Should never be
* called without constant `mask`. This function will compile down to the
* appropriately optimized version for the required subset of operations when
* called with a constant mask.
*
* Each enabled output component I is computed as
*   k[I] + m[I][0] * x + m[I][1] * y + m[I][2] * z + m[I][3] * w
* using the coefficients that setup_linear stored in impl->priv.ptr.
*
* `one` and `zero` are bit sets over matrix entries, tested with
* SWS_MASK(row, column), where column 4 denotes the constant offset:
*  - a bit set in `zero` drops that term entirely (or starts the sum at 0
*    when it refers to the offset);
*  - a bit set in `one` adds the input value directly, without multiplying
*    by the coefficient.
*/
DECL_FUNC(linear, const SwsCompMask mask, const uint32_t one, const uint32_t zero)
{
/* Work on a by-value copy of the coefficients */
const fn(LinCoeffs) c = *(const fn(LinCoeffs) *) impl->priv.ptr;
SWS_LOOP
for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
/* Snapshot all inputs first: the rows below overwrite x/y/z/w in place
 * while later rows still need the original values */
const pixel_t xx = x[i];
const pixel_t yy = y[i];
const pixel_t zz = z[i];
const pixel_t ww = w[i];
/* One term of a row: plain value if flagged in `one`, else scaled */
#define LIN_VAL(I, J, val) \
((one & SWS_MASK(I, J)) ? (val) : c.m[I][J] * (val))
/* One output row: start from the offset (or 0), then add every term that
 * is not flagged in `zero`. Accumulates directly in var[i]. */
#define LIN_ROW(I, var) do { \
var[i] = (zero & SWS_MASK(I, 4)) ? 0 : c.k[I]; \
if (!(zero & SWS_MASK(I, 0))) var[i] += LIN_VAL(I, 0, xx); \
if (!(zero & SWS_MASK(I, 1))) var[i] += LIN_VAL(I, 1, yy); \
if (!(zero & SWS_MASK(I, 2))) var[i] += LIN_VAL(I, 2, zz); \
if (!(zero & SWS_MASK(I, 3))) var[i] += LIN_VAL(I, 3, ww); \
} while (0)
if (X) LIN_ROW(0, x);
if (Y) LIN_ROW(1, y);
if (Z) LIN_ROW(2, z);
if (W) LIN_ROW(3, w);
}
CONTINUE(x, y, z, w);
}
/*
 * Instantiate the linear kernel and its table entry; setup_linear provides
 * the LinCoeffs structure the kernel reads from impl->priv.ptr.
 */
SWS_FOR(PX, LINEAR, DECL_IMPL, linear)
SWS_FOR_STRUCT(PX, LINEAR, DECL_ENTRY, .setup = fn(setup_linear) )
/*
 * Drop the per-instantiation configuration again, presumably so this file
 * can be included once more with a different BIT_DEPTH / IS_FLOAT.
 *
 * NOTE(review): PIXEL_TYPE (defined in the float branch at the top) is not
 * undefined here, and block_t is undefined although this file never defines
 * it (presumably uops_tmpl.h does) - confirm both are intentional.
 */
#undef PIXEL_MAX
#undef PIXEL_SWAP
#undef pixel_t
#undef inter_t
#undef block_t
#undef PX
#undef px