libswscale/uops_tmpl.c - third_party/ffmpeg - Git at Google

 /**
  * Copyright (C) 2026 Niklas Haas
  *
  * This file is part of FFmpeg.
  *
  * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */

 #include <libavutil/bswap.h>

 #include "uops_tmpl.h"

 /*
  * Per-instantiation pixel type selection.
  *
  * BIT_DEPTH (defaulting to 8 when the includer did not set it) together
  * with IS_FLOAT picks:
  *   pixel_t    - element type the kernels below operate on
  *   inter_t    - accumulator type used by the horizontal filter
  *   PX / px    - tokens naming the matching op lists / union members
  *   PIXEL_MAX  - all-ones value (integer types only)
  *   PIXEL_SWAP - byte swap helper (16 and 32 bit integer types only)
  * Kernels that need PIXEL_MAX or PIXEL_SWAP are guarded with #ifdef.
  */
 #ifndef BIT_DEPTH
 #  define BIT_DEPTH 8
 #endif

 #if IS_FLOAT && BIT_DEPTH == 32
 #  define PIXEL_TYPE SWS_PIXEL_F32
 #  define pixel_t    float
 #  define inter_t    float
 #  define PX         F32
 #  define px         f32
 #elif BIT_DEPTH == 32
 #  define PIXEL_MAX  0xFFFFFFFFu
 #  define PIXEL_SWAP av_bswap32
 #  define pixel_t    uint32_t
 #  define inter_t    int64_t
 #  define PX         U32
 #  define px         u32
 #elif BIT_DEPTH == 16
 #  define PIXEL_MAX  0xFFFFu
 #  define PIXEL_SWAP av_bswap16
 #  define pixel_t    uint16_t
 #  define inter_t    int64_t
 #  define PX         U16
 #  define px         u16
 #elif BIT_DEPTH == 8
 #  define PIXEL_MAX  0xFFu
 #  define pixel_t    uint8_t
 #  define inter_t    int32_t
 #  define PX         U8
 #  define px         u8
 #else
 #  error Invalid BIT_DEPTH
 #endif

 /*********************************
  * Generic read/write operations *
  *********************************/

 /* Planar read: copy one block from every enabled input plane into the
  * matching component block, then step the consumed planes forward by one
  * block before handing over to the next operation. */
 DECL_READ(read_planar, const SwsCompMask mask)
 {
     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         if (X)
             x[idx] = in0[idx];
         if (Y)
             y[idx] = in1[idx];
         if (Z)
             z[idx] = in2[idx];
         if (W)
             w[idx] = in3[idx];
     }

     /* Planes that were not read keep their current position */
     if (X)
         iter->in[0] += SIZEOF_BLOCK;
     if (Y)
         iter->in[1] += SIZEOF_BLOCK;
     if (Z)
         iter->in[2] += SIZEOF_BLOCK;
     if (W)
         iter->in[3] += SIZEOF_BLOCK;

     CONTINUE(x, y, z, w);
 }

 /* Packed read: de-interleave one block of pixels stored as consecutive
  * groups of `elems` samples in plane 0. */
 DECL_READ(read_packed, const SwsCompMask mask)
 {
     /* Samples per pixel, derived from the highest enabled component */
     int elems = 1;
     if (W)
         elems = 4;
     else if (Z)
         elems = 3;
     else if (Y)
         elems = 2;

     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         const int base = elems * idx;
         if (X)
             x[idx] = in0[base + 0];
         if (Y)
             y[idx] = in0[base + 1];
         if (Z)
             z[idx] = in0[base + 2];
         if (W)
             w[idx] = in0[base + 3];
     }

     iter->in[0] += SIZEOF_BLOCK * elems;
     CONTINUE(x, y, z, w);
 }

 /* Planar write: store one block of every enabled component to its own
  * output plane and step those planes forward by one block. */
 DECL_WRITE(write_planar, const SwsCompMask mask)
 {
     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         if (X)
             out0[idx] = x[idx];
         if (Y)
             out1[idx] = y[idx];
         if (Z)
             out2[idx] = z[idx];
         if (W)
             out3[idx] = w[idx];
     }

     /* Planes that were not written keep their current position */
     if (X)
         iter->out[0] += SIZEOF_BLOCK;
     if (Y)
         iter->out[1] += SIZEOF_BLOCK;
     if (Z)
         iter->out[2] += SIZEOF_BLOCK;
     if (W)
         iter->out[3] += SIZEOF_BLOCK;
 }

 /* Packed write: interleave one block of the enabled components into
  * plane 0 as consecutive groups of `elems` samples. */
 DECL_WRITE(write_packed, const SwsCompMask mask)
 {
     /* Samples per pixel, derived from the highest enabled component */
     int elems = 1;
     if (W)
         elems = 4;
     else if (Z)
         elems = 3;
     else if (Y)
         elems = 2;

     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         const int base = elems * idx;
         if (X)
             out0[base + 0] = x[idx];
         if (Y)
             out0[base + 1] = y[idx];
         if (Z)
             out0[base + 2] = z[idx];
         if (W)
             out0[base + 3] = w[idx];
     }

     iter->out[0] += SIZEOF_BLOCK * elems;
 }

 #if BIT_DEPTH == 8

 /* Read 1-bit samples: every input byte expands to eight output samples of
  * value 0 or 1, most significant bit first. Only the single-component
  * mask is accepted (checked by the assertion). */
 DECL_READ(read_bit, const SwsCompMask mask)
 {
     av_assert2(mask == SWS_COMP_ELEMS(1));

     SWS_LOOP
     for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
         /* i >> 3 is the index of the byte holding samples i .. i + 7 */
         const pixel_t val = ((const pixel_t *) in0)[i >> 3];
         x[i + 0] = (val >> 7) & 1;
         x[i + 1] = (val >> 6) & 1;
         x[i + 2] = (val >> 5) & 1;
         x[i + 3] = (val >> 4) & 1;
         x[i + 4] = (val >> 3) & 1;
         x[i + 5] = (val >> 2) & 1;
         x[i + 6] = (val >> 1) & 1;
         x[i + 7] = (val >> 0) & 1;
     }

     /* Eight samples per byte, so only an eighth of a block was consumed */
     iter->in[0] += SIZEOF_BLOCK >> 3;
     CONTINUE(x, y, z, w);
 }

 /* Read 4-bit samples: every input byte yields two output samples, the
  * high nibble first. Only the single-component mask is accepted. */
 DECL_READ(read_nibble, const SwsCompMask mask)
 {
     av_assert2(mask == SWS_COMP_ELEMS(1));

     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx += 2) {
         const pixel_t pair = in0[idx >> 1];
         x[idx + 0] = pair >> 4;  /* upper four bits */
         x[idx + 1] = pair & 0xF; /* lower four bits */
     }

     /* Two samples per byte, so half a block of input was consumed */
     iter->in[0] += SIZEOF_BLOCK >> 1;
     CONTINUE(x, y, z, w);
 }

 /* Write 1-bit samples: eight consecutive samples are packed into one
  * output byte, first sample in the most significant bit. The samples are
  * not masked here, so they are assumed to already be 0 or 1. */
 DECL_WRITE(write_bit, const SwsCompMask mask)
 {
     av_assert2(mask == SWS_COMP_ELEMS(1));

     SWS_LOOP
     for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
         /* i >> 3 is the index of the byte receiving samples i .. i + 7 */
         out0[i >> 3] = x[i + 0] << 7 |
                        x[i + 1] << 6 |
                        x[i + 2] << 5 |
                        x[i + 3] << 4 |
                        x[i + 4] << 3 |
                        x[i + 5] << 2 |
                        x[i + 6] << 1 |
                        x[i + 7];
     }

     /* Eight samples per byte, so only an eighth of a block was produced */
     iter->out[0] += SIZEOF_BLOCK >> 3;
 }

 /* Write 4-bit samples: two consecutive samples share one output byte, the
  * first one in the high nibble. Only the single-component mask is
  * accepted. */
 DECL_WRITE(write_nibble, const SwsCompMask mask)
 {
     av_assert2(mask == SWS_COMP_ELEMS(1));

     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx += 2) {
         out0[idx >> 1] = (x[idx] << 4) | x[idx + 1];
     }

     /* Two samples per byte, so half a block of output was produced */
     iter->out[0] += SIZEOF_BLOCK >> 1;
 }

 #endif /* BIT_DEPTH == 8 */

 /* Instantiate the read/write kernels above for the current pixel type.
  * SWS_FOR, SWS_FOR_STRUCT, DECL_IMPL_READ/WRITE and DECL_ENTRY are defined
  * outside this file. The NIBBLE and BIT kernels only exist when
  * BIT_DEPTH == 8, so their lists are presumably empty for the other pixel
  * types - TODO confirm against the list definitions. */
 SWS_FOR(PX, READ_PLANAR,    DECL_IMPL_READ,     read_planar)
 SWS_FOR(PX, READ_PACKED,    DECL_IMPL_READ,     read_packed)
 SWS_FOR(PX, READ_NIBBLE,    DECL_IMPL_READ,     read_nibble)
 SWS_FOR(PX, READ_BIT,       DECL_IMPL_READ,     read_bit)
 SWS_FOR(PX, WRITE_PLANAR,   DECL_IMPL_WRITE,    write_planar)
 SWS_FOR(PX, WRITE_PACKED,   DECL_IMPL_WRITE,    write_packed)
 SWS_FOR(PX, WRITE_NIBBLE,   DECL_IMPL_WRITE,    write_nibble)
 SWS_FOR(PX, WRITE_BIT,      DECL_IMPL_WRITE,    write_bit)

 /* Matching table entries, one per instantiated kernel */
 SWS_FOR_STRUCT(PX, READ_PLANAR,     DECL_ENTRY)
 SWS_FOR_STRUCT(PX, READ_PACKED,     DECL_ENTRY)
 SWS_FOR_STRUCT(PX, READ_NIBBLE,     DECL_ENTRY)
 SWS_FOR_STRUCT(PX, READ_BIT,        DECL_ENTRY)
 SWS_FOR_STRUCT(PX, WRITE_PLANAR,    DECL_ENTRY)
 SWS_FOR_STRUCT(PX, WRITE_PACKED,    DECL_ENTRY)
 SWS_FOR_STRUCT(PX, WRITE_NIBBLE,    DECL_ENTRY)
 SWS_FOR_STRUCT(PX, WRITE_BIT,       DECL_ENTRY)

 /*****************************
  * Scaling / filtering reads *
  *****************************/

 /* Setup for the vertical filter read: converts the integer kernel weights
  * to float once, so the kernel can multiply directly. The converted array
  * is stored in priv.ptr and the tap count in priv.i32[2].
  *
  * Returns 0 on success, AVERROR(ENOTSUP) for filter types other than F32
  * and AVERROR(ENOMEM) if the weight array cannot be allocated. */
 DECL_SETUP(setup_filter_v, params, out)
 {
     /* priv.i32[2] is used next to priv.ptr, so the pointer must not
      * extend beyond the first two 32-bit slots */
     static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]),
                   ">8 byte pointers not supported");

     if (params->uop->par.filter.type != SWS_PIXEL_F32)
         return AVERROR(ENOTSUP);

     const SwsFilterWeights *kernel = params->uop->data.kernel;

     float *coeffs = av_calloc(kernel->num_weights, sizeof(*coeffs));
     if (!coeffs)
         return AVERROR(ENOMEM);

     for (int n = 0; n < kernel->num_weights; n++) {
         coeffs[n] = (float) kernel->weights[n] / SWS_FILTER_SCALE;
     }

     out->priv.ptr    = coeffs;
     out->priv.i32[2] = kernel->filter_size;
     out->free        = ff_op_priv_free;
     return 0;
 }

 /*
  * Fully general vertical planar filter case.
  *
  * Produces one block of float samples per enabled plane as the weighted
  * sum of `filter_size` input rows. The float weights and the tap count are
  * the ones stored by setup_filter_v() in priv.ptr / priv.i32[2].
  */
 DECL_READ(read_planar_fv, const SwsCompMask mask, const SwsPixelType type)
 {
     av_assert2(type == SWS_PIXEL_F32);
     const SwsOpExec *exec = iter->exec;
     const float *restrict weights = impl->priv.ptr;
     const int filter_size = impl->priv.i32[2];
     /* Each output line has its own run of filter_size weights */
     weights += filter_size * iter->y;

     /* Accumulators start at zero; only the enabled ones are touched */
     block_t xs, ys, zs, ws;
     if (X) memset(&xs.f32, 0, sizeof(xs.f32));
     if (Y) memset(&ys.f32, 0, sizeof(ys.f32));
     if (Z) memset(&zs.f32, 0, sizeof(zs.f32));
     if (W) memset(&ws.f32, 0, sizeof(ws.f32));

     for (int j = 0; j < filter_size; j++) {
         const float weight = weights[j];

         SWS_LOOP
         for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
             if (X) xs.f32[i] += weight * in0[i];
             if (Y) ys.f32[i] += weight * in1[i];
             if (Z) zs.f32[i] += weight * in2[i];
             if (W) ws.f32[i] += weight * in3[i];
         }

         /* Step the local row pointers by the plane stride for the next
          * tap; iter->in[] itself is not affected by this */
         if (X) in0 = bump_ptr(in0, exec->in_stride[0]);
         if (Y) in1 = bump_ptr(in1, exec->in_stride[1]);
         if (Z) in2 = bump_ptr(in2, exec->in_stride[2]);
         if (W) in3 = bump_ptr(in3, exec->in_stride[3]);
     }

     /* Horizontally, exactly one block of input was consumed per plane */
     if (X) iter->in[0] += SIZEOF_BLOCK;
     if (Y) iter->in[1] += SIZEOF_BLOCK;
     if (Z) iter->in[2] += SIZEOF_BLOCK;
     if (W) iter->in[3] += SIZEOF_BLOCK;

     CONTINUE(&xs, &ys, &zs, &ws);
 }

 /* Setup for the horizontal filter read: the integer weights are used as
  * they are, so the kernel's weight array is shared through a new
  * reference instead of being copied. The tap count goes to priv.i32[2].
  *
  * Returns 0 on success or AVERROR(ENOTSUP) for filter types other than
  * F32. */
 DECL_SETUP(setup_filter_h, params, out)
 {
     if (params->uop->par.filter.type != SWS_PIXEL_F32)
         return AVERROR(ENOTSUP);

     SwsFilterWeights *kernel = params->uop->data.kernel;

     out->priv.ptr    = av_refstruct_ref(kernel->weights);
     out->priv.i32[2] = kernel->filter_size;
     out->free        = ff_op_priv_unref; /* releases the reference taken above */
     return 0;
 }

 /*
  * Fully general horizontal planar filter case.
  *
  * For every output sample of the block, `filter_size` consecutive input
  * samples, located via exec->in_offset_x[] for that output position, are
  * combined with that sample's own run of integer weights. The sums are
  * accumulated in inter_t and then converted to float and multiplied by
  * 1 / SWS_FILTER_SCALE. The weights and the tap count are the ones stored
  * by setup_filter_h() in priv.ptr / priv.i32[2]. This kernel does not
  * advance iter->in[].
  */
 DECL_READ(read_planar_fh, const SwsCompMask mask, const SwsPixelType type)
 {
     av_assert2(type == SWS_PIXEL_F32);
     const SwsOpExec *exec = iter->exec;
     const int *restrict weights = impl->priv.ptr;
     const int filter_size = impl->priv.i32[2];
     const float scale = 1.0f / SWS_FILTER_SCALE;
     const int xpos = iter->x;
     /* Each output column has its own run of filter_size weights */
     weights += filter_size * iter->x;

     block_t xs, ys, zs, ws;
     for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
         const int offset = exec->in_offset_x[xpos + i];
         pixel_t *start0 = bump_ptr(in0, offset);
         pixel_t *start1 = bump_ptr(in1, offset);
         pixel_t *start2 = bump_ptr(in2, offset);
         pixel_t *start3 = bump_ptr(in3, offset);

         inter_t sx = 0, sy = 0, sz = 0, sw = 0;
         for (int j = 0; j < filter_size; j++) {
             /* Widen the weight to the accumulator type before multiplying.
              * With a 32-bit unsigned pixel_t, `int * pixel_t` would be
              * evaluated in unsigned arithmetic, so a negative weight would
              * wrap around instead of subtracting from the sum. */
             const inter_t weight = weights[j];
             if (X) sx += weight * start0[j];
             if (Y) sy += weight * start1[j];
             if (Z) sz += weight * start2[j];
             if (W) sw += weight * start3[j];
         }

         if (X) xs.f32[i] = (float) sx * scale;
         if (Y) ys.f32[i] = (float) sy * scale;
         if (Z) zs.f32[i] = (float) sz * scale;
         if (W) ws.f32[i] = (float) sw * scale;

         weights += filter_size;
     }

     CONTINUE(&xs, &ys, &zs, &ws);
 }

 /* Instantiate both filtering reads for the current pixel type; their table
  * entries additionally name the setup function that prepares priv. */
 SWS_FOR(PX, READ_PLANAR_FV, DECL_IMPL_READ, read_planar_fv)
 SWS_FOR(PX, READ_PLANAR_FH, DECL_IMPL_READ, read_planar_fh)
 SWS_FOR_STRUCT(PX, READ_PLANAR_FV, DECL_ENTRY, .setup = fn(setup_filter_v) )
 SWS_FOR_STRUCT(PX, READ_PLANAR_FH, DECL_ENTRY, .setup = fn(setup_filter_h) )

 /***************************
  * Permutation and copying *
  ***************************/

 /* Permute by directly swapping the order of arguments to the continuation.
  * No sample data is touched: output component N is simply the input block
  * pointer in<IDXN>. DUMMY, TYPE, UOP and MASK are accepted but not used by
  * the generated function. */
 #define DECL_PERMUTE(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3)      \
     static void NAME##_c(SwsOpIter *restrict iter,                              \
                          const SwsOpImpl *restrict impl,                        \
                          void *restrict in0, void *restrict in1,                \
                          void *restrict in2, void *restrict in3)                \
     {                                                                           \
         CONTINUE(in##IDX0, in##IDX1, in##IDX2, in##IDX3);                       \
     }

 /* Like DECL_PERMUTE, but the selected blocks are duplicated rather than
  * aliased: for every component enabled in MASK, output component N is a
  * local copy (SIZEOF_BLOCK bytes) of input block in<IDXN>. Components not
  * enabled in MASK are forwarded as their original inN pointer. */
 #define DECL_COPY(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3)         \
     static void NAME##_c(SwsOpIter *restrict iter,                              \
                          const SwsOpImpl *restrict impl,                        \
                          void *restrict in0, void *restrict in1,                \
                          void *restrict in2, void *restrict in3)                \
     {                                                                           \
         const SwsCompMask mask = (MASK);                                        \
         block_t x, y, z, w;                                                     \
                                                                                 \
         if (X) memcpy(&x.px, in##IDX0, SIZEOF_BLOCK);                           \
         if (Y) memcpy(&y.px, in##IDX1, SIZEOF_BLOCK);                           \
         if (Z) memcpy(&z.px, in##IDX2, SIZEOF_BLOCK);                           \
         if (W) memcpy(&w.px, in##IDX3, SIZEOF_BLOCK);                           \
                                                                                 \
         CONTINUE(X ? &x : in0, Y ? &y : in1, Z ? &z : in2, W ? &w : in3);       \
     }

 /* Generate one function per listed PERMUTE / COPY variant of the current
  * pixel type, followed by the matching table entries. */
 SWS_FOR(PX, PERMUTE, DECL_PERMUTE)
 SWS_FOR(PX, COPY,    DECL_COPY)
 SWS_FOR_STRUCT(PX, PERMUTE, DECL_ENTRY)
 SWS_FOR_STRUCT(PX, COPY,    DECL_ENTRY)

 /*********************
  * Format conversion *
  *********************/

 /* DECL_CAST(DST, dst) defines the conversion from the current pixel type
  * to the type named by DST / dst: every enabled component is copied
  * element by element into the `dst` member of a fresh block, using plain
  * C assignment for the conversion. Blocks of components that are not
  * enabled are passed on without being written. The macro also emits the
  * instantiations and table entries for the TO_<DST> operation. */
 #define DECL_CAST(DST, dst)                                                     \
     DECL_FUNC(to_##dst, const SwsCompMask mask)                                 \
     {                                                                           \
         block_t xx, yy, zz, ww;                                                 \
                                                                                 \
         SWS_LOOP                                                                \
         for (int i = 0; i < SWS_BLOCK_SIZE; i++) {                              \
             if (X) xx.dst[i] = x[i];                                            \
             if (Y) yy.dst[i] = y[i];                                            \
             if (Z) zz.dst[i] = z[i];                                            \
             if (W) ww.dst[i] = w[i];                                            \
         }                                                                       \
                                                                                 \
         CONTINUE(&xx, &yy, &zz, &ww);                                           \
     }                                                                           \
                                                                                 \
     SWS_FOR(PX, TO_##DST, DECL_IMPL, to_##dst)                                  \
     SWS_FOR_STRUCT(PX, TO_##DST, DECL_ENTRY)

 /* One conversion towards each of the four supported element types */
 DECL_CAST(U8,  u8)
 DECL_CAST(U16, u16)
 DECL_CAST(U32, u32)
 DECL_CAST(F32, f32)

 /********************
  * Bit manipulation *
  ********************/

 #if !IS_FLOAT
 /* Shift every sample of the enabled components left by `amount` bits. */
 DECL_FUNC(lshift, const SwsCompMask mask, const uint8_t amount)
 {
     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         if (X)
             x[idx] <<= amount;
         if (Y)
             y[idx] <<= amount;
         if (Z)
             z[idx] <<= amount;
         if (W)
             w[idx] <<= amount;
     }

     CONTINUE(x, y, z, w);
 }

 /* Shift every sample of the enabled components right by `amount` bits. */
 DECL_FUNC(rshift, const SwsCompMask mask, const uint8_t amount)
 {
     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         if (X)
             x[idx] >>= amount;
         if (Y)
             y[idx] >>= amount;
         if (Z)
             z[idx] >>= amount;
         if (W)
             w[idx] >>= amount;
     }

     CONTINUE(x, y, z, w);
 }
 #endif

 /* The shift kernels are only defined for integer pixel types (see the
  * !IS_FLOAT guard above); the float lists are presumably empty - TODO
  * confirm against the list definitions. */
 SWS_FOR(PX, LSHIFT, DECL_IMPL, lshift)
 SWS_FOR(PX, RSHIFT, DECL_IMPL, rshift)

 SWS_FOR_STRUCT(PX, LSHIFT, DECL_ENTRY)
 SWS_FOR_STRUCT(PX, RSHIFT, DECL_ENTRY)

 #ifdef PIXEL_SWAP
 /* Reverse the byte order of every sample of the enabled components, using
  * the swap helper selected for the current pixel type. */
 DECL_FUNC(swap_bytes, const SwsCompMask mask)
 {
     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         if (X)
             x[idx] = PIXEL_SWAP(x[idx]);
         if (Y)
             y[idx] = PIXEL_SWAP(y[idx]);
         if (Z)
             z[idx] = PIXEL_SWAP(z[idx]);
         if (W)
             w[idx] = PIXEL_SWAP(w[idx]);
     }

     CONTINUE(x, y, z, w);
 }
 #endif /* PIXEL_SWAP */

 /* swap_bytes only exists where PIXEL_SWAP is defined (16 and 32 bit
  * integer types in this file). */
 SWS_FOR(PX, SWAP_BYTES, DECL_IMPL, swap_bytes)
 SWS_FOR_STRUCT(PX, SWAP_BYTES, DECL_ENTRY)

 #ifdef PIXEL_MAX
 /* Expand a boolean sample to full range: any non-zero value becomes
  * PIXEL_MAX, zero stays zero. */
 DECL_FUNC(expand_bit, const SwsCompMask mask)
 {
     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         if (X)
             x[idx] = x[idx] ? PIXEL_MAX : 0;
         if (Y)
             y[idx] = y[idx] ? PIXEL_MAX : 0;
         if (Z)
             z[idx] = z[idx] ? PIXEL_MAX : 0;
         if (W)
             w[idx] = w[idx] ? PIXEL_MAX : 0;
     }

     CONTINUE(x, y, z, w);
 }
 #endif

 #if BIT_DEPTH == 8
 /* Widen 8-bit samples to 16 bits by repeating the byte in both halves
  * (0xAB becomes 0xABAB). The results go into fresh blocks. */
 DECL_FUNC(expand_pair, const SwsCompMask mask)
 {
     block_t wide_x, wide_y, wide_z, wide_w;

     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         if (X)
             wide_x.u16[idx] = (x[idx] << 8) | x[idx];
         if (Y)
             wide_y.u16[idx] = (y[idx] << 8) | y[idx];
         if (Z)
             wide_z.u16[idx] = (z[idx] << 8) | z[idx];
         if (W)
             wide_w.u16[idx] = (w[idx] << 8) | w[idx];
     }

     CONTINUE(&wide_x, &wide_y, &wide_z, &wide_w);
 }

 /* Widen 8-bit samples to 32 bits by repeating the byte in all four
  * positions (0xAB becomes 0xABABABAB). The results go into fresh blocks. */
 DECL_FUNC(expand_quad, const SwsCompMask mask)
 {
     block_t x32, y32, z32, w32;

     SWS_LOOP
     for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
         /* The cast binds to the first operand only: it keeps the << 24
          * out of signed int range problems, while the smaller shifts fit
          * in a promoted int and are OR-ed in afterwards. */
         if (X) x32.u32[i] = (uint32_t) x[i] << 24 | x[i] << 16 | x[i] << 8 | x[i];
         if (Y) y32.u32[i] = (uint32_t) y[i] << 24 | y[i] << 16 | y[i] << 8 | y[i];
         if (Z) z32.u32[i] = (uint32_t) z[i] << 24 | z[i] << 16 | z[i] << 8 | z[i];
         if (W) w32.u32[i] = (uint32_t) w[i] << 24 | w[i] << 16 | w[i] << 8 | w[i];
     }

     CONTINUE(&x32, &y32, &z32, &w32);
 }
 #endif /* BIT_DEPTH == 8 */

 /* expand_bit exists for every integer type (PIXEL_MAX defined), while
  * expand_pair and expand_quad are only defined when BIT_DEPTH == 8. */
 SWS_FOR(PX, EXPAND_BIT,  DECL_IMPL, expand_bit)
 SWS_FOR(PX, EXPAND_PAIR, DECL_IMPL, expand_pair)
 SWS_FOR(PX, EXPAND_QUAD, DECL_IMPL, expand_quad)
 SWS_FOR_STRUCT(PX, EXPAND_BIT,  DECL_ENTRY)
 SWS_FOR_STRUCT(PX, EXPAND_PAIR, DECL_ENTRY)
 SWS_FOR_STRUCT(PX, EXPAND_QUAD, DECL_ENTRY)

 /*************************
  * Packing and unpacking *
  ************************/

 #if !IS_FLOAT
 /*
  * Split a packed sample (held in the x component) into up to four bit
  * fields. bx, by, bz and bw are the field widths in bits; the fields are
  * laid out from the most significant (x) to the least significant (w), so
  * each field's shift is the total width of the fields below it.
  */
 DECL_FUNC(unpack, const SwsCompMask mask,
                   const uint8_t bx, const uint8_t by,
                   const uint8_t bz, const uint8_t bw)
 {
     const uint8_t sx = bw + bz + by;
     const uint8_t sy = bw + bz;
     const uint8_t sz = bw;
     const uint8_t sw = 0;

     /* Build the masks from an unsigned literal: `1 << 31` on a signed int
      * is undefined, whereas the unsigned shift is well defined for every
      * width below 32 and yields the same mask for all smaller widths. */
     const pixel_t mx = (1u << bx) - 1;
     const pixel_t my = (1u << by) - 1;
     const pixel_t mz = (1u << bz) - 1;
     const pixel_t mw = (1u << bw) - 1;

     SWS_LOOP
     for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
         /* Take a copy first, x[i] itself is overwritten below */
         const pixel_t val = x[i];
         if (X) x[i] = (val >> sx) & mx;
         if (Y) y[i] = (val >> sy) & my;
         if (Z) z[i] = (val >> sz) & mz;
         if (W) w[i] = (val >> sw) & mw;
     }

     CONTINUE(x, y, z, w);
 }

 /*
  * Combine up to four bit fields into one packed sample, stored back into
  * the x component. bx, by, bz and bw are the field widths in bits; the
  * fields are laid out from the most significant (x) to the least
  * significant (w). The inputs are not masked to their field width here.
  */
 DECL_FUNC(pack, const SwsCompMask mask,
                 const uint8_t bx, const uint8_t by,
                 const uint8_t bz, const uint8_t bw)
 {
     /* Each field is shifted past all the fields stored below it */
     const uint8_t shift_w = 0;
     const uint8_t shift_z = bw;
     const uint8_t shift_y = bw + bz;
     const uint8_t shift_x = bw + bz + by;

     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         pixel_t packed = 0;
         if (X)
             packed |= x[idx] << shift_x;
         if (Y)
             packed |= y[idx] << shift_y;
         if (Z)
             packed |= z[idx] << shift_z;
         if (W)
             packed |= w[idx] << shift_w;
         x[idx] = packed;
     }

     CONTINUE(x, y, z, w);
 }
 #endif /* !IS_FLOAT */

 /* pack / unpack are only defined for integer pixel types (see the
  * !IS_FLOAT guard above). */
 SWS_FOR(PX, UNPACK, DECL_IMPL, unpack)
 SWS_FOR(PX, PACK,   DECL_IMPL, pack)
 SWS_FOR_STRUCT(PX, UNPACK,  DECL_ENTRY)
 SWS_FOR_STRUCT(PX, PACK,    DECL_ENTRY)

 /***********************
  * Pixel data clearing *
  ***********************/

 #ifdef PIXEL_MAX
 /*
  * Overwrite every sample of the enabled components with a constant.
  * Per component the constant is PIXEL_MAX when the component is flagged
  * in `one`, 0 when it is flagged in `zero`, and otherwise the value
  * stored for it in impl->priv (filled in by the entry's setup function).
  */
 DECL_FUNC(clear, const SwsCompMask mask, const SwsCompMask one,
                  const SwsCompMask zero)
 {
     /* Helper tests; note that they stay defined after this function */
     #define ONE(N)  SWS_COMP_TEST(one, N)
     #define ZERO(N) SWS_COMP_TEST(zero, N)
     const pixel_t cx = ONE(0) ? PIXEL_MAX : ZERO(0) ? 0 : impl->priv.px[0];
     const pixel_t cy = ONE(1) ? PIXEL_MAX : ZERO(1) ? 0 : impl->priv.px[1];
     const pixel_t cz = ONE(2) ? PIXEL_MAX : ZERO(2) ? 0 : impl->priv.px[2];
     const pixel_t cw = ONE(3) ? PIXEL_MAX : ZERO(3) ? 0 : impl->priv.px[3];

     SWS_LOOP
     for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
         if (X) x[i] = cx;
         if (Y) y[i] = cy;
         if (Z) z[i] = cz;
         if (W) w[i] = cw;
     }

     CONTINUE(x, y, z, w);
 }
 #endif

 /* clear only exists where PIXEL_MAX is defined; its entries use
  * ff_sws_setup_vec4 as the setup function. */
 SWS_FOR(PX, CLEAR, DECL_IMPL, clear)
 SWS_FOR_STRUCT(PX, CLEAR, DECL_ENTRY, .setup = ff_sws_setup_vec4)

 /*************************
  * Arithmetic operations *
  *************************/

 /* Multiply every sample of the enabled components by one shared factor,
  * taken from the first private value of the operation. */
 DECL_FUNC(scale, const SwsCompMask mask)
 {
     const pixel_t factor = impl->priv.px[0];

     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         if (X)
             x[idx] *= factor;
         if (Y)
             y[idx] *= factor;
         if (Z)
             z[idx] *= factor;
         if (W)
             w[idx] *= factor;
     }

     CONTINUE(x, y, z, w);
 }

 /* Add a per-component constant (private values 0..3 of the operation) to
  * every sample of the enabled components. */
 DECL_FUNC(add, const SwsCompMask mask)
 {
     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         if (X)
             x[idx] += impl->priv.px[0];
         if (Y)
             y[idx] += impl->priv.px[1];
         if (Z)
             z[idx] += impl->priv.px[2];
         if (W)
             w[idx] += impl->priv.px[3];
     }

     CONTINUE(x, y, z, w);
 }

 /* Clamp from above: every sample of an enabled component is limited to
  * that component's private value. */
 DECL_FUNC(min, const SwsCompMask mask)
 {
     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         if (X)
             x[idx] = FFMIN(x[idx], impl->priv.px[0]);
         if (Y)
             y[idx] = FFMIN(y[idx], impl->priv.px[1]);
         if (Z)
             z[idx] = FFMIN(z[idx], impl->priv.px[2]);
         if (W)
             w[idx] = FFMIN(w[idx], impl->priv.px[3]);
     }

     CONTINUE(x, y, z, w);
 }

 /* Clamp from below: every sample of an enabled component is raised to at
  * least that component's private value. */
 DECL_FUNC(max, const SwsCompMask mask)
 {
     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         if (X)
             x[idx] = FFMAX(x[idx], impl->priv.px[0]);
         if (Y)
             y[idx] = FFMAX(y[idx], impl->priv.px[1]);
         if (Z)
             z[idx] = FFMAX(z[idx], impl->priv.px[2]);
         if (W)
             w[idx] = FFMAX(w[idx], impl->priv.px[3]);
     }

     CONTINUE(x, y, z, w);
 }

 /* Instantiate the arithmetic kernels. scale reads a single private value
  * and uses the scalar setup; add / min / max read one value per component
  * and use the vec4 setup. */
 SWS_FOR(PX, SCALE, DECL_IMPL, scale)
 SWS_FOR(PX, ADD,   DECL_IMPL, add)
 SWS_FOR(PX, MIN,   DECL_IMPL, min)
 SWS_FOR(PX, MAX,   DECL_IMPL, max)
 SWS_FOR_STRUCT(PX, SCALE, DECL_ENTRY, .setup = ff_sws_setup_scalar )
 SWS_FOR_STRUCT(PX, ADD,   DECL_ENTRY, .setup = ff_sws_setup_vec4 )
 SWS_FOR_STRUCT(PX, MIN,   DECL_ENTRY, .setup = ff_sws_setup_vec4 )
 SWS_FOR_STRUCT(PX, MAX,   DECL_ENTRY, .setup = ff_sws_setup_vec4 )

 /*************
  * Dithering *
  *************/

 /*
  * Prepare the dither matrix for the dither kernel.
  *
  * A matrix at least as wide as a block is shared as it is through a new
  * reference. A narrower one is copied into a freshly allocated matrix
  * whose rows are widened to the kernel's row stride by repeating the
  * source row.
  *
  * Returns 0 on success or AVERROR(ENOMEM) if the copy cannot be allocated.
  */
 DECL_SETUP(setup_dither, params, out)
 {
     const SwsUOp *uop = params->uop;
     const SwsDitherUOp *dither = &uop->par.dither;
     const int size = 1 << dither->size_log2;

     if (size >= SWS_BLOCK_SIZE) {
         /* Rows are already wide enough, share the existing matrix */
         out->priv.ptr = av_refstruct_ref(uop->data.ptr);
         out->free = ff_op_priv_unref;
         return 0;
     }

     const int stride = FFMAX(size, SWS_BLOCK_SIZE);
     const int height = ff_sws_dither_height(dither);

     pixel_t *matrix = av_malloc(sizeof(*matrix) * height * stride);
     if (!matrix)
         return AVERROR(ENOMEM);

     out->priv.ptr = matrix;
     out->free = ff_op_priv_free;

     /* Only the width is padded; ff_sws_dither_height() already includes
      * any padding necessary for the y_offset */
     for (int row = 0; row < height; row++) {
         pixel_t *dst = matrix + row * stride;
         int col = 0;

         /* Source columns first ... */
         for (; col < size; col++)
             dst[col] = uop->data.ptr[row * size + col].px;

         /* ... then repeat them periodically up to the stride */
         for (; col < stride; col++)
             dst[col] = dst[col % size];
     }

     return 0;
 }

 /*
  * Add dither noise to the enabled components. The matrix comes from
  * setup_dither(); it is 1 << size_log2 wide, with rows stored at a stride
  * of at least one block. off0..off3 are per-component row offsets into the
  * matrix, applied on top of the row selected by the current line.
  */
 DECL_FUNC(dither, const SwsCompMask mask,
                   const uint8_t off0, const uint8_t off1,
                   const uint8_t off2, const uint8_t off3,
                   const uint8_t size_log2)
 {
     const int size   = 1 << size_log2;
     const int stride = FFMAX(size, SWS_BLOCK_SIZE);

     /* Wrap the position to the matrix size; the column is additionally
      * rounded down to a multiple of the block size */
     const int row = iter->y & (size - 1);
     const int col = (iter->x & (size - 1)) & ~(SWS_BLOCK_SIZE - 1);

     const pixel_t *base = impl->priv.ptr;
     base += row * stride;
     base += col;

     const pixel_t *const noise_x = base + off0 * stride;
     const pixel_t *const noise_y = base + off1 * stride;
     const pixel_t *const noise_z = base + off2 * stride;
     const pixel_t *const noise_w = base + off3 * stride;

     SWS_LOOP
     for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
         if (X)
             x[idx] += noise_x[idx];
         if (Y)
             y[idx] += noise_y[idx];
         if (Z)
             z[idx] += noise_z[idx];
         if (W)
             w[idx] += noise_w[idx];
     }

     CONTINUE(x, y, z, w);
 }

 /* Instantiate the dither kernel; its entries use setup_dither() to
  * prepare the matrix. */
 SWS_FOR(PX, DITHER, DECL_IMPL, dither)
 SWS_FOR_STRUCT(PX, DITHER, DECL_ENTRY, .setup = fn(setup_dither) )

 /*********************
  * Linear operations *
  *********************/

 /* Coefficients of the linear operation in the current pixel type, as
  * prepared by setup_linear(): m[i][j] multiplies input component j when
  * computing output component i, k[i] is the constant term of row i. */
 typedef struct {
     /* Stored in split form for convenience */
     pixel_t m[4][4];
     pixel_t k[4];
 } fn(LinCoeffs);

 DECL_SETUP(setup_linear, params, out)
 {
     const SwsUOp *uop = params->uop;
     fn(LinCoeffs) c;

     for (int i = 0; i < 4; i++) {
         for (int j = 0; j < 4; j++)
             c.m[i][j] = uop->data.mat4[i][j].px;
         c.k[i] = uop->data.mat4[i][4].px;
     }

     out->priv.ptr = av_memdup(&c, sizeof(c));
     out->free = ff_op_priv_free;
     return out->priv.ptr ? 0 : AVERROR(ENOMEM);
 }

 /**
  * Fully general case for a 5x5 linear affine transformation. Should never be
  * called without constant `mask`. This function will compile down to the
  * appropriately optimized version for the required subset of operations when
  * called with a constant mask.
  *
  * `one` and `zero` are bit sets addressed with SWS_MASK(row, col): a bit set
  * in `zero` drops that term altogether, a bit set in `one` uses the input
  * value directly instead of multiplying it by the coefficient. Column 4
  * addresses the constant term of the row. The coefficients are the ones
  * stored by setup_linear() in priv.ptr.
  */
 DECL_FUNC(linear, const SwsCompMask mask, const uint32_t one, const uint32_t zero)
 {
     const fn(LinCoeffs) c = *(const fn(LinCoeffs) *) impl->priv.ptr;

     SWS_LOOP
     for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
         /* Snapshot all inputs first: the rows below overwrite x/y/z/w
          * in place, but every row must see the original values */
         const pixel_t xx = x[i];
         const pixel_t yy = y[i];
         const pixel_t zz = z[i];
         const pixel_t ww = w[i];

 /* One term of a row: plain value if the coefficient is flagged as 1 */
 #define LIN_VAL(I, J, val) \
     ((one & SWS_MASK(I, J)) ? (val) : c.m[I][J] * (val))

 /* One output row: start from the constant term, then add every term that
  * is not flagged as zero. Relies on the loop index `i` being in scope. */
 #define LIN_ROW(I, var) do {                                    \
     var[i] = (zero & SWS_MASK(I, 4)) ? 0 : c.k[I];              \
     if (!(zero & SWS_MASK(I, 0))) var[i] += LIN_VAL(I, 0, xx);  \
     if (!(zero & SWS_MASK(I, 1))) var[i] += LIN_VAL(I, 1, yy);  \
     if (!(zero & SWS_MASK(I, 2))) var[i] += LIN_VAL(I, 2, zz);  \
     if (!(zero & SWS_MASK(I, 3))) var[i] += LIN_VAL(I, 3, ww);  \
 } while (0)

         if (X) LIN_ROW(0, x);
         if (Y) LIN_ROW(1, y);
         if (Z) LIN_ROW(2, z);
         if (W) LIN_ROW(3, w);
     }

     CONTINUE(x, y, z, w);
 }

 /* Instantiate the linear kernel; its entries use setup_linear() to
  * prepare the coefficients. */
 SWS_FOR(PX, LINEAR, DECL_IMPL, linear)
 SWS_FOR_STRUCT(PX, LINEAR, DECL_ENTRY, .setup = fn(setup_linear) )

 /* Drop the per-instantiation macros so that the template can be
  * instantiated again for another pixel type. PIXEL_TYPE is only defined by
  * the float branch of the type selection; without the #undef it would
  * stay defined, with a stale value, for a following integer
  * instantiation. */
 #undef PIXEL_TYPE
 #undef PIXEL_MAX
 #undef PIXEL_SWAP
 #undef pixel_t
 #undef inter_t
 #undef block_t
 #undef PX
 #undef px
	/**
	* Copyright (C) 2026 Niklas Haas
	*
	* This file is part of FFmpeg.
	*
	* FFmpeg is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* FFmpeg is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with FFmpeg; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	*/

	#include <libavutil/bswap.h>

	#include "uops_tmpl.h"

	/*
	 * Per-instantiation pixel type selection: BIT_DEPTH (defaulting to 8)
	 * together with IS_FLOAT picks the element type pixel_t, the accumulator
	 * type inter_t and the PX / px naming tokens. PIXEL_MAX and PIXEL_SWAP
	 * are only defined for the integer types that support them.
	 */
	#ifndef BIT_DEPTH
	# define BIT_DEPTH 8
	#endif

	#if IS_FLOAT && BIT_DEPTH == 32
	# define PIXEL_TYPE SWS_PIXEL_F32
	# define pixel_t float
	# define inter_t float
	# define PX F32
	# define px f32
	#elif BIT_DEPTH == 32
	# define PIXEL_MAX 0xFFFFFFFFu
	# define PIXEL_SWAP av_bswap32
	# define pixel_t uint32_t
	# define inter_t int64_t
	# define PX U32
	# define px u32
	#elif BIT_DEPTH == 16
	# define PIXEL_MAX 0xFFFFu
	# define PIXEL_SWAP av_bswap16
	# define pixel_t uint16_t
	# define inter_t int64_t
	# define PX U16
	# define px u16
	#elif BIT_DEPTH == 8
	# define PIXEL_MAX 0xFFu
	# define pixel_t uint8_t
	# define inter_t int32_t
	# define PX U8
	# define px u8
	#else
	# error Invalid BIT_DEPTH
	#endif

	/*********************************
	* Generic read/write operations *
	*********************************/

	/* Planar read: copy one block from every enabled input plane into the
	 * matching component block, then step the consumed planes forward by one
	 * block before handing over to the next operation. */
	DECL_READ(read_planar, const SwsCompMask mask)
	{
	    SWS_LOOP
	    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
	        if (X)
	            x[idx] = in0[idx];
	        if (Y)
	            y[idx] = in1[idx];
	        if (Z)
	            z[idx] = in2[idx];
	        if (W)
	            w[idx] = in3[idx];
	    }

	    /* Planes that were not read keep their current position */
	    if (X)
	        iter->in[0] += SIZEOF_BLOCK;
	    if (Y)
	        iter->in[1] += SIZEOF_BLOCK;
	    if (Z)
	        iter->in[2] += SIZEOF_BLOCK;
	    if (W)
	        iter->in[3] += SIZEOF_BLOCK;

	    CONTINUE(x, y, z, w);
	}

	/* Packed read: de-interleave one block of pixels stored as consecutive
	 * groups of `elems` samples in plane 0. */
	DECL_READ(read_packed, const SwsCompMask mask)
	{
	    /* Samples per pixel, derived from the highest enabled component */
	    int elems = 1;
	    if (W)
	        elems = 4;
	    else if (Z)
	        elems = 3;
	    else if (Y)
	        elems = 2;

	    SWS_LOOP
	    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
	        const int base = elems * idx;
	        if (X)
	            x[idx] = in0[base + 0];
	        if (Y)
	            y[idx] = in0[base + 1];
	        if (Z)
	            z[idx] = in0[base + 2];
	        if (W)
	            w[idx] = in0[base + 3];
	    }

	    iter->in[0] += SIZEOF_BLOCK * elems;
	    CONTINUE(x, y, z, w);
	}

	/* Planar write: store one block of every enabled component to its own
	 * output plane and step those planes forward by one block. */
	DECL_WRITE(write_planar, const SwsCompMask mask)
	{
	    SWS_LOOP
	    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
	        if (X)
	            out0[idx] = x[idx];
	        if (Y)
	            out1[idx] = y[idx];
	        if (Z)
	            out2[idx] = z[idx];
	        if (W)
	            out3[idx] = w[idx];
	    }

	    /* Planes that were not written keep their current position */
	    if (X)
	        iter->out[0] += SIZEOF_BLOCK;
	    if (Y)
	        iter->out[1] += SIZEOF_BLOCK;
	    if (Z)
	        iter->out[2] += SIZEOF_BLOCK;
	    if (W)
	        iter->out[3] += SIZEOF_BLOCK;
	}

	/* Packed write: interleave one block of the enabled components into
	 * plane 0 as consecutive groups of `elems` samples. */
	DECL_WRITE(write_packed, const SwsCompMask mask)
	{
	    /* Samples per pixel, derived from the highest enabled component */
	    int elems = 1;
	    if (W)
	        elems = 4;
	    else if (Z)
	        elems = 3;
	    else if (Y)
	        elems = 2;

	    SWS_LOOP
	    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx++) {
	        const int base = elems * idx;
	        if (X)
	            out0[base + 0] = x[idx];
	        if (Y)
	            out0[base + 1] = y[idx];
	        if (Z)
	            out0[base + 2] = z[idx];
	        if (W)
	            out0[base + 3] = w[idx];
	    }

	    iter->out[0] += SIZEOF_BLOCK * elems;
	}

	#if BIT_DEPTH == 8

	/* Read 1-bit samples: every input byte expands to eight output samples of
	 * value 0 or 1, most significant bit first. Only the single-component
	 * mask is accepted (checked by the assertion). */
	DECL_READ(read_bit, const SwsCompMask mask)
	{
	av_assert2(mask == SWS_COMP_ELEMS(1));

	SWS_LOOP
	for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
	/* i >> 3 is the index of the byte holding samples i .. i + 7 */
	const pixel_t val = ((const pixel_t *) in0)[i >> 3];
	x[i + 0] = (val >> 7) & 1;
	x[i + 1] = (val >> 6) & 1;
	x[i + 2] = (val >> 5) & 1;
	x[i + 3] = (val >> 4) & 1;
	x[i + 4] = (val >> 3) & 1;
	x[i + 5] = (val >> 2) & 1;
	x[i + 6] = (val >> 1) & 1;
	x[i + 7] = (val >> 0) & 1;
	}

	/* Eight samples per byte, so only an eighth of a block was consumed */
	iter->in[0] += SIZEOF_BLOCK >> 3;
	CONTINUE(x, y, z, w);
	}

	/* Read 4-bit samples: every input byte yields two output samples, the
	 * high nibble first. Only the single-component mask is accepted. */
	DECL_READ(read_nibble, const SwsCompMask mask)
	{
	    av_assert2(mask == SWS_COMP_ELEMS(1));

	    SWS_LOOP
	    for (int idx = 0; idx < SWS_BLOCK_SIZE; idx += 2) {
	        const pixel_t pair = in0[idx >> 1];
	        x[idx + 0] = pair >> 4;  /* upper four bits */
	        x[idx + 1] = pair & 0xF; /* lower four bits */
	    }

	    /* Two samples per byte, so half a block of input was consumed */
	    iter->in[0] += SIZEOF_BLOCK >> 1;
	    CONTINUE(x, y, z, w);
	}

	/* Write 1-bit samples: eight consecutive samples are packed into one
	 * output byte, first sample in the most significant bit. The samples are
	 * not masked here, so they are assumed to already be 0 or 1. */
	DECL_WRITE(write_bit, const SwsCompMask mask)
	{
	    av_assert2(mask == SWS_COMP_ELEMS(1));

	    SWS_LOOP
	    for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
	        /* i >> 3 is the index of the byte receiving samples i .. i + 7 */
	        out0[i >> 3] = x[i + 0] << 7 |
	                       x[i + 1] << 6 |
	                       x[i + 2] << 5 |
	                       x[i + 3] << 4 |
	                       x[i + 4] << 3 |
	                       x[i + 5] << 2 |
	                       x[i + 6] << 1 |
	                       x[i + 7];
	    }

	    /* Eight samples per byte, so only an eighth of a block was produced */
	    iter->out[0] += SIZEOF_BLOCK >> 3;
	}

	/* Write 4-bit samples: two consecutive samples share one output byte, the
	 * first one in the high nibble. Only the single-component mask is
	 * accepted. */
	DECL_WRITE(write_nibble, const SwsCompMask mask)
	{
	    av_assert2(mask == SWS_COMP_ELEMS(1));

	    SWS_LOOP
	    for (int i = 0; i < SWS_BLOCK_SIZE; i += 2)
	        out0[i >> 1] = x[i] << 4 | x[i + 1];

	    /* Two samples per byte, so half a block of output was produced */
	    iter->out[0] += SIZEOF_BLOCK >> 1;
	}

	#endif /* BIT_DEPTH == 8 */

	/* Instantiate the read/write kernels above for the current pixel type.
	 * SWS_FOR, SWS_FOR_STRUCT, DECL_IMPL_READ/WRITE and DECL_ENTRY are defined
	 * outside this file. The NIBBLE and BIT kernels only exist when
	 * BIT_DEPTH == 8, so their lists are presumably empty for the other pixel
	 * types - TODO confirm against the list definitions. */
	SWS_FOR(PX, READ_PLANAR, DECL_IMPL_READ, read_planar)
	SWS_FOR(PX, READ_PACKED, DECL_IMPL_READ, read_packed)
	SWS_FOR(PX, READ_NIBBLE, DECL_IMPL_READ, read_nibble)
	SWS_FOR(PX, READ_BIT, DECL_IMPL_READ, read_bit)
	SWS_FOR(PX, WRITE_PLANAR, DECL_IMPL_WRITE, write_planar)
	SWS_FOR(PX, WRITE_PACKED, DECL_IMPL_WRITE, write_packed)
	SWS_FOR(PX, WRITE_NIBBLE, DECL_IMPL_WRITE, write_nibble)
	SWS_FOR(PX, WRITE_BIT, DECL_IMPL_WRITE, write_bit)

	/* Matching table entries, one per instantiated kernel */
	SWS_FOR_STRUCT(PX, READ_PLANAR, DECL_ENTRY)
	SWS_FOR_STRUCT(PX, READ_PACKED, DECL_ENTRY)
	SWS_FOR_STRUCT(PX, READ_NIBBLE, DECL_ENTRY)
	SWS_FOR_STRUCT(PX, READ_BIT, DECL_ENTRY)
	SWS_FOR_STRUCT(PX, WRITE_PLANAR, DECL_ENTRY)
	SWS_FOR_STRUCT(PX, WRITE_PACKED, DECL_ENTRY)
	SWS_FOR_STRUCT(PX, WRITE_NIBBLE, DECL_ENTRY)
	SWS_FOR_STRUCT(PX, WRITE_BIT, DECL_ENTRY)

	/*****************************
	* Scaling / filtering reads *
	*****************************/

	/* Setup for the vertical filter read: converts the integer kernel weights
	 * to float once, so the kernel can multiply directly. The converted array
	 * is stored in priv.ptr and the tap count in priv.i32[2].
	 *
	 * Returns 0 on success, AVERROR(ENOTSUP) for filter types other than F32
	 * and AVERROR(ENOMEM) if the weight array cannot be allocated. */
	DECL_SETUP(setup_filter_v, params, out)
	{
	    /* priv.i32[2] is used next to priv.ptr, so the pointer must not
	     * extend beyond the first two 32-bit slots */
	    static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]),
	                  ">8 byte pointers not supported");

	    if (params->uop->par.filter.type != SWS_PIXEL_F32)
	        return AVERROR(ENOTSUP);

	    const SwsFilterWeights *kernel = params->uop->data.kernel;

	    float *coeffs = av_calloc(kernel->num_weights, sizeof(*coeffs));
	    if (!coeffs)
	        return AVERROR(ENOMEM);

	    for (int n = 0; n < kernel->num_weights; n++) {
	        coeffs[n] = (float) kernel->weights[n] / SWS_FILTER_SCALE;
	    }

	    out->priv.ptr    = coeffs;
	    out->priv.i32[2] = kernel->filter_size;
	    out->free        = ff_op_priv_free;
	    return 0;
	}

	/*
	 * Fully general vertical planar filter case.
	 *
	 * Produces one block of float samples per enabled plane as the weighted
	 * sum of `filter_size` input rows. The float weights and the tap count are
	 * the ones stored by setup_filter_v() in priv.ptr / priv.i32[2].
	 */
	DECL_READ(read_planar_fv, const SwsCompMask mask, const SwsPixelType type)
	{
	av_assert2(type == SWS_PIXEL_F32);
	const SwsOpExec *exec = iter->exec;
	const float *restrict weights = impl->priv.ptr;
	const int filter_size = impl->priv.i32[2];
	/* Each output line has its own run of filter_size weights */
	weights += filter_size * iter->y;

	/* Accumulators start at zero; only the enabled ones are touched */
	block_t xs, ys, zs, ws;
	if (X) memset(&xs.f32, 0, sizeof(xs.f32));
	if (Y) memset(&ys.f32, 0, sizeof(ys.f32));
	if (Z) memset(&zs.f32, 0, sizeof(zs.f32));
	if (W) memset(&ws.f32, 0, sizeof(ws.f32));

	for (int j = 0; j < filter_size; j++) {
	const float weight = weights[j];

	SWS_LOOP
	for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
	if (X) xs.f32[i] += weight * in0[i];
	if (Y) ys.f32[i] += weight * in1[i];
	if (Z) zs.f32[i] += weight * in2[i];
	if (W) ws.f32[i] += weight * in3[i];
	}

	/* Step the local row pointers by the plane stride for the next tap;
	 * iter->in[] itself is not affected by this */
	if (X) in0 = bump_ptr(in0, exec->in_stride[0]);
	if (Y) in1 = bump_ptr(in1, exec->in_stride[1]);
	if (Z) in2 = bump_ptr(in2, exec->in_stride[2]);
	if (W) in3 = bump_ptr(in3, exec->in_stride[3]);
	}

	/* Horizontally, exactly one block of input was consumed per plane */
	if (X) iter->in[0] += SIZEOF_BLOCK;
	if (Y) iter->in[1] += SIZEOF_BLOCK;
	if (Z) iter->in[2] += SIZEOF_BLOCK;
	if (W) iter->in[3] += SIZEOF_BLOCK;

	CONTINUE(&xs, &ys, &zs, &ws);
	}

	/**
	 * Prepare the private data for the horizontal filter read.
	 *
	 * Rejects filters whose type is not SWS_PIXEL_F32 with AVERROR(ENOTSUP).
	 * Otherwise takes a new reference on the kernel's weight array (released
	 * again through ff_op_priv_unref) and records the filter size in
	 * out->priv.i32[2].
	 */
	DECL_SETUP(setup_filter_h, params, out)
	{
	    const int is_f32 = params->uop->par.filter.type == SWS_PIXEL_F32;
	    if (!is_f32)
	        return AVERROR(ENOTSUP);

	    SwsFilterWeights *kernel = params->uop->data.kernel;

	    out->priv.ptr    = av_refstruct_ref(kernel->weights);
	    out->priv.i32[2] = kernel->filter_size;
	    out->free        = ff_op_priv_unref;
	    return 0;
	}

	/**
	 * Fully general horizontal planar filter case.
	 *
	 * Output pixel i of the block is the dot product of `filter_size` integer
	 * weights with `filter_size` consecutive input pixels, located by applying
	 * exec->in_offset_x[iter->x + i] to the plane pointer via bump_ptr. The sum
	 * is accumulated in inter_t and then converted to float, scaled by
	 * 1 / SWS_FILTER_SCALE. The weight table holds one run of `filter_size`
	 * weights per output pixel.
	 */
	DECL_READ(read_planar_fh, const SwsCompMask mask, const SwsPixelType type)
	{
	    av_assert2(type == SWS_PIXEL_F32);
	    const SwsOpExec *exec = iter->exec;
	    const int *restrict weights = impl->priv.ptr;
	    const int filter_size = impl->priv.i32[2];
	    const float scale = 1.0f / SWS_FILTER_SCALE;
	    const int xpos = iter->x;
	    weights += filter_size * iter->x;

	    block_t xs, ys, zs, ws;
	    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
	        const int offset = exec->in_offset_x[xpos + i];
	        const pixel_t *start0 = bump_ptr(in0, offset);
	        const pixel_t *start1 = bump_ptr(in1, offset);
	        const pixel_t *start2 = bump_ptr(in2, offset);
	        const pixel_t *start3 = bump_ptr(in3, offset);

	        inter_t sx = 0, sy = 0, sz = 0, sw = 0;
	        for (int j = 0; j < filter_size; j++) {
	            /* Widen the weight to the accumulator type before multiplying.
	             * With 32-bit unsigned pixels an `int * uint32_t` product would
	             * be computed modulo 2^32, which breaks negative weights and
	             * truncates large products. */
	            const inter_t weight = weights[j];
	            if (X) sx += weight * start0[j];
	            if (Y) sy += weight * start1[j];
	            if (Z) sz += weight * start2[j];
	            if (W) sw += weight * start3[j];
	        }

	        if (X) xs.f32[i] = (float) sx * scale;
	        if (Y) ys.f32[i] = (float) sy * scale;
	        if (Z) zs.f32[i] = (float) sz * scale;
	        if (W) ws.f32[i] = (float) sw * scale;

	        /* Next output pixel uses the next run of weights */
	        weights += filter_size;
	    }

	    CONTINUE(&xs, &ys, &zs, &ws);
	}

	/*
	 * Instantiate the filtered planar reads for the current pixel type; their
	 * table entries name the setup function that prepares the weights.
	 * NOTE(review): SWS_FOR / SWS_FOR_STRUCT are defined outside this chunk -
	 * confirm how the trailing `.setup = ...` argument is consumed.
	 */
	SWS_FOR(PX, READ_PLANAR_FV, DECL_IMPL_READ, read_planar_fv)
	SWS_FOR(PX, READ_PLANAR_FH, DECL_IMPL_READ, read_planar_fh)
	SWS_FOR_STRUCT(PX, READ_PLANAR_FV, DECL_ENTRY, .setup = fn(setup_filter_v) )
	SWS_FOR_STRUCT(PX, READ_PLANAR_FH, DECL_ENTRY, .setup = fn(setup_filter_h) )

	/***************************
	* Permutation and copying *
	***************************/

	/*
	 * Permute by directly swapping the order of arguments to the continuation.
	 *
	 * Defines NAME##_c, which forwards in<IDX0>..in<IDX3> as the new
	 * x, y, z, w arguments without touching the data. DUMMY, TYPE, UOP and MASK
	 * are accepted for signature compatibility and are not used.
	 */
	#define DECL_PERMUTE(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3) \
	static void NAME##_c(SwsOpIter *restrict iter, \
	                     const SwsOpImpl *restrict impl, \
	                     void *restrict in0, void *restrict in1, \
	                     void *restrict in2, void *restrict in3) \
	{ \
	    CONTINUE(in##IDX0, in##IDX1, in##IDX2, in##IDX3); \
	}

	/*
	 * Like DECL_PERMUTE, but the components selected by MASK are copied into
	 * fresh local blocks (from in<IDXn>) before being passed on; components not
	 * selected by MASK are forwarded unchanged from their own slot.
	 * DUMMY, TYPE and UOP are accepted for signature compatibility and are not
	 * used.
	 */
	#define DECL_COPY(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3) \
	static void NAME##_c(SwsOpIter *restrict iter, \
	                     const SwsOpImpl *restrict impl, \
	                     void *restrict in0, void *restrict in1, \
	                     void *restrict in2, void *restrict in3) \
	{ \
	    const SwsCompMask mask = (MASK); \
	    block_t x, y, z, w; \
	    \
	    if (X) memcpy(&x.px, in##IDX0, SIZEOF_BLOCK); \
	    if (Y) memcpy(&y.px, in##IDX1, SIZEOF_BLOCK); \
	    if (Z) memcpy(&z.px, in##IDX2, SIZEOF_BLOCK); \
	    if (W) memcpy(&w.px, in##IDX3, SIZEOF_BLOCK); \
	    \
	    CONTINUE(X ? &x : in0, Y ? &y : in1, Z ? &z : in2, W ? &w : in3); \
	}

	/*
	 * Generate the permute and copy functions plus their table entries.
	 * DECL_PERMUTE / DECL_COPY take no function-name argument here because they
	 * define the complete NAME##_c function themselves.
	 */
	SWS_FOR(PX, PERMUTE, DECL_PERMUTE)
	SWS_FOR(PX, COPY, DECL_COPY)
	SWS_FOR_STRUCT(PX, PERMUTE, DECL_ENTRY)
	SWS_FOR_STRUCT(PX, COPY, DECL_ENTRY)

	/*********************
	* Format conversion *
	*********************/

	/*
	 * DECL_CAST(DST, dst) defines to_<dst>: every selected component is
	 * converted element by element from the current pixel type into the `dst`
	 * member of a new block, using the plain C conversion rules for the
	 * assignment. The macro also emits the matching implementations and table
	 * entries for the TO_<DST> operation.
	 */
	#define DECL_CAST(DST, dst) \
	    DECL_FUNC(to_##dst, const SwsCompMask mask) \
	    { \
	        block_t cx, cy, cz, cw; \
	        \
	        SWS_LOOP \
	        for (int n = 0; n < SWS_BLOCK_SIZE; n++) { \
	            if (X) cx.dst[n] = x[n]; \
	            if (Y) cy.dst[n] = y[n]; \
	            if (Z) cz.dst[n] = z[n]; \
	            if (W) cw.dst[n] = w[n]; \
	        } \
	        \
	        CONTINUE(&cx, &cy, &cz, &cw); \
	    } \
	    \
	    SWS_FOR(PX, TO_##DST, DECL_IMPL, to_##dst) \
	    SWS_FOR_STRUCT(PX, TO_##DST, DECL_ENTRY)

	/* One conversion per destination pixel type */
	DECL_CAST(U8,  u8)
	DECL_CAST(U16, u16)
	DECL_CAST(U32, u32)
	DECL_CAST(F32, f32)

	/********************
	* Bit manipulation *
	********************/

	#if !IS_FLOAT
	/**
	 * Shift every selected component left by `amount` bits, in place.
	 */
	DECL_FUNC(lshift, const SwsCompMask mask, const uint8_t amount)
	{
	    SWS_LOOP
	    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
	        if (X) x[n] = x[n] << amount;
	        if (Y) y[n] = y[n] << amount;
	        if (Z) z[n] = z[n] << amount;
	        if (W) w[n] = w[n] << amount;
	    }

	    CONTINUE(x, y, z, w);
	}

	/**
	 * Shift every selected component right by `amount` bits, in place.
	 */
	DECL_FUNC(rshift, const SwsCompMask mask, const uint8_t amount)
	{
	    SWS_LOOP
	    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
	        if (X) x[n] = x[n] >> amount;
	        if (Y) y[n] = y[n] >> amount;
	        if (Z) z[n] = z[n] >> amount;
	        if (W) w[n] = w[n] >> amount;
	    }

	    CONTINUE(x, y, z, w);
	}
	#endif /* !IS_FLOAT */

	/*
	 * Implementations and table entries for the shift operations.
	 * NOTE(review): these sit outside the !IS_FLOAT guard around lshift/rshift;
	 * presumably SWS_FOR expands to nothing for the float pixel type - confirm.
	 */
	SWS_FOR(PX, LSHIFT, DECL_IMPL, lshift)
	SWS_FOR(PX, RSHIFT, DECL_IMPL, rshift)

	SWS_FOR_STRUCT(PX, LSHIFT, DECL_ENTRY)
	SWS_FOR_STRUCT(PX, RSHIFT, DECL_ENTRY)

	#ifdef PIXEL_SWAP
	/**
	 * Apply PIXEL_SWAP to every element of the selected components, in place.
	 * Only compiled for pixel types that define PIXEL_SWAP.
	 */
	DECL_FUNC(swap_bytes, const SwsCompMask mask)
	{
	    SWS_LOOP
	    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
	        if (X)
	            x[n] = PIXEL_SWAP(x[n]);
	        if (Y)
	            y[n] = PIXEL_SWAP(y[n]);
	        if (Z)
	            z[n] = PIXEL_SWAP(z[n]);
	        if (W)
	            w[n] = PIXEL_SWAP(w[n]);
	    }

	    CONTINUE(x, y, z, w);
	}
	#endif /* PIXEL_SWAP */

	/* Implementation and table entry for the byte swap operation.
	 * NOTE(review): placed outside the PIXEL_SWAP guard; presumably a no-op for
	 * pixel types without PIXEL_SWAP - confirm against SWS_FOR. */
	SWS_FOR(PX, SWAP_BYTES, DECL_IMPL, swap_bytes)
	SWS_FOR_STRUCT(PX, SWAP_BYTES, DECL_ENTRY)

	#ifdef PIXEL_MAX
	/**
	 * Turn each element of the selected components into a full-range value:
	 * any non-zero input becomes PIXEL_MAX, zero stays zero.
	 */
	DECL_FUNC(expand_bit, const SwsCompMask mask)
	{
	    SWS_LOOP
	    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
	        if (X) x[n] = (x[n] != 0) ? PIXEL_MAX : 0;
	        if (Y) y[n] = (y[n] != 0) ? PIXEL_MAX : 0;
	        if (Z) z[n] = (z[n] != 0) ? PIXEL_MAX : 0;
	        if (W) w[n] = (w[n] != 0) ? PIXEL_MAX : 0;
	    }

	    CONTINUE(x, y, z, w);
	}
	#endif /* PIXEL_MAX */

	#if BIT_DEPTH == 8
	/**
	 * Widen 8-bit values to 16 bits by replicating the byte into both halves
	 * (0xAB -> 0xABAB). Results are written to the u16 view of new blocks.
	 */
	DECL_FUNC(expand_pair, const SwsCompMask mask)
	{
	    block_t x16, y16, z16, w16;

	    SWS_LOOP
	    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
	        if (X) x16.u16[i] = (x[i] << 8) | x[i];
	        if (Y) y16.u16[i] = (y[i] << 8) | y[i];
	        if (Z) z16.u16[i] = (z[i] << 8) | z[i];
	        if (W) w16.u16[i] = (w[i] << 8) | w[i];
	    }

	    CONTINUE(&x16, &y16, &z16, &w16);
	}

	/**
	 * Widen 8-bit values to 32 bits by replicating the byte into all four
	 * bytes (0xAB -> 0xABABABAB). Results are written to the u32 view of new
	 * blocks. The top byte is shifted as uint32_t so that the shift by 24 never
	 * reaches the sign bit of a promoted int.
	 */
	DECL_FUNC(expand_quad, const SwsCompMask mask)
	{
	    block_t x32, y32, z32, w32;

	    SWS_LOOP
	    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
	        if (X) x32.u32[i] = ((uint32_t) x[i] << 24) | (x[i] << 16) | (x[i] << 8) | x[i];
	        if (Y) y32.u32[i] = ((uint32_t) y[i] << 24) | (y[i] << 16) | (y[i] << 8) | y[i];
	        if (Z) z32.u32[i] = ((uint32_t) z[i] << 24) | (z[i] << 16) | (z[i] << 8) | z[i];
	        if (W) w32.u32[i] = ((uint32_t) w[i] << 24) | (w[i] << 16) | (w[i] << 8) | w[i];
	    }

	    CONTINUE(&x32, &y32, &z32, &w32);
	}
	#endif /* BIT_DEPTH == 8 */

	/* Implementations and table entries for the expand operations.
	 * NOTE(review): expand_pair / expand_quad only exist for BIT_DEPTH == 8 and
	 * expand_bit only where PIXEL_MAX is defined; presumably SWS_FOR yields
	 * nothing for the other pixel types - confirm. */
	SWS_FOR(PX, EXPAND_BIT, DECL_IMPL, expand_bit)
	SWS_FOR(PX, EXPAND_PAIR, DECL_IMPL, expand_pair)
	SWS_FOR(PX, EXPAND_QUAD, DECL_IMPL, expand_quad)
	SWS_FOR_STRUCT(PX, EXPAND_BIT, DECL_ENTRY)
	SWS_FOR_STRUCT(PX, EXPAND_PAIR, DECL_ENTRY)
	SWS_FOR_STRUCT(PX, EXPAND_QUAD, DECL_ENTRY)

	/*************************
	* Packing and unpacking *
	************************/

	#if !IS_FLOAT
	/**
	 * Split the packed value held in `x` into up to four bit fields.
	 *
	 * bx, by, bz and bw are the field widths in bits. The w field occupies the
	 * least significant bits, followed by z, y and x towards the most
	 * significant end. Only the components selected by `mask` are written.
	 */
	DECL_FUNC(unpack, const SwsCompMask mask,
	          const uint8_t bx, const uint8_t by,
	          const uint8_t bz, const uint8_t bw)
	{
	    const uint8_t sx = bw + bz + by;
	    const uint8_t sy = bw + bz;
	    const uint8_t sz = bw;
	    const uint8_t sw = 0;

	    /* Build the masks from a pixel_t one so that, for 32-bit pixels, the
	     * shift is unsigned and cannot run into the sign bit of int. */
	    const pixel_t mx = ((pixel_t) 1 << bx) - 1;
	    const pixel_t my = ((pixel_t) 1 << by) - 1;
	    const pixel_t mz = ((pixel_t) 1 << bz) - 1;
	    const pixel_t mw = ((pixel_t) 1 << bw) - 1;

	    SWS_LOOP
	    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
	        /* Read the packed value first; x[i] is overwritten below */
	        const pixel_t val = x[i];
	        if (X) x[i] = (val >> sx) & mx;
	        if (Y) y[i] = (val >> sy) & my;
	        if (Z) z[i] = (val >> sz) & mz;
	        if (W) w[i] = (val >> sw) & mw;
	    }

	    CONTINUE(x, y, z, w);
	}

	/**
	 * Inverse of unpack: combine the selected components into one packed value
	 * per element and store it in `x`, using the same field layout (w in the
	 * least significant bits, x in the most significant ones). Inputs are not
	 * masked to their field width before being merged.
	 */
	DECL_FUNC(pack, const SwsCompMask mask,
	          const uint8_t bx, const uint8_t by,
	          const uint8_t bz, const uint8_t bw)
	{
	    const uint8_t sx = bw + bz + by;
	    const uint8_t sy = bw + bz;
	    const uint8_t sz = bw;
	    const uint8_t sw = 0;

	    SWS_LOOP
	    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
	        pixel_t val = 0;
	        if (X) val |= x[i] << sx;
	        if (Y) val |= y[i] << sy;
	        if (Z) val |= z[i] << sz;
	        if (W) val |= w[i] << sw;
	        x[i] = val;
	    }

	    CONTINUE(x, y, z, w);
	}
	#endif /* !IS_FLOAT */

	/* Implementations and table entries for unpack / pack.
	 * NOTE(review): the functions are only defined when !IS_FLOAT; presumably
	 * SWS_FOR yields nothing for the float pixel type - confirm. */
	SWS_FOR(PX, UNPACK, DECL_IMPL, unpack)
	SWS_FOR(PX, PACK, DECL_IMPL, pack)
	SWS_FOR_STRUCT(PX, UNPACK, DECL_ENTRY)
	SWS_FOR_STRUCT(PX, PACK, DECL_ENTRY)

	/***********************
	* Pixel data clearing *
	***********************/

	#ifdef PIXEL_MAX
	/**
	 * Overwrite the selected components with a constant.
	 *
	 * For component N the constant is PIXEL_MAX when N is set in `one`, 0 when
	 * N is set in `zero` (with `one` taking precedence), and impl->priv.px[N]
	 * otherwise.
	 */
	DECL_FUNC(clear, const SwsCompMask mask, const SwsCompMask one,
	          const SwsCompMask zero)
	{
	#define ONE(N) SWS_COMP_TEST(one, N)
	#define ZERO(N) SWS_COMP_TEST(zero, N)
	    const pixel_t fill_x = ONE(0) ? PIXEL_MAX : (ZERO(0) ? 0 : impl->priv.px[0]);
	    const pixel_t fill_y = ONE(1) ? PIXEL_MAX : (ZERO(1) ? 0 : impl->priv.px[1]);
	    const pixel_t fill_z = ONE(2) ? PIXEL_MAX : (ZERO(2) ? 0 : impl->priv.px[2]);
	    const pixel_t fill_w = ONE(3) ? PIXEL_MAX : (ZERO(3) ? 0 : impl->priv.px[3]);

	    SWS_LOOP
	    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
	        if (X) x[n] = fill_x;
	        if (Y) y[n] = fill_y;
	        if (Z) z[n] = fill_z;
	        if (W) w[n] = fill_w;
	    }

	    CONTINUE(x, y, z, w);
	}
	#endif /* PIXEL_MAX */

	/* Implementation and table entry for clear; the entry names
	 * ff_sws_setup_vec4 as its setup function (defined outside this chunk). */
	SWS_FOR(PX, CLEAR, DECL_IMPL, clear)
	SWS_FOR_STRUCT(PX, CLEAR, DECL_ENTRY, .setup = ff_sws_setup_vec4)

	/*************************
	* Arithmetic operations *
	*************************/

	/**
	 * Multiply every selected component by the single factor stored in
	 * impl->priv.px[0], in place.
	 */
	DECL_FUNC(scale, const SwsCompMask mask)
	{
	    const pixel_t factor = impl->priv.px[0];

	    SWS_LOOP
	    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
	        if (X) x[n] = x[n] * factor;
	        if (Y) y[n] = y[n] * factor;
	        if (Z) z[n] = z[n] * factor;
	        if (W) w[n] = w[n] * factor;
	    }

	    CONTINUE(x, y, z, w);
	}

	/**
	 * Add a per-component constant (impl->priv.px[0..3]) to every element of
	 * the selected components, in place.
	 */
	DECL_FUNC(add, const SwsCompMask mask)
	{
	    const pixel_t kx = impl->priv.px[0];
	    const pixel_t ky = impl->priv.px[1];
	    const pixel_t kz = impl->priv.px[2];
	    const pixel_t kw = impl->priv.px[3];

	    SWS_LOOP
	    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
	        if (X) x[n] += kx;
	        if (Y) y[n] += ky;
	        if (Z) z[n] += kz;
	        if (W) w[n] += kw;
	    }

	    CONTINUE(x, y, z, w);
	}

	/**
	 * Clamp every element of the selected components from above: each value is
	 * replaced by FFMIN(value, impl->priv.px[N]) for its component N.
	 */
	DECL_FUNC(min, const SwsCompMask mask)
	{
	    const pixel_t lim_x = impl->priv.px[0];
	    const pixel_t lim_y = impl->priv.px[1];
	    const pixel_t lim_z = impl->priv.px[2];
	    const pixel_t lim_w = impl->priv.px[3];

	    SWS_LOOP
	    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
	        if (X) x[n] = FFMIN(x[n], lim_x);
	        if (Y) y[n] = FFMIN(y[n], lim_y);
	        if (Z) z[n] = FFMIN(z[n], lim_z);
	        if (W) w[n] = FFMIN(w[n], lim_w);
	    }

	    CONTINUE(x, y, z, w);
	}

	/**
	 * Clamp every element of the selected components from below: each value is
	 * replaced by FFMAX(value, impl->priv.px[N]) for its component N.
	 */
	DECL_FUNC(max, const SwsCompMask mask)
	{
	    const pixel_t lim_x = impl->priv.px[0];
	    const pixel_t lim_y = impl->priv.px[1];
	    const pixel_t lim_z = impl->priv.px[2];
	    const pixel_t lim_w = impl->priv.px[3];

	    SWS_LOOP
	    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
	        if (X) x[n] = FFMAX(x[n], lim_x);
	        if (Y) y[n] = FFMAX(y[n], lim_y);
	        if (Z) z[n] = FFMAX(z[n], lim_z);
	        if (W) w[n] = FFMAX(w[n], lim_w);
	    }

	    CONTINUE(x, y, z, w);
	}

	/*
	 * Implementations and table entries for the arithmetic operations. scale
	 * reads one constant (priv.px[0]) and is paired with ff_sws_setup_scalar;
	 * add / min / max read priv.px[0..3] and are paired with ff_sws_setup_vec4.
	 */
	SWS_FOR(PX, SCALE, DECL_IMPL, scale)
	SWS_FOR(PX, ADD, DECL_IMPL, add)
	SWS_FOR(PX, MIN, DECL_IMPL, min)
	SWS_FOR(PX, MAX, DECL_IMPL, max)
	SWS_FOR_STRUCT(PX, SCALE, DECL_ENTRY, .setup = ff_sws_setup_scalar )
	SWS_FOR_STRUCT(PX, ADD, DECL_ENTRY, .setup = ff_sws_setup_vec4 )
	SWS_FOR_STRUCT(PX, MIN, DECL_ENTRY, .setup = ff_sws_setup_vec4 )
	SWS_FOR_STRUCT(PX, MAX, DECL_ENTRY, .setup = ff_sws_setup_vec4 )

	/*************
	* Dithering *
	*************/

	/**
	 * Prepare the dither matrix used by the dither kernel.
	 *
	 * When the matrix is at least one block wide, a new reference to the
	 * existing data is stored and released later via ff_op_priv_unref.
	 * Otherwise a private copy is allocated in which every row is repeated
	 * horizontally until it is SWS_BLOCK_SIZE entries wide, so the kernel can
	 * always read a full block from a row. Returns AVERROR(ENOMEM) if that copy
	 * cannot be allocated.
	 */
	DECL_SETUP(setup_dither, params, out)
	{
	    const SwsUOp *uop = params->uop;
	    const SwsDitherUOp *dither = &uop->par.dither;
	    const int size = 1 << dither->size_log2;
	    if (size >= SWS_BLOCK_SIZE) {
	        /* No extra padding needed.
	         * NOTE(review): the shared array is later indexed as pixel_t by the
	         * kernel, while the copy path below reads its elements via `.px`;
	         * presumably both layouts agree for the types reaching this path -
	         * confirm. */
	        out->priv.ptr = av_refstruct_ref(uop->data.ptr);
	        out->free = ff_op_priv_unref;
	        return 0;
	    }

	    const int stride = FFMAX(size, SWS_BLOCK_SIZE);
	    const int height = ff_sws_dither_height(dither);
	    pixel_t *matrix = av_malloc_array((size_t) height * stride, sizeof(*matrix));
	    if (!matrix)
	        return AVERROR(ENOMEM);
	    out->priv.ptr = matrix;
	    out->free = ff_op_priv_free;

	    /* Pad to multiple of block size. We don't need extra padding for the
	     * height because ff_sws_dither_height() already includes any padding
	     * necessary for the y_offset */
	    for (int y = 0; y < height; y++) {
	        pixel_t *row = &matrix[y * stride];
	        for (int x = 0; x < size; x++)
	            row[x] = uop->data.ptr[y * size + x].px;
	        /* Repeat the row contents to fill the rest of the block */
	        for (int x = size; x < stride; x++)
	            row[x] = row[x % size];
	    }

	    return 0;
	}

	/**
	 * Add a dither pattern to the selected components, in place.
	 *
	 * The matrix prepared by the setup function is addressed with a row stride
	 * of FFMAX(size, SWS_BLOCK_SIZE) entries, where size = 1 << size_log2. The
	 * starting row is iter->y modulo size; the starting column is iter->x
	 * modulo size, rounded down to a multiple of the block size. off0..off3 are
	 * additional row offsets, one per component.
	 */
	DECL_FUNC(dither, const SwsCompMask mask,
	          const uint8_t off0, const uint8_t off1,
	          const uint8_t off2, const uint8_t off3,
	          const uint8_t size_log2)
	{
	    const int size = 1 << size_log2;
	    const int stride = FFMAX(size, SWS_BLOCK_SIZE);

	    const pixel_t *matrix = impl->priv.ptr;
	    matrix += (iter->y & (size - 1)) * stride;
	    matrix += (iter->x & (size - 1)) & ~(SWS_BLOCK_SIZE - 1);

	    const pixel_t *const row0 = &matrix[off0 * stride];
	    const pixel_t *const row1 = &matrix[off1 * stride];
	    const pixel_t *const row2 = &matrix[off2 * stride];
	    const pixel_t *const row3 = &matrix[off3 * stride];

	    SWS_LOOP
	    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
	        if (X) x[i] += row0[i];
	        if (Y) y[i] += row1[i];
	        if (Z) z[i] += row2[i];
	        if (W) w[i] += row3[i];
	    }

	    CONTINUE(x, y, z, w);
	}

	/* Implementation and table entry for dither; the entry names this
	 * template's setup_dither as its setup function. */
	SWS_FOR(PX, DITHER, DECL_IMPL, dither)
	SWS_FOR_STRUCT(PX, DITHER, DECL_ENTRY, .setup = fn(setup_dither) )

	/*********************
	* Linear operations *
	*********************/

	/* Coefficients of a linear operation, held in the pixel type of the current
	 * template instance. */
	typedef struct {
	/* Stored in split form for convenience */
	/* m[i][j]: factor applied to input component j when computing output i */
	pixel_t m[4][4];
	/* k[i]: constant term of output i */
	pixel_t k[4];
	} fn(LinCoeffs);

	DECL_SETUP(setup_linear, params, out)
	{
	const SwsUOp *uop = params->uop;
	fn(LinCoeffs) c;

	for (int i = 0; i < 4; i++) {
	for (int j = 0; j < 4; j++)
	c.m[i][j] = uop->data.mat4[i][j].px;
	c.k[i] = uop->data.mat4[i][4].px;
	}

	out->priv.ptr = av_memdup(&c, sizeof(c));
	out->free = ff_op_priv_free;
	return out->priv.ptr ? 0 : AVERROR(ENOMEM);
	}

	/**
	 * Fully general case for a linear affine transformation: a 4x4 matrix plus
	 * a constant offset per output (the fifth column, index 4). Should never be
	 * called without constant `mask`. This function will compile down to the
	 * appropriately optimized version for the required subset of operations when
	 * called with a constant mask.
	 *
	 * `zero` and `one` are bit sets indexed with SWS_MASK(row, column):
	 *  - a bit set in `zero` means the coefficient contributes nothing, so the
	 *    term (or the offset, for column 4) is skipped;
	 *  - a bit set in `one` means the coefficient is applied as 1, so the input
	 *    is added without a multiplication.
	 */
	DECL_FUNC(linear, const SwsCompMask mask, const uint32_t one, const uint32_t zero)
	{
	    /* Work on a local copy of the coefficients stored by the setup function */
	    const fn(LinCoeffs) c = *(const fn(LinCoeffs) *) impl->priv.ptr;

	    SWS_LOOP
	    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
	        /* Snapshot the inputs: the rows below overwrite x/y/z/w in place */
	        const pixel_t xx = x[i];
	        const pixel_t yy = y[i];
	        const pixel_t zz = z[i];
	        const pixel_t ww = w[i];

	#define LIN_VAL(I, J, val) \
	    ((one & SWS_MASK(I, J)) ? (val) : c.m[I][J] * (val))

	#define LIN_ROW(I, var) do { \
	    var[i] = (zero & SWS_MASK(I, 4)) ? 0 : c.k[I]; \
	    if (!(zero & SWS_MASK(I, 0))) var[i] += LIN_VAL(I, 0, xx); \
	    if (!(zero & SWS_MASK(I, 1))) var[i] += LIN_VAL(I, 1, yy); \
	    if (!(zero & SWS_MASK(I, 2))) var[i] += LIN_VAL(I, 2, zz); \
	    if (!(zero & SWS_MASK(I, 3))) var[i] += LIN_VAL(I, 3, ww); \
	} while (0)

	        if (X) LIN_ROW(0, x);
	        if (Y) LIN_ROW(1, y);
	        if (Z) LIN_ROW(2, z);
	        if (W) LIN_ROW(3, w);
	    }

	    CONTINUE(x, y, z, w);
	}

	/* Implementation and table entry for linear; the entry names this
	 * template's setup_linear as its setup function. */
	SWS_FOR(PX, LINEAR, DECL_IMPL, linear)
	SWS_FOR_STRUCT(PX, LINEAR, DECL_ENTRY, .setup = fn(setup_linear) )

	/*
	 * Drop the per-pixel-type macros selected at the top of this template,
	 * presumably so that it can be included again with a different BIT_DEPTH /
	 * IS_FLOAT.
	 * NOTE(review): PIXEL_TYPE (defined for the F32 variant) is not undefined in
	 * the lines visible here - confirm whether that is intentional.
	 */
	#undef PIXEL_MAX
	#undef PIXEL_SWAP
	#undef pixel_t
	#undef inter_t
	#undef block_t
	#undef PX
	#undef px