libswscale/ops_optimizer.c - third_party/ffmpeg - Git at Google

 /**
  * Copyright (C) 2025 Niklas Haas
  *
  * This file is part of FFmpeg.
  *
  * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */

 #include "libavutil/avassert.h"
 #include "libavutil/bswap.h"
 #include "libavutil/rational.h"

 #include "ops.h"
 #include "ops_internal.h"

 /* Evaluate x and store the result in a variable named `ret`, which must
  * already exist in the calling scope. If the result is negative (presumably
  * an error code), return it from the enclosing function right away. */
 #define RET(x)                                                                 \
     do {                                                                       \
         if ((ret = (x)) < 0)                                                   \
             return ret;                                                        \
     } while (0)

 /* Tells whether an operation type is independent per channel. Operations of
  * this kind can usually be commuted freely with other such operations. */
 static bool op_type_is_independent(SwsOpType op)
 {
     switch (op) {
     /* Not independent per channel (or not a real operation at all) */
     case SWS_OP_INVALID:
     case SWS_OP_READ:
     case SWS_OP_WRITE:
     case SWS_OP_SWIZZLE:
     case SWS_OP_CLEAR:
     case SWS_OP_LINEAR:
     case SWS_OP_PACK:
     case SWS_OP_UNPACK:
         return false;

     /* Independent per channel */
     case SWS_OP_SWAP_BYTES:
     case SWS_OP_LSHIFT:
     case SWS_OP_RSHIFT:
     case SWS_OP_CONVERT:
     case SWS_OP_DITHER:
     case SWS_OP_MIN:
     case SWS_OP_MAX:
     case SWS_OP_SCALE:
         return true;

     /* Enum terminator, never a valid operation */
     case SWS_OP_TYPE_NB:
         break;
     }

     av_unreachable("Invalid operation type!");
     return false;
 }

 /* Combines the flags of two components that both feed into one result.
  * SWS_COMP_ZERO and SWS_COMP_EXACT survive only if set on both inputs, while
  * SWS_COMP_GARBAGE survives if set on either one. flags_identity is the
  * identity element of this merge, i.e. the value to start a fold from. */
 static const unsigned flags_identity = SWS_COMP_ZERO | SWS_COMP_EXACT;
 static unsigned merge_comp_flags(unsigned a, unsigned b)
 {
     const unsigned in_both   = a & b & (SWS_COMP_ZERO | SWS_COMP_EXACT);
     const unsigned in_either = (a | b) & SWS_COMP_GARBAGE;

     return in_both | in_either;
 }

 /**
  * Infer and propagate known information about the components of every
  * operation in the list.
  *
  * A forwards pass derives the per-component flags and the min/max value
  * bounds of each operation from those of the operation before it, starting
  * from "all garbage". A backwards pass then derives the `unused` markers of
  * each operation from those of the operation after it, starting from
  * "all unused" past the end of the list.
  *
  * @param ops operation list; the `comps` member of every op is overwritten
  */
 void ff_sws_op_list_update_comps(SwsOpList *ops)
 {
     SwsComps next = { .unused = {true, true, true, true} };
     SwsComps prev = { .flags = {
         SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, SWS_COMP_GARBAGE,
     }};

     /* Forwards pass, propagates knowledge about the incoming pixel values */
     for (int n = 0; n < ops->num_ops; n++) {
         SwsOp *op = &ops->ops[n];

         /* Prefill min/max values automatically; may have to be fixed in
          * special cases */
         memcpy(op->comps.min, prev.min, sizeof(prev.min));
         memcpy(op->comps.max, prev.max, sizeof(prev.max));

         if (op->op != SWS_OP_SWAP_BYTES) {
             ff_sws_apply_op_q(op, op->comps.min);
             ff_sws_apply_op_q(op, op->comps.max);
         }

         switch (op->op) {
         case SWS_OP_READ:
             for (int i = 0; i < op->rw.elems; i++) {
                 if (ff_sws_pixel_type_is_int(op->type)) {
                     int bits = 8 * ff_sws_pixel_type_size(op->type);
                     if (!op->rw.packed && ops->src.desc) {
                         /* Use legal value range from pixdesc if available;
                          * we don't need to do this for packed formats because
                          * non-byte-aligned packed formats will necessarily go
                          * through SWS_OP_UNPACK anyway */
                         for (int c = 0; c < 4; c++) {
                             if (ops->src.desc->comp[c].plane == i) {
                                 bits = ops->src.desc->comp[c].depth;
                                 break;
                             }
                         }
                     }

                     op->comps.flags[i] = SWS_COMP_EXACT;
                     op->comps.min[i] = Q(0);
                     op->comps.max[i] = Q((1ULL << bits) - 1);
                 }
             }
             /* Components not touched by the read keep their previous state */
             for (int i = op->rw.elems; i < 4; i++)
                 op->comps.flags[i] = prev.flags[i];
             break;
         case SWS_OP_WRITE:
             for (int i = 0; i < op->rw.elems; i++)
                 av_assert1(!(prev.flags[i] & SWS_COMP_GARBAGE));
             /* fall through */
         case SWS_OP_SWAP_BYTES:
         case SWS_OP_LSHIFT:
         case SWS_OP_RSHIFT:
         case SWS_OP_MIN:
         case SWS_OP_MAX:
             /* Linearly propagate flags per component */
             for (int i = 0; i < 4; i++)
                 op->comps.flags[i] = prev.flags[i];
             break;
         case SWS_OP_DITHER:
             /* Strip zero flag because of the nonzero dithering offset */
             for (int i = 0; i < 4; i++)
                 op->comps.flags[i] = prev.flags[i] & ~SWS_COMP_ZERO;
             break;
         case SWS_OP_UNPACK:
             /* Every unpacked component inherits from the packed input */
             for (int i = 0; i < 4; i++) {
                 if (op->pack.pattern[i])
                     op->comps.flags[i] = prev.flags[0];
                 else
                     op->comps.flags[i] = SWS_COMP_GARBAGE;
             }
             break;
         case SWS_OP_PACK: {
             /* The packed output merges the flags of all packed inputs */
             unsigned flags = flags_identity;
             for (int i = 0; i < 4; i++) {
                 if (op->pack.pattern[i])
                     flags = merge_comp_flags(flags, prev.flags[i]);
                 if (i > 0) /* clear remaining comps for sanity */
                     op->comps.flags[i] = SWS_COMP_GARBAGE;
             }
             op->comps.flags[0] = flags;
             break;
         }
         case SWS_OP_CLEAR:
             for (int i = 0; i < 4; i++) {
                 if (op->c.q4[i].den) {
                     /* The component becomes a known constant. Always assign
                      * its flags, including for a nonzero fractional value
                      * (no flags at all), so that nothing stale survives
                      * from an earlier pass or from a swapped operation. */
                     unsigned flags = 0;
                     if (op->c.q4[i].num == 0)
                         flags = SWS_COMP_ZERO | SWS_COMP_EXACT;
                     else if (op->c.q4[i].den == 1)
                         flags = SWS_COMP_EXACT;
                     op->comps.flags[i] = flags;
                 } else {
                     op->comps.flags[i] = prev.flags[i];
                 }
             }
             break;
         case SWS_OP_SWIZZLE:
             for (int i = 0; i < 4; i++)
                 op->comps.flags[i] = prev.flags[op->swizzle.in[i]];
             break;
         case SWS_OP_CONVERT:
             for (int i = 0; i < 4; i++) {
                 op->comps.flags[i] = prev.flags[i];
                 if (ff_sws_pixel_type_is_int(op->convert.to))
                     op->comps.flags[i] |= SWS_COMP_EXACT;
             }
             break;
         case SWS_OP_LINEAR:
             for (int i = 0; i < 4; i++) {
                 unsigned flags = flags_identity;
                 AVRational min = Q(0), max = Q(0);
                 for (int j = 0; j < 4; j++) {
                     const AVRational k = op->lin.m[i][j];
                     AVRational mink = av_mul_q(prev.min[j], k);
                     AVRational maxk = av_mul_q(prev.max[j], k);
                     if (k.num) {
                         flags = merge_comp_flags(flags, prev.flags[j]);
                         if (k.den != 1) /* fractional coefficient */
                             flags &= ~SWS_COMP_EXACT;
                         if (k.num < 0)
                             FFSWAP(AVRational, mink, maxk);
                         min = av_add_q(min, mink);
                         max = av_add_q(max, maxk);
                     }
                 }
                 if (op->lin.m[i][4].num) { /* nonzero offset */
                     flags &= ~SWS_COMP_ZERO;
                     if (op->lin.m[i][4].den != 1) /* fractional offset */
                         flags &= ~SWS_COMP_EXACT;
                     min = av_add_q(min, op->lin.m[i][4]);
                     max = av_add_q(max, op->lin.m[i][4]);
                 }
                 op->comps.flags[i] = flags;
                 op->comps.min[i] = min;
                 op->comps.max[i] = max;
             }
             break;
         case SWS_OP_SCALE:
             for (int i = 0; i < 4; i++) {
                 op->comps.flags[i] = prev.flags[i];
                 if (op->c.q.den != 1) /* fractional scale */
                     op->comps.flags[i] &= ~SWS_COMP_EXACT;
                 /* A negative factor flips the order of the bounds */
                 if (op->c.q.num < 0)
                     FFSWAP(AVRational, op->comps.min[i], op->comps.max[i]);
             }
             break;

         case SWS_OP_INVALID:
         case SWS_OP_TYPE_NB:
             av_unreachable("Invalid operation type!");
         }

         prev = op->comps;
     }

     /* Backwards pass, solves for component dependencies */
     for (int n = ops->num_ops - 1; n >= 0; n--) {
         SwsOp *op = &ops->ops[n];

         switch (op->op) {
         case SWS_OP_READ:
         case SWS_OP_WRITE:
             /* Marked unused for the components a read covers, and used for
              * the components a write covers */
             for (int i = 0; i < op->rw.elems; i++)
                 op->comps.unused[i] = op->op == SWS_OP_READ;
             for (int i = op->rw.elems; i < 4; i++)
                 op->comps.unused[i] = next.unused[i];
             break;
         case SWS_OP_SWAP_BYTES:
         case SWS_OP_LSHIFT:
         case SWS_OP_RSHIFT:
         case SWS_OP_CONVERT:
         case SWS_OP_DITHER:
         case SWS_OP_MIN:
         case SWS_OP_MAX:
         case SWS_OP_SCALE:
             for (int i = 0; i < 4; i++)
                 op->comps.unused[i] = next.unused[i];
             break;
         case SWS_OP_UNPACK: {
             /* The packed input is unused only if all unpacked comps are */
             bool unused = true;
             for (int i = 0; i < 4; i++) {
                 if (op->pack.pattern[i])
                     unused &= next.unused[i];
                 op->comps.unused[i] = i > 0;
             }
             op->comps.unused[0] = unused;
             break;
         }
         case SWS_OP_PACK:
             for (int i = 0; i < 4; i++) {
                 if (op->pack.pattern[i])
                     op->comps.unused[i] = next.unused[0];
                 else
                     op->comps.unused[i] = true;
             }
             break;
         case SWS_OP_CLEAR:
             for (int i = 0; i < 4; i++) {
                 if (op->c.q4[i].den)
                     op->comps.unused[i] = true;
                 else
                     op->comps.unused[i] = next.unused[i];
             }
             break;
         case SWS_OP_SWIZZLE: {
             /* An input is unused only if every output it feeds is unused */
             bool unused[4] = { true, true, true, true };
             for (int i = 0; i < 4; i++)
                 unused[op->swizzle.in[i]] &= next.unused[i];
             for (int i = 0; i < 4; i++)
                 op->comps.unused[i] = unused[i];
             break;
         }
         case SWS_OP_LINEAR:
             /* Input j is unused if every row it contributes to is unused */
             for (int j = 0; j < 4; j++) {
                 bool unused = true;
                 for (int i = 0; i < 4; i++) {
                     if (op->lin.m[i][j].num)
                         unused &= next.unused[i];
                 }
                 op->comps.unused[j] = unused;
             }
             break;
         }

         next = op->comps;
     }
 }

 /* Base-2 logarithm of x if x is an exact power of two, and 0 for every
  * other input (including x <= 0). Note that x == 1 also yields 0. */
 static int exact_log2(const int x)
 {
     if (x > 0) {
         const int shift = av_log2(x);
         if (x == (1 << shift))
             return shift;
     }

     return 0;
 }

 /* Rational variant of exact_log2(): handles x = n/1 (result of exact_log2(n))
  * and x = 1/d (negated result of exact_log2(d)); anything else gives 0. */
 static int exact_log2_q(const AVRational x)
 {
     if (x.den == 1)
         return exact_log2(x.num);

     if (x.num == 1)
         return -exact_log2(x.den);

     return 0;
 }

 /**
  * Check whether a linear operation can be reduced to multiplying every
  * relevant component by one common factor.
  *
  * Components flagged as zero in `prev`, or marked unused in `next`, are
  * ignored when comparing the diagonal coefficients.
  *
  * @param out_scale receives the common factor; only written on success
  * @return true if a common factor was found, false otherwise
  */
 static bool extract_scalar(const SwsLinearOp *c, SwsComps prev, SwsComps next,
                            SwsConst *out_scale)
 {
     SwsConst factor = {0};

     /* Anything outside the main diagonal rules out a plain scale */
     if (c->mask & ~SWS_MASK_DIAG4)
         return false;

     for (int i = 0; i < 4; i++) {
         const bool ignored = (prev.flags[i] & SWS_COMP_ZERO) || next.unused[i];
         if (ignored)
             continue;

         const AVRational diag = c->m[i][i];
         if (factor.q.den && av_cmp_q(diag, factor.q))
             return false; /* differs from the factor seen so far */
         factor.q = diag;
     }

     /* A zero denominator means no relevant component was found */
     if (!factor.q.den)
         return false;

     *out_scale = factor;
     return true;
 }

 /* Pulls rows with a constant, integer result out of a linear operation.
  *
  * A row qualifies when its offset has denominator 1 and each coefficient is
  * either zero or applies to an input flagged as zero in `prev`. Qualifying
  * rows that are still set in the mask are reset to identity in `c`, and
  * their offset is stored in the matching slot of `out_clear`.
  *
  * Returns true if at least one row was extracted; `out_clear` is only
  * written in that case. */
 static bool extract_constant_rows(SwsLinearOp *c, SwsComps prev,
                                   SwsConst *out_clear)
 {
     SwsConst values = {0};
     bool found = false;

     for (int i = 0; i < 4; i++) {
         const uint32_t row = SWS_MASK_ROW(i);
         bool constant = true;

         if (c->m[i][4].den != 1) /* offset is not an integer */
             continue;
         if (!(c->mask & row)) /* row is not active in the mask */
             continue;

         for (int j = 0; j < 4; j++) {
             const bool zero_coeff = c->m[i][j].num == 0;
             const bool zero_input = prev.flags[j] & SWS_COMP_ZERO;
             if (!zero_coeff && !zero_input) {
                 constant = false;
                 break;
             }
         }
         if (!constant)
             continue;

         values.q4[i] = c->m[i][4];
         for (int j = 0; j < 5; j++)
             c->m[i][j] = Q(i == j);
         c->mask &= ~row;
         found = true;
     }

     if (found)
         *out_clear = values;
     return found;
 }

 /* Splits a fixed component shuffle off the front of a linear operation.
  *
  * Each row may depend on at most one input that is not flagged as zero in
  * `prev`. When that input is not the row's own component, the coefficient
  * is moved onto the diagonal and the source index is recorded in the
  * swizzle pattern.
  *
  * Returns false, leaving `op` and `out_swiz` untouched, if some row has
  * several inputs or if the resulting pattern is the identity. */
 static bool extract_swizzle(SwsLinearOp *op, SwsComps prev, SwsSwizzleOp *out_swiz)
 {
     const SwsSwizzleOp identity = SWS_SWIZZLE(0, 1, 2, 3);
     SwsSwizzleOp pattern = identity;
     SwsLinearOp lin = *op;

     for (int row = 0; row < 4; row++) {
         int src = -1;

         for (int col = 0; col < 4; col++) {
             const bool contributes = lin.m[row][col].num &&
                                      !(prev.flags[col] & SWS_COMP_ZERO);
             if (!contributes)
                 continue;
             if (src >= 0)
                 return false; /* multiple inputs */
             src = col;
         }

         /* Nothing to do for empty rows or rows already on the diagonal */
         if (src < 0 || src == row)
             continue;

         lin.m[row][row] = lin.m[row][src];
         lin.m[row][src] = Q(0);
         pattern.in[row] = src;
     }

     if (pattern.mask == identity.mask)
         return false; /* no swizzle was identified */

     lin.mask = ff_sws_linear_mask(lin);
     *out_swiz = pattern;
     *op = lin;
     return true;
 }

 /**
  * Simplify an operation list in place by applying local rewrite rules.
  *
  * The list is scanned from the front. Each operation is examined together
  * with its immediate predecessor and successor; a zeroed dummy op stands in
  * for a missing neighbour at either end. As soon as any rule changes the
  * list, the scan restarts from the beginning (`goto retry`), which also
  * re-runs ff_sws_op_list_update_comps() so that the rules always see fresh
  * component metadata. The scan ends when a full pass makes no change.
  *
  * @param ops operation list to optimize; modified in place
  * @return 0 on success, or the negative value returned by
  *         ff_sws_op_list_insert_at() if inserting a new operation fails
  */
 int ff_sws_op_list_optimize(SwsOpList *ops)
 {
     int ret;

 retry:
     ff_sws_op_list_update_comps(ops);

     for (int n = 0; n < ops->num_ops;) {
         SwsOp dummy = {0};
         SwsOp *op = &ops->ops[n];
         SwsOp *prev = n ? &ops->ops[n - 1] : &dummy;
         SwsOp *next = n + 1 < ops->num_ops ? &ops->ops[n + 1] : &dummy;

         /* common helper variable */
         bool noop = true;

         switch (op->op) {
         case SWS_OP_READ:
             /* Optimized further into refcopy / memcpy */
             if (next->op == SWS_OP_WRITE &&
                 next->rw.elems == op->rw.elems &&
                 next->rw.packed == op->rw.packed &&
                 next->rw.frac == op->rw.frac)
             {
                 ff_sws_op_list_remove_at(ops, n, 2);
                 av_assert1(ops->num_ops == 0);
                 return 0;
             }

             /* Skip reading extra unneeded components */
             if (!op->rw.packed) {
                 int needed = op->rw.elems;
                 while (needed > 0 && next->comps.unused[needed - 1])
                     needed--;
                 if (op->rw.elems != needed) {
                     op->rw.elems = needed;
                     goto retry;
                 }
             }
             break;

         case SWS_OP_SWAP_BYTES:
             /* Redundant (double) swap */
             if (next->op == SWS_OP_SWAP_BYTES) {
                 ff_sws_op_list_remove_at(ops, n, 2);
                 goto retry;
             }
             break;

         case SWS_OP_UNPACK:
             /* Redundant unpack+pack */
             if (next->op == SWS_OP_PACK && next->type == op->type &&
                 next->pack.pattern[0] == op->pack.pattern[0] &&
                 next->pack.pattern[1] == op->pack.pattern[1] &&
                 next->pack.pattern[2] == op->pack.pattern[2] &&
                 next->pack.pattern[3] == op->pack.pattern[3])
             {
                 ff_sws_op_list_remove_at(ops, n, 2);
                 goto retry;
             }
             break;

         case SWS_OP_LSHIFT:
         case SWS_OP_RSHIFT:
             /* Two shifts in the same direction */
             if (next->op == op->op) {
                 op->c.u += next->c.u;
                 ff_sws_op_list_remove_at(ops, n + 1, 1);
                 goto retry;
             }

             /* No-op shift */
             if (!op->c.u) {
                 ff_sws_op_list_remove_at(ops, n, 1);
                 goto retry;
             }
             break;

         case SWS_OP_CLEAR:
             /* Drop individual clears that have no effect; `noop` stays true
              * only if no component is left to clear afterwards */
             for (int i = 0; i < 4; i++) {
                 if (!op->c.q4[i].den)
                     continue;

                 if ((prev->comps.flags[i] & SWS_COMP_ZERO) &&
                     !(prev->comps.flags[i] & SWS_COMP_GARBAGE) &&
                     op->c.q4[i].num == 0)
                 {
                     /* Redundant clear-to-zero of zero component */
                     op->c.q4[i].den = 0;
                 } else if (next->comps.unused[i]) {
                     /* Unnecessary clear of unused component */
                     op->c.q4[i] = (AVRational) {0, 0};
                 } else if (op->c.q4[i].den) {
                     noop = false;
                 }
             }

             if (noop) {
                 ff_sws_op_list_remove_at(ops, n, 1);
                 goto retry;
             }

             /* Transitive clear */
             if (next->op == SWS_OP_CLEAR) {
                 for (int i = 0; i < 4; i++) {
                     if (next->c.q4[i].den)
                         op->c.q4[i] = next->c.q4[i];
                 }
                 ff_sws_op_list_remove_at(ops, n + 1, 1);
                 goto retry;
             }

             /* Prefer to clear as late as possible, to avoid doing
              * redundant work */
             if ((op_type_is_independent(next->op) && next->op != SWS_OP_SWAP_BYTES) ||
                 next->op == SWS_OP_SWIZZLE)
             {
                 if (next->op == SWS_OP_CONVERT)
                     op->type = next->convert.to;
                 /* Run the clear values through the next op before swapping,
                  * so the result is the same with the clear placed after it */
                 ff_sws_apply_op_q(next, op->c.q4);
                 FFSWAP(SwsOp, *op, *next);
                 goto retry;
             }
             break;

         case SWS_OP_SWIZZLE: {
             /* Only outputs that are actually used count towards the
              * identity and duplicate checks */
             bool seen[4] = {0};
             bool has_duplicates = false;
             for (int i = 0; i < 4; i++) {
                 if (next->comps.unused[i])
                     continue;
                 if (op->swizzle.in[i] != i)
                     noop = false;
                 has_duplicates |= seen[op->swizzle.in[i]];
                 seen[op->swizzle.in[i]] = true;
             }

             /* Identity swizzle */
             if (noop) {
                 ff_sws_op_list_remove_at(ops, n, 1);
                 goto retry;
             }

             /* Transitive swizzle */
             if (next->op == SWS_OP_SWIZZLE) {
                 const SwsSwizzleOp orig = op->swizzle;
                 for (int i = 0; i < 4; i++)
                     op->swizzle.in[i] = orig.in[next->swizzle.in[i]];
                 ff_sws_op_list_remove_at(ops, n + 1, 1);
                 goto retry;
             }

             /* Try to push swizzles with duplicates towards the output */
             if (has_duplicates && op_type_is_independent(next->op)) {
                 if (next->op == SWS_OP_CONVERT)
                     op->type = next->convert.to;
                 if (next->op == SWS_OP_MIN || next->op == SWS_OP_MAX) {
                     /* Un-swizzle the next operation */
                     const SwsConst c = next->c;
                     for (int i = 0; i < 4; i++) {
                         if (!next->comps.unused[i])
                             next->c.q4[op->swizzle.in[i]] = c.q4[i];
                     }
                 }
                 FFSWAP(SwsOp, *op, *next);
                 goto retry;
             }

             /* Move swizzle out of the way between two converts so that
              * they may be merged */
             if (prev->op == SWS_OP_CONVERT && next->op == SWS_OP_CONVERT) {
                 op->type = next->convert.to;
                 FFSWAP(SwsOp, *op, *next);
                 goto retry;
             }
             break;
         }

         case SWS_OP_CONVERT:
             /* No-op conversion */
             if (op->type == op->convert.to) {
                 ff_sws_op_list_remove_at(ops, n, 1);
                 goto retry;
             }

             /* Transitive conversion */
             if (next->op == SWS_OP_CONVERT &&
                 op->convert.expand == next->convert.expand)
             {
                 av_assert1(op->convert.to == next->type);
                 op->convert.to = next->convert.to;
                 ff_sws_op_list_remove_at(ops, n + 1, 1);
                 goto retry;
             }

             /* Conversion followed by integer expansion */
             if (next->op == SWS_OP_SCALE && !op->convert.expand &&
                 !av_cmp_q(next->c.q, ff_sws_pixel_expand(op->type, op->convert.to)))
             {
                 op->convert.expand = true;
                 ff_sws_op_list_remove_at(ops, n + 1, 1);
                 goto retry;
             }
             break;

         case SWS_OP_MIN:
             /* The clamp is a no-op if no used component has a limit below
              * the largest value the input can take */
             for (int i = 0; i < 4; i++) {
                 if (next->comps.unused[i] || !op->c.q4[i].den)
                     continue;
                 if (av_cmp_q(op->c.q4[i], prev->comps.max[i]) < 0)
                     noop = false;
             }

             if (noop) {
                 ff_sws_op_list_remove_at(ops, n, 1);
                 goto retry;
             }
             break;

         case SWS_OP_MAX:
             /* The clamp is a no-op if no used component has a limit above
              * the smallest value the input can take */
             for (int i = 0; i < 4; i++) {
                 if (next->comps.unused[i] || !op->c.q4[i].den)
                     continue;
                 if (av_cmp_q(prev->comps.min[i], op->c.q4[i]) < 0)
                     noop = false;
             }

             if (noop) {
                 ff_sws_op_list_remove_at(ops, n, 1);
                 goto retry;
             }
             break;

         case SWS_OP_DITHER:
             /* Dithering is dropped when every component is either flagged
              * exact on input or unused afterwards */
             for (int i = 0; i < 4; i++) {
                 noop &= (prev->comps.flags[i] & SWS_COMP_EXACT) ||
                         next->comps.unused[i];
             }

             if (noop) {
                 ff_sws_op_list_remove_at(ops, n, 1);
                 goto retry;
             }
             break;

         case SWS_OP_LINEAR: {
             SwsSwizzleOp swizzle;
             SwsConst c;

             /* No-op (identity) linear operation */
             if (!op->lin.mask) {
                 ff_sws_op_list_remove_at(ops, n, 1);
                 goto retry;
             }

             /* Two consecutive linear ops are merged into their product */
             if (next->op == SWS_OP_LINEAR) {
                 /* 5x5 matrix multiplication after appending [ 0 0 0 0 1 ] */
                 const SwsLinearOp m1 = op->lin;
                 const SwsLinearOp m2 = next->lin;
                 for (int i = 0; i < 4; i++) {
                     for (int j = 0; j < 5; j++) {
                         AVRational sum = Q(0);
                         for (int k = 0; k < 4; k++)
                             sum = av_add_q(sum, av_mul_q(m2.m[i][k], m1.m[k][j]));
                         if (j == 4) /* m1.m[4][j] == 1 */
                             sum = av_add_q(sum, m2.m[i][4]);
                         op->lin.m[i][j] = sum;
                     }
                 }
                 op->lin.mask = ff_sws_linear_mask(op->lin);
                 ff_sws_op_list_remove_at(ops, n + 1, 1);
                 goto retry;
             }

             /* Optimize away zero columns */
             for (int j = 0; j < 4; j++) {
                 const uint32_t col = SWS_MASK_COL(j);
                 if (!(prev->comps.flags[j] & SWS_COMP_ZERO) || !(op->lin.mask & col))
                     continue;
                 for (int i = 0; i < 4; i++)
                     op->lin.m[i][j] = Q(i == j);
                 op->lin.mask &= ~col;
                 goto retry;
             }

             /* Optimize away unused rows */
             for (int i = 0; i < 4; i++) {
                 const uint32_t row = SWS_MASK_ROW(i);
                 if (!next->comps.unused[i] || !(op->lin.mask & row))
                     continue;
                 for (int j = 0; j < 5; j++)
                     op->lin.m[i][j] = Q(i == j);
                 op->lin.mask &= ~row;
                 goto retry;
             }

             /* Convert constant rows to explicit clear instruction */
             if (extract_constant_rows(&op->lin, prev->comps, &c)) {
                 RET(ff_sws_op_list_insert_at(ops, n + 1, &(SwsOp) {
                     .op    = SWS_OP_CLEAR,
                     .type  = op->type,
                     .comps = op->comps,
                     .c     = c,
                 }));
                 goto retry;
             }

             /* Multiplication by scalar constant */
             if (extract_scalar(&op->lin, prev->comps, next->comps, &c)) {
                 op->op = SWS_OP_SCALE;
                 op->c  = c;
                 goto retry;
             }

             /* Swizzle by fixed pattern */
             if (extract_swizzle(&op->lin, prev->comps, &swizzle)) {
                 /* The swizzle is inserted in front of the linear op */
                 RET(ff_sws_op_list_insert_at(ops, n, &(SwsOp) {
                     .op      = SWS_OP_SWIZZLE,
                     .type    = op->type,
                     .swizzle = swizzle,
                 }));
                 goto retry;
             }
             break;
         }

         case SWS_OP_SCALE: {
             /* Nonzero only when the factor is 2^k or 2^-k with k != 0 */
             const int factor2 = exact_log2_q(op->c.q);

             /* No-op scaling */
             if (op->c.q.num == 1 && op->c.q.den == 1) {
                 ff_sws_op_list_remove_at(ops, n, 1);
                 goto retry;
             }

             /* Scaling by integer before conversion to int */
             if (op->c.q.den == 1 &&
                 next->op == SWS_OP_CONVERT &&
                 ff_sws_pixel_type_is_int(next->convert.to))
             {
                 op->type = next->convert.to;
                 FFSWAP(SwsOp, *op, *next);
                 goto retry;
             }

             /* Scaling by exact power of two */
             if (factor2 && ff_sws_pixel_type_is_int(op->type)) {
                 op->op = factor2 > 0 ? SWS_OP_LSHIFT : SWS_OP_RSHIFT;
                 op->c.u = FFABS(factor2);
                 goto retry;
             }
             break;
         }
         }

         /* No optimization triggered, move on to next operation */
         n++;
     }

     return 0;
 }

 /**
  * Try to express an operation list as a single byte shuffle.
  *
  * The list must start with a read and is followed up to the first write.
  * Along the way, up to four source byte indices are tracked per component
  * (one per byte of a 32-bit mask), with `clear_val` used as the marker for
  * a byte that is cleared rather than copied.
  *
  * @param ops         operation list to analyze
  * @param shuffle     output table of `size` bytes; it is first filled with
  *                    `clear_val`, then every copied byte gets its source
  *                    byte index
  * @param size        size of `shuffle` in bytes
  * @param clear_val   marker value for cleared bytes; clears are rejected
  *                    when this is zero
  * @param read_bytes  set to the number of input bytes covered by the table
  * @param write_bytes set to the number of output bytes covered by the table
  * @return the number of pixel groups handled per shuffle on success,
  *         AVERROR(EINVAL) if the list is empty, does not start with a read,
  *         contains no write, or moves no bytes at all, and AVERROR(ENOTSUP)
  *         if it contains something that cannot be expressed as a shuffle
  */
 int ff_sws_solve_shuffle(const SwsOpList *const ops, uint8_t shuffle[],
                          int size, uint8_t clear_val,
                          int *read_bytes, int *write_bytes)
 {
     uint32_t mask[4] = {0};

     /* Validate the list before looking at its first element; an empty list
      * has no ops[0] to read */
     if (!ops->num_ops || ops->ops[0].op != SWS_OP_READ)
         return AVERROR(EINVAL);

     const SwsOp read = ops->ops[0];
     const int read_size = ff_sws_pixel_type_size(read.type);

     if (read.rw.frac || (!read.rw.packed && read.rw.elems > 1))
         return AVERROR(ENOTSUP);

     /* Byte b of mask[i] starts out as the input offset of byte b of
      * component i */
     for (int i = 0; i < read.rw.elems; i++)
         mask[i] = 0x01010101 * i * read_size + 0x03020100;

     for (int opidx = 1; opidx < ops->num_ops; opidx++) {
         const SwsOp *op = &ops->ops[opidx];
         switch (op->op) {
         case SWS_OP_SWIZZLE: {
             uint32_t orig[4] = { mask[0], mask[1], mask[2], mask[3] };
             for (int i = 0; i < 4; i++)
                 mask[i] = orig[op->swizzle.in[i]];
             break;
         }

         case SWS_OP_SWAP_BYTES:
             for (int i = 0; i < 4; i++) {
                 switch (ff_sws_pixel_type_size(op->type)) {
                 case 2: mask[i] = av_bswap16(mask[i]); break;
                 case 4: mask[i] = av_bswap32(mask[i]); break;
                 }
             }
             break;

         case SWS_OP_CLEAR:
             for (int i = 0; i < 4; i++) {
                 if (!op->c.q4[i].den)
                     continue;
                 /* Only clears to zero are accepted, and only if a marker
                  * value is available */
                 if (op->c.q4[i].num != 0 || !clear_val)
                     return AVERROR(ENOTSUP);
                 mask[i] = 0x1010101ul * clear_val;
             }
             break;

         case SWS_OP_CONVERT: {
             if (!op->convert.expand)
                 return AVERROR(ENOTSUP);
             /* Expansion replicates the source bytes across the mask */
             for (int i = 0; i < 4; i++) {
                 switch (ff_sws_pixel_type_size(op->type)) {
                 case 1: mask[i] = 0x01010101 * (mask[i] & 0xFF);   break;
                 case 2: mask[i] = 0x00010001 * (mask[i] & 0xFFFF); break;
                 }
             }
             break;
         }

         case SWS_OP_WRITE: {
             if (op->rw.frac || (!op->rw.packed && op->rw.elems > 1))
                 return AVERROR(ENOTSUP);

             const int write_size  = ff_sws_pixel_type_size(op->type);
             const int read_chunk  = read.rw.elems * read_size;
             const int write_chunk = op->rw.elems * write_size;
             const int max_chunk   = FFMAX(read_chunk, write_chunk);

             /* Nothing is read or written, so there is no group size to
              * divide by */
             if (max_chunk <= 0)
                 return AVERROR(EINVAL);

             /* Initialize to no-op */
             memset(shuffle, clear_val, size);

             const int num_groups = size / max_chunk;
             for (int n = 0; n < num_groups; n++) {
                 const int base_in  = n * read_chunk;
                 const int base_out = n * write_chunk;
                 for (int i = 0; i < op->rw.elems; i++) {
                     const int offset = base_out + i * write_size;
                     for (int b = 0; b < write_size; b++) {
                         const uint8_t idx = mask[i] >> (b * 8);
                         if (idx != clear_val)
                             shuffle[offset + b] = base_in + idx;
                     }
                 }
             }

             *read_bytes  = num_groups * read_chunk;
             *write_bytes = num_groups * write_chunk;
             return num_groups;
         }

         default:
             return AVERROR(ENOTSUP);
         }
     }

     /* The list ended without a write */
     return AVERROR(EINVAL);
 }
	/**
	* Copyright (C) 2025 Niklas Haas
	*
	* This file is part of FFmpeg.
	*
	* FFmpeg is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* FFmpeg is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with FFmpeg; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	*/

	#include "libavutil/avassert.h"
	#include "libavutil/bswap.h"
	#include "libavutil/rational.h"

	#include "ops.h"
	#include "ops_internal.h"

	/* Evaluate x and store the result in a variable named `ret`, which must
	 * already exist in the calling scope. If the result is negative (presumably
	 * an error code), return it from the enclosing function right away. */
	#define RET(x) \
	do { \
	if ((ret = (x)) < 0) \
	return ret; \
	} while (0)

	/* Tells whether an operation type is independent per channel. Operations of
	 * this kind can usually be commuted freely with other such operations. */
	static bool op_type_is_independent(SwsOpType op)
	{
	    switch (op) {
	    /* Not independent per channel (or not a real operation at all) */
	    case SWS_OP_INVALID:
	    case SWS_OP_READ:
	    case SWS_OP_WRITE:
	    case SWS_OP_SWIZZLE:
	    case SWS_OP_CLEAR:
	    case SWS_OP_LINEAR:
	    case SWS_OP_PACK:
	    case SWS_OP_UNPACK:
	        return false;

	    /* Independent per channel */
	    case SWS_OP_SWAP_BYTES:
	    case SWS_OP_LSHIFT:
	    case SWS_OP_RSHIFT:
	    case SWS_OP_CONVERT:
	    case SWS_OP_DITHER:
	    case SWS_OP_MIN:
	    case SWS_OP_MAX:
	    case SWS_OP_SCALE:
	        return true;

	    /* Enum terminator, never a valid operation */
	    case SWS_OP_TYPE_NB:
	        break;
	    }

	    av_unreachable("Invalid operation type!");
	    return false;
	}

	/* merge_comp_flags() forms a monoid with flags_identity as the identity
	 * (neutral) element: SWS_COMP_ZERO and SWS_COMP_EXACT survive a merge only
	 * if both inputs carry them, while SWS_COMP_GARBAGE survives if either
	 * input carries it. Only these three flags are propagated. */
	static const unsigned flags_identity = SWS_COMP_ZERO | SWS_COMP_EXACT;
	static unsigned merge_comp_flags(unsigned a, unsigned b)
	{
	    const unsigned flags_or  = SWS_COMP_GARBAGE;
	    const unsigned flags_and = SWS_COMP_ZERO | SWS_COMP_EXACT;
	    return ((a & b) & flags_and) | ((a | b) & flags_or);
	}

	/**
	 * Infer + propagate known information about components.
	 *
	 * Two passes are made over the operation list:
	 *  - a forwards pass that fills in comps.flags / comps.min / comps.max of
	 *    every operation from the state left by the preceding operation;
	 *  - a backwards pass that fills in comps.unused of every operation from
	 *    the state required by the following operation.
	 *
	 * @param ops operation list whose per-op `comps` metadata is rewritten
	 */
	void ff_sws_op_list_update_comps(SwsOpList *ops)
	{
	    SwsComps next = { .unused = {true, true, true, true} };
	    SwsComps prev = { .flags = {
	        SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, SWS_COMP_GARBAGE,
	    }};

	    /* Forwards pass, propagates knowledge about the incoming pixel values */
	    for (int n = 0; n < ops->num_ops; n++) {
	        SwsOp *op = &ops->ops[n];

	        /* Prefill min/max values automatically; may have to be fixed in
	         * special cases */
	        memcpy(op->comps.min, prev.min, sizeof(prev.min));
	        memcpy(op->comps.max, prev.max, sizeof(prev.max));

	        if (op->op != SWS_OP_SWAP_BYTES) {
	            ff_sws_apply_op_q(op, op->comps.min);
	            ff_sws_apply_op_q(op, op->comps.max);
	        }

	        switch (op->op) {
	        case SWS_OP_READ:
	            for (int i = 0; i < op->rw.elems; i++) {
	                if (ff_sws_pixel_type_is_int(op->type)) {
	                    int bits = 8 * ff_sws_pixel_type_size(op->type);
	                    if (!op->rw.packed && ops->src.desc) {
	                        /* Use legal value range from pixdesc if available;
	                         * we don't need to do this for packed formats because
	                         * non-byte-aligned packed formats will necessarily go
	                         * through SWS_OP_UNPACK anyway */
	                        for (int c = 0; c < 4; c++) {
	                            if (ops->src.desc->comp[c].plane == i) {
	                                bits = ops->src.desc->comp[c].depth;
	                                break;
	                            }
	                        }
	                    }

	                    op->comps.flags[i] = SWS_COMP_EXACT;
	                    op->comps.min[i] = Q(0);
	                    op->comps.max[i] = Q((1ULL << bits) - 1);
	                }
	            }
	            /* Components that are not read keep their previous state */
	            for (int i = op->rw.elems; i < 4; i++)
	                op->comps.flags[i] = prev.flags[i];
	            break;
	        case SWS_OP_WRITE:
	            for (int i = 0; i < op->rw.elems; i++)
	                av_assert1(!(prev.flags[i] & SWS_COMP_GARBAGE));
	            /* fall through */
	        case SWS_OP_SWAP_BYTES:
	        case SWS_OP_LSHIFT:
	        case SWS_OP_RSHIFT:
	        case SWS_OP_MIN:
	        case SWS_OP_MAX:
	            /* Linearly propagate flags per component */
	            for (int i = 0; i < 4; i++)
	                op->comps.flags[i] = prev.flags[i];
	            break;
	        case SWS_OP_DITHER:
	            /* Strip zero flag because of the nonzero dithering offset */
	            for (int i = 0; i < 4; i++)
	                op->comps.flags[i] = prev.flags[i] & ~SWS_COMP_ZERO;
	            break;
	        case SWS_OP_UNPACK:
	            /* Every unpacked component inherits from the packed input */
	            for (int i = 0; i < 4; i++) {
	                if (op->pack.pattern[i])
	                    op->comps.flags[i] = prev.flags[0];
	                else
	                    op->comps.flags[i] = SWS_COMP_GARBAGE;
	            }
	            break;
	        case SWS_OP_PACK: {
	            unsigned flags = flags_identity;
	            for (int i = 0; i < 4; i++) {
	                if (op->pack.pattern[i])
	                    flags = merge_comp_flags(flags, prev.flags[i]);
	                if (i > 0) /* clear remaining comps for sanity */
	                    op->comps.flags[i] = SWS_COMP_GARBAGE;
	            }
	            op->comps.flags[0] = flags;
	            break;
	        }
	        case SWS_OP_CLEAR:
	            for (int i = 0; i < 4; i++) {
	                if (op->c.q4[i].den) {
	                    if (op->c.q4[i].num == 0) {
	                        op->comps.flags[i] = SWS_COMP_ZERO | SWS_COMP_EXACT;
	                    } else if (op->c.q4[i].den == 1) {
	                        op->comps.flags[i] = SWS_COMP_EXACT;
	                    } else {
	                        /* Nonzero fractional constant: neither zero nor
	                         * exact. Reset explicitly so that flags left over
	                         * from an earlier pass cannot leak through. */
	                        op->comps.flags[i] = 0;
	                    }
	                } else {
	                    /* Component is not cleared */
	                    op->comps.flags[i] = prev.flags[i];
	                }
	            }
	            break;
	        case SWS_OP_SWIZZLE:
	            for (int i = 0; i < 4; i++)
	                op->comps.flags[i] = prev.flags[op->swizzle.in[i]];
	            break;
	        case SWS_OP_CONVERT:
	            for (int i = 0; i < 4; i++) {
	                op->comps.flags[i] = prev.flags[i];
	                if (ff_sws_pixel_type_is_int(op->convert.to))
	                    op->comps.flags[i] |= SWS_COMP_EXACT;
	            }
	            break;
	        case SWS_OP_LINEAR:
	            for (int i = 0; i < 4; i++) {
	                unsigned flags = flags_identity;
	                AVRational min = Q(0), max = Q(0);
	                for (int j = 0; j < 4; j++) {
	                    const AVRational k = op->lin.m[i][j];
	                    AVRational mink = av_mul_q(prev.min[j], k);
	                    AVRational maxk = av_mul_q(prev.max[j], k);
	                    if (k.num) {
	                        flags = merge_comp_flags(flags, prev.flags[j]);
	                        if (k.den != 1) /* fractional coefficient */
	                            flags &= ~SWS_COMP_EXACT;
	                        /* A negative coefficient flips the bounds */
	                        if (k.num < 0)
	                            FFSWAP(AVRational, mink, maxk);
	                        min = av_add_q(min, mink);
	                        max = av_add_q(max, maxk);
	                    }
	                }
	                if (op->lin.m[i][4].num) { /* nonzero offset */
	                    flags &= ~SWS_COMP_ZERO;
	                    if (op->lin.m[i][4].den != 1) /* fractional offset */
	                        flags &= ~SWS_COMP_EXACT;
	                    min = av_add_q(min, op->lin.m[i][4]);
	                    max = av_add_q(max, op->lin.m[i][4]);
	                }
	                op->comps.flags[i] = flags;
	                op->comps.min[i] = min;
	                op->comps.max[i] = max;
	            }
	            break;
	        case SWS_OP_SCALE:
	            for (int i = 0; i < 4; i++) {
	                op->comps.flags[i] = prev.flags[i];
	                if (op->c.q.den != 1) /* fractional scale */
	                    op->comps.flags[i] &= ~SWS_COMP_EXACT;
	                /* A negative scale flips the bounds */
	                if (op->c.q.num < 0)
	                    FFSWAP(AVRational, op->comps.min[i], op->comps.max[i]);
	            }
	            break;

	        case SWS_OP_INVALID:
	        case SWS_OP_TYPE_NB:
	            av_unreachable("Invalid operation type!");
	        }

	        prev = op->comps;
	    }

	    /* Backwards pass, solves for component dependencies */
	    for (int n = ops->num_ops - 1; n >= 0; n--) {
	        SwsOp *op = &ops->ops[n];

	        switch (op->op) {
	        case SWS_OP_READ:
	        case SWS_OP_WRITE:
	            /* A read overwrites its components, so their incoming values
	             * are unused; a write consumes them, so they are used */
	            for (int i = 0; i < op->rw.elems; i++)
	                op->comps.unused[i] = op->op == SWS_OP_READ;
	            for (int i = op->rw.elems; i < 4; i++)
	                op->comps.unused[i] = next.unused[i];
	            break;
	        case SWS_OP_SWAP_BYTES:
	        case SWS_OP_LSHIFT:
	        case SWS_OP_RSHIFT:
	        case SWS_OP_CONVERT:
	        case SWS_OP_DITHER:
	        case SWS_OP_MIN:
	        case SWS_OP_MAX:
	        case SWS_OP_SCALE:
	            for (int i = 0; i < 4; i++)
	                op->comps.unused[i] = next.unused[i];
	            break;
	        case SWS_OP_UNPACK: {
	            /* The packed input (component 0) is unused only if every
	             * unpacked component is unused */
	            bool unused = true;
	            for (int i = 0; i < 4; i++) {
	                if (op->pack.pattern[i])
	                    unused &= next.unused[i];
	                op->comps.unused[i] = i > 0;
	            }
	            op->comps.unused[0] = unused;
	            break;
	        }
	        case SWS_OP_PACK:
	            for (int i = 0; i < 4; i++) {
	                if (op->pack.pattern[i])
	                    op->comps.unused[i] = next.unused[0];
	                else
	                    op->comps.unused[i] = true;
	            }
	            break;
	        case SWS_OP_CLEAR:
	            /* A cleared component does not depend on its input */
	            for (int i = 0; i < 4; i++) {
	                if (op->c.q4[i].den)
	                    op->comps.unused[i] = true;
	                else
	                    op->comps.unused[i] = next.unused[i];
	            }
	            break;
	        case SWS_OP_SWIZZLE: {
	            /* An input is unused only if every output sourced from it is */
	            bool unused[4] = { true, true, true, true };
	            for (int i = 0; i < 4; i++)
	                unused[op->swizzle.in[i]] &= next.unused[i];
	            for (int i = 0; i < 4; i++)
	                op->comps.unused[i] = unused[i];
	            break;
	        }
	        case SWS_OP_LINEAR:
	            /* Input j is unused if no used output row has a nonzero
	             * coefficient in column j */
	            for (int j = 0; j < 4; j++) {
	                bool unused = true;
	                for (int i = 0; i < 4; i++) {
	                    if (op->lin.m[i][j].num)
	                        unused &= next.unused[i];
	                }
	                op->comps.unused[j] = unused;
	            }
	            break;
	        }

	        next = op->comps;
	    }
	}

	/* Yields p when x is exactly 2^p, and 0 for every other x (including all
	 * non-positive values) */
	static int exact_log2(const int x)
	{
	    /* A positive power of two has exactly one bit set */
	    const bool is_pow2 = x > 0 && !(x & (x - 1));
	    return is_pow2 ? av_log2(x) : 0;
	}

	/* Rational counterpart of exact_log2(): the result is exact_log2(num) when
	 * the denominator is 1, -exact_log2(den) when the numerator is 1, and 0
	 * for any other rational */
	static int exact_log2_q(const AVRational x)
	{
	    if (x.den == 1)
	        return exact_log2(x.num);

	    if (x.num == 1)
	        return -exact_log2(x.den);

	    return 0;
	}

	/**
	 * Checks whether a linear operation can be reduced to a multiplication by
	 * a single scalar.
	 *
	 * Rows whose input is known to be zero (per prev) or whose output is
	 * unused (per next) are ignored; all remaining diagonal entries must be
	 * equal.
	 *
	 * @param c         linear operation to inspect
	 * @param prev      component info of the preceding operation
	 * @param next      component info of the following operation
	 * @param out_scale receives the scaling factor on success; left untouched
	 *                  otherwise
	 * @return true if a common scaling factor was found, false otherwise
	 */
	static bool extract_scalar(const SwsLinearOp *c, SwsComps prev, SwsComps next,
	                           SwsConst *out_scale)
	{
	    SwsConst scale = {0};

	    /* There are components not on the main diagonal */
	    if (c->mask & ~SWS_MASK_DIAG4)
	        return false;

	    for (int i = 0; i < 4; i++) {
	        const AVRational s = c->m[i][i];
	        if ((prev.flags[i] & SWS_COMP_ZERO) || next.unused[i])
	            continue;
	        /* A nonzero denominator marks that a factor was already recorded */
	        if (scale.q.den && av_cmp_q(s, scale.q))
	            return false;
	        scale.q = s;
	    }

	    if (scale.q.den)
	        *out_scale = scale;
	    return scale.q.den != 0;
	}

	/**
	 * Extracts an integer clear operation (subset) from the given linear op.
	 *
	 * A row is constant when its offset is an integer and every coefficient
	 * is either zero or multiplies an input known to be zero. Each such row
	 * that is still set in the mask is replaced by the identity row, removed
	 * from the mask, and its offset is recorded as the clear value.
	 *
	 * @param c         linear operation, modified in place
	 * @param prev      component info of the preceding operation
	 * @param out_clear receives the clear values on success; left untouched
	 *                  otherwise
	 * @return true if at least one row was extracted
	 */
	static bool extract_constant_rows(SwsLinearOp *c, SwsComps prev,
	                                  SwsConst *out_clear)
	{
	    SwsConst clear = {0};
	    bool ret = false;

	    for (int i = 0; i < 4; i++) {
	        bool const_row = c->m[i][4].den == 1; /* offset is integer */
	        for (int j = 0; j < 4; j++) {
	            const_row &= c->m[i][j].num == 0 || /* scalar is zero */
	                         (prev.flags[j] & SWS_COMP_ZERO); /* input is zero */
	        }
	        if (const_row && (c->mask & SWS_MASK_ROW(i))) {
	            clear.q4[i] = c->m[i][4];
	            /* Reset the row to identity (1 on the diagonal, 0 elsewhere) */
	            for (int j = 0; j < 5; j++)
	                c->m[i][j] = Q(i == j);
	            c->mask &= ~SWS_MASK_ROW(i);
	            ret = true;
	        }
	    }

	    if (ret)
	        *out_clear = clear;
	    return ret;
	}

	/**
	 * Unswizzle a linear operation by aligning single-input rows with
	 * their corresponding diagonal.
	 *
	 * Fails if any row depends on more than one input that is not known to
	 * be zero, or if no row actually needs to be moved.
	 *
	 * @param op       linear operation; rewritten in place on success only
	 * @param prev     component info of the preceding operation
	 * @param out_swiz receives the extracted swizzle on success; left
	 *                 untouched otherwise
	 * @return true if a non-identity swizzle was extracted
	 */
	static bool extract_swizzle(SwsLinearOp *op, SwsComps prev, SwsSwizzleOp *out_swiz)
	{
	    SwsSwizzleOp swiz = SWS_SWIZZLE(0, 1, 2, 3);
	    /* Work on a copy so that *op stays intact on failure */
	    SwsLinearOp c = *op;

	    for (int i = 0; i < 4; i++) {
	        int idx = -1;
	        for (int j = 0; j < 4; j++) {
	            if (!c.m[i][j].num || (prev.flags[j] & SWS_COMP_ZERO))
	                continue;
	            if (idx >= 0)
	                return false; /* multiple inputs */
	            idx = j;
	        }

	        if (idx >= 0 && idx != i) {
	            /* Move coefficient to the diagonal */
	            c.m[i][i] = c.m[i][idx];
	            c.m[i][idx] = Q(0);
	            swiz.in[i] = idx;
	        }
	    }

	    if (swiz.mask == SWS_SWIZZLE(0, 1, 2, 3).mask)
	        return false; /* no swizzle was identified */

	    c.mask = ff_sws_linear_mask(c);
	    *out_swiz = swiz;
	    *op = c;
	    return true;
	}

	/**
	 * Optimizes an operation list in place by repeatedly applying local
	 * rewrite rules (removing no-ops, merging adjacent operations, reordering
	 * operations, and splitting linear operations into simpler ones).
	 *
	 * Whenever a rule fires, processing restarts from the first operation so
	 * that the component metadata is recomputed before any further decision.
	 *
	 * @param ops operation list to optimize
	 * @return 0 on success, or a negative error code propagated from
	 *         ff_sws_op_list_insert_at()
	 */
	int ff_sws_op_list_optimize(SwsOpList *ops)
	{
	    int ret;

	retry:
	    ff_sws_op_list_update_comps(ops);

	    for (int n = 0; n < ops->num_ops;) {
	        /* Zeroed stand-in for the missing neighbour at either end */
	        SwsOp dummy = {0};
	        SwsOp *op = &ops->ops[n];
	        SwsOp *prev = n ? &ops->ops[n - 1] : &dummy;
	        SwsOp *next = n + 1 < ops->num_ops ? &ops->ops[n + 1] : &dummy;

	        /* common helper variable */
	        bool noop = true;

	        switch (op->op) {
	        case SWS_OP_READ:
	            /* Optimized further into refcopy / memcpy */
	            if (next->op == SWS_OP_WRITE &&
	                next->rw.elems == op->rw.elems &&
	                next->rw.packed == op->rw.packed &&
	                next->rw.frac == op->rw.frac)
	            {
	                ff_sws_op_list_remove_at(ops, n, 2);
	                av_assert1(ops->num_ops == 0);
	                return 0;
	            }

	            /* Skip reading extra unneeded components */
	            if (!op->rw.packed) {
	                int needed = op->rw.elems;
	                while (needed > 0 && next->comps.unused[needed - 1])
	                    needed--;
	                if (op->rw.elems != needed) {
	                    op->rw.elems = needed;
	                    goto retry;
	                }
	            }
	            break;

	        case SWS_OP_SWAP_BYTES:
	            /* Redundant (double) swap */
	            if (next->op == SWS_OP_SWAP_BYTES) {
	                ff_sws_op_list_remove_at(ops, n, 2);
	                goto retry;
	            }
	            break;

	        case SWS_OP_UNPACK:
	            /* Redundant unpack+pack */
	            if (next->op == SWS_OP_PACK && next->type == op->type &&
	                next->pack.pattern[0] == op->pack.pattern[0] &&
	                next->pack.pattern[1] == op->pack.pattern[1] &&
	                next->pack.pattern[2] == op->pack.pattern[2] &&
	                next->pack.pattern[3] == op->pack.pattern[3])
	            {
	                ff_sws_op_list_remove_at(ops, n, 2);
	                goto retry;
	            }
	            break;

	        case SWS_OP_LSHIFT:
	        case SWS_OP_RSHIFT:
	            /* Two shifts in the same direction */
	            if (next->op == op->op) {
	                op->c.u += next->c.u;
	                ff_sws_op_list_remove_at(ops, n + 1, 1);
	                goto retry;
	            }

	            /* No-op shift */
	            if (!op->c.u) {
	                ff_sws_op_list_remove_at(ops, n, 1);
	                goto retry;
	            }
	            break;

	        case SWS_OP_CLEAR:
	            for (int i = 0; i < 4; i++) {
	                if (!op->c.q4[i].den)
	                    continue;

	                if ((prev->comps.flags[i] & SWS_COMP_ZERO) &&
	                    !(prev->comps.flags[i] & SWS_COMP_GARBAGE) &&
	                    op->c.q4[i].num == 0)
	                {
	                    /* Redundant clear-to-zero of zero component */
	                    op->c.q4[i].den = 0;
	                } else if (next->comps.unused[i]) {
	                    /* Unnecessary clear of unused component */
	                    op->c.q4[i] = (AVRational) {0, 0};
	                } else if (op->c.q4[i].den) {
	                    noop = false;
	                }
	            }

	            if (noop) {
	                ff_sws_op_list_remove_at(ops, n, 1);
	                goto retry;
	            }

	            /* Transitive clear */
	            if (next->op == SWS_OP_CLEAR) {
	                for (int i = 0; i < 4; i++) {
	                    if (next->c.q4[i].den)
	                        op->c.q4[i] = next->c.q4[i];
	                }
	                ff_sws_op_list_remove_at(ops, n + 1, 1);
	                goto retry;
	            }

	            /* Prefer to clear as late as possible, to avoid doing
	             * redundant work */
	            if ((op_type_is_independent(next->op) && next->op != SWS_OP_SWAP_BYTES) ||
	                next->op == SWS_OP_SWIZZLE)
	            {
	                if (next->op == SWS_OP_CONVERT)
	                    op->type = next->convert.to;
	                /* Apply the next operation to the clear values themselves
	                 * before exchanging the two operations */
	                ff_sws_apply_op_q(next, op->c.q4);
	                FFSWAP(SwsOp, *op, *next);
	                goto retry;
	            }
	            break;

	        case SWS_OP_SWIZZLE: {
	            bool seen[4] = {0};
	            bool has_duplicates = false;
	            for (int i = 0; i < 4; i++) {
	                if (next->comps.unused[i])
	                    continue;
	                if (op->swizzle.in[i] != i)
	                    noop = false;
	                has_duplicates |= seen[op->swizzle.in[i]];
	                seen[op->swizzle.in[i]] = true;
	            }

	            /* Identity swizzle */
	            if (noop) {
	                ff_sws_op_list_remove_at(ops, n, 1);
	                goto retry;
	            }

	            /* Transitive swizzle */
	            if (next->op == SWS_OP_SWIZZLE) {
	                const SwsSwizzleOp orig = op->swizzle;
	                for (int i = 0; i < 4; i++)
	                    op->swizzle.in[i] = orig.in[next->swizzle.in[i]];
	                ff_sws_op_list_remove_at(ops, n + 1, 1);
	                goto retry;
	            }

	            /* Try to push swizzles with duplicates towards the output */
	            if (has_duplicates && op_type_is_independent(next->op)) {
	                if (next->op == SWS_OP_CONVERT)
	                    op->type = next->convert.to;
	                if (next->op == SWS_OP_MIN || next->op == SWS_OP_MAX) {
	                    /* Un-swizzle the next operation */
	                    const SwsConst c = next->c;
	                    for (int i = 0; i < 4; i++) {
	                        if (!next->comps.unused[i])
	                            next->c.q4[op->swizzle.in[i]] = c.q4[i];
	                    }
	                }
	                FFSWAP(SwsOp, *op, *next);
	                goto retry;
	            }

	            /* Move swizzle out of the way between two converts so that
	             * they may be merged */
	            if (prev->op == SWS_OP_CONVERT && next->op == SWS_OP_CONVERT) {
	                op->type = next->convert.to;
	                FFSWAP(SwsOp, *op, *next);
	                goto retry;
	            }
	            break;
	        }

	        case SWS_OP_CONVERT:
	            /* No-op conversion */
	            if (op->type == op->convert.to) {
	                ff_sws_op_list_remove_at(ops, n, 1);
	                goto retry;
	            }

	            /* Transitive conversion */
	            if (next->op == SWS_OP_CONVERT &&
	                op->convert.expand == next->convert.expand)
	            {
	                av_assert1(op->convert.to == next->type);
	                op->convert.to = next->convert.to;
	                ff_sws_op_list_remove_at(ops, n + 1, 1);
	                goto retry;
	            }

	            /* Conversion followed by integer expansion */
	            if (next->op == SWS_OP_SCALE && !op->convert.expand &&
	                !av_cmp_q(next->c.q, ff_sws_pixel_expand(op->type, op->convert.to)))
	            {
	                op->convert.expand = true;
	                ff_sws_op_list_remove_at(ops, n + 1, 1);
	                goto retry;
	            }
	            break;

	        case SWS_OP_MIN:
	            /* The clamp matters only if some used component may exceed it */
	            for (int i = 0; i < 4; i++) {
	                if (next->comps.unused[i] || !op->c.q4[i].den)
	                    continue;
	                if (av_cmp_q(op->c.q4[i], prev->comps.max[i]) < 0)
	                    noop = false;
	            }

	            if (noop) {
	                ff_sws_op_list_remove_at(ops, n, 1);
	                goto retry;
	            }
	            break;

	        case SWS_OP_MAX:
	            /* The clamp matters only if some used component may fall
	             * below it */
	            for (int i = 0; i < 4; i++) {
	                if (next->comps.unused[i] || !op->c.q4[i].den)
	                    continue;
	                if (av_cmp_q(prev->comps.min[i], op->c.q4[i]) < 0)
	                    noop = false;
	            }

	            if (noop) {
	                ff_sws_op_list_remove_at(ops, n, 1);
	                goto retry;
	            }
	            break;

	        case SWS_OP_DITHER:
	            /* Dithering is dropped when every component is either exact
	             * already or unused afterwards */
	            for (int i = 0; i < 4; i++) {
	                noop &= (prev->comps.flags[i] & SWS_COMP_EXACT) ||
	                        next->comps.unused[i];
	            }

	            if (noop) {
	                ff_sws_op_list_remove_at(ops, n, 1);
	                goto retry;
	            }
	            break;

	        case SWS_OP_LINEAR: {
	            SwsSwizzleOp swizzle;
	            SwsConst c;

	            /* No-op (identity) linear operation */
	            if (!op->lin.mask) {
	                ff_sws_op_list_remove_at(ops, n, 1);
	                goto retry;
	            }

	            if (next->op == SWS_OP_LINEAR) {
	                /* 5x5 matrix multiplication after appending [ 0 0 0 0 1 ] */
	                const SwsLinearOp m1 = op->lin;
	                const SwsLinearOp m2 = next->lin;
	                for (int i = 0; i < 4; i++) {
	                    for (int j = 0; j < 5; j++) {
	                        AVRational sum = Q(0);
	                        for (int k = 0; k < 4; k++)
	                            sum = av_add_q(sum, av_mul_q(m2.m[i][k], m1.m[k][j]));
	                        if (j == 4) /* m1.m[4][j] == 1 */
	                            sum = av_add_q(sum, m2.m[i][4]);
	                        op->lin.m[i][j] = sum;
	                    }
	                }
	                op->lin.mask = ff_sws_linear_mask(op->lin);
	                ff_sws_op_list_remove_at(ops, n + 1, 1);
	                goto retry;
	            }

	            /* Optimize away zero columns */
	            for (int j = 0; j < 4; j++) {
	                const uint32_t col = SWS_MASK_COL(j);
	                if (!(prev->comps.flags[j] & SWS_COMP_ZERO) || !(op->lin.mask & col))
	                    continue;
	                for (int i = 0; i < 4; i++)
	                    op->lin.m[i][j] = Q(i == j);
	                op->lin.mask &= ~col;
	                goto retry;
	            }

	            /* Optimize away unused rows */
	            for (int i = 0; i < 4; i++) {
	                const uint32_t row = SWS_MASK_ROW(i);
	                if (!next->comps.unused[i] || !(op->lin.mask & row))
	                    continue;
	                for (int j = 0; j < 5; j++)
	                    op->lin.m[i][j] = Q(i == j);
	                op->lin.mask &= ~row;
	                goto retry;
	            }

	            /* Convert constant rows to explicit clear instruction */
	            if (extract_constant_rows(&op->lin, prev->comps, &c)) {
	                RET(ff_sws_op_list_insert_at(ops, n + 1, &(SwsOp) {
	                    .op    = SWS_OP_CLEAR,
	                    .type  = op->type,
	                    .comps = op->comps,
	                    .c     = c,
	                }));
	                goto retry;
	            }

	            /* Multiplication by scalar constant */
	            if (extract_scalar(&op->lin, prev->comps, next->comps, &c)) {
	                op->op = SWS_OP_SCALE;
	                op->c  = c;
	                goto retry;
	            }

	            /* Swizzle by fixed pattern */
	            if (extract_swizzle(&op->lin, prev->comps, &swizzle)) {
	                RET(ff_sws_op_list_insert_at(ops, n, &(SwsOp) {
	                    .op      = SWS_OP_SWIZZLE,
	                    .type    = op->type,
	                    .swizzle = swizzle,
	                }));
	                goto retry;
	            }
	            break;
	        }

	        case SWS_OP_SCALE: {
	            const int factor2 = exact_log2_q(op->c.q);

	            /* No-op scaling */
	            if (op->c.q.num == 1 && op->c.q.den == 1) {
	                ff_sws_op_list_remove_at(ops, n, 1);
	                goto retry;
	            }

	            /* Scaling by integer before conversion to int */
	            if (op->c.q.den == 1 &&
	                next->op == SWS_OP_CONVERT &&
	                ff_sws_pixel_type_is_int(next->convert.to))
	            {
	                op->type = next->convert.to;
	                FFSWAP(SwsOp, *op, *next);
	                goto retry;
	            }

	            /* Scaling by exact power of two */
	            if (factor2 && ff_sws_pixel_type_is_int(op->type)) {
	                op->op = factor2 > 0 ? SWS_OP_LSHIFT : SWS_OP_RSHIFT;
	                op->c.u = FFABS(factor2);
	                goto retry;
	            }
	            break;
	        }
	        }

	        /* No optimization triggered, move on to next operation */
	        n++;
	    }

	    return 0;
	}

	int ff_sws_solve_shuffle(const SwsOpList *const ops, uint8_t shuffle[],
	int size, uint8_t clear_val,
	int read_bytes, int write_bytes)
	{
	const SwsOp read = ops->ops[0];
	const int read_size = ff_sws_pixel_type_size(read.type);
	uint32_t mask[4] = {0};

	if (!ops->num_ops \|\| read.op != SWS_OP_READ)
	return AVERROR(EINVAL);
	if (read.rw.frac \|\| (!read.rw.packed && read.rw.elems > 1))
	return AVERROR(ENOTSUP);

	for (int i = 0; i < read.rw.elems; i++)
	mask[i] = 0x01010101 * i * read_size + 0x03020100;

	for (int opidx = 1; opidx < ops->num_ops; opidx++) {
	const SwsOp *op = &ops->ops[opidx];
	switch (op->op) {
	case SWS_OP_SWIZZLE: {
	uint32_t orig[4] = { mask[0], mask[1], mask[2], mask[3] };
	for (int i = 0; i < 4; i++)
	mask[i] = orig[op->swizzle.in[i]];
	break;
	}

	case SWS_OP_SWAP_BYTES:
	for (int i = 0; i < 4; i++) {
	switch (ff_sws_pixel_type_size(op->type)) {
	case 2: mask[i] = av_bswap16(mask[i]); break;
	case 4: mask[i] = av_bswap32(mask[i]); break;
	}
	}
	break;

	case SWS_OP_CLEAR:
	for (int i = 0; i < 4; i++) {
	if (!op->c.q4[i].den)
	continue;
	if (op->c.q4[i].num != 0 \|\| !clear_val)
	return AVERROR(ENOTSUP);
	mask[i] = 0x1010101ul * clear_val;
	}
	break;

	case SWS_OP_CONVERT: {
	if (!op->convert.expand)
	return AVERROR(ENOTSUP);
	for (int i = 0; i < 4; i++) {
	switch (ff_sws_pixel_type_size(op->type)) {
	case 1: mask[i] = 0x01010101 * (mask[i] & 0xFF); break;
	case 2: mask[i] = 0x00010001 * (mask[i] & 0xFFFF); break;
	}
	}
	break;
	}

	case SWS_OP_WRITE: {
	if (op->rw.frac \|\| (!op->rw.packed && op->rw.elems > 1))
	return AVERROR(ENOTSUP);

	/* Initialize to no-op */
	memset(shuffle, clear_val, size);

	const int write_size = ff_sws_pixel_type_size(op->type);
	const int read_chunk = read.rw.elems * read_size;
	const int write_chunk = op->rw.elems * write_size;
	const int num_groups = size / FFMAX(read_chunk, write_chunk);
	for (int n = 0; n < num_groups; n++) {
	const int base_in = n * read_chunk;
	const int base_out = n * write_chunk;
	for (int i = 0; i < op->rw.elems; i++) {
	const int offset = base_out + i * write_size;
	for (int b = 0; b < write_size; b++) {
	const uint8_t idx = mask[i] >> (b * 8);
	if (idx != clear_val)
	shuffle[offset + b] = base_in + idx;
	}
	}
	}

	read_bytes = num_groups read_chunk;
	write_bytes = num_groups write_chunk;
	return num_groups;
	}

	default:
	return AVERROR(ENOTSUP);
	}
	}

	return AVERROR(EINVAL);
	}