/**
 * Copyright (C) 2026 Niklas Haas
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
| |
| #include <libavutil/bswap.h> |
| |
| #include "uops_tmpl.h" |
| |
/*
 * Template parameters.
 *
 * BIT_DEPTH (8 when the includer does not set it) and IS_FLOAT select the
 * pixel format this instantiation of the template operates on. Each branch
 * defines:
 *   pixel_t     element type of one pixel component
 *   inter_t     accumulator type used by the horizontal filter read
 *   PX / px     type tag, spelled for macro dispatch (PX) and for union
 *               member access (px)
 *   PIXEL_MAX   all-ones value; integer formats only
 *   PIXEL_SWAP  byte-swap helper; multi-byte integer formats only
 *
 * PIXEL_TYPE is only defined for the float format.
 */
#if !defined(BIT_DEPTH)
#  define BIT_DEPTH 8
#endif

#if IS_FLOAT && BIT_DEPTH == 32
#  define pixel_t    float
#  define inter_t    float
#  define PX         F32
#  define px         f32
#  define PIXEL_TYPE SWS_PIXEL_F32
#elif BIT_DEPTH == 32
#  define pixel_t    uint32_t
#  define inter_t    int64_t
#  define PX         U32
#  define px         u32
#  define PIXEL_MAX  0xFFFFFFFFu
#  define PIXEL_SWAP av_bswap32
#elif BIT_DEPTH == 16
#  define pixel_t    uint16_t
#  define inter_t    int64_t
#  define PX         U16
#  define px         u16
#  define PIXEL_MAX  0xFFFFu
#  define PIXEL_SWAP av_bswap16
#elif BIT_DEPTH == 8
#  define pixel_t    uint8_t
#  define inter_t    int32_t
#  define PX         U8
#  define px         u8
#  define PIXEL_MAX  0xFFu
#else
#  error Invalid BIT_DEPTH
#endif
| |
/*********************************
 * Generic read/write operations *
 *********************************/
| |
| DECL_READ(read_planar, const SwsCompMask mask) |
| { |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| if (X) x[i] = in0[i]; |
| if (Y) y[i] = in1[i]; |
| if (Z) z[i] = in2[i]; |
| if (W) w[i] = in3[i]; |
| } |
| |
| if (X) iter->in[0] += SIZEOF_BLOCK; |
| if (Y) iter->in[1] += SIZEOF_BLOCK; |
| if (Z) iter->in[2] += SIZEOF_BLOCK; |
| if (W) iter->in[3] += SIZEOF_BLOCK; |
| |
| CONTINUE(x, y, z, w); |
| } |
| |
| DECL_READ(read_packed, const SwsCompMask mask) |
| { |
| const int elems = W ? 4 : Z ? 3 : Y ? 2 : 1; |
| |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| if (X) x[i] = in0[elems * i + 0]; |
| if (Y) y[i] = in0[elems * i + 1]; |
| if (Z) z[i] = in0[elems * i + 2]; |
| if (W) w[i] = in0[elems * i + 3]; |
| } |
| |
| iter->in[0] += SIZEOF_BLOCK * elems; |
| CONTINUE(x, y, z, w); |
| } |
| |
| DECL_WRITE(write_planar, const SwsCompMask mask) |
| { |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| if (X) out0[i] = x[i]; |
| if (Y) out1[i] = y[i]; |
| if (Z) out2[i] = z[i]; |
| if (W) out3[i] = w[i]; |
| } |
| |
| if (X) iter->out[0] += SIZEOF_BLOCK; |
| if (Y) iter->out[1] += SIZEOF_BLOCK; |
| if (Z) iter->out[2] += SIZEOF_BLOCK; |
| if (W) iter->out[3] += SIZEOF_BLOCK; |
| } |
| |
| DECL_WRITE(write_packed, const SwsCompMask mask) |
| { |
| const int elems = W ? 4 : Z ? 3 : Y ? 2 : 1; |
| |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| if (X) out0[elems * i + 0] = x[i]; |
| if (Y) out0[elems * i + 1] = y[i]; |
| if (Z) out0[elems * i + 2] = z[i]; |
| if (W) out0[elems * i + 3] = w[i]; |
| } |
| |
| iter->out[0] += SIZEOF_BLOCK * elems; |
| } |
| |
#if BIT_DEPTH == 8

/**
 * Read 1-bit pixels: eight pixels per input byte, first pixel in the most
 * significant bit. Each pixel becomes its own element holding 0 or 1.
 * Consumes SIZEOF_BLOCK / 8 of input.
 */
DECL_READ(read_bit, const SwsCompMask mask)
{
    av_assert2(mask == SWS_COMP_ELEMS(1));

    SWS_LOOP
    for (int base = 0; base < SWS_BLOCK_SIZE; base += 8) {
        const pixel_t packed = ((const pixel_t *) in0)[base >> 3];

        for (int bit = 0; bit < 8; bit++)
            x[base + bit] = (packed >> (7 - bit)) & 1;
    }

    iter->in[0] += SIZEOF_BLOCK >> 3;
    CONTINUE(x, y, z, w);
}

/**
 * Read 4-bit pixels: two pixels per input byte, first pixel in the high
 * nibble. Consumes SIZEOF_BLOCK / 2 of input.
 */
DECL_READ(read_nibble, const SwsCompMask mask)
{
    av_assert2(mask == SWS_COMP_ELEMS(1));

    SWS_LOOP
    for (int base = 0; base < SWS_BLOCK_SIZE; base += 2) {
        const pixel_t packed = in0[base / 2];

        x[base]     = packed >> 4;  /* first pixel: upper four bits */
        x[base + 1] = packed & 0xF; /* second pixel: lower four bits */
    }

    iter->in[0] += SIZEOF_BLOCK >> 1;
    CONTINUE(x, y, z, w);
}

/**
 * Write 1-bit pixels: eight consecutive elements are combined into one
 * output byte, first element in the most significant bit.
 * Produces SIZEOF_BLOCK / 8 of output.
 */
DECL_WRITE(write_bit, const SwsCompMask mask)
{
    av_assert2(mask == SWS_COMP_ELEMS(1));

    SWS_LOOP
    for (int base = 0; base < SWS_BLOCK_SIZE; base += 8) {
        int packed = 0;

        for (int bit = 0; bit < 8; bit++)
            packed |= x[base + bit] << (7 - bit);

        out0[base >> 3] = packed;
    }

    iter->out[0] += SIZEOF_BLOCK >> 3;
}

/**
 * Write 4-bit pixels: two consecutive elements are combined into one output
 * byte, first element in the high nibble.
 * Produces SIZEOF_BLOCK / 2 of output.
 */
DECL_WRITE(write_nibble, const SwsCompMask mask)
{
    av_assert2(mask == SWS_COMP_ELEMS(1));

    SWS_LOOP
    for (int base = 0; base < SWS_BLOCK_SIZE; base += 2) {
        const int high = x[base] << 4;
        const int low  = x[base + 1];

        out0[base / 2] = high | low;
    }

    iter->out[0] += SIZEOF_BLOCK >> 1;
}

#endif /* BIT_DEPTH == 8 */
| |
| SWS_FOR(PX, READ_PLANAR, DECL_IMPL_READ, read_planar) |
| SWS_FOR(PX, READ_PACKED, DECL_IMPL_READ, read_packed) |
| SWS_FOR(PX, READ_NIBBLE, DECL_IMPL_READ, read_nibble) |
| SWS_FOR(PX, READ_BIT, DECL_IMPL_READ, read_bit) |
| SWS_FOR(PX, WRITE_PLANAR, DECL_IMPL_WRITE, write_planar) |
| SWS_FOR(PX, WRITE_PACKED, DECL_IMPL_WRITE, write_packed) |
| SWS_FOR(PX, WRITE_NIBBLE, DECL_IMPL_WRITE, write_nibble) |
| SWS_FOR(PX, WRITE_BIT, DECL_IMPL_WRITE, write_bit) |
| |
| SWS_FOR_STRUCT(PX, READ_PLANAR, DECL_ENTRY) |
| SWS_FOR_STRUCT(PX, READ_PACKED, DECL_ENTRY) |
| SWS_FOR_STRUCT(PX, READ_NIBBLE, DECL_ENTRY) |
| SWS_FOR_STRUCT(PX, READ_BIT, DECL_ENTRY) |
| SWS_FOR_STRUCT(PX, WRITE_PLANAR, DECL_ENTRY) |
| SWS_FOR_STRUCT(PX, WRITE_PACKED, DECL_ENTRY) |
| SWS_FOR_STRUCT(PX, WRITE_NIBBLE, DECL_ENTRY) |
| SWS_FOR_STRUCT(PX, WRITE_BIT, DECL_ENTRY) |
| |
/*****************************
 * Scaling / filtering reads *
 *****************************/
| |
| DECL_SETUP(setup_filter_v, params, out) |
| { |
| if (params->uop->par.filter.type != SWS_PIXEL_F32) |
| return AVERROR(ENOTSUP); |
| |
| const SwsFilterWeights *filter = params->uop->data.kernel; |
| static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]), |
| ">8 byte pointers not supported"); |
| |
| /* Pre-convert weights to float */ |
| float *weights = av_calloc(filter->num_weights, sizeof(float)); |
| if (!weights) |
| return AVERROR(ENOMEM); |
| |
| for (int i = 0; i < filter->num_weights; i++) |
| weights[i] = (float) filter->weights[i] / SWS_FILTER_SCALE; |
| |
| out->priv.ptr = weights; |
| out->priv.i32[2] = filter->filter_size; |
| out->free = ff_op_priv_free; |
| return 0; |
| } |
| |
| /* Fully general vertical planar filter case */ |
| DECL_READ(read_planar_fv, const SwsCompMask mask, const SwsPixelType type) |
| { |
| av_assert2(type == SWS_PIXEL_F32); |
| const SwsOpExec *exec = iter->exec; |
| const float *restrict weights = impl->priv.ptr; |
| const int filter_size = impl->priv.i32[2]; |
| weights += filter_size * iter->y; |
| |
| block_t xs, ys, zs, ws; |
| if (X) memset(&xs.f32, 0, sizeof(xs.f32)); |
| if (Y) memset(&ys.f32, 0, sizeof(ys.f32)); |
| if (Z) memset(&zs.f32, 0, sizeof(zs.f32)); |
| if (W) memset(&ws.f32, 0, sizeof(ws.f32)); |
| |
| for (int j = 0; j < filter_size; j++) { |
| const float weight = weights[j]; |
| |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| if (X) xs.f32[i] += weight * in0[i]; |
| if (Y) ys.f32[i] += weight * in1[i]; |
| if (Z) zs.f32[i] += weight * in2[i]; |
| if (W) ws.f32[i] += weight * in3[i]; |
| } |
| |
| if (X) in0 = bump_ptr(in0, exec->in_stride[0]); |
| if (Y) in1 = bump_ptr(in1, exec->in_stride[1]); |
| if (Z) in2 = bump_ptr(in2, exec->in_stride[2]); |
| if (W) in3 = bump_ptr(in3, exec->in_stride[3]); |
| } |
| |
| if (X) iter->in[0] += SIZEOF_BLOCK; |
| if (Y) iter->in[1] += SIZEOF_BLOCK; |
| if (Z) iter->in[2] += SIZEOF_BLOCK; |
| if (W) iter->in[3] += SIZEOF_BLOCK; |
| |
| CONTINUE(&xs, &ys, &zs, &ws); |
| } |
| |
| DECL_SETUP(setup_filter_h, params, out) |
| { |
| if (params->uop->par.filter.type != SWS_PIXEL_F32) |
| return AVERROR(ENOTSUP); |
| |
| SwsFilterWeights *filter = params->uop->data.kernel; |
| out->priv.ptr = av_refstruct_ref(filter->weights); |
| out->priv.i32[2] = filter->filter_size; |
| out->free = ff_op_priv_unref; |
| return 0; |
| } |
| |
| /* Fully general horizontal planar filter case */ |
| DECL_READ(read_planar_fh, const SwsCompMask mask, const SwsPixelType type) |
| { |
| av_assert2(type == SWS_PIXEL_F32); |
| const SwsOpExec *exec = iter->exec; |
| const int *restrict weights = impl->priv.ptr; |
| const int filter_size = impl->priv.i32[2]; |
| const float scale = 1.0f / SWS_FILTER_SCALE; |
| const int xpos = iter->x; |
| weights += filter_size * iter->x; |
| |
| block_t xs, ys, zs, ws; |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| const int offset = exec->in_offset_x[xpos + i]; |
| pixel_t *start0 = bump_ptr(in0, offset); |
| pixel_t *start1 = bump_ptr(in1, offset); |
| pixel_t *start2 = bump_ptr(in2, offset); |
| pixel_t *start3 = bump_ptr(in3, offset); |
| |
| inter_t sx = 0, sy = 0, sz = 0, sw = 0; |
| for (int j = 0; j < filter_size; j++) { |
| const int weight = weights[j]; |
| if (X) sx += weight * start0[j]; |
| if (Y) sy += weight * start1[j]; |
| if (Z) sz += weight * start2[j]; |
| if (W) sw += weight * start3[j]; |
| } |
| |
| if (X) xs.f32[i] = (float) sx * scale; |
| if (Y) ys.f32[i] = (float) sy * scale; |
| if (Z) zs.f32[i] = (float) sz * scale; |
| if (W) ws.f32[i] = (float) sw * scale; |
| |
| weights += filter_size; |
| } |
| |
| CONTINUE(&xs, &ys, &zs, &ws); |
| } |
| |
| SWS_FOR(PX, READ_PLANAR_FV, DECL_IMPL_READ, read_planar_fv) |
| SWS_FOR(PX, READ_PLANAR_FH, DECL_IMPL_READ, read_planar_fh) |
| SWS_FOR_STRUCT(PX, READ_PLANAR_FV, DECL_ENTRY, .setup = fn(setup_filter_v) ) |
| SWS_FOR_STRUCT(PX, READ_PLANAR_FH, DECL_ENTRY, .setup = fn(setup_filter_h) ) |
| |
/***************************
 * Permutation and copying *
 ***************************/
| |
/*
 * Permutation kernel generator.
 *
 * No pixel data is touched: the generated function forwards its four block
 * pointers to the continuation in the order IDX0..IDX3. The DUMMY, TYPE, UOP
 * and MASK arguments are not used by the expansion.
 */
#define DECL_PERMUTE(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3)      \
    static void NAME##_c(SwsOpIter *restrict iter,                              \
                         const SwsOpImpl *restrict impl,                        \
                         void *restrict in0, void *restrict in1,                \
                         void *restrict in2, void *restrict in3)                \
    {                                                                           \
        CONTINUE(in##IDX0, in##IDX1, in##IDX2, in##IDX3);                       \
    }
| |
/*
 * Copy kernel generator.
 *
 * For every component enabled in MASK, the block selected by the matching
 * IDXn is copied into a local block, which is passed on in that component's
 * slot. Components not in MASK are forwarded unchanged (in0..in3). The
 * DUMMY, TYPE and UOP arguments are not used by the expansion.
 */
#define DECL_COPY(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3)         \
    static void NAME##_c(SwsOpIter *restrict iter,                              \
                         const SwsOpImpl *restrict impl,                        \
                         void *restrict in0, void *restrict in1,                \
                         void *restrict in2, void *restrict in3)                \
    {                                                                           \
        const SwsCompMask mask = (MASK);                                        \
        block_t copy_x, copy_y, copy_z, copy_w;                                 \
                                                                                \
        if (X)                                                                  \
            memcpy(&copy_x.px, in##IDX0, SIZEOF_BLOCK);                         \
        if (Y)                                                                  \
            memcpy(&copy_y.px, in##IDX1, SIZEOF_BLOCK);                         \
        if (Z)                                                                  \
            memcpy(&copy_z.px, in##IDX2, SIZEOF_BLOCK);                         \
        if (W)                                                                  \
            memcpy(&copy_w.px, in##IDX3, SIZEOF_BLOCK);                         \
                                                                                \
        CONTINUE(X ? &copy_x : in0, Y ? &copy_y : in1,                          \
                 Z ? &copy_z : in2, W ? &copy_w : in3);                         \
    }
| |
| SWS_FOR(PX, PERMUTE, DECL_PERMUTE) |
| SWS_FOR(PX, COPY, DECL_COPY) |
| SWS_FOR_STRUCT(PX, PERMUTE, DECL_ENTRY) |
| SWS_FOR_STRUCT(PX, COPY, DECL_ENTRY) |
| |
/*********************
 * Format conversion *
 *********************/
| |
| #define DECL_CAST(DST, dst) \ |
| DECL_FUNC(to_##dst, const SwsCompMask mask) \ |
| { \ |
| block_t xx, yy, zz, ww; \ |
| \ |
| SWS_LOOP \ |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { \ |
| if (X) xx.dst[i] = x[i]; \ |
| if (Y) yy.dst[i] = y[i]; \ |
| if (Z) zz.dst[i] = z[i]; \ |
| if (W) ww.dst[i] = w[i]; \ |
| } \ |
| \ |
| CONTINUE(&xx, &yy, &zz, &ww); \ |
| } \ |
| \ |
| SWS_FOR(PX, TO_##DST, DECL_IMPL, to_##dst) \ |
| SWS_FOR_STRUCT(PX, TO_##DST, DECL_ENTRY) |
| |
| DECL_CAST(U8, u8) |
| DECL_CAST(U16, u16) |
| DECL_CAST(U32, u32) |
| DECL_CAST(F32, f32) |
| |
/********************
 * Bit manipulation *
 ********************/
| |
| #if !IS_FLOAT |
| DECL_FUNC(lshift, const SwsCompMask mask, const uint8_t amount) |
| { |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| if (X) x[i] <<= amount; |
| if (Y) y[i] <<= amount; |
| if (Z) z[i] <<= amount; |
| if (W) w[i] <<= amount; |
| } |
| |
| CONTINUE(x, y, z, w); |
| } |
| |
| DECL_FUNC(rshift, const SwsCompMask mask, const uint8_t amount) |
| { |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| if (X) x[i] >>= amount; |
| if (Y) y[i] >>= amount; |
| if (Z) z[i] >>= amount; |
| if (W) w[i] >>= amount; |
| } |
| |
| CONTINUE(x, y, z, w); |
| } |
| #endif |
| |
| SWS_FOR(PX, LSHIFT, DECL_IMPL, lshift) |
| SWS_FOR(PX, RSHIFT, DECL_IMPL, rshift) |
| |
| SWS_FOR_STRUCT(PX, LSHIFT, DECL_ENTRY) |
| SWS_FOR_STRUCT(PX, RSHIFT, DECL_ENTRY) |
| |
| #ifdef PIXEL_SWAP |
| DECL_FUNC(swap_bytes, const SwsCompMask mask) |
| { |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| if (X) x[i] = PIXEL_SWAP(x[i]); |
| if (Y) y[i] = PIXEL_SWAP(y[i]); |
| if (Z) z[i] = PIXEL_SWAP(z[i]); |
| if (W) w[i] = PIXEL_SWAP(w[i]); |
| } |
| |
| CONTINUE(x, y, z, w); |
| } |
| #endif /* PIXEL_SWAP */ |
| |
| SWS_FOR(PX, SWAP_BYTES, DECL_IMPL, swap_bytes) |
| SWS_FOR_STRUCT(PX, SWAP_BYTES, DECL_ENTRY) |
| |
#if defined(PIXEL_MAX)
/**
 * Expand a boolean value to the full pixel range: every non-zero element of
 * an enabled component becomes PIXEL_MAX, zero stays zero.
 */
DECL_FUNC(expand_bit, const SwsCompMask mask)
{
    SWS_LOOP
    for (int k = 0; k < SWS_BLOCK_SIZE; k++) {
        /* A zero element already holds the desired result */
        if (X && x[k])
            x[k] = PIXEL_MAX;
        if (Y && y[k])
            y[k] = PIXEL_MAX;
        if (Z && z[k])
            z[k] = PIXEL_MAX;
        if (W && w[k])
            w[k] = PIXEL_MAX;
    }

    CONTINUE(x, y, z, w);
}
#endif /* PIXEL_MAX */
| |
#if BIT_DEPTH == 8
/**
 * Widen 8-bit elements to 16 bits by replicating the byte into both halves
 * (0xAB becomes 0xABAB).
 */
DECL_FUNC(expand_pair, const SwsCompMask mask)
{
    block_t wide_x, wide_y, wide_z, wide_w;

    SWS_LOOP
    for (int k = 0; k < SWS_BLOCK_SIZE; k++) {
        /* v * 0x0101 == v << 8 | v for any 8-bit v */
        if (X)
            wide_x.u16[k] = x[k] * 0x0101;
        if (Y)
            wide_y.u16[k] = y[k] * 0x0101;
        if (Z)
            wide_z.u16[k] = z[k] * 0x0101;
        if (W)
            wide_w.u16[k] = w[k] * 0x0101;
    }

    CONTINUE(&wide_x, &wide_y, &wide_z, &wide_w);
}

/**
 * Widen 8-bit elements to 32 bits by replicating the byte into all four
 * positions (0xAB becomes 0xABABABAB).
 */
DECL_FUNC(expand_quad, const SwsCompMask mask)
{
    block_t wide_x, wide_y, wide_z, wide_w;

    SWS_LOOP
    for (int k = 0; k < SWS_BLOCK_SIZE; k++) {
        if (X) {
            const uint32_t v = x[k];
            wide_x.u32[k] = v << 24 | v << 16 | v << 8 | v;
        }
        if (Y) {
            const uint32_t v = y[k];
            wide_y.u32[k] = v << 24 | v << 16 | v << 8 | v;
        }
        if (Z) {
            const uint32_t v = z[k];
            wide_z.u32[k] = v << 24 | v << 16 | v << 8 | v;
        }
        if (W) {
            const uint32_t v = w[k];
            wide_w.u32[k] = v << 24 | v << 16 | v << 8 | v;
        }
    }

    CONTINUE(&wide_x, &wide_y, &wide_z, &wide_w);
}
#endif /* BIT_DEPTH == 8 */
| |
| SWS_FOR(PX, EXPAND_BIT, DECL_IMPL, expand_bit) |
| SWS_FOR(PX, EXPAND_PAIR, DECL_IMPL, expand_pair) |
| SWS_FOR(PX, EXPAND_QUAD, DECL_IMPL, expand_quad) |
| SWS_FOR_STRUCT(PX, EXPAND_BIT, DECL_ENTRY) |
| SWS_FOR_STRUCT(PX, EXPAND_PAIR, DECL_ENTRY) |
| SWS_FOR_STRUCT(PX, EXPAND_QUAD, DECL_ENTRY) |
| |
/*************************
 * Packing and unpacking *
 *************************/
| |
| #if !IS_FLOAT |
| DECL_FUNC(unpack, const SwsCompMask mask, |
| const uint8_t bx, const uint8_t by, |
| const uint8_t bz, const uint8_t bw) |
| { |
| const uint8_t sx = bw + bz + by; |
| const uint8_t sy = bw + bz; |
| const uint8_t sz = bw; |
| const uint8_t sw = 0; |
| |
| const pixel_t mx = (1 << bx) - 1; |
| const pixel_t my = (1 << by) - 1; |
| const pixel_t mz = (1 << bz) - 1; |
| const pixel_t mw = (1 << bw) - 1; |
| |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| const pixel_t val = x[i]; |
| if (X) x[i] = (val >> sx) & mx; |
| if (Y) y[i] = (val >> sy) & my; |
| if (Z) z[i] = (val >> sz) & mz; |
| if (W) w[i] = (val >> sw) & mw; |
| } |
| |
| CONTINUE(x, y, z, w); |
| } |
| |
| DECL_FUNC(pack, const SwsCompMask mask, |
| const uint8_t bx, const uint8_t by, |
| const uint8_t bz, const uint8_t bw) |
| { |
| const uint8_t sx = bw + bz + by; |
| const uint8_t sy = bw + bz; |
| const uint8_t sz = bw; |
| const uint8_t sw = 0; |
| |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| pixel_t val = 0; |
| if (X) val |= x[i] << sx; |
| if (Y) val |= y[i] << sy; |
| if (Z) val |= z[i] << sz; |
| if (W) val |= w[i] << sw; |
| x[i] = val; |
| } |
| |
| CONTINUE(x, y, z, w); |
| } |
| #endif /* !IS_FLOAT */ |
| |
| SWS_FOR(PX, UNPACK, DECL_IMPL, unpack) |
| SWS_FOR(PX, PACK, DECL_IMPL, pack) |
| SWS_FOR_STRUCT(PX, UNPACK, DECL_ENTRY) |
| SWS_FOR_STRUCT(PX, PACK, DECL_ENTRY) |
| |
/***********************
 * Pixel data clearing *
 ***********************/
| |
| #ifdef PIXEL_MAX |
| DECL_FUNC(clear, const SwsCompMask mask, const SwsCompMask one, |
| const SwsCompMask zero) |
| { |
| #define ONE(N) SWS_COMP_TEST(one, N) |
| #define ZERO(N) SWS_COMP_TEST(zero, N) |
| const pixel_t cx = ONE(0) ? PIXEL_MAX : ZERO(0) ? 0 : impl->priv.px[0]; |
| const pixel_t cy = ONE(1) ? PIXEL_MAX : ZERO(1) ? 0 : impl->priv.px[1]; |
| const pixel_t cz = ONE(2) ? PIXEL_MAX : ZERO(2) ? 0 : impl->priv.px[2]; |
| const pixel_t cw = ONE(3) ? PIXEL_MAX : ZERO(3) ? 0 : impl->priv.px[3]; |
| |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| if (X) x[i] = cx; |
| if (Y) y[i] = cy; |
| if (Z) z[i] = cz; |
| if (W) w[i] = cw; |
| } |
| |
| CONTINUE(x, y, z, w); |
| } |
| #endif |
| |
| SWS_FOR(PX, CLEAR, DECL_IMPL, clear) |
| SWS_FOR_STRUCT(PX, CLEAR, DECL_ENTRY, .setup = ff_sws_setup_vec4) |
| |
/*************************
 * Arithmetic operations *
 *************************/
| |
| DECL_FUNC(scale, const SwsCompMask mask) |
| { |
| const pixel_t scale = impl->priv.px[0]; |
| |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| if (X) x[i] *= scale; |
| if (Y) y[i] *= scale; |
| if (Z) z[i] *= scale; |
| if (W) w[i] *= scale; |
| } |
| |
| CONTINUE(x, y, z, w); |
| } |
| |
| DECL_FUNC(add, const SwsCompMask mask) |
| { |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| if (X) x[i] += impl->priv.px[0]; |
| if (Y) y[i] += impl->priv.px[1]; |
| if (Z) z[i] += impl->priv.px[2]; |
| if (W) w[i] += impl->priv.px[3]; |
| } |
| |
| CONTINUE(x, y, z, w); |
| } |
| |
| DECL_FUNC(min, const SwsCompMask mask) |
| { |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| if (X) x[i] = FFMIN(x[i], impl->priv.px[0]); |
| if (Y) y[i] = FFMIN(y[i], impl->priv.px[1]); |
| if (Z) z[i] = FFMIN(z[i], impl->priv.px[2]); |
| if (W) w[i] = FFMIN(w[i], impl->priv.px[3]); |
| } |
| |
| CONTINUE(x, y, z, w); |
| } |
| |
| DECL_FUNC(max, const SwsCompMask mask) |
| { |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| if (X) x[i] = FFMAX(x[i], impl->priv.px[0]); |
| if (Y) y[i] = FFMAX(y[i], impl->priv.px[1]); |
| if (Z) z[i] = FFMAX(z[i], impl->priv.px[2]); |
| if (W) w[i] = FFMAX(w[i], impl->priv.px[3]); |
| } |
| |
| CONTINUE(x, y, z, w); |
| } |
| |
| SWS_FOR(PX, SCALE, DECL_IMPL, scale) |
| SWS_FOR(PX, ADD, DECL_IMPL, add) |
| SWS_FOR(PX, MIN, DECL_IMPL, min) |
| SWS_FOR(PX, MAX, DECL_IMPL, max) |
| SWS_FOR_STRUCT(PX, SCALE, DECL_ENTRY, .setup = ff_sws_setup_scalar ) |
| SWS_FOR_STRUCT(PX, ADD, DECL_ENTRY, .setup = ff_sws_setup_vec4 ) |
| SWS_FOR_STRUCT(PX, MIN, DECL_ENTRY, .setup = ff_sws_setup_vec4 ) |
| SWS_FOR_STRUCT(PX, MAX, DECL_ENTRY, .setup = ff_sws_setup_vec4 ) |
| |
/*************
 * Dithering *
 *************/
| |
| DECL_SETUP(setup_dither, params, out) |
| { |
| const SwsUOp *uop = params->uop; |
| const SwsDitherUOp *dither = &uop->par.dither; |
| const int size = 1 << dither->size_log2; |
| if (size >= SWS_BLOCK_SIZE) { |
| /* No extra padding needed */ |
| out->priv.ptr = av_refstruct_ref(uop->data.ptr); |
| out->free = ff_op_priv_unref; |
| return 0; |
| } |
| |
| const int stride = FFMAX(size, SWS_BLOCK_SIZE); |
| const int height = ff_sws_dither_height(dither); |
| pixel_t *matrix = av_malloc(sizeof(pixel_t) * height * stride); |
| if (!matrix) |
| return AVERROR(ENOMEM); |
| out->priv.ptr = matrix; |
| out->free = ff_op_priv_free; |
| |
| /* Pad to multiple of block size. We don't need extra padding for the |
| * height because ff_sws_dither_height() already includes any padding |
| * necessary for the y_offset */ |
| for (int y = 0; y < height; y++) { |
| pixel_t *row = &matrix[y * stride]; |
| for (int x = 0; x < size; x++) |
| row[x] = uop->data.ptr[y * size + x].px; |
| for (int x = size; x < stride; x++) |
| row[x] = row[x % size]; |
| } |
| |
| return 0; |
| } |
| |
| DECL_FUNC(dither, const SwsCompMask mask, |
| const uint8_t off0, const uint8_t off1, |
| const uint8_t off2, const uint8_t off3, |
| const uint8_t size_log2) |
| { |
| const int size = 1 << size_log2; |
| const int stride = FFMAX(size, SWS_BLOCK_SIZE); |
| |
| const pixel_t *matrix = impl->priv.ptr; |
| matrix += (iter->y & (size - 1)) * stride; |
| matrix += (iter->x & (size - 1)) & ~(SWS_BLOCK_SIZE - 1); |
| |
| const pixel_t *const row0 = &matrix[off0 * stride]; |
| const pixel_t *const row1 = &matrix[off1 * stride]; |
| const pixel_t *const row2 = &matrix[off2 * stride]; |
| const pixel_t *const row3 = &matrix[off3 * stride]; |
| |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| if (X) x[i] += row0[i]; |
| if (Y) y[i] += row1[i]; |
| if (Z) z[i] += row2[i]; |
| if (W) w[i] += row3[i]; |
| } |
| |
| CONTINUE(x, y, z, w); |
| } |
| |
| SWS_FOR(PX, DITHER, DECL_IMPL, dither) |
| SWS_FOR_STRUCT(PX, DITHER, DECL_ENTRY, .setup = fn(setup_dither) ) |
| |
/*********************
 * Linear operations *
 *********************/
| |
| typedef struct { |
| /* Stored in split form for convenience */ |
| pixel_t m[4][4]; |
| pixel_t k[4]; |
| } fn(LinCoeffs); |
| |
| DECL_SETUP(setup_linear, params, out) |
| { |
| const SwsUOp *uop = params->uop; |
| fn(LinCoeffs) c; |
| |
| for (int i = 0; i < 4; i++) { |
| for (int j = 0; j < 4; j++) |
| c.m[i][j] = uop->data.mat4[i][j].px; |
| c.k[i] = uop->data.mat4[i][4].px; |
| } |
| |
| out->priv.ptr = av_memdup(&c, sizeof(c)); |
| out->free = ff_op_priv_free; |
| return out->priv.ptr ? 0 : AVERROR(ENOMEM); |
| } |
| |
| /** |
| * Fully general case for a 5x5 linear affine transformation. Should never be |
| * called without constant `mask`. This function will compile down to the |
| * appropriately optimized version for the required subset of operations when |
| * called with a constant mask. |
| */ |
| DECL_FUNC(linear, const SwsCompMask mask, const uint32_t one, const uint32_t zero) |
| { |
| const fn(LinCoeffs) c = *(const fn(LinCoeffs) *) impl->priv.ptr; |
| |
| SWS_LOOP |
| for (int i = 0; i < SWS_BLOCK_SIZE; i++) { |
| const pixel_t xx = x[i]; |
| const pixel_t yy = y[i]; |
| const pixel_t zz = z[i]; |
| const pixel_t ww = w[i]; |
| |
| #define LIN_VAL(I, J, val) \ |
| ((one & SWS_MASK(I, J)) ? (val) : c.m[I][J] * (val)) |
| |
| #define LIN_ROW(I, var) do { \ |
| var[i] = (zero & SWS_MASK(I, 4)) ? 0 : c.k[I]; \ |
| if (!(zero & SWS_MASK(I, 0))) var[i] += LIN_VAL(I, 0, xx); \ |
| if (!(zero & SWS_MASK(I, 1))) var[i] += LIN_VAL(I, 1, yy); \ |
| if (!(zero & SWS_MASK(I, 2))) var[i] += LIN_VAL(I, 2, zz); \ |
| if (!(zero & SWS_MASK(I, 3))) var[i] += LIN_VAL(I, 3, ww); \ |
| } while (0) |
| |
| if (X) LIN_ROW(0, x); |
| if (Y) LIN_ROW(1, y); |
| if (Z) LIN_ROW(2, z); |
| if (W) LIN_ROW(3, w); |
| } |
| |
| CONTINUE(x, y, z, w); |
| } |
| |
| SWS_FOR(PX, LINEAR, DECL_IMPL, linear) |
| SWS_FOR_STRUCT(PX, LINEAR, DECL_ENTRY, .setup = fn(setup_linear) ) |
| |
/*
 * Drop the per-instantiation template macros so that the next pixel type
 * can define its own. PIXEL_TYPE is only defined for the float format;
 * #undef of a name that is not currently defined is harmless.
 */
#undef PIXEL_TYPE
#undef PIXEL_MAX
#undef PIXEL_SWAP
#undef pixel_t
#undef inter_t
#undef block_t
#undef PX
#undef px