;*****************************************************************************
;* x86-optimized functions for colorspace filter
;*
;* Copyright (C) 2016 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_1: times 8 dw 1
pw_2: times 8 dw 2
pw_4: times 8 dw 4
pw_8: times 8 dw 8
pw_16: times 8 dw 16
pw_64: times 8 dw 64
pw_128: times 8 dw 128
pw_256: times 8 dw 256
pw_512: times 8 dw 512
pw_1023: times 8 dw 1023
pw_1024: times 8 dw 1024
pw_2048: times 8 dw 2048
pw_4095: times 8 dw 4095
pw_8192: times 8 dw 8192
pw_16384: times 8 dw 16384

pd_1: times 4 dd 1
pd_2: times 4 dd 2
pd_128: times 4 dd 128
pd_512: times 4 dd 512
pd_2048: times 4 dd 2048
pd_8192: times 4 dd 8192
pd_32768: times 4 dd 32768
pd_131072: times 4 dd 131072

SECTION .text

; void ff_yuv2yuv_420p8to8_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3],
;                               uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3],
;                               int w, int h, const int16_t yuv2yuv_coeffs[3][3][8],
;                               const int16_t yuv_offset[2][8])
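; The YUV2YUV_FN macro below covers every combination of 8/10/12-bit input and
; output at 4:4:4, 4:2:2 or 4:2:0. The coefficients are 14-bit fixed point, so
; the final right shift %%sh = 14 + in_depth - out_depth removes the
; coefficient scaling and converts the bit depth in a single step.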

%if ARCH_X86_64
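; (each function below uses 14 GPRs and 16 XMM registers, hence the x86-64-only build)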
%macro YUV2YUV_FN 4 ; in_bitdepth, out_bitdepth, log2_chroma_w (horiz), log2_chroma_h (vert)

%assign %%sh (14 + %1 - %2)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvinoff (128 << (%1 - 8))
%assign %%uvoutoff (128 << (%2 - 8))
%if %3 == 0
%assign %%ss 444
%elif %4 == 0
%assign %%ss 422
%else ; %4 == 1
%assign %%ss 420
%endif ; %3/%4
%if %2 != 8
%assign %%maxval (1 << %2) - 1
%endif ; %2 != 8

%assign %%ypsh %%sh - 1
%if %%ypsh > 14
%assign %%yoffsh %%ypsh - 13
%assign %%ypsh 14
%else
%assign %%yoffsh 1
%endif
%assign %%yprnd (1 << (%%yoffsh - 1))
%assign %%ypmul (1 << %%ypsh)
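; the output luma offset is pre-shifted by %%yoffsh and pre-rounded with %%yprnd
; so that, once interleaved with the luma coefficient in m10, a single pmaddwd
; against (y, %%ypmul) word pairs applies gain, offset and rounding at once;
; %%ypsh is capped at 14 so the %%ypmul multiplier still fits in a signed word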

cglobal yuv2yuv_ %+ %%ss %+ p%1to%2, 8, 14, 16, 0 - (4 * mmsize), \
yo, yos, yi, yis, w, h, c, yoff, ui, vi, uo, vo
%if %3 == 1
inc wd
sar wd, 1
%if %4 == 1
inc hd
sar hd, 1
%endif ; %4 == 1
%endif ; %3 == 1
mov [rsp+3*mmsize+0], wd
mov [rsp+3*mmsize+4], hd

mova m10, [cq]
pxor m11, m11
mova m12, [pd_ %+ %%uvoutoff]
pslld m12, %%sh
paddd m12, [pd_ %+ %%rnd]
mova m13, [pw_ %+ %%uvinoff]
mova m14, [yoffq+ 0] ; y_off_in
mova m15, [yoffq+16] ; y_off_out
%if %%yoffsh != 0
psllw m15, %%yoffsh
%endif
paddw m15, [pw_ %+ %%yprnd]
punpcklwd m10, m15
mova m15, [pw_ %+ %%ypmul]
movh m0, [cq+1*16] ; cyu
movh m1, [cq+2*16] ; cyv
movh m2, [cq+4*16] ; cuu
movh m3, [cq+5*16] ; cuv
movh m4, [cq+7*16] ; cvu
movh m5, [cq+8*16] ; cvv
punpcklwd m0, m1
punpcklwd m2, m3
punpcklwd m4, m5
mova [rsp+0*mmsize], m0
mova [rsp+1*mmsize], m2
mova [rsp+2*mmsize], m4
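; the chroma coefficient pairs (cyu|cyv, cuu|cuv, cvu|cvv) are kept on the
; stack as interleaved words so pmaddwd applies both coefficients of a (u, v)
; pair in one instruction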

DEFINE_ARGS yo, yos, yi, yis, ui, vi, uo, vo, uis, vis, uos, vos, x, tmp

mov uiq, [yiq+gprsize*1]
mov viq, [yiq+gprsize*2]
mov yiq, [yiq+gprsize*0]
mov uoq, [yoq+gprsize*1]
mov voq, [yoq+gprsize*2]
mov yoq, [yoq+gprsize*0]
mov uisq, [yisq+gprsize*1]
mov visq, [yisq+gprsize*2]
mov yisq, [yisq+gprsize*0]
mov uosq, [yosq+gprsize*1]
mov vosq, [yosq+gprsize*2]
mov yosq, [yosq+gprsize*0]

.loop_v:
xor xq, xq

.loop_h:
%if %4 == 1
lea tmpq, [yiq+yisq]
%endif ; %4 == 1
%if %1 == 8
movu m0, [yiq+xq*(1<<%3)] ; y00/01
%if %4 == 1
movu m2, [tmpq+xq*2] ; y10/11
%endif ; %4 == 1
%if %3 == 1
movh m4, [uiq+xq] ; u
movh m5, [viq+xq] ; v
%else ; %3 != 1
movu m4, [uiq+xq] ; u
movu m5, [viq+xq] ; v
%endif ; %3 ==/!= 1
punpckhbw m1, m0, m11
punpcklbw m0, m11
%if %4 == 1
punpckhbw m3, m2, m11
punpcklbw m2, m11
%endif ; %4 == 1
%if %3 == 0
punpckhbw m2, m4, m11
punpckhbw m3, m5, m11
%endif ; %3 == 0
punpcklbw m4, m11
punpcklbw m5, m11
%else ; %1 != 8
movu m0, [yiq+xq*(2<<%3)] ; y00/01
movu m1, [yiq+xq*(2<<%3)+mmsize] ; y00/01
%if %4 == 1
movu m2, [tmpq+xq*4] ; y10/11
movu m3, [tmpq+xq*4+mmsize] ; y10/11
%endif ; %4 == 1
movu m4, [uiq+xq*2] ; u
movu m5, [viq+xq*2] ; v
%if %3 == 0
movu m2, [uiq+xq*2+mmsize]
movu m3, [viq+xq*2+mmsize]
%endif ; %3 == 0
%endif ; %1 ==/!= 8
psubw m0, m14
psubw m1, m14
%if %4 == 1
psubw m2, m14
psubw m3, m14
%endif ; %4 == 1
psubw m4, m13
psubw m5, m13
%if %3 == 0
psubw m2, m13
psubw m3, m13
%endif ; %3 == 0
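; all samples are now zero-based signed words: luma has y_off_in (m14)
; subtracted, chroma is re-centered around zero by subtracting 128 scaled to
; the input depth (m13)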

SBUTTERFLY wd, 4, 5, 6
pmaddwd m6, m4, [rsp+1*mmsize]
pmaddwd m7, m5, [rsp+1*mmsize]
%if %3 == 0
SBUTTERFLY wd, 2, 3, 8
pmaddwd m8, m2, [rsp+1*mmsize]
pmaddwd m9, m3, [rsp+1*mmsize]
%else ; %3 != 0
pmaddwd m8, m4, [rsp+2*mmsize]
pmaddwd m9, m5, [rsp+2*mmsize]
%endif
paddd m6, m12
paddd m7, m12
paddd m8, m12
paddd m9, m12
psrad m6, %%sh
psrad m7, %%sh
psrad m8, %%sh
psrad m9, %%sh
packssdw m6, m7
packssdw m8, m9
%if %2 == 8
packuswb m6, m8
%if %3 == 0
movu [uoq+xq], m6
%else ; %3 != 0
movh [uoq+xq], m6
movhps [voq+xq], m6
%endif ; %3 ==/!= 0
%else ; %2 != 8
CLIPW m6, m11, [pw_ %+ %%maxval]
CLIPW m8, m11, [pw_ %+ %%maxval]
movu [uoq+xq*2], m6
%if %3 == 0
movu [uoq+xq*2+mmsize], m8
%else ; %3 != 0
movu [voq+xq*2], m8
%endif ; %3 ==/!= 0
%endif ; %2 ==/!= 8

%if %3 == 0
pmaddwd m6, m4, [rsp+2*mmsize]
pmaddwd m7, m5, [rsp+2*mmsize]
pmaddwd m8, m2, [rsp+2*mmsize]
pmaddwd m9, m3, [rsp+2*mmsize]
paddd m6, m12
paddd m7, m12
paddd m8, m12
paddd m9, m12
psrad m6, %%sh
psrad m7, %%sh
psrad m8, %%sh
psrad m9, %%sh
packssdw m6, m7
packssdw m8, m9
%if %2 == 8
packuswb m6, m8
movu [voq+xq], m6
%else ; %2 != 8
CLIPW m6, m11, [pw_ %+ %%maxval]
CLIPW m8, m11, [pw_ %+ %%maxval]
movu [voq+xq*2], m6
movu [voq+xq*2+mmsize], m8
%endif ; %2 ==/!= 8
%endif ; %3 == 0

pmaddwd m4, [rsp+0*mmsize]
pmaddwd m5, [rsp+0*mmsize] ; uv_val
%if %3 == 0
pmaddwd m2, [rsp+0*mmsize]
pmaddwd m3, [rsp+0*mmsize]
%endif ; %3 == 0

; unpack y pixels with m15 (shifted round + offset), then multiply
; by m10, add uv pixels, and we're done!
%if %3 == 1
punpckhdq m8, m4, m4
punpckldq m4, m4
punpckhdq m9, m5, m5
punpckldq m5, m5
%else ; %3 != 1
SWAP 8, 5, 2
SWAP 3, 9
%endif ; %3 ==/!= 1
%if %4 == 1
punpckhwd m6, m2, m15
punpcklwd m2, m15
punpckhwd m7, m3, m15
punpcklwd m3, m15
pmaddwd m2, m10
pmaddwd m6, m10
pmaddwd m3, m10
pmaddwd m7, m10
paddd m2, m4
paddd m6, m8
paddd m3, m5
paddd m7, m9
psrad m2, %%sh
psrad m6, %%sh
psrad m3, %%sh
psrad m7, %%sh
packssdw m2, m6
packssdw m3, m7

lea tmpq, [yoq+yosq]
%if %2 == 8
packuswb m2, m3
movu [tmpq+xq*2], m2
%else ; %2 != 8
CLIPW m2, m11, [pw_ %+ %%maxval]
CLIPW m3, m11, [pw_ %+ %%maxval]
movu [tmpq+xq*4], m2
movu [tmpq+xq*4+mmsize], m3
%endif ; %2 ==/!= 8
%endif ; %4 == 1

punpckhwd m6, m0, m15
punpcklwd m0, m15
punpckhwd m7, m1, m15
punpcklwd m1, m15
pmaddwd m0, m10
pmaddwd m6, m10
pmaddwd m1, m10
pmaddwd m7, m10
paddd m0, m4
paddd m6, m8
paddd m1, m5
paddd m7, m9
psrad m0, %%sh
psrad m6, %%sh
psrad m1, %%sh
psrad m7, %%sh
packssdw m0, m6
packssdw m1, m7

%if %2 == 8
packuswb m0, m1
movu [yoq+xq*(1<<%3)], m0
%else ; %2 != 8
CLIPW m0, m11, [pw_ %+ %%maxval]
CLIPW m1, m11, [pw_ %+ %%maxval]
movu [yoq+xq*(2<<%3)], m0
movu [yoq+xq*(2<<%3)+mmsize], m1
%endif ; %2 ==/!= 8

add xq, mmsize >> %3
cmp xd, dword [rsp+3*mmsize+0]
jl .loop_h

%if %4 == 1
lea yiq, [yiq+yisq*2]
lea yoq, [yoq+yosq*2]
%else ; %4 != 1
add yiq, yisq
add yoq, yosq
%endif ; %4 ==/!= 1
add uiq, uisq
add viq, visq
add uoq, uosq
add voq, vosq
dec dword [rsp+3*mmsize+4]
jg .loop_v

RET
%endmacro

%macro YUV2YUV_FNS 2 ; ss_w, ss_h
YUV2YUV_FN 8, 8, %1, %2
YUV2YUV_FN 10, 8, %1, %2
YUV2YUV_FN 12, 8, %1, %2
YUV2YUV_FN 8, 10, %1, %2
YUV2YUV_FN 10, 10, %1, %2
YUV2YUV_FN 12, 10, %1, %2
YUV2YUV_FN 8, 12, %1, %2
YUV2YUV_FN 10, 12, %1, %2
YUV2YUV_FN 12, 12, %1, %2
%endmacro

INIT_XMM sse2
YUV2YUV_FNS 0, 0
YUV2YUV_FNS 1, 0
YUV2YUV_FNS 1, 1

; void ff_yuv2rgb_420p8_sse2(int16_t *rgb[3], ptrdiff_t rgb_stride,
;                            uint8_t *yuv[3], ptrdiff_t yuv_stride[3],
;                            int w, int h, const int16_t yuv2rgb_coeffs[3][3][8],
;                            const int16_t yuv_offset[8])
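; Converts planar YUV of the given depth/subsampling into planar int16_t
; R/G/B intermediates sharing a single stride; for 4:2:0 input two output rows
; are produced per pass of the row loop.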
%macro YUV2RGB_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh (%1 - 1)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvoff (1 << (%1 - 1))
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3

cglobal yuv2rgb_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 8 * mmsize, \
rgb, rgbs, yuv, yuvs, ww, h, c, yoff
%if %2 == 1
inc wwd
sar wwd, 1
%endif ; %2 == 1
%if %3 == 1
inc hd
sar hd, 1
%endif ; %3 == 1
pxor m11, m11
mova m15, [yoffq] ; yoff
movh m14, [cq+ 0] ; cy
movh m10, [cq+ 32] ; crv
movh m13, [cq+112] ; cbu
movh m12, [cq+ 64] ; cgu
movh m9, [cq+ 80] ; cgv
punpcklwd m14, [pw_ %+ %%rnd] ; cy, rnd
punpcklwd m13, m11 ; cbu, 0
punpcklwd m11, m10 ; 0, crv
punpcklwd m12, m9 ; cgu, cgv
mova [rsp+0*mmsize], m11
mova [rsp+1*mmsize], m12
mova [rsp+2*mmsize], m13
mova [rsp+3*mmsize], m14
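; coefficient word pairs on the stack: [rsp+0] = (0, crv) for R, [rsp+1] =
; (cgu, cgv) for G, [rsp+2] = (cbu, 0) for B, [rsp+3] = (cy, rnd) for the
; shared luma term; each is pmaddwd'ed against (u, v) or (y, 1) word pairs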
pxor m14, m14

DEFINE_ARGS r, rgbs, y, ys, ww, h, g, b, u, v, us, vs, x, tmp

mov gq, [rq+1*gprsize]
mov bq, [rq+2*gprsize]
mov rq, [rq+0*gprsize]
mov uq, [yq+1*gprsize]
mov vq, [yq+2*gprsize]
mov yq, [yq+0*gprsize]
mov usq, [ysq+1*gprsize]
mov vsq, [ysq+2*gprsize]
mov ysq, [ysq+0*gprsize]

.loop_v:
xor xq, xq

.loop_h:
%if %3 == 1
lea tmpq, [yq+ysq]
%endif ; %3 == 1
%if %1 == 8
movu m0, [yq+xq*(1<<%2)]
%if %3 == 1
movu m2, [tmpq+xq*2]
%endif ; %3 == 1
%if %2 == 1
movh m4, [uq+xq]
movh m5, [vq+xq]
%else ; %2 != 1
movu m4, [uq+xq]
movu m5, [vq+xq]
%endif ; %2 ==/!= 1
punpckhbw m1, m0, m14
punpcklbw m0, m14
%if %3 == 1
punpckhbw m3, m2, m14
punpcklbw m2, m14
%endif ; %3 == 1
%if %2 == 0
punpckhbw m2, m4, m14
punpckhbw m3, m5, m14
%endif ; %2 == 0
punpcklbw m4, m14
punpcklbw m5, m14
%else ; %1 != 8
movu m0, [yq+xq*(2<<%2)]
movu m1, [yq+xq*(2<<%2)+mmsize]
%if %3 == 1
movu m2, [tmpq+xq*4]
movu m3, [tmpq+xq*4+mmsize]
%endif ; %3 == 1
movu m4, [uq+xq*2]
movu m5, [vq+xq*2]
%if %2 == 0
movu m2, [uq+xq*2+mmsize]
movu m3, [vq+xq*2+mmsize]
%endif ; %2 == 0
%endif ; %1 ==/!= 8
psubw m0, m15
psubw m1, m15
%if %3 == 1
psubw m2, m15
psubw m3, m15
%endif ; %3 == 1
psubw m4, [pw_ %+ %%uvoff]
psubw m5, [pw_ %+ %%uvoff]
SBUTTERFLY wd, 4, 5, 6
%if %2 == 0
psubw m2, [pw_ %+ %%uvoff]
psubw m3, [pw_ %+ %%uvoff]
SBUTTERFLY wd, 2, 3, 6
%endif ; %2 == 0
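; the y*cy + rnd term is computed once at full resolution; each of the r/g/b
; chroma terms below is added to it before the final >> %%sh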
; calculate y+rnd full-resolution [0-3,6-9]
punpckhwd m6, m0, [pw_1] ; y, 1
punpcklwd m0, [pw_1] ; y, 1
punpckhwd m7, m1, [pw_1] ; y, 1
punpcklwd m1, [pw_1] ; y, 1
pmaddwd m0, [rsp+3*mmsize]
pmaddwd m6, [rsp+3*mmsize]
pmaddwd m1, [rsp+3*mmsize]
pmaddwd m7, [rsp+3*mmsize]
%if %3 == 1
punpckhwd m8, m2, [pw_1] ; y, 1
punpcklwd m2, [pw_1] ; y, 1
punpckhwd m9, m3, [pw_1] ; y, 1
punpcklwd m3, [pw_1] ; y, 1
pmaddwd m2, [rsp+3*mmsize]
pmaddwd m8, [rsp+3*mmsize]
pmaddwd m3, [rsp+3*mmsize]
pmaddwd m9, [rsp+3*mmsize]
mova [rsp+4*mmsize], m2
mova [rsp+5*mmsize], m8
mova [rsp+6*mmsize], m3
mova [rsp+7*mmsize], m9
%endif ; %3 == 1
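; for 4:2:0 the second row's y terms are spilled to [rsp+4..7*mmsize], since
; all 16 XMM registers are needed for the per-channel work below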

; calculate r offsets (un-subsampled, then duplicate)
pmaddwd m10, m4, [rsp+0*mmsize]
%if %2 == 1
pmaddwd m12, m5, [rsp+0*mmsize]
punpckhdq m11, m10, m10
punpckldq m10, m10
punpckhdq m13, m12, m12
punpckldq m12, m12
%else ; %2 != 1
pmaddwd m11, m5, [rsp+0*mmsize]
pmaddwd m12, m2, [rsp+0*mmsize]
pmaddwd m13, m3, [rsp+0*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
paddd m2, m10, [rsp+4*mmsize]
paddd m3, m11, [rsp+5*mmsize]
paddd m8, m12, [rsp+6*mmsize]
paddd m9, m13, [rsp+7*mmsize]
%endif
paddd m10, m0
paddd m11, m6
paddd m12, m1
paddd m13, m7
%if %3 == 1
psrad m2, %%sh
psrad m3, %%sh
psrad m8, %%sh
psrad m9, %%sh
%endif ; %3 == 1
psrad m10, %%sh
psrad m11, %%sh
psrad m12, %%sh
psrad m13, %%sh
%if %3 == 1
lea tmpq, [rq+rgbsq*2]
packssdw m2, m3
packssdw m8, m9
mova [tmpq+xq*4], m2
mova [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
packssdw m10, m11
packssdw m12, m13
mova [rq+xq*(2 << %2)], m10
mova [rq+xq*(2 << %2)+mmsize], m12

; calculate g offsets (un-subsampled, then duplicate)
pmaddwd m10, m4, [rsp+1*mmsize]
%if %2 == 1
pmaddwd m12, m5, [rsp+1*mmsize]
punpckhdq m11, m10, m10
punpckldq m10, m10
punpckhdq m13, m12, m12
punpckldq m12, m12
%else ; %2 != 1
pmaddwd m11, m5, [rsp+1*mmsize]
pmaddwd m12, m2, [rsp+1*mmsize]
pmaddwd m13, m3, [rsp+1*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
paddd m2, m10, [rsp+4*mmsize]
paddd m3, m11, [rsp+5*mmsize]
paddd m8, m12, [rsp+6*mmsize]
paddd m9, m13, [rsp+7*mmsize]
%endif ; %3 == 1
paddd m10, m0
paddd m11, m6
paddd m12, m1
paddd m13, m7
%if %3 == 1
psrad m2, %%sh
psrad m3, %%sh
psrad m8, %%sh
psrad m9, %%sh
%endif ; %3 == 1
psrad m10, %%sh
psrad m11, %%sh
psrad m12, %%sh
psrad m13, %%sh
%if %3 == 1
lea tmpq, [gq+rgbsq*2]
packssdw m2, m3
packssdw m8, m9
mova [tmpq+xq*4], m2
mova [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
packssdw m10, m11
packssdw m12, m13
mova [gq+xq*(2 << %2)], m10
mova [gq+xq*(2 << %2)+mmsize], m12

; calculate b offsets (un-subsampled, then duplicate)
pmaddwd m4, [rsp+2*mmsize]
pmaddwd m5, [rsp+2*mmsize]
%if %2 == 1
punpckhdq m2, m4, m4
punpckldq m4, m4
punpckhdq m3, m5, m5
punpckldq m5, m5
%else ; %2 != 1
pmaddwd m2, [rsp+2*mmsize]
pmaddwd m3, [rsp+2*mmsize]
SWAP 2, 5
%endif ; %2 ==/!= 1
paddd m0, m4
paddd m6, m2
paddd m1, m5
paddd m7, m3
%if %3 == 1
paddd m4, [rsp+4*mmsize]
paddd m2, [rsp+5*mmsize]
paddd m5, [rsp+6*mmsize]
paddd m3, [rsp+7*mmsize]
%endif ; %3 == 1
psrad m0, %%sh
psrad m6, %%sh
psrad m1, %%sh
psrad m7, %%sh
%if %3 == 1
psrad m4, %%sh
psrad m2, %%sh
psrad m5, %%sh
psrad m3, %%sh
%endif ; %3 == 1
packssdw m0, m6
packssdw m1, m7
movu [bq+xq*(2 << %2)], m0
movu [bq+xq*(2 << %2)+mmsize], m1
%if %3 == 1
lea tmpq, [bq+rgbsq*2]
packssdw m4, m2
packssdw m5, m3
movu [tmpq+xq*4], m4
movu [tmpq+xq*4+mmsize], m5
%endif ; %3 == 1

add xd, mmsize >> %2
cmp xd, wwd
jl .loop_h

lea rq, [rq+rgbsq*(2 << %3)]
lea gq, [gq+rgbsq*(2 << %3)]
lea bq, [bq+rgbsq*(2 << %3)]
%if %3 == 1
lea yq, [yq+ysq*2]
%else ; %3 != 1
add yq, ysq
%endif ; %3 ==/!= 1
add uq, usq
add vq, vsq
dec hd
jg .loop_v

RET
%endmacro

%macro YUV2RGB_FNS 2
YUV2RGB_FN 8, %1, %2
YUV2RGB_FN 10, %1, %2
YUV2RGB_FN 12, %1, %2
%endmacro

INIT_XMM sse2
YUV2RGB_FNS 0, 0
YUV2RGB_FNS 1, 0
YUV2RGB_FNS 1, 1

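; rgb2yuv: converts the planar int16_t R/G/B intermediates back to planar YUV
; of the requested depth and subsampling; argument order follows the cglobal
; declaration below (yuv planes, yuv strides, rgb planes, rgb stride, w, h,
; rgb2yuv coefficients, yuv offset)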
%macro RGB2YUV_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh 29 - %1
%assign %%rnd (1 << (%%sh - 15))
%assign %%uvrnd ((128 << (%1 - 8)) << (%%sh - 14))
%if %1 != 8
%assign %%maxval ((1 << %1) - 1)
%endif ; %1 != 8
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3

cglobal rgb2yuv_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 6 * mmsize, \
yuv, yuvs, rgb, rgbs, ww, h, c, off
%if %2 == 1
inc wwd
sar wwd, 1
%endif ; %2 == 1
%if %3 == 1
inc hd
sar hd, 1
%endif ; %3 == 1

; prepare coeffs
movh m8, [offq]
movh m9, [pw_ %+ %%uvrnd]
psllw m8, %%sh - 14
paddw m9, [pw_ %+ %%rnd]
paddw m8, [pw_ %+ %%rnd]
movh m0, [cq+ 0]
movh m1, [cq+ 16]
movh m2, [cq+ 32]
movh m3, [cq+ 48]
movh m4, [cq+ 64]
movh m5, [cq+ 80]
movh m6, [cq+112]
movh m7, [cq+128]
punpcklwd m0, m1
punpcklwd m2, m8
punpcklwd m3, m4
punpcklwd m4, m5, m9
punpcklwd m5, m6
punpcklwd m7, m9

mova [rsp+0*mmsize], m0 ; cry, cgy
mova [rsp+1*mmsize], m2 ; cby, off + rnd
mova [rsp+2*mmsize], m3 ; cru, cgu
mova [rsp+3*mmsize], m4 ; cburv, uvoff + rnd
mova [rsp+4*mmsize], m5 ; cburv, cgv
mova [rsp+5*mmsize], m7 ; cbv, uvoff + rnd
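; note: crv ([cq+96]) is never loaded; it always equals cbu (both are 0.5 in
; coefficient scale), which is why the shared "cburv" word feeds both the U
; and the V equations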


DEFINE_ARGS y, ys, r, rgbs, ww, h, u, v, us, vs, g, b, tmp, x
mov gq, [rq+gprsize*1]
mov bq, [rq+gprsize*2]
mov rq, [rq+gprsize*0]
mov uq, [yq+gprsize*1]
mov vq, [yq+gprsize*2]
mov yq, [yq+gprsize*0]
mov usq, [ysq+gprsize*1]
mov vsq, [ysq+gprsize*2]
mov ysq, [ysq+gprsize*0]

pxor m15, m15
.loop_v:
xor xd, xd

.loop_h:
; top line y
mova m0, [rq+xq*(2<<%2)]
mova m3, [rq+xq*(2<<%2)+mmsize]
mova m1, [gq+xq*(2<<%2)]
mova m4, [gq+xq*(2<<%2)+mmsize]
mova m2, [bq+xq*(2<<%2)]
mova m5, [bq+xq*(2<<%2)+mmsize]

punpcklwd m6, m0, m1
punpckhwd m7, m0, m1
punpcklwd m8, m3, m4
punpckhwd m9, m3, m4
punpcklwd m10, m2, [pw_16384]
punpckhwd m11, m2, [pw_16384]
punpcklwd m12, m5, [pw_16384]
punpckhwd m13, m5, [pw_16384]
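; b is paired with 16384 (1 << 14) so that pmaddwd's second product expands
; the pre-shifted offset+rounding word stored at [rsp+1*mmsize] back up to
; (off << %%sh) + (1 << (%%sh - 1))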

pmaddwd m6, [rsp+0*mmsize]
pmaddwd m7, [rsp+0*mmsize]
pmaddwd m8, [rsp+0*mmsize]
pmaddwd m9, [rsp+0*mmsize]
pmaddwd m10, [rsp+1*mmsize]
pmaddwd m11, [rsp+1*mmsize]
pmaddwd m12, [rsp+1*mmsize]
pmaddwd m13, [rsp+1*mmsize]
paddd m6, m10
paddd m7, m11
paddd m8, m12
paddd m9, m13
psrad m6, %%sh
psrad m7, %%sh
psrad m8, %%sh
psrad m9, %%sh
packssdw m6, m7
packssdw m8, m9
%if %1 == 8
packuswb m6, m8
movu [yq+xq*(1<<%2)], m6
%else
CLIPW m6, m15, [pw_ %+ %%maxval]
CLIPW m8, m15, [pw_ %+ %%maxval]
movu [yq+xq*(2<<%2)], m6
movu [yq+xq*(2<<%2)+mmsize], m8
%endif

%if %2 == 1
; subsampling cached data
pmaddwd m0, [pw_1]
pmaddwd m1, [pw_1]
pmaddwd m2, [pw_1]
pmaddwd m3, [pw_1]
pmaddwd m4, [pw_1]
pmaddwd m5, [pw_1]
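; (pmaddwd against pw_1 sums horizontally adjacent samples, giving 2x1 sums;
; for 4:2:0 the 2x2 sums are completed with the bottom row below, then divided
; with rounding before the u/v conversion)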

%if %3 == 1
; bottom line y, r/g portion only
lea tmpq, [rgbsq+xq*2]
mova m6, [rq+tmpq*2]
mova m9, [rq+tmpq*2+mmsize]
mova m7, [gq+tmpq*2]
mova m10, [gq+tmpq*2+mmsize]
mova m8, [bq+tmpq*2]
mova m11, [bq+tmpq*2+mmsize]

punpcklwd m12, m6, m7
punpckhwd m13, m6, m7
punpcklwd m14, m9, m10
punpckhwd m15, m9, m10

; release two more registers
pmaddwd m6, [pw_1]
pmaddwd m7, [pw_1]
pmaddwd m9, [pw_1]
pmaddwd m10, [pw_1]
paddd m0, m6
paddd m3, m9
paddd m1, m7
paddd m4, m10

; bottom line y, b/rnd portion only
punpcklwd m6, m8, [pw_16384]
punpckhwd m7, m8, [pw_16384]
punpcklwd m9, m11, [pw_16384]
punpckhwd m10, m11, [pw_16384]

pmaddwd m12, [rsp+0*mmsize]
pmaddwd m13, [rsp+0*mmsize]
pmaddwd m14, [rsp+0*mmsize]
pmaddwd m15, [rsp+0*mmsize]
pmaddwd m6, [rsp+1*mmsize]
pmaddwd m7, [rsp+1*mmsize]
pmaddwd m9, [rsp+1*mmsize]
pmaddwd m10, [rsp+1*mmsize]
paddd m12, m6
paddd m13, m7
paddd m14, m9
paddd m15, m10
psrad m12, %%sh
psrad m13, %%sh
psrad m14, %%sh
psrad m15, %%sh
packssdw m12, m13
packssdw m14, m15
lea tmpq, [yq+ysq]
%if %1 == 8
packuswb m12, m14
movu [tmpq+xq*2], m12
%else
pxor m15, m15
CLIPW m12, m15, [pw_ %+ %%maxval]
CLIPW m14, m15, [pw_ %+ %%maxval]
movu [tmpq+xq*4], m12
movu [tmpq+xq*4+mmsize], m14
%endif

; complete subsampling of r/g/b pixels for u/v
pmaddwd m8, [pw_1]
pmaddwd m11, [pw_1]
paddd m2, m8
paddd m5, m11
paddd m0, [pd_2]
paddd m1, [pd_2]
paddd m2, [pd_2]
paddd m3, [pd_2]
paddd m4, [pd_2]
paddd m5, [pd_2]
psrad m0, 2
psrad m1, 2
psrad m2, 2
psrad m3, 2
psrad m4, 2
psrad m5, 2
%else ; %3 != 1
paddd m0, [pd_1]
paddd m1, [pd_1]
paddd m2, [pd_1]
paddd m3, [pd_1]
paddd m4, [pd_1]
paddd m5, [pd_1]
psrad m0, 1
psrad m1, 1
psrad m2, 1
psrad m3, 1
psrad m4, 1
psrad m5, 1
%endif ; %3 ==/!= 1
packssdw m0, m3
packssdw m1, m4
packssdw m2, m5
%endif ; %2 == 1

; convert u/v pixels
SBUTTERFLY wd, 0, 1, 6
punpckhwd m6, m2, [pw_16384]
punpcklwd m2, [pw_16384]

pmaddwd m7, m0, [rsp+2*mmsize]
pmaddwd m8, m1, [rsp+2*mmsize]
pmaddwd m9, m2, [rsp+3*mmsize]
pmaddwd m10, m6, [rsp+3*mmsize]
pmaddwd m0, [rsp+4*mmsize]
pmaddwd m1, [rsp+4*mmsize]
pmaddwd m2, [rsp+5*mmsize]
pmaddwd m6, [rsp+5*mmsize]
paddd m7, m9
paddd m8, m10
paddd m0, m2
paddd m1, m6
psrad m7, %%sh
psrad m8, %%sh
psrad m0, %%sh
psrad m1, %%sh
packssdw m7, m8
packssdw m0, m1
%if %2 == 1
%if %1 == 8
packuswb m7, m0
movh [uq+xq], m7
movhps [vq+xq], m7
%else
CLIPW m7, m15, [pw_ %+ %%maxval]
CLIPW m0, m15, [pw_ %+ %%maxval]
movu [uq+xq*2], m7
movu [vq+xq*2], m0
%endif
%else ; %2 != 1
; second set of u/v pixels
SBUTTERFLY wd, 3, 4, 6
punpckhwd m6, m5, [pw_16384]
punpcklwd m5, [pw_16384]

pmaddwd m8, m3, [rsp+2*mmsize]
pmaddwd m9, m4, [rsp+2*mmsize]
pmaddwd m10, m5, [rsp+3*mmsize]
pmaddwd m11, m6, [rsp+3*mmsize]
pmaddwd m3, [rsp+4*mmsize]
pmaddwd m4, [rsp+4*mmsize]
pmaddwd m5, [rsp+5*mmsize]
pmaddwd m6, [rsp+5*mmsize]
paddd m8, m10
paddd m9, m11
paddd m3, m5
paddd m4, m6
psrad m8, %%sh
psrad m9, %%sh
psrad m3, %%sh
psrad m4, %%sh
packssdw m8, m9
packssdw m3, m4

%if %1 == 8
packuswb m7, m8
packuswb m0, m3
movu [uq+xq], m7
movu [vq+xq], m0
%else
CLIPW m7, m15, [pw_ %+ %%maxval]
CLIPW m0, m15, [pw_ %+ %%maxval]
CLIPW m8, m15, [pw_ %+ %%maxval]
CLIPW m3, m15, [pw_ %+ %%maxval]
movu [uq+xq*2], m7
movu [uq+xq*2+mmsize], m8
movu [vq+xq*2], m0
movu [vq+xq*2+mmsize], m3
%endif
%endif ; %2 ==/!= 1

add xq, mmsize >> %2
cmp xd, wwd
jl .loop_h

%if %3 == 0
add yq, ysq
%else ; %3 != 0
lea yq, [yq+ysq*2]
%endif ; %3 ==/!= 0
add uq, usq
add vq, vsq
lea rq, [rq+rgbsq*(2<<%3)]
lea gq, [gq+rgbsq*(2<<%3)]
lea bq, [bq+rgbsq*(2<<%3)]
dec hd
jg .loop_v

RET
%endmacro

%macro RGB2YUV_FNS 2
RGB2YUV_FN 8, %1, %2
RGB2YUV_FN 10, %1, %2
RGB2YUV_FN 12, %1, %2
%endmacro

INIT_XMM sse2
RGB2YUV_FNS 0, 0
RGB2YUV_FNS 1, 0
RGB2YUV_FNS 1, 1

; void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride,
;                          int w, int h, const int16_t coeff[3][3][8])
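; Applies a 3x3 matrix to the three int16_t planes in place; coefficients are
; 14-bit fixed point, so each dot product is rounded with 8192 (1 << 13) and
; shifted right by 14.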
INIT_XMM sse2
cglobal multiply3x3, 5, 7, 16, data, stride, ww, h, c
movh m0, [cq+ 0]
movh m1, [cq+ 32]
movh m2, [cq+ 48]
movh m3, [cq+ 80]
movh m4, [cq+ 96]
movh m5, [cq+128]
punpcklwd m0, [cq+ 16]
punpcklwd m1, [pw_8192]
punpcklwd m2, [cq+ 64]
punpcklwd m3, [pw_8192]
punpcklwd m4, [cq+112]
punpcklwd m5, [pw_8192]
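; m0/m2/m4 hold the (c[i][0], c[i][1]) pairs and m1/m3/m5 the (c[i][2], 8192)
; pairs; the third plane's samples are unpacked against pw_1 below, so the
; 8192 word supplies the rounding term in each pmaddwd sum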

DEFINE_ARGS data0, stride, ww, h, data1, data2, x
shl strideq, 1
mov data1q, [data0q+gprsize*1]
mov data2q, [data0q+gprsize*2]
mov data0q, [data0q+gprsize*0]

.loop_v:
xor xd, xd

.loop_h:
mova m6, [data0q+xq*2]
mova m7, [data1q+xq*2]
mova m8, [data2q+xq*2]
SBUTTERFLY wd, 6, 7, 9
punpckhwd m9, m8, [pw_1]
punpcklwd m8, [pw_1]

pmaddwd m10, m6, m0
pmaddwd m11, m7, m0
pmaddwd m12, m8, m1
pmaddwd m13, m9, m1
paddd m10, m12
paddd m11, m13
psrad m10, 14
psrad m11, 14

pmaddwd m12, m6, m2
pmaddwd m13, m7, m2
pmaddwd m14, m8, m3
pmaddwd m15, m9, m3
paddd m12, m14
paddd m13, m15
psrad m12, 14
psrad m13, 14

pmaddwd m6, m4
pmaddwd m7, m4
pmaddwd m8, m5
pmaddwd m9, m5
paddd m6, m8
paddd m7, m9
psrad m6, 14
psrad m7, 14

packssdw m10, m11
packssdw m12, m13
packssdw m6, m7

mova [data0q+xq*2], m10
mova [data1q+xq*2], m12
mova [data2q+xq*2], m6

add xd, mmsize / 2
cmp xd, wwd
jl .loop_h

add data0q, strideq
add data1q, strideq
add data2q, strideq
dec hd
jg .loop_v

RET
%endif