| ;****************************************************************************** |
| ;* VP9 IDCT SIMD optimizations |
| ;* |
| ;* Copyright (C) 2025 Two Orioles, LLC |
| ;* |
| ;* This file is part of FFmpeg. |
| ;* |
| ;* FFmpeg is free software; you can redistribute it and/or |
| ;* modify it under the terms of the GNU Lesser General Public |
| ;* License as published by the Free Software Foundation; either |
| ;* version 2.1 of the License, or (at your option) any later version. |
| ;* |
| ;* FFmpeg is distributed in the hope that it will be useful, |
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| ;* Lesser General Public License for more details. |
| ;* |
| ;* You should have received a copy of the GNU Lesser General Public |
| ;* License along with FFmpeg; if not, write to the Free Software |
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| ;****************************************************************************** |
| |
| %include "libavutil/x86/x86util.asm" |
| |
| %if ARCH_X86_64 && HAVE_AVX512ICL_EXTERNAL |
| |
| SECTION_RODATA 64 |
| |
; The following set of constants are ordered to form the
; qword shuffle mask { 0, 2, 4, 6, 1, 3, 5, 7 }: vpermq only reads the
; low bits of each qword index lane, and the low 3 bits of every other
; dword below happen to spell out exactly that permutation, so the
; constant table doubles as the deinterleave permute vector.
%define deintq_perm pd_5520
; 14-bit fixed-point trig constants, round(16384 * cos(n*pi/64))
; (VP9 cospi_n_64 values); pd_11585 is cospi_16_64 = 16384/sqrt(2).
pd_5520: dd 5520
pd_9760: dd 9760
pd_10394: dd 10394
pd_15426: dd 15426
pd_804: dd 804
pd_2404: dd 2404
pd_6270: dd 6270
pd_9102: dd 9102
pd_11585: dd 11585
pd_12665: dd 12665
pd_7723: dd 7723
pd_14811: dd 14811
pd_7005: dd 7005
pd_14053: dd 14053
pd_8423: dd 8423
pd_13623: dd 13623

; 0x7c00 bias: adding it to a 10-bit pixel value makes paddsw saturate
; at the top of the 10-bit range and psubusw clamp at zero, yielding a
; branchless clip to [0, 1023] (two words so vpbroadcastd works).
pixel_clip: times 2 dw 0x7c00
pixel_clip6: dd 2031648 ; 32 + (pixel_clip << 6), pass-2 rounding + clip bias
pd_532480: dd 532480 ; 8192 + (32 << 14), fused >>14 and >>6 rounding
pd_8192: dd 8192 ; 1 << 13, rounding for the >>14 butterfly shifts

pd_1606: dd 1606
pd_3196: dd 3196
pd_3981: dd 3981
pd_4756: dd 4756
pd_11003: dd 11003
pd_12140: dd 12140
pd_13160: dd 13160
pd_14449: dd 14449
pd_15137: dd 15137
pd_15679: dd 15679
pd_15893: dd 15893
pd_16069: dd 16069
pd_16207: dd 16207
pd_16305: dd 16305
pd_16364: dd 16364
| |
| SECTION .text |
| |
; r5 holds a pointer to o_base (the middle of the rodata constants) so
; that o(x) yields a short signed displacement to any pd_* constant.
%define o_base (deintq_perm+128)
%define o(x) (r5 - o_base + (x))
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
| |
; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 14
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 14
; skip round/shift if rnd is not a number
; coef[1-2] < 32 name registers holding pre-broadcast coefficients,
; larger values name the pd_* rodata constant to broadcast on the fly.
; inv_dst2 != 0 negates dst2: dst2 = (rnd - src1*coef2 - src2*coef1) >> 14
; (rnd must be numeric in that case).
%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], inv_dst2
%if %8 < 32
pmulld m%4, m%1, m%8
pmulld m%3, m%2, m%8
%else
vpbroadcastd m%3, [o(pd_%8)]
pmulld m%4, m%1, m%3
pmulld m%3, m%2
%endif
%if %7 < 32
pmulld m%1, m%7
pmulld m%2, m%7
%else
vpbroadcastd m%5, [o(pd_%7)]
pmulld m%1, m%5
pmulld m%2, m%5
%endif
%if %9
psubd m%4, m%6, m%4 ; rnd - src1*coef2
psubd m%2, m%4, m%2 ; negated dst2
%else
%ifnum %6
paddd m%4, m%6
%endif
paddd m%2, m%4
%endif
%ifnum %6
paddd m%1, m%6
%endif
psubd m%1, m%3
%ifnum %6
psrad m%2, 14
psrad m%1, 14
%endif
%endmacro
| |
; Expand %1 with ymm registers instead of zmm (used by the fast paths
; that only carry 8 active rows), then switch back to zmm.
%macro WRAP_YMM 1+
INIT_YMM cpuname
%1
INIT_ZMM cpuname
%endmacro
| |
; Transpose a 4x4 block of dwords within each 128-bit lane.
%macro TRANSPOSE_4D 5 ; in[1-4], tmp
punpckhdq m%5, m%3, m%4 ; c2 d2 c3 d3
punpckldq m%3, m%4 ; c0 d0 c1 d1
punpckhdq m%4, m%1, m%2 ; a2 b2 a3 b3
punpckldq m%1, m%2 ; a0 b0 a1 b1
punpckhqdq m%2, m%1, m%3 ; a1 b1 c1 d1
punpcklqdq m%1, m%3 ; a0 b0 c0 d0
punpcklqdq m%3, m%4, m%5 ; a2 b2 c2 d2
punpckhqdq m%4, m%5 ; a3 b3 c3 d3
%endmacro
| |
; Transpose a 4x4 grid of 128-bit lanes across four zmm registers
; (completes a 16x16 dword transpose together with TRANSPOSE_4D).
%macro TRANSPOSE_4DQ 5 ; in[1-4], tmp
vshufi32x4 m%5, m%3, m%4, q3232 ; c2 c3 d2 d3
vinserti32x8 m%3, ym%4, 1 ; c0 c1 d0 d1
vshufi32x4 m%4, m%1, m%2, q3232 ; a2 a3 b2 b3
vinserti32x8 m%1, ym%2, 1 ; a0 a1 b0 b1
vshufi32x4 m%2, m%1, m%3, q3131 ; a1 b1 c1 d1
vshufi32x4 m%1, m%3, q2020 ; a0 b0 c0 d0
vshufi32x4 m%3, m%4, m%5, q2020 ; a2 b2 c2 d2
vshufi32x4 m%4, m%5, q3131 ; a3 b3 c3 d3
%endmacro
| |
; Emit the public entry point for a type1_type2 transform pair.
; It loads the constant base pointer, points tx2q at the pass-2 entry of
; the 2nd transform, then tail-jumps into the 1st transform's internal
; function (or falls through to it when it immediately follows).
%macro INV_TXFM_FN 3-4 0 ; type1, type2, size, eob_offset
cglobal vp9_i%1_i%2_%3_add_10, 4, 5, 0, dst, stride, c, eob, tx2
%define %%p1 m(vp9_i%1_%3_internal_10)
lea r5, [o_base]
; Jump to the 1st txfm function if we're not taking the fast path, which
; in turn performs an indirect jump to the 2nd txfm function.
lea tx2q, [m(vp9_i%2_%3_internal_10).pass2]
%ifidn %1_%2, dct_dct
dec eobd ; eob == 1 means dc-only; handled by the caller macro
jnz %%p1
%else
%if %4
add eobd, %4 ; normalize the eob threshold across transform types
%endif
; jump to the 1st txfm function unless it's located directly after this
times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endmacro
| |
; 16x16 entry point wrapper. For dct_dct it also emits the dc-only fast
; path: out = ((dc*11585 + 8192) >> 14) * 11585, then >> 20 folds the
; second >> 14 with the final >> 6 (532480 = 8192 + 32 << 14).
%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
INV_TXFM_FN %1, %2, 16x16, %3
%ifidn %1_%2, dct_dct
imul r6d, [cq], 11585
vpbroadcastd ym3, [o(pixel_clip)]
mov [cq], r3d ; r3d (eobd) is 0 here; clears the dc coefficient
add r6d, 8192
sar r6d, 14
imul r6d, 11585
or r3d, 8 ; 8 iterations x 2 rows = 16 rows
add r6d, 532480
sar r6d, 20
vpbroadcastw ym2, r6d
paddsw ym2, ym3 ; dc + 0x7c00 bias for the saturating clip below
.dconly_loop:
paddsw ym0, ym2, [dstq+strideq*0]
paddsw ym1, ym2, [dstq+strideq*1]
psubusw ym0, ym3 ; clip to [0, 1023] via the 0x7c00 bias
psubusw ym1, ym3
mova [dstq+strideq*0], ym0
mova [dstq+strideq*1], ym1
lea dstq, [dstq+strideq*2]
dec r3d
jg .dconly_loop
RET
%endif
%endmacro
| |
; First half of the 16-point inverse DCT (all butterfly stages except
; the final add/sub stage, which is IDCT16_PART2).
; In:  m0-m15 = input rows (fast path: only m0-m7, rows 8-15 are zero),
;      m20 = pd_8192 rounding, m21 = pd_11585 (cospi_16_64)
; Out: intermediate t values spread over m0-m18 as consumed by PART2.
; The fast path replaces each butterfly with single multiplies since one
; input of every initial rotation is zero (negated via psubd from m20
; where the zero term was the minuend).
%macro IDCT16_PART1 0
%if mmsize == 64
.main_part1_fast:
%endif
pmulld m15, m1, [o(pd_16305)] {bcstd} ; t15a
pmulld m1, [o(pd_1606)] {bcstd} ; t8a
pmulld m9, m7, [o(pd_10394)] {bcstd} ; t9a
pmulld m7, [o(pd_12665)] {bcstd} ; t14a
pmulld m11, m5, [o(pd_14449)] {bcstd} ; t13a
pmulld m5, [o(pd_7723)] {bcstd} ; t10a
pmulld m13, m3, [o(pd_4756)] {bcstd} ; t11a
pmulld m3, [o(pd_15679)] {bcstd} ; t12a
pmulld m10, m6, [o(pd_9102)] {bcstd} ; t5a
pmulld m6, [o(pd_13623)] {bcstd} ; t6a
pmulld m14, m2, [o(pd_16069)] {bcstd} ; t7a
pmulld m2, [o(pd_3196)] {bcstd} ; t4a
pmulld m12, m4, [o(pd_15137)] {bcstd} ; t3
pmulld m4, [o(pd_6270)] {bcstd} ; t2
pmulld m0, m21
REPX {psubd x, m20, x}, m9, m13, m10 ; negate: zero minuend folded in
paddd m0, m20
mova m18, m0 ; t0 == t1 in the fast path
%if mmsize == 64 ; for the ymm variant we only ever use the fast path
jmp %%main_part1b
.main_part1:
ITX_MULSUB_2D 1, 15, 16, 17, 18, _, 1606, 16305 ; t8a, t15a
ITX_MULSUB_2D 9, 7, 16, 17, 18, _, 12665, 10394 ; t9a, t14a
ITX_MULSUB_2D 5, 11, 16, 17, 18, _, 7723, 14449 ; t10a, t13a
ITX_MULSUB_2D 13, 3, 16, 17, 18, _, 15679, 4756 ; t11a, t12a
ITX_MULSUB_2D 10, 6, 16, 17, 18, _, 13623, 9102 ; t5a, t6a
ITX_MULSUB_2D 2, 14, 16, 17, 18, _, 3196, 16069 ; t4a, t7a
ITX_MULSUB_2D 4, 12, 16, 17, 18, _, 6270, 15137 ; t2, t3
pmulld m0, m21
pmulld m8, m21
REPX {paddd x, m20}, m0, m9, m13, m10
psubd m18, m0, m8 ; t1
paddd m0, m8 ; t0
%%main_part1b:
%endif
vpbroadcastd m19, [o(pd_15137)]
vpbroadcastd m16, [o(pd_6270)]
REPX {paddd x, m20}, m15, m7, m1, m11, m3, m5
REPX {psrad x, 14 }, m15, m7, m1, m9, m11, m3, m5, m13
paddd m17, m15, m7 ; t15
psubd m15, m7 ; t14
psubd m7, m3, m11 ; t13
paddd m3, m11 ; t12
psubd m11, m13, m5 ; t10
paddd m5, m13 ; t11
psubd m13, m1, m9 ; t9
paddd m1, m9 ; t8
ITX_MULSUB_2D 15, 13, 8, 9, _, 20, 16, 19 ; t9a, t14a
ITX_MULSUB_2D 7, 11, 8, 9, _, 20, 16, 19, 2 ; t13a, t10a
paddd m16, m1, m5 ; t8a
psubd m1, m5 ; t11a
paddd m8, m15, m11 ; t9
psubd m15, m11 ; t10
psubd m11, m17, m3 ; t12a
paddd m17, m3 ; t15a
psubd m9, m13, m7 ; t13
paddd m13, m7 ; t14
REPX {pmulld x, m21}, m11, m9, m1, m15 ; * cospi_16 for t10-t13 fixups
REPX {paddd x, m20}, m2, m6, m14
REPX {psrad x, 14 }, m10, m2, m6, m14
psubd m3, m2, m10 ; t5a
paddd m10, m2 ; t4
paddd m11, m20
psubd m5, m11, m1 ; t11
paddd m11, m1 ; t12
psubd m1, m14, m6 ; t6a
paddd m14, m6 ; t7
pmulld m1, m21
pmulld m3, m21
paddd m4, m20
paddd m12, m20
REPX {psrad x, 14 }, m4, m12, m0, m18
paddd m9, m20
paddd m2, m9, m15 ; t13a
psubd m9, m15 ; t10a
paddd m1, m20
psubd m6, m1, m3 ; t5
paddd m1, m3 ; t6
REPX {psrad x, 14}, m6, m1, m11, m5, m2, m9
%endmacro
| |
; Final add/sub butterfly stage of the 16-point inverse DCT.
; Consumes the t values produced by IDCT16_PART1 and leaves
; out0-out15 in m0-m15 (unshifted; the caller rounds/shifts).
%macro IDCT16_PART2 0
psubd m3, m0, m12 ; t3
paddd m0, m12 ; t0
psubd m12, m18, m4 ; t2
paddd m18, m4 ; t1
psubd m4, m3, m10 ; t4
paddd m3, m10 ; t3
psubd m10, m12, m6 ; t5
paddd m12, m6 ; t2
psubd m6, m18, m1 ; t6
paddd m1, m18 ; t1
psubd m7, m0, m14 ; t7
paddd m0, m14 ; t0
psubd m15, m0, m17 ; out15
paddd m0, m17 ; out0
psubd m14, m1, m13 ; out14
paddd m1, m13 ; out1
psubd m13, m12, m2 ; out13
paddd m2, m12 ; out2
psubd m12, m3, m11 ; out12
paddd m3, m11 ; out3
psubd m11, m4, m5 ; out11
paddd m4, m5 ; out4
paddd m5, m10, m9 ; out5
psubd m10, m9 ; out10
psubd m9, m6, m8 ; out9
paddd m6, m8 ; out6
psubd m8, m7, m16 ; out8
paddd m7, m16 ; out7
%endmacro
| |
INIT_ZMM avx512icl
INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, adst, 39-23-1

; 16x16 inverse DCT, 10-bit, one row per zmm (16 dwords).
; Pass 1: column transform + transpose; pass 2 (reached via tx2q):
; row transform + rounding + store. eobd <= 38 selects the fast path
; where rows 8-15 are known to be zero and ymm halves suffice.
cglobal vp9_idct_16x16_internal_10, 0, 7, 22, dst, stride, c, eob, tx2
mova m0, [cq+64* 0]
mova m1, [cq+64* 1]
mova m2, [cq+64* 2]
mova m3, [cq+64* 3]
mova m4, [cq+64* 4]
mova m5, [cq+64* 5]
mova m6, [cq+64* 6]
mova m7, [cq+64* 7]
vpbroadcastd m20, [o(pd_8192)]
vpbroadcastd m21, [o(pd_11585)]
sub eobd, 38 ; sign of eobd is reused in .pass2 to pick the fast path
jl .pass1_fast
mova m8, [cq+64* 8]
mova m9, [cq+64* 9]
mova m10, [cq+64*10]
mova m11, [cq+64*11]
mova m12, [cq+64*12]
mova m13, [cq+64*13]
mova m14, [cq+64*14]
mova m15, [cq+64*15]
call .main_part1
call .main_part2
.pass1_end:
TRANSPOSE_4DQ 0, 4, 8, 12, 16
TRANSPOSE_4DQ 1, 5, 9, 13, 16
TRANSPOSE_4DQ 2, 6, 10, 14, 16
TRANSPOSE_4DQ 3, 7, 11, 15, 16
TRANSPOSE_4D 8, 9, 10, 11, 16
TRANSPOSE_4D 12, 13, 14, 15, 16
mov r6d, 64*12 ; clear all 16 rows of coefficients
jmp .pass1_transpose_end
.pass1_fast:
WRAP_YMM IDCT16_PART1
WRAP_YMM IDCT16_PART2
.pass1_fast_end:
; gather the ymm halves into full zmm rows for the transpose
vinserti32x8 m0, ym4, 1
vinserti32x8 m8, ym12, 1
vinserti32x8 m1, ym5, 1
vinserti32x8 m9, ym13, 1
vinserti32x8 m2, ym6, 1
vinserti32x8 m10, ym14, 1
vinserti32x8 m3, ym7, 1
vinserti32x8 m11, ym15, 1
vshufi32x4 m4, m0, m8, q3131
vshufi32x4 m0, m8, q2020
vshufi32x4 m5, m1, m9, q3131
vshufi32x4 m1, m9, q2020
vshufi32x4 m6, m2, m10, q3131
vshufi32x4 m2, m10, q2020
vshufi32x4 m7, m3, m11, q3131
vshufi32x4 m3, m11, q2020
mov r6d, 64*4 ; only rows 0-7 held coefficients
.pass1_transpose_end:
pxor m16, m16
.zero_loop: ; clear the consumed coefficients for the next block
mova [cq+r6+64*0], m16
mova [cq+r6+64*1], m16
mova [cq+r6+64*2], m16
mova [cq+r6+64*3], m16
sub r6d, 64*4
jge .zero_loop
TRANSPOSE_4D 0, 1, 2, 3, 16
TRANSPOSE_4D 4, 5, 6, 7, 16
jmp tx2q ; indirect jump to the 2nd transform's .pass2
.pass2:
test eobd, eobd ; still eob-38 from pass 1
jl .pass2_fast
call .main_part1
jmp .pass2_end
.pass2_fast:
call .main_part1_fast
.pass2_end:
vpbroadcastd m3, [o(pixel_clip6)]
paddd m0, m3 ; fold the >>6 rounding + clip bias into t0/t1
paddd m18, m3
call .main_part2
REPX {psrad x, 6}, m0, m1, m2, m3
packssdw m0, m1
lea r6, [strideq*3]
packssdw m1, m2, m3
mova m2, [o(deintq_perm)]
vpbroadcastd m3, [o(pixel_clip)]
REPX {psrad x, 6}, m4, m5, m6, m7
call .write_16x4
packssdw m0, m4, m5
packssdw m1, m6, m7
REPX {psrad x, 6}, m8, m9, m10, m11
call .write_16x4
packssdw m0, m8, m9
packssdw m1, m10, m11
.pass2_end2:
REPX {psrad x, 6}, m12, m13, m14, m15
call .write_16x4
packssdw m0, m12, m13
packssdw m1, m14, m15
call .write_16x4
RET
ALIGN function_align
; Add 4 packed rows (m0/m1, biased by 0x7c00) to dst, clip to 10-bit.
; In: m2 = deintq_perm (undoes packssdw interleave), m3 = pixel_clip.
; Clobbers m0, m1, m16, m17; advances dstq by 4 rows.
.write_16x4:
mova ym16, [dstq+strideq*0]
vinserti32x8 m16, [dstq+strideq*1], 1
mova ym17, [dstq+strideq*2]
vinserti32x8 m17, [dstq+r6 ], 1
vpermq m0, m2, m0
vpermq m1, m2, m1
paddsw m16, m0
paddsw m17, m1
psubusw m16, m3 ; clip to [0, 1023]
psubusw m17, m3
mova [dstq+strideq*0], ym16
vextracti32x8 [dstq+strideq*1], m16, 1
mova [dstq+strideq*2], ym17
vextracti32x8 [dstq+r6 ], m17, 1
lea dstq, [dstq+strideq*4]
ret
ALIGN function_align
IDCT16_PART1 ; emits .main_part1_fast / .main_part1
ret
ALIGN function_align
.main_part2:
IDCT16_PART2
ret
| |
; First part of the 16-point inverse ADST.
; In:  m0-m15 = input rows (fast path: only m0-m7, rows 8-15 zero),
;      m20 = rounding constant, m21 = pd_11585 (cospi_16_64)
; Out: out0-out3 and out12-out15 finalized (m0-m3, m12-m15; odd ones
;      negated), out4-out11 left as unrounded products for PART2.
%macro IADST16_PART1 0
%if mmsize == 64
.main_part1_fast:
%endif
pmulld m15, m0, [o(pd_16364)] {bcstd} ; t1
pmulld m0, [o(pd_804)] {bcstd} ; t0
pmulld m13, m2, [o(pd_15893)] {bcstd} ; t3
pmulld m2, [o(pd_3981)] {bcstd} ; t2
pmulld m11, m4, [o(pd_14811)] {bcstd} ; t5
pmulld m4, [o(pd_7005)] {bcstd} ; t4
pmulld m9, m6, [o(pd_13160)] {bcstd} ; t7
pmulld m6, [o(pd_9760)] {bcstd} ; t6
pmulld m8, m7, [o(pd_11003)] {bcstd} ; t8
pmulld m7, [o(pd_12140)] {bcstd} ; t9
pmulld m10, m5, [o(pd_8423)] {bcstd} ; t10
pmulld m5, [o(pd_14053)] {bcstd} ; t11
pmulld m12, m3, [o(pd_5520)] {bcstd} ; t12
pmulld m3, [o(pd_15426)] {bcstd} ; t13
pmulld m14, m1, [o(pd_2404)] {bcstd} ; t14
pmulld m1, [o(pd_16207)] {bcstd} ; t15
REPX {psubd x, m20, x}, m15, m13, m11, m9 ; negate: zero minuend folded in
%if mmsize == 64 ; for the ymm variant we only ever use the fast path
jmp %%main_part1b
ALIGN function_align
.main_part1:
ITX_MULSUB_2D 15, 0, 16, 17, 18, _, 804, 16364 ; t1, t0
ITX_MULSUB_2D 13, 2, 16, 17, 18, _, 3981, 15893 ; t3, t2
ITX_MULSUB_2D 11, 4, 16, 17, 18, _, 7005, 14811 ; t5, t4
ITX_MULSUB_2D 9, 6, 16, 17, 18, _, 9760, 13160 ; t7, t6
ITX_MULSUB_2D 7, 8, 16, 17, 18, _, 12140, 11003 ; t9, t8
ITX_MULSUB_2D 5, 10, 16, 17, 18, _, 14053, 8423 ; t11, t10
ITX_MULSUB_2D 3, 12, 16, 17, 18, _, 15426, 5520 ; t13, t12
ITX_MULSUB_2D 1, 14, 16, 17, 18, _, 16207, 2404 ; t15, t14
REPX {paddd x, m20}, m15, m13, m11, m9
%%main_part1b:
%endif
REPX {paddd x, m20}, m0, m2, m4, m6
psubd m16, m2, m10 ; t10a
paddd m2, m10 ; t2a
psubd m10, m9, m1 ; t15a
paddd m9, m1 ; t7a
psubd m1, m13, m5 ; t11a
paddd m13, m5 ; t3a
psubd m5, m6, m14 ; t14a
paddd m6, m14 ; t6a
REPX {psrad x, 14}, m16, m10, m1, m5
psubd m14, m0, m8 ; t8a
paddd m0, m8 ; t0a
psubd m8, m15, m7 ; t9a
paddd m15, m7 ; t1a
psubd m7, m4, m12 ; t12a
paddd m4, m12 ; t4a
paddd m12, m11, m3 ; t5a
psubd m11, m3 ; t13a
REPX {psrad x, 14}, m14, m8, m7, m11
vpbroadcastd m19, [o(pd_9102)]
vpbroadcastd m18, [o(pd_13623)]
ITX_MULSUB_2D 16, 1, 3, 17, _, _, 18, 19 ; t11, t10
ITX_MULSUB_2D 10, 5, 3, 17, _, _, 19, 18 ; t14, t15
vpbroadcastd m19, [o(pd_16069)]
vpbroadcastd m18, [o(pd_3196)]
ITX_MULSUB_2D 14, 8, 3, 17, _, _, 18, 19 ; t9, t8
ITX_MULSUB_2D 11, 7, 3, 17, _, _, 19, 18 ; t12, t13
vpbroadcastd m19, [o(pd_6270)]
vpbroadcastd m18, [o(pd_15137)]
REPX {psrad x, 14}, m15, m12, m0, m4
psubd m3, m15, m12 ; t5
paddd m15, m12 ; t1
psubd m12, m0, m4 ; t4
paddd m0, m4 ; t0
REPX {psrad x, 14}, m2, m6, m13, m9
psubd m4, m2, m6 ; t6
paddd m2, m6 ; t2
psubd m6, m13, m9 ; t7
paddd m9, m13 ; t3
REPX {paddd x, m20}, m8, m14, m1, m16
psubd m13, m8, m11 ; t12a
paddd m8, m11 ; t8a
psubd m11, m14, m7 ; t13a
paddd m14, m7 ; t9a
psubd m7, m1, m10 ; t14a
paddd m1, m10 ; t10a
psubd m10, m16, m5 ; t15a
paddd m16, m5 ; t11a
REPX {psrad x, 14}, m13, m11, m7, m10
ITX_MULSUB_2D 12, 3, 5, 17, _, _, 19, 18 ; t5a, t4a
ITX_MULSUB_2D 6, 4, 5, 17, _, _, 18, 19 ; t6a, t7a
ITX_MULSUB_2D 13, 11, 5, 17, _, _, 19, 18 ; t13, t12
ITX_MULSUB_2D 10, 7, 5, 17, _, _, 18, 19 ; t14, t15
REPX {psrad x, 14}, m8, m1, m14, m16
psubd m5, m8, m1 ; t10
paddd m1, m8 ; -out1
psubd m8, m15, m9 ; t3a
paddd m15, m9 ; -out15
psubd m9, m14, m16 ; t11
paddd m14, m16 ; out14
psubd m16, m0, m2 ; t2a
paddd m0, m2 ; out0
REPX {paddd x, m20}, m11, m13, m12, m3
paddd m2, m11, m10 ; out2
psubd m11, m10 ; t14a
psubd m10, m13, m7 ; t15a
paddd m13, m7 ; -out13
psubd m7, m12, m4 ; t7
paddd m12, m4 ; out12
psubd m4, m3, m6 ; t6
paddd m3, m6 ; -out3
REPX {psrad x, 14}, m10, m7, m11, m4
REPX {pmulld x, m21}, m9, m10, m7, m8, m5, m11, m4, m16 ; * cospi_16
REPX {psrad x, 14}, m2, m13, m12, m3
%endmacro
| |
; Second part of the 16-point inverse ADST: round the cospi_16-scaled
; intermediates from PART1 and produce out4-out11 (unshifted).
%macro IADST16_PART2 0
paddd m9, m20
psubd m10, m20, m10 ; negate while rounding
paddd m7, m20
psubd m8, m20, m8
paddd m6, m9, m5 ; out6
psubd m9, m5 ; out9
psubd m5, m10, m11 ; out5
paddd m10, m11 ; out10
psubd m11, m7, m4 ; out11
paddd m4, m7 ; out4
psubd m7, m8, m16 ; out7
paddd m8, m16 ; out8
%endmacro
| |
; Pass-1 finalization for the ADST: negate the -out1/-out3/-out13/-out15
; values produced by PART1 and shift out4-out11 down to coefficients.
%macro IADST16_PASS1_END 0
pxor m16, m16
psubd m1, m16, m1
psubd m3, m16, m3
psubd m13, m16, m13
psubd m15, m16, m15
REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11
%endmacro
| |
INV_TXFM_16X16_FN adst, dct, 39-18
INV_TXFM_16X16_FN adst, adst

; 16x16 inverse ADST, 10-bit. Mirrors the DCT function's two-pass
; structure and shares its transpose/zero/write helpers via m(...).
cglobal vp9_iadst_16x16_internal_10, 0, 7, 22, dst, stride, c, eob, tx2
mova m0, [cq+64* 0]
mova m1, [cq+64* 1]
mova m2, [cq+64* 2]
mova m3, [cq+64* 3]
mova m4, [cq+64* 4]
mova m5, [cq+64* 5]
mova m6, [cq+64* 6]
mova m7, [cq+64* 7]
vpbroadcastd m20, [o(pd_8192)]
vpbroadcastd m21, [o(pd_11585)]
sub eobd, 39 ; sign of eobd is reused in .pass2 to pick the fast path
jl .pass1_fast
mova m8, [cq+64* 8]
mova m9, [cq+64* 9]
mova m10, [cq+64*10]
mova m11, [cq+64*11]
mova m12, [cq+64*12]
mova m13, [cq+64*13]
mova m14, [cq+64*14]
mova m15, [cq+64*15]
call .main_part1
call .main_part2
IADST16_PASS1_END
jmp m(vp9_idct_16x16_internal_10).pass1_end
.pass1_fast:
WRAP_YMM IADST16_PART1
WRAP_YMM IADST16_PART2
WRAP_YMM IADST16_PASS1_END
jmp m(vp9_idct_16x16_internal_10).pass1_fast_end
.pass2:
test eobd, eobd
jl .pass2_fast
call .main_part1
jmp .pass2_end
.pass2_fast:
call .main_part1_fast
.pass2_end:
; out4-out11 keep 14 fractional bits here, so use the fused rounding
; constant (8192 + 32<<14) and shift them by 20 instead of 6 below.
vpbroadcastd m20, [o(pd_532480)]
call .main_part2
vpbroadcastd m16, [o(pixel_clip6)]
REPX {paddd x, m16}, m0, m2, m12, m14
REPX {psubd x, m16, x}, m1, m3, m13, m15 ; negate the -out rows + bias
REPX {psrad x, 6}, m0, m1, m2, m3
packssdw m0, m1
lea r6, [strideq*3]
packssdw m1, m2, m3
mova m2, [o(deintq_perm)]
vpbroadcastd m3, [o(pixel_clip)]
REPX {psrad x, 20}, m4, m5, m6, m7
call m(vp9_idct_16x16_internal_10).write_16x4
packssdw m0, m4, m5
packssdw m1, m6, m7
paddsw m0, m3 ; rows 4-11 lack the clip bias; add it as words
paddsw m1, m3
REPX {psrad x, 20}, m8, m9, m10, m11
call m(vp9_idct_16x16_internal_10).write_16x4
packssdw m0, m8, m9
packssdw m1, m10, m11
paddsw m0, m3
paddsw m1, m3
jmp m(vp9_idct_16x16_internal_10).pass2_end2
ALIGN function_align
IADST16_PART1 ; emits .main_part1_fast / .main_part1
ret
ALIGN function_align
.main_part2:
IADST16_PART2
ret
| |
; 32x32 inverse DCT, 10-bit. 64*64 bytes of stack scratch hold the
; pass-1 results; each 32-wide row spans two zmm registers.
; eob thresholds: eob == 0 -> dc-only; eob < 135 -> only the top-left
; 16x16 quadrant has coefficients; eob < 579 -> the bottom-right
; quadrant is zero; otherwise full input.
cglobal vp9_idct_idct_32x32_add_10, 4, 7, 23, 64*64, dst, stride, c, eob
%undef cmp
lea r5, [o_base]
dec eobd
jnz .pass1
; dc-only: out = ((dc*11585+8192)>>14)*11585, then >>20 fuses the
; second >>14 with the final >>6 (532480 = 8192 + 32<<14)
imul r6d, [cq], 11585
vpbroadcastd m3, [o(pixel_clip)]
mov [cq], r3d ; r3d (eobd) is 0 here; clears the dc coefficient
add r6d, 8192
sar r6d, 14
imul r6d, 11585
or r3d, 16 ; 16 iterations x 2 rows = 32 rows
add r6d, 532480
sar r6d, 20
vpbroadcastw m2, r6d
paddsw m2, m3 ; dc + 0x7c00 bias for the saturating clip below
.dconly_loop:
paddsw m0, m2, [dstq+strideq*0]
paddsw m1, m2, [dstq+strideq*1]
psubusw m0, m3 ; clip to [0, 1023]
psubusw m1, m3
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
dec r3d
jg .dconly_loop
RET
.pass1:
vpbroadcastd m20, [o(pd_8192)]
vpbroadcastd m21, [o(pd_11585)]
cmp eobd, 135
jl .pass1_fast
; process the right 16 columns first (cq+64), then the left 16
add cq, 64
lea r4, [rsp+64*8]
cmp eobd, 579
jl .pass1_right_fast
mov r6d, 128*28
call .pass1_main
jmp .pass1_right_end
.pass1_right_fast: ; bottomright quadrant is zero
mova m0, [cq+128* 1]
mova m1, [cq+128* 3]
mova m2, [cq+128* 5]
mova m3, [cq+128* 7]
mova m4, [cq+128* 9]
mova m5, [cq+128*11]
mova m6, [cq+128*13]
mova m7, [cq+128*15]
call .main_fast
mova m0, [cq+128* 0]
mova m1, [cq+128* 2]
mova m2, [cq+128* 4]
mova m3, [cq+128* 6]
mova m4, [cq+128* 8]
mova m5, [cq+128*10]
mova m6, [cq+128*12]
mova m7, [cq+128*14]
call m(vp9_idct_16x16_internal_10).main_part1_fast
mov r6d, 128*12
call .pass1_main_end
.pass1_right_end:
; stash the right half's pass-1 rows before redoing the left half
mova [r4+64* 8], m0
mova [r4+64* 9], m1
mova [r4+64*10], m2
mova [r4+64*11], m3
mova [r4+64*12], m4
mova [r4+64*13], m5
mova [r4+64*14], m6
mova [r4+64*15], m7
mova [r4+64*16], m16
mova [r4+64*17], m17
mova [r4+64*18], m18
mova [r4+64*19], m19
mova [r4+64*20], m8
mova [r4+64*21], m9
mova [r4+64*22], m10
mova [r4+64*23], m11
sub cq, 64
sub r4, 64*8
mov r6d, 128*28
call .pass1_main
; interleave the saved right-half rows with the left-half results
mova m12, [r4+64*20]
mova m13, [r4+64*21]
mova m14, [r4+64*22]
mova m15, [r4+64*23]
mova [r4+64*20], m8
mova [r4+64*21], m9
mova [r4+64*22], m10
mova [r4+64*23], m11
mova m8, [r4+64*16]
mova m9, [r4+64*17]
mova m10, [r4+64*18]
mova m11, [r4+64*19]
mova [r4+64*16], m16
mova [r4+64*17], m17
mova [r4+64*18], m18
mova [r4+64*19], m19
call .main ; pass 2, odd rows of the left half
mova m0, [r4+64*16]
mova m1, [r4+64*17]
mova m2, [r4+64*18]
mova m3, [r4+64*19]
mova m4, [r4+64*20]
mova m5, [r4+64*21]
mova m6, [r4+64*22]
mova m7, [r4+64*23]
mova m8, [r4+64*24]
mova m9, [r4+64*25]
mova m10, [r4+64*26]
mova m11, [r4+64*27]
mova m12, [r4+64*28]
mova m13, [r4+64*29]
mova m14, [r4+64*30]
mova m15, [r4+64*31]
call m(vp9_idct_16x16_internal_10).main_part1
call .pass2_main_left
mova m8, [r4+64* 8]
mova m9, [r4+64* 9]
mova m10, [r4+64*10]
mova m11, [r4+64*11]
mova m12, [r4+64*12]
mova m13, [r4+64*13]
mova m14, [r4+64*14]
mova m15, [r4+64*15]
TRANSPOSE_4DQ 8, 10, 12, 14, 16
TRANSPOSE_4DQ 9, 11, 13, 15, 16
call .main ; pass 2, odd rows of the right half
call .pass2_main_right
mova m8, [r4+64*24]
mova m9, [r4+64*25]
mova m10, [r4+64*26]
mova m11, [r4+64*27]
mova m12, [r4+64*28]
mova m13, [r4+64*29]
mova m14, [r4+64*30]
mova m15, [r4+64*31]
TRANSPOSE_4DQ 8, 10, 12, 14, 16
TRANSPOSE_4DQ 9, 11, 13, 15, 16
call m(vp9_idct_16x16_internal_10).main_part1
jmp .pass2_end
.pass1_fast: ; only the top-left 16x16 quadrant is populated
mova m0, [cq+128* 1]
mova m1, [cq+128* 3]
mova m2, [cq+128* 5]
mova m3, [cq+128* 7]
mova m4, [cq+128* 9]
mova m5, [cq+128*11]
mova m6, [cq+128*13]
mova m7, [cq+128*15]
mov r4, rsp
call .main_fast
mova m0, [cq+128* 0]
mova m1, [cq+128* 2]
mova m2, [cq+128* 4]
mova m3, [cq+128* 6]
mova m4, [cq+128* 8]
mova m5, [cq+128*10]
mova m6, [cq+128*12]
mova m7, [cq+128*14]
call m(vp9_idct_16x16_internal_10).main_part1_fast
call m(vp9_idct_16x16_internal_10).main_part2
mov r6d, 128*12
call .pass1_main_end2
mova [r4+64*16], m16
mova [r4+64*17], m17
mova [r4+64*18], m18
mova [r4+64*19], m19
mova [r4+64*20], m8
mova [r4+64*21], m9
mova [r4+64*22], m10
mova [r4+64*23], m11
call .main_fast
mova m0, [r4+64*16]
mova m1, [r4+64*17]
mova m2, [r4+64*18]
mova m3, [r4+64*19]
mova m4, [r4+64*20]
mova m5, [r4+64*21]
mova m6, [r4+64*22]
mova m7, [r4+64*23]
call m(vp9_idct_16x16_internal_10).main_part1_fast
call .pass2_main_left
call .main_fast
call .pass2_main_right
call m(vp9_idct_16x16_internal_10).main_part1_fast
.pass2_end:
paddd m0, m22 ; m22 = pixel_clip6 (set in .pass2_main_left)
paddd m18, m22
call m(vp9_idct_16x16_internal_10).main_part2
mova m20, [o(deintq_perm)]
rorx r2, strideq, 59 ; strideq*32
vpbroadcastd m21, [o(pixel_clip)]
add r2, dstq ; r2 walks upward from the bottom row
; combine the even-half idct16 outputs (m0-m15) with the stored odd
; half (t16-t31) to emit rows i (from the top) and 31-i (from the
; bottom); m17/m18 are the other 16-column half's stored values
%assign i 0
%rep 16
mova m16, [r4+64*(15-i)]
mova m17, [r4+64*(i-16)]
mova m18, [r4-64*(17+i)]
paddd m19, m %+ i, m16
psubd m0, m %+ i, m16
call .write_32x2
%assign i i+1
%endrep
RET
ALIGN function_align
; Write one 32-pixel row at dstq (m19|m16) and one at r2 (m0|m17),
; rounding by >>6, adding to dst and clipping to 10-bit.
; In: m17/m18 = stored halves, m19/m0 = sum/difference halves,
;     m20 = deintq_perm, m21 = pixel_clip. Clobbers m16, m17.
.write_32x2:
paddd m16, m17, m18
psubd m17, m18
REPX {psrad x, 6}, m19, m16, m0, m17
packssdw m16, m19
packssdw m17, m0
sub r2, strideq
vpermq m16, m20, m16 ; undo the packssdw lane interleave
vpermq m17, m20, m17
paddsw m16, [dstq]
paddsw m17, [r2 ]
psubusw m16, m21 ; clip to [0, 1023]
psubusw m17, m21
mova [dstq], m16
mova [r2 ], m17
add dstq, strideq
ret
ALIGN function_align
; Pass 1 for one 16-column half with full input: run the 32-point odd
; half (.main) on the odd rows and idct16 part 1 on the even rows,
; then fall through to combine, transpose and store.
.pass1_main:
mova m0, [cq+128* 1]
mova m1, [cq+128* 3]
mova m2, [cq+128* 5]
mova m3, [cq+128* 7]
mova m4, [cq+128* 9]
mova m5, [cq+128*11]
mova m6, [cq+128*13]
mova m7, [cq+128*15]
mova m8, [cq+128*17]
mova m9, [cq+128*19]
mova m10, [cq+128*21]
mova m11, [cq+128*23]
mova m12, [cq+128*25]
mova m13, [cq+128*27]
mova m14, [cq+128*29]
mova m15, [cq+128*31]
call .main
mova m0, [cq+128* 0]
mova m1, [cq+128* 2]
mova m2, [cq+128* 4]
mova m3, [cq+128* 6]
mova m4, [cq+128* 8]
mova m5, [cq+128*10]
mova m6, [cq+128*12]
mova m7, [cq+128*14]
mova m8, [cq+128*16]
mova m9, [cq+128*18]
mova m10, [cq+128*20]
mova m11, [cq+128*22]
mova m12, [cq+128*24]
mova m13, [cq+128*26]
mova m14, [cq+128*28]
mova m15, [cq+128*30]
call m(vp9_idct_16x16_internal_10).main_part1
.pass1_main_end:
call m(vp9_idct_16x16_internal_10).main_part2
.pass1_main_end2:
; clear the consumed coefficients (r6d = highest offset, preset)
pxor m16, m16
.pass1_zero_loop:
mova [cq+r6+128*0], m16
mova [cq+r6+128*1], m16
mova [cq+r6+128*2], m16
mova [cq+r6+128*3], m16
sub r6d, 128*4
jge .pass1_zero_loop
; combine even-half outputs (m0-m15) with the stored odd half
; (t16-t31 at [r4+64*0..15]) into rows 0-15 / 31-16 and transpose;
; rows 16-31 go back to scratch, rows 0-15 stay in registers
mova m16, [r4+64*15]
mova m19, [r4+64*14]
mova m22, [r4+64*13]
mova m17, [r4+64*12]
psubd m18, m0, m16
paddd m16, m0
paddd m0, m19, m1
psubd m19, m1, m19
paddd m1, m17, m3
psubd m3, m17
paddd m17, m2, m22
psubd m2, m22
TRANSPOSE_4D 3, 2, 19, 18, 22 ; 28 29 30 31
TRANSPOSE_4D 16, 0, 17, 1, 22 ; 0 1 2 3
mova [r4+64*54], m3
mova [r4+64*55], m19
mova [r4+64*38], m2
mova [r4+64*39], m18
mova m2, [r4+64*11]
mova m19, [r4+64*10]
mova m3, [r4+64* 9]
mova m22, [r4+64* 8]
paddd m18, m4, m2
psubd m4, m2
paddd m2, m5, m19
psubd m5, m19
paddd m19, m6, m3
psubd m6, m3
paddd m3, m7, m22
psubd m7, m22
TRANSPOSE_4D 7, 6, 5, 4, 22 ; 24 25 26 27
TRANSPOSE_4D 18, 2, 19, 3, 22 ; 4 5 6 7
mova [r4+64*52], m7
mova [r4+64*53], m5
mova [r4+64*36], m6
mova [r4+64*37], m4
mova m7, [r4+64* 7]
mova m4, [r4+64* 6]
mova m5, [r4+64* 5]
mova m22, [r4+64* 4]
psubd m6, m8, m7
paddd m8, m7
psubd m7, m9, m4
paddd m4, m9
paddd m9, m10, m5
psubd m10, m5
paddd m5, m11, m22
psubd m11, m22
TRANSPOSE_4D 11, 10, 7, 6, 22 ; 20 21 22 23
TRANSPOSE_4D 8, 4, 9, 5, 22 ; 8 9 10 11
mova [r4+64*50], m11
mova [r4+64*51], m7
mova [r4+64*34], m10
mova [r4+64*35], m6
mova m6, [r4+64* 3]
mova m11, [r4+64* 2]
mova m7, [r4+64* 1]
mova m22, [r4+64* 0]
paddd m10, m12, m6
psubd m12, m6
paddd m6, m13, m11
psubd m13, m11
paddd m11, m14, m7
psubd m14, m7
paddd m7, m15, m22
psubd m15, m22
TRANSPOSE_4D 15, 14, 13, 12, 22 ; 16 17 18 19
TRANSPOSE_4D 10, 6, 11, 7, 22 ; 12 13 14 15
mova [r4+64*48], m15
mova [r4+64*49], m13
mova [r4+64*32], m14
mova [r4+64*33], m12
TRANSPOSE_4DQ 0, 2, 4, 6, 22
TRANSPOSE_4DQ 1, 3, 5, 7, 22
TRANSPOSE_4DQ 16, 18, 8, 10, 22
TRANSPOSE_4DQ 17, 19, 9, 11, 22
ret
ALIGN function_align
; Pass 2, even half of the left 16 columns: finish the idct16 (with
; the >>6 rounding + clip bias folded into t0/t1 via m22), stash its
; outputs, advance r4 to the next scratch region, and load/transpose
; the next batch of pass-1 rows for the odd half.
.pass2_main_left:
vpbroadcastd m22, [o(pixel_clip6)]
paddd m0, m22
paddd m18, m22
call m(vp9_idct_16x16_internal_10).main_part2
mova [r4+64*16], m0
mova [r4+64*17], m1
mova [r4+64*18], m2
mova [r4+64*19], m3
mova [r4+64*20], m4
mova [r4+64*21], m5
mova [r4+64*22], m6
mova [r4+64*23], m7
mova [r4+64*24], m8
mova [r4+64*25], m9
mova [r4+64*26], m10
mova [r4+64*27], m11
mova [r4+64*28], m12
mova [r4+64*29], m13
mova [r4+64*30], m14
mova [r4+64*31], m15
add r4, 64*32
mova m0, [r4+64* 0]
mova m1, [r4+64* 1]
mova m2, [r4+64* 2]
mova m3, [r4+64* 3]
mova m4, [r4+64* 4]
mova m5, [r4+64* 5]
mova m6, [r4+64* 6]
mova m7, [r4+64* 7]
jmp .pass2_main_transpose
ALIGN function_align
; Pass 2, odd-half input loader for the right 16 columns: reload the
; idct16 outputs saved by .pass2_main_left and transpose them.
.pass2_main_right:
mova m0, [r4+64*16]
mova m1, [r4+64*17]
mova m2, [r4+64*18]
mova m3, [r4+64*19]
mova m4, [r4+64*20]
mova m5, [r4+64*21]
mova m6, [r4+64*22]
mova m7, [r4+64*23]
.pass2_main_transpose:
TRANSPOSE_4DQ 0, 2, 4, 6, 8
TRANSPOSE_4DQ 1, 3, 5, 7, 8
ret
ALIGN function_align
; Odd half of the 32-point inverse DCT (inputs = odd rows 1,3,...,31 in
; m0-m15; fast path: only rows 1-15 in m0-m7, the rest zero). Produces
; t16-t31 and stores them at [r4+64*0..15] for later recombination.
; m20 = pd_8192, m21 = pd_11585.
.main_fast:
; one input of every initial rotation is zero, so the butterflies
; collapse to single multiplies (negated where the zero term was
; the minuend)
pmulld m15, m0, [o(pd_16364)] {1to16} ; t31a
pmulld m0, [o(pd_804)] {1to16} ; t16a
pmulld m8, m7, [o(pd_11003)] {1to16} ; t17a
pmulld m7, [o(pd_12140)] {1to16} ; t30a
pmulld m11, m4, [o(pd_14811)] {1to16} ; t29a
pmulld m4, [o(pd_7005)] {1to16} ; t18a
pmulld m12, m3, [o(pd_5520)] {1to16} ; t19a
pmulld m3, [o(pd_15426)] {1to16} ; t28a
pmulld m13, m2, [o(pd_15893)] {1to16} ; t27a
pmulld m2, [o(pd_3981)] {1to16} ; t20a
pmulld m10, m5, [o(pd_8423)] {1to16} ; t21a
pmulld m5, [o(pd_14053)] {1to16} ; t26a
pmulld m9, m6, [o(pd_13160)] {1to16} ; t25a
pmulld m6, [o(pd_9760)] {1to16} ; t22a
pmulld m14, m1, [o(pd_2404)] {1to16} ; t23a
pmulld m1, [o(pd_16207)] {1to16} ; t24a
REPX {psubd x, m20, x}, m8, m12, m10, m14
jmp .main2
ALIGN function_align
.main:
ITX_MULSUB_2D 0, 15, 16, 17, 18, _, 804, 16364 ; t16a, t31a
ITX_MULSUB_2D 8, 7, 16, 17, 18, _, 12140, 11003 ; t17a, t30a
ITX_MULSUB_2D 4, 11, 16, 17, 18, _, 7005, 14811 ; t18a, t29a
ITX_MULSUB_2D 12, 3, 16, 17, 18, _, 15426, 5520 ; t19a, t28a
ITX_MULSUB_2D 2, 13, 16, 17, 18, _, 3981, 15893 ; t20a, t27a
ITX_MULSUB_2D 10, 5, 16, 17, 18, _, 14053, 8423 ; t21a, t26a
ITX_MULSUB_2D 6, 9, 16, 17, 18, _, 9760, 13160 ; t22a, t25a
ITX_MULSUB_2D 14, 1, 16, 17, 18, _, 16207, 2404 ; t23a, t24a
REPX {paddd x, m20}, m8, m12, m10, m14
.main2:
REPX {paddd x, m20}, m0, m15, m7, m4, m3, m11
REPX {psrad x, 14 }, m8, m0, m15, m7, m12, m4, m3, m11
psubd m16, m0, m8 ; t17
paddd m0, m8 ; t16
psubd m8, m15, m7 ; t30
paddd m15, m7 ; t31
paddd m7, m12, m4 ; t19
psubd m12, m4 ; t18
paddd m4, m3, m11 ; t28
psubd m3, m11 ; t29
REPX {paddd x, m20}, m2, m13, m5, m6, m1, m9
REPX {psrad x, 14 }, m10, m2, m13, m5, m14, m6, m1, m9
psubd m11, m2, m10 ; t21
paddd m2, m10 ; t20
psubd m10, m13, m5 ; t26
paddd m13, m5 ; t27
psubd m5, m14, m6 ; t22
paddd m6, m14 ; t23
psubd m14, m1, m9 ; t25
paddd m9, m1 ; t24
vpbroadcastd m19, [o(pd_16069)]
vpbroadcastd m18, [o(pd_3196)]
ITX_MULSUB_2D 8, 16, 1, 17, _, 20, 18, 19 ; t17a, t30a
ITX_MULSUB_2D 3, 12, 1, 17, _, 20, 18, 19, 1 ; t29a, t18a
vpbroadcastd m19, [o(pd_9102)]
vpbroadcastd m18, [o(pd_13623)]
ITX_MULSUB_2D 10, 11, 1, 17, _, 20, 18, 19 ; t21a, t26a
ITX_MULSUB_2D 14, 5, 1, 17, _, 20, 18, 19, 1 ; t25a, t22a
paddd m1, m6, m2 ; t23a
psubd m6, m2 ; t20a
psubd m2, m9, m13 ; t27a
paddd m9, m13 ; t24a
psubd m13, m15, m4 ; t28a
paddd m15, m4 ; t31a
psubd m4, m8, m12 ; t18
paddd m8, m12 ; t17
psubd m12, m0, m7 ; t19a
paddd m0, m7 ; t16a
psubd m7, m16, m3 ; t29
paddd m3, m16 ; t30
paddd m16, m5, m10 ; t22
psubd m5, m10 ; t21
psubd m10, m14, m11 ; t26
paddd m14, m11 ; t25
vpbroadcastd m19, [o(pd_15137)]
vpbroadcastd m18, [o(pd_6270)]
ITX_MULSUB_2D 13, 12, 11, 17, _, 20, 18, 19 ; t19, t28
ITX_MULSUB_2D 2, 6, 11, 17, _, 20, 18, 19, 1 ; t27, t20
ITX_MULSUB_2D 7, 4, 11, 17, _, 20, 18, 19 ; t18a, t29a
ITX_MULSUB_2D 10, 5, 11, 17, _, 20, 18, 19, 1 ; t26a, t21a
psubd m11, m0, m1 ; t23
paddd m0, m1 ; t16
paddd m1, m16, m8 ; t17a
psubd m16, m8, m16 ; t22a
psubd m8, m15, m9 ; t24
paddd m15, m9 ; t31
psubd m9, m3, m14 ; t25a
paddd m14, m3 ; t30a
paddd m3, m6, m13 ; t19a
psubd m6, m13, m6 ; t20a
paddd m13, m10, m4 ; t29
psubd m10, m4, m10 ; t26
psubd m4, m12, m2 ; t27a
paddd m12, m2 ; t28a
paddd m2, m7, m5 ; t18
psubd m7, m5 ; t21
REPX {pmulld x, m21}, m10, m8, m4, m9, m7, m11, m6, m16 ; * cospi_16
; t16-t19 and t28-t31 are final; store them
mova [r4+64* 0], m0
mova [r4+64* 1], m1
mova [r4+64* 2], m2
mova [r4+64* 3], m3
mova [r4+64*12], m12
mova [r4+64*13], m13
mova [r4+64*14], m14
mova [r4+64*15], m15
REPX {paddd x, m20}, m10, m8, m4, m9
psubd m5, m10, m7 ; t21a
paddd m10, m7 ; t26a
psubd m7, m8, m11 ; t23a
paddd m8, m11 ; t24a
REPX {psrad x, 14 }, m5, m10, m7, m8
paddd m11, m4, m6 ; t27
psubd m4, m6 ; t20
psubd m6, m9, m16 ; t22
paddd m9, m16 ; t25
REPX {psrad x, 14 }, m11, m4, m6, m9
mova [r4+64* 4], m4
mova [r4+64* 5], m5
mova [r4+64* 6], m6
mova [r4+64* 7], m7
mova [r4+64* 8], m8
mova [r4+64* 9], m9
mova [r4+64*10], m10
mova [r4+64*11], m11
ret
| |
| %endif |