;*****************************************************************************
;* x86-optimized functions for gblur filter
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .data

gblur_transpose_16x16_indices1: dq 2, 3, 0, 1, 6, 7, 4, 5
gblur_transpose_16x16_indices2: dq 1, 0, 3, 2, 5, 4, 7, 6
gblur_transpose_16x16_indices3: dd 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
gblur_transpose_16x16_mask: dw 0xcc, 0x33, 0xaa, 0x55, 0xaaaa, 0x5555
gblur_vindex_width: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

SECTION .text

%xdefine AVX2_MMSIZE   32
%xdefine AVX512_MMSIZE 64

%macro MOVSXDIFNIDN 1-*
%rep %0
    movsxdifnidn %1q, %1d
%rotate 1
%endrep
%endmacro

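; Set an all-ones mask: the AVX-512 opmask %2, or on AVX2 either the GPR %3
; (when a third argument is given) or the vector register %1.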
%macro KXNOR 2-*
%if mmsize == AVX512_MMSIZE
    kxnorw %2, %2, %2
%else
%if %0 == 3
    mov %3, -1
%else
    vpcmpeqd %1, %1, %1
%endif
%endif
%endmacro

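; Copy a mask: a plain vector move on AVX2 (first two args, only when four
; arguments are passed), kmovw on AVX-512 (last two args, or the first two
; when only two are given).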
%macro KMOVW 2-4
%if mmsize == AVX2_MMSIZE && %0 == 4
    mova %1, %2
%elif mmsize == AVX512_MMSIZE
%if %0 == 4
%rotate 2
%endif
    kmovw %1, %2
%endif
%endmacro

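; Expand the scalar bitmask %3 into a usable vector mask: on AVX2 each bit
; becomes a dword lane of %1 via the scratch buffer %5 (clobbers %3 and %4);
; on AVX-512 the bits are simply moved into the opmask %2.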
%macro PUSH_MASK 5
%if mmsize == AVX2_MMSIZE
%assign %%n mmsize/4
%assign %%i 0
%rep %%n
    mov %4, %3
    and %4, 1
    neg %4
    mov dword [%5 + %%i*4], %4
    sar %3, 1
%assign %%i %%i+1
%endrep
    movu %1, [%5]
%else
    kmovd %2, %3
%endif
%endmacro

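; Masked load/store of packed floats: vpmaskmovd with the vector mask %3 on
; AVX2, vmovups through k7 (loaded from %4) on AVX-512.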
%macro VMASKMOVPS 4
%if mmsize == AVX2_MMSIZE
    vpmaskmovd %1, %3, %2
%else
    kmovw k7, %4
    vmovups %1{k7}, %2
%endif
%endmacro

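; Gather packed floats: %3 is the AVX2 mask vector (cleared by the gather),
; %4 the AVX-512 opmask.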
%macro VGATHERDPS 4
%if mmsize == AVX2_MMSIZE
    vgatherdps %1, %2, %3
%else
    vgatherdps %1{%4}, %2
%endif
%endmacro

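; Emulated AVX2 scatter: store the four floats of xm%1 one element at a time,
; advancing the offset %3 by %5 elements per store and stopping at the first
; clear bit of the scalar mask %6.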
%macro VSCATTERDPS128 7
%rep 4
    mov %7, %6
    and %7, 1
    cmp %7, 0
    je %%end_scatter
    movss [%2 + %3*%4], xm%1
    vpshufd m%1, m%1, 0x39
    add %3, %5
    sar %6, 1
%endrep
%%end_scatter:
%endmacro

; %1=register index
; %2=base address  %3=vindex
; %4=scale         %5=width
; %6=mask          %7=tmp
; m15=reserved
%macro VSCATTERDPS256 7
    mova m15, m%1
    xor %3, %3
    VSCATTERDPS128 15, %2, %3, %4, %5, %6, %7
    vextractf128 xm15, m%1, 1
    VSCATTERDPS128 15, %2, %3, %4, %5, %6, %7
%endmacro

; %1=base address   %2=avx2 vindex
; %3=avx512 vindex  %4=avx2 mask
; %5=avx512 mask    %6=register index
; %7=width          %8-*=tmp
%macro VSCATTERDPS 8-*
%if mmsize == AVX2_MMSIZE
%if %0 == 9
    mov %9, %4
    VSCATTERDPS256 %6, %1, %2, 4, %7, %9, %8
%else
    VSCATTERDPS256 %6, %1, %2, 4, %7, %4, %8
%endif
%else
    vscatterdps [%1 + %3*4]{%5}, m%6
%endif
%endmacro

%macro INIT_WORD_MASK 1-*
%assign %%i 0
%rep %0
    kmovw %1, [gblur_transpose_16x16_mask + %%i * 2]
%assign %%i %%i+1
%rotate 1
%endrep
%endmacro

%macro INIT_INDICES 1-*
%assign %%i 1
%rep %0
    movu %1, [gblur_transpose_16x16_indices %+ %%i]
%assign %%i %%i+1
%rotate 1
%endrep
%endmacro

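; Spill/restore a vector register to the local stack frame; only needed on
; AVX2, where ymm registers run out (both macros are no-ops on AVX-512).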
%assign stack_offset 0
%macro PUSH_MM 1
%if mmsize == AVX2_MMSIZE
    movu [rsp + stack_offset], %1
%assign stack_offset stack_offset+mmsize
%endif
%endmacro

%macro POP_MM 1
%if mmsize == AVX2_MMSIZE
%assign stack_offset stack_offset-mmsize
    movu %1, [rsp + stack_offset]
%endif
%endmacro

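; Run the backward recursion over %1 vectors of the local buffer, starting
; from the carry in m3: each result is nu * previous + stored value. The
; outputs end up in m4-m19 (AVX-512, which also copies the new carry back
; into m3) or m2-m9 (AVX2), ready to be transposed.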
%macro READ_LOCAL_BUFFER 1
%if mmsize == AVX512_MMSIZE
%assign %%i 19
%else
%assign %%i 9
%endif
%assign %%j %%i-1
%assign %%k %1-1
%xdefine %%m m %+ %%i
    mova %%m, m3
    FMULADD_PS %%m, %%m, m0, [localbufq + %%k * mmsize], %%m
%assign %%k %%k-1
%rep %1-1
%xdefine %%m m %+ %%j
    mova %%m, m %+ %%i
    FMULADD_PS %%m, %%m, m0, [localbufq + %%k * mmsize], %%m
%assign %%i %%i-1
%assign %%j %%j-1
%assign %%k %%k-1
%endrep
%if mmsize == AVX512_MMSIZE
    mova m3, m %+ %%i
%endif
%endmacro

%macro FMADD_WRITE 4
    FMULADD_PS %1, %1, %2, %3, %1
    mova %4, %1
%endmacro

%macro WRITE_LOCAL_BUFFER_INTERNAL 8-16
%assign %%i 0
%rep %0
    FMADD_WRITE m3, m0, m %+ %1, [localbufq + %%i * mmsize]
%assign %%i %%i+1
%rotate 1
%endrep
%endmacro

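; Load %1 consecutive image rows, one vector per row, into m4-m19 (AVX-512)
; or m2-m9 (AVX2), advancing by width floats between rows (clobbers strideq).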
%macro GATHERPS 1
%if mmsize == AVX512_MMSIZE
%assign %%i 4
%else
%assign %%i 2
%endif
    movu m %+ %%i, [ptrq]
    mov strideq, widthq
%assign %%i %%i+1
%rep %1-2
    movu m %+ %%i, [ptrq + strideq*4]
    add strideq, widthq
%assign %%i %%i+1
%endrep
    movu m %+ %%i, [ptrq + strideq*4]
%endmacro

%macro SCATTERPS_INTERNAL 8-16
    movu [ptrq + strideq*0], m %+ %1
    mov strideq, widthq
%rotate 1
%rep %0-2
    movu [ptrq + strideq*4], m %+ %1
    add strideq, widthq
%rotate 1
%endrep
    movu [ptrq + strideq*4], m %+ %1
%endmacro

%macro BATCH_INSERT64X4 4-*
%assign %%imm8 %1
%rotate 1
%rep (%0-1)/3
    vinserti64x4 m%1, m%2, ym%3, %%imm8
%rotate 3
%endrep
%endmacro

%macro BATCH_EXTRACT_INSERT 2-*
%assign %%imm8 %1
%rotate 1
%rep (%0-1)/2
    vextractf64x4 ym%1, m%1, %%imm8
    vextractf64x4 ym%2, m%2, %%imm8
    vinserti64x4 m%1, m%1, ym%2, %%imm8
%rotate 2
%endrep
%endmacro

%macro BATCH_MOVE 2-*
%rep %0/2
    mova m%1, m%2
%rotate 2
%endrep
%endmacro

%macro BATCH_PERMUTE 3-*
%xdefine %%decorator %1
%xdefine %%mask %2
%assign %%index %3
%rotate 3
%rep (%0-3)/2
    vperm %+ %%decorator m%1{%%mask}, m %+ %%index, m%2
%rotate 2
%endrep
%endmacro

; input : m4-m19 (m28-m30 hold the permute indices, k1-k6 the masks)
; output: m8 m5 m9 m15 m16 m7 m17 m27 m24 m21 m25 m19 m12 m23 m13 m11
%macro TRANSPOSE_16X16_AVX512 0
    BATCH_INSERT64X4 0x1, 20,4,12, 21,5,13, 22,6,14, 23,7,15
    BATCH_INSERT64X4 0x1, 24,8,16, 25,9,17, 26,10,18, 27,11,19

    BATCH_EXTRACT_INSERT 0x1, 4,12, 5,13, 6,14, 7,15
    BATCH_EXTRACT_INSERT 0x1, 8,16, 9,17, 10,18, 11,19

    BATCH_MOVE 12,20, 13,21, 14,22, 15,23
    BATCH_PERMUTE q, k6, 28, 12,24, 13,25, 14,26, 15,27
    BATCH_PERMUTE q, k5, 28, 24,20, 25,21, 26,22, 27,23

    BATCH_MOVE 16,4, 17,5, 18,6, 19,7
    BATCH_PERMUTE q, k6, 28, 16,8, 17,9, 18,10, 19,11
    BATCH_PERMUTE q, k5, 28, 8,4, 9,5, 10,6, 11,7

    BATCH_MOVE 4,12, 5,13, 6,24, 7,25
    BATCH_MOVE 20,16, 21,17, 22,8, 23,9

    BATCH_PERMUTE q, k4, 29, 4,14, 5,15, 6,26, 7,27
    BATCH_PERMUTE q, k3, 29, 14,12, 15,13, 26,24, 27,25
    BATCH_PERMUTE q, k4, 29, 20,18, 21,19, 22,10, 23,11
    BATCH_PERMUTE q, k3, 29, 18,16, 19,17, 10,8, 11,9

    BATCH_MOVE 8,4, 9,14, 16,6, 17,26
    BATCH_MOVE 24,20, 25,18, 12,22, 13,10

    BATCH_PERMUTE d, k2, 30, 8,5, 9,15, 16,7, 17,27
    BATCH_PERMUTE d, k1, 30, 5,4, 15,14, 7,6, 27,26
    BATCH_PERMUTE d, k2, 30, 24,21, 25,19, 12,23, 13,11
    BATCH_PERMUTE d, k1, 30, 21,20, 19,18, 23,22, 11,10
%endmacro

%macro INSERT_UNPACK 8
    vinsertf128 m%5, m%1, xm%3, 0x1
    vinsertf128 m%6, m%2, xm%4, 0x1
    vunpcklpd m%7, m%5, m%6
    vunpckhpd m%8, m%5, m%6
%endmacro

%macro SHUFFLE 4
    vshufps m%3, m%1, m%2, 0x88
    vshufps m%4, m%1, m%2, 0xDD
    mova m%1, m%3
    mova m%2, m%4
%endmacro

%macro EXTRACT_INSERT_UNPACK 6
    vextractf128 xm%1, m%1, 0x1
    vextractf128 xm%2, m%2, 0x1
    vinsertf128 m%3, m%3, xm%1, 0x0
    vinsertf128 m%4, m%4, xm%2, 0x0
    vunpcklpd m%5, m%3, m%4
    vunpckhpd m%6, m%3, m%4
%endmacro

; Transpose 8x8 AVX2
; Keeps the number of ymm registers used at or below 16 for compatibility;
; temporaries live in registers rather than on the stack.
; Input:  m2-m9
; Output: m12, m14, m13, m15, m8, m10, m9, m11
%macro TRANSPOSE_8X8_AVX2 0
    INSERT_UNPACK 2, 3, 6, 7, 10, 11, 12, 13
    INSERT_UNPACK 4, 5, 8, 9, 10, 11, 14, 15

    SHUFFLE 12, 14, 10, 11
    SHUFFLE 13, 15, 10, 11

    EXTRACT_INSERT_UNPACK 4, 5, 8, 9, 10, 11
    EXTRACT_INSERT_UNPACK 2, 3, 6, 7, 8, 9

    SHUFFLE 8, 10, 6, 7
    SHUFFLE 9, 11, 6, 7
%endmacro

%macro TRANSPOSE 0
%if cpuflag(avx512)
    TRANSPOSE_16X16_AVX512
%elif cpuflag(avx2)
    TRANSPOSE_8X8_AVX2
%endif
%endmacro

%macro WRITE_LOCAL_BUFFER 0
%if cpuflag(avx512)
    WRITE_LOCAL_BUFFER_INTERNAL 8, 5, 9, 15, 16, 7, 17, 27, \
                                24, 21, 25, 19, 12, 23, 13, 11
%elif cpuflag(avx2)
    WRITE_LOCAL_BUFFER_INTERNAL 12, 14, 13, 15, 8, 10, 9, 11
%endif
%endmacro

%macro SCATTERPS 0
%if cpuflag(avx512)
    SCATTERPS_INTERNAL 8, 5, 9, 15, 16, 7, 17, 27, \
                       24, 21, 25, 19, 12, 23, 13, 11
%elif cpuflag(avx2)
    SCATTERPS_INTERNAL 12, 14, 13, 15, 8, 10, 9, 11
%endif
%endmacro

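; Run the remaining (steps - 1) filter passes entirely inside the local
; buffer: scale the boundary, filter leftwards, scale again, then filter
; rightwards, carrying the running value in m3.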
%macro OPTIMIZED_LOOP_STEP 0
    lea stepd, [stepsd - 1]
    cmp stepd, 0
    jle %%bscale_scalar
%%loop_step:
    sub localbufq, mmsize
    mulps m3, m1
    movu [localbufq], m3

    ; Filter leftwards
    lea xq, [widthq - 1]
%%loop_step_x_back:
    sub localbufq, mmsize
    FMULADD_PS m3, m3, m0, [localbufq], m3
    movu [localbufq], m3

    dec xq
    cmp xq, 0
    jg %%loop_step_x_back

    ; Filter rightwards
    mulps m3, m1
    movu [localbufq], m3
    add localbufq, mmsize

    lea xq, [widthq - 1]
%%loop_step_x:
    FMULADD_PS m3, m3, m0, [localbufq], m3
    movu [localbufq], m3
    add localbufq, mmsize

    dec xq
    cmp xq, 0
    jg %%loop_step_x

    dec stepd
    cmp stepd, 0
    jg %%loop_step

%%bscale_scalar:
%endmacro

;***************************************************************************
; void ff_horiz_slice(float *ptr, int width, int height, int steps,
;                     float nu, float bscale)
;***************************************************************************
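; The AVX2/AVX-512 builds below additionally receive a float *localbuf scratch
; area (see the cglobal declarations). A rough scalar C sketch of the per-row
; recursion being vectorized, with names as in the prototype above:
;
;     for (int y = 0; y < height; y++) {
;         float *p = ptr + y * width;
;         for (int step = 0; step < steps; step++) {
;             p[0] *= bscale;
;             for (int x = 1; x < width; x++)
;                 p[x] += nu * p[x - 1];
;             p[width - 1] *= bscale;
;             for (int x = width - 1; x > 0; x--)
;                 p[x - 1] += nu * p[x];
;         }
;     }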
%macro HORIZ_SLICE 0
%if UNIX64
%if cpuflag(avx512) || cpuflag(avx2)
cglobal horiz_slice, 5, 12, mmnum, 0-mmsize*4, buffer, width, height, steps, \
                                               localbuf, x, y, step, stride, remain, ptr, mask
%else
cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, x, y, step, stride, remain
%endif
%else
%if cpuflag(avx512) || cpuflag(avx2)
cglobal horiz_slice, 5, 12, mmnum, 0-mmsize*4, buffer, width, height, steps, nu, bscale, \
                                               localbuf, x, y, step, stride, remain, ptr, mask
%else
cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, nu, bscale, x, y, step, stride, remain
%endif
%endif
%if cpuflag(avx512) || cpuflag(avx2)
%assign rows mmsize/4
%assign cols mmsize/4
%if WIN64
    VBROADCASTSS m0, num     ; nu
    VBROADCASTSS m1, bscalem ; bscale

    mov nuq, localbufm
    DEFINE_ARGS buffer, width, height, steps, \
                localbuf, x, y, step, stride, remain, ptr, mask
%else
    VBROADCASTSS m0, xmm0 ; nu
    VBROADCASTSS m1, xmm1 ; bscale
%endif

    MOVSXDIFNIDN width, height, steps

%if cpuflag(avx512)
    vpbroadcastd m2, widthd
    INIT_WORD_MASK k6, k5, k4, k3, k2, k1
    INIT_INDICES m28, m29, m30
%else
    movd xm2, widthd
    VBROADCASTSS m2, xm2
%endif

    vpmulld m2, m2, [gblur_vindex_width] ; vindex width

    xor yq, yq ; y = 0
    xor xq, xq ; x = 0

    cmp heightq, rows
    jl .y_scalar
    sub heightq, rows

.loop_y:
    ; ptr = buffer + y * width;
    mov ptrq, yq
    imul ptrq, widthq
    lea ptrq, [bufferq + ptrq*4]

    KXNOR m5, k7
    VGATHERDPS m3, [ptrq + m2*4], m5, k7
    mulps m3, m1
    movu [localbufq], m3
    add ptrq, 4
    add localbufq, mmsize

    ; Filter rightwards
    PUSH_MM m2
    lea xq, [widthq - 1]
.loop_x:
    PUSH_MM m3
    GATHERPS cols
    TRANSPOSE
    POP_MM m3
    WRITE_LOCAL_BUFFER

    add ptrq, mmsize
    add localbufq, rows * mmsize
    sub xq, cols
    cmp xq, cols
    jge .loop_x
    POP_MM m2

    cmp xq, 0
    jle .bscale_scalar
.loop_x_scalar:
    KXNOR m5, k7
    VGATHERDPS m4, [ptrq + m2*4], m5, k7
    FMULADD_PS m3, m3, m0, m4, m3
    movu [localbufq], m3

    add ptrq, 0x4
    add localbufq, mmsize
    dec xq
    cmp xq, 0
    jg .loop_x_scalar

.bscale_scalar:
    OPTIMIZED_LOOP_STEP
    sub ptrq, 4
    sub localbufq, mmsize
    mulps m3, m1
    KXNOR m5, k7, maskq
    VSCATTERDPS ptrq, strideq, m2, maskq, k7, 3, widthq, remainq

    ; Filter leftwards
    PUSH_MM m2
    lea xq, [widthq - 1]
.loop_x_back:
    sub localbufq, rows * mmsize
    READ_LOCAL_BUFFER cols
    PUSH_MM m2
    TRANSPOSE
    POP_MM m3
    sub ptrq, mmsize
    SCATTERPS

    sub xq, cols
    cmp xq, cols
    jge .loop_x_back
    POP_MM m2

    cmp xq, 0
    jle .end_loop_x
.loop_x_back_scalar:
    sub ptrq, 0x4
    sub localbufq, mmsize
    FMULADD_PS m3, m3, m0, [localbufq], m3
    KXNOR m5, k7, maskq
    VSCATTERDPS ptrq, strideq, m2, maskq, k7, 3, widthq, remainq

    dec xq
    cmp xq, 0
    jg .loop_x_back_scalar

.end_loop_x:

    add yq, rows
    cmp yq, heightq
    jle .loop_y

    add heightq, rows
    cmp yq, heightq
    jge .end_scalar

    mov remainq, widthq
    imul remainq, mmsize
    add ptrq, remainq

.y_scalar:
    mov remainq, heightq
    sub remainq, yq
    mov maskq, 1
    shlx maskq, maskq, remainq
    sub maskq, 1
    mov remainq, maskq
    PUSH_MASK m5, k1, remaind, xd, rsp + 0x20

    mov ptrq, yq
    imul ptrq, widthq
    lea ptrq, [bufferq + ptrq * 4] ; ptrq = buffer + y * width
    KMOVW m6, m5, k7, k1
    VGATHERDPS m3, [ptrq + m2 * 4], m6, k7
    mulps m3, m1 ; p0 *= bscale
    movu [localbufq], m3
    add localbufq, mmsize

    ; Filter rightwards
    lea xq, [widthq - 1]
.y_scalar_loop_x:
    add ptrq, 4
    KMOVW m6, m5, k7, k1
    VGATHERDPS m4, [ptrq + m2 * 4], m6, k7
    FMULADD_PS m3, m3, m0, m4, m3
    movu [localbufq], m3
    add localbufq, mmsize

    dec xq
    cmp xq, 0
    jg .y_scalar_loop_x

    OPTIMIZED_LOOP_STEP

    sub localbufq, mmsize
    mulps m3, m1 ; p0 *= bscale
    KMOVW k7, k1
    VSCATTERDPS ptrq, strideq, m2, maskq, k7, 3, widthq, remainq, heightq

    ; Filter leftwards
    lea xq, [widthq - 1]
.y_scalar_loop_x_back:
    sub ptrq, 4
    sub localbufq, mmsize
    FMULADD_PS m3, m3, m0, [localbufq], m3
    KMOVW k7, k1
    VSCATTERDPS ptrq, strideq, m2, maskq, k7, 3, widthq, remainq, heightq
    dec xq
    cmp xq, 0
    jg .y_scalar_loop_x_back

.end_scalar:
    RET
%else
%if WIN64
    movss m0, num
    movss m1, bscalem
    DEFINE_ARGS ptr, width, height, steps, x, y, step, stride, remain
%endif
    movsxdifnidn widthq, widthd

    mulss m2, m0, m0 ; nu ^ 2
    mulss m3, m2, m0 ; nu ^ 3
    mulss m4, m3, m0 ; nu ^ 4
    xor xq, xq
    xor yd, yd
    mov strideq, widthq
    ; stride = width * 4
    shl strideq, 2
    ; w = w - ((w - 1) & 3)
    mov remainq, widthq
    sub remainq, 1
    and remainq, 3
    sub widthq, remainq

    shufps m0, m0, 0
    shufps m2, m2, 0
    shufps m3, m3, 0
    shufps m4, m4, 0

.loop_y:
    xor stepd, stepd

.loop_step:
    ; p0 *= bscale
    mulss m5, m1, [ptrq + xq * 4]
    movss [ptrq + xq * 4], m5
    inc xq
    ; filter rightwards
    ; Here we vectorize the C version by 4:
    ;     for (x = 1; x < width; x++)
    ;         ptr[x] += nu * ptr[x - 1];
    ; Let p0 stand for ptr[x-1], the value carried over from the previous
    ; iteration, and [p1,p2,p3,p4] be the vector loaded in this iteration.
    ; Unrolling the recurrence gives:
    ;     p1' = p1 + p0*nu
    ;     p2' = p2 + p1*nu + p0*nu^2
    ;     p3' = p3 + p2*nu + p1*nu^2 + p0*nu^3
    ;     p4' = p4 + p3*nu + p2*nu^2 + p1*nu^3 + p0*nu^4
    ; so it can be done in SIMD as:
    ;     [p1',p2',p3',p4'] = [p1,p2,p3,p4] + [p0,p1,p2,p3]*nu +
    ;                         [0,p0,p1,p2]*nu^2 + [0,0,p0,p1]*nu^3 +
    ;                         [0,0,0,p0]*nu^4

.loop_x:
    movu m6, [ptrq + xq * 4]      ; s = [p1,p2,p3,p4]
    pslldq m7, m6, 4              ; [0, p1,p2,p3]
    movss m7, m5                  ; [p0,p1,p2,p3]
    FMULADD_PS m6, m7, m0, m6, m8 ; s += [p0,p1,p2,p3] * nu
    pslldq m7, 4                  ; [0,p0,p1,p2]
    FMULADD_PS m6, m7, m2, m6, m8 ; s += [0,p0,p1,p2] * nu^2
    pslldq m7, 4
    FMULADD_PS m6, m7, m3, m6, m8 ; s += [0,0,p0,p1] * nu^3
    pslldq m7, 4
    FMULADD_PS m6, m7, m4, m6, m8 ; s += [0,0,0,p0] * nu^4
    movu [ptrq + xq * 4], m6
    shufps m5, m6, m6, q3333
    add xq, 4
    cmp xq, widthq
    jl .loop_x

    add widthq, remainq
    cmp xq, widthq
    jge .end_scalar

.loop_scalar:
    ; ptr[x] += nu * ptr[x-1]
    movss m5, [ptrq + 4*xq - 4]
    mulss m5, m0
    addss m5, [ptrq + 4*xq]
    movss [ptrq + 4*xq], m5
    inc xq
    cmp xq, widthq
    jl .loop_scalar
.end_scalar:
    ; ptr[width - 1] *= bscale
    dec xq
    mulss m5, m1, [ptrq + 4*xq]
    movss [ptrq + 4*xq], m5
    shufps m5, m5, 0

    ; filter leftwards
    ;     for (; x > 0; x--)
    ;         ptr[x - 1] += nu * ptr[x];
    ; The idea is basically the same as when filtering rightwards, but some
    ; care is needed because the data layout is different.
    ; Let p0 stand for ptr[x], the value carried over from the previous
    ; iteration. In SIMD this becomes:
    ;     [p-4', p-3', p-2', p-1'] = [p-4, p-3, p-2, p-1]
    ;                              + [p-3, p-2, p-1, p0] * nu
    ;                              + [p-2, p-1, p0,  0 ] * nu^2
    ;                              + [p-1, p0,  0,   0 ] * nu^3
    ;                              + [p0,  0,   0,   0 ] * nu^4
.loop_x_back:
    sub xq, 4
    movu m6, [ptrq + xq * 4]      ; s = [p-4, p-3, p-2, p-1]
    psrldq m7, m6, 4              ; [p-3, p-2, p-1, 0  ]
    blendps m7, m5, 0x8           ; [p-3, p-2, p-1, p0 ]
    FMULADD_PS m6, m7, m0, m6, m8 ; s += [p-3, p-2, p-1, p0 ] * nu
    psrldq m7, 4
    FMULADD_PS m6, m7, m2, m6, m8 ; s += [p-2, p-1, p0, 0] * nu^2
    psrldq m7, 4
    FMULADD_PS m6, m7, m3, m6, m8 ; s += [p-1, p0, 0, 0] * nu^3
    psrldq m7, 4
    FMULADD_PS m6, m7, m4, m6, m8 ; s += [p0, 0, 0, 0] * nu^4
    movu [ptrq + xq * 4], m6
    shufps m5, m6, m6, 0          ; m5 = [p-4', p-4', p-4', p-4']
    cmp xq, remainq
    jg .loop_x_back

    cmp xq, 0
    jle .end_scalar_back

.loop_scalar_back:
    ; ptr[x-1] += nu * ptr[x]
    movss m5, [ptrq + 4*xq]
    mulss m5, m0
    addss m5, [ptrq + 4*xq - 4]
    movss [ptrq + 4*xq - 4], m5
    dec xq
    cmp xq, 0
    jg .loop_scalar_back
.end_scalar_back:

    ; reset aligned width for next line
    sub widthq, remainq

    inc stepd
    cmp stepd, stepsd
    jl .loop_step

    add ptrq, strideq
    inc yd
    cmp yd, heightd
    jl .loop_y

    RET
%endif
%endmacro

%if ARCH_X86_64
INIT_XMM sse4
HORIZ_SLICE

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
%xdefine mmnum 16
HORIZ_SLICE
%endif

%if HAVE_AVX512_EXTERNAL
INIT_ZMM avx512
%xdefine mmnum 32
HORIZ_SLICE
%endif
%endif

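;*******************************************************************************
; void ff_postscale_slice(float *ptr, int length, float postscale,
;                         float min, float max)
;
; Multiply every element by postscale and clamp it to [min, max]; roughly:
;     for (int i = 0; i < length; i++)
;         ptr[i] = av_clipf(ptr[i] * postscale, min, max);
;*******************************************************************************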
%macro POSTSCALE_SLICE 0
cglobal postscale_slice, 2, 2, 4, ptr, length, postscale, min, max
    shl lengthd, 2
    add ptrq, lengthq
    neg lengthq
%if ARCH_X86_32
    VBROADCASTSS m0, postscalem
    VBROADCASTSS m1, minm
    VBROADCASTSS m2, maxm
%elif WIN64
    VBROADCASTSS m0, xmm2
    VBROADCASTSS m1, xmm3
    VBROADCASTSS m2, maxm
%else ; UNIX
    VBROADCASTSS m0, xmm0
    VBROADCASTSS m1, xmm1
    VBROADCASTSS m2, xmm2
%endif

.loop:
%if cpuflag(avx2) || cpuflag(avx512)
    mulps m3, m0, [ptrq + lengthq]
%else
    movu m3, [ptrq + lengthq]
    mulps m3, m0
%endif
    maxps m3, m1
    minps m3, m2
    movu [ptrq+lengthq], m3

    add lengthq, mmsize
    jl .loop

    RET
%endmacro

INIT_XMM sse
POSTSCALE_SLICE

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
POSTSCALE_SLICE
%endif

%if HAVE_AVX512_EXTERNAL
INIT_ZMM avx512
POSTSCALE_SLICE
%endif

;*******************************************************************************
; void ff_verti_slice(float *buffer, int width, int height, int column_begin,
;                     int column_end, int steps, float nu, float bscale);
;*******************************************************************************
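; A rough scalar C sketch of the per-column recursion being vectorized, with
; names as in the prototype above:
;
;     for (int x = column_begin; x < column_end; x++) {
;         float *p = buffer + x;
;         for (int step = 0; step < steps; step++) {
;             p[0] *= bscale;
;             for (int y = 1; y < height; y++)
;                 p[y * width] += nu * p[(y - 1) * width];
;             p[(height - 1) * width] *= bscale;
;             for (int y = height - 1; y > 0; y--)
;                 p[(y - 1) * width] += nu * p[y * width];
;         }
;     }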
%macro VERTI_SLICE 0
%if UNIX64
cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \
                                           steps, x, y, cwidth, step, ptr, stride
%else
cglobal verti_slice, 6, 12, 9, 0-mmsize*2, buffer, width, height, cbegin, cend, \
                                           steps, nu, bscale, x, y, cwidth, step, \
                                           ptr, stride
%endif
%assign cols mmsize/4
%if WIN64
    VBROADCASTSS m0, num
    VBROADCASTSS m1, bscalem
    DEFINE_ARGS buffer, width, height, cbegin, cend, \
                steps, x, y, cwidth, step, ptr, stride
%else
    VBROADCASTSS m0, xmm0 ; nu
    VBROADCASTSS m1, xmm1 ; bscale
%endif
    MOVSXDIFNIDN width, height, cbegin, cend, steps

    mov cwidthq, cendq
    sub cwidthq, cbeginq
    lea strideq, [widthq * 4]

    xor xq, xq ; x = 0
    cmp cwidthq, cols
    jl .x_scalar
    cmp cwidthq, 0x0
    je .end_scalar

    sub cwidthq, cols
.loop_x:
    xor stepq, stepq
.loop_step:
    ; ptr = buffer + x + column_begin;
    lea ptrq, [xq + cbeginq]
    lea ptrq, [bufferq + ptrq*4]

    ; ptr[cols-1:0] *= bscale;
    movu m2, [ptrq]
    mulps m2, m1
    movu [ptrq], m2

    ; Filter downwards
    mov yq, 1
.loop_y_down:
    add ptrq, strideq ; ptrq += width
    movu m3, [ptrq]
    FMULADD_PS m2, m2, m0, m3, m2
    movu [ptrq], m2

    inc yq
    cmp yq, heightq
    jl .loop_y_down

    mulps m2, m1
    movu [ptrq], m2

    ; Filter upwards
    dec yq
.loop_y_up:
    sub ptrq, strideq
    movu m3, [ptrq]
    FMULADD_PS m2, m2, m0, m3, m2
    movu [ptrq], m2

    dec yq
    cmp yq, 0
    jg .loop_y_up

    inc stepq
    cmp stepq, stepsq
    jl .loop_step

    add xq, cols
    cmp xq, cwidthq
    jle .loop_x

    add cwidthq, cols
    cmp xq, cwidthq
    jge .end_scalar

.x_scalar:
    xor stepq, stepq
    mov qword [rsp + 0x10], xq
    sub cwidthq, xq
    mov xq, 1
    shlx cwidthq, xq, cwidthq
    sub cwidthq, 1
    PUSH_MASK m4, k1, cwidthd, xd, rsp + 0x20
    mov xq, qword [rsp + 0x10]

.loop_step_scalar:
    lea ptrq, [xq + cbeginq]
    lea ptrq, [bufferq + ptrq*4]

    VMASKMOVPS m2, [ptrq], m4, k1
    mulps m2, m1
    VMASKMOVPS [ptrq], m2, m4, k1

    ; Filter downwards
    mov yq, 1
.x_scalar_loop_y_down:
    add ptrq, strideq
    VMASKMOVPS m3, [ptrq], m4, k1
    FMULADD_PS m2, m2, m0, m3, m2
    VMASKMOVPS [ptrq], m2, m4, k1

    inc yq
    cmp yq, heightq
    jl .x_scalar_loop_y_down

    mulps m2, m1
    VMASKMOVPS [ptrq], m2, m4, k1

    ; Filter upwards
    dec yq
.x_scalar_loop_y_up:
    sub ptrq, strideq
    VMASKMOVPS m3, [ptrq], m4, k1
    FMULADD_PS m2, m2, m0, m3, m2
    VMASKMOVPS [ptrq], m2, m4, k1

    dec yq
    cmp yq, 0
    jg .x_scalar_loop_y_up

    inc stepq
    cmp stepq, stepsq
    jl .loop_step_scalar

.end_scalar:
    RET
%endmacro

%if ARCH_X86_64
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
VERTI_SLICE
%endif

%if HAVE_AVX512_EXTERNAL
INIT_ZMM avx512
VERTI_SLICE
%endif
%endif