| ;***************************************************************************** |
| ;* x86-optimized functions for nlmeans filter |
| ;* |
| ;* This file is part of FFmpeg. |
| ;* |
| ;* FFmpeg is free software; you can redistribute it and/or |
| ;* modify it under the terms of the GNU Lesser General Public |
| ;* License as published by the Free Software Foundation; either |
| ;* version 2.1 of the License, or (at your option) any later version. |
| ;* |
| ;* FFmpeg is distributed in the hope that it will be useful, |
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| ;* Lesser General Public License for more details. |
| ;* |
| ;* You should have received a copy of the GNU Lesser General Public |
| ;* License along with FFmpeg; if not, write to the Free Software |
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| ;****************************************************************************** |
| |
| |
| %include "libavutil/x86/x86util.asm" |
| |
| %if HAVE_AVX2_EXTERNAL && ARCH_X86_64 |
| |
| SECTION_RODATA 32 |
| |
| ; Tail masks: 9 rows of 8 dwords (32 bytes per row), one row per possible |
| ; count of remaining elements (0..8). Row r holds r zero dwords followed |
| ; by (8 - r) dwords of -1, so row 0 is all ones and row 8 is all zeros. |
| ending_lut: |
| %assign tail_row 0 |
| %rep 9 |
| times tail_row dd 0 |
| times (8 - tail_row) dd -1 |
| %assign tail_row tail_row + 1 |
| %endrep |
| %undef tail_row |
| |
| SECTION .text |
| |
| ; void ff_compute_weights_line(const uint32_t *const iia, |
| ; const uint32_t *const iib, |
| ; const uint32_t *const iid, |
| ; const uint32_t *const iie, |
| ; const uint8_t *const src, |
| ; float *total, |
| ; float *sum, |
| ; const float *const lut, |
| ; int max, |
| ; int startx, int endx); |
| ; |
| ; For x in [startx, endx), processed 8 elements per iteration: |
| ; idx = min(iie[x] - iid[x] - iib[x] + iia[x], max) (unsigned min) |
| ; w = lut[idx] |
| ; total[x] += w |
| ; sum[x] += w * src[x] |
| ; Does nothing when startx >= endx. |
| ; |
| ; Every iteration loads and stores whole 8-element vectors, so the last |
| ; iteration can read and write up to 7 elements past endx in the arrays. |
| ; Lanes at or past endx get their index forced to max via ending_lut, so |
| ; they accumulate lut[max] (and lut[max] * src) instead of a real weight. |
| ; NOTE(review): that is only harmless if lut[max] is 0 and the buffers are |
| ; padded to a multiple of 8 elements -- confirm against the caller. |
| ; |
| ; Register roles: |
| ; xq current x position |
| ; startxq startx on entry, then scratch (byte offset into ending_lut) |
| ; modq number of dwords per vector (mmsize / 4) |
| ; elutq address of ending_lut |
| ; m2 max broadcast to all lanes |
| ; m4 all ones, template for the gather mask |
| |
| INIT_YMM avx2 |
| cglobal compute_weights_line, 8, 13, 5, 0, iia, iib, iid, iie, src, total, sum, lut, x, startx, endx, mod, elut |
| movsxd startxq, dword startxm |
| movsxd endxq, dword endxm |
| |
| ; The loop below only tests its condition after the first pass. With an |
| ; empty range it would index ending_lut with a zero or negative row and |
| ; still update total/sum, so bail out before touching anything. |
| cmp startxq, endxq |
| jge .end |
| |
| ; Only the first 8 arguments are loaded into registers by cglobal, so |
| ; max (argument index 8) is read through its r8m form. |
| VPBROADCASTD m2, r8m |
| |
| mov xq, startxq |
| mov modq, mmsize / 4 |
| lea elutq, [ending_lut] |
| |
| vpcmpeqd m4, m4 ; m4 = all ones |
| |
| .loop: |
| ; startxq = min(endx - x, 8) * 32 = byte offset of the matching ending_lut row |
| mov startxq, endxq |
| sub startxq, xq |
| cmp startxq, modq |
| cmovge startxq, modq |
| sal startxq, 5 |
| |
| ; m0 = iie - iid - iib + iia for 8 consecutive x |
| movu m0, [iieq + xq * 4] |
| |
| psubd m0, [iidq + xq * 4] |
| psubd m0, [iibq + xq * 4] |
| paddd m0, [iiaq + xq * 4] |
| por m0, [elutq + startxq] ; lanes at or past endx become 0xFFFFFFFF |
| pminud m0, m2 ; unsigned clamp to max (also catches the forced lanes) |
| pslld m0, 2 ; element index -> byte offset into the float lut |
| mova m3, m4 ; the gather clears its mask register, so use a fresh copy |
| vgatherdps m1, [lutq + m0], m3 ; m1 = lut[idx] per lane |
| |
| ; m0 = (float)src[x] for the same 8 positions |
| pmovzxbd m0, [srcq + xq] |
| cvtdq2ps m0, m0 |
| |
| mulps m0, m1 ; m0 = weight * pixel |
| |
| addps m1, [totalq + xq * 4] |
| addps m0, [sumq + xq * 4] |
| |
| movups [totalq + xq * 4], m1 |
| movups [sumq + xq * 4], m0 |
| |
| add xq, mmsize / 4 |
| cmp xq, endxq |
| jl .loop |
| .end: |
| RET |
| |
| %endif |