libavcodec/x86/opus_pvq_search.asm - third_party/ffmpeg - Git at Google

 ;******************************************************************************
 ;* SIMD optimized Opus encoder DSP function
 ;*
 ;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
 ;*
 ;* This file is part of FFmpeg.
 ;*
 ;* FFmpeg is free software; you can redistribute it and/or
 ;* modify it under the terms of the GNU Lesser General Public
 ;* License as published by the Free Software Foundation; either
 ;* version 2.1 of the License, or (at your option) any later version.
 ;*
 ;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 ;* Lesser General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU Lesser General Public
 ;* License along with FFmpeg; if not, write to the Free Software
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************

 %include "config.asm"
 %include "libavutil/x86/x86util.asm"

 %ifdef __NASM_VER__
 %use "smartalign"
 ALIGNMODE p6
 %endif

 SECTION_RODATA 64

 const_float_abs_mask:   times 8 dd 0x7fffffff
 const_align_abs_edge:   times 8 dd 0

 const_float_0_5:        times 8 dd 0.5
 const_float_1:          times 8 dd 1.0
 const_float_sign_mask:  times 8 dd 0x80000000

 const_int32_offsets:
                         %rep 8
                                 dd $-const_int32_offsets
                         %endrep
 SECTION .text

 ;
 ;   Setup High Register to be used
 ;   for holding memory constants
 ;
 ; %1 - the register to be used, assmues it is >= mm8
 ; %2 - name of the constant.
 ;
 ; Subsequent opcodes are going to use the constant in the form
 ; "addps m0, mm_const_name" and it would be turned into:
 ; "addps m0, [const_name]" on 32 bit arch or
 ; "addps m0, m8" on 64 bit arch
 %macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
 %if num_mmregs > 8
     %define  mm_%3   %2
     %{1}        %2, [%3]    ; movaps m8, [const_name]
 %else
     %define  mm_%3  [%3]
 %endif
 %endmacro

 ;
 ;   Set Position Independent Code
 ;       Base address of a constant
 ; %1 - the register to be used, if PIC is set
 ; %2 - name of the constant.
 ;
 ; Subsequent opcode are going to use the base address in the form
 ; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
 ; "movaps m0, [r5 + r4]" if PIC is enabled
 ; "movaps m0, [constant_name + r4]" if texrel are used
 %macro SET_PIC_BASE 3; reg, const_label
 %ifdef PIC
     %{1}     %2, [%3]      ; lea r5, [rip+const]
     %define  pic_base_%3 %2
 %else
     %define  pic_base_%3 %3
 %endif
 %endmacro

 %macro PULSES_SEARCH 1
 ; m6 Syy_norm
 ; m7 Sxy_norm
     addps          m6, mm_const_float_0_5   ; Syy_norm += 1.0/2
     pxor           m1, m1                   ; max_idx
     xorps          m3, m3                   ; p_max
     xor           r4d, r4d
 align 16
 %%distortion_search:
     movd          xm2, dword r4d    ; movd zero extends
 %ifidn %1,add
     movaps         m4, [tmpY + r4]  ; y[i]
     movaps         m5, [tmpX + r4]  ; X[i]

   %if USE_APPROXIMATION == 1
     xorps          m0, m0
     cmpps          m0, m0, m5, 4    ; m0 = (X[i] != 0.0)
   %endif

     addps          m4, m6           ; m4 = Syy_new = y[i] + Syy_norm
     addps          m5, m7           ; m5 = Sxy_new = X[i] + Sxy_norm

   %if USE_APPROXIMATION == 1
     andps          m5, m0           ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
   %endif

 %else
     movaps         m5, [tmpY + r4]      ; m5 = y[i]

     xorps          m0, m0               ; m0 = 0;
     cmpps          m0, m0, m5, 1        ; m0 = (0<y)

     subps          m4, m6, m5           ; m4 = Syy_new = Syy_norm - y[i]
     subps          m5, m7, [tmpX + r4]  ; m5 = Sxy_new = Sxy_norm - X[i]
     andps          m5, m0               ; (0<y)?m5:0
 %endif

 %if USE_APPROXIMATION == 1
     rsqrtps        m4, m4
     mulps          m5, m4           ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
 %else
     mulps          m5, m5
     divps          m5, m4           ; m5 = p = Sxy_new*Sxy_new/Syy
 %endif
     VPBROADCASTD   m2, xm2          ; m2=i (all lanes get same values, we add the offset-per-lane, later)

     cmpps          m0, m3, m5, 1    ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
     maxps          m3, m5           ; m3=max(p_max,p)
                                     ; maxps here is faster than blendvps, despite blend having lower latency.

     pand           m2, m0           ; This version seems faster than sse41 pblendvb
     pmaxsw         m1, m2           ; SSE2 signed word, so it would work for N < 32768/4

     add           r4d, mmsize
     cmp           r4d, Nd
     jb   %%distortion_search

     por            m1, mm_const_int32_offsets  ; max_idx offsets per individual lane (skipped in the inner loop)
     movdqa         m4, m1                      ; needed for the aligned y[max_idx]+=1; processing

 %if mmsize >= 32
 ; Merge parallel maximums round 8 (4 vs 4)

     vextractf128  xm5, ym3, 1       ; xmm5 = ymm3[1x128] = ymm3[255..128b]
     cmpps         xm0, xm3, xm5, 1  ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )

     vextracti128  xm2, ym1, 1       ; xmm2 = ymm1[1x128] = ymm1[255..128b]
     BLENDVPS      xm3, xm5, xm0     ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
     PBLENDVB      xm1, xm2, xm0     ; p       = m0 ? p[1x128]       : p[0x128]
 %endif

 ; Merge parallel maximums round 4 (2 vs 2)
                                     ; m3=p[3210]
     movhlps       xm5, xm3          ; m5=p[xx32]
     cmpps         xm0, xm3, xm5, 1  ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )

     pshufd        xm2, xm1, q3232
     BLENDVPS      xm3, xm5, xm0     ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
     PBLENDVB      xm1, xm2, xm0     ; p       = m0 ? p[3,2]       : p[1,0]

 ; Merge parallel maximums final round (1 vs 1)
     shufps        xm0, xm3, xm3, q1111  ; m0 = m3[1] = p[1]
     cmpss         xm0, xm3, 5           ; m0 = !(m0 >= m3) = !( p[1] >= p[0] )

     pshufd        xm2, xm1, q1111
     PBLENDVB      xm1, xm2, xm0

     movd    dword r4d, xm1          ; zero extends to the rest of r4q

     VBROADCASTSS   m3, [tmpX + r4]
     %{1}ps         m7, m3           ; Sxy += X[max_idx]

     VBROADCASTSS   m5, [tmpY + r4]
     %{1}ps         m6, m5           ; Syy += Y[max_idx]

     ; We have to update a single element in Y[i]
     ; However writing 4 bytes and then doing 16 byte load in the inner loop
     ; could cause a stall due to breaking write forwarding.
     VPBROADCASTD   m1, xm1
     pcmpeqd        m1, m1, m4           ; exactly 1 element matches max_idx and this finds it

     and           r4d, ~(mmsize-1)      ; align address down, so the value pointed by max_idx is inside a mmsize load
     movaps         m5, [tmpY + r4]      ; m5 = Y[y3...ym...y0]
     andps          m1, mm_const_float_1 ; m1 =  [ 0...1.0...0]
     %{1}ps         m5, m1               ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
     movaps [tmpY + r4], m5              ; Y[max_idx] +-= 1.0;
 %endmacro

 ;
 ; We need one more register for
 ; PIC relative addressing. Use this
 ; to count it in cglobal
 ;
 %ifdef PIC
   %define num_pic_regs 1
 %else
   %define num_pic_regs 0
 %endif

 ;
 ; Pyramid Vector Quantization Search implementation
 ;
 ; float * inX   - Unaligned (SIMD) access, it will be overread,
 ;                 but extra data is masked away.
 ; int32 * outY  - Should be aligned and padded buffer.
 ;                 It is used as temp buffer.
 ; uint32 K      - Number of pulses to have after quantizations.
 ; uint32 N      - Number of vector elements. Must be 0 < N < 256
 ;
 %macro PVQ_FAST_SEARCH 1
 cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
 %define tmpX rsp
 %define tmpY outYq

     movaps     m0, [const_float_abs_mask]
     shl        Nd, 2    ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
     mov       r4d, Nd

     neg       r4d
     and       r4d, mmsize-1

     SET_PIC_BASE lea, r5, const_align_abs_edge  ; rip+const
     movups     m2, [pic_base_const_align_abs_edge + r4 - mmsize]

     add        Nd, r4d              ; N = align(N, mmsize)

     lea       r4d, [Nd - mmsize]    ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
     movups     m1, [inXq + r4]
     andps      m1, m2
     movaps  [tmpX + r4], m1         ; Sx = abs( X[N-1] )

 align 16
 %%loop_abs_sum:
     sub       r4d, mmsize
     jc   %%end_loop_abs_sum

     movups     m2, [inXq + r4]
     andps      m2, m0

     movaps  [tmpX + r4], m2 ; tmpX[i]=abs(X[i])
     addps      m1, m2       ; Sx += abs(X[i])
     jmp  %%loop_abs_sum

 align 16
 %%end_loop_abs_sum:

     HSUMPS     m1, m2       ; m1  = Sx

     xorps      m0, m0
     comiss    xm0, xm1      ;
     jz   %%zero_input       ; if (Sx==0) goto zero_input

     cvtsi2ss  xm0, dword Kd ; m0 = K
 %if USE_APPROXIMATION == 1
     rcpss     xm1, xm1      ; m1 = approx(1/Sx)
     mulss     xm0, xm1      ; m0 = K*(1/Sx)
 %else
     divss     xm0, xm1      ; b = K/Sx
                             ; b = K/max_x
 %endif

     VBROADCASTSS  m0, xm0

     lea       r4d, [Nd - mmsize]
     pxor       m5, m5             ; Sy    ( Sum of abs( y[i]) )
     xorps      m6, m6             ; Syy   ( Sum of y[i]*y[i]  )
     xorps      m7, m7             ; Sxy   ( Sum of X[i]*y[i]  )
 align 16
 %%loop_guess:
     movaps     m1, [tmpX + r4]    ; m1   = X[i]
     mulps      m2, m0, m1         ; m2   = res*X[i]
     cvtps2dq   m2, m2             ; yt   = (int)lrintf( res*X[i] )
     paddd      m5, m2             ; Sy  += yt
     cvtdq2ps   m2, m2             ; yt   = (float)yt
     mulps      m1, m2             ; m1   = X[i]*yt
     movaps  [tmpY + r4], m2       ; y[i] = m2
     addps      m7, m1             ; Sxy += m1;
     mulps      m2, m2             ; m2   = yt*yt
     addps      m6, m2             ; Syy += m2

     sub       r4d, mmsize
     jnc  %%loop_guess

     HSUMPS     m6, m1       ; Syy_norm
     HADDD      m5, m4       ; pulses

     movd  dword r4d, xm5    ; zero extends to the rest of r4q

     sub        Kd, r4d      ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
     jz   %%finish           ; K - pulses == 0

     SET_HI_REG_MM_CONSTANT movaps,  m8, const_float_0_5
     SET_HI_REG_MM_CONSTANT movaps,  m9, const_float_1
     SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
     ; Use Syy/2 in distortion parameter calculations.
     ; Saves pre and post-caclulation to correct Y[] values.
     ; Same precision, since float mantisa is normalized.
     ; The SQRT approximation does differ.
     HSUMPS     m7, m0         ; Sxy_norm
     mulps      m6, mm_const_float_0_5

     jc   %%remove_pulses_loop   ; K - pulses < 0

 align 16                        ; K - pulses > 0
 %%add_pulses_loop:

     PULSES_SEARCH add   ; m6 Syy_norm ; m7 Sxy_norm

     sub        Kd, 1
     jnz  %%add_pulses_loop

     addps      m6, m6 ; Syy*=2

     jmp  %%finish

 align 16
 %%remove_pulses_loop:

     PULSES_SEARCH sub   ; m6 Syy_norm ; m7 Sxy_norm

     add        Kd, 1
     jnz  %%remove_pulses_loop

     addps      m6, m6 ; Syy*=2

 align 16
 %%finish:
     lea       r4d, [Nd - mmsize]
     movaps     m2, [const_float_sign_mask]

 align 16
 %%restore_sign_loop:
     movaps     m0, [tmpY + r4]    ; m0 = Y[i]
     movups     m1, [inXq + r4]    ; m1 = X[i]
     andps      m1, m2             ; m1 = sign(X[i])
     orps       m0, m1             ; m0 = Y[i]*sign
     cvtps2dq   m3, m0             ; m3 = (int)m0
     movaps  [outYq + r4], m3

     sub       r4d, mmsize
     jnc  %%restore_sign_loop
 %%return:

 %if ARCH_X86_64 == 0    ; sbrdsp
     movss     r0m, xm6  ; return (float)Syy_norm
     fld dword r0m
 %else
     movaps     m0, m6   ; return (float)Syy_norm
 %endif

     RET

 align 16
 %%zero_input:
     lea       r4d, [Nd - mmsize]
     xorps      m0, m0
 %%zero_loop:
     movaps  [outYq + r4], m0
     sub       r4d, mmsize
     jnc  %%zero_loop

     movaps     m6, [const_float_1]
     jmp  %%return
 %endmacro

 ; if 1, use a float op that give half precision but execute for around 3 cycles.
 ; On Skylake & Ryzen the division is much faster (around 11c/3),
 ; that makes the full precision code about 2% slower.
 ; Opus also does use rsqrt approximation in their intrinsics code.
 %define USE_APPROXIMATION   1

 INIT_XMM sse2
 PVQ_FAST_SEARCH _approx

 INIT_XMM sse4
 PVQ_FAST_SEARCH _approx

 %define USE_APPROXIMATION   0

 INIT_XMM avx
 PVQ_FAST_SEARCH _exact
	;******************************************************************************
	;* SIMD optimized Opus encoder DSP function
	;*
	;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
	;*
	;* This file is part of FFmpeg.
	;*
	;* FFmpeg is free software; you can redistribute it and/or
	;* modify it under the terms of the GNU Lesser General Public
	;* License as published by the Free Software Foundation; either
	;* version 2.1 of the License, or (at your option) any later version.
	;*
	;* FFmpeg is distributed in the hope that it will be useful,
	;* but WITHOUT ANY WARRANTY; without even the implied warranty of
	;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	;* Lesser General Public License for more details.
	;*
	;* You should have received a copy of the GNU Lesser General Public
	;* License along with FFmpeg; if not, write to the Free Software
	;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	;******************************************************************************

	%include "config.asm"
	%include "libavutil/x86/x86util.asm"

	%ifdef __NASM_VER__
	%use "smartalign"
	ALIGNMODE p6
	%endif

	SECTION_RODATA 64

	const_float_abs_mask: times 8 dd 0x7fffffff
	const_align_abs_edge: times 8 dd 0

	const_float_0_5: times 8 dd 0.5
	const_float_1: times 8 dd 1.0
	const_float_sign_mask: times 8 dd 0x80000000

	const_int32_offsets:
	%rep 8
	dd $-const_int32_offsets
	%endrep
	SECTION .text

	;
	; Setup High Register to be used
	; for holding memory constants
	;
	; %1 - the register to be used, assmues it is >= mm8
	; %2 - name of the constant.
	;
	; Subsequent opcodes are going to use the constant in the form
	; "addps m0, mm_const_name" and it would be turned into:
	; "addps m0, [const_name]" on 32 bit arch or
	; "addps m0, m8" on 64 bit arch
	%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
	%if num_mmregs > 8
	%define mm_%3 %2
	%{1} %2, [%3] ; movaps m8, [const_name]
	%else
	%define mm_%3 [%3]
	%endif
	%endmacro

	;
	; Set Position Independent Code
	; Base address of a constant
	; %1 - the register to be used, if PIC is set
	; %2 - name of the constant.
	;
	; Subsequent opcode are going to use the base address in the form
	; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
	; "movaps m0, [r5 + r4]" if PIC is enabled
	; "movaps m0, [constant_name + r4]" if texrel are used
	%macro SET_PIC_BASE 3; reg, const_label
	%ifdef PIC
	%{1} %2, [%3] ; lea r5, [rip+const]
	%define pic_base_%3 %2
	%else
	%define pic_base_%3 %3
	%endif
	%endmacro

	%macro PULSES_SEARCH 1
	; m6 Syy_norm
	; m7 Sxy_norm
	addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
	pxor m1, m1 ; max_idx
	xorps m3, m3 ; p_max
	xor r4d, r4d
	align 16
	%%distortion_search:
	movd xm2, dword r4d ; movd zero extends
	%ifidn %1,add
	movaps m4, [tmpY + r4] ; y[i]
	movaps m5, [tmpX + r4] ; X[i]

	%if USE_APPROXIMATION == 1
	xorps m0, m0
	cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
	%endif

	addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
	addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm

	%if USE_APPROXIMATION == 1
	andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
	%endif

	%else
	movaps m5, [tmpY + r4] ; m5 = y[i]

	xorps m0, m0 ; m0 = 0;
	cmpps m0, m0, m5, 1 ; m0 = (0<y)

	subps m4, m6, m5 ; m4 = Syy_new = Syy_norm - y[i]
	subps m5, m7, [tmpX + r4] ; m5 = Sxy_new = Sxy_norm - X[i]
	andps m5, m0 ; (0<y)?m5:0
	%endif

	%if USE_APPROXIMATION == 1
	rsqrtps m4, m4
	mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
	%else
	mulps m5, m5
	divps m5, m4 ; m5 = p = Sxy_new*Sxy_new/Syy
	%endif
	VPBROADCASTD m2, xm2 ; m2=i (all lanes get same values, we add the offset-per-lane, later)

	cmpps m0, m3, m5, 1 ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
	maxps m3, m5 ; m3=max(p_max,p)
	; maxps here is faster than blendvps, despite blend having lower latency.

	pand m2, m0 ; This version seems faster than sse41 pblendvb
	pmaxsw m1, m2 ; SSE2 signed word, so it would work for N < 32768/4

	add r4d, mmsize
	cmp r4d, Nd
	jb %%distortion_search

	por m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop)
	movdqa m4, m1 ; needed for the aligned y[max_idx]+=1; processing

	%if mmsize >= 32
	; Merge parallel maximums round 8 (4 vs 4)

	vextractf128 xm5, ym3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b]
	cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )

	vextracti128 xm2, ym1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b]
	BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
	PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[1x128] : p[0x128]
	%endif

	; Merge parallel maximums round 4 (2 vs 2)
	; m3=p[3210]
	movhlps xm5, xm3 ; m5=p[xx32]
	cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )

	pshufd xm2, xm1, q3232
	BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
	PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[3,2] : p[1,0]

	; Merge parallel maximums final round (1 vs 1)
	shufps xm0, xm3, xm3, q1111 ; m0 = m3[1] = p[1]
	cmpss xm0, xm3, 5 ; m0 = !(m0 >= m3) = !( p[1] >= p[0] )

	pshufd xm2, xm1, q1111
	PBLENDVB xm1, xm2, xm0

	movd dword r4d, xm1 ; zero extends to the rest of r4q

	VBROADCASTSS m3, [tmpX + r4]
	%{1}ps m7, m3 ; Sxy += X[max_idx]

	VBROADCASTSS m5, [tmpY + r4]
	%{1}ps m6, m5 ; Syy += Y[max_idx]

	; We have to update a single element in Y[i]
	; However writing 4 bytes and then doing 16 byte load in the inner loop
	; could cause a stall due to breaking write forwarding.
	VPBROADCASTD m1, xm1
	pcmpeqd m1, m1, m4 ; exactly 1 element matches max_idx and this finds it

	and r4d, ~(mmsize-1) ; align address down, so the value pointed by max_idx is inside a mmsize load
	movaps m5, [tmpY + r4] ; m5 = Y[y3...ym...y0]
	andps m1, mm_const_float_1 ; m1 = [ 0...1.0...0]
	%{1}ps m5, m1 ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
	movaps [tmpY + r4], m5 ; Y[max_idx] +-= 1.0;
	%endmacro

	;
	; We need one more register for
	; PIC relative addressing. Use this
	; to count it in cglobal
	;
	%ifdef PIC
	%define num_pic_regs 1
	%else
	%define num_pic_regs 0
	%endif

	;
	; Pyramid Vector Quantization Search implementation
	;
	; float * inX - Unaligned (SIMD) access, it will be overread,
	; but extra data is masked away.
	; int32 * outY - Should be aligned and padded buffer.
	; It is used as temp buffer.
	; uint32 K - Number of pulses to have after quantizations.
	; uint32 N - Number of vector elements. Must be 0 < N < 256
	;
	%macro PVQ_FAST_SEARCH 1
	cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
	%define tmpX rsp
	%define tmpY outYq

	movaps m0, [const_float_abs_mask]
	shl Nd, 2 ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
	mov r4d, Nd

	neg r4d
	and r4d, mmsize-1

	SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const
	movups m2, [pic_base_const_align_abs_edge + r4 - mmsize]

	add Nd, r4d ; N = align(N, mmsize)

	lea r4d, [Nd - mmsize] ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
	movups m1, [inXq + r4]
	andps m1, m2
	movaps [tmpX + r4], m1 ; Sx = abs( X[N-1] )

	align 16
	%%loop_abs_sum:
	sub r4d, mmsize
	jc %%end_loop_abs_sum

	movups m2, [inXq + r4]
	andps m2, m0

	movaps [tmpX + r4], m2 ; tmpX[i]=abs(X[i])
	addps m1, m2 ; Sx += abs(X[i])
	jmp %%loop_abs_sum

	align 16
	%%end_loop_abs_sum:

	HSUMPS m1, m2 ; m1 = Sx

	xorps m0, m0
	comiss xm0, xm1 ;
	jz %%zero_input ; if (Sx==0) goto zero_input

	cvtsi2ss xm0, dword Kd ; m0 = K
	%if USE_APPROXIMATION == 1
	rcpss xm1, xm1 ; m1 = approx(1/Sx)
	mulss xm0, xm1 ; m0 = K*(1/Sx)
	%else
	divss xm0, xm1 ; b = K/Sx
	; b = K/max_x
	%endif

	VBROADCASTSS m0, xm0

	lea r4d, [Nd - mmsize]
	pxor m5, m5 ; Sy ( Sum of abs( y[i]) )
	xorps m6, m6 ; Syy ( Sum of y[i]*y[i] )
	xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] )
	align 16
	%%loop_guess:
	movaps m1, [tmpX + r4] ; m1 = X[i]
	mulps m2, m0, m1 ; m2 = res*X[i]
	cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] )
	paddd m5, m2 ; Sy += yt
	cvtdq2ps m2, m2 ; yt = (float)yt
	mulps m1, m2 ; m1 = X[i]*yt
	movaps [tmpY + r4], m2 ; y[i] = m2
	addps m7, m1 ; Sxy += m1;
	mulps m2, m2 ; m2 = yt*yt
	addps m6, m2 ; Syy += m2

	sub r4d, mmsize
	jnc %%loop_guess

	HSUMPS m6, m1 ; Syy_norm
	HADDD m5, m4 ; pulses

	movd dword r4d, xm5 ; zero extends to the rest of r4q

	sub Kd, r4d ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
	jz %%finish ; K - pulses == 0

	SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5
	SET_HI_REG_MM_CONSTANT movaps, m9, const_float_1
	SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
	; Use Syy/2 in distortion parameter calculations.
	; Saves pre and post-caclulation to correct Y[] values.
	; Same precision, since float mantisa is normalized.
	; The SQRT approximation does differ.
	HSUMPS m7, m0 ; Sxy_norm
	mulps m6, mm_const_float_0_5

	jc %%remove_pulses_loop ; K - pulses < 0

	align 16 ; K - pulses > 0
	%%add_pulses_loop:

	PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm

	sub Kd, 1
	jnz %%add_pulses_loop

	addps m6, m6 ; Syy*=2

	jmp %%finish

	align 16
	%%remove_pulses_loop:

	PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm

	add Kd, 1
	jnz %%remove_pulses_loop

	addps m6, m6 ; Syy*=2

	align 16
	%%finish:
	lea r4d, [Nd - mmsize]
	movaps m2, [const_float_sign_mask]

	align 16
	%%restore_sign_loop:
	movaps m0, [tmpY + r4] ; m0 = Y[i]
	movups m1, [inXq + r4] ; m1 = X[i]
	andps m1, m2 ; m1 = sign(X[i])
	orps m0, m1 ; m0 = Y[i]*sign
	cvtps2dq m3, m0 ; m3 = (int)m0
	movaps [outYq + r4], m3

	sub r4d, mmsize
	jnc %%restore_sign_loop
	%%return:

	%if ARCH_X86_64 == 0 ; sbrdsp
	movss r0m, xm6 ; return (float)Syy_norm
	fld dword r0m
	%else
	movaps m0, m6 ; return (float)Syy_norm
	%endif

	RET

	align 16
	%%zero_input:
	lea r4d, [Nd - mmsize]
	xorps m0, m0
	%%zero_loop:
	movaps [outYq + r4], m0
	sub r4d, mmsize
	jnc %%zero_loop

	movaps m6, [const_float_1]
	jmp %%return
	%endmacro

	; if 1, use a float op that give half precision but execute for around 3 cycles.
	; On Skylake & Ryzen the division is much faster (around 11c/3),
	; that makes the full precision code about 2% slower.
	; Opus also does use rsqrt approximation in their intrinsics code.
	%define USE_APPROXIMATION 1

	INIT_XMM sse2
	PVQ_FAST_SEARCH _approx

	INIT_XMM sse4
	PVQ_FAST_SEARCH _approx

	%define USE_APPROXIMATION 0

	INIT_XMM avx
	PVQ_FAST_SEARCH _exact