/* blob: b2bf70d5ee9ea95810e8d4750c2ae55a86c525fa [file] [log] [blame] */
#if defined(__i386__)
.text
.align 64
/*
 * GFp_poly1305_init_asm -- reset the Poly1305 state, load the clamped key
 * and report which block/emit routines the caller should use.
 * ABI: i386 cdecl, all arguments on the stack.
 *
 * Stack arguments (offsets are valid after the four pushes below):
 *   20(%esp)  state pointer. Dwords at offsets 0..20 are zeroed (the
 *             accumulator plus the dword at offset 20 that the SSE2
 *             routines in this file test as their base 2^26 flag); the
 *             clamped key is written to offsets 24..39.
 *   24(%esp)  pointer to 16 key bytes, or NULL.
 *   28(%esp)  pointer to a two-entry table: entry 0 receives the address
 *             of the blocks routine, entry 1 the address of the emit one.
 *
 * Returns in %eax: 0 if the key pointer was NULL (the state is zeroed and
 * nothing else happens), 1 otherwise.
 * %ebp, %ebx, %esi and %edi are preserved; %ecx, %edx and flags are not.
 */
.globl GFp_poly1305_init_asm
.hidden GFp_poly1305_init_asm
.type GFp_poly1305_init_asm,@function
.align 16
GFp_poly1305_init_asm:
.L_GFp_poly1305_init_asm_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ebp
/* Zero the first six state dwords; %eax == 0 doubles as the NULL-key
 * return value. */
xorl %eax,%eax
movl %eax,(%edi)
movl %eax,4(%edi)
movl %eax,8(%edi)
movl %eax,12(%edi)
movl %eax,16(%edi)
movl %eax,20(%edi)
testl %esi,%esi
je .L000nokey
/* call/pop yields the run-time address of .L001pic_point in %ebx so that
 * every symbol below can be addressed position-independently. */
call .L001pic_point
.L001pic_point:
popl %ebx
/* Default choice: the integer-only routines. */
leal GFp_poly1305_blocks-.L001pic_point(%ebx),%eax
leal GFp_poly1305_emit-.L001pic_point(%ebx),%edx
leal GFp_ia32cap_P-.L001pic_point(%ebx),%edi
movl (%edi),%ecx
/* 83886080 == 0x05000000, i.e. bits 24 and 26 of the first capability
 * word (presumably CPUID.1:EDX FXSR and SSE2 -- confirm against the code
 * that fills GFp_ia32cap_P). Only when both bits are set are the SSE2
 * routines installed; otherwise the defaults chosen above are kept. */
andl $83886080,%ecx
cmpl $83886080,%ecx
jne .Linit_no_sse2
leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx
.Linit_no_sse2:
/* %edi was reused for the capability pointer; reload the state pointer. */
movl 20(%esp),%edi
movl %eax,(%ebp)
movl %edx,4(%ebp)
/* Load the four key dwords and clamp them: 268435455 == 0x0fffffff for
 * word 0 and 268435452 == 0x0ffffffc for words 1..3. */
movl (%esi),%eax
movl 4(%esi),%ebx
movl 8(%esi),%ecx
movl 12(%esi),%edx
andl $268435455,%eax
andl $268435452,%ebx
andl $268435452,%ecx
andl $268435452,%edx
movl %eax,24(%edi)
movl %ebx,28(%edi)
movl %ecx,32(%edi)
movl %edx,36(%edi)
movl $1,%eax
.L000nokey:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size GFp_poly1305_init_asm,.-.L_GFp_poly1305_init_asm_begin
/*
 * GFp_poly1305_blocks -- integer-only Poly1305 block function operating on
 * a base 2^32 accumulator. ABI: i386 cdecl, all arguments on the stack.
 *
 * Stack arguments (offsets are valid after the four pushes below):
 *   20(%esp)  state pointer: accumulator h0..h4 at offsets 0..16, clamped
 *             key r0..r3 at offsets 24..36
 *   24(%esp)  input pointer
 *   28(%esp)  input length in bytes; only whole 16-byte blocks are consumed
 *   32(%esp)  value added into h4 together with every block (pad bit)
 *
 * .Lenter_blocks is a secondary entry used by _poly1305_blocks_sse2. It
 * expects the same four registers to be pushed already and %edi = state,
 * %esi = input, %ecx = length.
 *
 * %ebp, %ebx, %esi and %edi are preserved; %eax, %ecx, %edx and flags are
 * clobbered. No meaningful value is returned.
 *
 * Frame layout after "subl $64,%esp":
 *    0(%esp)            h0 + m0 of the current block
 *   12(%esp), 16(%esp)  h3 + m3, h4 + pad bit
 *   20..28(%esp)        low three words of the product (d0, d1, d2)
 *   36..48(%esp)        r0, r1, r2, r3
 *   52..60(%esp)        s1, s2, s3 with s_i = r_i + (r_i >> 2)
 *   84(%esp)            state pointer argument
 *   92(%esp)            end-of-input pointer (overwrites the length slot)
 *   96(%esp)            pad bit argument
 */
.globl GFp_poly1305_blocks
.hidden GFp_poly1305_blocks
.type GFp_poly1305_blocks,@function
.align 16
GFp_poly1305_blocks:
.L_GFp_poly1305_blocks_begin:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
.Lenter_blocks:
/* Round the length down to whole 16-byte blocks. The loop below ends on
 * pointer equality with input+length, so the low four bits must all be
 * cleared; a stray low bit would make the end pointer unreachable. This
 * is the same mask that _poly1305_blocks_sse2 applies. */
andl $-16,%ecx
jz .L002nodata
subl $64,%esp
movl 24(%edi),%eax
movl 28(%edi),%ebx
leal (%esi,%ecx,1),%ebp
movl 32(%edi),%ecx
movl 36(%edi),%edx
/* Keep the end pointer in the (now dead) length argument slot and use
 * %ebp as the running input pointer. */
movl %ebp,92(%esp)
movl %esi,%ebp
/* Spill r0..r3 and compute s1..s3 = r_i + (r_i >> 2). */
movl %eax,36(%esp)
movl %ebx,%eax
shrl $2,%eax
movl %ebx,40(%esp)
addl %ebx,%eax
movl %ecx,%ebx
shrl $2,%ebx
movl %ecx,44(%esp)
addl %ecx,%ebx
movl %edx,%ecx
shrl $2,%ecx
movl %edx,48(%esp)
addl %edx,%ecx
movl %eax,52(%esp)
movl %ebx,56(%esp)
movl %ecx,60(%esp)
/* Load the accumulator: h0..h4 in %eax, %ebx, %ecx, %esi, %edi. */
movl (%edi),%eax
movl 4(%edi),%ebx
movl 8(%edi),%ecx
movl 12(%edi),%esi
movl 16(%edi),%edi
jmp .L003loop
.align 32
.L003loop:
/* h += next 16 input bytes, plus the pad bit into h4. */
addl (%ebp),%eax
adcl 4(%ebp),%ebx
adcl 8(%ebp),%ecx
adcl 12(%ebp),%esi
leal 16(%ebp),%ebp
adcl 96(%esp),%edi
movl %eax,(%esp)
movl %esi,12(%esp)
/* d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1, accumulated in %esi:%edi. */
mull 36(%esp)
movl %edi,16(%esp)
movl %eax,%edi
movl %ebx,%eax
movl %edx,%esi
mull 60(%esp)
addl %eax,%edi
movl %ecx,%eax
adcl %edx,%esi
mull 56(%esp)
addl %eax,%edi
movl 12(%esp),%eax
adcl %edx,%esi
mull 52(%esp)
addl %eax,%edi
movl (%esp),%eax
adcl %edx,%esi
/* d1 = carry + h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1, in %edi:%esi. */
mull 40(%esp)
movl %edi,20(%esp)
xorl %edi,%edi
addl %eax,%esi
movl %ebx,%eax
adcl %edx,%edi
mull 36(%esp)
addl %eax,%esi
movl %ecx,%eax
adcl %edx,%edi
mull 60(%esp)
addl %eax,%esi
movl 12(%esp),%eax
adcl %edx,%edi
mull 56(%esp)
addl %eax,%esi
movl 16(%esp),%eax
adcl %edx,%edi
imull 52(%esp),%eax
addl %eax,%esi
movl (%esp),%eax
adcl $0,%edi
/* d2 = carry + h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2, in %esi:%edi. */
mull 44(%esp)
movl %esi,24(%esp)
xorl %esi,%esi
addl %eax,%edi
movl %ebx,%eax
adcl %edx,%esi
mull 40(%esp)
addl %eax,%edi
movl %ecx,%eax
adcl %edx,%esi
mull 36(%esp)
addl %eax,%edi
movl 12(%esp),%eax
adcl %edx,%esi
mull 60(%esp)
addl %eax,%edi
movl 16(%esp),%eax
adcl %edx,%esi
imull 56(%esp),%eax
addl %eax,%edi
movl (%esp),%eax
adcl $0,%esi
/* d3 = carry + h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3, in %edi:%esi. */
mull 48(%esp)
movl %edi,28(%esp)
xorl %edi,%edi
addl %eax,%esi
movl %ebx,%eax
adcl %edx,%edi
mull 44(%esp)
addl %eax,%esi
movl %ecx,%eax
adcl %edx,%edi
mull 40(%esp)
addl %eax,%esi
movl 12(%esp),%eax
adcl %edx,%edi
mull 36(%esp)
addl %eax,%esi
movl 16(%esp),%ecx
adcl %edx,%edi
movl %ecx,%edx
imull 60(%esp),%ecx
addl %ecx,%esi
movl 20(%esp),%eax
adcl $0,%edi
/* d4 = carry + h4*r0 ends up in %edx. */
imull 36(%esp),%edx
addl %edi,%edx
movl 24(%esp),%ebx
movl 28(%esp),%ecx
/* Partial reduction: keep the low two bits of d4 as the new h4 and fold
 * (d4 >> 2) * 5 back into the low words. */
movl %edx,%edi
shrl $2,%edx
andl $3,%edi
leal (%edx,%edx,4),%edx
addl %edx,%eax
adcl $0,%ebx
adcl $0,%ecx
adcl $0,%esi
adcl $0,%edi
cmpl 92(%esp),%ebp
jne .L003loop
/* Reload the state pointer argument and write the accumulator back. */
movl 84(%esp),%edx
addl $64,%esp
movl %eax,(%edx)
movl %ebx,4(%edx)
movl %ecx,8(%edx)
movl %esi,12(%edx)
movl %edi,16(%edx)
.L002nodata:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size GFp_poly1305_blocks,.-.L_GFp_poly1305_blocks_begin
/*
 * GFp_poly1305_emit -- finish a base 2^32 accumulator and write the tag.
 * ABI: i386 cdecl, all arguments on the stack.
 *
 * Stack arguments (offsets are valid after the four pushes below):
 *   20(%esp)  state pointer, accumulator h0..h4 at offsets 0..16
 *   24(%esp)  output pointer, receives 16 bytes
 *   28(%esp)  pointer to four dwords that are added to the result
 *
 * .Lenter_emit is a secondary entry used by _poly1305_emit_sse2; it
 * expects the four registers to be pushed and %ebp = state pointer.
 *
 * %ebp, %ebx, %esi and %edi are preserved. The selection between h and
 * h + 5 is done with a mask, without branches. The output buffer is used
 * as scratch space before the final result is stored.
 */
.globl	GFp_poly1305_emit
.hidden	GFp_poly1305_emit
.type	GFp_poly1305_emit,@function
.align	16
GFp_poly1305_emit:
.L_GFp_poly1305_emit_begin:
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
.Lenter_emit:
	/* g = h + 5; loads are interleaved with the carry chain (movl does
	 * not touch the flags). */
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	addl	$5,%eax
	movl	8(%ebp),%ecx
	adcl	$0,%ebx
	movl	12(%ebp),%edx
	adcl	$0,%ecx
	movl	16(%ebp),%esi
	adcl	$0,%edx
	adcl	$0,%esi
	movl	24(%esp),%edi

	/* mask = -(g4 >> 2): all ones when the addition carried past bit 130. */
	shrl	$2,%esi
	negl	%esi

	/* Park (g & mask) in the output buffer. */
	andl	%esi,%eax
	movl	%eax,(%edi)
	andl	%esi,%ebx
	movl	%ebx,4(%edi)
	andl	%esi,%ecx
	movl	%ecx,8(%edi)
	andl	%esi,%edx
	movl	%edx,12(%edi)

	/* result = (h & ~mask) | (g & mask) */
	notl	%esi
	movl	(%ebp),%eax
	andl	%esi,%eax
	orl	(%edi),%eax
	movl	4(%ebp),%ebx
	andl	%esi,%ebx
	orl	4(%edi),%ebx
	movl	8(%ebp),%ecx
	andl	%esi,%ecx
	orl	8(%edi),%ecx
	movl	12(%ebp),%edx
	andl	%esi,%edx
	orl	12(%edi),%edx

	/* Add the four dwords of the third argument, modulo 2^128. */
	movl	28(%esp),%ebp
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%edx

	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)

	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	GFp_poly1305_emit,.-.L_GFp_poly1305_emit_begin
/*
 * _poly1305_init_sse2 -- internal helper, called only from
 * _poly1305_blocks_sse2 with a private register convention:
 *   %edi  state pointer (the 16 key bytes are read from 24(%edi))
 *   %ebx  address of .Lconst_sse2
 * It splits the key into five 26-bit limbs, runs the multiply/reduce body
 * twice and stores a table of limb vectors at 48(state)..191(state):
 * five 16-byte rows of limbs 0..4 followed by four rows holding 5 * limb
 * for limbs 1..4.
 * On return %edi and %esp are restored; %ebp, %ecx, %edx and %xmm0-%xmm7
 * are clobbered (%ebp is used to remember the caller's %esp).
 * NOTE(review): from the punpcklqdq/por/pshufd $141 sequence each row
 * appears to hold the matching limb of r^4, r^3, r^2, r in dwords 0..3 --
 * confirm against the generating perlasm script.
 */
.align 32
.hidden _poly1305_init_sse2
.type _poly1305_init_sse2,@function
.align 16
_poly1305_init_sse2:
movdqu 24(%edi),%xmm4
leal 48(%edi),%edi
/* 224-byte scratch frame, aligned to 16 for the movdqa accesses below. */
movl %esp,%ebp
subl $224,%esp
andl $-16,%esp
/* %xmm7 = 26-bit limb mask from the constant table. */
movq 64(%ebx),%xmm7
/* Split the 128-bit key into limbs at bit offsets 0, 26, 52, 78 and 104
 * (psrldq shifts by whole bytes, psrlq by bits). */
movdqa %xmm4,%xmm0
movdqa %xmm4,%xmm1
movdqa %xmm4,%xmm2
pand %xmm7,%xmm0
psrlq $26,%xmm1
psrldq $6,%xmm2
pand %xmm7,%xmm1
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
pand %xmm7,%xmm2
pand %xmm7,%xmm3
psrldq $13,%xmm4
leal 144(%esp),%edx
/* Two passes through the multiply/reduce body. */
movl $2,%ecx
.L004square:
/* Save the current limbs at 0..64(%esp) and 5 * limb (computed as
 * (x << 2) + x) for limbs 1..4 at 80..128(%esp). */
movdqa %xmm0,(%esp)
movdqa %xmm1,16(%esp)
movdqa %xmm2,32(%esp)
movdqa %xmm3,48(%esp)
movdqa %xmm4,64(%esp)
movdqa %xmm1,%xmm6
movdqa %xmm2,%xmm5
pslld $2,%xmm6
pslld $2,%xmm5
paddd %xmm1,%xmm6
paddd %xmm2,%xmm5
movdqa %xmm6,80(%esp)
movdqa %xmm5,96(%esp)
movdqa %xmm3,%xmm6
movdqa %xmm4,%xmm5
pslld $2,%xmm6
pslld $2,%xmm5
paddd %xmm3,%xmm6
paddd %xmm4,%xmm5
movdqa %xmm6,112(%esp)
movdqa %xmm5,128(%esp)
/* pshufd $68 copies the low qword into both qword lanes; these copies
 * are stored at 0..64(%edx) and act as the multiplier. */
pshufd $68,%xmm0,%xmm6
movdqa %xmm1,%xmm5
pshufd $68,%xmm1,%xmm1
pshufd $68,%xmm2,%xmm2
pshufd $68,%xmm3,%xmm3
pshufd $68,%xmm4,%xmm4
movdqa %xmm6,(%edx)
movdqa %xmm1,16(%edx)
movdqa %xmm2,32(%edx)
movdqa %xmm3,48(%edx)
movdqa %xmm4,64(%edx)
/* 5x5 limb multiplication; products accumulate in %xmm0..%xmm4. The
 * 5 * limb values at 80..128(%esp) are used for the wrapped-around terms. */
pmuludq %xmm0,%xmm4
pmuludq %xmm0,%xmm3
pmuludq %xmm0,%xmm2
pmuludq %xmm0,%xmm1
pmuludq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 48(%edx),%xmm5
movdqa %xmm6,%xmm7
pmuludq 32(%edx),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa 80(%esp),%xmm6
pmuludq (%edx),%xmm5
paddq %xmm7,%xmm2
pmuludq 64(%edx),%xmm6
movdqa 32(%esp),%xmm7
paddq %xmm5,%xmm1
movdqa %xmm7,%xmm5
pmuludq 32(%edx),%xmm7
paddq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 16(%edx),%xmm5
paddq %xmm7,%xmm4
movdqa 96(%esp),%xmm7
pmuludq (%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 64(%edx),%xmm7
paddq %xmm6,%xmm2
pmuludq 48(%edx),%xmm5
movdqa 48(%esp),%xmm6
paddq %xmm7,%xmm1
movdqa %xmm6,%xmm7
pmuludq 16(%edx),%xmm6
paddq %xmm5,%xmm0
movdqa 112(%esp),%xmm5
pmuludq (%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 64(%edx),%xmm5
paddq %xmm7,%xmm3
movdqa %xmm6,%xmm7
pmuludq 48(%edx),%xmm6
paddq %xmm5,%xmm2
pmuludq 32(%edx),%xmm7
movdqa 64(%esp),%xmm5
paddq %xmm6,%xmm1
movdqa 128(%esp),%xmm6
pmuludq (%edx),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 64(%edx),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 32(%edx),%xmm5
paddq %xmm7,%xmm0
pmuludq 48(%edx),%xmm6
/* Reload the 26-bit mask and propagate carries between limbs; the carry
 * out of limb 4 is multiplied by 5 (x + (x << 2)) and added to limb 0. */
movdqa 64(%ebx),%xmm7
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
movdqa %xmm3,%xmm5
pand %xmm7,%xmm3
psrlq $26,%xmm5
paddq %xmm4,%xmm5
movdqa %xmm0,%xmm6
pand %xmm7,%xmm0
psrlq $26,%xmm6
movdqa %xmm5,%xmm4
paddq %xmm1,%xmm6
psrlq $26,%xmm5
pand %xmm7,%xmm4
movdqa %xmm6,%xmm1
psrlq $26,%xmm6
paddd %xmm5,%xmm0
psllq $2,%xmm5
paddq %xmm2,%xmm6
paddq %xmm0,%xmm5
pand %xmm7,%xmm1
movdqa %xmm6,%xmm2
psrlq $26,%xmm6
pand %xmm7,%xmm2
paddd %xmm3,%xmm6
movdqa %xmm5,%xmm0
psrlq $26,%xmm5
movdqa %xmm6,%xmm3
psrlq $26,%xmm6
pand %xmm7,%xmm0
paddd %xmm5,%xmm1
pand %xmm7,%xmm3
paddd %xmm6,%xmm4
decl %ecx
jz .L005square_break
/* After the first pass: put the previous limbs into the high qword next
 * to the fresh result and go round once more. */
punpcklqdq (%esp),%xmm0
punpcklqdq 16(%esp),%xmm1
punpcklqdq 32(%esp),%xmm2
punpcklqdq 48(%esp),%xmm3
punpcklqdq 64(%esp),%xmm4
jmp .L004square
.L005square_break:
/* Move the second-pass results into the odd dwords, merge the values
 * saved at 0..64(%esp) into the even dwords, then reorder the dwords of
 * each row with pshufd $141 (source order 1, 3, 0, 2). */
psllq $32,%xmm0
psllq $32,%xmm1
psllq $32,%xmm2
psllq $32,%xmm3
psllq $32,%xmm4
por (%esp),%xmm0
por 16(%esp),%xmm1
por 32(%esp),%xmm2
por 48(%esp),%xmm3
por 64(%esp),%xmm4
pshufd $141,%xmm0,%xmm0
pshufd $141,%xmm1,%xmm1
pshufd $141,%xmm2,%xmm2
pshufd $141,%xmm3,%xmm3
pshufd $141,%xmm4,%xmm4
/* Store limb rows 0..4, then 5 * limb for rows 1..4, into the state
 * (%edi currently points at state + 48). */
movdqu %xmm0,(%edi)
movdqu %xmm1,16(%edi)
movdqu %xmm2,32(%edi)
movdqu %xmm3,48(%edi)
movdqu %xmm4,64(%edi)
movdqa %xmm1,%xmm6
movdqa %xmm2,%xmm5
pslld $2,%xmm6
pslld $2,%xmm5
paddd %xmm1,%xmm6
paddd %xmm2,%xmm5
movdqu %xmm6,80(%edi)
movdqu %xmm5,96(%edi)
movdqa %xmm3,%xmm6
movdqa %xmm4,%xmm5
pslld $2,%xmm6
pslld $2,%xmm5
paddd %xmm3,%xmm6
paddd %xmm4,%xmm5
movdqu %xmm6,112(%edi)
movdqu %xmm5,128(%edi)
/* Drop the scratch frame and restore the state pointer. */
movl %ebp,%esp
leal -48(%edi),%edi
ret
.size _poly1305_init_sse2,.-_poly1305_init_sse2
/*
 * _poly1305_blocks_sse2 -- SSE2 Poly1305 block function; its address is
 * handed out by GFp_poly1305_init_asm. ABI: i386 cdecl.
 *
 * Stack arguments (offsets are valid after the four pushes below):
 *   20(%esp)  state pointer. The dword at offset 20 is a flag: zero means
 *             the accumulator at offsets 0..16 is in base 2^32, non-zero
 *             means it holds five 26-bit limbs. Offsets 48.. hold the
 *             table written by _poly1305_init_sse2.
 *   24(%esp)  input pointer
 *   28(%esp)  input length in bytes
 *   32(%esp)  pad bit; it is shifted left by 24 and added to limb 4
 *
 * Inputs shorter than 64 bytes whose state is still in base 2^32 are
 * handed to the integer code at .Lenter_blocks. Otherwise the state is
 * converted to base 2^26 on first use (the flag is set to 1) and stays
 * in that form.
 *
 * %ebp, %ebx, %esi and %edi are preserved; %eax, %ecx, %edx, %xmm0-%xmm7
 * and flags are clobbered. Several conditional jumps below consume flags
 * set many instructions earlier; this relies on the SSE instructions, lea,
 * mov and cmov in between leaving EFLAGS untouched.
 * NOTE(review): the lane/power comments below assume the table rows hold
 * r^4, r^3, r^2, r in dwords 0..3 -- confirm against the perlasm source.
 */
.align 32
.hidden _poly1305_blocks_sse2
.type _poly1305_blocks_sse2,@function
.align 16
_poly1305_blocks_sse2:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
/* %eax = base 2^26 flag. */
movl 20(%edi),%eax
andl $-16,%ecx
jz .L006nodata
cmpl $64,%ecx
jae .L007enter_sse2
/* Short input and accumulator still in base 2^32: use the integer code. */
testl %eax,%eax
jz .Lenter_blocks
.align 16
.L007enter_sse2:
/* %ebx = address of .Lconst_sse2, computed position-independently. */
call .L008pic_point
.L008pic_point:
popl %ebx
leal .Lconst_sse2-.L008pic_point(%ebx),%ebx
testl %eax,%eax
jnz .L009base2_26
/* First SSE2 use: build the table, then convert the accumulator from
 * base 2^32 to 26-bit limbs using overlapping loads at byte offsets
 * 0, 3, 6, 9 and 13 followed by shifts of 0, 2, 4, 6 and 0 bits. */
call _poly1305_init_sse2
movl (%edi),%eax
movl 3(%edi),%ecx
movl 6(%edi),%edx
movl 9(%edi),%esi
movl 13(%edi),%ebp
movl $1,20(%edi)
shrl $2,%ecx
andl $67108863,%eax
shrl $4,%edx
andl $67108863,%ecx
shrl $6,%esi
andl $67108863,%edx
movd %eax,%xmm0
movd %ecx,%xmm1
movd %edx,%xmm2
movd %esi,%xmm3
movd %ebp,%xmm4
/* %esi and %ecx were used as scratch; reload input pointer and length. */
movl 24(%esp),%esi
movl 28(%esp),%ecx
jmp .L010base2_32
.align 16
.L009base2_26:
/* Accumulator is already stored as five 26-bit limbs. */
movd (%edi),%xmm0
movd 4(%edi),%xmm1
movd 8(%edi),%xmm2
movd 12(%edi),%xmm3
movd 16(%edi),%xmm4
movdqa 64(%ebx),%xmm7
.L010base2_32:
/* %eax = pad bit << 24. Set up a 528-byte, 16-byte-aligned frame with the
 * caller's %esp kept in %ebp, and point %edi at the table (state + 48). */
movl 32(%esp),%eax
movl %esp,%ebp
subl $528,%esp
andl $-16,%esp
leal 48(%edi),%edi
shll $24,%eax
testl $31,%ecx
jz .L011even
/* Length is not a multiple of 32: consume a single block first.
 * Split it into 26-bit limbs and add them (plus the pad bit) to h. */
movdqu (%esi),%xmm6
leal 16(%esi),%esi
movdqa %xmm6,%xmm5
pand %xmm7,%xmm6
paddd %xmm6,%xmm0
movdqa %xmm5,%xmm6
psrlq $26,%xmm5
psrldq $6,%xmm6
pand %xmm7,%xmm5
paddd %xmm5,%xmm1
movdqa %xmm6,%xmm5
psrlq $4,%xmm6
pand %xmm7,%xmm6
paddd %xmm6,%xmm2
movdqa %xmm5,%xmm6
psrlq $30,%xmm5
pand %xmm7,%xmm5
psrldq $7,%xmm6
paddd %xmm5,%xmm3
movd %eax,%xmm5
paddd %xmm6,%xmm4
movd 12(%edi),%xmm6
paddd %xmm5,%xmm4
/* Multiply h by the dword-3 entries of the table rows (offsets 12, 28,
 * 44, 60, 76 and the 5x rows at 92..140). */
movdqa %xmm0,(%esp)
movdqa %xmm1,16(%esp)
movdqa %xmm2,32(%esp)
movdqa %xmm3,48(%esp)
movdqa %xmm4,64(%esp)
pmuludq %xmm6,%xmm0
pmuludq %xmm6,%xmm1
pmuludq %xmm6,%xmm2
movd 28(%edi),%xmm5
pmuludq %xmm6,%xmm3
pmuludq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 48(%esp),%xmm5
movdqa %xmm6,%xmm7
pmuludq 32(%esp),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%esp),%xmm7
paddq %xmm6,%xmm3
movd 92(%edi),%xmm6
pmuludq (%esp),%xmm5
paddq %xmm7,%xmm2
pmuludq 64(%esp),%xmm6
movd 44(%edi),%xmm7
paddq %xmm5,%xmm1
movdqa %xmm7,%xmm5
pmuludq 32(%esp),%xmm7
paddq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 16(%esp),%xmm5
paddq %xmm7,%xmm4
movd 108(%edi),%xmm7
pmuludq (%esp),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 64(%esp),%xmm7
paddq %xmm6,%xmm2
pmuludq 48(%esp),%xmm5
movd 60(%edi),%xmm6
paddq %xmm7,%xmm1
movdqa %xmm6,%xmm7
pmuludq 16(%esp),%xmm6
paddq %xmm5,%xmm0
movd 124(%edi),%xmm5
pmuludq (%esp),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 64(%esp),%xmm5
paddq %xmm7,%xmm3
movdqa %xmm6,%xmm7
pmuludq 48(%esp),%xmm6
paddq %xmm5,%xmm2
pmuludq 32(%esp),%xmm7
movd 76(%edi),%xmm5
paddq %xmm6,%xmm1
movd 140(%edi),%xmm6
pmuludq (%esp),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 64(%esp),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%esp),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 32(%esp),%xmm5
paddq %xmm7,%xmm0
pmuludq 48(%esp),%xmm6
/* Carry propagation between limbs (same pattern as in
 * _poly1305_init_sse2); %xmm7 = 26-bit mask. */
movdqa 64(%ebx),%xmm7
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
movdqa %xmm3,%xmm5
pand %xmm7,%xmm3
psrlq $26,%xmm5
paddq %xmm4,%xmm5
movdqa %xmm0,%xmm6
pand %xmm7,%xmm0
psrlq $26,%xmm6
movdqa %xmm5,%xmm4
paddq %xmm1,%xmm6
psrlq $26,%xmm5
pand %xmm7,%xmm4
movdqa %xmm6,%xmm1
psrlq $26,%xmm6
paddd %xmm5,%xmm0
psllq $2,%xmm5
paddq %xmm2,%xmm6
paddq %xmm0,%xmm5
pand %xmm7,%xmm1
movdqa %xmm6,%xmm2
psrlq $26,%xmm6
pand %xmm7,%xmm2
paddd %xmm3,%xmm6
movdqa %xmm5,%xmm0
psrlq $26,%xmm5
movdqa %xmm6,%xmm3
psrlq $26,%xmm6
pand %xmm7,%xmm0
paddd %xmm5,%xmm1
pand %xmm7,%xmm3
paddd %xmm6,%xmm4
subl $16,%ecx
jz .L012done
.L011even:
/* Copy the table to the aligned frame. For each 16-byte row, pshufd $68
 * (dwords 0,1 duplicated) goes to 0..128(%edx) and pshufd $238 (dwords
 * 2,3 duplicated) goes to -144..-16(%edx), with %edx = %esp + 384.
 * The subl $64 sets the flags consumed by cmovb here and by the jbe
 * before .L014loop: with fewer than 64 bytes left, %esi is moved back by
 * 32 so that the loads at 32(%esi)/48(%esi) fetch the first two blocks. */
leal 384(%esp),%edx
leal -32(%esi),%eax
subl $64,%ecx
movdqu (%edi),%xmm5
pshufd $68,%xmm5,%xmm6
cmovbl %eax,%esi
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,(%edx)
leal 160(%esp),%eax
movdqu 16(%edi),%xmm6
movdqa %xmm5,-144(%edx)
pshufd $68,%xmm6,%xmm5
pshufd $238,%xmm6,%xmm6
movdqa %xmm5,16(%edx)
movdqu 32(%edi),%xmm5
movdqa %xmm6,-128(%edx)
pshufd $68,%xmm5,%xmm6
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,32(%edx)
movdqu 48(%edi),%xmm6
movdqa %xmm5,-112(%edx)
pshufd $68,%xmm6,%xmm5
pshufd $238,%xmm6,%xmm6
movdqa %xmm5,48(%edx)
movdqu 64(%edi),%xmm5
movdqa %xmm6,-96(%edx)
pshufd $68,%xmm5,%xmm6
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,64(%edx)
movdqu 80(%edi),%xmm6
movdqa %xmm5,-80(%edx)
pshufd $68,%xmm6,%xmm5
pshufd $238,%xmm6,%xmm6
movdqa %xmm5,80(%edx)
movdqu 96(%edi),%xmm5
movdqa %xmm6,-64(%edx)
pshufd $68,%xmm5,%xmm6
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,96(%edx)
movdqu 112(%edi),%xmm6
movdqa %xmm5,-48(%edx)
pshufd $68,%xmm6,%xmm5
pshufd $238,%xmm6,%xmm6
movdqa %xmm5,112(%edx)
movdqu 128(%edi),%xmm5
movdqa %xmm6,-32(%edx)
pshufd $68,%xmm5,%xmm6
pshufd $238,%xmm5,%xmm5
movdqa %xmm6,128(%edx)
movdqa %xmm5,-16(%edx)
/* Load two blocks, park h limbs 2..4 at 112..144(%esp), and split the
 * two blocks into five vectors of 26-bit limbs (one block per qword
 * lane); (%ebx) supplies the 2^24 pad bit for limb 4. */
movdqu 32(%esi),%xmm5
movdqu 48(%esi),%xmm6
leal 32(%esi),%esi
movdqa %xmm2,112(%esp)
movdqa %xmm3,128(%esp)
movdqa %xmm4,144(%esp)
movdqa %xmm5,%xmm2
movdqa %xmm6,%xmm3
psrldq $6,%xmm2
psrldq $6,%xmm3
movdqa %xmm5,%xmm4
punpcklqdq %xmm3,%xmm2
punpckhqdq %xmm6,%xmm4
punpcklqdq %xmm6,%xmm5
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
movdqa %xmm5,%xmm6
psrlq $40,%xmm4
psrlq $26,%xmm6
pand %xmm7,%xmm5
pand %xmm7,%xmm6
pand %xmm7,%xmm2
pand %xmm7,%xmm3
por (%ebx),%xmm4
/* Park h limbs 0..1 at 80..96(%esp). */
movdqa %xmm0,80(%esp)
movdqa %xmm1,96(%esp)
/* Flags still come from "subl $64,%ecx" above. */
jbe .L013skip_loop
jmp .L014loop
.align 32
.L014loop:
/* Main loop, 64 bytes per iteration. First half: multiply the two
 * blocks just split by the multiplier stored at -144..-16(%edx). */
movdqa -144(%edx),%xmm7
movdqa %xmm6,16(%eax)
movdqa %xmm2,32(%eax)
movdqa %xmm3,48(%eax)
movdqa %xmm4,64(%eax)
movdqa %xmm5,%xmm1
pmuludq %xmm7,%xmm5
movdqa %xmm6,%xmm0
pmuludq %xmm7,%xmm6
pmuludq %xmm7,%xmm2
pmuludq %xmm7,%xmm3
pmuludq %xmm7,%xmm4
pmuludq -16(%edx),%xmm0
movdqa %xmm1,%xmm7
pmuludq -128(%edx),%xmm1
paddq %xmm5,%xmm0
movdqa %xmm7,%xmm5
pmuludq -112(%edx),%xmm7
paddq %xmm6,%xmm1
movdqa %xmm5,%xmm6
pmuludq -96(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa 16(%eax),%xmm7
pmuludq -80(%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq -128(%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq -112(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa 32(%eax),%xmm7
pmuludq -96(%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq -32(%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq -16(%edx),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq -128(%edx),%xmm6
paddq %xmm5,%xmm1
movdqa 48(%eax),%xmm5
pmuludq -112(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq -48(%edx),%xmm5
paddq %xmm7,%xmm4
movdqa %xmm6,%xmm7
pmuludq -32(%edx),%xmm6
paddq %xmm5,%xmm0
movdqa %xmm7,%xmm5
pmuludq -16(%edx),%xmm7
paddq %xmm6,%xmm1
movdqa 64(%eax),%xmm6
pmuludq -128(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa %xmm6,%xmm7
pmuludq -16(%edx),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq -64(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq -48(%edx),%xmm5
paddq %xmm7,%xmm0
movdqa 64(%ebx),%xmm7
pmuludq -32(%edx),%xmm6
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
/* Second half: load the two blocks that precede the pair just processed,
 * split them into limbs, add the parked h (80..144(%esp)) and multiply
 * by the multiplier stored at 0..128(%edx), accumulating onto the
 * first-half products. */
movdqu -32(%esi),%xmm5
movdqu -16(%esi),%xmm6
leal 32(%esi),%esi
movdqa %xmm2,32(%esp)
movdqa %xmm3,48(%esp)
movdqa %xmm4,64(%esp)
movdqa %xmm5,%xmm2
movdqa %xmm6,%xmm3
psrldq $6,%xmm2
psrldq $6,%xmm3
movdqa %xmm5,%xmm4
punpcklqdq %xmm3,%xmm2
punpckhqdq %xmm6,%xmm4
punpcklqdq %xmm6,%xmm5
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
movdqa %xmm5,%xmm6
psrlq $40,%xmm4
psrlq $26,%xmm6
pand %xmm7,%xmm5
pand %xmm7,%xmm6
pand %xmm7,%xmm2
pand %xmm7,%xmm3
por (%ebx),%xmm4
/* Loop counter update; the resulting flags drive the cmovb below and the
 * "ja .L014loop" at the bottom of the loop. */
leal -32(%esi),%eax
subl $64,%ecx
paddd 80(%esp),%xmm5
paddd 96(%esp),%xmm6
paddd 112(%esp),%xmm2
paddd 128(%esp),%xmm3
paddd 144(%esp),%xmm4
cmovbl %eax,%esi
leal 160(%esp),%eax
movdqa (%edx),%xmm7
movdqa %xmm1,16(%esp)
movdqa %xmm6,16(%eax)
movdqa %xmm2,32(%eax)
movdqa %xmm3,48(%eax)
movdqa %xmm4,64(%eax)
movdqa %xmm5,%xmm1
pmuludq %xmm7,%xmm5
paddq %xmm0,%xmm5
movdqa %xmm6,%xmm0
pmuludq %xmm7,%xmm6
pmuludq %xmm7,%xmm2
pmuludq %xmm7,%xmm3
pmuludq %xmm7,%xmm4
paddq 16(%esp),%xmm6
paddq 32(%esp),%xmm2
paddq 48(%esp),%xmm3
paddq 64(%esp),%xmm4
pmuludq 128(%edx),%xmm0
movdqa %xmm1,%xmm7
pmuludq 16(%edx),%xmm1
paddq %xmm5,%xmm0
movdqa %xmm7,%xmm5
pmuludq 32(%edx),%xmm7
paddq %xmm6,%xmm1
movdqa %xmm5,%xmm6
pmuludq 48(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa 16(%eax),%xmm7
pmuludq 64(%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 16(%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 32(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa 32(%eax),%xmm7
pmuludq 48(%edx),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 112(%edx),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 128(%edx),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 16(%edx),%xmm6
paddq %xmm5,%xmm1
movdqa 48(%eax),%xmm5
pmuludq 32(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 96(%edx),%xmm5
paddq %xmm7,%xmm4
movdqa %xmm6,%xmm7
pmuludq 112(%edx),%xmm6
paddq %xmm5,%xmm0
movdqa %xmm7,%xmm5
pmuludq 128(%edx),%xmm7
paddq %xmm6,%xmm1
movdqa 64(%eax),%xmm6
pmuludq 16(%edx),%xmm5
paddq %xmm7,%xmm2
movdqa %xmm6,%xmm7
pmuludq 128(%edx),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 80(%edx),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 96(%edx),%xmm5
paddq %xmm7,%xmm0
movdqa 64(%ebx),%xmm7
pmuludq 112(%edx),%xmm6
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
/* Carry propagation between limbs. */
movdqa %xmm3,%xmm5
pand %xmm7,%xmm3
psrlq $26,%xmm5
paddq %xmm4,%xmm5
movdqa %xmm0,%xmm6
pand %xmm7,%xmm0
psrlq $26,%xmm6
movdqa %xmm5,%xmm4
paddq %xmm1,%xmm6
psrlq $26,%xmm5
pand %xmm7,%xmm4
movdqa %xmm6,%xmm1
psrlq $26,%xmm6
paddd %xmm5,%xmm0
psllq $2,%xmm5
paddq %xmm2,%xmm6
paddq %xmm0,%xmm5
pand %xmm7,%xmm1
movdqa %xmm6,%xmm2
psrlq $26,%xmm6
pand %xmm7,%xmm2
paddd %xmm3,%xmm6
movdqa %xmm5,%xmm0
psrlq $26,%xmm5
movdqa %xmm6,%xmm3
psrlq $26,%xmm6
pand %xmm7,%xmm0
paddd %xmm5,%xmm1
pand %xmm7,%xmm3
paddd %xmm6,%xmm4
/* Pre-load and split the next pair of blocks for the following
 * iteration (or for the tail), and park the reduced h again. */
movdqu 32(%esi),%xmm5
movdqu 48(%esi),%xmm6
leal 32(%esi),%esi
movdqa %xmm2,112(%esp)
movdqa %xmm3,128(%esp)
movdqa %xmm4,144(%esp)
movdqa %xmm5,%xmm2
movdqa %xmm6,%xmm3
psrldq $6,%xmm2
psrldq $6,%xmm3
movdqa %xmm5,%xmm4
punpcklqdq %xmm3,%xmm2
punpckhqdq %xmm6,%xmm4
punpcklqdq %xmm6,%xmm5
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
movdqa %xmm5,%xmm6
psrlq $40,%xmm4
psrlq $26,%xmm6
pand %xmm7,%xmm5
pand %xmm7,%xmm6
pand %xmm7,%xmm2
pand %xmm7,%xmm3
por (%ebx),%xmm4
movdqa %xmm0,80(%esp)
movdqa %xmm1,96(%esp)
/* Flags are still those of the "subl $64,%ecx" inside the loop. */
ja .L014loop
.L013skip_loop:
/* Tail. pshufd $16 selects source dwords 0 and 1 into the two multiply
 * lanes, so the two lanes use different table entries. The addl sets the
 * flags for the jnz here and for the "jz .L016short_tail" further down:
 * zero means exactly two blocks are left, in which case h is added to
 * them right away. */
pshufd $16,-144(%edx),%xmm7
addl $32,%ecx
jnz .L015long_tail
paddd %xmm0,%xmm5
paddd %xmm1,%xmm6
paddd 112(%esp),%xmm2
paddd 128(%esp),%xmm3
paddd 144(%esp),%xmm4
.L015long_tail:
movdqa %xmm5,(%eax)
movdqa %xmm6,16(%eax)
movdqa %xmm2,32(%eax)
movdqa %xmm3,48(%eax)
movdqa %xmm4,64(%eax)
pmuludq %xmm7,%xmm5
pmuludq %xmm7,%xmm6
pmuludq %xmm7,%xmm2
movdqa %xmm5,%xmm0
pshufd $16,-128(%edx),%xmm5
pmuludq %xmm7,%xmm3
movdqa %xmm6,%xmm1
pmuludq %xmm7,%xmm4
movdqa %xmm5,%xmm6
pmuludq 48(%eax),%xmm5
movdqa %xmm6,%xmm7
pmuludq 32(%eax),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%eax),%xmm7
paddq %xmm6,%xmm3
pshufd $16,-64(%edx),%xmm6
pmuludq (%eax),%xmm5
paddq %xmm7,%xmm2
pmuludq 64(%eax),%xmm6
pshufd $16,-112(%edx),%xmm7
paddq %xmm5,%xmm1
movdqa %xmm7,%xmm5
pmuludq 32(%eax),%xmm7
paddq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 16(%eax),%xmm5
paddq %xmm7,%xmm4
pshufd $16,-48(%edx),%xmm7
pmuludq (%eax),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 64(%eax),%xmm7
paddq %xmm6,%xmm2
pmuludq 48(%eax),%xmm5
pshufd $16,-96(%edx),%xmm6
paddq %xmm7,%xmm1
movdqa %xmm6,%xmm7
pmuludq 16(%eax),%xmm6
paddq %xmm5,%xmm0
pshufd $16,-32(%edx),%xmm5
pmuludq (%eax),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 64(%eax),%xmm5
paddq %xmm7,%xmm3
movdqa %xmm6,%xmm7
pmuludq 48(%eax),%xmm6
paddq %xmm5,%xmm2
pmuludq 32(%eax),%xmm7
pshufd $16,-80(%edx),%xmm5
paddq %xmm6,%xmm1
pshufd $16,-16(%edx),%xmm6
pmuludq (%eax),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 64(%eax),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%eax),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 32(%eax),%xmm5
paddq %xmm7,%xmm0
pmuludq 48(%eax),%xmm6
movdqa 64(%ebx),%xmm7
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
/* Flags from "addl $32,%ecx" above: zero means only two blocks were left. */
jz .L016short_tail
/* Four blocks were left: load the first pair, add the parked h and
 * multiply by the lanes selected from the 0..128(%edx) half. */
movdqu -32(%esi),%xmm5
movdqu -16(%esi),%xmm6
leal 32(%esi),%esi
movdqa %xmm2,32(%esp)
movdqa %xmm3,48(%esp)
movdqa %xmm4,64(%esp)
movdqa %xmm5,%xmm2
movdqa %xmm6,%xmm3
psrldq $6,%xmm2
psrldq $6,%xmm3
movdqa %xmm5,%xmm4
punpcklqdq %xmm3,%xmm2
punpckhqdq %xmm6,%xmm4
punpcklqdq %xmm6,%xmm5
movdqa %xmm2,%xmm3
psrlq $4,%xmm2
psrlq $30,%xmm3
movdqa %xmm5,%xmm6
psrlq $40,%xmm4
psrlq $26,%xmm6
pand %xmm7,%xmm5
pand %xmm7,%xmm6
pand %xmm7,%xmm2
pand %xmm7,%xmm3
por (%ebx),%xmm4
pshufd $16,(%edx),%xmm7
paddd 80(%esp),%xmm5
paddd 96(%esp),%xmm6
paddd 112(%esp),%xmm2
paddd 128(%esp),%xmm3
paddd 144(%esp),%xmm4
movdqa %xmm5,(%esp)
pmuludq %xmm7,%xmm5
movdqa %xmm6,16(%esp)
pmuludq %xmm7,%xmm6
paddq %xmm5,%xmm0
movdqa %xmm2,%xmm5
pmuludq %xmm7,%xmm2
paddq %xmm6,%xmm1
movdqa %xmm3,%xmm6
pmuludq %xmm7,%xmm3
paddq 32(%esp),%xmm2
movdqa %xmm5,32(%esp)
pshufd $16,16(%edx),%xmm5
paddq 48(%esp),%xmm3
movdqa %xmm6,48(%esp)
movdqa %xmm4,%xmm6
pmuludq %xmm7,%xmm4
paddq 64(%esp),%xmm4
movdqa %xmm6,64(%esp)
movdqa %xmm5,%xmm6
pmuludq 48(%esp),%xmm5
movdqa %xmm6,%xmm7
pmuludq 32(%esp),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%esp),%xmm7
paddq %xmm6,%xmm3
pshufd $16,80(%edx),%xmm6
pmuludq (%esp),%xmm5
paddq %xmm7,%xmm2
pmuludq 64(%esp),%xmm6
pshufd $16,32(%edx),%xmm7
paddq %xmm5,%xmm1
movdqa %xmm7,%xmm5
pmuludq 32(%esp),%xmm7
paddq %xmm6,%xmm0
movdqa %xmm5,%xmm6
pmuludq 16(%esp),%xmm5
paddq %xmm7,%xmm4
pshufd $16,96(%edx),%xmm7
pmuludq (%esp),%xmm6
paddq %xmm5,%xmm3
movdqa %xmm7,%xmm5
pmuludq 64(%esp),%xmm7
paddq %xmm6,%xmm2
pmuludq 48(%esp),%xmm5
pshufd $16,48(%edx),%xmm6
paddq %xmm7,%xmm1
movdqa %xmm6,%xmm7
pmuludq 16(%esp),%xmm6
paddq %xmm5,%xmm0
pshufd $16,112(%edx),%xmm5
pmuludq (%esp),%xmm7
paddq %xmm6,%xmm4
movdqa %xmm5,%xmm6
pmuludq 64(%esp),%xmm5
paddq %xmm7,%xmm3
movdqa %xmm6,%xmm7
pmuludq 48(%esp),%xmm6
paddq %xmm5,%xmm2
pmuludq 32(%esp),%xmm7
pshufd $16,64(%edx),%xmm5
paddq %xmm6,%xmm1
pshufd $16,128(%edx),%xmm6
pmuludq (%esp),%xmm5
paddq %xmm7,%xmm0
movdqa %xmm6,%xmm7
pmuludq 64(%esp),%xmm6
paddq %xmm5,%xmm4
movdqa %xmm7,%xmm5
pmuludq 16(%esp),%xmm7
paddq %xmm6,%xmm3
movdqa %xmm5,%xmm6
pmuludq 32(%esp),%xmm5
paddq %xmm7,%xmm0
pmuludq 48(%esp),%xmm6
movdqa 64(%ebx),%xmm7
paddq %xmm5,%xmm1
paddq %xmm6,%xmm2
.L016short_tail:
/* Horizontal add: pshufd $78 swaps the two qword halves, so each limb
 * becomes the sum of both lanes. Then one more carry propagation. */
pshufd $78,%xmm4,%xmm6
pshufd $78,%xmm3,%xmm5
paddq %xmm6,%xmm4
paddq %xmm5,%xmm3
pshufd $78,%xmm0,%xmm6
pshufd $78,%xmm1,%xmm5
paddq %xmm6,%xmm0
paddq %xmm5,%xmm1
pshufd $78,%xmm2,%xmm6
movdqa %xmm3,%xmm5
pand %xmm7,%xmm3
psrlq $26,%xmm5
paddq %xmm6,%xmm2
paddq %xmm4,%xmm5
movdqa %xmm0,%xmm6
pand %xmm7,%xmm0
psrlq $26,%xmm6
movdqa %xmm5,%xmm4
paddq %xmm1,%xmm6
psrlq $26,%xmm5
pand %xmm7,%xmm4
movdqa %xmm6,%xmm1
psrlq $26,%xmm6
paddd %xmm5,%xmm0
psllq $2,%xmm5
paddq %xmm2,%xmm6
paddq %xmm0,%xmm5
pand %xmm7,%xmm1
movdqa %xmm6,%xmm2
psrlq $26,%xmm6
pand %xmm7,%xmm2
paddd %xmm3,%xmm6
movdqa %xmm5,%xmm0
psrlq $26,%xmm5
movdqa %xmm6,%xmm3
psrlq $26,%xmm6
pand %xmm7,%xmm0
paddd %xmm5,%xmm1
pand %xmm7,%xmm3
paddd %xmm6,%xmm4
.L012done:
/* Store the five limbs back to state offsets 0..16 (%edi is state + 48)
 * and release the aligned frame. */
movd %xmm0,-48(%edi)
movd %xmm1,-44(%edi)
movd %xmm2,-40(%edi)
movd %xmm3,-36(%edi)
movd %xmm4,-32(%edi)
movl %ebp,%esp
.L006nodata:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size _poly1305_blocks_sse2,.-_poly1305_blocks_sse2
/*
 * _poly1305_emit_sse2 -- emit routine paired with _poly1305_blocks_sse2.
 * ABI: i386 cdecl.
 *
 * Stack arguments (offsets are valid after the four pushes below):
 *   20(%esp)  state pointer
 *   24(%esp)  output pointer, receives 16 bytes
 *   28(%esp)  pointer to four dwords that are added to the result
 *
 * If the flag at state offset 20 is zero the accumulator is still in
 * base 2^32 and control transfers to .Lenter_emit in GFp_poly1305_emit.
 * Otherwise the five 26-bit limbs are recombined into 32-bit words
 * first. The final selection is mask-based, without branches.
 * %ebp, %ebx, %esi and %edi are preserved; %xmm0-%xmm3 serve as spill
 * space for the recombined accumulator.
 */
.align 32
.hidden _poly1305_emit_sse2
.type _poly1305_emit_sse2,@function
.align 16
_poly1305_emit_sse2:
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 20(%esp),%ebp
cmpl $0,20(%ebp)
je .Lenter_emit
/* Load the limbs: %eax, %edi, %ecx, %edx, %esi = limb 0..4. */
movl (%ebp),%eax
movl 4(%ebp),%edi
movl 8(%ebp),%ecx
movl 12(%ebp),%edx
movl 16(%ebp),%esi
/* Recombine into 32-bit words: limb k is split at bit 32 - (26k mod 32);
 * its low part is added to word k-1 and its high part starts word k.
 * Result: %eax, %ebx, %ecx, %edx, %esi = h0..h4. */
movl %edi,%ebx
shll $26,%edi
shrl $6,%ebx
addl %edi,%eax
movl %ecx,%edi
adcl $0,%ebx
shll $20,%edi
shrl $12,%ecx
addl %edi,%ebx
movl %edx,%edi
adcl $0,%ecx
shll $14,%edi
shrl $18,%edx
addl %edi,%ecx
movl %esi,%edi
adcl $0,%edx
shll $8,%edi
shrl $24,%esi
addl %edi,%edx
adcl $0,%esi
/* Fold the bits of h4 above bit 1 back in as (h4 >> 2) * 5. The loads of
 * the output pointer (%edi) and the third argument (%ebp) are interleaved
 * with the carry chain; movl leaves the flags alone. */
movl %esi,%edi
andl $3,%esi
shrl $2,%edi
leal (%edi,%edi,4),%ebp
movl 24(%esp),%edi
addl %ebp,%eax
movl 28(%esp),%ebp
adcl $0,%ebx
adcl $0,%ecx
adcl $0,%edx
adcl $0,%esi
/* Keep a copy of h in %xmm0-%xmm3 while computing g = h + 5 in the
 * integer registers. */
movd %eax,%xmm0
addl $5,%eax
movd %ebx,%xmm1
adcl $0,%ebx
movd %ecx,%xmm2
adcl $0,%ecx
movd %edx,%xmm3
adcl $0,%edx
adcl $0,%esi
/* mask = -(g4 >> 2); store (g & mask) to the output buffer and bring h
 * back from the xmm registers. */
shrl $2,%esi
negl %esi
andl %esi,%eax
andl %esi,%ebx
andl %esi,%ecx
andl %esi,%edx
movl %eax,(%edi)
movd %xmm0,%eax
movl %ebx,4(%edi)
movd %xmm1,%ebx
movl %ecx,8(%edi)
movd %xmm2,%ecx
movl %edx,12(%edi)
movd %xmm3,%edx
/* result = (h & ~mask) | (g & mask) */
notl %esi
andl %esi,%eax
andl %esi,%ebx
orl (%edi),%eax
andl %esi,%ecx
orl 4(%edi),%ebx
andl %esi,%edx
orl 8(%edi),%ecx
orl 12(%edi),%edx
/* Add the four dwords of the third argument (modulo 2^128) and store. */
addl (%ebp),%eax
adcl 4(%ebp),%ebx
movl %eax,(%edi)
adcl 8(%ebp),%ecx
movl %ebx,4(%edi)
adcl 12(%ebp),%edx
movl %ecx,8(%edi)
movl %edx,12(%edi)
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size _poly1305_emit_sse2,.-_poly1305_emit_sse2
/*
 * Constant table for the SSE2 code, addressed relative to %ebx.
 * It is 64-byte aligned, so the movdqa loads at offsets 0 and 64 are
 * aligned accesses.
 */
.align 64
.Lconst_sse2:
/* offset 0: 16777216 == 1 << 24 in every qword; OR-ed into limb 4 of each
 * input block by the "por (%ebx)" instructions. */
.long 16777216,0,16777216,0,16777216,0,16777216,0
/* offset 32: zeros. */
.long 0,0,0,0,0,0,0,0
/* offset 64: 67108863 == 2^26 - 1 in every qword; the 26-bit limb mask. */
.long 67108863,0,67108863,0,67108863,0,67108863,0
/* offset 96: 0x0fffffff, 0x0ffffffc x 3; the same values used to clamp the
 * key in GFp_poly1305_init_asm. Not referenced by the code in this file. */
.long 268435455,268435452,268435452,268435452
/* ASCII: "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>", NUL
 * terminated. */
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
.align 4
#endif