# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text



.globl _GFp_poly1305_init_asm
.private_extern _GFp_poly1305_init_asm
.private_extern _GFp_poly1305_init_asm
.globl _GFp_poly1305_blocks
.private_extern _GFp_poly1305_blocks
.private_extern _GFp_poly1305_blocks
.globl _GFp_poly1305_emit
.private_extern _GFp_poly1305_emit
.private_extern _GFp_poly1305_emit


.p2align 5
_GFp_poly1305_init_asm:
xorq %rax,%rax
movq %rax,0(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)

cmpq $0,%rsi
je L$no_key

leaq _GFp_poly1305_blocks(%rip),%r10
leaq _GFp_poly1305_emit(%rip),%r11
movq _GFp_ia32cap_P+4(%rip),%r9
leaq poly1305_blocks_avx(%rip),%rax
leaq poly1305_emit_avx(%rip),%rcx
btq $28,%r9
cmovcq %rax,%r10
cmovcq %rcx,%r11
leaq poly1305_blocks_avx2(%rip),%rax
btq $37,%r9
cmovcq %rax,%r10
movq $0x0ffffffc0fffffff,%rax
movq $0x0ffffffc0ffffffc,%rcx
andq 0(%rsi),%rax
andq 8(%rsi),%rcx
movq %rax,24(%rdi)
movq %rcx,32(%rdi)
movq %r10,0(%rdx)
movq %r11,8(%rdx)
movl $1,%eax
L$no_key:
.byte 0xf3,0xc3



.p2align 5
_GFp_poly1305_blocks:
L$blocks:
shrq $4,%rdx
jz L$no_data

pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
L$blocks_body:

movq %rdx,%r15

movq 24(%rdi),%r11
movq 32(%rdi),%r13

movq 0(%rdi),%r14
movq 8(%rdi),%rbx
movq 16(%rdi),%rbp

movq %r13,%r12
shrq $2,%r13
movq %r12,%rax
addq %r12,%r13
jmp L$oop

.p2align 5
L$oop:
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
adcq %rcx,%rbp
mulq %r14
movq %rax,%r9
movq %r11,%rax
movq %rdx,%r10

mulq %r14
movq %rax,%r14
movq %r11,%rax
movq %rdx,%r8

mulq %rbx
addq %rax,%r9
movq %r13,%rax
adcq %rdx,%r10

mulq %rbx
movq %rbp,%rbx
addq %rax,%r14
adcq %rdx,%r8

imulq %r13,%rbx
addq %rbx,%r9
movq %r8,%rbx
adcq $0,%r10

imulq %r11,%rbp
addq %r9,%rbx
movq $-4,%rax
adcq %rbp,%r10

andq %r10,%rax
movq %r10,%rbp
shrq $2,%r10
andq $3,%rbp
addq %r10,%rax
addq %rax,%r14
adcq $0,%rbx
adcq $0,%rbp
movq %r12,%rax
decq %r15
jnz L$oop

movq %r14,0(%rdi)
movq %rbx,8(%rdi)
movq %rbp,16(%rdi)

movq 0(%rsp),%r15
movq 8(%rsp),%r14
movq 16(%rsp),%r13
movq 24(%rsp),%r12
movq 32(%rsp),%rbp
movq 40(%rsp),%rbx
leaq 48(%rsp),%rsp
L$no_data:
L$blocks_epilogue:
.byte 0xf3,0xc3



.p2align 5
_GFp_poly1305_emit:
L$emit:
movq 0(%rdi),%r8
movq 8(%rdi),%r9
movq 16(%rdi),%r10

movq %r8,%rax
addq $5,%r8
movq %r9,%rcx
adcq $0,%r9
adcq $0,%r10
shrq $2,%r10
cmovnzq %r8,%rax
cmovnzq %r9,%rcx

addq 0(%rdx),%rax
adcq 8(%rdx),%rcx
movq %rax,0(%rsi)
movq %rcx,8(%rsi)

.byte 0xf3,0xc3


.p2align 5
__poly1305_block:
mulq %r14
movq %rax,%r9
movq %r11,%rax
movq %rdx,%r10

mulq %r14
movq %rax,%r14
movq %r11,%rax
movq %rdx,%r8

mulq %rbx
addq %rax,%r9
movq %r13,%rax
adcq %rdx,%r10

mulq %rbx
movq %rbp,%rbx
addq %rax,%r14
adcq %rdx,%r8

imulq %r13,%rbx
addq %rbx,%r9
movq %r8,%rbx
adcq $0,%r10

imulq %r11,%rbp
addq %r9,%rbx
movq $-4,%rax
adcq %rbp,%r10

andq %r10,%rax
movq %r10,%rbp
shrq $2,%r10
andq $3,%rbp
addq %r10,%rax
addq %rax,%r14
adcq $0,%rbx
adcq $0,%rbp
.byte 0xf3,0xc3



.p2align 5
__poly1305_init_avx:
movq %r11,%r14
movq %r12,%rbx
xorq %rbp,%rbp

leaq 48+64(%rdi),%rdi

movq %r12,%rax
call __poly1305_block

movl $0x3ffffff,%eax
movl $0x3ffffff,%edx
movq %r14,%r8
andl %r14d,%eax
movq %r11,%r9
andl %r11d,%edx
movl %eax,-64(%rdi)
shrq $26,%r8
movl %edx,-60(%rdi)
shrq $26,%r9

movl $0x3ffffff,%eax
movl $0x3ffffff,%edx
andl %r8d,%eax
andl %r9d,%edx
movl %eax,-48(%rdi)
leal (%rax,%rax,4),%eax
movl %edx,-44(%rdi)
leal (%rdx,%rdx,4),%edx
movl %eax,-32(%rdi)
shrq $26,%r8
movl %edx,-28(%rdi)
shrq $26,%r9

movq %rbx,%rax
movq %r12,%rdx
shlq $12,%rax
shlq $12,%rdx
orq %r8,%rax
orq %r9,%rdx
andl $0x3ffffff,%eax
andl $0x3ffffff,%edx
movl %eax,-16(%rdi)
leal (%rax,%rax,4),%eax
movl %edx,-12(%rdi)
leal (%rdx,%rdx,4),%edx
movl %eax,0(%rdi)
movq %rbx,%r8
movl %edx,4(%rdi)
movq %r12,%r9

movl $0x3ffffff,%eax
movl $0x3ffffff,%edx
shrq $14,%r8
shrq $14,%r9
andl %r8d,%eax
andl %r9d,%edx
movl %eax,16(%rdi)
leal (%rax,%rax,4),%eax
movl %edx,20(%rdi)
leal (%rdx,%rdx,4),%edx
movl %eax,32(%rdi)
shrq $26,%r8
movl %edx,36(%rdi)
shrq $26,%r9

movq %rbp,%rax
shlq $24,%rax
orq %rax,%r8
movl %r8d,48(%rdi)
leaq (%r8,%r8,4),%r8
movl %r9d,52(%rdi)
leaq (%r9,%r9,4),%r9
movl %r8d,64(%rdi)
movl %r9d,68(%rdi)

movq %r12,%rax
call __poly1305_block

movl $0x3ffffff,%eax
movq %r14,%r8
andl %r14d,%eax
shrq $26,%r8
movl %eax,-52(%rdi)

movl $0x3ffffff,%edx
andl %r8d,%edx
movl %edx,-36(%rdi)
leal (%rdx,%rdx,4),%edx
shrq $26,%r8
movl %edx,-20(%rdi)

movq %rbx,%rax
shlq $12,%rax
orq %r8,%rax
andl $0x3ffffff,%eax
movl %eax,-4(%rdi)
leal (%rax,%rax,4),%eax
movq %rbx,%r8
movl %eax,12(%rdi)

movl $0x3ffffff,%edx
shrq $14,%r8
andl %r8d,%edx
movl %edx,28(%rdi)
leal (%rdx,%rdx,4),%edx
shrq $26,%r8
movl %edx,44(%rdi)

movq %rbp,%rax
shlq $24,%rax
orq %rax,%r8
movl %r8d,60(%rdi)
leaq (%r8,%r8,4),%r8
movl %r8d,76(%rdi)

movq %r12,%rax
call __poly1305_block

movl $0x3ffffff,%eax
movq %r14,%r8
andl %r14d,%eax
shrq $26,%r8
movl %eax,-56(%rdi)

movl $0x3ffffff,%edx
andl %r8d,%edx
movl %edx,-40(%rdi)
leal (%rdx,%rdx,4),%edx
shrq $26,%r8
movl %edx,-24(%rdi)

movq %rbx,%rax
shlq $12,%rax
orq %r8,%rax
andl $0x3ffffff,%eax
movl %eax,-8(%rdi)
leal (%rax,%rax,4),%eax
movq %rbx,%r8
movl %eax,8(%rdi)

movl $0x3ffffff,%edx
shrq $14,%r8
andl %r8d,%edx
movl %edx,24(%rdi)
leal (%rdx,%rdx,4),%edx
shrq $26,%r8
movl %edx,40(%rdi)

movq %rbp,%rax
shlq $24,%rax
orq %rax,%r8
movl %r8d,56(%rdi)
leaq (%r8,%r8,4),%r8
movl %r8d,72(%rdi)

leaq -48-64(%rdi),%rdi
.byte 0xf3,0xc3



.p2align 5
poly1305_blocks_avx:
movl 20(%rdi),%r8d
cmpq $128,%rdx
jae L$blocks_avx
testl %r8d,%r8d
jz L$blocks

L$blocks_avx:
andq $-16,%rdx
jz L$no_data_avx

vzeroupper

testl %r8d,%r8d
jz L$base2_64_avx

testq $31,%rdx
jz L$even_avx

pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
L$blocks_avx_body:

movq %rdx,%r15

movq 0(%rdi),%r8
movq 8(%rdi),%r9
movl 16(%rdi),%ebp

movq 24(%rdi),%r11
movq 32(%rdi),%r13


movl %r8d,%r14d
andq $-2147483648,%r8
movq %r9,%r12
movl %r9d,%ebx
andq $-2147483648,%r9

shrq $6,%r8
shlq $52,%r12
addq %r8,%r14
shrq $12,%rbx
shrq $18,%r9
addq %r12,%r14
adcq %r9,%rbx

movq %rbp,%r8
shlq $40,%r8
shrq $24,%rbp
addq %r8,%rbx
adcq $0,%rbp

movq $-4,%r9
movq %rbp,%r8
andq %rbp,%r9
shrq $2,%r8
andq $3,%rbp
addq %r9,%r8
addq %r8,%r14
adcq $0,%rbx
adcq $0,%rbp

movq %r13,%r12
movq %r13,%rax
shrq $2,%r13
addq %r12,%r13

addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
adcq %rcx,%rbp

call __poly1305_block

testq %rcx,%rcx
jz L$store_base2_64_avx


movq %r14,%rax
movq %r14,%rdx
shrq $52,%r14
movq %rbx,%r11
movq %rbx,%r12
shrq $26,%rdx
andq $0x3ffffff,%rax
shlq $12,%r11
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r11,%r14
shlq $24,%rbp
andq $0x3ffffff,%r14
shrq $40,%r12
andq $0x3ffffff,%rbx
orq %r12,%rbp

subq $16,%r15
jz L$store_base2_26_avx

vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
vmovd %ebp,%xmm4
jmp L$proceed_avx

.p2align 5
L$store_base2_64_avx:
movq %r14,0(%rdi)
movq %rbx,8(%rdi)
movq %rbp,16(%rdi)
jmp L$done_avx

.p2align 4
L$store_base2_26_avx:
movl %eax,0(%rdi)
movl %edx,4(%rdi)
movl %r14d,8(%rdi)
movl %ebx,12(%rdi)
movl %ebp,16(%rdi)
.p2align 4
L$done_avx:
movq 0(%rsp),%r15
movq 8(%rsp),%r14
movq 16(%rsp),%r13
movq 24(%rsp),%r12
movq 32(%rsp),%rbp
movq 40(%rsp),%rbx
leaq 48(%rsp),%rsp
L$no_data_avx:
L$blocks_avx_epilogue:
.byte 0xf3,0xc3

.p2align 5
L$base2_64_avx:
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
L$base2_64_avx_body:

movq %rdx,%r15

movq 24(%rdi),%r11
movq 32(%rdi),%r13

movq 0(%rdi),%r14
movq 8(%rdi),%rbx
movl 16(%rdi),%ebp

movq %r13,%r12
movq %r13,%rax
shrq $2,%r13
addq %r12,%r13

testq $31,%rdx
jz L$init_avx

addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
adcq %rcx,%rbp
subq $16,%r15

call __poly1305_block

L$init_avx:

movq %r14,%rax
movq %r14,%rdx
shrq $52,%r14
movq %rbx,%r8
movq %rbx,%r9
shrq $26,%rdx
andq $0x3ffffff,%rax
shlq $12,%r8
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r8,%r14
shlq $24,%rbp
andq $0x3ffffff,%r14
shrq $40,%r9
andq $0x3ffffff,%rbx
orq %r9,%rbp

vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
vmovd %ebp,%xmm4
movl $1,20(%rdi)

call __poly1305_init_avx

L$proceed_avx:
movq %r15,%rdx

movq 0(%rsp),%r15
movq 8(%rsp),%r14
movq 16(%rsp),%r13
movq 24(%rsp),%r12
movq 32(%rsp),%rbp
movq 40(%rsp),%rbx
leaq 48(%rsp),%rax
leaq 48(%rsp),%rsp
L$base2_64_avx_epilogue:
jmp L$do_avx

.p2align 5
L$even_avx:
vmovd 0(%rdi),%xmm0
vmovd 4(%rdi),%xmm1
vmovd 8(%rdi),%xmm2
vmovd 12(%rdi),%xmm3
vmovd 16(%rdi),%xmm4

L$do_avx:
leaq -88(%rsp),%r11
subq $0x178,%rsp
subq $64,%rdx
leaq -32(%rsi),%rax
cmovcq %rax,%rsi

vmovdqu 48(%rdi),%xmm14
leaq 112(%rdi),%rdi
leaq L$const(%rip),%rcx



vmovdqu 32(%rsi),%xmm5
vmovdqu 48(%rsi),%xmm6
vmovdqa 64(%rcx),%xmm15

vpsrldq $6,%xmm5,%xmm7
vpsrldq $6,%xmm6,%xmm8
vpunpckhqdq %xmm6,%xmm5,%xmm9
vpunpcklqdq %xmm6,%xmm5,%xmm5
vpunpcklqdq %xmm8,%xmm7,%xmm8

vpsrlq $40,%xmm9,%xmm9
vpsrlq $26,%xmm5,%xmm6
vpand %xmm15,%xmm5,%xmm5
vpsrlq $4,%xmm8,%xmm7
vpand %xmm15,%xmm6,%xmm6
vpsrlq $30,%xmm8,%xmm8
vpand %xmm15,%xmm7,%xmm7
vpand %xmm15,%xmm8,%xmm8
vpor 32(%rcx),%xmm9,%xmm9

jbe L$skip_loop_avx


vmovdqu -48(%rdi),%xmm11
vmovdqu -32(%rdi),%xmm12
vpshufd $0xEE,%xmm14,%xmm13
vpshufd $0x44,%xmm14,%xmm10
vmovdqa %xmm13,-144(%r11)
vmovdqa %xmm10,0(%rsp)
vpshufd $0xEE,%xmm11,%xmm14
vmovdqu -16(%rdi),%xmm10
vpshufd $0x44,%xmm11,%xmm11
vmovdqa %xmm14,-128(%r11)
vmovdqa %xmm11,16(%rsp)
vpshufd $0xEE,%xmm12,%xmm13
vmovdqu 0(%rdi),%xmm11
vpshufd $0x44,%xmm12,%xmm12
vmovdqa %xmm13,-112(%r11)
vmovdqa %xmm12,32(%rsp)
vpshufd $0xEE,%xmm10,%xmm14
vmovdqu 16(%rdi),%xmm12
vpshufd $0x44,%xmm10,%xmm10
vmovdqa %xmm14,-96(%r11)
vmovdqa %xmm10,48(%rsp)
vpshufd $0xEE,%xmm11,%xmm13
vmovdqu 32(%rdi),%xmm10
vpshufd $0x44,%xmm11,%xmm11
vmovdqa %xmm13,-80(%r11)
vmovdqa %xmm11,64(%rsp)
vpshufd $0xEE,%xmm12,%xmm14
vmovdqu 48(%rdi),%xmm11
vpshufd $0x44,%xmm12,%xmm12
vmovdqa %xmm14,-64(%r11)
vmovdqa %xmm12,80(%rsp)
vpshufd $0xEE,%xmm10,%xmm13
vmovdqu 64(%rdi),%xmm12
vpshufd $0x44,%xmm10,%xmm10
vmovdqa %xmm13,-48(%r11)
vmovdqa %xmm10,96(%rsp)
vpshufd $0xEE,%xmm11,%xmm14
vpshufd $0x44,%xmm11,%xmm11
vmovdqa %xmm14,-32(%r11)
vmovdqa %xmm11,112(%rsp)
vpshufd $0xEE,%xmm12,%xmm13
vmovdqa 0(%rsp),%xmm14
vpshufd $0x44,%xmm12,%xmm12
vmovdqa %xmm13,-16(%r11)
vmovdqa %xmm12,128(%rsp)

jmp L$oop_avx

.p2align 5
L$oop_avx:




















vpmuludq %xmm5,%xmm14,%xmm10
vpmuludq %xmm6,%xmm14,%xmm11
vmovdqa %xmm2,32(%r11)
vpmuludq %xmm7,%xmm14,%xmm12
vmovdqa 16(%rsp),%xmm2
vpmuludq %xmm8,%xmm14,%xmm13
vpmuludq %xmm9,%xmm14,%xmm14

vmovdqa %xmm0,0(%r11)
vpmuludq 32(%rsp),%xmm9,%xmm0
vmovdqa %xmm1,16(%r11)
vpmuludq %xmm8,%xmm2,%xmm1
vpaddq %xmm0,%xmm10,%xmm10
vpaddq %xmm1,%xmm14,%xmm14
vmovdqa %xmm3,48(%r11)
vpmuludq %xmm7,%xmm2,%xmm0
vpmuludq %xmm6,%xmm2,%xmm1
vpaddq %xmm0,%xmm13,%xmm13
vmovdqa 48(%rsp),%xmm3
vpaddq %xmm1,%xmm12,%xmm12
vmovdqa %xmm4,64(%r11)
vpmuludq %xmm5,%xmm2,%xmm2
vpmuludq %xmm7,%xmm3,%xmm0
vpaddq %xmm2,%xmm11,%xmm11

vmovdqa 64(%rsp),%xmm4
vpaddq %xmm0,%xmm14,%xmm14
vpmuludq %xmm6,%xmm3,%xmm1
vpmuludq %xmm5,%xmm3,%xmm3
vpaddq %xmm1,%xmm13,%xmm13
vmovdqa 80(%rsp),%xmm2
vpaddq %xmm3,%xmm12,%xmm12
vpmuludq %xmm9,%xmm4,%xmm0
vpmuludq %xmm8,%xmm4,%xmm4
vpaddq %xmm0,%xmm11,%xmm11
vmovdqa 96(%rsp),%xmm3
vpaddq %xmm4,%xmm10,%xmm10

vmovdqa 128(%rsp),%xmm4
vpmuludq %xmm6,%xmm2,%xmm1
vpmuludq %xmm5,%xmm2,%xmm2
vpaddq %xmm1,%xmm14,%xmm14
vpaddq %xmm2,%xmm13,%xmm13
vpmuludq %xmm9,%xmm3,%xmm0
vpmuludq %xmm8,%xmm3,%xmm1
vpaddq %xmm0,%xmm12,%xmm12
vmovdqu 0(%rsi),%xmm0
vpaddq %xmm1,%xmm11,%xmm11
vpmuludq %xmm7,%xmm3,%xmm3
vpmuludq %xmm7,%xmm4,%xmm7
vpaddq %xmm3,%xmm10,%xmm10

vmovdqu 16(%rsi),%xmm1
vpaddq %xmm7,%xmm11,%xmm11
vpmuludq %xmm8,%xmm4,%xmm8
vpmuludq %xmm9,%xmm4,%xmm9
vpsrldq $6,%xmm0,%xmm2
vpaddq %xmm8,%xmm12,%xmm12
vpaddq %xmm9,%xmm13,%xmm13
vpsrldq $6,%xmm1,%xmm3
vpmuludq 112(%rsp),%xmm5,%xmm9
vpmuludq %xmm6,%xmm4,%xmm5
vpunpckhqdq %xmm1,%xmm0,%xmm4
vpaddq %xmm9,%xmm14,%xmm14
vmovdqa -144(%r11),%xmm9
vpaddq %xmm5,%xmm10,%xmm10

vpunpcklqdq %xmm1,%xmm0,%xmm0
vpunpcklqdq %xmm3,%xmm2,%xmm3


vpsrldq $5,%xmm4,%xmm4
vpsrlq $26,%xmm0,%xmm1
vpand %xmm15,%xmm0,%xmm0
vpsrlq $4,%xmm3,%xmm2
vpand %xmm15,%xmm1,%xmm1
vpand 0(%rcx),%xmm4,%xmm4
vpsrlq $30,%xmm3,%xmm3
vpand %xmm15,%xmm2,%xmm2
vpand %xmm15,%xmm3,%xmm3
vpor 32(%rcx),%xmm4,%xmm4

vpaddq 0(%r11),%xmm0,%xmm0
vpaddq 16(%r11),%xmm1,%xmm1
vpaddq 32(%r11),%xmm2,%xmm2
vpaddq 48(%r11),%xmm3,%xmm3
vpaddq 64(%r11),%xmm4,%xmm4

leaq 32(%rsi),%rax
leaq 64(%rsi),%rsi
subq $64,%rdx
cmovcq %rax,%rsi










vpmuludq %xmm0,%xmm9,%xmm5
vpmuludq %xmm1,%xmm9,%xmm6
vpaddq %xmm5,%xmm10,%xmm10
vpaddq %xmm6,%xmm11,%xmm11
vmovdqa -128(%r11),%xmm7
vpmuludq %xmm2,%xmm9,%xmm5
vpmuludq %xmm3,%xmm9,%xmm6
vpaddq %xmm5,%xmm12,%xmm12
vpaddq %xmm6,%xmm13,%xmm13
vpmuludq %xmm4,%xmm9,%xmm9
vpmuludq -112(%r11),%xmm4,%xmm5
vpaddq %xmm9,%xmm14,%xmm14

vpaddq %xmm5,%xmm10,%xmm10
vpmuludq %xmm2,%xmm7,%xmm6
vpmuludq %xmm3,%xmm7,%xmm5
vpaddq %xmm6,%xmm13,%xmm13
vmovdqa -96(%r11),%xmm8
vpaddq %xmm5,%xmm14,%xmm14
vpmuludq %xmm1,%xmm7,%xmm6
vpmuludq %xmm0,%xmm7,%xmm7
vpaddq %xmm6,%xmm12,%xmm12
vpaddq %xmm7,%xmm11,%xmm11

vmovdqa -80(%r11),%xmm9
vpmuludq %xmm2,%xmm8,%xmm5
vpmuludq %xmm1,%xmm8,%xmm6
vpaddq %xmm5,%xmm14,%xmm14
vpaddq %xmm6,%xmm13,%xmm13
vmovdqa -64(%r11),%xmm7
vpmuludq %xmm0,%xmm8,%xmm8
vpmuludq %xmm4,%xmm9,%xmm5
vpaddq %xmm8,%xmm12,%xmm12
vpaddq %xmm5,%xmm11,%xmm11
vmovdqa -48(%r11),%xmm8
vpmuludq %xmm3,%xmm9,%xmm9
vpmuludq %xmm1,%xmm7,%xmm6
vpaddq %xmm9,%xmm10,%xmm10

vmovdqa -16(%r11),%xmm9
vpaddq %xmm6,%xmm14,%xmm14
vpmuludq %xmm0,%xmm7,%xmm7
vpmuludq %xmm4,%xmm8,%xmm5
vpaddq %xmm7,%xmm13,%xmm13
vpaddq %xmm5,%xmm12,%xmm12
vmovdqu 32(%rsi),%xmm5
vpmuludq %xmm3,%xmm8,%xmm7
vpmuludq %xmm2,%xmm8,%xmm8
vpaddq %xmm7,%xmm11,%xmm11
vmovdqu 48(%rsi),%xmm6
vpaddq %xmm8,%xmm10,%xmm10

vpmuludq %xmm2,%xmm9,%xmm2
vpmuludq %xmm3,%xmm9,%xmm3
vpsrldq $6,%xmm5,%xmm7
vpaddq %xmm2,%xmm11,%xmm11
vpmuludq %xmm4,%xmm9,%xmm4
vpsrldq $6,%xmm6,%xmm8
vpaddq %xmm3,%xmm12,%xmm2
vpaddq %xmm4,%xmm13,%xmm3
vpmuludq -32(%r11),%xmm0,%xmm4
vpmuludq %xmm1,%xmm9,%xmm0
vpunpckhqdq %xmm6,%xmm5,%xmm9
vpaddq %xmm4,%xmm14,%xmm4
vpaddq %xmm0,%xmm10,%xmm0

vpunpcklqdq %xmm6,%xmm5,%xmm5
vpunpcklqdq %xmm8,%xmm7,%xmm8


vpsrldq $5,%xmm9,%xmm9
vpsrlq $26,%xmm5,%xmm6
vmovdqa 0(%rsp),%xmm14
vpand %xmm15,%xmm5,%xmm5
vpsrlq $4,%xmm8,%xmm7
vpand %xmm15,%xmm6,%xmm6
vpand 0(%rcx),%xmm9,%xmm9
vpsrlq $30,%xmm8,%xmm8
vpand %xmm15,%xmm7,%xmm7
vpand %xmm15,%xmm8,%xmm8
vpor 32(%rcx),%xmm9,%xmm9




vpsrlq $26,%xmm3,%xmm13
vpand %xmm15,%xmm3,%xmm3
vpaddq %xmm13,%xmm4,%xmm4

vpsrlq $26,%xmm0,%xmm10
vpand %xmm15,%xmm0,%xmm0
vpaddq %xmm10,%xmm11,%xmm1

vpsrlq $26,%xmm4,%xmm10
vpand %xmm15,%xmm4,%xmm4

vpsrlq $26,%xmm1,%xmm11
vpand %xmm15,%xmm1,%xmm1
vpaddq %xmm11,%xmm2,%xmm2

vpaddq %xmm10,%xmm0,%xmm0
vpsllq $2,%xmm10,%xmm10
vpaddq %xmm10,%xmm0,%xmm0

vpsrlq $26,%xmm2,%xmm12
vpand %xmm15,%xmm2,%xmm2
vpaddq %xmm12,%xmm3,%xmm3

vpsrlq $26,%xmm0,%xmm10
vpand %xmm15,%xmm0,%xmm0
vpaddq %xmm10,%xmm1,%xmm1

vpsrlq $26,%xmm3,%xmm13
vpand %xmm15,%xmm3,%xmm3
vpaddq %xmm13,%xmm4,%xmm4

ja L$oop_avx

L$skip_loop_avx:



vpshufd $0x10,%xmm14,%xmm14
addq $32,%rdx
jnz L$ong_tail_avx

vpaddq %xmm2,%xmm7,%xmm7
vpaddq %xmm0,%xmm5,%xmm5
vpaddq %xmm1,%xmm6,%xmm6
vpaddq %xmm3,%xmm8,%xmm8
vpaddq %xmm4,%xmm9,%xmm9

L$ong_tail_avx:
vmovdqa %xmm2,32(%r11)
vmovdqa %xmm0,0(%r11)
vmovdqa %xmm1,16(%r11)
vmovdqa %xmm3,48(%r11)
vmovdqa %xmm4,64(%r11)







vpmuludq %xmm7,%xmm14,%xmm12
vpmuludq %xmm5,%xmm14,%xmm10
vpshufd $0x10,-48(%rdi),%xmm2
vpmuludq %xmm6,%xmm14,%xmm11
vpmuludq %xmm8,%xmm14,%xmm13
vpmuludq %xmm9,%xmm14,%xmm14

vpmuludq %xmm8,%xmm2,%xmm0
vpaddq %xmm0,%xmm14,%xmm14
vpshufd $0x10,-32(%rdi),%xmm3
vpmuludq %xmm7,%xmm2,%xmm1
vpaddq %xmm1,%xmm13,%xmm13
vpshufd $0x10,-16(%rdi),%xmm4
vpmuludq %xmm6,%xmm2,%xmm0
vpaddq %xmm0,%xmm12,%xmm12
vpmuludq %xmm5,%xmm2,%xmm2
vpaddq %xmm2,%xmm11,%xmm11
vpmuludq %xmm9,%xmm3,%xmm3
vpaddq %xmm3,%xmm10,%xmm10

vpshufd $0x10,0(%rdi),%xmm2
vpmuludq %xmm7,%xmm4,%xmm1
vpaddq %xmm1,%xmm14,%xmm14
vpmuludq %xmm6,%xmm4,%xmm0
vpaddq %xmm0,%xmm13,%xmm13
vpshufd $0x10,16(%rdi),%xmm3
vpmuludq %xmm5,%xmm4,%xmm4
vpaddq %xmm4,%xmm12,%xmm12
vpmuludq %xmm9,%xmm2,%xmm1
vpaddq %xmm1,%xmm11,%xmm11
vpshufd $0x10,32(%rdi),%xmm4
vpmuludq %xmm8,%xmm2,%xmm2
vpaddq %xmm2,%xmm10,%xmm10

vpmuludq %xmm6,%xmm3,%xmm0
vpaddq %xmm0,%xmm14,%xmm14
vpmuludq %xmm5,%xmm3,%xmm3
vpaddq %xmm3,%xmm13,%xmm13
vpshufd $0x10,48(%rdi),%xmm2
vpmuludq %xmm9,%xmm4,%xmm1
vpaddq %xmm1,%xmm12,%xmm12
vpshufd $0x10,64(%rdi),%xmm3
vpmuludq %xmm8,%xmm4,%xmm0
vpaddq %xmm0,%xmm11,%xmm11
vpmuludq %xmm7,%xmm4,%xmm4
vpaddq %xmm4,%xmm10,%xmm10

vpmuludq %xmm5,%xmm2,%xmm2
vpaddq %xmm2,%xmm14,%xmm14
vpmuludq %xmm9,%xmm3,%xmm1
vpaddq %xmm1,%xmm13,%xmm13
vpmuludq %xmm8,%xmm3,%xmm0
vpaddq %xmm0,%xmm12,%xmm12
vpmuludq %xmm7,%xmm3,%xmm1
vpaddq %xmm1,%xmm11,%xmm11
vpmuludq %xmm6,%xmm3,%xmm3
vpaddq %xmm3,%xmm10,%xmm10

jz L$short_tail_avx

vmovdqu 0(%rsi),%xmm0
vmovdqu 16(%rsi),%xmm1

vpsrldq $6,%xmm0,%xmm2
vpsrldq $6,%xmm1,%xmm3
vpunpckhqdq %xmm1,%xmm0,%xmm4
vpunpcklqdq %xmm1,%xmm0,%xmm0
vpunpcklqdq %xmm3,%xmm2,%xmm3

vpsrlq $40,%xmm4,%xmm4
vpsrlq $26,%xmm0,%xmm1
vpand %xmm15,%xmm0,%xmm0
vpsrlq $4,%xmm3,%xmm2
vpand %xmm15,%xmm1,%xmm1
vpsrlq $30,%xmm3,%xmm3
vpand %xmm15,%xmm2,%xmm2
vpand %xmm15,%xmm3,%xmm3
vpor 32(%rcx),%xmm4,%xmm4

vpshufd $0x32,-64(%rdi),%xmm9
vpaddq 0(%r11),%xmm0,%xmm0
vpaddq 16(%r11),%xmm1,%xmm1
vpaddq 32(%r11),%xmm2,%xmm2
vpaddq 48(%r11),%xmm3,%xmm3
vpaddq 64(%r11),%xmm4,%xmm4




vpmuludq %xmm0,%xmm9,%xmm5
vpaddq %xmm5,%xmm10,%xmm10
vpmuludq %xmm1,%xmm9,%xmm6
vpaddq %xmm6,%xmm11,%xmm11
vpmuludq %xmm2,%xmm9,%xmm5
vpaddq %xmm5,%xmm12,%xmm12
vpshufd $0x32,-48(%rdi),%xmm7
vpmuludq %xmm3,%xmm9,%xmm6
vpaddq %xmm6,%xmm13,%xmm13
vpmuludq %xmm4,%xmm9,%xmm9
vpaddq %xmm9,%xmm14,%xmm14

vpmuludq %xmm3,%xmm7,%xmm5
vpaddq %xmm5,%xmm14,%xmm14
vpshufd $0x32,-32(%rdi),%xmm8
vpmuludq %xmm2,%xmm7,%xmm6
vpaddq %xmm6,%xmm13,%xmm13
vpshufd $0x32,-16(%rdi),%xmm9
vpmuludq %xmm1,%xmm7,%xmm5
vpaddq %xmm5,%xmm12,%xmm12
vpmuludq %xmm0,%xmm7,%xmm7
vpaddq %xmm7,%xmm11,%xmm11
vpmuludq %xmm4,%xmm8,%xmm8
vpaddq %xmm8,%xmm10,%xmm10

vpshufd $0x32,0(%rdi),%xmm7
vpmuludq %xmm2,%xmm9,%xmm6
vpaddq %xmm6,%xmm14,%xmm14
vpmuludq %xmm1,%xmm9,%xmm5
vpaddq %xmm5,%xmm13,%xmm13
vpshufd $0x32,16(%rdi),%xmm8
vpmuludq %xmm0,%xmm9,%xmm9
vpaddq %xmm9,%xmm12,%xmm12
vpmuludq %xmm4,%xmm7,%xmm6
vpaddq %xmm6,%xmm11,%xmm11
vpshufd $0x32,32(%rdi),%xmm9
vpmuludq %xmm3,%xmm7,%xmm7
vpaddq %xmm7,%xmm10,%xmm10

vpmuludq %xmm1,%xmm8,%xmm5
vpaddq %xmm5,%xmm14,%xmm14
vpmuludq %xmm0,%xmm8,%xmm8
vpaddq %xmm8,%xmm13,%xmm13
vpshufd $0x32,48(%rdi),%xmm7
vpmuludq %xmm4,%xmm9,%xmm6
vpaddq %xmm6,%xmm12,%xmm12
vpshufd $0x32,64(%rdi),%xmm8
vpmuludq %xmm3,%xmm9,%xmm5
vpaddq %xmm5,%xmm11,%xmm11
vpmuludq %xmm2,%xmm9,%xmm9
vpaddq %xmm9,%xmm10,%xmm10

vpmuludq %xmm0,%xmm7,%xmm7
vpaddq %xmm7,%xmm14,%xmm14
vpmuludq %xmm4,%xmm8,%xmm6
vpaddq %xmm6,%xmm13,%xmm13
vpmuludq %xmm3,%xmm8,%xmm5
vpaddq %xmm5,%xmm12,%xmm12
vpmuludq %xmm2,%xmm8,%xmm6
vpaddq %xmm6,%xmm11,%xmm11
vpmuludq %xmm1,%xmm8,%xmm8
vpaddq %xmm8,%xmm10,%xmm10

L$short_tail_avx:



vpsrldq $8,%xmm14,%xmm9
vpsrldq $8,%xmm13,%xmm8
vpsrldq $8,%xmm11,%xmm6
vpsrldq $8,%xmm10,%xmm5
vpsrldq $8,%xmm12,%xmm7
vpaddq %xmm8,%xmm13,%xmm13
vpaddq %xmm9,%xmm14,%xmm14
vpaddq %xmm5,%xmm10,%xmm10
vpaddq %xmm6,%xmm11,%xmm11
vpaddq %xmm7,%xmm12,%xmm12




vpsrlq $26,%xmm13,%xmm3
vpand %xmm15,%xmm13,%xmm13
vpaddq %xmm3,%xmm14,%xmm14

vpsrlq $26,%xmm10,%xmm0
vpand %xmm15,%xmm10,%xmm10
vpaddq %xmm0,%xmm11,%xmm11

vpsrlq $26,%xmm14,%xmm4
vpand %xmm15,%xmm14,%xmm14

vpsrlq $26,%xmm11,%xmm1
vpand %xmm15,%xmm11,%xmm11
vpaddq %xmm1,%xmm12,%xmm12

vpaddq %xmm4,%xmm10,%xmm10
vpsllq $2,%xmm4,%xmm4
vpaddq %xmm4,%xmm10,%xmm10

vpsrlq $26,%xmm12,%xmm2
vpand %xmm15,%xmm12,%xmm12
vpaddq %xmm2,%xmm13,%xmm13

vpsrlq $26,%xmm10,%xmm0
vpand %xmm15,%xmm10,%xmm10
vpaddq %xmm0,%xmm11,%xmm11

vpsrlq $26,%xmm13,%xmm3
vpand %xmm15,%xmm13,%xmm13
vpaddq %xmm3,%xmm14,%xmm14

vmovd %xmm10,-112(%rdi)
vmovd %xmm11,-108(%rdi)
vmovd %xmm12,-104(%rdi)
vmovd %xmm13,-100(%rdi)
vmovd %xmm14,-96(%rdi)
leaq 88(%r11),%rsp
vzeroupper
.byte 0xf3,0xc3



.p2align 5
poly1305_emit_avx:
cmpl $0,20(%rdi)
je L$emit

movl 0(%rdi),%eax
movl 4(%rdi),%ecx
movl 8(%rdi),%r8d
movl 12(%rdi),%r11d
movl 16(%rdi),%r10d

shlq $26,%rcx
movq %r8,%r9
shlq $52,%r8
addq %rcx,%rax
shrq $12,%r9
addq %rax,%r8
adcq $0,%r9

shlq $14,%r11
movq %r10,%rax
shrq $24,%r10
addq %r11,%r9
shlq $40,%rax
addq %rax,%r9
adcq $0,%r10

movq %r10,%rax
movq %r10,%rcx
andq $3,%r10
shrq $2,%rax
andq $-4,%rcx
addq %rcx,%rax
addq %rax,%r8
adcq $0,%r9
adcq $0,%r10

movq %r8,%rax
addq $5,%r8
movq %r9,%rcx
adcq $0,%r9
adcq $0,%r10
shrq $2,%r10
cmovnzq %r8,%rax
cmovnzq %r9,%rcx

addq 0(%rdx),%rax
adcq 8(%rdx),%rcx
movq %rax,0(%rsi)
movq %rcx,8(%rsi)

.byte 0xf3,0xc3


.p2align 5
poly1305_blocks_avx2:
movl 20(%rdi),%r8d
cmpq $128,%rdx
jae L$blocks_avx2
testl %r8d,%r8d
jz L$blocks

L$blocks_avx2:
andq $-16,%rdx
jz L$no_data_avx2

vzeroupper

testl %r8d,%r8d
jz L$base2_64_avx2

testq $63,%rdx
jz L$even_avx2

pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
L$blocks_avx2_body:

movq %rdx,%r15

movq 0(%rdi),%r8
movq 8(%rdi),%r9
movl 16(%rdi),%ebp

movq 24(%rdi),%r11
movq 32(%rdi),%r13


movl %r8d,%r14d
andq $-2147483648,%r8
movq %r9,%r12
movl %r9d,%ebx
andq $-2147483648,%r9

shrq $6,%r8
shlq $52,%r12
addq %r8,%r14
shrq $12,%rbx
shrq $18,%r9
addq %r12,%r14
adcq %r9,%rbx

movq %rbp,%r8
shlq $40,%r8
shrq $24,%rbp
addq %r8,%rbx
adcq $0,%rbp

movq $-4,%r9
movq %rbp,%r8
andq %rbp,%r9
shrq $2,%r8
andq $3,%rbp
addq %r9,%r8
addq %r8,%r14
adcq $0,%rbx
adcq $0,%rbp

movq %r13,%r12
movq %r13,%rax
shrq $2,%r13
addq %r12,%r13

L$base2_26_pre_avx2:
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
adcq %rcx,%rbp
subq $16,%r15

call __poly1305_block
movq %r12,%rax

testq $63,%r15
jnz L$base2_26_pre_avx2

testq %rcx,%rcx
jz L$store_base2_64_avx2


movq %r14,%rax
movq %r14,%rdx
shrq $52,%r14
movq %rbx,%r11
movq %rbx,%r12
shrq $26,%rdx
andq $0x3ffffff,%rax
shlq $12,%r11
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r11,%r14
shlq $24,%rbp
andq $0x3ffffff,%r14
shrq $40,%r12
andq $0x3ffffff,%rbx
orq %r12,%rbp

testq %r15,%r15
jz L$store_base2_26_avx2

vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
vmovd %ebp,%xmm4
jmp L$proceed_avx2

.p2align 5
L$store_base2_64_avx2:
movq %r14,0(%rdi)
movq %rbx,8(%rdi)
movq %rbp,16(%rdi)
jmp L$done_avx2

.p2align 4
L$store_base2_26_avx2:
movl %eax,0(%rdi)
movl %edx,4(%rdi)
movl %r14d,8(%rdi)
movl %ebx,12(%rdi)
movl %ebp,16(%rdi)
.p2align 4
L$done_avx2:
movq 0(%rsp),%r15
movq 8(%rsp),%r14
movq 16(%rsp),%r13
movq 24(%rsp),%r12
movq 32(%rsp),%rbp
movq 40(%rsp),%rbx
leaq 48(%rsp),%rsp
L$no_data_avx2:
L$blocks_avx2_epilogue:
.byte 0xf3,0xc3

.p2align 5
L$base2_64_avx2:
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
L$base2_64_avx2_body:

movq %rdx,%r15

movq 24(%rdi),%r11
movq 32(%rdi),%r13

movq 0(%rdi),%r14
movq 8(%rdi),%rbx
movl 16(%rdi),%ebp

movq %r13,%r12
movq %r13,%rax
shrq $2,%r13
addq %r12,%r13

testq $63,%rdx
jz L$init_avx2

L$base2_64_pre_avx2:
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
adcq %rcx,%rbp
subq $16,%r15

call __poly1305_block
movq %r12,%rax

testq $63,%r15
jnz L$base2_64_pre_avx2

L$init_avx2:

movq %r14,%rax
movq %r14,%rdx
shrq $52,%r14
movq %rbx,%r8
movq %rbx,%r9
shrq $26,%rdx
andq $0x3ffffff,%rax
shlq $12,%r8
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r8,%r14
shlq $24,%rbp
andq $0x3ffffff,%r14
shrq $40,%r9
andq $0x3ffffff,%rbx
orq %r9,%rbp

vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
vmovd %ebp,%xmm4
movl $1,20(%rdi)

call __poly1305_init_avx

L$proceed_avx2:
movq %r15,%rdx

movq 0(%rsp),%r15
movq 8(%rsp),%r14
movq 16(%rsp),%r13
movq 24(%rsp),%r12
movq 32(%rsp),%rbp
movq 40(%rsp),%rbx
leaq 48(%rsp),%rax
leaq 48(%rsp),%rsp
L$base2_64_avx2_epilogue:
jmp L$do_avx2

.p2align 5
L$even_avx2:
vmovd 0(%rdi),%xmm0
vmovd 4(%rdi),%xmm1
vmovd 8(%rdi),%xmm2
vmovd 12(%rdi),%xmm3
vmovd 16(%rdi),%xmm4

L$do_avx2:
leaq -8(%rsp),%r11
subq $0x128,%rsp
leaq 48+64(%rdi),%rdi
leaq L$const(%rip),%rcx


vmovdqu -64(%rdi),%xmm9
andq $-512,%rsp
vmovdqu -48(%rdi),%xmm10
vmovdqu -32(%rdi),%xmm6
vmovdqu -16(%rdi),%xmm11
vmovdqu 0(%rdi),%xmm12
vmovdqu 16(%rdi),%xmm13
vmovdqu 32(%rdi),%xmm14
vpermq $0x15,%ymm9,%ymm9
vmovdqu 48(%rdi),%xmm15
vpermq $0x15,%ymm10,%ymm10
vpshufd $0xc8,%ymm9,%ymm9
vmovdqu 64(%rdi),%xmm5
vpermq $0x15,%ymm6,%ymm6
vpshufd $0xc8,%ymm10,%ymm10
vmovdqa %ymm9,0(%rsp)
vpermq $0x15,%ymm11,%ymm11
vpshufd $0xc8,%ymm6,%ymm6
vmovdqa %ymm10,32(%rsp)
vpermq $0x15,%ymm12,%ymm12
vpshufd $0xc8,%ymm11,%ymm11
vmovdqa %ymm6,64(%rsp)
vpermq $0x15,%ymm13,%ymm13
vpshufd $0xc8,%ymm12,%ymm12
vmovdqa %ymm11,96(%rsp)
vpermq $0x15,%ymm14,%ymm14
vpshufd $0xc8,%ymm13,%ymm13
vmovdqa %ymm12,128(%rsp)
vpermq $0x15,%ymm15,%ymm15
vpshufd $0xc8,%ymm14,%ymm14
vmovdqa %ymm13,160(%rsp)
vpermq $0x15,%ymm5,%ymm5
vpshufd $0xc8,%ymm15,%ymm15
vmovdqa %ymm14,192(%rsp)
vpshufd $0xc8,%ymm5,%ymm5
vmovdqa %ymm15,224(%rsp)
vmovdqa %ymm5,256(%rsp)
vmovdqa 64(%rcx),%ymm5



vmovdqu 0(%rsi),%xmm7
vmovdqu 16(%rsi),%xmm8
vinserti128 $1,32(%rsi),%ymm7,%ymm7
vinserti128 $1,48(%rsi),%ymm8,%ymm8
leaq 64(%rsi),%rsi

vpsrldq $6,%ymm7,%ymm9
vpsrldq $6,%ymm8,%ymm10
vpunpckhqdq %ymm8,%ymm7,%ymm6
vpunpcklqdq %ymm10,%ymm9,%ymm9
vpunpcklqdq %ymm8,%ymm7,%ymm7

vpsrlq $30,%ymm9,%ymm10
vpsrlq $4,%ymm9,%ymm9
vpsrlq $26,%ymm7,%ymm8
vpsrlq $40,%ymm6,%ymm6
vpand %ymm5,%ymm9,%ymm9
vpand %ymm5,%ymm7,%ymm7
vpand %ymm5,%ymm8,%ymm8
vpand %ymm5,%ymm10,%ymm10
vpor 32(%rcx),%ymm6,%ymm6

leaq 144(%rsp),%rax
vpaddq %ymm2,%ymm9,%ymm2
subq $64,%rdx
jz L$tail_avx2
jmp L$oop_avx2

.p2align 5
L$oop_avx2:








vpaddq %ymm0,%ymm7,%ymm0
vmovdqa 0(%rsp),%ymm7
vpaddq %ymm1,%ymm8,%ymm1
vmovdqa 32(%rsp),%ymm8
vpaddq %ymm3,%ymm10,%ymm3
vmovdqa 96(%rsp),%ymm9
vpaddq %ymm4,%ymm6,%ymm4
vmovdqa 48(%rax),%ymm10
vmovdqa 112(%rax),%ymm5
















vpmuludq %ymm2,%ymm7,%ymm13
vpmuludq %ymm2,%ymm8,%ymm14
vpmuludq %ymm2,%ymm9,%ymm15
vpmuludq %ymm2,%ymm10,%ymm11
vpmuludq %ymm2,%ymm5,%ymm12

vpmuludq %ymm0,%ymm8,%ymm6
vpmuludq %ymm1,%ymm8,%ymm2
vpaddq %ymm6,%ymm12,%ymm12
vpaddq %ymm2,%ymm13,%ymm13
vpmuludq %ymm3,%ymm8,%ymm6
vpmuludq 64(%rsp),%ymm4,%ymm2
vpaddq %ymm6,%ymm15,%ymm15
vpaddq %ymm2,%ymm11,%ymm11
vmovdqa -16(%rax),%ymm8

vpmuludq %ymm0,%ymm7,%ymm6
vpmuludq %ymm1,%ymm7,%ymm2
vpaddq %ymm6,%ymm11,%ymm11
vpaddq %ymm2,%ymm12,%ymm12
vpmuludq %ymm3,%ymm7,%ymm6
vpmuludq %ymm4,%ymm7,%ymm2
vmovdqu 0(%rsi),%xmm7
vpaddq %ymm6,%ymm14,%ymm14
vpaddq %ymm2,%ymm15,%ymm15
vinserti128 $1,32(%rsi),%ymm7,%ymm7

vpmuludq %ymm3,%ymm8,%ymm6
vpmuludq %ymm4,%ymm8,%ymm2
vmovdqu 16(%rsi),%xmm8
vpaddq %ymm6,%ymm11,%ymm11
vpaddq %ymm2,%ymm12,%ymm12
vmovdqa 16(%rax),%ymm2
vpmuludq %ymm1,%ymm9,%ymm6
vpmuludq %ymm0,%ymm9,%ymm9
vpaddq %ymm6,%ymm14,%ymm14
vpaddq %ymm9,%ymm13,%ymm13
vinserti128 $1,48(%rsi),%ymm8,%ymm8
leaq 64(%rsi),%rsi

vpmuludq %ymm1,%ymm2,%ymm6
vpmuludq %ymm0,%ymm2,%ymm2
vpsrldq $6,%ymm7,%ymm9
vpaddq %ymm6,%ymm15,%ymm15
vpaddq %ymm2,%ymm14,%ymm14
vpmuludq %ymm3,%ymm10,%ymm6
vpmuludq %ymm4,%ymm10,%ymm2
vpsrldq $6,%ymm8,%ymm10
vpaddq %ymm6,%ymm12,%ymm12
vpaddq %ymm2,%ymm13,%ymm13
vpunpckhqdq %ymm8,%ymm7,%ymm6

vpmuludq %ymm3,%ymm5,%ymm3
vpmuludq %ymm4,%ymm5,%ymm4
vpunpcklqdq %ymm8,%ymm7,%ymm7
vpaddq %ymm3,%ymm13,%ymm2
vpaddq %ymm4,%ymm14,%ymm3
vpunpcklqdq %ymm10,%ymm9,%ymm10
vpmuludq 80(%rax),%ymm0,%ymm4
vpmuludq %ymm1,%ymm5,%ymm0
vmovdqa 64(%rcx),%ymm5
vpaddq %ymm4,%ymm15,%ymm4
vpaddq %ymm0,%ymm11,%ymm0




vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4

vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm12,%ymm1

vpsrlq $26,%ymm4,%ymm15
vpand %ymm5,%ymm4,%ymm4

vpsrlq $4,%ymm10,%ymm9

vpsrlq $26,%ymm1,%ymm12
vpand %ymm5,%ymm1,%ymm1
vpaddq %ymm12,%ymm2,%ymm2

vpaddq %ymm15,%ymm0,%ymm0
vpsllq $2,%ymm15,%ymm15
vpaddq %ymm15,%ymm0,%ymm0

vpand %ymm5,%ymm9,%ymm9
vpsrlq $26,%ymm7,%ymm8

vpsrlq $26,%ymm2,%ymm13
vpand %ymm5,%ymm2,%ymm2
vpaddq %ymm13,%ymm3,%ymm3

vpaddq %ymm9,%ymm2,%ymm2
vpsrlq $30,%ymm10,%ymm10

vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm1,%ymm1

vpsrlq $40,%ymm6,%ymm6

vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4

vpand %ymm5,%ymm7,%ymm7
vpand %ymm5,%ymm8,%ymm8
vpand %ymm5,%ymm10,%ymm10
vpor 32(%rcx),%ymm6,%ymm6

subq $64,%rdx
jnz L$oop_avx2

.byte 0x66,0x90
L$tail_avx2:







vpaddq %ymm0,%ymm7,%ymm0
vmovdqu 4(%rsp),%ymm7
vpaddq %ymm1,%ymm8,%ymm1
vmovdqu 36(%rsp),%ymm8
vpaddq %ymm3,%ymm10,%ymm3
vmovdqu 100(%rsp),%ymm9
vpaddq %ymm4,%ymm6,%ymm4
vmovdqu 52(%rax),%ymm10
vmovdqu 116(%rax),%ymm5

vpmuludq %ymm2,%ymm7,%ymm13
vpmuludq %ymm2,%ymm8,%ymm14
vpmuludq %ymm2,%ymm9,%ymm15
vpmuludq %ymm2,%ymm10,%ymm11
vpmuludq %ymm2,%ymm5,%ymm12

vpmuludq %ymm0,%ymm8,%ymm6
vpmuludq %ymm1,%ymm8,%ymm2
vpaddq %ymm6,%ymm12,%ymm12
vpaddq %ymm2,%ymm13,%ymm13
vpmuludq %ymm3,%ymm8,%ymm6
vpmuludq 68(%rsp),%ymm4,%ymm2
vpaddq %ymm6,%ymm15,%ymm15
vpaddq %ymm2,%ymm11,%ymm11

vpmuludq %ymm0,%ymm7,%ymm6
vpmuludq %ymm1,%ymm7,%ymm2
vpaddq %ymm6,%ymm11,%ymm11
vmovdqu -12(%rax),%ymm8
vpaddq %ymm2,%ymm12,%ymm12
vpmuludq %ymm3,%ymm7,%ymm6
vpmuludq %ymm4,%ymm7,%ymm2
vpaddq %ymm6,%ymm14,%ymm14
vpaddq %ymm2,%ymm15,%ymm15

vpmuludq %ymm3,%ymm8,%ymm6
vpmuludq %ymm4,%ymm8,%ymm2
vpaddq %ymm6,%ymm11,%ymm11
vpaddq %ymm2,%ymm12,%ymm12
vmovdqu 20(%rax),%ymm2
vpmuludq %ymm1,%ymm9,%ymm6
vpmuludq %ymm0,%ymm9,%ymm9
vpaddq %ymm6,%ymm14,%ymm14
vpaddq %ymm9,%ymm13,%ymm13

vpmuludq %ymm1,%ymm2,%ymm6
vpmuludq %ymm0,%ymm2,%ymm2
vpaddq %ymm6,%ymm15,%ymm15
vpaddq %ymm2,%ymm14,%ymm14
vpmuludq %ymm3,%ymm10,%ymm6
vpmuludq %ymm4,%ymm10,%ymm2
vpaddq %ymm6,%ymm12,%ymm12
vpaddq %ymm2,%ymm13,%ymm13

vpmuludq %ymm3,%ymm5,%ymm3
vpmuludq %ymm4,%ymm5,%ymm4
vpaddq %ymm3,%ymm13,%ymm2
vpaddq %ymm4,%ymm14,%ymm3
vpmuludq 84(%rax),%ymm0,%ymm4
vpmuludq %ymm1,%ymm5,%ymm0
vmovdqa 64(%rcx),%ymm5
vpaddq %ymm4,%ymm15,%ymm4
vpaddq %ymm0,%ymm11,%ymm0




vpsrldq $8,%ymm12,%ymm8
vpsrldq $8,%ymm2,%ymm9
vpsrldq $8,%ymm3,%ymm10
vpsrldq $8,%ymm4,%ymm6
vpsrldq $8,%ymm0,%ymm7
vpaddq %ymm8,%ymm12,%ymm12
vpaddq %ymm9,%ymm2,%ymm2
vpaddq %ymm10,%ymm3,%ymm3
vpaddq %ymm6,%ymm4,%ymm4
vpaddq %ymm7,%ymm0,%ymm0

vpermq $0x2,%ymm3,%ymm10
vpermq $0x2,%ymm4,%ymm6
vpermq $0x2,%ymm0,%ymm7
vpermq $0x2,%ymm12,%ymm8
vpermq $0x2,%ymm2,%ymm9
vpaddq %ymm10,%ymm3,%ymm3
vpaddq %ymm6,%ymm4,%ymm4
vpaddq %ymm7,%ymm0,%ymm0
vpaddq %ymm8,%ymm12,%ymm12
vpaddq %ymm9,%ymm2,%ymm2




vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4

vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm12,%ymm1

vpsrlq $26,%ymm4,%ymm15
vpand %ymm5,%ymm4,%ymm4

vpsrlq $26,%ymm1,%ymm12
vpand %ymm5,%ymm1,%ymm1
vpaddq %ymm12,%ymm2,%ymm2

vpaddq %ymm15,%ymm0,%ymm0
vpsllq $2,%ymm15,%ymm15
vpaddq %ymm15,%ymm0,%ymm0

vpsrlq $26,%ymm2,%ymm13
vpand %ymm5,%ymm2,%ymm2
vpaddq %ymm13,%ymm3,%ymm3

vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm1,%ymm1

vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4

vmovd %xmm0,-112(%rdi)
vmovd %xmm1,-108(%rdi)
vmovd %xmm2,-104(%rdi)
vmovd %xmm3,-100(%rdi)
vmovd %xmm4,-96(%rdi)
leaq 8(%r11),%rsp
vzeroupper
.byte 0xf3,0xc3

.p2align 6
L$const:
L$mask24:
.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
L$129:
.long 16777216,0,16777216,0,16777216,0,16777216,0
L$mask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
L$five:
.long 5,0,5,0,5,0,5,0
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 4
#endif