| // This file is generated from a similarly-named Perl script in the BoringSSL |
| // source tree. Do not edit by hand. |
| /* Supply a fallback __has_feature for compilers that lack it, then define OPENSSL_NO_ASM |
| * when the memory_sanitizer feature is reported, which removes everything guarded by |
| * the OPENSSL_NO_ASM check further down. */ |
| #if !defined(__has_feature) |
| #define __has_feature(x) 0 |
| #endif |
| #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) |
| #define OPENSSL_NO_ASM |
| #endif |
| |
| #if !defined(OPENSSL_NO_ASM) |
| #if defined(BORINGSSL_PREFIX) |
| #include <boringssl_prefix_symbols_asm.h> |
| #endif |
| #include <openssl/arm_arch.h> |
| |
| #if __ARM_MAX_ARCH__>=7 |
| .text |
| /* Lrcon: read-only constants for the key-expansion code, three rows of four identical 32-bit words. |
| * The key setup routine loads rows 0 and 1 together and fetches row 2 later, on the 128-bit path only. */ |
| .section __TEXT,__const |
| .align 5 |
| Lrcon: |
| .long 0x01,0x01,0x01,0x01 /* row 0: first round constant, doubled with shl between rounds */ |
| .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d /* row 1: byte-index mask for tbl */ // rotate-n-splat |
| .long 0x1b,0x1b,0x1b,0x1b /* row 2: round constant for the last two rounds of the 128-bit schedule */ |
| |
| .text |
| /* aes_hw_set_encrypt_key (local alias Lenc_key): expand a user key into an AES round-key schedule. |
| * In: x0 = key bytes, w1 = key size in bits (only 128, 192 and 256 pass the checks), x2 = schedule to fill. |
| * Out: x0 = 0 on success, -1 if x0 or x2 is zero, -2 if the key size is rejected. |
| * On success the round keys are stored from x2 upward and the round count (10, 12 or 14) is stored |
| * as a 32-bit word 240 bytes past the start of the schedule. At that point x2 addresses that word |
| * and w12 holds the count. aes_hw_set_decrypt_key relies on both after calling Lenc_key. |
| * Scratch: w1, x3, w12, v0 to v6. */ |
| .globl _aes_hw_set_encrypt_key |
| .private_extern _aes_hw_set_encrypt_key |
| |
| .align 5 |
| _aes_hw_set_encrypt_key: |
| Lenc_key: |
| // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. |
| AARCH64_VALID_CALL_TARGET |
| stp x29,x30,[sp,#-16]! |
| add x29,sp,#0 |
| mov x3,#-1 /* x3 = pending return code, -1 for a null pointer */ |
| cmp x0,#0 |
| b.eq Lenc_key_abort |
| cmp x2,#0 |
| b.eq Lenc_key_abort |
| mov x3,#-2 /* pending return code becomes -2 for a bad key size */ |
| cmp w1,#128 |
| b.lt Lenc_key_abort |
| cmp w1,#256 |
| b.gt Lenc_key_abort |
| tst w1,#0x3f /* the size must also be a multiple of 64 */ |
| b.ne Lenc_key_abort |
| /* Arguments accepted. x3 is reused as the Lrcon pointer and is only reset to 0 at Ldone. */ |
| adrp x3,Lrcon@PAGE |
| add x3,x3,Lrcon@PAGEOFF |
| cmp w1,#192 /* flags stay live until the three-way branch after the loads */ |
| |
| eor v0.16b,v0.16b,v0.16b /* v0 = 0: zero round key for aese and zero fill for ext */ |
| ld1 {v3.16b},[x0],#16 /* v3 = first 16 key bytes */ |
| mov w1,#8 /* iteration count for Loop128 and Loop192 */ // reuse w1 |
| ld1 {v1.4s,v2.4s},[x3],#32 /* v1 = Lrcon row 0, v2 = tbl mask, x3 now addresses row 2 */ |
| |
| b.lt Loop128 |
| b.eq L192 |
| b L256 |
| /* 128-bit schedule: 8 loop rounds, then two unrolled rounds. Each round stores the current key v3 and derives the next one in v3. */ |
| .align 4 |
| Loop128: |
| tbl v6.16b,{v3.16b},v2.16b /* rotated last word of v3, copied to every lane */ |
| ext v5.16b,v0.16b,v3.16b,#12 /* v5 = v3 moved up one 32-bit lane, zero filled */ |
| st1 {v3.4s},[x2],#16 |
| aese v6.16b,v0.16b /* zero key and equal lanes: this reduces to the S-box substitution */ |
| subs w1,w1,#1 /* flags consumed by the b.ne at the bottom of the loop */ |
| /* v3 is xored with three successive one-lane shifts of itself, then with the substituted word xor the round constant. */ |
| eor v3.16b,v3.16b,v5.16b |
| ext v5.16b,v0.16b,v5.16b,#12 |
| eor v3.16b,v3.16b,v5.16b |
| ext v5.16b,v0.16b,v5.16b,#12 |
| eor v6.16b,v6.16b,v1.16b |
| eor v3.16b,v3.16b,v5.16b |
| shl v1.16b,v1.16b,#1 /* double the round constant for the next round */ |
| eor v3.16b,v3.16b,v6.16b |
| b.ne Loop128 |
| /* w1 reached zero after 8 rounds: fetch Lrcon row 2 (0x1b) for the remaining two rounds. */ |
| ld1 {v1.4s},[x3] |
| |
| tbl v6.16b,{v3.16b},v2.16b |
| ext v5.16b,v0.16b,v3.16b,#12 |
| st1 {v3.4s},[x2],#16 |
| aese v6.16b,v0.16b |
| |
| eor v3.16b,v3.16b,v5.16b |
| ext v5.16b,v0.16b,v5.16b,#12 |
| eor v3.16b,v3.16b,v5.16b |
| ext v5.16b,v0.16b,v5.16b,#12 |
| eor v6.16b,v6.16b,v1.16b |
| eor v3.16b,v3.16b,v5.16b |
| shl v1.16b,v1.16b,#1 |
| eor v3.16b,v3.16b,v6.16b |
| |
| tbl v6.16b,{v3.16b},v2.16b |
| ext v5.16b,v0.16b,v3.16b,#12 |
| st1 {v3.4s},[x2],#16 |
| aese v6.16b,v0.16b |
| |
| eor v3.16b,v3.16b,v5.16b |
| ext v5.16b,v0.16b,v5.16b,#12 |
| eor v3.16b,v3.16b,v5.16b |
| ext v5.16b,v0.16b,v5.16b,#12 |
| eor v6.16b,v6.16b,v1.16b |
| eor v3.16b,v3.16b,v5.16b |
| eor v3.16b,v3.16b,v6.16b |
| st1 {v3.4s},[x2] /* last round key, stored without post-increment */ |
| add x2,x2,#0x50 /* x2 = schedule start + 240, where Ldone stores the round count */ |
| |
| mov w12,#10 /* round count for a 128-bit key */ |
| b Ldone |
| /* 192-bit schedule: after the first 16 bytes, each of the 8 iterations stores 24 bytes of key material. */ |
| .align 4 |
| L192: |
| ld1 {v4.8b},[x0],#8 /* v4 = key bytes 16 to 23 */ |
| movi v6.16b,#8 // borrow v6.16b |
| st1 {v3.4s},[x2],#16 |
| sub v2.16b,v2.16b,v6.16b // adjust the mask |
| |
| Loop192: |
| tbl v6.16b,{v4.16b},v2.16b |
| ext v5.16b,v0.16b,v3.16b,#12 |
| st1 {v4.8b},[x2],#8 |
| aese v6.16b,v0.16b |
| subs w1,w1,#1 /* flags consumed by the b.ne at the bottom of the loop */ |
| |
| eor v3.16b,v3.16b,v5.16b |
| ext v5.16b,v0.16b,v5.16b,#12 |
| eor v3.16b,v3.16b,v5.16b |
| ext v5.16b,v0.16b,v5.16b,#12 |
| eor v3.16b,v3.16b,v5.16b |
| /* Carry the chain from v3 into the extra key bytes held in v4. */ |
| dup v5.4s,v3.s[3] |
| eor v5.16b,v5.16b,v4.16b |
| eor v6.16b,v6.16b,v1.16b |
| ext v4.16b,v0.16b,v4.16b,#12 |
| shl v1.16b,v1.16b,#1 |
| eor v4.16b,v4.16b,v5.16b |
| eor v3.16b,v3.16b,v6.16b |
| eor v4.16b,v4.16b,v6.16b |
| st1 {v3.4s},[x2],#16 |
| b.ne Loop192 |
| |
| mov w12,#12 /* round count for a 192-bit key */ |
| add x2,x2,#0x20 /* x2 = schedule start + 240 */ |
| b Ldone |
| /* 256-bit schedule: 7 iterations, the last one leaves through the b.eq after its first half. */ |
| .align 4 |
| L256: |
| ld1 {v4.16b},[x0] /* v4 = key bytes 16 to 31 */ |
| mov w1,#7 |
| mov w12,#14 /* round count for a 256-bit key */ |
| st1 {v3.4s},[x2],#16 |
| |
| Loop256: |
| tbl v6.16b,{v4.16b},v2.16b |
| ext v5.16b,v0.16b,v3.16b,#12 |
| st1 {v4.4s},[x2],#16 |
| aese v6.16b,v0.16b |
| subs w1,w1,#1 /* flags consumed by the b.eq Ldone below */ |
| |
| eor v3.16b,v3.16b,v5.16b |
| ext v5.16b,v0.16b,v5.16b,#12 |
| eor v3.16b,v3.16b,v5.16b |
| ext v5.16b,v0.16b,v5.16b,#12 |
| eor v6.16b,v6.16b,v1.16b |
| eor v3.16b,v3.16b,v5.16b |
| shl v1.16b,v1.16b,#1 |
| eor v3.16b,v3.16b,v6.16b |
| st1 {v3.4s},[x2],#16 |
| b.eq Ldone |
| /* Second half: v4 gets the same lane-shift treatment, using the substituted last word of v3 with no rotation and no round constant. */ |
| dup v6.4s,v3.s[3] // just splat |
| ext v5.16b,v0.16b,v4.16b,#12 |
| aese v6.16b,v0.16b |
| |
| eor v4.16b,v4.16b,v5.16b |
| ext v5.16b,v0.16b,v5.16b,#12 |
| eor v4.16b,v4.16b,v5.16b |
| ext v5.16b,v0.16b,v5.16b,#12 |
| eor v4.16b,v4.16b,v5.16b |
| |
| eor v4.16b,v4.16b,v6.16b |
| b Loop256 |
| |
| Ldone: |
| str w12,[x2] /* round count goes 240 bytes past the start of the schedule */ |
| mov x3,#0 /* success */ |
| |
| Lenc_key_abort: |
| mov x0,x3 // return value |
| ldr x29,[sp],#16 /* drop the frame. x30 is never written in this routine, so only x29 is reloaded */ |
| ret |
| |
| /* aes_hw_set_decrypt_key: build a key schedule for decryption. |
| * x0, w1 and x2 are handed unchanged to Lenc_key, so they have the same meaning as for |
| * aes_hw_set_encrypt_key. A nonzero result from Lenc_key is returned as is. Otherwise the |
| * schedule is rewritten in place: the order of the round keys is reversed and aesimc is applied |
| * to every key except the first and the last. Returns 0 in x0 on success. |
| * Scratch: x4, v0, v1 in addition to whatever Lenc_key uses. */ |
| .globl _aes_hw_set_decrypt_key |
| .private_extern _aes_hw_set_decrypt_key |
| |
| .align 5 |
| _aes_hw_set_decrypt_key: |
| AARCH64_SIGN_LINK_REGISTER /* macro not defined in this file, presumably from openssl/arm_arch.h. Paired with the VALIDATE macro before ret */ |
| stp x29,x30,[sp,#-16]! |
| add x29,sp,#0 |
| bl Lenc_key /* overwrites x30, hence the full ldp restore at the end */ |
| |
| cmp x0,#0 |
| b.ne Ldec_key_abort /* pass the error code from Lenc_key through in x0 */ |
| |
| sub x2,x2,#240 // restore original x2 |
| mov x4,#-16 /* post-index step for walking x0 downward */ |
| add x0,x2,x12,lsl#4 /* x12 = round count left by Lenc_key */ // end of key schedule |
| /* Swap the first and the last round key without transforming them. */ |
| ld1 {v0.4s},[x2] |
| ld1 {v1.4s},[x0] |
| st1 {v0.4s},[x0],x4 |
| st1 {v1.4s},[x2],#16 |
| /* Walk inward from both ends, swapping each pair of keys and applying aesimc to both. */ |
| Loop_imc: |
| ld1 {v0.4s},[x2] |
| ld1 {v1.4s},[x0] |
| aesimc v0.16b,v0.16b |
| aesimc v1.16b,v1.16b |
| st1 {v0.4s},[x0],x4 |
| st1 {v1.4s},[x2],#16 |
| cmp x0,x2 |
| b.hi Loop_imc /* continue while the upper pointer is still above the lower one */ |
| /* The remaining middle key is read through x2 and written through x0. For the even round counts produced by Lenc_key both address the same key. */ |
| ld1 {v0.4s},[x2] |
| aesimc v0.16b,v0.16b |
| st1 {v0.4s},[x0] |
| |
| eor x0,x0,x0 // return value |
| Ldec_key_abort: |
| ldp x29,x30,[sp],#16 |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| /* aes_hw_encrypt: encrypt a single 16-byte block. |
| * In: x0 = input block, x1 = output block, x2 = key schedule (round count is read from byte offset 240). |
| * Leaf routine, no stack frame. Scratch: w3, x2, v0, v1, v2. */ |
| .globl _aes_hw_encrypt |
| .private_extern _aes_hw_encrypt |
| |
| .align 5 |
| _aes_hw_encrypt: |
| AARCH64_VALID_CALL_TARGET |
| ldr w3,[x2,#240] /* w3 = round count, read before x2 starts moving */ |
| ld1 {v0.4s},[x2],#16 /* v0 = round key 0 */ |
| ld1 {v2.16b},[x0] /* v2 = block being processed */ |
| sub w3,w3,#2 /* the loop consumes two rounds per iteration */ |
| ld1 {v1.4s},[x2],#16 /* v1 = round key 1 */ |
| /* Each iteration applies the two keys in v0 and v1 and reloads both for the next pass. */ |
| Loop_enc: |
| aese v2.16b,v0.16b |
| aesmc v2.16b,v2.16b |
| ld1 {v0.4s},[x2],#16 |
| subs w3,w3,#2 /* flags consumed by the b.gt below */ |
| aese v2.16b,v1.16b |
| aesmc v2.16b,v2.16b |
| ld1 {v1.4s},[x2],#16 |
| b.gt Loop_enc |
| /* Last two rounds: the final aese is not followed by aesmc and the last round key is applied with eor. */ |
| aese v2.16b,v0.16b |
| aesmc v2.16b,v2.16b |
| ld1 {v0.4s},[x2] /* v0 = last round key */ |
| aese v2.16b,v1.16b |
| eor v2.16b,v2.16b,v0.16b |
| |
| st1 {v2.16b},[x1] |
| ret |
| /* aes_hw_decrypt: decrypt a single 16-byte block. |
| * In: x0 = input block, x1 = output block, x2 = key schedule (round count is read from byte offset 240). |
| * Same structure as aes_hw_encrypt with aesd and aesimc in place of aese and aesmc. |
| * Leaf routine, no stack frame. Scratch: w3, x2, v0, v1, v2. */ |
| .globl _aes_hw_decrypt |
| .private_extern _aes_hw_decrypt |
| |
| .align 5 |
| _aes_hw_decrypt: |
| AARCH64_VALID_CALL_TARGET |
| ldr w3,[x2,#240] /* w3 = round count, read before x2 starts moving */ |
| ld1 {v0.4s},[x2],#16 /* v0 = first key of the schedule */ |
| ld1 {v2.16b},[x0] /* v2 = block being processed */ |
| sub w3,w3,#2 /* the loop consumes two rounds per iteration */ |
| ld1 {v1.4s},[x2],#16 /* v1 = second key of the schedule */ |
| /* Each iteration applies the two keys in v0 and v1 and reloads both for the next pass. */ |
| Loop_dec: |
| aesd v2.16b,v0.16b |
| aesimc v2.16b,v2.16b |
| ld1 {v0.4s},[x2],#16 |
| subs w3,w3,#2 /* flags consumed by the b.gt below */ |
| aesd v2.16b,v1.16b |
| aesimc v2.16b,v2.16b |
| ld1 {v1.4s},[x2],#16 |
| b.gt Loop_dec |
| /* Last two rounds: the final aesd is not followed by aesimc and the last key is applied with eor. */ |
| aesd v2.16b,v0.16b |
| aesimc v2.16b,v2.16b |
| ld1 {v0.4s},[x2] /* v0 = last key of the schedule */ |
| aesd v2.16b,v1.16b |
| eor v2.16b,v2.16b,v0.16b |
| |
| st1 {v2.16b},[x1] |
| ret |
| /* aes_hw_cbc_encrypt: CBC-mode encryption or decryption of whole 16-byte blocks. |
| * In: x0 = input, x1 = output, x2 = length in bytes, x3 = key schedule (round count at byte offset 240), |
| * x4 = 16-byte IV buffer, w5 = direction (zero selects decryption, any other value encryption). |
| * A length below 16 returns at once without reading or writing the IV. Otherwise the length is |
| * rounded down to a multiple of 16 and the final chaining value is written back through x4. |
| * Scratch: x0 to x3, x5 to x8, x12, x14, v0 to v7, v16 to v23. */ |
| .globl _aes_hw_cbc_encrypt |
| .private_extern _aes_hw_cbc_encrypt |
| |
| .align 5 |
| _aes_hw_cbc_encrypt: |
| // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. |
| AARCH64_VALID_CALL_TARGET |
| stp x29,x30,[sp,#-16]! |
| add x29,sp,#0 |
| subs x2,x2,#16 /* x2 = length minus one block */ |
| mov x8,#16 /* x8 = post-index step for input loads */ |
| b.lo Lcbc_abort /* fewer than 16 bytes: nothing to do */ |
| csel x8,xzr,x8,eq /* exactly one block: step 0, so later loads re-read it instead of running past the input */ |
| |
| cmp w5,#0 // en- or decrypting? |
| ldr w5,[x3,#240] /* w5 = round count. The flags of the cmp above stay live until b.eq Lcbc_dec */ |
| and x2,x2,#-16 /* ignore a partial trailing block */ |
| ld1 {v6.16b},[x4] /* v6 = IV, later the running chaining value */ |
| ld1 {v0.16b},[x0],x8 /* v0 = first input block */ |
| |
| ld1 {v16.4s,v17.4s},[x3] // load key schedule... |
| sub w5,w5,#6 |
| add x7,x3,x5,lsl#4 // pointer to last 7 round keys |
| sub w5,w5,#2 /* w5 = round count minus 8 */ |
| ld1 {v18.4s,v19.4s},[x7],#32 |
| ld1 {v20.4s,v21.4s},[x7],#32 |
| ld1 {v22.4s,v23.4s},[x7],#32 |
| ld1 {v7.4s},[x7] /* v7 = last round key */ |
| |
| add x7,x3,#32 /* x7 = address of round key 2 */ |
| mov w6,w5 |
| b.eq Lcbc_dec |
| /* Encryption. w5 equal to 2 means a 10-round key, 4 means a 12-round key. */ |
| cmp w5,#2 |
| eor v0.16b,v0.16b,v6.16b /* first block xor IV */ |
| eor v5.16b,v16.16b,v7.16b /* v5 = round key 0 xor last round key */ |
| b.eq Lcbc_enc128 |
| /* 12 and 14 round keys: v2 and v3 hold round keys 2 and 3. x7, x6, x12, x14 and x3 address round keys 1, 4, 5, 6 and 7. */ |
| ld1 {v2.4s,v3.4s},[x7] |
| add x7,x3,#16 |
| add x6,x3,#16*4 |
| add x12,x3,#16*5 |
| aese v0.16b,v16.16b |
| aesmc v0.16b,v0.16b |
| add x14,x3,#16*6 |
| add x3,x3,#16*7 |
| b Lenter_cbc_enc |
| /* On loop re-entry v16 = next input xor v5 and v0 still lacks the last-round-key xor, so the first aese of the new block also performs the chaining xor. */ |
| .align 4 |
| Loop_cbc_enc: |
| aese v0.16b,v16.16b |
| aesmc v0.16b,v0.16b |
| st1 {v6.16b},[x1],#16 /* store the previous ciphertext block */ |
| Lenter_cbc_enc: |
| aese v0.16b,v17.16b |
| aesmc v0.16b,v0.16b |
| aese v0.16b,v2.16b |
| aesmc v0.16b,v0.16b |
| ld1 {v16.4s},[x6] |
| cmp w5,#4 |
| aese v0.16b,v3.16b |
| aesmc v0.16b,v0.16b |
| ld1 {v17.4s},[x12] |
| b.eq Lcbc_enc192 /* a 12-round key skips the two extra rounds below */ |
| |
| aese v0.16b,v16.16b |
| aesmc v0.16b,v0.16b |
| ld1 {v16.4s},[x14] |
| aese v0.16b,v17.16b |
| aesmc v0.16b,v0.16b |
| ld1 {v17.4s},[x3] |
| nop |
| |
| Lcbc_enc192: |
| aese v0.16b,v16.16b |
| aesmc v0.16b,v0.16b |
| subs x2,x2,#16 /* flags consumed by the b.hs at the bottom of the loop */ |
| aese v0.16b,v17.16b |
| aesmc v0.16b,v0.16b |
| csel x8,xzr,x8,eq /* the next block is the last one: stop advancing x0 */ |
| aese v0.16b,v18.16b |
| aesmc v0.16b,v0.16b |
| aese v0.16b,v19.16b |
| aesmc v0.16b,v0.16b |
| ld1 {v16.16b},[x0],x8 /* next input block */ |
| aese v0.16b,v20.16b |
| aesmc v0.16b,v0.16b |
| eor v16.16b,v16.16b,v5.16b |
| aese v0.16b,v21.16b |
| aesmc v0.16b,v0.16b |
| ld1 {v17.4s},[x7] // re-pre-load rndkey[1] |
| aese v0.16b,v22.16b |
| aesmc v0.16b,v0.16b |
| aese v0.16b,v23.16b |
| eor v6.16b,v0.16b,v7.16b /* v6 = ciphertext block, also the next chaining value */ |
| b.hs Loop_cbc_enc |
| |
| st1 {v6.16b},[x1],#16 /* final ciphertext block */ |
| b Lcbc_done |
| /* 10-round key: no round key is reloaded inside the loop, the only load is the next input block. */ |
| .align 5 |
| Lcbc_enc128: |
| ld1 {v2.4s,v3.4s},[x7] |
| aese v0.16b,v16.16b |
| aesmc v0.16b,v0.16b |
| b Lenter_cbc_enc128 |
| Loop_cbc_enc128: |
| aese v0.16b,v16.16b |
| aesmc v0.16b,v0.16b |
| st1 {v6.16b},[x1],#16 /* store the previous ciphertext block */ |
| Lenter_cbc_enc128: |
| aese v0.16b,v17.16b |
| aesmc v0.16b,v0.16b |
| subs x2,x2,#16 /* flags consumed by the b.hs at the bottom of the loop */ |
| aese v0.16b,v2.16b |
| aesmc v0.16b,v0.16b |
| csel x8,xzr,x8,eq /* the next block is the last one: stop advancing x0 */ |
| aese v0.16b,v3.16b |
| aesmc v0.16b,v0.16b |
| aese v0.16b,v18.16b |
| aesmc v0.16b,v0.16b |
| aese v0.16b,v19.16b |
| aesmc v0.16b,v0.16b |
| ld1 {v16.16b},[x0],x8 /* next input block */ |
| aese v0.16b,v20.16b |
| aesmc v0.16b,v0.16b |
| aese v0.16b,v21.16b |
| aesmc v0.16b,v0.16b |
| aese v0.16b,v22.16b |
| aesmc v0.16b,v0.16b |
| eor v16.16b,v16.16b,v5.16b |
| aese v0.16b,v23.16b |
| eor v6.16b,v0.16b,v7.16b /* v6 = ciphertext block, also the next chaining value */ |
| b.hs Loop_cbc_enc128 |
| |
| st1 {v6.16b},[x1],#16 /* final ciphertext block */ |
| b Lcbc_done |
| .align 5 |
| Lcbc_dec: /* Decryption: three blocks per main-loop iteration, one or two blocks in the tail */ |
| ld1 {v18.16b},[x0],#16 /* second input block, or the first one again when x8 was 0 */ |
| subs x2,x2,#32 // bias |
| add w6,w5,#2 /* w6 = counter for the round loops */ |
| orr v3.16b,v0.16b,v0.16b |
| orr v1.16b,v0.16b,v0.16b |
| orr v19.16b,v18.16b,v18.16b |
| b.lo Lcbc_dec_tail /* fewer than three blocks in total */ |
| /* Three or more blocks: v0, v1, v18 are decrypted, v2, v3, v19 keep copies of their ciphertext for chaining. */ |
| orr v1.16b,v18.16b,v18.16b |
| ld1 {v18.16b},[x0],#16 |
| orr v2.16b,v0.16b,v0.16b |
| orr v3.16b,v1.16b,v1.16b |
| orr v19.16b,v18.16b,v18.16b |
| |
| Loop3x_cbc_dec: |
| aesd v0.16b,v16.16b |
| aesimc v0.16b,v0.16b |
| aesd v1.16b,v16.16b |
| aesimc v1.16b,v1.16b |
| aesd v18.16b,v16.16b |
| aesimc v18.16b,v18.16b |
| ld1 {v16.4s},[x7],#16 |
| subs w6,w6,#2 |
| aesd v0.16b,v17.16b |
| aesimc v0.16b,v0.16b |
| aesd v1.16b,v17.16b |
| aesimc v1.16b,v1.16b |
| aesd v18.16b,v17.16b |
| aesimc v18.16b,v18.16b |
| ld1 {v17.4s},[x7],#16 |
| b.gt Loop3x_cbc_dec |
| /* Final rounds, interleaved with preparing the chaining xors and loading the next three blocks. */ |
| aesd v0.16b,v16.16b |
| aesimc v0.16b,v0.16b |
| aesd v1.16b,v16.16b |
| aesimc v1.16b,v1.16b |
| aesd v18.16b,v16.16b |
| aesimc v18.16b,v18.16b |
| eor v4.16b,v6.16b,v7.16b /* v4 = chaining value xor last round key, for the first block */ |
| subs x2,x2,#0x30 /* flags consumed by the csel below and by the b.hs that closes the loop */ |
| eor v5.16b,v2.16b,v7.16b |
| csel x6,x2,x6,lo // x6, w6, is zero at this point |
| aesd v0.16b,v17.16b |
| aesimc v0.16b,v0.16b |
| aesd v1.16b,v17.16b |
| aesimc v1.16b,v1.16b |
| aesd v18.16b,v17.16b |
| aesimc v18.16b,v18.16b |
| eor v17.16b,v3.16b,v7.16b |
| add x0,x0,x6 // x0 is adjusted in such way that |
| // at exit from the loop v1.16b-v18.16b |
| // are loaded with last "words" |
| orr v6.16b,v19.16b,v19.16b /* new chaining value = ciphertext of the third block */ |
| mov x7,x3 /* rewind the round-key pointer */ |
| aesd v0.16b,v20.16b |
| aesimc v0.16b,v0.16b |
| aesd v1.16b,v20.16b |
| aesimc v1.16b,v1.16b |
| aesd v18.16b,v20.16b |
| aesimc v18.16b,v18.16b |
| ld1 {v2.16b},[x0],#16 |
| aesd v0.16b,v21.16b |
| aesimc v0.16b,v0.16b |
| aesd v1.16b,v21.16b |
| aesimc v1.16b,v1.16b |
| aesd v18.16b,v21.16b |
| aesimc v18.16b,v18.16b |
| ld1 {v3.16b},[x0],#16 |
| aesd v0.16b,v22.16b |
| aesimc v0.16b,v0.16b |
| aesd v1.16b,v22.16b |
| aesimc v1.16b,v1.16b |
| aesd v18.16b,v22.16b |
| aesimc v18.16b,v18.16b |
| ld1 {v19.16b},[x0],#16 |
| aesd v0.16b,v23.16b |
| aesd v1.16b,v23.16b |
| aesd v18.16b,v23.16b |
| ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] |
| add w6,w5,#2 |
| eor v4.16b,v4.16b,v0.16b |
| eor v5.16b,v5.16b,v1.16b |
| eor v18.16b,v18.16b,v17.16b |
| ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] |
| st1 {v4.16b},[x1],#16 |
| orr v0.16b,v2.16b,v2.16b |
| st1 {v5.16b},[x1],#16 |
| orr v1.16b,v3.16b,v3.16b |
| st1 {v18.16b},[x1],#16 |
| orr v18.16b,v19.16b,v19.16b |
| b.hs Loop3x_cbc_dec |
| |
| cmn x2,#0x30 /* x2 equal to -0x30 means no blocks are left */ |
| b.eq Lcbc_done |
| nop |
| |
| Lcbc_dec_tail: |
| aesd v1.16b,v16.16b |
| aesimc v1.16b,v1.16b |
| aesd v18.16b,v16.16b |
| aesimc v18.16b,v18.16b |
| ld1 {v16.4s},[x7],#16 |
| subs w6,w6,#2 |
| aesd v1.16b,v17.16b |
| aesimc v1.16b,v1.16b |
| aesd v18.16b,v17.16b |
| aesimc v18.16b,v18.16b |
| ld1 {v17.4s},[x7],#16 |
| b.gt Lcbc_dec_tail |
| |
| aesd v1.16b,v16.16b |
| aesimc v1.16b,v1.16b |
| aesd v18.16b,v16.16b |
| aesimc v18.16b,v18.16b |
| aesd v1.16b,v17.16b |
| aesimc v1.16b,v1.16b |
| aesd v18.16b,v17.16b |
| aesimc v18.16b,v18.16b |
| aesd v1.16b,v20.16b |
| aesimc v1.16b,v1.16b |
| aesd v18.16b,v20.16b |
| aesimc v18.16b,v18.16b |
| cmn x2,#0x20 /* x2 equal to -0x20 means a single block is left. Flags are consumed by b.eq Lcbc_dec_one */ |
| aesd v1.16b,v21.16b |
| aesimc v1.16b,v1.16b |
| aesd v18.16b,v21.16b |
| aesimc v18.16b,v18.16b |
| eor v5.16b,v6.16b,v7.16b |
| aesd v1.16b,v22.16b |
| aesimc v1.16b,v1.16b |
| aesd v18.16b,v22.16b |
| aesimc v18.16b,v18.16b |
| eor v17.16b,v3.16b,v7.16b |
| aesd v1.16b,v23.16b |
| aesd v18.16b,v23.16b |
| b.eq Lcbc_dec_one |
| eor v5.16b,v5.16b,v1.16b |
| eor v17.16b,v17.16b,v18.16b |
| orr v6.16b,v19.16b,v19.16b |
| st1 {v5.16b},[x1],#16 |
| st1 {v17.16b},[x1],#16 |
| b Lcbc_done |
| |
| Lcbc_dec_one: |
| eor v5.16b,v5.16b,v18.16b |
| orr v6.16b,v19.16b,v19.16b |
| st1 {v5.16b},[x1],#16 |
| |
| Lcbc_done: |
| st1 {v6.16b},[x4] /* write the chaining value back to the IV buffer */ |
| Lcbc_abort: |
| ldr x29,[sp],#16 /* drop the frame. x30 is never written in this routine, so only x29 is reloaded */ |
| ret |
| /* aes_hw_ctr32_encrypt_blocks: CTR-mode processing of whole 16-byte blocks (output = input xor keystream). |
| * In: x0 = input, x1 = output, x2 = number of 16-byte blocks, x3 = key schedule (round count at |
| * byte offset 240), x4 = 16-byte counter block. |
| * The 32-bit word at byte offset 12 of the counter block is the counter. It is byte-reversed with |
| * rev on little-endian builds, incremented with 32-bit wraparound, and reversed again before use. |
| * The counter block behind x4 is only read, so the caller has to advance it. |
| * A block count of zero returns at once without touching the input, output, key or counter buffers. |
| * Three blocks are processed per main-loop iteration, one or two in the tail. |
| * Scratch: x0 to x2, x5 to x10, x12, v0 to v7, v16 to v23. */ |
| .globl _aes_hw_ctr32_encrypt_blocks |
| .private_extern _aes_hw_ctr32_encrypt_blocks |
| |
| .align 5 |
| _aes_hw_ctr32_encrypt_blocks: |
| // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. |
| AARCH64_VALID_CALL_TARGET |
| stp x29,x30,[sp,#-16]! |
| add x29,sp,#0 |
| cbz x2,Lctr32_done /* zero blocks: without this guard the tail would read one block and store two */ |
| ldr w5,[x3,#240] /* w5 = round count */ |
| |
| ldr w8, [x4, #12] /* w8 = counter word */ |
| ld1 {v0.4s},[x4] /* v0 = counter block for the first output block */ |
| |
| ld1 {v16.4s,v17.4s},[x3] // load key schedule... |
| sub w5,w5,#4 |
| mov x12,#16 |
| cmp x2,#2 /* flags feed the csel below and the b.ls Lctr32_tail further down */ |
| add x7,x3,x5,lsl#4 // pointer to last 5 round keys |
| sub w5,w5,#2 |
| ld1 {v20.4s,v21.4s},[x7],#32 |
| ld1 {v22.4s,v23.4s},[x7],#32 |
| ld1 {v7.4s},[x7] /* v7 = last round key */ |
| add x7,x3,#32 /* x7 = address of round key 2 */ |
| mov w6,w5 /* w6 = counter for the round loops */ |
| csel x12,xzr,x12,lo /* fewer than two blocks: the tail reads its second input with step 0 */ |
| |
| // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are |
| // affected by silicon errata #1742098 [0] and #1655431 [1], |
| // respectively, where the second instruction of an aese/aesmc |
| // instruction pair may execute twice if an interrupt is taken right |
| // after the first instruction consumes an input register of which a |
| // single 32-bit lane has been updated the last time it was modified. |
| // |
| // This function uses a counter in one 32-bit lane. The vmov lines |
| // could write to v1.16b and v18.16b directly, but that trips these bugs. |
| // We write to v6.16b and copy to the final register as a workaround. |
| // |
| // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice |
| // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice |
| #ifndef __AARCH64EB__ |
| rev w8, w8 |
| #endif |
| add w10, w8, #1 |
| orr v6.16b,v0.16b,v0.16b /* v6 = staging copy of the counter block */ |
| rev w10, w10 |
| mov v6.s[3],w10 |
| add w8, w8, #2 /* w8 = counter value of the third block */ |
| orr v1.16b,v6.16b,v6.16b /* v1 = counter block for the second output block */ |
| b.ls Lctr32_tail /* one or two blocks only */ |
| rev w12, w8 |
| mov v6.s[3],w12 |
| sub x2,x2,#3 // bias |
| orr v18.16b,v6.16b,v6.16b /* v18 = counter block for the third output block */ |
| b Loop3x_ctr32 |
| |
| .align 4 |
| Loop3x_ctr32: |
| aese v0.16b,v16.16b |
| aesmc v0.16b,v0.16b |
| aese v1.16b,v16.16b |
| aesmc v1.16b,v1.16b |
| aese v18.16b,v16.16b |
| aesmc v18.16b,v18.16b |
| ld1 {v16.4s},[x7],#16 |
| subs w6,w6,#2 |
| aese v0.16b,v17.16b |
| aesmc v0.16b,v0.16b |
| aese v1.16b,v17.16b |
| aesmc v1.16b,v1.16b |
| aese v18.16b,v17.16b |
| aesmc v18.16b,v18.16b |
| ld1 {v17.4s},[x7],#16 |
| b.gt Loop3x_ctr32 |
| /* Remaining rounds run in v4, v5 and v17 while three inputs are loaded and the next three counter blocks are built. */ |
| aese v0.16b,v16.16b |
| aesmc v4.16b,v0.16b |
| aese v1.16b,v16.16b |
| aesmc v5.16b,v1.16b |
| ld1 {v2.16b},[x0],#16 |
| add w9,w8,#1 |
| aese v18.16b,v16.16b |
| aesmc v18.16b,v18.16b |
| ld1 {v3.16b},[x0],#16 |
| rev w9,w9 |
| aese v4.16b,v17.16b |
| aesmc v4.16b,v4.16b |
| aese v5.16b,v17.16b |
| aesmc v5.16b,v5.16b |
| ld1 {v19.16b},[x0],#16 |
| mov x7,x3 /* rewind the round-key pointer */ |
| aese v18.16b,v17.16b |
| aesmc v17.16b,v18.16b |
| aese v4.16b,v20.16b |
| aesmc v4.16b,v4.16b |
| aese v5.16b,v20.16b |
| aesmc v5.16b,v5.16b |
| eor v2.16b,v2.16b,v7.16b /* fold the last round key into the input block */ |
| add w10,w8,#2 |
| aese v17.16b,v20.16b |
| aesmc v17.16b,v17.16b |
| eor v3.16b,v3.16b,v7.16b |
| add w8,w8,#3 |
| aese v4.16b,v21.16b |
| aesmc v4.16b,v4.16b |
| aese v5.16b,v21.16b |
| aesmc v5.16b,v5.16b |
| // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work |
| // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in |
| // 32-bit mode. See the comment above. |
| eor v19.16b,v19.16b,v7.16b |
| mov v6.s[3], w9 |
| aese v17.16b,v21.16b |
| aesmc v17.16b,v17.16b |
| orr v0.16b,v6.16b,v6.16b |
| rev w10,w10 |
| aese v4.16b,v22.16b |
| aesmc v4.16b,v4.16b |
| mov v6.s[3], w10 |
| rev w12,w8 |
| aese v5.16b,v22.16b |
| aesmc v5.16b,v5.16b |
| orr v1.16b,v6.16b,v6.16b |
| mov v6.s[3], w12 |
| aese v17.16b,v22.16b |
| aesmc v17.16b,v17.16b |
| orr v18.16b,v6.16b,v6.16b |
| subs x2,x2,#3 /* flags consumed by the b.hs that closes the loop */ |
| aese v4.16b,v23.16b |
| aese v5.16b,v23.16b |
| aese v17.16b,v23.16b |
| |
| eor v2.16b,v2.16b,v4.16b /* output = input xor last round key xor cipher state */ |
| ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] |
| st1 {v2.16b},[x1],#16 |
| eor v3.16b,v3.16b,v5.16b |
| mov w6,w5 |
| st1 {v3.16b},[x1],#16 |
| eor v19.16b,v19.16b,v17.16b |
| ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] |
| st1 {v19.16b},[x1],#16 |
| b.hs Loop3x_ctr32 |
| |
| adds x2,x2,#3 /* undo the bias: x2 = blocks still to do, 0, 1 or 2 */ |
| b.eq Lctr32_done |
| cmp x2,#1 |
| mov x12,#16 |
| csel x12,xzr,x12,eq /* single block: the second input load uses step 0 */ |
| |
| Lctr32_tail: |
| aese v0.16b,v16.16b |
| aesmc v0.16b,v0.16b |
| aese v1.16b,v16.16b |
| aesmc v1.16b,v1.16b |
| ld1 {v16.4s},[x7],#16 |
| subs w6,w6,#2 |
| aese v0.16b,v17.16b |
| aesmc v0.16b,v0.16b |
| aese v1.16b,v17.16b |
| aesmc v1.16b,v1.16b |
| ld1 {v17.4s},[x7],#16 |
| b.gt Lctr32_tail |
| |
| aese v0.16b,v16.16b |
| aesmc v0.16b,v0.16b |
| aese v1.16b,v16.16b |
| aesmc v1.16b,v1.16b |
| aese v0.16b,v17.16b |
| aesmc v0.16b,v0.16b |
| aese v1.16b,v17.16b |
| aesmc v1.16b,v1.16b |
| ld1 {v2.16b},[x0],x12 /* first input block. x12 is 0 when it is also the last */ |
| aese v0.16b,v20.16b |
| aesmc v0.16b,v0.16b |
| aese v1.16b,v20.16b |
| aesmc v1.16b,v1.16b |
| ld1 {v3.16b},[x0] |
| aese v0.16b,v21.16b |
| aesmc v0.16b,v0.16b |
| aese v1.16b,v21.16b |
| aesmc v1.16b,v1.16b |
| eor v2.16b,v2.16b,v7.16b |
| aese v0.16b,v22.16b |
| aesmc v0.16b,v0.16b |
| aese v1.16b,v22.16b |
| aesmc v1.16b,v1.16b |
| eor v3.16b,v3.16b,v7.16b |
| aese v0.16b,v23.16b |
| aese v1.16b,v23.16b |
| |
| cmp x2,#1 |
| eor v2.16b,v2.16b,v0.16b |
| eor v3.16b,v3.16b,v1.16b |
| st1 {v2.16b},[x1],#16 |
| b.eq Lctr32_done /* one block: skip the second store */ |
| st1 {v3.16b},[x1] |
| |
| Lctr32_done: |
| ldr x29,[sp],#16 /* drop the frame. x30 is never written in this routine, so only x29 is reloaded */ |
| ret |
| |
| #endif |
| #endif // !OPENSSL_NO_ASM |