| // This file is generated from a similarly-named Perl script in the BoringSSL |
| // source tree. Do not edit by hand. |
| |
| #if !defined(__has_feature) |
| #define __has_feature(x) 0 |
| #endif |
| #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) |
| #define OPENSSL_NO_ASM |
| #endif |
| |
| #if !defined(OPENSSL_NO_ASM) |
| #if defined(BORINGSSL_PREFIX) |
| #include <boringssl_prefix_symbols_asm.h> |
| #endif |
| #include "openssl/arm_arch.h" |
| |
| .text |
| .globl _beeu_mod_inverse_vartime |
| .private_extern _beeu_mod_inverse_vartime |
| |
| .align 4 |
| _beeu_mod_inverse_vartime: |
| // Reserve enough space for 14 8-byte registers on the stack |
| // in the first stp call for x29, x30. |
| // Then store the remaining callee-saved registers. |
| // |
| // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 | |
| // ^ ^ |
| // sp <------------------- 112 bytes ----------------> old sp |
| // x29 (FP) |
| // |
| AARCH64_SIGN_LINK_REGISTER |
| stp x29,x30,[sp,#-112]! |
| add x29,sp,#0 |
| stp x19,x20,[sp,#16] |
| stp x21,x22,[sp,#32] |
| stp x23,x24,[sp,#48] |
| stp x25,x26,[sp,#64] |
| stp x27,x28,[sp,#80] |
| stp x0,x2,[sp,#96] |
| |
| // B = b3..b0 := a |
| ldp x25,x26,[x1] |
| ldp x27,x28,[x1,#16] |
| |
| // n3..n0 := n |
| // Note: the value of input params are changed in the following. |
| ldp x0,x1,[x2] |
| ldp x2,x30,[x2,#16] |
| |
| // A = a3..a0 := n |
| mov x21, x0 |
| mov x22, x1 |
| mov x23, x2 |
| mov x24, x30 |
| |
| // X = x4..x0 := 1 |
| mov x3, #1 |
| eor x4, x4, x4 |
| eor x5, x5, x5 |
| eor x6, x6, x6 |
| eor x7, x7, x7 |
| |
| // Y = y4..y0 := 0 |
| eor x8, x8, x8 |
| eor x9, x9, x9 |
| eor x10, x10, x10 |
| eor x11, x11, x11 |
| eor x12, x12, x12 |
| |
| Lbeeu_loop: |
| // if B == 0, jump to .Lbeeu_loop_end |
| orr x14, x25, x26 |
| orr x14, x14, x27 |
| |
| // reverse the bit order of x25. This is needed for clz after this macro |
| rbit x15, x25 |
| |
| orr x14, x14, x28 |
| cbz x14,Lbeeu_loop_end |
| |
| |
| // 0 < B < |n|, |
| // 0 < A <= |n|, |
| // (1) X*a == B (mod |n|), |
| // (2) (-1)*Y*a == A (mod |n|) |
| |
| // Now divide B by the maximum possible power of two in the |
| // integers, and divide X by the same value mod |n|. |
| // When we're done, (1) still holds. |
| |
| // shift := number of trailing 0s in x25 |
| // ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO) |
| clz x13, x15 |
| |
| // If there is no shift, goto shift_A_Y |
| cbz x13, Lbeeu_shift_A_Y |
| |
| // Shift B right by "x13" bits |
| neg x14, x13 |
| lsr x25, x25, x13 |
| lsl x15, x26, x14 |
| |
| lsr x26, x26, x13 |
| lsl x19, x27, x14 |
| |
| orr x25, x25, x15 |
| |
| lsr x27, x27, x13 |
| lsl x20, x28, x14 |
| |
| orr x26, x26, x19 |
| |
| lsr x28, x28, x13 |
| |
| orr x27, x27, x20 |
| |
| |
| // Shift X right by "x13" bits, adding n whenever X becomes odd. |
| // x13--; |
| // x14 := 0; needed in the addition to the most significant word in SHIFT1 |
| eor x14, x14, x14 |
| Lbeeu_shift_loop_X: |
| tbz x3, #0, Lshift1_0 |
| adds x3, x3, x0 |
| adcs x4, x4, x1 |
| adcs x5, x5, x2 |
| adcs x6, x6, x30 |
| adc x7, x7, x14 |
| Lshift1_0: |
| // var0 := [var1|var0]<64..1>; |
| // i.e. concatenate var1 and var0, |
| // extract bits <64..1> from the resulting 128-bit value |
| // and put them in var0 |
| extr x3, x4, x3, #1 |
| extr x4, x5, x4, #1 |
| extr x5, x6, x5, #1 |
| extr x6, x7, x6, #1 |
| lsr x7, x7, #1 |
| |
| subs x13, x13, #1 |
| bne Lbeeu_shift_loop_X |
| |
| // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl |
| // with the following differences: |
| // - "x13" is set directly to the number of trailing 0s in B |
| // (using rbit and clz instructions) |
| // - The loop is only used to call SHIFT1(X) |
| // and x13 is decreased while executing the X loop. |
| // - SHIFT256(B, x13) is performed before right-shifting X; they are independent |
| |
| Lbeeu_shift_A_Y: |
| // Same for A and Y. |
| // Afterwards, (2) still holds. |
| // Reverse the bit order of x21 |
| // x13 := number of trailing 0s in x21 (= number of leading 0s in x15) |
| rbit x15, x21 |
| clz x13, x15 |
| |
| // If there is no shift, goto |B-A|, X+Y update |
| cbz x13, Lbeeu_update_B_X_or_A_Y |
| |
| // Shift A right by "x13" bits |
| neg x14, x13 |
| lsr x21, x21, x13 |
| lsl x15, x22, x14 |
| |
| lsr x22, x22, x13 |
| lsl x19, x23, x14 |
| |
| orr x21, x21, x15 |
| |
| lsr x23, x23, x13 |
| lsl x20, x24, x14 |
| |
| orr x22, x22, x19 |
| |
| lsr x24, x24, x13 |
| |
| orr x23, x23, x20 |
| |
| |
| // Shift Y right by "x13" bits, adding n whenever Y becomes odd. |
| // x13--; |
| // x14 := 0; needed in the addition to the most significant word in SHIFT1 |
| eor x14, x14, x14 |
| Lbeeu_shift_loop_Y: |
| tbz x8, #0, Lshift1_1 |
| adds x8, x8, x0 |
| adcs x9, x9, x1 |
| adcs x10, x10, x2 |
| adcs x11, x11, x30 |
| adc x12, x12, x14 |
| Lshift1_1: |
| // var0 := [var1|var0]<64..1>; |
| // i.e. concatenate var1 and var0, |
| // extract bits <64..1> from the resulting 128-bit value |
| // and put them in var0 |
| extr x8, x9, x8, #1 |
| extr x9, x10, x9, #1 |
| extr x10, x11, x10, #1 |
| extr x11, x12, x11, #1 |
| lsr x12, x12, #1 |
| |
| subs x13, x13, #1 |
| bne Lbeeu_shift_loop_Y |
| |
| Lbeeu_update_B_X_or_A_Y: |
| // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow) |
| // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words |
| // without taking a sign bit if generated. The lack of a carry would |
| // indicate a negative result. See, for example, |
| // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes |
| subs x14, x25, x21 |
| sbcs x15, x26, x22 |
| sbcs x19, x27, x23 |
| sbcs x20, x28, x24 |
| bcs Lbeeu_B_greater_than_A |
| |
| // Else A > B => |
| // A := A - B; Y := Y + X; goto beginning of the loop |
| subs x21, x21, x25 |
| sbcs x22, x22, x26 |
| sbcs x23, x23, x27 |
| sbcs x24, x24, x28 |
| |
| adds x8, x8, x3 |
| adcs x9, x9, x4 |
| adcs x10, x10, x5 |
| adcs x11, x11, x6 |
| adc x12, x12, x7 |
| b Lbeeu_loop |
| |
| Lbeeu_B_greater_than_A: |
| // Continue with B > A => |
| // B := B - A; X := X + Y; goto beginning of the loop |
| mov x25, x14 |
| mov x26, x15 |
| mov x27, x19 |
| mov x28, x20 |
| |
| adds x3, x3, x8 |
| adcs x4, x4, x9 |
| adcs x5, x5, x10 |
| adcs x6, x6, x11 |
| adc x7, x7, x12 |
| b Lbeeu_loop |
| |
| Lbeeu_loop_end: |
| // The Euclid's algorithm loop ends when A == gcd(a,n); |
| // this would be 1, when a and n are co-prime (i.e. do not have a common factor). |
| // Since (-1)*Y*a == A (mod |n|), Y>0 |
| // then out = -Y mod n |
| |
| // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|) |
| // Is A-1 == 0? |
| // If not, fail. |
| sub x14, x21, #1 |
| orr x14, x14, x22 |
| orr x14, x14, x23 |
| orr x14, x14, x24 |
| cbnz x14, Lbeeu_err |
| |
| // If Y>n ==> Y:=Y-n |
| Lbeeu_reduction_loop: |
| // x_i := y_i - n_i (X is no longer needed, use it as temp) |
| // (x14 = 0 from above) |
| subs x3, x8, x0 |
| sbcs x4, x9, x1 |
| sbcs x5, x10, x2 |
| sbcs x6, x11, x30 |
| sbcs x7, x12, x14 |
| |
| // If result is non-negative (i.e., cs = carry set = no borrow), |
| // y_i := x_i; goto reduce again |
| // else |
| // y_i := y_i; continue |
| csel x8, x3, x8, cs |
| csel x9, x4, x9, cs |
| csel x10, x5, x10, cs |
| csel x11, x6, x11, cs |
| csel x12, x7, x12, cs |
| bcs Lbeeu_reduction_loop |
| |
| // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0) |
| // out = -Y = n-Y |
| subs x8, x0, x8 |
| sbcs x9, x1, x9 |
| sbcs x10, x2, x10 |
| sbcs x11, x30, x11 |
| |
| // Save Y in output (out (x0) was saved on the stack) |
| ldr x3, [sp,#96] |
| stp x8, x9, [x3] |
| stp x10, x11, [x3,#16] |
| // return 1 (success) |
| mov x0, #1 |
| b Lbeeu_finish |
| |
| Lbeeu_err: |
| // return 0 (error) |
| eor x0, x0, x0 |
| |
| Lbeeu_finish: |
| // Restore callee-saved registers, except x0, x2 |
| add sp,x29,#0 |
| ldp x19,x20,[sp,#16] |
| ldp x21,x22,[sp,#32] |
| ldp x23,x24,[sp,#48] |
| ldp x25,x26,[sp,#64] |
| ldp x27,x28,[sp,#80] |
| ldp x29,x30,[sp],#112 |
| |
| AARCH64_VALIDATE_LINK_REGISTER |
| ret |
| |
| #endif // !OPENSSL_NO_ASM |