Add faster ARM multiplication code using UMAAL (#69)
On ARM platforms that support UMAAL, this new code should speed up curve
operations by 15-20%. There is automatic detection of UMAAL support
using compiler macros, but if it doesn't work for a given platform,
#define uECC_ARM_USE_UMAAL to 1 or 0 as desired.
diff --git a/asm_arm.inc b/asm_arm.inc
index 7b4459f..cdcdec6 100644
--- a/asm_arm.inc
+++ b/asm_arm.inc
@@ -156,342 +156,16 @@
#if (uECC_OPTIMIZATION_LEVEL >= 3)
-#include "asm_arm_mult_square.inc"
-
-#define FAST_MULT_ASM_5_TO_6 \
- "cmp r3, #5 \n\t" \
- "beq 1f \n\t" \
- \
- /* r4 = left high, r5 = right high */ \
- "ldr r4, [r1] \n\t" \
- "ldr r5, [r2] \n\t" \
- \
- "sub r0, #20 \n\t" \
- "sub r1, #20 \n\t" \
- "sub r2, #20 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r14, #0 \n\t" \
- "umull r9, r10, r4, r8 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r9, r9, r6 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "adds r9, r9, r11 \n\t" \
- "adcs r10, r10, r12 \n\t" \
- "adc r14, r14, #0 \n\t" \
- "str r9, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r10, r10, r6 \n\t" \
- "adcs r14, r14, #0 \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r9, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r10, r10, r11 \n\t" \
- "adcs r14, r14, r12 \n\t" \
- "adc r9, r9, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r10, r10, r11 \n\t" \
- "adcs r14, r14, r12 \n\t" \
- "adc r9, r9, #0 \n\t" \
- "str r10, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r14, r14, r6 \n\t" \
- "adcs r9, r9, #0 \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r10, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r14, r14, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r14, r14, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "str r14, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r9, r9, r6 \n\t" \
- "adcs r10, r10, #0 \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r14, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r9, r9, r11 \n\t" \
- "adcs r10, r10, r12 \n\t" \
- "adc r14, r14, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r9, r9, r11 \n\t" \
- "adcs r10, r10, r12 \n\t" \
- "adc r14, r14, #0 \n\t" \
- "str r9, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r10, r10, r6 \n\t" \
- "adcs r14, r14, #0 \n\t" \
- /* skip past already-loaded (r4, r5) */ \
- "ldr r7, [r1], #8 \n\t" \
- "ldr r8, [r2], #8 \n\t" \
- "mov r9, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r10, r10, r11 \n\t" \
- "adcs r14, r14, r12 \n\t" \
- "adc r9, r9, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r10, r10, r11 \n\t" \
- "adcs r14, r14, r12 \n\t" \
- "adc r9, r9, #0 \n\t" \
- "str r10, [r0], #4 \n\t" \
- \
- "umull r11, r12, r4, r5 \n\t" \
- "adds r11, r11, r14 \n\t" \
- "adc r12, r12, r9 \n\t" \
- "stmia r0!, {r11, r12} \n\t"
-
-#define FAST_MULT_ASM_6_TO_7 \
- "cmp r3, #6 \n\t" \
- "beq 1f \n\t" \
- \
- /* r4 = left high, r5 = right high */ \
- "ldr r4, [r1] \n\t" \
- "ldr r5, [r2] \n\t" \
- \
- "sub r0, #24 \n\t" \
- "sub r1, #24 \n\t" \
- "sub r2, #24 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r14, #0 \n\t" \
- "umull r9, r10, r4, r8 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r9, r9, r6 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "adds r9, r9, r11 \n\t" \
- "adcs r10, r10, r12 \n\t" \
- "adc r14, r14, #0 \n\t" \
- "str r9, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r10, r10, r6 \n\t" \
- "adcs r14, r14, #0 \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r9, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r10, r10, r11 \n\t" \
- "adcs r14, r14, r12 \n\t" \
- "adc r9, r9, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r10, r10, r11 \n\t" \
- "adcs r14, r14, r12 \n\t" \
- "adc r9, r9, #0 \n\t" \
- "str r10, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r14, r14, r6 \n\t" \
- "adcs r9, r9, #0 \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r10, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r14, r14, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r14, r14, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "str r14, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r9, r9, r6 \n\t" \
- "adcs r10, r10, #0 \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r14, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r9, r9, r11 \n\t" \
- "adcs r10, r10, r12 \n\t" \
- "adc r14, r14, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r9, r9, r11 \n\t" \
- "adcs r10, r10, r12 \n\t" \
- "adc r14, r14, #0 \n\t" \
- "str r9, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r10, r10, r6 \n\t" \
- "adcs r14, r14, #0 \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r9, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r10, r10, r11 \n\t" \
- "adcs r14, r14, r12 \n\t" \
- "adc r9, r9, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r10, r10, r11 \n\t" \
- "adcs r14, r14, r12 \n\t" \
- "adc r9, r9, #0 \n\t" \
- "str r10, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r14, r14, r6 \n\t" \
- "adcs r9, r9, #0 \n\t" \
- /* skip past already-loaded (r4, r5) */ \
- "ldr r7, [r1], #8 \n\t" \
- "ldr r8, [r2], #8 \n\t" \
- "mov r10, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r14, r14, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r14, r14, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "str r14, [r0], #4 \n\t" \
- \
- "umull r11, r12, r4, r5 \n\t" \
- "adds r11, r11, r9 \n\t" \
- "adc r12, r12, r10 \n\t" \
- "stmia r0!, {r11, r12} \n\t"
-
-#define FAST_MULT_ASM_7_TO_8 \
- "cmp r3, #7 \n\t" \
- "beq 1f \n\t" \
- \
- /* r4 = left high, r5 = right high */ \
- "ldr r4, [r1] \n\t" \
- "ldr r5, [r2] \n\t" \
- \
- "sub r0, #28 \n\t" \
- "sub r1, #28 \n\t" \
- "sub r2, #28 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r14, #0 \n\t" \
- "umull r9, r10, r4, r8 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r9, r9, r6 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "adds r9, r9, r11 \n\t" \
- "adcs r10, r10, r12 \n\t" \
- "adc r14, r14, #0 \n\t" \
- "str r9, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r10, r10, r6 \n\t" \
- "adcs r14, r14, #0 \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r9, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r10, r10, r11 \n\t" \
- "adcs r14, r14, r12 \n\t" \
- "adc r9, r9, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r10, r10, r11 \n\t" \
- "adcs r14, r14, r12 \n\t" \
- "adc r9, r9, #0 \n\t" \
- "str r10, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r14, r14, r6 \n\t" \
- "adcs r9, r9, #0 \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r10, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r14, r14, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r14, r14, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "str r14, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r9, r9, r6 \n\t" \
- "adcs r10, r10, #0 \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r14, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r9, r9, r11 \n\t" \
- "adcs r10, r10, r12 \n\t" \
- "adc r14, r14, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r9, r9, r11 \n\t" \
- "adcs r10, r10, r12 \n\t" \
- "adc r14, r14, #0 \n\t" \
- "str r9, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r10, r10, r6 \n\t" \
- "adcs r14, r14, #0 \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r9, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r10, r10, r11 \n\t" \
- "adcs r14, r14, r12 \n\t" \
- "adc r9, r9, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r10, r10, r11 \n\t" \
- "adcs r14, r14, r12 \n\t" \
- "adc r9, r9, #0 \n\t" \
- "str r10, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r14, r14, r6 \n\t" \
- "adcs r9, r9, #0 \n\t" \
- "ldr r7, [r1], #4 \n\t" \
- "ldr r8, [r2], #4 \n\t" \
- "mov r10, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r14, r14, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r14, r14, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "str r14, [r0], #4 \n\t" \
- \
- "ldr r6, [r0] \n\t" \
- "adds r9, r9, r6 \n\t" \
- "adcs r10, r10, #0 \n\t" \
- /* skip past already-loaded (r4, r5) */ \
- "ldr r7, [r1], #8 \n\t" \
- "ldr r8, [r2], #8 \n\t" \
- "mov r14, #0 \n\t" \
- "umull r11, r12, r4, r8 \n\t" \
- "adds r9, r9, r11 \n\t" \
- "adcs r10, r10, r12 \n\t" \
- "adc r14, r14, #0 \n\t" \
- "umull r11, r12, r5, r7 \n\t" \
- "adds r9, r9, r11 \n\t" \
- "adcs r10, r10, r12 \n\t" \
- "adc r14, r14, #0 \n\t" \
- "str r9, [r0], #4 \n\t" \
- \
- "umull r11, r12, r4, r5 \n\t" \
- "adds r11, r11, r10 \n\t" \
- "adc r12, r12, r14 \n\t" \
- "stmia r0!, {r11, r12} \n\t"
-
#if (uECC_PLATFORM != uECC_arm_thumb)
+
+#if uECC_ARM_USE_UMAAL
+ #include "asm_arm_mult_square_umaal.inc"
+#else
+ #include "asm_arm_mult_square.inc"
+#endif
+
+#if (uECC_OPTIMIZATION_LEVEL == 3)
+
uECC_VLI_API void uECC_vli_mult(uint32_t *result,
const uint32_t *left,
const uint32_t *right,
@@ -503,11 +177,8 @@
__asm__ volatile (
".syntax unified \n\t"
- "push {r3} \n\t"
-
#if (uECC_MIN_WORDS == 5)
FAST_MULT_ASM_5
- "pop {r3} \n\t"
#if (uECC_MAX_WORDS > 5)
FAST_MULT_ASM_5_TO_6
#endif
@@ -519,7 +190,6 @@
#endif
#elif (uECC_MIN_WORDS == 6)
FAST_MULT_ASM_6
- "pop {r3} \n\t"
#if (uECC_MAX_WORDS > 6)
FAST_MULT_ASM_6_TO_7
#endif
@@ -528,15 +198,12 @@
#endif
#elif (uECC_MIN_WORDS == 7)
FAST_MULT_ASM_7
- "pop {r3} \n\t"
#if (uECC_MAX_WORDS > 7)
FAST_MULT_ASM_7_TO_8
#endif
#elif (uECC_MIN_WORDS == 8)
FAST_MULT_ASM_8
- "pop {r3} \n\t"
#endif
-
"1: \n\t"
RESUME_SYNTAX
: "+r" (r0), "+r" (r1), "+r" (r2)
@@ -547,217 +214,6 @@
#define asm_mult 1
#if uECC_SQUARE_FUNC
-
-#define FAST_SQUARE_ASM_5_TO_6 \
- "cmp r2, #5 \n\t" \
- "beq 1f \n\t" \
- \
- /* r3 = high */ \
- "ldr r3, [r1] \n\t" \
- \
- "sub r0, #20 \n\t" \
- "sub r1, #20 \n\t" \
- \
- /* Do off-center multiplication */ \
- "ldr r14, [r1], #4 \n\t" \
- "umull r4, r5, r3, r14 \n\t" \
- "ldr r14, [r1], #4 \n\t" \
- "umull r7, r6, r3, r14 \n\t" \
- "adds r5, r5, r7 \n\t" \
- "ldr r14, [r1], #4 \n\t" \
- "umull r8, r7, r3, r14 \n\t" \
- "adcs r6, r6, r8 \n\t" \
- "ldr r14, [r1], #4 \n\t" \
- "umull r9, r8, r3, r14 \n\t" \
- "adcs r7, r7, r9 \n\t" \
- /* Skip already-loaded r3 */ \
- "ldr r14, [r1], #8 \n\t" \
- "umull r10, r9, r3, r14 \n\t" \
- "adcs r8, r8, r10 \n\t" \
- "adcs r9, r9, #0 \n\t" \
- \
- /* Multiply by 2 */ \
- "mov r10, #0 \n\t" \
- "adds r4, r4, r4 \n\t" \
- "adcs r5, r5, r5 \n\t" \
- "adcs r6, r6, r6 \n\t" \
- "adcs r7, r7, r7 \n\t" \
- "adcs r8, r8, r8 \n\t" \
- "adcs r9, r9, r9 \n\t" \
- "adcs r10, r10, #0 \n\t" \
- \
- /* Add into previous */ \
- "ldr r14, [r0] \n\t" \
- "adds r4, r4, r14 \n\t" \
- "str r4, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r5, r5, r14 \n\t" \
- "str r5, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r6, r6, r14 \n\t" \
- "str r6, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r7, r7, r14 \n\t" \
- "str r7, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r8, r8, r14 \n\t" \
- "str r8, [r0], #4 \n\t" \
- "adcs r9, r9, #0 \n\t" \
- "adcs r10, r10, #0 \n\t" \
- \
- /* Perform center multiplication */ \
- "umull r4, r5, r3, r3 \n\t" \
- "adds r4, r4, r9 \n\t" \
- "adc r5, r5, r10 \n\t" \
- "stmia r0!, {r4, r5} \n\t"
-
-#define FAST_SQUARE_ASM_6_TO_7 \
- "cmp r2, #6 \n\t" \
- "beq 1f \n\t" \
- \
- /* r3 = high */ \
- "ldr r3, [r1] \n\t" \
- \
- "sub r0, #24 \n\t" \
- "sub r1, #24 \n\t" \
- \
- /* Do off-center multiplication */ \
- "ldr r14, [r1], #4 \n\t" \
- "umull r4, r5, r3, r14 \n\t" \
- "ldr r14, [r1], #4 \n\t" \
- "umull r7, r6, r3, r14 \n\t" \
- "adds r5, r5, r7 \n\t" \
- "ldr r14, [r1], #4 \n\t" \
- "umull r8, r7, r3, r14 \n\t" \
- "adcs r6, r6, r8 \n\t" \
- "ldr r14, [r1], #4 \n\t" \
- "umull r9, r8, r3, r14 \n\t" \
- "adcs r7, r7, r9 \n\t" \
- "ldr r14, [r1], #4 \n\t" \
- "umull r10, r9, r3, r14 \n\t" \
- "adcs r8, r8, r10 \n\t" \
- /* Skip already-loaded r3 */ \
- "ldr r14, [r1], #8 \n\t" \
- "umull r11, r10, r3, r14 \n\t" \
- "adcs r9, r9, r11 \n\t" \
- "adcs r10, r10, #0 \n\t" \
- \
- /* Multiply by 2 */ \
- "mov r11, #0 \n\t" \
- "adds r4, r4, r4 \n\t" \
- "adcs r5, r5, r5 \n\t" \
- "adcs r6, r6, r6 \n\t" \
- "adcs r7, r7, r7 \n\t" \
- "adcs r8, r8, r8 \n\t" \
- "adcs r9, r9, r9 \n\t" \
- "adcs r10, r10, r10 \n\t" \
- "adcs r11, r11, #0 \n\t" \
- \
- /* Add into previous */ \
- "ldr r14, [r0] \n\t" \
- "adds r4, r4, r14 \n\t" \
- "str r4, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r5, r5, r14 \n\t" \
- "str r5, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r6, r6, r14 \n\t" \
- "str r6, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r7, r7, r14 \n\t" \
- "str r7, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r8, r8, r14 \n\t" \
- "str r8, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r9, r9, r14 \n\t" \
- "str r9, [r0], #4 \n\t" \
- "adcs r10, r10, #0 \n\t" \
- "adcs r11, r11, #0 \n\t" \
- \
- /* Perform center multiplication */ \
- "umull r4, r5, r3, r3 \n\t" \
- "adds r4, r4, r10 \n\t" \
- "adc r5, r5, r11 \n\t" \
- "stmia r0!, {r4, r5} \n\t"
-
-#define FAST_SQUARE_ASM_7_TO_8 \
- "cmp r2, #7 \n\t" \
- "beq 1f \n\t" \
- \
- /* r3 = high */ \
- "ldr r3, [r1] \n\t" \
- \
- "sub r0, #28 \n\t" \
- "sub r1, #28 \n\t" \
- \
- /* Do off-center multiplication */ \
- "ldr r14, [r1], #4 \n\t" \
- "umull r4, r5, r3, r14 \n\t" \
- "ldr r14, [r1], #4 \n\t" \
- "umull r7, r6, r3, r14 \n\t" \
- "adds r5, r5, r7 \n\t" \
- "ldr r14, [r1], #4 \n\t" \
- "umull r8, r7, r3, r14 \n\t" \
- "adcs r6, r6, r8 \n\t" \
- "ldr r14, [r1], #4 \n\t" \
- "umull r9, r8, r3, r14 \n\t" \
- "adcs r7, r7, r9 \n\t" \
- "ldr r14, [r1], #4 \n\t" \
- "umull r10, r9, r3, r14 \n\t" \
- "adcs r8, r8, r10 \n\t" \
- "ldr r14, [r1], #4 \n\t" \
- "umull r11, r10, r3, r14 \n\t" \
- "adcs r9, r9, r11 \n\t" \
- /* Skip already-loaded r3 */ \
- "ldr r14, [r1], #8 \n\t" \
- "umull r12, r11, r3, r14 \n\t" \
- "adcs r10, r10, r12 \n\t" \
- "adcs r11, r11, #0 \n\t" \
- \
- /* Multiply by 2 */ \
- "mov r12, #0 \n\t" \
- "adds r4, r4, r4 \n\t" \
- "adcs r5, r5, r5 \n\t" \
- "adcs r6, r6, r6 \n\t" \
- "adcs r7, r7, r7 \n\t" \
- "adcs r8, r8, r8 \n\t" \
- "adcs r9, r9, r9 \n\t" \
- "adcs r10, r10, r10 \n\t" \
- "adcs r11, r11, r11 \n\t" \
- "adcs r12, r12, #0 \n\t" \
- \
- /* Add into previous */ \
- "ldr r14, [r0] \n\t" \
- "adds r4, r4, r14 \n\t" \
- "str r4, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r5, r5, r14 \n\t" \
- "str r5, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r6, r6, r14 \n\t" \
- "str r6, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r7, r7, r14 \n\t" \
- "str r7, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r8, r8, r14 \n\t" \
- "str r8, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r9, r9, r14 \n\t" \
- "str r9, [r0], #4 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adcs r10, r10, r14 \n\t" \
- "str r10, [r0], #4 \n\t" \
- "adcs r11, r11, #0 \n\t" \
- "adcs r12, r12, #0 \n\t" \
- \
- /* Perform center multiplication */ \
- "umull r4, r5, r3, r3 \n\t" \
- "adds r4, r4, r11 \n\t" \
- "adc r5, r5, r12 \n\t" \
- "stmia r0!, {r4, r5} \n\t"
-
uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
const uECC_word_t *left,
wordcount_t num_words) {
@@ -767,13 +223,9 @@
__asm__ volatile (
".syntax unified \n\t"
- "push {r1, r2} \n\t"
-
#if (uECC_MIN_WORDS == 5)
FAST_SQUARE_ASM_5
- "pop {r1, r2} \n\t"
#if (uECC_MAX_WORDS > 5)
- "add r1, #20 \n\t"
FAST_SQUARE_ASM_5_TO_6
#endif
#if (uECC_MAX_WORDS > 6)
@@ -784,9 +236,7 @@
#endif
#elif (uECC_MIN_WORDS == 6)
FAST_SQUARE_ASM_6
- "pop {r1, r2} \n\t"
#if (uECC_MAX_WORDS > 6)
- "add r1, #24 \n\t"
FAST_SQUARE_ASM_6_TO_7
#endif
#if (uECC_MAX_WORDS > 7)
@@ -794,14 +244,11 @@
#endif
#elif (uECC_MIN_WORDS == 7)
FAST_SQUARE_ASM_7
- "pop {r1, r2} \n\t"
#if (uECC_MAX_WORDS > 7)
- "add r1, #28 \n\t"
FAST_SQUARE_ASM_7_TO_8
#endif
#elif (uECC_MIN_WORDS == 8)
FAST_SQUARE_ASM_8
- "pop {r1, r2} \n\t"
#endif
"1: \n\t"
@@ -814,6 +261,138 @@
#define asm_square 1
#endif /* uECC_SQUARE_FUNC */
+#else /* (uECC_OPTIMIZATION_LEVEL > 3) */
+
+uECC_VLI_API void uECC_vli_mult(uint32_t *result,
+ const uint32_t *left,
+ const uint32_t *right,
+ wordcount_t num_words) {
+ register uint32_t *r0 __asm__("r0") = result;
+ register const uint32_t *r1 __asm__("r1") = left;
+ register const uint32_t *r2 __asm__("r2") = right;
+ register uint32_t r3 __asm__("r3") = num_words;
+
+#if uECC_SUPPORTS_secp160r1
+ if (num_words == 5) {
+ __asm__ volatile (
+ ".syntax unified \n\t"
+ FAST_MULT_ASM_5
+ RESUME_SYNTAX
+ : "+r" (r0), "+r" (r1), "+r" (r2)
+ : "r" (r3)
+ : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+ );
+ return;
+ }
+#endif
+#if uECC_SUPPORTS_secp192r1
+ if (num_words == 6) {
+ __asm__ volatile (
+ ".syntax unified \n\t"
+ FAST_MULT_ASM_6
+ RESUME_SYNTAX
+ : "+r" (r0), "+r" (r1), "+r" (r2)
+ : "r" (r3)
+ : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+ );
+ return;
+ }
+#endif
+#if uECC_SUPPORTS_secp224r1
+ if (num_words == 7) {
+ __asm__ volatile (
+ ".syntax unified \n\t"
+ FAST_MULT_ASM_7
+ RESUME_SYNTAX
+ : "+r" (r0), "+r" (r1), "+r" (r2)
+ : "r" (r3)
+ : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+ );
+ return;
+ }
+#endif
+#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
+ if (num_words == 8) {
+ __asm__ volatile (
+ ".syntax unified \n\t"
+ FAST_MULT_ASM_8
+ RESUME_SYNTAX
+ : "+r" (r0), "+r" (r1), "+r" (r2)
+ : "r" (r3)
+ : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+ );
+ return;
+ }
+#endif
+}
+#define asm_mult 1
+
+#if uECC_SQUARE_FUNC
+uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
+ const uECC_word_t *left,
+ wordcount_t num_words) {
+ register uint32_t *r0 __asm__("r0") = result;
+ register const uint32_t *r1 __asm__("r1") = left;
+ register uint32_t r2 __asm__("r2") = num_words;
+
+#if uECC_SUPPORTS_secp160r1
+ if (num_words == 5) {
+ __asm__ volatile (
+ ".syntax unified \n\t"
+ FAST_SQUARE_ASM_5
+ RESUME_SYNTAX
+ : "+r" (r0), "+r" (r1)
+ : "r" (r2)
+ : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+ );
+ return;
+ }
+#endif
+#if uECC_SUPPORTS_secp192r1
+ if (num_words == 6) {
+ __asm__ volatile (
+ ".syntax unified \n\t"
+ FAST_SQUARE_ASM_6
+ RESUME_SYNTAX
+ : "+r" (r0), "+r" (r1)
+ : "r" (r2)
+ : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+ );
+ return;
+ }
+#endif
+#if uECC_SUPPORTS_secp224r1
+ if (num_words == 7) {
+ __asm__ volatile (
+ ".syntax unified \n\t"
+ FAST_SQUARE_ASM_7
+ RESUME_SYNTAX
+ : "+r" (r0), "+r" (r1)
+ : "r" (r2)
+ : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+ );
+ return;
+ }
+#endif
+#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
+ if (num_words == 8) {
+ __asm__ volatile (
+ ".syntax unified \n\t"
+ FAST_SQUARE_ASM_8
+ RESUME_SYNTAX
+ : "+r" (r0), "+r" (r1)
+ : "r" (r2)
+ : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+ );
+ return;
+ }
+#endif
+}
+#define asm_square 1
+#endif /* uECC_SQUARE_FUNC */
+
+#endif /* (uECC_OPTIMIZATION_LEVEL > 3) */
+
#endif /* uECC_PLATFORM != uECC_arm_thumb */
#endif /* (uECC_OPTIMIZATION_LEVEL >= 3) */
diff --git a/asm_arm_mult_square.inc b/asm_arm_mult_square.inc
index 9decef6..8907fc1 100644
--- a/asm_arm_mult_square.inc
+++ b/asm_arm_mult_square.inc
@@ -4,6 +4,7 @@
#define _UECC_ASM_ARM_MULT_SQUARE_H_
#define FAST_MULT_ASM_5 \
+ "push {r3} \n\t" \
"add r0, 12 \n\t" \
"add r2, 12 \n\t" \
"ldmia r1!, {r3,r4} \n\t" \
@@ -154,9 +155,106 @@
"umull r14, r9, r4, r7 \n\t" \
"adds r10, r10, r14 \n\t" \
"adc r11, r11, r9 \n\t" \
- "stmia r0!, {r10, r11} \n\t"
+ "stmia r0!, {r10, r11} \n\t" \
+ "pop {r3} \n\t"
+
+#define FAST_MULT_ASM_5_TO_6 \
+ "cmp r3, #5 \n\t" \
+ "beq 1f \n\t" \
+ \
+ /* r4 = left high, r5 = right high */ \
+ "ldr r4, [r1] \n\t" \
+ "ldr r5, [r2] \n\t" \
+ \
+ "sub r0, #20 \n\t" \
+ "sub r1, #20 \n\t" \
+ "sub r2, #20 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r14, #0 \n\t" \
+ "umull r9, r10, r4, r8 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r9, r9, r6 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "adds r9, r9, r11 \n\t" \
+ "adcs r10, r10, r12 \n\t" \
+ "adc r14, r14, #0 \n\t" \
+ "str r9, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r10, r10, r6 \n\t" \
+ "adcs r14, r14, #0 \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r9, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r10, r10, r11 \n\t" \
+ "adcs r14, r14, r12 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r10, r10, r11 \n\t" \
+ "adcs r14, r14, r12 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "str r10, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r14, r14, r6 \n\t" \
+ "adcs r9, r9, #0 \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r10, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r14, r14, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r14, r14, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "str r14, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r9, r9, r6 \n\t" \
+ "adcs r10, r10, #0 \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r14, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r9, r9, r11 \n\t" \
+ "adcs r10, r10, r12 \n\t" \
+ "adc r14, r14, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r9, r9, r11 \n\t" \
+ "adcs r10, r10, r12 \n\t" \
+ "adc r14, r14, #0 \n\t" \
+ "str r9, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r10, r10, r6 \n\t" \
+ "adcs r14, r14, #0 \n\t" \
+ /* skip past already-loaded (r4, r5) */ \
+ "ldr r7, [r1], #8 \n\t" \
+ "ldr r8, [r2], #8 \n\t" \
+ "mov r9, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r10, r10, r11 \n\t" \
+ "adcs r14, r14, r12 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r10, r10, r11 \n\t" \
+ "adcs r14, r14, r12 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "str r10, [r0], #4 \n\t" \
+ \
+ "umull r11, r12, r4, r5 \n\t" \
+ "adds r11, r11, r14 \n\t" \
+ "adc r12, r12, r9 \n\t" \
+ "stmia r0!, {r11, r12} \n\t"
#define FAST_MULT_ASM_6 \
+ "push {r3} \n\t" \
"add r0, 12 \n\t" \
"add r2, 12 \n\t" \
"ldmia r1!, {r3,r4,r5} \n\t" \
@@ -372,9 +470,122 @@
"umull r10, r11, r5, r8 \n\t" \
"adds r12, r12, r10 \n\t" \
"adc r14, r14, r11 \n\t" \
- "stmia r0!, {r12, r14} \n\t"
+ "stmia r0!, {r12, r14} \n\t" \
+ "pop {r3} \n\t"
+
+#define FAST_MULT_ASM_6_TO_7 \
+ "cmp r3, #6 \n\t" \
+ "beq 1f \n\t" \
+ \
+ /* r4 = left high, r5 = right high */ \
+ "ldr r4, [r1] \n\t" \
+ "ldr r5, [r2] \n\t" \
+ \
+ "sub r0, #24 \n\t" \
+ "sub r1, #24 \n\t" \
+ "sub r2, #24 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r14, #0 \n\t" \
+ "umull r9, r10, r4, r8 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r9, r9, r6 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "adds r9, r9, r11 \n\t" \
+ "adcs r10, r10, r12 \n\t" \
+ "adc r14, r14, #0 \n\t" \
+ "str r9, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r10, r10, r6 \n\t" \
+ "adcs r14, r14, #0 \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r9, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r10, r10, r11 \n\t" \
+ "adcs r14, r14, r12 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r10, r10, r11 \n\t" \
+ "adcs r14, r14, r12 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "str r10, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r14, r14, r6 \n\t" \
+ "adcs r9, r9, #0 \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r10, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r14, r14, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r14, r14, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "str r14, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r9, r9, r6 \n\t" \
+ "adcs r10, r10, #0 \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r14, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r9, r9, r11 \n\t" \
+ "adcs r10, r10, r12 \n\t" \
+ "adc r14, r14, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r9, r9, r11 \n\t" \
+ "adcs r10, r10, r12 \n\t" \
+ "adc r14, r14, #0 \n\t" \
+ "str r9, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r10, r10, r6 \n\t" \
+ "adcs r14, r14, #0 \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r9, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r10, r10, r11 \n\t" \
+ "adcs r14, r14, r12 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r10, r10, r11 \n\t" \
+ "adcs r14, r14, r12 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "str r10, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r14, r14, r6 \n\t" \
+ "adcs r9, r9, #0 \n\t" \
+ /* skip past already-loaded (r4, r5) */ \
+ "ldr r7, [r1], #8 \n\t" \
+ "ldr r8, [r2], #8 \n\t" \
+ "mov r10, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r14, r14, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r14, r14, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "str r14, [r0], #4 \n\t" \
+ \
+ "umull r11, r12, r4, r5 \n\t" \
+ "adds r11, r11, r9 \n\t" \
+ "adc r12, r12, r10 \n\t" \
+ "stmia r0!, {r11, r12} \n\t"
#define FAST_MULT_ASM_7 \
+ "push {r3} \n\t" \
"add r0, 24 \n\t" \
"add r2, 24 \n\t" \
"ldmia r1!, {r3} \n\t" \
@@ -680,9 +891,138 @@
"umull r10, r11, r3, r6 \n\t" \
"adds r12, r12, r10 \n\t" \
"adc r14, r14, r11 \n\t" \
- "stmia r0!, {r12, r14} \n\t"
+ "stmia r0!, {r12, r14} \n\t" \
+ "pop {r3} \n\t"
+
+#define FAST_MULT_ASM_7_TO_8 \
+ "cmp r3, #7 \n\t" \
+ "beq 1f \n\t" \
+ \
+ /* r4 = left high, r5 = right high */ \
+ "ldr r4, [r1] \n\t" \
+ "ldr r5, [r2] \n\t" \
+ \
+ "sub r0, #28 \n\t" \
+ "sub r1, #28 \n\t" \
+ "sub r2, #28 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r14, #0 \n\t" \
+ "umull r9, r10, r4, r8 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r9, r9, r6 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "adds r9, r9, r11 \n\t" \
+ "adcs r10, r10, r12 \n\t" \
+ "adc r14, r14, #0 \n\t" \
+ "str r9, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r10, r10, r6 \n\t" \
+ "adcs r14, r14, #0 \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r9, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r10, r10, r11 \n\t" \
+ "adcs r14, r14, r12 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r10, r10, r11 \n\t" \
+ "adcs r14, r14, r12 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "str r10, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r14, r14, r6 \n\t" \
+ "adcs r9, r9, #0 \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r10, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r14, r14, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r14, r14, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "str r14, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r9, r9, r6 \n\t" \
+ "adcs r10, r10, #0 \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r14, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r9, r9, r11 \n\t" \
+ "adcs r10, r10, r12 \n\t" \
+ "adc r14, r14, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r9, r9, r11 \n\t" \
+ "adcs r10, r10, r12 \n\t" \
+ "adc r14, r14, #0 \n\t" \
+ "str r9, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r10, r10, r6 \n\t" \
+ "adcs r14, r14, #0 \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r9, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r10, r10, r11 \n\t" \
+ "adcs r14, r14, r12 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r10, r10, r11 \n\t" \
+ "adcs r14, r14, r12 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "str r10, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r14, r14, r6 \n\t" \
+ "adcs r9, r9, #0 \n\t" \
+ "ldr r7, [r1], #4 \n\t" \
+ "ldr r8, [r2], #4 \n\t" \
+ "mov r10, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r14, r14, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r14, r14, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "str r14, [r0], #4 \n\t" \
+ \
+ "ldr r6, [r0] \n\t" \
+ "adds r9, r9, r6 \n\t" \
+ "adcs r10, r10, #0 \n\t" \
+ /* skip past already-loaded (r4, r5) */ \
+ "ldr r7, [r1], #8 \n\t" \
+ "ldr r8, [r2], #8 \n\t" \
+ "mov r14, #0 \n\t" \
+ "umull r11, r12, r4, r8 \n\t" \
+ "adds r9, r9, r11 \n\t" \
+ "adcs r10, r10, r12 \n\t" \
+ "adc r14, r14, #0 \n\t" \
+ "umull r11, r12, r5, r7 \n\t" \
+ "adds r9, r9, r11 \n\t" \
+ "adcs r10, r10, r12 \n\t" \
+ "adc r14, r14, #0 \n\t" \
+ "str r9, [r0], #4 \n\t" \
+ \
+ "umull r11, r12, r4, r5 \n\t" \
+ "adds r11, r11, r10 \n\t" \
+ "adc r12, r12, r14 \n\t" \
+ "stmia r0!, {r11, r12} \n\t"
#define FAST_MULT_ASM_8 \
+ "push {r3} \n\t" \
"add r0, 24 \n\t" \
"add r2, 24 \n\t" \
"ldmia r1!, {r3,r4} \n\t" \
@@ -1083,10 +1423,13 @@
"umull r9, r10, r4, r7 \n\t" \
"adds r11, r11, r9 \n\t" \
"adc r12, r12, r10 \n\t" \
- "stmia r0!, {r11, r12} \n\t"
+ "stmia r0!, {r11, r12} \n\t" \
+ "pop {r3} \n\t"
#define FAST_SQUARE_ASM_5 \
+ "push {r2} \n\t" \
"ldmia r1!, {r2,r3,r4,r5,r6} \n\t" \
+ "push {r1} \n\t" \
\
"umull r11, r12, r2, r2 \n\t" \
"stmia r0!, {r11} \n\t" \
@@ -1188,10 +1531,62 @@
"umull r1, r10, r6, r6 \n\t" \
"adds r8, r8, r1 \n\t" \
"adcs r11, r11, r10 \n\t" \
- "stmia r0!, {r8, r11} \n\t"
+ "stmia r0!, {r8, r11} \n\t" \
+ "pop {r1, r2} \n\t"
+
+#define FAST_SQUARE_ASM_5_TO_6 \
+ "cmp r2, #5 \n\t" \
+ "beq 1f \n\t" \
+ \
+ "sub r0, #20 \n\t" \
+ "sub r1, #20 \n\t" \
+ \
+ /* Do off-center multiplication */ \
+ "ldmia r1!, {r6,r7,r8,r9,r10,r11} \n\t" \
+ "umull r3, r4, r6, r11 \n\t" \
+ "umull r6, r5, r7, r11 \n\t" \
+ "adds r4, r4, r6 \n\t" \
+ "umull r7, r6, r8, r11 \n\t" \
+ "adcs r5, r5, r7 \n\t" \
+ "umull r8, r7, r9, r11 \n\t" \
+ "adcs r6, r6, r8 \n\t" \
+ "umull r9, r8, r10, r11 \n\t" \
+ "adcs r7, r7, r9 \n\t" \
+ "adcs r8, r8, #0 \n\t" \
+ \
+ /* Multiply by 2 */ \
+ "mov r9, #0 \n\t" \
+ "adds r3, r3, r3 \n\t" \
+ "adcs r4, r4, r4 \n\t" \
+ "adcs r5, r5, r5 \n\t" \
+ "adcs r6, r6, r6 \n\t" \
+ "adcs r7, r7, r7 \n\t" \
+ "adcs r8, r8, r8 \n\t" \
+ "adcs r9, r9, #0 \n\t" \
+ \
+ /* Add into previous */ \
+ "ldr r14, [r0], #4 \n\t" \
+ "adds r3, r3, r14 \n\t" \
+ "ldr r14, [r0], #4 \n\t" \
+ "adcs r4, r4, r14 \n\t" \
+ "ldr r14, [r0], #4 \n\t" \
+ "adcs r5, r5, r14 \n\t" \
+ "ldr r14, [r0], #4 \n\t" \
+ "adcs r6, r6, r14 \n\t" \
+ "ldr r14, [r0], #4 \n\t" \
+ "adcs r7, r7, r14 \n\t" \
+ "adcs r8, r8, #0 \n\t" \
+ "adcs r9, r9, #0 \n\t" \
+ "sub r0, #20 \n\t" \
+ \
+ /* Perform center multiplication */ \
+ "umlal r8, r9, r11, r11 \n\t" \
+ "stmia r0!, {r3,r4,r5,r6,r7,r8,r9} \n\t"
#define FAST_SQUARE_ASM_6 \
+ "push {r2} \n\t" \
"ldmia r1!, {r2,r3,r4,r5,r6,r7} \n\t" \
+ "push {r1} \n\t" \
\
"umull r11, r12, r2, r2 \n\t" \
"stmia r0!, {r11} \n\t" \
@@ -1329,237 +1724,344 @@
"umull r1, r10, r7, r7 \n\t" \
"adds r8, r8, r1 \n\t" \
"adcs r11, r11, r10 \n\t" \
- "stmia r0!, {r8, r11} \n\t"
+ "stmia r0!, {r8, r11} \n\t" \
+ "pop {r1, r2} \n\t"
-#define FAST_SQUARE_ASM_7 \
- "ldmia r1!, {r2} \n\t" \
- "add r1, 20 \n\t" \
- "ldmia r1!, {r5} \n\t" \
- "add r0, 24 \n\t" \
- "umull r8, r9, r2, r5 \n\t" \
- "stmia r0!, {r8, r9} \n\t" \
- "sub r0, 32 \n\t" \
- "sub r1, 28 \n\t" \
- \
- "ldmia r1!, {r2, r3, r4, r5, r6, r7} \n\t" \
- \
- "umull r11, r12, r2, r2 \n\t" \
- "stmia r0!, {r11} \n\t" \
- \
- "mov r9, #0 \n\t" \
- "umull r10, r11, r2, r3 \n\t" \
- "adds r12, r12, r10 \n\t" \
- "adcs r8, r11, #0 \n\t" \
- "adc r9, r9, #0 \n\t" \
- "adds r12, r12, r10 \n\t" \
- "adcs r8, r8, r11 \n\t" \
- "adc r9, r9, #0 \n\t" \
- "stmia r0!, {r12} \n\t" \
- \
- "mov r10, #0 \n\t" \
- "umull r11, r12, r2, r4 \n\t" \
- "adds r11, r11, r11 \n\t" \
- "adcs r12, r12, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "adds r8, r8, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "umull r11, r12, r3, r3 \n\t" \
- "adds r8, r8, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "stmia r0!, {r8} \n\t" \
- \
- "mov r12, #0 \n\t" \
- "umull r8, r11, r2, r5 \n\t" \
- "mov r14, r11 \n\t" \
- "umlal r8, r11, r3, r4 \n\t" \
- "cmp r14, r11 \n\t" \
- "it hi \n\t" \
- "adchi r12, r12, #0 \n\t" \
- "adds r8, r8, r8 \n\t" \
- "adcs r11, r11, r11 \n\t" \
- "adc r12, r12, r12 \n\t" \
- "adds r8, r8, r9 \n\t" \
- "adcs r11, r11, r10 \n\t" \
- "adc r12, r12, #0 \n\t" \
- "stmia r0!, {r8} \n\t" \
- \
- "mov r10, #0 \n\t" \
- "umull r8, r9, r2, r6 \n\t" \
- "mov r14, r9 \n\t" \
- "umlal r8, r9, r3, r5 \n\t" \
- "cmp r14, r9 \n\t" \
- "it hi \n\t" \
- "adchi r10, r10, #0 \n\t" \
- "adds r8, r8, r8 \n\t" \
- "adcs r9, r9, r9 \n\t" \
- "adc r10, r10, r10 \n\t" \
- "mov r14, r9 \n\t" \
- "umlal r8, r9, r4, r4 \n\t" \
- "cmp r14, r9 \n\t" \
- "it hi \n\t" \
- "adchi r10, r10, #0 \n\t" \
- "adds r8, r8, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "stmia r0!, {r8} \n\t" \
- \
- "mov r12, #0 \n\t" \
- "umull r8, r11, r2, r7 \n\t" \
- "mov r14, r11 \n\t" \
- "umlal r8, r11, r3, r6 \n\t" \
- "cmp r14, r11 \n\t" \
- "it hi \n\t" \
- "adchi r12, r12, #0 \n\t" \
- "mov r14, r11 \n\t" \
- "umlal r8, r11, r4, r5 \n\t" \
- "cmp r14, r11 \n\t" \
- "it hi \n\t" \
- "adchi r12, r12, #0 \n\t" \
- "adds r8, r8, r8 \n\t" \
- "adcs r11, r11, r11 \n\t" \
- "adc r12, r12, r12 \n\t" \
- "adds r8, r8, r9 \n\t" \
- "adcs r11, r11, r10 \n\t" \
- "adc r12, r12, #0 \n\t" \
- "stmia r0!, {r8} \n\t" \
- \
- "ldmia r1!, {r2} \n\t" \
- "mov r10, #0 \n\t" \
- "umull r8, r9, r3, r7 \n\t" \
- "mov r14, r9 \n\t" \
- "umlal r8, r9, r4, r6 \n\t" \
- "cmp r14, r9 \n\t" \
- "it hi \n\t" \
- "adchi r10, r10, #0 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adds r8, r8, r14 \n\t" \
- "adcs r9, r9, #0 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "adds r8, r8, r8 \n\t" \
- "adcs r9, r9, r9 \n\t" \
- "adc r10, r10, r10 \n\t" \
- "mov r14, r9 \n\t" \
- "umlal r8, r9, r5, r5 \n\t" \
- "cmp r14, r9 \n\t" \
- "it hi \n\t" \
- "adchi r10, r10, #0 \n\t" \
- "adds r8, r8, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "stmia r0!, {r8} \n\t" \
- \
- "mov r12, #0 \n\t" \
- "umull r8, r11, r3, r2 \n\t" \
- "mov r14, r11 \n\t" \
- "umlal r8, r11, r4, r7 \n\t" \
- "cmp r14, r11 \n\t" \
- "it hi \n\t" \
- "adchi r12, r12, #0 \n\t" \
- "mov r14, r11 \n\t" \
- "umlal r8, r11, r5, r6 \n\t" \
- "cmp r14, r11 \n\t" \
- "it hi \n\t" \
- "adchi r12, r12, #0 \n\t" \
- "ldr r14, [r0] \n\t" \
- "adds r8, r8, r14 \n\t" \
- "adcs r11, r11, #0 \n\t" \
- "adc r12, r12, #0 \n\t" \
- "adds r8, r8, r8 \n\t" \
- "adcs r11, r11, r11 \n\t" \
- "adc r12, r12, r12 \n\t" \
- "adds r8, r8, r9 \n\t" \
- "adcs r11, r11, r10 \n\t" \
- "adc r12, r12, #0 \n\t" \
- "stmia r0!, {r8} \n\t" \
- \
- "mov r10, #0 \n\t" \
- "umull r8, r9, r4, r2 \n\t" \
- "mov r14, r9 \n\t" \
- "umlal r8, r9, r5, r7 \n\t" \
- "cmp r14, r9 \n\t" \
- "it hi \n\t" \
- "adchi r10, r10, #0 \n\t" \
- "adds r8, r8, r8 \n\t" \
- "adcs r9, r9, r9 \n\t" \
- "adc r10, r10, r10 \n\t" \
- "mov r14, r9 \n\t" \
- "umlal r8, r9, r6, r6 \n\t" \
- "cmp r14, r9 \n\t" \
- "it hi \n\t" \
- "adchi r10, r10, #0 \n\t" \
- "adds r8, r8, r11 \n\t" \
- "adcs r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "stmia r0!, {r8} \n\t" \
- \
- "mov r12, #0 \n\t" \
- "umull r8, r11, r5, r2 \n\t" \
- "mov r14, r11 \n\t" \
- "umlal r8, r11, r6, r7 \n\t" \
- "cmp r14, r11 \n\t" \
- "it hi \n\t" \
- "adchi r12, r12, #0 \n\t" \
- "adds r8, r8, r8 \n\t" \
- "adcs r11, r11, r11 \n\t" \
- "adc r12, r12, r12 \n\t" \
- "adds r8, r8, r9 \n\t" \
- "adcs r11, r11, r10 \n\t" \
- "adc r12, r12, #0 \n\t" \
- "stmia r0!, {r8} \n\t" \
- \
- "mov r8, #0 \n\t" \
- "umull r1, r10, r6, r2 \n\t" \
- "adds r1, r1, r1 \n\t" \
- "adcs r10, r10, r10 \n\t" \
- "adc r8, r8, #0 \n\t" \
- "adds r11, r11, r1 \n\t" \
- "adcs r12, r12, r10 \n\t" \
- "adc r8, r8, #0 \n\t" \
- "umull r1, r10, r7, r7 \n\t" \
- "adds r11, r11, r1 \n\t" \
- "adcs r12, r12, r10 \n\t" \
- "adc r8, r8, #0 \n\t" \
- "stmia r0!, {r11} \n\t" \
- \
- "mov r11, #0 \n\t" \
- "umull r1, r10, r7, r2 \n\t" \
- "adds r1, r1, r1 \n\t" \
- "adcs r10, r10, r10 \n\t" \
- "adc r11, r11, #0 \n\t" \
- "adds r12, r12, r1 \n\t" \
- "adcs r8, r8, r10 \n\t" \
- "adc r11, r11, #0 \n\t" \
- "stmia r0!, {r12} \n\t" \
- \
- "umull r1, r10, r2, r2 \n\t" \
- "adds r8, r8, r1 \n\t" \
- "adcs r11, r11, r10 \n\t" \
- "stmia r0!, {r8, r11} \n\t"
+/*
+ * FAST_SQUARE_ASM_6_TO_7: widen an in-progress square from 6 to 7 words.
+ * Skips (beq 1f; the local label "1:" is emitted where this macro is
+ * instantiated, not visible in this fragment) when the value in r2 is 6
+ * (r2 presumably holds num_words -- confirm at the instantiation site).
+ * Computes 2 * (low 6 words * top word r12) + r12^2 and folds it into the
+ * eight result words starting at r0 - 24.  Carry flows unbroken through
+ * the double / add-into-previous / final umlal chain, so no instruction
+ * below may be reordered across a flag-setting one.
+ */
+#define FAST_SQUARE_ASM_6_TO_7 \
+ "cmp r2, #6 \n\t" \
+ "beq 1f \n\t" \
+ \
+ "sub r0, #24 \n\t" \
+ "sub r1, #24 \n\t" \
+ \
+ /* Do off-center multiplication */ \
+ "ldmia r1!, {r6,r7,r8,r9,r10,r11,r12} \n\t" \
+ "umull r3, r4, r6, r12 \n\t" \
+ "umull r6, r5, r7, r12 \n\t" \
+ "adds r4, r4, r6 \n\t" \
+ "umull r7, r6, r8, r12 \n\t" \
+ "adcs r5, r5, r7 \n\t" \
+ "umull r8, r7, r9, r12 \n\t" \
+ "adcs r6, r6, r8 \n\t" \
+ "umull r9, r8, r10, r12 \n\t" \
+ "adcs r7, r7, r9 \n\t" \
+ "umull r10, r9, r11, r12 \n\t" \
+ "adcs r8, r8, r10 \n\t" \
+ "adcs r9, r9, #0 \n\t" \
+ \
+ /* Multiply by 2 */ \
+ "mov r10, #0 \n\t" \
+ "adds r3, r3, r3 \n\t" \
+ "adcs r4, r4, r4 \n\t" \
+ "adcs r5, r5, r5 \n\t" \
+ "adcs r6, r6, r6 \n\t" \
+ "adcs r7, r7, r7 \n\t" \
+ "adcs r8, r8, r8 \n\t" \
+ "adcs r9, r9, r9 \n\t" \
+ "adcs r10, r10, #0 \n\t" \
+ \
+ /* Add into previous */ \
+ "ldr r14, [r0], #4 \n\t" \
+ "adds r3, r3, r14 \n\t" \
+ "ldr r14, [r0], #4 \n\t" \
+ "adcs r4, r4, r14 \n\t" \
+ "ldr r14, [r0], #4 \n\t" \
+ "adcs r5, r5, r14 \n\t" \
+ "ldr r14, [r0], #4 \n\t" \
+ "adcs r6, r6, r14 \n\t" \
+ "ldr r14, [r0], #4 \n\t" \
+ "adcs r7, r7, r14 \n\t" \
+ "ldr r14, [r0], #4 \n\t" \
+ "adcs r8, r8, r14 \n\t" \
+ "adcs r9, r9, #0 \n\t" \
+ "adcs r10, r10, #0 \n\t" \
+ "sub r0, #24 \n\t" \
+ \
+ /* Perform center multiplication */ \
+ "umlal r9, r10, r12, r12 \n\t" \
+ "stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10} \n\t"
+
+/*
+ * FAST_SQUARE_ASM_7: square of a 7-word (224-bit) integer (no UMAAL).
+ * r0 = result cursor (14 words written), r1 = input (7 words); the
+ * caller's r2 and the advanced r1 are saved on the stack and restored by
+ * the final "pop {r1, r2}".  The product word0*word6 is seeded at r0+24
+ * up front so the later read-modify-write columns ("ldr r14, [r0]") find
+ * initialized memory.
+ * Carry-recovery idiom used throughout: umlal sets no flags, so the high
+ * accumulator is copied to r14 before the umlal and compared after;
+ * "it hi / adchi" adds 1 to the spill register when the accumulator
+ * wrapped (new value unsigned-lower than old).  Do not reorder any
+ * instruction across the adds/adcs/adc chains.
+ */
+#define FAST_SQUARE_ASM_7 \
+ "push {r2} \n\t" \
+ "ldmia r1!, {r2, r3, r4, r5, r6, r7, r8} \n\t" \
+ "push {r1} \n\t" \
+ "sub r1, 4 \n\t" \
+ \
+ "add r0, 24 \n\t" \
+ "umull r9, r10, r2, r8 \n\t" \
+ "stmia r0!, {r9, r10} \n\t" \
+ "sub r0, 32 \n\t" \
+ \
+ "umull r11, r12, r2, r2 \n\t" \
+ "stmia r0!, {r11} \n\t" \
+ \
+ "mov r9, #0 \n\t" \
+ "umull r10, r11, r2, r3 \n\t" \
+ "adds r12, r12, r10 \n\t" \
+ "adcs r8, r11, #0 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "adds r12, r12, r10 \n\t" \
+ "adcs r8, r8, r11 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "stmia r0!, {r12} \n\t" \
+ \
+ "mov r10, #0 \n\t" \
+ "umull r11, r12, r2, r4 \n\t" \
+ "adds r11, r11, r11 \n\t" \
+ "adcs r12, r12, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "adds r8, r8, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "umull r11, r12, r3, r3 \n\t" \
+ "adds r8, r8, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "stmia r0!, {r8} \n\t" \
+ \
+ "mov r12, #0 \n\t" \
+ "umull r8, r11, r2, r5 \n\t" \
+ "mov r14, r11 \n\t" \
+ "umlal r8, r11, r3, r4 \n\t" \
+ "cmp r14, r11 \n\t" \
+ "it hi \n\t" \
+ "adchi r12, r12, #0 \n\t" \
+ "adds r8, r8, r8 \n\t" \
+ "adcs r11, r11, r11 \n\t" \
+ "adc r12, r12, r12 \n\t" \
+ "adds r8, r8, r9 \n\t" \
+ "adcs r11, r11, r10 \n\t" \
+ "adc r12, r12, #0 \n\t" \
+ "stmia r0!, {r8} \n\t" \
+ \
+ "mov r10, #0 \n\t" \
+ "umull r8, r9, r2, r6 \n\t" \
+ "mov r14, r9 \n\t" \
+ "umlal r8, r9, r3, r5 \n\t" \
+ "cmp r14, r9 \n\t" \
+ "it hi \n\t" \
+ "adchi r10, r10, #0 \n\t" \
+ "adds r8, r8, r8 \n\t" \
+ "adcs r9, r9, r9 \n\t" \
+ "adc r10, r10, r10 \n\t" \
+ "mov r14, r9 \n\t" \
+ "umlal r8, r9, r4, r4 \n\t" \
+ "cmp r14, r9 \n\t" \
+ "it hi \n\t" \
+ "adchi r10, r10, #0 \n\t" \
+ "adds r8, r8, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "stmia r0!, {r8} \n\t" \
+ \
+ "mov r12, #0 \n\t" \
+ "umull r8, r11, r2, r7 \n\t" \
+ "mov r14, r11 \n\t" \
+ "umlal r8, r11, r3, r6 \n\t" \
+ "cmp r14, r11 \n\t" \
+ "it hi \n\t" \
+ "adchi r12, r12, #0 \n\t" \
+ "mov r14, r11 \n\t" \
+ "umlal r8, r11, r4, r5 \n\t" \
+ "cmp r14, r11 \n\t" \
+ "it hi \n\t" \
+ "adchi r12, r12, #0 \n\t" \
+ "adds r8, r8, r8 \n\t" \
+ "adcs r11, r11, r11 \n\t" \
+ "adc r12, r12, r12 \n\t" \
+ "adds r8, r8, r9 \n\t" \
+ "adcs r11, r11, r10 \n\t" \
+ "adc r12, r12, #0 \n\t" \
+ "stmia r0!, {r8} \n\t" \
+ \
+ "ldmia r1!, {r2} \n\t" \
+ "mov r10, #0 \n\t" \
+ "umull r8, r9, r3, r7 \n\t" \
+ "mov r14, r9 \n\t" \
+ "umlal r8, r9, r4, r6 \n\t" \
+ "cmp r14, r9 \n\t" \
+ "it hi \n\t" \
+ "adchi r10, r10, #0 \n\t" \
+ "ldr r14, [r0] \n\t" \
+ "adds r8, r8, r14 \n\t" \
+ "adcs r9, r9, #0 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "adds r8, r8, r8 \n\t" \
+ "adcs r9, r9, r9 \n\t" \
+ "adc r10, r10, r10 \n\t" \
+ "mov r14, r9 \n\t" \
+ "umlal r8, r9, r5, r5 \n\t" \
+ "cmp r14, r9 \n\t" \
+ "it hi \n\t" \
+ "adchi r10, r10, #0 \n\t" \
+ "adds r8, r8, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "stmia r0!, {r8} \n\t" \
+ \
+ "mov r12, #0 \n\t" \
+ "umull r8, r11, r3, r2 \n\t" \
+ "mov r14, r11 \n\t" \
+ "umlal r8, r11, r4, r7 \n\t" \
+ "cmp r14, r11 \n\t" \
+ "it hi \n\t" \
+ "adchi r12, r12, #0 \n\t" \
+ "mov r14, r11 \n\t" \
+ "umlal r8, r11, r5, r6 \n\t" \
+ "cmp r14, r11 \n\t" \
+ "it hi \n\t" \
+ "adchi r12, r12, #0 \n\t" \
+ "ldr r14, [r0] \n\t" \
+ "adds r8, r8, r14 \n\t" \
+ "adcs r11, r11, #0 \n\t" \
+ "adc r12, r12, #0 \n\t" \
+ "adds r8, r8, r8 \n\t" \
+ "adcs r11, r11, r11 \n\t" \
+ "adc r12, r12, r12 \n\t" \
+ "adds r8, r8, r9 \n\t" \
+ "adcs r11, r11, r10 \n\t" \
+ "adc r12, r12, #0 \n\t" \
+ "stmia r0!, {r8} \n\t" \
+ \
+ "mov r10, #0 \n\t" \
+ "umull r8, r9, r4, r2 \n\t" \
+ "mov r14, r9 \n\t" \
+ "umlal r8, r9, r5, r7 \n\t" \
+ "cmp r14, r9 \n\t" \
+ "it hi \n\t" \
+ "adchi r10, r10, #0 \n\t" \
+ "adds r8, r8, r8 \n\t" \
+ "adcs r9, r9, r9 \n\t" \
+ "adc r10, r10, r10 \n\t" \
+ "mov r14, r9 \n\t" \
+ "umlal r8, r9, r6, r6 \n\t" \
+ "cmp r14, r9 \n\t" \
+ "it hi \n\t" \
+ "adchi r10, r10, #0 \n\t" \
+ "adds r8, r8, r11 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ "stmia r0!, {r8} \n\t" \
+ \
+ "mov r12, #0 \n\t" \
+ "umull r8, r11, r5, r2 \n\t" \
+ "mov r14, r11 \n\t" \
+ "umlal r8, r11, r6, r7 \n\t" \
+ "cmp r14, r11 \n\t" \
+ "it hi \n\t" \
+ "adchi r12, r12, #0 \n\t" \
+ "adds r8, r8, r8 \n\t" \
+ "adcs r11, r11, r11 \n\t" \
+ "adc r12, r12, r12 \n\t" \
+ "adds r8, r8, r9 \n\t" \
+ "adcs r11, r11, r10 \n\t" \
+ "adc r12, r12, #0 \n\t" \
+ "stmia r0!, {r8} \n\t" \
+ \
+ "mov r8, #0 \n\t" \
+ "umull r1, r10, r6, r2 \n\t" \
+ "adds r1, r1, r1 \n\t" \
+ "adcs r10, r10, r10 \n\t" \
+ "adc r8, r8, #0 \n\t" \
+ "adds r11, r11, r1 \n\t" \
+ "adcs r12, r12, r10 \n\t" \
+ "adc r8, r8, #0 \n\t" \
+ "umull r1, r10, r7, r7 \n\t" \
+ "adds r11, r11, r1 \n\t" \
+ "adcs r12, r12, r10 \n\t" \
+ "adc r8, r8, #0 \n\t" \
+ "stmia r0!, {r11} \n\t" \
+ \
+ "mov r11, #0 \n\t" \
+ "umull r1, r10, r7, r2 \n\t" \
+ "adds r1, r1, r1 \n\t" \
+ "adcs r10, r10, r10 \n\t" \
+ "adc r11, r11, #0 \n\t" \
+ "adds r12, r12, r1 \n\t" \
+ "adcs r8, r8, r10 \n\t" \
+ "adc r11, r11, #0 \n\t" \
+ "stmia r0!, {r12} \n\t" \
+ \
+ "umull r1, r10, r2, r2 \n\t" \
+ "adds r8, r8, r1 \n\t" \
+ "adcs r11, r11, r10 \n\t" \
+ "stmia r0!, {r8, r11} \n\t" \
+ "pop {r1, r2} \n\t"
+
+/*
+ * FAST_SQUARE_ASM_7_TO_8: widen an in-progress square from 7 to 8 words.
+ * Skips (beq 1f; local label at the instantiation site) when r2 is 7
+ * (r2 presumably num_words -- confirm at the caller).  Same scheme as
+ * FAST_SQUARE_ASM_6_TO_7 but with the top word in r14 and a 28-byte
+ * rewind: 2 * (low 7 words * r14) + r14^2 is folded into the nine result
+ * words starting at r0 - 28.  Flag-carrying chains below are
+ * order-sensitive.
+ */
+#define FAST_SQUARE_ASM_7_TO_8 \
+ "cmp r2, #7 \n\t" \
+ "beq 1f \n\t" \
+ \
+ "sub r0, #28 \n\t" \
+ "sub r1, #28 \n\t" \
+ \
+ /* Do off-center multiplication */ \
+ "ldmia r1!, {r6,r7,r8,r9,r10,r11,r12,r14} \n\t" \
+ "umull r3, r4, r6, r14 \n\t" \
+ "umull r6, r5, r7, r14 \n\t" \
+ "adds r4, r4, r6 \n\t" \
+ "umull r7, r6, r8, r14 \n\t" \
+ "adcs r5, r5, r7 \n\t" \
+ "umull r8, r7, r9, r14 \n\t" \
+ "adcs r6, r6, r8 \n\t" \
+ "umull r9, r8, r10, r14 \n\t" \
+ "adcs r7, r7, r9 \n\t" \
+ "umull r10, r9, r11, r14 \n\t" \
+ "adcs r8, r8, r10 \n\t" \
+ "umull r11, r10, r12, r14 \n\t" \
+ "adcs r9, r9, r11 \n\t" \
+ "adcs r10, r10, #0 \n\t" \
+ \
+ /* Multiply by 2 */ \
+ "mov r11, #0 \n\t" \
+ "adds r3, r3, r3 \n\t" \
+ "adcs r4, r4, r4 \n\t" \
+ "adcs r5, r5, r5 \n\t" \
+ "adcs r6, r6, r6 \n\t" \
+ "adcs r7, r7, r7 \n\t" \
+ "adcs r8, r8, r8 \n\t" \
+ "adcs r9, r9, r9 \n\t" \
+ "adcs r10, r10, r10 \n\t" \
+ "adcs r11, r11, #0 \n\t" \
+ \
+ /* Add into previous */ \
+ "ldr r12, [r0], #4 \n\t" \
+ "adds r3, r3, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r4, r4, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r5, r5, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r6, r6, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r7, r7, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r8, r8, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adcs r10, r10, #0 \n\t" \
+ "adcs r11, r11, #0 \n\t" \
+ "sub r0, #28 \n\t" \
+ \
+ /* Perform center multiplication */ \
+ "umlal r10, r11, r14, r14 \n\t" \
+ "stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10,r11} \n\t"
#define FAST_SQUARE_ASM_8 \
- "ldmia r1!, {r2, r3} \n\t" \
- "add r1, 16 \n\t" \
- "ldmia r1!, {r5, r6} \n\t" \
+ "push {r2} \n\t" \
+ "ldmia r1!, {r2,r3,r4,r5,r6,r7,r8,r9} \n\t" \
+ "push {r1} \n\t" \
+ "sub r1, 8 \n\t" \
+ \
"add r0, 24 \n\t" \
- \
- "umull r8, r9, r2, r5 \n\t" \
- "stmia r0!, {r8} \n\t" \
- \
- "umull r12, r10, r2, r6 \n\t" \
- "adds r9, r9, r12 \n\t" \
- "adc r10, r10, #0 \n\t" \
- "stmia r0!, {r9} \n\t" \
- \
- "umull r8, r9, r3, r6 \n\t" \
- "adds r10, r10, r8 \n\t" \
- "adc r11, r9, #0 \n\t" \
- "stmia r0!, {r10, r11} \n\t" \
- \
+ "umull r10, r11, r2, r8 \n\t" \
+ "umull r12, r14, r2, r9 \n\t" \
+ "umull r8, r9, r3, r9 \n\t" \
+ "adds r11, r11, r12 \n\t" \
+ "adcs r12, r14, r8 \n\t" \
+ "adcs r14, r9, #0 \n\t" \
+ "stmia r0!, {r10, r11, r12, r14} \n\t" \
"sub r0, 40 \n\t" \
- "sub r1, 32 \n\t" \
- "ldmia r1!, {r2,r3,r4,r5,r6,r7} \n\t" \
\
"umull r11, r12, r2, r2 \n\t" \
"stmia r0!, {r11} \n\t" \
@@ -1803,6 +2305,7 @@
"umull r1, r10, r3, r3 \n\t" \
"adds r8, r8, r1 \n\t" \
"adcs r11, r11, r10 \n\t" \
- "stmia r0!, {r8, r11} \n\t"
+ "stmia r0!, {r8, r11} \n\t" \
+ "pop {r1, r2} \n\t"
#endif /* _UECC_ASM_ARM_MULT_SQUARE_H_ */
diff --git a/asm_arm_mult_square_umaal.inc b/asm_arm_mult_square_umaal.inc
new file mode 100644
index 0000000..c554d20
--- /dev/null
+++ b/asm_arm_mult_square_umaal.inc
@@ -0,0 +1,1202 @@
+/* Copyright 2015, Kenneth MacKay. Licensed under the BSD 2-clause license. */
+
+#ifndef _UECC_ASM_ARM_MULT_SQUARE_H_
+#define _UECC_ASM_ARM_MULT_SQUARE_H_
+
+/*
+ * FAST_MULT_ASM_5: 5x5-word (160x160-bit) multiply using UMAAL.
+ * r0 = result (10 words), r1 = left operand, r2 = right operand.
+ * The five right-hand words are held in r3-r7 for the whole product; each
+ * left word is loaded into r2 in turn and one result word per column is
+ * written.  UMAAL computes a*b + acc_lo + acc_hi, which cannot overflow
+ * 64 bits, so no flag bookkeeping is needed between columns.  The
+ * caller's r3 and the advanced r2 are saved on the stack and restored by
+ * the final "pop {r2, r3}".
+ */
+#define FAST_MULT_ASM_5 \
+ "push {r3} \n\t" \
+ "ldmia r2!, {r3, r4, r5, r6, r7} \n\t" \
+ "push {r2} \n\t" \
+ \
+ "ldr r2, [r1], #4 \n\t" \
+ "umull r8, r9, r3, r2 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "mov r10, #0 \n\t" \
+ "umaal r9, r10, r4, r2 \n\t" \
+ "mov r11, #0 \n\t" \
+ "umaal r10, r11, r5, r2 \n\t" \
+ "mov r12, #0 \n\t" \
+ "umaal r11, r12, r6, r2 \n\t" \
+ "mov r14, #0 \n\t" \
+ "umaal r12, r14, r7, r2 \n\t" \
+ \
+ "ldr r2, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r3, r2 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r4, r2 \n\t" \
+ "umaal r10, r11, r5, r2 \n\t" \
+ "umaal r11, r12, r6, r2 \n\t" \
+ "umaal r12, r14, r7, r2 \n\t" \
+ \
+ "ldr r2, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r3, r2 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r4, r2 \n\t" \
+ "umaal r10, r11, r5, r2 \n\t" \
+ "umaal r11, r12, r6, r2 \n\t" \
+ "umaal r12, r14, r7, r2 \n\t" \
+ \
+ "ldr r2, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r3, r2 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r4, r2 \n\t" \
+ "umaal r10, r11, r5, r2 \n\t" \
+ "umaal r11, r12, r6, r2 \n\t" \
+ "umaal r12, r14, r7, r2 \n\t" \
+ \
+ "ldr r2, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r3, r2 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r4, r2 \n\t" \
+ "umaal r10, r11, r5, r2 \n\t" \
+ "umaal r11, r12, r6, r2 \n\t" \
+ "umaal r12, r14, r7, r2 \n\t" \
+ \
+ /* Flush the accumulator pipeline: top 5 result words */ \
+ "str r9, [r0], #4 \n\t" \
+ "str r10, [r0], #4 \n\t" \
+ "str r11, [r0], #4 \n\t" \
+ "str r12, [r0], #4 \n\t" \
+ "str r14, [r0], #4 \n\t" \
+ \
+ "pop {r2, r3} \n\t"
+
+/*
+ * FAST_MULT_ASM_5_TO_6: widen a finished 5x5 product to 6x6 when r3 is
+ * not 5 (r3 presumably num_words -- confirm at the instantiation site;
+ * when it is 5, beq 1f jumps to the local label emitted after this
+ * fragment).  Rewinds r0/r1/r2 by 20 bytes, folds left_high * right[0..4]
+ * into the existing product ("Do right side"), then right_high *
+ * left[0..5] ("Do left side"), emitting the two new top words last.
+ * UMAAL's overflow-free accumulate carries all cross terms without flags.
+ */
+#define FAST_MULT_ASM_5_TO_6 \
+ "cmp r3, #5 \n\t" \
+ "beq 1f \n\t" \
+ \
+ /* r4 = left high */ \
+ "ldr r4, [r1] \n\t" \
+ \
+ "sub r0, #20 \n\t" \
+ "sub r1, #20 \n\t" \
+ "sub r2, #20 \n\t" \
+ \
+ /* Do right side */ \
+ "ldr r14, [r2], #4 \n\t" \
+ "mov r5, #0 \n\t" \
+ "ldr r6, [r0], #4 \n\t" \
+ "umaal r5, r6, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r7, [r0], #4 \n\t" \
+ "umaal r6, r7, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r8, [r0], #4 \n\t" \
+ "umaal r7, r8, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r9, [r0], #4 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r10, [r0], #4 \n\t" \
+ "umaal r9, r10, r4, r14 \n\t" \
+ "sub r0, #20 \n\t" \
+ \
+ /* r4 = right high */ \
+ "ldr r4, [r2], #4 \n\t" \
+ \
+ /* Do left side */ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r12, #0 \n\t" \
+ "umaal r12, r5, r4, r14 \n\t" \
+ "str r12, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r5, r6, r4, r14 \n\t" \
+ "str r5, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r6, r7, r4, r14 \n\t" \
+ "str r6, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r7, r8, r4, r14 \n\t" \
+ "str r7, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r9, r10, r4, r14 \n\t" \
+ "stmia r0!, {r9, r10} \n\t"
+
+/*
+ * FAST_MULT_ASM_6: 6x6-word multiply using UMAAL, done in two passes of
+ * three right-hand words each (held in r4-r6).  r0 = result (12 words),
+ * r1 = left, r2 = right.  Pass 1 writes the partial product for
+ * right[0..2]; pass 2 rewinds r0/r1 by 24 bytes, loads right[3..5], and
+ * accumulates into the existing partial words ("ldr r8, [r0]" before each
+ * column's UMAAL).  No flags are needed: UMAAL's 32x32+32+32 accumulate
+ * cannot overflow 64 bits.
+ */
+#define FAST_MULT_ASM_6 \
+ "ldmia r2!, {r4, r5, r6} \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "umull r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "mov r10, #0 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "mov r11, #0 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "str r9, [r0], #4 \n\t" \
+ "str r10, [r0], #4 \n\t" \
+ "str r11, [r0], #4 \n\t" \
+ \
+ /* Second pass: accumulate right[3..5] into the partial product */ \
+ "sub r0, #24 \n\t" \
+ "sub r1, #24 \n\t" \
+ "ldmia r2!, {r4, r5, r6} \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "mov r9, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "mov r10, #0 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "mov r11, #0 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "str r9, [r0], #4 \n\t" \
+ "str r10, [r0], #4 \n\t" \
+ "str r11, [r0], #4 \n\t"
+
+/*
+ * FAST_MULT_ASM_6_TO_7: widen a finished 6x6 product to 7x7 when r3 is
+ * not 6 (r3 presumably num_words -- confirm at the instantiation site).
+ * Same scheme as FAST_MULT_ASM_5_TO_6 with a 24-byte rewind:
+ * left_high * right[0..5] is folded into the existing product, then
+ * right_high * left[0..6], with the two new top words stored last.
+ */
+#define FAST_MULT_ASM_6_TO_7 \
+ "cmp r3, #6 \n\t" \
+ "beq 1f \n\t" \
+ \
+ /* r4 = left high */ \
+ "ldr r4, [r1] \n\t" \
+ \
+ "sub r0, #24 \n\t" \
+ "sub r1, #24 \n\t" \
+ "sub r2, #24 \n\t" \
+ \
+ /* Do right side */ \
+ "ldr r14, [r2], #4 \n\t" \
+ "mov r5, #0 \n\t" \
+ "ldr r6, [r0], #4 \n\t" \
+ "umaal r5, r6, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r7, [r0], #4 \n\t" \
+ "umaal r6, r7, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r8, [r0], #4 \n\t" \
+ "umaal r7, r8, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r9, [r0], #4 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r10, [r0], #4 \n\t" \
+ "umaal r9, r10, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r11, [r0], #4 \n\t" \
+ "umaal r10, r11, r4, r14 \n\t" \
+ "sub r0, #24 \n\t" \
+ \
+ /* r4 = right high */ \
+ "ldr r4, [r2], #4 \n\t" \
+ \
+ /* Do left side */ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r12, #0 \n\t" \
+ "umaal r12, r5, r4, r14 \n\t" \
+ "str r12, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r5, r6, r4, r14 \n\t" \
+ "str r5, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r6, r7, r4, r14 \n\t" \
+ "str r6, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r7, r8, r4, r14 \n\t" \
+ "str r7, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r9, r10, r4, r14 \n\t" \
+ "str r9, [r0], #4 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r10, r11, r4, r14 \n\t" \
+ "stmia r0!, {r10, r11} \n\t"
+
+/*
+ * FAST_MULT_ASM_7: 7x7-word multiply using UMAAL, in two passes: first
+ * four right-hand words (r4-r7), then the remaining three (r4-r6)
+ * accumulated into the partial product after rewinding r0/r1 by 28
+ * bytes.  r0 = result (14 words), r1 = left, r2 = right.
+ */
+#define FAST_MULT_ASM_7 \
+ "ldmia r2!, {r4, r5, r6, r7} \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "umull r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "mov r10, #0 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "mov r11, #0 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "mov r12, #0 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "str r9, [r0], #4 \n\t" \
+ "str r10, [r0], #4 \n\t" \
+ "str r11, [r0], #4 \n\t" \
+ "str r12, [r0], #4 \n\t" \
+ \
+ /* Second pass: accumulate right[4..6] into the partial product */ \
+ "sub r0, #28 \n\t" \
+ "sub r1, #28 \n\t" \
+ "ldmia r2!, {r4, r5, r6} \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "mov r9, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "mov r10, #0 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "mov r11, #0 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ \
+ "str r9, [r0], #4 \n\t" \
+ "str r10, [r0], #4 \n\t" \
+ "str r11, [r0], #4 \n\t"
+
+/*
+ * FAST_MULT_ASM_7_TO_8: widen a finished 7x7 product to 8x8 when r3 is
+ * not 7 (r3 presumably num_words -- confirm at the instantiation site).
+ * Same scheme as the other _TO_ widening macros with a 28-byte rewind;
+ * unlike them, r3 is also needed as scratch for the first stored word,
+ * so it is saved/restored around the body.
+ */
+#define FAST_MULT_ASM_7_TO_8 \
+ "cmp r3, #7 \n\t" \
+ "beq 1f \n\t" \
+ "push {r3} \n\t" \
+ \
+ /* r4 = left high */ \
+ "ldr r4, [r1] \n\t" \
+ \
+ "sub r0, #28 \n\t" \
+ "sub r1, #28 \n\t" \
+ "sub r2, #28 \n\t" \
+ \
+ /* Do right side */ \
+ "ldr r14, [r2], #4 \n\t" \
+ "mov r5, #0 \n\t" \
+ "ldr r6, [r0], #4 \n\t" \
+ "umaal r5, r6, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r7, [r0], #4 \n\t" \
+ "umaal r6, r7, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r8, [r0], #4 \n\t" \
+ "umaal r7, r8, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r9, [r0], #4 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r10, [r0], #4 \n\t" \
+ "umaal r9, r10, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r11, [r0], #4 \n\t" \
+ "umaal r10, r11, r4, r14 \n\t" \
+ "ldr r14, [r2], #4 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "umaal r11, r12, r4, r14 \n\t" \
+ "sub r0, #28 \n\t" \
+ \
+ /* r4 = right high */ \
+ "ldr r4, [r2], #4 \n\t" \
+ \
+ /* Do left side */ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r3, #0 \n\t" \
+ "umaal r3, r5, r4, r14 \n\t" \
+ "str r3, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r5, r6, r4, r14 \n\t" \
+ "str r5, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r6, r7, r4, r14 \n\t" \
+ "str r6, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r7, r8, r4, r14 \n\t" \
+ "str r7, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r9, r10, r4, r14 \n\t" \
+ "str r9, [r0], #4 \n\t" \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r10, r11, r4, r14 \n\t" \
+ "str r10, [r0], #4 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "umaal r11, r12, r4, r14 \n\t" \
+ "stmia r0!, {r11, r12} \n\t" \
+ "pop {r3} \n\t"
+
+/*
+ * FAST_MULT_ASM_8: 8x8-word (256x256-bit) multiply using UMAAL, in two
+ * passes of four right-hand words each (r4-r7).  r0 = result (16 words),
+ * r1 = left, r2 = right.  Pass 2 rewinds r0/r1 by 32 bytes and
+ * accumulates into the partial product written by pass 1.
+ */
+#define FAST_MULT_ASM_8 \
+ "ldmia r2!, {r4, r5, r6, r7} \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "umull r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "mov r10, #0 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "mov r11, #0 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "mov r12, #0 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "mov r8, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "str r9, [r0], #4 \n\t" \
+ "str r10, [r0], #4 \n\t" \
+ "str r11, [r0], #4 \n\t" \
+ "str r12, [r0], #4 \n\t" \
+ \
+ /* Second pass: accumulate right[4..7] into the partial product */ \
+ "sub r0, #32 \n\t" \
+ "sub r1, #32 \n\t" \
+ "ldmia r2!, {r4, r5, r6, r7} \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "mov r9, #0 \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "mov r10, #0 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "mov r11, #0 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "mov r12, #0 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "ldr r14, [r1], #4 \n\t" \
+ "ldr r8, [r0] \n\t" \
+ "umaal r8, r9, r4, r14 \n\t" \
+ "str r8, [r0], #4 \n\t" \
+ "umaal r9, r10, r5, r14 \n\t" \
+ "umaal r10, r11, r6, r14 \n\t" \
+ "umaal r11, r12, r7, r14 \n\t" \
+ \
+ "str r9, [r0], #4 \n\t" \
+ "str r10, [r0], #4 \n\t" \
+ "str r11, [r0], #4 \n\t" \
+ "str r12, [r0], #4 \n\t"
+
+/*
+ * FAST_SQUARE_ASM_5: square of a 5-word (160-bit) integer using UMAAL.
+ * r0 = result (10 words), r1 = input (words land in r9,r10,r11,r12,r14);
+ * r1/r2 are saved and restored via the stack.  Cross products are built
+ * with umull/umaal (which leave the flags untouched), doubled with
+ * adds/adcs chains, and the diagonal squares are interleaved.  Because
+ * the doubling chains clobber the carry flag, pending carries are parked
+ * in r9/r10 as 0/1 values (see inline comments) and re-injected later --
+ * "lsrs r9, #1" shifts the saved bit back into C and zeroes r9.
+ * Strictly order-dependent; do not reorder across flag-setting ops.
+ */
+#define FAST_SQUARE_ASM_5 \
+ "ldmia r1!, {r9,r10,r11,r12,r14} \n\t" \
+ "push {r1, r2} \n\t" \
+ \
+ "umull r1, r2, r10, r9 \n\t" \
+ "mov r3, #0 \n\t" \
+ "umaal r2, r3, r11, r9 \n\t" \
+ "mov r4, #0 \n\t" \
+ "umaal r3, r4, r12, r9 \n\t" \
+ "mov r5, #0 \n\t" \
+ "umaal r4, r5, r14, r9 \n\t" \
+ \
+ "mov r6, #0 \n\t" \
+ "umaal r6, r3, r11, r10 \n\t" \
+ "umaal r3, r4, r12, r10 \n\t" \
+ "adds r1, r1, r1 \n\t" \
+ "adcs r2, r2, r2 \n\t" \
+ "adcs r6, r6, r6 \n\t" \
+ "adcs r3, r3, r3 \n\t" \
+ \
+ "umull r7, r8, r9, r9 \n\t" \
+ /* Store carry in r9 */ \
+ "mov r9, #0 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ "adds r8, r8, r1 \n\t" \
+ "stmia r0!, {r7,r8} \n\t" \
+ \
+ "umull r7, r8, r10, r10 \n\t" \
+ "adcs r7, r7, r2 \n\t" \
+ "adcs r8, r8, r6 \n\t" \
+ "stmia r0!, {r7,r8} \n\t" \
+ \
+ "umaal r4, r5, r14, r10 \n\t" \
+ /* Store carry in r10 */ \
+ "mov r10, #0 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ \
+ "mov r1, #0 \n\t" \
+ "umaal r1, r4, r12, r11 \n\t" \
+ "umaal r4, r5, r14, r11 \n\t" \
+ \
+ "mov r2, #0 \n\t" \
+ "umaal r2, r5, r14, r12 \n\t" \
+ /* Load carry from r9 */ \
+ "lsrs r9, #1 \n\t" \
+ "adcs r1, r1, r1 \n\t" \
+ "adcs r4, r4, r4 \n\t" \
+ "adcs r2, r2, r2 \n\t" \
+ "adcs r5, r5, r5 \n\t" \
+ /* r9 is 0 now */ \
+ "adc r9, r9, #0 \n\t" \
+ \
+ /* Use carry from r10 */ \
+ "umaal r3, r10, r11, r11 \n\t" \
+ "adds r10, r10, r1 \n\t" \
+ "stmia r0!, {r3,r10} \n\t" \
+ \
+ "umull r6, r10, r12, r12 \n\t" \
+ "adcs r6, r6, r4 \n\t" \
+ "adcs r10, r10, r2 \n\t" \
+ "stmia r0!, {r6,r10} \n\t" \
+ \
+ "umull r6, r10, r14, r14 \n\t" \
+ "adcs r6, r6, r5 \n\t" \
+ "adcs r10, r10, r9 \n\t" \
+ "stmia r0!, {r6,r10} \n\t" \
+ "pop {r1, r2} \n\t"
+
+#define FAST_SQUARE_ASM_5_TO_6 /* extend a finished 5-word square to 6 words; r2 = curve word count */ \
+ "cmp r2, #5 \n\t" /* nothing to do for a 5-word curve */ \
+ "beq 1f \n\t" \
+ \
+ "sub r0, #20 \n\t" /* rewind both pointers by 5 words */ \
+ "sub r1, #20 \n\t" \
+ \
+ /* Do off-center multiplication */ \
+ "ldmia r1!, {r5,r6,r7,r8,r9,r14} \n\t" /* a0..a4 plus the new high word a5 (r14) */ \
+ "umull r3, r4, r5, r14 \n\t" /* a0*a5 */ \
+ "mov r5, #0 \n\t" \
+ "umaal r4, r5, r6, r14 \n\t" /* a1*a5 */ \
+ "mov r6, #0 \n\t" \
+ "umaal r5, r6, r7, r14 \n\t" /* a2*a5 */ \
+ "mov r7, #0 \n\t" \
+ "umaal r6, r7, r8, r14 \n\t" /* a3*a5 */ \
+ "mov r8, #0 \n\t" \
+ "umaal r7, r8, r9, r14 \n\t" /* a4*a5 */ \
+ \
+ /* Multiply by 2 */ \
+ "mov r9, #0 \n\t" \
+ "adds r3, r3, r3 \n\t" /* off-center products appear twice in a square */ \
+ "adcs r4, r4, r4 \n\t" \
+ "adcs r5, r5, r5 \n\t" \
+ "adcs r6, r6, r6 \n\t" \
+ "adcs r7, r7, r7 \n\t" \
+ "adcs r8, r8, r8 \n\t" \
+ "adcs r9, r9, #0 \n\t" /* capture the doubling carry */ \
+ \
+ /* Add into previous */ \
+ "ldr r12, [r0], #4 \n\t" /* accumulate into the existing upper result words */ \
+ "adds r3, r3, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r4, r4, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r5, r5, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r6, r6, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r7, r7, r12 \n\t" \
+ "adcs r8, r8, #0 \n\t" \
+ "adcs r9, r9, #0 \n\t" \
+ "sub r0, #20 \n\t" \
+ \
+ /* Perform center multiplication */ \
+ "umlal r8, r9, r14, r14 \n\t" /* add a5^2 into the top two words */ \
+ "stmia r0!, {r3,r4,r5,r6,r7,r8,r9} \n\t" /* write result words 5-11 */
+
+#define FAST_SQUARE_ASM_6 /* square the 6-word (192-bit) integer at [r1] into 12 words at [r0] */ \
+ "ldmia r1!, {r8,r9,r10,r11,r12,r14} \n\t" /* load input words a0..a5 */ \
+ "push {r1, r2} \n\t" /* free r1, r2 for use as scratch */ \
+ \
+ "umull r1, r2, r9, r8 \n\t" /* cross products with a0 (r8): a1*a0 */ \
+ "mov r3, #0 \n\t" \
+ "umaal r2, r3, r10, r8 \n\t" /* a2*a0 (UMAAL: rHi:rLo = rn*rm + rLo + rHi) */ \
+ "mov r4, #0 \n\t" \
+ "umaal r3, r4, r11, r8 \n\t" /* a3*a0 */ \
+ "mov r5, #0 \n\t" \
+ "umaal r4, r5, r12, r8 \n\t" /* a4*a0 */ \
+ "mov r6, #0 \n\t" \
+ "umaal r5, r6, r14, r8 \n\t" /* a5*a0 */ \
+ \
+ "mov r7, #0 \n\t" \
+ "umaal r7, r3, r10, r9 \n\t" /* cross products with a1 (r9): a2*a1 */ \
+ "umaal r3, r4, r11, r9 \n\t" /* a3*a1 */ \
+ "umaal r4, r5, r12, r9 \n\t" /* a4*a1 */ \
+ "push {r4, r5} \n\t" /* spill two middle columns; registers are scarce */ \
+ "adds r1, r1, r1 \n\t" /* double the low cross-product columns */ \
+ "adcs r2, r2, r2 \n\t" \
+ "adcs r7, r7, r7 \n\t" \
+ "adcs r3, r3, r3 \n\t" \
+ \
+ "umull r4, r5, r8, r8 \n\t" /* a0^2; umull leaves the carry flag intact */ \
+ /* Store carry in r8 */ \
+ "mov r8, #0 \n\t" \
+ "adc r8, r8, #0 \n\t" \
+ "adds r5, r5, r1 \n\t" /* add doubled columns onto the diagonal squares */ \
+ "stmia r0!, {r4,r5} \n\t" /* result words 0-1 */ \
+ \
+ "umull r4, r5, r9, r9 \n\t" /* a1^2 */ \
+ "adcs r4, r4, r2 \n\t" \
+ "adcs r5, r5, r7 \n\t" \
+ "stmia r0!, {r4,r5} \n\t" /* result words 2-3 */ \
+ \
+ "pop {r4, r5} \n\t" /* reload the spilled middle columns */ \
+ "umaal r5, r6, r14, r9 \n\t" /* a5*a1 */ \
+ /* Store carry in r9 */ \
+ "mov r9, #0 \n\t" \
+ "adc r9, r9, #0 \n\t" \
+ \
+ "mov r1, #0 \n\t" \
+ "umaal r1, r4, r11, r10 \n\t" /* cross products with a2 (r10): a3*a2 */ \
+ "umaal r4, r5, r12, r10 \n\t" /* a4*a2 */ \
+ "umaal r5, r6, r14, r10 \n\t" /* a5*a2 */ \
+ \
+ "mov r2, #0 \n\t" \
+ "umaal r2, r5, r12, r11 \n\t" /* a4*a3 */ \
+ "umaal r5, r6, r14, r11 \n\t" /* a5*a3 */ \
+ \
+ "mov r7, #0 \n\t" \
+ "umaal r7, r6, r14, r12 \n\t" /* a5*a4 */ \
+ \
+ /* Load carry from r8 */ \
+ "lsrs r8, #1 \n\t" /* shift the saved doubling carry back into C */ \
+ "adcs r1, r1, r1 \n\t" /* double the high cross-product columns */ \
+ "adcs r4, r4, r4 \n\t" \
+ "adcs r2, r2, r2 \n\t" \
+ "adcs r5, r5, r5 \n\t" \
+ "adcs r7, r7, r7 \n\t" \
+ "adcs r6, r6, r6 \n\t" \
+ "adc r8, r8, #0 \n\t" \
+ \
+ /* Use carry from r9 */ \
+ "umaal r3, r9, r10, r10 \n\t" /* a2^2 plus the pending carry */ \
+ "adds r9, r9, r1 \n\t" \
+ "stmia r0!, {r3,r9} \n\t" /* result words 4-5 */ \
+ \
+ "umull r9, r10, r11, r11 \n\t" /* a3^2 */ \
+ "adcs r9, r9, r4 \n\t" \
+ "adcs r10, r10, r2 \n\t" \
+ "stmia r0!, {r9,r10} \n\t" /* result words 6-7 */ \
+ \
+ "umull r9, r10, r12, r12 \n\t" /* a4^2 */ \
+ "adcs r9, r9, r5 \n\t" \
+ "adcs r10, r10, r7 \n\t" \
+ "stmia r0!, {r9,r10} \n\t" /* result words 8-9 */ \
+ \
+ "umull r9, r10, r14, r14 \n\t" /* a5^2 */ \
+ "adcs r9, r9, r6 \n\t" \
+ "adcs r10, r10, r8 \n\t" \
+ "stmia r0!, {r9,r10} \n\t" /* result words 10-11 */ \
+ "pop {r1, r2} \n\t" /* restore input pointer and word count */
+
+#define FAST_SQUARE_ASM_6_TO_7 /* extend a finished 6-word square to 7 words; r2 = curve word count */ \
+ "cmp r2, #6 \n\t" /* nothing to do for a 6-word curve */ \
+ "beq 1f \n\t" \
+ \
+ "sub r0, #24 \n\t" /* rewind both pointers by 6 words */ \
+ "sub r1, #24 \n\t" \
+ \
+ /* Do off-center multiplication */ \
+ "ldmia r1!, {r5,r6,r7,r8,r9,r10,r14} \n\t" /* a0..a5 plus the new high word a6 (r14) */ \
+ "umull r3, r4, r5, r14 \n\t" /* a0*a6 */ \
+ "mov r5, #0 \n\t" \
+ "umaal r4, r5, r6, r14 \n\t" /* a1*a6 */ \
+ "mov r6, #0 \n\t" \
+ "umaal r5, r6, r7, r14 \n\t" /* a2*a6 */ \
+ "mov r7, #0 \n\t" \
+ "umaal r6, r7, r8, r14 \n\t" /* a3*a6 */ \
+ "mov r8, #0 \n\t" \
+ "umaal r7, r8, r9, r14 \n\t" /* a4*a6 */ \
+ "mov r9, #0 \n\t" \
+ "umaal r8, r9, r10, r14 \n\t" /* a5*a6 */ \
+ \
+ /* Multiply by 2 */ \
+ "mov r10, #0 \n\t" \
+ "adds r3, r3, r3 \n\t" /* off-center products appear twice in a square */ \
+ "adcs r4, r4, r4 \n\t" \
+ "adcs r5, r5, r5 \n\t" \
+ "adcs r6, r6, r6 \n\t" \
+ "adcs r7, r7, r7 \n\t" \
+ "adcs r8, r8, r8 \n\t" \
+ "adcs r9, r9, r9 \n\t" \
+ "adcs r10, r10, #0 \n\t" /* capture the doubling carry */ \
+ \
+ /* Add into previous */ \
+ "ldr r12, [r0], #4 \n\t" /* accumulate into the existing upper result words */ \
+ "adds r3, r3, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r4, r4, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r5, r5, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r6, r6, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r7, r7, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r8, r8, r12 \n\t" \
+ "adcs r9, r9, #0 \n\t" \
+ "adcs r10, r10, #0 \n\t" \
+ "sub r0, #24 \n\t" \
+ \
+ /* Perform center multiplication */ \
+ "umlal r9, r10, r14, r14 \n\t" /* add a6^2 into the top two words */ \
+ "stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10} \n\t" /* write result words 6-13 */
+
+#define FAST_SQUARE_ASM_7 /* square the 7-word (224-bit) integer at [r1] into 14 words at [r0] */ \
+ "ldmia r1!, {r9,r10,r11,r12} \n\t" /* load low input words a0..a3 */ \
+ "push {r2} \n\t" /* save word count; r2 becomes scratch */ \
+ \
+ "umull r14, r2, r10, r9 \n\t" /* a1*a0 */ \
+ "mov r3, #0 \n\t" \
+ "umaal r2, r3, r11, r9 \n\t" /* a2*a0 (UMAAL: rHi:rLo = rn*rm + rLo + rHi) */ \
+ "mov r4, #0 \n\t" \
+ "umaal r3, r4, r12, r9 \n\t" /* a3*a0 */ \
+ \
+ "mov r5, #0 \n\t" \
+ "umaal r5, r3, r11, r10 \n\t" /* a2*a1 */ \
+ "adds r14, r14, r14 \n\t" /* double the low cross-product columns */ \
+ "adcs r2, r2, r2 \n\t" \
+ "adcs r5, r5, r5 \n\t" \
+ /* Store carry in r7 */ \
+ "mov r7, #0 \n\t" \
+ "adc r7, r7, #0 \n\t" \
+ \
+ "umull r6, r8, r9, r9 \n\t" /* a0^2; umull leaves the carry flag intact */ \
+ "adds r8, r8, r14 \n\t" /* add doubled columns onto the diagonal squares */ \
+ "stmia r0!, {r6,r8} \n\t" /* result words 0-1 */ \
+ \
+ "umull r6, r8, r10, r10 \n\t" /* a1^2 */ \
+ "adcs r6, r6, r2 \n\t" \
+ "adcs r8, r8, r5 \n\t" \
+ "stmia r0!, {r6,r8} \n\t" /* result words 2-3 */ \
+ /* Store carry in r8 */ \
+ "mov r8, #0 \n\t" \
+ "adc r8, r8, #0 \n\t" \
+ \
+ "ldmia r1!, {r2, r6, r14} \n\t" /* load high input words a4 (r2), a5 (r6), a6 (r14) */ \
+ "push {r1} \n\t" /* save advanced input pointer */ \
+ "umaal r3, r4, r2, r9 \n\t" /* a4*a0 */ \
+ "mov r5, #0 \n\t" \
+ "umaal r4, r5, r6, r9 \n\t" /* a5*a0 */ \
+ "mov r1, #0 \n\t" \
+ "umaal r5, r1, r14, r9 \n\t" /* a6*a0 */ \
+ \
+ "mov r9, #0 \n\t" /* a0 no longer needed; reuse r9 */ \
+ "umaal r3, r9, r12, r10 \n\t" /* a3*a1 */ \
+ "umaal r9, r4, r2, r10 \n\t" /* a4*a1 */ \
+ "umaal r4, r5, r6, r10 \n\t" /* a5*a1 */ \
+ "umaal r5, r1, r14, r10 \n\t" /* a6*a1 */ \
+ \
+ "mov r10, #0 \n\t" /* a1 no longer needed; reuse r10 */ \
+ "umaal r10, r9, r12, r11 \n\t" /* a3*a2 */ \
+ "umaal r9, r4, r2, r11 \n\t" /* a4*a2 */ \
+ "umaal r4, r5, r6, r11 \n\t" /* a5*a2 */ \
+ "umaal r5, r1, r14, r11 \n\t" /* a6*a2 */ \
+ \
+ /* Load carry from r7 */ \
+ "lsrs r7, #1 \n\t" /* shift the saved doubling carry back into C */ \
+ "adcs r3, r3, r3 \n\t" /* double the middle cross-product columns */ \
+ "adcs r10, r10, r10 \n\t" \
+ "adcs r9, r9, r9 \n\t" \
+ /* Store carry back in r7 */ \
+ "adc r7, r7, #0 \n\t" \
+ \
+ /* Use carry from r8 */ \
+ "umaal r3, r8, r11, r11 \n\t" /* a2^2 plus the pending carry */ \
+ "adds r8, r8, r10 \n\t" \
+ "stmia r0!, {r3,r8} \n\t" /* result words 4-5 */ \
+ /* Store carry back in r8 */ \
+ "mov r8, #0 \n\t" \
+ "adc r8, r8, #0 \n\t" \
+ \
+ "mov r3, #0 \n\t" \
+ "umaal r3, r4, r2, r12 \n\t" /* a4*a3 */ \
+ "umaal r4, r5, r6, r12 \n\t" /* a5*a3 */ \
+ "umaal r5, r1, r14, r12 \n\t" /* a6*a3 */ \
+ \
+ "mov r10, #0 \n\t" \
+ "umaal r10, r5, r6, r2 \n\t" /* a5*a4 */ \
+ "umaal r5, r1, r14, r2 \n\t" /* a6*a4 */ \
+ \
+ "mov r11, #0 \n\t" \
+ "umaal r11, r1, r14, r6 \n\t" /* a6*a5 */ \
+ \
+ /* Load carry from r7 */ \
+ "lsrs r7, #1 \n\t" /* shift the saved doubling carry back into C */ \
+ "adcs r3, r3, r3 \n\t" /* double the high cross-product columns */ \
+ "adcs r4, r4, r4 \n\t" \
+ "adcs r10, r10, r10 \n\t" \
+ "adcs r5, r5, r5 \n\t" \
+ "adcs r11, r11, r11 \n\t" \
+ "adcs r1, r1, r1 \n\t" \
+ "adc r7, r7, #0 \n\t" \
+ \
+ /* Use carry from r8 */ \
+ "umaal r8, r9, r12, r12 \n\t" /* a3^2 plus the pending carry */ \
+ "adds r9, r9, r3 \n\t" \
+ "stmia r0!, {r8,r9} \n\t" /* result words 6-7 */ \
+ \
+ "umull r8, r9, r2, r2 \n\t" /* a4^2 */ \
+ "adcs r8, r8, r4 \n\t" \
+ "adcs r9, r9, r10 \n\t" \
+ "stmia r0!, {r8,r9} \n\t" /* result words 8-9 */ \
+ \
+ "umull r8, r9, r6, r6 \n\t" /* a5^2 */ \
+ "adcs r8, r8, r5 \n\t" \
+ "adcs r9, r9, r11 \n\t" \
+ "stmia r0!, {r8,r9} \n\t" /* result words 10-11 */ \
+ \
+ "umull r8, r9, r14, r14 \n\t" /* a6^2 */ \
+ "adcs r8, r8, r1 \n\t" \
+ "adcs r9, r9, r7 \n\t" \
+ "stmia r0!, {r8,r9} \n\t" /* result words 12-13 */ \
+ "pop {r1, r2} \n\t" /* restore input pointer and word count */
+
+#define FAST_SQUARE_ASM_7_TO_8 /* extend a finished 7-word square to 8 words; r2 = curve word count */ \
+ "cmp r2, #7 \n\t" /* nothing to do for a 7-word curve */ \
+ "beq 1f \n\t" \
+ \
+ "sub r0, #28 \n\t" /* rewind both pointers by 7 words */ \
+ "sub r1, #28 \n\t" \
+ \
+ /* Do off-center multiplication */ \
+ "ldmia r1!, {r5,r6,r7,r8,r9,r10,r11,r14} \n\t" /* a0..a6 plus the new high word a7 (r14) */ \
+ "umull r3, r4, r5, r14 \n\t" /* a0*a7 */ \
+ "mov r5, #0 \n\t" \
+ "umaal r4, r5, r6, r14 \n\t" /* a1*a7 */ \
+ "mov r6, #0 \n\t" \
+ "umaal r5, r6, r7, r14 \n\t" /* a2*a7 */ \
+ "mov r7, #0 \n\t" \
+ "umaal r6, r7, r8, r14 \n\t" /* a3*a7 */ \
+ "mov r8, #0 \n\t" \
+ "umaal r7, r8, r9, r14 \n\t" /* a4*a7 */ \
+ "mov r9, #0 \n\t" \
+ "umaal r8, r9, r10, r14 \n\t" /* a5*a7 */ \
+ "mov r10, #0 \n\t" \
+ "umaal r9, r10, r11, r14 \n\t" /* a6*a7 */ \
+ \
+ /* Multiply by 2 */ \
+ "mov r11, #0 \n\t" \
+ "adds r3, r3, r3 \n\t" /* off-center products appear twice in a square */ \
+ "adcs r4, r4, r4 \n\t" \
+ "adcs r5, r5, r5 \n\t" \
+ "adcs r6, r6, r6 \n\t" \
+ "adcs r7, r7, r7 \n\t" \
+ "adcs r8, r8, r8 \n\t" \
+ "adcs r9, r9, r9 \n\t" \
+ "adcs r10, r10, r10 \n\t" \
+ "adcs r11, r11, #0 \n\t" /* capture the doubling carry */ \
+ \
+ /* Add into previous */ \
+ "ldr r12, [r0], #4 \n\t" /* accumulate into the existing upper result words */ \
+ "adds r3, r3, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r4, r4, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r5, r5, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r6, r6, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r7, r7, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r8, r8, r12 \n\t" \
+ "ldr r12, [r0], #4 \n\t" \
+ "adcs r9, r9, r12 \n\t" \
+ "adcs r10, r10, #0 \n\t" \
+ "adcs r11, r11, #0 \n\t" \
+ "sub r0, #28 \n\t" \
+ \
+ /* Perform center multiplication */ \
+ "umlal r10, r11, r14, r14 \n\t" /* add a7^2 into the top two words */ \
+ "stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10,r11} \n\t" /* write result words 7-15 */
+
+#define FAST_SQUARE_ASM_8 /* square the 8-word (256-bit) integer at [r1] into 16 words at [r0] */ \
+ "ldmia r1!, {r10,r11,r12,r14} \n\t" /* load low input words a0..a3 */ \
+ "push {r2} \n\t" /* save word count; r2 becomes scratch */ \
+ \
+ "umull r2, r3, r11, r10 \n\t" /* a1*a0 */ \
+ "mov r4, #0 \n\t" \
+ "umaal r3, r4, r12, r10 \n\t" /* a2*a0 (UMAAL: rHi:rLo = rn*rm + rLo + rHi) */ \
+ "mov r5, #0 \n\t" \
+ "umaal r4, r5, r14, r10 \n\t" /* a3*a0 */ \
+ \
+ "mov r6, #0 \n\t" \
+ "umaal r6, r4, r12, r11 \n\t" /* a2*a1 */ \
+ "adds r2, r2, r2 \n\t" /* double the low cross-product columns */ \
+ "adcs r3, r3, r3 \n\t" \
+ "adcs r6, r6, r6 \n\t" \
+ /* Store carry in r7 */ \
+ "mov r7, #0 \n\t" \
+ "adc r7, r7, #0 \n\t" \
+ \
+ "umull r8, r9, r10, r10 \n\t" /* a0^2; umull leaves the carry flag intact */ \
+ "adds r9, r9, r2 \n\t" /* add doubled columns onto the diagonal squares */ \
+ "stmia r0!, {r8,r9} \n\t" /* result words 0-1 */ \
+ \
+ "umull r8, r9, r11, r11 \n\t" /* a1^2 */ \
+ "adcs r8, r8, r3 \n\t" \
+ "adcs r9, r9, r6 \n\t" \
+ "stmia r0!, {r8,r9} \n\t" /* result words 2-3 */ \
+ /* Store carry in r8 */ \
+ "mov r8, #0 \n\t" \
+ "adc r8, r8, #0 \n\t" \
+ \
+ "ldmia r1!, {r2, r3} \n\t" /* load a4 (r2), a5 (r3) */ \
+ "push {r1} \n\t" /* save advanced input pointer */ \
+ "umaal r4, r5, r2, r10 \n\t" /* a4*a0 */ \
+ "mov r6, #0 \n\t" \
+ "umaal r5, r6, r3, r10 \n\t" /* a5*a0 */ \
+ \
+ "mov r9, #0 \n\t" \
+ "umaal r9, r4, r14, r11 \n\t" /* a3*a1 */ \
+ "umaal r4, r5, r2, r11 \n\t" /* a4*a1 */ \
+ \
+ "mov r1, #0 \n\t" \
+ "umaal r1, r4, r14, r12 \n\t" /* a3*a2 */ \
+ \
+ /* Load carry from r7 */ \
+ "lsrs r7, #1 \n\t" /* shift the saved doubling carry back into C */ \
+ "adcs r9, r9, r9 \n\t" /* double the next cross-product columns */ \
+ "adcs r1, r1, r1 \n\t" \
+ /* Store carry back in r7 */ \
+ "adc r7, r7, #0 \n\t" \
+ \
+ /* Use carry from r8 */ \
+ "umaal r8, r9, r12, r12 \n\t" /* a2^2 plus the pending carry */ \
+ "adds r9, r9, r1 \n\t" \
+ "stmia r0!, {r8,r9} \n\t" /* result words 4-5 */ \
+ /* Store carry back in r8 */ \
+ "mov r8, #0 \n\t" \
+ "adc r8, r8, #0 \n\t" \
+ \
+ "pop {r1} \n\t" /* reload input pointer to fetch the last two words */ \
+ /* TODO could fix up r1 value on stack here */ \
+ /* and leave the value on the stack (rather */ \
+ /* than popping) if supporting curves > 256 bits */ \
+ "ldr r9, [r1], #4 \n\t" /* a6 */ \
+ "ldr r1, [r1] \n\t" /* a7 (input pointer no longer needed) */ \
+ \
+ "push {r7} \n\t" /* spill the doubling carry */ \
+ "umaal r5, r6, r9, r10 \n\t" /* a6*a0 */ \
+ "mov r7, #0 \n\t" \
+ "umaal r6, r7, r1, r10 \n\t" /* a7*a0 */ \
+ /* Carry now stored in r10 */ \
+ "pop {r10} \n\t" \
+ \
+ "umaal r4, r5, r3, r11 \n\t" /* a5*a1 */ \
+ "umaal r5, r6, r9, r11 \n\t" /* a6*a1 */ \
+ "umaal r6, r7, r1, r11 \n\t" /* a7*a1 */ \
+ \
+ "mov r11, #0 \n\t" /* a1 no longer needed; reuse r11 */ \
+ "umaal r11, r4, r2, r12 \n\t" /* a4*a2 */ \
+ "umaal r4, r5, r3, r12 \n\t" /* a5*a2 */ \
+ "umaal r5, r6, r9, r12 \n\t" /* a6*a2 */ \
+ "umaal r6, r7, r1, r12 \n\t" /* a7*a2 */ \
+ \
+ "mov r12, #0 \n\t" /* a2 no longer needed; reuse r12 */ \
+ "umaal r12, r4, r2, r14 \n\t" /* a4*a3 */ \
+ "umaal r4, r5, r3, r14 \n\t" /* a5*a3 */ \
+ "umaal r5, r6, r9, r14 \n\t" /* a6*a3 */ \
+ "umaal r6, r7, r1, r14 \n\t" /* a7*a3 */ \
+ \
+ /* Load carry from r10 */ \
+ "lsrs r10, #1 \n\t" /* shift the saved doubling carry back into C */ \
+ "adcs r11, r11, r11 \n\t" /* double the middle cross-product columns */ \
+ "adcs r12, r12, r12 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ \
+ /* Use carry from r8 */ \
+ "umaal r8, r11, r14, r14 \n\t" /* a3^2 plus the pending carry */ \
+ "adds r11, r11, r12 \n\t" \
+ "stmia r0!, {r8,r11} \n\t" /* result words 6-7 */ \
+ /* Store carry back in r8 */ \
+ "mov r8, #0 \n\t" \
+ "adc r8, r8, #0 \n\t" \
+ \
+ "mov r11, #0 \n\t" \
+ "umaal r11, r5, r3, r2 \n\t" /* a5*a4 */ \
+ "umaal r5, r6, r9, r2 \n\t" /* a6*a4 */ \
+ "umaal r6, r7, r1, r2 \n\t" /* a7*a4 */ \
+ \
+ "mov r12, #0 \n\t" \
+ "umaal r12, r6, r9, r3 \n\t" /* a6*a5 */ \
+ "umaal r6, r7, r1, r3 \n\t" /* a7*a5 */ \
+ \
+ "mov r14, #0 \n\t" \
+ "umaal r14, r7, r1, r9 \n\t" /* a7*a6 */ \
+ \
+ /* Load carry from r10 */ \
+ "lsrs r10, #1 \n\t" /* shift the saved doubling carry back into C */ \
+ "adcs r4, r4, r4 \n\t" /* double the high cross-product columns */ \
+ "adcs r11, r11, r11 \n\t" \
+ "adcs r5, r5, r5 \n\t" \
+ "adcs r12, r12, r12 \n\t" \
+ "adcs r6, r6, r6 \n\t" \
+ "adcs r14, r14, r14 \n\t" \
+ "adcs r7, r7, r7 \n\t" \
+ "adc r10, r10, #0 \n\t" \
+ \
+ /* Use carry from r8 */ \
+ "umaal r4, r8, r2, r2 \n\t" /* a4^2 plus the pending carry */ \
+ "adds r8, r8, r11 \n\t" \
+ "stmia r0!, {r4,r8} \n\t" /* result words 8-9 */ \
+ \
+ "umull r4, r8, r3, r3 \n\t" /* a5^2 */ \
+ "adcs r4, r4, r5 \n\t" \
+ "adcs r8, r8, r12 \n\t" \
+ "stmia r0!, {r4,r8} \n\t" /* result words 10-11 */ \
+ \
+ "umull r4, r8, r9, r9 \n\t" /* a6^2 */ \
+ "adcs r4, r4, r6 \n\t" \
+ "adcs r8, r8, r14 \n\t" \
+ "stmia r0!, {r4,r8} \n\t" /* result words 12-13 */ \
+ \
+ "umull r4, r8, r1, r1 \n\t" /* a7^2 */ \
+ "adcs r4, r4, r7 \n\t" \
+ "adcs r8, r8, r10 \n\t" \
+ "stmia r0!, {r4,r8} \n\t" /* result words 14-15 */ \
+ /* TODO pop {r1, r2} if supporting curves > 256 bits */ \
+ "pop {r2} \n\t" /* restore word count (r1 was consumed as scratch) */
+
+#endif /* _UECC_ASM_ARM_MULT_SQUARE_H_ */
diff --git a/types.h b/types.h
index 7cb1a28..9ee8143 100644
--- a/types.h
+++ b/types.h
@@ -23,6 +23,16 @@
#endif
#endif
+#ifndef uECC_ARM_USE_UMAAL /* users may pre-define this to force UMAAL on or off */
+ #if (uECC_PLATFORM == uECC_arm) && (__ARM_ARCH >= 6) /* UMAAL exists in ARM state from ARMv6 on */
+ #define uECC_ARM_USE_UMAAL 1
+ #elif (uECC_PLATFORM == uECC_arm_thumb2) && (__ARM_ARCH >= 6) && !__ARM_ARCH_7M__ /* baseline ARMv7-M (e.g. Cortex-M3) lacks UMAAL; v7E-M parts define __ARM_ARCH_7EM__ instead */
+ #define uECC_ARM_USE_UMAAL 1
+ #else
+ #define uECC_ARM_USE_UMAAL 0 /* plain Thumb, old architectures, or unknown: play it safe */
+ #endif
+#endif
+
#ifndef uECC_WORD_SIZE
#if uECC_PLATFORM == uECC_avr
#define uECC_WORD_SIZE 1
diff --git a/uECC.h b/uECC.h
index 1193ce8..9911763 100644
--- a/uECC.h
+++ b/uECC.h
@@ -23,7 +23,9 @@
/* Optimization level; trade speed for code size.
Larger values produce code that is faster but larger.
- Currently supported values are 0 - 3; 0 is unusably slow for most applications. */
+ Currently supported values are 0 - 4; 0 is unusably slow for most applications.
+ Optimization level 4 currently only has an effect on ARM platforms where more than one
+ curve is enabled. */
#ifndef uECC_OPTIMIZATION_LEVEL
#define uECC_OPTIMIZATION_LEVEL 2
#endif