Add faster ARM multiplication code using UMAAL (#69)

On ARM platforms that support UMAAL, this new code should speed up curve
operations by 15-20%. UMAAL support is detected automatically using
compiler macros; if the detection is wrong for a given platform,
#define uECC_ARM_USE_UMAAL to 1 (use UMAAL) or 0 (do not use it).
diff --git a/asm_arm.inc b/asm_arm.inc
index 7b4459f..cdcdec6 100644
--- a/asm_arm.inc
+++ b/asm_arm.inc
@@ -156,342 +156,16 @@
 
 #if (uECC_OPTIMIZATION_LEVEL >= 3)
 
-#include "asm_arm_mult_square.inc"
-
-#define FAST_MULT_ASM_5_TO_6                 \
-    "cmp r3, #5 \n\t"                        \
-    "beq 1f \n\t"                            \
-                                             \
-    /* r4 = left high, r5 = right high */    \
-    "ldr r4, [r1] \n\t"                      \
-    "ldr r5, [r2] \n\t"                      \
-                                             \
-    "sub r0, #20 \n\t"                       \
-    "sub r1, #20 \n\t"                       \
-    "sub r2, #20 \n\t"                       \
-                                             \
-    "ldr r6, [r0] \n\t"                      \
-    "ldr r7, [r1], #4 \n\t"                  \
-    "ldr r8, [r2], #4 \n\t"                  \
-    "mov r14, #0 \n\t"                       \
-    "umull r9, r10, r4, r8 \n\t"             \
-    "umull r11, r12, r5, r7 \n\t"            \
-    "adds r9, r9, r6 \n\t"                   \
-    "adc r10, r10, #0 \n\t"                  \
-    "adds r9, r9, r11 \n\t"                  \
-    "adcs r10, r10, r12 \n\t"                \
-    "adc r14, r14, #0 \n\t"                  \
-    "str r9, [r0], #4 \n\t"                  \
-                                             \
-    "ldr r6, [r0] \n\t"                      \
-    "adds r10, r10, r6 \n\t"                 \
-    "adcs r14, r14, #0 \n\t"                 \
-    "ldr r7, [r1], #4 \n\t"                  \
-    "ldr r8, [r2], #4 \n\t"                  \
-    "mov r9, #0 \n\t"                        \
-    "umull r11, r12, r4, r8 \n\t"            \
-    "adds r10, r10, r11 \n\t"                \
-    "adcs r14, r14, r12 \n\t"                \
-    "adc r9, r9, #0 \n\t"                    \
-    "umull r11, r12, r5, r7 \n\t"            \
-    "adds r10, r10, r11 \n\t"                \
-    "adcs r14, r14, r12 \n\t"                \
-    "adc r9, r9, #0 \n\t"                    \
-    "str r10, [r0], #4 \n\t"                 \
-                                             \
-    "ldr r6, [r0] \n\t"                      \
-    "adds r14, r14, r6 \n\t"                 \
-    "adcs r9, r9, #0 \n\t"                   \
-    "ldr r7, [r1], #4 \n\t"                  \
-    "ldr r8, [r2], #4 \n\t"                  \
-    "mov r10, #0 \n\t"                       \
-    "umull r11, r12, r4, r8 \n\t"            \
-    "adds r14, r14, r11 \n\t"                \
-    "adcs r9, r9, r12 \n\t"                  \
-    "adc r10, r10, #0 \n\t"                  \
-    "umull r11, r12, r5, r7 \n\t"            \
-    "adds r14, r14, r11 \n\t"                \
-    "adcs r9, r9, r12 \n\t"                  \
-    "adc r10, r10, #0 \n\t"                  \
-    "str r14, [r0], #4 \n\t"                 \
-                                             \
-    "ldr r6, [r0] \n\t"                      \
-    "adds r9, r9, r6 \n\t"                   \
-    "adcs r10, r10, #0 \n\t"                 \
-    "ldr r7, [r1], #4 \n\t"                  \
-    "ldr r8, [r2], #4 \n\t"                  \
-    "mov r14, #0 \n\t"                       \
-    "umull r11, r12, r4, r8 \n\t"            \
-    "adds r9, r9, r11 \n\t"                  \
-    "adcs r10, r10, r12 \n\t"                \
-    "adc r14, r14, #0 \n\t"                  \
-    "umull r11, r12, r5, r7 \n\t"            \
-    "adds r9, r9, r11 \n\t"                  \
-    "adcs r10, r10, r12 \n\t"                \
-    "adc r14, r14, #0 \n\t"                  \
-    "str r9, [r0], #4 \n\t"                  \
-                                             \
-    "ldr r6, [r0] \n\t"                      \
-    "adds r10, r10, r6 \n\t"                 \
-    "adcs r14, r14, #0 \n\t"                 \
-    /* skip past already-loaded (r4, r5) */  \
-    "ldr r7, [r1], #8 \n\t"                  \
-    "ldr r8, [r2], #8 \n\t"                  \
-    "mov r9, #0 \n\t"                        \
-    "umull r11, r12, r4, r8 \n\t"            \
-    "adds r10, r10, r11 \n\t"                \
-    "adcs r14, r14, r12 \n\t"                \
-    "adc r9, r9, #0 \n\t"                    \
-    "umull r11, r12, r5, r7 \n\t"            \
-    "adds r10, r10, r11 \n\t"                \
-    "adcs r14, r14, r12 \n\t"                \
-    "adc r9, r9, #0 \n\t"                    \
-    "str r10, [r0], #4 \n\t"                 \
-                                             \
-    "umull r11, r12, r4, r5 \n\t"            \
-    "adds r11, r11, r14 \n\t"                \
-    "adc r12, r12, r9 \n\t"                  \
-    "stmia r0!, {r11, r12} \n\t"
-
-#define FAST_MULT_ASM_6_TO_7                    \
-    "cmp r3, #6 \n\t"                           \
-    "beq 1f \n\t"                               \
-                                                \
-    /* r4 = left high, r5 = right high */       \
-    "ldr r4, [r1] \n\t"                         \
-    "ldr r5, [r2] \n\t"                         \
-                                                \
-    "sub r0, #24 \n\t"                          \
-    "sub r1, #24 \n\t"                          \
-    "sub r2, #24 \n\t"                          \
-                                                \
-    "ldr r6, [r0] \n\t"                         \
-    "ldr r7, [r1], #4 \n\t"                     \
-    "ldr r8, [r2], #4 \n\t"                     \
-    "mov r14, #0 \n\t"                          \
-    "umull r9, r10, r4, r8 \n\t"                \
-    "umull r11, r12, r5, r7 \n\t"               \
-    "adds r9, r9, r6 \n\t"                      \
-    "adc r10, r10, #0 \n\t"                     \
-    "adds r9, r9, r11 \n\t"                     \
-    "adcs r10, r10, r12 \n\t"                   \
-    "adc r14, r14, #0 \n\t"                     \
-    "str r9, [r0], #4 \n\t"                     \
-                                                \
-    "ldr r6, [r0] \n\t"                         \
-    "adds r10, r10, r6 \n\t"                    \
-    "adcs r14, r14, #0 \n\t"                    \
-    "ldr r7, [r1], #4 \n\t"                     \
-    "ldr r8, [r2], #4 \n\t"                     \
-    "mov r9, #0 \n\t"                           \
-    "umull r11, r12, r4, r8 \n\t"               \
-    "adds r10, r10, r11 \n\t"                   \
-    "adcs r14, r14, r12 \n\t"                   \
-    "adc r9, r9, #0 \n\t"                       \
-    "umull r11, r12, r5, r7 \n\t"               \
-    "adds r10, r10, r11 \n\t"                   \
-    "adcs r14, r14, r12 \n\t"                   \
-    "adc r9, r9, #0 \n\t"                       \
-    "str r10, [r0], #4 \n\t"                    \
-                                                \
-    "ldr r6, [r0] \n\t"                         \
-    "adds r14, r14, r6 \n\t"                    \
-    "adcs r9, r9, #0 \n\t"                      \
-    "ldr r7, [r1], #4 \n\t"                     \
-    "ldr r8, [r2], #4 \n\t"                     \
-    "mov r10, #0 \n\t"                          \
-    "umull r11, r12, r4, r8 \n\t"               \
-    "adds r14, r14, r11 \n\t"                   \
-    "adcs r9, r9, r12 \n\t"                     \
-    "adc r10, r10, #0 \n\t"                     \
-    "umull r11, r12, r5, r7 \n\t"               \
-    "adds r14, r14, r11 \n\t"                   \
-    "adcs r9, r9, r12 \n\t"                     \
-    "adc r10, r10, #0 \n\t"                     \
-    "str r14, [r0], #4 \n\t"                    \
-                                                \
-    "ldr r6, [r0] \n\t"                         \
-    "adds r9, r9, r6 \n\t"                      \
-    "adcs r10, r10, #0 \n\t"                    \
-    "ldr r7, [r1], #4 \n\t"                     \
-    "ldr r8, [r2], #4 \n\t"                     \
-    "mov r14, #0 \n\t"                          \
-    "umull r11, r12, r4, r8 \n\t"               \
-    "adds r9, r9, r11 \n\t"                     \
-    "adcs r10, r10, r12 \n\t"                   \
-    "adc r14, r14, #0 \n\t"                     \
-    "umull r11, r12, r5, r7 \n\t"               \
-    "adds r9, r9, r11 \n\t"                     \
-    "adcs r10, r10, r12 \n\t"                   \
-    "adc r14, r14, #0 \n\t"                     \
-    "str r9, [r0], #4 \n\t"                     \
-                                                \
-    "ldr r6, [r0] \n\t"                         \
-    "adds r10, r10, r6 \n\t"                    \
-    "adcs r14, r14, #0 \n\t"                    \
-    "ldr r7, [r1], #4 \n\t"                     \
-    "ldr r8, [r2], #4 \n\t"                     \
-    "mov r9, #0 \n\t"                           \
-    "umull r11, r12, r4, r8 \n\t"               \
-    "adds r10, r10, r11 \n\t"                   \
-    "adcs r14, r14, r12 \n\t"                   \
-    "adc r9, r9, #0 \n\t"                       \
-    "umull r11, r12, r5, r7 \n\t"               \
-    "adds r10, r10, r11 \n\t"                   \
-    "adcs r14, r14, r12 \n\t"                   \
-    "adc r9, r9, #0 \n\t"                       \
-    "str r10, [r0], #4 \n\t"                    \
-                                                \
-    "ldr r6, [r0] \n\t"                         \
-    "adds r14, r14, r6 \n\t"                    \
-    "adcs r9, r9, #0 \n\t"                      \
-    /* skip past already-loaded (r4, r5) */     \
-    "ldr r7, [r1], #8 \n\t"                     \
-    "ldr r8, [r2], #8 \n\t"                     \
-    "mov r10, #0 \n\t"                          \
-    "umull r11, r12, r4, r8 \n\t"               \
-    "adds r14, r14, r11 \n\t"                   \
-    "adcs r9, r9, r12 \n\t"                     \
-    "adc r10, r10, #0 \n\t"                     \
-    "umull r11, r12, r5, r7 \n\t"               \
-    "adds r14, r14, r11 \n\t"                   \
-    "adcs r9, r9, r12 \n\t"                     \
-    "adc r10, r10, #0 \n\t"                     \
-    "str r14, [r0], #4 \n\t"                    \
-                                                \
-    "umull r11, r12, r4, r5 \n\t"               \
-    "adds r11, r11, r9 \n\t"                    \
-    "adc r12, r12, r10 \n\t"                    \
-    "stmia r0!, {r11, r12} \n\t"
-
-#define FAST_MULT_ASM_7_TO_8                 \
-    "cmp r3, #7 \n\t"                        \
-    "beq 1f \n\t"                            \
-                                             \
-    /* r4 = left high, r5 = right high */    \
-    "ldr r4, [r1] \n\t"                      \
-    "ldr r5, [r2] \n\t"                      \
-                                             \
-    "sub r0, #28 \n\t"                       \
-    "sub r1, #28 \n\t"                       \
-    "sub r2, #28 \n\t"                       \
-                                             \
-    "ldr r6, [r0] \n\t"                      \
-    "ldr r7, [r1], #4 \n\t"                  \
-    "ldr r8, [r2], #4 \n\t"                  \
-    "mov r14, #0 \n\t"                       \
-    "umull r9, r10, r4, r8 \n\t"             \
-    "umull r11, r12, r5, r7 \n\t"            \
-    "adds r9, r9, r6 \n\t"                   \
-    "adc r10, r10, #0 \n\t"                  \
-    "adds r9, r9, r11 \n\t"                  \
-    "adcs r10, r10, r12 \n\t"                \
-    "adc r14, r14, #0 \n\t"                  \
-    "str r9, [r0], #4 \n\t"                  \
-                                             \
-    "ldr r6, [r0] \n\t"                      \
-    "adds r10, r10, r6 \n\t"                 \
-    "adcs r14, r14, #0 \n\t"                 \
-    "ldr r7, [r1], #4 \n\t"                  \
-    "ldr r8, [r2], #4 \n\t"                  \
-    "mov r9, #0 \n\t"                        \
-    "umull r11, r12, r4, r8 \n\t"            \
-    "adds r10, r10, r11 \n\t"                \
-    "adcs r14, r14, r12 \n\t"                \
-    "adc r9, r9, #0 \n\t"                    \
-    "umull r11, r12, r5, r7 \n\t"            \
-    "adds r10, r10, r11 \n\t"                \
-    "adcs r14, r14, r12 \n\t"                \
-    "adc r9, r9, #0 \n\t"                    \
-    "str r10, [r0], #4 \n\t"                 \
-                                             \
-    "ldr r6, [r0] \n\t"                      \
-    "adds r14, r14, r6 \n\t"                 \
-    "adcs r9, r9, #0 \n\t"                   \
-    "ldr r7, [r1], #4 \n\t"                  \
-    "ldr r8, [r2], #4 \n\t"                  \
-    "mov r10, #0 \n\t"                       \
-    "umull r11, r12, r4, r8 \n\t"            \
-    "adds r14, r14, r11 \n\t"                \
-    "adcs r9, r9, r12 \n\t"                  \
-    "adc r10, r10, #0 \n\t"                  \
-    "umull r11, r12, r5, r7 \n\t"            \
-    "adds r14, r14, r11 \n\t"                \
-    "adcs r9, r9, r12 \n\t"                  \
-    "adc r10, r10, #0 \n\t"                  \
-    "str r14, [r0], #4 \n\t"                 \
-                                             \
-    "ldr r6, [r0] \n\t"                      \
-    "adds r9, r9, r6 \n\t"                   \
-    "adcs r10, r10, #0 \n\t"                 \
-    "ldr r7, [r1], #4 \n\t"                  \
-    "ldr r8, [r2], #4 \n\t"                  \
-    "mov r14, #0 \n\t"                       \
-    "umull r11, r12, r4, r8 \n\t"            \
-    "adds r9, r9, r11 \n\t"                  \
-    "adcs r10, r10, r12 \n\t"                \
-    "adc r14, r14, #0 \n\t"                  \
-    "umull r11, r12, r5, r7 \n\t"            \
-    "adds r9, r9, r11 \n\t"                  \
-    "adcs r10, r10, r12 \n\t"                \
-    "adc r14, r14, #0 \n\t"                  \
-    "str r9, [r0], #4 \n\t"                  \
-                                             \
-    "ldr r6, [r0] \n\t"                      \
-    "adds r10, r10, r6 \n\t"                 \
-    "adcs r14, r14, #0 \n\t"                 \
-    "ldr r7, [r1], #4 \n\t"                  \
-    "ldr r8, [r2], #4 \n\t"                  \
-    "mov r9, #0 \n\t"                        \
-    "umull r11, r12, r4, r8 \n\t"            \
-    "adds r10, r10, r11 \n\t"                \
-    "adcs r14, r14, r12 \n\t"                \
-    "adc r9, r9, #0 \n\t"                    \
-    "umull r11, r12, r5, r7 \n\t"            \
-    "adds r10, r10, r11 \n\t"                \
-    "adcs r14, r14, r12 \n\t"                \
-    "adc r9, r9, #0 \n\t"                    \
-    "str r10, [r0], #4 \n\t"                 \
-                                             \
-    "ldr r6, [r0] \n\t"                      \
-    "adds r14, r14, r6 \n\t"                 \
-    "adcs r9, r9, #0 \n\t"                   \
-    "ldr r7, [r1], #4 \n\t"                  \
-    "ldr r8, [r2], #4 \n\t"                  \
-    "mov r10, #0 \n\t"                       \
-    "umull r11, r12, r4, r8 \n\t"            \
-    "adds r14, r14, r11 \n\t"                \
-    "adcs r9, r9, r12 \n\t"                  \
-    "adc r10, r10, #0 \n\t"                  \
-    "umull r11, r12, r5, r7 \n\t"            \
-    "adds r14, r14, r11 \n\t"                \
-    "adcs r9, r9, r12 \n\t"                  \
-    "adc r10, r10, #0 \n\t"                  \
-    "str r14, [r0], #4 \n\t"                 \
-                                             \
-    "ldr r6, [r0] \n\t"                      \
-    "adds r9, r9, r6 \n\t"                   \
-    "adcs r10, r10, #0 \n\t"                 \
-    /* skip past already-loaded (r4, r5) */  \
-    "ldr r7, [r1], #8 \n\t"                  \
-    "ldr r8, [r2], #8 \n\t"                  \
-    "mov r14, #0 \n\t"                       \
-    "umull r11, r12, r4, r8 \n\t"            \
-    "adds r9, r9, r11 \n\t"                  \
-    "adcs r10, r10, r12 \n\t"                \
-    "adc r14, r14, #0 \n\t"                  \
-    "umull r11, r12, r5, r7 \n\t"            \
-    "adds r9, r9, r11 \n\t"                  \
-    "adcs r10, r10, r12 \n\t"                \
-    "adc r14, r14, #0 \n\t"                  \
-    "str r9, [r0], #4 \n\t"                  \
-                                             \
-    "umull r11, r12, r4, r5 \n\t"            \
-    "adds r11, r11, r10 \n\t"                \
-    "adc r12, r12, r14 \n\t"                 \
-    "stmia r0!, {r11, r12} \n\t"
-
 #if (uECC_PLATFORM != uECC_arm_thumb)
+
+#if uECC_ARM_USE_UMAAL
+    #include "asm_arm_mult_square_umaal.inc"
+#else
+    #include "asm_arm_mult_square.inc"
+#endif
+
+#if (uECC_OPTIMIZATION_LEVEL == 3)
+
 uECC_VLI_API void uECC_vli_mult(uint32_t *result,
                                 const uint32_t *left,
                                 const uint32_t *right,
@@ -503,11 +177,8 @@
     
     __asm__ volatile (
         ".syntax unified \n\t"
-        "push {r3} \n\t"
-    
 #if (uECC_MIN_WORDS == 5)
         FAST_MULT_ASM_5
-        "pop {r3} \n\t"
     #if (uECC_MAX_WORDS > 5)
         FAST_MULT_ASM_5_TO_6
     #endif
@@ -519,7 +190,6 @@
     #endif
 #elif (uECC_MIN_WORDS == 6)
         FAST_MULT_ASM_6
-        "pop {r3} \n\t"
     #if (uECC_MAX_WORDS > 6)
         FAST_MULT_ASM_6_TO_7
     #endif
@@ -528,15 +198,12 @@
     #endif
 #elif (uECC_MIN_WORDS == 7)
         FAST_MULT_ASM_7
-        "pop {r3} \n\t"
     #if (uECC_MAX_WORDS > 7)
         FAST_MULT_ASM_7_TO_8
     #endif
 #elif (uECC_MIN_WORDS == 8)
         FAST_MULT_ASM_8
-        "pop {r3} \n\t"
 #endif
-
         "1: \n\t"
         RESUME_SYNTAX
         : "+r" (r0), "+r" (r1), "+r" (r2)
@@ -547,217 +214,6 @@
 #define asm_mult 1
 
 #if uECC_SQUARE_FUNC
-
-#define FAST_SQUARE_ASM_5_TO_6           \
-    "cmp r2, #5 \n\t"                    \
-    "beq 1f \n\t"                        \
-                                         \
-    /* r3 = high */                      \
-    "ldr r3, [r1] \n\t"                  \
-                                         \
-    "sub r0, #20 \n\t"                   \
-    "sub r1, #20 \n\t"                   \
-                                         \
-    /* Do off-center multiplication */   \
-    "ldr r14, [r1], #4 \n\t"             \
-    "umull r4, r5, r3, r14 \n\t"         \
-    "ldr r14, [r1], #4 \n\t"             \
-    "umull r7, r6, r3, r14 \n\t"         \
-    "adds r5, r5, r7 \n\t"               \
-    "ldr r14, [r1], #4 \n\t"             \
-    "umull r8, r7, r3, r14 \n\t"         \
-    "adcs r6, r6, r8 \n\t"               \
-    "ldr r14, [r1], #4 \n\t"             \
-    "umull r9, r8, r3, r14 \n\t"         \
-    "adcs r7, r7, r9 \n\t"               \
-    /* Skip already-loaded r3 */         \
-    "ldr r14, [r1], #8 \n\t"             \
-    "umull r10, r9, r3, r14 \n\t"        \
-    "adcs r8, r8, r10 \n\t"              \
-    "adcs r9, r9, #0 \n\t"               \
-                                         \
-    /* Multiply by 2 */                  \
-    "mov r10, #0 \n\t"                   \
-    "adds r4, r4, r4 \n\t"               \
-    "adcs r5, r5, r5 \n\t"               \
-    "adcs r6, r6, r6 \n\t"               \
-    "adcs r7, r7, r7 \n\t"               \
-    "adcs r8, r8, r8 \n\t"               \
-    "adcs r9, r9, r9 \n\t"               \
-    "adcs r10, r10, #0 \n\t"             \
-                                         \
-    /* Add into previous */              \
-    "ldr r14, [r0] \n\t"                 \
-    "adds r4, r4, r14 \n\t"              \
-    "str r4, [r0], #4 \n\t"              \
-    "ldr r14, [r0] \n\t"                 \
-    "adcs r5, r5, r14 \n\t"              \
-    "str r5, [r0], #4 \n\t"              \
-    "ldr r14, [r0] \n\t"                 \
-    "adcs r6, r6, r14 \n\t"              \
-    "str r6, [r0], #4 \n\t"              \
-    "ldr r14, [r0] \n\t"                 \
-    "adcs r7, r7, r14 \n\t"              \
-    "str r7, [r0], #4 \n\t"              \
-    "ldr r14, [r0] \n\t"                 \
-    "adcs r8, r8, r14 \n\t"              \
-    "str r8, [r0], #4 \n\t"              \
-    "adcs r9, r9, #0 \n\t"               \
-    "adcs r10, r10, #0 \n\t"             \
-                                         \
-    /* Perform center multiplication */  \
-    "umull r4, r5, r3, r3 \n\t"          \
-    "adds r4, r4, r9 \n\t"               \
-    "adc r5, r5, r10 \n\t"               \
-    "stmia r0!, {r4, r5} \n\t"           
-
-#define FAST_SQUARE_ASM_6_TO_7               \
-    "cmp r2, #6 \n\t"                        \
-    "beq 1f \n\t"                            \
-                                             \
-    /* r3 = high */                          \
-    "ldr r3, [r1] \n\t"                      \
-                                             \
-    "sub r0, #24 \n\t"                       \
-    "sub r1, #24 \n\t"                       \
-                                             \
-    /* Do off-center multiplication */       \
-    "ldr r14, [r1], #4 \n\t"                 \
-    "umull r4, r5, r3, r14 \n\t"             \
-    "ldr r14, [r1], #4 \n\t"                 \
-    "umull r7, r6, r3, r14 \n\t"             \
-    "adds r5, r5, r7 \n\t"                   \
-    "ldr r14, [r1], #4 \n\t"                 \
-    "umull r8, r7, r3, r14 \n\t"             \
-    "adcs r6, r6, r8 \n\t"                   \
-    "ldr r14, [r1], #4 \n\t"                 \
-    "umull r9, r8, r3, r14 \n\t"             \
-    "adcs r7, r7, r9 \n\t"                   \
-    "ldr r14, [r1], #4 \n\t"                 \
-    "umull r10, r9, r3, r14 \n\t"            \
-    "adcs r8, r8, r10 \n\t"                  \
-    /* Skip already-loaded r3 */             \
-    "ldr r14, [r1], #8 \n\t"                 \
-    "umull r11, r10, r3, r14 \n\t"           \
-    "adcs r9, r9, r11 \n\t"                  \
-    "adcs r10, r10, #0 \n\t"                 \
-                                             \
-    /* Multiply by 2 */                      \
-    "mov r11, #0 \n\t"                       \
-    "adds r4, r4, r4 \n\t"                   \
-    "adcs r5, r5, r5 \n\t"                   \
-    "adcs r6, r6, r6 \n\t"                   \
-    "adcs r7, r7, r7 \n\t"                   \
-    "adcs r8, r8, r8 \n\t"                   \
-    "adcs r9, r9, r9 \n\t"                   \
-    "adcs r10, r10, r10 \n\t"                \
-    "adcs r11, r11, #0 \n\t"                 \
-                                             \
-    /* Add into previous */                  \
-    "ldr r14, [r0] \n\t"                     \
-    "adds r4, r4, r14 \n\t"                  \
-    "str r4, [r0], #4 \n\t"                  \
-    "ldr r14, [r0] \n\t"                     \
-    "adcs r5, r5, r14 \n\t"                  \
-    "str r5, [r0], #4 \n\t"                  \
-    "ldr r14, [r0] \n\t"                     \
-    "adcs r6, r6, r14 \n\t"                  \
-    "str r6, [r0], #4 \n\t"                  \
-    "ldr r14, [r0] \n\t"                     \
-    "adcs r7, r7, r14 \n\t"                  \
-    "str r7, [r0], #4 \n\t"                  \
-    "ldr r14, [r0] \n\t"                     \
-    "adcs r8, r8, r14 \n\t"                  \
-    "str r8, [r0], #4 \n\t"                  \
-    "ldr r14, [r0] \n\t"                     \
-    "adcs r9, r9, r14 \n\t"                  \
-    "str r9, [r0], #4 \n\t"                  \
-    "adcs r10, r10, #0 \n\t"                 \
-    "adcs r11, r11, #0 \n\t"                 \
-                                             \
-    /* Perform center multiplication */      \
-    "umull r4, r5, r3, r3 \n\t"              \
-    "adds r4, r4, r10 \n\t"                  \
-    "adc r5, r5, r11 \n\t"                   \
-    "stmia r0!, {r4, r5} \n\t"
-
-#define FAST_SQUARE_ASM_7_TO_8           \
-    "cmp r2, #7 \n\t"                    \
-    "beq 1f \n\t"                        \
-                                         \
-    /* r3 = high */                      \
-    "ldr r3, [r1] \n\t"                  \
-                                         \
-    "sub r0, #28 \n\t"                   \
-    "sub r1, #28 \n\t"                   \
-                                         \
-    /* Do off-center multiplication */   \
-    "ldr r14, [r1], #4 \n\t"             \
-    "umull r4, r5, r3, r14 \n\t"         \
-    "ldr r14, [r1], #4 \n\t"             \
-    "umull r7, r6, r3, r14 \n\t"         \
-    "adds r5, r5, r7 \n\t"               \
-    "ldr r14, [r1], #4 \n\t"             \
-    "umull r8, r7, r3, r14 \n\t"         \
-    "adcs r6, r6, r8 \n\t"               \
-    "ldr r14, [r1], #4 \n\t"             \
-    "umull r9, r8, r3, r14 \n\t"         \
-    "adcs r7, r7, r9 \n\t"               \
-    "ldr r14, [r1], #4 \n\t"             \
-    "umull r10, r9, r3, r14 \n\t"        \
-    "adcs r8, r8, r10 \n\t"              \
-    "ldr r14, [r1], #4 \n\t"             \
-    "umull r11, r10, r3, r14 \n\t"       \
-    "adcs r9, r9, r11 \n\t"              \
-    /* Skip already-loaded r3 */         \
-    "ldr r14, [r1], #8 \n\t"             \
-    "umull r12, r11, r3, r14 \n\t"       \
-    "adcs r10, r10, r12 \n\t"            \
-    "adcs r11, r11, #0 \n\t"             \
-                                         \
-    /* Multiply by 2 */                  \
-    "mov r12, #0 \n\t"                   \
-    "adds r4, r4, r4 \n\t"               \
-    "adcs r5, r5, r5 \n\t"               \
-    "adcs r6, r6, r6 \n\t"               \
-    "adcs r7, r7, r7 \n\t"               \
-    "adcs r8, r8, r8 \n\t"               \
-    "adcs r9, r9, r9 \n\t"               \
-    "adcs r10, r10, r10 \n\t"            \
-    "adcs r11, r11, r11 \n\t"            \
-    "adcs r12, r12, #0 \n\t"             \
-                                         \
-    /* Add into previous */              \
-    "ldr r14, [r0] \n\t"                 \
-    "adds r4, r4, r14 \n\t"              \
-    "str r4, [r0], #4 \n\t"              \
-    "ldr r14, [r0] \n\t"                 \
-    "adcs r5, r5, r14 \n\t"              \
-    "str r5, [r0], #4 \n\t"              \
-    "ldr r14, [r0] \n\t"                 \
-    "adcs r6, r6, r14 \n\t"              \
-    "str r6, [r0], #4 \n\t"              \
-    "ldr r14, [r0] \n\t"                 \
-    "adcs r7, r7, r14 \n\t"              \
-    "str r7, [r0], #4 \n\t"              \
-    "ldr r14, [r0] \n\t"                 \
-    "adcs r8, r8, r14 \n\t"              \
-    "str r8, [r0], #4 \n\t"              \
-    "ldr r14, [r0] \n\t"                 \
-    "adcs r9, r9, r14 \n\t"              \
-    "str r9, [r0], #4 \n\t"              \
-    "ldr r14, [r0] \n\t"                 \
-    "adcs r10, r10, r14 \n\t"            \
-    "str r10, [r0], #4 \n\t"             \
-    "adcs r11, r11, #0 \n\t"             \
-    "adcs r12, r12, #0 \n\t"             \
-                                         \
-    /* Perform center multiplication */  \
-    "umull r4, r5, r3, r3 \n\t"          \
-    "adds r4, r4, r11 \n\t"              \
-    "adc r5, r5, r12 \n\t"               \
-    "stmia r0!, {r4, r5} \n\t"           
-
 uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
                                   const uECC_word_t *left,
                                   wordcount_t num_words) {
@@ -767,13 +223,9 @@
     
     __asm__ volatile (
         ".syntax unified \n\t"
-        "push {r1, r2} \n\t"
-
 #if (uECC_MIN_WORDS == 5)
         FAST_SQUARE_ASM_5
-        "pop {r1, r2} \n\t"
     #if (uECC_MAX_WORDS > 5)
-        "add r1, #20 \n\t"
         FAST_SQUARE_ASM_5_TO_6
     #endif
     #if (uECC_MAX_WORDS > 6)
@@ -784,9 +236,7 @@
     #endif
 #elif (uECC_MIN_WORDS == 6)
         FAST_SQUARE_ASM_6
-        "pop {r1, r2} \n\t"
     #if (uECC_MAX_WORDS > 6)
-        "add r1, #24 \n\t"
         FAST_SQUARE_ASM_6_TO_7
     #endif
     #if (uECC_MAX_WORDS > 7)
@@ -794,14 +244,11 @@
     #endif
 #elif (uECC_MIN_WORDS == 7)
         FAST_SQUARE_ASM_7
-        "pop {r1, r2} \n\t"
     #if (uECC_MAX_WORDS > 7)
-        "add r1, #28 \n\t"
         FAST_SQUARE_ASM_7_TO_8
     #endif
 #elif (uECC_MIN_WORDS == 8)
         FAST_SQUARE_ASM_8
-        "pop {r1, r2} \n\t"
 #endif
 
         "1: \n\t"
@@ -814,6 +261,138 @@
 #define asm_square 1
 #endif /* uECC_SQUARE_FUNC */
 
+#else /* (uECC_OPTIMIZATION_LEVEL > 3) */
+
+uECC_VLI_API void uECC_vli_mult(uint32_t *result,
+                                const uint32_t *left,
+                                const uint32_t *right,
+                                wordcount_t num_words) {
+    register uint32_t *r0 __asm__("r0") = result;       /* pinned to r0 for the asm bodies below */
+    register const uint32_t *r1 __asm__("r1") = left;   /* pinned to r1 */
+    register const uint32_t *r2 __asm__("r2") = right;  /* pinned to r2 */
+    register uint32_t r3 __asm__("r3") = num_words;     /* pinned to r3; passed as an input operand only */
+    /* Run the unrolled FAST_MULT_ASM_n body matching num_words; if no compiled-in case matches, no asm is executed. */
+#if uECC_SUPPORTS_secp160r1
+    if (num_words == 5) { /* 5 x 32-bit words = 160 bits */
+        __asm__ volatile (
+            ".syntax unified \n\t"
+            FAST_MULT_ASM_5
+            RESUME_SYNTAX
+            : "+r" (r0), "+r" (r1), "+r" (r2) /* read-write operands: the asm body may modify these registers */
+            : "r" (r3)
+            : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+        );
+        return;
+    }
+#endif
+#if uECC_SUPPORTS_secp192r1
+    if (num_words == 6) { /* 6 x 32-bit words = 192 bits */
+        __asm__ volatile (
+            ".syntax unified \n\t"
+            FAST_MULT_ASM_6
+            RESUME_SYNTAX
+            : "+r" (r0), "+r" (r1), "+r" (r2)
+            : "r" (r3)
+            : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+        );
+        return;
+    }
+#endif
+#if uECC_SUPPORTS_secp224r1
+    if (num_words == 7) { /* 7 x 32-bit words = 224 bits */
+        __asm__ volatile (
+            ".syntax unified \n\t"
+            FAST_MULT_ASM_7
+            RESUME_SYNTAX
+            : "+r" (r0), "+r" (r1), "+r" (r2)
+            : "r" (r3)
+            : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+        );
+        return;
+    }
+#endif
+#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
+    if (num_words == 8) { /* 8 x 32-bit words = 256 bits */
+        __asm__ volatile (
+            ".syntax unified \n\t"
+            FAST_MULT_ASM_8
+            RESUME_SYNTAX
+            : "+r" (r0), "+r" (r1), "+r" (r2)
+            : "r" (r3)
+            : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+        );
+        return;
+    }
+#endif
+}
+#define asm_mult 1
+
+#if uECC_SQUARE_FUNC
+uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
+                                  const uECC_word_t *left,
+                                  wordcount_t num_words) {
+    register uint32_t *r0 __asm__("r0") = result;       /* pinned to r0 for the asm bodies below */
+    register const uint32_t *r1 __asm__("r1") = left;   /* pinned to r1 */
+    register uint32_t r2 __asm__("r2") = num_words;     /* pinned to r2; passed as an input operand only */
+    /* Run the unrolled FAST_SQUARE_ASM_n body matching num_words; if no compiled-in case matches, no asm is executed. */
+#if uECC_SUPPORTS_secp160r1
+    if (num_words == 5) { /* 5 x 32-bit words = 160 bits */
+        __asm__ volatile (
+            ".syntax unified \n\t"
+            FAST_SQUARE_ASM_5
+            RESUME_SYNTAX
+            : "+r" (r0), "+r" (r1) /* read-write operands: the asm body may modify these registers */
+            : "r" (r2)
+            : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+        );
+        return;
+    }
+#endif
+#if uECC_SUPPORTS_secp192r1
+    if (num_words == 6) { /* 6 x 32-bit words = 192 bits */
+        __asm__ volatile (
+            ".syntax unified \n\t"
+            FAST_SQUARE_ASM_6
+            RESUME_SYNTAX
+            : "+r" (r0), "+r" (r1)
+            : "r" (r2)
+            : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+        );
+        return;
+    }
+#endif
+#if uECC_SUPPORTS_secp224r1
+    if (num_words == 7) { /* 7 x 32-bit words = 224 bits */
+        __asm__ volatile (
+            ".syntax unified \n\t"
+            FAST_SQUARE_ASM_7
+            RESUME_SYNTAX
+            : "+r" (r0), "+r" (r1)
+            : "r" (r2)
+            : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+        );
+        return;
+    }
+#endif
+#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
+    if (num_words == 8) { /* 8 x 32-bit words = 256 bits */
+        __asm__ volatile (
+            ".syntax unified \n\t"
+            FAST_SQUARE_ASM_8
+            RESUME_SYNTAX
+            : "+r" (r0), "+r" (r1)
+            : "r" (r2)
+            : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
+        );
+        return;
+    }
+#endif
+}
+#define asm_square 1
+#endif /* uECC_SQUARE_FUNC */
+
+#endif /* (uECC_OPTIMIZATION_LEVEL > 3) */
+
 #endif /* uECC_PLATFORM != uECC_arm_thumb */
 
 #endif /* (uECC_OPTIMIZATION_LEVEL >= 3) */
diff --git a/asm_arm_mult_square.inc b/asm_arm_mult_square.inc
index 9decef6..8907fc1 100644
--- a/asm_arm_mult_square.inc
+++ b/asm_arm_mult_square.inc
@@ -4,6 +4,7 @@
 #define _UECC_ASM_ARM_MULT_SQUARE_H_
 
 #define FAST_MULT_ASM_5                \
+    "push {r3} \n\t"                   \
     "add r0, 12 \n\t"                  \
     "add r2, 12 \n\t"                  \
     "ldmia r1!, {r3,r4} \n\t"          \
@@ -154,9 +155,106 @@
     "umull r14, r9, r4, r7 \n\t"       \
     "adds r10, r10, r14 \n\t"          \
     "adc r11, r11, r9 \n\t"            \
-    "stmia r0!, {r10, r11} \n\t"
+    "stmia r0!, {r10, r11} \n\t"       \
+    "pop {r3} \n\t"
+
+#define FAST_MULT_ASM_5_TO_6 /* fold one more word of each operand into the product at r0 */ \
+    "cmp r3, #5 \n\t"                        \
+    "beq 1f \n\t"                            \
+    /* Reached only when r3 != 5 (label 1 lives outside this macro). NOTE(review): presumably r3 = num_words and FAST_MULT_ASM_5 ran just before - confirm at the call site. */ \
+    /* r4 = word at [r1] (left high), r5 = word at [r2] (right high) */ \
+    "ldr r4, [r1] \n\t"                      \
+    "ldr r5, [r2] \n\t"                      \
+    /* step r0, r1 and r2 back by 5 words (20 bytes) */ \
+    "sub r0, #20 \n\t"                       \
+    "sub r1, #20 \n\t"                       \
+    "sub r2, #20 \n\t"                       \
+    /* column 1: [r0] + r4 * [r2] + r5 * [r1] -> [r0]; carry out in r14:r10 */ \
+    "ldr r6, [r0] \n\t"                      \
+    "ldr r7, [r1], #4 \n\t"                  \
+    "ldr r8, [r2], #4 \n\t"                  \
+    "mov r14, #0 \n\t"                       \
+    "umull r9, r10, r4, r8 \n\t"             \
+    "umull r11, r12, r5, r7 \n\t"            \
+    "adds r9, r9, r6 \n\t"                   \
+    "adc r10, r10, #0 \n\t"                  \
+    "adds r9, r9, r11 \n\t"                  \
+    "adcs r10, r10, r12 \n\t"                \
+    "adc r14, r14, #0 \n\t"                  \
+    "str r9, [r0], #4 \n\t"                  \
+    /* column 2: carry in r14:r10 + [r0] + r4 * [r2] + r5 * [r1]; low word r10 -> [r0]; carry out in r9:r14 */ \
+    "ldr r6, [r0] \n\t"                      \
+    "adds r10, r10, r6 \n\t"                 \
+    "adcs r14, r14, #0 \n\t"                 \
+    "ldr r7, [r1], #4 \n\t"                  \
+    "ldr r8, [r2], #4 \n\t"                  \
+    "mov r9, #0 \n\t"                        \
+    "umull r11, r12, r4, r8 \n\t"            \
+    "adds r10, r10, r11 \n\t"                \
+    "adcs r14, r14, r12 \n\t"                \
+    "adc r9, r9, #0 \n\t"                    \
+    "umull r11, r12, r5, r7 \n\t"            \
+    "adds r10, r10, r11 \n\t"                \
+    "adcs r14, r14, r12 \n\t"                \
+    "adc r9, r9, #0 \n\t"                    \
+    "str r10, [r0], #4 \n\t"                 \
+    /* column 3: same step with the registers rotated; low word r14 -> [r0]; carry out in r10:r9 */ \
+    "ldr r6, [r0] \n\t"                      \
+    "adds r14, r14, r6 \n\t"                 \
+    "adcs r9, r9, #0 \n\t"                   \
+    "ldr r7, [r1], #4 \n\t"                  \
+    "ldr r8, [r2], #4 \n\t"                  \
+    "mov r10, #0 \n\t"                       \
+    "umull r11, r12, r4, r8 \n\t"            \
+    "adds r14, r14, r11 \n\t"                \
+    "adcs r9, r9, r12 \n\t"                  \
+    "adc r10, r10, #0 \n\t"                  \
+    "umull r11, r12, r5, r7 \n\t"            \
+    "adds r14, r14, r11 \n\t"                \
+    "adcs r9, r9, r12 \n\t"                  \
+    "adc r10, r10, #0 \n\t"                  \
+    "str r14, [r0], #4 \n\t"                 \
+    /* column 4: low word r9 -> [r0]; carry out in r14:r10 */ \
+    "ldr r6, [r0] \n\t"                      \
+    "adds r9, r9, r6 \n\t"                   \
+    "adcs r10, r10, #0 \n\t"                 \
+    "ldr r7, [r1], #4 \n\t"                  \
+    "ldr r8, [r2], #4 \n\t"                  \
+    "mov r14, #0 \n\t"                       \
+    "umull r11, r12, r4, r8 \n\t"            \
+    "adds r9, r9, r11 \n\t"                  \
+    "adcs r10, r10, r12 \n\t"                \
+    "adc r14, r14, #0 \n\t"                  \
+    "umull r11, r12, r5, r7 \n\t"            \
+    "adds r9, r9, r11 \n\t"                  \
+    "adcs r10, r10, r12 \n\t"                \
+    "adc r14, r14, #0 \n\t"                  \
+    "str r9, [r0], #4 \n\t"                  \
+    /* column 5 (last): low word r10 -> [r0]; carry out in r9:r14 */ \
+    "ldr r6, [r0] \n\t"                      \
+    "adds r10, r10, r6 \n\t"                 \
+    "adcs r14, r14, #0 \n\t"                 \
+    /* post-increment by 8 so r1/r2 also step over the words already held in r4/r5 */ \
+    "ldr r7, [r1], #8 \n\t"                  \
+    "ldr r8, [r2], #8 \n\t"                  \
+    "mov r9, #0 \n\t"                        \
+    "umull r11, r12, r4, r8 \n\t"            \
+    "adds r10, r10, r11 \n\t"                \
+    "adcs r14, r14, r12 \n\t"                \
+    "adc r9, r9, #0 \n\t"                    \
+    "umull r11, r12, r5, r7 \n\t"            \
+    "adds r10, r10, r11 \n\t"                \
+    "adcs r14, r14, r12 \n\t"                \
+    "adc r9, r9, #0 \n\t"                    \
+    "str r10, [r0], #4 \n\t"                 \
+    /* top two words: r4 * r5 plus the carry left in r9:r14, appended at r0 */ \
+    "umull r11, r12, r4, r5 \n\t"            \
+    "adds r11, r11, r14 \n\t"                \
+    "adc r12, r12, r9 \n\t"                  \
+    "stmia r0!, {r11, r12} \n\t"
 
 #define FAST_MULT_ASM_6             \
+    "push {r3} \n\t"                \
     "add r0, 12 \n\t"               \
     "add r2, 12 \n\t"               \
     "ldmia r1!, {r3,r4,r5} \n\t"    \
@@ -372,9 +470,122 @@
     "umull r10, r11, r5, r8 \n\t"   \
     "adds r12, r12, r10 \n\t"       \
     "adc r14, r14, r11 \n\t"        \
-    "stmia r0!, {r12, r14} \n\t"
+    "stmia r0!, {r12, r14} \n\t"    \
+    "pop {r3} \n\t"
+
+#define FAST_MULT_ASM_6_TO_7 /* fold one more word of each operand into the product at r0 */ \
+    "cmp r3, #6 \n\t"                           \
+    "beq 1f \n\t"                               \
+    /* Reached only when r3 != 6 (label 1 lives outside this macro). NOTE(review): presumably r3 = num_words and the 6-word product was just written - confirm at the call site. */ \
+    /* r4 = word at [r1] (left high), r5 = word at [r2] (right high) */ \
+    "ldr r4, [r1] \n\t"                         \
+    "ldr r5, [r2] \n\t"                         \
+    /* step r0, r1 and r2 back by 6 words (24 bytes) */ \
+    "sub r0, #24 \n\t"                          \
+    "sub r1, #24 \n\t"                          \
+    "sub r2, #24 \n\t"                          \
+    /* column 1: [r0] + r4 * [r2] + r5 * [r1] -> [r0]; carry out in r14:r10 */ \
+    "ldr r6, [r0] \n\t"                         \
+    "ldr r7, [r1], #4 \n\t"                     \
+    "ldr r8, [r2], #4 \n\t"                     \
+    "mov r14, #0 \n\t"                          \
+    "umull r9, r10, r4, r8 \n\t"                \
+    "umull r11, r12, r5, r7 \n\t"               \
+    "adds r9, r9, r6 \n\t"                      \
+    "adc r10, r10, #0 \n\t"                     \
+    "adds r9, r9, r11 \n\t"                     \
+    "adcs r10, r10, r12 \n\t"                   \
+    "adc r14, r14, #0 \n\t"                     \
+    "str r9, [r0], #4 \n\t"                     \
+    /* column 2: carry in r14:r10 + [r0] + r4 * [r2] + r5 * [r1]; low word r10 -> [r0]; carry out in r9:r14 */ \
+    "ldr r6, [r0] \n\t"                         \
+    "adds r10, r10, r6 \n\t"                    \
+    "adcs r14, r14, #0 \n\t"                    \
+    "ldr r7, [r1], #4 \n\t"                     \
+    "ldr r8, [r2], #4 \n\t"                     \
+    "mov r9, #0 \n\t"                           \
+    "umull r11, r12, r4, r8 \n\t"               \
+    "adds r10, r10, r11 \n\t"                   \
+    "adcs r14, r14, r12 \n\t"                   \
+    "adc r9, r9, #0 \n\t"                       \
+    "umull r11, r12, r5, r7 \n\t"               \
+    "adds r10, r10, r11 \n\t"                   \
+    "adcs r14, r14, r12 \n\t"                   \
+    "adc r9, r9, #0 \n\t"                       \
+    "str r10, [r0], #4 \n\t"                    \
+    /* column 3: same step with the registers rotated; low word r14 -> [r0]; carry out in r10:r9 */ \
+    "ldr r6, [r0] \n\t"                         \
+    "adds r14, r14, r6 \n\t"                    \
+    "adcs r9, r9, #0 \n\t"                      \
+    "ldr r7, [r1], #4 \n\t"                     \
+    "ldr r8, [r2], #4 \n\t"                     \
+    "mov r10, #0 \n\t"                          \
+    "umull r11, r12, r4, r8 \n\t"               \
+    "adds r14, r14, r11 \n\t"                   \
+    "adcs r9, r9, r12 \n\t"                     \
+    "adc r10, r10, #0 \n\t"                     \
+    "umull r11, r12, r5, r7 \n\t"               \
+    "adds r14, r14, r11 \n\t"                   \
+    "adcs r9, r9, r12 \n\t"                     \
+    "adc r10, r10, #0 \n\t"                     \
+    "str r14, [r0], #4 \n\t"                    \
+    /* column 4: low word r9 -> [r0]; carry out in r14:r10 */ \
+    "ldr r6, [r0] \n\t"                         \
+    "adds r9, r9, r6 \n\t"                      \
+    "adcs r10, r10, #0 \n\t"                    \
+    "ldr r7, [r1], #4 \n\t"                     \
+    "ldr r8, [r2], #4 \n\t"                     \
+    "mov r14, #0 \n\t"                          \
+    "umull r11, r12, r4, r8 \n\t"               \
+    "adds r9, r9, r11 \n\t"                     \
+    "adcs r10, r10, r12 \n\t"                   \
+    "adc r14, r14, #0 \n\t"                     \
+    "umull r11, r12, r5, r7 \n\t"               \
+    "adds r9, r9, r11 \n\t"                     \
+    "adcs r10, r10, r12 \n\t"                   \
+    "adc r14, r14, #0 \n\t"                     \
+    "str r9, [r0], #4 \n\t"                     \
+    /* column 5: low word r10 -> [r0]; carry out in r9:r14 */ \
+    "ldr r6, [r0] \n\t"                         \
+    "adds r10, r10, r6 \n\t"                    \
+    "adcs r14, r14, #0 \n\t"                    \
+    "ldr r7, [r1], #4 \n\t"                     \
+    "ldr r8, [r2], #4 \n\t"                     \
+    "mov r9, #0 \n\t"                           \
+    "umull r11, r12, r4, r8 \n\t"               \
+    "adds r10, r10, r11 \n\t"                   \
+    "adcs r14, r14, r12 \n\t"                   \
+    "adc r9, r9, #0 \n\t"                       \
+    "umull r11, r12, r5, r7 \n\t"               \
+    "adds r10, r10, r11 \n\t"                   \
+    "adcs r14, r14, r12 \n\t"                   \
+    "adc r9, r9, #0 \n\t"                       \
+    "str r10, [r0], #4 \n\t"                    \
+    /* column 6 (last): low word r14 -> [r0]; carry out in r10:r9 */ \
+    "ldr r6, [r0] \n\t"                         \
+    "adds r14, r14, r6 \n\t"                    \
+    "adcs r9, r9, #0 \n\t"                      \
+    /* post-increment by 8 so r1/r2 also step over the words already held in r4/r5 */ \
+    "ldr r7, [r1], #8 \n\t"                     \
+    "ldr r8, [r2], #8 \n\t"                     \
+    "mov r10, #0 \n\t"                          \
+    "umull r11, r12, r4, r8 \n\t"               \
+    "adds r14, r14, r11 \n\t"                   \
+    "adcs r9, r9, r12 \n\t"                     \
+    "adc r10, r10, #0 \n\t"                     \
+    "umull r11, r12, r5, r7 \n\t"               \
+    "adds r14, r14, r11 \n\t"                   \
+    "adcs r9, r9, r12 \n\t"                     \
+    "adc r10, r10, #0 \n\t"                     \
+    "str r14, [r0], #4 \n\t"                    \
+    /* top two words: r4 * r5 plus the carry left in r10:r9, appended at r0 */ \
+    "umull r11, r12, r4, r5 \n\t"               \
+    "adds r11, r11, r9 \n\t"                    \
+    "adc r12, r12, r10 \n\t"                    \
+    "stmia r0!, {r11, r12} \n\t"
 
 #define FAST_MULT_ASM_7                \
+    "push {r3} \n\t"                   \
     "add r0, 24 \n\t"                  \
     "add r2, 24 \n\t"                  \
     "ldmia r1!, {r3} \n\t"             \
@@ -680,9 +891,138 @@
     "umull r10, r11, r3, r6 \n\t"      \
     "adds r12, r12, r10 \n\t"          \
     "adc r14, r14, r11 \n\t"           \
-    "stmia r0!, {r12, r14} \n\t"
+    "stmia r0!, {r12, r14} \n\t"       \
+    "pop {r3} \n\t"
+
+#define FAST_MULT_ASM_7_TO_8 /* fold one more word of each operand into the product at r0 */ \
+    "cmp r3, #7 \n\t"                        \
+    "beq 1f \n\t"                            \
+    /* Reached only when r3 != 7 (label 1 lives outside this macro). NOTE(review): presumably r3 = num_words and the 7-word product was just written - confirm at the call site. */ \
+    /* r4 = word at [r1] (left high), r5 = word at [r2] (right high) */ \
+    "ldr r4, [r1] \n\t"                      \
+    "ldr r5, [r2] \n\t"                      \
+    /* step r0, r1 and r2 back by 7 words (28 bytes) */ \
+    "sub r0, #28 \n\t"                       \
+    "sub r1, #28 \n\t"                       \
+    "sub r2, #28 \n\t"                       \
+    /* column 1: [r0] + r4 * [r2] + r5 * [r1] -> [r0]; carry out in r14:r10 */ \
+    "ldr r6, [r0] \n\t"                      \
+    "ldr r7, [r1], #4 \n\t"                  \
+    "ldr r8, [r2], #4 \n\t"                  \
+    "mov r14, #0 \n\t"                       \
+    "umull r9, r10, r4, r8 \n\t"             \
+    "umull r11, r12, r5, r7 \n\t"            \
+    "adds r9, r9, r6 \n\t"                   \
+    "adc r10, r10, #0 \n\t"                  \
+    "adds r9, r9, r11 \n\t"                  \
+    "adcs r10, r10, r12 \n\t"                \
+    "adc r14, r14, #0 \n\t"                  \
+    "str r9, [r0], #4 \n\t"                  \
+    /* column 2: carry in r14:r10 + [r0] + r4 * [r2] + r5 * [r1]; low word r10 -> [r0]; carry out in r9:r14 */ \
+    "ldr r6, [r0] \n\t"                      \
+    "adds r10, r10, r6 \n\t"                 \
+    "adcs r14, r14, #0 \n\t"                 \
+    "ldr r7, [r1], #4 \n\t"                  \
+    "ldr r8, [r2], #4 \n\t"                  \
+    "mov r9, #0 \n\t"                        \
+    "umull r11, r12, r4, r8 \n\t"            \
+    "adds r10, r10, r11 \n\t"                \
+    "adcs r14, r14, r12 \n\t"                \
+    "adc r9, r9, #0 \n\t"                    \
+    "umull r11, r12, r5, r7 \n\t"            \
+    "adds r10, r10, r11 \n\t"                \
+    "adcs r14, r14, r12 \n\t"                \
+    "adc r9, r9, #0 \n\t"                    \
+    "str r10, [r0], #4 \n\t"                 \
+    /* column 3: same step with the registers rotated; low word r14 -> [r0]; carry out in r10:r9 */ \
+    "ldr r6, [r0] \n\t"                      \
+    "adds r14, r14, r6 \n\t"                 \
+    "adcs r9, r9, #0 \n\t"                   \
+    "ldr r7, [r1], #4 \n\t"                  \
+    "ldr r8, [r2], #4 \n\t"                  \
+    "mov r10, #0 \n\t"                       \
+    "umull r11, r12, r4, r8 \n\t"            \
+    "adds r14, r14, r11 \n\t"                \
+    "adcs r9, r9, r12 \n\t"                  \
+    "adc r10, r10, #0 \n\t"                  \
+    "umull r11, r12, r5, r7 \n\t"            \
+    "adds r14, r14, r11 \n\t"                \
+    "adcs r9, r9, r12 \n\t"                  \
+    "adc r10, r10, #0 \n\t"                  \
+    "str r14, [r0], #4 \n\t"                 \
+    /* column 4: low word r9 -> [r0]; carry out in r14:r10 */ \
+    "ldr r6, [r0] \n\t"                      \
+    "adds r9, r9, r6 \n\t"                   \
+    "adcs r10, r10, #0 \n\t"                 \
+    "ldr r7, [r1], #4 \n\t"                  \
+    "ldr r8, [r2], #4 \n\t"                  \
+    "mov r14, #0 \n\t"                       \
+    "umull r11, r12, r4, r8 \n\t"            \
+    "adds r9, r9, r11 \n\t"                  \
+    "adcs r10, r10, r12 \n\t"                \
+    "adc r14, r14, #0 \n\t"                  \
+    "umull r11, r12, r5, r7 \n\t"            \
+    "adds r9, r9, r11 \n\t"                  \
+    "adcs r10, r10, r12 \n\t"                \
+    "adc r14, r14, #0 \n\t"                  \
+    "str r9, [r0], #4 \n\t"                  \
+    /* column 5: low word r10 -> [r0]; carry out in r9:r14 */ \
+    "ldr r6, [r0] \n\t"                      \
+    "adds r10, r10, r6 \n\t"                 \
+    "adcs r14, r14, #0 \n\t"                 \
+    "ldr r7, [r1], #4 \n\t"                  \
+    "ldr r8, [r2], #4 \n\t"                  \
+    "mov r9, #0 \n\t"                        \
+    "umull r11, r12, r4, r8 \n\t"            \
+    "adds r10, r10, r11 \n\t"                \
+    "adcs r14, r14, r12 \n\t"                \
+    "adc r9, r9, #0 \n\t"                    \
+    "umull r11, r12, r5, r7 \n\t"            \
+    "adds r10, r10, r11 \n\t"                \
+    "adcs r14, r14, r12 \n\t"                \
+    "adc r9, r9, #0 \n\t"                    \
+    "str r10, [r0], #4 \n\t"                 \
+    /* column 6: low word r14 -> [r0]; carry out in r10:r9 */ \
+    "ldr r6, [r0] \n\t"                      \
+    "adds r14, r14, r6 \n\t"                 \
+    "adcs r9, r9, #0 \n\t"                   \
+    "ldr r7, [r1], #4 \n\t"                  \
+    "ldr r8, [r2], #4 \n\t"                  \
+    "mov r10, #0 \n\t"                       \
+    "umull r11, r12, r4, r8 \n\t"            \
+    "adds r14, r14, r11 \n\t"                \
+    "adcs r9, r9, r12 \n\t"                  \
+    "adc r10, r10, #0 \n\t"                  \
+    "umull r11, r12, r5, r7 \n\t"            \
+    "adds r14, r14, r11 \n\t"                \
+    "adcs r9, r9, r12 \n\t"                  \
+    "adc r10, r10, #0 \n\t"                  \
+    "str r14, [r0], #4 \n\t"                 \
+    /* column 7 (last): low word r9 -> [r0]; carry out in r14:r10 */ \
+    "ldr r6, [r0] \n\t"                      \
+    "adds r9, r9, r6 \n\t"                   \
+    "adcs r10, r10, #0 \n\t"                 \
+    /* post-increment by 8 so r1/r2 also step over the words already held in r4/r5 */ \
+    "ldr r7, [r1], #8 \n\t"                  \
+    "ldr r8, [r2], #8 \n\t"                  \
+    "mov r14, #0 \n\t"                       \
+    "umull r11, r12, r4, r8 \n\t"            \
+    "adds r9, r9, r11 \n\t"                  \
+    "adcs r10, r10, r12 \n\t"                \
+    "adc r14, r14, #0 \n\t"                  \
+    "umull r11, r12, r5, r7 \n\t"            \
+    "adds r9, r9, r11 \n\t"                  \
+    "adcs r10, r10, r12 \n\t"                \
+    "adc r14, r14, #0 \n\t"                  \
+    "str r9, [r0], #4 \n\t"                  \
+    /* top two words: r4 * r5 plus the carry left in r14:r10, appended at r0 */ \
+    "umull r11, r12, r4, r5 \n\t"            \
+    "adds r11, r11, r10 \n\t"                \
+    "adc r12, r12, r14 \n\t"                 \
+    "stmia r0!, {r11, r12} \n\t"
 
 #define FAST_MULT_ASM_8             \
+    "push {r3} \n\t"                \
     "add r0, 24 \n\t"               \
     "add r2, 24 \n\t"               \
     "ldmia r1!, {r3,r4} \n\t"       \
@@ -1083,10 +1423,13 @@
     "umull r9, r10, r4, r7 \n\t"    \
     "adds r11, r11, r9 \n\t"        \
     "adc r12, r12, r10 \n\t"        \
-    "stmia r0!, {r11, r12} \n\t"
+    "stmia r0!, {r11, r12} \n\t"    \
+    "pop {r3} \n\t"
 
 #define FAST_SQUARE_ASM_5               \
+    "push   {r2} \n\t"                  \
     "ldmia r1!, {r2,r3,r4,r5,r6} \n\t"  \
+    "push   {r1} \n\t"                  \
                                         \
     "umull r11, r12, r2, r2 \n\t"       \
     "stmia r0!, {r11} \n\t"             \
@@ -1188,10 +1531,62 @@
     "umull r1, r10, r6, r6 \n\t"        \
     "adds r8, r8, r1 \n\t"              \
     "adcs r11, r11, r10 \n\t"           \
-    "stmia r0!, {r8, r11} \n\t"
+    "stmia r0!, {r8, r11} \n\t"         \
+    "pop {r1, r2} \n\t"
+
+#define FAST_SQUARE_ASM_5_TO_6 /* fold a 6th input word into the square stored at r0 */ \
+    "cmp r2, #5 \n\t"                    \
+    "beq 1f \n\t"                        \
+    /* Reached only when r2 != 5 (label 1 lives outside this macro): step r0 and r1 back by 5 words (20 bytes). */ \
+    "sub r0, #20 \n\t"                   \
+    "sub r1, #20 \n\t"                   \
+    /* NOTE(review): presumably r2 = num_words and this runs right after FAST_SQUARE_ASM_5 - confirm at the call site. */ \
+    /* Off-center products: load six words into r6..r11, then r3..r8 (r3 lowest) = {r6..r10} * r11 */ \
+    "ldmia r1!, {r6,r7,r8,r9,r10,r11} \n\t" \
+    "umull r3, r4, r6, r11 \n\t"         \
+    "umull r6, r5, r7, r11 \n\t"         \
+    "adds r4, r4, r6 \n\t"               \
+    "umull r7, r6, r8, r11 \n\t"         \
+    "adcs r5, r5, r7 \n\t"               \
+    "umull r8, r7, r9, r11 \n\t"         \
+    "adcs r6, r6, r8 \n\t"               \
+    "umull r9, r8, r10, r11 \n\t"        \
+    "adcs r7, r7, r9 \n\t"               \
+    "adcs r8, r8, #0 \n\t"               \
+                                         \
+    /* Multiply by 2: double r3..r8 through the carry chain; the bit shifted out goes to r9 */ \
+    "mov r9, #0 \n\t"                    \
+    "adds r3, r3, r3 \n\t"               \
+    "adcs r4, r4, r4 \n\t"               \
+    "adcs r5, r5, r5 \n\t"               \
+    "adcs r6, r6, r6 \n\t"               \
+    "adcs r7, r7, r7 \n\t"               \
+    "adcs r8, r8, r8 \n\t"               \
+    "adcs r9, r9, #0 \n\t"               \
+                                         \
+    /* Add into previous: r3..r7 += five words read from [r0]; the carry ripples into r8 and r9; then rewind r0 */ \
+    "ldr r14, [r0], #4 \n\t"             \
+    "adds r3, r3, r14 \n\t"              \
+    "ldr r14, [r0], #4 \n\t"             \
+    "adcs r4, r4, r14 \n\t"              \
+    "ldr r14, [r0], #4 \n\t"             \
+    "adcs r5, r5, r14 \n\t"              \
+    "ldr r14, [r0], #4 \n\t"             \
+    "adcs r6, r6, r14 \n\t"              \
+    "ldr r14, [r0], #4 \n\t"             \
+    "adcs r7, r7, r14 \n\t"              \
+    "adcs r8, r8, #0 \n\t"               \
+    "adcs r9, r9, #0 \n\t"               \
+    "sub r0, #20 \n\t"                   \
+                                         \
+    /* Center multiplication: r9:r8 += r11 * r11, then store the seven words r3..r9 at r0 */ \
+    "umlal r8, r9, r11, r11 \n\t"        \
+    "stmia r0!, {r3,r4,r5,r6,r7,r8,r9} \n\t"
 
 #define FAST_SQUARE_ASM_6                  \
+    "push   {r2} \n\t"                     \
     "ldmia r1!, {r2,r3,r4,r5,r6,r7} \n\t"  \
+    "push   {r1} \n\t"                     \
                                            \
     "umull r11, r12, r2, r2 \n\t"          \
     "stmia r0!, {r11} \n\t"                \
@@ -1329,237 +1724,344 @@
     "umull r1, r10, r7, r7 \n\t"           \
     "adds r8, r8, r1 \n\t"                 \
     "adcs r11, r11, r10 \n\t"              \
-    "stmia r0!, {r8, r11} \n\t"
+    "stmia r0!, {r8, r11} \n\t"            \
+    "pop {r1, r2} \n\t"
 
-#define FAST_SQUARE_ASM_7                      \
-    "ldmia r1!, {r2} \n\t"                     \
-    "add r1, 20 \n\t"                          \
-    "ldmia r1!, {r5} \n\t"                     \
-    "add r0, 24 \n\t"                          \
-    "umull r8, r9, r2, r5 \n\t"                \
-    "stmia r0!, {r8, r9} \n\t"                 \
-    "sub r0, 32 \n\t"                          \
-    "sub r1, 28 \n\t"                          \
-                                               \
-    "ldmia r1!, {r2, r3, r4, r5, r6, r7} \n\t" \
-                                               \
-    "umull r11, r12, r2, r2 \n\t"              \
-    "stmia r0!, {r11} \n\t"                    \
-                                               \
-    "mov r9, #0 \n\t"                          \
-    "umull r10, r11, r2, r3 \n\t"              \
-    "adds r12, r12, r10 \n\t"                  \
-    "adcs r8, r11, #0 \n\t"                    \
-    "adc r9, r9, #0 \n\t"                      \
-    "adds r12, r12, r10 \n\t"                  \
-    "adcs r8, r8, r11 \n\t"                    \
-    "adc r9, r9, #0 \n\t"                      \
-    "stmia r0!, {r12} \n\t"                    \
-                                               \
-    "mov r10, #0 \n\t"                         \
-    "umull r11, r12, r2, r4 \n\t"              \
-    "adds r11, r11, r11 \n\t"                  \
-    "adcs r12, r12, r12 \n\t"                  \
-    "adc r10, r10, #0 \n\t"                    \
-    "adds r8, r8, r11 \n\t"                    \
-    "adcs r9, r9, r12 \n\t"                    \
-    "adc r10, r10, #0 \n\t"                    \
-    "umull r11, r12, r3, r3 \n\t"              \
-    "adds r8, r8, r11 \n\t"                    \
-    "adcs r9, r9, r12 \n\t"                    \
-    "adc r10, r10, #0 \n\t"                    \
-    "stmia r0!, {r8} \n\t"                     \
-                                               \
-    "mov r12, #0 \n\t"                         \
-    "umull r8, r11, r2, r5 \n\t"               \
-    "mov r14, r11 \n\t"                        \
-    "umlal r8, r11, r3, r4 \n\t"               \
-    "cmp r14, r11 \n\t"                        \
-    "it hi \n\t"                               \
-    "adchi r12, r12, #0 \n\t"                  \
-    "adds r8, r8, r8 \n\t"                     \
-    "adcs r11, r11, r11 \n\t"                  \
-    "adc r12, r12, r12 \n\t"                   \
-    "adds r8, r8, r9 \n\t"                     \
-    "adcs r11, r11, r10 \n\t"                  \
-    "adc r12, r12, #0 \n\t"                    \
-    "stmia r0!, {r8} \n\t"                     \
-                                               \
-    "mov r10, #0 \n\t"                         \
-    "umull r8, r9, r2, r6 \n\t"                \
-    "mov r14, r9 \n\t"                         \
-    "umlal r8, r9, r3, r5 \n\t"                \
-    "cmp r14, r9 \n\t"                         \
-    "it hi \n\t"                               \
-    "adchi r10, r10, #0 \n\t"                  \
-    "adds r8, r8, r8 \n\t"                     \
-    "adcs r9, r9, r9 \n\t"                     \
-    "adc r10, r10, r10 \n\t"                   \
-    "mov r14, r9 \n\t"                         \
-    "umlal r8, r9, r4, r4 \n\t"                \
-    "cmp r14, r9 \n\t"                         \
-    "it hi \n\t"                               \
-    "adchi r10, r10, #0 \n\t"                  \
-    "adds r8, r8, r11 \n\t"                    \
-    "adcs r9, r9, r12 \n\t"                    \
-    "adc r10, r10, #0 \n\t"                    \
-    "stmia r0!, {r8} \n\t"                     \
-                                               \
-    "mov r12, #0 \n\t"                         \
-    "umull r8, r11, r2, r7 \n\t"               \
-    "mov r14, r11 \n\t"                        \
-    "umlal r8, r11, r3, r6 \n\t"               \
-    "cmp r14, r11 \n\t"                        \
-    "it hi \n\t"                               \
-    "adchi r12, r12, #0 \n\t"                  \
-    "mov r14, r11 \n\t"                        \
-    "umlal r8, r11, r4, r5 \n\t"               \
-    "cmp r14, r11 \n\t"                        \
-    "it hi \n\t"                               \
-    "adchi r12, r12, #0 \n\t"                  \
-    "adds r8, r8, r8 \n\t"                     \
-    "adcs r11, r11, r11 \n\t"                  \
-    "adc r12, r12, r12 \n\t"                   \
-    "adds r8, r8, r9 \n\t"                     \
-    "adcs r11, r11, r10 \n\t"                  \
-    "adc r12, r12, #0 \n\t"                    \
-    "stmia r0!, {r8} \n\t"                     \
-                                               \
-    "ldmia r1!, {r2} \n\t"                     \
-    "mov r10, #0 \n\t"                         \
-    "umull r8, r9, r3, r7 \n\t"                \
-    "mov r14, r9 \n\t"                         \
-    "umlal r8, r9, r4, r6 \n\t"                \
-    "cmp r14, r9 \n\t"                         \
-    "it hi \n\t"                               \
-    "adchi r10, r10, #0 \n\t"                  \
-    "ldr r14, [r0] \n\t"                       \
-    "adds r8, r8, r14 \n\t"                    \
-    "adcs r9, r9, #0 \n\t"                     \
-    "adc r10, r10, #0 \n\t"                    \
-    "adds r8, r8, r8 \n\t"                     \
-    "adcs r9, r9, r9 \n\t"                     \
-    "adc r10, r10, r10 \n\t"                   \
-    "mov r14, r9 \n\t"                         \
-    "umlal r8, r9, r5, r5 \n\t"                \
-    "cmp r14, r9 \n\t"                         \
-    "it hi \n\t"                               \
-    "adchi r10, r10, #0 \n\t"                  \
-    "adds r8, r8, r11 \n\t"                    \
-    "adcs r9, r9, r12 \n\t"                    \
-    "adc r10, r10, #0 \n\t"                    \
-    "stmia r0!, {r8} \n\t"                     \
-                                               \
-    "mov r12, #0 \n\t"                         \
-    "umull r8, r11, r3, r2 \n\t"               \
-    "mov r14, r11 \n\t"                        \
-    "umlal r8, r11, r4, r7 \n\t"               \
-    "cmp r14, r11 \n\t"                        \
-    "it hi \n\t"                               \
-    "adchi r12, r12, #0 \n\t"                  \
-    "mov r14, r11 \n\t"                        \
-    "umlal r8, r11, r5, r6 \n\t"               \
-    "cmp r14, r11 \n\t"                        \
-    "it hi \n\t"                               \
-    "adchi r12, r12, #0 \n\t"                  \
-    "ldr r14, [r0] \n\t"                       \
-    "adds r8, r8, r14 \n\t"                    \
-    "adcs r11, r11, #0 \n\t"                   \
-    "adc r12, r12, #0 \n\t"                    \
-    "adds r8, r8, r8 \n\t"                     \
-    "adcs r11, r11, r11 \n\t"                  \
-    "adc r12, r12, r12 \n\t"                   \
-    "adds r8, r8, r9 \n\t"                     \
-    "adcs r11, r11, r10 \n\t"                  \
-    "adc r12, r12, #0 \n\t"                    \
-    "stmia r0!, {r8} \n\t"                     \
-                                               \
-    "mov r10, #0 \n\t"                         \
-    "umull r8, r9, r4, r2 \n\t"                \
-    "mov r14, r9 \n\t"                         \
-    "umlal r8, r9, r5, r7 \n\t"                \
-    "cmp r14, r9 \n\t"                         \
-    "it hi \n\t"                               \
-    "adchi r10, r10, #0 \n\t"                  \
-    "adds r8, r8, r8 \n\t"                     \
-    "adcs r9, r9, r9 \n\t"                     \
-    "adc r10, r10, r10 \n\t"                   \
-    "mov r14, r9 \n\t"                         \
-    "umlal r8, r9, r6, r6 \n\t"                \
-    "cmp r14, r9 \n\t"                         \
-    "it hi \n\t"                               \
-    "adchi r10, r10, #0 \n\t"                  \
-    "adds r8, r8, r11 \n\t"                    \
-    "adcs r9, r9, r12 \n\t"                    \
-    "adc r10, r10, #0 \n\t"                    \
-    "stmia r0!, {r8} \n\t"                     \
-                                               \
-    "mov r12, #0 \n\t"                         \
-    "umull r8, r11, r5, r2 \n\t"               \
-    "mov r14, r11 \n\t"                        \
-    "umlal r8, r11, r6, r7 \n\t"               \
-    "cmp r14, r11 \n\t"                        \
-    "it hi \n\t"                               \
-    "adchi r12, r12, #0 \n\t"                  \
-    "adds r8, r8, r8 \n\t"                     \
-    "adcs r11, r11, r11 \n\t"                  \
-    "adc r12, r12, r12 \n\t"                   \
-    "adds r8, r8, r9 \n\t"                     \
-    "adcs r11, r11, r10 \n\t"                  \
-    "adc r12, r12, #0 \n\t"                    \
-    "stmia r0!, {r8} \n\t"                     \
-                                               \
-    "mov r8, #0 \n\t"                          \
-    "umull r1, r10, r6, r2 \n\t"               \
-    "adds r1, r1, r1 \n\t"                     \
-    "adcs r10, r10, r10 \n\t"                  \
-    "adc r8, r8, #0 \n\t"                      \
-    "adds r11, r11, r1 \n\t"                   \
-    "adcs r12, r12, r10 \n\t"                  \
-    "adc r8, r8, #0 \n\t"                      \
-    "umull r1, r10, r7, r7 \n\t"               \
-    "adds r11, r11, r1 \n\t"                   \
-    "adcs r12, r12, r10 \n\t"                  \
-    "adc r8, r8, #0 \n\t"                      \
-    "stmia r0!, {r11} \n\t"                    \
-                                               \
-    "mov r11, #0 \n\t"                         \
-    "umull r1, r10, r7, r2 \n\t"               \
-    "adds r1, r1, r1 \n\t"                     \
-    "adcs r10, r10, r10 \n\t"                  \
-    "adc r11, r11, #0 \n\t"                    \
-    "adds r12, r12, r1 \n\t"                   \
-    "adcs r8, r8, r10 \n\t"                    \
-    "adc r11, r11, #0 \n\t"                    \
-    "stmia r0!, {r12} \n\t"                    \
-                                               \
-    "umull r1, r10, r2, r2 \n\t"               \
-    "adds r8, r8, r1 \n\t"                     \
-    "adcs r11, r11, r10 \n\t"                  \
-    "stmia r0!, {r8, r11} \n\t"
+#define FAST_SQUARE_ASM_6_TO_7               \
+    "cmp r2, #6 \n\t"                        \
+    "beq 1f \n\t"                            \
+    /* Only reached when r2 != 6: fold a 7th input word into the words at r0 */ \
+    "sub r0, #24 \n\t"                       \
+    "sub r1, #24 \n\t"                       \
+    /* r0 and r1 have each been stepped back 6 words */ \
+    /* Do off-center multiplication: r12 (last word loaded) times r6-r11 */ \
+    "ldmia r1!, {r6,r7,r8,r9,r10,r11,r12} \n\t" \
+    "umull r3, r4, r6, r12 \n\t"             \
+    "umull r6, r5, r7, r12 \n\t"             \
+    "adds r4, r4, r6 \n\t"                   \
+    "umull r7, r6, r8, r12 \n\t"             \
+    "adcs r5, r5, r7 \n\t"                   \
+    "umull r8, r7, r9, r12 \n\t"             \
+    "adcs r6, r6, r8 \n\t"                   \
+    "umull r9, r8, r10, r12 \n\t"            \
+    "adcs r7, r7, r9 \n\t"                   \
+    "umull r10, r9, r11, r12 \n\t"           \
+    "adcs r8, r8, r10 \n\t"                  \
+    "adcs r9, r9, #0 \n\t"                   \
+    /* r3-r9 = sum of the six products, each placed one word higher than the last */ \
+    /* Multiply by 2 (r10 catches the carry out of r9) */ \
+    "mov r10, #0 \n\t"                       \
+    "adds r3, r3, r3 \n\t"                   \
+    "adcs r4, r4, r4 \n\t"                   \
+    "adcs r5, r5, r5 \n\t"                   \
+    "adcs r6, r6, r6 \n\t"                   \
+    "adcs r7, r7, r7 \n\t"                   \
+    "adcs r8, r8, r8 \n\t"                   \
+    "adcs r9, r9, r9 \n\t"                   \
+    "adcs r10, r10, #0 \n\t"                 \
+                                             \
+    /* Add into previous: six words read via r0 are added to r3-r8 */ \
+    "ldr r14, [r0], #4 \n\t"                 \
+    "adds r3, r3, r14 \n\t"                  \
+    "ldr r14, [r0], #4 \n\t"                 \
+    "adcs r4, r4, r14 \n\t"                  \
+    "ldr r14, [r0], #4 \n\t"                 \
+    "adcs r5, r5, r14 \n\t"                  \
+    "ldr r14, [r0], #4 \n\t"                 \
+    "adcs r6, r6, r14 \n\t"                  \
+    "ldr r14, [r0], #4 \n\t"                 \
+    "adcs r7, r7, r14 \n\t"                  \
+    "ldr r14, [r0], #4 \n\t"                 \
+    "adcs r8, r8, r14 \n\t"                  \
+    "adcs r9, r9, #0 \n\t"                   \
+    "adcs r10, r10, #0 \n\t"                 \
+    "sub r0, #24 \n\t"                       /* r0 back at the first word just read */ \
+                                             \
+    /* Perform center multiplication: r10:r9 += r12 * r12, then store 8 words */ \
+    "umlal r9, r10, r12, r12 \n\t"           \
+    "stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10} \n\t"
+
+#define FAST_SQUARE_ASM_7                          \
+    "push   {r2} \n\t"                             \
+    "ldmia r1!, {r2, r3, r4, r5, r6, r7, r8} \n\t" \
+    "push   {r1} \n\t"                             \
+    "sub r1, 4 \n\t"                               \
+    /* Squares the 7 words a0-a6 (now in r2-r8) into 14 words stored via r0; r1 points at a6 again */ \
+    "add r0, 24 \n\t"                              \
+    "umull r9, r10, r2, r8 \n\t"                   \
+    "stmia r0!, {r9, r10} \n\t"                    \
+    "sub r0, 32 \n\t"                              \
+    /* a0*a6 is parked in result words 6-7 because r8 is reused below; r0 is back at word 0 = low(a0*a0) */ \
+    "umull r11, r12, r2, r2 \n\t"                  \
+    "stmia r0!, {r11} \n\t"                        \
+    /* Word 1: 2*a0*a1 plus carry-in (the product is simply added twice) */ \
+    "mov r9, #0 \n\t"                              \
+    "umull r10, r11, r2, r3 \n\t"                  \
+    "adds r12, r12, r10 \n\t"                      \
+    "adcs r8, r11, #0 \n\t"                        \
+    "adc r9, r9, #0 \n\t"                          \
+    "adds r12, r12, r10 \n\t"                      \
+    "adcs r8, r8, r11 \n\t"                        \
+    "adc r9, r9, #0 \n\t"                          \
+    "stmia r0!, {r12} \n\t"                        \
+    /* Word 2: 2*a0*a2 + a1*a1 plus carry-in */ \
+    "mov r10, #0 \n\t"                             \
+    "umull r11, r12, r2, r4 \n\t"                  \
+    "adds r11, r11, r11 \n\t"                      \
+    "adcs r12, r12, r12 \n\t"                      \
+    "adc r10, r10, #0 \n\t"                        \
+    "adds r8, r8, r11 \n\t"                        \
+    "adcs r9, r9, r12 \n\t"                        \
+    "adc r10, r10, #0 \n\t"                        \
+    "umull r11, r12, r3, r3 \n\t"                  \
+    "adds r8, r8, r11 \n\t"                        \
+    "adcs r9, r9, r12 \n\t"                        \
+    "adc r10, r10, #0 \n\t"                        \
+    "stmia r0!, {r8} \n\t"                         \
+    /* Word 3: 2*(a0*a3 + a1*a2) plus carry-in. r14 keeps the old high word so cmp/adchi can record a umlal wrap-around. */ \
+    "mov r12, #0 \n\t"                             \
+    "umull r8, r11, r2, r5 \n\t"                   \
+    "mov r14, r11 \n\t"                            \
+    "umlal r8, r11, r3, r4 \n\t"                   \
+    "cmp r14, r11 \n\t"                            \
+    "it hi \n\t"                                   \
+    "adchi r12, r12, #0 \n\t"                      \
+    "adds r8, r8, r8 \n\t"                         \
+    "adcs r11, r11, r11 \n\t"                      \
+    "adc r12, r12, r12 \n\t"                       \
+    "adds r8, r8, r9 \n\t"                         \
+    "adcs r11, r11, r10 \n\t"                      \
+    "adc r12, r12, #0 \n\t"                        \
+    "stmia r0!, {r8} \n\t"                         \
+    /* Word 4: 2*(a0*a4 + a1*a3) + a2*a2 plus carry-in */ \
+    "mov r10, #0 \n\t"                             \
+    "umull r8, r9, r2, r6 \n\t"                    \
+    "mov r14, r9 \n\t"                             \
+    "umlal r8, r9, r3, r5 \n\t"                    \
+    "cmp r14, r9 \n\t"                             \
+    "it hi \n\t"                                   \
+    "adchi r10, r10, #0 \n\t"                      \
+    "adds r8, r8, r8 \n\t"                         \
+    "adcs r9, r9, r9 \n\t"                         \
+    "adc r10, r10, r10 \n\t"                       \
+    "mov r14, r9 \n\t"                             \
+    "umlal r8, r9, r4, r4 \n\t"                    \
+    "cmp r14, r9 \n\t"                             \
+    "it hi \n\t"                                   \
+    "adchi r10, r10, #0 \n\t"                      \
+    "adds r8, r8, r11 \n\t"                        \
+    "adcs r9, r9, r12 \n\t"                        \
+    "adc r10, r10, #0 \n\t"                        \
+    "stmia r0!, {r8} \n\t"                         \
+    /* Word 5: 2*(a0*a5 + a1*a4 + a2*a3) plus carry-in */ \
+    "mov r12, #0 \n\t"                             \
+    "umull r8, r11, r2, r7 \n\t"                   \
+    "mov r14, r11 \n\t"                            \
+    "umlal r8, r11, r3, r6 \n\t"                   \
+    "cmp r14, r11 \n\t"                            \
+    "it hi \n\t"                                   \
+    "adchi r12, r12, #0 \n\t"                      \
+    "mov r14, r11 \n\t"                            \
+    "umlal r8, r11, r4, r5 \n\t"                   \
+    "cmp r14, r11 \n\t"                            \
+    "it hi \n\t"                                   \
+    "adchi r12, r12, #0 \n\t"                      \
+    "adds r8, r8, r8 \n\t"                         \
+    "adcs r11, r11, r11 \n\t"                      \
+    "adc r12, r12, r12 \n\t"                       \
+    "adds r8, r8, r9 \n\t"                         \
+    "adcs r11, r11, r10 \n\t"                      \
+    "adc r12, r12, #0 \n\t"                        \
+    "stmia r0!, {r8} \n\t"                         \
+    /* Word 6: 2*(a1*a5 + a2*a4 + parked low word of a0*a6) + a3*a3 plus carry-in; r2 is reloaded with a6 */ \
+    "ldmia r1!, {r2} \n\t"                         \
+    "mov r10, #0 \n\t"                             \
+    "umull r8, r9, r3, r7 \n\t"                    \
+    "mov r14, r9 \n\t"                             \
+    "umlal r8, r9, r4, r6 \n\t"                    \
+    "cmp r14, r9 \n\t"                             \
+    "it hi \n\t"                                   \
+    "adchi r10, r10, #0 \n\t"                      \
+    "ldr r14, [r0] \n\t"                           \
+    "adds r8, r8, r14 \n\t"                        \
+    "adcs r9, r9, #0 \n\t"                         \
+    "adc r10, r10, #0 \n\t"                        \
+    "adds r8, r8, r8 \n\t"                         \
+    "adcs r9, r9, r9 \n\t"                         \
+    "adc r10, r10, r10 \n\t"                       \
+    "mov r14, r9 \n\t"                             \
+    "umlal r8, r9, r5, r5 \n\t"                    \
+    "cmp r14, r9 \n\t"                             \
+    "it hi \n\t"                                   \
+    "adchi r10, r10, #0 \n\t"                      \
+    "adds r8, r8, r11 \n\t"                        \
+    "adcs r9, r9, r12 \n\t"                        \
+    "adc r10, r10, #0 \n\t"                        \
+    "stmia r0!, {r8} \n\t"                         \
+    /* Word 7: 2*(a1*a6 + a2*a5 + a3*a4 + parked high word of a0*a6) plus carry-in */ \
+    "mov r12, #0 \n\t"                             \
+    "umull r8, r11, r3, r2 \n\t"                   \
+    "mov r14, r11 \n\t"                            \
+    "umlal r8, r11, r4, r7 \n\t"                   \
+    "cmp r14, r11 \n\t"                            \
+    "it hi \n\t"                                   \
+    "adchi r12, r12, #0 \n\t"                      \
+    "mov r14, r11 \n\t"                            \
+    "umlal r8, r11, r5, r6 \n\t"                   \
+    "cmp r14, r11 \n\t"                            \
+    "it hi \n\t"                                   \
+    "adchi r12, r12, #0 \n\t"                      \
+    "ldr r14, [r0] \n\t"                           \
+    "adds r8, r8, r14 \n\t"                        \
+    "adcs r11, r11, #0 \n\t"                       \
+    "adc r12, r12, #0 \n\t"                        \
+    "adds r8, r8, r8 \n\t"                         \
+    "adcs r11, r11, r11 \n\t"                      \
+    "adc r12, r12, r12 \n\t"                       \
+    "adds r8, r8, r9 \n\t"                         \
+    "adcs r11, r11, r10 \n\t"                      \
+    "adc r12, r12, #0 \n\t"                        \
+    "stmia r0!, {r8} \n\t"                         \
+    /* Word 8: 2*(a2*a6 + a3*a5) + a4*a4 plus carry-in */ \
+    "mov r10, #0 \n\t"                             \
+    "umull r8, r9, r4, r2 \n\t"                    \
+    "mov r14, r9 \n\t"                             \
+    "umlal r8, r9, r5, r7 \n\t"                    \
+    "cmp r14, r9 \n\t"                             \
+    "it hi \n\t"                                   \
+    "adchi r10, r10, #0 \n\t"                      \
+    "adds r8, r8, r8 \n\t"                         \
+    "adcs r9, r9, r9 \n\t"                         \
+    "adc r10, r10, r10 \n\t"                       \
+    "mov r14, r9 \n\t"                             \
+    "umlal r8, r9, r6, r6 \n\t"                    \
+    "cmp r14, r9 \n\t"                             \
+    "it hi \n\t"                                   \
+    "adchi r10, r10, #0 \n\t"                      \
+    "adds r8, r8, r11 \n\t"                        \
+    "adcs r9, r9, r12 \n\t"                        \
+    "adc r10, r10, #0 \n\t"                        \
+    "stmia r0!, {r8} \n\t"                         \
+    /* Word 9: 2*(a3*a6 + a4*a5) plus carry-in */ \
+    "mov r12, #0 \n\t"                             \
+    "umull r8, r11, r5, r2 \n\t"                   \
+    "mov r14, r11 \n\t"                            \
+    "umlal r8, r11, r6, r7 \n\t"                   \
+    "cmp r14, r11 \n\t"                            \
+    "it hi \n\t"                                   \
+    "adchi r12, r12, #0 \n\t"                      \
+    "adds r8, r8, r8 \n\t"                         \
+    "adcs r11, r11, r11 \n\t"                      \
+    "adc r12, r12, r12 \n\t"                       \
+    "adds r8, r8, r9 \n\t"                         \
+    "adcs r11, r11, r10 \n\t"                      \
+    "adc r12, r12, #0 \n\t"                        \
+    "stmia r0!, {r8} \n\t"                         \
+    /* Word 10: 2*a4*a6 + a5*a5 plus carry-in; r1 is used as scratch from here until the final pop */ \
+    "mov r8, #0 \n\t"                              \
+    "umull r1, r10, r6, r2 \n\t"                   \
+    "adds r1, r1, r1 \n\t"                         \
+    "adcs r10, r10, r10 \n\t"                      \
+    "adc r8, r8, #0 \n\t"                          \
+    "adds r11, r11, r1 \n\t"                       \
+    "adcs r12, r12, r10 \n\t"                      \
+    "adc r8, r8, #0 \n\t"                          \
+    "umull r1, r10, r7, r7 \n\t"                   \
+    "adds r11, r11, r1 \n\t"                       \
+    "adcs r12, r12, r10 \n\t"                      \
+    "adc r8, r8, #0 \n\t"                          \
+    "stmia r0!, {r11} \n\t"                        \
+    /* Word 11: 2*a5*a6 plus carry-in */ \
+    "mov r11, #0 \n\t"                             \
+    "umull r1, r10, r7, r2 \n\t"                   \
+    "adds r1, r1, r1 \n\t"                         \
+    "adcs r10, r10, r10 \n\t"                      \
+    "adc r11, r11, #0 \n\t"                        \
+    "adds r12, r12, r1 \n\t"                       \
+    "adcs r8, r8, r10 \n\t"                        \
+    "adc r11, r11, #0 \n\t"                        \
+    "stmia r0!, {r12} \n\t"                        \
+    /* Words 12-13: a6*a6 plus carry-in, then restore r1 (as advanced by the first ldmia) and the entry value of r2 */ \
+    "umull r1, r10, r2, r2 \n\t"                   \
+    "adds r8, r8, r1 \n\t"                         \
+    "adcs r11, r11, r10 \n\t"                      \
+    "stmia r0!, {r8, r11} \n\t"                    \
+    "pop {r1, r2} \n\t"
+
+#define FAST_SQUARE_ASM_7_TO_8           \
+    "cmp r2, #7 \n\t"                    \
+    "beq 1f \n\t"                        \
+    /* Only reached when r2 != 7: fold an 8th input word into the words at r0 */ \
+    "sub r0, #28 \n\t"                   \
+    "sub r1, #28 \n\t"                   \
+    /* r0 and r1 have each been stepped back 7 words */ \
+    /* Do off-center multiplication: r14 (last word loaded) times r6-r12 */ \
+    "ldmia r1!, {r6,r7,r8,r9,r10,r11,r12,r14} \n\t" \
+    "umull r3, r4, r6, r14 \n\t"         \
+    "umull r6, r5, r7, r14 \n\t"         \
+    "adds r4, r4, r6 \n\t"               \
+    "umull r7, r6, r8, r14 \n\t"         \
+    "adcs r5, r5, r7 \n\t"               \
+    "umull r8, r7, r9, r14 \n\t"         \
+    "adcs r6, r6, r8 \n\t"               \
+    "umull r9, r8, r10, r14 \n\t"        \
+    "adcs r7, r7, r9 \n\t"               \
+    "umull r10, r9, r11, r14 \n\t"       \
+    "adcs r8, r8, r10 \n\t"              \
+    "umull r11, r10, r12, r14 \n\t"      \
+    "adcs r9, r9, r11 \n\t"              \
+    "adcs r10, r10, #0 \n\t"             \
+    /* r3-r10 = sum of the seven products, each placed one word higher than the last */ \
+    /* Multiply by 2 (r11 catches the carry out of r10) */ \
+    "mov r11, #0 \n\t"                   \
+    "adds r3, r3, r3 \n\t"               \
+    "adcs r4, r4, r4 \n\t"               \
+    "adcs r5, r5, r5 \n\t"               \
+    "adcs r6, r6, r6 \n\t"               \
+    "adcs r7, r7, r7 \n\t"               \
+    "adcs r8, r8, r8 \n\t"               \
+    "adcs r9, r9, r9 \n\t"               \
+    "adcs r10, r10, r10 \n\t"            \
+    "adcs r11, r11, #0 \n\t"             \
+                                         \
+    /* Add into previous: seven words read via r0 are added to r3-r9 (r12 is free as scratch now) */ \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adds r3, r3, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r4, r4, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r5, r5, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r6, r6, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r7, r7, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r8, r8, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r9, r9, r12 \n\t"              \
+    "adcs r10, r10, #0 \n\t"             \
+    "adcs r11, r11, #0 \n\t"             \
+    "sub r0, #28 \n\t"                   /* r0 back at the first word just read */ \
+                                         \
+    /* Perform center multiplication: r11:r10 += r14 * r14, then store 9 words */ \
+    "umlal r10, r11, r14, r14 \n\t"      \
+    "stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10,r11} \n\t"
 
 #define FAST_SQUARE_ASM_8                   \
-    "ldmia r1!, {r2, r3} \n\t"              \
-    "add r1, 16 \n\t"                       \
-    "ldmia r1!, {r5, r6} \n\t"              \
+    "push   {r2} \n\t"                      \
+    "ldmia r1!, {r2,r3,r4,r5,r6,r7,r8,r9} \n\t" \
+    "push   {r1} \n\t"                      \
+    "sub r1, 8 \n\t"                        \
+                                            \
     "add r0, 24 \n\t"                       \
-                                            \
-    "umull r8, r9, r2, r5 \n\t"             \
-    "stmia r0!, {r8} \n\t"                  \
-                                            \
-    "umull r12, r10, r2, r6 \n\t"           \
-    "adds r9, r9, r12 \n\t"                 \
-    "adc r10, r10, #0 \n\t"                 \
-    "stmia r0!, {r9} \n\t"                  \
-                                            \
-    "umull r8, r9, r3, r6 \n\t"             \
-    "adds r10, r10, r8 \n\t"                \
-    "adc r11, r9, #0 \n\t"                  \
-    "stmia r0!, {r10, r11} \n\t"            \
-                                            \
+    "umull r10, r11, r2, r8 \n\t"           \
+    "umull r12, r14, r2, r9 \n\t"           \
+    "umull r8, r9, r3, r9 \n\t"             \
+    "adds r11, r11, r12 \n\t"               \
+    "adcs r12, r14, r8 \n\t"                \
+    "adcs r14, r9, #0 \n\t"                 \
+    "stmia r0!, {r10, r11, r12, r14} \n\t"  \
     "sub r0, 40 \n\t"                       \
-    "sub r1, 32 \n\t"                       \
-    "ldmia r1!, {r2,r3,r4,r5,r6,r7} \n\t"   \
                                             \
     "umull r11, r12, r2, r2 \n\t"           \
     "stmia r0!, {r11} \n\t"                 \
@@ -1803,6 +2305,7 @@
     "umull r1, r10, r3, r3 \n\t"            \
     "adds r8, r8, r1 \n\t"                  \
     "adcs r11, r11, r10 \n\t"               \
-    "stmia r0!, {r8, r11} \n\t"
+    "stmia r0!, {r8, r11} \n\t"             \
+    "pop {r1, r2} \n\t"
 
 #endif /* _UECC_ASM_ARM_MULT_SQUARE_H_ */
diff --git a/asm_arm_mult_square_umaal.inc b/asm_arm_mult_square_umaal.inc
new file mode 100644
index 0000000..c554d20
--- /dev/null
+++ b/asm_arm_mult_square_umaal.inc
@@ -0,0 +1,1202 @@
+/* Copyright 2015, Kenneth MacKay. Licensed under the BSD 2-clause license. */
+
+#ifndef _UECC_ASM_ARM_MULT_SQUARE_H_
+#define _UECC_ASM_ARM_MULT_SQUARE_H_
+
+#define FAST_MULT_ASM_5                     \
+    "push   {r3} \n\t"                      \
+    "ldmia  r2!, {r3, r4, r5, r6, r7} \n\t" \
+    "push   {r2} \n\t"                      \
+    /* 5x5-word multiply: r3-r7 hold the five words read via r2; each row below takes one word via r1 (into r2) and stores one result word via r0 */ \
+    "ldr    r2, [r1], #4 \n\t"              \
+    "umull  r8, r9, r3, r2 \n\t"            \
+    "str    r8, [r0], #4 \n\t"              \
+    "mov    r10, #0 \n\t"                   \
+    "umaal  r9, r10, r4, r2 \n\t"           \
+    "mov    r11, #0 \n\t"                   \
+    "umaal  r10, r11, r5, r2 \n\t"          \
+    "mov    r12, #0 \n\t"                   \
+    "umaal  r11, r12, r6, r2 \n\t"          \
+    "mov    r14, #0 \n\t"                   \
+    "umaal  r12, r14, r7, r2 \n\t"          \
+    /* Row 2. umaal lo, hi, a, b sets hi:lo = a*b + lo + hi, so r9-r14 carry the running sums and r8 (zeroed) receives the finished word */ \
+    "ldr    r2, [r1], #4 \n\t"              \
+    "mov    r8, #0 \n\t"                    \
+    "umaal  r8, r9, r3, r2 \n\t"            \
+    "str    r8, [r0], #4 \n\t"              \
+    "umaal  r9, r10, r4, r2 \n\t"           \
+    "umaal  r10, r11, r5, r2 \n\t"          \
+    "umaal  r11, r12, r6, r2 \n\t"          \
+    "umaal  r12, r14, r7, r2 \n\t"          \
+    /* Row 3 */ \
+    "ldr    r2, [r1], #4 \n\t"              \
+    "mov    r8, #0 \n\t"                    \
+    "umaal  r8, r9, r3, r2 \n\t"            \
+    "str    r8, [r0], #4 \n\t"              \
+    "umaal  r9, r10, r4, r2 \n\t"           \
+    "umaal  r10, r11, r5, r2 \n\t"          \
+    "umaal  r11, r12, r6, r2 \n\t"          \
+    "umaal  r12, r14, r7, r2 \n\t"          \
+    /* Row 4 */ \
+    "ldr    r2, [r1], #4 \n\t"              \
+    "mov    r8, #0 \n\t"                    \
+    "umaal  r8, r9, r3, r2 \n\t"            \
+    "str    r8, [r0], #4 \n\t"              \
+    "umaal  r9, r10, r4, r2 \n\t"           \
+    "umaal  r10, r11, r5, r2 \n\t"          \
+    "umaal  r11, r12, r6, r2 \n\t"          \
+    "umaal  r12, r14, r7, r2 \n\t"          \
+    /* Row 5 */ \
+    "ldr    r2, [r1], #4 \n\t"              \
+    "mov    r8, #0 \n\t"                    \
+    "umaal  r8, r9, r3, r2 \n\t"            \
+    "str    r8, [r0], #4 \n\t"              \
+    "umaal  r9, r10, r4, r2 \n\t"           \
+    "umaal  r10, r11, r5, r2 \n\t"          \
+    "umaal  r11, r12, r6, r2 \n\t"          \
+    "umaal  r12, r14, r7, r2 \n\t"          \
+    /* Store the five accumulator words as the upper half of the 10-word result */ \
+    "str    r9, [r0], #4 \n\t"              \
+    "str    r10, [r0], #4 \n\t"             \
+    "str    r11, [r0], #4 \n\t"             \
+    "str    r12, [r0], #4 \n\t"             \
+    "str    r14, [r0], #4 \n\t"             \
+    /* Restore r2 (as advanced by the ldmia) and the entry value of r3 */ \
+    "pop   {r2, r3} \n\t"
+
+#define FAST_MULT_ASM_5_TO_6                 \
+    "cmp r3, #5 \n\t"                        \
+    "beq 1f \n\t"                            \
+    /* Only reached when r3 != 5: add the products involving the 6th word of each operand */ \
+    /* r4 = left high */                     \
+    "ldr r4, [r1] \n\t"                      \
+    /* Step r0, r1 and r2 back 5 words each */ \
+    "sub r0, #20 \n\t"                       \
+    "sub r1, #20 \n\t"                       \
+    "sub r2, #20 \n\t"                       \
+                                             \
+    /* Do right side: r4 times five r2 words, added to five words read via r0 */ \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "mov r5, #0 \n\t"                        \
+    "ldr r6, [r0], #4 \n\t"                  \
+    "umaal  r5, r6, r4, r14 \n\t"            \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r7, [r0], #4 \n\t"                  \
+    "umaal  r6, r7, r4, r14 \n\t"            \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r8, [r0], #4 \n\t"                  \
+    "umaal  r7, r8, r4, r14 \n\t"            \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r9, [r0], #4 \n\t"                  \
+    "umaal  r8, r9, r4, r14 \n\t"            \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r10, [r0], #4 \n\t"                 \
+    "umaal  r9, r10, r4, r14 \n\t"           \
+    "sub r0, #20 \n\t"                       \
+    /* r5-r9 = updated words, r10 = carry word; r0 is back at the first word read */ \
+    /* r4 = right high */                    \
+    "ldr r4, [r2], #4 \n\t"                  \
+                                             \
+    /* Do left side: r4 times six r1 words, added onto r5-r10 and stored via r0 */ \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "mov r12, #0 \n\t"                       \
+    "umaal  r12, r5, r4, r14 \n\t"           \
+    "str r12, [r0], #4 \n\t"                 \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r5, r6, r4, r14 \n\t"            \
+    "str r5, [r0], #4 \n\t"                  \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r6, r7, r4, r14 \n\t"            \
+    "str r6, [r0], #4 \n\t"                  \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r7, r8, r4, r14 \n\t"            \
+    "str r7, [r0], #4 \n\t"                  \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"            \
+    "str r8, [r0], #4 \n\t"                  \
+    /* Sixth r1 word: both words of the final umaal result are stored */ \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r9, r10, r4, r14 \n\t"           \
+    "stmia r0!, {r9, r10} \n\t"
+
+#define FAST_MULT_ASM_6                  \
+    "ldmia  r2!, {r4, r5, r6} \n\t"      \
+    /* 6x6-word multiply in two passes of three r2 words. Pass 1: r4-r6 times each of six r1 words (loaded into r14) */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "umull  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "mov    r10, #0 \n\t"                \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "mov    r11, #0 \n\t"                \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    /* Pass 1, row 2: r8 (zeroed) receives the finished word, r9-r11 carry the running sums */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    /* Pass 1, row 3 */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    /* Pass 1, row 4 */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    /* Pass 1, row 5 */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    /* Pass 1, row 6 */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    /* Store the three accumulator words: nine result words written so far */ \
+    "str    r9, [r0], #4 \n\t"           \
+    "str    r10, [r0], #4 \n\t"          \
+    "str    r11, [r0], #4 \n\t"          \
+    /* Pass 2 setup: r0 back to result word 3, r1 back to its first word, r4-r6 = next three r2 words */ \
+    "sub r0, #24 \n\t"                   \
+    "sub r1, #24 \n\t"                   \
+    "ldmia  r2!, {r4, r5, r6} \n\t"      \
+    /* Pass 2, row 1: r8 is loaded from the result so the pass-1 word is added in */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "mov    r9, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "mov    r10, #0 \n\t"                \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "mov    r11, #0 \n\t"                \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    /* Pass 2, row 2 */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    /* Pass 2, row 3 */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    /* Pass 2, row 4 */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    /* Pass 2, row 5 */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    /* Pass 2, row 6 */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    /* Store the last three words (result words 9-11) */ \
+    "str    r9, [r0], #4 \n\t"           \
+    "str    r10, [r0], #4 \n\t"          \
+    "str    r11, [r0], #4 \n\t"
+
+#define FAST_MULT_ASM_6_TO_7                 \
+    "cmp r3, #6 \n\t"                        \
+    "beq 1f \n\t"                            \
+    /* Only reached when r3 != 6: add the products involving the 7th word of each operand */ \
+    /* r4 = left high */                     \
+    "ldr r4, [r1] \n\t"                      \
+    /* Step r0, r1 and r2 back 6 words each */ \
+    "sub r0, #24 \n\t"                       \
+    "sub r1, #24 \n\t"                       \
+    "sub r2, #24 \n\t"                       \
+                                             \
+    /* Do right side: r4 times six r2 words, added to six words read via r0 */ \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "mov r5, #0 \n\t"                        \
+    "ldr r6, [r0], #4 \n\t"                  \
+    "umaal  r5, r6, r4, r14 \n\t"            \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r7, [r0], #4 \n\t"                  \
+    "umaal  r6, r7, r4, r14 \n\t"            \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r8, [r0], #4 \n\t"                  \
+    "umaal  r7, r8, r4, r14 \n\t"            \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r9, [r0], #4 \n\t"                  \
+    "umaal  r8, r9, r4, r14 \n\t"            \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r10, [r0], #4 \n\t"                 \
+    "umaal  r9, r10, r4, r14 \n\t"           \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r11, [r0], #4 \n\t"                 \
+    "umaal  r10, r11, r4, r14 \n\t"          \
+    "sub r0, #24 \n\t"                       \
+    /* r5-r10 = updated words, r11 = carry word; r0 is back at the first word read */ \
+    /* r4 = right high */                    \
+    "ldr r4, [r2], #4 \n\t"                  \
+                                             \
+    /* Do left side: r4 times seven r1 words, added onto r5-r11 and stored via r0 */ \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "mov r12, #0 \n\t"                       \
+    "umaal  r12, r5, r4, r14 \n\t"           \
+    "str r12, [r0], #4 \n\t"                 \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r5, r6, r4, r14 \n\t"            \
+    "str r5, [r0], #4 \n\t"                  \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r6, r7, r4, r14 \n\t"            \
+    "str r6, [r0], #4 \n\t"                  \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r7, r8, r4, r14 \n\t"            \
+    "str r7, [r0], #4 \n\t"                  \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"            \
+    "str r8, [r0], #4 \n\t"                  \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r9, r10, r4, r14 \n\t"           \
+    "str r9, [r0], #4 \n\t"                  \
+    /* Seventh r1 word: both words of the final umaal result are stored */ \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r10, r11, r4, r14 \n\t"          \
+    "stmia r0!, {r10, r11} \n\t"
+
+#define FAST_MULT_ASM_7 /* 7x7 words: [r0] (14 words) = [r1] (7 words) * [r2] (7 words); clobbers r4-r12, r14 */ \
+    "ldmia  r2!, {r4, r5, r6, r7} \n\t"  \
+    /* Pass 1, row 0: r4-r7 = first four [r2] words times the first [r1] word; word 0 is stored, r9-r12 carry the rest. */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "umull  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "mov    r10, #0 \n\t"                \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "mov    r11, #0 \n\t"                \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "mov    r12, #0 \n\t"                \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+    /* Rows 1-6: UMAAL (RdHi:RdLo = Rn*Rm + RdLo + RdHi) folds each product into r9-r12; one finished word is stored per row. */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+    /* Flush the four carry words left over from pass 1 (result words 7-10). */ \
+    "str    r9, [r0], #4 \n\t"           \
+    "str    r10, [r0], #4 \n\t"          \
+    "str    r11, [r0], #4 \n\t"          \
+    "str    r12, [r0], #4 \n\t"          \
+    /* Pass 2: rewind r0 to result word 4 (11 stored - 7) and r1 to its first word; r4-r6 = last three [r2] words. */ \
+    "sub r0, #28 \n\t"                   \
+    "sub r1, #28 \n\t"                   \
+    "ldmia  r2!, {r4, r5, r6} \n\t"      \
+    /* Row 0 of pass 2: r8 is loaded from the existing result word so the product is added to it; carries restart at 0. */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "mov    r9, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "mov    r10, #0 \n\t"                \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "mov    r11, #0 \n\t"                \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    /* Rows 1-6: same read-modify-write on the result words written by pass 1. */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    /* Store the top three words (result words 11-13); r0, r1 and r2 end just past their 14/7/7 words. */ \
+    "str    r9, [r0], #4 \n\t"           \
+    "str    r10, [r0], #4 \n\t"          \
+    "str    r11, [r0], #4 \n\t"
+
+#define FAST_MULT_ASM_7_TO_8 /* Grow a 7-word product to 8 words; branches to 1f (outside this macro) when r3 == 7 */ \
+    "cmp r3, #7 \n\t"                        \
+    "beq 1f \n\t"                            \
+    "push {r3} \n\t"                         \
+    /* r3 doubles as scratch below, hence the push/pop. Assumes r0/r1/r2 sit just past a 7x7 product - confirm at the expansion site. */ \
+    /* r4 = left high (the word r1 currently points at) */ \
+    "ldr r4, [r1] \n\t"                      \
+    /* Step r0, r1 and r2 back by seven words each. */ \
+    "sub r0, #28 \n\t"                       \
+    "sub r1, #28 \n\t"                       \
+    "sub r2, #28 \n\t"                       \
+                                             \
+    /* Do right side: add (left high) * (seven words at r2) onto the seven result words at r0; the sums stay in r5-r12 */ \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "mov r5, #0 \n\t"                        \
+    "ldr r6, [r0], #4 \n\t"                  \
+    "umaal  r5, r6, r4, r14 \n\t"            \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r7, [r0], #4 \n\t"                  \
+    "umaal  r6, r7, r4, r14 \n\t"            \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r8, [r0], #4 \n\t"                  \
+    "umaal  r7, r8, r4, r14 \n\t"            \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r9, [r0], #4 \n\t"                  \
+    "umaal  r8, r9, r4, r14 \n\t"            \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r10, [r0], #4 \n\t"                 \
+    "umaal  r9, r10, r4, r14 \n\t"           \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r11, [r0], #4 \n\t"                 \
+    "umaal  r10, r11, r4, r14 \n\t"          \
+    "ldr r14, [r2], #4 \n\t"                 \
+    "ldr r12, [r0], #4 \n\t"                 \
+    "umaal  r11, r12, r4, r14 \n\t"          \
+    "sub r0, #28 \n\t"                       \
+                                             \
+    /* r4 = right high (the next word at r2, which is post-incremented past it) */ \
+    "ldr r4, [r2], #4 \n\t"                  \
+                                             \
+    /* Do left side: add (right high) * (seven words at r1) to r5-r12, storing each finished word back at r0 */ \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "mov r3, #0 \n\t"                        \
+    "umaal  r3, r5, r4, r14 \n\t"            \
+    "str r3, [r0], #4 \n\t"                  \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r5, r6, r4, r14 \n\t"            \
+    "str r5, [r0], #4 \n\t"                  \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r6, r7, r4, r14 \n\t"            \
+    "str r6, [r0], #4 \n\t"                  \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r7, r8, r4, r14 \n\t"            \
+    "str r7, [r0], #4 \n\t"                  \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"            \
+    "str r8, [r0], #4 \n\t"                  \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r9, r10, r4, r14 \n\t"           \
+    "str r9, [r0], #4 \n\t"                  \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r10, r11, r4, r14 \n\t"          \
+    "str r10, [r0], #4 \n\t"                 \
+    /* Final term: r1 is back at the left high word; high * high plus r11/r12 gives the top two result words. */ \
+    "ldr r14, [r1], #4 \n\t"                 \
+    "umaal  r11, r12, r4, r14 \n\t"          \
+    "stmia r0!, {r11, r12} \n\t"             \
+    "pop {r3} \n\t"
+
+#define FAST_MULT_ASM_8 /* 8x8 words: [r0] (16 words) = [r1] (8 words) * [r2] (8 words); clobbers r4-r12, r14 */ \
+    "ldmia  r2!, {r4, r5, r6, r7} \n\t"  \
+    /* Pass 1, row 0: r4-r7 = first four [r2] words times the first [r1] word; word 0 is stored, r9-r12 carry the rest. */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "umull  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "mov    r10, #0 \n\t"                \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "mov    r11, #0 \n\t"                \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "mov    r12, #0 \n\t"                \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+    /* Rows 1-7: UMAAL (RdHi:RdLo = Rn*Rm + RdLo + RdHi) folds each product into r9-r12; one finished word is stored per row. */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "mov    r8, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+    /* Flush the four carry words left over from pass 1 (result words 8-11). */ \
+    "str    r9, [r0], #4 \n\t"           \
+    "str    r10, [r0], #4 \n\t"          \
+    "str    r11, [r0], #4 \n\t"          \
+    "str    r12, [r0], #4 \n\t"          \
+    /* Pass 2: rewind r0 to result word 4 (12 stored - 8) and r1 to its first word; r4-r7 = last four [r2] words. */ \
+    "sub r0, #32 \n\t"                   \
+    "sub r1, #32 \n\t"                   \
+    "ldmia  r2!, {r4, r5, r6, r7} \n\t"  \
+    /* Row 0 of pass 2: r8 is loaded from the existing result word so the product is added to it; carries restart at 0. */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "mov    r9, #0 \n\t"                 \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "mov    r10, #0 \n\t"                \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "mov    r11, #0 \n\t"                \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "mov    r12, #0 \n\t"                \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+    /* Rows 1-7: same read-modify-write on the result words written by pass 1. */ \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+                                         \
+    "ldr    r14, [r1], #4 \n\t"          \
+    "ldr    r8, [r0] \n\t"               \
+    "umaal  r8, r9, r4, r14 \n\t"        \
+    "str    r8, [r0], #4 \n\t"           \
+    "umaal  r9, r10, r5, r14 \n\t"       \
+    "umaal  r10, r11, r6, r14 \n\t"      \
+    "umaal  r11, r12, r7, r14 \n\t"      \
+    /* Store the top four words (result words 12-15); r0, r1 and r2 end just past their 16/8/8 words. */ \
+    "str    r9, [r0], #4 \n\t"           \
+    "str    r10, [r0], #4 \n\t"          \
+    "str    r11, [r0], #4 \n\t"          \
+    "str    r12, [r0], #4 \n\t"
+
+#define FAST_SQUARE_ASM_5 /* 5-word square: [r0] (10 words) = ([r1] (5 words))^2; r1 (after its post-increment) and r2 are restored on exit */ \
+    "ldmia r1!, {r9,r10,r11,r12,r14} \n\t" \
+    "push {r1, r2} \n\t"                \
+    /* a0..a4 = r9,r10,r11,r12,r14. Cross products a0*a1..a0*a4 -> result words 1-5 in r1,r2,r3,r4,r5. */ \
+    "umull r1, r2, r10, r9 \n\t"        \
+    "mov r3, #0 \n\t"                   \
+    "umaal r2, r3, r11, r9 \n\t"        \
+    "mov r4, #0 \n\t"                   \
+    "umaal r3, r4, r12, r9 \n\t"        \
+    "mov r5, #0 \n\t"                   \
+    "umaal r4, r5, r14, r9 \n\t"        \
+    /* Add a1*a2 and a1*a3, then double words 1-4 (r1,r2,r6,r3): every cross product occurs twice in a square. */ \
+    "mov r6, #0 \n\t"                   \
+    "umaal r6, r3, r11, r10 \n\t"       \
+    "umaal r3, r4, r12, r10 \n\t"       \
+    "adds r1, r1, r1 \n\t"              \
+    "adcs r2, r2, r2 \n\t"              \
+    "adcs r6, r6, r6 \n\t"              \
+    "adcs r3, r3, r3 \n\t"              \
+    /* a0^2 and a1^2 supply words 0-3; the flag-preserving UMULL/MOV keep the doubling carry alive until the ADC. */ \
+    "umull r7, r8, r9, r9 \n\t"         \
+    /* Store carry (out of the doubling) in r9; a0 is not needed after this point */ \
+    "mov r9, #0 \n\t"                   \
+    "adc r9, r9, #0 \n\t"               \
+    "adds r8, r8, r1 \n\t"              \
+    "stmia r0!, {r7,r8} \n\t"           \
+                                        \
+    "umull r7, r8, r10, r10 \n\t"       \
+    "adcs r7, r7, r2 \n\t"              \
+    "adcs r8, r8, r6 \n\t"              \
+    "stmia r0!, {r7,r8} \n\t"           \
+    /* Fold in a1*a4 before r10 (a1) is recycled as a carry holder. */ \
+    "umaal r4, r5, r14, r10 \n\t"       \
+    /* Store carry (out of words 0-3) in r10 */ \
+    "mov r10, #0 \n\t"                  \
+    "adc r10, r10, #0 \n\t"             \
+    /* Remaining cross products a2*a3, a2*a4, a3*a4 -> words 5-8 in r1,r4,r2,r5. */ \
+    "mov r1, #0 \n\t"                   \
+    "umaal r1, r4, r12, r11 \n\t"       \
+    "umaal r4, r5, r14, r11 \n\t"       \
+                                        \
+    "mov r2, #0 \n\t"                   \
+    "umaal r2, r5, r14, r12 \n\t"       \
+    /* Load carry from r9 (LSRS shifts bit 0 into C and leaves r9 = 0), then double words 5-8 */ \
+    "lsrs r9, #1 \n\t"                  \
+    "adcs r1, r1, r1 \n\t"              \
+    "adcs r4, r4, r4 \n\t"              \
+    "adcs r2, r2, r2 \n\t"              \
+    "adcs r5, r5, r5 \n\t"              \
+    /* r9 is 0 now, so this captures the doubling carry as word 9 */ \
+    "adc r9, r9, #0 \n\t"               \
+                                        \
+    /* Use carry from r10: UMAAL adds it and r3 (doubled word 4) to a2^2 */ \
+    "umaal r3, r10, r11, r11 \n\t"      \
+    "adds r10, r10, r1 \n\t"            \
+    "stmia r0!, {r3,r10} \n\t"          \
+                                        \
+    "umull r6, r10, r12, r12 \n\t"      \
+    "adcs r6, r6, r4 \n\t"              \
+    "adcs r10, r10, r2 \n\t"            \
+    "stmia r0!, {r6,r10} \n\t"          \
+                                        \
+    "umull r6, r10, r14, r14 \n\t"      \
+    "adcs r6, r6, r5 \n\t"              \
+    "adcs r10, r10, r9 \n\t"            \
+    "stmia r0!, {r6,r10} \n\t"          \
+    "pop {r1, r2} \n\t"
+
+#define FAST_SQUARE_ASM_5_TO_6 /* Grow a 5-word square to 6 words; branches to 1f (outside this macro) when r2 == 5 */ \
+    "cmp r2, #5 \n\t"                    \
+    "beq 1f \n\t"                        \
+    /* Step r0 and r1 back five words. Assumes they sit just past a 5-word square - confirm at the expansion site. */ \
+    "sub r0, #20 \n\t"                   \
+    "sub r1, #20 \n\t"                   \
+                                         \
+    /* Do off-center multiplication: r14 (sixth input word) times each of the first five words -> r3-r8 */ \
+    "ldmia r1!, {r5,r6,r7,r8,r9,r14} \n\t" \
+    "umull r3, r4, r5, r14 \n\t"         \
+    "mov r5, #0 \n\t"                    \
+    "umaal r4, r5, r6, r14 \n\t"         \
+    "mov r6, #0 \n\t"                    \
+    "umaal r5, r6, r7, r14 \n\t"         \
+    "mov r7, #0 \n\t"                    \
+    "umaal r6, r7, r8, r14 \n\t"         \
+    "mov r8, #0 \n\t"                    \
+    "umaal r7, r8, r9, r14 \n\t"         \
+                                         \
+    /* Multiply by 2 (each cross term occurs twice in a square); r9 catches the carry out */ \
+    "mov r9, #0 \n\t"                    \
+    "adds r3, r3, r3 \n\t"               \
+    "adcs r4, r4, r4 \n\t"               \
+    "adcs r5, r5, r5 \n\t"               \
+    "adcs r6, r6, r6 \n\t"               \
+    "adcs r7, r7, r7 \n\t"               \
+    "adcs r8, r8, r8 \n\t"               \
+    "adcs r9, r9, #0 \n\t"               \
+                                         \
+    /* Add into previous: the five result words at r0, with the carry rippling on through r8 and r9 */ \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adds r3, r3, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r4, r4, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r5, r5, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r6, r6, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r7, r7, r12 \n\t"              \
+    "adcs r8, r8, #0 \n\t"               \
+    "adcs r9, r9, #0 \n\t"               \
+    "sub r0, #20 \n\t"                   \
+                                         \
+    /* Perform center multiplication: r9:r8 += r14^2, then write the seven words r3-r9 back at r0 */ \
+    "umlal r8, r9, r14, r14 \n\t"        \
+    "stmia r0!, {r3,r4,r5,r6,r7,r8,r9} \n\t"
+
+#define FAST_SQUARE_ASM_6 /* 6-word square: [r0] (12 words) = ([r1] (6 words))^2; r1 (after its post-increment) and r2 are restored on exit */ \
+    "ldmia r1!, {r8,r9,r10,r11,r12,r14} \n\t" \
+    "push {r1, r2} \n\t"                \
+    /* a0..a5 = r8,r9,r10,r11,r12,r14. Cross products a0*a1..a0*a5 -> result words 1-6 in r1-r6. */ \
+    "umull r1, r2, r9, r8 \n\t"         \
+    "mov r3, #0 \n\t"                   \
+    "umaal r2, r3, r10, r8 \n\t"        \
+    "mov r4, #0 \n\t"                   \
+    "umaal r3, r4, r11, r8 \n\t"        \
+    "mov r5, #0 \n\t"                   \
+    "umaal r4, r5, r12, r8 \n\t"        \
+    "mov r6, #0 \n\t"                   \
+    "umaal r5, r6, r14, r8 \n\t"        \
+    /* Add a1*a2..a1*a4; r4,r5 are parked on the stack so they can hold the squares below. Then double words 1-4 (r1,r2,r7,r3). */ \
+    "mov r7, #0 \n\t"                   \
+    "umaal r7, r3, r10, r9 \n\t"        \
+    "umaal r3, r4, r11, r9 \n\t"        \
+    "umaal r4, r5, r12, r9 \n\t"        \
+    "push {r4, r5} \n\t"                \
+    "adds r1, r1, r1 \n\t"              \
+    "adcs r2, r2, r2 \n\t"              \
+    "adcs r7, r7, r7 \n\t"              \
+    "adcs r3, r3, r3 \n\t"              \
+    /* a0^2 and a1^2 supply words 0-3; the flag-preserving UMULL/MOV keep the doubling carry alive until the ADC. */ \
+    "umull r4, r5, r8, r8 \n\t"         \
+    /* Store carry (out of the doubling) in r8; a0 is not needed after this point */ \
+    "mov r8, #0 \n\t"                   \
+    "adc r8, r8, #0 \n\t"               \
+    "adds r5, r5, r1 \n\t"              \
+    "stmia r0!, {r4,r5} \n\t"           \
+                                        \
+    "umull r4, r5, r9, r9 \n\t"         \
+    "adcs r4, r4, r2 \n\t"              \
+    "adcs r5, r5, r7 \n\t"              \
+    "stmia r0!, {r4,r5} \n\t"           \
+    /* Restore the parked partial sums and fold in a1*a5 before r9 (a1) is recycled as a carry holder. */ \
+    "pop {r4, r5} \n\t"                 \
+    "umaal r5, r6, r14, r9 \n\t"        \
+    /* Store carry (out of words 0-3) in r9 */ \
+    "mov r9, #0 \n\t"                   \
+    "adc r9, r9, #0 \n\t"               \
+    /* Remaining cross products (rows a2, a3, a4) -> words 5-10 in r1,r4,r2,r5,r7,r6. */ \
+    "mov r1, #0 \n\t"                   \
+    "umaal r1, r4, r11, r10 \n\t"       \
+    "umaal r4, r5, r12, r10 \n\t"       \
+    "umaal r5, r6, r14, r10 \n\t"       \
+                                        \
+    "mov r2, #0 \n\t"                   \
+    "umaal r2, r5, r12, r11 \n\t"       \
+    "umaal r5, r6, r14, r11 \n\t"       \
+                                        \
+    "mov r7, #0 \n\t"                   \
+    "umaal r7, r6, r14, r12 \n\t"       \
+                                        \
+    /* Load carry from r8 (LSRS leaves r8 = 0), double words 5-10, and keep the carry out in r8 as word 11 */ \
+    "lsrs r8, #1 \n\t"                  \
+    "adcs r1, r1, r1 \n\t"              \
+    "adcs r4, r4, r4 \n\t"              \
+    "adcs r2, r2, r2 \n\t"              \
+    "adcs r5, r5, r5 \n\t"              \
+    "adcs r7, r7, r7 \n\t"              \
+    "adcs r6, r6, r6 \n\t"              \
+    "adc r8, r8, #0 \n\t"               \
+                                        \
+    /* Use carry from r9: UMAAL adds it and r3 (doubled word 4) to a2^2 */ \
+    "umaal r3, r9, r10, r10 \n\t"       \
+    "adds r9, r9, r1 \n\t"              \
+    "stmia r0!, {r3,r9} \n\t"           \
+                                        \
+    "umull r9, r10, r11, r11 \n\t"      \
+    "adcs r9, r9, r4 \n\t"              \
+    "adcs r10, r10, r2 \n\t"            \
+    "stmia r0!, {r9,r10} \n\t"          \
+                                        \
+    "umull r9, r10, r12, r12 \n\t"      \
+    "adcs r9, r9, r5 \n\t"              \
+    "adcs r10, r10, r7 \n\t"            \
+    "stmia r0!, {r9,r10} \n\t"          \
+                                        \
+    "umull r9, r10, r14, r14 \n\t"      \
+    "adcs r9, r9, r6 \n\t"              \
+    "adcs r10, r10, r8 \n\t"            \
+    "stmia r0!, {r9,r10} \n\t"          \
+    "pop {r1, r2} \n\t"
+
+#define FAST_SQUARE_ASM_6_TO_7 /* Grow a 6-word square to 7 words; branches to 1f (outside this macro) when r2 == 6 */ \
+    "cmp r2, #6 \n\t"                        \
+    "beq 1f \n\t"                            \
+    /* Step r0 and r1 back six words. Assumes they sit just past a 6-word square - confirm at the expansion site. */ \
+    "sub r0, #24 \n\t"                       \
+    "sub r1, #24 \n\t"                       \
+                                             \
+    /* Do off-center multiplication: r14 (seventh input word) times each of the first six words -> r3-r9 */ \
+    "ldmia r1!, {r5,r6,r7,r8,r9,r10,r14} \n\t" \
+    "umull r3, r4, r5, r14 \n\t"             \
+    "mov r5, #0 \n\t"                        \
+    "umaal r4, r5, r6, r14 \n\t"             \
+    "mov r6, #0 \n\t"                        \
+    "umaal r5, r6, r7, r14 \n\t"             \
+    "mov r7, #0 \n\t"                        \
+    "umaal r6, r7, r8, r14 \n\t"             \
+    "mov r8, #0 \n\t"                        \
+    "umaal r7, r8, r9, r14 \n\t"             \
+    "mov r9, #0 \n\t"                        \
+    "umaal r8, r9, r10, r14 \n\t"            \
+                                             \
+    /* Multiply by 2 (each cross term occurs twice in a square); r10 catches the carry out */ \
+    "mov r10, #0 \n\t"                       \
+    "adds r3, r3, r3 \n\t"                   \
+    "adcs r4, r4, r4 \n\t"                   \
+    "adcs r5, r5, r5 \n\t"                   \
+    "adcs r6, r6, r6 \n\t"                   \
+    "adcs r7, r7, r7 \n\t"                   \
+    "adcs r8, r8, r8 \n\t"                   \
+    "adcs r9, r9, r9 \n\t"                   \
+    "adcs r10, r10, #0 \n\t"                 \
+                                             \
+    /* Add into previous: the six result words at r0, with the carry rippling on through r9 and r10 */ \
+    "ldr r12, [r0], #4 \n\t"                 \
+    "adds r3, r3, r12 \n\t"                  \
+    "ldr r12, [r0], #4 \n\t"                 \
+    "adcs r4, r4, r12 \n\t"                  \
+    "ldr r12, [r0], #4 \n\t"                 \
+    "adcs r5, r5, r12 \n\t"                  \
+    "ldr r12, [r0], #4 \n\t"                 \
+    "adcs r6, r6, r12 \n\t"                  \
+    "ldr r12, [r0], #4 \n\t"                 \
+    "adcs r7, r7, r12 \n\t"                  \
+    "ldr r12, [r0], #4 \n\t"                 \
+    "adcs r8, r8, r12 \n\t"                  \
+    "adcs r9, r9, #0 \n\t"                   \
+    "adcs r10, r10, #0 \n\t"                 \
+    "sub r0, #24 \n\t"                       \
+                                             \
+    /* Perform center multiplication: r10:r9 += r14^2, then write the eight words r3-r10 back at r0 */ \
+    "umlal r9, r10, r14, r14 \n\t"           \
+    "stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10} \n\t"
+
+#define FAST_SQUARE_ASM_7 /* 7-word square: [r0] (14 words) = ([r1] (7 words))^2; r1 (after its post-increments) and r2 are restored on exit */ \
+    "ldmia r1!, {r9,r10,r11,r12} \n\t"  \
+    "push {r2} \n\t"                    \
+    /* a0..a3 = r9,r10,r11,r12. Start with a0*a1, a0*a2, a0*a3 -> result words 1-4 in r14,r2,r3,r4. */ \
+    "umull r14, r2, r10, r9 \n\t"       \
+    "mov r3, #0 \n\t"                   \
+    "umaal r2, r3, r11, r9 \n\t"        \
+    "mov r4, #0 \n\t"                   \
+    "umaal r3, r4, r12, r9 \n\t"        \
+    /* Add a1*a2, then double words 1-3 (r14,r2,r5): every cross product occurs twice in a square. */ \
+    "mov r5, #0 \n\t"                   \
+    "umaal r5, r3, r11, r10 \n\t"       \
+    "adds r14, r14, r14 \n\t"           \
+    "adcs r2, r2, r2 \n\t"              \
+    "adcs r5, r5, r5 \n\t"              \
+    /* Store carry (out of the doubling) in r7 */ \
+    "mov r7, #0 \n\t"                   \
+    "adc r7, r7, #0 \n\t"               \
+    /* a0^2 and a1^2 supply result words 0-3. */ \
+    "umull r6, r8, r9, r9 \n\t"         \
+    "adds r8, r8, r14 \n\t"             \
+    "stmia r0!, {r6,r8} \n\t"           \
+                                        \
+    "umull r6, r8, r10, r10 \n\t"       \
+    "adcs r6, r6, r2 \n\t"              \
+    "adcs r8, r8, r5 \n\t"              \
+    "stmia r0!, {r6,r8} \n\t"           \
+    /* Store carry (out of words 0-3) in r8 */ \
+    "mov r8, #0 \n\t"                   \
+    "adc r8, r8, #0 \n\t"               \
+    /* a4..a6 = r2,r6,r14; r1 is saved because it is used as a carry register from here on. Finish row a0. */ \
+    "ldmia r1!, {r2, r6, r14} \n\t"     \
+    "push {r1} \n\t"                    \
+    "umaal r3, r4, r2, r9 \n\t"         \
+    "mov r5, #0 \n\t"                   \
+    "umaal r4, r5, r6, r9 \n\t"         \
+    "mov r1, #0 \n\t"                   \
+    "umaal r5, r1, r14, r9 \n\t"        \
+    /* Row a1: a1*a3..a1*a6 (r9 is reusable now that row a0 is complete). */ \
+    "mov r9, #0 \n\t"                   \
+    "umaal r3, r9, r12, r10 \n\t"       \
+    "umaal r9, r4, r2, r10 \n\t"        \
+    "umaal r4, r5, r6, r10 \n\t"        \
+    "umaal r5, r1, r14, r10 \n\t"       \
+    /* Row a2: a2*a3..a2*a6 (r10 is reusable now that row a1 is complete). */ \
+    "mov r10, #0 \n\t"                  \
+    "umaal r10, r9, r12, r11 \n\t"      \
+    "umaal r9, r4, r2, r11 \n\t"        \
+    "umaal r4, r5, r6, r11 \n\t"        \
+    "umaal r5, r1, r14, r11 \n\t"       \
+                                        \
+    /* Load carry from r7 (LSRS leaves r7 = 0) and double words 4-6 (r3,r10,r9) */ \
+    "lsrs r7, #1 \n\t"                  \
+    "adcs r3, r3, r3 \n\t"              \
+    "adcs r10, r10, r10 \n\t"           \
+    "adcs r9, r9, r9 \n\t"              \
+    /* Store carry back in r7 (it was cleared by the LSRS above) */ \
+    "adc r7, r7, #0 \n\t"               \
+                                        \
+    /* Use carry from r8: UMAAL adds it and r3 (doubled word 4) to a2^2 */ \
+    "umaal r3, r8, r11, r11 \n\t"       \
+    "adds r8, r8, r10 \n\t"             \
+    "stmia r0!, {r3,r8} \n\t"           \
+    /* Store carry back in r8 */        \
+    "mov r8, #0 \n\t"                   \
+    "adc r8, r8, #0 \n\t"               \
+    /* Rows a3, a4, a5: remaining cross products -> words 7-12 in r3,r4,r10,r5,r11,r1. */ \
+    "mov r3, #0 \n\t"                   \
+    "umaal r3, r4, r2, r12 \n\t"        \
+    "umaal r4, r5, r6, r12 \n\t"        \
+    "umaal r5, r1, r14, r12 \n\t"       \
+                                        \
+    "mov r10, #0 \n\t"                  \
+    "umaal r10, r5, r6, r2 \n\t"        \
+    "umaal r5, r1, r14, r2 \n\t"        \
+                                        \
+    "mov r11, #0 \n\t"                  \
+    "umaal r11, r1, r14, r6 \n\t"       \
+                                        \
+    /* Load carry from r7, double words 7-12, and keep the carry out in r7 as word 13 */ \
+    "lsrs r7, #1 \n\t"                  \
+    "adcs r3, r3, r3 \n\t"              \
+    "adcs r4, r4, r4 \n\t"              \
+    "adcs r10, r10, r10 \n\t"           \
+    "adcs r5, r5, r5 \n\t"              \
+    "adcs r11, r11, r11 \n\t"           \
+    "adcs r1, r1, r1 \n\t"              \
+    "adc r7, r7, #0 \n\t"               \
+                                        \
+    /* Use carry from r8: UMAAL adds it and r9 (doubled word 6) to a3^2 */ \
+    "umaal r8, r9, r12, r12 \n\t"       \
+    "adds r9, r9, r3 \n\t"              \
+    "stmia r0!, {r8,r9} \n\t"           \
+                                        \
+    "umull r8, r9, r2, r2 \n\t"         \
+    "adcs r8, r8, r4 \n\t"              \
+    "adcs r9, r9, r10 \n\t"             \
+    "stmia r0!, {r8,r9} \n\t"           \
+                                        \
+    "umull r8, r9, r6, r6 \n\t"         \
+    "adcs r8, r8, r5 \n\t"              \
+    "adcs r9, r9, r11 \n\t"             \
+    "stmia r0!, {r8,r9} \n\t"           \
+                                        \
+    "umull r8, r9, r14, r14 \n\t"       \
+    "adcs r8, r8, r1 \n\t"              \
+    "adcs r9, r9, r7 \n\t"              \
+    "stmia r0!, {r8,r9} \n\t"           \
+    "pop {r1, r2} \n\t"
+
+#define FAST_SQUARE_ASM_7_TO_8 /* Grow a 7-word square to 8 words; branches to 1f (outside this macro) when r2 == 7 */ \
+    "cmp r2, #7 \n\t"                    \
+    "beq 1f \n\t"                        \
+    /* Step r0 and r1 back seven words. Assumes they sit just past a 7-word square - confirm at the expansion site. */ \
+    "sub r0, #28 \n\t"                   \
+    "sub r1, #28 \n\t"                   \
+                                         \
+    /* Do off-center multiplication: r14 (eighth input word) times each of the first seven words -> r3-r10 */ \
+    "ldmia r1!, {r5,r6,r7,r8,r9,r10,r11,r14} \n\t" \
+    "umull r3, r4, r5, r14 \n\t"         \
+    "mov r5, #0 \n\t"                    \
+    "umaal r4, r5, r6, r14 \n\t"         \
+    "mov r6, #0 \n\t"                    \
+    "umaal r5, r6, r7, r14 \n\t"         \
+    "mov r7, #0 \n\t"                    \
+    "umaal r6, r7, r8, r14 \n\t"         \
+    "mov r8, #0 \n\t"                    \
+    "umaal r7, r8, r9, r14 \n\t"         \
+    "mov r9, #0 \n\t"                    \
+    "umaal r8, r9, r10, r14 \n\t"        \
+    "mov r10, #0 \n\t"                   \
+    "umaal r9, r10, r11, r14 \n\t"       \
+                                         \
+    /* Multiply by 2 (each cross term occurs twice in a square); r11 catches the carry out */ \
+    "mov r11, #0 \n\t"                   \
+    "adds r3, r3, r3 \n\t"               \
+    "adcs r4, r4, r4 \n\t"               \
+    "adcs r5, r5, r5 \n\t"               \
+    "adcs r6, r6, r6 \n\t"               \
+    "adcs r7, r7, r7 \n\t"               \
+    "adcs r8, r8, r8 \n\t"               \
+    "adcs r9, r9, r9 \n\t"               \
+    "adcs r10, r10, r10 \n\t"            \
+    "adcs r11, r11, #0 \n\t"             \
+                                         \
+    /* Add into previous: the seven result words at r0, with the carry rippling on through r10 and r11 */ \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adds r3, r3, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r4, r4, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r5, r5, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r6, r6, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r7, r7, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r8, r8, r12 \n\t"              \
+    "ldr r12, [r0], #4 \n\t"             \
+    "adcs r9, r9, r12 \n\t"              \
+    "adcs r10, r10, #0 \n\t"             \
+    "adcs r11, r11, #0 \n\t"             \
+    "sub r0, #28 \n\t"                   \
+                                         \
+    /* Perform center multiplication: r11:r10 += r14^2, then write the nine words r3-r11 back at r0 */ \
+    "umlal r10, r11, r14, r14 \n\t"      \
+    "stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10,r11} \n\t"
+
+#define FAST_SQUARE_ASM_8 /* square the 8 words a0..a7 at [r1]; 16 result words stored via r0 */ \
+    "ldmia r1!, {r10,r11,r12,r14} \n\t" \
+    "push {r2} \n\t"                    \
+    /* r10,r11,r12,r14 = a0..a3; r2 is saved because it is used as scratch below */ \
+    "umull r2, r3, r11, r10 \n\t"       \
+    "mov r4, #0 \n\t"                   \
+    "umaal r3, r4, r12, r10 \n\t"       \
+    "mov r5, #0 \n\t"                   \
+    "umaal r4, r5, r14, r10 \n\t"       \
+    /* a2*a1 completes cross word 3 in r6; then double cross words 1..3 (r2,r3,r6) */ \
+    "mov r6, #0 \n\t"                   \
+    "umaal r6, r4, r12, r11 \n\t"       \
+    "adds r2, r2, r2 \n\t"              \
+    "adcs r3, r3, r3 \n\t"              \
+    "adcs r6, r6, r6 \n\t"              \
+    /* Store the doubling carry in r7 (r7 = 0 or 1) */ \
+    "mov r7, #0 \n\t"                   \
+    "adc r7, r7, #0 \n\t"               \
+    /* Result words 0-1: a0*a0 plus doubled cross word 1; C stays live for words 2-3 */ \
+    "umull r8, r9, r10, r10 \n\t"       \
+    "adds r9, r9, r2 \n\t"              \
+    "stmia r0!, {r8,r9} \n\t"           \
+    /* Result words 2-3: a1*a1 plus doubled cross words 2,3 and the incoming carry */ \
+    "umull r8, r9, r11, r11 \n\t"       \
+    "adcs r8, r8, r3 \n\t"              \
+    "adcs r9, r9, r6 \n\t"              \
+    "stmia r0!, {r8,r9} \n\t"           \
+    /* Store the addition carry in r8 (r8 = 0 or 1) */ \
+    "mov r8, #0 \n\t"                   \
+    "adc r8, r8, #0 \n\t"               \
+    /* r2,r3 = a4,a5; the advanced input pointer is parked on the stack */ \
+    "ldmia r1!, {r2, r3} \n\t"          \
+    "push {r1} \n\t"                    \
+    "umaal r4, r5, r2, r10 \n\t"        \
+    "mov r6, #0 \n\t"                   \
+    "umaal r5, r6, r3, r10 \n\t"        \
+    /* a3*a1 completes cross word 4 in r9 */ \
+    "mov r9, #0 \n\t"                   \
+    "umaal r9, r4, r14, r11 \n\t"       \
+    "umaal r4, r5, r2, r11 \n\t"        \
+    /* a3*a2 completes cross word 5 in r1 (r1 is free: the pointer is on the stack) */ \
+    "mov r1, #0 \n\t"                   \
+    "umaal r1, r4, r14, r12 \n\t"       \
+                                        \
+    /* Load carry from r7: lsrs moves bit 0 into C and leaves r7 = 0; double words 4,5 */ \
+    "lsrs r7, #1 \n\t"                  \
+    "adcs r9, r9, r9 \n\t"              \
+    "adcs r1, r1, r1 \n\t"              \
+    /* Store carry back in r7 */        \
+    "adc r7, r7, #0 \n\t"               \
+                                        \
+    /* Result words 4-5: a2*a2 + carry saved in r8 + doubled cross words 4,5 */ \
+    "umaal r8, r9, r12, r12 \n\t"       \
+    "adds r9, r9, r1  \n\t"             \
+    "stmia r0!, {r8,r9} \n\t"           \
+    /* Store carry back in r8 */        \
+    "mov r8, #0 \n\t"                   \
+    "adc r8, r8, #0 \n\t"               \
+                                        \
+    "pop {r1} \n\t"                     \
+    /* TODO could fix up r1 value on stack here */      \
+    /* and leave the value on the stack (rather */      \
+    /* than popping) if supporting curves > 256 bits */ \
+    "ldr r9, [r1], #4 \n\t"             \
+    "ldr r1, [r1] \n\t"                 \
+    /* r9 = a6, r1 = a7 from here on; r1 no longer holds the input pointer */ \
+    "push {r7} \n\t"                    \
+    "umaal r5, r6, r9, r10 \n\t"        \
+    "mov r7, #0 \n\t"                   \
+    "umaal r6, r7, r1, r10 \n\t"        \
+    /* Carry now stored in r10 (a0 is not read again after this point) */ \
+    "pop {r10} \n\t"                    \
+    /* Remaining products with a1: a5*a1, a6*a1, a7*a1 */ \
+    "umaal r4, r5, r3, r11 \n\t"        \
+    "umaal r5, r6, r9, r11 \n\t"        \
+    "umaal r6, r7, r1, r11 \n\t"        \
+    /* Products with a2: a4*a2 completes cross word 6 in r11 */ \
+    "mov r11, #0 \n\t"                  \
+    "umaal r11, r4, r2, r12 \n\t"       \
+    "umaal r4, r5, r3, r12 \n\t"        \
+    "umaal r5, r6, r9, r12 \n\t"        \
+    "umaal r6, r7, r1, r12 \n\t"        \
+    /* Products with a3: a4*a3 completes cross word 7 in r12, a5*a3 completes word 8 in r4 */ \
+    "mov r12, #0 \n\t"                  \
+    "umaal r12, r4, r2, r14 \n\t"       \
+    "umaal r4, r5, r3, r14 \n\t"        \
+    "umaal r5, r6, r9, r14 \n\t"        \
+    "umaal r6, r7, r1, r14 \n\t"        \
+                                        \
+    /* Load carry from r10, double cross words 6,7, then store the carry back in r10 */ \
+    "lsrs r10, #1 \n\t"                 \
+    "adcs r11, r11, r11 \n\t"           \
+    "adcs r12, r12, r12 \n\t"           \
+    "adc r10, r10, #0 \n\t"             \
+                                        \
+    /* Result words 6-7: a3*a3 + carry saved in r8 + doubled cross words 6,7 */ \
+    "umaal r8, r11, r14, r14 \n\t"      \
+    "adds r11, r11, r12 \n\t"           \
+    "stmia r0!, {r8,r11} \n\t"          \
+    /* Store carry back in r8 */        \
+    "mov r8, #0 \n\t"                   \
+    "adc r8, r8, #0 \n\t"               \
+    /* Products with a4: cross words 9 (r11) and 10 (r5) become complete */ \
+    "mov r11, #0 \n\t"                  \
+    "umaal r11, r5, r3, r2 \n\t"        \
+    "umaal r5, r6, r9, r2 \n\t"         \
+    "umaal r6, r7, r1, r2 \n\t"         \
+    /* Products with a5: cross words 11 (r12) and 12 (r6) become complete */ \
+    "mov r12, #0 \n\t"                  \
+    "umaal r12, r6, r9, r3 \n\t"        \
+    "umaal r6, r7, r1, r3 \n\t"         \
+    /* a7*a6: cross words 13 (r14) and 14 (r7) become complete */ \
+    "mov r14, #0 \n\t"                  \
+    "umaal r14, r7, r1, r9 \n\t"        \
+    /* Double cross words 8..14 (r4,r11,r5,r12,r6,r14,r7); the carry out lands in r10 */ \
+    /* Load carry from r10 */           \
+    "lsrs r10, #1 \n\t"                 \
+    "adcs r4, r4, r4 \n\t"              \
+    "adcs r11, r11, r11 \n\t"           \
+    "adcs r5, r5, r5 \n\t"              \
+    "adcs r12, r12, r12 \n\t"           \
+    "adcs r6, r6, r6 \n\t"              \
+    "adcs r14, r14, r14 \n\t"           \
+    "adcs r7, r7, r7 \n\t"              \
+    "adc r10, r10, #0 \n\t"             \
+                                        \
+    /* Result words 8-9: a4*a4 + carry saved in r8 + doubled cross words 8,9 */ \
+    "umaal r4, r8, r2, r2 \n\t"         \
+    "adds r8, r8, r11 \n\t"             \
+    "stmia r0!, {r4,r8} \n\t"           \
+    /* Result words 10-11: a5*a5 + doubled cross words 10,11 + carry */ \
+    "umull r4, r8, r3, r3 \n\t"         \
+    "adcs r4, r4, r5 \n\t"              \
+    "adcs r8, r8, r12 \n\t"             \
+    "stmia r0!, {r4,r8} \n\t"           \
+    /* Result words 12-13: a6*a6 + doubled cross words 12,13 + carry */ \
+    "umull r4, r8, r9, r9 \n\t"         \
+    "adcs r4, r4, r6 \n\t"              \
+    "adcs r8, r8, r14 \n\t"             \
+    "stmia r0!, {r4,r8} \n\t"           \
+    /* Result words 14-15: a7*a7 + doubled cross word 14 + doubling carry in r10 */ \
+    "umull r4, r8, r1, r1 \n\t"         \
+    "adcs r4, r4, r7 \n\t"              \
+    "adcs r8, r8, r10 \n\t"             \
+    "stmia r0!, {r4,r8} \n\t"           \
+    /* TODO pop {r1, r2} if supporting curves > 256 bits */ \
+    "pop {r2} \n\t"
+
+#endif /* _UECC_ASM_ARM_MULT_SQUARE_H_ */
diff --git a/types.h b/types.h
index 7cb1a28..9ee8143 100644
--- a/types.h
+++ b/types.h
@@ -23,6 +23,16 @@
     #endif
 #endif
 
+#ifndef uECC_ARM_USE_UMAAL /* define to 1 or 0 beforehand to override the auto-detection below */
+    #if (uECC_PLATFORM == uECC_arm) && (__ARM_ARCH >= 6) /* an undefined __ARM_ARCH evaluates as 0 here */
+        #define uECC_ARM_USE_UMAAL 1
+    #elif (uECC_PLATFORM == uECC_arm_thumb2) && (__ARM_ARCH >= 6) && !__ARM_ARCH_7M__ /* NOTE(review): 7-M presumably excluded for lacking UMAAL - confirm */
+        #define uECC_ARM_USE_UMAAL 1
+    #else
+        #define uECC_ARM_USE_UMAAL 0 /* every other platform/architecture combination */
+    #endif
+#endif
+
 #ifndef uECC_WORD_SIZE
     #if uECC_PLATFORM == uECC_avr
         #define uECC_WORD_SIZE 1
diff --git a/uECC.h b/uECC.h
index 1193ce8..9911763 100644
--- a/uECC.h
+++ b/uECC.h
@@ -23,7 +23,9 @@
 
 /* Optimization level; trade speed for code size.
    Larger values produce code that is faster but larger.
-   Currently supported values are 0 - 3; 0 is unusably slow for most applications. */
+   Currently supported values are 0 - 4; 0 is unusably slow for most applications.
+   Optimization level 4 currently only has an effect on ARM platforms where more than one
+   curve is enabled. */
 #ifndef uECC_OPTIMIZATION_LEVEL
     #define uECC_OPTIMIZATION_LEVEL 2
 #endif