asm/f64: add gather routines to the axpyinc assembly functions. Needs benchmarking.
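
For reference while benchmarking, a minimal sketch of a benchmark that would exercise the new gather path: contiguous y (which the gather entry check in the diff requires) and a strided x, with n large enough to reach the 8-element loop. The benchmark name, sizes, stride, and alpha are illustrative only; the sketch assumes the AxpyIncAVX signature shown in the diff and its Go stub from this patch.

	package f64

	import "testing"

	// BenchmarkAxpyIncAVXGather drives AxpyIncAVX with a strided x and a
	// contiguous y (incY = 1) so the axpy_gather code path can be taken.
	func BenchmarkAxpyIncAVXGather(b *testing.B) {
		const n = 1 << 12 // number of axpy elements
		const incX = 4    // x stride; a stride > 1 makes the gathers non-trivial
		x := make([]float64, n*incX)
		y := make([]float64, n)
		for i := range x {
			x[i] = float64(i)
		}
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			AxpyIncAVX(2.5, x, y, n, incX, 1, 0, 0) // y[i] += 2.5 * x[i*incX]
		}
	}
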
diff --git a/internal/asm/f64/axpy_test.go b/internal/asm/f64/axpy_test.go
index 231ee24..4bf11b0 100644
--- a/internal/asm/f64/axpy_test.go
+++ b/internal/asm/f64/axpy_test.go
@@ -16,84 +16,84 @@
 	want    []float64
 	wantRev []float64 // Result when x is traversed in reverse direction.
 }{
-	{
+	{ // 0
 		alpha:   0,
 		x:       []float64{},
 		y:       []float64{},
 		want:    []float64{},
 		wantRev: []float64{},
 	},
-	{
+	{ // 1
 		alpha:   0,
 		x:       []float64{2},
 		y:       []float64{-3},
 		want:    []float64{-3},
 		wantRev: []float64{-3},
 	},
-	{
+	{ // 2
 		alpha:   1,
 		x:       []float64{2},
 		y:       []float64{-3},
 		want:    []float64{-1},
 		wantRev: []float64{-1},
 	},
-	{
+	{ // 3
 		alpha:   3,
 		x:       []float64{2},
 		y:       []float64{-3},
 		want:    []float64{3},
 		wantRev: []float64{3},
 	},
-	{
+	{ // 4
 		alpha:   -3,
 		x:       []float64{2},
 		y:       []float64{-3},
 		want:    []float64{-9},
 		wantRev: []float64{-9},
 	},
-	{
+	{ // 5
 		alpha:   1,
 		x:       []float64{1, 5},
 		y:       []float64{2, -3},
 		want:    []float64{3, 2},
 		wantRev: []float64{7, -2},
 	},
-	{
+	{ // 6
 		alpha:   1,
 		x:       []float64{2, 3, 4},
 		y:       []float64{-3, -2, -1},
 		want:    []float64{-1, 1, 3},
 		wantRev: []float64{1, 1, 1},
 	},
-	{
+	{ // 7
 		alpha:   0,
 		x:       []float64{0, 0, 1, 1, 2, -3, -4},
 		y:       []float64{0, 1, 0, 3, -4, 5, -6},
 		want:    []float64{0, 1, 0, 3, -4, 5, -6},
 		wantRev: []float64{0, 1, 0, 3, -4, 5, -6},
 	},
-	{
+	{ // 8
 		alpha:   1,
 		x:       []float64{0, 0, 1, 1, 2, -3, -4},
 		y:       []float64{0, 1, 0, 3, -4, 5, -6},
 		want:    []float64{0, 1, 1, 4, -2, 2, -10},
 		wantRev: []float64{-4, -2, 2, 4, -3, 5, -6},
 	},
-	{
+	{ // 9
 		alpha:   3,
 		x:       []float64{0, 0, 1, 1, 2, -3, -4},
 		y:       []float64{0, 1, 0, 3, -4, 5, -6},
 		want:    []float64{0, 1, 3, 6, 2, -4, -18},
 		wantRev: []float64{-12, -8, 6, 6, -1, 5, -6},
 	},
-	{
+	{ // 10
 		alpha:   -3,
 		x:       []float64{0, 0, 1, 1, 2, -3, -4, 0, 0, 1, 1, 2, -3, -4},
 		y:       []float64{0, 1, 0, 3, -4, 5, -6, 0, 1, 0, 3, -4, 5, -6},
 		want:    []float64{0, 1, -3, 0, -10, 14, 6, 0, 1, -3, 0, -10, 14, 6},
 		wantRev: []float64{12, 10, -6, 0, -7, 5, -6, 12, 10, -6, 0, -7, 5, -6},
 	},
-	{
+	{ // 11
 		alpha:   -5,
 		x:       []float64{0, 0, 1, 1, 2, -3, -4, 5, 1, 2, -3, -4, 5},
 		y:       []float64{0, 1, 0, 3, -4, 5, -6, 7, 3, -4, 5, -6, 7},
diff --git a/internal/asm/f64/axpyinc_avx_amd64.s b/internal/asm/f64/axpyinc_avx_amd64.s
index a9ff51d..545a99e 100644
--- a/internal/asm/f64/axpyinc_avx_amd64.s
+++ b/internal/asm/f64/axpyinc_avx_amd64.s
@@ -53,6 +53,7 @@
 #define ALPHA X0
 #define ALPHA_Y Y0
 #define G_IDX Y1
+#define G_IDX2 X1
 
 // func AxpyIncAVX(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr)
 TEXT ·AxpyIncAVX(SB), NOSPLIT, $0
@@ -82,7 +83,7 @@
 	LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
 	CMPQ INC_Y, $1
 	JNE  loop
-	CMPQ LEN, $4
+	CMPQ LEN, $1
 	JLE  loop
 	JMP  axpy_gather
 
@@ -141,17 +142,41 @@
 	XORQ         IDX, IDX             // IDX = 0
 	VPXOR        X1, X1, X1           // X1 = { 0, 0 }
 	VMOVQ        INC_X, X1            // X1 = { 0, INC_X }
-	VMOVQ        INCx3_X, X3          // X3 = { 0, INC_X }
+	VMOVQ        INCx3_X, X3          // X3 = { 0, 3 * INC_X }
 	VPADDQ       X1, X1, X2           // X2 = 2 * INC_X
-	VPADDQ       X3, X1, X4           // X4 = 4 * INC_X
 	VSHUFPD      $1, X1, X1, X1       // X1 = { 0, INC_X }
 	VSHUFPD      $0, X3, X2, X3       // X3 = { 2 * INC_X, 3 * INC_X }
 	VINSERTI128  $1, X3, G_IDX, G_IDX // G_IDX = { 0, INC_X, 2 * INC_X, 3 * INC_X }
 	VPCMPEQD     Y10, Y10, Y10        // set mask register to all 1's
 	VBROADCASTSD ALPHA, ALPHA_Y       // ALPHA_Y = { alpha, alpha, alpha, alpha }
 
+	SHRQ $1, LEN // LEN = floor( n / 8 )
+	JZ   g_tail4
+
 g_loop:
 	VMOVUPS    Y10, Y9
+	VMOVUPS    Y10, Y8
+	VGATHERQPD Y9, (X_PTR)(G_IDX * 1), Y2 // Y2 = X[IDX:IDX+3]
+	LEAQ       (X_PTR)(INC_X*4), X_PTR    // X_PTR = &(X_PTR[incX*4])
+	VGATHERQPD Y8, (X_PTR)(G_IDX * 1), Y3 // Y3 = X[IDX+4:IDX+7]
+
+	VFMADD213PD (Y_PTR)(IDX*8), ALPHA_Y, Y2   // Y_i = Y_i * a + y[i]
+	VFMADD213PD 32(Y_PTR)(IDX*8), ALPHA_Y, Y3
+	VMOVUPS     Y2, (DST_PTR)(IDX*8)          // y[i] = Y_i
+	VMOVUPS     Y3, 32(DST_PTR)(IDX*8)
+	LEAQ        (X_PTR)(INC_X*4), X_PTR       // X_PTR = &(X_PTR[incX*4]); incX*8 total per iteration
+	ADDQ $8, IDX                 // i += 8
+	DECQ LEN
+	JNZ  g_loop
+
+	ANDQ $7, TAIL // if TAIL & 7 == 0 { return }
+	JE   g_end
+
+g_tail4:
+	TESTQ $4, TAIL
+	JZ    g_tail2
+
+	VMOVUPS    Y10, Y9
 	VGATHERQPD Y9, (X_PTR)(G_IDX * 1), Y2 // Y_i = X[IDX:IDX+3]
 
 	VFMADD213PD (Y_PTR)(IDX*8), ALPHA_Y, Y2 // Y_i = Y_i * a + y[i]
@@ -159,21 +184,16 @@
 
 	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
 	ADDQ $4, IDX                 // i += 4
-	DECQ LEN
-	JNZ  g_loop
-
-	CMPQ TAIL, $0 // if TAIL == 0 { return }
-	JE   g_end
 
 g_tail2:
 	TESTQ $2, TAIL
 	JZ    g_tail1
 
-	VMOVUPS    Y10, Y9
-	VGATHERQPD Y9, (X_PTR)(G_IDX * 1), Y2 // Y_i = X[IDX:IDX+3]
+	VMOVUPS    X10, X9
+	VGATHERQPD X9, (X_PTR)(G_IDX2 * 1), X2 // X_i = X[IDX:IDX+1]
 
-	VFMADD213PD (Y_PTR)(IDX*8), ALPHA_Y, Y2 // Y_i = Y_i * a + y[i]
-	VMOVUPS     Y2, (DST_PTR)(IDX*8)        // y[i] = Y_i
+	VFMADD213PD (Y_PTR)(IDX*8), ALPHA, X2 // X_i = X_i * a + y[i]
+	VMOVUPS     X2, (DST_PTR)(IDX*8)      // y[i] = X_i
 
 	LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*4])
 	ADDQ $2, IDX                 // i += 4
@@ -182,10 +202,9 @@
 	TESTQ $1, TAIL
 	JZ    g_end
 
-	VMOVUPS     X10, X9
-	VGATHERQPD  X9, (X_PTR)(X1 * 1), X2
-	VFMADD213PD (Y_PTR)(IDX*8), ALPHA, X2
-	VMOVUPS     X2, (DST_PTR)(IDX*8)
+	VMOVSD      (X_PTR), X2               // X2 = x[i]
+	VFMADD213SD (Y_PTR)(IDX*8), ALPHA, X2 // X2 = X2 * a + y[i]
+	VMOVSD      X2, (DST_PTR)(IDX*8)      // y[i] = X2
 
 g_end:
 	RET
diff --git a/internal/asm/f64/axpyincto_avx_amd64.s b/internal/asm/f64/axpyincto_avx_amd64.s
index b239c7b..52335da 100644
--- a/internal/asm/f64/axpyincto_avx_amd64.s
+++ b/internal/asm/f64/axpyincto_avx_amd64.s
@@ -40,6 +40,8 @@
 
 #define X_PTR SI
 #define Y_PTR DI
+#define X_PTR_INC4 R11
+#define Y_PTR_INC4 R12
 #define DST_PTR DX
 #define IDX AX
 #define LEN CX
@@ -52,6 +54,10 @@
 #define INCx3_DST R13
 #define ALPHA X0
 #define ALPHA_Y Y0
+#define X_IDX Y1
+#define X_IDX2 X1
+#define Y_IDX Y2
+#define Y_IDX2 X2
 
 // func AxpyIncToAVX(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr)
 TEXT ·AxpyIncToAVX(SB), NOSPLIT, $0
@@ -85,6 +91,12 @@
 	LEAQ (INC_Y)(INC_Y*2), INCx3_Y       // INCx3_Y = INC_Y * 3
 	LEAQ (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3
 
+	CMPQ INC_DST, $8
+	JNE  loop
+	CMPQ LEN, $0
+	JLE  loop
+	JMP  axpy_gather
+
 loop:  // do {  // y[i] += alpha * x[i] unrolled 2x.
 	VMOVSD (X_PTR), X2            // X_i = x[i]
 	VMOVSD (X_PTR)(INC_X*1), X3
@@ -134,3 +146,97 @@
 
 end:
 	RET
+
+axpy_gather:
+	XORQ         IDX, IDX             // IDX = 0
+	VPXOR        X1, X1, X1           // X1 = { 0, 0 }
+	VPXOR        X2, X2, X2           // X2 = { 0, 0 }
+	VMOVQ        INC_X, X1            // X1 = { INC_X, 0 }
+	VMOVQ        INC_Y, X2            // X2 = { INC_Y, 0 }
+	VMOVQ        INCx3_X, X3          // X3 = { 3 * INC_X, 0 }
+	VMOVQ        INCx3_Y, X4          // X4 = { 3 * INC_Y, 0 }
+	VPADDQ       X1, X1, X5           // X5 = 2 * INC_X
+	VPADDQ       X2, X2, X6           // X6 = 2 * INC_Y
+	VSHUFPD      $1, X1, X1, X1       // X1 = { 0, INC_X }
+	VSHUFPD      $1, X2, X2, X2       // X2 = { 0, INC_Y }
+	VSHUFPD      $0, X3, X5, X3       // X3 = { 2 * INC_X, 3 * INC_X }
+	VSHUFPD      $0, X4, X6, X4       // X4 = { 2 * INC_Y, 3 * INC_Y }
+	VINSERTI128  $1, X3, X_IDX, X_IDX // X_IDX = { 0, INC_X, 2 * INC_X, 3 * INC_X }
+	VINSERTI128  $1, X4, Y_IDX, Y_IDX // Y_IDX = { 0, INC_Y, 2 * INC_Y, 3 * INC_Y }
+	VPCMPEQD     Y12, Y12, Y12        // set mask register to all 1's
+	VBROADCASTSD ALPHA, ALPHA_Y       // ALPHA_Y = { alpha, alpha, alpha, alpha }
+
+	SHRQ $1, LEN                      // LEN = floor( n / 8 )
+	JZ   g_tail4
+	LEAQ (X_PTR)(INC_X*4), X_PTR_INC4 // X_PTR_INC4 = &(X_PTR[incX*4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR_INC4 // Y_PTR_INC4 = &(Y_PTR[incY*4])
+
+g_loop:
+	VMOVUPS    Y12, Y10
+	VMOVUPS    Y12, Y9
+	VMOVUPS    Y12, Y8
+	VMOVUPS    Y12, Y7
+	VGATHERQPD Y10, (X_PTR)(X_IDX * 1), Y3     // Y3 = X[IDX:IDX+3]
+	VGATHERQPD Y9, (X_PTR_INC4)(X_IDX * 1), Y4 // Y4 = X[IDX+4:IDX+7]
+	VGATHERQPD Y8, (Y_PTR)(Y_IDX * 1), Y5      // Y5 = Y[IDX:IDX+3]
+	VGATHERQPD Y7, (Y_PTR_INC4)(Y_IDX * 1), Y6 // Y6 = Y[IDX+4:IDX+7]
+
+	VFMADD213PD Y5, ALPHA_Y, Y3        // Y_i = Y_i * a + y[i]
+	VFMADD213PD Y6, ALPHA_Y, Y4
+	VMOVUPS     Y3, (DST_PTR)(IDX*8)   // dst[i] = Y_i
+	VMOVUPS     Y4, 32(DST_PTR)(IDX*8)
+
+	LEAQ (X_PTR)(INC_X*8), X_PTR           // X_PTR = &(X_PTR[incX*8])
+	LEAQ (Y_PTR)(INC_Y*8), Y_PTR           // Y_PTR = &(Y_PTR[incY*8])
+	LEAQ (X_PTR_INC4)(INC_X*8), X_PTR_INC4
+	LEAQ (Y_PTR_INC4)(INC_Y*8), Y_PTR_INC4
+	ADDQ $8, IDX                           // i += 8
+	DECQ LEN
+	JNZ  g_loop
+
+	ANDQ $7, TAIL // if TAIL & 7 == 0 { return }
+	JE   g_end
+
+g_tail4:
+	TESTQ $4, TAIL
+	JZ    g_tail2
+
+	VMOVUPS    Y12, Y10
+	VMOVUPS    Y12, Y8
+	VGATHERQPD Y10, (X_PTR)(X_IDX * 1), Y3 // Y_i = X[IDX:IDX+3]
+	VGATHERQPD Y8, (Y_PTR)(Y_IDX * 1), Y5  // Y5 = Y[IDX:IDX+3]
+
+	VFMADD213PD Y5, ALPHA_Y, Y3      // Y_i = Y_i * a + y[i]
+	VMOVUPS     Y3, (DST_PTR)(IDX*8) // dst[i] = Y_i
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4])
+	ADDQ $4, IDX                 // i += 4
+
+g_tail2:
+	TESTQ $2, TAIL
+	JZ    g_tail1
+
+	VMOVUPS    X12, X10
+	VMOVUPS    X12, X8
+	VGATHERQPD X10, (X_PTR)(X_IDX2 * 1), X3 // X3 = X[IDX:IDX+1]
+	VGATHERQPD X8, (Y_PTR)(Y_IDX2 * 1), X5  // X5 = Y[IDX:IDX+1]
+
+	VFMADD213PD X5, ALPHA, X3        // X_i = X_i * a + y[i]
+	VMOVUPS     X3, (DST_PTR)(IDX*8) // dst[i] = X_i
+
+	LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2])
+	LEAQ (Y_PTR)(INC_Y*2), Y_PTR // Y_PTR = &(Y_PTR[incY*2])
+	ADDQ $2, IDX                 // i += 2
+
+g_tail1:
+	TESTQ $1, TAIL
+	JZ    g_end
+
+	VMOVSD      (X_PTR), X2          // X2 = x[i]
+	VFMADD213SD (Y_PTR), ALPHA, X2   // X2 = X2 * a + y[i]
+	VMOVSD      X2, (DST_PTR)(IDX*8) // dst[i] = X2
+
+g_end:
+	VZEROALL
+	RET