asm/f64: add gather routines to axpyinc assembly functions. Needs benchmarking.
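
For reference, the scalar semantics the new gather paths have to reproduce are plain strided axpy: y[iy+i*incY] += alpha * x[ix+i*incX] for i in [0, n). The sketch below is illustrative only (axpyIncRef is not part of this patch; it just mirrors the AxpyIncAVX signature shown in the diff), and its example mirrors case // 6 in axpy_test.go. The gather kernels only change how the strided x (and, for AxpyIncTo, y) elements are loaded; the arithmetic is unchanged. A possible benchmark harness is sketched after the diff.

package main

import "fmt"

// axpyIncRef is a plain-Go reference for the strided axpy computed by the
// assembly kernels: y[iy+i*incY] += alpha * x[ix+i*incX] for i in [0, n).
func axpyIncRef(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) {
	for i := uintptr(0); i < n; i++ {
		y[iy] += alpha * x[ix]
		ix += incX
		iy += incY
	}
}

func main() {
	// Mirrors case // 6 in axpy_test.go: alpha = 1, x = {2, 3, 4}, y = {-3, -2, -1}.
	x := []float64{2, 3, 4}
	y := []float64{-3, -2, -1}
	axpyIncRef(1, x, y, 3, 1, 1, 0, 0)
	fmt.Println(y) // [-1 1 3]
}
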
diff --git a/internal/asm/f64/axpy_test.go b/internal/asm/f64/axpy_test.go
index 231ee24..4bf11b0 100644
--- a/internal/asm/f64/axpy_test.go
+++ b/internal/asm/f64/axpy_test.go
@@ -16,84 +16,84 @@
want []float64
wantRev []float64 // Result when x is traversed in reverse direction.
}{
- {
+ { // 0
alpha: 0,
x: []float64{},
y: []float64{},
want: []float64{},
wantRev: []float64{},
},
- {
+ { // 1
alpha: 0,
x: []float64{2},
y: []float64{-3},
want: []float64{-3},
wantRev: []float64{-3},
},
- {
+ { // 2
alpha: 1,
x: []float64{2},
y: []float64{-3},
want: []float64{-1},
wantRev: []float64{-1},
},
- {
+ { // 3
alpha: 3,
x: []float64{2},
y: []float64{-3},
want: []float64{3},
wantRev: []float64{3},
},
- {
+ { // 4
alpha: -3,
x: []float64{2},
y: []float64{-3},
want: []float64{-9},
wantRev: []float64{-9},
},
- {
+ { // 5
alpha: 1,
x: []float64{1, 5},
y: []float64{2, -3},
want: []float64{3, 2},
wantRev: []float64{7, -2},
},
- {
+ { // 6
alpha: 1,
x: []float64{2, 3, 4},
y: []float64{-3, -2, -1},
want: []float64{-1, 1, 3},
wantRev: []float64{1, 1, 1},
},
- {
+ { // 7
alpha: 0,
x: []float64{0, 0, 1, 1, 2, -3, -4},
y: []float64{0, 1, 0, 3, -4, 5, -6},
want: []float64{0, 1, 0, 3, -4, 5, -6},
wantRev: []float64{0, 1, 0, 3, -4, 5, -6},
},
- {
+ { // 8
alpha: 1,
x: []float64{0, 0, 1, 1, 2, -3, -4},
y: []float64{0, 1, 0, 3, -4, 5, -6},
want: []float64{0, 1, 1, 4, -2, 2, -10},
wantRev: []float64{-4, -2, 2, 4, -3, 5, -6},
},
- {
+ { // 9
alpha: 3,
x: []float64{0, 0, 1, 1, 2, -3, -4},
y: []float64{0, 1, 0, 3, -4, 5, -6},
want: []float64{0, 1, 3, 6, 2, -4, -18},
wantRev: []float64{-12, -8, 6, 6, -1, 5, -6},
},
- {
+ { // 10
alpha: -3,
x: []float64{0, 0, 1, 1, 2, -3, -4, 0, 0, 1, 1, 2, -3, -4},
y: []float64{0, 1, 0, 3, -4, 5, -6, 0, 1, 0, 3, -4, 5, -6},
want: []float64{0, 1, -3, 0, -10, 14, 6, 0, 1, -3, 0, -10, 14, 6},
wantRev: []float64{12, 10, -6, 0, -7, 5, -6, 12, 10, -6, 0, -7, 5, -6},
},
- {
+ { // 11
alpha: -5,
x: []float64{0, 0, 1, 1, 2, -3, -4, 5, 1, 2, -3, -4, 5},
y: []float64{0, 1, 0, 3, -4, 5, -6, 7, 3, -4, 5, -6, 7},
diff --git a/internal/asm/f64/axpyinc_avx_amd64.s b/internal/asm/f64/axpyinc_avx_amd64.s
index a9ff51d..545a99e 100644
--- a/internal/asm/f64/axpyinc_avx_amd64.s
+++ b/internal/asm/f64/axpyinc_avx_amd64.s
@@ -53,6 +53,7 @@
#define ALPHA X0
#define ALPHA_Y Y0
#define G_IDX Y1
+#define G_IDX2 X1
// func AxpyIncAVX(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr)
TEXT ·AxpyIncAVX(SB), NOSPLIT, $0
@@ -82,7 +83,7 @@
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
CMPQ INC_Y, $1
JNE loop
- CMPQ LEN, $4
+ CMPQ LEN, $1
JLE loop
JMP axpy_gather
@@ -141,17 +142,41 @@
XORQ IDX, IDX // IDX = 0
VPXOR X1, X1, X1 // X1 = { 0, 0 }
VMOVQ INC_X, X1 // X1 = { 0, INC_X }
- VMOVQ INCx3_X, X3 // X3 = { 0, INC_X }
+ VMOVQ INCx3_X, X3 // X3 = { 0, 3 * INC_X }
VPADDQ X1, X1, X2 // X2 = 2 * INC_X
- VPADDQ X3, X1, X4 // X4 = 4 * INC_X
VSHUFPD $1, X1, X1, X1 // X1 = { 0, INC_X }
VSHUFPD $0, X3, X2, X3 // X3 = { 2 * INC_X, 3 * INC_X }
VINSERTI128 $1, X3, G_IDX, G_IDX // G_IDX = { 0, INC_X, 2 * INC_X, 3 * INC_X }
VPCMPEQD Y10, Y10, Y10 // set mask register to all 1's
VBROADCASTSD ALPHA, ALPHA_Y // ALPHA_Y = { alpha, alpha, alpha, alpha }
+ SHRQ $1, LEN // LEN = floor( n / 8 )
+ JZ g_tail4
+
g_loop:
VMOVUPS Y10, Y9
+ VMOVUPS Y10, Y8
+ VGATHERQPD Y9, (X_PTR)(G_IDX * 1), Y2 // Y_i = X[IDX:IDX+3]
+ VGATHERQPD Y8, (X_PTR)(G_IDX * 2), Y3
+
+ VFMADD213PD (Y_PTR)(IDX*8), ALPHA_Y, Y2 // Y_i = Y_i * a + y[i]
+ VFMADD213PD 32(Y_PTR)(IDX*8), ALPHA_Y, Y3
+ VMOVUPS Y2, (DST_PTR)(IDX*8) // y[i] = Y_i
+ VMOVUPS Y3, 32(DST_PTR)(IDX*8)
+
+ LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(X_PTR[incX*8])
+ ADDQ $8, IDX // i += 8
+ DECQ LEN
+ JNZ g_loop
+
+ ANDQ $7, TAIL // if TAIL & 7 == 0 { return }
+ JE g_end
+
+g_tail4:
+ TESTQ $4, TAIL
+ JZ g_tail2
+
+ VMOVUPS Y10, Y9
VGATHERQPD Y9, (X_PTR)(G_IDX * 1), Y2 // Y_i = X[IDX:IDX+3]
VFMADD213PD (Y_PTR)(IDX*8), ALPHA_Y, Y2 // Y_i = Y_i * a + y[i]
@@ -159,21 +184,16 @@
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
ADDQ $4, IDX // i += 4
- DECQ LEN
- JNZ g_loop
-
- CMPQ TAIL, $0 // if TAIL == 0 { return }
- JE g_end
g_tail2:
TESTQ $2, TAIL
JZ g_tail1
- VMOVUPS Y10, Y9
- VGATHERQPD Y9, (X_PTR)(G_IDX * 1), Y2 // Y_i = X[IDX:IDX+3]
+ VMOVUPS X10, X9
+ VGATHERQPD X9, (X_PTR)(G_IDX2 * 1), X2 // X_i = X[IDX:IDX+1]
- VFMADD213PD (Y_PTR)(IDX*8), ALPHA_Y, Y2 // Y_i = Y_i * a + y[i]
- VMOVUPS Y2, (DST_PTR)(IDX*8) // y[i] = Y_i
+ VFMADD213PD (Y_PTR)(IDX*8), ALPHA, X2 // X_i = X_i * a + y[i]
+ VMOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X_i
LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*4])
ADDQ $2, IDX // i += 4
@@ -182,10 +202,9 @@
TESTQ $1, TAIL
JZ g_end
- VMOVUPS X10, X9
- VGATHERQPD X9, (X_PTR)(X1 * 1), X2
- VFMADD213PD (Y_PTR)(IDX*8), ALPHA, X2
- VMOVUPS X2, (DST_PTR)(IDX*8)
+ VMOVSD (X_PTR), X2 // X_i = x[i]
+ VFMADD213SD (Y_PTR)(IDX*8), ALPHA, X2 // X_i = X_i * a + y[i]
+ VMOVSD X2, (DST_PTR)(IDX*8) // y[i] = X_i
g_end:
RET
diff --git a/internal/asm/f64/axpyincto_avx_amd64.s b/internal/asm/f64/axpyincto_avx_amd64.s
index b239c7b..52335da 100644
--- a/internal/asm/f64/axpyincto_avx_amd64.s
+++ b/internal/asm/f64/axpyincto_avx_amd64.s
@@ -40,6 +40,8 @@
#define X_PTR SI
#define Y_PTR DI
+#define X_PTR_INC4 R11
+#define Y_PTR_INC4 R12
#define DST_PTR DX
#define IDX AX
#define LEN CX
@@ -52,6 +54,10 @@
#define INCx3_DST R13
#define ALPHA X0
#define ALPHA_Y Y0
+#define X_IDX Y1
+#define X_IDX2 X1
+#define Y_IDX Y2
+#define Y_IDX2 X2
// func AxpyIncToAVX(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr)
TEXT ·AxpyIncToAVX(SB), NOSPLIT, $0
@@ -85,6 +91,12 @@
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
LEAQ (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3
+ CMPQ INC_DST, $8
+ JNE loop
+ CMPQ LEN, $0
+ JLE loop
+ JMP axpy_gather
+
loop: // do { // y[i] += alpha * x[i] unrolled 2x.
VMOVSD (X_PTR), X2 // X_i = x[i]
VMOVSD (X_PTR)(INC_X*1), X3
@@ -134,3 +146,97 @@
end:
RET
+
+axpy_gather:
+ XORQ IDX, IDX // IDX = 0
+ VPXOR X1, X1, X1 // X1 = { 0, 0 }
+ VPXOR X2, X2, X2 // X2 = { 0, 0 }
+ VMOVQ INC_X, X1 // X1 = { INC_X, 0 }
+ VMOVQ INC_Y, X2 // X2 = { INC_Y, 0 }
+ VMOVQ INCx3_X, X3 // X3 = { 3 * INC_X, 0 }
+ VMOVQ INCx3_Y, X4 // X4 = { 3 * INC_Y, 0 }
+ VPADDQ X1, X1, X5 // X5 = 2 * INC_X
+ VPADDQ X2, X2, X6 // X6 = 2 * INC_Y
+ VSHUFPD $1, X1, X1, X1 // X1 = { 0, INC_X }
+ VSHUFPD $1, X2, X2, X2 // X2 = { 0, INC_Y }
+ VSHUFPD $0, X3, X5, X3 // X3 = { 2 * INC_X, 3 * INC_X }
+ VSHUFPD $0, X4, X6, X4 // X4 = { 2 * INC_Y, 3 * INC_Y }
+ VINSERTI128 $1, X3, X_IDX, X_IDX // X_IDX = { 0, INC_X, 2 * INC_X, 3 * INC_X }
+ VINSERTI128 $1, X4, Y_IDX, Y_IDX // Y_IDX = { 0, INC_Y, 2 * INC_Y, 3 * INC_Y }
+ VPCMPEQD Y12, Y12, Y12 // set mask register to all 1's
+ VBROADCASTSD ALPHA, ALPHA_Y // ALPHA_Y = { alpha, alpha, alpha, alpha }
+
+ SHRQ $1, LEN // LEN = floor( n / 8 )
+ JZ g_tail4
+ LEAQ (X_PTR)(INC_X*4), X_PTR_INC4 // X_PTR_INC4 = &(X_PTR[incX*4])
+ LEAQ (Y_PTR)(INC_Y*4), Y_PTR_INC4 // Y_PTR_INC4 = &(Y_PTR[incY*4])
+
+g_loop:
+ VMOVUPS Y12, Y10
+ VMOVUPS Y12, Y9
+ VMOVUPS Y12, Y8
+ VMOVUPS Y12, Y7
+ VGATHERQPD Y10, (X_PTR)(X_IDX * 1), Y3 // Y_i = X[IDX:IDX+3]
+ VGATHERQPD Y9, (X_PTR_INC4)(X_IDX * 1), Y4
+ VGATHERQPD Y8, (Y_PTR)(Y_IDX * 1), Y5 // Y_i = Y[IDX:IDX+3]
+ VGATHERQPD Y7, (Y_PTR_INC4)(Y_IDX * 1), Y6
+
+ VFMADD213PD Y5, ALPHA_Y, Y3 // Y_i = Y_i * a + y[i]
+ VFMADD213PD Y6, ALPHA_Y, Y4
+ VMOVUPS Y3, (DST_PTR)(IDX*8) // dst[i] = Y_i
+ VMOVUPS Y4, 32(DST_PTR)(IDX*8)
+
+ LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(X_PTR[incX*8])
+ LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(Y_PTR[incY*8])
+ LEAQ (X_PTR_INC4)(INC_X*8), X_PTR_INC4
+ LEAQ (Y_PTR_INC4)(INC_Y*8), Y_PTR_INC4
+ ADDQ $8, IDX // i += 8
+ DECQ LEN
+ JNZ g_loop
+
+ ANDQ $7, TAIL // if TAIL & 7 == 0 { return }
+ JE g_end
+
+g_tail4:
+ TESTQ $4, TAIL
+ JZ g_tail2
+
+ VMOVUPS Y12, Y10
+ VMOVUPS Y12, Y8
+ VGATHERQPD Y10, (X_PTR)(X_IDX * 1), Y3 // Y_i = X[IDX:IDX+3]
+ VGATHERQPD Y8, (Y_PTR)(Y_IDX * 1), Y5 // Y_i = Y[IDX:IDX+3]
+
+ VFMADD213PD Y5, ALPHA_Y, Y3 // Y_i = Y_i * a + y[i]
+ VMOVUPS Y3, (DST_PTR)(IDX*8) // dst[i] = Y_i
+
+ LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
+ LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4])
+ ADDQ $4, IDX // i += 4
+
+g_tail2:
+ TESTQ $2, TAIL
+ JZ g_tail1
+
+ VMOVUPS X12, X10
+ VMOVUPS X12, X8
+ VGATHERQPD X10, (X_PTR)(X_IDX2 * 1), X3 // X_i = X[IDX:IDX+1]
+ VGATHERQPD X8, (Y_PTR)(Y_IDX2 * 1), X5 // X_i = Y[IDX:IDX+1]
+
+ VFMADD213PD X5, ALPHA, X3 // X_i = X_i * a + y[i]
+ VMOVUPS X3, (DST_PTR)(IDX*8) // dst[i] = X_i
+
+ LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2])
+ LEAQ (Y_PTR)(INC_Y*2), Y_PTR // Y_PTR = &(Y_PTR[incY*2])
+ ADDQ $2, IDX // i += 2
+
+g_tail1:
+ TESTQ $1, TAIL
+ JZ g_end
+
+ VMOVSD (X_PTR), X2 // X_i = x[i]
+ VFMADD213SD (Y_PTR), ALPHA, X2 // X_i = X_i * a + y[i]
+ VMOVSD X2, (DST_PTR)(IDX*8) // dst[i] = X_i
+
+g_end:
+ VZEROALL
+ RET
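
The gather paths above still need benchmarking. A rough in-package harness along the lines below could be used; the benchmark name, problem size, and strides are placeholders, and AxpyIncAVX is assumed to be callable from Go exactly as declared in the TEXT comment in axpyinc_avx_amd64.s.

package f64

import "testing"

// BenchmarkAxpyIncAVXGather exercises the gather path: unit y stride with a
// wider x stride. Size and strides are placeholders, not measured choices.
func BenchmarkAxpyIncAVXGather(b *testing.B) {
	const n, incX = 100000, 4
	x := make([]float64, n*incX)
	y := make([]float64, n)
	for i := range x {
		x[i] = float64(i)
	}
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		AxpyIncAVX(2.5, x, y, n, incX, 1, 0, 0)
	}
}
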