asm/f64: add AVX (FMA) variant for dot product
diff --git a/internal/asm/f64/dot_amd64.s b/internal/asm/f64/dot_amd64.s
index 79852d1..11846c2 100644
--- a/internal/asm/f64/dot_amd64.s
+++ b/internal/asm/f64/dot_amd64.s
@@ -45,8 +45,8 @@
MOVQ x_len+8(FP), DI // n = len(x)
MOVQ y+24(FP), R9
- MOVSD $(0.0), X7 // sum = 0
- MOVSD $(0.0), X8 // sum = 0
+ XORPS X7, X7 // sum = 0
+ XORPS X8, X8 // sum = 0
MOVQ $0, SI // i = 0
SUBQ $4, DI // n -= 4
diff --git a/internal/asm/f64/dot_avx_amd64.s b/internal/asm/f64/dot_avx_amd64.s
new file mode 100644
index 0000000..ec4a7c3
--- /dev/null
+++ b/internal/asm/f64/dot_avx_amd64.s
@@ -0,0 +1,97 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!appengine
+
+#include "textflag.h"
+
+#define X_PTR R8
+#define Y_PTR R9
+#define IDX SI
+#define LEN DI
+#define TAIL DX
+
+// func DotUnitaryAVX(x, y []float64) (sum float64)
+// This function assumes len(y) >= len(x).
+// Requires AVX and FMA3 support; caller is responsible for CPU feature detection.
+TEXT ·DotUnitaryAVX(SB), NOSPLIT, $0-56
+	MOVQ x+0(FP), X_PTR
+	MOVQ x_len+8(FP), LEN // n = len(x)
+	MOVQ y+24(FP), Y_PTR
+
+	VXORPS Y7, Y7, Y7 // sum accumulator A = 0 (4 lanes)
+	VXORPS Y8, Y8, Y8 // sum accumulator B = 0 (4 lanes)
+	XORQ IDX, IDX     // i = 0
+
+	MOVQ LEN, TAIL
+	SHRQ $4, LEN    // LEN = floor( n / 16 )
+	JZ   dot_tail8  // if LEN == 0 { goto dot_tail8 }  (TAIL = n < 16, so no masking needed)
+
+loop_uni:
+	// sum += x[i] * y[i] unrolled 16x, alternating the two accumulators
+	// to hide FMA latency.
+	VMOVUPS 0(X_PTR)(IDX*8), Y0
+	VMOVUPS 32(X_PTR)(IDX*8), Y1
+	VMOVUPS 64(X_PTR)(IDX*8), Y2
+	VMOVUPS 96(X_PTR)(IDX*8), Y3
+	VFMADD231PD 0(Y_PTR)(IDX*8), Y0, Y7
+	VFMADD231PD 32(Y_PTR)(IDX*8), Y1, Y8
+	VFMADD231PD 64(Y_PTR)(IDX*8), Y2, Y7
+	VFMADD231PD 96(Y_PTR)(IDX*8), Y3, Y8
+
+	ADDQ $16, IDX // i += 16
+	DECQ LEN
+	JNZ loop_uni // if LEN > 0 { goto loop_uni }
+
+	ANDQ $15, TAIL // TAIL = n % 16
+	JZ dot_tail2   // if TAIL == 0 { collapse the YMM accumulators and finish }
+
+dot_tail8:
+	TESTQ $8, TAIL
+	JZ dot_tail4
+
+	VMOVUPS 0(X_PTR)(IDX*8), Y0
+	VMOVUPS 32(X_PTR)(IDX*8), Y1
+	VFMADD231PD 0(Y_PTR)(IDX*8), Y0, Y7
+	VFMADD231PD 32(Y_PTR)(IDX*8), Y1, Y8
+
+	ADDQ $8, IDX // i += 8
+
+dot_tail4:
+	TESTQ $4, TAIL
+	JZ dot_tail2
+
+	VMOVUPS 0(X_PTR)(IDX*8), Y0
+	VFMADD231PD 0(Y_PTR)(IDX*8), Y0, Y7
+
+	ADDQ $4, IDX // i += 4
+
+dot_tail2:
+	// Collapse sum to 128-bit register.
+	// VEX-encoded 128-bit VFMADD clears bits 128-255 of the destination,
+	// so the remaining XMM tail work cannot dirty the upper lanes.
+	VADDPD Y8, Y7, Y7       // Y7 += Y8
+	VEXTRACTF128 $1, Y7, X8 // X8 = Y7[2:3] [ X7 = Y7[0:1] ]
+
+	TESTQ $2, TAIL
+	JZ dot_tail1
+
+	VMOVUPS 0(X_PTR)(IDX*8), X0
+	VFMADD231PD 0(Y_PTR)(IDX*8), X0, X7
+
+	ADDQ $2, IDX // i += 2
+
+dot_tail1:
+	TESTQ $1, TAIL
+	JZ end_uni
+
+	VMOVSD 0(X_PTR)(IDX*8), X0
+	VFMADD231SD 0(Y_PTR)(IDX*8), X0, X7
+
+	INCQ IDX
+
+end_uni:
+	// Add the four partial sums together.
+	VHADDPD X8, X7, X7 // X7 = [ X7[0]+X7[1], X8[0]+X8[1] ]
+	VHADDPD X7, X7, X7 // X7[0] = total sum
+	VZEROUPPER         // avoid AVX-SSE transition penalties in non-VEX code that follows
+	MOVSD X7, sum+48(FP) // Return final sum.
+	RET
diff --git a/internal/asm/f64/stubs_amd64.go b/internal/asm/f64/stubs_amd64.go
index a2ed593..0ad9494 100644
--- a/internal/asm/f64/stubs_amd64.go
+++ b/internal/asm/f64/stubs_amd64.go
@@ -111,6 +111,14 @@
// return sum
func DotUnitary(x, y []float64) (sum float64)
+
+// DotUnitaryAVX is
+//  for i, v := range x {
+//  	sum += y[i] * v
+//  }
+//  return sum
+func DotUnitaryAVX(x, y []float64) (sum float64)
+
// DotInc is
// for i := 0; i < int(n); i++ {
// sum += y[iy] * x[ix]