asm/f64: add AVX (FMA) variant for dot product
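
DotUnitaryAVX accumulates with VFMADD231PD in a 16x-unrolled loop and
handles remainders with 8/4/2/1-element tails. The kernel needs AVX and
FMA support at run time, so callers have to gate on CPU features. A
minimal dispatch sketch, assuming a feature-detection package such as
golang.org/x/sys/cpu (the repository may use its own CPUID checks; dot
is a hypothetical helper, not part of this change):

	// Assumes: import "golang.org/x/sys/cpu"
	func dot(x, y []float64) float64 {
		if cpu.X86.HasAVX && cpu.X86.HasFMA {
			return DotUnitaryAVX(x, y) // fused multiply-add kernel
		}
		return DotUnitary(x, y) // SSE fallback
	}
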
diff --git a/internal/asm/f64/dot_amd64.s b/internal/asm/f64/dot_amd64.s
index 79852d1..11846c2 100644
--- a/internal/asm/f64/dot_amd64.s
+++ b/internal/asm/f64/dot_amd64.s
@@ -45,8 +45,9 @@
 	MOVQ x_len+8(FP), DI // n = len(x)
 	MOVQ y+24(FP), R9
 
-	MOVSD $(0.0), X7 // sum = 0
-	MOVSD $(0.0), X8 // sum = 0
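+	// Self-XOR is the recognized zeroing idiom and avoids loading a constant.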
+	XORPS X7, X7 // sum = 0
+	XORPS X8, X8 // sum = 0
 
 	MOVQ $0, SI   // i = 0
 	SUBQ $4, DI   // n -= 4
diff --git a/internal/asm/f64/dot_avx_amd64.s b/internal/asm/f64/dot_avx_amd64.s
new file mode 100644
index 0000000..ec4a7c3
--- /dev/null
+++ b/internal/asm/f64/dot_avx_amd64.s
@@ -0,0 +1,98 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!appengine
+
+#include "textflag.h"
+
+#define X_PTR R8
+#define Y_PTR R9
+#define IDX SI
+#define LEN DI
+#define TAIL DX
+
+// func DotUnitaryAVX(x, y []float64) (sum float64)
+// This function assumes len(y) >= len(x).
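+// It requires the AVX and FMA instruction sets.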
+TEXT ·DotUnitaryAVX(SB), NOSPLIT, $0
+	MOVQ x+0(FP), X_PTR
+	MOVQ x_len+8(FP), LEN // n = len(x)
+	MOVQ y+24(FP), Y_PTR
+
+	VXORPS Y7, Y7, Y7 // sum = 0
+	VXORPS Y8, Y8, Y8 // sum = 0
+	XORQ   IDX, IDX   // i = 0
+
+	MOVQ LEN, TAIL
+	SHRQ $4, LEN   // LEN = floor( n / 16 )
+	JZ   dot_tail8 // if LEN == 0 { goto dot_tail8 }
+
+loop_uni:
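+	// VFMADD231PD a, b, c computes c += a * b (Go asm operand order).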
+	// sum += x[i] * y[i] unrolled 16x.
+	VMOVUPS     0(X_PTR)(IDX*8), Y0
+	VMOVUPS     32(X_PTR)(IDX*8), Y1
+	VMOVUPS     64(X_PTR)(IDX*8), Y2
+	VMOVUPS     96(X_PTR)(IDX*8), Y3
+	VFMADD231PD 0(Y_PTR)(IDX*8), Y0, Y7
+	VFMADD231PD 32(Y_PTR)(IDX*8), Y1, Y8
+	VFMADD231PD 64(Y_PTR)(IDX*8), Y2, Y7
+	VFMADD231PD 96(Y_PTR)(IDX*8), Y3, Y8
+
+	ADDQ $16, IDX // i += 16
+	DECQ LEN
+	JNZ  loop_uni // if n > 0 { goto loop_uni }
+
+	ANDQ $15, TAIL // TAIL = n % 16
+	JZ   dot_tail2 // if TAIL == 0 { goto dot_tail2 } (the Y-register sums must still be folded)
+
+dot_tail8:
+	TESTQ $8, TAIL
+	JZ    dot_tail4
+
+	VMOVUPS     0(X_PTR)(IDX*8), Y0
+	VMOVUPS     32(X_PTR)(IDX*8), Y1
+	VFMADD231PD 0(Y_PTR)(IDX*8), Y0, Y7
+	VFMADD231PD 32(Y_PTR)(IDX*8), Y1, Y8
+
+	ADDQ $8, IDX // i += 8
+
+dot_tail4:
+	TESTQ $4, TAIL
+	JZ    dot_tail2
+
+	VMOVUPS     0(X_PTR)(IDX*8), Y0
+	VFMADD231PD 0(Y_PTR)(IDX*8), Y0, Y7
+
+	ADDQ $4, IDX // i += 4
+
+dot_tail2:
+	// Collapse the 256-bit sums into 128-bit registers;
+	// VFMADD clears bits 128-255 when used with XMM registers.
+	VADDPD       Y8, Y7, Y7 // Y7 += Y8
+	VEXTRACTF128 $1, Y7, X8 // X8 = Y7[2:3]; X7 = Y7[0:1]
+
+	TESTQ $2, TAIL
+	JZ    dot_tail1
+
+	VMOVUPS     0(X_PTR)(IDX*8), X0
+	VFMADD231PD 0(Y_PTR)(IDX*8), X0, X7
+
+	ADDQ $2, IDX // i += 2
+
+dot_tail1:
+	TESTQ $1, TAIL
+	JZ    end_uni
+
+	VMOVSD      0(X_PTR)(IDX*8), X0
+	VFMADD231SD 0(Y_PTR)(IDX*8), X0, X7
+
+	INCQ IDX
+
+end_uni:
+	// Add the four partial sums together.
+	VHADDPD X8, X7, X7 // X7 = [ X7[0]+X7[1], X8[0]+X8[1] ]
+	VHADDPD X7, X7, X7 // X7[0] = X7[0] + X7[1]
+	MOVSD   X7, sum+48(FP) // Return final sum.
+	RET
diff --git a/internal/asm/f64/stubs_amd64.go b/internal/asm/f64/stubs_amd64.go
index a2ed593..0ad9494 100644
--- a/internal/asm/f64/stubs_amd64.go
+++ b/internal/asm/f64/stubs_amd64.go
@@ -111,6 +111,14 @@
 //  return sum
 func DotUnitary(x, y []float64) (sum float64)
 
+// DotUnitaryAVX is
+//  for i, v := range x {
+//  	sum += y[i] * v
+//  }
+//  return sum
+// It requires the AVX and FMA instruction sets.
+func DotUnitaryAVX(x, y []float64) (sum float64)
+
 // DotInc is
 //  for i := 0; i < int(n); i++ {
 //  	sum += y[iy] * x[ix]