asm/f64: add base Copy assembly. Needs a prefetching strategy to be worthwhile.
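
Adds an amd64 kernel for strided float64 copy: a four-way unrolled main
loop with PREFETCHNTA hints on the source and PREFETCHT0 hints on the
destination, a pure Go fallback for noasm/appengine builds, stub
documentation, and tests and benchmarks against a naive reference loop.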
diff --git a/internal/asm/f64/benchCopy_test.go b/internal/asm/f64/benchCopy_test.go
new file mode 100644
index 0000000..b18fe3f
--- /dev/null
+++ b/internal/asm/f64/benchCopy_test.go
@@ -0,0 +1,57 @@
+// Copyright ©2017 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.7
+
+package f64
+
+import (
+	"fmt"
+	"testing"
+)
+
+var incCopy = []struct {
+	len int
+	inc []int
+}{
+	{1, []int{1}},
+	{3, []int{1, 2, 4, 10}},
+	{10, []int{1, 2, 4, 10}},
+	{30, []int{1, 2, 4, 10}},
+	{1e2, []int{1, 2, 4, 10}},
+	{3e2, []int{1, 2, 4, 10}},
+	{1e3, []int{1, 2, 4, 10}},
+	{3e3, []int{1, 2, 4, 10}},
+	{1e4, []int{1, 2, 4, 10}},
+}
+
+func BenchmarkCopy(t *testing.B) {
+	naiveCopy := func(n int, dst []float64, incDst int, src []float64, incSrc int) {
+		for i := 0; i < n; i++ {
+			dst[i*incDst] = src[i*incSrc]
+		}
+	}
+	tests := []struct {
+		name string
+		f    func(n int, dst []float64, incDst int, src []float64, incSrc int)
+	}{
+		{"NaiveCopy", naivecopy},
+		{"Copy", Copy},
+	}
+	for _, tt := range incCopy {
+		for _, inc := range tt.inc {
+			for _, test := range tests {
+				t.Run(fmt.Sprintf("%s-%d-inc(%d)", test.name, tt.len, inc), func(b *testing.B) {
+					x := make([]float64, inc*tt.len)
+					y := make([]float64, inc*tt.len)
+					b.SetBytes(int64(16 * tt.len))
+					b.ResetTimer()
+					for i := 0; i < b.N; i++ {
+						test.f(tt.len, y, inc, x, inc)
+					}
+				})
+			}
+		}
+	}
+}
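
A note on the benchmark's throughput accounting: SetBytes counts 16
bytes per element, 8 read from src and 8 written to dst, so the
reported MB/s covers both streams. For the unit-stride case Go's
built-in copy is another natural baseline; a sketch of adding it to the
comparison set (builtinCopy is an illustrative name, not part of this
change):

	builtinCopy := func(n int, dst []float64, incDst int, src []float64, incSrc int) {
		// Valid only when incDst == incSrc == 1.
		copy(dst[:n], src[:n])
	}

This could then be listed in the tests table alongside NaiveCopy and
Copy for the inc == 1 cases.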
diff --git a/internal/asm/f64/copy_amd64.s b/internal/asm/f64/copy_amd64.s
new file mode 100644
index 0000000..24eaf7a
--- /dev/null
+++ b/internal/asm/f64/copy_amd64.s
@@ -0,0 +1,71 @@
+// Copyright ©2017 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!appengine
+
+#include "textflag.h"
+
+#define SIZE 8
+#define SHIFT 3
+#define SRC SI
+#define DST DI
+#define LEN CX
+#define TAIL BX
+#define INC_DST R8
+#define INC3_DST R9
+#define INC_SRC R10
+#define INC3_SRC R11
+
+// func Copy(n int, dst []float64, incDst int, src []float64, incSrc int)
+TEXT ·Copy(SB), NOSPLIT, $0-72
+	MOVQ n+0(FP), LEN         // LEN = n
+	CMPQ LEN, $0
+	JEQ  ret                  // if LEN == 0 { return }
+	MOVQ dst_base+8(FP), DST  // DST = &dst
+	MOVQ src_base+40(FP), SRC // SRC = &src
+	MOVQ incDst+32(FP), INC_DST
+	SHLQ $SHIFT, INC_DST      // INC_DST = incDst * sizeof(float64)
+	MOVQ incSrc+64(FP), INC_SRC
+	SHLQ $SHIFT, INC_SRC      // INC_SRC = incSrc * sizeof(float64)
+
+	MOVQ LEN, TAIL
+	SHRQ $2, LEN // LEN = floor(n / 4)
+	JZ   tail    // if LEN == 0 { goto tail }
+
+	LEAQ        (INC_SRC)(INC_SRC*2), INC3_SRC // INC3_SRC = 3 * INC_SRC
+	LEAQ        (INC_DST)(INC_DST*2), INC3_DST // INC3_DST = 3 * INC_DST
+	PREFETCHNTA (SRC)(INC3_SRC*1)              // src is read once; hint non-temporal
+	PREFETCHT0  (DST)(INC3_DST*1)              // pull dst into all cache levels
+
+copy_loop: // do {
+	MOVSD (SRC), X0             // X_i = src[i : i+4]
+	MOVSD (SRC)(INC_SRC*1), X1
+	MOVSD (SRC)(INC_SRC*2), X2
+	MOVSD (SRC)(INC3_SRC*1), X3
+	MOVSD X0, (DST)             // dst[i : i+4] = X_i
+	MOVSD X1, (DST)(INC_DST*1)
+	MOVSD X2, (DST)(INC_DST*2)
+	MOVSD X3, (DST)(INC3_DST*1)
+
+	LEAQ        (SRC)(INC_SRC*4), SRC // SRC = &(SRC[incSrc*4])
+	LEAQ        (DST)(INC_DST*4), DST // DST = &(DST[incDst*4])
+	PREFETCHNTA (SRC)(INC3_SRC*1)
+	PREFETCHT0  (DST)(INC3_DST*1)
+	DECQ        LEN
+	JNZ         copy_loop             // } while --LEN > 0
+
+tail:
+	ANDQ $3, TAIL // TAIL = n % 4
+	JZ   ret      // if TAIL == 0 { return }
+
+tail_loop: // do {
+	MOVSD (SRC), X0    // X0 = src[i]
+	MOVSD X0, (DST)    // dst[i] = X0
+	ADDQ  INC_SRC, SRC // SRC += INC_SRC
+	ADDQ  INC_DST, DST // DST += INC_DST
+	DECQ  TAIL
+	JNZ   tail_loop    // } while --TAIL > 0
+
+ret:
+	RET
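
For readers less comfortable with Plan 9 assembly, the kernel above is
structured like the following Go sketch (illustrative only; copyUnrolled
is a hypothetical name, the real entry point is the assembly Copy):

	func copyUnrolled(n int, dst []float64, incDst int, src []float64, incSrc int) {
		var is, id int // element offsets into src and dst
		for ; n >= 4; n -= 4 { // unrolled body: 4 elements per iteration
			dst[id] = src[is]
			dst[id+incDst] = src[is+incSrc]
			dst[id+2*incDst] = src[is+2*incSrc]
			dst[id+3*incDst] = src[is+3*incSrc]
			is += 4 * incSrc
			id += 4 * incDst
		}
		for ; n > 0; n-- { // scalar tail: n % 4 elements
			dst[id] = src[is]
			is += incSrc
			id += incDst
		}
	}

The prefetch hints have no Go equivalent: PREFETCHNTA requests the
source with minimal cache pollution (each element is read only once),
while PREFETCHT0 pulls the destination line into all cache levels ahead
of the stores.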
diff --git a/internal/asm/f64/stubs_amd64.go b/internal/asm/f64/stubs_amd64.go
index d4de191..a444c7b 100644
--- a/internal/asm/f64/stubs_amd64.go
+++ b/internal/asm/f64/stubs_amd64.go
@@ -61,6 +61,12 @@
 //  }
 func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr)
 
+// Copy is
+//  for i := 0; i < n; i++ {
+//  	dst[i*incDst] = src[i*incSrc]
+//  }
+func Copy(n int, dst []float64, incDst int, src []float64, incSrc int)
+
 // CumSum is
 //  if len(s) == 0 {
 //  	return dst
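
A usage sketch of the strided semantics shared by both implementations
(values are arbitrary): gathering every third element of a source
vector into a densely packed destination.

	src := []float64{1, 0, 0, 2, 0, 0, 3, 0, 0}
	dst := make([]float64, 3)
	Copy(3, dst, 1, src, 3) // dst[i] = src[3*i] for i in [0, 3)
	// dst is now [1 2 3].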
diff --git a/internal/asm/f64/stubs_noasm.go b/internal/asm/f64/stubs_noasm.go
index 5e4ae69..d545af5 100644
--- a/internal/asm/f64/stubs_noasm.go
+++ b/internal/asm/f64/stubs_noasm.go
@@ -52,6 +52,16 @@
 	}
 }
 
+// Copy is
+//  for i := 0; i < n; i++ {
+//  	dst[i*incDst] = src[i*incSrc]
+//  }
+func Copy(n int, dst []float64, incDst int, src []float64, incSrc int) {
+	for i := 0; i < n; i++ {
+		dst[i*incDst] = src[i*incSrc]
+	}
+}
+
 // CumSum is
 //  if len(s) == 0 {
 //  	return dst
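
The tests below follow the guard pattern used elsewhere in this
package: guardIncVector lays the test vector out at the requested
stride with guard values filling the gaps and a margin at both ends,
and checkValidIncGuard verifies after the call that nothing outside
the strided element positions was written.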
diff --git a/internal/asm/f64/stubs_test.go b/internal/asm/f64/stubs_test.go
index 6dd949f..dfe35c9 100644
--- a/internal/asm/f64/stubs_test.go
+++ b/internal/asm/f64/stubs_test.go
@@ -4,7 +4,10 @@
 
 package f64
 
-import "testing"
+import (
+	"fmt"
+	"testing"
+)
 
 func TestL1Norm(t *testing.T) {
 	var src_gd float64 = 1
@@ -165,6 +168,90 @@
 	}
 }
 
+func TestCopy(t *testing.T) {
+	const sGdVal, dGdVal = -1, 0.5
+	gdLn := 4
+	for i, test := range []struct {
+		dst, src, want []float64
+	}{
+		{
+			dst:  nil,
+			src:  nil,
+			want: nil,
+		},
+		{
+			dst:  []float64{},
+			src:  []float64{},
+			want: []float64{},
+		},
+		{
+			dst:  []float64{2},
+			src:  []float64{3},
+			want: []float64{3},
+		},
+		{
+			dst:  []float64{1, 2},
+			src:  []float64{3, 7},
+			want: []float64{3, 7},
+		},
+		{
+			dst:  []float64{1, 3, 5},
+			src:  []float64{3, 5, 7},
+			want: []float64{3, 5, 7},
+		},
+		{
+			dst:  []float64{1, 2, 3, 4},
+			src:  []float64{2, 4, 6, 8},
+			want: []float64{2, 4, 6, 8},
+		},
+		{
+			dst:  []float64{1, 2, 3, 4},
+			src:  []float64{2, 4, 6},
+			want: []float64{2, 4, 6, 4},
+		},
+		{
+			dst:  []float64{1, 2, 3, 4, 5, 6, 7, 8, 9},
+			src:  []float64{2, 8, 32, 128, 512, 2048, 8192, 32768},
+			want: []float64{2, 8, 32, 128, 512, 2048, 8192, 32768},
+		},
+		{
+			dst:  []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+			src:  []float64{2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048},
+			want: []float64{2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048},
+		},
+		{
+			dst:  []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+			src:  []float64{2, 8, 32, 128, 512, 2048, 8192, 32768, 131072, 524288, 2097152},
+			want: []float64{2, 8, 32, 128, 512, 2048, 8192, 32768, 131072, 524288, 2097152},
+		},
+	} {
+		n := len(test.src)
+		for _, inc := range newIncSet(1, 2, 3, 4, 7) {
+			prefix := fmt.Sprintf("Test %v (x:%v y:%v)", i, inc.x, inc.y)
+			sg := guardIncVector(test.src, sGdVal, inc.x, gdLn)
+			dg := guardIncVector(test.dst, dGdVal, inc.y, gdLn)
+			src, dst := sg[gdLn:len(sg)-gdLn], dg[gdLn:len(dg)-gdLn]
+
+			Copy(n, dst, inc.y, src, inc.x)
+
+			for i := range test.want {
+				if !same(dst[i*inc.y], test.want[i]) {
+					t.Errorf(msgVal, prefix, i, dst[i*inc.y], test.want[i])
+				}
+			}
+			if !equalStrided(test.src, src, inc.x) {
+				t.Errorf("%v: modified read-only src argument", prefix)
+			}
+			checkValidIncGuard(t, sg, sGdVal, inc.x, gdLn)
+			checkValidIncGuard(t, dg, dGdVal, inc.y, gdLn)
+			if t.Failed() {
+				t.Error(src, dst)
+				break
+			}
+		}
+	}
+}
+
 func TestCumSum(t *testing.T) {
 	var src_gd, dst_gd float64 = -1, 0
 	for j, v := range []struct {