asm/f64: add base Copy assembly implementation. Needs a prefetching strategy to be worthwhile.
diff --git a/internal/asm/f64/benchCopy_test.go b/internal/asm/f64/benchCopy_test.go new file mode 100644 index 0000000..b18fe3f --- /dev/null +++ b/internal/asm/f64/benchCopy_test.go
@@ -0,0 +1,57 @@ +// Copyright ©2017 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.7 + +package f64 + +import ( + "fmt" + "testing" +) + +var incCopy = []struct { + len int + inc []int +}{ + {1, []int{1}}, + {3, []int{1, 2, 4, 10}}, + {10, []int{1, 2, 4, 10}}, + {30, []int{1, 2, 4, 10}}, + {1e2, []int{1, 2, 4, 10}}, + {3e2, []int{1, 2, 4, 10}}, + {1e3, []int{1, 2, 4, 10}}, + {3e3, []int{1, 2, 4, 10}}, + {1e4, []int{1, 2, 4, 10}}, +} + +func BenchmarkCopy(t *testing.B) { + naivecopy := func(n int, dst []float64, incDst int, src []float64, incSrc int) { + for i := 0; i < n; i++ { + dst[i*incDst] = src[i*incSrc] + } + } + tests := []struct { + name string + f func(n int, dst []float64, incDst int, src []float64, incSrc int) + }{ + {"NaiveCopy", naivecopy}, + {"Copy", Copy}, + } + for _, tt := range incCopy { + for _, inc := range tt.inc { + for _, test := range tests { + t.Run(fmt.Sprintf("%s-%d-inc(%d)", test.name, tt.len, inc), func(b *testing.B) { + x := make([]float64, inc*tt.len) + y := make([]float64, inc*tt.len) + b.SetBytes(int64(64 * tt.len)) + b.ResetTimer() + for i := 0; i < b.N; i++ { + Copy(tt.len, y, inc, x, inc) + } + }) + } + } + } +}
diff --git a/internal/asm/f64/copy_amd64.s b/internal/asm/f64/copy_amd64.s new file mode 100644 index 0000000..24eaf7a --- /dev/null +++ b/internal/asm/f64/copy_amd64.s
@@ -0,0 +1,75 @@
+// Copyright ©2017 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!appengine
+
+#include "textflag.h"
+
+#define SIZE 8
+#define SHIFT 3
+#define SRC SI
+#define DST DI
+#define LEN CX
+#define TAIL BX
+#define INC_DST R8
+#define INC3_DST R9
+#define INC_SRC R10
+#define INC3_SRC R11
+
+// func Copy(n int, dst []float64, incDst int, src []float64, incSrc int)
+//
+// Copy stores src[i*incSrc] into dst[i*incDst] for 0 <= i < n. Groups of
+// four elements are moved in copy_loop and the remaining n%4 elements in
+// tail_loop. The slice lengths are never read, so no bounds are checked.
+TEXT ·Copy(SB), NOSPLIT, $0
+	MOVQ n+0(FP), LEN
+	CMPQ LEN, $0
+	JLE  ret // n <= 0: nothing to copy, matching the Go reference loop
+	MOVQ dst_base+8(FP), DST
+	MOVQ src_base+40(FP), SRC
+	MOVQ incDst+32(FP), INC_DST
+	SHLQ $SHIFT, INC_DST // element stride -> byte stride (x8)
+	MOVQ incSrc+64(FP), INC_SRC
+	SHLQ $SHIFT, INC_SRC // element stride -> byte stride (x8)
+
+	MOVQ LEN, TAIL
+	SHRQ $2, LEN // LEN = n / 4, the number of unrolled iterations
+	JZ   tail
+
+	LEAQ (INC_SRC)(INC_SRC*2), INC3_SRC // 3 * source byte stride
+	LEAQ (INC_DST)(INC_DST*2), INC3_DST // 3 * destination byte stride
+	PREFETCHNTA (SRC)(INC3_SRC*1) // hint the last element of the first group of four
+	PREFETCHT0 (DST)(INC3_DST*1)
+
+copy_loop: // copy four strided elements per iteration
+	MOVSD (SRC), X0
+	MOVSD (SRC)(INC_SRC*1), X1
+	MOVSD (SRC)(INC_SRC*2), X2
+	MOVSD (SRC)(INC3_SRC*1), X3
+	MOVSD X0, (DST)
+	MOVSD X1, (DST)(INC_DST*1)
+	MOVSD X2, (DST)(INC_DST*2)
+	MOVSD X3, (DST)(INC3_DST*1)
+
+	LEAQ (SRC)(INC_SRC*4), SRC // advance both pointers by four elements
+	LEAQ (DST)(INC_DST*4), DST
+	PREFETCHNTA (SRC)(INC3_SRC*1) // hint the last element of the next group of four
+	PREFETCHT0 (DST)(INC3_DST*1)
+	DECQ LEN
+	JNZ copy_loop
+
+tail:
+	ANDQ $3, TAIL // TAIL = n % 4
+	JZ ret
+
+tail_loop: // copy the remaining elements one at a time
+	MOVSD (SRC), X0
+	MOVSD X0, (DST)
+	ADDQ INC_SRC, SRC
+	ADDQ INC_DST, DST
+	DECQ TAIL
+	JNZ tail_loop
+
+ret:
+	RET
diff --git a/internal/asm/f64/stubs_amd64.go b/internal/asm/f64/stubs_amd64.go index d4de191..a444c7b 100644 --- a/internal/asm/f64/stubs_amd64.go +++ b/internal/asm/f64/stubs_amd64.go
@@ -61,6 +61,12 @@
 // }
 func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr)
 
+// Copy is the strided copy implemented in copy_amd64.s, equivalent to
+//  for i := 0; i < n; i++ {
+//  	dst[i*incDst] = src[i*incSrc]
+//  }
+func Copy(n int, dst []float64, incDst int, src []float64, incSrc int)
+
 // CumSum is
 // if len(s) == 0 {
 // return dst
diff --git a/internal/asm/f64/stubs_noasm.go b/internal/asm/f64/stubs_noasm.go index 5e4ae69..d545af5 100644 --- a/internal/asm/f64/stubs_noasm.go +++ b/internal/asm/f64/stubs_noasm.go
@@ -52,6 +52,16 @@ } } +// Copy is +// for i := 0; i < n; i++ { +// dst[i*incDst] = src[i*incSrc] +// } +func Copy(n int, dst []float64, incDst int, src []float64, incSrc int) { + for i := 0; i < n; i++ { + dst[i*incDst] = src[i*incSrc] + } +} + // CumSum is // if len(s) == 0 { // return dst
diff --git a/internal/asm/f64/stubs_test.go b/internal/asm/f64/stubs_test.go index 6dd949f..dfe35c9 100644 --- a/internal/asm/f64/stubs_test.go +++ b/internal/asm/f64/stubs_test.go
@@ -4,7 +4,10 @@ package f64 -import "testing" +import ( + "fmt" + "testing" +) func TestL1Norm(t *testing.T) { var src_gd float64 = 1 @@ -165,6 +168,90 @@ } } +func TestCopy(t *testing.T) { + const sGdVal, dGdVal = -1, 0.5 + gdLn := 4 + for i, test := range []struct { + dst, src, want []float64 + }{ + { + dst: nil, + src: nil, + want: nil, + }, + { + dst: []float64{}, + src: []float64{}, + want: []float64{}, + }, + { + dst: []float64{2}, + src: []float64{3}, + want: []float64{3}, + }, + { + dst: []float64{1, 2}, + src: []float64{3, 7}, + want: []float64{3, 7}, + }, + { + dst: []float64{1, 3, 5}, + src: []float64{3, 5, 7}, + want: []float64{3, 5, 7}, + }, + { + dst: []float64{1, 2, 3, 4}, + src: []float64{2, 4, 6, 8}, + want: []float64{2, 4, 6, 8}, + }, + { + dst: []float64{1, 2, 3, 4}, + src: []float64{2, 4, 6}, + want: []float64{2, 4, 6, 4}, + }, + { + dst: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9}, + src: []float64{2, 8, 32, 128, 512, 2048, 8192, 32768}, + want: []float64{2, 8, 32, 128, 512, 2048, 8192, 32768}, + }, + { + dst: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + src: []float64{2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048}, + want: []float64{2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048}, + }, + { + dst: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + src: []float64{2, 8, 32, 128, 512, 2048, 8192, 32768, 131072, 524288, 2097152}, + want: []float64{2, 8, 32, 128, 512, 2048, 8192, 32768, 131072, 524288, 2097152}, + }, + } { + n := len(test.src) + for _, inc := range newIncSet(1, 2, 3, 4, 7) { + prefix := fmt.Sprintf("Test %v (x:%v y:%v)", i, inc.x, inc.y) + sg := guardIncVector(test.src, sGdVal, inc.x, gdLn) + dg := guardIncVector(test.dst, dGdVal, inc.y, gdLn) + src, dst := sg[gdLn:len(sg)-gdLn], dg[gdLn:len(dg)-gdLn] + + Copy(n, dst, inc.y, src, inc.x) + + for i := range test.want { + if !same(dst[i*inc.y], test.want[i]) { + t.Errorf(msgVal, prefix, i, dst[i*inc.y], test.want[i]) + } + } + if !equalStrided(test.src, src, inc.x) { + t.Errorf("%v: 
modified read-only src argument", prefix) + } + checkValidIncGuard(t, sg, sGdVal, inc.x, gdLn) + checkValidIncGuard(t, dg, dGdVal, inc.y, gdLn) + if t.Failed() { + t.Error(src, dst) + break + } + } + } +} + func TestCumSum(t *testing.T) { var src_gd, dst_gd float64 = -1, 0 for j, v := range []struct {