Combine: Optimize for all nil (#55)

The current implementation of Combine returns the input slice in the
returned multierr. This causes it to escape to the heap unconditionally.
Optimizing for the no-errors case, we can copy the slice in that case
and keep it on the stack.

```
name                                old time/op    new time/op    delta
Combine/inline_1-8                    17.7ns ± 0%     2.1ns ± 0%   -88.20%  (p=0.008 n=5+5)
Combine/inline_2-8                    21.0ns ± 0%     4.4ns ± 1%   -79.03%  (p=0.008 n=5+5)
Combine/inline_3_no_error-8           24.4ns ± 0%     5.1ns ± 1%   -79.22%  (p=0.016 n=4+5)
Combine/inline_3_one_error-8          24.8ns ± 0%     5.4ns ± 1%   -78.42%  (p=0.008 n=5+5)
Combine/inline_3_multiple_errors-8    44.3ns ± 0%    54.9ns ± 0%   +23.80%  (p=0.008 n=5+5)
Combine/slice_100_no_errors-8         72.9ns ± 0%    73.4ns ± 1%    +0.68%  (p=0.008 n=5+5)
Combine/slice_100_one_error-8         74.5ns ± 0%    74.8ns ± 0%      ~     (p=0.056 n=5+5)
Combine/slice_100_multi_error-8        193ns ± 0%     193ns ± 0%      ~     (p=0.127 n=5+5)

name                                old alloc/op   new alloc/op   delta
Combine/inline_1-8                     16.0B ± 0%      0.0B       -100.00%  (p=0.008 n=5+5)
Combine/inline_2-8                     32.0B ± 0%      0.0B       -100.00%  (p=0.008 n=5+5)
Combine/inline_3_no_error-8            48.0B ± 0%      0.0B       -100.00%  (p=0.008 n=5+5)
Combine/inline_3_one_error-8           48.0B ± 0%      0.0B       -100.00%  (p=0.008 n=5+5)
Combine/inline_3_multiple_errors-8     80.0B ± 0%     80.0B ± 0%      ~     (all equal)
Combine/slice_100_no_errors-8          0.00B          0.00B           ~     (all equal)
Combine/slice_100_one_error-8          0.00B          0.00B           ~     (all equal)
Combine/slice_100_multi_error-8        64.0B ± 0%     64.0B ± 0%      ~     (all equal)

name                                old allocs/op  new allocs/op  delta
Combine/inline_1-8                      1.00 ± 0%      0.00       -100.00%  (p=0.008 n=5+5)
Combine/inline_2-8                      1.00 ± 0%      0.00       -100.00%  (p=0.008 n=5+5)
Combine/inline_3_no_error-8             1.00 ± 0%      0.00       -100.00%  (p=0.008 n=5+5)
Combine/inline_3_one_error-8            1.00 ± 0%      0.00       -100.00%  (p=0.008 n=5+5)
Combine/inline_3_multiple_errors-8      2.00 ± 0%      2.00 ± 0%      ~     (all equal)
Combine/slice_100_no_errors-8           0.00           0.00           ~     (all equal)
Combine/slice_100_one_error-8           0.00           0.00           ~     (all equal)
Combine/slice_100_multi_error-8         2.00 ± 0%      2.00 ± 0%      ~     (all equal)
```

Co-authored-by: Abhinav Gupta <abg@uber.com>
diff --git a/benchmarks_test.go b/benchmarks_test.go
index 797f5a0..562d3bc 100644
--- a/benchmarks_test.go
+++ b/benchmarks_test.go
@@ -60,3 +60,68 @@
 		}
 	}
 }
+
+func BenchmarkCombine(b *testing.B) {
+	b.Run("inline 1", func(b *testing.B) {
+		var x error
+		for i := 0; i < b.N; i++ {
+			Combine(x)
+		}
+	})
+
+	b.Run("inline 2", func(b *testing.B) {
+		var x, y error
+		for i := 0; i < b.N; i++ {
+			Combine(x, y)
+		}
+	})
+
+	b.Run("inline 3 no error", func(b *testing.B) {
+		var x, y, z error
+		for i := 0; i < b.N; i++ {
+			Combine(x, y, z)
+		}
+	})
+
+	b.Run("inline 3 one error", func(b *testing.B) {
+		var x, y, z error
+		z = fmt.Errorf("failed")
+		for i := 0; i < b.N; i++ {
+			Combine(x, y, z)
+		}
+	})
+
+	b.Run("inline 3 multiple errors", func(b *testing.B) {
+		var x, y, z error
+		z = fmt.Errorf("failed3")
+		y = fmt.Errorf("failed2")
+		x = fmt.Errorf("failed")
+		for i := 0; i < b.N; i++ {
+			Combine(x, y, z)
+		}
+	})
+
+	b.Run("slice 100 no errors", func(b *testing.B) {
+		errs := make([]error, 100)
+		for i := 0; i < b.N; i++ {
+			Combine(errs...)
+		}
+	})
+
+	b.Run("slice 100 one error", func(b *testing.B) {
+		errs := make([]error, 100)
+		errs[len(errs)-1] = fmt.Errorf("failed")
+		for i := 0; i < b.N; i++ {
+			Combine(errs...)
+		}
+	})
+
+	b.Run("slice 100 multi error", func(b *testing.B) {
+		errs := make([]error, 100)
+		errs[0] = fmt.Errorf("failed1")
+		errs[len(errs)-1] = fmt.Errorf("failed2")
+		for i := 0; i < b.N; i++ {
+			Combine(errs...)
+		}
+	})
+}
diff --git a/error.go b/error.go
index 13a020a..f45af14 100644
--- a/error.go
+++ b/error.go
@@ -372,6 +372,14 @@
 
 // fromSlice converts the given list of errors into a single error.
 func fromSlice(errors []error) error {
+	// Don't pay to inspect small slices.
+	switch len(errors) {
+	case 0:
+		return nil
+	case 1:
+		return errors[0]
+	}
+
 	res := inspect(errors)
 	switch res.Count {
 	case 0:
@@ -381,8 +389,13 @@
 		return errors[res.FirstErrorIdx]
 	case len(errors):
 		if !res.ContainsMultiError {
-			// already flat
-			return &multiError{errors: errors}
+			// Error list is flat. Make a copy of it
+			// Otherwise "errors" escapes to the heap
+			// unconditionally for all other cases.
+			// This lets us optimize for the "no errors" case.
+			out := make([]error, len(errors))
+			copy(out, errors)
+			return &multiError{errors: out}
 		}
 	}
 
diff --git a/error_test.go b/error_test.go
index 3dd4884..c053167 100644
--- a/error_test.go
+++ b/error_test.go
@@ -98,6 +98,12 @@
 			wantSingleline: "foo; bar",
 		},
 		{
+			giveErrors:     []error{nil, nil, errors.New("great sadness"), nil},
+			wantError:      errors.New("great sadness"),
+			wantMultiline:  "great sadness",
+			wantSingleline: "great sadness",
+		},
+		{
 			giveErrors: []error{
 				errors.New("foo"),
 				newMultiErr(
@@ -273,6 +279,14 @@
 	assert.Nil(t, errors[1], 3)
 }
 
+func TestCombineGoodCaseNoAlloc(t *testing.T) {
+	errs := make([]error, 10)
+	allocs := testing.AllocsPerRun(100, func() {
+		Combine(errs...)
+	})
+	assert.Equal(t, 0.0, allocs)
+}
+
 func TestAppend(t *testing.T) {
 	tests := []struct {
 		left  error