Support the COPY_4 tag.

COPY_4 is a valid encoding, even if it is no longer issued by most encoders.

Benchmark results before (old) and after (new) this change:

name              old speed      new speed      delta
WordsDecode1e1-8   525MB/s ± 0%   504MB/s ± 1%  -4.04%   (p=0.000 n=9+10)
WordsDecode1e2-8  1.23GB/s ± 0%  1.23GB/s ± 1%    ~      (p=0.678 n=10+9)
WordsDecode1e3-8  1.54GB/s ± 0%  1.53GB/s ± 1%  -0.75%   (p=0.000 n=10+9)
WordsDecode1e4-8  1.53GB/s ± 0%  1.51GB/s ± 3%  -1.46%   (p=0.000 n=9+10)
WordsDecode1e5-8   793MB/s ± 0%   777MB/s ± 2%  -2.01%   (p=0.017 n=9+10)
WordsDecode1e6-8   917MB/s ± 1%   917MB/s ± 1%    ~      (p=0.473 n=8+10)
WordsEncode1e1-8   641MB/s ± 2%   641MB/s ± 0%    ~      (p=0.780 n=10+9)
WordsEncode1e2-8   583MB/s ± 0%   580MB/s ± 0%  -0.41%   (p=0.001 n=10+9)
WordsEncode1e3-8   647MB/s ± 1%   648MB/s ± 0%    ~      (p=0.326 n=10+9)
WordsEncode1e4-8   442MB/s ± 1%   452MB/s ± 0%  +2.20%   (p=0.000 n=10+8)
WordsEncode1e5-8   355MB/s ± 1%   355MB/s ± 0%    ~      (p=0.880 n=10+8)
WordsEncode1e6-8   433MB/s ± 0%   434MB/s ± 0%    ~       (p=0.700 n=8+8)
RandomEncode-8    14.2GB/s ± 3%  14.2GB/s ± 3%    ~      (p=0.780 n=10+9)
_UFlat0-8         2.18GB/s ± 1%  2.19GB/s ± 0%    ~      (p=0.447 n=10+9)
_UFlat1-8         1.40GB/s ± 2%  1.41GB/s ± 0%  +0.73%   (p=0.043 n=9+10)
_UFlat2-8         23.4GB/s ± 3%  23.5GB/s ± 2%    ~      (p=0.497 n=9+10)
_UFlat3-8         1.90GB/s ± 0%  1.91GB/s ± 0%  +0.30%    (p=0.002 n=8+9)
_UFlat4-8         13.9GB/s ± 2%  14.0GB/s ± 1%    ~      (p=0.720 n=9+10)
_UFlat5-8         1.96GB/s ± 1%  1.97GB/s ± 0%  +0.81%   (p=0.000 n=10+9)
_UFlat6-8          813MB/s ± 0%   814MB/s ± 0%  +0.17%   (p=0.037 n=8+10)
_UFlat7-8          783MB/s ± 2%   785MB/s ± 0%    ~       (p=0.340 n=9+9)
_UFlat8-8          859MB/s ± 0%   857MB/s ± 0%    ~       (p=0.074 n=8+9)
_UFlat9-8          719MB/s ± 1%   719MB/s ± 1%    ~      (p=0.621 n=10+9)
_UFlat10-8        2.84GB/s ± 0%  2.84GB/s ± 0%  +0.19%   (p=0.043 n=10+9)
_UFlat11-8        1.05GB/s ± 1%  1.05GB/s ± 0%    ~       (p=0.523 n=9+8)
_ZFlat0-8         1.04GB/s ± 2%  1.04GB/s ± 0%    ~       (p=0.222 n=9+9)
_ZFlat1-8          535MB/s ± 0%   534MB/s ± 0%    ~       (p=0.059 n=9+9)
_ZFlat2-8         15.6GB/s ± 3%  15.7GB/s ± 1%    ~      (p=0.720 n=9+10)
_ZFlat3-8          723MB/s ± 0%   740MB/s ± 3%  +2.36%   (p=0.034 n=8+10)
_ZFlat4-8         9.16GB/s ± 1%  9.20GB/s ± 1%    ~       (p=0.297 n=9+9)
_ZFlat5-8          987MB/s ± 1%   991MB/s ± 0%    ~       (p=0.167 n=9+8)
_ZFlat6-8          378MB/s ± 2%   379MB/s ± 0%    ~       (p=0.334 n=9+8)
_ZFlat7-8          350MB/s ± 2%   352MB/s ± 0%  +0.60%    (p=0.014 n=9+8)
_ZFlat8-8          397MB/s ± 0%   396MB/s ± 1%    ~      (p=0.965 n=8+10)
_ZFlat9-8          328MB/s ± 0%   327MB/s ± 1%    ~       (p=0.409 n=8+9)
_ZFlat10-8        1.33GB/s ± 0%  1.33GB/s ± 1%    ~      (p=0.356 n=9+10)
_ZFlat11-8         605MB/s ± 0%   605MB/s ± 1%    ~       (p=0.743 n=9+8)
diff --git a/README b/README
index 6b13826..cea1287 100644
--- a/README
+++ b/README
@@ -13,65 +13,65 @@
 The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten
 or so files, the same set used by the C++ Snappy code (github.com/google/snappy
 and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @
-3.40GHz", Go's GOARCH=amd64 numbers as of 2016-04-29:
+3.40GHz", Go's GOARCH=amd64 numbers as of 2016-05-29:
 
 "go test -test.bench=."
 
-_UFlat0-8         2.23GB/s ± 1%  html
-_UFlat1-8         1.43GB/s ± 0%  urls
-_UFlat2-8         23.7GB/s ± 1%  jpg
-_UFlat3-8         1.93GB/s ± 0%  jpg_200
-_UFlat4-8         13.9GB/s ± 2%  pdf
-_UFlat5-8         2.00GB/s ± 0%  html4
-_UFlat6-8          829MB/s ± 0%  txt1
-_UFlat7-8          799MB/s ± 0%  txt2
-_UFlat8-8          871MB/s ± 0%  txt3
-_UFlat9-8          730MB/s ± 0%  txt4
-_UFlat10-8        2.87GB/s ± 0%  pb
-_UFlat11-8        1.07GB/s ± 0%  gaviota
+_UFlat0-8         2.19GB/s ± 0%  html
+_UFlat1-8         1.41GB/s ± 0%  urls
+_UFlat2-8         23.5GB/s ± 2%  jpg
+_UFlat3-8         1.91GB/s ± 0%  jpg_200
+_UFlat4-8         14.0GB/s ± 1%  pdf
+_UFlat5-8         1.97GB/s ± 0%  html4
+_UFlat6-8          814MB/s ± 0%  txt1
+_UFlat7-8          785MB/s ± 0%  txt2
+_UFlat8-8          857MB/s ± 0%  txt3
+_UFlat9-8          719MB/s ± 1%  txt4
+_UFlat10-8        2.84GB/s ± 0%  pb
+_UFlat11-8        1.05GB/s ± 0%  gaviota
 
 _ZFlat0-8         1.04GB/s ± 0%  html
-_ZFlat1-8          536MB/s ± 0%  urls
-_ZFlat2-8         16.3GB/s ± 2%  jpg
-_ZFlat3-8          762MB/s ± 0%  jpg_200
-_ZFlat4-8         9.48GB/s ± 1%  pdf
-_ZFlat5-8          990MB/s ± 0%  html4
-_ZFlat6-8          381MB/s ± 0%  txt1
-_ZFlat7-8          353MB/s ± 0%  txt2
-_ZFlat8-8          398MB/s ± 0%  txt3
-_ZFlat9-8          329MB/s ± 0%  txt4
-_ZFlat10-8        1.35GB/s ± 1%  pb
-_ZFlat11-8         608MB/s ± 0%  gaviota
+_ZFlat1-8          534MB/s ± 0%  urls
+_ZFlat2-8         15.7GB/s ± 1%  jpg
+_ZFlat3-8          740MB/s ± 3%  jpg_200
+_ZFlat4-8         9.20GB/s ± 1%  pdf
+_ZFlat5-8          991MB/s ± 0%  html4
+_ZFlat6-8          379MB/s ± 0%  txt1
+_ZFlat7-8          352MB/s ± 0%  txt2
+_ZFlat8-8          396MB/s ± 1%  txt3
+_ZFlat9-8          327MB/s ± 1%  txt4
+_ZFlat10-8        1.33GB/s ± 1%  pb
+_ZFlat11-8         605MB/s ± 1%  gaviota
 
 
 
 "go test -test.bench=. -tags=noasm"
 
-_UFlat0-8          637MB/s ± 0%  html
-_UFlat1-8          506MB/s ± 0%  urls
-_UFlat2-8         23.0GB/s ± 5%  jpg
-_UFlat3-8         1.17GB/s ± 0%  jpg_200
-_UFlat4-8         4.44GB/s ± 1%  pdf
-_UFlat5-8          623MB/s ± 0%  html4
-_UFlat6-8          300MB/s ± 1%  txt1
-_UFlat7-8          293MB/s ± 0%  txt2
-_UFlat8-8          316MB/s ± 0%  txt3
-_UFlat9-8          285MB/s ± 0%  txt4
-_UFlat10-8         768MB/s ± 0%  pb
-_UFlat11-8         406MB/s ± 1%  gaviota
+_UFlat0-8          621MB/s ± 2%  html
+_UFlat1-8          494MB/s ± 1%  urls
+_UFlat2-8         23.2GB/s ± 1%  jpg
+_UFlat3-8         1.12GB/s ± 1%  jpg_200
+_UFlat4-8         4.35GB/s ± 1%  pdf
+_UFlat5-8          609MB/s ± 0%  html4
+_UFlat6-8          296MB/s ± 0%  txt1
+_UFlat7-8          288MB/s ± 0%  txt2
+_UFlat8-8          309MB/s ± 1%  txt3
+_UFlat9-8          280MB/s ± 1%  txt4
+_UFlat10-8         753MB/s ± 0%  pb
+_UFlat11-8         400MB/s ± 0%  gaviota
 
-_ZFlat0-8          411MB/s ± 1%  html
+_ZFlat0-8          409MB/s ± 1%  html
 _ZFlat1-8          250MB/s ± 1%  urls
-_ZFlat2-8         12.7GB/s ± 1%  jpg
-_ZFlat3-8          157MB/s ± 0%  jpg_200
-_ZFlat4-8         2.95GB/s ± 0%  pdf
-_ZFlat5-8          406MB/s ± 0%  html4
-_ZFlat6-8          182MB/s ± 0%  txt1
-_ZFlat7-8          173MB/s ± 1%  txt2
-_ZFlat8-8          191MB/s ± 0%  txt3
-_ZFlat9-8          166MB/s ± 0%  txt4
-_ZFlat10-8         480MB/s ± 0%  pb
-_ZFlat11-8         272MB/s ± 0%  gaviota
+_ZFlat2-8         12.3GB/s ± 1%  jpg
+_ZFlat3-8          132MB/s ± 0%  jpg_200
+_ZFlat4-8         2.92GB/s ± 0%  pdf
+_ZFlat5-8          405MB/s ± 1%  html4
+_ZFlat6-8          179MB/s ± 1%  txt1
+_ZFlat7-8          170MB/s ± 1%  txt2
+_ZFlat8-8          189MB/s ± 1%  txt3
+_ZFlat9-8          164MB/s ± 1%  txt4
+_ZFlat10-8         479MB/s ± 1%  pb
+_ZFlat11-8         270MB/s ± 1%  gaviota
 
 
 
diff --git a/decode.go b/decode.go
index 819c717..72efb03 100644
--- a/decode.go
+++ b/decode.go
@@ -18,7 +18,6 @@
 	// ErrUnsupported reports that the input isn't supported.
 	ErrUnsupported = errors.New("snappy: unsupported input")
 
-	errUnsupportedCopy4Tag      = errors.New("snappy: unsupported COPY_4 tag")
 	errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length")
 )
 
@@ -46,7 +45,6 @@
 const (
 	decodeErrCodeCorrupt                  = 1
 	decodeErrCodeUnsupportedLiteralLength = 2
-	decodeErrCodeUnsupportedCopy4Tag      = 3
 )
 
 // Decode returns the decoded form of src. The returned slice may be a sub-
@@ -69,8 +67,6 @@
 		return dst, nil
 	case decodeErrCodeUnsupportedLiteralLength:
 		return nil, errUnsupportedLiteralLength
-	case decodeErrCodeUnsupportedCopy4Tag:
-		return nil, errUnsupportedCopy4Tag
 	}
 	return nil, ErrCorrupt
 }
diff --git a/decode_amd64.s b/decode_amd64.s
index ed1e93b..e6179f6 100644
--- a/decode_amd64.s
+++ b/decode_amd64.s
@@ -226,6 +226,25 @@
 // ----------------------------------------
 // The code below handles copy tags.
 
+tagCopy4:
+	// case tagCopy4:
+	// s += 5
+	ADDQ $5, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// length = 1 + int(src[s-5])>>2
+	SHRQ $2, CX
+	INCQ CX
+
+	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+	MOVLQZX -4(SI), DX
+	JMP     doCopy
+
 tagCopy2:
 	// case tagCopy2:
 	// s += 3
@@ -241,7 +260,7 @@
 	SHRQ $2, CX
 	INCQ CX
 
-	// offset = int(src[s-2]) | int(src[s-1])<<8
+	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
 	MOVWQZX -2(SI), DX
 	JMP     doCopy
 
@@ -251,7 +270,7 @@
 	//	- CX == src[s]
 	CMPQ BX, $2
 	JEQ  tagCopy2
-	JA   errUC4T
+	JA   tagCopy4
 
 	// case tagCopy1:
 	// s += 2
@@ -263,7 +282,7 @@
 	CMPQ BX, R12
 	JA   errCorrupt
 
-	// offset = int(src[s-2])&0xe0<<3 | int(src[s-1])
+	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
 	MOVQ    CX, DX
 	ANDQ    $0xe0, DX
 	SHLQ    $3, DX
@@ -469,8 +488,3 @@
 	// return decodeErrCodeCorrupt
 	MOVQ $1, ret+48(FP)
 	RET
-
-errUC4T:
-	// return decodeErrCodeUnsupportedCopy4Tag
-	MOVQ $3, ret+48(FP)
-	RET
diff --git a/decode_other.go b/decode_other.go
index f305b6f..8c9f204 100644
--- a/decode_other.go
+++ b/decode_other.go
@@ -63,7 +63,7 @@
 				return decodeErrCodeCorrupt
 			}
 			length = 4 + int(src[s-2])>>2&0x7
-			offset = int(src[s-2])&0xe0<<3 | int(src[s-1])
+			offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
 
 		case tagCopy2:
 			s += 3
@@ -71,10 +71,15 @@
 				return decodeErrCodeCorrupt
 			}
 			length = 1 + int(src[s-3])>>2
-			offset = int(src[s-2]) | int(src[s-1])<<8
+			offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
 
 		case tagCopy4:
-			return decodeErrCodeUnsupportedCopy4Tag
+			s += 5
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 1 + int(src[s-5])>>2
+			offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
 		}
 
 		if offset <= 0 || d < offset || length > len(dst)-d {
diff --git a/snappy.go b/snappy.go
index 0102542..0cf5e37 100644
--- a/snappy.go
+++ b/snappy.go
@@ -32,7 +32,10 @@
   - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
     The length is 1 + m. The offset is the little-endian unsigned integer
     denoted by the next 2 bytes.
-  - For l == 3, this tag is a legacy format that is no longer supported.
+  - For l == 3, this tag is a legacy format that is no longer issued by most
+    encoders. Nonetheless, the offset ranges in [0, 1<<32) and the length in
+    [1, 65). The length is 1 + m. The offset is the little-endian unsigned
+    integer denoted by the next 4 bytes.
 */
 const (
 	tagLiteral = 0x00
diff --git a/snappy_test.go b/snappy_test.go
index ce3f08e..2712710 100644
--- a/snappy_test.go
+++ b/snappy_test.go
@@ -257,10 +257,10 @@
 		"",
 		ErrCorrupt,
 	}, {
-		`decodedLen=4; tagCopy4; unsupported COPY_4 tag`,
-		"\x04" + "\x03\x00\x00\x00\x00",
+		`decodedLen=4; tagCopy4, 4 extra length|offset bytes; not enough extra bytes`,
+		"\x04" + "\x03\x00\x00\x00",
 		"",
-		errUnsupportedCopy4Tag,
+		ErrCorrupt,
 	}, {
 		`decodedLen=4; tagLiteral (4 bytes "abcd"); valid input`,
 		"\x04" + "\x0cabcd",
@@ -311,6 +311,11 @@
 		"\x06" + "\x0cabcd" + "\x06\x03\x00",
 		"abcdbc",
 		nil,
+	}, {
+		`decodedLen=6; tagLiteral (4 bytes "abcd"); tagCopy4; length=2 offset=3; valid input`,
+		"\x06" + "\x0cabcd" + "\x07\x03\x00\x00\x00",
+		"abcdbc",
+		nil,
 	}}
 
 	const (
@@ -369,6 +374,34 @@
 	}
 }
 
+func TestDecodeCopy4(t *testing.T) {
+	dots := strings.Repeat(".", 65536)
+
+	input := strings.Join([]string{
+		"\x89\x80\x04",         // decodedLen = 65545.
+		"\x0cpqrs",             // 4-byte literal "pqrs".
+		"\xf4\xff\xff" + dots,  // 65536-byte literal dots.
+		"\x13\x04\x00\x01\x00", // tagCopy4; length=5 offset=65540.
+	}, "")
+
+	gotBytes, err := Decode(nil, []byte(input))
+	if err != nil {
+		t.Fatal(err)
+	}
+	got := string(gotBytes)
+	want := "pqrs" + dots + "pqrs."
+	if len(got) != len(want) {
+		t.Fatalf("got %d bytes, want %d", len(got), len(want))
+	}
+	if got != want {
+		for i := 0; i < len(got); i++ {
+			if g, w := got[i], want[i]; g != w {
+				t.Fatalf("byte #%d: got %#02x, want %#02x", i, g, w)
+			}
+		}
+	}
+}
+
 // TestDecodeLengthOffset tests decoding an encoding of the form literal +
 // copy-length-offset + literal. For example: "abcdefghijkl" + "efghij" + "AB".
 func TestDecodeLengthOffset(t *testing.T) {