| // Copyright 2010 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| // Package utf16 implements encoding and decoding of UTF-16 sequences. |
| package utf16 |
| |
| // The conditions replacementChar==unicode.ReplacementChar and |
| // maxRune==unicode.MaxRune are verified in the tests. |
| // Defining them locally avoids this package depending on package unicode. |
| |
const (
	// replacementChar is U+FFFD, the Unicode replacement character.
	replacementChar = '\uFFFD'
	// maxRune is U+10FFFF, the maximum valid Unicode code point.
	maxRune = '\U0010FFFF'
)
| |
// Boundaries used for surrogate handling. A code point at or above
// surrSelf is written as two 16-bit units: the first lies in
// [surr1, surr2) and carries the high 10 bits, the second lies in
// [surr2, surr3) and carries the low 10 bits. The code point is
// those 20 bits plus surrSelf.
const (
	surr1 = 0xd800 // start of the high (first) surrogates
	surr2 = 0xdc00 // start of the low (second) surrogates
	surr3 = 0xe000 // first value past the surrogate range

	surrSelf = 0x10000 // offset added to the 20 bits of a pair
)
| |
| // IsSurrogate reports whether the specified Unicode code point |
| // can appear in a surrogate pair. |
| func IsSurrogate(r rune) bool { |
| return surr1 <= r && r < surr3 |
| } |
| |
| // DecodeRune returns the UTF-16 decoding of a surrogate pair. |
| // If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns |
| // the Unicode replacement code point U+FFFD. |
| func DecodeRune(r1, r2 rune) rune { |
| if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 { |
| return (r1-surr1)<<10 | (r2 - surr2) + surrSelf |
| } |
| return replacementChar |
| } |
| |
| // EncodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune. |
| // If the rune is not a valid Unicode code point or does not need encoding, |
| // EncodeRune returns U+FFFD, U+FFFD. |
| func EncodeRune(r rune) (r1, r2 rune) { |
| if r < surrSelf || r > maxRune { |
| return replacementChar, replacementChar |
| } |
| r -= surrSelf |
| return surr1 + (r>>10)&0x3ff, surr2 + r&0x3ff |
| } |
| |
| // Encode returns the UTF-16 encoding of the Unicode code point sequence s. |
| func Encode(s []rune) []uint16 { |
| n := len(s) |
| for _, v := range s { |
| if v >= surrSelf { |
| n++ |
| } |
| } |
| |
| a := make([]uint16, n) |
| n = 0 |
| for _, v := range s { |
| switch { |
| case 0 <= v && v < surr1, surr3 <= v && v < surrSelf: |
| // normal rune |
| a[n] = uint16(v) |
| n++ |
| case surrSelf <= v && v <= maxRune: |
| // needs surrogate sequence |
| r1, r2 := EncodeRune(v) |
| a[n] = uint16(r1) |
| a[n+1] = uint16(r2) |
| n += 2 |
| default: |
| a[n] = uint16(replacementChar) |
| n++ |
| } |
| } |
| return a[:n] |
| } |
| |
| // AppendRune appends the UTF-16 encoding of the Unicode code point r |
| // to the end of p and returns the extended buffer. If the rune is not |
| // a valid Unicode code point, it appends the encoding of U+FFFD. |
| func AppendRune(a []uint16, r rune) []uint16 { |
| // This function is inlineable for fast handling of ASCII. |
| switch { |
| case 0 <= r && r < surr1, surr3 <= r && r < surrSelf: |
| // normal rune |
| return append(a, uint16(r)) |
| case surrSelf <= r && r <= maxRune: |
| // needs surrogate sequence |
| r1, r2 := EncodeRune(r) |
| return append(a, uint16(r1), uint16(r2)) |
| } |
| return append(a, replacementChar) |
| } |
| |
| // Decode returns the Unicode code point sequence represented |
| // by the UTF-16 encoding s. |
| func Decode(s []uint16) []rune { |
| a := make([]rune, len(s)) |
| n := 0 |
| for i := 0; i < len(s); i++ { |
| switch r := s[i]; { |
| case r < surr1, surr3 <= r: |
| // normal rune |
| a[n] = rune(r) |
| case surr1 <= r && r < surr2 && i+1 < len(s) && |
| surr2 <= s[i+1] && s[i+1] < surr3: |
| // valid surrogate sequence |
| a[n] = DecodeRune(rune(r), rune(s[i+1])) |
| i++ |
| default: |
| // invalid surrogate sequence |
| a[n] = replacementChar |
| } |
| n++ |
| } |
| return a[:n] |
| } |