| // Copyright 2017 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
// Package tokenizer converts text into a stream of tokens.
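//
// A sketch of typical use (illustrative only; the exact output format is
// not part of the API):
//
//	toks := tokenizer.Tokenize("Hello, world")
//	h := tokenizer.Hash{}
//	checksums, ranges := toks.GenerateHashes(h, 2)
//	// checksums holds one CRC-32 per window of two tokens, and ranges
//	// holds the half-open token ranges those checksums cover.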
| package tokenizer |
| |
| import ( |
| "bytes" |
| "fmt" |
| "hash/crc32" |
| "sort" |
| "unicode" |
| "unicode/utf8" |
| ) |
| |
// token is a non-whitespace sequence (i.e., a word or punctuation mark) in
// the original string. It is not meant for use outside of this package.
| type token struct { |
| Text string |
| Offset int |
| } |
| |
// Tokens is a list of token objects.
| type Tokens []*token |
| |
// newToken creates a new token object with an invalid (negative) offset,
// which is set to the token's real position before the token is used.
| func newToken() *token { |
| return &token{Offset: -1} |
| } |
| |
| // Tokenize converts a string into a stream of tokens. |
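// Whitespace separates tokens, and each punctuation rune becomes a token of
// its own. An illustration (offsets are byte offsets into s):
//
//	Tokenize("Hello, world")
//	// yields {Hello 0}, {, 5}, {world 7}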
| func Tokenize(s string) (toks Tokens) { |
| tok := newToken() |
| for i := 0; i < len(s); { |
| r, size := utf8.DecodeRuneInString(s[i:]) |
| switch { |
| case unicode.IsSpace(r): |
| if tok.Offset >= 0 { |
| toks = append(toks, tok) |
| tok = newToken() |
| } |
| case unicode.IsPunct(r): |
| if tok.Offset >= 0 { |
| toks = append(toks, tok) |
| tok = newToken() |
| } |
| toks = append(toks, &token{ |
| Text: string(r), |
| Offset: i, |
| }) |
| default: |
| if tok.Offset == -1 { |
| tok.Offset = i |
| } |
| tok.Text += string(r) |
| } |
| i += size |
| } |
| if tok.Offset != -1 { |
| // Add any remaining token that wasn't yet included in the list. |
| toks = append(toks, tok) |
| } |
| return toks |
| } |
| |
// GenerateHashes computes checksums for token windows of length "size",
// sliding the window by size/2 tokens each step. The "stringifyTokens" call
// takes a long time to run, so rather than hashing every possible substring
// we hash only these overlapping windows.
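//
// A worked illustration: with len(t) == 10 and size == 4, the hashed windows
// start at offsets 0, 2, 4, and 6, each covering the half-open token range
// [offset, offset+4).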
| func (t Tokens) GenerateHashes(h Hash, size int) ([]uint32, TokenRanges) { |
| if size == 0 { |
| return nil, nil |
| } |
| |
| var css []uint32 |
| var tr TokenRanges |
| for offset := 0; offset+size <= len(t); offset += size / 2 { |
| var b bytes.Buffer |
| t.stringifyTokens(&b, offset, size) |
| cs := crc32.ChecksumIEEE(b.Bytes()) |
| css = append(css, cs) |
| tr = append(tr, &TokenRange{offset, offset + size}) |
| h.add(cs, offset, offset+size) |
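		// With size <= 1 the stride (size / 2) is zero, so stop after
		// the first window to avoid looping forever.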
| if size <= 1 { |
| break |
| } |
| } |
| |
| return css, tr |
| } |
| |
// stringifyTokens serializes the tokens in [offset, offset+size) into the
// buffer, separated by single spaces.
| func (t Tokens) stringifyTokens(b *bytes.Buffer, offset, size int) { |
| for j := offset; j < offset+size; j++ { |
| if j != offset { |
| b.WriteRune(' ') |
| } |
| b.WriteString(t[j].Text) |
| } |
| } |
| |
// TokenRange indicates the half-open range [Start, End) of tokens that map
// to a particular checksum.
| type TokenRange struct { |
| Start int |
| End int |
| } |
| |
| func (t *TokenRange) String() string { |
| return fmt.Sprintf("[%v, %v)", t.Start, t.End) |
| } |
| |
// TokenRanges is a list of TokenRange objects. The chance that two different
// strings map to the same checksum is very small but not zero, so a checksum
// maps to a list of ranges rather than to a single range.
| type TokenRanges []*TokenRange |
| |
| func (t TokenRanges) Len() int { return len(t) } |
| func (t TokenRanges) Swap(i, j int) { t[i], t[j] = t[j], t[i] } |
| func (t TokenRanges) Less(i, j int) bool { return t[i].Start < t[j].Start } |
| |
// CombineUnique returns the sorted combination of both token range lists,
// with exact duplicates removed.
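//
// A hedged illustration:
//
//	a := TokenRanges{{Start: 0, End: 4}, {Start: 2, End: 6}}
//	b := TokenRanges{{Start: 2, End: 6}, {Start: 4, End: 8}}
//	a.CombineUnique(b) // => [0, 4) [2, 6) [4, 8)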
| func (t TokenRanges) CombineUnique(other TokenRanges) TokenRanges { |
| if len(other) == 0 { |
| return t |
| } |
| if len(t) == 0 { |
| return other |
| } |
| |
	// Merge into a fresh slice: appending directly to t could share its
	// backing array, and sorting that would reorder t's elements as a
	// side effect.
	cu := make(TokenRanges, 0, len(t)+len(other))
	cu = append(cu, t...)
	cu = append(cu, other...)
	sort.Sort(cu)
| |
| res := TokenRanges{cu[0]} |
| for prev, i := cu[0], 1; i < len(cu); i++ { |
| if prev.Start != cu[i].Start || prev.End != cu[i].End { |
| res = append(res, cu[i]) |
| prev = cu[i] |
| } |
| } |
| return res |
| } |
| |
// Hash maps the checksum of a section of text to the token ranges covering
// that text.
| type Hash map[uint32]TokenRanges |
| |
// add associates the half-open token range [start, end) with a checksum,
// unless that exact range is already recorded for the checksum.
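//
// A hedged illustration: re-adding an existing range is a no-op.
//
//	h := Hash{}
//	h.add(0xdeadbeef, 0, 4)
//	h.add(0xdeadbeef, 0, 4) // already present; h[0xdeadbeef] keeps one entry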
| func (h Hash) add(checksum uint32, start, end int) { |
| ntr := &TokenRange{Start: start, End: end} |
| if r, ok := h[checksum]; ok { |
| for _, tr := range r { |
| if tr.Start == ntr.Start && tr.End == ntr.End { |
| // The token range already exists at this |
| // checksum. No need to re-add it. |
| return |
| } |
| } |
| } |
| h[checksum] = append(h[checksum], ntr) |
| } |