| // Copyright 2017 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
// Package tokenizer converts text into a stream of tokens.
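//
// A sketch of typical use (illustrative only; the exact output format is
// not part of the API):
//
//	toks := tokenizer.Tokenize("Hello, world")
//	h := tokenizer.Hash{}
//	checksums, ranges := toks.GenerateHashes(h, 2)
//	// checksums holds one CRC-32 per window of two tokens, and ranges
//	// holds the half-open token ranges those checksums cover.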
| package tokenizer |
| |
| import ( |
| "bytes" |
| "fmt" |
| "hash/crc32" |
| "sort" |
| "unicode" |
| "unicode/utf8" |
| ) |
| |
// token is a non-whitespace sequence (i.e., a word or punctuation mark) in
// the original string. It is not meant for use outside of this package.
| type token struct { |
| Text string |
| Offset int |
| } |
| |
// Tokens is a list of token objects.
| type Tokens []*token |
| |
// newToken creates a new token object with an invalid (negative) offset,
// which is set to the token's real position before the token is used.
| func newToken() *token { |
| return &token{Offset: -1} |
| } |
| |
| // Tokenize converts a string into a stream of tokens. |
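// Whitespace separates tokens, and each punctuation rune becomes a token of
// its own. An illustration (offsets are byte offsets into s):
//
//	Tokenize("Hello, world")
//	// yields {Hello 0}, {, 5}, {world 7}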
| func Tokenize(s string) (toks Tokens) { |
| tok := newToken() |
| for i := 0; i < len(s); { |
| r, size := utf8.DecodeRuneInString(s[i:]) |
| switch { |
| case unicode.IsSpace(r): |
| if tok.Offset >= 0 { |
| toks = append(toks, tok) |
| tok = newToken() |
| } |
| case unicode.IsPunct(r): |
| if tok.Offset >= 0 { |
| toks = append(toks, tok) |
| tok = newToken() |
| } |
| toks = append(toks, &token{ |
| Text: string(r), |
| Offset: i, |
| }) |
| default: |
| if tok.Offset == -1 { |
| tok.Offset = i |
| } |
| tok.Text += string(r) |
| } |
| i += size |
| } |
| if tok.Offset != -1 { |
| // Add any remaining token that wasn't yet included in the list. |
| toks = append(toks, tok) |
| } |
| return toks |
| } |
| |
// GenerateHashes computes checksums for token windows of length "size",
// sliding the window by size/2 tokens each step. The "stringifyTokens" call
// takes a long time to run, so rather than hashing every possible substring
// we hash only these overlapping windows.
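//
// A worked illustration: with len(t) == 10 and size == 4, the hashed windows
// start at offsets 0, 2, 4, and 6, each covering the half-open token range
// [offset, offset+4).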
| func (t Tokens) GenerateHashes(h Hash, size int) ([]uint32, TokenRanges) { |
| if size == 0 { |
| return nil, nil |
| } |
| |
| var css []uint32 |
| var tr TokenRanges |
| for offset := 0; offset+size <= len(t); offset += size / 2 { |
| var b bytes.Buffer |
| t.stringifyTokens(&b, offset, size) |
| cs := crc32.ChecksumIEEE(b.Bytes()) |
| css = append(css, cs) |
| tr = append(tr, &TokenRange{offset, offset + size}) |
| h.add(cs, offset, offset+size) |
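		// With size <= 1 the stride (size / 2) is zero, so stop after
		// the first window to avoid looping forever.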
| if size <= 1 { |
| break |
| } |
| } |
| |
| return css, tr |
| } |
| |
// stringifyTokens serializes the tokens in [offset, offset+size) into the
// buffer, separated by single spaces.
| func (t Tokens) stringifyTokens(b *bytes.Buffer, offset, size int) { |
| for j := offset; j < offset+size; j++ { |
| if j != offset { |
| b.WriteRune(' ') |
| } |
| b.WriteString(t[j].Text) |
| } |
| } |
| |
// TokenRange indicates the half-open range [Start, End) of tokens that map
// to a particular checksum.
| type TokenRange struct { |
| Start int |
| End int |
| } |
| |
| func (t *TokenRange) String() string { |
| return fmt.Sprintf("[%v, %v)", t.Start, t.End) |
| } |
| |
// TokenRanges is a list of TokenRange objects. The chance that two different
// strings map to the same checksum is very small but not zero, so a checksum
// maps to a list of ranges rather than to a single range.
| type TokenRanges []*TokenRange |
| |
| func (t TokenRanges) Len() int { return len(t) } |
| func (t TokenRanges) Swap(i, j int) { t[i], t[j] = t[j], t[i] } |
| func (t TokenRanges) Less(i, j int) bool { return t[i].Start < t[j].Start } |
| |
// CombineUnique returns the sorted combination of both token range lists,
// with exact duplicates removed.
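//
// A hedged illustration:
//
//	a := TokenRanges{{Start: 0, End: 4}, {Start: 2, End: 6}}
//	b := TokenRanges{{Start: 2, End: 6}, {Start: 4, End: 8}}
//	a.CombineUnique(b) // => [0, 4) [2, 6) [4, 8)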
| func (t TokenRanges) CombineUnique(other TokenRanges) TokenRanges { |
| if len(other) == 0 { |
| return t |
| } |
| if len(t) == 0 { |
| return other |
| } |
| |
	// Merge into a fresh slice: appending directly to t could share its
	// backing array, and sorting that would reorder t's elements as a
	// side effect.
	cu := make(TokenRanges, 0, len(t)+len(other))
	cu = append(cu, t...)
	cu = append(cu, other...)
	sort.Sort(cu)
| |
| res := TokenRanges{cu[0]} |
| for prev, i := cu[0], 1; i < len(cu); i++ { |
| if prev.Start != cu[i].Start || prev.End != cu[i].End { |
| res = append(res, cu[i]) |
| prev = cu[i] |
| } |
| } |
| return res |
| } |
| |
// Hash maps the checksum of a section of text to the token ranges covering
// that text.
| type Hash map[uint32]TokenRanges |
| |
// add associates the half-open token range [start, end) with a checksum,
// unless that exact range is already recorded for the checksum.
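//
// A hedged illustration: re-adding an existing range is a no-op.
//
//	h := Hash{}
//	h.add(0xdeadbeef, 0, 4)
//	h.add(0xdeadbeef, 0, 4) // already present; h[0xdeadbeef] keeps one entry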
| func (h Hash) add(checksum uint32, start, end int) { |
| ntr := &TokenRange{Start: start, End: end} |
| if r, ok := h[checksum]; ok { |
| for _, tr := range r { |
| if tr.Start == ntr.Start && tr.End == ntr.End { |
| // The token range already exists at this |
| // checksum. No need to re-add it. |
| return |
| } |
| } |
| } |
| h[checksum] = append(h[checksum], ntr) |
| } |