// Source blob: a01c43dacd7901bf19878e6d4800041b5bb3a7ab
// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package classifier
import (
"strings"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
)
// TestCleanupToken runs cleanupToken over a table of raw tokens and checks
// that each one comes back as the expected cleaned-up text.
func TestCleanupToken(t *testing.T) {
	type tokenCase struct {
		raw  string
		want string
	}

	cases := []tokenCase{
		// A word followed by punctuation is expected back without it.
		{raw: "cleanup!", want: "cleanup"},
		// A plain number is expected back unchanged.
		{raw: "12345", want: "12345"},
		// A token starting with a letter is expected to keep only its letters.
		{raw: "r1@zx42-", want: "rzx"},
		// A number followed by a comma is expected back without the comma.
		{raw: "12345,", want: "12345"},
		// A hyphen between digits is expected to survive.
		{raw: "12345-6789", want: "12345-6789"},
		// A digit followed by a parenthesised letter is expected to reduce to the digit.
		{raw: "1(a)", want: "1"},
		// Dots between digits are expected to survive.
		{raw: "1.2.3", want: "1.2.3"},
	}

	for i := range cases {
		tc := cases[i]
		got := cleanupToken(tc.raw)
		if got == tc.want {
			continue
		}
		t.Errorf("%q: got %q want %q", tc.raw, got, tc.want)
	}
}
// TestTokenize checks the full token stream produced by tokenize: for every
// expected token it compares the text, the running index and the line number.
func TestTokenize(t *testing.T) {
	tests := []struct {
		name   string
		input  string
		output *document
	}{
		{
			name: "basic scenario",
			// The input is spelled out line by line so that the blank
			// separator lines cannot be lost to whitespace mangling. The
			// expected Line values below (1, 3, 4, 8, 10) only fit a
			// ten-line input with blank lines between the paragraphs.
			// NOTE(review): the expectations do not pin down exactly where
			// the copyright line sits (any of lines 5-7 would fit); line 6
			// is assumed here - confirm against the upstream fixture.
			input: strings.Join([]string{
				"The AWESOME Project LICENSE",
				"",
				"Modifi-",
				"cations prohibited",
				"",
				"Copyright 1996-2002, 2006 by A. Developer",
				"",
				"Introduction",
				"",
				"The AWESOME Project",
			}, "\n"),
			// No tokens are expected for the copyright line, and the
			// hyphenated "Modifi-" / "cations" pair is expected to come back
			// as the single token "modifications" on line 3.
			output: &document{
				Tokens: []*token{
					{Text: "the", Index: 0, Line: 1},
					{Text: "awesome", Index: 1, Line: 1},
					{Text: "project", Index: 2, Line: 1},
					{Text: "license", Index: 3, Line: 1},
					{Text: "modifications", Index: 4, Line: 3},
					{Text: "prohibited", Index: 5, Line: 4},
					{Text: "introduction", Index: 6, Line: 8},
					{Text: "the", Index: 7, Line: 10},
					{Text: "awesome", Index: 8, Line: 10},
					{Text: "project", Index: 9, Line: 10},
				},
			},
		},
	}

	// The same option must be used for the comparison and for the report:
	// cmp panics on structs with unexported fields unless it is told to
	// ignore them, so a diff computed without the option could crash the
	// test on the failure path instead of describing the mismatch.
	ignoreUnexported := cmpopts.IgnoreUnexported(document{})

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			got := tokenize([]byte(test.input))
			// cmp.Diff returns "" exactly when cmp.Equal would report true
			// for the same values and options, so one call does both jobs.
			if diff := cmp.Diff(test.output, got, ignoreUnexported); diff != "" {
				t.Errorf("%s failed (-want +got):\n%s", test.name, diff)
			}
		})
	}
}
// TestTokenizer focuses on the textual content extracted by tokenize: the
// Text of every returned token is joined with single spaces and compared to
// the expected string. Index and line information is not inspected here.
func TestTokenizer(t *testing.T) {
	tests := []struct {
		name   string
		input  string
		output string
	}{
		{
			name:   "Basic Tokens",
			input:  "Here are some words. ",
			output: "here are some words",
		},
		{
			name:   "skips bullet headers",
			input:  "* item the first\n· item the second",
			output: "item the first item the second",
		},
		{
			name:   "preserves version numbers but not header numbers",
			input:  "sample rules\n1. Python 2.7.8 is a version of the language.",
			output: "sample rules python 2.7.8 is a version of the language",
		},
		{
			name:   "preserves version numbers across line breaks",
			input:  "Python version\n2.7.8 is a version of the language.",
			output: "python version 2.7.8 is a version of the language",
		},
		{
			// Despite the case name, the expected output contains no
			// punctuation: the commas and the trailing "!" are dropped.
			name:   "preserves punctuation",
			input:  "Bill, Larry, and Sergey agree precision is critical!",
			output: "bill larry and sergey agree precision is critical",
		},
		{
			name:   "ignores comment characters and bullet formatting",
			input:  "/* * item the first",
			output: "item the first",
		},
		{
			name:   "produces blank line as needed",
			input:  "/* *",
			output: "",
		},
		{
			name:   "clobbers header looking thing as appropriate",
			input:  " iv. this is a test",
			output: "this is a test",
		},
		{
			name:   "clobbers header looking thing as appropriate even in comment",
			input:  "/* 1.2.3. this is a test",
			output: "this is a test",
		},
		{
			name:   "preserve version number (not a header, but header-looking) not at beginning of sentence",
			input:  "This is version 1.1.",
			output: "this is version 1.1",
		},
		{
			name:   "copyright inside a comment",
			input:  " /* Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved",
			output: "",
		},
		{
			name: "FTL copyright text",
			input: `The FreeType Project LICENSE
2006-Jan-27
2006-01-27
Copyright 1996-2002, 2006 by David Turner, Robert Wilhelm, and Werner Lemberg
Introduction
The FreeType Project`,
			output: "the freetype project license introduction the freetype project",
		},
		{
			// The hyphen ending the first line is U+2010 (HYPHEN), not the
			// ASCII hyphen-minus.
			name: "Separated text",
			input: `distribution and modifi‐
cation follow.`,
			output: "distribution and modification follow",
		},
		{
			name:   "preserve internal references, even on line break",
			input:  "(ii) should be preserved as (ii) is preserved",
			output: "ii should be preserved as ii is preserved",
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			d := tokenize([]byte(test.input))

			texts := make([]string, 0, len(d.Tokens))
			for _, tok := range d.Tokens {
				texts = append(texts, tok.Text)
			}
			// TrimSpace keeps the comparison insensitive to leading or
			// trailing whitespace in the joined text.
			got := strings.TrimSpace(strings.Join(texts, " "))

			if got != test.output {
				// Report the input that was tokenized; the subtest name is
				// already part of the failure header printed by t.Run.
				t.Errorf("tokenize(%q): got %q want %q", test.input, got, test.output)
			}
		})
	}
}