blob: a01c43dacd7901bf19878e6d4800041b5bb3a7ab [file] [log] [blame]
// Copyright 2020 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package classifier
import (
	"strings"
	"testing"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
)
func TestCleanupToken(t *testing.T) {
tests := []struct {
input string
output string
input: "cleanup!",
output: "cleanup",
input: "12345",
output: "12345",
input: "r1@zx42-",
output: "rzx",
input: "12345,",
output: "12345",
input: "12345-6789",
output: "12345-6789",
input: "1(a)",
output: "1",
input: "1.2.3",
output: "1.2.3",
for _, test := range tests {
if got := cleanupToken(test.input); got != test.output {
t.Errorf("%q: got %q want %q", test.input, got, test.output)
func TestTokenize(t *testing.T) {
tests := []struct {
name string
input string
output *document
name: "basic scenario",
input: `The AWESOME Project LICENSE
cations prohibited
Copyright 1996-2002, 2006 by A. Developer
The AWESOME Project`,
output: &document{
Tokens: []*token{
Text: "the",
Index: 0,
Line: 1,
Text: "awesome",
Index: 1,
Line: 1,
Text: "project",
Index: 2,
Line: 1,
Text: "license",
Index: 3,
Line: 1,
Text: "modifications",
Index: 4,
Line: 3,
Text: "prohibited",
Index: 5,
Line: 4,
Text: "introduction",
Index: 6,
Line: 8,
Text: "the",
Index: 7,
Line: 10,
Text: "awesome",
Index: 8,
Line: 10,
Text: "project",
Index: 9,
Line: 10,
for _, test := range tests {
t.Run(, func(t *testing.T) {
d := tokenize([]byte(test.input))
if !cmp.Equal(d, test.output, cmpopts.IgnoreUnexported(document{})) {
t.Errorf("%s failed: %s",, cmp.Diff(d, test.output))
func TestTokenizer(t *testing.T) {
// This test focuses primarily on the textual content extracted and does not look
// at the other parts of the document.
tests := []struct {
name string
input string
output string
name: "Basic Tokens",
input: "Here are some words. ",
output: "here are some words",
name: "skips bullet headers",
input: "* item the first\n· item the second",
output: "item the first item the second",
name: "preserves version numbers but not header numbers",
input: "sample rules\n1. Python 2.7.8 is a version of the language.",
output: "sample rules python 2.7.8 is a version of the language",
name: "preserves version numbers across line breaks",
input: "Python version\n2.7.8 is a version of the language.",
output: "python version 2.7.8 is a version of the language",
name: "preserves punctuation",
input: "Bill, Larry, and Sergey agree precision is critical!",
output: "bill larry and sergey agree precision is critical",
name: "ignores comment characters and bullet formatting",
input: "/* * item the first",
output: "item the first",
name: "produces blank line as needed",
input: "/* *",
output: "",
name: "clobbers header looking thing as appropriate",
input: " iv. this is a test",
output: "this is a test",
name: "clobbers header looking thing as appropriate even in comment",
input: "/* 1.2.3. this is a test",
output: "this is a test",
name: "preserve version number (not a header, but header-looking) not at beginning of sentence",
input: "This is version 1.1.",
output: "this is version 1.1",
name: "copyright inside a comment",
input: " /* Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved",
output: "",
name: "FTL copyright text",
input: `The FreeType Project LICENSE
Copyright 1996-2002, 2006 by David Turner, Robert Wilhelm, and Werner Lemberg
The FreeType Project`,
output: "the freetype project license introduction the freetype project",
name: "Separated text",
input: `distribution and modifi‐
cation follow.`,
output: "distribution and modification follow",
name: "preserve internal references, even on line break",
input: "(ii) should be preserved as (ii) is preserved",
output: "ii should be preserved as ii is preserved",
for _, test := range tests {
t.Run(, func(t *testing.T) {
d := tokenize([]byte(test.input))
var b strings.Builder
for _, tok := range d.Tokens {
b.WriteString(" ")
actual := strings.TrimSpace(b.String())
if actual != test.output {
t.Errorf("Tokenize(%q): got %q want %q",, actual, test.output)