v2/tokenizer_test.go - third_party/github.com/google/licenseclassifier - Git at Google

 // Copyright 2020 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package classifier

 import (
 	"strings"
 	"testing"

 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 )

 func TestCleanupToken(t *testing.T) {
 	tests := []struct {
 		input  string
 		output string
 	}{{
 		input:  "cleanup!",
 		output: "cleanup",
 	},
 		{
 			input:  "12345",
 			output: "12345",
 		},
 		{
 			input:  "r1@zx42-",
 			output: "rzx",
 		},
 		{
 			input:  "12345,",
 			output: "12345",
 		},
 		{
 			input:  "12345-6789",
 			output: "12345-6789",
 		},
 		{
 			input:  "1(a)",
 			output: "1",
 		},
 		{
 			input:  "1.2.3",
 			output: "1.2.3",
 		},
 	}
 	for _, test := range tests {
 		if got := cleanupToken(test.input); got != test.output {
 			t.Errorf("%q: got %q want %q", test.input, got, test.output)
 		}
 	}
 }

 func TestTokenize(t *testing.T) {
 	tests := []struct {
 		name   string
 		input  string
 		output *document
 	}{
 		{
 			name: "basic scenario",
 			input: `The AWESOME Project LICENSE

 Modifi-
 cations prohibited

 Copyright 1996-2002, 2006 by A. Developer

 Introduction

 The AWESOME Project`,
 			output: &document{
 				Tokens: []*token{
 					{
 						Text:  "the",
 						Index: 0,
 						Line:  1,
 					},
 					{
 						Text:  "awesome",
 						Index: 1,
 						Line:  1,
 					},
 					{
 						Text:  "project",
 						Index: 2,
 						Line:  1,
 					},
 					{
 						Text:  "license",
 						Index: 3,
 						Line:  1,
 					},
 					{
 						Text:  "modifications",
 						Index: 4,
 						Line:  3,
 					},
 					{
 						Text:  "prohibited",
 						Index: 5,
 						Line:  4,
 					},
 					{
 						Text:  "introduction",
 						Index: 6,
 						Line:  8,
 					},
 					{
 						Text:  "the",
 						Index: 7,
 						Line:  10,
 					},
 					{
 						Text:  "awesome",
 						Index: 8,
 						Line:  10,
 					},
 					{
 						Text:  "project",
 						Index: 9,
 						Line:  10,
 					},
 				},
 			},
 		},
 	}
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			d := tokenize([]byte(test.input))
 			if !cmp.Equal(d, test.output, cmpopts.IgnoreUnexported(document{})) {
 				t.Errorf("%s failed: %s", test.name, cmp.Diff(d, test.output))
 			}
 		})
 	}
 }

 func TestTokenizer(t *testing.T) {
 	// This test focuses primarily on the textual content extracted and does not look
 	// at the other parts of the document.
 	tests := []struct {
 		name   string
 		input  string
 		output string
 	}{
 		{
 			name:   "Basic Tokens",
 			input:  "Here are some words. ",
 			output: "here are some words",
 		},
 		{
 			name:   "skips bullet headers",
 			input:  "* item the first\n· item the second",
 			output: "item the first item the second",
 		},
 		{
 			name:   "preserves version numbers but not header numbers",
 			input:  "sample rules\n1. Python 2.7.8 is a version of the language.",
 			output: "sample rules python 2.7.8 is a version of the language",
 		},
 		{
 			name:   "preserves version numbers across line breaks",
 			input:  "Python version\n2.7.8 is a version of the language.",
 			output: "python version 2.7.8 is a version of the language",
 		},
 		{
 			name:   "preserves punctuation",
 			input:  "Bill, Larry, and Sergey agree precision is critical!",
 			output: "bill larry and sergey agree precision is critical",
 		},
 		{
 			name:   "ignores comment characters and bullet formatting",
 			input:  "/* * item the first",
 			output: "item the first",
 		},
 		{
 			name:   "produces blank line as needed",
 			input:  "/* *",
 			output: "",
 		},
 		{
 			name:   "clobbers header looking thing as appropriate",
 			input:  " iv. this is a test",
 			output: "this is a test",
 		},
 		{
 			name:   "clobbers header looking thing as appropriate even in comment",
 			input:  "/* 1.2.3. this is a test",
 			output: "this is a test",
 		},
 		{
 			name:   "preserve version number (not a header, but header-looking) not at beginning of sentence",
 			input:  "This is version 1.1.",
 			output: "this is version 1.1",
 		},
 		{
 			name:   "copyright inside a comment",
 			input:  " /* Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved",
 			output: "",
 		},
 		{
 			name: "FTL copyright text",
 			input: `The FreeType Project LICENSE

 2006-Jan-27
 2006-01-27

 Copyright 1996-2002, 2006 by David Turner, Robert Wilhelm, and Werner Lemberg

 Introduction

 The FreeType Project`,
 			output: "the freetype project license introduction the freetype project",
 		},
 		{
 			name: "Separated text",
 			input: `distribution and modifi‐
 				       cation follow.`,
 			output: "distribution and modification follow",
 		},
 		{
 			name:   "preserve internal references, even on line break",
 			input:  "(ii) should be preserved as (ii) is preserved",
 			output: "ii should be preserved as ii is preserved",
 		},
 	}

 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
 			d := tokenize([]byte(test.input))
 			var b strings.Builder
 			for _, tok := range d.Tokens {
 				b.WriteString(tok.Text)
 				b.WriteString(" ")
 			}
 			actual := strings.TrimSpace(b.String())
 			if actual != test.output {
 				t.Errorf("Tokenize(%q): got %q want %q", test.name, actual, test.output)
 			}
 		})
 	}
 }
	// Copyright 2020 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package classifier

	import (
	"strings"
	"testing"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	)

	func TestCleanupToken(t *testing.T) {
	tests := []struct {
	input string
	output string
	}{{
	input: "cleanup!",
	output: "cleanup",
	},
	{
	input: "12345",
	output: "12345",
	},
	{
	input: "r1@zx42-",
	output: "rzx",
	},
	{
	input: "12345,",
	output: "12345",
	},
	{
	input: "12345-6789",
	output: "12345-6789",
	},
	{
	input: "1(a)",
	output: "1",
	},
	{
	input: "1.2.3",
	output: "1.2.3",
	},
	}
	for _, test := range tests {
	if got := cleanupToken(test.input); got != test.output {
	t.Errorf("%q: got %q want %q", test.input, got, test.output)
	}
	}
	}

	func TestTokenize(t *testing.T) {
	tests := []struct {
	name string
	input string
	output *document
	}{
	{
	name: "basic scenario",
	input: `The AWESOME Project LICENSE

	Modifi-
	cations prohibited

	Copyright 1996-2002, 2006 by A. Developer

	Introduction

	The AWESOME Project`,
	output: &document{
	Tokens: []*token{
	{
	Text: "the",
	Index: 0,
	Line: 1,
	},
	{
	Text: "awesome",
	Index: 1,
	Line: 1,
	},
	{
	Text: "project",
	Index: 2,
	Line: 1,
	},
	{
	Text: "license",
	Index: 3,
	Line: 1,
	},
	{
	Text: "modifications",
	Index: 4,
	Line: 3,
	},
	{
	Text: "prohibited",
	Index: 5,
	Line: 4,
	},
	{
	Text: "introduction",
	Index: 6,
	Line: 8,
	},
	{
	Text: "the",
	Index: 7,
	Line: 10,
	},
	{
	Text: "awesome",
	Index: 8,
	Line: 10,
	},
	{
	Text: "project",
	Index: 9,
	Line: 10,
	},
	},
	},
	},
	}
	for _, test := range tests {
	t.Run(test.name, func(t *testing.T) {
	d := tokenize([]byte(test.input))
	if !cmp.Equal(d, test.output, cmpopts.IgnoreUnexported(document{})) {
	t.Errorf("%s failed: %s", test.name, cmp.Diff(d, test.output))
	}
	})
	}
	}

	func TestTokenizer(t *testing.T) {
	// This test focuses primarily on the textual content extracted and does not look
	// at the other parts of the document.
	tests := []struct {
	name string
	input string
	output string
	}{
	{
	name: "Basic Tokens",
	input: "Here are some words. ",
	output: "here are some words",
	},
	{
	name: "skips bullet headers",
	input: "* item the first\n· item the second",
	output: "item the first item the second",
	},
	{
	name: "preserves version numbers but not header numbers",
	input: "sample rules\n1. Python 2.7.8 is a version of the language.",
	output: "sample rules python 2.7.8 is a version of the language",
	},
	{
	name: "preserves version numbers across line breaks",
	input: "Python version\n2.7.8 is a version of the language.",
	output: "python version 2.7.8 is a version of the language",
	},
	{
	name: "preserves punctuation",
	input: "Bill, Larry, and Sergey agree precision is critical!",
	output: "bill larry and sergey agree precision is critical",
	},
	{
	name: "ignores comment characters and bullet formatting",
	input: "/* * item the first",
	output: "item the first",
	},
	{
	name: "produces blank line as needed",
	input: "/* *",
	output: "",
	},
	{
	name: "clobbers header looking thing as appropriate",
	input: " iv. this is a test",
	output: "this is a test",
	},
	{
	name: "clobbers header looking thing as appropriate even in comment",
	input: "/* 1.2.3. this is a test",
	output: "this is a test",
	},
	{
	name: "preserve version number (not a header, but header-looking) not at beginning of sentence",
	input: "This is version 1.1.",
	output: "this is version 1.1",
	},
	{
	name: "copyright inside a comment",
	input: " /* Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved",
	output: "",
	},
	{
	name: "FTL copyright text",
	input: `The FreeType Project LICENSE

	2006-Jan-27
	2006-01-27

	Copyright 1996-2002, 2006 by David Turner, Robert Wilhelm, and Werner Lemberg

	Introduction

	The FreeType Project`,
	output: "the freetype project license introduction the freetype project",
	},
	{
	name: "Separated text",
	input: `distribution and modifi‐
	cation follow.`,
	output: "distribution and modification follow",
	},
	{
	name: "preserve internal references, even on line break",
	input: "(ii) should be preserved as (ii) is preserved",
	output: "ii should be preserved as ii is preserved",
	},
	}

	for _, test := range tests {
	t.Run(test.name, func(t *testing.T) {
	d := tokenize([]byte(test.input))
	var b strings.Builder
	for _, tok := range d.Tokens {
	b.WriteString(tok.Text)
	b.WriteString(" ")
	}
	actual := strings.TrimSpace(b.String())
	if actual != test.output {
	t.Errorf("Tokenize(%q): got %q want %q", test.name, actual, test.output)
	}
	})
	}
	}