blob: ff41cf479ee65882151578da12b25bb8e80c9080 [file] [log] [blame]
// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package licenseclassifier provides methods to identify the open source
// license that most closely matches an unknown license.
package licenseclassifier
import (
"archive/tar"
"bytes"
"compress/gzip"
"fmt"
"html"
"io"
"math"
"regexp"
"sort"
"strings"
"sync"
"unicode"
"github.com/google/licenseclassifier/stringclassifier"
"github.com/google/licenseclassifier/stringclassifier/searchset"
)
// DefaultConfidenceThreshold is the minimum confidence we're willing to accept
// in order to say that a match is good. The value is written as a fraction
// (0.80, i.e. 80%), not as a number out of 100.
const DefaultConfidenceThreshold = 0.80
var (
// Normalizers is a list of functions that get applied to the strings
// before they are registered with the string classifier.
//
// The functions are applied in slice order (see normalizeText), so each one
// receives the output of the previous one; reordering entries may change
// the normalized result.
Normalizers = []stringclassifier.NormalizeFunc{
html.UnescapeString,
removeShebangLine,
RemoveNonWords,
NormalizeEquivalentWords,
NormalizePunctuation,
strings.ToLower,
removeIgnorableTexts,
stringclassifier.FlattenWhitespace,
strings.TrimSpace,
}
// commonLicenseWords are words that are common to all known licenses.
// If an unknown text doesn't have at least one of these, then we can
// ignore it.
//
// Every pattern is case-insensitive and only matches the word when it is
// delimited by word boundaries.
commonLicenseWords = []*regexp.Regexp{
regexp.MustCompile(`(?i)\bcode\b`),
regexp.MustCompile(`(?i)\blicense\b`),
regexp.MustCompile(`(?i)\boriginal\b`),
regexp.MustCompile(`(?i)\brights\b`),
regexp.MustCompile(`(?i)\bsoftware\b`),
regexp.MustCompile(`(?i)\bterms\b`),
regexp.MustCompile(`(?i)\bversion\b`),
regexp.MustCompile(`(?i)\bwork\b`),
}
)
// License is a classifier pre-loaded with known open source licenses.
type License struct {
// c is the underlying string classifier. The known licenses are registered
// with it, and NearestMatch and MultipleMatch delegate to it.
c *stringclassifier.Classifier
// Threshold is the lowest confidence percentage acceptable for the
// classifier.
Threshold float64
// archive is a function that must return the contents of the license archive.
// When archive is nil, ReadLicenseFile(LicenseArchive) is used to retrieve the
// contents.
archive func() ([]byte, error)
}
// OptionFunc sets an option on a License struct. New calls every OptionFunc it
// is given, in order, and stops at the first one that returns a non-nil error.
type OptionFunc func(l *License) error
// Archive returns an OptionFunc that points the classifier at the license
// archive file named f. Applying the option only records how to load the
// archive; the file is read through ReadLicenseFile when the archive contents
// are requested.
func Archive(f string) OptionFunc {
	load := func() ([]byte, error) {
		return ReadLicenseFile(f)
	}
	return func(l *License) error {
		l.archive = load
		return nil
	}
}
// ArchiveBytes returns an OptionFunc that supplies the contents of the license
// archive file directly. The slice b is handed back as-is and is not copied,
// so the caller must not overwrite its contents.
func ArchiveBytes(b []byte) OptionFunc {
	contents := func() ([]byte, error) {
		return b, nil
	}
	return func(l *License) error {
		l.archive = contents
		return nil
	}
}
// ArchiveFunc returns an OptionFunc that installs f as the provider of the
// license archive contents. f must return the contents of the license archive
// file; it is stored as given and not invoked when the option is applied.
func ArchiveFunc(f func() ([]byte, error)) OptionFunc {
	var install OptionFunc = func(l *License) error {
		l.archive = f
		return nil
	}
	return install
}
// New creates a license classifier and pre-loads it with known open source
// licenses.
//
// threshold is stored as the classifier's Threshold and is also passed to the
// underlying string classifier together with Normalizers. The options are
// applied in order before the license archive is read; nil options are
// skipped. An error is returned if an option fails or if the licenses cannot
// be registered from the archive.
func New(threshold float64, options ...OptionFunc) (*License, error) {
	classifier := &License{
		c:         stringclassifier.New(threshold, Normalizers...),
		Threshold: threshold,
	}

	for i, o := range options {
		if o == nil {
			continue
		}
		if err := o(classifier); err != nil {
			// Identify the option by position: formatting the func value
			// itself would only print an address.
			return nil, fmt.Errorf("error setting option %d: %v", i, err)
		}
	}

	if err := classifier.registerLicenses(); err != nil {
		return nil, fmt.Errorf("cannot register licenses from archive: %v", err)
	}
	return classifier, nil
}
// NewWithForbiddenLicenses creates a license classifier and pre-loads it with
// known open source licenses which are forbidden.
//
// The forbidden-license archive option is placed ahead of the caller's
// options, so a caller-supplied archive option is applied after it.
func NewWithForbiddenLicenses(threshold float64, options ...OptionFunc) (*License, error) {
	all := make([]OptionFunc, 0, len(options)+1)
	all = append(all, Archive(ForbiddenLicenseArchive))
	all = append(all, options...)
	return New(threshold, all...)
}
// WithinConfidenceThreshold reports whether conf is above or equal to the
// classifier's confidence threshold.
//
// NOTE(review): the equality tolerance is math.SmallestNonzeroFloat64, the
// smallest positive float64, so the second test appears to behave like an
// exact equality check. Confirm whether a larger epsilon was intended.
func (c *License) WithinConfidenceThreshold(conf float64) bool {
	if conf > c.Threshold {
		return true
	}
	delta := math.Abs(conf - c.Threshold)
	return delta < math.SmallestNonzeroFloat64
}
// NearestMatch returns the "nearest" match to the given set of known licenses.
// The result carries the name of the license and a confidence value
// indicating how confident the classifier is in the result. A trailing
// ".header" is stripped from the reported name.
//
// nil is returned when contents has none of the common license words.
func (c *License) NearestMatch(contents string) *stringclassifier.Match {
	if c.hasCommonLicenseWords(contents) {
		nearest := c.c.NearestMatch(contents)
		nearest.Name = strings.TrimSuffix(nearest.Name, ".header")
		return nearest
	}
	return nil
}
// MultipleMatch matches all licenses within an unknown text.
//
// contents is normalized with normalizeText first. nil is returned when the
// normalized text has none of the common license words. Otherwise every
// candidate from the underlying classifier is kept only if it meets the
// confidence threshold; matches whose name ends in ".header" are dropped
// unless includeHeaders is true, and the suffix is stripped from the names
// that remain. Duplicates are removed and the result is sorted with sort.Sort.
func (c *License) MultipleMatch(contents string, includeHeaders bool) stringclassifier.Matches {
	norm := normalizeText(contents)
	if !c.hasCommonLicenseWords(norm) {
		return nil
	}

	seen := make(map[stringclassifier.Match]bool)
	var result stringclassifier.Matches
	for _, candidate := range c.c.MultipleMatch(norm) {
		if !c.WithinConfidenceThreshold(candidate.Confidence) {
			continue
		}
		isHeader := strings.HasSuffix(candidate.Name, ".header")
		if isHeader && !includeHeaders {
			continue
		}
		candidate.Name = strings.TrimSuffix(candidate.Name, ".header")

		// A license that has an entry in forbiddenRegexps only counts when
		// that regexp also matches the normalized text.
		re, restricted := forbiddenRegexps[candidate.Name]
		if restricted && !re.MatchString(norm) {
			continue
		}

		// Deduplicate on the full match value (after the name was trimmed).
		if seen[*candidate] {
			continue
		}
		seen[*candidate] = true
		result = append(result, candidate)
	}

	sort.Sort(result)
	return result
}
// normalizeText runs s through every function in Normalizers, in slice order,
// feeding each function the output of the previous one, and returns the final
// result.
func normalizeText(s string) string {
	out := s
	for i := 0; i < len(Normalizers); i++ {
		out = Normalizers[i](out)
	}
	return out
}
// hasCommonLicenseWords reports whether s matches at least one of the
// commonLicenseWords patterns, i.e. whether the unknown text contains a word
// that's common to all licenses. Scanning stops at the first pattern that
// matches.
func (c *License) hasCommonLicenseWords(s string) bool {
	found := false
	for i := 0; !found && i < len(commonLicenseWords); i++ {
		found = commonLicenseWords[i].MatchString(s)
	}
	return found
}
// archivedValue is one license entry read from the license archive by
// registerLicenses. It is built there with a positional composite literal, so
// the field order below must not change.
type archivedValue struct {
// name is the archive entry name with a trailing ".txt" removed.
name string
// normalized is the contents of the entry's first archive member, which
// registerLicenses treats as the already-normalized license text.
normalized string
// set is the search set deserialized from the entry's second archive
// member (the precomputed hashes).
set *searchset.SearchSet
}
// registerLicenses loads all known licenses and adds them to c as known values
// for comparison. The allocated space after ingesting the 'licenses.db'
// archive is ~167M.
//
// The archive contents come from c.archive when it is set, and from
// ReadLicenseFile(LicenseArchive) otherwise. The archive is read as a
// gzip-compressed tar stream in which every license occupies two consecutive
// members: the normalized text, followed by the serialized search set
// (precomputed hashes). An error is returned if the archive cannot be read or
// decoded, if a license is missing its second member, if a search set cannot
// be deserialized, or if the classifier rejects a value.
func (c *License) registerLicenses() error {
	var contents []byte
	var err error
	if c.archive == nil {
		contents, err = ReadLicenseFile(LicenseArchive)
	} else {
		contents, err = c.archive()
	}
	if err != nil {
		return err
	}

	gr, err := gzip.NewReader(bytes.NewReader(contents))
	if err != nil {
		return err
	}
	defer gr.Close()

	tr := tar.NewReader(gr)

	// NOTE(review): this loop is sequential, so the mutex looks unnecessary;
	// it is kept because it is the only use of the sync import in this file.
	var muVals sync.Mutex
	var vals []archivedValue
	for {
		hdr, err := tr.Next()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		name := strings.TrimSuffix(hdr.Name, ".txt")

		// Read normalized value.
		var b bytes.Buffer
		if _, err := io.Copy(&b, tr); err != nil {
			return err
		}
		normalized := b.String()
		b.Reset()

		// Read precomputed hashes. Running out of members here means the
		// archive is truncated, so report it as such rather than a bare EOF.
		if _, err := tr.Next(); err != nil {
			if err == io.EOF {
				return fmt.Errorf("license %q has no precomputed hashes: %v", name, io.ErrUnexpectedEOF)
			}
			return err
		}
		if _, err := io.Copy(&b, tr); err != nil {
			return err
		}

		var set searchset.SearchSet
		if err := searchset.Deserialize(&b, &set); err != nil {
			return fmt.Errorf("cannot deserialize search set for %q: %v", name, err)
		}

		muVals.Lock()
		vals = append(vals, archivedValue{name, normalized, &set})
		muVals.Unlock()
	}

	for _, v := range vals {
		if err := c.c.AddPrecomputedValue(v.name, v.normalized, v.set); err != nil {
			return err
		}
	}
	return nil
}
// endOfLicenseText lists markers that commonly appear at the end of a license.
// Anything following such a marker can be removed.
var endOfLicenseText = []string{
	"END OF TERMS AND CONDITIONS",
}

// TrimExtraneousTrailingText cuts s off right after the last occurrence of an
// end-of-license marker, dropping trailing text that is not a substantive part
// of the license. The marker itself is kept. The comparison is case-sensitive,
// and s is returned unchanged when no marker is present.
func TrimExtraneousTrailingText(s string) string {
	for idx := 0; idx < len(endOfLicenseText); idx++ {
		marker := endOfLicenseText[idx]
		pos := strings.LastIndex(s, marker)
		if pos < 0 {
			continue
		}
		return s[:pos+len(marker)]
	}
	return s
}
// copyrightRE matches a copyright line and captures the holder in its single
// capturing group.
var copyrightRE = regexp.MustCompile(`(?m)(?i:Copyright)\s+(?i:©\s+|\(c\)\s+)?(?:\d{2,4})(?:[-,]\s*\d{2,4})*,?\s*(?i:by)?\s*(.*?(?i:\s+Inc\.)?)[.,]?\s*(?i:All rights reserved\.?)?\s*$`)

// CopyrightHolder looks for a copyright notification in contents and returns
// the copyright holder from the first one found. The empty string is returned
// when there is no such notification.
func CopyrightHolder(contents string) string {
	holder := ""
	// Element 0 is the whole match, element 1 the captured holder.
	if groups := copyrightRE.FindStringSubmatch(contents); len(groups) == 2 {
		holder = groups[1]
	}
	return holder
}
// publicDomainRE matches, case-insensitively, phrases such as
// "this file is in the public domain" or "is public domain".
var publicDomainRE = regexp.MustCompile("(?i)(this file )?is( in the)? public domain")

// HasPublicDomainNotice runs a simple regex over contents and reports whether
// a public domain notice appears in it. This is not 100% definitive, but it
// can be useful when no license match is found.
func (c *License) HasPublicDomainNotice(contents string) bool {
	// The pattern cannot match the empty string, so "matches" is the same as
	// "found a non-empty match".
	return publicDomainRE.MatchString(contents)
}
// ignorableTexts is a list of lines at the start of the string we can remove
// to get a cleaner match. Each pattern is matched against a single line after
// its surrounding whitespace has been trimmed.
var ignorableTexts = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^(?:the )?mit license(?: \(mit\))?$`),
	regexp.MustCompile(`(?i)^(?:new )?bsd license$`),
	regexp.MustCompile(`(?i)^copyright and permission notice$`),
	regexp.MustCompile(`(?i)^copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]? .*$`),
	regexp.MustCompile(`(?i)^(all|some) rights reserved\.?$`),
	regexp.MustCompile(`(?i)^@license$`),
	regexp.MustCompile(`^\s*$`),
}

// removeIgnorableTexts removes common text, which is not important for
// classification, that shows up before the body of the license.
//
// Trailing newlines are stripped from s, then leading lines are dropped for as
// long as they match one of the ignorableTexts patterns. The remaining lines
// are returned unmodified, joined with "\n" and terminated by a single "\n".
// If every line is ignorable the result is "\n".
func removeIgnorableTexts(s string) string {
	lines := strings.Split(strings.TrimRight(s, "\n"), "\n")

	start := 0
scan:
	for ; start < len(lines); start++ {
		line := strings.TrimSpace(lines[start])
		for _, re := range ignorableTexts {
			if re.MatchString(line) {
				// Ignorable: move on to the next line.
				continue scan
			}
		}
		// First line that is part of the license body.
		break
	}

	// start never exceeds len(lines), so the slice expression is always
	// valid; an empty remainder joins to "" and yields "\n".
	return strings.Join(lines[start:], "\n") + "\n"
}
// removeShebangLine strips a leading '#!...' line from s when it is the first
// line of the text. If the shebang is the only line (s contains no newline),
// s is returned untouched.
func removeShebangLine(s string) string {
	if !strings.HasPrefix(s, "#!") {
		return s
	}
	eol := strings.Index(s, "\n")
	if eol < 0 {
		// Single-line input: leave it alone.
		return s
	}
	return s[eol+1:]
}
// isDecorative reports whether s contains no letters and no digits at all,
// i.e. the line is made up purely of other characters. The empty string is
// considered decorative.
func isDecorative(s string) bool {
	isWordRune := func(r rune) bool {
		return unicode.IsLetter(r) || unicode.IsDigit(r)
	}
	return strings.IndexFunc(s, isWordRune) == -1
}
var nonWords = regexp.MustCompile("[[:punct:]]+")
// RemoveNonWords removes non-words from the string.
func RemoveNonWords(s string) string {
return nonWords.ReplaceAllString(s, " ")
}
// interchangeablePunctuation is punctuation that can be normalized. The
// entries are applied from top to bottom, so later patterns operate on the
// output of earlier ones.
var interchangeablePunctuation = []struct {
	interchangeable *regexp.Regexp
	substitute      string
}{
	// Hyphen, Dash, En Dash, and Em Dash.
	{regexp.MustCompile(`[-‒–—]`), "-"},

	// Single, Double, Curly Single, and Curly Double.
	{regexp.MustCompile("['\"`‘’“”]"), "'"},

	// Copyright.
	{regexp.MustCompile("©"), "(c)"},

	// Hyphen-separated words.
	{regexp.MustCompile(`(\S)-\s+(\S)`), "${1}-${2}"},

	// Currency and Section. (Different copies of the CDDL use each marker.)
	{regexp.MustCompile("[§¤]"), "(s)"},

	// Middle Dot
	{regexp.MustCompile("·"), "*"},
}

// NormalizePunctuation rewrites the interchangeable punctuation in s (hyphens
// and dashes, quotes, and the other marks listed in
// interchangeablePunctuation) to a single canonical form each.
func NormalizePunctuation(s string) string {
	out := s
	for i := range interchangeablePunctuation {
		rule := &interchangeablePunctuation[i]
		out = rule.interchangeable.ReplaceAllString(out, rule.substitute)
	}
	return out
}
// interchangeableWords are words we can substitute for a normalized form
// without changing the meaning of the license. See
// https://spdx.org/spdx-license-list/matching-guidelines for the list.
//
// The patterns are case-insensitive substring matches and are applied from
// top to bottom, so a replacement must not be re-matched by its own pattern
// and both spellings of a word must end up with the same letters.
//
// NOTE(review): this table feeds the Normalizers pipeline, and the license
// archive stores already-normalized text. Archives produced before the
// "Fulfil"/"Wilful" corrections below presumably need to be regenerated;
// confirm against the archive build tooling.
var interchangeableWords = []struct {
	interchangeable *regexp.Regexp
	substitute      string
}{
	{regexp.MustCompile("(?i)Acknowledgment"), "Acknowledgement"},
	{regexp.MustCompile("(?i)Analogue"), "Analog"},
	{regexp.MustCompile("(?i)Analyse"), "Analyze"},
	{regexp.MustCompile("(?i)Artefact"), "Artifact"},
	{regexp.MustCompile("(?i)Authorisation"), "Authorization"},
	{regexp.MustCompile("(?i)Authorised"), "Authorized"},
	{regexp.MustCompile("(?i)Calibre"), "Caliber"},
	{regexp.MustCompile("(?i)Cancelled"), "Canceled"},
	{regexp.MustCompile("(?i)Capitalisations"), "Capitalizations"},
	{regexp.MustCompile("(?i)Catalogue"), "Catalog"},
	{regexp.MustCompile("(?i)Categorise"), "Categorize"},
	{regexp.MustCompile("(?i)Centre"), "Center"},
	{regexp.MustCompile("(?i)Emphasised"), "Emphasized"},
	{regexp.MustCompile("(?i)Favour"), "Favor"},
	{regexp.MustCompile("(?i)Favourite"), "Favorite"},
	// Match "fulfil" and "fulfill" alike. A plain "Fulfil" pattern would also
	// hit the already-normalized spelling and turn it into "Fulfilll", so the
	// two variants would never compare equal.
	{regexp.MustCompile("(?i)Fulfill?"), "Fulfill"},
	{regexp.MustCompile("(?i)Fulfilment"), "Fulfillment"},
	{regexp.MustCompile("(?i)Initialise"), "Initialize"},
	{regexp.MustCompile("(?i)Judgment"), "Judgement"},
	{regexp.MustCompile("(?i)Labelling"), "Labeling"},
	{regexp.MustCompile("(?i)Labour"), "Labor"},
	{regexp.MustCompile("(?i)Licence"), "License"},
	{regexp.MustCompile("(?i)Maximise"), "Maximize"},
	{regexp.MustCompile("(?i)Modelled"), "Modeled"},
	{regexp.MustCompile("(?i)Modelling"), "Modeling"},
	{regexp.MustCompile("(?i)Offence"), "Offense"},
	{regexp.MustCompile("(?i)Optimise"), "Optimize"},
	{regexp.MustCompile("(?i)Organisation"), "Organization"},
	{regexp.MustCompile("(?i)Organise"), "Organize"},
	{regexp.MustCompile("(?i)Practise"), "Practice"},
	{regexp.MustCompile("(?i)Programme"), "Program"},
	{regexp.MustCompile("(?i)Realise"), "Realize"},
	{regexp.MustCompile("(?i)Recognise"), "Recognize"},
	{regexp.MustCompile("(?i)Signalling"), "Signaling"},
	{regexp.MustCompile("(?i)Sub[- ]license"), "Sublicense"},
	{regexp.MustCompile("(?i)Utilisation"), "Utilization"},
	{regexp.MustCompile("(?i)Whilst"), "While"},
	// "Willful" is the equivalent spelling of "wilful"; the earlier
	// substitute "Wilfull" matched neither variant.
	{regexp.MustCompile("(?i)Wilful"), "Willful"},
	{regexp.MustCompile("(?i)Non-commercial"), "Noncommercial"},
	{regexp.MustCompile("(?i)Per cent"), "Percent"},
}

// NormalizeEquivalentWords normalizes equivalent words that are interchangeable.
//
// Every entry of interchangeableWords is applied to s in table order and the
// rewritten string is returned. Replacements are inserted with the
// capitalization given in the table, regardless of the case of the matched
// text.
func NormalizeEquivalentWords(s string) string {
	for _, iw := range interchangeableWords {
		s = iw.interchangeable.ReplaceAllString(s, iw.substitute)
	}
	return s
}