| // Copyright 2017 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // Package licenseclassifier provides methods to identify the open source |
| // license that most closely matches an unknown license. |
| package licenseclassifier |
| |
| import ( |
| "archive/tar" |
| "bytes" |
| "compress/gzip" |
| "fmt" |
| "html" |
| "io" |
| "math" |
| "regexp" |
| "sort" |
| "strings" |
| "sync" |
| "unicode" |
| |
| "github.com/google/licenseclassifier/stringclassifier" |
| "github.com/google/licenseclassifier/stringclassifier/searchset" |
| ) |
| |
// DefaultConfidenceThreshold is the minimum confidence we are willing to
// accept in order to say that a match is good. It is expressed as a fraction,
// not a percentage: 0.80 means 80%.
const DefaultConfidenceThreshold = 0.80
| |
| var ( |
| // Normalizers is a list of functions that get applied to the strings |
| // before they are registered with the string classifier. |
| Normalizers = []stringclassifier.NormalizeFunc{ |
| html.UnescapeString, |
| removeShebangLine, |
| RemoveNonWords, |
| NormalizeEquivalentWords, |
| NormalizePunctuation, |
| strings.ToLower, |
| removeIgnorableTexts, |
| stringclassifier.FlattenWhitespace, |
| strings.TrimSpace, |
| } |
| |
| // commonLicenseWords are words that are common to all known licenses. |
| // If an unknown text doesn't have at least one of these, then we can |
| // ignore it. |
| commonLicenseWords = []*regexp.Regexp{ |
| regexp.MustCompile(`(?i)\bcode\b`), |
| regexp.MustCompile(`(?i)\blicense\b`), |
| regexp.MustCompile(`(?i)\boriginal\b`), |
| regexp.MustCompile(`(?i)\brights\b`), |
| regexp.MustCompile(`(?i)\bsoftware\b`), |
| regexp.MustCompile(`(?i)\bterms\b`), |
| regexp.MustCompile(`(?i)\bversion\b`), |
| regexp.MustCompile(`(?i)\bwork\b`), |
| } |
| ) |
| |
| // License is a classifier pre-loaded with known open source licenses. |
| type License struct { |
| c *stringclassifier.Classifier |
| |
| // Threshold is the lowest confidence percentage acceptable for the |
| // classifier. |
| Threshold float64 |
| |
| // archive is a function that must return the contents of the license archive. |
| // When archive is nil, ReadLicenseFile(LicenseFile) is used to retrieve the |
| // contents. |
| archive func() ([]byte, error) |
| } |
| |
| // OptionFunc set options on a License struct. |
| type OptionFunc func(l *License) error |
| |
| // Archive is an OptionFunc to specify the location of the license archive file. |
| func Archive(f string) OptionFunc { |
| return func(l *License) error { |
| l.archive = func() ([]byte, error) { return ReadLicenseFile(f) } |
| return nil |
| } |
| } |
| |
| // ArchiveBytes is an OptionFunc that provides the contents of the license archive file. |
| // The caller must not overwrite the contents of b as it is not copied. |
| func ArchiveBytes(b []byte) OptionFunc { |
| return func(l *License) error { |
| l.archive = func() ([]byte, error) { return b, nil } |
| return nil |
| } |
| } |
| |
| // ArchiveFunc is an OptionFunc that provides a function that must return the contents |
| // of the license archive file. |
| func ArchiveFunc(f func() ([]byte, error)) OptionFunc { |
| return func(l *License) error { |
| l.archive = f |
| return nil |
| } |
| } |
| |
| // New creates a license classifier and pre-loads it with known open source licenses. |
| func New(threshold float64, options ...OptionFunc) (*License, error) { |
| classifier := &License{ |
| c: stringclassifier.New(threshold, Normalizers...), |
| Threshold: threshold, |
| } |
| |
| for _, o := range options { |
| err := o(classifier) |
| if err != nil { |
| return nil, fmt.Errorf("error setting option %v: %v", o, err) |
| } |
| } |
| |
| if err := classifier.registerLicenses(); err != nil { |
| return nil, fmt.Errorf("cannot register licenses from archive: %v", err) |
| } |
| return classifier, nil |
| } |
| |
| // NewWithForbiddenLicenses creates a license classifier and pre-loads it with |
| // known open source licenses which are forbidden. |
| func NewWithForbiddenLicenses(threshold float64, options ...OptionFunc) (*License, error) { |
| opts := []OptionFunc{Archive(ForbiddenLicenseArchive)} |
| opts = append(opts, options...) |
| return New(threshold, opts...) |
| } |
| |
| // WithinConfidenceThreshold returns true if the confidence value is above or |
| // equal to the confidence threshold. |
| func (c *License) WithinConfidenceThreshold(conf float64) bool { |
| return conf > c.Threshold || math.Abs(conf-c.Threshold) < math.SmallestNonzeroFloat64 |
| } |
| |
| // NearestMatch returns the "nearest" match to the given set of known licenses. |
| // Returned are the name of the license, and a confidence percentage indicating |
| // how confident the classifier is in the result. |
| func (c *License) NearestMatch(contents string) *stringclassifier.Match { |
| if !c.hasCommonLicenseWords(contents) { |
| return nil |
| } |
| m := c.c.NearestMatch(contents) |
| m.Name = strings.TrimSuffix(m.Name, ".header") |
| return m |
| } |
| |
| // MultipleMatch matches all licenses within an unknown text. |
| func (c *License) MultipleMatch(contents string, includeHeaders bool) stringclassifier.Matches { |
| norm := normalizeText(contents) |
| if !c.hasCommonLicenseWords(norm) { |
| return nil |
| } |
| |
| m := make(map[stringclassifier.Match]bool) |
| var matches stringclassifier.Matches |
| for _, v := range c.c.MultipleMatch(norm) { |
| if !c.WithinConfidenceThreshold(v.Confidence) { |
| continue |
| } |
| |
| if !includeHeaders && strings.HasSuffix(v.Name, ".header") { |
| continue |
| } |
| |
| v.Name = strings.TrimSuffix(v.Name, ".header") |
| if re, ok := forbiddenRegexps[v.Name]; ok && !re.MatchString(norm) { |
| continue |
| } |
| if _, ok := m[*v]; !ok { |
| m[*v] = true |
| matches = append(matches, v) |
| } |
| } |
| sort.Sort(matches) |
| return matches |
| } |
| |
| func normalizeText(s string) string { |
| for _, n := range Normalizers { |
| s = n(s) |
| } |
| return s |
| } |
| |
| // hasCommonLicenseWords returns true if the unknown text has at least one word |
| // that's common to all licenses. |
| func (c *License) hasCommonLicenseWords(s string) bool { |
| for _, re := range commonLicenseWords { |
| if re.MatchString(s) { |
| return true |
| } |
| } |
| return false |
| } |
| |
| type archivedValue struct { |
| name string |
| normalized string |
| set *searchset.SearchSet |
| } |
| |
| // registerLicenses loads all known licenses and adds them to c as known values |
| // for comparison. The allocated space after ingesting the 'licenses.db' |
| // archive is ~167M. |
| func (c *License) registerLicenses() error { |
| var contents []byte |
| var err error |
| if c.archive == nil { |
| contents, err = ReadLicenseFile(LicenseArchive) |
| } else { |
| contents, err = c.archive() |
| } |
| if err != nil { |
| return err |
| } |
| |
| reader := bytes.NewReader(contents) |
| gr, err := gzip.NewReader(reader) |
| if err != nil { |
| return err |
| } |
| defer gr.Close() |
| |
| tr := tar.NewReader(gr) |
| |
| var muVals sync.Mutex |
| var vals []archivedValue |
| for i := 0; ; i++ { |
| hdr, err := tr.Next() |
| if err == io.EOF { |
| break |
| } |
| if err != nil { |
| return err |
| } |
| |
| name := strings.TrimSuffix(hdr.Name, ".txt") |
| |
| // Read normalized value. |
| var b bytes.Buffer |
| if _, err := io.Copy(&b, tr); err != nil { |
| return err |
| } |
| normalized := b.String() |
| b.Reset() |
| |
| // Read precomputed hashes. |
| hdr, err = tr.Next() |
| if err != nil { |
| return err |
| } |
| |
| if _, err := io.Copy(&b, tr); err != nil { |
| return err |
| } |
| |
| var set searchset.SearchSet |
| searchset.Deserialize(&b, &set) |
| |
| muVals.Lock() |
| vals = append(vals, archivedValue{name, normalized, &set}) |
| muVals.Unlock() |
| } |
| |
| for _, v := range vals { |
| if err = c.c.AddPrecomputedValue(v.name, v.normalized, v.set); err != nil { |
| return err |
| } |
| } |
| return nil |
| } |
| |
// endOfLicenseText lists markers commonly associated with the end of a
// license. Anything that occurs after such a marker can be removed.
var endOfLicenseText = []string{
	"END OF TERMS AND CONDITIONS",
}

// TrimExtraneousTrailingText removes the text that follows an obvious end of
// the license, since it is not part of the substantive license text.
//
// The markers in endOfLicenseText are tried in order. For the first marker
// found in s, everything after its last occurrence is cut off; the marker
// itself is kept. The comparison is case-sensitive. If no marker occurs, s is
// returned unchanged.
func TrimExtraneousTrailingText(s string) string {
	for i := range endOfLicenseText {
		marker := endOfLicenseText[i]
		pos := strings.LastIndex(s, marker)
		if pos < 0 {
			continue
		}
		return s[:pos+len(marker)]
	}
	return s
}
| |
// copyrightRE matches a copyright line of the form
// "Copyright [©|(c)] <year>[-<year>...] [by] <holder> [All rights reserved.]"
// and captures the holder as its only submatch.
var copyrightRE = regexp.MustCompile(`(?m)(?i:Copyright)\s+(?i:©\s+|\(c\)\s+)?(?:\d{2,4})(?:[-,]\s*\d{2,4})*,?\s*(?i:by)?\s*(.*?(?i:\s+Inc\.)?)[.,]?\s*(?i:All rights reserved\.?)?\s*$`)

// CopyrightHolder finds a copyright notification, if it exists, and returns
// the copyright holder. Only the first notification in contents is
// considered. The empty string is returned when there is none.
func CopyrightHolder(contents string) string {
	groups := copyrightRE.FindStringSubmatch(contents)
	if len(groups) != 2 {
		return ""
	}
	return groups[1]
}
| |
| var publicDomainRE = regexp.MustCompile("(?i)(this file )?is( in the)? public domain") |
| |
| // HasPublicDomainNotice performs a simple regex over the contents to see if a |
| // public domain notice is in there. As you can imagine, this isn't 100% |
| // definitive, but can be useful if a license match isn't found. |
| func (c *License) HasPublicDomainNotice(contents string) bool { |
| return publicDomainRE.FindString(contents) != "" |
| } |
| |
// ignorableTexts is a list of expressions describing lines at the start of the
// string that can be removed to get a cleaner match. Each expression is
// matched against a single line after its surrounding whitespace has been
// trimmed.
var ignorableTexts = []*regexp.Regexp{
	regexp.MustCompile(`(?i)^(?:the )?mit license(?: \(mit\))?$`),
	regexp.MustCompile(`(?i)^(?:new )?bsd license$`),
	regexp.MustCompile(`(?i)^copyright and permission notice$`),
	regexp.MustCompile(`(?i)^copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]? .*$`),
	regexp.MustCompile(`(?i)^(all|some) rights reserved\.?$`),
	regexp.MustCompile(`(?i)^@license$`),
	regexp.MustCompile(`^\s*$`),
}

// removeIgnorableTexts removes common text, which is not important for
// classification, that shows up before the body of the license.
//
// Trailing newlines are dropped from s, then leading lines are skipped for as
// long as they match one of ignorableTexts (blank lines included). The
// remaining lines are returned joined by "\n" and terminated by exactly one
// "\n". If every line is ignorable, the result is just "\n".
func removeIgnorableTexts(s string) string {
	lines := strings.Split(strings.TrimRight(s, "\n"), "\n")
	start := 0
	for start < len(lines) && matchesIgnorableText(lines[start]) {
		start++
	}
	// start never exceeds len(lines); when it equals it, the join is empty
	// and only the terminating newline is returned.
	return strings.Join(lines[start:], "\n") + "\n"
}

// matchesIgnorableText reports whether line, with surrounding whitespace
// trimmed, matches any of the expressions in ignorableTexts.
func matchesIgnorableText(line string) bool {
	line = strings.TrimSpace(line)
	for _, re := range ignorableTexts {
		if re.MatchString(line) {
			return true
		}
	}
	return false
}
| |
// removeShebangLine removes the '#!...' line if it's the first line in the
// file. Note that if it's the only line in the file (there is no newline at
// all), it won't be removed and s is returned unchanged.
func removeShebangLine(s string) string {
	if !strings.HasPrefix(s, "#!") {
		return s
	}
	// Cut at the first newline instead of splitting and re-joining every
	// line of the input just to drop the first one.
	nl := strings.IndexByte(s, '\n')
	if nl < 0 {
		return s
	}
	return s[nl+1:]
}
| |
// isDecorative returns true if the line is made up purely of non-letter and
// non-digit characters. An empty string is considered decorative.
func isDecorative(s string) bool {
	isAlphanumeric := func(r rune) bool {
		return unicode.IsLetter(r) || unicode.IsDigit(r)
	}
	return strings.IndexFunc(s, isAlphanumeric) == -1
}
| |
// nonWords matches a run of one or more ASCII punctuation characters.
var nonWords = regexp.MustCompile("[[:punct:]]+")

// RemoveNonWords removes non-words from the string: every run of ASCII
// punctuation is replaced by a single space. Other characters, including
// whitespace and non-ASCII symbols, are left untouched.
func RemoveNonWords(s string) string {
	const separator = " "
	return nonWords.ReplaceAllLiteralString(s, separator)
}
| |
// interchangeablePunctuation is punctuation that can be normalized. The rules
// are applied in the order listed, each one to the output of the previous.
var interchangeablePunctuation = []struct {
	interchangeable *regexp.Regexp
	substitute      string
}{
	// Hyphen, Dash, En Dash, and Em Dash.
	{regexp.MustCompile(`[-‒–—]`), "-"},
	// Single, Double, Curly Single, and Curly Double.
	{regexp.MustCompile("['\"`‘’“”]"), "'"},
	// Copyright.
	{regexp.MustCompile("©"), "(c)"},
	// Hyphen-separated words.
	{regexp.MustCompile(`(\S)-\s+(\S)`), "${1}-${2}"},
	// Currency and Section. (Different copies of the CDDL use each marker.)
	{regexp.MustCompile("[§¤]"), "(s)"},
	// Middle Dot
	{regexp.MustCompile("·"), "*"},
}

// NormalizePunctuation rewrites interchangeable punctuation in s to one
// canonical form: dashes become "-", quotes become "'", the copyright sign
// becomes "(c)", a word hyphenated across whitespace is joined back together,
// section and currency signs become "(s)", and the middle dot becomes "*".
func NormalizePunctuation(s string) string {
	for i := range interchangeablePunctuation {
		rule := &interchangeablePunctuation[i]
		s = rule.interchangeable.ReplaceAllString(s, rule.substitute)
	}
	return s
}
| |
// interchangeableWords are words we can substitute for a normalized form
// without changing the meaning of the license. See
// https://spdx.org/spdx-license-list/matching-guidelines for the list.
//
// The expressions are case-insensitive and carry no word boundaries, so they
// also rewrite the listed spelling inside longer words. The rules are applied
// in the order listed, each one to the output of the previous.
//
// NOTE(review): "Fulfil" and "Wilful" also match inside their own
// replacements ("Fulfill", "Wilfull"), so text that already uses the
// substitute spelling gains an extra "l". Changing that would alter the
// normalized form of existing texts, so it is left as is here.
var interchangeableWords = []struct {
	interchangeable *regexp.Regexp
	substitute      string
}{
	{regexp.MustCompile("(?i)Acknowledgment"), "Acknowledgement"},
	{regexp.MustCompile("(?i)Analogue"), "Analog"},
	{regexp.MustCompile("(?i)Analyse"), "Analyze"},
	{regexp.MustCompile("(?i)Artefact"), "Artifact"},
	{regexp.MustCompile("(?i)Authorisation"), "Authorization"},
	{regexp.MustCompile("(?i)Authorised"), "Authorized"},
	{regexp.MustCompile("(?i)Calibre"), "Caliber"},
	{regexp.MustCompile("(?i)Cancelled"), "Canceled"},
	{regexp.MustCompile("(?i)Capitalisations"), "Capitalizations"},
	{regexp.MustCompile("(?i)Catalogue"), "Catalog"},
	{regexp.MustCompile("(?i)Categorise"), "Categorize"},
	{regexp.MustCompile("(?i)Centre"), "Center"},
	{regexp.MustCompile("(?i)Emphasised"), "Emphasized"},
	{regexp.MustCompile("(?i)Favour"), "Favor"},
	{regexp.MustCompile("(?i)Favourite"), "Favorite"},
	{regexp.MustCompile("(?i)Fulfil"), "Fulfill"},
	{regexp.MustCompile("(?i)Fulfilment"), "Fulfillment"},
	{regexp.MustCompile("(?i)Initialise"), "Initialize"},
	{regexp.MustCompile("(?i)Judgment"), "Judgement"},
	{regexp.MustCompile("(?i)Labelling"), "Labeling"},
	{regexp.MustCompile("(?i)Labour"), "Labor"},
	{regexp.MustCompile("(?i)Licence"), "License"},
	{regexp.MustCompile("(?i)Maximise"), "Maximize"},
	{regexp.MustCompile("(?i)Modelled"), "Modeled"},
	{regexp.MustCompile("(?i)Modelling"), "Modeling"},
	{regexp.MustCompile("(?i)Offence"), "Offense"},
	{regexp.MustCompile("(?i)Optimise"), "Optimize"},
	{regexp.MustCompile("(?i)Organisation"), "Organization"},
	{regexp.MustCompile("(?i)Organise"), "Organize"},
	{regexp.MustCompile("(?i)Practise"), "Practice"},
	{regexp.MustCompile("(?i)Programme"), "Program"},
	{regexp.MustCompile("(?i)Realise"), "Realize"},
	{regexp.MustCompile("(?i)Recognise"), "Recognize"},
	{regexp.MustCompile("(?i)Signalling"), "Signaling"},
	{regexp.MustCompile("(?i)Sub[- ]license"), "Sublicense"},
	{regexp.MustCompile("(?i)Utilisation"), "Utilization"},
	{regexp.MustCompile("(?i)Whilst"), "While"},
	{regexp.MustCompile("(?i)Wilful"), "Wilfull"},
	{regexp.MustCompile("(?i)Non-commercial"), "Noncommercial"},
	{regexp.MustCompile("(?i)Per cent"), "Percent"},
}

// NormalizeEquivalentWords normalizes equivalent words that are
// interchangeable: every occurrence of a spelling listed in
// interchangeableWords is replaced by its substitute. The substitute is
// inserted with the capitalization given in the table, regardless of the case
// of the matched text.
func NormalizeEquivalentWords(s string) string {
	for i := range interchangeableWords {
		rule := &interchangeableWords[i]
		s = rule.interchangeable.ReplaceAllString(s, rule.substitute)
	}
	return s
}