| // Copyright 2020 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package classifier |
| |
| import ( |
| "html" |
| "regexp" |
| "strings" |
| "unicode" |
| "unicode/utf8" |
| ) |
| |
| // isSignificant looks for runes that are likely to be the part of English language content |
| // of interest in licenses. Notably, it skips over punctuation, looking only for letters |
| // or numbers that consistitute the tokens of most interest. |
| func isSignificant(r rune) bool { |
| return unicode.IsLetter(r) || unicode.IsDigit(r) |
| } |
| |
| var eol = "\n" |
| |
| func cleanupToken(in string) string { |
| r, _ := utf8.DecodeRuneInString(in) |
| var out strings.Builder |
| if !unicode.IsLetter(r) { |
| if unicode.IsDigit(r) { |
| // Based on analysis of the license corpus, the characters |
| // that are significant are numbers, periods, and dashes. Anything |
| // else can be safely discarded, and helps avoid matching failures |
| // due to inconsistent whitespacing and formatting. |
| for _, c := range in { |
| if unicode.IsDigit(c) || c == '.' || c == '-' { |
| out.WriteRune(c) |
| } |
| } |
| |
| // Numbers should not end in a . since that doesn't indicate a version |
| // number, but usually an end of a line. |
| res := out.String() |
| for strings.HasSuffix(res, ".") { |
| res = res[0 : len(res)-1] |
| } |
| return res |
| } |
| } |
| |
| // Remove internal hyphenization or URL constructs to better normalize |
| // strings for matching. |
| for _, c := range in { |
| if unicode.IsLetter(c) { |
| out.WriteRune(c) |
| } |
| } |
| return out.String() |
| } |
| |
| func normalizeDoc(in []byte, normWords bool) string { |
| // Apply the global transforms described in SPDX |
| |
| norm := string(in) |
| norm = html.UnescapeString(norm) |
| norm = normalizePunctuation(norm) |
| norm = removeIgnorableTexts(norm) |
| |
| if normWords { |
| norm = normalizeWords(norm) |
| } |
| return norm |
| } |
| |
| func tokenize(in []byte) *document { |
| // tokenize produces a document from the input content. |
| text := normalizeDoc(in, true) |
| return extractDoc(text, true) |
| } |
| |
| func extractDoc(text string, removeEol bool) *document { |
| var doc document |
| // Iterate on a line-by-line basis. |
| i := 0 |
| pos := 0 |
| for { |
| // Scan the text for the first likely textual content. The scan ignores punctuation |
| // artifacts that include visual boxes for layout as well as comment characters in |
| // source files. |
| firstInLine := true |
| var wid int |
| var r rune |
| |
| if pos == len(text) { |
| break |
| } |
| |
| next := func() { |
| r, wid = utf8.DecodeRuneInString(text[pos:]) |
| pos += wid |
| } |
| |
| for pos < len(text) { |
| start := pos |
| next() |
| |
| if r == '\n' { |
| doc.Tokens = append(doc.Tokens, &token{ |
| Text: eol, |
| Line: i + 1}) |
| i++ |
| } |
| |
| if !isSignificant(r) { |
| continue |
| } |
| |
| // We're at a word/number character. |
| for pos < len(text) { |
| next() |
| if unicode.IsSpace(r) { |
| pos -= wid // Will skip this in outer loop |
| break |
| } |
| } |
| |
| if pos > start { |
| if start >= 2 && text[start-2] == '.' && text[start-1] == ' ' { |
| // Insert a "soft EOL" that helps detect header-looking entries that |
| // follow this text. This resolves problems with licenses that are a |
| // very long line of text, motivated by |
| // https://github.com/microsoft/TypeScript/commit/6e6e570d57b6785335668e30b63712e41f89bf74#diff-e60c8cd1bc09b7c4e1bf79c769c9c120L109 |
| // |
| // Don't do this if the previous token was already an EOL |
| if len(doc.Tokens) > 0 && doc.Tokens[len(doc.Tokens)-1].Text != eol { |
| doc.Tokens = append(doc.Tokens, &token{ |
| Text: eol, |
| Line: i + 1}) |
| } |
| } |
| |
| tok := token{ |
| Text: text[start:pos], |
| Line: i + 1, |
| } |
| if firstInLine { |
| // Store the prefix material, it is useful to discern some corner cases |
| tok.Previous = text[0:start] |
| } |
| doc.Tokens = append(doc.Tokens, &tok) |
| firstInLine = false |
| } |
| } |
| } |
| |
| doc.Tokens = cleanupTokens(doc.Tokens, removeEol) |
| return &doc |
| } |
| |
| func cleanupTokens(in []*token, removeEol bool) []*token { |
| // This routine performs sanitization of tokens. If it is a header-looking |
| // token (but not a version number) starting a line, it is removed. |
| // Hyphenated words are reassembled. |
| partialWord := "" |
| var out []*token |
| tokIdx := 0 |
| firstInLine := true |
| for i, tok := range in { |
| if firstInLine && header(tok) { |
| continue |
| } |
| if tok.Text == eol { |
| firstInLine = true |
| if removeEol { |
| continue |
| } |
| // If we are reconstructing a hyphenated word, don't append the EOL |
| // now, do it when the word is reconstructed. |
| if partialWord == "" { |
| out = append(out, &token{Text: eol, Line: tok.Line, Index: tokIdx}) |
| tokIdx++ |
| } |
| continue |
| } |
| firstInLine = false |
| t := cleanupToken(tok.Text) |
| // If this is the last token in a line, and it looks like a hyphenated |
| // word, store it for reassembly. |
| if strings.HasSuffix(tok.Text, "-") && i+1 < len(in) && in[i+1].Text == eol { |
| partialWord = t |
| } else if partialWord != "" { |
| // Repair hyphenated words |
| tp := in[i-1] |
| tp.Text = partialWord + t |
| tp.Index = tokIdx |
| tp.Previous = "" |
| out = append(out, tp) |
| tokIdx++ |
| if !removeEol { |
| // Append the EOL now that the whole word is recovered |
| out = append(out, &token{Text: eol, Line: tp.Line, Index: tokIdx}) |
| tokIdx++ |
| } |
| |
| partialWord = "" |
| } else { |
| tok.Text = t |
| tok.Index = tokIdx |
| tok.Previous = "" |
| out = append(out, tok) |
| tokIdx++ |
| } |
| } |
| return out |
| } |
| |
| // interchangeablePunctutation is punctuation that can be normalized. |
| var interchangeablePunctuation = []struct { |
| interchangeable string |
| substitute string |
| }{ |
| // Hyphen, Dash, En Dash, and Em Dash. |
| {`-‒–—‐`, "-"}, |
| // Single, Double, Curly Single, and Curly Double. |
| {"'\"`‘’“”", "'"}, |
| // Copyright. |
| {"©", "(c)"}, |
| // Currency and Section. (Different copies of the CDDL use each marker.) |
| {"§¤", "(s)"}, |
| // Middle Dot |
| {"·", " "}, |
| {"*", " "}, |
| } |
| |
| // normalizePunctuation takes all hyphens and quotes and normalizes them. |
| func normalizePunctuation(s string) string { |
| for _, iw := range interchangeablePunctuation { |
| for _, in := range strings.Split(iw.interchangeable, "") { |
| s = strings.ReplaceAll(s, in, iw.substitute) |
| } |
| } |
| return s |
| } |
| |
| // interchangeableWords are words we can substitute for a normalized form |
| // without changing the meaning of the license. See |
| // https://spdx.org/spdx-license-list/matching-guidelines for the list. |
| var interchangeableWords = []struct { |
| interchangeable *regexp.Regexp |
| substitute string |
| }{ |
| {regexp.MustCompile("acknowledgement"), "acknowledgment"}, |
| {regexp.MustCompile("analogue"), "analog"}, |
| {regexp.MustCompile("analyse"), "analyze"}, |
| {regexp.MustCompile("artefact"), "artifact"}, |
| {regexp.MustCompile("authorisation"), "authorization"}, |
| {regexp.MustCompile("authorised"), "authorized"}, |
| {regexp.MustCompile("calibre"), "caliber"}, |
| {regexp.MustCompile("cancelled"), "canceled"}, |
| {regexp.MustCompile("capitalisations"), "capitalizations"}, |
| {regexp.MustCompile("catalogue"), "catalog"}, |
| {regexp.MustCompile("categorise"), "categorize"}, |
| {regexp.MustCompile("centre"), "center"}, |
| {regexp.MustCompile("emphasised"), "emphasized"}, |
| {regexp.MustCompile("favour"), "favor"}, |
| {regexp.MustCompile("favourite"), "favorite"}, |
| {regexp.MustCompile("fulfil\\b"), "fulfill"}, |
| {regexp.MustCompile("fulfilment"), "fulfillment"}, |
| {regexp.MustCompile("https"), "http"}, |
| {regexp.MustCompile("initialise"), "initialize"}, |
| {regexp.MustCompile("judgment"), "judgement"}, |
| {regexp.MustCompile("labelling"), "labeling"}, |
| {regexp.MustCompile("labour"), "labor"}, |
| {regexp.MustCompile("licence"), "license"}, |
| {regexp.MustCompile("maximise"), "maximize"}, |
| {regexp.MustCompile("modelled"), "modeled"}, |
| {regexp.MustCompile("modelling"), "modeling"}, |
| {regexp.MustCompile("offence"), "offense"}, |
| {regexp.MustCompile("optimise"), "optimize"}, |
| {regexp.MustCompile("organisation"), "organization"}, |
| {regexp.MustCompile("organise"), "organize"}, |
| {regexp.MustCompile("practise"), "practice"}, |
| {regexp.MustCompile("programme"), "program"}, |
| {regexp.MustCompile("realise"), "realize"}, |
| {regexp.MustCompile("recognise"), "recognize"}, |
| {regexp.MustCompile("signalling"), "signaling"}, |
| {regexp.MustCompile("sub[ -]license"), "sublicense"}, |
| {regexp.MustCompile("utilisation"), "utilization"}, |
| {regexp.MustCompile("whilst"), "while"}, |
| {regexp.MustCompile("wilful"), "wilfull"}, |
| {regexp.MustCompile("non[ -]commercial"), "noncommercial"}, |
| {regexp.MustCompile("per cent"), "percent"}, |
| } |
| |
| // normalizeWords remaps equivalent words that are interchangeable and lowercases |
| // the word to allow for exact matching. |
| func normalizeWords(s string) string { |
| s = strings.ToLower(s) |
| for _, iw := range interchangeableWords { |
| s = iw.interchangeable.ReplaceAllString(s, iw.substitute) |
| } |
| return s |
| } |
| |
| func header(tok *token) bool { |
| in := tok.Text |
| p, e := in[:len(in)-1], in[len(in)-1] |
| switch e { |
| case '.', ':', ')': |
| if listMarker[p] { |
| if e != ')' { |
| return true |
| } |
| // Sometimes an internal reference like "(ii)" from NPL-1.02.txt |
| // endds up at the beginning of a line. In that case, it's |
| // not actually a header. |
| if e == ')' && !strings.HasSuffix(tok.Previous, "(") { |
| return true |
| } |
| } |
| // Check for patterns like 1.2.3 |
| for _, r := range p { |
| if unicode.IsDigit(r) || r == '.' { |
| continue |
| } |
| return false |
| } |
| return true |
| } |
| return false |
| } |
| |
| var listMarker = func() map[string]bool { |
| const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv v vi vii viii ix xi xii xiii xiv xv" |
| l := map[string]bool{} |
| for _, marker := range strings.Split(allListMarkers, " ") { |
| l[marker] = true |
| } |
| return l |
| }() |
| |
| // ignorableTexts is a list of lines at the start of the string we can remove |
| // to get a cleaner match. |
| var ignorableTexts = []*regexp.Regexp{ |
| regexp.MustCompile(`(?i)^(.{1,5})?copyright (\(c\) )?(\[yyyy\]|\d{4})[,.]?.*$`), |
| regexp.MustCompile(`(?i)^(.{1,5})?copyright \(c\) \[dates of first publication\].*$`), |
| regexp.MustCompile(`(?i)^\d{4}-(\d{2}|[a-z]{3})-\d{2}$`), |
| } |
| |
| // removeIgnorableTexts removes common text, which is not important for |
| // classification |
| func removeIgnorableTexts(s string) string { |
| var out []string |
| lines := strings.Split(s, "\n") |
| for _, l := range lines { |
| line := strings.TrimSpace(l) |
| var match bool |
| for _, re := range ignorableTexts { |
| if re.MatchString(line) { |
| match = true |
| } |
| } |
| if !match { |
| out = append(out, l) |
| } else { |
| // We want to preserve line presence for the positional information |
| out = append(out, "") |
| } |
| } |
| return strings.Join(out, "\n") |
| } |