tools/check-licenses/file/filedata.go - fuchsia - Git at Google

 // Copyright 2022 The Fuchsia Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 package file

 import (
 	"bytes"
 	"crypto/sha1"
 	"encoding/base64"
 	"fmt"
 	"hash/fnv"
 	"strings"

 	classifierLib "github.com/google/licenseclassifier/v2"
 	"go.fuchsia.dev/fuchsia/tools/check-licenses/file/notice"
 )

 // FileData holds the text information (and some metadata) for a given file.
 //
 // Many NOTICE files include several license texts.
 // FileData represents one of those segments. It also maintains a line number
 // that points to the location of this license text in the original NOTICE file,
 // making it easier to find this license text again later.
 type FileData struct {
 	// file is the File this text was loaded from.
 	// libraryName is the library the text belongs to; LoadFileData falls back
 	// to the file's project name when the parsed segment has no name.
 	// lineNumber is the position of the text in the original file; it is 0
 	// for regular and single-license files.
 	// data is the whitespace-trimmed text, after the configured character
 	// replacements have been applied.
 	file        *File
 	libraryName string
 	lineNumber  int
 	data        []byte

 	// searchResults holds the classifier output for data. It stays nil until
 	// Search has been called.
 	searchResults *classifierLib.Results

 	// Descriptive metadata exposed through the getters below.
 	// url is copied from the file for regular and single-license files and
 	// may be overwritten by UpdateURLs.
 	// NOTE(review): licenseType and patternPath are not assigned anywhere in
 	// this file; presumably they are populated elsewhere in the package.
 	licenseType string
 	patternPath string
 	url         string

 	// NOTE(review): these two fields are not assigned anywhere in this file;
 	// presumably they are populated elsewhere in the package.
 	beingSurfaced      string
 	sourceCodeIncluded string

 	// SPDX fields for referencing this file content
 	// in the SPDX output file.
 	// spdxName is the library name; spdxID is "LicenseRef-filedata-" followed
 	// by a hex FNV-128a digest of the library name, relative path and text.
 	spdxName string
 	spdxID   string

 	// hash caches the value computed by Hash; SetData resets and refreshes it.
 	hash string
 }

 // OrderFileData implements sort.Interface for []*FileData. Entries are
 // ordered by the absolute path of their file; entries that share a path are
 // ordered by line number.
 type OrderFileData []*FileData

 // Len reports the number of entries.
 func (a OrderFileData) Len() int {
 	return len(a)
 }

 // Swap exchanges the entries at positions i and j.
 func (a OrderFileData) Swap(i, j int) {
 	a[i], a[j] = a[j], a[i]
 }

 // Less reports whether the entry at position i sorts before the one at j.
 func (a OrderFileData) Less(i, j int) bool {
 	left, right := a[i], a[j]
 	if left.file.absPath != right.file.absPath {
 		return left.file.absPath < right.file.absPath
 	}
 	// Same file: fall back to the position inside that file.
 	return left.lineNumber < right.lineNumber
 }

 // LoadFileData converts the raw content of f into FileData entries, one per
 // license text found in the file.
 //
 // The file type (set at the project level in README.fuchsia files) selects
 // how the content is split:
 //   - RegularFile and SingleLicense yield a single entry holding the whole
 //     whitespace-trimmed content at line 0, carrying the file's URL.
 //   - The MultiLicense* types are NOTICE files that contain several license
 //     texts. Each is split by the matching parser in the notice package and
 //     yields one entry per parsed segment.
 //
 // Afterwards every entry has the Config.Replacements substitutions applied
 // to its text, inherits the project name if it has no library name, and is
 // assigned its SPDX name and ID.
 //
 // An error is returned if the selected parser fails or if the file type is
 // not one of the types listed above.
 func LoadFileData(f *File, content []byte) ([]*FileData, error) {
 	segments := make([]*FileData, 0)

 	// addSegment records one license text extracted from a NOTICE file.
 	addSegment := func(line int, library string, text []byte) {
 		segments = append(segments, &FileData{
 			file:        f,
 			lineNumber:  line,
 			libraryName: library,
 			data:        bytes.TrimSpace(text),
 		})
 	}

 	switch f.fileType {
 	// Regular source files (which, for fuchsia.git, must carry a copyright
 	// header) and LICENSE files holding a single license are both treated
 	// as one block of text.
 	case RegularFile, SingleLicense:
 		segments = append(segments, &FileData{
 			file:        f,
 			lineNumber:  0,
 			libraryName: f.projectName,
 			data:        bytes.TrimSpace(content),
 			url:         f.url,
 		})

 	// NOTICE files that contain text for multiple licenses.
 	// See the files in the /notice subdirectory for more info.
 	case MultiLicense:
 		parsed, err := notice.ParseOneDelimiter(f.absPath, content)
 		if err != nil {
 			return nil, err
 		}
 		for _, p := range parsed {
 			addSegment(p.LineNumber, p.LibraryName, p.LicenseText)
 		}

 	case MultiLicenseChromium:
 		parsed, err := notice.ParseChromium(f.absPath, content)
 		if err != nil {
 			return nil, err
 		}
 		for _, p := range parsed {
 			addSegment(p.LineNumber, p.LibraryName, p.LicenseText)
 		}

 	case MultiLicenseFlutter:
 		parsed, err := notice.ParseFlutter(f.absPath, content)
 		if err != nil {
 			return nil, err
 		}
 		for _, p := range parsed {
 			addSegment(p.LineNumber, p.LibraryName, p.LicenseText)
 		}

 	case MultiLicenseAndroid:
 		parsed, err := notice.ParseAndroid(f.absPath, content)
 		if err != nil {
 			return nil, err
 		}
 		for _, p := range parsed {
 			addSegment(p.LineNumber, p.LibraryName, p.LicenseText)
 		}

 	case MultiLicenseGoogle:
 		parsed, err := notice.ParseGoogle(f.absPath, content)
 		if err != nil {
 			return nil, err
 		}
 		for _, p := range parsed {
 			addSegment(p.LineNumber, p.LibraryName, p.LicenseText)
 		}

 	default:
 		return nil, fmt.Errorf("File type %v is unknown for filedata processing.", f.fileType)
 	}

 	for _, fd := range segments {
 		// Some characters in license texts are not interpreted properly
 		// (mismatched encodings?) and end up as garbled characters in
 		// output files. Replace them with properly encoded ones.
 		for _, r := range Config.Replacements {
 			fd.data = bytes.ReplaceAll(fd.data, []byte(r.Replace), []byte(r.With))
 		}

 		if fd.libraryName == "" {
 			fd.libraryName = f.projectName
 		}
 		fd.spdxName = fd.libraryName

 		// The SPDX ID is derived from the library name, the file's relative
 		// path and the text. Formatting straight into the hasher avoids
 		// building an intermediate string copy of the license text.
 		digest := fnv.New128a()
 		fmt.Fprintf(digest, "%s %s %s", fd.libraryName, fd.file.relPath, fd.data)
 		fd.spdxID = fmt.Sprintf("LicenseRef-filedata-%x", digest.Sum(nil))
 	}
 	return segments, nil
 }

 // Search runs the license classifier over this segment's text and stores the
 // result. The classifier is only invoked while no result is stored; later
 // calls leave the stored result untouched.
 func (fd *FileData) Search() {
 	if fd.searchResults != nil {
 		return
 	}
 	matches := classifier.Match(fd.data)
 	fd.searchResults = &matches
 }

 // Getters

 // File returns the File this text was loaded from.
 func (fd *FileData) File() *File {
 	return fd.file
 }

 // LibraryName returns the name of the library this text belongs to.
 func (fd *FileData) LibraryName() string {
 	return fd.libraryName
 }

 // LineNumber returns the line at which this text starts in its file.
 func (fd *FileData) LineNumber() int {
 	return fd.lineNumber
 }

 // Data returns the text held by this FileData. The slice is not copied.
 func (fd *FileData) Data() []byte {
 	return fd.data
 }

 // LicenseType returns the license type recorded for this text.
 func (fd *FileData) LicenseType() string {
 	return fd.licenseType
 }

 // PatternPath returns the pattern path recorded for this text.
 func (fd *FileData) PatternPath() string {
 	return fd.patternPath
 }

 // URL returns the URL recorded for this text.
 func (fd *FileData) URL() string {
 	return fd.url
 }

 // BeingSurfaced returns the beingSurfaced value recorded for this text.
 func (fd *FileData) BeingSurfaced() string {
 	return fd.beingSurfaced
 }

 // SourceCodeIncluded returns the sourceCodeIncluded value recorded for
 // this text.
 func (fd *FileData) SourceCodeIncluded() string {
 	return fd.sourceCodeIncluded
 }

 // SPDXName returns the name used for this text in the SPDX output.
 func (fd *FileData) SPDXName() string {
 	return fd.spdxName
 }

 // SPDXID returns the identifier used for this text in the SPDX output.
 func (fd *FileData) SPDXID() string {
 	return fd.spdxID
 }
 // SearchResults returns the classifier results stored for this text. It is
 // nil until results have been stored, which Search does on its first call.
 func (fd *FileData) SearchResults() *classifierLib.Results {
 	return fd.searchResults
 }

 // SetData replaces the text held by this FileData and refreshes the cached
 // content hash so that it matches the new text.
 //
 // For copyright data we want the FileData to contain only the copyright
 // text, not the rest of the source code in the file. This method lets the
 // caller narrow the text down after the copyright header has been detected.
 func (fd *FileData) SetData(data []byte) {
 	// Clear the cached hash first so that Hash recomputes it from the new text.
 	fd.hash, fd.data = "", data
 	fd.Hash()
 }

 // UpdateURLs uses the filedataurls information from the config, along with
 // the project name, to define the actual location of the license file on
 // the internet.
 //
 // Only files whose relative path contains "prebuilt" are considered. The
 // Config.FileDataURLs entries are scanned in order; the first entry that
 // lists projectName and has a replacement for this library name sets the
 // URL to that entry's prefix followed by the replacement. If nothing
 // matches, the URL is left unchanged. projectURL is currently unused.
 func (fd *FileData) UpdateURLs(projectName string, projectURL string) {
 	if !strings.Contains(fd.file.relPath, "prebuilt") {
 		return
 	}

 	for _, rule := range Config.FileDataURLs {
 		if _, listed := rule.Projects[projectName]; !listed {
 			continue
 		}
 		suffix, found := rule.Replacements[fd.libraryName]
 		if !found {
 			continue
 		}
 		fd.url = fmt.Sprintf("%v%v", rule.Prefix, suffix)
 		return
 	}
 }

 // Hash returns a digest of the whitespace-trimmed text of this FileData, to
 // help detect duplicate texts and reduce the final NOTICE file size.
 //
 // The digest is the URL-safe base64 encoding of the SHA-1 sum. It is
 // computed on first use and cached; later calls return the cached value.
 func (fd *FileData) Hash() string {
 	if fd.hash != "" {
 		return fd.hash
 	}

 	sum := sha1.Sum(bytes.TrimSpace(fd.data))
 	fd.hash = base64.URLEncoding.EncodeToString(sum[:])
 	return fd.hash
 }
	// Copyright 2022 The Fuchsia Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	package file

	import (
	"bytes"
	"crypto/sha1"
	"encoding/base64"
	"fmt"
	"hash/fnv"
	"strings"

	classifierLib "github.com/google/licenseclassifier/v2"
	"go.fuchsia.dev/fuchsia/tools/check-licenses/file/notice"
	)

	// FileData holds the text information (and some metadata) for a given file.
	//
	// Many NOTICE files include several license texts.
	// FileData represents one of those segments. It also maintains a line number
	// that points to the location of this license text in the original NOTICE file,
	// making it easier to find this license text again later.
	type FileData struct {
	// file is the File this text was loaded from.
	// libraryName is the library the text belongs to; LoadFileData falls back
	// to the file's project name when the parsed segment has no name.
	// lineNumber is the position of the text in the original file; it is 0
	// for regular and single-license files.
	// data is the whitespace-trimmed text, after the configured character
	// replacements have been applied.
	file *File
	libraryName string
	lineNumber int
	data []byte

	// searchResults holds the classifier output for data. It stays nil until
	// Search has been called.
	searchResults *classifierLib.Results

	// Descriptive metadata exposed through the getters below.
	// url is copied from the file for regular and single-license files and
	// may be overwritten by UpdateURLs.
	// NOTE(review): licenseType and patternPath are not assigned anywhere in
	// this file; presumably they are populated elsewhere in the package.
	licenseType string
	patternPath string
	url string

	// NOTE(review): these two fields are not assigned anywhere in this file;
	// presumably they are populated elsewhere in the package.
	beingSurfaced string
	sourceCodeIncluded string

	// SPDX fields for referencing this file content
	// in the SPDX output file.
	// spdxName is the library name; spdxID is "LicenseRef-filedata-" followed
	// by a hex FNV-128a digest of the library name, relative path and text.
	spdxName string
	spdxID string

	// hash caches the value computed by Hash; SetData resets and refreshes it.
	hash string
	}

	// OrderFileData implements sort.Interface for []*FileData. Entries are
	// ordered by the absolute path of their file; entries that share a path are
	// ordered by line number.
	type OrderFileData []*FileData

	// Len reports the number of entries.
	func (a OrderFileData) Len() int {
		return len(a)
	}

	// Swap exchanges the entries at positions i and j.
	func (a OrderFileData) Swap(i, j int) {
		a[i], a[j] = a[j], a[i]
	}

	// Less reports whether the entry at position i sorts before the one at j.
	func (a OrderFileData) Less(i, j int) bool {
		left, right := a[i], a[j]
		if left.file.absPath != right.file.absPath {
			return left.file.absPath < right.file.absPath
		}
		// Same file: fall back to the position inside that file.
		return left.lineNumber < right.lineNumber
	}

	// LoadFileData converts the raw content of f into FileData entries, one per
	// license text found in the file.
	//
	// The file type (set at the project level in README.fuchsia files) selects
	// how the content is split:
	//   - RegularFile and SingleLicense yield a single entry holding the whole
	//     whitespace-trimmed content at line 0, carrying the file's URL.
	//   - The MultiLicense* types are NOTICE files that contain several license
	//     texts. Each is split by the matching parser in the notice package and
	//     yields one entry per parsed segment.
	//
	// Afterwards every entry has the Config.Replacements substitutions applied
	// to its text, inherits the project name if it has no library name, and is
	// assigned its SPDX name and ID.
	//
	// An error is returned if the selected parser fails or if the file type is
	// not one of the types listed above.
	//
	// f is taken by pointer and the entries are returned as pointers: the
	// FileData.file field is a *File and the entries are built and mutated
	// as *FileData, so value types here would not type-check.
	func LoadFileData(f *File, content []byte) ([]*FileData, error) {
		segments := make([]*FileData, 0)

		// addSegment records one license text extracted from a NOTICE file.
		addSegment := func(line int, library string, text []byte) {
			segments = append(segments, &FileData{
				file:        f,
				lineNumber:  line,
				libraryName: library,
				data:        bytes.TrimSpace(text),
			})
		}

		switch f.fileType {
		// Regular source files (which, for fuchsia.git, must carry a copyright
		// header) and LICENSE files holding a single license are both treated
		// as one block of text.
		case RegularFile, SingleLicense:
			segments = append(segments, &FileData{
				file:        f,
				lineNumber:  0,
				libraryName: f.projectName,
				data:        bytes.TrimSpace(content),
				url:         f.url,
			})

		// NOTICE files that contain text for multiple licenses.
		// See the files in the /notice subdirectory for more info.
		case MultiLicense:
			parsed, err := notice.ParseOneDelimiter(f.absPath, content)
			if err != nil {
				return nil, err
			}
			for _, p := range parsed {
				addSegment(p.LineNumber, p.LibraryName, p.LicenseText)
			}

		case MultiLicenseChromium:
			parsed, err := notice.ParseChromium(f.absPath, content)
			if err != nil {
				return nil, err
			}
			for _, p := range parsed {
				addSegment(p.LineNumber, p.LibraryName, p.LicenseText)
			}

		case MultiLicenseFlutter:
			parsed, err := notice.ParseFlutter(f.absPath, content)
			if err != nil {
				return nil, err
			}
			for _, p := range parsed {
				addSegment(p.LineNumber, p.LibraryName, p.LicenseText)
			}

		case MultiLicenseAndroid:
			parsed, err := notice.ParseAndroid(f.absPath, content)
			if err != nil {
				return nil, err
			}
			for _, p := range parsed {
				addSegment(p.LineNumber, p.LibraryName, p.LicenseText)
			}

		case MultiLicenseGoogle:
			parsed, err := notice.ParseGoogle(f.absPath, content)
			if err != nil {
				return nil, err
			}
			for _, p := range parsed {
				addSegment(p.LineNumber, p.LibraryName, p.LicenseText)
			}

		default:
			return nil, fmt.Errorf("File type %v is unknown for filedata processing.", f.fileType)
		}

		for _, fd := range segments {
			// Some characters in license texts are not interpreted properly
			// (mismatched encodings?) and end up as garbled characters in
			// output files. Replace them with properly encoded ones.
			for _, r := range Config.Replacements {
				fd.data = bytes.ReplaceAll(fd.data, []byte(r.Replace), []byte(r.With))
			}

			if fd.libraryName == "" {
				fd.libraryName = f.projectName
			}
			fd.spdxName = fd.libraryName

			// The SPDX ID is derived from the library name, the file's relative
			// path and the text. Formatting straight into the hasher avoids
			// building an intermediate string copy of the license text.
			digest := fnv.New128a()
			fmt.Fprintf(digest, "%s %s %s", fd.libraryName, fd.file.relPath, fd.data)
			fd.spdxID = fmt.Sprintf("LicenseRef-filedata-%x", digest.Sum(nil))
		}
		return segments, nil
	}

	// Search runs the license classifier over this segment's text and stores the
	// result. The classifier is only invoked while no result is stored; later
	// calls leave the stored result untouched.
	func (fd *FileData) Search() {
		if fd.searchResults != nil {
			return
		}
		matches := classifier.Match(fd.data)
		fd.searchResults = &matches
	}

	// Getters

	// File returns the File this text was loaded from. The file field is a
	// *File, so the pointer is returned as-is; like every other getter it
	// uses a pointer receiver.
	func (fd *FileData) File() *File {
		return fd.file
	}

	// LibraryName returns the name of the library this text belongs to.
	func (fd *FileData) LibraryName() string {
		return fd.libraryName
	}

	// LineNumber returns the line at which this text starts in its file.
	func (fd *FileData) LineNumber() int {
		return fd.lineNumber
	}

	// Data returns the text held by this FileData. The slice is not copied.
	func (fd *FileData) Data() []byte {
		return fd.data
	}

	// LicenseType returns the license type recorded for this text.
	func (fd *FileData) LicenseType() string {
		return fd.licenseType
	}

	// PatternPath returns the pattern path recorded for this text.
	func (fd *FileData) PatternPath() string {
		return fd.patternPath
	}

	// URL returns the URL recorded for this text.
	func (fd *FileData) URL() string {
		return fd.url
	}

	// BeingSurfaced returns the beingSurfaced value recorded for this text.
	func (fd *FileData) BeingSurfaced() string {
		return fd.beingSurfaced
	}

	// SourceCodeIncluded returns the sourceCodeIncluded value recorded for
	// this text.
	func (fd *FileData) SourceCodeIncluded() string {
		return fd.sourceCodeIncluded
	}

	// SPDXName returns the name used for this text in the SPDX output.
	func (fd *FileData) SPDXName() string {
		return fd.spdxName
	}

	// SPDXID returns the identifier used for this text in the SPDX output.
	func (fd *FileData) SPDXID() string {
		return fd.spdxID
	}
	// SearchResults returns the classifier results stored for this text. It is
	// nil until results have been stored, which Search does on its first call.
	// The searchResults field is a *classifierLib.Results, so the pointer is
	// returned as-is; like the other getters it uses a pointer receiver.
	func (fd *FileData) SearchResults() *classifierLib.Results {
		return fd.searchResults
	}

	// SetData replaces the text held by this FileData and refreshes the cached
	// content hash so that it matches the new text.
	//
	// For copyright data we want the FileData to contain only the copyright
	// text, not the rest of the source code in the file. This method lets the
	// caller narrow the text down after the copyright header has been detected.
	func (fd *FileData) SetData(data []byte) {
		// Clear the cached hash first so that Hash recomputes it from the new text.
		fd.hash, fd.data = "", data
		fd.Hash()
	}

	// UpdateURLs uses the filedataurls information from the config, along with
	// the project name, to define the actual location of the license file on
	// the internet.
	//
	// Only files whose relative path contains "prebuilt" are considered. The
	// Config.FileDataURLs entries are scanned in order; the first entry that
	// lists projectName and has a replacement for this library name sets the
	// URL to that entry's prefix followed by the replacement. If nothing
	// matches, the URL is left unchanged. projectURL is currently unused.
	func (fd *FileData) UpdateURLs(projectName string, projectURL string) {
		if !strings.Contains(fd.file.relPath, "prebuilt") {
			return
		}

		for _, rule := range Config.FileDataURLs {
			if _, listed := rule.Projects[projectName]; !listed {
				continue
			}
			suffix, found := rule.Replacements[fd.libraryName]
			if !found {
				continue
			}
			fd.url = fmt.Sprintf("%v%v", rule.Prefix, suffix)
			return
		}
	}

	// Hash returns a digest of the whitespace-trimmed text of this FileData, to
	// help detect duplicate texts and reduce the final NOTICE file size.
	//
	// The digest is the URL-safe base64 encoding of the SHA-1 sum. It is
	// computed on first use and cached; later calls return the cached value.
	func (fd *FileData) Hash() string {
		if fd.hash != "" {
			return fd.hash
		}

		sum := sha1.Sum(bytes.TrimSpace(fd.data))
		fd.hash = base64.URLEncoding.EncodeToString(sum[:])
		return fd.hash
	}