builder/dockerfile/parser/parser.go - third_party/github.com/moby/moby - Git at Google

 // Package parser implements a parser and parse tree dumper for Dockerfiles.
 package parser

 import (
 	"bufio"
 	"bytes"
 	"fmt"
 	"io"
 	"regexp"
 	"runtime"
 	"strconv"
 	"strings"
 	"unicode"

 	"github.com/docker/docker/builder/dockerfile/command"
 	"github.com/docker/docker/pkg/system"
 	"github.com/pkg/errors"
 )

 // Node is a structure used to represent a parse tree.
 //
 // The tree itself is navigated through three fields: Value, Next, and
 // Children. Value is the current token's string value. Next is always the
 // next non-child token, and Children contains all the children. Here's an
 // example:
 //
 // (value next (child child-next child-next-next) next-next)
 //
 // The remaining fields carry metadata about the node: Attributes, Original,
 // Flags, and the line range StartLine/endLine.
 //
 // This data structure is frankly pretty lousy for handling complex languages,
 // but lucky for us the Dockerfile isn't very complicated. This structure
 // works a little more effectively than a "proper" parse tree for our needs.
 type Node struct {
 	Value      string          // actual content (the token's string value)
 	Next       *Node           // the next item in the current sexp
 	Children   []*Node         // the children of this sexp
 	Attributes map[string]bool // special attributes for this node
 	Original   string          // original line used before parsing
 	Flags      []string        // only top Node should have this set
 	StartLine  int             // the line in the original dockerfile where the node begins
 	endLine    int             // the line in the original dockerfile where the node ends
 }

 // Dump renders the AST rooted at node as a list of sexps and returns it as a
 // string suitable for printing.
 //
 // The output is the node's Value, its Flags (quoted, when present), one
 // parenthesised line per child, and then every node reachable through Next:
 // a Next node without children is written as a quoted string, one with
 // children is dumped recursively. Surrounding whitespace is trimmed.
 func (node *Node) Dump() string {
 	var out bytes.Buffer
 	out.WriteString(node.Value)

 	if len(node.Flags) > 0 {
 		fmt.Fprintf(&out, " %q", node.Flags)
 	}

 	for _, child := range node.Children {
 		out.WriteString("(")
 		out.WriteString(child.Dump())
 		out.WriteString(")\n")
 	}

 	for sibling := node.Next; sibling != nil; sibling = sibling.Next {
 		out.WriteString(" ")
 		if len(sibling.Children) > 0 {
 			out.WriteString(sibling.Dump())
 			continue
 		}
 		out.WriteString(strconv.Quote(sibling.Value))
 	}

 	return strings.TrimSpace(out.String())
 }

 // lines records the first and last line of the original dockerfile that the
 // node spans.
 func (node *Node) lines(start, end int) {
 	node.StartLine, node.endLine = start, end
 }

 // AddChild appends child to node's Children and updates line information.
 //
 // The child is stamped with the given line range. The parent's endLine is
 // always moved to endLine, while its StartLine is only set when it is still
 // negative, so the first child added determines where the parent begins.
 func (node *Node) AddChild(child *Node, startLine, endLine int) {
 	child.StartLine, child.endLine = startLine, endLine

 	if node.StartLine < 0 {
 		node.StartLine = startLine
 	}
 	node.endLine = endLine

 	node.Children = append(node.Children, child)
 }

 // Package-level parser state:
 //
 //   - dispatch maps an instruction name to the line parser that turns the
 //     instruction's arguments into nodes; it is filled in by init.
 //   - tokenWhitespace matches a run of one or more tabs, vertical tabs, form
 //     feeds, carriage returns or spaces. It is not referenced in this file;
 //     presumably the line parsers use it (TODO confirm).
 //   - tokenEscapeCommand matches an escape parser directive: '#', optional
 //     spaces/tabs, "escape", '=', and captures the first character that
 //     follows as "escapechar".
 //   - tokenPlatformCommand matches a platform parser directive the same way
 //     and captures the rest of the line as "platform".
 //   - tokenComment matches a line that starts with '#'.
 var (
 	dispatch             map[string]func(string, *Directive) (*Node, map[string]bool, error)
 	tokenWhitespace      = regexp.MustCompile(`[\t\v\f\r ]+`)
 	tokenEscapeCommand   = regexp.MustCompile(`^#[ \t]*escape[ \t]*=[ \t]*(?P<escapechar>.).*$`)
 	tokenPlatformCommand = regexp.MustCompile(`^#[ \t]*platform[ \t]*=[ \t]*(?P<platform>.*)$`)
 	tokenComment         = regexp.MustCompile(`^#.*$`)
 )

 // DefaultEscapeToken is the default escape token: a backslash. It is the
 // token NewDefaultDirective installs before any escape directive is seen.
 const DefaultEscapeToken = '\\'

 // Directive is the structure used during a build run to hold the state of
 // parsing directives: the values selected by the escape and platform parser
 // directives, plus the bookkeeping needed to reject repeated directives and
 // to stop looking for them once the first non-directive line has been seen.
 type Directive struct {
 	escapeToken           rune           // Current escape token
 	platformToken         string         // Current platform token
 	lineContinuationRegex *regexp.Regexp // Current line continuation regex (rebuilt by setEscapeToken)
 	processingComplete    bool           // Whether we are done looking for directives
 	escapeSeen            bool           // Whether the escape directive has been seen
 	platformSeen          bool           // Whether the platform directive has been seen
 }

 // setEscapeToken sets the token used for escaping characters in a Dockerfile
 // and rebuilds the line continuation regex to match that token followed only
 // by spaces or tabs at the end of a line.
 //
 // Only a backtick or a backslash is accepted; any other value returns an
 // error and leaves the directive untouched.
 func (d *Directive) setEscapeToken(s string) error {
 	switch s {
 	case "`", "\\":
 		// accepted escape tokens
 	default:
 		return fmt.Errorf("invalid ESCAPE '%s'. Must be ` or \\", s)
 	}

 	continuation := regexp.MustCompile(`\` + s + `[ \t]*$`)
 	d.escapeToken = rune(s[0])
 	d.lineContinuationRegex = continuation
 	return nil
 }

 // setPlatformToken sets the default platform for pulling images in a
 // Dockerfile. The value is compared case-insensitively (it is lowercased
 // first) against the accepted platforms: runtime.GOOS and, when
 // system.LCOWSupported reports true, "linux". Any other value is an error.
 func (d *Directive) setPlatformToken(s string) error {
 	s = strings.ToLower(s)

 	valid := []string{runtime.GOOS}
 	if system.LCOWSupported() {
 		valid = append(valid, "linux")
 	}

 	for _, candidate := range valid {
 		if candidate != s {
 			continue
 		}
 		d.platformToken = s
 		return nil
 	}
 	return fmt.Errorf("invalid PLATFORM '%s'. Must be one of %v", s, valid)
 }

 // possibleParserDirective looks for one or more parser directives
 // '# escape=<char>' and '# platform=<string>'. Parser directives must precede
 // any builder instruction or other comments, and cannot be repeated.
 //
 // The line is matched case-insensitively (it is lowercased before matching).
 // The first line that is not a directive marks directive processing as
 // complete, after which every call returns nil without inspecting the line.
 // The platform directive is only recognised when system.LCOWSupported
 // reports true.
 func (d *Directive) possibleParserDirective(line string) error {
 	if d.processingComplete {
 		return nil
 	}

 	lowered := strings.ToLower(line)

 	if match := tokenEscapeCommand.FindStringSubmatch(lowered); len(match) != 0 {
 		for idx, name := range tokenEscapeCommand.SubexpNames() {
 			if name != "escapechar" {
 				continue
 			}
 			if d.escapeSeen {
 				return errors.New("only one escape parser directive can be used")
 			}
 			d.escapeSeen = true
 			return d.setEscapeToken(match[idx])
 		}
 	}

 	// Only recognise a platform token if LCOW is supported
 	if system.LCOWSupported() {
 		if match := tokenPlatformCommand.FindStringSubmatch(lowered); len(match) != 0 {
 			for idx, name := range tokenPlatformCommand.SubexpNames() {
 				if name != "platform" {
 					continue
 				}
 				if d.platformSeen {
 					return errors.New("only one platform parser directive can be used")
 				}
 				d.platformSeen = true
 				return d.setPlatformToken(match[idx])
 			}
 		}
 	}

 	// Not a directive: stop looking for directives from here on.
 	d.processingComplete = true
 	return nil
 }

 // NewDefaultDirective returns a new Directive whose escape token is
 // DefaultEscapeToken.
 func NewDefaultDirective() *Directive {
 	d := &Directive{}
 	// DefaultEscapeToken is one of the two tokens setEscapeToken accepts, so
 	// the returned error is always nil here and is deliberately discarded.
 	_ = d.setEscapeToken(string(DefaultEscapeToken))
 	return d
 }

 // init populates the dispatch table that maps each Dockerfile instruction to
 // its line parser.
 func init() {
 	// Dispatch Table. see line_parsers.go for the parse functions.
 	// The command is parsed and mapped to the line parser. The line parser
 	// receives the arguments but not the command, and returns an AST after
 	// reformulating the arguments according to the rules in the parser
 	// functions. Errors are propagated up by Parse() and the resulting AST can
 	// be incorporated directly into the existing AST as a next.
 	//
 	// Instructions missing from this table are not an error: newNodeFromLine
 	// falls back to parseIgnore for them.
 	dispatch = map[string]func(string, *Directive) (*Node, map[string]bool, error){
 		command.Add:         parseMaybeJSONToList,
 		command.Arg:         parseNameOrNameVal,
 		command.Cmd:         parseMaybeJSON,
 		command.Copy:        parseMaybeJSONToList,
 		command.Entrypoint:  parseMaybeJSON,
 		command.Env:         parseEnv,
 		command.Expose:      parseStringsWhitespaceDelimited,
 		command.From:        parseStringsWhitespaceDelimited,
 		command.Healthcheck: parseHealthConfig,
 		command.Label:       parseLabel,
 		command.Maintainer:  parseString,
 		command.Onbuild:     parseSubCommand,
 		command.Run:         parseMaybeJSON,
 		command.Shell:       parseMaybeJSON,
 		command.StopSignal:  parseString,
 		command.User:        parseString,
 		command.Volume:      parseMaybeJSONToList,
 		command.Workdir:     parseString,
 	}
 }

 // newNodeFromLine splits the line into command, flags and arguments, looks up
 // the line parser registered for the command in the dispatch table, and
 // builds a Node from the parser's result. The returned Node carries the
 // command as Value, the unmodified line as Original, the flags, the parsed
 // arguments as Next, and the attributes reported by the parser.
 //
 // An error from splitting the line or from the line parser is returned as is.
 func newNodeFromLine(line string, directive *Directive) (*Node, error) {
 	cmd, flags, args, err := splitCommand(line)
 	if err != nil {
 		return nil, err
 	}

 	parse := dispatch[cmd]
 	if parse == nil {
 		// Invalid Dockerfile instructions are not rejected here; they are
 		// handed to parseIgnore instead.
 		parse = parseIgnore
 	}

 	next, attrs, err := parse(args, directive)
 	if err != nil {
 		return nil, err
 	}

 	node := &Node{
 		Value:      cmd,
 		Original:   line,
 		Flags:      flags,
 		Next:       next,
 		Attributes: attrs,
 	}
 	return node, nil
 }

 // Result is the result of parsing a Dockerfile.
 type Result struct {
 	// AST is the root node; Parse adds one child per parsed instruction.
 	AST *Node
 	// EscapeToken is the escape token in effect when parsing finished.
 	EscapeToken rune
 	// TODO @jhowardmsft - see https://github.com/moby/moby/issues/34617
 	// This next field will be removed in a future update for LCOW support.
 	// OS is the platform token set by the platform parser directive, if any.
 	OS       string
 	// Warnings holds the non-fatal messages collected while parsing.
 	Warnings []string
 }

 // PrintWarnings writes the collected warnings to out, one per line, followed
 // by a trailing newline. Nothing is written when there are no warnings.
 //
 // The warnings are written verbatim rather than used as a format string:
 // they embed text taken from the Dockerfile, which may contain '%'.
 func (r *Result) PrintWarnings(out io.Writer) {
 	if len(r.Warnings) == 0 {
 		return
 	}
 	fmt.Fprintln(out, strings.Join(r.Warnings, "\n"))
 }

 // Parse reads lines from a Reader and parses them into an AST in which every
 // instruction is a child of the root node. Lines joined by the continuation
 // character are merged into a single instruction before being parsed.
 //
 // The returned Result carries the AST, the escape token and platform token in
 // effect at the end of parsing, and any warnings about empty continuation
 // lines. A directive or instruction error aborts parsing and returns a nil
 // Result. A scanner error is returned (via handleScannerError) together with
 // the Result built so far.
 func Parse(rwc io.Reader) (*Result, error) {
 	d := NewDefaultDirective()
 	currentLine := 0
 	// StartLine -1 lets AddChild set the root's start from its first child.
 	root := &Node{StartLine: -1}
 	scanner := bufio.NewScanner(rwc)
 	warnings := []string{}

 	var err error
 	for scanner.Scan() {
 		bytesRead := scanner.Bytes()
 		if currentLine == 0 {
 			// First line, strip the byte-order-marker if present
 			bytesRead = bytes.TrimPrefix(bytesRead, utf8bom)
 		}
 		// The first line of an instruction has its leading whitespace
 		// stripped; comments are removed and parser directives are processed.
 		bytesRead, err = processLine(d, bytesRead, true)
 		if err != nil {
 			return nil, err
 		}
 		currentLine++

 		startLine := currentLine
 		line, isEndOfLine := trimContinuationCharacter(string(bytesRead), d)
 		if isEndOfLine && line == "" {
 			// Nothing left after processing (e.g. a blank line or a comment).
 			continue
 		}

 		// Keep consuming lines for as long as the previous one ended with the
 		// continuation character.
 		var hasEmptyContinuationLine bool
 		for !isEndOfLine && scanner.Scan() {
 			// Note: bytesRead and err are new variables scoped to this loop
 			// body. Leading whitespace is preserved on continuation lines.
 			bytesRead, err := processLine(d, scanner.Bytes(), false)
 			if err != nil {
 				return nil, err
 			}
 			currentLine++

 			// Checked against the raw line: isComment trims leading
 			// whitespace first, so indented comments are skipped as well.
 			if isComment(scanner.Bytes()) {
 				// original line was a comment (processLine strips comments)
 				continue
 			}
 			if isEmptyContinuationLine(bytesRead) {
 				// Skipped, but remembered so a warning can be emitted below.
 				hasEmptyContinuationLine = true
 				continue
 			}

 			continuationLine := string(bytesRead)
 			continuationLine, isEndOfLine = trimContinuationCharacter(continuationLine, d)
 			line += continuationLine
 		}

 		if hasEmptyContinuationLine {
 			warning := "[WARNING]: Empty continuation line found in:\n    " + line
 			warnings = append(warnings, warning)
 		}

 		child, err := newNodeFromLine(line, d)
 		if err != nil {
 			return nil, err
 		}
 		// The child spans from the instruction's first line to the last line
 		// consumed for it.
 		root.AddChild(child, startLine, currentLine)
 	}

 	if len(warnings) > 0 {
 		warnings = append(warnings, "[WARNING]: Empty continuation lines will become errors in a future release.")
 	}
 	return &Result{
 		AST:         root,
 		Warnings:    warnings,
 		EscapeToken: d.escapeToken,
 		OS:          d.platformToken,
 	}, handleScannerError(scanner.Err())
 }

 // trimComments returns src with any match of tokenComment (a '#' at the very
 // start of src and everything after it) replaced by nothing.
 func trimComments(src []byte) []byte {
 	replacement := []byte{}
 	return tokenComment.ReplaceAll(src, replacement)
 }

 // trimWhitespace returns src without its leading whitespace, as defined by
 // unicode.IsSpace. Trailing whitespace is left in place.
 func trimWhitespace(src []byte) []byte {
 	trimmed := bytes.TrimLeftFunc(src, unicode.IsSpace)
 	return trimmed
 }

 func isComment(line []byte) bool {
 	return tokenComment.Match(trimWhitespace(line))
 }

 // isEmptyContinuationLine reports whether line is empty or consists of
 // whitespace only.
 func isEmptyContinuationLine(line []byte) bool {
 	remaining := trimWhitespace(line)
 	return len(remaining) == 0
 }

 // utf8bom is the UTF-8 byte-order-marker; Parse strips it from the first
 // line of the input when present.
 var utf8bom = []byte{0xEF, 0xBB, 0xBF}

 // trimContinuationCharacter checks whether line ends with the directive's
 // continuation character (optionally followed by spaces or tabs). If it
 // does, the continuation suffix is removed and false is returned, meaning
 // the logical line goes on. Otherwise line is returned unchanged together
 // with true.
 func trimContinuationCharacter(line string, d *Directive) (string, bool) {
 	if !d.lineContinuationRegex.MatchString(line) {
 		return line, true
 	}
 	return d.lineContinuationRegex.ReplaceAllString(line, ""), false
 }

 // processLine optionally strips leading whitespace from token, feeds the
 // result to the directive parser, and returns it with comments removed
 // together with any error reported by the directive parser.
 //
 // TODO: remove stripLeftWhitespace after deprecation period. It seems silly
 // to preserve whitespace on continuation lines. Why is that done?
 func processLine(d *Directive, token []byte, stripLeftWhitespace bool) ([]byte, error) {
 	if stripLeftWhitespace {
 		token = trimWhitespace(token)
 	}
 	withoutComments := trimComments(token)
 	err := d.possibleParserDirective(string(token))
 	return withoutComments, err
 }

 // handleScannerError translates bufio.ErrTooLong into an error that names
 // the maximum supported dockerfile line size. Any other value, including
 // nil, is returned unchanged.
 func handleScannerError(err error) error {
 	if err == bufio.ErrTooLong {
 		return errors.Errorf("dockerfile line greater than max allowed size of %d", bufio.MaxScanTokenSize-1)
 	}
 	return err
 }
	// Package parser implements a parser and parse tree dumper for Dockerfiles.
	package parser

	import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"regexp"
	"runtime"
	"strconv"
	"strings"
	"unicode"

	"github.com/docker/docker/builder/dockerfile/command"
	"github.com/docker/docker/pkg/system"
	"github.com/pkg/errors"
	)

	// Node is a structure used to represent a parse tree.
	//
	// The tree itself is navigated through three fields: Value, Next, and
	// Children. Value is the current token's string value. Next is always the
	// next non-child token, and Children contains all the children. Here's an
	// example:
	//
	// (value next (child child-next child-next-next) next-next)
	//
	// The remaining fields carry metadata about the node: Attributes, Original,
	// Flags, and the line range StartLine/endLine.
	//
	// This data structure is frankly pretty lousy for handling complex languages,
	// but lucky for us the Dockerfile isn't very complicated. This structure
	// works a little more effectively than a "proper" parse tree for our needs.
	type Node struct {
	Value string // actual content (the token's string value)
	Next *Node // the next item in the current sexp
	Children []*Node // the children of this sexp
	Attributes map[string]bool // special attributes for this node
	Original string // original line used before parsing
	Flags []string // only top Node should have this set
	StartLine int // the line in the original dockerfile where the node begins
	endLine int // the line in the original dockerfile where the node ends
	}

	// Dump renders the AST rooted at node as a list of sexps and returns it as a
	// string suitable for printing.
	//
	// The output is the node's Value, its Flags (quoted, when present), one
	// parenthesised line per child, and then every node reachable through Next:
	// a Next node without children is written as a quoted string, one with
	// children is dumped recursively. Surrounding whitespace is trimmed.
	func (node *Node) Dump() string {
		var out bytes.Buffer
		out.WriteString(node.Value)

		if len(node.Flags) > 0 {
			fmt.Fprintf(&out, " %q", node.Flags)
		}

		for _, child := range node.Children {
			out.WriteString("(")
			out.WriteString(child.Dump())
			out.WriteString(")\n")
		}

		for sibling := node.Next; sibling != nil; sibling = sibling.Next {
			out.WriteString(" ")
			if len(sibling.Children) > 0 {
				out.WriteString(sibling.Dump())
				continue
			}
			out.WriteString(strconv.Quote(sibling.Value))
		}

		return strings.TrimSpace(out.String())
	}

	// lines records the first and last line of the original dockerfile that the
	// node spans.
	func (node *Node) lines(start, end int) {
		node.StartLine, node.endLine = start, end
	}

	// AddChild appends child to node's Children and updates line information.
	//
	// The child is stamped with the given line range. The parent's endLine is
	// always moved to endLine, while its StartLine is only set when it is still
	// negative, so the first child added determines where the parent begins.
	//
	// Both the receiver and child are pointers: the method mutates them, and
	// Children holds *Node values.
	func (node *Node) AddChild(child *Node, startLine, endLine int) {
		child.lines(startLine, endLine)
		if node.StartLine < 0 {
			node.StartLine = startLine
		}
		node.endLine = endLine
		node.Children = append(node.Children, child)
	}

	// Package-level parser state:
	//
	//   - dispatch maps an instruction name to the line parser that turns the
	//     instruction's arguments into nodes; it is filled in by init.
	//   - tokenWhitespace matches a run of one or more tabs, vertical tabs, form
	//     feeds, carriage returns or spaces.
	//   - tokenEscapeCommand matches an escape parser directive: '#', optional
	//     spaces/tabs, "escape", '=', and captures the first character that
	//     follows as "escapechar".
	//   - tokenPlatformCommand matches a platform parser directive the same way
	//     and captures the rest of the line as "platform".
	//   - tokenComment matches a line that starts with '#'.
	var (
		dispatch             map[string]func(string, *Directive) (*Node, map[string]bool, error)
		tokenWhitespace      = regexp.MustCompile(`[\t\v\f\r ]+`)
		tokenEscapeCommand   = regexp.MustCompile(`^#[ \t]*escape[ \t]*=[ \t]*(?P<escapechar>.).*$`)
		tokenPlatformCommand = regexp.MustCompile(`^#[ \t]*platform[ \t]*=[ \t]*(?P<platform>.*)$`)
		tokenComment         = regexp.MustCompile(`^#.*$`)
	)

	// DefaultEscapeToken is the default escape token: a backslash. It is the
	// token NewDefaultDirective installs before any escape directive is seen.
	const DefaultEscapeToken = '\\'

	// Directive is the structure used during a build run to hold the state of
	// parsing directives: the values selected by the escape and platform parser
	// directives, plus the bookkeeping needed to reject repeated directives and
	// to stop looking for them once the first non-directive line has been seen.
	type Directive struct {
	escapeToken rune // Current escape token
	platformToken string // Current platform token
	lineContinuationRegex *regexp.Regexp // Current line continuation regex (rebuilt by setEscapeToken)
	processingComplete bool // Whether we are done looking for directives
	escapeSeen bool // Whether the escape directive has been seen
	platformSeen bool // Whether the platform directive has been seen
	}

	// setEscapeToken sets the token used for escaping characters in a Dockerfile
	// and rebuilds the line continuation regex to match that token followed only
	// by spaces or tabs at the end of a line.
	//
	// Only a backtick or a backslash is accepted; any other value returns an
	// error and leaves the directive untouched.
	func (d *Directive) setEscapeToken(s string) error {
		switch s {
		case "`", "\\":
			// accepted escape tokens
		default:
			return fmt.Errorf("invalid ESCAPE '%s'. Must be ` or \\", s)
		}

		continuation := regexp.MustCompile(`\` + s + `[ \t]*$`)
		d.escapeToken = rune(s[0])
		d.lineContinuationRegex = continuation
		return nil
	}

	// setPlatformToken sets the default platform for pulling images in a
	// Dockerfile. The value is compared case-insensitively (it is lowercased
	// first) against the accepted platforms: runtime.GOOS and, when
	// system.LCOWSupported reports true, "linux". Any other value is an error.
	func (d *Directive) setPlatformToken(s string) error {
		s = strings.ToLower(s)

		valid := []string{runtime.GOOS}
		if system.LCOWSupported() {
			valid = append(valid, "linux")
		}

		for _, candidate := range valid {
			if candidate != s {
				continue
			}
			d.platformToken = s
			return nil
		}
		return fmt.Errorf("invalid PLATFORM '%s'. Must be one of %v", s, valid)
	}

	// possibleParserDirective looks for one or more parser directives
	// '# escape=<char>' and '# platform=<string>'. Parser directives must precede
	// any builder instruction or other comments, and cannot be repeated.
	//
	// The line is matched case-insensitively (it is lowercased before matching).
	// The first line that is not a directive marks directive processing as
	// complete, after which every call returns nil without inspecting the line.
	// The platform directive is only recognised when system.LCOWSupported
	// reports true.
	func (d *Directive) possibleParserDirective(line string) error {
		if d.processingComplete {
			return nil
		}

		lowered := strings.ToLower(line)

		if match := tokenEscapeCommand.FindStringSubmatch(lowered); len(match) != 0 {
			for idx, name := range tokenEscapeCommand.SubexpNames() {
				if name != "escapechar" {
					continue
				}
				if d.escapeSeen {
					return errors.New("only one escape parser directive can be used")
				}
				d.escapeSeen = true
				return d.setEscapeToken(match[idx])
			}
		}

		// Only recognise a platform token if LCOW is supported
		if system.LCOWSupported() {
			if match := tokenPlatformCommand.FindStringSubmatch(lowered); len(match) != 0 {
				for idx, name := range tokenPlatformCommand.SubexpNames() {
					if name != "platform" {
						continue
					}
					if d.platformSeen {
						return errors.New("only one platform parser directive can be used")
					}
					d.platformSeen = true
					return d.setPlatformToken(match[idx])
				}
			}
		}

		// Not a directive: stop looking for directives from here on.
		d.processingComplete = true
		return nil
	}

	// NewDefaultDirective returns a new Directive whose escape token is
	// DefaultEscapeToken.
	func NewDefaultDirective() *Directive {
		d := &Directive{}
		// DefaultEscapeToken is one of the two tokens setEscapeToken accepts, so
		// the returned error is always nil here and is deliberately discarded.
		_ = d.setEscapeToken(string(DefaultEscapeToken))
		return d
	}

	// init populates the dispatch table that maps each Dockerfile instruction to
	// its line parser.
	func init() {
		// Dispatch Table. see line_parsers.go for the parse functions.
		// The command is parsed and mapped to the line parser. The line parser
		// receives the arguments but not the command, and returns an AST after
		// reformulating the arguments according to the rules in the parser
		// functions. Errors are propagated up by Parse() and the resulting AST can
		// be incorporated directly into the existing AST as a next.
		//
		// Line parsers take a *Directive and return a *Node: newNodeFromLine
		// passes its directive pointer through and stores the result in
		// Node.Next, which is a *Node.
		dispatch = map[string]func(string, *Directive) (*Node, map[string]bool, error){
			command.Add:         parseMaybeJSONToList,
			command.Arg:         parseNameOrNameVal,
			command.Cmd:         parseMaybeJSON,
			command.Copy:        parseMaybeJSONToList,
			command.Entrypoint:  parseMaybeJSON,
			command.Env:         parseEnv,
			command.Expose:      parseStringsWhitespaceDelimited,
			command.From:        parseStringsWhitespaceDelimited,
			command.Healthcheck: parseHealthConfig,
			command.Label:       parseLabel,
			command.Maintainer:  parseString,
			command.Onbuild:     parseSubCommand,
			command.Run:         parseMaybeJSON,
			command.Shell:       parseMaybeJSON,
			command.StopSignal:  parseString,
			command.User:        parseString,
			command.Volume:      parseMaybeJSONToList,
			command.Workdir:     parseString,
		}
	}

	// newNodeFromLine splits the line into command, flags and arguments, looks up
	// the line parser registered for the command in the dispatch table, and
	// builds a Node from the parser's result. The returned Node carries the
	// command as Value, the unmodified line as Original, the flags, the parsed
	// arguments as Next, and the attributes reported by the parser.
	//
	// An error from splitting the line or from the line parser is returned as
	// is, with a nil Node; the Node is therefore returned by pointer.
	func newNodeFromLine(line string, directive *Directive) (*Node, error) {
		cmd, flags, args, err := splitCommand(line)
		if err != nil {
			return nil, err
		}

		fn := dispatch[cmd]
		// Ignore invalid Dockerfile instructions
		if fn == nil {
			fn = parseIgnore
		}
		next, attrs, err := fn(args, directive)
		if err != nil {
			return nil, err
		}

		return &Node{
			Value:      cmd,
			Original:   line,
			Flags:      flags,
			Next:       next,
			Attributes: attrs,
		}, nil
	}

	// Result is the result of parsing a Dockerfile.
	type Result struct {
	// AST is the root node; Parse adds one child per parsed instruction.
	AST *Node
	// EscapeToken is the escape token in effect when parsing finished.
	EscapeToken rune
	// TODO @jhowardmsft - see https://github.com/moby/moby/issues/34617
	// This next field will be removed in a future update for LCOW support.
	// OS is the platform token set by the platform parser directive, if any.
	OS string
	// Warnings holds the non-fatal messages collected while parsing.
	Warnings []string
	}

	// PrintWarnings writes the collected warnings to out, one per line, followed
	// by a trailing newline. Nothing is written when there are no warnings.
	//
	// The warnings are written verbatim rather than used as a format string:
	// they embed text taken from the Dockerfile, which may contain '%'.
	func (r *Result) PrintWarnings(out io.Writer) {
		if len(r.Warnings) == 0 {
			return
		}
		fmt.Fprintln(out, strings.Join(r.Warnings, "\n"))
	}

	// Parse reads lines from a Reader and parses them into an AST in which every
	// instruction is a child of the root node. Lines joined by the continuation
	// character are merged into a single instruction before being parsed.
	//
	// The returned Result carries the AST, the escape token and platform token in
	// effect at the end of parsing, and any warnings about empty continuation
	// lines. A directive or instruction error aborts parsing and returns a nil
	// Result. A scanner error is returned (via handleScannerError) together with
	// the Result built so far.
	func Parse(rwc io.Reader) (*Result, error) {
		d := NewDefaultDirective()
		currentLine := 0
		// StartLine -1 lets AddChild set the root's start from its first child.
		root := &Node{StartLine: -1}
		scanner := bufio.NewScanner(rwc)
		warnings := []string{}

		var err error
		for scanner.Scan() {
			bytesRead := scanner.Bytes()
			if currentLine == 0 {
				// First line, strip the byte-order-marker if present
				bytesRead = bytes.TrimPrefix(bytesRead, utf8bom)
			}
			// The first line of an instruction has its leading whitespace
			// stripped; comments are removed and parser directives are processed.
			bytesRead, err = processLine(d, bytesRead, true)
			if err != nil {
				return nil, err
			}
			currentLine++

			startLine := currentLine
			line, isEndOfLine := trimContinuationCharacter(string(bytesRead), d)
			if isEndOfLine && line == "" {
				// Nothing left after processing (e.g. a blank line or a comment).
				continue
			}

			// Keep consuming lines for as long as the previous one ended with the
			// continuation character.
			var hasEmptyContinuationLine bool
			for !isEndOfLine && scanner.Scan() {
				// Note: bytesRead and err are new variables scoped to this loop
				// body. Leading whitespace is preserved on continuation lines.
				bytesRead, err := processLine(d, scanner.Bytes(), false)
				if err != nil {
					return nil, err
				}
				currentLine++

				// Checked against the raw line: isComment trims leading
				// whitespace first, so indented comments are skipped as well.
				if isComment(scanner.Bytes()) {
					// original line was a comment (processLine strips comments)
					continue
				}
				if isEmptyContinuationLine(bytesRead) {
					// Skipped, but remembered so a warning can be emitted below.
					hasEmptyContinuationLine = true
					continue
				}

				continuationLine := string(bytesRead)
				continuationLine, isEndOfLine = trimContinuationCharacter(continuationLine, d)
				line += continuationLine
			}

			if hasEmptyContinuationLine {
				// The offending instruction is shown indented by four spaces
				// on the line below the message.
				warning := "[WARNING]: Empty continuation line found in:\n    " + line
				warnings = append(warnings, warning)
			}

			child, err := newNodeFromLine(line, d)
			if err != nil {
				return nil, err
			}
			// The child spans from the instruction's first line to the last line
			// consumed for it.
			root.AddChild(child, startLine, currentLine)
		}

		if len(warnings) > 0 {
			warnings = append(warnings, "[WARNING]: Empty continuation lines will become errors in a future release.")
		}
		return &Result{
			AST:         root,
			Warnings:    warnings,
			EscapeToken: d.escapeToken,
			OS:          d.platformToken,
		}, handleScannerError(scanner.Err())
	}

	// trimComments returns src with any match of tokenComment (a '#' at the very
	// start of src and everything after it) replaced by nothing.
	func trimComments(src []byte) []byte {
		replacement := []byte{}
		return tokenComment.ReplaceAll(src, replacement)
	}

	// trimWhitespace returns src without its leading whitespace, as defined by
	// unicode.IsSpace. Trailing whitespace is left in place.
	func trimWhitespace(src []byte) []byte {
		trimmed := bytes.TrimLeftFunc(src, unicode.IsSpace)
		return trimmed
	}

	func isComment(line []byte) bool {
	return tokenComment.Match(trimWhitespace(line))
	}

	// isEmptyContinuationLine reports whether line is empty or consists of
	// whitespace only.
	func isEmptyContinuationLine(line []byte) bool {
		remaining := trimWhitespace(line)
		return len(remaining) == 0
	}

	// utf8bom is the UTF-8 byte-order-marker; Parse strips it from the first
	// line of the input when present.
	var utf8bom = []byte{0xEF, 0xBB, 0xBF}

	// trimContinuationCharacter checks whether line ends with the directive's
	// continuation character (optionally followed by spaces or tabs). If it
	// does, the continuation suffix is removed and false is returned, meaning
	// the logical line goes on. Otherwise line is returned unchanged together
	// with true.
	func trimContinuationCharacter(line string, d *Directive) (string, bool) {
		if !d.lineContinuationRegex.MatchString(line) {
			return line, true
		}
		return d.lineContinuationRegex.ReplaceAllString(line, ""), false
	}

	// processLine optionally strips leading whitespace from token, feeds the
	// result to the directive parser, and returns it with comments removed
	// together with any error reported by the directive parser.
	//
	// TODO: remove stripLeftWhitespace after deprecation period. It seems silly
	// to preserve whitespace on continuation lines. Why is that done?
	func processLine(d *Directive, token []byte, stripLeftWhitespace bool) ([]byte, error) {
		if stripLeftWhitespace {
			token = trimWhitespace(token)
		}
		withoutComments := trimComments(token)
		err := d.possibleParserDirective(string(token))
		return withoutComments, err
	}

	// handleScannerError translates bufio.ErrTooLong into an error that names
	// the maximum supported dockerfile line size. Any other value, including
	// nil, is returned unchanged.
	func handleScannerError(err error) error {
		if err == bufio.ErrTooLong {
			return errors.Errorf("dockerfile line greater than max allowed size of %d", bufio.MaxScanTokenSize-1)
		}
		return err
	}