[tap] Implement TAP parser

This allows infra tools to consume the output of //tools/testrunner.
The parser does not yet support YAML documents, directives after the
plan line, etc.  We can add those features as needed.
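
For example, an infra tool might consume testrunner output like this
(a minimal sketch; reading the stream from stdin is illustrative):

  package main

  import (
          "fmt"
          "io/ioutil"
          "log"
          "os"

          "fuchsia.googlesource.com/tools/tap"
  )

  func main() {
          input, err := ioutil.ReadAll(os.Stdin)
          if err != nil {
                  log.Fatal(err)
          }
          doc, err := tap.Parse(input)
          if err != nil {
                  log.Fatal(err)
          }
          for _, line := range doc.TestLines {
                  fmt.Println(line.Count, line.Ok, line.Description)
          }
  }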

Change-Id: Ia2fa9962dbd90bdd4d3d1e69eafc93255f735e09
diff --git a/tap/parser.go b/tap/parser.go
new file mode 100644
index 0000000..a4d6b13
--- /dev/null
+++ b/tap/parser.go
@@ -0,0 +1,224 @@
+// Copyright 2019 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package tap
+
+import (
+	"errors"
+	"fmt"
+	"log"
+	"strconv"
+	"strings"
+
+	"fuchsia.googlesource.com/tools/tap/tokenizer"
+)
+
+// Parse parses the given input into a Document.  The input is allowed to contain
+// garbage lines; the parser will skip them and parse as much of the input as possible.
+// The only exception is that the first line of input must be a TAP version header of the
+// form "TAP version XXX".
+func Parse(input []byte) (*Document, error) {
+	output := make(chan *Document)
+	go parse(tokenizer.NewTokenStream(input), output)
+	return <-output, nil
+}
+
+// State represents a parser state.  Each state takes the current stream of input tokens
+// and the current Document and attempts to parse the next line of input.  A state must
+// return the next state to use, even when an error is encountered.  If nil is returned,
+// parsing stops.
+type state func(*tokenizer.TokenStream, *Document) (state, error)
+
+// parse parses a Document from the given Token stream.  The result is emitted on the
+// output channel.
+func parse(tokens *tokenizer.TokenStream, output chan<- *Document) {
+	document := &Document{}
+
+	for state := parseVersion; state != nil; {
+		next, err := state(tokens, document)
+		if err != nil {
+			// Garbage lines are allowed; treat errors as non-fatal.
+			log.Println(err)
+		}
+		state = next
+	}
+
+	output <- document
+}
+
+// discardLine is a parser state that discards every token until a newline or EOF.
+func discardLine(tokens *tokenizer.TokenStream, _ *Document) (state, error) {
+	for {
+		token := tokens.Peek()
+		switch {
+		case token.Type == tokenizer.TypeEOF:
+			return nil, nil
+		case token.Type != tokenizer.TypeNewline:
+			tokens.Next()
+		default:
+			tokens.Next() // Skip the newline.
+			return parseNextLine, nil
+		}
+	}
+}
+
+func parseNextLine(tokens *tokenizer.TokenStream, doc *Document) (state, error) {
+	if tokens.Peek().Type == tokenizer.TypeEOF {
+		return nil, nil
+	}
+
+	if tokens.Peek().Type == tokenizer.TypeNumber {
+		return parsePlan, nil
+	}
+
+	if tokens.Peek().Value == "ok" || tokens.Peek().Value == "not" {
+		return parseTestLine, nil
+	}
+
+	return parseNextLine, unexpectedTokenError("one of 'ok', 'not' or a number", tokens.Next())
+}
+
+func parseVersion(tokens *tokenizer.TokenStream, doc *Document) (state, error) {
+	token := tokens.Next()
+	if token.Value != "TAP" {
+		return nil, unexpectedTokenError("'TAP'", token)
+	}
+
+	token = tokens.Next()
+	if token.Value != "version" {
+		return nil, unexpectedTokenError("'version'", token)
+	}
+
+	token = tokens.Next()
+	if token.Type != tokenizer.TypeNumber {
+		return nil, unexpectedTokenError("a version number", token)
+	}
+
+	version, err := strconv.ParseInt(token.Value, 10, 64)
+	if err != nil {
+		return nil, parserError(err.Error())
+	}
+
+	doc.Version = Version(version)
+	return parseNextLine, eat(tokens, tokenizer.TypeNewline)
+}
+
+func parsePlan(tokens *tokenizer.TokenStream, doc *Document) (state, error) {
+	if doc.Plan.Start != 0 || doc.Plan.End != 0 {
+		return discardLine, errors.New("plan has already been parsed")
+	}
+
+	token := tokens.Peek()
+	if token.Type != tokenizer.TypeNumber {
+		return discardLine, unexpectedTokenError("a number", token)
+	}
+
+	start, err := strconv.ParseInt(tokens.Next().Value, 10, 64)
+	if err != nil {
+		return discardLine, parserError(err.Error())
+	}
+
+	if err := eat(tokens, tokenizer.TypeDot); err != nil {
+		return discardLine, err
+	}
+
+	if err := eat(tokens, tokenizer.TypeDot); err != nil {
+		return discardLine, err
+	}
+
+	token = tokens.Peek()
+	if token.Type != tokenizer.TypeNumber {
+		return discardLine, unexpectedTokenError("a number > 1", token)
+	}
+
+	end, err := strconv.ParseInt(tokens.Next().Value, 10, 64)
+	if err != nil {
+		return discardLine, parserError(err.Error())
+	}
+
+	doc.Plan = Plan{Start: int(start), End: int(end)}
+	return parseNextLine, eat(tokens, tokenizer.TypeNewline)
+}
+
+func parseTestLine(tokens *tokenizer.TokenStream, doc *Document) (state, error) {
+	var testLine TestLine
+
+	// Parse test status.
+	token := tokens.Next()
+	switch token.Value {
+	case "not":
+		testLine.Ok = false
+		token = tokens.Next()
+		if token.Value != "ok" {
+			return discardLine, unexpectedTokenError("'ok'", token)
+		}
+	case "ok":
+		testLine.Ok = true
+	default:
+		return discardLine, unexpectedTokenError("'ok' or 'not ok'", token)
+	}
+
+	// Parse optional test number.
+	testLine.Count = len(doc.TestLines) + 1
+	if tokens.Peek().Type == tokenizer.TypeNumber {
+		count, err := strconv.ParseInt(tokens.Next().Value, 10, 64)
+		if err != nil {
+			return discardLine, parserError(err.Error())
+		}
+		testLine.Count = int(count)
+	}
+
+	// Parse optional description.
+	var description []string
+	for tokens.Peek().Type == tokenizer.TypeText {
+		description = append(description, tokens.Next().Value)
+	}
+	testLine.Description = strings.Join(description, " ")
+
+	// Move to next line if there's no directive.
+	if err := eat(tokens, tokenizer.TypePound); err != nil {
+		doc.TestLines = append(doc.TestLines, testLine)
+		return discardLine, err
+	}
+
+	// Parse optional directive.
+	token = tokens.Next()
+	switch token.Value {
+	case "TODO":
+		testLine.Directive = Todo
+	case "SKIP":
+		testLine.Directive = Skip
+	default:
+		return discardLine, unexpectedTokenError("a directive", token)
+	}
+
+	// Parse explanation.
+	var explanation []string
+	for tokens.Peek().Type == tokenizer.TypeText {
+		explanation = append(explanation, tokens.Next().Value)
+	}
+	testLine.Explanation = strings.Join(explanation, " ")
+
+	doc.TestLines = append(doc.TestLines, testLine)
+	return parseNextLine, eat(tokens, tokenizer.TypeNewline)
+}
+
+// eat consumes the next token from the stream iff its type matches typ.  If the types
+// are different, an error is returned.
+func eat(tokens *tokenizer.TokenStream, typ tokenizer.TokenType) error {
+	token := tokens.Peek()
+	if token.Type != typ {
+		return unexpectedTokenError(string(typ), token)
+	}
+	tokens.Next()
+	return nil
+}
+
+func unexpectedTokenError(wanted string, token tokenizer.Token) error {
+	return parserError("got %q but wanted %s", token, wanted)
+}
+
+func parserError(format string, args ...interface{}) error {
+	return fmt.Errorf("parse error: "+format, args...)
+}
diff --git a/tap/parser_test.go b/tap/parser_test.go
new file mode 100644
index 0000000..9cfd75e
--- /dev/null
+++ b/tap/parser_test.go
@@ -0,0 +1,137 @@
+// Copyright 2019 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package tap
+
+import (
+	"reflect"
+	"strings"
+	"testing"
+)
+
+func TestParse(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected *Document
+	}{
+		{
+			name: "version",
+			input: strings.TrimSpace(`
+		TAP version 13
+		`),
+			expected: &Document{
+				Version: 13,
+			},
+		},
+		{
+			name: "version and plan",
+			input: strings.TrimSpace(`
+TAP version 13
+1..2
+`),
+			expected: &Document{
+				Version: 13,
+				Plan:    Plan{Start: 1, End: 2},
+			},
+		},
+		{
+			name: "version and plan and test lines.",
+			input: strings.TrimSpace(`
+TAP version 13
+1..4
+ok 1 - This test passed
+ok 2 # TODO this test is disabled
+not ok 3 - This test failed
+ok 4 - This test passed also # TODO congratulate the author
+`),
+			expected: &Document{
+				Version: 13,
+				Plan:    Plan{Start: 1, End: 4},
+				TestLines: []TestLine{
+					{Ok: true, Count: 1, Description: "- This test passed"},
+					{Ok: true, Count: 2, Directive: Todo, Explanation: "this test is disabled"},
+					{Ok: false, Count: 3, Description: "- This test failed"},
+					{Ok: true, Count: 4, Description: "- This test passed also", Directive: Todo, Explanation: "congratulate the author"},
+				},
+			},
+		},
+		{
+			name: "version and test lines with plan at the end",
+			input: strings.TrimSpace(`
+TAP version 13
+ok 1 - This test passed
+ok 2 # TODO this test is disabled
+not ok 3 - This test failed
+1..3
+		`),
+			expected: &Document{
+				Version: 13,
+				Plan: Plan{
+					Start: 1,
+					End:   3,
+				},
+				TestLines: []TestLine{
+					{Ok: true, Count: 1, Description: "- This test passed"},
+					{Ok: true, Count: 2, Directive: Todo, Explanation: "this test is disabled"},
+					{Ok: false, Count: 3, Description: "- This test failed"},
+				},
+			},
+		},
+		{
+			name: "with garbage output interleaved",
+			input: strings.TrimSpace(`
+TAP version 13
+ERROR: segfault at 0x33123. print stackdump;
+0x00001fff: 0x88881
+0x00001ffe: 0x88881
+0x00001ffd: 0x88881
+1..3
+0x00001ffc: 0x88881
+ok 1 - This test passed
+ok 2 # TODO this test is disabled
+exiting
+not ok 3 - This test failed
+		`),
+			expected: &Document{
+				Version: 13,
+				Plan: Plan{
+					Start: 1,
+					End:   3,
+				},
+				TestLines: []TestLine{
+					{Ok: true, Count: 1, Description: "- This test passed"},
+					{Ok: true, Count: 2, Directive: Todo, Explanation: "this test is disabled"},
+					{Ok: false, Count: 3, Description: "- This test failed"},
+				},
+			},
+		},
+		{
+			name: "line that looks like test plan, but is not",
+			input: strings.TrimSpace(`
+TAP version 13
+1..
+not ok 3 - This test failed
+				`),
+			expected: &Document{
+				Version: 13,
+				TestLines: []TestLine{
+					{Ok: false, Count: 3, Description: "- This test failed"},
+				},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			doc, err := Parse([]byte(tt.input))
+			if err != nil {
+				t.Fatal(err)
+			}
+			if !reflect.DeepEqual(doc, tt.expected) {
+				t.Errorf("got\n%+v\nbut wanted\n%+v", doc, tt.expected)
+			}
+		})
+	}
+}
diff --git a/tap/tap.go b/tap/tap.go
new file mode 100644
index 0000000..0b89a7c
--- /dev/null
+++ b/tap/tap.go
@@ -0,0 +1,133 @@
+// Copyright 2019 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package tap
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"strings"
+)
+
+// Type describes the type of TAP Object returned by the Parser.
+type Type int
+
+const (
+	// VersionType is the type returned by a Version Object.
+	VersionType Type = iota
+
+	// PlanType is the type returned by a Plan Object.
+	PlanType
+
+	// TestLineType is the type returned by a TestLine.
+	TestLineType
+)
+
+// Object is a TAP element such as a test line, plan, version header, or YAML block.
+type Object interface {
+	Type() Type
+}
+
+// Document represents a complete TAP document.
+type Document struct {
+	Version   Version
+	Plan      Plan
+	TestLines []TestLine
+}
+
+// WriteTo writes this document to the given Writer as a formatted TAP output stream.
+func (d *Document) WriteTo(w io.Writer) (int64, error) {
+	n, err := w.Write([]byte(d.format()))
+	return int64(n), err
+}
+
+// format renders this document as though it were a TAP output stream.
+func (d *Document) format() string {
+	output := new(bytes.Buffer)
+	output.WriteString(fmt.Sprintf("TAP version %d\n", d.Version))
+	output.WriteString(fmt.Sprintf("%d..%d\n", d.Plan.Start, d.Plan.End))
+
+	for _, line := range d.TestLines {
+		var parts []string
+		ok := "ok"
+		if !line.Ok {
+			ok = "not ok"
+		}
+		parts = append(parts, ok)
+
+		if line.Count != 0 {
+			parts = append(parts, fmt.Sprintf("%d", line.Count))
+		}
+
+		if line.Description != "" {
+			parts = append(parts, line.Description)
+		}
+
+		switch line.Directive {
+		case Todo:
+			parts = append(parts, "# TODO", line.Explanation)
+		case Skip:
+			parts = append(parts, "# SKIP", line.Explanation)
+		}
+
+		output.WriteString(strings.Join(parts, " ") + "\n")
+	}
+
+	return output.String()
+}
+
+// Version represents a TAP version line.
+type Version int
+
+// Type implements Object.
+func (v Version) Type() Type {
+	return VersionType
+}
+
+func (v Version) String() string {
+	return fmt.Sprintf("TAP version %d", v)
+}
+
+// Plan represents a TAP plan line.
+type Plan struct {
+	Start int
+	End   int
+}
+
+// Type implements Object.
+func (p Plan) Type() Type {
+	return PlanType
+}
+
+func (p Plan) String() string {
+	return fmt.Sprintf("%d..%d", p.Start, p.End)
+}
+
+// Directive represents a TAP directive (TODO|SKIP|<none>)
+type Directive int
+
+// Valid TAP directives.
+const (
+	None Directive = iota
+	Todo
+	Skip
+)
+
+// TestLine represents a TAP test line beginning with "ok" or "not ok".
+//
+// TODO: Support inline YAML documents.
+type TestLine struct {
+	Ok          bool
+	Count       int
+	Description string
+	Directive   Directive
+	Explanation string
+	Diagnostic  string
+}
+
+// Type implements Object.
+func (t TestLine) Type() Type {
+	return TestLineType
+}
diff --git a/tap/tokenizer.go b/tap/tokenizer.go
deleted file mode 100644
index b226550..0000000
--- a/tap/tokenizer.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2019 The Fuchsia Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-package tap
-
-// Directive is a TAP directive (TODO|SKIP|<none>)
-type Directive int
-
-// Valid Tap directives.
-const (
-	None Directive = iota
-	Todo
-	Skip
-)
diff --git a/tap/tokenizer/lexer.go b/tap/tokenizer/lexer.go
new file mode 100644
index 0000000..3761352
--- /dev/null
+++ b/tap/tokenizer/lexer.go
@@ -0,0 +1,210 @@
+// Copyright 2019 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package tokenizer
+
+import (
+	"log"
+	"strconv"
+	"unicode/utf8"
+)
+
+// TokenType describes the category a Token belongs to.
+type TokenType string
+
+// TokenType values recognized by the lexer.
+const (
+	TypePound   TokenType = "POUND"   // '#'
+	TypeNumber  TokenType = "NUMBER"  // A number
+	TypeText    TokenType = "TEXT"    // Catch-all type
+	TypeDot     TokenType = "DOT"     // '.'
+	TypeNewline TokenType = "NEWLINE" // '\n'
+	TypeEOF     TokenType = "EOF"     // Pseudo-token that signals the end of input.
+)
+
+// Token represents some atomic TAP output string.
+type Token struct {
+	Type  TokenType
+	Value string
+}
+
+// Tokenize generates a channel of Tokens read from the given input.
+func Tokenize(input []byte) <-chan Token {
+	l := &lexer{
+		input:  input,
+		Tokens: make(chan Token, 1),
+	}
+	go l.run()
+	return l.Tokens
+}
+
+// EOFToken returns the Token that signals the end of input.
+func EOFToken() Token {
+	return Token{
+		Type:  TypeEOF,
+		Value: "",
+	}
+}
+
+// The rune emitted when the end of input has been reached.
+const eof = rune(-1)
+
+// State represents a lexical analysis state.  Each state accepts a lexer as input and
+// returns the next lexer state.  If the output state is nil, lexing stops.
+type state func(*lexer) state
+
+// Lexer manages the position of a lexical analysis on some TAP output string.
+type lexer struct {
+	input  []byte
+	start  int
+	pos    int
+	width  int
+	Tokens chan Token
+}
+
+func (l *lexer) run() {
+	for state := lexAny; state != nil; {
+		state = state(l)
+	}
+	close(l.Tokens)
+}
+
+func (l *lexer) emit(t TokenType) {
+	l.Tokens <- Token{Type: t, Value: string(l.input[l.start:l.pos])}
+	l.start = l.pos
+}
+
+func (l *lexer) next() rune {
+	if l.pos >= len(l.input) {
+		l.width = 0
+		return eof
+	}
+
+	// Read the next rune, skipping over all invalid utf8 sequences.
+	var rn rune
+	rn, l.width = utf8.DecodeRune(l.input[l.pos:])
+	for rn == utf8.RuneError && l.pos < len(l.input) {
+		log.Printf("invalid UTF-8 found at pos %d:\n\n%s", l.pos, string(l.input))
+		l.pos++
+		rn, l.width = utf8.DecodeRune(l.input[l.pos:])
+	}
+	l.pos += l.width
+	return rn
+}
+
+// lexeme returns the lexeme at the current position, or eof if input is exhausted.
+func (l *lexer) lexeme() lexeme {
+	if l.pos >= len(l.input) {
+		return lexeme(eof)
+	}
+	return lexeme(l.input[l.pos])
+}
+
+// lexAny is the lexer start state.  Its job is to put the lexer into the proper state
+// according to the next input rune.  Other states should return to this state after
+// emitting their lexemes.  They should also not consume runes using l.next() immediately
+// before entering this state.
+func lexAny(l *lexer) state {
+	lxm := l.lexeme()
+	if lxm.isEOF() {
+		l.emit(TypeEOF)
+		return nil
+	}
+
+	l.start = l.pos
+
+	switch {
+	case lxm.isNewline():
+		l.next()
+		l.emit(TypeNewline)
+		return lexAny
+	case lxm.isDot():
+		l.next()
+		l.emit(TypeDot)
+		return lexAny
+	case lxm.isPound():
+		l.next()
+		l.emit(TypePound)
+		return lexAny
+	case lxm.isSpace():
+		// Skip all spaces.
+		for l.lexeme().isSpace() {
+			l.next()
+		}
+		l.start = l.pos
+		return lexAny
+	case lxm.isDigit():
+		return lexNumber
+	}
+
+	return lexText
+}
+
+func lexNumber(l *lexer) state {
+	for {
+		lxm := l.lexeme()
+		if lxm.isEOF() || !lxm.isDigit() {
+			l.emit(TypeNumber)
+			return lexAny
+		}
+
+		if l.next() == eof {
+			break
+		}
+	}
+
+	// Reached EOF
+	if l.pos > l.start {
+		l.emit(TypeNumber)
+	}
+
+	l.emit(TypeEOF)
+	return nil
+}
+
+func lexText(l *lexer) state {
+	for l.next() != eof {
+		lxm := l.lexeme()
+		if lxm.isNonText() {
+			l.emit(TypeText)
+			return lexAny
+		}
+	}
+
+	// Reached EOF
+	if l.pos > l.start {
+		l.emit(TypeText)
+	}
+
+	l.emit(TypeEOF)
+	return nil
+}
+
+type lexeme rune
+
+func (l lexeme) isSpace() bool {
+	return l == ' ' || l == '\r'
+}
+
+func (l lexeme) isNewline() bool {
+	return l == '\n'
+}
+
+func (l lexeme) isDigit() bool {
+	_, err := strconv.Atoi(string(l))
+	return err == nil
+}
+
+func (l lexeme) isDot() bool {
+	return l == '.'
+}
+
+func (l lexeme) isPound() bool {
+	return l == '#'
+}
+
+func (l lexeme) isEOF() bool {
+	return rune(l) == eof
+}
+
+func (l lexeme) isNonText() bool {
+	return l.isEOF() || l.isSpace() || l.isNewline() || l.isDigit() || l.isDot() || l.isPound()
+}
diff --git a/tap/tokenizer/stream.go b/tap/tokenizer/stream.go
new file mode 100644
index 0000000..fd14d91
--- /dev/null
+++ b/tap/tokenizer/stream.go
@@ -0,0 +1,56 @@
+// Copyright 2019 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package tokenizer
+
+// NewTokenStream creates a TokenStream of Token values read from input.
+func NewTokenStream(input []byte) *TokenStream {
+	return &TokenStream{
+		stream: Tokenize(input),
+	}
+}
+
+// TokenStream is a read-only queue of Token values.  The next Token in the stream can be
+// consumed by calling Next().  The next token can be observed without being consumed by
+// calling Peek().
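+//
+// For example (an illustrative snippet):
+//
+//	s := NewTokenStream([]byte("ok 1\n"))
+//	first := s.Peek() // Observe the next token without consuming it.
+//	next := s.Next()  // Consume it; first and next are the same token.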
+type TokenStream struct {
+	eof       bool
+	lookahead *Token
+	stream    <-chan Token
+}
+
+// Next consumes the next token in the stream.
+func (s *TokenStream) Next() Token {
+	if s.eof {
+		return EOFToken()
+	}
+
+	next := new(Token)
+	if s.lookahead == nil {
+		*next = <-s.stream
+	} else {
+		next = s.lookahead
+		s.lookahead = nil
+	}
+
+	if next.Type == TypeEOF {
+		s.eof = true
+	}
+
+	return *next
+}
+
+// Peek returns a read-only copy of the next token in the stream, without consuming it.
+func (s *TokenStream) Peek() Token {
+	if s.eof {
+		return EOFToken()
+	}
+
+	if s.lookahead == nil {
+		s.lookahead = new(Token)
+		*s.lookahead = <-s.stream
+	}
+
+	return *s.lookahead
+}