[tap] Preserve whitespace in strings

This change makes the parser preserve internal whitespace in test line
descriptions and directive explanations in the output TAP documents.

Note that the whitespace surrounding a description or directive is
assumed to be unimportant and is not preserved. For example, internal
spacing is kept, so "some  text" is parsed as "some  text", but we
assume it's OK to trim " some text " to "some text" when parsing.
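
For illustration, here is a rough sketch of the intended behavior (the
exact Parse signature is an assumption; the Document and TestLine
fields are the ones exercised in parser_test.go):

  // Assumed signature: Parse([]byte) (*Document, error).
  input := []byte("TAP version 13\n1..1\nok 1 - This test     passed\n")
  doc, _ := tap.Parse(input)
  // The internal run of spaces in the description survives:
  //   doc.TestLines[0].Description == "- This test     passed"
  // Surrounding whitespace is still trimmed, so a description like
  // " some text " comes back as "some text".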

Change-Id: Ia16a2ae8b1c97e99533c5768a0a08e78d21aa3d7
diff --git a/tap/parser.go b/tap/parser.go
index a4d6b13..d96ea9e 100644
--- a/tap/parser.go
+++ b/tap/parser.go
@@ -14,7 +14,7 @@
 	"fuchsia.googlesource.com/tools/tap/tokenizer"
 )
 
-// Parse parses the given input string into a Document.  The input is allowed to contain
+// Parse parses the given input string into a Document. The input is allowed to contain
 // garbage lines; the parser will skip them and parse as much of the input as possible.
 // The only exception is that the first line of input must be a TAP version header of the
 // form "TAP version XXX".
@@ -24,13 +24,13 @@
 	return <-output, nil
 }
 
-// State represents a parser state.  Each state takes the current stream of input tokens
-// and the current Document and attempts to parse the next line of input.  A state must
-// return the next state to use, even when an error is encountered.  If nil is returned,
+// State represents a parser state. Each state takes the current stream of input tokens
+// and the current Document and attempts to parse the next line of input. A state must
+// return the next state to use, even when an error is encountered. If nil is returned,
 // parsing stops.
 type state func(*tokenizer.TokenStream, *Document) (state, error)
 
-// Parse parses a Document from the given Token stream.  The result is emitted on the
+// Parse parses a Document from the given Token stream. The result is emitted on the
 // output channel.
 func parse(tokens *tokenizer.TokenStream, output chan<- *Document) {
 	document := &Document{}
@@ -169,12 +169,12 @@
 		testLine.Count = int(count)
 	}
 
-	// Parse optional description
-	var description []string
-	for tokens.Peek().Type == tokenizer.TypeText {
-		description = append(description, tokens.Next().Value)
-	}
-	testLine.Description = strings.Join(description, " ")
+	// Parse optional description. Stop at a TypePound token, which marks the start of a
+	// directive.
+	testLine.Description = concat(tokens.Raw(), func(tok tokenizer.Token) bool {
+		t := tok.Type
+		return t != tokenizer.TypePound && t != tokenizer.TypeNewline && t != tokenizer.TypeEOF
+	})
 
 	// Move to next line if there's no directive.
 	if err := eat(tokens, tokenizer.TypePound); err != nil {
@@ -194,17 +194,16 @@
 	}
 
 	// Parse explanation.
-	var explanation []string
-	for tokens.Peek().Type == tokenizer.TypeText {
-		explanation = append(explanation, tokens.Next().Value)
-	}
-	testLine.Explanation = strings.Join(explanation, " ")
+	testLine.Explanation = concat(tokens.Raw(), func(tok tokenizer.Token) bool {
+		t := tok.Type
+		return t != tokenizer.TypeNewline && t != tokenizer.TypeEOF
+	})
 
 	doc.TestLines = append(doc.TestLines, testLine)
 	return parseNextLine, eat(tokens, tokenizer.TypeNewline)
 }
 
-// Eat consumes the next token from the stream iff it's type matches typ.  If the types
+// Eat consumes the next token from the stream iff its type matches typ. If the types
 // are different, an error is returned.
 func eat(tokens *tokenizer.TokenStream, typ tokenizer.TokenType) error {
 	token := tokens.Peek()
@@ -215,6 +214,17 @@
 	return nil
 }
 
+// Concat concatenates the values of the next tokens in the stream as long as cond keeps
+// returning true. Returns the concatenated output with leading and trailing spaces
+// trimmed.
+func concat(tokens *tokenizer.RawTokenStream, cond func(tok tokenizer.Token) bool) string {
+	var values string
+	for cond(tokens.Peek()) {
+		values += tokens.Next().Value
+	}
+	return strings.TrimSpace(values)
+}
+
 func unexpectedTokenError(wanted string, token tokenizer.Token) error {
 	return parserError("got %q but wanted %s", token, wanted)
 }
diff --git a/tap/parser_test.go b/tap/parser_test.go
index 9cfd75e..63b4700 100644
--- a/tap/parser_test.go
+++ b/tap/parser_test.go
@@ -121,6 +121,36 @@
 				},
 			},
 		},
+		{
+			name: "should preserve spaces in description",
+			input: strings.TrimSpace(`
+TAP version 13
+1..1
+ok 1 - This test     passed
+`),
+			expected: &Document{
+				Version: 13,
+				Plan:    Plan{Start: 1, End: 1},
+				TestLines: []TestLine{
+					{Ok: true, Count: 1, Description: "- This test     passed"},
+				},
+			},
+		},
+		{
+			name: "should preserve spaces in directive explanation",
+			input: strings.TrimSpace(`
+TAP version 13
+1..1
+ok 1 # SKIP this  is   disabled
+`),
+			expected: &Document{
+				Version: 13,
+				Plan:    Plan{Start: 1, End: 1},
+				TestLines: []TestLine{
+					{Ok: true, Count: 1, Directive: Skip, Explanation: "this  is   disabled"},
+				},
+			},
+		},
 	}
 
 	for _, tt := range tests {
diff --git a/tap/tokenizer/lexer.go b/tap/tokenizer/lexer.go
index 3761352..9d97fbf 100644
--- a/tap/tokenizer/lexer.go
+++ b/tap/tokenizer/lexer.go
@@ -7,6 +7,7 @@
 import (
 	"log"
 	"strconv"
+	"unicode"
 	"unicode/utf8"
 )
 
@@ -21,6 +22,7 @@
 	TypeDot     TokenType = "DOT"     // '.'
 	TypeNewline TokenType = "NEWLINE" // '\n'
 	TypeEOF     TokenType = "EOF"     // Pseudo token to signal the end of input.
+	TypeSpace   TokenType = "SPACE"   // A whitespace character
 )
 
 // Token represents some atomic TAP output string.
@@ -50,8 +52,8 @@
 // The rune emitted when the end of input has been reached.
 const eof = rune(-1)
 
-// State represents a lexical analysis state.  Each state accepts a lexer as input and
-// returns the next lexer state.  If the output state is nil, lexing stops.
+// State represents a lexical analysis state. Each state accepts a lexer as input and
+// returns the next lexer state. If the output state is nil, lexing stops.
 type state func(*lexer) state
 
 // Lexer manages the position of a lexical analysis on some TAP output string.
@@ -98,9 +100,9 @@
 	return lexeme(l.input[l.pos : l.pos+1][0])
 }
 
-// LexAny is the lexer start state.  It's job is to put the lexer into the proper state
-// according to the next input rune.  Other states should return to this state after
-// emitting their lexemes.  They should also not consume runes using l.next() immediately
+// LexAny is the lexer start state. Its job is to put the lexer into the proper state
+// according to the next input rune. Other states should return to this state after
+// emitting their lexemes. They should also not consume runes using l.next() immediately
 // before entering this state.
 func lexAny(l *lexer) state {
 	lxm := l.lexeme()
@@ -125,12 +127,7 @@
 		l.emit(TypePound)
 		return lexAny
 	case lxm.isSpace():
-		// Skip all spaces.
-		for l.lexeme().isSpace() {
-			l.next()
-		}
-		l.start = l.pos
-		return lexAny
+		return lexSpace
 	case lxm.isDigit():
 		return lexNumber
 	}
@@ -138,42 +135,35 @@
 	return lexText
 }
 
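+// LexSpace consumes a run of non-newline whitespace into a single TypeSpace token.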
+func lexSpace(l *lexer) state {
+	return lexUntil(l, TypeSpace, func(lxm lexeme) bool { return !lxm.isSpace() })
+}
+
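+// LexNumber consumes a run of digits into a single TypeNumber token.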
 func lexNumber(l *lexer) state {
+	return lexUntil(l, TypeNumber, func(lxm lexeme) bool { return !lxm.isDigit() })
+}
+
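+// LexText consumes a run of text runes into a single TypeText token.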
+func lexText(l *lexer) state {
+	return lexUntil(l, TypeText, func(lxm lexeme) bool { return lxm.isNonText() })
+}
+
+// LexUntil consumes runes into a single token of the given type until `stop` returns
+// true. Returns lexAny when complete, or nil if the end of input was reached.
+func lexUntil(l *lexer, typ TokenType, stop func(lexeme) bool) state {
 	for {
 		lxm := l.lexeme()
-		if lxm.isEOF() || !lxm.isDigit() {
-			l.emit(TypeNumber)
+		if lxm.isEOF() || stop(lxm) {
+			l.emit(typ)
 			return lexAny
 		}
-
 		if l.next() == eof {
 			break
 		}
 	}
-
 	// Reached EOF
 	if l.pos > l.start {
-		l.emit(TypeNumber)
+		l.emit(typ)
 	}
-
-	l.emit(TypeEOF)
-	return nil
-}
-
-func lexText(l *lexer) state {
-	for l.next() != eof {
-		lxm := l.lexeme()
-		if lxm.isNonText() {
-			l.emit(TypeText)
-			return lexAny
-		}
-	}
-
-	// Reached EOF
-	if l.pos > l.start {
-		l.emit(TypeText)
-	}
-
 	l.emit(TypeEOF)
 	return nil
 }
@@ -181,7 +171,7 @@
 type lexeme rune
 
 func (l lexeme) isSpace() bool {
-	return l == ' ' || l == '\r'
+	return l != '\n' && unicode.IsSpace(rune(l))
 }
 
 func (l lexeme) isNewline() bool {
diff --git a/tap/tokenizer/stream.go b/tap/tokenizer/stream.go
index fd14d91..cc9e947 100644
--- a/tap/tokenizer/stream.go
+++ b/tap/tokenizer/stream.go
@@ -7,21 +7,60 @@
 // NewStream creates a stream of Token values read from input.
 func NewTokenStream(input []byte) *TokenStream {
 	return &TokenStream{
-		stream: Tokenize(input),
+		raw: &RawTokenStream{
+			stream: Tokenize(input),
+		},
 	}
 }
 
-// TokenStream is a read-only queue of Token values.  The next Token in the stream can be
-// consumed by calling Next().  The next token can be observed without being consumed by
-// calling Peek().
+// TokenStream is a read-only queue of Token values. The next Token in the stream can be
+// consumed by calling Next(). The next token can be observed without being consumed by
+// calling Peek(). TokenStream discards whitespace (TypeSpace) tokens as though they are
+// not part of the stream; both Peek and Next skip over them.
 type TokenStream struct {
+	raw *RawTokenStream
+}
+
+// Next consumes the next token in the stream. Space characters are skipped.
+func (s *TokenStream) Next() Token {
+	for {
+		next := s.raw.Next()
+		if next.Type != TypeSpace {
+			return next
+		}
+	}
+}
+
+// Peek returns a read-only copy of the next token in the stream, without consuming it.
+// Space characters are skipped.
+func (s *TokenStream) Peek() Token {
+	for {
+		next := s.raw.Peek()
+		if next.Type == TypeSpace {
+			s.raw.Next()
+			continue
+		}
+		return next
+	}
+}
+
+// Raw returns a RawTokenStream using the same underlying stream of Tokens as this
+// TokenStream.
+func (s *TokenStream) Raw() *RawTokenStream {
+	return s.raw
+}
+
+// RawTokenStream is a read-only queue of Token values. The next Token in the stream can
+// be consumed by calling Next(). The next token can be observed without being consumed
+// by calling Peek().
+type RawTokenStream struct {
+	stream    <-chan Token
 	eof       bool
 	lookahead *Token
-	stream    <-chan Token
 }
 
 // Next consumes the next token in the stream.
-func (s *TokenStream) Next() Token {
+func (s *RawTokenStream) Next() Token {
 	if s.eof {
 		return EOFToken()
 	}
@@ -42,7 +81,7 @@
 }
 
 // Peek returns a read-only copy of the next token in the stream, without consuming it.
-func (s *TokenStream) Peek() Token {
+func (s *RawTokenStream) Peek() Token {
 	if s.eof {
 		return EOFToken()
 	}