[tap] Preserve whitespace in strings
This change causes the parser to output TAP documents that preserve
the whitespace within test line descriptions and directive
explanations. Note that the whitespace surrounding a description or
directive is assumed to be unimportant and is not preserved. For
example: the text "some text" is parsed as "some text", but we assume
that it is OK to trim " some text " to "some text" when parsing.
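Concretely, with a test line such as the following (an illustrative
input, not taken from the new tests):

  ok 1 -  some   text

the parsed description is now "-  some   text"; the old code joined
the text tokens with single spaces, which collapsed interior runs of
whitespace.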
Change-Id: Ia16a2ae8b1c97e99533c5768a0a08e78d21aa3d7
diff --git a/tap/parser.go b/tap/parser.go
index a4d6b13..d96ea9e 100644
--- a/tap/parser.go
+++ b/tap/parser.go
@@ -14,7 +14,7 @@
"fuchsia.googlesource.com/tools/tap/tokenizer"
)
-// Parse parses the given input string into a Document. The input is allowed to contain
+// Parse parses the given input string into a Document. The input is allowed to contain
// garbage lines; the parser will skip them and parse as much of the input as possible.
// The only exception is that the first line of input must be a TAP version header of the
// form "TAP version XXX".
@@ -24,13 +24,13 @@
return <-output, nil
}
-// State represents a parser state. Each state takes the current stream of input tokens
-// and the current Document and attempts to parse the next line of input. A state must
-// return the next state to use, even when an error is encountered. If nil is returned,
+// State represents a parser state. Each state takes the current stream of input tokens
+// and the current Document and attempts to parse the next line of input. A state must
+// return the next state to use, even when an error is encountered. If nil is returned,
// parsing stops.
type state func(*tokenizer.TokenStream, *Document) (state, error)
-// Parse parses a Document from the given Token stream. The result is emitted on the
+// Parse parses a Document from the given Token stream. The result is emitted on the
// output channel.
func parse(tokens *tokenizer.TokenStream, output chan<- *Document) {
document := &Document{}
@@ -169,12 +169,12 @@
testLine.Count = int(count)
}
- // Parse optional description
- var description []string
- for tokens.Peek().Type == tokenizer.TypeText {
- description = append(description, tokens.Next().Value)
- }
- testLine.Description = strings.Join(description, " ")
+ // Parse optional description. Stop at a TypePound token, which marks the start of a
+ // directive.
+ testLine.Description = concat(tokens.Raw(), func(tok tokenizer.Token) bool {
+ t := tok.Type
+ return t != tokenizer.TypePound && t != tokenizer.TypeNewline && t != tokenizer.TypeEOF
+ })
// Move to next line if there's no directive.
if err := eat(tokens, tokenizer.TypePound); err != nil {
@@ -194,17 +194,16 @@
}
// Parse explanation.
- var explanation []string
- for tokens.Peek().Type == tokenizer.TypeText {
- explanation = append(explanation, tokens.Next().Value)
- }
- testLine.Explanation = strings.Join(explanation, " ")
+ testLine.Explanation = concat(tokens.Raw(), func(tok tokenizer.Token) bool {
+ t := tok.Type
+ return t != tokenizer.TypeNewline && t != tokenizer.TypeEOF
+ })
doc.TestLines = append(doc.TestLines, testLine)
return parseNextLine, eat(tokens, tokenizer.TypeNewline)
}
-// Eat consumes the next token from the stream iff it's type matches typ. If the types
+// Eat consumes the next token from the stream iff its type matches typ. If the types
// are different, an error is returned.
func eat(tokens *tokenizer.TokenStream, typ tokenizer.TokenType) error {
token := tokens.Peek()
@@ -215,6 +214,17 @@
return nil
}
+// Concat concatenates the values of the next tokens in the stream as long as cond keeps
+// returning true. Returns the concatenated output with leading and trailing spaces
+// trimmed.
+func concat(tokens *tokenizer.RawTokenStream, cond func(tok tokenizer.Token) bool) string {
+ var values string
+ for cond(tokens.Peek()) {
+ values += tokens.Next().Value
+ }
+ return strings.TrimSpace(values)
+}
+
func unexpectedTokenError(wanted string, token tokenizer.Token) error {
return parserError("got %q but wanted %s", token, wanted)
}
diff --git a/tap/parser_test.go b/tap/parser_test.go
index 9cfd75e..63b4700 100644
--- a/tap/parser_test.go
+++ b/tap/parser_test.go
@@ -121,6 +121,36 @@
},
},
},
+ {
+ name: "should preserve spaces in description",
+ input: strings.TrimSpace(`
+TAP version 13
+1..1
+ok 1 - This test passed
+`),
+ expected: &Document{
+ Version: 13,
+ Plan: Plan{Start: 1, End: 1},
+ TestLines: []TestLine{
+ {Ok: true, Count: 1, Description: "- This test passed"},
+ },
+ },
+ },
+ {
+ name: "should preserve spaces in directive explanation",
+ input: strings.TrimSpace(`
+TAP version 13
+1..1
+ok 1 # SKIP this is disabled
+`),
+ expected: &Document{
+ Version: 13,
+ Plan: Plan{Start: 1, End: 1},
+ TestLines: []TestLine{
+ {Ok: true, Count: 1, Directive: Skip, Explanation: "this is disabled"},
+ },
+ },
+ },
}
for _, tt := range tests {
diff --git a/tap/tokenizer/lexer.go b/tap/tokenizer/lexer.go
index 3761352..9d97fbf 100644
--- a/tap/tokenizer/lexer.go
+++ b/tap/tokenizer/lexer.go
@@ -7,6 +7,7 @@
import (
"log"
"strconv"
+ "unicode"
"unicode/utf8"
)
@@ -21,6 +22,7 @@
TypeDot TokenType = "DOT" // '.'
TypeNewline TokenType = "NEWLINE" // '\n'
TypeEOF TokenType = "EOF" // Pseudo token to signal the end of input.
+ TypeSpace TokenType = "SPACE" // A run of whitespace characters, excluding '\n'
)
// Token represents some atomic TAP output string.
@@ -50,8 +52,8 @@
// The rune emitted when the end of input has been reached.
const eof = rune(-1)
-// State represents a lexical analysis state. Each state accepts a lexer as input and
-// returns the next lexer state. If the output state is nil, lexing stops.
+// State represents a lexical analysis state. Each state accepts a lexer as input and
+// returns the next lexer state. If the output state is nil, lexing stops.
type state func(*lexer) state
// Lexer manages the position of a lexical analysis on some TAP output string.
@@ -98,9 +100,9 @@
return lexeme(l.input[l.pos : l.pos+1][0])
}
-// LexAny is the lexer start state. It's job is to put the lexer into the proper state
-// according to the next input rune. Other states should return to this state after
-// emitting their lexemes. They should also not consume runes using l.next() immediately
+// LexAny is the lexer start state. Its job is to put the lexer into the proper state
+// according to the next input rune. Other states should return to this state after
+// emitting their lexemes. They should also not consume runes using l.next() immediately
// before entering this state.
func lexAny(l *lexer) state {
lxm := l.lexeme()
@@ -125,12 +127,7 @@
l.emit(TypePound)
return lexAny
case lxm.isSpace():
- // Skip all spaces.
- for l.lexeme().isSpace() {
- l.next()
- }
- l.start = l.pos
- return lexAny
+ return lexSpace
case lxm.isDigit():
return lexNumber
}
@@ -138,42 +135,35 @@
return lexText
}
+func lexSpace(l *lexer) state {
+ return lexUntil(l, TypeSpace, func(lxm lexeme) bool { return !lxm.isSpace() })
+}
+
func lexNumber(l *lexer) state {
+ return lexUntil(l, TypeNumber, func(lxm lexeme) bool { return !lxm.isDigit() })
+}
+
+func lexText(l *lexer) state {
+ return lexUntil(l, TypeText, func(lxm lexeme) bool { return lxm.isNonText() })
+}
+
+// LexUntil consumes all runes into a token of the given type while `stop` is false.
+// Returns lexAny when complete or nil if the end of input was reached.
+func lexUntil(l *lexer, typ TokenType, stop func(lexeme) bool) state {
for {
lxm := l.lexeme()
- if lxm.isEOF() || !lxm.isDigit() {
- l.emit(TypeNumber)
+ if lxm.isEOF() || stop(lxm) {
+ l.emit(typ)
return lexAny
}
-
if l.next() == eof {
break
}
}
-
// Reached EOF
if l.pos > l.start {
- l.emit(TypeNumber)
+ l.emit(typ)
}
-
- l.emit(TypeEOF)
- return nil
-}
-
-func lexText(l *lexer) state {
- for l.next() != eof {
- lxm := l.lexeme()
- if lxm.isNonText() {
- l.emit(TypeText)
- return lexAny
- }
- }
-
- // Reached EOF
- if l.pos > l.start {
- l.emit(TypeText)
- }
-
l.emit(TypeEOF)
return nil
}
@@ -181,7 +171,7 @@
type lexeme rune
func (l lexeme) isSpace() bool {
- return l == ' ' || l == '\r'
+ return l != '\n' && unicode.IsSpace(rune(l))
}
func (l lexeme) isNewline() bool {
diff --git a/tap/tokenizer/stream.go b/tap/tokenizer/stream.go
index fd14d91..cc9e947 100644
--- a/tap/tokenizer/stream.go
+++ b/tap/tokenizer/stream.go
@@ -7,21 +7,60 @@
// NewTokenStream creates a stream of Token values read from input.
func NewTokenStream(input []byte) *TokenStream {
return &TokenStream{
- stream: Tokenize(input),
+ raw: &RawTokenStream{
+ stream: Tokenize(input),
+ },
}
}
-// TokenStream is a read-only queue of Token values. The next Token in the stream can be
-// consumed by calling Next(). The next token can be observed without being consumed by
-// calling Peek().
+// TokenStream is a read-only queue of Token values. The next Token in the stream can be
+// consumed by calling Next(). The next token can be observed without being consumed by
+// calling Peek(). By default, TokenStream discards whitespace (TypeSpace) tokens as
+// though they are not part of the stream. They are discarded by both Peek and Next.
type TokenStream struct {
+ raw *RawTokenStream
+}
+
+// Next consumes the next token in the stream. Space characters are skipped.
+func (s *TokenStream) Next() Token {
+ for {
+ next := s.raw.Next()
+ if next.Type != TypeSpace {
+ return next
+ }
+ }
+}
+
+// Peek returns a read-only copy of the next token in the stream, without consuming it.
+// Space characters are skipped.
+func (s *TokenStream) Peek() Token {
+ for {
+ next := s.raw.Peek()
+ if next.Type == TypeSpace {
+ s.raw.Next()
+ continue
+ }
+ return next
+ }
+}
+
+// Raw returns a RawTokenStream using the same underlying stream of Tokens as this
+// TokenStream.
+func (s *TokenStream) Raw() *RawTokenStream {
+ return s.raw
+}
+
+// RawTokenStream is a read-only queue of Token values. The next Token in the stream can
+// be consumed by calling Next(). The next token can be observed without being consumed
+// by calling Peek().
+type RawTokenStream struct {
+ stream <-chan Token
eof bool
lookahead *Token
- stream <-chan Token
}
// Next consumes the next token in the stream.
-func (s *TokenStream) Next() Token {
+func (s *RawTokenStream) Next() Token {
if s.eof {
return EOFToken()
}
@@ -42,7 +81,7 @@
}
// Peek returns a read-only copy of the next token in the stream, without consuming it.
-func (s *TokenStream) Peek() Token {
+func (s *RawTokenStream) Peek() Token {
if s.eof {
return EOFToken()
}