blob: 9173f682143bf0cee69c406024411b24a4bed51b [file] [log] [blame] [edit]
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package tokenizer
import (
"log"
"strconv"
"unicode"
"unicode/utf8"
)
// TokenType describes the category a Token belongs to. The values produced by
// the lexer are the Type* constants declared in this package.
type TokenType string
// TokenType values recognized by the lexer. Pound, Dot, Newline and Dash are
// always single-character tokens; Number, Text and Space cover a run of
// consecutive characters of the same class.
const (
TypePound TokenType = "POUND" // '#'
TypeNumber TokenType = "NUMBER" // A run of decimal digits
TypeText TokenType = "TEXT" // Catch-all type: a run of characters matching no other type
TypeDot TokenType = "DOT" // '.'
TypeNewline TokenType = "NEWLINE" // '\n'
TypeEOF TokenType = "EOF" // Pseudo token to signal the end of input.
TypeSpace TokenType = "SPACE" // A run of whitespace characters other than '\n'
TypeDash TokenType = "DASH" // '-'
)
// Token represents some atomic TAP output string.
type Token struct {
// Type is the category of the token.
Type TokenType
// Value is the exact input text the token covers. It is empty for the
// token returned by EOFToken.
Value string
}
// Tokenize lexes input on a separate goroutine and returns the channel on
// which the resulting Tokens are delivered. The channel has a buffer of one
// Token and is closed by the lexer when lexing stops, so callers can simply
// range over it. A caller that stops receiving before the channel is closed
// leaves the lexing goroutine blocked on its next send.
func Tokenize(input []byte) <-chan Token {
	out := make(chan Token, 1)
	lx := &lexer{input: input, Tokens: out}
	go lx.run()
	return out
}
// EOFToken returns the token that signals the end of input: its Type is
// TypeEOF and its Value is the empty string.
func EOFToken() Token {
	var tok Token // Value stays at its zero value, "".
	tok.Type = TypeEOF
	return tok
}
// eof is the sentinel rune reported by the lexer once the end of input has
// been reached. It is negative, so it can never be equal to a rune decoded
// from the input.
const eof = rune(-1)
// state represents a lexical analysis state. Each state accepts a lexer as input and
// returns the next lexer state. If the output state is nil, lexing stops.
type state func(*lexer) state
// lexer manages the position of a lexical analysis on some TAP output string.
type lexer struct {
// input is the complete byte sequence being tokenized.
input []byte
// start is the byte offset at which the token currently being built
// begins; emit advances it to pos.
start int
// pos is the byte offset of the next unread byte of input.
pos int
// width is the encoded size, in bytes, of the rune most recently read by
// next, or 0 when next reported the end of input.
width int
// Tokens receives every emitted Token; run closes it when lexing stops.
Tokens chan Token
}
// run drives the state machine: starting from lexAny it keeps invoking the
// current state and adopting the state it returns until a state returns nil,
// then closes the Tokens channel so receivers know lexing has finished.
func (l *lexer) run() {
	var step state = lexAny
	for step != nil {
		step = step(l)
	}
	close(l.Tokens)
}
// emit sends a Token of type t whose Value is the input text between start
// and pos, then moves start up to pos so the next token begins where this one
// ended. The send blocks until the Tokens channel can accept the token.
func (l *lexer) emit(t TokenType) {
	tok := Token{
		Type:  t,
		Value: string(l.input[l.start:l.pos]),
	}
	l.Tokens <- tok
	l.start = l.pos
}
// next consumes the next rune of input and returns it, advancing pos past the
// rune and recording its encoded size in width.
//
// Bytes that are not valid UTF-8 are logged and stepped over one at a time
// until a decodable rune is found; that rune is the one returned. Skipped
// bytes are not removed from the pending token, because start is left
// untouched.
//
// When no rune remains -- either pos is already at the end of input or only
// invalid bytes were left -- next sets width to 0 and returns eof.
func (l *lexer) next() rune {
	for l.pos < len(l.input) {
		rn, width := utf8.DecodeRune(l.input[l.pos:])
		// DecodeRune reports a malformed sequence as (RuneError, 1). Any other
		// width means the input really contains an encoded U+FFFD, which is a
		// valid rune and must be returned rather than skipped.
		if rn != utf8.RuneError || width != 1 {
			l.width = width
			l.pos += width
			return rn
		}
		log.Printf("invalid UTF-8 found at pos %d:\n\n%s", l.pos, string(l.input))
		l.pos++
	}
	l.width = 0
	return eof
}
// lexeme returns the rune at the current position without consuming it, or
// eof when pos is at or past the end of input.
//
// The rune is decoded from UTF-8 so that multi-byte characters are classified
// as the character they encode (for example U+3000 IDEOGRAPHIC SPACE is seen
// as whitespace) instead of by their first byte. This also matches next, which
// advances by whole runes. A byte that does not start a valid UTF-8 sequence
// is reported as utf8.RuneError.
func (l *lexer) lexeme() lexeme {
	if l.pos >= len(l.input) {
		return lexeme(eof)
	}
	rn, _ := utf8.DecodeRune(l.input[l.pos:])
	return lexeme(rn)
}
// lexAny is the lexer start state. It inspects the upcoming lexeme without
// consuming it and decides what happens next:
//
//   - at the end of input it emits TypeEOF and returns nil, which stops lexing;
//   - for '-', '\n', '.' and '#' it consumes the character, emits the matching
//     single-character token and returns lexAny again;
//   - for whitespace, digits and everything else it hands over to lexSpace,
//     lexNumber or lexText respectively, without consuming anything.
//
// Other states should return to this state after emitting their lexemes, and
// should not consume a rune with l.next() immediately before doing so.
func lexAny(l *lexer) state {
	cur := l.lexeme()
	if cur.isEOF() {
		l.emit(TypeEOF)
		return nil
	}
	l.start = l.pos

	// Work out whether this is a single-character token; multi-character
	// classes are delegated to their own states straight away.
	var single TokenType
	switch {
	case cur.isDash():
		single = TypeDash
	case cur.isNewline():
		single = TypeNewline
	case cur.isDot():
		single = TypeDot
	case cur.isPound():
		single = TypePound
	case cur.isSpace():
		return lexSpace
	case cur.isDigit():
		return lexNumber
	default:
		return lexText
	}

	l.next()
	l.emit(single)
	return lexAny
}
// lexSpace consumes a run of whitespace into a single TypeSpace token. The
// run ends at the first lexeme for which isSpace reports false.
func lexSpace(l *lexer) state {
	atNonSpace := func(lxm lexeme) bool { return !lxm.isSpace() }
	return lexUntil(l, TypeSpace, atNonSpace)
}
// lexNumber consumes a run of digits into a single TypeNumber token. The run
// ends at the first lexeme for which isDigit reports false.
func lexNumber(l *lexer) state {
	atNonDigit := func(lxm lexeme) bool { return !lxm.isDigit() }
	return lexUntil(l, TypeNumber, atNonDigit)
}
// lexText consumes a run of catch-all characters into a single TypeText
// token. The run ends at the first lexeme for which isNonText reports true.
func lexText(l *lexer) state {
	// The method expression has exactly the func(lexeme) bool shape that
	// lexUntil expects for its stop predicate.
	return lexUntil(l, TypeText, lexeme.isNonText)
}
// lexUntil consumes runes into a token of type typ for as long as stop
// reports false for the upcoming lexeme.
//
// When the upcoming lexeme is the end of input or satisfies stop, the text
// gathered so far is emitted as typ and lexAny is returned; stop is not
// consulted for the end-of-input lexeme. If instead l.next() itself reports
// eof, any gathered text is emitted as typ, followed by a TypeEOF token, and
// nil is returned to stop lexing.
func lexUntil(l *lexer, typ TokenType, stop func(lexeme) bool) state {
	for {
		if cur := l.lexeme(); cur.isEOF() || stop(cur) {
			l.emit(typ)
			return lexAny
		}
		if l.next() != eof {
			continue
		}

		// The input ran out while consuming: flush whatever was gathered
		// and finish with the EOF token.
		if l.start < l.pos {
			l.emit(typ)
		}
		l.emit(TypeEOF)
		return nil
	}
}
// lexeme is a single rune of input (or the eof sentinel) together with the
// predicates the lexer uses to classify it.
type lexeme rune

// isSpace reports whether l is Unicode whitespace other than '\n', which is
// tokenized separately as a newline.
func (l lexeme) isSpace() bool {
	if l.isNewline() {
		return false
	}
	return unicode.IsSpace(rune(l))
}

// isNewline reports whether l is the line feed character.
func (l lexeme) isNewline() bool {
	return rune(l) == '\n'
}

// isDigit reports whether l, taken as a one-character string, parses as a
// decimal integer.
func (l lexeme) isDigit() bool {
	if _, err := strconv.Atoi(string(rune(l))); err != nil {
		return false
	}
	return true
}

// isDot reports whether l is '.'.
func (l lexeme) isDot() bool {
	return rune(l) == '.'
}

// isDash reports whether l is '-'.
func (l lexeme) isDash() bool {
	return rune(l) == '-'
}

// isPound reports whether l is '#'.
func (l lexeme) isPound() bool {
	return rune(l) == '#'
}

// isEOF reports whether l is the end-of-input sentinel.
func (l lexeme) isEOF() bool {
	return eof == rune(l)
}

// isNonText reports whether l belongs to any class other than the catch-all
// text class, i.e. whether it would end a text token.
func (l lexeme) isNonText() bool {
	switch {
	case l.isEOF(), l.isSpace(), l.isNewline(), l.isDigit(), l.isDot(), l.isPound(), l.isDash():
		return true
	}
	return false
}