Add Go source and tests.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..038d9a4
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,21 @@
+# Copyright 2011 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include $(GOROOT)/src/Make.inc
+
+TARG=shlex
+GOFILES=\
+ shlex.go\
+
+include $(GOROOT)/src/Make.pkg
diff --git a/shlex.go b/shlex.go
new file mode 100644
index 0000000..36f6515
--- /dev/null
+++ b/shlex.go
@@ -0,0 +1,457 @@
+/*
+Copyright 2012 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package shlex
+
+/*
+Package shlex implements a simple lexer which splits input in to tokens using
+shell-style rules for quoting and commenting.
+*/
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "os"
+ "strings"
+)
+
+/*
+A TokenType is a top-level token; a word, space, comment, unknown.
+*/
+type TokenType int
+
+/*
+A RuneTokenType is the type of a UTF-8 character; a character, quote, space, escape.
+*/
+type RuneTokenType int
+
+type lexerState int // current state of the scanStream state machine
+
+type Token struct { // a typed token together with its literal text
+ tokenType TokenType
+ value string
+}
+
+/*
+Equal reports whether two tokens have the same type and value. A nil token
+never equals another token.
+*/
+func (a *Token) Equal(b *Token) bool {
+ if a == nil || b == nil {
+ return false
+ }
+ if a.tokenType != b.tokenType {
+ return false
+ }
+ return a.value == b.value
+}
+
+const (
+ RUNE_CHAR string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._-,"
+ RUNE_SPACE string = " \t\r\n"
+ RUNE_ESCAPING_QUOTE string = "\""
+ RUNE_NONESCAPING_QUOTE string = "'"
+ RUNE_ESCAPE = "\\" // untyped, unlike the constants above; both work in string contexts
+ RUNE_COMMENT = "#"
+
+ RUNETOKEN_UNKNOWN RuneTokenType = 0 // zero value: what ClassifyRune yields for unmapped runes
+ RUNETOKEN_CHAR RuneTokenType = 1
+ RUNETOKEN_SPACE RuneTokenType = 2
+ RUNETOKEN_ESCAPING_QUOTE RuneTokenType = 3
+ RUNETOKEN_NONESCAPING_QUOTE RuneTokenType = 4
+ RUNETOKEN_ESCAPE RuneTokenType = 5
+ RUNETOKEN_COMMENT RuneTokenType = 6
+ RUNETOKEN_EOF RuneTokenType = 7
+
+ TOKEN_UNKNOWN TokenType = 0
+ TOKEN_WORD TokenType = 1
+ TOKEN_SPACE TokenType = 2 // declared but never emitted by scanStream (spaces are skipped)
+ TOKEN_COMMENT TokenType = 3
+
+ STATE_START lexerState = 0
+ STATE_INWORD lexerState = 1
+ STATE_ESCAPING lexerState = 2
+ STATE_ESCAPING_QUOTED lexerState = 3
+ STATE_QUOTED_ESCAPING lexerState = 4
+ STATE_QUOTED lexerState = 5
+ STATE_COMMENT lexerState = 6
+
+ INITIAL_TOKEN_CAPACITY int = 100 // initial capacity of scanStream's rune buffer
+)
+
+/*
+TokenClassifier classifies runes into token types. Different classifiers
+can accept extended non-ascii chars, or enforce strict posix
+compatibility, for example.
+*/
+type TokenClassifier struct {
+ typeMap map[int]RuneTokenType // rune code point -> its classification
+}
+// addRuneClass registers every rune of runes in typeMap with tokenType.
+func addRuneClass(typeMap *map[int]RuneTokenType, runes string, tokenType RuneTokenType) {
+ for _, rune := range runes {
+ (*typeMap)[rune] = tokenType // maps are reference types; the pointer indirection is not strictly needed
+ }
+}
+
+/*
+NewDefaultClassifier creates a new classifier for basic ASCII characters.
+*/
+func NewDefaultClassifier() *TokenClassifier {
+ typeMap := map[int]RuneTokenType{}
+ addRuneClass(&typeMap, RUNE_CHAR, RUNETOKEN_CHAR)
+ addRuneClass(&typeMap, RUNE_SPACE, RUNETOKEN_SPACE)
+ addRuneClass(&typeMap, RUNE_ESCAPING_QUOTE, RUNETOKEN_ESCAPING_QUOTE)
+ addRuneClass(&typeMap, RUNE_NONESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE)
+ addRuneClass(&typeMap, RUNE_ESCAPE, RUNETOKEN_ESCAPE)
+ addRuneClass(&typeMap, RUNE_COMMENT, RUNETOKEN_COMMENT)
+ return &TokenClassifier{
+ typeMap: typeMap}
+}
+// ClassifyRune returns the class of a rune; unmapped runes yield RUNETOKEN_UNKNOWN (the map zero value).
+func (classifier *TokenClassifier) ClassifyRune(rune int) RuneTokenType {
+ return classifier.typeMap[rune]
+}
+
+/*
+Lexer turns an input stream into a sequence of words (strings).
+Whitespace and comments are skipped.
+*/
+type Lexer struct {
+ tokenizer *Tokenizer
+}
+
+/*
+NewLexer creates a new lexer reading from r.
+*/
+func NewLexer(r io.Reader) (*Lexer, os.Error) {
+
+ tokenizer, err := NewTokenizer(r)
+ if err != nil {
+ return nil, err
+ }
+ lexer := &Lexer{tokenizer: tokenizer}
+ return lexer, nil
+}
+
+/*
+NextWord returns the next word and an error value. If there are no more
+words, the error will be os.EOF. Comment tokens are silently skipped; an
+unrecognized token type causes a panic.
+*/
+func (l *Lexer) NextWord() (string, os.Error) {
+ var token *Token
+ var err os.Error
+ for {
+ token, err = l.tokenizer.NextToken()
+ if err != nil {
+ return "", err
+ }
+ switch token.tokenType {
+ case TOKEN_WORD:
+ {
+ return token.value, nil
+ }
+ case TOKEN_COMMENT:
+ {
+ // skip comments
+ }
+ default:
+ {
+ panic(fmt.Sprintf("Unknown token type: %v", token.tokenType))
+ }
+ }
+ }
+ return "", os.EOF // unreachable: the loop above only exits via return
+}
+
+/*
+Tokenizer turns an input stream into a sequence of typed tokens.
+*/
+type Tokenizer struct {
+ input *bufio.Reader
+ classifier *TokenClassifier
+}
+
+/*
+NewTokenizer creates a new tokenizer reading from r, using the default
+ASCII classifier.
+*/
+func NewTokenizer(r io.Reader) (*Tokenizer, os.Error) {
+ input := bufio.NewReader(r)
+ classifier := NewDefaultClassifier()
+ tokenizer := &Tokenizer{
+ input: input,
+ classifier: classifier}
+ return tokenizer, nil
+}
+
+/*
+scanStream scans the stream for the next token.
+
+This uses an internal state machine. It returns an error for a rune it does
+not know how to handle, and panics only on an unexpected internal state.
+*/
+func (t *Tokenizer) scanStream() (*Token, os.Error) {
+ state := STATE_START
+ var tokenType TokenType
+ value := make([]int, 0, INITIAL_TOKEN_CAPACITY)
+ var (
+ nextRune int
+ nextRuneType RuneTokenType
+ err os.Error
+ )
+SCAN:
+ for {
+ nextRune, _, err = t.input.ReadRune()
+ nextRuneType = t.classifier.ClassifyRune(nextRune) // classified before the err check; replaced below on EOF
+ if err != nil {
+ if err == os.EOF {
+ nextRuneType = RUNETOKEN_EOF
+ err = nil
+ } else {
+ return nil, err
+ }
+ }
+ switch state {
+ case STATE_START: // no runes read yet
+ {
+ switch nextRuneType {
+ case RUNETOKEN_EOF:
+ {
+ return nil, os.EOF
+ }
+ case RUNETOKEN_CHAR:
+ {
+ tokenType = TOKEN_WORD
+ value = append(value, nextRune)
+ state = STATE_INWORD
+ }
+ case RUNETOKEN_SPACE:
+ { // skip leading whitespace
+ }
+ case RUNETOKEN_ESCAPING_QUOTE:
+ {
+ tokenType = TOKEN_WORD
+ state = STATE_QUOTED_ESCAPING
+ }
+ case RUNETOKEN_NONESCAPING_QUOTE:
+ {
+ tokenType = TOKEN_WORD
+ state = STATE_QUOTED
+ }
+ case RUNETOKEN_ESCAPE:
+ {
+ tokenType = TOKEN_WORD
+ state = STATE_ESCAPING
+ }
+ case RUNETOKEN_COMMENT:
+ {
+ tokenType = TOKEN_COMMENT
+ state = STATE_COMMENT
+ }
+ default:
+ {
+ return nil, os.NewError(fmt.Sprintf("Unknown rune: %v", nextRune))
+ }
+ }
+ }
+ case STATE_INWORD: // in a regular word
+ {
+ switch nextRuneType {
+ case RUNETOKEN_EOF:
+ {
+ break SCAN
+ }
+ case RUNETOKEN_CHAR, RUNETOKEN_COMMENT:
+ {
+ value = append(value, nextRune)
+ }
+ case RUNETOKEN_SPACE:
+ {
+ t.input.UnreadRune()
+ break SCAN
+ }
+ case RUNETOKEN_ESCAPING_QUOTE:
+ {
+ state = STATE_QUOTED_ESCAPING
+ }
+ case RUNETOKEN_NONESCAPING_QUOTE:
+ {
+ state = STATE_QUOTED
+ }
+ case RUNETOKEN_ESCAPE:
+ {
+ state = STATE_ESCAPING
+ }
+ default:
+ {
+ return nil, os.NewError(fmt.Sprintf("Unknown rune: %v", nextRune))
+ }
+ }
+ }
+ case STATE_ESCAPING: // the next rune after an escape character
+ {
+ switch nextRuneType {
+ case RUNETOKEN_EOF:
+ {
+ err = os.NewError("EOF found after escape character")
+ break SCAN
+ }
+ case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
+ {
+ state = STATE_INWORD
+ value = append(value, nextRune)
+ }
+ default:
+ {
+ return nil, os.NewError(fmt.Sprintf("Unknown rune: %v", nextRune))
+ }
+ }
+ }
+ case STATE_ESCAPING_QUOTED: // the next rune after an escape character, in double quotes
+ {
+ switch nextRuneType {
+ case RUNETOKEN_EOF:
+ {
+ err = os.NewError("EOF found after escape character")
+ break SCAN
+ }
+ case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
+ {
+ state = STATE_QUOTED_ESCAPING
+ value = append(value, nextRune)
+ }
+ default:
+ {
+ return nil, os.NewError(fmt.Sprintf("Unknown rune: %v", nextRune))
+ }
+ }
+ }
+ case STATE_QUOTED_ESCAPING: // in escaping double quotes
+ {
+ switch nextRuneType {
+ case RUNETOKEN_EOF:
+ {
+ err = os.NewError("EOF found when expecting closing quote.")
+ break SCAN
+ }
+ case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_COMMENT:
+ {
+ value = append(value, nextRune)
+ }
+ case RUNETOKEN_ESCAPING_QUOTE:
+ {
+ state = STATE_INWORD
+ }
+ case RUNETOKEN_ESCAPE:
+ {
+ state = STATE_ESCAPING_QUOTED
+ }
+ default:
+ {
+ return nil, os.NewError(fmt.Sprintf("Unknown rune: %v", nextRune))
+ }
+ }
+ }
+ case STATE_QUOTED: // in non-escaping single quotes
+ {
+ switch nextRuneType {
+ case RUNETOKEN_EOF:
+ {
+ err = os.NewError("EOF found when expecting closing quote.")
+ break SCAN
+ }
+ case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
+ {
+ value = append(value, nextRune)
+ }
+ case RUNETOKEN_NONESCAPING_QUOTE:
+ {
+ state = STATE_INWORD
+ }
+ default:
+ {
+ return nil, os.NewError(fmt.Sprintf("Unknown rune: %v", nextRune))
+ }
+ }
+ }
+ case STATE_COMMENT:
+ {
+ switch nextRuneType {
+ case RUNETOKEN_EOF:
+ {
+ break SCAN
+ }
+ case RUNETOKEN_CHAR, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT, RUNETOKEN_NONESCAPING_QUOTE:
+ {
+ value = append(value, nextRune)
+ }
+ case RUNETOKEN_SPACE:
+ {
+ if nextRune == '\n' {
+ state = STATE_START // dead store: SCAN exits before state is read again
+ break SCAN
+ } else {
+ value = append(value, nextRune)
+ }
+ }
+ default:
+ {
+ return nil, os.NewError(fmt.Sprintf("Unknown rune: %v", nextRune))
+ }
+ }
+ }
+ default:
+ {
+ panic(fmt.Sprintf("Unexpected state: %v", state))
+ }
+ }
+ }
+ token := &Token{
+ tokenType: tokenType,
+ value: string(value)}
+ return token, err
+}
+
+/*
+NextToken returns the next token in the stream, and an error value. If
+there are no more tokens available, the error value will be os.EOF.
+*/
+func (t *Tokenizer) NextToken() (*Token, os.Error) {
+ return t.scanStream()
+}
+
+/*
+Split divides a string into a slice of strings, based upon shell-style
+rules for quoting, escaping, and spaces.
+*/
+func Split(s string) ([]string, os.Error) {
+ l, err := NewLexer(strings.NewReader(s))
+ if err != nil {
+ return nil, err
+ }
+ subStrings := []string{}
+ for {
+ word, err := l.NextWord()
+ if err != nil {
+ if err == os.EOF {
+ return subStrings, nil
+ }
+ return subStrings, err
+ }
+ subStrings = append(subStrings, word)
+ }
+ return subStrings, nil // unreachable: the loop above only exits via return
+}
diff --git a/shlex_test.go b/shlex_test.go
new file mode 100644
index 0000000..22846f4
--- /dev/null
+++ b/shlex_test.go
@@ -0,0 +1,111 @@
+/*
+Copyright 2012 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package shlex
+
+import (
+ "os"
+ "strings"
+ "testing"
+)
+
+func checkError(err os.Error, t *testing.T) { // reports err as a test error when it is non-nil
+ if err != nil {
+ t.Error(err)
+ }
+}
+
+func TestClassifier(t *testing.T) { // verifies the default classifier's rune -> type mapping
+ classifier := NewDefaultClassifier()
+ runeTests := map[int]RuneTokenType{
+ 'a': RUNETOKEN_CHAR,
+ ' ': RUNETOKEN_SPACE,
+ '"': RUNETOKEN_ESCAPING_QUOTE,
+ '\'': RUNETOKEN_NONESCAPING_QUOTE,
+ '#': RUNETOKEN_COMMENT}
+ for rune, expectedType := range runeTests {
+ foundType := classifier.ClassifyRune(rune)
+ if foundType != expectedType {
+ t.Logf("Expected type: %v for rune '%c'(%v). Found type: %v.", expectedType, rune, rune, foundType)
+ t.Fail()
+ }
+ }
+}
+
+func TestTokenizer(t *testing.T) { // checks the full token stream: words, quotes, escapes, comments
+ testInput := strings.NewReader("one two \"three four\" \"five \\\"six\\\"\" seven#eight # nine # ten\n eleven")
+ expectedTokens := []*Token{
+ &Token{
+ tokenType: TOKEN_WORD,
+ value: "one"},
+ &Token{
+ tokenType: TOKEN_WORD,
+ value: "two"},
+ &Token{
+ tokenType: TOKEN_WORD,
+ value: "three four"},
+ &Token{
+ tokenType: TOKEN_WORD,
+ value: "five \"six\""},
+ &Token{
+ tokenType: TOKEN_WORD,
+ value: "seven#eight"},
+ &Token{
+ tokenType: TOKEN_COMMENT,
+ value: " nine # ten"},
+ &Token{
+ tokenType: TOKEN_WORD,
+ value: "eleven"}}
+
+ tokenizer, err := NewTokenizer(testInput)
+ checkError(err, t)
+ for _, expectedToken := range expectedTokens {
+ foundToken, err := tokenizer.NextToken()
+ checkError(err, t)
+ if !foundToken.Equal(expectedToken) {
+ t.Error("Expected token:", expectedToken, ". Found:", foundToken)
+ }
+ }
+}
+
+func TestLexer(t *testing.T) { // checks that the lexer yields a single word
+ testInput := strings.NewReader("one")
+ expectedWord := "one"
+ lexer, err := NewLexer(testInput)
+ checkError(err, t)
+ foundWord, err := lexer.NextWord()
+ checkError(err, t)
+ if expectedWord != foundWord {
+ t.Error("Expected word:", expectedWord, ". Found:", foundWord)
+ }
+}
+
+func TestSplit(t *testing.T) { // checks the convenience Split wrapper on plain words
+ testInput := "one two three"
+ expectedOutput := []string{"one", "two", "three"}
+ foundOutput, err := Split(testInput)
+ if err != nil {
+ t.Error("Split returned error:", err)
+ }
+ if len(expectedOutput) != len(foundOutput) {
+ t.Error("Split expected:", len(expectedOutput), "results. Found:", len(foundOutput), "results")
+ }
+ for i := range foundOutput {
+ if foundOutput[i] != expectedOutput[i] {
+ t.Error("Item:", i, "(", foundOutput[i], ") differs from the expected value:", expectedOutput[i])
+ }
+ }
+}