Add Go source and tests.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..038d9a4
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,21 @@
+# Copyright 2011 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include $(GOROOT)/src/Make.inc
+
+TARG=shlex
+GOFILES=\
+ shlex.go\
+
+include $(GOROOT)/src/Make.pkg
diff --git a/shlex.go b/shlex.go
new file mode 100644
index 0000000..36f6515
--- /dev/null
+++ b/shlex.go
@@ -0,0 +1,457 @@
+/*
+Copyright 2012 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package shlex
+
+/*
+Package shlex implements a simple lexer which splits input in to tokens using
+shell-style rules for quoting and commenting.
+*/
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "os"
+ "strings"
+)
+
+/*
+A TokenType is a top-level token; a word, space, comment, unknown.
+*/
+type TokenType int
+
+/*
+A RuneTokenType is the type of a UTF-8 character; a character, quote, space, escape.
+*/
+type RuneTokenType int
+
+type lexerState int // current state of the scanStream state machine
+
+type Token struct { // a typed token together with its literal text
+ tokenType TokenType
+ value string
+}
+
+/*
+Equal reports whether two tokens have the same type and value. A nil token
+never equals another token.
+*/
+func (a *Token) Equal(b *Token) bool {
+ if a == nil || b == nil {
+ return false
+ }
+ if a.tokenType != b.tokenType {
+ return false
+ }
+ return a.value == b.value
+}
+
+const (
+ RUNE_CHAR string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._-,"
+ RUNE_SPACE string = " \t\r\n"
+ RUNE_ESCAPING_QUOTE string = "\""
+ RUNE_NONESCAPING_QUOTE string = "'"
+ RUNE_ESCAPE = "\\" // untyped, unlike the constants above; both work in string contexts
+ RUNE_COMMENT = "#"
+
+ RUNETOKEN_UNKNOWN RuneTokenType = 0 // zero value: what ClassifyRune yields for unmapped runes
+ RUNETOKEN_CHAR RuneTokenType = 1
+ RUNETOKEN_SPACE RuneTokenType = 2
+ RUNETOKEN_ESCAPING_QUOTE RuneTokenType = 3
+ RUNETOKEN_NONESCAPING_QUOTE RuneTokenType = 4
+ RUNETOKEN_ESCAPE RuneTokenType = 5
+ RUNETOKEN_COMMENT RuneTokenType = 6
+ RUNETOKEN_EOF RuneTokenType = 7
+
+ TOKEN_UNKNOWN TokenType = 0
+ TOKEN_WORD TokenType = 1
+ TOKEN_SPACE TokenType = 2 // declared but never emitted by scanStream (spaces are skipped)
+ TOKEN_COMMENT TokenType = 3
+
+ STATE_START lexerState = 0
+ STATE_INWORD lexerState = 1
+ STATE_ESCAPING lexerState = 2
+ STATE_ESCAPING_QUOTED lexerState = 3
+ STATE_QUOTED_ESCAPING lexerState = 4
+ STATE_QUOTED lexerState = 5
+ STATE_COMMENT lexerState = 6
+
+ INITIAL_TOKEN_CAPACITY int = 100 // initial capacity of scanStream's rune buffer
+)
+
+/*
+TokenClassifier classifies runes into token types. Different classifiers
+can accept extended non-ascii chars, or enforce strict posix
+compatibility, for example.
+*/
+type TokenClassifier struct {
+ typeMap map[int]RuneTokenType // rune code point -> its classification
+}
+// addRuneClass registers every rune of runes in typeMap with tokenType.
+func addRuneClass(typeMap *map[int]RuneTokenType, runes string, tokenType RuneTokenType) {
+ for _, rune := range runes {
+ (*typeMap)[rune] = tokenType // maps are reference types; the pointer indirection is not strictly needed
+ }
+}
+
+/*
+NewDefaultClassifier creates a new classifier for basic ASCII characters.
+*/
+func NewDefaultClassifier() *TokenClassifier {
+ typeMap := map[int]RuneTokenType{}
+ addRuneClass(&typeMap, RUNE_CHAR, RUNETOKEN_CHAR)
+ addRuneClass(&typeMap, RUNE_SPACE, RUNETOKEN_SPACE)
+ addRuneClass(&typeMap, RUNE_ESCAPING_QUOTE, RUNETOKEN_ESCAPING_QUOTE)
+ addRuneClass(&typeMap, RUNE_NONESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE)
+ addRuneClass(&typeMap, RUNE_ESCAPE, RUNETOKEN_ESCAPE)
+ addRuneClass(&typeMap, RUNE_COMMENT, RUNETOKEN_COMMENT)
+ return &TokenClassifier{
+ typeMap: typeMap}
+}
+// ClassifyRune returns the class of a rune; unmapped runes yield RUNETOKEN_UNKNOWN (the map zero value).
+func (classifier *TokenClassifier) ClassifyRune(rune int) RuneTokenType {
+ return classifier.typeMap[rune]
+}
+
+/*
+Lexer turns an input stream into a sequence of words (strings).
+Whitespace and comments are skipped.
+*/
+type Lexer struct {
+ tokenizer *Tokenizer
+}
+
+/*
+NewLexer creates a new lexer reading from r.
+*/
+func NewLexer(r io.Reader) (*Lexer, os.Error) {
+
+ tokenizer, err := NewTokenizer(r)
+ if err != nil {
+ return nil, err
+ }
+ lexer := &Lexer{tokenizer: tokenizer}
+ return lexer, nil
+}
+
+/*
+NextWord returns the next word and an error value. If there are no more
+words, the error will be os.EOF. Comment tokens are silently skipped; an
+unrecognized token type causes a panic.
+*/
+func (l *Lexer) NextWord() (string, os.Error) {
+ var token *Token
+ var err os.Error
+ for {
+ token, err = l.tokenizer.NextToken()
+ if err != nil {
+ return "", err
+ }
+ switch token.tokenType {
+ case TOKEN_WORD:
+ {
+ return token.value, nil
+ }
+ case TOKEN_COMMENT:
+ {
+ // skip comments
+ }
+ default:
+ {
+ panic(fmt.Sprintf("Unknown token type: %v", token.tokenType))
+ }
+ }
+ }
+ return "", os.EOF // unreachable: the loop above only exits via return
+}
+
+/*
+Tokenizer turns an input stream into a sequence of typed tokens.
+*/
+type Tokenizer struct {
+ input *bufio.Reader
+ classifier *TokenClassifier
+}
+
+/*
+NewTokenizer creates a new tokenizer reading from r, using the default
+ASCII classifier.
+*/
+func NewTokenizer(r io.Reader) (*Tokenizer, os.Error) {
+ input := bufio.NewReader(r)
+ classifier := NewDefaultClassifier()
+ tokenizer := &Tokenizer{
+ input: input,
+ classifier: classifier}
+ return tokenizer, nil
+}
+
+/*
+scanStream scans the stream for the next token.
+
+This uses an internal state machine. It returns an error for a rune it does
+not know how to handle, and panics only on an unexpected internal state.
+*/
+func (t *Tokenizer) scanStream() (*Token, os.Error) {
+ state := STATE_START
+ var tokenType TokenType
+ value := make([]int, 0, INITIAL_TOKEN_CAPACITY)
+ var (
+ nextRune int
+ nextRuneType RuneTokenType
+ err os.Error
+ )
+SCAN:
+ for {
+ nextRune, _, err = t.input.ReadRune()
+ nextRuneType = t.classifier.ClassifyRune(nextRune) // classified before the err check; replaced below on EOF
+ if err != nil {
+ if err == os.EOF {
+ nextRuneType = RUNETOKEN_EOF
+ err = nil
+ } else {
+ return nil, err
+ }
+ }
+ switch state {
+ case STATE_START: // no runes read yet
+ {
+ switch nextRuneType {
+ case RUNETOKEN_EOF:
+ {
+ return nil, os.EOF
+ }
+ case RUNETOKEN_CHAR:
+ {
+ tokenType = TOKEN_WORD
+ value = append(value, nextRune)
+ state = STATE_INWORD
+ }
+ case RUNETOKEN_SPACE:
+ { // skip leading whitespace
+ }
+ case RUNETOKEN_ESCAPING_QUOTE:
+ {
+ tokenType = TOKEN_WORD
+ state = STATE_QUOTED_ESCAPING
+ }
+ case RUNETOKEN_NONESCAPING_QUOTE:
+ {
+ tokenType = TOKEN_WORD
+ state = STATE_QUOTED
+ }
+ case RUNETOKEN_ESCAPE:
+ {
+ tokenType = TOKEN_WORD
+ state = STATE_ESCAPING
+ }
+ case RUNETOKEN_COMMENT:
+ {
+ tokenType = TOKEN_COMMENT
+ state = STATE_COMMENT
+ }
+ default:
+ {
+ return nil, os.NewError(fmt.Sprintf("Unknown rune: %v", nextRune))
+ }
+ }
+ }
+ case STATE_INWORD: // in a regular word
+ {
+ switch nextRuneType {
+ case RUNETOKEN_EOF:
+ {
+ break SCAN
+ }
+ case RUNETOKEN_CHAR, RUNETOKEN_COMMENT:
+ {
+ value = append(value, nextRune)
+ }
+ case RUNETOKEN_SPACE:
+ {
+ t.input.UnreadRune()
+ break SCAN
+ }
+ case RUNETOKEN_ESCAPING_QUOTE:
+ {
+ state = STATE_QUOTED_ESCAPING
+ }
+ case RUNETOKEN_NONESCAPING_QUOTE:
+ {
+ state = STATE_QUOTED
+ }
+ case RUNETOKEN_ESCAPE:
+ {
+ state = STATE_ESCAPING
+ }
+ default:
+ {
+ return nil, os.NewError(fmt.Sprintf("Unknown rune: %v", nextRune))
+ }
+ }
+ }
+ case STATE_ESCAPING: // the next rune after an escape character
+ {
+ switch nextRuneType {
+ case RUNETOKEN_EOF:
+ {
+ err = os.NewError("EOF found after escape character")
+ break SCAN
+ }
+ case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
+ {
+ state = STATE_INWORD
+ value = append(value, nextRune)
+ }
+ default:
+ {
+ return nil, os.NewError(fmt.Sprintf("Unknown rune: %v", nextRune))
+ }
+ }
+ }
+ case STATE_ESCAPING_QUOTED: // the next rune after an escape character, in double quotes
+ {
+ switch nextRuneType {
+ case RUNETOKEN_EOF:
+ {
+ err = os.NewError("EOF found after escape character")
+ break SCAN
+ }
+ case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
+ {
+ state = STATE_QUOTED_ESCAPING
+ value = append(value, nextRune)
+ }
+ default:
+ {
+ return nil, os.NewError(fmt.Sprintf("Unknown rune: %v", nextRune))
+ }
+ }
+ }
+ case STATE_QUOTED_ESCAPING: // in escaping double quotes
+ {
+ switch nextRuneType {
+ case RUNETOKEN_EOF:
+ {
+ err = os.NewError("EOF found when expecting closing quote.")
+ break SCAN
+ }
+ case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_NONESCAPING_QUOTE, RUNETOKEN_COMMENT:
+ {
+ value = append(value, nextRune)
+ }
+ case RUNETOKEN_ESCAPING_QUOTE:
+ {
+ state = STATE_INWORD
+ }
+ case RUNETOKEN_ESCAPE:
+ {
+ state = STATE_ESCAPING_QUOTED
+ }
+ default:
+ {
+ return nil, os.NewError(fmt.Sprintf("Unknown rune: %v", nextRune))
+ }
+ }
+ }
+ case STATE_QUOTED: // in non-escaping single quotes
+ {
+ switch nextRuneType {
+ case RUNETOKEN_EOF:
+ {
+ err = os.NewError("EOF found when expecting closing quote.")
+ break SCAN
+ }
+ case RUNETOKEN_CHAR, RUNETOKEN_SPACE, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT:
+ {
+ value = append(value, nextRune)
+ }
+ case RUNETOKEN_NONESCAPING_QUOTE:
+ {
+ state = STATE_INWORD
+ }
+ default:
+ {
+ return nil, os.NewError(fmt.Sprintf("Unknown rune: %v", nextRune))
+ }
+ }
+ }
+ case STATE_COMMENT:
+ {
+ switch nextRuneType {
+ case RUNETOKEN_EOF:
+ {
+ break SCAN
+ }
+ case RUNETOKEN_CHAR, RUNETOKEN_ESCAPING_QUOTE, RUNETOKEN_ESCAPE, RUNETOKEN_COMMENT, RUNETOKEN_NONESCAPING_QUOTE:
+ {
+ value = append(value, nextRune)
+ }
+ case RUNETOKEN_SPACE:
+ {
+ if nextRune == '\n' {
+ state = STATE_START // dead store: SCAN exits before state is read again
+ break SCAN
+ } else {
+ value = append(value, nextRune)
+ }
+ }
+ default:
+ {
+ return nil, os.NewError(fmt.Sprintf("Unknown rune: %v", nextRune))
+ }
+ }
+ }
+ default:
+ {
+ panic(fmt.Sprintf("Unexpected state: %v", state))
+ }
+ }
+ }
+ token := &Token{
+ tokenType: tokenType,
+ value: string(value)}
+ return token, err
+}
+
+/*
+NextToken returns the next token in the stream, and an error value. If
+there are no more tokens available, the error value will be os.EOF.
+*/
+func (t *Tokenizer) NextToken() (*Token, os.Error) {
+ return t.scanStream()
+}
+
+/*
+Split divides a string into a slice of strings, based upon shell-style
+rules for quoting, escaping, and spaces.
+*/
+func Split(s string) ([]string, os.Error) {
+ l, err := NewLexer(strings.NewReader(s))
+ if err != nil {
+ return nil, err
+ }
+ subStrings := []string{}
+ for {
+ word, err := l.NextWord()
+ if err != nil {
+ if err == os.EOF {
+ return subStrings, nil
+ }
+ return subStrings, err
+ }
+ subStrings = append(subStrings, word)
+ }
+ return subStrings, nil // unreachable: the loop above only exits via return
+}
diff --git a/shlex_test.go b/shlex_test.go
new file mode 100644
index 0000000..22846f4
--- /dev/null
+++ b/shlex_test.go
@@ -0,0 +1,111 @@
+/*
+Copyright 2012 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package shlex
+
+import (
+ "os"
+ "strings"
+ "testing"
+)
+
+func checkError(err os.Error, t *testing.T) { // reports err as a test error when it is non-nil
+ if err != nil {
+ t.Error(err)
+ }
+}
+
+func TestClassifier(t *testing.T) { // verifies the default classifier's rune -> type mapping
+ classifier := NewDefaultClassifier()
+ runeTests := map[int]RuneTokenType{
+ 'a': RUNETOKEN_CHAR,
+ ' ': RUNETOKEN_SPACE,
+ '"': RUNETOKEN_ESCAPING_QUOTE,
+ '\'': RUNETOKEN_NONESCAPING_QUOTE,
+ '#': RUNETOKEN_COMMENT}
+ for rune, expectedType := range runeTests {
+ foundType := classifier.ClassifyRune(rune)
+ if foundType != expectedType {
+ t.Logf("Expected type: %v for rune '%c'(%v). Found type: %v.", expectedType, rune, rune, foundType)
+ t.Fail()
+ }
+ }
+}
+
+func TestTokenizer(t *testing.T) { // checks the full token stream: words, quotes, escapes, comments
+ testInput := strings.NewReader("one two \"three four\" \"five \\\"six\\\"\" seven#eight # nine # ten\n eleven")
+ expectedTokens := []*Token{
+ &Token{
+ tokenType: TOKEN_WORD,
+ value: "one"},
+ &Token{
+ tokenType: TOKEN_WORD,
+ value: "two"},
+ &Token{
+ tokenType: TOKEN_WORD,
+ value: "three four"},
+ &Token{
+ tokenType: TOKEN_WORD,
+ value: "five \"six\""},
+ &Token{
+ tokenType: TOKEN_WORD,
+ value: "seven#eight"},
+ &Token{
+ tokenType: TOKEN_COMMENT,
+ value: " nine # ten"},
+ &Token{
+ tokenType: TOKEN_WORD,
+ value: "eleven"}}
+
+ tokenizer, err := NewTokenizer(testInput)
+ checkError(err, t)
+ for _, expectedToken := range expectedTokens {
+ foundToken, err := tokenizer.NextToken()
+ checkError(err, t)
+ if !foundToken.Equal(expectedToken) {
+ t.Error("Expected token:", expectedToken, ". Found:", foundToken)
+ }
+ }
+}
+
+func TestLexer(t *testing.T) { // checks that the lexer yields a single word
+ testInput := strings.NewReader("one")
+ expectedWord := "one"
+ lexer, err := NewLexer(testInput)
+ checkError(err, t)
+ foundWord, err := lexer.NextWord()
+ checkError(err, t)
+ if expectedWord != foundWord {
+ t.Error("Expected word:", expectedWord, ". Found:", foundWord)
+ }
+}
+
+func TestSplit(t *testing.T) { // checks the convenience Split wrapper on plain words
+ testInput := "one two three"
+ expectedOutput := []string{"one", "two", "three"}
+ foundOutput, err := Split(testInput)
+ if err != nil {
+ t.Error("Split returned error:", err)
+ }
+ if len(expectedOutput) != len(foundOutput) {
+ t.Error("Split expected:", len(expectedOutput), "results. Found:", len(foundOutput), "results")
+ }
+ for i := range foundOutput {
+ if foundOutput[i] != expectedOutput[i] {
+ t.Error("Item:", i, "(", foundOutput[i], ") differs from the expected value:", expectedOutput[i])
+ }
+ }
+}