Replace regexes with participle parser

This CL replaces the regexes to find the library name and imports with a
proper parser based on the official FIDL grammar. It uses the participle
library to generate the parser.

Test: manually checked declaring new file alongside SDK library
Test: manually checked importing SDK libraries
Test: manually built and run docker container to check new go dependencies
Fixed: 86590
Change-Id: I0192909b48b5e74b3c5f16746fa3705ca23eebfc
Reviewed-on: https://fuchsia-review.googlesource.com/c/fidlbolt/+/594784
Reviewed-by: Yifei Teng <yifeit@google.com>
diff --git a/backend/analyze.go b/backend/analyze.go
index eb130a2..7b14eb0 100644
--- a/backend/analyze.go
+++ b/backend/analyze.go
@@ -5,14 +5,11 @@
 package main
 
 import (
-	"bytes"
 	"errors"
 	"fmt"
 	"io/ioutil"
 	"os"
 	"path/filepath"
-	"regexp"
-	"strconv"
 	"strings"
 	"sync"
 )
@@ -116,8 +113,8 @@
 // content. If any library cannot be located in a.paths, it is silently ignored.
 func (a *libraryAnalyzer) analyze(filename, content string) analysis {
 	result := analysis{libs: make(libraryMap)}
-	rootLibraryMatch, ok := parseLibraryMatch(content)
-	if !ok {
+	root, err := parseFidl(content)
+	if err != nil {
 		// If we can't parse the library name, there's no point in continuing.
 		// We could make this function return an error and report that to the
 		// user. However, we prefer to invoke fidlc anyway and show its error.
@@ -126,7 +123,7 @@
 		result.libs[""] = libraryInfo{files: []string{filename}}
 		return result
 	}
-	result.root = rootLibraryMatch.lib
+	result.root = root.Decl.Library
 
 	// Start with a stack containing the root library (if it's a platform source
 	// library, meaning we can expect to find it in the tree) and its imports.
@@ -134,9 +131,8 @@
 	if result.root.isPlatform() {
 		stack = append(stack, result.root)
 	}
-	rootImports := parsePlatformImportsMatch(content)
-	for _, m := range rootImports {
-		stack = append(stack, m.lib)
+	for _, using := range root.Using {
+		stack = append(stack, using.Library)
 	}
 
 	// Explore all dependencies via depth-first search.
@@ -180,12 +176,13 @@
 					if err != nil {
 						return err
 					}
-					declaredLib, ok := parseLibrary(b)
-					if !ok || declaredLib != lib {
+					parsed, err := parseFidl(string(b))
+					if err != nil || parsed.Decl.Library != lib {
 						return nil
 					}
 					info.files = append(info.files, rel)
-					for _, dep := range parsePlatformImports(b) {
+					for _, using := range parsed.Using {
+						dep := using.Library
 						if _, ok := seen[dep]; ok {
 							continue
 						}
@@ -212,18 +209,18 @@
 	//      if they were adding a new file to the fuchsia.io library, alongside
 	//      the other ones in the source tree.
 	rootInfo := libraryInfo{files: []string{filename}}
-	for _, m := range rootImports {
-		rootInfo.deps = append(rootInfo.deps, m.lib)
-		if info, ok := result.libs[m.lib]; ok {
-			text := fmt.Sprintf("Found %s in %s", m.lib, info.userVisibleDir())
-			result.annotations = append(result.annotations, m.annotation(Info, text))
+	for _, using := range root.Using {
+		rootInfo.deps = append(rootInfo.deps, using.Library)
+		if info, ok := result.libs[using.Library]; ok {
+			text := fmt.Sprintf("Found %s in %s", using.Library, info.userVisibleDir())
+			result.annotations = append(result.annotations, using.annotation(Info, text))
 		}
 	}
 	if info, ok := result.libs[result.root]; ok {
 		rootInfo = mergeInfo(rootInfo, info)
 		text := fmt.Sprintf("Found %s in %s\n\n%s",
 			result.root, info.userVisibleDir(), existingLibraryNotice)
-		result.annotations = append(result.annotations, rootLibraryMatch.annotation(Info, text))
+		result.annotations = append(result.annotations, root.Decl.annotation(Info, text))
 	}
 	result.libs[result.root] = rootInfo
 
@@ -391,133 +388,3 @@
 	}
 	return info
 }
-
-// TODO(fxbug.dev/86590): Stop relying on regexes.
-var (
-	// https://fuchsia.dev/fuchsia-src/development/languages/fidl/reference/language.md#identifiers
-	fidlIdentifierPattern = `[a-zA-Z](?:[a-zA-Z0-9_]*[a-zA-Z0-9])?`
-
-	// Works most of the time, but breaks if there is an attribute with a string
-	// argument that contains ")".
-	libraryRegexp = regexp.MustCompile(`` +
-		`^(?:` +
-		/* space     */ `\s` +
-		/* comment   */ `|//[^\n]*\n|` +
-		/* attribute */ `|@\s*` + fidlIdentifierPattern + `\s*` +
-		/* arguments */ `(?:\([^)]*\))?` +
-		`)*` +
-		`library\s+(` +
-		fidlIdentifierPattern +
-		`(?:\.` + fidlIdentifierPattern + `)*` +
-		`)\s*;`)
-
-	// Works most of the time, but breaks if there is a string literal whose
-	// value looks like an import.
-	platformImportRegexp = regexp.MustCompile(`` +
-		`\busing\s+(` +
-		`zx|` +
-		`(?:fidl|fuchsia|test)` +
-		`(?:\.` + fidlIdentifierPattern + `)+` +
-		`)` +
-		`(?:\s+as\s+` + fidlIdentifierPattern + `)?` +
-		`\s*;`)
-)
-
-type libraryMatch struct {
-	lib          library
-	line, column int
-}
-
-func parseLibrary(fidl []byte) (library, bool) {
-	m := libraryRegexp.FindSubmatch(fidl)
-	if m == nil {
-		return "", false
-	}
-	return library(m[1]), true
-}
-
-func parseLibraryMatch(fidl string) (libraryMatch, bool) {
-	m := libraryRegexp.FindStringSubmatchIndex(fidl)
-	if m == nil {
-		return libraryMatch{}, false
-	}
-	return toLibraryMatch(fidl, m[2], m[3]), true
-}
-
-func parsePlatformImports(fidl []byte) []library {
-	var libs []library
-	for _, m := range platformImportRegexp.FindAllSubmatchIndex(fidl, -1) {
-		// Make sure the line isn't commented. Technically this breaks if the
-		// line contains a string const with "//" in it. This isn't a big deal
-		// because imports are supposed to come before all other declarations.
-		i := bytes.LastIndexByte(fidl[:m[2]], '\n')
-		if i == -1 || !bytes.Contains(fidl[i:m[2]], []byte("//")) {
-			libs = append(libs, library(fidl[m[2]:m[3]]))
-		}
-	}
-	return libs
-}
-
-func parsePlatformImportsMatch(fidl string) []libraryMatch {
-	var libs []libraryMatch
-	for _, m := range platformImportRegexp.FindAllStringSubmatchIndex(fidl, -1) {
-		// See the comment in parsePlatformImports.
-		i := strings.LastIndexByte(fidl[:m[2]], '\n')
-		if i == -1 || !strings.Contains(fidl[i:m[2]], "//") {
-			libs = append(libs, toLibraryMatch(fidl, m[2], m[3]))
-		}
-	}
-	return libs
-}
-
-func toLibraryMatch(fidl string, start, end int) libraryMatch {
-	return libraryMatch{
-		lib:    library(fidl[start:end]),
-		line:   strings.Count(fidl[:start], "\n") + 1,
-		column: start - strings.LastIndexByte(fidl[:start], '\n'),
-	}
-}
-
-func (m *libraryMatch) annotation(kind annotationKind, text string) annotation {
-	return annotation{
-		Line:   m.line,
-		Column: m.column,
-		Kind:   kind,
-		Text:   text,
-	}
-}
-
-var fidlAnnotationRegexp = regexp.MustCompile(`` +
-	// Multi-line mode: ^ and $ match begin/end of line OR text.
-	`(?m)` +
-	`^/.*/fidlbolt\.fidl:(\d+):(\d+).*?` +
-	`(` +
-	string(Info) + `|` +
-	string(Warning) + `|` +
-	string(Error) + `)` +
-	`:\s*(.*)$`)
-
-// parseFidlAnnotations parses a list of annotations from error output. It
-// assumes that the output refers to a file called "fidlbolt.fidl".
-// TODO: Use fidlc --format=json instead of regex parsing.
-func parseFidlAnnotations(output string) []annotation {
-	matches := fidlAnnotationRegexp.FindAllStringSubmatch(output, -1)
-	var anns []annotation
-	for _, m := range matches {
-		line, err := strconv.Atoi(m[1])
-		if err != nil {
-			continue
-		}
-		column, err := strconv.Atoi(m[2])
-		if err != nil {
-			continue
-		}
-		anns = append(anns, annotation{
-			Line:   line,
-			Column: column,
-			Kind:   annotationKind(m[3]),
-			Text:   m[4],
-		})
-	}
-	return anns
-}
diff --git a/backend/go.mod b/backend/go.mod
index 58624d4..0ad91a1 100644
--- a/backend/go.mod
+++ b/backend/go.mod
@@ -1,3 +1,8 @@
 module fidlbolt
 
 go 1.17
+
+require (
+	github.com/alecthomas/participle/v2 v2.0.0-alpha7
+	github.com/google/go-cmp v0.5.6
+)
diff --git a/backend/go.sum b/backend/go.sum
new file mode 100644
index 0000000..243dc77
--- /dev/null
+++ b/backend/go.sum
@@ -0,0 +1,21 @@
+github.com/alecthomas/participle v0.7.1 h1:2bN7reTw//5f0cugJcTOnY/NYZcWQOaajW+BwZB5xWs=
+github.com/alecthomas/participle v0.7.1/go.mod h1:HfdmEuwvr12HXQN44HPWXR0lHmVolVYe4dyL6lQ3duY=
+github.com/alecthomas/participle/v2 v2.0.0-alpha7 h1:cK4vjj0VSgb3lN1nuKA5F7dw+1s1pWBe5bx7nNCnN+c=
+github.com/alecthomas/participle/v2 v2.0.0-alpha7/go.mod h1:NumScqsC42o9x+dGj8/YqsIfhrIQjFEOFovxotbBirA=
+github.com/alecthomas/repr v0.0.0-20181024024818-d37bc2a10ba1 h1:GDQdwm/gAcJcLAKQQZGOJ4knlw+7rfEQQcmwTbt4p5E=
+github.com/alecthomas/repr v0.0.0-20181024024818-d37bc2a10ba1/go.mod h1:xTS7Pm1pD1mvyM075QCDSRqH6qRLXylzS24ZTpRiSzQ=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/google/go-cmp v0.5.6 h1:BKbKCqvP6I+rmFHt06ZmyQtvB8xAkWdhFyr0ZUNZcxQ=
+github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
+github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
+gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
diff --git a/backend/parser.go b/backend/parser.go
new file mode 100644
index 0000000..a82f745
--- /dev/null
+++ b/backend/parser.go
@@ -0,0 +1,180 @@
+// Copyright 2021 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package main
+
+import (
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/alecthomas/participle/v2"
+	"github.com/alecthomas/participle/v2/lexer"
+)
+
+// This file is based on the official FIDL grammar:
+// https://fuchsia.dev/fuchsia-src/reference/fidl/language/grammar
+//
+// It uses the participle parser library:
+// https://github.com/alecthomas/participle
+//
+// A quick primer on participle:
+//
+// - It has separate lexing and parsing stages.
+// - Lexer tokens are defined by regexes.
+// - Productions are defined by structs with tags on each field.
+// - The tags get concatenated (e.g. parens can be unbalanced within a tag).
+// - Tags have the format `<exp1> <exp2> ... <expN>`.
+// - Expression `"foo"` matches the literal "foo".
+// - Expression `Foo` matches a Foo token.
+// - Expression `@<exp>` captures <exp> into the field.
+// - Expression `@@` recursively captures based on the field's type.
+// - Tags for slice types can have multiple captures.
+// - Boolean fields become true if a capture occurred.
+// - Parentheses, `?`, `+`, `*`, `|` behave as expected.
+
+var (
+	fidlLexer = lexer.MustSimple([]lexer.Rule{
+		{Name: "Whitespace", Pattern: `[ \n\r\t]`},
+		{Name: "Comment", Pattern: `//[^\n]*`},
+		{Name: "Punctuation", Pattern: "[.,;@=()]"},
+		{Name: "Keyword", Pattern: `library|using|as`},
+		{Name: "Bool", Pattern: `true|false`},
+		{Name: "String", Pattern: `"(\\.|[^"\\])*"`},
+		// This pattern matches the FIDL lexer, which defers more precise
+		// numeric parsing to the flat AST stage.
+		{Name: "Number", Pattern: `[0-9][0-9a-fA-FxX._\-]*`},
+		// https://fuchsia.dev/fuchsia-src/reference/fidl/language/language?hl=en#identifiers
+		{Name: "Identifier", Pattern: `[a-zA-Z]([a-zA-Z0-9_]*[a-zA-Z0-9])?`},
+		// Define a fallback token so that the lexer never fails and we can
+		// ignore the rest of the file (see the Rest field in fidlTopLevel).
+		{Name: "Other", Pattern: `.*`},
+	})
+
+	fidlParser = participle.MustBuild(
+		&fidlTopLevel{},
+		participle.Lexer(fidlLexer),
+		participle.Elide("Whitespace", "Comment"),
+	)
+)
+
+type fidlTopLevel struct {
+	Header fidlLibraryHeader `@@`
+	Using  []fidlUsing       `@@*`
+	// Ignore the rest of the file by matching 0+ tokens of any kind. Rather
+	// than list every token type, we say ~"" (anything not equal to "").
+	Rest bool `(@~"")*`
+}
+
+type fidlLibraryHeader struct {
+	Attributes []fidlAttribute        `@@*`
+	Library    fidlCompoundIdentifier `"library" @@ ";"`
+}
+
+type fidlUsing struct {
+	Library fidlCompoundIdentifier `"using" @@`
+	Alias   bool                   `("as" @Identifier)? ";"`
+}
+
+type fidlAttribute struct {
+	Name bool               `"@" @Identifier`
+	Args []fidlAttributeArg `("(" @@ ("," @@)* ")")?`
+}
+
+type fidlAttributeArg struct {
+	Name  bool         `(@Identifier "=")?`
+	Value fidlConstant `@@`
+}
+
+type fidlConstant struct {
+	Identifier *fidlCompoundIdentifier `@@`
+	Literal    bool                    `| @String | @Number | @Bool`
+}
+
+type fidlCompoundIdentifier struct {
+	// Note: The field must be named "Pos" for participle to recognize it.
+	Pos   lexer.Position
+	Parts []string `@Identifier ("." @Identifier)*`
+}
+
+// Note: fields exported for cmp.Diff.
+type parsedFidl struct {
+	Decl  parsedLibrary
+	Using []parsedLibrary
+}
+
+// Note: fields exported for cmp.Diff.
+type parsedLibrary struct {
+	Library      library
+	Line, Column int
+}
+
+func parseFidl(input string) (parsedFidl, error) {
+	var (
+		top      fidlTopLevel
+		parsed   parsedFidl
+		filename = ""
+	)
+	err := fidlParser.ParseString(filename, input, &top)
+	if err != nil {
+		return parsed, err
+	}
+	parsed.Decl = toParsedLibrary(top.Header.Library)
+	for _, using := range top.Using {
+		parsed.Using = append(parsed.Using, toParsedLibrary(using.Library))
+	}
+	return parsed, nil
+}
+
+func toParsedLibrary(ident fidlCompoundIdentifier) parsedLibrary {
+	return parsedLibrary{
+		Library: library(strings.Join(ident.Parts, ".")),
+		Line:    ident.Pos.Line,
+		Column:  ident.Pos.Column,
+	}
+}
+
+func (m *parsedLibrary) annotation(kind annotationKind, text string) annotation {
+	return annotation{
+		Line:   m.Line,
+		Column: m.Column,
+		Kind:   kind,
+		Text:   text,
+	}
+}
+
+var fidlAnnotationRegexp = regexp.MustCompile(`` +
+	// Multi-line mode: ^ and $ match begin/end of line OR text.
+	`(?m)` +
+	`^/.*/fidlbolt\.fidl:(\d+):(\d+).*?` +
+	`(` +
+	string(Info) + `|` +
+	string(Warning) + `|` +
+	string(Error) + `)` +
+	`:\s*(.*)$`)
+
+// parseFidlAnnotations parses a list of annotations from error output. It
+// assumes that the output refers to a file called "fidlbolt.fidl".
+// TODO: Use fidlc --format=json instead of regex parsing.
+func parseFidlAnnotations(output string) []annotation {
+	matches := fidlAnnotationRegexp.FindAllStringSubmatch(output, -1)
+	var anns []annotation
+	for _, m := range matches {
+		line, err := strconv.Atoi(m[1])
+		if err != nil {
+			continue
+		}
+		column, err := strconv.Atoi(m[2])
+		if err != nil {
+			continue
+		}
+		anns = append(anns, annotation{
+			Line:   line,
+			Column: column,
+			Kind:   annotationKind(m[3]),
+			Text:   m[4],
+		})
+	}
+	return anns
+}
diff --git a/backend/parser_test.go b/backend/parser_test.go
new file mode 100644
index 0000000..2848aa1
--- /dev/null
+++ b/backend/parser_test.go
@@ -0,0 +1,150 @@
+// Copyright 2021 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package main
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func TestParseFidlGood(t *testing.T) {
+	tests := []struct {
+		input    string
+		expected parsedFidl
+	}{
+		{
+			input: `library x;`,
+			expected: parsedFidl{
+				Decl: parsedLibrary{"x", 1, 9},
+			},
+		},
+		{
+			input: `library foo.bar.baz;`,
+			expected: parsedFidl{
+				Decl: parsedLibrary{"foo.bar.baz", 1, 9},
+			},
+		},
+		{
+			// Note: fidlc rejects this, but it's not worth the effort here.
+			input: `library Foo_Bar;`,
+			expected: parsedFidl{
+				Decl: parsedLibrary{"Foo_Bar", 1, 9},
+			},
+		},
+		{
+			// Note: confirmed that fidlc accepts this.
+			input: ` library  foo . bar ; `,
+			expected: parsedFidl{
+				Decl: parsedLibrary{"foo.bar", 1, 11},
+			},
+		},
+		{
+			input: `@attr library foo.bar;`,
+			expected: parsedFidl{
+				Decl: parsedLibrary{"foo.bar", 1, 15},
+			},
+		},
+		{
+			input: `@attr(1) library foo.bar;`,
+			expected: parsedFidl{
+				Decl: parsedLibrary{"foo.bar", 1, 18},
+			},
+		},
+		{
+			input: `@attr(name=value) library foo.bar;`,
+			expected: parsedFidl{
+				Decl: parsedLibrary{"foo.bar", 1, 27},
+			},
+		},
+		{
+			input: `library foo.bar; const FOO int32 = 1 | 2;`,
+			expected: parsedFidl{
+				Decl: parsedLibrary{"foo.bar", 1, 9},
+			},
+		},
+		{
+			input: `library foo.bar; We ignore the rest of the file!`,
+			expected: parsedFidl{
+				Decl: parsedLibrary{"foo.bar", 1, 9},
+			},
+		},
+		{
+			input: `
+// Some comments
+library
+// Some comments
+foo.bar;
+			`,
+			expected: parsedFidl{
+				Decl: parsedLibrary{"foo.bar", 5, 1},
+			},
+		},
+		{
+			input: `
+library foo;
+using bar;
+using baz.qux;
+`,
+			expected: parsedFidl{
+				Decl: parsedLibrary{"foo", 2, 9},
+				Using: []parsedLibrary{
+					{"bar", 3, 7},
+					{"baz.qux", 4, 7},
+				},
+			},
+		},
+		{
+			input: `
+// library foo;
+@attr("library foo")
+@attr("\\\"")
+@attr (
+	a="using a;",
+	// using b;
+	c="using c;")
+library foo;
+// using d;
+using
+//
+first; using second ;
+We ignore the rest of the file.
+Even if it has 𝔰𝔱𝔯𝔞𝔫𝔤𝔢 characters.
+`,
+			expected: parsedFidl{
+				Decl: parsedLibrary{"foo", 9, 9},
+				Using: []parsedLibrary{
+					{"first", 13, 1},
+					{"second", 13, 14},
+				},
+			},
+		},
+	}
+	for _, tc := range tests {
+		actual, err := parseFidl(tc.input)
+		if err != nil {
+			t.Errorf("input:\n%s\n\nfailed: %s", tc.input, err)
+		} else if diff := cmp.Diff(tc.expected, actual); diff != "" {
+			t.Errorf("input:\n%s\n\ndiff (-want +got):\n%s", tc.input, diff)
+		}
+	}
+}
+
+func TestParseFidlBad(t *testing.T) {
+	tests := []string{
+		``,
+		`library`,
+		`library foo`,
+		`using bar;`,
+		`using bar; library foo;`,
+		`// library foo;`,
+	}
+	for _, input := range tests {
+		actual, err := parseFidl(input)
+		if err == nil {
+			t.Errorf("input:\n%s\n\nunexpectedly succeeded. output:\n%+v", input, actual)
+		}
+	}
+}