| // Copyright 2011 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package syntax |
| |
| // Note to implementers: |
| // In this package, re is always a *Regexp and r is always a rune. |
| |
| import ( |
| "strconv" |
| "strings" |
| "unicode" |
| ) |
| |
| // A Regexp is a node in a regular expression syntax tree. |
| type Regexp struct { |
| Op Op // operator |
| Flags Flags |
| Sub []*Regexp // subexpressions, if any |
| Sub0 [1]*Regexp // storage for short Sub |
| Rune []rune // matched runes, for OpLiteral, OpCharClass |
| Rune0 [2]rune // storage for short Rune |
| Min, Max int // min, max for OpRepeat |
| Cap int // capturing index, for OpCapture |
| Name string // capturing name, for OpCapture |
| } |
| |
| //go:generate stringer -type Op -trimprefix Op |
| |
| // An Op is a single regular expression operator. |
| type Op uint8 |
| |
| // Operators are listed in precedence order, tightest binding to weakest. |
| // Character class operators are listed simplest to most complex |
| // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). |
| |
| const ( |
| OpNoMatch Op = 1 + iota // matches no strings |
| OpEmptyMatch // matches empty string |
| OpLiteral // matches Runes sequence |
| OpCharClass // matches Runes interpreted as range pair list |
| OpAnyCharNotNL // matches any character except newline |
| OpAnyChar // matches any character |
| OpBeginLine // matches empty string at beginning of line |
| OpEndLine // matches empty string at end of line |
| OpBeginText // matches empty string at beginning of text |
| OpEndText // matches empty string at end of text |
| OpWordBoundary // matches word boundary `\b` |
| OpNoWordBoundary // matches word non-boundary `\B` |
| OpCapture // capturing subexpression with index Cap, optional name Name |
| OpStar // matches Sub[0] zero or more times |
| OpPlus // matches Sub[0] one or more times |
| OpQuest // matches Sub[0] zero or one times |
| OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) |
| OpConcat // matches concatenation of Subs |
| OpAlternate // matches alternation of Subs |
| ) |
| |
| const opPseudo Op = 128 // where pseudo-ops start |
| |
| // Equal reports whether x and y have identical structure. |
| func (x *Regexp) Equal(y *Regexp) bool { |
| if x == nil || y == nil { |
| return x == y |
| } |
| if x.Op != y.Op { |
| return false |
| } |
| switch x.Op { |
| case OpEndText: |
| // The parse flags remember whether this is \z or \Z. |
| if x.Flags&WasDollar != y.Flags&WasDollar { |
| return false |
| } |
| |
| case OpLiteral, OpCharClass: |
| if len(x.Rune) != len(y.Rune) { |
| return false |
| } |
| for i, r := range x.Rune { |
| if r != y.Rune[i] { |
| return false |
| } |
| } |
| |
| case OpAlternate, OpConcat: |
| if len(x.Sub) != len(y.Sub) { |
| return false |
| } |
| for i, sub := range x.Sub { |
| if !sub.Equal(y.Sub[i]) { |
| return false |
| } |
| } |
| |
| case OpStar, OpPlus, OpQuest: |
| if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { |
| return false |
| } |
| |
| case OpRepeat: |
| if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { |
| return false |
| } |
| |
| case OpCapture: |
| if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { |
| return false |
| } |
| } |
| return true |
| } |
| |
| // printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp. |
| type printFlags uint8 |
| |
| const ( |
| flagI printFlags = 1 << iota // (?i: |
| flagM // (?m: |
| flagS // (?s: |
| flagOff // ) |
| flagPrec // (?: ) |
| negShift = 5 // flagI<<negShift is (?-i: |
| ) |
| |
| // addSpan enables the flags f around start..last, |
| // by setting flags[start] = f and flags[last] = flagOff. |
| func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) { |
| if *flags == nil { |
| *flags = make(map[*Regexp]printFlags) |
| } |
| (*flags)[start] = f |
| (*flags)[last] |= flagOff // maybe start==last |
| } |
| |
| // calcFlags calculates the flags to print around each subexpression in re, |
| // storing that information in (*flags)[sub] for each affected subexpression. |
| // The first time an entry needs to be written to *flags, calcFlags allocates the map. |
| // calcFlags also calculates the flags that must be active or can't be active |
| // around re and returns those flags. |
| func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) { |
| switch re.Op { |
| default: |
| return 0, 0 |
| |
| case OpLiteral: |
| // If literal is fold-sensitive, return (flagI, 0) or (0, flagI) |
| // according to whether (?i) is active. |
| // If literal is not fold-sensitive, return 0, 0. |
| for _, r := range re.Rune { |
| if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r { |
| if re.Flags&FoldCase != 0 { |
| return flagI, 0 |
| } else { |
| return 0, flagI |
| } |
| } |
| } |
| return 0, 0 |
| |
| case OpCharClass: |
| // If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out. |
| // If literal is not fold-sensitive, return 0, 0. |
| for i := 0; i < len(re.Rune); i += 2 { |
| lo := max(minFold, re.Rune[i]) |
| hi := min(maxFold, re.Rune[i+1]) |
| for r := lo; r <= hi; r++ { |
| for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) { |
| if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) { |
| return 0, flagI |
| } |
| } |
| } |
| } |
| return 0, 0 |
| |
| case OpAnyCharNotNL: // (?-s). |
| return 0, flagS |
| |
| case OpAnyChar: // (?s). |
| return flagS, 0 |
| |
| case OpBeginLine, OpEndLine: // (?m)^ (?m)$ |
| return flagM, 0 |
| |
| case OpEndText: |
| if re.Flags&WasDollar != 0 { // (?-m)$ |
| return 0, flagM |
| } |
| return 0, 0 |
| |
| case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat: |
| return calcFlags(re.Sub[0], flags) |
| |
| case OpConcat, OpAlternate: |
| // Gather the must and cant for each subexpression. |
| // When we find a conflicting subexpression, insert the necessary |
| // flags around the previously identified span and start over. |
| var must, cant, allCant printFlags |
| start := 0 |
| last := 0 |
| did := false |
| for i, sub := range re.Sub { |
| subMust, subCant := calcFlags(sub, flags) |
| if must&subCant != 0 || subMust&cant != 0 { |
| if must != 0 { |
| addSpan(re.Sub[start], re.Sub[last], must, flags) |
| } |
| must = 0 |
| cant = 0 |
| start = i |
| did = true |
| } |
| must |= subMust |
| cant |= subCant |
| allCant |= subCant |
| if subMust != 0 { |
| last = i |
| } |
| if must == 0 && start == i { |
| start++ |
| } |
| } |
| if !did { |
| // No conflicts: pass the accumulated must and cant upward. |
| return must, cant |
| } |
| if must != 0 { |
| // Conflicts found; need to finish final span. |
| addSpan(re.Sub[start], re.Sub[last], must, flags) |
| } |
| return 0, allCant |
| } |
| } |
| |
| // writeRegexp writes the Perl syntax for the regular expression re to b. |
| func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) { |
| f |= flags[re] |
| if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 { |
| // flagPrec is redundant with other flags being added and terminated |
| f &^= flagPrec |
| } |
| if f&^(flagOff|flagPrec) != 0 { |
| b.WriteString(`(?`) |
| if f&flagI != 0 { |
| b.WriteString(`i`) |
| } |
| if f&flagM != 0 { |
| b.WriteString(`m`) |
| } |
| if f&flagS != 0 { |
| b.WriteString(`s`) |
| } |
| if f&((flagM|flagS)<<negShift) != 0 { |
| b.WriteString(`-`) |
| if f&(flagM<<negShift) != 0 { |
| b.WriteString(`m`) |
| } |
| if f&(flagS<<negShift) != 0 { |
| b.WriteString(`s`) |
| } |
| } |
| b.WriteString(`:`) |
| } |
| if f&flagOff != 0 { |
| defer b.WriteString(`)`) |
| } |
| if f&flagPrec != 0 { |
| b.WriteString(`(?:`) |
| defer b.WriteString(`)`) |
| } |
| |
| switch re.Op { |
| default: |
| b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") |
| case OpNoMatch: |
| b.WriteString(`[^\x00-\x{10FFFF}]`) |
| case OpEmptyMatch: |
| b.WriteString(`(?:)`) |
| case OpLiteral: |
| for _, r := range re.Rune { |
| escape(b, r, false) |
| } |
| case OpCharClass: |
| if len(re.Rune)%2 != 0 { |
| b.WriteString(`[invalid char class]`) |
| break |
| } |
| b.WriteRune('[') |
| if len(re.Rune) == 0 { |
| b.WriteString(`^\x00-\x{10FFFF}`) |
| } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 { |
| // Contains 0 and MaxRune. Probably a negated class. |
| // Print the gaps. |
| b.WriteRune('^') |
| for i := 1; i < len(re.Rune)-1; i += 2 { |
| lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 |
| escape(b, lo, lo == '-') |
| if lo != hi { |
| if hi != lo+1 { |
| b.WriteRune('-') |
| } |
| escape(b, hi, hi == '-') |
| } |
| } |
| } else { |
| for i := 0; i < len(re.Rune); i += 2 { |
| lo, hi := re.Rune[i], re.Rune[i+1] |
| escape(b, lo, lo == '-') |
| if lo != hi { |
| if hi != lo+1 { |
| b.WriteRune('-') |
| } |
| escape(b, hi, hi == '-') |
| } |
| } |
| } |
| b.WriteRune(']') |
| case OpAnyCharNotNL, OpAnyChar: |
| b.WriteString(`.`) |
| case OpBeginLine: |
| b.WriteString(`^`) |
| case OpEndLine: |
| b.WriteString(`$`) |
| case OpBeginText: |
| b.WriteString(`\A`) |
| case OpEndText: |
| if re.Flags&WasDollar != 0 { |
| b.WriteString(`$`) |
| } else { |
| b.WriteString(`\z`) |
| } |
| case OpWordBoundary: |
| b.WriteString(`\b`) |
| case OpNoWordBoundary: |
| b.WriteString(`\B`) |
| case OpCapture: |
| if re.Name != "" { |
| b.WriteString(`(?P<`) |
| b.WriteString(re.Name) |
| b.WriteRune('>') |
| } else { |
| b.WriteRune('(') |
| } |
| if re.Sub[0].Op != OpEmptyMatch { |
| writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags) |
| } |
| b.WriteRune(')') |
| case OpStar, OpPlus, OpQuest, OpRepeat: |
| p := printFlags(0) |
| sub := re.Sub[0] |
| if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { |
| p = flagPrec |
| } |
| writeRegexp(b, sub, p, flags) |
| |
| switch re.Op { |
| case OpStar: |
| b.WriteRune('*') |
| case OpPlus: |
| b.WriteRune('+') |
| case OpQuest: |
| b.WriteRune('?') |
| case OpRepeat: |
| b.WriteRune('{') |
| b.WriteString(strconv.Itoa(re.Min)) |
| if re.Max != re.Min { |
| b.WriteRune(',') |
| if re.Max >= 0 { |
| b.WriteString(strconv.Itoa(re.Max)) |
| } |
| } |
| b.WriteRune('}') |
| } |
| if re.Flags&NonGreedy != 0 { |
| b.WriteRune('?') |
| } |
| case OpConcat: |
| for _, sub := range re.Sub { |
| p := printFlags(0) |
| if sub.Op == OpAlternate { |
| p = flagPrec |
| } |
| writeRegexp(b, sub, p, flags) |
| } |
| case OpAlternate: |
| for i, sub := range re.Sub { |
| if i > 0 { |
| b.WriteRune('|') |
| } |
| writeRegexp(b, sub, 0, flags) |
| } |
| } |
| } |
| |
| func (re *Regexp) String() string { |
| var b strings.Builder |
| var flags map[*Regexp]printFlags |
| must, cant := calcFlags(re, &flags) |
| must |= (cant &^ flagI) << negShift |
| if must != 0 { |
| must |= flagOff |
| } |
| writeRegexp(&b, re, must, flags) |
| return b.String() |
| } |
| |
| const meta = `\.+*?()|[]{}^$` |
| |
| func escape(b *strings.Builder, r rune, force bool) { |
| if unicode.IsPrint(r) { |
| if strings.ContainsRune(meta, r) || force { |
| b.WriteRune('\\') |
| } |
| b.WriteRune(r) |
| return |
| } |
| |
| switch r { |
| case '\a': |
| b.WriteString(`\a`) |
| case '\f': |
| b.WriteString(`\f`) |
| case '\n': |
| b.WriteString(`\n`) |
| case '\r': |
| b.WriteString(`\r`) |
| case '\t': |
| b.WriteString(`\t`) |
| case '\v': |
| b.WriteString(`\v`) |
| default: |
| if r < 0x100 { |
| b.WriteString(`\x`) |
| s := strconv.FormatInt(int64(r), 16) |
| if len(s) == 1 { |
| b.WriteRune('0') |
| } |
| b.WriteString(s) |
| break |
| } |
| b.WriteString(`\x{`) |
| b.WriteString(strconv.FormatInt(int64(r), 16)) |
| b.WriteString(`}`) |
| } |
| } |
| |
| // MaxCap walks the regexp to find the maximum capture index. |
| func (re *Regexp) MaxCap() int { |
| m := 0 |
| if re.Op == OpCapture { |
| m = re.Cap |
| } |
| for _, sub := range re.Sub { |
| if n := sub.MaxCap(); m < n { |
| m = n |
| } |
| } |
| return m |
| } |
| |
| // CapNames walks the regexp to find the names of capturing groups. |
| func (re *Regexp) CapNames() []string { |
| names := make([]string, re.MaxCap()+1) |
| re.capNames(names) |
| return names |
| } |
| |
| func (re *Regexp) capNames(names []string) { |
| if re.Op == OpCapture { |
| names[re.Cap] = re.Name |
| } |
| for _, sub := range re.Sub { |
| sub.capNames(names) |
| } |
| } |