| // Copyright 2011 The Go Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| |
| package syntax |
| |
| import ( |
| "sort" |
| "strings" |
| "unicode" |
| "unicode/utf8" |
| ) |
| |
| // An Error describes a failure to parse a regular expression |
| // and gives the offending expression. |
| type Error struct { |
| Code ErrorCode |
| Expr string |
| } |
| |
| func (e *Error) Error() string { |
| return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`" |
| } |
| |
| // An ErrorCode describes a failure to parse a regular expression. |
| type ErrorCode string |
| |
| const ( |
| // Unexpected error |
| ErrInternalError ErrorCode = "regexp/syntax: internal error" |
| |
| // Parse errors |
| ErrInvalidCharClass ErrorCode = "invalid character class" |
| ErrInvalidCharRange ErrorCode = "invalid character class range" |
| ErrInvalidEscape ErrorCode = "invalid escape sequence" |
| ErrInvalidNamedCapture ErrorCode = "invalid named capture" |
| ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax" |
| ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator" |
| ErrInvalidRepeatSize ErrorCode = "invalid repeat count" |
| ErrInvalidUTF8 ErrorCode = "invalid UTF-8" |
| ErrMissingBracket ErrorCode = "missing closing ]" |
| ErrMissingParen ErrorCode = "missing closing )" |
| ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" |
| ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression" |
| ErrUnexpectedParen ErrorCode = "unexpected )" |
| ErrNestingDepth ErrorCode = "expression nests too deeply" |
| ErrLarge ErrorCode = "expression too large" |
| ) |
| |
| func (e ErrorCode) String() string { |
| return string(e) |
| } |
| |
| // Flags control the behavior of the parser and record information about regexp context. |
| type Flags uint16 |
| |
| const ( |
| FoldCase Flags = 1 << iota // case-insensitive match |
| Literal // treat pattern as literal string |
| ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline |
| DotNL // allow . to match newline |
| OneLine // treat ^ and $ as only matching at beginning and end of text |
| NonGreedy // make repetition operators default to non-greedy |
| PerlX // allow Perl extensions |
| UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation |
| WasDollar // regexp OpEndText was $, not \z |
| Simple // regexp contains no counted repetition |
| |
| MatchNL = ClassNL | DotNL |
| |
| Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible |
| POSIX Flags = 0 // POSIX syntax |
| ) |
| |
| // Pseudo-ops for parsing stack. |
| const ( |
| opLeftParen = opPseudo + iota |
| opVerticalBar |
| ) |
| |
| // maxHeight is the maximum height of a regexp parse tree. |
| // It is somewhat arbitrarily chosen, but the idea is to be large enough |
| // that no one will actually hit in real use but at the same time small enough |
| // that recursion on the Regexp tree will not hit the 1GB Go stack limit. |
| // The maximum amount of stack for a single recursive frame is probably |
| // closer to 1kB, so this could potentially be raised, but it seems unlikely |
| // that people have regexps nested even this deeply. |
| // We ran a test on Google's C++ code base and turned up only |
| // a single use case with depth > 100; it had depth 128. |
| // Using depth 1000 should be plenty of margin. |
| // As an optimization, we don't even bother calculating heights |
| // until we've allocated at least maxHeight Regexp structures. |
| const maxHeight = 1000 |
| |
| // maxSize is the maximum size of a compiled regexp in Insts. |
| // It too is somewhat arbitrarily chosen, but the idea is to be large enough |
| // to allow significant regexps while at the same time small enough that |
| // the compiled form will not take up too much memory. |
| // 128 MB is enough for a 3.3 million Inst structures, which roughly |
| // corresponds to a 3.3 MB regexp. |
| const ( |
| maxSize = 128 << 20 / instSize |
| instSize = 5 * 8 // byte, 2 uint32, slice is 5 64-bit words |
| ) |
| |
| // maxRunes is the maximum number of runes allowed in a regexp tree |
| // counting the runes in all the nodes. |
| // Ignoring character classes p.numRunes is always less than the length of the regexp. |
| // Character classes can make it much larger: each \pL adds 1292 runes. |
| // 128 MB is enough for 32M runes, which is over 26k \pL instances. |
| // Note that repetitions do not make copies of the rune slices, |
| // so \pL{1000} is only one rune slice, not 1000. |
| // We could keep a cache of character classes we've seen, |
| // so that all the \pL we see use the same rune list, |
| // but that doesn't remove the problem entirely: |
| // consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()]. |
| // And because the Rune slice is exposed directly in the Regexp, |
| // there is not an opportunity to change the representation to allow |
| // partial sharing between different character classes. |
| // So the limit is the best we can do. |
| const ( |
| maxRunes = 128 << 20 / runeSize |
| runeSize = 4 // rune is int32 |
| ) |
| |
| type parser struct { |
| flags Flags // parse mode flags |
| stack []*Regexp // stack of parsed expressions |
| free *Regexp |
| numCap int // number of capturing groups seen |
| wholeRegexp string |
| tmpClass []rune // temporary char class work space |
| numRegexp int // number of regexps allocated |
| numRunes int // number of runes in char classes |
| repeats int64 // product of all repetitions seen |
| height map[*Regexp]int // regexp height, for height limit check |
| size map[*Regexp]int64 // regexp compiled size, for size limit check |
| } |
| |
| func (p *parser) newRegexp(op Op) *Regexp { |
| re := p.free |
| if re != nil { |
| p.free = re.Sub0[0] |
| *re = Regexp{} |
| } else { |
| re = new(Regexp) |
| p.numRegexp++ |
| } |
| re.Op = op |
| return re |
| } |
| |
| func (p *parser) reuse(re *Regexp) { |
| if p.height != nil { |
| delete(p.height, re) |
| } |
| re.Sub0[0] = p.free |
| p.free = re |
| } |
| |
| func (p *parser) checkLimits(re *Regexp) { |
| if p.numRunes > maxRunes { |
| panic(ErrLarge) |
| } |
| p.checkSize(re) |
| p.checkHeight(re) |
| } |
| |
| func (p *parser) checkSize(re *Regexp) { |
| if p.size == nil { |
| // We haven't started tracking size yet. |
| // Do a relatively cheap check to see if we need to start. |
| // Maintain the product of all the repeats we've seen |
| // and don't track if the total number of regexp nodes |
| // we've seen times the repeat product is in budget. |
| if p.repeats == 0 { |
| p.repeats = 1 |
| } |
| if re.Op == OpRepeat { |
| n := re.Max |
| if n == -1 { |
| n = re.Min |
| } |
| if n <= 0 { |
| n = 1 |
| } |
| if int64(n) > maxSize/p.repeats { |
| p.repeats = maxSize |
| } else { |
| p.repeats *= int64(n) |
| } |
| } |
| if int64(p.numRegexp) < maxSize/p.repeats { |
| return |
| } |
| |
| // We need to start tracking size. |
| // Make the map and belatedly populate it |
| // with info about everything we've constructed so far. |
| p.size = make(map[*Regexp]int64) |
| for _, re := range p.stack { |
| p.checkSize(re) |
| } |
| } |
| |
| if p.calcSize(re, true) > maxSize { |
| panic(ErrLarge) |
| } |
| } |
| |
| func (p *parser) calcSize(re *Regexp, force bool) int64 { |
| if !force { |
| if size, ok := p.size[re]; ok { |
| return size |
| } |
| } |
| |
| var size int64 |
| switch re.Op { |
| case OpLiteral: |
| size = int64(len(re.Rune)) |
| case OpCapture, OpStar: |
| // star can be 1+ or 2+; assume 2 pessimistically |
| size = 2 + p.calcSize(re.Sub[0], false) |
| case OpPlus, OpQuest: |
| size = 1 + p.calcSize(re.Sub[0], false) |
| case OpConcat: |
| for _, sub := range re.Sub { |
| size += p.calcSize(sub, false) |
| } |
| case OpAlternate: |
| for _, sub := range re.Sub { |
| size += p.calcSize(sub, false) |
| } |
| if len(re.Sub) > 1 { |
| size += int64(len(re.Sub)) - 1 |
| } |
| case OpRepeat: |
| sub := p.calcSize(re.Sub[0], false) |
| if re.Max == -1 { |
| if re.Min == 0 { |
| size = 2 + sub // x* |
| } else { |
| size = 1 + int64(re.Min)*sub // xxx+ |
| } |
| break |
| } |
| // x{2,5} = xx(x(x(x)?)?)? |
| size = int64(re.Max)*sub + int64(re.Max-re.Min) |
| } |
| |
| if size < 1 { |
| size = 1 |
| } |
| p.size[re] = size |
| return size |
| } |
| |
| func (p *parser) checkHeight(re *Regexp) { |
| if p.numRegexp < maxHeight { |
| return |
| } |
| if p.height == nil { |
| p.height = make(map[*Regexp]int) |
| for _, re := range p.stack { |
| p.checkHeight(re) |
| } |
| } |
| if p.calcHeight(re, true) > maxHeight { |
| panic(ErrNestingDepth) |
| } |
| } |
| |
| func (p *parser) calcHeight(re *Regexp, force bool) int { |
| if !force { |
| if h, ok := p.height[re]; ok { |
| return h |
| } |
| } |
| h := 1 |
| for _, sub := range re.Sub { |
| hsub := p.calcHeight(sub, false) |
| if h < 1+hsub { |
| h = 1 + hsub |
| } |
| } |
| p.height[re] = h |
| return h |
| } |
| |
| // Parse stack manipulation. |
| |
| // push pushes the regexp re onto the parse stack and returns the regexp. |
| func (p *parser) push(re *Regexp) *Regexp { |
| p.numRunes += len(re.Rune) |
| if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { |
| // Single rune. |
| if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { |
| return nil |
| } |
| re.Op = OpLiteral |
| re.Rune = re.Rune[:1] |
| re.Flags = p.flags &^ FoldCase |
| } else if re.Op == OpCharClass && len(re.Rune) == 4 && |
| re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && |
| unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && |
| unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || |
| re.Op == OpCharClass && len(re.Rune) == 2 && |
| re.Rune[0]+1 == re.Rune[1] && |
| unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && |
| unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { |
| // Case-insensitive rune like [Aa] or [Δδ]. |
| if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { |
| return nil |
| } |
| |
| // Rewrite as (case-insensitive) literal. |
| re.Op = OpLiteral |
| re.Rune = re.Rune[:1] |
| re.Flags = p.flags | FoldCase |
| } else { |
| // Incremental concatenation. |
| p.maybeConcat(-1, 0) |
| } |
| |
| p.stack = append(p.stack, re) |
| p.checkLimits(re) |
| return re |
| } |
| |
| // maybeConcat implements incremental concatenation |
| // of literal runes into string nodes. The parser calls this |
| // before each push, so only the top fragment of the stack |
| // might need processing. Since this is called before a push, |
| // the topmost literal is no longer subject to operators like * |
| // (Otherwise ab* would turn into (ab)*.) |
| // If r >= 0 and there's a node left over, maybeConcat uses it |
| // to push r with the given flags. |
| // maybeConcat reports whether r was pushed. |
| func (p *parser) maybeConcat(r rune, flags Flags) bool { |
| n := len(p.stack) |
| if n < 2 { |
| return false |
| } |
| |
| re1 := p.stack[n-1] |
| re2 := p.stack[n-2] |
| if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase { |
| return false |
| } |
| |
| // Push re1 into re2. |
| re2.Rune = append(re2.Rune, re1.Rune...) |
| |
| // Reuse re1 if possible. |
| if r >= 0 { |
| re1.Rune = re1.Rune0[:1] |
| re1.Rune[0] = r |
| re1.Flags = flags |
| return true |
| } |
| |
| p.stack = p.stack[:n-1] |
| p.reuse(re1) |
| return false // did not push r |
| } |
| |
| // literal pushes a literal regexp for the rune r on the stack. |
| func (p *parser) literal(r rune) { |
| re := p.newRegexp(OpLiteral) |
| re.Flags = p.flags |
| if p.flags&FoldCase != 0 { |
| r = minFoldRune(r) |
| } |
| re.Rune0[0] = r |
| re.Rune = re.Rune0[:1] |
| p.push(re) |
| } |
| |
| // minFoldRune returns the minimum rune fold-equivalent to r. |
| func minFoldRune(r rune) rune { |
| if r < minFold || r > maxFold { |
| return r |
| } |
| m := r |
| r0 := r |
| for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) { |
| m = min(m, r) |
| } |
| return m |
| } |
| |
| // op pushes a regexp with the given op onto the stack |
| // and returns that regexp. |
| func (p *parser) op(op Op) *Regexp { |
| re := p.newRegexp(op) |
| re.Flags = p.flags |
| return p.push(re) |
| } |
| |
| // repeat replaces the top stack element with itself repeated according to op, min, max. |
| // before is the regexp suffix starting at the repetition operator. |
| // after is the regexp suffix following after the repetition operator. |
| // repeat returns an updated 'after' and an error, if any. |
| func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) { |
| flags := p.flags |
| if p.flags&PerlX != 0 { |
| if len(after) > 0 && after[0] == '?' { |
| after = after[1:] |
| flags ^= NonGreedy |
| } |
| if lastRepeat != "" { |
| // In Perl it is not allowed to stack repetition operators: |
| // a** is a syntax error, not a doubled star, and a++ means |
| // something else entirely, which we don't support! |
| return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]} |
| } |
| } |
| n := len(p.stack) |
| if n == 0 { |
| return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} |
| } |
| sub := p.stack[n-1] |
| if sub.Op >= opPseudo { |
| return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} |
| } |
| |
| re := p.newRegexp(op) |
| re.Min = min |
| re.Max = max |
| re.Flags = flags |
| re.Sub = re.Sub0[:1] |
| re.Sub[0] = sub |
| p.stack[n-1] = re |
| p.checkLimits(re) |
| |
| if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) { |
| return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} |
| } |
| |
| return after, nil |
| } |
| |
| // repeatIsValid reports whether the repetition re is valid. |
| // Valid means that the combination of the top-level repetition |
| // and any inner repetitions does not exceed n copies of the |
| // innermost thing. |
| // This function rewalks the regexp tree and is called for every repetition, |
| // so we have to worry about inducing quadratic behavior in the parser. |
| // We avoid this by only calling repeatIsValid when min or max >= 2. |
| // In that case the depth of any >= 2 nesting can only get to 9 without |
| // triggering a parse error, so each subtree can only be rewalked 9 times. |
| func repeatIsValid(re *Regexp, n int) bool { |
| if re.Op == OpRepeat { |
| m := re.Max |
| if m == 0 { |
| return true |
| } |
| if m < 0 { |
| m = re.Min |
| } |
| if m > n { |
| return false |
| } |
| if m > 0 { |
| n /= m |
| } |
| } |
| for _, sub := range re.Sub { |
| if !repeatIsValid(sub, n) { |
| return false |
| } |
| } |
| return true |
| } |
| |
| // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. |
| func (p *parser) concat() *Regexp { |
| p.maybeConcat(-1, 0) |
| |
| // Scan down to find pseudo-operator | or (. |
| i := len(p.stack) |
| for i > 0 && p.stack[i-1].Op < opPseudo { |
| i-- |
| } |
| subs := p.stack[i:] |
| p.stack = p.stack[:i] |
| |
| // Empty concatenation is special case. |
| if len(subs) == 0 { |
| return p.push(p.newRegexp(OpEmptyMatch)) |
| } |
| |
| return p.push(p.collapse(subs, OpConcat)) |
| } |
| |
| // alternate replaces the top of the stack (above the topmost '(') with its alternation. |
| func (p *parser) alternate() *Regexp { |
| // Scan down to find pseudo-operator (. |
| // There are no | above (. |
| i := len(p.stack) |
| for i > 0 && p.stack[i-1].Op < opPseudo { |
| i-- |
| } |
| subs := p.stack[i:] |
| p.stack = p.stack[:i] |
| |
| // Make sure top class is clean. |
| // All the others already are (see swapVerticalBar). |
| if len(subs) > 0 { |
| cleanAlt(subs[len(subs)-1]) |
| } |
| |
| // Empty alternate is special case |
| // (shouldn't happen but easy to handle). |
| if len(subs) == 0 { |
| return p.push(p.newRegexp(OpNoMatch)) |
| } |
| |
| return p.push(p.collapse(subs, OpAlternate)) |
| } |
| |
| // cleanAlt cleans re for eventual inclusion in an alternation. |
| func cleanAlt(re *Regexp) { |
| switch re.Op { |
| case OpCharClass: |
| re.Rune = cleanClass(&re.Rune) |
| if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune { |
| re.Rune = nil |
| re.Op = OpAnyChar |
| return |
| } |
| if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune { |
| re.Rune = nil |
| re.Op = OpAnyCharNotNL |
| return |
| } |
| if cap(re.Rune)-len(re.Rune) > 100 { |
| // re.Rune will not grow any more. |
| // Make a copy or inline to reclaim storage. |
| re.Rune = append(re.Rune0[:0], re.Rune...) |
| } |
| } |
| } |
| |
| // collapse returns the result of applying op to sub. |
| // If sub contains op nodes, they all get hoisted up |
| // so that there is never a concat of a concat or an |
| // alternate of an alternate. |
| func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { |
| if len(subs) == 1 { |
| return subs[0] |
| } |
| re := p.newRegexp(op) |
| re.Sub = re.Sub0[:0] |
| for _, sub := range subs { |
| if sub.Op == op { |
| re.Sub = append(re.Sub, sub.Sub...) |
| p.reuse(sub) |
| } else { |
| re.Sub = append(re.Sub, sub) |
| } |
| } |
| if op == OpAlternate { |
| re.Sub = p.factor(re.Sub) |
| if len(re.Sub) == 1 { |
| old := re |
| re = re.Sub[0] |
| p.reuse(old) |
| } |
| } |
| return re |
| } |
| |
| // factor factors common prefixes from the alternation list sub. |
| // It returns a replacement list that reuses the same storage and |
| // frees (passes to p.reuse) any removed *Regexps. |
| // |
| // For example, |
| // |
| // ABC|ABD|AEF|BCX|BCY |
| // |
| // simplifies by literal prefix extraction to |
| // |
| // A(B(C|D)|EF)|BC(X|Y) |
| // |
| // which simplifies by character class introduction to |
| // |
| // A(B[CD]|EF)|BC[XY] |
| func (p *parser) factor(sub []*Regexp) []*Regexp { |
| if len(sub) < 2 { |
| return sub |
| } |
| |
| // Round 1: Factor out common literal prefixes. |
| var str []rune |
| var strflags Flags |
| start := 0 |
| out := sub[:0] |
| for i := 0; i <= len(sub); i++ { |
| // Invariant: the Regexps that were in sub[0:start] have been |
| // used or marked for reuse, and the slice space has been reused |
| // for out (len(out) <= start). |
| // |
| // Invariant: sub[start:i] consists of regexps that all begin |
| // with str as modified by strflags. |
| var istr []rune |
| var iflags Flags |
| if i < len(sub) { |
| istr, iflags = p.leadingString(sub[i]) |
| if iflags == strflags { |
| same := 0 |
| for same < len(str) && same < len(istr) && str[same] == istr[same] { |
| same++ |
| } |
| if same > 0 { |
| // Matches at least one rune in current range. |
| // Keep going around. |
| str = str[:same] |
| continue |
| } |
| } |
| } |
| |
| // Found end of a run with common leading literal string: |
| // sub[start:i] all begin with str[0:len(str)], but sub[i] |
| // does not even begin with str[0]. |
| // |
| // Factor out common string and append factored expression to out. |
| if i == start { |
| // Nothing to do - run of length 0. |
| } else if i == start+1 { |
| // Just one: don't bother factoring. |
| out = append(out, sub[start]) |
| } else { |
| // Construct factored form: prefix(suffix1|suffix2|...) |
| prefix := p.newRegexp(OpLiteral) |
| prefix.Flags = strflags |
| prefix.Rune = append(prefix.Rune[:0], str...) |
| |
| for j := start; j < i; j++ { |
| sub[j] = p.removeLeadingString(sub[j], len(str)) |
| p.checkLimits(sub[j]) |
| } |
| suffix := p.collapse(sub[start:i], OpAlternate) // recurse |
| |
| re := p.newRegexp(OpConcat) |
| re.Sub = append(re.Sub[:0], prefix, suffix) |
| out = append(out, re) |
| } |
| |
| // Prepare for next iteration. |
| start = i |
| str = istr |
| strflags = iflags |
| } |
| sub = out |
| |
| // Round 2: Factor out common simple prefixes, |
| // just the first piece of each concatenation. |
| // This will be good enough a lot of the time. |
| // |
| // Complex subexpressions (e.g. involving quantifiers) |
| // are not safe to factor because that collapses their |
| // distinct paths through the automaton, which affects |
| // correctness in some cases. |
| start = 0 |
| out = sub[:0] |
| var first *Regexp |
| for i := 0; i <= len(sub); i++ { |
| // Invariant: the Regexps that were in sub[0:start] have been |
| // used or marked for reuse, and the slice space has been reused |
| // for out (len(out) <= start). |
| // |
| // Invariant: sub[start:i] consists of regexps that all begin with ifirst. |
| var ifirst *Regexp |
| if i < len(sub) { |
| ifirst = p.leadingRegexp(sub[i]) |
| if first != nil && first.Equal(ifirst) && |
| // first must be a character class OR a fixed repeat of a character class. |
| (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) { |
| continue |
| } |
| } |
| |
| // Found end of a run with common leading regexp: |
| // sub[start:i] all begin with first but sub[i] does not. |
| // |
| // Factor out common regexp and append factored expression to out. |
| if i == start { |
| // Nothing to do - run of length 0. |
| } else if i == start+1 { |
| // Just one: don't bother factoring. |
| out = append(out, sub[start]) |
| } else { |
| // Construct factored form: prefix(suffix1|suffix2|...) |
| prefix := first |
| for j := start; j < i; j++ { |
| reuse := j != start // prefix came from sub[start] |
| sub[j] = p.removeLeadingRegexp(sub[j], reuse) |
| p.checkLimits(sub[j]) |
| } |
| suffix := p.collapse(sub[start:i], OpAlternate) // recurse |
| |
| re := p.newRegexp(OpConcat) |
| re.Sub = append(re.Sub[:0], prefix, suffix) |
| out = append(out, re) |
| } |
| |
| // Prepare for next iteration. |
| start = i |
| first = ifirst |
| } |
| sub = out |
| |
| // Round 3: Collapse runs of single literals into character classes. |
| start = 0 |
| out = sub[:0] |
| for i := 0; i <= len(sub); i++ { |
| // Invariant: the Regexps that were in sub[0:start] have been |
| // used or marked for reuse, and the slice space has been reused |
| // for out (len(out) <= start). |
| // |
| // Invariant: sub[start:i] consists of regexps that are either |
| // literal runes or character classes. |
| if i < len(sub) && isCharClass(sub[i]) { |
| continue |
| } |
| |
| // sub[i] is not a char or char class; |
| // emit char class for sub[start:i]... |
| if i == start { |
| // Nothing to do - run of length 0. |
| } else if i == start+1 { |
| out = append(out, sub[start]) |
| } else { |
| // Make new char class. |
| // Start with most complex regexp in sub[start]. |
| max := start |
| for j := start + 1; j < i; j++ { |
| if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { |
| max = j |
| } |
| } |
| sub[start], sub[max] = sub[max], sub[start] |
| |
| for j := start + 1; j < i; j++ { |
| mergeCharClass(sub[start], sub[j]) |
| p.reuse(sub[j]) |
| } |
| cleanAlt(sub[start]) |
| out = append(out, sub[start]) |
| } |
| |
| // ... and then emit sub[i]. |
| if i < len(sub) { |
| out = append(out, sub[i]) |
| } |
| start = i + 1 |
| } |
| sub = out |
| |
| // Round 4: Collapse runs of empty matches into a single empty match. |
| start = 0 |
| out = sub[:0] |
| for i := range sub { |
| if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { |
| continue |
| } |
| out = append(out, sub[i]) |
| } |
| sub = out |
| |
| return sub |
| } |
| |
| // leadingString returns the leading literal string that re begins with. |
| // The string refers to storage in re or its children. |
| func (p *parser) leadingString(re *Regexp) ([]rune, Flags) { |
| if re.Op == OpConcat && len(re.Sub) > 0 { |
| re = re.Sub[0] |
| } |
| if re.Op != OpLiteral { |
| return nil, 0 |
| } |
| return re.Rune, re.Flags & FoldCase |
| } |
| |
| // removeLeadingString removes the first n leading runes |
| // from the beginning of re. It returns the replacement for re. |
| func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { |
| if re.Op == OpConcat && len(re.Sub) > 0 { |
| // Removing a leading string in a concatenation |
| // might simplify the concatenation. |
| sub := re.Sub[0] |
| sub = p.removeLeadingString(sub, n) |
| re.Sub[0] = sub |
| if sub.Op == OpEmptyMatch { |
| p.reuse(sub) |
| switch len(re.Sub) { |
| case 0, 1: |
| // Impossible but handle. |
| re.Op = OpEmptyMatch |
| re.Sub = nil |
| case 2: |
| old := re |
| re = re.Sub[1] |
| p.reuse(old) |
| default: |
| copy(re.Sub, re.Sub[1:]) |
| re.Sub = re.Sub[:len(re.Sub)-1] |
| } |
| } |
| return re |
| } |
| |
| if re.Op == OpLiteral { |
| re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] |
| if len(re.Rune) == 0 { |
| re.Op = OpEmptyMatch |
| } |
| } |
| return re |
| } |
| |
| // leadingRegexp returns the leading regexp that re begins with. |
| // The regexp refers to storage in re or its children. |
| func (p *parser) leadingRegexp(re *Regexp) *Regexp { |
| if re.Op == OpEmptyMatch { |
| return nil |
| } |
| if re.Op == OpConcat && len(re.Sub) > 0 { |
| sub := re.Sub[0] |
| if sub.Op == OpEmptyMatch { |
| return nil |
| } |
| return sub |
| } |
| return re |
| } |
| |
| // removeLeadingRegexp removes the leading regexp in re. |
| // It returns the replacement for re. |
| // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. |
| func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { |
| if re.Op == OpConcat && len(re.Sub) > 0 { |
| if reuse { |
| p.reuse(re.Sub[0]) |
| } |
| re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])] |
| switch len(re.Sub) { |
| case 0: |
| re.Op = OpEmptyMatch |
| re.Sub = nil |
| case 1: |
| old := re |
| re = re.Sub[0] |
| p.reuse(old) |
| } |
| return re |
| } |
| if reuse { |
| p.reuse(re) |
| } |
| return p.newRegexp(OpEmptyMatch) |
| } |
| |
| func literalRegexp(s string, flags Flags) *Regexp { |
| re := &Regexp{Op: OpLiteral} |
| re.Flags = flags |
| re.Rune = re.Rune0[:0] // use local storage for small strings |
| for _, c := range s { |
| if len(re.Rune) >= cap(re.Rune) { |
| // string is too long to fit in Rune0. let Go handle it |
| re.Rune = []rune(s) |
| break |
| } |
| re.Rune = append(re.Rune, c) |
| } |
| return re |
| } |
| |
| // Parsing. |
| |
| // Parse parses a regular expression string s, controlled by the specified |
| // Flags, and returns a regular expression parse tree. The syntax is |
| // described in the top-level comment. |
| func Parse(s string, flags Flags) (*Regexp, error) { |
| return parse(s, flags) |
| } |
| |
| func parse(s string, flags Flags) (_ *Regexp, err error) { |
| defer func() { |
| switch r := recover(); r { |
| default: |
| panic(r) |
| case nil: |
| // ok |
| case ErrLarge: // too big |
| err = &Error{Code: ErrLarge, Expr: s} |
| case ErrNestingDepth: |
| err = &Error{Code: ErrNestingDepth, Expr: s} |
| } |
| }() |
| |
| if flags&Literal != 0 { |
| // Trivial parser for literal string. |
| if err := checkUTF8(s); err != nil { |
| return nil, err |
| } |
| return literalRegexp(s, flags), nil |
| } |
| |
| // Otherwise, must do real work. |
| var ( |
| p parser |
| c rune |
| op Op |
| lastRepeat string |
| ) |
| p.flags = flags |
| p.wholeRegexp = s |
| t := s |
| for t != "" { |
| repeat := "" |
| BigSwitch: |
| switch t[0] { |
| default: |
| if c, t, err = nextRune(t); err != nil { |
| return nil, err |
| } |
| p.literal(c) |
| |
| case '(': |
| if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' { |
| // Flag changes and non-capturing groups. |
| if t, err = p.parsePerlFlags(t); err != nil { |
| return nil, err |
| } |
| break |
| } |
| p.numCap++ |
| p.op(opLeftParen).Cap = p.numCap |
| t = t[1:] |
| case '|': |
| if err = p.parseVerticalBar(); err != nil { |
| return nil, err |
| } |
| t = t[1:] |
| case ')': |
| if err = p.parseRightParen(); err != nil { |
| return nil, err |
| } |
| t = t[1:] |
| case '^': |
| if p.flags&OneLine != 0 { |
| p.op(OpBeginText) |
| } else { |
| p.op(OpBeginLine) |
| } |
| t = t[1:] |
| case '$': |
| if p.flags&OneLine != 0 { |
| p.op(OpEndText).Flags |= WasDollar |
| } else { |
| p.op(OpEndLine) |
| } |
| t = t[1:] |
| case '.': |
| if p.flags&DotNL != 0 { |
| p.op(OpAnyChar) |
| } else { |
| p.op(OpAnyCharNotNL) |
| } |
| t = t[1:] |
| case '[': |
| if t, err = p.parseClass(t); err != nil { |
| return nil, err |
| } |
| case '*', '+', '?': |
| before := t |
| switch t[0] { |
| case '*': |
| op = OpStar |
| case '+': |
| op = OpPlus |
| case '?': |
| op = OpQuest |
| } |
| after := t[1:] |
| if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil { |
| return nil, err |
| } |
| repeat = before |
| t = after |
| case '{': |
| op = OpRepeat |
| before := t |
| min, max, after, ok := p.parseRepeat(t) |
| if !ok { |
| // If the repeat cannot be parsed, { is a literal. |
| p.literal('{') |
| t = t[1:] |
| break |
| } |
| if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max { |
| // Numbers were too big, or max is present and min > max. |
| return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} |
| } |
| if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil { |
| return nil, err |
| } |
| repeat = before |
| t = after |
| case '\\': |
| if p.flags&PerlX != 0 && len(t) >= 2 { |
| switch t[1] { |
| case 'A': |
| p.op(OpBeginText) |
| t = t[2:] |
| break BigSwitch |
| case 'b': |
| p.op(OpWordBoundary) |
| t = t[2:] |
| break BigSwitch |
| case 'B': |
| p.op(OpNoWordBoundary) |
| t = t[2:] |
| break BigSwitch |
| case 'C': |
| // any byte; not supported |
| return nil, &Error{ErrInvalidEscape, t[:2]} |
| case 'Q': |
| // \Q ... \E: the ... is always literals |
| var lit string |
| lit, t, _ = strings.Cut(t[2:], `\E`) |
| for lit != "" { |
| c, rest, err := nextRune(lit) |
| if err != nil { |
| return nil, err |
| } |
| p.literal(c) |
| lit = rest |
| } |
| break BigSwitch |
| case 'z': |
| p.op(OpEndText) |
| t = t[2:] |
| break BigSwitch |
| } |
| } |
| |
| re := p.newRegexp(OpCharClass) |
| re.Flags = p.flags |
| |
| // Look for Unicode character group like \p{Han} |
| if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') { |
| r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0]) |
| if err != nil { |
| return nil, err |
| } |
| if r != nil { |
| re.Rune = r |
| t = rest |
| p.push(re) |
| break BigSwitch |
| } |
| } |
| |
| // Perl character class escape. |
| if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil { |
| re.Rune = r |
| t = rest |
| p.push(re) |
| break BigSwitch |
| } |
| p.reuse(re) |
| |
| // Ordinary single-character escape. |
| if c, t, err = p.parseEscape(t); err != nil { |
| return nil, err |
| } |
| p.literal(c) |
| } |
| lastRepeat = repeat |
| } |
| |
| p.concat() |
| if p.swapVerticalBar() { |
| // pop vertical bar |
| p.stack = p.stack[:len(p.stack)-1] |
| } |
| p.alternate() |
| |
| n := len(p.stack) |
| if n != 1 { |
| return nil, &Error{ErrMissingParen, s} |
| } |
| return p.stack[0], nil |
| } |
| |
| // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. |
| // If s is not of that form, it returns ok == false. |
| // If s has the right form but the values are too big, it returns min == -1, ok == true. |
| func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) { |
| if s == "" || s[0] != '{' { |
| return |
| } |
| s = s[1:] |
| var ok1 bool |
| if min, s, ok1 = p.parseInt(s); !ok1 { |
| return |
| } |
| if s == "" { |
| return |
| } |
| if s[0] != ',' { |
| max = min |
| } else { |
| s = s[1:] |
| if s == "" { |
| return |
| } |
| if s[0] == '}' { |
| max = -1 |
| } else if max, s, ok1 = p.parseInt(s); !ok1 { |
| return |
| } else if max < 0 { |
| // parseInt found too big a number |
| min = -1 |
| } |
| } |
| if s == "" || s[0] != '}' { |
| return |
| } |
| rest = s[1:] |
| ok = true |
| return |
| } |
| |
| // parsePerlFlags parses a Perl flag setting or non-capturing group or both, |
| // like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state. |
| // The caller must have ensured that s begins with "(?". |
| func (p *parser) parsePerlFlags(s string) (rest string, err error) { |
| t := s |
| |
| // Check for named captures, first introduced in Python's regexp library. |
| // As usual, there are three slightly different syntaxes: |
| // |
| // (?P<name>expr) the original, introduced by Python |
| // (?<name>expr) the .NET alteration, adopted by Perl 5.10 |
| // (?'name'expr) another .NET alteration, adopted by Perl 5.10 |
| // |
| // Perl 5.10 gave in and implemented the Python version too, |
| // but they claim that the last two are the preferred forms. |
| // PCRE and languages based on it (specifically, PHP and Ruby) |
| // support all three as well. EcmaScript 4 uses only the Python form. |
| // |
| // In both the open source world (via Code Search) and the |
| // Google source tree, (?P<expr>name) and (?<expr>name) are the |
| // dominant forms of named captures and both are supported. |
| startsWithP := len(t) > 4 && t[2] == 'P' && t[3] == '<' |
| startsWithName := len(t) > 3 && t[2] == '<' |
| |
| if startsWithP || startsWithName { |
| // position of expr start |
| exprStartPos := 4 |
| if startsWithName { |
| exprStartPos = 3 |
| } |
| |
| // Pull out name. |
| end := strings.IndexRune(t, '>') |
| if end < 0 { |
| if err = checkUTF8(t); err != nil { |
| return "", err |
| } |
| return "", &Error{ErrInvalidNamedCapture, s} |
| } |
| |
| capture := t[:end+1] // "(?P<name>" or "(?<name>" |
| name := t[exprStartPos:end] // "name" |
| if err = checkUTF8(name); err != nil { |
| return "", err |
| } |
| if !isValidCaptureName(name) { |
| return "", &Error{ErrInvalidNamedCapture, capture} |
| } |
| |
| // Like ordinary capture, but named. |
| p.numCap++ |
| re := p.op(opLeftParen) |
| re.Cap = p.numCap |
| re.Name = name |
| return t[end+1:], nil |
| } |
| |
| // Non-capturing group. Might also twiddle Perl flags. |
| var c rune |
| t = t[2:] // skip (? |
| flags := p.flags |
| sign := +1 |
| sawFlag := false |
| Loop: |
| for t != "" { |
| if c, t, err = nextRune(t); err != nil { |
| return "", err |
| } |
| switch c { |
| default: |
| break Loop |
| |
| // Flags. |
| case 'i': |
| flags |= FoldCase |
| sawFlag = true |
| case 'm': |
| flags &^= OneLine |
| sawFlag = true |
| case 's': |
| flags |= DotNL |
| sawFlag = true |
| case 'U': |
| flags |= NonGreedy |
| sawFlag = true |
| |
| // Switch to negation. |
| case '-': |
| if sign < 0 { |
| break Loop |
| } |
| sign = -1 |
| // Invert flags so that | above turn into &^ and vice versa. |
| // We'll invert flags again before using it below. |
| flags = ^flags |
| sawFlag = false |
| |
| // End of flags, starting group or not. |
| case ':', ')': |
| if sign < 0 { |
| if !sawFlag { |
| break Loop |
| } |
| flags = ^flags |
| } |
| if c == ':' { |
| // Open new group |
| p.op(opLeftParen) |
| } |
| p.flags = flags |
| return t, nil |
| } |
| } |
| |
| return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]} |
| } |
| |
| // isValidCaptureName reports whether name |
| // is a valid capture name: [A-Za-z0-9_]+. |
| // PCRE limits names to 32 bytes. |
| // Python rejects names starting with digits. |
| // We don't enforce either of those. |
| func isValidCaptureName(name string) bool { |
| if name == "" { |
| return false |
| } |
| for _, c := range name { |
| if c != '_' && !isalnum(c) { |
| return false |
| } |
| } |
| return true |
| } |
| |
| // parseInt parses a decimal integer. |
| func (p *parser) parseInt(s string) (n int, rest string, ok bool) { |
| if s == "" || s[0] < '0' || '9' < s[0] { |
| return |
| } |
| // Disallow leading zeros. |
| if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' { |
| return |
| } |
| t := s |
| for s != "" && '0' <= s[0] && s[0] <= '9' { |
| s = s[1:] |
| } |
| rest = s |
| ok = true |
| // Have digits, compute value. |
| t = t[:len(t)-len(s)] |
| for i := 0; i < len(t); i++ { |
| // Avoid overflow. |
| if n >= 1e8 { |
| n = -1 |
| break |
| } |
| n = n*10 + int(t[i]) - '0' |
| } |
| return |
| } |
| |
| // can this be represented as a character class? |
| // single-rune literal string, char class, ., and .|\n. |
| func isCharClass(re *Regexp) bool { |
| return re.Op == OpLiteral && len(re.Rune) == 1 || |
| re.Op == OpCharClass || |
| re.Op == OpAnyCharNotNL || |
| re.Op == OpAnyChar |
| } |
| |
| // does re match r? |
| func matchRune(re *Regexp, r rune) bool { |
| switch re.Op { |
| case OpLiteral: |
| return len(re.Rune) == 1 && re.Rune[0] == r |
| case OpCharClass: |
| for i := 0; i < len(re.Rune); i += 2 { |
| if re.Rune[i] <= r && r <= re.Rune[i+1] { |
| return true |
| } |
| } |
| return false |
| case OpAnyCharNotNL: |
| return r != '\n' |
| case OpAnyChar: |
| return true |
| } |
| return false |
| } |
| |
| // parseVerticalBar handles a | in the input. |
| func (p *parser) parseVerticalBar() error { |
| p.concat() |
| |
| // The concatenation we just parsed is on top of the stack. |
| // If it sits above an opVerticalBar, swap it below |
| // (things below an opVerticalBar become an alternation). |
| // Otherwise, push a new vertical bar. |
| if !p.swapVerticalBar() { |
| p.op(opVerticalBar) |
| } |
| |
| return nil |
| } |
| |
| // mergeCharClass makes dst = dst|src. |
| // The caller must ensure that dst.Op >= src.Op, |
| // to reduce the amount of copying. |
| func mergeCharClass(dst, src *Regexp) { |
| switch dst.Op { |
| case OpAnyChar: |
| // src doesn't add anything. |
| case OpAnyCharNotNL: |
| // src might add \n |
| if matchRune(src, '\n') { |
| dst.Op = OpAnyChar |
| } |
| case OpCharClass: |
| // src is simpler, so either literal or char class |
| if src.Op == OpLiteral { |
| dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) |
| } else { |
| dst.Rune = appendClass(dst.Rune, src.Rune) |
| } |
| case OpLiteral: |
| // both literal |
| if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags { |
| break |
| } |
| dst.Op = OpCharClass |
| dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags) |
| dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) |
| } |
| } |
| |
| // If the top of the stack is an element followed by an opVerticalBar |
| // swapVerticalBar swaps the two and returns true. |
| // Otherwise it returns false. |
| func (p *parser) swapVerticalBar() bool { |
| // If above and below vertical bar are literal or char class, |
| // can merge into a single char class. |
| n := len(p.stack) |
| if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) { |
| re1 := p.stack[n-1] |
| re3 := p.stack[n-3] |
| // Make re3 the more complex of the two. |
| if re1.Op > re3.Op { |
| re1, re3 = re3, re1 |
| p.stack[n-3] = re3 |
| } |
| mergeCharClass(re3, re1) |
| p.reuse(re1) |
| p.stack = p.stack[:n-1] |
| return true |
| } |
| |
| if n >= 2 { |
| re1 := p.stack[n-1] |
| re2 := p.stack[n-2] |
| if re2.Op == opVerticalBar { |
| if n >= 3 { |
| // Now out of reach. |
| // Clean opportunistically. |
| cleanAlt(p.stack[n-3]) |
| } |
| p.stack[n-2] = re1 |
| p.stack[n-1] = re2 |
| return true |
| } |
| } |
| return false |
| } |
| |
| // parseRightParen handles a ) in the input. |
| func (p *parser) parseRightParen() error { |
| p.concat() |
| if p.swapVerticalBar() { |
| // pop vertical bar |
| p.stack = p.stack[:len(p.stack)-1] |
| } |
| p.alternate() |
| |
| n := len(p.stack) |
| if n < 2 { |
| return &Error{ErrUnexpectedParen, p.wholeRegexp} |
| } |
| re1 := p.stack[n-1] |
| re2 := p.stack[n-2] |
| p.stack = p.stack[:n-2] |
| if re2.Op != opLeftParen { |
| return &Error{ErrUnexpectedParen, p.wholeRegexp} |
| } |
| // Restore flags at time of paren. |
| p.flags = re2.Flags |
| if re2.Cap == 0 { |
| // Just for grouping. |
| p.push(re1) |
| } else { |
| re2.Op = OpCapture |
| re2.Sub = re2.Sub0[:1] |
| re2.Sub[0] = re1 |
| p.push(re2) |
| } |
| return nil |
| } |
| |
| // parseEscape parses an escape sequence at the beginning of s |
| // and returns the rune. |
| func (p *parser) parseEscape(s string) (r rune, rest string, err error) { |
| t := s[1:] |
| if t == "" { |
| return 0, "", &Error{ErrTrailingBackslash, ""} |
| } |
| c, t, err := nextRune(t) |
| if err != nil { |
| return 0, "", err |
| } |
| |
| Switch: |
| switch c { |
| default: |
| if c < utf8.RuneSelf && !isalnum(c) { |
| // Escaped non-word characters are always themselves. |
| // PCRE is not quite so rigorous: it accepts things like |
| // \q, but we don't. We once rejected \_, but too many |
| // programs and people insist on using it, so allow \_. |
| return c, t, nil |
| } |
| |
| // Octal escapes. |
| case '1', '2', '3', '4', '5', '6', '7': |
| // Single non-zero digit is a backreference; not supported |
| if t == "" || t[0] < '0' || t[0] > '7' { |
| break |
| } |
| fallthrough |
| case '0': |
| // Consume up to three octal digits; already have one. |
| r = c - '0' |
| for i := 1; i < 3; i++ { |
| if t == "" || t[0] < '0' || t[0] > '7' { |
| break |
| } |
| r = r*8 + rune(t[0]) - '0' |
| t = t[1:] |
| } |
| return r, t, nil |
| |
| // Hexadecimal escapes. |
| case 'x': |
| if t == "" { |
| break |
| } |
| if c, t, err = nextRune(t); err != nil { |
| return 0, "", err |
| } |
| if c == '{' { |
| // Any number of digits in braces. |
| // Perl accepts any text at all; it ignores all text |
| // after the first non-hex digit. We require only hex digits, |
| // and at least one. |
| nhex := 0 |
| r = 0 |
| for { |
| if t == "" { |
| break Switch |
| } |
| if c, t, err = nextRune(t); err != nil { |
| return 0, "", err |
| } |
| if c == '}' { |
| break |
| } |
| v := unhex(c) |
| if v < 0 { |
| break Switch |
| } |
| r = r*16 + v |
| if r > unicode.MaxRune { |
| break Switch |
| } |
| nhex++ |
| } |
| if nhex == 0 { |
| break Switch |
| } |
| return r, t, nil |
| } |
| |
| // Easy case: two hex digits. |
| x := unhex(c) |
| if c, t, err = nextRune(t); err != nil { |
| return 0, "", err |
| } |
| y := unhex(c) |
| if x < 0 || y < 0 { |
| break |
| } |
| return x*16 + y, t, nil |
| |
| // C escapes. There is no case 'b', to avoid misparsing |
| // the Perl word-boundary \b as the C backspace \b |
| // when in POSIX mode. In Perl, /\b/ means word-boundary |
| // but /[\b]/ means backspace. We don't support that. |
| // If you want a backspace, embed a literal backspace |
| // character or use \x08. |
| case 'a': |
| return '\a', t, err |
| case 'f': |
| return '\f', t, err |
| case 'n': |
| return '\n', t, err |
| case 'r': |
| return '\r', t, err |
| case 't': |
| return '\t', t, err |
| case 'v': |
| return '\v', t, err |
| } |
| return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]} |
| } |
| |
| // parseClassChar parses a character class character at the beginning of s |
| // and returns it. |
| func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) { |
| if s == "" { |
| return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} |
| } |
| |
| // Allow regular escape sequences even though |
| // many need not be escaped in this context. |
| if s[0] == '\\' { |
| return p.parseEscape(s) |
| } |
| |
| return nextRune(s) |
| } |
| |
| type charGroup struct { |
| sign int |
| class []rune |
| } |
| |
| // parsePerlClassEscape parses a leading Perl character class escape like \d |
| // from the beginning of s. If one is present, it appends the characters to r |
| // and returns the new slice r and the remainder of the string. |
| func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) { |
| if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' { |
| return |
| } |
| g := perlGroup[s[0:2]] |
| if g.sign == 0 { |
| return |
| } |
| return p.appendGroup(r, g), s[2:] |
| } |
| |
| // parseNamedClass parses a leading POSIX named character class like [:alnum:] |
| // from the beginning of s. If one is present, it appends the characters to r |
| // and returns the new slice r and the remainder of the string. |
| func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) { |
| if len(s) < 2 || s[0] != '[' || s[1] != ':' { |
| return |
| } |
| |
| i := strings.Index(s[2:], ":]") |
| if i < 0 { |
| return |
| } |
| i += 2 |
| name, s := s[0:i+2], s[i+2:] |
| g := posixGroup[name] |
| if g.sign == 0 { |
| return nil, "", &Error{ErrInvalidCharRange, name} |
| } |
| return p.appendGroup(r, g), s, nil |
| } |
| |
| func (p *parser) appendGroup(r []rune, g charGroup) []rune { |
| if p.flags&FoldCase == 0 { |
| if g.sign < 0 { |
| r = appendNegatedClass(r, g.class) |
| } else { |
| r = appendClass(r, g.class) |
| } |
| } else { |
| tmp := p.tmpClass[:0] |
| tmp = appendFoldedClass(tmp, g.class) |
| p.tmpClass = tmp |
| tmp = cleanClass(&p.tmpClass) |
| if g.sign < 0 { |
| r = appendNegatedClass(r, tmp) |
| } else { |
| r = appendClass(r, tmp) |
| } |
| } |
| return r |
| } |
| |
| var anyTable = &unicode.RangeTable{ |
| R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}}, |
| R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}}, |
| } |
| |
| // unicodeTable returns the unicode.RangeTable identified by name |
| // and the table of additional fold-equivalent code points. |
| func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) { |
| // Special case: "Any" means any. |
| if name == "Any" { |
| return anyTable, anyTable |
| } |
| if t := unicode.Categories[name]; t != nil { |
| return t, unicode.FoldCategory[name] |
| } |
| if t := unicode.Scripts[name]; t != nil { |
| return t, unicode.FoldScript[name] |
| } |
| return nil, nil |
| } |
| |
| // parseUnicodeClass parses a leading Unicode character class like \p{Han} |
| // from the beginning of s. If one is present, it appends the characters to r |
| // and returns the new slice r and the remainder of the string. |
| func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) { |
| if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' { |
| return |
| } |
| |
| // Committed to parse or return error. |
| sign := +1 |
| if s[1] == 'P' { |
| sign = -1 |
| } |
| t := s[2:] |
| c, t, err := nextRune(t) |
| if err != nil { |
| return |
| } |
| var seq, name string |
| if c != '{' { |
| // Single-letter name. |
| seq = s[:len(s)-len(t)] |
| name = seq[2:] |
| } else { |
| // Name is in braces. |
| end := strings.IndexRune(s, '}') |
| if end < 0 { |
| if err = checkUTF8(s); err != nil { |
| return |
| } |
| return nil, "", &Error{ErrInvalidCharRange, s} |
| } |
| seq, t = s[:end+1], s[end+1:] |
| name = s[3:end] |
| if err = checkUTF8(name); err != nil { |
| return |
| } |
| } |
| |
| // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}. |
| if name != "" && name[0] == '^' { |
| sign = -sign |
| name = name[1:] |
| } |
| |
| tab, fold := unicodeTable(name) |
| if tab == nil { |
| return nil, "", &Error{ErrInvalidCharRange, seq} |
| } |
| |
| if p.flags&FoldCase == 0 || fold == nil { |
| if sign > 0 { |
| r = appendTable(r, tab) |
| } else { |
| r = appendNegatedTable(r, tab) |
| } |
| } else { |
| // Merge and clean tab and fold in a temporary buffer. |
| // This is necessary for the negative case and just tidy |
| // for the positive case. |
| tmp := p.tmpClass[:0] |
| tmp = appendTable(tmp, tab) |
| tmp = appendTable(tmp, fold) |
| p.tmpClass = tmp |
| tmp = cleanClass(&p.tmpClass) |
| if sign > 0 { |
| r = appendClass(r, tmp) |
| } else { |
| r = appendNegatedClass(r, tmp) |
| } |
| } |
| return r, t, nil |
| } |
| |
| // parseClass parses a character class at the beginning of s |
| // and pushes it onto the parse stack. |
| func (p *parser) parseClass(s string) (rest string, err error) { |
| t := s[1:] // chop [ |
| re := p.newRegexp(OpCharClass) |
| re.Flags = p.flags |
| re.Rune = re.Rune0[:0] |
| |
| sign := +1 |
| if t != "" && t[0] == '^' { |
| sign = -1 |
| t = t[1:] |
| |
| // If character class does not match \n, add it here, |
| // so that negation later will do the right thing. |
| if p.flags&ClassNL == 0 { |
| re.Rune = append(re.Rune, '\n', '\n') |
| } |
| } |
| |
| class := re.Rune |
| first := true // ] and - are okay as first char in class |
| for t == "" || t[0] != ']' || first { |
| // POSIX: - is only okay unescaped as first or last in class. |
| // Perl: - is okay anywhere. |
| if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { |
| _, size := utf8.DecodeRuneInString(t[1:]) |
| return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} |
| } |
| first = false |
| |
| // Look for POSIX [:alnum:] etc. |
| if len(t) > 2 && t[0] == '[' && t[1] == ':' { |
| nclass, nt, err := p.parseNamedClass(t, class) |
| if err != nil { |
| return "", err |
| } |
| if nclass != nil { |
| class, t = nclass, nt |
| continue |
| } |
| } |
| |
| // Look for Unicode character group like \p{Han}. |
| nclass, nt, err := p.parseUnicodeClass(t, class) |
| if err != nil { |
| return "", err |
| } |
| if nclass != nil { |
| class, t = nclass, nt |
| continue |
| } |
| |
| // Look for Perl character class symbols (extension). |
| if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil { |
| class, t = nclass, nt |
| continue |
| } |
| |
| // Single character or simple range. |
| rng := t |
| var lo, hi rune |
| if lo, t, err = p.parseClassChar(t, s); err != nil { |
| return "", err |
| } |
| hi = lo |
| // [a-] means (a|-) so check for final ]. |
| if len(t) >= 2 && t[0] == '-' && t[1] != ']' { |
| t = t[1:] |
| if hi, t, err = p.parseClassChar(t, s); err != nil { |
| return "", err |
| } |
| if hi < lo { |
| rng = rng[:len(rng)-len(t)] |
| return "", &Error{Code: ErrInvalidCharRange, Expr: rng} |
| } |
| } |
| if p.flags&FoldCase == 0 { |
| class = appendRange(class, lo, hi) |
| } else { |
| class = appendFoldedRange(class, lo, hi) |
| } |
| } |
| t = t[1:] // chop ] |
| |
| // Use &re.Rune instead of &class to avoid allocation. |
| re.Rune = class |
| class = cleanClass(&re.Rune) |
| if sign < 0 { |
| class = negateClass(class) |
| } |
| re.Rune = class |
| p.push(re) |
| return t, nil |
| } |
| |
| // cleanClass sorts the ranges (pairs of elements of r), |
| // merges them, and eliminates duplicates. |
| func cleanClass(rp *[]rune) []rune { |
| |
| // Sort by lo increasing, hi decreasing to break ties. |
| sort.Sort(ranges{rp}) |
| |
| r := *rp |
| if len(r) < 2 { |
| return r |
| } |
| |
| // Merge abutting, overlapping. |
| w := 2 // write index |
| for i := 2; i < len(r); i += 2 { |
| lo, hi := r[i], r[i+1] |
| if lo <= r[w-1]+1 { |
| // merge with previous range |
| if hi > r[w-1] { |
| r[w-1] = hi |
| } |
| continue |
| } |
| // new disjoint range |
| r[w] = lo |
| r[w+1] = hi |
| w += 2 |
| } |
| |
| return r[:w] |
| } |
| |
| // inCharClass reports whether r is in the class. |
| // It assumes the class has been cleaned by cleanClass. |
| func inCharClass(r rune, class []rune) bool { |
| _, ok := sort.Find(len(class)/2, func(i int) int { |
| lo, hi := class[2*i], class[2*i+1] |
| if r > hi { |
| return +1 |
| } |
| if r < lo { |
| return -1 |
| } |
| return 0 |
| }) |
| return ok |
| } |
| |
| // appendLiteral returns the result of appending the literal x to the class r. |
| func appendLiteral(r []rune, x rune, flags Flags) []rune { |
| if flags&FoldCase != 0 { |
| return appendFoldedRange(r, x, x) |
| } |
| return appendRange(r, x, x) |
| } |
| |
| // appendRange returns the result of appending the range lo-hi to the class r. |
| func appendRange(r []rune, lo, hi rune) []rune { |
| // Expand last range or next to last range if it overlaps or abuts. |
| // Checking two ranges helps when appending case-folded |
| // alphabets, so that one range can be expanding A-Z and the |
| // other expanding a-z. |
| n := len(r) |
| for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 |
| if n >= i { |
| rlo, rhi := r[n-i], r[n-i+1] |
| if lo <= rhi+1 && rlo <= hi+1 { |
| if lo < rlo { |
| r[n-i] = lo |
| } |
| if hi > rhi { |
| r[n-i+1] = hi |
| } |
| return r |
| } |
| } |
| } |
| |
| return append(r, lo, hi) |
| } |
| |
| const ( |
| // minimum and maximum runes involved in folding. |
| // checked during test. |
| minFold = 0x0041 |
| maxFold = 0x1e943 |
| ) |
| |
| // appendFoldedRange returns the result of appending the range lo-hi |
| // and its case folding-equivalent runes to the class r. |
| func appendFoldedRange(r []rune, lo, hi rune) []rune { |
| // Optimizations. |
| if lo <= minFold && hi >= maxFold { |
| // Range is full: folding can't add more. |
| return appendRange(r, lo, hi) |
| } |
| if hi < minFold || lo > maxFold { |
| // Range is outside folding possibilities. |
| return appendRange(r, lo, hi) |
| } |
| if lo < minFold { |
| // [lo, minFold-1] needs no folding. |
| r = appendRange(r, lo, minFold-1) |
| lo = minFold |
| } |
| if hi > maxFold { |
| // [maxFold+1, hi] needs no folding. |
| r = appendRange(r, maxFold+1, hi) |
| hi = maxFold |
| } |
| |
| // Brute force. Depend on appendRange to coalesce ranges on the fly. |
| for c := lo; c <= hi; c++ { |
| r = appendRange(r, c, c) |
| f := unicode.SimpleFold(c) |
| for f != c { |
| r = appendRange(r, f, f) |
| f = unicode.SimpleFold(f) |
| } |
| } |
| return r |
| } |
| |
| // appendClass returns the result of appending the class x to the class r. |
| // It assume x is clean. |
| func appendClass(r []rune, x []rune) []rune { |
| for i := 0; i < len(x); i += 2 { |
| r = appendRange(r, x[i], x[i+1]) |
| } |
| return r |
| } |
| |
| // appendFoldedClass returns the result of appending the case folding of the class x to the class r. |
| func appendFoldedClass(r []rune, x []rune) []rune { |
| for i := 0; i < len(x); i += 2 { |
| r = appendFoldedRange(r, x[i], x[i+1]) |
| } |
| return r |
| } |
| |
| // appendNegatedClass returns the result of appending the negation of the class x to the class r. |
| // It assumes x is clean. |
| func appendNegatedClass(r []rune, x []rune) []rune { |
| nextLo := '\u0000' |
| for i := 0; i < len(x); i += 2 { |
| lo, hi := x[i], x[i+1] |
| if nextLo <= lo-1 { |
| r = appendRange(r, nextLo, lo-1) |
| } |
| nextLo = hi + 1 |
| } |
| if nextLo <= unicode.MaxRune { |
| r = appendRange(r, nextLo, unicode.MaxRune) |
| } |
| return r |
| } |
| |
| // appendTable returns the result of appending x to the class r. |
| func appendTable(r []rune, x *unicode.RangeTable) []rune { |
| for _, xr := range x.R16 { |
| lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) |
| if stride == 1 { |
| r = appendRange(r, lo, hi) |
| continue |
| } |
| for c := lo; c <= hi; c += stride { |
| r = appendRange(r, c, c) |
| } |
| } |
| for _, xr := range x.R32 { |
| lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) |
| if stride == 1 { |
| r = appendRange(r, lo, hi) |
| continue |
| } |
| for c := lo; c <= hi; c += stride { |
| r = appendRange(r, c, c) |
| } |
| } |
| return r |
| } |
| |
| // appendNegatedTable returns the result of appending the negation of x to the class r. |
| func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune { |
| nextLo := '\u0000' // lo end of next class to add |
| for _, xr := range x.R16 { |
| lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) |
| if stride == 1 { |
| if nextLo <= lo-1 { |
| r = appendRange(r, nextLo, lo-1) |
| } |
| nextLo = hi + 1 |
| continue |
| } |
| for c := lo; c <= hi; c += stride { |
| if nextLo <= c-1 { |
| r = appendRange(r, nextLo, c-1) |
| } |
| nextLo = c + 1 |
| } |
| } |
| for _, xr := range x.R32 { |
| lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) |
| if stride == 1 { |
| if nextLo <= lo-1 { |
| r = appendRange(r, nextLo, lo-1) |
| } |
| nextLo = hi + 1 |
| continue |
| } |
| for c := lo; c <= hi; c += stride { |
| if nextLo <= c-1 { |
| r = appendRange(r, nextLo, c-1) |
| } |
| nextLo = c + 1 |
| } |
| } |
| if nextLo <= unicode.MaxRune { |
| r = appendRange(r, nextLo, unicode.MaxRune) |
| } |
| return r |
| } |
| |
| // negateClass overwrites r and returns r's negation. |
| // It assumes the class r is already clean. |
| func negateClass(r []rune) []rune { |
| nextLo := '\u0000' // lo end of next class to add |
| w := 0 // write index |
| for i := 0; i < len(r); i += 2 { |
| lo, hi := r[i], r[i+1] |
| if nextLo <= lo-1 { |
| r[w] = nextLo |
| r[w+1] = lo - 1 |
| w += 2 |
| } |
| nextLo = hi + 1 |
| } |
| r = r[:w] |
| if nextLo <= unicode.MaxRune { |
| // It's possible for the negation to have one more |
| // range - this one - than the original class, so use append. |
| r = append(r, nextLo, unicode.MaxRune) |
| } |
| return r |
| } |
| |
| // ranges implements sort.Interface on a []rune. |
| // The choice of receiver type definition is strange |
| // but avoids an allocation since we already have |
| // a *[]rune. |
| type ranges struct { |
| p *[]rune |
| } |
| |
| func (ra ranges) Less(i, j int) bool { |
| p := *ra.p |
| i *= 2 |
| j *= 2 |
| return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] |
| } |
| |
| func (ra ranges) Len() int { |
| return len(*ra.p) / 2 |
| } |
| |
| func (ra ranges) Swap(i, j int) { |
| p := *ra.p |
| i *= 2 |
| j *= 2 |
| p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] |
| } |
| |
| func checkUTF8(s string) error { |
| for s != "" { |
| rune, size := utf8.DecodeRuneInString(s) |
| if rune == utf8.RuneError && size == 1 { |
| return &Error{Code: ErrInvalidUTF8, Expr: s} |
| } |
| s = s[size:] |
| } |
| return nil |
| } |
| |
| func nextRune(s string) (c rune, t string, err error) { |
| c, size := utf8.DecodeRuneInString(s) |
| if c == utf8.RuneError && size == 1 { |
| return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} |
| } |
| return c, s[size:], nil |
| } |
| |
| func isalnum(c rune) bool { |
| return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' |
| } |
| |
| func unhex(c rune) rune { |
| if '0' <= c && c <= '9' { |
| return c - '0' |
| } |
| if 'a' <= c && c <= 'f' { |
| return c - 'a' + 10 |
| } |
| if 'A' <= c && c <= 'F' { |
| return c - 'A' + 10 |
| } |
| return -1 |
| } |