| /* $NetBSD: regcomp.c,v 1.46 2021/03/11 15:00:29 christos Exp $ */ |
| |
| /*- |
| * SPDX-License-Identifier: BSD-3-Clause |
| * |
| * Copyright (c) 1992, 1993, 1994 Henry Spencer. |
| * Copyright (c) 1992, 1993, 1994 |
| * The Regents of the University of California. All rights reserved. |
| * |
| * Copyright (c) 2011 The FreeBSD Foundation |
| * All rights reserved. |
| * Portions of this software were developed by David Chisnall |
| * under sponsorship from the FreeBSD Foundation. |
| * |
| * This code is derived from software contributed to Berkeley by |
| * Henry Spencer. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * 3. Neither the name of the University nor the names of its contributors |
| * may be used to endorse or promote products derived from this software |
| * without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| * SUCH DAMAGE. |
| * |
| * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 |
| */ |
| |
| #if HAVE_NBTOOL_CONFIG_H |
| #include "nbtool_config.h" |
| #endif |
| |
| #include <sys/cdefs.h> |
| #if 0 |
| static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; |
| __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $"); |
| #endif |
| __RCSID("$NetBSD: regcomp.c,v 1.46 2021/03/11 15:00:29 christos Exp $"); |
| |
| #define _OPENBSD_SOURCE |
| |
| #ifndef LIBHACK |
| #define REGEX_GNU_EXTENSIONS |
| |
| #include "namespace.h" |
| #endif |
| #include <sys/types.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <ctype.h> |
| #include <limits.h> |
| #include <stdlib.h> |
| #include <regex.h> |
| #include <stdbool.h> |
| |
| #if defined(__weak_alias) && !defined(LIBHACK) |
| __weak_alias(regcomp,_regcomp) |
| #endif |
| |
| #ifdef REGEX_LIBC_COLLATE |
| #include "collate.h" |
| #endif |
| |
| #include "utils.h" |
| #include "regex2.h" |
| |
| #include "cname.h" |
| |
| /* |
| * Branching context, used to keep track of branch state for all of the branch- |
| * aware functions. In addition to keeping track of branch positions for the |
| * p_branch_* functions, we use this to simplify some clumsiness in BREs for |
| * detection of whether ^ is acting as an anchor or being used erroneously and |
| * also for whether we're in a sub-expression or not. |
| */ |
| struct branchc { |
| sopno start; |
| sopno back; |
| sopno fwd; |
| |
| int nbranch; |
| int nchain; |
| bool outer; |
| bool terminate; |
| }; |
| |
| /* |
| * parse structure, passed up and down to avoid global variables and |
| * other clumsinesses |
| */ |
| struct parse { |
| const char *next; /* next character in RE */ |
| const char *end; /* end of string (-> NUL normally) */ |
| int error; /* has an error been seen? */ |
| int gnuext; |
| sop *strip; /* malloced strip */ |
| sopno ssize; /* malloced strip size (allocated) */ |
| sopno slen; /* malloced strip length (used) */ |
| size_t ncsalloc; /* number of csets allocated */ |
| struct re_guts *g; |
| # define NPAREN 10 /* we need to remember () 1-9 for back refs */ |
| sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ |
| sopno pend[NPAREN]; /* -> ) ([0] unused) */ |
| bool allowbranch; /* can this expression branch? */ |
| bool bre; /* convenience; is this a BRE? */ |
| int pflags; /* other parsing flags -- legacy escapes? */ |
| bool (*parse_expr)(struct parse *, struct branchc *); |
| void (*pre_parse)(struct parse *, struct branchc *); |
| void (*post_parse)(struct parse *, struct branchc *); |
| }; |
| |
| #define PFLAG_LEGACY_ESC 0x00000001 |
| |
| /* ========= begin header generated by ./mkh ========= */ |
| #ifdef __cplusplus |
| extern "C" { |
| #endif |
| |
| /* === regcomp.c === */ |
| static bool p_ere_exp(struct parse *p, struct branchc *bc); |
| static void p_str(struct parse *p); |
| static int p_branch_eat_delim(struct parse *p, struct branchc *bc); |
| static void p_branch_ins_offset(struct parse *p, struct branchc *bc); |
| static void p_branch_fix_tail(struct parse *p, struct branchc *bc); |
| static bool p_branch_empty(struct parse *p, struct branchc *bc); |
| static bool p_branch_do(struct parse *p, struct branchc *bc); |
| static void p_bre_pre_parse(struct parse *p, struct branchc *bc); |
| static void p_bre_post_parse(struct parse *p, struct branchc *bc); |
| static void p_re(struct parse *p, int end1, int end2); |
| static bool p_simp_re(struct parse *p, struct branchc *bc); |
| static int p_count(struct parse *p); |
| static void p_bracket(struct parse *p); |
| static int p_range_cmp(wchar_t c1, wchar_t c2); |
| static void p_b_term(struct parse *p, cset *cs); |
| #ifdef REGEX_GNU_EXTENSIONS |
| static int p_b_pseudoclass(struct parse *p, char c); |
| #endif |
| static void p_b_cclass(struct parse *p, cset *cs); |
| static void p_b_cclass_named(struct parse *p, cset *cs, const char[]); |
| static void p_b_eclass(struct parse *p, cset *cs); |
| static wint_t p_b_symbol(struct parse *p); |
| static wint_t p_b_coll_elem(struct parse *p, wint_t endc); |
| static bool may_escape(struct parse *p, const wint_t ch); |
| static wint_t othercase(wint_t ch); |
| static void bothcases(struct parse *p, wint_t ch); |
| static void ordinary(struct parse *p, wint_t ch); |
| static void nonnewline(struct parse *p); |
| static void repeat(struct parse *p, sopno start, int from, int to); |
| static int seterr(struct parse *p, int e); |
| static cset *allocset(struct parse *p); |
| static void freeset(struct parse *p, cset *cs); |
| static void CHadd(struct parse *p, cset *cs, wint_t ch); |
| static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); |
| static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); |
| static wint_t singleton(cset *cs); |
| static sopno dupl(struct parse *p, sopno start, sopno finish); |
| static void doemit(struct parse *p, sop op, size_t opnd); |
| static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); |
| static void dofwd(struct parse *p, sopno pos, sop value); |
| static int enlarge(struct parse *p, sopno size); |
| static void stripsnug(struct parse *p, struct re_guts *g); |
| static void findmust(struct parse *p, struct re_guts *g); |
| static int altoffset(sop *scan, int offset); |
| static void computejumps(struct parse *p, struct re_guts *g); |
| static void computematchjumps(struct parse *p, struct re_guts *g); |
| static sopno pluscount(struct parse *p, struct re_guts *g); |
| static wint_t wgetnext(struct parse *p); |
| |
| #ifdef __cplusplus |
| } |
| #endif |
| /* ========= end header generated by ./mkh ========= */ |
| |
| static char nuls[10]; /* place to point scanner in event of error */ |
| |
| /* |
| * macros for use with parse structure |
| * BEWARE: these know that the parse structure is named `p' !!! |
| */ |
| #define PEEK() (*p->next) |
| #define PEEK2() (*(p->next+1)) |
| #define MORE() (p->next < p->end) |
| #define MORE2() (p->next+1 < p->end) |
| #define SEE(c) (MORE() && PEEK() == (c)) |
| #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) |
| #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) |
| #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) |
| #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) |
| #define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a)) |
| #define NEXT() (p->next++) |
| #define NEXT2() (p->next += 2) |
| #define NEXTn(n) (p->next += (n)) |
| #define GETNEXT() (*p->next++) |
| #define WGETNEXT() wgetnext(p) |
| #define SETERROR(e) seterr(p, (e)) |
| #define REQUIRE(co, e) ((co) || SETERROR(e)) |
| #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) |
| #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) |
| #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) |
| #define EMIT(op, sopnd) doemit(p, (op), (sopnd)) |
| #define INSERT(op, pos) doinsert(p, (op), HERE()-(pos)+1, pos) |
| #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) |
| #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) |
| #define HERE() (p->slen) |
| #define THERE() (p->slen - 1) |
| #define THERETHERE() (p->slen - 2) |
| #define DROP(n) (p->slen -= (n)) |
| |
| /* Macro used by computejump()/computematchjump() */ |
| #ifndef MIN |
| #define MIN(a,b) ((a)<(b)?(a):(b)) |
| #endif |
| |
| #ifndef NLS |
| static const struct { |
| const char *name; |
| int (*func)(int); |
| } wctypes[] = { |
| #define ADD(x) { .name = # x, .func = is ## x } |
| ADD(alnum), |
| ADD(alpha), |
| ADD(blank), |
| ADD(cntrl), |
| ADD(digit), |
| ADD(graph), |
| ADD(lower), |
| ADD(print), |
| ADD(punct), |
| ADD(space), |
| ADD(upper), |
| ADD(xdigit), |
| #undef ADD |
| }; |
| |
| wctype_t |
| __regex_wctype(const char *str) |
| { |
| for (size_t i = 0; i < __arraycount(wctypes); i++) { |
| if (strcmp(wctypes[i].name, str) == 0) |
| return (wctype_t)(i + 1); |
| } |
| return (wctype_t)0; |
| } |
| |
| int |
| __regex_iswctype(wint_t c, wctype_t ct) |
| { |
| if (ct == 0) |
| return 0; |
| return (*wctypes[ct - 1].func)(c); |
| } |
| #endif |
| |
| static int /* 0 success, otherwise REG_something */ |
| regcomp_internal(regex_t * __restrict preg, |
| const char * __restrict pattern, |
| int cflags, int pflags) |
| { |
| struct parse pa; |
| struct re_guts *g; |
| struct parse *p = &pa; |
| int i; |
| size_t len; |
| size_t maxlen; |
| #ifdef REDEBUG |
| # define GOODFLAGS(f) (f) |
| #else |
| # define GOODFLAGS(f) ((f)&~REG_DUMP) |
| #endif |
| |
| _DIAGASSERT(preg != NULL); |
| _DIAGASSERT(pattern != NULL); |
| |
| cflags = GOODFLAGS(cflags); |
| if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) |
| return(REG_INVARG); |
| |
| if (cflags®_PEND) { |
| if (preg->re_endp < pattern) |
| return(REG_INVARG); |
| len = preg->re_endp - pattern; |
| } else |
| len = strlen(pattern); |
| |
| /* do the mallocs early so failure handling is easy */ |
| g = malloc(sizeof(*g)); |
| if (g == NULL) |
| return(REG_ESPACE); |
| /* |
| * Limit the pattern space to avoid a 32-bit overflow on buffer |
| * extension. Also avoid any signed overflow in case of conversion |
| * so make the real limit based on a 31-bit overflow. |
| * |
| * Likely not applicable on 64-bit systems but handle the case |
| * generically (who are we to stop people from using ~715MB+ |
| * patterns?). |
| */ |
| maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3; |
| if (len >= maxlen) { |
| free(g); |
| return(REG_ESPACE); |
| } |
| p->ssize = (sopno)(len / 2 * 3 + 1); /* ugh */ |
| assert(p->ssize >= len); |
| |
| p->strip = calloc(p->ssize, sizeof(*p->strip)); |
| p->slen = 0; |
| if (p->strip == NULL) { |
| free(g); |
| return(REG_ESPACE); |
| } |
| |
| /* set things up */ |
| p->g = g; |
| p->next = pattern; /* convenience; we do not modify it */ |
| p->end = p->next + len; |
| p->error = 0; |
| p->ncsalloc = 0; |
| p->pflags = pflags; |
| for (i = 0; i < NPAREN; i++) { |
| p->pbegin[i] = 0; |
| p->pend[i] = 0; |
| } |
| #ifdef REGEX_GNU_EXTENSIONS |
| if ((cflags & REG_GNU) == 0) { |
| p->gnuext = false; |
| p->allowbranch = (cflags & REG_EXTENDED) != 0; |
| } else |
| p->gnuext = p->allowbranch = true; |
| #else |
| p->gnuext = false; |
| p->allowbranch = (cflags & REG_EXTENDED) != 0; |
| #endif |
| if (cflags & REG_EXTENDED) { |
| p->bre = false; |
| p->parse_expr = p_ere_exp; |
| p->pre_parse = NULL; |
| p->post_parse = NULL; |
| } else { |
| p->bre = true; |
| p->parse_expr = p_simp_re; |
| p->pre_parse = p_bre_pre_parse; |
| p->post_parse = p_bre_post_parse; |
| } |
| g->sets = NULL; |
| g->ncsets = 0; |
| g->cflags = cflags; |
| g->iflags = 0; |
| g->nbol = 0; |
| g->neol = 0; |
| g->must = NULL; |
| g->moffset = -1; |
| g->charjump = NULL; |
| g->matchjump = NULL; |
| g->mlen = 0; |
| g->nsub = 0; |
| g->backrefs = 0; |
| |
| /* do it */ |
| EMIT(OEND, 0); |
| g->firststate = THERE(); |
| if (cflags & REG_NOSPEC) |
| p_str(p); |
| else |
| p_re(p, OUT, OUT); |
| EMIT(OEND, 0); |
| g->laststate = THERE(); |
| |
| /* tidy up loose ends and fill things in */ |
| stripsnug(p, g); |
| findmust(p, g); |
| /* only use Boyer-Moore algorithm if the pattern is bigger |
| * than three characters |
| */ |
| if(g->mlen > 3) { |
| computejumps(p, g); |
| computematchjumps(p, g); |
| if(g->matchjump == NULL && g->charjump != NULL) { |
| free(g->charjump); |
| g->charjump = NULL; |
| } |
| } |
| g->nplus = pluscount(p, g); |
| g->magic = MAGIC2; |
| preg->re_nsub = g->nsub; |
| preg->re_g = g; |
| preg->re_magic = MAGIC1; |
| #ifndef REDEBUG |
| /* not debugging, so can't rely on the assert() in regexec() */ |
| if (g->iflags&BAD) |
| SETERROR(REG_ASSERT); |
| #endif |
| |
| /* win or lose, we're done */ |
| if (p->error != 0) /* lose */ |
| regfree(preg); |
| return(p->error); |
| } |
| |
| /* |
| - regcomp - interface for parser and compilation |
| = extern int regcomp(regex_t *, const char *, int); |
| = #define REG_BASIC 0000 |
| = #define REG_EXTENDED 0001 |
| = #define REG_ICASE 0002 |
| = #define REG_NOSUB 0004 |
| = #define REG_NEWLINE 0010 |
| = #define REG_NOSPEC 0020 |
| = #define REG_PEND 0040 |
| = #define REG_DUMP 0200 |
| */ |
| int /* 0 success, otherwise REG_something */ |
| regcomp(regex_t * __restrict preg, |
| const char * __restrict pattern, |
| int cflags) |
| { |
| |
| return (regcomp_internal(preg, pattern, cflags, 0)); |
| } |
| |
| /* |
| - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op, |
| - return whether we should terminate or not |
| == static bool p_ere_exp(struct parse *p); |
| */ |
| static bool |
| p_ere_exp(struct parse *p, struct branchc *bc) |
| { |
| char c; |
| wint_t wc; |
| sopno pos; |
| int count; |
| int count2; |
| #ifdef REGEX_GNU_EXTENSIONS |
| size_t i; |
| int handled; |
| #endif |
| sopno subno; |
| int wascaret = 0; |
| |
| _DIAGASSERT(p != NULL); |
| |
| (void)bc; |
| assert(MORE()); /* caller should have ensured this */ |
| c = GETNEXT(); |
| |
| #ifdef REGEX_GNU_EXTENSIONS |
| handled = 0; |
| #endif |
| pos = HERE(); |
| switch (c) { |
| case '(': |
| (void)REQUIRE(MORE(), REG_EPAREN); |
| p->g->nsub++; |
| subno = (sopno)p->g->nsub; |
| if (subno < NPAREN) |
| p->pbegin[subno] = HERE(); |
| EMIT(OLPAREN, subno); |
| if (!SEE(')')) |
| p_re(p, ')', IGN); |
| if (subno < NPAREN) { |
| p->pend[subno] = HERE(); |
| assert(p->pend[subno] != 0); |
| } |
| EMIT(ORPAREN, subno); |
| (void)MUSTEAT(')', REG_EPAREN); |
| break; |
| #ifndef POSIX_MISTAKE |
| case ')': /* happens only if no current unmatched ( */ |
| /* |
| * You may ask, why the ifndef? Because I didn't notice |
| * this until slightly too late for 1003.2, and none of the |
| * other 1003.2 regular-expression reviewers noticed it at |
| * all. So an unmatched ) is legal POSIX, at least until |
| * we can get it fixed. |
| */ |
| SETERROR(REG_EPAREN); |
| break; |
| #endif |
| case '^': |
| EMIT(OBOL, 0); |
| p->g->iflags |= USEBOL; |
| p->g->nbol++; |
| wascaret = 1; |
| break; |
| case '$': |
| EMIT(OEOL, 0); |
| p->g->iflags |= USEEOL; |
| p->g->neol++; |
| break; |
| case '|': |
| SETERROR(REG_EMPTY); |
| break; |
| case '*': |
| case '+': |
| case '?': |
| case '{': |
| SETERROR(REG_BADRPT); |
| break; |
| case '.': |
| if (p->g->cflags®_NEWLINE) |
| nonnewline(p); |
| else |
| EMIT(OANY, 0); |
| break; |
| case '[': |
| p_bracket(p); |
| break; |
| case '\\': |
| (void)REQUIRE(MORE(), REG_EESCAPE); |
| wc = WGETNEXT(); |
| #ifdef REGEX_GNU_EXTENSIONS |
| if (p->gnuext) { |
| handled = 1; |
| switch (wc) { |
| case '`': |
| EMIT(OBOS, 0); |
| break; |
| case '\'': |
| EMIT(OEOS, 0); |
| break; |
| case 'B': |
| EMIT(ONWBND, 0); |
| break; |
| case 'b': |
| EMIT(OWBND, 0); |
| break; |
| case 'W': |
| case 'w': |
| case 'S': |
| case 's': |
| p_b_pseudoclass(p, wc); |
| break; |
| case 'a': |
| ordinary(p, '\a'); |
| break; |
| case 'e': |
| ordinary(p, '\e'); |
| break; |
| case 'f': |
| ordinary(p, '\f'); |
| break; |
| case 'n': |
| ordinary(p, '\n'); |
| break; |
| case 'r': |
| ordinary(p, '\r'); |
| break; |
| case 't': |
| ordinary(p, '\t'); |
| break; |
| case 'v': |
| ordinary(p, '\v'); |
| break; |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': |
| i = wc - '0'; |
| assert(i < NPAREN); |
| if (p->pend[i] != 0) { |
| assert(i <= p->g->nsub); |
| EMIT(OBACK_, i); |
| assert(p->pbegin[i] != 0); |
| assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); |
| assert(OP(p->strip[p->pend[i]]) == ORPAREN); |
| (void) dupl(p, p->pbegin[i]+1, p->pend[i]); |
| EMIT(O_BACK, i); |
| } else |
| SETERROR(REG_ESUBREG); |
| p->g->backrefs = 1; |
| break; |
| default: |
| handled = 0; |
| } |
| /* Don't proceed to the POSIX bits if we've already handled it */ |
| if (handled) |
| break; |
| } |
| #endif |
| switch (wc) { |
| case '<': |
| EMIT(OBOW, 0); |
| break; |
| case '>': |
| EMIT(OEOW, 0); |
| break; |
| default: |
| if (may_escape(p, wc)) |
| ordinary(p, wc); |
| else |
| SETERROR(REG_EESCAPE); |
| break; |
| } |
| break; |
| default: |
| if (p->error != 0) |
| return (false); |
| p->next--; |
| wc = WGETNEXT(); |
| ordinary(p, wc); |
| break; |
| } |
| |
| if (!MORE()) |
| return (false); |
| c = PEEK(); |
| /* we call { a repetition if followed by a digit */ |
| if (!( c == '*' || c == '+' || c == '?' || c == '{')) |
| return (false); /* no repetition, we're done */ |
| else if (c == '{') |
| (void)REQUIRE(MORE2() && \ |
| (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); |
| NEXT(); |
| |
| (void)REQUIRE(!wascaret, REG_BADRPT); |
| switch (c) { |
| case '*': /* implemented as +? */ |
| /* this case does not require the (y|) trick, noKLUDGE */ |
| INSERT(OPLUS_, pos); |
| ASTERN(O_PLUS, pos); |
| INSERT(OQUEST_, pos); |
| ASTERN(O_QUEST, pos); |
| break; |
| case '+': |
| INSERT(OPLUS_, pos); |
| ASTERN(O_PLUS, pos); |
| break; |
| case '?': |
| /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
| INSERT(OCH_, pos); /* offset slightly wrong */ |
| ASTERN(OOR1, pos); /* this one's right */ |
| AHEAD(pos); /* fix the OCH_ */ |
| EMIT(OOR2, 0); /* offset very wrong... */ |
| AHEAD(THERE()); /* ...so fix it */ |
| ASTERN(O_CH, THERETHERE()); |
| break; |
| case '{': |
| count = p_count(p); |
| if (EAT(',')) { |
| if (isdigit((uch)PEEK())) { |
| count2 = p_count(p); |
| (void)REQUIRE(count <= count2, REG_BADBR); |
| } else /* single number with comma */ |
| count2 = INFINITY; |
| } else /* just a single number */ |
| count2 = count; |
| repeat(p, pos, count, count2); |
| if (!EAT('}')) { /* error heuristics */ |
| while (MORE() && PEEK() != '}') |
| NEXT(); |
| (void)REQUIRE(MORE(), REG_EBRACE); |
| SETERROR(REG_BADBR); |
| } |
| break; |
| } |
| |
| if (!MORE()) |
| return (false); |
| c = PEEK(); |
| if (!( c == '*' || c == '+' || c == '?' || |
| (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) |
| return (false); |
| SETERROR(REG_BADRPT); |
| return (false); |
| } |
| |
| /* |
| - p_str - string (no metacharacters) "parser" |
| == static void p_str(struct parse *p); |
| */ |
| static void |
| p_str(struct parse *p) |
| { |
| (void)REQUIRE(MORE(), REG_EMPTY); |
| while (MORE()) |
| ordinary(p, WGETNEXT()); |
| } |
| |
| /* |
| * Eat consecutive branch delimiters for the kind of expression that we are |
| * parsing, return the number of delimiters that we ate. |
| */ |
| static int |
| p_branch_eat_delim(struct parse *p, struct branchc *bc) |
| { |
| int nskip; |
| |
| (void)bc; |
| nskip = 0; |
| while (EATSPEC('|')) |
| ++nskip; |
| return (nskip); |
| } |
| |
| /* |
| * Insert necessary branch book-keeping operations. This emits a |
| * bogus 'next' offset, since we still have more to parse |
| */ |
| static void |
| p_branch_ins_offset(struct parse *p, struct branchc *bc) |
| { |
| |
| if (bc->nbranch == 0) { |
| INSERT(OCH_, bc->start); /* offset is wrong */ |
| bc->fwd = bc->start; |
| bc->back = bc->start; |
| } |
| |
| ASTERN(OOR1, bc->back); |
| bc->back = THERE(); |
| AHEAD(bc->fwd); /* fix previous offset */ |
| bc->fwd = HERE(); |
| EMIT(OOR2, 0); /* offset is very wrong */ |
| ++bc->nbranch; |
| } |
| |
| /* |
| * Fix the offset of the tail branch, if we actually had any branches. |
| * This is to correct the bogus placeholder offset that we use. |
| */ |
| static void |
| p_branch_fix_tail(struct parse *p, struct branchc *bc) |
| { |
| |
| /* Fix bogus offset at the tail if we actually have branches */ |
| if (bc->nbranch > 0) { |
| AHEAD(bc->fwd); |
| ASTERN(O_CH, bc->back); |
| } |
| } |
| |
| /* |
| * Signal to the parser that an empty branch has been encountered; this will, |
| * in the future, be used to allow for more permissive behavior with empty |
| * branches. The return value should indicate whether parsing may continue |
| * or not. |
| */ |
| static bool |
| p_branch_empty(struct parse *p, struct branchc *bc) |
| { |
| |
| (void)bc; |
| SETERROR(REG_EMPTY); |
| return (false); |
| } |
| |
| /* |
| * Take care of any branching requirements. This includes inserting the |
| * appropriate branching instructions as well as eating all of the branch |
| * delimiters until we either run out of pattern or need to parse more pattern. |
| */ |
| static bool |
| p_branch_do(struct parse *p, struct branchc *bc) |
| { |
| int ate = 0; |
| |
| ate = p_branch_eat_delim(p, bc); |
| if (ate == 0) |
| return (false); |
| else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) |
| /* |
| * Halt parsing only if we have an empty branch and p_branch_empty |
| * indicates that we must not continue. In the future, this will not |
| * necessarily be an error. |
| */ |
| return (false); |
| p_branch_ins_offset(p, bc); |
| |
| return (true); |
| } |
| |
| static void |
| p_bre_pre_parse(struct parse *p, struct branchc *bc) |
| { |
| |
| (void)bc; |
| /* |
| * Does not move cleanly into expression parser because of |
| * ordinary interpration of * at the beginning position of |
| * an expression. |
| */ |
| if (EAT('^')) { |
| EMIT(OBOL, 0); |
| p->g->iflags |= USEBOL; |
| p->g->nbol++; |
| } |
| } |
| |
| static void |
| p_bre_post_parse(struct parse *p, struct branchc *bc) |
| { |
| |
| /* Expression is terminating due to EOL token */ |
| if (bc->terminate) { |
| DROP(1); |
| EMIT(OEOL, 0); |
| p->g->iflags |= USEEOL; |
| p->g->neol++; |
| } |
| } |
| |
| /* |
| - p_re - Top level parser, concatenation and BRE anchoring |
| == static void p_re(struct parse *p, int end1, int end2); |
| * Giving end1 as OUT essentially eliminates the end1/end2 check. |
| * |
| * This implementation is a bit of a kludge, in that a trailing $ is first |
| * taken as an ordinary character and then revised to be an anchor. |
| * The amount of lookahead needed to avoid this kludge is excessive. |
| */ |
| static void |
| p_re(struct parse *p, |
| int end1, /* first terminating character */ |
| int end2) /* second terminating character; ignored for EREs */ |
| { |
| struct branchc bc; |
| |
| bc.nbranch = 0; |
| if (end1 == OUT && end2 == OUT) |
| bc.outer = true; |
| else |
| bc.outer = false; |
| #define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) |
| for (;;) { |
| bc.start = HERE(); |
| bc.nchain = 0; |
| bc.terminate = false; |
| if (p->pre_parse != NULL) |
| p->pre_parse(p, &bc); |
| while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) { |
| bc.terminate = p->parse_expr(p, &bc); |
| ++bc.nchain; |
| } |
| if (p->post_parse != NULL) |
| p->post_parse(p, &bc); |
| (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY); |
| #ifdef REGEX_GNU_EXTENSIONS |
| if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc)) |
| break; |
| #endif |
| if (!p->allowbranch) |
| break; |
| /* |
| * p_branch_do's return value indicates whether we should |
| * continue parsing or not. This is both for correctness and |
| * a slight optimization, because it will check if we've |
| * encountered an empty branch or the end of the string |
| * immediately following a branch delimiter. |
| */ |
| if (!p_branch_do(p, &bc)) |
| break; |
| } |
| #undef SEE_END |
| if (p->allowbranch) |
| p_branch_fix_tail(p, &bc); |
| assert(!MORE() || SEE(end1)); |
| } |
| |
| /* |
| - p_simp_re - parse a simple RE, an atom possibly followed by a repetition |
| == static bool p_simp_re(struct parse *p, struct branchc *bc); |
| */ |
| static bool /* was the simple RE an unbackslashed $? */ |
| p_simp_re(struct parse *p, struct branchc *bc) |
| { |
| int c; |
| int cc; /* convenient/control character */ |
| int count; |
| int count2; |
| sopno pos; |
| bool handled; |
| size_t i; |
| wint_t wc; |
| sopno subno; |
| # define BACKSL (1<<CHAR_BIT) |
| |
| pos = HERE(); /* repetition op, if any, covers from here */ |
| handled = false; |
| |
| assert(MORE()); /* caller should have ensured this */ |
| c = GETNEXT(); |
| if (c == '\\') { |
| (void)REQUIRE(MORE(), REG_EESCAPE); |
| cc = GETNEXT(); |
| c = BACKSL | cc; |
| #ifdef REGEX_GNU_EXTENSIONS |
| if (p->gnuext) { |
| handled = true; |
| switch (c) { |
| case BACKSL|'`': |
| EMIT(OBOS, 0); |
| break; |
| case BACKSL|'\'': |
| EMIT(OEOS, 0); |
| break; |
| case BACKSL|'B': |
| EMIT(ONWBND, 0); |
| break; |
| case BACKSL|'b': |
| EMIT(OWBND, 0); |
| break; |
| case BACKSL|'W': |
| case BACKSL|'w': |
| case BACKSL|'S': |
| case BACKSL|'s': |
| p_b_pseudoclass(p, cc); |
| break; |
| case BACKSL|'a': |
| ordinary(p, '\a'); |
| break; |
| case BACKSL|'e': |
| ordinary(p, '\e'); |
| break; |
| case BACKSL|'f': |
| ordinary(p, '\f'); |
| break; |
| case BACKSL|'n': |
| ordinary(p, '\n'); |
| break; |
| case BACKSL|'r': |
| ordinary(p, '\r'); |
| break; |
| case BACKSL|'t': |
| ordinary(p, '\t'); |
| break; |
| case BACKSL|'v': |
| ordinary(p, '\v'); |
| break; |
| default: |
| handled = false; |
| } |
| } |
| #endif |
| } |
| if (!handled) { |
| switch (c) { |
| case '.': |
| if (p->g->cflags®_NEWLINE) |
| nonnewline(p); |
| else |
| EMIT(OANY, 0); |
| break; |
| case '[': |
| p_bracket(p); |
| break; |
| case BACKSL|'<': |
| EMIT(OBOW, 0); |
| break; |
| case BACKSL|'>': |
| EMIT(OEOW, 0); |
| break; |
| case BACKSL|'{': |
| SETERROR(REG_BADRPT); |
| break; |
| case BACKSL|'(': |
| p->g->nsub++; |
| subno = (sopno)p->g->nsub; |
| if (subno < NPAREN) |
| p->pbegin[subno] = HERE(); |
| EMIT(OLPAREN, subno); |
| /* the MORE here is an error heuristic */ |
| if (MORE() && !SEETWO('\\', ')')) |
| p_re(p, '\\', ')'); |
| if (subno < NPAREN) { |
| p->pend[subno] = HERE(); |
| assert(p->pend[subno] != 0); |
| } |
| EMIT(ORPAREN, subno); |
| (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); |
| break; |
| case BACKSL|')': /* should not get here -- must be user */ |
| SETERROR(REG_EPAREN); |
| break; |
| case BACKSL|'1': |
| case BACKSL|'2': |
| case BACKSL|'3': |
| case BACKSL|'4': |
| case BACKSL|'5': |
| case BACKSL|'6': |
| case BACKSL|'7': |
| case BACKSL|'8': |
| case BACKSL|'9': |
| i = (c&~BACKSL) - '0'; |
| assert(i < NPAREN); |
| if (p->pend[i] != 0) { |
| assert(i <= p->g->nsub); |
| EMIT(OBACK_, i); |
| assert(p->pbegin[i] != 0); |
| assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); |
| assert(OP(p->strip[p->pend[i]]) == ORPAREN); |
| (void) dupl(p, p->pbegin[i]+1, p->pend[i]); |
| EMIT(O_BACK, i); |
| } else |
| SETERROR(REG_ESUBREG); |
| p->g->backrefs = 1; |
| break; |
| case '*': |
| /* |
| * Ordinary if used as the first character beyond BOL anchor of |
| * a (sub-)expression, counts as a bad repetition operator if it |
| * appears otherwise. |
| */ |
| (void)REQUIRE(bc->nchain == 0, REG_BADRPT); |
| /* FALLTHROUGH */ |
| default: |
| if (p->error != 0) |
| return (false); /* Definitely not $... */ |
| p->next--; |
| wc = WGETNEXT(); |
| if ((c & BACKSL) == 0 || may_escape(p, wc)) |
| ordinary(p, wc); |
| else |
| SETERROR(REG_EESCAPE); |
| break; |
| } |
| } |
| |
| if (EAT('*')) { /* implemented as +? */ |
| /* this case does not require the (y|) trick, noKLUDGE */ |
| INSERT(OPLUS_, pos); |
| ASTERN(O_PLUS, pos); |
| INSERT(OQUEST_, pos); |
| ASTERN(O_QUEST, pos); |
| #ifdef REGEX_GNU_EXTENSIONS |
| } else if (p->gnuext && EATTWO('\\', '?')) { |
| INSERT(OQUEST_, pos); |
| ASTERN(O_QUEST, pos); |
| } else if (p->gnuext && EATTWO('\\', '+')) { |
| INSERT(OPLUS_, pos); |
| ASTERN(O_PLUS, pos); |
| #endif |
| } else if (EATTWO('\\', '{')) { |
| count = p_count(p); |
| if (EAT(',')) { |
| if (MORE() && isdigit((uch)PEEK())) { |
| count2 = p_count(p); |
| (void)REQUIRE(count <= count2, REG_BADBR); |
| } else /* single number with comma */ |
| count2 = INFINITY; |
| } else /* just a single number */ |
| count2 = count; |
| repeat(p, pos, count, count2); |
| if (!EATTWO('\\', '}')) { /* error heuristics */ |
| while (MORE() && !SEETWO('\\', '}')) |
| NEXT(); |
| (void)REQUIRE(MORE(), REG_EBRACE); |
| SETERROR(REG_BADBR); |
| } |
| } else if (c == '$') /* $ (but not \$) ends it */ |
| return (true); |
| |
| return (false); |
| } |
| |
| /* |
| - p_count - parse a repetition count |
| == static int p_count(struct parse *p); |
| */ |
| static int /* the value */ |
| p_count(struct parse *p) |
| { |
| int count = 0; |
| int ndigits = 0; |
| |
| while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { |
| count = count*10 + (GETNEXT() - '0'); |
| ndigits++; |
| } |
| |
| (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); |
| return(count); |
| } |
| |
| /* |
| - p_bracket - parse a bracketed character list |
| == static void p_bracket(struct parse *p); |
| */ |
| static void |
| p_bracket(struct parse *p) |
| { |
| cset *cs; |
| wint_t ch; |
| |
| /* Dept of Truly Sickening Special-Case Kludges */ |
| if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { |
| EMIT(OBOW, 0); |
| NEXTn(6); |
| return; |
| } |
| if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { |
| EMIT(OEOW, 0); |
| NEXTn(6); |
| return; |
| } |
| |
| if ((cs = allocset(p)) == NULL) |
| return; |
| |
| if (p->g->cflags®_ICASE) |
| cs->icase = 1; |
| if (EAT('^')) |
| cs->invert = 1; |
| if (EAT(']')) |
| CHadd(p, cs, ']'); |
| else if (EAT('-')) |
| CHadd(p, cs, '-'); |
| while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) |
| p_b_term(p, cs); |
| if (EAT('-')) |
| CHadd(p, cs, '-'); |
| (void)MUSTEAT(']', REG_EBRACK); |
| |
| if (p->error != 0) /* don't mess things up further */ |
| return; |
| |
| if (cs->invert && p->g->cflags®_NEWLINE) |
| cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); |
| |
| if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ |
| ordinary(p, ch); |
| freeset(p, cs); |
| } else |
| EMIT(OANYOF, (size_t)(cs - p->g->sets)); |
| } |
| |
| static int |
| p_range_cmp(wchar_t c1, wchar_t c2) |
| { |
| #ifdef REGEX_LIBC_COLLATE |
| return __wcollate_range_cmp(c1, c2); |
| #elif defined(NLS) |
| /* Copied from libc/collate __wcollate_range_cmp */ |
| wchar_t s1[2], s2[2]; |
| |
| s1[0] = c1; |
| s1[1] = L'\0'; |
| s2[0] = c2; |
| s2[1] = L'\0'; |
| return wcscoll(s1, s2); |
| #else |
| char s1[2], s2[2]; |
| |
| s1[0] = (char)c1; |
| s1[1] = '\0'; |
| s2[0] = (char)c2; |
| s2[1] = '\0'; |
| return strcoll(s1, s2); |
| #endif |
| } |
| |
| /* |
| - p_b_term - parse one term of a bracketed character list |
| == static void p_b_term(struct parse *p, cset *cs); |
| */ |
| static void |
| p_b_term(struct parse *p, cset *cs) |
| { |
| char c; |
| wint_t start, finish; |
| wint_t i; |
| #ifdef REGEX_LIBC_COLLATE |
| struct xlocale_collate *table = |
| (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; |
| #endif |
| |
| _DIAGASSERT(p != NULL); |
| _DIAGASSERT(cs != NULL); |
| |
| /* classify what we've got */ |
| switch ((MORE()) ? PEEK() : '\0') { |
| case '[': |
| c = (MORE2()) ? PEEK2() : '\0'; |
| break; |
| case '-': |
| SETERROR(REG_ERANGE); |
| return; /* NOTE RETURN */ |
| default: |
| c = '\0'; |
| break; |
| } |
| |
| switch (c) { |
| case ':': /* character class */ |
| NEXT2(); |
| (void)REQUIRE(MORE(), REG_EBRACK); |
| c = PEEK(); |
| (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); |
| p_b_cclass(p, cs); |
| (void)REQUIRE(MORE(), REG_EBRACK); |
| (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); |
| break; |
| case '=': /* equivalence class */ |
| NEXT2(); |
| (void)REQUIRE(MORE(), REG_EBRACK); |
| c = PEEK(); |
| (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); |
| p_b_eclass(p, cs); |
| (void)REQUIRE(MORE(), REG_EBRACK); |
| (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); |
| break; |
| default: /* symbol, ordinary character, or range */ |
| start = p_b_symbol(p); |
| if (SEE('-') && MORE2() && PEEK2() != ']') { |
| /* range */ |
| NEXT(); |
| if (EAT('-')) |
| finish = '-'; |
| else |
| finish = p_b_symbol(p); |
| } else |
| finish = start; |
| if (start == finish) |
| CHadd(p, cs, start); |
| else { |
| #ifdef REGEX_LIBC_COLLATE |
| if (table->__collate_load_error || MB_CUR_MAX > 1) { |
| #else |
| if (MB_CUR_MAX > 1) { |
| #endif |
| (void)REQUIRE(start <= finish, REG_ERANGE); |
| CHaddrange(p, cs, start, finish); |
| } else { |
| (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE); |
| for (i = 0; i <= UCHAR_MAX; i++) { |
| if (p_range_cmp(start, i) <= 0 && |
| p_range_cmp(i, finish) <= 0 ) |
| CHadd(p, cs, i); |
| } |
| } |
| } |
| break; |
| } |
| } |
| |
| #ifdef REGEX_GNU_EXTENSIONS |
| /* |
| - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S) |
| == static int p_b_pseudoclass(struct parse *p, char c) |
| */ |
| static int |
| p_b_pseudoclass(struct parse *p, char c) { |
| cset *cs; |
| |
| if ((cs = allocset(p)) == NULL) |
| return(0); |
| |
| if (p->g->cflags®_ICASE) |
| cs->icase = 1; |
| |
| switch (c) { |
| case 'W': |
| cs->invert = 1; |
| /* FALLTHROUGH */ |
| case 'w': |
| p_b_cclass_named(p, cs, "alnum"); |
| break; |
| case 'S': |
| cs->invert = 1; |
| /* FALLTHROUGH */ |
| case 's': |
| p_b_cclass_named(p, cs, "space"); |
| break; |
| default: |
| return(0); |
| } |
| |
| EMIT(OANYOF, (size_t)(cs - p->g->sets)); |
| return(1); |
| } |
| #endif |
| |
| /* |
| - p_b_cclass - parse a character-class name and deal with it |
| == static void p_b_cclass(struct parse *p, cset *cs); |
| */ |
| static void |
| p_b_cclass(struct parse *p, cset *cs) |
| { |
| const char *sp = p->next; |
| size_t len; |
| char clname[16]; |
| |
| while (MORE() && isalpha((uch)PEEK())) |
| NEXT(); |
| len = p->next - sp; |
| if (len >= sizeof(clname) - 1) { |
| SETERROR(REG_ECTYPE); |
| return; |
| } |
| memcpy(clname, sp, len); |
| clname[len] = '\0'; |
| |
| p_b_cclass_named(p, cs, clname); |
| } |
| |
| /* |
| - p_b_cclass_named - deal with a named character class |
| == static void p_b_cclass_named(struct parse *p, cset *cs, const char []); |
| */ |
| static void |
| p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) { |
| wctype_t wct; |
| |
| if ((wct = wctype(clname)) == 0) { |
| SETERROR(REG_ECTYPE); |
| return; |
| } |
| CHaddtype(p, cs, wct); |
| } |
| |
| /* |
| - p_b_eclass - parse an equivalence-class name and deal with it |
| == static void p_b_eclass(struct parse *p, cset *cs); |
| * |
| * This implementation is incomplete. xxx |
| */ |
| static void |
| p_b_eclass(struct parse *p, cset *cs) |
| { |
| wint_t c; |
| |
| _DIAGASSERT(p != NULL); |
| _DIAGASSERT(cs != NULL); |
| |
| c = p_b_coll_elem(p, '='); |
| CHadd(p, cs, c); |
| } |
| |
| /* |
| - p_b_symbol - parse a character or [..]ed multicharacter collating symbol |
| == static wint_t p_b_symbol(struct parse *p); |
| */ |
| static wint_t /* value of symbol */ |
| p_b_symbol(struct parse *p) |
| { |
| wint_t value; |
| |
| _DIAGASSERT(p != NULL); |
| |
| (void)REQUIRE(MORE(), REG_EBRACK); |
| if (!EATTWO('[', '.')) |
| return(WGETNEXT()); |
| |
| /* collating symbol */ |
| value = p_b_coll_elem(p, '.'); |
| (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); |
| return(value); |
| } |
| |
| /* |
| - p_b_coll_elem - parse a collating-element name and look it up |
| == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); |
| */ |
| static wint_t /* value of collating element */ |
| p_b_coll_elem(struct parse *p, |
| wint_t endc) /* name ended by endc,']' */ |
| { |
| const char *sp = p->next; |
| struct cname *cp; |
| size_t len; |
| |
| _DIAGASSERT(p != NULL); |
| |
| while (MORE() && !SEETWO(endc, ']')) |
| NEXT(); |
| if (!MORE()) { |
| SETERROR(REG_EBRACK); |
| return(0); |
| } |
| len = p->next - sp; |
| for (cp = cnames; cp->name != NULL; cp++) |
| if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) |
| return(cp->code); /* known name */ |
| #ifdef NLS |
| mbstate_t mbs; |
| wchar_t wc; |
| size_t clen; |
| |
| memset(&mbs, 0, sizeof(mbs)); |
| if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) |
| return (wc); /* single character */ |
| else if (clen == (size_t)-1 || clen == (size_t)-2) |
| SETERROR(REG_ILLSEQ); |
| else |
| SETERROR(REG_ECOLLATE); /* neither */ |
| return(0); |
| #else |
| if (len == 1) |
| return *sp; /* single character */ |
| SETERROR(REG_ECOLLATE); /* neither */ |
| return 0; |
| #endif |
| } |
| |
| /* |
| - may_escape - determine whether 'ch' is escape-able in the current context |
| == static int may_escape(struct parse *p, const wint_t ch) |
| */ |
| static bool |
| may_escape(struct parse *p, const wint_t ch) |
| { |
| |
| if ((p->pflags & PFLAG_LEGACY_ESC) != 0) |
| return (true); |
| if (isalpha(ch) || ch == '\'' || ch == '`') |
| return (false); |
| return (true); |
| #ifdef NOTYET |
| /* |
| * Build a whitelist of characters that may be escaped to produce an |
| * ordinary in the current context. This assumes that these have not |
| * been otherwise interpreted as a special character. Escaping an |
| * ordinary character yields undefined results according to |
| * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take |
| * advantage of this and use escaped ordinary characters to provide |
| * special meaning, e.g. \b, \B, \w, \W, \s, \S. |
| */ |
| switch(ch) { |
| case '|': |
| case '+': |
| case '?': |
| /* The above characters may not be escaped in BREs */ |
| if (!(p->g->cflags®_EXTENDED)) |
| return (false); |
| /* Fallthrough */ |
| case '(': |
| case ')': |
| case '{': |
| case '}': |
| case '.': |
| case '[': |
| case ']': |
| case '\\': |
| case '*': |
| case '^': |
| case '$': |
| return (true); |
| default: |
| return (false); |
| } |
| #endif |
| } |
| |
| /* |
| - othercase - return the case counterpart of an alphabetic |
| == static wint_t othercase(wint_t ch); |
| */ |
| static wint_t /* if no counterpart, return ch */ |
| othercase(wint_t ch) |
| { |
| assert(iswalpha(ch)); |
| if (iswupper(ch)) |
| return(towlower(ch)); |
| else if (iswlower(ch)) |
| return(towupper(ch)); |
| else /* peculiar, but could happen */ |
| return(ch); |
| } |
| |
| /* |
| - bothcases - emit a dualcase version of a two-case character |
| == static void bothcases(struct parse *p, wint_t ch); |
| * |
| * Boy, is this implementation ever a kludge... |
| */ |
| static void |
| bothcases(struct parse *p, wint_t ch) |
| { |
| const char *oldnext = p->next; |
| const char *oldend = p->end; |
| char bracket[3 + MB_LEN_MAX]; |
| size_t n; |
| |
| _DIAGASSERT(p != NULL); |
| |
| assert(othercase(ch) != ch); /* p_bracket() would recurse */ |
| p->next = bracket; |
| #ifdef NLS |
| mbstate_t mbs; |
| memset(&mbs, 0, sizeof(mbs)); |
| n = wcrtomb(bracket, ch, &mbs); |
| assert(n != (size_t)-1); |
| #else |
| n = 0; |
| bracket[n++] = ch; |
| #endif |
| bracket[n] = ']'; |
| bracket[n + 1] = '\0'; |
| p->end = bracket+n+1; |
| p_bracket(p); |
| assert(p->next == p->end); |
| p->next = oldnext; |
| p->end = oldend; |
| } |
| |
| /* |
| - ordinary - emit an ordinary character |
| == static void ordinary(struct parse *p, wint_t ch); |
| */ |
| static void |
| ordinary(struct parse *p, wint_t ch) |
| { |
| cset *cs; |
| |
| _DIAGASSERT(p != NULL); |
| |
| if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) |
| bothcases(p, ch); |
| else if ((wint_t)(ch & OPDMASK) == ch) |
| EMIT(OCHAR, (size_t)ch); |
| else { |
| /* |
| * Kludge: character is too big to fit into an OCHAR operand. |
| * Emit a singleton set. |
| */ |
| if ((cs = allocset(p)) == NULL) |
| return; |
| CHadd(p, cs, ch); |
| EMIT(OANYOF, (size_t)(cs - p->g->sets)); |
| } |
| } |
| |
| /* |
| - nonnewline - emit REG_NEWLINE version of OANY |
| == static void nonnewline(struct parse *p); |
| * |
| * Boy, is this implementation ever a kludge... |
| */ |
| static void |
| nonnewline(struct parse *p) |
| { |
| const char *oldnext = p->next; |
| const char *oldend = p->end; |
| char bracket[4]; |
| |
| _DIAGASSERT(p != NULL); |
| |
| p->next = bracket; |
| p->end = bracket+3; |
| bracket[0] = '^'; |
| bracket[1] = '\n'; |
| bracket[2] = ']'; |
| bracket[3] = '\0'; |
| p_bracket(p); |
| assert(p->next == bracket+3); |
| p->next = oldnext; |
| p->end = oldend; |
| } |
| |
| /* |
| - repeat - generate code for a bounded repetition, recursively if needed |
| == static void repeat(struct parse *p, sopno start, int from, int to); |
| */ |
| static void |
| repeat(struct parse *p, |
| sopno start, /* operand from here to end of strip */ |
| int from, /* repeated from this number */ |
| int to) /* to this number of times (maybe INFINITY) */ |
| { |
| sopno finish = HERE(); |
| # define N 2 |
| # define INF 3 |
| # define REP(f, t) ((f)*8 + (t)) |
| # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) |
| sopno copy; |
| |
| _DIAGASSERT(p != NULL); |
| |
| if (p->error != 0) /* head off possible runaway recursion */ |
| return; |
| |
| assert(from <= to); |
| |
| switch (REP(MAP(from), MAP(to))) { |
| case REP(0, 0): /* must be user doing this */ |
| DROP(finish-start); /* drop the operand */ |
| break; |
| case REP(0, 1): /* as x{1,1}? */ |
| case REP(0, N): /* as x{1,n}? */ |
| case REP(0, INF): /* as x{1,}? */ |
| /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
| INSERT(OCH_, start); /* offset is wrong... */ |
| repeat(p, start+1, 1, to); |
| ASTERN(OOR1, start); |
| AHEAD(start); /* ... fix it */ |
| EMIT(OOR2, 0); |
| AHEAD(THERE()); |
| ASTERN(O_CH, THERETHERE()); |
| break; |
| case REP(1, 1): /* trivial case */ |
| /* done */ |
| break; |
| case REP(1, N): /* as x?x{1,n-1} */ |
| /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
| INSERT(OCH_, start); |
| ASTERN(OOR1, start); |
| AHEAD(start); |
| EMIT(OOR2, 0); /* offset very wrong... */ |
| AHEAD(THERE()); /* ...so fix it */ |
| ASTERN(O_CH, THERETHERE()); |
| copy = dupl(p, start+1, finish+1); |
| assert(copy == finish+4); |
| repeat(p, copy, 1, to-1); |
| break; |
| case REP(1, INF): /* as x+ */ |
| INSERT(OPLUS_, start); |
| ASTERN(O_PLUS, start); |
| break; |
| case REP(N, N): /* as xx{m-1,n-1} */ |
| copy = dupl(p, start, finish); |
| repeat(p, copy, from-1, to-1); |
| break; |
| case REP(N, INF): /* as xx{n-1,INF} */ |
| copy = dupl(p, start, finish); |
| repeat(p, copy, from-1, to); |
| break; |
| default: /* "can't happen" */ |
| SETERROR(REG_ASSERT); /* just in case */ |
| break; |
| } |
| } |
| |
| /* |
| - wgetnext - helper function for WGETNEXT() macro. Gets the next wide |
| - character from the parse struct, signals a REG_ILLSEQ error if the |
| - character can't be converted. Returns the number of bytes consumed. |
| */ |
| static wint_t |
| wgetnext(struct parse *p) |
| { |
| #ifdef NLS |
| mbstate_t mbs; |
| wchar_t wc; |
| size_t n; |
| |
| memset(&mbs, 0, sizeof(mbs)); |
| n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs); |
| if (n == (size_t)-1 || n == (size_t)-2) { |
| SETERROR(REG_ILLSEQ); |
| return (0); |
| } |
| if (n == 0) |
| n = 1; |
| p->next += n; |
| return wc; |
| #else |
| return *p->next++; |
| #endif |
| } |
| |
| /* |
| - seterr - set an error condition |
| == static int seterr(struct parse *p, int e); |
| */ |
| static int /* useless but makes type checking happy */ |
| seterr(struct parse *p, int e) |
| { |
| |
| _DIAGASSERT(p != NULL); |
| |
| if (p->error == 0) /* keep earliest error condition */ |
| p->error = e; |
| p->next = nuls; /* try to bring things to a halt */ |
| p->end = nuls; |
| return(0); /* make the return value well-defined */ |
| } |
| |
| /* |
| - allocset - allocate a set of characters for [] |
| == static cset *allocset(struct parse *p); |
| */ |
| static cset * |
| allocset(struct parse *p) |
| { |
| cset *cs, *ncs; |
| |
| _DIAGASSERT(p != NULL); |
| |
| ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs)); |
| if (ncs == NULL) { |
| SETERROR(REG_ESPACE); |
| return (NULL); |
| } |
| p->g->sets = ncs; |
| cs = &p->g->sets[p->g->ncsets++]; |
| memset(cs, 0, sizeof(*cs)); |
| |
| return(cs); |
| } |
| |
| /* |
| - freeset - free a now-unused set |
| == static void freeset(struct parse *p, cset *cs); |
| */ |
| static void |
| freeset(struct parse *p, cset *cs) |
| { |
| cset *top; |
| |
| _DIAGASSERT(p != NULL); |
| _DIAGASSERT(cs != NULL); |
| |
| top = &p->g->sets[p->g->ncsets]; |
| |
| free(cs->wides); |
| free(cs->ranges); |
| free(cs->types); |
| memset(cs, 0, sizeof(*cs)); |
| if (cs == top-1) /* recover only the easy case */ |
| p->g->ncsets--; |
| } |
| |
| /* |
| - singleton - Determine whether a set contains only one character, |
| - returning it if so, otherwise returning OUT. |
| */ |
| static wint_t |
| singleton(cset *cs) |
| { |
| wint_t i, s, n; |
| |
| for (i = n = 0; i < NC; i++) |
| if (CHIN(cs, i)) { |
| n++; |
| s = i; |
| } |
| if (n == 1) |
| return (s); |
| if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && |
| cs->icase == 0) |
| return (cs->wides[0]); |
| /* Don't bother handling the other cases. */ |
| return (OUT); |
| } |
| |
| /* |
| - CHadd - add character to character set. |
| */ |
| static void |
| CHadd(struct parse *p, cset *cs, wint_t ch) |
| { |
| wint_t nch, *newwides; |
| |
| _DIAGASSERT(p != NULL); |
| _DIAGASSERT(cs != NULL); |
| |
| assert(ch >= 0); |
| if (ch < NC) |
| cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7); |
| else { |
| newwides = reallocarray(cs->wides, cs->nwides + 1, |
| sizeof(*cs->wides)); |
| if (newwides == NULL) { |
| SETERROR(REG_ESPACE); |
| return; |
| } |
| cs->wides = newwides; |
| cs->wides[cs->nwides++] = ch; |
| } |
| if (cs->icase) { |
| if ((nch = towlower(ch)) < NC) |
| cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7); |
| if ((nch = towupper(ch)) < NC) |
| cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7); |
| } |
| } |
| |
| /* |
| - CHaddrange - add all characters in the range [min,max] to a character set. |
| */ |
| static void |
| CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) |
| { |
| crange *newranges; |
| |
| _DIAGASSERT(p != NULL); |
| _DIAGASSERT(cs != NULL); |
| |
| for (; min < NC && min <= max; min++) |
| CHadd(p, cs, min); |
| if (min >= max) |
| return; |
| newranges = reallocarray(cs->ranges, cs->nranges + 1, |
| sizeof(*cs->ranges)); |
| if (newranges == NULL) { |
| SETERROR(REG_ESPACE); |
| return; |
| } |
| cs->ranges = newranges; |
| cs->ranges[cs->nranges].min = min; |
| cs->ranges[cs->nranges].max = max; |
| cs->nranges++; |
| } |
| |
| /* |
| - CHaddtype - add all characters of a certain type to a character set. |
| */ |
| static void |
| CHaddtype(struct parse *p, cset *cs, wctype_t wct) |
| { |
| wint_t i; |
| wctype_t *newtypes; |
| |
| _DIAGASSERT(p != NULL); |
| _DIAGASSERT(cs != NULL); |
| |
| for (i = 0; i < NC; i++) |
| if (iswctype(i, wct)) |
| CHadd(p, cs, i); |
| newtypes = reallocarray(cs->types, cs->ntypes + 1, |
| sizeof(*cs->types)); |
| if (newtypes == NULL) { |
| SETERROR(REG_ESPACE); |
| return; |
| } |
| cs->types = newtypes; |
| cs->types[cs->ntypes++] = wct; |
| } |
| |
| /* |
| - dupl - emit a duplicate of a bunch of sops |
| == static sopno dupl(struct parse *p, sopno start, sopno finish); |
| */ |
| static sopno /* start of duplicate */ |
| dupl(struct parse *p, |
| sopno start, /* from here */ |
| sopno finish) /* to this less one */ |
| { |
| sopno ret = HERE(); |
| sopno len = finish - start; |
| |
| _DIAGASSERT(p != NULL); |
| |
| assert(finish >= start); |
| if (len == 0) |
| return(ret); |
| if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ |
| return(ret); |
| (void) memcpy(p->strip + p->slen, |
| p->strip + start, len * sizeof(*p->strip)); |
| p->slen += len; |
| return(ret); |
| } |
| |
| /* |
| - doemit - emit a strip operator |
| == static void doemit(struct parse *p, sop op, size_t opnd); |
| * |
| * It might seem better to implement this as a macro with a function as |
| * hard-case backup, but it's just too big and messy unless there are |
| * some changes to the data structures. Maybe later. |
| */ |
| static void |
| doemit(struct parse *p, sop op, size_t opnd) |
| { |
| /* avoid making error situations worse */ |
| if (p->error != 0) |
| return; |
| |
| _DIAGASSERT(p != NULL); |
| |
| /* deal with oversize operands ("can't happen", more or less) */ |
| assert(opnd < 1<<OPSHIFT); |
| |
| /* deal with undersized strip */ |
| if (p->slen >= p->ssize) |
| if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ |
| return; |
| |
| /* finally, it's all reduced to the easy case */ |
| p->strip[p->slen++] = (sopno)SOP(op, opnd); |
| } |
| |
| /* |
| - doinsert - insert a sop into the strip |
| == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); |
| */ |
| static void |
| doinsert(struct parse *p, sop op, size_t opnd, sopno pos) |
| { |
| sopno sn; |
| sop s; |
| int i; |
| |
| _DIAGASSERT(p != NULL); |
| |
| /* avoid making error situations worse */ |
| if (p->error != 0) |
| return; |
| |
| sn = HERE(); |
| EMIT(op, opnd); /* do checks, ensure space */ |
| assert(HERE() == sn+1); |
| s = p->strip[sn]; |
| |
| /* adjust paren pointers */ |
| assert(pos > 0); |
| for (i = 1; i < NPAREN; i++) { |
| if (p->pbegin[i] >= pos) { |
| p->pbegin[i]++; |
| } |
| if (p->pend[i] >= pos) { |
| p->pend[i]++; |
| } |
| } |
| |
| memmove(&p->strip[pos+1], &p->strip[pos], |
| (HERE()-pos-1)*sizeof(*p->strip)); |
| p->strip[pos] = s; |
| } |
| |
| /* |
| - dofwd - complete a forward reference |
| == static void dofwd(struct parse *p, sopno pos, sop value); |
| */ |
| static void |
| dofwd(struct parse *p, sopno pos, sop value) |
| { |
| |
| _DIAGASSERT(p != NULL); |
| |
| /* avoid making error situations worse */ |
| if (p->error != 0) |
| return; |
| |
| assert(value < 1<<OPSHIFT); |
| p->strip[pos] = OP(p->strip[pos]) | value; |
| } |
| |
| /* |
| - enlarge - enlarge the strip |
| == static int enlarge(struct parse *p, sopno size); |
| */ |
| static int |
| enlarge(struct parse *p, sopno size) |
| { |
| sop *sp; |
| |
| _DIAGASSERT(p != NULL); |
| |
| if (p->ssize >= size) |
| return 1; |
| |
| sp = reallocarray(p->strip, size, sizeof(*p->strip)); |
| if (sp == NULL) { |
| SETERROR(REG_ESPACE); |
| return 0; |
| } |
| p->strip = sp; |
| p->ssize = size; |
| return 1; |
| } |
| |
| /* |
| - stripsnug - compact the strip |
| == static void stripsnug(struct parse *p, struct re_guts *g); |
| */ |
| static void |
| stripsnug(struct parse *p, struct re_guts *g) |
| { |
| |
| _DIAGASSERT(p != NULL); |
| _DIAGASSERT(g != NULL); |
| |
| g->nstates = p->slen; |
| g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip)); |
| if (g->strip == NULL) { |
| SETERROR(REG_ESPACE); |
| g->strip = p->strip; |
| } |
| } |
| |
| /* |
| - findmust - fill in must and mlen with longest mandatory literal string |
| == static void findmust(struct parse *p, struct re_guts *g); |
| * |
| * This algorithm could do fancy things like analyzing the operands of | |
| * for common subsequences. Someday. This code is simple and finds most |
| * of the interesting cases. |
| * |
| * Note that must and mlen got initialized during setup. |
| */ |
| static void |
| findmust(struct parse *p, struct re_guts *g) |
| { |
| sop *scan; |
| sop *start = NULL; |
| sop *newstart = NULL; |
| sopno newlen; |
| sop s; |
| char *cp; |
| int offset; |
| mbstate_t mbs; |
| |
| _DIAGASSERT(p != NULL); |
| _DIAGASSERT(g != NULL); |
| |
| /* avoid making error situations worse */ |
| if (p->error != 0) |
| return; |
| |
| #ifdef notyet |
| /* |
| * It's not generally safe to do a ``char'' substring search on |
| * multibyte character strings, but it's safe for at least |
| * UTF-8 (see RFC 3629). |
| */ |
| if (MB_CUR_MAX > 1 && |
| strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) |
| return; |
| #endif |
| |
| /* find the longest OCHAR sequence in strip */ |
| newlen = 0; |
| offset = 0; |
| g->moffset = 0; |
| scan = g->strip + 1; |
| do { |
| s = *scan++; |
| switch (OP(s)) { |
| case OCHAR: /* sequence member */ |
| if (newlen == 0) { /* new sequence */ |
| memset(&mbs, 0, sizeof(mbs)); |
| newstart = scan - 1; |
| } |
| #ifdef NLS |
| char buf[MB_LEN_MAX]; |
| size_t clen = wcrtomb(buf, (int)OPND(s), &mbs); |
| if (clen == (size_t)-1) |
| goto toohard; |
| newlen += (sopno)clen; |
| #else |
| newlen++; |
| #endif |
| break; |
| case OPLUS_: /* things that don't break one */ |
| case OLPAREN: |
| case ORPAREN: |
| break; |
| case OQUEST_: /* things that must be skipped */ |
| case OCH_: |
| offset = altoffset(scan, offset); |
| scan--; |
| do { |
| scan += OPND(s); |
| s = *scan; |
| /* assert() interferes w debug printouts */ |
| if (OP(s) != O_QUEST && |
| OP(s) != O_CH && OP(s) != OOR2) { |
| g->iflags |= BAD; |
| return; |
| } |
| } while (OP(s) != O_QUEST && OP(s) != O_CH); |
| /* FALLTHROUGH */ |
| case OBOW: /* things that break a sequence */ |
| case OEOW: |
| case OBOL: |
| case OEOL: |
| case OBOS: |
| case OEOS: |
| case OWBND: |
| case ONWBND: |
| case O_QUEST: |
| case O_CH: |
| case OEND: |
| if (newlen > (sopno)g->mlen) { /* ends one */ |
| start = newstart; |
| g->mlen = newlen; |
| if (offset > -1) { |
| g->moffset += offset; |
| offset = newlen; |
| } else |
| g->moffset = offset; |
| } else { |
| if (offset > -1) |
| offset += newlen; |
| } |
| newlen = 0; |
| break; |
| case OANY: |
| if (newlen > (sopno)g->mlen) { /* ends one */ |
| start = newstart; |
| g->mlen = newlen; |
| if (offset > -1) { |
| g->moffset += offset; |
| offset = newlen; |
| } else |
| g->moffset = offset; |
| } else { |
| if (offset > -1) |
| offset += newlen; |
| } |
| if (offset > -1) |
| offset++; |
| newlen = 0; |
| break; |
| case OANYOF: /* may or may not invalidate offset */ |
| /* First, everything as OANY */ |
| if (newlen > (sopno)g->mlen) { /* ends one */ |
| start = newstart; |
| g->mlen = newlen; |
| if (offset > -1) { |
| g->moffset += offset; |
| offset = newlen; |
| } else |
| g->moffset = offset; |
| } else { |
| if (offset > -1) |
| offset += newlen; |
| } |
| if (offset > -1) |
| offset++; |
| newlen = 0; |
| break; |
| #ifdef NLS |
| toohard:/*FALLTHROUGH*/ |
| #endif |
| default: |
| /* Anything here makes it impossible or too hard |
| * to calculate the offset -- so we give up; |
| * save the last known good offset, in case the |
| * must sequence doesn't occur later. |
| */ |
| if (newlen > (sopno)g->mlen) { /* ends one */ |
| start = newstart; |
| g->mlen = newlen; |
| if (offset > -1) |
| g->moffset += offset; |
| else |
| g->moffset = offset; |
| } |
| offset = -1; |
| newlen = 0; |
| break; |
| } |
| } while (OP(s) != OEND); |
| |
| if (g->mlen == 0) { /* there isn't one */ |
| g->moffset = -1; |
| return; |
| } |
| |
| /* turn it into a character string */ |
| g->must = malloc((size_t)g->mlen + 1); |
| if (g->must == NULL) { /* argh; just forget it */ |
| g->mlen = 0; |
| g->moffset = -1; |
| return; |
| } |
| cp = g->must; |
| scan = start; |
| memset(&mbs, 0, sizeof(mbs)); |
| while (cp < g->must + g->mlen) { |
| while (OP(s = *scan++) != OCHAR) |
| continue; |
| #ifdef NLS |
| size_t clen = wcrtomb(cp, (int)OPND(s), &mbs); |
| assert(clen != (size_t)-1); |
| cp += clen; |
| #else |
| *cp++ = OPND(s); |
| #endif |
| } |
| assert(cp == g->must + g->mlen); |
| *cp++ = '\0'; /* just on general principles */ |
| } |
| |
| /* |
| - altoffset - choose biggest offset among multiple choices |
| == static int altoffset(sop *scan, int offset); |
| * |
| * Compute, recursively if necessary, the largest offset among multiple |
| * re paths. |
| */ |
| static int |
| altoffset(sop *scan, int offset) |
| { |
| int largest; |
| int try; |
| sop s; |
| |
| _DIAGASSERT(scan != NULL); |
| |
| /* If we gave up already on offsets, return */ |
| if (offset == -1) |
| return -1; |
| |
| largest = 0; |
| try = 0; |
| s = *scan++; |
| while (OP(s) != O_QUEST && OP(s) != O_CH) { |
| switch (OP(s)) { |
| case OOR1: |
| if (try > largest) |
| largest = try; |
| try = 0; |
| break; |
| case OQUEST_: |
| case OCH_: |
| try = altoffset(scan, try); |
| if (try == -1) |
| return -1; |
| scan--; |
| do { |
| scan += OPND(s); |
| s = *scan; |
| if (OP(s) != O_QUEST && |
| OP(s) != O_CH && OP(s) != OOR2) |
| return -1; |
| } while (OP(s) != O_QUEST && OP(s) != O_CH); |
| /* We must skip to the next position, or we'll |
| * leave altoffset() too early. |
| */ |
| scan++; |
| break; |
| case OANYOF: |
| case OCHAR: |
| case OANY: |
| try++; |
| /*FALLTHROUGH*/ |
| case OBOW: |
| case OEOW: |
| case OWBND: |
| case ONWBND: |
| case OLPAREN: |
| case ORPAREN: |
| case OOR2: |
| break; |
| default: |
| try = -1; |
| break; |
| } |
| if (try == -1) |
| return -1; |
| s = *scan++; |
| } |
| |
| if (try > largest) |
| largest = try; |
| |
| return largest+offset; |
| } |
| |
| /* |
| - computejumps - compute char jumps for BM scan |
| == static void computejumps(struct parse *p, struct re_guts *g); |
| * |
| * This algorithm assumes g->must exists and is has size greater than |
| * zero. It's based on the algorithm found on Computer Algorithms by |
| * Sara Baase. |
| * |
| * A char jump is the number of characters one needs to jump based on |
| * the value of the character from the text that was mismatched. |
| */ |
| static void |
| computejumps(struct parse *p, struct re_guts *g) |
| { |
| int ch; |
| size_t mindex; |
| |
| _DIAGASSERT(p != NULL); |
| _DIAGASSERT(g != NULL); |
| |
| /* Avoid making errors worse */ |
| if (p->error != 0) |
| return; |
| |
| g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump)); |
| if (g->charjump == NULL) /* Not a fatal error */ |
| return; |
| /* Adjust for signed chars, if necessary */ |
| g->charjump = &g->charjump[-(CHAR_MIN)]; |
| |
| /* If the character does not exist in the pattern, the jump |
| * is equal to the number of characters in the pattern. |
| */ |
| for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) |
| g->charjump[ch] = g->mlen; |
| |
| /* If the character does exist, compute the jump that would |
| * take us to the last character in the pattern equal to it |
| * (notice that we match right to left, so that last character |
| * is the first one that would be matched). |
| */ |
| for (mindex = 0; mindex < g->mlen; mindex++) |
| g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; |
| } |
| |
| /* |
| - computematchjumps - compute match jumps for BM scan |
| == static void computematchjumps(struct parse *p, struct re_guts *g); |
| * |
| * This algorithm assumes g->must exists and is has size greater than |
| * zero. It's based on the algorithm found on Computer Algorithms by |
| * Sara Baase. |
| * |
| * A match jump is the number of characters one needs to advance based |
| * on the already-matched suffix. |
| * Notice that all values here are minus (g->mlen-1), because of the way |
| * the search algorithm works. |
| */ |
| static void |
| computematchjumps(struct parse *p, struct re_guts *g) |
| { |
| size_t mindex; /* General "must" iterator */ |
| size_t suffix; /* Keeps track of matching suffix */ |
| size_t ssuffix; /* Keeps track of suffixes' suffix */ |
| size_t* pmatches; /* pmatches[k] points to the next i |
| * such that i+1...mlen is a substring |
| * of k+1...k+mlen-i-1 |
| */ |
| |
| _DIAGASSERT(p != NULL); |
| _DIAGASSERT(g != NULL); |
| |
| /* Avoid making errors worse */ |
| if (p->error != 0) |
| return; |
| |
| pmatches = calloc(g->mlen, sizeof(*pmatches)); |
| if (pmatches == NULL) { |
| g->matchjump = NULL; |
| return; |
| } |
| |
| g->matchjump = calloc(g->mlen, sizeof(*g->matchjump)); |
| if (g->matchjump == NULL) { /* Not a fatal error */ |
| free(pmatches); |
| return; |
| } |
| |
| /* Set maximum possible jump for each character in the pattern */ |
| for (mindex = 0; mindex < g->mlen; mindex++) |
| g->matchjump[mindex] = 2 * g->mlen - mindex - 1; |
| |
| /* Compute pmatches[] */ |
| for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) { |
| pmatches[mindex] = suffix; |
| |
| /* If a mismatch is found, interrupting the substring, |
| * compute the matchjump for that position. If no |
| * mismatch is found, then a text substring mismatched |
| * against the suffix will also mismatch against the |
| * substring. |
| */ |
| while (suffix < g->mlen |
| && g->must[mindex] != g->must[suffix]) { |
| g->matchjump[suffix] = MIN(g->matchjump[suffix], |
| g->mlen - mindex - 1); |
| suffix = pmatches[suffix]; |
| } |
| } |
| |
| /* Compute the matchjump up to the last substring found to jump |
| * to the beginning of the largest must pattern prefix matching |
| * it's own suffix. |
| */ |
| for (mindex = 0; mindex <= suffix; mindex++) |
| g->matchjump[mindex] = MIN(g->matchjump[mindex], |
| g->mlen + suffix - mindex); |
| |
| ssuffix = pmatches[suffix]; |
| while (suffix < g->mlen) { |
| while (suffix <= ssuffix && suffix < g->mlen) { |
| g->matchjump[suffix] = MIN(g->matchjump[suffix], |
| g->mlen + ssuffix - suffix); |
| suffix++; |
| } |
| if (suffix < g->mlen) |
| ssuffix = pmatches[ssuffix]; |
| } |
| |
| free(pmatches); |
| } |
| |
| /* |
| - pluscount - count + nesting |
| == static sopno pluscount(struct parse *p, struct re_guts *g); |
| */ |
| static sopno /* nesting depth */ |
| pluscount(struct parse *p, struct re_guts *g) |
| { |
| sop *scan; |
| sop s; |
| sopno plusnest = 0; |
| sopno maxnest = 0; |
| |
| _DIAGASSERT(p != NULL); |
| _DIAGASSERT(g != NULL); |
| |
| if (p->error != 0) |
| return(0); /* there may not be an OEND */ |
| |
| scan = g->strip + 1; |
| do { |
| s = *scan++; |
| switch (OP(s)) { |
| case OPLUS_: |
| plusnest++; |
| break; |
| case O_PLUS: |
| if (plusnest > maxnest) |
| maxnest = plusnest; |
| plusnest--; |
| break; |
| } |
| } while (OP(s) != OEND); |
| if (plusnest != 0) |
| g->iflags |= BAD; |
| return(maxnest); |
| } |