| /************************************************* |
| * Perl-Compatible Regular Expressions * |
| *************************************************/ |
| |
| /* PCRE is a library of functions to support regular expressions whose syntax |
| and semantics are as close as possible to those of the Perl 5 language. |
| |
| Written by Philip Hazel |
| Copyright (c) 1997-2012 University of Cambridge |
| |
| ----------------------------------------------------------------------------- |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions are met: |
| |
| * Redistributions of source code must retain the above copyright notice, |
| this list of conditions and the following disclaimer. |
| |
| * Redistributions in binary form must reproduce the above copyright |
| notice, this list of conditions and the following disclaimer in the |
| documentation and/or other materials provided with the distribution. |
| |
| * Neither the name of the University of Cambridge nor the names of its |
| contributors may be used to endorse or promote products derived from |
| this software without specific prior written permission. |
| |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| POSSIBILITY OF SUCH DAMAGE. |
| ----------------------------------------------------------------------------- |
| */ |
| |
| |
| /* This module contains the external function pcre_compile(), along with |
| supporting internal functions that are not used by other modules. */ |
| |
| |
| #include "config.h" |
| |
| #define NLBLOCK cd /* Block containing newline information */ |
| #define PSSTART start_pattern /* Field containing processed string start */ |
| #define PSEND end_pattern /* Field containing processed string end */ |
| |
| #include "pcre_internal.h" |
| |
| #ifdef GLIB_COMPILATION |
| #include "gstrfuncs.h" |
| #else |
| #include <glib.h> |
| #endif |
| |
| /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which |
| is also used by pcretest. PCRE_DEBUG is not defined when building a production |
| library. We do not need to select pcre16_printint.c specially, because the |
| COMPILE_PCREx macro will already be appropriately set. */ |
| |
| #ifdef PCRE_DEBUG |
| /* pcre_printint.c should not include any headers */ |
| #define PCRE_INCLUDED |
| #include "pcre_printint.c" |
| #undef PCRE_INCLUDED |
| #endif |
| |
| |
/* Macro for setting individual bits in class bitmaps. It sets bit number b in
the byte-array bitmap a, that is, bit (b % 8) of byte (b / 8). Both arguments
are parenthesized so that an expression argument such as "c + 1" selects the
intended byte and bit instead of being torn apart by operator precedence, and
the whole expansion is parenthesized so that it behaves as one expression.
Note that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
| |
| /* Maximum length value to check against when making sure that the integer that |
| holds the compiled pattern length does not overflow. We make it a bit less than |
| INT_MAX to allow for adding in group terminating bytes, so that we don't have |
| to check them every time. */ |
| |
| #define OFLOW_MAX (INT_MAX - 20) |
| |
| |
| /************************************************* |
| * Code parameters and static tables * |
| *************************************************/ |
| |
| /* This value specifies the size of stack workspace that is used during the |
| first pre-compile phase that determines how much memory is required. The regex |
| is partly compiled into this space, but the compiled parts are discarded as |
| soon as they can be, so that hopefully there will never be an overrun. The code |
| does, however, check for an overrun. The largest amount I've seen used is 218, |
| so this number is very generous. |
| |
| The same workspace is used during the second, actual compile phase for |
| remembering forward references to groups so that they can be filled in at the |
| end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE |
| is 4 there is plenty of room for most patterns. However, the memory can get |
| filled up by repetitions of forward references, for example patterns like |
| /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so |
| that the workspace is expanded using malloc() in this situation. The value |
| below is therefore a minimum, and we put a maximum on it for safety. The |
| minimum is now also defined in terms of LINK_SIZE so that the use of malloc() |
| kicks in at the same number of forward references in all cases. */ |
| |
| #define COMPILE_WORK_SIZE (2048*LINK_SIZE) |
| #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE) |
| |
| /* The overrun tests check for a slightly smaller size so that they detect the |
| overrun before it actually does run off the end of the data block. */ |
| |
| #define WORK_SIZE_SAFETY_MARGIN (100) |
| |
| /* Private flags added to firstchar and reqchar. */ |
| |
| #define REQ_CASELESS 0x10000000l /* Indicates caselessness */ |
| #define REQ_VARY 0x20000000l /* Reqchar followed non-literal item */ |
| |
| /* Repeated character flags. */ |
| |
| #define UTF_LENGTH 0x10000000l /* The char contains its length. */ |
| |
| /* Table for handling escaped characters in the range '0'-'z'. Positive returns |
| are simple data values; negative values are for special things like \d and so |
| on. Zero means further processing is needed (for things like \x), or the escape |
| is invalid. */ |
| |
| #ifndef EBCDIC |
| |
| /* This is the "normal" table for ASCII systems or for EBCDIC systems running |
| in UTF-8 mode. */ |
| |
/* Lookup table for the character that follows a backslash. It is indexed by
(c - CHAR_0), so in this non-EBCDIC build it covers '0' through 'z' in ASCII
order (75 entries). A non-zero entry is used directly as the result of the
escape: positive entries are plain data values and negated ESC_xxx entries
identify special escapes. A zero entry means the table does not resolve the
escape and the caller has to process it further. The comment at the end of
each row names the characters that the row's entries belong to. */

static const short int escapes[] = {
     0,                       0,                          /* 0 1 */
     0,                       0,                          /* 2 3 */
     0,                       0,                          /* 4 5 */
     0,                       0,                          /* 6 7 */
     0,                       0,                          /* 8 9 */
     CHAR_COLON,              CHAR_SEMICOLON,             /* : ; */
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* < = */
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* > ? */
     CHAR_COMMERCIAL_AT,      -ESC_A,                     /* @ A */
     -ESC_B,                  -ESC_C,                     /* B C */
     -ESC_D,                  -ESC_E,                     /* D E */
     0,                       -ESC_G,                     /* F G */
     -ESC_H,                  0,                          /* H I */
     0,                       -ESC_K,                     /* J K */
     0,                       0,                          /* L M */
     -ESC_N,                  0,                          /* N O */
     -ESC_P,                  -ESC_Q,                     /* P Q */
     -ESC_R,                  -ESC_S,                     /* R S */
     0,                       0,                          /* T U */
     -ESC_V,                  -ESC_W,                     /* V W */
     -ESC_X,                  0,                          /* X Y */
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* Z [ */
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* backslash ] */
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* ^ _ */
     CHAR_GRAVE_ACCENT,       7,                          /* ` a (7 is ASCII BEL) */
     -ESC_b,                  0,                          /* b c */
     -ESC_d,                  ESC_e,                      /* d e */
     ESC_f,                   0,                          /* f g */
     -ESC_h,                  0,                          /* h i */
     0,                       -ESC_k,                     /* j k */
     0,                       0,                          /* l m */
     ESC_n,                   0,                          /* n o */
     -ESC_p,                  0,                          /* p q */
     ESC_r,                   -ESC_s,                     /* r s */
     ESC_tee,                 0,                          /* t u */
     -ESC_v,                  -ESC_w,                     /* v w */
     0,                       0,                          /* x y */
     -ESC_z                                               /* z */
};
| |
| #else |
| |
| /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */ |
| |
| static const short int escapes[] = { |
| /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', |
| /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, |
| /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~', |
| /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0, |
| /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', |
| /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, |
| /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', |
| /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, |
| /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0, |
| /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p, |
| /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, |
| /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0, |
| /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, |
| /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, |
| /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', |
| /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, |
| /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0, |
| /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P, |
| /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, |
| /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X, |
| /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, |
| /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, |
| /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 |
| }; |
| #endif |
| |
| |
| /* Table of special "verbs" like (*PRUNE). This is a short table, so it is |
| searched linearly. Put all the names into a single string, in order to reduce |
| the number of relocations when a shared library is dynamically linked. The |
| string is built from string macros so that it works in UTF-8 mode on EBCDIC |
| platforms. */ |
| |
/* Describes one special verb: the length of its name and the opcodes to use
when the verb is written without and with an argument. A value of -1 in an
opcode field marks that form as unavailable for the verb. */

typedef struct verbitem {
  int   len;                 /* Length of verb name */
  int   op;                  /* Op when no arg, or -1 if arg mandatory */
  int   op_arg;              /* Op when arg present, or -1 if not allowed */
} verbitem;
| |
/* The verb names, concatenated into one string. The STRING_xxx0 macros are
defined elsewhere; presumably each expands to the name followed by a NUL, so
that the names can be walked one after another -- TODO confirm against the
header that defines them. */

static const char verbnames[] =
  "\0"                       /* Empty name is a shorthand for MARK */
  STRING_MARK0
  STRING_ACCEPT0
  STRING_COMMIT0
  STRING_F0
  STRING_FAIL0
  STRING_PRUNE0
  STRING_SKIP0
  STRING_THEN;

/* One entry per name in verbnames, in the same order; the len fields match
the lengths of the corresponding names. Fields are { len, op, op_arg }. */

static const verbitem verbs[] = {
  { 0, -1,        OP_MARK },               /* (empty name): argument required */
  { 4, -1,        OP_MARK },               /* MARK: argument required */
  { 6, OP_ACCEPT, -1 },                    /* ACCEPT: no argument allowed */
  { 6, OP_COMMIT, -1 },                    /* COMMIT: no argument allowed */
  { 1, OP_FAIL,   -1 },                    /* F: no argument allowed */
  { 4, OP_FAIL,   -1 },                    /* FAIL: no argument allowed */
  { 5, OP_PRUNE,  OP_PRUNE_ARG },          /* PRUNE: argument optional */
  { 4, OP_SKIP,   OP_SKIP_ARG  },          /* SKIP: argument optional */
  { 4, OP_THEN,   OP_THEN_ARG  }           /* THEN: argument optional */
};

/* Number of entries in verbs[], computed from the array size. */

static const int verbcount = sizeof(verbs)/sizeof(verbitem);
| |
| |
| /* Tables of names of POSIX character classes and their lengths. The names are |
| now all in a single string, to reduce the number of relocations when a shared |
| library is dynamically loaded. The list of lengths is terminated by a zero |
| length entry. The first three must be alpha, lower, upper, as this is assumed |
| for handling case independence. */ |
| |
/* The POSIX class names, all in one string. The first three entries are
alpha, lower, upper, in that order. */

static const char posix_names[] =
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  STRING_word0  STRING_xdigit;

/* Length of each name in posix_names, in the same order: twelve five-letter
names, then "word" (4) and "xdigit" (6). The final 0 terminates the list. */

static const pcre_uint8 posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
| |
| /* Table of class bit maps for each POSIX class. Each class is formed from a |
| base map, with an optional addition or removal of another map. Then, for some |
| classes, there is some additional tweaking: for [:blank:] the vertical space |
| characters are removed, and for [:alpha:] and [:alnum:] the underscore |
| character is removed. The triples in the table consist of the base map offset, |
| second map offset or -1 if no second map, and a non-negative value for map |
| addition or a negative value for map subtraction (if there are two maps). The |
| absolute value of the third field has these meanings: 0 => no tweaking, 1 => |
| remove vertical space characters, 2 => remove underscore. */ |
| |
/* Three ints per POSIX class, in the same order as the class names:
{ base map offset, second map offset or -1, combine/tweak code }. When a
second map is given, a non-negative code adds it to the base map and a
negative code subtracts it. The absolute value of the code selects the tweak:
0 => none, 1 => remove vertical space characters, 2 => remove underscore. */

static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha: word minus digit, underscore removed */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii: print plus cntrl */
  cbit_space, -1,          1,             /* blank - a GNU extension; space without vertical space */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
| |
| /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class |
| substitutes must be in the order of the names, defined above, and there are |
| both positive and negative cases. NULL means no substitute. */ |
| |
| #ifdef SUPPORT_UCP |
| static const pcre_uchar string_PNd[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| static const pcre_uchar string_pNd[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| static const pcre_uchar string_PXsp[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| static const pcre_uchar string_pXsp[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| static const pcre_uchar string_PXwd[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| static const pcre_uchar string_pXwd[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| |
| static const pcre_uchar *substitutes[] = { |
| string_PNd, /* \D */ |
| string_pNd, /* \d */ |
| string_PXsp, /* \S */ /* NOTE: Xsp is Perl space */ |
| string_pXsp, /* \s */ |
| string_PXwd, /* \W */ |
| string_pXwd /* \w */ |
| }; |
| |
| static const pcre_uchar string_pL[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| static const pcre_uchar string_pLl[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| static const pcre_uchar string_pLu[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| static const pcre_uchar string_pXan[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| static const pcre_uchar string_h[] = { |
| CHAR_BACKSLASH, CHAR_h, '\0' }; |
| static const pcre_uchar string_pXps[] = { |
| CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| static const pcre_uchar string_PL[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| static const pcre_uchar string_PLl[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| static const pcre_uchar string_PLu[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| static const pcre_uchar string_PXan[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| static const pcre_uchar string_H[] = { |
| CHAR_BACKSLASH, CHAR_H, '\0' }; |
| static const pcre_uchar string_PXps[] = { |
| CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET, |
| CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' }; |
| |
| static const pcre_uchar *posix_substitutes[] = { |
| string_pL, /* alpha */ |
| string_pLl, /* lower */ |
| string_pLu, /* upper */ |
| string_pXan, /* alnum */ |
| NULL, /* ascii */ |
| string_h, /* blank */ |
| NULL, /* cntrl */ |
| string_pNd, /* digit */ |
| NULL, /* graph */ |
| NULL, /* print */ |
| NULL, /* punct */ |
| string_pXps, /* space */ /* NOTE: Xps is POSIX space */ |
| string_pXwd, /* word */ |
| NULL, /* xdigit */ |
| /* Negated cases */ |
| string_PL, /* ^alpha */ |
| string_PLl, /* ^lower */ |
| string_PLu, /* ^upper */ |
| string_PXan, /* ^alnum */ |
| NULL, /* ^ascii */ |
| string_H, /* ^blank */ |
| NULL, /* ^cntrl */ |
| string_PNd, /* ^digit */ |
| NULL, /* ^graph */ |
| NULL, /* ^print */ |
| NULL, /* ^punct */ |
| string_PXps, /* ^space */ /* NOTE: Xps is POSIX space */ |
| string_PXwd, /* ^word */ |
| NULL /* ^xdigit */ |
| }; |
| #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *)) |
| #endif |
| |
| #define STRING(a) # a |
| #define XSTRING(s) STRING(s) |
| |
| /* The texts of compile-time error messages. These are "char *" because they |
| are passed to the outside world. Do not ever re-use any error number, because |
| they are documented. Always add a new error instead. Messages marked DEAD below |
| are no longer used. This used to be a table of strings, but in order to reduce |
| the number of relocations needed when a shared library is loaded dynamically, |
| it is now one long string. We cannot use a table of offsets, because the |
| lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we |
| simply count through to the one we want - this isn't a performance issue |
| because these strings are used only when there is a compilation error. |
| |
| Each substring ends with \0 to insert a null character. This includes the final |
| substring, so that the whole string ends with \0\0, which can be detected when |
| counting through. */ |
| |
| static const char error_texts[] = |
| "no error\0" |
| "\\ at end of pattern\0" |
| "\\c at end of pattern\0" |
| "unrecognized character follows \\\0" |
| "numbers out of order in {} quantifier\0" |
| /* 5 */ |
| "number too big in {} quantifier\0" |
| "missing terminating ] for character class\0" |
| "invalid escape sequence in character class\0" |
| "range out of order in character class\0" |
| "nothing to repeat\0" |
| /* 10 */ |
| "operand of unlimited repeat could match the empty string\0" /** DEAD **/ |
| "internal error: unexpected repeat\0" |
| "unrecognized character after (? or (?-\0" |
| "POSIX named classes are supported only within a class\0" |
| "missing )\0" |
| /* 15 */ |
| "reference to non-existent subpattern\0" |
| "erroffset passed as NULL\0" |
| "unknown option bit(s) set\0" |
| "missing ) after comment\0" |
| "parentheses nested too deeply\0" /** DEAD **/ |
| /* 20 */ |
| "regular expression is too large\0" |
| "failed to get memory\0" |
| "unmatched parentheses\0" |
| "internal error: code overflow\0" |
| "unrecognized character after (?<\0" |
| /* 25 */ |
| "lookbehind assertion is not fixed length\0" |
| "malformed number or name after (?(\0" |
| "conditional group contains more than two branches\0" |
| "assertion expected after (?(\0" |
| "(?R or (?[+-]digits must be followed by )\0" |
| /* 30 */ |
| "unknown POSIX class name\0" |
| "POSIX collating elements are not supported\0" |
| "this version of PCRE is compiled without UTF support\0" |
| "spare error\0" /** DEAD **/ |
| "character value in \\x{...} sequence is too large\0" |
| /* 35 */ |
| "invalid condition (?(0)\0" |
| "\\C not allowed in lookbehind assertion\0" |
| "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0" |
| "number after (?C is > 255\0" |
| "closing ) for (?C expected\0" |
| /* 40 */ |
| "recursive call could loop indefinitely\0" |
| "unrecognized character after (?P\0" |
| "syntax error in subpattern name (missing terminator)\0" |
| "two named subpatterns have the same name\0" |
| "invalid UTF-8 string\0" |
| /* 45 */ |
| "support for \\P, \\p, and \\X has not been compiled\0" |
| "malformed \\P or \\p sequence\0" |
| "unknown property name after \\P or \\p\0" |
| "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0" |
| "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0" |
| /* 50 */ |
| "repeated subpattern is too long\0" /** DEAD **/ |
| "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0" |
| "internal error: overran compiling workspace\0" |
| "internal error: previously-checked referenced subpattern not found\0" |
| "DEFINE group contains more than one branch\0" |
| /* 55 */ |
| "repeating a DEFINE group is not allowed\0" /** DEAD **/ |
| "inconsistent NEWLINE options\0" |
| "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" |
| "a numbered reference must not be zero\0" |
| "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0" |
| /* 60 */ |
| "(*VERB) not recognized\0" |
| "number is too big\0" |
| "subpattern name expected\0" |
| "digit expected after (?+\0" |
| "] is an invalid data character in JavaScript compatibility mode\0" |
| /* 65 */ |
| "different names for subpatterns of the same number are not allowed\0" |
| "(*MARK) must have an argument\0" |
| "this version of PCRE is not compiled with Unicode property support\0" |
| "\\c must be followed by an ASCII character\0" |
| "\\k is not followed by a braced, angle-bracketed, or quoted name\0" |
| /* 70 */ |
| "internal error: unknown opcode in find_fixedlength()\0" |
| "\\N is not supported in a class\0" |
| "too many forward references\0" |
| "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0" |
| "invalid UTF-16 string\0" |
| /* 75 */ |
| "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0" |
| "character value in \\u.... sequence is too large\0" |
| ; |
| |
| /* Table to identify digits and hex digits. This is used when compiling |
| patterns. Note that the tables in chartables are dependent on the locale, and |
| may mark arbitrary characters as digits - but the PCRE compiling code expects |
| to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have |
| a private table here. It costs 256 bytes, but it is a lot faster than doing |
| character value tests (at least in some simple cases I timed), and in some |
| applications one wants PCRE to compile efficiently as well as match |
| efficiently. |
| |
| For convenience, we use the same bit definitions as in chartables: |
| |
| 0x04 decimal digit |
| 0x08 hexadecimal digit |
| |
| Then we can use ctype_digit and ctype_xdigit in the code. */ |
| |
| /* Using a simple comparison for decimal numbers rather than a memory read |
| is much faster, and the resulting code is simpler (the compiler turns it |
| into a subtraction and unsigned comparison). */ |
| |
| #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9) |
| |
| #if 0 |
| #ifndef EBCDIC |
| |
| /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in |
| UTF-8 mode. */ |
| |
| static const pcre_uint8 digitab[] = |
| { |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ |
| 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */ |
| 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ |
| 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */ |
| 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ |
| |
| #else |
| |
| /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */ |
| |
| static const pcre_uint8 digitab[] = |
| { |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */ |
| 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ |
| 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */ |
| 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */ |
| 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */ |
| |
| static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */ |
| 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */ |
| 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */ |
| 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ |
| 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ |
| 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */ |
| 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */ |
| 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */ |
| 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */ |
| 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */ |
| 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */ |
| 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */ |
| 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */ |
| 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */ |
| 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */ |
| 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */ |
| 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ |
| 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */ |
| 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */ |
| 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */ |
| 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */ |
| 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */ |
| 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */ |
| 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */ |
| 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */ |
| #endif |
| #endif /* 0 */ |
| |
| /* Definition to allow mutual recursion */ |
| |
| static BOOL |
| compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int, |
| int *, int *, branch_chain *, compile_data *, int *); |
| |
| |
| |
| /************************************************* |
| * Find an error text * |
| *************************************************/ |
| |
| /* The error texts are now all in one long string, to save on relocations. As |
| some of the text is of unknown length, we can't use a table of offsets. |
| Instead, just count through the strings. This is not a performance issue |
| because it happens only when there has been a compilation error. |
| |
| Argument: the error number |
| Returns: pointer to the error string |
| */ |
| |
| static const char * |
| find_error_text(int n) |
| { |
| const char *s = error_texts; |
| for (; n > 0; n--) |
| { |
| while (*s++ != 0) {}; |
| if (*s == 0) return "Error text not found (please report)"; |
| } |
| return s; |
| } |
| |
| |
| /************************************************* |
| * Expand the workspace * |
| *************************************************/ |
| |
| /* This function is called during the second compiling phase, if the number of |
| forward references fills the existing workspace, which is originally a block on |
| the stack. A larger block is obtained from malloc() unless the ultimate limit |
| has been reached or the increase will be rather small. |
| |
| Argument: pointer to the compile data block |
| Returns: 0 if all went well, else an error number |
| */ |
| |
| static int |
| expand_workspace(compile_data *cd) |
| { |
| pcre_uchar *newspace; |
| int newsize = cd->workspace_size * 2; |
| |
| if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX; |
| if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX || |
| newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN) |
| return ERR72; |
| |
| newspace = (PUBL(malloc))(IN_UCHARS(newsize)); |
| if (newspace == NULL) return ERR21; |
| memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar)); |
| cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace); |
| if (cd->workspace_size > COMPILE_WORK_SIZE) |
| (PUBL(free))((void *)cd->start_workspace); |
| cd->start_workspace = newspace; |
| cd->workspace_size = newsize; |
| return 0; |
| } |
| |
| |
| |
| /************************************************* |
| * Check for counted repeat * |
| *************************************************/ |
| |
| /* This function is called when a '{' is encountered in a place where it might |
| start a quantifier. It looks ahead to see if it really is a quantifier or not. |
| It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd} |
| where the ddds are digits. |
| |
| Arguments: |
| p pointer to the first char after '{' |
| |
| Returns: TRUE or FALSE |
| */ |
| |
| static BOOL |
| is_counted_repeat(const pcre_uchar *p) |
| { |
| if (!IS_DIGIT(*p)) return FALSE; |
| p++; |
| while (IS_DIGIT(*p)) p++; |
| if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; |
| |
| if (*p++ != CHAR_COMMA) return FALSE; |
| if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; |
| |
| if (!IS_DIGIT(*p)) return FALSE; |
| p++; |
| while (IS_DIGIT(*p)) p++; |
| |
| return (*p == CHAR_RIGHT_CURLY_BRACKET); |
| } |
| |
| |
| |
| /************************************************* |
| * Handle escapes * |
| *************************************************/ |
| |
| /* This function is called when a \ has been encountered. It either returns a |
| positive value for a simple escape such as \n, or a negative value which |
| encodes one of the more complicated things such as \d. A backreference to group |
| n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When |
| UTF-8 is enabled, a positive value greater than 255 may be returned. On entry, |
| ptr is pointing at the \. On exit, it is on the final character of the escape |
| sequence. |
| |
| Arguments: |
| ptrptr points to the pattern position pointer |
| errorcodeptr points to the errorcode variable |
| bracount number of previous extracting brackets |
| options the options bits |
| isclass TRUE if inside a character class |
| |
| Returns: zero or positive => a data character |
| negative => a special escape sequence |
| on error, errorcodeptr is set |
| */ |
| |
| static int |
| check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount, |
| int options, BOOL isclass) |
| { |
| /* PCRE_UTF16 has the same value as PCRE_UTF8. */ |
| BOOL utf = (options & PCRE_UTF8) != 0; |
| const pcre_uchar *ptr = *ptrptr + 1; |
| pcre_int32 c; |
| int i; |
| |
| GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ |
| ptr--; /* Set pointer back to the last byte */ |
| |
| /* If backslash is at the end of the pattern, it's an error. */ |
| |
| if (c == 0) *errorcodeptr = ERR1; |
| |
| /* Non-alphanumerics are literals. For digits or letters, do an initial lookup |
| in a table. A non-zero result is something that can be returned immediately. |
| Otherwise further processing may be required. */ |
| |
| #ifndef EBCDIC /* ASCII/UTF-8 coding */ |
| /* Not alphanumeric */ |
| else if (c < CHAR_0 || c > CHAR_z) {} |
| else if ((i = escapes[c - CHAR_0]) != 0) c = i; |
| |
| #else /* EBCDIC coding */ |
| /* Not alphanumeric */ |
| else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {} |
| else if ((i = escapes[c - 0x48]) != 0) c = i; |
| #endif |
| |
| /* Escapes that need further processing, or are illegal. */ |
| |
| else |
| { |
| const pcre_uchar *oldptr; |
| BOOL braced, negated; |
| |
| switch (c) |
| { |
| /* A number of Perl escapes are not handled by PCRE. We give an explicit |
| error. */ |
| |
| case CHAR_l: |
| case CHAR_L: |
| *errorcodeptr = ERR37; |
| break; |
| |
| case CHAR_u: |
| if ((options & PCRE_JAVASCRIPT_COMPAT) != 0) |
| { |
| /* In JavaScript, \u must be followed by four hexadecimal numbers. |
| Otherwise it is a lowercase u letter. */ |
| if (MAX_255(ptr[1]) && g_ascii_isxdigit(ptr[1]) != 0 |
| && MAX_255(ptr[2]) && g_ascii_isxdigit(ptr[2]) != 0 |
| && MAX_255(ptr[3]) && g_ascii_isxdigit(ptr[3]) != 0 |
| && MAX_255(ptr[4]) && g_ascii_isxdigit(ptr[4]) != 0) |
| { |
| c = 0; |
| for (i = 0; i < 4; ++i) |
| { |
| int cc = *(++ptr); |
| #ifndef EBCDIC /* ASCII/UTF-8 coding */ |
| if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ |
| c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); |
| #else /* EBCDIC coding */ |
| if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ |
| c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); |
| #endif |
| } |
| |
| #ifdef COMPILE_PCRE8 |
| if (c > (utf ? 0x10ffff : 0xff)) |
| #else |
| #ifdef COMPILE_PCRE16 |
| if (c > (utf ? 0x10ffff : 0xffff)) |
| #endif |
| #endif |
| { |
| *errorcodeptr = ERR76; |
| } |
| else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; |
| } |
| } |
| else |
| *errorcodeptr = ERR37; |
| break; |
| |
| case CHAR_U: |
| /* In JavaScript, \U is an uppercase U letter. */ |
| if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37; |
| break; |
| |
| /* In a character class, \g is just a literal "g". Outside a character |
| class, \g must be followed by one of a number of specific things: |
| |
| (1) A number, either plain or braced. If positive, it is an absolute |
| backreference. If negative, it is a relative backreference. This is a Perl |
| 5.10 feature. |
| |
| (2) Perl 5.10 also supports \g{name} as a reference to a named group. This |
| is part of Perl's movement towards a unified syntax for back references. As |
| this is synonymous with \k{name}, we fudge it up by pretending it really |
| was \k. |
| |
| (3) For Oniguruma compatibility we also support \g followed by a name or a |
| number either in angle brackets or in single quotes. However, these are |
| (possibly recursive) subroutine calls, _not_ backreferences. Just return |
| the -ESC_g code (cf \k). */ |
| |
| case CHAR_g: |
| if (isclass) break; |
| if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE) |
| { |
| c = -ESC_g; |
| break; |
| } |
| |
| /* Handle the Perl-compatible cases */ |
| |
| if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) |
| { |
| const pcre_uchar *p; |
| for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++) |
| if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break; |
| if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET) |
| { |
| c = -ESC_k; |
| break; |
| } |
| braced = TRUE; |
| ptr++; |
| } |
| else braced = FALSE; |
| |
| if (ptr[1] == CHAR_MINUS) |
| { |
| negated = TRUE; |
| ptr++; |
| } |
| else negated = FALSE; |
| |
| /* The integer range is limited by the machine's int representation. */ |
| c = 0; |
| while (IS_DIGIT(ptr[1])) |
| { |
| if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */ |
| { |
| c = -1; |
| break; |
| } |
| c = c * 10 + *(++ptr) - CHAR_0; |
| } |
| if (((unsigned int)c) > INT_MAX) /* Integer overflow */ |
| { |
| while (IS_DIGIT(ptr[1])) |
| ptr++; |
| *errorcodeptr = ERR61; |
| break; |
| } |
| |
| if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET) |
| { |
| *errorcodeptr = ERR57; |
| break; |
| } |
| |
| if (c == 0) |
| { |
| *errorcodeptr = ERR58; |
| break; |
| } |
| |
| if (negated) |
| { |
| if (c > bracount) |
| { |
| *errorcodeptr = ERR15; |
| break; |
| } |
| c = bracount - (c - 1); |
| } |
| |
| c = -(ESC_REF + c); |
| break; |
| |
| /* The handling of escape sequences consisting of a string of digits |
| starting with one that is not zero is not straightforward. By experiment, |
| the way Perl works seems to be as follows: |
| |
| Outside a character class, the digits are read as a decimal number. If the |
| number is less than 10, or if there are that many previous extracting |
| left brackets, then it is a back reference. Otherwise, up to three octal |
| digits are read to form an escaped byte. Thus \123 is likely to be octal |
| 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal |
| value is greater than 377, the least significant 8 bits are taken. Inside a |
| character class, \ followed by a digit is always an octal number. */ |
| |
| case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: |
| case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: |
| |
| if (!isclass) |
| { |
| oldptr = ptr; |
| /* The integer range is limited by the machine's int representation. */ |
| c -= CHAR_0; |
| while (IS_DIGIT(ptr[1])) |
| { |
| if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */ |
| { |
| c = -1; |
| break; |
| } |
| c = c * 10 + *(++ptr) - CHAR_0; |
| } |
| if (((unsigned int)c) > INT_MAX) /* Integer overflow */ |
| { |
| while (IS_DIGIT(ptr[1])) |
| ptr++; |
| *errorcodeptr = ERR61; |
| break; |
| } |
| if (c < 10 || c <= bracount) |
| { |
| c = -(ESC_REF + c); |
| break; |
| } |
| ptr = oldptr; /* Put the pointer back and fall through */ |
| } |
| |
| /* Handle an octal number following \. If the first digit is 8 or 9, Perl |
| generates a binary zero byte and treats the digit as a following literal. |
| Thus we have to pull back the pointer by one. */ |
| |
| if ((c = *ptr) >= CHAR_8) |
| { |
| ptr--; |
| c = 0; |
| break; |
| } |
| |
| /* \0 always starts an octal number, but we may drop through to here with a |
| larger first octal digit. The original code used just to take the least |
| significant 8 bits of octal numbers (I think this is what early Perls used |
| to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode, |
| but no more than 3 octal digits. */ |
| |
| case CHAR_0: |
| c -= CHAR_0; |
| while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) |
| c = c * 8 + *(++ptr) - CHAR_0; |
| #ifdef COMPILE_PCRE8 |
| if (!utf && c > 0xff) *errorcodeptr = ERR51; |
| #endif |
| break; |
| |
| /* \x is complicated. \x{ddd} is a character number which can be greater |
| than 0xff in utf or non-8bit mode, but only if the ddd are hex digits. |
| If not, { is treated as a data character. */ |
| |
| case CHAR_x: |
| if ((options & PCRE_JAVASCRIPT_COMPAT) != 0) |
| { |
| /* In JavaScript, \x must be followed by two hexadecimal numbers. |
| Otherwise it is a lowercase x letter. */ |
| if (MAX_255(ptr[1]) && g_ascii_isxdigit(ptr[1]) != 0 |
| && MAX_255(ptr[2]) && g_ascii_isxdigit(ptr[2]) != 0) |
| { |
| c = 0; |
| for (i = 0; i < 2; ++i) |
| { |
| int cc = *(++ptr); |
| #ifndef EBCDIC /* ASCII/UTF-8 coding */ |
| if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ |
| c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); |
| #else /* EBCDIC coding */ |
| if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ |
| c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); |
| #endif |
| } |
| } |
| break; |
| } |
| |
| if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) |
| { |
| const pcre_uchar *pt = ptr + 2; |
| |
| c = 0; |
| while (MAX_255(*pt) && g_ascii_isxdigit(*pt) != 0) |
| { |
| int cc = *pt++; |
| if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ |
| |
| #ifndef EBCDIC /* ASCII/UTF-8 coding */ |
| if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ |
| c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); |
| #else /* EBCDIC coding */ |
| if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ |
| c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); |
| #endif |
| |
| #ifdef COMPILE_PCRE8 |
| if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; } |
| #else |
| #ifdef COMPILE_PCRE16 |
| if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; } |
| #endif |
| #endif |
| } |
| |
| if (c < 0) |
| { |
| while (MAX_255(*pt) && g_ascii_isxdigit(*pt) != 0) pt++; |
| *errorcodeptr = ERR34; |
| } |
| |
| if (*pt == CHAR_RIGHT_CURLY_BRACKET) |
| { |
| if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; |
| ptr = pt; |
| break; |
| } |
| |
| /* If the sequence of hex digits does not end with '}', then we don't |
| recognize this construct; fall through to the normal \x handling. */ |
| } |
| |
| /* Read just a single-byte hex-defined char */ |
| |
| c = 0; |
| while (i++ < 2 && MAX_255(ptr[1]) && g_ascii_isxdigit(ptr[1]) != 0) |
| { |
| int cc; /* Some compilers don't like */ |
| cc = *(++ptr); /* ++ in initializers */ |
| #ifndef EBCDIC /* ASCII/UTF-8 coding */ |
| if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ |
| c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); |
| #else /* EBCDIC coding */ |
| if (cc <= CHAR_z) cc += 64; /* Convert to upper case */ |
| c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); |
| #endif |
| } |
| break; |
| |
| /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. |
| An error is given if the byte following \c is not an ASCII character. This |
| coding is ASCII-specific, but then the whole concept of \cx is |
| ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ |
| |
| case CHAR_c: |
| c = *(++ptr); |
| if (c == 0) |
| { |
| *errorcodeptr = ERR2; |
| break; |
| } |
| #ifndef EBCDIC /* ASCII/UTF-8 coding */ |
| if (c > 127) /* Excludes all non-ASCII in either mode */ |
| { |
| *errorcodeptr = ERR68; |
| break; |
| } |
| if (c >= CHAR_a && c <= CHAR_z) c -= 32; |
| c ^= 0x40; |
| #else /* EBCDIC coding */ |
| if (c >= CHAR_a && c <= CHAR_z) c += 64; |
| c ^= 0xC0; |
| #endif |
| break; |
| |
| /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any |
| other alphanumeric following \ is an error if PCRE_EXTRA was set; |
| otherwise, for Perl compatibility, it is a literal. This code looks a bit |
| odd, but there used to be some cases other than the default, and there may |
| be again in future, so I haven't "optimized" it. */ |
| |
| default: |
| if ((options & PCRE_EXTRA) != 0) switch(c) |
| { |
| default: |
| *errorcodeptr = ERR3; |
| break; |
| } |
| break; |
| } |
| } |
| |
| /* Perl supports \N{name} for character names, as well as plain \N for "not |
| newline". PCRE does not support \N{name}. However, it does support |
| quantification such as \N{2,3}. */ |
| |
| if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET && |
| !is_counted_repeat(ptr+2)) |
| *errorcodeptr = ERR37; |
| |
| /* If PCRE_UCP is set, we change the values for \d etc. */ |
| |
| if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w) |
| c -= (ESC_DU - ESC_D); |
| |
| /* Set the pointer to the final character before returning. */ |
| |
| *ptrptr = ptr; |
| return c; |
| } |
| |
| |
| |
| #ifdef SUPPORT_UCP |
| /************************************************* |
| * Handle \P and \p * |
| *************************************************/ |
| |
| /* This function is called after \P or \p has been encountered, provided that |
| PCRE is compiled with support for Unicode properties. On entry, ptrptr is |
| pointing at the P or p. On exit, it is pointing at the final character of the |
| escape sequence. |
| |
| Arguments: |
| ptrptr points to the pattern position pointer |
| negptr points to a boolean that is set TRUE for negation else FALSE |
| dptr points to an int that is set to the detailed property value |
| errorcodeptr points to the error code variable |
| |
| Returns: type value from ucp_type_table, or -1 for an invalid type |
| */ |
| |
| static int |
| get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr) |
| { |
| int c, i, bot, top; |
| const pcre_uchar *ptr = *ptrptr; |
| pcre_uchar name[32]; |
| |
| c = *(++ptr); |
| if (c == 0) goto ERROR_RETURN; |
| |
| *negptr = FALSE; |
| |
| /* \P or \p can be followed by a name in {}, optionally preceded by ^ for |
| negation. */ |
| |
| if (c == CHAR_LEFT_CURLY_BRACKET) |
| { |
| if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT) |
| { |
| *negptr = TRUE; |
| ptr++; |
| } |
| for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++) |
| { |
| c = *(++ptr); |
| if (c == 0) goto ERROR_RETURN; |
| if (c == CHAR_RIGHT_CURLY_BRACKET) break; |
| name[i] = c; |
| } |
| if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; |
| name[i] = 0; |
| } |
| |
| /* Otherwise there is just one following character */ |
| |
| else |
| { |
| name[0] = c; |
| name[1] = 0; |
| } |
| |
| *ptrptr = ptr; |
| |
| /* Search for a recognized property name using binary chop */ |
| |
| bot = 0; |
| top = PRIV(utt_size); |
| |
| while (bot < top) |
| { |
| i = (bot + top) >> 1; |
| c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset); |
| if (c == 0) |
| { |
| *dptr = PRIV(utt)[i].value; |
| return PRIV(utt)[i].type; |
| } |
| if (c > 0) bot = i + 1; else top = i; |
| } |
| |
| *errorcodeptr = ERR47; |
| *ptrptr = ptr; |
| return -1; |
| |
| ERROR_RETURN: |
| *errorcodeptr = ERR46; |
| *ptrptr = ptr; |
| return -1; |
| } |
| #endif |
| |
| |
| |
| |
| /************************************************* |
| * Read repeat counts * |
| *************************************************/ |
| |
| /* Read an item of the form {n,m} and return the values. This is called only |
| after is_counted_repeat() has confirmed that a repeat-count quantifier exists, |
| so the syntax is guaranteed to be correct, but we need to check the values. |
| |
| Arguments: |
| p pointer to first char after '{' |
| minp pointer to int for min |
| maxp pointer to int for max |
| returned as -1 if no max |
| errorcodeptr points to error code variable |
| |
| Returns: pointer to '}' on success; |
| current ptr on error, with errorcodeptr set non-zero |
| */ |
| |
| static const pcre_uchar * |
| read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr) |
| { |
| int min = 0; |
| int max = -1; |
| |
| /* Read the minimum value and do a paranoid check: a negative value indicates |
| an integer overflow. */ |
| |
| while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0; |
| if (min < 0 || min > 65535) |
| { |
| *errorcodeptr = ERR5; |
| return p; |
| } |
| |
| /* Read the maximum value if there is one, and again do a paranoid on its size. |
| Also, max must not be less than min. */ |
| |
| if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else |
| { |
| if (*(++p) != CHAR_RIGHT_CURLY_BRACKET) |
| { |
| max = 0; |
| while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0; |
| if (max < 0 || max > 65535) |
| { |
| *errorcodeptr = ERR5; |
| return p; |
| } |
| if (max < min) |
| { |
| *errorcodeptr = ERR4; |
| return p; |
| } |
| } |
| } |
| |
| /* Fill in the required variables, and pass back the pointer to the terminating |
| '}'. */ |
| |
| *minp = min; |
| *maxp = max; |
| return p; |
| } |
| |
| |
| |
| /************************************************* |
| * Subroutine for finding forward reference * |
| *************************************************/ |
| |
| /* This recursive function is called only from find_parens() below. The |
| top-level call starts at the beginning of the pattern. All other calls must |
| start at a parenthesis. It scans along a pattern's text looking for capturing |
| subpatterns, and counting them. If it finds a named pattern that matches the |
| name it is given, it returns its number. Alternatively, if the name is NULL, it |
| returns when it reaches a given numbered subpattern. Recursion is used to keep |
| track of subpatterns that reset the capturing group numbers - the (?| feature. |
| |
| This function was originally called only from the second pass, in which we know |
| that if (?< or (?' or (?P< is encountered, the name will be correctly |
| terminated because that is checked in the first pass. There is now one call to |
| this function in the first pass, to check for a recursive back reference by |
| name (so that we can make the whole group atomic). In this case, we need check |
| only up to the current position in the pattern, and that is still OK because |
| any previous occurrences will have been checked. To make this work, the test |
| for "end of pattern" is a check against cd->end_pattern in the main loop, |
| instead of looking for a binary zero. This means that the special first-pass |
| call can adjust cd->end_pattern temporarily. (Checks for binary zero while |
| processing items within the loop are OK, because afterwards the main loop will |
| terminate.) |
| |
| Arguments: |
| ptrptr address of the current character pointer (updated) |
| cd compile background data |
| name name to seek, or NULL if seeking a numbered subpattern |
| lorn name length, or subpattern number if name is NULL |
| xmode TRUE if we are in /x mode |
| utf TRUE if we are in UTF-8 / UTF-16 mode |
| count pointer to the current capturing subpattern number (updated) |
| |
| Returns: the number of the named subpattern, or -1 if not found |
| */ |
| |
| static int |
| find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn, |
| BOOL xmode, BOOL utf, int *count) |
| { |
| pcre_uchar *ptr = *ptrptr; |
| int start_count = *count; |
| int hwm_count = start_count; |
| BOOL dup_parens = FALSE; |
| |
| /* If the first character is a parenthesis, check on the type of group we are |
| dealing with. The very first call may not start with a parenthesis. */ |
| |
| if (ptr[0] == CHAR_LEFT_PARENTHESIS) |
| { |
| /* Handle specials such as (*SKIP) or (*UTF8) etc. */ |
| |
| if (ptr[1] == CHAR_ASTERISK) ptr += 2; |
| |
| /* Handle a normal, unnamed capturing parenthesis. */ |
| |
| else if (ptr[1] != CHAR_QUESTION_MARK) |
| { |
| *count += 1; |
| if (name == NULL && *count == lorn) return *count; |
| ptr++; |
| } |
| |
| /* All cases now have (? at the start. Remember when we are in a group |
| where the parenthesis numbers are duplicated. */ |
| |
| else if (ptr[2] == CHAR_VERTICAL_LINE) |
| { |
| ptr += 3; |
| dup_parens = TRUE; |
| } |
| |
| /* Handle comments; all characters are allowed until a ket is reached. */ |
| |
| else if (ptr[2] == CHAR_NUMBER_SIGN) |
| { |
| for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break; |
| goto FAIL_EXIT; |
| } |
| |
| /* Handle a condition. If it is an assertion, just carry on so that it |
| is processed as normal. If not, skip to the closing parenthesis of the |
| condition (there can't be any nested parens). */ |
| |
| else if (ptr[2] == CHAR_LEFT_PARENTHESIS) |
| { |
| ptr += 2; |
| if (ptr[1] != CHAR_QUESTION_MARK) |
| { |
| while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; |
| if (*ptr != 0) ptr++; |
| } |
| } |
| |
| /* Start with (? but not a condition. */ |
| |
| else |
| { |
| ptr += 2; |
| if (*ptr == CHAR_P) ptr++; /* Allow optional P */ |
| |
| /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */ |
| |
| if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK && |
| ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE) |
| { |
| int term; |
| const pcre_uchar *thisname; |
| *count += 1; |
| if (name == NULL && *count == lorn) return *count; |
| term = *ptr++; |
| if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN; |
| thisname = ptr; |
| while (*ptr != term) ptr++; |
| if (name != NULL && lorn == ptr - thisname && |
| STRNCMP_UC_UC(name, thisname, lorn) == 0) |
| return *count; |
| term++; |
| } |
| } |
| } |
| |
| /* Past any initial parenthesis handling, scan for parentheses or vertical |
| bars. Stop if we get to cd->end_pattern. Note that this is important for the |
| first-pass call when this value is temporarily adjusted to stop at the current |
| position. So DO NOT change this to a test for binary zero. */ |
| |
| for (; ptr < cd->end_pattern; ptr++) |
| { |
| /* Skip over backslashed characters and also entire \Q...\E */ |
| |
| if (*ptr == CHAR_BACKSLASH) |
| { |
| if (*(++ptr) == 0) goto FAIL_EXIT; |
| if (*ptr == CHAR_Q) for (;;) |
| { |
| while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {}; |
| if (*ptr == 0) goto FAIL_EXIT; |
| if (*(++ptr) == CHAR_E) break; |
| } |
| continue; |
| } |
| |
| /* Skip over character classes; this logic must be similar to the way they |
| are handled for real. If the first character is '^', skip it. Also, if the |
| first few characters (either before or after ^) are \Q\E or \E we skip them |
| too. This makes for compatibility with Perl. Note the use of STR macros to |
| encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */ |
| |
| if (*ptr == CHAR_LEFT_SQUARE_BRACKET) |
| { |
| BOOL negate_class = FALSE; |
| for (;;) |
| { |
| if (ptr[1] == CHAR_BACKSLASH) |
| { |
| if (ptr[2] == CHAR_E) |
| ptr+= 2; |
| else if (STRNCMP_UC_C8(ptr + 2, |
| STR_Q STR_BACKSLASH STR_E, 3) == 0) |
| ptr += 4; |
| else |
| break; |
| } |
| else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT) |
| { |
| negate_class = TRUE; |
| ptr++; |
| } |
| else break; |
| } |
| |
| /* If the next character is ']', it is a data character that must be |
| skipped, except in JavaScript compatibility mode. */ |
| |
| if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET && |
| (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0) |
| ptr++; |
| |
| while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET) |
| { |
| if (*ptr == 0) return -1; |
| if (*ptr == CHAR_BACKSLASH) |
| { |
| if (*(++ptr) == 0) goto FAIL_EXIT; |
| if (*ptr == CHAR_Q) for (;;) |
| { |
| while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {}; |
| if (*ptr == 0) goto FAIL_EXIT; |
| if (*(++ptr) == CHAR_E) break; |
| } |
| continue; |
| } |
| } |
| continue; |
| } |
| |
| /* Skip comments in /x mode */ |
| |
| if (xmode && *ptr == CHAR_NUMBER_SIGN) |
| { |
| ptr++; |
| while (*ptr != 0) |
| { |
| if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } |
| ptr++; |
| #ifdef SUPPORT_UTF |
| if (utf) FORWARDCHAR(ptr); |
| #endif |
| } |
| if (*ptr == 0) goto FAIL_EXIT; |
| continue; |
| } |
| |
| /* Check for the special metacharacters */ |
| |
| if (*ptr == CHAR_LEFT_PARENTHESIS) |
| { |
| int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count); |
| if (rc > 0) return rc; |
| if (*ptr == 0) goto FAIL_EXIT; |
| } |
| |
| else if (*ptr == CHAR_RIGHT_PARENTHESIS) |
| { |
| if (dup_parens && *count < hwm_count) *count = hwm_count; |
| goto FAIL_EXIT; |
| } |
| |
| else if (*ptr == CHAR_VERTICAL_LINE && dup_parens) |
| { |
| if (*count > hwm_count) hwm_count = *count; |
| *count = start_count; |
| } |
| } |
| |
| FAIL_EXIT: |
| *ptrptr = ptr; |
| return -1; |
| } |
| |
| |
| |
| |
| /************************************************* |
| * Find forward referenced subpattern * |
| *************************************************/ |
| |
| /* This function scans along a pattern's text looking for capturing |
| subpatterns, and counting them. If it finds a named pattern that matches the |
| name it is given, it returns its number. Alternatively, if the name is NULL, it |
| returns when it reaches a given numbered subpattern. This is used for forward |
| references to subpatterns. We used to be able to start this scan from the |
| current compiling point, using the current count value from cd->bracount, and |
| do it all in a single loop, but the addition of the possibility of duplicate |
| subpattern numbers means that we have to scan from the very start, in order to |
| take account of such duplicates, and to use a recursive function to keep track |
| of the different types of group. |
| |
| Arguments: |
| cd compile background data |
| name name to seek, or NULL if seeking a numbered subpattern |
| lorn name length, or subpattern number if name is NULL |
| xmode TRUE if we are in /x mode |
| utf TRUE if we are in UTF-8 / UTF-16 mode |
| |
| Returns: the number of the found subpattern, or -1 if not found |
| */ |
| |
| static int |
| find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode, |
| BOOL utf) |
| { |
| pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern; |
| int count = 0; |
| int rc; |
| |
| /* If the pattern does not start with an opening parenthesis, the first call |
| to find_parens_sub() will scan right to the end (if necessary). However, if it |
| does start with a parenthesis, find_parens_sub() will return when it hits the |
| matching closing parens. That is why we have to have a loop. */ |
| |
| for (;;) |
| { |
| rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count); |
| if (rc > 0 || *ptr++ == 0) break; |
| } |
| |
| return rc; |
| } |
| |
| |
| |
| |
| /************************************************* |
| * Find first significant op code * |
| *************************************************/ |
| |
| /* This is called by several functions that scan a compiled expression looking |
| for a fixed first character, or an anchoring op code etc. It skips over things |
| that do not influence this. For some calls, it makes sense to skip negative |
| forward and all backward assertions, and also the \b assertion; for others it |
| does not. |
| |
| Arguments: |
| code pointer to the start of the group |
| skipassert TRUE if certain assertions are to be skipped |
| |
| Returns: pointer to the first significant opcode |
| */ |
| |
| static const pcre_uchar* |
| first_significant_code(const pcre_uchar *code, BOOL skipassert) |
| { |
| for (;;) |
| { |
| switch ((int)*code) |
| { |
| case OP_ASSERT_NOT: |
| case OP_ASSERTBACK: |
| case OP_ASSERTBACK_NOT: |
| if (!skipassert) return code; |
| do code += GET(code, 1); while (*code == OP_ALT); |
| code += PRIV(OP_lengths)[*code]; |
| break; |
| |
| case OP_WORD_BOUNDARY: |
| case OP_NOT_WORD_BOUNDARY: |
| if (!skipassert) return code; |
| /* Fall through */ |
| |
| case OP_CALLOUT: |
| case OP_CREF: |
| case OP_NCREF: |
| case OP_RREF: |
| case OP_NRREF: |
| case OP_DEF: |
| code += PRIV(OP_lengths)[*code]; |
| break; |
| |
| default: |
| return code; |
| } |
| } |
| /* Control never reaches here */ |
| } |
| |
| |
| |
| |
| /************************************************* |
| * Find the fixed length of a branch * |
| *************************************************/ |
| |
| /* Scan a branch and compute the fixed length of subject that will match it, |
| if the length is fixed. This is needed for dealing with backward assertions. |
| In UTF8 mode, the result is in characters rather than bytes. The branch is |
| temporarily terminated with OP_END when this function is called. |
| |
| This function is called when a backward assertion is encountered, so that if it |
| fails, the error message can point to the correct place in the pattern. |
| However, we cannot do this when the assertion contains subroutine calls, |
| because they can be forward references. We solve this by remembering this case |
| and doing the check at the end; a flag specifies which mode we are running in. |
| |
| Arguments: |
| code points to the start of the pattern (the bracket) |
| utf TRUE in UTF-8 / UTF-16 mode |
| atend TRUE if called when the pattern is complete |
| cd the "compile data" structure |
| |
| Returns: the fixed length, |
| or -1 if there is no fixed length, |
| or -2 if \C was encountered (in UTF-8 mode only) |
| or -3 if an OP_RECURSE item was encountered and atend is FALSE |
| or -4 if an unknown opcode was encountered (internal error) |
| */ |
| |
| static int |
| find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd) |
| { |
| int length = -1; |
| |
| int branchlength = 0; |
| pcre_uchar *cc = code + 1 + LINK_SIZE; |
| |
| /* Scan along the opcodes for this branch. If we get to the end of the |
| branch, check the length against that of the other branches. */ |
| |
| for (;;) |
| { |
| int d; |
| pcre_uchar *ce, *cs; |
| int op = *cc; |
| |
| switch (op) |
| { |
| /* We only need to continue for OP_CBRA (normal capturing bracket) and |
| OP_BRA (normal non-capturing bracket) because the other variants of these |
| opcodes are all concerned with unlimited repeated groups, which of course |
| are not of fixed length. */ |
| |
| case OP_CBRA: |
| case OP_BRA: |
| case OP_ONCE: |
| case OP_ONCE_NC: |
| case OP_COND: |
| d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd); |
| if (d < 0) return d; |
| branchlength += d; |
| do cc += GET(cc, 1); while (*cc == OP_ALT); |
| cc += 1 + LINK_SIZE; |
| break; |
| |
| /* Reached end of a branch; if it's a ket it is the end of a nested call. |
| If it's ALT it is an alternation in a nested call. An ACCEPT is effectively |
| an ALT. If it is END it's the end of the outer call. All can be handled by |
| the same code. Note that we must not include the OP_KETRxxx opcodes here, |
| because they all imply an unlimited repeat. */ |
| |
| case OP_ALT: |
| case OP_KET: |
| case OP_END: |
| case OP_ACCEPT: |
| case OP_ASSERT_ACCEPT: |
| if (length < 0) length = branchlength; |
| else if (length != branchlength) return -1; |
| if (*cc != OP_ALT) return length; |
| cc += 1 + LINK_SIZE; |
| branchlength = 0; |
| break; |
| |
| /* A true recursion implies not fixed length, but a subroutine call may |
| be OK. If the subroutine is a forward reference, we can't deal with |
| it until the end of the pattern, so return -3. */ |
| |
| case OP_RECURSE: |
| if (!atend) return -3; |
| cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */ |
| do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */ |
| if (cc > cs && cc < ce) return -1; /* Recursion */ |
| d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd); |
| if (d < 0) return d; |
| branchlength += d; |
| cc += 1 + LINK_SIZE; |
| break; |
| |
| /* Skip over assertive subpatterns */ |
| |
| case OP_ASSERT: |
| case OP_ASSERT_NOT: |
| case OP_ASSERTBACK: |
| case OP_ASSERTBACK_NOT: |
| do cc += GET(cc, 1); while (*cc == OP_ALT); |
| cc += PRIV(OP_lengths)[*cc]; |
| break; |
| |
| /* Skip over things that don't match chars */ |
| |
| case OP_MARK: |
| case OP_PRUNE_ARG: |
| case OP_SKIP_ARG: |
| case OP_THEN_ARG: |
| cc += cc[1] + PRIV(OP_lengths)[*cc]; |
| break; |
| |
| case OP_CALLOUT: |
| case OP_CIRC: |
| case OP_CIRCM: |
| case OP_CLOSE: |
| case OP_COMMIT: |
| case OP_CREF: |
| case OP_DEF: |
| case OP_DOLL: |
| case OP_DOLLM: |
| case OP_EOD: |
| case OP_EODN: |
| case OP_FAIL: |
| case OP_NCREF: |
| case OP_NRREF: |
| case OP_NOT_WORD_BOUNDARY: |
| case OP_PRUNE: |
| case OP_REVERSE: |
| case OP_RREF: |
| case OP_SET_SOM: |
| case OP_SKIP: |
| case OP_SOD: |
| case OP_SOM: |
| case OP_THEN: |
| case OP_WORD_BOUNDARY: |
| cc += PRIV(OP_lengths)[*cc]; |
| break; |
| |
| /* Handle literal characters */ |
| |
| case OP_CHAR: |
| case OP_CHARI: |
| case OP_NOT: |
| case OP_NOTI: |
| branchlength++; |
| cc += 2; |
| #ifdef SUPPORT_UTF |
| if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); |
| #endif |
| break; |
| |
| /* Handle exact repetitions. The count is already in characters, but we |
| need to skip over a multibyte character in UTF8 mode. */ |
| |
| case OP_EXACT: |
| case OP_EXACTI: |
| case OP_NOTEXACT: |
| case OP_NOTEXACTI: |
| branchlength += GET2(cc,1); |
| cc += 2 + IMM2_SIZE; |
| #ifdef SUPPORT_UTF |
| if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); |
| #endif |
| break; |
| |
| case OP_TYPEEXACT: |
| branchlength += GET2(cc,1); |
| if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2; |
| cc += 1 + IMM2_SIZE + 1; |
| break; |
| |
| /* Handle single-char matchers */ |
| |
| case OP_PROP: |
| case OP_NOTPROP: |
| cc += 2; |
| /* Fall through */ |
| |
| case OP_HSPACE: |
| case OP_VSPACE: |
| case OP_NOT_HSPACE: |
| case OP_NOT_VSPACE: |
| case OP_NOT_DIGIT: |
| case OP_DIGIT: |
| case OP_NOT_WHITESPACE: |
| case OP_WHITESPACE: |
| case OP_NOT_WORDCHAR: |
| case OP_WORDCHAR: |
| case OP_ANY: |
| case OP_ALLANY: |
| branchlength++; |
| cc++; |
| break; |
| |
| /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode; |
| otherwise \C is coded as OP_ALLANY. */ |
| |
| case OP_ANYBYTE: |
| return -2; |
| |
| /* Check a class for variable quantification */ |
| |
| #if defined SUPPORT_UTF || defined COMPILE_PCRE16 |
| case OP_XCLASS: |
| cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS]; |
| /* Fall through */ |
| #endif |
| |
| case OP_CLASS: |
| case OP_NCLASS: |
| cc += PRIV(OP_lengths)[OP_CLASS]; |
| |
| switch (*cc) |
| { |
| case OP_CRPLUS: |
| case OP_CRMINPLUS: |
| case OP_CRSTAR: |
| case OP_CRMINSTAR: |
| case OP_CRQUERY: |
| case OP_CRMINQUERY: |
| return -1; |
| |
| case OP_CRRANGE: |
| case OP_CRMINRANGE: |
| if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1; |
| branchlength += GET2(cc,1); |
| cc += 1 + 2 * IMM2_SIZE; |
| break; |
| |
| default: |
| branchlength++; |
| } |
| break; |
| |
| /* Anything else is variable length */ |
| |
| case OP_ANYNL: |
| case OP_BRAMINZERO: |
| case OP_BRAPOS: |
| case OP_BRAPOSZERO: |
| case OP_BRAZERO: |
| case OP_CBRAPOS: |
| case OP_EXTUNI: |
| case OP_KETRMAX: |
| case OP_KETRMIN: |
| case OP_KETRPOS: |
| case OP_MINPLUS: |
| case OP_MINPLUSI: |
| case OP_MINQUERY: |
| case OP_MINQUERYI: |
| case OP_MINSTAR: |
| case OP_MINSTARI: |
| case OP_MINUPTO: |
| case OP_MINUPTOI: |
| case OP_NOTMINPLUS: |
| case OP_NOTMINPLUSI: |
| case OP_NOTMINQUERY: |
| case OP_NOTMINQUERYI: |
| case OP_NOTMINSTAR: |
| case OP_NOTMINSTARI: |
| case OP_NOTMINUPTO: |
| case OP_NOTMINUPTOI: |
| case OP_NOTPLUS: |
| case OP_NOTPLUSI: |
| case OP_NOTPOSPLUS: |
| case OP_NOTPOSPLUSI: |
| case OP_NOTPOSQUERY: |
| case OP_NOTPOSQUERYI: |
| case OP_NOTPOSSTAR: |
| case OP_NOTPOSSTARI: |
| case OP_NOTPOSUPTO: |
| case OP_NOTPOSUPTOI: |
| case OP_NOTQUERY: |
| case OP_NOTQUERYI: |
| case OP_NOTSTAR: |
| case OP_NOTSTARI: |
| case OP_NOTUPTO: |
| case OP_NOTUPTOI: |
| case OP_PLUS: |
| case OP_PLUSI: |
| case OP_POSPLUS: |
| case OP_POSPLUSI: |
| case OP_POSQUERY: |
| case OP_POSQUERYI: |
| case OP_POSSTAR: |
| case OP_POSSTARI: |
| case OP_POSUPTO: |
| case OP_POSUPTOI: |
| case OP_QUERY: |
| case OP_QUERYI: |
| case OP_REF: |
| case OP_REFI: |
| case OP_SBRA: |
| case OP_SBRAPOS: |
| case OP_SCBRA: |
| case OP_SCBRAPOS: |
| case OP_SCOND: |
| case OP_SKIPZERO: |
| case OP_STAR: |
| case OP_STARI: |
| case OP_TYPEMINPLUS: |
| case OP_TYPEMINQUERY: |
| case OP_TYPEMINSTAR: |
| case OP_TYPEMINUPTO: |
| case OP_TYPEPLUS: |
| case OP_TYPEPOSPLUS: |
| case OP_TYPEPOSQUERY: |
| case OP_TYPEPOSSTAR: |
| case OP_TYPEPOSUPTO: |
| case OP_TYPEQUERY: |
| case OP_TYPESTAR: |
| case OP_TYPEUPTO: |
| case OP_UPTO: |
| case OP_UPTOI: |
| return -1; |
| |
| /* Catch unrecognized opcodes so that when new ones are added they |
| are not forgotten, as has happened in the past. */ |
| |
| default: |
| return -4; |
| } |
| } |
| /* Control never gets here */ |
| } |
| |
| |
| |
| |
| /************************************************* |
| * Scan compiled regex for specific bracket * |
| *************************************************/ |
| |
| /* This little function scans through a compiled pattern until it finds a |
| capturing bracket with the given number, or, if the number is negative, an |
| instance of OP_REVERSE for a lookbehind. The function is global in the C sense |
| so that it can be called from pcre_study() when finding the minimum matching |
| length. |
| |
| Arguments: |
| code points to start of expression |
| utf TRUE in UTF-8 / UTF-16 mode |
| number the required bracket number or negative to find a lookbehind |
| |
| Returns: pointer to the opcode for the bracket, or NULL if not found |
| */ |
| |
| const pcre_uchar * |
| PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number) |
| { |
| /* Walk the compiled pattern one item at a time. The result is a pointer to |
| the capturing bracket whose number equals "number", or, when "number" is |
| negative, to the first OP_REVERSE that is met; NULL is returned when OP_END |
| is reached first. "utf" selects the skipping of extra code units that follow |
| character-carrying opcodes. */ |
| |
| for (;;) |
|   { |
|   int c = *code; |
| |
|   if (c == OP_END) return NULL; |
| |
|   /* XCLASS is used for classes that cannot be represented just by a bit |
|   map. This includes negated single high-valued characters. The length in |
|   the table is zero; the actual length is stored in the compiled code. */ |
| |
|   if (c == OP_XCLASS) code += GET(code, 1); |
| |
|   /* Handle lookbehind: OP_REVERSE is what a negative "number" asks for. */ |
| |
|   else if (c == OP_REVERSE) |
|     { |
|     if (number < 0) return (pcre_uchar *)code; |
|     code += PRIV(OP_lengths)[c]; |
|     } |
| |
|   /* Handle capturing bracket: the group number follows the link field. */ |
| |
|   else if (c == OP_CBRA || c == OP_SCBRA || |
|            c == OP_CBRAPOS || c == OP_SCBRAPOS) |
|     { |
|     int n = GET2(code, 1+LINK_SIZE); |
|     if (n == number) return (pcre_uchar *)code; |
|     code += PRIV(OP_lengths)[c]; |
|     } |
| |
|   /* Otherwise, we can get the item's length from the table, except that for |
|   repeated character types, we have to test for \p and \P, which have an extra |
|   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we |
|   must add in its length. */ |
| |
|   else |
|     { |
|     switch(c) |
|       { |
|       case OP_TYPESTAR: |
|       case OP_TYPEMINSTAR: |
|       case OP_TYPEPLUS: |
|       case OP_TYPEMINPLUS: |
|       case OP_TYPEQUERY: |
|       case OP_TYPEMINQUERY: |
|       case OP_TYPEPOSSTAR: |
|       case OP_TYPEPOSPLUS: |
|       case OP_TYPEPOSQUERY: |
|       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
|       break; |
| |
|       case OP_TYPEUPTO: |
|       case OP_TYPEMINUPTO: |
|       case OP_TYPEEXACT: |
|       case OP_TYPEPOSUPTO: |
|       if (code[1 + IMM2_SIZE] == OP_PROP |
|         || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2; |
|       break; |
| |
|       case OP_MARK: |
|       case OP_PRUNE_ARG: |
|       case OP_SKIP_ARG: |
|       case OP_THEN_ARG: |
|       code += code[1]; |
|       break; |
| |
|       default: |
|       break; |
|       } |
| |
|     /* Add in the fixed length from the table */ |
| |
|     code += PRIV(OP_lengths)[c]; |
| |
|     /* In UTF mode, opcodes that are followed by a character may be followed |
|     by a multi-unit character. The length in the table is a minimum, so we |
|     have to arrange to skip the extra code units. The negated (OP_NOT...) |
|     opcodes carry a character in the same way as their positive counterparts |
|     and must therefore be listed here as well; leaving them out would let the |
|     scan stop in the middle of a character and treat a trailing code unit as |
|     an opcode. */ |
| |
| #ifdef SUPPORT_UTF |
|     if (utf) switch(c) |
|       { |
|       case OP_CHAR: |
|       case OP_CHARI: |
|       case OP_NOT: |
|       case OP_NOTI: |
|       case OP_EXACT: |
|       case OP_EXACTI: |
|       case OP_NOTEXACT: |
|       case OP_NOTEXACTI: |
|       case OP_UPTO: |
|       case OP_UPTOI: |
|       case OP_NOTUPTO: |
|       case OP_NOTUPTOI: |
|       case OP_MINUPTO: |
|       case OP_MINUPTOI: |
|       case OP_NOTMINUPTO: |
|       case OP_NOTMINUPTOI: |
|       case OP_POSUPTO: |
|       case OP_POSUPTOI: |
|       case OP_NOTPOSUPTO: |
|       case OP_NOTPOSUPTOI: |
|       case OP_STAR: |
|       case OP_STARI: |
|       case OP_NOTSTAR: |
|       case OP_NOTSTARI: |
|       case OP_MINSTAR: |
|       case OP_MINSTARI: |
|       case OP_NOTMINSTAR: |
|       case OP_NOTMINSTARI: |
|       case OP_POSSTAR: |
|       case OP_POSSTARI: |
|       case OP_NOTPOSSTAR: |
|       case OP_NOTPOSSTARI: |
|       case OP_PLUS: |
|       case OP_PLUSI: |
|       case OP_NOTPLUS: |
|       case OP_NOTPLUSI: |
|       case OP_MINPLUS: |
|       case OP_MINPLUSI: |
|       case OP_NOTMINPLUS: |
|       case OP_NOTMINPLUSI: |
|       case OP_POSPLUS: |
|       case OP_POSPLUSI: |
|       case OP_NOTPOSPLUS: |
|       case OP_NOTPOSPLUSI: |
|       case OP_QUERY: |
|       case OP_QUERYI: |
|       case OP_NOTQUERY: |
|       case OP_NOTQUERYI: |
|       case OP_MINQUERY: |
|       case OP_MINQUERYI: |
|       case OP_NOTMINQUERY: |
|       case OP_NOTMINQUERYI: |
|       case OP_POSQUERY: |
|       case OP_POSQUERYI: |
|       case OP_NOTPOSQUERY: |
|       case OP_NOTPOSQUERYI: |
|       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); |
|       break; |
|       } |
| #else |
|     (void)(utf);  /* Keep compiler happy by referencing function argument */ |
| #endif |
|     } |
|   } |
| } |
| |
| |
| |
| /************************************************* |
| * Scan compiled regex for recursion reference * |
| *************************************************/ |
| |
| /* This little function scans through a compiled pattern until it finds an |
| instance of OP_RECURSE. |
| |
| Arguments: |
| code points to start of expression |
| utf TRUE in UTF-8 / UTF-16 mode |
| |
| Returns: pointer to the opcode for OP_RECURSE, or NULL if not found |
| */ |
| |
| static const pcre_uchar * |
| find_recurse(const pcre_uchar *code, BOOL utf) |
| { |
| int op; |
| |
| /* Step through the compiled items until the terminating OP_END is met. The |
| first OP_RECURSE that is encountered is handed back to the caller. */ |
| |
| while ((op = *code) != OP_END) |
|   { |
|   if (op == OP_RECURSE) return code; |
| |
|   /* The table length of OP_XCLASS is zero; the real length of such a class |
|   is held in the compiled code itself. */ |
| |
|   if (op == OP_XCLASS) |
|     { |
|     code += GET(code, 1); |
|     continue; |
|     } |
| |
|   /* Some items are longer than their table entry: repeated character types |
|   that carry \p or \P have two extra units of parameters, and MARK, PRUNE, |
|   SKIP and THEN with an argument are followed by the argument string. */ |
| |
|   switch(op) |
|     { |
|     case OP_TYPESTAR: |
|     case OP_TYPEMINSTAR: |
|     case OP_TYPEPOSSTAR: |
|     case OP_TYPEPLUS: |
|     case OP_TYPEMINPLUS: |
|     case OP_TYPEPOSPLUS: |
|     case OP_TYPEQUERY: |
|     case OP_TYPEMINQUERY: |
|     case OP_TYPEPOSQUERY: |
|     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
|     break; |
| |
|     case OP_TYPEUPTO: |
|     case OP_TYPEMINUPTO: |
|     case OP_TYPEPOSUPTO: |
|     case OP_TYPEEXACT: |
|     if (code[1 + IMM2_SIZE] == OP_PROP |
|       || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2; |
|     break; |
| |
|     case OP_MARK: |
|     case OP_PRUNE_ARG: |
|     case OP_SKIP_ARG: |
|     case OP_THEN_ARG: |
|     code += code[1]; |
|     break; |
| |
|     default: |
|     break; |
|     } |
| |
|   /* Now the fixed part of the item, taken from the table. */ |
| |
|   code += PRIV(OP_lengths)[op]; |
| |
|   /* The table length is only a minimum for opcodes that end with a |
|   character: in UTF mode that character may occupy further code units, |
|   which are skipped here. */ |
| |
| #ifdef SUPPORT_UTF |
|   if (utf) switch(op) |
|     { |
|     case OP_CHAR: |
|     case OP_CHARI: |
|     case OP_NOT: |
|     case OP_NOTI: |
| |
|     case OP_EXACT: |
|     case OP_EXACTI: |
|     case OP_NOTEXACT: |
|     case OP_NOTEXACTI: |
| |
|     case OP_STAR: |
|     case OP_STARI: |
|     case OP_MINSTAR: |
|     case OP_MINSTARI: |
|     case OP_POSSTAR: |
|     case OP_POSSTARI: |
|     case OP_NOTSTAR: |
|     case OP_NOTSTARI: |
|     case OP_NOTMINSTAR: |
|     case OP_NOTMINSTARI: |
|     case OP_NOTPOSSTAR: |
|     case OP_NOTPOSSTARI: |
| |
|     case OP_PLUS: |
|     case OP_PLUSI: |
|     case OP_MINPLUS: |
|     case OP_MINPLUSI: |
|     case OP_POSPLUS: |
|     case OP_POSPLUSI: |
|     case OP_NOTPLUS: |
|     case OP_NOTPLUSI: |
|     case OP_NOTMINPLUS: |
|     case OP_NOTMINPLUSI: |
|     case OP_NOTPOSPLUS: |
|     case OP_NOTPOSPLUSI: |
| |
|     case OP_QUERY: |
|     case OP_QUERYI: |
|     case OP_MINQUERY: |
|     case OP_MINQUERYI: |
|     case OP_POSQUERY: |
|     case OP_POSQUERYI: |
|     case OP_NOTQUERY: |
|     case OP_NOTQUERYI: |
|     case OP_NOTMINQUERY: |
|     case OP_NOTMINQUERYI: |
|     case OP_NOTPOSQUERY: |
|     case OP_NOTPOSQUERYI: |
| |
|     case OP_UPTO: |
|     case OP_UPTOI: |
|     case OP_MINUPTO: |
|     case OP_MINUPTOI: |
|     case OP_POSUPTO: |
|     case OP_POSUPTOI: |
|     case OP_NOTUPTO: |
|     case OP_NOTUPTOI: |
|     case OP_NOTMINUPTO: |
|     case OP_NOTMINUPTOI: |
|     case OP_NOTPOSUPTO: |
|     case OP_NOTPOSUPTOI: |
|     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); |
|     break; |
|     } |
| #else |
|   (void)(utf);  /* Keep compiler happy by referencing function argument */ |
| #endif |
|   } |
| |
| return NULL;   /* Reached OP_END without meeting OP_RECURSE */ |
| } |
| |
| |
| |
| /************************************************* |
| * Scan compiled branch for non-emptiness * |
| *************************************************/ |
| |
| /* This function scans through a branch of a compiled pattern to see whether it |
| can match the empty string or not. It is called from could_be_empty() |
| below and from compile_branch() when checking for an unlimited repeat of a |
| group that can match nothing. Note that first_significant_code() skips over |
| backward and negative forward assertions when its final argument is TRUE. If we |
| hit an unclosed bracket, we return "empty" - this means we've struck an inner |
| bracket whose current branch will already have been scanned. |
| |
| Arguments: |
| code points to start of search |
| endcode points to where to stop |
| utf TRUE if in UTF-8 / UTF-16 mode |
| cd contains pointers to tables etc. |
| |
| Returns: TRUE if what is matched could be empty |
| */ |
| |
| static BOOL |
| could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode, |
|   BOOL utf, compile_data *cd) |
| { |
| int c; |
| |
| /* Scan the items of one branch, from the item after "code" up to "endcode". |
| FALSE is returned as soon as an item is found that has to match at least one |
| character; TRUE is returned at the end of the branch, at "endcode", or when |
| an unclosed bracket or a forward-referenced subroutine call is met. Each |
| pass must leave "c" holding the opcode at "code", because the loop increment |
| steps over that item by its table length. */ |
| |
| for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); |
|      code < endcode; |
|      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE)) |
|   { |
|   const pcre_uchar *ccode; |
| |
|   c = *code; |
| |
|   /* Skip over forward assertions; the other assertions are skipped by |
|   first_significant_code() with a TRUE final argument. */ |
| |
|   if (c == OP_ASSERT) |
|     { |
|     do code += GET(code, 1); while (*code == OP_ALT); |
|     c = *code; |
|     continue; |
|     } |
| |
|   /* For a recursion/subroutine call, if its end has been reached, which |
|   implies a backward reference subroutine call, we can scan it. If it's a |
|   forward reference subroutine call, we can't. To detect forward reference |
|   we have to scan up the list that is kept in the workspace. This function is |
|   called only when doing the real compile, not during the pre-compile that |
|   measures the size of the compiled pattern. */ |
| |
|   if (c == OP_RECURSE) |
|     { |
|     const pcre_uchar *scode; |
|     BOOL empty_branch; |
| |
|     /* Test for forward reference */ |
| |
|     for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE) |
|       if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE; |
| |
|     /* Not a forward reference, test for completed backward reference */ |
| |
|     empty_branch = FALSE; |
|     scode = cd->start_code + GET(code, 1); |
|     if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */ |
| |
|     /* Completed backwards reference */ |
| |
|     do |
|       { |
|       if (could_be_empty_branch(scode, endcode, utf, cd)) |
|         { |
|         empty_branch = TRUE; |
|         break; |
|         } |
|       scode += GET(scode, 1); |
|       } |
|     while (*scode == OP_ALT); |
| |
|     if (!empty_branch) return FALSE;  /* All branches are non-empty */ |
|     continue; |
|     } |
| |
|   /* Groups with zero repeats can of course be empty; skip them. */ |
| |
|   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO || |
|       c == OP_BRAPOSZERO) |
|     { |
|     code += PRIV(OP_lengths)[c]; |
|     do code += GET(code, 1); while (*code == OP_ALT); |
|     c = *code; |
|     continue; |
|     } |
| |
|   /* A nested group that is already marked as "could be empty" can just be |
|   skipped. */ |
| |
|   if (c == OP_SBRA  || c == OP_SBRAPOS || |
|       c == OP_SCBRA || c == OP_SCBRAPOS) |
|     { |
|     do code += GET(code, 1); while (*code == OP_ALT); |
|     c = *code; |
|     continue; |
|     } |
| |
|   /* For other groups, scan the branches. */ |
| |
|   if (c == OP_BRA  || c == OP_BRAPOS || |
|       c == OP_CBRA || c == OP_CBRAPOS || |
|       c == OP_ONCE || c == OP_ONCE_NC || |
|       c == OP_COND) |
|     { |
|     BOOL empty_branch; |
|     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */ |
| |
|     /* If a conditional group has only one branch, there is a second, implied, |
|     empty branch, so just skip over the conditional, because it could be empty. |
|     Otherwise, scan the individual branches of the group. */ |
| |
|     if (c == OP_COND && code[GET(code, 1)] != OP_ALT) |
|       code += GET(code, 1); |
|     else |
|       { |
|       empty_branch = FALSE; |
|       do |
|         { |
|         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd)) |
|           empty_branch = TRUE; |
|         code += GET(code, 1); |
|         } |
|       while (*code == OP_ALT); |
|       if (!empty_branch) return FALSE;   /* All branches are non-empty */ |
|       } |
| |
|     c = *code; |
|     continue; |
|     } |
| |
|   /* Handle the other opcodes */ |
| |
|   switch (c) |
|     { |
|     /* Check for quantifiers after a class. XCLASS is used for classes that |
|     cannot be represented just by a bit map. This includes negated single |
|     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the |
|     actual length is stored in the compiled code, so we must update "code" |
|     here. */ |
| |
| #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 |
|     case OP_XCLASS: |
|     ccode = code += GET(code, 1); |
|     goto CHECK_CLASS_REPEAT; |
| #endif |
| |
|     case OP_CLASS: |
|     case OP_NCLASS: |
|     ccode = code + PRIV(OP_lengths)[OP_CLASS]; |
| |
| #if defined SUPPORT_UTF || !defined COMPILE_PCRE8 |
|     CHECK_CLASS_REPEAT: |
| #endif |
| |
|     switch (*ccode) |
|       { |
|       case OP_CRSTAR:            /* These could be empty; continue */ |
|       case OP_CRMINSTAR: |
|       case OP_CRQUERY: |
|       case OP_CRMINQUERY: |
|       break; |
| |
|       default:                   /* Non-repeat => class must match */ |
|       case OP_CRPLUS:            /* These repeats aren't empty */ |
|       case OP_CRMINPLUS: |
|       return FALSE; |
| |
|       case OP_CRRANGE: |
|       case OP_CRMINRANGE: |
|       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */ |
|       break; |
|       } |
|     break; |
| |
|     /* Opcodes that must match a character. The caseless forms of the |
|     one-or-more and exact repeats, and the horizontal/vertical space types, |
|     are listed alongside their counterparts: they consume at least one |
|     character in exactly the same way, and omitting them would make such |
|     items look as though they could match the empty string. */ |
| |
|     case OP_PROP: |
|     case OP_NOTPROP: |
|     case OP_EXTUNI: |
|     case OP_NOT_DIGIT: |
|     case OP_DIGIT: |
|     case OP_NOT_WHITESPACE: |
|     case OP_WHITESPACE: |
|     case OP_NOT_WORDCHAR: |
|     case OP_WORDCHAR: |
|     case OP_NOT_HSPACE: |
|     case OP_HSPACE: |
|     case OP_NOT_VSPACE: |
|     case OP_VSPACE: |
|     case OP_ANY: |
|     case OP_ALLANY: |
|     case OP_ANYBYTE: |
|     case OP_CHAR: |
|     case OP_CHARI: |
|     case OP_NOT: |
|     case OP_NOTI: |
|     case OP_PLUS: |
|     case OP_PLUSI: |
|     case OP_MINPLUS: |
|     case OP_MINPLUSI: |
|     case OP_POSPLUS: |
|     case OP_POSPLUSI: |
|     case OP_EXACT: |
|     case OP_EXACTI: |
|     case OP_NOTPLUS: |
|     case OP_NOTPLUSI: |
|     case OP_NOTMINPLUS: |
|     case OP_NOTMINPLUSI: |
|     case OP_NOTPOSPLUS: |
|     case OP_NOTPOSPLUSI: |
|     case OP_NOTEXACT: |
|     case OP_NOTEXACTI: |
|     case OP_TYPEPLUS: |
|     case OP_TYPEMINPLUS: |
|     case OP_TYPEPOSPLUS: |
|     case OP_TYPEEXACT: |
|     return FALSE; |
| |
|     /* These are going to continue, as they may be empty, but we have to |
|     fudge the length for the \p and \P cases. */ |
| |
|     case OP_TYPESTAR: |
|     case OP_TYPEMINSTAR: |
|     case OP_TYPEPOSSTAR: |
|     case OP_TYPEQUERY: |
|     case OP_TYPEMINQUERY: |
|     case OP_TYPEPOSQUERY: |
|     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; |
|     break; |
| |
|     /* Same for these */ |
| |
|     case OP_TYPEUPTO: |
|     case OP_TYPEMINUPTO: |
|     case OP_TYPEPOSUPTO: |
|     if (code[1 + IMM2_SIZE] == OP_PROP |
|       || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2; |
|     break; |
| |
|     /* End of branch */ |
| |
|     case OP_KET: |
|     case OP_KETRMAX: |
|     case OP_KETRMIN: |
|     case OP_KETRPOS: |
|     case OP_ALT: |
|     return TRUE; |
| |
|     /* In UTF mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, |
|     MINUPTO, and POSUPTO, together with their caseless and negated (NOT...) |
|     forms, may be followed by a multi-unit character. The loop increment adds |
|     only the table length, so the extra code units are skipped here. */ |
| |
| #ifdef SUPPORT_UTF |
|     case OP_STAR: |
|     case OP_STARI: |
|     case OP_NOTSTAR: |
|     case OP_NOTSTARI: |
|     case OP_MINSTAR: |
|     case OP_MINSTARI: |
|     case OP_NOTMINSTAR: |
|     case OP_NOTMINSTARI: |
|     case OP_POSSTAR: |
|     case OP_POSSTARI: |
|     case OP_NOTPOSSTAR: |
|     case OP_NOTPOSSTARI: |
|     case OP_QUERY: |
|     case OP_QUERYI: |
|     case OP_NOTQUERY: |
|     case OP_NOTQUERYI: |
|     case OP_MINQUERY: |
|     case OP_MINQUERYI: |
|     case OP_NOTMINQUERY: |
|     case OP_NOTMINQUERYI: |
|     case OP_POSQUERY: |
|     case OP_POSQUERYI: |
|     case OP_NOTPOSQUERY: |
|     case OP_NOTPOSQUERYI: |
|     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]); |
|     break; |
| |
|     case OP_UPTO: |
|     case OP_UPTOI: |
|     case OP_NOTUPTO: |
|     case OP_NOTUPTOI: |
|     case OP_MINUPTO: |
|     case OP_MINUPTOI: |
|     case OP_NOTMINUPTO: |
|     case OP_NOTMINUPTOI: |
|     case OP_POSUPTO: |
|     case OP_POSUPTOI: |
|     case OP_NOTPOSUPTO: |
|     case OP_NOTPOSUPTOI: |
|     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]); |
|     break; |
| #endif |
| |
|     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument |
|     string. */ |
| |
|     case OP_MARK: |
|     case OP_PRUNE_ARG: |
|     case OP_SKIP_ARG: |
|     case OP_THEN_ARG: |
|     code += code[1]; |
|     break; |
| |
|     /* None of the remaining opcodes are required to match a character. */ |
| |
|     default: |
|     break; |
|     } |
|   } |
| |
| return TRUE; |
| } |
| |
| |
| |
| /************************************************* |
| * Scan compiled regex for non-emptiness * |
| *************************************************/ |
| |
| /* This function is called to check for left recursive calls. We want to check |
| the current branch of the current pattern to see if it could match the empty |
| string. If it could, we must look outwards for branches at other levels, |
| stopping when we pass beyond the bracket which is the subject of the recursion. |
| This function is called only during the real compile, not during the |
| pre-compile. |
| |
| Arguments: |
| code points to start of the recursion |
| endcode points to where to stop (current RECURSE item) |
| bcptr points to the chain of current (unclosed) branch starts |
| utf TRUE if in UTF-8 / UTF-16 mode |
| cd pointers to tables etc |
| |
| Returns: TRUE if what is matched could be empty |
| */ |
| |
| static BOOL |
| could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode, |
|   branch_chain *bcptr, BOOL utf, compile_data *cd) |
| { |
| branch_chain *bc; |
| |
| /* Work outwards along the chain of unclosed branches, for as long as the |
| branch start is not before "code". One branch that cannot be empty settles |
| the answer. */ |
| |
| for (bc = bcptr; bc != NULL && bc->current_branch >= code; bc = bc->outer) |
|   { |
|   if (!could_be_empty_branch(bc->current_branch, endcode, utf, cd)) |
|     return FALSE; |
|   } |
| |
| return TRUE; |
| } |
| |
| |
| |
| /************************************************* |
| * Check for POSIX class syntax * |
| *************************************************/ |
| |
| /* This function is called when the sequence "[:" or "[." or "[=" is |
| encountered in a character class. It checks whether this is followed by a |
| sequence of characters terminated by a matching ":]" or ".]" or "=]". If we |
| reach an unescaped ']' without the special preceding character, return FALSE. |
| |
| Originally, this function only recognized a sequence of letters between the |
| terminators, but it seems that Perl recognizes any sequence of characters, |
| though of course unknown POSIX names are subsequently rejected. Perl gives an |
| "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE |
| didn't consider this to be a POSIX class. Likewise for [:1234:]. |
| |
| The problem in trying to be exactly like Perl is in the handling of escapes. We |
| have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX |
| class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code |
| below handles the special case of \], but does not try to do any other escape |
| processing. This makes it different from Perl for cases such as [:l\ower:] |
| where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize |
| "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does, |
| I think. |
| |
| A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. |
| It seems that the appearance of a nested POSIX class supersedes an apparent |
| external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or |
| a digit. |
| |
| In Perl, unescaped square brackets may also appear as part of class names. For |
| example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for |
| [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not |
| seem right at all. PCRE does not allow closing square brackets in POSIX class |
| names. |
| |
| Arguments: |
| ptr pointer to the initial [ |
| endptr where to return the end pointer |
| |
| Returns: TRUE or FALSE |
| */ |
| |
| static BOOL |
| check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr) |
| { |
| const pcre_uchar *p = ptr + 1;   /* The ':', '.' or '=' after the '[' */ |
| int closer;                      /* Kept apart from its assignment, as some */ |
| closer = *p;                     /* compilers warn about such initializers. */ |
| |
| for (p++; *p != 0; p++) |
|   { |
|   /* An escaped ']' is stepped over as a pair; no other escape processing is |
|   attempted. */ |
| |
|   if (*p == CHAR_BACKSLASH && p[1] == CHAR_RIGHT_SQUARE_BRACKET) |
|     { |
|     p++; |
|     continue; |
|     } |
| |
|   /* A bare ']' ends the search without a match. */ |
| |
|   if (*p == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; |
| |
|   /* The terminating pair: report the position of the first of the two |
|   characters. */ |
| |
|   if (*p == closer && p[1] == CHAR_RIGHT_SQUARE_BRACKET) |
|     { |
|     *endptr = p; |
|     return TRUE; |
|     } |
| |
|   /* A complete nested POSIX item supersedes the outer one. Note that the |
|   nested call may already have written to *endptr. */ |
| |
|   if (*p == CHAR_LEFT_SQUARE_BRACKET && |
|       (p[1] == CHAR_COLON || p[1] == CHAR_DOT || p[1] == CHAR_EQUALS_SIGN) && |
|       check_posix_syntax(p, endptr)) |
|     return FALSE; |
|   } |
| |
| return FALSE;   /* Ran into the terminating zero */ |
| } |
| |
| |
| |
| |
| /************************************************* |
| * Check POSIX class name * |
| *************************************************/ |
| |
| /* This function is called to check the name given in a POSIX-style class entry |
| such as [:alnum:]. |
| |
| Arguments: |
| ptr points to the first letter |
| len the length of the name |
| |
| Returns: a value representing the name, or -1 if unknown |
| */ |
| |
| static int |
| check_posix_name(const pcre_uchar *ptr, int len) |
| { |
| const char *name = posix_names; |
| int idx; |
| |
| /* posix_name_lengths[] ends with a zero entry. The names themselves are laid |
| out one after another in posix_names, each taking its length plus one. */ |
| |
| for (idx = 0; posix_name_lengths[idx] != 0; idx++) |
|   { |
|   if (posix_name_lengths[idx] == len && |
|       STRNCMP_UC_C8(ptr, name, len) == 0) |
|     return idx; |
|   name += posix_name_lengths[idx] + 1; |
|   } |
| |
| return -1;   /* Not a known name */ |
| } |
| |
| |
| /************************************************* |
| * Adjust OP_RECURSE items in repeated group * |
| *************************************************/ |
| |
| /* OP_RECURSE items contain an offset from the start of the regex to the group |
| that is referenced. This means that groups can be replicated for fixed |
| repetition simply by copying (because the recursion is allowed to refer to |
| earlier groups that are outside the current group). However, when a group is |
| optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is |
| inserted before it, after it has been compiled. This means that any OP_RECURSE |
| items within it that refer to the group itself or any contained groups have to |
| have their offsets adjusted. That is one of the jobs of this function. Before it |
| is called, the partially compiled regex must be temporarily terminated with |
| OP_END. |
| |
| This function has been extended with the possibility of forward references for |
| recursions and subroutine calls. It must also check the list of such references |
| for the group we are dealing with. If it finds that one of the recursions in |
| the current group is on this list, it adjusts the offset in the list, not the |
| value in the reference (which is a group number). |
| |
| Arguments: |
| group points to the start of the group |
| adjust the amount by which the group is to be moved |
| utf TRUE in UTF-8 / UTF-16 mode |
| cd contains pointers to tables etc. |
| save_hwm the hwm forward reference pointer at the start of the group |
| |
| Returns: nothing |
| */ |
| |
| static void |
| adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd, |
|   pcre_uchar *save_hwm) |
| { |
| pcre_uchar *rec; |
| |
| /* Visit every OP_RECURSE from "group" onwards; after each one the search |
| resumes just beyond its link field. */ |
| |
| for (rec = (pcre_uchar *)find_recurse(group, utf); |
|      rec != NULL; |
|      rec = (pcre_uchar *)find_recurse(rec + 1 + LINK_SIZE, utf)) |
|   { |
|   BOOL on_list = FALSE; |
|   pcre_uchar *slot; |
| |
|   /* Entries between save_hwm and cd->hwm hold offsets of forward references. |
|   When one of them designates this item, it is the list entry that is moved, |
|   not the value inside the item. */ |
| |
|   for (slot = save_hwm; slot < cd->hwm; slot += LINK_SIZE) |
|     { |
|     int listed = GET(slot, 0); |
|     if (cd->start_code + listed != rec + 1) continue; |
|     PUT(slot, 0, listed + adjust); |
|     on_list = TRUE; |
|     break; |
|     } |
| |
|   /* Not on the list: the item holds an offset from the start of the code, |
|   which is moved only when it points at or after the start of the group. */ |
| |
|   if (!on_list) |
|     { |
|     int target = GET(rec, 1); |
|     if (cd->start_code + target >= group) PUT(rec, 1, target + adjust); |
|     } |
|   } |
| } |
| |
| |
| |
| /************************************************* |
| * Insert an automatic callout point * |
| *************************************************/ |
| |
| /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert |
| callout points before each pattern item. |
| |
| Arguments: |
| code current code pointer |
| ptr current pattern pointer |
| cd pointers to tables etc |
| |
| Returns: new code pointer |
| */ |
| |
| static pcre_uchar * |
| auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd) |
| { |
| /* Layout: opcode, the number 255, then two LINK_SIZE fields holding the |
| offset into the pattern and the length of the next item. The length is |
| written as zero here. */ |
| |
| code[0] = OP_CALLOUT; |
| code[1] = 255; |
| PUT(code, 2, (int)(ptr - cd->start_pattern));   /* Pattern offset */ |
| PUT(code, 2 + LINK_SIZE, 0);                    /* Default length */ |
| |
| return code + 2 + 2 * LINK_SIZE; |
| } |
| |
| |
| |
| /************************************************* |
| * Complete a callout item * |
| *************************************************/ |
| |
| /* A callout item contains the length of the next item in the pattern, which |
| we can't fill in till after we have reached the relevant point. This is used |
| for both automatic and manual callouts. |
| |
| Arguments: |
| previous_callout points to previous callout item |
| ptr current pattern pointer |
| cd pointers to tables etc |
| |
| Returns: nothing |
| */ |
| |
| static void |
| complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd) |
| { |
| /* The callout item recorded the pattern offset at which the next item began; |
| the distance from there to the current pattern position is that item's |
| length, which goes into the field after the offset. */ |
| |
| int item_start = GET(previous_callout, 2); |
| int item_length = (int)(ptr - cd->start_pattern - item_start); |
| |
| PUT(previous_callout, 2 + LINK_SIZE, item_length); |
| } |
| |
| |
| |
| #ifdef SUPPORT_UCP |
| /************************************************* |
| * Get othercase range * |
| *************************************************/ |
| |
| /* This function is passed the start and end of a class range, in UTF-8 mode |
| with UCP support. It searches up the characters, looking for internal ranges of |
| characters in the "other" case. Each call returns the next one, updating the |
| start address. |
| |
| Arguments: |
| cptr points to starting character value; updated |
| d end value |
| ocptr where to put start of othercase range |
| odptr where to put end of othercase range |
| |
| Yield: TRUE when range returned; FALSE when no more |
| */ |
| |
| static BOOL |
| get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, |
|   unsigned int *odptr) |
| { |
| unsigned int ch = *cptr; |
| unsigned int other = 0; |
| unsigned int expected; |
| |
| /* Find the first character, up to and including d, whose other case differs |
| from itself. */ |
| |
| while (ch <= d) |
|   { |
|   other = UCD_OTHERCASE(ch); |
|   if (other != ch) break; |
|   ch++; |
|   } |
| |
| if (ch > d) return FALSE;   /* Nothing left in the range */ |
| |
| /* The other-case range starts here; extend it for as long as the other |
| cases of the following characters run consecutively. */ |
| |
| *ocptr = other; |
| expected = other + 1; |
| ch++; |
| |
| while (ch <= d && UCD_OTHERCASE(ch) == expected) |
|   { |
|   expected++; |
|   ch++; |
|   } |
| |
| *odptr = expected - 1; |
| *cptr = ch;                 /* Where the next call carries on */ |
| |
| return TRUE; |
| } |
| |
| |
| |
| /************************************************* |
| * Check a character and a property * |
| *************************************************/ |
| |
| /* This function is called by check_auto_possessive() when a property item |
| is adjacent to a fixed character. |
| |
| Arguments: |
| c the character |
| ptype the property type |
| pdata the data for the type |
| negated TRUE if it's a negated property (\P or \p{^) |
| |
| Returns: TRUE if auto-possessifying is OK |
| */ |
| |
| static BOOL |
| check_char_prop(int c, int ptype, int pdata, BOOL negated) |
| { |
| const pcre_uint8 chartype = UCD_CHARTYPE(c); |
| BOOL has_prop; |
| |
| /* First establish whether the character has the property; the answer is |
| then compared with "negated". A property type that is not handled here |
| yields FALSE. */ |
| |
| switch(ptype) |
|   { |
|   case PT_LAMP: |
|   has_prop = (chartype == ucp_Lu || |
|               chartype == ucp_Ll || |
|               chartype == ucp_Lt); |
|   break; |
| |
|   case PT_GC: |
|   has_prop = (pdata == PRIV(ucp_gentype)[chartype]); |
|   break; |
| |
|   case PT_PC: |
|   has_prop = (pdata == chartype); |
|   break; |
| |
|   case PT_SC: |
|   has_prop = (pdata == UCD_SCRIPT(c)); |
|   break; |
| |
|   /* These are specials */ |
| |
|   case PT_ALNUM: |
|   has_prop = (PRIV(ucp_gentype)[chartype] == ucp_L || |
|               PRIV(ucp_gentype)[chartype] == ucp_N); |
|   break; |
| |
|   case PT_SPACE:    /* Perl space */ |
|   has_prop = (PRIV(ucp_gentype)[chartype] == ucp_Z || |
|               c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR); |
|   break; |
| |
|   case PT_PXSPACE:  /* POSIX space */ |
|   has_prop = (PRIV(ucp_gentype)[chartype] == ucp_Z || |
|               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || |
|               c == CHAR_FF || c == CHAR_CR); |
|   break; |
| |
|   case PT_WORD: |
|   has_prop = (PRIV(ucp_gentype)[chartype] == ucp_L || |
|               PRIV(ucp_gentype)[chartype] == ucp_N || |
|               c == CHAR_UNDERSCORE); |
|   break; |
| |
|   default: |
|   return FALSE; |
|   } |
| |
| return has_prop == negated; |
| } |
| #endif /* SUPPORT_UCP */ |
| |
| |
| |
| /************************************************* |
| * Check if auto-possessifying is possible * |
| *************************************************/ |
| |
| /* This function is called for unlimited repeats of certain items, to see |
| whether the next thing could possibly match the repeated item. If not, it makes |
| sense to automatically possessify the repeated item. |
| |
| Arguments: |
| previous pointer to the repeated opcode |
| utf TRUE in UTF-8 / UTF-16 mode |
| ptr next character in pattern |
| options options bits |
| cd contains pointers to tables etc. |
| |
| Returns: TRUE if possessifying is wanted |
| */ |
| |
| static BOOL |
| check_auto_possessive(const pcre_uchar *previous, BOOL utf, |
| const pcre_uchar *ptr, int options, compile_data *cd) |
| { |
| pcre_int32 c, next; |
| int op_code = *previous++; |
| |
| /* Skip whitespace and comments in extended mode */ |
| |
| if ((options & PCRE_EXTENDED) != 0) |
| { |
| for (;;) |
| { |
| while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++; |
| if (*ptr == CHAR_NUMBER_SIGN) |
| { |
| ptr++; |
| while (*ptr != 0) |
| { |
| if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } |
| ptr++; |
| #ifdef SUPPORT_UTF |
| if (utf) FORWARDCHAR(ptr); |
| #endif |
| } |
| } |
| else break; |
| } |
| } |
| |
| /* If the next item is one that we can handle, get its value. A non-negative |
| value is a character, a negative value is an escape value. */ |
| |
| if (*ptr == CHAR_BACKSLASH) |
| { |
| int temperrorcode = 0; |
| next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE); |
| if (temperrorcode != 0) return FALSE; |
| ptr++; /* Point after the escape sequence */ |
| } |
| else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0) |
| { |
| #ifdef SUPPORT_UTF |
| if (utf) { GETCHARINC(next, ptr); } else |
| #endif |
| next = *ptr++; |
| } |
| else return FALSE; |
| |
| /* Skip whitespace and comments in extended mode */ |
| |
| if ((options & PCRE_EXTENDED) != 0) |
| { |
| for (;;) |
| { |
| while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++; |
| if (*ptr == CHAR_NUMBER_SIGN) |
| { |
| ptr++; |
| while (*ptr != 0) |
| { |
| if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } |
| ptr++; |
| #ifdef SUPPORT_UTF |
| if (utf) FORWARDCHAR(ptr); |
| #endif |
| } |
| } |
| else break; |
| } |
| } |
| |
| /* If the next thing is itself optional, we have to give up. */ |
| |
| if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK || |
| STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) |
| return FALSE; |
| |
| /* Now compare the next item with the previous opcode. First, handle cases when |
| the next item is a character. */ |
| |
| if (next >= 0) switch(op_code) |
| { |
| case OP_CHAR: |
| #ifdef SUPPORT_UTF |
| GETCHARTEST(c, previous); |
| #else |
| c = *previous; |
| #endif |
| return c != next; |
| |
| /* For CHARI (caseless character) we must check the other case. If we have |
| Unicode property support, we can use it to test the other case of |
| high-valued characters. */ |
| |
| case OP_CHARI: |
| #ifdef SUPPORT_UTF |
| GETCHARTEST(c, previous); |
| #else |
| c = *previous; |
| #endif |
| if (c == next) return FALSE; |
| #ifdef SUPPORT_UTF |
| if (utf) |
| { |
| unsigned int othercase; |
| if (next < 128) othercase = cd->fcc[next]; else |
| #ifdef SUPPORT_UCP |
| othercase = UCD_OTHERCASE((unsigned int)next); |
| #else |
| othercase = NOTACHAR; |
| #endif |
| return (unsigned int)c != othercase; |
| } |
| else |
| #endif /* SUPPORT_UTF */ |
| return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */ |
| |
| case OP_NOT: |
| #ifdef SUPPORT_UTF |
| GETCHARTEST(c, previous); |
| #else |
| c = *previous; |
| #endif |
| return c == next; |
| |
| case OP_NOTI: |
| #ifdef SUPPORT_UTF |
| GETCHARTEST(c, previous); |
| |