| /************************************************* |
| * PCRE2 testing program * |
| *************************************************/ |
| |
| /* PCRE2 is a library of functions to support regular expressions whose syntax |
| and semantics are as close as possible to those of the Perl 5 language. In 2014 |
| the API was completely revised and '2' was added to the name, because the old |
| API, which had lasted for 16 years, could not accommodate new requirements. At |
| the same time, this testing program was re-designed because its original |
| hacked-up (non-) design had also run out of steam. |
| |
| Written by Philip Hazel |
| Original code Copyright (c) 1997-2012 University of Cambridge |
| Rewritten code Copyright (c) 2016-2024 University of Cambridge |
| |
| ----------------------------------------------------------------------------- |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions are met: |
| |
| * Redistributions of source code must retain the above copyright notice, |
| this list of conditions and the following disclaimer. |
| |
| * Redistributions in binary form must reproduce the above copyright |
| notice, this list of conditions and the following disclaimer in the |
| documentation and/or other materials provided with the distribution. |
| |
| * Neither the name of the University of Cambridge nor the names of its |
| contributors may be used to endorse or promote products derived from |
| this software without specific prior written permission. |
| |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| POSSIBILITY OF SUCH DAMAGE. |
| ----------------------------------------------------------------------------- |
| */ |
| |
| |
| /* This program supports testing of the 8-bit, 16-bit, and 32-bit PCRE2 |
| libraries in a single program, though its input and output are always 8-bit. |
| It is different from modules such as pcre2_compile.c in the library itself, |
| which are compiled separately for each code unit width. If two widths are |
| enabled, for example, pcre2_compile.c is compiled twice. In contrast, |
| pcre2test.c is compiled only once, and linked with all the enabled libraries. |
| Therefore, it must not make use of any of the macros from pcre2.h or |
| pcre2_internal.h that depend on PCRE2_CODE_UNIT_WIDTH. It does, however, make |
| use of SUPPORT_PCRE2_8, SUPPORT_PCRE2_16, and SUPPORT_PCRE2_32, to ensure that |
| it references only the enabled library functions. */ |
| |
| |
| #if defined HAVE_CONFIG_H && !defined PCRE2_CONFIG_H_IDEMPOTENT_GUARD |
| #define PCRE2_CONFIG_H_IDEMPOTENT_GUARD |
| #include "config.h" |
| #endif |
| |
| |
| |
| #include <ctype.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <stdlib.h> |
| #include <time.h> |
| #include <locale.h> |
| #include <errno.h> |
| |
| #if defined NATIVE_ZOS |
| #include "pcrzoscs.h" |
| /* That header is not included in the main PCRE2 distribution because other |
| apparatus is needed to compile pcre2test for z/OS. The header can be found in |
| the special z/OS distribution, which is available from www.zaconsultants.net or |
| from www.cbttape.org. */ |
| #endif |
| |
| #ifdef HAVE_UNISTD_H |
| #include <unistd.h> |
| #endif |
| |
| /* Debugging code enabler */ |
| |
| /* #define DEBUG_SHOW_MALLOC_ADDRESSES */ |
| |
| /* Both libreadline and libedit are optionally supported */ |
| #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT) |
| #if defined(SUPPORT_LIBREADLINE) |
| #include <readline/readline.h> |
| #include <readline/history.h> |
| #else |
| #if defined(HAVE_EDITLINE_READLINE_H) |
| #include <editline/readline.h> |
| #elif defined(HAVE_EDIT_READLINE_READLINE_H) |
| #include <edit/readline/readline.h> |
| #else |
| #include <readline.h> |
| /* GNU readline defines this macro but libedit doesn't, if that ever changes |
| this needs to be updated or the build could break */ |
| #ifdef RL_VERSION_MAJOR |
| #include <history.h> |
| #endif |
| #endif |
| #endif |
| #endif |
| |
| /* Put the test for interactive input into a macro so that it can be changed if |
| required for different environments. INTERACTIVE(f) takes a stdio stream and |
| applies isatty() to the descriptor that fileno() returns for it. */ |
| |
| #define INTERACTIVE(f) isatty(fileno(f)) |
| |
| |
| /* ---------------------- System-specific definitions ---------------------- */ |
| |
| /* A number of things vary for Windows builds. Originally, pcretest opened its |
| input and output without "b"; then I was told that "b" was needed in some |
| environments, so it was added for release 5.0 to both the input and output. (It |
| makes no difference on Unix-like systems.) Later I was told that it is wrong |
| for the input on Windows. I've now abstracted the modes into macros that are |
| set here, to make it easier to fiddle with them, and removed "b" from the input |
| mode under Windows. The BINARY versions are used when saving/restoring compiled |
| patterns. */ |
| |
| #if defined(_WIN32) || defined(WIN32) |
| #include <io.h> /* For _setmode() */ |
| #include <fcntl.h> /* For _O_BINARY */ |
| #define INPUT_MODE "r" /* Text mode: no b flag for input on Windows */ |
| #define OUTPUT_MODE "wb" |
| #define BINARY_INPUT_MODE "rb" /* For restoring compiled patterns */ |
| #define BINARY_OUTPUT_MODE "wb" /* For saving compiled patterns */ |
| |
| #ifndef isatty |
| #define isatty _isatty /* This is what Windows calls them, I'm told, */ |
| #endif /* though in some environments they seem to */ |
| /* be already defined, hence the #ifndefs. */ |
| #ifndef fileno |
| #define fileno _fileno |
| #endif |
| |
| /* A user sent this fix for Borland Builder 5 under Windows. */ |
| |
| #ifdef __BORLANDC__ |
| #define _setmode(handle, mode) setmode(handle, mode) |
| #endif |
| |
| /* Not Windows */ |
| |
| #else |
| #include <sys/time.h> /* These two includes are needed */ |
| #include <sys/resource.h> /* for setrlimit(). */ |
| #if defined NATIVE_ZOS /* z/OS uses non-binary I/O */ |
| #define INPUT_MODE "r" |
| #define OUTPUT_MODE "w" |
| #define BINARY_INPUT_MODE "rb" |
| #define BINARY_OUTPUT_MODE "wb" |
| #else /* Neither Windows nor z/OS: binary mode throughout */ |
| #define INPUT_MODE "rb" |
| #define OUTPUT_MODE "wb" |
| #define BINARY_INPUT_MODE "rb" |
| #define BINARY_OUTPUT_MODE "wb" |
| #endif /* NATIVE_ZOS */ |
| #endif /* Windows or not Windows */ |
| |
| /* VMS-specific code was included as suggested by a VMS user [1]. Another VMS |
| user [2] provided alternative code which worked better for him. I have |
| commented out the original, but kept it around just in case. */ |
| |
| #ifdef __VMS |
| #include <ssdef.h> |
| /* These two includes came from [2]. */ |
| #include descrip |
| #include lib$routines |
| /* void vms_setsymbol( char *, char *, int ); Original code from [1]. */ |
| #endif |
| |
| /* old VC and older compilers don't support %td or %zu, and even some that |
| claim to be C99 don't support it (hence DISABLE_PERCENT_ZT). |
| |
| PTR_FORM and SIZ_FORM hold the conversion text without the leading percent |
| sign: td and zu where those are usable, otherwise the long forms (or the long |
| long forms when _WIN64 is defined). NOTE(review): with the fallback forms the |
| corresponding arguments presumably have to be cast to match - check call sites. */ |
| |
| #if defined(DISABLE_PERCENT_ZT) || (defined(_MSC_VER) && (_MSC_VER < 1800)) || \ |
| (!defined(_MSC_VER) && (!defined(__STDC_VERSION__) || (__STDC_VERSION__ < 199901L))) |
| #ifdef _WIN64 |
| #define PTR_FORM "lld" |
| #define SIZ_FORM "llu" |
| #else |
| #define PTR_FORM "ld" |
| #define SIZ_FORM "lu" |
| #endif /* _WIN64 */ |
| #else /* td and zu are available */ |
| #define PTR_FORM "td" |
| #define SIZ_FORM "zu" |
| #endif |
| |
| /* ------------------End of system-specific definitions -------------------- */ |
| |
| /* Glueing macros that are used in several places below. G() and STR() go |
| through a second macro so that their arguments are macro-expanded before being |
| pasted or stringified; glue() and stringify() act on the argument text as is. */ |
| |
| #define glue(a,b) a##b |
| #define G(a,b) glue(a,b) |
| |
| #define stringify(x) #x |
| #define STR(x) stringify(x) |
| |
| /* Miscellaneous parameters and manifests */ |
| |
| #ifndef CLOCKS_PER_SEC /* Normally supplied by <time.h> */ |
| #ifdef CLK_TCK |
| #define CLOCKS_PER_SEC CLK_TCK /* Fall back to CLK_TCK when it exists */ |
| #else |
| #define CLOCKS_PER_SEC 100 /* Last-resort value when neither is defined */ |
| #endif |
| #endif |
| |
| #define CFORE_UNSET UINT32_MAX /* Unset value for startend/cfail/cerror fields */ |
| #define CONVERT_UNSET UINT32_MAX /* Unset value for convert_type field */ |
| #define DFA_WS_DIMENSION 1000 /* Size of DFA workspace */ |
| #define DEFAULT_OVECCOUNT 15 /* Default ovector count */ |
| #define JUNK_OFFSET 0xdeadbeef /* For initializing ovector */ |
| #define LOCALESIZE 32 /* Size of locale name */ |
| #define LOOPREPEAT 500000 /* Default loop count for timing */ |
| #define MALLOCLISTSIZE 20 /* For remembering mallocs */ |
| #define PARENS_NEST_DEFAULT 220 /* Default parentheses nest limit */ |
| #define PATSTACKSIZE 20 /* Pattern stack for save/restore testing */ |
| #define REPLACE_MODSIZE 100 /* Field for reading 8-bit replacement */ |
| #define VERSION_SIZE 64 /* Size of buffer for the version strings */ |
| #define REPLACE_BUFFSIZE 256 /* Code units for replacement buffer */ |
| |
| /* Default JIT compile options: the three PCRE2_JIT_xxx bits for complete, |
| partial soft and partial hard matching, ORed together. */ |
| |
| #define JIT_DEFAULT (PCRE2_JIT_COMPLETE|\ |
| PCRE2_JIT_PARTIAL_SOFT|\ |
| PCRE2_JIT_PARTIAL_HARD) |
| |
| /* Execution modes. Each value is the code unit width in bits. */ |
| |
| #define PCRE2TEST_MODE_8 8 |
| #define PCRE2TEST_MODE_16 16 |
| #define PCRE2TEST_MODE_32 32 |
| |
| /* Processing returns. PR_OK is 0 and the others follow consecutively. */ |
| |
| enum { PR_OK, PR_SKIP, PR_ABEND, PR_ENDIF }; |
| |
| /* The macro EBCDIC_IO describes whether pcre2test takes ASCII or EBCDIC as |
| its input files (or terminal input). If the compiler uses ASCII for character |
| literals, then we make pcre2test take ASCII as its input and output. This is |
| different to the core PCRE2 library, where we use macros like "CHAR_A" for every |
| single character and string literal used in pattern parsing and matching. It |
| would simply be too arduous to do the same for pcre2test, so we make its |
| input/output format match the compiler's codepage. */ |
| #if defined(EBCDIC) && 'a' == 0x81 /* 0x81 is the EBCDIC code for the letter a */ |
| #define EBCDIC_IO 1 /* EBCDIC library and EBCDIC compiler literals */ |
| #else |
| #define EBCDIC_IO 0 /* No EBCDIC support, or the compiler literals are not EBCDIC */ |
| #endif |
| |
| /* The macro PRINTABLE determines whether to print an output character as-is or |
| as a hex value when showing compiled patterns. We use it in cases when the |
| locale has not been explicitly changed, so as to get consistent output from |
| systems that differ in their output from isprint() even in the "C" locale. |
| |
| Without EBCDIC the macro is the range test 32 <= c < 127, and it evaluates its |
| argument twice. With EBCDIC it calls printable(), which is declared further |
| down in this file. */ |
| |
| #if defined(EBCDIC) |
| #define PRINTABLE(c) printable(c) |
| #else |
| #define PRINTABLE(c) ((c) >= 32 && (c) < 127) |
| #endif |
| |
| /* The macro CHAR_OUTPUT is used to output characters in pcre2test's output |
| format. The input character is encoded in PCRE2's native codepage (EBCDIC, if |
| enabled), but the output may differ in the case where pcre2test uses ASCII input |
| and output. CHAR_INPUT is the conversion in the opposite direction. |
| NOTE(review): the _HEX forms always convert when EBCDIC is defined, so they |
| appear to be meant for values shown or read as numeric codes - confirm at the |
| points of use. */ |
| #if defined(EBCDIC) && !EBCDIC_IO /* EBCDIC library, ASCII input and output */ |
| #define CHAR_OUTPUT(c) ebcdic_to_ascii(c) |
| #define CHAR_OUTPUT_HEX(c) CHAR_OUTPUT(c) |
| #define CHAR_INPUT(c) ascii_to_ebcdic(c) |
| #define CHAR_INPUT_HEX(c) CHAR_INPUT(c) |
| #elif defined(EBCDIC) /* EBCDIC library, EBCDIC input and output */ |
| #define CHAR_OUTPUT(c) (c) |
| #define CHAR_OUTPUT_HEX(c) ebcdic_to_ascii(c) |
| #define CHAR_INPUT(c) (c) |
| #define CHAR_INPUT_HEX(c) ascii_to_ebcdic(c) |
| #else /* No EBCDIC: all four macros yield their argument unchanged */ |
| #define CHAR_OUTPUT(c) (c) |
| #define CHAR_OUTPUT_HEX(c) CHAR_OUTPUT(c) |
| #define CHAR_INPUT(c) (c) |
| #define CHAR_INPUT_HEX(c) CHAR_INPUT(c) |
| #endif |
| |
| /* We have to include some of the library source files because we need |
| to use some of the macros, internal structure definitions, and other internal |
| values - pcre2test has "inside information" compared to an application program |
| that strictly follows the PCRE2 API. |
| |
| Before including pcre2_internal.h we define PRIV so that it does not get |
| defined therein. This ensures that PRIV names in the included files do not |
| clash with those in the libraries. Also, although pcre2_internal.h does itself |
| include pcre2.h, we explicitly include it beforehand, along with pcre2posix.h, |
| so that the PCRE2_EXP_xxx macros get set appropriately for an application, not |
| for building the library. |
| |
| Setting PCRE2_CODE_UNIT_WIDTH to zero cuts out all the width-specific settings |
| in pcre2.h and pcre2_internal.h. Defining PCRE2_PCRE2TEST cuts out the check in |
| pcre2_internal.h that ensures PCRE2_CODE_UNIT_WIDTH is 8, 16, or 32 (which it |
| needs to be when compiling one of the libraries). */ |
| |
| #define PRIV(name) name /* Identity: no prefix is added to internal names */ |
| #define PCRE2_CODE_UNIT_WIDTH 0 /* Cuts out the width-specific settings */ |
| #define PCRE2_PCRE2TEST /* Disables the 8/16/32 width check */ |
| #include "pcre2.h" |
| #include "pcre2posix.h" |
| #include "pcre2_internal.h" |
| |
| /* We need access to some of the data tables that PCRE2 uses. The previous |
| definition of PCRE2_PCRE2TEST makes some minor changes in the files. The |
| previous definition of PRIV avoids name clashes. */ |
| |
| #include "pcre2_tables.c" |
| #include "pcre2_ucd.c" |
| |
| /* Forward-declarations for PRINTABLE(), etc. All of these exist only when |
| EBCDIC is defined; ascii_to_ebcdic_str() additionally requires that EBCDIC_IO |
| is zero. The single-character converters are the ones named by the CHAR_INPUT |
| and CHAR_OUTPUT macros above. */ |
| |
| #if defined(EBCDIC) |
| static BOOL printable(uint32_t c); |
| #endif |
| #if defined(EBCDIC) && !EBCDIC_IO |
| static void ascii_to_ebcdic_str(uint8_t *buf, size_t len); |
| #endif |
| #if defined(EBCDIC) |
| static uint32_t ascii_to_ebcdic(uint32_t c); |
| static uint32_t ebcdic_to_ascii(uint32_t c); |
| #endif |
| |
/* 32-bit integer values in the input are read by strtoul() or strtol(). The
check needed for overflow depends on whether long ints are in fact longer than
ints. They are defined not to be shorter.

U32OVERFLOW(x) and S32OVERFLOW(x) yield non-zero when x, a value obtained from
strtoul() or strtol() respectively, cannot be accepted as a 32-bit quantity.
When long is wider than 32 bits this is a plain range check. When it is not, a
range check could never fail, so the limit values themselves are rejected
(these are what the conversion functions return for out-of-range input).

Each use of the argument is parenthesized so that an expression such as a|b is
tested as a whole. S32OVERFLOW evaluates its argument twice, so the argument
must not have side effects. */

#if ULONG_MAX > UINT32_MAX
#define U32OVERFLOW(x) ((x) > UINT32_MAX)
#else
#define U32OVERFLOW(x) ((x) == UINT32_MAX)
#endif

#if LONG_MAX > INT32_MAX
#define S32OVERFLOW(x) ((x) > INT32_MAX || (x) < INT32_MIN)
#else
#define S32OVERFLOW(x) ((x) == INT32_MAX || (x) == INT32_MIN)
#endif
| |
| /* When PCRE2_CODE_UNIT_WIDTH is zero, pcre2_internal.h does not include |
| pcre2_intmodedep.h, which is where mode-dependent macros and structures are |
| defined. We can now include it for each supported code unit width. Because |
| PCRE2_CODE_UNIT_WIDTH was defined as zero before including pcre2.h, it will |
| have left PCRE2_SUFFIX defined as a no-op. We must re-define it appropriately |
| while including these files, and then restore it to a no-op. Because LINK_SIZE |
| may be changed in 16-bit mode and forced to 1 in 32-bit mode, the order of |
| these inclusions should not be changed. */ |
| |
| #undef PCRE2_SUFFIX /* Remove the no-op form left by pcre2.h */ |
| #undef PCRE2_CODE_UNIT_WIDTH /* Was 0 for the includes above */ |
| |
| #ifdef SUPPORT_PCRE2_8 |
| #define PCRE2_CODE_UNIT_WIDTH 8 |
| #define PCRE2_SUFFIX(a) G(a,8) /* Appends 8 to each suffixed name */ |
| #include "pcre2_intmodedep.h" |
| #include "pcre2_printint_inc.h" |
| #undef PCRE2_CODE_UNIT_WIDTH |
| #undef PCRE2_SUFFIX |
| #endif /* SUPPORT_PCRE2_8 */ |
| |
| #ifdef SUPPORT_PCRE2_16 |
| #define PCRE2_CODE_UNIT_WIDTH 16 |
| #define PCRE2_SUFFIX(a) G(a,16) /* Appends 16 to each suffixed name */ |
| #include "pcre2_intmodedep.h" |
| #include "pcre2_printint_inc.h" |
| #undef PCRE2_CODE_UNIT_WIDTH |
| #undef PCRE2_SUFFIX |
| #endif /* SUPPORT_PCRE2_16 */ |
| |
| #ifdef SUPPORT_PCRE2_32 |
| #define PCRE2_CODE_UNIT_WIDTH 32 |
| #define PCRE2_SUFFIX(a) G(a,32) /* Appends 32 to each suffixed name */ |
| #include "pcre2_intmodedep.h" |
| #include "pcre2_printint_inc.h" |
| #undef PCRE2_CODE_UNIT_WIDTH |
| #undef PCRE2_SUFFIX |
| #endif /* SUPPORT_PCRE2_32 */ |
| |
| #define PCRE2_CODE_UNIT_WIDTH 0 |
| #include "pcre2_intmodedep.h" /* Clear out the stale macros */ |
| #undef PCRE2_CODE_UNIT_WIDTH |
| |
| #define PCRE2_SUFFIX(a) a /* Restore the no-op form */ |
| |
| /* We need to be able to check input text for UTF-8 validity, whatever code |
| widths are actually available, because the input to pcre2test is always in |
| 8-bit code units. So we include the UTF validity checking function for 8-bit |
| code units. */ |
| |
| extern int valid_utf(PCRE2_SPTR8, PCRE2_SIZE, PCRE2_SIZE *); /* Presumably the function that the include below defines, given that PRIV(name) is name here */ |
| |
| #define PCRE2_CODE_UNIT_WIDTH 8 /* Compile the checker for 8-bit code units */ |
| #undef PCRE2_SPTR |
| #define PCRE2_SPTR PCRE2_SPTR8 /* Fixed 8-bit pointer type during the include */ |
| #include "pcre2_valid_utf.c" |
| #undef PCRE2_CODE_UNIT_WIDTH |
| #undef PCRE2_SPTR |
| #define PCRE2_SPTR PCRE2_SUFFIX(PCRE2_SPTR) /* Back to the suffix-based form */ |
| |
| /* If we have 8-bit support, default to it; if there is also 16- or 32-bit |
| support, it can be selected by a command line option. If there is no 8-bit |
| support, there must be 16-bit or 32-bit support, so default to one of them. |
| |
| The contexts just happen to be exactly the same layout on all bit-widths |
| (although the contents are very much not the same). For example, the 8-bit |
| and 16-bit match contexts have the same fields, all at the same offsets and |
| sizes, but the function pointers for the callouts in the 8-bit context are not |
| of the same type as in the 16-bit context. When we are parsing the modifier |
| bits, it is convenient to be able to uniformly set flags in any of the contexts, |
| so for that purpose only we may ignore the differences between the contexts at |
| different bit-widths. Choose one arbitrarily (does not need to match the |
| test mode). */ |
| |
| #if defined SUPPORT_PCRE2_8 /* First choice: 8-bit */ |
| #define DEFAULT_TEST_MODE PCRE2TEST_MODE_8 |
| #define PCRE2_REAL_COMPILE_CONTEXT pcre2_real_compile_context_8 |
| #define PCRE2_REAL_MATCH_CONTEXT pcre2_real_match_context_8 |
| |
| #elif defined SUPPORT_PCRE2_16 /* No 8-bit library: use 16-bit */ |
| #define DEFAULT_TEST_MODE PCRE2TEST_MODE_16 |
| #define PCRE2_REAL_COMPILE_CONTEXT pcre2_real_compile_context_16 |
| #define PCRE2_REAL_MATCH_CONTEXT pcre2_real_match_context_16 |
| |
| #elif defined SUPPORT_PCRE2_32 /* Only the 32-bit library */ |
| #define DEFAULT_TEST_MODE PCRE2TEST_MODE_32 |
| #define PCRE2_REAL_COMPILE_CONTEXT pcre2_real_compile_context_32 |
| #define PCRE2_REAL_MATCH_CONTEXT pcre2_real_match_context_32 |
| #endif /* If no width is supported, the three macros stay undefined */ |
| |
| /* ------------- Structure and table for handling #-commands ------------- */ |
| |
| typedef struct cmdstruct { /* One entry in the #-command table */ |
| const char *name; /* Command name, without the leading # */ |
| int value; /* One of the CMD_xxx codes */ |
| } cmdstruct; |
| /* Codes for the #-commands. The first twelve are in the same order as the |
| entries in cmdlist; CMD_UNKNOWN has no entry there. */ |
| enum { CMD_ENDIF, CMD_FORBID_UTF, CMD_IF, CMD_LOAD, CMD_LOADTABLES, |
| CMD_NEWLINE_DEFAULT, CMD_PATTERN, CMD_PERLTEST, CMD_POP, CMD_POPCOPY, |
| CMD_SAVE, CMD_SUBJECT, CMD_UNKNOWN }; |
| |
| static cmdstruct cmdlist[] = { /* Names are in ascending alphabetical order */ |
| { "endif", CMD_ENDIF }, |
| { "forbid_utf", CMD_FORBID_UTF }, |
| { "if", CMD_IF }, |
| { "load", CMD_LOAD }, |
| { "loadtables", CMD_LOADTABLES }, |
| { "newline_default", CMD_NEWLINE_DEFAULT }, |
| { "pattern", CMD_PATTERN }, |
| { "perltest", CMD_PERLTEST }, |
| { "pop", CMD_POP }, |
| { "popcopy", CMD_POPCOPY }, |
| { "save", CMD_SAVE }, |
| { "subject", CMD_SUBJECT }}; |
| |
| #define cmdlistcount (sizeof(cmdlist)/sizeof(cmdstruct)) /* Number of entries in cmdlist */ |
| |
| /* ------------- Structures and tables for handling modifiers -------------- */ |
| |
| /* Table of names for newline types. Must be kept in step with the definitions |
| of PCRE2_NEWLINE_xx in pcre2.h. The table has seven entries, with DEFAULT at |
| index 0. NOTE(review): the index is presumably the PCRE2_NEWLINE_xx value |
| itself (CR as 1 up to NUL as 6) - confirm against pcre2.h. */ |
| |
| static const char *newlines[] = { |
| "DEFAULT", "CR", "LF", "CRLF", "ANY", "ANYCRLF", "NUL" }; |
| |
| /* Structure and table for handling pattern conversion types. Each entry pairs |
| a name with a PCRE2_CONVERT_xxx bit, except for the last one, which pairs the |
| name unset with CONVERT_UNSET. */ |
| |
| typedef struct convertstruct { |
| const char *name; /* Conversion type name */ |
| uint32_t option; /* PCRE2_CONVERT_xxx bit, or CONVERT_UNSET */ |
| } convertstruct; |
| |
| static convertstruct convertlist[] = { |
| { "glob", PCRE2_CONVERT_GLOB }, |
| { "glob_no_starstar", PCRE2_CONVERT_GLOB_NO_STARSTAR }, |
| { "glob_no_wild_separator", PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR }, |
| { "posix_basic", PCRE2_CONVERT_POSIX_BASIC }, |
| { "posix_extended", PCRE2_CONVERT_POSIX_EXTENDED }, |
| { "unset", CONVERT_UNSET }}; |
| |
| #define convertlistcount (sizeof(convertlist)/sizeof(convertstruct)) /* Number of entries in convertlist */ |
| |
| /* Modifier types and applicability. In the modlist entries visible below, the |
| values from MOD_CTC to MOD_PNDP are used in the second column (which), and the |
| values from MOD_CHR onwards are used in the third column (type). */ |
| |
| enum { MOD_CTC, /* Applies to a compile context */ |
| MOD_CTM, /* Applies to a match context */ |
| MOD_PAT, /* Applies to a pattern */ |
| MOD_PATP, /* Ditto, OK for Perl test */ |
| MOD_DAT, /* Applies to a data line */ |
| MOD_DATP, /* Ditto, OK for Perl test */ |
| MOD_PD, /* Applies to a pattern or a data line */ |
| MOD_PDP, /* As MOD_PD, OK for Perl test */ |
| MOD_PND, /* As MOD_PD, but not for a default pattern */ |
| MOD_PNDP, /* As MOD_PND, OK for Perl test */ |
| MOD_CHR, /* Is a single character */ |
| MOD_CON, /* Is a "convert" type/options list */ |
| MOD_CTL, /* Is a control bit */ |
| MOD_BSR, /* Is a BSR value */ |
| MOD_IN2, /* Is one or two unsigned integers */ |
| MOD_INS, /* Is a signed integer */ |
| MOD_INT, /* Is an unsigned integer */ |
| MOD_IND, /* Is an unsigned integer, but no value => default */ |
| MOD_NL, /* Is a newline value */ |
| MOD_NN, /* Is a number or a name; more than one may occur */ |
| MOD_OPT, /* Is an option bit */ |
| MOD_OPTMZ, /* Is an optimization directive */ |
| MOD_SIZ, /* Is a PCRE2_SIZE value */ |
| MOD_STR }; /* Is a string */ |
| |
| /* Control bits. Some apply to compiling, some to matching, but some can be set |
| either on a pattern or a data line, so they must all be distinct. There are now |
| so many of them that they are split into two fields. The first field, defined |
| here, has all 32 of its bits assigned. */ |
| |
| #define CTL_AFTERTEXT 0x00000001u |
| #define CTL_ALLAFTERTEXT 0x00000002u |
| #define CTL_ALLCAPTURES 0x00000004u |
| #define CTL_ALLUSEDTEXT 0x00000008u |
| #define CTL_ALTGLOBAL 0x00000010u |
| #define CTL_BINCODE 0x00000020u |
| #define CTL_CALLOUT_CAPTURE 0x00000040u |
| #define CTL_CALLOUT_INFO 0x00000080u |
| #define CTL_CALLOUT_NONE 0x00000100u |
| #define CTL_DFA 0x00000200u |
| #define CTL_EXPAND 0x00000400u |
| #define CTL_FINDLIMITS 0x00000800u |
| #define CTL_FINDLIMITS_NOHEAP 0x00001000u |
| #define CTL_FULLBINCODE 0x00002000u |
| #define CTL_GETALL 0x00004000u |
| #define CTL_GLOBAL 0x00008000u |
| #define CTL_HEXPAT 0x00010000u /* Same word as USE_LENGTH */ |
| #define CTL_INFO 0x00020000u |
| #define CTL_JITFAST 0x00040000u |
| #define CTL_JITVERIFY 0x00080000u |
| #define CTL_MARK 0x00100000u |
| #define CTL_MEMORY 0x00200000u |
| #define CTL_NULLCONTEXT 0x00400000u |
| #define CTL_POSIX 0x00800000u |
| #define CTL_POSIX_NOSUB 0x01000000u |
| #define CTL_PUSH 0x02000000u /* These three must be */ |
| #define CTL_PUSHCOPY 0x04000000u /* all in the same */ |
| #define CTL_PUSHTABLESCOPY 0x08000000u /* word. */ |
| #define CTL_STARTCHAR 0x10000000u |
| #define CTL_USE_LENGTH 0x20000000u /* Same word as HEXPAT */ |
| #define CTL_UTF8_INPUT 0x40000000u |
| #define CTL_ZERO_TERMINATE 0x80000000u |
| |
| /* Combinations: masks made by ORing together several of the bits above. */ |
| |
| #define CTL_DEBUG (CTL_FULLBINCODE|CTL_INFO) /* For setting */ |
| #define CTL_ANYINFO (CTL_DEBUG|CTL_BINCODE|CTL_CALLOUT_INFO) |
| #define CTL_ANYGLOB (CTL_ALTGLOBAL|CTL_GLOBAL) |
| |
| /* Second control word. The low seventeen bits are assigned consecutively; the |
| bits from 0x00020000u to 0x10000000u are not assigned here, and the top three |
| bits are the ones marked as informational. */ |
| |
| #define CTL2_SUBSTITUTE_CALLOUT 0x00000001u |
| #define CTL2_SUBSTITUTE_EXTENDED 0x00000002u |
| #define CTL2_SUBSTITUTE_LITERAL 0x00000004u |
| #define CTL2_SUBSTITUTE_MATCHED 0x00000008u |
| #define CTL2_SUBSTITUTE_OVERFLOW_LENGTH 0x00000010u |
| #define CTL2_SUBSTITUTE_REPLACEMENT_ONLY 0x00000020u |
| #define CTL2_SUBSTITUTE_UNKNOWN_UNSET 0x00000040u |
| #define CTL2_SUBSTITUTE_UNSET_EMPTY 0x00000080u |
| #define CTL2_SUBJECT_LITERAL 0x00000100u |
| #define CTL2_CALLOUT_NO_WHERE 0x00000200u |
| #define CTL2_CALLOUT_EXTRA 0x00000400u |
| #define CTL2_ALLVECTOR 0x00000800u |
| #define CTL2_NULL_PATTERN 0x00001000u |
| #define CTL2_NULL_SUBJECT 0x00002000u |
| #define CTL2_NULL_REPLACEMENT 0x00004000u |
| #define CTL2_FRAMESIZE 0x00008000u |
| #define CTL2_SUBSTITUTE_CASE_CALLOUT 0x00010000u |
| |
| #define CTL2_HEAPFRAMES_SIZE 0x20000000u /* Informational */ |
| #define CTL2_NL_SET 0x40000000u /* Informational */ |
| #define CTL2_BSR_SET 0x80000000u /* Informational */ |
| |
| /* These are the matching controls that may be set either on a pattern or on a |
| data line. They are copied from the pattern controls as initial settings for |
| data line controls. Note that CTL_MEMORY is not included here, because it does |
| different things in the two cases. CTL_ALLPD is the mask for the first control |
| word and CTL2_ALLPD is the mask for the second. */ |
| |
| #define CTL_ALLPD (CTL_AFTERTEXT|\ |
| CTL_ALLAFTERTEXT|\ |
| CTL_ALLCAPTURES|\ |
| CTL_ALLUSEDTEXT|\ |
| CTL_ALTGLOBAL|\ |
| CTL_GLOBAL|\ |
| CTL_MARK|\ |
| CTL_STARTCHAR|\ |
| CTL_UTF8_INPUT) |
| |
| #define CTL2_ALLPD (CTL2_SUBSTITUTE_CALLOUT|\ |
| CTL2_SUBSTITUTE_EXTENDED|\ |
| CTL2_SUBSTITUTE_LITERAL|\ |
| CTL2_SUBSTITUTE_MATCHED|\ |
| CTL2_SUBSTITUTE_OVERFLOW_LENGTH|\ |
| CTL2_SUBSTITUTE_REPLACEMENT_ONLY|\ |
| CTL2_SUBSTITUTE_UNKNOWN_UNSET|\ |
| CTL2_SUBSTITUTE_UNSET_EMPTY|\ |
| CTL2_ALLVECTOR|\ |
| CTL2_SUBSTITUTE_CASE_CALLOUT|\ |
| CTL2_HEAPFRAMES_SIZE) |
| |
| /* Structures for holding modifier information for patterns and subject strings |
| (data). Fields containing modifiers that can be set either for a pattern or a |
| subject must be at the start and in the same order in both cases so that the |
| same offset in the big table below works for both. The shared fields are the |
| first seven, from options to substitute_stop. */ |
| |
| typedef struct patctl { /* Structure for pattern modifiers. */ |
| uint32_t options; /* Must be in same position as datctl */ |
| uint32_t control; /* Must be in same position as datctl */ |
| uint32_t control2; /* Must be in same position as datctl */ |
| uint32_t jitstack; /* Must be in same position as datctl */ |
| uint8_t replacement[REPLACE_MODSIZE]; /* So must this */ |
| uint32_t substitute_skip; /* Must be in same position as datctl */ |
| uint32_t substitute_stop; /* Must be in same position as datctl */ |
| uint32_t jit; /* Target of the jit modifier */ |
| uint32_t stackguard_test; |
| uint32_t tables_id; |
| uint32_t convert_type; /* Target of the convert modifier */ |
| uint32_t convert_length; /* Target of the convert_length modifier */ |
| uint32_t convert_glob_escape; /* Target of convert_glob_escape */ |
| uint32_t convert_glob_separator; /* Target of convert_glob_separator */ |
| uint32_t regerror_buffsize; |
| uint8_t locale[LOCALESIZE]; /* Target of the locale modifier */ |
| } patctl; |
| |
| #define MAXCPYGET 10 /* Entries in copy_numbers and get_numbers */ |
| #define LENCPYGET 64 /* Bytes in copy_names and get_names */ |
| |
| typedef struct datctl { /* Structure for data line modifiers. */ |
| uint32_t options; /* Must be in same position as patctl */ |
| uint32_t control; /* Must be in same position as patctl */ |
| uint32_t control2; /* Must be in same position as patctl */ |
| uint32_t jitstack; /* Must be in same position as patctl */ |
| uint8_t replacement[REPLACE_MODSIZE]; /* So must this */ |
| uint32_t substitute_skip; /* Must be in same position as patctl */ |
| uint32_t substitute_stop; /* Must be in same position as patctl */ |
| uint32_t startend[2]; |
| uint32_t cerror[2]; /* Target of the callout_error modifier */ |
| uint32_t cfail[2]; /* Target of the callout_fail modifier */ |
| int32_t callout_data; /* Target of the callout_data modifier */ |
| int32_t copy_numbers[MAXCPYGET]; /* Numbers given with the copy modifier */ |
| int32_t get_numbers[MAXCPYGET]; /* Numbers given with the get modifier */ |
| uint32_t oveccount; |
| PCRE2_SIZE offset; |
| uint8_t copy_names[LENCPYGET]; /* Names given with the copy modifier */ |
| uint8_t get_names[LENCPYGET]; /* Names given with the get modifier */ |
| } datctl; |
| |
| /* Ids for which context to modify. The values run consecutively from 0 for |
| CTX_PAT to 4 for CTX_DEFDAT. */ |
| |
| enum { CTX_PAT, /* Active pattern context */ |
| CTX_POPPAT, /* Ditto, for a popped pattern */ |
| CTX_DEFPAT, /* Default pattern context */ |
| CTX_DAT, /* Active data (match) context */ |
| CTX_DEFDAT }; /* Default data (match) context */ |
| |
| /* Macros to simplify the big table below. Each one gives the byte offset of a |
| named field: CO in the compile context, MO in the match context, PO in patctl |
| and DO in datctl. PD expands to PO and is written on entries that apply to both |
| a pattern and a data line, which relies on the shared leading fields of patctl |
| and datctl having the same offsets. */ |
| |
| #define CO(name) offsetof(PCRE2_REAL_COMPILE_CONTEXT, name) |
| #define MO(name) offsetof(PCRE2_REAL_MATCH_CONTEXT, name) |
| #define PO(name) offsetof(patctl, name) |
| #define PD(name) PO(name) |
| #define DO(name) offsetof(datctl, name) |
| |
| /* Table of all long-form modifiers. Must be in collating sequence of modifier |
| name because it is searched by binary chop. */ |
| |
| typedef struct modstruct { /* One entry in the modifier table */ |
| const char *name; /* Long-form modifier name; the table is sorted on this */ |
| uint16_t which; /* Applicability: MOD_CTC to MOD_PNDP */ |
| uint16_t type; /* Kind of value: MOD_CHR to MOD_STR */ |
| uint32_t value; /* Option or control bit(s); for some types a default, a size, or a field offset */ |
| PCRE2_SIZE offset; /* Field offset from CO, MO, PO, PD or DO; 0 on MOD_OPTMZ entries */ |
| } modstruct; |
| |
| #define PCRE2_EXTRA_ASCII_ALL (PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS| \ |
| PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX) /* PCRE2_EXTRA_ASCII_DIGIT is not part of this mask */ |
| |
| static modstruct modlist[] = { |
| { "aftertext", MOD_PNDP, MOD_CTL, CTL_AFTERTEXT, PO(control) }, |
| { "allaftertext", MOD_PNDP, MOD_CTL, CTL_ALLAFTERTEXT, PO(control) }, |
| { "allcaptures", MOD_PND, MOD_CTL, CTL_ALLCAPTURES, PO(control) }, |
| { "allow_empty_class", MOD_PAT, MOD_OPT, PCRE2_ALLOW_EMPTY_CLASS, PO(options) }, |
| { "allow_lookaround_bsk", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK, CO(extra_options) }, |
| { "allow_surrogate_escapes", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES, CO(extra_options) }, |
| { "allusedtext", MOD_PNDP, MOD_CTL, CTL_ALLUSEDTEXT, PO(control) }, |
| { "allvector", MOD_PND, MOD_CTL, CTL2_ALLVECTOR, PO(control2) }, |
| { "alt_bsux", MOD_PAT, MOD_OPT, PCRE2_ALT_BSUX, PO(options) }, |
| { "alt_circumflex", MOD_PAT, MOD_OPT, PCRE2_ALT_CIRCUMFLEX, PO(options) }, |
| { "alt_extended_class", MOD_PAT, MOD_OPT, PCRE2_ALT_EXTENDED_CLASS, PO(options) }, |
| { "alt_verbnames", MOD_PAT, MOD_OPT, PCRE2_ALT_VERBNAMES, PO(options) }, |
| { "altglobal", MOD_PND, MOD_CTL, CTL_ALTGLOBAL, PO(control) }, |
| { "anchored", MOD_PD, MOD_OPT, PCRE2_ANCHORED, PD(options) }, |
| { "ascii_all", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_ALL, CO(extra_options) }, |
| { "ascii_bsd", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSD, CO(extra_options) }, |
| { "ascii_bss", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSS, CO(extra_options) }, |
| { "ascii_bsw", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_BSW, CO(extra_options) }, |
| { "ascii_digit", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_DIGIT, CO(extra_options) }, |
| { "ascii_posix", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_POSIX, CO(extra_options) }, |
| { "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) }, |
| { "auto_possess", MOD_CTC, MOD_OPTMZ, PCRE2_AUTO_POSSESS, 0 }, |
| { "auto_possess_off", MOD_CTC, MOD_OPTMZ, PCRE2_AUTO_POSSESS_OFF, 0 }, |
| { "bad_escape_is_literal", MOD_CTC, MOD_OPT, PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL, CO(extra_options) }, |
| { "bincode", MOD_PAT, MOD_CTL, CTL_BINCODE, PO(control) }, |
| { "bsr", MOD_CTC, MOD_BSR, 0, CO(bsr_convention) }, |
| { "callout_capture", MOD_DAT, MOD_CTL, CTL_CALLOUT_CAPTURE, DO(control) }, |
| { "callout_data", MOD_DAT, MOD_INS, 0, DO(callout_data) }, |
| { "callout_error", MOD_DAT, MOD_IN2, 0, DO(cerror) }, |
| { "callout_extra", MOD_DAT, MOD_CTL, CTL2_CALLOUT_EXTRA, DO(control2) }, |
| { "callout_fail", MOD_DAT, MOD_IN2, 0, DO(cfail) }, |
| { "callout_info", MOD_PAT, MOD_CTL, CTL_CALLOUT_INFO, PO(control) }, |
| { "callout_no_where", MOD_DAT, MOD_CTL, CTL2_CALLOUT_NO_WHERE, DO(control2) }, |
| { "callout_none", MOD_DAT, MOD_CTL, CTL_CALLOUT_NONE, DO(control) }, |
| { "caseless", MOD_PATP, MOD_OPT, PCRE2_CASELESS, PO(options) }, |
| { "caseless_restrict", MOD_CTC, MOD_OPT, PCRE2_EXTRA_CASELESS_RESTRICT, CO(extra_options) }, |
| { "convert", MOD_PAT, MOD_CON, 0, PO(convert_type) }, |
| { "convert_glob_escape", MOD_PAT, MOD_CHR, 0, PO(convert_glob_escape) }, |
| { "convert_glob_separator", MOD_PAT, MOD_CHR, 0, PO(convert_glob_separator) }, |
| { "convert_length", MOD_PAT, MOD_INT, 0, PO(convert_length) }, |
| { "copy", MOD_DAT, MOD_NN, DO(copy_numbers), DO(copy_names) }, |
| { "copy_matched_subject", MOD_DAT, MOD_OPT, PCRE2_COPY_MATCHED_SUBJECT, DO(options) }, |
| { "debug", MOD_PAT, MOD_CTL, CTL_DEBUG, PO(control) }, |
| { "depth_limit", MOD_CTM, MOD_INT, 0, MO(depth_limit) }, |
| { "dfa", MOD_DAT, MOD_CTL, CTL_DFA, DO(control) }, |
| { "dfa_restart", MOD_DAT, MOD_OPT, PCRE2_DFA_RESTART, DO(options) }, |
| { "dfa_shortest", MOD_DAT, MOD_OPT, PCRE2_DFA_SHORTEST, DO(options) }, |
| { "disable_recurseloop_check", MOD_DAT, MOD_OPT, PCRE2_DISABLE_RECURSELOOP_CHECK, DO(options) }, |
| { "dollar_endonly", MOD_PAT, MOD_OPT, PCRE2_DOLLAR_ENDONLY, PO(options) }, |
| { "dotall", MOD_PATP, MOD_OPT, PCRE2_DOTALL, PO(options) }, |
| { "dotstar_anchor", MOD_CTC, MOD_OPTMZ, PCRE2_DOTSTAR_ANCHOR, 0 }, |
| { "dotstar_anchor_off", MOD_CTC, MOD_OPTMZ, PCRE2_DOTSTAR_ANCHOR_OFF, 0 }, |
| { "dupnames", MOD_PATP, MOD_OPT, PCRE2_DUPNAMES, PO(options) }, |
| { "endanchored", MOD_PD, MOD_OPT, PCRE2_ENDANCHORED, PD(options) }, |
| { "escaped_cr_is_lf", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ESCAPED_CR_IS_LF, CO(extra_options) }, |
| { "expand", MOD_PAT, MOD_CTL, CTL_EXPAND, PO(control) }, |
| { "extended", MOD_PATP, MOD_OPT, PCRE2_EXTENDED, PO(options) }, |
| { "extended_more", MOD_PATP, MOD_OPT, PCRE2_EXTENDED_MORE, PO(options) }, |
| { "extra_alt_bsux", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ALT_BSUX, CO(extra_options) }, |
| { "find_limits", MOD_DAT, MOD_CTL, CTL_FINDLIMITS, DO(control) }, |
| { "find_limits_noheap", MOD_DAT, MOD_CTL, CTL_FINDLIMITS_NOHEAP, DO(control) }, |
| { "firstline", MOD_PAT, MOD_OPT, PCRE2_FIRSTLINE, PO(options) }, |
| { "framesize", MOD_PAT, MOD_CTL, CTL2_FRAMESIZE, PO(control2) }, |
| { "fullbincode", MOD_PAT, MOD_CTL, CTL_FULLBINCODE, PO(control) }, |
| { "get", MOD_DAT, MOD_NN, DO(get_numbers), DO(get_names) }, |
| { "getall", MOD_DAT, MOD_CTL, CTL_GETALL, DO(control) }, |
| { "global", MOD_PNDP, MOD_CTL, CTL_GLOBAL, PO(control) }, |
| { "heap_limit", MOD_CTM, MOD_INT, 0, MO(heap_limit) }, |
| { "heapframes_size", MOD_PND, MOD_CTL, CTL2_HEAPFRAMES_SIZE, PO(control2) }, |
| { "hex", MOD_PATP, MOD_CTL, CTL_HEXPAT, PO(control) }, |
| { "info", MOD_PAT, MOD_CTL, CTL_INFO, PO(control) }, |
| { "jit", MOD_PAT, MOD_IND, 7, PO(jit) }, |
| { "jitfast", MOD_PAT, MOD_CTL, CTL_JITFAST, PO(control) }, |
| { "jitstack", MOD_PNDP, MOD_INT, 0, PO(jitstack) }, |
| { "jitverify", MOD_PAT, MOD_CTL, CTL_JITVERIFY, PO(control) }, |
| { "literal", MOD_PAT, MOD_OPT, PCRE2_LITERAL, PO(options) }, |
| { "locale", MOD_PATP, MOD_STR, LOCALESIZE, PO(locale) }, |
| { "mark", MOD_PNDP, MOD_CTL, CTL_MARK, PO(control) }, |
| { "match_invalid_utf", MOD_PAT, MOD_OPT, PCRE2_MATCH_INVALID_UTF, PO(options) }, |
| { "match_limit", MOD_CTM, MOD_INT, 0, MO(match_limit) }, |
| { "match_line", MOD_CTC, MOD_OPT, PCRE2_EXTRA_MATCH_LINE, CO(extra_options) }, |
| { "match_unset_backref", MOD_PAT, MOD_OPT, PCRE2_MATCH_UNSET_BACKREF, PO(options) }, |
| { "match_word", MOD_CTC, MOD_OPT, PCRE2_EXTRA_MATCH_WORD, CO(extra_options) }, |
| { "max_pattern_compiled_length", MOD_CTC, MOD_SIZ, 0, CO(max_pattern_compiled_length) }, |
| { "max_pattern_length", MOD_CTC, MOD_SIZ, 0, CO(max_pattern_length) }, |
| { "max_varlookbehind", MOD_CTC, MOD_INT, 0, CO(max_varlookbehind) }, |
| { "memory", MOD_PD, MOD_CTL, CTL_MEMORY, PD(control) }, |
| { "multiline", MOD_PATP, MOD_OPT, PCRE2_MULTILINE, PO(options) }, |
| { "never_backslash_c", MOD_PAT, MOD_OPT, PCRE2_NEVER_BACKSLASH_C, PO(options) }, |
| { "never_callout", MOD_CTC, MOD_OPT, PCRE2_EXTRA_NEVER_CALLOUT, CO(extra_options) }, |
| { "never_ucp", MOD_PAT, MOD_OPT, PCRE2_NEVER_UCP, PO(options) }, |
| { "never_utf", MOD_PAT, MOD_OPT, PCRE2_NEVER_UTF, PO(options) }, |
| { "newline", MOD_CTC, MOD_NL, 0, CO(newline_convention) }, |
| { "no_auto_capture", MOD_PAT, MOD_OPT, PCRE2_NO_AUTO_CAPTURE, PO(options) }, |
| { "no_auto_possess", MOD_PATP, MOD_OPT, PCRE2_NO_AUTO_POSSESS, PO(options) }, |
| { "no_bs0", MOD_CTC, MOD_OPT, PCRE2_EXTRA_NO_BS0, CO(extra_options) }, |
| { "no_dotstar_anchor", MOD_PAT, MOD_OPT, PCRE2_NO_DOTSTAR_ANCHOR, PO(options) }, |
| { "no_jit", MOD_DATP, MOD_OPT, PCRE2_NO_JIT, DO(options) }, |
| { "no_start_optimize", MOD_PATP, MOD_OPT, PCRE2_NO_START_OPTIMIZE, PO(options) }, |
| { "no_utf_check", MOD_PD, MOD_OPT, PCRE2_NO_UTF_CHECK, PD(options) }, |
| { "notbol", MOD_DAT, MOD_OPT, PCRE2_NOTBOL, DO(options) }, |
| { "notempty", MOD_DAT, MOD_OPT, PCRE2_NOTEMPTY, DO(options) }, |
| { "notempty_atstart", MOD_DAT, MOD_OPT, PCRE2_NOTEMPTY_ATSTART, DO(options) }, |
| { "noteol", MOD_DAT, MOD_OPT, PCRE2_NOTEOL, DO(options) }, |
| { "null_context", MOD_PD, MOD_CTL, CTL_NULLCONTEXT, PO(control) }, |
| { "null_pattern", MOD_PAT, MOD_CTL, CTL2_NULL_PATTERN, PO(control2) }, |
| { "null_replacement", MOD_DAT, MOD_CTL, CTL2_NULL_REPLACEMENT, DO(control2) }, |
| { "null_subject", MOD_DAT, MOD_CTL, CTL2_NULL_SUBJECT, DO(control2) }, |
| { "offset", MOD_DAT, MOD_SIZ, 0, DO(offset) }, |
| { "offset_limit", MOD_CTM, MOD_SIZ, 0, MO(offset_limit)}, |
| { "optimization_full", MOD_CTC, MOD_OPTMZ, PCRE2_OPTIMIZATION_FULL, 0 }, |
| { "optimization_none", MOD_CTC, MOD_OPTMZ, PCRE2_OPTIMIZATION_NONE, 0 }, |
| { "ovector", MOD_DAT, MOD_INT, 0, DO(oveccount) }, |
| { "parens_nest_limit", MOD_CTC, MOD_INT, 0, CO(parens_nest_limit) }, |
| { "partial_hard", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_HARD, DO(options) }, |
| { "partial_soft", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) }, |
| { "ph", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_HARD, DO(options) }, |
| { "posix", MOD_PAT, MOD_CTL, CTL_POSIX, PO(control) }, |
| { "posix_nosub", MOD_PAT, MOD_CTL, CTL_POSIX|CTL_POSIX_NOSUB, PO(control) }, |
| { "posix_startend", MOD_DAT, MOD_IN2, 0, DO(startend) }, |
| { "ps", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_SOFT, DO(options) }, |
| { "push", MOD_PAT, MOD_CTL, CTL_PUSH, PO(control) }, |
| { "pushcopy", MOD_PAT, MOD_CTL, CTL_PUSHCOPY, PO(control) }, |
| { "pushtablescopy", MOD_PAT, MOD_CTL, CTL_PUSHTABLESCOPY, PO(control) }, |
| { "python_octal", MOD_CTC, MOD_OPT, PCRE2_EXTRA_PYTHON_OCTAL, CO(extra_options) }, |
| { "recursion_limit", MOD_CTM, MOD_INT, 0, MO(depth_limit) }, /* Obsolete synonym */ |
| { "regerror_buffsize", MOD_PAT, MOD_INT, 0, PO(regerror_buffsize) }, |
| { "replace", MOD_PND, MOD_STR, REPLACE_MODSIZE, PO(replacement) }, |
| { "stackguard", MOD_PAT, MOD_INT, 0, PO(stackguard_test) }, |
| { "start_optimize", MOD_CTC, MOD_OPTMZ, PCRE2_START_OPTIMIZE, 0 }, |
| { "start_optimize_off", MOD_CTC, MOD_OPTMZ, PCRE2_START_OPTIMIZE_OFF, 0 }, |
| { "startchar", MOD_PND, MOD_CTL, CTL_STARTCHAR, PO(control) }, |
| { "startoffset", MOD_DAT, MOD_SIZ, 0, DO(offset) }, |
| { "subject_literal", MOD_PATP, MOD_CTL, CTL2_SUBJECT_LITERAL, PO(control2) }, |
| { "substitute_callout", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_CALLOUT, PO(control2) }, |
| { "substitute_case_callout", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_CASE_CALLOUT, PO(control2) }, |
| { "substitute_extended", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_EXTENDED, PO(control2) }, |
| { "substitute_literal", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_LITERAL, PO(control2) }, |
| { "substitute_matched", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_MATCHED, PO(control2) }, |
| { "substitute_overflow_length", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_OVERFLOW_LENGTH, PO(control2) }, |
| { "substitute_replacement_only", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_REPLACEMENT_ONLY, PO(control2) }, |
| { "substitute_skip", MOD_PND, MOD_INT, 0, PO(substitute_skip) }, |
| { "substitute_stop", MOD_PND, MOD_INT, 0, PO(substitute_stop) }, |
| { "substitute_unknown_unset", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_UNKNOWN_UNSET, PO(control2) }, |
| { "substitute_unset_empty", MOD_PND, MOD_CTL, CTL2_SUBSTITUTE_UNSET_EMPTY, PO(control2) }, |
| { "tables", MOD_PAT, MOD_INT, 0, PO(tables_id) }, |
| { "turkish_casing", MOD_CTC, MOD_OPT, PCRE2_EXTRA_TURKISH_CASING, CO(extra_options) }, |
| { "ucp", MOD_PATP, MOD_OPT, PCRE2_UCP, PO(options) }, |
| { "ungreedy", MOD_PAT, MOD_OPT, PCRE2_UNGREEDY, PO(options) }, |
| { "use_length", MOD_PAT, MOD_CTL, CTL_USE_LENGTH, PO(control) }, |
| { "use_offset_limit", MOD_PAT, MOD_OPT, PCRE2_USE_OFFSET_LIMIT, PO(options) }, |
| { "utf", MOD_PATP, MOD_OPT, PCRE2_UTF, PO(options) }, |
| { "utf8_input", MOD_PAT, MOD_CTL, CTL_UTF8_INPUT, PO(control) }, |
| { "zero_terminate", MOD_DAT, MOD_CTL, CTL_ZERO_TERMINATE, DO(control) } |
| }; |
| |
/* Number of entries in modlist[]. The expansion is fully parenthesized so that
the macro keeps its value when it is used as an operand of a larger expression
(for example as a divisor). */

#define MODLISTCOUNT (sizeof(modlist)/sizeof(modstruct))
| |
| /* Controls and options that are supported for use with the POSIX interface. */ |
| |
/* Compile option bits that may be combined with the POSIX interface. */

#define POSIX_SUPPORTED_COMPILE_OPTIONS ( \
  PCRE2_CASELESS  | \
  PCRE2_DOTALL    | \
  PCRE2_LITERAL   | \
  PCRE2_MULTILINE | \
  PCRE2_UCP       | \
  PCRE2_UTF       | \
  PCRE2_UNGREEDY)

/* No extra compile options are supported. */

#define POSIX_SUPPORTED_COMPILE_EXTRA_OPTIONS (0)

/* Pattern control bits (first control word) that may be combined with the
POSIX interface. */

#define POSIX_SUPPORTED_COMPILE_CONTROLS ( \
  CTL_AFTERTEXT    | \
  CTL_ALLAFTERTEXT | \
  CTL_EXPAND       | \
  CTL_HEXPAT       | \
  CTL_POSIX        | \
  CTL_POSIX_NOSUB  | \
  CTL_USE_LENGTH)

/* No bits from the second pattern control word are supported. */

#define POSIX_SUPPORTED_COMPILE_CONTROLS2 (0)

/* Match option bits that may be combined with the POSIX interface. */

#define POSIX_SUPPORTED_MATCH_OPTIONS ( \
  PCRE2_NOTBOL   | \
  PCRE2_NOTEMPTY | \
  PCRE2_NOTEOL)

/* Match-time control bits, first and second control word respectively. */

#define POSIX_SUPPORTED_MATCH_CONTROLS  (CTL_AFTERTEXT | CTL_ALLAFTERTEXT)
#define POSIX_SUPPORTED_MATCH_CONTROLS2 (CTL2_NULL_SUBJECT)
| |
| /* Control bits that are not ignored with 'push'. */ |
| |
#define PUSH_SUPPORTED_COMPILE_CONTROLS ( \
  CTL_BINCODE        | \
  CTL_CALLOUT_INFO   | \
  CTL_FULLBINCODE    | \
  CTL_HEXPAT         | \
  CTL_INFO           | \
  CTL_JITVERIFY      | \
  CTL_MEMORY         | \
  CTL_PUSH           | \
  CTL_PUSHCOPY       | \
  CTL_PUSHTABLESCOPY | \
  CTL_USE_LENGTH)

#define PUSH_SUPPORTED_COMPILE_CONTROLS2 ( \
  CTL2_BSR_SET         | \
  CTL2_HEAPFRAMES_SIZE | \
  CTL2_FRAMESIZE       | \
  CTL2_NL_SET)

/* With 'push', these controls take effect at compile time only (first and
second control word respectively). */

#define PUSH_COMPILE_ONLY_CONTROLS CTL_JITVERIFY
#define PUSH_COMPILE_ONLY_CONTROLS2 (0)

/* Controls that must not be given together with #pop or #popcopy. */

#define NOTPOP_CONTROLS ( \
  CTL_HEXPAT         | \
  CTL_POSIX          | \
  CTL_POSIX_NOSUB    | \
  CTL_PUSH           | \
  CTL_PUSHCOPY       | \
  CTL_PUSHTABLESCOPY | \
  CTL_USE_LENGTH)
| |
| /* Pattern controls that are mutually exclusive. At present these are all in |
| the first control word. Note that CTL_POSIX_NOSUB is always accompanied by |
| CTL_POSIX, so it doesn't need its own entries. */ |
| |
| static uint32_t exclusive_pat_controls[] = { |
| CTL_POSIX | CTL_PUSH, |
| CTL_POSIX | CTL_PUSHCOPY, |
| CTL_POSIX | CTL_PUSHTABLESCOPY, |
| CTL_PUSH | CTL_PUSHCOPY, |
| CTL_PUSH | CTL_PUSHTABLESCOPY, |
| CTL_PUSHCOPY | CTL_PUSHTABLESCOPY, |
| CTL_EXPAND | CTL_HEXPAT }; |
| |
| /* Data controls that are mutually exclusive. At present these are all in the |
| first control word. */ |
| |
| static uint32_t exclusive_dat_controls[] = { |
| CTL_ALLUSEDTEXT | CTL_STARTCHAR, |
| CTL_FINDLIMITS | CTL_NULLCONTEXT, |
| CTL_FINDLIMITS_NOHEAP | CTL_NULLCONTEXT }; |
| |
| /* Table of single-character abbreviated modifiers. The index field is |
| initialized to -1, but the first time the modifier is encountered, it is filled |
| in with the index of the full entry in modlist, to save repeated searching when |
| processing multiple test items. This short list is searched serially, so its |
| order does not matter. */ |
| |
/* One entry per single-character modifier abbreviation. */

typedef struct c1modstruct {
  const char *fullname;   /* Name of the full modifier that is abbreviated */
  uint32_t    onechar;    /* The single-character abbreviation */
  int         index;      /* Index of fullname in modlist; -1 until first looked up */
} c1modstruct;

static c1modstruct c1modlist[] = {
  { "bincode",           'B', -1 },
  { "info",              'I', -1 },
  { "ascii_all",         'a', -1 },
  { "global",            'g', -1 },
  { "caseless",          'i', -1 },
  { "multiline",         'm', -1 },
  { "no_auto_capture",   'n', -1 },
  { "caseless_restrict", 'r', -1 },
  { "dotall",            's', -1 },
  { "extended",          'x', -1 }
};

/* Number of entries in c1modlist[]. The expansion is fully parenthesized so
that the macro keeps its value when used inside a larger expression. */

#define C1MODLISTCOUNT (sizeof(c1modlist)/sizeof(c1modstruct))
| |
| /* Table of arguments for the -C command line option. Use macros to make the |
| table itself easier to read. */ |
| |
/* SUPPORT_8/16/32 are 1 when the corresponding SUPPORT_PCRE2_n macro is
defined and 0 otherwise, so that they can be used as plain values in the
table below. */

#if defined SUPPORT_PCRE2_8
#define SUPPORT_8 1
#endif
#if defined SUPPORT_PCRE2_16
#define SUPPORT_16 1
#endif
#if defined SUPPORT_PCRE2_32
#define SUPPORT_32 1
#endif

#ifndef SUPPORT_8
#define SUPPORT_8 0
#endif
#ifndef SUPPORT_16
#define SUPPORT_16 0
#endif
#ifndef SUPPORT_32
#define SUPPORT_32 0
#endif

/* SUPPORT_EBCDIC_NL25 is a comparison, so it is parenthesized to keep it a
single operand wherever the macro is expanded. */

#if defined EBCDIC
#define SUPPORT_EBCDIC 1
#define SUPPORT_EBCDIC_NL25 (CHAR_LF == 0x25)
#else
#define SUPPORT_EBCDIC 0
#define SUPPORT_EBCDIC_NL25 0
#endif

#ifdef NEVER_BACKSLASH_C
#define BACKSLASH_C 0
#else
#define BACKSLASH_C 1
#endif
| |
/* One entry in the table of -C arguments. */
typedef struct coptstruct {
  const char *name;    /* Argument text */
  uint32_t    type;    /* One of the CONF_xxx values */
  uint32_t    value;   /* Value associated with the argument */
} coptstruct;
| |
/* Values for the type field of a coptstruct entry. */
enum {
  CONF_BSR = 0,
  CONF_FIX = 1,
  CONF_INT = 2,
  CONF_NL  = 3,
  CONF_JU  = 4
};
| |
| static coptstruct coptlist[] = { |
| { "backslash-C", CONF_FIX, BACKSLASH_C }, |
| { "bsr", CONF_BSR, PCRE2_CONFIG_BSR }, |
| { "ebcdic", CONF_FIX, SUPPORT_EBCDIC }, |
| { "ebcdic-io", CONF_FIX, EBCDIC_IO }, |
| { "ebcdic-nl25", CONF_FIX, SUPPORT_EBCDIC_NL25 }, |
| { "jit", CONF_INT, PCRE2_CONFIG_JIT }, |
| { "jitusable", CONF_JU, 0 }, |
| { "linksize", CONF_INT, PCRE2_CONFIG_EFFECTIVE_LINKSIZE }, |
| { "newline", CONF_NL, PCRE2_CONFIG_NEWLINE }, |
| { "pcre2-16", CONF_FIX, SUPPORT_16 }, |
| { "pcre2-32", CONF_FIX, SUPPORT_32 }, |
| { "pcre2-8", CONF_FIX, SUPPORT_8 }, |
| { "unicode", CONF_INT, PCRE2_CONFIG_UNICODE } |
| }; |
| |
/* Number of entries in coptlist[]. The expansion is fully parenthesized so
that the macro keeps its value when used inside a larger expression. */

#define COPTLISTCOUNT (sizeof(coptlist)/sizeof(coptstruct))
| |
/* Remove the table-helper macros again. Each name here must match a #define
made before the table; the previous spelling "SUPPORT_EBDCIC_NL25" matched
nothing and left SUPPORT_EBCDIC_NL25 defined for the rest of the file. */

#undef SUPPORT_8
#undef SUPPORT_16
#undef SUPPORT_32
#undef SUPPORT_EBCDIC
#undef SUPPORT_EBCDIC_NL25
#undef BACKSLASH_C
| |
| /* Types for the parser, to be used in process_data() */ |
| |
enum force_encoding {
  FORCE_NONE = 0,  /* No preference, follow utf modifier */
  FORCE_RAW  = 1,  /* Encode as a code point or error if too wide */
  FORCE_UTF  = 2   /* Encode as a character or error if too wide */
};
| |
| /* ----------------------- Static variables ------------------------ */ |
| |
| static FILE *infile; |
| static FILE *outfile; |
| |
| static const void *last_callout_mark; |
| |
| static BOOL first_callout; |
| static BOOL jit_was_used; |
| static BOOL restrict_for_perl_test = FALSE; |
| static BOOL show_memory = FALSE; |
| static BOOL preprocess_only = FALSE; |
| static BOOL inside_if = FALSE; |
| static BOOL malloc_testing = FALSE; |
| |
| static int jitrc; /* Return from JIT compile */ |
| static int timeit = 0; |
| static int timeitm = 0; |
| |
| static clock_t total_compile_time = 0; |
| static clock_t total_jit_compile_time = 0; |
| static clock_t total_match_time = 0; |
| |
| static uint32_t dfa_matched; |
| static uint32_t forbid_utf = 0; |
| static uint32_t maxlookbehind; |
| static uint32_t max_oveccount; |
| static uint32_t callout_count; |
| static uint32_t maxcapcount; |
| |
| static uint16_t local_newline_default = 0; |
| |
| static patctl def_patctl; |
| static patctl pat_patctl; |
| static datctl def_datctl; |
| static datctl dat_datctl; |
| |
| static void *malloclist[MALLOCLISTSIZE]; |
| static PCRE2_SIZE malloclistlength[MALLOCLISTSIZE]; |
| static uint32_t malloclistptr = 0; |
| |
| #ifdef SUPPORT_PCRE2_8 |
| static regex_t preg = { NULL, NULL, 0, 0, 0, 0 }; |
| #endif |
| |
| static int *dfa_workspace = NULL; |
| static const uint8_t *locale_tables = NULL; |
| static const uint8_t *use_tables = NULL; |
| static uint8_t locale_name[LOCALESIZE]; |
| static uint8_t *tables3 = NULL; /* For binary-loaded tables */ |
| static uint32_t loadtables_length = 0; |
| |
| /* We need buffers for building 16/32-bit strings; 8-bit strings don't need |
| rebuilding, but set up the same naming scheme for use in macros. The "buffer" |
| buffer is where all input lines are read. Its size is the same as pbuffer8. */ |
| |
| static size_t pbuffer8_size = 50000; /* Initial size, bytes */ |
| static uint8_t *pbuffer8 = NULL; |
| #ifdef SUPPORT_PCRE2_16 |
| static size_t pbuffer16_size = 0; /* Size, bytes! Set only when needed */ |
| static uint16_t *pbuffer16 = NULL; |
| #endif |
| #ifdef SUPPORT_PCRE2_32 |
| static size_t pbuffer32_size = 0; /* Size, bytes! Set only when needed */ |
| static uint32_t *pbuffer32 = NULL; |
| #endif |
| static uint8_t *buffer = NULL; |
| |
| /* The dbuffer is where all processed data lines are put. In non-8-bit modes it |
| is cast as needed. For long data lines it grows as necessary. */ |
| |
| static size_t dbuffer_size = 1u << 14; /* Initial size, bytes */ |
| static uint8_t *dbuffer = NULL; |
| |
| |
| |
| /************************************************* |
| * Alternate character tables * |
| *************************************************/ |
| |
| /* By default, the "tables" pointer in the compile context when calling |
| pcre2_compile() is not set (= NULL), thereby using the default tables of the |
| library. However, the tables modifier can be used to select alternate sets of |
| tables, for different kinds of testing. Note that the locale modifier also |
| adjusts the tables. */ |
| |
| /* This is the set of tables distributed as default with PCRE2. It recognizes |
| only ASCII characters. */ |
| |
| static const uint8_t tables1[] = { |
| |
| /* This table is a lower casing table: entry c holds the lower case form of c. Only 65-90 (A-Z) are changed. */ |
| |
| 0, 1, 2, 3, 4, 5, 6, 7, |
| 8, 9, 10, 11, 12, 13, 14, 15, |
| 16, 17, 18, 19, 20, 21, 22, 23, |
| 24, 25, 26, 27, 28, 29, 30, 31, |
| 32, 33, 34, 35, 36, 37, 38, 39, |
| 40, 41, 42, 43, 44, 45, 46, 47, |
| 48, 49, 50, 51, 52, 53, 54, 55, |
| 56, 57, 58, 59, 60, 61, 62, 63, |
| 64, 97, 98, 99,100,101,102,103, |
| 104,105,106,107,108,109,110,111, |
| 112,113,114,115,116,117,118,119, |
| 120,121,122, 91, 92, 93, 94, 95, |
| 96, 97, 98, 99,100,101,102,103, |
| 104,105,106,107,108,109,110,111, |
| 112,113,114,115,116,117,118,119, |
| 120,121,122,123,124,125,126,127, |
| 128,129,130,131,132,133,134,135, |
| 136,137,138,139,140,141,142,143, |
| 144,145,146,147,148,149,150,151, |
| 152,153,154,155,156,157,158,159, |
| 160,161,162,163,164,165,166,167, |
| 168,169,170,171,172,173,174,175, |
| 176,177,178,179,180,181,182,183, |
| 184,185,186,187,188,189,190,191, |
| 192,193,194,195,196,197,198,199, |
| 200,201,202,203,204,205,206,207, |
| 208,209,210,211,212,213,214,215, |
| 216,217,218,219,220,221,222,223, |
| 224,225,226,227,228,229,230,231, |
| 232,233,234,235,236,237,238,239, |
| 240,241,242,243,244,245,246,247, |
| 248,249,250,251,252,253,254,255, |
| |
| /* This table is a case flipping table: entry c holds the other case form of c. Only A-Z and a-z are swapped. */ |
| |
| 0, 1, 2, 3, 4, 5, 6, 7, |
| 8, 9, 10, 11, 12, 13, 14, 15, |
| 16, 17, 18, 19, 20, 21, 22, 23, |
| 24, 25, 26, 27, 28, 29, 30, 31, |
| 32, 33, 34, 35, 36, 37, 38, 39, |
| 40, 41, 42, 43, 44, 45, 46, 47, |
| 48, 49, 50, 51, 52, 53, 54, 55, |
| 56, 57, 58, 59, 60, 61, 62, 63, |
| 64, 97, 98, 99,100,101,102,103, |
| 104,105,106,107,108,109,110,111, |
| 112,113,114,115,116,117,118,119, |
| 120,121,122, 91, 92, 93, 94, 95, |
| 96, 65, 66, 67, 68, 69, 70, 71, |
| 72, 73, 74, 75, 76, 77, 78, 79, |
| 80, 81, 82, 83, 84, 85, 86, 87, |
| 88, 89, 90,123,124,125,126,127, |
| 128,129,130,131,132,133,134,135, |
| 136,137,138,139,140,141,142,143, |
| 144,145,146,147,148,149,150,151, |
| 152,153,154,155,156,157,158,159, |
| 160,161,162,163,164,165,166,167, |
| 168,169,170,171,172,173,174,175, |
| 176,177,178,179,180,181,182,183, |
| 184,185,186,187,188,189,190,191, |
| 192,193,194,195,196,197,198,199, |
| 200,201,202,203,204,205,206,207, |
| 208,209,210,211,212,213,214,215, |
| 216,217,218,219,220,221,222,223, |
| 224,225,226,227,228,229,230,231, |
| 232,233,234,235,236,237,238,239, |
| 240,241,242,243,244,245,246,247, |
| 248,249,250,251,252,253,254,255, |
| |
| /* This table contains bit maps for various character classes. Each map is 32 |
| bytes long and the bits run from the least significant end of each byte. The |
| classes that have their own maps are: space, xdigit, digit, upper, lower, word, |
| graph, print, punct, and cntrl. Other classes are built from combinations. */ |
| |
| 0x00,0x3e,0x00,0x00,0x01,0x00,0x00,0x00, /* space: 9-13 and 32 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| |
| 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* xdigit: 0-9, A-F, a-f */ |
| 0x7e,0x00,0x00,0x00,0x7e,0x00,0x00,0x00, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| |
| 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* digit: 0-9 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* upper: A-Z */ |
| 0xfe,0xff,0xff,0x07,0x00,0x00,0x00,0x00, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* lower: a-z */ |
| 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0x07, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| |
| 0x00,0x00,0x00,0x00,0x00,0x00,0xff,0x03, /* word: 0-9, A-Z, '_', a-z */ |
| 0xfe,0xff,0xff,0x87,0xfe,0xff,0xff,0x07, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| |
| 0x00,0x00,0x00,0x00,0xfe,0xff,0xff,0xff, /* graph: 33-126 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| |
| 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff, /* print: 32-126 */ |
| 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x7f, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| |
| 0x00,0x00,0x00,0x00,0xfe,0xff,0x00,0xfc, /* punct */ |
| 0x01,0x00,0x00,0xf8,0x01,0x00,0x00,0x78, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| |
| 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00, /* cntrl: 0-31 and 127 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, |
| |
| /* This table identifies various classes of character by individual bits: |
| 0x01 white space character |
| 0x02 letter |
| 0x04 decimal digit |
| 0x08 hexadecimal digit |
| 0x10 alphanumeric or '_' |
| 0x80 regular expression metacharacter or binary zero |
| */ |
| |
| 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ |
| 0x00,0x01,0x01,0x01,0x01,0x01,0x00,0x00, /* 8- 15 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ |
| 0x01,0x00,0x00,0x00,0x80,0x00,0x00,0x00, /* - ' */ |
| 0x80,0x80,0x80,0x80,0x00,0x00,0x80,0x00, /* ( - / */ |
| 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */ |
| 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x80, /* 8 - ? */ |
| 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* @ - G */ |
| 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* H - O */ |
| 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* P - W */ |
| 0x12,0x12,0x12,0x80,0x80,0x00,0x80,0x10, /* X - _ */ |
| 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* ` - g */ |
| 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* h - o */ |
| 0x12,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* p - w */ |
| 0x12,0x12,0x12,0x80,0x80,0x00,0x00,0x00, /* x -127 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ |
| 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ |
| |
| /* This is a set of tables that came originally from a Windows user. It seems |
| to be at least an approximation of ISO 8859. In particular, there are |
| characters greater than 128 that are marked as spaces, letters, etc. */ |
| |
| static const uint8_t tables2[] = { /* 1088 bytes; sections appear to follow the tables1 layout */ |
| 0,1,2,3,4,5,6,7, /* Lower casing table (256 bytes) */ |
| 8,9,10,11,12,13,14,15, |
| 16,17,18,19,20,21,22,23, |
| 24,25,26,27,28,29,30,31, |
| 32,33,34,35,36,37,38,39, |
| 40,41,42,43,44,45,46,47, |
| 48,49,50,51,52,53,54,55, |
| 56,57,58,59,60,61,62,63, |
| 64,97,98,99,100,101,102,103, |
| 104,105,106,107,108,109,110,111, |
| 112,113,114,115,116,117,118,119, |
| 120,121,122,91,92,93,94,95, |
| 96,97,98,99,100,101,102,103, |
| 104,105,106,107,108,109,110,111, |
| 112,113,114,115,116,117,118,119, |
| 120,121,122,123,124,125,126,127, |
| 128,129,130,131,132,133,134,135, |
| 136,137,138,139,140,141,142,143, |
| 144,145,146,147,148,149,150,151, |
| 152,153,154,155,156,157,158,159, |
| 160,161,162,163,164,165,166,167, |
| 168,169,170,171,172,173,174,175, |
| 176,177,178,179,180,181,182,183, |
| 184,185,186,187,188,189,190,191, |
| 224,225,226,227,228,229,230,231, /* 192-222 map to 224-254, except 215 */ |
| 232,233,234,235,236,237,238,239, |
| 240,241,242,243,244,245,246,215, |
| 248,249,250,251,252,253,254,223, |
| 224,225,226,227,228,229,230,231, |
| 232,233,234,235,236,237,238,239, |
| 240,241,242,243,244,245,246,247, |
| 248,249,250,251,252,253,254,255, |
| 0,1,2,3,4,5,6,7, /* Case flipping table (256 bytes) */ |
| 8,9,10,11,12,13,14,15, |
| 16,17,18,19,20,21,22,23, |
| 24,25,26,27,28,29,30,31, |
| 32,33,34,35,36,37,38,39, |
| 40,41,42,43,44,45,46,47, |
| 48,49,50,51,52,53,54,55, |
| 56,57,58,59,60,61,62,63, |
| 64,97,98,99,100,101,102,103, |
| 104,105,106,107,108,109,110,111, |
| 112,113,114,115,116,117,118,119, |
| 120,121,122,91,92,93,94,95, |
| 96,65,66,67,68,69,70,71, |
| 72,73,74,75,76,77,78,79, |
| 80,81,82,83,84,85,86,87, |
| 88,89,90,123,124,125,126,127, |
| 128,129,130,131,132,133,134,135, |
| 136,137,138,139,140,141,142,143, |
| 144,145,146,147,148,149,150,151, |
| 152,153,154,155,156,157,158,159, |
| 160,161,162,163,164,165,166,167, |
| 168,169,170,171,172,173,174,175, |
| 176,177,178,179,180,181,182,183, |
| 184,185,186,187,188,189,190,191, |
| 224,225,226,227,228,229,230,231, |
| 232,233,234,235,236,237,238,239, |
| 240,241,242,243,244,245,246,215, |
| 248,249,250,251,252,253,254,223, |
| 192,193,194,195,196,197,198,199, |
| 200,201,202,203,204,205,206,207, |
| 208,209,210,211,212,213,214,247, |
| 216,217,218,219,220,221,222,255, |
| 0,62,0,0,1,0,0,0, /* Class bit maps, 32 bytes each, presumably in the tables1 order: space */ |
| 0,0,0,0,0,0,0,0, |
| 32,0,0,0,1,0,0,0, |
| 0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,255,3, /* xdigit */ |
| 126,0,0,0,126,0,0,0, |
| 0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,255,3, /* digit */ |
| 0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,12,2, |
| 0,0,0,0,0,0,0,0, |
| 0,0,0,0,0,0,0,0, /* upper */ |
| 254,255,255,7,0,0,0,0, |
| 0,0,0,0,0,0,0,0, |
| 255,255,127,127,0,0,0,0, |
| 0,0,0,0,0,0,0,0, /* lower */ |
| 0,0,0,0,254,255,255,7, |
| 0,0,0,0,0,4,32,4, |
| 0,0,0,128,255,255,127,255, |
| 0,0,0,0,0,0,255,3, /* word */ |
| 254,255,255,135,254,255,255,7, |
| 0,0,0,0,0,4,44,6, |
| 255,255,127,255,255,255,127,255, |
| 0,0,0,0,254,255,255,255, /* graph */ |
| 255,255,255,255,255,255,255,127, |
| 0,0,0,0,254,255,255,255, |
| 255,255,255,255,255,255,255,255, |
| 0,2,0,0,255,255,255,255, /* print */ |
| 255,255,255,255,255,255,255,127, |
| 0,0,0,0,255,255,255,255, |
| 255,255,255,255,255,255,255,255, |
| 0,0,0,0,254,255,0,252, /* punct */ |
| 1,0,0,248,1,0,0,120, |
| 0,0,0,0,254,255,255,255, |
| 0,0,128,0,0,0,128,0, |
| 255,255,255,255,0,0,0,0, /* cntrl */ |
| 0,0,0,0,0,0,0,128, |
| 255,255,255,255,0,0,0,0, |
| 0,0,0,0,0,0,0,0, |
| 128,0,0,0,0,0,0,0, /* Character type table (256 bytes): 0-7 */ |
| 0,1,1,0,1,1,0,0, /* 8-15 */ |
| 0,0,0,0,0,0,0,0, /* 16-23 */ |
| 0,0,0,0,0,0,0,0, /* 24-31 */ |
| 1,0,0,0,128,0,0,0, /* 32-39 */ |
| 128,128,128,128,0,0,128,0, /* 40-47 */ |
| 28,28,28,28,28,28,28,28, /* 48-55 */ |
| 28,28,0,0,0,0,0,128, /* 56-63 */ |
| 0,26,26,26,26,26,26,18, /* 64-71 */ |
| 18,18,18,18,18,18,18,18, /* 72-79 */ |
| 18,18,18,18,18,18,18,18, /* 80-87 */ |
| 18,18,18,128,128,0,128,16, /* 88-95 */ |
| 0,26,26,26,26,26,26,18, /* 96-103 */ |
| 18,18,18,18,18,18,18,18, /* 104-111 */ |
| 18,18,18,18,18,18,18,18, /* 112-119 */ |
| 18,18,18,128,128,0,0,0, /* 120-127 */ |
| 0,0,0,0,0,1,0,0, /* 128-135 */ |
| 0,0,0,0,0,0,0,0, /* 136-143 */ |
| 0,0,0,0,0,0,0,0, /* 144-151 */ |
| 0,0,0,0,0,0,0,0, /* 152-159 */ |
| 1,0,0,0,0,0,0,0, /* 160-167 */ |
| 0,0,18,0,0,0,0,0, /* 168-175 */ |
| 0,0,20,20,0,18,0,0, /* 176-183 */ |
| 0,20,18,0,0,0,0,0, /* 184-191 */ |
| 18,18,18,18,18,18,18,18, /* 192-199 */ |
| 18,18,18,18,18,18,18,18, /* 200-207 */ |
| 18,18,18,18,18,18,18,0, /* 208-215 */ |
| 18,18,18,18,18,18,18,18, /* 216-223 */ |
| 18,18,18,18,18,18,18,18, /* 224-231 */ |
| 18,18,18,18,18,18,18,18, /* 232-239 */ |
| 18,18,18,18,18,18,18,0, /* 240-247 */ |
| 18,18,18,18,18,18,18,18 /* 248-255 */ |
| }; |
| |
| |
| |
| #if !defined(VPCOMPAT) && !defined(HAVE_MEMMOVE) |
| /************************************************* |
| * Emulated memmove() for systems without it * |
| *************************************************/ |
| |
| /* This function can make use of bcopy() if it is available. Otherwise do it by |
| steam, as there are some non-Unix environments that lack both memmove() and |
| bcopy(). */ |
| |
/* Copy n bytes from s to d, where the two areas may overlap, and return d.
bcopy() does the work when it is available. Otherwise the bytes are copied one
at a time, starting from the end when the destination lies above the source so
that no source byte is overwritten before it has been read. */

static void *
emulated_memmove(void *d, const void *s, size_t n)
{
#ifdef HAVE_BCOPY
  bcopy(s, d, n);
  return d;
#else
  unsigned char *out = (unsigned char *)d;
  const unsigned char *in = (const unsigned char *)s;
  size_t remaining = n;

  if (out > in)
  {
    /* Destination above source: copy backwards from the last byte. */
    out += n;
    in += n;
    while (remaining-- > 0) *(--out) = *(--in);
  }
  else
  {
    /* Destination at or below source: a plain forward copy is safe. */
    while (remaining-- > 0) *out++ = *in++;
  }

  return d;
#endif /* not HAVE_BCOPY */
}
| #undef memmove |
| #define memmove(d,s,n) emulated_memmove(d,s,n) |
| #endif /* not VPCOMPAT && not HAVE_MEMMOVE */ |
| |
| |
| |
| #ifndef HAVE_STRERROR |
| /************************************************* |
| * Provide strerror() for non-ANSI libraries * |
| *************************************************/ |
| |
| /* Some old-fashioned systems (e.g. SunOS4) didn't have strerror() in their |
| libraries. They may no longer be around, but just in case, we can try to |
| provide the same facility by this simple alternative function. */ |
| |
extern int sys_nerr;
extern char *sys_errlist[];

/* Return the sys_errlist[] text for error number n, or a fixed message when n
is outside the range 0 to sys_nerr-1. */

char *
strerror(int n)
{
  return (n >= 0 && n < sys_nerr)? sys_errlist[n] : "unknown error number";
}
| #endif /* HAVE_STRERROR */ |
| |
| |
| |
| /************************************************* |
| * Local memory functions * |
| *************************************************/ |
| |
| static int mallocs_until_failure = INT_MAX; |
| static int mallocs_called = 0; |
| |
| /* Alternative memory functions, to test functionality. */ |
| |
| static void *my_malloc(size_t size, void *data) |
| { |
| void *block; |
| |
| (void)data; |
| |
| mallocs_called++; |
| if (mallocs_until_failure != INT_MAX && mallocs_until_failure-- <= 0) |
| return NULL; |
| |
| block = malloc(size); |
| if (show_memory) |
| { |
| if (block == NULL) |
| { |
| fprintf(outfile, "** malloc() failed for %" SIZ_FORM "\n", size); |
| } |
| else |
| { |
| fprintf(outfile, "malloc %5" SIZ_FORM, size); |
| #ifdef DEBUG_SHOW_MALLOC_ADDRESSES |
| fprintf(outfile, " %p", block); /* Not portable */ |
| #endif |
| if (malloclistptr < MALLOCLISTSIZE) |
| { |
| malloclist[malloclistptr] = block; |
| malloclistlength[malloclistptr++] = size; |
| } |
| else |
| fprintf(outfile, " (not remembered)"); |
| fprintf(outfile, "\n"); |
| } |
| } |
| return block; |
| } |
| |
| static void my_free(void *block, void *data) |
| { |
| (void)data; |
| if (show_memory && block != NULL) |
| { |
| uint32_t i, j; |
| BOOL found = FALSE; |
| |
| fprintf(outfile, "free"); |
| for (i = 0; i < malloclistptr; i++) |
| { |
| if (block == malloclist[i]) |
| { |
| fprintf(outfile, " %5" SIZ_FORM, malloclistlength[i]); |
| malloclistptr--; |
| for (j = i; j < malloclistptr; j++) |
| { |
| malloclist[j] = malloclist[j+1]; |
| malloclistlength[j] = malloclistlength[j+1]; |
| } |
| found = TRUE; |
| break; |
| } |
| } |
| if (!found) fprintf(outfile, " unremembered block"); |
| #ifdef DEBUG_SHOW_MALLOC_ADDRESSES |
| fprintf(outfile, " %p", block); /* Not portable */ |
| #endif |
| fprintf(outfile, "\n"); |
| } |
| free(block); |
| } |
| |
| |
| |
| /************************************************* |
| * Callback function for stack guard * |
| *************************************************/ |
| |
| /* This is set up to be called from pcre2_compile() when the stackguard=n |
| modifier sets a value greater than zero. The test we do is whether the |
| parenthesis nesting depth is greater than the value set by the modifier. |
| |
| Argument: the current parenthesis nesting depth |
| Returns: non-zero to kill the compilation |
| */ |
| |
| static int |
| stack_guard(uint32_t depth, void *user_data) |
| { |
| (void)user_data; |
| return depth > pat_patctl.stackguard_test; |
| } |
| |
| |
| |
| /************************************************* |
| * EBCDIC support functions * |
| *************************************************/ |
| |
| #if defined(EBCDIC) |
| static BOOL |
| printable(uint32_t c) |
| { |
| if ((c >= CHAR_a && c <= CHAR_i) || |
| (c >= CHAR_j && c <= CHAR_r) || |
| (c >= CHAR_s && c <= CHAR_z) || |
| (c >= CHAR_A && c <= CHAR_I) || |
| (c >= CHAR_J && c <= CHAR_R) || |
| (c >= CHAR_S && c <= CHAR_Z) || |
| (c >= CHAR_0 && c <= CHAR_9)) |
| return TRUE; |
| |
| switch (c) |
| { |
| case CHAR_SPACE: |
| case CHAR_EXCLAMATION_MARK: |
| case CHAR_QUOTATION_MARK: |
| case CHAR_NUMBER_SIGN: |
| case CHAR_DOLLAR_SIGN: |
| case CHAR_PERCENT_SIGN: |
| case CHAR_AMPERSAND: |
| case CHAR_APOSTROPHE: |
| case CHAR_LEFT_PARENTHESIS: |
| case CHAR_RIGHT_PARENTHESIS: |
| case CHAR_ASTERISK: |
| case CHAR_PLUS: |
| case CHAR_COMMA: |
| case CHAR_MINUS: |
| case CHAR_DOT: |
| case CHAR_SLASH: |
| case CHAR_COLON: |
| case CHAR_SEMICOLON: |
| case CHAR_LESS_THAN_SIGN: |
| case CHAR_EQUALS_SIGN: |
| case CHAR_GREATER_THAN_SIGN: |
| case CHAR_QUESTION_MARK: |
| case CHAR_COMMERCIAL_AT: |
| case CHAR_LEFT_SQUARE_BRACKET: |
| case CHAR_BACKSLASH: |
| case CHAR_RIGHT_SQUARE_BRACKET: |
| case CHAR_CIRCUMFLEX_ACCENT: |
| case CHAR_UNDERSCORE: |
| case CHAR_GRAVE_ACCENT: |
| case CHAR_LEFT_CURLY_BRACKET: |
| case CHAR_VERTICAL_LINE: |
| case CHAR_RIGHT_CURLY_BRACKET: |
| case CHAR_TILDE: |
| return TRUE; |
| } |
| |
| return FALSE; |
| } |
| #endif |
| |
| #if defined(EBCDIC) && !EBCDIC_IO |
| static void |
| ascii_to_ebcdic_str(uint8_t *buf, size_t len) |
| { |
| for (size_t i = 0; i < len; ++i) |
| buf[i] = ascii_to_ebcdic_1047[buf[i]]; |
| } |
| #endif |
| |
| #if defined(EBCDIC) |
| static uint32_t |
| ascii_to_ebcdic(uint32_t c) |
| { |
| return (c < 256)? ascii_to_ebcdic_1047[c] : c; |
| } |
| |
| static uint32_t |
| ebcdic_to_ascii(uint32_t c) |
| { |
| return (c < 256)? ebcdic_1047_to_ascii[c] : c; |
| } |
| #endif |
| |
| |
| |
| /************************************************* |
| * Convert UTF-8 character to code point * |
| *************************************************/ |
| |
| /* This function reads one or more bytes that represent a UTF-8 character, |
| and returns the codepoint of that character. Note that the function supports |
| the original UTF-8 definition of RFC 2279, allowing for values in the range 0 |
| to 0x7fffffff, up to 6 bytes long. This makes it possible to generate |
| codepoints greater than 0x10ffff which are useful for testing PCRE2's error |
| checking, and also for generating 32-bit non-UTF data values above the UTF |
| limit. |
| |
| Argument: |
| utf8bytes a pointer to the byte vector |
| end a pointer to the end of the byte vector |
| vptr a pointer to an int to receive the value |
| |
| Returns: > 0 => the number of bytes consumed |
| -6 to 0 => malformed UTF-8 character at offset = (-return) |
| */ |
| |
| static int |
| utf8_to_ord(PCRE2_SPTR8 utf8bytes, PCRE2_SPTR8 end, uint32_t *vptr) |
| { |
| uint32_t c = *utf8bytes++; |
| uint32_t d = c; |
| int i, j, s; |
| |
| for (i = -1; i < 6; i++) /* i is number of additional bytes */ |
| { |
| if ((d & 0x80) == 0) break; |
| d <<= 1; |
| } |
| |
| if (i == -1) { *vptr = c; return 1; } /* ascii character */ |
| if (i == 0 || i == 6) return 0; /* invalid UTF-8 */ |
| |
| /* i now has a value in the range 1-5 */ |
| |
| s = 6*i; |
| d = (c & utf8_table3[i]) << s; |
| |
| for (j = 0; j < i; j++) |
| { |
| if (utf8bytes >= end) return 0; |
| |
| c = *utf8bytes++; |
| if ((c & 0xc0) != 0x80) return -(j+1); |
| s -= 6; |
| d |= (c & 0x3f) << s; |
| } |
| |
| /* Check that encoding was the correct unique one */ |
| |
| for (j = 0; j < (int)utf8_table1_size; j++) |
| if (d <= (uint32_t)utf8_table1[j]) break; |
| if (j != i) return -(i+1); |
| |
| /* Valid value */ |
| |
| *vptr = d; |
| return i+1; |
| } |
| |
| |
| |
| #ifdef SUPPORT_PCRE2_16 |
| /************************************************* |
| * Convert UTF-16 character to code point * |
| *************************************************/ |
| |
| /* This function reads one or more UTF-16 code units, and returns the |
| codepoint of that character. |
| |
| Argument: |
| utf16units a pointer to the units vector |
| end a pointer to the end of the units vector |
| vptr a pointer to an int to receive the value |
| |
| Returns: > 0 => the number of 16-bit units consumed |
| -1 => malformed UTF-16 |
| */ |
| |
| static int |
| utf16_to_ord(PCRE2_SPTR16 utf16units, PCRE2_SPTR16 end, uint32_t *vptr) |
| { |
| uint32_t c = *utf16units++; |
| |
| if (c >= 0xdc00 && c <= 0xdfff) return -1; |
| |
| if (c >= 0xd800 && c < 0xdc00) |
| { |
| uint32_t c2; |
| |
| if (utf16units >= end) return -1; |
| |
| c2 = *utf16units++; |
| if (c2 < 0xdc00 || c2 > 0xdfff) return -1; |
| *vptr = ((c & 0x3ff) << 10) + (c2 & 0x3ff) + 0x10000; |
| return 2; |
| } |
| |
| *vptr = c; |
| return 1; |
| } |
| #endif /* SUPPORT_PCRE2_16 */ |
| |
| |
| |
| /************************************************* |
| * Convert character value to UTF-8 * |
| *************************************************/ |
| |
| /* This function takes an integer value in the range 0 - 0x7fffffff |
| and encodes it as a UTF-8 character in 1 to 6 bytes. It is needed even when the |
| 8-bit library is not supported, to generate UTF-8 output for non-ASCII |
| characters. |
| |
| Arguments: |
| cvalue the character value |
| utf8bytes pointer to buffer for result - at least 6 bytes long |
| |
| Returns: number of bytes placed in the buffer, or -1 if cvalue is greater |
| than 0x7fffffff (nothing is written in that case) |
| */ |
| |
| static int |
| ord_to_utf8(uint32_t cvalue, uint8_t *utf8bytes) |
| { |
| int i, j; |
| if (cvalue > 0x7fffffffu) |
| return -1; |
| for (i = 0; i < (int)utf8_table1_size; i++) |
| if (cvalue <= (uint32_t)utf8_table1[i]) break; |
| utf8bytes += i; |
| for (j = i; j > 0; j--) |
| { |
| *utf8bytes-- = 0x80 | (cvalue & 0x3f); |
| cvalue >>= 6; |
| } |
| *utf8bytes = utf8_table2[i] | cvalue; |
| return i + 1; |
| } |
| |
| |
| |
| /************************************************* |
| * Print one character * |
| *************************************************/ |
| |
| /* Print a single character either literally, or as a hex escape, and count how |
| many printed characters are used. |
| |
| Arguments: |
| c the character |
| utf TRUE in UTF mode |
| f the FILE to print to, or NULL just to count characters |
| |
| Returns: number of characters written |
| */ |
| |
| static int |
| pchar(uint32_t c, BOOL utf, FILE *f) |
| { |
| int n = 0; |
| char tempbuffer[16]; |
| |
| if (PRINTABLE(c)) |
| { |
| c = CHAR_OUTPUT(c); |
| if (f != NULL) fprintf(f, "%c", c); |
| return 1; |
| } |
| |
| c = CHAR_OUTPUT_HEX(c); |
| |
| if (c < 0x100) |
| { |
| if (utf) |
| { |
| if (f != NULL) fprintf(f, "\\x{%02x}", c); |
| return 6; |
| } |
| else |
| { |
| if (f != NULL) fprintf(f, "\\x%02x", c); |
| return 4; |
| } |
| } |
| |
| if (f != NULL) n = fprintf(f, "\\x{%02x}", c); |
| else n = snprintf(tempbuffer, sizeof(tempbuffer), "\\x{%02x}", c); |
| |
| return n >= 0 ? n : 0; |
| } |
| |
| |
| |
| /************************************************* |
| * Expand input buffers * |
| *************************************************/ |
| |
| /* This function doubles the size of the input buffer and the buffer for |
| keeping an 8-bit copy of patterns (pbuffer8), and copies the current buffers to |
| the new ones. |
| |
| Arguments: none |
| Returns: nothing (aborts if malloc() fails) |
| */ |
| |
| static void |
| expand_input_buffers(void) |
| { |
| size_t new_pbuffer8_size = 2*pbuffer8_size; |
| uint8_t *new_buffer = (uint8_t *)malloc(new_pbuffer8_size); |
| uint8_t *new_pbuffer8 = (uint8_t *)malloc(new_pbuffer8_size); |
| |
| if (new_buffer == NULL || new_pbuffer8 == NULL) |
| { |
| fprintf(stderr, "pcre2test: malloc(%" SIZ_FORM ") failed\n", |
| new_pbuffer8_size); |
| exit(1); |
| } |
| |
| memcpy(new_buffer, buffer, pbuffer8_size); |
| memcpy(new_pbuffer8, pbuffer8, pbuffer8_size); |
| |
| pbuffer8_size = new_pbuffer8_size; |
| |
| free(buffer); |
| free(pbuffer8); |
| |
| buffer = new_buffer; |
| pbuffer8 = new_pbuffer8; |
| } |
| |
| |
| |
| /************************************************* |
| * Read or extend an input line * |
| *************************************************/ |
| |
| /* Input lines are read into buffer, but both patterns and data lines can be |
| continued over multiple input lines. In addition, if the buffer fills up, we |
| want to automatically expand it so as to be able to handle extremely large |
| lines that are needed for certain stress tests, although this is less likely |
| now that there are repetition features for both patterns and data. When the |
| input buffer is expanded, the other two buffers must also be expanded likewise, |
| and the contents of pbuffer, which are a copy of the input for callouts, must |
| be preserved (for when expansion happens for a data line). This is not the most |
| optimal way of handling this, but hey, this is just a test program! |
| |
| Arguments: |
| f the file to read |
| start where in buffer to start (this *must* be within buffer) |
| prompt for stdin or readline() |
| |
| Returns: pointer to the start of new data |
| could be a copy of start, or could be moved |
| NULL if no data read and EOF reached |
| */ |
| |
| static uint8_t * |
| extend_inputline(FILE *f, uint8_t *start, const char *prompt) |
| { |
| uint8_t *here = start; |
| |
| for (;;) |
| { |
| size_t dlen; |
| size_t rlen = (size_t)(pbuffer8_size - (here - buffer)); |
| |
| /* If libreadline or libedit support is required, use readline() to read a |
| line if the input is a terminal. Note that readline() removes the trailing |
| newline, so we must put it back again, to be compatible with fgets(). */ |
| |
| #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT) |
| if (INTERACTIVE(f)) |
| { |
| char *s = readline(prompt); |
| if (s == NULL) return (here == start)? NULL : start; |
| dlen = strlen(s); |
| if (dlen > rlen - 2) |
| { |
| fprintf(outfile, "** Interactive input exceeds buffer space\n"); |
| exit(1); |
| } |
| if (dlen > 0) add_history(s); |
| memcpy(here, s, dlen); |
| here[dlen] = '\n'; |
| here[dlen+1] = 0; |
| free(s); |
| return start; |
| } |
| #endif |
| |
| if (rlen > 1000) |
| { |
| int rlen_trunc = (rlen > (unsigned)INT_MAX)? INT_MAX : (int)rlen; |
| |
| /* Read the next line by normal means, prompting if the file is a tty. */ |
| |
| if (INTERACTIVE(f)) printf("%s", prompt); |
| if (fgets((char *)here, rlen_trunc, f) == NULL) |
| return (here == start)? NULL : start; |
| |
| dlen = strlen((char *)here); |
| here += dlen; |
| |
| /* Check for end of line reached. Take care not to read data from before |
| start (dlen will be zero for a file starting with a binary zero). */ |
| |
| if (here > start && here[-1] == '\n') return start; |
| |
| /* If we have not read a newline when reading a file, we have either filled |
| the buffer or reached the end of the file. We can detect the former by |
| checking that the string fills the buffer, and the latter by feof(). If |
| neither of these is true, it means we read a binary zero which has caused |
| strlen() to give a short length. This is a hard error because pcre2test |
| expects to work with C strings. */ |
| |
| if (dlen < (unsigned)rlen_trunc - 1 && !feof(f)) |
| { |
| fprintf(outfile, "** Binary zero encountered in input\n"); |
| fprintf(outfile, "** pcre2test run abandoned\n"); |
| exit(1); |
| } |
| } |
| |
| else |
| { |
| size_t start_offset = start - buffer; |
| size_t here_offset = here - buffer; |
| expand_input_buffers(); |
| start = buffer + start_offset; |
| here = buffer + here_offset; |
| } |
| } |
| |
| PCRE2_UNREACHABLE(); /* Control never reaches here */ |
| } |
| |
| |
| |
| /************************************************* |
| * Case-independent strncmp() function * |
| *************************************************/ |
| |
| /* |
| Arguments: |
| s first string |
| t second string |
| n number of characters to compare |
| |
| Returns: < 0, = 0, or > 0, according to the comparison |
| */ |
| |
static int
strncmpic(const uint8_t *s, const uint8_t *t, size_t n)
{
size_t i;

/* Exactly n positions are compared unless a difference is found first; a
zero byte does not stop the comparison. */

for (i = 0; i < n; i++)
  {
  int diff = tolower(s[i]) - tolower(t[i]);
  if (diff != 0) return diff;
  }

return 0;
}
| |
| |
| |
| /************************************************* |
| * Scan the main modifier list * |
| *************************************************/ |
| |
| /* This function searches the modifier list for a long modifier name. |
| |
| Argument: |
| p start of the name |
| len length of the name |
| |
| Returns: an index in the modifier list, or -1 on failure |
| */ |
| |
| static int |
| scan_modifiers(const uint8_t *p, size_t len) |
| { |
| int bot = 0; |
| int top = MODLISTCOUNT; |
| |
| while (top > bot) |
| { |
| int mid = (bot + top)/2; |
| size_t mlen = strlen(modlist[mid].name); |
| int c = strncmp((const char *)p, modlist[mid].name, (len < mlen)? len : mlen); |
| if (c == 0) |
| { |
| if (len == mlen) return mid; |
| c = len > mlen ? 1 : -1; |
| } |
| if (c > 0) bot = mid + 1; else top = mid; |
| } |
| |
| return -1; |
| } |
| |
| |
| |
| /************************************************* |
| * Determine how to print an error offset * |
| *************************************************/ |
| |
| /* Each error code has an associated direction - does it refer |
| to the characters to the right or to the left of the offset? |
| |
| Arguments: |
| rc the error code associated with the offset |
| erroroffset the offset in the pattern where the error occurred |
| |
| Returns: -1 if the error is unimplemented |
| 0 if the offset is to be ignored (should be zero) |
| 1 if the error refers to the left of the offset |
| 2 if the error refers to the right of the offset |
| 3 if the error refers to both sides of the offset |
| */ |
| |
| static int |
| error_direction(int rc, PCRE2_SIZE erroroffset) |
| { |
| switch (rc) |
| { |
| /* These cases are all for things which don't affect a specific part of the |
| pattern, and should always return zero offset. */ |
| |
| case PCRE2_ERROR_NULL_PATTERN: |
| case PCRE2_ERROR_BAD_OPTIONS: |
| case PCRE2_ERROR_PATTERN_TOO_LARGE: |
| case PCRE2_ERROR_HEAP_FAILED: |
| case PCRE2_ERROR_UNICODE_NOT_SUPPORTED: |
| case PCRE2_ERROR_PARENTHESES_STACK_CHECK: |
| case PCRE2_ERROR_PATTERN_TOO_COMPLICATED: |
| case PCRE2_ERROR_PATTERN_STRING_TOO_LONG: |
| case PCRE2_ERROR_NO_SURROGATES_IN_UTF16: |
| case PCRE2_ERROR_BAD_LITERAL_OPTIONS: |
| case PCRE2_ERROR_PATTERN_COMPILED_SIZE_TOO_BIG: |
| case PCRE2_ERROR_EXTRA_CASING_REQUIRES_UNICODE: |
| case PCRE2_ERROR_TURKISH_CASING_REQUIRES_UTF: |
| case PCRE2_ERROR_EXTRA_CASING_INCOMPATIBLE: |
| return 0; |
| |
| /* A few exceptional cases use the errorofset to point rightwards. These are |
| used when indicating an error in a capture group or lookaround parentheses. |
| It is more user-friendly to identify the capture group by its start. */ |
| |
| case PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP: |
| case PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH: |
| case PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES: |
| case PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED: |
| case PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C: |
| case PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER: |
| case PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP: |
| case PCRE2_ERROR_LOOKBEHIND_TOO_LONG: |
| case PCRE2_ERROR_MAX_VAR_LOOKBEHIND_EXCEEDED: |
| case PCRE2_ERROR_ECLASS_NEST_TOO_DEEP: |
| return 2; |
| |
| /* The standard erroroffset should occur just after the affected portion of |
| the pattern, unless there is a good reason not to do this. Consistency is |
| good, but if there's a specific need then that's more important. */ |
| |
| case PCRE2_ERROR_END_BACKSLASH: |
| case PCRE2_ERROR_END_BACKSLASH_C: |
| case PCRE2_ERROR_UNKNOWN_ESCAPE: |
| case PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER: |
| case PCRE2_ERROR_QUANTIFIER_TOO_BIG: |
| case PCRE2_ERROR_MISSING_SQUARE_BRACKET: |
| case PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS: |
| case PCRE2_ERROR_CLASS_RANGE_ORDER: |
| case PCRE2_ERROR_QUANTIFIER_INVALID: |
| case PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY: |
| case PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS: |
| case PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING: |
| case PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS: |
| return 1; |
| case PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE: |
| return 3; /* TODO I'd like to fix this, but some of the cases are _hard_ */ |
| case PCRE2_ERROR_MISSING_COMMENT_CLOSING: |
| case PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS: |
| case PCRE2_ERROR_MISSING_CONDITION_CLOSING: |
| case PCRE2_ERROR_ZERO_RELATIVE_REFERENCE: |
| case PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED: |
| case PCRE2_ERROR_BAD_RELATIVE_REFERENCE: |
| case PCRE2_ERROR_UNKNOWN_POSIX_CLASS: |
| case PCRE2_ERROR_CODE_POINT_TOO_BIG: |
| case PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE: |
| case PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG: |
| case PCRE2_ERROR_MISSING_CALLOUT_CLOSING: |
| case PCRE2_ERROR_ESCAPE_INVALID_IN_VERB: |
| case PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P: |
| case PCRE2_ERROR_MISSING_NAME_TERMINATOR: |
| case PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME: |
| case PCRE2_ERROR_INVALID_SUBPATTERN_NAME: |
| case PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE: |
| case PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY: |
| case PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY: |
| case PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG: |
| case PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS: |
| case PCRE2_ERROR_CLASS_INVALID_RANGE: |
| case PCRE2_ERROR_OCTAL_BYTE_TOO_BIG: |
| return 1; |
| case PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES: |
| return 2; /* TODO Not ideally placed; I'd like to fix this */ |
| case PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE: |
| case PCRE2_ERROR_BACKSLASH_G_SYNTAX: |
| case PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING: |
| case PCRE2_ERROR_VERB_UNKNOWN: |
| case PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG: |
| case PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED: |
| case PCRE2_ERROR_INVALID_OCTAL: |
| case PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH: |
| case PCRE2_ERROR_MARK_MISSING_ARGUMENT: |
| case PCRE2_ERROR_INVALID_HEXADECIMAL: |
| case PCRE2_ERROR_BACKSLASH_C_SYNTAX: |
| case PCRE2_ERROR_BACKSLASH_K_SYNTAX: |
| case PCRE2_ERROR_BACKSLASH_N_IN_CLASS: |
| case PCRE2_ERROR_CALLOUT_STRING_TOO_LONG: |
| case PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT: |
| return 1; |
| case PCRE2_ERROR_VERB_NAME_TOO_LONG: |
| case PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG: |
| case PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS: |
| case PCRE2_ERROR_VERSION_CONDITION_SYNTAX: |
| case PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER: |
| case PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED: |
| case PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED: |
| case PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE: |
| case PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS: |
| case PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN: |
| case PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE: |
| case PCRE2_ERROR_TOO_MANY_CAPTURES: |
| case PCRE2_ERROR_MISSING_OCTAL_DIGIT: |
| return 1; |
| case PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND: |
| return 3; /* TODO No erroroffset implemented yet, sadly */ |
| case PCRE2_ERROR_OVERSIZE_PYTHON_OCTAL: |
| case PCRE2_ERROR_CALLOUT_CALLER_DISABLED: |
| case PCRE2_ERROR_ECLASS_INVALID_OPERATOR: |
| case PCRE2_ERROR_ECLASS_UNEXPECTED_OPERATOR: |
| case PCRE2_ERROR_ECLASS_EXPECTED_OPERAND: |
| case PCRE2_ERROR_ECLASS_MIXED_OPERATORS: |
| case PCRE2_ERROR_ECLASS_HINT_SQUARE_BRACKET: |
| case PCRE2_ERROR_PERL_ECLASS_UNEXPECTED_EXPR: |
| case PCRE2_ERROR_PERL_ECLASS_EMPTY_EXPR: |
| case PCRE2_ERROR_PERL_ECLASS_MISSING_CLOSE: |
| case PCRE2_ERROR_PERL_ECLASS_UNEXPECTED_CHAR: |
| case PCRE2_ERROR_EXPECTED_CAPTURE_GROUP: |
| case PCRE2_ERROR_MISSING_OPENING_PARENTHESIS: |
| case PCRE2_ERROR_MISSING_NUMBER_TERMINATOR: |
| return 1; |
| |
| /* These two are a little fiddly. They can be triggered by passed-in options |
| (when erroroffset is zero), or by text in the pattern "(*UTF)". We only |
| indicate an pattern error in the latter case. */ |
| |
| case PCRE2_ERROR_UTF_IS_DISABLED: |
| case PCRE2_ERROR_UCP_IS_DISABLED: |
| return (erroroffset > 0)? 1 : 0; |
| |
| case PCRE2_ERROR_UTF8_ERR1: |
| case PCRE2_ERROR_UTF8_ERR2: |
| case PCRE2_ERROR_UTF8_ERR3: |
| case PCRE2_ERROR_UTF8_ERR4: |
| case PCRE2_ERROR_UTF8_ERR5: |
| case PCRE2_ERROR_UTF8_ERR6: |
| case PCRE2_ERROR_UTF8_ERR7: |
| case PCRE2_ERROR_UTF8_ERR8: |
| case PCRE2_ERROR_UTF8_ERR9: |
| case PCRE2_ERROR_UTF8_ERR10: |
| case PCRE2_ERROR_UTF8_ERR11: |
| case PCRE2_ERROR_UTF8_ERR12: |
| case PCRE2_ERROR_UTF8_ERR13: |
| case PCRE2_ERROR_UTF8_ERR14: |
| case PCRE2_ERROR_UTF8_ERR15: |
| case PCRE2_ERROR_UTF8_ERR16: |
| case PCRE2_ERROR_UTF8_ERR17: |
| case PCRE2_ERROR_UTF8_ERR18: |
| case PCRE2_ERROR_UTF8_ERR19: |
| case PCRE2_ERROR_UTF8_ERR20: |
| case PCRE2_ERROR_UTF8_ERR21: |
| case PCRE2_ERROR_UTF16_ERR1: |
| case PCRE2_ERROR_UTF16_ERR2: |
| case PCRE2_ERROR_UTF16_ERR3: |
| case PCRE2_ERROR_UTF32_ERR1: |
| case PCRE2_ERROR_UTF32_ERR2: |
| return 2; |
| } |
| |
| return -1; |
| } |
| |
| |
| |
| #ifdef SUPPORT_PCRE2_8 |
| /************************************************* |
| * Show something in a list * |
| *************************************************/ |
| |
| /* This function just helps to keep the code that uses it tidier. It's used for |
| various lists of things where there needs to be introductory text before the |
| first item. As these calls are all in the POSIX-support code, they happen only |
| when 8-bit mode is supported. */ |
| |
| static void |
| prmsg(const char **msg, const char *s) |
| { |
| fprintf(outfile, "%s %s", *msg, s); |
| *msg = ""; |
| } |
| #endif /* SUPPORT_PCRE2_8 */ |
| |
| |
| |
| /************************************************* |
| * Show control bits * |
| *************************************************/ |
| |
| /* Called for mutually exclusive controls and for unsupported POSIX controls. |
| Because the bits are unique, this can be used for both pattern and data control |
| words. |
| |
| Arguments: |
| controls control bits |
| controls2 more control bits |
| before text to print before |
| |
| Returns: nothing |
| */ |
| |
| static void |
| show_controls(uint32_t controls, uint32_t controls2, const char *before) |
| { |
| fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", |
| before, |
| ((controls & CTL_AFTERTEXT) != 0)? " aftertext" : "", |
| ((controls & CTL_ALLAFTERTEXT) != 0)? " allaftertext" : "", |
| ((controls & CTL_ALLCAPTURES) != 0)? " allcaptures" : "", |
| ((controls & CTL_ALLUSEDTEXT) != 0)? " allusedtext" : "", |
| ((controls2 & CTL2_ALLVECTOR) != 0)? " allvector" : "", |
| ((controls & CTL_ALTGLOBAL) != 0)? " altglobal" : "", |
| ((controls & CTL_BINCODE) != 0)? " bincode" : "", |
| ((controls2 & CTL2_BSR_SET) != 0)? " bsr" : "", |
| ((controls & CTL_CALLOUT_CAPTURE) != 0)? " callout_capture" : "", |
| ((controls2 & CTL2_CALLOUT_EXTRA) != 0)? " callout_extra" : "", |
| ((controls & CTL_CALLOUT_INFO) != 0)? " callout_info" : "", |
| ((controls & CTL_CALLOUT_NONE) != 0)? " callout_none" : "", |
| ((controls2 & CTL2_CALLOUT_NO_WHERE) != 0)? " callout_no_where" : "", |
| ((controls & CTL_DFA) != 0)? " dfa" : "", |
| ((controls & CTL_EXPAND) != 0)? " expand" : "", |
| ((controls & CTL_FINDLIMITS) != 0)? " find_limits" : "", |
| ((controls & CTL_FINDLIMITS_NOHEAP) != 0)? " find_limits_noheap" : "", |
| ((controls2 & CTL2_FRAMESIZE) != 0)? " framesize" : "", |
| ((controls & CTL_FULLBINCODE) != 0)? " fullbincode" : "", |
| ((controls & CTL_GETALL) != 0)? " getall" : "", |
| ((controls & CTL_GLOBAL) != 0)? " global" : "", |
| ((controls2 & CTL2_HEAPFRAMES_SIZE) != 0)? " heapframes_size" : "", |
| ((controls & CTL_HEXPAT) != 0)? " hex" : "", |
| ((controls & CTL_INFO) != 0)? " info" : "", |
| ((controls & CTL_JITFAST) != 0)? " jitfast" : "", |
| ((controls & CTL_JITVERIFY) != 0)? " jitverify" : "", |
| ((controls & CTL_MARK) != 0)? " mark" : "", |
| ((controls & CTL_MEMORY) != 0)? " memory" : "", |
| ((controls2 & CTL2_NL_SET) != 0)? " newline" : "", |
| ((controls & CTL_NULLCONTEXT) != 0)? " null_context" : "", |
| ((controls2 & CTL2_NULL_REPLACEMENT) != 0)? " null_replacement" : "", |
| ((controls2 & CTL2_NULL_SUBJECT) != 0)? " null_subject" : "", |
| ((controls & CTL_POSIX) != 0)? " posix" : "", |
| ((controls & CTL_POSIX_NOSUB) != 0)? " posix_nosub" : "", |
| ((controls & CTL_PUSH) != 0)? " push" : "", |
| ((controls & CTL_PUSHCOPY) != 0)? " pushcopy" : "", |
| ((controls & CTL_PUSHTABLESCOPY) != 0)? " pushtablescopy" : "", |
| ((controls & CTL_STARTCHAR) != 0)? " startchar" : "", |
| ((controls2 & CTL2_SUBSTITUTE_CALLOUT) != 0)? " substitute_callout" : "", |
| ((controls2 & CTL2_SUBSTITUTE_CASE_CALLOUT) != 0)? " substitute_case_callout" : "", |
| ((controls2 & CTL2_SUBSTITUTE_EXTENDED) != 0)? " substitute_extended" : "", |
| ((controls2 & CTL2_SUBSTITUTE_LITERAL) != 0)? " substitute_literal" : "", |
| ((controls2 & CTL2_SUBSTITUTE_MATCHED) != 0)? " substitute_matched" : "", |
| ((controls2 & CTL2_SUBSTITUTE_OVERFLOW_LENGTH) != 0)? " substitute_overflow_length" : "", |
| ((controls2 & CTL2_SUBSTITUTE_REPLACEMENT_ONLY) != 0)? " substitute_replacement_only" : "", |
| ((controls2 & CTL2_SUBSTITUTE_UNKNOWN_UNSET) != 0)? " substitute_unknown_unset" : "", |
| ((controls2 & CTL2_SUBSTITUTE_UNSET_EMPTY) != 0)? " substitute_unset_empty" : "", |
| ((controls & CTL_USE_LENGTH) != 0)? " use_length" : "", |
| ((controls & CTL_UTF8_INPUT) != 0)? " utf8_input" : "", |
| ((controls & CTL_ZERO_TERMINATE) != 0)? " zero_terminate" : ""); |
| } |
| |
| |
| |
| /************************************************* |
| * Show compile options * |
| *************************************************/ |
| |
| /* Called from show_pattern_info() and for unsupported POSIX options. |
| |
| Arguments: |
| options an options word |
| before text to print before |
| after text to print after |
| |
| Returns: nothing |
| */ |
| |
| static void |
| show_compile_options(uint32_t options, const char *before, const char *after) |
| { |
| if (options == 0) fprintf(outfile, "%s <none>%s", before, after); |
| else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", |
| before, |
| ((options & PCRE2_ALT_BSUX) != 0)? " alt_bsux" : "", |
| ((options & PCRE2_ALT_CIRCUMFLEX) != 0)? " alt_circumflex" : "", |
| ((options & PCRE2_ALT_EXTENDED_CLASS) != 0)? " alt_extended_class" : "", |
| ((options & PCRE2_ALT_VERBNAMES) != 0)? " alt_verbnames" : "", |
| ((options & PCRE2_ALLOW_EMPTY_CLASS) != 0)? " allow_empty_class" : "", |
| ((options & PCRE2_ANCHORED) != 0)? " anchored" : "", |
| ((options & PCRE2_AUTO_CALLOUT) != 0)? " auto_callout" : "", |
| ((options & PCRE2_CASELESS) != 0)? " caseless" : "", |
| ((options & PCRE2_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "", |
| ((options & PCRE2_DOTALL) != 0)? " dotall" : "", |
| ((options & PCRE2_DUPNAMES) != 0)? " dupnames" : "", |
| ((options & PCRE2_ENDANCHORED) != 0)? " endanchored" : "", |
| ((options & PCRE2_EXTENDED) != 0)? " extended" : "", |
| ((options & PCRE2_EXTENDED_MORE) != 0)? " extended_more" : "", |
| ((options & PCRE2_FIRSTLINE) != 0)? " firstline" : "", |
| ((options & PCRE2_LITERAL) != 0)? " literal" : "", |
| ((options & PCRE2_MATCH_INVALID_UTF) != 0)? " match_invalid_utf" : "", |
| ((options & PCRE2_MATCH_UNSET_BACKREF) != 0)? " match_unset_backref" : "", |
| ((options & PCRE2_MULTILINE) != 0)? " multiline" : "", |
| ((options & PCRE2_NEVER_BACKSLASH_C) != 0)? " never_backslash_c" : "", |
| ((options & PCRE2_NEVER_UCP) != 0)? " never_ucp" : "", |
| ((options & PCRE2_NEVER_UTF) != 0)? " never_utf" : "", |
| ((options & PCRE2_NO_AUTO_CAPTURE) != 0)? " no_auto_capture" : "", |
| ((options & PCRE2_NO_AUTO_POSSESS) != 0)? " no_auto_possess" : "", |
| ((options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)? " no_dotstar_anchor" : "", |
| ((options & PCRE2_NO_UTF_CHECK) != 0)? " no_utf_check" : "", |
| ((options & PCRE2_NO_START_OPTIMIZE) != 0)? " no_start_optimize" : "", |
| ((options & PCRE2_UCP) != 0)? " ucp" : "", |
| ((options & PCRE2_UNGREEDY) != 0)? " ungreedy" : "", |
| ((options & PCRE2_USE_OFFSET_LIMIT) != 0)? " use_offset_limit" : "", |
| ((options & PCRE2_UTF) != 0)? " utf" : "", |
| after); |
| } |
| |
| |
| /************************************************* |
| * Show compile extra options * |
| *************************************************/ |
| |
| /* Called from show_pattern_info() and for unsupported POSIX options. |
| |
| Arguments: |
| options an options word |
| before text to print before |
| after text to print after |
| |
| Returns: nothing |
| */ |
| |
| static void |
| show_compile_extra_options(uint32_t options, const char *before, |
| const char *after) |
| { |
| if (options == 0) fprintf(outfile, "%s <none>%s", before, after); |
| else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", |
| before, |
| ((options & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) != 0) ? " allow_lookaround_bsk" : "", |
| ((options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)? " allow_surrogate_escapes" : "", |
| ((options & PCRE2_EXTRA_ALT_BSUX) != 0)? " alt_bsux" : "", |
| ((options & PCRE2_EXTRA_ASCII_BSD) != 0)? " ascii_bsd" : "", |
| ((options & PCRE2_EXTRA_ASCII_BSS) != 0)? " ascii_bss" : "", |
| ((options & PCRE2_EXTRA_ASCII_BSW) != 0)? " ascii_bsw" : "", |
| ((options & PCRE2_EXTRA_ASCII_DIGIT) != 0)? " ascii_digit" : "", |
| ((options & PCRE2_EXTRA_ASCII_POSIX) != 0)? " ascii_posix" : "", |
| ((options & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) != 0)? " bad_escape_is_literal" : "", |
| ((options & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)? " caseless_restrict" : "", |
| ((options & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)? " escaped_cr_is_lf" : "", |
| ((options & PCRE2_EXTRA_MATCH_WORD) != 0)? " match_word" : "", |
| ((options & PCRE2_EXTRA_MATCH_LINE) != 0)? " match_line" : "", |
| ((options & PCRE2_EXTRA_NEVER_CALLOUT) != 0)? " never_callout" : "", |
| ((options & PCRE2_EXTRA_NO_BS0) != 0)? " no_bs0" : "", |
| ((options & PCRE2_EXTRA_PYTHON_OCTAL) != 0)? " python_octal" : "", |
| ((options & PCRE2_EXTRA_TURKISH_CASING) != 0)? " turkish_casing" : "", |
| after); |
| } |
| |
| |
| /************************************************* |
| * Show optimization flags * |
| *************************************************/ |
| |
| /* |
| Arguments: |
| flags an options word |
| before text to print before |
| after text to print after |
| |
| Returns: nothing |
| */ |
| |
| static void |
| show_optimize_flags(uint32_t flags, const char *before, const char *after) |
| { |
| if (flags == 0) fprintf(outfile, "%s<none>%s", before, after); |
| else fprintf(outfile, "%s%s%s%s%s%s%s", |
| before, |
| ((flags & PCRE2_OPTIM_AUTO_POSSESS) != 0) ? "auto_possess" : "", |
| ((flags & PCRE2_OPTIM_AUTO_POSSESS) != 0 && (flags >> 1) != 0) ? "," : "", |
| ((flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0) ? "dotstar_anchor" : "", |
| ((flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0 && (flags >> 2) != 0) ? "," : "", |
| ((flags & PCRE2_OPTIM_START_OPTIMIZE) != 0) ? "start_optimize" : "", |
| after); |
| } |
| |
| |
| #ifdef SUPPORT_PCRE2_8 |
| /************************************************* |
| * Show match options * |
| *************************************************/ |
| |
| /* Called for unsupported POSIX options. */ |
| |
| static void |
| show_match_options(uint32_t options) |
| { |
| fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s", |
| ((options & PCRE2_ANCHORED) != 0)? " anchored" : "", |
| ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0)? " copy_matched_subject" : "", |
| ((options & PCRE2_DFA_RESTART) != 0)? " dfa_restart" : "", |
| ((options & PCRE2_DFA_SHORTEST) != 0)? " dfa_shortest" : "", |
| ((options & PCRE2_DISABLE_RECURSELOOP_CHECK) != 0)? " disable_recurseloop_check" : "", |
| ((options & PCRE2_ENDANCHORED) != 0)? " endanchored" : "", |
| ((options & PCRE2_NO_JIT) != 0)? " no_jit" : "", |
| ((options & PCRE2_NO_UTF_CHECK) != 0)? " no_utf_check" : "", |
| ((options & PCRE2_NOTBOL) != 0)? " notbol" : "", |
| ((options & PCRE2_NOTEMPTY) != 0)? " notempty" : "", |
| ((options & PCRE2_NOTEMPTY_ATSTART) != 0)? " notempty_atstart" : "", |
| ((options & PCRE2_NOTEOL) != 0)? " noteol" : "", |
| ((options & PCRE2_PARTIAL_HARD) != 0)? " partial_hard" : "", |
| ((options & PCRE2_PARTIAL_SOFT) != 0)? " partial_soft" : ""); |
| } |
| #endif /* SUPPORT_PCRE2_8 */ |
| |
| |
| |
| /************************************************* |
| * Open file for save/load commands * |
| *************************************************/ |
| |
| /* This function decodes the file name and opens the file. |
| |
| Arguments: |
| buffptr point after the #command |
| mode open mode |
| fptr points to the FILE variable |
| name name of # command |
| |
| Returns: PR_OK or PR_ABEND |
| */ |
| |
| static int |
| open_file(uint8_t *buffptr, const char *mode, FILE **fptr, const char *name) |
| { |
| char *endf; |
| char *filename = (char *)buffptr; |
| while (isspace((unsigned char)*filename)) filename++; |
| endf = filename + strlen(filename); |
| while (endf > filename && isspace((unsigned char)endf[-1])) endf--; |
| |
| if (endf == filename) |
| { |
| fprintf(outfile, "** File name expected after %s\n", name); |
| return PR_ABEND; |
| } |
| |
| *endf = 0; |
| *fptr = fopen((const char *)filename, mode); |
| if (*fptr == NULL) |
| { |
| fprintf(outfile, "** Failed to open \"%s\": %s\n", filename, strerror(errno)); |
| return PR_ABEND; |
| } |
| |
| return PR_OK; |
| } |
| |
| |
| |
| |
| /************************************************* |
| * Substitute case callout transform * |
| *************************************************/ |
| |
| /* Function to implement our test-only custom case mappings. |
| To ease implementation, we only work in the ASCII range (so that we don't need |
| to read & write UTF sequences). |
| However, we aim to implement case mappings which fairly well represent the range |
| of interesting behaviours that exist for Unicode codepoints. */ |
| |
| static BOOL |
| case_transform(int to_case, int num_in, int *num_read, int *num_write, |
| uint32_t *c1, uint32_t *c2) |
| { |
| /* Let's have one character which aborts the substitution. */ |
| if (*c1 == CHAR_EXCLAMATION_MARK) return FALSE; |
| |
| /* Default behaviour is to read one character, and write back that same one |
| character (treating all characters as "uncased"). */ |
| *num_read = *num_write = 1; |
| |
| /* Add a normal case pair 'a' (l) <-> 'B' (t,u). Standard ASCII letter |
| behaviour, but with switched letters for testing. */ |
| if (*c1 == CHAR_a && to_case != PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = CHAR_B; |
| else if (*c1 == CHAR_B && to_case == PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = CHAR_a; |
| |
| /* Add a titlecased triplet 'd' (l) <-> 'D' (t) <-> 'Z' (u). Example: the |
| 'dz'/'Dz'/'DZ' ligature character ("Latin Small Letter DZ" <-> "Latin Capital |
| Letter D with Small Letter Z" <-> "Latin Capital Letter DZ"). */ |
| else if (*c1 == CHAR_d && to_case != PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = (to_case == PCRE2_SUBSTITUTE_CASE_TITLE_FIRST)? CHAR_D : CHAR_Z; |
| else if (*c1 == CHAR_D && to_case != PCRE2_SUBSTITUTE_CASE_TITLE_FIRST) |
| *c1 = (to_case == PCRE2_SUBSTITUTE_CASE_LOWER)? CHAR_d : CHAR_Z; |
| else if (*c1 == CHAR_Z && to_case != PCRE2_SUBSTITUTE_CASE_UPPER) |
| *c1 = (to_case == PCRE2_SUBSTITUTE_CASE_LOWER)? CHAR_d : CHAR_D; |
| |
| /* Expands when uppercased. Example: Esszet 'f' <-> 'SS'. */ |
| else if (*c1 == CHAR_f && to_case != PCRE2_SUBSTITUTE_CASE_LOWER) |
| { |
| *c1 = CHAR_S; |
| *c2 = CHAR_S; |
| *num_write = 2; |
| } |
| else if (*c1 == CHAR_s && to_case != PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = CHAR_S; |
| else if (*c1 == CHAR_S && to_case == PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = CHAR_s; |
| |
| /* Expanding and contracting characters, 'o' <-> 'OO'. You can get this purely |
| due to UTF-8 encoding length, for example uppercase Omega (3 bytes in UTF-8) |
| lowercases to 2 bytes in UTF-8. */ |
| else if (num_in == 2 && *c1 == CHAR_O && *c2 == CHAR_O && |
| to_case == PCRE2_SUBSTITUTE_CASE_LOWER) |
| { |
| *c1 = CHAR_o; |
| *num_read = 2; |
| } |
| else if (*c1 == CHAR_o && to_case != PCRE2_SUBSTITUTE_CASE_LOWER) |
| { |
| *c1 = CHAR_O; |
| *c2 = CHAR_O; |
| *num_write = 2; |
| } |
| else if (num_in == 2 && *c1 == CHAR_p && *c2 == CHAR_p && |
| to_case != PCRE2_SUBSTITUTE_CASE_LOWER) |
| { |
| *c1 = CHAR_P; |
| *num_read = 2; |
| } |
| else if (*c1 == CHAR_P && to_case == PCRE2_SUBSTITUTE_CASE_LOWER) |
| { |
| *c1 = CHAR_p; |
| *c2 = CHAR_p; |
| *num_write = 2; |
| } |
| |
| /* Use 'l' -> 'Mn' or 'MN' as an expanding ligature, like 'fi' -> 'Fi' -> |
| 'FI'. */ |
| else if (*c1 == CHAR_l && to_case != PCRE2_SUBSTITUTE_CASE_LOWER) |
| { |
| *c1 = CHAR_M; |
| *c2 = (to_case == PCRE2_SUBSTITUTE_CASE_TITLE_FIRST)? CHAR_n : CHAR_N; |
| *num_write = 2; |
| } |
| else if (*c1 == CHAR_M && to_case == PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = CHAR_m; |
| else if (*c1 == CHAR_m && to_case != PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = CHAR_M; |
| else if (*c1 == CHAR_N && to_case == PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = CHAR_n; |
| else if (*c1 == CHAR_n && to_case != PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = CHAR_N; |
| |
| /* An example of a context-dependent mapping, the Greek Sigma. It lowercases |
| depending on the following character. Use 'c'/'k' -> 'K'. */ |
| else if ((*c1 == CHAR_c || *c1 == CHAR_k) && |
| to_case != PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = CHAR_K; |
| else if (*c1 == CHAR_K && to_case == PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = (num_in == 1 || *c2 == CHAR_SPACE)? CHAR_c : CHAR_k; |
| |
| /* An example of a context-dependent multi mapping, the Dutch IJ. When those |
| letters appear together, they titlecase 'ij' (l) <-> 'IJ' (t) <-> 'IJ' (u). |
| Namely, English titlecasing of 'ijnssel' would be 'Ijnssel' (just uppercase the |
| first letter), but the Dutch rule is 'IJnssel'. */ |
| else if (num_in == 2 && (*c1 == CHAR_i || *c1 == CHAR_I) && |
| (*c2 == CHAR_j || *c2 == CHAR_J) && |
| to_case == PCRE2_SUBSTITUTE_CASE_TITLE_FIRST) |
| { |
| *c1 = CHAR_I; |
| *c2 = CHAR_J; |
| *num_read = 2; |
| *num_write = 2; |
| } |
| else if (*c1 == CHAR_i && to_case != PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = CHAR_I; |
| else if (*c1 == CHAR_I && to_case == PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = CHAR_i; |
| else if (*c1 == CHAR_j && to_case != PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = CHAR_J; |
| else if (*c1 == CHAR_J && to_case == PCRE2_SUBSTITUTE_CASE_LOWER) |
| *c1 = CHAR_j; |
| |
| return TRUE; |
| } |
| |
| |
| |
| /************************************************* |
| * Show an entire ovector * |
| *************************************************/ |
| |
| /* This function is called after partial matching or match failure, when the |
| "allvector" modifier is set. It is a means of checking the contents of the |
| entire ovector, to ensure no modification of fields that should be unchanged. |
| |
| Arguments: |
| ovector points to the ovector |
| oveccount number of pairs |
| |
| Returns: nothing |
| */ |
| |
| static void |
| show_ovector(PCRE2_SIZE *ovector, uint32_t oveccount) |
| { |
| uint32_t i; |
| for (i = 0; i < 2*oveccount; i += 2) |
| { |
| PCRE2_SIZE start = ovector[i]; |
| PCRE2_SIZE end = ovector[i+1]; |
| |
| fprintf(outfile, "%2d: ", i/2); |
| if (start == PCRE2_UNSET && end == PCRE2_UNSET) |
| fprintf(outfile, "<unset>\n"); |
| else if (start == JUNK_OFFSET && end == JUNK_OFFSET) |
| fprintf(outfile, "<unchanged>\n"); |
| else |
| fprintf(outfile, "%ld %ld\n", (unsigned long int)start, |
| (unsigned long int)end); |
| } |
| } |
| |
| |
| |
| /************************************************* |
| * Mode-dependent code * |
| *************************************************/ |
| |
| /* All the mode-independent utilities should go above this section, so that |
| the mode-dependent code can use them. |
| |
| The structure is: |
| main |
| -> calls into usage, command line parsing, top-level dispatch |
| -> calls into mode-dependent code to handle input lines |
| -> calls into mode-independent utilities |
| |
| The ordering of the code blocks is therefore: |
| - mode-independent utilities (ABOVE THIS SECTION) |
| - mode-dependent code to handle input lines (THIS SECTION) |
| - usage, command line parsing, top-level dispatch (NEXT SECTION) |
| - main (AT THE BOTTOM) |
| */ |
| |
| /* --- Repeated pre-processor inclusions to build the mode-dependent code -- */ |
| |
#undef PCRE2_SUFFIX

/* One pass per supported code unit width. Each pass defines the width and a
PCRE2_SUFFIX macro that hands the name and the width to G(), includes the two
mode-dependent files, and then undefines both macros again so that the next
pass can redefine them. */

#ifdef SUPPORT_PCRE2_8
#define PCRE2_CODE_UNIT_WIDTH 8
#define PCRE2_SUFFIX(a) G(a,8)
#include "pcre2_intmodedep.h"
#include "pcre2test_inc.h"
#undef PCRE2_CODE_UNIT_WIDTH
#undef PCRE2_SUFFIX
#endif

#ifdef SUPPORT_PCRE2_16
#define PCRE2_CODE_UNIT_WIDTH 16
#define PCRE2_SUFFIX(a) G(a,16)
#include "pcre2_intmodedep.h"
#include "pcre2test_inc.h"
#undef PCRE2_CODE_UNIT_WIDTH
#undef PCRE2_SUFFIX
#endif

#ifdef SUPPORT_PCRE2_32
#define PCRE2_CODE_UNIT_WIDTH 32
#define PCRE2_SUFFIX(a) G(a,32)
#include "pcre2_intmodedep.h"
#include "pcre2test_inc.h"
#undef PCRE2_CODE_UNIT_WIDTH
#undef PCRE2_SUFFIX
#endif

/* A final inclusion with a width of zero; pcre2test_inc.h is not included
this time. */

#define PCRE2_CODE_UNIT_WIDTH 0
#include "pcre2_intmodedep.h"   /* Clear out the stale macros */
#undef PCRE2_CODE_UNIT_WIDTH

/* From here on, PCRE2_SUFFIX leaves a name unchanged. */

#define PCRE2_SUFFIX(a) a
| |
/* --------------------------- Static variables ---------------------------- */

/* Declared after mode-dependent code. This holds the code unit width that is
currently being tested. The DISPATCH macro compares it with the
PCRE2TEST_MODE_xx values to choose a width-specific function, and the -8, -16,
and -32 command line options in main() change it. */

static int test_mode = DEFAULT_TEST_MODE;
| |
/* -------------------- Mode-dependent dispatch helper --------------------- */

/* DISPATCH(opt_ret, fname, fargs) expands to a call of a width-specific
function. The function name is made by handing fname and a bit width to G()
(presumably a token-pasting helper defined elsewhere - confirm), fargs is the
parenthesized argument list, and opt_ret is either "return" or empty, according
to whether the caller passes the result on. When more than one width is
compiled in, the run-time value of test_mode selects the function. */

/* When there are three supported bit widths, use a three-way ternary. */

#if defined(SUPPORT_PCRE2_8) && defined(SUPPORT_PCRE2_16) && defined(SUPPORT_PCRE2_32)

#define DISPATCH(opt_ret, fname, fargs) opt_ret \
  ((test_mode == PCRE2TEST_MODE_8)? G(fname,8) fargs : \
   (test_mode == PCRE2TEST_MODE_16)? G(fname,16) fargs : \
   G(fname,32) fargs)

#elif (defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + \
       defined(SUPPORT_PCRE2_32)) == 2

/* With some macro trickery, we can make a single definition work to dispatch
between any two bit widths. */

/* BITONE is the width that is tested for explicitly; BITTWO is the one used
when that test fails. */

#if defined(SUPPORT_PCRE2_32) && defined(SUPPORT_PCRE2_16)
#define BITONE 32
#define BITTWO 16
#elif defined(SUPPORT_PCRE2_32) && defined(SUPPORT_PCRE2_8)
#define BITONE 32
#define BITTWO 8
#else
#define BITONE 16
#define BITTWO 8
#endif

#define DISPATCH(opt_ret, fname, fargs) opt_ret \
  ((test_mode == G(PCRE2TEST_MODE_,BITONE))? G(fname,BITONE) fargs : \
   G(fname,BITTWO) fargs)

#else /* Only one bit width supported */

/* With a single width there is nothing to choose, so test_mode is not
consulted at all. */

#if defined(SUPPORT_PCRE2_32)
#define BITONE 32
#elif defined(SUPPORT_PCRE2_16)
#define BITONE 16
#else
#define BITONE 8
#endif

#define DISPATCH(opt_ret, fname, fargs) \
  opt_ret (G(fname,BITONE) fargs)

#endif
| |
| /* -------------------- Mode-dependent dispatch wrappers ------------------- */ |
| |
/* Call the width-specific pcre2_jit_compile_ function for the current test
mode with a NULL code pointer and PCRE2_JIT_TEST_ALLOC, and return its result. */

static int jit_compile_test(void)
{
DISPATCH(return, pcre2_jit_compile_, (NULL, PCRE2_JIT_TEST_ALLOC));
}
| |
/* Forward a configuration query to the width-specific pcre2_config_ function
for the current test mode, returning its result. */

static int pcre2_config(uint32_t what, void *where)
{
DISPATCH(return, pcre2_config_, (what, where));
}
| |
/* Forward to the width-specific config_str_ function for the current test
mode. Callers in this file pass a char buffer of VERSION_SIZE bytes as where. */

static void config_str(uint32_t what, char *where)
{
DISPATCH(, config_str_, (what, where));
}
| |
/* Forward to the width-specific decode_modifiers_ function for the current
test mode, returning its result. */

static BOOL decode_modifiers(uint8_t *p, int ctx, patctl *pctl, datctl *dctl)
{
DISPATCH(return, decode_modifiers_, (p, ctx, pctl, dctl));
}
| |
/* Forward to the width-specific print_error_message_file_ function for the
current test mode, returning its result. */

static BOOL
print_error_message_file(FILE *file, int errorcode, const char *before,
  const char *after, BOOL badcode_ok)
{
DISPATCH(return, print_error_message_file_, \
  (file, errorcode, before, after, badcode_ok));
}
| |
/* Forward to the width-specific process_command_ function for the current
test mode, returning its result. */

static int process_command(void)
{
DISPATCH(return, process_command_, ());
}
| |
/* Forward to the width-specific process_pattern_ function for the current
test mode, returning its result. */

static int process_pattern(void)
{
DISPATCH(return, process_pattern_, ());
}
| |
/* Forward to the width-specific have_active_pattern_ function for the current
test mode, returning its result. */

static BOOL have_active_pattern(void)
{
DISPATCH(return, have_active_pattern_, ());
}
| |
/* Forward to the width-specific free_active_pattern_ function for the current
test mode. */

static void free_active_pattern(void)
{
DISPATCH(, free_active_pattern_, ());
}
| |
/* Forward to the width-specific process_data_ function for the current test
mode, returning its result. */

static int process_data(void)
{
DISPATCH(return, process_data_, ());
}
| |
/* Forward to the width-specific init_globals_ function for the current test
mode. */

static void init_globals(void)
{
DISPATCH(, init_globals_, ());
}
| |
/* Forward to the width-specific free_globals_ function for the current test
mode. */

static void free_globals(void)
{
DISPATCH(, free_globals_, ());
}
| |
/* Forward to the width-specific unittest_ function for the current test
mode. */

static void unittest(void)
{
DISPATCH(, unittest_, ());
}
| |
/* The dispatch macros are not needed beyond this point. */

#undef DISPATCH
#undef BITONE
#undef BITTWO
| |
| |
| |
| /************************************************* |
| * Print PCRE2 version * |
| *************************************************/ |
| |
| static void |
| print_version(FILE *f, BOOL include_mode) |
| { |
| char buff[VERSION_SIZE]; |
| config_str(PCRE2_CONFIG_VERSION, buff); |
| fprintf(f, "PCRE2 version %s", buff); |
| if (include_mode) |
| { |
| fprintf(f, " (%d-bit)", test_mode); |
| } |
| fprintf(f, "\n"); |
| } |
| |
| |
| |
| /************************************************* |
| * Print Unicode version * |
| *************************************************/ |
| |
| static void |
| print_unicode_version(FILE *f) |
| { |
| char buff[VERSION_SIZE]; |
| config_str(PCRE2_CONFIG_UNICODE_VERSION, buff); |
| fprintf(f, "Unicode version %s", buff); |
| } |
| |
| |
| |
| /************************************************* |
| * Print JIT target * |
| *************************************************/ |
| |
| static void |
| print_jit_target(FILE *f) |
| { |
| char buff[VERSION_SIZE]; |
| config_str(PCRE2_CONFIG_JITTARGET, buff); |
| fputs(buff, f); |
| } |
| |
| |
| |
| /************************************************* |
| * Print newline configuration * |
| *************************************************/ |
| |
| /* Output is always to stdout. |
| |
| Arguments: |
| rc the return code from PCRE2_CONFIG_NEWLINE |
| isc TRUE if called from "-C newline" |
| Returns: nothing |
| */ |
| |
| static void |
| print_newline_config(uint32_t optval, BOOL isc) |
| { |
| if (!isc) printf(" Default newline sequence is "); |
| if (optval < sizeof(newlines)/sizeof(char *)) |
| printf("%s\n", newlines[optval]); |
| else |
| printf("a non-standard value: %d\n", optval); |
| } |
| |
| |
| |
/*************************************************
*               Usage function                   *
*************************************************/

/* Write the command line help text to stdout. The lines about readline() and
about the -8, -16, and -32 options depend on how the program was built.

Arguments:  none
Returns:    nothing
*/

static void
usage(void)
{
fputs(
  "Usage: pcre2test [options] [<input file> [<output file>]]\n\n"
  "Input and output default to stdin and stdout.\n"
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
  "If input is a terminal, readline() is used to read from it.\n"
#else
  "This version of pcre2test is not linked with readline().\n"
#endif
  "\nOptions:\n"
#ifdef SUPPORT_PCRE2_8
  " -8 use the 8-bit library\n"
#endif
#ifdef SUPPORT_PCRE2_16
  " -16 use the 16-bit library\n"
#endif
#ifdef SUPPORT_PCRE2_32
  " -32 use the 32-bit library\n"
#endif
  , stdout);

/* Options up to and including the list of -C arguments. */

fputs(
  " -ac set default pattern modifier PCRE2_AUTO_CALLOUT\n"
  " -AC as -ac, but also set subject 'callout_extra' modifier\n"
  " -b set default pattern modifier 'fullbincode'\n"
  " -C show PCRE2 compile-time options and exit\n"
  " -C arg show a specific compile-time option and exit with its\n"
  " value if numeric (else 0). The arg can be:\n"
  " backslash-C use of \\C is enabled [0, 1]\n"
  " bsr \\R type [ANYCRLF, ANY]\n"
  " ebcdic compiled for EBCDIC character code [0, 1]\n"
  " ebcdic-io if compiled for EBCDIC, whether pcre2test's input\n"
  " and output is EBCDIC or ASCII [0, 1]\n"
  " ebcdic-nl25 if compiled for EBCDIC, whether NL is 0x25 [0, 1]\n"
  " jit just-in-time compiler supported [0, 1]\n"
  " jitusable test JIT usability [0, 1, 2, 3]\n"
  " linksize internal link size [2, 3, 4]\n"
  " newline newline type [CR, LF, CRLF, ANYCRLF, ANY, NUL]\n"
  " pcre2-8 8 bit library support enabled [0, 1]\n"
  " pcre2-16 16 bit library support enabled [0, 1]\n"
  " pcre2-32 32 bit library support enabled [0, 1]\n"
  " unicode Unicode and UTF support enabled [0, 1]\n",
  stdout);

/* The remaining options. */

fputs(
  " -d set default pattern modifier 'debug'\n"
  " -dfa set default subject modifier 'dfa'\n"
  " -E preprocess input only (#if ... #endif)\n"
  " -error <n,m,..> show messages for error numbers, then exit\n"
  " -help show usage information\n"
  " -i set default pattern modifier 'info'\n"
  " -jit set default pattern modifier 'jit'\n"
  " -jitfast set default pattern modifier 'jitfast'\n"
  " -jitverify set default pattern modifier 'jitverify'\n"
  " -LM list pattern and subject modifiers, then exit\n"
  " -LP list non-script properties, then exit\n"
  " -LS list supported scripts, then exit\n"
  " -q quiet: do not output PCRE2 version number at start\n"
  " -pattern <s> set default pattern modifier fields\n"
  " -subject <s> set default subject modifier fields\n"
  " -S <n> set stack size to <n> mebibytes\n"
  " -t [<n>] time compilation and execution, repeating <n> times\n"
  " -tm [<n>] time execution (matching) only, repeating <n> times\n"
  " -T same as -t, but show total times at the end\n"
  " -TM same as -tm, but show total time at the end\n"
  " -malloc exercise malloc() failures\n"
  " -v|--version show PCRE2 version and exit\n",
  stdout);
}
| |
| |
| |
| /************************************************* |
| * Handle -C option * |
| *************************************************/ |
| |
| /* This option outputs configuration options and sets an appropriate return |
| code when asked for a single option. The code is abstracted into a separate |
| function because of its size. |
| |
| Most, but not all, of the data is independent of the test mode. |
| |
| Argument: an option name or NULL |
| Returns: the return code |
| */ |
| |
| static int |
| c_option(const char *arg) |
| { |
| uint32_t optval; |
| unsigned int i = COPTLISTCOUNT; |
| int rc, yield = 0; |
| |
| if (arg != NULL && arg[0] != '-') |
| { |
| for (i = 0; i < COPTLISTCOUNT; i++) |
| if (strcmp(arg, coptlist[i].name) == 0) break; |
| |
| if (i >= COPTLISTCOUNT) |
| { |
| fprintf(stderr, "** Unknown -C option \"%s\"\n", arg); |
| return 0; |
| } |
| |
| switch (coptlist[i].type) |
| { |
| case CONF_BSR: |
| (void)pcre2_config(coptlist[i].value, &optval); |
| printf("%s\n", (optval == PCRE2_BSR_ANYCRLF)? "ANYCRLF" : "ANY"); |
| break; |
| |
| case CONF_FIX: |
| yield = coptlist[i].value; |
| printf("%d\n", yield); |
| break; |
| |
| case CONF_INT: |
| (void)pcre2_config(coptlist[i].value, &yield); |
| printf("%d\n", yield); |
| break; |
| |
| case CONF_NL: |
| (void)pcre2_config(coptlist[i].value, &optval); |
| print_newline_config(optval, TRUE); |
| break; |
| |
| case CONF_JU: |
| rc = jit_compile_test(); |
| switch(rc) |
| { |
| case 0: yield = 0; break; |
| case PCRE2_ERROR_NOMEMORY: yield = 1; break; |
| case PCRE2_ERROR_JIT_UNSUPPORTED: yield = 2; break; |
| default: yield = 3; break; |
| } |
| printf("%d\n", yield); |
| break; |
| } |
| |
| /* For VMS, return the value by setting a symbol, for certain values only. This |
| is contributed code which the PCRE2 developers have no means of testing. */ |
| |
| #ifdef __VMS |
| |
| /* This is the original code provided by the first VMS contributor. */ |
| #ifdef NEVER |
| if (copytlist[i].type == CONF_FIX || coptlist[i].type == CONF_INT) |
| { |
| char ucname[16]; |
| strcpy(ucname, coptlist[i].name); |
| for (i = 0; ucname[i] != 0; i++) ucname[i] = toupper[ucname[i]]; |
| vms_setsymbol(ucname, 0, optval); |
| } |
| #endif |
| |
| /* This is the new code, provided by a second VMS contributor. */ |
| |
| if (coptlist[i].type == CONF_FIX || coptlist[i].type == CONF_INT) |
| { |
| char nam_buf[22], val_buf[4]; |
| $DESCRIPTOR(nam, nam_buf); |
| $DESCRIPTOR(val, val_buf); |
| |
| strcpy(nam_buf, coptlist[i].name); |
| nam.dsc$w_length = strlen(nam_buf); |
| sprintf(val_buf, "%d", yield); |
| val.dsc$w_length = strlen(val_buf); |
| lib$set_symbol(&nam, &val); |
| } |
| #endif /* __VMS */ |
| |
| return yield; |
| } |
| |
| /* No argument for -C: output all configuration information. */ |
| |
| print_version(stdout, FALSE); |
| printf("Compiled with\n"); |
| |
| #ifdef EBCDIC |
| printf(" EBCDIC code support: LF is 0x%02x\n", CHAR_LF); |
| #if defined NATIVE_ZOS |
| printf(" EBCDIC code page %s or similar\n", pcrz_cpversion()); |
| #endif |
| #if EBCDIC_IO |
| printf(" Input/output for pcre2test is EBCDIC\n"); |
| #else |
| printf(" Input/output for pcre2test is ASCII, not EBCDIC\n"); |
| #endif |
| #endif |
| |
| (void)pcre2_config(PCRE2_CONFIG_COMPILED_WIDTHS, &optval); |
| if (optval & 1) printf(" 8-bit support\n"); |
| if (optval & 2) printf(" 16-bit support\n"); |
| if (optval & 4) printf(" 32-bit support\n"); |
| |
| #ifdef SUPPORT_VALGRIND |
| printf(" Valgrind support\n"); |
| #endif |
| |
| (void)pcre2_config(PCRE2_CONFIG_UNICODE, &optval); |
| if (optval != 0) |
| { |
| printf(" UTF and UCP support ("); |
| print_unicode_version(stdout); |
| printf(")\n"); |
| } |
| else printf(" No Unicode support\n"); |
| |
| (void)pcre2_config(PCRE2_CONFIG_JIT, &optval); |
| if (optval != 0) |
| { |
| printf(" Just-in-time compiler support\n"); |
| printf(" Architecture: "); |
| print_jit_target(stdout); |
| printf("\n"); |
| |
| printf(" Can allocate executable memory: "); |
| rc = jit_compile_test(); |
| switch(rc) |
| { |
| case 0: |
| printf("Yes\n"); |
| break; |
| |
| case PCRE2_ERROR_NOMEMORY: |
| printf("No (so cannot work)\n"); |
| break; |
| |
| default: |
| printf("\n** Unexpected return %d from " |
| "pcre2_jit_compile(NULL, PCRE2_JIT_TEST_ALLOC)\n", rc); |
| printf("** Should not occur\n"); |
| yield = 1; |
| break; |
| } |
| } |
| else |
| { |
| printf(" No just-in-time compiler support\n"); |
| } |
| |
| (void)pcre2_config(PCRE2_CONFIG_NEWLINE, &optval); |
| print_newline_config(optval, FALSE); |
| (void)pcre2_config(PCRE2_CONFIG_BSR, &optval); |
| printf(" \\R matches %s\n", |
| (optval == PCRE2_BSR_ANYCRLF)? "CR, LF, or CRLF only" : |
| "all Unicode newlines"); |
| (void)pcre2_config(PCRE2_CONFIG_NEVER_BACKSLASH_C, &optval); |
| printf(" \\C is %ssupported\n", optval? "not ":""); |
| printf(" Internal link size\n"); |
| (void)pcre2_config(PCRE2_CONFIG_LINKSIZE, &optval); |
| printf(" Requested = %d\n", optval); |
| (void)pcre2_config(PCRE2_CONFIG_EFFECTIVE_LINKSIZE, &optval); |
| printf(" Effective = %d\n", optval); |
| (void)pcre2_config(PCRE2_CONFIG_PARENSLIMIT, &optval); |
| printf(" Parentheses nest limit = %d\n", optval); |
| (void)pcre2_config(PCRE2_CONFIG_HEAPLIMIT, &optval); |
| printf(" Default heap limit = %d kibibytes\n", optval); |
| (void)pcre2_config(PCRE2_CONFIG_MATCHLIMIT, &optval); |
| printf(" Default match limit = %d\n", optval); |
| (void)pcre2_config(PCRE2_CONFIG_DEPTHLIMIT, &optval); |
| printf(" Default depth limit = %d\n", optval); |
| |
| #if defined SUPPORT_LIBREADLINE |
| printf(" pcre2test has libreadline support\n"); |
| #elif defined SUPPORT_LIBEDIT |
| printf(" pcre2test has libedit support\n"); |
| #else |
| printf(" pcre2test has neither libreadline nor libedit support\n"); |
| #endif |
| |
| return yield; |
| } |
| |
| |
/*************************************************
*     Format one property/script list item       *
*************************************************/

/* Build the text for one property or script from its list of names. One name
is put first, and when there is more than one name, the others follow in
parentheses, separated by commas, in their original order.

Arguments:
  ff          name offsets into the utt_names table, ended by a negative value
  buff        where to build the text; no length check is done here
  isscript    TRUE when formatting a script, which makes the first
                3-character name the one that is put first

Returns:      nothing
*/

#ifdef SUPPORT_UNICODE
static void
format_list_item(int16_t *ff, char *buff, BOOL isscript)
{
int total = 0;
int lead = 0;
size_t leadlen = 0;
const char *leadname = "";
BOOL settled = FALSE;
char *out = buff;

/* A single scan both counts the names and picks the one to put first. For
scripts, the first 3-character name ends the search; failing that, and for
non-scripts, the earliest of the longest names is used. */

for (; ff[total] >= 0; total++)
  {
  const char *name;
  size_t namelen;

  if (settled) continue;
  name = PRIV(utt_names) + ff[total];
  namelen = strlen(name);

  if (isscript && namelen == 3) settled = TRUE;
  else if (namelen <= leadlen) continue;

  lead = total;
  leadlen = namelen;
  leadname = name;
  }

strcpy(out, leadname);
out += leadlen;

if (total <= 1) return;

/* Append the remaining names as a parenthesized list. */

  {
  const char *sep = " (";
  for (int i = 0; i < total; i++)
    {
    if (i != lead)
      {
      out += sprintf(out, "%s%s", sep, PRIV(utt_names) + ff[i]);
      sep = ", ";
      }
    }
  strcpy(out, ")");
  }
}
#endif  /* SUPPORT_UNICODE */
| |
| |
| |
| /************************************************* |
| * Display scripts or properties * |
| *************************************************/ |
| |
| #define MAX_SYNONYMS 5 |
| |
| static void |
| display_properties(BOOL wantscripts) |
| { |
| #ifndef SUPPORT_UNICODE |
| (void)wantscripts; |
| printf("** This version of PCRE2 was compiled without Unicode support.\n"); |
| #else |
| |
| uint16_t seentypes[1024]; |
| uint16_t seenvalues[1024]; |
| int seencount = 0; |
| int16_t found[256][MAX_SYNONYMS + 1]; |
| int fc = 0; |
| int colwidth = 40; |
| int n = wantscripts? ucp_Script_Count : ucp_Bprop_Count; |
| |
| for (size_t i = 0; i < PRIV(utt_size); i++) |
| { |
| int k; |
| int m = 0; |
| int16_t *fv; |
| const ucp_type_table *t = PRIV(utt) + i; |
| unsigned int value = t->value; |
| |
| if (wantscripts) |
| { |
| if (t->type != PT_SC && t->type != PT_SCX) continue; |
| } |
| else |
| { |
| if (t->type != PT_BOOL) continue; |
| } |
| |
| for (k = 0; k < seencount; k++) |
| { |
| if (t->type == seentypes[k] && t->value == seenvalues[k]) break; |
| } |
| if (k < seencount) continue; |
| |
| seentypes[seencount] = t->type; |
| seenvalues[seencount++] = t->value; |
| |
| fv = found[fc++]; |
| fv[m++] = t->name_offset; |
| |
| for (size_t j = i + 1; j < PRIV(utt_size); j++) |
| { |
| const ucp_type_table *tt = PRIV(utt) + j; |
| if (tt->type != t->type || tt->value != value) continue; |
| if (m >= MAX_SYNONYMS) |
| printf("** Too many synonyms: %s ignored\n", |
| PRIV(utt_names) + tt->name_offset); |
| else fv[m++] = tt->name_offset; |
| } |
| |
| fv[m] = -1; |
| } |
| |
| printf("-------------------------- SUPPORTED %s --------------------------\n\n", |
| wantscripts? "SCRIPTS" : "PROPERTIES"); |
| |
| if (!wantscripts) printf( |
| "This release of PCRE2 supports Unicode's general category properties such\n" |
| "as Lu (upper case letter), bi-directional properties such as Bidi_Class,\n" |
| "and the following binary (yes/no) properties:\n\n"); |
| |
| |
| for (int k = 0; k < (n+1)/2; k++) |
| { |
| int x; |
| char buff1[128]; |
| char buff2[128]; |
| |
| format_list_item(found[k], buff1, wantscripts); |
| x = k + (n+1)/2; |
| if (x < n) format_list_item(found[x], buff2, wantscripts); |
| else buff2[0] = 0; |
| |
| x = printf("%s", buff1); |
| while (x++ < colwidth) printf(" "); |
| printf("%s\n", buff2); |
| } |
| |
| #endif /* SUPPORT_UNICODE */ |
| } |
| |
| |
| |
| /************************************************* |
| * Display one modifier * |
| *************************************************/ |
| |
| static void |
| display_one_modifier(modstruct *m, BOOL for_pattern) |
| { |
| uint32_t c = (!for_pattern && (m->which == MOD_PND || m->which == MOD_PNDP))? |
| '*' : ' '; |
| printf("%c%s", c, m->name); |
| for (size_t i = 0; i < C1MODLISTCOUNT; i++) |
| { |
| if (strcmp(m->name, c1modlist[i].fullname) == 0) |
| printf(" (%c)", c1modlist[i].onechar); |
| } |
| } |
| |
| |
| |
| /************************************************* |
| * Display pattern or subject modifiers * |
| *************************************************/ |
| |
| /* In order to print in two columns, first scan without printing to get a list |
| of the modifiers that are required. |
| |
| Arguments: |
| for_pattern TRUE for pattern modifiers, FALSE for subject modifiers |
| title string to be used in title |
| |
| Returns: nothing |
| */ |
| |
| static void |
| display_selected_modifiers(BOOL for_pattern, const char *title) |
| { |
| uint32_t i, j; |
| uint32_t n = 0; |
| uint32_t list[MODLISTCOUNT]; |
| uint32_t extra[MODLISTCOUNT]; |
| |
| for (i = 0; i < MODLISTCOUNT; i++) |
| { |
| BOOL is_pattern = TRUE; |
| modstruct *m = modlist + i; |
| |
| switch (m->which) |
| { |
| case MOD_CTC: /* Compile context */ |
| case MOD_PAT: /* Pattern */ |
| case MOD_PATP: /* Pattern, OK for Perl-compatible test */ |
| break; |
| |
| /* The MOD_PND and MOD_PNDP modifiers are precisely those that affect |
| subjects, but can be given with a pattern. We list them as subject |
| modifiers, but marked with an asterisk.*/ |
| |
| case MOD_CTM: /* Match context */ |
| case MOD_DAT: /* Subject line */ |
| case MOD_DATP: /* Subject line, OK for Perl-compatible test */ |
| case MOD_PND: /* As PD, but not default pattern */ |
| case MOD_PNDP: /* As PND, OK for Perl-compatible test */ |
| is_pattern = FALSE; |
| break; |
| |
| default: printf("** Unknown type for modifier \"%s\"\n", m->name); |
| PCRE2_FALLTHROUGH /* Fall through */ |
| case MOD_PD: /* Pattern or subject */ |
| case MOD_PDP: /* As PD, OK for Perl-compatible test */ |
| is_pattern = for_pattern; |
| break; |
| } |
| |
| if (for_pattern == is_pattern) |
| { |
| extra[n] = 0; |
| for (size_t k = 0; k < C1MODLISTCOUNT; k++) |
| { |
| if (strcmp(m->name, c1modlist[k].fullname) == 0) |
| { |
| extra[n] += 4; |
| break; |
| } |
| } |
| list[n++] = i; |
| } |
| } |
| |
| /* Now print from the list in two columns. */ |
| |
| printf("-------------- %s MODIFIERS --------------\n", title); |
| |
| for (i = 0, j = (n+1)/2; i < (n+1)/2; i++, j++) |
| { |
| modstruct *m = modlist + list[i]; |
| display_one_modifier(m, for_pattern); |
| if (j < n) |
| { |
| size_t k = 27 - strlen(m->name) - extra[i]; |
| while (k-- > 0) printf(" "); |
| display_one_modifier(modlist + list[j], for_pattern); |
| } |
| printf("\n"); |
| } |
| } |
| |
| |
| |
| /************************************************* |
| * Display the list of modifiers * |
| *************************************************/ |
| |
| static void |
| display_modifiers(void) |
| { |
| printf( |
| "An asterisk on a subject modifier means that it may be given on a pattern\n" |
| "line, in order to apply to all subjects matched by that pattern. Modifiers\n" |
| "that are listed for both patterns and subjects have different effects in\n" |
| "each case.\n\n"); |
| display_selected_modifiers(TRUE, "PATTERN"); |
| printf("\n"); |
| display_selected_modifiers(FALSE, "SUBJECT"); |
| } |
| |
| |
| |
| /************************************************* |
| * Main Program * |
| *************************************************/ |
| |
| int |
| main(int argc, char **argv) |
| { |
| uint32_t yield = 0; |
| uint32_t op = 1; |
| BOOL notdone = TRUE; |
| BOOL quiet = FALSE; |
| BOOL showtotaltimes = FALSE; |
| BOOL skipping = FALSE; |
| BOOL skipping_endif = FALSE; |
| char *arg_subject = NULL; |
| char *arg_pattern = NULL; |
| char *arg_error = NULL; |
| |
| /* The offsets to the options and control bits fields of the pattern and data |
| control blocks must be the same so that common options and controls such as |
| "anchored" or "memory" can work for either of them from a single table entry. |
| We cannot test this till runtime because "offsetof" does not work in the |
| preprocessor. */ |
| |
| // TODO This comment above is not correct: we can test it at compile time, |
| // although it is true that it's not possible using the preprocessor. Use our |
| // new STATIC_ASSERT macro. |
| |
| if (PO(options) != DO(options) || PO(control) != DO(control) || |
| PO(control2) != DO(control2)) |
| { |
| fprintf(stderr, "** Coding error: " |
| "options and control offsets for pattern and data must be the same.\n"); |
| return 1; |
| } |
| |
| /* Get buffers from malloc() so that valgrind will check their misuse when |
| debugging. They grow automatically when very long lines are read. The 16- |
| and 32-bit buffers (pbuffer16, pbuffer32) are obtained only if needed. */ |
| |
| buffer = (uint8_t *)malloc(pbuffer8_size); |
| pbuffer8 = (uint8_t *)malloc(pbuffer8_size); |
| |
| /* The following _setmode() stuff is some Windows magic that tells its runtime |
| library to translate CRLF into a single LF character. At least, that's what |
| I've been told: never having used Windows I take this all on trust. Originally |
| it set 0x8000, but then I was advised that _O_BINARY was better. */ |
| |
| #if defined(_WIN32) || defined(WIN32) |
| _setmode( _fileno( stdout ), _O_BINARY ); |
| #endif |
| |
| /* Initialization that does not depend on the running mode. */ |
| |
| locale_name[0] = 0; |
| |
| memset(&def_patctl, 0, sizeof(patctl)); |
| def_patctl.convert_type = CONVERT_UNSET; |
| |
| memset(&def_datctl, 0, sizeof(datctl)); |
| def_datctl.oveccount = DEFAULT_OVECCOUNT; |
| def_datctl.copy_numbers[0] = -1; |
| def_datctl.get_numbers[0] = -1; |
| def_datctl.startend[0] = def_datctl.startend[1] = CFORE_UNSET; |
| def_datctl.cerror[0] = def_datctl.cerror[1] = CFORE_UNSET; |
| def_datctl.cfail[0] = def_datctl.cfail[1] = CFORE_UNSET; |
| |
| /* Scan command line options. */ |
| |
| while (argc > 1 && argv[op][0] == '-' && argv[op][1] != 0) |
| { |
| char *endptr; |
| char *arg = argv[op]; |
| unsigned long uli; |
| |
| /* List modifiers and exit. */ |
| |
| if (strcmp(arg, "-LM") == 0) |
| { |
| display_modifiers(); |
| goto EXIT; |
| } |
| |
| /* List properties and exit */ |
| |
| if (strcmp(arg, "-LP") == 0) |
| { |
| display_properties(FALSE); |
| goto EXIT; |
| } |
| |
| /* List scripts and exit */ |
| |
| if (strcmp(arg, "-LS") == 0) |
| { |
| display_properties(TRUE); |
| goto EXIT; |
| } |
| |
| /* Display and/or set return code for configuration options. */ |
| |
| if (strcmp(arg, "-C") == 0) |
| { |
| yield = c_option(argv[op + 1]); |
| goto EXIT; |
| } |
| |
| /* Select operating mode. */ |
| |
| if (strcmp(arg, "-8") == 0) |
| { |
| #ifdef SUPPORT_PCRE2_8 |
| test_mode = PCRE2TEST_MODE_8; |
| #else |
| fprintf(stderr, |
| "** This version of PCRE2 was built without 8-bit support\n"); |
| exit(1); |
| #endif |
| } |
| |
| else if (strcmp(arg, "-16") == 0) |
| { |
| #ifdef SUPPORT_PCRE2_16 |
| test_mode = PCRE2TEST_MODE_16; |
| #else |
| fprintf(stderr, |
| "** This version of PCRE2 was built without 16-bit support\n"); |
| exit(1); |
| #endif |
| } |
| |
| else if (strcmp(arg, "-32") == 0) |
| { |
| #ifdef SUPPORT_PCRE2_32 |
| test_mode = PCRE2TEST_MODE_32; |
| #else |
| fprintf(stderr, |
| "** This version of PCRE2 was built without 32-bit support\n"); |
| exit(1); |
| #endif |
| } |
| |
| /* Set preprocess-only (only handle #if ... #endif) */ |
| |
| else if (strcmp(arg, "-E") == 0) preprocess_only = TRUE; |
| |
| /* Set quiet (no version verification) */ |
| |
| else if (strcmp(arg, "-q") == 0) quiet = TRUE; |
| |
| /* Set system stack size */ |
| |
| else if (strcmp(arg, "-S") == 0 && argc > 2 && |
| ((uli = strtoul(argv[op+1], &endptr, 10)), *endptr == 0)) |
| { |
| #if defined(_WIN32) || defined(WIN32) || defined(__HAIKU__) || defined(NATIVE_ZOS) || defined(__VMS) |
| fprintf(stderr, "pcre2test: -S is not supported on this OS\n"); |
| exit(1); |
| #else |
| int rc = 0; |
| uint32_t stack_size; |
| struct rlimit rlim, rlim_old; |
| if (uli > INT32_MAX / (1024 * 1024)) |
| { |
| fprintf(stderr, "** Argument for -S is too big\n"); |
| exit(1); |
| } |
| stack_size = (uint32_t)uli; |
| getrlimit(RLIMIT_STACK, &rlim_old); |
| rlim = rlim_old; |
| rlim.rlim_cur = stack_size * 1024 * 1024; |
| if (rlim.rlim_max != RLIM_INFINITY && rlim.rlim_cur > rlim.rlim_max) |
| { |
| fprintf(stderr, |
| "pcre2test: requested stack size %luMiB is greater than hard limit ", |
| (unsigned long int)stack_size); |
| if (rlim.rlim_max % (1024*1024) == 0) |
| fprintf(stderr, "%luMiB\n", (unsigned long)(rlim.rlim_max/(1024*1024))); |
| else if (rlim.rlim_max % 1024 == 0) |
| fprintf(stderr, "%luKiB\n", (unsigned long)(rlim.rlim_max/1024)); |
| else |
| fprintf(stderr, "%lu bytes\n", (unsigned long)(rlim.rlim_max)); |
| exit(1); |
| } |
| if (rlim_old.rlim_cur != RLIM_INFINITY && rlim_old.rlim_cur <= INT32_MAX && |
| rlim.rlim_cur > rlim_old.rlim_cur) |
| rc = setrlimit(RLIMIT_STACK, &rlim); |
| if (rc != 0) |
| { |
| fprintf(stderr, "pcre2test: setting stack size %luMiB failed: %s\n", |
| (unsigned long int)stack_size, strerror(errno)); |
| exit(1); |
| } |
| op++; |
| argc--; |
| #endif |
| } |
| |
| /* Set some common pattern and subject controls */ |
| |
| else if (strcmp(arg, "-AC") == 0) |
| { |
| def_patctl.options |= PCRE2_AUTO_CALLOUT; |
| def_datctl.control2 |= CTL2_CALLOUT_EXTRA; |
| } |
| else if (strcmp(arg, "-ac") == 0) def_patctl.options |= PCRE2_AUTO_CALLOUT; |
| else if (strcmp(arg, "-b") == 0) def_patctl.control |= CTL_FULLBINCODE; |
| else if (strcmp(arg, "-d") == 0) def_patctl.control |= CTL_DEBUG; |
| else if (strcmp(arg, "-dfa") == 0) def_datctl.control |= CTL_DFA; |
| else if (strcmp(arg, "-i") == 0) def_patctl.control |= CTL_INFO; |
| else if (strcmp(arg, "-jit") == 0 || strcmp(arg, "-jitverify") == 0 || |
| strcmp(arg, "-jitfast") == 0) |
| { |
| if (arg[4] == 'v') def_patctl.control |= CTL_JITVERIFY; |
| else if (arg[4] == 'f') def_patctl.control |= CTL_JITFAST; |
| def_patctl.jit = JIT_DEFAULT; /* full & partial */ |
| #ifndef SUPPORT_JIT |
| fprintf(stderr, "** Warning: JIT support is not available: " |
| "-jit[fast|verify] calls functions that do nothing.\n"); |
| #endif |
| } |
| |
| /* Set timing parameters */ |
| |
| else if (strcmp(arg, "-t") == 0 || strcmp(arg, "-tm") == 0 || |
| strcmp(arg, "-T") == 0 || strcmp(arg, "-TM") == 0) |
| { |
| int both = arg[2] == 0; |
| showtotaltimes = arg[1] == 'T'; |
| if (argc > 2 && (uli = strtoul(argv[op+1], &endptr, 10), *endptr == 0)) |
| { |
| if (uli == 0) |
| { |
| fprintf(stderr, "** Argument for %s must not be zero\n", arg); |
| exit(1); |
| } |
| if (U32OVERFLOW(uli)) |
| { |
| fprintf(stderr, "** Argument for %s is too big\n", arg); |
| exit(1); |
| } |
| timeitm = (int)uli; |
| op++; |
| argc--; |
| } |
| else timeitm = LOOPREPEAT; |
| if (both) timeit = timeitm; |
| } |
| |
| /* Set malloc testing */ |
| |
| else if (strcmp(arg, "-malloc") == 0) |
| { |
| malloc_testing = TRUE; |
| } |
| |
| /* Give help */ |
| |
| else if (strcmp(arg, "-help") == 0 || |
| strcmp(arg, "--help") == 0) |
| { |
| usage(); |
| goto EXIT; |
| } |
| |
| /* Show version */ |
| |
| else if (memcmp(arg, "-v", 2) == 0 || |
| strcmp(arg, "--version") == 0) |
| { |
| print_version(stdout, FALSE); |
| goto EXIT; |
| } |
| |
| /* The following options save their data for processing once we know what |
| the running mode is. */ |
| |
| else if (strcmp(arg, "-error") == 0) |
| { |
| arg_error = argv[op+1]; |
| goto CHECK_VALUE_EXISTS; |
| } |
| |
| else if (strcmp(arg, "-subject") == 0) |
| { |
| arg_subject = argv[op+1]; |
| goto CHECK_VALUE_EXISTS; |
| } |
| |
| else if (strcmp(arg, "-pattern") == 0) |
| { |
| arg_pattern = argv[op+1]; |
| CHECK_VALUE_EXISTS: |
| if (argc <= 2) |
| { |
| fprintf(stderr, "** Missing value for %s\n", arg); |
| yield = 1; |
| goto EXIT; |
| } |
| op++; |
| argc--; |
| } |
| |
| /* Unrecognized option */ |
| |
| else |
| { |
| fprintf(stderr, "** Unknown or malformed option \"%s\"\n", arg); |
| usage(); |
| yield = 1; |
| goto EXIT; |
| } |
| op++; |
| argc--; |
| } |
| |
| /* If -error was present, get the error numbers, show the messages, and exit. |
| We wait to do this until we know which mode we are in. */ |
| |
| if (arg_error != NULL) |
| { |
| int errcode; |
| char *endptr; |
| long li; |
| |
| /* Loop along a list of error numbers. */ |
| |
| for (;;) |
| { |
| li = strtol(arg_error, &endptr, 10); |
| if (S32OVERFLOW(li) || (*endptr != 0 && *endptr != ',')) |
| { |
| fprintf(stderr, "** \"%s\" is not a valid error number list\n", arg_error); |
| yield = 1; |
| goto EXIT; |
| } |
| errcode = (int)li; |
| printf("Error %d: ", errcode); |
| print_error_message_file(stdout, errcode, "", "\n", TRUE); |
| if (*endptr == 0) goto EXIT; |
| arg_error = endptr + 1; |
| } |
| |
| PCRE2_UNREACHABLE(); /* Control never reaches here */ |
| } /* End of -error handling */ |
| |
| /* Initialize things that cannot be done until we know which test mode we are |
| running in. */ |
| |
| max_oveccount = DEFAULT_OVECCOUNT; |
| |
| /* Initialise the globals for the current mode. */ |
| |
| init_globals(); |
| |
| /* Perform additional edge-case and error-handling tests of public API |
| functions, which wouldn't otherwise be covered by the standard use of the API |
| in pcre2test. */ |
| |
| unittest(); |
| |
| /* Handle command line modifier settings, sending any error messages to |
| stderr. We need to know the mode before modifying the context, and it is tidier |
| to do them all in the same way. */ |
| |
| outfile = stderr; |
| if ((arg_pattern != NULL && |
| !decode_modifiers((uint8_t *)arg_pattern, CTX_DEFPAT, &def_patctl, NULL)) || |
| (arg_subject != NULL && |
| !decode_modifiers((uint8_t *)arg_subject, CTX_DEFDAT, NULL, &def_datctl))) |
| { |
| yield = 1; |
| goto EXIT; |
| } |
| |
| /* Sort out the input and output files, defaulting to stdin/stdout. */ |
| |
| infile = stdin; |
| outfile = stdout; |
| |
| if (argc > 1 && strcmp(argv[op], "-") != 0) |
| { |
| infile = fopen(argv[op], INPUT_MODE); |
| if (infile == NULL) |
| { |
| printf("** Failed to open \"%s\": %s\n", argv[op], strerror(errno)); |
| yield = 1; |
| goto EXIT; |
| } |
| } |
| |
| #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT) |
| if (INTERACTIVE(infile)) using_history(); |
| #endif |
| |
| if (argc > 2) |
| { |
| outfile = fopen(argv[op+1], OUTPUT_MODE); |
| if (outfile == NULL) |
| { |
| printf("** Failed to open \"%s\": %s\n", argv[op+1], strerror(errno)); |
| yield = 1; |
| goto EXIT; |
| } |
| } |
| |
| /* Output a heading line unless quiet, then process input lines. */ |
| |
| if (!quiet) print_version(outfile, TRUE); |
| |
| #ifdef SUPPORT_PCRE2_8 |
| preg.re_pcre2_code = NULL; |
| preg.re_match_data = NULL; |
| #endif |
| |
| while (notdone) |
| { |
| const uint8_t *p; |
| int rc = PR_OK; |
| BOOL expectdata = have_active_pattern(); |
| #ifdef SUPPORT_PCRE2_8 |
| expectdata |= preg.re_pcre2_code != NULL; |
| #endif |
| |
| if (extend_inputline(infile, buffer, expectdata? "data> " : " re> ") == NULL) |
| break; |
| |
| /* Pre-process input lines with #if...#endif. */ |
| |
| if (skipping_endif) |
| { |
| if (strncmp((char*)buffer, "#endif", 6) != 0 || |
| !(buffer[6] == 0 || isspace(buffer[6]))) |
| continue; |
| skipping_endif = FALSE; |
| } |
| |
| /* Begin processing the line. */ |
| |
| if (!INTERACTIVE(infile)) fprintf(outfile, "%s", (char *)buffer); |
| fflush(outfile); |
| p = buffer; |
| |
| if (preprocess_only && *p != '#') continue; |
| |
| /* If we have a pattern set up for testing, or we are skipping after a |
| compile failure, a blank line terminates this test. */ |
| |
| if (expectdata || skipping) |
| { |
| while (isspace(*p)) p++; |
| if (*p == 0) |
| { |
| #ifdef SUPPORT_PCRE2_8 |
| if (preg.re_pcre2_code != NULL) |
| { |
| regfree(&preg); |
| preg.re_pcre2_code = NULL; |
| preg.re_match_data = NULL; |
| } |
| #endif /* SUPPORT_PCRE2_8 */ |
| free_active_pattern(); |
| skipping = FALSE; |
| setlocale(LC_CTYPE, "C"); |
| } |
| |
| /* Otherwise, if we are not skipping, and the line is not a data comment |
| line starting with "\=", process a data line. */ |
| |
| else if (!skipping && !(p[0] == '\\' && p[1] == '=' && isspace(p[2]))) |
| { |
| rc = process_data(); |
| } |
| } |
| |
| /* We do not have a pattern set up for testing. Lines starting with # are |
| either comments or special commands. Blank lines are ignored. Otherwise, the |
| line must start with a valid delimiter. It is then processed as a pattern |
| line. The pattern remains in pbuffer8/16/32 after compilation, for use by |
| callouts. Under valgrind, make the unused part of the buffer undefined, to |
| catch overruns. */ |
| |
| else if (*p == '#') |
| { |
| if (isspace(p[1]) || p[1] == '!' || p[1] == 0) continue; |
| rc = process_command(); |
| } |
| |
| else if (strchr("/!\"'`%&-=_:;,@~", *p) != NULL) |
| { |
| rc = process_pattern(); |
| dfa_matched = 0; |
| } |
| |
| else |
| { |
| while (isspace(*p)) p++; |
| if (*p != 0) |
| { |
| fprintf(outfile, "** Invalid pattern delimiter '%c' (x%x).\n", *buffer, |
| *buffer); |
| rc = PR_SKIP; |
| } |
| } |
| |
| if (rc == PR_SKIP && !INTERACTIVE(infile)) skipping = TRUE; |
| else if (rc == PR_ENDIF) skipping_endif = TRUE; |
| else if (rc == PR_ABEND) |
| { |
| fprintf(outfile, "** pcre2test run abandoned\n"); |
| yield = 1; |
| goto EXIT; |
| } |
| } |
| |
| /* Finish off a normal run. */ |
| |
| if (skipping_endif) |
| { |
| fprintf(outfile, "** Expected #endif\n"); |
| yield = 1; |
| goto EXIT; |
| } |
| |
| if (INTERACTIVE(infile)) fprintf(outfile, "\n"); |
| |
| if (showtotaltimes) |
| { |
| const char *pad = ""; |
| fprintf(outfile, "--------------------------------------\n"); |
| if (timeit > 0) |
| { |
| fprintf(outfile, "Total compile time %8.2f microseconds\n", |
| ((1000000 / CLOCKS_PER_SEC) * (double)total_compile_time) / timeit); |
| if (total_jit_compile_time > 0) |
| fprintf(outfile, "Total JIT compile %8.2f microseconds\n", |
| ((1000000 / CLOCKS_PER_SEC) * (double)total_jit_compile_time) / \ |
| timeit); |
| pad = " "; |
| } |
| fprintf(outfile, "Total match time %s%8.2f microseconds\n", pad, |
| ((1000000 / CLOCKS_PER_SEC) * (double)total_match_time) / timeitm); |
| } |
| |
| |
| EXIT: |
| |
| #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT) |
| if (infile != NULL && INTERACTIVE(infile)) clear_history(); |
| #endif |
| |
| if (infile != NULL && infile != stdin) fclose(infile); |
| if (outfile != NULL && outfile != stdout) fclose(outfile); |
| |
| #ifdef SUPPORT_PCRE2_8 |
| if (preg.re_pcre2_code != NULL) regfree(&preg); |
| #endif |
| |
| free(buffer); |
| free(dbuffer); |
| free(pbuffer8); |
| #ifdef SUPPORT_PCRE2_16 |
| free(pbuffer16); |
| #endif |
| #ifdef SUPPORT_PCRE2_32 |
| free(pbuffer32); |
| #endif |
| free(dfa_workspace); |
| free(tables3); |
| free_globals(); |
| |
| #if defined(__VMS) |
| yield = SS$_NORMAL; /* Return values via DCL symbols */ |
| #endif |
| |
| return yield; |
| } |
| |
| /* End of pcre2test.c */ |