| /************************************************* |
| * Perl-Compatible Regular Expressions * |
| *************************************************/ |
| |
| /* PCRE is a library of functions to support regular expressions whose syntax |
| and semantics are as close as possible to those of the Perl 5 language. |
| |
| Written by Philip Hazel |
| This module by Zoltan Herczeg |
| Original API code Copyright (c) 1997-2012 University of Cambridge |
| New API code Copyright (c) 2016-2024 University of Cambridge |
| |
| ----------------------------------------------------------------------------- |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions are met: |
| |
| * Redistributions of source code must retain the above copyright notice, |
| this list of conditions and the following disclaimer. |
| |
| * Redistributions in binary form must reproduce the above copyright |
| notice, this list of conditions and the following disclaimer in the |
| documentation and/or other materials provided with the distribution. |
| |
| * Neither the name of the University of Cambridge nor the names of its |
| contributors may be used to endorse or promote products derived from |
| this software without specific prior written permission. |
| |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| POSSIBILITY OF SUCH DAMAGE. |
| ----------------------------------------------------------------------------- |
| */ |
| |
| /* XClass matching code. */ |
| |
| #ifdef SUPPORT_WIDE_CHARS |
| |
/* Operand aliases used when an xclass is compiled as part of an extended
   class (XCLASS_IS_ECLASS):
     ECLASS_CHAR_DATA   source from which the saved character is copied back
                        into TMP1 after a Unicode property lookup;
     ECLASS_STACK_DATA  accumulator whose lowest bit is OR-ed with 1 when the
                        class item matches. */
#define ECLASS_CHAR_DATA   STACK_TOP
#define ECLASS_STACK_DATA  STACK_LIMIT
| |
/* Rebases the character value held in TMP1.

   `charoffset` (a variable of the expanding function) records how much has
   already been subtracted from the character in TMP1. This macro emits a
   single ADD or SUB so that TMP1 becomes "character - value", then stores
   `value` in `charoffset`. Nothing is emitted when the offset is unchanged.

   The expansion is wrapped in do { } while (0) so that it is one statement:
   it can be used as the only statement of an if/else arm. Write it with a
   trailing semicolon. Note that `value` is evaluated more than once, so it
   must not have side effects. */
#define SET_CHAR_OFFSET(value) \
  do \
    { \
    if ((value) != charoffset) \
      { \
      if ((value) < charoffset) \
        OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(charoffset - (value))); \
      else \
        OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)((value) - charoffset)); \
      } \
    charoffset = (value); \
    } \
  while (0)
| |
/* Reads the next entry of a character list into `destination` and advances
   the read position.

   Uses two variables of the expanding function: `list_ind` (index of the
   list being read) and `next_char` (byte pointer to the next entry). Lists
   0 and 1 hold 16-bit entries, later lists hold 32-bit entries.

   The expansion is wrapped in do { } while (0) so that it is one statement
   and cannot capture or break a surrounding if/else. Write it with a
   trailing semicolon. */
#define READ_FROM_CHAR_LIST(destination) \
  do \
    { \
    if (list_ind <= 1) \
      { \
      (destination) = *(const uint16_t*)next_char; \
      next_char += 2; \
      } \
    else \
      { \
      (destination) = *(const uint32_t*)next_char; \
      next_char += 4; \
      } \
    } \
  while (0)
| |
/* Capacity of the buffers embedded in xclass_ranges.
   XCLASS_LOCAL_RANGES_SIZE is the number of [start, end] pairs that fit in
   the embedded range buffer; XCLASS_LOCAL_RANGES_LOG2_SIZE is its base-2
   logarithm and sizes the embedded search stack. */
#define XCLASS_LOCAL_RANGES_LOG2_SIZE 5
#define XCLASS_LOCAL_RANGES_SIZE (1 << XCLASS_LOCAL_RANGES_LOG2_SIZE)
| |
| typedef struct xclass_stack_item { |
| sljit_u32 first_item; |
| sljit_u32 last_item; |
| struct sljit_jump *jump; |
| } xclass_stack_item; |
| |
| typedef struct xclass_ranges { |
| size_t range_count; |
| /* Pointer to ranges. A stack area is provided when a small buffer is enough. */ |
| uint32_t *ranges; |
| uint32_t local_ranges[XCLASS_LOCAL_RANGES_SIZE * 2]; |
| /* Stack size must be log2(ranges / 2). */ |
| xclass_stack_item *stack; |
| xclass_stack_item local_stack[XCLASS_LOCAL_RANGES_LOG2_SIZE]; |
| } xclass_ranges; |
| |
| static void xclass_compute_ranges(compiler_common *common, PCRE2_SPTR cc, xclass_ranges *ranges) |
| { |
| DEFINE_COMPILER; |
| size_t range_count = 0, est_range_count; |
| size_t est_stack_size, tmp; |
| uint32_t type, list_ind; |
| uint32_t est_type; |
| uint32_t char_list_add, range_start, range_end; |
| const uint8_t *next_char; |
| const uint8_t *est_next_char; |
| #if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16) |
| BOOL utf = common->utf; |
| #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */ |
| |
| if (*cc == XCL_SINGLE || *cc == XCL_RANGE) |
| { |
| /* Only a few ranges are present. */ |
| do |
| { |
| type = *cc++; |
| SLJIT_ASSERT(type == XCL_SINGLE || type == XCL_RANGE); |
| GETCHARINCTEST(range_end, cc); |
| ranges->ranges[range_count] = range_end; |
| |
| if (type == XCL_RANGE) |
| { |
| GETCHARINCTEST(range_end, cc); |
| } |
| |
| ranges->ranges[range_count + 1] = range_end; |
| range_count += 2; |
| } |
| while (*cc != XCL_END); |
| |
| SLJIT_ASSERT(range_count <= XCLASS_LOCAL_RANGES_SIZE); |
| ranges->range_count = range_count; |
| return; |
| } |
| |
| SLJIT_ASSERT(cc[0] >= XCL_LIST); |
| #if PCRE2_CODE_UNIT_WIDTH == 8 |
| type = (uint32_t)(cc[0] << 8) | cc[1]; |
| cc += 2; |
| #else |
| type = cc[0]; |
| cc++; |
| #endif /* CODE_UNIT_WIDTH */ |
| |
| /* Align characters. */ |
| next_char = (const uint8_t*)common->start - (GET(cc, 0) << 1); |
| type &= XCL_TYPE_MASK; |
| |
| /* Estimate size. */ |
| est_next_char = next_char; |
| est_type = type; |
| est_range_count = 0; |
| list_ind = 0; |
| |
| while (est_type > 0) |
| { |
| uint32_t item_count = est_type & XCL_ITEM_COUNT_MASK; |
| |
| if (item_count == XCL_ITEM_COUNT_MASK) |
| { |
| if (list_ind <= 1) |
| { |
| item_count = *(const uint16_t*)est_next_char; |
| est_next_char += 2; |
| } |
| else |
| { |
| item_count = *(const uint32_t*)est_next_char; |
| est_next_char += 4; |
| } |
| } |
| |
| est_type >>= XCL_TYPE_BIT_LEN; |
| est_next_char += (size_t)item_count << (list_ind <= 1 ? 1 : 2); |
| list_ind++; |
| est_range_count += item_count + 1; |
| } |
| |
| if (est_range_count > XCLASS_LOCAL_RANGES_SIZE) |
| { |
| est_stack_size = 0; |
| tmp = est_range_count - 1; |
| |
| /* Compute log2(est_range_count) */ |
| while (tmp > 0) |
| { |
| est_stack_size++; |
| tmp >>= 1; |
| } |
| |
| ranges->stack = (xclass_stack_item*)SLJIT_MALLOC((sizeof(xclass_stack_item) * est_stack_size) |
| + ((sizeof(uint32_t) << 1) * (size_t)est_range_count), compiler->allocator_data); |
| |
| if (ranges->stack == NULL) |
| { |
| sljit_set_compiler_memory_error(compiler); |
| ranges->ranges = NULL; |
| return; |
| } |
| |
| ranges->ranges = (uint32_t*)(ranges->stack + est_stack_size); |
| } |
| |
| char_list_add = XCL_CHAR_LIST_LOW_16_ADD; |
| range_start = ~(uint32_t)0; |
| list_ind = 0; |
| |
| if ((type & XCL_BEGIN_WITH_RANGE) != 0) |
| range_start = XCL_CHAR_LIST_LOW_16_START; |
| |
| while (type > 0) |
| { |
| uint32_t item_count = type & XCL_ITEM_COUNT_MASK; |
| |
| if (item_count == XCL_ITEM_COUNT_MASK) |
| { |
| READ_FROM_CHAR_LIST(item_count); |
| SLJIT_ASSERT(item_count >= XCL_ITEM_COUNT_MASK); |
| } |
| |
| while (item_count > 0) |
| { |
| READ_FROM_CHAR_LIST(range_end); |
| |
| if ((range_end & XCL_CHAR_END) != 0) |
| { |
| range_end = char_list_add + (range_end >> XCL_CHAR_SHIFT); |
| |
| if (range_start == ~(uint32_t)0) |
| range_start = range_end; |
| |
| ranges->ranges[range_count] = range_start; |
| ranges->ranges[range_count + 1] = range_end; |
| range_count += 2; |
| range_start = ~(uint32_t)0; |
| } |
| else |
| range_start = char_list_add + (range_end >> XCL_CHAR_SHIFT); |
| |
| item_count--; |
| } |
| |
| list_ind++; |
| type >>= XCL_TYPE_BIT_LEN; |
| |
| if (range_start == ~(uint32_t)0) |
| { |
| if ((type & XCL_BEGIN_WITH_RANGE) != 0) |
| { |
| if (list_ind == 1) range_start = XCL_CHAR_LIST_HIGH_16_START; |
| #if PCRE2_CODE_UNIT_WIDTH == 32 |
| else if (list_ind == 2) range_start = XCL_CHAR_LIST_LOW_32_START; |
| else range_start = XCL_CHAR_LIST_HIGH_32_START; |
| #else |
| else range_start = XCL_CHAR_LIST_LOW_32_START; |
| #endif |
| } |
| } |
| else if ((type & XCL_BEGIN_WITH_RANGE) == 0) |
| { |
| if (list_ind == 1) range_end = XCL_CHAR_LIST_LOW_16_END; |
| else if (list_ind == 2) range_end = XCL_CHAR_LIST_HIGH_16_END; |
| #if PCRE2_CODE_UNIT_WIDTH == 32 |
| else if (list_ind == 3) range_end = XCL_CHAR_LIST_LOW_32_END; |
| else range_end = XCL_CHAR_LIST_HIGH_32_END; |
| #else |
| else range_end = XCL_CHAR_LIST_LOW_32_END; |
| #endif |
| |
| ranges->ranges[range_count] = range_start; |
| ranges->ranges[range_count + 1] = range_end; |
| range_count += 2; |
| range_start = ~(uint32_t)0; |
| } |
| |
| if (list_ind == 1) char_list_add = XCL_CHAR_LIST_HIGH_16_ADD; |
| #if PCRE2_CODE_UNIT_WIDTH == 32 |
| else if (list_ind == 2) char_list_add = XCL_CHAR_LIST_LOW_32_ADD; |
| else char_list_add = XCL_CHAR_LIST_HIGH_32_ADD; |
| #else |
| else char_list_add = XCL_CHAR_LIST_LOW_32_ADD; |
| #endif |
| } |
| |
| SLJIT_ASSERT(range_count > 0 && range_count <= (est_range_count << 1)); |
| SLJIT_ASSERT(next_char <= (const uint8_t*)common->start); |
| ranges->range_count = range_count; |
| } |
| |
| static void xclass_check_bitset(compiler_common *common, const sljit_u8 *bitset, jump_list **found, jump_list **backtracks) |
| { |
| DEFINE_COMPILER; |
| struct sljit_jump *jump; |
| |
| jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); |
| if (!optimize_class(common, bitset, (bitset[31] & 0x80) != 0, TRUE, found)) |
| { |
| OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); |
| OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); |
| OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)bitset); |
| OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0); |
| OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, TMP2, 0); |
| add_jump(compiler, found, JUMP(SLJIT_NOT_ZERO)); |
| } |
| |
| add_jump(compiler, backtracks, JUMP(SLJIT_JUMP)); |
| JUMPHERE(jump); |
| } |
| |
| #if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16) |
| |
| static void xclass_update_min_max(compiler_common *common, PCRE2_SPTR cc, sljit_u32 *min_ptr, sljit_u32 *max_ptr) |
| { |
| uint32_t type, list_ind, c; |
| sljit_u32 min = *min_ptr; |
| sljit_u32 max = *max_ptr; |
| uint32_t char_list_add; |
| const uint8_t *next_char; |
| BOOL utf = TRUE; |
| |
| /* This function is pointless without utf 8/16. */ |
| SLJIT_ASSERT(common->utf); |
| if (*cc == XCL_SINGLE || *cc == XCL_RANGE) |
| { |
| /* Only a few ranges are present. */ |
| do |
| { |
| type = *cc++; |
| SLJIT_ASSERT(type == XCL_SINGLE || type == XCL_RANGE); |
| GETCHARINCTEST(c, cc); |
| |
| if (c < min) |
| min = c; |
| |
| if (type == XCL_RANGE) |
| { |
| GETCHARINCTEST(c, cc); |
| } |
| |
| if (c > max) |
| max = c; |
| } |
| while (*cc != XCL_END); |
| |
| SLJIT_ASSERT(min <= MAX_UTF_CODE_POINT && max <= MAX_UTF_CODE_POINT && min <= max); |
| *min_ptr = min; |
| *max_ptr = max; |
| return; |
| } |
| |
| SLJIT_ASSERT(cc[0] >= XCL_LIST); |
| #if PCRE2_CODE_UNIT_WIDTH == 8 |
| type = (uint32_t)(cc[0] << 8) | cc[1]; |
| cc += 2; |
| #else |
| type = cc[0]; |
| cc++; |
| #endif /* CODE_UNIT_WIDTH */ |
| |
| /* Align characters. */ |
| next_char = (const uint8_t*)common->start - (GET(cc, 0) << 1); |
| type &= XCL_TYPE_MASK; |
| |
| SLJIT_ASSERT(type != 0); |
| |
| /* Detect minimum. */ |
| |
| /* Skip unused ranges. */ |
| list_ind = 0; |
| while ((type & (XCL_BEGIN_WITH_RANGE | XCL_ITEM_COUNT_MASK)) == 0) |
| { |
| type >>= XCL_TYPE_BIT_LEN; |
| list_ind++; |
| } |
| |
| SLJIT_ASSERT(list_ind <= 2); |
| switch (list_ind) |
| { |
| case 0: |
| char_list_add = XCL_CHAR_LIST_LOW_16_ADD; |
| c = XCL_CHAR_LIST_LOW_16_START; |
| break; |
| |
| case 1: |
| char_list_add = XCL_CHAR_LIST_HIGH_16_ADD; |
| c = XCL_CHAR_LIST_HIGH_16_START; |
| break; |
| |
| default: |
| char_list_add = XCL_CHAR_LIST_LOW_32_ADD; |
| c = XCL_CHAR_LIST_LOW_32_START; |
| break; |
| } |
| |
| if ((type & XCL_BEGIN_WITH_RANGE) != 0) |
| { |
| if (c < min) |
| min = c; |
| } |
| else |
| { |
| if ((type & XCL_ITEM_COUNT_MASK) == XCL_ITEM_COUNT_MASK) |
| { |
| if (list_ind <= 1) |
| c = *(const uint16_t*)(next_char + 2); |
| else |
| c = *(const uint32_t*)(next_char + 4); |
| } |
| else |
| { |
| if (list_ind <= 1) |
| c = *(const uint16_t*)next_char; |
| else |
| c = *(const uint32_t*)next_char; |
| } |
| |
| c = char_list_add + (c >> XCL_CHAR_SHIFT); |
| if (c < min) |
| min = c; |
| } |
| |
| /* Detect maximum. */ |
| |
| /* Skip intermediate ranges. */ |
| while (TRUE) |
| { |
| if ((type & XCL_ITEM_COUNT_MASK) == XCL_ITEM_COUNT_MASK) |
| { |
| if (list_ind <= 1) |
| { |
| c = *(const uint16_t*)next_char; |
| next_char += (c + 1) << 1; |
| } |
| else |
| { |
| c = *(const uint32_t*)next_char; |
| next_char += (c + 1) << 2; |
| } |
| } |
| else |
| next_char += (type & XCL_ITEM_COUNT_MASK) << (list_ind <= 1 ? 1 : 2); |
| |
| if ((type >> XCL_TYPE_BIT_LEN) == 0) |
| break; |
| |
| list_ind++; |
| type >>= XCL_TYPE_BIT_LEN; |
| } |
| |
| SLJIT_ASSERT(list_ind <= 2 && type != 0); |
| switch (list_ind) |
| { |
| case 0: |
| char_list_add = XCL_CHAR_LIST_LOW_16_ADD; |
| c = XCL_CHAR_LIST_LOW_16_END; |
| break; |
| |
| case 1: |
| char_list_add = XCL_CHAR_LIST_HIGH_16_ADD; |
| c = XCL_CHAR_LIST_HIGH_16_END; |
| break; |
| |
| default: |
| char_list_add = XCL_CHAR_LIST_LOW_32_ADD; |
| c = XCL_CHAR_LIST_LOW_32_END; |
| break; |
| } |
| |
| if ((type & XCL_ITEM_COUNT_MASK) != 0) |
| { |
| /* Type is reused as temporary. */ |
| if (list_ind <= 1) |
| type = *(const uint16_t*)(next_char - 2); |
| else |
| type = *(const uint32_t*)(next_char - 4); |
| |
| if (type & XCL_CHAR_END) |
| c = char_list_add + (type >> XCL_CHAR_SHIFT); |
| } |
| |
| if (c > max) |
| max = c; |
| |
| SLJIT_ASSERT(min <= MAX_UTF_CODE_POINT && max <= MAX_UTF_CODE_POINT && min <= max); |
| *min_ptr = min; |
| *max_ptr = max; |
| } |
| |
| #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */ |
| |
/* Bits of the `status` value of compile_xclass_matchingpath. */

/* The class is compiled as an item of an extended class: no character is
   read, and the result is OR-ed into ECLASS_STACK_DATA. */
#define XCLASS_IS_ECLASS (1 << 0)
#ifdef SUPPORT_UNICODE
/* The character is still needed after the Unicode record lookup, which
   overwrites TMP1. */
#define XCLASS_SAVE_CHAR (1 << 1)
/* The class tests the character type (general category). */
#define XCLASS_HAS_TYPE (1 << 2)
/* The class tests the script. */
#define XCLASS_HAS_SCRIPT (1 << 3)
/* The class tests the script extensions. */
#define XCLASS_HAS_SCRIPT_EXTENSION (1 << 4)
/* The class tests a boolean property. */
#define XCLASS_HAS_BOOL (1 << 5)
/* The class tests the bidi class. */
#define XCLASS_HAS_BIDICL (1 << 6)
/* Any of these requires the Unicode record of the character. */
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BOOL | XCLASS_HAS_BIDICL)
/* The class has a negated script extension item. */
#define XCLASS_SCRIPT_EXTENSION_NOTPROP (1 << 7)
/* TMP2 was saved in RETURN_ADDR before the script extension test. */
#define XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR (1 << 8)
/* TMP2 was saved in LOCAL0 before the script extension test. */
#define XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0 (1 << 9)
#endif /* SUPPORT_UNICODE */
| |
| static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr); |
| |
| /* TMP3 must be preserved because it is used by compile_iterator_matchingpath. */ |
| static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks, sljit_u32 status) |
| { |
| DEFINE_COMPILER; |
| jump_list *found = NULL; |
| jump_list *check_result = NULL; |
| jump_list **list = (cc[0] & XCL_NOT) == 0 ? &found : backtracks; |
| sljit_uw c, charoffset; |
| sljit_u32 max = READ_CHAR_MAX, min = 0; |
| struct sljit_jump *jump = NULL; |
| PCRE2_UCHAR flags; |
| PCRE2_SPTR ccbegin; |
| sljit_u32 compares, invertcmp, depth; |
| sljit_u32 first_item, last_item, mid_item; |
| sljit_u32 range_start, range_end; |
| xclass_ranges ranges; |
| BOOL has_cmov, last_range_set; |
| |
| #ifdef SUPPORT_UNICODE |
| sljit_u32 category_list = 0; |
| sljit_u32 items; |
| int typereg = TMP1; |
| #else |
| (void)c; /* Avoid compiler warning. */ |
| #endif /* SUPPORT_UNICODE */ |
| |
| SLJIT_ASSERT(common->locals_size >= SSIZE_OF(sw)); |
| /* Scanning the necessary info. */ |
| flags = *cc++; |
| ccbegin = cc; |
| compares = 0; |
| |
| if (flags & XCL_MAP) |
| cc += 32 / sizeof(PCRE2_UCHAR); |
| |
| #ifdef SUPPORT_UNICODE |
| while (*cc == XCL_PROP || *cc == XCL_NOTPROP) |
| { |
| compares++; |
| cc++; |
| |
| items = 0; |
| |
| switch(*cc) |
| { |
| case PT_LAMP: |
| items = UCPCAT3(ucp_Lu, ucp_Ll, ucp_Lt); |
| break; |
| |
| case PT_GC: |
| items = UCPCAT_RANGE(PRIV(ucp_typerange)[(int)cc[1] * 2], PRIV(ucp_typerange)[(int)cc[1] * 2 + 1]); |
| break; |
| |
| case PT_PC: |
| items = UCPCAT(cc[1]); |
| break; |
| |
| case PT_WORD: |
| items = UCPCAT2(ucp_Mn, ucp_Pc) | UCPCAT_L | UCPCAT_N; |
| break; |
| |
| case PT_ALNUM: |
| items = UCPCAT_L | UCPCAT_N; |
| break; |
| |
| case PT_SCX: |
| status |= XCLASS_HAS_SCRIPT_EXTENSION; |
| if (cc[-1] == XCL_NOTPROP) |
| { |
| status |= XCLASS_SCRIPT_EXTENSION_NOTPROP; |
| break; |
| } |
| compares++; |
| PCRE2_FALLTHROUGH /* Fall through */ |
| |
| case PT_SC: |
| status |= XCLASS_HAS_SCRIPT; |
| break; |
| |
| case PT_SPACE: |
| case PT_PXSPACE: |
| case PT_PXGRAPH: |
| case PT_PXPRINT: |
| case PT_PXPUNCT: |
| status |= XCLASS_SAVE_CHAR | XCLASS_HAS_TYPE; |
| break; |
| |
| case PT_UCNC: |
| case PT_PXXDIGIT: |
| status |= XCLASS_SAVE_CHAR; |
| break; |
| |
| case PT_BOOL: |
| status |= XCLASS_HAS_BOOL; |
| break; |
| |
| case PT_BIDICL: |
| status |= XCLASS_HAS_BIDICL; |
| break; |
| |
| default: |
| SLJIT_UNREACHABLE(); |
| break; |
| } |
| |
| if (items > 0) |
| { |
| if (cc[-1] == XCL_NOTPROP) |
| items ^= UCPCAT_ALL; |
| category_list |= items; |
| status |= XCLASS_HAS_TYPE; |
| compares--; |
| } |
| |
| cc += 2; |
| } |
| |
| if (category_list == UCPCAT_ALL) |
| { |
| /* All or no characters are accepted, same as dotall. */ |
| if (status & XCLASS_IS_ECLASS) |
| { |
| if (list != backtracks) |
| OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1); |
| return; |
| } |
| |
| compile_char1_matchingpath(common, OP_ALLANY, cc, backtracks, FALSE); |
| if (list == backtracks) |
| add_jump(compiler, backtracks, JUMP(SLJIT_JUMP)); |
| return; |
| } |
| |
| if (category_list != 0) |
| compares++; |
| #endif |
| |
| if (*cc != XCL_END) |
| { |
| #if defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16) |
| if (common->utf && compares == 0 && !(status & XCLASS_IS_ECLASS)) |
| { |
| SLJIT_ASSERT(category_list == 0); |
| max = 0; |
| min = (flags & XCL_MAP) != 0 ? 0 : READ_CHAR_MAX; |
| xclass_update_min_max(common, cc, &min, &max); |
| } |
| #endif |
| compares++; |
| #ifdef SUPPORT_UNICODE |
| status |= XCLASS_SAVE_CHAR; |
| #endif /* SUPPORT_UNICODE */ |
| } |
| |
| #ifdef SUPPORT_UNICODE |
| SLJIT_ASSERT(compares > 0 || category_list != 0); |
| #else /* !SUPPORT_UNICODE */ |
| SLJIT_ASSERT(compares > 0); |
| #endif /* SUPPORT_UNICODE */ |
| |
| /* We are not necessarily in UTF mode, even in 8-bit mode. */ |
| cc = ccbegin; |
| if (!(status & XCLASS_IS_ECLASS)) |
| { |
| if ((flags & XCL_NOT) != 0) |
| read_char(common, min, max, backtracks, READ_CHAR_UPDATE_STR_PTR); |
| else |
| { |
| #ifdef SUPPORT_UNICODE |
| read_char(common, min, max, (status & XCLASS_NEEDS_UCD) ? backtracks : NULL, 0); |
| #else /* !SUPPORT_UNICODE */ |
| read_char(common, min, max, NULL, 0); |
| #endif /* SUPPORT_UNICODE */ |
| } |
| } |
| |
| if ((flags & XCL_MAP) != 0) |
| { |
| SLJIT_ASSERT(!(status & XCLASS_IS_ECLASS)); |
| xclass_check_bitset(common, (const sljit_u8 *)cc, &found, backtracks); |
| cc += 32 / sizeof(PCRE2_UCHAR); |
| } |
| |
| #ifdef SUPPORT_UNICODE |
| if (status & XCLASS_NEEDS_UCD) |
| { |
| if ((status & (XCLASS_SAVE_CHAR | XCLASS_IS_ECLASS)) == XCLASS_SAVE_CHAR) |
| OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP1, 0); |
| |
| #if PCRE2_CODE_UNIT_WIDTH == 32 |
| if (!common->utf) |
| { |
| OP2U(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, MAX_UTF_CODE_POINT + 1); |
| SELECT(SLJIT_GREATER_EQUAL, TMP1, SLJIT_IMM, UNASSIGNED_UTF_CHAR, TMP1); |
| } |
| #endif /* PCRE2_CODE_UNIT_WIDTH == 32 */ |
| |
| OP2(SLJIT_LSHR, TMP2, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_SHIFT); |
| OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 1); |
| OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_stage1)); |
| OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BLOCK_MASK); |
| sljit_emit_op2_shift(compiler, SLJIT_ADD | SLJIT_SHL_IMM | SLJIT_SRC2_UNDEFINED, TMP1, 0, TMP1, 0, TMP2, 0, UCD_BLOCK_SHIFT); |
| OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, (sljit_sw)PRIV(ucd_stage2)); |
| OP1(SLJIT_MOV_U16, TMP2, 0, SLJIT_MEM2(TMP2, TMP1), 1); |
| sljit_emit_op2_shift(compiler, SLJIT_ADD | SLJIT_SHL_IMM | SLJIT_SRC2_UNDEFINED, TMP2, 0, TMP2, 0, TMP2, 0, 1); |
| OP2(SLJIT_SHL, TMP2, 0, TMP2, 0, SLJIT_IMM, 2); |
| |
| ccbegin = cc; |
| |
| if (status & XCLASS_HAS_BIDICL) |
| { |
| OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx_bidiclass)); |
| OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BIDICLASS_SHIFT); |
| |
| while (*cc == XCL_PROP || *cc == XCL_NOTPROP) |
| { |
| cc++; |
| |
| if (*cc == PT_BIDICL) |
| { |
| compares--; |
| invertcmp = (compares == 0 && list != backtracks); |
| if (cc[-1] == XCL_NOTPROP) |
| invertcmp ^= 0x1; |
| jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]); |
| add_jump(compiler, compares > 0 ? list : backtracks, jump); |
| } |
| cc += 2; |
| } |
| |
| cc = ccbegin; |
| } |
| |
| if (status & XCLASS_HAS_BOOL) |
| { |
| OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, bprops)); |
| OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_BPROPS_MASK); |
| OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2); |
| |
| while (*cc == XCL_PROP || *cc == XCL_NOTPROP) |
| { |
| cc++; |
| if (*cc == PT_BOOL) |
| { |
| compares--; |
| invertcmp = (compares == 0 && list != backtracks); |
| if (cc[-1] == XCL_NOTPROP) |
| invertcmp ^= 0x1; |
| |
| OP2U(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_boolprop_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)(1u << (cc[1] & 0x1f))); |
| add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp)); |
| } |
| cc += 2; |
| } |
| |
| cc = ccbegin; |
| } |
| |
| if (status & XCLASS_HAS_SCRIPT) |
| { |
| OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script)); |
| |
| while (*cc == XCL_PROP || *cc == XCL_NOTPROP) |
| { |
| cc++; |
| |
| switch (*cc) |
| { |
| case PT_SCX: |
| if (cc[-1] == XCL_NOTPROP) |
| break; |
| PCRE2_FALLTHROUGH /* Fall through */ |
| |
| case PT_SC: |
| compares--; |
| invertcmp = (compares == 0 && list != backtracks); |
| if (cc[-1] == XCL_NOTPROP) |
| invertcmp ^= 0x1; |
| |
| add_jump(compiler, compares > 0 ? list : backtracks, CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1])); |
| } |
| cc += 2; |
| } |
| |
| cc = ccbegin; |
| } |
| |
| if (status & XCLASS_HAS_SCRIPT_EXTENSION) |
| { |
| OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx_bidiclass)); |
| OP2(SLJIT_AND, TMP1, 0, TMP1, 0, SLJIT_IMM, UCD_SCRIPTX_MASK); |
| OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2); |
| |
| if (status & XCLASS_SCRIPT_EXTENSION_NOTPROP) |
| { |
| if (status & XCLASS_HAS_TYPE) |
| { |
| if ((status & (XCLASS_SAVE_CHAR | XCLASS_IS_ECLASS)) == XCLASS_SAVE_CHAR) |
| { |
| OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL0, TMP2, 0); |
| status |= XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0; |
| } |
| else |
| { |
| OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP2, 0); |
| status |= XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR; |
| } |
| } |
| OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script)); |
| } |
| |
| while (*cc == XCL_PROP || *cc == XCL_NOTPROP) |
| { |
| cc++; |
| |
| if (*cc == PT_SCX) |
| { |
| compares--; |
| invertcmp = (compares == 0 && list != backtracks); |
| |
| jump = NULL; |
| if (cc[-1] == XCL_NOTPROP) |
| { |
| jump = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, (int)cc[1]); |
| if (invertcmp) |
| { |
| add_jump(compiler, backtracks, jump); |
| jump = NULL; |
| } |
| invertcmp ^= 0x1; |
| } |
| |
| OP2U(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)(1u << (cc[1] & 0x1f))); |
| add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp)); |
| |
| if (jump != NULL) |
| JUMPHERE(jump); |
| } |
| cc += 2; |
| } |
| |
| if (status & XCLASS_SCRIPT_EXTENSION_RESTORE_LOCAL0) |
| OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), LOCAL0); |
| else if (status & XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR) |
| OP1(SLJIT_MOV, TMP2, 0, RETURN_ADDR, 0); |
| cc = ccbegin; |
| } |
| |
| if (status & XCLASS_SAVE_CHAR) |
| OP1(SLJIT_MOV, TMP1, 0, (status & XCLASS_IS_ECLASS) ? ECLASS_CHAR_DATA : RETURN_ADDR, 0); |
| |
| if (status & XCLASS_HAS_TYPE) |
| { |
| if (status & XCLASS_SAVE_CHAR) |
| typereg = RETURN_ADDR; |
| |
| OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, chartype)); |
| OP2(SLJIT_SHL, typereg, 0, SLJIT_IMM, 1, TMP2, 0); |
| |
| if (category_list > 0) |
| { |
| compares--; |
| invertcmp = (compares == 0 && list != backtracks); |
| OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, category_list); |
| add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp)); |
| } |
| } |
| } |
| #endif /* SUPPORT_UNICODE */ |
| |
| /* Generating code. */ |
| charoffset = 0; |
| |
| #ifdef SUPPORT_UNICODE |
| while (*cc == XCL_PROP || *cc == XCL_NOTPROP) |
| { |
| compares--; |
| invertcmp = (compares == 0 && list != backtracks); |
| jump = NULL; |
| |
| if (*cc == XCL_NOTPROP) |
| invertcmp ^= 0x1; |
| cc++; |
| switch(*cc) |
| { |
| case PT_LAMP: |
| case PT_GC: |
| case PT_PC: |
| case PT_SC: |
| case PT_SCX: |
| case PT_BOOL: |
| case PT_BIDICL: |
| case PT_WORD: |
| case PT_ALNUM: |
| compares++; |
| /* Already handled. */ |
| break; |
| |
| case PT_SPACE: |
| case PT_PXSPACE: |
| SET_CHAR_OFFSET(9); |
| OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xd - 0x9); |
| OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL); |
| |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x85 - 0x9); |
| OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); |
| |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x180e - 0x9); |
| OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); |
| |
| OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Zl, ucp_Zs)); |
| OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_NOT_ZERO); |
| jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); |
| break; |
| |
| case PT_UCNC: |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_DOLLAR_SIGN - charoffset)); |
| OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL); |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_COMMERCIAL_AT - charoffset)); |
| OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(CHAR_GRAVE_ACCENT - charoffset)); |
| OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); |
| |
| SET_CHAR_OFFSET(0xa0); |
| OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(0xd7ff - charoffset)); |
| OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL); |
| SET_CHAR_OFFSET(0); |
| OP2U(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, 0xe000 - 0); |
| OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_GREATER_EQUAL); |
| jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); |
| break; |
| |
| case PT_PXGRAPH: |
| OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Cc, ucp_Cs) | UCPCAT_RANGE(ucp_Zl, ucp_Zs)); |
| OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO); |
| |
| OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(ucp_Cf)); |
| jump = JUMP(SLJIT_ZERO); |
| |
| c = charoffset; |
| /* In case of ucp_Cf, we overwrite the result. */ |
| SET_CHAR_OFFSET(0x2066); |
| OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x2069 - 0x2066); |
| OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL); |
| |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x061c - 0x2066); |
| OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); |
| |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x180e - 0x2066); |
| OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); |
| |
| /* Restore charoffset. */ |
| SET_CHAR_OFFSET(c); |
| |
| JUMPHERE(jump); |
| jump = CMP(SLJIT_ZERO ^ invertcmp, TMP2, 0, SLJIT_IMM, 0); |
| break; |
| |
| case PT_PXPRINT: |
| OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Cc, ucp_Cs) | UCPCAT2(ucp_Zl, ucp_Zp)); |
| OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO); |
| |
| OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT(ucp_Cf)); |
| jump = JUMP(SLJIT_ZERO); |
| |
| c = charoffset; |
| /* In case of ucp_Cf, we overwrite the result. */ |
| SET_CHAR_OFFSET(0x2066); |
| OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x2069 - 0x2066); |
| OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL); |
| |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, 0x061c - 0x2066); |
| OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_EQUAL); |
| |
| /* Restore charoffset. */ |
| SET_CHAR_OFFSET(c); |
| |
| JUMPHERE(jump); |
| jump = CMP(SLJIT_ZERO ^ invertcmp, TMP2, 0, SLJIT_IMM, 0); |
| break; |
| |
| case PT_PXPUNCT: |
| OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Sc, ucp_So)); |
| OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_NOT_ZERO); |
| |
| SET_CHAR_OFFSET(0); |
| OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0x7f); |
| OP_FLAGS(SLJIT_AND, TMP2, 0, SLJIT_LESS_EQUAL); |
| |
| OP2U(SLJIT_AND | SLJIT_SET_Z, typereg, 0, SLJIT_IMM, UCPCAT_RANGE(ucp_Pc, ucp_Ps)); |
| OP_FLAGS(SLJIT_OR | SLJIT_SET_Z, TMP2, 0, SLJIT_NOT_ZERO); |
| jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); |
| break; |
| |
| case PT_PXXDIGIT: |
| SET_CHAR_OFFSET(CHAR_A); |
| OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, ~0x20); |
| OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP2, 0, SLJIT_IMM, CHAR_F - CHAR_A); |
| OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL); |
| |
| SET_CHAR_OFFSET(CHAR_0); |
| OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_9 - CHAR_0); |
| OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL); |
| |
| SET_CHAR_OFFSET(0xff10); |
| jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 0xff46 - 0xff10); |
| |
| OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff19 - 0xff10); |
| OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL); |
| |
| SET_CHAR_OFFSET(0xff21); |
| OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff26 - 0xff21); |
| OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL); |
| |
| SET_CHAR_OFFSET(0xff41); |
| OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, 0xff46 - 0xff41); |
| OP_FLAGS(SLJIT_OR, TMP2, 0, SLJIT_LESS_EQUAL); |
| |
| SET_CHAR_OFFSET(0xff10); |
| |
| JUMPHERE(jump); |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, 0); |
| jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp); |
| break; |
| |
| default: |
| SLJIT_UNREACHABLE(); |
| break; |
| } |
| |
| cc += 2; |
| |
| if (jump != NULL) |
| add_jump(compiler, compares > 0 ? list : backtracks, jump); |
| } |
| |
| if (compares == 0) |
| { |
| if (found != NULL) |
| set_jumps(found, LABEL()); |
| |
| if (status & XCLASS_IS_ECLASS) |
| OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1); |
| return; |
| } |
| #endif /* SUPPORT_UNICODE */ |
| |
| SLJIT_ASSERT(compares == 1); |
| ranges.range_count = 0; |
| ranges.ranges = ranges.local_ranges; |
| ranges.stack = ranges.local_stack; |
| |
| xclass_compute_ranges(common, cc, &ranges); |
| |
| /* Memory error is set for the compiler. */ |
| if (ranges.stack == NULL) |
| return; |
| |
| #if (defined SLJIT_DEBUG && SLJIT_DEBUG) && \ |
| defined SUPPORT_UNICODE && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16) |
| if (common->utf) |
| { |
| min = READ_CHAR_MAX; |
| max = 0; |
| xclass_update_min_max(common, cc, &min, &max); |
| SLJIT_ASSERT(ranges.ranges[0] == min && ranges.ranges[ranges.range_count - 1] == max); |
| } |
| #endif /* SLJIT_DEBUG && SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == [8|16] */ |
| |
| invertcmp = (list != backtracks); |
| |
| if (ranges.range_count == 2) |
| { |
| range_start = ranges.ranges[0]; |
| range_end = ranges.ranges[1]; |
| |
| if (range_start < range_end) |
| { |
| SET_CHAR_OFFSET(range_start); |
| jump = CMP(SLJIT_LESS_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start)); |
| } |
| else |
| jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset)); |
| |
| add_jump(compiler, backtracks, jump); |
| |
| SLJIT_ASSERT(ranges.stack == ranges.local_stack); |
| if (found != NULL) |
| set_jumps(found, LABEL()); |
| |
| if (status & XCLASS_IS_ECLASS) |
| OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1); |
| return; |
| } |
| |
| range_start = ranges.ranges[0]; |
| SET_CHAR_OFFSET(range_start); |
| if (ranges.range_count >= 6) |
| { |
| /* Early fail. */ |
| range_end = ranges.ranges[ranges.range_count - 1]; |
| add_jump(compiler, (flags & XCL_NOT) == 0 ? backtracks : &found, |
| CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start))); |
| } |
| |
| depth = 0; |
| first_item = 0; |
| last_item = (sljit_u32)(ranges.range_count - 2); |
| has_cmov = sljit_has_cpu_feature(SLJIT_HAS_CMOV) != 0; |
| |
| while (TRUE) |
| { |
| /* At least two items are present. */ |
| SLJIT_ASSERT(first_item < last_item && charoffset == ranges.ranges[0]); |
| last_range_set = FALSE; |
| |
| if (first_item + 6 <= last_item) |
| { |
| mid_item = ((first_item + last_item) >> 1) & ~(sljit_u32)1; |
| SLJIT_ASSERT(last_item >= mid_item + 4); |
| |
| range_end = ranges.ranges[mid_item + 1]; |
| if (first_item + 6 > mid_item && ranges.ranges[mid_item] == range_end) |
| { |
| OP2U(SLJIT_SUB | SLJIT_SET_GREATER | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - charoffset)); |
| ranges.stack[depth].jump = JUMP(SLJIT_GREATER); |
| OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL); |
| last_range_set = TRUE; |
| } |
| else |
| ranges.stack[depth].jump = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - charoffset)); |
| |
| ranges.stack[depth].first_item = (sljit_u32)(mid_item + 2); |
| ranges.stack[depth].last_item = (sljit_u32)last_item; |
| |
| depth++; |
| SLJIT_ASSERT(ranges.stack == ranges.local_stack ? |
| depth <= XCLASS_LOCAL_RANGES_LOG2_SIZE : (ranges.stack + depth) <= (xclass_stack_item*)ranges.ranges); |
| |
| last_item = mid_item; |
| if (!last_range_set) |
| continue; |
| |
| last_item -= 2; |
| } |
| |
| if (!last_range_set) |
| { |
| range_start = ranges.ranges[first_item]; |
| range_end = ranges.ranges[first_item + 1]; |
| |
| if (range_start < range_end) |
| { |
| SET_CHAR_OFFSET(range_start); |
| OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start)); |
| OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_LESS_EQUAL); |
| } |
| else |
| { |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset)); |
| OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL); |
| } |
| first_item += 2; |
| } |
| |
| SLJIT_ASSERT(first_item <= last_item); |
| |
| do |
| { |
| range_start = ranges.ranges[first_item]; |
| range_end = ranges.ranges[first_item + 1]; |
| |
| if (range_start < range_end) |
| { |
| SET_CHAR_OFFSET(range_start); |
| OP2U(SLJIT_SUB | SLJIT_SET_LESS_EQUAL, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_end - range_start)); |
| |
| if (has_cmov) |
| SELECT(SLJIT_LESS_EQUAL, TMP2, STR_END, 0, TMP2); |
| else |
| OP_FLAGS(SLJIT_OR | ((first_item == last_item) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_LESS_EQUAL); |
| } |
| else |
| { |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(range_start - charoffset)); |
| |
| if (has_cmov) |
| SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2); |
| else |
| OP_FLAGS(SLJIT_OR | ((first_item == last_item) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL); |
| } |
| |
| first_item += 2; |
| } |
| while (first_item <= last_item); |
| |
| if (depth == 0) break; |
| |
| add_jump(compiler, &check_result, JUMP(SLJIT_JUMP)); |
| |
| /* The charoffset resets after the end of a branch is reached. */ |
| charoffset = ranges.ranges[0]; |
| depth--; |
| first_item = ranges.stack[depth].first_item; |
| last_item = ranges.stack[depth].last_item; |
| JUMPHERE(ranges.stack[depth].jump); |
| } |
| |
| if (check_result != NULL) |
| set_jumps(check_result, LABEL()); |
| |
| if (has_cmov) |
| jump = CMP(SLJIT_NOT_EQUAL ^ invertcmp, TMP2, 0, SLJIT_IMM, 0); |
| else |
| { |
| sljit_set_current_flags(compiler, SLJIT_SET_Z); |
| jump = JUMP(SLJIT_NOT_EQUAL ^ invertcmp); |
| } |
| |
| add_jump(compiler, backtracks, jump); |
| |
| if (found != NULL) |
| set_jumps(found, LABEL()); |
| |
| if (status & XCLASS_IS_ECLASS) |
| OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1); |
| |
| if (ranges.stack != ranges.local_stack) |
| SLJIT_FREE(ranges.stack, compiler->allocator_data); |
| } |
| |
| static PCRE2_SPTR compile_eclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks) |
| { |
| DEFINE_COMPILER; |
| PCRE2_SPTR end = cc + GET(cc, 0) - 1; |
| PCRE2_SPTR begin; |
| jump_list *not_found; |
| jump_list *found = NULL; |
| |
| cc += LINK_SIZE; |
| |
| /* Should be optimized later. */ |
| read_char(common, 0, READ_CHAR_MAX, backtracks, 0); |
| |
| if (((*cc++) & ECL_MAP) != 0) |
| { |
| xclass_check_bitset(common, (const sljit_u8 *)cc, &found, backtracks); |
| cc += 32 / sizeof(PCRE2_UCHAR); |
| } |
| |
| begin = cc; |
| |
| OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL0, ECLASS_CHAR_DATA, 0); |
| OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCAL1, ECLASS_STACK_DATA, 0); |
| OP1(SLJIT_MOV, ECLASS_STACK_DATA, 0, SLJIT_IMM, 0); |
| OP1(SLJIT_MOV, ECLASS_CHAR_DATA, 0, TMP1, 0); |
| |
| /* All eclass must start with an xclass. */ |
| SLJIT_ASSERT(*cc == ECL_XCLASS); |
| |
| while (cc < end) |
| { |
| switch (*cc) |
| { |
| case ECL_AND: |
| ++cc; |
| OP2(SLJIT_OR, TMP2, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, ~(sljit_sw)1); |
| OP2(SLJIT_LSHR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1); |
| OP2(SLJIT_AND, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, TMP2, 0); |
| break; |
| |
| case ECL_OR: |
| ++cc; |
| OP2(SLJIT_AND, TMP2, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1); |
| OP2(SLJIT_LSHR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1); |
| OP2(SLJIT_OR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, TMP2, 0); |
| break; |
| |
| case ECL_XOR: |
| ++cc; |
| OP2(SLJIT_AND, TMP2, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1); |
| OP2(SLJIT_LSHR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1); |
| OP2(SLJIT_XOR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, TMP2, 0); |
| break; |
| |
| case ECL_NOT: |
| ++cc; |
| OP2(SLJIT_XOR, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1); |
| break; |
| |
| default: |
| SLJIT_ASSERT(*cc == ECL_XCLASS); |
| if (cc != begin) |
| { |
| OP1(SLJIT_MOV, TMP1, 0, ECLASS_CHAR_DATA, 0); |
| OP2(SLJIT_SHL, ECLASS_STACK_DATA, 0, ECLASS_STACK_DATA, 0, SLJIT_IMM, 1); |
| } |
| |
| not_found = NULL; |
| compile_xclass_matchingpath(common, cc + 1 + LINK_SIZE, ¬_found, XCLASS_IS_ECLASS); |
| set_jumps(not_found, LABEL()); |
| |
| cc += GET(cc, 1); |
| break; |
| } |
| } |
| |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, ECLASS_STACK_DATA, 0, SLJIT_IMM, 0); |
| OP1(SLJIT_MOV, ECLASS_CHAR_DATA, 0, SLJIT_MEM1(SLJIT_SP), LOCAL0); |
| OP1(SLJIT_MOV, ECLASS_STACK_DATA, 0, SLJIT_MEM1(SLJIT_SP), LOCAL1); |
| add_jump(compiler, backtracks, JUMP(SLJIT_EQUAL)); |
| set_jumps(found, LABEL()); |
| return end; |
| } |
| |
| /* Generic character matching code. */ |
| |
| #undef SET_CHAR_OFFSET |
| #undef READ_FROM_CHAR_LIST |
| #undef XCLASS_LOCAL_RANGES_SIZE |
| #undef XCLASS_LOCAL_RANGES_LOG2_SIZE |
| |
| #endif /* SUPPORT_WIDE_CHARS */ |
| |
| static PCRE2_SPTR byte_sequence_compare(compiler_common *common, BOOL caseless, PCRE2_SPTR cc, |
| compare_context *context, jump_list **backtracks) |
| { |
| DEFINE_COMPILER; |
| unsigned int othercasebit = 0; |
| PCRE2_SPTR othercasechar = NULL; |
| #ifdef SUPPORT_UNICODE |
| int utflength; |
| #endif |
| |
| if (caseless && char_has_othercase(common, cc)) |
| { |
| othercasebit = char_get_othercase_bit(common, cc); |
| SLJIT_ASSERT(othercasebit); |
| /* Extracting bit difference info. */ |
| #if PCRE2_CODE_UNIT_WIDTH == 8 |
| othercasechar = cc + (othercasebit >> 8); |
| othercasebit &= 0xff; |
| #elif PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 |
| /* Note that this code only handles characters in the BMP. If there |
| ever are characters outside the BMP whose othercase differs in only one |
| bit from itself (there currently are none), this code will need to be |
| revised for PCRE2_CODE_UNIT_WIDTH == 32. */ |
| othercasechar = cc + (othercasebit >> 9); |
| if ((othercasebit & 0x100) != 0) |
| othercasebit = (othercasebit & 0xff) << 8; |
| else |
| othercasebit &= 0xff; |
| #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ |
| } |
| |
| if (context->sourcereg == -1) |
| { |
| #if PCRE2_CODE_UNIT_WIDTH == 8 |
| #if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED |
| if (context->length >= 4) |
| OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); |
| else if (context->length >= 2) |
| OP1(SLJIT_MOV_U16, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); |
| else |
| #endif |
| OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); |
| #elif PCRE2_CODE_UNIT_WIDTH == 16 |
| #if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED |
| if (context->length >= 4) |
| OP1(SLJIT_MOV_S32, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); |
| else |
| #endif |
| OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); |
| #elif PCRE2_CODE_UNIT_WIDTH == 32 |
| OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), -context->length); |
| #endif /* PCRE2_CODE_UNIT_WIDTH == [8|16|32] */ |
| context->sourcereg = TMP2; |
| } |
| |
| #ifdef SUPPORT_UNICODE |
| utflength = 1; |
| if (common->utf && HAS_EXTRALEN(*cc)) |
| utflength += GET_EXTRALEN(*cc); |
| |
| do |
| { |
| #endif |
| |
| context->length -= IN_UCHARS(1); |
| #if (defined SLJIT_UNALIGNED && SLJIT_UNALIGNED) && (PCRE2_CODE_UNIT_WIDTH == 8 || PCRE2_CODE_UNIT_WIDTH == 16) |
| |
| /* Unaligned read is supported. */ |
| if (othercasebit != 0 && othercasechar == cc) |
| { |
| context->c.asuchars[context->ucharptr] = *cc | othercasebit; |
| context->oc.asuchars[context->ucharptr] = othercasebit; |
| } |
| else |
| { |
| context->c.asuchars[context->ucharptr] = *cc; |
| context->oc.asuchars[context->ucharptr] = 0; |
| } |
| context->ucharptr++; |
| |
| #if PCRE2_CODE_UNIT_WIDTH == 8 |
| if (context->ucharptr >= 4 || context->length == 0 || (context->ucharptr == 2 && context->length == 1)) |
| #else |
| if (context->ucharptr >= 2 || context->length == 0) |
| #endif |
| { |
| if (context->length >= 4) |
| OP1(SLJIT_MOV_S32, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length); |
| else if (context->length >= 2) |
| OP1(SLJIT_MOV_U16, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length); |
| #if PCRE2_CODE_UNIT_WIDTH == 8 |
| else if (context->length >= 1) |
| OP1(SLJIT_MOV_U8, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length); |
| #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ |
| context->sourcereg = context->sourcereg == TMP1 ? TMP2 : TMP1; |
| |
| switch(context->ucharptr) |
| { |
| case 4 / sizeof(PCRE2_UCHAR): |
| if (context->oc.asint != 0) |
| OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asint); |
| add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asint | context->oc.asint)); |
| break; |
| |
| case 2 / sizeof(PCRE2_UCHAR): |
| if (context->oc.asushort != 0) |
| OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asushort); |
| add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asushort | context->oc.asushort)); |
| break; |
| |
| #if PCRE2_CODE_UNIT_WIDTH == 8 |
| case 1: |
| if (context->oc.asbyte != 0) |
| OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, context->oc.asbyte); |
| add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, context->c.asbyte | context->oc.asbyte)); |
| break; |
| #endif |
| |
| default: |
| SLJIT_UNREACHABLE(); |
| break; |
| } |
| context->ucharptr = 0; |
| } |
| |
| #else |
| |
| /* Unaligned read is unsupported or in 32 bit mode. */ |
| if (context->length >= 1) |
| OP1(MOV_UCHAR, context->sourcereg, 0, SLJIT_MEM1(STR_PTR), -context->length); |
| |
| context->sourcereg = context->sourcereg == TMP1 ? TMP2 : TMP1; |
| |
| if (othercasebit != 0 && othercasechar == cc) |
| { |
| OP2(SLJIT_OR, context->sourcereg, 0, context->sourcereg, 0, SLJIT_IMM, othercasebit); |
| add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, *cc | othercasebit)); |
| } |
| else |
| add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, context->sourcereg, 0, SLJIT_IMM, *cc)); |
| |
| #endif |
| |
| cc++; |
| #ifdef SUPPORT_UNICODE |
| utflength--; |
| } |
| while (utflength > 0); |
| #endif |
| |
| return cc; |
| } |
| |
| #ifdef SUPPORT_UNICODE |
| |
| #if PCRE2_CODE_UNIT_WIDTH != 32 |
| |
| /* The code in this function copies the logic of the interpreter function that |
| is defined in the pcre2_extuni.c source. If that code is updated, this |
| function, and those below it, must be kept in step (note by PH, June 2024). */ |
| |
| static PCRE2_SPTR SLJIT_FUNC do_extuni_utf(jit_arguments *args, PCRE2_SPTR cc) |
| { |
| PCRE2_SPTR start_subject = args->begin; |
| PCRE2_SPTR end_subject = args->end; |
| int lgb = 0, rgb, ricount; |
| PCRE2_SPTR prevcc, endcc, bptr; |
| BOOL first = TRUE; |
| BOOL was_ep_ZWJ = FALSE; |
| uint32_t c; |
| |
| prevcc = cc; |
| endcc = NULL; |
| do |
| { |
| GETCHARINC(c, cc); |
| rgb = UCD_GRAPHBREAK(c); |
| |
| if (first) |
| { |
| lgb = rgb; |
| endcc = cc; |
| first = FALSE; |
| continue; |
| } |
| |
| if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) |
| break; |
| |
| /* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was |
| preceded by Extended Pictographic. */ |
| |
| if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ) |
| break; |
| |
| /* Not breaking between Regional Indicators is allowed only if there |
| are an even number of preceding RIs. */ |
| |
| if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator) |
| { |
| ricount = 0; |
| bptr = prevcc; |
| |
| /* bptr is pointing to the left-hand character */ |
| while (bptr > start_subject) |
| { |
| bptr--; |
| BACKCHAR(bptr); |
| GETCHAR(c, bptr); |
| |
| if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) |
| break; |
| |
| ricount++; |
| } |
| |
| if ((ricount & 1) != 0) break; /* Grapheme break required */ |
| } |
| |
| /* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in |
| between; see next statement). */ |
| |
| was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ); |
| |
| /* If Extend follows Extended_Pictographic, do not update lgb; this allows |
| any number of them before a following ZWJ. */ |
| |
| if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic) |
| lgb = rgb; |
| |
| prevcc = endcc; |
| endcc = cc; |
| } |
| while (cc < end_subject); |
| |
| return endcc; |
| } |
| |
| #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ |
| |
| /* The code in this function copies the logic of the interpreter function that |
| is defined in the pcre2_extuni.c source. If that code is updated, this |
| function, and the one below it, must be kept in step (note by PH, June 2024). */ |
| |
| static PCRE2_SPTR SLJIT_FUNC do_extuni_utf_invalid(jit_arguments *args, PCRE2_SPTR cc) |
| { |
| PCRE2_SPTR start_subject = args->begin; |
| PCRE2_SPTR end_subject = args->end; |
| int lgb = 0, rgb, ricount; |
| PCRE2_SPTR prevcc, endcc, bptr; |
| BOOL first = TRUE; |
| BOOL was_ep_ZWJ = FALSE; |
| uint32_t c; |
| |
| prevcc = cc; |
| endcc = NULL; |
| do |
| { |
| GETCHARINC_INVALID(c, cc, end_subject, break); |
| rgb = UCD_GRAPHBREAK(c); |
| |
| if (first) |
| { |
| lgb = rgb; |
| endcc = cc; |
| first = FALSE; |
| continue; |
| } |
| |
| if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) |
| break; |
| |
| /* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was |
| preceded by Extended Pictographic. */ |
| |
| if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ) |
| break; |
| |
| /* Not breaking between Regional Indicators is allowed only if there |
| are an even number of preceding RIs. */ |
| |
| if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator) |
| { |
| ricount = 0; |
| bptr = prevcc; |
| |
| /* bptr is pointing to the left-hand character */ |
| while (bptr > start_subject) |
| { |
| GETCHARBACK_INVALID(c, bptr, start_subject, break); |
| |
| if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) |
| break; |
| |
| ricount++; |
| } |
| |
| if ((ricount & 1) != 0) |
| break; /* Grapheme break required */ |
| } |
| |
| /* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in |
| between; see next statement). */ |
| |
| was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ); |
| |
| /* If Extend follows Extended_Pictographic, do not update lgb; this allows |
| any number of them before a following ZWJ. */ |
| |
| if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic) |
| lgb = rgb; |
| |
| prevcc = endcc; |
| endcc = cc; |
| } |
| while (cc < end_subject); |
| |
| return endcc; |
| } |
| |
| /* The code in this function copies the logic of the interpreter function that |
| is defined in the pcre2_extuni.c source. If that code is updated, this |
| function must be kept in step (note by PH, June 2024). */ |
| |
| static PCRE2_SPTR SLJIT_FUNC do_extuni_no_utf(jit_arguments *args, PCRE2_SPTR cc) |
| { |
| PCRE2_SPTR start_subject = args->begin; |
| PCRE2_SPTR end_subject = args->end; |
| int lgb, rgb, ricount; |
| PCRE2_SPTR bptr; |
| uint32_t c; |
| BOOL was_ep_ZWJ = FALSE; |
| |
| /* Patch by PH */ |
| /* GETCHARINC(c, cc); */ |
| c = *cc++; |
| |
| #if PCRE2_CODE_UNIT_WIDTH == 32 |
| if (c >= 0x110000) |
| return cc; |
| #endif /* PCRE2_CODE_UNIT_WIDTH == 32 */ |
| lgb = UCD_GRAPHBREAK(c); |
| |
| while (cc < end_subject) |
| { |
| c = *cc; |
| #if PCRE2_CODE_UNIT_WIDTH == 32 |
| if (c >= 0x110000) |
| break; |
| #endif /* PCRE2_CODE_UNIT_WIDTH == 32 */ |
| rgb = UCD_GRAPHBREAK(c); |
| |
| if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) |
| break; |
| |
| /* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was |
| preceded by Extended Pictographic. */ |
| |
| if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ) |
| break; |
| |
| /* Not breaking between Regional Indicators is allowed only if there |
| are an even number of preceding RIs. */ |
| |
| if (lgb == ucp_gbRegional_Indicator && rgb == ucp_gbRegional_Indicator) |
| { |
| ricount = 0; |
| bptr = cc - 1; |
| |
| /* bptr is pointing to the left-hand character */ |
| while (bptr > start_subject) |
| { |
| bptr--; |
| c = *bptr; |
| #if PCRE2_CODE_UNIT_WIDTH == 32 |
| if (c >= 0x110000) |
| break; |
| #endif /* PCRE2_CODE_UNIT_WIDTH == 32 */ |
| |
| if (UCD_GRAPHBREAK(c) != ucp_gbRegional_Indicator) break; |
| |
| ricount++; |
| } |
| |
| if ((ricount & 1) != 0) |
| break; /* Grapheme break required */ |
| } |
| |
| /* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in |
| between; see next statement). */ |
| |
| was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ); |
| |
| /* If Extend follows Extended_Pictographic, do not update lgb; this allows |
| any number of them before a following ZWJ. */ |
| |
| if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic) |
| lgb = rgb; |
| |
| cc++; |
| } |
| |
| return cc; |
| } |
| |
| static void compile_clist(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks) |
| { |
| DEFINE_COMPILER; |
| const sljit_u32 *other_cases; |
| struct sljit_jump *jump; |
| sljit_u32 min = 0, max = READ_CHAR_MAX; |
| BOOL has_cmov = sljit_has_cpu_feature(SLJIT_HAS_CMOV) != 0; |
| |
| SLJIT_ASSERT(cc[1] == PT_CLIST); |
| |
| if (cc[0] == OP_PROP) |
| { |
| other_cases = PRIV(ucd_caseless_sets) + cc[2]; |
| |
| min = *other_cases++; |
| max = min; |
| |
| while (*other_cases != NOTACHAR) |
| { |
| if (*other_cases > max) max = *other_cases; |
| if (*other_cases < min) min = *other_cases; |
| other_cases++; |
| } |
| } |
| |
| other_cases = PRIV(ucd_caseless_sets) + cc[2]; |
| SLJIT_ASSERT(other_cases[0] != NOTACHAR && other_cases[1] != NOTACHAR); |
| /* The NOTACHAR is higher than any character. */ |
| SLJIT_ASSERT(other_cases[0] < other_cases[1] && other_cases[1] < other_cases[2]); |
| |
| read_char(common, min, max, backtracks, READ_CHAR_UPDATE_STR_PTR); |
| |
| /* At least two characters are required. |
| Otherwise this case would be handled by the normal code path. */ |
| /* NOTACHAR is the unsigned maximum. */ |
| |
| /* Optimizing character pairs, if their difference is power of 2. */ |
| if (is_powerof2(other_cases[1] ^ other_cases[0])) |
| { |
| OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(other_cases[1] ^ other_cases[0])); |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, other_cases[1]); |
| OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL); |
| other_cases += 2; |
| } |
| else if (is_powerof2(other_cases[2] ^ other_cases[1])) |
| { |
| SLJIT_ASSERT(other_cases[2] != NOTACHAR); |
| |
| OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)(other_cases[2] ^ other_cases[1])); |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP2, 0, SLJIT_IMM, other_cases[2]); |
| OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL); |
| |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)other_cases[0]); |
| |
| if (has_cmov) |
| SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2); |
| else |
| OP_FLAGS(SLJIT_OR | ((other_cases[3] == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL); |
| |
| other_cases += 3; |
| } |
| else |
| { |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(*other_cases++)); |
| OP_FLAGS(SLJIT_MOV, TMP2, 0, SLJIT_EQUAL); |
| } |
| |
| while (*other_cases != NOTACHAR) |
| { |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, (sljit_sw)(*other_cases++)); |
| |
| if (has_cmov) |
| SELECT(SLJIT_EQUAL, TMP2, STR_END, 0, TMP2); |
| else |
| OP_FLAGS(SLJIT_OR | ((*other_cases == NOTACHAR) ? SLJIT_SET_Z : 0), TMP2, 0, SLJIT_EQUAL); |
| } |
| |
| if (has_cmov) |
| jump = CMP(cc[0] == OP_PROP ? SLJIT_EQUAL : SLJIT_NOT_EQUAL, TMP2, 0, SLJIT_IMM, 0); |
| else |
| jump = JUMP(cc[0] == OP_PROP ? SLJIT_ZERO : SLJIT_NOT_ZERO); |
| |
| add_jump(compiler, backtracks, jump); |
| } |
| |
| #endif /* SUPPORT_UNICODE */ |
| |
| static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr) |
| { |
| DEFINE_COMPILER; |
| int length; |
| unsigned int c, oc, bit; |
| compare_context context; |
| struct sljit_jump *jump[3]; |
| jump_list *end_list; |
| #ifdef SUPPORT_UNICODE |
| PCRE2_UCHAR propdata[5]; |
| #endif /* SUPPORT_UNICODE */ |
| |
| switch(type) |
| { |
| case OP_NOT_DIGIT: |
| case OP_DIGIT: |
| /* Digits are usually 0-9, so it is worth to optimize them. */ |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 |
| if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_digit, FALSE)) |
| read_char7_type(common, backtracks, type == OP_NOT_DIGIT); |
| else |
| #endif |
| read_char8_type(common, backtracks, type == OP_NOT_DIGIT); |
| /* Flip the starting bit in the negative case. */ |
| OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_digit); |
| add_jump(compiler, backtracks, JUMP(type == OP_DIGIT ? SLJIT_ZERO : SLJIT_NOT_ZERO)); |
| return cc; |
| |
| case OP_NOT_WHITESPACE: |
| case OP_WHITESPACE: |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 |
| if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_space, FALSE)) |
| read_char7_type(common, backtracks, type == OP_NOT_WHITESPACE); |
| else |
| #endif |
| read_char8_type(common, backtracks, type == OP_NOT_WHITESPACE); |
| OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_space); |
| add_jump(compiler, backtracks, JUMP(type == OP_WHITESPACE ? SLJIT_ZERO : SLJIT_NOT_ZERO)); |
| return cc; |
| |
| case OP_NOT_WORDCHAR: |
| case OP_WORDCHAR: |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 |
| if (common->utf && is_char7_bitset((const sljit_u8*)common->ctypes - cbit_length + cbit_word, FALSE)) |
| read_char7_type(common, backtracks, type == OP_NOT_WORDCHAR); |
| else |
| #endif |
| read_char8_type(common, backtracks, type == OP_NOT_WORDCHAR); |
| OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, ctype_word); |
| add_jump(compiler, backtracks, JUMP(type == OP_WORDCHAR ? SLJIT_ZERO : SLJIT_NOT_ZERO)); |
| return cc; |
| |
| case OP_ANY: |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| read_char(common, common->nlmin, common->nlmax, backtracks, READ_CHAR_UPDATE_STR_PTR); |
| if (common->nltype == NLTYPE_FIXED && common->newline > 255) |
| { |
| jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, (common->newline >> 8) & 0xff); |
| end_list = NULL; |
| if (common->mode != PCRE2_JIT_PARTIAL_HARD) |
| add_jump(compiler, &end_list, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); |
| else |
| check_str_end(common, &end_list); |
| |
| OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); |
| add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, common->newline & 0xff)); |
| set_jumps(end_list, LABEL()); |
| JUMPHERE(jump[0]); |
| } |
| else |
| check_newlinechar(common, common->nltype, backtracks, TRUE); |
| return cc; |
| |
| case OP_ALLANY: |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| #ifdef SUPPORT_UNICODE |
| if (common->utf && common->invalid_utf) |
| { |
| read_char(common, 0, READ_CHAR_MAX, backtracks, READ_CHAR_UPDATE_STR_PTR); |
| return cc; |
| } |
| #endif /* SUPPORT_UNICODE */ |
| |
| skip_valid_char(common); |
| return cc; |
| |
| case OP_ANYBYTE: |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); |
| return cc; |
| |
| #ifdef SUPPORT_UNICODE |
| case OP_NOTPROP: |
| case OP_PROP: |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| if (cc[0] == PT_CLIST) |
| { |
| compile_clist(common, cc - 1, backtracks); |
| return cc + 2; |
| } |
| |
| propdata[0] = 0; |
| propdata[1] = type == OP_NOTPROP ? XCL_NOTPROP : XCL_PROP; |
| propdata[2] = cc[0]; |
| propdata[3] = cc[1]; |
| propdata[4] = XCL_END; |
| compile_xclass_matchingpath(common, propdata, backtracks, 0); |
| return cc + 2; |
| #endif |
| |
| case OP_ANYNL: |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| read_char(common, common->bsr_nlmin, common->bsr_nlmax, NULL, 0); |
| jump[0] = CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, CHAR_CR); |
| /* We don't need to handle soft partial matching case. */ |
| end_list = NULL; |
| if (common->mode != PCRE2_JIT_PARTIAL_HARD) |
| add_jump(compiler, &end_list, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); |
| else |
| check_str_end(common, &end_list); |
| OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, CHAR_NL); |
| OP_FLAGS(SLJIT_MOV, TMP1, 0, SLJIT_EQUAL); |
| #if PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 |
| OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, UCHAR_SHIFT); |
| #endif |
| OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); |
| jump[1] = JUMP(SLJIT_JUMP); |
| JUMPHERE(jump[0]); |
| check_newlinechar(common, common->bsr_nltype, backtracks, FALSE); |
| set_jumps(end_list, LABEL()); |
| JUMPHERE(jump[1]); |
| return cc; |
| |
| case OP_NOT_HSPACE: |
| case OP_HSPACE: |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| |
| if (type == OP_NOT_HSPACE) |
| read_char(common, 0x1, 0x3000, backtracks, READ_CHAR_UPDATE_STR_PTR); |
| else |
| read_char(common, 0x1, 0x3000, NULL, 0); |
| |
| add_jump(compiler, &common->hspace, JUMP(SLJIT_FAST_CALL)); |
| sljit_set_current_flags(compiler, SLJIT_SET_Z); |
| add_jump(compiler, backtracks, JUMP(type == OP_NOT_HSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO)); |
| return cc; |
| |
| case OP_NOT_VSPACE: |
| case OP_VSPACE: |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| |
| if (type == OP_NOT_VSPACE) |
| read_char(common, 0x1, 0x2029, backtracks, READ_CHAR_UPDATE_STR_PTR); |
| else |
| read_char(common, 0x1, 0x2029, NULL, 0); |
| |
| add_jump(compiler, &common->vspace, JUMP(SLJIT_FAST_CALL)); |
| sljit_set_current_flags(compiler, SLJIT_SET_Z); |
| add_jump(compiler, backtracks, JUMP(type == OP_NOT_VSPACE ? SLJIT_NOT_ZERO : SLJIT_ZERO)); |
| return cc; |
| |
| #ifdef SUPPORT_UNICODE |
| case OP_EXTUNI: |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| |
| SLJIT_ASSERT(TMP1 == SLJIT_R0 && STR_PTR == SLJIT_R1); |
| OP1(SLJIT_MOV, SLJIT_R0, 0, ARGUMENTS, 0); |
| |
| #if PCRE2_CODE_UNIT_WIDTH != 32 |
| sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2(W, W, W), SLJIT_IMM, |
| common->utf ? (common->invalid_utf ? SLJIT_FUNC_ADDR(do_extuni_utf_invalid) : SLJIT_FUNC_ADDR(do_extuni_utf)) : SLJIT_FUNC_ADDR(do_extuni_no_utf)); |
| if (common->invalid_utf) |
| add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0)); |
| #else |
| sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2(W, W, W), SLJIT_IMM, |
| common->invalid_utf ? SLJIT_FUNC_ADDR(do_extuni_utf_invalid) : SLJIT_FUNC_ADDR(do_extuni_no_utf)); |
| if (common->invalid_utf) |
| add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0)); |
| #endif |
| |
| OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0); |
| |
| if (common->mode == PCRE2_JIT_PARTIAL_HARD) |
| { |
| jump[0] = CMP(SLJIT_LESS, SLJIT_RETURN_REG, 0, STR_END, 0); |
| /* Since we successfully read a char above, partial matching must occur. */ |
| check_partial(common, TRUE); |
| JUMPHERE(jump[0]); |
| } |
| return cc; |
| #endif |
| |
| case OP_CHAR: |
| case OP_CHARI: |
| length = 1; |
| #ifdef SUPPORT_UNICODE |
| if (common->utf && HAS_EXTRALEN(*cc)) length += GET_EXTRALEN(*cc); |
| #endif |
| |
| if (check_str_ptr && common->mode != PCRE2_JIT_COMPLETE) |
| detect_partial_match(common, backtracks); |
| |
| if (type == OP_CHAR || !char_has_othercase(common, cc) || char_get_othercase_bit(common, cc) != 0) |
| { |
| OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(length)); |
| if (length > 1 || (check_str_ptr && common->mode == PCRE2_JIT_COMPLETE)) |
| add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0)); |
| |
| context.length = IN_UCHARS(length); |
| context.sourcereg = -1; |
| #if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED |
| context.ucharptr = 0; |
| #endif |
| return byte_sequence_compare(common, type == OP_CHARI, cc, &context, backtracks); |
| } |
| |
| #ifdef SUPPORT_UNICODE |
| if (common->utf) |
| { |
| GETCHAR(c, cc); |
| } |
| else |
| #endif |
| c = *cc; |
| |
| SLJIT_ASSERT(type == OP_CHARI && char_has_othercase(common, cc)); |
| |
| if (check_str_ptr && common->mode == PCRE2_JIT_COMPLETE) |
| add_jump(compiler, backtracks, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0)); |
| |
| oc = char_othercase(common, c); |
| read_char(common, c < oc ? c : oc, c > oc ? c : oc, NULL, 0); |
| |
| SLJIT_ASSERT(!is_powerof2(c ^ oc)); |
| |
| if (sljit_has_cpu_feature(SLJIT_HAS_CMOV)) |
| { |
| OP2U(SLJIT_SUB | SLJIT_SET_Z, TMP1, 0, SLJIT_IMM, oc); |
| SELECT(SLJIT_EQUAL, TMP1, SLJIT_IMM, c, TMP1); |
| add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, c)); |
| } |
| else |
| { |
| jump[0] = CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c); |
| add_jump(compiler, backtracks, CMP(SLJIT_NOT_EQUAL, TMP1, 0, SLJIT_IMM, oc)); |
| JUMPHERE(jump[0]); |
| } |
| return cc + length; |
| |
| case OP_NOT: |
| case OP_NOTI: |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| |
| length = 1; |
| #ifdef SUPPORT_UNICODE |
| if (common->utf) |
| { |
| #if PCRE2_CODE_UNIT_WIDTH == 8 |
| c = *cc; |
| if (c < 128 && !common->invalid_utf) |
| { |
| OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(STR_PTR), 0); |
| if (type == OP_NOT || !char_has_othercase(common, cc)) |
| add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c)); |
| else |
| { |
| /* Since UTF8 code page is fixed, we know that c is in [a-z] or [A-Z] range. */ |
| OP2(SLJIT_OR, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x20); |
| add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, c | 0x20)); |
| } |
| /* Skip the variable-length character. */ |
| OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1)); |
| jump[0] = CMP(SLJIT_LESS, TMP1, 0, SLJIT_IMM, 0xc0); |
| OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)PRIV(utf8_table4) - 0xc0); |
| OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0); |
| JUMPHERE(jump[0]); |
| return cc + 1; |
| } |
| else |
| #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ |
| { |
| GETCHARLEN(c, cc, length); |
| } |
| } |
| else |
| #endif /* SUPPORT_UNICODE */ |
| c = *cc; |
| |
| if (type == OP_NOT || !char_has_othercase(common, cc)) |
| { |
| read_char(common, c, c, backtracks, READ_CHAR_UPDATE_STR_PTR); |
| add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c)); |
| } |
| else |
| { |
| oc = char_othercase(common, c); |
| read_char(common, c < oc ? c : oc, c > oc ? c : oc, backtracks, READ_CHAR_UPDATE_STR_PTR); |
| bit = c ^ oc; |
| if (is_powerof2(bit)) |
| { |
| OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, bit); |
| add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c | bit)); |
| } |
| else |
| { |
| add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, c)); |
| add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, TMP1, 0, SLJIT_IMM, oc)); |
| } |
| } |
| return cc + length; |
| |
| case OP_CLASS: |
| case OP_NCLASS: |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| |
| #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 |
| bit = (common->utf && is_char7_bitset((const sljit_u8 *)cc, type == OP_NCLASS)) ? 127 : 255; |
| if (type == OP_NCLASS) |
| read_char(common, 0, bit, backtracks, READ_CHAR_UPDATE_STR_PTR); |
| else |
| read_char(common, 0, bit, NULL, 0); |
| #else |
| if (type == OP_NCLASS) |
| read_char(common, 0, 255, backtracks, READ_CHAR_UPDATE_STR_PTR); |
| else |
| read_char(common, 0, 255, NULL, 0); |
| #endif |
| |
| if (optimize_class(common, (const sljit_u8 *)cc, type == OP_NCLASS, FALSE, backtracks)) |
| return cc + 32 / sizeof(PCRE2_UCHAR); |
| |
| #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 |
| jump[0] = NULL; |
| if (common->utf) |
| { |
| jump[0] = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, bit); |
| if (type == OP_CLASS) |
| { |
| add_jump(compiler, backtracks, jump[0]); |
| jump[0] = NULL; |
| } |
| } |
| #elif PCRE2_CODE_UNIT_WIDTH != 8 |
| jump[0] = CMP(SLJIT_GREATER, TMP1, 0, SLJIT_IMM, 255); |
| if (type == OP_CLASS) |
| { |
| add_jump(compiler, backtracks, jump[0]); |
| jump[0] = NULL; |
| } |
| #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8 */ |
| |
| OP2(SLJIT_AND, TMP2, 0, TMP1, 0, SLJIT_IMM, 0x7); |
| OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, SLJIT_IMM, 3); |
| OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)cc); |
| OP2(SLJIT_SHL, TMP2, 0, SLJIT_IMM, 1, TMP2, 0); |
| OP2U(SLJIT_AND | SLJIT_SET_Z, TMP1, 0, TMP2, 0); |
| add_jump(compiler, backtracks, JUMP(SLJIT_ZERO)); |
| |
| #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8 |
| if (jump[0] != NULL) |
| JUMPHERE(jump[0]); |
| #endif |
| return cc + 32 / sizeof(PCRE2_UCHAR); |
| |
| #if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH == 16 || PCRE2_CODE_UNIT_WIDTH == 32 |
| case OP_XCLASS: |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| compile_xclass_matchingpath(common, cc + LINK_SIZE, backtracks, 0); |
| return cc + GET(cc, 0) - 1; |
| |
| case OP_ECLASS: |
| if (check_str_ptr) |
| detect_partial_match(common, backtracks); |
| return compile_eclass_matchingpath(common, cc, backtracks); |
| #endif |
| } |
| SLJIT_UNREACHABLE(); |
| return cc; |
| } |
| |
| static SLJIT_INLINE PCRE2_SPTR compile_charn_matchingpath(compiler_common *common, PCRE2_SPTR cc, PCRE2_SPTR ccend, jump_list **backtracks) |
| { |
| /* This function consumes at least one input character. */ |
| /* To decrease the number of length checks, we try to concatenate the fixed length character sequences. */ |
| DEFINE_COMPILER; |
| PCRE2_SPTR ccbegin = cc; |
| compare_context context; |
| int size; |
| |
| context.length = 0; |
| do |
| { |
| if (cc >= ccend) |
| break; |
| |
| if (*cc == OP_CHAR) |
| { |
| size = 1; |
| #ifdef SUPPORT_UNICODE |
| if (common->utf && HAS_EXTRALEN(cc[1])) |
| size += GET_EXTRALEN(cc[1]); |
| #endif |
| } |
| else if (*cc == OP_CHARI) |
| { |
| size = 1; |
| #ifdef SUPPORT_UNICODE |
| if (common->utf) |
| { |
| if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0) |
| size = 0; |
| else if (HAS_EXTRALEN(cc[1])) |
| size += GET_EXTRALEN(cc[1]); |
| } |
| else |
| #endif |
| if (char_has_othercase(common, cc + 1) && char_get_othercase_bit(common, cc + 1) == 0) |
| size = 0; |
| } |
| else |
| size = 0; |
| |
| cc += 1 + size; |
| context.length += IN_UCHARS(size); |
| } |
| while (size > 0 && context.length <= 128); |
| |
| cc = ccbegin; |
| if (context.length > 0) |
| { |
| /* We have a fixed-length byte sequence. */ |
| OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, context.length); |
| add_jump(compiler, backtracks, CMP(SLJIT_GREATER, STR_PTR, 0, STR_END, 0)); |
| |
| context.sourcereg = -1; |
| #if defined SLJIT_UNALIGNED && SLJIT_UNALIGNED |
| context.ucharptr = 0; |
| #endif |
| do cc = byte_sequence_compare(common, *cc == OP_CHARI, cc + 1, &context, backtracks); while (context.length > 0); |
| return cc; |
| } |
| |
| /* A non-fixed length character will be checked if length == 0. */ |
| return compile_char1_matchingpath(common, *cc, cc + 1, backtracks, TRUE); |
| } |
| |
| |