libregexp.c - third_party/quickjs - Git at Google

 /*
  * Regular Expression Engine
  *
  * Copyright (c) 2017-2018 Fabrice Bellard
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
  * in the Software without restriction, including without limitation the rights
  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  * copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in
  * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
 #include <stdlib.h>
 #include <stdio.h>
 #include <stdarg.h>
 #include <inttypes.h>
 #include <string.h>
 #include <assert.h>

 #include "cutils.h"
 #include "libregexp.h"

 /*
   TODO:

   - Add full unicode canonicalize rules for character ranges (not
     really useful but needed for exact "ignorecase" compatibility).

   - Add a lock step execution mode (=linear time execution guaranteed)
     when the regular expression is "simple" i.e. no backreference nor
     complicated lookahead. The opcodes are designed for this execution
     model.
 */

 #if defined(TEST)
 #define DUMP_REOP
 #endif

 /* Opcode identifiers. One REOP_<id> enumerator is generated for each
    DEF(id, size) entry of "libregexp-opcode.h"; REOP_COUNT follows the
    last generated enumerator and is used as the size of
    reopcode_info[]. */
 typedef enum {
 #define DEF(id, size) REOP_ ## id,
 #include "libregexp-opcode.h"
 #undef DEF
     REOP_COUNT,
 } REOPCodeEnum;

 /* upper bounds; CAPTURE_COUNT_MAX also sizes the per-capture bitmap
    used by re_check_advance() */
 #define CAPTURE_COUNT_MAX 255
 #define STACK_SIZE_MAX 255

 /* unicode code points */
 #define CP_LS   0x2028
 #define CP_PS   0x2029

 /* size of the error message / scratch buffer of REParseState and of
    the group name buffer in re_parse_captures() */
 #define TMP_BUF_SIZE 128

 /* State shared by the re_parse_*() and re_emit_*() helpers while a
    pattern is being compiled. */
 typedef struct {
     DynBuf byte_code; /* output buffer the re_emit_*() helpers append to */
     const uint8_t *buf_ptr; /* read position used by re_parse_term() */
     const uint8_t *buf_end; /* end of the pattern source */
     const uint8_t *buf_start; /* start of the pattern source */
     int re_flags;
     BOOL is_utf16; /* selects the stricter escape/class rules in get_class_atom() */
     BOOL ignore_case; /* when set, character classes go through cr_canonicalize() */
     BOOL dotall;
     int capture_count;
     int total_capture_count; /* -1 = not computed yet */
     int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
     void *mem_opaque; /* handed to cr_init() together with lre_realloc */
     DynBuf group_names; /* consecutive NUL terminated names, see find_group_name() */
     union {
         char error_msg[TMP_BUF_SIZE]; /* written by re_parse_error() */
         char tmp_buf[TMP_BUF_SIZE];
     } u;
 } REParseState;

 /* Static description of one opcode. 'name' only exists in builds
    with DUMP_REOP. 'size' is the base length in bytes of the
    instruction (opcode byte included); the range opcodes add their
    variable part on top of it. */
 typedef struct {
 #ifdef DUMP_REOP
     const char *name;
 #endif
     uint8_t size;
 } REOpCode;

 /* Opcode table indexed by REOPCodeEnum, generated from the same
    DEF() list as the enum so both stay in sync. */
 static const REOpCode reopcode_info[REOP_COUNT] = {
 #ifdef DUMP_REOP
 #define DEF(id, size) { #id, size },
 #else
 #define DEF(id, size) { size },
 #endif
 #include "libregexp-opcode.h"
 #undef DEF
 };

 /* Byte offsets inside the compiled regexp header. As read back by
    lre_dump_bytecode(): byte 0 = flags, byte 1 = capture count,
    byte 2 = stack size, bytes 3..6 = bytecode length (u32). */
 #define RE_HEADER_FLAGS         0
 #define RE_HEADER_CAPTURE_COUNT 1
 #define RE_HEADER_STACK_SIZE    2

 /* total header size in bytes; the bytecode starts right after it */
 #define RE_HEADER_LEN 7

 /* Return 1 if 'c' is an ASCII decimal digit ('0'..'9'), 0 otherwise. */
 static inline int is_digit(int c)
 {
     if (c < '0')
         return 0;
     return c <= '9';
 }

 /* Open a gap of 'len' bytes at offset 'pos' of 's': the buffer is
    grown, the bytes from 'pos' up to the old end are moved up by 'len'
    and the size is updated. The content of the gap itself is not
    initialized. Return 0 if OK, -1 if the buffer could not be grown. */
 static int dbuf_insert(DynBuf *s, int pos, int len)
 {
     if (dbuf_realloc(s, s->size + len) != 0)
         return -1;
     /* the regions overlap, hence memmove() */
     memmove(&s->buf[pos + len], &s->buf[pos], s->size - pos);
     s->size += len;
     return 0;
 }

 /* Canonicalize 'c' for case insensitive matching with the specific
    JS regexp rules. With 'is_utf16' set, ASCII upper case letters map
    to lower case and other code points take the first result of
    lre_case_conv(..., 2). Otherwise ASCII lower case letters map to
    upper case and other code points use the legacy rule below. */
 static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16)
 {
     uint32_t conv[LRE_CC_RES_LEN_MAX];

     if (is_utf16) {
         if (likely(c < 128)) {
             if (c >= 'A' && c <= 'Z')
                 return c - 'A' + 'a';
             return c;
         }
         lre_case_conv(conv, c, 2);
         return conv[0];
     }

     if (likely(c < 128)) {
         if (c >= 'a' && c <= 'z')
             return c - 'a' + 'A';
         return c;
     }
     /* legacy regexp: to upper case if single char >= 128 */
     if (lre_case_conv(conv, c, FALSE) == 1 && conv[0] >= 128)
         return conv[0];
     return c;
 }

 /* \d class. Layout shared by the char_range_* tables: the first
    element is the number of ranges, followed by that many
    (first, last + 1) pairs. */
 static const uint16_t char_range_d[] = {
     1,
     0x0030, 0x0039 + 1,
 };

 /* \s class: code point ranges for the Zs, Zl or Zp property plus the
    control characters 0x09..0x0D and U+FEFF. Same layout as
    char_range_d (count, then (first, last + 1) pairs). The pairs are
    in increasing order, which lre_is_space() relies on to stop
    early. */
 static const uint16_t char_range_s[] = {
     10,
     0x0009, 0x000D + 1,
     0x0020, 0x0020 + 1,
     0x00A0, 0x00A0 + 1,
     0x1680, 0x1680 + 1,
     0x2000, 0x200A + 1,
     /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
     /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
     0x2028, 0x2029 + 1,
     0x202F, 0x202F + 1,
     0x205F, 0x205F + 1,
     0x3000, 0x3000 + 1,
     /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
     0xFEFF, 0xFEFF + 1,
 };

 /* Return TRUE if the code point 'c' belongs to one of the ranges of
    char_range_s[], FALSE otherwise. */
 BOOL lre_is_space(int c)
 {
     const uint16_t *pair = &char_range_s[1];
     int pair_count, k;

     pair_count = (countof(char_range_s) - 1) / 2;
     for (k = 0; k < pair_count; k++, pair += 2) {
         /* the table is sorted: below the current start means 'c'
            lies in a gap between two ranges */
         if (c < pair[0])
             return FALSE;
         /* pair[1] is one past the last code point of the range */
         if (c < pair[1])
             return TRUE;
     }
     return FALSE;
 }

 /* 128 bit bitmap over the ASCII range: bit (c & 31) of word (c >> 5)
    is set for the characters listed below. */
 uint32_t const lre_id_start_table_ascii[4] = {
     /* $ A-Z _ a-z */
     0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
 };

 /* 128 bit bitmap over the ASCII range: bit (c & 31) of word (c >> 5)
    is set for the characters listed below. */
 uint32_t const lre_id_continue_table_ascii[4] = {
     /* $ 0-9 A-Z _ a-z */
     0x00000000, 0x03FF0010, 0x87FFFFFE, 0x07FFFFFE
 };


 /* \w class (0-9, A-Z, '_', a-z): number of ranges followed by
    (first, last + 1) pairs. */
 static const uint16_t char_range_w[] = {
     4,
     0x0030, 0x0039 + 1,
     0x0041, 0x005A + 1,
     0x005F, 0x005F + 1,
     0x0061, 0x007A + 1,
 };

 /* value returned by get_class_atom() when the atom is a character
    class (its ranges are then stored in 'cr') instead of a single
    character */
 #define CLASS_RANGE_BASE 0x40000000

 /* Predefined character classes. cr_init_char_range() decodes the
    value as (index in char_range_table[]) * 2 + invert, so every
    upper case (negated) entry must directly follow its lower case
    counterpart. */
 typedef enum {
     CHAR_RANGE_d,
     CHAR_RANGE_D,
     CHAR_RANGE_s,
     CHAR_RANGE_S,
     CHAR_RANGE_w,
     CHAR_RANGE_W,
 } CharRangeEnum;

 /* range tables indexed by (CharRangeEnum value >> 1) */
 static const uint16_t *char_range_table[] = {
     char_range_d,
     char_range_s,
     char_range_w,
 };

 /* Initialize 'cr' with the predefined class 'c' (a CharRangeEnum
    value): bits 1.. select the table in char_range_table[], bit 0
    requests the inverted class. Return 0 if OK. On failure 'cr' is
    freed and -1 is returned. */
 static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
 {
     const uint16_t *table = char_range_table[c >> 1];
     BOOL negate = c & 1;
     int point_count, k;

     /* table[0] is the number of ranges, i.e. half the point count */
     point_count = table[0] * 2;
     cr_init(cr, s->mem_opaque, lre_realloc);
     for (k = 0; k < point_count; k++) {
         if (cr_add_point(cr, table[1 + k]))
             goto fail;
     }
     if (negate && cr_invert(cr))
         goto fail;
     return 0;
  fail:
     cr_free(cr);
     return -1;
 }

 /* Case insensitive support for character classes: add to 'cr' the
    upper case counterpart of every ASCII lower case letter it
    contains. Return 0 if OK, otherwise the non zero status of the
    failing cr_op()/cr_union1() call. */
 static int cr_canonicalize(CharRange *cr)
 {
     CharRange lower;
     uint32_t az[2] = { 'a', 'z' + 1 };
     int k, ret;

     cr_init(&lower, cr->mem_opaque, lre_realloc);
     /* lower = cr restricted to [a-z] */
     ret = cr_op(&lower, cr->points, cr->len, az, 2, CR_OP_INTER);
     if (ret == 0) {
         /* convert to upper case */
         /* XXX: the generic unicode case would be much more complicated
            and not really useful */
         for (k = 0; k < lower.len; k++)
             lower.points[k] += 'A' - 'a';
         /* Note: for simplicity we keep the lower case ranges */
         ret = cr_union1(cr, lower.points, lower.len);
     }
     cr_free(&lower);
     return ret;
 }

 #ifdef DUMP_REOP
 /* Print a human readable listing of the compiled regexp 'buf' of
    'buf_len' bytes on stdout (only built with DUMP_REOP).

    The header (flags, capture count, stack size, bytecode length) is
    printed first, then the group names when LRE_FLAG_NAMED_GROUPS is
    set, then one line per instruction. Jump operands are printed as
    absolute bytecode offsets. The listing stops at the first invalid
    opcode or at an instruction that would run past the bytecode. */
 static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
                                                      int buf_len)
 {
     int pos, len, opcode, bc_len, re_flags, i;
     uint32_t val;

     assert(buf_len >= RE_HEADER_LEN);

     re_flags = buf[0];
     bc_len = get_u32(buf + 3);
     assert(bc_len + RE_HEADER_LEN <= buf_len);
     printf("flags: 0x%x capture_count=%d stack_size=%d\n",
            re_flags, buf[1], buf[2]);
     if (re_flags & LRE_FLAG_NAMED_GROUPS) {
         const char *p;
         /* the names are stored right after the bytecode as
            consecutive NUL terminated strings, one per capture group
            (group 0 excluded) */
         p = (char *)buf + RE_HEADER_LEN + bc_len;
         printf("named groups: ");
         for(i = 1; i < buf[1]; i++) {
             if (i != 1)
                 printf(",");
             printf("<%s>", p);
             p += strlen(p) + 1;
         }
         printf("\n");
         assert(p == (char *)(buf + buf_len));
     }
     printf("bytecode_len=%d\n", bc_len);

     buf += RE_HEADER_LEN;
     pos = 0;
     while (pos < bc_len) {
         printf("%5d: ", pos);
         opcode = buf[pos];
         /* validate the opcode before using it as an index in
            reopcode_info[], otherwise a corrupted byte would cause an
            out of bounds read */
         if (opcode >= REOP_COUNT) {
             printf(" invalid opcode=0x%02x\n", opcode);
             break;
         }
         len = reopcode_info[opcode].size;
         if ((pos + len) > bc_len) {
             printf(" buffer overflow (opcode=0x%02x)\n", opcode);
             break;
         }
         printf("%s", reopcode_info[opcode].name);
         switch(opcode) {
         case REOP_char:
             val = get_u16(buf + pos + 1);
             if (val >= ' ' && val <= 126)
                 printf(" '%c'", (int)val);
             else
                 printf(" 0x%04x", val);
             break;
         case REOP_char32:
             val = get_u32(buf + pos + 1);
             if (val >= ' ' && val <= 126)
                 printf(" '%c'", (int)val);
             else
                 printf(" 0x%08x", val);
             break;
         case REOP_goto:
         case REOP_split_goto_first:
         case REOP_split_next_first:
         case REOP_loop:
         case REOP_lookahead:
         case REOP_negative_lookahead:
         case REOP_bne_char_pos:
             /* the operand is relative to the end of the 5 byte
                instruction */
             val = get_u32(buf + pos + 1);
             val += (pos + 5);
             printf(" %u", val);
             break;
         case REOP_simple_greedy_quant:
             printf(" %u %u %u %u",
                    get_u32(buf + pos + 1) + (pos + 17),
                    get_u32(buf + pos + 1 + 4),
                    get_u32(buf + pos + 1 + 8),
                    get_u32(buf + pos + 1 + 12));
             break;
         case REOP_save_start:
         case REOP_save_end:
         case REOP_back_reference:
         case REOP_backward_back_reference:
             printf(" %u", buf[pos + 1]);
             break;
         case REOP_save_reset:
             printf(" %u %u", buf[pos + 1], buf[pos + 2]);
             break;
         case REOP_push_i32:
             val = get_u32(buf + pos + 1);
             printf(" %d", (int)val);
             break;
         case REOP_range:
             {
                 /* u16 pair count followed by 16 bit (low, high)
                    pairs */
                 int n, i;
                 n = get_u16(buf + pos + 1);
                 len += n * 4;
                 for(i = 0; i < n * 2; i++) {
                     val = get_u16(buf + pos + 3 + i * 2);
                     printf(" 0x%04x", val);
                 }
             }
             break;
         case REOP_range32:
             {
                 /* u16 pair count followed by 32 bit (low, high)
                    pairs */
                 int n, i;
                 n = get_u16(buf + pos + 1);
                 len += n * 8;
                 for(i = 0; i < n * 2; i++) {
                     val = get_u32(buf + pos + 3 + i * 4);
                     printf(" 0x%08x", val);
                 }
             }
             break;
         default:
             break;
         }
         printf("\n");
         pos += len;
     }
 }
 #endif

 /* Append the operand-less opcode 'op' to the bytecode. */
 static void re_emit_op(REParseState *s, int op)
 {
     DynBuf *bc = &s->byte_code;

     dbuf_putc(bc, op);
 }

 /* Append the opcode 'op' followed by the 32 bit operand 'val'.
    Return the bytecode offset of the operand. */
 static int re_emit_op_u32(REParseState *s, int op, uint32_t val)
 {
     DynBuf *bc = &s->byte_code;
     int operand_pos;

     dbuf_putc(bc, op);
     operand_pos = bc->size;
     dbuf_put_u32(bc, val);
     return operand_pos;
 }

 /* Append the jump opcode 'op' targeting the absolute bytecode offset
    'val'. The stored 32 bit operand is relative to the end of the
    instruction. Return the bytecode offset of the operand. */
 static int re_emit_goto(REParseState *s, int op, uint32_t val)
 {
     DynBuf *bc = &s->byte_code;
     int operand_pos;
     uint32_t rel;

     dbuf_putc(bc, op);
     operand_pos = bc->size;
     rel = val - (operand_pos + 4);
     dbuf_put_u32(bc, rel);
     return operand_pos;
 }

 /* Append the opcode 'op' followed by the single byte operand 'val'. */
 static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
 {
     DynBuf *bc = &s->byte_code;

     dbuf_putc(bc, op);
     dbuf_putc(bc, val);
 }

 /* Append the opcode 'op' followed by the 16 bit operand 'val'. */
 static void re_emit_op_u16(REParseState *s, int op, uint32_t val)
 {
     DynBuf *bc = &s->byte_code;

     dbuf_putc(bc, op);
     dbuf_put_u16(bc, val);
 }

 /* Format a printf-like message into s->u.error_msg (truncated to the
    buffer size). Always return -1 so that callers can write
    'return re_parse_error(...)'. */
 static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...)
 {
     va_list args;

     va_start(args, fmt);
     vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, args);
     va_end(args);
     return -1;
 }

 /* Record the "out of memory" error message in 's'. Always return -1. */
 static int re_parse_out_of_memory(REParseState *s)
 {
     int ret;

     ret = re_parse_error(s, "out of memory");
     return ret;
 }

 /* Parse the (possibly empty) run of ASCII decimal digits at '*pp' and
    return its value; '*pp' is advanced past the digits. Once the value
    reaches INT32_MAX: if allow_overflow is false, return -1 (and leave
    '*pp' unchanged). Otherwise the value saturates at INT32_MAX. */
 static int parse_digits(const uint8_t **pp, BOOL allow_overflow)
 {
     const uint8_t *cur = *pp;
     uint64_t acc = 0;

     while (*cur >= '0' && *cur <= '9') {
         acc = acc * 10 + *cur - '0';
         if (acc >= INT32_MAX) {
             if (!allow_overflow)
                 return -1;
             acc = INT32_MAX;
         }
         cur++;
     }
     *pp = cur;
     return acc;
 }

 /* Check that the byte at '*pp' is 'c' and skip it. Return 0 if OK.
    Otherwise '*pp' is left unchanged, an error message is recorded
    and -1 is returned. */
 static int re_parse_expect(REParseState *s, const uint8_t **pp, int c)
 {
     const uint8_t *cur = *pp;

     if (*cur != c)
         return re_parse_error(s, "expecting '%c'", c);
     *pp = cur + 1;
     return 0;
 }

 /* Parse an escape sequence, *pp points after the '\':
    allow_utf16 value:
    0 : no UTF-16 escapes allowed
    1 : UTF-16 escapes allowed
    2 : UTF-16 escapes allowed and escapes of surrogate pairs are
    converted to a unicode character (unicode regexp case).

    Handled escapes: \b \f \n \r \t \v, \xHH, \uHHHH, the braced
    form {H...} (only when allow_utf16 != 0) and \0..\7.

    Return the unicode char and update *pp if recognized,
    return -1 if malformed escape,
    return -2 otherwise. *pp is only written on success. */
 int lre_parse_escape(const uint8_t **pp, int allow_utf16)
 {
     const uint8_t *p;
     uint32_t c;

     p = *pp;
     c = *p++;
     switch(c) {
     case 'b':
         c = '\b';
         break;
     case 'f':
         c = '\f';
         break;
     case 'n':
         c = '\n';
         break;
     case 'r':
         c = '\r';
         break;
     case 't':
         c = '\t';
         break;
     case 'v':
         c = '\v';
         break;
     case 'x':
     case 'u':
         {
             int h, n, i;
             uint32_t c1;

             /* Note: the braced form is accepted after both 'x' and
                'u', the test below does not look at 'c' */
             if (*p == '{' && allow_utf16) {
                 p++;
                 c = 0;
                 /* at least one hex digit is required; the value is
                    checked against 0x10FFFF after each digit so it
                    cannot overflow */
                 for(;;) {
                     h = from_hex(*p++);
                     if (h < 0)
                         return -1;
                     c = (c << 4) | h;
                     if (c > 0x10FFFF)
                         return -1;
                     if (*p == '}')
                         break;
                 }
                 p++;
             } else {
                 /* fixed width form: exactly 2 hex digits after 'x',
                    4 after 'u' */
                 if (c == 'x') {
                     n = 2;
                 } else {
                     n = 4;
                 }

                 c = 0;
                 for(i = 0; i < n; i++) {
                     h = from_hex(*p++);
                     if (h < 0) {
                         return -1;
                     }
                     c = (c << 4) | h;
                 }
                 if (c >= 0xd800 && c < 0xdc00 &&
                     allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
                     /* convert an escaped surrogate pair into a
                        unicode char */
                     c1 = 0;
                     for(i = 0; i < 4; i++) {
                         h = from_hex(p[2 + i]);
                         if (h < 0)
                             break;
                         c1 = (c1 << 4) | h;
                     }
                     /* only consume the second escape when it is a
                        valid low surrogate; otherwise the high
                        surrogate is returned alone */
                     if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) {
                         p += 6;
                         c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
                     }
                 }
             }
         }
         break;
     case '0': case '1': case '2': case '3':
     case '4': case '5': case '6': case '7':
         c -= '0';
         if (allow_utf16 == 2) {
             /* only accept \0 not followed by digit */
             if (c != 0 || is_digit(*p))
                 return -1;
         } else {
             /* parse a legacy octal sequence */
             /* up to three octal digits; the third one is only taken
                when the first two give a value < 32, which keeps the
                result below 256. 'v' is unsigned so any non octal
                byte gives v > 7 */
             uint32_t v;
             v = *p - '0';
             if (v > 7)
                 break;
             c = (c << 3) | v;
             p++;
             if (c >= 32)
                 break;
             v = *p - '0';
             if (v > 7)
                 break;
             c = (c << 3) | v;
             p++;
         }
         break;
     default:
         return -2;
     }
     *pp = p;
     return c;
 }

 #ifdef CONFIG_ALL_UNICODE
 /* Return true if 'c' may appear in a unicode property name or value:
    ASCII letter, ASCII digit or '_'. */
 /* XXX: we use the same chars for name and value */
 static BOOL is_unicode_char(int c)
 {
     BOOL is_letter = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
     BOOL is_number = (c >= '0' && c <= '9');

     return is_number || is_letter || (c == '_');
 }

 /* Parse the '{name}' or '{name=value}' part of a \p or \P escape
    ('*pp' points to the expected '{') and store the corresponding
    code point set in 'cr', inverted when 'is_inv' is true.

    Accepted forms: Script/sc=..., Script_Extensions/scx=...,
    General_Category/gc=..., or a lone name which is tried first as a
    general category and then as a binary property.

    Return 0 if OK ('cr' is initialized and '*pp' is updated past the
    '}'). Return -1 on error: an error message is recorded, 'cr' is
    freed if it was initialized and '*pp' is left unchanged. */
 static int parse_unicode_property(REParseState *s, CharRange *cr,
                                   const uint8_t **pp, BOOL is_inv)
 {
     const uint8_t *p;
     char name[64], value[64];
     char *q;
     BOOL script_ext;
     int ret;

     p = *pp;
     if (*p != '{')
         return re_parse_error(s, "expecting '{' after \\p");
     p++;
     q = name;
     while (is_unicode_char(*p)) {
         /* always keep one byte for the trailing '\0' */
         if ((size_t)(q - name) >= sizeof(name) - 1)
             goto unknown_property_name;
         *q++ = *p++;
     }
     *q = '\0';
     q = value;
     if (*p == '=') {
         p++;
         while (is_unicode_char(*p)) {
             /* always keep one byte for the trailing '\0' */
             if ((size_t)(q - value) >= sizeof(value) - 1)
                 return re_parse_error(s, "unknown unicode property value");
             *q++ = *p++;
         }
     }
     *q = '\0';
     if (*p != '}')
         return re_parse_error(s, "expecting '}'");
     p++;
     //    printf("name=%s value=%s\n", name, value);

     if (!strcmp(name, "Script") || !strcmp(name, "sc")) {
         script_ext = FALSE;
         goto do_script;
     } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
         script_ext = TRUE;
     do_script:
         cr_init(cr, s->mem_opaque, lre_realloc);
         ret = unicode_script(cr, value, script_ext);
         if (ret) {
             cr_free(cr);
             if (ret == -2)
                 return re_parse_error(s, "unknown unicode script");
             else
                 goto out_of_memory;
         }
     } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
         cr_init(cr, s->mem_opaque, lre_realloc);
         ret = unicode_general_category(cr, value);
         if (ret) {
             cr_free(cr);
             if (ret == -2)
                 return re_parse_error(s, "unknown unicode general category");
             else
                 goto out_of_memory;
         }
     } else if (value[0] == '\0') {
         /* lone name: general category first, then binary property */
         cr_init(cr, s->mem_opaque, lre_realloc);
         ret = unicode_general_category(cr, name);
         if (ret == -1) {
             cr_free(cr);
             goto out_of_memory;
         }
         if (ret < 0) {
             ret = unicode_prop(cr, name);
             if (ret) {
                 cr_free(cr);
                 if (ret == -2)
                     goto unknown_property_name;
                 else
                     goto out_of_memory;
             }
         }
     } else {
     unknown_property_name:
         return re_parse_error(s, "unknown unicode property name");
     }

     if (is_inv) {
         if (cr_invert(cr)) {
             cr_free(cr);
             /* report the failure like the other allocation errors
                instead of returning -1 without a message */
             goto out_of_memory;
         }
     }
     *pp = p;
     return 0;
  out_of_memory:
     return re_parse_out_of_memory(s);
 }
 #endif /* CONFIG_ALL_UNICODE */

 /* Parse one atom at '*pp': a plain (possibly UTF-8 encoded)
    character, an escaped character or a class escape (\d \D \s \S \w
    \W and, in unicode mode with CONFIG_ALL_UNICODE, \p{..} \P{..}).
    'inclass' tells that the atom is parsed inside a [...] class.

    return -1 if error otherwise the character or a class range
    (CLASS_RANGE_BASE). In case of class range, 'cr' is
    initialized. Otherwise, it is ignored. '*pp' is only updated on
    success. */
 static int get_class_atom(REParseState *s, CharRange *cr,
                           const uint8_t **pp, BOOL inclass)
 {
     const uint8_t *p;
     uint32_t c;
     int ret;

     p = *pp;

     c = *p;
     switch(c) {
     case '\\':
         p++;
         if (p >= s->buf_end)
             goto unexpected_end;
         c = *p++;
         switch(c) {
         /* the upper case variants are the inverted classes, see
            cr_init_char_range() */
         case 'd':
             c = CHAR_RANGE_d;
             goto class_range;
         case 'D':
             c = CHAR_RANGE_D;
             goto class_range;
         case 's':
             c = CHAR_RANGE_s;
             goto class_range;
         case 'S':
             c = CHAR_RANGE_S;
             goto class_range;
         case 'w':
             c = CHAR_RANGE_w;
             goto class_range;
         case 'W':
             c = CHAR_RANGE_W;
         class_range:
             if (cr_init_char_range(s, cr, c))
                 return -1;
             c = CLASS_RANGE_BASE;
             break;
         case 'c':
             /* control escape: \c followed by a letter gives the
                letter code & 0x1f. Digits and '_' are only accepted
                inside a class in non unicode mode */
             c = *p;
             if ((c >= 'a' && c <= 'z') ||
                 (c >= 'A' && c <= 'Z') ||
                 (((c >= '0' && c <= '9') || c == '_') &&
                  inclass && !s->is_utf16)) {   /* Annex B.1.4 */
                 c &= 0x1f;
                 p++;
             } else if (s->is_utf16) {
                 goto invalid_escape;
             } else {
                 /* otherwise return '\' and 'c' */
                 /* 'p' is moved back onto the 'c' so that it is
                    returned by the next call */
                 p--;
                 c = '\\';
             }
             break;
 #ifdef CONFIG_ALL_UNICODE
         case 'p':
         case 'P':
             if (s->is_utf16) {
                 if (parse_unicode_property(s, cr, &p, (c == 'P')))
                     return -1;
                 c = CLASS_RANGE_BASE;
                 break;
             }
             /* fall thru */
 #endif
         default:
             /* move back onto the escape character and let
                lre_parse_escape() decode it */
             p--;
             ret = lre_parse_escape(&p, s->is_utf16 * 2);
             if (ret >= 0) {
                 c = ret;
             } else {
                 /* here 'p' still points to the escape character and
                    'c' holds its value */
                 if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
                     /* always valid to escape these characters */
                     goto normal_char;
                 } else if (s->is_utf16) {
                 invalid_escape:
                     return re_parse_error(s, "invalid escape sequence in regular expression");
                 } else {
                     /* just ignore the '\' */
                     goto normal_char;
                 }
             }
             break;
         }
         break;
     case '\0':
         /* a zero byte is only an error at the end of the buffer;
            elsewhere it is a normal character */
         if (p >= s->buf_end) {
         unexpected_end:
             return re_parse_error(s, "unexpected end");
         }
         /* fall thru */
     default:
     normal_char:
         /* normal char */
         if (c >= 128) {
             /* multi byte UTF-8 sequence: decode it and advance 'p'
                past it */
             c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
             if ((unsigned)c > 0xffff && !s->is_utf16) {
                 /* XXX: should handle non BMP-1 code points */
                 return re_parse_error(s, "malformed unicode char");
             }
         } else {
             p++;
         }
         break;
     }
     *pp = p;
     return c;
 }

 /* Emit the bytecode matching the code point set 'cr':
    - REOP_range with 16 bit bounds when every finite bound fits in
      16 bits,
    - REOP_range32 with 32 bit bounds otherwise,
    - a REOP_char32 with the operand -1 when the set is empty.
    The stored upper bounds are inclusive (point - 1).
    Return 0 if OK, -1 (with an error message) if there are too many
    ranges. */
 static int re_emit_range(REParseState *s, const CharRange *cr)
 {
     int range_count, k;
     uint32_t top;

     range_count = (unsigned)cr->len / 2;
     if (range_count >= 65535)
         return re_parse_error(s, "too many ranges");

     if (range_count == 0) {
         /* not sure it can really happen. Emit a match that is always
            false */
         re_emit_op_u32(s, REOP_char32, -1);
         return 0;
     }

     /* largest finite bound: skip a final UINT32_MAX which stands for
        an open ended range */
     top = cr->points[cr->len - 1];
     if (top == UINT32_MAX)
         top = cr->points[cr->len - 2];

     if (top > 0xffff) {
         re_emit_op_u16(s, REOP_range32, range_count);
         for (k = 0; k < cr->len; k += 2) {
             dbuf_put_u32(&s->byte_code, cr->points[k]);
             dbuf_put_u32(&s->byte_code, cr->points[k + 1] - 1);
         }
         return 0;
     }

     /* can use 16 bit ranges with the conversion that 0xffff =
        infinity */
     re_emit_op_u16(s, REOP_range, range_count);
     for (k = 0; k < cr->len; k += 2) {
         uint32_t last = cr->points[k + 1] - 1;
         if (last == UINT32_MAX - 1)
             last = 0xffff;
         dbuf_put_u16(&s->byte_code, cr->points[k]);
         dbuf_put_u16(&s->byte_code, last);
     }
     return 0;
 }

 /* Parse a character class; '*pp' points to the opening '['. The
    atoms and 'a-b' intervals are accumulated in a CharRange which is
    then canonicalized (ignore case), inverted ('[^') and emitted with
    re_emit_range().
    Return 0 if OK ('*pp' is updated past the closing ']'). Return -1
    on error: an error message is recorded by this function or by the
    failing helper, and '*pp' is left unchanged. */
 static int re_parse_char_class(REParseState *s, const uint8_t **pp)
 {
     const uint8_t *p;
     uint32_t c1, c2;
     /* 'cr' accumulates the whole class. 'cr1' receives the ranges of
        a single class escape and is only initialized when
        get_class_atom() returns CLASS_RANGE_BASE */
     CharRange cr_s, *cr = &cr_s;
     CharRange cr1_s, *cr1 = &cr1_s;
     BOOL invert;

     cr_init(cr, s->mem_opaque, lre_realloc);
     p = *pp;
     p++;    /* skip '[' */
     invert = FALSE;
     if (*p == '^') {
         p++;
         invert = TRUE;
     }
     for(;;) {
         if (*p == ']')
             break;
         c1 = get_class_atom(s, cr1, &p, TRUE);
         if ((int)c1 < 0)
             goto fail;
         if (*p == '-' && p[1] != ']') {
             /* possible 'c1-c2' interval: the upper bound is parsed
                with a temporary pointer so that nothing is consumed
                when the '-' turns out to be a literal */
             const uint8_t *p0 = p + 1;
             if (c1 >= CLASS_RANGE_BASE) {
                 if (s->is_utf16) {
                     cr_free(cr1);
                     goto invalid_class_range;
                 }
                 /* Annex B: match '-' character */
                 goto class_atom;
             }
             c2 = get_class_atom(s, cr1, &p0, TRUE);
             if ((int)c2 < 0)
                 goto fail;
             if (c2 >= CLASS_RANGE_BASE) {
                 /* the ranges of the upper bound are dropped: 'p' was
                    not advanced, so the '-' and this atom are parsed
                    again by the next iterations */
                 cr_free(cr1);
                 if (s->is_utf16) {
                     goto invalid_class_range;
                 }
                 /* Annex B: match '-' character */
                 goto class_atom;
             }
             p = p0;
             if (c2 < c1) {
             invalid_class_range:
                 re_parse_error(s, "invalid class range");
                 goto fail;
             }
             if (cr_union_interval(cr, c1, c2))
                 goto memory_error;
         } else {
         class_atom:
             if (c1 >= CLASS_RANGE_BASE) {
                 int ret;
                 ret = cr_union1(cr, cr1->points, cr1->len);
                 cr_free(cr1);
                 if (ret)
                     goto memory_error;
             } else {
                 if (cr_union_interval(cr, c1, c1))
                     goto memory_error;
             }
         }
     }
     if (s->ignore_case) {
         if (cr_canonicalize(cr))
             goto memory_error;
     }
     if (invert) {
         if (cr_invert(cr))
             goto memory_error;
     }
     if (re_emit_range(s, cr))
         goto fail;
     cr_free(cr);
     p++;    /* skip ']' */
     *pp = p;
     return 0;
  memory_error:
     re_parse_out_of_memory(s);
  fail:
     cr_free(cr);
     return -1;
 }

 /* Return:
    1 if the opcodes in bc_buf[] always advance the character pointer.
    0 if the character pointer may not be advanced.
    -1 if the code may depend on side effects of its previous execution (backreference)
 */
 static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
 {
     int pos, opcode, ret, len, i;
     uint32_t val, last;
     BOOL has_back_reference;
     uint8_t capture_bitmap[CAPTURE_COUNT_MAX];

     ret = -2; /* not known yet */
     pos = 0;
     has_back_reference = FALSE;
     memset(capture_bitmap, 0, sizeof(capture_bitmap));

     while (pos < bc_buf_len) {
         opcode = bc_buf[pos];
         len = reopcode_info[opcode].size;
         switch(opcode) {
         case REOP_range:
             val = get_u16(bc_buf + pos + 1);
             len += val * 4;
             goto simple_char;
         case REOP_range32:
             val = get_u16(bc_buf + pos + 1);
             len += val * 8;
             goto simple_char;
         case REOP_char:
         case REOP_char32:
         case REOP_dot:
         case REOP_any:
         simple_char:
             if (ret == -2)
                 ret = 1;
             break;
         case REOP_line_start:
         case REOP_line_end:
         case REOP_push_i32:
         case REOP_push_char_pos:
         case REOP_drop:
         case REOP_word_boundary:
         case REOP_not_word_boundary:
         case REOP_prev:
             /* no effect */
             break;
         case REOP_save_start:
         case REOP_save_end:
             val = bc_buf[pos + 1];
             capture_bitmap[val] |= 1;
             break;
         case REOP_save_reset:
             {
                 val = bc_buf[pos + 1];
                 last = bc_buf[pos + 2];
                 while (val < last)
                     capture_bitmap[val++] |= 1;
             }
             break;
         case REOP_back_reference:
         case REOP_backward_back_reference:
             val = bc_buf[pos + 1];
             capture_bitmap[val] |= 2;
             has_back_reference = TRUE;
             break;
         default:
             /* safe behvior: we cannot predict the outcome */
             if (ret == -2)
                 ret = 0;
             break;
         }
         pos += len;
     }
     if (has_back_reference) {
         /* check if there is back reference which references a capture
            made in the some code */
         for(i = 0; i < CAPTURE_COUNT_MAX; i++) {
             if (capture_bitmap[i] == 3)
                 return -1;
         }
     }
     if (ret == -2)
         ret = 0;
     return ret;
 }

 /* Check whether the atom bc_buf[0..bc_buf_len-1] only contains
    character matching opcodes and line/word boundary assertions.
    Return -1 if a simple quantifier cannot be used (any other opcode
    is present). Otherwise return the number of characters in the
    atom. */
 static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
 {
     int pos, op, op_len;
     int char_count = 0;
     uint32_t pair_count;

     for (pos = 0; pos < bc_buf_len; pos += op_len) {
         op = bc_buf[pos];
         op_len = reopcode_info[op].size;
         switch (op) {
         case REOP_range:
         case REOP_range32:
             /* variable length: u16 pair count, then 4 or 8 bytes per
                pair */
             pair_count = get_u16(bc_buf + pos + 1);
             op_len += pair_count * (op == REOP_range ? 4 : 8);
             /* fall thru */
         case REOP_char:
         case REOP_char32:
         case REOP_dot:
         case REOP_any:
             char_count++;
             break;
         case REOP_line_start:
         case REOP_line_end:
         case REOP_word_boundary:
         case REOP_not_word_boundary:
             /* assertions do not consume any character */
             break;
         default:
             return -1;
         }
     }
     return char_count;
 }

 /* Parse a group name terminated by '>' and store it in 'buf' as a
    NUL terminated UTF-8 string. '*pp' is the first char after '<'.
    The name may contain \u escapes; its first character must satisfy
    lre_js_is_ident_first() and the following ones
    lre_js_is_ident_next().
    Return 0 if OK ('*pp' is updated past the '>'). Return -1 if the
    name is empty, malformed or too long for 'buf' ('*pp' is left
    unchanged). */
 static int re_parse_group_name(char *buf, int buf_size,
                                const uint8_t **pp, BOOL is_utf16)
 {
     const uint8_t *cur = *pp;
     char *out = buf;
     uint32_t c;
     BOOL valid;

     while ((c = *cur) != '>') {
         if (c == '\\') {
             /* only \u escapes are accepted inside a name */
             cur++;
             if (*cur != 'u')
                 return -1;
             c = lre_parse_escape(&cur, is_utf16 * 2);
         } else if (c >= 128) {
             c = unicode_from_utf8(cur, UTF8_CHAR_LEN_MAX, &cur);
         } else {
             cur++;
         }
         /* also catches the negative error codes of the two decoders,
            which become huge unsigned values */
         if (c > 0x10FFFF)
             return -1;
         if (out == buf)
             valid = lre_js_is_ident_first(c);
         else
             valid = lre_js_is_ident_next(c);
         if (!valid)
             return -1;
         /* keep room for the longest UTF-8 sequence and the final
            '\0' */
         if ((out - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
             return -1;
         if (c < 128)
             *out++ = c;
         else
             out += unicode_to_utf8((uint8_t*)out, c);
     }
     if (out == buf)
         return -1;
     *out = '\0';
     *pp = cur + 1; /* skip the '>' */
     return 0;
 }

 /* Scan the whole pattern (buf_start..buf_end) without compiling it,
    skipping escaped characters and character classes.

    if capture_name = NULL: return the number of captures + 1.
    Otherwise, return the capture index corresponding to capture_name
    or -1 if none.

    '*phas_named_captures' is set to 1 if at least one '(?<name' group
    was met before returning, 0 otherwise. */
 static int re_parse_captures(REParseState *s, int *phas_named_captures,
                              const char *capture_name)
 {
     const uint8_t *p;
     int capture_index;
     char name[TMP_BUF_SIZE];

     capture_index = 1;
     *phas_named_captures = 0;
     for (p = s->buf_start; p < s->buf_end; p++) {
         switch (*p) {
         case '(':
             if (p[1] == '?') {
                 /* '(?<=' and '(?<!' are lookbehinds, not groups */
                 if (p[2] == '<' && p[3] != '=' && p[3] != '!') {
                     *phas_named_captures = 1;
                     /* potential named capture */
                     if (capture_name) {
                         /* The name is parsed with a separate pointer
                            and 'p' is not advanced: otherwise the
                            loop increment would skip the character
                            following '>' and miss a '(', '\\' or '['
                            located there. The scan is now the same
                            whether a name is looked up or not. */
                         const uint8_t *name_ptr = p + 3;
                         if (re_parse_group_name(name, sizeof(name),
                                                 &name_ptr,
                                                 s->is_utf16) == 0 &&
                             !strcmp(name, capture_name)) {
                             return capture_index;
                         }
                     }
                     capture_index++;
                 }
             } else {
                 capture_index++;
             }
             break;
         case '\\':
             /* skip the escaped character */
             p++;
             break;
         case '[':
             /* skip the class up to the first unescaped ']' (which
                may directly follow the '[') */
             for (p++; p < s->buf_end && *p != ']'; p++) {
                 if (*p == '\\')
                     p++;
             }
             break;
         }
     }
     if (capture_name)
         return -1;
     else
         return capture_index;
 }

 /* Return the value computed by re_parse_captures() in counting mode
    (number of captures + 1). The pre-scan is run only while the value
    cached in s->total_capture_count is negative. */
 static int re_count_captures(REParseState *s)
 {
     if (s->total_capture_count >= 0)
         return s->total_capture_count;

     /* first request: scan the pattern and remember the result */
     s->total_capture_count =
         re_parse_captures(s, &s->has_named_captures, NULL);
     return s->total_capture_count;
 }

 /* Return s->has_named_captures. While that field is still negative,
    re_count_captures() is called first so that the capture pre-scan
    can fill it in. */
 static BOOL re_has_named_captures(REParseState *s)
 {
     BOOL already_known = (s->has_named_captures >= 0);

     if (!already_known) {
         re_count_captures(s);
     }
     return s->has_named_captures;
 }

 /* Look up 'name' in s->group_names, a sequence of zero terminated
    strings (one per capture parsed so far, empty for unnamed ones).
    Return the 1-based position of the first exact match or -1 if
    there is none. */
 static int find_group_name(REParseState *s, const char *name)
 {
     const char *entry = (char *)s->group_names.buf;
     const char *table_end = (char *)s->group_names.buf + s->group_names.size;
     size_t wanted_len = strlen(name);
     size_t entry_len;
     int index;

     for (index = 1; entry < table_end; index++) {
         entry_len = strlen(entry);
         if (entry_len == wanted_len && memcmp(name, entry, wanted_len) == 0)
             return index;
         /* step over the entry and its terminating zero */
         entry += entry_len + 1;
     }
     return -1;
 }

 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);

 /* Parse one term of the pattern starting at s->buf_ptr (an assertion,
    or an atom optionally followed by a quantifier) and append the
    corresponding bytecode to s->byte_code.

    is_backward_dir: when TRUE, the character consuming atoms ('.',
    character classes, single characters) are emitted between two
    REOP_prev opcodes, the two save opcodes of a capture are emitted in
    the opposite order and the '+ 1' variant of the back reference
    opcode is used.

    last_atom_start is the bytecode offset of the atom a following
    quantifier applies to; it stays at -1 for terms that cannot be
    quantified ('^', '$', '\b', '\B', ...). last_capture_count is the
    value of s->capture_count before the atom was parsed.

    On success, s->buf_ptr is updated and 0 is returned. On error, -1 or
    the value returned by re_parse_error()/re_parse_out_of_memory() is
    returned. */
 static int re_parse_term(REParseState *s, BOOL is_backward_dir)
 {
     const uint8_t *p;
     int c, last_atom_start, quant_min, quant_max, last_capture_count;
     BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
     CharRange cr_s, *cr = &cr_s;

     last_atom_start = -1;
     last_capture_count = 0;
     p = s->buf_ptr;
     c = *p;
     switch(c) {
     case '^':
         p++;
         re_emit_op(s, REOP_line_start);
         break;
     case '$':
         p++;
         re_emit_op(s, REOP_line_end);
         break;
     case '.':
         p++;
         last_atom_start = s->byte_code.size;
         last_capture_count = s->capture_count;
         if (is_backward_dir)
             re_emit_op(s, REOP_prev);
         re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
         if (is_backward_dir)
             re_emit_op(s, REOP_prev);
         break;
     case '{':
         if (s->is_utf16) {
             return re_parse_error(s, "syntax error");
         } else if (!is_digit(p[1])) {
             /* Annex B: we accept '{' not followed by digits as a
                normal atom */
             goto parse_class_atom;
         } else {
             const uint8_t *p1 = p + 1;
             /* Annex B: error if it is like a repetition count */
             parse_digits(&p1, TRUE);
             if (*p1 == ',') {
                 p1++;
                 if (is_digit(*p1)) {
                     parse_digits(&p1, TRUE);
                 }
             }
             if (*p1 != '}') {
                 goto parse_class_atom;
             }
         }
         /* fall thru */
     case '*':
     case '+':
     case '?':
         return re_parse_error(s, "nothing to repeat");
     case '(':
         if (p[1] == '?') {
             if (p[2] == ':') {
                 /* non capturing group */
                 p += 3;
                 last_atom_start = s->byte_code.size;
                 last_capture_count = s->capture_count;
                 s->buf_ptr = p;
                 if (re_parse_disjunction(s, is_backward_dir))
                     return -1;
                 p = s->buf_ptr;
                 if (re_parse_expect(s, &p, ')'))
                     return -1;
             } else if ((p[2] == '=' || p[2] == '!')) {
                 /* '(?=' or '(?!': handled by the code at 'lookahead'
                    in the next branch */
                 is_neg = (p[2] == '!');
                 is_backward_lookahead = FALSE;
                 p += 3;
                 goto lookahead;
             } else if (p[2] == '<' &&
                        (p[3] == '=' || p[3] == '!')) {
                 /* offset of the 32 bit operand of the lookahead
                    opcode, patched once the body has been emitted */
                 int pos;
                 is_neg = (p[3] == '!');
                 is_backward_lookahead = TRUE;
                 p += 4;
                 /* lookahead */
             lookahead:
                 /* Annex B allows lookahead to be used as an atom for
                    the quantifiers */
                 if (!s->is_utf16 && !is_backward_lookahead)  {
                     last_atom_start = s->byte_code.size;
                     last_capture_count = s->capture_count;
                 }
                 pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0);
                 s->buf_ptr = p;
                 if (re_parse_disjunction(s, is_backward_lookahead))
                     return -1;
                 p = s->buf_ptr;
                 if (re_parse_expect(s, &p, ')'))
                     return -1;
                 re_emit_op(s, REOP_match);
                 /* jump after the 'match' after the lookahead is successful */
                 if (dbuf_error(&s->byte_code))
                     return -1;
                 put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
             } else if (p[2] == '<') {
                 /* named capture group: '(?<name>' */
                 p += 3;
                 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
                                         &p, s->is_utf16)) {
                     return re_parse_error(s, "invalid group name");
                 }
                 if (find_group_name(s, s->u.tmp_buf) > 0) {
                     return re_parse_error(s, "duplicate group name");
                 }
                 /* group name with a trailing zero */
                 dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
                          strlen(s->u.tmp_buf) + 1);
                 s->has_named_captures = 1;
                 goto parse_capture;
             } else {
                 return re_parse_error(s, "invalid group");
             }
         } else {
             int capture_index;
             p++;
             /* capture without group name */
             dbuf_putc(&s->group_names, 0);
         parse_capture:
             if (s->capture_count >= CAPTURE_COUNT_MAX)
                 return re_parse_error(s, "too many captures");
             last_atom_start = s->byte_code.size;
             last_capture_count = s->capture_count;
             capture_index = s->capture_count++;
             /* the opcode emitted before the group body is
                REOP_save_start in forward direction and
                REOP_save_start + 1 in backward direction; the other
                one is emitted after the body */
             re_emit_op_u8(s, REOP_save_start + is_backward_dir,
                           capture_index);

             s->buf_ptr = p;
             if (re_parse_disjunction(s, is_backward_dir))
                 return -1;
             p = s->buf_ptr;

             re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir,
                           capture_index);

             if (re_parse_expect(s, &p, ')'))
                 return -1;
         }
         break;
     case '\\':
         switch(p[1]) {
         case 'b':
         case 'B':
             /* REOP_word_boundary for '\b', the next opcode for '\B' */
             re_emit_op(s, REOP_word_boundary + (p[1] != 'b'));
             p += 2;
             break;
         case 'k':
             {
                 const uint8_t *p1;
                 int dummy_res;

                 p1 = p;
                 if (p1[2] != '<') {
                     /* annex B: we tolerate invalid group names in non
                        unicode mode if there is no named capture
                        definition */
                     if (s->is_utf16 || re_has_named_captures(s))
                         return re_parse_error(s, "expecting group name");
                     else
                         goto parse_class_atom;
                 }
                 p1 += 3;
                 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
                                         &p1, s->is_utf16)) {
                     if (s->is_utf16 || re_has_named_captures(s))
                         return re_parse_error(s, "invalid group name");
                     else
                         goto parse_class_atom;
                 }
                 c = find_group_name(s, s->u.tmp_buf);
                 if (c < 0) {
                     /* no capture name parsed before, try to look
                        after (inefficient, but hopefully not common) */
                     c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
                     if (c < 0) {
                         if (s->is_utf16 || re_has_named_captures(s))
                             return re_parse_error(s, "group name not defined");
                         else
                             goto parse_class_atom;
                     }
                 }
                 p = p1;
             }
             /* 'c' now holds the index of the referenced capture */
             goto emit_back_reference;
         case '0':
             p += 2;
             c = 0;
             if (s->is_utf16) {
                 if (is_digit(*p)) {
                     return re_parse_error(s, "invalid decimal escape in regular expression");
                 }
             } else {
                 /* Annex B.1.4: accept legacy octal */
                 if (*p >= '0' && *p <= '7') {
                     c = *p++ - '0';
                     if (*p >= '0' && *p <= '7') {
                         c = (c << 3) + *p++ - '0';
                     }
                 }
             }
             goto normal_char;
         case '1': case '2': case '3': case '4':
         case '5': case '6': case '7': case '8':
         case '9':
             {
                 /* q: first digit, kept to restart the parsing as a
                    legacy octal escape */
                 const uint8_t *q = ++p;

                 c = parse_digits(&p, FALSE);
                 /* re_count_captures() (pattern pre-scan) is only
                    called when the number is not smaller than the
                    count of captures parsed so far */
                 if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
                     if (!s->is_utf16) {
                         /* Annex B.1.4: accept legacy octal */
                         p = q;
                         if (*p <= '7') {
                             c = 0;
                             if (*p <= '3')
                                 c = *p++ - '0';
                             if (*p >= '0' && *p <= '7') {
                                 c = (c << 3) + *p++ - '0';
                                 if (*p >= '0' && *p <= '7') {
                                     c = (c << 3) + *p++ - '0';
                                 }
                             }
                         } else {
                             /* '8' or '9': taken as the character itself */
                             c = *p++;
                         }
                         goto normal_char;
                     }
                     return re_parse_error(s, "back reference out of range in regular expression");
                 }
             emit_back_reference:
                 last_atom_start = s->byte_code.size;
                 last_capture_count = s->capture_count;
                 re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c);
             }
             break;
         default:
             goto parse_class_atom;
         }
         break;
     case '[':
         last_atom_start = s->byte_code.size;
         last_capture_count = s->capture_count;
         if (is_backward_dir)
             re_emit_op(s, REOP_prev);
         if (re_parse_char_class(s, &p))
             return -1;
         if (is_backward_dir)
             re_emit_op(s, REOP_prev);
         break;
     case ']':
     case '}':
         if (s->is_utf16)
             return re_parse_error(s, "syntax error");
         goto parse_class_atom;
     default:
     parse_class_atom:
         /* returns either a single character or a value >=
            CLASS_RANGE_BASE, in which case the ranges are in 'cr' */
         c = get_class_atom(s, cr, &p, FALSE);
         if ((int)c < 0)
             return -1;
     normal_char:
         last_atom_start = s->byte_code.size;
         last_capture_count = s->capture_count;
         if (is_backward_dir)
             re_emit_op(s, REOP_prev);
         if (c >= CLASS_RANGE_BASE) {
             int ret;
             /* Note: canonicalization is not needed */
             ret = re_emit_range(s, cr);
             cr_free(cr);
             if (ret)
                 return -1;
         } else {
             if (s->ignore_case)
                 c = lre_canonicalize(c, s->is_utf16);
             if (c <= 0xffff)
                 re_emit_op_u16(s, REOP_char, c);
             else
                 re_emit_op_u32(s, REOP_char32, c);
         }
         if (is_backward_dir)
             re_emit_op(s, REOP_prev);
         break;
     }

     /* quantifier */
     if (last_atom_start >= 0) {
         c = *p;
         switch(c) {
         case '*':
             p++;
             quant_min = 0;
             quant_max = INT32_MAX;
             goto quantifier;
         case '+':
             p++;
             quant_min = 1;
             quant_max = INT32_MAX;
             goto quantifier;
         case '?':
             p++;
             quant_min = 0;
             quant_max = 1;
             goto quantifier;
         case '{':
             {
                 const uint8_t *p1 = p;
                 /* As an extension (see ES6 annex B), we accept '{' not
                    followed by digits as a normal atom */
                 if (!is_digit(p[1])) {
                     if (s->is_utf16)
                         goto invalid_quant_count;
                     break;
                 }
                 p++;
                 quant_min = parse_digits(&p, TRUE);
                 quant_max = quant_min;
                 if (*p == ',') {
                     p++;
                     if (is_digit(*p)) {
                         quant_max = parse_digits(&p, TRUE);
                         if (quant_max < quant_min) {
                         invalid_quant_count:
                             return re_parse_error(s, "invalid repetition count");
                         }
                     } else {
                         quant_max = INT32_MAX; /* infinity */
                     }
                 }
                 if (*p != '}' && !s->is_utf16) {
                     /* Annex B: normal atom if invalid '{' syntax */
                     p = p1;
                     break;
                 }
                 if (re_parse_expect(s, &p, '}'))
                     return -1;
             }
         quantifier:
             /* a '?' after the quantifier makes it non greedy */
             greedy = TRUE;
             if (*p == '?') {
                 p++;
                 greedy = FALSE;
             }
             /* NOTE(review): this test cannot succeed here because the
                enclosing block is only entered when
                last_atom_start >= 0 */
             if (last_atom_start < 0) {
                 return re_parse_error(s, "nothing to repeat");
             }
             if (greedy) {
                 int len, pos;

                 if (quant_max > 0) {
                     /* specific optimization for simple quantifiers */
                     if (dbuf_error(&s->byte_code))
                         goto out_of_memory;
                     len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
                                                  s->byte_code.size - last_atom_start);
                     if (len > 0) {
                         re_emit_op(s, REOP_match);

                         /* 17 = 1 opcode byte + four 32 bit operands
                            (atom size, quant_min, quant_max, len) */
                         if (dbuf_insert(&s->byte_code, last_atom_start, 17))
                             goto out_of_memory;
                         pos = last_atom_start;
                         s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
                         put_u32(&s->byte_code.buf[pos],
                                 s->byte_code.size - last_atom_start - 17);
                         pos += 4;
                         put_u32(&s->byte_code.buf[pos], quant_min);
                         pos += 4;
                         put_u32(&s->byte_code.buf[pos], quant_max);
                         pos += 4;
                         put_u32(&s->byte_code.buf[pos], len);
                         pos += 4;
                         goto done;
                     }
                 }

                 if (dbuf_error(&s->byte_code))
                     goto out_of_memory;
                 add_zero_advance_check = (re_check_advance(s->byte_code.buf + last_atom_start,
                                                            s->byte_code.size - last_atom_start) == 0);
             } else {
                 add_zero_advance_check = FALSE;
             }

             {
                 int len, pos;
                 /* size of the bytecode of the atom */
                 len = s->byte_code.size - last_atom_start;
                 if (quant_min == 0) {
                     /* need to reset the capture in case the atom is
                        not executed */
                     if (last_capture_count != s->capture_count) {
                         /* 3 = opcode byte + first and last capture
                            index; last_atom_start then points after the
                            inserted opcode */
                         if (dbuf_insert(&s->byte_code, last_atom_start, 3))
                             goto out_of_memory;
                         s->byte_code.buf[last_atom_start++] = REOP_save_reset;
                         s->byte_code.buf[last_atom_start++] = last_capture_count;
                         s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
                     }
                     if (quant_max == 0) {
                         /* the atom is never executed: discard its
                            bytecode */
                         s->byte_code.size = last_atom_start;
                     } else if (quant_max == 1) {
                         /* 5 = opcode byte + 32 bit operand */
                         if (dbuf_insert(&s->byte_code, last_atom_start, 5))
                             goto out_of_memory;
                         s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
                             greedy;
                         put_u32(s->byte_code.buf + last_atom_start + 1, len);
                     } else if (quant_max == INT32_MAX) {
                         if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check))
                             goto out_of_memory;
                         s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
                             greedy;
                         put_u32(s->byte_code.buf + last_atom_start + 1,
                                 len + 5 + add_zero_advance_check);
                         if (add_zero_advance_check) {
                             /* avoid infinite loop by stopping the
                                recursion if no advance was made in the
                                atom (only works if the atom has no
                                side effect) */
                             s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
                             re_emit_goto(s, REOP_bne_char_pos, last_atom_start);
                         } else {
                             re_emit_goto(s, REOP_goto, last_atom_start);
                         }
                     } else {
                         /* 10 = push_i32 (1 + 4 bytes) followed by the
                            split (1 + 4 bytes) */
                         if (dbuf_insert(&s->byte_code, last_atom_start, 10))
                             goto out_of_memory;
                         pos = last_atom_start;
                         s->byte_code.buf[pos++] = REOP_push_i32;
                         put_u32(s->byte_code.buf + pos, quant_max);
                         pos += 4;
                         s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
                         put_u32(s->byte_code.buf + pos, len + 5);
                         re_emit_goto(s, REOP_loop, last_atom_start + 5);
                         re_emit_op(s, REOP_drop);
                     }
                 } else if (quant_min == 1 && quant_max == INT32_MAX &&
                            !add_zero_advance_check) {
                     re_emit_goto(s, REOP_split_next_first - greedy,
                                  last_atom_start);
                 } else {
                     /* first the mandatory quant_min iterations ... */
                     if (quant_min == 1) {
                         /* nothing to add */
                     } else {
                         if (dbuf_insert(&s->byte_code, last_atom_start, 5))
                             goto out_of_memory;
                         s->byte_code.buf[last_atom_start] = REOP_push_i32;
                         put_u32(s->byte_code.buf + last_atom_start + 1,
                                 quant_min);
                         last_atom_start += 5;
                         re_emit_goto(s, REOP_loop, last_atom_start);
                         re_emit_op(s, REOP_drop);
                     }
                     /* ... then the optional ones, using a second copy
                        of the atom */
                     if (quant_max == INT32_MAX) {
                         pos = s->byte_code.size;
                         re_emit_op_u32(s, REOP_split_goto_first + greedy,
                                        len + 5 + add_zero_advance_check);
                         if (add_zero_advance_check)
                             re_emit_op(s, REOP_push_char_pos);
                         /* copy the atom */
                         dbuf_put_self(&s->byte_code, last_atom_start, len);
                         if (add_zero_advance_check)
                             re_emit_goto(s, REOP_bne_char_pos, pos);
                         else
                             re_emit_goto(s, REOP_goto, pos);
                     } else if (quant_max > quant_min) {
                         re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
                         pos = s->byte_code.size;
                         re_emit_op_u32(s, REOP_split_goto_first + greedy, len + 5);
                         /* copy the atom */
                         dbuf_put_self(&s->byte_code, last_atom_start, len);

                         re_emit_goto(s, REOP_loop, pos);
                         re_emit_op(s, REOP_drop);
                     }
                 }
                 last_atom_start = -1;
             }
             break;
         default:
             break;
         }
     }
  done:
     s->buf_ptr = p;
     return 0;
  out_of_memory:
     return re_parse_out_of_memory(s);
 }

 /* Parse a sequence of terms, stopping at '|', ')' or s->buf_end, and
    append their bytecode to s->byte_code.

    When is_backward_dir is TRUE, the bytecode of each new term is
    moved in front of the bytecode of the terms already parsed in this
    alternative, so that the terms end up in reverse order.

    Return 0 if OK, a non-zero value on error (the value returned by
    re_parse_term() or by re_parse_out_of_memory()). */
 static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
 {
     const uint8_t *p;
     int ret;
     size_t start, term_start, end, term_size;

     start = s->byte_code.size;
     for(;;) {
         p = s->buf_ptr;
         if (p >= s->buf_end)
             break;
         if (*p == '|' || *p == ')')
             break;
         term_start = s->byte_code.size;
         ret = re_parse_term(s, is_backward_dir);
         if (ret)
             return ret;
         if (is_backward_dir) {
             /* reverse the order of the terms (XXX: inefficient, but
                speed is not really critical here) */
             end = s->byte_code.size;
             term_size = end - term_start;
             /* reserve term_size scratch bytes after the bytecode.
                On failure the error message must be set, otherwise
                the caller would report a stale or empty message. */
             if (dbuf_realloc(&s->byte_code, end + term_size))
                 return re_parse_out_of_memory(s);
             /* shift the whole alternative up by term_size: the new
                term now lies at [end, end + term_size) ... */
             memmove(s->byte_code.buf + start + term_size,
                     s->byte_code.buf + start,
                     end - start);
             /* ... and is copied back in front of the older terms */
             memcpy(s->byte_code.buf + start, s->byte_code.buf + end,
                    term_size);
         }
     }
     return 0;
 }

 /* Parse alternatives separated by '|' and append their bytecode to
    s->byte_code.

    For each '|', a REOP_split_next_first is inserted in front of the
    bytecode emitted so far (its operand skips that bytecode and the
    following goto), and a REOP_goto is appended whose operand is
    patched to jump over the next alternative.

    Return 0 if OK, a non-zero value on error. */
 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
 {
     int start, len, pos;

     start = s->byte_code.size;
     if (re_parse_alternative(s, is_backward_dir))
         return -1;
     while (*s->buf_ptr == '|') {
         s->buf_ptr++;

         len = s->byte_code.size - start;

         /* insert a split before the first alternative */
         if (dbuf_insert(&s->byte_code, start, 5)) {
             return re_parse_out_of_memory(s);
         }
         s->byte_code.buf[start] = REOP_split_next_first;
         put_u32(s->byte_code.buf + start + 1, len + 5);

         pos = re_emit_op_u32(s, REOP_goto, 0);

         if (re_parse_alternative(s, is_backward_dir))
             return -1;

         /* if an emission failed, 'pos' may not designate valid
            storage: do not patch through it (same precaution as for
            the lookahead operand in re_parse_term()) */
         if (dbuf_error(&s->byte_code))
             return re_parse_out_of_memory(s);

         /* patch the goto */
         len = s->byte_code.size - (pos + 4);
         put_u32(s->byte_code.buf + pos, len);
     }
     return 0;
 }

 /* Compute the maximum depth reached by the push/drop opcodes of a
    compiled regexp.

    bc_buf/bc_buf_len: whole bytecode buffer, including the
    RE_HEADER_LEN header bytes which are skipped.

    The opcodes are walked once in storage order: the control flow is
    recursive so the analysis can be linear.

    Return the maximum stack size, or -1 if it exceeds STACK_SIZE_MAX. */
 static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
 {
     int stack_size, stack_size_max, pos, opcode, len;
     uint32_t val;

     stack_size = 0;
     stack_size_max = 0;
     bc_buf += RE_HEADER_LEN;
     bc_buf_len -= RE_HEADER_LEN;
     pos = 0;
     while (pos < bc_buf_len) {
         opcode = bc_buf[pos];
         /* validate the opcode before using it as a table index */
         assert(opcode < REOP_COUNT);
         len = reopcode_info[opcode].size;
         assert((pos + len) <= bc_buf_len);
         switch(opcode) {
         case REOP_push_i32:
         case REOP_push_char_pos:
             stack_size++;
             if (stack_size > stack_size_max) {
                 if (stack_size > STACK_SIZE_MAX)
                     return -1;
                 stack_size_max = stack_size;
             }
             break;
         case REOP_drop:
         case REOP_bne_char_pos:
             assert(stack_size > 0);
             stack_size--;
             break;
         case REOP_range:
             /* variable length opcode: 16 bit count followed by
                4 bytes per entry */
             val = get_u16(bc_buf + pos + 1);
             len += val * 4;
             break;
         case REOP_range32:
             /* variable length opcode: 16 bit count followed by
                8 bytes per entry */
             val = get_u16(bc_buf + pos + 1);
             len += val * 8;
             break;
         }
         pos += len;
     }
     return stack_size_max;
 }

 /* Compile a regular expression to bytecode.

    'buf' must be a zero terminated UTF-8 string of length buf_len.
    're_flags' is a combination of LRE_FLAG_x values. 'opaque' is
    passed to the memory allocation function.

    Return NULL if error, with *plen set to 0 and an error message
    copied into error_msg (at most error_msg_size bytes). Otherwise
    return the compiled bytecode, its length in *plen and an empty
    string in error_msg.

    The bytecode starts with a header (flags, capture count, stack
    size, 32 bit bytecode length). The table of group names is
    appended after the bytecode when at least one group is named. */
 uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
                      const char *buf, size_t buf_len, int re_flags,
                      void *opaque)
 {
     REParseState s_s, *s = &s_s;
     int stack_size;
     BOOL is_sticky;

     memset(s, 0, sizeof(*s));
     s->mem_opaque = opaque;
     s->buf_ptr = (const uint8_t *)buf;
     s->buf_end = s->buf_ptr + buf_len;
     s->buf_start = s->buf_ptr;
     s->re_flags = re_flags;
     s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0);
     is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
     s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
     s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
     s->capture_count = 1;
     /* negative: not computed yet (see re_count_captures() and
        re_has_named_captures()) */
     s->total_capture_count = -1;
     s->has_named_captures = -1;

     dbuf_init2(&s->byte_code, opaque, lre_realloc);
     dbuf_init2(&s->group_names, opaque, lre_realloc);

     dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */
     dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
     dbuf_putc(&s->byte_code, 0); /* stack size */
     dbuf_put_u32(&s->byte_code, 0); /* bytecode length */

     if (!is_sticky) {
         /* iterate thru all positions (about the same as .*?( ... ) )
            .  We do it without an explicit loop so that lock step
            thread execution will be possible in an optimized
            implementation */
         re_emit_op_u32(s, REOP_split_goto_first, 1 + 5);
         re_emit_op(s, REOP_any);
         re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5));
     }
     re_emit_op_u8(s, REOP_save_start, 0);

     if (re_parse_disjunction(s, FALSE)) {
     error:
         dbuf_free(&s->byte_code);
         dbuf_free(&s->group_names);
         pstrcpy(error_msg, error_msg_size, s->u.error_msg);
         *plen = 0;
         return NULL;
     }

     re_emit_op_u8(s, REOP_save_end, 0);

     re_emit_op(s, REOP_match);

     if (*s->buf_ptr != '\0') {
         re_parse_error(s, "extraneous characters at the end");
         goto error;
     }

     /* a failed allocation in either buffer makes the result unusable:
        the bytecode would be truncated or the group name table would
        no longer match the captures */
     if (dbuf_error(&s->byte_code) || dbuf_error(&s->group_names)) {
         re_parse_out_of_memory(s);
         goto error;
     }

     stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size);
     if (stack_size < 0) {
         re_parse_error(s, "too many imbricated quantifiers");
         goto error;
     }

     s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
     s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
     put_u32(s->byte_code.buf + 3, s->byte_code.size - RE_HEADER_LEN);

     /* add the named groups if needed: the table holds one zero
        terminated entry per capture, so it is larger than
        capture_count - 1 bytes only if some name is not empty */
     if (s->group_names.size > (s->capture_count - 1)) {
         dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
         /* do not advertise named groups if the table could not be
            appended */
         if (dbuf_error(&s->byte_code)) {
             re_parse_out_of_memory(s);
             goto error;
         }
         s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS;
     }
     dbuf_free(&s->group_names);

 #ifdef DUMP_REOP
     lre_dump_bytecode(s->byte_code.buf, s->byte_code.size);
 #endif

     error_msg[0] = '\0';
     *plen = s->byte_code.size;
     return s->byte_code.buf;
 }

 static BOOL is_line_terminator(uint32_t c)
 {
     return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
 }

 static BOOL is_word_char(uint32_t c)
 {
     return ((c >= '0' && c <= '9') ||
             (c >= 'a' && c <= 'z') ||
             (c >= 'A' && c <= 'Z') ||
             (c == '_'));
 }

 /* Read the character at 'cptr' into 'c' and advance 'cptr' after it.
    A variable named 'cbuf_type' must be visible where the macro is
    expanded:
    - cbuf_type == 0: one byte is read;
    - otherwise: one 16 bit unit is read and 'cptr' advances by 2 bytes;
    - cbuf_type == 2: in addition, a unit in 0xd800..0xdbff followed
      (before 'cbuf_end') by a unit in 0xdc00..0xdfff is combined with
      it into a single code point >= 0x10000 and both units are
      consumed.
    No bound check is made for the first unit. */
 #define GET_CHAR(c, cptr, cbuf_end)                                     \
     do {                                                                \
         if (cbuf_type == 0) {                                           \
             c = *cptr++;                                                \
         } else {                                                        \
             uint32_t __c1;                                              \
             c = *(uint16_t *)cptr;                                      \
             cptr += 2;                                                  \
             if (c >= 0xd800 && c < 0xdc00 &&                            \
                 cbuf_type == 2 && cptr < cbuf_end) {                    \
                 __c1 = *(uint16_t *)cptr;                               \
                 if (__c1 >= 0xdc00 && __c1 < 0xe000) {                  \
                     c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
                     cptr += 2;                                          \
                 }                                                       \
             }                                                           \
         }                                                               \
     } while (0)

 /* Same decoding as GET_CHAR, but 'cptr' is not modified: 'c' receives
    the character at 'cptr' (one byte if the 'cbuf_type' variable
    visible at the expansion point is 0, one 16 bit unit otherwise, and
    a combined surrogate pair if cbuf_type == 2 and the second unit
    lies before 'cbuf_end'). */
 #define PEEK_CHAR(c, cptr, cbuf_end)             \
     do {                                         \
         if (cbuf_type == 0) {                    \
             c = cptr[0];                         \
         } else {                                 \
             uint32_t __c1;                                              \
             c = ((uint16_t *)cptr)[0];                                  \
             if (c >= 0xd800 && c < 0xdc00 &&                            \
                 cbuf_type == 2 && (cptr + 2) < cbuf_end) {              \
                 __c1 = ((uint16_t *)cptr)[1];                           \
                 if (__c1 >= 0xdc00 && __c1 < 0xe000) {                  \
                     c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
                 }                                                       \
             }                                                           \
         }                                        \
     } while (0)

 /* Store in 'c' the character located just before 'cptr' without
    modifying 'cptr'. With the 'cbuf_type' variable visible at the
    expansion point:
    - cbuf_type == 0: the previous byte is read;
    - otherwise: the previous 16 bit unit is read;
    - cbuf_type == 2: in addition, a unit in 0xdc00..0xdfff preceded by
      a unit in 0xd800..0xdbff (not before 'cbuf_start') is combined
      with it into a single code point >= 0x10000.
    No bound check is made for the first unit read. */
 #define PEEK_PREV_CHAR(c, cptr, cbuf_start)                 \
     do {                                         \
         if (cbuf_type == 0) {                    \
             c = cptr[-1];                        \
         } else {                                 \
             uint32_t __c1;                                              \
             c = ((uint16_t *)cptr)[-1];                                 \
             if (c >= 0xdc00 && c < 0xe000 &&                            \
                 cbuf_type == 2 && (cptr - 4) >= cbuf_start) {              \
                 __c1 = ((uint16_t *)cptr)[-2];                          \
                 if (__c1 >= 0xd800 && __c1 < 0xdc00 ) {                 \
                     c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
                 }                                                       \
             }                                                           \
         }                                                               \
     } while (0)

 /* Move 'cptr' back by one character and store that character in 'c'.
    With the 'cbuf_type' variable visible at the expansion point:
    - cbuf_type == 0: 'cptr' moves back by one byte;
    - otherwise: 'cptr' moves back by one 16 bit unit;
    - cbuf_type == 2: in addition, if the unit read is in
      0xdc00..0xdfff, 'cptr' is still after 'cbuf_start' and the unit
      before it is in 0xd800..0xdbff, 'cptr' moves back over that unit
      too and 'c' is the combined code point >= 0x10000.
    No bound check is made for the first step. */
 #define GET_PREV_CHAR(c, cptr, cbuf_start)       \
     do {                                         \
         if (cbuf_type == 0) {                    \
             cptr--;                              \
             c = cptr[0];                         \
         } else {                                 \
             uint32_t __c1;                                              \
             cptr -= 2;                                                  \
             c = ((uint16_t *)cptr)[0];                                 \
             if (c >= 0xdc00 && c < 0xe000 &&                            \
                 cbuf_type == 2 && cptr > cbuf_start) {                  \
                 __c1 = ((uint16_t *)cptr)[-1];                          \
                 if (__c1 >= 0xd800 && __c1 < 0xdc00 ) {                 \
                     cptr -= 2;                                          \
                     c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
                 }                                                       \
             }                                                           \
         }                                                               \
     } while (0)

 /* Step 'cptr' back over the previous character without returning it.
    Relies on a variable 'cbuf_type' visible at the expansion site: 0
    steps over one byte, otherwise over one 16 bit unit; with
    cbuf_type == 2 a complete surrogate pair is stepped over as a whole.
    Only 'cptr' is modified: the scratch value lives in a macro-local
    variable instead of an unrelated 'c' of the enclosing scope. */
 #define PREV_CHAR(cptr, cbuf_start)       \
     do {                                  \
         if (cbuf_type == 0) {             \
             (cptr)--;                     \
         } else {                          \
             (cptr) -= 2;                        \
             if (cbuf_type == 2) {                                       \
                 uint32_t __c1;                                          \
                 __c1 = ((uint16_t *)(cptr))[0];                         \
                 if (__c1 >= 0xdc00 && __c1 < 0xe000 &&                  \
                     (cptr) > (cbuf_start)) {                            \
                     __c1 = ((uint16_t *)(cptr))[-1];                    \
                     if (__c1 >= 0xd800 && __c1 < 0xdc00)                \
                         (cptr) -= 2;                                    \
                 }                                                       \
             }                                                           \
         }                                                               \
     } while (0)

 /* Element type of the auxiliary value stack used by the executor. It
    holds both 32 bit operands (REOP_push_i32) and input positions stored
    as integers (REOP_push_char_pos), hence the pointer-sized type. */
 typedef uintptr_t StackInt;

 /* Kind of a record saved on the backtracking stack (REExecState.type).
    The order of the two lookahead values matters: lre_exec_backtrack()
    computes the kind as RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead. */
 typedef enum {
     RE_EXEC_STATE_SPLIT, /* alternative saved by REOP_split_*_first */
     RE_EXEC_STATE_LOOKAHEAD, /* saved by REOP_lookahead */
     RE_EXEC_STATE_NEGATIVE_LOOKAHEAD, /* saved by REOP_negative_lookahead */
     RE_EXEC_STATE_GREEDY_QUANT, /* saved by REOP_simple_greedy_quant */
 } REExecStateEnum;

 /* One record of the backtracking stack. Records are stored back to
    back in REExecContext.state_stack, each one occupying
    REExecContext.state_size bytes. */
 typedef struct REExecState {
     REExecStateEnum type : 8;
     uint8_t stack_len; /* number of StackInt values saved in 'buf' */
     size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */
     const uint8_t *cptr; /* saved input position */
     /* saved bytecode position (for RE_EXEC_STATE_GREEDY_QUANT it points
        to the operands of the quantifier opcode) */
     const uint8_t *pc;
     /* 2 * capture_count capture pointers followed by 'stack_len'
        StackInt values. Declared as a C99 flexible array member rather
        than the GNU zero-length array extension; sizeof(REExecState)
        still excludes it. */
     void *buf[];
 } REExecState;

 /* Execution context shared by lre_exec(), lre_exec_backtrack() and
    push_state() for one run of a compiled regexp. */
 typedef struct {
     const uint8_t *cbuf; /* start of the input buffer */
     const uint8_t *cbuf_end; /* one past the last input byte */
     /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
     int cbuf_type;
     int capture_count; /* read from the bytecode header */
     int stack_size_max; /* read from the bytecode header; sizes the
                            auxiliary stack and its copy in each state */
     BOOL multi_line; /* LRE_FLAG_MULTILINE is set */
     BOOL ignore_case; /* LRE_FLAG_IGNORECASE is set */
     BOOL is_utf16; /* LRE_FLAG_UTF16 is set */
     void *opaque; /* used for stack overflow check; also handed to
                      lre_realloc() */

     size_t state_size; /* size in bytes of one REExecState record,
                           including its captures and stack copy */
     uint8_t *state_stack; /* backtracking stack (array of records) */
     size_t state_stack_size; /* allocated capacity, in records */
     size_t state_stack_len; /* number of records currently in use */
 } REExecContext;

 /* Save a backtracking record on top of s->state_stack: the kind
    'type', the counter 'count', the positions 'pc' and 'cptr', a copy of
    the 2 * s->capture_count capture pointers and a copy of the first
    'stack_len' entries of 'stack'. The stack storage grows by a factor
    of 1.5 (at least 8 records) when it is full.
    Return 0 if OK, -1 if the reallocation failed. */
 static int push_state(REExecContext *s,
                       uint8_t **capture,
                       StackInt *stack, size_t stack_len,
                       const uint8_t *pc, const uint8_t *cptr,
                       REExecStateEnum type, size_t count)
 {
     REExecState *entry;
     StackInt *saved_stack;
     size_t capture_slots, k;

     if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) {
         /* no room left for one more record: enlarge the storage */
         size_t grown;
         uint8_t *mem;

         grown = s->state_stack_size * 3 / 2;
         if (grown < 8)
             grown = 8;
         mem = lre_realloc(s->opaque, s->state_stack, grown * s->state_size);
         if (!mem)
             return -1;
         s->state_stack = mem;
         s->state_stack_size = grown;
     }

     /* claim the next free record */
     entry = (REExecState *)(s->state_stack +
                             s->state_stack_len * s->state_size);
     s->state_stack_len++;

     entry->pc = pc;
     entry->cptr = cptr;
     entry->stack_len = stack_len;
     entry->count = count;
     entry->type = type;

     /* the captures come first, the auxiliary stack right after them */
     capture_slots = 2 * s->capture_count;
     for (k = 0; k < capture_slots; k++)
         entry->buf[k] = capture[k];
     saved_stack = (StackInt *)(entry->buf + capture_slots);
     memcpy(saved_stack, stack, stack_len * sizeof(stack[0]));
     return 0;
 }

 /* Interpret the regexp bytecode starting at 'pc' against the input
    starting at 'cptr'.

    'capture' holds the 2 * s->capture_count start/end pointers and
    'stack' is the auxiliary value stack with 'stack_len' entries in use.

    When 'no_recurse' is FALSE: return 1 if match, 0 if not match or -1
    if error (a push_state() failure). A failing path is resumed from the
    records saved in s->state_stack until that stack is empty.

    When 'no_recurse' is TRUE (used for the body of
    REOP_simple_greedy_quant): REOP_match returns the current input
    position cast to intptr_t, the first mismatch returns 0 without
    looking at the state stack, and errors still return -1. */
 static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                                    StackInt *stack, int stack_len,
                                    const uint8_t *pc, const uint8_t *cptr,
                                    BOOL no_recurse)
 {
     int opcode, ret;
     int cbuf_type;
     uint32_t val, c;
     const uint8_t *cbuf_end;

     /* local copies: 'cbuf_type' is referenced by the character macros */
     cbuf_type = s->cbuf_type;
     cbuf_end = s->cbuf_end;

     for(;;) {
         //        printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN)));
         opcode = *pc++;
         switch(opcode) {
         case REOP_match:
             {
                 REExecState *rs;
                 if (no_recurse)
                     return (intptr_t)cptr;
                 ret = 1;
                 goto recurse;
                 /* every failing opcode jumps here */
             no_match:
                 if (no_recurse)
                     return 0;
                 ret = 0;
             recurse:
                 /* unwind the saved states with the current result
                    'ret' until one of them resumes the execution
                    ('break' leaves this loop and then the switch) */
                 for(;;) {
                     if (s->state_stack_len == 0)
                         return ret;
                     rs = (REExecState *)(s->state_stack +
                                          (s->state_stack_len - 1) * s->state_size);
                     if (rs->type == RE_EXEC_STATE_SPLIT) {
                         /* on failure try the saved alternative; on
                            success the record is just dropped below */
                         if (!ret) {
                         pop_state:
                             memcpy(capture, rs->buf,
                                    sizeof(capture[0]) * 2 * s->capture_count);
                         pop_state1:
                             pc = rs->pc;
                             cptr = rs->cptr;
                             stack_len = rs->stack_len;
                             memcpy(stack, rs->buf + 2 * s->capture_count,
                                    stack_len * sizeof(stack[0]));
                             s->state_stack_len--;
                             break;
                         }
                     } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) {
                         if (!ret) {
                             uint32_t char_count, i;
                             memcpy(capture, rs->buf,
                                    sizeof(capture[0]) * 2 * s->capture_count);
                             stack_len = rs->stack_len;
                             memcpy(stack, rs->buf + 2 * s->capture_count,
                                    stack_len * sizeof(stack[0]));
                             /* rs->pc points to the operands of the
                                quantifier opcode */
                             pc = rs->pc;
                             cptr = rs->cptr;
                             /* go backward */
                             char_count = get_u32(pc + 12);
                             for(i = 0; i < char_count; i++) {
                                 PREV_CHAR(cptr, s->cbuf);
                             }
                             /* continue after the quantifier body */
                             pc = (pc + 16) + (int)get_u32(pc);
                             /* the record stays on the stack while
                                shorter matches remain to be tried */
                             rs->cptr = cptr;
                             rs->count--;
                             if (rs->count == 0) {
                                 s->state_stack_len--;
                             }
                             break;
                         }
                     } else {
                         /* lookahead record: convert the result of the
                            sub-expression into the result of the
                            assertion */
                         ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) ||
                                (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret));
                         if (ret) {
                             /* keep the capture in case of positive lookahead */
                             if (rs->type == RE_EXEC_STATE_LOOKAHEAD)
                                 goto pop_state1;
                             else
                                 goto pop_state;
                         }
                     }
                     s->state_stack_len--;
                 }
             }
             break;
         case REOP_char32:
             /* 32 bit literal character operand */
             val = get_u32(pc);
             pc += 4;
             goto test_char;
         case REOP_char:
             /* 16 bit literal character operand */
             val = get_u16(pc);
             pc += 2;
         test_char:
             if (cptr >= cbuf_end)
                 goto no_match;
             GET_CHAR(c, cptr, cbuf_end);
             if (s->ignore_case) {
                 c = lre_canonicalize(c, s->is_utf16);
             }
             if (val != c)
                 goto no_match;
             break;
         case REOP_split_goto_first:
         case REOP_split_next_first:
             {
                 const uint8_t *pc1;

                 /* two possible paths: the next instruction and the
                    jump target. 'pc' is the path tried first, 'pc1' the
                    one saved for backtracking */
                 val = get_u32(pc);
                 pc += 4;
                 if (opcode == REOP_split_next_first) {
                     pc1 = pc + (int)val;
                 } else {
                     pc1 = pc;
                     pc = pc + (int)val;
                 }
                 ret = push_state(s, capture, stack, stack_len,
                                  pc1, cptr, RE_EXEC_STATE_SPLIT, 0);
                 if (ret < 0)
                     return -1;
                 break;
             }
         case REOP_lookahead:
         case REOP_negative_lookahead:
             /* save the continuation address (pc + val) and the current
                input position, then run the sub-expression that
                follows; the record is handled when REOP_match or a
                failure is reached */
             val = get_u32(pc);
             pc += 4;
             ret = push_state(s, capture, stack, stack_len,
                              pc + (int)val, cptr,
                              RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead,
                              0);
             if (ret < 0)
                 return -1;
             break;

         case REOP_goto:
             /* relative jump, counted from the end of the operand */
             val = get_u32(pc);
             pc += 4 + (int)val;
             break;
         case REOP_line_start:
             /* matches at the buffer start or, in multi-line mode,
                after a line terminator */
             if (cptr == s->cbuf)
                 break;
             if (!s->multi_line)
                 goto no_match;
             PEEK_PREV_CHAR(c, cptr, s->cbuf);
             if (!is_line_terminator(c))
                 goto no_match;
             break;
         case REOP_line_end:
             /* matches at the buffer end or, in multi-line mode, before
                a line terminator */
             if (cptr == cbuf_end)
                 break;
             if (!s->multi_line)
                 goto no_match;
             PEEK_CHAR(c, cptr, cbuf_end);
             if (!is_line_terminator(c))
                 goto no_match;
             break;
         case REOP_dot:
             /* any character except a line terminator */
             if (cptr == cbuf_end)
                 goto no_match;
             GET_CHAR(c, cptr, cbuf_end);
             if (is_line_terminator(c))
                 goto no_match;
             break;
         case REOP_any:
             /* any character */
             if (cptr == cbuf_end)
                 goto no_match;
             GET_CHAR(c, cptr, cbuf_end);
             break;
         case REOP_save_start:
         case REOP_save_end:
             /* record the current position as start (even slot) or end
                (odd slot) of capture 'val' */
             val = *pc++;
             assert(val < s->capture_count);
             capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr;
             break;
         case REOP_save_reset:
             {
                 /* clear the captures val..val2 (inclusive) */
                 uint32_t val2;
                 val = pc[0];
                 val2 = pc[1];
                 pc += 2;
                 assert(val2 < s->capture_count);
                 while (val <= val2) {
                     capture[2 * val] = NULL;
                     capture[2 * val + 1] = NULL;
                     val++;
                 }
             }
             break;
         case REOP_push_i32:
             val = get_u32(pc);
             pc += 4;
             stack[stack_len++] = val;
             break;
         case REOP_drop:
             stack_len--;
             break;
         case REOP_loop:
             /* decrement the counter on top of the stack and jump while
                it is not zero */
             val = get_u32(pc);
             pc += 4;
             if (--stack[stack_len - 1] != 0) {
                 pc += (int)val;
             }
             break;
         case REOP_push_char_pos:
             stack[stack_len++] = (uintptr_t)cptr;
             break;
         case REOP_bne_char_pos:
             /* pop the top of the stack and jump if it differs from the
                current input position */
             val = get_u32(pc);
             pc += 4;
             if (stack[--stack_len] != (uintptr_t)cptr)
                 pc += (int)val;
             break;
         case REOP_word_boundary:
         case REOP_not_word_boundary:
             {
                 BOOL v1, v2;
                 /* char before */
                 if (cptr == s->cbuf) {
                     v1 = FALSE;
                 } else {
                     PEEK_PREV_CHAR(c, cptr, s->cbuf);
                     v1 = is_word_char(c);
                 }
                 /* current char */
                 if (cptr >= cbuf_end) {
                     v2 = FALSE;
                 } else {
                     PEEK_CHAR(c, cptr, cbuf_end);
                     v2 = is_word_char(c);
                 }
                 /* NOTE(review): presumably REOP_not_word_boundary ==
                    REOP_word_boundary + 1, so the last term is 1 for
                    REOP_word_boundary and 0 otherwise - confirm in
                    libregexp-opcode.h */
                 if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
                     goto no_match;
             }
             break;
         case REOP_back_reference:
         case REOP_backward_back_reference:
             {
                 const uint8_t *cptr1, *cptr1_end, *cptr1_start;
                 uint32_t c1, c2;

                 val = *pc++;
                 if (val >= s->capture_count)
                     goto no_match;
                 cptr1_start = capture[2 * val];
                 cptr1_end = capture[2 * val + 1];
                 /* a capture that is not (completely) set matches the
                    empty string */
                 if (!cptr1_start || !cptr1_end)
                     break;
                 if (opcode == REOP_back_reference) {
                     /* compare forward, consuming the input */
                     cptr1 = cptr1_start;
                     while (cptr1 < cptr1_end) {
                         if (cptr >= cbuf_end)
                             goto no_match;
                         GET_CHAR(c1, cptr1, cptr1_end);
                         GET_CHAR(c2, cptr, cbuf_end);
                         if (s->ignore_case) {
                             c1 = lre_canonicalize(c1, s->is_utf16);
                             c2 = lre_canonicalize(c2, s->is_utf16);
                         }
                         if (c1 != c2)
                             goto no_match;
                     }
                 } else {
                     /* compare backward from the end of the capture,
                        moving the input position backward */
                     cptr1 = cptr1_end;
                     while (cptr1 > cptr1_start) {
                         if (cptr == s->cbuf)
                             goto no_match;
                         GET_PREV_CHAR(c1, cptr1, cptr1_start);
                         GET_PREV_CHAR(c2, cptr, s->cbuf);
                         if (s->ignore_case) {
                             c1 = lre_canonicalize(c1, s->is_utf16);
                             c2 = lre_canonicalize(c2, s->is_utf16);
                         }
                         if (c1 != c2)
                             goto no_match;
                     }
                 }
             }
             break;
         case REOP_range:
             {
                 /* operands: 16 bit count 'n' followed by n (low, high)
                    pairs of 16 bit inclusive bounds. The binary search
                    relies on the pairs being sorted */
                 int n;
                 uint32_t low, high, idx_min, idx_max, idx;

                 n = get_u16(pc); /* n must be >= 1 */
                 pc += 2;
                 if (cptr >= cbuf_end)
                     goto no_match;
                 GET_CHAR(c, cptr, cbuf_end);
                 if (s->ignore_case) {
                     c = lre_canonicalize(c, s->is_utf16);
                 }
                 idx_min = 0;
                 /* quick rejection below the first interval; it also
                    guarantees that 'idx - 1' is never computed with
                    idx == 0 in the search loop */
                 low = get_u16(pc + 0 * 4);
                 if (c < low)
                     goto no_match;
                 idx_max = n - 1;
                 high = get_u16(pc + idx_max * 4 + 2);
                 /* 0xffff in for last value means +infinity */
                 if (unlikely(c >= 0xffff) && high == 0xffff)
                     goto range_match;
                 if (c > high)
                     goto no_match;
                 while (idx_min <= idx_max) {
                     idx = (idx_min + idx_max) / 2;
                     low = get_u16(pc + idx * 4);
                     high = get_u16(pc + idx * 4 + 2);
                     if (c < low)
                         idx_max = idx - 1;
                     else if (c > high)
                         idx_min = idx + 1;
                     else
                         goto range_match;
                 }
                 goto no_match;
             range_match:
                 /* skip the interval table */
                 pc += 4 * n;
             }
             break;
         case REOP_range32:
             {
                 /* same as REOP_range with 32 bit bounds and no
                    special "+infinity" value */
                 int n;
                 uint32_t low, high, idx_min, idx_max, idx;

                 n = get_u16(pc); /* n must be >= 1 */
                 pc += 2;
                 if (cptr >= cbuf_end)
                     goto no_match;
                 GET_CHAR(c, cptr, cbuf_end);
                 if (s->ignore_case) {
                     c = lre_canonicalize(c, s->is_utf16);
                 }
                 idx_min = 0;
                 low = get_u32(pc + 0 * 8);
                 if (c < low)
                     goto no_match;
                 idx_max = n - 1;
                 high = get_u32(pc + idx_max * 8 + 4);
                 if (c > high)
                     goto no_match;
                 while (idx_min <= idx_max) {
                     idx = (idx_min + idx_max) / 2;
                     low = get_u32(pc + idx * 8);
                     high = get_u32(pc + idx * 8 + 4);
                     if (c < low)
                         idx_max = idx - 1;
                     else if (c > high)
                         idx_min = idx + 1;
                     else
                         goto range32_match;
                 }
                 goto no_match;
             range32_match:
                 pc += 8 * n;
             }
             break;
         case REOP_prev:
             /* go to the previous char */
             if (cptr == s->cbuf)
                 goto no_match;
             PREV_CHAR(cptr, s->cbuf);
             break;
         case REOP_simple_greedy_quant:
             {
                 /* operands (4 x 32 bit): offset to the code after the
                    body, minimum count, maximum count, and the number
                    of characters consumed by one iteration (read when
                    backtracking). The body follows the operands */
                 uint32_t next_pos, quant_min, quant_max;
                 size_t q;
                 intptr_t res;
                 const uint8_t *pc1;

                 next_pos = get_u32(pc);
                 quant_min = get_u32(pc + 4);
                 quant_max = get_u32(pc + 8);
                 pc += 16;
                 pc1 = pc;
                 pc += (int)next_pos;

                 /* match the body as many times as possible; in
                    no_recurse mode a successful run returns the new
                    input position */
                 q = 0;
                 for(;;) {
                     res = lre_exec_backtrack(s, capture, stack, stack_len,
                                              pc1, cptr, TRUE);
                     if (res == -1)
                         return res;
                     if (!res)
                         break;
                     cptr = (uint8_t *)res;
                     q++;
                     /* INT32_MAX means no upper bound */
                     if (q >= quant_max && quant_max != INT32_MAX)
                         break;
                 }
                 if (q < quant_min)
                     goto no_match;
                 if (q > quant_min) {
                     /* will examine all matches down to quant_min */
                     ret = push_state(s, capture, stack, stack_len,
                                      pc1 - 16, cptr,
                                      RE_EXEC_STATE_GREEDY_QUANT,
                                      q - quant_min);
                     if (ret < 0)
                         return -1;
                 }
             }
             break;
         default:
             /* unknown opcode: the bytecode is corrupted */
             abort();
         }
     }
 }

 /* Execute the compiled regexp 'bc_buf' on the input 'cbuf' holding
    'clen' characters (cbuf_type: 0 = 8 bit chars, 1 = 16 bit chars).
    'cindex' is the starting position of the match and must be such that
    0 <= cindex <= clen. The 2 * capture_count entries of 'capture' are
    reset to NULL before the execution and filled by it. 'opaque' is
    passed to lre_realloc().
    Return 1 if match, 0 if not match or -1 if error. */
 int lre_exec(uint8_t **capture,
              const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen,
              int cbuf_type, void *opaque)
 {
     REExecContext ctx;
     StackInt *aux_stack;
     int flags, slot, aux_bytes, result;

     /* decode the bytecode header */
     flags = bc_buf[RE_HEADER_FLAGS];
     ctx.multi_line = (flags & LRE_FLAG_MULTILINE) != 0;
     ctx.ignore_case = (flags & LRE_FLAG_IGNORECASE) != 0;
     ctx.is_utf16 = (flags & LRE_FLAG_UTF16) != 0;
     ctx.capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
     ctx.stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];

     /* describe the input; 16 bit input is decoded as UTF-16 when the
        regexp has the UTF-16 flag */
     ctx.cbuf = cbuf;
     ctx.cbuf_end = cbuf + (clen << cbuf_type);
     ctx.cbuf_type = (cbuf_type == 1 && ctx.is_utf16) ? 2 : cbuf_type;
     ctx.opaque = opaque;

     /* the backtracking stack starts empty; each record stores the
        fixed part, all the captures and a full auxiliary stack */
     ctx.state_stack = NULL;
     ctx.state_stack_size = 0;
     ctx.state_stack_len = 0;
     ctx.state_size = sizeof(REExecState) +
         ctx.capture_count * sizeof(capture[0]) * 2 +
         ctx.stack_size_max * sizeof(aux_stack[0]);

     slot = ctx.capture_count * 2;
     while (slot-- > 0)
         capture[slot] = NULL;

     aux_bytes = ctx.stack_size_max * sizeof(aux_stack[0]);
     aux_stack = alloca(aux_bytes);

     result = lre_exec_backtrack(&ctx, capture, aux_stack, 0,
                                 bc_buf + RE_HEADER_LEN,
                                 cbuf + (cindex << cbuf_type), FALSE);

     /* hand the backtracking stack back with a size of 0 (presumably
        this frees it - see lre_realloc()) */
     lre_realloc(ctx.opaque, ctx.state_stack, 0);
     return result;
 }

 /* Return the capture count stored in the header of the compiled
    regexp 'bc_buf'. */
 int lre_get_capture_count(const uint8_t *bc_buf)
 {
     const uint8_t *field = bc_buf + RE_HEADER_CAPTURE_COUNT;

     return *field;
 }

 /* Return the flag byte (LRE_FLAG_x bits) stored in the header of the
    compiled regexp 'bc_buf'. */
 int lre_get_flags(const uint8_t *bc_buf)
 {
     const uint8_t *field = bc_buf + RE_HEADER_FLAGS;

     return *field;
 }

 /* Return NULL if the regexp has no group names (LRE_FLAG_NAMED_GROUPS
    is not set). Otherwise, return a pointer to 'capture_count - 1' zero
    terminated UTF-8 strings stored right after the bytecode. */
 const char *lre_get_groupnames(const uint8_t *bc_buf)
 {
     const uint8_t *names;

     if (!(lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS))
         return NULL;
     /* skip the 7 header bytes (NOTE(review): presumably RE_HEADER_LEN -
        confirm), then the bytecode whose length is the 32 bit value
        stored at offset 3 */
     names = bc_buf + 7;
     names += get_u32(bc_buf + 3);
     return (const char *)names;
 }

 #ifdef TEST

 /* Stub used by the TEST build: always returns FALSE and ignores both
    parameters. */
 BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
 {
     return FALSE;
 }

 /* Allocator used by the TEST build ('opaque' is ignored).
    - size != 0: resize 'ptr' (allocate when 'ptr' is NULL); return the
      new block or NULL if out of memory, in which case 'ptr' is still
      valid.
    - size == 0: free 'ptr' and return NULL. lre_exec() releases its
      backtracking stack this way, and realloc(ptr, 0) cannot be used
      for it because its behavior is implementation-defined. */
 void *lre_realloc(void *opaque, void *ptr, size_t size)
 {
     (void)opaque;
     if (size == 0) {
         free(ptr);
         return NULL;
     }
     return realloc(ptr, size);
 }

 /* Test driver: compile argv[1] as a regexp (flags = 0), execute it on
    argv[2] taken as 8 bit characters starting at index 0, then print the
    result and, when it matched, each capture pointer as an offset in the
    input ("<nil>" for unset entries). Exits with status 1 on a usage or
    compilation error. */
 int main(int argc, char **argv)
 {
     int len, ret, i;
     uint8_t *bc;
     char error_msg[64];
     uint8_t *capture[CAPTURE_COUNT_MAX * 2];
     const char *input;
     int input_len, capture_count;

     if (argc < 3) {
         printf("usage: %s regexp input\n", argv[0]);
         exit(1);
     }
     bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
                      strlen(argv[1]), 0, NULL);
     if (!bc) {
         fprintf(stderr, "error: %s\n", error_msg);
         exit(1);
     }

     input = argv[2];
     input_len = (int)strlen(input);

     ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
     printf("ret=%d\n", ret);
     if (ret == 1) {
         capture_count = lre_get_capture_count(bc);
         for(i = 0; i < 2 * capture_count; i++) {
             uint8_t *ptr;
             ptr = capture[i];
             printf("%d: ", i);
             if (!ptr)
                 printf("<nil>");
             else
                 /* the argument is an int: it must be printed with %d */
                 printf("%d", (int)(ptr - (uint8_t *)input));
             printf("\n");
         }
     }
     return 0;
 }
 #endif