| /* |
| * Unicode utilities |
| * |
| * Copyright (c) 2017-2018 Fabrice Bellard |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to deal |
| * in the Software without restriction, including without limitation the rights |
| * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| * copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in |
| * all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| * THE SOFTWARE. |
| */ |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <stdarg.h> |
| #include <string.h> |
| #include <assert.h> |
| |
| #include "cutils.h" |
| #include "libunicode.h" |
| #include "libunicode-table.h" |
| |
/* Run types read from the 4-bit 'type' field of the case_conv_table1
   entries. The numeric values come from the table data, so the
   order of the constants must not change. */
enum {
    RUN_TYPE_U            = 0,
    RUN_TYPE_L            = 1,
    RUN_TYPE_UF           = 2,
    RUN_TYPE_LF           = 3,
    RUN_TYPE_UL           = 4,
    RUN_TYPE_LSU          = 5,
    RUN_TYPE_U2L_399_EXT2 = 6,
    RUN_TYPE_UF_D20       = 7,
    RUN_TYPE_UF_D1_EXT    = 8,
    RUN_TYPE_U_EXT        = 9,
    RUN_TYPE_LF_EXT       = 10,
    RUN_TYPE_U_EXT2       = 11,
    RUN_TYPE_L_EXT2       = 12,
    RUN_TYPE_U_EXT3       = 13,
};
| |
| /* conv_type: |
| 0 = to upper |
| 1 = to lower |
| 2 = case folding (= to lower with modifications) |
| */ |
| int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) |
| { |
| if (c < 128) { |
| if (conv_type) { |
| if (c >= 'A' && c <= 'Z') { |
| c = c - 'A' + 'a'; |
| } |
| } else { |
| if (c >= 'a' && c <= 'z') { |
| c = c - 'a' + 'A'; |
| } |
| } |
| } else { |
| uint32_t v, code, data, type, len, a, is_lower; |
| int idx, idx_min, idx_max; |
| |
| is_lower = (conv_type != 0); |
| idx_min = 0; |
| idx_max = countof(case_conv_table1) - 1; |
| while (idx_min <= idx_max) { |
| idx = (unsigned)(idx_max + idx_min) / 2; |
| v = case_conv_table1[idx]; |
| code = v >> (32 - 17); |
| len = (v >> (32 - 17 - 7)) & 0x7f; |
| if (c < code) { |
| idx_max = idx - 1; |
| } else if (c >= code + len) { |
| idx_min = idx + 1; |
| } else { |
| type = (v >> (32 - 17 - 7 - 4)) & 0xf; |
| data = ((v & 0xf) << 8) | case_conv_table2[idx]; |
| switch(type) { |
| case RUN_TYPE_U: |
| case RUN_TYPE_L: |
| case RUN_TYPE_UF: |
| case RUN_TYPE_LF: |
| if (conv_type == (type & 1) || |
| (type >= RUN_TYPE_UF && conv_type == 2)) { |
| c = c - code + (case_conv_table1[data] >> (32 - 17)); |
| } |
| break; |
| case RUN_TYPE_UL: |
| a = c - code; |
| if ((a & 1) != (1 - is_lower)) |
| break; |
| c = (a ^ 1) + code; |
| break; |
| case RUN_TYPE_LSU: |
| a = c - code; |
| if (a == 1) { |
| c += 2 * is_lower - 1; |
| } else if (a == (1 - is_lower) * 2) { |
| c += (2 * is_lower - 1) * 2; |
| } |
| break; |
| case RUN_TYPE_U2L_399_EXT2: |
| if (!is_lower) { |
| res[0] = c - code + case_conv_ext[data >> 6]; |
| res[1] = 0x399; |
| return 2; |
| } else { |
| c = c - code + case_conv_ext[data & 0x3f]; |
| } |
| break; |
| case RUN_TYPE_UF_D20: |
| if (conv_type == 1) |
| break; |
| c = data + (conv_type == 2) * 0x20; |
| break; |
| case RUN_TYPE_UF_D1_EXT: |
| if (conv_type == 1) |
| break; |
| c = case_conv_ext[data] + (conv_type == 2); |
| break; |
| case RUN_TYPE_U_EXT: |
| case RUN_TYPE_LF_EXT: |
| if (is_lower != (type - RUN_TYPE_U_EXT)) |
| break; |
| c = case_conv_ext[data]; |
| break; |
| case RUN_TYPE_U_EXT2: |
| case RUN_TYPE_L_EXT2: |
| if (conv_type != (type - RUN_TYPE_U_EXT2)) |
| break; |
| res[0] = c - code + case_conv_ext[data >> 6]; |
| res[1] = case_conv_ext[data & 0x3f]; |
| return 2; |
| default: |
| case RUN_TYPE_U_EXT3: |
| if (conv_type != 0) |
| break; |
| res[0] = case_conv_ext[data >> 8]; |
| res[1] = case_conv_ext[(data >> 4) & 0xf]; |
| res[2] = case_conv_ext[data & 0xf]; |
| return 3; |
| } |
| break; |
| } |
| } |
| } |
| res[0] = c; |
| return 1; |
| } |
| |
/* Read the 24-bit little-endian unsigned value stored in the three
   bytes at 'ptr'.

   The value is assembled byte by byte on every target: reading the
   first two bytes through a 'uint16_t *' cast breaks the strict
   aliasing rule and may be a misaligned access, since 'ptr' has no
   alignment guarantee. */
static uint32_t get_le24(const uint8_t *ptr)
{
    return (uint32_t)ptr[0] |
        ((uint32_t)ptr[1] << 8) |
        ((uint32_t)ptr[2] << 16);
}
| |
#define UNICODE_INDEX_BLOCK_LEN 32

/* Look up 'c' in an index table made of 'index_table_len' 3-byte
   little-endian entries. The low 21 bits of an entry are a code
   point and the upper 3 bits a small byte offset.

   Return -1 if 'c' is not smaller than the value of the last entry
   (not in table). Otherwise return the byte position where decoding
   of the associated data table must start and set '*pcode' to the
   code point corresponding to that position (position 0 and code
   point 0 when 'c' is below the first entry). */
static int get_index_pos(uint32_t *pcode, uint32_t c,
                         const uint8_t *index_table, int index_table_len)
{
    const uint32_t code_mask = (1 << 21) - 1;
    uint32_t entry, bound;
    int lo, hi, mid;

    entry = get_le24(index_table);
    bound = entry & code_mask;
    if (c < bound) {
        /* before the first index entry: start of the data table */
        *pcode = 0;
        return 0;
    }
    hi = index_table_len - 1;
    bound = get_le24(index_table + hi * 3);
    if (c >= bound)
        return -1;
    lo = 0;
    /* invariant: tab[lo] <= c < tab[hi] */
    while (hi - lo > 1) {
        mid = (hi + lo) / 2;
        entry = get_le24(index_table + mid * 3);
        if (c < (entry & code_mask))
            hi = mid;
        else
            lo = mid;
    }
    entry = get_le24(index_table + lo * 3);
    *pcode = entry & code_mask;
    return (lo + 1) * UNICODE_INDEX_BLOCK_LEN + (entry >> 21);
}
| |
| static BOOL lre_is_in_table(uint32_t c, const uint8_t *table, |
| const uint8_t *index_table, int index_table_len) |
| { |
| uint32_t code, b, bit; |
| int pos; |
| const uint8_t *p; |
| |
| pos = get_index_pos(&code, c, index_table, index_table_len); |
| if (pos < 0) |
| return FALSE; /* outside the table */ |
| p = table + pos; |
| bit = 0; |
| for(;;) { |
| b = *p++; |
| if (b < 64) { |
| code += (b >> 3) + 1; |
| if (c < code) |
| return bit; |
| bit ^= 1; |
| code += (b & 7) + 1; |
| } else if (b >= 0x80) { |
| code += b - 0x80 + 1; |
| } else if (b < 0x60) { |
| code += (((b - 0x40) << 8) | p[0]) + 1; |
| p++; |
| } else { |
| code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1; |
| p += 2; |
| } |
| if (c < code) |
| return bit; |
| bit ^= 1; |
| } |
| } |
| |
| BOOL lre_is_cased(uint32_t c) |
| { |
| uint32_t v, code, len; |
| int idx, idx_min, idx_max; |
| |
| idx_min = 0; |
| idx_max = countof(case_conv_table1) - 1; |
| while (idx_min <= idx_max) { |
| idx = (unsigned)(idx_max + idx_min) / 2; |
| v = case_conv_table1[idx]; |
| code = v >> (32 - 17); |
| len = (v >> (32 - 17 - 7)) & 0x7f; |
| if (c < code) { |
| idx_max = idx - 1; |
| } else if (c >= code + len) { |
| idx_min = idx + 1; |
| } else { |
| return TRUE; |
| } |
| } |
| return lre_is_in_table(c, unicode_prop_Cased1_table, |
| unicode_prop_Cased1_index, |
| sizeof(unicode_prop_Cased1_index) / 3); |
| } |
| |
| BOOL lre_is_case_ignorable(uint32_t c) |
| { |
| return lre_is_in_table(c, unicode_prop_Case_Ignorable_table, |
| unicode_prop_Case_Ignorable_index, |
| sizeof(unicode_prop_Case_Ignorable_index) / 3); |
| } |
| |
| /* character range */ |
| |
| static __maybe_unused void cr_dump(CharRange *cr) |
| { |
| int i; |
| for(i = 0; i < cr->len; i++) |
| printf("%d: 0x%04x\n", i, cr->points[i]); |
| } |
| |
/* Default allocator used when cr_init() is given no realloc function.
   'opaque' is ignored.

   size == 0 means "free 'ptr'" (this is how cr_free() releases the
   buffer) and always returns NULL. It is handled explicitly because
   the result of realloc() with a zero size is implementation-defined:
   some C libraries return a live minimum-size allocation for
   realloc(NULL, 0), which would be leaked each time an empty
   CharRange is freed.

   Otherwise behaves like realloc(): returns the new buffer, or NULL
   on failure with 'ptr' left valid. */
static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
{
    (void)opaque;
    if (size == 0) {
        free(ptr);
        return NULL;
    }
    return realloc(ptr, size);
}
| |
| void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func) |
| { |
| cr->len = cr->size = 0; |
| cr->points = NULL; |
| cr->mem_opaque = mem_opaque; |
| cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc; |
| } |
| |
| void cr_free(CharRange *cr) |
| { |
| cr->realloc_func(cr->mem_opaque, cr->points, 0); |
| } |
| |
| int cr_realloc(CharRange *cr, int size) |
| { |
| int new_size; |
| uint32_t *new_buf; |
| |
| if (size > cr->size) { |
| new_size = max_int(size, cr->size * 3 / 2); |
| new_buf = cr->realloc_func(cr->mem_opaque, cr->points, |
| new_size * sizeof(cr->points[0])); |
| if (!new_buf) |
| return -1; |
| cr->points = new_buf; |
| cr->size = new_size; |
| } |
| return 0; |
| } |
| |
| int cr_copy(CharRange *cr, const CharRange *cr1) |
| { |
| if (cr_realloc(cr, cr1->len)) |
| return -1; |
| memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len); |
| cr->len = cr1->len; |
| return 0; |
| } |
| |
| /* merge consecutive intervals and remove empty intervals */ |
| static void cr_compress(CharRange *cr) |
| { |
| int i, j, k, len; |
| uint32_t *pt; |
| |
| pt = cr->points; |
| len = cr->len; |
| i = 0; |
| j = 0; |
| k = 0; |
| while ((i + 1) < len) { |
| if (pt[i] == pt[i + 1]) { |
| /* empty interval */ |
| i += 2; |
| } else { |
| j = i; |
| while ((j + 3) < len && pt[j + 1] == pt[j + 2]) |
| j += 2; |
| /* just copy */ |
| pt[k] = pt[i]; |
| pt[k + 1] = pt[j + 1]; |
| k += 2; |
| i = j + 2; |
| } |
| } |
| cr->len = k; |
| } |
| |
| /* union or intersection */ |
| int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len, |
| const uint32_t *b_pt, int b_len, int op) |
| { |
| int a_idx, b_idx, is_in; |
| uint32_t v; |
| |
| a_idx = 0; |
| b_idx = 0; |
| for(;;) { |
| /* get one more point from a or b in increasing order */ |
| if (a_idx < a_len && b_idx < b_len) { |
| if (a_pt[a_idx] < b_pt[b_idx]) { |
| goto a_add; |
| } else if (a_pt[a_idx] == b_pt[b_idx]) { |
| v = a_pt[a_idx]; |
| a_idx++; |
| b_idx++; |
| } else { |
| goto b_add; |
| } |
| } else if (a_idx < a_len) { |
| a_add: |
| v = a_pt[a_idx++]; |
| } else if (b_idx < b_len) { |
| b_add: |
| v = b_pt[b_idx++]; |
| } else { |
| break; |
| } |
| /* add the point if the in/out status changes */ |
| switch(op) { |
| case CR_OP_UNION: |
| is_in = (a_idx & 1) | (b_idx & 1); |
| break; |
| case CR_OP_INTER: |
| is_in = (a_idx & 1) & (b_idx & 1); |
| break; |
| case CR_OP_XOR: |
| is_in = (a_idx & 1) ^ (b_idx & 1); |
| break; |
| default: |
| abort(); |
| } |
| if (is_in != (cr->len & 1)) { |
| if (cr_add_point(cr, v)) |
| return -1; |
| } |
| } |
| cr_compress(cr); |
| return 0; |
| } |
| |
| int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len) |
| { |
| CharRange a = *cr; |
| int ret; |
| cr->len = 0; |
| cr->size = 0; |
| cr->points = NULL; |
| ret = cr_op(cr, a.points, a.len, b_pt, b_len, CR_OP_UNION); |
| cr_free(&a); |
| return ret; |
| } |
| |
| int cr_invert(CharRange *cr) |
| { |
| int len; |
| len = cr->len; |
| if (cr_realloc(cr, len + 2)) |
| return -1; |
| memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0])); |
| cr->points[0] = 0; |
| cr->points[len + 1] = UINT32_MAX; |
| cr->len = len + 2; |
| cr_compress(cr); |
| return 0; |
| } |
| |
| #ifdef CONFIG_ALL_UNICODE |
| |
| BOOL lre_is_id_start(uint32_t c) |
| { |
| return lre_is_in_table(c, unicode_prop_ID_Start_table, |
| unicode_prop_ID_Start_index, |
| sizeof(unicode_prop_ID_Start_index) / 3); |
| } |
| |
| BOOL lre_is_id_continue(uint32_t c) |
| { |
| return lre_is_id_start(c) || |
| lre_is_in_table(c, unicode_prop_ID_Continue1_table, |
| unicode_prop_ID_Continue1_index, |
| sizeof(unicode_prop_ID_Continue1_index) / 3); |
| } |
| |
/* maximum number of code points produced by one table entry
   (see DECOMP_TYPE_B18) */
#define UNICODE_DECOMP_LEN_MAX 18

/* Encoding of a decomposition table entry. The values are read from
   the 6-bit 'type' field of unicode_decomp_table1, so the order must
   not change. */
typedef enum {
    DECOMP_TYPE_C1     = 0,  /* 16 bit char */
    DECOMP_TYPE_L1     = 1,  /* 16 bit char table */
    DECOMP_TYPE_L2     = 2,
    DECOMP_TYPE_L3     = 3,
    DECOMP_TYPE_L4     = 4,
    DECOMP_TYPE_L5     = 5,  /* XXX: not used */
    DECOMP_TYPE_L6     = 6,  /* XXX: could remove */
    DECOMP_TYPE_L7     = 7,  /* XXX: could remove */
    DECOMP_TYPE_LL1    = 8,  /* 18 bit char table */
    DECOMP_TYPE_LL2    = 9,
    DECOMP_TYPE_S1     = 10, /* 8 bit char table */
    DECOMP_TYPE_S2     = 11,
    DECOMP_TYPE_S3     = 12,
    DECOMP_TYPE_S4     = 13,
    DECOMP_TYPE_S5     = 14,
    DECOMP_TYPE_I1     = 15, /* increment 16 bit char value */
    DECOMP_TYPE_I2_0   = 16,
    DECOMP_TYPE_I2_1   = 17,
    DECOMP_TYPE_I3_1   = 18,
    DECOMP_TYPE_I3_2   = 19,
    DECOMP_TYPE_I4_1   = 20,
    DECOMP_TYPE_I4_2   = 21,
    DECOMP_TYPE_B1     = 22, /* 16 bit base + 8 bit offset */
    DECOMP_TYPE_B2     = 23,
    DECOMP_TYPE_B3     = 24,
    DECOMP_TYPE_B4     = 25,
    DECOMP_TYPE_B5     = 26,
    DECOMP_TYPE_B6     = 27,
    DECOMP_TYPE_B7     = 28,
    DECOMP_TYPE_B8     = 29,
    DECOMP_TYPE_B18    = 30,
    DECOMP_TYPE_LS2    = 31,
    DECOMP_TYPE_PAT3   = 32,
    DECOMP_TYPE_S2_UL  = 33,
    DECOMP_TYPE_LS2_UL = 34,
} DecompTypeEnum;
| |
/* Expand an 8-bit short code to a code point:
     0x00..0x7f : the same value
     0x80..0xcf : 0x300..0x34f
     0xd0, 0xd1 : 0x2044 and 0x2215
   'c' must not be larger than 0xd1. */
static uint32_t unicode_get_short_code(uint32_t c)
{
    static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };

    if (c >= 0x80 + 0x50)
        return unicode_short_table[c - (0x80 + 0x50)];
    if (c >= 0x80)
        return c + (0x300 - 0x80);
    return c;
}
| |
/* Return 'c' + 0x20 when c < 0x100 or 0x410 <= c <= 0x42f, and
   'c' + 1 otherwise. */
static uint32_t unicode_get_lower_simple(uint32_t c)
{
    int gap_is_0x20 = (c < 0x100) || (c >= 0x410 && c <= 0x42f);

    return c + (gap_is_0x20 ? 0x20 : 1);
}
| |
/* Read a 16-bit little-endian value from the two bytes at 'p'. */
static uint16_t unicode_get16(const uint8_t *p)
{
    uint16_t low = p[0];
    uint16_t high = p[1];

    return low | (high << 8);
}
| |
| static int unicode_decomp_entry(uint32_t *res, uint32_t c, |
| int idx, uint32_t code, uint32_t len, |
| uint32_t type) |
| { |
| uint32_t c1; |
| int l, i, p; |
| const uint8_t *d; |
| |
| if (type == DECOMP_TYPE_C1) { |
| res[0] = unicode_decomp_table2[idx]; |
| return 1; |
| } else { |
| d = unicode_decomp_data + unicode_decomp_table2[idx]; |
| switch(type) { |
| case DECOMP_TYPE_L1: |
| case DECOMP_TYPE_L2: |
| case DECOMP_TYPE_L3: |
| case DECOMP_TYPE_L4: |
| case DECOMP_TYPE_L5: |
| case DECOMP_TYPE_L6: |
| case DECOMP_TYPE_L7: |
| l = type - DECOMP_TYPE_L1 + 1; |
| d += (c - code) * l * 2; |
| for(i = 0; i < l; i++) { |
| if ((res[i] = unicode_get16(d + 2 * i)) == 0) |
| return 0; |
| } |
| return l; |
| case DECOMP_TYPE_LL1: |
| case DECOMP_TYPE_LL2: |
| { |
| uint32_t k, p; |
| l = type - DECOMP_TYPE_LL1 + 1; |
| k = (c - code) * l; |
| p = len * l * 2; |
| for(i = 0; i < l; i++) { |
| c1 = unicode_get16(d + 2 * k) | |
| (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16); |
| if (!c1) |
| return 0; |
| res[i] = c1; |
| k++; |
| } |
| } |
| return l; |
| case DECOMP_TYPE_S1: |
| case DECOMP_TYPE_S2: |
| case DECOMP_TYPE_S3: |
| case DECOMP_TYPE_S4: |
| case DECOMP_TYPE_S5: |
| l = type - DECOMP_TYPE_S1 + 1; |
| d += (c - code) * l; |
| for(i = 0; i < l; i++) { |
| if ((res[i] = unicode_get_short_code(d[i])) == 0) |
| return 0; |
| } |
| return l; |
| case DECOMP_TYPE_I1: |
| l = 1; |
| p = 0; |
| goto decomp_type_i; |
| case DECOMP_TYPE_I2_0: |
| case DECOMP_TYPE_I2_1: |
| case DECOMP_TYPE_I3_1: |
| case DECOMP_TYPE_I3_2: |
| case DECOMP_TYPE_I4_1: |
| case DECOMP_TYPE_I4_2: |
| l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1); |
| p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2); |
| decomp_type_i: |
| for(i = 0; i < l; i++) { |
| c1 = unicode_get16(d + 2 * i); |
| if (i == p) |
| c1 += c - code; |
| res[i] = c1; |
| } |
| return l; |
| case DECOMP_TYPE_B18: |
| l = 18; |
| goto decomp_type_b; |
| case DECOMP_TYPE_B1: |
| case DECOMP_TYPE_B2: |
| case DECOMP_TYPE_B3: |
| case DECOMP_TYPE_B4: |
| case DECOMP_TYPE_B5: |
| case DECOMP_TYPE_B6: |
| case DECOMP_TYPE_B7: |
| case DECOMP_TYPE_B8: |
| l = type - DECOMP_TYPE_B1 + 1; |
| decomp_type_b: |
| { |
| uint32_t c_min; |
| c_min = unicode_get16(d); |
| d += 2 + (c - code) * l; |
| for(i = 0; i < l; i++) { |
| c1 = d[i]; |
| if (c1 == 0xff) |
| c1 = 0x20; |
| else |
| c1 += c_min; |
| res[i] = c1; |
| } |
| } |
| return l; |
| case DECOMP_TYPE_LS2: |
| d += (c - code) * 3; |
| if (!(res[0] = unicode_get16(d))) |
| return 0; |
| res[1] = unicode_get_short_code(d[2]); |
| return 2; |
| case DECOMP_TYPE_PAT3: |
| res[0] = unicode_get16(d); |
| res[2] = unicode_get16(d + 2); |
| d += 4 + (c - code) * 2; |
| res[1] = unicode_get16(d); |
| return 3; |
| case DECOMP_TYPE_S2_UL: |
| case DECOMP_TYPE_LS2_UL: |
| c1 = c - code; |
| if (type == DECOMP_TYPE_S2_UL) { |
| d += c1 & ~1; |
| c = unicode_get_short_code(*d); |
| d++; |
| } else { |
| d += (c1 >> 1) * 3; |
| c = unicode_get16(d); |
| d += 2; |
| } |
| if (c1 & 1) |
| c = unicode_get_lower_simple(c); |
| res[0] = c; |
| res[1] = unicode_get_short_code(*d); |
| return 2; |
| } |
| } |
| return 0; |
| } |
| |
| |
| /* return the length of the decomposition (length <= |
| UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */ |
| static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1) |
| { |
| uint32_t v, type, is_compat, code, len; |
| int idx_min, idx_max, idx; |
| |
| idx_min = 0; |
| idx_max = countof(unicode_decomp_table1) - 1; |
| while (idx_min <= idx_max) { |
| idx = (idx_max + idx_min) / 2; |
| v = unicode_decomp_table1[idx]; |
| code = v >> (32 - 18); |
| len = (v >> (32 - 18 - 7)) & 0x7f; |
| // printf("idx=%d code=%05x len=%d\n", idx, code, len); |
| if (c < code) { |
| idx_max = idx - 1; |
| } else if (c >= code + len) { |
| idx_min = idx + 1; |
| } else { |
| is_compat = v & 1; |
| if (is_compat1 < is_compat) |
| break; |
| type = (v >> (32 - 18 - 7 - 6)) & 0x3f; |
| return unicode_decomp_entry(res, c, idx, code, len, type); |
| } |
| } |
| return 0; |
| } |
| |
| /* return 0 if no pair found */ |
| static int unicode_compose_pair(uint32_t c0, uint32_t c1) |
| { |
| uint32_t code, len, type, v, idx1, d_idx, d_offset, ch; |
| int idx_min, idx_max, idx, d; |
| uint32_t pair[2]; |
| |
| idx_min = 0; |
| idx_max = countof(unicode_comp_table) - 1; |
| while (idx_min <= idx_max) { |
| idx = (idx_max + idx_min) / 2; |
| idx1 = unicode_comp_table[idx]; |
| |
| /* idx1 represent an entry of the decomposition table */ |
| d_idx = idx1 >> 6; |
| d_offset = idx1 & 0x3f; |
| v = unicode_decomp_table1[d_idx]; |
| code = v >> (32 - 18); |
| len = (v >> (32 - 18 - 7)) & 0x7f; |
| type = (v >> (32 - 18 - 7 - 6)) & 0x3f; |
| ch = code + d_offset; |
| unicode_decomp_entry(pair, ch, d_idx, code, len, type); |
| d = c0 - pair[0]; |
| if (d == 0) |
| d = c1 - pair[1]; |
| if (d < 0) { |
| idx_max = idx - 1; |
| } else if (d > 0) { |
| idx_min = idx + 1; |
| } else { |
| return ch; |
| } |
| } |
| return 0; |
| } |
| |
| /* return the combining class of character c (between 0 and 255) */ |
| static int unicode_get_cc(uint32_t c) |
| { |
| uint32_t code, n, type, cc, c1, b; |
| int pos; |
| const uint8_t *p; |
| |
| pos = get_index_pos(&code, c, |
| unicode_cc_index, sizeof(unicode_cc_index) / 3); |
| if (pos < 0) |
| return 0; |
| p = unicode_cc_table + pos; |
| for(;;) { |
| b = *p++; |
| type = b >> 6; |
| n = b & 0x3f; |
| if (n < 48) { |
| } else if (n < 56) { |
| n = (n - 48) << 8; |
| n |= *p++; |
| n += 48; |
| } else { |
| n = (n - 56) << 8; |
| n |= *p++ << 8; |
| n |= *p++; |
| n += 48 + (1 << 11); |
| } |
| if (type <= 1) |
| p++; |
| c1 = code + n + 1; |
| if (c < c1) { |
| switch(type) { |
| case 0: |
| cc = p[-1]; |
| break; |
| case 1: |
| cc = p[-1] + c - code; |
| break; |
| case 2: |
| cc = 0; |
| break; |
| default: |
| case 3: |
| cc = 230; |
| break; |
| } |
| return cc; |
| } |
| code = c1; |
| } |
| } |
| |
/* Reorder in place each maximal sequence of characters with a non
   zero combining class so that the classes are in increasing order.
   An insertion sort is used, so characters of equal class keep their
   relative order. */
static void sort_cc(int *buf, int len)
{
    int pos, run_start, next, slot, moved, moved_cc;

    pos = 0;
    while (pos < len) {
        if (unicode_get_cc(buf[pos]) == 0) {
            pos++;
            continue;
        }
        /* start of a sequence of combining characters */
        run_start = pos;
        for(next = pos + 1; next < len; next++) {
            moved = buf[next];
            moved_cc = unicode_get_cc(moved);
            if (moved_cc == 0)
                break;
            /* insert 'moved' in the already sorted part */
            slot = next - 1;
            while (slot >= run_start &&
                   unicode_get_cc(buf[slot]) > moved_cc) {
                buf[slot + 1] = buf[slot];
                slot--;
            }
            buf[slot + 1] = moved;
        }
        /* buf[next], if any, has a zero class: skip it too */
        pos = next + 1;
    }
}
| |
| static void to_nfd_rec(DynBuf *dbuf, |
| const int *src, int src_len, int is_compat) |
| { |
| uint32_t c, v; |
| int i, l; |
| uint32_t res[UNICODE_DECOMP_LEN_MAX]; |
| |
| for(i = 0; i < src_len; i++) { |
| c = src[i]; |
| if (c >= 0xac00 && c < 0xd7a4) { |
| /* Hangul decomposition */ |
| c -= 0xac00; |
| dbuf_put_u32(dbuf, 0x1100 + c / 588); |
| dbuf_put_u32(dbuf, 0x1161 + (c % 588) / 28); |
| v = c % 28; |
| if (v != 0) |
| dbuf_put_u32(dbuf, 0x11a7 + v); |
| } else { |
| l = unicode_decomp_char(res, c, is_compat); |
| if (l) { |
| to_nfd_rec(dbuf, (int *)res, l, is_compat); |
| } else { |
| dbuf_put_u32(dbuf, c); |
| } |
| } |
| } |
| } |
| |
/* Return the composition of the pair (c0, c1) or 0 if not found.

   Hangul syllables are composed arithmetically:
     - leading consonant (0x1100 + 0..18) followed by a vowel
       (0x1161 + 0..20) gives an LV syllable;
     - an LV syllable (0xac00 + k * 28) followed by a trailing
       consonant (0x11a7 + 1..27) gives an LVT syllable.
   All the other pairs are searched in the composition table. */
static int compose_pair(uint32_t c0, uint32_t c1)
{
    /* Hangul composition */
    if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
        c1 >= 0x1161 && c1 < 0x1161 + 21) {
        return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
    } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
               (c0 - 0xac00) % 28 == 0 &&
               c1 > 0x11a7 && c1 < 0x11a7 + 28) {
        /* 0x11a7 itself is excluded: it corresponds to the trailing
           index 0 ("no trailing consonant"), so accepting it would
           return c0 unchanged and make the caller drop c1. */
        return c0 + c1 - 0x11a7;
    } else {
        return unicode_compose_pair(c0, c1);
    }
}
| |
| int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len, |
| UnicodeNormalizationEnum n_type, |
| void *opaque, DynBufReallocFunc *realloc_func) |
| { |
| int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len; |
| BOOL is_compat; |
| DynBuf dbuf_s, *dbuf = &dbuf_s; |
| |
| is_compat = n_type >> 1; |
| |
| dbuf_init2(dbuf, opaque, realloc_func); |
| if (dbuf_realloc(dbuf, sizeof(int) * src_len)) |
| goto fail; |
| |
| /* common case: latin1 is unaffected by NFC */ |
| if (n_type == UNICODE_NFC) { |
| for(i = 0; i < src_len; i++) { |
| if (src[i] >= 0x100) |
| goto not_latin1; |
| } |
| buf = (int *)dbuf->buf; |
| memcpy(buf, src, src_len * sizeof(int)); |
| *pdst = (uint32_t *)buf; |
| return src_len; |
| not_latin1: ; |
| } |
| |
| to_nfd_rec(dbuf, (const int *)src, src_len, is_compat); |
| if (dbuf_error(dbuf)) { |
| fail: |
| *pdst = NULL; |
| return -1; |
| } |
| buf = (int *)dbuf->buf; |
| buf_len = dbuf->size / sizeof(int); |
| |
| sort_cc(buf, buf_len); |
| |
| if (buf_len <= 1 || (n_type & 1) != 0) { |
| /* NFD / NFKD */ |
| *pdst = (uint32_t *)buf; |
| return buf_len; |
| } |
| |
| i = 1; |
| out_len = 1; |
| while (i < buf_len) { |
| /* find the starter character and test if it is blocked from |
| the character at 'i' */ |
| last_cc = unicode_get_cc(buf[i]); |
| starter_pos = out_len - 1; |
| while (starter_pos >= 0) { |
| cc = unicode_get_cc(buf[starter_pos]); |
| if (cc == 0) |
| break; |
| if (cc >= last_cc) |
| goto next; |
| last_cc = 256; |
| starter_pos--; |
| } |
| if (starter_pos >= 0 && |
| (p = compose_pair(buf[starter_pos], buf[i])) != 0) { |
| buf[starter_pos] = p; |
| i++; |
| } else { |
| next: |
| buf[out_len++] = buf[i++]; |
| } |
| } |
| *pdst = (uint32_t *)buf; |
| return out_len; |
| } |
| |
| /* char ranges for various unicode properties */ |
| |
/* Search 'name' in 'name_table' and return the index of the entry
   containing it, or -1 if not found.
   'name_table' is a sequence of null terminated entries ended by an
   empty entry. Each entry is a comma separated list of aliases;
   'name' must match one alias exactly. */
static int unicode_find_name(const char *name_table, const char *name)
{
    const char *entry = name_table;
    const char *comma;
    size_t wanted_len = strlen(name);
    size_t alias_len;
    int index = 0;

    while (*entry != '\0') {
        /* test all the aliases of the current entry */
        do {
            comma = strchr(entry, ',');
            if (comma)
                alias_len = comma - entry;
            else
                alias_len = strlen(entry);
            if (alias_len == wanted_len && memcmp(entry, name, wanted_len) == 0)
                return index;
            /* skip the alias and its ',' or null terminator */
            entry += alias_len + 1;
        } while (comma);
        index++;
    }
    return -1;
}
| |
| /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2 |
| if not found */ |
| int unicode_script(CharRange *cr, |
| const char *script_name, BOOL is_ext) |
| { |
| int script_idx; |
| const uint8_t *p, *p_end; |
| uint32_t c, c1, b, n, v, v_len, i, type; |
| CharRange cr1_s, *cr1; |
| CharRange cr2_s, *cr2 = &cr2_s; |
| BOOL is_common; |
| |
| script_idx = unicode_find_name(unicode_script_name_table, script_name); |
| if (script_idx < 0) |
| return -2; |
| /* Note: we remove the "Unknown" Script */ |
| script_idx += UNICODE_SCRIPT_Unknown + 1; |
| |
| is_common = (script_idx == UNICODE_SCRIPT_Common || |
| script_idx == UNICODE_SCRIPT_Inherited); |
| if (is_ext) { |
| cr1 = &cr1_s; |
| cr_init(cr1, cr->mem_opaque, cr->realloc_func); |
| cr_init(cr2, cr->mem_opaque, cr->realloc_func); |
| } else { |
| cr1 = cr; |
| } |
| |
| p = unicode_script_table; |
| p_end = unicode_script_table + countof(unicode_script_table); |
| c = 0; |
| while (p < p_end) { |
| b = *p++; |
| type = b >> 7; |
| n = b & 0x7f; |
| if (n < 96) { |
| } else if (n < 112) { |
| n = (n - 96) << 8; |
| n |= *p++; |
| n += 96; |
| } else { |
| n = (n - 112) << 16; |
| n |= *p++ << 8; |
| n |= *p++; |
| n += 96 + (1 << 12); |
| } |
| if (type == 0) |
| v = 0; |
| else |
| v = *p++; |
| c1 = c + n + 1; |
| if (v == script_idx) { |
| if (cr_add_interval(cr1, c, c1)) |
| goto fail; |
| } |
| c = c1; |
| } |
| |
| if (is_ext) { |
| /* add the script extensions */ |
| p = unicode_script_ext_table; |
| p_end = unicode_script_ext_table + countof(unicode_script_ext_table); |
| c = 0; |
| while (p < p_end) { |
| b = *p++; |
| if (b < 128) { |
| n = b; |
| } else if (b < 128 + 64) { |
| n = (b - 128) << 8; |
| n |= *p++; |
| n += 128; |
| } else { |
| n = (b - 128 - 64) << 16; |
| n |= *p++ << 8; |
| n |= *p++; |
| n += 128 + (1 << 14); |
| } |
| c1 = c + n + 1; |
| v_len = *p++; |
| if (is_common) { |
| if (v_len != 0) { |
| if (cr_add_interval(cr2, c, c1)) |
| goto fail; |
| } |
| } else { |
| for(i = 0; i < v_len; i++) { |
| if (p[i] == script_idx) { |
| if (cr_add_interval(cr2, c, c1)) |
| goto fail; |
| break; |
| } |
| } |
| } |
| p += v_len; |
| c = c1; |
| } |
| if (is_common) { |
| /* remove all the characters with script extensions */ |
| if (cr_invert(cr2)) |
| goto fail; |
| if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len, |
| CR_OP_INTER)) |
| goto fail; |
| } else { |
| if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len, |
| CR_OP_UNION)) |
| goto fail; |
| } |
| cr_free(cr1); |
| cr_free(cr2); |
| } |
| return 0; |
| fail: |
| if (is_ext) { |
| cr_free(cr1); |
| cr_free(cr2); |
| } |
| goto fail; |
| } |
| |
| #define M(id) (1U << UNICODE_GC_ ## id) |
| |
| static int unicode_general_category1(CharRange *cr, uint32_t gc_mask) |
| { |
| const uint8_t *p, *p_end; |
| uint32_t c, c0, b, n, v; |
| |
| p = unicode_gc_table; |
| p_end = unicode_gc_table + countof(unicode_gc_table); |
| c = 0; |
| while (p < p_end) { |
| b = *p++; |
| n = b >> 5; |
| v = b & 0x1f; |
| if (n == 7) { |
| n = *p++; |
| if (n < 128) { |
| n += 7; |
| } else if (n < 128 + 64) { |
| n = (n - 128) << 8; |
| n |= *p++; |
| n += 7 + 128; |
| } else { |
| n = (n - 128 - 64) << 16; |
| n |= *p++ << 8; |
| n |= *p++; |
| n += 7 + 128 + (1 << 14); |
| } |
| } |
| c0 = c; |
| c += n + 1; |
| if (v == 31) { |
| /* run of Lu / Ll */ |
| b = gc_mask & (M(Lu) | M(Ll)); |
| if (b != 0) { |
| if (b == (M(Lu) | M(Ll))) { |
| goto add_range; |
| } else { |
| c0 += ((gc_mask & M(Ll)) != 0); |
| for(; c0 < c; c0 += 2) { |
| if (cr_add_interval(cr, c0, c0 + 1)) |
| return -1; |
| } |
| } |
| } |
| } else if ((gc_mask >> v) & 1) { |
| add_range: |
| if (cr_add_interval(cr, c0, c)) |
| return -1; |
| } |
| } |
| return 0; |
| } |
| |
| static int unicode_prop1(CharRange *cr, int prop_idx) |
| { |
| const uint8_t *p, *p_end; |
| uint32_t c, c0, b, bit; |
| |
| p = unicode_prop_table[prop_idx]; |
| p_end = p + unicode_prop_len_table[prop_idx]; |
| c = 0; |
| bit = 0; |
| while (p < p_end) { |
| c0 = c; |
| b = *p++; |
| if (b < 64) { |
| c += (b >> 3) + 1; |
| if (bit) { |
| if (cr_add_interval(cr, c0, c)) |
| return -1; |
| } |
| bit ^= 1; |
| c0 = c; |
| c += (b & 7) + 1; |
| } else if (b >= 0x80) { |
| c += b - 0x80 + 1; |
| } else if (b < 0x60) { |
| c += (((b - 0x40) << 8) | p[0]) + 1; |
| p++; |
| } else { |
| c += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1; |
| p += 2; |
| } |
| if (bit) { |
| if (cr_add_interval(cr, c0, c)) |
| return -1; |
| } |
| bit ^= 1; |
| } |
| return 0; |
| } |
| |
| #define CASE_U (1 << 0) |
| #define CASE_L (1 << 1) |
| #define CASE_F (1 << 2) |
| |
| /* use the case conversion table to generate range of characters. |
| CASE_U: set char if modified by uppercasing, |
| CASE_L: set char if modified by lowercasing, |
| CASE_F: set char if modified by case folding, |
| */ |
| static int unicode_case1(CharRange *cr, int case_mask) |
| { |
| #define MR(x) (1 << RUN_TYPE_ ## x) |
| const uint32_t tab_run_mask[3] = { |
| MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) | |
| MR(UF_D1_EXT) | MR(U_EXT) | MR(U_EXT2) | MR(U_EXT3), |
| |
| MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(L_EXT2), |
| |
| MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT), |
| }; |
| #undef MR |
| uint32_t mask, v, code, type, len, i, idx; |
| |
| if (case_mask == 0) |
| return 0; |
| mask = 0; |
| for(i = 0; i < 3; i++) { |
| if ((case_mask >> i) & 1) |
| mask |= tab_run_mask[i]; |
| } |
| for(idx = 0; idx < countof(case_conv_table1); idx++) { |
| v = case_conv_table1[idx]; |
| type = (v >> (32 - 17 - 7 - 4)) & 0xf; |
| code = v >> (32 - 17); |
| len = (v >> (32 - 17 - 7)) & 0x7f; |
| if ((mask >> type) & 1) { |
| // printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1); |
| switch(type) { |
| case RUN_TYPE_UL: |
| if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F))) |
| goto def_case; |
| code += ((case_mask & CASE_U) != 0); |
| for(i = 0; i < len; i += 2) { |
| if (cr_add_interval(cr, code + i, code + i + 1)) |
| return -1; |
| } |
| break; |
| case RUN_TYPE_LSU: |
| if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F))) |
| goto def_case; |
| if (!(case_mask & CASE_U)) { |
| if (cr_add_interval(cr, code, code + 1)) |
| return -1; |
| } |
| if (cr_add_interval(cr, code + 1, code + 2)) |
| return -1; |
| if (case_mask & CASE_U) { |
| if (cr_add_interval(cr, code + 2, code + 3)) |
| return -1; |
| } |
| break; |
| default: |
| def_case: |
| if (cr_add_interval(cr, code, code + len)) |
| return -1; |
| break; |
| } |
| } |
| } |
| return 0; |
| } |
| |
/* Operations of the small stack machine run by unicode_prop_ops().
   POP_UNION, POP_INTER and POP_XOR must stay consecutive and in this
   order because they are converted to a CR_OP_x value with an
   offset from POP_UNION. */
typedef enum {
    POP_GC     = 0, /* push the general categories of a mask argument */
    POP_PROP   = 1, /* push the property table of an index argument */
    POP_CASE   = 2, /* push the case set of a CASE_x mask argument */
    POP_UNION  = 3, /* replace the two top sets by their union */
    POP_INTER  = 4, /* replace the two top sets by their intersection */
    POP_XOR    = 5, /* replace the two top sets by their xor */
    POP_INVERT = 6, /* invert the top set */
    POP_END    = 7, /* end of the operation list */
} PropOPEnum;

/* maximum number of sets on the stack */
#define POP_STACK_LEN_MAX 4
| |
| static int unicode_prop_ops(CharRange *cr, ...) |
| { |
| va_list ap; |
| CharRange stack[POP_STACK_LEN_MAX]; |
| int stack_len, op, ret, i; |
| uint32_t a; |
| |
| va_start(ap, cr); |
| stack_len = 0; |
| for(;;) { |
| op = va_arg(ap, int); |
| switch(op) { |
| case POP_GC: |
| assert(stack_len < POP_STACK_LEN_MAX); |
| a = va_arg(ap, int); |
| cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func); |
| if (unicode_general_category1(&stack[stack_len - 1], a)) |
| goto fail; |
| break; |
| case POP_PROP: |
| assert(stack_len < POP_STACK_LEN_MAX); |
| a = va_arg(ap, int); |
| cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func); |
| if (unicode_prop1(&stack[stack_len - 1], a)) |
| goto fail; |
| break; |
| case POP_CASE: |
| assert(stack_len < POP_STACK_LEN_MAX); |
| a = va_arg(ap, int); |
| cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func); |
| if (unicode_case1(&stack[stack_len - 1], a)) |
| goto fail; |
| break; |
| case POP_UNION: |
| case POP_INTER: |
| case POP_XOR: |
| { |
| CharRange *cr1, *cr2, *cr3; |
| assert(stack_len >= 2); |
| assert(stack_len < POP_STACK_LEN_MAX); |
| cr1 = &stack[stack_len - 2]; |
| cr2 = &stack[stack_len - 1]; |
| cr3 = &stack[stack_len++]; |
| cr_init(cr3, cr->mem_opaque, cr->realloc_func); |
| if (cr_op(cr3, cr1->points, cr1->len, |
| cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION)) |
| goto fail; |
| cr_free(cr1); |
| cr_free(cr2); |
| *cr1 = *cr3; |
| stack_len -= 2; |
| } |
| break; |
| case POP_INVERT: |
| assert(stack_len >= 1); |
| if (cr_invert(&stack[stack_len - 1])) |
| goto fail; |
| break; |
| case POP_END: |
| goto done; |
| default: |
| abort(); |
| } |
| } |
| done: |
| assert(stack_len == 1); |
| ret = cr_copy(cr, &stack[0]); |
| cr_free(&stack[0]); |
| return ret; |
| fail: |
| for(i = 0; i < stack_len; i++) |
| cr_free(&stack[i]); |
| return -1; |
| } |
| |
| static const uint32_t unicode_gc_mask_table[] = { |
| M(Lu) | M(Ll) | M(Lt), /* LC */ |
| M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */ |
| M(Mn) | M(Mc) | M(Me), /* M */ |
| M(Nd) | M(Nl) | M(No), /* N */ |
| M(Sm) | M(Sc) | M(Sk) | M(So), /* S */ |
| M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */ |
| M(Zs) | M(Zl) | M(Zp), /* Z */ |
| M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */ |
| }; |
| |
| /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2 |
| if not found */ |
| int unicode_general_category(CharRange *cr, const char *gc_name) |
| { |
| int gc_idx; |
| uint32_t gc_mask; |
| |
| gc_idx = unicode_find_name(unicode_gc_name_table, gc_name); |
| if (gc_idx < 0) |
| return -2; |
| if (gc_idx <= UNICODE_GC_Co) { |
| gc_mask = (uint64_t)1 << gc_idx; |
| } else { |
| gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC]; |
| } |
| return unicode_general_category1(cr, gc_mask); |
| } |
| |
| |
| /* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2 |
| if not found */ |
| int unicode_prop(CharRange *cr, const char *prop_name) |
| { |
| int prop_idx, ret; |
| |
| prop_idx = unicode_find_name(unicode_prop_name_table, prop_name); |
| if (prop_idx < 0) |
| return -2; |
| prop_idx += UNICODE_PROP_ASCII_Hex_Digit; |
| |
| ret = 0; |
| switch(prop_idx) { |
| case UNICODE_PROP_ASCII: |
| if (cr_add_interval(cr, 0x00, 0x7f + 1)) |
| return -1; |
| break; |
| case UNICODE_PROP_Any: |
| if (cr_add_interval(cr, 0x00000, 0x10ffff + 1)) |
| return -1; |
| break; |
| case UNICODE_PROP_Assigned: |
| ret = unicode_prop_ops(cr, |
| POP_GC, M(Cn), |
| POP_INVERT, |
| POP_END); |
| break; |
| case UNICODE_PROP_Math: |
| ret = unicode_prop_ops(cr, |
| POP_GC, M(Sm), |
| POP_PROP, UNICODE_PROP_Other_Math, |
| POP_UNION, |
| POP_END); |
| break; |
| case UNICODE_PROP_Lowercase: |
| ret = unicode_prop_ops(cr, |
| POP_GC, M(Ll), |
| POP_PROP, UNICODE_PROP_Other_Lowercase, |
| POP_UNION, |
| POP_END); |
| break; |
| case UNICODE_PROP_Uppercase: |
| ret = unicode_prop_ops(cr, |
| POP_GC, M(Lu), |
| POP_PROP, UNICODE_PROP_Other_Uppercase, |
| POP_UNION, |
| POP_END); |
| break; |
| case UNICODE_PROP_Cased: |
| ret = unicode_prop_ops(cr, |
| POP_GC, M(Lu) | M(Ll) | M(Lt), |
| POP_PROP, UNICODE_PROP_Other_Uppercase, |
| POP_UNION, |
| POP_PROP, UNICODE_PROP_Other_Lowercase, |
| POP_UNION, |
| POP_END); |
| break; |
| case UNICODE_PROP_Alphabetic: |
| ret = unicode_prop_ops(cr, |
| POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl), |
| POP_PROP, UNICODE_PROP_Other_Uppercase, |
| POP_UNION, |
| POP_PROP, UNICODE_PROP_Other_Lowercase, |
| POP_UNION, |
| POP_PROP, UNICODE_PROP_Other_Alphabetic, |
| POP_UNION, |
| POP_END); |
| break; |
| case UNICODE_PROP_Grapheme_Base: |
| ret = unicode_prop_ops(cr, |
| POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn), |
| POP_PROP, UNICODE_PROP_Other_Grapheme_Extend, |
| POP_UNION, |
| POP_INVERT, |
| POP_END); |
| break; |
| case UNICODE_PROP_Grapheme_Extend: |
| ret = unicode_prop_ops(cr, |
| POP_GC, M(Me) | M(Mn), |
| POP_PROP, UNICODE_PROP_Other_Grapheme_Extend, |
| POP_UNION, |
| POP_END); |
| break; |
| case UNICODE_PROP_XID_Start: |
| ret = unicode_prop_ops(cr, |
| POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl), |
| POP_PROP, UNICODE_PROP_Other_ID_Start, |
| POP_UNION, |
| POP_PROP, UNICODE_PROP_Pattern_Syntax, |
| POP_PROP, UNICODE_PROP_Pattern_White_Space, |
| POP_UNION, |
| POP_PROP, UNICODE_PROP_XID_Start1, |
| POP_UNION, |
| POP_INVERT, |
| POP_INTER, |
| POP_END); |
| break; |
| case UNICODE_PROP_XID_Continue: |
| ret = unicode_prop_ops(cr, |
| POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) | |
| M(Mn) | M(Mc) | M(Nd) | M(Pc), |
| POP_PROP, UNICODE_PROP_Other_ID_Start, |
| POP_UNION, |
| POP_PROP, UNICODE_PROP_Other_ID_Continue, |
| POP_UNION, |
| POP_PROP, UNICODE_PROP_Pattern_Syntax, |
| POP_PROP, UNICODE_PROP_Pattern_White_Space, |
| POP_UNION, |
| POP_PROP, UNICODE_PROP_XID_Continue1, |
| POP_UNION, |
| POP_INVERT, |
| POP_INTER, |
| POP_END); |
| break; |
| case UNICODE_PROP_Changes_When_Uppercased: |
| ret = unicode_case1(cr, CASE_U); |
| break; |
| case UNICODE_PROP_Changes_When_Lowercased: |
| ret = unicode_case1(cr, CASE_L); |
| break; |
| case UNICODE_PROP_Changes_When_Casemapped: |
| ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F); |
| break; |
| case UNICODE_PROP_Changes_When_Titlecased: |
| ret = unicode_prop_ops(cr, |
| POP_CASE, CASE_U, |
| POP_PROP, UNICODE_PROP_Changes_When_Titlecased1, |
| POP_XOR, |
| POP_END); |
| break; |
| case UNICODE_PROP_Changes_When_Casefolded: |
| ret = unicode_prop_ops(cr, |
| POP_CASE, CASE_F, |
| POP_PROP, UNICODE_PROP_Changes_When_Casefolded1, |
| POP_XOR, |
| POP_END); |
| break; |
| case UNICODE_PROP_Changes_When_NFKC_Casefolded: |
| ret = unicode_prop_ops(cr, |
| POP_CASE, CASE_F, |
| POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1, |
| POP_XOR, |
| POP_END); |
| break; |
| #if 0 |
| case UNICODE_PROP_ID_Start: |
| ret = unicode_prop_ops(cr, |
| POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl), |
| POP_PROP, UNICODE_PROP_Other_ID_Start, |
| POP_UNION, |
| POP_PROP, UNICODE_PROP_Pattern_Syntax, |
| POP_PROP, UNICODE_PROP_Pattern_White_Space, |
| POP_UNION, |
| POP_INVERT, |
| POP_INTER, |
| POP_END); |
| break; |
| case UNICODE_PROP_ID_Continue: |
| ret = unicode_prop_ops(cr, |
| POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) | |
| M(Mn) | M(Mc) | M(Nd) | M(Pc), |
| POP_PROP, UNICODE_PROP_Other_ID_Start, |
| POP_UNION, |
| POP_PROP, UNICODE_PROP_Other_ID_Continue, |
| POP_UNION, |
| POP_PROP, UNICODE_PROP_Pattern_Syntax, |
| POP_PROP, UNICODE_PROP_Pattern_White_Space, |
| POP_UNION, |
| POP_INVERT, |
| POP_INTER, |
| POP_END); |
| break; |
| case UNICODE_PROP_Case_Ignorable: |
| ret = unicode_prop_ops(cr, |
| POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk), |
| POP_PROP, UNICODE_PROP_Case_Ignorable1, |
| POP_XOR, |
| POP_END); |
| break; |
| #else |
| /* we use the existing tables */ |
| case UNICODE_PROP_ID_Continue: |
| ret = unicode_prop_ops(cr, |
| POP_PROP, UNICODE_PROP_ID_Start, |
| POP_PROP, UNICODE_PROP_ID_Continue1, |
| POP_XOR, |
| POP_END); |
| break; |
| #endif |
| default: |
| if (prop_idx >= countof(unicode_prop_table)) |
| return -2; |
| ret = unicode_prop1(cr, prop_idx); |
| break; |
| } |
| return ret; |
| } |
| |
| #endif /* CONFIG_ALL_UNICODE */ |