| /* |
| * Generation of Unicode tables |
| * |
| * Copyright (c) 2017-2018 Fabrice Bellard |
| * Copyright (c) 2017-2018 Charlie Gordon |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to deal |
| * in the Software without restriction, including without limitation the rights |
| * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| * copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in |
| * all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| * THE SOFTWARE. |
| */ |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <stdarg.h> |
| #include <inttypes.h> |
| #include <string.h> |
| #include <assert.h> |
| #include <ctype.h> |
| #include <time.h> |
| |
| #include "cutils.h" |
| |
| /* define it to be able to test unicode.c */ |
| //#define USE_TEST |
| /* profile tests */ |
| //#define PROFILE |
| |
| //#define DUMP_CASE_CONV_TABLE |
| //#define DUMP_TABLE_SIZE |
| //#define DUMP_CC_TABLE |
| //#define DUMP_DECOMP_TABLE |
| |
| /* Ideas: |
| - Generalize run length encoding + index for all tables |
| - remove redundant tables for ID_start, ID_continue, Case_Ignorable, Cased |
| |
| Case conversion: |
| - use a single entry for consecutive U/LF runs |
| - allow EXT runs of length > 1 |
| |
| Decomposition: |
| - Greek lower case (+1f10/1f10) ? |
| - allow holes in B runs |
| - suppress more upper / lower case redundancy |
| */ |
| |
| #ifdef USE_TEST |
| #include "libunicode.c" |
| #endif |
| |
| #define CHARCODE_MAX 0x10ffff |
| #define CC_LEN_MAX 3 |
| |
/* Allocate 'size' bytes of zero-initialized memory.
   On allocation failure a message is printed on stderr and the program
   exits, instead of handing a NULL pointer to memset(). The returned
   block must be released with free(). */
void *mallocz(size_t size)
{
    void *ptr;
    /* calloc() may return NULL for a zero size: always request >= 1 byte */
    ptr = calloc(1, size ? size : 1);
    if (!ptr) {
        fprintf(stderr, "mallocz: out of memory\n");
        exit(1);
    }
    return ptr;
}
| |
/* Return a pointer to the beginning of the n-th (0 based) ';' separated
   field of 'p', or NULL when the string contains fewer than 'n'
   separators. The result points inside 'p'; the field itself still ends
   at the next ';' or at the end of the string. */
const char *get_field(const char *p, int n)
{
    int remaining;
    for (remaining = n; remaining > 0; remaining--) {
        const char *sep = strchr(p, ';');
        if (!sep)
            return NULL;
        p = sep + 1;
    }
    return p;
}
| |
/* Copy the n-th (0 based) ';' separated field of 'p' into 'buf' as a
   NUL terminated string and return 'buf'. The copy is silently truncated
   to buf_size - 1 characters. A missing field yields an empty string
   (get_field() returns NULL in that case). When buf_size is 0 nothing
   is written. */
const char *get_field_buf(char *buf, size_t buf_size, const char *p, int n)
{
    size_t len;

    if (buf_size == 0)
        return buf; /* no room, not even for the terminator */
    len = 0;
    p = get_field(p, n);
    if (p) {
        while (*p != ';' && *p != '\0') {
            if (len < buf_size - 1)
                buf[len++] = *p;
            p++;
        }
    }
    buf[len] = '\0';
    return buf;
}
| |
/* Append 'c' to the growable int array described by (*pbuf, *psize, *plen).
   *psize is the allocated capacity and *plen the number of used elements,
   both counted in elements. The array grows by 50% (at least one element)
   when full; *pbuf may therefore change. The program exits with a message
   if memory cannot be obtained. */
void add_char(int **pbuf, int *psize, int *plen, int c)
{
    int len, size, *buf;
    buf = *pbuf;
    size = *psize;
    len = *plen;
    if (len >= size) {
        int *new_buf;
        size = size * 3 / 2;
        if (size < len + 1)
            size = len + 1;
        /* keep the old pointer until realloc() is known to have succeeded */
        new_buf = realloc(buf, sizeof(buf[0]) * size);
        if (!new_buf) {
            fprintf(stderr, "add_char: out of memory\n");
            exit(1);
        }
        buf = new_buf;
        *pbuf = buf;
        *psize = size;
    }
    buf[len++] = c;
    *plen = len;
}
| |
/* Parse the n-th (0 based) ';' separated field of 'str' as a list of
   blank separated hexadecimal numbers. Returns a heap array built with
   add_char() (to be released with free()) and stores the number of
   values in *plen. Returns NULL with *plen = 0 when the field is missing
   or holds no number. Parsing stops at the first character that is
   neither a space nor a hexadecimal digit. */
int *get_field_str(int *plen, const char *str, int n)
{
    const char *p;
    char *end;
    int *buf, len, size;

    len = 0;
    size = 0;
    buf = NULL;
    p = get_field(str, n);
    if (p) {
        for(;;) {
            /* <ctype.h> functions require an unsigned char value */
            while (isspace((unsigned char)*p))
                p++;
            if (!isxdigit((unsigned char)*p))
                break;
            add_char(&buf, &size, &len, strtoul(p, &end, 16));
            p = end;
        }
    }
    *plen = len;
    return buf;
}
| |
/* Read one line from 'f' into 'buf' (at most buf_size - 1 characters)
   and remove the trailing '\n' if there is one. Returns 'buf', or NULL
   when fgets() fails or reaches the end of the file. */
char *get_line(char *buf, int buf_size, FILE *f)
{
    size_t n;

    if (fgets(buf, buf_size, f) == NULL)
        return NULL;
    n = strlen(buf);
    if (n != 0 && buf[n - 1] == '\n')
        buf[n - 1] = '\0';
    return buf;
}
| |
| #define UNICODE_GENERAL_CATEGORY |
| |
| typedef enum { |
| #define DEF(id, str) GCAT_ ## id, |
| #include "unicode_gen_def.h" |
| #undef DEF |
| GCAT_COUNT, |
| } UnicodeGCEnum1; |
| |
| static const char *unicode_gc_name[] = { |
| #define DEF(id, str) #id, |
| #include "unicode_gen_def.h" |
| #undef DEF |
| }; |
| |
| static const char *unicode_gc_short_name[] = { |
| #define DEF(id, str) str, |
| #include "unicode_gen_def.h" |
| #undef DEF |
| }; |
| |
| #undef UNICODE_GENERAL_CATEGORY |
| |
| #define UNICODE_SCRIPT |
| |
| typedef enum { |
| #define DEF(id, str) SCRIPT_ ## id, |
| #include "unicode_gen_def.h" |
| #undef DEF |
| SCRIPT_COUNT, |
| } UnicodeScriptEnum1; |
| |
| static const char *unicode_script_name[] = { |
| #define DEF(id, str) #id, |
| #include "unicode_gen_def.h" |
| #undef DEF |
| }; |
| |
| const char *unicode_script_short_name[] = { |
| #define DEF(id, str) str, |
| #include "unicode_gen_def.h" |
| #undef DEF |
| }; |
| |
| #undef UNICODE_SCRIPT |
| |
| #define UNICODE_PROP_LIST |
| |
| typedef enum { |
| #define DEF(id, str) PROP_ ## id, |
| #include "unicode_gen_def.h" |
| #undef DEF |
| PROP_COUNT, |
| } UnicodePropEnum1; |
| |
| static const char *unicode_prop_name[] = { |
| #define DEF(id, str) #id, |
| #include "unicode_gen_def.h" |
| #undef DEF |
| }; |
| |
| static const char *unicode_prop_short_name[] = { |
| #define DEF(id, str) str, |
| #include "unicode_gen_def.h" |
| #undef DEF |
| }; |
| |
| #undef UNICODE_SPROP_LIST |
| |
/* Per code point information gathered from the Unicode data files. */
typedef struct {
    /* case conv */
    uint8_t u_len; /* number of valid entries in u_data (0 = no upper case mapping) */
    uint8_t l_len; /* number of valid entries in l_data (0 = no lower case mapping) */
    int u_data[CC_LEN_MAX]; /* upper case mapping */
    int l_data[CC_LEN_MAX]; /* lower case mapping */
    int f_code; /* case folding target set by parse_case_folding(), 0 if none */

    uint8_t combining_class; /* field 3 of the Unicode data line */
    uint8_t is_compat:1; /* set when the decomposition field starts with a <tag> */
    uint8_t is_excluded:1; /* set by parse_composition_exclusions() */
    uint8_t general_category; /* index in unicode_gc_name[] */
    uint8_t script; /* index in unicode_script_name[] */
    uint8_t script_ext_len; /* number of entries in script_ext */
    uint8_t *script_ext; /* malloc'ed indexes in unicode_script_short_name[] */
    uint32_t prop_bitmap_tab[3]; /* one bit per property: bit (idx & 31) of word (idx >> 5) */
    /* decomposition */
    int decomp_len; /* number of entries in decomp_data */
    int *decomp_data; /* array grown with add_char() */
} CCInfo;

/* table indexed by code point; the loops in this file visit
   entries 0 to CHARCODE_MAX inclusive */
CCInfo *unicode_db;
| |
/* Search 'name' in the 'tab_len' entries of 'tab'. Each entry is a list
   of ',' separated alternatives; the match must be exact. Returns the
   index of the first entry holding 'name', or -1 if there is none. */
int find_name(const char **tab, int tab_len, const char *name)
{
    size_t name_len = strlen(name);
    int idx;

    for (idx = 0; idx < tab_len; idx++) {
        const char *alt = tab[idx];
        for (;;) {
            const char *comma = strchr(alt, ',');
            size_t alt_len = comma ? (size_t)(comma - alt) : strlen(alt);
            if (alt_len == name_len && memcmp(alt, name, alt_len) == 0)
                return idx;
            if (!comma)
                break;
            alt = comma + 1;
        }
    }
    return -1;
}
| |
| static int get_prop(uint32_t c, int prop_idx) |
| { |
| return (unicode_db[c].prop_bitmap_tab[prop_idx >> 5] >> (prop_idx & 0x1f)) & 1; |
| } |
| |
| static void set_prop(uint32_t c, int prop_idx, int val) |
| { |
| uint32_t mask; |
| mask = 1U << (prop_idx & 0x1f); |
| if (val) |
| unicode_db[c].prop_bitmap_tab[prop_idx >> 5] |= mask; |
| else |
| unicode_db[c].prop_bitmap_tab[prop_idx >> 5] &= ~mask; |
| } |
| |
| void parse_unicode_data(const char *filename) |
| { |
| FILE *f; |
| char line[1024]; |
| char buf1[256]; |
| const char *p; |
| int code, lc, uc, last_code; |
| CCInfo *ci, *tab = unicode_db; |
| |
| f = fopen(filename, "rb"); |
| if (!f) { |
| perror(filename); |
| exit(1); |
| } |
| |
| last_code = 0; |
| for(;;) { |
| if (!get_line(line, sizeof(line), f)) |
| break; |
| p = line; |
| while (isspace(*p)) |
| p++; |
| if (*p == '#') |
| continue; |
| |
| p = get_field(line, 0); |
| if (!p) |
| continue; |
| code = strtoul(p, NULL, 16); |
| lc = 0; |
| uc = 0; |
| |
| p = get_field(line, 12); |
| if (p && *p != ';') { |
| uc = strtoul(p, NULL, 16); |
| } |
| |
| p = get_field(line, 13); |
| if (p && *p != ';') { |
| lc = strtoul(p, NULL, 16); |
| } |
| ci = &tab[code]; |
| if (uc > 0 || lc > 0) { |
| assert(code <= CHARCODE_MAX); |
| if (uc > 0) { |
| assert(ci->u_len == 0); |
| ci->u_len = 1; |
| ci->u_data[0] = uc; |
| } |
| if (lc > 0) { |
| assert(ci->l_len == 0); |
| ci->l_len = 1; |
| ci->l_data[0] = lc; |
| } |
| } |
| |
| { |
| int i; |
| get_field_buf(buf1, sizeof(buf1), line, 2); |
| i = find_name(unicode_gc_name, countof(unicode_gc_name), buf1); |
| if (i < 0) { |
| fprintf(stderr, "General category '%s' not found\n", |
| buf1); |
| exit(1); |
| } |
| ci->general_category = i; |
| } |
| |
| p = get_field(line, 3); |
| if (p && *p != ';' && *p != '\0') { |
| int cc; |
| cc = strtoul(p, NULL, 0); |
| if (cc != 0) { |
| assert(code <= CHARCODE_MAX); |
| ci->combining_class = cc; |
| // printf("%05x: %d\n", code, ci->combining_class); |
| } |
| } |
| |
| p = get_field(line, 5); |
| if (p && *p != ';' && *p != '\0') { |
| int size; |
| assert(code <= CHARCODE_MAX); |
| ci->is_compat = 0; |
| if (*p == '<') { |
| while (*p != '\0' && *p != '>') |
| p++; |
| if (*p == '>') |
| p++; |
| ci->is_compat = 1; |
| } |
| size = 0; |
| for(;;) { |
| while (isspace(*p)) |
| p++; |
| if (!isxdigit(*p)) |
| break; |
| add_char(&ci->decomp_data, &size, &ci->decomp_len, strtoul(p, (char **)&p, 16)); |
| } |
| #if 0 |
| { |
| int i; |
| static int count, d_count; |
| |
| printf("%05x: %c", code, ci->is_compat ? 'C': ' '); |
| for(i = 0; i < ci->decomp_len; i++) |
| printf(" %05x", ci->decomp_data[i]); |
| printf("\n"); |
| count++; |
| d_count += ci->decomp_len; |
| // printf("%d %d\n", count, d_count); |
| } |
| #endif |
| } |
| |
| p = get_field(line, 9); |
| if (p && *p == 'Y') { |
| set_prop(code, PROP_Bidi_Mirrored, 1); |
| } |
| |
| /* handle ranges */ |
| get_field_buf(buf1, sizeof(buf1), line, 1); |
| if (strstr(buf1, " Last>")) { |
| int i; |
| // printf("range: 0x%x-%0x\n", last_code, code); |
| assert(ci->decomp_len == 0); |
| assert(ci->script_ext_len == 0); |
| for(i = last_code + 1; i < code; i++) { |
| unicode_db[i] = *ci; |
| } |
| } |
| last_code = code; |
| } |
| |
| fclose(f); |
| } |
| |
| void parse_special_casing(CCInfo *tab, const char *filename) |
| { |
| FILE *f; |
| char line[1024]; |
| const char *p; |
| int code; |
| CCInfo *ci; |
| |
| f = fopen(filename, "rb"); |
| if (!f) { |
| perror(filename); |
| exit(1); |
| } |
| |
| for(;;) { |
| if (!get_line(line, sizeof(line), f)) |
| break; |
| p = line; |
| while (isspace(*p)) |
| p++; |
| if (*p == '#') |
| continue; |
| |
| p = get_field(line, 0); |
| if (!p) |
| continue; |
| code = strtoul(p, NULL, 16); |
| assert(code <= CHARCODE_MAX); |
| ci = &tab[code]; |
| |
| p = get_field(line, 4); |
| if (p) { |
| /* locale dependent casing */ |
| while (isspace(*p)) |
| p++; |
| if (*p != '#' && *p != '\0') |
| continue; |
| } |
| |
| |
| p = get_field(line, 1); |
| if (p && *p != ';') { |
| ci->l_len = 0; |
| for(;;) { |
| while (isspace(*p)) |
| p++; |
| if (*p == ';') |
| break; |
| assert(ci->l_len < CC_LEN_MAX); |
| ci->l_data[ci->l_len++] = strtoul(p, (char **)&p, 16); |
| } |
| |
| if (ci->l_len == 1 && ci->l_data[0] == code) |
| ci->l_len = 0; |
| } |
| |
| p = get_field(line, 3); |
| if (p && *p != ';') { |
| ci->u_len = 0; |
| for(;;) { |
| while (isspace(*p)) |
| p++; |
| if (*p == ';') |
| break; |
| assert(ci->u_len < CC_LEN_MAX); |
| ci->u_data[ci->u_len++] = strtoul(p, (char **)&p, 16); |
| } |
| |
| if (ci->u_len == 1 && ci->u_data[0] == code) |
| ci->u_len = 0; |
| } |
| } |
| |
| fclose(f); |
| } |
| |
| void parse_case_folding(CCInfo *tab, const char *filename) |
| { |
| FILE *f; |
| char line[1024]; |
| const char *p; |
| int code; |
| CCInfo *ci; |
| |
| f = fopen(filename, "rb"); |
| if (!f) { |
| perror(filename); |
| exit(1); |
| } |
| |
| for(;;) { |
| if (!get_line(line, sizeof(line), f)) |
| break; |
| p = line; |
| while (isspace(*p)) |
| p++; |
| if (*p == '#') |
| continue; |
| |
| p = get_field(line, 0); |
| if (!p) |
| continue; |
| code = strtoul(p, NULL, 16); |
| assert(code <= CHARCODE_MAX); |
| ci = &tab[code]; |
| |
| p = get_field(line, 1); |
| if (!p) |
| continue; |
| /* locale dependent casing */ |
| while (isspace(*p)) |
| p++; |
| if (*p != 'C' && *p != 'S') |
| continue; |
| |
| p = get_field(line, 2); |
| assert(p != 0); |
| assert(ci->f_code == 0); |
| ci->f_code = strtoul(p, NULL, 16); |
| assert(ci->f_code != 0 && ci->f_code != code); |
| } |
| |
| fclose(f); |
| } |
| |
| void parse_composition_exclusions(const char *filename) |
| { |
| FILE *f; |
| char line[4096], *p; |
| uint32_t c0; |
| |
| f = fopen(filename, "rb"); |
| if (!f) { |
| perror(filename); |
| exit(1); |
| } |
| |
| for(;;) { |
| if (!get_line(line, sizeof(line), f)) |
| break; |
| p = line; |
| while (isspace(*p)) |
| p++; |
| if (*p == '#' || *p == '@' || *p == '\0') |
| continue; |
| c0 = strtoul(p, (char **)&p, 16); |
| assert(c0 > 0 && c0 <= CHARCODE_MAX); |
| unicode_db[c0].is_excluded = TRUE; |
| } |
| fclose(f); |
| } |
| |
| void parse_derived_core_properties(const char *filename) |
| { |
| FILE *f; |
| char line[4096], *p, buf[256], *q; |
| uint32_t c0, c1, c; |
| int i; |
| |
| f = fopen(filename, "rb"); |
| if (!f) { |
| perror(filename); |
| exit(1); |
| } |
| |
| for(;;) { |
| if (!get_line(line, sizeof(line), f)) |
| break; |
| p = line; |
| while (isspace(*p)) |
| p++; |
| if (*p == '#' || *p == '@' || *p == '\0') |
| continue; |
| c0 = strtoul(p, (char **)&p, 16); |
| if (*p == '.' && p[1] == '.') { |
| p += 2; |
| c1 = strtoul(p, (char **)&p, 16); |
| } else { |
| c1 = c0; |
| } |
| assert(c1 <= CHARCODE_MAX); |
| p += strspn(p, " \t"); |
| if (*p == ';') { |
| p++; |
| p += strspn(p, " \t"); |
| q = buf; |
| while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { |
| if ((q - buf) < sizeof(buf) - 1) |
| *q++ = *p; |
| p++; |
| } |
| *q = '\0'; |
| i = find_name(unicode_prop_name, |
| countof(unicode_prop_name), buf); |
| if (i < 0) { |
| if (!strcmp(buf, "Grapheme_Link")) |
| goto next; |
| fprintf(stderr, "Property not found: %s\n", buf); |
| exit(1); |
| } |
| for(c = c0; c <= c1; c++) { |
| set_prop(c, i, 1); |
| } |
| next: ; |
| } |
| } |
| fclose(f); |
| } |
| |
| void parse_derived_norm_properties(const char *filename) |
| { |
| FILE *f; |
| char line[4096], *p, buf[256], *q; |
| uint32_t c0, c1, c; |
| |
| f = fopen(filename, "rb"); |
| if (!f) { |
| perror(filename); |
| exit(1); |
| } |
| |
| for(;;) { |
| if (!get_line(line, sizeof(line), f)) |
| break; |
| p = line; |
| while (isspace(*p)) |
| p++; |
| if (*p == '#' || *p == '@' || *p == '\0') |
| continue; |
| c0 = strtoul(p, (char **)&p, 16); |
| if (*p == '.' && p[1] == '.') { |
| p += 2; |
| c1 = strtoul(p, (char **)&p, 16); |
| } else { |
| c1 = c0; |
| } |
| assert(c1 <= CHARCODE_MAX); |
| p += strspn(p, " \t"); |
| if (*p == ';') { |
| p++; |
| p += strspn(p, " \t"); |
| q = buf; |
| while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { |
| if ((q - buf) < sizeof(buf) - 1) |
| *q++ = *p; |
| p++; |
| } |
| *q = '\0'; |
| if (!strcmp(buf, "Changes_When_NFKC_Casefolded")) { |
| for(c = c0; c <= c1; c++) { |
| set_prop(c, PROP_Changes_When_NFKC_Casefolded, 1); |
| } |
| } |
| } |
| } |
| fclose(f); |
| } |
| |
| void parse_prop_list(const char *filename) |
| { |
| FILE *f; |
| char line[4096], *p, buf[256], *q; |
| uint32_t c0, c1, c; |
| int i; |
| |
| f = fopen(filename, "rb"); |
| if (!f) { |
| perror(filename); |
| exit(1); |
| } |
| |
| for(;;) { |
| if (!get_line(line, sizeof(line), f)) |
| break; |
| p = line; |
| while (isspace(*p)) |
| p++; |
| if (*p == '#' || *p == '@' || *p == '\0') |
| continue; |
| c0 = strtoul(p, (char **)&p, 16); |
| if (*p == '.' && p[1] == '.') { |
| p += 2; |
| c1 = strtoul(p, (char **)&p, 16); |
| } else { |
| c1 = c0; |
| } |
| assert(c1 <= CHARCODE_MAX); |
| p += strspn(p, " \t"); |
| if (*p == ';') { |
| p++; |
| p += strspn(p, " \t"); |
| q = buf; |
| while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { |
| if ((q - buf) < sizeof(buf) - 1) |
| *q++ = *p; |
| p++; |
| } |
| *q = '\0'; |
| i = find_name(unicode_prop_name, |
| countof(unicode_prop_name), buf); |
| if (i < 0) { |
| fprintf(stderr, "Property not found: %s\n", buf); |
| exit(1); |
| } |
| for(c = c0; c <= c1; c++) { |
| set_prop(c, i, 1); |
| } |
| } |
| } |
| fclose(f); |
| } |
| |
| void parse_scripts(const char *filename) |
| { |
| FILE *f; |
| char line[4096], *p, buf[256], *q; |
| uint32_t c0, c1, c; |
| int i; |
| |
| f = fopen(filename, "rb"); |
| if (!f) { |
| perror(filename); |
| exit(1); |
| } |
| |
| for(;;) { |
| if (!get_line(line, sizeof(line), f)) |
| break; |
| p = line; |
| while (isspace(*p)) |
| p++; |
| if (*p == '#' || *p == '@' || *p == '\0') |
| continue; |
| c0 = strtoul(p, (char **)&p, 16); |
| if (*p == '.' && p[1] == '.') { |
| p += 2; |
| c1 = strtoul(p, (char **)&p, 16); |
| } else { |
| c1 = c0; |
| } |
| assert(c1 <= CHARCODE_MAX); |
| p += strspn(p, " \t"); |
| if (*p == ';') { |
| p++; |
| p += strspn(p, " \t"); |
| q = buf; |
| while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { |
| if ((q - buf) < sizeof(buf) - 1) |
| *q++ = *p; |
| p++; |
| } |
| *q = '\0'; |
| i = find_name(unicode_script_name, |
| countof(unicode_script_name), buf); |
| if (i < 0) { |
| fprintf(stderr, "Unknown script: '%s'\n", buf); |
| exit(1); |
| } |
| for(c = c0; c <= c1; c++) |
| unicode_db[c].script = i; |
| } |
| } |
| fclose(f); |
| } |
| |
| void parse_script_extensions(const char *filename) |
| { |
| FILE *f; |
| char line[4096], *p, buf[256], *q; |
| uint32_t c0, c1, c; |
| int i; |
| uint8_t script_ext[255]; |
| int script_ext_len; |
| |
| f = fopen(filename, "rb"); |
| if (!f) { |
| perror(filename); |
| exit(1); |
| } |
| |
| for(;;) { |
| if (!get_line(line, sizeof(line), f)) |
| break; |
| p = line; |
| while (isspace(*p)) |
| p++; |
| if (*p == '#' || *p == '@' || *p == '\0') |
| continue; |
| c0 = strtoul(p, (char **)&p, 16); |
| if (*p == '.' && p[1] == '.') { |
| p += 2; |
| c1 = strtoul(p, (char **)&p, 16); |
| } else { |
| c1 = c0; |
| } |
| assert(c1 <= CHARCODE_MAX); |
| p += strspn(p, " \t"); |
| script_ext_len = 0; |
| if (*p == ';') { |
| p++; |
| for(;;) { |
| p += strspn(p, " \t"); |
| q = buf; |
| while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { |
| if ((q - buf) < sizeof(buf) - 1) |
| *q++ = *p; |
| p++; |
| } |
| *q = '\0'; |
| if (buf[0] == '\0') |
| break; |
| i = find_name(unicode_script_short_name, |
| countof(unicode_script_short_name), buf); |
| if (i < 0) { |
| fprintf(stderr, "Script not found: %s\n", buf); |
| exit(1); |
| } |
| assert(script_ext_len < sizeof(script_ext)); |
| script_ext[script_ext_len++] = i; |
| } |
| for(c = c0; c <= c1; c++) { |
| CCInfo *ci = &unicode_db[c]; |
| ci->script_ext_len = script_ext_len; |
| ci->script_ext = malloc(sizeof(ci->script_ext[0]) * script_ext_len); |
| for(i = 0; i < script_ext_len; i++) |
| ci->script_ext[i] = script_ext[i]; |
| } |
| } |
| } |
| fclose(f); |
| } |
| |
| void dump_cc_info(CCInfo *ci, int i) |
| { |
| int j; |
| printf("%05x:", i); |
| if (ci->u_len != 0) { |
| printf(" U:"); |
| for(j = 0; j < ci->u_len; j++) |
| printf(" %05x", ci->u_data[j]); |
| } |
| if (ci->l_len != 0) { |
| printf(" L:"); |
| for(j = 0; j < ci->l_len; j++) |
| printf(" %05x", ci->l_data[j]); |
| } |
| if (ci->f_code != 0) { |
| printf(" F: %05x", ci->f_code); |
| } |
| printf("\n"); |
| } |
| |
| void dump_data(CCInfo *tab) |
| { |
| int i; |
| CCInfo *ci; |
| for(i = 0; i <= CHARCODE_MAX; i++) { |
| ci = &tab[i]; |
| if (ci->u_len != 0 || ci->l_len != 0 || ci->f_code != 0) { |
| dump_cc_info(ci, i); |
| } |
| } |
| } |
| |
| BOOL is_complicated_case(const CCInfo *ci) |
| { |
| return (ci->u_len > 1 || ci->l_len > 1 || |
| (ci->u_len > 0 && ci->l_len > 0) || |
| (ci->f_code != 0) != ci->l_len || |
| (ci->f_code != 0 && ci->l_data[0] != ci->f_code)); |
| } |
| |
/* Types of runs produced by find_run_type() and build_conv_table().
   The order must match run_type_str[]. Not defined here when USE_TEST
   is set, because libunicode.c is included in that configuration
   (presumably it provides the same enumerators -- to be confirmed). */
#ifndef USE_TEST
enum {
    RUN_TYPE_U, /* run of single upper case mappings */
    RUN_TYPE_L, /* run of single lower case mappings without folding */
    RUN_TYPE_UF, /* run where upper case and folding target are the same */
    RUN_TYPE_LF, /* run of single lower case mappings */
    RUN_TYPE_UL, /* alternating pairs: c -> lower c + 1, c + 1 -> upper c */
    RUN_TYPE_LSU, /* three code points: lower is c + 2, upper is c */
    RUN_TYPE_U2L_399_EXT2, /* upper is (x, 0x399) plus a single lower case */
    RUN_TYPE_UF_D20, /* single upper case u, folding is u + 0x20 */
    RUN_TYPE_UF_D1_EXT, /* single upper case u, folding is u + 1 */
    RUN_TYPE_U_EXT, /* RUN_TYPE_U whose target is stored in ext_data */
    RUN_TYPE_LF_EXT, /* RUN_TYPE_LF whose target is stored in ext_data */
    RUN_TYPE_U_EXT2, /* upper case made of two code points */
    RUN_TYPE_L_EXT2, /* lower case made of two code points */
    RUN_TYPE_U_EXT3, /* upper case made of three code points */
};
#endif
| |
/* Printable name of each run type, indexed by the RUN_TYPE_x value
   (used by dump_case_conv_table1()). */
const char *run_type_str[] = {
    "U",
    "L",
    "UF",
    "LF",
    "UL",
    "LSU",
    "U2L_399_EXT2",
    "UF_D20",
    "UF_D1_EXT",
    "U_EXT",
    "LF_EXT",
    "U_EXT2",
    "L_EXT2",
    "U_EXT3",
};
| |
/* One run of the case conversion table, filled by find_run_type() and
   completed by build_conv_table(). */
typedef struct {
    int code; /* first code point of the run */
    int len; /* number of code points in the run */
    int type; /* RUN_TYPE_x */
    int data; /* first target code point (simple runs), 0 for UL and LSU */
    int ext_len; /* number of valid entries in ext_data */
    int ext_data[3]; /* code points to be stored in the ext_data[] table */
    int data_index; /* 'data' coming from the table */
} TableEntry;
| |
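/* case_conv_table1 entry layout (32 bits, from the most significant bit):
   code (17 bits), len (7 bits), type (4 bits); the remaining 4 low bits
   receive data_index >> 8 (see dump_case_conv_table()) */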
| |
| void find_run_type(TableEntry *te, CCInfo *tab, int code) |
| { |
| int is_lower, len; |
| CCInfo *ci, *ci1, *ci2; |
| |
| ci = &tab[code]; |
| ci1 = &tab[code + 1]; |
| ci2 = &tab[code + 2]; |
| te->code = code; |
| |
| if (ci->l_len == 1 && ci->l_data[0] == code + 2 && |
| ci->f_code == ci->l_data[0] && |
| ci->u_len == 0 && |
| |
| ci1->l_len == 1 && ci1->l_data[0] == code + 2 && |
| ci1->f_code == ci1->l_data[0] && |
| ci1->u_len == 1 && ci1->u_data[0] == code && |
| |
| ci2->l_len == 0 && |
| ci2->f_code == 0 && |
| ci2->u_len == 1 && ci2->u_data[0] == code) { |
| te->len = 3; |
| te->data = 0; |
| te->type = RUN_TYPE_LSU; |
| return; |
| } |
| |
| if (is_complicated_case(ci)) { |
| len = 1; |
| while (code + len <= CHARCODE_MAX) { |
| ci1 = &tab[code + len]; |
| if (ci1->u_len != 1 || |
| ci1->u_data[0] != ci->u_data[0] + len || |
| ci1->l_len != 0 || |
| ci1->f_code != ci1->u_data[0]) |
| break; |
| len++; |
| } |
| if (len > 1) { |
| te->len = len; |
| te->type = RUN_TYPE_UF; |
| te->data = ci->u_data[0]; |
| return; |
| } |
| |
| if (ci->u_len == 2 && ci->u_data[1] == 0x399 && |
| ci->f_code == 0 && ci->l_len == 0) { |
| len = 1; |
| while (code + len <= CHARCODE_MAX) { |
| ci1 = &tab[code + len]; |
| if (!(ci1->u_len == 2 && |
| ci1->u_data[1] == 0x399 && |
| ci1->u_data[0] == ci->u_data[0] + len && |
| ci1->f_code == 0 && |
| ci1->l_len == 0)) |
| break; |
| len++; |
| } |
| te->len = len; |
| te->type = RUN_TYPE_U_EXT2; |
| te->ext_data[0] = ci->u_data[0]; |
| te->ext_data[1] = ci->u_data[1]; |
| te->ext_len = 2; |
| return; |
| } |
| |
| if (ci->u_len == 2 && ci->u_data[1] == 0x399 && |
| ci->l_len == 1 && ci->f_code == ci->l_data[0]) { |
| len = 1; |
| while (code + len <= CHARCODE_MAX) { |
| ci1 = &tab[code + len]; |
| if (!(ci1->u_len == 2 && |
| ci1->u_data[1] == 0x399 && |
| ci1->u_data[0] == ci->u_data[0] + len && |
| ci1->l_len == 1 && |
| ci1->l_data[0] == ci->l_data[0] + len && |
| ci1->f_code == ci1->l_data[0])) |
| break; |
| len++; |
| } |
| te->len = len; |
| te->type = RUN_TYPE_U2L_399_EXT2; |
| te->ext_data[0] = ci->u_data[0]; |
| te->ext_data[1] = ci->l_data[0]; |
| te->ext_len = 2; |
| return; |
| } |
| |
| if (ci->l_len == 1 && ci->u_len == 0 && ci->f_code == 0) { |
| len = 1; |
| while (code + len <= CHARCODE_MAX) { |
| ci1 = &tab[code + len]; |
| if (!(ci1->l_len == 1 && |
| ci1->l_data[0] == ci->l_data[0] + len && |
| ci1->u_len == 0 && ci1->f_code == 0)) |
| break; |
| len++; |
| } |
| te->len = len; |
| te->type = RUN_TYPE_L; |
| te->data = ci->l_data[0]; |
| return; |
| } |
| |
| if (ci->l_len == 0 && |
| ci->u_len == 1 && |
| ci->u_data[0] < 0x1000 && |
| ci->f_code == ci->u_data[0] + 0x20) { |
| te->len = 1; |
| te->type = RUN_TYPE_UF_D20; |
| te->data = ci->u_data[0]; |
| } else if (ci->l_len == 0 && |
| ci->u_len == 1 && |
| ci->f_code == ci->u_data[0] + 1) { |
| te->len = 1; |
| te->type = RUN_TYPE_UF_D1_EXT; |
| te->ext_data[0] = ci->u_data[0]; |
| te->ext_len = 1; |
| } else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_code == 0) { |
| te->len = 1; |
| te->type = RUN_TYPE_L_EXT2; |
| te->ext_data[0] = ci->l_data[0]; |
| te->ext_data[1] = ci->l_data[1]; |
| te->ext_len = 2; |
| } else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_code == 0) { |
| te->len = 1; |
| te->type = RUN_TYPE_U_EXT2; |
| te->ext_data[0] = ci->u_data[0]; |
| te->ext_data[1] = ci->u_data[1]; |
| te->ext_len = 2; |
| } else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_code == 0) { |
| te->len = 1; |
| te->type = RUN_TYPE_U_EXT3; |
| te->ext_data[0] = ci->u_data[0]; |
| te->ext_data[1] = ci->u_data[1]; |
| te->ext_data[2] = ci->u_data[2]; |
| te->ext_len = 3; |
| } else { |
| printf("unsupported encoding case:\n"); |
| dump_cc_info(ci, code); |
| abort(); |
| } |
| } else { |
| /* look for a run of identical conversions */ |
| len = 0; |
| for(;;) { |
| if (code >= CHARCODE_MAX || len >= 126) |
| break; |
| ci = &tab[code + len]; |
| ci1 = &tab[code + len + 1]; |
| if (is_complicated_case(ci) || is_complicated_case(ci1)) { |
| break; |
| } |
| if (ci->l_len != 1 || ci->l_data[0] != code + len + 1) |
| break; |
| if (ci1->u_len != 1 || ci1->u_data[0] != code + len) |
| break; |
| len += 2; |
| } |
| if (len > 0) { |
| te->len = len; |
| te->type = RUN_TYPE_UL; |
| te->data = 0; |
| return; |
| } |
| |
| ci = &tab[code]; |
| is_lower = ci->l_len > 0; |
| len = 1; |
| while (code + len <= CHARCODE_MAX) { |
| ci1 = &tab[code + len]; |
| if (is_complicated_case(ci1)) |
| break; |
| if (is_lower) { |
| if (ci1->l_len != 1 || |
| ci1->l_data[0] != ci->l_data[0] + len) |
| break; |
| } else { |
| if (ci1->u_len != 1 || |
| ci1->u_data[0] != ci->u_data[0] + len) |
| break; |
| } |
| len++; |
| } |
| te->len = len; |
| if (is_lower) { |
| te->type = RUN_TYPE_LF; |
| te->data = ci->l_data[0]; |
| } else { |
| te->type = RUN_TYPE_U; |
| te->data = ci->u_data[0]; |
| } |
| } |
| } |
| |
/* runs computed by build_conv_table(); the capacity is checked with
   an assert() there */
TableEntry conv_table[1000];
/* number of valid entries in conv_table */
int conv_table_len;
/* distinct code points referenced by the "EXT" run types, appended by
   find_ext_data_index() */
int ext_data[1000];
/* number of valid entries in ext_data */
int ext_data_len;
| |
| void dump_case_conv_table1(void) |
| { |
| int i, j; |
| const TableEntry *te; |
| |
| for(i = 0; i < conv_table_len; i++) { |
| te = &conv_table[i]; |
| printf("%05x %02x %-10s %05x", |
| te->code, te->len, run_type_str[te->type], te->data); |
| for(j = 0; j < te->ext_len; j++) { |
| printf(" %05x", te->ext_data[j]); |
| } |
| printf("\n"); |
| } |
| printf("table_len=%d ext_len=%d\n", conv_table_len, ext_data_len); |
| } |
| |
| int find_data_index(const TableEntry *conv_table, int len, int data) |
| { |
| int i; |
| const TableEntry *te; |
| for(i = 0; i < len; i++) { |
| te = &conv_table[i]; |
| if (te->code == data) |
| return i; |
| } |
| return -1; |
| } |
| |
| int find_ext_data_index(int data) |
| { |
| int i; |
| for(i = 0; i < ext_data_len; i++) { |
| if (ext_data[i] == data) |
| return i; |
| } |
| assert(ext_data_len < countof(ext_data)); |
| ext_data[ext_data_len++] = data; |
| return ext_data_len - 1; |
| } |
| |
| void build_conv_table(CCInfo *tab) |
| { |
| int code, i, j; |
| CCInfo *ci; |
| TableEntry *te; |
| |
| te = conv_table; |
| for(code = 0; code <= CHARCODE_MAX; code++) { |
| ci = &tab[code]; |
| if (ci->u_len == 0 && ci->l_len == 0 && ci->f_code == 0) |
| continue; |
| assert(te - conv_table < countof(conv_table)); |
| find_run_type(te, tab, code); |
| #if 0 |
| if (te->type == RUN_TYPE_TODO) { |
| printf("TODO: "); |
| dump_cc_info(ci, code); |
| } |
| #endif |
| assert(te->len <= 127); |
| code += te->len - 1; |
| te++; |
| } |
| conv_table_len = te - conv_table; |
| |
| /* find the data index */ |
| for(i = 0; i < conv_table_len; i++) { |
| int data_index; |
| te = &conv_table[i]; |
| |
| switch(te->type) { |
| case RUN_TYPE_U: |
| case RUN_TYPE_L: |
| case RUN_TYPE_UF: |
| case RUN_TYPE_LF: |
| data_index = find_data_index(conv_table, conv_table_len, te->data); |
| if (data_index < 0) { |
| switch(te->type) { |
| case RUN_TYPE_U: |
| te->type = RUN_TYPE_U_EXT; |
| te->ext_len = 1; |
| te->ext_data[0] = te->data; |
| break; |
| case RUN_TYPE_LF: |
| te->type = RUN_TYPE_LF_EXT; |
| te->ext_len = 1; |
| te->ext_data[0] = te->data; |
| break; |
| default: |
| printf("%05x: index not found\n", te->code); |
| exit(1); |
| } |
| } else { |
| te->data_index = data_index; |
| } |
| break; |
| case RUN_TYPE_UF_D20: |
| te->data_index = te->data; |
| break; |
| } |
| } |
| |
| /* find the data index for ext_data */ |
| for(i = 0; i < conv_table_len; i++) { |
| te = &conv_table[i]; |
| if (te->type == RUN_TYPE_U_EXT3) { |
| int p, v; |
| v = 0; |
| for(j = 0; j < 3; j++) { |
| p = find_ext_data_index(te->ext_data[j]); |
| assert(p < 16); |
| v = (v << 4) | p; |
| } |
| te->data_index = v; |
| } |
| } |
| |
| for(i = 0; i < conv_table_len; i++) { |
| te = &conv_table[i]; |
| if (te->type == RUN_TYPE_L_EXT2 || |
| te->type == RUN_TYPE_U_EXT2 || |
| te->type == RUN_TYPE_U2L_399_EXT2) { |
| int p, v; |
| v = 0; |
| for(j = 0; j < 2; j++) { |
| p = find_ext_data_index(te->ext_data[j]); |
| assert(p < 64); |
| v = (v << 6) | p; |
| } |
| te->data_index = v; |
| } |
| } |
| |
| for(i = 0; i < conv_table_len; i++) { |
| te = &conv_table[i]; |
| if (te->type == RUN_TYPE_UF_D1_EXT || |
| te->type == RUN_TYPE_U_EXT || |
| te->type == RUN_TYPE_LF_EXT) { |
| te->data_index = find_ext_data_index(te->ext_data[0]); |
| } |
| } |
| #ifdef DUMP_CASE_CONV_TABLE |
| dump_case_conv_table1(); |
| #endif |
| } |
| |
| void dump_case_conv_table(FILE *f) |
| { |
| int i; |
| uint32_t v; |
| const TableEntry *te; |
| |
| fprintf(f, "static const uint32_t case_conv_table1[%u] = {", conv_table_len); |
| for(i = 0; i < conv_table_len; i++) { |
| if (i % 4 == 0) |
| fprintf(f, "\n "); |
| te = &conv_table[i]; |
| v = te->code << (32 - 17); |
| v |= te->len << (32 - 17 - 7); |
| v |= te->type << (32 - 17 - 7 - 4); |
| v |= te->data_index >> 8; |
| fprintf(f, " 0x%08x,", v); |
| } |
| fprintf(f, "\n};\n\n"); |
| |
| fprintf(f, "static const uint8_t case_conv_table2[%u] = {", conv_table_len); |
| for(i = 0; i < conv_table_len; i++) { |
| if (i % 8 == 0) |
| fprintf(f, "\n "); |
| te = &conv_table[i]; |
| fprintf(f, " 0x%02x,", te->data_index & 0xff); |
| } |
| fprintf(f, "\n};\n\n"); |
| |
| fprintf(f, "static const uint16_t case_conv_ext[%u] = {", ext_data_len); |
| for(i = 0; i < ext_data_len; i++) { |
| if (i % 8 == 0) |
| fprintf(f, "\n "); |
| fprintf(f, " 0x%04x,", ext_data[i]); |
| } |
| fprintf(f, "\n};\n\n"); |
| } |
| |
/* Compare the first 'n' elements of 'tab1' and 'tab2'. Returns 0 when
   they are all equal (including n <= 0) and -1 otherwise. */
int tabcmp(const int *tab1, const int *tab2, int n)
{
    int pos = 0;

    while (pos < n && tab1[pos] == tab2[pos])
        pos++;
    return (pos < n) ? -1 : 0;
}
| |
/* Print on stdout "<str>=" followed by the 'len' values of 'buf' in
   hexadecimal and a newline. */
void dump_str(const char *str, const int *buf, int len)
{
    const int *cur;

    printf("%s=", str);
    for (cur = buf; cur - buf < len; cur++)
        printf(" %05x", *cur);
    printf("\n");
}
| |
| void compute_internal_props(void) |
| { |
| int i; |
| BOOL has_ul; |
| |
| for(i = 0; i <= CHARCODE_MAX; i++) { |
| CCInfo *ci = &unicode_db[i]; |
| has_ul = (ci->u_len != 0 || ci->l_len != 0 || ci->f_code != 0); |
| if (has_ul) { |
| assert(get_prop(i, PROP_Cased)); |
| } else { |
| set_prop(i, PROP_Cased1, get_prop(i, PROP_Cased)); |
| } |
| set_prop(i, PROP_ID_Continue1, |
| get_prop(i, PROP_ID_Continue) & (get_prop(i, PROP_ID_Start) ^ 1)); |
| set_prop(i, PROP_XID_Start1, |
| get_prop(i, PROP_ID_Start) ^ get_prop(i, PROP_XID_Start)); |
| set_prop(i, PROP_XID_Continue1, |
| get_prop(i, PROP_ID_Continue) ^ get_prop(i, PROP_XID_Continue)); |
| set_prop(i, PROP_Changes_When_Titlecased1, |
| get_prop(i, PROP_Changes_When_Titlecased) ^ (ci->u_len != 0)); |
| set_prop(i, PROP_Changes_When_Casefolded1, |
| get_prop(i, PROP_Changes_When_Casefolded) ^ (ci->f_code != 0)); |
| /* XXX: reduce table size (438 bytes) */ |
| set_prop(i, PROP_Changes_When_NFKC_Casefolded1, |
| get_prop(i, PROP_Changes_When_NFKC_Casefolded) ^ (ci->f_code != 0)); |
| #if 0 |
| /* TEST */ |
| #define M(x) (1U << GCAT_ ## x) |
| { |
| int b; |
| b = ((M(Mn) | M(Cf) | M(Lm) | M(Sk)) >> |
| unicode_db[i].general_category) & 1; |
| set_prop(i, PROP_Cased1, |
| get_prop(i, PROP_Case_Ignorable) ^ b); |
| } |
| #undef M |
| #endif |
| } |
| } |
| |
/* Write to 'f' the C definition of a static const uint8_t array named
   'cname' holding the 'len' bytes of 'tab', eight values per line. */
void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len)
{
    int pos;

    fprintf(f, "static const uint8_t %s[%d] = {", cname, len);
    pos = 0;
    while (pos < len) {
        if (pos % 8 == 0)
            fputs("\n ", f);
        fprintf(f, " 0x%02x,", tab[pos]);
        pos++;
    }
    fputs("\n};\n\n", f);
}
| |
| #define PROP_BLOCK_LEN 32 |
| |
| void build_prop_table(FILE *f, int prop_index, BOOL add_index) |
| { |
| int i, j, n, v, offset, code; |
| DynBuf dbuf_s, *dbuf = &dbuf_s; |
| DynBuf dbuf1_s, *dbuf1 = &dbuf1_s; |
| DynBuf dbuf2_s, *dbuf2 = &dbuf2_s; |
| const uint32_t *buf; |
| int buf_len, block_end_pos, bit; |
| char cname[128]; |
| |
| dbuf_init(dbuf1); |
| |
| for(i = 0; i <= CHARCODE_MAX;) { |
| v = get_prop(i, prop_index); |
| j = i + 1; |
| while (j <= CHARCODE_MAX && get_prop(j, prop_index) == v) { |
| j++; |
| } |
| n = j - i; |
| if (j == (CHARCODE_MAX + 1) && v == 0) |
| break; /* no need to encode last zero run */ |
| //printf("%05x: %d %d\n", i, n, v); |
| dbuf_put_u32(dbuf1, n - 1); |
| i += n; |
| } |
| |
| dbuf_init(dbuf); |
| dbuf_init(dbuf2); |
| buf = (uint32_t *)dbuf1->buf; |
| buf_len = dbuf1->size / sizeof(buf[0]); |
| |
| /* the first value is assumed to be 0 */ |
| assert(get_prop(0, prop_index) == 0); |
| |
| block_end_pos = PROP_BLOCK_LEN; |
| i = 0; |
| code = 0; |
| bit = 0; |
| while (i < buf_len) { |
| if (add_index && dbuf->size >= block_end_pos && bit == 0) { |
| offset = (dbuf->size - block_end_pos); |
| /* XXX: offset could be larger in case of runs of small |
| lengths. Could add code to change the encoding to |
| prevent it at the expense of one byte loss */ |
| assert(offset <= 7); |
| v = code | (offset << 21); |
| dbuf_putc(dbuf2, v); |
| dbuf_putc(dbuf2, v >> 8); |
| dbuf_putc(dbuf2, v >> 16); |
| block_end_pos += PROP_BLOCK_LEN; |
| } |
| |
| v = buf[i]; |
| code += v + 1; |
| bit ^= 1; |
| if (v < 8 && (i + 1) < buf_len && buf[i + 1] < 8) { |
| code += buf[i + 1] + 1; |
| bit ^= 1; |
| dbuf_putc(dbuf, (v << 3) | buf[i + 1]); |
| i += 2; |
| } else if (v < 128) { |
| dbuf_putc(dbuf, 0x80 + v); |
| i++; |
| } else if (v < (1 << 13)) { |
| dbuf_putc(dbuf, 0x40 + (v >> 8)); |
| dbuf_putc(dbuf, v); |
| i++; |
| } else { |
| assert(v < (1 << 21)); |
| dbuf_putc(dbuf, 0x60 + (v >> 16)); |
| dbuf_putc(dbuf, v >> 8); |
| dbuf_putc(dbuf, v); |
| i++; |
| } |
| } |
| |
| if (add_index) { |
| /* last index entry */ |
| v = code; |
| dbuf_putc(dbuf2, v); |
| dbuf_putc(dbuf2, v >> 8); |
| dbuf_putc(dbuf2, v >> 16); |
| } |
| |
| #ifdef DUMP_TABLE_SIZE |
| printf("prop %s: length=%d bytes\n", unicode_prop_name[prop_index], |
| (int)(dbuf->size + dbuf2->size)); |
| #endif |
| snprintf(cname, sizeof(cname), "unicode_prop_%s_table", unicode_prop_name[prop_index]); |
| dump_byte_table(f, cname, dbuf->buf, dbuf->size); |
| if (add_index) { |
| snprintf(cname, sizeof(cname), "unicode_prop_%s_index", unicode_prop_name[prop_index]); |
| dump_byte_table(f, cname, dbuf2->buf, dbuf2->size); |
| } |
| |
| dbuf_free(dbuf); |
| dbuf_free(dbuf1); |
| dbuf_free(dbuf2); |
| } |
| |
| void build_flags_tables(FILE *f) |
| { |
| build_prop_table(f, PROP_Cased1, TRUE); |
| build_prop_table(f, PROP_Case_Ignorable, TRUE); |
| build_prop_table(f, PROP_ID_Start, TRUE); |
| build_prop_table(f, PROP_ID_Continue1, TRUE); |
| } |
| |
/* Emit 'cname' as one C string made of the 'len' entries "name" or
   "name,short_name", each followed by an embedded "\0". The closing parts
   are padded so that they line up in the generated source. */
void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len,
                     const char **tab_short_name)
{
    int idx, width, widest;

    /* first pass: width of the widest "name[,short]" entry */
    widest = 0;
    for (idx = 0; idx < len; idx++) {
        const char *alias = tab_short_name[idx];

        width = strlen(tab_name[idx]);
        if (alias[0] != '\0')
            width += 1 + strlen(alias);
        widest = (widest < width) ? width : widest;
    }

    /* generate a sequence of strings terminated by an empty string */
    fprintf(f, "static const char %s[] =\n", cname);
    for (idx = 0; idx < len; idx++) {
        const char *alias = tab_short_name[idx];

        fprintf(f, " \"");
        width = fprintf(f, "%s", tab_name[idx]);
        if (alias[0] != '\0')
            width += fprintf(f, ",%s", alias);
        fprintf(f, "\"%*s\"\\0\"\n", 1 + widest - width, "");
    }
    fprintf(f, ";\n\n");
}
| |
| void build_general_category_table(FILE *f) |
| { |
| int i, v, j, n, n1; |
| DynBuf dbuf_s, *dbuf = &dbuf_s; |
| int cw_count, cw_len_count[4], cw_start; |
| |
| fprintf(f, "typedef enum {\n"); |
| for(i = 0; i < GCAT_COUNT; i++) |
| fprintf(f, " UNICODE_GC_%s,\n", unicode_gc_name[i]); |
| fprintf(f, " UNICODE_GC_COUNT,\n"); |
| fprintf(f, "} UnicodeGCEnum;\n\n"); |
| |
| dump_name_table(f, "unicode_gc_name_table", |
| unicode_gc_name, GCAT_COUNT, |
| unicode_gc_short_name); |
| |
| |
| dbuf_init(dbuf); |
| cw_count = 0; |
| for(i = 0; i < 4; i++) |
| cw_len_count[i] = 0; |
| for(i = 0; i <= CHARCODE_MAX;) { |
| v = unicode_db[i].general_category; |
| j = i + 1; |
| while (j <= CHARCODE_MAX && unicode_db[j].general_category == v) |
| j++; |
| n = j - i; |
| /* compress Lu/Ll runs */ |
| if (v == GCAT_Lu) { |
| n1 = 1; |
| while ((i + n1) <= CHARCODE_MAX && unicode_db[i + n1].general_category == (v + (n1 & 1))) { |
| n1++; |
| } |
| if (n1 > n) { |
| v = 31; |
| n = n1; |
| } |
| } |
| // printf("%05x %05x %d\n", i, n, v); |
| cw_count++; |
| n--; |
| cw_start = dbuf->size; |
| if (n < 7) { |
| dbuf_putc(dbuf, (n << 5) | v); |
| } else if (n < 7 + 128) { |
| n1 = n - 7; |
| assert(n1 < 128); |
| dbuf_putc(dbuf, (0xf << 5) | v); |
| dbuf_putc(dbuf, n1); |
| } else if (n < 7 + 128 + (1 << 14)) { |
| n1 = n - (7 + 128); |
| assert(n1 < (1 << 14)); |
| dbuf_putc(dbuf, (0xf << 5) | v); |
| dbuf_putc(dbuf, (n1 >> 8) + 128); |
| dbuf_putc(dbuf, n1); |
| } else { |
| n1 = n - (7 + 128 + (1 << 14)); |
| assert(n1 < (1 << 22)); |
| dbuf_putc(dbuf, (0xf << 5) | v); |
| dbuf_putc(dbuf, (n1 >> 16) + 128 + 64); |
| dbuf_putc(dbuf, n1 >> 8); |
| dbuf_putc(dbuf, n1); |
| } |
| cw_len_count[dbuf->size - cw_start - 1]++; |
| i += n + 1; |
| } |
| #ifdef DUMP_TABLE_SIZE |
| printf("general category: %d entries [", |
| cw_count); |
| for(i = 0; i < 4; i++) |
| printf(" %d", cw_len_count[i]); |
| printf(" ], length=%d bytes\n", (int)dbuf->size); |
| #endif |
| |
| dump_byte_table(f, "unicode_gc_table", dbuf->buf, dbuf->size); |
| |
| dbuf_free(dbuf); |
| } |
| |
| void build_script_table(FILE *f) |
| { |
| int i, v, j, n, n1, type; |
| DynBuf dbuf_s, *dbuf = &dbuf_s; |
| int cw_count, cw_len_count[4], cw_start; |
| |
| fprintf(f, "typedef enum {\n"); |
| for(i = 0; i < SCRIPT_COUNT; i++) |
| fprintf(f, " UNICODE_SCRIPT_%s,\n", unicode_script_name[i]); |
| fprintf(f, " UNICODE_SCRIPT_COUNT,\n"); |
| fprintf(f, "} UnicodeScriptEnum;\n\n"); |
| |
| i = 1; |
| dump_name_table(f, "unicode_script_name_table", |
| unicode_script_name + i, SCRIPT_COUNT - i, |
| unicode_script_short_name + i); |
| |
| dbuf_init(dbuf); |
| cw_count = 0; |
| for(i = 0; i < 4; i++) |
| cw_len_count[i] = 0; |
| for(i = 0; i <= CHARCODE_MAX;) { |
| v = unicode_db[i].script; |
| j = i + 1; |
| while (j <= CHARCODE_MAX && unicode_db[j].script == v) |
| j++; |
| n = j - i; |
| if (v == 0 && j == (CHARCODE_MAX + 1)) |
| break; |
| // printf("%05x %05x %d\n", i, n, v); |
| cw_count++; |
| n--; |
| cw_start = dbuf->size; |
| if (v == 0) |
| type = 0; |
| else |
| type = 1; |
| if (n < 96) { |
| dbuf_putc(dbuf, n | (type << 7)); |
| } else if (n < 96 + (1 << 12)) { |
| n1 = n - 96; |
| assert(n1 < (1 << 12)); |
| dbuf_putc(dbuf, ((n1 >> 8) + 96) | (type << 7)); |
| dbuf_putc(dbuf, n1); |
| } else { |
| n1 = n - (96 + (1 << 12)); |
| assert(n1 < (1 << 20)); |
| dbuf_putc(dbuf, ((n1 >> 16) + 112) | (type << 7)); |
| dbuf_putc(dbuf, n1 >> 8); |
| dbuf_putc(dbuf, n1); |
| } |
| if (type != 0) |
| dbuf_putc(dbuf, v); |
| |
| cw_len_count[dbuf->size - cw_start - 1]++; |
| i += n + 1; |
| } |
| #if defined(DUMP_TABLE_SIZE) |
| printf("script: %d entries [", |
| cw_count); |
| for(i = 0; i < 4; i++) |
| printf(" %d", cw_len_count[i]); |
| printf(" ], length=%d bytes\n", (int)dbuf->size); |
| #endif |
| |
| dump_byte_table(f, "unicode_script_table", dbuf->buf, dbuf->size); |
| |
| dbuf_free(dbuf); |
| } |
| |
| void build_script_ext_table(FILE *f) |
| { |
| int i, j, n, n1, script_ext_len; |
| DynBuf dbuf_s, *dbuf = &dbuf_s; |
| int cw_count; |
| |
| dbuf_init(dbuf); |
| cw_count = 0; |
| for(i = 0; i <= CHARCODE_MAX;) { |
| script_ext_len = unicode_db[i].script_ext_len; |
| j = i + 1; |
| while (j <= CHARCODE_MAX && |
| unicode_db[j].script_ext_len == script_ext_len && |
| !memcmp(unicode_db[j].script_ext, unicode_db[i].script_ext, |
| script_ext_len)) { |
| j++; |
| } |
| n = j - i; |
| cw_count++; |
| n--; |
| if (n < 128) { |
| dbuf_putc(dbuf, n); |
| } else if (n < 128 + (1 << 14)) { |
| n1 = n - 128; |
| assert(n1 < (1 << 14)); |
| dbuf_putc(dbuf, (n1 >> 8) + 128); |
| dbuf_putc(dbuf, n1); |
| } else { |
| n1 = n - (128 + (1 << 14)); |
| assert(n1 < (1 << 22)); |
| dbuf_putc(dbuf, (n1 >> 16) + 128 + 64); |
| dbuf_putc(dbuf, n1 >> 8); |
| dbuf_putc(dbuf, n1); |
| } |
| dbuf_putc(dbuf, script_ext_len); |
| for(j = 0; j < script_ext_len; j++) |
| dbuf_putc(dbuf, unicode_db[i].script_ext[j]); |
| i += n + 1; |
| } |
| #ifdef DUMP_TABLE_SIZE |
| printf("script_ext: %d entries", |
| cw_count); |
| printf(", length=%d bytes\n", (int)dbuf->size); |
| #endif |
| |
| dump_byte_table(f, "unicode_script_ext_table", dbuf->buf, dbuf->size); |
| |
| dbuf_free(dbuf); |
| } |
| |
| /* the following properties are synthetized so no table is necessary */ |
| #define PROP_TABLE_COUNT PROP_ASCII |
| |
| void build_prop_list_table(FILE *f) |
| { |
| int i; |
| |
| for(i = 0; i < PROP_TABLE_COUNT; i++) { |
| if (i == PROP_ID_Start || |
| i == PROP_Case_Ignorable || |
| i == PROP_ID_Continue1) { |
| /* already generated */ |
| } else { |
| build_prop_table(f, i, FALSE); |
| } |
| } |
| |
| fprintf(f, "typedef enum {\n"); |
| for(i = 0; i < PROP_COUNT; i++) |
| fprintf(f, " UNICODE_PROP_%s,\n", unicode_prop_name[i]); |
| fprintf(f, " UNICODE_PROP_COUNT,\n"); |
| fprintf(f, "} UnicodePropertyEnum;\n\n"); |
| |
| i = PROP_ASCII_Hex_Digit; |
| dump_name_table(f, "unicode_prop_name_table", |
| unicode_prop_name + i, PROP_XID_Start - i + 1, |
| unicode_prop_short_name + i); |
| |
| fprintf(f, "static const uint8_t * const unicode_prop_table[] = {\n"); |
| for(i = 0; i < PROP_TABLE_COUNT; i++) { |
| fprintf(f, " unicode_prop_%s_table,\n", unicode_prop_name[i]); |
| } |
| fprintf(f, "};\n\n"); |
| |
| fprintf(f, "static const uint16_t unicode_prop_len_table[] = {\n"); |
| for(i = 0; i < PROP_TABLE_COUNT; i++) { |
| fprintf(f, " countof(unicode_prop_%s_table),\n", unicode_prop_name[i]); |
| } |
| fprintf(f, "};\n\n"); |
| } |
| |
| #ifdef USE_TEST |
/* Forward to lre_case_conv() and return its result unchanged. */
int check_conv(uint32_t *res, uint32_t c, int conv_type)
{
    int out_len;

    out_len = lre_case_conv(res, c, conv_type);
    return out_len;
}
| |
| void check_case_conv(void) |
| { |
| CCInfo *tab = unicode_db; |
| uint32_t res[3]; |
| int l, error; |
| CCInfo ci_s, *ci1, *ci = &ci_s; |
| int code; |
| |
| for(code = 0; code <= CHARCODE_MAX; code++) { |
| ci1 = &tab[code]; |
| *ci = *ci1; |
| if (ci->l_len == 0) { |
| ci->l_len = 1; |
| ci->l_data[0] = code; |
| } |
| if (ci->u_len == 0) { |
| ci->u_len = 1; |
| ci->u_data[0] = code; |
| } |
| if (ci->f_code == 0) |
| ci->f_code = code; |
| |
| error = 0; |
| l = check_conv(res, code, 0); |
| if (l != ci->u_len || tabcmp((int *)res, ci->u_data, l)) { |
| printf("ERROR: L\n"); |
| error++; |
| } |
| l = check_conv(res, code, 1); |
| if (l != ci->l_len || tabcmp((int *)res, ci->l_data, l)) { |
| printf("ERROR: U\n"); |
| error++; |
| } |
| l = check_conv(res, code, 2); |
| if (l != 1 || res[0] != ci->f_code) { |
| printf("ERROR: F\n"); |
| error++; |
| } |
| if (error) { |
| dump_cc_info(ci, code); |
| exit(1); |
| } |
| } |
| } |
| |
| #ifdef PROFILE |
| static int64_t get_time_ns(void) |
| { |
| struct timespec ts; |
| clock_gettime(CLOCK_MONOTONIC, &ts); |
| return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec; |
| } |
| #endif |
| |
| |
| void check_flags(void) |
| { |
| int c; |
| BOOL flag_ref, flag; |
| for(c = 0; c <= CHARCODE_MAX; c++) { |
| flag_ref = get_prop(c, PROP_Cased); |
| flag = lre_is_cased(c); |
| if (flag != flag_ref) { |
| printf("ERROR: c=%05x cased=%d ref=%d\n", |
| c, flag, flag_ref); |
| exit(1); |
| } |
| |
| flag_ref = get_prop(c, PROP_Case_Ignorable); |
| flag = lre_is_case_ignorable(c); |
| if (flag != flag_ref) { |
| printf("ERROR: c=%05x case_ignorable=%d ref=%d\n", |
| c, flag, flag_ref); |
| exit(1); |
| } |
| |
| flag_ref = get_prop(c, PROP_ID_Start); |
| flag = lre_is_id_start(c); |
| if (flag != flag_ref) { |
| printf("ERROR: c=%05x id_start=%d ref=%d\n", |
| c, flag, flag_ref); |
| exit(1); |
| } |
| |
| flag_ref = get_prop(c, PROP_ID_Continue); |
| flag = lre_is_id_continue(c); |
| if (flag != flag_ref) { |
| printf("ERROR: c=%05x id_cont=%d ref=%d\n", |
| c, flag, flag_ref); |
| exit(1); |
| } |
| } |
| #ifdef PROFILE |
| { |
| int64_t ti, count; |
| ti = get_time_ns(); |
| count = 0; |
| for(c = 0x20; c <= 0xffff; c++) { |
| flag_ref = get_prop(c, PROP_ID_Start); |
| flag = lre_is_id_start(c); |
| assert(flag == flag_ref); |
| count++; |
| } |
| ti = get_time_ns() - ti; |
| printf("flags time=%0.1f ns/char\n", |
| (double)ti / count); |
| } |
| #endif |
| } |
| |
| #endif |
| |
| #define CC_BLOCK_LEN 32 |
| |
| void build_cc_table(FILE *f) |
| { |
| int i, cc, n, cc_table_len, type, n1; |
| DynBuf dbuf_s, *dbuf = &dbuf_s; |
| DynBuf dbuf1_s, *dbuf1 = &dbuf1_s; |
| int cw_len_tab[3], cw_start, block_end_pos; |
| uint32_t v; |
| |
| dbuf_init(dbuf); |
| dbuf_init(dbuf1); |
| cc_table_len = 0; |
| for(i = 0; i < countof(cw_len_tab); i++) |
| cw_len_tab[i] = 0; |
| block_end_pos = CC_BLOCK_LEN; |
| for(i = 0; i <= CHARCODE_MAX;) { |
| cc = unicode_db[i].combining_class; |
| assert(cc <= 255); |
| /* check increasing values */ |
| n = 1; |
| while ((i + n) <= CHARCODE_MAX && |
| unicode_db[i + n].combining_class == (cc + n)) |
| n++; |
| if (n >= 2) { |
| type = 1; |
| } else { |
| type = 0; |
| n = 1; |
| while ((i + n) <= CHARCODE_MAX && |
| unicode_db[i + n].combining_class == cc) |
| n++; |
| } |
| /* no need to encode the last run */ |
| if (cc == 0 && (i + n - 1) == CHARCODE_MAX) |
| break; |
| #ifdef DUMP_CC_TABLE |
| printf("%05x %6d %d %d\n", i, n, type, cc); |
| #endif |
| if (type == 0) { |
| if (cc == 0) |
| type = 2; |
| else if (cc == 230) |
| type = 3; |
| } |
| n1 = n - 1; |
| |
| /* add an entry to the index if necessary */ |
| if (dbuf->size >= block_end_pos) { |
| v = i | ((dbuf->size - block_end_pos) << 21); |
| dbuf_putc(dbuf1, v); |
| dbuf_putc(dbuf1, v >> 8); |
| dbuf_putc(dbuf1, v >> 16); |
| block_end_pos += CC_BLOCK_LEN; |
| } |
| cw_start = dbuf->size; |
| if (n1 < 48) { |
| dbuf_putc(dbuf, n1 | (type << 6)); |
| } else if (n1 < 48 + (1 << 11)) { |
| n1 -= 48; |
| dbuf_putc(dbuf, ((n1 >> 8) + 48) | (type << 6)); |
| dbuf_putc(dbuf, n1); |
| } else { |
| n1 -= 48 + (1 << 11); |
| assert(n1 < (1 << 20)); |
| dbuf_putc(dbuf, ((n1 >> 16) + 56) | (type << 6)); |
| dbuf_putc(dbuf, n1 >> 8); |
| dbuf_putc(dbuf, n1); |
| } |
| cw_len_tab[dbuf->size - cw_start - 1]++; |
| if (type == 0 || type == 1) |
| dbuf_putc(dbuf, cc); |
| cc_table_len++; |
| i += n; |
| } |
| |
| /* last index entry */ |
| v = i; |
| dbuf_putc(dbuf1, v); |
| dbuf_putc(dbuf1, v >> 8); |
| dbuf_putc(dbuf1, v >> 16); |
| |
| dump_byte_table(f, "unicode_cc_table", dbuf->buf, dbuf->size); |
| dump_byte_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size); |
| |
| #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE) |
| printf("CC table: size=%d (%d entries) [", |
| (int)(dbuf->size + dbuf1->size), |
| cc_table_len); |
| for(i = 0; i < countof(cw_len_tab); i++) |
| printf(" %d", cw_len_tab[i]); |
| printf(" ]\n"); |
| #endif |
| dbuf_free(dbuf); |
| dbuf_free(dbuf1); |
| } |
| |
| /* maximum length of decomposition: 18 chars (1), then 8 */ |
| #ifndef USE_TEST |
/* Encodings of a decomposition run. The order of the values matters:
   get_decomp_run_size() and add_decomp_data() select the family with
   '<=' comparisons, and find_decomp_run() derives a type by adding an
   offset to the first value of its family. decomp_type_str[] must list
   the names in the same order. */
typedef enum {
    DECOMP_TYPE_C1, /* 16 bit char */
    DECOMP_TYPE_L1, /* 16 bit char table */
    DECOMP_TYPE_L2,
    DECOMP_TYPE_L3,
    DECOMP_TYPE_L4,
    DECOMP_TYPE_L5, /* XXX: not used */
    DECOMP_TYPE_L6, /* XXX: could remove */
    DECOMP_TYPE_L7, /* XXX: could remove */
    DECOMP_TYPE_LL1, /* 18 bit char table */
    DECOMP_TYPE_LL2,
    DECOMP_TYPE_S1, /* 8 bit char table */
    DECOMP_TYPE_S2,
    DECOMP_TYPE_S3,
    DECOMP_TYPE_S4,
    DECOMP_TYPE_S5,
    DECOMP_TYPE_I1, /* increment 16 bit char value */
    DECOMP_TYPE_I2_0,
    DECOMP_TYPE_I2_1,
    DECOMP_TYPE_I3_1,
    DECOMP_TYPE_I3_2,
    DECOMP_TYPE_I4_1,
    DECOMP_TYPE_I4_2,
    DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
    DECOMP_TYPE_B2,
    DECOMP_TYPE_B3,
    DECOMP_TYPE_B4,
    DECOMP_TYPE_B5,
    DECOMP_TYPE_B6,
    DECOMP_TYPE_B7,
    DECOMP_TYPE_B8,
    DECOMP_TYPE_B18,
    DECOMP_TYPE_LS2, /* 16 bit char followed by an 8 bit short code */
    DECOMP_TYPE_PAT3, /* shared first and last char, one middle char per entry */
    DECOMP_TYPE_S2_UL, /* pairs of code points, one 2 x 8 bit entry per pair */
    DECOMP_TYPE_LS2_UL, /* pairs of code points, one 16 bit + 8 bit entry per pair */
} DecompTypeEnum;
| #endif |
| |
/* printable names of the decomposition types, one family per line,
   in the order of the DECOMP_TYPE_xxx values */
const char *decomp_type_str[] = {
    "C1",
    "L1", "L2", "L3", "L4", "L5", "L6", "L7",
    "LL1", "LL2",
    "S1", "S2", "S3", "S4", "S5",
    "I1", "I2_0", "I2_1", "I3_1", "I3_2", "I4_1", "I4_2",
    "B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8", "B18",
    "LS2",
    "PAT3",
    "S2_UL",
    "LS2_UL",
};
| |
/* Incrementing-run candidates, one row per decomposition length
   (row = length - 1). Column 0 is the type of the first candidate; the
   following columns give, up to a negative terminator, the position of
   the char that increases by one from one code point to the next.
   find_decomp_run() gives the candidate in column k (k >= 1) the type
   'column 0 + k - 1'. */
const int decomp_incr_tab[4][4] = {
    { DECOMP_TYPE_I1, 0, -1 },
    { DECOMP_TYPE_I2_0, 0, 1, -1 },
    { DECOMP_TYPE_I3_1, 1, 2, -1 },
    { DECOMP_TYPE_I4_1, 1, 2, -1 },
};
| |
/*
  entry size:
  type   bits
  code   18
  len    7
  compat 1
  type   6
  index  16
  total  48
*/
| |
/* Candidate or selected encoding of a run of decompositions.
   find_decomp_run() fills one entry per code point; an entry whose 'len'
   is 0 starts no run. */
typedef struct {
    int code; /* first code point of the run */
    uint8_t len; /* number of consecutive code points covered by the run */
    uint8_t type; /* DECOMP_TYPE_xxx encoding of the run */
    uint8_t c_len; /* number of chars in each decomposition of the run */
    uint16_t c_min; /* base char, only set for the DECOMP_TYPE_Bx encodings */
    uint16_t data_index; /* offset of the run in the data buffer (the
                            decomposed char itself for DECOMP_TYPE_C1) */
    int cost; /* size in bytes from this entry to the end */
} DecompEntry;
| |
| int get_decomp_run_size(const DecompEntry *de) |
| { |
| int s; |
| s = 6; |
| if (de->type <= DECOMP_TYPE_C1) { |
| /* nothing more */ |
| } else if (de->type <= DECOMP_TYPE_L7) { |
| s += de->len * de->c_len * 2; |
| } else if (de->type <= DECOMP_TYPE_LL2) { |
| /* 18 bits per char */ |
| s += (de->len * de->c_len * 18 + 7) / 8; |
| } else if (de->type <= DECOMP_TYPE_S5) { |
| s += de->len * de->c_len; |
| } else if (de->type <= DECOMP_TYPE_I4_2) { |
| s += de->c_len * 2; |
| } else if (de->type <= DECOMP_TYPE_B18) { |
| s += 2 + de->len * de->c_len; |
| } else if (de->type <= DECOMP_TYPE_LS2) { |
| s += de->len * 3; |
| } else if (de->type <= DECOMP_TYPE_PAT3) { |
| s += 4 + de->len * 2; |
| } else if (de->type <= DECOMP_TYPE_S2_UL) { |
| s += de->len; |
| } else if (de->type <= DECOMP_TYPE_LS2_UL) { |
| s += (de->len / 2) * 3; |
| } else { |
| abort(); |
| } |
| return s; |
| } |
| |
/* chars outside the two contiguous ranges that also have a short code */
static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };

/* Return the short code of 'c': values below 0x80 map to themselves,
   0x300..0x34f map to 0x80..0xcf and the unicode_short_table entries
   follow from 0xd0. Return -1 if not found. */
int get_short_code(int c)
{
    const int extra_count =
        sizeof(unicode_short_table) / sizeof(unicode_short_table[0]);
    int k;

    if (c < 0x80)
        return c;
    if (c >= 0x300 && c < 0x350)
        return 0x80 + (c - 0x300);
    for (k = 0; k < extra_count; k++) {
        if (unicode_short_table[k] == c)
            return 0x80 + 0x50 + k;
    }
    return -1;
}
| |
| static BOOL is_short(int code) |
| { |
| return get_short_code(code) >= 0; |
| } |
| |
| static BOOL is_short_tab(const int *tab, int len) |
| { |
| int i; |
| for(i = 0; i < len; i++) { |
| if (!is_short(tab[i])) |
| return FALSE; |
| } |
| return TRUE; |
| } |
| |
| static BOOL is_16bit(const int *tab, int len) |
| { |
| int i; |
| for(i = 0; i < len; i++) { |
| if (tab[i] > 0xffff) |
| return FALSE; |
| } |
| return TRUE; |
| } |
| |
/* Simplified lower case mapping: add 0x20 for chars below 0x100 (Latin1)
   and for 0x410..0x42f (Cyrillic), add 1 for every other char. */
static uint32_t to_lower_simple(uint32_t c)
{
    const int wide_gap = (c < 0x100) || (c >= 0x410 && c <= 0x42f);

    return c + (wide_gap ? 0x20 : 1);
}
| |
/* select best encoding with dynamic programming.

   Fill tab_de[i] with the cheapest candidate run starting at code point
   'i'. The cost of a candidate of 'n' code points is its own size
   (get_decomp_run_size()) plus tab_de[i + n].cost, so the entries after
   'i' must already be computed: build_decompose_table() calls this
   function from the last code point down to 0. A code point without
   decomposition only inherits the cost of its successor. A candidate
   replaces the current best only when it is strictly cheaper, so the
   order in which the encodings are tried decides ties. */
void find_decomp_run(DecompEntry *tab_de, int i)
{
    DecompEntry de_s, *de = &de_s;
    CCInfo *ci, *ci1, *ci2;
    int l, j, n, len_max;

    ci = &unicode_db[i];
    l = ci->decomp_len;
    if (l == 0) {
        /* no decomposition: no run starts here */
        tab_de[i].cost = tab_de[i + 1].cost;
        return;
    }

    /* the offset for the compose table has only 6 bits, so we must
       limit if it can be used by the compose table */
    if (!ci->is_compat && !ci->is_excluded && l == 2)
        len_max = 64;
    else
        len_max = 127;

    /* any candidate is cheaper than this initial value */
    tab_de[i].cost = 0x7fffffff;

    /* chars above 0xffff: only the 18 bit tables (LL1/LL2) are tried */
    if (!is_16bit(ci->decomp_data, l)) {
        assert(l <= 2);

        n = 1;
        for(;;) {
            de->code = i;
            de->len = n;
            de->type = DECOMP_TYPE_LL1 + l - 1;
            de->c_len = l;
            de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
            if (de->cost < tab_de[i].cost) {
                tab_de[i] = *de;
            }
            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            ci1 = &unicode_db[i + n];
            /* Note: we accept a hole */
            if (!(ci1->decomp_len == 0 ||
                  (ci1->decomp_len == l &&
                   ci1->is_compat == ci->is_compat)))
                break;
            n++;
        }
        return;
    }

    /* 16 bit char table (C1 for a single char, L1..L7 otherwise) */
    if (l <= 7) {
        n = 1;
        for(;;) {
            de->code = i;
            de->len = n;
            if (l == 1 && n == 1) {
                de->type = DECOMP_TYPE_C1;
            } else {
                assert(l <= 8);
                de->type = DECOMP_TYPE_L1 + l - 1;
            }
            de->c_len = l;
            de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
            if (de->cost < tab_de[i].cost) {
                tab_de[i] = *de;
            }

            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            ci1 = &unicode_db[i + n];
            /* Note: we accept a hole */
            if (!(ci1->decomp_len == 0 ||
                  (ci1->decomp_len == l &&
                   ci1->is_compat == ci->is_compat &&
                   is_16bit(ci1->decomp_data, l))))
                break;
            n++;
        }
    }

    /* 16 bit base + 8 bit offsets (B1..B8, B18): every char of the run
       except the space must stay within 254 of the smallest one */
    if (l <= 8 || l == 18) {
        int c_min, c_max, c;
        c_min = c_max = -1;
        n = 1;
        for(;;) {
            /* extend the char range with the last code point of the run */
            ci1 = &unicode_db[i + n - 1];
            for(j = 0; j < l; j++) {
                c = ci1->decomp_data[j];
                if (c == 0x20) {
                    /* we accept space for Arabic */
                } else if (c_min == -1) {
                    c_min = c_max = c;
                } else {
                    c_min = min_int(c_min, c);
                    c_max = max_int(c_max, c);
                }
            }
            if ((c_max - c_min) > 254)
                break;
            de->code = i;
            de->len = n;
            if (l == 18)
                de->type = DECOMP_TYPE_B18;
            else
                de->type = DECOMP_TYPE_B1 + l - 1;
            de->c_len = l;
            de->c_min = c_min;
            de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
            if (de->cost < tab_de[i].cost) {
                tab_de[i] = *de;
            }
            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            /* no hole accepted for this encoding */
            ci1 = &unicode_db[i + n];
            if (!(ci1->decomp_len == l &&
                  ci1->is_compat == ci->is_compat))
                break;
            n++;
        }
    }

    /* find an ascii run */
    if (l <= 5 && is_short_tab(ci->decomp_data, l)) {
        n = 1;
        for(;;) {
            de->code = i;
            de->len = n;
            de->type = DECOMP_TYPE_S1 + l - 1;
            de->c_len = l;
            de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
            if (de->cost < tab_de[i].cost) {
                tab_de[i] = *de;
            }

            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            ci1 = &unicode_db[i + n];
            /* Note: we accept a hole */
            if (!(ci1->decomp_len == 0 ||
                  (ci1->decomp_len == l &&
                   ci1->is_compat == ci->is_compat &&
                   is_short_tab(ci1->decomp_data, l))))
                break;
            n++;
        }
    }

    /* check if a single char is increasing */
    if (l <= 4) {
        int idx1, idx;

        /* idx is the position of the increasing char, see decomp_incr_tab */
        for(idx1 = 1; (idx = decomp_incr_tab[l - 1][idx1]) >= 0; idx1++) {
            n = 1;
            for(;;) {
                de->code = i;
                de->len = n;
                de->type = decomp_incr_tab[l - 1][0] + idx1 - 1;
                de->c_len = l;
                de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
                if (de->cost < tab_de[i].cost) {
                    tab_de[i] = *de;
                }

                if (!((i + n) <= CHARCODE_MAX && n < len_max))
                    break;
                ci1 = &unicode_db[i + n];
                if (!(ci1->decomp_len == l &&
                      ci1->is_compat == ci->is_compat))
                    goto next1;
                /* the char at idx must be the first one plus n, the
                   other chars must be unchanged */
                for(j = 0; j < l; j++) {
                    if (j == idx) {
                        if (ci1->decomp_data[j] != ci->decomp_data[j] + n)
                            goto next1;
                    } else {
                        if (ci1->decomp_data[j] != ci->decomp_data[j])
                            goto next1;
                    }
                }
                n++;
            }
        next1: ;
        }
    }

    /* 3 char pattern: same first and last char for the whole run */
    if (l == 3) {
        n = 1;
        for(;;) {
            de->code = i;
            de->len = n;
            de->type = DECOMP_TYPE_PAT3;
            de->c_len = l;
            de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
            if (de->cost < tab_de[i].cost) {
                tab_de[i] = *de;
            }
            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            ci1 = &unicode_db[i + n];
            if (!(ci1->decomp_len == l &&
                  ci1->is_compat == ci->is_compat &&
                  ci1->decomp_data[1] <= 0xffff &&
                  ci1->decomp_data[0] == ci->decomp_data[0] &&
                  ci1->decomp_data[l - 1] == ci->decomp_data[l - 1]))
                break;
            n++;
        }
    }

    /* 16 bit char followed by a short code */
    if (l == 2 && is_short(ci->decomp_data[1])) {
        n = 1;
        for(;;) {
            de->code = i;
            de->len = n;
            de->type = DECOMP_TYPE_LS2;
            de->c_len = l;
            de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
            if (de->cost < tab_de[i].cost) {
                tab_de[i] = *de;
            }
            if (!((i + n) <= CHARCODE_MAX && n < len_max))
                break;
            ci1 = &unicode_db[i + n];
            /* a hole is accepted here as well */
            if (!(ci1->decomp_len == 0 ||
                  (ci1->decomp_len == l &&
                   ci1->is_compat == ci->is_compat &&
                   ci1->decomp_data[0] <= 0xffff &&
                   is_short(ci1->decomp_data[1]))))
                break;
            n++;
        }
    }

    /* pairs of code points where the second decomposition is the first
       one with its first char mapped by to_lower_simple(): the run grows
       two code points at a time */
    if (l == 2) {
        BOOL is_16bit; /* TRUE once a first char has no short code */

        n = 0;
        is_16bit = FALSE;
        for(;;) {
            if (!((i + n + 1) <= CHARCODE_MAX && n + 2 <= len_max))
                break;
            ci1 = &unicode_db[i + n];
            if (!(ci1->decomp_len == l &&
                  ci1->is_compat == ci->is_compat &&
                  is_short(ci1->decomp_data[1])))
                break;
            if (!is_16bit && !is_short(ci1->decomp_data[0]))
                is_16bit = TRUE;
            ci2 = &unicode_db[i + n + 1];
            if (!(ci2->decomp_len == l &&
                  ci2->is_compat == ci->is_compat &&
                  ci2->decomp_data[0] == to_lower_simple(ci1->decomp_data[0]) &&
                  ci2->decomp_data[1] == ci1->decomp_data[1]))
                break;
            n += 2;
            de->code = i;
            de->len = n;
            /* S2_UL, or LS2_UL when a 16 bit first char is needed */
            de->type = DECOMP_TYPE_S2_UL + is_16bit;
            de->c_len = l;
            de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
            if (de->cost < tab_de[i].cost) {
                tab_de[i] = *de;
            }
        }
    }
}
| |
/* Store 'c' at data_buf[*pidx], low byte first, and advance *pidx by 2. */
void put16(uint8_t *data_buf, int *pidx, uint16_t c)
{
    uint8_t *dst = data_buf + *pidx;

    dst[0] = (uint8_t)(c & 0xff);
    dst[1] = (uint8_t)(c >> 8);
    *pidx += 2;
}
| |
/* Append the payload of the run 'de' to 'data_buf' at offset *pidx and
   advance *pidx past it. de->data_index receives the start offset of the
   payload, except for DECOMP_TYPE_C1 where it receives the decomposed
   char itself and nothing is written. In the encodings that accept holes
   (code points without decomposition), a hole is written as char 0.
   No bound check is done on 'data_buf'. Abort on an unknown type. */
void add_decomp_data(uint8_t *data_buf, int *pidx, DecompEntry *de)
{
    int i, j, idx, c;
    CCInfo *ci;

    idx = *pidx;
    de->data_index = idx;
    if (de->type <= DECOMP_TYPE_C1) {
        /* the single 16 bit char is stored in place of the offset */
        ci = &unicode_db[de->code];
        assert(ci->decomp_len == 1);
        de->data_index = ci->decomp_data[0];
    } else if (de->type <= DECOMP_TYPE_L7) {
        /* 16 bit chars */
        for(i = 0; i < de->len; i++) {
            ci = &unicode_db[de->code + i];
            for(j = 0; j < de->c_len; j++) {
                if (ci->decomp_len == 0)
                    c = 0;
                else
                    c = ci->decomp_data[j];
                put16(data_buf, &idx, c);
            }
        }
    } else if (de->type <= DECOMP_TYPE_LL2) {
        /* 18 bit chars: the low 16 bits of every char come first, then
           the 2 upper bits packed four chars per byte */
        int n, p, k;
        n = (de->len * de->c_len * 18 + 7) / 8; /* total size in bytes */
        p = de->len * de->c_len * 2; /* offset of the upper bits */
        memset(data_buf + idx, 0, n);
        k = 0;
        for(i = 0; i < de->len; i++) {
            ci = &unicode_db[de->code + i];
            for(j = 0; j < de->c_len; j++) {
                if (ci->decomp_len == 0)
                    c = 0;
                else
                    c = ci->decomp_data[j];
                data_buf[idx + k * 2] = c;
                data_buf[idx + k * 2 + 1] = c >> 8;
                data_buf[idx + p + (k / 4)] |= (c >> 16) << ((k % 4) * 2);
                k++;
            }
        }
        idx += n;
    } else if (de->type <= DECOMP_TYPE_S5) {
        /* one short code per char */
        for(i = 0; i < de->len; i++) {
            ci = &unicode_db[de->code + i];
            for(j = 0; j < de->c_len; j++) {
                if (ci->decomp_len == 0)
                    c = 0;
                else
                    c = ci->decomp_data[j];
                c = get_short_code(c);
                assert(c >= 0);
                data_buf[idx++] = c;
            }
        }
    } else if (de->type <= DECOMP_TYPE_I4_2) {
        /* only the decomposition of the first code point is stored */
        ci = &unicode_db[de->code];
        assert(ci->decomp_len == de->c_len);
        for(j = 0; j < de->c_len; j++)
            put16(data_buf, &idx, ci->decomp_data[j]);
    } else if (de->type <= DECOMP_TYPE_B18) {
        /* 16 bit base char followed by one 8 bit offset per char */
        c = de->c_min;
        data_buf[idx++] = c;
        data_buf[idx++] = c >> 8;
        for(i = 0; i < de->len; i++) {
            ci = &unicode_db[de->code + i];
            for(j = 0; j < de->c_len; j++) {
                assert(ci->decomp_len == de->c_len);
                c = ci->decomp_data[j];
                if (c == 0x20) {
                    /* the offset 0xff is reserved for the space */
                    c = 0xff;
                } else {
                    c -= de->c_min;
                    assert((uint32_t)c <= 254);
                }
                data_buf[idx++] = c;
            }
        }
    } else if (de->type <= DECOMP_TYPE_LS2) {
        /* 16 bit first char, then the short code of the second char */
        assert(de->c_len == 2);
        for(i = 0; i < de->len; i++) {
            ci = &unicode_db[de->code + i];
            if (ci->decomp_len == 0)
                c = 0;
            else
                c = ci->decomp_data[0];
            put16(data_buf, &idx, c);

            if (ci->decomp_len == 0)
                c = 0;
            else
                c = ci->decomp_data[1];
            c = get_short_code(c);
            assert(c >= 0);
            data_buf[idx++] = c;
        }
    } else if (de->type <= DECOMP_TYPE_PAT3) {
        /* first and last char stored once, then one middle char per
           code point */
        ci = &unicode_db[de->code];
        assert(ci->decomp_len == 3);
        put16(data_buf, &idx, ci->decomp_data[0]);
        put16(data_buf, &idx, ci->decomp_data[2]);
        for(i = 0; i < de->len; i++) {
            ci = &unicode_db[de->code + i];
            assert(ci->decomp_len == 3);
            put16(data_buf, &idx, ci->decomp_data[1]);
        }
    } else if (de->type <= DECOMP_TYPE_S2_UL) {
        /* only the first code point of each pair is stored, as two
           short codes */
        for(i = 0; i < de->len; i += 2) {
            ci = &unicode_db[de->code + i];
            c = ci->decomp_data[0];
            c = get_short_code(c);
            assert(c >= 0);
            data_buf[idx++] = c;
            c = ci->decomp_data[1];
            c = get_short_code(c);
            assert(c >= 0);
            data_buf[idx++] = c;
        }
    } else if (de->type <= DECOMP_TYPE_LS2_UL) {
        /* only the first code point of each pair is stored, as a 16 bit
           char and a short code */
        for(i = 0; i < de->len; i += 2) {
            ci = &unicode_db[de->code + i];
            c = ci->decomp_data[0];
            put16(data_buf, &idx, c);
            c = ci->decomp_data[1];
            c = get_short_code(c);
            assert(c >= 0);
            data_buf[idx++] = c;
        }
    } else {
        abort();
    }
    *pidx = idx;
}
| |
| #if 0 |
| void dump_large_char(void) |
| { |
| int i, j; |
| for(i = 0; i <= CHARCODE_MAX; i++) { |
| CCInfo *ci = &unicode_db[i]; |
| for(j = 0; j < ci->decomp_len; j++) { |
| if (ci->decomp_data[j] > 0xffff) |
| printf("%05x\n", ci->decomp_data[j]); |
| } |
| } |
| } |
| #endif |
| |
| void build_compose_table(FILE *f, const DecompEntry *tab_de); |
| |
| void build_decompose_table(FILE *f) |
| { |
| int i, array_len, code_max, data_len, count; |
| DecompEntry *tab_de, de_s, *de = &de_s; |
| uint8_t *data_buf; |
| |
| code_max = CHARCODE_MAX; |
| |
| tab_de = mallocz((code_max + 2) * sizeof(*tab_de)); |
| |
| for(i = code_max; i >= 0; i--) { |
| find_decomp_run(tab_de, i); |
| } |
| |
| /* build the data buffer */ |
| data_buf = malloc(100000); |
| data_len = 0; |
| array_len = 0; |
| for(i = 0; i <= code_max; i++) { |
| de = &tab_de[i]; |
| if (de->len != 0) { |
| add_decomp_data(data_buf, &data_len, de); |
| i += de->len - 1; |
| array_len++; |
| } |
| } |
| |
| #ifdef DUMP_DECOMP_TABLE |
| /* dump */ |
| { |
| int size, size1; |
| |
| printf("START LEN TYPE L C SIZE\n"); |
| size = 0; |
| for(i = 0; i <= code_max; i++) { |
| de = &tab_de[i]; |
| if (de->len != 0) { |
| size1 = get_decomp_run_size(de); |
| printf("%05x %3d %6s %2d %1d %4d\n", i, de->len, |
| decomp_type_str[de->type], de->c_len, |
| unicode_db[i].is_compat, size1); |
| i += de->len - 1; |
| size += size1; |
| } |
| } |
| |
| printf("array_len=%d estimated size=%d bytes actual=%d bytes\n", |
| array_len, size, array_len * 6 + data_len); |
| } |
| #endif |
| |
| fprintf(f, "static const uint32_t unicode_decomp_table1[%u] = {", |
| array_len); |
| count = 0; |
| for(i = 0; i <= code_max; i++) { |
| de = &tab_de[i]; |
| if (de->len != 0) { |
| uint32_t v; |
| if (count++ % 4 == 0) |
| fprintf(f, "\n "); |
| v = (de->code << (32 - 18)) | |
| (de->len << (32 - 18 - 7)) | |
| (de->type << (32 - 18 - 7 - 6)) | |
| unicode_db[de->code].is_compat; |
| fprintf(f, " 0x%08x,", v); |
| i += de->len - 1; |
| } |
| } |
| fprintf(f, "\n};\n\n"); |
| |
| fprintf(f, "static const uint16_t unicode_decomp_table2[%u] = {", |
| array_len); |
| count = 0; |
| for(i = 0; i <= code_max; i++) { |
| de = &tab_de[i]; |
| if (de->len != 0) { |
| if (count++ % 8 == 0) |
| fprintf(f, "\n "); |
| fprintf(f, " 0x%04x,", de->data_index); |
| i += de->len - 1; |
| } |
| } |
| fprintf(f, "\n};\n\n"); |
| |
| fprintf(f, "static const uint8_t unicode_decomp_data[%u] = {", |
| data_len); |
| for(i = 0; i < data_len; i++) { |
| if (i % 8 == 0) |
| fprintf(f, "\n "); |
| fprintf(f, " 0x%02x,", data_buf[i]); |
| } |
| fprintf(f, "\n};\n\n"); |
| |
| build_compose_table(f, tab_de); |
| |
| free(data_buf); |
| |
| free(tab_de); |
| } |
| |
/* Entry of the compose table.
   NOTE(review): the code using this type is outside this part of the
   file; presumably 'c' is the pair of chars of a decomposition and 'p'
   the code point that decomposes into it -- confirm against
   build_compose_table(). */
typedef struct {
    uint32_t c[2];
    uint32_t p;
} ComposeEntry;
| |
| #define COMPOSE_LEN_MAX 10000 |
| |
| static int ce_cmp(const void *p1, const void *p2) |
| { |
| const ComposeEntry *ce1 = p1; |
| const ComposeEntry *ce2 = p2; |
| int i; |
| |
|