| #!/bin/sh |
| # |
| # This needs http://unicode.org/Public/UNIDATA/UnicodeData.txt |
| # |
| inputfile="$1" # Expect UnicodeData.txt |
| outfile=archive_string_composition.h |
| # When a path is given it must be readable; fail before ${outfile} is truncated. |
| # With no argument the awk run gets no file operand and reads standard input. |
| if [ -n "${inputfile}" ] && [ ! -r "${inputfile}" ]; then |
|     echo "$0: cannot read ${inputfile}" >&2 |
|     exit 1 |
| fi |
| # Create the scratch awk script with mktemp: a fixed /tmp name derived from |
| # the PID is predictable and can be pre-created or symlinked by another user. |
| pickout=$(mktemp "${TMPDIR:-/tmp}/mk_unicode_composition_tbl.XXXXXX") || exit 1 |
| # Remove the scratch script on every exit path, including failures. |
| trap 'rm -f "${pickout}"' EXIT |
| ################################################################################# |
| # |
| # Append the file header of "archive_string_composition.h" |
| # |
| ################################################################################# |
| append_copyright() |
| { |
| # Create (or truncate) ${outfile} and write the license block, the |
| # "generated file" notice, the opening of the include guard and the |
| # definition of struct unicode_composition_table. |
| # The here-document delimiter is unquoted, so the shell expands the body: |
| # dollar signs and backticks that must appear literally in the header are |
| # backslash-escaped.  Unescaped, the pair of backticks in front of AS IS |
| # is an empty command substitution and vanishes from the output. |
| # The include guard is left open here; the awk END action prints the |
| # closing #endif. |
| cat > "${outfile}" <<CR_END |
| /*- |
| * Copyright (c) 2011 libarchive Project |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) \`\`AS IS'' AND ANY EXPRESS OR |
| * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
| * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, |
| * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
| * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| * |
| * \$FreeBSD\$ |
| * |
| */ |
| |
| /* |
| * ATTENTION! |
| * This file is generated by build/utils/gen_archive_string_composition_h.sh |
| * from http://unicode.org/Public/UNIDATA/UnicodeData.txt |
| * |
| * See also http://unicode.org/report/tr15/ |
| */ |
| |
| #ifndef __LIBARCHIVE_BUILD |
| #error This header is only to be used internally to libarchive. |
| #endif |
| |
| #ifndef ARCHIVE_STRING_COMPOSITION_H_INCLUDED |
| #define ARCHIVE_STRING_COMPOSITION_H_INCLUDED |
| |
| struct unicode_composition_table { |
| uint32_t cp1; |
| uint32_t cp2; |
| uint32_t nfc; |
| }; |
| |
| CR_END |
| } |
| ################################################################################# |
| # |
| # awk script |
| # |
| ################################################################################# |
| # Write the awk program to ${pickout}.  The here-document delimiter is |
| # unquoted, so the shell processes the body before awk sees it: awk field |
| # references are written as \$N, and a doubled backslash yields a single |
| # backslash in the awk program. |
| cat > ${pickout} <<AWK_END |
| # |
| # Set up parsing state and start the composition table. |
| # Input fields are separated by semicolons. |
| # min and max later receive the first and the last code point that has a |
| # non-zero Canonical Combining Class. |
| # cmd is a pipeline that sorts the records piped to it and formats each |
| # one as a three-member C initializer row. |
| BEGIN { |
| FS = ";" |
| min = ""; |
| max = ""; |
| cmd="sort | awk -F ' ' '{printf \"\\\\t{ 0x%s , 0x%s , 0x%s },\\\\n\",\$1,\$2,\$3}'" |
| print "static const struct unicode_composition_table u_composition_table[] = {" |
| } |
| # |
| # After the last input record: finish the composition table, then print the |
| # Canonical Combining Class macros and lookup tables and close the include |
| # guard.  highnum holds the block number (code point >> 8) of the last code |
| # point that had a non-zero class. |
| END { |
| # Closing the pipe lets the sort pipeline finish, so the table rows are |
| # written before the closing brace printed below. |
| close(cmd) |
| print "};" |
| print "" |
| # |
| # Output Canonical Combining Class tables used for translating NFD to NFC. |
| # |
| printf "#define CANONICAL_CLASS_MIN\\t0x%s\\n", min |
| printf "#define CANONICAL_CLASS_MAX\\t0x%s\\n", max |
| print "" |
| printf "#define IS_DECOMPOSABLE_BLOCK(uc)\\t\\\\\n" |
| printf "\\t(((uc)>>8) <= 0x%X && u_decomposable_blocks[(uc)>>8])\\n", highnum |
| printf "static const char u_decomposable_blocks[0x%X+1] = {\\n\\t", highnum |
| # |
| # Output blockmap, 32 entries per line. |
| for (i = 0; i <= highnum; i++) { |
| if (i != 0 && i % 32 == 0) |
| printf "\\n\\t" |
| # Hangul blocks 11XX (17) and AC00 (172) - D7FF (215) are decomposable too. |
| if (blockmap[i] || i == 17 || (i >= 172 && i <= 215)) |
| printf "1," |
| else |
| printf "0," |
| } |
| printf "\\n};\\n\\n" |
| # |
| # Output a macro to get a canonical combining class. |
| # |
| print "/* Get Canonical Combining Class(CCC). */" |
| printf "#define CCC(uc)\\t\\\\\n" |
| printf "\\t(((uc) > 0x%s)?0:\\\\\\n", max |
| printf "\\tccc_val[ccc_val_index[ccc_index[(uc)>>8]][((uc)>>4)&0x0F]][(uc)&0x0F])\\n" |
| print "" |
| # |
| # Output a canonical combining class value table. |
| # Row 0 is the shared all-zero row; every 16-code-point group that holds |
| # a non-zero class gets its own row, numbered by midcnt. |
| # |
| midcnt = 0 |
| printf "/* The table of the value of Canonical Combining Class */\\n" |
| print "static const unsigned char ccc_val[][16] = {" |
| print " /* idx=0: XXXX0 - XXXXF */" |
| print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }," |
| for (h = 0; h <= highnum; h++) { |
| if (!blockmap[h]) |
| continue; |
| for (m = 0; m < 16; m++) { |
| if (!xx_blockmap[h, m]) |
| continue; |
| midcnt++ |
| printf " /* idx=%d: %03X%1X0 - %03X%1XF */\\n {", midcnt, h, m, h, m |
| # Entries never assigned print as 0. |
| for (l = 0; l < 15; l++) { |
| printf "%d, ", xxx_blockmap[h, m, l] |
| } |
| printf "%d },\\n", xxx_blockmap[h, m, 15] |
| } |
| } |
| printf "};\\n" |
| # |
| # Output the index table of the canonical combining class value table. |
| # midcnt is counted again in the same order as above, so each entry is |
| # the ccc_val row number of its group, or 0 for the all-zero row. |
| # |
| cnt = 0 |
| midcnt = 0 |
| printf "\\n/* The index table to ccc_val[*][16] */\\n" |
| print "static const unsigned char ccc_val_index[][16] = {" |
| print " /* idx=0: XXX00 - XXXFF */" |
| print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }," |
| for (h = 0; h <= highnum; h++) { |
| if (!blockmap[h]) |
| continue; |
| cnt++ |
| printf " /* idx=%d: %03X00 - %03XFF */\\n {", cnt, h, h |
| for (m = 0; m < 16; m++) { |
| if (m != 0) |
| printf "," |
| if (xx_blockmap[h, m]) { |
| midcnt++ |
| printf "%2d", midcnt |
| } else |
| printf " 0" |
| } |
| printf " },\\n" |
| } |
| printf "};\\n" |
| # |
| # Output the index table to the index table of the canonical combining |
| # class value table, 24 entries per line. |
| # |
| printf "\\n/* The index table to ccc_val_index[*][16] */\\n" |
| # The format has no conversions, so it takes no arguments. |
| printf "static const unsigned char ccc_index[] = {\\n " |
| cnt = 0 |
| for (h = 0; h <= highnum; h++) { |
| if (h != 0 && h % 24 == 0) |
| printf "\\n " |
| if (blockmap[h]) { |
| cnt++; |
| printf "%2d,", cnt |
| } else |
| printf " 0," |
| } |
| print "};" |
| print "" |
| print "#endif /* ARCHIVE_STRING_COMPOSITION_H_INCLUDED */" |
| } |
| # |
| # |
| # Convert a string of hexadecimal digits to a number. |
| # Upper-case and lower-case digits are accepted; any other character is |
| # skipped.  An empty string yields 0. |
| # value, pos, n and digit are extra parameters used only as local variables, |
| # so the conversion does not touch global variables such as loop counters. |
| function hextoi(hex,    value, pos, n, digit) |
| { |
| value = 0 |
| n = length(hex) |
| for (pos = 1; pos <= n; pos++) { |
| # index() is 1-based and returns 0 when the character is not a hex digit. |
| digit = index("0123456789ABCDEF", toupper(substr(hex, pos, 1))) |
| if (digit > 0) |
| value = value * 16 + digit - 1 |
| } |
| return value |
| } |
| # |
| # Collect Canonical Combining Class values. |
| # |
| # Runs for every record whose fourth field is a run of digits or A-F. |
| # A value other than 0 is recorded in three maps keyed by the code point in |
| # field 1: the block (all but the last two hex digits), the second-to-last |
| # hex digit, and the last hex digit. |
| # The end-of-line anchors are backslash-escaped like in every other rule so |
| # that the shell never sees an unescaped dollar sign in the here-document. |
| \$4 ~/^[0-9A-F]+\$/ { |
| if (\$4 !~/^0\$/) { |
| if (min == "") { |
| min = \$1 |
| } |
| max = \$1 |
| high = substr(\$1, 1, length(\$1) -2) |
| highnum = hextoi(high) |
| mid = substr(\$1, length(\$1) -1, 1) |
| midnum = hextoi(mid) |
| low = substr(\$1, length(\$1), 1) |
| lownum = hextoi(low) |
| blockmap[highnum] = 1 |
| xx_blockmap[highnum, midnum] = 1 |
| xxx_blockmap[highnum, midnum, lownum] = \$4 |
| } |
| } |
| # |
| # Following code points are not decomposed in MAC OS. |
| # U+2000 - U+2FFF |
| # U+F900 - U+FAFF |
| # U+2F800 - U+2FAFF |
| # |
| #\$1 ~/^2[0-9A-F][0-9A-F][0-9A-F]\$/ { |
| # next |
| #} |
| #\$1 ~/^F[9A][0-9A-F][0-9A-F]\$/ { |
| # next |
| #} |
| #\$1 ~/^2F[89A][0-9A-F][0-9A-F]\$/ { |
| # next |
| #} |
| # |
| # Exclusion code points specified by |
| # http://unicode.org/Public/UNIDATA/CompositionExclusions.txt |
| ## |
| # 1. Script Specifics |
| ## |
| # Each rule below drops one group of excluded code points: a record whose |
| # first field matches is skipped before the output rule can see it. |
| \$1 ~/^095[89ABCDEF]\$/ { next } |
| \$1 ~/^09D[CDF]\$/ { next } |
| \$1 ~/^0A3[36]\$/ { next } |
| \$1 ~/^0A5[9ABE]\$/ { next } |
| \$1 ~/^0B5[CD]\$/ { next } |
| \$1 ~/^0F4[3D]\$/ { next } |
| \$1 ~/^0F5[27C]\$/ { next } |
| \$1 ~/^0F69\$/ { next } |
| \$1 ~/^0F7[68]\$/ { next } |
| \$1 ~/^0F9[3D]\$/ { next } |
| \$1 ~/^0FA[27C]\$/ { next } |
| \$1 ~/^0FB9\$/ { next } |
| \$1 ~/^FB1[DF]\$/ { next } |
| \$1 ~/^FB2[ABCDEF]\$/ { next } |
| \$1 ~/^FB3[012345689ABCE]\$/ { next } |
| \$1 ~/^FB4[01346789ABCDE]\$/ { next } |
| ## |
| # 2. Post Composition Version precomposed characters |
| ## |
| \$1 ~/^2ADC\$/ { next } |
| \$1 ~/^1D15[EF]\$/ { next } |
| \$1 ~/^1D16[01234]\$/ { next } |
| \$1 ~/^1D1B[BCDEF]\$/ { next } |
| \$1 ~/^1D1C0\$/ { next } |
| ## |
| # 3. Singleton Decompositions |
| ## |
| \$1 ~/^034[01]\$/ { next } |
| \$1 ~/^037[4E]\$/ { next } |
| \$1 ~/^0387\$/ { next } |
| \$1 ~/^1F7[13579BD]\$/ { next } |
| \$1 ~/^1FB[BE]\$/ { next } |
| \$1 ~/^1FC[9B]\$/ { next } |
| \$1 ~/^1FD[3B]\$/ { next } |
| \$1 ~/^1FE[3BEF]\$/ { next } |
| \$1 ~/^1FF[9BD]\$/ { next } |
| \$1 ~/^200[01]\$/ { next } |
| \$1 ~/^212[6AB]\$/ { next } |
| \$1 ~/^232[9A]\$/ { next } |
| \$1 ~/^F9[0-9A-F][0-9A-F]\$/ { next } |
| \$1 ~/^FA0[0-9A-D]\$/ { next } |
| \$1 ~/^FA1[025-9A-E]\$/ { next } |
| \$1 ~/^FA2[0256A-D]\$/ { next } |
| \$1 ~/^FA[3-5][0-9A-F]\$/ { next } |
| \$1 ~/^FA6[0-9A-D]\$/ { next } |
| \$1 ~/^FA[7-9A-C][0-9A-F]\$/ { next } |
| \$1 ~/^FAD[0-9]\$/ { next } |
| \$1 ~/^2F[89][0-9A-F][0-9A-F]\$/ { next } |
| \$1 ~/^2FA0[0-9A-F]\$/ { next } |
| \$1 ~/^2FA1[0-9A-D]\$/ { next } |
| ## |
| # 4. Non-Starter Decompositions |
| ## |
| \$1 ~/^0344\$/ { next } |
| \$1 ~/^0F7[35]\$/ { next } |
| \$1 ~/^0F81\$/ { next } |
| # |
| # Output combinations for NFD ==> NFC. |
| # |
| # Field 6 holding exactly two code points is a canonical pair: send |
| # "first second composed" to the sort pipeline.  When the composed code |
| # point has four digits, all three values get a leading 0. |
| \$6 ~/^[0-9A-F]+ [0-9A-F]+\$/ { |
| split(\$6, parts, " ") |
| pad = (length(\$1) == 4) ? "0" : "" |
| print pad parts[1], pad parts[2], pad \$1 | cmd |
| } |
| AWK_END |
| ################################################################################# |
| # |
| # Run the awk script. |
| # |
| ################################################################################# |
| append_copyright |
| # Append the generated tables to the header.  With no input path awk gets |
| # no file operand and reads standard input; the :+ expansion passes a path |
| # that contains spaces as a single word. |
| awk -f "${pickout}" ${inputfile:+"${inputfile}"} >> "${outfile}" |
| status=$? |
| # |
| # Remove the awk script. |
| rm -f "${pickout}" |
| # Report an awk failure to the caller instead of the status of rm. |
| exit ${status} |