// blob: a3b2e62be482037483346b750a52114f4c9b1d33 [file] [log] [blame]
library unicode.tool.generator;
import "dart:async";
import "dart:io";
import "package:http/http.dart" as http;
import "package:lists/lists.dart";
import "package:strings/strings.dart";
import "package:template_block/template_block.dart";
/// Name of the local copy of `UnicodeData.txt`.
///
/// When a file with this name exists it is read instead of downloading
/// [UNICODE_DATA_URL]; it is the caller's job to make sure such a local copy
/// really belongs to [VERSION].
const String UNICODE_DATA_FILE = "UnicodeData.txt";

/// Unicode version written into the header of the generated library.
const String VERSION = "8.0.0";

/// Download location of `UnicodeData.txt` for exactly [VERSION].
///
/// The versioned directory is used instead of the moving `UNIDATA` alias, so
/// the downloaded data always matches the version reported in the generated
/// file header.
const String UNICODE_DATA_URL =
    "http://www.unicode.org/Public/$VERSION/ucd/UnicodeData.txt";
/// Entry point of the generator tool.
///
/// Loads every registered resource, feeds the loaded lines to a [Generator]
/// and writes the generated source to `lib/unicode.dart`.
void main() {
  var unicodeData =
      new Resource(filename: UNICODE_DATA_FILE, url: UNICODE_DATA_URL);
  var resources = <String, Resource>{};
  resources[Generator.UNICODE_DATA] = unicodeData;

  var pending = resources.values.map((resource) => resource.load());
  Future.wait(pending).then((_) {
    // Hand the generator the raw lines, keyed the same way as the resources.
    var data = <String, List<String>>{};
    resources.forEach((key, resource) {
      data[key] = resource.data;
    });

    var lines = new Generator().generate(data);
    new File("lib/unicode.dart").writeAsStringSync(lines.join("\n"));
  });
}
/// A Unicode general category.
///
/// Each instance carries the two-letter abbreviation used in
/// `UnicodeData.txt` ([abbr]), the identifier stem used for the generated
/// constants, predicates and character sets ([name]), and the numeric id
/// stored in the generated category table ([id]).
class Categories {
  static const Categories CN = const Categories("Cn", "NOT_ASSIGNED", 0);
  static const Categories CC = const Categories("Cc", "CONTROL", 1);
  static const Categories CF = const Categories("Cf", "FORMAT", 2);
  static const Categories CO = const Categories("Co", "PRIVATE_USE", 3);
  static const Categories CS = const Categories("Cs", "SURROGATE", 4);
  static const Categories LL = const Categories("Ll", "LOWERCASE_LETTER", 5);
  static const Categories LM = const Categories("Lm", "MODIFIER_LETTER", 6);
  static const Categories LO = const Categories("Lo", "OTHER_LETTER", 7);
  static const Categories LT = const Categories("Lt", "TITLECASE_LETTER", 8);
  static const Categories LU = const Categories("Lu", "UPPERCASE_LETTER", 9);
  static const Categories MC = const Categories("Mc", "SPACING_MARK", 10);
  // Was misspelled "ENCOSING_MARK"; the name is used verbatim to derive the
  // generated constant, predicate and character-set identifiers, so the typo
  // leaked into the generated library.
  static const Categories ME = const Categories("Me", "ENCLOSING_MARK", 11);
  static const Categories MN = const Categories("Mn", "NONSPACING_MARK", 12);
  static const Categories ND = const Categories("Nd", "DECIMAL_NUMBER", 13);
  static const Categories NL = const Categories("Nl", "LETTER_NUMBER", 14);
  static const Categories NO = const Categories("No", "OTHER_NUMBER", 15);
  static const Categories PC =
      const Categories("Pc", "CONNECTOR_PUNCTUATION", 16);
  static const Categories PD = const Categories("Pd", "DASH_PUNCTUATION", 17);
  static const Categories PE = const Categories("Pe", "CLOSE_PUNCTUATION", 18);
  static const Categories PF = const Categories("Pf", "FINAL_PUNCTUATION", 19);
  static const Categories PI =
      const Categories("Pi", "INITIAL_PUNCTUATION", 20);
  static const Categories PO = const Categories("Po", "OTHER_PUNCTUATION", 21);
  static const Categories PS = const Categories("Ps", "OPEN_PUNCTUATION", 22);
  static const Categories SC = const Categories("Sc", "CURRENCY_SYMBOL", 23);
  static const Categories SK = const Categories("Sk", "MODIFIER_SYMBOL", 24);
  static const Categories SM = const Categories("Sm", "MATH_SYMBOL", 25);
  static const Categories SO = const Categories("So", "OTHER_SYMBOL", 26);
  static const Categories ZL = const Categories("Zl", "LINE_SEPARATOR", 27);
  static const Categories ZP =
      const Categories("Zp", "PARAGRAPH_SEPARATOR", 28);
  static const Categories ZS = const Categories("Zs", "SPACE_SEPARATOR", 29);

  /// Numeric id stored for this category in the generated category table.
  final int id;

  /// Two-letter abbreviation, e.g. `Lu`; the key into [values].
  final String abbr;

  /// Upper-case identifier stem, e.g. `UPPERCASE_LETTER`.
  final String name;

  const Categories(this.abbr, this.name, this.id);

  /// All categories keyed by [abbr], in ascending [id] order.
  static final Map<String, Categories> values = <String, Categories>{
    CN.abbr: CN,
    CC.abbr: CC,
    CF.abbr: CF,
    CO.abbr: CO,
    CS.abbr: CS,
    LL.abbr: LL,
    LM.abbr: LM,
    LO.abbr: LO,
    LT.abbr: LT,
    LU.abbr: LU,
    MC.abbr: MC,
    ME.abbr: ME,
    MN.abbr: MN,
    ND.abbr: ND,
    NL.abbr: NL,
    NO.abbr: NO,
    PC.abbr: PC,
    PD.abbr: PD,
    PE.abbr: PE,
    PF.abbr: PF,
    PI.abbr: PI,
    PO.abbr: PO,
    PS.abbr: PS,
    SC.abbr: SC,
    SK.abbr: SK,
    SM.abbr: SM,
    SO.abbr: SO,
    ZL.abbr: ZL,
    ZP.abbr: ZP,
    ZS.abbr: ZS,
  };

  /// Returns [name].
  @override
  String toString() => name;
}
class Generator {
/// Highest code point covered by the generated tables (U+10FFFF).
static const int MAX_VALUE = 0x10ffff;
/// Number of code points, [MAX_VALUE] + 1; used as the length of every sparse
/// list, both in this tool and in the generated code.
static const int UNICODE_LENGTH = MAX_VALUE + 1;
/// Key under which the lines of `UnicodeData.txt` are expected in the map
/// passed to [generate].
static const String UNICODE_DATA = "UNICODE_DATA";
// The constants below are identifiers that are emitted into the generated
// library: names of generated variables and functions, and the case-mapping
// kinds from which further names are derived.
static const String _GENERAL_CATEGORIES = "generalCategories";
static const String _GENERATE_BOOL_GROUP = "_generateBoolGroup";
static const String _GENERATE_CATEGORY = "_generateCategory";
static const String _GENERATE_INT_GROUP = "_generateIntGroup";
static const String _GENERATE_INT_MAPPING = "_generateIntMapping";
// Case-mapping kinds; also the keys of [_caseMapping].
static const String _LOWERCASE = "lowercase";
static const String _TITLECASE = "titlecase";
static const String _TO_CASE = "_toCase";
static const String _TO_RUNE = "toRune";
static const String _TO_RUNES = "toRunes";
static const String _UPPERCASE = "uppercase";
/// Skeleton of the generated library.
///
/// `$UNICODE_DATA_URL` and `$VERSION` are interpolated when this string is
/// built; the `{{NAME}}` and `{{#...}}` placeholders are assigned in
/// [_generateLibrary].
static final String _templateLibrary = '''
// This library was created by the tool.
// Source: $UNICODE_DATA_URL
// Unicode Version: $VERSION
library {{NAME}};
{{#DIRECTIVES}}
{{#CONSTANTS}}
{{#VARIABLES}}
{{#METHODS}}
''';
/// Source of the generated `_generateBoolGroup` function, which builds a
/// frozen `SparseBoolList` of length [UNICODE_LENGTH] from a flat list of
/// `start, end` pairs.
static final String _templateMethodGenerateBoolGroup = '''
SparseBoolList $_GENERATE_BOOL_GROUP(List<int> data) {
var list = new SparseBoolList();
list.length = $UNICODE_LENGTH;
var length = data.length;
for (var i = 0; i < length; i += 2) {
var start = data[i + 0];
var end = data[i + 1];
list.addGroup(new GroupedRangeList<bool>(start, end, true));
}
list.freeze();
return list;
}
''';
/// Source of the generated `_generateCategory` function, which collects every
/// range of the generated `generalCategories` table whose key equals the
/// requested category id into a frozen `SparseBoolList`.
static final String _templateMethodGenerateCategory = '''
SparseBoolList $_GENERATE_CATEGORY(int category) {
var list = new SparseBoolList();
list.length = $UNICODE_LENGTH;
for (var group in $_GENERAL_CATEGORIES.groups) {
if (group.key == category) {
list.addGroup(new GroupedRangeList<bool>(group.start, group.end, true));
}
}
list.freeze();
return list;
}
''';
/// Source of the generated `_generateIntGroup` function.
///
/// The generated function optionally GZIP-decodes its input and then rebuilds
/// a `SparseList<int>` from `start delta, end delta, key` triples, i.e. it is
/// the decoder for the data produced by [_compressGroups].
static final String _templateMethodGenerateIntGroup = '''
SparseList<int> $_GENERATE_INT_GROUP(List<int> data, bool isCompressed) {
if (isCompressed) {
data = GZIP.decoder.convert(data);
}
var list = new SparseList<int>(defaultValue: 0);
list.length = $UNICODE_LENGTH;
var length = data.length;
var start = 0;
var end = 0;
for (var i = 0; i < length; i+= 3) {
start += data[i + 0];
end += data[i + 1];
var key = data[i + 2];
list.addGroup(new GroupedRangeList<int>(start, end, key));
}
list.freeze();
return list;
}
''';
/// Source of the generated `_generateIntMapping` function.
///
/// The generated function optionally GZIP-decodes its input and then rebuilds
/// an unmodifiable `Map<int, int>` from `key delta, value delta` pairs, i.e.
/// it is the decoder for the data produced by [_compressMapping].
static final String _templateMethodGenerateIntMapping = '''
Map<int, int> $_GENERATE_INT_MAPPING(List<int> data, bool isCompressed) {
if (isCompressed) {
data = GZIP.decoder.convert(data);
}
var map = new HashMap<int, int>();
var length = data.length;
var key = 0;
var value = 0;
for (var i = 0; i < length; i+= 2) {
key += data[i + 0];
value += data[i + 1];
map[key] = value;
}
return new UnmodifiableMapView<int, int>(map);
}
''';
/// Template of a generated `is<Category>` predicate that looks a character up
/// in the character set of one category. `{{NAME}}` and `{{CHARACTER_SET}}`
/// are assigned in [_generateMethodIsCategory].
static final String _templateMethodIsCategory = '''
bool is{{NAME}}(int character) => {{CHARACTER_SET}}[character];
''';
/// Source of the generated `_toCase` helper.
///
/// The generated function splits a string into runes, replaces every rune
/// that has an entry in the given mapping and builds a new string from the
/// result. The rune splitter is referenced through [_TO_RUNES], like in the
/// template that defines it, so the two cannot drift apart if the generated
/// name is ever changed; the emitted text is the same as before.
static final String _templateMethodToCase = '''
String $_TO_CASE(String string, Map<int, int> mapping) {
var runes = $_TO_RUNES(string);
var length = runes.length;
for (var i = 0; i < length; i++) {
var character = mapping[runes[i]];
if (character != null) {
runes[i] = character;
}
}
return new String.fromCharCodes(runes);
}
''';
/// Source of the generated `toRune` function.
///
/// The generated function returns the first code point of a string, combining
/// a leading surrogate pair into one value; it throws an `ArgumentError` for
/// `null` and a `StateError` for an empty string.
static final String _templateMethodToRune = '''
int $_TO_RUNE(String string) {
if (string == null) {
throw new ArgumentError("string: \$string");
}
var length = string.length;
if (length == 0) {
throw new StateError("An empty string contains no elements.");
}
var start = string.codeUnitAt(0);
if (length == 1) {
return start;
}
if ((start & 0xFC00) == 0xD800) {
var end = string.codeUnitAt(1);
if ((end & 0xFC00) == 0xDC00) {
return (0x10000 + ((start & 0x3FF) << 10) + (end & 0x3FF));
}
}
return start;
}
''';
/// Source of the generated `toRunes` function.
///
/// The generated function converts a string into a list of code points,
/// combining each valid surrogate pair into one value and passing unpaired
/// surrogates through unchanged. It throws an `ArgumentError` for `null`.
/// Note that an empty string yields a `const` list, whereas every other input
/// yields a freshly created list.
static final String _templateMethodToRunes = '''
List<int> $_TO_RUNES(String string) {
if (string == null) {
throw new ArgumentError("string: \$string");
}
var length = string.length;
if (length == 0) {
return const <int>[];
}
var runes = <int>[];
runes.length = length;
var i = 0;
var pos = 0;
for ( ; i < length; pos++) {
var start = string.codeUnitAt(i);
i++;
if ((start & 0xFC00) == 0xD800 && i < length) {
var end = string.codeUnitAt(i);
if ((end & 0xFC00) == 0xDC00) {
runes[pos] = (0x10000 + ((start & 0x3FF) << 10) + (end & 0x3FF));
i++;
} else {
runes[pos] = start;
}
} else {
runes[pos] = start;
}
}
runes.length = pos;
return runes;
}
''';
/// Template of a generated public case-conversion function that forwards to
/// the generated `_toCase` helper. `{{NAME}}` and `{{MAPPING}}` are assigned
/// in [_generateMethodToXxxCase].
static final String _templateMethodToXxxCase = '''
String {{NAME}}(String string) => $_TO_CASE(string, {{MAPPING}});
''';
/// Template of a generated per-category character-set variable, initialised
/// through the generated `_generateCategory` function. `{{NAME}}` and
/// `{{ID}}` are assigned in [_generateVariableCharacterSet].
static final String _templateCharacterSet = '''
final SparseBoolList {{NAME}} = $_GENERATE_CATEGORY({{ID}});
''';
/// Template of a generated simple-case-mapping variable, initialised through
/// the generated `_generateIntMapping` function. The placeholders are
/// assigned in [_generateVariableSimpleCaseMapping]; the placeholder name
/// `IS_COMRESSED` (sic) has to match the key used there.
static final String _templateMapping = '''
final Map<int, int> {{NAME}} = $_GENERATE_INT_MAPPING({{DATA}}, {{IS_COMRESSED}});
''';
/// Template of a generated `SparseBoolList` variable, initialised through the
/// generated `_generateBoolGroup` function.
// NOTE(review): nothing in the visible part of this file references this
// template — confirm whether it is still needed.
static final String _templateSparseListBool = '''
final SparseBoolList {{NAME}} = $_GENERATE_BOOL_GROUP({{DATA}});
''';
/// Template of a generated `SparseList<int>` variable, initialised through
/// the generated `_generateIntGroup` function. The placeholders are assigned
/// in [_generateVariableCategories]; the placeholder name `IS_COMRESSED`
/// (sic) has to match the key used there.
static final String _templateSparseListInt = '''
final SparseList<int> {{NAME}} = $_GENERATE_INT_GROUP({{DATA}}, {{IS_COMRESSED}});
''';
/// Whether the most recent GZIP round trip failed to reproduce its input.
///
/// Set by [_compressGroups] and [_compressMapping]; when `true` they return
/// the uncompressed data instead, and the generated code is told not to
/// decompress it. The underlying bug has been present in the Dart VM since
/// 8 Sep 2014.
bool _bugInDartGzip;
/// Ranges of code points keyed by category id; filled by [_build] and emitted
/// by [_generateVariableCategories].
SparseList<int> _characters;
/// Set of code points per category; filled by [_build].
Map<Categories, SparseBoolList> _categories;
/// Generated constant declarations, one list of lines per fragment.
List<List<String>> _constants;
/// Generated function declarations, one list of lines per fragment.
List<List<String>> _methods;
/// Simple case mappings (code point to mapped code point), keyed by
/// [_LOWERCASE], [_TITLECASE] and [_UPPERCASE].
Map<String, Map<int, int>> _caseMapping;
/// Generated variable declarations, one list of lines per fragment.
List<List<String>> _variables;
/// Generates the source of the `unicode` library.
///
/// [data] must hold the lines of `UnicodeData.txt` under the key
/// [Generator.UNICODE_DATA]. All state left over from a previous run is
/// discarded first. Returns the result of processing the library template.
List<String> generate(Map<String, List<String>> data) {
  // Start from a clean slate so that the generator can be reused.
  _constants = <List<String>>[];
  _variables = <List<String>>[];
  _methods = <List<String>>[];
  _categories = <Categories, SparseBoolList>{};
  _characters = new SparseList<int>(defaultValue: 0);
  _caseMapping = <String, Map<int, int>>{};

  _build(_parseUnicodeData(data[Generator.UNICODE_DATA]));

  _generateConstants();
  _generateVariables();
  _generateMethods();
  return _generateLibrary("unicode");
}
/// Fills [_caseMapping], [_categories] and [_characters] from [characters].
///
/// [characters] is indexed by code point; `null` entries are skipped. Throws
/// a [StateError] when an entry names a category that is not listed in
/// [Categories.values].
void _build(List<Character> characters) {
  var toLower = <int, int>{};
  var toTitle = <int, int>{};
  var toUpper = <int, int>{};
  _caseMapping[_LOWERCASE] = toLower;
  _caseMapping[_TITLECASE] = toTitle;
  _caseMapping[_UPPERCASE] = toUpper;

  var present = characters.where((entry) => entry != null);

  // One empty set per known category, in declaration order.
  for (var category in Categories.values.values) {
    _categories[category] = new SparseBoolList()..length = UNICODE_LENGTH;
  }

  for (var character in present) {
    var category = Categories.values[character.category];
    if (category == null) {
      throw new StateError(
          "Unknown character category: ${character.category}");
    }

    var code = character.code;
    _categories[category][code] = true;

    // Simple case mappings; absent ones are left out of the maps.
    if (character.lowercase != null) {
      toLower[code] = character.lowercase;
    }
    if (character.titlecase != null) {
      toTitle[code] = character.titlecase;
    }
    if (character.uppercase != null) {
      toUpper[code] = character.uppercase;
    }
  }

  // Merge the per-category sets into one table keyed by category id.
  _categories.forEach((category, members) {
    for (var range in members.groups) {
      _characters.addGroup(
          new GroupedRangeList<int>(range.start, range.end, category.id));
    }
  });
}
/// Encodes [groups], a flat list of `start, end, key` triples, for embedding
/// in the generated library.
///
/// Starts and ends are stored as differences to the previous triple; keys are
/// stored unchanged. The result is then GZIP-compressed when that can be done
/// without loss, see [_gzipIfLossless]. [_bugInDartGzip] tells afterwards
/// whether the returned list is the uncompressed data.
List<int> _compressGroups(List<int> groups) {
  var data = <int>[];
  var start = 0;
  var end = 0;
  for (var i = 0; i < groups.length; i += 3) {
    var nextStart = groups[i];
    var nextEnd = groups[i + 1];
    data.add(nextStart - start);
    data.add(nextEnd - end);
    data.add(groups[i + 2]);
    start = nextStart;
    end = nextEnd;
  }
  return _gzipIfLossless(data);
}

/// Encodes [mapping], a flat list of `key, value` pairs, for embedding in the
/// generated library.
///
/// Keys and values are stored as differences to the previous pair. The result
/// is then GZIP-compressed when that can be done without loss, see
/// [_gzipIfLossless]. [_bugInDartGzip] tells afterwards whether the returned
/// list is the uncompressed data.
List<int> _compressMapping(List<int> mapping) {
  var data = <int>[];
  var key = 0;
  var value = 0;
  for (var i = 0; i < mapping.length; i += 2) {
    var nextKey = mapping[i];
    var nextValue = mapping[i + 1];
    data.add(nextKey - key);
    data.add(nextValue - value);
    key = nextKey;
    value = nextValue;
  }
  return _gzipIfLossless(data);
}

/// Returns [data] GZIP-compressed, or [data] itself when decompressing the
/// compressed form does not give [data] back.
///
/// Sets [_bugInDartGzip] to `true` in the latter case and to `false`
/// otherwise. The lengths are compared before the elements, so a decoded list
/// that is shorter than [data] is reported as a failed round trip instead of
/// causing a [RangeError].
// NOTE(review): presumably the round trip also fails whenever a value does
// not fit into a byte — confirm against the GZIP codec documentation.
List<int> _gzipIfLossless(List<int> data) {
  var compressed = GZIP.encoder.convert(data);
  var restored = GZIP.decoder.convert(compressed);
  _bugInDartGzip = restored.length != data.length;
  for (var i = 0; !_bugInDartGzip && i < data.length; i++) {
    _bugInDartGzip = data[i] != restored[i];
  }
  return _bugInDartGzip ? data : compressed;
}
/// Adds one `const int <NAME> = <id>;` declaration per category, followed by
/// an empty line, to [_constants].
void _generateConstants() {
  var declarations = Categories.values.values
      .map((category) => "const int ${category.name} = ${category.id};")
      .toList();
  declarations.add("");
  _constants.add(declarations);
}
/// Fills the library template with [name], the import directives of the
/// generated library and the collected constants, methods and variables, and
/// returns the processed template.
List<String> _generateLibrary(String name) {
  var library = new TemplateBlock(_templateLibrary);
  library.assign("NAME", name);

  // Directives of the generated library, followed by a separating empty line.
  var directives = <String>[
    "import \"dart:collection\";",
    "import \"dart:io\";",
    "import \"package:lists/lists.dart\";",
    "",
  ];
  for (var directive in directives) {
    library.assign("#DIRECTIVES", directive);
  }

  library
    ..assign("#CONSTANTS", _constants)
    ..assign("#METHODS", _methods)
    ..assign("#VARIABLES", _variables);
  return library.process();
}
/// Adds the processed [_templateMethodGenerateBoolGroup] to [_methods].
void _generateMethodGenerateBoolGroup() {
  _methods.add(new TemplateBlock(_templateMethodGenerateBoolGroup).process());
}
/// Adds the processed [_templateMethodGenerateCategory] to [_methods].
void _generateMethodGenerateCategory() {
  _methods.add(new TemplateBlock(_templateMethodGenerateCategory).process());
}
/// Adds the processed [_templateMethodGenerateIntGroup] to [_methods].
void _generateMethodGenerateIntGroup() {
  _methods.add(new TemplateBlock(_templateMethodGenerateIntGroup).process());
}
/// Adds the processed [_templateMethodGenerateIntMapping] to [_methods].
void _generateMethodGenerateIntMapping() {
  _methods
      .add(new TemplateBlock(_templateMethodGenerateIntMapping).process());
}
/// Runs every method generator.
///
/// The order of the list is the order in which the fragments are added to
/// [_methods].
void _generateMethods() {
  var steps = <Function>[
    _generateMethodIsCategory,
    _generateMethodToXxxCase,
    _generateMethodToRune,
    _generateMethodToRunes,
    _generateMethodGenerateBoolGroup,
    _generateMethodGenerateCategory,
    _generateMethodGenerateIntGroup,
    _generateMethodGenerateIntMapping,
    _generateMethodToCase,
  ];
  for (var step in steps) {
    step();
  }
}
/// Adds one `is<Category>` predicate per category to [_methods].
///
/// The predicate name is derived from the category name with `camelize`; the
/// character set it consults is named by [_getCharacterSetName].
void _generateMethodIsCategory() {
  var prototype = new TemplateBlock(_templateMethodIsCategory);
  for (var category in Categories.values.values) {
    var predicate = prototype.clone()
      ..assign("NAME", camelize(category.name))
      ..assign("CHARACTER_SET", _getCharacterSetName(category));
    _methods.add(predicate.process());
  }
}
/// Adds the processed [_templateMethodToCase] to [_methods].
void _generateMethodToCase() {
  _methods.add(new TemplateBlock(_templateMethodToCase).process());
}
/// Adds the processed [_templateMethodToRune] to [_methods].
void _generateMethodToRune() {
  _methods.add(new TemplateBlock(_templateMethodToRune).process());
}
/// Adds the processed [_templateMethodToRunes] to [_methods].
void _generateMethodToRunes() {
  _methods.add(new TemplateBlock(_templateMethodToRunes).process());
}
/// Adds one public case-conversion function per entry of [_caseMapping] to
/// [_methods].
///
/// The function name is `camelize("to_<kind>", true)`; the mapping it uses is
/// named by [_getSimpleCaseMappingName].
void _generateMethodToXxxCase() {
  var prototype = new TemplateBlock(_templateMethodToXxxCase);
  for (var kind in _caseMapping.keys) {
    var converter = prototype.clone()
      ..assign("NAME", camelize("to_${kind}", true))
      ..assign("MAPPING", _getSimpleCaseMappingName(kind));
    _methods.add(converter.process());
  }
}
/// Adds the declaration of the general-categories table to [_variables].
///
/// The ranges of [_characters] are flattened into `start, end, key` triples
/// and passed through [_compressGroups]; the generated initialiser is told to
/// decompress only when [_bugInDartGzip] is `false` afterwards.
void _generateVariableCategories() {
  var triples = <int>[];
  for (var range in _characters.groups) {
    triples.addAll(<int>[range.start, range.end, range.key]);
  }

  // Must run before `_bugInDartGzip` is read below.
  var payload = _compressGroups(triples);

  var declaration = new TemplateBlock(_templateSparseListInt)
    ..assign("IS_COMRESSED", !_bugInDartGzip)
    ..assign("DATA", "[${payload.join(", ")}]")
    ..assign("NAME", _GENERAL_CATEGORIES);
  _variables.add(declaration.process());
}
/// Runs every variable generator.
///
/// The order of the list is the order in which the fragments are added to
/// [_variables].
void _generateVariables() {
  var steps = <Function>[
    _generateVariableCategories,
    _generateVariableCharacterSet,
    _generateVariableSimpleCaseMapping,
  ];
  for (var step in steps) {
    step();
  }
}
/// Adds one character-set declaration per key of [_categories] to
/// [_variables]; each is initialised from the category id.
void _generateVariableCharacterSet() {
  var prototype = new TemplateBlock(_templateCharacterSet);
  for (var category in _categories.keys) {
    var declaration = prototype.clone()
      ..assign("NAME", _getCharacterSetName(category))
      ..assign("ID", category.id);
    _variables.add(declaration.process());
  }
}
/// Adds one mapping declaration per entry of [_caseMapping] to [_variables].
///
/// Each mapping is flattened into `key, value` pairs and passed through
/// [_compressMapping]; the generated initialiser is told to decompress only
/// when [_bugInDartGzip] is `false` afterwards.
void _generateVariableSimpleCaseMapping() {
  var prototype = new TemplateBlock(_templateMapping);
  for (var kind in _caseMapping.keys) {
    var pairs = <int>[];
    _caseMapping[kind].forEach((from, to) {
      pairs.add(from);
      pairs.add(to);
    });

    // Must run before `_bugInDartGzip` is read below.
    var payload = _compressMapping(pairs);

    var declaration = prototype.clone()
      ..assign("IS_COMRESSED", !_bugInDartGzip)
      ..assign("DATA", "[${payload.join(", ")}]")
      ..assign("NAME", _getSimpleCaseMappingName(kind));
    _variables.add(declaration.process());
  }
}
/// Returns the name of the generated character-set variable of [category]:
/// `camelize("<NAME>_Characters", true)`.
String _getCharacterSetName(Categories category) =>
    camelize("${category.name}_Characters", true);
/// Returns the name of the generated mapping variable for the case-mapping
/// kind [name]: `camelize("simple_<name>_mapping", true)`.
String _getSimpleCaseMappingName(String name) =>
    camelize("simple_${name}_mapping", true);
/// Parses the [lines] of `UnicodeData.txt` into a list indexed by code point.
///
/// The returned list has [UNICODE_LENGTH] elements; code points that do not
/// occur in the data stay `null`. Compared with a plain line-by-line parse:
///
/// * Blank lines are skipped instead of failing in `int.parse`.
/// * A pair of lines whose names end in `, First>` and `, Last>` denotes a
///   whole range of code points sharing the same properties. Every code point
///   inside the range gets its own entry, so that large blocks described this
///   way are no longer treated as unassigned.
/// * Each line is turned into a [Character] only once, and the list is
///   created with its element type.
///
/// Throws a [FormatException] when a code point is not valid hexadecimal.
List<Character> _parseUnicodeData(List<String> lines) {
  var characters = new List<Character>(UNICODE_LENGTH);
  // Code point of a pending "<..., First>" line, or null when none is open.
  int rangeStart;
  for (var line in lines) {
    if (line.trim().isEmpty) {
      continue;
    }
    var parts = line.split(";");
    var code = int.parse(parts[0], radix: 16);
    var name = parts.length > 1 ? parts[1] : "";

    if (rangeStart != null && name.endsWith(", Last>")) {
      // Fill the interior of the range; both end points get their own entry
      // from their own line.
      for (var between = rangeStart + 1; between < code; between++) {
        // Character reads its code point from field 0, so every entry needs
        // its own copy of the fields.
        var fields = new List<String>.from(parts);
        fields[0] = between.toRadixString(16).toUpperCase();
        characters[between] = new Character(fields);
      }
    }
    rangeStart = name.endsWith(", First>") ? code : null;

    characters[code] = new Character(parts);
  }
  return characters;
}
}
/// One record of `UnicodeData.txt`.
///
/// Only the fields needed by the generator are decoded: the code point
/// (field 0), the general category (field 2) and the simple upper-, lower-
/// and title-case mappings (fields 12, 13 and 14). A case mapping whose field
/// is empty stays `null`.
class Character {
  /// The raw, `;`-separated fields this character was created from.
  List<String> data;

  /// The code point, parsed from hexadecimal.
  int code;

  /// The general category abbreviation, e.g. `Lu`.
  String category;

  int lowercase;
  int titlecase;
  int uppercase;

  Character(this.data) {
    code = int.parse(data[0], radix: 16);
    category = data[2];

    var upper = data[12];
    if (upper.isNotEmpty) {
      uppercase = int.parse(upper, radix: 16);
    }

    var lower = data[13];
    if (lower.isNotEmpty) {
      lowercase = int.parse(lower, radix: 16);
    }

    var title = data[14];
    if (title.isNotEmpty) {
      titlecase = int.parse(title, radix: 16);
    }
  }
}
/// A text resource that is read from a local file when one exists and
/// downloaded otherwise.
class Resource {
  /// Path of the local copy.
  String filename;

  /// Location to download from when [filename] does not exist.
  String url;

  /// The lines of the resource; set once [load] has completed.
  List<String> data;

  Resource({this.filename, this.url});

  /// Loads the resource into [data] and completes with it.
  ///
  /// A file named [filename] takes precedence over [url]. Downloaded text is
  /// split at `\r\n`, `\r` and `\n`, and a single empty last line is dropped.
  Future<List<String>> load() {
    var local = new File(filename);
    if (!local.existsSync()) {
      return http.read(Uri.parse(url)).then(_storeDownloaded);
    }
    return local.readAsLines().then((lines) {
      data = lines;
      return data;
    });
  }

  /// Splits the downloaded [body] into lines, stores them in [data] and
  /// returns them.
  List<String> _storeDownloaded(String body) {
    data = body.split(new RegExp(r"\r\n|\r|\n"));
    if (data.last.isEmpty) {
      data.removeLast();
    }
    return data;
  }
}