library unicode.tool.generator; | |
import "dart:async"; | |
import "dart:io"; | |
import "package:http/http.dart" as http; | |
import "package:lists/lists.dart"; | |
import "package:strings/strings.dart"; | |
import "package:template_block/template_block.dart"; | |
const String UNICODE_DATA_FILE = "UnicodeData.txt"; | |
const String UNICODE_DATA_URL = | |
"http://www.unicode.org/Public/UNIDATA/UnicodeData.txt"; | |
const String VERSION = "8.0.0"; | |
/// Entry point of the generator tool.
///
/// Loads every registered [Resource], feeds the loaded lines to a
/// [Generator] and writes the generated source to `lib/unicode.dart`.
void main() {
  var unicodeData =
      new Resource(filename: UNICODE_DATA_FILE, url: UNICODE_DATA_URL);
  var resources = <String, Resource>{Generator.UNICODE_DATA: unicodeData};
  var pending = resources.values.map((resource) => resource.load());
  Future.wait(pending).then((_) {
    // Every resource has stored its lines in `data` by now.
    var data = <String, List<String>>{};
    for (var key in resources.keys) {
      data[key] = resources[key].data;
    }
    var lines = new Generator().generate(data);
    new File("lib/unicode.dart").writeAsStringSync(lines.join("\n"));
  });
}
/// A Unicode general category.
///
/// Carries the two-letter abbreviation used to look the category up, the
/// name that is emitted into the generated library and the numeric id that
/// is stored in the generated tables.
class Categories {
  /// Two-letter abbreviation, e.g. `"Lu"`; the key of [values].
  final String abbr;

  /// Constant name emitted into the generated library.
  final String name;

  /// Numeric id of the category.
  final int id;

  const Categories(this.abbr, this.name, this.id);

  // C*: other.
  static const Categories CN = const Categories("Cn", "NOT_ASSIGNED", 0);
  static const Categories CC = const Categories("Cc", "CONTROL", 1);
  static const Categories CF = const Categories("Cf", "FORMAT", 2);
  static const Categories CO = const Categories("Co", "PRIVATE_USE", 3);
  static const Categories CS = const Categories("Cs", "SURROGATE", 4);

  // L*: letters.
  static const Categories LL = const Categories("Ll", "LOWERCASE_LETTER", 5);
  static const Categories LM = const Categories("Lm", "MODIFIER_LETTER", 6);
  static const Categories LO = const Categories("Lo", "OTHER_LETTER", 7);
  static const Categories LT = const Categories("Lt", "TITLECASE_LETTER", 8);
  static const Categories LU = const Categories("Lu", "UPPERCASE_LETTER", 9);

  // M*: marks.
  static const Categories MC = const Categories("Mc", "SPACING_MARK", 10);
  // NOTE(review): "ENCOSING_MARK" looks like a misspelling of
  // "ENCLOSING_MARK". It is left untouched because the name ends up in the
  // public API of the generated library.
  static const Categories ME = const Categories("Me", "ENCOSING_MARK", 11);
  static const Categories MN = const Categories("Mn", "NONSPACING_MARK", 12);

  // N*: numbers.
  static const Categories ND = const Categories("Nd", "DECIMAL_NUMBER", 13);
  static const Categories NL = const Categories("Nl", "LETTER_NUMBER", 14);
  static const Categories NO = const Categories("No", "OTHER_NUMBER", 15);

  // P*: punctuation.
  static const Categories PC =
      const Categories("Pc", "CONNECTOR_PUNCTUATION", 16);
  static const Categories PD = const Categories("Pd", "DASH_PUNCTUATION", 17);
  static const Categories PE = const Categories("Pe", "CLOSE_PUNCTUATION", 18);
  static const Categories PF = const Categories("Pf", "FINAL_PUNCTUATION", 19);
  static const Categories PI =
      const Categories("Pi", "INITIAL_PUNCTUATION", 20);
  static const Categories PO = const Categories("Po", "OTHER_PUNCTUATION", 21);
  static const Categories PS = const Categories("Ps", "OPEN_PUNCTUATION", 22);

  // S*: symbols.
  static const Categories SC = const Categories("Sc", "CURRENCY_SYMBOL", 23);
  static const Categories SK = const Categories("Sk", "MODIFIER_SYMBOL", 24);
  static const Categories SM = const Categories("Sm", "MATH_SYMBOL", 25);
  static const Categories SO = const Categories("So", "OTHER_SYMBOL", 26);

  // Z*: separators.
  static const Categories ZL = const Categories("Zl", "LINE_SEPARATOR", 27);
  static const Categories ZP =
      const Categories("Zp", "PARAGRAPH_SEPARATOR", 28);
  static const Categories ZS = const Categories("Zs", "SPACE_SEPARATOR", 29);

  /// Every category keyed by its abbreviation, iterated in ascending id
  /// order. The generator relies on this order when it emits code.
  static final Map<String, Categories> values =
      new Map<String, Categories>.fromIterable(<Categories>[
    CN, CC, CF, CO, CS,
    LL, LM, LO, LT, LU,
    MC, ME, MN,
    ND, NL, NO,
    PC, PD, PE, PF, PI, PO, PS,
    SC, SK, SM, SO,
    ZL, ZP, ZS,
  ], key: (category) => category.abbr);

  @override
  String toString() => name;
}
class Generator { | |
// Bounds of the Unicode code space.
static const int MAX_VALUE = 0x10ffff;
static const int UNICODE_LENGTH = MAX_VALUE + 1;

// Key under which the UnicodeData.txt lines are handed to [generate].
static const String UNICODE_DATA = "UNICODE_DATA";

// Names of the table and of the helper functions in the generated library.
static const String _GENERAL_CATEGORIES = "generalCategories";
static const String _GENERATE_BOOL_GROUP = "_generateBoolGroup";
static const String _GENERATE_CATEGORY = "_generateCategory";
static const String _GENERATE_INT_GROUP = "_generateIntGroup";
static const String _GENERATE_INT_MAPPING = "_generateIntMapping";
static const String _TO_CASE = "_toCase";
static const String _TO_RUNE = "toRune";
static const String _TO_RUNES = "toRunes";

// Keys of [_caseMapping]; the generated mapping and method names are
// derived from them.
static const String _LOWERCASE = "lowercase";
static const String _TITLECASE = "titlecase";
static const String _UPPERCASE = "uppercase";
/// Skeleton of the generated library: a header comment naming the source URL
/// and the Unicode version, the `library` directive, and then the directive,
/// constant, variable and method sections that [_generateLibrary] fills in.
static final String _templateLibrary = ''' | |
// This library was created by the tool. | |
// Source: $UNICODE_DATA_URL | |
// Unicode Version: $VERSION | |
library {{NAME}}; | |
{{#DIRECTIVES}} | |
{{#CONSTANTS}} | |
{{#VARIABLES}} | |
{{#METHODS}} | |
'''; | |
/// Generated helper that turns a flat list of `start, end` pairs into a
/// frozen `SparseBoolList` whose length is the size of the code space.
static final String _templateMethodGenerateBoolGroup = ''' | |
SparseBoolList $_GENERATE_BOOL_GROUP(List<int> data) { | |
var list = new SparseBoolList(); | |
list.length = $UNICODE_LENGTH; | |
var length = data.length; | |
for (var i = 0; i < length; i += 2) { | |
var start = data[i + 0]; | |
var end = data[i + 1]; | |
list.addGroup(new GroupedRangeList<bool>(start, end, true)); | |
} | |
list.freeze(); | |
return list; | |
} | |
'''; | |
/// Generated helper that builds the frozen `SparseBoolList` of one category
/// id by copying the groups of the general-categories table whose key equals
/// that id.
static final String _templateMethodGenerateCategory = ''' | |
SparseBoolList $_GENERATE_CATEGORY(int category) { | |
var list = new SparseBoolList(); | |
list.length = $UNICODE_LENGTH; | |
for (var group in $_GENERAL_CATEGORIES.groups) { | |
if (group.key == category) { | |
list.addGroup(new GroupedRangeList<bool>(group.start, group.end, true)); | |
} | |
} | |
list.freeze(); | |
return list; | |
} | |
'''; | |
/// Generated helper that decodes the output of [_compressGroups]: it
/// optionally gunzips the data, accumulates the `start`/`end` deltas (keys
/// are read verbatim) and returns a frozen `SparseList<int>`.
static final String _templateMethodGenerateIntGroup = ''' | |
SparseList<int> $_GENERATE_INT_GROUP(List<int> data, bool isCompressed) { | |
if (isCompressed) { | |
data = GZIP.decoder.convert(data); | |
} | |
var list = new SparseList<int>(defaultValue: 0); | |
list.length = $UNICODE_LENGTH; | |
var length = data.length; | |
var start = 0; | |
var end = 0; | |
for (var i = 0; i < length; i+= 3) { | |
start += data[i + 0]; | |
end += data[i + 1]; | |
var key = data[i + 2]; | |
list.addGroup(new GroupedRangeList<int>(start, end, key)); | |
} | |
list.freeze(); | |
return list; | |
} | |
'''; | |
/// Generated helper that decodes the output of [_compressMapping]: it
/// optionally gunzips the data, accumulates the key/value deltas and returns
/// the pairs as an unmodifiable map.
static final String _templateMethodGenerateIntMapping = ''' | |
Map<int, int> $_GENERATE_INT_MAPPING(List<int> data, bool isCompressed) { | |
if (isCompressed) { | |
data = GZIP.decoder.convert(data); | |
} | |
var map = new HashMap<int, int>(); | |
var length = data.length; | |
var key = 0; | |
var value = 0; | |
for (var i = 0; i < length; i+= 2) { | |
key += data[i + 0]; | |
value += data[i + 1]; | |
map[key] = value; | |
} | |
return new UnmodifiableMapView<int, int>(map); | |
} | |
'''; | |
/// Generated predicate `is<NAME>(character)` that indexes the character set
/// named by `CHARACTER_SET`.
static final String _templateMethodIsCategory = ''' | |
bool is{{NAME}}(int character) => {{CHARACTER_SET}}[character]; | |
'''; | |
/// Generated helper that replaces every rune of a string that has an entry
/// in the given mapping and leaves the other runes unchanged.
static final String _templateMethodToCase = ''' | |
String $_TO_CASE(String string, Map<int, int> mapping) { | |
var runes = toRunes(string); | |
var length = runes.length; | |
for (var i = 0; i < length; i++) { | |
var character = mapping[runes[i]]; | |
if (character != null) { | |
runes[i] = character; | |
} | |
} | |
return new String.fromCharCodes(runes); | |
} | |
'''; | |
/// Generated function that returns the first code point of a string,
/// combining a leading surrogate pair into one value. The generated code
/// throws an `ArgumentError` for `null` and a `StateError` for an empty
/// string.
static final String _templateMethodToRune = ''' | |
int $_TO_RUNE(String string) { | |
if (string == null) { | |
throw new ArgumentError("string: \$string"); | |
} | |
var length = string.length; | |
if (length == 0) { | |
throw new StateError("An empty string contains no elements."); | |
} | |
var start = string.codeUnitAt(0); | |
if (length == 1) { | |
return start; | |
} | |
if ((start & 0xFC00) == 0xD800) { | |
var end = string.codeUnitAt(1); | |
if ((end & 0xFC00) == 0xDC00) { | |
return (0x10000 + ((start & 0x3FF) << 10) + (end & 0x3FF)); | |
} | |
} | |
return start; | |
} | |
'''; | |
/// Generated function that decodes the code units of a string into a list of
/// code points. Surrogate pairs are combined; a code unit that is not part of
/// a valid pair is copied through unchanged. The generated code throws an
/// `ArgumentError` for `null`.
static final String _templateMethodToRunes = ''' | |
List<int> $_TO_RUNES(String string) { | |
if (string == null) { | |
throw new ArgumentError("string: \$string"); | |
} | |
var length = string.length; | |
if (length == 0) { | |
return const <int>[]; | |
} | |
var runes = <int>[]; | |
runes.length = length; | |
var i = 0; | |
var pos = 0; | |
for ( ; i < length; pos++) { | |
var start = string.codeUnitAt(i); | |
i++; | |
if ((start & 0xFC00) == 0xD800 && i < length) { | |
var end = string.codeUnitAt(i); | |
if ((end & 0xFC00) == 0xDC00) { | |
runes[pos] = (0x10000 + ((start & 0x3FF) << 10) + (end & 0x3FF)); | |
i++; | |
} else { | |
runes[pos] = start; | |
} | |
} else { | |
runes[pos] = start; | |
} | |
} | |
runes.length = pos; | |
return runes; | |
} | |
'''; | |
/// Generated public case-conversion function: forwards the string and the
/// mapping named by `MAPPING` to the shared case helper.
static final String _templateMethodToXxxCase = ''' | |
String {{NAME}}(String string) => $_TO_CASE(string, {{MAPPING}}); | |
'''; | |
/// Declaration of one per-category character set, built from the category
/// id by the generated category helper.
static final String _templateCharacterSet = ''' | |
final SparseBoolList {{NAME}} = $_GENERATE_CATEGORY({{ID}}); | |
'''; | |
/// Declaration of one case-mapping table. The placeholder is spelled
/// `IS_COMRESSED`; the `assign` calls must use the same spelling.
static final String _templateMapping = ''' | |
final Map<int, int> {{NAME}} = $_GENERATE_INT_MAPPING({{DATA}}, {{IS_COMRESSED}}); | |
'''; | |
/// Declaration of a `SparseBoolList` built from `start, end` pairs.
// NOTE(review): no method visible in this file references this template.
static final String _templateSparseListBool = ''' | |
final SparseBoolList {{NAME}} = $_GENERATE_BOOL_GROUP({{DATA}}); | |
'''; | |
/// Declaration of a `SparseList<int>` built from delta-encoded triples. The
/// placeholder is spelled `IS_COMRESSED`; the `assign` calls must use the
/// same spelling.
static final String _templateSparseListInt = ''' | |
final SparseList<int> {{NAME}} = $_GENERATE_INT_GROUP({{DATA}}, {{IS_COMRESSED}}); | |
'''; | |
/// Whether the most recent GZIP round trip in [_compressGroups] or
/// [_compressMapping] failed to reproduce its input, in which case the data
/// was returned uncompressed.
///
/// The original note attributes such failures to a bug that has been present
/// in the Dart VM since 8 Sep 2014.
bool _bugInDartGzip;

/// Case-mapping tables (code point to code point), keyed by [_LOWERCASE],
/// [_TITLECASE] and [_UPPERCASE].
Map<String, Map<int, int>> _caseMapping;

/// Membership set of every category; filled by [_build].
Map<Categories, SparseBoolList> _categories;

/// Ranges of code points tagged with their category id; filled by [_build].
SparseList<int> _characters;

// Sections of the generated library, each a list of line blocks.
List<List<String>> _constants;
List<List<String>> _variables;
List<List<String>> _methods;
/// Generates the source lines of the `unicode` library.
///
/// [data] maps resource keys to their lines; only the entry stored under
/// [UNICODE_DATA] is read. The working collections are re-created on every
/// call.
List<String> generate(Map<String, List<String>> data) {
  // Fresh working state.
  _characters = new SparseList<int>(defaultValue: 0);
  _categories = <Categories, SparseBoolList>{};
  _caseMapping = <String, Map<int, int>>{};
  _constants = <List<String>>[];
  _variables = <List<String>>[];
  _methods = <List<String>>[];

  // Parse, analyse, then emit the sections in the original order.
  _build(_parseUnicodeData(data[UNICODE_DATA]));
  _generateConstants();
  _generateVariables();
  _generateMethods();
  return _generateLibrary("unicode");
}
/// Fills [_categories], [_caseMapping] and [_characters] from [characters].
///
/// [characters] is indexed by code point; `null` entries are skipped. Throws
/// a [StateError] when a character names a category that is not in
/// [Categories.values].
void _build(List<Character> characters) {
  var toLower = <int, int>{};
  var toTitle = <int, int>{};
  var toUpper = <int, int>{};
  // Insertion order decides the order of the generated mappings.
  _caseMapping[_LOWERCASE] = toLower;
  _caseMapping[_TITLECASE] = toTitle;
  _caseMapping[_UPPERCASE] = toUpper;

  // One empty set per known category.
  for (var category in Categories.values.values) {
    _categories[category] = new SparseBoolList()..length = UNICODE_LENGTH;
  }

  for (var character in characters) {
    if (character == null) {
      continue;
    }
    var category = Categories.values[character.category];
    if (category == null) {
      throw new StateError(
          "Unknown character category: ${character.category}");
    }
    var code = character.code;
    _categories[category][code] = true;

    // Case mapping
    if (character.lowercase != null) {
      toLower[code] = character.lowercase;
    }
    if (character.titlecase != null) {
      toTitle[code] = character.titlecase;
    }
    if (character.uppercase != null) {
      toUpper[code] = character.uppercase;
    }
  }

  // Merge the per-category sets into one table of ranges tagged by id.
  for (var category in _categories.keys) {
    var id = category.id;
    for (var range in _categories[category].groups) {
      _characters
          .addGroup(new GroupedRangeList<int>(range.start, range.end, id));
    }
  }
}
/// Delta-encodes [groups], a flat list of `start, end, key` triples, and
/// GZIP-compresses the result when that is lossless.
///
/// Each start and end is stored as the difference from the previous triple
/// (the first one relative to 0); keys are stored unchanged. Updates
/// [_bugInDartGzip] as described on [_gzipIfLossless].
List<int> _compressGroups(List<int> groups) {
  var deltas = <int>[];
  var previousStart = 0;
  var previousEnd = 0;
  // Compression phase #1: delta encoding.
  for (var i = 0; i < groups.length; i += 3) {
    deltas.add(groups[i] - previousStart);
    deltas.add(groups[i + 1] - previousEnd);
    deltas.add(groups[i + 2]);
    previousStart = groups[i];
    previousEnd = groups[i + 1];
  }
  // Compression phase #2: GZIP, if it round-trips.
  return _gzipIfLossless(deltas);
}

/// Delta-encodes [mapping], a flat list of `key, value` pairs, and
/// GZIP-compresses the result when that is lossless.
///
/// Each key and value is stored as the difference from the previous pair
/// (the first one relative to 0). Updates [_bugInDartGzip] as described on
/// [_gzipIfLossless].
List<int> _compressMapping(List<int> mapping) {
  var deltas = <int>[];
  var previousKey = 0;
  var previousValue = 0;
  // Compression phase #1: delta encoding.
  for (var i = 0; i < mapping.length; i += 2) {
    deltas.add(mapping[i] - previousKey);
    deltas.add(mapping[i + 1] - previousValue);
    previousKey = mapping[i];
    previousValue = mapping[i + 1];
  }
  // Compression phase #2: GZIP, if it round-trips.
  return _gzipIfLossless(deltas);
}

/// Returns [data] GZIP-compressed when decompressing the result reproduces
/// [data] exactly, and [data] itself otherwise.
///
/// The outcome is recorded in [_bugInDartGzip]: `true` means the round trip
/// failed and the uncompressed list was returned, so callers must emit
/// `isCompressed: false` for it.
List<int> _gzipIfLossless(List<int> data) {
  var compressed = GZIP.encoder.convert(data);
  var restored = GZIP.decoder.convert(compressed);
  // NOTE(review): the deltas are arbitrary ints, which may exceed 255 or be
  // negative, while GZIP is a byte codec. Presumably that is why the round
  // trip can fail; confirm before relying on compression here.
  //
  // Compare lengths first so a shorter decoder output is reported as a
  // mismatch instead of raising a RangeError.
  var lossless = restored.length == data.length;
  for (var i = 0; lossless && i < data.length; i++) {
    lossless = restored[i] == data[i];
  }
  _bugInDartGzip = !lossless;
  return lossless ? compressed : data;
}
/// Adds one `const int NAME = id;` line per category, followed by an empty
/// line, to [_constants].
void _generateConstants() {
  var declarations = new List<String>.from(Categories.values.values
      .map((category) => "const int ${category.name} = ${category.id};"));
  declarations.add("");
  _constants.add(declarations);
}
/// Renders [_templateLibrary] for a library called [name] and returns the
/// resulting lines.
///
/// The directive lines are assigned one by one in a fixed order, then the
/// collected constants, methods and variables.
List<String> _generateLibrary(String name) {
  const directives = const <String>[
    "import \"dart:collection\";",
    "import \"dart:io\";",
    "import \"package:lists/lists.dart\";",
    "",
  ];
  var library = new TemplateBlock(_templateLibrary);
  library.assign("NAME", name);
  for (var directive in directives) {
    library.assign("#DIRECTIVES", directive);
  }
  library.assign("#CONSTANTS", _constants);
  library.assign("#METHODS", _methods);
  library.assign("#VARIABLES", _variables);
  return library.process();
}
/// Appends the rendered bool-group helper to [_methods].
void _generateMethodGenerateBoolGroup() {
  _methods.add(new TemplateBlock(_templateMethodGenerateBoolGroup).process());
}
/// Appends the rendered category helper to [_methods].
void _generateMethodGenerateCategory() {
  _methods.add(new TemplateBlock(_templateMethodGenerateCategory).process());
}
/// Appends the rendered int-group helper to [_methods].
void _generateMethodGenerateIntGroup() {
  _methods.add(new TemplateBlock(_templateMethodGenerateIntGroup).process());
}
/// Appends the rendered int-mapping helper to [_methods].
void _generateMethodGenerateIntMapping() {
  _methods
      .add(new TemplateBlock(_templateMethodGenerateIntMapping).process());
}
/// Emits every generated function into [_methods].
///
/// The list order is the order in which the functions appear in the output.
void _generateMethods() {
  var emitters = <Function>[
    _generateMethodIsCategory,
    _generateMethodToXxxCase,
    _generateMethodToRune,
    _generateMethodToRunes,
    _generateMethodGenerateBoolGroup,
    _generateMethodGenerateCategory,
    _generateMethodGenerateIntGroup,
    _generateMethodGenerateIntMapping,
    _generateMethodToCase,
  ];
  for (var emit in emitters) {
    emit();
  }
}
/// Appends one `is<Category>` predicate per category to [_methods].
void _generateMethodIsCategory() {
  var prototype = new TemplateBlock(_templateMethodIsCategory);
  for (var category in Categories.values.values) {
    var method = prototype.clone();
    method.assign("NAME", camelize(category.name));
    method.assign("CHARACTER_SET", _getCharacterSetName(category));
    _methods.add(method.process());
  }
}
/// Appends the rendered shared case-conversion helper to [_methods].
void _generateMethodToCase() {
  _methods.add(new TemplateBlock(_templateMethodToCase).process());
}
/// Appends the rendered single-rune decoder to [_methods].
void _generateMethodToRune() {
  _methods.add(new TemplateBlock(_templateMethodToRune).process());
}
/// Appends the rendered rune-list decoder to [_methods].
void _generateMethodToRunes() {
  _methods.add(new TemplateBlock(_templateMethodToRunes).process());
}
/// Appends one case-conversion function per entry of [_caseMapping] to
/// [_methods].
///
/// The function name is the camelized form of `to_<key>`; it delegates to
/// the mapping named by [_getSimpleCaseMappingName].
void _generateMethodToXxxCase() {
  var prototype = new TemplateBlock(_templateMethodToXxxCase);
  for (var key in _caseMapping.keys) {
    var method = prototype.clone();
    method.assign("NAME", camelize("to_${key}", true));
    method.assign("MAPPING", _getSimpleCaseMappingName(key));
    _methods.add(method.process());
  }
}
/// Appends the declaration of the general-categories table to [_variables].
///
/// The groups of [_characters] are flattened to `start, end, key` triples
/// and passed through [_compressGroups]. The compression flag written into
/// the declaration is the negation of [_bugInDartGzip].
void _generateVariableCategories() {
  var triples = <int>[];
  for (var range in _characters.groups) {
    triples..add(range.start)..add(range.end)..add(range.key);
  }
  var payload = _compressGroups(triples);
  var variable = new TemplateBlock(_templateSparseListInt);
  variable.assign("IS_COMRESSED", !_bugInDartGzip);
  variable.assign("DATA", "[${payload.join(", ")}]");
  variable.assign("NAME", _GENERAL_CATEGORIES);
  _variables.add(variable.process());
}
/// Emits every generated top-level variable into [_variables]: the
/// categories table first, then the character sets, then the case mappings.
void _generateVariables() {
  var emitters = <Function>[
    _generateVariableCategories,
    _generateVariableCharacterSet,
    _generateVariableSimpleCaseMapping,
  ];
  for (var emit in emitters) {
    emit();
  }
}
/// Appends one character-set declaration per key of [_categories] to
/// [_variables].
void _generateVariableCharacterSet() {
  var prototype = new TemplateBlock(_templateCharacterSet);
  for (var category in _categories.keys) {
    var variable = prototype.clone();
    variable.assign("NAME", _getCharacterSetName(category));
    variable.assign("ID", category.id);
    _variables.add(variable.process());
  }
}
/// Appends one case-mapping declaration per entry of [_caseMapping] to
/// [_variables].
///
/// Each map is flattened to `key, value` pairs and passed through
/// [_compressMapping]. The compression flag written into the declaration is
/// the negation of [_bugInDartGzip].
void _generateVariableSimpleCaseMapping() {
  var prototype = new TemplateBlock(_templateMapping);
  for (var name in _caseMapping.keys) {
    var pairs = <int>[];
    _caseMapping[name].forEach((from, to) {
      pairs..add(from)..add(to);
    });
    var payload = _compressMapping(pairs);
    var variable = prototype.clone();
    variable.assign("IS_COMRESSED", !_bugInDartGzip);
    variable.assign("DATA", "[${payload.join(", ")}]");
    variable.assign("NAME", _getSimpleCaseMappingName(name));
    _variables.add(variable.process());
  }
}
/// Returns the generated variable name of [category]'s character set: the
/// camelized form of `<name>_Characters`.
String _getCharacterSetName(Categories category) {
  return camelize("${category.name}_Characters", true);
}
/// Returns the generated variable name of the case mapping called [name]:
/// the camelized form of `simple_<name>_mapping`.
String _getSimpleCaseMappingName(String name) {
  return camelize("simple_${name}_mapping", true);
}
/// Parses the lines of `UnicodeData.txt` into a list indexed by code point.
///
/// The result has [UNICODE_LENGTH] slots; code points without a record stay
/// `null`. Blank lines are ignored.
///
/// `UnicodeData.txt` abbreviates large blocks as a pair of records named
/// `<..., First>` and `<..., Last>`. Every code point between such a pair
/// receives a copy of the `Last` record with its own code, so the whole
/// range carries the category of its end points.
List<Character> _parseUnicodeData(List<String> lines) {
  var characters = new List<Character>(UNICODE_LENGTH);
  // Code of the pending `<..., First>` record, or null outside a range.
  int rangeStart;
  for (var line in lines) {
    if (line.trim().isEmpty) {
      continue;
    }
    var parts = line.split(";");
    // Build the record once; the original built it twice and dropped one.
    var character = new Character(parts);
    var code = character.code;
    var name = parts[1];
    if (name.endsWith(", First>")) {
      rangeStart = code;
    } else if (rangeStart != null && name.endsWith(", Last>")) {
      for (var inner = rangeStart + 1; inner < code; inner++) {
        var fields = new List<String>.from(parts);
        fields[0] = inner.toRadixString(16).toUpperCase();
        characters[inner] = new Character(fields);
      }
      rangeStart = null;
    }
    characters[code] = character;
  }
  return characters;
}
} | |
/// One record of `UnicodeData.txt`, already split on `;`.
///
/// Field 0 is the hexadecimal code point and field 2 the general category.
/// Fields 12, 13 and 14 are the optional hexadecimal uppercase, lowercase
/// and titlecase mappings; an empty field leaves the property `null`.
class Character {
  int code;
  List<String> data;
  String category;
  int uppercase;
  int titlecase;
  int lowercase;

  Character(this.data) {
    code = int.parse(data[0], radix: 16);
    category = data[2];
    uppercase = _hexOrNull(data[12]);
    lowercase = _hexOrNull(data[13]);
    titlecase = _hexOrNull(data[14]);
  }

  /// Parses [field] as hexadecimal, or returns `null` when it is empty.
  static int _hexOrNull(String field) =>
      field.isEmpty ? null : int.parse(field, radix: 16);
}
/// A text resource that is read from a local file when that file exists and
/// downloaded from [url] otherwise.
class Resource {
  /// Lines of the resource; set once [load] completes.
  List<String> data;

  /// Path of the local copy.
  String filename;

  /// Download location used when the local copy is missing.
  String url;

  Resource({this.filename, this.url});

  /// Loads the resource, stores its lines in [data] and completes with them.
  Future<List<String>> load() {
    var local = new File(filename);
    Future<List<String>> lines;
    if (local.existsSync()) {
      lines = local.readAsLines();
    } else {
      lines = http.read(Uri.parse(url)).then(_splitLines);
    }
    return lines.then((result) => data = result);
  }

  /// Splits [text] into lines, accepting `\r\n`, `\r` and `\n` as line
  /// breaks, and drops one trailing empty line.
  static List<String> _splitLines(String text) {
    var lines =
        text.replaceAll("\r\n", "\n").replaceAll("\r", "\n").split("\n");
    if (lines.last.isEmpty) {
      lines.removeLast();
    }
    return lines;
  }
}