// Copyright (c) 2017, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
library _fe_analyzer_shared.scanner.string_canonicalizer;
import 'dart:convert';
final _StringCanonicalizer _canonicalizer = new _StringCanonicalizer();
/// Returns [string] or a canonicalized version of it if our heuristics indicate
/// it may be a good idea to canonicalize it.
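///
/// A usage sketch; [String.fromCharCodes] is used only to obtain two distinct
/// but equal [String] instances:
///
/// ```dart
/// const codes = [0x66, 0x6f, 0x6f]; // 'foo'
/// final a = considerCanonicalizeString(String.fromCharCodes(codes));
/// final b = considerCanonicalizeString(String.fromCharCodes(codes));
/// assert(identical(a, b)); // Equal short strings share a single instance.
/// ```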
String considerCanonicalizeString(String string) {
// It is very unlikely that large strings will appear repeatedly (mostly they
// are long comments or string literals).
// To avoid putting them into the cache and having to prune them out again
// later, we simply return them as-is.
if (string.length > 250) return string;
return _canonicalizer.canonicalizeString(string);
}
/// Returns a sub-string of [string] or a canonicalized version of it if our
/// heuristics indicate it may be a good idea to canonicalize it.
///
/// The sub-string starts at [start] (inclusive) and ends at [end]
/// (exclusive).
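///
/// A usage sketch, canonicalizing an identifier slice of a larger source
/// string (the indices below are illustrative):
///
/// ```dart
/// const source = 'var myVariable = 1;';
/// final name = considerCanonicalizeSubString(source, 4, 14); // 'myVariable'
/// ```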
String considerCanonicalizeSubString(String string, int start, int end) {
// It is very unlikely that large strings will appear repeatedly (mostly they
// are long comments or string literals).
// To avoid putting them into the cache and having to prune them out again
// later, we simply return them as-is.
if ((end - start) > 250) return string.substring(start, end);
return _canonicalizer.canonicalizeSubString(string, start, end);
}
/// Returns a canonicalized version of [string].
String canonicalizeString(String string) {
return _canonicalizer.canonicalizeString(string);
}
/// Returns a canonicalized version of a sub-string of [string].
///
/// The sub-string starts at [start] (inclusive) and ends at [end]
/// (exclusive).
String canonicalizeSubString(String string, int start, int end) {
return _canonicalizer.canonicalizeSubString(string, start, end);
}
/// Returns a canonicalized version of the UTF-8 decoded string from [bytes].
///
/// The string is UTF-8 decoded from [bytes], starting at [start] (inclusive)
/// and ending at [end] (exclusive).
///
/// It may avoid the overhead of UTF-8 decoding if the string is already in the
/// cache.
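///
/// A minimal sketch; `utf8.encode` from `dart:convert` stands in for however
/// the caller obtained the byte buffer:
///
/// ```dart
/// final bytes = utf8.encode('myVariable = 1;');
/// final name = canonicalizeUtf8SubString(bytes, 0, 10, /* isAscii = */ true);
/// assert(name == 'myVariable');
/// ```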
String canonicalizeUtf8SubString(
List<int> bytes, int start, int end, bool isAscii) {
return _canonicalizer.canonicalizeBytes(bytes, start, end, isAscii);
}
/// Will clear the string canonicalization cache.
void clearStringCanonicalizationCache() {
_canonicalizer.clear();
}
/// Will prune the string canonicalization cache if its current memory
/// consumption is estimated to be larger than [allowGrowthTo].
///
/// An attempt is made to keep most frequently used strings in the cache while
/// maintaining the memory limit.
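///
/// For example, a host could cap the cache between batches of work (the limit
/// below is illustrative):
///
/// ```dart
/// pruneStringCanonicalizationCache(1024 * 1024); // Prune once above ~1 MiB.
/// ```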
void pruneStringCanonicalizationCache([int allowGrowthTo = 5 * 1024 * 1024]) {
_canonicalizer.pruneTo(allowGrowthTo);
}
/// Decode UTF-8 without canonicalizing it.
String decodeString(List<int> bytes, int start, int end, bool isAscii) {
return isAscii
? new String.fromCharCodes(bytes, start, end)
: const Utf8Decoder(allowMalformed: true).convert(bytes, start, end);
}
abstract class _Node {
final String payload;
_Node? next;
_Node(this.payload, this.next);
int get hash;
}
class _StringNode extends _Node {
int usageCount = 1;
_StringNode(super.payload, super.next);
@override
int get hash =>
_StringCanonicalizer.hashString(payload, /* start = */ 0, payload.length);
// On a 64-bit Dart VM the size of
// * [_StringNode] itself is 32 bytes
// * [String] is 16 bytes plus the actual string data.
//
// It's an estimate that may overestimate (e.g. on 32-bit architectures) or
// underestimate (if the payload is not Latin-1 and therefore uses two bytes
// per code unit) - but it is reasonably precise for our purpose.
int get estimatedMemoryConsumption => 32 + (16 + payload.length);
}
class _Utf8Node extends _Node {
final List<int> data;
final int start;
final int end;
_Utf8Node(this.data, this.start, this.end, String payload, _Node? next)
: super(payload, next);
@override
int get hash => _StringCanonicalizer.hashBytes(data, start, end);
}
/// A hash table for triples:
/// (list of bytes, start, end) --> canonicalized string
/// Using triples avoids allocating string slices before checking if they
/// are canonical.
///
/// Gives about 3% speedup on dart2js.
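///
/// Collisions are resolved by chaining nodes through [_Node.next]; the table
/// doubles in size (see [rehash]) once [_count] exceeds [_size].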
class _StringCanonicalizer {
/// Mask away top bits to keep hash calculation within 32-bit SMI range.
static const int MASK = 16 * 1024 * 1024 - 1;
static const int INITIAL_SIZE = 8 * 1024;
/// Linear size of the hash table.
int _size = INITIAL_SIZE;
/// Number of items in the hash table.
int _count = 0;
/// Number of [_StringNode]s in the cache.
int _stringCount = 0;
/// Number of [_Utf8Node]s in the cache.
int _utf8StringCount = 0;
/// Memory consumption of [_StringNode]s.
int _estimatedStringMemoryConsumption = 0;
/// The table itself.
List<_Node?> _nodes =
new List<_Node?>.filled(INITIAL_SIZE, /* fill = */ null);
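/// DJB2-style rolling hash over the bytes of [data] from [start] (inclusive)
/// to [end] (exclusive), masked with [MASK] to stay within SMI range.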
static int hashBytes(List<int> data, int start, int end) {
int h = 5381;
for (int i = start; i < end; i++) {
h = ((h << 5) + h + data[i]) & MASK;
}
return h;
}
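/// Same DJB2-style hash as [hashBytes], computed over the UTF-16 code units
/// of [data] from [start] (inclusive) to [end] (exclusive).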
static int hashString(String data, int start, int end) {
int h = 5381;
for (int i = start; i < end; i++) {
h = ((h << 5) + h + data.codeUnitAt(i)) & MASK;
}
return h;
}
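/// Doubles the table size and re-links every node into its new bucket; called
/// by the canonicalize methods once [_count] exceeds [_size].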
void rehash() {
int newSize = _size * 2;
List<_Node?> newNodes = new List<_Node?>.filled(newSize, /* fill = */ null);
for (int i = 0; i < _size; i++) {
_Node? t = _nodes[i];
while (t != null) {
_Node? n = t.next;
int newIndex = t.hash & (newSize - 1);
_Node? s = newNodes[newIndex];
t.next = s;
newNodes[newIndex] = t;
t = n;
}
}
_size = newSize;
_nodes = newNodes;
}
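/// Returns the canonical string for the UTF-8 encoded bytes of [data] from
/// [start] (inclusive) to [end] (exclusive), decoding and inserting a new
/// [_Utf8Node] on a cache miss.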
String canonicalizeBytes(List<int> data, int start, int end, bool asciiOnly) {
if (_count > _size) rehash();
final int index = hashBytes(data, start, end) & (_size - 1);
_Node? s = _nodes[index];
_Node? t = s;
int len = end - start;
while (t != null) {
if (t is _Utf8Node) {
final List<int> tData = t.data;
if (t.end - t.start == len) {
int i = start, j = t.start;
while (i < end && data[i] == tData[j]) {
i++;
j++;
}
if (i == end) {
return t.payload;
}
}
}
t = t.next;
}
return insertUtf8Node(
index, s, data, start, end, decodeString(data, start, end, asciiOnly));
}
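/// Returns the canonical string equal to `data.substring(start, end)`,
/// allocating that substring only on a cache miss.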
String canonicalizeSubString(String data, int start, int end) {
final int len = end - start;
if (start == 0 && data.length == len) {
return canonicalizeString(data);
}
if (_count > _size) rehash();
final int index = hashString(data, start, end) & (_size - 1);
final _Node? s = _nodes[index];
_Node? t = s;
while (t != null) {
if (t is _StringNode) {
final String tData = t.payload;
if (tData.length == len && data.startsWith(tData, start)) {
t.usageCount++;
return tData;
}
}
t = t.next;
}
return insertStringNode(index, s, data.substring(start, end));
}
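/// Returns the canonical string equal to [data], inserting [data] itself on a
/// cache miss.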
String canonicalizeString(String data) {
if (_count > _size) rehash();
final int index =
hashString(data, /* start = */ 0, data.length) & (_size - 1);
final _Node? s = _nodes[index];
_Node? t = s;
while (t != null) {
if (t is _StringNode) {
final String tData = t.payload;
if (identical(data, tData) || data == tData) {
t.usageCount++;
return tData;
}
}
t = t.next;
}
return insertStringNode(index, s, data);
}
String insertStringNode(int index, _Node? next, String value) {
final _StringNode newNode = new _StringNode(value, next);
_nodes[index] = newNode;
_count++;
_stringCount++;
_estimatedStringMemoryConsumption += newNode.estimatedMemoryConsumption;
return value;
}
String insertUtf8Node(int index, _Node? next, List<int> buffer, int start,
int end, String value) {
final _Utf8Node newNode = new _Utf8Node(buffer, start, end, value, next);
_nodes[index] = newNode;
_count++;
_utf8StringCount++;
return value;
}
void clear() {
initializeWithSize(INITIAL_SIZE);
}
void initializeWithSize(int size) {
_size = size;
_nodes = new List<_Node?>.filled(_size, /* fill = */ null);
_count = 0;
_utf8StringCount = 0;
_stringCount = 0;
_estimatedStringMemoryConsumption = 0;
}
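/// Prunes the cache when needed: all [_Utf8Node]s are dropped and only the
/// most frequently used [_StringNode]s are kept, up to roughly half of
/// [allowGrowthTo] bytes of estimated memory.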
void pruneTo(int allowGrowthTo) {
// If we're already within the limit and there are no [_Utf8Node]s around
// (which may hold on to very large Uint8List buffers), we have nothing to
// do.
if (_estimatedStringMemoryConsumption < allowGrowthTo &&
_utf8StringCount == 0) {
return;
}
// If there are only [_Utf8Node]s around, we clear the entire cache.
if (_stringCount == 0) {
clear();
return;
}
// Find all [_StringNode]s, sort them by usage, and re-insert the most used
// ones into a new cache until we've reached half of [allowGrowthTo] -
// leaving some headroom before we have to prune again.
final List<_StringNode> stringNodes = new List<_StringNode>.filled(
_stringCount, new _StringNode('dummy', /* next = */ null));
int writeIndex = 0;
for (int i = 0; i < _nodes.length; ++i) {
_Node? node = _nodes[i];
while (node != null) {
final _Node? nextNode = node.next;
if (node is _StringNode) {
stringNodes[writeIndex++] = node;
node.next = null;
}
node = nextNode;
}
}
stringNodes.sort((_StringNode a, _StringNode b) {
return b.usageCount - a.usageCount;
});
int sum = 0;
int lastIndex = 0;
for (int i = 0; i < stringNodes.length; ++i) {
final _StringNode node = stringNodes[i];
sum += node.estimatedMemoryConsumption;
if (sum >= (allowGrowthTo ~/ 2) || node.usageCount < 2) {
lastIndex = i;
break;
}
}
final int newSize = (1 << lastIndex.bitLength);
initializeWithSize(newSize);
for (int i = lastIndex; i >= 0; --i) {
final _StringNode newNode = stringNodes[i];
final int index = newNode.hash & (_size - 1);
newNode.next = _nodes[index];
_nodes[index] = newNode;
_count++;
_stringCount++;
_estimatedStringMemoryConsumption += newNode.estimatedMemoryConsumption;
}
}
}