// blob: 3a105794cc3c0506ffbe9ed8770d0e178bb97e1b [file] [log] [blame]
// Copyright (c) 2016, Filip Hracek. All rights reserved. Use of this source
// code is governed by a BSD-style license that can be found in the LICENSE
// file.
library html_unescape.base;
import 'dart:convert';
import 'dart:math';
// Character constants.
//
// The code-unit constants are compared against `codeUnitAt` results when a
// numeric character reference is recognised; the length constants are the
// sizes of the shortest possible numeric references.
const int _hashCodeUnit = 35; // #
const int _xCodeUnit = 120; // x
const int _minDecimalEscapeLength = 4; // &#0;
const int _minHexadecimalEscapeLength = 5; // &#x0;
/// Base class for converters that turn HTML-escaped text back into plain text.
///
/// Subclasses provide the table of named character references through the
/// parallel lists [keys] and [values]; numeric references (`&#123;` and
/// `&#xff;`) are decoded by this class.
abstract class HtmlUnescapeBase extends Converter<String, String> {
  /// Largest code point accepted by [String.fromCharCode].
  static const int _maxCodePoint = 0x10FFFF;

  /// Number of characters, counted from an ampersand, that [convert] looks at
  /// when trying to recognise a reference.
  int _chunkLength;

  /// Named references to recognise.
  ///
  /// Each key is matched with `startsWith` against the text beginning at an
  /// ampersand, so a key only ever matches if it starts with `&`.
  List<String> get keys;

  /// Replacement texts; `values[i]` is written in place of `keys[i]`.
  List<String> get values;

  /// Upper bound on the length of a key, used to size the lookahead window.
  ///
  /// Presumably the length of the longest entry in [keys] - confirm against
  /// the concrete subclasses.
  int get maxKeyLength;

  HtmlUnescapeBase() {
    // The window must be able to hold at least the shortest hexadecimal
    // escape, even if every named key is shorter than that.
    _chunkLength = max(maxKeyLength, _minHexadecimalEscapeLength);
  }

  /// Converts from HTML-escaped [data] to unescaped string.
  ///
  /// Text outside of references is copied verbatim. An ampersand that does not
  /// start a recognised reference, or a numeric reference whose value is not a
  /// valid code point, is copied through unchanged.
  @override
  String convert(String data) {
    // Return early if possible.
    if (data.indexOf('&') == -1) return data;

    StringBuffer buf = new StringBuffer();
    int offset = 0;

    while (true) {
      int nextAmp = data.indexOf('&', offset);
      if (nextAmp == -1) {
        // No more references: copy the rest of the string and stop.
        buf.write(data.substring(offset));
        break;
      }

      // Copy the literal text in front of the ampersand.
      buf.write(data.substring(offset, nextAmp));
      offset = nextAmp;

      var chunk =
          data.substring(offset, min(data.length, offset + _chunkLength));

      // Try &#123; and &#xff;
      //
      // `>=` (not `>`): a minimal escape such as `&#9;` sitting at the very
      // end of the input yields a chunk of exactly the minimum length.
      if (chunk.length >= _minDecimalEscapeLength &&
          chunk.codeUnitAt(1) == _hashCodeUnit) {
        int nextSemicolon = chunk.indexOf(';');
        if (nextSemicolon != -1) {
          var hex = chunk.codeUnitAt(2) == _xCodeUnit;
          var str = chunk.substring(hex ? 3 : 2, nextSemicolon);
          int ord = int.parse(str, radix: hex ? 16 : 10, onError: (_) => -1);
          // Reject parse failures and values String.fromCharCode would throw
          // on (negative numbers, anything above U+10FFFF).
          if (ord >= 0 && ord <= _maxCodePoint) {
            buf.write(new String.fromCharCode(ord));
            offset += nextSemicolon + 1;
            continue;
          }
        }
      }

      // Try &nbsp;
      var replaced = false;
      for (int i = 0; i < keys.length; i++) {
        var key = keys[i];
        if (chunk.startsWith(key)) {
          buf.write(values[i]);
          offset += key.length;
          replaced = true;
          break;
        }
      }
      if (!replaced) {
        // Not a reference after all: keep the ampersand and move on.
        buf.write('&');
        offset += 1;
      }
    }

    return buf.toString();
  }

  /// Returns a sink that unescapes the slices added to it and forwards the
  /// result to [sink].
  ///
  /// [sink] is wrapped in a [StringConversionSink] if it is not one already.
  @override
  StringConversionSink startChunkedConversion(Sink<String> sink) {
    StringConversionSink stringSink = sink is StringConversionSink
        ? sink
        : new StringConversionSink.from(sink);
    return new _HtmlUnescapeSink(stringSink, this);
  }
}
/// Sink used for chunked conversion: unescapes each slice added to it with
/// [_unescape] and forwards the result to [_sink].
///
/// A reference may be split across two slices, so the tail of a slice that
/// starts at its last ampersand is held back in [_carry] until enough of the
/// following input has arrived.
class _HtmlUnescapeSink extends StringConversionSinkBase {
  final StringConversionSink _sink;
  final HtmlUnescapeBase _unescape;

  /// The carry-over from the previous chunk.
  ///
  /// If the previous slice ended with ampersand too close to end,
  /// then the next slice may continue the reference.
  String _carry;

  _HtmlUnescapeSink(this._sink, this._unescape);

  /// Unescapes `chunk.substring(start, end)`, prefixed by any pending carry.
  ///
  /// Throws a [RangeError] if `start..end` is not a valid range of [chunk].
  /// When [isLast] is true the sink is closed afterwards.
  @override
  void addSlice(String chunk, int start, int end, bool isLast) {
    end = RangeError.checkValidRange(start, end, chunk.length);
    // If the chunk is empty, it's probably because it's the last one.
    // Handle that here, so we know the range is non-empty below.
    if (start >= end) {
      if (isLast) close();
      return;
    }
    if (_carry != null) {
      // Re-attach the held-back tail so a split reference is seen whole.
      chunk = _carry + chunk.substring(start, end);
      start = 0;
      end = chunk.length;
      _carry = null;
    }
    _convert(chunk, start, end, isLast);
    if (isLast) close();
  }

  /// Flushes any pending carry through the converter and closes [_sink].
  @override
  void close() {
    if (_carry != null) {
      _sink.add(_unescape.convert(_carry));
      _carry = null;
    }
    _sink.close();
  }

  /// Converts the non-empty range `start..end` of [chunk].
  ///
  /// Everything is emitted immediately unless more input may follow and the
  /// last ampersand is closer than `maxKeyLength` to [end]; in that case the
  /// text from that ampersand on is stored in [_carry] instead.
  void _convert(String chunk, int start, int end, bool isLast) {
    // `end - 1` keeps the search inside the slice; `end > start >= 0` here.
    int lastAmp = chunk.lastIndexOf('&', end - 1);
    bool hasAmp = lastAmp >= start;

    // The last reference is complete enough to decide on if no more input can
    // follow, or if a full key's worth of characters already follows it.
    // Earlier ampersands have even more lookahead, so they are safe too.
    if (isLast || !hasAmp || lastAmp + _unescape.maxKeyLength <= end) {
      _sink.add(_unescape.convert(chunk.substring(start, end)));
      _carry = null;
      return;
    }

    // Emit what precedes the possibly incomplete reference and hold the rest.
    if (lastAmp > start) {
      _sink.add(_unescape.convert(chunk.substring(start, lastAmp)));
    }
    _carry = chunk.substring(lastAmp, end);
  }
}