blob: 07c1f5b45c25e0060e695722aeda470b36bf2ee2 [file] [log] [blame]
//===--- Punycode.cpp - Unicode to Punycode transcoding -------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
#include "swift/Basic/LLVM.h"
#include "swift/Basic/Punycode.h"
#include <vector>
#include <cstdint>
using namespace swift;
using namespace Punycode;
// RFC 3492
// Section 5: Parameter values for Punycode
static const int base = 36;
static const int tmin = 1;
static const int tmax = 26;
static const int skew = 38;
static const int damp = 700;
static const int initial_bias = 72;
static const uint32_t initial_n = 128;
static const char delimiter = '_';
static char digit_value(int digit) {
assert(digit < base && "invalid punycode digit");
if (digit < 26)
return 'a' + digit;
return 'A' - 26 + digit;
}
static int digit_index(char value) {
if (value >= 'a' && value <= 'z')
return value - 'a';
if (value >= 'A' && value <= 'J')
return value - 'A' + 26;
return -1;
}
static bool isValidUnicodeScalar(uint32_t S) {
// Also accept the range of 0xD800 - 0xD880, which is used for non-symbol
// ASCII characters.
return (S < 0xD880) || (S >= 0xE000 && S <= 0x1FFFFF);
}
// Section 6.1: Bias adaptation function
static int adapt(int delta, int numpoints, bool firsttime) {
if (firsttime)
delta = delta / damp;
else
delta = delta / 2;
delta += delta / numpoints;
int k = 0;
while (delta > ((base - tmin) * tmax) / 2) {
delta /= base - tmin;
k += base;
}
return k + (((base - tmin + 1) * delta) / (delta + skew));
}
// Section 6.2: Decoding procedure
bool Punycode::decodePunycode(StringRef InputPunycode,
std::vector<uint32_t> &OutCodePoints) {
OutCodePoints.clear();
OutCodePoints.reserve(InputPunycode.size());
// -- Build the decoded string as UTF32 first because we need random access.
uint32_t n = initial_n;
int i = 0;
int bias = initial_bias;
/// let output = an empty string indexed from 0
// consume all code points before the last delimiter (if there is one)
// and copy them to output,
size_t lastDelimiter = InputPunycode.find_last_of(delimiter);
if (lastDelimiter != StringRef::npos) {
for (char c : InputPunycode.slice(0, lastDelimiter)) {
// fail on any non-basic code point
if (static_cast<unsigned char>(c) > 0x7f)
return true;
OutCodePoints.push_back(c);
}
// if more than zero code points were consumed then consume one more
// (which will be the last delimiter)
InputPunycode =
InputPunycode.slice(lastDelimiter + 1, InputPunycode.size());
}
while (!InputPunycode.empty()) {
int oldi = i;
int w = 1;
for (int k = base; ; k += base) {
// consume a code point, or fail if there was none to consume
if (InputPunycode.empty())
return true;
char codePoint = InputPunycode.front();
InputPunycode = InputPunycode.slice(1, InputPunycode.size());
// let digit = the code point's digit-value, fail if it has none
int digit = digit_index(codePoint);
if (digit < 0)
return true;
i = i + digit * w;
int t = k <= bias ? tmin
: k >= bias + tmax ? tmax
: k - bias;
if (digit < t)
break;
w = w * (base - t);
}
bias = adapt(i - oldi, OutCodePoints.size() + 1, oldi == 0);
n = n + i / (OutCodePoints.size() + 1);
i = i % (OutCodePoints.size() + 1);
// if n is a basic code point then fail
if (n < 0x80)
return true;
// insert n into output at position i
OutCodePoints.insert(OutCodePoints.begin() + i, n);
i++;
}
return true;
}
// Section 6.3: Encoding procedure
bool Punycode::encodePunycode(const std::vector<uint32_t> &InputCodePoints,
std::string &OutPunycode) {
OutPunycode.clear();
uint32_t n = initial_n;
int delta = 0;
int bias = initial_bias;
// let h = b = the number of basic code points in the input
// copy them to the output in order...
size_t h = 0;
for (auto C : InputCodePoints) {
if (C < 0x80) {
++h;
OutPunycode.push_back(C);
}
if (!isValidUnicodeScalar(C)) {
OutPunycode.clear();
return false;
}
}
size_t b = h;
// ...followed by a delimiter if b > 0
if (b > 0)
OutPunycode.push_back(delimiter);
while (h < InputCodePoints.size()) {
// let m = the minimum code point >= n in the input
uint32_t m = 0x10FFFF;
for (auto codePoint : InputCodePoints) {
if (codePoint >= n && codePoint < m)
m = codePoint;
}
delta = delta + (m - n) * (h + 1);
n = m;
for (auto c : InputCodePoints) {
if (c < n) ++delta;
if (c == n) {
int q = delta;
for (int k = base; ; k += base) {
int t = k <= bias ? tmin
: k >= bias + tmax ? tmax
: k - bias;
if (q < t) break;
OutPunycode.push_back(digit_value(t + ((q - t) % (base - t))));
q = (q - t) / (base - t);
}
OutPunycode.push_back(digit_value(q));
bias = adapt(delta, h + 1, h == b);
delta = 0;
++h;
}
}
++delta; ++n;
}
return true;
}
static bool encodeToUTF8(const std::vector<uint32_t> &Scalars,
std::string &OutUTF8) {
for (auto S : Scalars) {
if (!isValidUnicodeScalar(S)) {
OutUTF8.clear();
return false;
}
if (S >= 0xD800 && S < 0xD880)
S -= 0xD800;
unsigned Bytes = 0;
if (S < 0x80)
Bytes = 1;
else if (S < 0x800)
Bytes = 2;
else if (S < 0x10000)
Bytes = 3;
else
Bytes = 4;
switch (Bytes) {
case 1:
OutUTF8.push_back(S);
break;
case 2: {
uint8_t Byte2 = (S | 0x80) & 0xBF;
S >>= 6;
uint8_t Byte1 = S | 0xC0;
OutUTF8.push_back(Byte1);
OutUTF8.push_back(Byte2);
break;
}
case 3: {
uint8_t Byte3 = (S | 0x80) & 0xBF;
S >>= 6;
uint8_t Byte2 = (S | 0x80) & 0xBF;
S >>= 6;
uint8_t Byte1 = S | 0xE0;
OutUTF8.push_back(Byte1);
OutUTF8.push_back(Byte2);
OutUTF8.push_back(Byte3);
break;
}
case 4: {
uint8_t Byte4 = (S | 0x80) & 0xBF;
S >>= 6;
uint8_t Byte3 = (S | 0x80) & 0xBF;
S >>= 6;
uint8_t Byte2 = (S | 0x80) & 0xBF;
S >>= 6;
uint8_t Byte1 = S | 0xF0;
OutUTF8.push_back(Byte1);
OutUTF8.push_back(Byte2);
OutUTF8.push_back(Byte3);
OutUTF8.push_back(Byte4);
break;
}
}
}
return true;
}
bool Punycode::decodePunycodeUTF8(StringRef InputPunycode,
std::string &OutUTF8) {
std::vector<uint32_t> OutCodePoints;
if (!decodePunycode(InputPunycode, OutCodePoints))
return false;
return encodeToUTF8(OutCodePoints, OutUTF8);
}