blob: d31eb2a73a0ce35faae100b78d963403d8f94bb7 [file] [log] [blame]
// Copyright 2017 The Bazel Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
// Starlark quoted string utilities.
import (
"bytes"
"fmt"
"strconv"
"strings"
)
// unesc maps single-letter chars following \ to their actual values.
var unesc = [256]byte{
'a': '\a',
'b': '\b',
'f': '\f',
'n': '\n',
'r': '\r',
't': '\t',
'v': '\v',
'\\': '\\',
'\'': '\'',
'"': '"',
}
// esc maps escape-worthy bytes to the char that should follow \.
var esc = [256]byte{
'\a': 'a',
'\b': 'b',
'\f': 'f',
'\n': 'n',
'\r': 'r',
'\t': 't',
'\v': 'v',
'\\': '\\',
'\'': '\'',
'"': '"',
}
// notEsc is a list of characters that can follow a \ in a string value
// without having to escape the \. That is, since ( is in this list, we
// quote the Go string "foo\\(bar" as the Python literal "foo\(bar".
// This really does happen in BUILD files, especially in strings
// being used as shell arguments containing regular expressions.
const notEsc = " !#$%&()*+,-./:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~"
// unquote unquotes the quoted string, returning the actual
// string value, whether the original was triple-quoted, and
// an error describing invalid input.
func unquote(quoted string) (s string, triple bool, err error) {
// Check for raw prefix: means don't interpret the inner \.
raw := false
if strings.HasPrefix(quoted, "r") {
raw = true
quoted = quoted[1:]
}
if len(quoted) < 2 {
err = fmt.Errorf("string literal too short")
return
}
if quoted[0] != '"' && quoted[0] != '\'' || quoted[0] != quoted[len(quoted)-1] {
err = fmt.Errorf("string literal has invalid quotes")
return
}
// Check for triple quoted string.
quote := quoted[0]
if len(quoted) >= 6 && quoted[1] == quote && quoted[2] == quote && quoted[:3] == quoted[len(quoted)-3:] {
triple = true
quoted = quoted[3 : len(quoted)-3]
} else {
quoted = quoted[1 : len(quoted)-1]
}
// Now quoted is the quoted data, but no quotes.
// If we're in raw mode or there are no escapes or
// carriage returns, we're done.
var unquoteChars string
if raw {
unquoteChars = "\r"
} else {
unquoteChars = "\\\r"
}
if !strings.ContainsAny(quoted, unquoteChars) {
s = quoted
return
}
// Otherwise process quoted string.
// Each iteration processes one escape sequence along with the
// plain text leading up to it.
var buf bytes.Buffer
for {
// Remove prefix before escape sequence.
i := strings.IndexAny(quoted, unquoteChars)
if i < 0 {
i = len(quoted)
}
buf.WriteString(quoted[:i])
quoted = quoted[i:]
if len(quoted) == 0 {
break
}
// Process carriage return.
if quoted[0] == '\r' {
buf.WriteByte('\n')
if len(quoted) > 1 && quoted[1] == '\n' {
quoted = quoted[2:]
} else {
quoted = quoted[1:]
}
continue
}
// Process escape sequence.
if len(quoted) == 1 {
err = fmt.Errorf(`truncated escape sequence \`)
return
}
switch quoted[1] {
default:
// In Python, if \z (for some byte z) is not a known escape sequence
// then it appears as literal text in the string.
buf.WriteString(quoted[:2])
quoted = quoted[2:]
case '\n':
// Ignore the escape and the line break.
quoted = quoted[2:]
case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '\'', '"':
// One-char escape
buf.WriteByte(unesc[quoted[1]])
quoted = quoted[2:]
case '0', '1', '2', '3', '4', '5', '6', '7':
// Octal escape, up to 3 digits.
n := int(quoted[1] - '0')
quoted = quoted[2:]
for i := 1; i < 3; i++ {
if len(quoted) == 0 || quoted[0] < '0' || '7' < quoted[0] {
break
}
n = n*8 + int(quoted[0]-'0')
quoted = quoted[1:]
}
if n >= 256 {
// NOTE: Python silently discards the high bit,
// so that '\541' == '\141' == 'a'.
// Let's see if we can avoid doing that in BUILD files.
err = fmt.Errorf(`invalid escape sequence \%03o`, n)
return
}
buf.WriteByte(byte(n))
case 'x':
// Hexadecimal escape, exactly 2 digits.
if len(quoted) < 4 {
err = fmt.Errorf(`truncated escape sequence %s`, quoted)
return
}
n, err1 := strconv.ParseUint(quoted[2:4], 16, 0)
if err1 != nil {
err = fmt.Errorf(`invalid escape sequence %s`, quoted[:4])
return
}
buf.WriteByte(byte(n))
quoted = quoted[4:]
}
}
s = buf.String()
return
}
// indexByte returns the index of the first instance of b in s, or else -1.
func indexByte(s string, b byte) int {
for i := 0; i < len(s); i++ {
if s[i] == b {
return i
}
}
return -1
}
// hex is a list of the hexadecimal digits, for use in quoting.
// We always print lower-case hexadecimal.
const hex = "0123456789abcdef"
// quote returns the quoted form of the string value "x".
// If triple is true, quote uses the triple-quoted form """x""".
func quote(unquoted string, triple bool) string {
q := `"`
if triple {
q = `"""`
}
var buf bytes.Buffer
buf.WriteString(q)
for i := 0; i < len(unquoted); i++ {
c := unquoted[i]
if c == '"' && triple && (i+1 < len(unquoted) && unquoted[i+1] != '"' || i+2 < len(unquoted) && unquoted[i+2] != '"') {
// Can pass up to two quotes through, because they are followed by a non-quote byte.
buf.WriteByte(c)
if i+1 < len(unquoted) && unquoted[i+1] == '"' {
buf.WriteByte(c)
i++
}
continue
}
if triple && c == '\n' {
// Can allow newline in triple-quoted string.
buf.WriteByte(c)
continue
}
if c == '\'' {
// Can allow ' since we always use ".
buf.WriteByte(c)
continue
}
if c == '\\' {
if i+1 < len(unquoted) && indexByte(notEsc, unquoted[i+1]) >= 0 {
// Can pass \ through when followed by a byte that
// known not to be a valid escape sequence and also
// that does not trigger an escape sequence of its own.
// Use this, because various BUILD files do.
buf.WriteByte('\\')
buf.WriteByte(unquoted[i+1])
i++
continue
}
}
if esc[c] != 0 {
buf.WriteByte('\\')
buf.WriteByte(esc[c])
continue
}
if c < 0x20 || c >= 0x80 {
// BUILD files are supposed to be Latin-1, so escape all control and high bytes.
// I'd prefer to use \x here, but Blaze does not implement
// \x in quoted strings (b/7272572).
buf.WriteByte('\\')
buf.WriteByte(hex[c>>6]) // actually octal but reusing hex digits 0-7.
buf.WriteByte(hex[(c>>3)&7])
buf.WriteByte(hex[c&7])
/*
buf.WriteByte('\\')
buf.WriteByte('x')
buf.WriteByte(hex[c>>4])
buf.WriteByte(hex[c&0xF])
*/
continue
}
buf.WriteByte(c)
continue
}
buf.WriteString(q)
return buf.String()
}