blob: c73f7577cfeca6c7d05a5c522229717df3257302 [file] [log] [blame]
// Copyright 2020 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ----------------
// Package dumbindent formats C (and C-like) programs.
//
// It is similar in concept to pretty-printers like `indent` or `clang-format`.
// It is much dumber (it will not add or remove line breaks or otherwise
// re-flow lines of code just to fit within an 80 column limit) but it can
// therefore be much faster at the basic task of automatically indenting nested
// blocks. The output isn't 'perfect', but it's usually sufficiently readable
// if the input already has sensible line breaks.
//
// To quantify "much faster", on this one C file, `cmd/dumbindent` in this
// repository was 80 times faster than `clang-format`, even without a column
// limit:
//
// $ wc release/c/wuffs-v0.2.c
// 11858 35980 431885 release/c/wuffs-v0.2.c
// $ time dumbindent < release/c/wuffs-v0.2.c > /dev/null
// real 0m0.008s
// user 0m0.005s
// sys 0m0.005s
// $ time clang-format-9 < release/c/wuffs-v0.2.c > /dev/null
// real 0m0.668s
// user 0m0.618s
// sys 0m0.032s
// $ time clang-format-9 -style='{ColumnLimit: 0}' < release/c/wuffs-v0.2.c > /dev/null
// real 0m0.641s
// user 0m0.585s
// sys 0m0.037s
//
// Apart from some rare and largely uninteresting exceptions, the dumbindent
// algorithm only considers:
//
// ∙ '{' and '}' curly braces,
// ∙ '(' and ')' round parentheses,
// ∙ '\n' line breaks,
// ∙ ' ' spaces and '\t' tabs that start or end a line, and
// ∙ strings, comments and preprocessor directives (in order to ignore any of
// the above special characters within them),
//
// Everything else is an opaque byte. Consider this input:
//
// for (i = 0; i < 3; i++) {
// j = 0; // Ignore { in a comment.
// if (i < j) { foo(); }
// u = (v +
// w);
// }
//
// From the algorithm's point of view, this input is equivalent to:
//
// ....(.................).{
// .................................
// ...(.....).{....()..}
// ....(...
// .);
// }
//
// The formatted output (using the default of 2 spaces per indent level) is:
//
// ....(.................).{
// .................................
// ...(.....).{....()..}
// ....(...
// .);
// }
//
// Dumbindent adjusts lines horizontally (indenting) but not vertically (it
// does not break or un-break lines, or collapse consecutive blank lines),
// although it will remove blank lines at the end of the input. In the example
// above, it will not remove the "\n" between "u = (v +" and "w);", even though
// both lines are short.
//
// Each output line is indented according to the net number of open braces
// preceding it, although lines starting with close braces will outdent first,
// similar to `gofmt` style. A line which starts in a so-far-unbalanced open
// parenthesis, such as the "w);" line above, gets 2 additional indent levels.
//
// Horizontal adjustment only affects a line's leading white space (and will
// trim trailing white space). It does not affect white space within a line.
// Dumbindent does not parse the input as C/C++ source code.
//
// In particular, the algorithm does not solve C++'s "most vexing parse" or
// otherwise determine whether "x*y" is a multiplication or a type definition
// (where y is a pointer-to-x typed variable or function argument, such as
// "int*p"). For a type definition, where other formatting algorithms would
// re-write around the "*" as either "x* y" or "x *y", dumbindent will not
// insert spaces.
//
// Similarly, dumbindent will not correct this mis-indentation:
//
// if (condition)
// goto fail;
// goto fail;
//
// Instead, when automatically or manually generating the input for dumbindent,
// it is recommended to always emit curly braces (again, similar to `gofmt`
// style), even for what would otherwise be 'one-liner' if statements.
//
// More commentary is at:
// https://nigeltao.github.io/blog/2020/dumbindent.html
package dumbindent
import (
"bytes"
)
// 'Constants', but their type is []byte, not string.
var (
backTick = []byte("`")
extern = []byte("extern ")
namespace = []byte("namespace ")
starSlash = []byte("*/")
newLines = []byte("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
spaces = []byte(" ")
tabs = []byte("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t")
)
// hangingBytes is a look-up table for updating the hanging variable.
var hangingBytes = [256]bool{
'=': true,
'\\': true,
}
// Options are formatting options.
type Options struct {
// Spaces, if positive, is the number of spaces per indent level. A
// non-positive value means to use the default: 2 spaces per indent level.
//
// This field is ignored when Tabs is true.
Spaces int
// Tabs is whether to indent with tabs instead of spaces. If true, it's one
// '\t' tab character per indent and the Spaces field is ignored.
Tabs bool
}
// FormatBytes formats the C (or C-like) program in src, appending the result
// to dst, and returns that longer slice.
//
// It is valid to pass a dst slice (such as nil) whose unused capacity,
// cap(dst) - len(dst), is too short to hold the formatted program. In this
// case, a new slice will be allocated and returned.
//
// Passing a nil opts is valid and equivalent to passing &Options{}.
func FormatBytes(dst []byte, src []byte, opts *Options) []byte {
indentBytes := spaces
indentCount := 2
if opts != nil {
if opts.Tabs {
indentBytes = tabs
indentCount = 1
} else if opts.Spaces > 0 {
indentCount = opts.Spaces
}
}
// Count the number of leading spaces (or tabs) on the first line. Every
// output line starts with that much indent, before further indentation
// from unbalanced braces and parentheses.
initialIndent := countInitialOccurrences(src, indentBytes[0])
src = trimLeadingWhiteSpaceAndNewLines(src)
if len(src) == 0 {
return dst
} else if len(dst) == 0 {
dst = make([]byte, 0, len(src)+(len(src)/2))
}
nBlankLines := 0 // The number of preceding blank lines.
nBraces := 0 // The number of unbalanced '{'s.
nParens := 0 // The number of unbalanced '('s.
hanging := false // Whether the previous non-blank line ends with '=' or '\\'.
preproc := false // Whether we're in a #preprocessor line.
for line, remaining := src, []byte(nil); len(src) > 0; src = remaining {
src = trimLeadingWhiteSpace(src)
line, remaining = src, nil
if i := bytes.IndexByte(line, '\n'); i >= 0 {
line, remaining = line[:i], line[i+1:]
}
lineLength := len(line)
// Strip any blank lines at the end of the file.
if len(line) == 0 {
nBlankLines++
continue
}
if nBlankLines > 0 {
dst = appendRepeatedBytes(dst, newLines, nBlankLines)
nBlankLines = 0
}
// Handle preprocessor lines (#ifdef, #pragma, etc).
if preproc || (line[0] == '#') {
indent := initialIndent
if preproc {
indent += indentCount * 2
}
line = trimTrailingWhiteSpace(line)
dst = appendRepeatedBytes(dst, indentBytes, indent)
dst = append(dst, line...)
dst = append(dst, '\n')
hanging = false
preproc = lastNonWhiteSpace(line) == '\\'
continue
}
closeBraces := 0
// Don't indent for `extern "C" {` or `namespace foo {`.
if ((line[0] == 'e') && hasPrefixAndBrace(line, extern)) ||
((line[0] == 'n') && hasPrefixAndBrace(line, namespace)) {
nBraces--
} else {
// Account for leading '}'s before we print the line's indentation.
for ; (closeBraces < len(line)) && line[closeBraces] == '}'; closeBraces++ {
}
nBraces -= closeBraces
// Because the "{" in "extern .*{" and "namespace .*{" is had no
// net effect on nBraces, the matching "}" can cause the nBraces
// count to dip below zero. Correct for that here.
if nBraces < 0 {
nBraces = 0
}
}
// Output indentation. Dumbindent's default, 2 spaces per indent level,
// roughly approximates clang-format's default style.
indent := initialIndent
if nBraces > 0 {
indent += indentCount * nBraces
}
if (nParens > 0) || hanging {
indent += indentCount * 2
}
dst = appendRepeatedBytes(dst, indentBytes, indent)
// Output the leading '}'s.
dst = append(dst, line[:closeBraces]...)
line = line[closeBraces:]
// Adjust the state according to the braces and parentheses within the
// line (except for those in comments and strings).
last := lastNonWhiteSpace(line)
loop:
for {
for i, c := range line {
switch c {
case '{':
nBraces++
case '}':
nBraces--
case '(':
nParens++
case ')':
nParens--
case '/':
if (i + 1) >= len(line) {
break
}
if line[i+1] == '/' {
// A slash-slash comment. Skip the rest of the line.
last = lastNonWhiteSpace(line[:i])
break loop
} else if line[i+1] == '*' {
// A slash-star comment.
dst = append(dst, line[:i+2]...)
restOfLine := line[i+2:]
restOfSrc := src[lineLength-len(restOfLine):]
dst, line, remaining = handleRaw(dst, restOfSrc, starSlash)
last = lastNonWhiteSpace(line)
continue loop
}
case '"', '\'':
// A cooked string, whose contents are backslash-escaped.
suffix := skipCooked(line[i+1:], c)
dst = append(dst, line[:len(line)-len(suffix)]...)
line = suffix
continue loop
case '`':
// A raw string.
dst = append(dst, line[:i+1]...)
restOfLine := line[i+1:]
restOfSrc := src[lineLength-len(restOfLine):]
dst, line, remaining = handleRaw(dst, restOfSrc, backTick)
last = lastNonWhiteSpace(line)
continue loop
}
}
break loop
}
hanging = hangingBytes[last]
// Output the line (minus any trailing space).
line = trimTrailingWhiteSpace(line)
dst = append(dst, line...)
dst = append(dst, "\n"...)
}
return dst
}
// hasPrefixAndBrace returns whether line starts with prefix and after that
// contains a '{'.
func hasPrefixAndBrace(line []byte, prefix []byte) bool {
return bytes.HasPrefix(line, prefix) &&
bytes.IndexByte(line[len(prefix):], '{') >= 0
}
// trimLeadingWhiteSpaceAndNewLines converts "\t\n foo bar " to "foo bar ".
func trimLeadingWhiteSpaceAndNewLines(s []byte) []byte {
for (len(s) > 0) && ((s[0] == ' ') || (s[0] == '\t') || (s[0] == '\n')) {
s = s[1:]
}
return s
}
// trimLeadingWhiteSpace converts "\t\t foo bar " to "foo bar ".
func trimLeadingWhiteSpace(s []byte) []byte {
for (len(s) > 0) && ((s[0] == ' ') || (s[0] == '\t')) {
s = s[1:]
}
return s
}
// trimTrailingWhiteSpace converts "\t\t foo bar " to "\t\t foo bar".
func trimTrailingWhiteSpace(s []byte) []byte {
for (len(s) > 0) && ((s[len(s)-1] == ' ') || (s[len(s)-1] == '\t')) {
s = s[:len(s)-1]
}
return s
}
// appendRepeatedBytes appends number copies of a byte, assuming that
// repeatedBytes' elements are all the same byte.
func appendRepeatedBytes(dst []byte, repeatedBytes []byte, number int) []byte {
for number > 0 {
n := number
if n > len(repeatedBytes) {
n = len(repeatedBytes)
}
dst = append(dst, repeatedBytes[:n]...)
number -= n
}
return dst
}
// countInitialOccurrences returns the length of the longest prefix of s that
// consists entirely of x. Passing ("aaabacdaa", 'a') returns 3.
func countInitialOccurrences(s []byte, x byte) int {
for i, c := range s {
if c != x {
return i
}
}
return len(s)
}
// lastNonWhiteSpace returns the 'z' in "abc xyz ". It returns '\x00' if s
// consists entirely of spaces or tabs.
func lastNonWhiteSpace(s []byte) byte {
for i := len(s) - 1; i >= 0; i-- {
if x := s[i]; (x != ' ') && (x != '\t') {
return x
}
}
return 0
}
// skipCooked converts `ijk \" lmn" pqr` to ` pqr`.
func skipCooked(s []byte, quote byte) (suffix []byte) {
for i := 0; i < len(s); {
if x := s[i]; x == quote {
return s[i+1:]
} else if x != '\\' {
i += 1
} else if (i + 1) < len(s) {
i += 2
} else {
break
}
}
return nil
}
// handleRaw copies a raw string from restOfSrc to dst, re-calculating the
// (line, remaining) pair afterwards.
func handleRaw(dst []byte, restOfSrc []byte, endQuote []byte) (retDst []byte, line []byte, remaining []byte) {
end := bytes.Index(restOfSrc, endQuote)
if end < 0 {
end = len(restOfSrc)
} else {
end += len(endQuote)
}
dst = append(dst, restOfSrc[:end]...)
line, remaining = restOfSrc[end:], nil
if i := bytes.IndexByte(line, '\n'); i >= 0 {
line, remaining = line[:i], line[i+1:]
}
return dst, line, remaining
}