// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package commentparser does a basic parse over a source file and returns all
// of the comments from the code. This is useful when you want to analyze
// text written in comments (like copyright notices) but not in the code
// itself.
package commentparser

import (
	"bytes"
	"strings"
	"unicode/utf8"

	"github.com/google/licenseclassifier/commentparser/language"
)

// Diagnostic message formats; the leading %d is the line number at which the
// unexpected EOF was encountered.
const (
	eofInString            = "%d:EOF in string"
	eofInSingleLineComment = "%d:EOF in single line comment"
	eofInMultilineComment  = "%d:EOF in multiline comment"
)

// Parse parses the input data and returns the comments.
func Parse(contents []byte, lang language.Language) Comments {
	if len(contents) == 0 {
		return nil
	}

	c := string(contents)
	if !strings.HasSuffix(c, "\n") {
		// Force a terminating newline if one isn't present.
		c += "\n"
	}

	i := &input{
		s:      c,
		lang:   lang,
		offset: 0,
		pos:    position{line: 1, lineRune: []int{0}},
	}
	i.lex()
	return i.comments
}
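
// A minimal usage sketch (illustrative only; it assumes the language package
// defines a Go constant, and the file name here is hypothetical):
//
//	src, err := ioutil.ReadFile("main.go")
//	if err != nil {
//		log.Fatal(err)
//	}
//	for _, cmt := range commentparser.Parse(src, language.Go) {
//		fmt.Printf("lines %d-%d: %q\n", cmt.StartLine, cmt.EndLine, cmt.Text)
//	}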

// Comment is either a single line or multiline comment in a source code file.
// A single line comment has StartLine equal to EndLine. The line numbers are
// 1-based.
type Comment struct {
	StartLine int
	EndLine   int
	Text      string
}

// Comments allows us to treat a slice of comments as a unit.
type Comments []*Comment
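
// For instance (illustrative), the comment "// hi" on line 3 of a Go file is
// recorded as:
//
//	&Comment{StartLine: 3, EndLine: 3, Text: " hi"}
//
// Text keeps everything after the comment marker, including the leading
// space, but not the marker itself.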

// ChunkIterator returns a read-only channel on which it sends the comments
// grouped into contiguous chunks; the chunks are generated in a goroutine,
// and the channel is closed once all comments have been sent. A comment
// joins the current chunk when its start line is at most one line past the
// previous comment's start line, or exactly two lines past it when both are
// single line comments (i.e., single line comments separated by one blank
// line stay together).
func (c Comments) ChunkIterator() <-chan Comments {
	ch := make(chan Comments)
	go func() {
		defer close(ch)
		if len(c) == 0 {
			return
		}

		prevChunk := c[0]
		for index := 0; index < len(c); index++ {
			var chunk Comments
			for ; index < len(c); index++ {
				if c[index].StartLine > prevChunk.StartLine+2 {
					break
				}
				if c[index].StartLine == prevChunk.StartLine+2 {
					// A blank line separates this comment from the
					// previous one. Keep them in the same chunk
					// only if both are single line comments.
					if c[index].StartLine != c[index].EndLine || prevChunk.StartLine != prevChunk.EndLine {
						break
					}
				}
				chunk = append(chunk, c[index])
				prevChunk = c[index]
			}
			if len(chunk) == 0 {
				break
			}
			ch <- chunk
			if index >= len(c) {
				break
			}
			prevChunk = c[index]
			index-- // Offset the outer loop's increment so c[index] starts the next chunk.
		}
	}()
	return ch
}
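
// A brief usage sketch (illustrative; cmts is a hypothetical Comments value):
//
//	for chunk := range cmts.ChunkIterator() {
//		fmt.Printf("chunk at line %d:\n%s\n", chunk.StartLine(), chunk.String())
//	}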

// StartLine is the line number (1-based) on which the first comment of the
// block starts. An empty block reports line 0.
func (c Comments) StartLine() int {
	if len(c) == 0 {
		return 0
	}
	return c[0].StartLine
}

// String creates a string out of the text of the comments. Comment begin and
// end markers are removed.
func (c Comments) String() string {
	var s []string
	for _, cmt := range c {
		s = append(s, cmt.Text)
	}
	return strings.Join(s, "\n")
}

// position records the location of a lexeme.
type position struct {
	line     int   // Line number in the input: 1-based.
	lineRune []int // Rune offset from the beginning of each line read so far: 0-based. Kept as a stack so unreadRune can restore the previous line's offset.
}

// input holds the current state of the lexer.
type input struct {
	s        string            // Entire input.
	lang     language.Language // Source code language.
	offset   int               // Offset into input.
	pos      position          // Current position in the input.
	comments Comments          // Comments in the source file.
}

// lex walks the input and collects the comments.
func (i *input) lex() {
	for {
		c, ok := i.peekRune()
		if !ok {
			break
		}
		switch c {
		case '"', '\'', '`': // String
			// Skip strings because they could contain comment start
			// or end sequences, which must not be treated as
			// comments.
			if i.lang == language.HTML {
				// Quotes in HTML-like files aren't meaningful,
				// because it's basically plain text.
				break
			}
			ok, hasEscape := i.lang.QuoteCharacter(c)
			if !ok {
				break
			}
			var content bytes.Buffer
			isDocString := false
			quote := string(c)
			if i.lang == language.Python {
				if c == '\'' && i.match("'''") {
					quote = "'''"
					// Assume module-level docstrings start at
					// the beginning of a line. Function
					// docstrings aren't supported.
					if i.pos.lineRune[len(i.pos.lineRune)-1] == 3 {
						isDocString = true
					}
				} else if c == '"' && i.match(`"""`) {
					quote = `"""`
					if i.pos.lineRune[len(i.pos.lineRune)-1] == 3 {
						isDocString = true
					}
				} else {
					i.readRune() // Eat quote.
				}
			} else {
				i.readRune() // Eat quote.
			}
			startLine := i.pos.line
			for {
				c, ok = i.peekRune()
				if !ok {
					return
				}
				if hasEscape && c == '\\' {
					i.readRune() // Eat escape character.
				} else if i.match(quote) {
					break
				} else if (i.lang == language.JavaScript || i.lang == language.Perl) && c == '\n' {
					// JavaScript and Perl allow regex literals
					// that aren't quoted but may contain quote
					// characters, so treat a newline as
					// terminating the string.
					break
				}
				c := i.readRune()
				if isDocString {
					content.WriteRune(c)
				}
				if i.eof() {
					return
				}
			}
			if isDocString {
				// Python docstrings are treated as comments.
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      content.String(),
				})
			}
		default:
			startLine := i.pos.line
			var comment bytes.Buffer
			if ok, start, end := i.multiLineComment(); ok { // Multiline comment
				nesting := 0
				for {
					if i.eof() {
						return
					}
					c := i.readRune()
					comment.WriteRune(c)
					if i.lang.NestedComments() && i.match(start) {
						// Allow nested comments.
						comment.WriteString(start)
						nesting++
					}
					if i.match(end) {
						if nesting > 0 {
							comment.WriteString(end)
							nesting--
						} else {
							break
						}
					}
				}
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      comment.String(),
				})
			} else if i.singleLineComment() { // Single line comment
				for {
					if i.eof() {
						return
					}
					c = i.readRune()
					if c == '\n' {
						i.unreadRune(c)
						break
					}
					comment.WriteRune(c)
				}
				i.comments = append(i.comments, &Comment{
					StartLine: startLine,
					EndLine:   i.pos.line,
					Text:      comment.String(),
				})
			}
		}
		i.readRune() // Ignore non-comments.
	}
}
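
// For example (illustrative), lexing this Python fragment:
//
//	"""Module docstring."""
//	x = 1  # set x
//
// produces two comments: one for the docstring ("Module docstring.") and one
// for the text after the "#" (" set x").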

// singleLineComment returns true if the next tokens in the stream start a
// single line comment in the given language, consuming the comment marker.
// SQL files may use MySQL-style markers, and Objective-C files may actually
// be MATLAB files (both use the ".m" extension), so those variants are
// checked as well.
func (i *input) singleLineComment() bool {
	if i.match(i.lang.SingleLineCommentStart()) {
		return true
	}
	if i.lang == language.SQL {
		return i.match(language.MySQL.SingleLineCommentStart())
	}
	if i.lang == language.ObjectiveC {
		return i.match(language.Matlab.SingleLineCommentStart())
	}
	return false
}

// multiLineComment returns true, plus the comment's start and end markers, if
// the next tokens in the stream start a multiline comment in the given
// language. The same SQL and Objective-C fallbacks as in singleLineComment
// apply.
func (i *input) multiLineComment() (bool, string, string) {
	if s := i.lang.MultilineCommentStart(); i.match(s) {
		return true, s, i.lang.MultilineCommentEnd()
	}
	if i.lang == language.SQL {
		if s := language.MySQL.MultilineCommentStart(); i.match(s) {
			return true, s, language.MySQL.MultilineCommentEnd()
		}
	} else if i.lang == language.ObjectiveC {
		if s := language.Matlab.MultilineCommentStart(); i.match(s) {
			return true, s, language.Matlab.MultilineCommentEnd()
		}
	}
	return false, "", ""
}

// match returns true if the next runes in the stream match the given string.
// The runes are consumed on a match. On a mismatch the input is rewound to
// where it started; a partial match cut off by EOF stays consumed, but at
// that point there is no input left to lex.
func (i *input) match(s string) bool {
	if s == "" {
		return false
	}
	saved := s
	var read []rune
	for len(s) > 0 && !i.eof() {
		r, size := utf8.DecodeRuneInString(s)
		if c, ok := i.peekRune(); ok && c == r {
			read = append(read, c)
		} else {
			// No match. Unread the runes we've consumed so far.
			for idx := len(read) - 1; idx >= 0; idx-- {
				i.unreadRune(read[idx])
			}
			return false
		}
		s = s[size:]
		i.readRune() // Eat the matched rune.
	}
	return string(read) == saved
}
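
// For instance (illustrative): with the remaining input "''x", match("'''")
// consumes the two quotes, sees that 'x' doesn't match the third quote,
// unreads both quotes, and returns false, leaving the input untouched.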

// eof reports whether the input has reached the end of the file.
func (i *input) eof() bool {
	return i.offset >= len(i.s)
}

// peekRune returns the next rune in the input without consuming it.
func (i *input) peekRune() (rune, bool) {
	if i.eof() {
		return rune(0), false
	}
	r, _ := utf8.DecodeRuneInString(i.s[i.offset:])
	return r, true
}

// readRune consumes and returns the next rune in the input, updating the
// line and column bookkeeping in i.pos.
func (i *input) readRune() rune {
	r, size := utf8.DecodeRuneInString(i.s[i.offset:])
	if r == '\n' {
		// Start a new line: push a fresh rune offset for it.
		i.pos.line++
		i.pos.lineRune = append(i.pos.lineRune, 0)
	} else {
		i.pos.lineRune[len(i.pos.lineRune)-1]++
	}
	i.offset += size
	return r
}
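
// For example (illustrative): starting from the initial position
// {line: 1, lineRune: []int{0}}, reading the input "ab\nc" leaves pos at
// {line: 2, lineRune: []int{2, 1}}.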

// unreadRune winds the lexer's state back to before the given rune was read.
func (i *input) unreadRune(c rune) {
	p := make([]byte, utf8.UTFMax)
	size := utf8.EncodeRune(p, c)
	i.offset -= size
	if c == '\n' {
		i.pos.line--
		if len(i.pos.lineRune) > 1 {
			// Drop the offset entry for the line we're backing out of.
			i.pos.lineRune = i.pos.lineRune[:len(i.pos.lineRune)-1]
		} else {
			i.pos.lineRune[len(i.pos.lineRune)-1] = 0
		}
	} else {
		i.pos.lineRune[len(i.pos.lineRune)-1]--
	}
}