| #!/usr/bin/env perl |
| # Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al. |
| # |
| # SPDX-License-Identifier: curl |
| # |
| # bad[:=]correct |
| # |
| # If separator is '=', the string will be compared case sensitively. |
| # If separator is ':', the check is done case insensitively. |
| # |
| # To add white listed uses of bad words that are removed before checking for |
| # the bad ones: |
| # |
| # ---(accepted word) |
| # ---:[path]:(accepted word) |
| # |
| |
| use strict; |
| use warnings; |
| |
| use File::Basename; |
| |
| # |
| ## States |
| # |
| # 0 - default, initial state |
| # 1 - there was a slash |
| # 2 - quoted string |
| # 3 - // comment |
| # 4 - /* comment |
| # 5 - asterisk found within a /* comment |
| # 6 - #include line |
| # 7 - backslash in a string |
| # |
| ## Flags |
| # |
| # 1 - include preprocessor line, ignore strings |
| |
| sub srcline { |
| my ($state, $flags, $l) = @_; |
| my $line = ""; |
| |
| if(($state == 0) && ($l =~ /^ *\# *include/)) { |
| # preprocessor include line |
| $flags |= 1; |
| } |
| else { |
| # not preprocessor |
| $flags &= ~1; |
| } |
| |
| if($state == 3) { |
| # // ended on the prev line, go back to init |
| $state = 0; |
| } |
| |
| my @c = split(//, $l); |
| |
| # state machine this line |
| for my $c (@c) { |
| if($state == 1) { |
| # we had a slash |
| if($c eq "/") { |
| # // confirmed, the rest of the line is a comment |
| $line .= "//"; |
| $state = 3; |
| } |
| elsif($c eq "*") { |
| # /* confirmed |
| $state = 4; |
| $line .= "/*"; |
| } |
| else { |
| # back to normal |
| $line .= " "; |
| $state = 0; |
| } |
| } |
| elsif($state == 2) { |
| # a string |
| if($c eq "\\") { |
| $line .= "\\"; |
| $state = 7; |
| } |
| elsif($c eq "\"") { |
| # end of the string |
| $line .= "\""; |
| $state = 0; |
| } |
| else { |
| $line .= $c; |
| } |
| } |
| elsif($state == 3) { |
| # a // comment |
| $line .= $c; |
| } |
| elsif($state == 4) { |
| # a /* comment |
| if($c eq "*") { |
| # could be a comment close |
| $state = 5; |
| } |
| else { |
| $line .= $c; |
| } |
| } |
| elsif($state == 5) { |
| if($c eq "/") { |
| # a /* */ comment ended here */ |
| $line .= "*/"; |
| $state = 0; |
| } |
| else { |
| # the /* comment continues |
| $line .= "*$c"; |
| $state = 4; |
| } |
| } |
| elsif($state == 7) { |
| # the prev was a backslash in a string |
| $line .= $c; |
| # switch back to normal string |
| $state = 2; |
| } |
| else { |
| if($c eq "/") { |
| $state = 1; # got a slash |
| } |
| elsif(($c eq "\"") && !($flags & 1)) { |
| # start of a string, not within a preprocessor line |
| $line .= "\""; |
| $state = 2; |
| } |
| elsif($c eq "\n") { |
| $line .= "\n"; |
| } |
| else { |
| $line .= " "; |
| } |
| } |
| } |
| return $state, $flags, $line; |
| } |
| |
| sub sourcecode { |
| my ($f) = @_; |
| my $state = 0; |
| my $flags = 0; |
| my @lines; |
| my $line; |
| open(F, "<$f"); |
| while(<F>) { |
| my $l = $_; |
| ($state, $flags, $line) = srcline($state, $flags, $l); |
| push @lines, $line; |
| } |
| close(F); |
| return @lines; |
| } |
| |
| my @whitelist = ( |
| # ignore what looks like URLs |
| '(^|\W)((https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&\'\(\)*+,;=]+)', |
| # remove bolded sections |
| '\*\*.*?\*\*', |
| # remove backticked texts |
| '\`.*?\`' |
| ); |
| my %alt; |
| my %exactcase; |
| |
| my %wl; |
| |
| my @w; |
| my @exact; |
| my $file = shift @ARGV; |
| open(CONFIG, "<$file") or die "Cannot open '$file': $!"; |
| while(<CONFIG>) { |
| chomp; |
| if($_ =~ /^#/) { |
| next; |
| } |
| if(/^---:([^:]*):(.*)/) { |
| # whitelist file + word |
| my $word = lc($2); |
| $wl{"$1:$word"}=1; |
| } |
| elsif($_ =~ /^---(.+)/) { |
| # whitelist word |
| push @whitelist, $1; |
| } |
| elsif($_ =~ /^(.*)([:=])(.*)/) { |
| my ($bad, $sep, $better)=($1, $2, $3); |
| if($sep eq "=") { |
| $alt{$bad} = $better; |
| push @exact, $bad; |
| } |
| else { |
| $alt{lc($bad)} = $better; |
| push @w, $bad; |
| } |
| } |
| } |
| close(CONFIG); |
| |
| # Build a single combined regex for case-insensitive words |
| my $re_ci; |
| if(@w) { |
| my $pat = join('|', map { quotemeta($_) } @w); |
| $re_ci = qr/\b($pat)\b/i; |
| } |
| |
| # Build a single combined regex for case-sensitive (exact) words |
| my $re_cs; |
| if(@exact) { |
| my $pat = join('|', map { quotemeta($_) } @exact); |
| $re_cs = qr/\b($pat)\b/; |
| } |
| |
| # Build a single combined regex for removing whitelisted content |
| my $pat = join('|', map { $_ } @whitelist); |
| my $re_wl = qr/($pat)/; |
| |
| my $errors = 0; |
| |
| sub highlight { |
| my ($p, $w, $in, $f, $l, $lookup) = @_; |
| |
| my $c = length($p)+1; |
| my $ch; |
| |
| my $dir = dirname($f); |
| $ch = $dir . "/" . ":" . lc($w); |
| if($wl{$ch}) { |
| # whitelisted dirname + word |
| return; |
| } |
| my $updir = dirname($dir); |
| if($dir ne $updir) { |
| $ch = $updir . "/" . ":" . lc($w); |
| if($wl{$ch}) { |
| # whitelisted upper dirname + word |
| return; |
| } |
| } |
| $ch = $f . ":" . lc($w); |
| if($wl{$ch}) { |
| # whitelisted filename + word |
| return; |
| } |
| |
| print STDERR "$f:$l:$c: error: found bad word \"$w\"\n"; |
| printf STDERR " %4d | %s\n", $l, $in; |
| printf STDERR " | %*s^%s\n", length($p), " ", |
| "~" x (length($w)-1); |
| printf STDERR " maybe use \"%s\" instead?\n", $alt{$lookup}; |
| $errors++; |
| } |
| |
| sub document { |
| my ($f) = @_; |
| my @lines; |
| open(F, "<$f"); |
| while(<F>) { |
| push @lines, $_; |
| } |
| close(F); |
| return @lines; |
| } |
| |
| sub file { |
| my ($f) = @_; |
| my $l = 0; |
| |
| my $skip_indented = 0; |
| my $source_code = 0; |
| if($f =~ /\.[ch]$/) { |
| $source_code = 1; |
| } |
| else { |
| # markdown |
| $skip_indented = 1; |
| } |
| |
| my @lines; |
| if($source_code) { |
| @lines = sourcecode($f); |
| } |
| else { |
| @lines = document($f); |
| } |
| for my $in (@lines) { |
| $l++; |
| chomp $in; |
| if($skip_indented && $in =~ /^ /) { |
| next; |
| } |
| # remove the link part |
| $in =~ s/(\[.*\])\(.*\)/$1/g; |
| # remove whitelisted patterns (pre-compiled) |
| if($re_wl) { |
| $in =~ s/${re_wl}//ig; |
| } |
| # case-insensitive bad words |
| if($re_ci) { |
| if($in =~ /^(.*)$re_ci/i) { |
| highlight($1, $2, $in, $f, $l, lc($2)); |
| } |
| } |
| # case-sensitive (exact) bad words |
| if($re_cs) { |
| if($in =~ /^(.*)$re_cs/) { |
| highlight($1, $2, $in, $f, $l, $2); |
| } |
| } |
| } |
| } |
| |
| my @filemasks = @ARGV; |
| open(my $git_ls_files, '-|', 'git', 'ls-files', '--', @filemasks) or die "Failed running git ls-files: $!"; |
| my @files; |
| while(my $each = <$git_ls_files>) { |
| chomp $each; |
| push @files, $each; |
| } |
| close $git_ls_files; |
| |
| my $onum = scalar(@files); |
| my $num; |
| for my $e (@files) { |
| #printf STDERR "Complete: %d%%\r", $num++ * 100 / $onum; |
| file($e); |
| } |
| |
| exit $errors; |