scripts/badwords - third_party/curl - Git at Google

 #!/usr/bin/env perl
 # Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
 #
 # SPDX-License-Identifier: curl
 #
 # bad[:=]correct
 #
 # If separator is '=', the string is compared case sensitively.
 # If separator is ':', the check is done case insensitively.
 #
 # To add white listed uses of bad words that are removed before checking for
 # the bad ones:
 #
 # ---(accepted word)
 # ---:[path]:(accepted word)
 #

 use strict;
 use warnings;

 use File::Basename;

 #
 ## States
 #
 # 0 - default, initial state
 # 1 - there was a slash
 # 2 - quoted string
 # 3 - // comment
 # 4 - /* comment
 # 5 - asterisk found within a /* comment
 # 6 - #include line (not used by srcline; include lines are tracked with flag 1)
 # 7 - backslash in a string
 #
 ## Flags
 #
 # 1 - include preprocessor line, ignore strings

 # Filter one line of C source so that only comments and string literals
 # remain; every other character is replaced by a space.
 #
 # Arguments:
 #   $state - parser state carried over from the previous line (see "States")
 #   $flags - flag bits carried over from the previous line (see "Flags")
 #   $l     - the raw source line, including its trailing newline if present
 #
 # Returns the list ($state, $flags, $line): the state and flags to pass in
 # for the next line, and the filtered line.
 sub srcline {
     my ($state, $flags, $l) = @_;
     my $line = "";

     # A // comment ends with its line, so the new line starts in the default
     # state. This has to happen before the #include check below: otherwise
     # an #include that directly follows a line ending in a // comment is not
     # recognized and its quoted filename is scanned like an ordinary string.
     if($state == 3) {
         $state = 0;
     }

     if(($state == 0) && ($l =~ /^ *\# *include/)) {
         # preprocessor include line
         $flags |= 1;
     }
     else {
         # not preprocessor
         $flags &= ~1;
     }

     my @c = split(//, $l);

     # state machine this line
     for my $c (@c) {
         if($state == 1) {
             # we had a slash
             if($c eq "/") {
                 # // confirmed, the rest of the line is a comment
                 $line .= "//";
                 $state = 3;
             }
             elsif($c eq "*") {
                 # /* confirmed
                 $state = 4;
                 $line .= "/*";
             }
             else {
                 # back to normal; two spaces stand in for the slash and
                 # for this character
                 $line .= "  ";
                 $state = 0;
             }
         }
         elsif($state == 2) {
             # a string
             if($c eq "\\") {
                 $line .= "\\";
                 $state = 7;
             }
             elsif($c eq "\"") {
                 # end of the string
                 $line .= "\"";
                 $state = 0;
             }
             else {
                 $line .= $c;
             }
         }
         elsif($state == 3) {
             # a // comment
             $line .= $c;
         }
         elsif($state == 4) {
             # a /* comment
             if($c eq "*") {
                 # could be a comment close; the asterisk is emitted once
                 # the next character is known
                 $state = 5;
             }
             else {
                 $line .= $c;
             }
         }
         elsif($state == 5) {
             if($c eq "/") {
                 # the /* comment ended here
                 $line .= "*/";
                 $state = 0;
             }
             else {
                 # the /* comment continues
                 $line .= "*$c";
                 $state = 4;
             }
         }
         elsif($state == 7) {
             # the prev was a backslash in a string
             $line .= $c;
             # switch back to normal string
             $state = 2;
         }
         else {
             if($c eq "/") {
                 $state = 1; # got a slash
             }
             elsif(($c eq "\"") && !($flags & 1)) {
                 # start of a string, not within a preprocessor line
                 $line .= "\"";
                 $state = 2;
             }
             elsif($c eq "\n") {
                 $line .= "\n";
             }
             else {
                 $line .= " ";
             }
         }
     }
     return $state, $flags, $line;
 }

 # Read the C source file $f and return its lines, each one filtered through
 # srcline() with the parser state and flags carried from line to line.
 #
 # If the file cannot be opened, a warning is printed and an empty list is
 # returned, so the caller simply finds nothing to check in that file.
 sub sourcecode {
     my ($f) = @_;
     my $state = 0;
     my $flags = 0;
     my @lines;
     my $line;
     # Three-argument open keeps characters in the filename (leading '>',
     # '|', surrounding spaces) from being interpreted as an open mode.
     open(my $fh, '<', $f) or do {
         warn "Cannot open '$f': $!\n";
         return @lines;
     };
     while(my $l = <$fh>) {
         ($state, $flags, $line) = srcline($state, $flags, $l);
         push @lines, $line;
     }
     close($fh);
     return @lines;
 }

 # Regex fragments whose matches are removed from a line before it is
 # checked. "---(accepted word)" config entries are appended to this list.
 my @whitelist = (
     # ignore what looks like URLs
     '(^|\W)((https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&\'\(\)*+,;=]+)',
     # remove bolded sections
     '\*\*.*?\*\*',
     # remove backticked texts
     '\`.*?\`'
     );
 # bad word => suggested replacement; case-insensitive words are stored
 # under their lowercased form
 my %alt;
 my %exactcase;

 # "path:word" keys (word lowercased) for uses accepted in a specific path
 my %wl;

 # bad words to match case-insensitively
 my @w;
 # bad words to match case-sensitively
 my @exact;

 # The first argument is the config file; any remaining arguments are left
 # in @ARGV.
 my $file = shift @ARGV;
 if(!defined $file) {
     die "Usage: $0 <config file> [file masks ...]\n";
 }
 open(my $config, '<', $file) or die "Cannot open '$file': $!";
 while(my $entry = <$config>) {
     chomp $entry;
     if($entry =~ /^#/) {
         next;
     }
     if($entry =~ /^---:([^:]*):(.*)/) {
         # whitelist file + word
         my ($path, $word) = ($1, lc($2));
         $wl{$path . ":" . $word} = 1;
     }
     elsif($entry =~ /^---(.+)/) {
         # whitelist word
         push @whitelist, $1;
     }
     elsif($entry =~ /^(.*)([:=])(.*)/) {
         my ($bad, $sep, $better) = ($1, $2, $3);
         if($bad eq "") {
             # An empty alternative would make the combined regex match at
             # every word boundary and flag every line.
             warn sprintf("%s:%d: ignoring entry without a bad word\n",
                          $file, $.);
             next;
         }
         if($sep eq "=") {
             $alt{$bad} = $better;
             push @exact, $bad;
         }
         else {
             $alt{lc($bad)} = $better;
             push @w, $bad;
         }
     }
 }
 close($config);

 # Build a single combined regex for case-insensitive words
 my $re_ci;
 if(@w) {
     my $pat = join('|', map { quotemeta($_) } @w);
     $re_ci = qr/\b($pat)\b/i;
 }

 # Build a single combined regex for case-sensitive (exact) words
 my $re_cs;
 if(@exact) {
     my $pat = join('|', map { quotemeta($_) } @exact);
     $re_cs = qr/\b($pat)\b/;
 }

 # Build a single combined regex for removing whitelisted content. The
 # entries are used as regex source and are deliberately not quoted.
 my $pat = join('|', @whitelist);
 my $re_wl = qr/($pat)/;

 # number of bad words reported so far
 my $errors = 0;

 # Report one bad word on STDERR and count it in $errors, unless that word
 # is accepted for the file's location in %wl.
 #
 # Arguments:
 #   $p      - the text on the line that precedes the bad word
 #   $w      - the bad word as it appears on the line
 #   $in     - the line that was checked
 #   $f      - the file name
 #   $l      - the line number
 #   $lookup - the key into %alt for the suggested replacement
 sub highlight {
     my ($p, $w, $in, $f, $l, $lookup) = @_;

     my $c = length($p)+1;
     my $lcword = lc($w);

     my $dir = dirname($f);
     if($wl{$dir . "/" . ":" . $lcword}) {
         # whitelisted dirname + word
         return;
     }
     my $updir = dirname($dir);
     if($dir ne $updir) {
         if($wl{$updir . "/" . ":" . $lcword}) {
             # whitelisted upper dirname + word
             return;
         }
     }
     if($wl{$f . ":" . $lcword}) {
         # whitelisted filename + word
         return;
     }

     print STDERR  "$f:$l:$c: error: found bad word \"$w\"\n";
     printf STDERR " %4d | %s\n", $l, $in;
     # Pad with an empty string: "%*s" given " " prints one space even when
     # the width is 0, which put the caret one column too far to the right
     # for a word at the very start of the line.
     printf STDERR "      | %*s^%s\n", length($p), "",
         "~" x (length($w)-1);
     printf STDERR " maybe use \"%s\" instead?\n", $alt{$lookup};
     $errors++;
 }

 # Read the file $f and return all of its lines unmodified (trailing
 # newlines included).
 #
 # If the file cannot be opened, a warning is printed and an empty list is
 # returned.
 sub document {
     my ($f) = @_;
     my @lines;
     # Three-argument open keeps characters in the filename (leading '>',
     # '|', surrounding spaces) from being interpreted as an open mode.
     open(my $fh, '<', $f) or do {
         warn "Cannot open '$f': $!\n";
         return @lines;
     };
     @lines = <$fh>;
     close($fh);
     return @lines;
 }

 # Check every line of the file $f for bad words and hand each match to
 # highlight().
 #
 # Files ending in .c or .h are read with sourcecode(); all other files are
 # read with document() and treated as markdown, where lines indented by
 # four or more spaces are skipped.
 sub file {
     my ($f) = @_;
     my $l = 0;

     my $source_code = ($f =~ /\.[ch]$/) ? 1 : 0;
     # markdown is everything that is not source code
     my $skip_indented = $source_code ? 0 : 1;

     my @lines = $source_code ? sourcecode($f) : document($f);

     for my $in (@lines) {
         $l++;
         chomp $in;
         if($skip_indented && $in =~ /^    /) {
             next;
         }
         # Remove the link part. The matches are non-greedy so that each
         # link is stripped on its own; greedy matches could swallow
         # ordinary text between a link and a later parenthesis.
         $in =~ s/(\[.*?\])\(.*?\)/$1/g;
         # remove whitelisted patterns (pre-compiled)
         if($re_wl) {
             $in =~ s/${re_wl}//ig;
         }
         # Case-insensitive bad words. Every match on the line is reported:
         # looking only at the last one let a word that is whitelisted for
         # this path hide a real bad word earlier on the same line.
         if($re_ci) {
             while($in =~ /$re_ci/g) {
                 # copy the match data before calling out
                 my ($word, $start) = ($1, $-[1]);
                 my $prefix = substr($in, 0, $start);
                 highlight($prefix, $word, $in, $f, $l, lc($word));
             }
         }
         # case-sensitive (exact) bad words, every match on the line
         if($re_cs) {
             while($in =~ /$re_cs/g) {
                 my ($word, $start) = ($1, $-[1]);
                 my $prefix = substr($in, 0, $start);
                 highlight($prefix, $word, $in, $f, $l, $word);
             }
         }
     }
 }

 # The remaining arguments are passed to "git ls-files" to select the files
 # to check; the config file itself is excluded.
 my @filemasks = @ARGV;
 open(my $git_ls_files, '-|', 'git', 'ls-files', '--', ":!:$file", @filemasks) or die "Failed running git ls-files: $!";
 my @files;
 while(my $each = <$git_ls_files>) {
     chomp $each;
     push @files, $each;
 }
 # close() on a pipe returns false when the command exited non-zero. Without
 # this check a failing git made the script check no files and exit 0.
 close($git_ls_files) or
     die "git ls-files failed: " . ($! ? $! : "exit status " . ($? >> 8)) . "\n";

 my $onum = scalar(@files);
 my $num;
 for my $e (@files) {
     #printf STDERR "Complete: %d%%\r", $num++ * 100 / $onum;
     file($e);
 }

 # Cap the exit status at 255: exit statuses wrap modulo 256, so exactly 256
 # reported words would otherwise look like success.
 exit($errors > 255 ? 255 : $errors);
	#!/usr/bin/env perl
	# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
	#
	# SPDX-License-Identifier: curl
	#
	# bad[:=]correct
	#
	# If separator is '=', the string is compared case sensitively.
	# If separator is ':', the check is done case insensitively.
	#
	# To add white listed uses of bad words that are removed before checking for
	# the bad ones:
	#
	# ---(accepted word)
	# ---:[path]:(accepted word)
	#

	use strict;
	use warnings;

	use File::Basename;

	#
	## States
	#
	# 0 - default, initial state
	# 1 - there was a slash
	# 2 - quoted string
	# 3 - // comment
	# 4 - /* comment
	# 5 - asterisk found within a /* comment
	# 6 - #include line (not used by srcline; include lines are tracked with flag 1)
	# 7 - backslash in a string
	#
	## Flags
	#
	# 1 - include preprocessor line, ignore strings

	# Filter one line of C source so that only comments and string literals
	# remain; every other character is replaced by a space.
	#
	# Arguments:
	#   $state - parser state carried over from the previous line (see "States")
	#   $flags - flag bits carried over from the previous line (see "Flags")
	#   $l     - the raw source line, including its trailing newline if present
	#
	# Returns the list ($state, $flags, $line): the state and flags to pass in
	# for the next line, and the filtered line.
	sub srcline {
	    my ($state, $flags, $l) = @_;
	    my $line = "";

	    # A // comment ends with its line, so the new line starts in the default
	    # state. This has to happen before the #include check below: otherwise
	    # an #include that directly follows a line ending in a // comment is not
	    # recognized and its quoted filename is scanned like an ordinary string.
	    if($state == 3) {
	        $state = 0;
	    }

	    if(($state == 0) && ($l =~ /^ *\# *include/)) {
	        # preprocessor include line
	        $flags |= 1;
	    }
	    else {
	        # not preprocessor
	        $flags &= ~1;
	    }

	    my @c = split(//, $l);

	    # state machine this line
	    for my $c (@c) {
	        if($state == 1) {
	            # we had a slash
	            if($c eq "/") {
	                # // confirmed, the rest of the line is a comment
	                $line .= "//";
	                $state = 3;
	            }
	            elsif($c eq "*") {
	                # /* confirmed
	                $state = 4;
	                $line .= "/*";
	            }
	            else {
	                # back to normal; two spaces stand in for the slash and
	                # for this character
	                $line .= "  ";
	                $state = 0;
	            }
	        }
	        elsif($state == 2) {
	            # a string
	            if($c eq "\\") {
	                $line .= "\\";
	                $state = 7;
	            }
	            elsif($c eq "\"") {
	                # end of the string
	                $line .= "\"";
	                $state = 0;
	            }
	            else {
	                $line .= $c;
	            }
	        }
	        elsif($state == 3) {
	            # a // comment
	            $line .= $c;
	        }
	        elsif($state == 4) {
	            # a /* comment
	            if($c eq "*") {
	                # could be a comment close; the asterisk is emitted once
	                # the next character is known
	                $state = 5;
	            }
	            else {
	                $line .= $c;
	            }
	        }
	        elsif($state == 5) {
	            if($c eq "/") {
	                # the /* comment ended here
	                $line .= "*/";
	                $state = 0;
	            }
	            else {
	                # the /* comment continues
	                $line .= "*$c";
	                $state = 4;
	            }
	        }
	        elsif($state == 7) {
	            # the prev was a backslash in a string
	            $line .= $c;
	            # switch back to normal string
	            $state = 2;
	        }
	        else {
	            if($c eq "/") {
	                $state = 1; # got a slash
	            }
	            elsif(($c eq "\"") && !($flags & 1)) {
	                # start of a string, not within a preprocessor line
	                $line .= "\"";
	                $state = 2;
	            }
	            elsif($c eq "\n") {
	                $line .= "\n";
	            }
	            else {
	                $line .= " ";
	            }
	        }
	    }
	    return $state, $flags, $line;
	}

	# Read the C source file $f and return its lines, each one filtered through
	# srcline() with the parser state and flags carried from line to line.
	#
	# If the file cannot be opened, a warning is printed and an empty list is
	# returned, so the caller simply finds nothing to check in that file.
	sub sourcecode {
	    my ($f) = @_;
	    my $state = 0;
	    my $flags = 0;
	    my @lines;
	    my $line;
	    # Three-argument open keeps characters in the filename (leading '>',
	    # '|', surrounding spaces) from being interpreted as an open mode.
	    open(my $fh, '<', $f) or do {
	        warn "Cannot open '$f': $!\n";
	        return @lines;
	    };
	    while(my $l = <$fh>) {
	        ($state, $flags, $line) = srcline($state, $flags, $l);
	        push @lines, $line;
	    }
	    close($fh);
	    return @lines;
	}

	# Regex fragments whose matches are removed from a line before it is
	# checked. "---(accepted word)" config entries are appended to this list.
	my @whitelist = (
	    # ignore what looks like URLs
	    '(^|\W)((https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&\'\(\)*+,;=]+)',
	    # remove bolded sections
	    '\*\*.*?\*\*',
	    # remove backticked texts
	    '\`.*?\`'
	    );
	# bad word => suggested replacement; case-insensitive words are stored
	# under their lowercased form
	my %alt;
	my %exactcase;

	# "path:word" keys (word lowercased) for uses accepted in a specific path
	my %wl;

	# bad words to match case-insensitively
	my @w;
	# bad words to match case-sensitively
	my @exact;

	# The first argument is the config file; any remaining arguments are left
	# in @ARGV.
	my $file = shift @ARGV;
	if(!defined $file) {
	    die "Usage: $0 <config file> [file masks ...]\n";
	}
	open(my $config, '<', $file) or die "Cannot open '$file': $!";
	while(my $entry = <$config>) {
	    chomp $entry;
	    if($entry =~ /^#/) {
	        next;
	    }
	    if($entry =~ /^---:([^:]*):(.*)/) {
	        # whitelist file + word
	        my ($path, $word) = ($1, lc($2));
	        $wl{$path . ":" . $word} = 1;
	    }
	    elsif($entry =~ /^---(.+)/) {
	        # whitelist word
	        push @whitelist, $1;
	    }
	    elsif($entry =~ /^(.*)([:=])(.*)/) {
	        my ($bad, $sep, $better) = ($1, $2, $3);
	        if($bad eq "") {
	            # An empty alternative would make the combined regex match at
	            # every word boundary and flag every line.
	            warn sprintf("%s:%d: ignoring entry without a bad word\n",
	                         $file, $.);
	            next;
	        }
	        if($sep eq "=") {
	            $alt{$bad} = $better;
	            push @exact, $bad;
	        }
	        else {
	            $alt{lc($bad)} = $better;
	            push @w, $bad;
	        }
	    }
	}
	close($config);

	# Build a single combined regex for case-insensitive words
	my $re_ci;
	if(@w) {
	    my $pat = join('|', map { quotemeta($_) } @w);
	    $re_ci = qr/\b($pat)\b/i;
	}

	# Build a single combined regex for case-sensitive (exact) words
	my $re_cs;
	if(@exact) {
	    my $pat = join('|', map { quotemeta($_) } @exact);
	    $re_cs = qr/\b($pat)\b/;
	}

	# Build a single combined regex for removing whitelisted content. The
	# entries are used as regex source and are deliberately not quoted.
	my $pat = join('|', @whitelist);
	my $re_wl = qr/($pat)/;

	# number of bad words reported so far
	my $errors = 0;

	# Report one bad word on STDERR and count it in $errors, unless that word
	# is accepted for the file's location in %wl.
	#
	# Arguments:
	#   $p      - the text on the line that precedes the bad word
	#   $w      - the bad word as it appears on the line
	#   $in     - the line that was checked
	#   $f      - the file name
	#   $l      - the line number
	#   $lookup - the key into %alt for the suggested replacement
	sub highlight {
	    my ($p, $w, $in, $f, $l, $lookup) = @_;

	    my $c = length($p)+1;
	    my $lcword = lc($w);

	    my $dir = dirname($f);
	    if($wl{$dir . "/" . ":" . $lcword}) {
	        # whitelisted dirname + word
	        return;
	    }
	    my $updir = dirname($dir);
	    if($dir ne $updir) {
	        if($wl{$updir . "/" . ":" . $lcword}) {
	            # whitelisted upper dirname + word
	            return;
	        }
	    }
	    if($wl{$f . ":" . $lcword}) {
	        # whitelisted filename + word
	        return;
	    }

	    print STDERR  "$f:$l:$c: error: found bad word \"$w\"\n";
	    printf STDERR " %4d | %s\n", $l, $in;
	    # The gutter below is as wide as the " %4d | " prefix above so the
	    # caret lines up with the word. Pad with an empty string: "%*s" given
	    # " " prints one space even when the width is 0, which put the caret
	    # one column too far right for a word at the very start of the line.
	    printf STDERR "      | %*s^%s\n", length($p), "",
	        "~" x (length($w)-1);
	    printf STDERR " maybe use \"%s\" instead?\n", $alt{$lookup};
	    $errors++;
	}

	# Read the file $f and return all of its lines unmodified (trailing
	# newlines included).
	#
	# If the file cannot be opened, a warning is printed and an empty list is
	# returned.
	sub document {
	    my ($f) = @_;
	    my @lines;
	    # Three-argument open keeps characters in the filename (leading '>',
	    # '|', surrounding spaces) from being interpreted as an open mode.
	    open(my $fh, '<', $f) or do {
	        warn "Cannot open '$f': $!\n";
	        return @lines;
	    };
	    @lines = <$fh>;
	    close($fh);
	    return @lines;
	}

	# Check every line of the file $f for bad words and hand each match to
	# highlight().
	#
	# Files ending in .c or .h are read with sourcecode(); all other files are
	# read with document() and treated as markdown, where lines indented by
	# four or more spaces are skipped.
	sub file {
	    my ($f) = @_;
	    my $l = 0;

	    my $source_code = ($f =~ /\.[ch]$/) ? 1 : 0;
	    # markdown is everything that is not source code
	    my $skip_indented = $source_code ? 0 : 1;

	    my @lines = $source_code ? sourcecode($f) : document($f);

	    for my $in (@lines) {
	        $l++;
	        chomp $in;
	        if($skip_indented && $in =~ /^    /) {
	            next;
	        }
	        # Remove the link part. The matches are non-greedy so that each
	        # link is stripped on its own; greedy matches could swallow
	        # ordinary text between a link and a later parenthesis.
	        $in =~ s/(\[.*?\])\(.*?\)/$1/g;
	        # remove whitelisted patterns (pre-compiled)
	        if($re_wl) {
	            $in =~ s/${re_wl}//ig;
	        }
	        # Case-insensitive bad words. Every match on the line is reported:
	        # looking only at the last one let a word that is whitelisted for
	        # this path hide a real bad word earlier on the same line.
	        if($re_ci) {
	            while($in =~ /$re_ci/g) {
	                # copy the match data before calling out
	                my ($word, $start) = ($1, $-[1]);
	                my $prefix = substr($in, 0, $start);
	                highlight($prefix, $word, $in, $f, $l, lc($word));
	            }
	        }
	        # case-sensitive (exact) bad words, every match on the line
	        if($re_cs) {
	            while($in =~ /$re_cs/g) {
	                my ($word, $start) = ($1, $-[1]);
	                my $prefix = substr($in, 0, $start);
	                highlight($prefix, $word, $in, $f, $l, $word);
	            }
	        }
	    }
	}

	# The remaining arguments are passed to "git ls-files" to select the files
	# to check; the config file itself is excluded.
	my @filemasks = @ARGV;
	open(my $git_ls_files, '-|', 'git', 'ls-files', '--', ":!:$file", @filemasks) or die "Failed running git ls-files: $!";
	my @files;
	while(my $each = <$git_ls_files>) {
	    chomp $each;
	    push @files, $each;
	}
	# close() on a pipe returns false when the command exited non-zero. Without
	# this check a failing git made the script check no files and exit 0.
	close($git_ls_files) or
	    die "git ls-files failed: " . ($! ? $! : "exit status " . ($? >> 8)) . "\n";

	my $onum = scalar(@files);
	my $num;
	for my $e (@files) {
	    #printf STDERR "Complete: %d%%\r", $num++ * 100 / $onum;
	    file($e);
	}

	# Cap the exit status at 255: exit statuses wrap modulo 256, so exactly 256
	# reported words would otherwise look like success.
	exit($errors > 255 ? 255 : $errors);