perl/getlinks.pl.in - third_party/curl - Git at Google

 #!@PERL@
 #
 # getlinks.pl
 #
 # This script extracts all links from a HTML page, compares them to a pattern
 # entered on the command line and then downloads matching links into the
 # target dir (also specified on the command line).
 #
 # Written to use 'curl' for URL fetching, uses the source file names in the
 # target directory.
 #
 # Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
 # Version: 0.1 Oct 7, 1998
 #
 # HISTORY
 #
 # 0.1 - Created now!
 #

 $in="";

  argv:
 if($ARGV[0] eq "-v" ) {
     $verbose = 1;
     shift @ARGV;
     goto argv;
 }
 if($ARGV[0] eq "-d" ) {
     $display = 1;
     shift @ARGV;
     goto argv;
 }
 elsif($ARGV[0] eq "-h" ) {
     $help = 1;
     shift @ARGV;
     goto argv;
 }

 $geturl = $ARGV[0];
 $getdir = $ARGV[1];
 $getregex = $ARGV[2];

 if(($geturl eq "") ||
    (($getdir eq "") && !$display) ||
    $help) {
     print  "Usage: $0 [-hv] <full source URL> <target dir> [regex]\n",
     " Use a traling slash for directory URLs!\n",
     " Use \"quotes\" around the regex!\n",
     " -h  This help text\n",
     " -d  Display matches only instead of downloading\n",
     " -v  Verbose mode\n";
     exit;
 }

 # change to target directory:
 chdir $getdir ||
     die "couldn't cd into $getdir";

 # This is necessary from where I tried this:
 #$proxy =" -x 194.237.142.41:80";

 # linkchecker, URL will be appended to the right of this command line
 # this is the one using HEAD:
 $linkcheck = "curl -s -m 20 -I$proxy";

 # as a second attempt, this will be used. This is not using HEAD but will
 # get the whole frigging document!
 $linkcheckfull = "curl -s -m 20 -i$proxy";

 # htmlget, URL will be appended to the right of this command line
 $htmlget = "curl -s$proxy";

 # urlget, URL will be appended to the right of this command line
 # this stores the file with the remote file name in the current dir
 $urlget = "curl -O -s$proxy";

 # Parse the input URL and split it into the relevant parts:

 sub SplitURL {
     my $inurl = $_[0];

     if($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)\/(.*)/ ) {
 	$getprotocol = $1;
 	$getserver = $2;
 	$getpath = $3;
 	$getdocument = $4;
     }
     elsif ($inurl=~ /^([^:]+):\/\/([^\/]*)\/(.*)/ ) {
 	$getprotocol = $1;
 	$getserver = $2;
 	$getpath = $3;
 	$getdocument = "";

 	if($getpath !~ /\//) {
 	    $getpath ="";
 	    $getdocument = $3;
 	}

     }
     elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
 	$getprotocol = $1;
 	$getserver = $2;
 	$getpath = "";
 	$getdocument = "";
     }
     else {
 	print "Couldn't parse the specified URL, retry please!\n";
 	exit;
     }
 }

 &SplitURL($geturl);

 #print "protocol = $getprotocol\n";
 #print "server = $getserver\n";
 #print "path = $getpath\n";
 #print "document = $getdocument\n";
 #exit;

 if(!$usestdin) {
     open(HEADGET, "$linkcheck $geturl|") ||
 	die "Couldn't get web page for some reason";
   headget:
     while(<HEADGET>) {
 #	print $_;
 	if($_ =~ /HTTP\/.*3\d\d /) {
 	    $pagemoved=1;
 	}
 	elsif($pagemoved &&
 	       ($_ =~ /^Location: (.*)/)) {
 	    $geturl = $1;

 	    &SplitURL($geturl);

 	    $pagemoved++;
 	    last headget;
 	}
     }
     close(HEADGET);

     if($pagemoved == 1) {
 	print "Page is moved but we don't know where. Did you forget the ",
 	"traling slash?\n";
 	exit;
     }

     open(WEBGET, "$htmlget $geturl|") ||
 	die "Couldn't get web page for some reason";

     while(<WEBGET>) {
 	$line = $_;
 	push @indoc, $line;
 	$line=~ s/\n//g;
 	$line=~ s/\r//g;
 #    print $line."\n";
 	$in=$in.$line;
     }

     close(WEBGET);
 }
 else {
     while(<STDIN>) {
 	$line = $_;
 	push @indoc, $line;
 	$line=~ s/\n//g;
 	$line=~ s/\r//g;
 	$in=$in.$line;
     }
 }

 sub GetLinks {
     my $in = $_[0];
     my @result;

   getlinkloop:
     while($in =~ /[^<]*(<[^>]+>)/g ) {
 	# we have a tag in $1
 	$tag = $1;

 	if($tag =~ /^<!--/) {
 	    # this is a comment tag, ignore it
 	}
 	else {
 	    if($tag =~ /(src|href|background|archive) *= *(\"[^\"]\"|[^ )>]*)/i) {
    	        $url=$2;
 		if($url =~ /^\"(.*)\"$/) {
 		    # this was a "string" now $1 has removed the quotes:
 		    $url=$1;
 		}


 		$url =~ s/([^\#]*)\#.*/$1/g;

 		if($url eq "") {
 		    # if the link was nothing than a #-link it may now have
 		    # been emptied completely so then we skip the rest
 		    next getlinkloop;
 		}

 		if($done{$url}) {
 		    # if this url already is done, do next
 		    $done{$url}++;
 		    next getlinkloop;
 		}

 		$done{$url} = 1; # this is "done"

 	        push @result, $url;
 		if($tag =~ /< *([^ ]+)/) {
 #		    print "TAG: $1\n";
 		    $tagtype{$url}=$1;
 		}
 	    }
         }
     }
     return @result;
 }

 @links = &GetLinks($in);

  linkloop:
 for(@links) {
     $url = $_;

     if($url =~ /^([^:]+):/) {
 	$link = $url;
     }
     else {
 	# this is an absolute link on the same server:
 	if($url =~ /^\//) {
 	    # from root
 	    $link = "$getprotocol://$getserver$url";
 	}
 	else {
 	    # from the scanned page's dir
 	    $nyurl=$url;

 	    if(length($getpath) &&
 	       ($getpath !~ /\/$/) &&
 	       ($nyurl !~ /^\//)) {
 		# lacks ending slash, add one to the document part:
 		$nyurl = "/".$nyurl;
 	    }
 	    $link = "$getprotocol://$getserver/$getpath$nyurl";
 	}
     }

     if($link =~ /$getregex/) {
 	if($display) {
 	    print "$link\n";
 	}
 	else {
 	    if($verbose) {
 		print "Gets $link\n";
 	    }
 	    print `$urlget $link`;
 	}
     }


 }
	#!@PERL@
	#
	# getlinks.pl
	#
	# This script extracts all links from a HTML page, compares them to a pattern
	# entered on the command line and then downloads matching links into the
	# target dir (also specified on the command line).
	#
	# Written to use 'curl' for URL fetching, uses the source file names in the
	# target directory.
	#
	# Author: Daniel Stenberg <Daniel.Stenberg@sth.frontec.se>
	# Version: 0.1 Oct 7, 1998
	#
	# HISTORY
	#
	# 0.1 - Created now!
	#

	$in="";

	argv:
	if($ARGV[0] eq "-v" ) {
	$verbose = 1;
	shift @ARGV;
	goto argv;
	}
	if($ARGV[0] eq "-d" ) {
	$display = 1;
	shift @ARGV;
	goto argv;
	}
	elsif($ARGV[0] eq "-h" ) {
	$help = 1;
	shift @ARGV;
	goto argv;
	}

	$geturl = $ARGV[0];
	$getdir = $ARGV[1];
	$getregex = $ARGV[2];

	if(($geturl eq "") \|\|
	(($getdir eq "") && !$display) \|\|
	$help) {
	print "Usage: $0 [-hv] <full source URL> <target dir> [regex]\n",
	" Use a traling slash for directory URLs!\n",
	" Use \"quotes\" around the regex!\n",
	" -h This help text\n",
	" -d Display matches only instead of downloading\n",
	" -v Verbose mode\n";
	exit;
	}

	# change to target directory:
	chdir $getdir \|\|
	die "couldn't cd into $getdir";

	# This is necessary from where I tried this:
	#$proxy =" -x 194.237.142.41:80";

	# linkchecker, URL will be appended to the right of this command line
	# this is the one using HEAD:
	$linkcheck = "curl -s -m 20 -I$proxy";

	# as a second attempt, this will be used. This is not using HEAD but will
	# get the whole frigging document!
	$linkcheckfull = "curl -s -m 20 -i$proxy";

	# htmlget, URL will be appended to the right of this command line
	$htmlget = "curl -s$proxy";

	# urlget, URL will be appended to the right of this command line
	# this stores the file with the remote file name in the current dir
	$urlget = "curl -O -s$proxy";

	# Parse the input URL and split it into the relevant parts:

	sub SplitURL {
	my $inurl = $_[0];

	if($inurl=~ /^([^:]+):\/\/([^\/])\/(.)\/(.*)/ ) {
	$getprotocol = $1;
	$getserver = $2;
	$getpath = $3;
	$getdocument = $4;
	}
	elsif ($inurl=~ /^([^:]+):\/\/([^\/])\/(.)/ ) {
	$getprotocol = $1;
	$getserver = $2;
	$getpath = $3;
	$getdocument = "";

	if($getpath !~ /\//) {
	$getpath ="";
	$getdocument = $3;
	}

	}
	elsif ($inurl=~ /^([^:]+):\/\/(.*)/ ) {
	$getprotocol = $1;
	$getserver = $2;
	$getpath = "";
	$getdocument = "";
	}
	else {
	print "Couldn't parse the specified URL, retry please!\n";
	exit;
	}
	}

	&SplitURL($geturl);

	#print "protocol = $getprotocol\n";
	#print "server = $getserver\n";
	#print "path = $getpath\n";
	#print "document = $getdocument\n";
	#exit;

	if(!$usestdin) {
	open(HEADGET, "$linkcheck $geturl\|") \|\|
	die "Couldn't get web page for some reason";
	headget:
	while(<HEADGET>) {
	# print $_;
	if($_ =~ /HTTP\/.*3\d\d /) {
	$pagemoved=1;
	}
	elsif($pagemoved &&
	($_ =~ /^Location: (.*)/)) {
	$geturl = $1;

	&SplitURL($geturl);

	$pagemoved++;
	last headget;
	}
	}
	close(HEADGET);

	if($pagemoved == 1) {
	print "Page is moved but we don't know where. Did you forget the ",
	"traling slash?\n";
	exit;
	}

	open(WEBGET, "$htmlget $geturl\|") \|\|
	die "Couldn't get web page for some reason";

	while(<WEBGET>) {
	$line = $_;
	push @indoc, $line;
	$line=~ s/\n//g;
	$line=~ s/\r//g;
	# print $line."\n";
	$in=$in.$line;
	}

	close(WEBGET);
	}
	else {
	while(<STDIN>) {
	$line = $_;
	push @indoc, $line;
	$line=~ s/\n//g;
	$line=~ s/\r//g;
	$in=$in.$line;
	}
	}

	sub GetLinks {
	my $in = $_[0];
	my @result;

	getlinkloop:
	while($in =~ /[^<]*(<[^>]+>)/g ) {
	# we have a tag in $1
	$tag = $1;

	if($tag =~ /^<!--/) {
	# this is a comment tag, ignore it
	}
	else {
	if($tag =~ /(src\|href\|background\|archive) = (\"[^\"]\"\|[^ )>]*)/i) {
	$url=$2;
	if($url =~ /^\"(.*)\"$/) {
	# this was a "string" now $1 has removed the quotes:
	$url=$1;
	}


	$url =~ s/([^\#])\#./$1/g;

	if($url eq "") {
	# if the link was nothing than a #-link it may now have
	# been emptied completely so then we skip the rest
	next getlinkloop;
	}

	if($done{$url}) {
	# if this url already is done, do next
	$done{$url}++;
	next getlinkloop;
	}

	$done{$url} = 1; # this is "done"

	push @result, $url;
	if($tag =~ /< *([^ ]+)/) {
	# print "TAG: $1\n";
	$tagtype{$url}=$1;
	}
	}
	}
	}
	return @result;
	}

	@links = &GetLinks($in);

	linkloop:
	for(@links) {
	$url = $_;

	if($url =~ /^([^:]+):/) {
	$link = $url;
	}
	else {
	# this is an absolute link on the same server:
	if($url =~ /^\//) {
	# from root
	$link = "$getprotocol://$getserver$url";
	}
	else {
	# from the scanned page's dir
	$nyurl=$url;

	if(length($getpath) &&
	($getpath !~ /\/$/) &&
	($nyurl !~ /^\//)) {
	# lacks ending slash, add one to the document part:
	$nyurl = "/".$nyurl;
	}
	$link = "$getprotocol://$getserver/$getpath$nyurl";
	}
	}

	if($link =~ /$getregex/) {
	if($display) {
	print "$link\n";
	}
	else {
	if($verbose) {
	print "Gets $link\n";
	}
	print `$urlget $link`;
	}
	}


	}