#GPL
#GPL  libwhisker copyright 2000-2004 by rfp.labs
#GPL
#GPL  This program is free software; you can redistribute it and/or
#GPL  modify it under the terms of the GNU General Public License
#GPL  as published by the Free Software Foundation; either version 2
#GPL  of the License, or (at your option) any later version.
#GPL
#GPL  This program is distributed in the hope that it will be useful,
#GPL  but WITHOUT ANY WARRANTY; without even the implied warranty of
#GPL  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GPL  GNU General Public License for more details.
#GPL

#####################################################

# cluster global variables
%_crawl_config=('save_cookies'	=> 0,
		'reuse_cookies'	=> 1,
		'save_offsites'	=> 0,
		'save_non_http'	=> 0,
		'follow_moves'	=> 1,
		'url_limit'	=> 1000,
		'use_params'	=> 0,
		'params_double_record' => 0,
		'skip_ext'	=> { 	gif=>1, jpg=>1, png=>1, gz=>1, 
					swf=>1, pdf=>1, zip=>1, wav=>1, 
					mp3=>1, asf=>1, tgz=>1 },
		'save_skipped'	=> 0,
		'save_referrers'=> 0,
		'do_head'	=> 0,
		'callback'	=> 0,
		'netloc_bug'	=> 1,
		'normalize_uri'	=> 1,
		'source_callback' => 0
);

%_crawl_linktags = (
	 'a'       => 'href',
	 'applet'  => [qw(codebase archive code)],
	 'area'    => 'href',
	 'base'    => 'href',
	 'bgsound' => 'src',
	 'blockquote' => 'cite',
	 'body'    => 'background',
	 'del'     => 'cite',
	 'embed'   => [qw(src pluginspage)],
	 'form'    => 'action',
	 'frame'   => [qw(src longdesc)],
	 'iframe'  => [qw(src longdesc)],
	 'ilayer'  => 'background',
	 'img'     => [qw(src lowsrc longdesc usemap)],
	 'input'   => [qw(src usemap)],
	 'ins'     => 'cite',
	 'isindex' => 'action',
	 'head'    => 'profile',
	 'layer'   => [qw(background src)],
	 'link'    => 'href',
#	 'meta'    => 'http-equiv',
	 'object'  => [qw(codebase data archive usemap)],
	 'q'       => 'cite',
	 'script'  => 'src',
	 'table'   => 'background',
	 'td'      => 'background',
	 'th'      => 'background',
	 'xmp'     => 'href',
);

#####################################################

=item B<crawl_new>

Params: $START, $MAX_DEPTH, \%request_hash [, \%tracking_hash ]

Return: $crawl_object

The crawl_new() functions initializes a crawl object (hash) to the default
values, and then returns it for later use by crawl().  $START is the starting
URL (in the form of 'http://www.host.com/url'), and MAX_DEPTH is the maximum
number of levels to crawl (the START URL counts as 1, so a value of 2 will
crawl the START URL and all URLs found on that page).  The request_hash
is a standard initialized request hash to be used for requests; you should
set any authentication information or headers in this hash in order for
the crawler to use them.  The optional tracking_hash lets you supply a
hash for use in tracking URL results (otherwise crawl_new() will allocate
a new anon hash).

=cut

sub crawl_new {
	my ($start, $depth, $reqref, $trackref)=@_;
	my %X;

	return undef if(!defined $start    || !defined $depth);
	return undef if(!defined $reqref   || !ref($reqref));
	$trackref={} if(!defined $trackref || !ref($trackref));

	$X{track}	=$trackref;
	$X{request}	=$reqref;
	$X{depth}	=$depth||2;
	$X{start}	=$start;
	$X{magic}	=7340;

	$X{reset}= sub {
		$X{errors}	=[]; # all errors encountered
		$X{urls}	=[]; # temp; used to hold all URLs on page
		$X{server_tags}	={}; # all server tags found
		$X{referrers}	={}; # who refers to what URLs
		$X{offsites}	={}; # all URLs that point offsite
		$X{response}	={}; # temp; the response hash
		$X{non_http}	={}; # all non_http URLs found
		$X{cookies}	={}; # all cookies found
		$X{forms}	={}; # all forms found
		$X{jar}		={}; # temp; cookie jar

		$X{config}	={};
		%{ $X{config} } = %_crawl_config;

		%{ $X{track} }		=();
		$X{parsed_page_count}	=0;
	};

	$X{crawl}= sub { crawl(\%X, @_) };
	$X{reset}->();

	return \%X;
}

#####################################################

=item B<crawl>

Params: $crawl_object [, $START, $MAX_DEPTH ]

Return: $count [ undef on error ] 

The heart of the crawl package.  Will perform an HTTP crawl on the
specified HOST, starting at START URI, proceeding up to MAX_DEPTH. 

Crawl_object needs to be the variable returned by crawl_new().  You can
also indirectly call crawl() via the crawl_object itself:

	$crawl_object->{crawl}->($START,$MAX_DEPTH)

Returns the number of URLs actually crawled (not including those skipped).

=cut

{ # START OF CRAWL CONTAINER

sub crawl {
 my ($C, $START, $MAX_DEPTH)=@_;
 return undef if(!defined $C || !ref($C) || $C->{magic}!=7340);

 # shortcuts, to reduce dereferences and typing
 my $CONFIG   = $C->{config};
 my $TRACK    = $C->{track}; 
 my $URLS     = $C->{urls};
 my $RESP     = $C->{response};
 my $REQ      = $C->{request};
 $START     ||= $C->{start};
 $C->{depth}  = $MAX_DEPTH || $C->{depth};

 my ($COUNT, $T, @ST, @url_queue)=(0, '');
 # ST[] = [ 0.HOST, 1.PORT, 2.URL, 3.DEPTH, 4.CWD ]

 my @v=uri_split($START);

 my $error = undef;
 $error = 'Start protocol not http or https'
	if($v[1] ne 'http' && $v[1] ne 'https');
 $error = 'Bad start host' if(!defined $v[2] || $v[2] eq '');
 push ( @{ $C->{errors} }, $error ) && return undef if(defined $error);

 @ST = ( $v[2], $v[3], $v[0], 1, '' );

 $REQ->{whisker}->{ssl} = 1 if($v[1] eq 'https');
 $REQ->{whisker}->{host}=$ST[0];
 $REQ->{whisker}->{port}=$ST[1];
 $REQ->{whisker}->{lowercase_incoming_headers}=1;
 $REQ->{whisker}->{ignore_duplicate_headers}  =0;
 delete $REQ->{whisker}->{parameters};
 http_fixup_request($REQ);

 push @url_queue, \@ST;

 while(@url_queue){
  @ST = @{ shift @url_queue };

  next if(defined $TRACK->{$ST[2]} && $TRACK->{$ST[2]} ne '?');
  if($ST[3] > $C->{depth}){
	$TRACK->{$ST[2]}='?' if($CONFIG->{save_skipped}>0);
	next; }

  $ST[4]=uri_get_dir($ST[2]);
  $REQ->{whisker}->{uri}=$ST[2];

  my $result = _crawl_do_request($REQ,$RESP,$C);
  if($result==1 || $result==2){
	push @{ $C->{errors} }, "$ST[2]: $RESP->{whisker}->{error}";
	next; }

  $COUNT++;
  $TRACK->{$ST[2]}=$RESP->{whisker}->{code} if($result==0 || $result==4);
  $TRACK->{$ST[2]}='?' if( ($result==3||$result==5) && $CONFIG->{save_skipped}>0);
  
  if(defined $RESP->{server} && !ref($RESP->{server})){ 
	$C->{server_tags}->{ $RESP->{server} }++; }

  if(defined $RESP->{'set-cookie'}){
		if($CONFIG->{save_cookies}>0){
			if(ref($RESP->{'set-cookie'})){
				$C->{cookies}->{$_}++ foreach (@{$RESP->{'set-cookie'}});
			} else {
				$C->{cookies}->{$RESP->{'set-cookie'}}++; 
		}	}
		cookie_read($C->{jar}, $RESP) if($CONFIG->{reuse_cookies}>0);
  }

  next if($result==4 || $result==5);  
  next if(scalar @url_queue > $CONFIG->{url_limit});

  if($result==0){ # page should be parsed
	if($CONFIG->{source_callback} != 0  &&
		ref($CONFIG->{source_callback}) eq 'CODE'){
		&{$CONFIG->{source_callback}}($C); }

	html_find_tags(\$RESP->{whisker}->{data},
		\&_crawl_extract_links_test,0,$C,\%_crawl_linktags);
	$C->{parsed_page_count}++;
  }

  push @$URLS, $RESP->{location} if($result==3);

  foreach $T (@$URLS){
	 $T=~tr/\0\r\n//d;
	 next if (length($T)==0);
	 next if ($T=~/^#/i); # fragment

	 push @{$C->{referrers}->{$T}}, $ST[2] if($CONFIG->{save_referrers}>0);

	 if($T=~/^([a-zA-Z0-9]*):/ && lc($1) ne 'http' && lc($1) ne 'https'){
		push @{$C->{non_http}->{$T}}, $ST[2] if($CONFIG->{save_non_http}>0);
		next;
	 }

	 if( substr($T,0,2) eq '//' && $CONFIG->{netloc_bug}>0 ){
		if( $REQ->{whisker}->{ssl}>0 ){	$T='https:'.$T; }
		else {				$T='http:'.$T;  }
	 }
	 if($CONFIG->{callback} != 0){
		next if &{$CONFIG->{callback}}($T,$C); }

	 $T=uri_absolute( $T, $ST[4], $CONFIG->{normalize_uri} );

	 # (uri,protocol,host,port,params,frag,user,pass)
	 @v=uri_split($T);

	 # make sure URL is on same host and port
	 if( (defined $v[2] && $v[2] ne $ST[0]) || 
			($v[3]>0 && $v[3] != $ST[1])){
		$C->{offsites}->{uri_join(@v)}++ if($CONFIG->{save_offsites}>0);
		next; }

	 next if(defined $TRACK->{$v[0]}); # we've processed this already

	 if($v[0]=~/\.([a-z0-9]+)$/i){
		if(defined $CONFIG->{skip_ext}->{lc($1)}){
			$TRACK->{$v[0]}='?' if($CONFIG->{save_skipped}>0);
			next;
		}
	 }

	 if(defined $v[4] && $CONFIG->{use_params}>0){
		$TRACK->{$v[0]}='?' if($CONFIG->{params_double_record}>0 &&
			!defined $TRACK->{$v[0]});
		$v[0]=$v[0].'?'.$v[4];	
	 }

	 # ST[] = [ 0.HOST, 1.PORT, 2.URL, 3.DEPTH, 4.CWD ]
	 push @url_queue, [ $ST[0], $ST[1], $v[0], $ST[3]+1, '' ];
  } # foreach

  @$URLS=(); # reset for next round
 } # while

 return $COUNT;
} # end sub crawl

#####################################################

sub _crawl_extract_links_test {
	my ($TAG, $hr, $dr, $start, $len, $OBJ)=(lc(shift),@_);

	return undef if(!scalar %$hr); # fastpath quickie

	# we know this is defined, due to our tagmap
	my $t=$_crawl_linktags{$TAG};

	while( my ($key,$val)= each %$hr){ # normalize element values
		$$hr{lc($key)} = $val;
	}

	# all of this just to catch meta refresh URLs
	if($TAG eq 'meta' && defined $$hr{'http-equiv'} 
			&& $$hr{'http-equiv'} eq 'refresh' 
			&& defined $$hr{'content'}
			&& $$hr{'content'}=~m/url=(.+)/i ){
		push(@{$OBJ->{urls}},$1);

	} elsif(ref($t)){
		foreach (@$t){
			push(@{$OBJ->{urls}},$$hr{$_}) if(defined $$hr{$_});
		}
	} else {
		push(@{$OBJ->{urls}},$$hr{$t}) if(defined $$hr{$t});
	}

	if($TAG eq 'form' && defined $$hr{action}){
		my $u=$OBJ->{response}->{whisker}->{uri};
		$OBJ->{forms}->{ uri_absolute($$hr{action},$u,1) }++;
	}

	return undef;
}

################################################################

sub _crawl_do_request_ex {
	my ($hrin,$hrout,$OBJ)=@_;
	my $ret;

	$ret=http_do_request($hrin,$hrout);

	return (2,$ret) if($ret==2); # if there was connection error, do not continue
	if($ret==0){ # successful request

		# WARNING: what if *all* HEAD respones are 302'd on purpose, but
		#          all GETs are normal?
		if($$hrout{whisker}->{code} <308 &&
				$$hrout{whisker}->{code} >300){
			if($OBJ->{config}->{follow_moves} >0){
				return (3,$ret) if(defined $$hrout{location} &&
					!ref($$hrout{location})); }
			return (5,$ret); # not avail
		}

		if($$hrout{whisker}->{code}==200){
			# no content-type is treated as text/htm
			if(defined $$hrout{'content-type'} &&
					$$hrout{'content-type'}!~/^text\/htm/i){
				return (4,$ret);
			}		
		}
    	}
	return (-1,$ret); # fallthrough
}

################################################################

sub _crawl_do_request {
 my ($hrin,$hrout,$OBJ) = @_;
 my ($cret,$lwret);

 if($OBJ->{config}->{do_head} && $$hrin{whisker}->{method} ne 'HEAD'){  
	my $save=$$hrin{whisker}->{method};
	$$hrin{whisker}->{method}='HEAD';
	($cret,$lwret)=_crawl_do_request_ex($hrin,$hrout,$OBJ);
	$$hrin{whisker}->{method}=$save;

	return $cret if($cret > 0);

	if($lwret==0){ # successful request
	    	if($$hrout{whisker}->{code}==501){ # HEAD not allowed
    			$OBJ->{config}->{do_head}=0; # no more HEAD requests
	    	}
	}
	# request errors are essentially redone via GET, below
  }

 ($cret,$lwret)=_crawl_do_request_ex($hrin,$hrout,$OBJ);
 return $lwret if($cret < 0);
 return $cret;
}

} # CRAWL_CONTAINER

################################################################