# NOPOD NOTICE: the documentation and whitespace have been stripped # from this file in order to reduce filesize. # #!/usr/bin/perl # LW2 version 2.4 # # LW2 copyright 2000-2006 by rain forest puppy, rfp.labs # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # package LW2; $LW2::VERSION="2.4"; $PACKAGE='LW2'; BEGIN { package LW2; $PACKAGE='LW2'; $LW_SSL_LIB = 0; $LW_SSL_KEEPALIVE = 0; $LW_NONBLOCK_CONNECT = 1; $_SSL_LIBRARY = undef; eval "use Socket"; if ( !$@ ) { eval "use Net::SSLeay"; # do we have SSL support? if ( !$@ ) { $LW_SSL_LIB = 1; $_SSL_LIBRARY = 'Net::SSLeay'; Net::SSLeay::load_error_strings(); Net::SSLeay::SSLeay_add_ssl_algorithms(); Net::SSLeay::randomize(); } else { eval "use Net::SSL"; if ( !$@ ) { $LW_SSL_LIB = 2; $_SSL_LIBRARY = 'Net::SSL'; } } if ( $^O !~ /Win32/ ) { eval "use POSIX qw(:errno_h :fcntl_h)"; if ($@) { $LW_NONBLOCK_CONNECT = 0; } } else { *EINPROGRESS = sub { 10036 }; *EWOULDBLOCK = sub { 10035 }; } } } # BEGIN sub auth_brute_force { my ( $auth_method, $hrin, $user, $pwordref, $dom, $fail_code ) = @_; my ( $P, %hout ); $fail_code ||= 401; return undef if ( !defined $auth_method || length($auth_method) == 0 ); return undef if ( !defined $user || length($user) == 0 ); return undef if ( !( defined $hrin && ref($hrin) ) ); return undef if ( !( defined $pwordref && ref($pwordref) ) ); map { ( $P = $_ ) =~ tr/\r\n//d; auth_set( $auth_method, $hrin, $user, $P, $dom ); return undef if ( http_do_request( $hrin, \%hout ) ); return $P if ( $hout{whisker}->{code} != $fail_code ); } @$pwordref; return undef; } sub auth_unset { my $href = shift; return if ( !defined $href || !ref($href) ); delete $$href{Authorization}; delete $$href{'Proxy-Authorization'}; delete $$href{whisker}->{auth_callback}; delete $$href{whisker}->{auth_proxy_callback}; delete $$href{whisker}->{auth_data}; delete $$href{whisker}->{auth_proxy_data}; } sub auth_set { my ( $method, $href, $user, $pass, $domain ) = ( lc(shift), @_ ); return if ( !( defined $href && ref($href) ) ); return if ( !defined $user || !defined $pass ); if ( $method eq 'basic' ) { $$href{'Authorization'} = 'Basic ' . encode_base64( $user . ':' . $pass, '' ); } if ( $method eq 'proxy-basic' ) { $$href{'Proxy-Authorization'} = 'Basic ' . encode_base64( $user . ':' . $pass, '' ); } if ( $method eq 'ntlm' ) { http_close($href); $$href{whisker}->{auth_data} = ntlm_new( $user, $pass, $domain ); $$href{whisker}->{auth_callback} = \&_ntlm_auth_callback; } if ( $method eq 'proxy-ntlm' ) { utils_croak('',"auth_set: proxy-ntlm auth w/ SSL not currently supported") if ( $href->{whisker}->{ssl} > 0 ); http_close($href); $$href{whisker}->{auth_proxy_data} = ntlm_new( $user, $pass, $domain ); $$href{whisker}->{auth_proxy_callback} = \&_ntlm_auth_proxy_callback; } } sub cookie_new_jar { return {}; } sub cookie_read { my ( $count, $jarref, $hrs, $hrq, $rej ) = ( 0, @_ ); return 0 if ( !( defined $jarref && ref($jarref) ) ); return 0 if ( !( defined $hrs && ref($hrs) ) ); return 0 if ( !( defined $$hrs{whisker}->{cookies} && ref( $$hrs{whisker}->{cookies} ) ) ); my @opt; if(defined $hrq && ref($hrq)){ push @opt, $hrq->{whisker}->{host}; my $u = $hrq->{whisker}->{uri}; $u=~s#/.*?$##; $u='/' if($u eq ''); push @opt, $u, $rej; } foreach ( @{ $hrs->{whisker}->{cookies} } ) { cookie_parse( $jarref, $_ , @opt); $count++; } return $count; } sub cookie_parse { my ( $jarref, $header ) = (shift, shift); my ( $Dd, $Dp, $R ) = (shift, shift, shift||0); return if ( !( defined $jarref && ref($jarref) ) ); return if ( !( defined $header && length($header) > 0 ) ); my @C = ( undef, undef, undef, undef, 0 ); $header =~ tr/\r\n//d; my ($f,%seen,$n,$t) = (1); while( length($header) ){ $header =~ s/^[ \t]+//; last if(!($header =~ s/^([^ \t=;]+)//)); my $an = lc($1); my $av = undef; $header =~ s/^[ \t]+//; if(substr($header,0,1) eq '='){ $header=~s/^=[ \t]*//; if(substr($header,0,1) eq '"'){ my $p = index($header,'"',1); last if($p == -1); $av = substr($header,1,$p-1); substr($header,0,$p+1)=''; } else { $av = $1 if($header =~ s/^([^ \t;,]*)//); } } else { my $p = index($header,';'); substr($header,0,$p)=''; } $header =~ s/^.*?;//; if($f){ return if(!defined $av); ($f,$n,$C[0])=(0,$an,$av); } else { $seen{$an}=$av if(!exists $seen{$an}); } } return if(!defined $n || $n eq ''); my $del = 0; $del++ if($C[0] eq ''); $del++ if(defined $seen{'max-age'} && $seen{'max-age'} eq '0'); if($del){ delete $$jarref{$n} if exists $$jarref{$n}; return; } if(defined $seen{domain} && $seen{domain} ne ''){ $t = $seen{domain}; $t='.'.$t if(substr($t,0,1) ne '.' && !_is_ip_address($t)); } else { $t=$Dd; } $t=~s/\.+$// if(defined $t); $C[1]=$t; if(defined $seen{path}){ $t = $seen{path}; } else { $t=$Dp || '/'; } $t=~s#/+$##; $t='/' if(substr($t,0,1) ne '/'); $C[2]=$t; $C[4]=1 if(exists $seen{secure}); return if($R && !_is_valid_cookie_match($C[1], $C[2], $Dd, $Dp)); $$jarref{$n} = \@C; } sub _is_ip_address { my $n = shift; return 1 if($n=~/^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$/); return 0; } sub _is_valid_cookie_match { my ($cd, $cp, $td, $tp) = @_; return 0 if(index($tp,$cp)!=0); if(substr($cd,0,1) eq '.'){ if( $td =~ /(.+)$cd$/ ){ return 1 if(index($1,'.') == -1); } return 0; } else { return 0 if($cd ne $td); } return 1; } sub cookie_write { my ( $jarref, $hin, $override ) = @_; my ( $name, $out ) = ( '', '' ); return if ( !( defined $jarref && ref($jarref) ) ); return if ( !( defined $hin && ref($hin) ) ); $override = $override || 0; $$hin{'whisker'}->{'ssl'} = $$hin{'whisker'}->{'ssl'} || 0; foreach $name ( keys %$jarref ) { next if ( $name eq '' ); if($override){ $out .= "$name=$$jarref{$name}->[0];"; next; } next if ( $$hin{'whisker'}->{'ssl'} == 0 && $$jarref{$name}->[4] > 0 ); if ( $$hin{'whisker'}->{'host'} =~ /$$jarref{$name}->[1]$/i && $$hin{'whisker'}->{'uri'} =~ /^$$jarref{$name}->[2])/ ) { $out .= "$name=$$jarref{$name}->[0];"; } } if ( $out ne '' ) { $$hin{'Cookie'} = $out; } } sub cookie_get { my ( $jarref, $name ) = @_; return undef if ( !( defined $jarref && ref($jarref) ) ); if ( defined $$jarref{$name} ) { return @{ $$jarref{$name} }; } return undef; } sub cookie_get_names { my ( $jarref, $name ) = @_; return undef if ( !( defined $jarref && ref($jarref) ) ); return keys %$jarref; } sub cookie_get_valid_names { my ( $jarref, $domain, $url, $ssl ) = @_; return () if ( !( defined $jarref && ref($jarref) ) ); return () if ( !defined $domain || $domain eq '' ); return () if ( !defined $url || $url eq '' ); $ssl ||= 0; my (@r, $name); foreach $name ( keys %$jarref ) { next if ( $name eq '' ); next if ( $$jarref{$name}->[4] > 0 && $ssl == 0 ); if ( $domain =~ /$$jarref{$name}->[1]$/i && $url =~ /^$$jarref{$name}->[2])/i ) { push @r, $name; } } return @r; } sub cookie_set { my ( $jarref, $name, $value, $domain, $path, $expire, $secure ) = @_; my @construct; return if ( !( defined $jarref && ref($jarref) ) ); return if ( $name eq '' ); if ( !defined $value || $value eq '' ) { delete $$jarref{$name}; return; } $path = $path || '/'; $secure = $secure || 0; @construct = ( $value, $domain, $path, undef, $secure ); $$jarref{$name} = \@construct; } %_crawl_config = ( 'save_cookies' => 0, 'reuse_cookies' => 1, 'save_offsites' => 0, 'save_non_http' => 0, 'follow_moves' => 1, 'url_limit' => 1000, 'use_params' => 0, 'params_double_record' => 0, 'skip_ext' => { gif => 1, jpg => 1, png => 1, gz => 1, swf => 1, pdf => 1, zip => 1, wav => 1, mp3 => 1, asf => 1, tgz => 1 }, 'save_skipped' => 0, 'save_referrers' => 0, 'use_referrers' => 1, 'do_head' => 0, 'callback' => 0, 'netloc_bug' => 1, 'normalize_uri' => 1, 'source_callback' => 0 ); %_crawl_linktags = ( 'a' => 'href', 'applet' => [qw(codebase archive code)], 'area' => 'href', 'base' => 'href', 'bgsound' => 'src', 'blockquote' => 'cite', 'body' => 'background', 'del' => 'cite', 'embed' => [qw(src pluginspage)], 'form' => 'action', 'frame' => [qw(src longdesc)], 'iframe' => [qw(src longdesc)], 'ilayer' => 'background', 'img' => [qw(src lowsrc longdesc usemap)], 'input' => [qw(src usemap)], 'ins' => 'cite', 'isindex' => 'action', 'head' => 'profile', 'layer' => [qw(background src)], 'link' => 'href', 'object' => [qw(codebase data archive usemap)], 'q' => 'cite', 'script' => 'src', 'table' => 'background', 'td' => 'background', 'th' => 'background', 'xmp' => 'href', ); sub crawl_new { my ( $start, $depth, $reqref, $trackref ) = @_; my %X; return undef if ( !defined $start || !defined $depth ); return undef if ( !defined $reqref || !ref($reqref) ); $trackref = {} if ( !defined $trackref || !ref($trackref) ); $X{track} = $trackref; $X{request} = $reqref; $X{depth} = $depth || 2; $X{start} = $start; $X{magic} = 7340; $X{reset} = sub { $X{errors} = []; # all errors encountered $X{urls} = []; # temp; used to hold all URLs on page $X{server_tags} = {}; # all server tags found $X{referrers} = {}; # who refers to what URLs $X{offsites} = {}; # all URLs that point offsite $X{response} = {}; # temp; the response hash $X{non_http} = {}; # all non_http URLs found $X{cookies} = {}; # all cookies found $X{forms} = {}; # all forms found $X{jar} = {}; # temp; cookie jar $X{url_queue} = []; # temp; URLs to still fetch $X{config} = {}; %{ $X{config} } = %_crawl_config; %{ $X{track} } = (); $X{parsed_page_count} = 0; }; $X{crawl} = sub { crawl( \%X, @_ ) }; $X{reset}->(); return \%X; } { # START OF CRAWL CONTAINER sub crawl { my ( $C, $START, $MAX_DEPTH ) = @_; return undef if ( !defined $C || !ref($C) || $C->{magic} != 7340 ); my $CONFIG = $C->{config}; my $TRACK = $C->{track}; my $URLS = $C->{urls}; my $RESP = $C->{response}; my $REQ = $C->{request}; my $Q = $C->{url_queue}; $START ||= $C->{start}; $C->{depth} = $MAX_DEPTH || $C->{depth}; my ( $COUNT, $T, @ST ) = ( 0, '' ); my @v = uri_split($START); my $error = undef; $error = 'Start protocol not http or https' if ( $v[1] ne 'http' && $v[1] ne 'https' ); $error = 'Bad start host' if ( !defined $v[2] || $v[2] eq '' ); push( @{ $C->{errors} }, $error ) && return undef if ( defined $error ); @ST = ( $v[2], $v[3], $v[0], 1, '', '' ); $REQ->{whisker}->{ssl} = 1 if ( $v[1] eq 'https' ); $REQ->{whisker}->{host} = $ST[0]; $REQ->{whisker}->{port} = $ST[1]; $REQ->{whisker}->{lowercase_incoming_headers} = 1; $REQ->{whisker}->{ignore_duplicate_headers} = 0; delete $REQ->{whisker}->{parameters}; http_fixup_request($REQ); push @$Q, \@ST; while (@$Q) { @ST = @{ shift @$Q }; next if ( defined $TRACK->{ $ST[2] } && $TRACK->{ $ST[2] } ne '?' ); if ( $ST[3] > $C->{depth} ) { $TRACK->{ $ST[2] } = '?' if ( $CONFIG->{save_skipped} > 0 ); next; } $ST[4] = uri_get_dir( $ST[2] ); $REQ->{whisker}->{uri} = $ST[2]; if ( $ST[5] ne '' && $CONFIG->{use_referrers} > 0 ) { $REQ->{Referrer} = $ST[5]; } my $result = _crawl_do_request( $REQ, $RESP, $C ); if ( $result == 1 || $result == 2 ) { push @{ $C->{errors} }, "$ST[2]: $RESP->{whisker}->{error}"; next; } $COUNT++; $TRACK->{ $ST[2] } = $RESP->{whisker}->{code} if ( $result == 0 || $result == 4 ); $TRACK->{ $ST[2] } = '?' if ( ( $result == 3 || $result == 5 ) && $CONFIG->{save_skipped} > 0 ); if ( defined $RESP->{server} && !ref( $RESP->{server} ) ) { $C->{server_tags}->{ $RESP->{server} }++; } if ( defined $RESP->{'set-cookie'} ) { if ( $CONFIG->{save_cookies} > 0 ) { if ( ref( $RESP->{'set-cookie'} ) ) { $C->{cookies}->{$_}++ foreach ( @{ $RESP->{'set-cookie'} } ); } else { $C->{cookies}->{ $RESP->{'set-cookie'} }++; } } cookie_read( $C->{jar}, $RESP ) if ( $CONFIG->{reuse_cookies} > 0 ); } next if ( $result == 4 || $result == 5 ); next if ( scalar @$Q > $CONFIG->{url_limit} ); if ( $result == 0 ) { # page should be parsed if ( $CONFIG->{source_callback} != 0 && ref( $CONFIG->{source_callback} ) eq 'CODE' ) { &{ $CONFIG->{source_callback} }($C); } html_find_tags( \$RESP->{whisker}->{data}, \&_crawl_extract_links_test, 0, $C, \%_crawl_linktags ); $C->{parsed_page_count}++; } push @$URLS, $RESP->{location} if ( $result == 3 ); foreach $T (@$URLS) { $T =~ tr/\0\r\n//d; next if ( length($T) == 0 ); next if ( $T =~ /^#/i ); # fragment push @{ $C->{referrers}->{$T} }, $ST[2] if ( $CONFIG->{save_referrers} > 0 ); if ( $T =~ /^([a-zA-Z0-9]*):/ && lc($1) ne 'http' && lc($1) ne 'https' ) { push @{ $C->{non_http}->{$T} }, $ST[2] if ( $CONFIG->{save_non_http} > 0 ); next; } if ( substr( $T, 0, 2 ) eq '//' && $CONFIG->{netloc_bug} > 0 ) { if ( $REQ->{whisker}->{ssl} > 0 ) { $T = 'https:' . $T; } else { $T = 'http:' . $T; } } if ( $CONFIG->{callback} != 0 ) { next if &{ $CONFIG->{callback} }( $T, $C ); } $T = uri_absolute( $T, $ST[4], $CONFIG->{normalize_uri} ); @v = uri_split($T); if ( ( defined $v[2] && $v[2] ne $ST[0] ) || ( $v[3] > 0 && $v[3] != $ST[1] ) ) { $C->{offsites}->{ uri_join(@v) }++ if ( $CONFIG->{save_offsites} > 0 ); next; } if ( $v[0] =~ /\.([a-z0-9]+)$/i ) { if ( defined $CONFIG->{skip_ext}->{ lc($1) } ) { $TRACK->{ $v[0] } = '?' if ( $CONFIG->{save_skipped} > 0 ); next; } } if ( defined $v[4] && $CONFIG->{use_params} > 0 ) { $TRACK->{ $v[0] } = '?' if ( $CONFIG->{params_double_record} > 0 && !defined $TRACK->{ $v[0] } ); $v[0] = $v[0] . '?' . $v[4]; } next if ( defined $TRACK->{ $v[0] } ) ; # we've processed this already push @$Q, [ $ST[0], $ST[1], $v[0], $ST[3] + 1, '', $ST[2] ]; } # foreach @$URLS = (); # reset for next round } # while return $COUNT; } # end sub crawl sub _crawl_extract_links_test { my ( $TAG, $hr, $dr, $start, $len, $OBJ ) = ( lc(shift), @_ ); return undef if ( !scalar %$hr ); # fastpath quickie my $t = $_crawl_linktags{$TAG}; while ( my ( $key, $val ) = each %$hr ) { # normalize element values $$hr{ lc($key) } = $val; } if ( $TAG eq 'meta' && defined $$hr{'http-equiv'} && $$hr{'http-equiv'} eq 'refresh' && defined $$hr{'content'} && $$hr{'content'} =~ m/url=(.+)/i ) { push( @{ $OBJ->{urls} }, $1 ); } elsif ( ref($t) ) { foreach (@$t) { push( @{ $OBJ->{urls} }, $$hr{$_} ) if ( defined $$hr{$_} ); } } else { push( @{ $OBJ->{urls} }, $$hr{$t} ) if ( defined $$hr{$t} ); } if ( $TAG eq 'form' && defined $$hr{action} ) { my $u = $OBJ->{response}->{whisker}->{uri}; $OBJ->{forms}->{ uri_absolute( $$hr{action}, $u, 1 ) }++; } return undef; } sub _crawl_do_request_ex { my ( $hrin, $hrout, $OBJ ) = @_; my $ret; $ret = http_do_request( $hrin, $hrout ); return ( 2, $ret ) if ( $ret == 2 ); # if there was connection error, do not continue if ( $ret == 0 ) { # successful request if ( $$hrout{whisker}->{code} < 308 && $$hrout{whisker}->{code} > 300 ) { if ( $OBJ->{config}->{follow_moves} > 0 ) { return ( 3, $ret ) if ( defined $$hrout{location} && !ref( $$hrout{location} ) ); } return ( 5, $ret ); # not avail } if ( $$hrout{whisker}->{code} == 200 ) { if ( defined $$hrout{'content-type'} && $$hrout{'content-type'} !~ /^text\/htm/i ) { return ( 4, $ret ); } } } return ( -1, $ret ); # fallthrough } sub _crawl_do_request { my ( $hrin, $hrout, $OBJ ) = @_; my ( $cret, $lwret ); if ( $OBJ->{config}->{do_head} && $$hrin{whisker}->{method} ne 'HEAD' ) { my $save = $$hrin{whisker}->{method}; $$hrin{whisker}->{method} = 'HEAD'; ( $cret, $lwret ) = _crawl_do_request_ex( $hrin, $hrout, $OBJ ); $$hrin{whisker}->{method} = $save; return $cret if ( $cret > 0 ); if ( $lwret == 0 ) { # successful request if ( $$hrout{whisker}->{code} == 501 ) { # HEAD not allowed $OBJ->{config}->{do_head} = 0; # no more HEAD requests } } } ( $cret, $lwret ) = _crawl_do_request_ex( $hrin, $hrout, $OBJ ); return $lwret if ( $cret < 0 ); return $cret; } } # CRAWL_CONTAINER sub dump { my %what = @_; my ( $final, $k, $v ) = (''); while ( ( $k, $v ) = each %what ) { return undef if ( ref($k) || !ref($v) ); $final .= "\$$k = " . _dump( 1, $v, 1 ); $final =~ s#,\n$##; $final .= ";\n"; } return $final; } sub dump_writefile { my $file = shift; my $output = &dump(@_); return 1 if ( !open( OUT, ">$file" ) || !defined $output ); binmode(OUT); print OUT $output; close(OUT); } sub _dump { # dereference and dump an element my ( $t, $ref, $depth ) = @_; my ( $out, $k, $v ) = (''); $depth ||= 1; return 'undef' if ( $depth > 128 ); if ( !defined $ref ) { return 'undef'; } elsif ( ref($ref) eq 'HASH' ) { $out .= "{\n"; while ( ( $k, $v ) = each %$ref ) { $out .= "\t" x $t; $out .= _dumpd($k) . ' => '; if ( ref($v) ) { $out .= _dump( $t + 1, $v, $depth + 1 ); } else { $out .= _dumpd($v); } $out .= ",\n" unless ( substr( $out, -2, 2 ) eq ",\n" ); } $out =~ s#,\n$#\n#; $out .= "\t" x ( $t - 1 ); $out .= "},\n"; } elsif ( ref($ref) eq 'ARRAY' ) { $out .= "["; if ( ~~@$ref ) { $out .= "\n"; foreach $v (@$ref) { $out .= "\t" x $t; if ( ref($v) ) { $out .= _dump( $t + 1, $v, $depth + 1 ); } else { $out .= _dumpd($v); } $out .= ",\n" unless ( substr( $out, -2, 2 ) eq ",\n" ); } $out =~ s#,\n$#\n#; $out .= "\t" x ( $t - 1 ); } $out .= "],\n"; } elsif ( ref($ref) eq 'SCALAR' ) { $out .= _dumpd($$ref); } elsif ( ref($ref) eq 'REF' ) { $out .= _dump( $t, $$ref, $depth + 1 ); } elsif ( ref($ref) ) { # unknown/unsupported ref $out .= "undef"; } else { # normal scalar $out .= _dumpd($ref); } return $out; } sub _dumpd { # escape a scalar string my $v = shift; return 'undef' if ( !defined $v ); return "''" if ( $v eq '' ); return "$v" if ( $v eq '0' || $v !~ tr/0-9//c && $v !~ m#^0+# ); if ( $v !~ tr/ !-~//c ) { $v =~ s/(['\\])/\\$1/g; return "'$v'"; } $v =~ s#\\#\\\\#g; $v =~ s#"#\\"#g; $v =~ s#\r#\\r#g; $v =~ s#\n#\\n#g; $v =~ s#\t#\\t#g; $v =~ s#\$#\\\$#g; $v =~ s#([^!-~ ])#sprintf('\\x%02x',ord($1))#eg; return "\"$v\""; } { # package variables my $MIMEBASE64_TRYLOADING = 1; sub encode_base64 { if ($MIMEBASE64_TRYLOADING) { eval "require MIME::Base64"; $MIMEBASE64_TRYLOADING = 0; } goto &MIME::Base64::encode_base64 if ($MIME::Base64::VERSION); my $res = ""; my $eol = $_[1]; $eol = "\n" unless defined $eol; pos( $_[0] ) = 0; while ( $_[0] =~ /(.{1,45})/gs ) { $res .= substr( pack( 'u', $1 ), 1 ); chop($res); } $res =~ tr|` -_|AA-Za-z0-9+/|; my $padding = ( 3 - length( $_[0] ) % 3 ) % 3; $res =~ s/.{$padding}$/'=' x $padding/e if $padding; if ( length $eol ) { $res =~ s/(.{1,76})/$1$eol/g; } $res; } sub decode_base64 { if ($MIMEBASE64_TRYLOADING) { eval "require MIME::Base64"; $MIMEBASE64_TRYLOADING = 0; } goto &MIME::Base64::decode_base64 if ($MIME::Base64::VERSION); my $str = shift; my $res = ""; $str =~ tr|A-Za-z0-9+=/||cd; $str =~ s/=+$//; # remove padding $str =~ tr|A-Za-z0-9+/| -_|; # convert to uuencoded format while ( $str =~ /(.{1,60})/gs ) { my $len = chr( 32 + length($1) * 3 / 4 ); # compute length byte $res .= unpack( "u", $len . $1 ); # uudecode } $res; } } # end package variables sub encode_uri_hex { # normal hex encoding my $str = shift; $str =~ s/([^\/])/sprintf("%%%02x",ord($1))/ge; return $str; } sub encode_uri_randomhex { # random normal hex encoding my @T = split( //, shift ); my $s; foreach (@T) { if (m#[;=:&@\?]#) { $s .= $_; next; } if ( ( rand() * 2 ) % 2 == 1 ) { $s .= sprintf( "%%%02x", ord($_) ); } else { $s .= $_; } } return $s; } sub encode_uri_randomcase { my ( $x, $uri ) = ( '', shift ); return $uri if ( $uri !~ tr/a-zA-Z// ); # fast-path my @T = split( //, $uri ); for ( $x = 0 ; $x < ( scalar @T ) ; $x++ ) { if ( ( rand() * 2 ) % 2 == 1 ) { $T[$x] =~ tr/A-Za-z/a-zA-Z/; } } return join( '', @T ); } sub encode_unicode { my ( $c, $r ) = ( '', '' ); foreach $c ( split( //, shift ) ) { $r .= pack( "v", ord($c) ); } return $r; } sub decode_unicode { my $str = $_[0]; return $str if ( $str !~ tr/!-~//c ); # fastpath my ( $lead, $count, $idx ); my $out = ''; my $len = length($str); my ( $ptr, $no, $nu ) = ( 0, 0, 0 ); while ( $ptr < $len ) { my $c = substr( $str, $ptr, 1 ); if ( ord($c) >= 0xc0 && ord($c) <= 0xfd ) { $count = 0; $c = ord($c) << 1; while ( ( $c & 0x80 ) == 0x80 ) { $c <<= 1; last if ( $count++ == 4 ); } $c = ( $c & 0xff ); for ( $idx = 1 ; $idx < $count ; $idx++ ) { my $o = ord( substr( $str, $ptr + $idx, 1 ) ); $no = 1 if ( $o != 0x80 ); $nu = 1 if ( $o < 0x80 || $o > 0xbf ); } my $o = ord( substr( $str, $ptr + $idx, 1 ) ); $nu = 1 if ( $o < 0x80 || $o > 0xbf ); if ($nu) { $out .= substr( $str, $ptr++, 1 ); } else { if ($no) { $out .= "\xff"; # generic replacement char } else { my $prior = ord( substr( $str, $ptr + $count - 1, 1 ) ) << 6; $out .= pack( "C", (( ord( substr( $str, $ptr + $count, 1 ) ) & 0x7f ) + $prior ) & 255 ); } $ptr += $count + 1; } $no = $nu = 0; } else { $out .= $c; $ptr++; } } return $out; } sub encode_anti_ids { my ( $rhin, $modes ) = ( shift, shift ); my ( @T, $x, $c, $s, $y ); my $ENCODED = 0; my $W = $$rhin{'whisker'}; return if ( !( defined $rhin && ref($rhin) ) ); $$rhin{'whisker'}->{'uri_orig'} = $$rhin{'whisker'}->{'uri'}; if ( $modes =~ /4/ ) { $s = ''; if ( $$W{'uri'} =~ m#^/# ) { $y = &utils_randstr; $s .= $y while ( length($s) < 512 ); $$W{'uri'} = "/$s/.." . $$W{'uri'}; } } if ( $modes =~ /7/ ) { $$W{'uri'} = encode_uri_randomcase( $$W{'uri'} ); } if ( $modes =~ /2/ ) { $$W{'uri'} =~ s#/#/./#g; } if ( $modes =~ /8/ ) { $$W{'uri'} =~ s#/#\\#g; $$W{'uri'} =~ s#^\\#/#; $$W{'uri'} =~ s#^([a-zA-Z0-9_]+):\\#$1://#; $$W{'uri'} =~ s#\\$#/#; } if ( $modes =~ /1/ ) { if ( $ENCODED == 0 ) { $$W{'uri'} = encode_uri_randomhex( $$W{'uri'} ); $ENCODED = 1; } } if ( $modes =~ /5/ ) { ( $s, $y ) = ( &utils_randstr, &utils_randstr ); $$W{'uri'} = "/$s.html%3F$y=/../$$W{'uri'}"; } if ( $modes =~ /3/ ) { $s = &utils_randstr; $$W{'uri'} = "/%20HTTP/1.1%0d%0aAccept%3a%20$s/../..$$W{'uri'}"; } if ( $modes =~ /6/ ) { $$W{'http_space1'} = "\t"; } } %_forms_ELEMENTS = ( 'form' => 1, 'input' => 1, 'textarea' => 1, 'button' => 1, 'select' => 1, 'option' => 1, '/select' => 1 ); sub forms_read { my $dr = shift; return undef if ( !ref($dr) || length($$dr) == 0 ); my $A = [ {}, [] ]; html_find_tags( $dr, \&_forms_parse_callback, 0, $A, \%_forms_ELEMENTS ); if ( scalar %{ $A->[0] } ) { push( @{ $A->[1] }, $A->[0] ); } return $A->[1]; } sub forms_write { my $hr = shift; return undef if ( !ref($hr) || !( scalar %$hr ) ); return undef if ( !defined $$hr{"\0"} ); my $t = '
\n"; return $t; } { # these are 'private' static variables for &_forms_parse_html my $CURRENT_SELECT = undef; my $UNKNOWNS = 0; sub _forms_parse_callback { my ( $TAG, $hr, $dr, $start, $len, $ar ) = ( lc(shift), @_ ); my ( $saveparam, $parr, $key ) = ( 0, undef, '' ); my $_forms_CURRENT = $ar->[0]; my $_forms_FOUND = $ar->[1]; if ( scalar %$hr ) { while ( my ( $key, $val ) = each %$hr ) { if ( $key =~ tr/A-Z// ) { delete $$hr{$key}; if ( defined $val ) { $$hr{ lc($key) } = $val; } else { $$hr{ lc($key) } = undef; } } } } if ( $TAG eq 'form' ) { if ( scalar %$_forms_CURRENT ) { # save last form push( @$_forms_FOUND, $_forms_CURRENT ); $ar->[0] = {}; $_forms_CURRENT = $ar->[0]; } $_forms_CURRENT->{"\0"} = [ $$hr{name}, $$hr{method}, $$hr{action}, [] ]; delete $$hr{'name'}; delete $$hr{'method'}; delete $$hr{'action'}; $key = "\0"; $UNKNOWNS = 0; } elsif ( $TAG eq 'input' ) { $$hr{type} = 'text' if ( !defined $$hr{type} ); $$hr{name} = 'unknown' . $UNKNOWNS++ if ( !defined $$hr{name} ); $$hr{value} = undef if ( !defined $$hr{value} ); $key = $$hr{name}; push @{ $_forms_CURRENT->{$key} }, [ 'input-' . $$hr{type}, $$hr{value}, [] ]; delete $$hr{'name'}; delete $$hr{'type'}; delete $$hr{'value'}; } elsif ( $TAG eq 'select' ) { $$hr{name} = 'unknown' . $UNKNOWNS++ if ( !defined $$hr{name} ); $key = $$hr{name}; push @{ $_forms_CURRENT->{$key} }, [ 'select', undef, [] ]; $CURRENT_SELECT = $key; delete $$hr{name}; } elsif ( $TAG eq '/select' ) { push @{ $_forms_CURRENT->{$CURRENT_SELECT} }, [ '/select', undef, [] ]; $CURRENT_SELECT = undef; return undef; } elsif ( $TAG eq 'option' ) { return undef if ( !defined $CURRENT_SELECT ); if ( !defined $$hr{value} ) { my $stop = index( $$dr, '<', $start + $len ); return undef if ( $stop == -1 ); # MAJOR PUKE $$hr{value} = substr( $$dr, $start + $len, ( $stop - $start - $len ) ); $$hr{value} =~ tr/\r\n//d; } push @{ $_forms_CURRENT->{$CURRENT_SELECT} }, [ 'option', $$hr{value}, [] ]; delete $$hr{value}; } elsif ( $TAG eq 'textarea' ) { my $stop = $start + $len; $$hr{value} = $$hr{'='}; delete $$hr{'='}; $$hr{name} = 'unknown' . $UNKNOWNS++ if ( !defined $$hr{name} ); $key = $$hr{name}; push @{ $_forms_CURRENT->{$key} }, [ 'textarea', $$hr{value}, [] ]; delete $$hr{'name'}; delete $$hr{'value'}; } else { # button $$hr{name} = 'unknown' . $UNKNOWNS++ if ( !defined $$hr{name} ); $$hr{value} = undef if ( !defined $$hr{value} ); $key = $$hr{name}; push @{ $_forms_CURRENT->{$key} }, [ 'button', $$hr{value}, [] ]; delete $$hr{'name'}; delete $$hr{'value'}; } if ( scalar %$hr ) { if ( $TAG eq 'form' ) { $parr = $_forms_CURRENT->{$key}->[3]; } else { $parr = $_forms_CURRENT->{$key}->[-1]; $parr = $parr->[2]; } my ( $k, $v ); while ( ( $k, $v ) = each(%$hr) ) { if ( defined $v ) { push @$parr, "$k=\"$v\""; } else { push @$parr, $k; } } } return undef; } } { # contained variables $DR = undef; # data reference $c = 0; # parser pointer $LEN = 0; sub html_find_tags { my ( $dataref, $callbackfunc, $xml, $fref, $tagmap ) = @_; return if ( !( defined $dataref && ref($dataref) ) ); return if ( !( defined $callbackfunc && ref($callbackfunc) ) ); $xml ||= 0; my ( $INTAG, $CURTAG, $LCCURTAG, $ELEMENT, $VALUE, $cc ) = (0); my ( %TAG, $ret, $start, $tagstart, $tempstart, $x, $found ); my $usetagmap = ( ( defined $tagmap && ref($tagmap) ) ? 1 : 0 ); $CURTAG = $LCCURTAG = $ELEMENT = $VALUE = $cc = ''; $DR = $dataref; $LEN = length($$dataref); for ( $c = 0 ; $c < $LEN ; $c++ ) { $cc = substr( $$dataref, $c, 1 ); next if ( !$INTAG && $cc ne '>' && $cc ne '<' ); if ( $cc eq '<' ) { if ($INTAG) { $cc = '>'; $c--; } elsif ($xml && $LEN > ( $c + 9 ) && substr( $$dataref, $c + 1, 8 ) eq '![CDATA[' ) { $c += 9; $tempstart = $c; $found = index( $$dataref, ']]>', $c ); $c = $found + 2; $c = $LEN if ( $found < 0 ); # malformed XML next; } elsif ( $LEN > ( $c + 3 ) && substr( $$dataref, $c + 1, 3 ) eq '!--' ) { $tempstart = $c; $c += 4; $found = index( $$dataref, '-->', $c ); if ( $found < 0 ) { $found = index( $$dataref, '>', $c ); $found = $LEN if ( $found < 0 ); $c = $found; } else { $c = $found + 2; } if ( $usetagmap == 0 || defined $tagmap->{'!--'} ) { my $dat = substr( $$dataref, $tempstart + 4, $found - $tempstart - 4 ); &$callbackfunc( '!--', { '=' => $dat }, $dataref, $tempstart, $c - $tempstart + 1, $fref ); } next; } elsif ( !$INTAG ) { next if ( substr( $$dataref, $c + 1, 1 ) =~ tr/ \t\r\n// ); $c++; $INTAG = 1; $tagstart = $c - 1; $CURTAG = ''; while ( $c < $LEN && ( $x = substr( $$dataref, $c, 1 ) ) !~ tr/ \t\r\n>=// ) { $CURTAG .= $x; $c++; } chop $CURTAG if ( $xml && substr( $CURTAG, -1, 1 ) eq '/' ); $c++ if ( defined $x && $x ne '>' ); $LCCURTAG = lc($CURTAG); $INTAG = 0 if ( $LCCURTAG !~ tr/a-z0-9// ); next if ( $c >= $LEN ); $cc = substr( $$dataref, $c, 1 ); } } if ( $cc eq '>' ) { next if ( !$INTAG ); if ( $LCCURTAG eq 'script' && !$xml ) { $tempstart = $c + 1; pos($$dataref) = $c; if ( $$dataref !~ m#()#ig ) { } else { $c = pos($$dataref) - 1; my $l = length($1); $TAG{'='} = substr( $$dataref, $tempstart, $c - $tempstart - $l + 1 ); } } elsif ( $LCCURTAG eq 'textarea' && !$xml ) { $tempstart = $c + 1; pos($$dataref) = $c; if ( $$dataref !~ m#()#ig ) { } else { $c = pos($$dataref) - 1; my $l = length($1); $TAG{'='} = substr( $$dataref, $tempstart, $c - $tempstart - $l + 1 ); } } $INTAG = 0; $TAG{'/'}++ if ( $xml && substr( $$dataref, $c - 1, 1 ) eq '/' ); &$callbackfunc( $CURTAG, \%TAG, $dataref, $tagstart, $c - $tagstart + 1, $fref ) if ( $usetagmap == 0 || defined $tagmap->{$LCCURTAG} ); $CURTAG = $LCCURTAG = ''; %TAG = (); next; } if ($INTAG) { $ELEMENT = ''; $VALUE = undef; pos($$dataref) = $c; if ( $$dataref !~ m/[^ \t\r\n]/g ) { $c = $LEN; next; # should we really abort? } $start = pos($$dataref) - 1; if ( $$dataref !~ m/[ \t\r\n<>=]/g ) { $c = $LEN; next; # should we really abort? } $c = pos($$dataref) - 1; if ( $c > $start ) { $ELEMENT = substr( $$dataref, $start, $c - $start ); chop $ELEMENT if ( $xml && substr( $ELEMENT, -1, 1 ) eq '/' ); } $cc = substr( $$dataref, $c, 1 ); if ( $cc ne '>' ) { if ( $cc =~ tr/ \t\r\n// ) { $c++ while ( substr( $$dataref, $c, 1 ) =~ tr/ \t\r\n// ); } if ( substr( $$dataref, $c, 1 ) eq '=' ) { $c++; $start = $c; my $p = substr( $$dataref, $c, 1 ); if ( $p eq '"' || $p eq '\'' ) { $c++; $start++; $c = index( $$dataref, $p, $c ); if ( $c < 0 ) { $c = $LEN; next; } # Bad HTML $VALUE = substr( $$dataref, $start, $c - $start ); $c++; pos($$dataref) = $c; } else { pos($$dataref) = $c; if ( $$dataref !~ /[ \t\r\n>]/g ) { $c = $LEN; } else { $c = pos($$dataref) - 1; $VALUE = substr( $$dataref, $start, $c - $start ); chop $VALUE if ( $xml && substr( $$dataref, $c - 1, 2 ) eq '/>' ); } } if ( substr( $$dataref, $c, 1 ) =~ tr/ \t\r\n// ) { if ( $$dataref !~ /[^ \t\r\n]/g ) { $c = $LEN; next; # should we really abort? } $c = pos($$dataref) - 1; } } } # if $c ne '>' $c--; $TAG{$ELEMENT} = $VALUE if ( $ELEMENT ne '' || ( $xml && $ELEMENT ne '/' ) ); } } if ($INTAG) { &$callbackfunc( $CURTAG, \%TAG, $dataref, $tagstart, $c - $tagstart + 1, $fref ) if ( $usetagmap == 0 || defined $tagmap->{$LCCURTAG} ); } $DR = undef; # void dataref pointer } sub html_find_tags_rewrite { return if ( !defined $DR ); my ( $pos, $len, $replace_str ) = @_; substr( $$DR, $pos, $len ) = $replace_str; my $l = ( length($replace_str) - $len ); $c += $l; $LEN += $l; } sub _html_find_tags_adjust { my ( $p, $l ) = @_; $c += $p; $LEN += $l; } } # end container sub html_link_extractor { my $data = shift; my $ptr; if ( ref($data) ) { $ptr = $data; } else { $ptr = \$data; } my %OBJ = ( urls => [], forms => {} ); $OBJ{response} = {}; $OBJ{response}->{whisker} = {}; $OBJ{response}->{whisker}->{uri} = ''; html_find_tags( $ptr, # data \&_crawl_extract_links_test, # callback function 0, # xml mode \%OBJ, # data object \%_crawl_linktags ); # tagmap return @{ $OBJ{urls} }; } %http_host_cache = (); sub http_new_request { my %X = @_; my ( $k, $v, %RET, %RES ); http_init_request( \%RET ); while ( ( $k, $v ) = each(%X) ) { $RET{whisker}->{$k} = $v; } $RES{whisker} = {}; $RES{whisker}->{MAGIC} = 31340; $RES{whisker}->{uri} = ''; return ( \%RET, \%RES ) if wantarray(); return \%RET; } sub http_new_response { my %RET; $RET{whisker} = {}; $RET{whisker}->{MAGIC} = 31340; $RET{whisker}->{uri} = ''; return \%RET; } sub http_init_request { # doesn't return anything my ($hin) = shift; return if ( !( defined $hin && ref($hin) ) ); %$hin = (); # clear control hash $$hin{whisker} = { http_space1 => ' ', http_space2 => ' ', version => '1.1', method => 'GET', protocol => 'HTTP', port => 80, uri => '/', uri_prefix => '', uri_postfix => '', uri_param_sep => '?', host => 'localhost', timeout => 10, include_host_in_uri => 0, ignore_duplicate_headers => 1, normalize_incoming_headers => 1, lowercase_incoming_headers => 0, require_newline_after_headers => 0, invalid_protocol_return_value => 1, ssl => 0, ssl_save_info => 0, http_eol => "\x0d\x0a", force_close => 0, force_open => 0, retry => 1, trailing_slurp => 0, force_bodysnatch => 0, max_size => 0, MAGIC => 31339 }; $$hin{'Connection'} = 'Keep-Alive'; $$hin{'User-Agent'} = "Mozilla (libwhisker/$LW2::VERSION)"; } sub http_do_request { my ( $hin, $hout ) = ( shift, shift ); return 2 if ( !( defined $hin && ref($hin) ) ); return 2 if ( !( defined $hout && ref($hout) ) ); %$hout = (); $$hout{whisker} = {}; $$hout{whisker}->{'MAGIC'} = 31340; $$hout{whisker}->{uri} = $$hin{whisker}->{uri}; if ( !defined $$hin{whisker} || !defined $$hin{whisker}->{'MAGIC'} || $$hin{whisker}->{'MAGIC'} != 31339 ) { $$hout{whisker}->{error} = 'Input hash not initialized'; return 2; } if ( defined $_[0] ) { # handle extra params my %hashref; if ( ref( $_[0] ) eq 'HASH' ) { %hashref = %{ $_[0] }; } else { %hashref = @_; } $$hin{whisker}->{$_} = $hashref{$_} foreach ( keys %hashref ); } if ( defined $$hin{whisker}->{'anti_ids'} ) { # handle anti_ids my %copy = %$hin; $copy{whisker} = {}; %{ $copy{whisker} } = %{ $$hin{whisker} }; encode_anti_ids( \%copy, $$hin{whisker}->{'anti_ids'} ); $hin = \%copy; } my $cache_key = stream_key($hin); my $stream; if ( !defined $http_host_cache{$cache_key} ) { $stream = stream_new($hin); $http_host_cache{$cache_key} = $stream; } else { $stream = $http_host_cache{$cache_key}; } if ( !defined $stream ) { $$hout{whisker}->{error} = 'unable to allocate stream'; return 2; } my $retry_count = $$hin{whisker}->{retry}; my $puke_flag = 0; my $ret = 1; do { # retries wrapper my ( $aret, $pass ); if ( !$stream->{valid}->() ) { $stream->{clearall}->(); if ( !$stream->{open}->($hin) ) { $$hout{whisker}->{error} = 'opening stream: ' . $stream->{error}; $$hout{whisker}->{error} .= '(reconnect problem after prior request)' if ($puke_flag); return 2; } if ( defined $$hin{whisker}->{proxy_host} && defined $$hin{whisker}->{auth_proxy_callback} ) { $aret = $$hin{whisker}->{auth_proxy_callback} ->( $stream, $hin, $hout ); return $aret if ( $aret != 0 ); # proxy auth error } if ( defined $$hin{whisker}->{auth_callback} ) { $aret = $$hin{whisker}->{auth_callback}->( $stream, $hin, $hout ); return 0 if ( $aret == 200 ); # auth not needed? return $aret if ( $aret != 0 ); # auth error } } _ssl_save_info( $hout, $stream ) if ( $$hin{whisker}->{ssl} > 0 && $$hin{whisker}->{ssl_save_info} > 0 ); $ret = _http_do_request_ex( $stream, $hin, $hout ); $puke_flag++ if ( $ret == 1 && defined( $$hout{whisker}->{http_data_sent} ) ); return $ret if ( $ret == 0 || $ret == 2 ); # success or fatal socket error $retry_count--; } while ( $retry_count >= 0 ); return $ret; } sub _http_do_request_ex { my ( $stream, $hin, $hout, $raw ) = @_; return 2 if ( !defined $stream ); return 2 if ( !( defined $hin && ref($hin) ) ); return 2 if ( !( defined $hout && ref($hout) ) ); my $W = $hin->{whisker}; if ( !defined $$hout{whisker}->{MAGIC} || $$hout{whisker}->{MAGIC} != 31340 ) { %$hout = (); $$hout{whisker} = {}; $$hout{whisker}->{'MAGIC'} = 31340; $$hout{whisker}->{uri} = $$hin{whisker}->{uri}; } $stream->{clear}->(); if ( defined $raw && ref($raw) ) { $stream->{queue}->($$raw); } else { $stream->{queue}->( http_req2line($hin) ); if ( $$W{version} ne '0.9' ) { $stream->{queue}->( http_construct_headers($hin) ); $stream->{queue}->( $$W{raw_header_data} ) if ( defined $$W{raw_header_data} ); $stream->{queue}->( $$W{http_eol} ); $stream->{queue}->( $$W{data} ) if ( defined $$W{data} ); } # http 0.9 support } if ( defined $$W{request_fingerprint} ) { $$hout{whisker}->{request_fingerprint} = 'md5:' . md5( $stream->{bufout} ) if ( $$W{request_fingerprint} eq 'md5' ); $$hout{whisker}->{request_fingerprint} = 'md4:' . md4( $stream->{bufout} ) if ( $$W{request_fingerprint} eq 'md4' ); } if ( !$stream->{'write'}->() ) { $$hout{whisker}->{'error'} = 'sending request: ' . $stream->{error}; $stream->{'close'}->(); return 1; } $$hout{whisker}->{http_data_sent} = 1; $$hout{whisker}->{'lowercase_incoming_headers'} = $$W{'lowercase_incoming_headers'}; my @H; if ( $$W{'version'} ne '0.9' ) { do { # catch '100 Continue' responses my $resp = _http_getline($stream); if ( !defined $resp ) { $$hout{whisker}->{error} = 'error reading HTTP response'; $$hout{whisker}->{data} = $stream->{bufin}; $stream->{'close'}->(); return 1; } $$hout{whisker}->{'raw_header_data'} .= $resp if ( defined $$W{'save_raw_headers'} ); if ( $resp !~ /^([^\/]+)\/(\d\.\d)([ \t]+)(\d+)([ \t]*)(.*?)([\r\n]+)/ ) { $$hout{whisker}->{'error'} = 'invalid HTTP response'; $$hout{whisker}->{'data'} = $resp; while ( defined( $_ = _http_getline($stream) ) ) { $$hout{whisker}->{'data'} .= $_; } $stream->{'close'}->(); return $$W{'invalid_protocol_return_value'} || 1; } $$hout{whisker}->{protocol} = $1; $$hout{whisker}->{version} = $2; $$hout{whisker}->{http_space1} = $3; $$hout{whisker}->{code} = $4; $$hout{whisker}->{http_space2} = $5; $$hout{whisker}->{message} = $6; $$hout{whisker}->{http_eol} = $7; $$hout{whisker}->{'100_continue'}++ if ( $4 == 100 ); @H = http_read_headers( $stream, $hin, $hout ); if ( !$H[0] ) { $$hout{whisker}->{'error'} = 'Error in reading headers: ' . $H[1]; $stream->{'close'}->(); return 1; } if ( !defined $H[3] ) { # connection my ($t) = utils_find_lowercase_key( $hin, 'connection' ); $H[3] = $t || 'close'; } } while ( $$hout{whisker}->{'code'} == 100 ); } else { # http ver 0.9, we need to fake it since headers are not sent $$hout{whisker}->{version} = '0.9'; $$hout{whisker}->{code} = 200; $$hout{whisker}->{message} = ''; $H[3] = 'close'; } if ( $$hout{whisker}->{code}==404 && defined $$W{'shortcut_on_404'} ) { $stream->{'close'}->(); } elsif ( defined $$W{data_sock} ) { $$hout{whisker}->{data_sock} = $stream->{sock}; $$hout{whisker}->{data_stream} = $stream; } else { if ( $$W{'force_bodysnatch'} || ( $$W{'method'} ne 'HEAD' && $$hout{whisker}->{'code'} != 206 && $$hout{whisker}->{'code'} != 102 ) ) { return 1 if ( !http_read_body( $stream, $hin, $hout, $H[1], $H[2] ) ); if ( lc( $H[1] ) eq 'chunked' && defined $$hin{whisker}->{hide_chunked_responses} && $$hin{whisker}->{hide_chunked_responses} == 1 && !defined $$hin{whisker}->{save_raw_chunks} ) { $$hout{'Content-Length'} = length( $$hout{whisker}->{data} ); utils_delete_lowercase_key( $hout, 'transfer-encoding' ); my $new = []; my $cl = 0; foreach ( @{ $$hout{whisker}->{header_order} } ) { my $l = lc($_); if ( $l eq 'content-length' ) { $cl++; next if ( $cl > 1 ); } push @$new, $_ if ( $l ne 'transfer-encoding' ); } push @$new, 'Content-Length' if ( $cl == 0 ); $$hout{whisker}->{header_order} = $new; } } my ($ch) = LW2::utils_find_lowercase_key( $hin, 'connection' ); my $cl = 0; $cl++ if ( ( lc( $H[3] ) ne 'keep-alive' || ( defined $ch && $ch =~ m/close/i ) ) && $$W{'force_open'} != 1 ); $cl++ if ( $$W{'force_close'} > 0 || $stream->{forceclose} > 0 ); $cl++ if ( $$W{'ssl'} > 0 && $LW_SSL_KEEPALIVE == 0 ); $stream->{'close'}->() if ($cl); } if ( defined $$W{'header_delete_on_success'} && ref( $$W{'header_delete_on_success'} ) ) { foreach ( @{ $$W{'header_delete_on_success'} } ) { delete $hin->{$_} if ( exists $hin->{$_} ); } delete $$W{header_delete_on_success}; } $stream->{reqs}++; $$hout{whisker}->{'stats_reqs'} = $stream->{reqs}; $$hout{whisker}->{'stats_syns'} = $stream->{syns}; $$hout{whisker}->{'socket_state'} = $stream->{state}; delete $$hout{whisker}->{'error'}; # no error return 0; } sub http_req2line { my ( $S, $hin, $UO ) = ( '', @_ ); $UO ||= 0; if ( defined $$hin{whisker}->{'full_request_override'} ) { return $$hin{whisker}->{'full_request_override'}; } else { # notice the components of a request--this is for flexibility if ( $UO != 1 ) { $S .= $$hin{whisker}->{'method'} . $$hin{whisker}->{'http_space1'}; if ( $$hin{whisker}->{'include_host_in_uri'} > 0 ) { if ( $$hin{whisker}->{'ssl'} == 1 ) { $S .= 'https://'; } else { $S .= 'http://'; } if ( defined $$hin{whisker}->{'uri_user'} ) { $S .= $$hin{whisker}->{'uri_user'}; if ( defined $$hin{whisker}->{'uri_password'} ) { $S .= ':' . $$hin{whisker}->{'uri_password'}; } $S .= '@'; } $S .= $$hin{whisker}->{'host'} . ':' . $$hin{whisker}->{'port'}; } } $S .= $$hin{whisker}->{'uri_prefix'} . $$hin{whisker}->{'uri'} . $$hin{whisker}->{'uri_postfix'}; if ( defined $$hin{whisker}->{'parameters'} && $$hin{whisker}->{'parameters'} ne '' ) { $S .= $$hin{whisker}->{'uri_param_sep'} . $$hin{whisker}->{'parameters'}; } if ( $UO != 1 ) { if ( $$hin{whisker}->{'version'} ne '0.9' ) { $S .= $$hin{whisker}->{'http_space2'} . $$hin{whisker}->{'protocol'} . '/' . $$hin{whisker}->{'version'}; } $S .= $$hin{whisker}->{'http_eol'}; } } return $S; } sub http_resp2line { my $hout = shift; my $out = ''; return undef if ( !defined $hout || !ref($hout) ); return undef if ( $hout->{whisker}->{MAGIC} != 31340 ); $out .= $$hout{whisker}->{protocol}; $out .= '/'; $out .= $$hout{whisker}->{version}; $out .= $$hout{whisker}->{http_space1}; $out .= $$hout{whisker}->{code}; $out .= $$hout{whisker}->{http_space2}; $out .= $$hout{whisker}->{message}; $out .= $$hout{whisker}->{http_eol}; return $out; } sub _http_getline { my $stream = shift; my ( $str, $t, $bc ) = ( '', 0, 0 ); $t = index( $stream->{bufin}, "\n", 0 ); while ( $t < 0 ) { return undef if !$stream->{read}->() || length($stream->{bufin}) == $bc; $t = index( $stream->{bufin}, "\n", 0 ); $bc = length( $stream->{bufin} ); } my $r = substr( $stream->{bufin}, 0, $t + 1 ); $stream->{bufin} = substr( $stream->{bufin}, $t + 1 ); return $r; } sub _http_get { # read from socket w/ timeouts my ( $stream, $amount ) = @_; my ( $str, $t, $b ) = ( '', '', 0 ); while ( $amount > length( $stream->{bufin} ) ) { return undef if !$stream->{read}->() || length( $stream->{bufin} ) == $b; $b = length( $stream->{bufin} ); } my $r = substr( $stream->{bufin}, 0, $amount ); $stream->{bufin} = substr( $stream->{bufin}, $amount ); return $r; } sub _http_getall { my ( $tmp, $b, $stream, $max_size ) = ('', 0, @_); while ( $stream->{read}->() && length( $stream->{bufin} ) != $b) { last if ( $max_size && length( $stream->{bufin} ) >= $max_size ); $b = length( $stream->{bufin} ); } ( $tmp, $stream->{bufin} ) = ( $stream->{bufin}, '' ); $tmp = substr($tmp, 0, $max_size) if($max_size && length($tmp) > $max_size); return $tmp; } sub http_fixup_request { my $hin = shift; return if ( !( defined $hin && ref($hin) ) ); $$hin{whisker}->{uri} = '/' if ( $$hin{whisker}->{uri} eq '' ); $$hin{whisker}->{http_space1}= ' '; $$hin{whisker}->{http_space2}= ' '; $$hin{whisker}->{protocol}= 'HTTP'; $$hin{whisker}->{uri_param_sep}= '?'; if ( $$hin{whisker}->{'version'} eq '1.1' ) { my ($host) = utils_find_lowercase_key($hin,'host'); $$hin{'Host'} = $$hin{whisker}->{'host'} if(!defined $host || $host eq ''); $$hin{'Host'} .= ':' . $$hin{whisker}->{'port'} if ( $$hin{whisker}->{port} != 80 || ( $$hin{whisker}->{ssl}==1 && $$hin{whisker}->{port} != 443 ) ); my ($conn) = utils_find_lowercase_key($hin,'connection'); $$hin{'Connection'} = 'Keep-Alive' if(!defined $conn || $conn eq ''); } elsif( $$hin{whisker}->{'version'} eq '1.0' ){ my ($conn) = utils_find_lowercase_key($hin,'connection'); $$hin{'Connection'} = 'close' if(!defined $conn || $conn eq ''); } utils_delete_lowercase_key( $hin, 'content-length' ); if ( $$hin{whisker}->{method} eq 'POST' || defined $$hin{whisker}->{data} ) { $$hin{whisker}->{data}||=''; $$hin{'Content-Length'} = length( $$hin{whisker}->{'data'} ); my ($v) = utils_find_lowercase_key( $hin, 'content-type' ); if ( !defined $v || $v eq '' ) { $$hin{'Content-Type'} = 'application/x-www-form-urlencoded'; } } if ( defined $$hin{whisker}->{'proxy_host'} ) { $$hin{whisker}->{'include_host_in_uri'} = 1; } } sub http_reset { my $stream; foreach $stream ( keys %http_host_cache ) { $stream->{'close'}->() if(ref($stream)); delete $http_host_cache{$stream}; } } sub ssl_is_available { return 0 if ( $LW_SSL_LIB == 0 ); if ( $LW_SSL_LIB == 1 ) { return 1 if ( !wantarray() ); return ( 1, "Net::SSLeay", $Net::SSLeay::VERSION ); } elsif ( $LW_SSL_LIB == 2 ) { return 1 if ( !wantarray() ); return ( 1, "Net::SSL", $Net::SSL::VERSION ); } else { utils_carp('',"ssl_is_available: sanity check failed"); return 0; } } sub _ssl_save_info { my ( $hr, $stream ) = @_; my $cert; if ( $stream->{streamtype} == 4 ) { my $SSL = $stream->{sock}; $hr->{whisker}->{ssl_cipher} = Net::SSLeay::get_cipher($SSL); if ( $cert = Net::SSLeay::get_peer_certificate($SSL) ) { $hr->{whisker}->{ssl_cert_subject} = Net::SSLeay::X509_NAME_oneline( Net::SSLeay::X509_get_subject_name($cert) ); $hr->{whisker}->{ssl_cert_issuer} = Net::SSLeay::X509_NAME_oneline( Net::SSLeay::X509_get_issuer_name($cert) ); } return; } if ( $stream->{streamtype} == 5 ) { $hr->{whisker}->{ssl_cipher} = $stream->{sock}->get_cipher(); if ( $cert = $stream->{sock}->get_peer_certificate() ) { $hr->{whisker}->{ssl_cert_subject} = $cert->subject_name(); $hr->{whisker}->{ssl_cert_issuer} = $cert->issuer_name(); } return; } } sub http_read_headers { my ( $stream, $in, $hout ) = @_; my $W = $in->{whisker}; my ( $a, $b, $LC, $CL, $TE, $CO ); my $last; pos( $stream->{bufin} ) = 0; while (1) { $last = pos( $stream->{bufin} ); if ( $stream->{bufin} !~ m/(.*?)[\r]{0,1}\n/g ) { if ( !$stream->{read}->() ) { last if ( $$W{require_newline_after_headers} == 0 && length( $stream->{bufin} ) - 1 == $last ); return ( 0, 'error reading in all headers' ); } pos( $stream->{bufin} ) = $last; next; } last if ( $1 eq '' ); return ( 0, 'malformed header' ) if ( $1 !~ m/^([^:]+):([ \t]*)(.*)$/ ); $$hout{whisker}->{'abnormal_header_spacing'}++ if ( $2 ne ' ' ); $a = $1; $b = $3; $LC = lc($a); next if ( $LC eq 'whisker' ); $TE = lc($b) if ( $LC eq 'transfer-encoding' ); $CL = $b if ( $LC eq 'content-length' ); $CO = lc($b) if ( $LC eq 'connection' ); push( @{ $$hout{whisker}->{cookies} }, $b ) if ( $LC eq 'set-cookie' || $LC eq 'set-cookie2' ); if ( $$W{'lowercase_incoming_headers'} > 0 ) { $a = $LC; } elsif ( $$W{'normalize_incoming_headers'} > 0 ) { $a = ucfirst($LC); $a = 'ETag' if ( $a eq 'Etag' ); $a =~ s/(-[a-z])/uc($1)/eg; } push( @{ $$hout{whisker}->{header_order} }, $a ); if ( defined $$hout{$a} && $$W{ignore_duplicate_headers} != 1 ) { $$hout{$a} = [ $$hout{$a} ] if ( !ref( $$hout{$a} ) ); push( @{ $$hout{$a} }, $b ); } else { $$hout{$a} = $b; } } my $found = pos( $stream->{bufin} ); $$hout{whisker}->{'raw_header_data'} = substr( $stream->{bufin}, 0, $found ) if ( defined $$W{'save_raw_headers'} ); $stream->{bufin} = substr( $stream->{bufin}, $found ); return ( 1, $TE, $CL, $CO ); } sub http_read_body { my ( $temp, $stream, $hin, $hout, $enc, $len ) = ( '', @_ ); my $max_size = $hin->{whisker}->{max_size} || 0; $$hout{whisker}->{data} = ''; if ( defined $enc && lc($enc) eq 'chunked' ) { my $total = 0; my $x; my $saveraw = $$hin{whisker}->{save_raw_chunks} || 0; if ( !defined( $x = _http_getline($stream) ) ) { $$hout{whisker}->{'error'} = 'Error reading chunked data length'; $stream->{'close'}->(); return 0; } $a = $x; $a =~ tr/a-fA-F0-9//cd; if ( length($a) > 8 ) { $$hout{whisker}->{'error'} = 'Chunked size is too big: ' . $x; $stream->{'close'}->(); return 0; } $len = hex($a); $len = $max_size if ( $max_size && $len > $max_size ); $$hout{whisker}->{'data'} = $x if ($saveraw); while ( $len > 0 ) { # chunked sucks if ( !defined( $temp = _http_get( $stream, $len ) ) ) { $$hout{whisker}->{'error'} = 'Error reading chunked data'; $stream->{'close'}->(); return 0; } $$hout{whisker}->{'data'} = $$hout{whisker}->{'data'} . $temp; $total += $len; if ( $max_size && $total >= $max_size ) { $stream->{'close'}->(); return 1; } $temp = _http_getline($stream); $$hout{whisker}->{'data'} .= $temp if ( $saveraw && defined $temp ); if ( defined $temp && $temp =~ /^[\r\n]*$/ ) { $temp = _http_getline($stream); $$hout{whisker}->{'data'} .= $temp if ( $saveraw && defined $temp ); } if ( !defined $temp ) { $$hout{whisker}->{'error'} = 'Error reading chunked data'; $stream->{'close'}->(); return 0; } $temp =~ tr/a-fA-F0-9//cd; if ( length($temp) > 8 ) { $$hout{whisker}->{'error'} = 'Chunked size is too big: ' . $temp; $stream->{'close'}->(); return 0; } $len = hex($temp); $len = ( $max_size - $total ) if ( $max_size && $len > ( $max_size - $total ) ); } while ( defined( $_ = _http_getline($stream) ) ) { $$hout{whisker}->{'data'} .= $_ if ($saveraw); tr/\r\n//d; last if ( $_ eq '' ); } } else { if ( defined $len ) { return 1 if ( $len <= 0 ); $len = $max_size if ( $max_size && $len > $max_size ); if ( !defined( $$hout{whisker}->{data} = _http_get( $stream, $len ) ) ) { $$hout{whisker}->{'error'} = 'Error reading data: ' . $stream->{error}; $stream->{'close'}->(); return 0; } } else { # Yuck...read until server stops sending.... $$hout{whisker}->{data} = _http_getall( $stream, $max_size ); $stream->{'close'}->(); } $$hout{whisker}->{'data'} ||= ''; } return 1; } sub http_construct_headers { my $hin = shift; my ( %SENT, $output, $i ); my $EOL = $hin->{whisker}->{http_eol} || "\x0d\x0a"; if ( defined $hin->{whisker}->{header_order} && ref( $hin->{whisker}->{header_order} ) eq 'ARRAY' ) { foreach ( @{ $hin->{whisker}->{header_order} } ) { next if ( $_ eq '' || $_ eq 'whisker' || !defined $hin->{$_} ); if ( ref( $hin->{$_} ) ) { utils_croak("http_construct_headers: non-array header value reference") if ( ref( $hin->{$_} ) ne 'ARRAY' ); $SENT{$_} ||= 0; my $v = $$hin{$_}->[ $SENT{$_} ]; $output .= "$_: $v$EOL"; } else { $output .= "$_: $$hin{$_}$EOL"; } $SENT{$_}++; } } foreach ( keys %$hin ) { next if ( $_ eq '' || $_ eq 'whisker' ); if ( ref( $hin->{$_} ) ) { # header with multiple values utils_croak("http_construct_headers: non-array header value ref") if ( ref( $hin->{$_} ) ne 'ARRAY' ); $SENT{$_} ||= 0; for($i=$SENT{$_}; $i<~~@{ $hin->{$_} }; $i++) { $output .= "$_: " . $hin->{$_}->[$i] . $EOL; } } else { # normal header next if ( defined $SENT{$_} ); $output .= "$_: $$hin{$_}$EOL"; } } return $output; } sub http_close { my $hin = shift; my $cache_key = stream_key($hin); return if ( !defined $http_host_cache{$cache_key} ); my $stream = $http_host_cache{$cache_key}; $stream->{'close'}->(); } sub http_do_request_timeout { my ( $req, $resp, $timeout ) = @_; $timeout ||= 30; my $result; eval { local $SIG{ALRM} = sub { die "timeout\n" }; eval { alarm($timeout) }; $result = LW2::http_do_request( $req, $resp ); eval { alarm(0) }; }; if ($@) { $result = 1; $resp->{whisker}->{error} = 'Error with timeout wrapper'; $resp->{whisker}->{error} = 'Total transaction timed out' if ( $@ =~ /timeout/ ); } return $result; } { # start md5 packaged varbs my ( @S, @T, @M ); my $code = ''; my $MD5_TRYLOADING = 1; sub md5 { return undef if ( !defined $_[0] ); # oops, forgot the data if ($MD5_TRYLOADING) { $MD5_TRYLOADING = 0; eval "require MD5"; } return MD5->hexhash( $_[0] ) if ($MD5::VERSION); my $DATA = _md5_pad( $_[0] ); &_md5_init() if ( !defined $M[0] ); return _md5_perl_generated( \$DATA ); } sub _md5_init { return if ( defined $S[0] ); my $i; for ( $i = 1 ; $i <= 64 ; $i++ ) { $T[ $i - 1 ] = int( ( 2**32 ) * abs( sin($i) ) ); } my @t = ( 7, 12, 17, 22, 5, 9, 14, 20, 4, 11, 16, 23, 6, 10, 15, 21 ); for ( $i = 0 ; $i < 64 ; $i++ ) { $S[$i] = $t[ ( int( $i / 16 ) * 4 ) + ( $i % 4 ) ]; } @M = ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9 ); &_md5_generate(); my $TEST = _md5_pad('foobar'); if ( _md5_perl_generated( \$TEST ) ne '3858f62230ac3c915f300c664312c63f' ) { utils_carp('md5: MD5 self-test not successful.'); } } sub _md5_pad { my $l = length( my $msg = shift() . chr(128) ); $msg .= "\0" x ( ( $l % 64 <= 56 ? 56 : 120 ) - $l % 64 ); $l = ( $l - 1 ) * 8; $msg .= pack 'VV', $l & 0xffffffff, ( $l >> 16 >> 16 ); return $msg; } sub _md5_generate { my $N = 'abcddabccdabbcda'; my ( $i, $M ) = ( 0, '' ); $M = '&0xffffffff' if ( ( 1 << 16 ) << 16 ); # mask for 64bit systems $code = <