use HTML::LinkExtor;
use LWP::Simple;
# Iterator BLOCK
#
# Syntactic sugar for building an iterator: the (&) prototype lets callers
# write `Iterator { ... }`, and the block is handed back unchanged as a
# code reference. Calling that reference produces the iterator's next value.
sub Iterator (&) {
    my ($code) = @_;
    return $code;
}
# imap BLOCK $iterator
#
# Lazy counterpart of map for iterators. Returns a new iterator which, on
# each call, fetches the next value from the source iterator via NEXTVAL,
# places it in a localized $_, and returns whatever BLOCK returns.
#
# An undefined value from the source is treated as exhaustion: BLOCK is not
# run and the new iterator returns nothing (empty list / undef).
# BLOCK is invoked in the same context the new iterator was called in.
sub imap (&$) {
    my $xform  = shift;
    my $source = shift;

    return sub {
        local $_ = NEXTVAL($source);
        if (defined $_) {
            return $xform->();
        }
        return;
    };
}
# igrep BLOCK $iterator
#
# Lazy counterpart of grep for iterators. Returns a new iterator which, on
# each call, keeps pulling values from the source iterator via NEXTVAL until
# BLOCK (run with the value in a localized $_) returns true, and then
# returns that value.
#
# An undefined value from the source is treated as exhaustion, in which case
# the new iterator returns nothing (empty list / undef).
sub igrep (&$) {
    my $wanted = shift;
    my $source = shift;

    return sub {
        local $_;
        while (1) {
            $_ = NEXTVAL($source);
            return unless defined $_;
            return $_ if $wanted->();
        }
    };
}
# traverse([$interesting_link,] @start_urls)
#
# Builds a lazy web-crawling iterator.
#
# Arguments:
#   $interesting_link - optional code reference used as a filter. It is
#                       called with $_ set to a [ $url, $referrer ] pair and
#                       should return true for links worth visiting.
#   @start_urls       - URLs that seed the work queue; their referrer is
#                       recorded as the string 'user-supplied'.
#
# Returns an iterator (code reference). Each call visits one URL and yields
# ($url, \%head, $html, $referrer) in list context, or just $url in scalar
# context. %head has the keys TYPE, LENGTH, LAST_MODIFIED, EXPIRES and
# SERVER, filled from LWP::Simple's head(); they are all undefined when
# head() returns nothing. $html is only defined for pages that were fetched.
#
# Side effects: issues HEAD/GET requests through LWP::Simple and prints
# progress trace lines ("Shifting...", the queue head, "unseen.") to the
# currently selected output handle.
sub traverse {
    my $interesting_link;
    $interesting_link = shift if ref($_[0]) eq 'CODE';

    my %seen;
    my @queue = map { [ $_, 'user-supplied' ] } @_;

    # Queue pipeline, innermost stage first: take the next queue entry,
    # strip any "#fragment" from its URL, then drop URLs already visited.
    my $q_it =
        igrep { !$seen{ $_->[0] }++ && print "unseen.\n" }
        imap  { $_->[0] =~ s/#.*$//; $_ }
        Iterator {
            print "Shifting...\n";
            # "|| []" avoids dereferencing undef once the queue is empty;
            # the printed text is the same as before.
            print " @{ $queue[0] || [] }\n";
            return shift @queue;
        };

    if ($interesting_link) {
        $q_it = igrep { $interesting_link->() } $q_it;
    }

    return imap {
        my ($url, $referrer) = @$_;
        my (%head, $html);
        @head{qw(TYPE LENGTH LAST_MODIFIED EXPIRES SERVER)} = head($url);

        # The Content-Type header may carry parameters such as
        # "text/html; charset=UTF-8", and is undefined when head() failed,
        # so match the media type instead of comparing the whole string.
        if (defined $head{TYPE} && $head{TYPE} =~ m{\A \s* text/html \b}xi) {
            $html = get($url);

            # get() yields undef on failure; only scan pages we really have.
            if (defined $html) {
                push @queue, map { [ $_, $url ] } get_links($url, $html);
            }
        }

        return wantarray ? ($url, \%head, $html, $referrer) : $url;
    } $q_it;
}
# get_links($base, $html)
#
# Extracts the values of every link attribute that HTML::LinkExtor reports
# for the document in $html. $base is passed to HTML::LinkExtor as the base
# URL (per its documentation, used to make relative links absolute).
#
# Returns the collected links as a list (their count in scalar context).
# An undefined $html yields no links and the parser is not invoked.
sub get_links {
    my ($base, $html) = @_;
    my @links;

    # Callback invoked by the extractor with the tag name followed by
    # attribute => value pairs; only the values (the links) are kept.
    my $more_links = sub {
        my ($tag, %attrs) = @_;
        push @links, values %attrs;
    };

    if (defined $html) {
        my $extractor = HTML::LinkExtor->new($more_links, $base);
        $extractor->parse($html);
        # Signal end of document so the parser flushes anything buffered.
        $extractor->eof;
    }

    return @links;
}
# NEXTVAL($iterator)
#
# Advances an iterator: calls the given code reference with no arguments,
# in the caller's context, and returns whatever it returns.
sub NEXTVAL {
    my ($iterator) = @_;
    return $iterator->();
}
# Driver: crawl every page reachable from $top that stays under $top,
# printing "referrer -> url" for each visit and flagging URLs whose HEAD
# request produced no content type. Waits for a line of input after each
# page so the crawl can be stepped through interactively.
my $top = 'http://perl.plover.com/';

# Link filter: $_ is a [ $url, $referrer ] pair; keep URLs that start
# with $top.
my $interesting = sub { $_->[0] =~ /^\Q$top/o };

my $urls = traverse($interesting, $top);

# The iterator returns an empty list when the crawl is exhausted, which
# ends the loop.
while (my @page = NEXTVAL($urls)) {
    my ($url, $head, undef, $referrer) = @page;
    print "$referrer -> $url\n";
    if (!$head->{TYPE}) {
        print " (bad link)\n";
    }
    <>;
}