#!/usr/bin/perl
use strict;
use warnings;

use HTML::LinkExtor;
use LWP::Simple;
use WWW::RobotRules;
use URI::URL;
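
## The Iterator / NEXTVAL / imap / igrep helpers used below are the
## lazy-iterator utilities from Higher-Order Perl's Iterator_Utils
## module. Minimal definitions (a sketch mirroring the book's versions)
## are inlined here so that this listing is self-contained.

# An iterator is just a closure; calling it produces the next value.
sub Iterator (&) { return $_[0] }
sub NEXTVAL     { $_[0]->() }

# Lazy map: apply BLOCK (with the current value in $_) to each item.
sub imap (&$) {
    my ($transform, $it) = @_;
    return Iterator {
        local $_ = NEXTVAL($it);
        return unless defined $_;
        return $transform->();
    };
}

# Lazy grep: draw items from $it until one satisfies BLOCK.
sub igrep (&$) {
    my ($is_interesting, $it) = @_;
    return Iterator {
        local $_;
        while (defined($_ = NEXTVAL($it))) {
            return $_ if $is_interesting->();
        }
        return;
    };
}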
## Build a filter that consults each site's /robots.txt before letting
## a URL through. Each site's robots.txt is fetched at most once.
sub make_robot_filter {
    my $agent = shift;
    my %seen_site;
    my $rules = WWW::RobotRules->new($agent);
    return sub {
        my $url = url(shift());
        return 1 unless $url->scheme eq 'http';
        unless ($seen_site{$url->netloc}++) {
            my $robots = $url->clone;
            $robots->path('/robots.txt');
            $robots->frag(undef);
            # If robots.txt is missing, parse an empty rule set, which
            # allows everything on that site.
            $rules->parse($robots, get($robots) || '');
        }
        return $rules->allowed($url);
    };
}
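
## For example (hypothetical URL; fetching robots.txt requires network
## access):
##
##   my $filter = make_robot_filter('Grasshopper/1.0');
##   print "allowed\n" if $filter->('http://example.com/page.html');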
my $ROBOT_NAME = 'Grasshopper/1.0';
## Traverse the web breadth-first, starting from the given URLs.
## Returns an iterator; in list context each NEXTVAL yields
## ($url, \%head, $referrer, $html), in scalar context just $url.
## An optional leading coderef decides which links are worth following.
sub traverse {
    my $interesting_link;
    $interesting_link = shift if ref $_[0] eq 'CODE';
    my @queue = map [$_, 'supplied by user'], @_;
    my %seen;
    my $robot_filter = make_robot_filter($ROBOT_NAME);
    my $q_it = igrep { ! $seen{$_->[0]}++ && $robot_filter->($_->[0]) }
               imap  { $_->[0] =~ s/#.*$//; $_ }    # discard #fragments
               Iterator { return shift(@queue) };
    if ($interesting_link) {
        $q_it = igrep { $interesting_link->() } $q_it;
    }
    return imap {
        my ($url, $referrer) = @$_;
        my (%head, $html);
        @head{qw(TYPE LENGTH LAST_MODIFIED EXPIRES SERVER)} = head($url);
        if (($head{TYPE} || '') eq 'text/html') {
            # No 'my' here: assign to the outer $html so that the page
            # source is also returned to the caller below.
            $html = get($url);
            push @queue, map [$_, $url], get_links($url, $html)
                if defined $html;
        }
        return wantarray ? ($url, \%head, $referrer, $html) : $url;
    } $q_it;
}
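
## A hypothetical sketch of the two calling contexts (example.com is a
## placeholder):
##
##   my $urls = traverse('http://example.com/');
##   my $just_url = NEXTVAL($urls);                     # scalar context
##   my ($url, $head, $ref, $html) = NEXTVAL($urls);    # list context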
## Extract every link (href, src, and so on) from an HTML document,
## resolved against $base into absolute URLs.
sub get_links {
    my ($base, $html) = @_;
    my @links;
    my $more_links = sub {
        my ($tag, %attrs) = @_;
        push @links, values %attrs;
    };
    HTML::LinkExtor->new($more_links, $base)->parse($html);
    return @links;
}
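
## For instance (hypothetical document), parsing
##   <a href="b.html">b</a> <img src="../pic.png">
## against the base http://example.com/a/ yields the absolute URLs
## http://example.com/a/b.html and http://example.com/pic.png.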
## Sample usage: crawl the pages under $top, following only links that
## stay inside that area, and print each link with its referrer.
my $top = 'http://perl.plover.com/yak/';
my $interesting = sub { $_->[0] =~ /^\Q$top/o };
my $urls = traverse($interesting, $top);
while (my ($url, $head, $referrer) = NEXTVAL($urls)) {
    print "$referrer -> $url\n";
    print " (bad link)\n" unless $head->{TYPE};
    <>;    # pause: press Enter to fetch the next page
}