#!/usr/bin/perl use HTML::LinkExtor; use LWP::Simple; use WWW::RobotRules; use URI::URL; use Iterator; sub make_robot_filter { my $agent = shift; my %seen_site; my $rules = WWW::RobotRules->new($agent); return sub { my $url = url(shift()); return 1 unless $url->scheme eq 'http'; unless ($seen_site{$url->netloc}++) { my $robots = $url->clone; $robots->path('/robots.txt'); $robots->frag(undef); $rules->parse($robots, get($robots)); } $rules->allowed($url) }; } my $ROBOT_NAME = 'Grasshopper/1.0'; sub traverse { my $interesting_link; $interesting_link = shift if ref $_[0] eq 'CODE'; my @queue = map [$_, 'supplied by user'], @_; my %seen; my $robot_filter = make_robot_filter($ROBOT_NAME); my $q_it = igrep { ! $seen{$_->[0]}++ && $robot_filter->($_->[0]) } imap { $_->[0] =~ s/#.*$//; $_ } Iterator { return shift(@queue) }; if ($interesting_link) { $q_it = igrep {$interesting_link->()} $q_it; } return imap { my ($url, $referrer) = @$_; my (%head, $html); @head{qw(TYPE LENGTH LAST_MODIFIED EXPIRES SERVER)} = head($url); if ($head{TYPE} eq 'text/html') { my $html = get($url); push @queue, map [$_, $url], get_links($url, $html); } return wantarray ? ($url, \%head, $referrer, $html) : $url; } $q_it; } sub get_links { my ($base, $html) = @_; my @links; my $more_links = sub { my ($tag, %attrs) = @_; push @links, values %attrs; }; HTML::LinkExtor->new($more_links, $base)->parse($html); return @links; } ## Sample usage my $top = 'http://perl.plover.com/yak/'; my $interesting = sub { $_->[0] =~ /^\Q$top/o }; my $urls = traverse($interesting, $top); while (my ($url, $head, $referrer) = NEXTVAL($urls)) { print "$referrer -> $url\n"; print " (bad link)\n" unless $head->{TYPE}; <>; }