#!/usr/bin/perl

use HTML::LinkExtor;
use LWP::Simple;
use WWW::RobotRules;
use URI::URL;
use Iterator;

{ my %seen_site;                 # hosts whose robots.txt we already fetched
  my $rules = WWW::RobotRules->new($USER_AGENT);  # NOTE(review): $USER_AGENT is never declared in this view — confirm it is set elsewhere
  # robot_filter($url_string): true if robots.txt permits fetching the URL.
  # Only http URLs are considered; the first time a host is seen, its
  # /robots.txt is fetched and fed to the shared WWW::RobotRules object.
  sub robot_filter {
    my $url = url(shift());
    return unless $url->scheme eq 'http';
    unless ($seen_site{$url->netloc}++) {
      my $robots = $url->clone;
      $robots->path('/robots.txt');
      $robots->frag(undef);
      # WWW::RobotRules::parse takes ($robots_txt_url, $content) — the
      # original passed only the content, so rules were parsed against the
      # wrong URL.  Also skip parsing when the fetch fails (get returns
      # undef); allowed() then defaults to permitting the host.
      my $robots_txt = get($robots);
      $rules->parse($robots, $robots_txt) if defined $robots_txt;
    }
    $rules->allowed($url)
  };
}

# traverse([\&interesting_link,] @start_urls)
# Breadth-first web spider built on the Iterator library.  Returns an
# iterator; each NEXTVAL yields ($url, \%head, $html, $referrer) in list
# context (or just $url in scalar context), where %head holds the HEAD
# response fields and $html is the page body for text/html pages.
sub traverse {
  my $interesting_link;
  $interesting_link = shift if ref $_[0] eq 'CODE';

  my %seen;                                    # URLs already emitted once (dedup)
  my @queue = map [$_, 'user-supplied'], @_;   # pending [url, referrer] pairs
  my $q_it = 
    igrep_l {  ! $seen{$_[0]}++ && robot_filter($_[0]) }
      imap_l { $_[0] =~ s/#.*$//; @_ }         # strip fragment so dedup/robots see the canonical URL
        Iterator {
          # Signal exhaustion explicitly instead of dereferencing undef
          # when the queue has drained.
          return unless @queue;
          print "Shifting...\n";               # NOTE(debug): progress trace
          print "  @{$queue[0]}\n";
          return @{shift @queue};
        };
  if ($interesting_link) {
    $q_it = igrep_l {$interesting_link->(@_)} $q_it;
  }

  return imap_l {
    my ($url, $referrer) = @_;

    my (%head, $html);
    @head{qw(TYPE LENGTH LAST_MODIFIED EXPIRES SERVER)} = head($url);
    # Content-Type may carry parameters ("text/html; charset=...") and is
    # undef when the HEAD request fails, so match rather than string 'eq'.
    if (defined $head{TYPE} && $head{TYPE} =~ m{^text/html\b}i) {
      $html = get($url);
      if (defined $html) {                     # get() returns undef on failure
        my @links = get_links($url, $html);
        push @queue, map [$_, $url], @links;
      }
    }
    return wantarray ? ($url, \%head, $html, $referrer) : $url;
  } $q_it;
}


# get_links($base, $html): extract every link URL from an HTML document,
# resolved relative to $base.  Returns the list of URLs (one per link
# attribute; link-bearing tags with several URL attributes contribute all
# of them).
sub get_links {
  my ($base_url, $document) = @_;
  my @found;

  my $extractor = HTML::LinkExtor->new(
    sub {
      my $tag = shift;       # tag name (unused — we want only the URLs)
      my %attr = @_;         # attr-name => absolute URL pairs
      push @found, values %attr;
    },
    $base_url,
  );
  $extractor->parse($document);

  return @found;
}

## Sample usage: spider everything under $top, one page per keypress.
my $top = 'http://perl.plover.com/yak/';
my $within_site = sub { $_[0] =~ /^\Q$top/o };   # follow only links under $top
my $spider = traverse($within_site, $top);
while (my ($url, $head, undef, $referrer) = NEXTVAL($spider)) {
  print "$referrer -> $url\n";
  unless ($head->{TYPE}) {       # HEAD failed: no content type recorded
    print "  (bad link)\n";
  }
  <>;                            # pause: wait for a line of input before continuing
}






