#!/usr/local/bin/perl use 5.005; use strict; use Getopt::Std; use LWP::UserAgent; use HTML::Parser; use Pod::Usage; use URI; package HTML::Parser::Links; use base qw(HTML::Parser); sub new { my($class, $base) = @_; my $parser = new HTML::Parser; $parser->{base } = $base; $parser->{links} = []; $parser->{fragment} = {}; bless $parser, $class } sub start { my($parser, $tag, $attr, $attrseq, $origtext) = @_; $tag eq 'base' and $parser->{base} = $attr->{href}; $tag eq 'a' and $attr->{href} and do { my $base = $parser->{base}; my $href = $attr->{href}; my $uri = new_abs URI $href, $base; push @{$parser->{links}}, $uri; }; $tag eq 'a' and $attr->{name} and do { my $name = $attr->{name}; $parser->{fragment}{$name} = 1; }; } sub links { my $parser = shift; $parser->{links} } sub check_fragment { my($parser, $fragment) = @_; $parser->{fragment}{$fragment} } package Page; sub new { my($package, $uri) = @_; my $page = { uri => $uri, base => $uri }; bless $page, $package } sub uri { shift->{'uri' } } sub base { shift->{'base'} } sub get { my $page = shift; my $uri = $page->{uri}; exists $Page::Content{$uri} and return $Page::Content{$uri}; my $ua = new LWP::UserAgent; my $request = new HTTP::Request GET => $uri; my $response = $ua->request($request); $response->is_success or return undef; $page->{base} = $response->request->uri; $Page::Content{$uri} = $response->content; $response->content } sub parse { my $page = shift; my $uri = $page->{uri}; exists $Page::Parser{$uri} and return $Page::Parser{$uri}; my $content = $page->get; defined $content or return undef; my $parser = new HTML::Parser::Links $page->base; $parser->parse($content); $parser->eof; $Page::Parser{$uri} = $parser; $parser } sub links { my $page = shift; my $parser = $page->parse; defined $parser or return undef; $parser->links } package Link; sub new { my($package, $uri) = @_; my $base = $uri ->clone; my $fragment = $base->fragment(undef); my $link = { uri => $uri, base => $base, fragment => $fragment }; bless $link, $package } sub check { my $link = shift; my $uri = $link->{uri}; exists $Link::Check{$uri} and return $Link::Check{$uri}; my $fragment = $link->{fragment}; my $check = defined $fragment ? 'check_fragment' : 'check_base'; my $success = $link->$check(); $Link::Check{$uri} = $success; $success } sub check_fragment { my $link = shift; my $base = $link->{base}; my $fragment = $link->{fragment}; my $page = new Page $base; my $parser = $page->parse; defined $parser or return ''; $parser->check_fragment($fragment) } sub check_base { my $link = shift; my $base = $link->{base}; my $ua = new LWP::UserAgent; my $request = new HTTP::Request HEAD => $base; my $response = $ua->request($request); # Some servers don't like HEAD requests $response->code==500 and do { $request = new HTTP::Request GET => $base; $response = $ua->request($request); }; $response->is_success; } package Spinner; use vars qw($N @Spin); @Spin = ('|', '/', '-', '\\'); sub Spin { print STDERR $Spin[$N++], "\r"; $N==4 and $N=0; } package main; my %Options; my %Checked; my($Scheme, $Authority); my($Pages, $Links, $Broken) = (0, 0, 0); getopt('vt', \%Options); Help(); CheckPages(@ARGV); Summary(); sub Help { $Options{H} and pod2usage(VERBOSE=>1); $Options{M} and pod2usage(VERBOSE=>2); @ARGV or pod2usage(); } sub CheckPages { my @pages = @_; my @URIs = map { new URI $_ } @pages; for my $uri (@URIs) { $Scheme = $uri->scheme; $Authority = $uri->authority; CheckPage($uri); } } sub CheckPage { my $uri = shift; $Checked{$uri} and return; $Checked{$uri} = 1; $Pages++; Twiddle(); print "PAGE $uri\n" if $Options{v} > 1; my $page = new Page $uri; my $links = $page->links; defined $links or die "Can't get $uri\n"; CheckLinks($page, $links); } sub CheckLinks { my($page, $links) = @_; my @links; for my $link (@$links) { $link->scheme eq 'http' or next; my $on_site = $link->authority eq $Authority; $on_site or $Options{o} or next; $Links++; Twiddle(); print "LINK $link\n" if $Options{v} > 2; Link->new($link)->check or do { Report($page, $link); next; }; $on_site or next; $link->fragment(undef); push @links, $link; } $Options{r} or return; for my $link (@links) { CheckPage($link); } } sub Report { my($page, $link) = @_; my $uri = $page->uri->as_string; $link = $link ->as_string; $Options{a} and do { $uri =~ s($Scheme://$Authority)(); $link =~ s($Scheme://$Authority)(); }; $Broken++; print "BROKEN $uri -> $link\n" if $Options{v} > 0; } sub Twiddle { $Options{t}==1 and Spinner::Spin(); $Options{t}==2 and Progress(); } sub Progress { print STDERR "$Pages pages, $Links links, $Broken broken\r"; } sub Summary { print STDERR "Checked $Pages pages, $Links links \n"; print STDERR "Found $Broken broken links\n"; } __END__ =head1 NAME B - check the links on an HTML page =head1 SYNOPSIS B [-a] [-o] [-r] [B<-t> I] [B<-v> I] I ... =head1 DESCRIPTION B reads the web pages at I ..., and checks the existence of any links that it finds there. =head1 OPTIONS =over 4 =item B<-a> Omit the scheme://authority part when reporting broken links. =item B<-o> Check off-site links. =item B<-r> Recursively check pages that I links to. Doesn't recurse to off-site pages. =item B<-t> I Indicate activity with a twiddle: 0, 1, 2 =item B<-v> I Verbosity level: 0, 1, 2, 3 =back =head1 BUGS =over 4 =item * The definition of I is too simple. There should be a way to restrict recursion to a directory tree within a web site. =back =head1 AUTHOR Steven McDougall, swmcd@world.std.com =head1 COPYRIGHT Copyright 2000 by Steven McDougall. This program is free software; you can redistribute it and/or modify it under the same terms as Perl.