# $Id: Find.pm 1875 2005-09-09 05:41:38Z btrott $

package Feed::Find;
use strict;
use warnings;

use base qw( Class::ErrorHandler );
use LWP::UserAgent;
use HTTP::Request;
use HTML::Parser;
use URI;

our $VERSION = '0.06';

## MIME types that identify a document (or a <link>) as a syndication feed.
use constant FEED_MIME_TYPES => [
    'application/x.atom+xml',
    'application/atom+xml',
    'application/xml',
    'text/xml',
    'application/rss+xml',
    'application/rdf+xml',
];

## File extensions that mark an <a href> target as a probable feed. The /i
## flag has to live on the qr// object itself: flags given on an enclosing
## match operator do not reach inside an interpolated qr//.
our $FEED_EXT = qr/\.(?:rss|xml|rdf)$/i;

## Lookup table built from FEED_MIME_TYPES: lowercased MIME type => 1.
our %IsFeed = map { $_ => 1 } @{ FEED_MIME_TYPES() };

## Feed::Find->find($uri)
##
## Fetches $uri and returns the list of feed URIs discovered for it. If the
## response itself is served with a feed MIME type, $uri is returned as the
## only feed. Otherwise the body is parsed as HTML, chunk by chunk, and the
## feeds collected by _find_links are returned.
##
## On an unsuccessful HTTP response the status line is stored through
## $class->error and that call's return value is returned.
sub find {
    my $class = shift;
    my($uri) = @_;

    my $ua = LWP::UserAgent->new;
    $ua->agent(join '/', $class, $class->VERSION);
    $ua->parse_head(0);   ## We're already basically doing this ourselves.

    my $req = HTTP::Request->new(GET => $uri);
    my $p = _new_parser($uri);

    ## Dying inside the content callback is how the download is cut short;
    ## the response object is still returned by request().
    my $res = $ua->request($req, sub {
        my($chunk, $response, $proto) = @_;
        if ($IsFeed{ $response->content_type }) {
            push @{ $p->{feeds} }, $uri;
            die "Done parsing";
        }
        ## parse() returns false once a handler has called $p->eof.
        $p->parse($chunk) or die "Done parsing";
    });
    return $class->error($res->status_line) unless $res->is_success;

    return @{ $p->{feeds} };
}

## Feed::Find->find_in_html($html, $base_uri)
##
## Scans an already-fetched HTML document for feeds and returns the list of
## feed URIs found. $html is a reference to the HTML string (a plain string
## is accepted as well); $base_uri is used to resolve relative links.
sub find_in_html {
    my $class = shift;
    my($html, $base_uri) = @_;

    my $p = _new_parser($base_uri);
    $p->parse(ref $html ? $$html : $html);

    return @{ $p->{feeds} };
}

## _new_parser($base_uri)
##
## Builds the HTML::Parser shared by find and find_in_html: start tags are
## routed to _find_links, and the parser object doubles as the state holder
## for the current base URI and the list of feeds found so far.
sub _new_parser {
    my($base_uri) = @_;
    my $p = HTML::Parser->new(
        api_version => 3,
        start_h     => [ \&_find_links, 'self,tagname,attr' ],
    );
    $p->{base_uri} = $base_uri;
    $p->{feeds}    = [];
    return $p;
}

## _find_links($p, $tag, $attr)
##
## Start-tag handler. Records feed URIs in $p->{feeds}, tracks <base href>
## in $p->{base_uri}, and stops the parser at the first unrecognised tag
## once at least one feed has been found.
sub _find_links {
    my($p, $tag, $attr) = @_;
    my $base_uri = $p->{base_uri};

    if ($tag eq 'link') {
        return unless $attr->{rel};

        ## Without an href there is nothing to point at; resolving an
        ## empty reference would wrongly report the page itself as a feed.
        my $href = $attr->{href};
        return unless defined $href && length $href;

        my %rel = map { $_ => 1 } split /\s+/, lc($attr->{rel});

        ## The type attribute is optional; treat a missing one as empty.
        my $type = lc(defined $attr->{type} ? $attr->{type} : '');
        $type =~ s/^\s+//;
        $type =~ s/\s+$//;

        push @{ $p->{feeds} }, URI->new_abs($href, $base_uri)->as_string
            if $IsFeed{$type} && ($rel{alternate} || $rel{'service.feed'});
    } elsif ($tag eq 'base') {
        $p->{base_uri} = $attr->{href} if $attr->{href};
    } elsif ($tag =~ /^(?:meta|isindex|title|script|style|head|html)$/) {
        ## Ignore other valid tags inside of <head>.
    } elsif ($tag eq 'a') {
        my $href = $attr->{href} or return;
        my $uri = URI->new($href);
        ## No /o here, so a caller's override of $FEED_EXT takes effect
        ## even after the first match. The /i still applies when the
        ## override is a plain string rather than a qr// object.
        push @{ $p->{feeds} }, URI->new_abs($href, $base_uri)->as_string
            if $uri->path =~ /$FEED_EXT/i;
    } else {
        ## Anything else indicates the start of the <body>,
        ## so we stop parsing.
        $p->eof if @{ $p->{feeds} };
    }
}

1;
__END__

=head1 NAME

Feed::Find - Syndication feed auto-discovery

=head1 SYNOPSIS

    use Feed::Find;
    my @feeds = Feed::Find->find('http://example.com/');

=head1 DESCRIPTION

I