root/HTML-ResolveLink/trunk/lib/HTML/ResolveLink.pm

Revision 2107 (checked in by miyagawa, 14 years ago)

0.05; fix quot typo

  • Property svn:keywords set to Id Revision
Line 
1 package HTML::ResolveLink;
2
3 use strict;
4 our $VERSION = '0.05';
5 use base qw(HTML::Parser);
6
7 use Carp;
8 use HTML::Tagset ();
9 use URI;
10
11 sub new {
12     my($class, %p) = @_;
13     my $self = $class->SUPER::new(
14         start_h => [ \&_start_tag, "self,tagname,attr,attrseq,text" ],
15         default_h => [ \&_default, "self,tagname,attr,text" ],
16     );
17
18     unless ($p{base}) {
19         Carp::croak("HTML::ResolveLink->new: base is a required parameter");
20     }
21
22     $p{base} = URI->new($p{base}) unless ref $p{base};
23     $self->{resolvelink_base} = $p{base};
24     $self->{resolvelink_callback} = $p{callback} if $p{callback};
25
26     $self;
27 }
28
29 sub _start_tag {
30     my($self, $tagname, $attr, $attrseq, $text) = @_;
31
32     if ($tagname eq 'base' && defined $attr->{href}) {
33         $self->{resolvelink_base} = $attr->{href};
34     }
35
36     my $base = $self->{resolvelink_base};
37
38     my $links = $HTML::Tagset::linkElements{$tagname} || [];
39     $links = [$links] unless ref $links;
40
41     for my $a (@$links) {
42         next unless exists $attr->{$a};
43
44         my $link = $attr->{$a};
45         my $uri  = URI->new($link);
46
47         # relative link:
48         unless (defined $uri->scheme) {
49             my $old = $uri;
50             $uri = $uri->abs($base);
51             $attr->{$a} = $uri->as_string;
52             if ($self->{resolvelink_callback}) {
53                 $self->{resolvelink_callback}->($uri, $old);
54             }
55             $self->{resolvelink_count}++;
56         }
57     }
58
59     $self->{resolvelink_html} .= "<$tagname";
60     for my $a (@$attrseq) {
61         next if $a eq '/';
62         $self->{resolvelink_html} .= sprintf qq( %s="%s"), $a, _escape($attr->{$a});
63     }
64     $self->{resolvelink_html} .= ' /' if $attr->{'/'};
65     $self->{resolvelink_html} .= '>';
66 }
67
68 sub _default {
69     my($self, $tagname, $attr, $text) = @_;
70     $self->{resolvelink_html} .= $text;
71 }
72
73 my %escape = (
74     '<' => '&lt;',
75     '>' => '&gt;',
76     '"' => '&quot;',
77     '&' => '&amp;',
78 );
79 my $esc_re = join '|', keys %escape;
80
81 sub _escape {
82     my $str = shift;
83     $str =~ s/($esc_re)/$escape{$1}/g;
84     $str;
85 }
86
87 sub resolve {
88     my($self, $html) = @_;
89
90     # init
91     $self->{resolvelink_html} = '';
92     $self->{resolvelink_count} = 0;
93
94     $self->parse($html);
95     $self->eof;
96
97     $self->{resolvelink_html};
98 }
99
100 sub resolved_count {
101     my $self = shift;
102     $self->{resolvelink_count};
103 }
104
105 1;
106 __END__
107
108 =head1 NAME
109
110 HTML::ResolveLink - Resolve relative links in (X)HTML into absolute URI
111
112 =head1 SYNOPSIS
113
114   use HTML::ResolveLink;
115
116   my $resolver = HTML::ResolveLink->new(
117       base => 'http://www.example.com/foo/bar.html',
118       callback => sub {
119          my($uri, $old) = @_;
120          # ...
121       },
122   );
123   $html = $resolver->resolve($html);
124
125 =head1 DESCRIPTION
126
127 HTML::ResolveLink is a module to rewrite relative links in XHTML or
128 HTML into absolute URI.
129
130 For example. when you have
131
132   <a href="foo.html">foo</a>
133   <img src="/bar.gif" />
134
135 and use C<http://www.example.com/foo/bar> as C<base> URL, you'll get:
136
137   <a href="http://www.example.com/foo/foo.html">foo</a>
138   <img src="http://www.example.com/bar.gif" />
139
140 If the parser encounters C<< <base> >> tag in HTML, it'll honor that.
141
142 =head1 METHODS
143
144 =over 4
145
146 =item new
147
148   my $resolver = HTML::ResolveLink->new(
149       base => 'http://www.example.com/',
150       callback => \&callback,
151   );
152
153 C<base> is a required parameter, which is used to resolve the relative
154 URI found in the document.
155
156 C<callback> is an optional parameter, which is a callback subroutine
157 reference which would take new resolved URI and the original path as
158 arguments.
159
160 Here's an example code to illustrate how to use callback function.
161
162   my $count;
163   my $resolver = HTML::ResolveLink->new(
164       base => $base,
165       callback => sub {
166           my($uri, $old) = @_;
167           warn "$old is resolved to $uri";
168           $count++;
169       },
170   );
171
172   $html = $resolver->resolve($html);
173
174   if ($count) {
175       warn "HTML::ResolveLink resolved $count links";
176   }
177
178 =item resolve
179
180   $html = $resolver->resolve($html);
181
182 Resolves relative URI found in C<$html> into absolute and returns a
183 string containing rewritten one.
184
185 =item resolved_count
186
187   $count = $resolver->resolved_count;
188
189 Returns how many URIs are resolved during the previous I<resolve>
190 method call. This should be called after the I<resolve>, otherwise
191 returns undef.
192
193 =head1 AUTHOR
194
195 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
196
197 This library is free software; you can redistribute it and/or modify
198 it under the same terms as Perl itself.
199
200 =head1 SEE ALSO
201
202 L<HTML::Parser>, L<HTML::LinkExtor>
203
204 =cut
Note: See TracBrowser for help on using the browser.