root/HTML-Selector-XPath/trunk/lib/HTML/Selector/XPath.pm

Revision 1993 (checked in by miyagawa, 14 years ago)

fixed SYNOPSIS. Added functional interface. Added test using H::TB::XPath

Line 
1 package HTML::Selector::XPath;
2
3 use strict;
4 our $VERSION = '0.01';
5
6 require Exporter;
7 our @EXPORT_OK = qw(selector_to_xpath);
8 *import = \&Exporter::import;
9
10 sub selector_to_xpath {
11     __PACKAGE__->new(shift)->to_xpath;
12 }
13
14 my $reg = {
15     # tag name/id/class
16     element => qr/^([#.]?)([a-z0-9\\*_-]*)((\|)([a-z0-9\\*_-]*))?/i,
17     # attribute presence
18     attr1   => qr/^\[([^\]]*)\]/,
19     # attribute value match
20     attr2   => qr/^\[\s*([^~\|=\s]+)\s*([~\|]?=)\s*"([^"]+)"\s*\]/i,
21     attrN   => qr/^:not\((.*?)\)/i,
22     pseudo  => qr/^:([()a-z_-]+)/i,
23     # adjacency/direct descendance
24     combinator => qr/^(\s*[>+\s])/i,
25     # rule separator
26     comma => qr/^\s*,/i,
27 };
28
29
30 sub new {
31     my($class, $exp) = @_;
32     bless { expression => $exp }, $class;
33 }
34
35 sub selector {
36     my $self = shift;
37     $self->{expression} = shift if @_;
38     $self->{expression};
39 }
40
41 sub to_xpath {
42     my $self = shift;
43     my $rule = $self->{expression} or return;
44
45     my $index = 1;
46     my @parts = ("//", "*");
47     my $last_rule = '';
48     my @next_parts;
49
50     # Loop through each "unit" of the rule
51     while (length $rule && $rule ne $last_rule) {
52         $last_rule = $rule;
53
54         $rule =~ s/^\s*|\s*$//g;
55         last unless length $rule;
56
57         # Match elements
58         if ($rule =~ s/$reg->{element}//) {
59
60             # to add *[1]/self:: for follow-sibling
61             if (@next_parts) {
62                 push @parts, @next_parts, (pop @parts);
63                 $index += @next_parts;
64                 @next_parts = ();
65             }
66
67             if ($1 eq '#') { # ID
68                 push @parts, "[\@id='$2']";
69             } elsif ($1 eq '.') { # class
70                 push @parts, "[contains(concat(' ', \@class, ' '), ' $2 ')]";
71             } else {
72                 $parts[$index] = $5 || $2;
73             }
74         }
75
76         # Match attribute selectors
77         if ($rule =~ s/$reg->{attr2}//) {
78             # negation (e.g. [input!="text"]) isn't implemented in CSS, but include it anyway:
79             if ($2 eq '!=') {
80                 push @parts, "[\@$1!='$3]";
81             } elsif ($2 eq '~=') { # substring attribute match
82                 push @parts, "[contains(concat(' ', \@$1, ' '), ' $3 ')]";
83             } elsif ($2 eq '|=') {
84                 push @parts, "[\@$1='$3' or starts-with(\@$1, '$3-')]";
85             } else { # exact match
86                 push @parts, "[\@$1='$3']";
87             }
88         } else {
89             if ($rule =~ s/$reg->{attr1}//) {
90                 push @parts, "[\@$1]";
91             }
92         }
93
94         # Match negation
95         if ($rule =~ s/$reg->{attrN}//) {
96             my $sub_rule = $1;
97             if ($sub_rule =~ s/$reg->{attr2}//) {
98                 if ($2 eq '=') {
99                     push @parts, "[\@$1!='$3']";
100                 } elsif ($2 eq '~=') {
101                     push @parts, ":not([contains(concat(' ', \@$1, ' '), ' $3 ')])";
102                 } elsif ($2 eq '|=') {
103                     push @parts, ":not([\@$1='$3' or starts-with(\@$1, '$3-')])";
104                 }
105             } elsif ($sub_rule =~ s/$reg->{attr1}//) {
106                 push @parts, ":not([\@$1])";
107             }
108         }
109
110         # Ignore pseudoclasses/pseudoelements
111         while ($rule =~ s/$reg->{pseudo}//) {
112             if ( $1 eq 'first-child') {
113                 $parts[$#parts] = '*[1]/self::' . $parts[$#parts];
114             } elsif ($1 =~ /^lang\(([\w\-]+)\)$/) {
115                 push @parts, "[\@xml:lang='$1' or starts-with(\@xml:lang, '$1-')]";
116             }
117         }
118
119         # Match combinators (> and +)
120         if ($rule =~ s/$reg->{combinator}//) {
121             my $match = $1;
122             if ($match =~ />/) {
123                 push @parts, "/";
124             } elsif ($match =~ /\+/) {
125                 push @parts, "/following-sibling::";
126                 @next_parts = ('*[1]/self::');
127             } else {
128                 push @parts, "//";
129             }
130
131             # new context
132             $index = @parts;
133             push @parts, "*";
134         }
135
136         # Match commas
137         if ($rule =~ s/$reg->{comma}//) {
138             push @parts, " | ", "//", "*"; # ending one rule and beginning another
139             $index = @parts - 1;
140         }
141     }
142
143     return join '', @parts;
144 }
145
146 1;
147 __END__
148
149 =head1 NAME
150
151 HTML::Selector::XPath - CSS Selector to XPath compiler
152
153 =head1 SYNOPSIS
154
155   use HTML::Selector::XPath;
156
157   my $selector = HTML::Selector::XPath->new("li#main");
158   $selector->to_xpath; # //li[@id='main']
159
160   # functional interface
161   use HTML::Selector::Xpath 'selector_to_xpath';
162   my $xpath = selector_to_xpath('div.foo');
163
164 =head1 DESCRIPTION
165
166 HTML::Selector::XPath is a utility function to compile CSS2 selector
167 to the equivalent XPath expression.
168
169 =head1 CAVEATS
170
171 This module supports I<:first-child> and I<:lang> pseudo class, and a
172 partial support for I<:not> CSS 3 pseudo class as well. When you use
173 I<:not>, this module will produce I<:not()> euiqvalent XPath
174 expression, which is only available in XPath 2.0 implementation.
175
176 So far as I have tested, I<:not()> is not available in Perl XPath
177 modules like L<XML::LibXML> and L<HTML::Builder::XPath>.
178
179 =head1 AUTHOR
180
181 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
182
183 Most of the code is based on Joe Hewitt's getElementsBySelector.js on
184 L<http://www.joehewitt.com/blog/2006-03-20.php> and Andrew Dupont's
185 patch to Prototype.js on L<http://dev.rubyonrails.org/ticket/5171>,
186 but slightly modified using CSS to XPath translation table per
187 L<http://plasmasturm.org/log/444/>
188
189 =head1 LICENSE
190
191 This library is free software; you can redistribute it and/or modify
192 it under the same terms as Perl itself.
193
194 =head1 SEE ALSO
195
196 L<http://www.w3.org/TR/REC-CSS2/selector.html>
197 L<http://use.perl.org/~miyagawa/journal/31090>
198
199 =cut
Note: See TracBrowser for help on using the browser.