root/HTML-Selector-XPath/trunk/lib/HTML/Selector/XPath.pm

Revision 1994 (checked in by miyagawa, 14 years ago)

update CAVEATS re css selector validatoin

Line 
1 package HTML::Selector::XPath;
2
3 use strict;
4 our $VERSION = '0.01';
5
6 require Exporter;
7 our @EXPORT_OK = qw(selector_to_xpath);
8 *import = \&Exporter::import;
9
10 sub selector_to_xpath {
11     __PACKAGE__->new(shift)->to_xpath;
12 }
13
14 my $reg = {
15     # tag name/id/class
16     element => qr/^([#.]?)([a-z0-9\\*_-]*)((\|)([a-z0-9\\*_-]*))?/i,
17     # attribute presence
18     attr1   => qr/^\[([^\]]*)\]/,
19     # attribute value match
20     attr2   => qr/^\[\s*([^~\|=\s]+)\s*([~\|]?=)\s*"([^"]+)"\s*\]/i,
21     attrN   => qr/^:not\((.*?)\)/i,
22     pseudo  => qr/^:([()a-z_-]+)/i,
23     # adjacency/direct descendance
24     combinator => qr/^(\s*[>+\s])/i,
25     # rule separator
26     comma => qr/^\s*,/i,
27 };
28
29
30 sub new {
31     my($class, $exp) = @_;
32     bless { expression => $exp }, $class;
33 }
34
35 sub selector {
36     my $self = shift;
37     $self->{expression} = shift if @_;
38     $self->{expression};
39 }
40
41 sub to_xpath {
42     my $self = shift;
43     my $rule = $self->{expression} or return;
44
45     my $index = 1;
46     my @parts = ("//", "*");
47     my $last_rule = '';
48     my @next_parts;
49
50     # Loop through each "unit" of the rule
51     while (length $rule && $rule ne $last_rule) {
52         $last_rule = $rule;
53
54         $rule =~ s/^\s*|\s*$//g;
55         last unless length $rule;
56
57         # Match elements
58         if ($rule =~ s/$reg->{element}//) {
59
60             # to add *[1]/self:: for follow-sibling
61             if (@next_parts) {
62                 push @parts, @next_parts, (pop @parts);
63                 $index += @next_parts;
64                 @next_parts = ();
65             }
66
67             if ($1 eq '#') { # ID
68                 push @parts, "[\@id='$2']";
69             } elsif ($1 eq '.') { # class
70                 push @parts, "[contains(concat(' ', \@class, ' '), ' $2 ')]";
71             } else {
72                 $parts[$index] = $5 || $2;
73             }
74         }
75
76         # Match attribute selectors
77         if ($rule =~ s/$reg->{attr2}//) {
78             # negation (e.g. [input!="text"]) isn't implemented in CSS, but include it anyway:
79             if ($2 eq '!=') {
80                 push @parts, "[\@$1!='$3]";
81             } elsif ($2 eq '~=') { # substring attribute match
82                 push @parts, "[contains(concat(' ', \@$1, ' '), ' $3 ')]";
83             } elsif ($2 eq '|=') {
84                 push @parts, "[\@$1='$3' or starts-with(\@$1, '$3-')]";
85             } else { # exact match
86                 push @parts, "[\@$1='$3']";
87             }
88         } else {
89             if ($rule =~ s/$reg->{attr1}//) {
90                 push @parts, "[\@$1]";
91             }
92         }
93
94         # Match negation
95         if ($rule =~ s/$reg->{attrN}//) {
96             my $sub_rule = $1;
97             if ($sub_rule =~ s/$reg->{attr2}//) {
98                 if ($2 eq '=') {
99                     push @parts, "[\@$1!='$3']";
100                 } elsif ($2 eq '~=') {
101                     push @parts, ":not([contains(concat(' ', \@$1, ' '), ' $3 ')])";
102                 } elsif ($2 eq '|=') {
103                     push @parts, ":not([\@$1='$3' or starts-with(\@$1, '$3-')])";
104                 }
105             } elsif ($sub_rule =~ s/$reg->{attr1}//) {
106                 push @parts, ":not([\@$1])";
107             }
108         }
109
110         # Ignore pseudoclasses/pseudoelements
111         while ($rule =~ s/$reg->{pseudo}//) {
112             if ( $1 eq 'first-child') {
113                 $parts[$#parts] = '*[1]/self::' . $parts[$#parts];
114             } elsif ($1 =~ /^lang\(([\w\-]+)\)$/) {
115                 push @parts, "[\@xml:lang='$1' or starts-with(\@xml:lang, '$1-')]";
116             }
117         }
118
119         # Match combinators (> and +)
120         if ($rule =~ s/$reg->{combinator}//) {
121             my $match = $1;
122             if ($match =~ />/) {
123                 push @parts, "/";
124             } elsif ($match =~ /\+/) {
125                 push @parts, "/following-sibling::";
126                 @next_parts = ('*[1]/self::');
127             } else {
128                 push @parts, "//";
129             }
130
131             # new context
132             $index = @parts;
133             push @parts, "*";
134         }
135
136         # Match commas
137         if ($rule =~ s/$reg->{comma}//) {
138             push @parts, " | ", "//", "*"; # ending one rule and beginning another
139             $index = @parts - 1;
140         }
141     }
142
143     return join '', @parts;
144 }
145
146 1;
147 __END__
148
149 =head1 NAME
150
151 HTML::Selector::XPath - CSS Selector to XPath compiler
152
153 =head1 SYNOPSIS
154
155   use HTML::Selector::XPath;
156
157   my $selector = HTML::Selector::XPath->new("li#main");
158   $selector->to_xpath; # //li[@id='main']
159
160   # functional interface
161   use HTML::Selector::Xpath 'selector_to_xpath';
162   my $xpath = selector_to_xpath('div.foo');
163
164 =head1 DESCRIPTION
165
166 HTML::Selector::XPath is a utility function to compile CSS2 selector
167 to the equivalent XPath expression.
168
169 =head1 CAVEATS
170
171 =head2 NOT PSEUDO CLASS
172
173 This module supports I<:first-child> and I<:lang> pseudo class, and a
174 partial support for I<:not> CSS 3 pseudo class as well. When you use
175 I<:not>, this module will produce the equivalent XPath expression
176 I<:not()>, which is only available in XPath 2.0 implementation.
177
178 So far as I have tested, I<:not()> is not available in Perl XPath
179 modules like L<XML::LibXML> and L<HTML::Builder::XPath>.
180
181 =head2 CSS SELECTOR VALIDATION
182
183 This module doesn't validate if the original CSS Selector expression
184 is valid. For example,
185
186   div.123foo
187
188 is an invalid CSS selector (class names should not begin with
189 numbers), but this module ignores that and tries to generate
190 an equivalent XPath expression anyway.
191
192 =head1 AUTHOR
193
194 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
195
196 Most of the code is based on Joe Hewitt's getElementsBySelector.js on
197 L<http://www.joehewitt.com/blog/2006-03-20.php> and Andrew Dupont's
198 patch to Prototype.js on L<http://dev.rubyonrails.org/ticket/5171>,
199 but slightly modified using CSS to XPath translation table per
200 L<http://plasmasturm.org/log/444/>
201
202 =head1 LICENSE
203
204 This library is free software; you can redistribute it and/or modify
205 it under the same terms as Perl itself.
206
207 =head1 SEE ALSO
208
209 L<http://www.w3.org/TR/REC-CSS2/selector.html>
210 L<http://use.perl.org/~miyagawa/journal/31090>
211
212 =cut
Note: See TracBrowser for help on using the browser.