root/HTML-Selector-XPath/trunk/lib/HTML/Selector/XPath.pm

Revision 1999 (checked in by miyagawa, 14 years ago)

fixed handling of :not pseudo-class to work on XPath 1.0 implementation. This be 0.02

Line 
1 package HTML::Selector::XPath;
2
3 use strict;
4 our $VERSION = '0.02';
5
6 require Exporter;
7 our @EXPORT_OK = qw(selector_to_xpath);
8 *import = \&Exporter::import;
9
10 sub selector_to_xpath {
11     __PACKAGE__->new(shift)->to_xpath;
12 }
13
14 my $reg = {
15     # tag name/id/class
16     element => qr/^([#.]?)([a-z0-9\\*_-]*)((\|)([a-z0-9\\*_-]*))?/i,
17     # attribute presence
18     attr1   => qr/^\[([^\]]*)\]/,
19     # attribute value match
20     attr2   => qr/^\[\s*([^~\|=\s]+)\s*([~\|]?=)\s*"([^"]+)"\s*\]/i,
21     attrN   => qr/^:not\((.*?)\)/i,
22     pseudo  => qr/^:([()a-z_-]+)/i,
23     # adjacency/direct descendance
24     combinator => qr/^(\s*[>+\s])/i,
25     # rule separator
26     comma => qr/^\s*,/i,
27 };
28
29
30 sub new {
31     my($class, $exp) = @_;
32     bless { expression => $exp }, $class;
33 }
34
35 sub selector {
36     my $self = shift;
37     $self->{expression} = shift if @_;
38     $self->{expression};
39 }
40
41 sub to_xpath {
42     my $self = shift;
43     my $rule = $self->{expression} or return;
44
45     my $index = 1;
46     my @parts = ("//", "*");
47     my $last_rule = '';
48     my @next_parts;
49
50     # Loop through each "unit" of the rule
51     while (length $rule && $rule ne $last_rule) {
52         $last_rule = $rule;
53
54         $rule =~ s/^\s*|\s*$//g;
55         last unless length $rule;
56
57         # Match elements
58         if ($rule =~ s/$reg->{element}//) {
59
60             # to add *[1]/self:: for follow-sibling
61             if (@next_parts) {
62                 push @parts, @next_parts, (pop @parts);
63                 $index += @next_parts;
64                 @next_parts = ();
65             }
66
67             if ($1 eq '#') { # ID
68                 push @parts, "[\@id='$2']";
69             } elsif ($1 eq '.') { # class
70                 push @parts, "[contains(concat(' ', \@class, ' '), ' $2 ')]";
71             } else {
72                 $parts[$index] = $5 || $2;
73             }
74         }
75
76         # Match attribute selectors
77         if ($rule =~ s/$reg->{attr2}//) {
78             # negation (e.g. [input!="text"]) isn't implemented in CSS, but include it anyway:
79             if ($2 eq '!=') {
80                 push @parts, "[\@$1!='$3]";
81             } elsif ($2 eq '~=') { # substring attribute match
82                 push @parts, "[contains(concat(' ', \@$1, ' '), ' $3 ')]";
83             } elsif ($2 eq '|=') {
84                 push @parts, "[\@$1='$3' or starts-with(\@$1, '$3-')]";
85             } else { # exact match
86                 push @parts, "[\@$1='$3']";
87             }
88         } else {
89             if ($rule =~ s/$reg->{attr1}//) {
90                 push @parts, "[\@$1]";
91             }
92         }
93
94         # Match negation
95         if ($rule =~ s/$reg->{attrN}//) {
96             my $sub_rule = $1;
97             if ($sub_rule =~ s/$reg->{attr2}//) {
98                 if ($2 eq '=') {
99                     push @parts, "[\@$1!='$3']";
100                 } elsif ($2 eq '~=') {
101                     push @parts, "[not(contains(concat(' ', \@$1, ' '), ' $3 '))]";
102                 } elsif ($2 eq '|=') {
103                     push @parts, "[not(\@$1='$3' or starts-with(\@$1, '$3-'))]";
104                 }
105             } elsif ($sub_rule =~ s/$reg->{attr1}//) {
106                 push @parts, "[not(\@$1)]";
107             }
108         }
109
110         # Ignore pseudoclasses/pseudoelements
111         while ($rule =~ s/$reg->{pseudo}//) {
112             if ( $1 eq 'first-child') {
113                 $parts[$#parts] = '*[1]/self::' . $parts[$#parts];
114             } elsif ($1 =~ /^lang\(([\w\-]+)\)$/) {
115                 push @parts, "[\@xml:lang='$1' or starts-with(\@xml:lang, '$1-')]";
116             }
117         }
118
119         # Match combinators (> and +)
120         if ($rule =~ s/$reg->{combinator}//) {
121             my $match = $1;
122             if ($match =~ />/) {
123                 push @parts, "/";
124             } elsif ($match =~ /\+/) {
125                 push @parts, "/following-sibling::";
126                 @next_parts = ('*[1]/self::');
127             } else {
128                 push @parts, "//";
129             }
130
131             # new context
132             $index = @parts;
133             push @parts, "*";
134         }
135
136         # Match commas
137         if ($rule =~ s/$reg->{comma}//) {
138             push @parts, " | ", "//", "*"; # ending one rule and beginning another
139             $index = @parts - 1;
140         }
141     }
142
143     return join '', @parts;
144 }
145
146 1;
147 __END__
148
149 =head1 NAME
150
151 HTML::Selector::XPath - CSS Selector to XPath compiler
152
153 =head1 SYNOPSIS
154
155   use HTML::Selector::XPath;
156
157   my $selector = HTML::Selector::XPath->new("li#main");
158   $selector->to_xpath; # //li[@id='main']
159
160   # functional interface
161   use HTML::Selector::Xpath 'selector_to_xpath';
162   my $xpath = selector_to_xpath('div.foo');
163
164 =head1 DESCRIPTION
165
166 HTML::Selector::XPath is a utility function to compile CSS2 selector
167 to the equivalent XPath expression.
168
169 =head1 CAVEATS
170
171 =head2 CSS SELECTOR VALIDATION
172
173 This module doesn't validate whether the original CSS Selector
174 expression is valid. For example,
175
176   div.123foo
177
178 is an invalid CSS selector (class names should not begin with
179 numbers), but this module ignores that and tries to generate
180 an equivalent XPath expression anyway.
181
182 =head1 AUTHOR
183
184 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
185
186 Most of the code is based on Joe Hewitt's getElementsBySelector.js on
187 L<http://www.joehewitt.com/blog/2006-03-20.php> and Andrew Dupont's
188 patch to Prototype.js on L<http://dev.rubyonrails.org/ticket/5171>,
189 but slightly modified using Aristotle Pegaltzis' CSS to XPath
190 translation table per L<http://plasmasturm.org/log/444/>
191
192 =head1 LICENSE
193
194 This library is free software; you can redistribute it and/or modify
195 it under the same terms as Perl itself.
196
197 =head1 SEE ALSO
198
199 L<http://www.w3.org/TR/REC-CSS2/selector.html>
200 L<http://use.perl.org/~miyagawa/journal/31090>
201
202 =cut
Note: See TracBrowser for help on using the browser.