root/HTML-Selector-XPath/trunk/lib/HTML/Selector/XPath.pm

Revision 1992 (checked in by miyagawa, 14 years ago)

updated license

Line 
1 package HTML::Selector::XPath;
2
3 use strict;
4 our $VERSION = '0.01';
5
6 my $reg = {
7     # tag name/id/class
8     element => qr/^([#.]?)([a-z0-9\\*_-]*)((\|)([a-z0-9\\*_-]*))?/i,
9     # attribute presence
10     attr1   => qr/^\[([^\]]*)\]/,
11     # attribute value match
12     attr2   => qr/^\[\s*([^~\|=\s]+)\s*([~\|]?=)\s*"([^"]+)"\s*\]/i,
13     attrN   => qr/^:not\((.*?)\)/i,
14     pseudo  => qr/^:([()a-z_-]+)/i,
15     # adjacency/direct descendance
16     combinator => qr/^(\s*[>+\s])/i,
17     # rule separator
18     comma => qr/^\s*,/i,
19 };
20
21
22 sub new {
23     my($class, $exp) = @_;
24     bless { expression => $exp }, $class;
25 }
26
27 sub selector {
28     my $self = shift;
29     $self->{expression} = shift if @_;
30     $self->{expression};
31 }
32
33 sub to_xpath {
34     my $self = shift;
35     my $rule = $self->{expression} or return;
36
37     my $index = 1;
38     my @parts = ("//", "*");
39     my $last_rule = '';
40     my @next_parts;
41
42     # Loop through each "unit" of the rule
43     while (length $rule && $rule ne $last_rule) {
44         $last_rule = $rule;
45
46         $rule =~ s/^\s*|\s*$//g;
47         last unless length $rule;
48
49         # Match elements
50         if ($rule =~ s/$reg->{element}//) {
51
52             # to add *[1]/self:: for follow-sibling
53             if (@next_parts) {
54                 push @parts, @next_parts, (pop @parts);
55                 $index += @next_parts;
56                 @next_parts = ();
57             }
58
59             if ($1 eq '#') { # ID
60                 push @parts, "[\@id='$2']";
61             } elsif ($1 eq '.') { # class
62                 push @parts, "[contains(concat(' ', \@class, ' '), ' $2 ')]";
63             } else {
64                 $parts[$index] = $5 || $2;
65             }
66         }
67
68         # Match attribute selectors
69         if ($rule =~ s/$reg->{attr2}//) {
70             # negation (e.g. [input!="text"]) isn't implemented in CSS, but include it anyway:
71             if ($2 eq '!=') {
72                 push @parts, "[\@$1!='$3]";
73             } elsif ($2 eq '~=') { # substring attribute match
74                 push @parts, "[contains(concat(' ', \@$1, ' '), ' $3 ')]";
75             } elsif ($2 eq '|=') {
76                 push @parts, "[\@$1='$3' or starts-with(\@$1, '$3-')]";
77             } else { # exact match
78                 push @parts, "[\@$1='$3']";
79             }
80         } else {
81             if ($rule =~ s/$reg->{attr1}//) {
82                 push @parts, "[\@$1]";
83             }
84         }
85
86         # Match negation
87         if ($rule =~ s/$reg->{attrN}//) {
88             my $sub_rule = $1;
89             if ($sub_rule =~ s/$reg->{attr2}//) {
90                 if ($2 eq '=') {
91                     push @parts, "[\@$1!='$3']";
92                 } elsif ($2 eq '~=') {
93                     push @parts, ":not([contains(concat(' ', \@$1, ' '), ' $3 ')])";
94                 } elsif ($2 eq '|=') {
95                     push @parts, ":not([\@$1='$3' or starts-with(\@$1, '$3-')])";
96                 }
97             } elsif ($sub_rule =~ s/$reg->{attr1}//) {
98                 push @parts, ":not([\@$1])";
99             }
100         }
101
102         # Ignore pseudoclasses/pseudoelements
103         while ($rule =~ s/$reg->{pseudo}//) {
104             if ( $1 eq 'first-child') {
105                 $parts[$#parts] = '*[1]/self::' . $parts[$#parts];
106             } elsif ($1 =~ /^lang\(([\w\-]+)\)$/) {
107                 push @parts, "[\@xml:lang='$1' or starts-with(\@xml:lang, '$1-')]";
108             }
109         }
110
111         # Match combinators (> and +)
112         if ($rule =~ s/$reg->{combinator}//) {
113             my $match = $1;
114             if ($match =~ />/) {
115                 push @parts, "/";
116             } elsif ($match =~ /\+/) {
117                 push @parts, "/following-sibling::";
118                 @next_parts = ('*[1]/self::');
119             } else {
120                 push @parts, "//";
121             }
122
123             # new context
124             $index = @parts;
125             push @parts, "*";
126         }
127
128         # Match commas
129         if ($rule =~ s/$reg->{comma}//) {
130             push @parts, " | ", "//", "*"; # ending one rule and beginning another
131             $index = @parts - 1;
132         }
133     }
134
135     return join '', @parts;
136 }
137
138 1;
139 __END__
140
141 =head1 NAME
142
143 HTML::Selector::XPath - CSS Selector to XPath compiler
144
145 =head1 SYNOPSIS
146
147   use HTML::Selector::XPath;
148
149   my $selector = HTML::Selector::XPath->new("li.main");
150   $selector->to_xpath; # //li[@id='main']
151
152 =head1 DESCRIPTION
153
154 HTML::Selector::XPath is a utility function to compile CSS2 selector
155 to the equivalent XPath expression.
156
157 =head1 CAVEATS
158
159 This module supports I<:first-child> and I<:lang> pseudo class, and a
160 partial support for I<:not> CSS 3 pseudo class as well. When you use
161 I<:not>, this module will produce I<:not()> euiqvalent XPath
162 expression, which is only available in XPath 2.0 implementation.
163
164 So far as I have tested, I<:not()> is not available in Perl XPath
165 modules like L<XML::LibXML> and L<HTML::Builder::XPath>.
166
167 =head1 AUTHOR
168
169 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
170
171 Most of the code is based on Joe Hewitt's getElementsBySelector.js on
172 L<http://www.joehewitt.com/blog/2006-03-20.php> and Andrew Dupont's
173 patch to Prototype.js on L<http://dev.rubyonrails.org/ticket/5171>,
174 but slightly modified using CSS to XPath translation table per
175 L<http://plasmasturm.org/log/444/>
176
177 =head1 LICENSE
178
179 This library is free software; you can redistribute it and/or modify
180 it under the same terms as Perl itself.
181
182 =head1 SEE ALSO
183
184 L<http://use.perl.org/~miyagawa/journal/31090>
185
186 =cut
Note: See TracBrowser for help on using the browser.