root/HTML-Selector-XPath/trunk/lib/HTML/Selector/XPath.pm

Revision 2000 (checked in by miyagawa, 14 years ago)

added some croak() code

Line 
1 package HTML::Selector::XPath;
2
3 use strict;
4 our $VERSION = '0.02';
5
6 require Exporter;
7 our @EXPORT_OK = qw(selector_to_xpath);
8 *import = \&Exporter::import;
9
10 use Carp;
11
12 sub selector_to_xpath {
13     __PACKAGE__->new(shift)->to_xpath;
14 }
15
16 my $reg = {
17     # tag name/id/class
18     element => qr/^([#.]?)([a-z0-9\\*_-]*)((\|)([a-z0-9\\*_-]*))?/i,
19     # attribute presence
20     attr1   => qr/^\[([^\]]*)\]/,
21     # attribute value match
22     attr2   => qr/^\[\s*([^~\|=\s]+)\s*([~\|]?=)\s*"([^"]+)"\s*\]/i,
23     attrN   => qr/^:not\((.*?)\)/i,
24     pseudo  => qr/^:([()a-z_-]+)/i,
25     # adjacency/direct descendance
26     combinator => qr/^(\s*[>+\s])/i,
27     # rule separator
28     comma => qr/^\s*,/i,
29 };
30
31
32 sub new {
33     my($class, $exp) = @_;
34     bless { expression => $exp }, $class;
35 }
36
37 sub selector {
38     my $self = shift;
39     $self->{expression} = shift if @_;
40     $self->{expression};
41 }
42
43 sub to_xpath {
44     my $self = shift;
45     my $rule = $self->{expression} or return;
46
47     my $index = 1;
48     my @parts = ("//", "*");
49     my $last_rule = '';
50     my @next_parts;
51
52     # Loop through each "unit" of the rule
53     while (length $rule && $rule ne $last_rule) {
54         $last_rule = $rule;
55
56         $rule =~ s/^\s*|\s*$//g;
57         last unless length $rule;
58
59         # Match elements
60         if ($rule =~ s/$reg->{element}//) {
61
62             # to add *[1]/self:: for follow-sibling
63             if (@next_parts) {
64                 push @parts, @next_parts, (pop @parts);
65                 $index += @next_parts;
66                 @next_parts = ();
67             }
68
69             if ($1 eq '#') { # ID
70                 push @parts, "[\@id='$2']";
71             } elsif ($1 eq '.') { # class
72                 push @parts, "[contains(concat(' ', \@class, ' '), ' $2 ')]";
73             } else {
74                 $parts[$index] = $5 || $2;
75             }
76         }
77
78         # Match attribute selectors
79         if ($rule =~ s/$reg->{attr2}//) {
80             # negation (e.g. [input!="text"]) isn't implemented in CSS, but include it anyway:
81             if ($2 eq '!=') {
82                 push @parts, "[\@$1!='$3]";
83             } elsif ($2 eq '~=') { # substring attribute match
84                 push @parts, "[contains(concat(' ', \@$1, ' '), ' $3 ')]";
85             } elsif ($2 eq '|=') {
86                 push @parts, "[\@$1='$3' or starts-with(\@$1, '$3-')]";
87             } else { # exact match
88                 push @parts, "[\@$1='$3']";
89             }
90         } elsif ($rule =~ s/$reg->{attr1}//) {
91             push @parts, "[\@$1]";
92         }
93
94         # Match negation
95         if ($rule =~ s/$reg->{attrN}//) {
96             my $sub_rule = $1;
97             if ($sub_rule =~ s/$reg->{attr2}//) {
98                 if ($2 eq '=') {
99                     push @parts, "[\@$1!='$3']";
100                 } elsif ($2 eq '~=') {
101                     push @parts, "[not(contains(concat(' ', \@$1, ' '), ' $3 '))]";
102                 } elsif ($2 eq '|=') {
103                     push @parts, "[not(\@$1='$3' or starts-with(\@$1, '$3-'))]";
104                 }
105             } elsif ($sub_rule =~ s/$reg->{attr1}//) {
106                 push @parts, "[not(\@$1)]";
107             } else {
108                 Carp::croak "Can't translate '$sub_rule' inside :not()";
109             }
110         }
111
112         # Ignore pseudoclasses/pseudoelements
113         while ($rule =~ s/$reg->{pseudo}//) {
114             if ( $1 eq 'first-child') {
115                 $parts[$#parts] = '*[1]/self::' . $parts[$#parts];
116             } elsif ($1 =~ /^lang\(([\w\-]+)\)$/) {
117                 push @parts, "[\@xml:lang='$1' or starts-with(\@xml:lang, '$1-')]";
118             } else {
119                 Carp::croak "Can't translate '$1' pseudo-class";
120             }
121         }
122
123         # Match combinators (> and +)
124         if ($rule =~ s/$reg->{combinator}//) {
125             my $match = $1;
126             if ($match =~ />/) {
127                 push @parts, "/";
128             } elsif ($match =~ /\+/) {
129                 push @parts, "/following-sibling::";
130                 @next_parts = ('*[1]/self::');
131             } else {
132                 push @parts, "//";
133             }
134
135             # new context
136             $index = @parts;
137             push @parts, "*";
138         }
139
140         # Match commas
141         if ($rule =~ s/$reg->{comma}//) {
142             push @parts, " | ", "//", "*"; # ending one rule and beginning another
143             $index = @parts - 1;
144         }
145     }
146
147     return join '', @parts;
148 }
149
150 1;
151 __END__
152
153 =head1 NAME
154
155 HTML::Selector::XPath - CSS Selector to XPath compiler
156
157 =head1 SYNOPSIS
158
159   use HTML::Selector::XPath;
160
161   my $selector = HTML::Selector::XPath->new("li#main");
162   $selector->to_xpath; # //li[@id='main']
163
164   # functional interface
165   use HTML::Selector::Xpath 'selector_to_xpath';
166   my $xpath = selector_to_xpath('div.foo');
167
168 =head1 DESCRIPTION
169
170 HTML::Selector::XPath is a utility function to compile CSS2 selector
171 to the equivalent XPath expression.
172
173 =head1 CAVEATS
174
175 =head2 CSS SELECTOR VALIDATION
176
177 This module doesn't validate whether the original CSS Selector
178 expression is valid. For example,
179
180   div.123foo
181
182 is an invalid CSS selector (class names should not begin with
183 numbers), but this module ignores that and tries to generate
184 an equivalent XPath expression anyway.
185
186 =head1 AUTHOR
187
188 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
189
190 Most of the code is based on Joe Hewitt's getElementsBySelector.js on
191 L<http://www.joehewitt.com/blog/2006-03-20.php> and Andrew Dupont's
192 patch to Prototype.js on L<http://dev.rubyonrails.org/ticket/5171>,
193 but slightly modified using Aristotle Pegaltzis' CSS to XPath
194 translation table per L<http://plasmasturm.org/log/444/>
195
196 =head1 LICENSE
197
198 This library is free software; you can redistribute it and/or modify
199 it under the same terms as Perl itself.
200
201 =head1 SEE ALSO
202
203 L<http://www.w3.org/TR/REC-CSS2/selector.html>
204 L<http://use.perl.org/~miyagawa/journal/31090>
205
206 =cut
Note: See TracBrowser for help on using the browser.