root/Web-Scraper/trunk/lib/Web/Scraper.pm

Revision 2394 (checked in by miyagawa, 13 years ago)

accept id() function as XPath not CSS selector

Line 
1 package Web::Scraper;
2 use strict;
3 use warnings;
4 use Carp;
5 use Scalar::Util qw(blessed);
6 use List::Util qw(first);
7 use HTML::Entities;
8 use HTML::Tagset;
9 use HTML::TreeBuilder::XPath;
10 use HTML::Selector::XPath;
11 use UNIVERSAL::require;
12
13 our $VERSION = '0.22';
14
15 sub import {
16     my $class = shift;
17     my $pkg   = caller;
18
19     no strict 'refs';
20     *{"$pkg\::scraper"} = \&scraper;
21     *{"$pkg\::process"}       = sub { goto &process };
22     *{"$pkg\::process_first"} = sub { goto &process_first };
23     *{"$pkg\::result"}        = sub { goto &result  };
24 }
25
26 our $UserAgent;
27
28 sub __ua {
29     require LWP::UserAgent;
30     $UserAgent ||= LWP::UserAgent->new(agent => __PACKAGE__ . "/" . $VERSION);
31     $UserAgent;
32 }
33
34 sub user_agent {
35     my $self = shift;
36     $self->{user_agent} = shift if @_;
37     $self->{user_agent} || __ua;
38 }
39
40 sub define {
41     my($class, $coderef) = @_;
42     bless { code => $coderef }, $class;
43 }
44
45 sub scraper(&) {
46     my($coderef) = @_;
47     bless { code => $coderef }, __PACKAGE__;
48 }
49
50 sub scrape {
51     my $self  = shift;
52     my($stuff, $current) = @_;
53
54     my($html, $tree);
55
56     if (blessed($stuff) && $stuff->isa('URI')) {
57         require Encode;
58         require HTTP::Response::Encoding;
59         my $ua  = $self->user_agent;
60         my $res = $ua->get($stuff);
61         if ($res->is_success) {
62             my @encoding = (
63                 $res->encoding,
64                 # could be multiple because HTTP response and META might be different
65                 ($res->header('Content-Type') =~ /charset=([\w\-]+)/g),
66                 "latin-1",
67             );
68             my $encoding = first { defined $_ && Encode::find_encoding($_) } @encoding;
69             $html = Encode::decode($encoding, $res->content);
70         } else {
71             croak "GET $stuff failed: ", $res->status_line;
72         }
73         $current = $stuff->as_string;
74     } elsif (blessed($stuff) && $stuff->isa('HTML::Element')) {
75         $tree = $stuff->clone;
76     } elsif (ref($stuff) && ref($stuff) eq 'SCALAR') {
77         $html = $$stuff;
78     } else {
79         $html = $stuff;
80     }
81
82     $tree ||= do {
83         my $t = HTML::TreeBuilder::XPath->new;
84         $t->parse($html);
85         $t;
86     };
87
88     my $stash = {};
89     no warnings 'redefine';
90     local *process       = create_process(0, $tree, $stash, $current);
91     local *process_first = create_process(1, $tree, $stash, $current);
92
93     my $retval;
94     local *result = sub {
95         $retval++;
96         my @keys = @_;
97
98         if (@keys == 1) {
99             return $stash->{$keys[0]};
100         } elsif (@keys) {
101             my %res;
102             @res{@keys} = @{$stash}{@keys};
103             return \%res;
104         } else {
105             return $stash;
106         }
107     };
108
109     my $ret = $self->{code}->($tree);
110     $tree->delete;
111
112     # check user specified return value
113     return $ret if $retval;
114
115     return $stash;
116 }
117
118 sub create_process {
119     my($first, $tree, $stash, $uri) = @_;
120
121     sub {
122         my($exp, @attr) = @_;
123
124         my $xpath = $exp =~ m!^(?:/|id\()! ? $exp : HTML::Selector::XPath::selector_to_xpath($exp);
125         my @nodes = eval {
126             local $SIG{__WARN__} = sub { };
127             $tree->findnodes($xpath);
128         };
129
130         if ($@) {
131             die "'$xpath' doesn't look like a valid XPath expression: $@";
132         }
133
134         @nodes or return;
135         @nodes = ($nodes[0]) if $first;
136
137         while (my($key, $val) = splice(@attr, 0, 2)) {
138             if (!defined $val) {
139                 if (ref($key) && ref($key) eq 'CODE') {
140                     for my $node (@nodes) {
141                         local $_ = $node;
142                         $key->($node);
143                     }
144                 } else {
145                     die "Don't know what to do with $key => undef";
146                 }
147             } elsif ($key =~ s!\[\]$!!) {
148                 $stash->{$key} = [ map __get_value($_, $val, $uri), @nodes ];
149             } else {
150                 $stash->{$key} = __get_value($nodes[0], $val, $uri);
151             }
152         }
153
154         return;
155     };
156 }
157
158 sub __get_value {
159     my($node, $val, $uri) = @_;
160
161     if (ref($val) && ref($val) eq 'CODE') {
162         local $_ = $node;
163         return $val->($node);
164     } elsif (blessed($val) && $val->isa('Web::Scraper')) {
165         return $val->scrape($node, $uri);
166     } elsif ($val =~ s!^@!!) {
167         my $value =  $node->attr($val);
168         if ($uri && is_link_element($node, $val)) {
169             require URI;
170             $value = URI->new_abs($value, $uri);
171         }
172         return $value;
173     } elsif (lc($val) eq 'content' || lc($val) eq 'text') {
174         return $node->isTextNode ? $node->string_value : $node->as_text;
175     } elsif (lc($val) eq 'raw' || lc($val) eq 'html') {
176         if ($node->isTextNode) {
177             if ($HTML::TreeBuilder::XPath::VERSION < 0.09) {
178                 return HTML::Entities::encode($node->as_XML, q("'<>&));
179             } else {
180                 return $node->as_XML;
181             }
182         }
183         my $html = $node->as_XML;
184         $html =~ s!^<.*?>!!;
185         $html =~ s!\s*</\w+>\n*$!!;
186         return $html;
187     } elsif (ref($val) eq 'HASH') {
188         my $values;
189         for my $key (keys %$val) {
190             $values->{$key} = __get_value($node, $val->{$key}, $uri);
191         }
192         return $values;
193     } elsif (ref($val) eq 'ARRAY') {
194         my $how   = $val->[0];
195         my $value = __get_value($node, $how, $uri);
196         for my $filter (@$val[1..$#$val]) {
197             $value = run_filter($value, $filter);
198         }
199         return $value;
200     } else {
201         Carp::croak "Unknown value type $val";
202     }
203 }
204
205 sub run_filter {
206     my($value, $filter) = @_;
207
208     ## sub { s/foo/bar/g } is a valid filter
209     ## sub { DateTime::Format::Foo->parse_string(shift) } is valid too
210     my $callback;
211     my $module;
212
213     if (ref($filter) eq 'CODE') {
214         $callback = $filter;
215         $module   = "$filter";
216     } elsif (!ref($filter)) {
217         $module = $filter =~ s/^\+// ? $filter : "Web::Scraper::Filter::$filter";
218         unless ($module->isa('Web::Scraper::Filter')) {
219             $module->require or Carp::croak("Loading $module: $@");
220         }
221
222         $callback = sub { $module->new->filter(shift) };
223     } else {
224         Carp::croak("Don't know filter type $filter");
225     }
226
227     local $_ = $value;
228     my $retval = eval { $callback->($value) };
229     if ($@) {
230         Carp::croak("Filter $module had an error: $@");
231     }
232
233     no warnings 'uninitialized';
234     if (($retval =~ /^\d+$/ and $_ ne $value) or (defined $retval and !$retval)) {
235         $value = $_;
236     } else {
237         $value = $retval;
238     }
239
240     return $value;
241 }
242
243 sub is_link_element {
244     my($node, $attr) = @_;
245     my $link_elements = $HTML::Tagset::linkElements{$node->tag} || [];
246     for my $elem (@$link_elements) {
247         return 1 if $attr eq $elem;
248     }
249     return;
250 }
251
252 sub __stub {
253     my $func = shift;
254     return sub {
255         croak "Can't call $func() outside scraper block";
256     };
257 }
258
259 *process       = __stub 'process';
260 *process_first = __stub 'process_first';
261 *result        = __stub 'result';
262
263 1;
264 __END__
265
266 =for stopwords API SCRAPI Scrapi
267
268 =head1 NAME
269
270 Web::Scraper - Web Scraping Toolkit inspired by Scrapi
271
272 =head1 SYNOPSIS
273
274   use URI;
275   use Web::Scraper;
276
277   my $ebay_auction = scraper {
278       process "h3.ens>a",
279           description => 'TEXT',
280           url => '@href';
281       process "td.ebcPr>span", price => "TEXT";
282       process "div.ebPicture >a>img", image => '@src';
283   };
284
285   my $ebay = scraper {
286       process "table.ebItemlist tr.single",
287           "auctions[]" => $ebay_auction;
288       result 'auctions';
289   };
290
291   my $res = $ebay->scrape( URI->new("http://search.ebay.com/apple-ipod-nano_W0QQssPageNameZWLRS") );
292
293 =head1 DESCRIPTION
294
295 Web::Scraper is a web scraper toolkit, inspired by Ruby's equivalent Scrapi.
296
297 B<THIS MODULE IS IN ITS BETA QUALITY. THE API IS STOLEN FROM SCRAPI BUT MAY CHANGE IN THE FUTURE>
298
299 =head1 AUTHOR
300
301 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
302
303 =head1 LICENSE
304
305 This library is free software; you can redistribute it and/or modify
306 it under the same terms as Perl itself.
307
308 =head1 SEE ALSO
309
310 L<http://blog.labnotes.org/category/scrapi/>
311
312 =cut
Note: See TracBrowser for help on using the browser.