root/Web-Scraper/trunk/lib/Web/Scraper.pm

Revision 2358 (checked in by miyagawa, 13 years ago)

try to get encoding from META tags as well

Line 
1 package Web::Scraper;
2 use strict;
3 use warnings;
4 use Carp;
5 use Scalar::Util qw(blessed);
6 use List::Util qw(first);
7 use HTML::Entities;
8 use HTML::Tagset;
9 use HTML::TreeBuilder::XPath;
10 use HTML::Selector::XPath;
11
12 our $VERSION = '0.18';
13
14 sub import {
15     my $class = shift;
16     my $pkg   = caller;
17
18     no strict 'refs';
19     *{"$pkg\::scraper"} = \&scraper;
20     *{"$pkg\::process"}       = sub { goto &process };
21     *{"$pkg\::process_first"} = sub { goto &process_first };
22     *{"$pkg\::result"}        = sub { goto &result  };
23 }
24
25 our $UserAgent;
26
27 sub __ua {
28     require LWP::UserAgent;
29     $UserAgent ||= LWP::UserAgent->new(agent => __PACKAGE__ . "/" . $VERSION);
30     $UserAgent;
31 }
32
33 sub user_agent {
34     my $self = shift;
35     $self->{user_agent} = shift if @_;
36     $self->{user_agent} || __ua;
37 }
38
39 sub define {
40     my($class, $coderef) = @_;
41     bless { code => $coderef }, $class;
42 }
43
44 sub scraper(&) {
45     my($coderef) = @_;
46     bless { code => $coderef }, __PACKAGE__;
47 }
48
49 sub scrape {
50     my $self  = shift;
51     my($stuff, $current) = @_;
52
53     my($html, $tree);
54
55     if (blessed($stuff) && $stuff->isa('URI')) {
56         require Encode;
57         require HTTP::Response::Encoding;
58         my $ua  = $self->user_agent;
59         my $res = $ua->get($stuff);
60         if ($res->is_success) {
61             my @encoding = (
62                 $res->encoding,
63                 # could be multiple because HTTP response and META might be different
64                 ($res->header('Content-Type') =~ /charset=([\w\-]+)/g),
65                 "latin-1",
66             );
67             my $encoding = first { defined $_ && Encode::find_encoding($_) } @encoding;
68             $html = Encode::decode($encoding, $res->content);
69         } else {
70             croak "GET $stuff failed: ", $res->status_line;
71         }
72         $current = $stuff->as_string;
73     } elsif (blessed($stuff) && $stuff->isa('HTML::Element')) {
74         $tree = $stuff->clone;
75     } elsif (ref($stuff) && ref($stuff) eq 'SCALAR') {
76         $html = $$stuff;
77     } else {
78         $html = $stuff;
79     }
80
81     $tree ||= do {
82         my $t = HTML::TreeBuilder::XPath->new;
83         $t->parse($html);
84         $t;
85     };
86
87     my $stash = {};
88     no warnings 'redefine';
89     local *process       = create_process(0, $tree, $stash, $current);
90     local *process_first = create_process(1, $tree, $stash, $current);
91
92     local *result = sub {
93         my @keys = @_;
94
95         if (@keys == 1) {
96             return $stash->{$keys[0]};
97         } elsif (@keys) {
98             my %res;
99             @res{@keys} = @{$stash}{@keys};
100             return \%res;
101         } else {
102             return $stash;
103         }
104     };
105
106     my $ret = $self->{code}->($tree);
107     $tree->delete;
108
109     # check user specified return value
110     return $ret if $ret;
111
112     return $stash;
113 }
114
115 sub create_process {
116     my($first, $tree, $stash, $uri) = @_;
117
118     sub {
119         my($exp, @attr) = @_;
120
121         my $xpath = $exp =~ m!^/! ? $exp : HTML::Selector::XPath::selector_to_xpath($exp);
122         my @nodes = eval {
123             local $SIG{__WARN__} = sub { };
124             $tree->findnodes($xpath);
125         };
126
127         if ($@) {
128             die "'$xpath' doesn't look like a valid XPath expression: $@";
129         }
130
131         @nodes or return;
132         @nodes = ($nodes[0]) if $first;
133
134         while (my($key, $val) = splice(@attr, 0, 2)) {
135             if (!defined $val) {
136                 if (ref($key) && ref($key) eq 'CODE') {
137                     for my $node (@nodes) {
138                         local $_ = $node;
139                         $key->($node);
140                     }
141                 } else {
142                     die "Don't know what to do with $key => undef";
143                 }
144             } elsif ($key =~ s!\[\]$!!) {
145                 $stash->{$key} = [ map __get_value($_, $val, $uri), @nodes ];
146             } else {
147                 $stash->{$key} = __get_value($nodes[0], $val, $uri);
148             }
149         }
150
151         return;
152     };
153 }
154
155 sub __get_value {
156     my($node, $val, $uri) = @_;
157
158     if (ref($val) && ref($val) eq 'CODE') {
159         local $_ = $node;
160         return $val->($node);
161     } elsif (blessed($val) && $val->isa('Web::Scraper')) {
162         return $val->scrape($node, $uri);
163     } elsif ($val =~ s!^@!!) {
164         my $value =  $node->attr($val);
165         if ($uri && is_link_element($node, $val)) {
166             require URI;
167             $value = URI->new_abs($value, $uri);
168         }
169         return $value;
170     } elsif (lc($val) eq 'content' || lc($val) eq 'text') {
171         return $node->isTextNode ? $node->string_value : $node->as_text;
172     } elsif (lc($val) eq 'raw' || lc($val) eq 'html') {
173         if ($node->isTextNode) {
174             # xxx is this a bug? as_XML doesn't return encoded output
175             return HTML::Entities::encode($node->as_XML, q("'<>&));
176         }
177         my $html = $node->as_XML;
178         $html =~ s!^<.*?>!!;
179         $html =~ s!\s*</\w+>\n*$!!;
180         return $html;
181     } elsif (ref($val) eq 'HASH') {
182         my $values;
183         for my $key (keys %$val) {
184             $values->{$key} = __get_value($node, $val->{$key});
185         }
186         return $values;
187     } else {
188         Carp::croak "Unknown value type $val";
189     }
190 }
191
192 sub is_link_element {
193     my($node, $attr) = @_;
194     my $link_elements = $HTML::Tagset::linkElements{$node->tag} || [];
195     for my $elem (@$link_elements) {
196         return 1 if $attr eq $elem;
197     }
198     return;
199 }
200
201 sub __stub {
202     my $func = shift;
203     return sub {
204         croak "Can't call $func() outside scraper block";
205     };
206 }
207
208 *process       = __stub 'process';
209 *process_first = __stub 'process_first';
210 *result        = __stub 'result';
211
212 1;
213 __END__
214
215 =for stopwords API SCRAPI Scrapi
216
217 =head1 NAME
218
219 Web::Scraper - Web Scraping Toolkit inspired by Scrapi
220
221 =head1 SYNOPSIS
222
223   use URI;
224   use Web::Scraper;
225
226   my $ebay_auction = scraper {
227       process "h3.ens>a",
228           description => 'TEXT',
229           url => '@href';
230       process "td.ebcPr>span", price => "TEXT";
231       process "div.ebPicture >a>img", image => '@src';
232   };
233
234   my $ebay = scraper {
235       process "table.ebItemlist tr.single",
236           "auctions[]" => $ebay_auction;
237       result 'auctions';
238   };
239
240   my $res = $ebay->scrape( URI->new("http://search.ebay.com/apple-ipod-nano_W0QQssPageNameZWLRS") );
241
242 =head1 DESCRIPTION
243
244 Web::Scraper is a web scraper toolkit, inspired by Ruby's equivalent Scrapi.
245
246 B<THIS MODULE IS IN ITS BETA QUALITY. THE API IS STOLEN FROM SCRAPI BUT MAY CHANGE IN THE FUTURE>
247
248 =head1 AUTHOR
249
250 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
251
252 =head1 LICENSE
253
254 This library is free software; you can redistribute it and/or modify
255 it under the same terms as Perl itself.
256
257 =head1 SEE ALSO
258
259 L<http://blog.labnotes.org/category/scrapi/>
260
261 =cut
Note: See TracBrowser for help on using the browser.