root/Web-Scraper/trunk/lib/Web/Scraper.pm

Revision 2338 (checked in by miyagawa, 13 years ago)

Checking in changes prior to tagging of version 0.14. Changelog diff is:

=== Changes
==================================================================
--- Changes (revision 6899)
+++ Changes (local)
@@ -1,6 +1,6 @@

Revision history for Perl extension Web::Scraper

-0.14
+0.14 Fri Sep 14 16:06:20 PDT 2007
- Fix bin/scraper to work with older Term::ReadLine.
(Thanks to Tina Müller [RT:29079])
- Now link elements like img@src and a@href are automatically converted to absolute URIs.

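To illustrate that last entry: when a base URI is known (a URI object being
scraped, or one passed as the optional second argument to scrape()), link-type
attributes such as a@href and img@src come back as absolute URI objects. A
minimal sketch, using a made-up page and base URI:

    use Web::Scraper;

    my $s = scraper {
        process "a", 'links[]' => '@href';
    };

    my $res = $s->scrape(\'<a href="/foo">foo</a>', "http://example.com/");
    # $res->{links}[0] stringifies to "http://example.com/foo"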
package Web::Scraper;
use strict;
use warnings;
use Carp;
use Scalar::Util 'blessed';
use HTML::Tagset;
use HTML::TreeBuilder::XPath;
use HTML::Selector::XPath;

our $VERSION = '0.14';

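# import() exports the scraper() DSL into the calling package, along with
# process(), process_first() and result() wrappers that jump to whatever
# those globs hold at call time (scrape() swaps in the real closures below).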
sub import {
    my $class = shift;
    my $pkg   = caller;

    no strict 'refs';
    *{"$pkg\::scraper"} = \&scraper;
    *{"$pkg\::process"}       = sub { goto &process };
    *{"$pkg\::process_first"} = sub { goto &process_first };
    *{"$pkg\::result"}        = sub { goto &result  };
}

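# A single LWP::UserAgent instance, constructed lazily on the first remote fetch.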
my $ua;

sub __ua {
    require LWP::UserAgent;
    $ua ||= LWP::UserAgent->new(agent => __PACKAGE__ . "/" . $VERSION);
    $ua;
}

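# Class-method form of the constructor; scraper { ... } below is the exported
# block form.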
sub define {
    my($class, $coderef) = @_;
    bless { code => $coderef }, $class;
}

sub scraper(&) {
    my($coderef) = @_;
    bless { code => $coderef }, __PACKAGE__;
}

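# scrape() accepts a URI object (fetched with LWP and decoded), an
# HTML::Element (cloned), a reference to an HTML string, or a plain HTML
# string. An optional second argument supplies the base URI used to
# absolutize link attributes.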
sub scrape {
    my $self  = shift;
    my($stuff, $current) = @_;

    my($html, $tree);

    if (blessed($stuff) && $stuff->isa('URI')) {
        require Encode;
        require HTTP::Response::Encoding;
        my $ua  = __ua;
        my $res = $ua->get($stuff);
        if ($res->is_success) {
            my $encoding = $res->encoding || "latin-1";
            $html = Encode::decode($encoding, $res->content);
        } else {
            croak "GET $stuff failed: ", $res->status_line;
        }
        $current = $stuff->as_string;
    } elsif (blessed($stuff) && $stuff->isa('HTML::Element')) {
        $tree = $stuff->clone;
    } elsif (ref($stuff) && ref($stuff) eq 'SCALAR') {
        $html = $$stuff;
    } else {
        $html = $stuff;
    }

    $tree ||= do {
        my $t = HTML::TreeBuilder::XPath->new;
        $t->parse($html);
        $t->eof;
        $t;
    };

    my $stash = {};
    no warnings 'redefine';
    local *process       = create_process(0, $tree, $stash, $current);
    local *process_first = create_process(1, $tree, $stash, $current);

    local *result = sub {
        my @keys = @_;

        if (@keys == 1) {
            return $stash->{$keys[0]};
        } elsif (@keys) {
            my %res;
            @res{@keys} = @{$stash}{@keys};
            return \%res;
        } else {
            return $stash;
        }
    };

    my $ret = $self->{code}->($tree);
    $tree->delete;

    # if the scraper block returned a true value, use it as the result
    return $ret if $ret;

    return $stash;
}

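# create_process() builds the closure behind process()/process_first() for
# the current scrape: it converts a CSS selector (or an XPath expression
# starting with "/") to XPath, finds matching nodes, and stores extracted
# values into the shared stash.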
sub create_process {
    my($first, $tree, $stash, $uri) = @_;

    sub {
        my($exp, @attr) = @_;

        my $xpath = $exp =~ m!^/! ? $exp : HTML::Selector::XPath::selector_to_xpath($exp);
        my @nodes = eval {
            local $SIG{__WARN__} = sub { };
            $tree->findnodes($xpath);
        };

        if ($@) {
            die "'$xpath' doesn't look like a valid XPath expression: $@";
        }

        @nodes or return;
        @nodes = ($nodes[0]) if $first;

        while (my($key, $val) = splice(@attr, 0, 2)) {
            if (!defined $val) {
                if (ref($key) && ref($key) eq 'CODE') {
                    for my $node (@nodes) {
                        local $_ = $node;
                        $key->($node);
                    }
                } else {
                    die "Don't know what to do with $key => undef";
                }
            } elsif ($key =~ s!\[\]$!!) {
                $stash->{$key} = [ map __get_value($_, $val, $uri), @nodes ];
            } else {
                $stash->{$key} = __get_value($nodes[0], $val, $uri);
            }
        }

        return;
    };
}

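# __get_value() maps a value spec to data extracted from a node: a code ref
# is called with the node, a nested Web::Scraper is applied to it, '@attr'
# reads an attribute (made absolute against the base URI for link elements),
# 'TEXT'/'content' returns the node text, 'RAW'/'HTML' the inner HTML, and a
# hash ref recurses for each key.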
sub __get_value {
    my($node, $val, $uri) = @_;

    if (ref($val) && ref($val) eq 'CODE') {
        local $_ = $node;
        return $val->($node);
    } elsif (blessed($val) && $val->isa('Web::Scraper')) {
        return $val->scrape($node);
    } elsif ($val =~ s!^@!!) {
        my $value = $node->attr($val);
        if ($uri && is_link_element($node, $val)) {
            require URI;
            $value = URI->new_abs($value, $uri);
        }
        return $value;
    } elsif (lc($val) eq 'content' || lc($val) eq 'text') {
        return $node->as_text;
    } elsif (lc($val) eq 'raw' || lc($val) eq 'html') {
        my $html = $node->as_HTML(q("'<>&), undef, {});
        $html =~ s!^<.*?>!!;
        $html =~ s!\s*</\w+>\n*$!!;
        return $html;
    } elsif (ref($val) eq 'HASH') {
        my $values;
        for my $key (keys %$val) {
            $values->{$key} = __get_value($node, $val->{$key}, $uri);
        }
        return $values;
    } else {
        Carp::croak "Unknown value type $val";
    }
}

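# An attribute counts as a link (and gets absolutized) when HTML::Tagset
# lists it as a link attribute for the node's tag, e.g. a@href or img@src.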
sub is_link_element {
    my($node, $attr) = @_;
    my $link_elements = $HTML::Tagset::linkElements{$node->tag} || [];
    for my $elem (@$link_elements) {
        return 1 if $attr eq $elem;
    }
    return;
}

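# Outside a scraper block, process/process_first/result are stubs that croak;
# scrape() localizes the real implementations over these globs.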
sub __stub {
    my $func = shift;
    return sub {
        croak "Can't call $func() outside scraper block";
    };
}

*process       = __stub 'process';
*process_first = __stub 'process_first';
*result        = __stub 'result';

1;
__END__

=for stopwords API SCRAPI Scrapi

=head1 NAME

Web::Scraper - Web Scraping Toolkit inspired by Scrapi

=head1 SYNOPSIS

  use URI;
  use Web::Scraper;

  my $ebay_auction = scraper {
      process "h3.ens>a",
          description => 'TEXT',
          url => '@href';
      process "td.ebcPr>span", price => "TEXT";
      process "div.ebPicture >a>img", image => '@src';

      result 'description', 'url', 'price', 'image';
  };

  my $ebay = scraper {
      process "table.ebItemlist tr.single",
          "auctions[]" => $ebay_auction;
      result 'auctions';
  };

  $ebay->scrape( URI->new("http://search.ebay.com/apple-ipod-nano_W0QQssPageNameZWLRS") );

=head1 DESCRIPTION

Web::Scraper is a web scraping toolkit, inspired by its Ruby equivalent, Scrapi.

B<THIS MODULE IS BETA QUALITY. THE API IS STOLEN FROM SCRAPI BUT MAY CHANGE IN THE FUTURE.>

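Besides a URI object, C<scrape> also accepts an L<HTML::Element>, a reference
to an HTML string, or a plain HTML string, and takes an optional base URI as
its second argument. A minimal sketch (the markup, selector and base URI below
are made up for illustration):

  use Web::Scraper;

  my $html = '<ul><li><a href="/one">One</a></li><li><a href="/two">Two</a></li></ul>';

  my $links = scraper {
      process "li>a",
          'texts[]' => 'TEXT',
          'urls[]'  => '@href';
  };

  my $res = $links->scrape(\$html, "http://example.com/");
  # $res->{texts} is ['One', 'Two']
  # $res->{urls}  holds absolute URI objects, e.g. http://example.com/one
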
=head1 AUTHOR

Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>

=head1 LICENSE

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

L<http://blog.labnotes.org/category/scrapi/>

=cut