root/Web-Scraper/trunk/lib/Web/Scraper.pm

Revision 2331 (checked in by miyagawa, 13 years ago)

Checking in changes prior to tagging of version 0.13. Changelog diff is:

=== Changes
==================================================================
--- Changes (revision 6892)
+++ Changes (local)
@@ -1,10 +1,11 @@

Revision history for Perl extension Web
Scraper

-0.13
+0.13 Sun Sep 2 17:11:08 PDT 2007

- Added 'c' and 'c all' command to scraper to generate the

code to replay the session

- Added 'WARN' as a shortcut to sub { warn $_->as_HTML } on scraper shell like:

process "a", WARN; # print 'a' elements as HTML
+ - Added 'search-cpan.pl' and 'rel-tag.pl' to eg/


0.12 Thu Aug 30 02:39:44 PDT 2007

- Added 's' command to scraper to get the HTML source

Line 
1 package Web::Scraper;
2 use strict;
3 use warnings;
4 use Carp;
5 use Scalar::Util 'blessed';
6 use HTML::TreeBuilder::XPath;
7 use HTML::Selector::XPath;
8
9 our $VERSION = '0.13';
10
11 sub import {
12     my $class = shift;
13     my $pkg   = caller;
14
15     no strict 'refs';
16     *{"$pkg\::scraper"} = \&scraper;
17     *{"$pkg\::process"}       = sub { goto &process };
18     *{"$pkg\::process_first"} = sub { goto &process_first };
19     *{"$pkg\::result"}        = sub { goto &result  };
20 }
21
22 my $ua;
23
24 sub __ua {
25     require LWP::UserAgent;
26     $ua ||= LWP::UserAgent->new(agent => __PACKAGE__ . "/" . $VERSION);
27     $ua;
28 }
29
30 sub define {
31     my($class, $coderef) = @_;
32     bless { code => $coderef }, $class;
33 }
34
35 sub scraper(&) {
36     my($coderef) = @_;
37     bless { code => $coderef }, __PACKAGE__;
38 }
39
40 sub scrape {
41     my $self  = shift;
42     my($stuff) = @_;
43
44     my($html, $tree);
45
46     if (blessed($stuff) && $stuff->isa('URI')) {
47         require Encode;
48         require HTTP::Response::Encoding;
49         my $ua  = __ua;
50         my $res = $ua->get($stuff);
51         if ($res->is_success) {
52             my $encoding = $res->encoding || "latin-1";
53             $html = Encode::decode($encoding, $res->content);
54         } else {
55             croak "GET $stuff failed: ", $res->status_line;
56         }
57     } elsif (blessed($stuff) && $stuff->isa('HTML::Element')) {
58         $tree = $stuff->clone;
59     } elsif (ref($stuff) && ref($stuff) eq 'SCALAR') {
60         $html = $$stuff;
61     } else {
62         $html = $stuff;
63     }
64
65     $tree ||= do {
66         my $t = HTML::TreeBuilder::XPath->new;
67         $t->parse($html);
68         $t;
69     };
70
71     my $stash = {};
72     no warnings 'redefine';
73     local *process       = create_process(0, $tree, $stash);
74     local *process_first = create_process(1, $tree, $stash);
75
76     local *result = sub {
77         my @keys = @_;
78
79         if (@keys == 1) {
80             return $stash->{$keys[0]};
81         } elsif (@keys) {
82             my %res;
83             @res{@keys} = @{$stash}{@keys};
84             return \%res;
85         } else {
86             return $stash;
87         }
88     };
89
90     my $ret = $self->{code}->($tree);
91     $tree->delete;
92
93     # check user specified return value
94     return $ret if $ret;
95
96     return $stash;
97 }
98
99 sub create_process {
100     my($first, $tree, $stash) = @_;
101
102     sub {
103         my($exp, @attr) = @_;
104
105         my $xpath = $exp =~ m!^/! ? $exp : HTML::Selector::XPath::selector_to_xpath($exp);
106         my @nodes = eval {
107             local $SIG{__WARN__} = sub { };
108             $tree->findnodes($xpath);
109         };
110
111         if ($@) {
112             die "'$xpath' doesn't look like a valid XPath expression: $@";
113         }
114
115         @nodes or return;
116         @nodes = ($nodes[0]) if $first;
117
118         while (my($key, $val) = splice(@attr, 0, 2)) {
119             if (!defined $val) {
120                 if (ref($key) && ref($key) eq 'CODE') {
121                     for my $node (@nodes) {
122                         local $_ = $node;
123                         $key->($node);
124                     }
125                 } else {
126                     die "Don't know what to do with $key => undef";
127                 }
128             } elsif ($key =~ s!\[\]$!!) {
129                 $stash->{$key} = [ map __get_value($_, $val), @nodes ];
130             } else {
131                 $stash->{$key} = __get_value($nodes[0], $val);
132             }
133         }
134
135         return;
136     };
137 }
138
139 sub __get_value {
140     my($node, $val) = @_;
141
142     if (ref($val) && ref($val) eq 'CODE') {
143         local $_ = $node;
144         return $val->($node);
145     } elsif (blessed($val) && $val->isa('Web::Scraper')) {
146         return $val->scrape($node);
147     } elsif ($val =~ s!^@!!) {
148         return $node->attr($val);
149     } elsif (lc($val) eq 'content' || lc($val) eq 'text') {
150         return $node->as_text;
151     } elsif (ref($val) eq 'HASH') {
152         my $values;
153         for my $key (keys %$val) {
154             $values->{$key} = __get_value($node, $val->{$key});
155         }
156         return $values;
157     } else {
158         Carp::croak "Unknown value type $val";
159     }
160 }
161
162 sub __stub {
163     my $func = shift;
164     return sub {
165         croak "Can't call $func() outside scraper block";
166     };
167 }
168
169 *process       = __stub 'process';
170 *process_first = __stub 'process_first';
171 *result        = __stub 'result';
172
173 1;
174 __END__
175
176 =for stopwords API SCRAPI Scrapi
177
178 =head1 NAME
179
180 Web::Scraper - Web Scraping Toolkit inspired by Scrapi
181
182 =head1 SYNOPSIS
183
184   use URI;
185   use Web::Scraper;
186
187   my $ebay_auction = scraper {
188       process "h3.ens>a",
189           description => 'TEXT',
190           url => '@href';
191       process "td.ebcPr>span", price => "TEXT";
192       process "div.ebPicture >a>img", image => '@src';
193
194       result 'description', 'url', 'price', 'image';
195   };
196
197   my $ebay = scraper {
198       process "table.ebItemlist tr.single",
199           "auctions[]" => $ebay_auction;
200       result 'auctions';
201   };
202
203   $ebay->scrape( URI->new("http://search.ebay.com/apple-ipod-nano_W0QQssPageNameZWLRS") );
204
205 =head1 DESCRIPTION
206
207 Web::Scraper is a web scraper toolkit, inspired by Ruby's equivalent Scrapi.
208
209 B<THIS MODULE IS IN ITS BETA QUALITY. THE API IS STOLEN FROM SCRAPI BUT MAY CHANGE IN THE FUTURE>
210
211 =head1 AUTHOR
212
213 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
214
215 =head1 LICENSE
216
217 This library is free software; you can redistribute it and/or modify
218 it under the same terms as Perl itself.
219
220 =head1 SEE ALSO
221
222 L<http://blog.labnotes.org/category/scrapi/>
223
224 =cut
Note: See TracBrowser for help on using the browser.