root/Web-Scraper/trunk/lib/Web/Scraper.pm

Revision 2242 (checked in by miyagawa, 13 years ago)

added less-DSLish constructor Web::Scraper->define(sub { ... });
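For illustration, a minimal sketch of how the new constructor relates to the
scraper {} DSL (the selector and field name here are placeholders, not part
of this changeset):

  use Web::Scraper;

  # DSL-style constructor
  my $s = scraper {
      process "title", title => 'TEXT';
  };

  # Equivalent, less-DSLish constructor added in this revision
  my $s2 = Web::Scraper->define(sub {
      process "title", title => 'TEXT';
  });

Both return a Web::Scraper object whose scrape() method runs the block.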

package Web::Scraper;
use strict;
use warnings;
use Carp;
use Scalar::Util 'blessed';
use HTML::TreeBuilder::XPath;
use HTML::Selector::XPath;

our $VERSION = '0.04';

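# import() exports scraper() into the calling package, along with stubs for
# process/process_first/result that dispatch to whatever this package's
# versions are at call time (scrape() localizes the real implementations).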
sub import {
    my $class = shift;
    my $pkg   = caller;

    no strict 'refs';
    *{"$pkg\::scraper"} = \&scraper;
    *{"$pkg\::process"}       = sub { goto &process };
    *{"$pkg\::process_first"} = sub { goto &process_first };
    *{"$pkg\::result"}        = sub { goto &result  };
}

my $ua;

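# Lazily construct a single shared LWP::UserAgent for URI-based scraping.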
sub __ua {
    require LWP::UserAgent;
    $ua ||= LWP::UserAgent->new(agent => __PACKAGE__ . "/" . $VERSION);
    $ua;
}

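# Less-DSLish constructor: Web::Scraper->define(sub { ... }) is equivalent
# to scraper { ... }.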
sub define {
    my($class, $coderef) = @_;
    bless { code => $coderef }, $class;
}

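# DSL constructor exported to the caller: scraper { ... } wraps the block in
# a Web::Scraper object.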
sub scraper(&) {
    my($coderef) = @_;
    bless { code => $coderef }, __PACKAGE__;
}

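# scrape() accepts a URI (fetched via LWP), an HTML::Element, a reference to
# an HTML string, or a plain HTML string, builds an HTML::TreeBuilder::XPath
# tree, and runs the scraper block against it. It returns the block's return
# value if that is true, otherwise the stash filled in by process/process_first.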
sub scrape {
    my $self  = shift;
    my($stuff) = @_;

    my($html, $tree);

    if (blessed($stuff) && $stuff->isa('URI')) {
        require HTTP::Response::Encoding;
        my $ua  = __ua;
        my $res = $ua->get($stuff);
        if ($res->is_success) {
            $html = $res->decoded_content;
        } else {
            croak "GET $stuff failed: ", $res->status_line;
        }
    } elsif (blessed($stuff) && $stuff->isa('HTML::Element')) {
        $tree = $stuff->clone;
    } elsif (ref($stuff) && ref($stuff) eq 'SCALAR') {
        $html = $$stuff;
    } else {
        $html = $stuff;
    }

    $tree ||= do {
        my $t = HTML::TreeBuilder::XPath->new;
        $t->parse($html);
        $t;
    };

    my $stash = {};
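    # Temporarily replace the package-level stubs so process/process_first/result
    # called inside the scraper block operate on this tree and stash.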
    no warnings 'redefine';
    local *process       = create_process(0, $tree, $stash);
    local *process_first = create_process(1, $tree, $stash);

    local *result = sub {
        my @keys = @_;

        if (@keys == 1) {
            return $stash->{$keys[0]};
        } else {
            my %res;
            @res{@keys} = @{$stash}{@keys};
            return \%res;
        }
    };

    my $ret = $self->{code}->($tree);

    # check user-specified return value
    return $ret if $ret;

    return $stash;
}

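# Build the closure backing process/process_first: translate the CSS selector
# to XPath, find matching nodes (only the first when $first is set), and store
# extracted values in the shared stash. A coderef key is simply called with
# each node; a key ending in "[]" collects a value from every matched node.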
sub create_process {
    my($first, $tree, $stash) = @_;

    sub {
        my($exp, @attr) = @_;

        my $xpath = HTML::Selector::XPath::selector_to_xpath($exp);
        my @nodes = $tree->findnodes($xpath) or return;
        @nodes = ($nodes[0]) if $first;

        while (my($key, $val) = splice(@attr, 0, 2)) {
            if (ref($key) && ref($key) eq 'CODE' && !defined $val) {
                for my $node (@nodes) {
                    $key->($node);
                }
            } elsif ($key =~ s!\[\]$!!) {
                $stash->{$key} = [ map __get_value($_, $val), @nodes ];
            } else {
                $stash->{$key} = __get_value($nodes[0], $val);
            }
        }

        return;
    };
}

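# Interpret a value spec for a single node: a coderef is called with the node,
# a Web::Scraper object scrapes the node recursively, '@attr' returns that
# attribute, and 'TEXT'/'content' return the node's text content.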
sub __get_value {
    my($node, $val) = @_;

    if (ref($val) && ref($val) eq 'CODE') {
        return $val->($node);
    } elsif (blessed($val) && $val->isa('Web::Scraper')) {
        return $val->scrape($node);
    } elsif ($val =~ s!^@!!) {
        return $node->attr($val);
    } elsif (lc($val) eq 'content' || lc($val) eq 'text') {
        return $node->as_text;
    } else {
        Carp::croak "Unknown value type $val";
    }
}

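# Placeholder implementations that croak when process/process_first/result are
# called outside a scraper block (scrape() localizes the real versions).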
sub __stub {
    my $func = shift;
    return sub {
        croak "Can't call $func() outside scraper block";
    };
}

*process       = __stub 'process';
*process_first = __stub 'process_first';
*result        = __stub 'result';

1;
__END__

=for stopwords API SCRAPI Scrapi

=head1 NAME

Web::Scraper - Web Scraping Toolkit inspired by Scrapi

=head1 SYNOPSIS

  use URI;
  use Web::Scraper;

  my $ebay_auction = scraper {
      process "h3.ens>a",
          description => 'TEXT',
          url => '@href';
      process "td.ebcPr>span", price => "TEXT";
      process "div.ebPicture >a>img", image => '@src';

      result 'description', 'url', 'price', 'image';
  };

  my $ebay = scraper {
      process "table.ebItemlist tr.single",
          "auctions[]" => $ebay_auction;
      result 'auctions';
  };

  $ebay->scrape( URI->new("http://search.ebay.com/apple-ipod-nano_W0QQssPageNameZWLRS") );

=head1 DESCRIPTION

Web::Scraper is a web scraping toolkit, inspired by Scrapi, its Ruby equivalent.

B<THIS MODULE IS BETA QUALITY. THE API IS STOLEN FROM SCRAPI BUT MAY CHANGE IN THE FUTURE>

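As a rough sketch of the API implemented above (the selectors and field names
below are made up for illustration): C<process> maps a CSS selector to
key/value extraction rules. A value of C<'TEXT'> takes the matched node's
text, C<'@attr'> takes an attribute, a code reference is called with the
node, and another scraper object scrapes the node recursively; a key ending
in C<[]> collects a value from every matched node, otherwise only the first
match is used.

  my $page = scraper {
      process 'title', title     => 'TEXT';      # text of the first match
      process 'a',     'links[]' => '@href';     # attribute from every match
      process 'h1',    heading   => sub { $_[0]->as_text };
  };

  # $html is assumed to hold an HTML string; a URI object or an
  # HTML::Element work as well.
  my $result = $page->scrape(\$html);
  # $result is a hash reference such as
  # { title => ..., links => [...], heading => ... }
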
=head1 AUTHOR

Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>

=head1 LICENSE

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

L<http://blog.labnotes.org/category/scrapi/>

=cut