root/Web-Scraper/trunk/lib/Web/Scraper.pm

Revision 2240 (checked in by miyagawa, 13 years ago)

Checking in changes prior to tagging of version 0.04. Changelog diff is:

=== Changes
==================================================================
--- Changes (revision 6666)
+++ Changes (local)
@@ -1,5 +1,12 @@

Revision history for Perl extension Web
Scraper

+0.04 Wed May 9 00:55:32 PDT 2007
+ - *API CHANGE* Now scraper {} returns Web::Scraper object and not closure.
+ You should call ->scrape() to get the response back.
+
+ I loved the code returning closure, but this is more compatible to
+ scrapi.rb API and hopefully less confusing to people.
+

0.03 Tue May 8 23:04:13 PDT 2007

- use 'TEXT' rather than 'content' to grab text from element

to be more compatible with scrapi

Line 
1 package Web::Scraper;
2 use strict;
3 use warnings;
4 use Carp;
5 use Scalar::Util 'blessed';
6 use HTML::TreeBuilder::XPath;
7 use HTML::Selector::XPath;
8
9 our $VERSION = '0.04';
10
11 sub import {
12     my $class = shift;
13     my $pkg   = caller;
14
15     no strict 'refs';
16     *{"$pkg\::scraper"} = \&scraper;
17     *{"$pkg\::process"}       = sub { goto &process };
18     *{"$pkg\::process_first"} = sub { goto &process_first };
19     *{"$pkg\::result"}        = sub { goto &result  };
20 }
21
22 my $ua;
23
24 sub __ua {
25     require LWP::UserAgent;
26     $ua ||= LWP::UserAgent->new(agent => __PACKAGE__ . "/" . $VERSION);
27     $ua;
28 }
29
30 sub scraper(&) {
31     my($coderef) = @_;
32     bless { code => $coderef }, __PACKAGE__;
33 }
34
35 sub scrape {
36     my $self  = shift;
37     my($stuff) = @_;
38
39     my($html, $tree);
40
41     if (blessed($stuff) && $stuff->isa('URI')) {
42         require HTTP::Response::Encoding;
43         my $ua  = __ua;
44         my $res = $ua->get($stuff);
45         if ($res->is_success) {
46             $html = $res->decoded_content;
47         } else {
48             croak "GET $stuff failed: ", $res->status_line;
49         }
50     } elsif (blessed($stuff) && $stuff->isa('HTML::Element')) {
51         $tree = $stuff->clone;
52     } elsif (ref($stuff) && ref($stuff) eq 'SCALAR') {
53         $html = $$stuff;
54     } else {
55         $html = $stuff;
56     }
57
58     $tree ||= do {
59         my $t = HTML::TreeBuilder::XPath->new;
60         $t->parse($html);
61         $t;
62     };
63
64     my $stash = {};
65     no warnings 'redefine';
66     local *process       = create_process(0, $tree, $stash);
67     local *process_first = create_process(1, $tree, $stash);
68
69     local *result = sub {
70         my @keys = @_;
71
72         if (@keys == 1) {
73             return $stash->{$keys[0]};
74         } else {
75             my %res;
76             @res{@keys} = @{$stash}{@keys};
77             return \%res;
78         }
79     };
80
81     my $ret = $self->{code}->($tree);
82
83     # check user specified return value
84     return $ret if $ret;
85
86     return $stash;
87 }
88
89 sub create_process {
90     my($first, $tree, $stash) = @_;
91
92     sub {
93         my($exp, @attr) = @_;
94
95         my $xpath = HTML::Selector::XPath::selector_to_xpath($exp);
96         my @nodes = $tree->findnodes($xpath) or return;
97         @nodes = ($nodes[0]) if $first;
98
99         while (my($key, $val) = splice(@attr, 0, 2)) {
100             if (ref($key) && ref($key) eq 'CODE' && !defined $val) {
101                 for my $node (@nodes) {
102                     $key->($node);
103                 }
104             } elsif ($key =~ s!\[\]$!!) {
105                 $stash->{$key} = [ map __get_value($_, $val), @nodes ];
106             } else {
107                 $stash->{$key} = __get_value($nodes[0], $val);
108             }
109         }
110
111         return;
112     };
113 }
114
115 sub __get_value {
116     my($node, $val) = @_;
117
118     if (ref($val) && ref($val) eq 'CODE') {
119         return $val->($node);
120     } elsif (blessed($val) && $val->isa('Web::Scraper')) {
121         return $val->scrape($node);
122     } elsif ($val =~ s!^@!!) {
123         return $node->attr($val);
124     } elsif (lc($val) eq 'content' || lc($val) eq 'text') {
125         return $node->as_text;
126     } else {
127         Carp::croak "Unknown value type $val";
128     }
129 }
130
131 sub __stub {
132     my $func = shift;
133     return sub {
134         croak "Can't call $func() outside scraper block";
135     };
136 }
137
138 *process       = __stub 'process';
139 *process_first = __stub 'process_first';
140 *result        = __stub 'result';
141
142 1;
143 __END__
144
145 =for stopwords API SCRAPI Scrapi
146
147 =head1 NAME
148
149 Web::Scraper - Web Scraping Toolkit inspired by Scrapi
150
151 =head1 SYNOPSIS
152
153   use URI;
154   use Web::Scraper;
155
156   my $ebay_auction = scraper {
157       process "h3.ens>a",
158           description => 'TEXT',
159           url => '@href';
160       process "td.ebcPr>span", price => "TEXT";
161       process "div.ebPicture >a>img", image => '@src';
162
163       result 'description', 'url', 'price', 'image';
164   };
165
166   my $ebay = scraper {
167       process "table.ebItemlist tr.single",
168           "auctions[]" => $ebay_auction;
169       result 'auctions';
170   };
171
172   $ebay->scrape( URI->new("http://search.ebay.com/apple-ipod-nano_W0QQssPageNameZWLRS") );
173
174 =head1 DESCRIPTION
175
176 Web::Scraper is a web scraper toolkit, inspired by Ruby's equivalent Scrapi.
177
178 B<THIS MODULE IS IN ITS BETA QUALITY. THE API IS STOLEN FROM SCRAPI BUT MAY CHANGE IN THE FUTURE>
179
180 =head1 AUTHOR
181
182 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
183
184 =head1 LICENSE
185
186 This library is free software; you can redistribute it and/or modify
187 it under the same terms as Perl itself.
188
189 =head1 SEE ALSO
190
191 L<http://blog.labnotes.org/category/scrapi/>
192
193 =cut
Note: See TracBrowser for help on using the browser.