root/Web-Scraper/trunk/bin/scraper

Revision 2344 (checked in by miyagawa, 13 years ago)

Checking in changes prior to tagging of version 0.15. Changelog diff is:

=== Changes
==================================================================
--- Changes (revision 6905)
+++ Changes (local)
@@ -1,5 +1,11 @@

Revision history for Perl extension Web::Scraper

+0.15
+ - Call env_proxy in scraper CLI
+ - Added $Web::Scraper::UserAgent and $scraper->user_agent accessor to deal
+ with UserAgent object
+ - Don't escape non-ASCII characters into &#xXXXX; in scraper shell 's' and WARN
+

0.14 Fri Sep 14 16:06:20 PDT 2007

- Fix bin/scraper to work with older Term::ReadLine.
(Thanks to Tina Müller [RT:29079])

  • Property svn:executable set to *
Line 
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4
5 use Config;
6 use Term::ReadLine;
7 use Data::Dumper;
8 use URI;
9 use Web::Scraper;
10 use YAML;
11
# WARN: shell helper meant to be typed as a bareword callback, e.g.
#     process "a", WARN;
# The empty prototype lets it be used as a term without parentheses.
#
# Returns a code reference that calls as_HTML on whatever is in $_ and
# sends the result to warn. The entity list q('"&<>) restricts escaping
# to those five characters, so non-ASCII text is printed as-is.
sub WARN() {
    my $dump_current_node = sub {
        warn $_->as_HTML(q('"&<>), "", {});
    };
    return $dump_current_node;
}
13
# State shared with the subs below:
#   @stack  - statements successfully evaluated in the shell (see run_loop)
#   $source - array ref describing where the input came from (set by
#             process_args, read by generate_code)
my @stack;
my $source;

# Resolve the thing to scrape; without usable input, show usage and exit.
my $stuff = process_args($ARGV[0]);
die "Usage: scraper [URI-or-filename]\n" unless $stuff;

# The scraper block simply hands the value it receives as $_[0] to the
# interactive loop, together with the readline terminal.
my $term    = Term::ReadLine->new("Web::Scraper");
my $scraper = scraper { run_loop($_[0], $term) };

# Changelog 0.15: "Call env_proxy in scraper CLI".
$scraper->user_agent->env_proxy;

my $result  = $scraper->scrape($stuff);
24
# process_args($arg)
#
# Decide what to scrape and record its origin in the file-level $source.
#
# Sources are tried in this order:
#   1. STDIN, when it is not a terminal and yields non-empty content
#      -> $source = ['stdin'],      returns a reference to the content
#   2. $arg starting with http:// or https://
#      -> $source = ['URI', $arg],  returns a URI object
#   3. $arg naming an existing file
#      -> $source = ['file', $arg], returns a reference to the content
#
# Returns nothing (empty list / undef) when none of these apply, which the
# caller treats as a usage error. Dies if the file cannot be opened.
#
# The file branch returns a scalar reference, like the stdin branch, so
# that an empty file (or one containing just "0") is not mistaken by the
# caller's truth test for "no input given".
sub process_args {
    my $uri = shift;

    # Piped or redirected input wins over the command-line argument.
    if (!-t STDIN and my $content = join "", <STDIN>) {
        $source = [ 'stdin' ];
        return \$content;
    }

    # Nothing else can match without an argument.
    return unless $uri;

    if ($uri =~ m!^https?://!) {
        $source = [ "URI", $uri ];
        return URI->new($uri);
    }

    if (-e $uri) {
        $source = [ 'file', $uri ];
        open my $fh, "<", $uri or die "$uri: $!";
        my $content = join "", <$fh>;
        close $fh;
        return \$content;
    }

    return;
}
42
# run_loop($tree, $term)
#
# Interactive shell. Reads lines from $term (prompt "scraper> ") until
# end of input or the 'q' command.
#
# Commands:
#   d      - dump the current result with Data::Dumper (to STDERR via warn)
#   y      - dump the current result as YAML (to STDERR via warn)
#   s      - show the HTML of $tree (to STDERR via warn)
#   q      - leave the loop
#   c      - print a standalone script containing the last recorded statement
#   c all  - print a standalone script containing every recorded statement
#
# Anything else is evaluated as Perl code; it is appended to the
# file-level @stack only when it evaluated without error.
#
# NOTE: $tree, $term and $in are visible to the evaluated code, so these
# lexicals must keep their names.
sub run_loop {
    my($tree, $term) = @_;

    while (defined(my $in = $term->readline("scraper> "))) {
        # An empty line is not a statement: evaluating and recording it
        # would only add a stray ";" to the generated code.
        next unless $in =~ /\S/;

        if ($in eq 'd') {
            $Data::Dumper::Indent = 1;
            warn Dumper result;
        } elsif ($in eq 'y') {
            warn Dump result;
        } elsif ($in eq 's') {
            warn $tree->as_HTML(q('"&<>), "  ", {});
        } elsif ($in eq 'q') {
            return;
        } elsif ($in eq 'c') {
            # With nothing recorded yet, emit the skeleton with an empty
            # body instead of passing undef as a statement.
            print generate_code($source, @stack ? $stack[-1] : ());
        } elsif ($in =~ /^c\s+all\s*$/) {
            print generate_code($source, @stack);
        } else {
            # The input is an interactive command typed by the user, so
            # string eval is intentional here. It is evaluated in scalar
            # context as before; the value itself is discarded.
            my $res = eval $in;

            # Capture $@ once: a warn handler could clobber it before a
            # second check.
            my $error = $@;
            if ($error) {
                warn $error;
            } else {
                push @stack, $in;
            }
        }
    }
}
66
# generate_code($source, @stack)
#
# Build the text of a standalone Perl script that repeats the scraping
# session.
#
#   $source - array ref: ['stdin'], ['URI', $uri] or ['file', $path];
#             any other kind produces the placeholder '...'
#   @stack  - scraper statements, one per entry; undefined or blank
#             entries are skipped, and a ";" is appended to entries that
#             do not already end with one
#
# Returns the script as a single string.
#
# The URI / file name is emitted as a single-quoted literal with "\" and
# "'" escaped, so characters such as "@", "$" or '"' in it cannot be
# interpolated by (or break) the generated script. The generated file
# reader uses three-argument open so the name is never parsed as a mode.
sub generate_code {
    my($source, @stack) = @_;

    # Render a string as a Perl single-quoted literal.
    my $quote = sub {
        my $text = shift;
        $text =~ s/([\\'])/\\$1/g;
        return "'$text'";
    };

    my $code_stack = join "\n",
        map  { "    $_" . (/;\s*\z/ ? "" : ";") }
        grep { defined($_) && /\S/ } @stack;

    my $kind = defined $source->[0] ? $source->[0] : '';
    my $stuff =
        $kind eq 'stdin' ? '\join "", <STDIN>' :
        $kind eq 'URI'   ? 'URI->new(' . $quote->($source->[1]) . ')' :
        $kind eq 'file'  ? '\do { my $file = ' . $quote->($source->[1])
                           . q(; open my $fh, '<', $file or die "$file: $!"; join '', <$fh> }) :
                           '...';

    return <<CODE;
#!$Config{perlpath}
use strict;
use Web::Scraper;
use URI;

my \$stuff   = $stuff;
my \$scraper = scraper {
$code_stack
};
my \$result = \$scraper->scrape(\$stuff);
CODE

}
Note: See TracBrowser for help on using the browser.