Changeset 2347

Show
Ignore:
Timestamp:
09/19/07 14:43:39
Author:
miyagawa
Message:

support TextNode. call Term::Encoding

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • Web-Scraper/trunk/Changes

    r2346 r2347  
    11Revision history for Perl extension Web::Scraper 
     2 
     30.16  Tue Sep 18 04:48:47 PDT 2007 
     4        - Support 'RAW' and 'TEXT' for TextNode object 
     5        - Call Term::Encoding from scraper shell if installed 
    26 
    370.15  Sat Sep 15 21:28:10 PDT 2007 
  • Web-Scraper/trunk/MANIFEST

    r2336 r2347  
    4343t/11_absolute.t 
    4444t/12_html.t 
     45t/13_textnode.t 
    4546t/perlcriticrc 
  • Web-Scraper/trunk/Makefile.PL

    r2336 r2347  
    99requires 'LWP::UserAgent'; 
    1010requires 'HTTP::Response::Encoding'; 
     11requires 'HTML::Entities'; 
    1112requires 'HTML::Tagset'; 
     13requires 'Term::Encoding'; 
    1214requires 'URI'; 
    1315requires 'YAML'; 
  • Web-Scraper/trunk/bin/scraper

    r2344 r2347  
    66use Term::ReadLine; 
    77use Data::Dumper; 
     8use HTML::Entities; 
    89use URI; 
    910use Web::Scraper; 
    1011use YAML; 
    1112 
    12 sub WARN() { return sub { warn $_->as_HTML(q('"&<>), "", {}) } } 
     13sub WARN() { 
     14    return sub { 
     15        warn $_->isTextNode 
     16            ? HTML::Entities::encode($_->as_XML, q("'<>&)) 
     17            : $_->as_HTML(q('"&<>), "", {}); 
     18    }; 
     19} 
     20 
     21if (eval { require Term::Encoding; 1 }) { 
     22    my $encoding = Term::Encoding::get_encoding(); 
     23    binmode STDOUT, "encoding($encoding)"; 
     24    binmode STDERR, "encoding($encoding)"; 
     25} 
    1326 
    1427my(@stack, $source); 
  • Web-Scraper/trunk/lib/Web/Scraper.pm

    r2344 r2347  
    44use Carp; 
    55use Scalar::Util 'blessed'; 
     6use HTML::Entities; 
    67use HTML::Tagset; 
    78use HTML::TreeBuilder::XPath; 
     
    161162        return $value; 
    162163    } elsif (lc($val) eq 'content' || lc($val) eq 'text') { 
    163         return $node->as_text; 
     164        return $node->isTextNode ? $node->string_value : $node->as_text; 
    164165    } elsif (lc($val) eq 'raw' || lc($val) eq 'html') { 
     166        if ($node->isTextNode) { 
     167            # xxx is this a bug? as_XML doesn't return encoded output 
     168            return HTML::Entities::encode($node->as_XML, q("'<>&)); 
     169        } 
    165170        my $html = $node->as_HTML(q("'<>&), undef, {}); 
    166171        $html =~ s!^<.*?>!!;