Changeset 2336

Show
Ignore:
Timestamp:
09/15/07 08:04:10
Author:
miyagawa
Message:

added URI absolutification and RAW/HTML getter

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • Web-Scraper/trunk/Changes

    r2331 r2336  
    11Revision history for Perl extension Web::Scraper 
     2 
     30.14 
     4        - Fix bin/scraper to work with older Term::ReadLine. 
     5          (Thanks to Tina Müller [RT:29079]) 
     6        - Now link elements like img@src and a@href are automatically 
     7          converted to absolute URI using the current URI as a base. 
     8          Only effective when you do $s->scrape(URI) or $s->scrape(\$html, URI) 
     9        - Added 'HTML' and its alias 'RAW' to get the HTML chunk inside the tag 
     10            process "script", "code" => 'RAW'; 
     11          Handy if you want the raw HTML code inside <script> or <style>. 
     12          (Thanks to charsbar for the suggestion) 
    213 
    3140.13  Sun Sep  2 17:11:08 PDT 2007 
  • Web-Scraper/trunk/MANIFEST

    r2331 r2336  
    4141t/09-process_hash.t 
    4242t/10_invalid_xpath.t 
     43t/11_absolute.t 
     44t/12_html.t 
    4345t/perlcriticrc 
  • Web-Scraper/trunk/Makefile.PL

    r2327 r2336  
    99requires 'LWP::UserAgent'; 
    1010requires 'HTTP::Response::Encoding'; 
     11requires 'HTML::Tagset'; 
    1112requires 'URI'; 
    1213requires 'YAML'; 
  • Web-Scraper/trunk/lib/Web/Scraper.pm

    r2331 r2336  
    44use Carp; 
    55use Scalar::Util 'blessed'; 
     6use HTML::Tagset; 
    67use HTML::TreeBuilder::XPath; 
    78use HTML::Selector::XPath; 
     
    4041sub scrape { 
    4142    my $self  = shift; 
    42     my($stuff) = @_; 
     43    my($stuff, $current) = @_; 
    4344 
    4445    my($html, $tree); 
     
    5556            croak "GET $stuff failed: ", $res->status_line; 
    5657        } 
     58        $current = $stuff->as_string; 
    5759    } elsif (blessed($stuff) && $stuff->isa('HTML::Element')) { 
    5860        $tree = $stuff->clone; 
     
    7173    my $stash = {}; 
    7274    no warnings 'redefine'; 
    73     local *process       = create_process(0, $tree, $stash); 
    74     local *process_first = create_process(1, $tree, $stash); 
     75    local *process       = create_process(0, $tree, $stash, $current); 
     76    local *process_first = create_process(1, $tree, $stash, $current); 
    7577 
    7678    local *result = sub { 
     
    98100 
    99101sub create_process { 
    100     my($first, $tree, $stash) = @_; 
     102    my($first, $tree, $stash, $uri) = @_; 
    101103 
    102104    sub { 
     
    127129                } 
    128130            } elsif ($key =~ s!\[\]$!!) { 
    129                 $stash->{$key} = [ map __get_value($_, $val), @nodes ]; 
     131                $stash->{$key} = [ map __get_value($_, $val, $uri), @nodes ]; 
    130132            } else { 
    131                 $stash->{$key} = __get_value($nodes[0], $val); 
     133                $stash->{$key} = __get_value($nodes[0], $val, $uri); 
    132134            } 
    133135        } 
     
    138140 
    139141sub __get_value { 
    140     my($node, $val) = @_; 
     142    my($node, $val, $uri) = @_; 
    141143 
    142144    if (ref($val) && ref($val) eq 'CODE') { 
     
    146148        return $val->scrape($node); 
    147149    } elsif ($val =~ s!^@!!) { 
    148         return $node->attr($val); 
     150        my $value =  $node->attr($val); 
     151        if ($uri && is_link_element($node, $val)) { 
     152            require URI; 
     153            $value = URI->new_abs($value, $uri); 
     154        } 
     155        return $value; 
    149156    } elsif (lc($val) eq 'content' || lc($val) eq 'text') { 
    150157        return $node->as_text; 
     158    } elsif (lc($val) eq 'raw' || lc($val) eq 'html') { 
     159        my $html = $node->as_HTML(q("'<>&), undef, {}); 
     160        $html =~ s!^<.*?>!!; 
     161        $html =~ s!\s*</\w+>\n*$!!; 
     162        return $html; 
    151163    } elsif (ref($val) eq 'HASH') { 
    152164        my $values; 
     
    158170        Carp::croak "Unknown value type $val"; 
    159171    } 
     172} 
     173 
     174sub is_link_element { 
     175    my($node, $attr) = @_; 
     176    my $link_elements = $HTML::Tagset::linkElements{$node->tag} || []; 
     177    for my $elem (@$link_elements) { 
     178        return 1 if $attr eq $elem; 
     179    } 
     180    return; 
    160181} 
    161182