Changeset 2011

Show
Ignore:
Timestamp:
10/08/06 05:45:11
Author:
miyagawa
Message:

More docs. Added decoded_content test. Use HTTP::Headers::Util

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • HTTP-Response-Charset/trunk/lib/HTTP/Response/Charset.pm

    r2010 r2011  
    44our $VERSION = '0.01'; 
    55 
     6use HTTP::Headers::Util (); 
     7 
    68sub HTTP::Response::charset { 
    79    my $res = shift; 
    8  
    910    return if $res->is_error; 
    1011 
     12    return $res->{_http_response_charset} 
     13        if exists $res->{_http_response_charset}; 
     14 
     15    my $charset = _http_response_charset($res); 
     16    if (defined $charset) { 
     17        return $res->{_http_response_charset} = $charset; 
     18    } 
     19 
     20    return; 
     21} 
     22 
     23sub _http_response_charset { 
     24    my $res = shift; 
     25 
    1126    # 1) Look in Content-Type: charset=... 
    12     my @ct  = $res->header('Content-Type'); 
    13     for my $ct (@ct) { 
    14         if ($ct =~ /;\s*charset=([\w\-]+)/) { 
    15             return $1; 
     27    my @ct = HTTP::Headers::Util::split_header_words($res->header('Content-Type')); 
     28    for my $ct (reverse @ct) { 
     29        my(undef, undef, %ct_param) = @$ct; 
     30        if ($ct_param{charset}) { 
     31            return $ct_param{charset}; 
    1632        } 
    1733    } 
    1834 
    1935    # 1.1) If there's no charset=... set and Content-Type doesn't look like text, return 
    20     unless ( mime_is_text($ct[0]) ) { 
     36    unless ( mime_is_text($ct[-1]->[0]) ) { 
    2137        return; 
    2238    } 
    2339 
    24     my $content = $res->content; 
     40    # decode the content with Content-Encoding etc. but not Unicode 
     41    my $content = $res->decoded_content(charset => 'none'); 
    2542    unless (defined $content) { 
    2643        return; 
     
    2946    # 2) If it looks like HTML, look for META head tags 
    3047    # if there's already META tag scanned, @ct == 2 
    31     if (@ct < 2 && mime_is_html($ct[0])) { 
     48    if (@ct < 2 && mime_is_html($ct[0]->[0])) { 
    3249        require HTML::HeadParser; 
    3350        my $parser = HTML::HeadParser->new; 
     
    3552        $parser->eof; 
    3653 
    37         my $ct = $parser->header('Content-Type'); 
    38         if ($ct && $ct =~ /;\s*charset=([\w\-]+)/) { 
    39             return $1; 
     54        my @ct = HTTP::Headers::Util::split_header_words($parser->header('Content-Type')); 
     55        my(undef, undef, %ct_param) = @{$ct[0]}; 
     56        if ($ct_param{charset}) { 
     57            return $ct_param{charset}; 
    4058        } 
    4159    } 
     
    7694sub mime_is_text { 
    7795    my $ct = shift; 
    78     $ct =~ s/;.*$//; 
    79     return $ct =~ m!^text/!i || $ct =~ m!^application/(.*?)xml$!i; 
     96    return $ct =~ m!^text/!i || $ct =~ m!^application/(.*?\+)?xml$!i; 
    8097} 
    8198 
    8299sub mime_is_html { 
    83100    my $ct = shift; 
    84     $ct =~ s/;.*$//; 
    85101    return $ct =~ m!^text/html$!i || $ct =~ m!^application/xhtml\+xml$!i; 
    86102} 
     
    91107=head1 NAME 
    92108 
    93 HTTP::Response::Charset - Adds charset method to HTTP::Response 
     109HTTP::Response::Charset - Adds and improves charset detectoin of HTTP::Response 
    94110 
    95111=head1 SYNOPSIS 
     
    99115 
    100116  my $response = $ua->get($url); 
    101   if (my $encoding = $response->charset) { 
    102       my $content  = decode $encoding, $response->content; 
     117  if (my $enc = $response->charset) { 
     118      warn "encoding is $enc"; 
     119 
     120      # This does what you want only in text/* 
     121      my $content = $response->decoded_content(charset => $enc); 
     122 
     123      # This would be more explicit 
     124      my $content = decode $enc, $response->content; 
    103125  } 
    104126 
     
    106128 
    107129HTTP::Response::Charset adds I<charset> method to HTTP::Response, 
    108 which tries to detect its charset using various ways. Here's a 
    109 fallback order this module tries to look for its charset. 
     130which tries to detect its charset using various ways. 
     131 
     132=head1 HOW THIS MODULE DETECTS RESPONSE ENCODING 
     133 
     134Here's a fallback order this module tries to look for. 
    110135 
    111136=over 4 
     
    130155  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" /> 
    131156 
    132 META tag values like this are usually scanned by L<HTML::HeadParser> 
    133 inside LWP::UserAgent automatically. 
    134  
    135 =item BOM detection 
    136  
    137 If there's a UTF BOM set in the response body, this module 
     157Actually, META tag values like this are already scanned by 
     158L<HTML::HeadParser> inside LWP::UserAgent automatically unless you 
     159call I<parse_head> to set it to 0. 
     160 
     161=item UTF BOM detection 
     162 
     163If there's an UTF BOM set in the response body, this module 
    138164auto-detects the encoding by recognizing the BOM. 
    139165 
    140166=item XML declaration 
    141167 
    142 If the response MIME type is either I<application/*+xml>, I<text/xml> 
    143 or I<text/html>, this module will scan response body looking for XML 
    144 declaration like: 
     168If the response looks like XML, this module will scan response body 
     169looking for XML declaration like: 
    145170 
    146171  <?xml version="1.0" encoding="euc-jp"?> 
     172 
     173to get the encoding. 
    147174 
    148175=item Encode::Detect 
     
    153180=back 
    154181 
     182=head1 METHODS 
     183 
     184=over 4 
     185 
     186=item charset 
     187 
     188  $charset = $response->charset; 
     189 
     190returns charset of HTTP response body. If the response doesn't look 
     191like reasonable text data, or when this module fails to detect the 
     192charset, returns undef. 
     193 
     194=back 
     195 
    155196=head1 AUTHOR 
    156197 
  • HTTP-Response-Charset/trunk/t/01_online.t

    r2010 r2011  
    44 
    55plan skip_all => "TEST_ONLINE isn't set" unless $ENV{TEST_ONLINE}; 
    6  
    76filters { url => 'chomp', charset => 'chomp' }; 
    8 plan tests => 1 * blocks; 
     7plan tests => 3 * blocks; 
    98 
    109my $ua = LWP::UserAgent->new; 
     
    1413    my $res   = $ua->get($block->url); 
    1514    is $res->charset, $block->charset, $block->name; 
     15 
     16 SKIP: { 
     17        skip "don't test decoded_content", 2 if $block->skip_decode; 
     18        my $body = $res->decoded_content(charset => $res->charset); 
     19        ok utf8::is_utf8($body); 
     20        unlike $body, qr/[\x{80}-\x{ff}]/, "no mis-decoded latin-1 range characters"; 
     21    } 
    1622} 
    1723 
     
    2329--- charset 
    2430utf-8 
     31--- skip_decode 
     32MSN site might contain Spanish latin characters, eh. 
    2533 
    2634=== Content-Type: 
     
    3038euc-jp 
    3139 
     40=== Content-Type: 
     41--- url 
     42http://www.google.co.jp/ 
     43--- charset 
     44Shift_JIS 
     45 
    3246=== gif should be undef 
    3347--- url 
     
    3549--- charset eval 
    3650undef 
     51--- skip_decode 
     52It's a GIF image. 
    3753 
    3854=== No charset in Content-Type, but in META 
     
    5874http://plagger.org/HTTP-Response-Charset/foo.xml 
    5975--- charset 
    60 euc-jp 
     76utf-8 
     77--- skip_decode 
     78XXX decode_content() doesn't decode "application/xml". 
    6179 
    6280=== Detectable utf-8 data