Changeset 2164

Show
Ignore:
Timestamp:
02/15/07 08:14:24
Author:
miyagawa
Message:

Refactor the code not to use Encode::encode and ::decode to find dodgy utf8 bytes. This be 0.02

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • Encode-DoubleEncodedUTF8/trunk/Changes

    r2162 r2164  
    11Revision history for Perl extension Encode::DoubleEncodedUTF8 
     2 
     3  0.02  Wed Feb 14 14:46:04 PST 2007 
     4        - Now the code to find dodgy utf-8 bytes is Encode.pm-free and all regexp based. 
     5          Easy to port to other languages and I guess it's even faster. 
     6          (Thanks to Dan Kogai and Mark Fowler for the regexps) 
    27 
    38  0.01  Tue Feb 13 17:57:45 2007 
  • Encode-DoubleEncodedUTF8/trunk/lib/Encode/DoubleEncodedUTF8.pm

    r2162 r2164  
    44use base qw( Encode::Encoding ); 
    55use Encode 2.12 (); 
    6 our $VERSION = '0.01'; 
     6 
     7our $VERSION = '0.02'; 
    78 
    89__PACKAGE__->Define('utf-8-de'); 
    910 
    10 my $re_bit = join "|", map { Encode::encode("utf-8",chr($_)) } (127..255); 
     11my $latin1_as_utf8 = "[\xC2\xC3][\x80-\xBF]"; 
     12 
     13# (Taken from Test::utf8 module) 
     14# A Regexp string to match valid UTF8 bytes 
     15# this info comes from page 78 of "The Unicode Standard 4.0" 
     16# published by the Unicode Consortium 
     17my $valid_utf8_regexp = <<'.' ; 
     18        [\x{00}-\x{7f}] 
     19      | [\x{c2}-\x{df}][\x{80}-\x{bf}] 
     20      |         \x{e0} [\x{a0}-\x{bf}][\x{80}-\x{bf}] 
     21      | [\x{e1}-\x{ec}][\x{80}-\x{bf}][\x{80}-\x{bf}] 
     22      |         \x{ed} [\x{80}-\x{9f}][\x{80}-\x{bf}] 
     23      | [\x{ee}-\x{ef}][\x{80}-\x{bf}][\x{80}-\x{bf}] 
     24      |         \x{f0} [\x{90}-\x{bf}][\x{80}-\x{bf}] 
     25      | [\x{f1}-\x{f3}][\x{80}-\x{bf}][\x{80}-\x{bf}][\x{80}-\x{bf}] 
     26      |         \x{f4} [\x{80}-\x{8f}][\x{80}-\x{bf}][\x{80}-\x{bf}] 
     27
    1128 
    1229sub decode { 
    13     my ($obj, $buf, $chk) = @_; 
     30    my($obj, $buf, $chk) = @_; 
    1431 
    15     $buf =~ s{(($re_bit)+)}{ _check_utf8_bytes($1) }ego; 
     32    $buf =~ s{((?:$latin1_as_utf8){2,3})}{ _check_utf8_bytes($1) }ego; 
    1633    $_[1] = '' if $chk; # this is what in-place edit means 
    1734 
     
    2138sub _check_utf8_bytes { 
    2239    my $bytes = shift; 
     40    my $copy  = $bytes; 
    2341 
    24     my $possible_utf8 = Encode::encode("latin-1", Encode::decode("utf-8", $bytes)); 
     42    my $possible_utf8 = ''; 
     43    while ($copy =~ s/^(.)(.)//) { 
     44        $possible_utf8 .= chr( (ord($1) << 6 & 0xff) | ord($2) ) 
     45    } 
    2546 
    26     # see CAVEAT of perldoc Encode ... decode() doesn't keep the original bytes 
    27     my $copy = $possible_utf8; 
    28     eval { Encode::decode("utf-8-strict", $copy, Encode::FB_CROAK) }; 
    29  
    30     return $@ ? $bytes : $possible_utf8; 
     47    $possible_utf8 =~ /$valid_utf8_regexp/xo ? $possible_utf8 : $bytes; 
    3148} 
    3249