root/Encode-DoubleEncodedUTF8/trunk/lib/Encode/DoubleEncodedUTF8.pm

Revision 2164 (checked in by miyagawa, 13 years ago)

Refactor the code not to use Encode::encode and ::decode to find dodgy utf8 bytes. This be 0.02

Line 
1 package Encode::DoubleEncodedUTF8;
2
3 use strict;
4 use base qw( Encode::Encoding );
5 use Encode 2.12 ();
6
7 our $VERSION = '0.02';
8
9 __PACKAGE__->Define('utf-8-de');
10
11 my $latin1_as_utf8 = "[\xC2\xC3][\x80-\xBF]";
12
13 # (Taken from Test::utf8 module)
14 # A Regexp string to match valid UTF8 bytes
15 # this info comes from page 78 of "The Unicode Standard 4.0"
16 # published by the Unicode Consortium
17 my $valid_utf8_regexp = <<'.' ;
18         [\x{00}-\x{7f}]
19       | [\x{c2}-\x{df}][\x{80}-\x{bf}]
20       |         \x{e0} [\x{a0}-\x{bf}][\x{80}-\x{bf}]
21       | [\x{e1}-\x{ec}][\x{80}-\x{bf}][\x{80}-\x{bf}]
22       |         \x{ed} [\x{80}-\x{9f}][\x{80}-\x{bf}]
23       | [\x{ee}-\x{ef}][\x{80}-\x{bf}][\x{80}-\x{bf}]
24       |         \x{f0} [\x{90}-\x{bf}][\x{80}-\x{bf}]
25       | [\x{f1}-\x{f3}][\x{80}-\x{bf}][\x{80}-\x{bf}][\x{80}-\x{bf}]
26       |         \x{f4} [\x{80}-\x{8f}][\x{80}-\x{bf}][\x{80}-\x{bf}]
27 .
28
29 sub decode {
30     my($obj, $buf, $chk) = @_;
31
32     $buf =~ s{((?:$latin1_as_utf8){2,3})}{ _check_utf8_bytes($1) }ego;
33     $_[1] = '' if $chk; # this is what in-place edit means
34
35     Encode::decode_utf8($buf);
36 }
37
38 sub _check_utf8_bytes {
39     my $bytes = shift;
40     my $copy  = $bytes;
41
42     my $possible_utf8 = '';
43     while ($copy =~ s/^(.)(.)//) {
44         $possible_utf8 .= chr( (ord($1) << 6 & 0xff) | ord($2) )
45     }
46
47     $possible_utf8 =~ /$valid_utf8_regexp/xo ? $possible_utf8 : $bytes;
48 }
49
50 sub encode {
51     use Carp;
52     Carp::croak("utf-8-de doesn't support encode() ... Why do you want to do that?");
53 }
54
55 1;
56 __END__
57
58 =for stopwords utf-8 UTF-8
59
60 =head1 NAME
61
62 Encode::DoubleEncodedUTF8 - Fix double encoded UTF-8 bytes to the correct one
63
64 =head1 SYNOPSIS
65
66   use Encode;
67   use Encode::DoubleEncodedUTF8;
68
69   my $string = "\x{5bae}";
70   my $bytes  = encode_utf8("\x{5bae}");
71   my $dodgy_utf8 = $string . $bytes; # $bytes is now double encoded
72
73   my $fixed = decode("utf-8-de", $dodgy_utf8); # "\x{5bae}\x{5bae}"
74
75 =head1 DESCRIPTION
76
77 Encode::DoubleEncodedUTF8 adds a new encoding C<utf-8-de> and fixes
78 double encoded utf-8 bytes found in the original bytes to the correct
79 Unicode entity.
80
81 The double encoded utf-8 frequently happens when strings with UTF-8
82 flag and without are concatenated. See L<encoding::warnings> for
83 details.
84
85 =head1 AUTHOR
86
87 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
88
89 =head1 LICENSE
90
91 This library is free software; you can redistribute it and/or modify
92 it under the same terms as Perl itself.
93
94 =head1 SEE ALSO
95
96 L<encoding::warnings>, L<Test::utf8>
97
98 =cut
Note: See TracBrowser for help on using the browser.