root/Encode-DoubleEncodedUTF8/trunk/blib/lib/Encode/DoubleEncodedUTF8.pm

Revision 2162 (checked in by miyagawa, 13 years ago)

import

Line 
1 package Encode::DoubleEncodedUTF8;
2
3 use strict;
4 use base qw( Encode::Encoding );
5 use Encode 2.12 ();
6 our $VERSION = '0.01';
7
8 __PACKAGE__->Define('utf-8-de');
9
10 my $utf8_regexp = <<'.' ;
11         [\x{00}-\x{7f}]
12       | [\x{c2}-\x{df}][\x{80}-\x{bf}]
13       |         \x{e0} [\x{a0}-\x{bf}][\x{80}-\x{bf}]
14       | [\x{e1}-\x{ec}][\x{80}-\x{bf}][\x{80}-\x{bf}]
15       |         \x{ed} [\x{80}-\x{9f}][\x{80}-\x{bf}]
16       | [\x{ee}-\x{ef}][\x{80}-\x{bf}][\x{80}-\x{bf}]
17       |         \x{f0} [\x{90}-\x{bf}][\x{80}-\x{bf}]
18       | [\x{f1}-\x{f3}][\x{80}-\x{bf}][\x{80}-\x{bf}][\x{80}-\x{bf}]
19       |         \x{f4} [\x{80}-\x{8f}][\x{80}-\x{bf}][\x{80}-\x{bf}]
20 .
21
22 my $re_bit = join "|", map { Encode::encode("utf-8",chr($_)) } (127..255);
23
24 sub decode {
25     my ($obj, $buf, $chk) = @_;
26
27     $buf =~ s{(($re_bit)+)}{ _check_utf8_bytes($1) }ego;
28     $_[1] = '' if $chk; # this is what in-place edit means
29
30     Encode::decode_utf8($buf);
31 }
32
33 sub _check_utf8_bytes {
34     my $bytes = shift;
35
36     my $possible_utf8 = Encode::encode("latin-1", Encode::decode("utf-8", $bytes));
37
38     # see CAVEAT of perldoc Encode ... decode() doesn't keep the original bytes
39     my $copy = $possible_utf8;
40     eval { Encode::decode("utf-8-strict", $copy, Encode::FB_CROAK) };
41
42     return $@ ? $bytes : $possible_utf8;
43 }
44
45 sub encode {
46     use Carp;
47     Carp::croak("utf-8-de doesn't support encode() ... Why do you want to do that?");
48 }
49
50 1;
51 __END__
52
53 =for stopwords utf-8 UTF-8
54
55 =head1 NAME
56
57 Encode::DoubleEncodedUTF8 - Fix double encoded UTF-8 bytes to the correct one
58
59 =head1 SYNOPSIS
60
61   use Encode;
62   use Encode::DoubleEncodedUTF8;
63
64   my $string = "\x{5bae}";
65   my $bytes  = encode_utf8("\x{5bae}");
66   my $dodgy_utf8 = $string . $bytes; # $bytes is now double encoded
67
68   my $fixed = decode("utf-8-de", $dodgy_utf8); # "\x{5bae}\x{5bae}"
69
70 =head1 DESCRIPTION
71
72 Encode::DoubleEncodedUTF8 adds a new encoding C<utf-8-de> and fixes
73 double encoded utf-8 bytes found in the original bytes to the correct
74 Unicode entity.
75
76 The double encoded utf-8 frequently happens when strings with UTF-8
77 flag and without are concatenated. See L<encoding::warnings> for
78 details.
79
80 =head1 AUTHOR
81
82 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
83
84 =head1 LICENSE
85
86 This library is free software; you can redistribute it and/or modify
87 it under the same terms as Perl itself.
88
89 =head1 SEE ALSO
90
91 L<encoding::warnings>, L<Test::utf8>
92
93 =cut
Note: See TracBrowser for help on using the browser.