root/Encode-DoubleEncodedUTF8/trunk/lib/Encode/DoubleEncodedUTF8.pm

Revision 2162 (checked in by miyagawa, 13 years ago)

import

Line 
1 package Encode::DoubleEncodedUTF8;
2
3 use strict;
4 use base qw( Encode::Encoding );
5 use Encode 2.12 ();
6 our $VERSION = '0.01';
7
8 __PACKAGE__->Define('utf-8-de');
9
10 my $re_bit = join "|", map { Encode::encode("utf-8",chr($_)) } (127..255);
11
12 sub decode {
13     my ($obj, $buf, $chk) = @_;
14
15     $buf =~ s{(($re_bit)+)}{ _check_utf8_bytes($1) }ego;
16     $_[1] = '' if $chk; # this is what in-place edit means
17
18     Encode::decode_utf8($buf);
19 }
20
21 sub _check_utf8_bytes {
22     my $bytes = shift;
23
24     my $possible_utf8 = Encode::encode("latin-1", Encode::decode("utf-8", $bytes));
25
26     # see CAVEAT of perldoc Encode ... decode() doesn't keep the original bytes
27     my $copy = $possible_utf8;
28     eval { Encode::decode("utf-8-strict", $copy, Encode::FB_CROAK) };
29
30     return $@ ? $bytes : $possible_utf8;
31 }
32
33 sub encode {
34     use Carp;
35     Carp::croak("utf-8-de doesn't support encode() ... Why do you want to do that?");
36 }
37
38 1;
39 __END__
40
41 =for stopwords utf-8 UTF-8
42
43 =head1 NAME
44
45 Encode::DoubleEncodedUTF8 - Fix double encoded UTF-8 bytes to the correct one
46
47 =head1 SYNOPSIS
48
49   use Encode;
50   use Encode::DoubleEncodedUTF8;
51
52   my $string = "\x{5bae}";
53   my $bytes  = encode_utf8("\x{5bae}");
54   my $dodgy_utf8 = $string . $bytes; # $bytes is now double encoded
55
56   my $fixed = decode("utf-8-de", $dodgy_utf8); # "\x{5bae}\x{5bae}"
57
58 =head1 DESCRIPTION
59
60 Encode::DoubleEncodedUTF8 adds a new encoding C<utf-8-de> and fixes
61 double encoded utf-8 bytes found in the original bytes to the correct
62 Unicode entity.
63
64 The double encoded utf-8 frequently happens when strings with UTF-8
65 flag and without are concatenated. See L<encoding::warnings> for
66 details.
67
68 =head1 AUTHOR
69
70 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
71
72 =head1 LICENSE
73
74 This library is free software; you can redistribute it and/or modify
75 it under the same terms as Perl itself.
76
77 =head1 SEE ALSO
78
79 L<encoding::warnings>, L<Test::utf8>
80
81 =cut
Note: See TracBrowser for help on using the browser.