Changeset 697

Show
Ignore:
Timestamp:
08/05/02 23:10:04
Author:
miyagawa
Message:

extended emoji support

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • HTML-Entities-ImodePictogram/trunk/Changes

    r225 r697  
    11Revision history for Perl extension HTML::Entities::ImodePictogram. 
     2 
     30.05 
     4        * Extended emoji support 
     5          (Thanks to Eijiro Koike <hanabusa@edge.co.jp>) 
    26 
    370.04  Thu Aug 30 15:09:47 JST 2001 
     
    6100.03  Thu Aug 30 14:14:15 JST 2001 
    711        - binary <=> number now handled via (un)?pack 
    8           Thanks to SADAHIRO Tomoyuki <BQW10602@nifty.com> 
     12          (Thanks to SADAHIRO Tomoyuki <BQW10602@nifty.com>) 
    913 
    10140.02  Thu Aug 30 02:57:02 JST 2001 
  • HTML-Entities-ImodePictogram/trunk/MANIFEST

    r221 r697  
    66lib/HTML/Entities/ImodePictogram.pm 
    77t/00_pictogram.t 
     8t/01_ext.t 
  • HTML-Entities-ImodePictogram/trunk/lib/HTML/Entities/ImodePictogram.pm

    r225 r697  
    33use strict; 
    44use vars qw($VERSION); 
    5 $VERSION = '0.04'
     5$VERSION = 0.05
    66 
    77use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 
     
    1515my $two_bytes = '[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]'; 
    1616 
    17 use vars qw($Sjis_re $Pictogram_re); 
     17use vars qw($Sjis_re $Pictogram_re $ExtPictorgram_re); 
    1818$Sjis_re      = qr<$one_byte|$two_bytes>; 
    1919$Pictogram_re = '\xF8[\x9F-\xFC]|\xF9[\x40-\x7E\x80-\xAF]'; 
     20$ExtPictorgram_re = '\xF9[\xB1-\xFC]'; 
    2021 
    2122sub find_pictogram (\$&) { 
     
    2324 
    2425    my $num_found = 0; 
    25     $$r_text =~ s{(($Pictogram_re)|$Sjis_re)}{ 
     26    $$r_text =~ s{(($Pictogram_re)|($ExtPictorgram_re)|$Sjis_re)}{ 
    2627        my $orig_match = $1; 
    27         if (defined $2) { 
     28        if (defined $2 || defined $3) { 
    2829            $num_found++; 
    29             $callback->($orig_match, unpack('n', $orig_match)); 
     30            my $number = unpack 'n', $orig_match; 
     31            $callback->($orig_match, $number, _num2cp($number)); 
    3032        } 
    3133        else { 
     
    3840 
    3941sub encode_pictogram { 
    40     my $text = shift
     42    my($text, %opt) = @_
    4143    find_pictogram($text, sub { 
    42                        my($char, $number) = @_; 
    43                        return '&#' . $number . ';'; 
     44                       my($char, $number, $cp) = @_; 
     45                       if ($opt{unicode} || $cp >= 59148) { 
     46                           return sprintf '&#x%x;', $cp; 
     47                       } else { 
     48                           return '&#' . $number . ';'; 
     49                       } 
    4450                   }); 
    4551    return $text; 
     
    4854sub decode_pictogram { 
    4955    my $html = shift; 
    50     $html =~ s{(\&\#(\d{5});)}{ 
    51         if (($2 >= 63647 && $2 <= 63740) || 
    52             ($2 >= 63808 && $2 <= 63870) || 
    53             ($2 >= 63872 && $2 <= 63919)) { 
    54             pack 'n', $2; 
    55         } 
    56         else { 
    57             $1; 
     56    $html =~ s{(\&\#(\d{5});)|(\&\#x([0-9a-fA-F]{4});)}{ 
     57        if (defined $1) { 
     58            if (($2 >= 63647 && $2 <= 63740) || 
     59                ($2 >= 63808 && $2 <= 63870) || 
     60                ($2 >= 63872 && $2 <= 63919)) { 
     61                pack 'n', $2; 
     62            } else { 
     63                $1; 
     64            } 
     65        } elsif (defined $3) { 
     66            my $cp = hex($4); 
     67            pack 'n', _cp2num($cp); 
    5868        } 
    5969    }eg; 
     
    6979} 
    7080 
     81sub _num2cp { 
     82    my $num = shift; 
     83    if ($num >= 63647 && $num <= 63740) { 
     84        return $num - 4705; 
     85    } elsif (($num >= 63808 && $num <= 63817) || 
     86             ($num >= 63824 && $num <= 63838) || 
     87             ($num >= 63858 && $num <= 63870)) { 
     88        return $num - 4772; 
     89    } elsif (($num >= 63872 && $num <= 63919) || 
     90             ($num >= 63921 && $num <= 63996)) { 
     91        return $num - 4773; 
     92    } else { 
     93        require Carp; 
     94        Carp::carp("unknown number: $num"); 
     95        return; 
     96    } 
     97} 
     98 
     99sub _cp2num { 
     100    my $cp = shift; 
     101    if ($cp >= 58942 && $cp <= 59035) { 
     102        return $cp + 4705; 
     103    } elsif (($cp >= 59036 && $cp <= 59045) || 
     104             ($cp >= 59052 && $cp <= 59066) || 
     105             ($cp >= 59086 && $cp <= 59098)) { 
     106        return $cp + 4772; 
     107    } elsif (($cp >= 59099 && $cp <= 59146) || 
     108             ($cp >= 59148 && $cp <= 59223)) { 
     109        return $cp + 4773; 
     110    } else { 
     111        require Carp; 
     112        Carp::carp("unknown codepoint: $cp"); 
     113        return; 
     114    } 
     115} 
     116 
     117 
    711181; 
    72119__END__ 
     
    109156 
    110157  $html = encode_pictogram($rawtext); 
    111  
    112 Encodes pictogram characters in raw-text into HTML entities. 
     158  $html = encode_pictogram($rawtext, unicode => 1); 
     159 
     160Encodes pictogram characters in raw-text into HTML entities. If 
     161$rawtext contains extended pictograms, they are encoded in Unicode 
     162format. If you add C<unicode> option explicitly, all pictogram 
     163characters are encoded in Unicode format (C<&#xFFFF;>). Otherwise, 
     164encoding is done in decimal format (C<&#NNNNN;>). 
    113165 
    114166=item decode_pictogram 
     
    116168  $rawtext = decode_pictogram($html); 
    117169 
    118 Decodes HTML entities for pictogram into raw-text. 
     170Decodes HTML entities (both for C<&#xFFFF;> and C<&#NNNNN;>) for 
     171pictogram into raw-text in Shift_JIS. 
    119172 
    120173=item remove_pictogram 
     
    137190found. It returns the total numbers of charcters found in text. 
    138191 
    139 The callback is given two arguments. The first is a found pictogram 
     192The callback is given three arguments. The first is a found pictogram 
    140193character itself, and the second is a decimal number which represents 
    141 codepoint of the character. Whatever the callback returns will replace 
    142 the original text. 
    143  
    144 Here is an implementation of encode_pictogram(), which will be the good 
    145 example for the usage of find_pictogram(). 
     194codepoint of the character. The third is a Unicode codepoint. Whatever 
     195the callback returns will replace the original text. 
     196 
     197Here is a stub implementation of encode_pictogram(), which will be the 
     198good example for the usage of find_pictogram(). Note that this example 
     199version doesn't support extended pictograms. 
    146200 
    147201  sub encode_pictogram { 
    148202      my $text = shift; 
    149203      find_pictogram($text, sub { 
    150                          my($char, $number) = @_; 
     204                         my($char, $number, $cp) = @_; 
    151205                         return '&#' . $number . ';'; 
    152206                     }); 
     
    158212=head1 CAVEAT 
    159213 
     214=over 4 
     215 
     216=item * 
     217 
    160218This module works so slow, because regex used here matches C<ANY> 
    161219characters in the text. This is due to the difficulty of extracting 
    162220character boundaries of Shift_JIS encoding. 
    163221 
     222=item * 
     223 
     224Extended pictogram support of this module is not complete. If you 
     225handle pictogram characters in Unicode, try Encode module with perl 
     2265.8.0, or Unicode::Japanese. 
     227 
     228=back 
     229 
    164230=head1 AUTHOR 
    165231 
     
    171237=head1 SEE ALSO 
    172238 
    173 L<HTML::Entities>, http://www.nttdocomo.co.jp/i/tag/emoji/index.html 
     239L<HTML::Entities>, L<Unicode::Japanese>, 
     240http://www.nttdocomo.co.jp/p_s/imode/tag/emoji/ 
    174241 
    175242=cut 
  • HTML-Entities-ImodePictogram/trunk/t/00_pictogram.t

    r221 r697  
    11use strict; 
    2 use Test; 
    3 BEGIN { plan tests => 6 } 
     2use Test::More tests => 6; 
    43 
    54use HTML::Entities::ImodePictogram qw(:all); 
    65 
    76# €Æ€¹€È[À²€ì][ÆÞ€ê]€Æ€¹€È 
    8 my $raw  = "\x82\xc4\x82\xb7\x82\xc6\xf8\x9f\xf8\xa0\x82\xc4\x82\xb7\x82\xc6";  
     7my $raw  = "\x82\xc4\x82\xb7\x82\xc6\xf8\x9f\xf8\xa0\x82\xc4\x82\xb7\x82\xc6"; 
    98my $html = "\x82\xc4\x82\xb7\x82\xc6&#63647;&#63648;\x82\xc4\x82\xb7\x82\xc6"; 
    109 
    11 ok(encode_pictogram($raw), $html); 
    12 ok(decode_pictogram($html), $raw); 
    13 ok(length(remove_pictogram($raw)) == 6 * 2); 
     10is(encode_pictogram($raw), $html); 
     11is(decode_pictogram($html), $raw); 
     12is(length(remove_pictogram($raw)), 6 * 2); 
    1413 
    1514my $text = $raw; 
     
    1716my $num_found = find_pictogram($text, sub { push @bin, $_[0]; push @num, $_[1]; }); 
    1817 
    19 ok("@bin", "\xf8\x9f \xf8\xa0")
    20 ok("@num", "63647 63648")
    21 ok($num_found, 2); 
     18is_deeply \@bin, ["\xf8\x9f", "\xf8\xa0"]
     19is_deeply \@num, [ 63647, 63648 ]
     20is($num_found, 2); 
    2221 
    2322