root/HTML-Entities-ImodePictogram/trunk/lib/HTML/Entities/ImodePictogram.pm

Revision 224 (checked in by miyagawa, 19 years ago)

binary <=> number handled via pack/unpack

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
Line 
1 package HTML::Entities::ImodePictogram;
2
3 use strict;
4 use vars qw($VERSION);
5 $VERSION = '0.03';
6
7 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
8 require Exporter;
9 @ISA       = qw(Exporter);
10 @EXPORT    = qw(encode_pictogram decode_pictogram remove_pictogram);
11 @EXPORT_OK = qw(find_pictogram);
12 %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
13
14 my $one_byte  = '[\x00-\x7F\xA1-\xDF]';
15 my $two_bytes = '[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]';
16
17 use vars qw($Pictogram_re $Sjis_re);
18 $Pictogram_re = '\xF8[\x9F-\xFC]|\xF9[\x40-\x7E\x80-\xAF]';
19 $Sjis_re      = qr<$one_byte|$two_bytes|$Pictogram_re>;
20
21 sub find_pictogram (\$&) {
22     my($r_text, $callback) = @_;
23
24     my $num_found = 0;
25     $$r_text =~ s{($Sjis_re)}{
26         my $orig_match = $1;
27         if ($orig_match =~ /^$Pictogram_re$/) {
28             $num_found++;
29             $callback->($orig_match, unpack('n', $orig_match));
30         }
31         else {
32             $orig_match;
33         }
34     }eg;
35
36     return $num_found;
37 }
38
39 sub encode_pictogram {
40     my $text = shift;
41     find_pictogram($text, sub {
42                        my($char, $number) = @_;
43                        return '&#' . $number . ';';
44                    });
45     return $text;
46 }
47
48 sub decode_pictogram {
49     my $html = shift;
50     $html =~ s{(\&\#(\d{5});)}{
51         if (($2 >= 63647 && $2 <= 63740) ||
52             ($2 >= 63808 && $2 <= 63870) ||
53             ($2 >= 63872 && $2 <= 63919)) {
54             pack 'n', $2;
55         }
56         else {
57             $1;
58         }
59     }eg;
60     return $html;
61 }
62
63 sub remove_pictogram {
64     my $text = shift;
65     find_pictogram($text, sub {
66                        return '';
67                    });
68     return $text;
69 }
70
71 1;
72 __END__
73
74 =head1 NAME
75
76 HTML::Entities::ImodePictogram - encode / decode i-mode pictogram
77
78 =head1 SYNOPSIS
79
80   use HTML::Entities::ImodePictogram;
81
82   $html      = encode_pictogram($rawtext);
83   $rawtext   = decode_pictogram($html);
84   $cleantext = remove_pictogram($rawtext);
85
86   use HTML::Entities::ImodePictogram qw(find_pictogram);
87
88   $num_found = find_pictogram($rawtext, \&callback);
89
90 =head1 DESCRIPTION
91
92 HTML::Entities::ImodePictogram handles HTML entities for i-mode
93 pictogram (emoji), which are assigned in Shift_JIS private area.
94
95 See http://www.nttdocomo.co.jp/i/tag/emoji/index.html for details
96 about i-mode pictogram.
97
98 =head1 FUNCTIONS
99
100 In all functions in this module, input/output strings are asssumed as
101 encoded in Shift_JIS. See L<Jcode> for conversion between Shift_JIS
102 and other encodings like EUC-JP or UTF-8.
103
104 This module exports following functions by default.
105
106 =over 4
107
108 =item encode_pictogram
109
110   $html = encode_pictogram($rawtext);
111
112 Encodes pictogram characters in raw-text into HTML entities.
113
114 =item decode_pictogram
115
116   $rawtext = decode_pictogram($html);
117
118 Decodes HTML entities for pictogram into raw-text.
119
120 =item remove_pictogram
121
122   $cleantext = remove_pictogram($rawtext);
123
124 Removes pictogram characters in raw-text.
125
126 =back
127
128 This module also exports following functions on demand.
129
130 =over 4
131
132 =item find_pictogram
133
134   $num_found = find_pictorgram($rawtext, \&callback);
135
136 Finds pictogram characters in raw-text and executes callback when
137 found. It returns the total numbers of charcters found in text.
138
139 The callback is given two arguments. The first is a found pictogram
140 character itself, and the second is a decimal number which represents
141 codepoint of the character. Whatever the callback returns will replace
142 the original text.
143
144 Here is an implementation of encode_pictogram(), which will be the good
145 example for the usage of find_pictogram().
146
147   sub encode_pictogram {
148       my $text = shift;
149       find_pictogram($text, sub {
150                          my($char, $number) = @_;
151                          return '&#' . $number . ';';
152                      });
153       return $text;
154   }
155
156 =back
157
158 =head1 CAVEAT
159
160 This module works so slow, because regex used here matches C<ANY>
161 characters in the text. This is due to the difficulty of extracting
162 character boundaries of Shift_JIS encoding.
163
164 =head1 AUTHOR
165
166 Tatsuhiko Miyagawa <miyagawa@bulknews.net>
167
168 This library is free software; you can redistribute it and/or
169 modify it under the same terms as Perl itself.
170
171 =head1 SEE ALSO
172
173 L<HTML::Entities>, http://www.nttdocomo.co.jp/i/tag/emoji/index.html
174
175 =cut
176
Note: See TracBrowser for help on using the browser.