root/HTML-Entities-ImodePictogram/trunk/lib/HTML/Entities/ImodePictogram.pm

Revision 221 (checked in by miyagawa, 19 years ago)

Initial revision

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
Line 
1 package HTML::Entities::ImodePictogram;
2
3 use strict;
4 use vars qw($VERSION);
5 $VERSION = '0.02';
6
7 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
8 require Exporter;
9 @ISA       = qw(Exporter);
10 @EXPORT    = qw(encode_pictogram decode_pictogram remove_pictogram);
11 @EXPORT_OK = qw(find_pictogram);
12 %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
13
14 my $one_byte  = '[\x00-\x7F\xA1-\xDF]';
15 my $two_bytes = '[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]';
16
17 use vars qw($Pictogram_re $Sjis_re);
18 $Pictogram_re = '\xF8[\x9F-\xFC]|\xF9[\x40-\x7E\x80-\xAF]';
19 $Sjis_re      = qr<$one_byte|$two_bytes|$Pictogram_re>;
20
21 sub find_pictogram (\$&) {
22     my($r_text, $callback) = @_;
23
24     my $num_found = 0;
25     $$r_text =~ s{($Sjis_re)}{
26         my $orig_match = $1;
27         if ($orig_match =~ /^$Pictogram_re$/) {
28             $num_found++;
29             $callback->($orig_match, _bin2num($orig_match));
30         }
31         else {
32             $orig_match;
33         }
34     }eg;
35
36     return $num_found;
37 }
38
39 sub encode_pictogram {
40     my $text = shift;
41     find_pictogram($text, sub {
42                        my($char, $number) = @_;
43                        return '&#' . $number . ';';
44                    });
45     return $text;
46 }
47
48 sub decode_pictogram {
49     my $html = shift;
50     $html =~ s{(\&\#(\d{5});)}{
51         if (($2 >= 63647 && $2 <= 63740) ||
52             ($2 >= 63808 && $2 <= 63870) ||
53             ($2 >= 63872 && $2 <= 63919)) {
54             _num2bin($2);
55         }
56         else {
57             $1;
58         }
59     }eg;
60     return $html;
61 }
62
63 sub remove_pictogram {
64     my $text = shift;
65     find_pictogram($text, sub {
66                        return '';
67                    });
68     return $text;
69 }
70
71 sub _num2bin {
72     my $num = shift;
73     my $hex = sprintf '%x', $num;
74     $hex =~ s/([0-9a-f]{2})/chr(hex($1))/ge;
75     return $hex;
76 }
77
78 sub _bin2num {
79     my $bin = shift;
80     $bin =~ s/(.)(.)/256 * ord($1) + ord($2)/eg;
81     return $bin;
82 }
83
84 1;
85 __END__
86
87 =head1 NAME
88
89 HTML::Entities::ImodePictogram - encode / decode i-mode pictogram
90
91 =head1 SYNOPSIS
92
93   use HTML::Entities::ImodePictogram;
94
95   $html      = encode_pictogram($rawtext);
96   $rawtext   = decode_pictogram($html);
97   $cleantext = remove_pictogram($rawtext);
98
99   use HTML::Entities::ImodePictogram qw(find_pictogram);
100
101   $num_found = find_pictogram($rawtext, \&callback);
102
103 =head1 DESCRIPTION
104
105 HTML::Entities::ImodePictogram handles HTML entities for i-mode
106 pictogram (emoji), which are assigned in Shift_JIS private area.
107
108 See http://www.nttdocomo.co.jp/i/tag/emoji/index.html for details
109 about i-mode pictogram.
110
111 =head1 FUNCTIONS
112
113 In all functions in this module, input/output strings are asssumed as
114 encoded in Shift_JIS. See L<Jcode> for conversion between Shift_JIS
115 and other encodings like EUC-JP or UTF-8.
116
117 This module exports following functions by default.
118
119 =over 4
120
121 =item encode_pictogram
122
123   $html = encode_pictogram($rawtext);
124
125 Encodes pictogram characters in raw-text into HTML entities.
126
127 =item decode_pictogram
128
129   $rawtext = decode_pictogram($html);
130
131 Decodes HTML entities for pictogram into raw-text.
132
133 =item remove_pictogram
134
135   $cleantext = remove_pictogram($rawtext);
136
137 Removes pictogram characters in raw-text.
138
139 =back
140
141 This module also exports following functions on demand.
142
143 =over 4
144
145 =item find_pictogram
146
147   $num_found = find_pictorgram($rawtext, \&callback);
148
149 Finds pictogram characters in raw-text and executes callback when
150 found. It returns the total numbers of charcters found in text.
151
152 The callback is given two arguments. The first is a found pictogram
153 character itself, and the second is a decimal number which represents
154 codepoint of the character. Whatever the callback returns will replace
155 the original text.
156
157 Here is an implementation of encode_pictogram(), which will be the good
158 example for the usage of find_pictogram().
159
160   sub encode_pictogram {
161       my $text = shift;
162       find_pictogram($text, sub {
163                          my($char, $number) = @_;
164                          return '&#' . $number . ';';
165                      });
166       return $text;
167   }
168
169 =back
170
171 =head1 CAVEAT
172
173 This module works so slow, because regex used here matches C<ANY>
174 characters in the text. This is due to the difficulty of extracting
175 character boundaries of Shift_JIS encoding.
176
177 =head1 AUTHOR
178
179 Tatsuhiko Miyagawa <miyagawa@bulknews.net>
180
181 This library is free software; you can redistribute it and/or
182 modify it under the same terms as Perl itself.
183
184 =head1 SEE ALSO
185
186 L<HTML::Entities>, http://www.nttdocomo.co.jp/i/tag/emoji/index.html
187
188 =cut
189
Note: See TracBrowser for help on using the browser.