root/Convert-RACE/trunk/lib/Convert/RACE.pm

Revision 44 (checked in by miyagawa, 19 years ago)

added address

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
Line 
1 package Convert::RACE;
2
3 use strict;
4 use vars qw($VERSION @ISA @EXPORT);
5
6 BEGIN {
7     require Exporter;
8     @ISA = qw(Exporter);
9     @EXPORT = qw(to_race from_race);
10    
11     $VERSION = '0.05';
12 }
13
14 use Carp ();
15 use Convert::Base32 qw(encode_base32 decode_base32);
16
17 use constant COMPRESS_EXCEPTION         => 'Invalid encoding to compress';
18 use constant DECOMPRESS_EXCEPTION       => 'Invalid format to decompress';
19
20 my $_prefix_tag = 'bq--';
21
22 sub prefix_tag {
23     my $class = shift;
24     $_prefix_tag = $_[0] if (@_);
25     return $_prefix_tag;
26 }
27
28 sub to_race($) {
29     my $str = shift;
30
31     # 2.2.1 Check the input string for disallowed names
32     unless (_include_disallowed_names($str)) {
33         Carp::croak('String includes no internationalized characters');
34     }
35
36     # 2.2.2 Compress the pre-converted string
37     my $compressed = _compress($str);
38
39     # 2.2.3 Check the length of the compressed string
40     if (length($compressed) > 36) {
41         Carp::croak('String too long');
42     }
43
44     # 2.2.4 Encode the compressed string with Base32
45     my $encoded = encode_base32($compressed);
46
47     # 2.2.5 Prepend "bq--" to the encoded string and finish
48     return $_prefix_tag . $encoded;
49 }
50
51 sub from_race($) {
52     my $str = lc(shift);
53
54     # 2.3.1 Strip the "bq--"
55     $str =~ s/^$_prefix_tag// or Carp::croak("String not begin with $_prefix_tag");
56
57     # 2.3.2 Decode the stripped string with Base32
58     my $decoded = decode_base32($str);
59
60     # 2.3.3 Decompress the decoded string
61     my $decompressed = _decompress($decoded);
62
63     # 2.3.4 Check the internationalized string for disallowed names
64     unless (_include_disallowed_names($decompressed)) {
65         Carp::croak('Decoded string includes no internationalized characters');
66     }
67
68     return $decompressed;
69 }
70
71
72 sub _compress($) {
73     my $str = shift;
74
75     my @unique_upper_octet = _make_uniq_upper_octet($str);
76      if (@unique_upper_octet > 2 ||
77          (@unique_upper_octet == 2 &&
78           ! grep { $_ eq "\x00" } @unique_upper_octet)) {
79         # from more than 2 rows
80         # or from 2 rows neither of with is 0
81         return "\xD8" . $str;
82     }
83
84     my $u1 = @unique_upper_octet == 1
85         ? $unique_upper_octet[0] : (grep { $_ ne "\x00" } @unique_upper_octet)[0];
86     if ($u1 =~ /^[\xd8-\xdc]{1}$/) {
87         Carp::croak(COMPRESS_EXCEPTION);
88     }
89
90     my $res = $u1;
91
92     while ($str =~ m/(.)(.)/gs) {
93         my ($u2, $n1) = ($1, $2);
94         if ($u2 eq "\x00" and $n1 eq "\x99") {
95             Carp::croak(COMPRESS_EXCEPTION);
96         } elsif ($u2 eq $u1 and $n1 ne "\xff") {
97             $res .= $n1;
98         } elsif ($u2 eq $u1 and $n1 eq "\xff") {
99             $res .= "\xff\x99";
100         } else {
101             $res .= "\xff$n1";
102         }
103     }
104    
105     return $res;
106 }
107
108
109 sub _decompress($) {
110     my $str = shift;
111    
112     # 1)
113     my ($u1, $rest) = (substr($str,0,1), substr($str,1));
114     if (length($str) == 1) {
115         Carp::croak(DECOMPRESS_EXCEPTION);
116     }
117
118     if ($u1 eq "\xd8") {
119         # 8)
120         my $lcheck = $rest;
121         if (length($lcheck) % 2) {
122             Carp::croak(DECOMPRESS_EXCEPTION);
123         }
124         # 9)
125         my @unique_upper_octet = _make_uniq_upper_octet($lcheck);
126         if (@unique_upper_octet == 1 ||
127             (@unique_upper_octet == 2 &&
128              grep { $_ eq "\x00" } @unique_upper_octet)) {
129             Carp::croak(DECOMPRESS_EXCEPTION);
130         }
131         # 10)
132         return $lcheck;
133     }
134
135     my $buffer = '';
136     my $pos = 0;
137     # 2)
138     while (1) {
139         if ($pos == length($rest)) {
140             # 11)
141             if (length($buffer) % 2) {
142                 Carp::croak(DECOMPRESS_EXCEPTION);
143             }
144             return $buffer;
145         }
146        
147         my $n1 = substr($rest, $pos, 1);
148         if ($n1 eq "\xff") {
149             # 5)
150             if ($pos == length($rest)-1) {
151               Carp::croak(DECOMPRESS_EXCEPTION);
152             }
153             # 6)
154             $pos++;
155             $n1 = substr($rest, $pos, 1);
156             if ($n1 eq "\x99") {
157                 $buffer .= $u1 . "\xff";
158                 next;
159             }
160             # 7)
161             $buffer .= "\x00" . $n1;
162             next;
163         } elsif ($u1 eq "\x00" and $n1 eq "\x99") {
164             # 3)
165             Carp::croak(DECOMPRESS_EXCEPTION);
166         }
167         # 4)
168         $buffer .= $u1 . $n1;
169         next;
170     } continue { $pos++; }
171 }
172
173
174 sub _make_uniq_upper_octet($) {
175     my $str = shift;
176
177     my %seen;
178     while ($str =~ m/(.)./gs) {
179         $seen{$1}++;
180     }
181     return keys %seen;
182 }
183
184 sub _include_disallowed_names($) {
185     # RFC 1035: letter, digit, hyphen
186     return $_[0] !~ /^(?:\x00[\x30-\x39\x41-\x5a\x61-\x7a\x2d])*$/;
187 }
188
189
190 1;
191 __END__
192
193 =head1 NAME
194
195 Convert::RACE - Conversion between Unicode and RACE
196
197 =head1 SYNOPSIS
198
199   use Convert::RACE;
200
201   $domain = to_race($utf16str);
202   $utf16str = from_race($domain);
203
204 =head1 DESCRIPTION
205
206 This module provides functions to convert between RACE (Row-based
207 ASCII-Compatible Encoding) and Unicode Encodings.
208
209 RACE converts strings with internationalized characters into strings
210 of US-ASCII that are acceptable as host name parts in current DNS host
211 naming usage.
212
213 See http://www.ietf.org/internet-drafts/draft-ietf-idn-race-03.txt for
214 more details.
215
216 =head1 FUNCTION
217
218 Following functions are provided; they are all in B<@EXPORT> array.
219 See L<Exporter> for details.
220
221 =over 4
222
223 =item to_race($utf16)
224
225 to_race() takes UTF-16 encoding and returns RACE-encoded strings such
226 as 'bq--aewrcsy'.
227
228 This function throws an exception such as 'String includes no
229 internationalized characters', 'String too long' and 'Invalid encoding
230 to compress'. Exceptions are thrown with Carp::croak(), so you can
231 cath 'em with eval {};
232
233 =item from_race($domain_name)
234
235 from_race() takes 'bq--' prefixed string and returns original UTF-16
236 string.
237
238 This function throws an exception such as 'String not begin with
239 bq--', 'Decoded string includes no internationalized characters' and '
240 Invalid format to decompress'. Exceptions are thrown with
241 Carp::croak(), so you can cath 'em with eval {};
242
243 =back
244
245 See L<Unicode::String>, L<Unicode::Map8>, L<Jcode> for Unicode
246 conversions.
247
248 =head1 CLASS METHOD
249
250 Following class methods are provided to change the behaviour of
251 Convert::RACE.
252
253 =over 4
254
255 =item prefix_tag()
256
257 Set and get the domain prefix tag. By default, 'bq--'.
258
259 =back
260
261 =head1 EXAMPLES
262
263   use Jcode;
264   use Unicode::String 'latin1';
265   use Convert::RACE 'to_race';
266
267   # EUC-japanese here
268   $name = to_race(Jcode->new('ÆüËÜžì','euc')->ucs2);
269  
270   # or, Latin here
271   $name = to_race(latin1($latin_string)->utf16);
272
273   # in doubt of exception
274   eval { $name = to_race($utf); };
275   if ($@) {
276       warn "Can't encode to RACE: $@";
277   }
278
279   # change the prefix
280   Convert::RACE->prefix_tag('xx--');
281
282
283 =head1 TODO AND CAVEATS
284
285 =over 4
286
287 =item *
288
289 Using XS would be by far efficient.
290
291 =item *
292
293 No validation is done for the input UTF-16 string in to_race(). The
294 internet draft says checking for prohibited name parts must be done
295 before doing the conversion.
296
297 =back
298
299 =head1 AUTHOR
300
301 Tatsuhiko Miyagawa <miyagawa@bulknews.net>, with much help from Eugen
302 SAVIN <seugen@serifu.com>, Philip Newton <pne@cpan.org>, Michael J
303 Schout <mschout@gkg.net>.
304
305 This library is free software; you can redistribute it and/or
306 modify it under the same terms as Perl itself.
307
308 =head1 SEE ALSO
309
310 http://www.i-d-n.net/, http://www.ietf.org/internet-drafts/draft-ietf-idn-race-03.txt, RFC 1035, L<Unicode::String>, L<Jcode>, L<Convert::Base32>.
311
312 =cut
Note: See TracBrowser for help on using the browser.