root/Lingua-JA-Hepburn-Passport/trunk/lib/Lingua/JA/Hepburn/Passport.pm

Revision 2004 (checked in by miyagawa, 14 years ago)

fixed typo

Line 
1 package Lingua::JA::Hepburn::Passport;
2
3 use strict;
4 our $VERSION = '0.01';
5
6 use encoding "utf-8";
7 use Carp;
8
9 our %Map = (
10     "あ", "A",
11     "い", "I",
12     "う", "U",
13     "え", "E",
14     "お", "O",
15     "か", "KA",
16     "き", "KI",
17     "く", "KU",
18     "け", "KE",
19     "こ", "KO",
20     "さ", "SA",
21     "し", "SHI",
22     "す", "SU",
23     "せ", "SE",
24     "そ", "SO",
25     "た", "TA",
26     "ち", "CHI",
27     "つ", "TSU",
28     "て", "TE",
29     "と", "TO",
30     "な", "NA",
31     "に", "NI",
32     "ぬ", "NU",
33     "ね", "NE",
34     "の", "NO",
35     "は", "HA",
36     "ひ", "HI",
37     "ふ", "FU",
38     "へ", "HE",
39     "ほ", "HO",
40     "ま", "MA",
41     "み", "MI",
42     "む", "MU",
43     "め", "ME",
44     "も", "MO",
45     "や", "YA",
46     "ゆ", "YU",
47     "よ", "YO",
48     "ら", "RA",
49     "り", "RI",
50     "る", "RU",
51     "れ", "RE",
52     "ろ", "RO",
53     "わ", "WA",
54     "ゐ", "I",
55     "ゑ", "E",
56     "を", "O",
57     "ん", "N",
58     "ぁ", "A",
59     "ぃ", "I",
60     "ぅ", "U",
61     "ぇ", "E",
62     "ぉ", "O",
63     "が", "GA",
64     "ぎ", "GI",
65     "ぐ", "GU",
66     "げ", "GE",
67     "ご", "GO",
68     "ざ", "ZA",
69     "じ", "JI",
70     "ず", "ZU",
71     "ぜ", "ZE",
72     "ぞ", "ZO",
73     "だ", "DA",
74     "ぢ", "JI",
75     "づ", "ZU",
76     "で", "DE",
77     "ど", "DO",
78     "ば", "BA",
79     "び", "BI",
80     "ぶ", "BU",
81     "べ", "BE",
82     "ぼ", "BO",
83     "ぱ", "PA",
84     "ぴ", "PI",
85     "ぷ", "PU",
86     "ぺ", "PE",
87     "ぽ", "PO",
88     "きゃ", "KYA",
89     "きゅ", "KYU",
90     "きょ", "KYO",
91     "しゃ", "SHA",
92     "しゅ", "SHU",
93     "しょ", "SHO",
94     "ちゃ", "CHA",
95     "ちゅ", "CHU",
96     "ちょ", "CHO",
97     "ちぇ", "CHE",
98     "にゃ", "NYA",
99     "にゅ", "NYU",
100     "にょ", "NYO",
101     "ひゃ", "HYA",
102     "ひゅ", "HYU",
103     "ひょ", "HYO",
104     "みゃ", "MYA",
105     "みゅ", "MYU",
106     "みょ", "MYO",
107     "りゃ", "RYA",
108     "りゅ", "RYU",
109     "りょ", "RYO",
110     "ぎゃ", "GYA",
111     "ぎゅ", "GYU",
112     "ぎょ", "GYO",
113     "じゃ", "JA",
114     "じゅ", "JU",
115     "じょ", "JO",
116     "びゃ", "BYA",
117     "びゅ", "BYU",
118     "びょ", "BYO",
119     "ぴゃ", "PYA",
120     "ぴゅ", "PYU",
121     "ぴょ", "PYO",
122 );
123
124 sub new {
125     my($class, %opt) = @_;
126     bless { %opt }, $class;
127 }
128
129 sub _hepburn_for {
130     my($string, $index) = @_;
131
132     my($hepburn, $char);
133     if ($index + 1 < length $string) {
134         $char    = substr $string, $index, 2;
135         $hepburn = $Map{$char};
136     }
137     if (!$hepburn && $index < length $string) {
138         $char    = substr $string, $index, 1;
139         $hepburn = $Map{$char};
140     }
141
142     return { char => $char, hepburn => $hepburn };
143 }
144
145 sub romanize {
146     my($self, $string) = @_;
147
148     unless (utf8::is_utf8($string)) {
149         croak "romanize(string): should be UTF-8 flagged string";
150     }
151
152     $string =~ tr/ア-ン/あ-ん/;
153
154     if ($self->{strict}) {
155         $string =~ /^\p{Hiragana}*$/
156             or croak "romanize(string): should be all Hiragana/Katakana";
157     }
158
159     my $output;
160     my $last_hepburn;
161     my $last_char;
162     my $i = 0;
163
164     while ($i < length $string) {
165         my $hr = _hepburn_for($string, $i);
166
167         # 1.撥音 ヘボン式ではB ・M ・P の前に N の代わりに M をおく
168         if ($hr->{char} eq 'ん') {
169             my $next = _hepburn_for($string, $i + 1);
170             $hr->{hepburn} = $next->{hepburn} && $next->{hepburn} =~ /^[BMP]/
171                 ? 'M' : 'N';
172         }
173
174         # 2.促音 子音を重ねて示す
175         elsif ($hr->{char} eq 'っ') {
176             my $next = _hepburn_for($string, $i + 1);
177
178             # チ(CH I)、チャ(CHA)、チュ(CHU)、チョ(CHO)音に限り、その前に T を加える。
179             if ($next->{hepburn}) {
180                 $hr->{hepburn} = $next->{hepburn} =~ /^CH/
181                     ? 'T' : substr($next->{hepburn}, 0, 1);
182             }
183         }
184
185         # 3.長音 ヘボン式では長音を表記しない
186         elsif ($hr->{char} eq "ー") {
187             $hr->{hepburn} = "";
188         }
189
190         # Japanese Passport table doesn't have entries for ぁ-ぉ
191         elsif ($hr->{char} =~ /[ぁ-ぉ]/ && $self->{strict}) {
192             croak "$hr->{char} is not allowed";
193         }
194
195         if (defined $hr->{hepburn}) {
196             if ($last_hepburn) {
197                 my $h_test = $last_hepburn . $hr->{hepburn};
198                 if (length $h_test > 2) {
199                     $h_test = substr $h_test, -2;
200                 }
201
202                 # 3.長音 ヘボン式では長音を表記しない
203                 if (grep $h_test eq $_, qw( AA II UU EE )) {
204                     $hr->{hepburn} = '';
205                 }
206
207                 # 氏名に「オウ」又は「オオ」の長音が含まれる場合、
208                 # 「 O 」 か 「 OH 」 のいずれかの表記を選択することができる
209                 if (grep $h_test eq $_, qw( OO OU )) {
210                     $hr->{hepburn} = $self->{long_vowels_h} ? 'H' : '';
211                 }
212             }
213
214             $output .= $hr->{hepburn};
215         } else {
216             if ($self->{strict}) {
217                 croak "Can't find hepburn replacement for $hr->{char}";
218             }
219             $output .= $hr->{char};
220         }
221
222         $last_hepburn = $hr->{hepburn};
223         $last_char    = $hr->{char};
224         $i += length $hr->{char};
225     }
226
227     return $output;
228 }
229
230 1;
231 __END__
232
233 =encoding utf-8
234
235 =head1 NAME
236
237 Lingua::JA::Hepburn::Passport - Hepburn Romanization using Japanese passport rules
238
239 =head1 SYNOPSIS
240
241   use utf8;
242   use Lingua::JA::Hepburn::Passport;
243
244   my $hepburn = Lingua::JA::Hepburn::Passport->new;
245   $hepburn->romanize("みやがわ");     # MIYAGAWA
246   $hepburn->romanize("おおの");       # ONO
247   $hepburn->romanize("かとう");       # KATO
248   $hepburn->romanize("ゆうこ");       # YUKO
249   $hepburn->romanize("なんば");       # NAMBA
250   $hepburn->romanize("はっちょう");   # HATCHO
251
252   # Indicate long vowels by "h"
253   my $hepburn = Lingua::JA::Hepburn::Passport->new( long_vowels_h => 1 );
254   $hepburn->romanize("おおの");       # OHNO
255   $hepburn->romanize("かとう");       # KATOH
256
257 =head1 DESCRIPTION
258
259 Lingua::JA::Hepburn::Passport is a Hiragana/Katakana to Romanization
260 engine using Japanese passport rules. There're already some Hepburn
261 romanization modules on CPAN but none of them conform to the one used
262 in Japanese passport, hence I made another one.
263
264 =head1 METHODS
265
266 =over 4
267
268 =item new
269
270   $hepburn = Lingua::JA::Hepburn::Passport->new;
271   $hepburn = Lingua::JA::Hepburn::Passport->new( long_vowels_h => 1 );
272
273 Creates new object. Optionally you can pass I<long_vowels_h> parameter
274 to 1, with which this module tries to add I<H> to the long vowels
275 I<OO> and I<OU>, as allowed in Japanese passport rules.
276
277 =item romanize
278
279   $roman = $hepburn->romanize( $kana );
280
281 Romanizes the string I<$kana> using Hepburn romanization. I<$kana>
282 should be either Hiragana or Katakana, as an Unicode string in Perl
283 (a.k.a UTF-8 flagged), otherwise it throws an error. Returned
284 I<$roman> would be all upper case roman letters.
285
286 =back
287
288 This module doesn't come with I<deromanize> method (yet), which would
289 do the Roman to Katakana/Hiragana translation, since I don't think we
290 need it. Other modules on CPAN already do the job quite nicely.
291
292 =head1 AUTHOR
293
294 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
295
296 =head1 LICENSE
297
298 This library is free software; you can redistribute it and/or modify
299 it under the same terms as Perl itself.
300
301 Some of the code algorithm are ripped off from http://www.d-project.com/hebonconv/
302
303 =head1 SEE ALSO
304
305 L<http://www.seikatubunka.metro.tokyo.jp/hebon/>, L<http://en.wikipedia.org/wiki/Hepburn_romanization>, L<Lingua::JA::Romanize::Kana>
306
307 =cut
Note: See TracBrowser for help on using the browser.