root/Lingua-JA-Hepburn-Passport/trunk/lib/Lingua/JA/Hepburn/Passport.pm

Revision 2402 (checked in by miyagawa, 13 years ago)

Checking in changes prior to tagging of version 0.02. Changelog diff is:

=== Changes
==================================================================
--- Changes (revision 5488)
+++ Changes (local)
@@ -1,4 +1,8 @@

Revision history for Perl extension Lingua::JA::Hepburn
Passport

- 0.01 Sun Sep 24 20:07:41 2006
+0.02 Sun Dec 9 03:37:15 PST 2007
+ - twekaed docs
+ - changed use encoding "utf-8" to use utf8;
+
+0.01 Sun Sep 24 20:07:41 2006

- original version

Line 
1 package Lingua::JA::Hepburn::Passport;
2
3 use strict;
4 our $VERSION = '0.02';
5
6 use utf8;
7 use Carp;
8
9 our %Map = (
10     "あ", "A",
11     "い", "I",
12     "う", "U",
13     "え", "E",
14     "お", "O",
15     "か", "KA",
16     "き", "KI",
17     "く", "KU",
18     "け", "KE",
19     "こ", "KO",
20     "さ", "SA",
21     "し", "SHI",
22     "す", "SU",
23     "せ", "SE",
24     "そ", "SO",
25     "た", "TA",
26     "ち", "CHI",
27     "つ", "TSU",
28     "て", "TE",
29     "と", "TO",
30     "な", "NA",
31     "に", "NI",
32     "ぬ", "NU",
33     "ね", "NE",
34     "の", "NO",
35     "は", "HA",
36     "ひ", "HI",
37     "ふ", "FU",
38     "へ", "HE",
39     "ほ", "HO",
40     "ま", "MA",
41     "み", "MI",
42     "む", "MU",
43     "め", "ME",
44     "も", "MO",
45     "や", "YA",
46     "ゆ", "YU",
47     "よ", "YO",
48     "ら", "RA",
49     "り", "RI",
50     "る", "RU",
51     "れ", "RE",
52     "ろ", "RO",
53     "わ", "WA",
54     "ゐ", "I",
55     "ゑ", "E",
56     "を", "O",
57     "ん", "N",
58     "ぁ", "A",
59     "ぃ", "I",
60     "ぅ", "U",
61     "ぇ", "E",
62     "ぉ", "O",
63     "が", "GA",
64     "ぎ", "GI",
65     "ぐ", "GU",
66     "げ", "GE",
67     "ご", "GO",
68     "ざ", "ZA",
69     "じ", "JI",
70     "ず", "ZU",
71     "ぜ", "ZE",
72     "ぞ", "ZO",
73     "だ", "DA",
74     "ぢ", "JI",
75     "づ", "ZU",
76     "で", "DE",
77     "ど", "DO",
78     "ば", "BA",
79     "び", "BI",
80     "ぶ", "BU",
81     "べ", "BE",
82     "ぼ", "BO",
83     "ぱ", "PA",
84     "ぴ", "PI",
85     "ぷ", "PU",
86     "ぺ", "PE",
87     "ぽ", "PO",
88     "きゃ", "KYA",
89     "きゅ", "KYU",
90     "きょ", "KYO",
91     "しゃ", "SHA",
92     "しゅ", "SHU",
93     "しょ", "SHO",
94     "ちゃ", "CHA",
95     "ちゅ", "CHU",
96     "ちょ", "CHO",
97     "ちぇ", "CHE",
98     "にゃ", "NYA",
99     "にゅ", "NYU",
100     "にょ", "NYO",
101     "ひゃ", "HYA",
102     "ひゅ", "HYU",
103     "ひょ", "HYO",
104     "みゃ", "MYA",
105     "みゅ", "MYU",
106     "みょ", "MYO",
107     "りゃ", "RYA",
108     "りゅ", "RYU",
109     "りょ", "RYO",
110     "ぎゃ", "GYA",
111     "ぎゅ", "GYU",
112     "ぎょ", "GYO",
113     "じゃ", "JA",
114     "じゅ", "JU",
115     "じょ", "JO",
116     "びゃ", "BYA",
117     "びゅ", "BYU",
118     "びょ", "BYO",
119     "ぴゃ", "PYA",
120     "ぴゅ", "PYU",
121     "ぴょ", "PYO",
122 );
123
124 sub new {
125     my($class, %opt) = @_;
126     bless { %opt }, $class;
127 }
128
129 sub _hepburn_for {
130     my($string, $index) = @_;
131
132     my($hepburn, $char);
133     if ($index + 1 < length $string) {
134         $char    = substr $string, $index, 2;
135         $hepburn = $Map{$char};
136     }
137     if (!$hepburn && $index < length $string) {
138         $char    = substr $string, $index, 1;
139         $hepburn = $Map{$char};
140     }
141
142     return { char => $char, hepburn => $hepburn };
143 }
144
145 sub romanize {
146     my($self, $string) = @_;
147
148     unless (utf8::is_utf8($string)) {
149         croak "romanize(string): should be UTF-8 flagged string";
150     }
151
152     $string =~ tr/ア-ン/あ-ん/;
153
154     if ($self->{strict}) {
155         $string =~ /^\p{Hiragana}*$/
156             or croak "romanize(string): should be all Hiragana/Katakana";
157     }
158
159     my $output;
160     my $last_hepburn;
161     my $last_char;
162     my $i = 0;
163
164     while ($i < length $string) {
165         my $hr = _hepburn_for($string, $i);
166
167         # 1.撥音 ヘボン式ではB ・M ・P の前に N の代わりに M をおく
168         if ($hr->{char} eq 'ん') {
169             my $next = _hepburn_for($string, $i + 1);
170             $hr->{hepburn} = $next->{hepburn} && $next->{hepburn} =~ /^[BMP]/
171                 ? 'M' : 'N';
172         }
173
174         # 2.促音 子音を重ねて示す
175         elsif ($hr->{char} eq 'っ') {
176             my $next = _hepburn_for($string, $i + 1);
177
178             # チ(CH I)、チャ(CHA)、チュ(CHU)、チョ(CHO)音に限り、その前に T を加える。
179             if ($next->{hepburn}) {
180                 $hr->{hepburn} = $next->{hepburn} =~ /^CH/
181                     ? 'T' : substr($next->{hepburn}, 0, 1);
182             }
183         }
184
185         # 3.長音 ヘボン式では長音を表記しない
186         elsif ($hr->{char} eq "ー") {
187             $hr->{hepburn} = "";
188         }
189
190         # Japanese Passport table doesn't have entries for ぁ-ぉ
191         elsif ($hr->{char} =~ /[ぁ-ぉ]/ && $self->{strict}) {
192             croak "$hr->{char} is not allowed";
193         }
194
195         if (defined $hr->{hepburn}) {
196             if ($last_hepburn) {
197                 my $h_test = $last_hepburn . $hr->{hepburn};
198                 if (length $h_test > 2) {
199                     $h_test = substr $h_test, -2;
200                 }
201
202                 # 3.長音 ヘボン式では長音を表記しない
203                 if (grep $h_test eq $_, qw( AA II UU EE )) {
204                     $hr->{hepburn} = '';
205                 }
206
207                 # 氏名に「オウ」又は「オオ」の長音が含まれる場合、
208                 # 「 O 」 か 「 OH 」 のいずれかの表記を選択することができる
209                 if (grep $h_test eq $_, qw( OO OU )) {
210                     $hr->{hepburn} = $self->{long_vowels_h} ? 'H' : '';
211                 }
212             }
213
214             $output .= $hr->{hepburn};
215         } else {
216             if ($self->{strict}) {
217                 croak "Can't find hepburn replacement for $hr->{char}";
218             }
219             $output .= $hr->{char};
220         }
221
222         $last_hepburn = $hr->{hepburn};
223         $last_char    = $hr->{char};
224         $i += length $hr->{char};
225     }
226
227     return $output;
228 }
229
230 1;
231 __END__
232
233 =encoding utf-8
234
235 =head1 NAME
236
237 Lingua::JA::Hepburn::Passport - Hepburn Romanization using Japanese passport rules
238
239 =head1 SYNOPSIS
240
241   use utf8;
242   use Lingua::JA::Hepburn::Passport;
243
244   my $hepburn = Lingua::JA::Hepburn::Passport->new;
245   $hepburn->romanize("みやがわ");     # MIYAGAWA
246   $hepburn->romanize("おおの");       # ONO
247   $hepburn->romanize("かとう");       # KATO
248   $hepburn->romanize("ゆうこ");       # YUKO
249   $hepburn->romanize("なんば");       # NAMBA
250   $hepburn->romanize("はっちょう");   # HATCHO
251
252   # Indicate long vowels by "h"
253   my $hepburn = Lingua::JA::Hepburn::Passport->new( long_vowels_h => 1 );
254   $hepburn->romanize("おおの");       # OHNO
255   $hepburn->romanize("かとう");       # KATOH
256
257 =head1 DESCRIPTION
258
259 Lingua::JA::Hepburn::Passport is a Hiragana/Katakana to Romanization
260 engine using Japanese passport rules.
261
262 =head1 WHY
263
264 There is already a couple of Hepburn romanization modules on CPAN (See
265 L</"SEE ALSO">), but none of them conform to the conversion rule
266 defined in Japanese passport regulation. This one does.
267
268 =head1 METHODS
269
270 =over 4
271
272 =item new
273
274   $hepburn = Lingua::JA::Hepburn::Passport->new;
275   $hepburn = Lingua::JA::Hepburn::Passport->new( long_vowels_h => 1 );
276
277 Creates a new object. Optionally you can pass I<long_vowels_h>
278 parameter to 1, with which this module tries to add I<H> to the long
279 vowels I<OO> and I<OU>, as allowed in Japanese passport rules.
280
281 =item romanize
282
283   $roman = $hepburn->romanize( $kana );
284
285 Romanizes the string I<$kana> using Hepburn romanization. I<$kana>
286 should be either Hiragana or Katakana, as an Unicode string in Perl
287 (a.k.a UTF-8 flagged), otherwise it throws an error. Returned
288 I<$roman> would be all upper case roman letters.
289
290 =back
291
292 This module doesn't come with I<deromanize> method (yet), which would
293 do the Roman to Katakana/Hiragana translation, since I don't think we
294 need it. Other modules on CPAN already do the job quite nicely.
295
296 =head1 AUTHOR
297
298 Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
299
300 =head1 LICENSE
301
302 This library is free software; you can redistribute it and/or modify
303 it under the same terms as Perl itself.
304
305 Code algorithm is based on L<http://www.d-project.com/hebonconv/>
306
307 =head1 SEE ALSO
308
309 L<http://www.seikatubunka.metro.tokyo.jp/hebon/>, L<http://en.wikipedia.org/wiki/Hepburn_romanization>, L<Lingua::JA::Romanize::Kana>, L<Lingua::JA::Kana>
310
311 =cut
Note: See TracBrowser for help on using the browser.