1 #--------------------------------------------------------------------
2 # Copyright (c) 1999-2004, International Business Machines
3 # Corporation and others. All Rights Reserved.
4 #--------------------------------------------------------------------
6 # note: a global filter is more efficient, but MUST include all source chars
7 #:: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]] ;
8 # MINIMAL FILTER GENERATED FOR: Latin-Katakana
9 ### WARNING -- must add width filter, both here and below!!! ###
# Forward-direction global filter: only characters in this set are handed to
# the transforms and rules that follow.
10 :: [[\u1100-\u1112\u111A\u1121\u1160-\u1175\u11AA\u11AC-\u11AD\u11B0-\u11B5\u2190-\u2193\u2502\u25A0\u25CB\u3000-\u3002\u300C-\u300D\u3099-\u309A\u30A1-\u30ED\u30EF\u30F2-\u30F4\u30F7\u30FA-\u30FC\uFF01-\uFF5E\uFFE0-\uFFE6][',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B]] ;
# Forward only: run the fullwidth-halfwidth transform over Latin-script
# characters before the rules; the empty () gives it no reverse counterpart.
12 :: [:Latin:] fullwidth-halfwidth ();
# Forward only: lowercase the input; again the empty () means nothing is
# applied at this point in the reverse direction.
14 :: Lower (); # whenever transliterating from cased to uncased script, include this
15 # :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
17 # Uses modified Hepburn. Small changes to make unambiguous.
19 # | Kunrei-shiki: Hepburn/MHepburn
20 # | ------------------------------
38 # | For foreign words:
60 # Most small forms are generated, but if necessary
61 # explicit small forms are given with ~a, ~ya, etc.
63 #------------------------------------------------------
# Lowercase Latin letters other than the vowels a, e, i, o, u.
67 $consonant = [bcdfghjklmnpqrstvwxyz] ;
70 # Variables used for doubled-consonants with tsu
# NOTE(review): U+3041-U+3094 lies in the Hiragana block; within this excerpt
# $kana is referenced only by a commented-out rule.
72 $kana = [\u3041-\u3094] ;
# Voiced sound marks: combining U+3099 and spacing U+309B.
74 $voice = [\u3099\u309B];
# Semi-voiced sound marks: combining U+309A and spacing U+309C.
75 $semivoice = [\u309A\u309C];
# Each $X_start set lists the kana (katakana and hiragana forms) that the
# doubled-consonant rules accept after ッ when the doubled Latin letter is X.
77 $k_start = [カキクケコかきくけこ] ;
79 $s_start = [サシスセソさしすせそ] ;
# シ/し followed by a voiced sound mark, i.e. the decomposed form of ジ/じ.
81 $j_start = [シし] $voice ;
83 $t_start = [タチツテトたちつてと] ;
# Unlike the other sets, this one also contains ン.
85 $n_start = [ナニヌネノンなにぬねの] ;
# フ/ふ are absent here; the rules that need them combine $h_start with
# $f_start, which is defined outside this excerpt.
87 $h_start = [ハヒヘホはひへほ] ;
90 $m_start = [マミムメモまみむめも] ;
94 $r_start = [ラリルレロらりるれろ] ;
96 $w_start = [ワヰヱヲわゐゑを] ;
100 # if ン is followed by $n_quoter, then it needs an
101 # apostrophe after its romaji form to disambiguate it.
102 # e.g., ン ア != ナ, so represent as "n'a", not "na".
104 $n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;
# U+309D is the hiragana iteration mark. NOTE(review): the active iteration
# rules visible in this excerpt match the literal katakana mark ヽ instead of
# this variable -- confirm whether $iteration is still used elsewhere.
108 $iteration = \u309D ;
110 #------------------------------------------------------
117 # ' ' } [a-z] > ; # delete spaces before latin
118 # ' ' < [^' '\u30A0-\u30ff] {} ['\u30A0-\u30ff] ; # insert spaces before katakana (U+30A0-U+30FF)
121 # Copy previous letter & marks
124 # | $1 $1 < ($kana [[:M:]$voice$semivoice]?) $iteration
126 # Specials for katakana -- not shared with hiragana
135 # ~~~ begin shared rules ~~~
# Rules for consonant + y/small-kana combinations. Two shapes recur:
#
#   Xy } $vowel > K | '~y' ;
#     Forward (Latin to kana): X (plus y where written) followed by a vowel is
#     replaced by the kana K, and '~y' is emitted after the cursor (|) so that
#     later rules process it together with the vowel. The header of this file
#     states that explicit small forms are written with ~a, ~ya, etc.
#
#   X | '~' < K } $small_y ;
#     Reverse (kana to Latin): the kana K, when followed by $small_y, is
#     replaced by X, and '~' is emitted after the cursor so that it is
#     processed together with the following kana.
#
# $vowel and $small_y are defined outside this excerpt.
# Rule order matters: do not reorder these lines.
149 b | '~' < ヒ ゙} $small_y ;
150 by } $vowel > ビ | '~y' ;
168 dji'~i' < ヂィ ; # liu
173 dj } $vowel > ヂ | '~y' ;
175 # TODO: QUESTION: use ĵĴżŻ instead of dj, dz
183 ch } $vowel > チ | '~y' ;
187 g | '~' < ギ} $small_y ;
188 gy } $vowel > ギ | '~y' ;
198 # j } $vowel > ジ | '~y' ;
207 k | '~' < キ} $small_y ;
208 ky } $vowel > キ | '~y' ;
216 m | '~' < ミ} $small_y ;
217 my } $vowel > ミ | '~y' ;
227 n | '~' < ニ } $small_y ;
228 ny } $vowel > ニ | '~y' ;
238 p | '~' < ピ } $small_y ;
239 py } $vowel > ピ | '~y' ;
247 h | '~' < ヒ } $small_y ;
248 hy } $vowel > ヒ | '~y' ;
256 # f | '~' < フ } $small_y ;
257 # f } $vowel > フ | '~' ;
265 r | '~' < リ } $small_y ;
266 ry } $vowel > リ | '~y' ;
292 sh } $vowel > シ | '~y' ;
302 # v } $vowel > ヴ | '~' ;
312 # w } $vowel > ウ | '~' ;
302 # v } $vowel > ヴ | '~' ;
312 # w } $vowel > ウ | '~' ;
# Doubled consonants and ッ. Each rule is bidirectional (<>):
#   forward: a consonant that is followed by the same consonant becomes ッ;
#   reverse: ッ becomes that consonant when the following kana is in the
#            matching $X_start set.
# The entries that require a trailing $voice / $semivoice (and $j_start, which
# itself ends in $voice) are listed before the plain k/h/t/s entries, so the
# more specific context is tried first. Keep this order.
# $f_start, $v_start and $y_start are defined outside this excerpt.
334 j } j <> ッ } $j_start ;
335 b } b <> ッ } [$h_start$f_start] $voice;
336 d } d <> ッ } $t_start $voice;
337 g } g <> ッ } $k_start $voice;
338 p } p <> ッ } [$h_start$f_start] $semivoice;
339 # v } v <> ッ } [ワヰウヱヲう] $voice ;
340 z } z <> ッ } $s_start $voice;
341 v } v <> ッ } $v_start;
345 k } k <> ッ } $k_start ;
346 m } m <> ッ } $m_start ;
347 n } n <> ッ } $n_start ;
348 h } h <> ッ } $h_start ;
349 f } f <> ッ } $f_start ;
350 r } r <> ッ } $r_start ;
351 t } t <> ッ } $t_start ;
352 s } s <> ッ } $s_start ;
354 w } w <> ッ } $w_start;
355 y } y <> ッ } $y_start;
367 # prolonged vowel mark. this indicates a doubling of
368 # the preceding vowel sound
394 # TODO: make more accurate
# Reverse-direction (kana to Latin) handling of the iteration mark ヽ.
#
# Shape of each rule:   OUT $1 < CTX (y* $vowel) { ヽ $voice ;
#   - the text before '{' is left context: the already-romanized previous
#     syllable, whose optional y's and vowel are captured as $1;
#   - ヽ followed by a voiced sound mark is replaced by OUT followed by $1.
#
# First group: the previous syllable starts with an unvoiced consonant and the
# iteration mark is voiced, so the repeated syllable takes the voiced
# counterpart (sh->j, ch->dj, ts->dz, k->g, s->z, t->d, h->b, w->v).
# The sh/ch rules must stay ahead of the h rule, because "shi"/"chi" also end
# in h + vowel.
396 j $1 < sh (y* $vowel) {ヽ$voice ;
397 dj $1 < ch (y* $vowel) {ヽ$voice ;
398 dz $1 < ts (y* $vowel) {ヽ$voice ;
400 g $1 < k (y* $vowel) {ヽ$voice ;
401 z $1 < s (y* $vowel) {ヽ$voice ;
402 d $1 < t (y* $vowel) {ヽ$voice ;
# Fixed: this rule previously read "h $1 < b", the reverse of every sibling
# rule, so a voiced repeat of an h-syllable came out as "hihi" instead of
# "hibi", and a repeat after a b-syllable was wrongly devoiced.
403 b $1 < h (y* $vowel) {ヽ$voice ;
404 v $1 < w (y* $vowel) {ヽ$voice ;
# Second group: the repeated syllable keeps the same consonant.
# NOTE(review): the sh, ch and ts rules below have exactly the same pattern as
# the first three rules of the group above and therefore look unreachable --
# confirm whether they were meant to match ヽ without $voice.
406 sh $1 < sh (y* $vowel) {ヽ$voice ;
407 j $1 < j (y* $vowel) {ヽ$voice ;
408 ch $1 < ch (y* $vowel) {ヽ$voice ;
409 dj $1 < dj(y* $vowel) {ヽ$voice ;
410 ts $1 < ts (y* $vowel) {ヽ$voice ;
411 dz $1 < dz (y* $vowel) {ヽ$voice ;
# Fallbacks, from most to least specific: repeat a single consonant plus
# vowel, then repeat any single preceding character, then drop the mark.
413 $1 < ($consonant y* $vowel) {ヽ$voice? ;
414 $1 < (.) {ヽ $voice? ; # otherwise repeat last character
415 < ヽ $voice? ; # delete if no characters found
417 # h- rule: lengthens vowel if not followed by a vowel
421 # one-way latin -> kana rules. these do not occur in
422 # well-formed romaji representing actual japanese text.
423 # their purpose is to make all romaji map to kana of
426 # the following are not really necessary, but produce
427 # slightly more natural results.
438 # isolated consonants listed here so as not to mask
439 # longer rules above.
# Reverse only: ン followed by a kana in $n_quoter is written as n plus an
# apostrophe ('' is the rule-syntax escape for a literal '), which keeps the
# romaji unambiguous (e.g. "n'a" rather than "na").
451 n'' < ン } $n_quoter ;
471 # simple substitutions using backup
478 # ~~~ END shared rules ~~~
480 #------------------------------------------------------
# Forward-direction cleanup, run after the rules above: both rules have an
# empty replacement, so the matched text is simply removed.
483 '~' > ; # delete stray tildes between letters
# Here '' is a literal apostrophe; it is removed only when it sits between a
# Katakana character (left context) and a Latin character (right context).
484 [:Katakana:] { '' } [:Latin:] > ; # delete stray quotes between letters
485 # [\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
# Reverse direction only (everything is inside the parentheses): run the
# halfwidth-fullwidth transform over Katakana characters.
488 :: ([:Katakana:] halfwidth-fullwidth);
490 # note: a global filter is more efficient, but MUST include all source chars!!
491 #:: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]]);
492 # MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
# Reverse-direction global filter: only characters in this set are processed
# when the transform runs from Katakana to Latin.
493 :: ( [[\ -~\u00A2-\u00A3\u00A5-\u00A6\u00AC\u0304\u20A9\uFF61-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC\uFFE8-\uFFEE][~\u3001-\u3002\u304C\u304E\u3050\u3052\u3054\u3056\u3058\u305A\u305C\u305E\u3060\u3062\u3065\u3067\u3069\u3070-\u3071\u3073-\u3074\u3076-\u3077\u3079-\u307A\u307C-\u307D\u3094\u3099-\u309B\u309E\u30A1-\u30FA\u30FC-\u30FE]] ) ;