]> git.saurik.com Git - apple/icu.git/blob - icuSources/data/translit/t_Latn_Kana.txt
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / data / translit / t_Latn_Kana.txt
1  // -*- Coding: utf-8; -*-
2 //--------------------------------------------------------------------
3 // Copyright (c) 1999-2002, International Business Machines
4 // Corporation and others. All Rights Reserved.
5 //--------------------------------------------------------------------
6 // THIS IS A MACHINE-GENERATED FILE
7 // Tool: dumpicurules.bat
8 // Source: ../../../impl/data/Transliterator_Latin_Katakana.txt
9 // Date: Sat Jul 27 10:31:07 2002
10 //--------------------------------------------------------------------
11
12 // Latin_Katakana
13
14 t_Latn_Kana {
15 Rule {
16 //--------------------------------------------------------------------
17 //--------------------------------------------------------------------
18 //--------------------------------------------------------------------
19
20 // note: a global filter is more efficient, but MUST include all source chars
21 //:: [\\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]] ;
22 // MINIMAL FILTER GENERATED FOR: Latin-Katakana
23 //## WARNING -- must add width filter, both here and below!!! ###
24 ":: [[\u1100-\u1112\u111A\u1121\u1160-\u1175\u11AA\u11AC-\u11AD\u11B0-\u11B5\u2190-\u2193\u2502\u25A0\u25CB\u3000-\u3002\u300C-\u300D\u3099-\u309A\u30A1-\u30ED\u30EF\u30F2-\u30F4\u30F7\u30FA-\u30FC\uFF01-\uFF5E\uFFE0-\uFFE6][',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B]] ;"
25
// Pre-processing steps that run before the conversion rules below.
// In a "::" line the part outside the parentheses names the transform used in
// the forward (Latin -> Katakana) direction and the part inside the
// parentheses names the one used in the reverse direction; empty parentheses
// mean that nothing is applied in reverse.
// A leading set such as [:Latin:] restricts the transform to those characters.
26 ":: [:Latin:] fullwidth-halfwidth ();"
27 ":: NFD (NFC);"
28 ":: Lower ();" // whenever transliterating from cased to uncased script, include this
29 // :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
30
31 // Uses modified Hepburn. Small changes to make unambiguous.
32
33 // | Kunrei-shiki: Hepburn/MHepburn
34 // | ------------------------------
35 // | si: shi
36 // | si ~ya: sha
37 // | si ~yu: shu
38 // | si ~yo: sho
39 // | zi: ji
40 // | zi ~ya: ja
41 // | zi ~yu: ju
42 // | zi ~yo: jo
43 // | ti: chi
44 // | ti ~ya: cha
45 // | ti ~yu: chu
46 // | ti ~yo: cho
47 // | tu: tsu
48 // | di: ji/dji
49 // | du: zu/dzu
50 // | hu: fu
51
52 // | For foreign words:
53 // | -----------------
54 // | se ~i si
55 // | si ~e she
56 // |
57 // | ze ~i zi
58 // | zi ~e je
59 // |
60 // | te ~i ti
61 // | ti ~e che
62 // | te ~u tu
63 // |
64 // | de ~i di
65 // | de ~u du
66 // | de ~i di
67 // |
68 // | he ~u: hu
69 // | hu ~a fa
70 // | hu ~i fi
71 // | hu ~e fe
72 // | hu ~o fo
73
74 // Most small forms are generated, but if necessary
75 // explicit small forms are given with ~a, ~ya, etc.
76
77 //------------------------------------------------------
78 // Variables
79
// $vowel:     the five lowercase Latin vowels.
// $consonant: every other lowercase ASCII letter (y is treated as a consonant).
//             Only lowercase letters are listed; the ":: Lower ()" step above
//             lowercases the input in the forward direction.
// $macron:    U+0304 COMBINING MACRON, mapped to and from the prolonged sound
//             mark by the "$macron <> ー" rule further down.
80 "$vowel = [aeiou] ;"
81 "$consonant = [bcdfghjklmnpqrstvwxyz] ;"
82 "$macron = \u0304 ;"
83
84 // Variables used for doubled-consonants with tsu
85
// $kana spans U+3041..U+3094, which lies in the Hiragana block. In the rules
// visible in this file it is referenced only by the commented-out
// iteration-mark TODO, not by any active rule.
86 "$kana = [\u3041-\u3094] ;"
87
// $voice / $semivoice: the combining form and the spacing form of the voiced
// (U+3099, U+309B) and semi-voiced (U+309A, U+309C) sound marks. They are used
// as trailing context after a base kana in the doubled-consonant rules.
88 "$voice = [\u3099\u309B];"
89 "$semivoice = [\u309A\u309C];"
90
91 "$k_start = [カキクケコかきくけこ] ;"
92
93 "$s_start = [サシスセソさしすせそ] ;"
94
95 "$j_start = [シし] $voice ;"
96
97 "$t_start = [タチツテトたちつてと] ;"
98
99 "$n_start = [ナニヌネノンなにぬねの] ;"
100
101 "$h_start = [ハヒヘホはひへほ] ;"
102 "$f_start = [フふ] ;"
103
104 "$m_start = [マミムメモまみむめも] ;"
105
106 "$y_start = [ヤユヨやゆよ] ;"
107
108 "$r_start = [ラリルレロらりるれろ] ;"
109
110 "$w_start = [ワヰヱヲわゐゑを] ;"
111
112 "$v_start = [ワヰヱヲ]゙ ;"
113
114 // if ン is followed by $n_quoter, then it needs an
115 // apostrophe after its romaji form to disambiguate it.
116 // e.g., ン ア != ナ, so represent as "n'a", not "na".
117
118 "$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;"
119
// $small_y: the small kana ャ ィ ュ ェ ョ. Used as trailing context by the
// reverse rules of the form  X | '~' < <i-row kana> } $small_y  below.
120 "$small_y = [ャィュェョ] ;"
121
// $iteration is U+309D (HIRAGANA ITERATION MARK). The active iteration-mark
// rules near the end of this file use the literal katakana mark ヽ instead;
// $iteration itself is referenced only by the commented-out TODO rule.
122 "$iteration = \u309D ;"
123
124 //------------------------------------------------------
125 // katakana rules
126
127 // Punctuation
128
129 "'.' <> 。;"
130 "',' <> 、;"
131 // ' ' } [a-z] > ; # delete spaces before latin
132 // ' ' < [^' '\u30A0-\u30ff] {} ['\u30A0-\u30ff] ; #insert spaces before katakana
133
134 // Iteration Mark
135 // Copy previous letter & marks
136
137 // TODO
138 // | $1 $1 < ($kana [[:M:]$voice$semivoice]?) $iteration
139
140 // Specials for katakana -- not shared with hiragana
141
142 "va <> ヷ ;"
143 "vi <> ヸ ;"
144 "ve <> ヹ ;"
145 "vo <> ヺ ;"
146 "'~ka' <> ヵ ;"
147 "'~ke' <> ヶ ;"
148
149 // ~~~ begin shared rules ~~~
150
151 //special
152
// Reverse-only ("<") rules: a literal '~' followed by a small kana becomes
// ya / yi / yu / ye / yo.
// The '~' is produced by the reverse consonant rules of the form
//   k | '~' < キ} $small_y
// which emit the consonant plus '~' and leave the cursor ("|") in front of
// the '~', so that '~' + small kana is picked up here (e.g. キャ -> k, then
// ~ャ -> ya, giving "kya").
// These rules must stay ahead of the generic  '~ya' <> ャ  small-form rules.
153 "ya < '~'ャ;"
154 "yi < '~'ィ ;"
155 "yu < '~'ュ;"
156 "ye < '~'ェ;"
157 "yo < '~'ョ;"
158
159 //normal
160
161 "a <> ア ;"
162
163 "b | '~' < ヒ ゙} $small_y ;"
164 "by } $vowel > ビ | '~y' ;"
165
166 "ba <> バ ;"
167 "bi <> ビ ;"
168 "bu <> ブ ;"
169 "be <> ベ ;"
170 "bo <> ボ ;"
171
// Soft c (forward only): a "c" that is followed by i or e is rewritten to "s".
// The "|" puts the cursor in front of the new "s", so the resulting si / se is
// then converted by the s rules. Any other "c" falls through to the ch rules
// or to the later  c > | k  substitution.
172 "c } i > | s ;"
173 "c } e > | s ;"
174
175 "da <> ダ ;"
176 "di <> ディ ;"
177 "du <> デゥ ;"
178 "de <> デ ;"
179 "do <> ド ;"
180 "dzu <> ヅ ;"
181 "dja < ヂャ ;"
182 "dji'~i' < ヂィ ;" // liu
183 "dju < ヂュ ;"
184 "dje < ヂェ ;"
185 "djo < ヂョ ;"
186 "dji <> ヂ ;"
187 "dj } $vowel > ヂ | '~y' ;"
188
189 // TODO: QUESTION: use ĵĴżŻ instead of dj, dz
190
191 "cha < チャ ;"
192 "chi'~i' < チィ ;" // liu
193 "chu < チュ ;"
194 "che < チェ ;"
195 "cho < チョ ;"
196 "chi <> チ ;"
197 "ch } $vowel > チ | '~y' ;"
198
199 "e <> エ ;"
200
201 "g | '~' < ギ} $small_y ;"
202 "gy } $vowel > ギ | '~y' ;"
203
204 "ga <> ガ ;"
205 "gi <> ギ ;"
206 "gu <> グ ;"
207 "ge <> ゲ ;"
208 "go <> ゴ ;"
209
210 "i <> イ ;"
211
212 // j } $vowel > ジ | '~y' ;
213
214 "ja <> ジャ ;"
215 "ji'~i' < ジィ ;" // liu
216 "ju <> ジュ ;"
217 "je <> ジェ ;"
218 "jo <> ジョ ;"
219 "ji <> ジ ;"
220
221 "k | '~' < キ} $small_y ;"
222 "ky } $vowel > キ | '~y' ;"
223
224 "ka <> カ ;"
225 "ki <> キ ;"
226 "ku <> ク ;"
227 "ke <> ケ ;"
228 "ko <> コ ;"
229
230 "m | '~' < ミ} $small_y ;"
231 "my } $vowel > ミ | '~y' ;"
232
233 "ma <> マ ;"
234 "mi <> ミ ;"
235 "mu <> ム ;"
236 "me <> メ ;"
237 "mo <> モ ;"
238
239 "m } [pbfv] > ン ;"
240
241 "n | '~' < ニ } $small_y ;"
242 "ny } $vowel > ニ | '~y' ;"
243
244 "na <> ナ ;"
245 "ni <> ニ ;"
246 "nu <> ヌ ;"
247 "ne <> ネ ;"
248 "no <> ノ ;"
249
250 "o <> オ ;"
251
252 "p | '~' < ピ } $small_y ;"
253 "py } $vowel > ピ | '~y' ;"
254
255 "pa <> パ ;"
256 "pi <> ピ ;"
257 "pu <> プ ;"
258 "pe <> ペ ;"
259 "po <> ポ ;"
260
261 "h | '~' < ヒ } $small_y ;"
262 "hy } $vowel > ヒ | '~y' ;"
263
264 "ha <> ハ ;"
265 "hi <> ヒ ;"
266 "hu <> ヘゥ ;"
267 "he <> ヘ ;"
268 "ho <> ホ ;"
269
270 // f | '~' < フ } $small_y ;
271 // f } $vowel > フ | '~' ;
272
273 "fa <> ファ ;"
274 "fi <> フィ ;"
275 "fe <> フェ ;"
276 "fo <> フォ ;"
277 "fu <> フ ;"
278
279 "r | '~' < リ } $small_y ;"
280 "ry } $vowel > リ | '~y' ;"
281
282 "ra <> ラ ;"
283 "ri <> リ ;"
284 "ru <> ル ;"
285 "re <> レ ;"
286 "ro <> ロ ;"
287
288 "za <> ザ ;"
289 "zi <> ゼィ ;"
290 "zu <> ズ ;"
291 "ze <> ゼ ;"
292 "zo <> ゾ ;"
293
294 "sa <> サ ;"
295 "si <> セィ ;"
296 "su <> ス ;"
297 "se <> セ ;"
298 "so <> ソ ;"
299
300 "sha < シャ ;"
301 "shi'~i' < シィ ;" // liu
302 "shu < シュ ;"
303 "she < シェ ;"
304 "sho < ショ ;"
305 "shi <> シ ;"
306 "sh } $vowel > シ | '~y' ;"
307
308 "ta <> タ ;"
309 "ti <> ティ ;"
310 "tu <> テゥ ;"
311 "te <> テ ;"
312 "to <> ト ;"
313
314 "tsu <> ツ ;"
315
316 // v } $vowel > ヴ | '~' ;
317
318 //'v~a' < ヴァ ; # liu
319 //'v~i' < ヴィ ; # liu
320 //'v~e' < ヴェ ; # liu
321 //'v~o' < ヴォ ; # liu
322 "vu <> ヴ ;"
323
324 "u <> ウ ;"
325
326 // w } $vowel > ウ | '~' ;
327
328 "wa <> ワ ;"
329 "wi <> ヰ ;"
330 "wu > ウ ;"
331 "we <> ヱ ;"
332 "wo <> ヲ ;"
333
334 "ya <> ヤ ;"
335 "yi > イ ;"
336 "yu <> ユ ;"
337 "ye > エ ;"
338 "yo <> ヨ ;"
339
340 // double consonants
341
// The first letter of a doubled consonant is written with the small tsu ッ.
// In a rule  X } X <> ッ } SET :
//   forward: an X that is immediately followed by another X (trailing
//            context, not consumed) becomes ッ;
//   reverse: a ッ that is followed by text matching SET becomes X.
342 //specials
// Forward only: the s of "ssh" and the t of "tch" become ッ.
343 "s } sh > ッ ;"
344 "t } ch > ッ ;"
345
346 //voiced
// The reverse context here is a base kana followed by a (semi-)voiced sound
// mark, i.e. the decomposed spelling of the voiced kana.
// These rules are listed before the "normal" ones; presumably so that ッ in
// front of a voiced kana yields g/z/d/b/p rather than the unvoiced consonant,
// since the normal rules would also match the bare base kana.
347
348 "j } j <> ッ } $j_start ;"
349 "b } b <> ッ } [$h_start$f_start] $voice;"
350 "d } d <> ッ } $t_start $voice;"
351 "g } g <> ッ } $k_start $voice;"
352 "p } p <> ッ } [$h_start$f_start] $semivoice;"
353 // v } v <> ッ } [ワヰウヱヲう] $voice ;
354 "z } z <> ッ } $s_start $voice;"
355 "v } v <> ッ } $v_start;"
356
357 // normal
358
359 "k } k <> ッ } $k_start ;"
360 "m } m <> ッ } $m_start ;"
361 "n } n <> ッ } $n_start ;"
362 "h } h <> ッ } $h_start ;"
363 "f } f <> ッ } $f_start ;"
364 "r } r <> ッ } $r_start ;"
365 "t } t <> ッ } $t_start ;"
366 "s } s <> ッ } $s_start ;"
367
368 "w } w <> ッ } $w_start;"
369 "y } y <> ①"
380
381 // prolonged vowel mark. this indicates a doubling of
382 // the preceding vowel sound
383
384 //a < a { ー ; # liu
385 //e < e { ー ; # liu
386 //i < i { ー ; # liu
387 //o < o { ー ; # liu
388 //u < u { ー ; # liu
389
390 "$macron <> ー ;"
391
392 // small forms
393
394 "'~a' <> ァ ;"
395 "'~i' <> ィ ;"
396 "'~u' <> ゥ ;"
397 "'~e' <> ェ ;"
398 "'~o' <> ォ ;"
399 "'~tsu' <> ッ ;"
400 "'~wa' <> ヮ ;"
401 "'~ya' <> ャ ;"
402 "'~yi' > ィ ;"
403 "'~yu' <> ュ ;"
404 "'~ye' > ェ ;"
405 "'~yo' <> ョ ;"
406
407 // iteration marks
408 // TODO: make more accurate
409
410 "j $1 < sh (y* $vowel) {ヽ$voice ;"
411 "dj $1 < ch (y* $vowel) {ヽ$voice ;"
412 "dz $1 < ts (y* $vowel) {ヽ$voice ;"
413
414 "g $1 < k (y* $vowel) {ヽ$voice ;"
415 "z $1 < s (y* $vowel) {ヽ$voice ;"
416 "d $1 < t (y* $vowel) {ヽ$voice ;"
417 "h $1 < b (y* $vowel) {ヽ$voice ;"
418 "v $1 < w (y* $vowel) {ヽ$voice ;"
419
420 "sh $1 < sh (y* $vowel) {ヽ$voice ;"
421 "j $1 < j (y* $vowel) {ヽ$voice ;"
422 "ch $1 < ch (y* $vowel) {ヽ$voice ;"
423 "dj $1 < dj(y* $vowel) {ヽ$voice ;"
424 "ts $1 < ts (y* $vowel) {ヽ$voice ;"
425 "dz $1 < dz (y* $vowel) {ヽ$voice ;"
426
427 "$1 < ($consonant y* $vowel) {ヽ$voice? ;"
428 "$1 < (.) {ヽ $voice? ;" // otherwise repeat last character
429 "< ヽ $voice? ;" // delete if no characters found
430
431 // h- rule: lengthens vowel if not followed by a vowel
432
433 "[aeiou] } h > ー ;"
434
435 // one-way latin- > kana rules. these do not occur in
436 // well-formed romaji representing actual japanese text.
437 // their purpose is to make all romaji map to kana of
438 // some sort.
439
440 // the following are not really necessary, but produce
441 // slightly more natural results.
442
443 "cy > セィ ;"
444 "dy > ディ ;"
445 "hy > ヒ ;"
446 "sy > セィ ;"
447 "ty > ティ ;"
448 "zy > ゼィ ;"
449
450 "h > ヘ ;"
451
452 // isolated consonants listed here so as not to mask
453 // longer rules above.
454
455 "ch > チ;"
456 "sh > シ ;"
457 "dz > ヅ ;"
458 "dj > ヂ;"
459
460 "b > ブ ;"
461 "d > デ ;"
462 "g > グ ;"
463 "k > ク ;"
464 "m > ム ;"
465 "n'' < ン } $n_quoter ;"
466 "n <> ン ;"
467 "p > プ ;"
468 "r > ル ;"
469 "s > ス ;"
470 "t > テ ;"
471 "y > イ ;"
472 "z > ズ ;"
473 "v > ヴ ;"
474
475 "f > フ;"
476 "j > ジ;"
477 "w > ウ;"
478
479 "ß > | ss ;"
480 "æ > | e ;"
481 "ð > | d ;"
482 "ø > | u ;"
483 "þ > | th ;"
484
485 // simple substitutions using backup
486
487 "c > | k ;"
488 "l > | r ;"
489 "q > | k ;"
490 "x > | ks ;"
491
492 // ~~~ END shared rules ~~~
493
494 //------------------------------------------------------
495 // Final cleanup
496
497 "'~' > ;" // delete stray tildes between letters
498 "[:Katakana:] { '' } [:Latin:] > ;" // delete stray quotes between letters
499 // [\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
500
501 ":: NFC (NFD) ;"
502 ":: ([:Katakana:] halfwidth-fullwidth);"
503
504 // note: a global filter is more efficient, but MUST include all source chars!!
505 //:: ([\\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]]);
506 // MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
507 ":: ( [[\\\ -~\u00A2-\u00A3\u00A5-\u00A6\u00AC\u0304\u20A9\uFF61-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC\uFFE8-\uFFEE][~\u3001-\u3002\u304C\u304E\u3050\u3052\u3054\u3056\u3058\u305A\u305C\u305E\u3060\u3062\u3065\u3067\u3069\u3070-\u3071\u3073-\u3074\u3076-\u3077\u3079-\u307A\u307C-\u307D\u3094\u3099-\u309B\u309E\u30A1-\u30FA\u30FC-\u30FE]] ) ;"
508
509 // eof
510 }
511 }