]> git.saurik.com Git - apple/icu.git/blob - icuSources/data/translit/Latin_Katakana.txt
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / data / translit / Latin_Katakana.txt
1 #--------------------------------------------------------------------
2 # Copyright (c) 1999-2004, International Business Machines
3 # Corporation and others. All Rights Reserved.
4 #--------------------------------------------------------------------
5
6 # note: a global filter is more efficient, but MUST include all source chars
7 #:: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]] ;
8 # MINIMAL FILTER GENERATED FOR: Latin-Katakana
9 ### WARNING -- must add width filter, both here and below!!! ###
# Forward-direction global filter: a leading ":: [set] ;" restricts the whole
# transform to the characters in the set, so it has to contain every character
# the forward rules may need to read.
10 :: [[\u1100-\u1112\u111A\u1121\u1160-\u1175\u11AA\u11AC-\u11AD\u11B0-\u11B5\u2190-\u2193\u2502\u25A0\u25CB\u3000-\u3002\u300C-\u300D\u3099-\u309A\u30A1-\u30ED\u30EF\u30F2-\u30F4\u30F7\u30FA-\u30FC\uFF01-\uFF5E\uFFE0-\uFFE6][',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B]] ;
11
# Pre-processing steps. In ":: X (Y) ;" X runs in the forward (Latin -> Katakana)
# direction and Y in the reverse direction; empty parentheses mean no reverse step.
12 :: [:Latin:] fullwidth-halfwidth (); # forward only, limited to Latin characters
13 :: NFD (NFC); # forward: decompose (accents become separate marks); reverse: recompose
14 :: Lower (); # whenever transliterating from cased to uncased script, include this
15 # :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
16
17 # Uses modified Hepburn. Small changes to make unambiguous.
18
19 # | Kunrei-shiki: Hepburn/MHepburn
20 # | ------------------------------
21 # | si: shi
22 # | si ~ya: sha
23 # | si ~yu: shu
24 # | si ~yo: sho
25 # | zi: ji
26 # | zi ~ya: ja
27 # | zi ~yu: ju
28 # | zi ~yo: jo
29 # | ti: chi
30 # | ti ~ya: cha
31 # | ti ~yu: chu
32 # | ti ~yo: cho
33 # | tu: tsu
34 # | di: ji/dji
35 # | du: zu/dzu
36 # | hu: fu
37
38 # | For foreign words:
39 # | -----------------
40 # | se ~i si
41 # | si ~e she
42 # |
43 # | ze ~i zi
44 # | zi ~e je
45 # |
46 # | te ~i ti
47 # | ti ~e che
48 # | te ~u tu
49 # |
50 # | de ~i di
51 # | de ~u du
52 # | de ~i di
53 # |
54 # | he ~u: hu
55 # | hu ~a fa
56 # | hu ~i fi
57 # | hu ~e fe
58 # | hu ~o fo
59
60 # Most small forms are generated, but if necessary
61 # explicit small forms are given with ~a, ~ya, etc.
62
63 #------------------------------------------------------
64 # Variables
65
# Character classes and literals referenced by the rules in this file.
66 $vowel = [aeiou] ;
67 $consonant = [bcdfghjklmnpqrstvwxyz] ;
68 $macron = \u0304 ; # U+0304 combining macron; mapped to/from ー by the "$macron <> ー" rule
69
70 # Variables used for doubled-consonants with tsu
71
72 $kana = [\u3041-\u3094] ; # in this file only the commented-out iteration-mark TODO rule refers to it
73
74 $voice = [\u3099\u309B]; # voiced sound mark: combining U+3099 or spacing U+309B
75 $semivoice = [\u309A\u309C]; # semi-voiced sound mark: combining U+309A or spacing U+309C
76
# The $X_start sets below are the kana allowed to follow ッ in the
# doubled-consonant rules ("k } k <> ッ } $k_start" and friends).
77 $k_start = [カキクケコかきくけこ] ;
78
79 $s_start = [サシスセソさしすせそ] ;
80
81 $j_start = [シし] $voice ; # シ/し followed by a voiced sound mark
82
83 $t_start = [タチツテトたちつてと] ;
84
85 $n_start = [ナニヌネノンなにぬねの] ;
86
87 $h_start = [ハヒヘホはひへほ] ;
88 $f_start = [フふ] ;
89
90 $m_start = [マミムメモまみむめも] ;
91
92 $y_start = [ヤユヨやゆよ] ;
93
94 $r_start = [ラリルレロらりるれろ] ;
95
96 $w_start = [ワヰヱヲわゐゑを] ;
97
98 $v_start = [ワヰヱヲ]゙ ; # one of ワヰヱヲ followed by the combining voiced sound mark
99
100 # if ン is followed by $n_quoter, then it needs an
101 # apostrophe after its romaji form to disambiguate it.
102 # e.g., ン ア != ナ, so represent as "n'a", not "na".
103
104 $n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;
105
106 $small_y = [ャィュェョ] ; # small kana used as the following context in the "X | '~' < ... } $small_y" rules
107
108 $iteration = \u309D ; # U+309D; in this file only the commented-out iteration-mark TODO rule refers to it
109
110 #------------------------------------------------------
111 # katakana rules
112
113 # Punctuation
114
# "<>" rules convert in both directions: ASCII full stop / comma to and from
# the ideographic full stop / comma.
115 '.' <> 。;
116 ',' <> 、;
117 # ' ' } [a-z] > ; # delete spaces before latin
118 # ' ' < [^' '\u30A0-\u30ff] {} ['\u30A0-\u30ff] ; #insert spaces before katakana (U+30A0-U+30FF)
119
120 # Iteration Mark
121 # Copy previous letter & marks
122
123 # TODO
124 # | $1 $1 < ($kana [[:M:]$voice$semivoice]?) $iteration
125
126 # Specials for katakana -- not shared with hiragana
127
# Single-code-point V-row katakana and the small ka/ke; the small forms are
# written with a leading tilde in romaji, so the tilde is quoted as a literal.
128 va <> ヷ ;
129 vi <> ヸ ;
130 ve <> ヹ ;
131 vo <> ヺ ;
132 '~ka' <> ヵ ;
133 '~ke' <> ヶ ;
134
135 # ~~~ begin shared rules ~~~
136
137 #special
138
# Reverse-only ("<") rules: a literal tilde followed by a small kana becomes
# y + vowel. The tilde is the one emitted by the "X | '~' < ... } $small_y"
# rules, whose cursor mark "|" makes processing resume at that tilde
# (e.g. キャ -> k|~ャ -> kya).
139 ya < '~'ャ;
140 yi < '~'ィ ;
141 yu < '~'ュ;
142 ye < '~'ェ;
143 yo < '~'ョ;
144
145 #normal
146
# Syllable rules. Direction markers: "<>" both ways, ">" Latin -> kana only,
# "<" kana -> Latin only. "}" introduces context that must follow the match
# but is not consumed; "|" marks where processing resumes in the output.
147 a <> ア ;
148
# Reverse: the voiced ヒ before a small y-kana yields "b" plus a tilde that is
# re-scanned together with the small kana.
149 b | '~' < ヒ ゙} $small_y ;
# Forward: "by" before a vowel becomes ビ followed by "~y", which is re-scanned
# with the vowel and handled by the small-form rules ('~ya' etc.).
150 by } $vowel > ビ | '~y' ;
151
152 ba <> バ ;
153 bi <> ビ ;
154 bu <> ブ ;
155 be <> ベ ;
156 bo <> ボ ;
157
# Forward only: c before i or e is rewritten to s and re-scanned from that s.
158 c } i > | s ;
159 c } e > | s ;
160
161 da <> ダ ;
162 di <> ディ ; # written with small ィ, matching "de ~i: di" in the header table
163 du <> デゥ ; # written with small ゥ, matching "de ~u: du" in the header table
164 de <> デ ;
165 do <> ド ;
166 dzu <> ヅ ;
167 dja < ヂャ ; # reverse only; forward "dja" goes through the "dj } $vowel" rule below
168 dji'~i' < ヂィ ; # liu
169 dju < ヂュ ;
170 dje < ヂェ ;
171 djo < ヂョ ;
172 dji <> ヂ ;
173 dj } $vowel > ヂ | '~y' ; # forward: emit ヂ, then re-scan "~y" + vowel as a small form
174
175 # TODO: QUESTION: use ĵĴżŻ instead of dj, dz
176
177 cha < チャ ; # reverse only; forward "cha" goes through the "ch } $vowel" rule below
178 chi'~i' < チィ ; # liu
179 chu < チュ ;
180 che < チェ ;
181 cho < チョ ;
182 chi <> チ ;
183 ch } $vowel > チ | '~y' ; # forward: emit チ, then re-scan "~y" + vowel as a small form
184
# Syllable rules, continued. Each consonant row follows the same shape:
#   X | '~' < <i-kana> } $small_y   reverse: i-row kana before a small y-kana
#                                   yields the bare consonant plus a tilde
#                                   that is re-scanned with the small kana
#   Xy } $vowel > <i-kana> | '~y'   forward: "Xy" before a vowel yields the
#                                   i-row kana, then "~y" + vowel is re-scanned
#   Xa/Xi/Xu/Xe/Xo <> kana          plain two-way syllables
185 e <> エ ;
186
187 g | '~' < ギ} $small_y ;
188 gy } $vowel > ギ | '~y' ;
189
190 ga <> ガ ;
191 gi <> ギ ;
192 gu <> グ ;
193 ge <> ゲ ;
194 go <> ゴ ;
195
196 i <> イ ;
197
198 # j } $vowel > ジ | '~y' ;
199
# The j row is spelled out explicitly instead of using the commented-out
# generic rule above.
200 ja <> ジャ ;
201 ji'~i' < ジィ ; # liu
202 ju <> ジュ ;
203 je <> ジェ ;
204 jo <> ジョ ;
205 ji <> ジ ;
206
207 k | '~' < キ} $small_y ;
208 ky } $vowel > キ | '~y' ;
209
210 ka <> カ ;
211 ki <> キ ;
212 ku <> ク ;
213 ke <> ケ ;
214 ko <> コ ;
215
216 m | '~' < ミ} $small_y ;
217 my } $vowel > ミ | '~y' ;
218
219 ma <> マ ;
220 mi <> ミ ;
221 mu <> ム ;
222 me <> メ ;
223 mo <> モ ;
224
225 m } [pbfv] > ン ; # forward only: m directly before p, b, f or v is written ン
226
227 n | '~' < ニ } $small_y ;
228 ny } $vowel > ニ | '~y' ;
229
230 na <> ナ ;
231 ni <> ニ ;
232 nu <> ヌ ;
233 ne <> ネ ;
234 no <> ノ ;
235
236 o <> オ ;
237
238 p | '~' < ピ } $small_y ;
239 py } $vowel > ピ | '~y' ;
240
241 pa <> パ ;
242 pi <> ピ ;
243 pu <> プ ;
244 pe <> ペ ;
245 po <> ポ ;
246
247 h | '~' < ヒ } $small_y ;
248 hy } $vowel > ヒ | '~y' ;
249
250 ha <> ハ ;
251 hi <> ヒ ;
252 hu <> ヘゥ ; # "hu" is ヘ + small ゥ ("he ~u: hu" in the header table); フ is romanized "fu"
253 he <> ヘ ;
254 ho <> ホ ;
255
256 # f | '~' < フ } $small_y ;
257 # f } $vowel > フ | '~' ;
258
# Syllable rules, continued ("<>" both ways, ">" Latin -> kana only,
# "<" kana -> Latin only).
# The f row is フ plus a small vowel, except "fu", which is フ on its own.
259 fa <> ファ ;
260 fi <> フィ ;
261 fe <> フェ ;
262 fo <> フォ ;
263 fu <> フ ;
264
# Reverse: リ before a small y-kana yields "r" plus a tilde that is re-scanned
# with the small kana. Forward: "ry" before a vowel yields リ, then "~y" +
# vowel is re-scanned as a small form.
265 r | '~' < リ } $small_y ;
266 ry } $vowel > リ | '~y' ;
267
268 ra <> ラ ;
269 ri <> リ ;
270 ru <> ル ;
271 re <> レ ;
272 ro <> ロ ;
273
274 za <> ザ ;
275 zi <> ゼィ ; # "zi" is ゼ + small ィ ("ze ~i: zi" in the header table); ジ is romanized "ji"
276 zu <> ズ ;
277 ze <> ゼ ;
278 zo <> ゾ ;
279
280 sa <> サ ;
281 si <> セィ ; # "si" is セ + small ィ ("se ~i: si" in the header table); シ is romanized "shi"
282 su <> ス ;
283 se <> セ ;
284 so <> ソ ;
285
286 sha < シャ ; # reverse only; forward "sha" goes through the "sh } $vowel" rule below
287 shi'~i' < シィ ; # liu
288 shu < シュ ;
289 she < シェ ;
290 sho < ショ ;
291 shi <> シ ;
292 sh } $vowel > シ | '~y' ; # forward: emit シ, then re-scan "~y" + vowel as a small form
293
294 ta <> タ ;
295 ti <> ティ ; # "ti" is テ + small ィ ("te ~i: ti" in the header table); チ is romanized "chi"
296 tu <> テゥ ; # "tu" is テ + small ゥ ("te ~u: tu" in the header table); ツ is romanized "tsu"
297 te <> テ ;
298 to <> ト ;
299
300 tsu <> ツ ;
301
302 # v } $vowel > ヴ | '~' ;
303
304 #'v~a' < ヴァ ; # liu
305 #'v~i' < ヴィ ; # liu
306 #'v~e' < ヴェ ; # liu
307 #'v~o' < ヴォ ; # liu
308 vu <> ヴ ;
309
310 u <> ウ ;
311
312 # w } $vowel > ウ | '~' ;
313
314 wa <> ワ ;
315 wi <> ヰ ;
316 wu > ウ ; # forward only: ウ already reverses to "u"
317 we <> ヱ ;
318 wo <> ヲ ;
319
320 ya <> ヤ ;
321 yi > イ ; # forward only: イ already reverses to "i"
322 yu <> ユ ;
323 ye > エ ; # forward only: エ already reverses to "e"
324 yo <> ヨ ;
325
326 # double consonants
327
328 #specials
# Doubled consonants are written with small ッ.
# Forward ("X } X"): the first of two identical consonants becomes ッ; the
# second is only context and is converted by the syllable rules afterwards.
# Reverse ("ッ } <set>"): ッ becomes the consonant of the kana that follows it.
# Forward only: "ssh" and "tch" double the following sh / ch syllable.
329 s } sh > ッ ;
330 t } ch > ッ ;
331
332 #voiced
333
# These require a sound mark after the base kana and are listed before the
# plain rules, presumably so that in reverse a marked kana is not claimed by
# the plain rule for the same base kana.
334 j } j <> ッ } $j_start ;
335 b } b <> ッ } [$h_start$f_start] $voice;
336 d } d <> ッ } $t_start $voice;
337 g } g <> ッ } $k_start $voice;
338 p } p <> ッ } [$h_start$f_start] $semivoice;
339 # v } v <> ッ } [ワヰウヱヲう] $voice ;
340 z } z <> ッ } $s_start $voice;
341 v } v <> ッ } $v_start;
342
343 # normal
344
345 k } k <> ッ } $k_start ;
346 m } m <> ッ } $m_start ;
347 n } n <> ッ } $n_start ;
348 h } h <> ッ } $h_start ;
349 f } f <> ッ } $f_start ;
350 r } r <> ッ } $r_start ;
351 t } t <> ッ } $t_start ;
352 s } s <> ッ } $s_start ;
353
354 w } w <> ッ } $w_start;
355 y } y <> ッ } $y_start;
356
357 # completeness
# Forward only: letter pairs with no kana row of their own still produce ッ.
358 x } x > ッ ;
359 c } k > ッ ;
360 c } c > ッ ;
361 c } q > ッ ;
362 l } l > ッ ;
363 q } q > ッ ;
364 # y } y > ッ ;
365 # w } w > ッ ;
366
367 # prolonged vowel mark. this indicates a doubling of
368 # the preceding vowel sound
369
370 #a < a { ー ; # liu
371 #e < e { ー ; # liu
372 #i < i { ー ; # liu
373 #o < o { ー ; # liu
374 #u < u { ー ; # liu
375
# The combining macron held in $macron (U+0304) and the prolonged sound mark ー
# map to each other in both directions.
376 $macron <> ー ;
377
378 # small forms
379
# Explicit small kana are written in romaji with a leading tilde; the tilde is
# quoted so it is taken as a literal character.
380 '~a' <> ァ ;
381 '~i' <> ィ ;
382 '~u' <> ゥ ;
383 '~e' <> ェ ;
384 '~o' <> ォ ;
385 '~tsu' <> ッ ;
386 '~wa' <> ヮ ;
387 '~ya' <> ャ ;
388 '~yi' > ィ ; # forward only: ィ already reverses to '~i'
389 '~yu' <> ュ ;
390 '~ye' > ェ ; # forward only: ェ already reverses to '~e'
391 '~yo' <> ョ ;
392
393 # iteration marks
394 # TODO: make more accurate
395
# Reverse-only handling of the iteration mark ヽ, optionally followed by a
# voiced sound mark ($voice). The text before "{" is context: the romaji
# already produced for the preceding syllable. "(y* $vowel)" captures that
# syllable's tail as $1 so the output can repeat it after a new consonant.
#
# Voiced mark after an unvoiced syllable: repeat it with the voiced consonant.
396 j $1 < sh (y* $vowel) {ヽ$voice ;
397 dj $1 < ch (y* $vowel) {ヽ$voice ;
398 dz $1 < ts (y* $vowel) {ヽ$voice ;
399
400 g $1 < k (y* $vowel) {ヽ$voice ;
401 z $1 < s (y* $vowel) {ヽ$voice ;
402 d $1 < t (y* $vowel) {ヽ$voice ;
# Fixed: this rule previously read "h $1 < b", i.e. context and output were
# swapped relative to the k->g, s->z, t->d and w->v rules around it, so a
# voiced mark after "ba" produced "ha" and one after "ha" was not voiced.
403 b $1 < h (y* $vowel) {ヽ$voice ;
404 v $1 < w (y* $vowel) {ヽ$voice ;
405
# Same consonant in context and output.
# NOTE(review): the sh / ch / ts lines here have the same left-hand pattern as
# the first three rules of this section, which are listed earlier - presumably
# they are never selected; confirm.
406 sh $1 < sh (y* $vowel) {ヽ$voice ;
407 j $1 < j (y* $vowel) {ヽ$voice ;
408 ch $1 < ch (y* $vowel) {ヽ$voice ;
409 dj $1 < dj(y* $vowel) {ヽ$voice ;
410 ts $1 < ts (y* $vowel) {ヽ$voice ;
411 dz $1 < dz (y* $vowel) {ヽ$voice ;
412
# Fallbacks, with or without the sound mark: repeat the whole preceding
# syllable, else the single preceding character, else drop the mark.
413 $1 < ($consonant y* $vowel) {ヽ$voice? ;
414 $1 < (.) {ヽ $voice? ; # otherwise repeat last character
415 < ヽ $voice? ; # delete if no characters found
416
417 # h- rule: lengthens vowel if not followed by a vowel
418
# NOTE(review): as written, the vowel is the text being replaced and "h" is
# only the context that must follow it. The unconditional vowel rules
# (a <> ア, e <> エ, i <> イ, o <> オ, u <> ウ) are listed earlier and would
# appear to match first - confirm this rule is reachable and does what the
# "h- rule" comment describes.
419 [aeiou] } h > ー ;
420
421 # one-way latin -> kana rules. these do not occur in
422 # well-formed romaji representing actual japanese text.
423 # their purpose is to make all romaji map to kana of
424 # some sort.
425
426 # the following are not really necessary, but produce
427 # slightly more natural results.
428
429 cy > セィ ;
430 dy > ディ ;
431 hy > ヒ ;
432 sy > セィ ;
433 ty > ティ ;
434 zy > ゼィ ;
435
436 h > ヘ ;
437
438 # isolated consonants listed here so as not to mask
439 # longer rules above.
440
441 ch > チ;
442 sh > シ ;
443 dz > ヅ ;
444 dj > ヂ;
445
446 b > ブ ;
447 d > デ ;
448 g > グ ;
449 k > ク ;
450 m > ム ;
451 n'' < ン } $n_quoter ; # reverse only: '' is a literal apostrophe, so ン before a $n_quoter kana becomes n'
452 n <> ン ;
453 p > プ ;
454 r > ル ;
455 s > ス ;
456 t > テ ;
457 y > イ ;
458 z > ズ ;
459 v > ヴ ;
460
461 f > フ;
462 j > ジ;
463 w > ウ;
464
# Forward only: rewrite these letters as plain ASCII letters and re-scan from
# the start of the replacement ("|" placed before the output).
465 ß > | ss ;
466 æ > | e ;
467 ð > | d ;
468 ø > | u ;
469 þ > | th ;
470
471 # simple substitutions using backup
472
473 c > | k ;
474 l > | r ;
475 q > | k ;
476 x > | ks ;
477
478 # ~~~ END shared rules ~~~
479
480 #------------------------------------------------------
481 # Final cleanup
482
# Forward-only cleanup rules with empty output, i.e. deletions.
483 '~' > ; # delete stray tildes between letters
484 [:Katakana:] { '' } [:Latin:] > ; # delete stray quotes between letters
485 # [\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
486
# Post-processing. In ":: X (Y) ;" X runs in the forward direction and Y in the
# reverse direction; a fully parenthesized entry is reverse-only.
# NOTE(review): the reverse direction applies NFD here, so the "<" rules would
# see voiced kana as base + combining mark; the kana literals in those rules
# must then be in decomposed form to match - confirm their normalization form
# survived in this copy of the file.
487 :: NFC (NFD) ;
488 :: ([:Katakana:] halfwidth-fullwidth);
489
490 # note: a global filter is more efficient, but MUST include all source chars!!
491 #:: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]]);
492 # MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
# Reverse-direction global filter (parenthesized, so it applies only when
# converting Katakana -> Latin).
493 :: ( [[\ -~\u00A2-\u00A3\u00A5-\u00A6\u00AC\u0304\u20A9\uFF61-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC\uFFE8-\uFFEE][~\u3001-\u3002\u304C\u304E\u3050\u3052\u3054\u3056\u3058\u305A\u305C\u305E\u3060\u3062\u3065\u3067\u3069\u3070-\u3071\u3073-\u3074\u3076-\u3077\u3079-\u307A\u307C-\u307D\u3094\u3099-\u309B\u309E\u30A1-\u30FA\u30FC-\u30FE]] ) ;
494
495 # eof