]> git.saurik.com Git - apple/icu.git/blob - icuSources/data/translit/Latn_Kana.txt
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / data / translit / Latn_Kana.txt
1 # ***************************************************************************
2 # *
3 # * Copyright (C) 2004-2016, International Business Machines
4 # * Corporation; Unicode, Inc.; and others. All Rights Reserved.
5 # *
6 # ***************************************************************************
7 # File: Latn_Kana.txt
8 # Generated from CLDR
9 #
10
11 # note: a global filter is more efficient, but MUST include all source chars
12 #:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]] ;
13 # MINIMAL FILTER GENERATED FOR: Latin-Katakana
14 ### WARNING -- must add width filter, both here and below!!! ###
15 :: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」\u3099-\u309Aァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ;
16 :: [:Latin:] fullwidth-halfwidth (); # forward: fold fullwidth Latin to halfwidth; the empty () gives this step no inverse
17 :: NFD (NFC); # forward: decompose, so a macron is seen as a separate \u0304; reverse: recompose with NFC
18 :: Lower (); # whenever transliterating from cased to uncased script, include this
19 # :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
20 # Uses modified Hepburn. Small changes to make unambiguous.
21 # | Kunrei-shiki: Hepburn/MHepburn
22 # | ------------------------------
23 # | si: shi
24 # | si ~ya: sha
25 # | si ~yu: shu
26 # | si ~yo: sho
27 # | zi: ji
28 # | zi ~ya: ja
29 # | zi ~yu: ju
30 # | zi ~yo: jo
31 # | ti: chi
32 # | ti ~ya: cha
33 # | ti ~yu: chu
34 # | ti ~yo: cho
35 # | tu: tsu
36 # | di: ji/dji
37 # | du: zu/dzu
38 # | hu: fu
39 # | For foreign words:
40 # | -----------------
41 # | se ~i si
42 # | si ~e she
43 # |
44 # | ze ~i zi
45 # | zi ~e je
46 # |
47 # | te ~i ti
48 # | ti ~e che
49 # | te ~u tu
50 # |
51 # | de ~i di
52 # | de ~u du
53 # | de ~i di
54 # |
55 # | he ~u: hu
56 # | hu ~a fa
57 # | hu ~i fi
58 # | hu ~e fe
59 # | hu ~o fo
60 # Most small forms are generated, but if necessary
61 # explicit small forms are given with ~a, ~ya, etc.
62 #------------------------------------------------------
63 # Variables
64 $vowel = [aeiou] ;
65 $consonant = [bcdfghjklmnpqrstvwxyz] ;
66 $macron = \u0304 ; # combining macron; paired with the prolonged-sound mark by the `$macron ↔ ー` rule
67 # Variables used for doubled-consonants with tsu: each $X_start lists the kana that may follow ッ for consonant X
68 $kana = [ぁ-ゔ] ; # hiragana range; in this file only the commented-out iteration TODO rule refers to it
69 $voice = [\u3099゛]; # voiced sound mark, combining or spacing form
70 $semivoice = [\u309A゜]; # semi-voiced sound mark, combining or spacing form
71 $k_start = [カキクケコかきくけこ] ;
72 $s_start = [サシスセソさしすせそ] ;
73 $j_start = [シし] $voice ; # a sequence, not a set: シ/し followed by a voicing mark
74 $t_start = [タチツテトたちつてと] ;
75 $n_start = [ナニヌネノンなにぬねの] ;
76 $h_start = [ハヒヘホはひへほ] ;
77 $f_start = [フふ] ;
78 $m_start = [マミムメモまみむめも] ;
79 $y_start = [ヤユヨやゆよ] ;
80 $r_start = [ラリルレロらりるれろ] ;
81 $w_start = [ワヰヱヲわゐゑを] ;
82 $v_start = [ワヰヱヲ]\u3099 ; # a sequence: w-row katakana followed by the combining voicing mark
83 $voweled_basekana = [ァ-オカキクケコサシスセソタチッツテトナ-ノハヒフヘホマ-ヲヵヶ] ; # left context of the h-lengthening rule
84 # If ン is followed by a kana in $n_quoter, it needs an
85 # apostrophe after its romaji form to disambiguate it.
86 # e.g., ン ア != ナ, so represent it as "n'a", not "na".
87 $n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;
88 $small_y = [ャィュェョ] ; # small kana used as right context by the `X | '~' ← …` reverse rules
89 $iteration = ゝ ; # in this file only the commented-out iteration TODO rule refers to it
90 #------------------------------------------------------
91 # katakana rules
92 # Punctuation
93 '.' ↔ 。;
94 ',' ↔ 、;
95 # ' ' } [a-z] → ; # delete spaces before latin
96 # ' ' ← [^' '゠-ヿ] {} ['゠-ヿ] ; #insert spaces before katakana (゠-ヿ is the Katakana block)
97 # Iteration Mark
98 # Copy the previous letter and its marks
99 # TODO
100 # | $1 $1 ← ($kana [[:M:]$voice$semivoice]?) $iteration
101 # Specials for katakana -- not shared with hiragana
102 va ↔ ワ\u3099 ;
103 vi ↔ ヰ\u3099 ;
104 ve ↔ ヱ\u3099 ;
105 vo ↔ ヲ\u3099 ;
106 '~ka' ↔ ヵ ; # small ka; the quoted '~' is a literal tilde marking a small form
107 '~ke' ↔ ヶ ; # small ke
108 # ~~~ begin shared rules ~~~
109 # special: reverse-only rules that romanize a small kana after an earlier rule has emitted a literal '~'
110 ya ← '~'ャ;
111 yi ← '~'ィ ;
112 yu ← '~'ュ;
113 ye ← '~'ェ;
114 yo ← '~'ョ;
115 # normal: one syllable per rule; ↔ applies in both directions, → Latin-to-kana only, ← kana-to-Latin only
116 a ↔ ア ;
117 b | '~' ← ヒ \u3099} $small_y ; # reverse: ヒ+voicing before a small y-kana yields "b"; the cursor (|) stays before '~' so '~'+small kana is re-matched by the special rules above
118 by } $vowel → ヒ\u3099 | '~y' ; # forward: "by" before a vowel yields ヒ+voicing; '~y' plus the vowel is then re-matched by a '~y…' small-form rule
119 ba ↔ ハ\u3099 ;
120 bi ↔ ヒ\u3099 ;
121 bu ↔ フ\u3099 ;
122 be ↔ ヘ\u3099 ;
123 bo ↔ ホ\u3099 ;
124 c } i → | s ; # c before i is rewritten to s and reprocessed (cursor placed before the s)
125 c } e → | s ; # same for c before e
126 da ↔ タ\u3099 ;
127 di ↔ テ\u3099ィ ;
128 du ↔ テ\u3099ゥ ;
129 de ↔ テ\u3099 ;
130 do ↔ ト\u3099 ;
131 dzu ↔ ツ\u3099 ;
132 dja ← チ\u3099ャ ;
133 dji'~i' ← チ\u3099ィ ; # liu
134 dju ← チ\u3099ュ ;
135 dje ← チ\u3099ェ ;
136 djo ← チ\u3099ョ ;
137 dji ↔ チ\u3099 ;
138 dj } $vowel → チ\u3099 | '~y' ; # forward counterpart of the reverse-only dja..djo rules above
139 # TODO: QUESTION: use ĵĴżŻ instead of dj, dz
140 cha ← チャ ;
141 chi'~i' ← チィ ; # liu
142 chu ← チュ ;
143 che ← チェ ;
144 cho ← チョ ;
145 chi ↔ チ ;
146 ch } $vowel → チ | '~y' ; # forward counterpart of the reverse-only cha..cho rules above
147 e ↔ エ ;
148 g | '~' ← キ\u3099} $small_y ; # reverse: キ+voicing before a small y-kana yields "g", then '~'+small kana is reprocessed
149 gy } $vowel → キ\u3099 | '~y' ; # forward: "gy" before a vowel yields キ+voicing, then '~y'+vowel is reprocessed
150 ga ↔ カ\u3099 ;
151 gi ↔ キ\u3099 ;
152 gu ↔ ク\u3099 ;
153 ge ↔ ケ\u3099 ;
154 go ↔ コ\u3099 ;
155 i ↔ イ ;
156 # j } $vowel → シ\u3099 | '~y' ;
157 ja ↔ シ\u3099ャ ;
158 ji'~i' ← シ\u3099ィ ; # liu
159 ju ↔ シ\u3099ュ ;
160 je ↔ シ\u3099ェ ;
161 jo ↔ シ\u3099ョ ;
162 ji ↔ シ\u3099 ;
163 k | '~' ← キ} $small_y ;
164 ky } $vowel → キ | '~y' ;
165 ka ↔ カ ;
166 ki ↔ キ ;
167 ku ↔ ク ;
168 ke ↔ ケ ;
169 ko ↔ コ ;
170 m | '~' ← ミ} $small_y ;
171 my } $vowel → ミ | '~y' ;
172 ma ↔ マ ;
173 mi ↔ ミ ;
174 mu ↔ ム ;
175 me ↔ メ ;
176 mo ↔ モ ;
177 m } [pbfv] → ン ; # forward only: m before p, b, f or v is written ン
178 n | '~' ← ニ } $small_y ;
179 ny } $vowel → ニ | '~y' ;
180 na ↔ ナ ;
181 ni ↔ ニ ;
182 nu ↔ ヌ ;
183 ne ↔ ネ ;
184 no ↔ ノ ;
185 o ↔ オ ;
186 p | '~' ← ヒ\u309A } $small_y ;
187 py } $vowel → ヒ\u309A | '~y' ;
188 pa ↔ ハ\u309A ;
189 pi ↔ ヒ\u309A ;
190 pu ↔ フ\u309A ;
191 pe ↔ ヘ\u309A ;
192 po ↔ ホ\u309A ;
193 h | '~' ← ヒ } $small_y ;
194 hy } $vowel → ヒ | '~y' ;
195 ha ↔ ハ ;
196 hi ↔ ヒ ;
197 hu ↔ ヘゥ ; # "hu" is spelled ヘ + small ゥ; plain フ is "fu" (see the `fu ↔ フ` rule)
198 he ↔ ヘ ;
199 ho ↔ ホ ;
200 # f | '~' ← フ } $small_y ;
201 # f } $vowel → フ | '~' ;
202 fa ↔ ファ ;
203 fi ↔ フィ ;
204 fe ↔ フェ ;
205 fo ↔ フォ ;
206 fu ↔ フ ;
207 r | '~' ← リ } $small_y ;
208 ry } $vowel → リ | '~y' ;
209 ra ↔ ラ ;
210 ri ↔ リ ;
211 ru ↔ ル ;
212 re ↔ レ ;
213 ro ↔ ロ ;
214 za ↔ サ\u3099 ;
215 zi ↔ セ\u3099ィ ; # "zi" is spelled セ+voicing + small ィ; シ+voicing is "ji" (see the `ji` rule)
216 zu ↔ ス\u3099 ;
217 ze ↔ セ\u3099 ;
218 zo ↔ ソ\u3099 ;
219 sa ↔ サ ;
220 si ↔ セィ ; # "si" is spelled セ + small ィ; plain シ is "shi" (see the `shi` rule)
221 su ↔ ス ;
222 se ↔ セ ;
223 so ↔ ソ ;
224 sha ← シャ ;
225 shi'~i' ← シィ ; # liu
226 shu ← シュ ;
227 she ← シェ ;
228 sho ← ショ ;
229 shi ↔ シ ;
230 sh } $vowel → シ | '~y' ; # forward counterpart of the reverse-only sha..sho rules above
231 ta ↔ タ ;
232 ti ↔ ティ ;
233 tu ↔ テゥ ;
234 te ↔ テ ;
235 to ↔ ト ;
236 tsu ↔ ツ ;
237 # v } $vowel → ウ\u3099 | '~' ;
238 #'v~a' ← ウ\u3099ァ ; # liu
239 #'v~i' ← ウ\u3099ィ ; # liu
240 #'v~e' ← ウ\u3099ェ ; # liu
241 #'v~o' ← ウ\u3099ォ ; # liu
242 vu ↔ ウ\u3099 ;
243 u ↔ ウ ;
244 # w } $vowel → ウ | '~' ;
245 wa ↔ ワ ;
246 wi ↔ ヰ ;
247 wu → ウ ; # forward only: ウ itself romanizes as "u"
248 we ↔ ヱ ;
249 wo ↔ ヲ ;
250 ya ↔ ヤ ;
251 yi → イ ; # forward only: イ itself romanizes as "i"
252 yu ↔ ユ ;
253 ye → エ ; # forward only: エ itself romanizes as "e"
254 yo ↔ ヨ ;
255 # double consonants: the first letter of a doubled consonant is written as small ッ; in reverse, ッ takes the consonant of the kana that follows it
256 # specials: forward-only cases where the doubled sound is spelled with a digraph (ssh, tch)
257 s } sh → ッ ;
258 t } ch → ッ ;
259 # voiced: listed before the plain rows, whose $X_start sets would also match the unmarked base kana
260 j } j ↔ ッ } $j_start ;
261 b } b ↔ ッ } [$h_start$f_start] $voice;
262 d } d ↔ ッ } $t_start $voice;
263 g } g ↔ ッ } $k_start $voice;
264 p } p ↔ ッ } [$h_start$f_start] $semivoice;
265 # v } v ↔ ッ } [ワヰウヱヲう] $voice ;
266 z } z ↔ ッ } $s_start $voice;
267 v } v ↔ ッ } $v_start;
268 # normal
269 k } k ↔ ッ } $k_start ;
270 m } m ↔ ッ } $m_start ;
271 n } n ↔ ッ } $n_start ;
272 h } h ↔ ッ } $h_start ;
273 f } f ↔ ッ } $f_start ;
274 r } r ↔ ッ } $r_start ;
275 t } t ↔ ッ } $t_start ;
276 s } s ↔ ッ } $s_start ;
277 w } w ↔ ッ } $w_start;
278 y } y ↔ ッ } $y_start;
279 # completeness: forward-only doublings for Latin letters that have no $X_start kana set
280 x } x → ッ ;
281 c } k → ッ ;
282 c } c → ッ ;
283 c } q → ッ ;
284 l } l → ッ ;
285 q } q → ッ ;
286 # y } y → ッ ;
287 # w } w → ッ ;
288 # prolonged vowel mark. this indicates a doubling of
289 # the preceding vowel sound
290 #a ← a { ー ; # liu
291 #e ← e { ー ; # liu
292 #i ← i { ー ; # liu
293 #o ← o { ー ; # liu
294 #u ← u { ー ; # liu
295 $macron ↔ ー ; # the prolonged-sound mark round-trips with the combining macron (\u0304)
296 # small forms: explicit romaji for small kana, written with a literal '~' prefix
297 '~a' ↔ ァ ;
298 '~i' ↔ ィ ;
299 '~u' ↔ ゥ ;
300 '~e' ↔ ェ ;
301 '~o' ↔ ォ ;
302 '~tsu' ↔ ッ ;
303 '~wa' ↔ ヮ ;
304 '~ya' ↔ ャ ;
305 '~yi' → ィ ; # forward only: ィ itself romanizes as '~i'
306 '~yu' ↔ ュ ;
307 '~ye' → ェ ; # forward only: ェ itself romanizes as '~e'
308 '~yo' ↔ ョ ;
309 # iteration marks (reverse only): the text left of `{` is romaji already produced for the preceding kana; $1 is its captured y* + vowel tail
310 # TODO: make more accurate. ヽ + $voice repeats the previous syllable with its consonant voiced (sh→j, ch→dj, ts→dz, k→g, s→z, t→d, h→b, w→v)
311 j $1 ← sh (y* $vowel) {ヽ$voice ;
312 dj $1 ← ch (y* $vowel) {ヽ$voice ;
313 dz $1 ← ts (y* $vowel) {ヽ$voice ;
314 g $1 ← k (y* $vowel) {ヽ$voice ;
315 z $1 ← s (y* $vowel) {ヽ$voice ;
316 d $1 ← t (y* $vowel) {ヽ$voice ;
317 b $1 ← h (y* $vowel) {ヽ$voice ; # was `h $1 ← b`, which devoiced; b is the voiced counterpart of h (cf. ha ↔ ハ, ba ↔ ハ\u3099)
318 v $1 ← w (y* $vowel) {ヽ$voice ;
319 sh $1 ← sh (y* $vowel) {ヽ$voice ; # NOTE(review): same context and key as the `j $1 ← sh` rule above, so this looks unreachable -- confirm intent
320 j $1 ← j (y* $vowel) {ヽ$voice ; # an already-voiced consonant stays as it is
321 ch $1 ← ch (y* $vowel) {ヽ$voice ; # NOTE(review): same context and key as the `dj $1 ← ch` rule above -- confirm intent
322 dj $1 ← dj(y* $vowel) {ヽ$voice ;
323 ts $1 ← ts (y* $vowel) {ヽ$voice ; # NOTE(review): same context and key as the `dz $1 ← ts` rule above -- confirm intent
324 dz $1 ← dz (y* $vowel) {ヽ$voice ;
325 $1 ← ($consonant y* $vowel) {ヽ$voice? ; # any other single-consonant syllable is repeated unchanged, with or without the voicing mark
326 $1 ← (.) {ヽ $voice? ; # otherwise repeat last character
327 ← ヽ $voice? ; # delete if no characters found
328 # h- rule: lengthens vowel if not followed by a vowel.
329 # At the point this is applied, latin [cons]?vowel sequences
330 # have been converted to katakana in NFD form.
331 $voweled_basekana [\u3099 \u309A]? { h → ー ; # has no right context of its own: h + vowel is consumed by the earlier ha/hi/hu/he/ho and hy rules
332 # One-way Latin → kana rules. These do not occur in
333 # well-formed romaji representing actual Japanese text.
334 # Their purpose is to make all romaji map to kana of
335 # some sort.
336 # The following are not really necessary, but produce
337 # slightly more natural results.
338 cy → セィ ;
339 dy → テ\u3099ィ ;
340 hy → ヒ ;
341 sy → セィ ;
342 ty → ティ ;
343 zy → セ\u3099ィ ;
344 h → ヘ ;
345 # isolated consonants listed here so as not to mask
346 # longer rules above.
347 ch → チ;
348 sh → シ ;
349 dz → ツ\u3099 ;
350 dj → チ\u3099;
351 b → フ\u3099 ;
352 d → テ\u3099 ;
353 g → ク\u3099 ;
354 k → ク ;
355 m → ム ;
356 n'' ← ン } $n_quoter ; # '' is an escaped apostrophe: reverse emits n' when ン precedes a kana in $n_quoter
357 n ↔ ン ;
358 p → フ\u309A ;
359 r → ル ;
360 s → ス ;
361 t → テ ;
362 y → イ ;
363 z → ス\u3099 ;
364 v → ウ\u3099 ;
365 f → フ;
366 j → シ\u3099;
367 w → ウ;
368 ß → | ss ; # non-ASCII Latin letters: rewrite to an ASCII spelling and reprocess it (cursor placed before the output)
369 æ → | e ;
370 ð → | d ;
371 ø → | u ;
372 þ → | th ;
373 # simple substitutions: replace the letter and back the cursor up (|) so the replacement is reprocessed
374 c → | k ;
375 l → | r ;
376 q → | k ;
377 x → | ks ;
378 # ~~~ END shared rules ~~~
379 #------------------------------------------------------
380 # Final cleanup
381 '~' → ; # delete stray tildes between letters
382 [:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters ('' is an escaped apostrophe; it must sit between a katakana and a Latin letter)
383 # [ʾ[:Nonspacing Mark:]-[\u3099-゜]] → ; # delete any non-spacing marks that we didn't use
384 :: NFC (NFD) ; # forward: recompose the output; reverse: decompose the input
385 :: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth); # wholly in parentheses, so it applies in the reverse direction only
386 # note: a global filter is more efficient, but MUST include all source chars!!
387 #:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]]);
388 # MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
389 :: ( [[\ -~¢-£¥-¦¬\u0304₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ\u3099-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;
390 # eof
391