]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | # © 2016 and later: Unicode, Inc. and others. |
2 | # License & terms of use: http://www.unicode.org/copyright.html#License | |
3 | # | |
2ca993e8 | 4 | # File: Latn_Kana.txt |
f3c0d7a5 | 5 | # Generated from CLDR |
73c04bcf | 6 | # |
2ca993e8 A |
7 | |
8 | # note: a global filter is more efficient, but MUST include all source chars | |
9 | #:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]] ; | |
10 | # MINIMAL FILTER GENERATED FOR: Latin-Katakana | |
11 | ### WARNING -- must add width filter, both here and below!!! ### | |
729e4ab9 | 12 | :: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」\u3099-\u309Aァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ; |
374ca955 A |
13 | :: [:Latin:] fullwidth-halfwidth (); |
14 | :: NFD (NFC); | |
51004dcb | 15 | :: Lower (); # whenever transliterating from cased to uncased script, include this |
2ca993e8 A |
16 | # :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese |
17 | # Uses modified Hepburn. Small changes to make unambiguous. | |
18 | # | Kunrei-shiki: Hepburn/MHepburn | |
19 | # | ------------------------------ | |
20 | # | si: shi | |
21 | # | si ~ya: sha | |
22 | # | si ~yu: shu | |
23 | # | si ~yo: sho | |
24 | # | zi: ji | |
25 | # | zi ~ya: ja | |
26 | # | zi ~yu: ju | |
27 | # | zi ~yo: jo | |
28 | # | ti: chi | |
29 | # | ti ~ya: cha | |
30 | # | ti ~yu: chu | |
31 | # | ti ~yo: cho | |
32 | # | tu: tsu | |
33 | # | di: ji/dji | |
34 | # | du: zu/dzu | |
35 | # | hu: fu | |
36 | # | For foreign words: | |
37 | # | ----------------- | |
38 | # | se ~i si | |
39 | # | si ~e she | |
40 | # | | |
41 | # | ze ~i zi | |
42 | # | zi ~e je | |
43 | # | | |
44 | # | te ~i ti | |
45 | # | ti ~e che | |
46 | # | te ~u tu | |
47 | # | | |
48 | # | de ~i di | |
49 | # | de ~u du | |
50 | # | de ~i di | |
51 | # | | |
52 | # | he ~u: hu | |
53 | # | hu ~a fa | |
54 | # | hu ~i fi | |
55 | # | hu ~e fe | |
56 | # | hu ~o fo | |
57 | # Most small forms are generated, but if necessary | |
58 | # explicit small forms are given with ~a, ~ya, etc. | |
59 | #------------------------------------------------------ | |
60 | # Variables | |
374ca955 A |
61 | $vowel = [aeiou] ; |
62 | $consonant = [bcdfghjklmnpqrstvwxyz] ; | |
63 | $macron = \u0304 ; | |
2ca993e8 | 64 | # Variables used for doubled-consonants with tsu |
73c04bcf A |
65 | $kana = [ぁ-ゔ] ; |
66 | $voice = [\u3099゛]; | |
67 | $semivoice = [\u309A゜]; | |
374ca955 | 68 | $k_start = [カキクケコかきくけこ] ; |
374ca955 | 69 | $s_start = [サシスセソさしすせそ] ; |
374ca955 | 70 | $j_start = [シし] $voice ; |
374ca955 | 71 | $t_start = [タチツテトたちつてと] ; |
374ca955 | 72 | $n_start = [ナニヌネノンなにぬねの] ; |
374ca955 A |
73 | $h_start = [ハヒヘホはひへほ] ; |
74 | $f_start = [フふ] ; | |
374ca955 | 75 | $m_start = [マミムメモまみむめも] ; |
374ca955 | 76 | $y_start = [ヤユヨやゆよ] ; |
374ca955 | 77 | $r_start = [ラリルレロらりるれろ] ; |
374ca955 | 78 | $w_start = [ワヰヱヲわゐゑを] ; |
73c04bcf | 79 | $v_start = [ワヰヱヲ]\u3099 ; |
729e4ab9 | 80 | $voweled_basekana = [ァ-オカキクケコサシスセソタチッツテトナ-ノハヒフヘホマ-ヲヵヶ] ; |
2ca993e8 A |
81 | # if ン is followed by $n_quoter, then it needs an |
82 | # apostrophe after its romaji form to disambiguate it. | |
83 | # e.g., ン ア != ナ, so represent as "n'a", not "na". | |
51004dcb | 84 | $n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ; |
374ca955 | 85 | $small_y = [ャィュェョ] ; |
73c04bcf | 86 | $iteration = ゝ ; |
2ca993e8 A |
87 | #------------------------------------------------------ |
88 | # katakana rules | |
89 | # Punctuation | |
729e4ab9 A |
90 | '.' ↔ 。; |
91 | ',' ↔ 、; | |
2ca993e8 A |
92 | # ' ' } [a-z] → ; # delete spaces before latin |
93 | # ' ' ← [^' '゠-ヿ] {} ['゠-ヿ] ; #insert spaces before katakana | |
94 | # Iteration Mark | |
95 | # Copy previous letter § marks | |
96 | # TODO | |
97 | # | $1 $1 ← ($kana [[:M:]$voice$semivoice]?) $iteration | |
98 | # Specials for katakana -- not shared with hiragana | |
729e4ab9 A |
99 | va ↔ ワ\u3099 ; |
100 | vi ↔ ヰ\u3099 ; | |
101 | ve ↔ ヱ\u3099 ; | |
102 | vo ↔ ヲ\u3099 ; | |
103 | '~ka' ↔ ヵ ; | |
104 | '~ke' ↔ ヶ ; | |
2ca993e8 A |
105 | # ~~~ begin shared rules ~~~ |
106 | #special | |
729e4ab9 A |
107 | ya ← '~'ャ; |
108 | yi ← '~'ィ ; | |
109 | yu ← '~'ュ; | |
110 | ye ← '~'ェ; | |
111 | yo ← '~'ョ; | |
2ca993e8 | 112 | #normal |
729e4ab9 A |
113 | a ↔ ア ; |
114 | b | '~' ← ヒ \u3099} $small_y ; | |
115 | by } $vowel → ヒ\u3099 | '~y' ; | |
116 | ba ↔ ハ\u3099 ; | |
117 | bi ↔ ヒ\u3099 ; | |
118 | bu ↔ フ\u3099 ; | |
119 | be ↔ ヘ\u3099 ; | |
120 | bo ↔ ホ\u3099 ; | |
121 | c } i → | s ; # soft c before i: rewrite as s and leave the cursor before it so the s rules apply | |
122 | c } e → | s ; # soft c before e: same treatment | |
123 | da ↔ タ\u3099 ; | |
124 | di ↔ テ\u3099ィ ; | |
125 | du ↔ テ\u3099ゥ ; | |
126 | de ↔ テ\u3099 ; | |
127 | do ↔ ト\u3099 ; | |
128 | dzu ↔ ツ\u3099 ; | |
129 | dja ← チ\u3099ャ ; | |
130 | dji'~i' ← チ\u3099ィ ; # liu | |
131 | dju ← チ\u3099ュ ; | |
132 | dje ← チ\u3099ェ ; | |
133 | djo ← チ\u3099ョ ; | |
134 | dji ↔ チ\u3099 ; | |
51004dcb | 135 | dj } $vowel → チ\u3099 | '~y' ; |
2ca993e8 | 136 | # TODO: QUESTION: use ĵĴżŻ instead of dj, dz |
729e4ab9 A |
137 | cha ← チャ ; |
138 | chi'~i' ← チィ ; # liu | |
139 | chu ← チュ ; | |
140 | che ← チェ ; | |
141 | cho ← チョ ; | |
142 | chi ↔ チ ; | |
143 | ch } $vowel → チ | '~y' ; | |
144 | e ↔ エ ; | |
145 | g | '~' ← キ\u3099} $small_y ; | |
51004dcb | 146 | gy } $vowel → キ\u3099 | '~y' ; |
729e4ab9 A |
147 | ga ↔ カ\u3099 ; |
148 | gi ↔ キ\u3099 ; | |
149 | gu ↔ ク\u3099 ; | |
150 | ge ↔ ケ\u3099 ; | |
151 | go ↔ コ\u3099 ; | |
152 | i ↔ イ ; | |
2ca993e8 | 153 | # j } $vowel → シ\u3099 | '~y' ; |
729e4ab9 A |
154 | ja ↔ シ\u3099ャ ; |
155 | ji'~i' ← シ\u3099ィ ; # liu | |
156 | ju ↔ シ\u3099ュ ; | |
157 | je ↔ シ\u3099ェ ; | |
158 | jo ↔ シ\u3099ョ ; | |
159 | ji ↔ シ\u3099 ; | |
160 | k | '~' ← キ} $small_y ; | |
51004dcb | 161 | ky } $vowel → キ | '~y' ; |
729e4ab9 A |
162 | ka ↔ カ ; |
163 | ki ↔ キ ; | |
164 | ku ↔ ク ; | |
165 | ke ↔ ケ ; | |
166 | ko ↔ コ ; | |
167 | m | '~' ← ミ} $small_y ; | |
51004dcb | 168 | my } $vowel → ミ | '~y' ; |
729e4ab9 A |
169 | ma ↔ マ ; |
170 | mi ↔ ミ ; | |
171 | mu ↔ ム ; | |
172 | me ↔ メ ; | |
173 | mo ↔ モ ; | |
174 | m } [pbfv] → ン ; # one-way: m directly before p, b, f or v is written ン | |
175 | n | '~' ← ニ } $small_y ; | |
51004dcb | 176 | ny } $vowel → ニ | '~y' ; |
729e4ab9 A |
177 | na ↔ ナ ; |
178 | ni ↔ ニ ; | |
179 | nu ↔ ヌ ; | |
180 | ne ↔ ネ ; | |
181 | no ↔ ノ ; | |
182 | o ↔ オ ; | |
183 | p | '~' ← ヒ\u309A } $small_y ; | |
51004dcb | 184 | py } $vowel → ヒ\u309A | '~y' ; |
729e4ab9 A |
185 | pa ↔ ハ\u309A ; |
186 | pi ↔ ヒ\u309A ; | |
187 | pu ↔ フ\u309A ; | |
188 | pe ↔ ヘ\u309A ; | |
189 | po ↔ ホ\u309A ; | |
190 | h | '~' ← ヒ } $small_y ; | |
51004dcb | 191 | hy } $vowel → ヒ | '~y' ; |
729e4ab9 A |
192 | ha ↔ ハ ; |
193 | hi ↔ ヒ ; | |
194 | hu ↔ ヘゥ ; | |
195 | he ↔ ヘ ; | |
196 | ho ↔ ホ ; | |
2ca993e8 A |
197 | # f | '~' ← フ } $small_y ; |
198 | # f } $vowel → フ | '~' ; | |
729e4ab9 A |
199 | fa ↔ ファ ; |
200 | fi ↔ フィ ; | |
201 | fe ↔ フェ ; | |
202 | fo ↔ フォ ; | |
203 | fu ↔ フ ; | |
204 | r | '~' ← リ } $small_y ; | |
51004dcb | 205 | ry } $vowel → リ | '~y' ; |
729e4ab9 A |
206 | ra ↔ ラ ; |
207 | ri ↔ リ ; | |
208 | ru ↔ ル ; | |
209 | re ↔ レ ; | |
210 | ro ↔ ロ ; | |
211 | za ↔ サ\u3099 ; | |
212 | zi ↔ セ\u3099ィ ; | |
213 | zu ↔ ス\u3099 ; | |
214 | ze ↔ セ\u3099 ; | |
215 | zo ↔ ソ\u3099 ; | |
216 | sa ↔ サ ; | |
217 | si ↔ セィ ; | |
218 | su ↔ ス ; | |
219 | se ↔ セ ; | |
220 | so ↔ ソ ; | |
221 | sha ← シャ ; | |
222 | shi'~i' ← シィ ; # liu | |
223 | shu ← シュ ; | |
224 | she ← シェ ; | |
225 | sho ← ショ ; | |
226 | shi ↔ シ ; | |
227 | sh } $vowel → シ | '~y' ; | |
228 | ta ↔ タ ; | |
229 | ti ↔ ティ ; | |
230 | tu ↔ テゥ ; | |
231 | te ↔ テ ; | |
232 | to ↔ ト ; | |
233 | tsu ↔ ツ ; | |
2ca993e8 A |
234 | # v } $vowel → ウ\u3099 | '~' ; |
235 | #'v~a' ← ウ\u3099ァ ; # liu | |
236 | #'v~i' ← ウ\u3099ィ ; # liu | |
237 | #'v~e' ← ウ\u3099ェ ; # liu | |
238 | #'v~o' ← ウ\u3099ォ ; # liu | |
729e4ab9 A |
239 | vu ↔ ウ\u3099 ; |
240 | u ↔ ウ ; | |
2ca993e8 | 241 | # w } $vowel → ウ | '~' ; |
729e4ab9 A |
242 | wa ↔ ワ ; |
243 | wi ↔ ヰ ; | |
244 | wu → ウ ; | |
245 | we ↔ ヱ ; | |
246 | wo ↔ ヲ ; | |
247 | ya ↔ ヤ ; | |
248 | yi → イ ; | |
249 | yu ↔ ユ ; | |
250 | ye → エ ; | |
251 | yo ↔ ヨ ; | |
2ca993e8 A |
252 | # double consonants |
253 | #specials | |
729e4ab9 A |
254 | s } sh → ッ ; # one-way: s before sh becomes small tsu (ッ) |
255 | t } ch → ッ ; # one-way: t before ch becomes small tsu (ッ) | |
2ca993e8 | 256 | #voiced |
729e4ab9 A |
257 | j } j ↔ ッ } $j_start ; |
258 | b } b ↔ ッ } [$h_start$f_start] $voice; | |
259 | d } d ↔ ッ } $t_start $voice; | |
260 | g } g ↔ ッ } $k_start $voice; | |
261 | p } p ↔ ッ } [$h_start$f_start] $semivoice; | |
2ca993e8 | 262 | # v } v ↔ ッ } [ワヰウヱヲう] $voice ; |
729e4ab9 A |
263 | z } z ↔ ッ } $s_start $voice; |
264 | v } v ↔ ッ } $v_start; | |
2ca993e8 | 265 | # normal |
729e4ab9 A |
266 | k } k ↔ ッ } $k_start ; |
267 | m } m ↔ ッ } $m_start ; | |
268 | n } n ↔ ッ } $n_start ; | |
269 | h } h ↔ ッ } $h_start ; | |
270 | f } f ↔ ッ } $f_start ; | |
271 | r } r ↔ ッ } $r_start ; | |
272 | t } t ↔ ッ } $t_start ; | |
273 | s } s ↔ ッ } $s_start ; | |
51004dcb | 274 | w } w ↔ ッ } $w_start; |
729e4ab9 | 275 | y } y ↔ ッ } $y_start; |
2ca993e8 | 276 | # completeness |
729e4ab9 A |
277 | x } x → ッ ; |
278 | c } k → ッ ; | |
279 | c } c → ッ ; | |
280 | c } q → ッ ; | |
281 | l } l → ッ ; | |
282 | q } q → ッ ; | |
2ca993e8 A |
283 | # y } y → ッ ; |
284 | # w } w → ッ ; | |
285 | # prolonged vowel mark. this indicates a doubling of | |
286 | # the preceding vowel sound | |
287 | #a ← a { ー ; # liu | |
288 | #e ← e { ー ; # liu | |
289 | #i ← i { ー ; # liu | |
290 | #o ← o { ー ; # liu | |
291 | #u ← u { ー ; # liu | |
729e4ab9 | 292 | $macron ↔ ー ; |
2ca993e8 | 293 | # small forms |
729e4ab9 A |
294 | '~a' ↔ ァ ; |
295 | '~i' ↔ ィ ; | |
296 | '~u' ↔ ゥ ; | |
297 | '~e' ↔ ェ ; | |
298 | '~o' ↔ ォ ; | |
299 | '~tsu' ↔ ッ ; | |
300 | '~wa' ↔ ヮ ; | |
301 | '~ya' ↔ ャ ; | |
302 | '~yi' → ィ ; | |
303 | '~yu' ↔ ュ ; | |
304 | '~ye' → ェ ; | |
305 | '~yo' ↔ ョ ; | |
2ca993e8 A |
306 | # iteration marks (kana → latin only): ヽ repeats the preceding romanized syllable; with a voicing mark the repeated consonant is voiced |
307 | # TODO: make more accurate | |
729e4ab9 A |
308 | j $1 ← sh (y* $vowel) {ヽ$voice ; # unvoiced syllable + voiced mark: repeat with the voiced consonant |
309 | dj $1 ← ch (y* $vowel) {ヽ$voice ; | |
310 | dz $1 ← ts (y* $vowel) {ヽ$voice ; | |
311 | g $1 ← k (y* $vowel) {ヽ$voice ; | |
312 | z $1 ← s (y* $vowel) {ヽ$voice ; | |
313 | d $1 ← t (y* $vowel) {ヽ$voice ; | |
314 | b $1 ← h (y* $vowel) {ヽ$voice ; # voiced counterpart of h is b (ha ↔ ハ, ba ↔ ハ + voicing mark); was inverted as "h ← b" | |
315 | v $1 ← w (y* $vowel) {ヽ$voice ; | |
316 | sh $1 ← sh (y* $vowel) {ヽ$voice ; # NOTE(review): same context as the j ← sh rule above, so presumably never reached - confirm | |
317 | j $1 ← j (y* $vowel) {ヽ$voice ; # already-voiced digraph: repeat unchanged | |
318 | ch $1 ← ch (y* $vowel) {ヽ$voice ; | |
319 | dj $1 ← dj(y* $vowel) {ヽ$voice ; | |
320 | ts $1 ← ts (y* $vowel) {ヽ$voice ; | |
321 | dz $1 ← dz (y* $vowel) {ヽ$voice ; | |
322 | $1 ← ($consonant y* $vowel) {ヽ$voice? ; # generic case: repeat the preceding consonant + vowel as is | |
323 | $1 ← (.) {ヽ $voice? ; # otherwise repeat last character | |
324 | ← ヽ $voice? ; # delete if no characters found | |
2ca993e8 A |
325 | # h- rule: lengthens vowel if not followed by a vowel. |
326 | # At the point this is applied, latin [cons]?vowel sequences | |
327 | # have been converted to katakana in NFD form. | |
729e4ab9 | 328 | $voweled_basekana [\u3099 \u309A]? { h → ー ; |
2ca993e8 A |
329 | # one-way latin- → kana rules. these do not occur in |
330 | # well-formed romaji representing actual japanese text. | |
331 | # their purpose is to make all romaji map to kana of | |
332 | # some sort. | |
333 | # the following are not really necessary, but produce | |
334 | # slightly more natural results. | |
729e4ab9 A |
335 | cy → セィ ; |
336 | dy → テ\u3099ィ ; | |
337 | hy → ヒ ; | |
338 | sy → セィ ; | |
339 | ty → ティ ; | |
340 | zy → セ\u3099ィ ; | |
341 | h → ヘ ; | |
2ca993e8 A |
342 | # isolated consonants listed here so as not to mask |
343 | # longer rules above. | |
729e4ab9 A |
344 | ch → チ; |
345 | sh → シ ; | |
346 | dz → ツ\u3099 ; | |
347 | dj → チ\u3099; | |
348 | b → フ\u3099 ; | |
349 | d → テ\u3099 ; | |
350 | g → ク\u3099 ; | |
351 | k → ク ; | |
352 | m → ム ; | |
353 | n'' ← ン } $n_quoter ; | |
354 | n ↔ ン ; | |
355 | p → フ\u309A ; | |
356 | r → ル ; | |
357 | s → ス ; | |
358 | t → テ ; | |
359 | y → イ ; | |
360 | z → ス\u3099 ; | |
361 | v → ウ\u3099 ; | |
362 | f → フ; | |
51004dcb | 363 | j → シ\u3099; |
729e4ab9 A |
364 | w → ウ; |
365 | ß → | ss ; | |
366 | æ → | e ; | |
367 | ð → | d ; | |
368 | ø → | u ; | |
369 | þ → | th ; | |
2ca993e8 | 370 | # simple substitutions using backup |
729e4ab9 A |
371 | c → | k ; |
372 | l → | r ; | |
373 | q → | k ; | |
374 | x → | ks ; | |
2ca993e8 A |
375 | # ~~~ END shared rules ~~~ |
376 | #------------------------------------------------------ | |
377 | # Final cleanup | |
729e4ab9 A |
378 | '~' → ; # delete stray tildes between letters |
379 | [:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters | |
2ca993e8 | 380 | # [ʾ[:Nonspacing Mark:]-[\u3099-゜]] → ; # delete any non-spacing marks that we didn't use |
73c04bcf | 381 | :: NFC (NFD) ; |
46f4442e | 382 | :: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth); |
2ca993e8 A |
383 | # note: a global filter is more efficient, but MUST include all source chars!! |
384 | #:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]]); | |
385 | # MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD | |
46f4442e | 386 | :: ( [[\ -~¢-£¥-¦¬\u0304₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ\u3099-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ; |
2ca993e8 A |
387 | # eof |
388 |