]> git.saurik.com Git - apple/icu.git/blame - icuSources/data/translit/Latn_Kana.txt
ICU-62107.0.1.tar.gz
[apple/icu.git] / icuSources / data / translit / Latn_Kana.txt
CommitLineData
f3c0d7a5
A
1# © 2016 and later: Unicode, Inc. and others.
2# License & terms of use: http://www.unicode.org/copyright.html#License
3#
2ca993e8 4# File: Latn_Kana.txt
f3c0d7a5 5# Generated from CLDR
73c04bcf 6#
2ca993e8
A
7
8# note: a global filter is more efficient, but MUST include all source chars
9#:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]] ;
10# MINIMAL FILTER GENERATED FOR: Latin-Katakana
11### WARNING -- must add width filter, both here and below!!! ###
729e4ab9 12:: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」\u3099-\u309Aァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ;
374ca955
A
13:: [:Latin:] fullwidth-halfwidth ();
14:: NFD (NFC);
51004dcb 15:: Lower (); # whenever transliterating from cased to uncased script, include this
2ca993e8
A
16# :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
17# Uses modified Hepburn. Small changes to make unambiguous.
18# | Kunrei-shiki: Hepburn/MHepburn
19# | ------------------------------
20# | si: shi
21# | si ~ya: sha
22# | si ~yu: shu
23# | si ~yo: sho
24# | zi: ji
25# | zi ~ya: ja
26# | zi ~yu: ju
27# | zi ~yo: jo
28# | ti: chi
29# | ti ~ya: cha
30# | ti ~yu: chu
30# | ti ~yo: cho
32# | tu: tsu
33# | di: ji/dji
34# | du: zu/dzu
35# | hu: fu
36# | For foreign words:
37# | -----------------
38# | se ~i si
39# | si ~e she
40# |
41# | ze ~i zi
42# | zi ~e je
43# |
44# | te ~i ti
45# | ti ~e che
46# | te ~u tu
47# |
48# | de ~i di
49# | de ~u du
50# | de ~i di
51# |
52# | he ~u: hu
53# | hu ~a fa
54# | hu ~i fi
55# | hu ~e fe
56# | hu ~o fo
57# Most small forms are generated, but if necessary
58# explicit small forms are given with ~a, ~ya, etc.
59#------------------------------------------------------
60# Variables
374ca955
A
61$vowel = [aeiou] ;
62$consonant = [bcdfghjklmnpqrstvwxyz] ;
63$macron = \u0304 ;
2ca993e8 64# Variables used for doubled-consonants with tsu
73c04bcf
A
65$kana = [ぁ-ゔ] ;
66$voice = [\u3099゛];
67$semivoice = [\u309A゜];
374ca955 68$k_start = [カキクケコかきくけこ] ;
374ca955 69$s_start = [サシスセソさしすせそ] ;
374ca955 70$j_start = [シし] $voice ;
374ca955 71$t_start = [タチツテトたちつてと] ;
374ca955 72$n_start = [ナニヌネノンなにぬねの] ;
374ca955
A
73$h_start = [ハヒヘホはひへほ] ;
74$f_start = [フふ] ;
374ca955 75$m_start = [マミムメモまみむめも] ;
374ca955 76$y_start = [ヤユヨやゆよ] ;
374ca955 77$r_start = [ラリルレロらりるれろ] ;
374ca955 78$w_start = [ワヰヱヲわゐゑを] ;
73c04bcf 79$v_start = [ワヰヱヲ]\u3099 ;
729e4ab9 80$voweled_basekana = [ァ-オカキクケコサシスセソタチッツテトナ-ノハヒフヘホマ-ヲヵヶ] ;
2ca993e8
A
81# if ン is followed by $n_quoter, then it needs an
82# apostrophe after its romaji form to disambiguate it.
83# e.g., ン ア != ナ, so represent as "n'a", not "na".
51004dcb 84$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;
374ca955 85$small_y = [ャィュェョ] ;
73c04bcf 86$iteration = ゝ ;
2ca993e8
A
87#------------------------------------------------------
88# katakana rules
89# Punctuation
729e4ab9
A
90'.' ↔ 。;
91',' ↔ 、;
2ca993e8
A
92# ' ' } [a-z] → ; # delete spaces before latin
93# ' ' ← [^' '゠-ヿ] {} ['゠-ヿ] ; #insert spaces before hiragana
94# Iteration Mark
95# Copy previous letter § marks
96# TODO
97# | $1 $1 ← ($kana [[:M:]$voice$semivoice]?) $iteration
98# Specials for katakana -- not shared with hiragana
729e4ab9
A
99va ↔ ワ\u3099 ;
100vi ↔ ヰ\u3099 ;
101ve ↔ ヱ\u3099 ;
102vo ↔ ヲ\u3099 ;
103'~ka' ↔ ヵ ;
104'~ke' ↔ ヶ ;
2ca993e8
A
105# ~~~ begin shared rules ~~~
106#special
729e4ab9
A
107ya ← '~'ャ;
108yi ← '~'ィ ;
109yu ← '~'ュ;
110ye ← '~'ェ;
111yo ← '~'ョ;
2ca993e8 112#normal
729e4ab9
A
113a ↔ ア ;
114b | '~' ← ヒ \u3099} $small_y ;
115by } $vowel → ヒ\u3099 | '~y' ;
116ba ↔ ハ\u3099 ;
117bi ↔ ヒ\u3099 ;
118bu ↔ フ\u3099 ;
119be ↔ ヘ\u3099 ;
120bo ↔ ホ\u3099 ;
121c } i → | s ;
122c } e → | s ;
123da ↔ タ\u3099 ;
124di ↔ テ\u3099ィ ;
125du ↔ テ\u3099ゥ ;
126de ↔ テ\u3099 ;
127do ↔ ト\u3099 ;
128dzu ↔ ツ\u3099 ;
129dja ← チ\u3099ャ ;
130dji'~i' ← チ\u3099ィ ; # liu
131dju ← チ\u3099ュ ;
132dje ← チ\u3099ェ ;
133djo ← チ\u3099ョ ;
134dji ↔ チ\u3099 ;
51004dcb 135dj } $vowel → チ\u3099 | '~y' ;
2ca993e8 136# TODO: QUESTION: use ĵĴżŻ instead of dj, dz
729e4ab9
A
137cha ← チャ ;
138chi'~i' ← チィ ; # liu
139chu ← チュ ;
140che ← チェ ;
141cho ← チョ ;
142chi ↔ チ ;
143ch } $vowel → チ | '~y' ;
144e ↔ エ ;
145g | '~' ← キ\u3099} $small_y ;
51004dcb 146gy } $vowel → キ\u3099 | '~y' ;
729e4ab9
A
147ga ↔ カ\u3099 ;
148gi ↔ キ\u3099 ;
149gu ↔ ク\u3099 ;
150ge ↔ ケ\u3099 ;
151go ↔ コ\u3099 ;
152i ↔ イ ;
2ca993e8 153# j } $vowel → シ\u3099 | '~y' ;
729e4ab9
A
154ja ↔ シ\u3099ャ ;
155ji'~i' ← シ\u3099ィ ; # liu
156ju ↔ シ\u3099ュ ;
157je ↔ シ\u3099ェ ;
158jo ↔ シ\u3099ョ ;
159ji ↔ シ\u3099 ;
160k | '~' ← キ} $small_y ;
51004dcb 161ky } $vowel → キ | '~y' ;
729e4ab9
A
162ka ↔ カ ;
163ki ↔ キ ;
164ku ↔ ク ;
165ke ↔ ケ ;
166ko ↔ コ ;
167m | '~' ← ミ} $small_y ;
51004dcb 168my } $vowel → ミ | '~y' ;
729e4ab9
A
169ma ↔ マ ;
170mi ↔ ミ ;
171mu ↔ ム ;
172me ↔ メ ;
173mo ↔ モ ;
174m } [pbfv] → ン ;
175n | '~' ← ニ } $small_y ;
51004dcb 176ny } $vowel → ニ | '~y' ;
729e4ab9
A
177na ↔ ナ ;
178ni ↔ ニ ;
179nu ↔ ヌ ;
180ne ↔ ネ ;
181no ↔ ノ ;
182o ↔ オ ;
183p | '~' ← ヒ\u309A } $small_y ;
51004dcb 184py } $vowel → ヒ\u309A | '~y' ;
729e4ab9
A
185pa ↔ ハ\u309A ;
186pi ↔ ヒ\u309A ;
187pu ↔ フ\u309A ;
188pe ↔ ヘ\u309A ;
189po ↔ ホ\u309A ;
190h | '~' ← ヒ } $small_y ;
51004dcb 191hy } $vowel → ヒ | '~y' ;
729e4ab9
A
192ha ↔ ハ ;
193hi ↔ ヒ ;
194hu ↔ ヘゥ ;
195he ↔ ヘ ;
196ho ↔ ホ ;
2ca993e8
A
197# f | '~' ← フ } $small_y ;
198# f } $vowel → フ | '~' ;
729e4ab9
A
199fa ↔ ファ ;
200fi ↔ フィ ;
201fe ↔ フェ ;
202fo ↔ フォ ;
203fu ↔ フ ;
204r | '~' ← リ } $small_y ;
51004dcb 205ry } $vowel → リ | '~y' ;
729e4ab9
A
206ra ↔ ラ ;
207ri ↔ リ ;
208ru ↔ ル ;
209re ↔ レ ;
210ro ↔ ロ ;
211za ↔ サ\u3099 ;
212zi ↔ セ\u3099ィ ;
213zu ↔ ス\u3099 ;
214ze ↔ セ\u3099 ;
215zo ↔ ソ\u3099 ;
216sa ↔ サ ;
217si ↔ セィ ;
218su ↔ ス ;
219se ↔ セ ;
220so ↔ ソ ;
221sha ← シャ ;
222shi'~i' ← シィ ; # liu
223shu ← シュ ;
224she ← シェ ;
225sho ← ショ ;
226shi ↔ シ ;
227sh } $vowel → シ | '~y' ;
228ta ↔ タ ;
229ti ↔ ティ ;
230tu ↔ テゥ ;
231te ↔ テ ;
232to ↔ ト ;
233tsu ↔ ツ ;
2ca993e8
A
234# v } $vowel → ウ\u3099 | '~' ;
235#'v~a' ← ウ\u3099ァ ; # liu
236#'v~i' ← ウ\u3099ィ ; # liu
237#'v~e' ← ウ\u3099ェ ; # liu
238#'v~o' ← ウ\u3099ォ ; # liu
729e4ab9
A
239vu ↔ ウ\u3099 ;
240u ↔ ウ ;
2ca993e8 241# w } $vowel → ウ | '~' ;
729e4ab9
A
242wa ↔ ワ ;
243wi ↔ ヰ ;
244wu → ウ ;
245we ↔ ヱ ;
246wo ↔ ヲ ;
247ya ↔ ヤ ;
248yi → イ ;
249yu ↔ ユ ;
250ye → エ ;
251yo ↔ ヨ ;
2ca993e8
A
252# double consonants
253#specials
729e4ab9
A
254s } sh → ッ ;
255t } ch → ッ ;
2ca993e8 256#voiced
729e4ab9
A
257j } j ↔ ッ } $j_start ;
258b } b ↔ ッ } [$h_start$f_start] $voice;
259d } d ↔ ッ } $t_start $voice;
260g } g ↔ ッ } $k_start $voice;
261p } p ↔ ッ } [$h_start$f_start] $semivoice;
2ca993e8 262# v } v ↔ ッ } [ワヰウヱヲう] $voice ;
729e4ab9
A
263z } z ↔ ッ } $s_start $voice;
264v } v ↔ ッ } $v_start;
2ca993e8 265# normal
729e4ab9
A
266k } k ↔ ッ } $k_start ;
267m } m ↔ ッ } $m_start ;
268n } n ↔ ッ } $n_start ;
269h } h ↔ ッ } $h_start ;
270f } f ↔ ッ } $f_start ;
271r } r ↔ ッ } $r_start ;
272t } t ↔ ッ } $t_start ;
273s } s ↔ ッ } $s_start ;
51004dcb 274w } w ↔ ッ } $w_start;
729e4ab9 275y } y ↔ ッ } $y_start;
2ca993e8 276# completeness
729e4ab9
A
277x } x → ッ ;
278c } k → ッ ;
279c } c → ッ ;
280c } q → ッ ;
281l } l → ッ ;
282q } q → ッ ;
2ca993e8
A
283# y } y → ッ ;
284# w } w → ッ ;
285# prolonged vowel mark. this indicates a doubling of
286# the preceding vowel sound
287#a ← a { ー ; # liu
288#e ← e { ー ; # liu
289#i ← i { ー ; # liu
290#o ← o { ー ; # liu
291#u ← u { ー ; # liu
729e4ab9 292$macron ↔ ー ;
2ca993e8 293# small forms
729e4ab9
A
294'~a' ↔ ァ ;
295'~i' ↔ ィ ;
296'~u' ↔ ゥ ;
297'~e' ↔ ェ ;
298'~o' ↔ ォ ;
299'~tsu' ↔ ッ ;
300'~wa' ↔ ヮ ;
301'~ya' ↔ ャ ;
302'~yi' → ィ ;
303'~yu' ↔ ュ ;
304'~ye' → ェ ;
305'~yo' ↔ ョ ;
2ca993e8
A
306# iteration marks
307# TODO: make more accurate
729e4ab9
A
308j $1 ← sh (y* $vowel) {ヽ$voice ; # ヽ+voice after an sh-syllable: emit j plus the captured y*/vowel
309dj $1 ← ch (y* $vowel) {ヽ$voice ;
310dz $1 ← ts (y* $vowel) {ヽ$voice ;
311g $1 ← k (y* $vowel) {ヽ$voice ;
312z $1 ← s (y* $vowel) {ヽ$voice ;
313d $1 ← t (y* $vowel) {ヽ$voice ;
314h $1 ← b (y* $vowel) {ヽ$voice ; # NOTE(review): sibling rules go unvoiced → voiced (k→g, s→z, t→d); this one goes b→h, possibly inverted -- confirm
315v $1 ← w (y* $vowel) {ヽ$voice ;
316sh $1 ← sh (y* $vowel) {ヽ$voice ; # from here on the emitted onset equals the preceding onset
317j $1 ← j (y* $vowel) {ヽ$voice ;
318ch $1 ← ch (y* $vowel) {ヽ$voice ;
319dj $1 ← dj(y* $vowel) {ヽ$voice ;
320ts $1 ← ts (y* $vowel) {ヽ$voice ;
321dz $1 ← dz (y* $vowel) {ヽ$voice ;
322$1 ← ($consonant y* $vowel) {ヽ$voice? ; # generic case ($voice optional): repeat the captured consonant + y* + vowel as-is
323$1 ← (.) {ヽ $voice? ; # otherwise repeat last character
324← ヽ $voice? ; # delete if no characters found
2ca993e8
A
325# h- rule: lengthens vowel if not followed by a vowel.
326# At the point this is applied, latin [cons]?vowel sequences
327# have been converted to katakana in NFD form.
729e4ab9 328$voweled_basekana [\u3099 \u309A]? { h → ー ;
2ca993e8
A
329# one-way latin- → kana rules. these do not occur in
330# well-formed romaji representing actual japanese text.
331# their purpose is to make all romaji map to kana of
332# some sort.
333# the following are not really necessary, but produce
334# slightly more natural results.
729e4ab9
A
335cy → セィ ;
336dy → テ\u3099ィ ;
337hy → ヒ ;
338sy → セィ ;
339ty → ティ ;
340zy → セ\u3099ィ ;
341h → ヘ ;
2ca993e8
A
342# isolated consonants listed here so as not to mask
343# longer rules above.
729e4ab9
A
344ch → チ;
345sh → シ ;
346dz → ツ\u3099 ;
347dj → チ\u3099;
348b → フ\u3099 ;
349d → テ\u3099 ;
350g → ク\u3099 ;
351k → ク ;
352m → ム ;
353n'' ← ン } $n_quoter ;
354n ↔ ン ;
355p → フ\u309A ;
356r → ル ;
357s → ス ;
358t → テ ;
359y → イ ;
360z → ス\u3099 ;
361v → ウ\u3099 ;
362f → フ;
51004dcb 363j → シ\u3099;
729e4ab9
A
364w → ウ;
365ß → | ss ;
366æ → | e ;
367ð → | d ;
368ø → | u ;
369þ → | th ;
2ca993e8 370# simple substitutions using backup
729e4ab9
A
371c → | k ;
372l → | r ;
373q → | k ;
374x → | ks ;
2ca993e8
A
375# ~~~ END shared rules ~~~
376#------------------------------------------------------
377# Final cleanup
729e4ab9
A
378'~' → ; # delete stray tildes between letters
379[:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters
2ca993e8 380# [ʾ[:Nonspacing Mark:]-[\u3099-゜]] → ; # delete any non-spacing marks that we didn't use
73c04bcf 381:: NFC (NFD) ;
46f4442e 382:: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth);
2ca993e8
A
383# note: a global filter is more efficient, but MUST include all source chars!!
384#:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]]);
385# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
46f4442e 386:: ( [[\ -~¢-£¥-¦¬\u0304₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ\u3099-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;
2ca993e8
A
387# eof
388