]> git.saurik.com Git - apple/icu.git/blob - icuSources/data/translit/Hiragana_Katakana.txt
ICU-6.2.15.tar.gz
[apple/icu.git] / icuSources / data / translit / Hiragana_Katakana.txt
1 #--------------------------------------------------------------------
2 # Copyright (c) 1999-2004, International Business Machines
3 # Corporation and others. All Rights Reserved.
4 #--------------------------------------------------------------------
5
6 # note: a global filter is more efficient, but MUST include all source chars
7 :: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9Fー[:Hiragana:] [:Katakana:] [:nonspacing mark:]] ;  # global filter for the forward (Hiragana-Katakana) direction
8 :: NFKC ();  # forward: apply NFKC; reverse: nothing (empty parentheses)
9
10 # Hiragana-Katakana
11
12 # This is largely a one-to-one mapping, but it has a
13 # few kinks:
14
15 # 1. The Katakana va/vi/ve/vo (30F7-30FA) have no
16 # Hiragana equivalents. We use Hiragana wa/wi/we/wo
17 # (308F-3092) with a voicing mark (3099), which is
18 # semantically equivalent. However, this is a non-
19 # roundtripping transformation.
20
21 # 2. The Katakana small ka/ke (30F5,30F6) have no
22 # Hiragana equivalents. We convert them to normal
23 # Hiragana ka/ke (304B,3051). This is a one-way
24 # information-losing transformation and precludes
25 # round-tripping of 30F5 and 30F6.
26
27 # 3. The combining marks 3099-309C are in the Hiragana
28 # block, but they apply to Katakana as well, so we
29 # leave them untouched.
30
31 # 4. The Katakana prolonged sound mark 30FC doubles the
32 # preceding vowel. This is a one-way information-
33 # losing transformation from Katakana to Hiragana.
34
35 # 5. The Katakana middle dot separates words in foreign
36 # expressions; we leave this unmodified.
37
38 # The above points preclude successful round-trip
39 # transformations of arbitrary input text. However,
40 # they provide naturalistic results that should conform
41 # to user expectations.
42
43
44 # Combining equivalents va/vi/ve/vo
45 わ゙ <> ヷ;  # 308F 3099 <> 30F7 (va)
46 ゐ゙ <> ヸ;  # 3090 3099 <> 30F8 (vi)
47 ゑ゙ <> ヹ;  # 3091 3099 <> 30F9 (ve)
48 を゙ <> ヺ;  # 3092 3099 <> 30FA (vo)
49
50 # One-to-one mappings, main block
51 # 3041:3094 <> 30A1:30F4
52 # 309D,E <> 30FD,E
53 ぁ <> ァ;
54 あ <> ア;
55 ぃ <> ィ;
56 い <> イ;
57 ぅ <> ゥ;
58 う <> ウ;
59 ぇ <> ェ;
60 え <> エ;
61 ぉ <> ォ;
62 お <> オ;
63 か <> カ;
64 が <> ガ;
65 き <> キ;
66 ぎ <> ギ;
67 く <> ク;
68 ぐ <> グ;
69 け <> ケ;
70 げ <> ゲ;
71 こ <> コ;
72 ご <> ゴ;
73 さ <> サ;
74 ざ <> ザ;
75 し <> シ;
76 じ <> ジ;
77 す <> ス;
78 ず <> ズ;
79 せ <> セ;
80 ぜ <> ゼ;
81 そ <> ソ;
82 ぞ <> ゾ;
83 た <> タ;
84 だ <> ダ;
85 ち <> チ;
86 ぢ <> ヂ;
87 っ <> ッ;
88 つ <> ツ;
89 づ <> ヅ;
90 て <> テ;
91 で <> デ;
92 と <> ト;
93 ど <> ド;
94 な <> ナ;
95 に <> ニ;
96 ぬ <> ヌ;
97 ね <> ネ;
98 の <> ノ;
99 は <> ハ;
100 ば <> バ;
101 ぱ <> パ;
102 ひ <> ヒ;
103 び <> ビ;
104 ぴ <> ピ;
105 ふ <> フ;
106 ぶ <> ブ;
107 ぷ <> プ;
108 へ <> ヘ;
109 べ <> ベ;
110 ぺ <> ペ;
111 ほ <> ホ;
112 ぼ <> ボ;
113 ぽ <> ポ;
114 ま <> マ;
115 み <> ミ;
116 む <> ム;
117 め <> メ;
118 も <> モ;
119 ゃ <> ャ;
120 や <> ヤ;
121 ゅ <> ュ;
122 ゆ <> ユ;
123 ょ <> ョ;
124 よ <> ヨ;
125 ら <> ラ;
126 り <> リ;
127 る <> ル;
128 れ <> レ;
129 ろ <> ロ;
130 ゎ <> ヮ;
131 わ <> ワ;
132 ゐ <> ヰ;
133 ゑ <> ヱ;
134 を <> ヲ;
135 ん <> ン;
136 ゔ <> ヴ;
137 ゝ <> ヽ;
138 ゞ <> ヾ;
139
140 # One-way Katakana-Hiragana xform of small K ka/ke to
141 # normal H ka/ke.
142 か < ヵ;  # reverse only: small Katakana ka (30F5) -> Hiragana ka (304B)
143 け < ヶ;  # reverse only: small Katakana ke (30F6) -> Hiragana ke (3051)
144
145 # Katakana followed by a prolonged sound mark 30FC has
146 # its final vowel doubled. This is a Katakana-Hiragana
147 # one-way information-losing transformation. We
148 # include the small Katakana (e.g., small A 30A1) and
149 # do not distinguish them from their large
150 # counterparts. It doesn't make sense to double a
151 # small counterpart vowel as a small Hiragana vowel, so
152 # we don't do so. In natural text this should never
153 # occur anyway. If a 30FC is seen without a preceding
154 # vowel sound (e.g., after n 30F3) we do not change it.
155
156 ### $long = ー;
157
158 # The following categories are Hiragana, not Katakana
159 # as might be expected, since by the time we get to the
160 # 30FC, the preceding character will have already been
161 # transformed to Hiragana.
162
163 # {The following mechanically generated from the
164 # Unicode 3.0 data:}
165
166 $xa = [ \
167 ぁ あ か が さ ざ \
168 た だ な は ば ぱ \
169 ま ゃ や ら ゎ わ \
170 ];  # Hiragana ending in the vowel "a"; context for the ー -> あ rule
171
172 $xi = [ \
173 ぃ い き ぎ し じ \
174 ち ぢ に ひ び ぴ \
175 み り ゐ \
176 ];  # Hiragana ending in the vowel "i"; context for the ー -> い rule
177
178 $xu = [ \
179 ぅ う く ぐ す ず \
180 っ つ づ ぬ ふ ぶ \
181 ぷ む ゅ ゆ る ゔ \
182 ];  # Hiragana ending in the vowel "u"; context for the ー -> う rule
183
184 $xe = [ \
185 ぇ え け げ せ ぜ \
186 て で ね へ べ ぺ \
187 め れ ゑ \
188 ];  # Hiragana ending in the vowel "e"; context for the ー -> え rule
189
190 $xo = [ \
191 ぉ お こ ご そ ぞ \
192 と ど の ほ ぼ ぽ \
193 も ょ よ ろ を \
194 ];  # Hiragana ending in the vowel "o"; context for the ー -> お rule
195
196 あ < $xa {ー};  # reverse only: ー preceded by an "a"-vowel Hiragana ($xa) becomes あ
197 い < $xi {ー};  # reverse only: ー preceded by an "i"-vowel Hiragana ($xi) becomes い
198 う < $xu {ー};  # reverse only: ー preceded by a "u"-vowel Hiragana ($xu) becomes う
199 え < $xe {ー};  # reverse only: ー preceded by an "e"-vowel Hiragana ($xe) becomes え
200 お < $xo {ー};  # reverse only: ー preceded by an "o"-vowel Hiragana ($xo) becomes お
201
202 :: (NFKC) ;  # reverse (Katakana-Hiragana) direction only: apply NFKC; nothing in the forward direction
203
204 # note: a global filter is more efficient, but MUST include all source chars!!
205 :: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9Fー[:Hiragana:] [:Katakana:] [:nonspacing mark:]]);  # parenthesized: global filter for the reverse direction
206
207 # eof