]> git.saurik.com Git - apple/icu.git/blame - icuSources/data/translit/Latin_Jamo.txt
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / data / translit / Latin_Jamo.txt
CommitLineData
374ca955
A
1#--------------------------------------------------------------------
2# Copyright (c) 1999-2004, International Business Machines
3# Corporation and others. All Rights Reserved.
4#--------------------------------------------------------------------
5
6#- N.B. DO NOT put any filters, NFD, etc. here -- those are aliased in
7#- the INDEX file. This transliterator is, by itself, not
8#- instantiated. It is used as a part of Latin-Jamo, Latin-Hangul, or
9#- inverses thereof.
10
11# Transliteration from Latin characters to Korean script is done in
12# two steps: Latin to Jamo, then Jamo to Hangul. The Jamo-Hangul
13# transliteration is done algorithmically following Unicode 3.0
14# section 3.11. This file implements the Latin to Jamo
15# transliteration using rules.
16
17# Jamo occupy the block 1100-11FF. Within this block there are three
18# groups of characters: initial consonants or choseong (I), medial
19# vowels or jungseong (M), and trailing consonants or jongseong (F).
20# Standard Korean syllables are of the form I+M+F*.
21
22# Section 3.11 describes the use of 'filler' jamo to convert
23# nonstandard syllables to standard form: the choseong filler 115F and
24# the jungseong filler 1160. In this transliterator, we will not use
25# 115F or 1160.
26
27# We will, however, insert two 'null' jamo to make foreign words
28# conform to Korean syllable structure. These are the null initial
29# consonant 110B (IEUNG) and the null vowel 1173 (EU). In Latin text,
30# we will use the separator in order to disambiguate strings,
31# e.g. "kan-ggan" (initial GG) vs. "kanggan" (final NG + initial G).
32
33# We will not use all of the characters in the jamo block. We will
34# only use the 19 initials, 21 medials, and 27 finals possessing a
35# jamo short name as defined in section 4.4 of the Unicode book.
36
37# Rules of thumb. These guidelines provide the basic framework
38# for the rules. They are phrased in terms of Latin-Jamo transliteration.
39# The Jamo-Latin rules derive from these, since the Jamo-Latin rules are
40# just context-free transliteration of jamo to corresponding short names,
41# with the addition of separators to maintain round-trip integrity
42# in the context of the Latin-Jamo rules.
43
44# A sequence of vowels:
45# - Take the longest sequence you can. If there are too many, or you don't
46# have a starting consonant, introduce a 110B as necessary.
47
48# A sequence of consonants.
49# - First join the double consonants: G + G -> GG
50# - In the remaining list,
51# -- If there is no preceding vowel, take the first consonant, and insert EU
52# after it. Continue with the rest of the consonants.
53# -- If there is one consonant, attach to the following vowel
54# -- If there are two consonants and a following vowel, attach one to the
55# preceding vowel, and one to the following vowel.
56# -- If there are more than two consonants, join the first two together if you
57# can: L + G => LG
58# -- If you still end up with more than 2 consonants, insert EU after the
59# first one, and continue with the rest of the consonants.
60
61#----------------------------------------------------------------------
62# Variables
63
64# Some latin consonants or consonant pairs only occur as initials, and
65# some only as finals, but some occur as both. This makes some jamo
66# consonants ambiguous when transliterated into latin.
67# Initial only: IEUNG BB DD JJ R
68# Final only: BS GS L LB LG LH LM LP LS LT NG NH NJ
69# Initial and Final: B C D G GG H J K M N P S SS T
70
71 $Gi = \u1100; # choseong (initial consonants): U+1100 through U+1112
72 $GGi = \u1101;
73 $Ni = \u1102;
74 $Di = \u1103;
75 $DD = \u1104; # initial only
76 $R = \u1105; # initial only
77 $Mi = \u1106;
78 $Bi = \u1107;
79 $BB = \u1108; # initial only
80 $Si = \u1109;
81 $SSi = \u110A;
82 $IEUNG = \u110B; # null initial, inserted during Latin-Jamo
83 $Ji = \u110C;
84 $JJ = \u110D; # initial only
85 $Ci = \u110E;
86 $Ki = \u110F;
87 $Ti = \u1110;
88 $Pi = \u1111;
89 $Hi = \u1112;
90
91 $A = \u1161; # jungseong (medial vowels): U+1161 through U+1175
92 $AE = \u1162;
93 $YA = \u1163;
94 $YAE = \u1164;
95 $EO = \u1165;
96 $E = \u1166;
97 $YEO = \u1167;
98 $YE = \u1168;
99 $O = \u1169;
100 $WA = \u116A;
101 $WAE = \u116B;
102 $OE = \u116C;
103 $YO = \u116D;
104 $U = \u116E;
105 $WEO = \u116F;
106 $WE = \u1170;
107 $WI = \u1171;
108 $YU = \u1172;
109 $EU = \u1173; # null medial, inserted during Latin-Jamo
110 $YI = \u1174;
111 $I = \u1175;
112
113 $Gf = \u11A8; # jongseong (trailing consonants): U+11A8 through U+11C2
114 $GGf = \u11A9;
115 $GS = \u11AA;
116 $Nf = \u11AB;
117 $NJ = \u11AC;
118 $NH = \u11AD;
119 $Df = \u11AE;
120 $L = \u11AF; # final only; a leading Latin l is rewritten to r by Latin-Jamo
121 $LG = \u11B0;
122 $LM = \u11B1;
123 $LB = \u11B2;
124 $LS = \u11B3;
125 $LT = \u11B4;
126 $LP = \u11B5;
127 $LH = \u11B6;
128 $Mf = \u11B7;
129 $Bf = \u11B8;
130 $BS = \u11B9;
131 $Sf = \u11BA;
132 $SSf = \u11BB;
133 $NG = \u11BC;
134 $Jf = \u11BD;
135 $Cf = \u11BE;
136 $Kf = \u11BF;
137 $Tf = \u11C0;
138 $Pf = \u11C1;
139 $Hf = \u11C2;
140
141 $jamoInitial = [\u1100-\u1112]; # every initial defined above, $Gi through $Hi
142
143 $jamoMedial = [\u1161-\u1175]; # every medial defined above, $A through $I
144
145 $latinInitial = [bcdghjkmnprst]; # first Latin letter of each initial (no l: $L is final only)
146
147# Any character in the latin transliteration of a medial
148 $latinMedial = [aeiouwy];
149
150# The last character of the latin transliteration of a medial
151 $latinMedialEnd = [aeiou];
152
153# Disambiguation separator (the apostrophe; examples in these comments show it as a hyphen)
154 $sep = \';
155
156#----------------------------------------------------------------------
157# Jamo-Latin
158
159# Jamo to latin is relatively simple, since it is the latin that is
160# ambiguous. Most rules are straightforward, and we encode them below
161# by adding the reverse direction to the matching Latin-Jamo rule, e.g.:
162
163# $jamoMedial {bs} > $BS;
164
165# becomes
166
167# $jamoMedial {bs} <> $BS;
168
169# Furthermore, we don't care about the ordering for Jamo-Latin because
170# we are going from single characters, so we can very easily piggyback
171# on the Latin-Jamo.
172
173# The main issue with Jamo-Latin is when to insert separators.
174# Separators are inserted to obtain correct round trip behavior. For
175# example, the sequence Ki A Gf Gi E, if transliterated to "kagge",
176# would then round trip to Ki A GGi E. To prevent this, we insert a
177# separator: "kag-ge". IMPORTANT: The need for separators depends
178# very specifically on the behavior of the Latin-Jamo rules. A change
179# in the Latin-Jamo behavior can completely change the way the
180# separator insertion must be done.
181
182# First try to preserve actual separators in the jamo text by doubling
183# them. This fixes problems like:
184# (Di)(A)(Ji)(U)(NG)-(IEUNG)(YEO)(Nf)(Gi)(YEO)(L) => dajung-yeongyeol
185# => (Di)(A)(Ji)(U)(NG)(IEUNG)(YEO)(Nf)(Gi)(YEO)(L). This is optional
186# -- if we don't care about losing separators in the jamo, we can delete
187# this rule.
188
189 $sep $sep <> $sep; # Jamo-Latin doubles a literal separator; Latin-Jamo collapses the pair to one
190
191# Triple consonants. For three consonants "axxx" we insert a
192# separator between the first and second "x" if XXf, Xf, and Xi all
193# exist, and we have A Xf XXi. This prevents the reverse
194# transliteration to A XXf Xi.
195
196 $sep < $latinMedialEnd g {} $GGi; # only g and s qualify: GG and SS are the only doubles listed as both initial and final
197 $sep < $latinMedialEnd s {} $SSi;
198
199# For vowels the rule is similar. If there is a vowel "ae" such that
200# "a" by itself and "e" by itself are vowels, then we want to map A E
201# to "a-e" so as not to round trip to AE. However, in the text Ki EO
202# IEUNG E we don't need to map to "keo-e". "keoe" suffices. For
203# vowels of the form "aei", both "ae" + "i" and "a" + "ei" must be
204# tested. NOTE: These rules used to have a left context of
205# $latinInitial instead of [^$latinMedial]. The problem with this is
206# sequences where an initial IEUNG is transliterated away:
207# (IEUNG)(A)(IEUNG)(EO) => aeo => (IEUNG)(AE)(IEUNG)(O)
208
209 $sep < [^$latinMedial] [y w] e {} [$O $OE]; # ye/we then O or OE would read back as yeo/weo
210 $sep < [^$latinMedial] e {} [$O $OE $U]; # e then O, OE or U would read back as eo/eu
211 $sep < [^$latinMedial] [o a] {} [$E $EO $EU]; # o/a then E, EO or EU would read back as oe/ae
212 $sep < [^$latinMedial] [w y] a {} [$E $EO $EU]; # wa/ya then E, EO or EU would read back as wae/yae
213
214# Similar to the above, but with an intervening $IEUNG.
215
216 $sep < [^$latinMedial] [y w] e {} $IEUNG [$O $OE]; # same four cases; IEUNG is deleted on output, so the vowels would still abut
217 $sep < [^$latinMedial] e {} $IEUNG [$O $OE $U];
218 $sep < [^$latinMedial] [o a] {} $IEUNG [$E $EO $EU];
219 $sep < [^$latinMedial] [w y] a {} $IEUNG [$E $EO $EU];
220
221# Single finals followed by IEUNG. The jamo sequence A Xf IEUNG E,
222# where Xi also exists, must be transliterated as "ax-e" to prevent
223# the round trip conversion to A Xi E.
224
225 $sep < $latinMedialEnd b {} $IEUNG $jamoMedial; # one rule per Latin letter that is both a single final and a single initial
226 $sep < $latinMedialEnd c {} $IEUNG $jamoMedial;
227 $sep < $latinMedialEnd d {} $IEUNG $jamoMedial;
228 $sep < $latinMedialEnd g {} $IEUNG $jamoMedial;
229 $sep < $latinMedialEnd h {} $IEUNG $jamoMedial;
230 $sep < $latinMedialEnd j {} $IEUNG $jamoMedial;
231 $sep < $latinMedialEnd k {} $IEUNG $jamoMedial;
232 $sep < $latinMedialEnd m {} $IEUNG $jamoMedial;
233 $sep < $latinMedialEnd n {} $IEUNG $jamoMedial;
234 $sep < $latinMedialEnd p {} $IEUNG $jamoMedial;
235 $sep < $latinMedialEnd s {} $IEUNG $jamoMedial;
236 $sep < $latinMedialEnd t {} $IEUNG $jamoMedial;
237
238# Double finals followed by IEUNG. Similar to the single finals
239# followed by IEUNG. Any latin consonant pair X Y, between medials,
240# that we would split by Latin-Jamo, we must handle when it occurs as
241# part of A XYf IEUNG E, to prevent round trip conversion to A Xf Yi
242# E.
243
244 $sep < $latinMedialEnd b s {} $IEUNG $jamoMedial; # one rule for every final spelled with two Latin letters
245 $sep < $latinMedialEnd g g {} $IEUNG $jamoMedial;
246 $sep < $latinMedialEnd g s {} $IEUNG $jamoMedial;
247 $sep < $latinMedialEnd l b {} $IEUNG $jamoMedial;
248 $sep < $latinMedialEnd l g {} $IEUNG $jamoMedial;
249 $sep < $latinMedialEnd l h {} $IEUNG $jamoMedial;
250 $sep < $latinMedialEnd l m {} $IEUNG $jamoMedial;
251 $sep < $latinMedialEnd l p {} $IEUNG $jamoMedial;
252 $sep < $latinMedialEnd l s {} $IEUNG $jamoMedial;
253 $sep < $latinMedialEnd l t {} $IEUNG $jamoMedial;
254 $sep < $latinMedialEnd n g {} $IEUNG $jamoMedial;
255 $sep < $latinMedialEnd n h {} $IEUNG $jamoMedial;
256 $sep < $latinMedialEnd n j {} $IEUNG $jamoMedial;
257 $sep < $latinMedialEnd s s {} $IEUNG $jamoMedial;
258
259# Split doubles. Text of the form A Xf Xi E, where XXi also occurs,
260# we transliterate as "ax-xe" to prevent round trip transliteration as
261# A XXi E.
262
263 $sep < $latinMedialEnd b {} $Bi $jamoMedial; # b d j g s: the letters whose doubled form is an initial (BB DD JJ GGi SSi)
264 $sep < $latinMedialEnd d {} $Di $jamoMedial;
265 $sep < $latinMedialEnd j {} $Ji $jamoMedial;
266 $sep < $latinMedialEnd g {} $Gi $jamoMedial;
267 $sep < $latinMedialEnd s {} $Si $jamoMedial;
268
269# XYY. This corresponds to the XYY rule in Latin-Jamo. By default
270# Latin-Jamo maps "xyy" to Xf YYi, to keep YY together. As a result,
271# "xyy" forms that correspond to XYf Yi must be transliterated as
272# "xy-y".
273
274 $sep < $latinMedialEnd b s {} [$Si $SSi]; # the following jamo may be the single or the doubled initial of the second letter
275 $sep < $latinMedialEnd g s {} [$Si $SSi];
276 $sep < $latinMedialEnd l b {} [$Bi $BB];
277 $sep < $latinMedialEnd l g {} [$Gi $GGi];
278 $sep < $latinMedialEnd l s {} [$Si $SSi];
279 $sep < $latinMedialEnd n g {} [$Gi $GGi];
280 $sep < $latinMedialEnd n j {} [$Ji $JJ];
281
282# Deletion of IEUNG is handled below.
283
284#----------------------------------------------------------------------
285# Latin-Jamo
286
287# [Basic, context-free Jamo-Latin rules are embedded here too. See
288# above.]
289
290# Split digraphs: Text of the form 'axye', where 'xy' is a final
291# digraph, 'x' is a final (by itself), 'y' is an initial, and 'a' and
292# 'e' are medials, we want to transliterate this as A Xf Yi E rather
293# than A XYf IEUNG E. We do NOT include text of the form "axxe",
294# since that is handled differently below. These rules are generated
295# programmatically from the jamo data.
296
297 $jamoMedial {b s} $latinMedial > $Bf $Si; # forward only; the braces hold the replaced text, the rest is context
298 $jamoMedial {g s} $latinMedial > $Gf $Si;
299 $jamoMedial {l b} $latinMedial > $L $Bi;
300 $jamoMedial {l g} $latinMedial > $L $Gi;
301 $jamoMedial {l h} $latinMedial > $L $Hi;
302 $jamoMedial {l m} $latinMedial > $L $Mi;
303 $jamoMedial {l p} $latinMedial > $L $Pi;
304 $jamoMedial {l s} $latinMedial > $L $Si;
305 $jamoMedial {l t} $latinMedial > $L $Ti;
306 $jamoMedial {n g} $latinMedial > $Nf $Gi;
307 $jamoMedial {n h} $latinMedial > $Nf $Hi;
308 $jamoMedial {n j} $latinMedial > $Nf $Ji;
309
310# Single consonants are initials: Text of the form 'axe', where 'x'
311# can be an initial or a final, and 'a' and 'e' are medials, we want
312# to transliterate as A Xi E rather than A Xf IEUNG E.
313
314 $jamoMedial {b} $latinMedial > $Bi; # x between a converted medial and a Latin vowel becomes the initial Xi
315 $jamoMedial {c} $latinMedial > $Ci;
316 $jamoMedial {d} $latinMedial > $Di;
317 $jamoMedial {g} $latinMedial > $Gi;
318 $jamoMedial {h} $latinMedial > $Hi;
319 $jamoMedial {j} $latinMedial > $Ji;
320 $jamoMedial {k} $latinMedial > $Ki;
321 $jamoMedial {m} $latinMedial > $Mi;
322 $jamoMedial {n} $latinMedial > $Ni;
323 $jamoMedial {p} $latinMedial > $Pi;
324 $jamoMedial {s} $latinMedial > $Si;
325 $jamoMedial {t} $latinMedial > $Ti;
326
327# Doubled initials. The sequence "axxe", where XX exists as an initial
328# (XXi), and also Xi and Xf exist (true of all digraphs XX), we want
329# to transliterate as A XXi E, rather than split to A Xf Xi E.
330
331 $jamoMedial {b b} $latinMedial > $BB; # xx between vowels becomes the single doubled initial
332 $jamoMedial {d d} $latinMedial > $DD;
333 $jamoMedial {j j} $latinMedial > $JJ;
334 $jamoMedial {g g} $latinMedial > $GGi;
335 $jamoMedial {s s} $latinMedial > $SSi;
336
337# XYY. Because doubled consonants bind more strongly than XY
338# consonants, we must handle the sequence "axyy" specially. Here XYf
339# and YYi must exist. In these cases, we map to Xf YYi rather than
340# XYf.
341
342 $jamoMedial {b} s s > $Bf; # only the first letter is rewritten; the doubled pair after it is context and stays Latin for later rules
343 $jamoMedial {g} s s > $Gf;
344 $jamoMedial {l} b b > $L;
345 $jamoMedial {l} g g > $L;
346 $jamoMedial {l} s s > $L;
347 $jamoMedial {n} g g > $Nf;
348 $jamoMedial {n} j j > $Nf;
349
350# Finals: Attach consonant with preceding medial to preceding medial.
351# Do this BEFORE mapping consonants to initials. Longer keys must
352# precede shorter keys that they start with, e.g., the rule for 'bs'
353# must precede 'b'.
354
355# [BASIC Jamo-Latin FINALS handled here. Order irrelevant within this
356# block for Jamo-Latin.]
357
358 $jamoMedial {bs} <> $BS; # two-way rules: each also gives the Jamo-Latin spelling of the final
359 $jamoMedial {b} <> $Bf;
360 $jamoMedial {c} <> $Cf;
361 $jamoMedial {d} <> $Df;
362 $jamoMedial {gg} <> $GGf;
363 $jamoMedial {gs} <> $GS;
364 $jamoMedial {g} <> $Gf;
365 $jamoMedial {h} <> $Hf;
366 $jamoMedial {j} <> $Jf;
367 $jamoMedial {k} <> $Kf;
368 $jamoMedial {lb} <> $LB; $jamoMedial {lg} <> $LG; # two rules share this line; each ends at its own semicolon
369 $jamoMedial {lh} <> $LH;
370 $jamoMedial {lm} <> $LM;
371 $jamoMedial {lp} <> $LP;
372 $jamoMedial {ls} <> $LS;
373 $jamoMedial {lt} <> $LT;
374 $jamoMedial {l} <> $L;
375 $jamoMedial {m} <> $Mf;
376 $jamoMedial {ng} <> $NG;
377 $jamoMedial {nh} <> $NH;
378 $jamoMedial {nj} <> $NJ;
379 $jamoMedial {n} <> $Nf;
380 $jamoMedial {p} <> $Pf;
381 $jamoMedial {ss} <> $SSf;
382 $jamoMedial {s} <> $Sf;
383 $jamoMedial {t} <> $Tf;
384
385# Initials: Attach single consonant to following medial. Do this
386# AFTER mapping finals. Longer keys must precede shorter keys that
387# they start with, e.g., the rule for 'gg' must precede 'g'.
388
389# [BASIC Jamo-Latin INITIALS handled here. Order irrelevant within
390# this block for Jamo-Latin.]
391
392 {gg} $latinMedial <> $GGi; # two-way rules: each also gives the Jamo-Latin spelling of the initial
393 {g} $latinMedial <> $Gi;
394 {n} $latinMedial <> $Ni;
395 {dd} $latinMedial <> $DD;
396 {d} $latinMedial <> $Di;
397 {r} $latinMedial <> $R;
398 {m} $latinMedial <> $Mi;
399 {bb} $latinMedial <> $BB;
400 {b} $latinMedial <> $Bi;
401 {ss} $latinMedial <> $SSi;
402 {s} $latinMedial <> $Si;
403 {jj} $latinMedial <> $JJ;
404 {j} $latinMedial <> $Ji;
405 {c} $latinMedial <> $Ci;
406 {k} $latinMedial <> $Ki;
407 {t} $latinMedial <> $Ti;
408 {p} $latinMedial <> $Pi;
409 {h} $latinMedial <> $Hi;
410
411# 'r' in final position. Because of the equivalency of the 'l' and
412# 'r' jamo (the glyphs are the same), we try to provide the same
413# equivalency in Latin-Jamo. The 'l' to 'r' conversion is handled
414# below. If we see an 'r' in an apparent final position, treat it
415# like 'l'. For example, "karka" => Ki A R EU Ki A without this rule.
416# Instead, we want Ki A L Ki A.
417
418 $jamoMedial {r} $latinInitial > | l; # r becomes Latin l with the cursor (|) before it, so the l is matched again as a final
419
420# Initial + Final: If we match the next rule, we have initial then
421# final consonant with no intervening medial. We insert the null
422# vowel BEFORE it to create a well-formed syllable. (In the next rule
423# we insert a null vowel AFTER an anomalous initial.)
424
425 $jamoInitial {} [bcdghjklmnpst] > $EU; # empty key: nothing is consumed, EU is inserted between the two contexts
426
427# Initial + X: This block matches an initial consonant not followed by
428# a medial. We insert the null vowel after it. We handle double
429# initials explicitly here; for single initial consonants we insert EU
430# (as Latin) after them and let standard rules do the rest.
431
432# BREAKS ROUND TRIP INTEGRITY
433
434 gg > $GGi $EU; # doubled initials get their jamo and the null vowel directly
435 dd > $DD $EU;
436 bb > $BB $EU;
437 ss > $SSi $EU;
438 jj > $JJ $EU;
439
440 ([bcdghjkmnprst]) > | $1 eu; # $1 is the captured consonant; the cursor stays before it so the consonant plus eu is matched again
441
442# X + Final: Finally we have to deal with a consonant that can only be
443# interpreted as a final (not an initial) and which is preceded
444# neither by an initial nor a medial. It is the start of the
445# syllable, but cannot be. Most of these will already be handled by
446# the above rules. 'bs' splits into Bi EU Sf. Similar for 'gs' 'ng'
447# 'nh' 'nj'. The only problem is 'l' and digraphs starting with 'l'.
448# For this isolated case, we could add a null initial and medial,
449# which would give "la" => IEUNG EU L IEUNG A, for example. A more
450# economical solution is to transliterate isolated "l" (that is,
451# initial "l") to "r". (Other similar conversions of consonants that
452# occur neither as initials nor as finals are handled below.)
453
454 l > | r; # rewritten as Latin r with the cursor before it, so it is matched again as an initial
455
456# Medials. If a medial is preceded by an initial, then we proceed
457# normally. As usual, longer keys must precede shorter ones.
458
459# [BASIC Jamo-Latin MEDIALS handled here. Order irrelevant within
460# this block for Jamo-Latin.]
461
462 $jamoInitial {ae} <> $AE; # two-way rules: each also gives the Jamo-Latin spelling of the medial
463 $jamoInitial {a} <> $A;
464 $jamoInitial {eo} <> $EO;
465 $jamoInitial {eu} <> $EU;
466 $jamoInitial {e} <> $E;
467 $jamoInitial {i} <> $I;
468 $jamoInitial {oe} <> $OE;
469 $jamoInitial {o} <> $O;
470 $jamoInitial {u} <> $U;
471 $jamoInitial {wae} <> $WAE;
472 $jamoInitial {wa} <> $WA;
473 $jamoInitial {weo} <> $WEO;
474 $jamoInitial {we} <> $WE;
475 $jamoInitial {wi} <> $WI;
476 $jamoInitial {yae} <> $YAE;
477 $jamoInitial {ya} <> $YA;
478 $jamoInitial {yeo} <> $YEO;
479 $jamoInitial {ye} <> $YE;
480 $jamoInitial {yi} <> $YI;
481 $jamoInitial {yo} <> $YO;
482 $jamoInitial {yu} <> $YU;
483
484# We may see an anomalous isolated 'w' or 'y'. In that case, we
485# interpret it as 'wi' and 'yu', respectively.
486
487# BREAKS ROUND TRIP INTEGRITY
488
489 $jamoInitial {w} > | wi; # lone w after an initial is expanded to Latin wi and matched again
490 $jamoInitial {y} > | yu; # lone y after an initial is expanded to Latin yu and matched again
491
492# Otherwise, insert a null consonant IEUNG before the medial (which is
493# still an untransliterated latin vowel).
494
495 ($latinMedial) > $IEUNG | $1; # $1 is the captured vowel; it is left after the cursor so the medial rules convert it next
496
497# Convert non-jamo latin consonants to equivalents. These occur as
498# neither initials nor finals in jamo. 'l' occurs as a final, but not
499# an initial; it is handled above. The following letters (left hand
500# side) will never be output by Jamo-Latin.
501
502 f > | p; # cursor is placed before each replacement, so the substituted letter is matched again
503 q > | k;
504 v > | b;
505 x > | ks;
506 z > | s;
507
508# Delete separators (Latin-Jamo).
509
510 $sep > ; # forward only: any separator still present at this point is dropped
511
512# Delete null consonants (Jamo-Latin). Do NOT delete null EU vowels,
513# since these may also occur in text.
514
515 < $IEUNG; # reverse only: IEUNG produces no Latin output
516
517#- N.B. DO NOT put any filters, NFD, etc. here -- those are aliased in
518#- the INDEX file. This transliterator is, by itself, not
519#- instantiated. It is used as a part of Latin-Jamo, Latin-Hangul, or
520#- inverses thereof.
521
522# eof