[apple/icu.git] / icuSources / data / translit / Latin_Jamo.txt

#--------------------------------------------------------------------
# Copyright (c) 1999-2004, International Business Machines
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------

#- N.B. DO NOT put any filters, NFD, etc. here -- those are aliased in
#- the INDEX file.  This transliterator is, by itself, not
#- instantiated.  It is used as a part of Latin-Jamo, Latin-Hangul, or
#- inverses thereof.

# Transliteration from Latin characters to Korean script is done in
# two steps: Latin to Jamo, then Jamo to Hangul.  The Jamo-Hangul
# transliteration is done algorithmically following Unicode 3.0
# section 3.11.  This file implements the Latin to Jamo
# transliteration using rules.

# Jamo occupy the block 1100-11FF.  Within this block there are three
# groups of characters: initial consonants or choseong (I), medial
# vowels or jungseong (M), and trailing consonants or jongseong (F).
# Standard Korean syllables are of the form I+M+F*.

# Section 3.11 describes the use of 'filler' jamo to convert
# nonstandard syllables to standard form: the choseong filler 115F and
# the jungseong filler 1160.  In this transliterator, we will not use
# 115F or 1160.

# We will, however, insert two 'null' jamo to make foreign words
# conform to Korean syllable structure.  These are the null initial
# consonant 110B (IEUNG) and the null vowel 1173 (EU).  In Latin text,
# we will use the separator in order to disambiguate strings,
# e.g. "kan-ggan" (initial GG) vs. "kanggan" (final NG + initial G).
# (Examples in these comments write the separator as "-"; the actual
# separator character is the one assigned to $sep below.)

# We will not use all of the characters in the jamo block.  We will
# only use the 19 initials, 21 medials, and 27 finals possessing a
# jamo short name as defined in section 4.4 of the Unicode book.

# Rules of thumb.  These guidelines provide the basic framework
# for the rules.  They are phrased in terms of Latin-Jamo transliteration.
# The Jamo-Latin rules derive from these, since the Jamo-Latin rules are
# just context-free transliteration of jamo to corresponding short names,
# with the addition of separators to maintain round-trip integrity
# in the context of the Latin-Jamo rules.

# A sequence of vowels:
# - Take the longest sequence you can. If there are too many, or you don't
#   have a starting consonant, introduce a 110B (IEUNG) as necessary.

# A sequence of consonants.
# - First join the double consonants: G + G -> GG
# - In the remaining list,
# -- If there is no preceding vowel, take the first consonant, and insert EU
#    after it. Continue with the rest of the consonants.
# -- If there is one consonant, attach to the following vowel
# -- If there are two consonants and a following vowel, attach one to the
#    preceding vowel, and one to the following vowel.
# -- If there are more than two consonants, join the first two together if you
#    can: L + G => LG
# -- If you still end up with more than 2 consonants, insert EU after the
#    first one, and continue with the rest of the consonants.

#----------------------------------------------------------------------
# Variables

# Some latin consonants or consonant pairs only occur as initials, and
# some only as finals, but some occur as both.  This makes some jamo
# consonants ambiguous when transliterated into latin.
#   Initial only: IEUNG BB DD JJ R
#   Final only: BS GS L LB LG LH LM LP LS LT NG NH NJ
#   Initial and Final: B C D G GG H J K M N P S SS T

  # Initial consonants (choseong), U+1100..U+1112.  A trailing "i" in a
  # name marks a consonant whose latin spelling is also used for a final
  # (see the matching "f" variables); DD, R, BB, JJ and IEUNG occur only
  # as initials.
  $Gi = \u1100;
  $GGi = \u1101;
  $Ni = \u1102;
  $Di = \u1103;
  $DD = \u1104;
  $R = \u1105;
  $Mi = \u1106;
  $Bi = \u1107;
  $BB = \u1108;
  $Si = \u1109;
  $SSi = \u110A;
  $IEUNG = \u110B; # null initial, inserted during Latin-Jamo
  $Ji = \u110C;
  $JJ = \u110D;
  $Ci = \u110E;
  $Ki = \u110F;
  $Ti = \u1110;
  $Pi = \u1111;
  $Hi = \u1112;

  # Medial vowels (jungseong), U+1161..U+1175.  Each variable is named
  # after the latin spelling that the medial rules use for that vowel.
  $A = \u1161;
  $AE = \u1162;
  $YA = \u1163;
  $YAE = \u1164;
  $EO = \u1165;
  $E = \u1166;
  $YEO = \u1167;
  $YE = \u1168;
  $O = \u1169;
  $WA = \u116A;
  $WAE = \u116B;
  $OE = \u116C;
  $YO = \u116D;
  $U = \u116E;
  $WEO = \u116F;
  $WE = \u1170;
  $WI = \u1171;
  $YU = \u1172;
  $EU = \u1173; # null medial, inserted during Latin-Jamo
  $YI = \u1174;
  $I = \u1175;

  # Final consonants (jongseong), U+11A8..U+11C2.  A trailing "f" in a
  # name marks a final whose latin spelling is shared with an initial
  # (see the matching "i" variables); the other names (GS, NJ, NH, L,
  # LG, LM, LB, LS, LT, LP, LH, BS, NG) occur only as finals.
  $Gf = \u11A8;
  $GGf = \u11A9;
  $GS = \u11AA;
  $Nf = \u11AB;
  $NJ = \u11AC;
  $NH = \u11AD;
  $Df = \u11AE;
  $L = \u11AF;
  $LG = \u11B0;
  $LM = \u11B1;
  $LB = \u11B2;
  $LS = \u11B3;
  $LT = \u11B4;
  $LP = \u11B5;
  $LH = \u11B6;
  $Mf = \u11B7;
  $Bf = \u11B8;
  $BS = \u11B9;
  $Sf = \u11BA;
  $SSf = \u11BB;
  $NG = \u11BC;
  $Jf = \u11BD;
  $Cf = \u11BE;
  $Kf = \u11BF;
  $Tf = \u11C0;
  $Pf = \u11C1;
  $Hf = \u11C2;
  
  # Every initial jamo used by this file (the 19 code points U+1100..U+1112)
  $jamoInitial = [\u1100-\u1112];

  # Every medial jamo used by this file (the 21 code points U+1161..U+1175)
  $jamoMedial = [\u1161-\u1175];

  # Latin letters that begin the spelling of an initial consonant
  $latinInitial = [bcdghjkmnprst];

  # Any character in the latin transliteration of a medial
  $latinMedial = [aeiouwy];

  # The last character of the latin transliteration of a medial
  $latinMedialEnd = [aeiou];

  # Disambiguation separator: an escaped literal apostrophe.  (Examples in
  # the comments of this file write the separator as "-".)
  $sep = \';

#----------------------------------------------------------------------
# Jamo-Latin

# Jamo to latin is relatively simple, since it is the latin that is
# ambiguous.  Most rules are straightforward, and we encode them below
# by simply adding the reverse direction to the Latin-Jamo rule, e.g.:

#   $jamoMedial {bs} > $BS;

# becomes

#   $jamoMedial {bs} <> $BS;

# Furthermore, we don't care about the ordering for Jamo-Latin because
# we are going from single characters, so we can very easily piggyback
# on the Latin-Jamo.

# The main issue with Jamo-Latin is when to insert separators.
# Separators are inserted to obtain correct round trip behavior.  For
# example, the sequence Ki A Gf Gi E, if transliterated to "kagge",
# would then round trip to Ki A GGi E.  To prevent this, we insert a
# separator: "kag-ge".  IMPORTANT: The need for separators depends
# very specifically on the behavior of the Latin-Jamo rules.  A change
# in the Latin-Jamo behavior can completely change the way the
# separator insertion must be done.

# First try to preserve actual separators in the jamo text by doubling
# them.  This fixes problems like:
# (Di)(A)(Ji)(U)(NG)-(IEUNG)(YEO)(Nf)(Gi)(YEO)(L) => dajung-yeongyeol
# => (Di)(A)(Ji)(U)(NG)(IEUNG)(YEO)(Nf)(Gi)(YEO)(L).  This is optional
# -- if we don't care about losing separators in the jamo, we can delete
# this rule.

  # Two-way rule.  Latin-Jamo: a doubled separator becomes one literal
  # separator.  Jamo-Latin: a literal separator is written doubled.
  $sep $sep <> $sep;

# Triple consonants.  For three consonants "axxx" we insert a
# separator between the first and second "x" if XXf, Xf, and Xi all
# exist, and we have A Xf XXi.  This prevents the reverse
# transliteration to A XXf Xi.

  # Reverse-only insertions (empty key): when the latin text so far ends in
  # a vowel letter plus "g" (resp. "s") and the next jamo is the doubled
  # initial GG (resp. SS), emit a separator before converting that jamo.
  $sep < $latinMedialEnd g {} $GGi;
  $sep < $latinMedialEnd s {} $SSi;

# For vowels the rule is similar.  If there is a vowel "ae" such that
# "a" by itself and "e" by itself are vowels, then we want to map A E
# to "a-e" so as not to round trip to AE.  However, in the text Ki EO
# IEUNG E we don't need to map to "keo-e".  "keoe" suffices.  For
# vowels of the form "aei", both "ae" + "i" and "a" + "ei" must be
# tested.  NOTE: These rules used to have a left context of
# $latinInitial instead of [^$latinMedial].  The problem with this is
# sequences where an initial IEUNG is transliterated away:
#   (IEUNG)(A)(IEUNG)(EO) => aeo => (IEUNG)(AE)(IEUNG)(O)

  # Reverse-only insertions (empty key).  Left context: a non-vowel
  # character followed by a complete latin vowel spelling; right context:
  # the next medial jamo.  A separator is emitted where simply appending
  # the next vowel's spelling would form a longer vowel spelling, e.g.
  # "ye"+"o" -> "yeo", "e"+"u" -> "eu", "a"+"e" -> "ae", "wa"+"e" -> "wae".
  $sep < [^$latinMedial] [y w] e {} [$O $OE];
  $sep < [^$latinMedial] e {} [$O $OE $U];
  $sep < [^$latinMedial] [o a] {} [$E $EO $EU];
  $sep < [^$latinMedial] [w y] a {} [$E $EO $EU];

# Similar to the above, but with an intervening $IEUNG.

  # Same four cases; the right context additionally starts with IEUNG,
  # which produces no latin output of its own.
  $sep < [^$latinMedial] [y w] e {} $IEUNG [$O $OE];
  $sep < [^$latinMedial] e {} $IEUNG [$O $OE $U];
  $sep < [^$latinMedial] [o a] {} $IEUNG [$E $EO $EU];
  $sep < [^$latinMedial] [w y] a {} $IEUNG [$E $EO $EU];

# Single finals followed by IEUNG.  The jamo sequence A Xf IEUNG E,
# where Xi also exists, must be transliterated as "ax-e" to prevent
# the round trip conversion to A Xi E.

  # Reverse-only insertions (empty key), one per single latin consonant
  # that spells both a final and an initial.  Left context: vowel letter
  # plus that consonant; right context: IEUNG followed by a medial.
  $sep < $latinMedialEnd b {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd c {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd d {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd g {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd h {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd j {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd k {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd m {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd n {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd p {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd s {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd t {} $IEUNG $jamoMedial;

# Double finals followed by IEUNG.  Similar to the single finals
# followed by IEUNG.  Any latin consonant pair X Y, between medials,
# that we would split by Latin-Jamo, we must handle when it occurs as
# part of A XYf IEUNG E, to prevent round trip conversion to A Xf Yi
# E.

  # Reverse-only insertions (empty key), one per two-letter final
  # spelling.  Left context: vowel letter plus the two consonant letters;
  # right context: IEUNG followed by a medial.
  $sep < $latinMedialEnd b s {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd g g {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd g s {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd l b {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd l g {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd l h {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd l m {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd l p {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd l s {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd l t {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd n g {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd n h {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd n j {} $IEUNG $jamoMedial;
  $sep < $latinMedialEnd s s {} $IEUNG $jamoMedial;

# Split doubles.  Text of the form A Xf Xi E, where XXi also occurs,
# we transliterate as "ax-xe" to prevent round trip transliteration as
# A XXi E.

  # Reverse-only insertions (empty key), one per consonant that has a
  # doubled initial (bb dd jj gg ss).  Left context: vowel letter plus the
  # consonant; right context: the same consonant as an initial jamo,
  # followed by a medial.
  $sep < $latinMedialEnd b {} $Bi $jamoMedial;
  $sep < $latinMedialEnd d {} $Di $jamoMedial;
  $sep < $latinMedialEnd j {} $Ji $jamoMedial;
  $sep < $latinMedialEnd g {} $Gi $jamoMedial;
  $sep < $latinMedialEnd s {} $Si $jamoMedial;

# XYY.  This corresponds to the XYY rule in Latin-Jamo.  By default
# Latin-Jamo maps "xyy" to Xf YYi, to keep YY together.  As a result,
# "xyy" forms that correspond to XYf Yi must be transliterated as
# "xy-y".

  # Reverse-only insertions (empty key).  Left context: vowel letter plus
  # a two-letter final spelling "xy"; right context: the single or doubled
  # initial whose spelling starts with the same letter "y".
  $sep < $latinMedialEnd b s {} [$Si $SSi];
  $sep < $latinMedialEnd g s {} [$Si $SSi];
  $sep < $latinMedialEnd l b {} [$Bi $BB];
  $sep < $latinMedialEnd l g {} [$Gi $GGi];
  $sep < $latinMedialEnd l s {} [$Si $SSi];
  $sep < $latinMedialEnd n g {} [$Gi $GGi];
  $sep < $latinMedialEnd n j {} [$Ji $JJ];

# Deletion of IEUNG is handled below.

#----------------------------------------------------------------------
# Latin-Jamo

# [Basic, context-free Jamo-Latin rules are embedded here too.  See
# above.]

# Split digraphs: Text of the form 'axye', where 'xy' is a final
# digraph, 'x' is a final (by itself), 'y' is an initial, and 'a' and
# 'e' are medials, we want to transliterate this as A Xf Yi E rather
# than A XYf IEUNG E.  We do NOT include text of the form "axxe",
# since that is handled differently below.  These rules are generated
# programmatically from the jamo data.

  # Forward-only.  Left context: an already-converted medial jamo; right
  # context: a latin vowel letter.  The two-letter key is replaced by a
  # final for its first letter and an initial for its second.
  $jamoMedial {b s} $latinMedial > $Bf $Si;
  $jamoMedial {g s} $latinMedial > $Gf $Si;
  $jamoMedial {l b} $latinMedial > $L $Bi;
  $jamoMedial {l g} $latinMedial > $L $Gi;
  $jamoMedial {l h} $latinMedial > $L $Hi;
  $jamoMedial {l m} $latinMedial > $L $Mi;
  $jamoMedial {l p} $latinMedial > $L $Pi;
  $jamoMedial {l s} $latinMedial > $L $Si;
  $jamoMedial {l t} $latinMedial > $L $Ti;
  $jamoMedial {n g} $latinMedial > $Nf $Gi;
  $jamoMedial {n h} $latinMedial > $Nf $Hi;
  $jamoMedial {n j} $latinMedial > $Nf $Ji;

# Single consonants are initials: Text of the form 'axe', where 'x'
# can be an initial or a final, and 'a' and 'e' are medials, we want
# to transliterate as A Xi E rather than A Xf IEUNG E.

  # Forward-only.  Left context: an already-converted medial jamo; right
  # context: a latin vowel letter.  The single consonant between them is
  # converted to its initial form.
  $jamoMedial {b} $latinMedial > $Bi;
  $jamoMedial {c} $latinMedial > $Ci;
  $jamoMedial {d} $latinMedial > $Di;
  $jamoMedial {g} $latinMedial > $Gi;
  $jamoMedial {h} $latinMedial > $Hi;
  $jamoMedial {j} $latinMedial > $Ji;
  $jamoMedial {k} $latinMedial > $Ki;
  $jamoMedial {m} $latinMedial > $Mi;
  $jamoMedial {n} $latinMedial > $Ni;
  $jamoMedial {p} $latinMedial > $Pi;
  $jamoMedial {s} $latinMedial > $Si;
  $jamoMedial {t} $latinMedial > $Ti;

# Doubled initials.  The sequence "axxe", where XX exists as an initial
# (XXi), and also Xi and Xf exist (true of all digraphs XX), we want
# to transliterate as A XXi E, rather than split to A Xf Xi E.

  # Forward-only.  Between a converted medial jamo and a latin vowel
  # letter, a doubled consonant becomes the single doubled-initial jamo.
  $jamoMedial {b b} $latinMedial > $BB;
  $jamoMedial {d d} $latinMedial > $DD;
  $jamoMedial {j j} $latinMedial > $JJ;
  $jamoMedial {g g} $latinMedial > $GGi;
  $jamoMedial {s s} $latinMedial > $SSi;

# XYY.  Because doubled consonants bind more strongly than XY
# consonants, we must handle the sequence "axyy" specially.  Here XYf
# and YYi must exist.  In these cases, we map to Xf YYi rather than
# XYf.

  # Forward-only.  Only the first consonant (inside the braces) is
  # converted, to its single final; the doubled pair after it is right
  # context and is left in place for later rules.
  $jamoMedial {b} s s > $Bf;
  $jamoMedial {g} s s > $Gf;
  $jamoMedial {l} b b > $L;
  $jamoMedial {l} g g > $L;
  $jamoMedial {l} s s > $L;
  $jamoMedial {n} g g > $Nf;
  $jamoMedial {n} j j > $Nf;

# Finals: A consonant that directly follows a medial is attached to
# that medial as a final.
# Do this BEFORE mapping consonants to initials.  Longer keys must
# precede shorter keys that they start with, e.g., the rule for 'bs'
# must precede 'b'.

# [BASIC Jamo-Latin FINALS handled here.  Order irrelevant within this
# block for Jamo-Latin.]

  # Two-way rules, one per line.
  # Forward (Latin-Jamo): a consonant spelling that directly follows a
  # converted medial jamo becomes the corresponding final.  Each
  # two-letter spelling is listed before the one-letter spelling it starts
  # with, so the longer key is tried first.
  # Reverse (Jamo-Latin): each final jamo maps back to its latin spelling.
  $jamoMedial {bs} <> $BS;
  $jamoMedial {b} <> $Bf;
  $jamoMedial {c} <> $Cf;
  $jamoMedial {d} <> $Df;
  $jamoMedial {gg} <> $GGf;
  $jamoMedial {gs} <> $GS;
  $jamoMedial {g} <> $Gf;
  $jamoMedial {h} <> $Hf;
  $jamoMedial {j} <> $Jf;
  $jamoMedial {k} <> $Kf;
  $jamoMedial {lb} <> $LB;
  $jamoMedial {lg} <> $LG;
  $jamoMedial {lh} <> $LH;
  $jamoMedial {lm} <> $LM;
  $jamoMedial {lp} <> $LP;
  $jamoMedial {ls} <> $LS;
  $jamoMedial {lt} <> $LT;
  $jamoMedial {l} <> $L;
  $jamoMedial {m} <> $Mf;
  $jamoMedial {ng} <> $NG;
  $jamoMedial {nh} <> $NH;
  $jamoMedial {nj} <> $NJ;
  $jamoMedial {n} <> $Nf;
  $jamoMedial {p} <> $Pf;
  $jamoMedial {ss} <> $SSf;
  $jamoMedial {s} <> $Sf;
  $jamoMedial {t} <> $Tf;

# Initials: Attach single consonant to following medial.  Do this
# AFTER mapping finals.  Longer keys must precede shorter keys that
# they start with, e.g., the rule for 'gg' must precede 'g'.

# [BASIC Jamo-Latin INITIALS handled here.  Order irrelevant within
# this block for Jamo-Latin.]

  # Two-way rules.  Forward: a consonant spelling followed by a latin
  # vowel letter (right context, not consumed) becomes the initial jamo;
  # doubled spellings precede the single letter they start with.
  # Reverse: each initial jamo maps back to its latin spelling.
  {gg} $latinMedial <> $GGi;
  {g} $latinMedial <> $Gi;
  {n} $latinMedial <> $Ni;
  {dd} $latinMedial <> $DD;
  {d} $latinMedial <> $Di;
  {r} $latinMedial <> $R;
  {m} $latinMedial <> $Mi;
  {bb} $latinMedial <> $BB;
  {b} $latinMedial <> $Bi;
  {ss} $latinMedial <> $SSi;
  {s} $latinMedial <> $Si;
  {jj} $latinMedial <> $JJ;
  {j} $latinMedial <> $Ji;
  {c} $latinMedial <> $Ci;
  {k} $latinMedial <> $Ki;
  {t} $latinMedial <> $Ti;
  {p} $latinMedial <> $Pi;
  {h} $latinMedial <> $Hi;

# 'r' in final position.  Because of the equivalency of the 'l' and
# 'r' jamo (the glyphs are the same), we try to provide the same
# equivalency in Latin-Jamo.  The 'l' to 'r' conversion is handled
# below.  If we see an 'r' in an apparent final position, treat it
# like 'l'.  For example, "karka" => Ki A R EU Ki A without this rule.
# Instead, we want Ki A L Ki A.

  # Forward-only.  An "r" between a converted medial jamo and a latin
  # initial letter is rewritten to "l"; the cursor mark (|) is placed
  # before the new "l" so it is matched again, now as a final.
  $jamoMedial {r} $latinInitial > | l;

# Initial + Final: If we match the next rule, we have initial then
# final consonant with no intervening medial.  We insert the null
# vowel BEFORE it to create a well-formed syllable.  (In the next rule
# we insert a null vowel AFTER an anomalous initial.)

  # Forward-only insertion (empty key): EU is emitted between an initial
  # jamo and a directly following latin consonant letter.
  $jamoInitial {} [bcdghjklmnpst] > $EU;

# Initial + X: This block matches an initial consonant not followed by
# a medial.  We insert the null vowel after it.  We handle double
# initials explicitly here; for single initial consonants we insert EU
# (as Latin) after them and let standard rules do the rest.

# BREAKS ROUND TRIP INTEGRITY

  # Forward-only.  Reached only when the initial rules (which require a
  # following latin vowel) did not match: a doubled consonant becomes its
  # doubled initial followed directly by EU.
  gg > $GGi $EU;
  dd > $DD $EU;
  bb > $BB $EU;
  ss > $SSi $EU;
  jj > $JJ $EU;

  # Any other single initial letter is rewritten as that letter plus the
  # latin text "eu"; the cursor mark (|) is placed before the rewritten
  # text so the initial and medial rules convert it.
  ([bcdghjkmnprst]) > | $1 eu;

# X + Final: Finally we have to deal with a consonant that can only be
# interpreted as a final (not an initial) and which is preceded
# neither by an initial nor a medial.  It is the start of the
# syllable, but cannot be.  Most of these will already be handled by
# the above rules.  'bs' splits into Bi EU Sf.  Similar for 'gs' 'ng'
# 'nh' 'nj'.  The only problem is 'l' and digraphs starting with 'l'.
# For this isolated case, we could add a null initial and medial,
# which would give "la" => IEUNG EU L IEUNG A, for example.  A more
# economical solution is to transliterate isolated "l" (that is,
# initial "l") to "r".  (Other similar conversions of consonants that
# occur neither as initials nor as finals are handled below.)

  # Forward-only.  An "l" not consumed by any earlier rule is rewritten to
  # "r"; the cursor mark (|) is placed before it so it is matched again.
  l > | r;

# Medials.  If a medial is preceded by an initial, then we proceed
# normally.  As usual, longer keys must precede shorter ones.

# [BASIC Jamo-Latin MEDIALS handled here.  Order irrelevant within
# this block for Jamo-Latin.]

  # Two-way rules.  Forward: a vowel spelling that directly follows an
  # initial jamo becomes the medial jamo; longer spellings are listed
  # before the shorter spellings they start with.  Reverse: each medial
  # jamo maps back to its latin spelling.
  $jamoInitial {ae} <> $AE;
  $jamoInitial {a} <> $A;
  $jamoInitial {eo} <> $EO;
  $jamoInitial {eu} <> $EU;
  $jamoInitial {e} <> $E;
  $jamoInitial {i} <> $I;
  $jamoInitial {oe} <> $OE;
  $jamoInitial {o} <> $O;
  $jamoInitial {u} <> $U;
  $jamoInitial {wae} <> $WAE;
  $jamoInitial {wa} <> $WA;
  $jamoInitial {weo} <> $WEO;
  $jamoInitial {we} <> $WE;
  $jamoInitial {wi} <> $WI;
  $jamoInitial {yae} <> $YAE;
  $jamoInitial {ya} <> $YA;
  $jamoInitial {yeo} <> $YEO;
  $jamoInitial {ye} <> $YE;
  $jamoInitial {yi} <> $YI;
  $jamoInitial {yo} <> $YO;
  $jamoInitial {yu} <> $YU;

# We may see an anomalous isolated 'w' or 'y'.  In that case, we
# interpret it as 'wi' and 'yu', respectively.

# BREAKS ROUND TRIP INTEGRITY

  # Forward-only.  Reached only when no w.../y... medial spelling above
  # matched.  The lone letter is expanded to "wi" / "yu" with the cursor
  # mark (|) before it, so the medial rules then convert the expansion.
  $jamoInitial {w} > | wi;
  $jamoInitial {y} > | yu;

# Otherwise, insert a null consonant IEUNG before the medial (which is
# still an untransliterated latin vowel).

  # Forward-only.  Emits IEUNG, then re-emits the captured vowel letter
  # ($1) after the cursor mark (|) so the medial rules, which require a
  # preceding initial jamo, can now match it.
  ($latinMedial) > $IEUNG | $1;

# Convert non-jamo latin consonants to equivalents.  These occur as
# neither initials nor finals in jamo.  'l' occurs as a final, but not
# an initial; it is handled above.  The following letters (left hand
# side) will never be output by Jamo-Latin.

  # Forward-only substitutions.  The cursor mark (|) is placed before the
  # replacement so the replacement letters are themselves transliterated.
  f > | p;
  q > | k;
  v > | b;
  x > | ks;
  z > | s;

# Delete separators (Latin-Jamo).

  # Forward-only rule with empty output: a separator not consumed by an
  # earlier rule is removed.
  $sep > ;

# Delete null consonants (Jamo-Latin).  Do NOT delete null EU vowels,
# since these may also occur in text.

  # Reverse-only rule with empty output: IEUNG produces no latin text.
  < $IEUNG;

#- N.B. DO NOT put any filters, NFD, etc. here -- those are aliased in
#- the INDEX file.  This transliterator is, by itself, not
#- instantiated.  It is used as a part of Latin-Jamo, Latin-Hangul, or
#- inverses thereof.

# eof
Commit	Line	Data
374ca955 A	1	#--------------------------------------------------------------------
	2	# Copyright (c) 1999-2004, International Business Machines
	3	# Corporation and others. All Rights Reserved.
	4	#--------------------------------------------------------------------
	5
	6	#- N.B. DO NOT put any filters, NFD, etc. here -- those are aliased in
	7	#- the INDEX file. This transliterator is, by itself, not
	8	#- instantiated. It is used as a part of Latin-Jamo, Latin-Hangul, or
	9	#- inverses thereof.
	10
	11	# Transliteration from Latin characters to Korean script is done in
	12	# two steps: Latin to Jamo, then Jamo to Hangul. The Jamo-Hangul
	13	# transliteration is done algorithmically following Unicode 3.0
	14	# section 3.11. This file implements the Latin to Jamo
	15	# transliteration using rules.
	16
	17	# Jamo occupy the block 1100-11FF. Within this block there are three
	18	# groups of characters: initial consonants or choseong (I), medial
	19	# vowels or jungseong (M), and trailing consonants or jongseong (F).
	20	# Standard Korean syllables are of the form I+M+F*.
	21
	22	# Section 3.11 describes the use of 'filler' jamo to convert
	23	# nonstandard syllables to standard form: the choseong filler 115F and
	24	# the junseong filler 1160. In this transliterator, we will not use
	25	# 115F or 1160.
	26
	27	# We will, however, insert two 'null' jamo to make foreign words
	28	# conform to Korean syllable structure. These are the null initial
	29	# consonant 110B (IEUNG) and the null vowel 1173 (EU). In Latin text,
	30	# we will use the separator in order to disambiguate strings,
	31	# e.g. "kan-ggan" (initial GG) vs. "kanggan" (final NG + initial G).
	32
	33	# We will not use all of the characters in the jamo block. We will
	34	# only use the 19 initials, 21 medials, and 27 finals possessing a
	35	# jamo short name as defined in section 4.4 of the Unicode book.
	36
	37	# Rules of thumb. These guidelines provide the basic framework
	38	# for the rules. They are phrased in terms of Latin-Jamo transliteration.
	39	# The Jamo-Latin rules derive from these, since the Jamo-Latin rules are
	40	# just context-free transliteration of jamo to corresponding short names,
	41	# with the addition of separators to maintain round-trip integrity
	42	# in the context of the Latin-Jamo rules.
	43
	44	# A sequence of vowels:
	45	# - Take the longest sequence you can. If there are too many, or you don't
	46	# have a starting consonant, introduce a 110B necessary.
	47
	48	# A sequence of consonants.
	49	# - First join the double consonants: G + G -> GG
	50	# - In the remaining list,
	51	# -- If there is no preceding vowel, take the first consonant, and insert EU
	52	# after it. Continue with the rest of the consonants.
	53	# -- If there is one consonant, attach to the following vowel
	54	# -- If there are two consonants and a following vowel, attach one to the
	55	# preceeding vowel, and one to the following vowel.
	56	# -- If there are more than two consonants, join the first two together if you
	57	# can: L + G => LG
	58	# -- If you still end up with more than 2 consonants, insert EU after the
	59	# first one, and continue with the rest of the consonants.
	60
	61	#----------------------------------------------------------------------
	62	# Variables
	63
	64	# Some latin consonants or consonant pairs only occur as initials, and
65	# some only as finals, but some occur as both. This makes some jamo
66	# consonants ambiguous when transliterated into latin.
67	# Initial only: IEUNG BB DD JJ R
68	# Final only: BS GS L LB LG LH LM LP LS LT NG NH NJ
69	# Initial and Final: B C D G GG H J K M N P S SS T
70
71	$Gi = \u1100;
72	$GGi = \u1101;
73	$Ni = \u1102;
74	$Di = \u1103;
75	$DD = \u1104;
76	$R = \u1105;
77	$Mi = \u1106;
78	$Bi = \u1107;
79	$BB = \u1108;
80	$Si = \u1109;
81	$SSi = \u110A;
82	$IEUNG = \u110B; # null initial, inserted during Latin-Jamo
83	$Ji = \u110C;
84	$JJ = \u110D;
85	$Ci = \u110E;
86	$Ki = \u110F;
87	$Ti = \u1110;
88	$Pi = \u1111;
89	$Hi = \u1112;
90
91	$A = \u1161;
92	$AE = \u1162;
93	$YA = \u1163;
94	$YAE = \u1164;
95	$EO = \u1165;
96	$E = \u1166;
97	$YEO = \u1167;
98	$YE = \u1168;
99	$O = \u1169;
100	$WA = \u116A;
101	$WAE = \u116B;
102	$OE = \u116C;
103	$YO = \u116D;
104	$U = \u116E;
105	$WEO = \u116F;
106	$WE = \u1170;
107	$WI = \u1171;
108	$YU = \u1172;
109	$EU = \u1173; # null medial, inserted during Latin-Jamo
110	$YI = \u1174;
111	$I = \u1175;
112
113	$Gf = \u11A8;
114	$GGf = \u11A9;
115	$GS = \u11AA;
116	$Nf = \u11AB;
117	$NJ = \u11AC;
118	$NH = \u11AD;
119	$Df = \u11AE;
120	$L = \u11AF;
121	$LG = \u11B0;
122	$LM = \u11B1;
123	$LB = \u11B2;
124	$LS = \u11B3;
125	$LT = \u11B4;
126	$LP = \u11B5;
127	$LH = \u11B6;
128	$Mf = \u11B7;
129	$Bf = \u11B8;
130	$BS = \u11B9;
131	$Sf = \u11BA;
132	$SSf = \u11BB;
133	$NG = \u11BC;
134	$Jf = \u11BD;
135	$Cf = \u11BE;
136	$Kf = \u11BF;
137	$Tf = \u11C0;
138	$Pf = \u11C1;
139	$Hf = \u11C2;
140
141	$jamoInitial = [\u1100-\u1112];
142
143	$jamoMedial = [\u1161-\u1175];
144
145	$latinInitial = [bcdghjkmnprst];
146
147	# Any character in the latin transliteration of a medial
148	$latinMedial = [aeiouwy];
149
150	# The last character of the latin transliteration of a medial
151	$latinMedialEnd = [aeiou];
152
153	# Disambiguation separator
154	$sep = \';
155
156	#----------------------------------------------------------------------
157	# Jamo-Latin
158
159	# Jamo to latin is relatively simple, since it is the latin that is
160	# ambiguous. Most rules are straightforward, and we encode them below
161	# as simple add-on back rule, e.g.:
162
163	# $jamoMedial {bs} > $BS;
164
165	# becomes
166
167	# $jamoMedial {bs} <> $BS;
168
169	# Furthermore, we don't care about the ordering for Jamo-Latin because
170	# we are going from single characters, so we can very easily piggyback
171	# on the Latin-Jamo.
172
173	# The main issue with Jamo-Latin is when to insert separators.
174	# Separators are inserted to obtain correct round trip behavior. For
175	# example, the sequence Ki A Gf Gi E, if transliterated to "kagge",
176	# would then round trip to Ki A GGi E. To prevent this, we insert a
177	# separator: "kag-ge". IMPORTANT: The need for separators depends
178	# very specifically on the behavior of the Latin-Jamo rules. A change
179	# in the Latin-Jamo behavior can completely change the way the
180	# separator insertion must be done.
181
182	# First try to preserve actual separators in the jamo text by doubling
183	# them. This fixes problems like:
184	# (Di)(A)(Ji)(U)(NG)-(IEUNG)(YEO)(Nf)(Gi)(YEO)(L) => dajung-yeongyeol
185	# => (Di)(A)(Ji)(U)(NG)(IEUNG)(YEO)(Nf)(Gi)(YEO)(L). This is optional
186	# -- if we don't care about losing separators in the jamo, we can delete
187	# this rule.
188
189	$sep $sep <> $sep;
190
191	# Triple consonants. For three consonants "axxx" we insert a
192	# separator between the first and second "x" if XXf, Xf, and Xi all
193	# exist, and we have A Xf XXi. This prevents the reverse
194	# transliteration to A XXf Xi.
195
196	$sep < $latinMedialEnd g {} $GGi;
197	$sep < $latinMedialEnd s {} $SSi;
198
199	# For vowels the rule is similar. If there is a vowel "ae" such that
200	# "a" by itself and "e" by itself are vowels, then we want to map A E
201	# to "a-e" so as not to round trip to AE. However, in the text Ki EO
202	# IEUNG E we don't need to map to "keo-e". "keoe" suffices. For
203	# vowels of the form "aei", both "ae" + "i" and "a" + "ei" must be
204	# tested. NOTE: These rules used to have a left context of
205	# $latinInitial instead of [^$latinMedial]. The problem with this is
206	# sequences where an initial IEUNG is transliterated away:
207	# (IEUNG)(A)(IEUNG)(EO) => aeo => (IEUNG)(AE)(IEUNG)(O)
208
209	$sep < [^$latinMedial] [y w] e {} [$O $OE];
210	$sep < [^$latinMedial] e {} [$O $OE $U];
211	$sep < [^$latinMedial] [o a] {} [$E $EO $EU];
212	$sep < [^$latinMedial] [w y] a {} [$E $EO $EU];
213
214	# Similar to the above, but with an intervening $IEUNG.
215
216	$sep < [^$latinMedial] [y w] e {} $IEUNG [$O $OE];
217	$sep < [^$latinMedial] e {} $IEUNG [$O $OE $U];
218	$sep < [^$latinMedial] [o a] {} $IEUNG [$E $EO $EU];
219	$sep < [^$latinMedial] [w y] a {} $IEUNG [$E $EO $EU];
220
221	# Single finals followed by IEUNG. The jamo sequence A Xf IEUNG E,
222	# where Xi also exists, must be transliterated as "ax-e" to prevent
223	# the round trip conversion to A Xi E.
224
225	$sep < $latinMedialEnd b {} $IEUNG $jamoMedial;
226	$sep < $latinMedialEnd c {} $IEUNG $jamoMedial;
227	$sep < $latinMedialEnd d {} $IEUNG $jamoMedial;
228	$sep < $latinMedialEnd g {} $IEUNG $jamoMedial;
229	$sep < $latinMedialEnd h {} $IEUNG $jamoMedial;
230	$sep < $latinMedialEnd j {} $IEUNG $jamoMedial;
231	$sep < $latinMedialEnd k {} $IEUNG $jamoMedial;
232	$sep < $latinMedialEnd m {} $IEUNG $jamoMedial;
233	$sep < $latinMedialEnd n {} $IEUNG $jamoMedial;
234	$sep < $latinMedialEnd p {} $IEUNG $jamoMedial;
235	$sep < $latinMedialEnd s {} $IEUNG $jamoMedial;
236	$sep < $latinMedialEnd t {} $IEUNG $jamoMedial;
237
238	# Double finals followed by IEUNG. Similar to the single finals
239	# followed by IEUNG. Any latin consonant pair X Y, between medials,
240	# that we would split by Latin-Jamo, we must handle when it occurs as
241	# part of A XYf IEUNG E, to prevent round trip conversion to A Xf Yi
242	# E.
243
244	$sep < $latinMedialEnd b s {} $IEUNG $jamoMedial;
245	$sep < $latinMedialEnd g g {} $IEUNG $jamoMedial;
246	$sep < $latinMedialEnd g s {} $IEUNG $jamoMedial;
247	$sep < $latinMedialEnd l b {} $IEUNG $jamoMedial;
248	$sep < $latinMedialEnd l g {} $IEUNG $jamoMedial;
249	$sep < $latinMedialEnd l h {} $IEUNG $jamoMedial;
250	$sep < $latinMedialEnd l m {} $IEUNG $jamoMedial;
251	$sep < $latinMedialEnd l p {} $IEUNG $jamoMedial;
252	$sep < $latinMedialEnd l s {} $IEUNG $jamoMedial;
253	$sep < $latinMedialEnd l t {} $IEUNG $jamoMedial;
254	$sep < $latinMedialEnd n g {} $IEUNG $jamoMedial;
255	$sep < $latinMedialEnd n h {} $IEUNG $jamoMedial;
256	$sep < $latinMedialEnd n j {} $IEUNG $jamoMedial;
257	$sep < $latinMedialEnd s s {} $IEUNG $jamoMedial;
258
259	# Split doubles. Text of the form A Xi Xf E, where XXi also occurs,
260	# we transliterate as "ax-xe" to prevent round trip transliteration as
261	# A XXi E.
262
263	$sep < $latinMedialEnd b {} $Bi $jamoMedial;
264	$sep < $latinMedialEnd d {} $Di $jamoMedial;
265	$sep < $latinMedialEnd j {} $Ji $jamoMedial;
266	$sep < $latinMedialEnd g {} $Gi $jamoMedial;
267	$sep < $latinMedialEnd s {} $Si $jamoMedial;
268
269	# XYY. This corresponds to the XYY rule in Latin-Jamo. By default
270	# Latin-Jamo maps "xyy" to Xf YYi, to keep YY together. As a result,
271	# "xyy" forms that correspond to XYf Yi must be transliterated as
272	# "xy-y".
273
274	$sep < $latinMedialEnd b s {} [$Si $SSi];
275	$sep < $latinMedialEnd g s {} [$Si $SSi];
276	$sep < $latinMedialEnd l b {} [$Bi $BB];
277	$sep < $latinMedialEnd l g {} [$Gi $GGi];
278	$sep < $latinMedialEnd l s {} [$Si $SSi];
279	$sep < $latinMedialEnd n g {} [$Gi $GGi];
280	$sep < $latinMedialEnd n j {} [$Ji $JJ];
281
282	# Deletion of IEUNG is handled below.
283
284	#----------------------------------------------------------------------
285	# Latin-Jamo
286
287	# [Basic, context-free Jamo-Latin rules are embedded here too. See
288	# above.]
289
290	# Split digraphs: Text of the form 'axye', where 'xy' is a final
291	# digraph, 'x' is a final (by itself), 'y' is an initial, and 'a' and
292	# 'e' are medials, we want to transliterate this as A Xf Yi E rather
293	# than A XYf IEUNG E. We do NOT include text of the form "axxe",
294	# since that is handled differently below. These rules are generated
295	# programmatically from the jamo data.
296
297	$jamoMedial {b s} $latinMedial > $Bf $Si;
298	$jamoMedial {g s} $latinMedial > $Gf $Si;
299	$jamoMedial {l b} $latinMedial > $L $Bi;
300	$jamoMedial {l g} $latinMedial > $L $Gi;
301	$jamoMedial {l h} $latinMedial > $L $Hi;
302	$jamoMedial {l m} $latinMedial > $L $Mi;
303	$jamoMedial {l p} $latinMedial > $L $Pi;
304	$jamoMedial {l s} $latinMedial > $L $Si;
305	$jamoMedial {l t} $latinMedial > $L $Ti;
306	$jamoMedial {n g} $latinMedial > $Nf $Gi;
307	$jamoMedial {n h} $latinMedial > $Nf $Hi;
308	$jamoMedial {n j} $latinMedial > $Nf $Ji;
309
310	# Single consonants are initials: Text of the form 'axe', where 'x'
311	# can be an initial or a final, and 'a' and 'e' are medials, we want
312	# to transliterate as A Xi E rather than A Xf IEUNG E.
313
314	$jamoMedial {b} $latinMedial > $Bi;
315	$jamoMedial {c} $latinMedial > $Ci;
316	$jamoMedial {d} $latinMedial > $Di;
317	$jamoMedial {g} $latinMedial > $Gi;
318	$jamoMedial {h} $latinMedial > $Hi;
319	$jamoMedial {j} $latinMedial > $Ji;
320	$jamoMedial {k} $latinMedial > $Ki;
321	$jamoMedial {m} $latinMedial > $Mi;
322	$jamoMedial {n} $latinMedial > $Ni;
323	$jamoMedial {p} $latinMedial > $Pi;
324	$jamoMedial {s} $latinMedial > $Si;
325	$jamoMedial {t} $latinMedial > $Ti;
326
327	# Doubled initials. The sequence "axxe", where XX exists as an initial
328	# (XXi), and also Xi and Xf exist (true of all digraphs XX), we want
329	# to transliterate as A XXi E, rather than split to A Xf Xi E.
330
331	$jamoMedial {b b} $latinMedial > $BB;
332	$jamoMedial {d d} $latinMedial > $DD;
333	$jamoMedial {j j} $latinMedial > $JJ;
334	$jamoMedial {g g} $latinMedial > $GGi;
335	$jamoMedial {s s} $latinMedial > $SSi;
336
337	# XYY. Because doubled consonants bind more strongly than XY
338	# consonants, we must handle the sequence "axyy" specially. Here XYf
339	# and YYi must exist. In these cases, we map to Xf YYi rather than
340	# XYf: only X is replaced here; "yy" is context, left for later rules.
341
342	$jamoMedial {b} s s > $Bf;
343	$jamoMedial {g} s s > $Gf;
344	$jamoMedial {l} b b > $L;
345	$jamoMedial {l} g g > $L;
346	$jamoMedial {l} s s > $L;
347	$jamoMedial {n} g g > $Nf;
348	$jamoMedial {n} j j > $Nf;
349
350	# Finals: Attach a consonant preceded by a medial to that medial.
351	# Do this BEFORE mapping consonants to initials. Longer keys must
352	# precede shorter keys that they start with, e.g., the rule for 'bs'
353	# must precede 'b'.
354
355	# [BASIC Jamo-Latin FINALS handled here. Order irrelevant within this
356	# block for Jamo-Latin.]
357
358	$jamoMedial {bs} <> $BS;
359	$jamoMedial {b} <> $Bf;
360	$jamoMedial {c} <> $Cf;
361	$jamoMedial {d} <> $Df;
362	$jamoMedial {gg} <> $GGf;
363	$jamoMedial {gs} <> $GS;
364	$jamoMedial {g} <> $Gf;
365	$jamoMedial {h} <> $Hf;
366	$jamoMedial {j} <> $Jf;
367	$jamoMedial {k} <> $Kf;
368	$jamoMedial {lb} <> $LB; $jamoMedial {lg} <> $LG;
369	$jamoMedial {lh} <> $LH;
370	$jamoMedial {lm} <> $LM;
371	$jamoMedial {lp} <> $LP;
372	$jamoMedial {ls} <> $LS;
373	$jamoMedial {lt} <> $LT;
374	$jamoMedial {l} <> $L;
375	$jamoMedial {m} <> $Mf;
376	$jamoMedial {ng} <> $NG;
377	$jamoMedial {nh} <> $NH;
378	$jamoMedial {nj} <> $NJ;
379	$jamoMedial {n} <> $Nf;
380	$jamoMedial {p} <> $Pf;
381	$jamoMedial {ss} <> $SSf;
382	$jamoMedial {s} <> $Sf;
383	$jamoMedial {t} <> $Tf;
384
385	# Initials: Attach a single or doubled consonant to the following
386	# medial. Do this AFTER mapping finals. Longer keys must precede
387	# shorter keys that they start with, e.g., 'gg' must precede 'g'.
388
389	# [BASIC Jamo-Latin INITIALS handled here. Order irrelevant within
390	# this block for Jamo-Latin.]
391
392	{gg} $latinMedial <> $GGi;
393	{g} $latinMedial <> $Gi;
394	{n} $latinMedial <> $Ni;
395	{dd} $latinMedial <> $DD;
396	{d} $latinMedial <> $Di;
397	{r} $latinMedial <> $R;
398	{m} $latinMedial <> $Mi;
399	{bb} $latinMedial <> $BB;
400	{b} $latinMedial <> $Bi;
401	{ss} $latinMedial <> $SSi;
402	{s} $latinMedial <> $Si;
403	{jj} $latinMedial <> $JJ;
404	{j} $latinMedial <> $Ji;
405	{c} $latinMedial <> $Ci;
406	{k} $latinMedial <> $Ki;
407	{t} $latinMedial <> $Ti;
408	{p} $latinMedial <> $Pi;
409	{h} $latinMedial <> $Hi;
410
411	# 'r' in final position. Because the 'l' and 'r' jamo are equivalent
412	# (the glyphs are the same), we provide the same equivalency in
413	# Latin-Jamo; the 'l' to 'r' conversion is handled below. An 'r' in an
414	# apparent final position is treated like 'l': without this rule
415	# "karka" => Ki A R EU Ki A, but we want Ki A L Ki A. The '|' puts
416	# the cursor before the emitted 'l' so it is transliterated again.
417
418	$jamoMedial {r} $latinInitial > | l;
419
420	# Initial + Final: If we match the next rule, we have an initial then a
421	# final consonant with no intervening medial. The key {} is empty, so
422	# nothing is consumed; we insert the null vowel BEFORE the consonant to
423	# form a valid syllable. (Next we add a null vowel AFTER an anomalous initial.)
424
425	$jamoInitial {} [bcdghjklmnpst] > $EU;
426
427	# Initial + X: This block matches an initial consonant not followed by
428	# a medial. We insert the null vowel after it. Double initials are
429	# mapped explicitly; for a single initial we emit it plus Latin "eu"
430	# with the cursor ('|') before both, so the standard rules do the rest.
431
432	# BREAKS ROUND TRIP INTEGRITY
433
434	gg > $GGi $EU;
435	dd > $DD $EU;
436	bb > $BB $EU;
437	ss > $SSi $EU;
438	jj > $JJ $EU;
439
440	([bcdghjkmnprst]) > | $1 eu;
441
442	# X + Final: Finally we have to deal with a consonant that can only be
443	# interpreted as a final (not an initial) and which is preceded
444	# neither by an initial nor a medial. It would have to start a
445	# syllable, but cannot. Most of these are already handled by the
446	# above rules: 'bs' splits into Bi EU Sf, and similarly for 'gs' 'ng'
447	# 'nh' 'nj'. The only problem is 'l' and digraphs starting with 'l'.
448	# For this isolated case, we could add a null initial and medial,
449	# which would give "la" => IEUNG EU L IEUNG A, for example. A more
450	# economical solution is to rewrite isolated (that is, initial) "l"
451	# as "r" and reprocess it from the cursor ('|'). (Conversions of other
452	# consonants that occur as neither initials nor finals are below.)
453
454	l > | r;
455
456	# Medials. A medial preceded by an initial maps directly to its jamo.
457	# As usual, longer keys must precede shorter ones ('ae' before 'a').
458
459	# [BASIC Jamo-Latin MEDIALS handled here. Order irrelevant within
460	# this block for Jamo-Latin.]
461
462	$jamoInitial {ae} <> $AE;
463	$jamoInitial {a} <> $A;
464	$jamoInitial {eo} <> $EO;
465	$jamoInitial {eu} <> $EU;
466	$jamoInitial {e} <> $E;
467	$jamoInitial {i} <> $I;
468	$jamoInitial {oe} <> $OE;
469	$jamoInitial {o} <> $O;
470	$jamoInitial {u} <> $U;
471	$jamoInitial {wae} <> $WAE;
472	$jamoInitial {wa} <> $WA;
473	$jamoInitial {weo} <> $WEO;
474	$jamoInitial {we} <> $WE;
475	$jamoInitial {wi} <> $WI;
476	$jamoInitial {yae} <> $YAE;
477	$jamoInitial {ya} <> $YA;
478	$jamoInitial {yeo} <> $YEO;
479	$jamoInitial {ye} <> $YE;
480	$jamoInitial {yi} <> $YI;
481	$jamoInitial {yo} <> $YO;
482	$jamoInitial {yu} <> $YU;
483
484	# We may see an anomalous isolated 'w' or 'y' after an initial. We
485	# rewrite it as 'wi' or 'yu', respectively, and reprocess from '|'.
486
487	# BREAKS ROUND TRIP INTEGRITY
488
489	$jamoInitial {w} > | wi;
490	$jamoInitial {y} > | yu;
491
492	# Otherwise, insert a null consonant IEUNG before the medial; the
493	# cursor ('|') stays before the still-untransliterated Latin vowel.
494
495	($latinMedial) > $IEUNG | $1;
496
497	# Convert non-jamo Latin consonants to equivalents. These occur as
498	# neither initials nor finals in jamo. ('l' occurs as a final but not
499	# as an initial; it is handled above.) The left-hand letters are never
500	# output by Jamo-Latin. '|' makes the replacement get reprocessed.
501
502	f > | p;
503	q > | k;
504	v > | b;
505	x > | ks;
506	z > | s;
507
508	# Delete separators (Latin-Jamo only: a forward rule with empty output).
509
510	$sep > ;
511
512	# Delete null consonants (Jamo-Latin only: a reverse rule with empty
513	# output). Do NOT delete null EU vowels; these may also occur in text.
514
515	< $IEUNG;
516
517	#- N.B. DO NOT put any filters, NFD, etc. here -- those are aliased in
518	#- the INDEX file. This transliterator is, by itself, not
519	#- instantiated. It is used as a part of Latin-Jamo, Latin-Hangul, or
520	#- inverses thereof.
521
522	# eof