-# ***************************************************************************
-# *
-# * Copyright (C) 2004-2015, International Business Machines
-# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
-# *
-# ***************************************************************************
+# © 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html#License
+#
# File: ThaiLogical_Latin.txt
-# Generated from CLDR
+# Generated from CLDR
+#
+
+# Thai-Latin
+# This set of rules follows ISO 11940
+# see http://homepage.mac.com/sirbinks/pdf/Thai.r2.pdf
+# except that ISO 11940 does not mention an implicit vowel, so we use o\u0323
+#
+# The transcription is fairly ugly, so we ought to also do the UNGEGN version
+# see: http://www.eki.ee/wgrs/rom1_th.pdf
+# and probably make that the main variant.
#
+# Note: this is an internal file. The NFD/NFC is handled externally, in the index
+# The insertion of spaces between words, the reversal of the vowels
+# and the conversion of space to semicolon are done *outside* of these rules.
+# So as far as these rules are concerned, the vowels are in logical order!
+# insert implicit vowel (and remove it going the other way)
+# COMMENTED out: the implicit vowel positions cannot be predicted algorithmically
+#$consonant = [ก-ฮ];
+#$vowel = [ะ-\u0E3Aเ-ไ\u0E47];
+#{ ( $consonant ) } [^$vowel \uE000] → | $1 \uE000 ;
+#\uE000 → o\u0323 ;
+# ← o\u0323 ;
$notAbove = [^\p{ccc=0}\p{ccc=above}] ; # characters with a nonzero combining class other than "above"; captured between a base letter and an above-accent in the backward vowel rules
$notBelow = [^\p{ccc=0}\p{ccc=below}] ; # characters with a nonzero combining class other than "below"
+# Consonants
+# Warning: the 'h's need to be handled carefully!
+# What we really want to say is the following, but we can't
+# $notHAccent = !($notAbove* \u0304 | $notBelow* \u0323) ;
+# Since the only accents we care about that could cause problems are free-standing accents below, we use instead:
$freeStandingBelow = [\u0325 ]; # U+0325 COMBINING RING BELOW
$hAccent = [ \u0304 \u0323]; # U+0304 COMBINING MACRON, U+0323 COMBINING DOT BELOW
$notHAccent0 = [^$freeStandingBelow$hAccent]; # any character other than ring below, macron or dot below
ธ ↔ t\u0323h ; # THAI CHARACTER THO THONG
ท ← th } $notHAccent1 ; # THAI CHARACTER THO THAHAN (backward only); NOTE(review): $notHAccent1 is not defined in the visible rules - confirm it is defined before this line
ท ↔ th } $notHAccent0 ; # THAI CHARACTER THO THAHAN; "} $notHAccent0" is look-ahead context: th must not be followed by ring below, macron or dot below
+# Note: TO PATAK deviates from ISO, since t-dotunder + h would be ambiguous, so it uses a vertical tick below (\u0329).
ฏ ↔ t\u0329 ; # THAI CHARACTER TO PATAK
ต ↔ t ; # THAI CHARACTER TO TAO
+# since there is no singleton g (generated), don't worry about that.
ง ↔ ng ; # THAI CHARACTER NGO NGU
ณ ↔ n\u0323 ; # THAI CHARACTER NO NEN
น ↔ n ; # THAI CHARACTER NO NU
ฟ ↔ f ; # THAI CHARACTER FO FAN
อ ↔ x ; # THAI CHARACTER O ANG
ซ ↔ s ; # THAI CHARACTER SO SO
+# vowels
\u0E31 ↔ a\u0323 ; # THAI CHARACTER MAI HAN-AKAT
า → a\u0304 ; # THAI CHARACTER SARA AA (forward only; the backward case is the next rule)
า | $1 ← a ($notAbove*) \u0304; # backward case, account for reordering: marks captured in $1 are kept after the Thai vowel and rescanned
+# We deviate from ISO for SARA AM for disambiguation
ำ → a \u0309; # THAI CHARACTER SARA AM (forward only; Latin side uses U+0309 COMBINING HOOK ABOVE)
ำ | $1 ← a ($notAbove*) \u0309 ; # backward case, account for reordering
ะ ↔ a ; # THAI CHARACTER SARA A
\u0E39 | $1 ← u ($notAbove*) \u0304 ; # backward case, account for reordering; NOTE(review): no forward rule for \u0E39 is visible in this excerpt - confirm one exists
\u0E38 ↔ u ; # THAI CHARACTER SARA U
ฯ ↔ ‡ ; # THAI CHARACTER PAIYANNOI
+# ฿ ↔ XXX ; # THAI CURRENCY SYMBOL BAHT
เ ↔ e ; # THAI CHARACTER SARA E
แ ↔ æ ; # THAI CHARACTER SARA AE
โ ↔ o ; # THAI CHARACTER SARA O
\u0E4B ↔ \u030C ; # THAI CHARACTER MAI CHATTAWA (Latin side: U+030C COMBINING CARON)
\u0E4C ↔ \u0312 ; # THAI CHARACTER THANTHAKHAT (Latin side: U+0312 COMBINING TURNED COMMA ABOVE)
\u0E4E ↔ '~' ; # THAI CHARACTER YAMAKKAN
+# We deviate from ISO for disambiguation
\u0E4D ↔ \u030A ; # THAI CHARACTER NIKHAHIT (Latin side: U+030A COMBINING RING ABOVE)
๏ ↔ '§' ; # THAI CHARACTER FONGMAN
๐ ↔ 0 ; # THAI DIGIT ZERO
๚ ↔ '||' ; # THAI CHARACTER ANGKHANKHU
๛ ↔ » ; # THAI CHARACTER KHOMUT
ๆ ↔ « ; # THAI CHARACTER MAIYAMOK
+# moved down to make shorter first
+#Note: PHINTHU deviates from ISO since underring causes canonical problems. So it uses spacing tick below.
\u0E3A ↔ ˌ ; # THAI CHARACTER PHINTHU
\u0E34 ↔ i ; # THAI CHARACTER SARA I
+# fallbacks
| k ← g ; # backward only: rewrite Latin g as k and leave the cursor before it so the k is rescanned
| k ← h ; # backward only: a leftover h is rewritten as k and rescanned
| c ← j ; # backward only: j is rewritten as c and rescanned
| k ← q ; # backward only: q is rewritten as k and rescanned
| s ← z ; # backward only: z is rewritten as s and rescanned
:: (lower); # parenthesized form: the Lower transform applies in the backward (Latin to Thai) direction only
+