X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b331163bffd790ced0e88b73f44f86d49ccc48a5..0f5d89e82340278ed3d7d50029f37cab2c41a57e:/icuSources/data/translit/ThaiLogical_Latin.txt

diff --git a/icuSources/data/translit/ThaiLogical_Latin.txt b/icuSources/data/translit/ThaiLogical_Latin.txt
index 4912be5a..c063e790 100644
--- a/icuSources/data/translit/ThaiLogical_Latin.txt
+++ b/icuSources/data/translit/ThaiLogical_Latin.txt
@@ -1,14 +1,37 @@
-ï»¿# ***************************************************************************
-# *
-# *  Copyright (C) 2004-2015, International Business Machines
-# *  Corporation; Unicode, Inc.; and others.  All Rights Reserved.
-# *
-# ***************************************************************************
+ï»¿# © 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html#License
+#
 # File: ThaiLogical_Latin.txt
-# Generated from CLDR 
+# Generated from CLDR
+#
+
+# Thai-Latin
+# This set of rules follows ISO 11940
+#     see http://homepage.mac.com/sirbinks/pdf/Thai.r2.pdf
+# except that ISO 11940 does not mention an implicit vowel, so we use o\u0323
+#
+# The transcription is fairly ugly, so we ought to also do the UNGEGN version
+#     see: http://www.eki.ee/wgrs/rom1_th.pdf
+# and probably make that the main variant.
 #
+# Note: this is an internal file. The NFD/NFC is handled externally, in the index.
+# The insertion of spaces between words, the reversal of the vowels
+# and the conversion of space to semicolon are done *outside* of these rules.
+# So as far as these rules are concerned, the vowels are in logical order!
+# insert implicit vowel (and remove it going the other way)
+# COMMENTED out: the implicit vowel positions cannot be predicted algorithmically
+#$consonant = [à¸-à¸®];
+#$vowel = [à¸°-\u0E3Aà¹-à¹\u0E47];
+#{ ( $consonant ) } [^$vowel \uE000] â | $1 \uE000 ;
+#\uE000 â o\u0323 ;
+# â o\u0323 ;
 $notAbove = [^\p{ccc=0}\p{ccc=above}] ;
 $notBelow = [^\p{ccc=0}\p{ccc=below}] ;
+# Consonants
+# Warning: the 'h's need to be handled carefully!
+# What we really want to say is the following, but we can't
+# $notHAccent = !($notAbove*   \u0304 | $notBelow*   \u0323) ;
+# Since the only accents we care about that could cause problems are free-standing accents below, we use instead:
 $freeStandingBelow = [\u0325  ];
 $hAccent =  [   \u0304     \u0323];
 $notHAccent0 = [^$freeStandingBelow$hAccent];
@@ -40,8 +63,10 @@ $notHAccent1 = $freeStandingBelow [^$hAccent];
 à¸ â t\u0323h ; # THAI CHARACTER THO THONG
 à¸ â th } $notHAccent1 ; # THAI CHARACTER THO THAHAN
 à¸ â th } $notHAccent0 ; # THAI CHARACTER THO THAHAN
+#Note: TO PATAK deviates from ISO since t-dotunder + h would be ambiguous, so it uses a vertical tick below (U+0329).
 à¸ â t\u0329 ; # THAI CHARACTER TO PATAK
 à¸ â t ; # THAI CHARACTER TO TAO
+# since there is no singleton g (generated), don't worry about that.
 à¸ â ng ; # THAI CHARACTER NGO NGU
 à¸ â n\u0323 ; # THAI CHARACTER NO NEN
 à¸ â n ; # THAI CHARACTER NO NU
@@ -67,9 +92,11 @@ $notHAccent1 = $freeStandingBelow [^$hAccent];
 à¸ â f ; # THAI CHARACTER FO FAN
 à¸­ â x ; # THAI CHARACTER O ANG
 à¸ â s ; # THAI CHARACTER SO SO
+# vowels
 \u0E31 â a\u0323 ; # THAI CHARACTER MAI HAN-AKAT
 à¸² â a\u0304 ; # THAI CHARACTER SARA AA
 à¸² | $1 â a ($notAbove*)    \u0304; # backward case, account for reordering
+# We deviate from ISO for SARA AM for disambiguation
 à¸³ â a  \u0309; # THAI CHARACTER SARA AM
 à¸³ | $1 â a ($notAbove*)  \u0309 ; # backward case, account for reordering
 à¸° â a ; # THAI CHARACTER SARA A
@@ -82,6 +109,7 @@ $notHAccent1 = $freeStandingBelow [^$hAccent];
 \u0E39 | $1 â u  ($notAbove*)    \u0304  ; # backward case, account for reordering
 \u0E38 â u ; # THAI CHARACTER SARA U
 à¸¯ â â¡ ; # THAI CHARACTER PAIYANNOI
+# à¸¿ â XXX ; # THAI CURRENCY SYMBOL BAHT
 à¹ â e ; # THAI CHARACTER SARA E
 à¹ â Ã¦ ; # THAI CHARACTER SARA AE
 à¹ â o ; # THAI CHARACTER SARA O
@@ -95,6 +123,7 @@ $notHAccent1 = $freeStandingBelow [^$hAccent];
 \u0E4B â \u030C ; # THAI CHARACTER MAI CHATTAWA
 \u0E4C â \u0312 ; # THAI CHARACTER THANTHAKHAT
 \u0E4E â '~' ; # THAI CHARACTER YAMAKKAN
+# We deviate from ISO for disambiguation
 \u0E4D â  \u030A ; # THAI CHARACTER NIKHAHIT
 à¹ â 'Â§' ; # THAI CHARACTER FONGMAN
 à¹ â 0 ; # THAI DIGIT ZERO
@@ -110,11 +139,15 @@ $notHAccent1 = $freeStandingBelow [^$hAccent];
 à¹ â '||' ; # THAI CHARACTER ANGKHANKHU
 à¹ â Â» ; # THAI CHARACTER KHOMUT
 à¹ â Â« ; # THAI CHARACTER MAIYAMOK
+# moved down to make shorter first
+#Note: PHINTHU deviates from ISO since an under-ring causes canonical problems, so it uses a spacing tick below.
 \u0E3A â Ë ; # THAI CHARACTER PHINTHU
 \u0E34 â i ; # THAI CHARACTER SARA I
+# fallbacks
 | k â g ;
 | k â h ;
 | c â j ;
 | k â q ;
 | s â z ;
 :: (lower);
+