X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b331163bffd790ced0e88b73f44f86d49ccc48a5..0f5d89e82340278ed3d7d50029f37cab2c41a57e:/icuSources/data/translit/ThaiLogical_Latin.txt diff --git a/icuSources/data/translit/ThaiLogical_Latin.txt b/icuSources/data/translit/ThaiLogical_Latin.txt index 4912be5a..c063e790 100644 --- a/icuSources/data/translit/ThaiLogical_Latin.txt +++ b/icuSources/data/translit/ThaiLogical_Latin.txt @@ -1,14 +1,37 @@ -# *************************************************************************** -# * -# * Copyright (C) 2004-2015, International Business Machines -# * Corporation; Unicode, Inc.; and others. All Rights Reserved. -# * -# *************************************************************************** +# © 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html#License +# # File: ThaiLogical_Latin.txt -# Generated from CLDR +# Generated from CLDR +# + +# Thai-Latin +# This set of rules follows ISO 11940 +# see http://homepage.mac.com/sirbinks/pdf/Thai.r2.pdf +# except that ISO 11940 does not mention an implicit vowel, so we use o\u0323 +# +# The transcription is fairly ugly, so we ought to also do the UNGEGN version +# see: http://www.eki.ee/wgrs/rom1_th.pdf +# and probably make that the main variant. # +# Note: this is an internal file. The NFD/NFC is handled externally, in the index +# The insertion of spaces between words, the reversal of the vowels +# and the conversion of space to semicolon are done *outside* of these rules. +# So as far as these rules are concerned, the vowels are in logical order! 
+# insert implicit vowel (and remove it going the other way) +# COMMENTED out: the implicit vowel positions cannot be predicted algorithmically +#$consonant = [ก-ฮ]; +#$vowel = [ะ-\u0E3Aเ-ไ\u0E47]; +#{ ( $consonant ) } [^$vowel \uE000] → | $1 \uE000 ; +#\uE000 → o\u0323 ; +# ← o\u0323 ; $notAbove = [^\p{ccc=0}\p{ccc=above}] ; $notBelow = [^\p{ccc=0}\p{ccc=below}] ; +# Consonants +# Warning: the 'h's need to be handled carefully! +# What we really want to say is the following, but we can't +# $notHAccent = !($notAbove* \u0304 | $notBelow* \u0323) ; +# Since the only accents we care about that could cause problems are free-standing accents below, we use instead: $freeStandingBelow = [\u0325 ]; $hAccent = [ \u0304 \u0323]; $notHAccent0 = [^$freeStandingBelow$hAccent]; @@ -40,8 +63,10 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; ธ ↔ t\u0323h ; # THAI CHARACTER THO THONG ท ← th } $notHAccent1 ; # THAI CHARACTER THO THAHAN ท ↔ th } $notHAccent0 ; # THAI CHARACTER THO THAHAN +#Note: TO PATAK deviates from ISO since t-dotunder + h would be ambiguous. So it uses vertical tick. ฏ ↔ t\u0329 ; # THAI CHARACTER TO PATAK ต ↔ t ; # THAI CHARACTER TO TAO +# since there is no singleton g (generated), don't worry about that. 
ง ↔ ng ; # THAI CHARACTER NGO NGU ณ ↔ n\u0323 ; # THAI CHARACTER NO NEN น ↔ n ; # THAI CHARACTER NO NU @@ -67,9 +92,11 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; ฟ ↔ f ; # THAI CHARACTER FO FAN อ ↔ x ; # THAI CHARACTER O ANG ซ ↔ s ; # THAI CHARACTER SO SO +# vowels \u0E31 ↔ a\u0323 ; # THAI CHARACTER MAI HAN-AKAT า → a\u0304 ; # THAI CHARACTER SARA AA า | $1 ← a ($notAbove*) \u0304; # backward case, account for reordering +# We deviate from ISO for SARA AM for disambiguation ำ → a \u0309; # THAI CHARACTER SARA AM ำ | $1 ← a ($notAbove*) \u0309 ; # backward case, account for reordering ะ ↔ a ; # THAI CHARACTER SARA A @@ -82,6 +109,7 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; \u0E39 | $1 ← u ($notAbove*) \u0304 ; # backward case, account for reordering \u0E38 ↔ u ; # THAI CHARACTER SARA U ฯ ↔ ‡ ; # THAI CHARACTER PAIYANNOI +# ฿ ↔ XXX ; # THAI CURRENCY SYMBOL BAHT เ ↔ e ; # THAI CHARACTER SARA E แ ↔ æ ; # THAI CHARACTER SARA AE โ ↔ o ; # THAI CHARACTER SARA O @@ -95,6 +123,7 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; \u0E4B ↔ \u030C ; # THAI CHARACTER MAI CHATTAWA \u0E4C ↔ \u0312 ; # THAI CHARACTER THANTHAKHAT \u0E4E ↔ '~' ; # THAI CHARACTER YAMAKKAN +# We deviate from ISO for disambiguation \u0E4D ↔ \u030A ; # THAI CHARACTER NIKHAHIT ๏ ↔ '§' ; # THAI CHARACTER FONGMAN ๐ ↔ 0 ; # THAI DIGIT ZERO @@ -110,11 +139,15 @@ $notHAccent1 = $freeStandingBelow [^$hAccent]; ๚ ↔ '||' ; # THAI CHARACTER ANGKHANKHU ๛ ↔ » ; # THAI CHARACTER KHOMUT ๆ ↔ « ; # THAI CHARACTER MAIYAMOK +# moved down to make shorter first +#Note: PHINTHU deviates from ISO since underring causes canonical problems. So it uses spacing tick below. \u0E3A ↔ ˌ ; # THAI CHARACTER PHINTHU \u0E34 ↔ i ; # THAI CHARACTER SARA I +# fallbacks | k ← g ; | k ← h ; | c ← j ; | k ← q ; | s ← z ; :: (lower); +