1 #--------------------------------------------------------------------
2 # Copyright (c) 1999-2004, International Business Machines
3 # Corporation and others. All Rights Reserved.
4 #--------------------------------------------------------------------
7 # This set of rules follows ISO 11940
8 # see http://homepage.mac.com/sirbinks/pdf/Thai.r2.pdf
9 # except that the standard does not mention an implicit vowel, so we use ọ
11 # The transcription is fairly ugly, so we ought to also do the UNGEGN version
12 # see: http://www.eki.ee/wgrs/rom1_th.pdf
13 # and probably make that the main variant.
15 # Note: this is an internal file. The NFD/NFC is handled externally, in the index
16 # The insertion of spaces between words, the reversal of the vowels
17 # and the conversion of space to semicolon are done *outside* of these rules.
18 # So as far as these rules are concerned, the vowels are in logical order!
20 # insert implicit vowel (and remove it going the other way)
21 # COMMENTED out: the implicit vowel positions cannot be predicted algorithmically
25 #{ ( $consonant ) } [^$vowel ] > | $1 ;
# Helper sets for the reverse (Latin -> Thai) rules.
# $notAbove: any character whose canonical combining class is neither 0 nor "above".
#   The reverse rules use ($notAbove*) to step over such marks when they sit between
#   a base letter and the accent that belongs to it.
# $notBelow: the same construction for combining class "below". It appears only in the
#   commented-out $notHAccent sketch in this section; no active rule visible here uses it.
29 $notAbove = [^\p{ccc=0}\p{ccc=above}] ;
30 $notBelow = [^\p{ccc=0}\p{ccc=below}] ;
33 # Warning: the 'h's need to be handled carefully!
34 # What we really want to say is the following, but we can't
35 # $notHAccent = !($notAbove* ̄ | $notBelow* ̣) ;
37 # Since the only accents we care about that could cause problems are free-standing accents below, we use instead:
# $freeStandingBelow has a single member, U+0325 (COMBINING RING BELOW).
38 $freeStandingBelow = [\u0325 ];
# NOTE(review): $hAccent is referenced below but no definition is visible in this
# section -- confirm it is defined elsewhere (or restore it) before relying on these sets.
# $notHAccent0: one character that is neither in $freeStandingBelow nor in $hAccent.
# $notHAccent1: a $freeStandingBelow character followed by one character not in $hAccent.
# Both serve as trailing context ('}') for the reverse kh / ph / ch / th rules.
40 $notHAccent0 = [^$freeStandingBelow$hAccent];
41 $notHAccent1 = $freeStandingBelow [^$hAccent];
# ---- Consonants ----
# Operators: '>' converts Thai -> Latin only, '<' Latin -> Thai only, '<>' both ways.
# In the "backward case" rules, ($notAbove*) captures marks found between the base
# letter and its accent; '| $1' writes them back after the Thai letter and puts the
# cursor in front of them so that later rules still get to convert them.
# '}' introduces trailing context: text that must follow the match but is not consumed.
# Rules are tried in order, so within each group the accented / digraph forms are
# listed before the bare letter (e.g. the kh rules before k).
43 ห > h̄ ; # THAI CHARACTER HO HIP
44 ห | $1 < h ($notAbove*) ̄; # backward case, account for reordering
45 ฮ <> ḥ ; # THAI CHARACTER HO NOKHUK
# k series. Plain "kh" is only mapped back to KHO KHWAI when the trailing context
# $notHAccent1 / $notHAccent0 matches after the h.
47 ข <> k̄h ; # THAI CHARACTER KHO KHAI
48 ฃ <> ḳ̄h ; # THAI CHARACTER KHO KHUAT
49 ฅ <> kʹh ; # THAI CHARACTER KHO KHON
50 ฆ <> ḳh ; # THAI CHARACTER KHO RAKHANG
51 ค < kh } $notHAccent1 ; # THAI CHARACTER KHO KHWAI
52 ค <> kh } $notHAccent0 ; # THAI CHARACTER KHO KHWAI
53 ก <> k ; # THAI CHARACTER KO KAI
# p series (same trailing-context treatment for plain "ph").
55 ภ <> p̣h ; # THAI CHARACTER PHO SAMPHAO
56 ผ <> p̄h ; # THAI CHARACTER PHO PHUNG
57 พ < ph } $notHAccent1 ; # THAI CHARACTER PHO PHAN
58 พ <> ph } $notHAccent0 ; # THAI CHARACTER PHO PHAN
59 ป <> p ; # THAI CHARACTER PO PLA
# c series (same trailing-context treatment for plain "ch").
61 ฉ <> c̄h ; # THAI CHARACTER CHO CHING
62 ฌ <> c̣h ; # THAI CHARACTER CHO CHOE
63 ช < ch } $notHAccent1 ; # THAI CHARACTER CHO CHANG
64 ช <> ch } $notHAccent0 ; # THAI CHARACTER CHO CHANG
65 จ <> c ; # THAI CHARACTER CHO CHAN
# t series (same trailing-context treatment for plain "th").
67 ฐ <> ṭ̄h ; # THAI CHARACTER THO THAN
68 ฑ <> ṯh ; # THAI CHARACTER THO NANGMONTHO
69 ฒ <> tʹh ; # THAI CHARACTER THO PHUTHAO
70 ถ <> t̄h ; # THAI CHARACTER THO THUNG
71 ธ <> ṭh ; # THAI CHARACTER THO THONG
72 ท < th } $notHAccent1 ; # THAI CHARACTER THO THAHAN
73 ท <> th } $notHAccent0 ; # THAI CHARACTER THO THAHAN
74 # Note: TO PATAK deviates from ISO since t-dotunder + h would be ambiguous. So it uses vertical tick.
75 ฏ <> t̩ ; # THAI CHARACTER TO PATAK
76 ต <> t ; # THAI CHARACTER TO TAO
78 # since there is no singleton g (generated), don't worry about that.
79 ง <> ng ; # THAI CHARACTER NGO NGU
80 ณ <> ṇ ; # THAI CHARACTER NO NEN
81 น <> n ; # THAI CHARACTER NO NU
83 ญ <> ỵ ; # THAI CHARACTER YO YING
84 ฎ <> ḍ ; # THAI CHARACTER DO CHADA
85 ด <> d ; # THAI CHARACTER DO DEK
87 บ <> b ; # THAI CHARACTER BO BAIMAI
88 ฝ <> f̄ ; # THAI CHARACTER FO FA
89 ฝ | $1 < f ($notAbove*) ̄; # backward case, account for reordering
91 ม <> m ; # THAI CHARACTER MO MA
92 ย <> y ; # THAI CHARACTER YO YAK
93 ร <> r ; # THAI CHARACTER RO RUA
94 ฤ <> v ; # THAI CHARACTER RU
95 ฦ <> ł ; # THAI CHARACTER LU
96 ว <> w ; # THAI CHARACTER WO WAEN
# s series: the accented forms (with their reordering-tolerant backward rules) come
# first; the bare letters l, f, x and s follow further down.
98 ศ <> ṣ̄ ; # THAI CHARACTER SO SALA***
99 ศ | $1 < s ̣ ($notAbove*) ̄; # backward case, account for reordering
100 ษ <> s̄ʹ ; # THAI CHARACTER SO RUSI
101 ส > s̄ ; # THAI CHARACTER SO SUA***
102 ส | $1 < s ($notAbove*) ̄; # backward case, account for reordering
104 ฬ <> ḷ ; # THAI CHARACTER LO CHULA
105 ล <> l ; # THAI CHARACTER LO LING
106 ฟ <> f ; # THAI CHARACTER FO FAN
108 อ <> x ; # THAI CHARACTER O ANG
109 ซ <> s ; # THAI CHARACTER SO SO
# ---- Vowels built on a / i / u ----
# Rules are tried in order, so the accented forms (dot below, macron, hook above) are
# listed before the bare "a" and "u" rules. Each "backward case" rule lets other marks
# ($notAbove*) sit between the base letter and its accent; '| $1' writes those marks
# back after the Thai vowel with the cursor in front of them.
113 ั <> ạ ; # THAI CHARACTER MAI HAN-AKAT
115 า > ā ; # THAI CHARACTER SARA AA
116 า | $1 < a ($notAbove*) ̄; # backward case, account for reordering
118 # We deviate from ISO for SARA AM for disambiguation
119 ำ > a ̉; # THAI CHARACTER SARA AM
120 ำ | $1 < a ($notAbove*) ̉ ; # backward case, account for reordering
122 ะ <> a ; # THAI CHARACTER SARA A
123 ี <> ī ; # THAI CHARACTER SARA II
124 ี | $1 < i ($notAbove*) ̄ ; # backward case, account for reordering
126 ื <> ụ̄ ; # THAI CHARACTER SARA UEE
127 ื | $1 < u ̣ ($notAbove*) ̄ ; # backward case, account for reordering
129 ึ <> ụ ; # THAI CHARACTER SARA UE
130 ู <> ū ; # THAI CHARACTER SARA UU
131 ู | $1 < u ($notAbove*) ̄ ; # backward case, account for reordering
133 ุ <> u ; # THAI CHARACTER SARA U
# ---- Punctuation, remaining vowel signs, tone marks, digits ----
# Everything in this section is a plain two-way ('<>') one-to-one mapping.
135 ฯ <> ‡ ; # THAI CHARACTER PAIYANNOI
137 # ฿ <> XXX ; # THAI CURRENCY SYMBOL BAHT
139 เ <> e ; # THAI CHARACTER SARA E
140 แ <> æ ; # THAI CHARACTER SARA AE
141 โ <> o ; # THAI CHARACTER SARA O
142 ใ <> ı ; # THAI CHARACTER SARA AI MAIMUAN
143 ไ <> ị ; # THAI CHARACTER SARA AI MAIMALAI
144 ๅ <> ɨ ; # THAI CHARACTER LAKKHANGYAO
# The next rules map Thai marks to bare Latin combining marks (no base letter).
145 ็ <> ̆ ; # THAI CHARACTER MAITAIKHU
146 ่ <> ̀ ; # THAI CHARACTER MAI EK
147 ้ <> ̂ ; # THAI CHARACTER MAI THO
148 ๊ <> ́ ; # THAI CHARACTER MAI TRI
149 ๋ <> ̌ ; # THAI CHARACTER MAI CHATTAWA
150 ์ <> ̒ ; # THAI CHARACTER THANTHAKHAT
# '~' is quoted so that it is taken as a literal character.
151 ๎ <> '~' ; # THAI CHARACTER YAMAKKAN
153 # We deviate from ISO for disambiguation
154 ํ <> ̊ ; # THAI CHARACTER NIKHAHIT
156 ๏ <> § ; # THAI CHARACTER FONGMAN
# Thai digits map one-to-one to ASCII digits.
158 ๐ <> 0 ; # THAI DIGIT ZERO
159 ๑ <> 1 ; # THAI DIGIT ONE
160 ๒ <> 2 ; # THAI DIGIT TWO
161 ๓ <> 3 ; # THAI DIGIT THREE
162 ๔ <> 4 ; # THAI DIGIT FOUR
163 ๕ <> 5 ; # THAI DIGIT FIVE
164 ๖ <> 6 ; # THAI DIGIT SIX
165 ๗ <> 7 ; # THAI DIGIT SEVEN
166 ๘ <> 8 ; # THAI DIGIT EIGHT
167 ๙ <> 9 ; # THAI DIGIT NINE
# '||' is quoted because an unquoted '|' is the cursor marker in rule syntax.
169 ๚ <> '||' ; # THAI CHARACTER ANGKHANKHU
171 ๛ <> » ; # THAI CHARACTER KHOMUT
172 ๆ <> « ; # THAI CHARACTER MAIYAMOK
174 # moved down to make shorter first
# NOTE(review): presumably these two rules were placed last so that the longer
# i-based rules earlier in the file are tried before the bare "i" rule -- confirm.
175 #Note: PHINTHU deviates from ISO since underring causes canonical problems. So it uses spacing tick below.
176 ฺ <> ˌ ; # THAI CHARACTER PHINTHU
177 ิ <> i ; # THAI CHARACTER SARA I