]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | #-------------------------------------------------------------------- |
2 | # Copyright (c) 1999-2004, International Business Machines | |
3 | # Corporation and others. All Rights Reserved. | |
4 | #-------------------------------------------------------------------- | |
5 | ||
6 | # Thai-Latin | |
7 | # This set of rules follows ISO 11940 | |
8 | # see http://homepage.mac.com/sirbinks/pdf/Thai.r2.pdf | |
9 | # except that that document does not mention an implicit vowel, so we use ọ | |
10 | # | |
11 | # The transcription is fairly ugly, so we ought to also do the UNGEGN version | |
12 | # see: http://www.eki.ee/wgrs/rom1_th.pdf | |
13 | # and probably make that the main variant. | |
14 | ||
15 | # Note: this is an internal file. NFD/NFC normalization is handled externally, in the index | |
16 | # The insertion of spaces between words, the reversal of the vowels | |
17 | # and the conversion of space to semicolon are done *outside* of these rules. | |
18 | # So as far as these rules are concerned, the vowels are in logical order! | |
19 | ||
20 | # insert implicit vowel (and remove it going the other way) | |
21 | # COMMENTED out: the implicit vowel positions cannot be predicted algorithmically | |
22 | #$consonant = [ก-ฮ]; | |
23 | #$vowel = [ะ-ฺเ-ไ็]; | |
24 | ||
25 | #{ ( $consonant ) } [^$vowel ] > | $1 ; | |
26 | # > ọ ; | |
27 | # < ọ ; | |
28 | ||
29 | $notAbove = [^\p{ccc=0}\p{ccc=above}] ; | |
30 | $notBelow = [^\p{ccc=0}\p{ccc=below}] ; | |
31 | ||
32 | # Consonants | |
33 | # Warning: the 'h's need to be handled carefully! An h followed by an h accent is a letter of its own (HO HIP, HO NOKHUK), so kh/ph/ch/th must only be read as digraphs when no h accent follows. | |
34 | # What we really want to say is the following, but we can't express it in these rules: | |
35 | # $notHAccent = !($notAbove* ̄ | $notBelow* ̣) ; | |
36 | ||
37 | # Since the only accents we care about that could cause problems are free-standing accents below, we use instead two contexts: $notHAccent0 (one character that is neither a free-standing below accent nor an h accent) and $notHAccent1 (a free-standing below accent followed by a character that is not an h accent). | |
38 | $freeStandingBelow = [\u0325 ]; | |
39 | $hAccent = [ ̄ ̣]; | |
40 | $notHAccent0 = [^$freeStandingBelow$hAccent]; | |
41 | $notHAccent1 = $freeStandingBelow [^$hAccent]; | |
42 | ||
43 | ห > h̄ ; # THAI CHARACTER HO HIP | |
44 | ห | $1 < h ($notAbove*) ̄; # backward case for HO HIP: marks matching $notAbove may sit between h and the macron after reordering; they are captured as $1 and re-emitted after the cursor | |
45 | ฮ <> ḥ ; # THAI CHARACTER HO NOKHUK | |
46 | ||
47 | ข <> k̄h ; # THAI CHARACTER KHO KHAI | |
48 | ฃ <> ḳ̄h ; # THAI CHARACTER KHO KHUAT | |
49 | ฅ <> kʹh ; # THAI CHARACTER KHO KHON | |
50 | ฆ <> ḳh ; # THAI CHARACTER KHO RAKHANG | |
51 | ค < kh } $notHAccent1 ; # THAI CHARACTER KHO KHWAI (backward only: kh followed by a free-standing below accent and then a non-h-accent) | |
52 | ค <> kh } $notHAccent0 ; # THAI CHARACTER KHO KHWAI (next character is neither a free-standing below accent nor an h accent) | |
53 | ก <> k ; # THAI CHARACTER KO KAI | |
54 | ||
55 | ภ <> p̣h ; # THAI CHARACTER PHO SAMPHAO | |
56 | ผ <> p̄h ; # THAI CHARACTER PHO PHUNG | |
57 | พ < ph } $notHAccent1 ; # THAI CHARACTER PHO PHAN (backward only: ph followed by a free-standing below accent and then a non-h-accent) | |
58 | พ <> ph } $notHAccent0 ; # THAI CHARACTER PHO PHAN (next character is neither a free-standing below accent nor an h accent) | |
59 | ป <> p ; # THAI CHARACTER PO PLA | |
60 | ||
61 | ฉ <> c̄h ; # THAI CHARACTER CHO CHING | |
62 | ฌ <> c̣h ; # THAI CHARACTER CHO CHOE | |
63 | ช < ch } $notHAccent1 ; # THAI CHARACTER CHO CHANG (backward only: ch followed by a free-standing below accent and then a non-h-accent) | |
64 | ช <> ch } $notHAccent0 ; # THAI CHARACTER CHO CHANG (next character is neither a free-standing below accent nor an h accent) | |
65 | จ <> c ; # THAI CHARACTER CHO CHAN | |
66 | ||
67 | ฐ <> ṭ̄h ; # THAI CHARACTER THO THAN | |
68 | ฑ <> ṯh ; # THAI CHARACTER THO NANGMONTHO | |
69 | ฒ <> tʹh ; # THAI CHARACTER THO PHUTHAO | |
70 | ถ <> t̄h ; # THAI CHARACTER THO THUNG | |
71 | ธ <> ṭh ; # THAI CHARACTER THO THONG | |
72 | ท < th } $notHAccent1 ; # THAI CHARACTER THO THAHAN (backward only: th followed by a free-standing below accent and then a non-h-accent) | |
73 | ท <> th } $notHAccent0 ; # THAI CHARACTER THO THAHAN (next character is neither a free-standing below accent nor an h accent) | |
74 | #Note: TO PATAK deviates from ISO since t-dotunder + h would be ambiguous (it is already THO THONG). So it uses vertical tick. | |
75 | ฏ <> t̩ ; # THAI CHARACTER TO PATAK | |
76 | ต <> t ; # THAI CHARACTER TO TAO | |
77 | ||
78 | # No rule generates a singleton g, so "ng" needs no lookahead context of the kind used for the h digraphs. | |
79 | ง <> ng ; # THAI CHARACTER NGO NGU | |
80 | ณ <> ṇ ; # THAI CHARACTER NO NEN | |
81 | น <> n ; # THAI CHARACTER NO NU | |
82 | ||
83 | ญ <> ỵ ; # THAI CHARACTER YO YING | |
84 | ฎ <> ḍ ; # THAI CHARACTER DO CHADA | |
85 | ด <> d ; # THAI CHARACTER DO DEK | |
86 | ||
87 | บ <> b ; # THAI CHARACTER BO BAIMAI | |
88 | ฝ <> f̄ ; # THAI CHARACTER FO FA -- NOTE(review): uses <> where HO HIP / SO SUA use > plus a separate backward rule; confirm the difference is intended | |
89 | ฝ | $1 < f ($notAbove*) ̄; # backward case for FO FA: marks matching $notAbove between f and the macron are captured as $1 and re-emitted after the cursor | |
90 | ||
91 | ม <> m ; # THAI CHARACTER MO MA | |
92 | ย <> y ; # THAI CHARACTER YO YAK | |
93 | ร <> r ; # THAI CHARACTER RO RUA | |
94 | ฤ <> v ; # THAI CHARACTER RU | |
95 | ฦ <> ł ; # THAI CHARACTER LU | |
96 | ว <> w ; # THAI CHARACTER WO WAEN | |
97 | ||
98 | ศ <> ṣ̄ ; # THAI CHARACTER SO SALA*** | |
99 | ศ | $1 < s ̣ ($notAbove*) ̄; # backward case for SO SALA: marks matching $notAbove between the dot below and the macron are captured as $1 and re-emitted after the cursor | |
100 | ษ <> s̄ʹ ; # THAI CHARACTER SO RUSI | |
101 | ส > s̄ ; # THAI CHARACTER SO SUA*** | |
102 | ส | $1 < s ($notAbove*) ̄; # backward case for SO SUA: marks matching $notAbove between s and the macron are captured as $1 and re-emitted after the cursor | |
103 | ||
104 | ฬ <> ḷ ; # THAI CHARACTER LO CHULA | |
105 | ล <> l ; # THAI CHARACTER LO LING | |
106 | ฟ <> f ; # THAI CHARACTER FO FAN | |
107 | ||
108 | อ <> x ; # THAI CHARACTER O ANG | |
109 | ซ <> s ; # THAI CHARACTER SO SO | |
110 | ||
111 | # vowels (handled here in logical order) | |
112 | ||
113 | ั <> ạ ; # THAI CHARACTER MAI HAN-AKAT | |
114 | ||
115 | า > ā ; # THAI CHARACTER SARA AA | |
116 | า | $1 < a ($notAbove*) ̄; # backward case for SARA AA: marks matching $notAbove between a and the macron are captured as $1 and re-emitted after the cursor | |
117 | ||
118 | # We deviate from ISO for SARA AM for disambiguation | |
119 | ำ > a ̉; # THAI CHARACTER SARA AM | |
120 | ำ | $1 < a ($notAbove*) ̉ ; # backward case for SARA AM: marks matching $notAbove between a and the hook are captured as $1 and re-emitted after the cursor | |
121 | ||
122 | ะ <> a ; # THAI CHARACTER SARA A | |
123 | ี <> ī ; # THAI CHARACTER SARA II | |
124 | ี | $1 < i ($notAbove*) ̄ ; # backward case for SARA II: marks matching $notAbove between i and the macron are captured as $1 and re-emitted after the cursor | |
125 | ||
126 | ื <> ụ̄ ; # THAI CHARACTER SARA UEE | |
127 | ื | $1 < u ̣ ($notAbove*) ̄ ; # backward case for SARA UEE: marks matching $notAbove between the dot below and the macron are captured as $1 and re-emitted after the cursor | |
128 | ||
129 | ึ <> ụ ; # THAI CHARACTER SARA UE | |
130 | ู <> ū ; # THAI CHARACTER SARA UU | |
131 | ู | $1 < u ($notAbove*) ̄ ; # backward case for SARA UU: marks matching $notAbove between u and the macron are captured as $1 and re-emitted after the cursor | |
132 | ||
133 | ุ <> u ; # THAI CHARACTER SARA U | |
134 | ||
135 | ฯ <> ‡ ; # THAI CHARACTER PAIYANNOI | |
136 | ||
137 | # ฿ <> XXX ; # THAI CURRENCY SYMBOL BAHT (rule disabled; no mapping is defined for it here) | |
138 | ||
139 | เ <> e ; # THAI CHARACTER SARA E | |
140 | แ <> æ ; # THAI CHARACTER SARA AE | |
141 | โ <> o ; # THAI CHARACTER SARA O | |
142 | ใ <> ı ; # THAI CHARACTER SARA AI MAIMUAN | |
143 | ไ <> ị ; # THAI CHARACTER SARA AI MAIMALAI | |
144 | ๅ <> ɨ ; # THAI CHARACTER LAKKHANGYAO | |
145 | ็ <> ̆ ; # THAI CHARACTER MAITAIKHU | |
146 | ่ <> ̀ ; # THAI CHARACTER MAI EK | |
147 | ้ <> ̂ ; # THAI CHARACTER MAI THO | |
148 | ๊ <> ́ ; # THAI CHARACTER MAI TRI | |
149 | ๋ <> ̌ ; # THAI CHARACTER MAI CHATTAWA | |
150 | ์ <> ̒ ; # THAI CHARACTER THANTHAKHAT | |
151 | ๎ <> '~' ; # THAI CHARACTER YAMAKKAN | |
152 | ||
153 | # We deviate from ISO for NIKHAHIT for disambiguation | |
154 | ํ <> ̊ ; # THAI CHARACTER NIKHAHIT | |
155 | ||
156 | ๏ <> § ; # THAI CHARACTER FONGMAN | |
157 | ||
158 | ๐ <> 0 ; # THAI DIGIT ZERO | |
159 | ๑ <> 1 ; # THAI DIGIT ONE | |
160 | ๒ <> 2 ; # THAI DIGIT TWO | |
161 | ๓ <> 3 ; # THAI DIGIT THREE | |
162 | ๔ <> 4 ; # THAI DIGIT FOUR | |
163 | ๕ <> 5 ; # THAI DIGIT FIVE | |
164 | ๖ <> 6 ; # THAI DIGIT SIX | |
165 | ๗ <> 7 ; # THAI DIGIT SEVEN | |
166 | ๘ <> 8 ; # THAI DIGIT EIGHT | |
167 | ๙ <> 9 ; # THAI DIGIT NINE | |
168 | ||
169 | ๚ <> '||' ; # THAI CHARACTER ANGKHANKHU | |
170 | ||
171 | ๛ <> » ; # THAI CHARACTER KHOMUT | |
172 | ๆ <> « ; # THAI CHARACTER MAIYAMOK | |
173 | ||
174 | # The two rules below were moved down (presumably so that longer romanizations starting with the same letter, e.g. ī and ị, are tried before plain i -- TODO confirm) | |
175 | #Note: PHINTHU deviates from ISO since underring causes canonical problems. So it uses spacing tick below. | |
176 | ฺ <> ˌ ; # THAI CHARACTER PHINTHU | |
177 | ิ <> i ; # THAI CHARACTER SARA I | |
178 | ||
179 | # fallbacks: backward-only rules that rewrite Latin letters no rule above produces on its own (g, h, j, q, z) to a letter that is handled; the cursor | is placed before the replacement so it is transliterated again. The final :: (lower) line applies lowercasing (in ICU rule syntax the parenthesized form belongs to the reverse direction). | |
180 | ||
181 | | k < g ; | |
182 | | k < h ; | |
183 | | c < j ; | |
184 | | k < q ; | |
185 | | s < z ; | |
186 | ||
187 | :: (lower); |