]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | # © 2016 and later: Unicode, Inc. and others. |
2 | # License & terms of use: http://www.unicode.org/copyright.html#License | |
3 | # | |
4 | # File: Zawgyi_my.txt | |
5 | # Generated from CLDR | |
6 | # | |
7 | ||
8 | # This transform converts Zawgyi "encoded" Burmese into proper | |
9 | # unicode. Zawgyi is a popular encoding scheme in Myanmar. It uses | |
10 | # the Myanmar unicode range but assigns different characters or | |
11 | # glyphs to some codepoints. In addition to the character mapping, | |
12 | # there is reordering of codepoints needed to match the expected | |
13 | # unicode order. This reordering is context-based. | |
14 | # | |
15 | # This transform is done in two main stages: | |
16 | # (1) Map all Zawgyi codepoints to their Unicode counterpart. | |
17 | # (2) Perform reordering. | |
18 | # Modern Burmese digits & Unicode code points. | |
19 | $nondigits = [^\u1040-\u1049]; | |
20 | $consonant = [\u1000-\u1021]; | |
21 | $vowelsign = [\u102B-\u1030\u1032]; # Unicode vowel signs except E (1031) | |
3d1f044b | 22 | $vowelsAndConsonants = [\u1000-\u102a]; |
f3c0d7a5 | 23 | $umedial = [\u103B-\u103E]; # Medial codepoints in Unicode |
3d1f044b | 24 | $vowelmedial = [\u102B-\u1030\u1032\u1036\u1037\u103A-\u103F]; # Vowel signs, \u1036, \u1037 and \u103A-\u103F (asat and medials included) |
f3c0d7a5 | 25 | $ukinzi = \u1004\u103A\u1039; # Codepoints representing kinzi in Unicode |
3d1f044b | 26 | # Zawgyi medial ra has multiple representations |
f3c0d7a5 | 27 | $zmedialra = [\u103B\u107E-\u1084]; |
3d1f044b | 28 | $wspace = [\u0020\u00a0\u1680\u2000-\u200d\u2060\u202f\u205f\u3000\ufeff]; |
f3c0d7a5 A |
29 | #### |
30 | #### STAGE 1: CODEPOINT MAPPING FROM ZAWGYI TO UNICODE | |
31 | #### | |
32 | # Kinzi (predefined ligatures) | |
33 | # Move base character to the right | |
34 | ($consonant) \u103A \u1064 → $ukinzi $1 \u103B; | |
35 | ($consonant) \u1064 → $ukinzi $1; | |
36 | \u1064 → $ukinzi; | |
3d1f044b A |
37 | # Special cases moving base character to right before vowel signs |
38 | ($consonant) \u108B → $ukinzi $1 \u102D; | |
f3c0d7a5 A |
39 | ($consonant) \u108C → $ukinzi $1 \u102E; |
40 | ($consonant) \u108D → $ukinzi $1 \u1036; | |
41 | # Special cases moving Kinzi block to left | |
42 | ($consonant) \u103A \u1033 \u108B → $ukinzi $1 \u103B \u102D \u102F; | |
43 | ($consonant) \u103A \u108b → $ukinzi $1 \u103B \u102D ; | |
f3c0d7a5 A |
44 | ($consonant) \u103A \u108C → $ukinzi $1 \u103B \u102E ; |
45 | ($consonant) \u103A \u108D → $ukinzi $1 \u103B \u1036 ; | |
46 | ($consonant) \u103A \u108e → $1 \u103B \u102D \u1036 ; | |
47 | \u108B → $ukinzi \u102D ; | |
48 | \u108C → $ukinzi \u102E ; | |
49 | \u108D → $ukinzi \u1036 ; | |
50 | # Consonants (only the ones that have to change) | |
f3c0d7a5 A |
51 | \u106A → \u1009 ; # NYA |
52 | \u106B → \u100A ; | |
53 | \u108F → \u1014 ; | |
54 | \u1090 → \u101B ; | |
55 | \u1086 → \u103F ; | |
56 | # yapin | |
3d1f044b A |
57 | [\u103A\u107d] → \u103B ; # Set members are juxtaposed; a pipe inside the set would be matched as a literal character |
58 | # yayit | |
59 | ($zmedialra)+ → \u103C ; | |
f3c0d7a5 | 60 | # wasway |
3d1f044b | 61 | \u103C* \u108A → \u103D \u103E; # To avoid duplicate medials |
f3c0d7a5 | 62 | \u103C → \u103D ; |
f3c0d7a5 | 63 | # hatoh |
3d1f044b | 64 | [\u103D\u1087] → \u103E ; # Set members are juxtaposed; a pipe inside the set would be matched as a literal character |
f3c0d7a5 A |
65 | \u1088 → \u103E \u102F ; |
66 | \u1089 → \u103E \u1030 ; | |
f3c0d7a5 A |
67 | # Vowels |
68 | \u1033 → \u102F ; | |
69 | \u1034 → \u1030 ; | |
3d1f044b A |
70 | # asat |
71 | \u1039 → \u103A ; | |
72 | # lower dot | |
73 | [\u1094\u1095] → \u1037 ; | |
74 | # Special cases for 1025 vs 1009; | |
75 | \u1025 \u1039 → \u1009 \u103a; | |
f3c0d7a5 A |
76 | \u1025 \u1061 → \u1009 \u1039 \u1001; |
77 | \u1025 \u1062 → \u1009 \u1039 \u1002; | |
78 | \u1025 \u1065 → \u1009 \u1039 \u1005; | |
79 | \u1025 \u1068 → \u1009 \u1039 \u1007; | |
80 | \u1025 \u1076 → \u1009 \u1039 \u1013; | |
81 | \u1025 \u1078 → \u1009 \u1039 \u1015; | |
82 | \u1025 \u107A → \u1009 \u1039 \u1017; | |
83 | \u1025 \u1079 → \u1009 \u1039 \u1016; | |
f3c0d7a5 | 84 | # Stacked Consonants |
3d1f044b | 85 | \u105A → \u102B \u103A ; |
f3c0d7a5 A |
86 | \u1060 → \u1039 \u1000 ; |
87 | \u1061 → \u1039 \u1001 ; | |
88 | \u1062 → \u1039 \u1002 ; | |
89 | \u1063 → \u1039 \u1003 ; | |
90 | \u1065 → \u1039 \u1005 ; | |
3d1f044b | 91 | [\u1066\u1067] → \u1039 \u1006 ; |
f3c0d7a5 A |
92 | \u1068 → \u1039 \u1007 ; |
93 | \u1069 → \u1039 \u1008 ; | |
94 | \u106C → \u1039 \u100B ; | |
95 | \u106D → \u1039 \u100C ; | |
96 | \u1070 → \u1039 \u100F ; | |
3d1f044b | 97 | [\u1071\u1072] → \u1039 \u1010 ; |
f3c0d7a5 | 98 | \u1096 → \u1039 \u1010 \u103D; |
3d1f044b | 99 | [\u1073\u1074] → \u1039 \u1011 ; |
f3c0d7a5 A |
100 | \u1075 → \u1039 \u1012 ; |
101 | \u1076 → \u1039 \u1013 ; | |
102 | \u1077 → \u1039 \u1014 ; | |
103 | \u1078 → \u1039 \u1015 ; | |
104 | \u1079 → \u1039 \u1016 ; | |
105 | \u107A → \u1039 \u1017 ; | |
3d1f044b | 106 | [\u107B\u1093] → \u1039 \u1018 ; |
f3c0d7a5 A |
107 | \u107C → \u1039 \u1019 ; |
108 | \u1085 → \u1039 \u101C ; | |
3d1f044b | 109 | \u108E → \u102D \u1036 ; |
f3c0d7a5 A |
110 | # Pre-defined ligatures |
111 | \u106E → \u100D\u1039\u100D ; | |
112 | \u106F → \u100D\u1039\u100E ; | |
113 | \u1091 → \u100F\u1039\u100D ; | |
114 | \u1092 → \u100B\u1039\u100C ; | |
115 | \u1097 → \u100B\u1039\u100B ; | |
116 | \u104E → \u104E\u1004\u103A\u1038 ; | |
f3c0d7a5 | 117 | #### |
3d1f044b | 118 | #### STAGE 1.01: Digits 0 and 4 used instead of letters |
f3c0d7a5 | 119 | # Case of MYANMAR digit being used instead of a letter |
3d1f044b A |
120 | # Lone digit zero and four at start |
121 | ::Null; | |
122 | ^ \u1040 ($nondigits) → \u101D $1; | |
f3c0d7a5 | 123 | ^ \u1044 ($nondigits) → | \u104E $1 ; |
3d1f044b A |
124 | # Lone digit zero or four at end |
125 | ($nondigits) \u1040 $ → $1 \u101D; | |
126 | ($nondigits) \u1044 $ → $1 \u104e; | |
127 | # Evowel and dependent vowel signs before 0 or 4 only | |
128 | # -> convert to the consonant. | |
129 | ([\u102b-\u103f]) \u1040 ($nondigits) → $1 \u101d $2; | |
f3c0d7a5 | 130 | ([\u102b-\u103f]) \u1044 ($nondigits) → $1 \u104E $2; |
3d1f044b A |
131 | #### |
132 | #### STAGE 1.1: Strip spaces immediately before combining characters. | |
133 | #### Move e-vowel after consonants and medials | |
134 | #### Now every codepoint is Unicode. This starts conversion | |
135 | #### from semi-visual order to logical order. | |
136 | #### | |
137 | ::Null; | |
138 | # Don't remove spaces before E vowel or medial Ra at this stage | |
139 | ($wspace) \u1037 > \u1037 $1; | |
140 | ($wspace+) ([\u102b-\u1030\u1032-\u103b\u103d\u103e]) → $2; | |
141 | # Remove a duplicate early | |
142 | \u1037+ → \u1037; | |
143 | # Move e-vowel after medials and consonants. | |
144 | \u1031+ $ukinzi ($consonant) > $ukinzi $1 \u1031; | |
145 | \u1031+ \u1037+ ($consonant) > $1 \u1031 \u1037 ; | |
146 | \u1031+ \u103c ($consonant) > $1 \u103c \u1031; | |
147 | # Move medials other than 103c before the 1031. Leave 103c for | |
148 | # the next consonant. | |
149 | \u1031+ ($consonant) ([\u103b\u103d\u103e]+) > $1 $2 \u1031; | |
150 | \u1031+ ($vowelsAndConsonants) > $1 \u1031; | |
151 | #### | |
152 | #### STAGE 2: POST REORDERING RULES FOR UNICODE RENDERING | |
153 | #### | |
154 | ::Null; | |
155 | \u103b \u103a > \u103a \u103b; | |
f3c0d7a5 | 156 | # Simpler replacements for Zawgyi 1025 |
f3c0d7a5 A |
157 | \u1025 \u102E → \u1026; |
158 | # Asat and dot below reordering, to Unicode NFC. | |
159 | \u103A\u1037 → \u1037\u103A; | |
160 | # Reorder some vowel signs | |
161 | \u1036 ($umedial*) ($vowelsign+) → $1 $2 \u1036 ; | |
162 | ([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) → $2 $1; | |
3d1f044b | 163 | # Move ra medial which precedes consonant, but not other medials. |
f3c0d7a5 | 164 | \u103C ($consonant) → $1 \u103C; |
f3c0d7a5 A |
165 | #### |
166 | #### Stage 3 | |
3d1f044b | 167 | #### Move medials (including \u103C) after stacked consonants, and \u1036 after medials. |
f3c0d7a5 | 168 | ::Null; |
3d1f044b | 169 | ($umedial) \u1039 ($consonant) > \u1039 $2 $1; |
f3c0d7a5 A |
170 | \u103C \u103A \u1039 ($consonant) → \u103A \u1039 $1 \u103C; |
171 | \u1036 ($umedial+) → $1 \u1036; | |
172 | #### | |
173 | #### Stage 4 | |
174 | #### Reordering medials, dot below, contractions, E sign, and asat. | |
175 | ::Null; | |
176 | # Reorder the medials | |
177 | ([\u103C\u103D\u103E]+) \u103B → \u103B $1; | |
178 | ([\u103D\u103E]+) \u103C → \u103C $1; | |
179 | \u103E\u103D → \u103D\u103E ; | |
180 | # Contractions with vowel signs | |
181 | ([\u1031]+) ($vowelsign*) \u1039 ($consonant) → \u1039 $3 $1 $2; | |
182 | ($vowelsign+) \u1039 ($consonant) → \u1039 $2 $1; | |
183 | # Move vowel sign E \u1031 after medials, but not across consonants | |
184 | ($umedial*) ([\u1031]+) ($umedial*) → $1 $3 $2; | |
185 | # Reorder dot below after medials and vowel diacritics | |
3d1f044b | 186 | \u1037 ([\u102D-\u1030\u1032\u1036\u103b-\u103e]+) → $1 \u1037; |
f3c0d7a5 A |
187 | # Move vowel signs after medials |
188 | ($vowelsign+) ($umedial+) → $2 $1; | |
189 | # Reorder modifiers and asat | |
190 | ($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant) → $1 \u103A $2 $3; | |
191 | #### | |
192 | #### Stage 5. More reorderings | |
193 | #### Vowel signs after medials, sort medials, visarga after other signs. | |
194 | #### | |
195 | ::Null; | |
3d1f044b A |
196 | # Replace CA + YA with JHA after moving other things beyond the medials. |
197 | \u1005 \u103b → \u1008; | |
f3c0d7a5 | 198 | # More moving vowel signs after medials |
3d1f044b | 199 | ([\u102b-\u1032]) ($umedial) → $2 $1; |
f3c0d7a5 A |
200 | # Sort the medials |
201 | ([\u103C\u103D\u103E]) \u103B → \u103B $1; | |
202 | ([\u103D\u103E]) \u103C → \u103C $1; | |
203 | \u103E\u103D → \u103D\u103E ; | |
3d1f044b A |
204 | # Move visarga after other signs |
205 | \u1038 ($vowelmedial) → $1 \u1038; | |
f3c0d7a5 A |
206 | # Reorder |
207 | \u1036 \u102f → \u102f \u1036; | |
208 | ### | |
209 | ### Stage 6 | |
3d1f044b | 210 | ### Fix conflicting and duplicate diacritics. Remove some white space. |
f3c0d7a5 A |
211 | ### |
212 | ::Null; | |
3d1f044b A |
213 | # Fix duplicate combiners |
214 | \u102D \u102D+ → \u102D; | |
215 | \u102E \u102E+ → \u102E; | |
f3c0d7a5 | 216 | \u102F \u102F+ → \u102F; |
3d1f044b A |
217 | \u1030 \u1030+ → \u1030; |
218 | \u1032 \u1032+ → \u1032; | |
f3c0d7a5 | 219 | \u1036 \u1036+ → \u1036; |
3d1f044b A |
220 | \u1037 \u1037+ → \u1037; |
221 | \u1039 \u1039+ → \u1039; | |
222 | \u103a \u103a+ → \u103a; | |
223 | \u103b \u103b+ → \u103b; | |
224 | \u103c \u103c+ → \u103c; | |
225 | \u103d \u103d+ → \u103d; | |
0f5d89e8 | 226 | \u103e \u103e+ → \u103e; # http://unicode.org/cldr/trac/ticket/10386 |
3d1f044b A |
227 | # Fix overlapping signs |
228 | \u102F [\u1030\u103a] → \u102F; | |
229 | \u102D \u102E → \u102E; | |
f3c0d7a5 | 230 | # Remove space directly before diacritics. |
3d1f044b A |
231 | ($wspace)+ ([\u102b-\u1032\u1036-\u103e]) → $2; |
232 | # Remove ZWSP at start and end | |
233 | ^ \u200b+ → ; | |
234 | \u200b+ $ → ; | |
235 | # Fix multiple spaces around ZWSP to single ZWSP. | |
236 | $wspace* \u200b $wspace* → \u200b; | |
f3c0d7a5 | 237 |