]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | # © 2016 and later: Unicode, Inc. and others. |
2 | # License & terms of use: http://www.unicode.org/copyright.html#License | |
3 | # | |
4 | # File: Zawgyi_my.txt | |
5 | # Generated from CLDR | |
6 | # | |
7 | ||
8 | # This transform converts Zawgyi "encoded" Burmese into proper | |
9 | # Unicode. Zawgyi is a popular encoding scheme in Myanmar. It uses | |
10 | # the Myanmar Unicode range but assigns different characters or | |
11 | # glyphs to some codepoints. In addition to the character mapping, | |
12 | # there is reordering of codepoints needed to match the expected | |
13 | # Unicode order. This reordering is context-based. | |
14 | # | |
15 | # This transform is done in two main stages: | |
16 | # (1) Map all Zawgyi codepoints to their Unicode counterpart. | |
17 | # (2) Perform reordering. | |
18 | # Shared definitions. $nondigits is any character outside the modern Burmese digit range 1040-1049. | |
19 | $nondigits = [^\u1040-\u1049]; | |
20 | $consonant = [\u1000-\u1021]; | |
21 | $vowelsign = [\u102B-\u1030\u1032]; # Unicode vowel signs except E (1031) | |
22 | $umedial = [\u103B-\u103E]; # Medial codepoints in Unicode | |
23 | $vowelmedial = [\u102B-\u1030\u1032\u103B-\u103F]; # Union of $vowelsign and $umedial, plus \u103F | |
24 | $ukinzi = \u1004\u103A\u1039; # Codepoints representing kinzi in Unicode | |
25 | # ZAWGYI MYANMAR CONSONANT SIGN MEDIAL RA | |
26 | # Zawgyi encodes this sign with several codepoints; all of them map to \u103C at the end of stage 1. | |
27 | $zmedialra = [\u103B\u107E-\u1084]; | |
28 | #### | |
29 | #### STAGE 1: CODEPOINT MAPPING FROM ZAWGYI TO UNICODE | |
30 | #### | |
31 | # Kinzi (predefined ligatures): Zawgyi \u1064 follows its base consonant. | |
32 | # Emit $ukinzi first, moving the base character to its right. | |
33 | ($consonant) \u103A \u1064 → $ukinzi $1 \u103B; | |
34 | ($consonant) \u1064 → $ukinzi $1; | |
35 | \u1064 → $ukinzi; | |
36 | # Special cases: \u108B-\u108D combine kinzi with a following sign (see the bare mappings at the end of this group); the base character moves to the right of the kinzi. | |
37 | ($consonant) \u108b → $ukinzi $1 \u102D; | |
38 | ($consonant) \u108C → $ukinzi $1 \u102E; | |
39 | ($consonant) \u108D → $ukinzi $1 \u1036; | |
40 | # Special cases moving Kinzi block to left, across the consonant and \u103A (output as \u103B). The \u108E rule emits no kinzi. | |
41 | ($consonant) \u103A \u1033 \u108B → $ukinzi $1 \u103B \u102D \u102F; | |
42 | ($consonant) \u103A \u108b → $ukinzi $1 \u103B \u102D ; | |
43 | ($consonant) \u103A \u108C \u1033 → $ukinzi $1 \u103B \u102E \u102F; | |
44 | ($consonant) \u103A \u108C → $ukinzi $1 \u103B \u102E ; | |
45 | ($consonant) \u103A \u108D → $ukinzi $1 \u103B \u1036 ; | |
46 | ($consonant) \u103A \u108e → $1 \u103B \u102D \u1036 ; | |
47 | \u108B → $ukinzi \u102D ; | |
48 | \u108C → $ukinzi \u102E ; | |
49 | \u108D → $ukinzi \u1036 ; | |
50 | # Consonants (only the ones that have to change) | |
51 | \u106A ($vowelsign) \u1038 → \u1025 $1 \u1038 ; # U sound | |
52 | \u106A → \u1009 ; # NYA | |
53 | \u106B → \u100A ; | |
54 | \u108F → \u1014 ; | |
55 | \u1090 → \u101B ; | |
56 | \u1086 → \u103F ; | |
57 | # yapin (medial ya): Zawgyi \u103A and \u107D become \u103B | |
58 | \u103A → \u103B ; | |
59 | \u107D → \u103B ; | |
60 | # wasway: Zawgyi \u103C becomes \u103D; \u108A expands to \u103D \u103E | |
61 | \u103C \u108A → \u103D \u103E; # To avoid duplicate medials | |
62 | \u103C → \u103D ; | |
63 | \u108A → \u103D \u103E ; | |
64 | # hatoh: Zawgyi \u103D and \u1087 become \u103E; \u1088 and \u1089 also carry a vowel sign | |
65 | \u103D → \u103E ; | |
66 | \u1087 → \u103E ; | |
67 | \u1088 → \u103E \u102F ; | |
68 | \u1089 → \u103E \u1030 ; | |
69 | # asat: Zawgyi \u1039 becomes \u103A | |
70 | \u1039 → \u103A ; | |
71 | # Vowels | |
72 | \u1033 → \u102F ; | |
73 | \u1034 → \u1030 ; | |
74 | \u105A → \u102B \u103A ; | |
75 | \u108E → \u102D \u1036 ; | |
76 | # lDot (dot below): Zawgyi \u1094 and \u1095 become \u1037 | |
77 | # Special case: move the dot to the right of the base consonant, after the medial and \u1031 | |
78 | \u1031 \u1094 ($consonant) \u103D → $1 \u103E \u1031 \u1037 ; | |
79 | \u1094 → \u1037 ; | |
80 | \u1095 → \u1037 ; | |
81 | # Special cases for 1025 vs 1009: \u1025 before a Zawgyi stacked-consonant code becomes \u1009, followed by \u1039 and the stacked consonant | |
82 | \u1025 \u1061 → \u1009 \u1039 \u1001; | |
83 | \u1025 \u1062 → \u1009 \u1039 \u1002; | |
84 | \u1025 \u1065 → \u1009 \u1039 \u1005; | |
85 | \u1025 \u1068 → \u1009 \u1039 \u1007; | |
86 | \u1025 \u1076 → \u1009 \u1039 \u1013; | |
87 | \u1025 \u1078 → \u1009 \u1039 \u1015; | |
88 | \u1025 \u107A → \u1009 \u1039 \u1017; | |
89 | \u1025 \u1079 → \u1009 \u1039 \u1016; | |
90 | ($consonant) \u103A \u1039 → $1 \u103A \u103B; | |
91 | # Stacked Consonants: each Zawgyi code becomes \u1039 plus a base consonant. Some codes share one target (1066/1067, 1071/1072, 1073/1074, 107B/1093); 1096 also adds \u103D. | |
92 | \u1060 → \u1039 \u1000 ; | |
93 | \u1061 → \u1039 \u1001 ; | |
94 | \u1062 → \u1039 \u1002 ; | |
95 | \u1063 → \u1039 \u1003 ; | |
96 | \u1065 → \u1039 \u1005 ; | |
97 | \u1066 → \u1039 \u1006 ; | |
98 | \u1067 → \u1039 \u1006 ; | |
99 | \u1068 → \u1039 \u1007 ; | |
100 | \u1069 → \u1039 \u1008 ; | |
101 | \u106C → \u1039 \u100B ; | |
102 | \u106D → \u1039 \u100C ; | |
103 | \u1070 → \u1039 \u100F ; | |
104 | \u1071 → \u1039 \u1010 ; | |
105 | \u1072 → \u1039 \u1010 ; | |
106 | \u1096 → \u1039 \u1010 \u103D; | |
107 | \u1073 → \u1039 \u1011 ; | |
108 | \u1074 → \u1039 \u1011 ; | |
109 | \u1075 → \u1039 \u1012 ; | |
110 | \u1076 → \u1039 \u1013 ; | |
111 | \u1077 → \u1039 \u1014 ; | |
112 | \u1078 → \u1039 \u1015 ; | |
113 | \u1079 → \u1039 \u1016 ; | |
114 | \u107A → \u1039 \u1017 ; | |
115 | \u107B → \u1039 \u1018 ; | |
116 | \u1093 → \u1039 \u1018 ; | |
117 | \u107C → \u1039 \u1019 ; | |
118 | \u1085 → \u1039 \u101C ; | |
119 | # Pre-defined ligatures: a single Zawgyi code expands to a multi-codepoint Unicode sequence | |
120 | \u106E → \u100D\u1039\u100D ; | |
121 | \u106F → \u100D\u1039\u100E ; | |
122 | \u1091 → \u100F\u1039\u100D ; | |
123 | \u1092 → \u100B\u1039\u100C ; | |
124 | \u1097 → \u100B\u1039\u100B ; | |
125 | \u104E → \u104E\u1004\u103A\u1038 ; | |
126 | # yayit (medial ra): every Zawgyi variant in $zmedialra becomes \u103C | |
127 | $zmedialra → \u103C ; | |
128 | #### | |
129 | #### STAGE 2: POST REORDERING RULES FOR UNICODE RENDERING | |
130 | #### Now every codepoint is Unicode. This starts conversion | |
131 | #### from semi-visual order to logical order. | |
132 | #### | |
133 | ::Null; | |
134 | # Case of MYANMAR digit being used instead of a letter | |
135 | # Lone digit zero before a non-digit becomes \u101D. NOTE(review): anchored with [$] here but with ^ plus a cursor mark in the digit-4 rule - confirm the difference is intended. | |
136 | [$] \u1040 ($nondigits) → \u101D $1; | |
137 | ([\u102b-\u103f]) \u1040 ($nondigits) → $1 \u101d $2; | |
138 | # Lone digit 4 before a non-digit becomes \u104E | |
139 | ^ \u1044 ($nondigits) → | \u104E $1 ; | |
140 | ([\u102b-\u103f]) \u1044 ($nondigits) → $1 \u104E $2; | |
141 | # Simpler replacements for Zawgyi 1025 | |
142 | \u1025 \u103A → \u1009 \u103A; | |
143 | \u1025 \u102E → \u1026; | |
144 | # Asat and dot below reordering, to Unicode NFC. | |
145 | \u103A\u1037 → \u1037\u103A; | |
146 | # Reorder some vowel signs: \u1036 moves after medials and vowel signs; \u102D, \u102E, \u1032 move before \u102B, \u102C, \u102F, \u1030 | |
147 | \u1036 ($umedial*) ($vowelsign+) → $1 $2 \u1036 ; | |
148 | ([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) → $2 $1; | |
149 | # Move ra medial (\u103C) after the consonant, but not others. | |
150 | \u103C ($consonant) → $1 \u103C; | |
151 | # Replace CA + YA with JHA | |
152 | \u1005\u103b → \u1008; | |
153 | #### | |
154 | #### Stage 3 | |
155 | #### Move \u1031 and \u103C after consonants, and \u1036 after medials. | |
156 | ::Null; | |
157 | # One or more \u1031 moved after consonant, with and without kinzi or medials | |
158 | ([\u1031]+) $ukinzi ($consonant) → $ukinzi $2 $1; | |
159 | ([\u1031]+) ($consonant) ($umedial+) → $2 $3 $1; | |
160 | ([\u1031]+) ($consonant) } [^\u103B\u103C\u103D\u103E] → $2 $1; | |
161 | \u103C \u103A \u1039 ($consonant) → \u103A \u1039 $1 \u103C; | |
162 | \u1036 ($umedial+) → $1 \u1036; | |
163 | #### | |
164 | #### Stage 4 | |
165 | #### Reordering medials, dot below, contractions, E sign, and asat. | |
166 | ::Null; | |
167 | # Reorder the medials into the order \u103B \u103C \u103D \u103E | |
168 | ([\u103C\u103D\u103E]+) \u103B → \u103B $1; | |
169 | ([\u103D\u103E]+) \u103C → \u103C $1; | |
170 | \u103E\u103D → \u103D\u103E ; | |
171 | # Contractions with vowel signs: \u1039 and its consonant move in front of \u1031 and the vowel signs | |
172 | ([\u1031]+) ($vowelsign*) \u1039 ($consonant) → \u1039 $3 $1 $2; | |
173 | ($vowelsign+) \u1039 ($consonant) → \u1039 $2 $1; | |
174 | # Move vowel sign E \u1031 after medials, but not across consonants | |
175 | ($umedial*) ([\u1031]+) ($umedial*) → $1 $3 $2; | |
176 | # Reorder dot below (\u1037) after medials and vowel diacritics | |
177 | \u1037 ([\u102D-\u1030\u1032\u1036]) → $1 \u1037; | |
178 | \u1037 ($umedial+) → $1 \u1037; | |
179 | # Move vowel signs after medials | |
180 | ($vowelsign+) ($umedial+) → $2 $1; | |
181 | # Reorder modifiers and asat: between two consonants, asat (\u103A) moves in front of a single modifier | |
182 | ($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant) → $1 \u103A $2 $3; | |
183 | #### | |
184 | #### Stage 5. More reorderings | |
185 | #### Vowel signs after medials, sort medials, move visarga after other signs. | |
186 | #### | |
187 | ::Null; | |
188 | ([\u1031]+) ($umedial+) → $2 $1; | |
189 | # More moving vowel signs after medials (one sign and one medial per match) | |
190 | ($vowelsign) ($umedial) → $2 $1; | |
191 | # Sort the medials into the order \u103B \u103C \u103D \u103E (one pair per match) | |
192 | ([\u103C\u103D\u103E]) \u103B → \u103B $1; | |
193 | ([\u103D\u103E]) \u103C → \u103C $1; | |
194 | \u103E\u103D → \u103D\u103E ; | |
195 | # Move visarga (\u1038) after other signs | |
196 | \u1038 ([$vowelmedial]) → $1 \u1038; | |
197 | \u1038 ([\u1036\u1037\u103A]) → $1 \u1038; | |
198 | # Reorder \u1036 after \u102F | |
199 | \u1036 \u102f → \u102f \u1036; | |
200 | ### | |
201 | ### Stage 6 | |
202 | ### Finish medial sorting, fix conflicting and extra diacritics | |
203 | ### | |
204 | ::Null; | |
205 | # Fix duplicate and overlapping modifiers: collapse repeats and keep one sign of each conflicting pair. | |
206 | \u102F \u102F+ → \u102F; | |
207 | \u102F \u1030 → \u102F; | |
208 | \u102F \u103A → \u102F; | |
209 | \u102D \u102E → \u102E; | |
210 | \u1036 \u1036+ → \u1036; | |
0f5d89e8 | 211 | \u103e \u103e+ → \u103e; # http://unicode.org/cldr/trac/ticket/10386 |
f3c0d7a5 A |
212 | # Remove space directly before diacritics. |
213 | [:WSpace:] ([\u102b-\u1032\u1036-\u103e]) → $1; | |
214 | # Fix 103B/103A order for asat: after a consonant, asat (\u103A) goes before \u103B. | |
215 | ($consonant) \u103B \u103A → $1 \u103A \u103B; | |
216 |