]> git.saurik.com Git - apple/icu.git/blame - icuSources/data/translit/Zawgyi_my.txt
ICU-62107.0.1.tar.gz
[apple/icu.git] / icuSources / data / translit / Zawgyi_my.txt
CommitLineData
f3c0d7a5
A
1# © 2016 and later: Unicode, Inc. and others.
2# License & terms of use: http://www.unicode.org/copyright.html#License
3#
4# File: Zawgyi_my.txt
5# Generated from CLDR
6#
7
8# This transform converts Zawgyi-encoded Burmese text into standard
9# Unicode. Zawgyi is a popular encoding scheme in Myanmar. It uses
10# the Myanmar Unicode range but assigns different characters or
11# glyphs to some codepoints. In addition to the character mapping,
12# codepoints must be reordered to match the expected Unicode
13# order. This reordering is context-based.
14#
15# This transform is done in two main stages:
16# (1) Map all Zawgyi codepoints to their Unicode counterpart (Stage 1).
17# (2) Perform reordering (Stages 2-6, each a separate ::Null; pass).
18# Shared variables: Modern Burmese digits, letters, signs and medials used by the rules below.
19$nondigits = [^\u1040-\u1049]; # Any character other than the digits 1040-1049
20$consonant = [\u1000-\u1021]; # Letters 1000-1021, used as the base character in the rules
21$vowelsign = [\u102B-\u1030\u1032]; # Unicode vowel signs except E (1031)
22$umedial = [\u103B-\u103E]; # Medial codepoints in Unicode
23$vowelmedial = [\u102B-\u1030\u1032\u103B-\u103F]; # Vowel signs plus 103B-103F. NOTE(review): 103F is not in $umedial - confirm it is intended here
24$ukinzi = \u1004\u103A\u1039; # Codepoints representing kinzi in Unicode
25# ZAWGYI MYANMAR CONSONANT SIGN MEDIAL RA
26# This character has multiple representations in the Zawgyi font.
27$zmedialra = [\u103B\u107E-\u1084]; # Every member is mapped to \u103C by the yayit rule of Stage 1
28####
29#### STAGE 1: CODEPOINT MAPPING FROM ZAWGYI TO UNICODE
30####
31# Kinzi (predefined ligatures): Zawgyi \u1064 and the fused forms \u108B-\u108D
32# Zawgyi has kinzi after the base; the output puts $ukinzi before the base
33($consonant) \u103A \u1064 → $ukinzi $1 \u103B; # Zawgyi \u103A (yapin) is written out as \u103B
34($consonant) \u1064 → $ukinzi $1;
35\u1064 → $ukinzi; # Fallback: no base letter directly before the kinzi
36# Kinzi fused with a sign: kinzi goes before the base, the sign stays after it
37($consonant) \u108b → $ukinzi $1 \u102D;
38($consonant) \u108C → $ukinzi $1 \u102E;
39($consonant) \u108D → $ukinzi $1 \u1036;
40# Same fused forms when the base also carries Zawgyi \u103A (and \u1033): kinzi still goes first
41($consonant) \u103A \u1033 \u108B → $ukinzi $1 \u103B \u102D \u102F;
42($consonant) \u103A \u108b → $ukinzi $1 \u103B \u102D ;
43($consonant) \u103A \u108C \u1033 → $ukinzi $1 \u103B \u102E \u102F;
44($consonant) \u103A \u108C → $ukinzi $1 \u103B \u102E ;
45($consonant) \u103A \u108D → $ukinzi $1 \u103B \u1036 ;
46($consonant) \u103A \u108e → $1 \u103B \u102D \u1036 ; # No $ukinzi here: \u108E only expands to \u102D \u1036
47\u108B → $ukinzi \u102D ; # Fallbacks for the fused forms without a preceding base letter
48\u108C → $ukinzi \u102E ;
49\u108D → $ukinzi \u1036 ;
50# Consonants (only the ones that have to change)
51\u106A ($vowelsign) \u1038 → \u1025 $1 \u1038 ; # U sound: \u106A + vowel sign + \u1038 becomes \u1025, not \u1009
52\u106A → \u1009 ; # NYA (every other occurrence of \u106A)
53\u106B → \u100A ;
54\u108F → \u1014 ;
55\u1090 → \u101B ;
56\u1086 → \u103F ;
57# yapin: both Zawgyi codes become Unicode \u103B
58\u103A → \u103B ;
59\u107D → \u103B ;
60# wasway: Zawgyi \u103C becomes Unicode \u103D; \u108A expands to \u103D \u103E
61\u103C \u108A → \u103D \u103E; # To avoid duplicate medials
62\u103C → \u103D ;
63\u108A → \u103D \u103E ;
64# hatoh: Zawgyi \u103D and its variants become Unicode \u103E (plus \u102F or \u1030 for \u1088 and \u1089)
65\u103D → \u103E ;
66\u1087 → \u103E ;
67\u1088 → \u103E \u102F ;
68\u1089 → \u103E \u1030 ;
69# asat: Zawgyi \u1039 becomes Unicode \u103A
70\u1039 → \u103A ;
71# Vowels
72\u1033 → \u102F ;
73\u1034 → \u1030 ;
74\u105A → \u102B \u103A ;
75\u108E → \u102D \u1036 ;
76# lDot: Zawgyi \u1094 and \u1095 both become dot below \u1037
77# Special cases to move dot to right of base consonant
78\u1031 \u1094 ($consonant) \u103D → $1 \u103E \u1031 \u1037 ; # Also moves \u1031 after the base and maps Zawgyi \u103D to \u103E
79\u1094 → \u1037 ;
80\u1095 → \u1037 ;
81# Special cases for 1025 vs 1009
82\u1025 \u1061 → \u1009 \u1039 \u1001; # \u1025 before a stacked-consonant code is written as \u1009 + \u1039 + letter
83\u1025 \u1062 → \u1009 \u1039 \u1002;
84\u1025 \u1065 → \u1009 \u1039 \u1005;
85\u1025 \u1068 → \u1009 \u1039 \u1007;
86\u1025 \u1076 → \u1009 \u1039 \u1013;
87\u1025 \u1078 → \u1009 \u1039 \u1015;
88\u1025 \u107A → \u1009 \u1039 \u1017;
89\u1025 \u1079 → \u1009 \u1039 \u1016;
90($consonant) \u103A \u1039 → $1 \u103A \u103B; # Zawgyi yapin + asat after a base: output is \u103A followed by \u103B
91# Stacked Consonants: each Zawgyi code becomes \u1039 followed by the letter
92\u1060 → \u1039 \u1000 ;
93\u1061 → \u1039 \u1001 ;
94\u1062 → \u1039 \u1002 ;
95\u1063 → \u1039 \u1003 ;
96\u1065 → \u1039 \u1005 ;
97\u1066 → \u1039 \u1006 ; # \u1066 and \u1067 map to the same output
98\u1067 → \u1039 \u1006 ;
99\u1068 → \u1039 \u1007 ;
100\u1069 → \u1039 \u1008 ;
101\u106C → \u1039 \u100B ;
102\u106D → \u1039 \u100C ;
103\u1070 → \u1039 \u100F ;
104\u1071 → \u1039 \u1010 ; # \u1071 and \u1072 map to the same output
105\u1072 → \u1039 \u1010 ;
106\u1096 → \u1039 \u1010 \u103D; # Same stacked letter as \u1071/\u1072, followed by \u103D
107\u1073 → \u1039 \u1011 ; # \u1073 and \u1074 map to the same output
108\u1074 → \u1039 \u1011 ;
109\u1075 → \u1039 \u1012 ;
110\u1076 → \u1039 \u1013 ;
111\u1077 → \u1039 \u1014 ;
112\u1078 → \u1039 \u1015 ;
113\u1079 → \u1039 \u1016 ;
114\u107A → \u1039 \u1017 ;
115\u107B → \u1039 \u1018 ; # \u107B and \u1093 (below) map to the same output
116\u1093 → \u1039 \u1018 ;
117\u107C → \u1039 \u1019 ;
118\u1085 → \u1039 \u101C ;
119# Pre-defined ligatures: one Zawgyi code expands to a full Unicode sequence
120\u106E → \u100D\u1039\u100D ;
121\u106F → \u100D\u1039\u100E ;
122\u1091 → \u100F\u1039\u100D ;
123\u1092 → \u100B\u1039\u100C ;
124\u1097 → \u100B\u1039\u100B ;
125\u104E → \u104E\u1004\u103A\u1038 ; # \u104E is kept and \u1004 \u103A \u1038 is appended
126# yayit: every Zawgyi medial-ra form in $zmedialra becomes \u103C (its position is fixed in later stages)
127$zmedialra → \u103C ;
128####
129#### STAGE 2: POST REORDERING RULES FOR UNICODE RENDERING
130#### Now every codepoint is Unicode. This starts conversion
131#### from semi-visual order to logical order.
132####
133::Null;
134# Case of MYANMAR digit being used instead of a letter
135# Lone digit zero: replaced by \u101D when a non-digit follows
136[$] \u1040 ($nondigits) → \u101D $1; # NOTE(review): uses [$] where the digit-4 rule below uses ^ - presumably both anchor at the text edge; confirm
137([\u102b-\u103f]) \u1040 ($nondigits) → $1 \u101d $2; # Zero between a sign/medial (102B-103F) and a non-digit
138# Lone digit 4: replaced by \u104E when a non-digit follows
139^ \u1044 ($nondigits) → | \u104E $1 ; # The | places the cursor before the output so it is scanned again
140([\u102b-\u103f]) \u1044 ($nondigits) → $1 \u104E $2;
141# Simpler replacements for Zawgyi 1025
142\u1025 \u103A → \u1009 \u103A; # \u1025 directly before asat is written as \u1009
143\u1025 \u102E → \u1026; # \u1025 + \u102E collapses to the single code \u1026
144# Asat and dot below reordering, to Unicode NFC.
145\u103A\u1037 → \u1037\u103A;
146# Reorder some vowel signs
147\u1036 ($umedial*) ($vowelsign+) → $1 $2 \u1036 ; # \u1036 goes after any medials and vowel signs that follow it
148([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) → $2 $1; # 102D/102E/1032 go before 102B/102C/102F/1030
149# Move ra medial, but not others.
150\u103C ($consonant) → $1 \u103C; # \u103C before a base letter is moved to after it
151# Replace CA + YA with JHA
152\u1005\u103b → \u1008;
153####
154#### Stage 3
155#### Move \u1031, \u1036, and \u103C after consonants.
156::Null;
157# 1031 moved after consonant, with and without kinzi or medials
158([\u1031]+) $ukinzi ($consonant) → $ukinzi $2 $1; # With kinzi: kinzi, base, then the run of \u1031
159([\u1031]+) ($consonant) ($umedial+) → $2 $3 $1; # With medials: base, medials, then \u1031
160([\u1031]+) ($consonant) } [^\u103B\u103C\u103D\u103E] → $2 $1; # Plain base; the set after } is context only and must not be a medial
161\u103C \u103A \u1039 ($consonant) → \u103A \u1039 $1 \u103C; # \u103C before \u103A \u1039 + base is moved to after that base
162\u1036 ($umedial+) → $1 \u1036; # \u1036 goes after any medials that follow it
163####
164#### Stage 4
165#### Reordering medials, dot below, contractions, E sign, and asat.
166::Null;
167# Reorder the medials towards the order 103B, 103C, 103D, 103E
168([\u103C\u103D\u103E]+) \u103B → \u103B $1; # \u103B goes before any run of 103C-103E
169([\u103D\u103E]+) \u103C → \u103C $1; # \u103C goes before any run of 103D/103E
170\u103E\u103D → \u103D\u103E ;
171# Contractions with vowel signs: \u1039 + letter goes before \u1031 and vowel signs
172([\u1031]+) ($vowelsign*) \u1039 ($consonant) → \u1039 $3 $1 $2;
173($vowelsign+) \u1039 ($consonant) → \u1039 $2 $1;
174# Move vowel sign E \u1031 after medials, but not across consonants
175($umedial*) ([\u1031]+) ($umedial*) → $1 $3 $2; # Medials on both sides are gathered in front of the \u1031 run
176# Reorder dot below after medials and vowel diacritics
177\u1037 ([\u102D-\u1030\u1032\u1036]) → $1 \u1037;
178\u1037 ($umedial+) → $1 \u1037;
179# Move vowel signs after medials
180($vowelsign+) ($umedial+) → $2 $1;
181# Reorder modifiers and asat
182($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant) → $1 \u103A $2 $3; # base + sign + asat + base: asat is moved to directly after the first base
183####
184#### Stage 5. More reorderings
185#### Vowel signs after medials, sort medials, move visarga.
186####
187::Null;
188([\u1031]+) ($umedial+) → $2 $1; # Any \u1031 run still in front of medials goes after them
189# More moving vowel signs after medials
190($vowelsign) ($umedial) → $2 $1; # Swaps a single vowel sign with a single following medial
191# Sort the medials towards the order 103B, 103C, 103D, 103E (single-character swaps)
192([\u103C\u103D\u103E]) \u103B → \u103B $1;
193([\u103D\u103E]) \u103C → \u103C $1;
194\u103E\u103D → \u103D\u103E ;
195# Move visarga (\u1038) after other signs
196\u1038 ([$vowelmedial]) → $1 \u1038;
197\u1038 ([\u1036\u1037\u103A]) → $1 \u1038;
198# Reorder: \u102F goes before \u1036
199\u1036 \u102f → \u102f \u1036;
200###
201### Stage 6
202### Finish medial sorting, fix conflicting and extra diacritics
203###
204::Null;
205# Fix duplicate and overlapping modifiers.
206\u102F \u102F+ → \u102F; # Collapse a repeated \u102F to one
207\u102F \u1030 → \u102F; # \u102F followed by \u1030: keep only \u102F
208\u102F \u103A → \u102F; # \u102F followed by \u103A: the \u103A is dropped
209\u102D \u102E → \u102E; # \u102D followed by \u102E: keep only \u102E
210\u1036 \u1036+ → \u1036; # Collapse a repeated \u1036 to one
0f5d89e8 211\u103e \u103e+ → \u103e; # Collapse a repeated \u103E to one; see http://unicode.org/cldr/trac/ticket/10386
f3c0d7a5
A
212# Remove space directly before diacritics.
213[:WSpace:] ([\u102b-\u1032\u1036-\u103e]) → $1; # The whitespace character is deleted; the sign or medial is kept
214# Fix 103B/103A order for asat.
215($consonant) \u103B \u103A → $1 \u103A \u103B; # After a base letter, \u103A is placed before \u103B
216