X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/0f5d89e82340278ed3d7d50029f37cab2c41a57e..a01113dcd0f39d5da295ef82785beff9ed86fe38:/icuSources/data/translit/Zawgyi_my.txt diff --git a/icuSources/data/translit/Zawgyi_my.txt b/icuSources/data/translit/Zawgyi_my.txt index da4b8f98..6bd51ab7 100644 --- a/icuSources/data/translit/Zawgyi_my.txt +++ b/icuSources/data/translit/Zawgyi_my.txt @@ -19,12 +19,13 @@ $nondigits = [^\u1040-\u1049]; $consonant = [\u1000-\u1021]; $vowelsign = [\u102B-\u1030\u1032]; # Unicode vowel signs except E (1031) +$vowelsAndConsonants = [\u1000-\u102a]; $umedial = [\u103B-\u103E]; # Medial codepoints in Unicode -$vowelmedial = [\u102B-\u1030\u1032\u103B-\u103F]; # Union of vowel signs and medials +$vowelmedial = [\u102B-\u1030\u1032\1u36\u1037\u103A-\u103F]; # Union of vowel signs and medials $ukinzi = \u1004\u103A\u1039; # Codepoints representing kinzi in Unicode -# ZAWGYI MYANMAR CONSONANT SIGN MEDIAL RA -# This character has multiple representations in the Zawgyi font. +# Zawgyi medial ra has multiple representations $zmedialra = [\u103B\u107E-\u1084]; +$wspace = [\u0020\u00a0\u1680\u2000-\u200d\u2060\u202f\u205f\u3000\ufeff]; #### #### STAGE 1: CODEPOINT MAPPING FROM ZAWGYI TO UNICODE #### @@ -33,14 +34,13 @@ $zmedialra = [\u103B\u107E-\u1084]; ($consonant) \u103A \u1064 → $ukinzi $1 \u103B; ($consonant) \u1064 → $ukinzi $1; \u1064 → $ukinzi; -# Special cases moving base character to right before -($consonant) \u108b → $ukinzi $1 \u102D; +# Special cases moving base character to right before vowel signs +($consonant) \u108B → $ukinzi $1 \u102D; ($consonant) \u108C → $ukinzi $1 \u102E; ($consonant) \u108D → $ukinzi $1 \u1036; # Special cases moving Kinzi block to left ($consonant) \u103A \u1033 \u108B → $ukinzi $1 \u103B \u102D \u102F; ($consonant) \u103A \u108b → $ukinzi $1 \u103B \u102D ; -($consonant) \u103A \u108C \u1033 → $ukinzi $1 \u103B \u102E \u102F; ($consonant) \u103A \u108C → $ukinzi $1 \u103B \u102E ; ($consonant) \u103A \u108D → $ukinzi $1 \u103B \u1036 ; ($consonant) \u103A \u108e → $1 \u103B \u102D \u1036 ; @@ -48,37 +48,31 @@ $zmedialra = [\u103B\u107E-\u1084]; \u108C → $ukinzi \u102E ; \u108D → $ukinzi \u1036 ; # Consonants (only the ones that have to change) -\u106A ($vowelsign) \u1038 → \u1025 $1 \u1038 ; # U sound \u106A → \u1009 ; # NYA \u106B → \u100A ; \u108F → \u1014 ; \u1090 → \u101B ; \u1086 → \u103F ; # yapin -\u103A → \u103B ; -\u107D → \u103B ; +[\u103A|\u107d] → \u103B ; +# yayit +($zmedialra)+ → \u103C ; # wasway -\u103C \u108A → \u103D \u103E; # To avoid duplicate medials +\u103C* \u108A → \u103D \u103E; # To avoid duplicate medials \u103C → \u103D ; -\u108A → \u103D \u103E ; # hatoh -\u103D → \u103E ; -\u1087 → \u103E ; +[\u103D|\u1087] → \u103E ; \u1088 → \u103E \u102F ; \u1089 → \u103E \u1030 ; -# asat -\u1039 → \u103A ; # Vowels \u1033 → \u102F ; \u1034 → \u1030 ; -\u105A → \u102B \u103A ; -\u108E → \u102D \u1036 ; -# lDot -# Special cases to move dot to right of base consonant -\u1031 \u1094 ($consonant) \u103D → $1 \u103E \u1031 \u1037 ; -\u1094 → \u1037 ; -\u1095 → \u1037 ; -# Special cases for 1025 vs 1009 +# asat +\u1039 → \u103A ; +# lower dot +[\u1094\u1095] → \u1037 ; +# Special cases for 1025 vs 1009; +\u1025 \u1039 → \u1009 \u103a; \u1025 \u1061 → \u1009 \u1039 \u1001; \u1025 \u1062 → \u1009 \u1039 \u1002; \u1025 \u1065 → \u1009 \u1039 \u1005; @@ -87,35 +81,32 @@ $zmedialra = [\u103B\u107E-\u1084]; \u1025 \u1078 → \u1009 \u1039 \u1015; \u1025 \u107A → \u1009 \u1039 \u1017; \u1025 \u1079 → \u1009 \u1039 \u1016; -($consonant) \u103A \u1039 → $1 \u103A \u103B; # Stacked Consonants +\u105A → \u102B \u103A ; \u1060 → \u1039 \u1000 ; \u1061 → \u1039 \u1001 ; \u1062 → \u1039 \u1002 ; \u1063 → \u1039 \u1003 ; \u1065 → \u1039 \u1005 ; -\u1066 → \u1039 \u1006 ; -\u1067 → \u1039 \u1006 ; +[\u1066\u1067] → \u1039 \u1006 ; \u1068 → \u1039 \u1007 ; \u1069 → \u1039 \u1008 ; \u106C → \u1039 \u100B ; \u106D → \u1039 \u100C ; \u1070 → \u1039 \u100F ; -\u1071 → \u1039 \u1010 ; -\u1072 → \u1039 \u1010 ; +[\u1071\u1072] → \u1039 \u1010 ; \u1096 → \u1039 \u1010 \u103D; -\u1073 → \u1039 \u1011 ; -\u1074 → \u1039 \u1011 ; +[\u1073\u1074] → \u1039 \u1011 ; \u1075 → \u1039 \u1012 ; \u1076 → \u1039 \u1013 ; \u1077 → \u1039 \u1014 ; \u1078 → \u1039 \u1015 ; \u1079 → \u1039 \u1016 ; \u107A → \u1039 \u1017 ; -\u107B → \u1039 \u1018 ; -\u1093 → \u1039 \u1018 ; +[\u107B\u1093] → \u1039 \u1018 ; \u107C → \u1039 \u1019 ; \u1085 → \u1039 \u101C ; +\u108E → \u102D \u1036 ; # Pre-defined ligatures \u106E → \u100D\u1039\u100D ; \u106F → \u100D\u1039\u100E ; @@ -123,41 +114,59 @@ $zmedialra = [\u103B\u107E-\u1084]; \u1092 → \u100B\u1039\u100C ; \u1097 → \u100B\u1039\u100B ; \u104E → \u104E\u1004\u103A\u1038 ; -# yayit -$zmedialra → \u103C ; #### -#### STAGE 2: POST REORDERING RULES FOR UNICODE RENDERING -#### Now every codepoint is Unicode. This starts conversion -#### from semi-visual order to logical order. -#### -::Null; +#### STAGE 1.01: Digits 0 and 4 used instead of letters # Case of MYANMAR digit being used instead of a letter -# Lone digit zero -[$] \u1040 ($nondigits) → \u101D $1; -([\u102b-\u103f]) \u1040 ($nondigits) → $1 \u101d $2; -# Lone digit 4 +# Lone digit zero and four at start +::Null; +^ \u1040 ($nondigits) → \u101D $1; ^ \u1044 ($nondigits) → | \u104E $1 ; +# Lone digit zero or four at end +($nondigits) \u1040 $ → $1 \u101D; +($nondigits) \u1044 $ → $1 \u104e; +# Evowel and dependent vowel signs before 0 or 4 only +# -> convert to the consonant. +([\u102b-\u103f]) \u1040 ($nondigits) → $1 \u101d $2; ([\u102b-\u103f]) \u1044 ($nondigits) → $1 \u104E $2; +#### +#### STAGE 1.1: Strip spaces immediately before combining characters. +#### Move e-vowel after consonants and medials +#### Now every codepoint is Unicode. This starts conversion +#### from semi-visual order to logical order. +#### +::Null; +# Don't remove spaces before E vowel or medial Ra at this stage +($wspace) \u1037 > \u1037 $1; +($wspace+) ([\u102b-\u1030\u1032-\u103b\u103d\u103e]) → $2; +# Remove a duplicate early +\u1037+ → \u1037; +# Move e-vowel after medials and consonants. +\u1031+ $ukinzi ($consonant) > $ukinzi $1 \u1031; +\u1031+ \u1037+ ($consonant) > $1 \u1031 \u1037 ; +\u1031+ \u103c ($consonant) > $1 \u103c \u1031; +# Move medials other than 103c before the 1031. Leave 103c for +# the next consonant. +\u1031+ ($consonant) ([\u103b\u103d\u103e]+) > $1 $2 \u1031; +\u1031+ ($vowelsAndConsonants) > $1 \u1031; +#### +#### STAGE 2: POST REORDERING RULES FOR UNICODE RENDERING +#### +::Null; +\u103b \u103a > \u103a \u103b; # Simpler replacements for Zawgyi 1025 -\u1025 \u103A → \u1009 \u103A; \u1025 \u102E → \u1026; # Asat and dot below reordering, to Unicode NFC. \u103A\u1037 → \u1037\u103A; # Reorder some vowel signs \u1036 ($umedial*) ($vowelsign+) → $1 $2 \u1036 ; ([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) → $2 $1; -# Move ra medial, but not others. +# Move ra medial which precedes consonant, but not other medials. \u103C ($consonant) → $1 \u103C; -# Replace CA + YA with JHA -\u1005\u103b → \u1008; #### #### Stage 3 -#### Move \u1031, \u1036, and \u103C after consonants. +#### Move \u1036, and \u103C after consonants. ::Null; -# 1031 moved after consonant, with and without kinzi or medials -([\u1031]+) $ukinzi ($consonant) → $ukinzi $2 $1; -([\u1031]+) ($consonant) ($umedial+) → $2 $3 $1; -([\u1031]+) ($consonant) } [^\u103B\u103C\u103D\u103E] → $2 $1; +($umedial) \u1039 ($consonant) > \u1039 $2 $1; \u103C \u103A \u1039 ($consonant) → \u103A \u1039 $1 \u103C; \u1036 ($umedial+) → $1 \u1036; #### @@ -174,8 +183,7 @@ $zmedialra → \u103C ; # Move vowel sign E \u1031 after medials, but not across consonants ($umedial*) ([\u1031]+) ($umedial*) → $1 $3 $2; # Reorder dot below after medials and vowel diacritics -\u1037 ([\u102D-\u1030\u1032\u1036]) → $1 \u1037; -\u1037 ($umedial+) → $1 \u1037; +\u1037 ([\u102D-\u1030\u1032\u1036\u103b-\u103e]+) → $1 \u1037; # Move vowel signs after medials ($vowelsign+) ($umedial+) → $2 $1; # Reorder modifiers and asat @@ -185,32 +193,45 @@ $zmedialra → \u103C ; #### Vowel signs after medials, sort medials, #### ::Null; -([\u1031]+) ($umedial+) → $2 $1; +# Replace CA + YA with JHA after moving other things beyond the medials. +\u1005 \u103b → \u1008; # More moving vowel signs after medials -($vowelsign) ($umedial) → $2 $1; +([\u102b-\u1032]) ($umedial) → $2 $1; # Sort the medials ([\u103C\u103D\u103E]) \u103B → \u103B $1; ([\u103D\u103E]) \u103C → \u103C $1; \u103E\u103D → \u103D\u103E ; -# Move visarga (\u1038) after other signs -\u1038 ([$vowelmedial]) → $1 \u1038; -\u1038 ([\u1036\u1037\u103A]) → $1 \u1038; +# Move visarga after other signs +\u1038 ($vowelmedial) → $1 \u1038; # Reorder \u1036 \u102f → \u102f \u1036; ### ### Stage 6 -### Finish medial sorting, fix conflicting and extra diacritics +### Finish conflicting and extra diacritics. Remove some white space ### ::Null; -# Fix duplicate and overlapping modifiers. +# Fix duplicate combiners +\u102D \u102D+ → \u102D; +\u102E \u102E+ → \u102E; \u102F \u102F+ → \u102F; -\u102F \u1030 → \u102F; -\u102F \u103A → \u102F; -\u102D \u102E → \u102E; +\u1030 \u1030+ → \u1030; +\u1032 \u1032+ → \u1032; \u1036 \u1036+ → \u1036; +\u1037 \u1037+ → \u1037; +\u1039 \u1039+ → \u1039; +\u103a \u103a+ → \u103a; +\u103b \u103b+ → \u103b; +\u103c \u103c+ → \u103c; +\u103d \u103d+ → \u103d; \u103e \u103e+ → \u103e; # http://unicode.org/cldr/trac/ticket/10386 +# Fix overlapping signs +\u102F [\u1030\u103a] → \u102F; +\u102D \u102E → \u102E; # Remove space directly before diacritics. -[:WSpace:] ([\u102b-\u1032\u1036-\u103e]) → $1; -# Fix 103B/103A order for asat. -($consonant) \u103B \u103A → $1 \u103A \u103B; +($wspace)+ ([\u102b-\u1032\u1036-\u103e]) → $2; +# Remove ZWSP at start and end +^ \u200b+ → ; +\u200b+ $ → ; +# Fix multiple spaces around ZWSP to single ZWSP. +$wspace* \u200b $wspace* → \u200b;