]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | # © 2016 and later: Unicode, Inc. and others. |
2 | # License & terms of use: http://www.unicode.org/copyright.html#License | |
3 | # | |
2ca993e8 | 4 | # File: Grek_Latn.txt |
f3c0d7a5 | 5 | # Generated from CLDR |
73c04bcf | 6 | # |
2ca993e8 A |
7 | |
8 | # Rules are predicated on running NFD first, and NFC afterwards | |
9 | # :: [\u0000-\u007F \u0370-Ͽ [:Greek:] [:nonspacing mark:]] ; | |
10 | # MINIMAL FILTER GENERATED FOR: Greek-Latin | |
73c04bcf A |
11 | :: [;µ·ÄËÏÖÜäëïöüÿ-āĒ-ēĪ-īŌ-ōŪ-ūŸǕ-ǜǞ-ǣǬ-ǭȪ-ȭȰ-ȳ\u0304\u0308\u0313-\u0314\u0342-\u0345ͺ;Ά-ΊΌΎ-ΡΣ-ώϐ-ϗϛϝϟϡϣϥϧϩϫϭϯ-ϵϷ-\u07FBЁЇёїӒ-ӓӚ-ӟӢ-ӧӪ-ӱӴ-ӵӸ-ӹḔ-ḗḠ-ḡḦ-ḧḮ-ḯḸ-ḹṎ-ṓṜ-ṝṺ-ṻẄ-ẅẌ-ẍẗἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼι῁-ῄῆ-ῌ῏-ΐῖ-Ί῟-Ῥῲ-ῴῶ-ῼΩϹ] ; |
12 | :: NFD (NFC) ; | |
2ca993e8 A |
13 | # TEST CASES |
14 | # Ὀλίγοι ἔμφονες πολλῶν ἀφρόνων φοβερώτεροι — Πλάτωνος | |
15 | # ᾂ ᾒ ᾢ ᾃ ᾓ ᾣ | |
16 | # ᾳ ῃ ῳ ὃ ὄ | |
17 | # ὠς ὡς ὢς ὣς | |
18 | # Ὠς Ὡς Ὢς Ὣς | |
19 | # ὨΣ ὩΣ ὪΣ ὫΣ | |
20 | # Ạ, ạ, Ẹ, ẹ, Ọ, ọ | |
21 | # Useful variables | |
73c04bcf A |
22 | $lower = [[:latin:][:greek:] & [:Ll:]]; |
23 | $glower = [[:greek:] & [:Ll:]]; | |
24 | $upper = [[:latin:][:greek:] & [:Lu:]] ; | |
374ca955 | 25 | $accent = [:M:] ; |
2ca993e8 A |
26 | # NOTE: restrict to just the Greek & Latin accents that we care about |
27 | # TODO: broaden out once iteration is fixed | |
374ca955 | 28 | $accentMinus = [ [\u0300-\u0345] & [:M:] - [\u0338]] ; |
374ca955 A |
29 | $macron = \u0304 ; |
30 | $ddot = \u0308 ; | |
31 | $ddotmac = [$ddot$macron]; | |
73c04bcf A |
32 | $lcgvowel = [αεηιουω] ; |
33 | $ucgvowel = [ΑΕΗΙΟΥΩ] ; | |
34 | $gvowel = [$lcgvowel $ucgvowel] ; | |
35 | $lcgvowelC = [$lcgvowel $accent] ; | |
374ca955 A |
36 | $evowel = [aeiouyAEIOUY]; |
37 | $evowel2 = [iuyIUY]; | |
73c04bcf A |
38 | $vowel = [ $evowel $gvowel] ; |
39 | $gammaLike = [ΓΚΞΧγκξχϰ] ; | |
40 | $egammaLike = [GKXCgkxc] ; | |
41 | $smooth = \u0313 ; | |
42 | $rough = \u0314 ; | |
43 | $iotasub = \u0345 ; | |
374ca955 A |
44 | $evowel_i = [$evowel-[iI]] ; |
45 | $evowel2_i = [uyUY]; | |
374ca955 | 46 | $underbar = \u0331; |
374ca955 A |
47 | $afterLetter = [:L:] [[:M:]\']* ; |
48 | $beforeLetter = [[:M:]\']* [:L:] ; | |
73c04bcf | 49 | $beforeLower = $accent * $lower ; |
374ca955 | 50 | $notLetter = [^[:L:][:M:]] ; |
73c04bcf | 51 | $under = \u0331; |
2ca993e8 A |
52 | # Fix punctuation |
53 | # preserve original | |
729e4ab9 A |
54 | \: ↔ \: $under ; |
55 | \? ↔ \? $under ; | |
56 | \; ↔ \? ; | |
57 | · ↔ \: ; | |
2ca993e8 | 58 | # CIRCUMFLEX: convert greek circumflex to normal one. Could use tilde or inverted breve |
729e4ab9 | 59 | \u0342 ↔ \u0302 ; |
2ca993e8 A |
60 | # IOTA: convert iota subscript ($iotasub, U+0345) to a full iota |
61 | # first make previous alpha long: insert a macron after an alpha whose following accents (other than macron/iota subscript) end in an iota subscript | |
374ca955 | 62 | $accent_minus = [[$accent]-[$iotasub$macron]]; |
729e4ab9 A |
63 | Α } $accent_minus * $iotasub → | Α $macron ; |
64 | α } $accent_minus * $iotasub → | α $macron ; | |
2ca993e8 | 65 | # now convert the subscript to uppercase I if it follows an uppercase letter (plus any accents), otherwise to lowercase i |
729e4ab9 A |
66 | $upper $accent * { $iotasub → I ; |
67 | $iotasub → i ; | |
68 | | $1 $iotasub ← ($evowel $macron $accentMinus *) i ; | |
69 | | $1 $iotasub ← ($evowel $macron $accentMinus *) I ; | |
2ca993e8 A |
70 | # BREATHING |
71 | # Convert rough breathing to h (or H), and move it before the letters it belongs to. | |
72 | # Titlecase: capital vowel + rough breathing, followed by a lowercase letter, becomes H + lowercase vowel (A ` x → H a x) | |
729e4ab9 A |
73 | Α ($macron?) $rough } $beforeLower → H | α $1; |
74 | Ε $rough } $beforeLower → H | ε; | |
75 | Η $rough } $beforeLower → H | η ; | |
51004dcb | 76 | Ι ($ddot?) $rough } $beforeLower → H | ι $1; |
729e4ab9 A |
77 | Ο $rough } $beforeLower → H | ο ; |
78 | Υ $rough } $beforeLower → H | υ ; | |
79 | Ω ($ddot?) $rough } $beforeLower → H | ω $1; | |
2ca993e8 | 80 | # Titlecase, breathing on the second letter: capital vowel + lowercase Greek letter + rough breathing becomes H + lowercase vowel + that letter (A x ` → H a x) |
729e4ab9 A |
81 | Α ($glower $macron?) $rough → H | α $1 ; |
82 | Ε ($glower) $rough → H | ε $1 ; | |
83 | Η ($glower) $rough → H | η $1 ; | |
84 | Ι ($glower $ddot?) $rough → H | ι $1 ; | |
85 | Ο ($glower) $rough → H | ο $1 ; | |
86 | Υ ($glower) $rough → H | υ $1 ; | |
51004dcb | 87 | Ω ($glower $ddot?) $rough → H | ω $1 ; |
2ca993e8 | 88 | # Otherwise, make x ` into h x (all-lowercase vowel run) and X ` into H X (any other Greek vowel run) |
729e4ab9 A |
89 | ($lcgvowel + $ddotmac? ) $rough → h | $1 ; |
90 | ($gvowel + $ddotmac? ) $rough → H | $1 ; | |
2ca993e8 | 91 | # Go backwards with H: in the Latin→Greek direction, h/H before a Latin vowel sequence becomes a rough breathing placed after that sequence |
729e4ab9 A |
92 | | $1 $rough ← h ($evowel $macron $ddot? $evowel2_i $macron?) ; |
93 | | $1 $rough ← h ($evowel $ddot? $evowel2 $macron?) ; | |
94 | | $1 $rough ← h ($evowel $macron? $ddot?) ; | |
95 | | $1 $rough ← H ([AEIOUY] $macron $ddot? $evowel2_i $macron?) ; | |
96 | | $1 $rough ← H ([AEIOUY] $ddot? $evowel2 $macron?) ; | |
97 | | $1 $rough ← H ([AEIOUY] $macron? $ddot?) ; | |
2ca993e8 A |
98 | # titlecase, have to fix individually |
99 | # in the future, we should add &uppercase() to make this easier | |
51004dcb A |
100 | | A $1 $rough ← H a ($macron $ddot? $evowel2_i $macron?) ; |
101 | | E $1 $rough ← H e ($macron $ddot? $evowel2_i $macron?) ; | |
102 | | I $1 $rough ← H i ($macron $ddot? $evowel2_i $macron?) ; | |
103 | | O $1 $rough ← H o ($macron $ddot? $evowel2_i $macron?) ; | |
729e4ab9 A |
104 | | U $1 $rough ← H u ($macron $ddot? $evowel2_i $macron?) ; |
105 | | Y $1 $rough ← H y ($macron $ddot? $evowel2_i $macron?) ; | |
106 | | A $1 $rough ← H a ($ddot? $evowel2 $macron?) ; | |
107 | | E $1 $rough ← H e ($ddot? $evowel2 $macron?) ; | |
108 | | I $1 $rough ← H i ($ddot? $evowel2 $macron?) ; | |
109 | | O $1 $rough ← H o ($ddot? $evowel2 $macron?) ; | |
110 | | U $1 $rough ← H u ($ddot? $evowel2 $macron?) ; | |
111 | | Y $1 $rough ← H y ($ddot? $evowel2 $macron?) ; | |
112 | | A $1 $rough ← H a ($macron? $ddot? ) ; | |
113 | | E $1 $rough ← H e ($macron? $ddot? ) ; | |
114 | | I $1 $rough ← H i ($macron? $ddot? ) ; | |
115 | | O $1 $rough ← H o ($macron? $ddot? ) ; | |
116 | | U $1 $rough ← H u ($macron? $ddot? ) ; | |
117 | | Y $1 $rough ← H y ($macron? $ddot? ) ; | |
2ca993e8 A |
118 | # Now do smooth breathing |
119 | # Greek→Latin: delete smooth breathing | |
729e4ab9 | 120 | $smooth → ; |
2ca993e8 A |
121 | # Latin→Greek: insert smooth breathing after an r/R, a two-vowel sequence, or a single vowel that is preceded by a non-letter ($notLetter) |
122 | # the assumption is that all Marks are on letters. | |
729e4ab9 A |
123 | | $1 $smooth ← $notLetter { ([rR]) } [^hH$smooth$rough] ; |
124 | | $1 $smooth ← $notLetter { ($evowel $macron? $evowel2 $macron?) } [^$smooth$rough] ; | |
125 | | $1 $smooth ← $notLetter { ($evowel $macron?) } [^$evowel2$smooth$rough] ; | |
2ca993e8 A |
126 | # TODO: preserve smooth/rough breathing if not |
127 | # on initial vowel sequence | |
128 | # need to have these up here so the rules don't mask | |
129 | # remove now superfluous macron when returning | |
729e4ab9 A |
130 | Α ← A $macron ; |
131 | α ← a $macron ; | |
132 | η ↔ e $macron ; | |
133 | Η ↔ E $macron ; | |
134 | φ ↔ ph ; | |
135 | Ψ } $beforeLower ↔ Ps ; | |
136 | Ψ ↔ PS ; | |
137 | Φ } $beforeLower ↔ Ph ; | |
138 | Φ ↔ PH ; | |
139 | ψ ↔ ps ; | |
140 | ω ↔ o $macron ; | |
51004dcb | 141 | Ω ↔ O $macron; |
2ca993e8 | 142 | # NORMAL |
729e4ab9 A |
143 | α ↔ a ; |
144 | Α ↔ A ; | |
145 | β ↔ b ; | |
146 | Β ↔ B ; | |
147 | γ } $gammaLike ↔ n } $egammaLike ; | |
148 | γ ↔ g ; | |
149 | Γ } $gammaLike ↔ N } $egammaLike ; | |
150 | Γ ↔ G ; | |
151 | δ ↔ d ; | |
152 | Δ ↔ D ; | |
153 | ε ↔ e ; | |
154 | Ε ↔ E ; | |
155 | ζ ↔ z ; | |
156 | Ζ ↔ Z ; | |
157 | θ ↔ th ; | |
158 | Θ } $beforeLower ↔ Th ; | |
159 | Θ ↔ TH ; | |
160 | ι ↔ i ; | |
161 | Ι ↔ I ; | |
162 | κ ↔ k ; | |
163 | Κ ↔ K ; | |
164 | λ ↔ l ; | |
165 | Λ ↔ L ; | |
166 | μ ↔ m ; | |
167 | Μ ↔ M ; | |
168 | ν } $gammaLike → n\' ; | |
169 | ν ↔ n ; | |
170 | Ν } $gammaLike ↔ N\' ; | |
171 | Ν ↔ N ; | |
172 | ξ ↔ x ; | |
173 | Ξ ↔ X ; | |
174 | ο ↔ o ; | |
175 | Ο ↔ O ; | |
176 | π ↔ p ; | |
177 | Π ↔ P ; | |
178 | ρ $rough ↔ rh; | |
179 | Ρ $rough } $beforeLower ↔ Rh ; | |
180 | Ρ $rough ↔ RH ; | |
181 | ρ ↔ r ; | |
182 | Ρ ↔ R ; | |
2ca993e8 | 183 | # insert separator before things that turn into s |
729e4ab9 | 184 | [Pp] { } [ςσΣϷϸϺϻ] → \' ; |
2ca993e8 | 185 | # special S variants |
729e4ab9 A |
186 | Ϸ ↔ S\u030C ; # Ϸ GREEK CAPITAL LETTER SHO Uppercase_Letter Grek - L |
187 | ϸ ↔ s\u030C ; #ϸ GREEK SMALL LETTER SHO Lowercase_Letter Grek - L | |
188 | Ϻ ↔ S\u0302 ; # Ϻ GREEK CAPITAL LETTER SAN Uppercase_Letter Grek - L | |
189 | ϻ ↔ s\u0302 ; # ϻ GREEK SMALL LETTER SAN Lowercase_Letter Grek - L | |
2ca993e8 A |
190 | # underbar means exception |
191 | # before a letter, initial | |
729e4ab9 A |
192 | ς } $beforeLetter ↔ s $underbar } $beforeLetter; |
193 | σ } $beforeLetter ↔ s } $beforeLetter; | |
2ca993e8 | 194 | # otherwise, after a letter = final |
729e4ab9 A |
195 | $afterLetter { σ ↔ $afterLetter { s $underbar; |
196 | $afterLetter { ς ↔ $afterLetter { s ; | |
2ca993e8 | 197 | # otherwise (isolated) = initial |
729e4ab9 A |
198 | ς ↔ s $underbar; |
199 | σ ↔ s ; | |
2ca993e8 | 200 | # [Pp] { Σ ↔ \'S ; |
729e4ab9 A |
201 | Σ ↔ S ; |
202 | τ ↔ t ; | |
203 | Τ ↔ T ; | |
204 | $vowel {υ } ↔ u ; | |
205 | υ ↔ y ; | |
206 | $vowel { Υ ↔ U ; | |
207 | Υ ↔ Y ; | |
208 | χ ↔ ch ; | |
209 | Χ } $beforeLower ↔ Ch ; | |
210 | Χ ↔ CH ; | |
2ca993e8 | 211 | # Completeness for ASCII |
374ca955 | 212 | $ignore = [[:Mark:]''] * ; |
51004dcb | 213 | | k ← c ; |
729e4ab9 | 214 | | ph ← f ; |
51004dcb | 215 | | i ← j ; |
729e4ab9 A |
216 | | k ← q ; |
217 | | b ← v } $vowel ; | |
218 | | b ← w } $vowel; | |
219 | | u ← v ; | |
220 | | u ← w; | |
221 | | K ← C ; | |
222 | | Ph ← F ; | |
223 | | I ← J ; | |
224 | | K ← Q ; | |
51004dcb A |
225 | | B ← V } $vowel ; |
226 | | B ← W } $vowel ; | |
729e4ab9 A |
227 | | U ← V ; |
228 | | U ← W ; | |
229 | $rough } $ignore [:UppercaseLetter:] → H ; | |
230 | $ignore [:UppercaseLetter:] { $rough → H ; | |
231 | $rough ← H ; | |
232 | $rough ↔ h ; | |
2ca993e8 | 233 | # Completeness for Greek |
729e4ab9 A |
234 | ϐ → | β ; |
235 | ϑ → | θ ; | |
236 | ϒ → | Υ ; | |
237 | ϕ → | φ ; | |
238 | ϖ → | π ; | |
239 | ϰ → | κ ; | |
240 | ϱ → | ρ ; | |
241 | ϲ → | σ ; | |
242 | Ϲ → | Σ; #U+03F9 GREEK CAPITAL LUNATE SIGMA SYMBOL | |
243 | ϳ → j ; | |
244 | ϴ → | Θ ; | |
245 | ϵ → | ε ; | |
246 | µ → | μ ; | |
247 | ͺ → i; | |
2ca993e8 | 248 | # delete any trailing ' marks used for roundtripping |
729e4ab9 A |
249 | ← [Ππ] { \' } [Ss] ; |
250 | ← [Νν] { \' } $egammaLike ; | |
374ca955 | 251 | ::NFC (NFD) ; |
2ca993e8 A |
252 | # ([\u0000-\u007F [:Latin:] [:Greek:] [:nonspacing mark:]]) ; |
253 | # ([\u0000-\u007F · [:Latin:] [:nonspacing mark:]]) ; | |
254 | # MINIMAL FILTER GENERATED FOR: Latin-Greek BACKWARD | |
73c04bcf | 255 | :: ( [':?A-Za-zÀ-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0300-\u0337\u0339-\u0345΅-ΆΈ-ΊΌΎ-ΐΪ-ΰϊ-ώϓ-ϔЀ-ЁЃЇЌ-ЎЙйѐ-ёѓїќ-ўѶ-ѷӁ-ӂӐ-ӓӖ-ӗӚ-ӟӢ-ӧӪ-ӵӸ-ӹḀ-ẙẛẠ-ỹἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼ῁-ῄῆ-ΐῖ-Ί῝-΅ῲ-ῴῶ-ῼK-Å] ) ; |
2ca993e8 | 256 |