]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | # *************************************************************************** |
2 | # * | |
2ca993e8 | 3 | # * Copyright (C) 2004-2016, International Business Machines |
73c04bcf A |
4 | # * Corporation; Unicode, Inc.; and others. All Rights Reserved. |
5 | # * | |
6 | # *************************************************************************** | |
2ca993e8 | 7 | # File: Grek_Latn.txt |
46f4442e | 8 | # Generated from CLDR |
73c04bcf | 9 | # |
2ca993e8 A |
10 | |
11 | # Rules are predicated on running NFD first, and NFC afterwards | |
12 | # :: [\u0000-\u007F \u0370-Ͽ [:Greek:] [:nonspacing mark:]] ; | |
13 | # MINIMAL FILTER GENERATED FOR: Greek-Latin | |
73c04bcf A |
14 | :: [;µ·ÄËÏÖÜäëïöüÿ-āĒ-ēĪ-īŌ-ōŪ-ūŸǕ-ǜǞ-ǣǬ-ǭȪ-ȭȰ-ȳ\u0304\u0308\u0313-\u0314\u0342-\u0345ͺ;Ά-ΊΌΎ-ΡΣ-ώϐ-ϗϛϝϟϡϣϥϧϩϫϭϯ-ϵϷ-\u07FBЁЇёїӒ-ӓӚ-ӟӢ-ӧӪ-ӱӴ-ӵӸ-ӹḔ-ḗḠ-ḡḦ-ḧḮ-ḯḸ-ḹṎ-ṓṜ-ṝṺ-ṻẄ-ẅẌ-ẍẗἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼι῁-ῄῆ-ῌ῏-ΐῖ-Ί῟-Ῥῲ-ῴῶ-ῼΩϹ] ; |
15 | :: NFD (NFC) ; | |
2ca993e8 A |
16 | # TEST CASES |
17 | # Ὀλίγοι ἔμφονες πολλῶν ἀφρόνων φοβερώτεροι — Πλάτωνος | |
18 | # ᾂ ᾒ ᾢ ᾃ ᾓ ᾣ | |
19 | # ᾳ ῃ ῳ ὃ ὄ | |
20 | # ὠς ὡς ὢς ὣς | |
21 | # Ὠς Ὡς Ὢς Ὣς | |
22 | # ὨΣ ὩΣ ὪΣ ὫΣ | |
23 | # Ạ, ạ, Ẹ, ẹ, Ọ, ọ | |
24 | # Useful variables | |
73c04bcf A |
25 | $lower = [[:latin:][:greek:] & [:Ll:]]; |
26 | $glower = [[:greek:] & [:Ll:]]; | |
27 | $upper = [[:latin:][:greek:] & [:Lu:]] ; | |
374ca955 | 28 | $accent = [:M:] ; |
2ca993e8 A |
29 | # NOTE: restrict to just the Greek & Latin accents that we care about |
30 | # TODO: broaden out once interation is fixed | |
374ca955 | 31 | $accentMinus = [ [\u0300-\u0345] & [:M:] - [\u0338]] ; # marks within U+0300..U+0345, excluding U+0338; NOTE: a different set from $accent_minus, which is defined later in this file |
374ca955 A |
32 | $macron = \u0304 ; |
33 | $ddot = \u0308 ; | |
34 | $ddotmac = [$ddot$macron]; | |
73c04bcf A |
35 | $lcgvowel = [αεηιουω] ; |
36 | $ucgvowel = [ΑΕΗΙΟΥΩ] ; | |
37 | $gvowel = [$lcgvowel $ucgvowel] ; | |
38 | $lcgvowelC = [$lcgvowel $accent] ; | |
374ca955 A |
39 | $evowel = [aeiouyAEIOUY]; |
40 | $evowel2 = [iuyIUY]; | |
73c04bcf A |
41 | $vowel = [ $evowel $gvowel] ; |
42 | $gammaLike = [ΓΚΞΧγκξχϰ] ; | |
43 | $egammaLike = [GKXCgkxc] ; | |
44 | $smooth = \u0313 ; | |
45 | $rough = \u0314 ; | |
46 | $iotasub = \u0345 ; | |
374ca955 A |
47 | $evowel_i = [$evowel-[iI]] ; |
48 | $evowel2_i = [uyUY]; | |
374ca955 | 49 | $underbar = \u0331; # U+0331; appended to s by the sigma rules to flag the exceptional sigma form |
374ca955 A |
50 | $afterLetter = [:L:] [[:M:]\']* ; # a letter followed by any number of marks or apostrophes |
51 | $beforeLetter = [[:M:]\']* [:L:] ; # any number of marks or apostrophes followed by a letter | |
73c04bcf | 52 | $beforeLower = $accent * $lower ; # any number of accents followed by a $lower letter |
374ca955 | 53 | $notLetter = [^[:L:][:M:]] ; # any character that is neither a letter nor a mark |
73c04bcf | 54 | $under = \u0331; # same code point as $underbar; used by the punctuation rules |
2ca993e8 A |
55 | # Fix punctuation |
56 | # preserve original | |
729e4ab9 A |
57 | \: ↔ \: $under ; # a literal colon gets $under appended, presumably so it can be told apart from the colon produced for the middle dot |
58 | \? ↔ \? $under ; # a literal question mark gets $under appended, presumably so it can be told apart from the ? produced for the semicolon | |
59 | \; ↔ \? ; # semicolon <-> question mark | |
60 | · ↔ \: ; # middle dot <-> colon | |
2ca993e8 | 61 | # CIRCUMFLEX: convert greek circumflex to normal one. Could use tilde or inverted breve |
729e4ab9 | 62 | \u0342 ↔ \u0302 ; |
2ca993e8 A |
63 | # IOTA: convert iota subscript to iota |
64 | # first make previous alpha long! | |
374ca955 | 65 | $accent_minus = [[$accent]-[$iotasub$macron]]; # any mark except the iota subscript and the macron; NOTE: a different set from $accentMinus, which is defined earlier in this file |
729e4ab9 A |
66 | Α } $accent_minus * $iotasub → | Α $macron ; |
67 | α } $accent_minus * $iotasub → | α $macron ; | |
2ca993e8 | 68 | # now convert to uppercase if after uppercase, otherwise to lowercase |
729e4ab9 A |
69 | $upper $accent * { $iotasub → I ; |
70 | $iotasub → i ; | |
71 | | $1 $iotasub ← ($evowel $macron $accentMinus *) i ; | |
72 | | $1 $iotasub ← ($evowel $macron $accentMinus *) I ; | |
2ca993e8 A |
73 | # BREATHING |
74 | # Convert rough breathing to h, and move before letters. | |
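75 | # Make A ` x → H a x: a capital vowel with rough breathing before a lowercase letter gives a capital H followed by the lowercased vowel | |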
729e4ab9 A |
76 | Α ($macron?) $rough } $beforeLower → H | α $1; |
77 | Ε $rough } $beforeLower → H | ε; | |
78 | Η $rough } $beforeLower → H | η ; | |
51004dcb | 79 | Ι ($ddot?) $rough } $beforeLower → H | ι $1; |
729e4ab9 A |
80 | Ο $rough } $beforeLower → H | ο ; |
81 | Υ $rough } $beforeLower → H | υ ; | |
82 | Ω ($ddot?) $rough } $beforeLower → H | ω $1; | |
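2ca993e8 | 83 | # Make A x ` → H a x: same result when the rough breathing sits on a lowercase Greek letter that follows the capital vowel |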
729e4ab9 A |
84 | Α ($glower $macron?) $rough → H | α $1 ; |
85 | Ε ($glower) $rough → H | ε $1 ; | |
86 | Η ($glower) $rough → H | η $1 ; | |
87 | Ι ($glower $ddot?) $rough → H | ι $1 ; | |
88 | Ο ($glower) $rough → H | ο $1 ; | |
89 | Υ ($glower) $rough → H | υ $1 ; | |
51004dcb | 90 | Ω ($glower $ddot?) $rough → H | ω $1 ; |
2ca993e8 | 91 | #Otherwise, make x ` into h x and X ` into H X |
729e4ab9 A |
92 | ($lcgvowel + $ddotmac? ) $rough → h | $1 ; |
93 | ($gvowel + $ddotmac? ) $rough → H | $1 ; | |
2ca993e8 | 94 | # Go backwards with H |
729e4ab9 A |
95 | | $1 $rough ← h ($evowel $macron $ddot? $evowel2_i $macron?) ; |
96 | | $1 $rough ← h ($evowel $ddot? $evowel2 $macron?) ; | |
97 | | $1 $rough ← h ($evowel $macron? $ddot?) ; | |
98 | | $1 $rough ← H ([AEIOUY] $macron $ddot? $evowel2_i $macron?) ; | |
99 | | $1 $rough ← H ([AEIOUY] $ddot? $evowel2 $macron?) ; | |
100 | | $1 $rough ← H ([AEIOUY] $macron? $ddot?) ; | |
2ca993e8 A |
101 | # titlecase, have to fix individually |
102 | # in the future, we should add &uppercase() to make this easier | |
51004dcb A |
103 | | A $1 $rough ← H a ($macron $ddot? $evowel2_i $macron?) ; |
104 | | E $1 $rough ← H e ($macron $ddot? $evowel2_i $macron?) ; | |
105 | | I $1 $rough ← H i ($macron $ddot? $evowel2_i $macron?) ; | |
106 | | O $1 $rough ← H o ($macron $ddot? $evowel2_i $macron?) ; | |
729e4ab9 A |
107 | | U $1 $rough ← H u ($macron $ddot? $evowel2_i $macron?) ; |
108 | | Y $1 $rough ← H y ($macron $ddot? $evowel2_i $macron?) ; | |
109 | | A $1 $rough ← H a ($ddot? $evowel2 $macron?) ; | |
110 | | E $1 $rough ← H e ($ddot? $evowel2 $macron?) ; | |
111 | | I $1 $rough ← H i ($ddot? $evowel2 $macron?) ; | |
112 | | O $1 $rough ← H o ($ddot? $evowel2 $macron?) ; | |
113 | | U $1 $rough ← H u ($ddot? $evowel2 $macron?) ; | |
114 | | Y $1 $rough ← H y ($ddot? $evowel2 $macron?) ; | |
115 | | A $1 $rough ← H a ($macron? $ddot? ) ; | |
116 | | E $1 $rough ← H e ($macron? $ddot? ) ; | |
117 | | I $1 $rough ← H i ($macron? $ddot? ) ; | |
118 | | O $1 $rough ← H o ($macron? $ddot? ) ; | |
119 | | U $1 $rough ← H u ($macron? $ddot? ) ; | |
120 | | Y $1 $rough ← H y ($macron? $ddot? ) ; | |
2ca993e8 A |
121 | # Now do smooth |
122 | #delete smooth breathing for Latin | |
729e4ab9 | 123 | $smooth → ; |
2ca993e8 A |
124 | # insert in Greek |
125 | # the assumption is that all Marks are on letters. | |
729e4ab9 A |
126 | | $1 $smooth ← $notLetter { ([rR]) } [^hH$smooth$rough] ; |
127 | | $1 $smooth ← $notLetter { ($evowel $macron? $evowel2 $macron?) } [^$smooth$rough] ; | |
128 | | $1 $smooth ← $notLetter { ($evowel $macron?) } [^$evowel2$smooth$rough] ; | |
2ca993e8 A |
129 | # TODO: preserve smooth/rough breathing if not |
130 | # on initial vowel sequence | |
131 | # need to have these up here so the rules don't mask | |
132 | # remove now superfluous macron when returning | |
729e4ab9 A |
133 | Α ← A $macron ; |
134 | α ← a $macron ; | |
135 | η ↔ e $macron ; | |
136 | Η ↔ E $macron ; | |
137 | φ ↔ ph ; | |
138 | Ψ } $beforeLower ↔ Ps ; | |
139 | Ψ ↔ PS ; | |
140 | Φ } $beforeLower ↔ Ph ; | |
141 | Φ ↔ PH ; | |
142 | ψ ↔ ps ; | |
143 | ω ↔ o $macron ; | |
51004dcb | 144 | Ω ↔ O $macron; |
2ca993e8 | 145 | # NORMAL |
729e4ab9 A |
146 | α ↔ a ; |
147 | Α ↔ A ; | |
148 | β ↔ b ; | |
149 | Β ↔ B ; | |
150 | γ } $gammaLike ↔ n } $egammaLike ; | |
151 | γ ↔ g ; | |
152 | Γ } $gammaLike ↔ N } $egammaLike ; | |
153 | Γ ↔ G ; | |
154 | δ ↔ d ; | |
155 | Δ ↔ D ; | |
156 | ε ↔ e ; | |
157 | Ε ↔ E ; | |
158 | ζ ↔ z ; | |
159 | Ζ ↔ Z ; | |
160 | θ ↔ th ; | |
161 | Θ } $beforeLower ↔ Th ; | |
162 | Θ ↔ TH ; | |
163 | ι ↔ i ; | |
164 | Ι ↔ I ; | |
165 | κ ↔ k ; | |
166 | Κ ↔ K ; | |
167 | λ ↔ l ; | |
168 | Λ ↔ L ; | |
169 | μ ↔ m ; | |
170 | Μ ↔ M ; | |
171 | ν } $gammaLike → n\' ; # nu before a gamma-like letter gets an apostrophe, keeping it distinct from the n that the gamma rules produce in that position. NOTE(review): this rule is forward-only (→) while the uppercase counterpart is two-way (↔) - confirm the asymmetry is intended | |
172 | ν ↔ n ; # plain nu <-> n | |
173 | Ν } $gammaLike ↔ N\' ; # uppercase counterpart of the apostrophe rule | |
174 | Ν ↔ N ; # plain capital nu <-> N | |
175 | ξ ↔ x ; | |
176 | Ξ ↔ X ; | |
177 | ο ↔ o ; | |
178 | Ο ↔ O ; | |
179 | π ↔ p ; | |
180 | Π ↔ P ; | |
181 | ρ $rough ↔ rh; | |
182 | Ρ $rough } $beforeLower ↔ Rh ; | |
183 | Ρ $rough ↔ RH ; | |
184 | ρ ↔ r ; | |
185 | Ρ ↔ R ; | |
2ca993e8 | 186 | # insert separator before things that turn into s |
729e4ab9 | 187 | [Pp] { } [ςσΣϷϸϺϻ] → \' ; |
2ca993e8 | 188 | # special S variants |
729e4ab9 A |
189 | Ϸ ↔ S\u030C ; # Ϸ GREEK CAPITAL LETTER SHO Uppercase_Letter Grek - L |
190 | ϸ ↔ s\u030C ; #ϸ GREEK SMALL LETTER SHO Lowercase_Letter Grek - L | |
191 | Ϻ ↔ S\u0302 ; # Ϻ GREEK CAPITAL LETTER SAN Uppercase_Letter Grek - L | |
192 | ϻ ↔ s\u0302 ; # ϻ GREEK SMALL LETTER SAN Lowercase_Letter Grek - L | |
2ca993e8 A |
193 | # underbar means exception |
194 | # before a letter, initial | |
729e4ab9 A |
195 | ς } $beforeLetter ↔ s $underbar } $beforeLetter; |
196 | σ } $beforeLetter ↔ s } $beforeLetter; | |
2ca993e8 | 197 | # otherwise, after a letter = final |
729e4ab9 A |
198 | $afterLetter { σ ↔ $afterLetter { s $underbar; |
199 | $afterLetter { ς ↔ $afterLetter { s ; | |
2ca993e8 | 200 | # otherwise (isolated) = initial |
729e4ab9 A |
201 | ς ↔ s $underbar; |
202 | σ ↔ s ; | |
2ca993e8 | 203 | # [Pp] { Σ ↔ \'S ; |
729e4ab9 A |
204 | Σ ↔ S ; |
205 | τ ↔ t ; | |
206 | Τ ↔ T ; | |
207 | $vowel {υ } ↔ u ; # upsilon directly after a vowel <-> u. NOTE(review): the closing brace has nothing after it (empty trailing context), unlike the uppercase rule - confirm it is harmless | |
208 | υ ↔ y ; # any other upsilon <-> y | |
209 | $vowel { Υ ↔ U ; # capital upsilon directly after a vowel <-> U | |
210 | Υ ↔ Y ; # any other capital upsilon <-> Y | |
211 | χ ↔ ch ; | |
212 | Χ } $beforeLower ↔ Ch ; | |
213 | Χ ↔ CH ; | |
2ca993e8 | 214 | # Completeness for ASCII |
374ca955 | 215 | $ignore = [[:Mark:]''] * ; # any run of marks or apostrophes ('' is an escaped apostrophe); used as context by the rough-breathing rules further down |
51004dcb | 216 | | k ← c ; # reverse-only: a Latin letter with no rule of its own is rewritten to one that has a rule, with the cursor left before it so it is processed again |
729e4ab9 | 217 | | ph ← f ; |
51004dcb | 218 | | i ← j ; |
729e4ab9 A |
219 | | k ← q ; |
220 | | b ← v } $vowel ; # v or w before a vowel is treated as b | |
221 | | b ← w } $vowel; | |
222 | | u ← v ; # any other v or w is treated as u | |
223 | | u ← w; | |
224 | | K ← C ; # uppercase counterparts of the rules above | |
225 | | Ph ← F ; | |
226 | | I ← J ; | |
227 | | K ← Q ; | |
51004dcb A |
228 | | B ← V } $vowel ; |
229 | | B ← W } $vowel ; | |
729e4ab9 A |
230 | | U ← V ; |
231 | | U ← W ; | |
232 | $rough } $ignore [:UppercaseLetter:] → H ; | |
233 | $ignore [:UppercaseLetter:] { $rough → H ; | |
234 | $rough ← H ; | |
235 | $rough ↔ h ; | |
2ca993e8 | 236 | # Completeness for Greek |
729e4ab9 A |
237 | ϐ → | β ; # forward-only: a variant or symbol form is replaced by its base letter, with the cursor left before it so the normal rules then transliterate it |
238 | ϑ → | θ ; | |
239 | ϒ → | Υ ; | |
240 | ϕ → | φ ; | |
241 | ϖ → | π ; | |
242 | ϰ → | κ ; | |
243 | ϱ → | ρ ; | |
244 | ϲ → | σ ; | |
245 | Ϲ → | Σ; #U+03F9 GREEK CAPITAL LUNATE SIGMA SYMBOL | |
246 | ϳ → j ; # emitted directly as Latin j, no re-processing | |
247 | ϴ → | Θ ; | |
248 | ϵ → | ε ; | |
249 | µ → | μ ; | |
250 | ͺ → i; # emitted directly as Latin i, no re-processing | |
2ca993e8 | 251 | # delete any trailing ' marks used for roundtripping |
729e4ab9 A |
252 | ← [Ππ] { \' } [Ss] ; |
253 | ← [Νν] { \' } $egammaLike ; | |
374ca955 | 254 | ::NFC (NFD) ; |
2ca993e8 A |
255 | # ([\u0000-\u007F [:Latin:] [:Greek:] [:nonspacing mark:]]) ; |
256 | # ([\u0000-\u007F · [:Latin:] [:nonspacing mark:]]) ; | |
257 | # MINIMAL FILTER GENERATED FOR: Latin-Greek BACKWARD | |
73c04bcf | 258 | :: ( [':?A-Za-zÀ-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0300-\u0337\u0339-\u0345΅-ΆΈ-ΊΌΎ-ΐΪ-ΰϊ-ώϓ-ϔЀ-ЁЃЇЌ-ЎЙйѐ-ёѓїќ-ўѶ-ѷӁ-ӂӐ-ӓӖ-ӗӚ-ӟӢ-ӧӪ-ӵӸ-ӹḀ-ẙẛẠ-ỹἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼ῁-ῄῆ-ΐῖ-Ί῝-΅ῲ-ῴῶ-ῼK-Å] ) ; |
2ca993e8 | 259 |