]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | #-------------------------------------------------------------------- |
2 | # Copyright (c) 1999-2004, International Business Machines | |
3 | # Corporation and others. All Rights Reserved. | |
4 | #-------------------------------------------------------------------- | |
5 | ||
6 | # Rules are predicated on running NFD first, and NFC afterwards (forward direction; see the :: NFD (NFC) step below) | |
7 | # Disabled broader filter: :: [\u0000-\u007F \u0370-\u03FF [:Greek:] [:nonspacing mark:]] ; | |
8 | # MINIMAL FILTER GENERATED FOR: Greek-Latin. NOTE(review): the range \u03F7-\u07FB below already contains the \u0401 ... \u04F9 entries listed after it; confirm whether \u03F7-\u03FB was intended | |
9 | :: [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u03F7-\u07FB\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126\u03F9] ; | |
10 | ||
11 | :: NFD (NFC) ; | |
12 | ||
13 | # TEST CASES | |
14 | ||
15 | # Ὀλίγοι ἔμφονες πολλῶν ἀφρόνων φοβερώτεροι — Πλάτωνος | |
16 | # ᾂ ᾒ ᾢ ᾃ ᾓ ᾣ | |
17 | # ᾳ ῃ ῳ ὃ ὄ | |
18 | # ὠς ὡς ὢς ὣς | |
19 | # Ὠς Ὡς Ὢς Ὣς | |
20 | # ὨΣ ὩΣ ὪΣ ὫΣ | |
21 | # Ạ, ạ, Ẹ, ẹ, Ọ, ọ | |
22 | ||
23 | # Useful variables: letter/mark sets and single combining marks referenced by the rules below. The literal marks assigned to $smooth, $rough, $iotasub and $under appear to be U+0313, U+0314, U+0345 and U+0331 (so $under would duplicate $underbar) - TODO confirm | |
24 | ||
25 | $lower = [[:latin:][:greek:] & [:Ll:]]; | |
26 | $glower = [[:greek:] & [:Ll:]]; | |
27 | $upper = [[:latin:][:greek:] & [:Lu:]] ; | |
28 | $accent = [:M:] ; | |
29 | ||
30 | # NOTE: $accentMinus is restricted to the marks in \u0300-\u0345 other than \u0338, i.e. just the Greek & Latin accents that we care about | |
31 | # TODO: broaden out once interation is fixed | |
32 | $accentMinus = [ [\u0300-\u0345] & [:M:] - [\u0338]] ; | |
33 | ||
34 | $macron = \u0304 ; | |
35 | $ddot = \u0308 ; | |
36 | $ddotmac = [$ddot$macron]; | |
37 | ||
38 | $lcgvowel = [αεηιουω] ; | |
39 | $ucgvowel = [ΑΕΗΙΟΥΩ] ; | |
40 | $gvowel = [$lcgvowel $ucgvowel] ; | |
41 | $lcgvowelC = [$lcgvowel $accent] ; | |
42 | ||
43 | $evowel = [aeiouyAEIOUY]; | |
44 | $evowel2 = [iuyIUY]; | |
45 | $vowel = [ $evowel $gvowel] ; | |
46 | ||
47 | $gammaLike = [ΓΚΞΧγκξχϰ] ; | |
48 | $egammaLike = [GKXCgkxc] ; | |
49 | $smooth = ̓ ; | |
50 | $rough = ̔ ; | |
51 | $iotasub = ͅ ; | |
52 | ||
53 | $evowel_i = [$evowel-[iI]] ; | |
54 | $evowel2_i = [uyUY]; | |
55 | ||
56 | $underbar = \u0331; | |
57 | ||
58 | $afterLetter = [:L:] [[:M:]\']* ; | |
59 | $beforeLetter = [[:M:]\']* [:L:] ; | |
60 | $beforeLower = $accent * $lower ; | |
61 | ||
62 | $notLetter = [^[:L:][:M:]] ; | |
63 | $under = ̱; | |
64 | ||
65 | # Fix punctuation: Greek ';' pairs with Latin '?', and Greek '·' pairs with Latin ':' | |
66 | # preserve original ':' and '?' by tagging them with $under on the Latin side; these rules come first so they win over the plain mappings below | |
67 | \: <> \: $under ; | |
68 | \? <> \? $under ; | |
69 | ||
70 | \; <> \? ; | |
71 | · <> \: ; | |
72 | ||
73 | # CIRCUMFLEX: convert the Greek circumflex (\u0342) to the normal one (\u0302). Could use tilde or inverted breve instead | |
74 | ||
75 | \u0342 <> \u0302 ; | |
76 | ||
77 | # IOTA: convert iota subscript to iota | |
78 | # first make previous alpha long: when Α/α is followed (after any other accents) by $iotasub, insert $macron after the alpha and reprocess from the alpha | |
79 | ||
80 | $accent_minus = [[$accent]-[$iotasub$macron]]; | |
81 | ||
82 | Α } $accent_minus * $iotasub > | Α $macron ; | |
83 | α } $accent_minus * $iotasub > | α $macron ; | |
84 | ||
85 | # now convert to uppercase I if after an uppercase letter (plus accents), otherwise to lowercase i; the two reverse rules turn i/I after a macron-bearing Latin vowel back into $iotasub | |
86 | ||
87 | $upper $accent * { $iotasub > I ; | |
88 | $iotasub > i ; | |
89 | ||
90 | | $1 $iotasub < ($evowel $macron $accentMinus *) i ; | |
91 | | $1 $iotasub < ($evowel $macron $accentMinus *) I ; | |
92 | ||
93 | # BREATHING | |
94 | ||
95 | # Convert rough breathing to h, and move before letters. | |
96 | ||
97 | # Make A ` x => H a x: capital vowel + rough breathing before a lowercase letter emits H and lowercases the vowel. NOTE(review): Ω, not Υ, takes the optional $ddot here and in the next group - confirm intended | |
98 | ||
99 | Α ($macron?) $rough } $beforeLower > H | α $1; | |
100 | Ε $rough } $beforeLower > H | ε; | |
101 | Η $rough } $beforeLower > H | η ; | |
102 | Ι ($ddot?) $rough } $beforeLower > H | ι $1; | |
103 | Ο $rough } $beforeLower > H | ο ; | |
104 | Υ $rough } $beforeLower > H | υ ; | |
105 | Ω ($ddot?) $rough } $beforeLower > H | ω $1; | |
106 | ||
107 | # Make A x ` => H a x: same, but the rough breathing sits on a following lowercase Greek letter | |
108 | ||
109 | Α ($glower $macron?) $rough > H | α $1 ; | |
110 | Ε ($glower) $rough > H | ε $1 ; | |
111 | Η ($glower) $rough > H | η $1 ; | |
112 | Ι ($glower $ddot?) $rough > H | ι $1 ; | |
113 | Ο ($glower) $rough > H | ο $1 ; | |
114 | Υ ($glower) $rough > H | υ $1 ; | |
115 | Ω ($glower $ddot?) $rough > H | ω $1 ; | |
116 | ||
117 | # Otherwise, make x ` into h x and X ` into H X (the lowercase-only rule is tried first) | |
118 | ||
119 | ($lcgvowel + $ddotmac? ) $rough > h | $1 ; | |
120 | ($gvowel + $ddotmac? ) $rough > H | $1 ; | |
121 | ||
122 | # Go backwards with H: Latin h/H followed by a vowel sequence becomes that sequence followed by $rough; longer sequences are listed before shorter ones | |
123 | ||
124 | | $1 $rough < h ($evowel $macron $ddot? $evowel2_i $macron?) ; | |
125 | | $1 $rough < h ($evowel $ddot? $evowel2 $macron?) ; | |
126 | | $1 $rough < h ($evowel $macron? $ddot?) ; | |
127 | ||
128 | | $1 $rough < H ([AEIOUY] $macron $ddot? $evowel2_i $macron?) ; | |
129 | | $1 $rough < H ([AEIOUY] $ddot? $evowel2 $macron?) ; | |
130 | | $1 $rough < H ([AEIOUY] $macron? $ddot?) ; | |
131 | ||
132 | # titlecase (H + lowercase vowel): have to fix each vowel individually, uppercasing it when the H is dropped | |
133 | # in the future, we should add &uppercase() to make this easier | |
134 | ||
135 | | A $1 $rough < H a ($macron $ddot? $evowel2_i $macron?) ; | |
136 | | E $1 $rough < H e ($macron $ddot? $evowel2_i $macron?) ; | |
137 | | I $1 $rough < H i ($macron $ddot? $evowel2_i $macron?) ; | |
138 | | O $1 $rough < H o ($macron $ddot? $evowel2_i $macron?) ; | |
139 | | U $1 $rough < H u ($macron $ddot? $evowel2_i $macron?) ; | |
140 | | Y $1 $rough < H y ($macron $ddot? $evowel2_i $macron?) ; | |
141 | ||
142 | | A $1 $rough < H a ($ddot? $evowel2 $macron?) ; | |
143 | | E $1 $rough < H e ($ddot? $evowel2 $macron?) ; | |
144 | | I $1 $rough < H i ($ddot? $evowel2 $macron?) ; | |
145 | | O $1 $rough < H o ($ddot? $evowel2 $macron?) ; | |
146 | | U $1 $rough < H u ($ddot? $evowel2 $macron?) ; | |
147 | | Y $1 $rough < H y ($ddot? $evowel2 $macron?) ; | |
148 | ||
149 | | A $1 $rough < H a ($macron? $ddot? ) ; | |
150 | | E $1 $rough < H e ($macron? $ddot? ) ; | |
151 | | I $1 $rough < H i ($macron? $ddot? ) ; | |
152 | | O $1 $rough < H o ($macron? $ddot? ) ; | |
153 | | U $1 $rough < H u ($macron? $ddot? ) ; | |
154 | | Y $1 $rough < H y ($macron? $ddot? ) ; | |
155 | ||
156 | # Now do smooth breathing | |
157 | ||
158 | # delete smooth breathing when going to Latin (forward direction) | |
159 | $smooth > ; | |
160 | ||
161 | # insert in Greek (reverse direction): after a non-letter, add $smooth to an r/R not followed by h/H, or to a one- or two-vowel sequence, unless a breathing mark already follows | |
162 | # the assumption is that all Marks are on letters. | |
163 | ||
164 | | $1 $smooth < $notLetter { ([rR]) } [^hH$smooth$rough] ; | |
165 | | $1 $smooth < $notLetter { ($evowel $macron? $evowel2 $macron?) } [^$smooth$rough] ; | |
166 | | $1 $smooth < $notLetter { ($evowel $macron?) } [^$evowel2$smooth$rough] ; | |
167 | ||
168 | # TODO: preserve smooth/rough breathing if not | |
169 | # on initial vowel sequence | |
170 | ||
171 | # These multi-character rules need to be up here so that the single-letter rules further down don't mask them | |
172 | ||
173 | # remove now superfluous macron when returning (reverse only): A/a + $macron goes back to plain Α/α | |
174 | ||
175 | Α < A $macron ; | |
176 | α < a $macron ; | |
177 | ||
178 | η <> e $macron ; | |
179 | Η <> E $macron ; | |
180 | ||
181 | φ <> ph ; | |
182 | Ψ } $beforeLower <> Ps ; | |
183 | Ψ <> PS ; | |
184 | ||
185 | Φ } $beforeLower <> Ph ; | |
186 | Φ <> PH ; | |
187 | ψ <> ps ; | |
188 | ||
189 | ω <> o $macron ; | |
190 | Ω <> O $macron; | |
191 | ||
192 | # NORMAL: letter-by-letter mappings; where a letter has a context-specific rule, it is listed before that letter's plain rule | |
193 | ||
194 | α <> a ; | |
195 | Α <> A ; | |
196 | ||
197 | β <> b ; | |
198 | Β <> B ; | |
199 | ||
200 | γ } $gammaLike <> n } $egammaLike ; | |
201 | γ <> g ; | |
202 | Γ } $gammaLike <> N } $egammaLike ; | |
203 | Γ <> G ; | |
204 | ||
205 | δ <> d ; | |
206 | Δ <> D ; | |
207 | ||
208 | ε <> e ; | |
209 | Ε <> E ; | |
210 | ||
211 | ζ <> z ; | |
212 | Ζ <> Z ; | |
213 | ||
214 | θ <> th ; | |
215 | Θ } $beforeLower <> Th ; | |
216 | Θ <> TH ; | |
217 | ||
218 | ι <> i ; | |
219 | Ι <> I ; | |
220 | ||
221 | κ <> k ; | |
222 | Κ <> K ; | |
223 | ||
224 | λ <> l ; | |
225 | Λ <> L ; | |
226 | ||
227 | μ <> m ; | |
228 | Μ <> M ; | |
229 | ||
230 | ν } $gammaLike > n\' ; | |
231 | ν <> n ; | |
232 | Ν } $gammaLike <> N\' ; | |
233 | Ν <> N ; | |
234 | ||
235 | ξ <> x ; | |
236 | Ξ <> X ; | |
237 | ||
238 | ο <> o ; | |
239 | Ο <> O ; | |
240 | ||
241 | π <> p ; | |
242 | Π <> P ; | |
243 | ||
244 | ρ $rough <> rh; | |
245 | Ρ $rough } $beforeLower <> Rh ; | |
246 | Ρ $rough <> RH ; | |
247 | ρ <> r ; | |
248 | Ρ <> R ; | |
249 | ||
250 | # insert separator (') after P/p before things that turn into s, so the result is not the "ps" that stands for ψ | |
251 | ||
252 | [Pp] { } [ςσΣϷϸϺϻ] > \' ; | |
253 | ||
254 | # special S variants (sho and san) | |
255 | ||
256 | Ϸ <> Š ; # Ϸ GREEK CAPITAL LETTER SHO Uppercase_Letter Grek - L | |
257 | ϸ <> š ; #ϸ GREEK SMALL LETTER SHO Lowercase_Letter Grek - L | |
258 | Ϻ <> Ŝ ; # Ϻ GREEK CAPITAL LETTER SAN Uppercase_Letter Grek - L | |
259 | ϻ <> ŝ ; # ϻ GREEK SMALL LETTER SAN Lowercase_Letter Grek - L | |
260 | ||
261 | # underbar means exception: s + $underbar marks a sigma form that is unusual for its position (ς not at the end of a word, or σ at the end) | |
262 | ||
263 | # before a letter, initial (σ is the expected form, so ς gets the underbar) | |
264 | ς } $beforeLetter <> s $underbar } $beforeLetter; | |
265 | σ } $beforeLetter <> s } $beforeLetter; | |
266 | ||
267 | # otherwise, after a letter = final (ς is the expected form, so σ gets the underbar) | |
268 | $afterLetter { σ <> $afterLetter { s $underbar; | |
269 | $afterLetter { ς <> $afterLetter { s ; | |
270 | ||
271 | # otherwise (isolated) = initial | |
272 | ς <> s $underbar; | |
273 | σ <> s ; | |
274 | ||
275 | # Disabled rule: [Pp] { Σ <> \'S ; | |
276 | Σ <> S ; | |
277 | ||
278 | τ <> t ; | |
279 | Τ <> T ; | |
280 | ||
281 | $vowel {υ } <> u ; | |
282 | υ <> y ; | |
283 | $vowel { Υ <> U ; | |
284 | Υ <> Y ; | |
285 | ||
286 | χ <> ch ; | |
287 | Χ } $beforeLower <> Ch ; | |
288 | Χ <> CH ; | |
289 | ||
290 | # Completeness for ASCII (reverse direction): Latin letters with no Greek rule of their own are rewritten to Latin letters that have one, with the cursor left before the replacement; the final rules give a stray $rough / h / H a mapping | |
291 | ||
292 | $ignore = [[:Mark:]''] * ; | |
293 | ||
294 | | k < c ; | |
295 | | ph < f ; | |
296 | | i < j ; | |
297 | | k < q ; | |
298 | | b < v } $vowel ; | |
299 | | b < w } $vowel; | |
300 | | u < v ; | |
301 | | u < w; | |
302 | | K < C ; | |
303 | | Ph < F ; | |
304 | | I < J ; | |
305 | | K < Q ; | |
306 | | B < V } $vowel ; | |
307 | | B < W } $vowel ; | |
308 | | U < V ; | |
309 | | U < W ; | |
310 | ||
311 | $rough } $ignore [:UppercaseLetter:] > H ; | |
312 | $ignore [:UppercaseLetter:] { $rough > H ; | |
313 | $rough < H ; | |
314 | $rough <> h ; | |
315 | ||
316 | # Completeness for Greek (forward direction): variant and symbol forms are rewritten to the regular Greek letter with the cursor left before it, except ϳ and ͺ, which map straight to Latin j and i | |
317 | ||
318 | ϐ > | β ; | |
319 | ϑ > | θ ; | |
320 | ϒ > | Υ ; | |
321 | ϕ > | φ ; | |
322 | ϖ > | π ; | |
323 | ||
324 | ϰ > | κ ; | |
325 | ϱ > | ρ ; | |
326 | ϲ > | σ ; | |
327 | Ϲ > | Σ; #U+03F9 GREEK CAPITAL LUNATE SIGMA SYMBOL | |
328 | ϳ > j ; | |
329 | ϴ > | Θ ; | |
330 | ϵ > | ε ; | |
331 | ||
332 | µ > | μ ; | |
333 | ||
334 | ͺ > i; | |
335 | ||
336 | # delete any ' separator marks used for roundtripping (reverse direction): a ' between Π/π and S/s, or between Ν/ν and a letter in $egammaLike | |
337 | ||
338 | < [Ππ] { \' } [Ss] ; | |
339 | < [Νν] { \' } $egammaLike ; | |
340 | ||
341 | ::NFC (NFD) ; | |
342 | # Disabled broader reverse filter: ([\u0000-\u007F [:Latin:] [:Greek:] [:nonspacing mark:]]) ; | |
343 | # Disabled broader reverse filter: ([\u0000-\u007F \u00B7 [:Latin:] [:nonspacing mark:]]) ; | |
344 | # MINIMAL FILTER GENERATED FOR: Latin-Greek BACKWARD | |
345 | :: ( [':?A-Za-z\u00C0-\u00C5\u00C7-\u00CF\u00D1-\u00D6\u00D9-\u00DD\u00E0-\u00E5\u00E7-\u00EF\u00F1-\u00F6\u00F9-\u00FD\u00FF-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0300-\u0337\u0339-\u0345\u0385-\u0386\u0388-\u038A\u038C\u038E-\u0390\u03AA-\u03B0\u03CA-\u03CE\u03D3-\u03D4\u0400-\u0401\u0403\u0407\u040C-\u040E\u0419\u0439\u0450-\u0451\u0453\u0457\u045C-\u045E\u0476-\u0477\u04C1-\u04C2\u04D0-\u04D3\u04D6-\u04D7\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F5\u04F8-\u04F9\u1E00-\u1E99\u1E9B\u1EA0-\u1EF9\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FC1-\u1FC4\u1FC6-\u1FD3\u1FD6-\u1FDB\u1FDD-\u1FEE\u1FF2-\u1FF4\u1FF6-\u1FFC\u212A-\u212B] ) ; |