]> git.saurik.com Git - apple/icu.git/blob - icuSources/data/translit/Greek_Latin.txt
ICU-6.2.7.tar.gz
[apple/icu.git] / icuSources / data / translit / Greek_Latin.txt
1 #--------------------------------------------------------------------
2 # Copyright (c) 1999-2004, International Business Machines
3 # Corporation and others. All Rights Reserved.
4 #--------------------------------------------------------------------
5
6 # Rules are predicated on running NFD first, and NFC afterwards
7 # :: [\u0000-\u007F \u0370-\u03FF [:Greek:] [:nonspacing mark:]] ;
8 # MINIMAL FILTER GENERATED FOR: Greek-Latin
# Forward (Greek-to-Latin) source filter: only characters in this set are
# offered to the rules below; anything else is passed through unchanged.
# NOTE(review): the range \u03F7-\u07FB already contains code points that are
# listed separately after it (\u0401, \u0407, ...) and far exceeds the Greek
# block -- possibly a typo for a much shorter range; confirm against upstream.
9 :: [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u03F7-\u07FB\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126\u03F9] ;
10
# Decompose the input (NFD) before the rules run, so base letters and marks
# can be matched separately. The parenthesised NFC is the transform applied
# at this position when the rule set is run in the reverse direction.
11 :: NFD (NFC) ;
12
13 # TEST CASES
14
15 # Ὀλίγοι ἔμφονες πολλῶν ἀφρόνων φοβερώτεροι — Πλάτωνος
16 # ᾂ ᾒ ᾢ ᾃ ᾓ ᾣ
17 # ᾳ ῃ ῳ ὃ ὄ
18 # ὠς ὡς ὢς ὣς
19 # Ὠς Ὡς Ὢς Ὣς
20 # ὨΣ ὩΣ ὪΣ ὫΣ
21 # Ạ, ạ, Ẹ, ẹ, Ọ, ọ
22
23 # Useful variables
24
# Shared character classes and context patterns referenced by the rules below.

# Lowercase (Ll) / uppercase (Lu) letters taken from the Latin and Greek scripts;
# $glower is the Greek-only lowercase subset.
25 $lower = [[:latin:][:greek:] & [:Ll:]];
26 $glower = [[:greek:] & [:Ll:]];
27 $upper = [[:latin:][:greek:] & [:Lu:]] ;
# Any character of general category M (marks).
28 $accent = [:M:] ;
29
30 # NOTE: restrict to just the Greek & Latin accents that we care about
31 # TODO: broaden out once iteration is fixed
# Marks in U+0300..U+0345 except U+0338. Not to be confused with
# $accent_minus, which is defined separately next to the iota-subscript rules.
32 $accentMinus = [ [\u0300-\u0345] & [:M:] - [\u0338]] ;
33
# Combining macron (U+0304), combining diaeresis (U+0308), and the set of both.
34 $macron = \u0304 ;
35 $ddot = \u0308 ;
36 $ddotmac = [$ddot$macron];
37
# Greek vowels: lowercase, uppercase, and both.
38 $lcgvowel = [αεηιουω] ;
39 $ucgvowel = [ΑΕΗΙΟΥΩ] ;
40 $gvowel = [$lcgvowel $ucgvowel] ;
# Lowercase Greek vowel or any mark.
# NOTE(review): $lcgvowelC does not appear to be referenced by any rule in this file.
41 $lcgvowelC = [$lcgvowel $accent] ;
42
# Latin vowels (y counts as a vowel); $evowel2 is the subset that can appear
# as the second element of a two-vowel sequence in the breathing rules.
43 $evowel = [aeiouyAEIOUY];
44 $evowel2 = [iuyIUY];
# Any vowel, Latin or Greek.
45 $vowel = [ $evowel $gvowel] ;
46
# Greek letters before which gamma is written "n", and the Latin letters that
# begin their transliterations (used as the reverse-direction context).
47 $gammaLike = [ΓΚΞΧγκξχϰ] ;
48 $egammaLike = [GKXCgkxc] ;
# Literal combining characters: smooth breathing (U+0313), rough breathing
# (U+0314) and iota subscript (U+0345).
49 $smooth = ̓ ;
50 $rough = ̔ ;
51 $iotasub = ͅ ;
52
# Variants of $evowel / $evowel2 with i and I removed.
# NOTE(review): $evowel_i does not appear to be referenced by any rule in this
# file; $evowel2_i is used by the reverse rough-breathing rules.
53 $evowel_i = [$evowel-[iI]] ;
54 $evowel2_i = [uyUY];
55
# Combining macron below (U+0331); used by the sigma rules as an exception marker.
56 $underbar = \u0331;
57
# Context helpers:
#   $afterLetter  - a letter followed by any number of marks or apostrophes
#   $beforeLetter - any number of marks or apostrophes followed by a letter
#   $beforeLower  - any number of marks followed by a lowercase letter
58 $afterLetter = [:L:] [[:M:]\']* ;
59 $beforeLetter = [[:M:]\']* [:L:] ;
60 $beforeLower = $accent * $lower ;
61
# Anything that is neither a letter nor a mark.
62 $notLetter = [^[:L:][:M:]] ;
# Same character as $underbar (U+0331), written literally; used by the
# punctuation rules.
63 $under = ̱;
64
65 # Fix punctuation
66 # preserve original
# A ':' or '?' that is already present in the Greek text is tagged with $under
# on the Latin side, so the reverse direction can tell it apart from a ':' or
# '?' that was produced by the two mappings further down.
67 \: <> \: $under ;
68 \? <> \? $under ;
69
# Greek question mark (';') <-> '?', and Greek middle dot <-> ':'.
70 \; <> \? ;
71 · <> \: ;
72
73 # CIRCUMFLEX: convert greek circumflex to normal one. Could use tilde or inverted breve
74
# U+0342 (Greek perispomeni) <-> U+0302 (combining circumflex).
75 \u0342 <> \u0302 ;
76
77 # IOTA: convert iota subscript to iota
78 # first make previous alpha long!
79
# Marks other than the iota subscript and the macron. Excluding the macron
# keeps the two alpha rules below from matching again after they have
# inserted one (the cursor '|' is placed before the alpha, so the text is
# re-examined).
80 $accent_minus = [[$accent]-[$iotasub$macron]];
81
# Alpha followed (after optional other marks) by an iota subscript: add a
# macron to the alpha and re-process from the alpha.
82 Α } $accent_minus * $iotasub > | Α $macron ;
83 α } $accent_minus * $iotasub > | α $macron ;
84
85 # now convert to uppercase if after uppercase, otherwise to lowercase
86
# The first rule only fires when the subscript follows an uppercase letter
# (plus optional marks); every other subscript falls through to the second.
87 $upper $accent * { $iotasub > I ;
88 $iotasub > i ;
89
# Reverse direction: a Latin vowel carrying a macron (plus optional accents)
# followed by i / I becomes the same sequence followed by an iota subscript;
# the cursor is placed before it so the vowel itself is then converted.
90 | $1 $iotasub < ($evowel $macron $accentMinus *) i ;
91 | $1 $iotasub < ($evowel $macron $accentMinus *) I ;
92
93 # BREATHING
94
95 # Convert rough breathing to h, and move before letters.
96
97 # Make A ` x = > H a x
98
# Capital vowel with rough breathing, followed by a lowercase letter
# (titlecase word): emit "H", then a lowercased copy of the vowel with the
# cursor '|' in front of it so the later letter rules transliterate it.
# $1 carries an optional macron / diaeresis across unchanged.
99 Α ($macron?) $rough } $beforeLower > H | α $1;
100 Ε $rough } $beforeLower > H | ε;
101 Η $rough } $beforeLower > H | η ;
102 Ι ($ddot?) $rough } $beforeLower > H | ι $1;
103 Ο $rough } $beforeLower > H | ο ;
104 Υ $rough } $beforeLower > H | υ ;
105 Ω ($ddot?) $rough } $beforeLower > H | ω $1;
106
107 # Make A x ` = > H a x
108
# Same idea when the breathing sits on the second letter: capital vowel, one
# lowercase Greek letter (captured as $1, with its optional mark), then the
# rough breathing.
109 Α ($glower $macron?) $rough > H | α $1 ;
110 Ε ($glower) $rough > H | ε $1 ;
111 Η ($glower) $rough > H | η $1 ;
112 Ι ($glower $ddot?) $rough > H | ι $1 ;
113 Ο ($glower) $rough > H | ο $1 ;
114 Υ ($glower) $rough > H | υ $1 ;
115 Ω ($glower $ddot?) $rough > H | ω $1 ;
116
117 #Otherwise, make x ` into h x and X ` into H X
118
# Generic fallback: a run of Greek vowels (optionally followed by a
# diaeresis or macron) and then the rough breathing. The all-lowercase
# pattern is listed first, so the second rule only sees runs that contain
# an uppercase vowel.
119 ($lcgvowel + $ddotmac? ) $rough > h | $1 ;
120 ($gvowel + $ddotmac? ) $rough > H | $1 ;
121
122 # Go backwards with H
123
# Reverse direction: "h"/"H" followed by a Latin vowel sequence becomes the
# same sequence followed by a rough breathing, with the cursor before it so
# the vowels are converted afterwards. Within each group the patterns run
# from most specific (two vowels, first with macron) to least specific.
124 | $1 $rough < h ($evowel $macron $ddot? $evowel2_i $macron?) ;
125 | $1 $rough < h ($evowel $ddot? $evowel2 $macron?) ;
126 | $1 $rough < h ($evowel $macron? $ddot?) ;
127
128 | $1 $rough < H ([AEIOUY] $macron $ddot? $evowel2_i $macron?) ;
129 | $1 $rough < H ([AEIOUY] $ddot? $evowel2 $macron?) ;
130 | $1 $rough < H ([AEIOUY] $macron? $ddot?) ;
131
132 # titlecase, have to fix individually
133 # in the future, we should add &uppercase() to make this easier
134
# Reverse direction, titlecase: "H" + lowercase Latin vowel. The vowel is
# re-emitted in uppercase (the capital moves from the H to the vowel), the
# remainder of the sequence ($1) is kept, and a rough breathing is appended.
# The three groups mirror the three pattern shapes used above.
135 | A $1 $rough < H a ($macron $ddot? $evowel2_i $macron?) ;
136 | E $1 $rough < H e ($macron $ddot? $evowel2_i $macron?) ;
137 | I $1 $rough < H i ($macron $ddot? $evowel2_i $macron?) ;
138 | O $1 $rough < H o ($macron $ddot? $evowel2_i $macron?) ;
139 | U $1 $rough < H u ($macron $ddot? $evowel2_i $macron?) ;
140 | Y $1 $rough < H y ($macron $ddot? $evowel2_i $macron?) ;
141
142 | A $1 $rough < H a ($ddot? $evowel2 $macron?) ;
143 | E $1 $rough < H e ($ddot? $evowel2 $macron?) ;
144 | I $1 $rough < H i ($ddot? $evowel2 $macron?) ;
145 | O $1 $rough < H o ($ddot? $evowel2 $macron?) ;
146 | U $1 $rough < H u ($ddot? $evowel2 $macron?) ;
147 | Y $1 $rough < H y ($ddot? $evowel2 $macron?) ;
148
149 | A $1 $rough < H a ($macron? $ddot? ) ;
150 | E $1 $rough < H e ($macron? $ddot? ) ;
151 | I $1 $rough < H i ($macron? $ddot? ) ;
152 | O $1 $rough < H o ($macron? $ddot? ) ;
153 | U $1 $rough < H u ($macron? $ddot? ) ;
154 | Y $1 $rough < H y ($macron? $ddot? ) ;
155
156 # Now do smooth
157
158 #delete smooth breathing for Latin
# Forward only: the smooth breathing has no Latin counterpart and is dropped.
159 $smooth > ;
160
161 # insert in Greek
162 # the assumption is that all Marks are on letters.
163
# Reverse only: re-insert a smooth breathing on a word-initial element, i.e.
# one preceded by a non-letter ($notLetter):
#   - r/R not followed by h/H or an existing breathing,
#   - a two-vowel sequence not followed by an existing breathing,
#   - a single vowel not followed by a second-position vowel or a breathing.
# The cursor '|' is placed before the re-emitted text so it is then converted.
164 | $1 $smooth < $notLetter { ([rR]) } [^hH$smooth$rough] ;
165 | $1 $smooth < $notLetter { ($evowel $macron? $evowel2 $macron?) } [^$smooth$rough] ;
166 | $1 $smooth < $notLetter { ($evowel $macron?) } [^$evowel2$smooth$rough] ;
167
168 # TODO: preserve smooth/rough breathing if not
169 # on initial vowel sequence
170
171 # need to have these up here so the rules don't mask
# These multi-character mappings are listed ahead of the single-letter rules
# in the NORMAL section; otherwise a shorter rule (e.g. the one for a plain
# "e" or "p") would match first and the longer form would never be reached.
172
173 # remove now superfluous macron when returning
174
# Reverse only: A/a + macron maps back to plain alpha (the macron was only
# added by the iota-subscript rules).
175 Α < A $macron ;
176 α < a $macron ;
177
# Eta <-> e + macron.
178 η <> e $macron ;
179 Η <> E $macron ;
180
# Phi and psi digraphs. For the capitals, the form with a lowercase second
# letter (Ps / Ph) is used when a lowercase letter follows, otherwise the
# all-caps form.
181 φ <> ph ;
182 Ψ } $beforeLower <> Ps ;
183 Ψ <> PS ;
184
185 Φ } $beforeLower <> Ph ;
186 Φ <> PH ;
187 ψ <> ps ;
188
# Omega <-> o + macron.
189 ω <> o $macron ;
190 Ω <> O $macron;
191
192 # NORMAL
# One-to-one letter mappings, alpha through rho, in both directions unless a
# rule uses a single-direction arrow.
193
194 α <> a ;
195 Α <> A ;
196
197 β <> b ;
198 Β <> B ;
199
# Gamma before a $gammaLike letter is written "n"; in reverse, "n" before a
# letter in $egammaLike maps back to gamma. Otherwise gamma <-> g.
200 γ } $gammaLike <> n } $egammaLike ;
201 γ <> g ;
202 Γ } $gammaLike <> N } $egammaLike ;
203 Γ <> G ;
204
205 δ <> d ;
206 Δ <> D ;
207
208 ε <> e ;
209 Ε <> E ;
210
211 ζ <> z ;
212 Ζ <> Z ;
213
# Theta: "Th" when a lowercase letter follows, otherwise "TH".
214 θ <> th ;
215 Θ } $beforeLower <> Th ;
216 Θ <> TH ;
217
218 ι <> i ;
219 Ι <> I ;
220
221 κ <> k ;
222 Κ <> K ;
223
224 λ <> l ;
225 Λ <> L ;
226
227 μ <> m ;
228 Μ <> M ;
229
# Nu before a $gammaLike letter gets a trailing apostrophe so it is not read
# back as gamma by the gamma rules above.
# NOTE(review): the lowercase rule uses '>' while the uppercase rule uses
# '<>' -- confirm whether this asymmetry is intentional.
230 ν } $gammaLike > n\' ;
231 ν <> n ;
232 Ν } $gammaLike <> N\' ;
233 Ν <> N ;
234
235 ξ <> x ;
236 Ξ <> X ;
237
238 ο <> o ;
239 Ο <> O ;
240
241 π <> p ;
242 Π <> P ;
243
# Rho with rough breathing <-> rh / Rh / RH (same casing convention as for
# theta); plain rho <-> r.
244 ρ $rough <> rh;
245 Ρ $rough } $beforeLower <> Rh ;
246 Ρ $rough <> RH ;
247 ρ <> r ;
248 Ρ <> R ;
249
250 # insert separator before things that turn into s
251
# Forward only: after an already-produced Latin P/p, insert an apostrophe
# before a following sigma-like letter, so that pi + sigma ("p's") stays
# distinguishable from psi ("ps").
252 [Pp] { } [ςσΣϷϸϺϻ] > \' ;
253
254 # special S variants
255
256 Ϸ <> Š ; # Ϸ GREEK CAPITAL LETTER SHO Uppercase_Letter Grek - L
257 ϸ <> š ; #ϸ GREEK SMALL LETTER SHO Lowercase_Letter Grek - L
258 Ϻ <> Ŝ ; # Ϻ GREEK CAPITAL LETTER SAN Uppercase_Letter Grek - L
259 ϻ <> ŝ ; # ϻ GREEK SMALL LETTER SAN Lowercase_Letter Grek - L
260
261 # underbar means exception
# Plain "s" stands for the sigma form expected by position (medial sigma
# before a letter, final sigma after one); "s" + $underbar marks the
# unexpected form so the reverse direction can restore it.
262
263 # before a letter, initial
264 ς } $beforeLetter <> s $underbar } $beforeLetter;
265 σ } $beforeLetter <> s } $beforeLetter;
266
267 # otherwise, after a letter = final
268 $afterLetter { σ <> $afterLetter { s $underbar;
269 $afterLetter { ς <> $afterLetter { s ;
270
271 # otherwise (isolated) = initial
272 ς <> s $underbar;
273 σ <> s ;
274
275 # [Pp] { Σ <> \'S ;
276 Σ <> S ;
277
278 τ <> t ;
279 Τ <> T ;
280
# Upsilon is "u" when it follows a vowel (second element of a diphthong) and
# "y" otherwise.
281 $vowel {υ } <> u ;
282 υ <> y ;
283 $vowel { Υ <> U ;
284 Υ <> Y ;
285
# Chi: "Ch" when a lowercase letter follows, otherwise "CH".
286 χ <> ch ;
287 Χ } $beforeLower <> Ch ;
288 Χ <> CH ;
289
290 # Completeness for ASCII
291
# Any run of marks or apostrophes; used as optional filler in the rough
# breathing contexts further down.
292 $ignore = [[:Mark:]''] * ;
293
# Reverse only: Latin letters with no rule of their own are rewritten to a
# Latin spelling that does have one. The cursor '|' is placed before the
# replacement so it is then converted by the regular rules. v/w (V/W) become
# b (B) before a vowel and u (U) otherwise.
294 | k < c ;
295 | ph < f ;
296 | i < j ;
297 | k < q ;
298 | b < v } $vowel ;
299 | b < w } $vowel;
300 | u < v ;
301 | u < w;
302 | K < C ;
303 | Ph < F ;
304 | I < J ;
305 | K < Q ;
306 | B < V } $vowel ;
307 | B < W } $vowel ;
308 | U < V ;
309 | U < W ;
310
# Rough breathings not consumed by the BREATHING section: forward, they
# become "H" when adjacent (ignoring marks/apostrophes) to an uppercase
# letter and "h" otherwise; in reverse, a leftover H or h becomes a rough
# breathing.
311 $rough } $ignore [:UppercaseLetter:] > H ;
312 $ignore [:UppercaseLetter:] { $rough > H ;
313 $rough < H ;
314 $rough <> h ;
315
316 # Completeness for Greek
317
# Forward only: variant / symbol forms of Greek letters are replaced by the
# ordinary letter, with the cursor '|' in front so the regular rules then
# transliterate it.
318 ϐ > | β ;
319 ϑ > | θ ;
320 ϒ > | Υ ;
321 ϕ > | φ ;
322 ϖ > | π ;
323
324 ϰ > | κ ;
325 ϱ > | ρ ;
326 ϲ > | σ ;
327 Ϲ > | Σ; #U+03F9 GREEK CAPITAL LUNATE SIGMA SYMBOL
# This one maps straight to Latin "j" (no cursor, so no re-processing).
328 ϳ > j ;
329 ϴ > | Θ ;
330 ϵ > | ε ;
331
# Micro sign is treated as Greek mu.
332 µ > | μ ;
333
# Spacing iota subscript (ypogegrammeni) becomes a plain "i".
334 ͺ > i;
335
336 # delete any trailing ' marks used for roundtripping
337
# Reverse only: drop the separator apostrophe between pi and a following
# S/s, and between nu and a following letter in $egammaLike.
338 < [Ππ] { \' } [Ss] ;
339 < [Νν] { \' } $egammaLike ;
340
# Recompose the output (NFC) after the forward rules; the parenthesised NFD
# is the transform applied at this position when running in reverse, i.e.
# the Latin input is decomposed before the reverse rules see it.
341 ::NFC (NFD) ;
342 # ([\u0000-\u007F [:Latin:] [:Greek:] [:nonspacing mark:]]) ;
343 # ([\u0000-\u007F \u00B7 [:Latin:] [:nonspacing mark:]]) ;
344 # MINIMAL FILTER GENERATED FOR: Latin-Greek BACKWARD
# Reverse (Latin-to-Greek) source filter; the surrounding parentheses make it
# apply to the reverse direction only.
345 :: ( [':?A-Za-z\u00C0-\u00C5\u00C7-\u00CF\u00D1-\u00D6\u00D9-\u00DD\u00E0-\u00E5\u00E7-\u00EF\u00F1-\u00F6\u00F9-\u00FD\u00FF-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0300-\u0337\u0339-\u0345\u0385-\u0386\u0388-\u038A\u038C\u038E-\u0390\u03AA-\u03B0\u03CA-\u03CE\u03D3-\u03D4\u0400-\u0401\u0403\u0407\u040C-\u040E\u0419\u0439\u0450-\u0451\u0453\u0457\u045C-\u045E\u0476-\u0477\u04C1-\u04C2\u04D0-\u04D3\u04D6-\u04D7\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F5\u04F8-\u04F9\u1E00-\u1E99\u1E9B\u1EA0-\u1EF9\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FC1-\u1FC4\u1FC6-\u1FD3\u1FD6-\u1FDB\u1FDD-\u1FEE\u1FF2-\u1FF4\u1FF6-\u1FFC\u212A-\u212B] ) ;