]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | #-------------------------------------------------------------------- |
2 | # Copyright (c) 1999-2004, International Business Machines | |
3 | # Corporation and others. All Rights Reserved. | |
4 | #-------------------------------------------------------------------- | |
5 | ||
6 | # Generally follows UNGEGN <http://www.eki.ee/wgrs/rom1_ar.pdf> | |
7 | # Occasionally deviates in the direction of ISO 233 <http://homepage.mac.com/sirbinks/pdf/Arabic.pdf> | |
8 | # a) where required for disambiguation. | |
9 | # b) with underdot instead of cedilla for letters like SAD, since | |
10 | # those are explicitly in Unicode for transliteration. | |
11 | # c) with extra non-Arabic-language letters, like PEH | |
12 | ||
13 | # Does *not* do assimilation of "al", nor hyphenation. | |
14 | # While it could be done, we need to determine whether a prefix "al" could | |
15 | # occur other than as the definite article (since no space is used). | |
16 | ||
17 | :: [[:Arabic:] [ⁿ\u060C\u061B\u061F\u0640\u064B-\u0655\u0660-\u066C\u06F0-\u06F9\uFDFC]] ; # global filter (forward direction): only characters in this set are handled by the rules | |
18 | :: NFKD (NFC); # forward direction applies NFKD; reverse direction applies NFC | |
19 | $disambig = ̱ ; # COMBINING MACRON BELOW (U+0331) | |
20 | $disambig2 = ̰ ; # COMBINING TILDE BELOW (U+0330) | |
21 | $under = ̣ ; # COMBINING DOT BELOW (U+0323) | |
22 | ||
23 | $notAbove = [[:^ccc=0:]&[:^ccc=230:]]; # marks whose canonical combining class is neither 0 nor 230 (Above) | |
24 | ||
25 | # non-letters: punctuation and digits. Rules that append $disambig produce a marked Latin form, keeping e.g. ARABIC THOUSANDS SEPARATOR distinct from ARABIC COMMA and the EXTENDED ARABIC-INDIC digits distinct from the ARABIC-INDIC digits. | |
26 | ||
27 | ٫ <> '.' $disambig ; # ARABIC DECIMAL SEPARATOR | |
28 | ٬ <> ',' $disambig ; # ARABIC THOUSANDS SEPARATOR | |
29 | # ٭ <> ; # ARABIC FIVE POINTED STAR // no need to transliterate | |
30 | ||
31 | ، <> ',' ; # ARABIC COMMA | |
32 | ؛ <> ';' ; # ARABIC SEMICOLON | |
33 | ؟ <> '?' ; # ARABIC QUESTION MARK | |
34 | ٪ <> '%' ; # ARABIC PERCENT SIGN | |
35 | ||
36 | ۰ <> 0 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ZERO | |
37 | ۱ <> 1 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ONE | |
38 | ۲ <> 2 $disambig ; # EXTENDED ARABIC-INDIC DIGIT TWO | |
39 | ۳ <> 3 $disambig ; # EXTENDED ARABIC-INDIC DIGIT THREE | |
40 | ۴ <> 4 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FOUR | |
41 | ۵ <> 5 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FIVE | |
42 | ۶ <> 6 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SIX | |
43 | ۷ <> 7 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SEVEN | |
44 | ۸ <> 8 $disambig ; # EXTENDED ARABIC-INDIC DIGIT EIGHT | |
45 | ۹ <> 9 $disambig ; # EXTENDED ARABIC-INDIC DIGIT NINE | |
46 | ||
47 | ٠ <> 0 ; # ARABIC-INDIC DIGIT ZERO | |
48 | ١ <> 1 ; # ARABIC-INDIC DIGIT ONE | |
49 | ٢ <> 2 ; # ARABIC-INDIC DIGIT TWO | |
50 | ٣ <> 3 ; # ARABIC-INDIC DIGIT THREE | |
51 | ٤ <> 4 ; # ARABIC-INDIC DIGIT FOUR | |
52 | ٥ <> 5 ; # ARABIC-INDIC DIGIT FIVE | |
53 | ٦ <> 6 ; # ARABIC-INDIC DIGIT SIX | |
54 | ٧ <> 7 ; # ARABIC-INDIC DIGIT SEVEN | |
55 | ٨ <> 8 ; # ARABIC-INDIC DIGIT EIGHT | |
56 | ٩ <> 9 ; # ARABIC-INDIC DIGIT NINE | |
57 | ||
58 | # letters | |
59 | ||
60 | # long vowels: a short-vowel mark followed by its matching letter maps to a single macron vowel | |
61 | َا<> ā ; # ARABIC FATHA, ARABIC LETTER ALEF | |
62 | ُو <> ū ; # ARABIC DAMMA, ARABIC LETTER WAW | |
63 | ِي <> ī ; # ARABIC KASRA, ARABIC LETTER YEH | |
64 | ||
65 | # longer items moved here, ahead of the single-letter rules further down (plain t, d, s, z, g), to prevent those shorter Latin forms from masking them | |
66 | ث <> t h $disambig ; # ARABIC LETTER THEH | |
67 | ذ <> d h $disambig ; # ARABIC LETTER THAL | |
68 | ش <> s h $disambig ; # ARABIC LETTER SHEEN | |
69 | ص <> s $under ; # ARABIC LETTER SAD | |
70 | ض <> d $under ; # ARABIC LETTER DAD | |
71 | ط <> t $under ; # ARABIC LETTER TAH | |
72 | ظ <> z $under ; # ARABIC LETTER ZAH | |
73 | غ <> g h $disambig ; # ARABIC LETTER GHAIN | |
74 | ||
75 | # WARNING: special case | |
76 | # <t, umlaut, half-ring below> will be canonically ordered as <t, half-ring below, umlaut> | |
77 | # so on the return, we have to skip over (but preserve) the half-ring below (or others like it) | |
78 | # ةٕ < ẗ̹ ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS | |
79 | ||
80 | ة <> t \u0308 ; # ARABIC LETTER TEH MARBUTA <> t + COMBINING DIAERESIS | |
81 | ة | $1 < t ($notAbove+) \u0308 ; # reverse only: t + one or more $notAbove marks + diaeresis -> TEH MARBUTA; the captured marks ($1) are re-emitted after the cursor so later rules still convert them | |
82 | ||
83 | # non-Arabic language | |
84 | ژ <> z h $disambig ; # ARABIC LETTER JEH | |
85 | ڭ <> n $disambig g ; # ARABIC LETTER NG | |
86 | ۋ <> v $disambig ; # ARABIC LETTER VE | |
87 | ی <> y $disambig2 ; # ARABIC LETTER FARSI YEH | |
88 | ||
89 | # Arabic language | |
90 | ||
91 | ء <> ʾ ; # ARABIC LETTER HAMZA | |
92 | ا <> a $under; # ARABIC LETTER ALEF | |
93 | ب <> b ; # ARABIC LETTER BEH | |
94 | ت <> t ; # ARABIC LETTER TEH | |
95 | ج <> j ; # ARABIC LETTER JEEM | |
96 | ح <> h $under ; # ARABIC LETTER HAH | |
97 | خ <> k h $disambig ; # ARABIC LETTER KHAH | |
98 | د <> d ; # ARABIC LETTER DAL | |
99 | ر <> r ; # ARABIC LETTER REH | |
100 | ز <> z ; # ARABIC LETTER ZAIN | |
101 | س <> s ; # ARABIC LETTER SEEN | |
102 | ع <> ʿ ; # ARABIC LETTER AIN | |
103 | ـ > ; # ARABIC TATWEEL (forward only: removed, no reverse mapping) | |
104 | ف <> f ; # ARABIC LETTER FEH | |
105 | ق <> q ; # ARABIC LETTER QAF | |
106 | ك <> k ; # ARABIC LETTER KAF | |
107 | ل <> l ; # ARABIC LETTER LAM | |
108 | م <> m ; # ARABIC LETTER MEEM | |
109 | ن <> n ; # ARABIC LETTER NOON | |
110 | ه <> h ; # ARABIC LETTER HEH | |
111 | و <> w ; # ARABIC LETTER WAW | |
112 | ى <> y $disambig ; # ARABIC LETTER ALEF MAKSURA | |
113 | ي <> y ; # ARABIC LETTER YEH | |
114 | ً <> aⁿ ; # ARABIC FATHATAN | |
115 | ٌ <> uⁿ ; # ARABIC DAMMATAN | |
116 | ٍ <> iⁿ ; # ARABIC KASRATAN | |
117 | َ <> a ; # ARABIC FATHA | |
118 | ُ <> u ; # ARABIC DAMMA | |
119 | ِ <> i ; # ARABIC KASRA | |
120 | ّ <> ̃ ; # ARABIC SHADDA <> COMBINING TILDE (U+0303) | |
121 | ْ <> ̊ ; # ARABIC SUKUN <> COMBINING RING ABOVE (U+030A) | |
122 | ||
123 | # special combining marks | |
124 | ٓ <> ̂ ; # ARABIC MADDAH ABOVE <> COMBINING CIRCUMFLEX ACCENT (U+0302) | |
125 | ٔ <> ̉ ; # ARABIC HAMZA ABOVE <> COMBINING HOOK ABOVE (U+0309) | |
126 | ٕ <> ̹ ; # ARABIC HAMZA BELOW <> COMBINING RIGHT HALF RING BELOW (U+0339) | |
127 | ||
128 | # Some non-Arabic language (not in UNGEGN) | |
129 | پ <> p ; # ARABIC LETTER PEH | |
130 | چ <> c h $disambig ; # ARABIC LETTER TCHEH | |
131 | ڤ <> v ; # ARABIC LETTER VEH | |
132 | # ڥ <> v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW | |
133 | # ڢ <> f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW | |
134 | گ <> g ; # ARABIC LETTER GAF | |
135 | ||
136 | # fallbacks (reverse direction only): rewrite Latin input that has no rule above into text that does; the leading | leaves the cursor before the replacement so it is transliterated again | |
137 | | s < c } [eiy]; # c followed by e, i or y is treated as s | |
138 | | k < c ; # any other c is treated as k | |
139 | | i < e ; | |
140 | | u < o ; | |
141 | | ks < x ; | |
142 | | n < ⁿ; # a superscript n not consumed by the aⁿ/uⁿ/iⁿ rules is treated as plain n | |
143 | ||
144 | :: (lower) ; # reverse direction only: lowercase the Latin input | |
145 | ::NFC (NFD); # forward direction applies NFC; reverse direction applies NFD | |
146 | :: ( [[:Latin:] [%,.0-9;?\u02BE-\u02BF\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339\u037E]] ); # reverse-direction filter: only characters in this set are handled when going Latin -> Arabic