]> git.saurik.com Git - apple/icu.git/blame - icuSources/data/translit/Arabic_Latin.txt
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / data / translit / Arabic_Latin.txt
CommitLineData
374ca955
A
1#--------------------------------------------------------------------
2# Copyright (c) 1999-2004, International Business Machines
3# Corporation and others. All Rights Reserved.
4#--------------------------------------------------------------------
5
6# Generally follows UNGEGN <http://www.eki.ee/wgrs/rom1_ar.pdf>
7# Occasionally deviates in the direction of ISO 233 <http://homepage.mac.com/sirbinks/pdf/Arabic.pdf>
8# a) where required for disambiguation.
9# b) with underdot instead of cedilla for letters like SAD, since
10# those are explicitly in Unicode for transliteration.
11# c) with extra non-Arabic-language letters, like PEH
12
13# Does *not* do assimilation of "al", nor hyphenation.
14# While it could be done, we need to determine whether a prefix "al" could
15# occur other than as the definite article (since no space is used).
16
# Forward-direction filter: only characters in this set are handed to the rules below.
17:: [[:Arabic:] [‎ⁿ\u060C\u061B\u061F\u0640\u064B-\u0655\u0660-\u066C\u06F0-\u06F9\uFDFC]] ;
# Normalize to NFKD going Arabic -> Latin; the parenthesized NFC applies in the reverse direction.
18:: NFKD (NFC);
# Combining marks attached to the Latin output. $disambig and $disambig2 keep apart
# Arabic letters that would otherwise share a romanization (e.g. "y", "y $disambig"
# and "y $disambig2" further down); $under is the underdot used for letters such as SAD.
19$disambig = ̱ ;
20$disambig2 = ̰ ;
21$under = ̣ ;
22
# Marks whose canonical combining class is neither 0 nor 230; used by the
# TEH MARBUTA reverse rule to step over marks sitting between "t" and the diaeresis.
23$notAbove = [[:^ccc=0:]&[:^ccc=230:]];
24
25# non-letters
# Each "<>" rule is bidirectional: Arabic on the left, Latin on the right.
26
# The Arabic decimal and thousands separators carry $disambig so that, on the
# return, they are not confused with a plain '.' or with ARABIC COMMA (',').
27 ٫ <> '.' $disambig ; # ARABIC DECIMAL SEPARATOR
28 ٬ <> ',' $disambig ; # ARABIC THOUSANDS SEPARATOR
29# ٭ <> ; # ARABIC FIVE POINTED STAR // no need to transliterate
30
31، <> ',' ; # ARABIC COMMA
32 ؛ <> ';' ; # ARABIC SEMICOLON
33 ؟ <> '?' ; # ARABIC QUESTION MARK
34 ٪ <> '%' ; # ARABIC PERCENT SIGN
35
# Extended Arabic-Indic digits map to ASCII digit + $disambig; the plain
# Arabic-Indic digits below map to the bare ASCII digit, so both sets round-trip.
36 ۰ <> 0 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ZERO
37 ۱ <> 1 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ONE
38 ۲ <> 2 $disambig ; # EXTENDED ARABIC-INDIC DIGIT TWO
39 ۳ <> 3 $disambig ; # EXTENDED ARABIC-INDIC DIGIT THREE
40 ۴ <> 4 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FOUR
41 ۵ <> 5 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FIVE
42 ۶ <> 6 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SIX
43 ۷ <> 7 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SEVEN
44 ۸ <> 8 $disambig ; # EXTENDED ARABIC-INDIC DIGIT EIGHT
45 ۹ <> 9 $disambig ; # EXTENDED ARABIC-INDIC DIGIT NINE
46
47 ٠ <> 0 ; # ARABIC-INDIC DIGIT ZERO
48 ١ <> 1 ; # ARABIC-INDIC DIGIT ONE
49 ٢ <> 2 ; # ARABIC-INDIC DIGIT TWO
50 ٣ <> 3 ; # ARABIC-INDIC DIGIT THREE
51 ٤ <> 4 ; # ARABIC-INDIC DIGIT FOUR
52 ٥ <> 5 ; # ARABIC-INDIC DIGIT FIVE
53 ٦ <> 6 ; # ARABIC-INDIC DIGIT SIX
54 ٧ <> 7 ; # ARABIC-INDIC DIGIT SEVEN
55 ٨ <> 8 ; # ARABIC-INDIC DIGIT EIGHT
56 ٩ <> 9 ; # ARABIC-INDIC DIGIT NINE
57
58# letters
59
60# long vowels
# A short-vowel mark followed by its matching letter becomes one macron vowel.
# These two-character rules come before the single-character rules for the same
# marks and letters further down, so the pair is matched first.
61 َا<> ā ; # ARABIC FATHA, ARABIC LETTER ALEF
62 ُو <> ū ; # ARABIC DAMMA, ARABIC LETTER WAW
63 ِي <> ī ; # ARABIC KASRA, ARABIC LETTER YEH
64
65# longer items moved here to prevent masking
# The Latin side of each rule here is longer than a bare "t", "d", "s" or "z",
# so on the return it must be tried before the one-letter rules for those letters.
66 ث <> t h $disambig ; # ARABIC LETTER THEH
67 ذ <> d h $disambig ; # ARABIC LETTER THAL
68 ش <> s h $disambig ; # ARABIC LETTER SHEEN
69 ص <> s $under ; # ARABIC LETTER SAD
70 ض <> d $under ; # ARABIC LETTER DAD
71 ط <> t $under ; # ARABIC LETTER TAH
72 ظ <> z $under ; # ARABIC LETTER ZAH
73 غ <> g h $disambig ; # ARABIC LETTER GHAIN
74
75# WARNING: special case
76# <t, umlaut, half-ring below> will be canonically ordered as <t, half-ring below, umlaut>
77# so on the return, we have to skip over (but preserve) the half-ring below (or others like it)
78# ةٕ < ẗ̹ ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
79
# Plain case: TEH MARBUTA <-> "t" + COMBINING DIAERESIS (\u0308), both directions.
80 ة <> t \u0308 ; # ARABIC LETTER TEH MARBUTA
# Reverse-only case: one or more $notAbove marks sit between "t" and the diaeresis.
# They are captured as $1 and re-emitted after the cursor ("|"), so TEH MARBUTA is
# produced and the skipped marks are still converted by their own rules.
81 ة | $1 < t ($notAbove+) \u0308 ; # ARABIC LETTER TEH MARBUTA
82
83# non-Arabic language
# Placed ahead of the plain "z", "n", "v" and "y" rules so the marked forms are
# matched first on the return. FARSI YEH takes $disambig2 because "y $disambig"
# is already used for ALEF MAKSURA.
84 ژ <> z h $disambig ; # ARABIC LETTER JEH
85 ڭ <> n $disambig g ; # ARABIC LETTER NG
86 ۋ <> v $disambig ; # ARABIC LETTER VE
87 ی <> y $disambig2 ; # ARABIC LETTER FARSI YEH
88
89# Arabic language
# One rule per letter or vowel mark. Where two rules start with the same Latin
# letter, the one with the longer Latin side (extra mark, "h", or superscript n)
# is listed first so that it wins on the return.
90
91 ء <> ʾ ; # ARABIC LETTER HAMZA
92 ا <> a $under; # ARABIC LETTER ALEF
93 ب <> b ; # ARABIC LETTER BEH
94 ت <> t ; # ARABIC LETTER TEH
95 ج <> j ; # ARABIC LETTER JEEM
96 ح <> h $under ; # ARABIC LETTER HAH
97 خ <> k h $disambig ; # ARABIC LETTER KHAH
98 د <> d ; # ARABIC LETTER DAL
99 ر <> r ; # ARABIC LETTER REH
100 ز <> z ; # ARABIC LETTER ZAIN
101 س <> s ; # ARABIC LETTER SEEN
102 ع <> ʿ ; # ARABIC LETTER AIN
103 ـ > ; # ARABIC TATWEEL (forward-only: deleted, so it is not restored on the return)
104 ف <> f ; # ARABIC LETTER FEH
105 ق <> q ; # ARABIC LETTER QAF
106 ك <> k ; # ARABIC LETTER KAF
107 ل <> l ; # ARABIC LETTER LAM
108 م <> m ; # ARABIC LETTER MEEM
109 ن <> n ; # ARABIC LETTER NOON
110 ه <> h ; # ARABIC LETTER HEH
111 و <> w ; # ARABIC LETTER WAW
112 ى <> y $disambig ; # ARABIC LETTER ALEF MAKSURA
113 ي <> y ; # ARABIC LETTER YEH
114 ً <> aⁿ ; # ARABIC FATHATAN
115 ٌ <> uⁿ ; # ARABIC DAMMATAN
116 ٍ <> iⁿ ; # ARABIC KASRATAN
117 َ <> a ; # ARABIC FATHA
118 ُ <> u ; # ARABIC DAMMA
119 ِ <> i ; # ARABIC KASRA
120 ّ <> ̃ ; # ARABIC SHADDA
121 ْ <> ̊ ; # ARABIC SUKUN
122
123# special combining marks
# Each Arabic combining mark maps to a single Latin combining mark.
124 ٓ <> ̂ ; # ARABIC MADDAH ABOVE
125 ٔ <> ̉ ; # ARABIC HAMZA ABOVE
126 ٕ <> ̹ ; # ARABIC HAMZA BELOW
127
128# Some non-Arabic language (not in UNGEGN)
129 پ <> p ; # ARABIC LETTER PEH
130 چ <> c h $disambig ; # ARABIC LETTER TCHEH
131 ڤ <> v ; # ARABIC LETTER VEH
# NOTE(review): the two disabled rules below were presumably left out to avoid
# clashing with existing romanizations ("v $disambig" is already ARABIC LETTER VE) - confirm.
132# ڥ <> v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW
133# ڢ <> f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
134 گ <> g ; # ARABIC LETTER GAF
135
136# fallbacks
# Reverse-only ("<") rules for Latin letters that no rule above produces.
# Each rewrites the letter to Latin text that does have a rule; the leading "|"
# puts the cursor before the replacement so it is matched again by the rules above.
137| s < c } [eiy]; # "c" followed by e, i or y is treated as "s"
138| k < c ; # any other "c" is treated as "k"
139| i < e ;
140| u < o ;
141| ks < x ;
142| n < ‎ⁿ; # a superscript n not consumed by the aⁿ / uⁿ / iⁿ rules becomes plain "n"
143
# Parenthesized parts apply only in the reverse (Latin -> Arabic) direction.
# Reverse only: lowercase the Latin input (the rules above use lowercase Latin only).
144:: (lower) ;
# Forward: recompose the Latin output to NFC. Reverse: decompose Latin input to NFD
# so that base letters and combining marks are separate for the rules above.
145::NFC (NFD);
# Reverse-direction filter: Latin-script characters plus the punctuation, digits,
# modifier letters and combining marks listed here.
146:: ( [[:Latin:] [%,.0-9;?\u02BE-\u02BF\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339\u037E]] );