2 # Copyright (C) 2002-2004, International Business Machines Corporation
3 # and others. All Rights Reserved.
8 # See Unicode Standard Annex #29.
9 # These rules are based on Version 4.1 draft, dated 2004-11-11
12 ##############################################################################
14 # Character class definitions from TR 29
16 ##############################################################################
# $Katakana: every character whose Script property is Katakana, unioned with
# the individually named kana repeat marks, voiced / semi-voiced sound marks,
# double hyphen and prolonged sound marks listed below.  Each [:name = ...:]
# term selects the single character that carries that Unicode name.
20 $Katakana = [[:Script = KATAKANA:]
21 [:name = VERTICAL KANA REPEAT MARK:]
22 [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK:]
23 [:name = VERTICAL KANA REPEAT MARK UPPER HALF:]
24 [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF:]
25 [:name = VERTICAL KANA REPEAT MARK LOWER HALF:]
26 [:name = KATAKANA-HIRAGANA VOICED SOUND MARK:]
27 [:name = KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK:]
28 [:name = KATAKANA-HIRAGANA DOUBLE HYPHEN:]
29 [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
30 [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
31 [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
32 [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
# $ALetter: Alphabetic characters plus HEBREW PUNCTUATION GERESH, with
# Hiragana-script characters and Grapheme_Extend characters removed.
# Set operators bind left to right, so each "-" subtracts from the union
# built up to its left.
# NOTE(review): only these two subtractions are visible in this copy; confirm
# against the complete rule file that no further exclusions are intended.
35 $ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
38 - [:Script = Hiragana:]
40 - [:Grapheme_Extend = TRUE:]];
# $MidLetter: punctuation allowed between two letters of one word (it is used
# by the "$ALetterEx $Format* $MidLetterEx $Format* $ALetterEx" rule):
# apostrophe, middle dot, Hebrew gershayim, right single quotation mark and
# hyphenation point.
# NOTE(review): the closing "]];" of this set is not present in the lines
# visible here; confirm in the complete file that the definition is terminated
# and whether further members follow.
42 $MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
43 [:name = HEBREW PUNCTUATION GERSHAYIM:]
44 [:name = RIGHT SINGLE QUOTATION MARK:]
45 [:name = HYPHENATION POINT:]
# $MidNum: characters whose Line_Break property is Infix_Numeric, except COLON.
49 $MidNum = [[:LineBreak = Infix_Numeric:] - [:name = COLON:]];
# $Numeric: decimal digits (general category Nd) plus the Arabic decimal and
# thousands separators.
50 $Numeric = [[:Nd:][:name = ARABIC DECIMAL SEPARATOR:][:name = ARABIC THOUSANDS SEPARATOR:]];
# $ExtendNumLet: Connector_Punctuation characters, with KATAKANA MIDDLE DOT
# and HALFWIDTH KATAKANA MIDDLE DOT removed.
51 $ExtendNumLet = [[:Connector_Punctuation:]
52 - [:name = KATAKANA MIDDLE DOT:]
53 - [:name = HALFWIDTH KATAKANA MIDDLE DOT:]];
58 # Character Class Definitions.
59 # The names are those from TR29.
#   $Extend      - every character with Grapheme_Extend = TRUE.
#   $Control     - line separators (Zl), paragraph separators (Zp), control
#                  characters (Cc) and format characters (Cf), minus $Extend.
#   $Format      - format characters (Cf), minus $Extend.
#   $Hiragana    - characters matched by [:Hiragana:].
#   $Ideographic - characters with the Ideographic property.
64 $Extend = [[:Grapheme_Extend = TRUE:]];
65 $Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend];
66 $Format = [[:Cf:] - $Extend];
67 $Hiragana = [:Hiragana:];
68 $Ideographic = [:IDEOGRAPHIC:];
# "Ex" forms: each base class followed by zero or more $Extend characters, so
# the rules that use them match a base character together with any
# Grapheme_Extend characters that trail it.
70 $ALetterEx = $ALetter $Extend*;
71 $NumericEx = $Numeric $Extend*;
72 $MidNumEx = $MidNum $Extend*;
73 $MidLetterEx = $MidLetter $Extend*;
74 $KatakanaEx = $Katakana $Extend*;
75 $ExtendNumLetEx = $ExtendNumLet $Extend*;
77 ## -------------------------------------------------
82 # Rule 3 - don't break grapheme clusters.
83 # see character breaks
86 #[^$Control] $Extend*;
87 #$NumericEx $Extend* {100};
88 #$ALetterEx $Extend* {200};
# Letter / digit chaining rules.  Each rule matches a sequence that is kept
# together (no boundary inside the match); $Format* allows format characters
# between the units.  The trailing {nnn} is the rule status tag: here {100} is
# used on the digit-only rules and {200} on rules that involve a letter.
# NOTE(review): in ICU's ubrk.h these values line up with UBRK_WORD_NUMBER
# (100) and UBRK_WORD_LETTER (200) - confirm against the ICU version in use.
# Letter followed by letter.
96 $ALetterEx $Format* $ALetterEx {200};
# Letter, one $MidLetter character, letter.
99 $ALetterEx $Format* $MidLetterEx $Format* $ALetterEx {200};
# Digit followed by digit.
103 $NumericEx $Format* $NumericEx {100};
# Letter followed by digit.
107 $ALetterEx $Format* $NumericEx {200};
# Digit followed by letter.
111 $NumericEx $Format* $ALetterEx {200};
# Digit, one $MidNum character, digit.
115 $NumericEx $Format* $MidNumEx $Format* $NumericEx {100};
# Katakana followed by Katakana is kept together (status tag {300}).
119 $KatakanaEx $Format* $KatakanaEx {300};
# A single Hiragana or Ideographic character is matched together with any
# $Extend characters that follow it; the two rules carry different status
# tags ({300} and {400}).
# NOTE(review): ICU's ubrk.h defines UBRK_WORD_KANA as 300 and UBRK_WORD_IDEO
# as 400 - confirm against the ICU version in use.
120 $Hiragana $Extend* {300};
121 $Ideographic $Extend* {400};
# (13a) A letter, digit, Katakana or $ExtendNumLet character followed by an
# $ExtendNumLet character is kept together.  The status tag follows the kind
# of the leading unit; the ExtendNumLet-ExtendNumLet rule uses {200}.
125 $ALetterEx $Format* $ExtendNumLetEx {200}; # (13a)
126 $NumericEx $Format* $ExtendNumLetEx {100}; # (13a)
127 $KatakanaEx $Format* $ExtendNumLetEx {300}; # (13a)
128 $ExtendNumLetEx $Format* $ExtendNumLetEx{200}; # (13a)
# (13b) An $ExtendNumLet character followed by a letter, digit or Katakana is
# kept together.  The status tag follows the kind of the trailing unit.
130 $ExtendNumLetEx $Format* $ALetterEx {200}; # (13b)
131 $ExtendNumLetEx $Format* $NumericEx {100}; # (13b)
132 $ExtendNumLetEx $Format* $KatakanaEx {300}; # (13b)
136 ## -------------------------------------------------
# "Back...Ex" forms: the same base classes as the "...Ex" variables, but with
# the optional $Extend* run placed BEFORE the base character.
# NOTE(review): presumably this mirrors the order in which characters are met
# when text is scanned backwards; the section marker that would confirm this
# is not visible in this chunk.
140 $BackALetterEx = $Extend* $ALetter;
141 $BackNumericEx = $Extend* $Numeric;
142 $BackMidNumEx = $Extend* $MidNum;
143 $BackMidLetterEx = $Extend* $MidLetter;
144 $BackKatakanaEx = $Extend* $Katakana;
145 $BackExtendNumLetEx= $Extend* $ExtendNumLet;
149 # see character breaks
# Any run of $Extend characters followed by one non-$Control character.
151 $Extend* [^$Control];
# The rules below repeat the letter / digit / Katakana / ExtendNumLet pairings
# using the "Back...Ex" variables; none of them carries a status tag.
# Letter with letter.
155 $BackALetterEx $Format* $BackALetterEx;
# Letter, one $MidLetter character, letter.
159 $BackALetterEx $Format* $BackMidLetterEx $Format* $BackALetterEx;
# Digit with digit.
164 $BackNumericEx $Format* $BackNumericEx;
# Digit with letter, in either order.
168 $BackNumericEx $Format* $BackALetterEx;
172 $BackALetterEx $Format* $BackNumericEx;
# Digit, one $MidNum character, digit.
176 $BackNumericEx $Format* $BackMidNumEx $Format* $BackNumericEx;
# Katakana with Katakana.
180 $BackKatakanaEx $Format* $BackKatakanaEx;
# $ExtendNumLet paired with a letter, digit, Katakana or another
# $ExtendNumLet; the first rule has $ExtendNumLet as the second unit, the
# second rule has it as the first unit.
184 ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx) $Format* $BackExtendNumLetEx;
185 $BackExtendNumLetEx $Format* ($BackALetterEx | $BackNumericEx | $BackKatakanaEx);
187 ## -------------------------------------------------
# NOTE(review): the section header for the rules below is not visible in this
# chunk; their use of the "Back...Ex" variables suggests they belong to the
# backwards-scanning rule sets - confirm against the complete file.
193 $Extend+; # comes into play when buffer _begins_ with an $Extend+.
# One or more $Format characters followed by any of the "Back...Ex" units.
196 $Format+ $BackALetterEx;
197 $Format+ $BackNumericEx;
198 $Format+ $BackMidLetterEx;
199 $Format+ $BackMidNumEx;
200 $Format+ $BackKatakanaEx;
201 $Format+ $BackExtendNumLetEx;
# A bare $MidLetter / $MidNum character (no $Extend* prefix), optional
# $Format characters, then a letter / digit unit.
205 $MidLetter $Format* $BackALetterEx;
208 $MidNum $Format* $BackNumericEx;
210 ## -------------------------------------------------
# NOTE(review): the section header for the rules below is not visible in this
# chunk; they use the forward-ordered "...Ex" variables - confirm which rule
# set they belong to against the complete file.
# Optional $Extend characters, at least one $Format character, then a unit.
218 $Extend* $Format+ $ALetterEx;
219 $Extend* $Format+ $NumericEx;
220 $Extend* $Format+ $MidLetterEx;
221 $Extend* $Format+ $MidNumEx;
222 $Extend* $Format+ $KatakanaEx;
223 $Extend* $Format+ $ExtendNumLetEx;
# At least one $Extend character, optional $Format characters, then a unit.
225 $Extend+ $Format* $ALetterEx;
226 $Extend+ $Format* $NumericEx;
227 $Extend+ $Format* $MidLetterEx;
228 $Extend+ $Format* $MidNumEx;
229 $Extend+ $Format* $KatakanaEx;
230 $Extend+ $Format* $ExtendNumLetEx;
# A $MidLetter / $MidNum unit, optional $Format characters, then a letter /
# digit unit.
233 $MidLetterEx $Format* $ALetterEx;
236 $MidNumEx $Format* $NumericEx;