]>
Commit | Line | Data |
---|---|---|
b75a7d8f | 1 | # |
73c04bcf | 2 | # Copyright (C) 2002-2006, International Business Machines Corporation |
374ca955 | 3 | # and others. All Rights Reserved. |
b75a7d8f | 4 | # |
374ca955 | 5 | # file: word.txt |
b75a7d8f | 6 | # |
374ca955 | 7 | # ICU Word Break Rules |
b75a7d8f | 8 | # See Unicode Standard Annex #29. |
73c04bcf A |
9 | # These rules are based on Unicode Version 5.0.0 |
10 | # Includes post Unicode 5.0 change to treat Japanese half width voicing marks | |
11 | # as Extend | |
b75a7d8f | 12 | # |
73c04bcf A |
13 | # Note: Updates to word.txt will usually need to be merged into |
14 | # word_POSIX.txt and word_ja.txt also. | |
b75a7d8f | 15 | |
374ca955 | 16 | ############################################################################## |
b75a7d8f A |
17 | # |
18 | # Character class definitions from TR 29 | |
19 | # | |
374ca955 A |
20 | ############################################################################## |
21 | ||
22 | !!chain; # rule-builder option; presumably turns on rule chaining for the rule sets below - confirm against the ICU break-rule docs | |
23 | ||
b75a7d8f A |
24 | |
25 | # | |
26 | # Character Class Definitions. | |
b75a7d8f | 27 | # |
b75a7d8f | 28 | |
73c04bcf A |
29 | $VoiceMarks = [\uff9e\uff9f]; # the two half width voicing marks; removed from $Katakana and added to $Extend |
30 | $Format = [\p{Word_Break = Format}]; | |
31 | $Katakana = [\p{Word_Break = Katakana}-$VoiceMarks]; # Word_Break=Katakana minus the voicing marks | |
32 | $ALetter = [\p{Word_Break = ALetter}]; | |
33 | $MidLetter = [\p{Word_Break = MidLetter}]; | |
34 | $MidNum = [\p{Word_Break = MidNum}]; | |
35 | $Numeric = [\p{Word_Break = Numeric}]; | |
36 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; | |
37 | ||
38 | ||
374ca955 A |
39 | $CR = \u000d; # carriage return |
40 | $LF = \u000a; # line feed | |
73c04bcf A |
41 | $Extend = [\p{Grapheme_Cluster_Break = Extend}$VoiceMarks]; # Grapheme_Cluster_Break=Extend plus the half width voicing marks |
42 | $Control = [\p{Grapheme_Cluster_Break = Control}]; # in the rules shown here, only subtracted from $dictionary when building $ALetterPlus | |
43 | ||
44 | # Dictionary character set, for triggering language-based break engines. Currently | |
45 | # limited to LineBreak=Complex_Context. Note that this set only works in Unicode | |
46 | # 5.0 or later as the definition of Complex_Context was corrected to include all | |
47 | # characters requiring dictionary break. | |
48 | ||
49 | $dictionary = [:LineBreak = Complex_Context:]; # characters handed to the dictionary-based break engines | |
50 | $ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # ALetter, plus dictionary characters that are neither Extend nor Control | |
51 | ||
52 | ||
53 | # | |
54 | # Rule 3 Grapheme Clusters behave like their first char. | |
55 | # Rule 4 Ignore trailing Format characters (Also see note in TR 29) | |
56 | # | |
57 | $KatakanaEx = $Katakana ($Extend | $Format)*; # each ...Ex is the base class followed by any run of Extend/Format characters | |
58 | $ALetterEx = $ALetterPlus ($Extend | $Format)*; # built on $ALetterPlus, so dictionary characters are treated as letters | |
59 | $MidLetterEx = $MidLetter ($Extend | $Format)*; | |
60 | $MidNumEx = $MidNum ($Extend | $Format)*; | |
61 | $NumericEx = $Numeric ($Extend | $Format)*; | |
62 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; | |
63 | ||
374ca955 A |
64 | $Hiragana = [:Hiragana:]; |
65 | $Ideographic = [:IDEOGRAPHIC:]; | |
73c04bcf A |
66 | $HiraganaEx = $Hiragana ($Extend | $Format)*; # same trailing Extend/Format absorption as the classes above |
67 | $IdeographicEx = $Ideographic ($Extend | $Format)*; | |
b75a7d8f | 68 | |
374ca955 | 69 | ## ------------------------------------------------- |
b75a7d8f | 70 | |
374ca955 | 71 | !!forward; |
b75a7d8f | 72 | |
b75a7d8f | 73 | |
73c04bcf A |
74 | # Rule 3 - CR x LF |
75 | # see character breaks. | |
76 | ||
77 | $CR $LF ($Extend | $Format)*; # CR LF stays together, along with any Extend/Format characters that follow | |
78 | ||
79 | # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning | |
80 | # of a region of Text. The rule here comes into play when the start of text | |
81 | # begins with a group of Format chars, or with a "word" consisting of a single | |
82 | # char that is not in any of the listed word break categories followed by | |
83 | # format char(s). | |
84 | .? ($Extend | $Format)+; # optional single character of any kind, then at least one Extend/Format | |
85 | ||
b75a7d8f | 86 | |
374ca955 A |
87 | $NumericEx {100}; # single-class matches; {n} is the status tag attached to the match |
88 | $ALetterEx {200}; # tag values presumably follow ICU's word-break status ranges (number/letter/kana/ideo) - confirm against ubrk.h | |
89 | $KatakanaEx {300}; | |
73c04bcf A |
90 | $HiraganaEx {300}; # Hiragana shares the {300} tag with Katakana |
91 | $IdeographicEx {400}; | |
b75a7d8f | 92 | |
374ca955 | 93 | # rule 5 - ALetter x ALetter |
b75a7d8f | 94 | |
73c04bcf | 95 | $ALetterEx $ALetterEx {200}; |
b75a7d8f | 96 | |
374ca955 | 97 | # rule 6 and 7 - ALetter x MidLetter ALetter |
73c04bcf | 98 | $ALetterEx $MidLetterEx $ALetterEx {200}; |
b75a7d8f | 99 | |
374ca955 | 100 | # rule 8 - Numeric x Numeric |
b75a7d8f | 101 | |
73c04bcf | 102 | $NumericEx $NumericEx {100}; |
b75a7d8f | 103 | |
374ca955 | 104 | # rule 9 - ALetter x Numeric ($ALetterEx already absorbs trailing Extend/Format) |
b75a7d8f | 105 | |
374ca955 | 106 | $ALetterEx $NumericEx {200}; |
b75a7d8f | 107 | |
374ca955 | 108 | # rule 10 - Numeric x ALetter |
b75a7d8f | 109 | |
73c04bcf | 110 | $NumericEx $ALetterEx {200}; |
b75a7d8f | 111 | |
374ca955 A |
112 | # rule 11 and 12 - Numeric x MidNum Numeric |
113 | ||
73c04bcf | 114 | $NumericEx $MidNumEx $NumericEx {100}; |
374ca955 A |
115 |
116 | # rule 13 - Katakana x Katakana | |
117 | ||
73c04bcf | 118 | $KatakanaEx $KatakanaEx {300}; |
374ca955 A |
119 |
120 | # rule 13a/b - ExtendNumLet joins to ALetter, Numeric, Katakana and ExtendNumLet on either side | |
121 | ||
73c04bcf A |
122 | $ALetterEx $ExtendNumLetEx {200}; # (13a) |
123 | $NumericEx $ExtendNumLetEx {100}; # (13a) | |
124 | $KatakanaEx $ExtendNumLetEx {300}; # (13a) | |
125 | $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) | |
374ca955 | 126 | |
73c04bcf A |
127 | $ExtendNumLetEx $ALetterEx {200}; # (13b) |
128 | $ExtendNumLetEx $NumericEx {100}; # (13b) | |
129 | $ExtendNumLetEx $KatakanaEx {300}; # (13b) | |
374ca955 A |
130 | |
131 | ||
132 | ||
133 | ## ------------------------------------------------- | |
b75a7d8f | 134 | |
374ca955 A |
135 | !!reverse; # the rules in this section are written in reverse text order (e.g. $LF $CR below) |
136 | ||
73c04bcf A |
137 | $BackALetterEx = ($Format | $Extend)* $ALetterPlus; # mirror of $ALetterEx: the Format/Extend run comes before the base class |
138 | $BackNumericEx = ($Format | $Extend)* $Numeric; | |
139 | $BackMidNumEx = ($Format | $Extend)* $MidNum; | |
140 | $BackMidLetterEx = ($Format | $Extend)* $MidLetter; | |
141 | $BackKatakanaEx = ($Format | $Extend)* $Katakana; | |
142 | $BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet; | |
374ca955 | 143 | |
73c04bcf A |
144 | # rule 3 - CR x LF, written backwards |
145 | ($Format | $Extend)* $LF $CR; | |
374ca955 | 146 | |
73c04bcf A |
147 | # rule 4 - any run of Format/Extend characters, then at most one other character |
148 | ($Format | $Extend)* .?; | |
374ca955 A |
149 |
150 | # rule 5 - ALetter x ALetter | |
151 | ||
73c04bcf | 152 | $BackALetterEx $BackALetterEx; |
374ca955 A |
153 |
154 | # rule 6 and 7 - ALetter x MidLetter ALetter | |
155 | ||
73c04bcf | 156 | $BackALetterEx $BackMidLetterEx $BackALetterEx; |
374ca955 A |
157 |
158 | ||
159 | # rule 8 - Numeric x Numeric | |
160 | ||
73c04bcf | 161 | $BackNumericEx $BackNumericEx; |
374ca955 A |
162 |
163 | # rule 9 - ALetter x Numeric (operands appear swapped because the pattern is reversed) | |
164 | ||
73c04bcf | 165 | $BackNumericEx $BackALetterEx; |
374ca955 A |
166 |
167 | # rule 10 - Numeric x ALetter (operands appear swapped because the pattern is reversed) | |
168 | ||
73c04bcf | 169 | $BackALetterEx $BackNumericEx; |
374ca955 A |
170 |
171 | # rule 11 and 12 - Numeric x MidNum Numeric | |
172 | ||
73c04bcf | 173 | $BackNumericEx $BackMidNumEx $BackNumericEx; |
374ca955 A |
174 |
175 | # rule 13 - Katakana x Katakana | |
176 | ||
73c04bcf | 177 | $BackKatakanaEx $BackKatakanaEx; |
374ca955 A |
178 |
179 | # rules 13 a/b - first line mirrors the forward (13b) rules plus ExtendNumLet x ExtendNumLet, second line mirrors (13a) | |
b75a7d8f | 180 | # |
73c04bcf A |
181 | ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx) $BackExtendNumLetEx; |
182 | $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx); | |
374ca955 A |
183 | |
184 | ## ------------------------------------------------- | |
185 | ||
186 | !!safe_reverse; | |
187 | ||
188 | # rule 4 - a run of Extend/Format characters, then at most one other character | |
73c04bcf | 189 | ($Extend | $Format)+ .?; |
374ca955 A |
190 |
191 | # rule 6 - a MidLetter together with the letter on its other side | |
73c04bcf | 192 | $MidLetter $BackALetterEx; |
374ca955 A |
193 |
194 | # rule 11 - a MidNum together with the digit on its other side | |
73c04bcf A |
195 | $MidNum $BackNumericEx; |
196 | ||
197 | # For dictionary-based break: two adjacent dictionary characters | |
198 | $dictionary $dictionary; | |
374ca955 A |
199 | |
200 | ## ------------------------------------------------- | |
201 | ||
202 | !!safe_forward; | |
203 | ||
374ca955 | 204 | # rule 4 - a run of Extend/Format characters, then at most one other character |
73c04bcf | 205 | ($Extend | $Format)+ .?; |
374ca955 A |
206 |
207 | # rule 6 - a MidLetter followed by a letter | |
73c04bcf | 208 | $MidLetterEx $ALetterEx; |
b75a7d8f | 209 | |
374ca955 | 210 | # rule 11 - a MidNum followed by a digit |
73c04bcf A |
211 | $MidNumEx $NumericEx; |
212 | ||
213 | # For dictionary-based break: two adjacent dictionary characters | |
214 | $dictionary $dictionary;