]>
Commit | Line | Data |
---|---|---|
340931cb A |
1 | ## wordNLLTu6.txt |
2 | ## The following corresponds to source file from CoreNLP of 2018-Jan-17: | |
3 | ## CoreNLP/Tagger.subproj/Source/Data/word.txt | |
4 | ############################################################################## | |
5 | # | |
6 | # Copyright (C) 2002-2013, International Business Machines Corporation | |
7 | # and others. All Rights Reserved. | |
8 | # | |
9 | # file: word.txt | |
10 | # | |
11 | # ICU Word Break Rules | |
12 | # See Unicode Standard Annex #29. | |
13 | # These rules are based on UAX #29 Revision 22 for Unicode Version 6.3 | |
14 | # | |
15 | # Note: Updates to word.txt will usually need to be merged into | |
16 | # word_POSIX.txt also. | |
17 | ||
18 | ############################################################################## | |
19 | # | |
20 | # Character class definitions from TR 29 | |
21 | # | |
22 | ############################################################################## | |
23 | ||
24 | !!chain; | |
25 | ||
26 | ||
27 | # | |
28 | # Character Class Definitions: one set per Word_Break property value, plus the [:Han:] and [:Hiragana:] sets. | |
29 | # | |
30 | ||
31 | $CR = [\p{Word_Break = CR}]; | |
32 | $LF = [\p{Word_Break = LF}]; | |
33 | $Newline = [\p{Word_Break = Newline}]; | |
34 | $Extend = [\p{Word_Break = Extend}]; | |
35 | $Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; | |
36 | $Format = [\p{Word_Break = Format}]; | |
37 | $Katakana = [\p{Word_Break = Katakana}]; | |
38 | $Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; | |
39 | $ALetter = [\p{Word_Break = ALetter}]; | |
40 | $Single_Quote = [\p{Word_Break = Single_Quote}]; | |
41 | $Double_Quote = [\p{Word_Break = Double_Quote}]; | |
42 | $MidNumLet = [\p{Word_Break = MidNumLet}]; | |
43 | $MidLetter = [\p{Word_Break = MidLetter}]; | |
44 | $MidNum = [\p{Word_Break = MidNum}]; | |
45 | $Numeric = [\p{Word_Break = Numeric}]; | |
46 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; | |
47 | ||
48 | $Han = [:Han:]; | |
49 | $Hiragana = [:Hiragana:]; | |
50 | ||
51 | ||
52 | # Dictionary character set, for triggering language-based break engines. $dictionary is | |
53 | # LineBreak=Complex_Context plus $dictionaryCJK ($Han, $Hiragana, $Katakana and $HangulSyllable). Note that the Complex_Context part only works in Unicode | |
54 | # 5.0 or later as the definition of Complex_Context was corrected to include all | |
55 | # characters requiring dictionary break. | |
56 | ||
57 | $Control = [\p{Grapheme_Cluster_Break = Control}]; | |
58 | $HangulSyllable = [\uac00-\ud7a3]; | |
59 | $ComplexContext = [:LineBreak = Complex_Context:]; | |
60 | $KanaKanji = [$Han $Hiragana $Katakana]; | |
61 | $dictionaryCJK = [$KanaKanji $HangulSyllable]; | |
62 | $dictionary = [$ComplexContext $dictionaryCJK]; | |
63 | ||
64 | # ALetterPlus: ALetter with the CJK dictionary set ($dictionaryCJK) removed, plus Complex_Context characters that are neither Extend nor Control. | |
65 | $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; | |
66 | ||
67 | ||
68 | # | |
69 | # Rule 4: Ignore Format and Extend characters, | |
70 | # except when they appear at the beginning of a region of text. | |
71 | # Each ...Ex variable below is its base class followed by zero or more Extend or Format characters. | |
72 | # TODO: check if handling of katakana in dictionary makes rules incorrect/void | |
73 | $KatakanaEx = $Katakana ($Extend | $Format)*; | |
74 | $Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*; | |
75 | $ALetterEx = $ALetterPlus ($Extend | $Format)*; | |
76 | $Single_QuoteEx = $Single_Quote ($Extend | $Format)*; | |
77 | $Double_QuoteEx = $Double_Quote ($Extend | $Format)*; | |
78 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; | |
79 | $MidLetterEx = $MidLetter ($Extend | $Format)*; | |
80 | $MidNumEx = $MidNum ($Extend | $Format)*; | |
81 | $NumericEx = $Numeric ($Extend | $Format)*; | |
82 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; | |
83 | $Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*; | |
84 | ||
85 | $Ideographic = [\p{Ideographic}]; | |
86 | $HiraganaEx = $Hiragana ($Extend | $Format)*; | |
87 | $IdeographicEx = $Ideographic ($Extend | $Format)*; | |
88 | ||
89 | ## ------------------------------------------------- | |
90 | ||
91 | !!forward; | |
92 | ||
93 | ||
94 | # Rule 3 - CR x LF | |
95 | # | |
96 | $CR $LF; | |
97 | ||
98 | # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning | |
99 | # of a region of Text. The rule here comes into play when the start of text | |
100 | # begins with a group of Format chars, or with a "word" consisting of a single | |
101 | # char that is not in any of the listed word break categories followed by | |
102 | # format char(s), or is not a CJK dictionary character. | |
103 | [^$CR $LF $Newline]? ($Extend | $Format)+; | |
104 | ||
105 | $NumericEx {100}; | |
106 | $ALetterEx {200}; | |
107 | $HangulSyllable {200}; | |
108 | $Hebrew_LetterEx{200}; | |
109 | $KatakanaEx {400}; # note: these status values override those from rule 5 | |
110 | $HiraganaEx {400}; # by virtue of being numerically larger. | |
111 | $IdeographicEx {400}; # | |
112 | ||
113 | # | |
114 | # rule 5 - (ALetter or Hebrew_Letter) x (ALetter or Hebrew_Letter) | |
115 | # Do not break between most letters. | |
116 | # | |
117 | ($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200}; | |
118 | ||
119 | # rule 6 and 7 - letter, then MidLetter or MidNumLet or Single_Quote, then letter | |
120 | ($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200}; | |
121 | ||
122 | # rule 7a - Hebrew_Letter x Single_Quote | |
123 | $Hebrew_LetterEx $Single_QuoteEx {200}; | |
124 | ||
125 | # rule 7b and 7c - Hebrew_Letter, then Double_Quote, then Hebrew_Letter | |
126 | $Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200}; | |
127 | ||
128 | # rule 8 - Numeric x Numeric | |
129 | ||
130 | $NumericEx $NumericEx {100}; | |
131 | ||
132 | # rule 9 - (ALetter or Hebrew_Letter) x Numeric | |
133 | ||
134 | ($ALetterEx | $Hebrew_LetterEx) $NumericEx {200}; | |
135 | ||
136 | # rule 10 - Numeric x (ALetter or Hebrew_Letter) | |
137 | ||
138 | $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200}; | |
139 | ||
140 | # rule 11 and 12 - Numeric, then MidNum or MidNumLet or Single_Quote, then Numeric | |
141 | ||
142 | $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100}; | |
143 | ||
144 | # rule 13 - Katakana x Katakana | |
145 | # to be consistent with $KanaKanji $KanaKanji, changed | |
146 | # from 300 to 400. | |
147 | # See also TestRuleStatus in intltest/rbbiapts.cpp | |
148 | $KatakanaEx $KatakanaEx {400}; | |
149 | ||
150 | # rule 13a/b - ExtendNumLet after (13a) or before (13b) ALetter, Hebrew_Letter, Numeric or Katakana; every operand uses its ...Ex form | |
151 | ||
152 | $ALetterEx $ExtendNumLetEx {200}; # (13a) | |
153 | $Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a) | |
154 | $NumericEx $ExtendNumLetEx {100}; # (13a) | |
155 | $KatakanaEx $ExtendNumLetEx {400}; # (13a) | |
156 | $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) | |
157 | ||
158 | $ExtendNumLetEx $ALetterEx {200}; # (13b) | |
159 | $ExtendNumLetEx $Hebrew_LetterEx {200}; # (13b) | |
160 | $ExtendNumLetEx $NumericEx {100}; # (13b) | |
161 | $ExtendNumLetEx $KatakanaEx {400}; # (13b) | |
162 | ||
163 | # rule 13c - Regional_Indicator x Regional_Indicator | |
164 | ||
165 | $Regional_IndicatorEx $Regional_IndicatorEx; | |
166 | ||
167 | # special handling for CJK characters: chain for later dictionary segmentation | |
168 | $HangulSyllable $HangulSyllable {200}; | |
169 | $KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found | |
170 | ||
171 | ||
172 | ## ------------------------------------------------- | |
173 | ||
174 | !!reverse; | |
175 | ||
176 | $BackHebrew_LetterEx = ($Format | $Extend)* $Hebrew_Letter; # Back...Ex: the Format/Extend run comes first, then the base class | |
177 | $BackALetterEx = ($Format | $Extend)* $ALetterPlus; | |
178 | $BackSingle_QuoteEx = ($Format | $Extend)* $Single_Quote; | |
179 | $BackDouble_QuoteEx = ($Format | $Extend)* $Double_Quote; | |
180 | $BackMidNumLetEx = ($Format | $Extend)* $MidNumLet; | |
181 | $BackNumericEx = ($Format | $Extend)* $Numeric; | |
182 | $BackMidNumEx = ($Format | $Extend)* $MidNum; | |
183 | $BackMidLetterEx = ($Format | $Extend)* $MidLetter; | |
184 | $BackKatakanaEx = ($Format | $Extend)* $Katakana; | |
185 | $BackHiraganaEx = ($Format | $Extend)* $Hiragana; | |
186 | $BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet; | |
187 | $BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator; | |
188 | ||
189 | # rule 3 - CR x LF, written in backward order (LF first) | |
190 | $LF $CR; | |
191 | ||
192 | # rule 4 - a Format/Extend run, then at most one character that is not CR, LF or Newline | |
193 | ($Format | $Extend)* [^$CR $LF $Newline]?; | |
194 | ||
195 | # rule 5 - (ALetter or Hebrew_Letter) x (ALetter or Hebrew_Letter) | |
196 | ||
197 | ($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx); | |
198 | ||
199 | # rule 6 and 7 - letter, then MidLetter or MidNumLet or Single_Quote, then letter | |
200 | ||
201 | ($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx); | |
202 | ||
203 | # rule 7a - Hebrew_Letter x Single_Quote, written in backward order (Single_Quote first) | |
204 | $BackSingle_QuoteEx $BackHebrew_LetterEx; | |
205 | ||
206 | # Rule 7b and 7c - Hebrew_Letter, then Double_Quote, then Hebrew_Letter | |
207 | $BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx; | |
208 | ||
209 | # rule 8 - Numeric x Numeric | |
210 | ||
211 | $BackNumericEx $BackNumericEx; | |
212 | ||
213 | # rule 9 - (ALetter or Hebrew_Letter) x Numeric, written in backward order (Numeric first) | |
214 | ||
215 | $BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx); | |
216 | ||
217 | # rule 10 - Numeric x (ALetter or Hebrew_Letter), written in backward order (letter first) | |
218 | ||
219 | ($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx; | |
220 | ||
221 | # rule 11 and 12 - Numeric, then MidNum or MidNumLet or Single_Quote, then Numeric | |
222 | ||
223 | $BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx; | |
224 | ||
225 | # rule 13 - Katakana x Katakana | |
226 | ||
227 | $BackKatakanaEx $BackKatakanaEx; | |
228 | ||
229 | # rules 13 a/b - first rule below is 13a in backward order (ExtendNumLet first); second is 13b in backward order (ExtendNumLet last) | |
230 | # | |
231 | $BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx); | |
232 | ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx; | |
233 | ||
234 | # rule 13c - Regional_Indicator x Regional_Indicator | |
235 | ||
236 | $BackRegional_IndicatorEx $BackRegional_IndicatorEx; | |
237 | ||
238 | # special handling for CJK characters: chain for later dictionary segmentation | |
239 | $HangulSyllable $HangulSyllable; | |
240 | $KanaKanji $KanaKanji; #different rule status if both kanji and kana found | |
241 | ||
242 | ## ------------------------------------------------- | |
243 | ||
244 | !!safe_reverse; | |
245 | ||
246 | # rule 4 - a run of Extend or Format characters, then at most one further character | |
247 | ($Extend | $Format)+ .?; | |
248 | ||
249 | # rule 6 - MidLetter or MidNumLet or Single_Quote, then (going backward) an ALetter or Hebrew_Letter | |
250 | ($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx); | |
251 | ||
252 | # rule 7b - Double_Quote, then (going backward) a Hebrew_Letter | |
253 | $Double_Quote $BackHebrew_LetterEx; | |
254 | ||
255 | ||
256 | # rule 11 - MidNum or MidNumLet or Single_Quote, then (going backward) a Numeric | |
257 | ($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx; | |
258 | ||
259 | # For dictionary-based break: keep going across two adjacent $dictionary characters | |
260 | $dictionary $dictionary; | |
261 | ||
262 | ## ------------------------------------------------- | |
263 | ||
264 | !!safe_forward; | |
265 | ||
266 | # rule 4 - a run of Extend or Format characters, then at most one further character | |
267 | ($Extend | $Format)+ .?; | |
268 | ||
269 | # rule 6 - MidLetter or MidNumLet or Single_Quote, then an ALetter or Hebrew_Letter | |
270 | ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx); | |
271 | ||
272 | # rule 7b - Double_Quote, then a Hebrew_Letter | |
273 | $Double_QuoteEx $Hebrew_LetterEx; | |
274 | ||
275 | # rule 11 - MidNum or MidNumLet or Single_Quote, then a Numeric | |
276 | ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx; | |
277 | ||
278 | # For dictionary-based break: keep going across two adjacent $dictionary characters | |
279 | $dictionary $dictionary; | |