]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | # |
2 | # Copyright (C) 2002-2004, International Business Machines Corporation | |
3 | # and others. All Rights Reserved. | |
4 | # | |
5 | # file: word_ja.txt | |
6 | # | |
7 | # ICU Word Break Rules | |
8 | # See Unicode Standard Annex #29. | |
9 | # These rules are based on Version 4.1 draft, dated 2004-11-11 | |
10 | # | |
11 | ||
12 | ############################################################################## | |
13 | # Character class definitions from TR 29. | |
14 | # $Katakana is the Katakana script plus the individually named kana repeat, voiced/semi-voiced sound, double hyphen and prolonged sound marks. | |
15 | # $ALetter is Alphabetic plus HEBREW PUNCTUATION GERESH, minus ideographs, $Katakana, Hiragana, Lao and Grapheme_Extend characters. | |
16 | ############################################################################## | |
17 | ||
18 | !!chain; | |
19 | ||
20 | $Katakana = [[:Script = KATAKANA:] | |
21 | [:name = VERTICAL KANA REPEAT MARK:] | |
22 | [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK:] | |
23 | [:name = VERTICAL KANA REPEAT MARK UPPER HALF:] | |
24 | [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF:] | |
25 | [:name = VERTICAL KANA REPEAT MARK LOWER HALF:] | |
26 | [:name = KATAKANA-HIRAGANA VOICED SOUND MARK:] | |
27 | [:name = KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK:] | |
28 | [:name = KATAKANA-HIRAGANA DOUBLE HYPHEN:] | |
29 | [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] | |
30 | [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] | |
31 | [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] | |
32 | [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; | |
33 | ||
34 | ||
35 | $ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] | |
36 | - [:Ideographic:] | |
37 | - $Katakana | |
38 | - [:Script = Hiragana:] | |
39 | - [:Script = Lao:] | |
40 | - [:Grapheme_Extend = TRUE:]]; | |
41 | ||
42 | $MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] | |
43 | [:name = HEBREW PUNCTUATION GERSHAYIM:] | |
44 | [:name = RIGHT SINGLE QUOTATION MARK:] | |
45 | [:name = HYPHENATION POINT:] | |
46 | [:name = COLON:]]; | |
47 | ||
48 | ||
49 | $MidNum = [[:LineBreak = Infix_Numeric:] - [:name = COLON:]]; | |
50 | $Numeric = [[:Nd:][:name = ARABIC DECIMAL SEPARATOR:][:name = ARABIC THOUSANDS SEPARATOR:]]; | |
51 | $ExtendNumLet = [[:Connector_Punctuation:] | |
52 | - [:name = KATAKANA MIDDLE DOT:] | |
53 | - [:name = HALFWIDTH KATAKANA MIDDLE DOT:]]; | |
54 | ||
55 | ||
56 | ||
57 | # | |
58 | # Character Class Definitions. | |
59 | # The names are those from TR29. | |
60 | # $Control and $Format exclude $Extend characters; each ...Ex variable is its base class followed by zero or more $Extend characters. | |
61 | ||
62 | $CR = \u000d; | |
63 | $LF = \u000a; | |
64 | $Extend = [[:Grapheme_Extend = TRUE:]]; | |
65 | $Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend]; | |
66 | $Format = [[:Cf:] - $Extend]; | |
67 | $Hiragana = [:Hiragana:]; | |
68 | $Ideographic = [:IDEOGRAPHIC:]; | |
69 | ||
70 | $ALetterEx = $ALetter $Extend*; | |
71 | $NumericEx = $Numeric $Extend*; | |
72 | $MidNumEx = $MidNum $Extend*; | |
73 | $MidLetterEx = $MidLetter $Extend*; | |
74 | $KatakanaEx = $Katakana $Extend*; | |
75 | $HiraganaEx = $Hiragana $Extend*; | |
76 | $IdeographicEx = $Ideographic $Extend*; | |
77 | $ExtendNumLetEx = $ExtendNumLet $Extend*; | |
78 | ||
79 | ## ------------------------------------------------- | |
80 | ||
81 | !!forward; | |
82 | ||
83 | ||
84 | # Rule 3 - don't break grapheme clusters: CR LF stays together and $Extend characters stay attached to the preceding non-$Control character. | |
85 | # see character breaks. The number in braces after a rule is its status tag (presumably ICU's word status ranges: 100 number, 200 letter, 300 kana, 400 ideographic - TODO confirm against ubrk.h). | |
86 | ||
87 | $CR $LF; | |
88 | #[^$Control] $Extend*; | |
89 | #$NumericEx $Extend* {100}; | |
90 | #$ALetterEx $Extend* {200}; | |
91 | [^$Control] $Extend+; | |
92 | $NumericEx {100}; | |
93 | $ALetterEx {200}; | |
94 | $KatakanaEx {300}; | |
95 | $HiraganaEx {300}; | |
96 | $IdeographicEx {400}; | |
97 | ||
98 | # rule 5 - letter followed by letter, with optional $Format characters in between | |
99 | ||
100 | $ALetterEx $Format* $ALetterEx {200}; | |
101 | ||
102 | # rule 6 and 7 - two letters joined by a single $MidLetter character | |
103 | $ALetterEx $Format* $MidLetterEx $Format* $ALetterEx {200}; | |
104 | ||
105 | # rule 8 - numeric followed by numeric | |
106 | ||
107 | $NumericEx $Format* $NumericEx {100}; | |
108 | ||
109 | # rule 9 - letter followed by numeric | |
110 | ||
111 | $ALetterEx $Format* $NumericEx {200}; | |
112 | ||
113 | # rule 10 - numeric followed by letter | |
114 | ||
115 | $NumericEx $Format* $ALetterEx {200}; | |
116 | ||
117 | # rule 11 and 12 - two numerics joined by a single $MidNum character | |
118 | ||
119 | $NumericEx $Format* $MidNumEx $Format* $NumericEx {100}; | |
120 | ||
121 | # rule 13 - Katakana followed by Katakana; this file also pairs Hiragana with Hiragana and ideograph with ideograph | |
122 | ||
123 | $KatakanaEx $Format* $KatakanaEx {300}; | |
124 | $HiraganaEx $Format* $HiraganaEx {300}; | |
125 | $IdeographicEx $Format* $IdeographicEx {400}; | |
126 | ||
127 | # rule 13a/b - $ExtendNumLet after a letter, numeric, Katakana or another $ExtendNumLet (13a), or before a letter, numeric or Katakana (13b) | |
128 | ||
129 | $ALetterEx $Format* $ExtendNumLetEx {200}; # (13a) | |
130 | $NumericEx $Format* $ExtendNumLetEx {100}; # (13a) | |
131 | $KatakanaEx $Format* $ExtendNumLetEx {300}; # (13a) | |
132 | $ExtendNumLetEx $Format* $ExtendNumLetEx{200}; # (13a) | |
133 | ||
134 | $ExtendNumLetEx $Format* $ALetterEx {200}; # (13b) | |
135 | $ExtendNumLetEx $Format* $NumericEx {100}; # (13b) | |
136 | $ExtendNumLetEx $Format* $KatakanaEx {300}; # (13b) | |
137 | ||
138 | ||
139 | ||
140 | ## ------------------------------------------------- | |
141 | ||
142 | !!reverse; | |
143 | ||
144 | $BackALetterEx = $Extend* $ALetter; | |
145 | $BackNumericEx = $Extend* $Numeric; | |
146 | $BackMidNumEx = $Extend* $MidNum; | |
147 | $BackMidLetterEx = $Extend* $MidLetter; | |
148 | $BackKatakanaEx = $Extend* $Katakana; | |
149 | $BackHiraganaEx = $Extend* $Hiragana; | |
150 | $BackIdeographicEx = $Extend* $Ideographic; | |
151 | $BackExtendNumLetEx= $Extend* $ExtendNumLet; | |
152 | ||
153 | $LF $CR; | |
154 | ||
155 | # see character breaks. The Back...Ex variables and the rules in this section mirror the forward ones: $Extend* comes before its base character and $LF before $CR. | |
156 | ||
157 | $Extend* [^$Control]; | |
158 | ||
159 | # rule 5 - letter next to letter, with optional $Format characters in between | |
160 | ||
161 | $BackALetterEx $Format* $BackALetterEx; | |
162 | ||
163 | # rule 6 and 7 - two letters joined by a single $MidLetter character | |
164 | ||
165 | $BackALetterEx $Format* $BackMidLetterEx $Format* $BackALetterEx; | |
166 | ||
167 | ||
168 | # rule 8 - numeric next to numeric | |
169 | ||
170 | $BackNumericEx $Format* $BackNumericEx; | |
171 | ||
172 | # rule 9 - letter followed by numeric in text order, written mirrored here (numeric first) | |
173 | ||
174 | $BackNumericEx $Format* $BackALetterEx; | |
175 | ||
176 | # rule 10 - numeric followed by letter in text order, written mirrored here (letter first) | |
177 | ||
178 | $BackALetterEx $Format* $BackNumericEx; | |
179 | ||
180 | # rule 11 and 12 - two numerics joined by a single $MidNum character | |
181 | ||
182 | $BackNumericEx $Format* $BackMidNumEx $Format* $BackNumericEx; | |
183 | ||
184 | # rule 13 - Katakana next to Katakana, Hiragana next to Hiragana, ideograph next to ideograph | |
185 | ||
186 | $BackKatakanaEx $Format* $BackKatakanaEx; | |
187 | $BackHiraganaEx $Format* $BackHiraganaEx; | |
188 | $BackIdeographicEx $Format* $BackIdeographicEx; | |
189 | ||
190 | # rules 13 a/b - $ExtendNumLet next to a letter, numeric, Katakana or another $ExtendNumLet, in either order | |
191 | # | |
192 | ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx) $Format* $BackExtendNumLetEx; | |
193 | $BackExtendNumLetEx $Format* ($BackALetterEx | $BackNumericEx | $BackKatakanaEx); | |
194 | ||
195 | ## ------------------------------------------------- | |
196 | ||
197 | !!safe_reverse; | |
198 | ||
199 | # rule 3 - a run of $Extend characters together with the single non-$Extend character written after it | |
200 | $Extend+ [^$Extend]; | |
201 | $Extend+; # comes into play when buffer _begins_ with an $Extend+. | |
202 | ||
203 | # rule 4 - one or more $Format characters followed, in the pattern, by each of the Back...Ex classes | |
204 | $Format+ $BackALetterEx; | |
205 | $Format+ $BackNumericEx; | |
206 | $Format+ $BackMidLetterEx; | |
207 | $Format+ $BackMidNumEx; | |
208 | $Format+ $BackKatakanaEx; | |
209 | $Format+ $BackHiraganaEx; | |
210 | $Format+ $BackIdeographicEx; | |
211 | $Format+ $BackExtendNumLetEx; | |
212 | ||
213 | ||
214 | # rule 6 - $MidLetter, optional $Format characters, then a letter | |
215 | $MidLetter $Format* $BackALetterEx; | |
216 | ||
217 | # rule 11 - $MidNum, optional $Format characters, then a numeric | |
218 | $MidNum $Format* $BackNumericEx; | |
219 | ||
220 | ## ------------------------------------------------- | |
221 | ||
222 | !!safe_forward; | |
223 | ||
224 | # rule 3 - a run of one or more $Extend characters | |
225 | $Extend+; | |
226 | ||
227 | # rule 4 - two variants for every class: $Extend* $Format+ <class> and, further down, $Extend+ $Format* <class> | |
228 | $Extend* $Format+ $ALetterEx; | |
229 | $Extend* $Format+ $NumericEx; | |
230 | $Extend* $Format+ $MidLetterEx; | |
231 | $Extend* $Format+ $MidNumEx; | |
232 | $Extend* $Format+ $KatakanaEx; | |
233 | $Extend* $Format+ $HiraganaEx; | |
234 | $Extend* $Format+ $IdeographicEx; | |
235 | $Extend* $Format+ $ExtendNumLetEx; | |
236 | ||
237 | $Extend+ $Format* $ALetterEx; | |
238 | $Extend+ $Format* $NumericEx; | |
239 | $Extend+ $Format* $MidLetterEx; | |
240 | $Extend+ $Format* $MidNumEx; | |
241 | $Extend+ $Format* $KatakanaEx; | |
242 | $Extend+ $Format* $HiraganaEx; | |
243 | $Extend+ $Format* $IdeographicEx; | |
244 | $Extend+ $Format* $ExtendNumLetEx; | |
245 | ||
246 | # rule 6 - $MidLetter, optional $Format characters, then a letter | |
247 | $MidLetterEx $Format* $ALetterEx; | |
248 | ||
249 | # rule 11 - $MidNum, optional $Format characters, then a numeric | |
250 | $MidNumEx $Format* $NumericEx; |