]>
Commit | Line | Data |
---|---|---|
374ca955 | 1 | # |
73c04bcf | 2 | # Copyright (C) 2002-2006, International Business Machines Corporation |
374ca955 A |
3 | # and others. All Rights Reserved. |
4 | # | |
5 | # file: word_ja.txt | |
6 | # | |
7 | # ICU Word Break Rules | |
8 | # See Unicode Standard Annex #29. | |
73c04bcf A |
9 | # These rules are based on Unicode Version 5.0.0 |
10 | # Includes post Unicode 5.0 change to treat Japanese half width voicing marks | |
11 | # as Extend | |
374ca955 | 12 | # |
73c04bcf A |
13 | # Note: Updates to word.txt will usually need to be merged into |
14 | # word_POSIX.txt and word_ja.txt also. | |
374ca955 A |
15 | |
16 | ############################################################################## | |
17 | # | |
18 | # Character class definitions from TR 29 | |
19 | # | |
20 | ############################################################################## | |
21 | ||
22 | !!chain; | |
23 | ||
374ca955 A |
24 | |
25 | # | |
26 | # Character Class Definitions. Most sets use the Word_Break property; $Extend and $Control use Grapheme_Cluster_Break. | |
374ca955 A |
27 | # $VoiceMarks (U+FF9E, U+FF9F) is removed from $Katakana and added to $Extend below. |
28 | ||
73c04bcf A |
29 | $VoiceMarks = [\uff9e\uff9f]; |
30 | $Format = [\p{Word_Break = Format}]; | |
31 | $Katakana = [\p{Word_Break = Katakana}-$VoiceMarks]; | |
32 | $ALetter = [\p{Word_Break = ALetter}]; | |
33 | $MidLetter = [\p{Word_Break = MidLetter}]; | |
34 | $MidNum = [\p{Word_Break = MidNum}]; | |
35 | $Numeric = [\p{Word_Break = Numeric}]; | |
36 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; | |
37 | ||
38 | ||
374ca955 A |
39 | $CR = \u000d; |
40 | $LF = \u000a; | |
73c04bcf A |
41 | $Extend = [\p{Grapheme_Cluster_Break = Extend}$VoiceMarks]; |
42 | $Control = [\p{Grapheme_Cluster_Break = Control}]; | |
43 | ||
44 | # Dictionary character set, for triggering language-based break engines. Currently | |
45 | # limited to LineBreak=Complex_Context. Note that this set only works in Unicode | |
46 | # 5.0 or later as the definition of Complex_Context was corrected to include all | |
47 | # characters requiring dictionary break. $ALetterPlus is $ALetter plus the dictionary characters that are neither Extend nor Control. | |
48 | ||
49 | $dictionary = [:LineBreak = Complex_Context:]; | |
50 | $ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; | |
51 | ||
52 | ||
53 | # | |
54 | # Rule 3 Grapheme Clusters behave like their first char. | |
55 | # Rule 4 Ignore trailing Format characters (Also see note in TR 29) | |
56 | # Each ...Ex variable is a base class followed by zero or more Extend or Format characters; $ALetterEx is built on $ALetterPlus. | |
57 | $KatakanaEx = $Katakana ($Extend | $Format)*; | |
58 | $ALetterEx = $ALetterPlus ($Extend | $Format)*; | |
59 | $MidLetterEx = $MidLetter ($Extend | $Format)*; | |
60 | $MidNumEx = $MidNum ($Extend | $Format)*; | |
61 | $NumericEx = $Numeric ($Extend | $Format)*; | |
62 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; | |
63 | ||
374ca955 A |
64 | $Hiragana = [:Hiragana:]; |
65 | $Ideographic = [:IDEOGRAPHIC:]; | |
73c04bcf A |
66 | $HiraganaEx = $Hiragana ($Extend | $Format)*; |
67 | $IdeographicEx = $Ideographic ($Extend | $Format)*; | |
374ca955 A |
68 | |
69 | ## ------------------------------------------------- | |
70 | ||
71 | !!forward; | |
72 | ||
73 | ||
73c04bcf A |
74 | # Rule 3 - CR x LF |
75 | # see character breaks. Any Extend or Format characters that follow stay attached to the CR LF pair. | |
76 | ||
77 | $CR $LF ($Extend | $Format)*; | |
78 | ||
79 | # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning | |
80 | # of a region of Text. The rule here comes into play when the start of text | |
81 | # begins with a group of Format chars, or with a "word" consisting of a single | |
82 | # char that is not in any of the listed word break categories followed by | |
83 | # format char(s). The single-class rules after it each carry a {nnn} status tag. | |
84 | .? ($Extend | $Format)+; | |
85 | ||
374ca955 | 86 | |
374ca955 A |
87 | $NumericEx {100}; |
88 | $ALetterEx {200}; | |
89 | $KatakanaEx {300}; | |
90 | $HiraganaEx {300}; | |
91 | $IdeographicEx {400}; | |
92 | ||
93 | # rule 5 - letter followed by letter | |
94 | ||
73c04bcf | 95 | $ALetterEx $ALetterEx {200}; |
374ca955 A |
96 | 
97 | # rule 6 and 7 - letters on both sides of a MidLetter | |
73c04bcf | 98 | $ALetterEx $MidLetterEx $ALetterEx {200}; |
374ca955 A |
99 | 
100 | # rule 8 - Numeric followed by Numeric | |
101 | ||
73c04bcf | 102 | $NumericEx $NumericEx {100}; |
374ca955 A |
103 | 
104 | # rule 9 - letter followed by Numeric. $ALetterEx already absorbs trailing Extend and Format characters, so no separate Format term is needed. | |
105 | ||
106 | $ALetterEx $NumericEx {200}; | |
107 | ||
108 | # rule 10 - Numeric followed by letter | |
109 | ||
73c04bcf | 110 | $NumericEx $ALetterEx {200}; |
374ca955 A |
111 | 
112 | # rule 11 and 12 - Numerics on both sides of a MidNum | |
113 | ||
73c04bcf | 114 | $NumericEx $MidNumEx $NumericEx {100}; |
374ca955 A |
115 | 
116 | # rule 13 - runs of Katakana, of Hiragana, and of Ideographic characters | |
117 | ||
73c04bcf A |
118 | $KatakanaEx $KatakanaEx {300}; |
119 | $HiraganaEx $HiraganaEx {300}; | |
120 | $IdeographicEx $IdeographicEx {400}; | |
374ca955 A |
121 | 
122 | # rule 13a/b - ExtendNumLet after (13a) or before (13b) a letter, Numeric or Katakana | |
123 | ||
73c04bcf A |
124 | $ALetterEx $ExtendNumLetEx {200}; # (13a) |
125 | $NumericEx $ExtendNumLetEx {100}; # (13a) | |
126 | $KatakanaEx $ExtendNumLetEx {300}; # (13a) | |
127 | $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) | |
374ca955 | 128 | |
73c04bcf A |
129 | $ExtendNumLetEx $ALetterEx {200}; # (13b) |
130 | $ExtendNumLetEx $NumericEx {100}; # (13b) | |
131 | $ExtendNumLetEx $KatakanaEx {300}; # (13b) | |
374ca955 A |
132 | |
133 | ||
134 | ||
135 | ## ------------------------------------------------- | |
136 | ||
137 | !!reverse; | |
138 | ||
73c04bcf A |
139 | $BackALetterEx = ($Format | $Extend)* $ALetterPlus; |
140 | $BackNumericEx = ($Format | $Extend)* $Numeric; | |
141 | $BackMidNumEx = ($Format | $Extend)* $MidNum; | |
142 | $BackMidLetterEx = ($Format | $Extend)* $MidLetter; | |
143 | $BackKatakanaEx = ($Format | $Extend)* $Katakana; | |
144 | $BackHiraganaEx = ($Format | $Extend)* $Hiragana; | |
145 | $BackIdeographicEx = ($Format | $Extend)* $Ideographic; | |
146 | $BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet; | |
374ca955 | 147 | |
73c04bcf A |
148 | # rule 3 - mirror of the forward CR LF rule: optional Format/Extend characters, then LF, then CR |
149 | ($Format | $Extend)* $LF $CR; | |
374ca955 | 150 | |
73c04bcf A |
151 | # rule 4 - optional Format/Extend characters, then at most one character of any kind |
152 | ($Format | $Extend)* .?; | |
374ca955 A |
153 | 
154 | # rule 5 - letter next to letter | |
155 | ||
73c04bcf | 156 | $BackALetterEx $BackALetterEx; |
374ca955 A |
157 | 
158 | # rule 6 and 7 - letters on both sides of a MidLetter | |
159 | ||
73c04bcf | 160 | $BackALetterEx $BackMidLetterEx $BackALetterEx; |
374ca955 A |
161 | 
162 | ||
163 | # rule 8 - Numeric next to Numeric | |
164 | ||
73c04bcf | 165 | $BackNumericEx $BackNumericEx; |
374ca955 A |
166 | 
167 | # rule 9 - operands are in the opposite order to forward rule 9 (Numeric first, then letter) | |
168 | ||
73c04bcf | 169 | $BackNumericEx $BackALetterEx; |
374ca955 A |
170 | 
171 | # rule 10 - operands are in the opposite order to forward rule 10 (letter first, then Numeric) | |
172 | ||
73c04bcf | 173 | $BackALetterEx $BackNumericEx; |
374ca955 A |
174 | 
175 | # rule 11 and 12 - Numerics on both sides of a MidNum | |
176 | ||
73c04bcf | 177 | $BackNumericEx $BackMidNumEx $BackNumericEx; |
374ca955 A |
178 | 
179 | # rule 13 - runs of Katakana, of Hiragana, and of Ideographic characters | |
180 | ||
73c04bcf A |
181 | $BackKatakanaEx $BackKatakanaEx; |
182 | $BackHiraganaEx $BackHiraganaEx; | |
183 | $BackIdeographicEx $BackIdeographicEx; | |
374ca955 A |
184 | 
185 | # rules 13 a/b | |
186 | # ExtendNumLet adjacent to a letter, Numeric, Katakana or another ExtendNumLet. | |
73c04bcf A |
187 | ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx) $BackExtendNumLetEx; |
188 | $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx); | |
374ca955 A |
189 | |
190 | ## ------------------------------------------------- | |
191 | ||
192 | !!safe_reverse; | |
193 | ||
194 | # rule 4 - one or more Extend/Format characters, then at most one character of any kind | |
73c04bcf | 195 | ($Extend | $Format)+ .?; |
374ca955 A |
196 | 
197 | # rule 6 - MidLetter, then optional Format/Extend characters and a letter | |
73c04bcf | 198 | $MidLetter $BackALetterEx; |
374ca955 A |
199 | 
200 | # rule 11 - MidNum, then optional Format/Extend characters and a Numeric | |
73c04bcf A |
201 | $MidNum $BackNumericEx; |
202 | ||
203 | # For dictionary-based break: two adjacent dictionary characters | |
204 | $dictionary $dictionary; | |
374ca955 A |
205 | |
206 | ## ------------------------------------------------- | |
207 | ||
208 | !!safe_forward; | |
209 | ||
374ca955 | 210 | # rule 4 - one or more Extend/Format characters, then at most one character of any kind |
73c04bcf | 211 | ($Extend | $Format)+ .?; |
374ca955 A |
212 | 
213 | # rule 6 - MidLetter (with trailing Extend/Format) followed by a letter | |
73c04bcf | 214 | $MidLetterEx $ALetterEx; |
374ca955 A |
215 | 
216 | # rule 11 - MidNum (with trailing Extend/Format) followed by a Numeric | |
73c04bcf A |
217 | $MidNumEx $NumericEx; |
218 | ||
219 | # For dictionary-based break: two adjacent dictionary characters | |
220 | $dictionary $dictionary; |