]> git.saurik.com Git - apple/icu.git/blame - icuSources/data/brkitr/word_ja.txt
ICU-6.2.13.tar.gz
[apple/icu.git] / icuSources / data / brkitr / word_ja.txt
CommitLineData
374ca955
A
1#
2# Copyright (C) 2002-2004, International Business Machines Corporation
3# and others. All Rights Reserved.
4#
5# file: word_ja.txt
6#
7# ICU Word Break Rules
8# See Unicode Standard Annex #29.
9# These rules are based on Version 4.1 draft, dated 2004-11-11
10#
11
12##############################################################################
13#
14# Character class definitions from TR 29
15#
16##############################################################################
17
18!!chain;  # RBBI option line. NOTE(review): presumably turns on rule chaining as described in the ICU break-rules guide - confirm for this ICU version.
19# $Katakana: all characters whose script is Katakana, plus the individually named kana repeat marks, (semi-)voiced sound marks, double hyphen and prolonged sound marks (full- and halfwidth) listed below.
20$Katakana = [[:Script = KATAKANA:]
21 [:name = VERTICAL KANA REPEAT MARK:]
22 [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK:]
23 [:name = VERTICAL KANA REPEAT MARK UPPER HALF:]
24 [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF:]
25 [:name = VERTICAL KANA REPEAT MARK LOWER HALF:]
26 [:name = KATAKANA-HIRAGANA VOICED SOUND MARK:]
27 [:name = KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK:]
28 [:name = KATAKANA-HIRAGANA DOUBLE HYPHEN:]
29 [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
30 [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
31 [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
32 [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
33
34# $ALetter: Alphabetic characters plus HEBREW PUNCTUATION GERESH, with ideographs, $Katakana, Hiragana-script, Lao-script and Grapheme_Extend characters removed.
35$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
36 - [:Ideographic:]
37 - $Katakana
38 - [:Script = Hiragana:]
39 - [:Script = Lao:]
40 - [:Grapheme_Extend = TRUE:]];
41# $MidLetter: the punctuation named below; the "rule 6 and 7" word rules keep one of these inside a word when it sits between two $ALetter characters.
42$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
43 [:name = HEBREW PUNCTUATION GERSHAYIM:]
44 [:name = RIGHT SINGLE QUOTATION MARK:]
45 [:name = HYPHENATION POINT:]
46 [:name = COLON:]];
47
48# Number-related classes. COLON is subtracted from $MidNum here; it is listed in $MidLetter instead.
49$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = COLON:]];
50$Numeric = [[:Nd:][:name = ARABIC DECIMAL SEPARATOR:][:name = ARABIC THOUSANDS SEPARATOR:]];  # decimal digits plus the two Arabic separators
51$ExtendNumLet = [[:Connector_Punctuation:]
52 - [:name = KATAKANA MIDDLE DOT:]
53 - [:name = HALFWIDTH KATAKANA MIDDLE DOT:]];  # connector punctuation except the two katakana middle dots
54
55
56
57#
58# Character Class Definitions.
59# The names are those from TR29.
60#
61# Base classes used by the rule sections that follow.
62$CR = \u000d;
63$LF = \u000a;
64$Extend = [[:Grapheme_Extend = TRUE:]];
65$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend];  # general categories Zl, Zp, Cc and Cf, minus $Extend
66$Format = [[:Cf:] - $Extend];  # general category Cf minus $Extend; by construction a subset of $Control
67$Hiragana = [:Hiragana:];
68$Ideographic = [:IDEOGRAPHIC:];
69# The *Ex variants match one base character followed by zero or more $Extend characters, so extending characters stay attached to their base.
70$ALetterEx = $ALetter $Extend*;
71$NumericEx = $Numeric $Extend*;
72$MidNumEx = $MidNum $Extend*;
73$MidLetterEx = $MidLetter $Extend*;
74$KatakanaEx = $Katakana $Extend*;
75$HiraganaEx = $Hiragana $Extend*;
76$IdeographicEx = $Ideographic $Extend*;
77$ExtendNumLetEx = $ExtendNumLet $Extend*;
78
79## -------------------------------------------------
80
81!!forward;
82# Status tags in this section: {100} on numeric rules, {200} on letter rules, {300} on katakana/hiragana rules, {400} on ideograph rules.
83# NOTE(review): these values look like ICU's UBRK_WORD_NUMBER / LETTER / KANA / IDEO status ranges - confirm against ubrk.h.
84# Rule 3 - don't break grapheme clusters.
85# see character breaks
86
87$CR $LF;  # keep a CR LF pair together
88#[^$Control] $Extend*;       (disabled variant of the live rules below)
89#$NumericEx $Extend* {100};  (disabled variant)
90#$ALetterEx $Extend* {200};  (disabled variant)
91[^$Control] $Extend+;  # any non-$Control character followed by at least one $Extend; carries no status tag
92$NumericEx {100};
93$ALetterEx {200};
94$KatakanaEx {300};
95$HiraganaEx {300};
96$IdeographicEx {400};
97
98# rule 5: letter followed by letter ($Format characters may intervene here and in every rule below)
99
100$ALetterEx $Format* $ALetterEx {200};
101
102# rule 6 and 7: letter, $MidLetter, letter stay together
103$ALetterEx $Format* $MidLetterEx $Format* $ALetterEx {200};
104
105# rule 8: numeric followed by numeric
106
107$NumericEx $Format* $NumericEx {100};
108
109# rule 9: letter followed by numeric
110
111$ALetterEx $Format* $NumericEx {200};
112
113# rule 10: numeric followed by letter
114
115$NumericEx $Format* $ALetterEx {200};
116
117# rule 11 and 12: numeric, $MidNum, numeric stay together
118
119$NumericEx $Format* $MidNumEx $Format* $NumericEx {100};
120
121# rule 13: runs of the same class - katakana, hiragana or ideographs
122
123$KatakanaEx $Format* $KatakanaEx {300};
124$HiraganaEx $Format* $HiraganaEx {300};
125$IdeographicEx $Format* $IdeographicEx {400};
126
127# rule 13a/b: $ExtendNumLet joins to a preceding (13a) or following (13b) letter, numeric or katakana, and to itself.
128# Hiragana and ideographs do not appear in these rules.
129$ALetterEx $Format* $ExtendNumLetEx {200}; # (13a)
130$NumericEx $Format* $ExtendNumLetEx {100}; # (13a)
131$KatakanaEx $Format* $ExtendNumLetEx {300}; # (13a)
132$ExtendNumLetEx $Format* $ExtendNumLetEx{200}; # (13a)
133
134$ExtendNumLetEx $Format* $ALetterEx {200}; # (13b)
135$ExtendNumLetEx $Format* $NumericEx {100}; # (13b)
136$ExtendNumLetEx $Format* $KatakanaEx {300}; # (13b)
137
138
139
140## -------------------------------------------------
141
142!!reverse;
143# Back*Ex: the forward *Ex classes written in reverse text order - zero or more $Extend characters first, then the base character.
144$BackALetterEx = $Extend* $ALetter;
145$BackNumericEx = $Extend* $Numeric;
146$BackMidNumEx = $Extend* $MidNum;
147$BackMidLetterEx = $Extend* $MidLetter;
148$BackKatakanaEx = $Extend* $Katakana;
149$BackHiraganaEx = $Extend* $Hiragana;
150$BackIdeographicEx = $Extend* $Ideographic;
151$BackExtendNumLetEx= $Extend* $ExtendNumLet;
152# Mirror of the forward "$CR $LF" rule.
153$LF $CR;
154
155# see character breaks
156# Zero or more $Extend characters, then one non-$Control character.
157$Extend* [^$Control];
158
159# rule 5: letter next to letter (reverse order; none of the rules in this section carry a status tag)
160
161$BackALetterEx $Format* $BackALetterEx;
162
163# rule 6 and 7: letter, $MidLetter, letter
164
165$BackALetterEx $Format* $BackMidLetterEx $Format* $BackALetterEx;
166
167
168# rule 8: numeric next to numeric
169
170$BackNumericEx $Format* $BackNumericEx;
171
172# rule 9: forward "letter then numeric", so the numeric is met first when scanning in reverse
173
174$BackNumericEx $Format* $BackALetterEx;
175
176# rule 10: forward "numeric then letter", so the letter is met first when scanning in reverse
177
178$BackALetterEx $Format* $BackNumericEx;
179
180# rule 11 and 12: numeric, $MidNum, numeric
181
182$BackNumericEx $Format* $BackMidNumEx $Format* $BackNumericEx;
183
184# rule 13: runs of katakana, hiragana or ideographs
185
186$BackKatakanaEx $Format* $BackKatakanaEx;
187$BackHiraganaEx $Format* $BackHiraganaEx;
188$BackIdeographicEx $Format* $BackIdeographicEx;
189
190# rules 13 a/b: $ExtendNumLet next to a letter, numeric, katakana or another $ExtendNumLet
191#
192($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx) $Format* $BackExtendNumLetEx;
193$BackExtendNumLetEx $Format* ($BackALetterEx | $BackNumericEx | $BackKatakanaEx);
194
195## -------------------------------------------------
196
197!!safe_reverse;
198# NOTE(review): presumably the rules ICU uses to back up to a safe position before forward matching resumes - confirm in the ICU break-rules guide.
199# rule 3: a run of $Extend followed by one non-$Extend character
200$Extend+ [^$Extend];
201$Extend+; # comes into play when buffer _begins_ with an $Extend+.
202
203# rule 4: one or more $Format characters followed by any one of the Back*Ex word classes
204$Format+ $BackALetterEx;
205$Format+ $BackNumericEx;
206$Format+ $BackMidLetterEx;
207$Format+ $BackMidNumEx;
208$Format+ $BackKatakanaEx;
209$Format+ $BackHiraganaEx;
210$Format+ $BackIdeographicEx;
211$Format+ $BackExtendNumLetEx;
212
213
214# rule 6: $MidLetter, optional $Format, then a letter
215$MidLetter $Format* $BackALetterEx;
216
217# rule 11: $MidNum, optional $Format, then a numeric
218$MidNum $Format* $BackNumericEx;
219
220## -------------------------------------------------
221
222!!safe_forward;
223# NOTE(review): presumably the forward counterpart of the !!safe_reverse rules - confirm in the ICU break-rules guide.
224# rule 3: a run of $Extend characters
225$Extend+;
226
227# rule 4, first group: optional $Extend, at least one $Format, then any one of the *Ex word classes
228$Extend* $Format+ $ALetterEx;
229$Extend* $Format+ $NumericEx;
230$Extend* $Format+ $MidLetterEx;
231$Extend* $Format+ $MidNumEx;
232$Extend* $Format+ $KatakanaEx;
233$Extend* $Format+ $HiraganaEx;
234$Extend* $Format+ $IdeographicEx;
235$Extend* $Format+ $ExtendNumLetEx;
236# rule 4, second group: at least one $Extend, optional $Format, then any one of the *Ex word classes
237$Extend+ $Format* $ALetterEx;
238$Extend+ $Format* $NumericEx;
239$Extend+ $Format* $MidLetterEx;
240$Extend+ $Format* $MidNumEx;
241$Extend+ $Format* $KatakanaEx;
242$Extend+ $Format* $HiraganaEx;
243$Extend+ $Format* $IdeographicEx;
244$Extend+ $Format* $ExtendNumLetEx;
245
246# rule 6: $MidLetter, optional $Format, then a letter
247$MidLetterEx $Format* $ALetterEx;
248
249# rule 11: $MidNum, optional $Format, then a numeric
250$MidNumEx $Format* $NumericEx;