[apple/icu.git] / icuSources / test / testdata / wordNLLTu6.txt

## wordNLLTu6.txt
## The following corresponds to source file from CoreNLP of 2018-Jan-17:
##   CoreNLP/Tagger.subproj/Source/Data/word.txt
##############################################################################
#
# Copyright (C) 2002-2013, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file:  word.txt
#
# ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
#
# Note:  Updates to word.txt will usually need to be merged into
#        word_POSIX.txt also.

##############################################################################
#
#  Character class definitions from TR 29
#
##############################################################################

# !!chain turns on rule chaining for this rule set (see the ICU break-rule
# documentation): a match may continue from the end of one rule into the start
# of another, so long words are built up from the short rules further down.
!!chain;


#
#  Character Class Definitions.
#
#  One set per UAX #29 Word_Break property value that the rules refer to.
#

$CR                 = [\p{Word_Break = CR}];
$LF                 = [\p{Word_Break = LF}];
$Newline            = [\p{Word_Break = Newline}];
$Extend             = [\p{Word_Break = Extend}];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
$Format             = [\p{Word_Break = Format}];
$Katakana           = [\p{Word_Break = Katakana}];
$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
$ALetter            = [\p{Word_Break = ALetter}];
$Single_Quote       = [\p{Word_Break = Single_Quote}];
$Double_Quote       = [\p{Word_Break = Double_Quote}];
$MidNumLet          = [\p{Word_Break = MidNumLet}];
$MidLetter          = [\p{Word_Break = MidLetter}];
$MidNum             = [\p{Word_Break = MidNum}];
$Numeric            = [\p{Word_Break = Numeric}];
$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];

# Script-based sets (not Word_Break values), used below to build the CJK sets.
$Han                = [:Han:];
$Hiragana           = [:Hiragana:];


#   Dictionary character sets, for triggering language-based break engines.
#   $ComplexContext is LineBreak=Complex_Context. Note that this set only works in
#   Unicode 5.0 or later as the definition of Complex_Context was corrected to
#   include all characters requiring dictionary break.
#   $dictionary is the union of $ComplexContext and the CJK sets built here
#   ($Han, $Hiragana, $Katakana and the Hangul syllable range U+AC00..U+D7A3).

# $Control is only used to keep control characters out of $ALetterPlus below.
$Control        = [\p{Grapheme_Cluster_Break = Control}];
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji      = [$Han $Hiragana $Katakana];
$dictionaryCJK  = [$KanaKanji $HangulSyllable];
$dictionary     = [$ComplexContext $dictionaryCJK];

# leave CJK scripts out of ALetterPlus:
#   ($ALetter minus $dictionaryCJK) plus ($ComplexContext minus $Extend minus $Control)
$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];


#
#  Rules 4    Ignore Format and Extend characters,
#             except when they appear at the beginning of a region of text.
#
#  This is implemented by giving every class an "...Ex" form: the base character
#  followed by zero or more Extend or Format characters.  The rules are then
#  written in terms of the "...Ex" forms.
#  Note that $ALetterEx is built from $ALetterPlus, not from plain $ALetter.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
$KatakanaEx           = $Katakana           ($Extend |  $Format)*;
$Hebrew_LetterEx      = $Hebrew_Letter      ($Extend |  $Format)*;
$ALetterEx            = $ALetterPlus        ($Extend |  $Format)*;
$Single_QuoteEx       = $Single_Quote       ($Extend |  $Format)*;
$Double_QuoteEx       = $Double_Quote       ($Extend |  $Format)*;
$MidNumLetEx          = $MidNumLet          ($Extend |  $Format)*;
$MidLetterEx          = $MidLetter          ($Extend |  $Format)*;
$MidNumEx             = $MidNum             ($Extend |  $Format)*;
$NumericEx            = $Numeric            ($Extend |  $Format)*;
$ExtendNumLetEx       = $ExtendNumLet       ($Extend |  $Format)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format)*;

# Ideographic is a separate Unicode property (not a Word_Break value); it and
# Hiragana get the same "...Ex" treatment as the classes above.
$Ideographic    = [\p{Ideographic}];
$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;

## -------------------------------------------------

!!forward;


# Rule 3 - CR x LF
#          Keep a carriage return and the line feed that follows it together.
#
$CR $LF;

# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
#          of a region of Text.   The rule here comes into play when the start of text
#          begins with a group of Format chars, or with a "word" consisting of a single
#          char that is not in any of the listed word break categories followed by
#          format char(s), or is not a CJK dictionary character.
#          As written: an optional character that is not CR, LF or Newline, followed
#          by one or more Extend or Format characters.
[^$CR $LF $Newline]? ($Extend |  $Format)+;

# Single-character "words".  The {nnn} tag is the rule status value attached to the
# rule: 100 for numbers, 200 for letters (including Hangul syllables), 400 for kana
# and ideographs.
# NOTE(review): these look like ICU's UWordBreak status ranges (UBRK_WORD_NUMBER,
#               UBRK_WORD_LETTER, UBRK_WORD_IDEO) - confirm against ubrk.h.
$NumericEx {100};
$ALetterEx {200};
$HangulSyllable {200};
$Hebrew_LetterEx{200};
$KatakanaEx {400};       # note:  these status values override those from rule 5
$HiraganaEx {400};       #        by virtue of being numerically larger.
$IdeographicEx {400};    #

#
# rule 5
#    Do not break between most letters.
#    ("Letter" in rules 5-10 means $ALetterEx or $Hebrew_LetterEx.)
#
($ALetterEx | $Hebrew_LetterEx)  ($ALetterEx | $Hebrew_LetterEx) {200};

# rule 6 and 7
#    Do not break letters across a single MidLetter, MidNumLet or Single_Quote
#    character that has a letter on both sides.
($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};

# rule 7a
#    A single quote directly after a Hebrew letter stays attached to it.
$Hebrew_LetterEx $Single_QuoteEx {200};

# rule 7b and 7c
#    A double quote between two Hebrew letters does not break the word.
$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};

# rule 8
#    Do not break within sequences of digits.

$NumericEx $NumericEx {100};

# rule 9
#    Do not break between a letter and a following digit.

($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};

# rule 10
#    Do not break between a digit and a following letter.

$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};

# rule 11 and 12
#    Do not break digits across a single MidNum, MidNumLet or Single_Quote
#    character that has a digit on both sides.

$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};

# rule 13
#    Do not break between Katakana characters.
# to be consistent with $KanaKanji $KanaKanji, changed
# from 300 to 400.
# See also TestRuleStatus in intltest/rbbiapts.cpp
$KatakanaEx  $KatakanaEx {400};

# rule 13a/b
#    Do not break between a letter, digit, Katakana or ExtendNumLet character and an
#    adjacent ExtendNumLet character.
#      13a: the ExtendNumLet character comes second.
#      13b: the ExtendNumLet character comes first.
#    The status tag follows the letter/digit/Katakana side of the pair; a pair of
#    ExtendNumLet characters is tagged 200.

$ALetterEx       $ExtendNumLetEx {200};    #  (13a)
$Hebrew_LetterEx $ExtendNumLetEx {200};    #  (13a)
$NumericEx       $ExtendNumLetEx {100};    #  (13a)
$KatakanaEx      $ExtendNumLetEx {400};    #  (13a)
$ExtendNumLetEx  $ExtendNumLetEx {200};    #  (13a)

# The Hebrew letter operand uses the "...Ex" form like every other operand in this
# group, so Extend/Format characters trailing the Hebrew letter are consumed by this
# rule itself rather than only via !!chain continuing into the single-character
# $Hebrew_LetterEx rule.
$ExtendNumLetEx  $ALetterEx      {200};    #  (13b)
$ExtendNumLetEx  $Hebrew_LetterEx {200};    #  (13b)
$ExtendNumLetEx  $NumericEx      {100};    #  (13b)
$ExtendNumLetEx  $KatakanaEx     {400};    #  (13b)

# rule 13c
#    Do not break between regional indicator symbols.
#    This rule carries no {status} tag.

$Regional_IndicatorEx $Regional_IndicatorEx;

# special handling for CJK characters: chain for later dictionary segmentation
# Adjacent Hangul syllables are kept together, as are adjacent $KanaKanji
# characters (Han, Hiragana, Katakana in any mix).
$HangulSyllable $HangulSyllable {200};
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found


## -------------------------------------------------

!!reverse;

# Mirror images of the forward "...Ex" definitions, for use by the reverse rules:
# here the ($Format | $Extend)* run is written BEFORE the base character instead of
# after it.
# NOTE(review): $BackHiraganaEx is defined but not referenced by any rule in this
#               file - confirm whether it is still needed.
$BackHebrew_LetterEx      = ($Format | $Extend)* $Hebrew_Letter;
$BackALetterEx            = ($Format | $Extend)* $ALetterPlus;
$BackSingle_QuoteEx       = ($Format | $Extend)* $Single_Quote;
$BackDouble_QuoteEx       = ($Format | $Extend)* $Double_Quote;
$BackMidNumLetEx          = ($Format | $Extend)* $MidNumLet;
$BackNumericEx            = ($Format | $Extend)* $Numeric;
$BackMidNumEx             = ($Format | $Extend)* $MidNum;
$BackMidLetterEx          = ($Format | $Extend)* $MidLetter;
$BackKatakanaEx           = ($Format | $Extend)* $Katakana;
$BackHiraganaEx           = ($Format | $Extend)* $Hiragana;
$BackExtendNumLetEx       = ($Format | $Extend)* $ExtendNumLet;
$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;

# The rules below are the forward rules written in the opposite order (for example
# $LF $CR here versus $CR $LF in the forward section).  None of them carries a
# {status} tag.

# rule 3
$LF $CR;

# rule 4
#    Zero or more Format/Extend characters, then an optional character that is not
#    CR, LF or Newline.
($Format | $Extend)*  [^$CR $LF $Newline]?;

# rule 5
#    Do not break between letters.

($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx);

# rule 6 and 7
#    Letter, one MidLetter/MidNumLet/Single_Quote, letter.

($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx);

# rule 7a
#    Single quote, then the Hebrew letter it is attached to.
$BackSingle_QuoteEx $BackHebrew_LetterEx;

# Rule 7b and 7c
#    Hebrew letter, double quote, Hebrew letter.
$BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx;

# rule 8
#    Digit, digit.

$BackNumericEx $BackNumericEx;

# rule 9
#    Digit, then the letter it is attached to.

$BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx);

# rule 10
#    Letter, then the digit it is attached to.

($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx;

# rule 11 and 12
#    Digit, one MidNum/MidNumLet/Single_Quote, digit.

$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx;

# rule 13
#    Katakana, Katakana.

$BackKatakanaEx $BackKatakanaEx;

# rules 13 a/b
#    ExtendNumLet next to a letter, digit, Katakana or another ExtendNumLet,
#    in either order.
#
$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;

# rule 13c
#    Regional indicator, regional indicator.

$BackRegional_IndicatorEx $BackRegional_IndicatorEx;

# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable;
$KanaKanji $KanaKanji; # kana and kanji in any mix; no status tag on reverse rules

## -------------------------------------------------

!!safe_reverse;

# NOTE(review): per the ICU break-rule documentation, safe_reverse rules are used to
#               back up from an arbitrary position to a point from which the forward
#               rules can be applied reliably - confirm.
# Each rule below covers a forward rule whose match depends on more than two
# adjacent characters.

# rule 4
#    One or more Extend/Format characters, then optionally any one character.
($Extend | $Format)+ .?;

# rule 6
#    A mid-word character followed by a letter.
($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);

# rule 7b
#    A double quote followed by a Hebrew letter.
$Double_Quote $BackHebrew_LetterEx;


# rule 11
#    A mid-number character followed by a digit.
($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;

# For dictionary-based break
#    Two adjacent characters from the $dictionary set.
$dictionary $dictionary;

## -------------------------------------------------

!!safe_forward;

# NOTE(review): per the ICU break-rule documentation, safe_forward rules are the
#               forward-moving counterpart of the safe_reverse rules - confirm.
# The rules use the forward "...Ex" forms (base character followed by
# Extend/Format characters).

# rule 4
#    One or more Extend/Format characters, then optionally any one character.
($Extend | $Format)+ .?;

# rule 6
#    A mid-word character followed by a letter.
($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);

# rule 7b
#    A double quote followed by a Hebrew letter.
$Double_QuoteEx $Hebrew_LetterEx;

# rule 11
#    A mid-number character followed by a digit.
($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;

# For dictionary-based break
#    Two adjacent characters from the $dictionary set.
$dictionary $dictionary;
Commit	Line	Data
340931cb A	1	## wordNLLTu6.txt
	2	## The following corresponds to source file from CoreLP of 2018-Jan-17:
	3	## CoreNLP/Tagger.subproj/Source/Data/word.txt
	4	##############################################################################
	5	#
	6	# Copyright (C) 2002-2013, International Business Machines Corporation
	7	# and others. All Rights Reserved.
	8	#
	9	# file: word.txt
	10	#
	11	# ICU Word Break Rules
	12	# See Unicode Standard Annex #29.
	13	# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
	14	#
	15	# Note: Updates to word.txt will usually need to be merged into
	16	# word_POSIX.txt also.
	17
	18	##############################################################################
	19	#
	20	# Character class definitions from TR 29
	21	#
	22	##############################################################################
	23
	24	!!chain;
	25
	26
	27	#
	28	# Character Class Definitions.
	29	#
	30
	31	$CR = [\p{Word_Break = CR}];
	32	$LF = [\p{Word_Break = LF}];
	33	$Newline = [\p{Word_Break = Newline}];
	34	$Extend = [\p{Word_Break = Extend}];
	35	$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
	36	$Format = [\p{Word_Break = Format}];
	37	$Katakana = [\p{Word_Break = Katakana}];
	38	$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
	39	$ALetter = [\p{Word_Break = ALetter}];
	40	$Single_Quote = [\p{Word_Break = Single_Quote}];
	41	$Double_Quote = [\p{Word_Break = Double_Quote}];
	42	$MidNumLet = [\p{Word_Break = MidNumLet}];
	43	$MidLetter = [\p{Word_Break = MidLetter}];
	44	$MidNum = [\p{Word_Break = MidNum}];
	45	$Numeric = [\p{Word_Break = Numeric}];
	46	$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
	47
	48	$Han = [:Han:];
	49	$Hiragana = [:Hiragana:];
	50
	51
	52	# Dictionary character set, for triggering language-based break engines. Currently
	53	# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
	54	# 5.0 or later as the definition of Complex_Context was corrected to include all
	55	# characters requiring dictionary break.
	56
	57	$Control = [\p{Grapheme_Cluster_Break = Control}];
	58	$HangulSyllable = [\uac00-\ud7a3];
	59	$ComplexContext = [:LineBreak = Complex_Context:];
	60	$KanaKanji = [$Han $Hiragana $Katakana];
	61	$dictionaryCJK = [$KanaKanji $HangulSyllable];
	62	$dictionary = [$ComplexContext $dictionaryCJK];
	63
	64	# leave CJK scripts out of ALetterPlus
65	$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
66
67
68	#
69	# Rules 4 Ignore Format and Extend characters,
70	# except when they appear at the beginning of a region of text.
71	#
72	# TODO: check if handling of katakana in dictionary makes rules incorrect/void
73	$KatakanaEx = $Katakana ($Extend \| $Format)*;
74	$Hebrew_LetterEx = $Hebrew_Letter ($Extend \| $Format)*;
75	$ALetterEx = $ALetterPlus ($Extend \| $Format)*;
76	$Single_QuoteEx = $Single_Quote ($Extend \| $Format)*;
77	$Double_QuoteEx = $Double_Quote ($Extend \| $Format)*;
78	$MidNumLetEx = $MidNumLet ($Extend \| $Format)*;
79	$MidLetterEx = $MidLetter ($Extend \| $Format)*;
80	$MidNumEx = $MidNum ($Extend \| $Format)*;
81	$NumericEx = $Numeric ($Extend \| $Format)*;
82	$ExtendNumLetEx = $ExtendNumLet ($Extend \| $Format)*;
83	$Regional_IndicatorEx = $Regional_Indicator ($Extend \| $Format)*;
84
85	$Ideographic = [\p{Ideographic}];
86	$HiraganaEx = $Hiragana ($Extend \| $Format)*;
87	$IdeographicEx = $Ideographic ($Extend \| $Format)*;
88
89	## -------------------------------------------------
90
91	!!forward;
92
93
94	# Rule 3 - CR x LF
95	#
96	$CR $LF;
97
98	# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
99	# of a region of Text. The rule here comes into play when the start of text
100	# begins with a group of Format chars, or with a "word" consisting of a single
101	# char that is not in any of the listed word break categories followed by
102	# format char(s), or is not a CJK dictionary character.
103	[^$CR $LF $Newline]? ($Extend \| $Format)+;
104
105	$NumericEx {100};
106	$ALetterEx {200};
107	$HangulSyllable {200};
108	$Hebrew_LetterEx{200};
109	$KatakanaEx {400}; # note: these status values override those from rule 5
110	$HiraganaEx {400}; # by virtue of being numerically larger.
111	$IdeographicEx {400}; #
112
113	#
114	# rule 5
115	# Do not break between most letters.
116	#
117	($ALetterEx \| $Hebrew_LetterEx) ($ALetterEx \| $Hebrew_LetterEx) {200};
118
119	# rule 6 and 7
120	($ALetterEx \| $Hebrew_LetterEx) ($MidLetterEx \| $MidNumLetEx \| $Single_QuoteEx) ($ALetterEx \| $Hebrew_LetterEx) {200};
121
122	# rule 7a
123	$Hebrew_LetterEx $Single_QuoteEx {200};
124
125	# rule 7b and 7c
126	$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
127
128	# rule 8
129
130	$NumericEx $NumericEx {100};
131
132	# rule 9
133
134	($ALetterEx \| $Hebrew_LetterEx) $NumericEx {200};
135
136	# rule 10
137
138	$NumericEx ($ALetterEx \| $Hebrew_LetterEx) {200};
139
140	# rule 11 and 12
141
142	$NumericEx ($MidNumEx \| $MidNumLetEx \| $Single_QuoteEx) $NumericEx {100};
143
144	# rule 13
145	# to be consistent with $KanaKanji $KanaKanhi, changed
146	# from 300 to 400.
147	# See also TestRuleStatus in intltest/rbbiapts.cpp
148	$KatakanaEx $KatakanaEx {400};
149
150	# rule 13a/b
151
152	$ALetterEx $ExtendNumLetEx {200}; # (13a)
153	$Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a)
154	$NumericEx $ExtendNumLetEx {100}; # (13a)
155	$KatakanaEx $ExtendNumLetEx {400}; # (13a)
156	$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
157
158	$ExtendNumLetEx $ALetterEx {200}; # (13b)
159	$ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
160	$ExtendNumLetEx $NumericEx {100}; # (13b)
161	$ExtendNumLetEx $KatakanaEx {400}; # (13b)
162
163	# rule 13c
164
165	$Regional_IndicatorEx $Regional_IndicatorEx;
166
167	# special handling for CJK characters: chain for later dictionary segmentation
168	$HangulSyllable $HangulSyllable {200};
169	$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
170
171
172	## -------------------------------------------------
173
174	!!reverse;
175
176	$BackHebrew_LetterEx = ($Format \| $Extend)* $Hebrew_Letter;
177	$BackALetterEx = ($Format \| $Extend)* $ALetterPlus;
178	$BackSingle_QuoteEx = ($Format \| $Extend)* $Single_Quote;
179	$BackDouble_QuoteEx = ($Format \| $Extend)* $Double_Quote;
180	$BackMidNumLetEx = ($Format \| $Extend)* $MidNumLet;
181	$BackNumericEx = ($Format \| $Extend)* $Numeric;
182	$BackMidNumEx = ($Format \| $Extend)* $MidNum;
183	$BackMidLetterEx = ($Format \| $Extend)* $MidLetter;
184	$BackKatakanaEx = ($Format \| $Extend)* $Katakana;
185	$BackHiraganaEx = ($Format \| $Extend)* $Hiragana;
186	$BackExtendNumLetEx = ($Format \| $Extend)* $ExtendNumLet;
187	$BackRegional_IndicatorEx = ($Format \| $Extend)* $Regional_Indicator;
188
189	# rule 3
190	$LF $CR;
191
192	# rule 4
193	($Format \| $Extend)* [^$CR $LF $Newline]?;
194
195	# rule 5
196
197	($BackALetterEx \| $BackHebrew_LetterEx) ($BackALetterEx \| $BackHebrew_LetterEx);
198
199	# rule 6 and 7
200
201	($BackALetterEx \| $BackHebrew_LetterEx) ($BackMidLetterEx \| $BackMidNumLetEx \| $BackSingle_QuoteEx) ($BackALetterEx \| $BackHebrew_LetterEx);
202
203	# rule 7a
204	$BackSingle_QuoteEx $BackHebrew_LetterEx;
205
206	# Rule 7b and 7c
207	$BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx;
208
209	# rule 8
210
211	$BackNumericEx $BackNumericEx;
212
213	# rule 9
214
215	$BackNumericEx ($BackALetterEx \| $BackHebrew_LetterEx);
216
217	# rule 10
218
219	($BackALetterEx \| $BackHebrew_LetterEx) $BackNumericEx;
220
221	# rule 11 and 12
222
223	$BackNumericEx ($BackMidNumEx \| $BackMidNumLetEx \| $BackSingle_QuoteEx) $BackNumericEx;
224
225	# rule 13
226
227	$BackKatakanaEx $BackKatakanaEx;
228
229	# rules 13 a/b
230	#
231	$BackExtendNumLetEx ($BackALetterEx \| $BackHebrew_LetterEx \| $BackNumericEx \| $BackKatakanaEx \| $BackExtendNumLetEx);
232	($BackALetterEx \| $BackHebrew_LetterEx \| $BackNumericEx \| $BackKatakanaEx) $BackExtendNumLetEx;
233
234	# rule 13c
235
236	$BackRegional_IndicatorEx $BackRegional_IndicatorEx;
237
238	# special handling for CJK characters: chain for later dictionary segmentation
239	$HangulSyllable $HangulSyllable;
240	$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
241
242	## -------------------------------------------------
243
244	!!safe_reverse;
245
246	# rule 3
247	($Extend \| $Format)+ .?;
248
249	# rule 6
250	($MidLetter \| $MidNumLet \| $Single_Quote) ($BackALetterEx \| $BackHebrew_LetterEx);
251
252	# rule 7b
253	$Double_Quote $BackHebrew_LetterEx;
254
255
256	# rule 11
257	($MidNum \| $MidNumLet \| $Single_Quote) $BackNumericEx;
258
259	# For dictionary-based break
260	$dictionary $dictionary;
261
262	## -------------------------------------------------
263
264	!!safe_forward;
265
266	# rule 4
267	($Extend \| $Format)+ .?;
268
269	# rule 6
270	($MidLetterEx \| $MidNumLetEx \| $Single_QuoteEx) ($ALetterEx \| $Hebrew_LetterEx);
271
272	# rule 7b
273	$Double_QuoteEx $Hebrew_LetterEx;
274
275	# rule 11
276	($MidNumEx \| $MidNumLetEx \| $Single_QuoteEx) $NumericEx;
277
278	# For dictionary-based break
279	$dictionary $dictionary;