git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/data/brkitr/rules/word.txt

... / ...

Commit	Line	Data
	1	#
	2	# Copyright (C) 2016 and later: Unicode, Inc. and others.
	3	# License & terms of use: http://www.unicode.org/copyright.html
	4	# Copyright (C) 2002-2016, International Business Machines Corporation
	5	# and others. All Rights Reserved.
	6	#
	7	# file: word.txt
	8	#
	9	# ICU Word Break Rules (modified from standard to remove colon from $MidLetter)
	10	# See Unicode Standard Annex #29.
	11	# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
	12	#
	13	# Note: Updates to word.txt will usually need to be merged into
	14	# word_POSIX.txt also.
	15
	16	##############################################################################
	17	#
	18	# Character class definitions from TR 29
	19	#
	20	##############################################################################
	21
	22	!!chain; # enable rule chaining (see the ICU break rules documentation)
	23	!!quoted_literals_only; # NOTE(review): presumably requires literal text in rules to be quoted - confirm
	24
	25
	26	#
	27	# Character Class Definitions.
	28	# Unless noted otherwise, each set below is built directly from one Unicode property value.
	29
	30	$Han = [:Han:];
	31
	32	$CR = [\p{Word_Break = CR}];
	33	$LF = [\p{Word_Break = LF}];
	34	$Newline = [\p{Word_Break = Newline}];
	35	$Extend = [\p{Word_Break = Extend}-$Han]; # Word_Break=Extend with the $Han characters removed
	36	$ZWJ = [\p{Word_Break = ZWJ}];
	37	$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
	38	$Format = [\p{Word_Break = Format}];
	39	$Katakana = [\p{Word_Break = Katakana}];
	40	$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
	41	$ALetter = [\p{Word_Break = ALetter}];
	42	$Single_Quote = [\p{Word_Break = Single_Quote}];
	43	$Double_Quote = [\p{Word_Break = Double_Quote}];
	44	$MidNumLet = [\p{Word_Break = MidNumLet}];
	45	$MidLetter = [\p{Word_Break = MidLetter} - [\:]]; # colon removed, as noted in the file header
	46	$MidNum = [\p{Word_Break = MidNum}];
	47	$Numeric = [\p{Word_Break = Numeric}];
	48	$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
	49	$WSegSpace = [\p{Word_Break = WSegSpace}];
	50	$Extended_Pict = [\p{Extended_Pictographic}];
	51
	52	$Hiragana = [:Hiragana:]; # used below in $KanaKanji and in the {400} status rule
	53	$Ideographic = [\p{Ideographic}]; # used below only in the {400} status rule
	54
	55
	56	# Dictionary character set, for triggering language-based break engines. It is the
	57	# union of LineBreak=Complex_Context and the CJK sets defined below. Note that the
	58	# Complex_Context part only works in Unicode 5.0 or later, as the definition of
	59	# Complex_Context was corrected to include all characters requiring dictionary break.
	60
	61	$Control = [\p{Grapheme_Cluster_Break = Control}];
	62	$HangulSyllable = [\uac00-\ud7a3];
	63	$ComplexContext = [:LineBreak = Complex_Context:];
	64	$KanaKanji = [$Han $Hiragana $Katakana];
	65	$dictionaryCJK = [$KanaKanji $HangulSyllable];
	66	$dictionary = [$ComplexContext $dictionaryCJK];
	67
	68	# TODO: check if handling of katakana in dictionary makes rules incorrect/void
	69
	70	# Leave CJK scripts out of ALetterPlus, but add the Complex_Context characters that
	71	$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; # are neither $Extend nor $Control.
	72
	73
	74	## -------------------------------------------------
	75
	76	# Rule 3 - CR x LF
	77	#
	78	$CR $LF;
	79
	80	# Rule 3c Do not break within emoji zwj sequences.
	81	# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed.
	82	#
	83	$ZWJ $Extended_Pict;
	84
	85	# Rule 3d - Keep horizontal whitespace together.
	86	#
	87	$WSegSpace $WSegSpace;
	88
	89	# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
	90	# of a region of Text.
	91
	92	$ExFm = [$Extend $Format $ZWJ]; # the characters that later rules skip over via $ExFm*
	93
	94	^$ExFm+; # This rule fires only when there are format or extend characters at the
	95	# start of text, or immediately following another boundary. It groups them, in
	96	# the event there is more than one.
	97
	98	[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words,
	99	# with no special rule status value.
	100
	101	$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but
	102	$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character.
	103	$HangulSyllable {200}; # note: unlike its neighbors, this rule has no trailing $ExFm*
	104	$Hebrew_Letter $ExFm* {200};
	105	$Katakana $ExFm* {400}; # note: these status values override those from rule 5
	106	$Hiragana $ExFm* {400}; # by virtue of being numerically larger.
	107	$Ideographic $ExFm* {400}; #
	108
	109	#
	110	# rule 5
	111	# Do not break between most letters.
	112	# Note: "|" is the alternation operator and must not be backslash-escaped (an escaped "|" is a literal).
	113	($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
	114
	115	# rule 6 and 7 - do not break letters across a single $MidLetter, $MidNumLet or $Single_Quote
	116	($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
	117
	118	# rule 7a - keep a $Single_Quote attached to a preceding $Hebrew_Letter
	119	$Hebrew_Letter $ExFm* $Single_Quote {200};
	120
	121	# rule 7b and 7c - do not break Hebrew letters across a single $Double_Quote
	122	$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
	123
	124	# rule 8 - do not break between two $Numeric characters
	125
	126	$Numeric $ExFm* $Numeric;
	127
	128	# rule 9 - do not break between a letter and a following $Numeric
	129
	130	($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric;
	131
	132	# rule 10 - do not break between a $Numeric and a following letter
	133
	134	$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
	135
	136	# rule 11 and 12 - do not break $Numeric characters across a single $MidNum, $MidNumLet or $Single_Quote
	137
	138	$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
	139
	140	# rule 13
	141	# The rule status was changed from 300 to 400 to be consistent
	142	# with the $KanaKanji $KanaKanji rule.
	143	# See also TestRuleStatus in intltest/rbbiapts.cpp
	144	$Katakana $ExFm* $Katakana {400};
	145
	146	# rule 13a/b - (13a) rules end in $ExtendNumLet, (13b) rules begin with it
	147
	148	$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a)
	149	$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a)
	150	$Numeric $ExFm* $ExtendNumLet {100}; # (13a)
	151	$Katakana $ExFm* $ExtendNumLet {400}; # (13a)
	152	$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a)
	153
	154	$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b)
	155	$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b)
	156	$ExtendNumLet $ExFm* $Numeric {100}; # (13b)
	157	$ExtendNumLet $ExFm* $Katakana {400}; # (13b)
	158
	159	# rules 15 - 17
	160	# Pairs of Regional Indicators stay together.
	161	# With incoming rule chaining disabled by ^, this rule will match exactly two of them.
	162	# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
	163	#
	164	^$Regional_Indicator $ExFm* $Regional_Indicator;
	165
	166	# Special handling for CJK characters: chain them together for later dictionary segmentation.
	167	$HangulSyllable $HangulSyllable {200}; # adjacent Hangul syllables are kept together
	168	$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
	169
	170	# Rule 999
	171	# Match a single code point if no other rule applies.
	172	.;