git.saurik.com Git - apple/icu.git/blame - icuSources/data/brkitr/rules/word

Commit	Line	Data
0f5d89e8	1	#
f3c0d7a5 A	2	# Copyright (C) 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	3	# License & terms of use: http://www.unicode.org/copyright.html
0f5d89e8	4	# Copyright (C) 2002-2016, International Business Machines Corporation
374ca955 A	5	# and others. All Rights Reserved.
374ca955 A	6	#
46f4442e	7	# file: word_POSIX.txt
374ca955	8	#
73c04bcf	9	# ICU Word Break Rules, POSIX locale.
374ca955	10	# See Unicode Standard Annex #29.
3d1f044b	11	# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
374ca955	12	#
73c04bcf	13	# Note: Updates to word.txt will usually need to be merged into
51004dcb	14	# word_POSIX.txt also.
374ca955 A	15
	16	##############################################################################
	17	#
	18	# Character class definitions from TR 29
	19	#
	20	##############################################################################
	21
	22	!!chain;
0f5d89e8	23	!!quoted_literals_only;
374ca955	24
374ca955 A	25
	26	#
	27	# Character Class Definitions.
374ca955 A	28	#
374ca955 A	29
340931cb A	30	$Han = [:Han:];
340931cb A	31
57a6839d A	32	$CR = [\p{Word_Break = CR}];
57a6839d A	33	$LF = [\p{Word_Break = LF}];
340931cb A	34	$Newline = [\p{Word_Break = Newline}];
340931cb A	35	$Extend = [\p{Word_Break = Extend}-$Han];
f3c0d7a5	36	$ZWJ = [\p{Word_Break = ZWJ}];
2ca993e8	37	$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
f3c0d7a5	38	$Format = [\p{Word_Break = Format}];
57a6839d A	39	$Katakana = [\p{Word_Break = Katakana}];
	40	$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
	41	$ALetter = [\p{Word_Break = ALetter}];
	42	$Single_Quote = [\p{Word_Break = Single_Quote}];
	43	$Double_Quote = [\p{Word_Break = Double_Quote}];
	44	$MidNumLet = [\p{Word_Break = MidNumLet} - [.]];
	45	$MidLetter = [\p{Word_Break = MidLetter} - [\:]];
	46	$MidNum = [\p{Word_Break = MidNum} [.]];
340931cb	47	$Numeric = [\p{Word_Break = Numeric}];
57a6839d	48	$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
0f5d89e8	49	$WSegSpace = [\p{Word_Break = WSegSpace}];
340931cb	50	$Extended_Pict = [\p{Extended_Pictographic}];
57a6839d	51
57a6839d	52	$Hiragana = [:Hiragana:];
340931cb	53	$Ideographic = [\p{Ideographic}];
73c04bcf A	54
73c04bcf A	55
73c04bcf A	56	# Dictionary character set, for triggering language-based break engines. Currently
	57	# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
	58	# 5.0 or later as the definition of Complex_Context was corrected to include all
	59	# characters requiring dictionary break.
	60
0f5d89e8	61	$Control = [\p{Grapheme_Cluster_Break = Control}];
51004dcb A	62	$HangulSyllable = [\uac00-\ud7a3];
	63	$ComplexContext = [:LineBreak = Complex_Context:];
	64	$KanaKanji = [$Han $Hiragana $Katakana];
	65	$dictionaryCJK = [$KanaKanji $HangulSyllable];
	66	$dictionary = [$ComplexContext $dictionaryCJK];
	67
340931cb A	68	# TODO: check if handling of katakana in dictionary makes rules incorrect/void
340931cb A	69
51004dcb A	70	# leave CJK scripts out of ALetterPlus
	71	$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
	72
73c04bcf	73
374ca955 A	74	## -------------------------------------------------
374ca955 A	75
73c04bcf	76	# Rule 3 - CR x LF
46f4442e A	77	#
46f4442e A	78	$CR $LF;
73c04bcf	79
340931cb A	80	# Rule 3c Do not break within emoji zwj sequences.
340931cb A	81	# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed.
2ca993e8	82	#
0f5d89e8	83	$ZWJ $Extended_Pict;
2ca993e8	84
0f5d89e8 A	85	# Rule 3d - Keep horizontal whitespace together.
	86	#
	87	$WSegSpace $WSegSpace;
2ca993e8	88
73c04bcf	89	# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
340931cb A	90	# of a region of Text.
	91
	92	$ExFm = [$Extend $Format $ZWJ];
374ca955	93
340931cb A	94	^$ExFm+; # This rule fires only when there are format or extend characters at the
	95	# start of text, or immediately following another boundary. It groups them, in
	96	# the event there is more than one.
	97
	98	[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words,
	99	# with no special rule status value.
	100
	101	$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but
	102	$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character.
	103	$HangulSyllable {200};
	104	$Hebrew_Letter $ExFm* {200};
	105	$Katakana $ExFm* {400}; # note: these status values override those from rule 5
	106	$Hiragana $ExFm* {400}; # by virtue of being numerically larger.
	107	$Ideographic $ExFm* {400}; #
2ca993e8	108
46f4442e	109	#
374ca955	110	# rule 5
46f4442e A	111	# Do not break between most letters ($ALetterPlus or $Hebrew_Letter on either side).
46f4442e A	112	#
340931cb	113	($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
374ca955 A	114
374ca955 A	115	# rule 6 and 7 - do not break letters across a single $MidLetter, $MidNumLet or $Single_Quote.
340931cb	116	($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
57a6839d A	117
57a6839d A	118	# rule 7a
340931cb	119	$Hebrew_Letter $ExFm* $Single_Quote {200};
57a6839d A	120
57a6839d A	121	# rule 7b and 7c
340931cb	122	$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
374ca955 A	123
	124	# rule 8
	125
340931cb	126	$Numeric $ExFm* $Numeric;
374ca955 A	127
	128	# rule 9 - do not break between a letter and a following $Numeric.
	129
340931cb	130	($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric;
374ca955 A	131
	132	# rule 10 - do not break between a $Numeric and a following letter.
	133
340931cb	134	$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
374ca955	135
0f5d89e8	136	# rule 11 and 12 - do not break $Numeric sequences across a single $MidNum, $MidNumLet or $Single_Quote.
374ca955	137
340931cb	138	$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
374ca955 A	139
374ca955 A	140	# rule 13
51004dcb A	141	# to be consistent with $KanaKanji $KanaKanji, changed
	142	# from 300 to 400.
	143	# See also TestRuleStatus in intltest/rbbiapts.cpp
340931cb	144	$Katakana $ExFm* $Katakana {400};
374ca955 A	145
	146	# rule 13a/b
	147
340931cb A	148	$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a)
	149	$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a)
	150	$Numeric $ExFm* $ExtendNumLet {100}; # (13a)
	151	$Katakana $ExFm* $ExtendNumLet {400}; # (13a)
	152	$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a)
374ca955	153
340931cb A	154	$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b)
	155	$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b)
	156	$ExtendNumLet $ExFm* $Numeric {100}; # (13b)
	157	$ExtendNumLet $ExFm* $Katakana {400}; # (13b)
51004dcb	158
f3c0d7a5	159	# rules 15 - 17
2ca993e8	160	# Pairs of Regional Indicators stay together.
340931cb	161	# With incoming rule chaining disabled by ^, this rule will match exactly two of them.
2ca993e8 A	162	# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
2ca993e8 A	163	#
340931cb	164	^$Regional_Indicator $ExFm* $Regional_Indicator;
51004dcb A	165
	166	# special handling for CJK characters: chain for later dictionary segmentation
	167	$HangulSyllable $HangulSyllable {200};
0f5d89e8	168	$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
2ca993e8	169
0f5d89e8 A	170	# Rule 999
	171	# Match a single code point if no other rule applies.
	172	.;