git.saurik.com Git - apple/icu.git/blame - icuSources/data/brkitr/rules/word_fi

Commit	Line	Data
f3c0d7a5 A	1	#
	2	# Copyright (C) 2016 and later: Unicode, Inc. and others.
	3	# License & terms of use: http://www.unicode.org/copyright.html
0f5d89e8	4	# Copyright (C) 2002-2016, International Business Machines Corporation
374ca955	5	# and others. All Rights Reserved.
b75a7d8f	6	#
2ca993e8	7	# file: word_fi_sv.txt
b75a7d8f	8	#
2ca993e8	9	# ICU Word Break Rules, fi/sv locales (these are actually the standard UAX #29 rules)
b75a7d8f	10	# See Unicode Standard Annex #29.
3d1f044b	11	# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
b75a7d8f	12	#
73c04bcf	13	# Note: Updates to word.txt will usually need to be merged into
51004dcb	14	# word_POSIX.txt also.
b75a7d8f	15
374ca955	16	##############################################################################
b75a7d8f A	17	#
	18	# Character class definitions from TR 29
	19	#
374ca955 A	20	##############################################################################
	21
	22	!!chain;
0f5d89e8	23	!!quoted_literals_only;
374ca955	24
b75a7d8f A	25
	26	#
	27	# Character Class Definitions.
b75a7d8f	28	#
b75a7d8f	29
57a6839d A	30	$CR = [\p{Word_Break = CR}];
57a6839d A	31	$LF = [\p{Word_Break = LF}];
2ca993e8	32	$Newline = [\p{Word_Break = Newline} ];
f3c0d7a5 A	33	$Extend = [\p{Word_Break = Extend}];
f3c0d7a5 A	34	$ZWJ = [\p{Word_Break = ZWJ}];
2ca993e8	35	$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
f3c0d7a5	36	$Format = [\p{Word_Break = Format}];
57a6839d A	37	$Katakana = [\p{Word_Break = Katakana}];
	38	$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
	39	$ALetter = [\p{Word_Break = ALetter}];
	40	$Single_Quote = [\p{Word_Break = Single_Quote}];
	41	$Double_Quote = [\p{Word_Break = Double_Quote}];
	42	$MidNumLet = [\p{Word_Break = MidNumLet}];
	43	$MidLetter = [\p{Word_Break = MidLetter}];
	44	$MidNum = [\p{Word_Break = MidNum}];
3d1f044b	45	$Numeric = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]]; # Patch for ICU-12079: also treat U+FF10..U+FF19 (fullwidth digits) as Numeric
57a6839d	46	$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
0f5d89e8	47	$WSegSpace = [\p{Word_Break = WSegSpace}];
340931cb	48	$Extended_Pict = [\p{Extended_Pictographic}];
57a6839d A	49
	50	$Han = [:Han:];
	51	$Hiragana = [:Hiragana:];
340931cb	52	$Ideographic = [\p{Ideographic}];
73c04bcf A	53
73c04bcf A	54
73c04bcf A	55	# Dictionary character set, for triggering language-based break engines: the union
	56	# of LineBreak=Complex_Context and the CJK sets (Han, Hiragana, Katakana, Hangul
	57	# syllables). Note that this set only works in Unicode 5.0 or later, as the definition
	58	# of Complex_Context was corrected to include all characters requiring dictionary break.
	59
3d1f044b	60	$Control = [\p{Grapheme_Cluster_Break = Control}];
51004dcb A	61	$HangulSyllable = [\uac00-\ud7a3];
	62	$ComplexContext = [:LineBreak = Complex_Context:];
	63	$KanaKanji = [$Han $Hiragana $Katakana];
	64	$dictionaryCJK = [$KanaKanji $HangulSyllable];
	65	$dictionary = [$ComplexContext $dictionaryCJK];
	66
340931cb A	67	# TODO: check if handling of katakana in dictionary makes rules incorrect/void
340931cb A	68
51004dcb A	69	# leave CJK scripts out of ALetterPlus, but add Complex_Context characters that are neither Extend nor Control
	70	$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
	71
73c04bcf	72
374ca955	73	## -------------------------------------------------
b75a7d8f	74
73c04bcf	75	# Rule 3 - CR x LF
46f4442e A	76	#
46f4442e A	77	$CR $LF;
73c04bcf	78
340931cb A	79	# Rule 3c Do not break within emoji zwj sequences.
340931cb A	80	# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed.
2ca993e8	81	#
0f5d89e8	82	$ZWJ $Extended_Pict;
2ca993e8	83
0f5d89e8 A	84	# Rule 3d - Keep horizontal whitespace together.
	85	#
	86	$WSegSpace $WSegSpace;
2ca993e8	87
73c04bcf	88	# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
340931cb A	89	# of a region of Text.
	90
	91	$ExFm = [$Extend $Format $ZWJ];
	92
	93	^$ExFm+; # This rule fires only when there are format or extend characters at the
	94	# start of text, or immediately following another boundary. It groups them, in
	95	# the event there is more than one.
b75a7d8f	96
340931cb A	97	[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words,
	98	# with no special rule status value.
	99
	100	$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but
	101	$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character.
	102	$HangulSyllable {200};
	103	$Hebrew_Letter $ExFm* {200};
	104	$Katakana $ExFm* {400}; # note: these status values override those from rule 5
	105	$Hiragana $ExFm* {400}; # by virtue of being numerically larger.
	106	$Ideographic $ExFm* {400}; #
2ca993e8	107
46f4442e	108	#
374ca955	109	# rule 5
46f4442e A	110	# Do not break between most letters.
46f4442e A	111	#
340931cb	112	($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
b75a7d8f	113
374ca955	114	# rule 6 and 7 - do not break letters across a single MidLetter, MidNumLet or Single_Quote
340931cb	115	($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
57a6839d A	116
57a6839d A	117	# rule 7a - do not break between a Hebrew letter and a following single quote
340931cb	118	$Hebrew_Letter $ExFm* $Single_Quote {200};
57a6839d A	119
57a6839d A	120	# rule 7b and 7c - do not break Hebrew letters across a double quote
340931cb	121	$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
b75a7d8f	122
374ca955	123	# rule 8 - do not break within sequences of digits
b75a7d8f	124
340931cb	125	$Numeric $ExFm* $Numeric;
b75a7d8f	126
374ca955	127	# rule 9 - do not break between a letter and a following digit
b75a7d8f	128
340931cb	129	($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric;
b75a7d8f	130
374ca955	131	# rule 10 - do not break between a digit and a following letter
b75a7d8f	132
340931cb	133	$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
b75a7d8f	134
3d1f044b	135	# rule 11 and 12 - do not break digits across a single MidNum, MidNumLet or Single_Quote
374ca955	136
340931cb	137	$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
374ca955 A	138
374ca955 A	139	# rule 13
51004dcb A	140	# to be consistent with $KanaKanji $KanaKanji, rule status changed
	141	# from 300 to 400.
	142	# See also TestRuleStatus in intltest/rbbiapts.cpp
340931cb	143	$Katakana $ExFm* $Katakana {400};
374ca955 A	144
	145	# rule 13a/b
	146
340931cb A	147	$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a)
	148	$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a)
	149	$Numeric $ExFm* $ExtendNumLet {100}; # (13a)
	150	$Katakana $ExFm* $ExtendNumLet {400}; # (13a)
	151	$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a)
374ca955	152
340931cb A	153	$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b)
	154	$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b)
	155	$ExtendNumLet $ExFm* $Numeric {100}; # (13b)
	156	$ExtendNumLet $ExFm* $Katakana {400}; # (13b)
51004dcb	157
f3c0d7a5	158	# rules 15 - 17
2ca993e8	159	# Pairs of Regional Indicators stay together.
340931cb	160	# With incoming rule chaining disabled by ^, this rule will match exactly two of them.
2ca993e8 A	161	# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
2ca993e8 A	162	#
340931cb	163	^$Regional_Indicator $ExFm* $Regional_Indicator;
51004dcb A	164
	165	# special handling for CJK characters: chain adjacent Hangul syllables, and adjacent kana/kanji, for later dictionary segmentation
	166	$HangulSyllable $HangulSyllable {200};
3d1f044b	167	$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
374ca955	168
0f5d89e8 A	169	# Rule 999
	170	# Match a single code point if no other rule applies.
	171	.;