git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/testdata/break

... / ...

Commit	Line	Data
	1	#
	2	# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
	3
	4	# file: word.txt
	5	#
	6	# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
	7	#
	8	# Note: Rule syntax and the monkey test itself are still a work in progress.
	9	# They are expected to change with review and the addition of support for rule tailoring.
	10
	11
	12	type = word; # one of grapheme \| word \| line \| sentence
	13	locale = en;
	14
	15	E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C2-\U0001F3C4\U0001F3C7\U0001F3CA-\U0001F3CC\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F574-\U0001F575\U0001F57A\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F6CC\U0001F918-\U0001F91E\U0001F926\U0001F930\U0001F933-\U0001F939\U0001F93C-\U0001F93E];
	16	E_Modifier = [\U0001F3FB-\U0001F3FF];
	17	ZWJ = [\u200D];
	18	GAZ = [\u2640\u2642\u2764\u2695-\u2696\u2708\U0001F308\U0001F33E\U0001F373\U0001F393\U0001F3A4\U0001F3A8\U0001F3EB\U0001F3ED\U0001F466-\U0001F469\U0001F48B\U0001F4BB-\U0001F4BC\U0001F527\U0001F52C\U0001F5E8\U0001F680\U0001F692];
	19
	20	CR = [\p{Word_Break = CR}];
	21	LF = [\p{Word_Break = LF}];
	22	Newline = [\p{Word_Break = Newline}];
	23	Extend = [[[\p{Word_Break = Extend}][:Block=Tags:]]-ZWJ];
	24	Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
	25	Format = [[\p{Word_Break = Format}]-[:Block=Tags:]];
	26	Katakana = [\p{Word_Break = Katakana}];
	27	Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
	28	ALetter = [\p{Word_Break = ALetter}];
	29	Single_Quote = [\p{Word_Break = Single_Quote}];
	30	Double_Quote = [\p{Word_Break = Double_Quote}];
	31	MidNumLet = [\p{Word_Break = MidNumLet}];
	32	MidLetter = [\p{Word_Break = MidLetter} - [\:]];
	33	MidNum = [\p{Word_Break = MidNum}];
	34	Numeric = [\p{Word_Break = Numeric}];
	35	ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
	36
	37	# Define dictionary, with the effect being that those characters don't appear in test data.
	38
	39	Han = [:Han:];
	40	Hiragana = [:Hiragana:];
	41
	42	Control = [\p{Grapheme_Cluster_Break = Control}];
	43	HangulSyllable = [\uac00-\ud7a3];
	44	ComplexContext = [:LineBreak = Complex_Context:];
	45	KanaKanji = [Han Hiragana Katakana];
	46	dictionaryCJK = [KanaKanji HangulSyllable];
	47	dictionary = [ComplexContext dictionaryCJK];
	48
	49	# leave CJK scripts out of ALetterPlus
	50	# Tricky. Redefine a set.
	51	# For tailorings, if it modifies itself, do at end of sets ????
	52	# Tweak redefine to mean replace existing definition at its original location.
	53	# Insert defs without redefine just after last pre-existing def of that name.
	54	# Maybe drop redefine, add warning for sets defined and not used, should catch typos.
	55
	56	ALetter = [ALetter - dictionary];
	57
	58	AHLetter = [ALetter Hebrew_Letter];
	59	MidNumLetQ = [MidNumLet Single_Quote];
	60	ExtFmt = [Extend Format ZWJ];
	61
	62	WB3: CR LF;
	63	WB3a: (Newline \| CR \| LF) ÷;
	64	WB3b: . ÷ (Newline \| CR \| LF); # actually redundant? No other rule combines.
	65	# (but needed with UAX treat-as scheme.)
	66	WB3c: ZWJ GAZ;
	67
	68	WB5: AHLetter ExtFmt* AHLetter;
	69
	70	# includes both WB6 and WB7
	71	WB6: AHLetter ExtFmt* (MidLetter \| MidNumLetQ) ExtFmt* AHLetter;
	72
	73	WB7a: Hebrew_Letter ExtFmt* Single_Quote;
	74	WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # Include WB7c
	75
	76	WB8: Numeric ExtFmt* Numeric;
	77	WB9: AHLetter ExtFmt* Numeric;
	78	WB10: Numeric ExtFmt* AHLetter;
	79
	80	WB11: Numeric ExtFmt* (MidNum \| MidNumLetQ) ExtFmt* Numeric; # includes WB12
	81	WB13: Katakana ExtFmt* Katakana;
	82
	83	WB13a: (AHLetter \| Numeric \| Katakana \| ExtendNumLet) ExtFmt* ExtendNumLet;
	84	WB13b: ExtendNumLet ExtFmt* (AHLetter \| Numeric \| Katakana);
	85
	86	# WB rule 13c, pairs of Regional Indicators stay unbroken.
	87	# Interacts with WB3c.
	88	WB13c.1: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ GAZ;
	89	WB13c.2: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
	90
	91	WB13d: (E_Base \| GAZ) ExtFmt* E_Modifier;
	92
	93	# Rule WB 14 Any ÷ Any
	94	# Interacts with WB3c, do not break between ZWJ and GAZ.
	95	WB14.1: . ExtFmt* ZWJ GAZ;
	96	WB14.2: . ExtFmt* ÷;
	97