]>
Commit | Line | Data |
---|---|---|
1 | # | |
2 | # Copyright (C) 2016 and later: Unicode, Inc. and others. | |
3 | # License & terms of use: http://www.unicode.org/copyright.html | |
4 | # Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved. | |
5 | ||
6 | # file: word.txt | |
7 | # | |
8 | # Reference Word Break rules for intltest rbbi/RBBIMonkeyTest | |
9 | # | |
10 | # Note: Rule syntax and the monkey test itself are still a work in progress. | |
11 | # They are expected to change with review and the addition of support for rule tailoring. | |
12 | ||
13 | ||
14 | type = word; # one of grapheme | word | line | sentence | |
15 | locale = en; | |
16 | ||
17 | Han = [:Han:]; | |
18 | ||
19 | CR = [\p{Word_Break = CR}]; | |
20 | LF = [\p{Word_Break = LF}]; | |
21 | Newline = [\p{Word_Break = Newline}]; | |
22 | Extend = [\p{Word_Break = Extend}-Han]; | |
23 | ZWJ = [\p{Word_Break = ZWJ}]; | |
24 | Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; | |
25 | Format = [\p{Word_Break = Format}]; | |
26 | Katakana = [\p{Word_Break = Katakana}]; | |
27 | Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; | |
28 | ALetter = [\p{Word_Break = ALetter}]; | |
29 | Single_Quote = [\p{Word_Break = Single_Quote}]; | |
30 | Double_Quote = [\p{Word_Break = Double_Quote}]; | |
31 | MidNumLet = [\p{Word_Break = MidNumLet}]; | |
32 | MidLetter = [\p{Word_Break = MidLetter} - [\:]]; | |
33 | MidNum = [\p{Word_Break = MidNum}]; | |
34 | Numeric = [\p{Word_Break = Numeric}]; | |
35 | ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; | |
36 | WSegSpace = [\p{Word_Break = WSegSpace}]; | |
37 | Extended_Pict = [:ExtPict:]; | |
38 | ||
39 | # Define the dictionary set, with the effect being that those characters don't appear in test data. | |
40 | ||
41 | Hiragana = [:Hiragana:]; | |
42 | ||
43 | Control = [\p{Grapheme_Cluster_Break = Control}]; | |
44 | HangulSyllable = [\uac00-\ud7a3]; | |
45 | ComplexContext = [:LineBreak = Complex_Context:]; | |
46 | KanaKanji = [Han Hiragana Katakana]; | |
47 | dictionaryCJK = [KanaKanji HangulSyllable]; | |
48 | dictionary = [ComplexContext dictionaryCJK]; | |
49 | ||
50 | # leave dictionary scripts out of ALetter | |
51 | ||
52 | ALetter = [ALetter - dictionary]; | |
53 | ||
54 | AHLetter = [ALetter Hebrew_Letter]; | |
55 | MidNumLetQ = [MidNumLet Single_Quote]; | |
56 | ExtFmt = [Extend Format ZWJ]; | |
57 | ||
58 | WB3: CR LF; | |
59 | WB3a: (Newline | CR | LF) ÷; | |
60 | WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines. | |
61 | # (but needed with UAX treat-as scheme.) | |
62 | WB3c: ZWJ Extended_Pict; | |
63 | WB3d: WSegSpace WSegSpace; | |
64 | ||
65 | WB5: AHLetter ExtFmt* AHLetter; | |
66 | ||
67 | # includes both WB6 and WB7 | |
68 | WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter; | |
69 | ||
70 | WB7a: Hebrew_Letter ExtFmt* Single_Quote; | |
71 | WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # includes WB7c | |
72 | ||
73 | WB8: Numeric ExtFmt* Numeric; | |
74 | WB9: AHLetter ExtFmt* Numeric; | |
75 | WB10: Numeric ExtFmt* AHLetter; | |
76 | ||
77 | WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12 | |
78 | WB13: Katakana ExtFmt* Katakana; | |
79 | ||
80 | WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet; | |
81 | WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana); | |
82 | ||
83 | # WB rule 15 - 17, pairs of Regional Indicators stay unbroken. | |
84 | # Interacts with WB3c. | |
85 | WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ Extended_Pict; | |
86 | WB17: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷; | |
87 | ||
88 | # Rule WB 999 Any ÷ Any | |
89 | # Interacts with WB3c: do not break between ZWJ and Extended_Pict. | |
90 | WB999.1: . ExtFmt* ZWJ Extended_Pict; | |
91 | WB999.2: . ExtFmt* ÷; | |
92 |