]>
Commit | Line | Data |
---|---|---|
2ca993e8 | 1 | # |
f3c0d7a5 A |
2 | # Copyright (C) 2016 and later: Unicode, Inc. and others. |
3 | # License & terms of use: http://www.unicode.org/copyright.html | |
2ca993e8 A |
4 | # Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved. |
5 | ||
6 | # file: word.txt | |
7 | # | |
8 | # Reference Word Break rules for intltest rbbi/RBBIMonkeyTest | |
9 | # | |
10 | # Note: Rule syntax and the monkey test itself are still a work in progress. | |
11 | # They are expected to change with review and the addition of support for rule tailoring. | |
12 | ||
13 | ||
14 | type = word; # one of grapheme | word | line | sentence | |
15 | locale = en; | |
16 | ||
2ca993e8 A |
17 | |
18 | CR = [\p{Word_Break = CR}]; | |
19 | LF = [\p{Word_Break = LF}]; | |
20 | Newline = [\p{Word_Break = Newline}]; | |
f3c0d7a5 A |
21 | Extend = [\p{Word_Break = Extend}]; |
22 | ZWJ = [\p{Word_Break = ZWJ}]; | |
2ca993e8 | 23 | Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; |
f3c0d7a5 | 24 | Format = [\p{Word_Break = Format}]; |
2ca993e8 A |
25 | Katakana = [\p{Word_Break = Katakana}]; |
26 | Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; | |
27 | ALetter = [\p{Word_Break = ALetter}]; | |
28 | Single_Quote = [\p{Word_Break = Single_Quote}]; | |
29 | Double_Quote = [\p{Word_Break = Double_Quote}]; | |
30 | MidNumLet = [\p{Word_Break = MidNumLet}]; | |
31 | MidLetter = [\p{Word_Break = MidLetter} - [\:]]; | |
32 | MidNum = [\p{Word_Break = MidNum}]; | |
3d1f044b | 33 | Numeric = [[\p{Word_Break = Numeric}] [\uFF10-\uff19]]; # Patch for ICU-12079; |
2ca993e8 | 34 | ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; |
0f5d89e8 A |
35 | WSegSpace = [\p{Word_Break = WSegSpace}]; |
36 | Extended_Pict = [:ExtPict:]; | |
2ca993e8 | 37 | |
0f5d89e8 | 38 | # Define dictionary, with the effect that those characters don't appear in test data. |
2ca993e8 A |
39 | |
40 | Han = [:Han:]; | |
41 | Hiragana = [:Hiragana:]; | |
42 | ||
43 | Control = [\p{Grapheme_Cluster_Break = Control}]; | |
44 | HangulSyllable = [\uac00-\ud7a3]; | |
45 | ComplexContext = [:LineBreak = Complex_Context:]; | |
46 | KanaKanji = [Han Hiragana Katakana]; | |
47 | dictionaryCJK = [KanaKanji HangulSyllable]; | |
48 | dictionary = [ComplexContext dictionaryCJK]; | |
49 | ||
0f5d89e8 | 50 | # leave dictionary scripts out of ALetter |
2ca993e8 A |
51 | |
52 | ALetter = [ALetter - dictionary]; | |
53 | ||
54 | AHLetter = [ALetter Hebrew_Letter]; | |
55 | MidNumLetQ = [MidNumLet Single_Quote]; | |
56 | ExtFmt = [Extend Format ZWJ]; | |
57 | ||
58 | WB3: CR LF; | |
59 | WB3a: (Newline | CR | LF) ÷; | |
60 | WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines. | |
61 | # (but needed with UAX treat-as scheme.) | |
0f5d89e8 A |
62 | WB3c: ZWJ Extended_Pict; |
63 | WB3d: WSegSpace WSegSpace; | |
2ca993e8 A |
64 | |
65 | WB5: AHLetter ExtFmt* AHLetter; | |
66 | ||
67 | # includes both WB6 and WB7 | |
68 | WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter; | |
69 | ||
70 | WB7a: Hebrew_Letter ExtFmt* Single_Quote; | |
71 | WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # includes WB7c | |
72 | ||
73 | WB8: Numeric ExtFmt* Numeric; | |
74 | WB9: AHLetter ExtFmt* Numeric; | |
75 | WB10: Numeric ExtFmt* AHLetter; | |
76 | ||
77 | WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12 | |
78 | WB13: Katakana ExtFmt* Katakana; | |
79 | ||
80 | WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet; | |
81 | WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana); | |
82 | ||
f3c0d7a5 | 83 | # WB rules 15 - 17: pairs of Regional Indicators stay unbroken. |
2ca993e8 | 84 | # Interacts with WB3c. |
0f5d89e8 | 85 | WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ Extended_Pict; |
f3c0d7a5 | 86 | WB17: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷; |
2ca993e8 | 87 | |
f3c0d7a5 A |
88 | # Rule WB 999 Any ÷ Any |
89 | # Interacts with WB3c: do not break between ZWJ and Extended_Pict. | |
0f5d89e8 | 90 | WB999.1: . ExtFmt* ZWJ Extended_Pict; |
f3c0d7a5 | 91 | WB999.2: . ExtFmt* ÷; |
2ca993e8 | 92 |