]>
Commit | Line | Data |
---|---|---|
2ca993e8 | 1 | # |
f3c0d7a5 A |
2 | # Copyright (C) 2016 and later: Unicode, Inc. and others. |
3 | # License & terms of use: http://www.unicode.org/copyright.html | |
2ca993e8 A |
4 | # Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved. |
5 | ||
6 | # file: word_POSIX.txt | |
7 | # | |
8 | # Reference Word Break rules for intltest rbbi/RBBIMonkeyTest | |
9 | # | |
10 | # Note: Rule syntax and the monkey test itself are still a work in progress. | |
11 | # They are expected to change with review and the addition of support for rule tailoring. | |
12 | ||
13 | type = word; # one of grapheme | word | line | sentence | |
14 | locale = en_US_POSIX; | |
15 | ||
340931cb | 16 | Han = [:Han:]; |
2ca993e8 A |
17 | |
18 | CR = [\p{Word_Break = CR}]; | |
19 | LF = [\p{Word_Break = LF}]; | |
20 | Newline = [\p{Word_Break = Newline}]; | |
340931cb | 21 | Extend = [\p{Word_Break = Extend}-Han]; |
f3c0d7a5 | 22 | ZWJ = [\p{Word_Break = ZWJ}]; |
2ca993e8 | 23 | Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; |
f3c0d7a5 | 24 | Format = [\p{Word_Break = Format}]; |
2ca993e8 A |
25 | Katakana = [\p{Word_Break = Katakana}]; |
26 | Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; | |
27 | ALetter = [\p{Word_Break = ALetter}]; | |
28 | Single_Quote = [\p{Word_Break = Single_Quote}]; | |
29 | Double_Quote = [\p{Word_Break = Double_Quote}]; | |
30 | MidNumLet = [\p{Word_Break = MidNumLet} - [.]]; | |
31 | MidLetter = [\p{Word_Break = MidLetter} - [\:]]; | |
32 | MidNum = [\p{Word_Break = MidNum} [.]]; | |
340931cb | 33 | Numeric = [\p{Word_Break = Numeric}]; |
2ca993e8 | 34 | ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; |
0f5d89e8 A |
35 | WSegSpace = [\p{Word_Break = WSegSpace}]; |
36 | Extended_Pict = [:ExtPict:]; | |
2ca993e8 | 37 | |
0f5d89e8 | 38 | # Define the dictionary set, with the effect that those characters don't appear in test data. |
2ca993e8 | 39 | |
2ca993e8 A |
40 | Hiragana = [:Hiragana:]; |
41 | ||
42 | Control = [\p{Grapheme_Cluster_Break = Control}]; | |
43 | HangulSyllable = [\uac00-\ud7a3]; | |
44 | ComplexContext = [:LineBreak = Complex_Context:]; | |
45 | KanaKanji = [Han Hiragana Katakana]; | |
46 | dictionaryCJK = [KanaKanji HangulSyllable]; | |
47 | dictionary = [ComplexContext dictionaryCJK]; | |
48 | ||
0f5d89e8 | 49 | # leave dictionary scripts out of ALetter |
2ca993e8 A |
50 | |
51 | ALetter = [ALetter - dictionary]; | |
52 | ||
53 | AHLetter = [ALetter Hebrew_Letter]; | |
54 | MidNumLetQ = [MidNumLet Single_Quote]; | |
55 | ExtFmt = [Extend Format ZWJ]; | |
56 | ||
57 | WB3: CR LF; | |
58 | WB3a: (Newline | CR | LF) ÷; | |
59 | WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines. | |
60 | # (but needed with UAX treat-as scheme.) | |
0f5d89e8 A |
61 | WB3c: ZWJ Extended_Pict; |
62 | WB3d: WSegSpace WSegSpace; | |
2ca993e8 A |
63 | |
64 | WB5: AHLetter ExtFmt* AHLetter; | |
65 | ||
66 | # includes both WB6 and WB7 | |
67 | WB6: AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt* AHLetter; | |
68 | ||
69 | WB7a: Hebrew_Letter ExtFmt* Single_Quote; | |
70 | WB7b: Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter; # includes WB7c | |
71 | ||
72 | WB8: Numeric ExtFmt* Numeric; | |
73 | WB9: AHLetter ExtFmt* Numeric; | |
74 | WB10: Numeric ExtFmt* AHLetter; | |
75 | ||
76 | WB11: Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric; # includes WB12 | |
77 | WB13: Katakana ExtFmt* Katakana; | |
78 | ||
79 | WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet; | |
80 | WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana); | |
81 | ||
f3c0d7a5 | 82 | # WB rule 15 - 17, pairs of Regional Indicators stay unbroken. |
2ca993e8 | 83 | # Interacts with WB3c. |
0f5d89e8 | 84 | WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ Extended_Pict; |
f3c0d7a5 | 85 | WB17: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷; |
2ca993e8 | 86 | |
f3c0d7a5 A |
87 | # Rule WB 999 Any ÷ Any |
88 | # Interacts with WB3c: do not break between ZWJ and Extended_Pict. | |
0f5d89e8 | 89 | WB999.1: . ExtFmt* ZWJ Extended_Pict; |
f3c0d7a5 | 90 | WB999.2: . ExtFmt* ÷; |
2ca993e8 | 91 |