]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/testdata/break_rules/sentence.txt
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / test / testdata / break_rules / sentence.txt
1 #
2 # Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
3 # file: sentence.txt
4
5 type = sentence; # kind of boundary this rule set describes, one of grapheme | word | line | sentence
6 locale = en; # locale this rule set is declared for
7
8 CR = [\p{Sentence_Break = CR}]; # each set in this list holds the characters whose Sentence_Break property has the like-named value
9 LF = [\p{Sentence_Break = LF}];
10 Extend = [\p{Sentence_Break = Extend}];
11 Sep = [\p{Sentence_Break = Sep}];
12 Format = [\p{Sentence_Break = Format}];
13 Sp = [\p{Sentence_Break = Sp}];
14 Lower = [\p{Sentence_Break = Lower}];
15 Upper = [\p{Sentence_Break = Upper}];
16 OLetter = [\p{Sentence_Break = OLetter}];
17 Numeric = [\p{Sentence_Break = Numeric}];
18 ATerm = [\p{Sentence_Break = ATerm}];
19 SContinue = [\p{Sentence_Break = SContinue}];
20 STerm = [\p{Sentence_Break = STerm}];
21 Close = [\p{Sentence_Break = Close}];
22 # Derived sets: unions of the sets above, used as shorthand by the rules that follow.
23 ParaSep = [Sep CR LF]; # paragraph separators: Sep, CR or LF
24 SATerm = [STerm ATerm]; # sentence terminators: STerm or ATerm
25 ExtFmt = [Extend Format]; # Extend or Format characters, the ones the SB5 note says to ignore
26
27 # SB2: ÷ eot (÷ marks a break position, so this is a break at the end of the text)
28 # Conventional regular expression matching for '$' as end-of-text also matches
29 # at a line separator just preceding the physical end of text.
30 # Instead, use a look-ahead assertion, (?!.), that there is no following character.
31 SB2: . ÷ (?!.);
32
33 SB3: CR LF; # no ÷ in this rule, so a CR followed by LF is matched as a unit with no break between them
34 SB4: ParaSep ÷; # break after any paragraph separator (Sep, CR or LF)
35
36 # SB5: ignore Format and Extend characters.
37 # SB5 has no rule of its own here. It is realised by the ExtFmt* terms in the rules below. None of SB6 to SB8a contains ÷, so each describes a context matched without a break.
38 SB6: ATerm ExtFmt* Numeric; # ATerm directly followed by a Numeric character
39 SB7: (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper; # ATerm between an Upper or Lower character and an Upper character
40 SB8: ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower; # ATerm, optional Close and Sp runs, then any characters that are not letters, paragraph separators or terminators, then Lower
41 SB8a: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm); # terminator, optional Close and Sp runs, then SContinue or another terminator
42
43 SB9: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷;
44 # Also covers SB10, SB11: the break is placed after the terminator, any following Close and Sp characters, and at most one paragraph separator (CR LF counting as one).
45
46 SB12: . ExtFmt* [^ExtFmt]?; # final rule, no ÷: any character, its trailing ExtFmt characters, and optionally one following non-ExtFmt character (presumably the UAX 29 default of no break between any two characters)
47