]>
Commit | Line | Data |
---|---|---|
2ca993e8 A |
1 | # |
2 | # Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved. | |
3 | # file: sentence.txt | |
4 | ||
5 | type = sentence; # kind of boundary the rules in this file describe; one of grapheme | word | line | sentence | |
6 | locale = en; | |
7 | ||
8 | CR = [\p{Sentence_Break = CR}]; | |
9 | LF = [\p{Sentence_Break = LF}]; | |
10 | Extend = [\p{Sentence_Break = Extend}]; | |
11 | Sep = [\p{Sentence_Break = Sep}]; | |
12 | Format = [\p{Sentence_Break = Format}]; | |
13 | Sp = [\p{Sentence_Break = Sp}]; | |
14 | Lower = [\p{Sentence_Break = Lower}]; | |
15 | Upper = [\p{Sentence_Break = Upper}]; | |
16 | OLetter = [\p{Sentence_Break = OLetter}]; | |
17 | Numeric = [\p{Sentence_Break = Numeric}]; | |
18 | ATerm = [\p{Sentence_Break = ATerm}]; | |
19 | SContinue = [\p{Sentence_Break = SContinue}]; | |
20 | STerm = [\p{Sentence_Break = STerm}]; | |
21 | Close = [\p{Sentence_Break = Close}]; | |
22 | ||
23 | ParaSep = [Sep CR LF]; | |
24 | SATerm = [STerm ATerm]; | |
25 | ExtFmt = [Extend Format]; | |
26 | ||
27 | # SB2: ÷ eot (always break at the end of the text). | |
28 | # Conventional regular expression matching for '$' as end-of-text also matches | |
29 | # at a line separator just preceding the physical end of text. | |
30 | # Instead, use a look-ahead assertion that there is no following character: (?!.) succeeds only when nothing follows. | |
31 | SB2: . ÷ (?!.); | |
32 | ||
33 | SB3: CR LF; | |
34 | SB4: ParaSep ÷; | |
35 | ||
36 | # SB5: ignore Format and Extend characters. The rules below do this by allowing a run of ExtFmt (ExtFmt*) between the characters they match. | |
37 | ||
38 | SB6: ATerm ExtFmt* Numeric; | |
39 | SB7: (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper; | |
40 | SB8: ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower; | |
41 | SB8a: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm); | |
42 | ||
43 | SB9: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷; | |
44 | # Also covers SB10, SB11: the trailing Close*, Sp* and optional paragraph separator are all consumed before the single ÷ in the SB9 rule above. | |
45 | ||
46 | SB12: . ExtFmt* [^ExtFmt]?; | |
47 |