# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html#License
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# Boundary type that the rules in this file describe.
type = sentence; # one of grapheme | word | line | sentence

# Base character classes: one set per value of the Unicode Sentence_Break
# property. The names on the left are the ones referenced by the derived
# sets and by the SB rules later in this file.
CR        = [\p{Sentence_Break = CR}];
LF        = [\p{Sentence_Break = LF}];
Extend    = [\p{Sentence_Break = Extend}];
Sep       = [\p{Sentence_Break = Sep}];
Format    = [\p{Sentence_Break = Format}];
Sp        = [\p{Sentence_Break = Sp}];
Lower     = [\p{Sentence_Break = Lower}];
Upper     = [\p{Sentence_Break = Upper}];
OLetter   = [\p{Sentence_Break = OLetter}];
Numeric   = [\p{Sentence_Break = Numeric}];
ATerm     = [\p{Sentence_Break = ATerm}];
SContinue = [\p{Sentence_Break = SContinue}];
STerm     = [\p{Sentence_Break = STerm}];
Close     = [\p{Sentence_Break = Close}];
# Derived sets: unions of the Sentence_Break classes defined earlier in this
# file. ParaSep and SATerm carry the same names as the macros used in the
# UAX #29 sentence rules; ExtFmt is a local shorthand for the characters that
# the SB5 "ignore" convention skips.
ParaSep = [Sep CR LF];
SATerm  = [STerm ATerm];
ExtFmt  = [Extend Format];
# In conventional regular-expression matching, '$' (end of text) also matches
# at a line separator just preceding the physical end of text.
# Instead, use a look-ahead assertion that there is no following character.
# SB5: ignore Format and Extend characters.
#      No stand-alone SB5 rule appears here; instead, the rules below accept
#      ExtFmt* after each element that can be followed by another element.
#
# SB6 through SB8a contain no '÷'. NOTE(review): presumably a match of such a
# rule means "no boundary inside the matched text" (the '×' of UAX #29);
# confirm against the driver that consumes this file.

# SB6: a terminator of class ATerm directly followed by a Numeric character.
SB6: ATerm ExtFmt* Numeric;

# SB7: a letter (Upper or Lower), then ATerm, then an Upper letter.
SB7: (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper;

# SB8: ATerm, optional Close and Sp runs, then any run of characters that are
#      none of OLetter / Upper / Lower / ParaSep / SATerm / ExtFmt, ending at
#      a Lower letter.
SB8: ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower;

# SB8a: any sentence terminator, optional Close and Sp runs, then either an
#       SContinue character or another sentence terminator.
SB8a: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm);
# SB9: the trailing '÷' places the boundary after the whole match: a sentence
#      terminator, any Close characters, any Sp characters, and at most one
#      paragraph separator (the alternation lists CR LF before ParaSep, so the
#      pair can be taken as a single unit). ExtFmt* is accepted after each
#      element except the optional separator.
SB9: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷;
# Also covers SB10, SB11.
# SB12: catch-all. Matches any one character, the ExtFmt characters that follow
#       it, and optionally the next non-ExtFmt character; it contains no '÷'.
#       NOTE(review): this appears to be the "Any × Any" fallback of UAX #29
#       (numbered SB12 in older revisions of the spec) - confirm.
SB12: . ExtFmt* [^ExtFmt]?;