]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/testdata/break_rules/sentence.txt
ICU-66108.tar.gz
[apple/icu.git] / icuSources / test / testdata / break_rules / sentence.txt
CommitLineData
2ca993e8 1#
f3c0d7a5
A
2# Copyright (C) 2016 and later: Unicode, Inc. and others.
3# License & terms of use: http://www.unicode.org/copyright.html#License
4
2ca993e8
A
5# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
6# file: sentence.txt
7
8type = sentence; # one of grapheme | word | line | sentence
9locale = en;
10
11CR = [\p{Sentence_Break = CR}]; # this and the following sets: one named set per Sentence_Break property value
12LF = [\p{Sentence_Break = LF}];
13Extend = [\p{Sentence_Break = Extend}];
14Sep = [\p{Sentence_Break = Sep}];
15Format = [\p{Sentence_Break = Format}];
16Sp = [\p{Sentence_Break = Sp}];
17Lower = [\p{Sentence_Break = Lower}];
18Upper = [\p{Sentence_Break = Upper}];
19OLetter = [\p{Sentence_Break = OLetter}];
20Numeric = [\p{Sentence_Break = Numeric}];
21ATerm = [\p{Sentence_Break = ATerm}];
22SContinue = [\p{Sentence_Break = SContinue}];
23STerm = [\p{Sentence_Break = STerm}];
24Close = [\p{Sentence_Break = Close}];
25
26ParaSep = [Sep CR LF]; # union of the Sep, CR and LF sets
27SATerm = [STerm ATerm]; # union of the STerm and ATerm sets
28ExtFmt = [Extend Format]; # union of Extend and Format; the rules below write ExtFmt* wherever these may be interleaved (see SB5)
29
30# SB2: ÷ eot
31# Conventional regular expression matching for '$' as end-of-text also matches
32# at a line separator just preceding the physical end of text.
33# Instead, use a look-ahead assertion that there is no following character.
34SB2: . ÷ (?!.);
35
36SB3: CR LF; # no ÷ in this rule: presumably keeps a CR LF pair unbroken (UAX #29 SB3) - confirm against the rule-syntax docs
37SB4: ParaSep ÷; # ÷ placed after any ParaSep character
38
39# SB5: ignore Format and Extend characters.
40# There is no standalone SB5 rule here; the rules below carry ExtFmt* after each element instead.
41SB6: ATerm ExtFmt* Numeric; # ATerm directly followed by Numeric, no ÷
42SB7: (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper; # letter, ATerm, then Upper, no ÷
43SB8: ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower; # ATerm, optional Close/Sp runs, any run of non-letter non-terminator characters, then Lower, no ÷
44SB8a: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm); # terminator, optional Close/Sp runs, then SContinue or another terminator, no ÷
45
46SB9: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷; # ÷ after terminator plus trailing Close, Sp and at most one paragraph separator (CR LF counted as one)
47 # Also covers SB10, SB11.
48
49SB12: . ExtFmt* [^ExtFmt]?; # fallback with no ÷: any character, its trailing ExtFmt run, and optionally one following non-ExtFmt character
50