]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/testdata/break_rules/sentence.txt
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / test / testdata / break_rules / sentence.txt
... / ...
CommitLineData
1#
2# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
3# file: sentence.txt
4
5type = sentence; # one of grapheme | word | line | sentence
6locale = en; # NOTE(review): presumably the locale of the break iterator these rules are checked against - confirm
7# Character classes: one named set per Sentence_Break property value, referenced by name in the rules.
8CR = [\p{Sentence_Break = CR}];
9LF = [\p{Sentence_Break = LF}];
10Extend = [\p{Sentence_Break = Extend}];
11Sep = [\p{Sentence_Break = Sep}];
12Format = [\p{Sentence_Break = Format}];
13Sp = [\p{Sentence_Break = Sp}];
14Lower = [\p{Sentence_Break = Lower}];
15Upper = [\p{Sentence_Break = Upper}];
16OLetter = [\p{Sentence_Break = OLetter}];
17Numeric = [\p{Sentence_Break = Numeric}];
18ATerm = [\p{Sentence_Break = ATerm}];
19SContinue = [\p{Sentence_Break = SContinue}];
20STerm = [\p{Sentence_Break = STerm}];
21Close = [\p{Sentence_Break = Close}];
22# Composite classes built from the sets defined above.
23ParaSep = [Sep CR LF]; # paragraph separators: any of Sep, CR, LF (used by SB4, SB8, SB9)
24SATerm = [STerm ATerm]; # either kind of sentence terminator (used by SB8, SB8a, SB9)
25ExtFmt = [Extend Format]; # the characters SB5 says to ignore; appears as ExtFmt* inside the rules
26# Break rules, written as "name: pattern;". NOTE(review): the ÷ in a pattern appears to mark a break position, and a rule without ÷ to mean "no break inside the match" - confirm against the rule-format documentation.
27# SB2: ÷ eot
28# Conventional regular expression matching for '$' as end-of-text also matches
29# at a line separator just preceding the physical end of text.
30# Instead, use a look-ahead assertion that there is no following character.
31SB2: . ÷ (?!.);
32
33SB3: CR LF; # UAX #29 SB3: CR × LF (keep a CR LF pair together)
34SB4: ParaSep ÷; # UAX #29 SB4: ParaSep ÷ (break after a paragraph separator)
35
36# SB5: ignore Format and Extend characters (expressed by the ExtFmt* that follows each element in SB6 through SB12).
37
38SB6: ATerm ExtFmt* Numeric; # UAX #29 SB6: ATerm × Numeric
39SB7: (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper; # UAX #29 SB7: (Upper | Lower) ATerm × Upper
40SB8: ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower; # UAX #29 SB8: ATerm Close* Sp* × (anything but OLetter/Upper/Lower/ParaSep/SATerm)* Lower
41SB8a: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm); # UAX #29 SB8a: SATerm Close* Sp* × (SContinue | SATerm)
42
43SB9: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷; # break after terminator + optional Close*, Sp*, and one optional paragraph separator (CR LF counted as one)
44 # Also covers SB10, SB11.
45
46SB12: . ExtFmt* [^ExtFmt]?; # fallback with no ÷: any character, its trailing ExtFmt*, then optionally one non-ExtFmt character
47