]>
Commit | Line | Data |
---|---|---|
0f5d89e8 A |
1 | # Copyright (C) 2016 and later: Unicode, Inc. and others. |
2 | # License & terms of use: http://www.unicode.org/copyright.html | |
46f4442e | 3 | # |
2ca993e8 | 4 | # Copyright (C) 2002-2015, International Business Machines Corporation and others. |
46f4442e A |
5 | # All Rights Reserved. |
6 | # | |
7 | # file: sent_el.txt | |
8 | # | |
9 | # ICU Sentence Break Rules | |
10 | # See Unicode Standard Annex #29. | |
3d1f044b | 11 | # These rules are based on UAX #29 Revision 34 for Unicode Version 12.0 |
46f4442e A |
12 | # |
13 | ||
0f5d89e8 | 14 | !!quoted_literals_only; |
46f4442e A |
15 | |
16 | # | |
17 | # Character categories as defined in TR 29: one set per Sentence_Break property value. | |
18 | # $STerm is widened with U+003B and U+037E; presumably the Greek (el) tailoring - confirm. | |
19 | $CR = [\p{Sentence_Break = CR}]; | |
20 | $LF = [\p{Sentence_Break = LF}]; | |
21 | $Extend = [\p{Sentence_Break = Extend}]; | |
22 | $Sep = [\p{Sentence_Break = Sep}]; | |
23 | $Format = [\p{Sentence_Break = Format}]; | |
24 | $Sp = [\p{Sentence_Break = Sp}]; | |
25 | $Lower = [\p{Sentence_Break = Lower}]; | |
26 | $Upper = [\p{Sentence_Break = Upper}]; | |
27 | $OLetter = [\p{Sentence_Break = OLetter}]; | |
28 | $Numeric = [\p{Sentence_Break = Numeric}]; | |
29 | $ATerm = [\p{Sentence_Break = ATerm}]; | |
30 | $SContinue = [\p{Sentence_Break = SContinue}]; | |
31 | $STerm = [\p{Sentence_Break = STerm} [\u003B \u037E]]; | |
32 | $Close = [\p{Sentence_Break = Close}]; | |
33 | ||
34 | # | |
35 | # Define extended forms of the character classes: | |
36 | # each $XEx is one X character plus any run of trailing Extend or Format chars. | |
3d1f044b | 37 | # Rules 4 and 5. |
46f4442e A |
38 | |
39 | $SpEx = $Sp ($Extend | $Format)*; | |
40 | $LowerEx = $Lower ($Extend | $Format)*; | |
41 | $UpperEx = $Upper ($Extend | $Format)*; | |
42 | $OLetterEx = $OLetter ($Extend | $Format)*; | |
43 | $NumericEx = $Numeric ($Extend | $Format)*; | |
44 | $ATermEx = $ATerm ($Extend | $Format)*; | |
45 | $SContinueEx= $SContinue ($Extend | $Format)*; | |
46 | $STermEx = $STerm ($Extend | $Format)*; | |
47 | $CloseEx = $Close ($Extend | $Format)*; | |
48 | ||
49 | ||
50 | ## ------------------------------------------------- | |
51 | ||
52 | !!chain; |
46f4442e A |
53 | |
54 | # Rule 3 - Keep CR LF together: the pair is matched as a single unit. | |
55 | # | |
56 | $CR $LF; | |
57 | ||
58 | ||
59 | # Rule 4 - Break after $Sep. | |
60 | # Rule 5 - Ignore $Format and $Extend | |
61 | # One optional non-separator character absorbs the Extend and Format chars that follow it. | |
62 | [^$Sep $CR $LF]? ($Extend | $Format)*; | |
63 | ||
64 | ||
65 | # Rule 6 - ATerm followed by Numeric stays together. | |
66 | $ATermEx $NumericEx; | |
67 | ||
68 | # Rule 7 - Upper or Lower, then ATerm, then Upper stays together. | |
2ca993e8 | 69 | ($UpperEx | $LowerEx) $ATermEx $UpperEx; |
46f4442e A |
70 | |
71 | # Rule 8 - ATerm, Close*, Sp*, then any run of non-letter characters, then Lower. | |
72 | $NotLettersEx = [^$OLetter $Upper $Lower $Sep $CR $LF $ATerm $STerm] ($Extend | $Format)*; | |
73 | $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower; | |
74 | ||
75 | # Rule 8a - STerm or ATerm, Close*, Sp*, then SContinue, STerm or ATerm. | |
76 | ($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx); | |
77 | ||
78 | # Rules 9, 10, 11 - STerm or ATerm with trailing Close*, Sp* and an optional Sep, CR or LF. | |
79 | ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?; | |
80 | ||
3d1f044b | 81 | # Rule 998 - Otherwise, do not break; the form ending at Sep, LF, CR, CR LF or {eof} is tagged {100}. |
46f4442e A |
82 | [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .; |
83 | [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100}; |