]>
Commit | Line | Data |
---|---|---|
b75a7d8f | 1 | # |
374ca955 | 2 | # Copyright (C) 2002-2004, International Business Machines Corporation and others. |
b75a7d8f A |
3 | # All Rights Reserved. |
4 | # | |
374ca955 | 5 | # file: sent.txt |
b75a7d8f A |
6 | # |
7 | # ICU Sentence Break Rules | |
8 | # See Unicode Standard Annex #29. | |
9 | # These rules are based on TR 29 version 4.0.0 | |
10 | # | |
374ca955 | 11 | |
b75a7d8f A |
12 | |
13 | # | |
14 | # Character categories as defined in TR 29 | |
15 | # | |
16 | $Sep = [\u000a \u000d \u0085 \u2028 \u2029]; | |
374ca955 | 17 | $Format = [[:Format:] - [:Grapheme_Extend:]]; |
b75a7d8f A |
18 | $Sp = [[:Whitespace:] - $Sep]; |
19 | $Lower = [[:Lowercase:]]; | |
20 | $Upper = [[:TitleCase_Letter:] [:Uppercase:]]; | |
21 | $OLetter = [[:Alphabetic:] [:name = HEBREW PUNCTUATION GERESH:] - [$Lower $Upper]]; | |
374ca955 A |
22 | $Numeric = [[:Nd:][:name = ARABIC DECIMAL SEPARATOR:][:name = ARABIC THOUSANDS SEPARATOR:]]; |
23 | ||
24 | $ATerm = [.]; | |
b75a7d8f | 25 | |
374ca955 | 26 | $Term = [:STerm:]; |
b75a7d8f | 27 | |
b75a7d8f A |
28 | $Close = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] - |
29 | [[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]]; | |
374ca955 A |
30 | |
31 | ||
b75a7d8f A |
32 | |
33 | # Define extended forms of the character classes, | |
34 | # each followed by any trailing Grapheme_Extend and Format characters. | |
35 | ||
374ca955 | 36 | $Extend = [[:Grapheme_Extend = TRUE:]]; |
b75a7d8f A |
37 | $ATermEx = $ATerm $Extend* $Format*; |
38 | $NumericEx = $Numeric $Extend* $Format*; | |
39 | $UpperEx = $Upper $Extend* $Format*; | |
40 | $TermEx = $Term $Extend* $Format*; | |
41 | ||
42 | # | |
43 | # $SepSeq keeps together CRLF as a separator. (CRLF is a grapheme cluster) | |
44 | # | |
45 | $SepSeq = $Sep | \u000d\u000a; | |
46 | ||
47 | # $InteriorChars are those that never trigger a following break. | |
48 | $InteriorChars = [^$Term $ATerm $Sep]; #Note: includes Extend and Format chars | |
49 | ||
374ca955 A |
50 | ## ------------------------------------------------- |
51 | ||
52 | !!forward; | |
b75a7d8f A |
53 | |
54 | # Rule 6. Matches a fragment ending in an ATerm (.) that does not cause a break because a number immediately follows it. | |
55 | $NumberFollows = $InteriorChars* $ATermEx $NumericEx; | |
56 | ||
57 | ||
58 | # Rule 7. $UppersSurround matches a no-break sentence fragment containing a "." surrounded by Uppers. | |
59 | $UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx; | |
60 | ||
61 | # Rule 8. Matches a sentence fragment containing "." that should not cause a sentence break, | |
62 | # because a lower case word follows the period (optionally after Close, Sp and other non-letter, non-Sep characters). | |
63 | $LowerWordFollows = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower; | |
64 | ||
65 | # Rules 3, 9, 10, 11 | |
66 | # Matches a simple sentence, or the trailing part of a complex sentence, | |
67 | # where a simple sentence contains no interior "."s. | |
374ca955 A |
68 | $TermEndSequence = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq?; |
69 | $EndSequence = $InteriorChars* $SepSeq?; | |
b75a7d8f | 70 | |
374ca955 A |
71 | # Put them all together. |
72 | ($NumberFollows | $UppersSurround | $LowerWordFollows)* $TermEndSequence{0}; # status = UBRK_SENTENCE_TERM | |
73 | ($NumberFollows | $UppersSurround | $LowerWordFollows)* $EndSequence{100}; # status = UBRK_SENTENCE_SEP | |
b75a7d8f | 74 | |
374ca955 | 75 | ## ------------------------------------------------- |
b75a7d8f | 76 | |
374ca955 | 77 | !!reverse; |
b75a7d8f | 78 | |
374ca955 A |
79 | # rule 6 |
80 | ||
81 | $RULE6 = $Numeric $Format* $Extend* $ATerm; | |
82 | ||
83 | # rule 7 | |
84 | ||
85 | $RULE7 = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper; | |
86 | ||
87 | # rule 8 | |
88 | ||
89 | $RULE8 = $Lower ($Format* $Extend* [^$OLetter $Upper $Lower $Sep])* | |
90 | ($Format* $Extend* $Sp)* ($Format* $Extend* $Close)* | |
91 | $Format* $Extend* $ATerm; | |
92 | ||
93 | # rule 9, 10, 11 | |
94 | ||
95 | # \u000a\u000d below is $LF $CR: a CRLF pair written back-to-front, since this is the reverse rule section | |
96 | $End = $Sep | \u000a\u000d | |
97 | | $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format* | |
98 | $Extend* ($Term | $ATerm) | |
99 | | $Sep $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format* | |
100 | $Extend* ($Term | $ATerm); | |
101 | ||
102 | # rule 12 | |
103 | ||
104 | $RULE12 = [^$Sep $Term $ATerm]; | |
105 | ||
106 | $Join = ($RULE6 | $RULE7 | $RULE8 | $RULE12)*; | |
107 | ||
108 | $End; | |
109 | ||
110 | $End? $Join [$RULE12 - $Sp - $Close]; | |
111 | ||
112 | # forces a break at the beginning of text "$Sp blah blah blah" | |
113 | # remember that the break iterator takes the longest match | |
114 | $End? $Join $Sp / [^$Term $ATerm $Sp $Close]; | |
115 | ||
116 | # forces a break at the beginning of text "$Close blah blah blah" | |
117 | $End? $Join $Close / [^$Term $ATerm $Close]; | |
118 | ||
119 | ## ------------------------------------------------- | |
120 | ||
121 | !!safe_reverse; | |
122 | ||
123 | # rule 4 | |
124 | $Extend+ [^$Extend]; | |
125 | ||
126 | # rule 7 | |
127 | $Extend* $ATerm $Format* $Extend* $Upper; | |
128 | ||
129 | # rule 8 | |
130 | ($Extend* $Term)+ ($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* $ATerm; | |
131 | ||
132 | # rule 11 | |
133 | ($Extend* $Sp $Format*)* ($Extend* $Close $Format*)*; | |
134 | ($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* ($Term | $ATerm); | |
135 | ||
136 | ## ------------------------------------------------- | |
137 | ||
138 | !!safe_forward; | |
139 | ||
140 | # rule 7 | |
141 | ||
142 | $ATerm $Extend* $Format* $Upper; | |
143 | ||
144 | # rule 8 | |
145 | ||
146 | $Lower .; | |
147 | ||
148 | # rule 11 | |
149 | ||
150 | ($Close $Extend* $Format*)* ($Sp $Extend* $Format*)*; |