2 # Copyright (C) 2002-2003, International Business Machines Corporation and others.
7 # ICU Sentence Break Rules
8 # See Unicode Standard Annex #29.
9 # These rules are based on TR 29 version 4.0.0
14 # Character categories as defined in TR 29
# Each variable below names a character set that the later rules refer to.
# $Sep: the five separator code points U+000A, U+000D, U+0085, U+2028 and U+2029.
16 $Sep = [\u000a \u000d \u0085 \u2028 \u2029];
# $Format: every character with the Format property.
17 $Format = [[:Format:]];
# $Sp: whitespace characters, excluding those already in $Sep.
18 $Sp = [[:Whitespace:] - $Sep];
# $Lower: characters with the Lowercase property.
19 $Lower = [[:Lowercase:]];
# $Upper: title-case letters are grouped together with Uppercase characters.
20 $Upper = [[:TitleCase_Letter:] [:Uppercase:]];
# $OLetter: Alphabetic characters plus HEBREW PUNCTUATION GERESH, with everything
# already classified as $Lower or $Upper removed.
21 $OLetter = [[:Alphabetic:] [:name = HEBREW PUNCTUATION GERESH:] - [$Lower $Upper]];
# $Numeric: characters whose LineBreak property value is Numeric.
22 $Numeric = [:LineBreak = Numeric:];
# $Term: an explicit list of code points for the Term category; it includes
# U+0021 (!) and U+003F (?). The set literal spans three lines.
26 $Term = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362
27 \u1367 \u1368 \u104a \u104b \u166e \u1803 \u1809 \u203C \u203D \u2047
28 \u2048 \u2049 \u3002 \uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
# $Close: opening and closing punctuation plus characters with Linebreak = Quotation,
# minus HEBREW PUNCTUATION GERESH and anything in $ATerm or $Term.
# NOTE(review): $ATerm is used here but its definition is not among the lines
# visible in this part of the file -- confirm it is defined before this point.
30 $Close = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] -
31 [[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]];
35 # Define extended forms of the character classes,
36 # incorporate grapheme cluster + format chars.
# Every *Ex variable has the same shape: one character of the base class,
# followed by any number of $Extend characters, then any number of $Format
# characters. Only $ATerm, $Numeric, $Upper and $Term get an extended form here.
# $Extend: characters with Grapheme_Extend = TRUE.
38 $Extend = [[:Grapheme_Extend = TRUE:]];
39 $ATermEx = $ATerm $Extend* $Format*;
40 $NumericEx = $Numeric $Extend* $Format*;
41 $UpperEx = $Upper $Extend* $Format*;
42 $TermEx = $Term $Extend* $Format*;
45 # $SepSeq keeps together CRLF as a separator. (CRLF is a grapheme cluster)
# It matches either a single $Sep character or the two-character sequence
# U+000D U+000A.
47 $SepSeq = $Sep | \u000d\u000a;
49 # $InteriorChars are those that never trigger a following break.
# Defined by exclusion: any character that is not in $Term, $ATerm or $Sep.
50 $InteriorChars = [^$Term $ATerm $Sep]; #Note: includes Extend and Format chars
# The three fragments below each start with a run of $InteriorChars and contain
# one ATerm that must NOT end the sentence. They are combined by the forward rule.
53 # Rule 6. Match an ATerm (.) that does not cause a break because a number immediately follows it.
# Shape: interior characters, the extended ATerm, then one extended Numeric.
54 $NumberFollows = $InteriorChars* $ATermEx $NumericEx;
57 # Rule 7. $UppersSurround matches a no-break sentence fragment containing a "." surrounded by Uppers.
# Shape: interior characters, an extended Upper, the extended ATerm, another extended Upper.
58 $UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx;
60 # Rule 8. Matches a sentence fragment containing "." that should not cause a sentence break,
61 # because a lower case word follows the period.
# Between the ATerm and the final $Lower it allows, in order: any $Close characters,
# any $Sp characters, and any run of characters that are none of $OLetter, $Upper,
# $Lower or $Sep.
62 $LowerWordFollows = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower;
65 # Matches a simple sentence, or the trailing part of a complex sentence,
66 # where a simple sentence contains no interior "."s.
# Two alternatives:
#   1. interior characters, one extended Term or ATerm, then optional $Close
#      characters, optional $Sp characters and an optional $SepSeq;
#   2. interior characters with no terminator at all, optionally followed by a $SepSeq.
67 $EndSequence = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq? |
68 $InteriorChars* $SepSeq?;
72 # Put them all together.
# The forward rule: zero or more no-break fragments (Rules 6, 7 and 8 in any mix)
# followed by exactly one $EndSequence.
73 ($NumberFollows | $UppersSurround | $LowerWordFollows)* $EndSequence;
# Reverse-direction definitions. Each $Reverse* fragment lists the same elements
# as its forward counterpart in the opposite order, and the extended-form suffix
# appears as "$Format* $Extend*" instead of "$Extend* $Format*".
# NOTE(review): this mirroring is presumably because reverse rules are matched
# against the text moving backwards -- confirm against the ICU rule documentation.
# $EndGorp: any single character that can appear in the tail of a sentence
# (terminator, separator, closing punctuation, extend/format character or space).
79 $EndGorp = ($Term | $ATerm | $Sep | $Close | $Extend | $Format | $Sp);
# $RevEndSequence: either tail characters, interior characters, then tail characters
# again (each run may be empty), or a $Sep followed by any characters that are
# neither $ATerm nor $Term.
80 $RevEndSequence = $EndGorp* $InteriorChars* $EndGorp* | $Sep [^$ATerm $Term]*;
# Mirror of $LowerWordFollows: a $Lower, a run of characters that are none of
# $OLetter/$Upper/$Lower/$Sep, the ATerm, then interior characters.
81 $ReverseLowerWordFollows = $Lower [^$OLetter $Upper $Lower $Sep]* $ATerm $InteriorChars*;
# Mirror of $UppersSurround: Upper, ATerm, Upper (each with its format/extend run),
# then interior characters.
82 $ReverseUpperSurround = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper $InteriorChars*;
# Mirror of $NumberFollows: Numeric (with its format/extend run), the ATerm, then
# interior characters.
83 $ReverseNumberFollows = $Numeric $Format* $Extend* $ATerm $InteriorChars*;
# The reverse rule itself; the leading "!" is what distinguishes it from the forward
# rule (NOTE(review): ICU's reverse-rule marker -- confirm for the ICU version in use).
# It matches one $RevEndSequence, then zero or more reversed no-break fragments,
# then optionally one more character of any kind (".?").
85 ! $RevEndSequence ($ReverseLowerWordFollows | $ReverseUpperSurround | $ReverseNumberFollows)* .?;