]> git.saurik.com Git - apple/icu.git/blame - icuSources/data/brkitr/sent.txt
ICU-6.2.13.tar.gz
[apple/icu.git] / icuSources / data / brkitr / sent.txt
CommitLineData
b75a7d8f 1#
374ca955 2# Copyright (C) 2002-2004, International Business Machines Corporation and others.
b75a7d8f
A
3# All Rights Reserved.
4#
374ca955 5# file: sent.txt
b75a7d8f
A
6#
7# ICU Sentence Break Rules
8# See Unicode Standard Annex #29.
9# These rules are based on TR 29 version 4.0.0
10#
374ca955 11
b75a7d8f
A
12
13#
14# Character categories as defined in TR 29
15#
16$Sep = [\u000a \u000d \u0085 \u2028 \u2029];
374ca955 17$Format = [[:Format:] - [:Grapheme_Extend:]];
b75a7d8f
A
18$Sp = [[:Whitespace:] - $Sep];
19$Lower = [[:Lowercase:]];
20$Upper = [[:TitleCase_Letter:] [:Uppercase:]];
21$OLetter = [[:Alphabetic:] [:name = HEBREW PUNCTUATION GERESH:] - [$Lower $Upper]];
374ca955
A
22$Numeric = [[:Nd:][:name = ARABIC DECIMAL SEPARATOR:][:name = ARABIC THOUSANDS SEPARATOR:]];
23
24$ATerm = [.];
b75a7d8f 25
374ca955 26$Term = [:STerm:];
b75a7d8f 27
b75a7d8f
A
28$Close = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] -
29 [[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]];
374ca955
A
30
31
b75a7d8f
A
32
33# Define extended forms of the character classes,
34# incorporate grapheme cluster + format chars.
35
374ca955 36$Extend = [[:Grapheme_Extend = TRUE:]];
b75a7d8f
A
37$ATermEx = $ATerm $Extend* $Format*;
38$NumericEx = $Numeric $Extend* $Format*;
39$UpperEx = $Upper $Extend* $Format*;
40$TermEx = $Term $Extend* $Format*;
41
42#
43# $SepSeq keeps together CRLF as a separator. (CRLF is a grapheme cluster)
44#
45$SepSeq = $Sep | \u000d\u000a;
46
47# $InteriorChars are those that never trigger a following break.
48$InteriorChars = [^$Term $ATerm $Sep]; #Note: includes Extend and Format chars
49
374ca955
A
50## -------------------------------------------------
51
52!!forward;
b75a7d8f
A
53
54# Rule 6. Match an ATerm (.) that does not cause a break because a number immediately follows it.
55$NumberFollows = $InteriorChars* $ATermEx $NumericEx;
56
57
58# Rule 7. $UppersSurround matches a no-break sentence fragment containing a "." surrounded by Uppers.
59$UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx;
60
61# Rule 8. Matches a sentence fragment containing "." that should not cause a sentence break,
62# because a lower case word follows the period.
63$LowerWordFollows = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower;
64
65# Rules 3, 9, 10, 11
66# Matches a simple sentence, or the trailing part of a complex sentence,
67# where a simple sentence contains no interior "."s.
374ca955
A
68$TermEndSequence = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq?;
69$EndSequence = $InteriorChars* $SepSeq?;
b75a7d8f 70
374ca955
A
71# Put them all together.
72($NumberFollows | $UppersSurround | $LowerWordFollows)* $TermEndSequence{0}; # status = UBRK_SENTENCE_TERM
73($NumberFollows | $UppersSurround | $LowerWordFollows)* $EndSequence{100}; # status = UBRK_SENTENCE_SEP
b75a7d8f 74
374ca955 75## -------------------------------------------------
b75a7d8f 76
374ca955 77!!reverse;
b75a7d8f 78
374ca955
A
79# Rule 6: a Numeric, optional Format/Extend, then the ATerm -- the same pieces as the forward $NumberFollows, listed in the opposite order.
80
81$RULE6 = $Numeric $Format* $Extend* $ATerm;
82
83# Rule 7: an ATerm with an Upper on each side (compare the forward $UppersSurround).
84
85$RULE7 = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper;
86
87# Rule 8: a Lower, then any run of characters that are not OLetter/Upper/Lower/Sep, then Sp*, Close*, and finally the ATerm (the forward $LowerWordFollows in the opposite order).
88
89$RULE8 = $Lower ($Format* $Extend* [^$OLetter $Upper $Lower $Sep])*
90 ($Format* $Extend* $Sp)* ($Format* $Extend* $Close)*
91 $Format* $Extend* $ATerm;
92
93# Rules 9, 10, 11: $End matches a sentence ending -- a lone separator, or (optionally after a separator) Sp*, Close*, and a Term or ATerm.
94
95# $CR $LF -- written below as \u000a\u000d (LF then CR), the opposite order from \u000d\u000a in the forward $SepSeq; presumably because reverse rules are matched backward through the text (confirm against the ICU rule documentation).
96$End = $Sep | \u000a\u000d
97 | $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format*
98 $Extend* ($Term | $ATerm)
99 | $Sep $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format*
100 $Extend* ($Term | $ATerm);
101
102# Rule 12: any single character that is not a Sep, Term, or ATerm.
103
104$RULE12 = [^$Sep $Term $ATerm];
105
106$Join = ($RULE6 | $RULE7 | $RULE8 | $RULE12)*;
107
108$End;
109
110$End? $Join [$RULE12 - $Sp - $Close];
111
112# Forces a break at the beginning of text of the form "$Sp blah blah blah".
113# Remember that the break iterator takes the longest match.
114$End? $Join $Sp / [^$Term $ATerm $Sp $Close];
115
116# Forces a break at the beginning of text of the form "$Close blah blah blah".
117$End? $Join $Close / [^$Term $ATerm $Close];
118
119## -------------------------------------------------
120
121!!safe_reverse;
122
123# Rule 4: one or more Extend characters followed by a single non-Extend character.
124$Extend+ [^$Extend];
125
126# Rule 7: an ATerm then an Upper, with optional Extend before the ATerm and optional Format/Extend between the two.
127$Extend* $ATerm $Format* $Extend* $Upper;
128
129# Rule 8: one or more Terms, then runs of Sp and of Close, then an ATerm; each element may be preceded by Extend, and Sp/Close may be followed by Format.
130($Extend* $Term)+ ($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* $ATerm;
131
132# Rule 11: runs of Sp then runs of Close, either alone (first rule) or followed by a Term or ATerm (second rule).
133($Extend* $Sp $Format*)* ($Extend* $Close $Format*)*;
134($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* ($Term | $ATerm);
135
136## -------------------------------------------------
137
138!!safe_forward;
139
140# Rule 7: an ATerm, optional Extend/Format, then an Upper.
141
142$ATerm $Extend* $Format* $Upper;
143
144# Rule 8: a Lower followed by any one character.
145
146$Lower .;
147
148# Rule 11: runs of Close then runs of Sp, each element optionally followed by Extend/Format.
149
150($Close $Extend* $Format*)* ($Sp $Extend* $Format*)*;