]>
Commit | Line | Data |
---|---|---|
b75a7d8f | 1 | # |
73c04bcf | 2 | # Copyright (C) 2002-2006, International Business Machines Corporation and others. |
b75a7d8f A |
3 | # All Rights Reserved. |
4 | # | |
374ca955 | 5 | # file: sent.txt |
b75a7d8f A |
6 | # |
7 | # ICU Sentence Break Rules | |
8 | # See Unicode Standard Annex #29. | |
73c04bcf A |
9 | # These rules are based on UAX #29 (TR 29) version 5.0.0 |
10 | # Includes post 5.0 changes to treat Japanese half width voicing marks | |
11 | # as Grapheme Extend. | |
b75a7d8f | 12 | # |
374ca955 | 13 | |
b75a7d8f | 14 | |
73c04bcf A |
15 | $VoiceMarks = [\uff9e\uff9f]; # U+FF9E, U+FF9F: the Japanese half width voicing marks |
16 | ||
b75a7d8f A |
17 | # |
18 | # Character categories as defined in TR 29, one per Sentence_Break property value. $OLetter has $VoiceMarks removed. | |
19 | # | |
73c04bcf A |
20 | $Sep = [\p{Sentence_Break = Sep}]; |
21 | $Format = [\p{Sentence_Break = Format}]; | |
22 | $Sp = [\p{Sentence_Break = Sp}]; | |
23 | $Lower = [\p{Sentence_Break = Lower}]; | |
24 | $Upper = [\p{Sentence_Break = Upper}]; | |
25 | $OLetter = [\p{Sentence_Break = OLetter}-$VoiceMarks]; | |
26 | $Numeric = [\p{Sentence_Break = Numeric}]; | |
27 | $ATerm = [\p{Sentence_Break = ATerm}]; | |
28 | $STerm = [\p{Sentence_Break = STerm}]; | |
29 | $Close = [\p{Sentence_Break = Close}]; | |
b75a7d8f | 30 | |
73c04bcf | 31 | # |
b75a7d8f A |
32 | # Define extended forms of the character classes, |
33 | # incorporate grapheme cluster + format chars: each $xxxEx is the base class followed by any number of $Extend or $Format chars. | |
73c04bcf | 34 | # Rules 4 and 5. |
b75a7d8f | 35 | |
b75a7d8f | 36 | |
73c04bcf A |
37 | $CR = \u000d; |
38 | $LF = \u000a; | |
39 | $Extend = [[:Grapheme_Extend = TRUE:]$VoiceMarks]; # the voicing marks are treated as Grapheme Extend | |
b75a7d8f | 40 | |
73c04bcf A |
41 | $SpEx = $Sp ($Extend | $Format)*; |
42 | $LowerEx = $Lower ($Extend | $Format)*; | |
43 | $UpperEx = $Upper ($Extend | $Format)*; | |
44 | $OLetterEx = $OLetter ($Extend | $Format)*; | |
45 | $NumericEx = $Numeric ($Extend | $Format)*; | |
46 | $ATermEx = $ATerm ($Extend | $Format)*; | |
47 | $STermEx = $STerm ($Extend | $Format)*; | |
48 | $CloseEx = $Close ($Extend | $Format)*; | |
b75a7d8f | 49 | |
b75a7d8f | 50 | |
374ca955 | 51 | ## ------------------------------------------------- |
b75a7d8f | 52 | |
73c04bcf A |
53 | !!chain; # enable rule chaining |
54 | !!forward; # the rules below apply to forward iteration | |
374ca955 | 55 | |
73c04bcf A |
56 | # Rule 3 - Keep CR/LF together: do not break inside the pair, only after the complete separator. |
57 | # | |
58 | $CR $LF; | |
374ca955 | 59 | |
374ca955 | 60 | |
73c04bcf A |
61 | # Rule 4 - Break after $Sep. |
62 | # Rule 5 - Ignore $Format and $Extend: they stay attached to the preceding non-$Sep character. | |
63 | # | |
64 | [^$Sep]? ($Extend | $Format)*; | |
374ca955 | 65 | |
374ca955 | 66 | |
73c04bcf A |
67 | # Rule 6 - Do not break between $ATerm and a following $Numeric. |
68 | $ATermEx $NumericEx; | |
374ca955 | 69 | |
73c04bcf A |
70 | # Rule 7 - Do not break after an $ATerm that has $Upper on both sides. |
71 | $UpperEx $ATermEx $UpperEx; | |
374ca955 | 72 | |
73c04bcf A |
73 | # Rule 8 - Do not break after $ATerm $Close* $Sp* when, after any characters matched by $NotLettersEx, the next character is $Lower. |
74 | # Note: follows errata for Unicode 5.0 boundary rules. | |
75 | $NotLettersEx = [^$OLetter $Upper $Lower $Sep $ATerm $STerm] ($Extend | $Format)*; | |
76 | $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower; | |
374ca955 | 77 | |
73c04bcf A |
78 | # Rule 8a - Do not break between a terminator ($STerm or $ATerm, plus any $Close and $Sp) and a following terminator. |
79 | ($STermEx | $ATermEx) $CloseEx* $SpEx* ($STermEx | $ATermEx); | |
374ca955 | 80 | |
73c04bcf A |
81 | # Rule 9, 10, 11 - After a terminator, keep any $Close, then any $Sp, then an optional $Sep with the sentence. |
82 | ($STermEx | $ATermEx) $CloseEx* $SpEx* $Sep?; | |
374ca955 | 83 | |
73c04bcf A |
84 | # Rule 12 - Otherwise, do not break. {100} on the second rule is a rule status tag; 100 is presumably UBRK_SENTENCE_SEP (sentence ended by a separator or end of text rather than a terminator) - confirm against ubrk.h. |
85 | [[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .; | |
86 | [[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep{eof}] | $CR $LF){100}; | |
374ca955 A |
87 | |
88 | ## ------------------------------------------------- | |
89 | ||
73c04bcf | 90 | !!reverse; # the rules below apply to reverse iteration |
374ca955 | 91 | |
73c04bcf A |
92 | $SpEx_R = ($Extend | $Format)* $Sp; # mirror images of the forward $xxxEx forms: $Extend / $Format come first |
93 | $ATermEx_R = ($Extend | $Format)* $ATerm; | |
94 | $STermEx_R = ($Extend | $Format)* $STerm; | |
95 | $CloseEx_R = ($Extend | $Format)* $Close; | |
374ca955 | 96 | |
73c04bcf A |
97 | # |
98 | # Reverse rules. | |
99 | # For now, use the old style inexact reverse rules, which are easier | |
100 | # to write, but less efficient. | |
101 | # TODO: exact reverse rules. It appears that exact reverse rules | |
102 | # may require improving support for look-ahead breaks in the | |
103 | # builder. Needs more investigation. | |
104 | # | |
374ca955 | 105 | |
73c04bcf A |
106 | [{bof}] (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*; |
107 | #.*; | |
374ca955 | 108 | |
73c04bcf A |
109 | # Explanation for this rule: |
110 | # | |
111 | # It needs to back over | |
112 | # The $Sep at which we probably begin | |
113 | # All of the non $Sep chars leading to the preceding $Sep | |
114 | # The preceding $Sep, which will be the second one that the rule matches. | |
115 | # Any immediately preceding STerm or ATerm sequences. We need to see these | |
116 | # to get the correct rule status when moving forwards again. | |
117 | # | |
118 | # [{bof}] inhibits rule chaining. Without this, the rule would loop on itself and match | |
119 | # the entire string. | |
120 | # | |
121 | # (.? | $LF $CR) Match one $Sep instance ($LF $CR is a CR LF pair seen in reverse). Use .? rather than $Sep because position might be | |
122 | # at the beginning of the string at this point, and we don't want to fail. | |
123 | # Can only use {eof} once, and it is used later. | |
124 | # |