]>
Commit | Line | Data |
---|---|---|
46f4442e | 1 | # |
4388f060 | 2 | # Copyright (C) 2002-2011, International Business Machines Corporation and others. |
46f4442e A |
3 | # All Rights Reserved. |
4 | # | |
5 | # file: sent_el.txt | |
6 | # | |
7 | # ICU Sentence Break Rules | |
8 | # See Unicode Standard Annex #29. | |
4388f060 | 9 | # These rules are based on UAX #29 Revision 19 for Unicode Version 6.1 |
46f4442e A |
10 | # |
11 | ||
12 | ||
13 | # | |
14 | # Character categories as defined in TR 29 (UAX #29). Note: $STerm is tailored here and also contains U+003B (SEMICOLON) and U+037E (GREEK QUESTION MARK). | |
15 | # | |
16 | $CR = [\p{Sentence_Break = CR}]; | |
17 | $LF = [\p{Sentence_Break = LF}]; | |
18 | $Extend = [\p{Sentence_Break = Extend}]; | |
19 | $Sep = [\p{Sentence_Break = Sep}]; | |
20 | $Format = [\p{Sentence_Break = Format}]; | |
21 | $Sp = [\p{Sentence_Break = Sp}]; | |
22 | $Lower = [\p{Sentence_Break = Lower}]; | |
23 | $Upper = [\p{Sentence_Break = Upper}]; | |
24 | $OLetter = [\p{Sentence_Break = OLetter}]; | |
25 | $Numeric = [\p{Sentence_Break = Numeric}]; | |
26 | $ATerm = [\p{Sentence_Break = ATerm}]; | |
27 | $SContinue = [\p{Sentence_Break = SContinue}]; | |
28 | $STerm = [\p{Sentence_Break = STerm} [\u003B \u037E]]; | |
29 | $Close = [\p{Sentence_Break = Close}]; | |
30 | ||
31 | # | |
32 | # Define extended forms of the character classes: each base class | |
33 | # followed by any number of trailing Extend or Format chars. | |
34 | # Rules 4 and 5. | |
35 | ||
36 | $SpEx = $Sp ($Extend | $Format)*; | |
37 | $LowerEx = $Lower ($Extend | $Format)*; | |
38 | $UpperEx = $Upper ($Extend | $Format)*; | |
39 | $OLetterEx = $OLetter ($Extend | $Format)*; | |
40 | $NumericEx = $Numeric ($Extend | $Format)*; | |
41 | $ATermEx = $ATerm ($Extend | $Format)*; | |
42 | $SContinueEx= $SContinue ($Extend | $Format)*; | |
43 | $STermEx = $STerm ($Extend | $Format)*; | |
44 | $CloseEx = $Close ($Extend | $Format)*; | |
45 | ||
46 | ||
47 | ## ------------------------------------------------- | |
48 | ||
49 | !!chain; | |
50 | !!forward; | |
51 | ||
52 | # Rule 3 - Keep CR/LF together: a CR immediately followed by LF is matched as one unit. | |
53 | # | |
54 | $CR $LF; | |
55 | ||
56 | ||
57 | # Rule 4 - Break after $Sep. | |
58 | # Rule 5 - Ignore $Format and $Extend | |
59 | # | |
60 | [^$Sep $CR $LF]? ($Extend | $Format)*; | |
61 | ||
62 | ||
63 | # Rule 6 - An ATerm followed by a Numeric stays together. | |
64 | $ATermEx $NumericEx; | |
65 | ||
66 | # Rule 7 - An ATerm between two Upper chars stays together with both of them. | |
67 | $UpperEx $ATermEx $UpperEx; | |
68 | ||
69 | # Rule 8 - ATerm Close* Sp* stays together with a following Lower, skipping any chars that are not letters, separators or terminators. | |
70 | $NotLettersEx = [^$OLetter $Upper $Lower $Sep $CR $LF $ATerm $STerm] ($Extend | $Format)*; | |
71 | $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower; | |
72 | ||
73 | # Rule 8a - (STerm | ATerm) Close* Sp* stays together with a following SContinue, STerm or ATerm. | |
74 | ($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx); | |
75 | ||
76 | # Rule 9, 10, 11 - Match a terminator together with its trailing Close*, Sp* and at most one Sep/CR/LF. | |
77 | ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?; | |
78 | ||
79 | # Rule 12 - Any other char (or start of text) plus what follows it. NOTE(review): {100} on the second form is a rule status tag, presumably UBRK_SENTENCE_SEP - confirm against ubrk.h. | |
80 | [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .; | |
81 | [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100}; | |
82 | ||
83 | ## ------------------------------------------------- | |
84 | ||
85 | !!reverse; | |
86 | ||
87 | $SpEx_R = ($Extend | $Format)* $Sp; | |
88 | $ATermEx_R = ($Extend | $Format)* $ATerm; | |
89 | $STermEx_R = ($Extend | $Format)* $STerm; | |
90 | $CloseEx_R = ($Extend | $Format)* $Close; | |
91 | ||
92 | # | |
93 | # Reverse rules. The *_R variables above mirror the forward *Ex forms, with the Extend/Format chars placed before the base char. | |
94 | # For now, use the old style inexact reverse rules, which are easier | |
95 | # to write, but less efficient. | |
96 | # TODO: exact reverse rules. It appears that exact reverse rules | |
97 | # may require improving support for look-ahead breaks in the | |
98 | # builder. Needs more investigation. | |
99 | # | |
100 | ||
101 | [{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*; | |
102 | #.*; | |
103 | ||
104 | # Explanation for this rule: | |
105 | # | |
106 | # It needs to back over | |
107 | # The $Sep at which we probably begin | |
108 | # All of the non $Sep chars leading to the preceding $Sep | |
109 | # The preceding $Sep, which will be the second one that the rule matches. | |
110 | # Any immediately preceding STerm or ATerm sequences. We need to see these | |
111 | # to get the correct rule status when moving forwards again. | |
112 | # | |
113 | # [{bof}] inhibits rule chaining. Without this, the rule would loop on itself and match | |
114 | # the entire string. | |
115 | # | |
116 | # (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be | |
117 | # at the beginning of the string at this point, and we don't want to fail. | |
118 | # Can only use {eof} once, and it is used later. | |
119 | # |