ICU-551.24.tar.gz

[apple/icu.git] / icuSources / data / brkitr / sent.txt
diff --git a/icuSources/data/brkitr/sent.txt b/icuSources/data/brkitr/sent.txt

index 14b568baa4aa9c7e3700e5afd7fc9af592302295..b2726802c6e701362cf2c7953d41819069151f64 100644 (file)
--- a/icuSources/data/brkitr/sent.txt
+++ b/icuSources/data/brkitr/sent.txt
@@ -1,87 +1,119 @@
  #
-#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
+#   Copyright (C) 2002-2011, International Business Machines Corporation and others.
  #       All Rights Reserved.
  #
-#   file:  sent.txt   
+#   file:  sent.txt
  #
  #   ICU Sentence Break Rules
  #      See Unicode Standard Annex #29.
-#      These rules are based on TR 29 version 4.0.0
+#      These rules are based on UAX #29 Revision 19 for Unicode Version 6.1
  #
-    
+
  
  #
  # Character categories as defined in TR 29
  #
-$Sep     = [\u000a \u000d \u0085 \u2028 \u2029];
-$Format  = [[:Format:]];
-$Sp      = [[:Whitespace:] - $Sep];
-$Lower   = [[:Lowercase:]];
-$Upper   = [[:TitleCase_Letter:] [:Uppercase:]];
-$OLetter = [[:Alphabetic:] [:name = HEBREW PUNCTUATION GERESH:] - [$Lower $Upper]];
-$Numeric = [:LineBreak = Numeric:];
-
-$ATerm = [.];  
-
-$Term  = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362
-          \u1367 \u1368 \u104a \u104b \u166e \u1803 \u1809 \u203C \u203D \u2047 
-          \u2048 \u2049 \u3002 \uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
-          
-$Close   = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] -
-           [[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]];
-           
-           
+$CR        = [\p{Sentence_Break = CR}];
+$LF        = [\p{Sentence_Break = LF}];
+$Extend    = [\p{Sentence_Break = Extend}];
+$Sep       = [\p{Sentence_Break = Sep}];
+$Format    = [\p{Sentence_Break = Format}];
+$Sp        = [\p{Sentence_Break = Sp}];
+$Lower     = [\p{Sentence_Break = Lower}];
+$Upper     = [\p{Sentence_Break = Upper}];
+$OLetter   = [\p{Sentence_Break = OLetter}];
+$Numeric   = [\p{Sentence_Break = Numeric}];
+$ATerm     = [\p{Sentence_Break = ATerm}];
+$SContinue = [\p{Sentence_Break = SContinue}];
+$STerm     = [\p{Sentence_Break = STerm}];
+$Close     = [\p{Sentence_Break = Close}];
  
+#
  # Define extended forms of the character classes,
-#   incorporate grapheme cluster + format chars.
+#   incorporate trailing Extend or Format chars.
+#   Rules 4 and 5.  
+
+$SpEx       = $Sp      ($Extend | $Format)*;
+$LowerEx    = $Lower   ($Extend | $Format)*;
+$UpperEx    = $Upper   ($Extend | $Format)*;
+$OLetterEx  = $OLetter ($Extend | $Format)*;
+$NumericEx  = $Numeric ($Extend | $Format)*;
+$ATermEx    = $ATerm   ($Extend | $Format)*;
+$SContinueEx= $SContinue ($Extend | $Format)*;
+$STermEx    = $STerm   ($Extend | $Format)*;
+$CloseEx    = $Close   ($Extend | $Format)*;
  
-$Extend     = [[:Grapheme_Extend = TRUE:]]; 
-$ATermEx    = $ATerm   $Extend* $Format*;
-$NumericEx  = $Numeric $Extend* $Format*;
-$UpperEx    = $Upper   $Extend* $Format*;
-$TermEx     = $Term    $Extend* $Format*;
  
+## -------------------------------------------------
+
+!!chain;
+!!forward;
+
+# Rule 3 - Do not break within a CR LF pair; keep CR/LF together.
  #
-#  $SepSeq keeps together CRLF as a separator.  (CRLF is a grapheme cluster)
+$CR $LF;
+
+
+# Rule 4 - Break after $Sep.
+# Rule 5 - Ignore $Format and $Extend
  #
-$SepSeq  = $Sep | \u000d\u000a;
+[^$Sep $CR $LF]? ($Extend | $Format)*;
+
+
+# Rule 6
+$ATermEx $NumericEx;
  
-# $InteriorChars are those that never trigger a following break.
-$InteriorChars = [^$Term $ATerm $Sep];   #Note:  includes Extend and Format chars
+# Rule 7
+$UpperEx $ATermEx $UpperEx;
  
+#Rule 8
+$NotLettersEx = [^$OLetter $Upper $Lower $Sep $CR $LF $ATerm $STerm] ($Extend | $Format)*;
+$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
  
-# Rule 6.  Match an ATerm (.) that does not cause a break because a number immediately follows it.
-$NumberFollows = $InteriorChars* $ATermEx $NumericEx;
+# Rule 8a
+($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx);
  
+# Rules 9, 10, 11
+($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;
  
-# Rule 7.  $UppersSurround   Match a no-break sentence fragment containing a . surrounded by Uppers
-$UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx;
+#Rule 12
+[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
+[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
  
-# Rule 8   Matches a sentence fragment containing "." that should not cause a sentence break,
-#          because a lower case word follows the period.
-$LowerWordFollows  = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower;
+## -------------------------------------------------
  
-# Rules 3, 9, 10, 11
-#                       Matches a simple sentence, or the trailing part of a complex sentence,
-#                       where a simple sentence contains no interior "."s.
-$EndSequence       = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq? |
-                     $InteriorChars* $SepSeq?;
+!!reverse;
  
+$SpEx_R       = ($Extend | $Format)* $Sp;
+$ATermEx_R    = ($Extend | $Format)* $ATerm;
+$STermEx_R    = ($Extend | $Format)* $STerm;
+$CloseEx_R    = ($Extend | $Format)* $Close;
  
+#
+#  Reverse rules.
+#     For now, use the old style inexact reverse rules, which are easier
+#     to write, but less efficient.
+#     TODO:  exact reverse rules.  It appears that exact reverse rules
+#            may require improving support for look-ahead breaks in the
+#            builder.  Needs more investigation.
+#
  
-# Put them all together.  
-($NumberFollows | $UppersSurround |  $LowerWordFollows)*  $EndSequence;
+[{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
+#.*;
  
-     
+# Explanation for this rule:
+#
+#    It needs to back over
+#        The $Sep at which we probably begin
+#        All of the non $Sep chars leading to the preceding $Sep
+#        The preceding $Sep, which will be the second one that the rule matches.
+#        Any immediately preceding STerm or ATerm sequences.  We need to see these
+#              to get the correct rule status when moving forwards again.
+#        
+# [{bof}]           inhibit rule chaining.  Without this, rule would loop on itself and match
+#                   the entire string.
  #
-#  Reverse Rules
+# (.? | $LF $CR)    Match one $Sep instance ($LF $CR is a CR LF pair seen in reverse).  Use .? rather than $Sep because position might be
+#                   at the beginning of the string at this point, and we don't want to fail.
+#                   Can only use {eof} once, and it is used later.
  #
-$EndGorp                  = ($Term | $ATerm | $Sep | $Close | $Extend | $Format | $Sp);
-$RevEndSequence           = $EndGorp* $InteriorChars* $EndGorp* | $Sep [^$ATerm $Term]*;
-$ReverseLowerWordFollows  = $Lower [^$OLetter $Upper $Lower $Sep]* $ATerm $InteriorChars*;
-$ReverseUpperSurround     = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper $InteriorChars*;
-$ReverseNumberFollows     = $Numeric $Format* $Extend* $ATerm $InteriorChars*;
-
-! $RevEndSequence ($ReverseLowerWordFollows | $ReverseUpperSurround | $ReverseNumberFollows)* .?;
-#! .*;
-