X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/b75a7d8f3b4adbae880cab104ce2c6a50eee4db2..b331163bffd790ced0e88b73f44f86d49ccc48a5:/icuSources/data/brkitr/sent.txt diff --git a/icuSources/data/brkitr/sent.txt b/icuSources/data/brkitr/sent.txt index 14b568ba..b2726802 100644 --- a/icuSources/data/brkitr/sent.txt +++ b/icuSources/data/brkitr/sent.txt @@ -1,87 +1,119 @@ # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. +# Copyright (C) 2002-2011, International Business Machines Corporation and others. # All Rights Reserved. # -# file: sent.txt +# file: sent.txt # # ICU Sentence Break Rules # See Unicode Standard Annex #29. -# These rules are based on TR 29 version 4.0.0 +# These rules are based on UAX #29 Revision 19 for Unicode Version 6.1 # - + # # Character categories as defined in TR 29 # -$Sep = [\u000a \u000d \u0085 \u2028 \u2029]; -$Format = [[:Format:]]; -$Sp = [[:Whitespace:] - $Sep]; -$Lower = [[:Lowercase:]]; -$Upper = [[:TitleCase_Letter:] [:Uppercase:]]; -$OLetter = [[:Alphabetic:] [:name = HEBREW PUNCTUATION GERESH:] - [$Lower $Upper]]; -$Numeric = [:LineBreak = Numeric:]; - -$ATerm = [.]; - -$Term = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362 - \u1367 \u1368 \u104a \u104b \u166e \u1803 \u1809 \u203C \u203D \u2047 - \u2048 \u2049 \u3002 \uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61]; - -$Close = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] - - [[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]]; - - +$CR = [\p{Sentence_Break = CR}]; +$LF = [\p{Sentence_Break = LF}]; +$Extend = [\p{Sentence_Break = Extend}]; +$Sep = [\p{Sentence_Break = Sep}]; +$Format = [\p{Sentence_Break = Format}]; +$Sp = [\p{Sentence_Break = Sp}]; +$Lower = [\p{Sentence_Break = Lower}]; +$Upper = [\p{Sentence_Break = Upper}]; +$OLetter = [\p{Sentence_Break = OLetter}]; +$Numeric = [\p{Sentence_Break = Numeric}]; +$ATerm = [\p{Sentence_Break = ATerm}]; +$SContinue = [\p{Sentence_Break = SContinue}]; +$STerm = [\p{Sentence_Break = STerm}]; +$Close = [\p{Sentence_Break = Close}]; +# # Define extended forms of the character classes, -# incorporate grapheme cluster + format chars. +# incorporate trailing Extend or Format chars. +# Rules 4 and 5. + +$SpEx = $Sp ($Extend | $Format)*; +$LowerEx = $Lower ($Extend | $Format)*; +$UpperEx = $Upper ($Extend | $Format)*; +$OLetterEx = $OLetter ($Extend | $Format)*; +$NumericEx = $Numeric ($Extend | $Format)*; +$ATermEx = $ATerm ($Extend | $Format)*; +$SContinueEx= $SContinue ($Extend | $Format)*; +$STermEx = $STerm ($Extend | $Format)*; +$CloseEx = $Close ($Extend | $Format)*; -$Extend = [[:Grapheme_Extend = TRUE:]]; -$ATermEx = $ATerm $Extend* $Format*; -$NumericEx = $Numeric $Extend* $Format*; -$UpperEx = $Upper $Extend* $Format*; -$TermEx = $Term $Extend* $Format*; +## ------------------------------------------------- + +!!chain; +!!forward; + +# Rule 3 - break after separators. Keep CR/LF together. # -# $SepSeq keeps together CRLF as a separator. (CRLF is a grapheme cluster) +$CR $LF; + + +# Rule 4 - Break after $Sep. +# Rule 5 - Ignore $Format and $Extend # -$SepSeq = $Sep | \u000d\u000a; +[^$Sep $CR $LF]? ($Extend | $Format)*; + + +# Rule 6 +$ATermEx $NumericEx; -# $InteriorChars are those that never trigger a following break. -$InteriorChars = [^$Term $ATerm $Sep]; #Note: includes Extend and Format chars +# Rule 7 +$UpperEx $ATermEx $UpperEx; +#Rule 8 +$NotLettersEx = [^$OLetter $Upper $Lower $Sep $CR $LF $ATerm $STerm] ($Extend | $Format)*; +$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower; -# Rule 6. Match an ATerm (.) that does not cause a break because a number immediately follows it. -$NumberFollows = $InteriorChars* $ATermEx $NumericEx; +# Rule 8a +($STermEx | $ATermEx) $CloseEx* $SpEx* ($SContinueEx | $STermEx | $ATermEx); +#Rule 9, 10, 11 +($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?; -# Rule 7. $UppersSurround Match a no-break sentence fragment containing a . surrounded by Uppers -$UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx; +#Rule 12 +[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .; +[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100}; -# Rule 8 Matches a sentence fragment containing "." that should not cause a sentence break, -# because a lower case word follows the period. -$LowerWordFollows = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower; +## ------------------------------------------------- -# Rules 3, 9, 10, 11 -# Matches a simple sentence, or the trailing part of a complex sentence, -# where a simple sentence contains no interior "."s. -$EndSequence = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq? | - $InteriorChars* $SepSeq?; +!!reverse; +$SpEx_R = ($Extend | $Format)* $Sp; +$ATermEx_R = ($Extend | $Format)* $ATerm; +$STermEx_R = ($Extend | $Format)* $STerm; +$CloseEx_R = ($Extend | $Format)* $Close; +# +# Reverse rules. +# For now, use the old style inexact reverse rules, which are easier +# to write, but less efficient. +# TODO: exact reverse rules. It appears that exact reverse rules +# may require improving support for look-ahead breaks in the +# builder. Needs more investigation. +# -# Put them all together. -($NumberFollows | $UppersSurround | $LowerWordFollows)* $EndSequence; +[{bof}] (.? | $LF $CR) [^$Sep $CR $LF]* [$Sep $CR $LF {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*; +#.*; - +# Explanation for this rule: +# +# It needs to back over +# The $Sep at which we probably begin +# All of the non $Sep chars leading to the preceding $Sep +# The preceding $Sep, which will be the second one that the rule matches. +# Any immediately preceding STerm or ATerm sequences. We need to see these +# to get the correct rule status when moving forwards again. +# +# [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match +# the entire string. # -# Reverse Rules +# (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be +# at the beginning of the string at this point, and we don't want to fail. +# Can only use {eof} once, and it is used later. # -$EndGorp = ($Term | $ATerm | $Sep | $Close | $Extend | $Format | $Sp); -$RevEndSequence = $EndGorp* $InteriorChars* $EndGorp* | $Sep [^$ATerm $Term]*; -$ReverseLowerWordFollows = $Lower [^$OLetter $Upper $Lower $Sep]* $ATerm $InteriorChars*; -$ReverseUpperSurround = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper $InteriorChars*; -$ReverseNumberFollows = $Numeric $Format* $Extend* $ATerm $InteriorChars*; - -! $RevEndSequence ($ReverseLowerWordFollows | $ReverseUpperSurround | $ReverseNumberFollows)* .?; -#! .*; -